@arizeai/phoenix-client 6.5.5 → 6.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -12,6 +12,8 @@ The experiments module runs tasks over dataset examples, records experiment runs
12
12
  <li><code>src/experiments/helpers/getExperimentEvaluators.ts</code> for evaluator normalization</li>
13
13
  <li><code>src/experiments/helpers/fromPhoenixLLMEvaluator.ts</code> for the phoenix-evals bridge</li>
14
14
  <li><code>src/experiments/getExperimentRuns.ts</code> for reading runs back after execution</li>
15
+ <li><code>src/types/experiments.ts</code> for <code>EvaluatorParams</code> including <code>traceId</code></li>
16
+ <li><code>src/spans/getSpans.ts</code> for fetching spans by trace ID and span kind</li>
15
17
  </ul>
16
18
  </section>
17
19
 
@@ -226,10 +228,113 @@ When an evaluator runs, it receives a normalized object with these fields:
226
228
  | `output` | The task output for that run |
227
229
  | `expected` | The dataset example's `output` object |
228
230
  | `metadata` | The dataset example's `metadata` object |
231
+ | `traceId` | The OpenTelemetry trace ID of the task run (optional, `string \| null`) |
229
232
 
230
233
  This is why the `createClassificationEvaluator()` prompt can reference `{{input.question}}` and `{{output}}`.
231
234
 
232
- For code-based evaluators created with `asExperimentEvaluator()`, those same fields are available inside `evaluate({ input, output, expected, metadata })`.
235
+ For code-based evaluators created with `asExperimentEvaluator()`, those same fields are available inside `evaluate({ input, output, expected, metadata, traceId })`.
236
+
237
+ ## Trace-Based Evaluation
238
+
239
+ Each task run captures an OpenTelemetry trace ID. Evaluators can use `traceId` to fetch the task's spans from Phoenix and evaluate the execution trajectory — for example, verifying that specific tool calls were made or inspecting intermediate steps.
240
+
241
+ This pattern works best with `evaluateExperiment()` as a separate step after `runExperiment()`, so that all task spans are ingested into Phoenix before the evaluator queries them.
242
+
243
+ ```ts
244
+ import { traceTool } from "@arizeai/openinference-core";
245
+ import { createClient } from "@arizeai/phoenix-client";
246
+ import { createDataset } from "@arizeai/phoenix-client/datasets";
247
+ import {
248
+ asExperimentEvaluator,
249
+ evaluateExperiment,
250
+ runExperiment,
251
+ } from "@arizeai/phoenix-client/experiments";
252
+ import { getSpans } from "@arizeai/phoenix-client/spans";
253
+
254
+ const client = createClient();
255
+
256
+ const { datasetId } = await createDataset({
257
+ client,
258
+ name: "tool-call-dataset",
259
+ description: "Questions that require tool use",
260
+ examples: [
261
+ {
262
+ input: { question: "What is the weather in San Francisco?" },
263
+ output: { expectedTool: "getWeather" },
264
+ metadata: {},
265
+ },
266
+ ],
267
+ });
268
+
269
+ // Step 1: Run the experiment with traced tool calls
270
+ const experiment = await runExperiment({
271
+ client,
272
+ dataset: { datasetId },
273
+ setGlobalTracerProvider: true,
274
+ task: async (example) => {
275
+ // traceTool wraps a function with a TOOL span
276
+ const getWeather = traceTool(
277
+ ({ location }: { location: string }) => ({
278
+ location,
279
+ temperature: 72,
280
+ condition: "sunny",
281
+ }),
282
+ { name: "getWeather" }
283
+ );
284
+
285
+ const city = (example.input.question as string).match(/in (.+)\?/)?.[1];
286
+ const result = getWeather({ location: city ?? "Unknown" });
287
+ return `The weather in ${result.location} is ${result.temperature}F.`;
288
+ },
289
+ });
290
+
291
+ const projectName = experiment.projectName!;
292
+
293
+ // Step 2: Evaluate using traceId to inspect the task's spans
294
+ const evaluated = await evaluateExperiment({
295
+ client,
296
+ experiment,
297
+ evaluators: [
298
+ asExperimentEvaluator({
299
+ name: "has-expected-tool-call",
300
+ kind: "CODE",
301
+ evaluate: async ({ traceId, expected }) => {
302
+ if (!traceId) {
303
+ return { label: "no trace", score: 0 };
304
+ }
305
+
306
+ // Fetch TOOL spans from this task's trace
307
+ const { spans: toolSpans } = await getSpans({
308
+ client,
309
+ project: { projectName },
310
+ traceIds: [traceId],
311
+ spanKind: "TOOL",
312
+ });
313
+
314
+ const expectedTool = (expected as { expectedTool?: string })
315
+ ?.expectedTool;
316
+ const toolNames = toolSpans.map((s) => s.name);
317
+ const found = toolNames.some((name) => name.includes(expectedTool!));
318
+
319
+ return {
320
+ label: found ? "tool called" : "no tool call",
321
+ score: found ? 1 : 0,
322
+ explanation: found
323
+ ? `Found: ${toolNames.join(", ")}`
324
+ : `Expected "${expectedTool}" but found none`,
325
+ };
326
+ },
327
+ }),
328
+ ],
329
+ });
330
+ ```
331
+
332
+ Key points:
333
+
334
+ - Use `setGlobalTracerProvider: true` on `runExperiment()` so that child spans from `traceTool` or other OTel instrumentation land in the same trace as the task's root span
335
+ - Use `evaluateExperiment()` as a separate step so spans are ingested before querying
336
+ - Use `getSpans()` with `traceIds` and `spanKind` filters to fetch specific spans from the task trace
337
+ - `traceId` is `null` in dry-run mode since no real traces are recorded
233
338
 
234
339
  ## What `runExperiment()` Returns
235
340
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@arizeai/phoenix-client",
3
- "version": "6.5.5",
3
+ "version": "6.6.0",
4
4
  "description": "A client for the Phoenix API",
5
5
  "keywords": [
6
6
  "arize",
@@ -79,8 +79,8 @@
79
79
  "openapi-fetch": "^0.12.5",
80
80
  "tiny-invariant": "^1.3.3",
81
81
  "zod": "^4.0.14",
82
- "@arizeai/phoenix-otel": "0.4.3",
83
- "@arizeai/phoenix-config": "0.1.3"
82
+ "@arizeai/phoenix-config": "0.1.3",
83
+ "@arizeai/phoenix-otel": "0.4.3"
84
84
  },
85
85
  "devDependencies": {
86
86
  "@ai-sdk/openai": "^3.0.29",
@@ -692,6 +692,7 @@ async function runSingleEvaluation({
692
692
  output: taskOutput,
693
693
  expected: expectedOutput,
694
694
  metadata: datasetExample.metadata,
695
+ traceId: experimentRun.traceId,
695
696
  })
696
697
  );
697
698
  results = Array.isArray(result) ? result : [result];
@@ -746,6 +747,7 @@ async function runSingleEvaluation({
746
747
  output: taskOutput,
747
748
  expected: expectedOutput,
748
749
  metadata: datasetExample.metadata,
750
+ traceId: experimentRun.traceId,
749
751
  })
750
752
  );
751
753
 
@@ -853,6 +853,7 @@ async function runEvaluator({
853
853
  output: run.output ?? null,
854
854
  expected: example.output,
855
855
  metadata: example?.metadata,
856
+ traceId: run.traceId,
856
857
  });
857
858
  thisEval.result = result;
858
859
  } catch (error) {
@@ -131,6 +131,12 @@ export type EvaluatorParams<TaskOutputType = TaskOutput> = {
131
131
  * Metadata associated with the Dataset Example
132
132
  */
133
133
  metadata?: Example["metadata"];
134
+ /**
135
+ * The trace ID of the task run, if available.
136
+ * Can be used to fetch and analyze the task's trace
137
+ * (e.g., for trajectory evaluation or action verification).
138
+ */
139
+ traceId?: string | null;
134
140
  };
135
141
 
136
142
  export type Evaluator = {