@arizeai/phoenix-client 6.5.5 → 6.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/esm/experiments/resumeEvaluation.js +2 -0
- package/dist/esm/experiments/resumeEvaluation.js.map +1 -1
- package/dist/esm/experiments/runExperiment.d.ts.map +1 -1
- package/dist/esm/experiments/runExperiment.js +1 -0
- package/dist/esm/experiments/runExperiment.js.map +1 -1
- package/dist/esm/tsconfig.esm.tsbuildinfo +1 -1
- package/dist/esm/types/experiments.d.ts +6 -0
- package/dist/esm/types/experiments.d.ts.map +1 -1
- package/dist/src/experiments/resumeEvaluation.js +2 -0
- package/dist/src/experiments/resumeEvaluation.js.map +1 -1
- package/dist/src/experiments/runExperiment.d.ts.map +1 -1
- package/dist/src/experiments/runExperiment.js +1 -0
- package/dist/src/experiments/runExperiment.js.map +1 -1
- package/dist/src/types/experiments.d.ts +6 -0
- package/dist/src/types/experiments.d.ts.map +1 -1
- package/dist/tsconfig.tsbuildinfo +1 -1
- package/docs/experiments.mdx +106 -1
- package/package.json +3 -3
- package/src/experiments/resumeEvaluation.ts +2 -0
- package/src/experiments/runExperiment.ts +1 -0
- package/src/types/experiments.ts +6 -0
package/docs/experiments.mdx
CHANGED
|
@@ -12,6 +12,8 @@ The experiments module runs tasks over dataset examples, records experiment runs
|
|
|
12
12
|
<li><code>src/experiments/helpers/getExperimentEvaluators.ts</code> for evaluator normalization</li>
|
|
13
13
|
<li><code>src/experiments/helpers/fromPhoenixLLMEvaluator.ts</code> for the phoenix-evals bridge</li>
|
|
14
14
|
<li><code>src/experiments/getExperimentRuns.ts</code> for reading runs back after execution</li>
|
|
15
|
+
<li><code>src/types/experiments.ts</code> for <code>EvaluatorParams</code> including <code>traceId</code></li>
|
|
16
|
+
<li><code>src/spans/getSpans.ts</code> for fetching spans by trace ID and span kind</li>
|
|
15
17
|
</ul>
|
|
16
18
|
</section>
|
|
17
19
|
|
|
@@ -226,10 +228,113 @@ When an evaluator runs, it receives a normalized object with these fields:
|
|
|
226
228
|
| `output` | The task output for that run |
|
|
227
229
|
| `expected` | The dataset example's `output` object |
|
|
228
230
|
| `metadata` | The dataset example's `metadata` object |
|
|
231
|
+
| `traceId` | The OpenTelemetry trace ID of the task run (optional, `string \| null`) |
|
|
229
232
|
|
|
230
233
|
This is why the `createClassificationEvaluator()` prompt can reference `{{input.question}}` and `{{output}}`.
|
|
231
234
|
|
|
232
|
-
For code-based evaluators created with `asExperimentEvaluator()`, those same fields are available inside `evaluate({ input, output, expected, metadata })`.
|
|
235
|
+
For code-based evaluators created with `asExperimentEvaluator()`, those same fields are available inside `evaluate({ input, output, expected, metadata, traceId })`.
|
|
236
|
+
|
|
237
|
+
## Trace-Based Evaluation
|
|
238
|
+
|
|
239
|
+
Each task run captures an OpenTelemetry trace ID. Evaluators can use `traceId` to fetch the task's spans from Phoenix and evaluate the execution trajectory — for example, verifying that specific tool calls were made or inspecting intermediate steps.
|
|
240
|
+
|
|
241
|
+
This pattern works best with `evaluateExperiment()` as a separate step after `runExperiment()`, so that all task spans are ingested into Phoenix before the evaluator queries them.
|
|
242
|
+
|
|
243
|
+
```ts
|
|
244
|
+
import { traceTool } from "@arizeai/openinference-core";
|
|
245
|
+
import { createClient } from "@arizeai/phoenix-client";
|
|
246
|
+
import { createDataset } from "@arizeai/phoenix-client/datasets";
|
|
247
|
+
import {
|
|
248
|
+
asExperimentEvaluator,
|
|
249
|
+
evaluateExperiment,
|
|
250
|
+
runExperiment,
|
|
251
|
+
} from "@arizeai/phoenix-client/experiments";
|
|
252
|
+
import { getSpans } from "@arizeai/phoenix-client/spans";
|
|
253
|
+
|
|
254
|
+
const client = createClient();
|
|
255
|
+
|
|
256
|
+
const { datasetId } = await createDataset({
|
|
257
|
+
client,
|
|
258
|
+
name: "tool-call-dataset",
|
|
259
|
+
description: "Questions that require tool use",
|
|
260
|
+
examples: [
|
|
261
|
+
{
|
|
262
|
+
input: { question: "What is the weather in San Francisco?" },
|
|
263
|
+
output: { expectedTool: "getWeather" },
|
|
264
|
+
metadata: {},
|
|
265
|
+
},
|
|
266
|
+
],
|
|
267
|
+
});
|
|
268
|
+
|
|
269
|
+
// Step 1: Run the experiment with traced tool calls
|
|
270
|
+
const experiment = await runExperiment({
|
|
271
|
+
client,
|
|
272
|
+
dataset: { datasetId },
|
|
273
|
+
setGlobalTracerProvider: true,
|
|
274
|
+
task: async (example) => {
|
|
275
|
+
// traceTool wraps a function with a TOOL span
|
|
276
|
+
const getWeather = traceTool(
|
|
277
|
+
({ location }: { location: string }) => ({
|
|
278
|
+
location,
|
|
279
|
+
temperature: 72,
|
|
280
|
+
condition: "sunny",
|
|
281
|
+
}),
|
|
282
|
+
{ name: "getWeather" }
|
|
283
|
+
);
|
|
284
|
+
|
|
285
|
+
const city = (example.input.question as string).match(/in (.+)\?/)?.[1];
|
|
286
|
+
const result = getWeather({ location: city ?? "Unknown" });
|
|
287
|
+
return `The weather in ${result.location} is ${result.temperature}F.`;
|
|
288
|
+
},
|
|
289
|
+
});
|
|
290
|
+
|
|
291
|
+
const projectName = experiment.projectName!;
|
|
292
|
+
|
|
293
|
+
// Step 2: Evaluate using traceId to inspect the task's spans
|
|
294
|
+
const evaluated = await evaluateExperiment({
|
|
295
|
+
client,
|
|
296
|
+
experiment,
|
|
297
|
+
evaluators: [
|
|
298
|
+
asExperimentEvaluator({
|
|
299
|
+
name: "has-expected-tool-call",
|
|
300
|
+
kind: "CODE",
|
|
301
|
+
evaluate: async ({ traceId, expected }) => {
|
|
302
|
+
if (!traceId) {
|
|
303
|
+
return { label: "no trace", score: 0 };
|
|
304
|
+
}
|
|
305
|
+
|
|
306
|
+
// Fetch TOOL spans from this task's trace
|
|
307
|
+
const { spans: toolSpans } = await getSpans({
|
|
308
|
+
client,
|
|
309
|
+
project: { projectName },
|
|
310
|
+
traceIds: [traceId],
|
|
311
|
+
spanKind: "TOOL",
|
|
312
|
+
});
|
|
313
|
+
|
|
314
|
+
const expectedTool = (expected as { expectedTool?: string })
|
|
315
|
+
?.expectedTool;
|
|
316
|
+
const toolNames = toolSpans.map((s) => s.name);
|
|
317
|
+
const found = toolNames.some((name) => name.includes(expectedTool!));
|
|
318
|
+
|
|
319
|
+
return {
|
|
320
|
+
label: found ? "tool called" : "no tool call",
|
|
321
|
+
score: found ? 1 : 0,
|
|
322
|
+
explanation: found
|
|
323
|
+
? `Found: ${toolNames.join(", ")}`
|
|
324
|
+
: `Expected "${expectedTool}" but found none`,
|
|
325
|
+
};
|
|
326
|
+
},
|
|
327
|
+
}),
|
|
328
|
+
],
|
|
329
|
+
});
|
|
330
|
+
```
|
|
331
|
+
|
|
332
|
+
Key points:
|
|
333
|
+
|
|
334
|
+
- Use `setGlobalTracerProvider: true` on `runExperiment()` so that child spans from `traceTool` or other OTel instrumentation land in the same trace as the task
|
|
335
|
+
- Use `evaluateExperiment()` as a separate step so spans are ingested before querying
|
|
336
|
+
- Use `getSpans()` with `traceIds` and `spanKind` filters to fetch specific spans from the task trace
|
|
337
|
+
- `traceId` is `null` in dry-run mode since no real traces are recorded
|
|
233
338
|
|
|
234
339
|
## What `runExperiment()` Returns
|
|
235
340
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@arizeai/phoenix-client",
|
|
3
|
-
"version": "6.5.5",
|
|
3
|
+
"version": "6.6.0",
|
|
4
4
|
"description": "A client for the Phoenix API",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"arize",
|
|
@@ -79,8 +79,8 @@
|
|
|
79
79
|
"openapi-fetch": "^0.12.5",
|
|
80
80
|
"tiny-invariant": "^1.3.3",
|
|
81
81
|
"zod": "^4.0.14",
|
|
82
|
-
"@arizeai/phoenix-
|
|
83
|
-
"@arizeai/phoenix-
|
|
82
|
+
"@arizeai/phoenix-config": "0.1.3",
|
|
83
|
+
"@arizeai/phoenix-otel": "0.4.3"
|
|
84
84
|
},
|
|
85
85
|
"devDependencies": {
|
|
86
86
|
"@ai-sdk/openai": "^3.0.29",
|
|
package/src/experiments/resumeEvaluation.ts
CHANGED
|
@@ -692,6 +692,7 @@ async function runSingleEvaluation({
|
|
|
692
692
|
output: taskOutput,
|
|
693
693
|
expected: expectedOutput,
|
|
694
694
|
metadata: datasetExample.metadata,
|
|
695
|
+
traceId: experimentRun.traceId,
|
|
695
696
|
})
|
|
696
697
|
);
|
|
697
698
|
results = Array.isArray(result) ? result : [result];
|
|
@@ -746,6 +747,7 @@ async function runSingleEvaluation({
|
|
|
746
747
|
output: taskOutput,
|
|
747
748
|
expected: expectedOutput,
|
|
748
749
|
metadata: datasetExample.metadata,
|
|
750
|
+
traceId: experimentRun.traceId,
|
|
749
751
|
})
|
|
750
752
|
);
|
|
751
753
|
|
package/src/types/experiments.ts
CHANGED
|
@@ -131,6 +131,12 @@ export type EvaluatorParams<TaskOutputType = TaskOutput> = {
|
|
|
131
131
|
* Metadata associated with the Dataset Example
|
|
132
132
|
*/
|
|
133
133
|
metadata?: Example["metadata"];
|
|
134
|
+
/**
|
|
135
|
+
* The trace ID of the task run, if available.
|
|
136
|
+
* Can be used to fetch and analyze the task's trace
|
|
137
|
+
* (e.g., for trajectory evaluation or action verification).
|
|
138
|
+
*/
|
|
139
|
+
traceId?: string | null;
|
|
134
140
|
};
|
|
135
141
|
|
|
136
142
|
export type Evaluator = {
|