@arizeai/phoenix-client 6.5.4 → 6.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/esm/__generated__/api/v1.d.ts +244 -0
- package/dist/esm/__generated__/api/v1.d.ts.map +1 -1
- package/dist/esm/experiments/resumeEvaluation.d.ts.map +1 -1
- package/dist/esm/experiments/resumeEvaluation.js +181 -170
- package/dist/esm/experiments/resumeEvaluation.js.map +1 -1
- package/dist/esm/experiments/resumeExperiment.d.ts.map +1 -1
- package/dist/esm/experiments/resumeExperiment.js +201 -185
- package/dist/esm/experiments/resumeExperiment.js.map +1 -1
- package/dist/esm/experiments/runExperiment.d.ts.map +1 -1
- package/dist/esm/experiments/runExperiment.js +239 -207
- package/dist/esm/experiments/runExperiment.js.map +1 -1
- package/dist/esm/experiments/tracing.d.ts +10 -0
- package/dist/esm/experiments/tracing.d.ts.map +1 -0
- package/dist/esm/experiments/tracing.js +21 -0
- package/dist/esm/experiments/tracing.js.map +1 -0
- package/dist/esm/prompts/sdks/toSDK.d.ts +2 -2
- package/dist/esm/tsconfig.esm.tsbuildinfo +1 -1
- package/dist/esm/types/experiments.d.ts +6 -0
- package/dist/esm/types/experiments.d.ts.map +1 -1
- package/dist/esm/utils/formatPromptMessages.d.ts.map +1 -1
- package/dist/esm/utils/getPromptBySelector.d.ts.map +1 -1
- package/dist/src/__generated__/api/v1.d.ts +244 -0
- package/dist/src/__generated__/api/v1.d.ts.map +1 -1
- package/dist/src/experiments/resumeEvaluation.d.ts.map +1 -1
- package/dist/src/experiments/resumeEvaluation.js +194 -183
- package/dist/src/experiments/resumeEvaluation.js.map +1 -1
- package/dist/src/experiments/resumeExperiment.d.ts.map +1 -1
- package/dist/src/experiments/resumeExperiment.js +214 -198
- package/dist/src/experiments/resumeExperiment.js.map +1 -1
- package/dist/src/experiments/runExperiment.d.ts.map +1 -1
- package/dist/src/experiments/runExperiment.js +229 -197
- package/dist/src/experiments/runExperiment.js.map +1 -1
- package/dist/src/experiments/tracing.d.ts +10 -0
- package/dist/src/experiments/tracing.d.ts.map +1 -0
- package/dist/src/experiments/tracing.js +24 -0
- package/dist/src/experiments/tracing.js.map +1 -0
- package/dist/src/types/experiments.d.ts +6 -0
- package/dist/src/types/experiments.d.ts.map +1 -1
- package/dist/src/utils/formatPromptMessages.d.ts.map +1 -1
- package/dist/src/utils/getPromptBySelector.d.ts.map +1 -1
- package/dist/tsconfig.tsbuildinfo +1 -1
- package/docs/annotations.mdx +83 -0
- package/docs/datasets.mdx +77 -0
- package/docs/document-annotations.mdx +208 -0
- package/docs/experiments.mdx +376 -0
- package/docs/overview.mdx +176 -0
- package/docs/prompts.mdx +73 -0
- package/docs/session-annotations.mdx +158 -0
- package/docs/sessions.mdx +87 -0
- package/docs/span-annotations.mdx +283 -0
- package/docs/spans.mdx +76 -0
- package/docs/traces.mdx +63 -0
- package/package.json +9 -3
- package/src/__generated__/api/v1.ts +244 -0
- package/src/experiments/resumeEvaluation.ts +226 -206
- package/src/experiments/resumeExperiment.ts +237 -213
- package/src/experiments/runExperiment.ts +282 -243
- package/src/experiments/tracing.ts +30 -0
- package/src/types/experiments.ts +6 -0
|
@@ -3,14 +3,18 @@ import {
|
|
|
3
3
|
OpenInferenceSpanKind,
|
|
4
4
|
SemanticConventions,
|
|
5
5
|
} from "@arizeai/openinference-semantic-conventions";
|
|
6
|
-
import type {
|
|
6
|
+
import type {
|
|
7
|
+
GlobalTracerProviderRegistration,
|
|
8
|
+
NodeTracerProvider,
|
|
9
|
+
Tracer,
|
|
10
|
+
} from "@arizeai/phoenix-otel";
|
|
7
11
|
import {
|
|
12
|
+
attachGlobalTracerProvider,
|
|
8
13
|
createNoOpProvider,
|
|
9
14
|
type DiagLogLevel,
|
|
10
15
|
objectAsAttributes,
|
|
11
16
|
register,
|
|
12
17
|
SpanStatusCode,
|
|
13
|
-
trace,
|
|
14
18
|
} from "@arizeai/phoenix-otel";
|
|
15
19
|
import { queue } from "async";
|
|
16
20
|
import invariant from "tiny-invariant";
|
|
@@ -53,6 +57,7 @@ import {
|
|
|
53
57
|
logTaskSummary,
|
|
54
58
|
PROGRESS_PREFIX,
|
|
55
59
|
} from "./logging";
|
|
60
|
+
import { cleanupOwnedTracerProvider } from "./tracing";
|
|
56
61
|
|
|
57
62
|
/**
|
|
58
63
|
* Validate that a repetition is valid
|
|
@@ -187,7 +192,8 @@ export async function runExperiment({
|
|
|
187
192
|
isValidRepetitionParam(repetitions),
|
|
188
193
|
"repetitions must be an integer greater than 0"
|
|
189
194
|
);
|
|
190
|
-
let
|
|
195
|
+
let taskProvider: NodeTracerProvider | undefined;
|
|
196
|
+
let taskGlobalRegistration: GlobalTracerProviderRegistration | null = null;
|
|
191
197
|
const isDryRun = typeof dryRun === "number" || dryRun === true;
|
|
192
198
|
const client = _client ?? createClient();
|
|
193
199
|
const dataset = await getDataset({
|
|
@@ -272,7 +278,7 @@ export async function runExperiment({
|
|
|
272
278
|
"Phoenix base URL not found. Please set PHOENIX_HOST or set baseUrl on the client."
|
|
273
279
|
);
|
|
274
280
|
|
|
275
|
-
|
|
281
|
+
taskProvider = register({
|
|
276
282
|
projectName,
|
|
277
283
|
url: baseUrl,
|
|
278
284
|
headers: client.config.headers
|
|
@@ -280,122 +286,145 @@ export async function runExperiment({
|
|
|
280
286
|
: undefined,
|
|
281
287
|
batch: useBatchSpanProcessor,
|
|
282
288
|
diagLogLevel,
|
|
283
|
-
global:
|
|
289
|
+
global: false,
|
|
284
290
|
});
|
|
291
|
+
taskGlobalRegistration = setGlobalTracerProvider
|
|
292
|
+
? attachGlobalTracerProvider(taskProvider)
|
|
293
|
+
: null;
|
|
285
294
|
|
|
286
|
-
taskTracer =
|
|
295
|
+
taskTracer = taskProvider.getTracer(projectName);
|
|
287
296
|
}
|
|
288
|
-
|
|
297
|
+
try {
|
|
298
|
+
if (!record) {
|
|
299
|
+
logger.info(
|
|
300
|
+
`Running experiment in readonly mode. Results will not be recorded.`
|
|
301
|
+
);
|
|
302
|
+
}
|
|
303
|
+
|
|
304
|
+
const links: Array<{ label: string; url: string }> = [];
|
|
305
|
+
if (!isDryRun && client.config.baseUrl) {
|
|
306
|
+
links.push({
|
|
307
|
+
label: "Dataset",
|
|
308
|
+
url: getDatasetUrl({
|
|
309
|
+
baseUrl: client.config.baseUrl,
|
|
310
|
+
datasetId: dataset.id,
|
|
311
|
+
}),
|
|
312
|
+
});
|
|
313
|
+
links.push({
|
|
314
|
+
label: "Experiments",
|
|
315
|
+
url: getDatasetExperimentsUrl({
|
|
316
|
+
baseUrl: client.config.baseUrl,
|
|
317
|
+
datasetId: dataset.id,
|
|
318
|
+
}),
|
|
319
|
+
});
|
|
320
|
+
links.push({
|
|
321
|
+
label: "Experiment",
|
|
322
|
+
url: getExperimentUrl({
|
|
323
|
+
baseUrl: client.config.baseUrl,
|
|
324
|
+
datasetId: dataset.id,
|
|
325
|
+
experimentId: experiment.id,
|
|
326
|
+
}),
|
|
327
|
+
});
|
|
328
|
+
}
|
|
329
|
+
|
|
330
|
+
const evCount = evaluators?.length ?? 0;
|
|
289
331
|
logger.info(
|
|
290
|
-
|
|
332
|
+
`${PROGRESS_PREFIX.start}Experiment ${experimentName || "<unnamed>"} (dataset ${dataset.name}, ${nExamples} ${pluralize("example", nExamples)}, ${evCount} ${pluralize("evaluator", evCount)})`
|
|
291
333
|
);
|
|
292
|
-
}
|
|
293
334
|
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
335
|
+
const runs: Record<ExperimentRunID, ExperimentRun> = {};
|
|
336
|
+
await runTaskWithExamples({
|
|
337
|
+
client,
|
|
338
|
+
experimentId: experiment.id,
|
|
339
|
+
task,
|
|
340
|
+
dataset,
|
|
341
|
+
logger,
|
|
342
|
+
onComplete: (run) => {
|
|
343
|
+
runs[run.id] = run;
|
|
344
|
+
},
|
|
345
|
+
concurrency,
|
|
346
|
+
isDryRun,
|
|
347
|
+
nExamples,
|
|
348
|
+
tracer: taskTracer,
|
|
349
|
+
repetitions,
|
|
302
350
|
});
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
351
|
+
const taskRuns = Object.values(runs);
|
|
352
|
+
const taskErrors = taskRuns.filter((run) => run.error != null).length;
|
|
353
|
+
const taskTotal = nExamples * repetitions;
|
|
354
|
+
const taskOkStr =
|
|
355
|
+
taskErrors > 0
|
|
356
|
+
? `${taskRuns.length - taskErrors}/${taskTotal} ok (${taskErrors} failed)`
|
|
357
|
+
: `${taskRuns.length}/${taskTotal} ok`;
|
|
358
|
+
logger.info(`${PROGRESS_PREFIX.completed}Tasks ${taskOkStr}`);
|
|
359
|
+
|
|
360
|
+
const ranExperiment: RanExperiment = {
|
|
361
|
+
...experiment,
|
|
362
|
+
runs,
|
|
363
|
+
};
|
|
364
|
+
|
|
365
|
+
await cleanupOwnedTracerProvider({
|
|
366
|
+
provider: taskProvider,
|
|
367
|
+
globalRegistration: taskGlobalRegistration,
|
|
309
368
|
});
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
369
|
+
taskProvider = undefined;
|
|
370
|
+
taskGlobalRegistration = null;
|
|
371
|
+
|
|
372
|
+
if (evaluators && evaluators.length > 0) {
|
|
373
|
+
const evNames = getExperimentEvaluators(evaluators)
|
|
374
|
+
.map((evaluator) => evaluator.name)
|
|
375
|
+
.join(", ");
|
|
376
|
+
logger.info(`${PROGRESS_PREFIX.start}Evaluations (${evNames})`);
|
|
377
|
+
}
|
|
378
|
+
|
|
379
|
+
const { evaluationRuns } = await evaluateExperiment({
|
|
380
|
+
experiment: ranExperiment,
|
|
381
|
+
evaluators: evaluators ?? [],
|
|
382
|
+
client,
|
|
383
|
+
logger,
|
|
384
|
+
concurrency,
|
|
385
|
+
dryRun,
|
|
386
|
+
diagLogLevel,
|
|
387
|
+
useBatchSpanProcessor,
|
|
388
|
+
setGlobalTracerProvider,
|
|
317
389
|
});
|
|
318
|
-
|
|
390
|
+
ranExperiment.evaluationRuns = evaluationRuns;
|
|
319
391
|
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
392
|
+
// Refresh experiment info from server to get updated counts (non-dry-run only)
|
|
393
|
+
if (!isDryRun) {
|
|
394
|
+
const updatedExperiment = await getExperimentInfo({
|
|
395
|
+
client,
|
|
396
|
+
experimentId: experiment.id,
|
|
397
|
+
});
|
|
398
|
+
// Update the experiment info with the latest from the server
|
|
399
|
+
Object.assign(ranExperiment, updatedExperiment);
|
|
400
|
+
}
|
|
324
401
|
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
logger,
|
|
332
|
-
onComplete: (run) => {
|
|
333
|
-
runs[run.id] = run;
|
|
334
|
-
},
|
|
335
|
-
concurrency,
|
|
336
|
-
isDryRun,
|
|
337
|
-
nExamples,
|
|
338
|
-
tracer: taskTracer,
|
|
339
|
-
repetitions,
|
|
340
|
-
});
|
|
341
|
-
const taskRuns = Object.values(runs);
|
|
342
|
-
const taskErrors = taskRuns.filter((run) => run.error != null).length;
|
|
343
|
-
const taskTotal = nExamples * repetitions;
|
|
344
|
-
const taskOkStr =
|
|
345
|
-
taskErrors > 0
|
|
346
|
-
? `${taskRuns.length - taskErrors}/${taskTotal} ok (${taskErrors} failed)`
|
|
347
|
-
: `${taskRuns.length}/${taskTotal} ok`;
|
|
348
|
-
logger.info(`${PROGRESS_PREFIX.completed}Tasks ${taskOkStr}`);
|
|
349
|
-
|
|
350
|
-
const ranExperiment: RanExperiment = {
|
|
351
|
-
...experiment,
|
|
352
|
-
runs,
|
|
353
|
-
};
|
|
402
|
+
logTaskSummary(logger, {
|
|
403
|
+
nExamples,
|
|
404
|
+
repetitions,
|
|
405
|
+
nRuns: taskRuns.length,
|
|
406
|
+
nErrors: taskErrors,
|
|
407
|
+
});
|
|
354
408
|
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
.
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
409
|
+
if (
|
|
410
|
+
ranExperiment.evaluationRuns &&
|
|
411
|
+
ranExperiment.evaluationRuns.length > 0
|
|
412
|
+
) {
|
|
413
|
+
logEvalSummary(logger, ranExperiment.evaluationRuns);
|
|
414
|
+
}
|
|
361
415
|
|
|
362
|
-
|
|
363
|
-
experiment: ranExperiment,
|
|
364
|
-
evaluators: evaluators ?? [],
|
|
365
|
-
client,
|
|
366
|
-
logger,
|
|
367
|
-
concurrency,
|
|
368
|
-
dryRun,
|
|
369
|
-
tracerProvider: provider,
|
|
370
|
-
diagLogLevel,
|
|
371
|
-
useBatchSpanProcessor,
|
|
372
|
-
});
|
|
373
|
-
ranExperiment.evaluationRuns = evaluationRuns;
|
|
416
|
+
logLinks(logger, links);
|
|
374
417
|
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
418
|
+
return ranExperiment;
|
|
419
|
+
} finally {
|
|
420
|
+
// Safety net: on error paths the happy-path cleanup above is skipped,
|
|
421
|
+
// so ensure the task provider is always cleaned up. On the happy path
|
|
422
|
+
// taskProvider is already undefined (no-op).
|
|
423
|
+
await cleanupOwnedTracerProvider({
|
|
424
|
+
provider: taskProvider,
|
|
425
|
+
globalRegistration: taskGlobalRegistration,
|
|
380
426
|
});
|
|
381
|
-
// Update the experiment info with the latest from the server
|
|
382
|
-
Object.assign(ranExperiment, updatedExperiment);
|
|
383
427
|
}
|
|
384
|
-
|
|
385
|
-
logTaskSummary(logger, {
|
|
386
|
-
nExamples,
|
|
387
|
-
repetitions,
|
|
388
|
-
nRuns: taskRuns.length,
|
|
389
|
-
nErrors: taskErrors,
|
|
390
|
-
});
|
|
391
|
-
|
|
392
|
-
if (ranExperiment.evaluationRuns && ranExperiment.evaluationRuns.length > 0) {
|
|
393
|
-
logEvalSummary(logger, ranExperiment.evaluationRuns);
|
|
394
|
-
}
|
|
395
|
-
|
|
396
|
-
logLinks(logger, links);
|
|
397
|
-
|
|
398
|
-
return ranExperiment;
|
|
399
428
|
}
|
|
400
429
|
|
|
401
430
|
/**
|
|
@@ -600,6 +629,8 @@ export async function evaluateExperiment({
|
|
|
600
629
|
"Phoenix base URL not found. Please set PHOENIX_HOST or set baseUrl on the client."
|
|
601
630
|
);
|
|
602
631
|
let provider: NodeTracerProvider;
|
|
632
|
+
let globalRegistration: GlobalTracerProviderRegistration | null = null;
|
|
633
|
+
const ownsProvider = !paramsTracerProvider;
|
|
603
634
|
|
|
604
635
|
// Always allow changing of tracer providers
|
|
605
636
|
if (paramsTracerProvider) {
|
|
@@ -613,165 +644,172 @@ export async function evaluateExperiment({
|
|
|
613
644
|
: undefined,
|
|
614
645
|
batch: useBatchSpanProcessor,
|
|
615
646
|
diagLogLevel,
|
|
616
|
-
global:
|
|
647
|
+
global: false,
|
|
617
648
|
});
|
|
649
|
+
globalRegistration = setGlobalTracerProvider
|
|
650
|
+
? attachGlobalTracerProvider(provider)
|
|
651
|
+
: null;
|
|
618
652
|
} else {
|
|
619
653
|
provider = createNoOpProvider();
|
|
620
654
|
}
|
|
621
655
|
const tracer = isDryRun
|
|
622
656
|
? provider.getTracer("no-op")
|
|
623
657
|
: provider.getTracer("evaluators");
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
dataset
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
658
|
+
try {
|
|
659
|
+
const nRuns =
|
|
660
|
+
typeof dryRun === "number"
|
|
661
|
+
? Math.min(dryRun, Object.keys(experiment.runs).length)
|
|
662
|
+
: Object.keys(experiment.runs).length;
|
|
663
|
+
const dataset = await getDataset({
|
|
664
|
+
dataset: {
|
|
665
|
+
datasetId: experiment.datasetId,
|
|
666
|
+
versionId: experiment.datasetVersionId,
|
|
667
|
+
splits: experiment.datasetSplits,
|
|
668
|
+
},
|
|
669
|
+
client,
|
|
670
|
+
});
|
|
671
|
+
invariant(dataset, `Dataset "${experiment.datasetId}" not found`);
|
|
672
|
+
invariant(
|
|
673
|
+
dataset.examples.length > 0,
|
|
674
|
+
`Dataset "${experiment.datasetId}" has no examples`
|
|
675
|
+
);
|
|
676
|
+
invariant(experiment.runs, `Experiment "${experiment.id}" has no runs`);
|
|
642
677
|
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
678
|
+
const runsToEvaluate = Object.values(experiment.runs).slice(0, nRuns);
|
|
679
|
+
if (evaluators?.length === 0) {
|
|
680
|
+
return {
|
|
681
|
+
...experiment,
|
|
682
|
+
evaluationRuns: [],
|
|
683
|
+
};
|
|
684
|
+
}
|
|
685
|
+
type EvaluationId = string;
|
|
686
|
+
const evaluationRuns: Record<EvaluationId, ExperimentEvaluationRun> = {};
|
|
652
687
|
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
|
|
688
|
+
const examplesById: Record<string, Example> = {};
|
|
689
|
+
for (const example of dataset.examples) {
|
|
690
|
+
examplesById[example.id] = example;
|
|
691
|
+
}
|
|
657
692
|
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
693
|
+
const onEvaluationComplete = (run: ExperimentEvaluationRun) => {
|
|
694
|
+
evaluationRuns[run.id] = run;
|
|
695
|
+
};
|
|
661
696
|
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
|
|
666
|
-
|
|
667
|
-
|
|
668
|
-
|
|
669
|
-
|
|
670
|
-
|
|
671
|
-
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
|
|
682
|
-
});
|
|
683
|
-
span.setAttributes({
|
|
684
|
-
[SemanticConventions.OPENINFERENCE_SPAN_KIND]:
|
|
685
|
-
OpenInferenceSpanKind.EVALUATOR,
|
|
686
|
-
[SemanticConventions.INPUT_MIME_TYPE]: MimeType.JSON,
|
|
687
|
-
[SemanticConventions.INPUT_VALUE]: ensureString({
|
|
688
|
-
input: examplesById[evaluatorAndRun.run.datasetExampleId]?.input,
|
|
689
|
-
output: evaluatorAndRun.run.output,
|
|
690
|
-
expected:
|
|
691
|
-
examplesById[evaluatorAndRun.run.datasetExampleId]?.output,
|
|
692
|
-
metadata:
|
|
693
|
-
examplesById[evaluatorAndRun.run.datasetExampleId]?.metadata,
|
|
694
|
-
}),
|
|
695
|
-
[SemanticConventions.OUTPUT_MIME_TYPE]: MimeType.JSON,
|
|
696
|
-
[SemanticConventions.OUTPUT_VALUE]: ensureString(evalResult.result),
|
|
697
|
-
});
|
|
698
|
-
if (evalResult.error) {
|
|
699
|
-
span.setStatus({
|
|
700
|
-
code: SpanStatusCode.ERROR,
|
|
701
|
-
message: evalResult.error,
|
|
697
|
+
// Run evaluators against all runs
|
|
698
|
+
// Flat list of evaluator + run tuples
|
|
699
|
+
const normalizedEvaluators = getExperimentEvaluators(evaluators);
|
|
700
|
+
const evaluatorsAndRuns = normalizedEvaluators.flatMap((evaluator) =>
|
|
701
|
+
runsToEvaluate.map((run) => ({
|
|
702
|
+
evaluator,
|
|
703
|
+
run,
|
|
704
|
+
}))
|
|
705
|
+
);
|
|
706
|
+
const evaluatorsQueue = queue(
|
|
707
|
+
async (evaluatorAndRun: { evaluator: Evaluator; run: ExperimentRun }) => {
|
|
708
|
+
return tracer.startActiveSpan(
|
|
709
|
+
`Evaluation: ${evaluatorAndRun.evaluator.name}`,
|
|
710
|
+
async (span) => {
|
|
711
|
+
const evalResult = await runEvaluator({
|
|
712
|
+
evaluator: evaluatorAndRun.evaluator,
|
|
713
|
+
run: evaluatorAndRun.run,
|
|
714
|
+
exampleCache: examplesById,
|
|
715
|
+
onComplete: onEvaluationComplete,
|
|
716
|
+
logger,
|
|
702
717
|
});
|
|
703
|
-
|
|
704
|
-
|
|
705
|
-
|
|
706
|
-
|
|
707
|
-
|
|
708
|
-
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
|
|
712
|
-
|
|
713
|
-
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
result: {
|
|
721
|
-
...evalResult.result,
|
|
722
|
-
},
|
|
723
|
-
error: evalResult.error,
|
|
724
|
-
trace_id: evalResult.traceId,
|
|
725
|
-
},
|
|
718
|
+
span.setAttributes({
|
|
719
|
+
[SemanticConventions.OPENINFERENCE_SPAN_KIND]:
|
|
720
|
+
OpenInferenceSpanKind.EVALUATOR,
|
|
721
|
+
[SemanticConventions.INPUT_MIME_TYPE]: MimeType.JSON,
|
|
722
|
+
[SemanticConventions.INPUT_VALUE]: ensureString({
|
|
723
|
+
input:
|
|
724
|
+
examplesById[evaluatorAndRun.run.datasetExampleId]?.input,
|
|
725
|
+
output: evaluatorAndRun.run.output,
|
|
726
|
+
expected:
|
|
727
|
+
examplesById[evaluatorAndRun.run.datasetExampleId]?.output,
|
|
728
|
+
metadata:
|
|
729
|
+
examplesById[evaluatorAndRun.run.datasetExampleId]?.metadata,
|
|
730
|
+
}),
|
|
731
|
+
[SemanticConventions.OUTPUT_MIME_TYPE]: MimeType.JSON,
|
|
732
|
+
[SemanticConventions.OUTPUT_VALUE]: ensureString(
|
|
733
|
+
evalResult.result
|
|
734
|
+
),
|
|
726
735
|
});
|
|
736
|
+
if (evalResult.error) {
|
|
737
|
+
span.setStatus({
|
|
738
|
+
code: SpanStatusCode.ERROR,
|
|
739
|
+
message: evalResult.error,
|
|
740
|
+
});
|
|
741
|
+
} else {
|
|
742
|
+
span.setStatus({ code: SpanStatusCode.OK });
|
|
743
|
+
}
|
|
744
|
+
if (evalResult.result) {
|
|
745
|
+
span.setAttributes(objectAsAttributes(evalResult.result));
|
|
746
|
+
}
|
|
747
|
+
evalResult.traceId = span.spanContext().traceId;
|
|
748
|
+
if (!isDryRun) {
|
|
749
|
+
// Log the evaluation to the server
|
|
750
|
+
// We log this without awaiting (e.g. best effort)
|
|
751
|
+
client.POST("/v1/experiment_evaluations", {
|
|
752
|
+
body: {
|
|
753
|
+
experiment_run_id: evaluatorAndRun.run.id,
|
|
754
|
+
name: evaluatorAndRun.evaluator.name,
|
|
755
|
+
annotator_kind: evaluatorAndRun.evaluator.kind,
|
|
756
|
+
start_time: evalResult.startTime.toISOString(),
|
|
757
|
+
end_time: evalResult.endTime.toISOString(),
|
|
758
|
+
result: {
|
|
759
|
+
...evalResult.result,
|
|
760
|
+
},
|
|
761
|
+
error: evalResult.error,
|
|
762
|
+
trace_id: evalResult.traceId,
|
|
763
|
+
},
|
|
764
|
+
});
|
|
765
|
+
}
|
|
766
|
+
span.end();
|
|
767
|
+
return evalResult;
|
|
727
768
|
}
|
|
728
|
-
|
|
729
|
-
|
|
769
|
+
);
|
|
770
|
+
},
|
|
771
|
+
concurrency
|
|
772
|
+
);
|
|
773
|
+
if (!evaluatorsAndRuns.length) {
|
|
774
|
+
logger.warn(`No evaluators to run`);
|
|
775
|
+
return {
|
|
776
|
+
...experiment,
|
|
777
|
+
evaluationRuns: [],
|
|
778
|
+
};
|
|
779
|
+
}
|
|
780
|
+
evaluatorsAndRuns.forEach((evaluatorAndRun) =>
|
|
781
|
+
evaluatorsQueue.push(evaluatorAndRun, (err) => {
|
|
782
|
+
if (err) {
|
|
783
|
+
logger.error(
|
|
784
|
+
`Error running evaluator "${evaluatorAndRun.evaluator.name}" on run "${evaluatorAndRun.run.id}": ${err}`
|
|
785
|
+
);
|
|
730
786
|
}
|
|
731
|
-
)
|
|
732
|
-
|
|
733
|
-
|
|
734
|
-
|
|
735
|
-
|
|
736
|
-
|
|
787
|
+
})
|
|
788
|
+
);
|
|
789
|
+
await evaluatorsQueue.drain();
|
|
790
|
+
const evalTotal = Object.values(evaluationRuns).length;
|
|
791
|
+
const evalErrors = Object.values(evaluationRuns).filter(
|
|
792
|
+
(ev) => ev.error != null
|
|
793
|
+
).length;
|
|
794
|
+
const evalExpected = runsToEvaluate.length * normalizedEvaluators.length;
|
|
795
|
+
const evalOkStr =
|
|
796
|
+
evalErrors > 0
|
|
797
|
+
? `${evalTotal - evalErrors}/${evalExpected} ok (${evalErrors} failed)`
|
|
798
|
+
: `${evalTotal}/${evalExpected} ok`;
|
|
799
|
+
logger.info(`${PROGRESS_PREFIX.completed}Evaluations ${evalOkStr}`);
|
|
800
|
+
|
|
737
801
|
return {
|
|
738
802
|
...experiment,
|
|
739
|
-
evaluationRuns:
|
|
803
|
+
evaluationRuns: Object.values(evaluationRuns),
|
|
740
804
|
};
|
|
741
|
-
}
|
|
742
|
-
|
|
743
|
-
|
|
744
|
-
|
|
745
|
-
|
|
746
|
-
|
|
747
|
-
);
|
|
748
|
-
}
|
|
749
|
-
})
|
|
750
|
-
);
|
|
751
|
-
await evaluatorsQueue.drain();
|
|
752
|
-
const evalTotal = Object.values(evaluationRuns).length;
|
|
753
|
-
const evalErrors = Object.values(evaluationRuns).filter(
|
|
754
|
-
(ev) => ev.error != null
|
|
755
|
-
).length;
|
|
756
|
-
const evalExpected = runsToEvaluate.length * normalizedEvaluators.length;
|
|
757
|
-
const evalOkStr =
|
|
758
|
-
evalErrors > 0
|
|
759
|
-
? `${evalTotal - evalErrors}/${evalExpected} ok (${evalErrors} failed)`
|
|
760
|
-
: `${evalTotal}/${evalExpected} ok`;
|
|
761
|
-
logger.info(`${PROGRESS_PREFIX.completed}Evaluations ${evalOkStr}`);
|
|
762
|
-
|
|
763
|
-
if (provider) {
|
|
764
|
-
await provider.shutdown();
|
|
765
|
-
// Make sure it's not set globally anymore
|
|
766
|
-
if (setGlobalTracerProvider) {
|
|
767
|
-
trace.disable();
|
|
805
|
+
} finally {
|
|
806
|
+
if (ownsProvider) {
|
|
807
|
+
await cleanupOwnedTracerProvider({
|
|
808
|
+
provider,
|
|
809
|
+
globalRegistration,
|
|
810
|
+
});
|
|
768
811
|
}
|
|
769
812
|
}
|
|
770
|
-
|
|
771
|
-
return {
|
|
772
|
-
...experiment,
|
|
773
|
-
evaluationRuns: Object.values(evaluationRuns),
|
|
774
|
-
};
|
|
775
813
|
}
|
|
776
814
|
|
|
777
815
|
/**
|
|
@@ -815,6 +853,7 @@ async function runEvaluator({
|
|
|
815
853
|
output: run.output ?? null,
|
|
816
854
|
expected: example.output,
|
|
817
855
|
metadata: example?.metadata,
|
|
856
|
+
traceId: run.traceId,
|
|
818
857
|
});
|
|
819
858
|
thisEval.result = result;
|
|
820
859
|
} catch (error) {
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
import type {
|
|
2
|
+
GlobalTracerProviderRegistration,
|
|
3
|
+
NodeTracerProvider,
|
|
4
|
+
} from "@arizeai/phoenix-otel";
|
|
5
|
+
|
|
6
|
+
/**
|
|
7
|
+
* Flushes and shuts down a tracer provider that this package created, then
|
|
8
|
+
* detaches any global OTEL registration it owns so another provider can be mounted.
|
|
9
|
+
*/
|
|
10
|
+
export async function cleanupOwnedTracerProvider({
|
|
11
|
+
provider,
|
|
12
|
+
globalRegistration,
|
|
13
|
+
}: {
|
|
14
|
+
provider: NodeTracerProvider | null | undefined;
|
|
15
|
+
globalRegistration?: GlobalTracerProviderRegistration | null;
|
|
16
|
+
}): Promise<void> {
|
|
17
|
+
if (!provider) {
|
|
18
|
+
return;
|
|
19
|
+
}
|
|
20
|
+
|
|
21
|
+
try {
|
|
22
|
+
await provider.forceFlush();
|
|
23
|
+
} finally {
|
|
24
|
+
try {
|
|
25
|
+
await provider.shutdown();
|
|
26
|
+
} finally {
|
|
27
|
+
globalRegistration?.detach();
|
|
28
|
+
}
|
|
29
|
+
}
|
|
30
|
+
}
|
package/src/types/experiments.ts
CHANGED
|
@@ -131,6 +131,12 @@ export type EvaluatorParams<TaskOutputType = TaskOutput> = {
|
|
|
131
131
|
* Metadata associated with the Dataset Example
|
|
132
132
|
*/
|
|
133
133
|
metadata?: Example["metadata"];
|
|
134
|
+
/**
|
|
135
|
+
* The trace ID of the task run, if available.
|
|
136
|
+
* Can be used to fetch and analyze the task's trace
|
|
137
|
+
* (e.g., for trajectory evaluation or action verification).
|
|
138
|
+
*/
|
|
139
|
+
traceId?: string | null;
|
|
134
140
|
};
|
|
135
141
|
|
|
136
142
|
export type Evaluator = {
|