@arizeai/phoenix-client 6.5.4 → 6.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. package/dist/esm/__generated__/api/v1.d.ts +244 -0
  2. package/dist/esm/__generated__/api/v1.d.ts.map +1 -1
  3. package/dist/esm/experiments/resumeEvaluation.d.ts.map +1 -1
  4. package/dist/esm/experiments/resumeEvaluation.js +181 -170
  5. package/dist/esm/experiments/resumeEvaluation.js.map +1 -1
  6. package/dist/esm/experiments/resumeExperiment.d.ts.map +1 -1
  7. package/dist/esm/experiments/resumeExperiment.js +201 -185
  8. package/dist/esm/experiments/resumeExperiment.js.map +1 -1
  9. package/dist/esm/experiments/runExperiment.d.ts.map +1 -1
  10. package/dist/esm/experiments/runExperiment.js +239 -207
  11. package/dist/esm/experiments/runExperiment.js.map +1 -1
  12. package/dist/esm/experiments/tracing.d.ts +10 -0
  13. package/dist/esm/experiments/tracing.d.ts.map +1 -0
  14. package/dist/esm/experiments/tracing.js +21 -0
  15. package/dist/esm/experiments/tracing.js.map +1 -0
  16. package/dist/esm/prompts/sdks/toSDK.d.ts +2 -2
  17. package/dist/esm/tsconfig.esm.tsbuildinfo +1 -1
  18. package/dist/esm/types/experiments.d.ts +6 -0
  19. package/dist/esm/types/experiments.d.ts.map +1 -1
  20. package/dist/esm/utils/formatPromptMessages.d.ts.map +1 -1
  21. package/dist/esm/utils/getPromptBySelector.d.ts.map +1 -1
  22. package/dist/src/__generated__/api/v1.d.ts +244 -0
  23. package/dist/src/__generated__/api/v1.d.ts.map +1 -1
  24. package/dist/src/experiments/resumeEvaluation.d.ts.map +1 -1
  25. package/dist/src/experiments/resumeEvaluation.js +194 -183
  26. package/dist/src/experiments/resumeEvaluation.js.map +1 -1
  27. package/dist/src/experiments/resumeExperiment.d.ts.map +1 -1
  28. package/dist/src/experiments/resumeExperiment.js +214 -198
  29. package/dist/src/experiments/resumeExperiment.js.map +1 -1
  30. package/dist/src/experiments/runExperiment.d.ts.map +1 -1
  31. package/dist/src/experiments/runExperiment.js +229 -197
  32. package/dist/src/experiments/runExperiment.js.map +1 -1
  33. package/dist/src/experiments/tracing.d.ts +10 -0
  34. package/dist/src/experiments/tracing.d.ts.map +1 -0
  35. package/dist/src/experiments/tracing.js +24 -0
  36. package/dist/src/experiments/tracing.js.map +1 -0
  37. package/dist/src/types/experiments.d.ts +6 -0
  38. package/dist/src/types/experiments.d.ts.map +1 -1
  39. package/dist/src/utils/formatPromptMessages.d.ts.map +1 -1
  40. package/dist/src/utils/getPromptBySelector.d.ts.map +1 -1
  41. package/dist/tsconfig.tsbuildinfo +1 -1
  42. package/docs/annotations.mdx +83 -0
  43. package/docs/datasets.mdx +77 -0
  44. package/docs/document-annotations.mdx +208 -0
  45. package/docs/experiments.mdx +376 -0
  46. package/docs/overview.mdx +176 -0
  47. package/docs/prompts.mdx +73 -0
  48. package/docs/session-annotations.mdx +158 -0
  49. package/docs/sessions.mdx +87 -0
  50. package/docs/span-annotations.mdx +283 -0
  51. package/docs/spans.mdx +76 -0
  52. package/docs/traces.mdx +63 -0
  53. package/package.json +9 -3
  54. package/src/__generated__/api/v1.ts +244 -0
  55. package/src/experiments/resumeEvaluation.ts +226 -206
  56. package/src/experiments/resumeExperiment.ts +237 -213
  57. package/src/experiments/runExperiment.ts +282 -243
  58. package/src/experiments/tracing.ts +30 -0
  59. package/src/types/experiments.ts +6 -0
@@ -3,14 +3,18 @@ import {
3
3
  OpenInferenceSpanKind,
4
4
  SemanticConventions,
5
5
  } from "@arizeai/openinference-semantic-conventions";
6
- import type { NodeTracerProvider, Tracer } from "@arizeai/phoenix-otel";
6
+ import type {
7
+ GlobalTracerProviderRegistration,
8
+ NodeTracerProvider,
9
+ Tracer,
10
+ } from "@arizeai/phoenix-otel";
7
11
  import {
12
+ attachGlobalTracerProvider,
8
13
  createNoOpProvider,
9
14
  type DiagLogLevel,
10
15
  objectAsAttributes,
11
16
  register,
12
17
  SpanStatusCode,
13
- trace,
14
18
  } from "@arizeai/phoenix-otel";
15
19
  import { queue } from "async";
16
20
  import invariant from "tiny-invariant";
@@ -53,6 +57,7 @@ import {
53
57
  logTaskSummary,
54
58
  PROGRESS_PREFIX,
55
59
  } from "./logging";
60
+ import { cleanupOwnedTracerProvider } from "./tracing";
56
61
 
57
62
  /**
58
63
  * Validate that a repetition is valid
@@ -187,7 +192,8 @@ export async function runExperiment({
187
192
  isValidRepetitionParam(repetitions),
188
193
  "repetitions must be an integer greater than 0"
189
194
  );
190
- let provider: NodeTracerProvider | undefined;
195
+ let taskProvider: NodeTracerProvider | undefined;
196
+ let taskGlobalRegistration: GlobalTracerProviderRegistration | null = null;
191
197
  const isDryRun = typeof dryRun === "number" || dryRun === true;
192
198
  const client = _client ?? createClient();
193
199
  const dataset = await getDataset({
@@ -272,7 +278,7 @@ export async function runExperiment({
272
278
  "Phoenix base URL not found. Please set PHOENIX_HOST or set baseUrl on the client."
273
279
  );
274
280
 
275
- provider = register({
281
+ taskProvider = register({
276
282
  projectName,
277
283
  url: baseUrl,
278
284
  headers: client.config.headers
@@ -280,122 +286,145 @@ export async function runExperiment({
280
286
  : undefined,
281
287
  batch: useBatchSpanProcessor,
282
288
  diagLogLevel,
283
- global: setGlobalTracerProvider,
289
+ global: false,
284
290
  });
291
+ taskGlobalRegistration = setGlobalTracerProvider
292
+ ? attachGlobalTracerProvider(taskProvider)
293
+ : null;
285
294
 
286
- taskTracer = provider.getTracer(projectName);
295
+ taskTracer = taskProvider.getTracer(projectName);
287
296
  }
288
- if (!record) {
297
+ try {
298
+ if (!record) {
299
+ logger.info(
300
+ `Running experiment in readonly mode. Results will not be recorded.`
301
+ );
302
+ }
303
+
304
+ const links: Array<{ label: string; url: string }> = [];
305
+ if (!isDryRun && client.config.baseUrl) {
306
+ links.push({
307
+ label: "Dataset",
308
+ url: getDatasetUrl({
309
+ baseUrl: client.config.baseUrl,
310
+ datasetId: dataset.id,
311
+ }),
312
+ });
313
+ links.push({
314
+ label: "Experiments",
315
+ url: getDatasetExperimentsUrl({
316
+ baseUrl: client.config.baseUrl,
317
+ datasetId: dataset.id,
318
+ }),
319
+ });
320
+ links.push({
321
+ label: "Experiment",
322
+ url: getExperimentUrl({
323
+ baseUrl: client.config.baseUrl,
324
+ datasetId: dataset.id,
325
+ experimentId: experiment.id,
326
+ }),
327
+ });
328
+ }
329
+
330
+ const evCount = evaluators?.length ?? 0;
289
331
  logger.info(
290
- `Running experiment in readonly mode. Results will not be recorded.`
332
+ `${PROGRESS_PREFIX.start}Experiment ${experimentName || "<unnamed>"} (dataset ${dataset.name}, ${nExamples} ${pluralize("example", nExamples)}, ${evCount} ${pluralize("evaluator", evCount)})`
291
333
  );
292
- }
293
334
 
294
- const links: Array<{ label: string; url: string }> = [];
295
- if (!isDryRun && client.config.baseUrl) {
296
- links.push({
297
- label: "Dataset",
298
- url: getDatasetUrl({
299
- baseUrl: client.config.baseUrl,
300
- datasetId: dataset.id,
301
- }),
335
+ const runs: Record<ExperimentRunID, ExperimentRun> = {};
336
+ await runTaskWithExamples({
337
+ client,
338
+ experimentId: experiment.id,
339
+ task,
340
+ dataset,
341
+ logger,
342
+ onComplete: (run) => {
343
+ runs[run.id] = run;
344
+ },
345
+ concurrency,
346
+ isDryRun,
347
+ nExamples,
348
+ tracer: taskTracer,
349
+ repetitions,
302
350
  });
303
- links.push({
304
- label: "Experiments",
305
- url: getDatasetExperimentsUrl({
306
- baseUrl: client.config.baseUrl,
307
- datasetId: dataset.id,
308
- }),
351
+ const taskRuns = Object.values(runs);
352
+ const taskErrors = taskRuns.filter((run) => run.error != null).length;
353
+ const taskTotal = nExamples * repetitions;
354
+ const taskOkStr =
355
+ taskErrors > 0
356
+ ? `${taskRuns.length - taskErrors}/${taskTotal} ok (${taskErrors} failed)`
357
+ : `${taskRuns.length}/${taskTotal} ok`;
358
+ logger.info(`${PROGRESS_PREFIX.completed}Tasks ${taskOkStr}`);
359
+
360
+ const ranExperiment: RanExperiment = {
361
+ ...experiment,
362
+ runs,
363
+ };
364
+
365
+ await cleanupOwnedTracerProvider({
366
+ provider: taskProvider,
367
+ globalRegistration: taskGlobalRegistration,
309
368
  });
310
- links.push({
311
- label: "Experiment",
312
- url: getExperimentUrl({
313
- baseUrl: client.config.baseUrl,
314
- datasetId: dataset.id,
315
- experimentId: experiment.id,
316
- }),
369
+ taskProvider = undefined;
370
+ taskGlobalRegistration = null;
371
+
372
+ if (evaluators && evaluators.length > 0) {
373
+ const evNames = getExperimentEvaluators(evaluators)
374
+ .map((evaluator) => evaluator.name)
375
+ .join(", ");
376
+ logger.info(`${PROGRESS_PREFIX.start}Evaluations (${evNames})`);
377
+ }
378
+
379
+ const { evaluationRuns } = await evaluateExperiment({
380
+ experiment: ranExperiment,
381
+ evaluators: evaluators ?? [],
382
+ client,
383
+ logger,
384
+ concurrency,
385
+ dryRun,
386
+ diagLogLevel,
387
+ useBatchSpanProcessor,
388
+ setGlobalTracerProvider,
317
389
  });
318
- }
390
+ ranExperiment.evaluationRuns = evaluationRuns;
319
391
 
320
- const evCount = evaluators?.length ?? 0;
321
- logger.info(
322
- `${PROGRESS_PREFIX.start}Experiment ${experimentName || "<unnamed>"} (dataset ${dataset.name}, ${nExamples} ${pluralize("example", nExamples)}, ${evCount} ${pluralize("evaluator", evCount)})`
323
- );
392
+ // Refresh experiment info from server to get updated counts (non-dry-run only)
393
+ if (!isDryRun) {
394
+ const updatedExperiment = await getExperimentInfo({
395
+ client,
396
+ experimentId: experiment.id,
397
+ });
398
+ // Update the experiment info with the latest from the server
399
+ Object.assign(ranExperiment, updatedExperiment);
400
+ }
324
401
 
325
- const runs: Record<ExperimentRunID, ExperimentRun> = {};
326
- await runTaskWithExamples({
327
- client,
328
- experimentId: experiment.id,
329
- task,
330
- dataset,
331
- logger,
332
- onComplete: (run) => {
333
- runs[run.id] = run;
334
- },
335
- concurrency,
336
- isDryRun,
337
- nExamples,
338
- tracer: taskTracer,
339
- repetitions,
340
- });
341
- const taskRuns = Object.values(runs);
342
- const taskErrors = taskRuns.filter((run) => run.error != null).length;
343
- const taskTotal = nExamples * repetitions;
344
- const taskOkStr =
345
- taskErrors > 0
346
- ? `${taskRuns.length - taskErrors}/${taskTotal} ok (${taskErrors} failed)`
347
- : `${taskRuns.length}/${taskTotal} ok`;
348
- logger.info(`${PROGRESS_PREFIX.completed}Tasks ${taskOkStr}`);
349
-
350
- const ranExperiment: RanExperiment = {
351
- ...experiment,
352
- runs,
353
- };
402
+ logTaskSummary(logger, {
403
+ nExamples,
404
+ repetitions,
405
+ nRuns: taskRuns.length,
406
+ nErrors: taskErrors,
407
+ });
354
408
 
355
- if (evaluators && evaluators.length > 0) {
356
- const evNames = getExperimentEvaluators(evaluators)
357
- .map((evaluator) => evaluator.name)
358
- .join(", ");
359
- logger.info(`${PROGRESS_PREFIX.start}Evaluations (${evNames})`);
360
- }
409
+ if (
410
+ ranExperiment.evaluationRuns &&
411
+ ranExperiment.evaluationRuns.length > 0
412
+ ) {
413
+ logEvalSummary(logger, ranExperiment.evaluationRuns);
414
+ }
361
415
 
362
- const { evaluationRuns } = await evaluateExperiment({
363
- experiment: ranExperiment,
364
- evaluators: evaluators ?? [],
365
- client,
366
- logger,
367
- concurrency,
368
- dryRun,
369
- tracerProvider: provider,
370
- diagLogLevel,
371
- useBatchSpanProcessor,
372
- });
373
- ranExperiment.evaluationRuns = evaluationRuns;
416
+ logLinks(logger, links);
374
417
 
375
- // Refresh experiment info from server to get updated counts (non-dry-run only)
376
- if (!isDryRun) {
377
- const updatedExperiment = await getExperimentInfo({
378
- client,
379
- experimentId: experiment.id,
418
+ return ranExperiment;
419
+ } finally {
420
+ // Safety net: on error paths the happy-path cleanup above is skipped,
421
+ // so ensure the task provider is always cleaned up. On the happy path
422
+ // taskProvider is already undefined (no-op).
423
+ await cleanupOwnedTracerProvider({
424
+ provider: taskProvider,
425
+ globalRegistration: taskGlobalRegistration,
380
426
  });
381
- // Update the experiment info with the latest from the server
382
- Object.assign(ranExperiment, updatedExperiment);
383
427
  }
384
-
385
- logTaskSummary(logger, {
386
- nExamples,
387
- repetitions,
388
- nRuns: taskRuns.length,
389
- nErrors: taskErrors,
390
- });
391
-
392
- if (ranExperiment.evaluationRuns && ranExperiment.evaluationRuns.length > 0) {
393
- logEvalSummary(logger, ranExperiment.evaluationRuns);
394
- }
395
-
396
- logLinks(logger, links);
397
-
398
- return ranExperiment;
399
428
  }
400
429
 
401
430
  /**
@@ -600,6 +629,8 @@ export async function evaluateExperiment({
600
629
  "Phoenix base URL not found. Please set PHOENIX_HOST or set baseUrl on the client."
601
630
  );
602
631
  let provider: NodeTracerProvider;
632
+ let globalRegistration: GlobalTracerProviderRegistration | null = null;
633
+ const ownsProvider = !paramsTracerProvider;
603
634
 
604
635
  // Always allow changing of tracer providers
605
636
  if (paramsTracerProvider) {
@@ -613,165 +644,172 @@ export async function evaluateExperiment({
613
644
  : undefined,
614
645
  batch: useBatchSpanProcessor,
615
646
  diagLogLevel,
616
- global: setGlobalTracerProvider,
647
+ global: false,
617
648
  });
649
+ globalRegistration = setGlobalTracerProvider
650
+ ? attachGlobalTracerProvider(provider)
651
+ : null;
618
652
  } else {
619
653
  provider = createNoOpProvider();
620
654
  }
621
655
  const tracer = isDryRun
622
656
  ? provider.getTracer("no-op")
623
657
  : provider.getTracer("evaluators");
624
- const nRuns =
625
- typeof dryRun === "number"
626
- ? Math.min(dryRun, Object.keys(experiment.runs).length)
627
- : Object.keys(experiment.runs).length;
628
- const dataset = await getDataset({
629
- dataset: {
630
- datasetId: experiment.datasetId,
631
- versionId: experiment.datasetVersionId,
632
- splits: experiment.datasetSplits,
633
- },
634
- client,
635
- });
636
- invariant(dataset, `Dataset "${experiment.datasetId}" not found`);
637
- invariant(
638
- dataset.examples.length > 0,
639
- `Dataset "${experiment.datasetId}" has no examples`
640
- );
641
- invariant(experiment.runs, `Experiment "${experiment.id}" has no runs`);
658
+ try {
659
+ const nRuns =
660
+ typeof dryRun === "number"
661
+ ? Math.min(dryRun, Object.keys(experiment.runs).length)
662
+ : Object.keys(experiment.runs).length;
663
+ const dataset = await getDataset({
664
+ dataset: {
665
+ datasetId: experiment.datasetId,
666
+ versionId: experiment.datasetVersionId,
667
+ splits: experiment.datasetSplits,
668
+ },
669
+ client,
670
+ });
671
+ invariant(dataset, `Dataset "${experiment.datasetId}" not found`);
672
+ invariant(
673
+ dataset.examples.length > 0,
674
+ `Dataset "${experiment.datasetId}" has no examples`
675
+ );
676
+ invariant(experiment.runs, `Experiment "${experiment.id}" has no runs`);
642
677
 
643
- const runsToEvaluate = Object.values(experiment.runs).slice(0, nRuns);
644
- if (evaluators?.length === 0) {
645
- return {
646
- ...experiment,
647
- evaluationRuns: [],
648
- };
649
- }
650
- type EvaluationId = string;
651
- const evaluationRuns: Record<EvaluationId, ExperimentEvaluationRun> = {};
678
+ const runsToEvaluate = Object.values(experiment.runs).slice(0, nRuns);
679
+ if (evaluators?.length === 0) {
680
+ return {
681
+ ...experiment,
682
+ evaluationRuns: [],
683
+ };
684
+ }
685
+ type EvaluationId = string;
686
+ const evaluationRuns: Record<EvaluationId, ExperimentEvaluationRun> = {};
652
687
 
653
- const examplesById: Record<string, Example> = {};
654
- for (const example of dataset.examples) {
655
- examplesById[example.id] = example;
656
- }
688
+ const examplesById: Record<string, Example> = {};
689
+ for (const example of dataset.examples) {
690
+ examplesById[example.id] = example;
691
+ }
657
692
 
658
- const onEvaluationComplete = (run: ExperimentEvaluationRun) => {
659
- evaluationRuns[run.id] = run;
660
- };
693
+ const onEvaluationComplete = (run: ExperimentEvaluationRun) => {
694
+ evaluationRuns[run.id] = run;
695
+ };
661
696
 
662
- // Run evaluators against all runs
663
- // Flat list of evaluator + run tuples
664
- const normalizedEvaluators = getExperimentEvaluators(evaluators);
665
- const evaluatorsAndRuns = normalizedEvaluators.flatMap((evaluator) =>
666
- runsToEvaluate.map((run) => ({
667
- evaluator,
668
- run,
669
- }))
670
- );
671
- const evaluatorsQueue = queue(
672
- async (evaluatorAndRun: { evaluator: Evaluator; run: ExperimentRun }) => {
673
- return tracer.startActiveSpan(
674
- `Evaluation: ${evaluatorAndRun.evaluator.name}`,
675
- async (span) => {
676
- const evalResult = await runEvaluator({
677
- evaluator: evaluatorAndRun.evaluator,
678
- run: evaluatorAndRun.run,
679
- exampleCache: examplesById,
680
- onComplete: onEvaluationComplete,
681
- logger,
682
- });
683
- span.setAttributes({
684
- [SemanticConventions.OPENINFERENCE_SPAN_KIND]:
685
- OpenInferenceSpanKind.EVALUATOR,
686
- [SemanticConventions.INPUT_MIME_TYPE]: MimeType.JSON,
687
- [SemanticConventions.INPUT_VALUE]: ensureString({
688
- input: examplesById[evaluatorAndRun.run.datasetExampleId]?.input,
689
- output: evaluatorAndRun.run.output,
690
- expected:
691
- examplesById[evaluatorAndRun.run.datasetExampleId]?.output,
692
- metadata:
693
- examplesById[evaluatorAndRun.run.datasetExampleId]?.metadata,
694
- }),
695
- [SemanticConventions.OUTPUT_MIME_TYPE]: MimeType.JSON,
696
- [SemanticConventions.OUTPUT_VALUE]: ensureString(evalResult.result),
697
- });
698
- if (evalResult.error) {
699
- span.setStatus({
700
- code: SpanStatusCode.ERROR,
701
- message: evalResult.error,
697
+ // Run evaluators against all runs
698
+ // Flat list of evaluator + run tuples
699
+ const normalizedEvaluators = getExperimentEvaluators(evaluators);
700
+ const evaluatorsAndRuns = normalizedEvaluators.flatMap((evaluator) =>
701
+ runsToEvaluate.map((run) => ({
702
+ evaluator,
703
+ run,
704
+ }))
705
+ );
706
+ const evaluatorsQueue = queue(
707
+ async (evaluatorAndRun: { evaluator: Evaluator; run: ExperimentRun }) => {
708
+ return tracer.startActiveSpan(
709
+ `Evaluation: ${evaluatorAndRun.evaluator.name}`,
710
+ async (span) => {
711
+ const evalResult = await runEvaluator({
712
+ evaluator: evaluatorAndRun.evaluator,
713
+ run: evaluatorAndRun.run,
714
+ exampleCache: examplesById,
715
+ onComplete: onEvaluationComplete,
716
+ logger,
702
717
  });
703
- } else {
704
- span.setStatus({ code: SpanStatusCode.OK });
705
- }
706
- if (evalResult.result) {
707
- span.setAttributes(objectAsAttributes(evalResult.result));
708
- }
709
- evalResult.traceId = span.spanContext().traceId;
710
- if (!isDryRun) {
711
- // Log the evaluation to the server
712
- // We log this without awaiting (e.g. best effort)
713
- client.POST("/v1/experiment_evaluations", {
714
- body: {
715
- experiment_run_id: evaluatorAndRun.run.id,
716
- name: evaluatorAndRun.evaluator.name,
717
- annotator_kind: evaluatorAndRun.evaluator.kind,
718
- start_time: evalResult.startTime.toISOString(),
719
- end_time: evalResult.endTime.toISOString(),
720
- result: {
721
- ...evalResult.result,
722
- },
723
- error: evalResult.error,
724
- trace_id: evalResult.traceId,
725
- },
718
+ span.setAttributes({
719
+ [SemanticConventions.OPENINFERENCE_SPAN_KIND]:
720
+ OpenInferenceSpanKind.EVALUATOR,
721
+ [SemanticConventions.INPUT_MIME_TYPE]: MimeType.JSON,
722
+ [SemanticConventions.INPUT_VALUE]: ensureString({
723
+ input:
724
+ examplesById[evaluatorAndRun.run.datasetExampleId]?.input,
725
+ output: evaluatorAndRun.run.output,
726
+ expected:
727
+ examplesById[evaluatorAndRun.run.datasetExampleId]?.output,
728
+ metadata:
729
+ examplesById[evaluatorAndRun.run.datasetExampleId]?.metadata,
730
+ }),
731
+ [SemanticConventions.OUTPUT_MIME_TYPE]: MimeType.JSON,
732
+ [SemanticConventions.OUTPUT_VALUE]: ensureString(
733
+ evalResult.result
734
+ ),
726
735
  });
736
+ if (evalResult.error) {
737
+ span.setStatus({
738
+ code: SpanStatusCode.ERROR,
739
+ message: evalResult.error,
740
+ });
741
+ } else {
742
+ span.setStatus({ code: SpanStatusCode.OK });
743
+ }
744
+ if (evalResult.result) {
745
+ span.setAttributes(objectAsAttributes(evalResult.result));
746
+ }
747
+ evalResult.traceId = span.spanContext().traceId;
748
+ if (!isDryRun) {
749
+ // Log the evaluation to the server
750
+ // We log this without awaiting (e.g. best effort)
751
+ client.POST("/v1/experiment_evaluations", {
752
+ body: {
753
+ experiment_run_id: evaluatorAndRun.run.id,
754
+ name: evaluatorAndRun.evaluator.name,
755
+ annotator_kind: evaluatorAndRun.evaluator.kind,
756
+ start_time: evalResult.startTime.toISOString(),
757
+ end_time: evalResult.endTime.toISOString(),
758
+ result: {
759
+ ...evalResult.result,
760
+ },
761
+ error: evalResult.error,
762
+ trace_id: evalResult.traceId,
763
+ },
764
+ });
765
+ }
766
+ span.end();
767
+ return evalResult;
727
768
  }
728
- span.end();
729
- return evalResult;
769
+ );
770
+ },
771
+ concurrency
772
+ );
773
+ if (!evaluatorsAndRuns.length) {
774
+ logger.warn(`No evaluators to run`);
775
+ return {
776
+ ...experiment,
777
+ evaluationRuns: [],
778
+ };
779
+ }
780
+ evaluatorsAndRuns.forEach((evaluatorAndRun) =>
781
+ evaluatorsQueue.push(evaluatorAndRun, (err) => {
782
+ if (err) {
783
+ logger.error(
784
+ `Error running evaluator "${evaluatorAndRun.evaluator.name}" on run "${evaluatorAndRun.run.id}": ${err}`
785
+ );
730
786
  }
731
- );
732
- },
733
- concurrency
734
- );
735
- if (!evaluatorsAndRuns.length) {
736
- logger.warn(`No evaluators to run`);
787
+ })
788
+ );
789
+ await evaluatorsQueue.drain();
790
+ const evalTotal = Object.values(evaluationRuns).length;
791
+ const evalErrors = Object.values(evaluationRuns).filter(
792
+ (ev) => ev.error != null
793
+ ).length;
794
+ const evalExpected = runsToEvaluate.length * normalizedEvaluators.length;
795
+ const evalOkStr =
796
+ evalErrors > 0
797
+ ? `${evalTotal - evalErrors}/${evalExpected} ok (${evalErrors} failed)`
798
+ : `${evalTotal}/${evalExpected} ok`;
799
+ logger.info(`${PROGRESS_PREFIX.completed}Evaluations ${evalOkStr}`);
800
+
737
801
  return {
738
802
  ...experiment,
739
- evaluationRuns: [],
803
+ evaluationRuns: Object.values(evaluationRuns),
740
804
  };
741
- }
742
- evaluatorsAndRuns.forEach((evaluatorAndRun) =>
743
- evaluatorsQueue.push(evaluatorAndRun, (err) => {
744
- if (err) {
745
- logger.error(
746
- `Error running evaluator "${evaluatorAndRun.evaluator.name}" on run "${evaluatorAndRun.run.id}": ${err}`
747
- );
748
- }
749
- })
750
- );
751
- await evaluatorsQueue.drain();
752
- const evalTotal = Object.values(evaluationRuns).length;
753
- const evalErrors = Object.values(evaluationRuns).filter(
754
- (ev) => ev.error != null
755
- ).length;
756
- const evalExpected = runsToEvaluate.length * normalizedEvaluators.length;
757
- const evalOkStr =
758
- evalErrors > 0
759
- ? `${evalTotal - evalErrors}/${evalExpected} ok (${evalErrors} failed)`
760
- : `${evalTotal}/${evalExpected} ok`;
761
- logger.info(`${PROGRESS_PREFIX.completed}Evaluations ${evalOkStr}`);
762
-
763
- if (provider) {
764
- await provider.shutdown();
765
- // Make sure it's not set globally anymore
766
- if (setGlobalTracerProvider) {
767
- trace.disable();
805
+ } finally {
806
+ if (ownsProvider) {
807
+ await cleanupOwnedTracerProvider({
808
+ provider,
809
+ globalRegistration,
810
+ });
768
811
  }
769
812
  }
770
-
771
- return {
772
- ...experiment,
773
- evaluationRuns: Object.values(evaluationRuns),
774
- };
775
813
  }
776
814
 
777
815
  /**
@@ -815,6 +853,7 @@ async function runEvaluator({
815
853
  output: run.output ?? null,
816
854
  expected: example.output,
817
855
  metadata: example?.metadata,
856
+ traceId: run.traceId,
818
857
  });
819
858
  thisEval.result = result;
820
859
  } catch (error) {
@@ -0,0 +1,30 @@
1
+ import type {
2
+ GlobalTracerProviderRegistration,
3
+ NodeTracerProvider,
4
+ } from "@arizeai/phoenix-otel";
5
+
6
+ /**
7
+ * Flushes and shuts down a tracer provider that this package created, then
8
+ * detaches any global OTEL registration it owns so another provider can be mounted.
9
+ */
10
+ export async function cleanupOwnedTracerProvider({
11
+ provider,
12
+ globalRegistration,
13
+ }: {
14
+ provider: NodeTracerProvider | null | undefined;
15
+ globalRegistration?: GlobalTracerProviderRegistration | null;
16
+ }): Promise<void> {
17
+ if (!provider) {
18
+ return;
19
+ }
20
+
21
+ try {
22
+ await provider.forceFlush();
23
+ } finally {
24
+ try {
25
+ await provider.shutdown();
26
+ } finally {
27
+ globalRegistration?.detach();
28
+ }
29
+ }
30
+ }
@@ -131,6 +131,12 @@ export type EvaluatorParams<TaskOutputType = TaskOutput> = {
131
131
  * Metadata associated with the Dataset Example
132
132
  */
133
133
  metadata?: Example["metadata"];
134
+ /**
135
+ * The trace ID of the task run, if available.
136
+ * Can be used to fetch and analyze the task's trace
137
+ * (e.g., for trajectory evaluation or action verification).
138
+ */
139
+ traceId?: string | null;
134
140
  };
135
141
 
136
142
  export type Evaluator = {