@arizeai/phoenix-client 6.5.4 → 6.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. package/dist/esm/__generated__/api/v1.d.ts +244 -0
  2. package/dist/esm/__generated__/api/v1.d.ts.map +1 -1
  3. package/dist/esm/experiments/resumeEvaluation.d.ts.map +1 -1
  4. package/dist/esm/experiments/resumeEvaluation.js +181 -170
  5. package/dist/esm/experiments/resumeEvaluation.js.map +1 -1
  6. package/dist/esm/experiments/resumeExperiment.d.ts.map +1 -1
  7. package/dist/esm/experiments/resumeExperiment.js +201 -185
  8. package/dist/esm/experiments/resumeExperiment.js.map +1 -1
  9. package/dist/esm/experiments/runExperiment.d.ts.map +1 -1
  10. package/dist/esm/experiments/runExperiment.js +239 -207
  11. package/dist/esm/experiments/runExperiment.js.map +1 -1
  12. package/dist/esm/experiments/tracing.d.ts +10 -0
  13. package/dist/esm/experiments/tracing.d.ts.map +1 -0
  14. package/dist/esm/experiments/tracing.js +21 -0
  15. package/dist/esm/experiments/tracing.js.map +1 -0
  16. package/dist/esm/prompts/sdks/toSDK.d.ts +2 -2
  17. package/dist/esm/tsconfig.esm.tsbuildinfo +1 -1
  18. package/dist/esm/types/experiments.d.ts +6 -0
  19. package/dist/esm/types/experiments.d.ts.map +1 -1
  20. package/dist/esm/utils/formatPromptMessages.d.ts.map +1 -1
  21. package/dist/esm/utils/getPromptBySelector.d.ts.map +1 -1
  22. package/dist/src/__generated__/api/v1.d.ts +244 -0
  23. package/dist/src/__generated__/api/v1.d.ts.map +1 -1
  24. package/dist/src/experiments/resumeEvaluation.d.ts.map +1 -1
  25. package/dist/src/experiments/resumeEvaluation.js +194 -183
  26. package/dist/src/experiments/resumeEvaluation.js.map +1 -1
  27. package/dist/src/experiments/resumeExperiment.d.ts.map +1 -1
  28. package/dist/src/experiments/resumeExperiment.js +214 -198
  29. package/dist/src/experiments/resumeExperiment.js.map +1 -1
  30. package/dist/src/experiments/runExperiment.d.ts.map +1 -1
  31. package/dist/src/experiments/runExperiment.js +229 -197
  32. package/dist/src/experiments/runExperiment.js.map +1 -1
  33. package/dist/src/experiments/tracing.d.ts +10 -0
  34. package/dist/src/experiments/tracing.d.ts.map +1 -0
  35. package/dist/src/experiments/tracing.js +24 -0
  36. package/dist/src/experiments/tracing.js.map +1 -0
  37. package/dist/src/types/experiments.d.ts +6 -0
  38. package/dist/src/types/experiments.d.ts.map +1 -1
  39. package/dist/src/utils/formatPromptMessages.d.ts.map +1 -1
  40. package/dist/src/utils/getPromptBySelector.d.ts.map +1 -1
  41. package/dist/tsconfig.tsbuildinfo +1 -1
  42. package/docs/annotations.mdx +83 -0
  43. package/docs/datasets.mdx +77 -0
  44. package/docs/document-annotations.mdx +208 -0
  45. package/docs/experiments.mdx +376 -0
  46. package/docs/overview.mdx +176 -0
  47. package/docs/prompts.mdx +73 -0
  48. package/docs/session-annotations.mdx +158 -0
  49. package/docs/sessions.mdx +87 -0
  50. package/docs/span-annotations.mdx +283 -0
  51. package/docs/spans.mdx +76 -0
  52. package/docs/traces.mdx +63 -0
  53. package/package.json +9 -3
  54. package/src/__generated__/api/v1.ts +244 -0
  55. package/src/experiments/resumeEvaluation.ts +226 -206
  56. package/src/experiments/resumeExperiment.ts +237 -213
  57. package/src/experiments/runExperiment.ts +282 -243
  58. package/src/experiments/tracing.ts +30 -0
  59. package/src/types/experiments.ts +6 -0
@@ -3,8 +3,13 @@ import {
3
3
  OpenInferenceSpanKind,
4
4
  SemanticConventions,
5
5
  } from "@arizeai/openinference-semantic-conventions";
6
- import type { NodeTracerProvider, Tracer } from "@arizeai/phoenix-otel";
6
+ import type {
7
+ GlobalTracerProviderRegistration,
8
+ NodeTracerProvider,
9
+ Tracer,
10
+ } from "@arizeai/phoenix-otel";
7
11
  import {
12
+ attachGlobalTracerProvider,
8
13
  type DiagLogLevel,
9
14
  objectAsAttributes,
10
15
  register,
@@ -29,6 +34,7 @@ import { toObjectHeaders } from "../utils/toObjectHeaders";
29
34
  import { getExperimentInfo } from "./getExperimentInfo.js";
30
35
  import { getExperimentEvaluators } from "./helpers";
31
36
  import { logEvalResumeSummary, PROGRESS_PREFIX } from "./logging";
37
+ import { cleanupOwnedTracerProvider } from "./tracing";
32
38
 
33
39
  /**
34
40
  * Error thrown when evaluation is aborted due to a failure in stopOnFirstError mode.
@@ -207,7 +213,11 @@ function setupEvaluationTracer({
207
213
  useBatchSpanProcessor: boolean;
208
214
  diagLogLevel?: DiagLogLevel;
209
215
  setGlobalTracerProvider: boolean;
210
- }): { provider: NodeTracerProvider; tracer: Tracer } | null {
216
+ }): {
217
+ provider: NodeTracerProvider;
218
+ tracer: Tracer;
219
+ globalRegistration: GlobalTracerProviderRegistration | null;
220
+ } | null {
211
221
  if (!projectName) {
212
222
  return null;
213
223
  }
@@ -218,11 +228,14 @@ function setupEvaluationTracer({
218
228
  headers,
219
229
  batch: useBatchSpanProcessor,
220
230
  diagLogLevel,
221
- global: setGlobalTracerProvider,
231
+ global: false,
222
232
  });
233
+ const globalRegistration = setGlobalTracerProvider
234
+ ? attachGlobalTracerProvider(provider)
235
+ : null;
223
236
 
224
237
  const tracer = provider.getTracer(projectName);
225
- return { provider, tracer };
238
+ return { provider, tracer, globalRegistration };
226
239
  }
227
240
 
228
241
  /**
@@ -329,252 +342,257 @@ export async function resumeEvaluation({
329
342
  });
330
343
 
331
344
  const provider = tracerSetup?.provider ?? null;
345
+ const globalRegistration = tracerSetup?.globalRegistration ?? null;
332
346
  const evalTracer = tracerSetup?.tracer ?? null;
333
347
 
334
- // Build evaluation names list for query - derive from evaluator names
335
- const evaluationNamesList = evaluators.map((e) => e.name);
348
+ try {
349
+ // Build evaluation names list for query - derive from evaluator names
350
+ const evaluationNamesList = evaluators.map((e) => e.name);
336
351
 
337
- // Create a CSP-style bounded buffer for evaluation distribution
338
- const evalChannel = new Channel<EvalItem>(
339
- pageSize * CHANNEL_CAPACITY_MULTIPLIER
340
- );
352
+ // Create a CSP-style bounded buffer for evaluation distribution
353
+ const evalChannel = new Channel<EvalItem>(
354
+ pageSize * CHANNEL_CAPACITY_MULTIPLIER
355
+ );
341
356
 
342
- // Abort controller for stopOnFirstError coordination
343
- const abortController = new AbortController();
344
- const { signal } = abortController;
357
+ // Abort controller for stopOnFirstError coordination
358
+ const abortController = new AbortController();
359
+ const { signal } = abortController;
345
360
 
346
- let totalProcessed = 0;
347
- let totalCompleted = 0;
348
- let totalFailed = 0;
361
+ let totalProcessed = 0;
362
+ let totalCompleted = 0;
363
+ let totalFailed = 0;
349
364
 
350
- // Producer: Fetch incomplete evaluations and send to channel
351
- async function fetchIncompleteEvaluations(): Promise<void> {
352
- let cursor: string | null = null;
365
+ // Producer: Fetch incomplete evaluations and send to channel
366
+ async function fetchIncompleteEvaluations(): Promise<void> {
367
+ let cursor: string | null = null;
353
368
 
354
- try {
355
- do {
356
- // Stop fetching if abort signal received
357
- if (signal.aborted) {
358
- logger.debug(`${PROGRESS_PREFIX.progress}Stopping fetch.`);
359
- break;
360
- }
369
+ try {
370
+ do {
371
+ // Stop fetching if abort signal received
372
+ if (signal.aborted) {
373
+ logger.debug(`${PROGRESS_PREFIX.progress}Stopping fetch.`);
374
+ break;
375
+ }
361
376
 
362
- let res: {
363
- data?: components["schemas"]["GetIncompleteEvaluationsResponseBody"];
364
- error?: unknown;
365
- };
377
+ let res: {
378
+ data?: components["schemas"]["GetIncompleteEvaluationsResponseBody"];
379
+ error?: unknown;
380
+ };
366
381
 
367
- try {
368
- res = await client.GET(
369
- "/v1/experiments/{experiment_id}/incomplete-evaluations",
370
- {
371
- params: {
372
- path: {
373
- experiment_id: experimentId,
374
- },
375
- query: {
376
- cursor,
377
- limit: pageSize,
378
- evaluation_name: evaluationNamesList,
379
- },
380
- },
381
- }
382
- );
383
- } catch (error: unknown) {
384
- // Check for version compatibility issues and throw helpful error
385
382
  try {
386
- await handleEvaluationFetchError(
387
- error,
388
- client,
389
- "resume_evaluation"
390
- );
391
- // TypeScript: handleEvaluationFetchError never returns, but add throw for safety
392
- throw new Error("handleEvaluationFetchError should never return");
393
- } catch (handledError) {
394
- // Wrap the error (from handleEvaluationFetchError or original) in semantic error type
395
- throw new EvaluationFetchError(
396
- "Failed to fetch incomplete evaluations from server",
397
- handledError instanceof Error ? handledError : undefined
383
+ res = await client.GET(
384
+ "/v1/experiments/{experiment_id}/incomplete-evaluations",
385
+ {
386
+ params: {
387
+ path: {
388
+ experiment_id: experimentId,
389
+ },
390
+ query: {
391
+ cursor,
392
+ limit: pageSize,
393
+ evaluation_name: evaluationNamesList,
394
+ },
395
+ },
396
+ }
398
397
  );
398
+ } catch (error: unknown) {
399
+ // Check for version compatibility issues and throw helpful error
400
+ try {
401
+ await handleEvaluationFetchError(
402
+ error,
403
+ client,
404
+ "resume_evaluation"
405
+ );
406
+ // TypeScript: handleEvaluationFetchError never returns, but add throw for safety
407
+ throw new Error("handleEvaluationFetchError should never return");
408
+ } catch (handledError) {
409
+ // Wrap the error (from handleEvaluationFetchError or original) in semantic error type
410
+ throw new EvaluationFetchError(
411
+ "Failed to fetch incomplete evaluations from server",
412
+ handledError instanceof Error ? handledError : undefined
413
+ );
414
+ }
399
415
  }
400
- }
401
416
 
402
- // Check for API errors
403
- if (res.error) {
404
- throw new EvaluationFetchError(
405
- `Failed to fetch incomplete evaluations: ${ensureString(res.error)}`
406
- );
407
- }
408
-
409
- cursor = res.data?.next_cursor ?? null;
410
- const batchIncomplete = res.data?.data;
411
- invariant(batchIncomplete, "Failed to fetch incomplete evaluations");
412
-
413
- if (batchIncomplete.length === 0) {
414
- if (totalProcessed === 0) {
415
- logger.info(
416
- `${PROGRESS_PREFIX.completed}No incomplete evaluations found.`
417
+ // Check for API errors
418
+ if (res.error) {
419
+ throw new EvaluationFetchError(
420
+ `Failed to fetch incomplete evaluations: ${ensureString(res.error)}`
417
421
  );
418
422
  }
419
- break;
420
- }
421
423
 
422
- if (totalProcessed === 0) {
423
- logger.info(`${PROGRESS_PREFIX.start}Resuming evaluations.`);
424
- }
424
+ cursor = res.data?.next_cursor ?? null;
425
+ const batchIncomplete = res.data?.data;
426
+ invariant(batchIncomplete, "Failed to fetch incomplete evaluations");
425
427
 
426
- // Build evaluation tasks and send to channel
427
- let batchCount = 0;
428
- for (const incomplete of batchIncomplete) {
429
- // Stop sending items if abort signal received
430
- if (signal.aborted) {
428
+ if (batchIncomplete.length === 0) {
429
+ if (totalProcessed === 0) {
430
+ logger.info(
431
+ `${PROGRESS_PREFIX.completed}No incomplete evaluations found.`
432
+ );
433
+ }
431
434
  break;
432
435
  }
433
436
 
434
- const incompleteEval = buildIncompleteEvaluation(incomplete);
435
-
436
- const evaluatorsToRun = evaluators.filter((evaluator) =>
437
- shouldRunEvaluator(evaluator, incompleteEval)
438
- );
437
+ if (totalProcessed === 0) {
438
+ logger.info(`${PROGRESS_PREFIX.start}Resuming evaluations.`);
439
+ }
439
440
 
440
- // Flatten: Send one channel item per evaluator
441
- for (const evaluator of evaluatorsToRun) {
441
+ // Build evaluation tasks and send to channel
442
+ let batchCount = 0;
443
+ for (const incomplete of batchIncomplete) {
442
444
  // Stop sending items if abort signal received
443
445
  if (signal.aborted) {
444
446
  break;
445
447
  }
446
448
 
447
- await evalChannel.send({ incompleteEval, evaluator });
448
- batchCount++;
449
- totalProcessed++;
449
+ const incompleteEval = buildIncompleteEvaluation(incomplete);
450
+
451
+ const evaluatorsToRun = evaluators.filter((evaluator) =>
452
+ shouldRunEvaluator(evaluator, incompleteEval)
453
+ );
454
+
455
+ // Flatten: Send one channel item per evaluator
456
+ for (const evaluator of evaluatorsToRun) {
457
+ // Stop sending items if abort signal received
458
+ if (signal.aborted) {
459
+ break;
460
+ }
461
+
462
+ await evalChannel.send({ incompleteEval, evaluator });
463
+ batchCount++;
464
+ totalProcessed++;
465
+ }
450
466
  }
451
- }
452
467
 
453
- logger.debug(
454
- `${PROGRESS_PREFIX.progress}Fetched batch of ${batchCount} evaluation tasks.`
468
+ logger.debug(
469
+ `${PROGRESS_PREFIX.progress}Fetched batch of ${batchCount} evaluation tasks.`
470
+ );
471
+ } while (cursor !== null && !signal.aborted);
472
+ } catch (error) {
473
+ // Re-throw with context preservation
474
+ if (error instanceof EvaluationFetchError) {
475
+ throw error;
476
+ }
477
+ // ChannelError from blocked send() should bubble up naturally
478
+ // (happens when channel closes while producer is blocked)
479
+ if (error instanceof ChannelError) {
480
+ throw error;
481
+ }
482
+ // Wrap any unexpected errors from channel operations
483
+ throw new EvaluationFetchError(
484
+ "Unexpected error during evaluation fetch",
485
+ error instanceof Error ? error : undefined
455
486
  );
456
- } while (cursor !== null && !signal.aborted);
457
- } catch (error) {
458
- // Re-throw with context preservation
459
- if (error instanceof EvaluationFetchError) {
460
- throw error;
461
- }
462
- // ChannelError from blocked send() should bubble up naturally
463
- // (happens when channel closes while producer is blocked)
464
- if (error instanceof ChannelError) {
465
- throw error;
487
+ } finally {
488
+ evalChannel.close(); // Signal workers we're done
466
489
  }
467
- // Wrap any unexpected errors from channel operations
468
- throw new EvaluationFetchError(
469
- "Unexpected error during evaluation fetch",
470
- error instanceof Error ? error : undefined
471
- );
472
- } finally {
473
- evalChannel.close(); // Signal workers we're done
474
490
  }
475
- }
476
491
 
477
- // Worker: Process evaluations from channel
478
- async function processEvaluationsFromChannel(): Promise<void> {
479
- for await (const item of evalChannel) {
480
- // Stop processing if abort signal received
481
- if (signal.aborted) {
482
- break;
483
- }
492
+ // Worker: Process evaluations from channel
493
+ async function processEvaluationsFromChannel(): Promise<void> {
494
+ for await (const item of evalChannel) {
495
+ // Stop processing if abort signal received
496
+ if (signal.aborted) {
497
+ break;
498
+ }
484
499
 
485
- try {
486
- await runSingleEvaluation({
487
- client,
488
- experimentId,
489
- evaluator: item.evaluator,
490
- experimentRun: item.incompleteEval.experimentRun,
491
- datasetExample: item.incompleteEval.datasetExample,
492
- tracer: evalTracer,
493
- });
494
- totalCompleted++;
495
- } catch (error) {
496
- totalFailed++;
497
- logger.error(
498
- `Failed to run evaluator "${item.evaluator.name}" for run ${item.incompleteEval.experimentRun.id}: ${error}`
499
- );
500
+ try {
501
+ await runSingleEvaluation({
502
+ client,
503
+ experimentId,
504
+ evaluator: item.evaluator,
505
+ experimentRun: item.incompleteEval.experimentRun,
506
+ datasetExample: item.incompleteEval.datasetExample,
507
+ tracer: evalTracer,
508
+ });
509
+ totalCompleted++;
510
+ } catch (error) {
511
+ totalFailed++;
512
+ logger.error(
513
+ `Failed to run evaluator "${item.evaluator.name}" for run ${item.incompleteEval.experimentRun.id}: ${error}`
514
+ );
500
515
 
501
- // If stopOnFirstError is enabled, abort and re-throw
502
- if (stopOnFirstError) {
503
- logger.warn("Stopping on first error");
504
- abortController.abort();
505
- throw error;
516
+ // If stopOnFirstError is enabled, abort and re-throw
517
+ if (stopOnFirstError) {
518
+ logger.warn("Stopping on first error");
519
+ abortController.abort();
520
+ throw error;
521
+ }
506
522
  }
507
523
  }
508
524
  }
509
- }
510
-
511
- // Start concurrent execution
512
- // Wrap in try-finally to ensure channel is always closed, even if Promise.all throws
513
- let executionError: Error | null = null;
514
- try {
515
- const producerTask = fetchIncompleteEvaluations();
516
- const workerTasks = Array.from({ length: concurrency }, () =>
517
- processEvaluationsFromChannel()
518
- );
519
525
 
520
- // Wait for producer and all workers to finish
521
- await Promise.all([producerTask, ...workerTasks]);
522
- } catch (error) {
523
- // Classify and handle errors based on their nature
524
- const err = error instanceof Error ? error : new Error(String(error));
525
-
526
- // Always surface producer/infrastructure errors
527
- if (error instanceof EvaluationFetchError) {
528
- // Producer failed - this is ALWAYS critical regardless of stopOnFirstError
529
- logger.error(`Critical: Failed to fetch evaluations from server`);
530
- executionError = err;
531
- } else if (error instanceof ChannelError && signal.aborted) {
532
- // Channel closed due to intentional abort - wrap in semantic error
533
- executionError = new EvaluationAbortedError(
534
- "Evaluation stopped due to error in concurrent evaluator",
535
- err
526
+ // Start concurrent execution
527
+ // Wrap in try-finally to ensure channel is always closed, even if Promise.all throws
528
+ let executionError: Error | null = null;
529
+ try {
530
+ const producerTask = fetchIncompleteEvaluations();
531
+ const workerTasks = Array.from({ length: concurrency }, () =>
532
+ processEvaluationsFromChannel()
536
533
  );
537
- } else if (stopOnFirstError) {
538
- // Worker error in stopOnFirstError mode - already logged by worker
539
- executionError = err;
540
- } else {
541
- // Unexpected error (not from worker, not from producer fetch)
542
- // This could be a bug in our code or infrastructure failure
543
- logger.error(`Unexpected error during evaluation: ${err.message}`);
544
- executionError = err;
545
- }
546
- } finally {
547
- // Ensure channel is closed even if there are unexpected errors
548
- // This is a safety net in case producer's finally block didn't execute
549
- if (!evalChannel.isClosed) {
550
- evalChannel.close();
551
- }
552
- }
553
534
 
554
- // Only show completion message if we didn't stop on error
555
- if (!executionError) {
556
- logger.info(`${PROGRESS_PREFIX.completed}Evaluations completed.`);
557
- }
535
+ // Wait for producer and all workers to finish
536
+ await Promise.all([producerTask, ...workerTasks]);
537
+ } catch (error) {
538
+ // Classify and handle errors based on their nature
539
+ const err = error instanceof Error ? error : new Error(String(error));
558
540
 
559
- if (totalFailed > 0 && !executionError) {
560
- logger.warn(`${totalFailed} out of ${totalProcessed} evaluations failed.`);
561
- }
541
+ // Always surface producer/infrastructure errors
542
+ if (error instanceof EvaluationFetchError) {
543
+ // Producer failed - this is ALWAYS critical regardless of stopOnFirstError
544
+ logger.error(`Critical: Failed to fetch evaluations from server`);
545
+ executionError = err;
546
+ } else if (error instanceof ChannelError && signal.aborted) {
547
+ // Channel closed due to intentional abort - wrap in semantic error
548
+ executionError = new EvaluationAbortedError(
549
+ "Evaluation stopped due to error in concurrent evaluator",
550
+ err
551
+ );
552
+ } else if (stopOnFirstError) {
553
+ // Worker error in stopOnFirstError mode - already logged by worker
554
+ executionError = err;
555
+ } else {
556
+ // Unexpected error (not from worker, not from producer fetch)
557
+ // This could be a bug in our code or infrastructure failure
558
+ logger.error(`Unexpected error during evaluation: ${err.message}`);
559
+ executionError = err;
560
+ }
561
+ } finally {
562
+ // Ensure channel is closed even if there are unexpected errors
563
+ // This is a safety net in case producer's finally block didn't execute
564
+ if (!evalChannel.isClosed) {
565
+ evalChannel.close();
566
+ }
567
+ }
562
568
 
563
- logEvalResumeSummary(logger, {
564
- experimentId: experiment.id,
565
- processed: totalProcessed,
566
- completed: totalCompleted,
567
- failed: totalFailed,
568
- });
569
+ // Only show completion message if we didn't stop on error
570
+ if (!executionError) {
571
+ logger.info(`${PROGRESS_PREFIX.completed}Evaluations completed.`);
572
+ }
569
573
 
570
- // Flush spans (if tracer was initialized)
571
- if (provider) {
572
- await provider.forceFlush();
573
- }
574
+ if (totalFailed > 0 && !executionError) {
575
+ logger.warn(
576
+ `${totalFailed} out of ${totalProcessed} evaluations failed.`
577
+ );
578
+ }
579
+
580
+ logEvalResumeSummary(logger, {
581
+ experimentId: experiment.id,
582
+ processed: totalProcessed,
583
+ completed: totalCompleted,
584
+ failed: totalFailed,
585
+ });
574
586
 
575
- // Re-throw error if evaluation failed
576
- if (executionError) {
577
- throw executionError;
587
+ // Re-throw error if evaluation failed
588
+ if (executionError) {
589
+ throw executionError;
590
+ }
591
+ } finally {
592
+ await cleanupOwnedTracerProvider({
593
+ provider,
594
+ globalRegistration,
595
+ });
578
596
  }
579
597
  }
580
598
 
@@ -674,6 +692,7 @@ async function runSingleEvaluation({
674
692
  output: taskOutput,
675
693
  expected: expectedOutput,
676
694
  metadata: datasetExample.metadata,
695
+ traceId: experimentRun.traceId,
677
696
  })
678
697
  );
679
698
  results = Array.isArray(result) ? result : [result];
@@ -728,6 +747,7 @@ async function runSingleEvaluation({
728
747
  output: taskOutput,
729
748
  expected: expectedOutput,
730
749
  metadata: datasetExample.metadata,
750
+ traceId: experimentRun.traceId,
731
751
  })
732
752
  );
733
753