@arizeai/phoenix-client 5.2.1 → 5.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112)
  1. package/dist/esm/__generated__/api/v1.d.ts +321 -9
  2. package/dist/esm/__generated__/api/v1.d.ts.map +1 -1
  3. package/dist/esm/experiments/createExperiment.d.ts +39 -0
  4. package/dist/esm/experiments/createExperiment.d.ts.map +1 -0
  5. package/dist/esm/experiments/createExperiment.js +43 -0
  6. package/dist/esm/experiments/createExperiment.js.map +1 -0
  7. package/dist/esm/experiments/deleteExperiment.d.ts +36 -0
  8. package/dist/esm/experiments/deleteExperiment.d.ts.map +1 -0
  9. package/dist/esm/experiments/deleteExperiment.js +49 -0
  10. package/dist/esm/experiments/deleteExperiment.js.map +1 -0
  11. package/dist/esm/experiments/getExperimentInfo.d.ts.map +1 -1
  12. package/dist/esm/experiments/getExperimentInfo.js +9 -2
  13. package/dist/esm/experiments/getExperimentInfo.js.map +1 -1
  14. package/dist/esm/experiments/index.d.ts +5 -0
  15. package/dist/esm/experiments/index.d.ts.map +1 -1
  16. package/dist/esm/experiments/index.js +5 -0
  17. package/dist/esm/experiments/index.js.map +1 -1
  18. package/dist/esm/experiments/listExperiments.d.ts +29 -0
  19. package/dist/esm/experiments/listExperiments.d.ts.map +1 -0
  20. package/dist/esm/experiments/listExperiments.js +59 -0
  21. package/dist/esm/experiments/listExperiments.js.map +1 -0
  22. package/dist/esm/experiments/resumeEvaluation.d.ts +105 -0
  23. package/dist/esm/experiments/resumeEvaluation.d.ts.map +1 -0
  24. package/dist/esm/experiments/resumeEvaluation.js +558 -0
  25. package/dist/esm/experiments/resumeEvaluation.js.map +1 -0
  26. package/dist/esm/experiments/resumeExperiment.d.ts +102 -0
  27. package/dist/esm/experiments/resumeExperiment.d.ts.map +1 -0
  28. package/dist/esm/experiments/resumeExperiment.js +517 -0
  29. package/dist/esm/experiments/resumeExperiment.js.map +1 -0
  30. package/dist/esm/experiments/runExperiment.d.ts.map +1 -1
  31. package/dist/esm/experiments/runExperiment.js +28 -2
  32. package/dist/esm/experiments/runExperiment.js.map +1 -1
  33. package/dist/esm/prompts/createPrompt.d.ts +19 -1
  34. package/dist/esm/prompts/createPrompt.d.ts.map +1 -1
  35. package/dist/esm/prompts/createPrompt.js +14 -1
  36. package/dist/esm/prompts/createPrompt.js.map +1 -1
  37. package/dist/esm/tsconfig.esm.tsbuildinfo +1 -1
  38. package/dist/esm/types/experiments.d.ts +60 -3
  39. package/dist/esm/types/experiments.d.ts.map +1 -1
  40. package/dist/esm/utils/channel.d.ts +229 -0
  41. package/dist/esm/utils/channel.d.ts.map +1 -0
  42. package/dist/esm/utils/channel.js +352 -0
  43. package/dist/esm/utils/channel.js.map +1 -0
  44. package/dist/esm/utils/formatPromptMessages.d.ts.map +1 -1
  45. package/dist/esm/utils/getPromptBySelector.d.ts.map +1 -1
  46. package/dist/esm/utils/isHttpError.d.ts +21 -0
  47. package/dist/esm/utils/isHttpError.d.ts.map +1 -0
  48. package/dist/esm/utils/isHttpError.js +33 -0
  49. package/dist/esm/utils/isHttpError.js.map +1 -0
  50. package/dist/src/__generated__/api/v1.d.ts +321 -9
  51. package/dist/src/__generated__/api/v1.d.ts.map +1 -1
  52. package/dist/src/experiments/createExperiment.d.ts +39 -0
  53. package/dist/src/experiments/createExperiment.d.ts.map +1 -0
  54. package/dist/src/experiments/createExperiment.js +43 -0
  55. package/dist/src/experiments/createExperiment.js.map +1 -0
  56. package/dist/src/experiments/deleteExperiment.d.ts +36 -0
  57. package/dist/src/experiments/deleteExperiment.d.ts.map +1 -0
  58. package/dist/src/experiments/deleteExperiment.js +52 -0
  59. package/dist/src/experiments/deleteExperiment.js.map +1 -0
  60. package/dist/src/experiments/getExperimentInfo.d.ts.map +1 -1
  61. package/dist/src/experiments/getExperimentInfo.js +9 -2
  62. package/dist/src/experiments/getExperimentInfo.js.map +1 -1
  63. package/dist/src/experiments/index.d.ts +5 -0
  64. package/dist/src/experiments/index.d.ts.map +1 -1
  65. package/dist/src/experiments/index.js +5 -0
  66. package/dist/src/experiments/index.js.map +1 -1
  67. package/dist/src/experiments/listExperiments.d.ts +29 -0
  68. package/dist/src/experiments/listExperiments.d.ts.map +1 -0
  69. package/dist/src/experiments/listExperiments.js +66 -0
  70. package/dist/src/experiments/listExperiments.js.map +1 -0
  71. package/dist/src/experiments/resumeEvaluation.d.ts +105 -0
  72. package/dist/src/experiments/resumeEvaluation.d.ts.map +1 -0
  73. package/dist/src/experiments/resumeEvaluation.js +584 -0
  74. package/dist/src/experiments/resumeEvaluation.js.map +1 -0
  75. package/dist/src/experiments/resumeExperiment.d.ts +102 -0
  76. package/dist/src/experiments/resumeExperiment.d.ts.map +1 -0
  77. package/dist/src/experiments/resumeExperiment.js +540 -0
  78. package/dist/src/experiments/resumeExperiment.js.map +1 -0
  79. package/dist/src/experiments/runExperiment.d.ts.map +1 -1
  80. package/dist/src/experiments/runExperiment.js +28 -2
  81. package/dist/src/experiments/runExperiment.js.map +1 -1
  82. package/dist/src/prompts/createPrompt.d.ts +19 -1
  83. package/dist/src/prompts/createPrompt.d.ts.map +1 -1
  84. package/dist/src/prompts/createPrompt.js +14 -1
  85. package/dist/src/prompts/createPrompt.js.map +1 -1
  86. package/dist/src/types/experiments.d.ts +60 -3
  87. package/dist/src/types/experiments.d.ts.map +1 -1
  88. package/dist/src/utils/channel.d.ts +229 -0
  89. package/dist/src/utils/channel.d.ts.map +1 -0
  90. package/dist/src/utils/channel.js +385 -0
  91. package/dist/src/utils/channel.js.map +1 -0
  92. package/dist/src/utils/formatPromptMessages.d.ts.map +1 -1
  93. package/dist/src/utils/getPromptBySelector.d.ts.map +1 -1
  94. package/dist/src/utils/isHttpError.d.ts +21 -0
  95. package/dist/src/utils/isHttpError.d.ts.map +1 -0
  96. package/dist/src/utils/isHttpError.js +37 -0
  97. package/dist/src/utils/isHttpError.js.map +1 -0
  98. package/dist/tsconfig.tsbuildinfo +1 -1
  99. package/package.json +1 -1
  100. package/src/__generated__/api/v1.ts +321 -9
  101. package/src/experiments/createExperiment.ts +90 -0
  102. package/src/experiments/deleteExperiment.ts +67 -0
  103. package/src/experiments/getExperimentInfo.ts +9 -2
  104. package/src/experiments/index.ts +5 -0
  105. package/src/experiments/listExperiments.ts +83 -0
  106. package/src/experiments/resumeEvaluation.ts +799 -0
  107. package/src/experiments/resumeExperiment.ts +742 -0
  108. package/src/experiments/runExperiment.ts +30 -2
  109. package/src/prompts/createPrompt.ts +19 -1
  110. package/src/types/experiments.ts +62 -3
  111. package/src/utils/channel.ts +397 -0
  112. package/src/utils/isHttpError.ts +45 -0
@@ -0,0 +1,799 @@
1
+ import {
2
+ MimeType,
3
+ OpenInferenceSpanKind,
4
+ SemanticConventions,
5
+ } from "@arizeai/openinference-semantic-conventions";
6
+ import {
7
+ type DiagLogLevel,
8
+ NodeTracerProvider,
9
+ objectAsAttributes,
10
+ register,
11
+ SpanStatusCode,
12
+ Tracer,
13
+ } from "@arizeai/phoenix-otel";
14
+
15
+ import { components } from "../__generated__/api/v1";
16
+ import { createClient, type PhoenixClient } from "../client";
17
+ import { ClientFn } from "../types/core";
18
+ import type {
19
+ EvaluationResult,
20
+ Evaluator,
21
+ IncompleteEvaluation,
22
+ TaskOutput,
23
+ } from "../types/experiments";
24
+ import { type Logger } from "../types/logger";
25
+ import { Channel, ChannelError } from "../utils/channel";
26
+ import { ensureString } from "../utils/ensureString";
27
+ import { toObjectHeaders } from "../utils/toObjectHeaders";
28
+
29
+ import { getExperimentInfo } from "./getExperimentInfo.js";
30
+
31
+ import invariant from "tiny-invariant";
32
+
33
+ /**
34
+ * Error thrown when evaluation is aborted due to a failure in stopOnFirstError mode.
35
+ * This provides semantic context that the abort was intentional, not an infrastructure failure.
36
+ * @internal - Not exported to minimize API surface area
37
+ */
38
+ class EvaluationAbortedError extends Error {
39
+ constructor(message: string, cause?: Error) {
40
+ super(message);
41
+ this.name = "EvaluationAbortedError";
42
+ this.cause = cause;
43
+ }
44
+ }
45
+
46
+ /**
47
+ * Error thrown when the producer fails to fetch incomplete evaluations from the server.
48
+ * This is a critical error that should always be surfaced, even in stopOnFirstError=false mode.
49
+ * @internal - Not exported to minimize API surface area
50
+ */
51
+ class EvaluationFetchError extends Error {
52
+ constructor(message: string, cause?: Error) {
53
+ super(message);
54
+ this.name = "EvaluationFetchError";
55
+ this.cause = cause;
56
+ }
57
+ }
58
+
59
/**
 * Parameters for {@link resumeEvaluation}.
 */
export type ResumeEvaluationParams = ClientFn & {
  /**
   * The ID of the experiment to resume evaluations for
   */
  readonly experimentId: string;
  /**
   * A single evaluator or list of evaluators to run on incomplete evaluations
   */
  readonly evaluators: Evaluator | readonly Evaluator[];
  /**
   * The logger to use
   * @default console
   */
  readonly logger?: Logger;
  /**
   * The number of concurrent evaluation executions
   * @default 5
   */
  readonly concurrency?: number;
  /**
   * Whether to set the global tracer provider when running evaluators.
   * @default true
   */
  readonly setGlobalTracerProvider?: boolean;
  /**
   * Whether to use batch span processor for tracing.
   * @default true
   */
  readonly useBatchSpanProcessor?: boolean;
  /**
   * Log level to set for the default DiagConsoleLogger when tracing.
   */
  readonly diagLogLevel?: DiagLogLevel;
  /**
   * Stop processing and exit as soon as any evaluation fails.
   * @default false
   */
  readonly stopOnFirstError?: boolean;
};
98
+
99
+ const DEFAULT_PAGE_SIZE = 50 as const;
100
+ /**
101
+ * Channel capacity multiplier for producer-consumer buffering.
102
+ * A value of 2 enables pipeline efficiency: workers process batch N while
103
+ * the producer fetches batch N+1, eliminating idle time without excessive
104
+ * memory usage. The channel blocks when full, providing natural backpressure.
105
+ */
106
+ const CHANNEL_CAPACITY_MULTIPLIER = 2 as const;
107
+
108
/**
 * Work item flowing through the producer-consumer channel: one incomplete
 * evaluation paired with the single evaluator that should run on it.
 */
type EvalItem = {
  readonly incompleteEval: IncompleteEvaluation;
  readonly evaluator: Evaluator;
};
115
+
116
+ /**
117
+ * Transforms API incomplete evaluation response to IncompleteEvaluation
118
+ */
119
+ function buildIncompleteEvaluation(
120
+ apiResponse: components["schemas"]["IncompleteExperimentEvaluation"]
121
+ ): IncompleteEvaluation {
122
+ return {
123
+ experimentRun: {
124
+ id: apiResponse.experiment_run.id,
125
+ experimentId: apiResponse.experiment_run.experiment_id,
126
+ datasetExampleId: apiResponse.experiment_run.dataset_example_id,
127
+ output: apiResponse.experiment_run.output ?? null,
128
+ startTime: new Date(apiResponse.experiment_run.start_time),
129
+ endTime: new Date(apiResponse.experiment_run.end_time),
130
+ error: apiResponse.experiment_run.error ?? null,
131
+ traceId: apiResponse.experiment_run.trace_id ?? null,
132
+ },
133
+ datasetExample: {
134
+ id: apiResponse.dataset_example.id,
135
+ input: apiResponse.dataset_example.input,
136
+ output: apiResponse.dataset_example.output ?? null,
137
+ metadata: apiResponse.dataset_example.metadata || {},
138
+ updatedAt: new Date(apiResponse.dataset_example.updated_at),
139
+ },
140
+ evaluationNames: apiResponse.evaluation_names,
141
+ };
142
+ }
143
+
144
+ /**
145
+ * Determines if an evaluator should run for an incomplete evaluation
146
+ */
147
+ function shouldRunEvaluator(
148
+ evaluator: Evaluator,
149
+ incompleteEval: IncompleteEvaluation
150
+ ): boolean {
151
+ // Match evaluator name directly
152
+ return incompleteEval.evaluationNames.includes(evaluator.name);
153
+ }
154
+
155
+ /**
156
+ * Handles fetch errors with helpful version information for unsupported features
157
+ */
158
+ async function handleEvaluationFetchError(
159
+ error: unknown,
160
+ client: PhoenixClient,
161
+ featureName: string
162
+ ): Promise<never> {
163
+ // Check if this is a JSON parse error (likely 404 HTML response from old server)
164
+ const isJsonError =
165
+ error instanceof SyntaxError &&
166
+ error.message.toLowerCase().includes("json");
167
+
168
+ if (isJsonError) {
169
+ // Fetch server version to provide helpful context
170
+ let versionInfo = "";
171
+ try {
172
+ const baseUrl = client.config.baseUrl || "";
173
+ const versionRes = await fetch(`${baseUrl}/arize_phoenix_version`);
174
+ if (versionRes.ok) {
175
+ const version = await versionRes.text();
176
+ versionInfo = ` Your current server version is ${version}.`;
177
+ }
178
+ } catch {
179
+ // Ignore errors fetching version
180
+ }
181
+
182
+ throw new Error(
183
+ `The ${featureName} feature is not available on this Phoenix server. ` +
184
+ "Please upgrade your Phoenix server to use this feature." +
185
+ versionInfo
186
+ );
187
+ }
188
+ throw error;
189
+ }
190
+
191
+ /**
192
+ * Sets up OpenTelemetry tracer for evaluation tracing
193
+ */
194
+ function setupEvaluationTracer({
195
+ projectName,
196
+ baseUrl,
197
+ headers,
198
+ useBatchSpanProcessor,
199
+ diagLogLevel,
200
+ setGlobalTracerProvider,
201
+ }: {
202
+ projectName: string | null;
203
+ baseUrl: string;
204
+ headers?: Record<string, string>;
205
+ useBatchSpanProcessor: boolean;
206
+ diagLogLevel?: DiagLogLevel;
207
+ setGlobalTracerProvider: boolean;
208
+ }): { provider: NodeTracerProvider; tracer: Tracer } | null {
209
+ if (!projectName) {
210
+ return null;
211
+ }
212
+
213
+ const provider = register({
214
+ projectName,
215
+ url: baseUrl,
216
+ headers,
217
+ batch: useBatchSpanProcessor,
218
+ diagLogLevel,
219
+ global: setGlobalTracerProvider,
220
+ });
221
+
222
+ const tracer = provider.getTracer(projectName);
223
+ return { provider, tracer };
224
+ }
225
+
226
+ /**
227
+ * Prints evaluation summary to logger
228
+ */
229
+ function printEvaluationSummary({
230
+ logger,
231
+ experimentId,
232
+ totalProcessed,
233
+ totalCompleted,
234
+ }: {
235
+ logger: Logger;
236
+ experimentId: string;
237
+ totalProcessed: number;
238
+ totalCompleted: number;
239
+ }): void {
240
+ logger.info("\n" + "=".repeat(70));
241
+ logger.info("📊 Evaluation Resume Summary");
242
+ logger.info("=".repeat(70));
243
+ logger.info(`Experiment ID: ${experimentId}`);
244
+ logger.info(`Runs processed: ${totalProcessed}`);
245
+ logger.info(`Evaluations completed: ${totalCompleted}`);
246
+ logger.info("=".repeat(70));
247
+ }
248
+
249
/**
 * Resume incomplete evaluations for an experiment.
 *
 * This function identifies which evaluations have not been completed (either missing or failed)
 * and runs the evaluators only for those runs. This is useful for:
 * - Recovering from transient evaluator failures
 * - Adding new evaluators to completed experiments
 * - Completing partially evaluated experiments
 *
 * The function processes incomplete evaluations in batches using pagination to minimize memory usage.
 *
 * Evaluation names are matched to evaluator names. For example, if you pass
 * an evaluator with name "accuracy", it will check for and resume any runs missing the "accuracy" evaluation.
 *
 * **Note:** Multi-output evaluators (evaluators that return an array of results) are not
 * supported for resume operations. Each evaluator should produce a single evaluation
 * result with a name matching the evaluator's name.
 *
 * @throws {Error} Throws different error types based on failure:
 * - "EvaluationFetchError": Unable to fetch incomplete evaluations from the server.
 *   Always thrown regardless of stopOnFirstError, as it indicates critical infrastructure failure.
 * - "EvaluationAbortedError": stopOnFirstError=true and an evaluator failed.
 *   Original error preserved in `cause` property.
 * - Generic Error: Other evaluator execution errors or unexpected failures.
 *
 * @example
 * ```ts
 * import { resumeEvaluation } from "@arizeai/phoenix-client/experiments";
 *
 * // Standard usage: evaluation name matches evaluator name
 * try {
 *   await resumeEvaluation({
 *     experimentId: "exp_123",
 *     evaluators: [{
 *       name: "correctness",
 *       kind: "CODE",
 *       evaluate: async ({ output, expected }) => ({
 *         score: output === expected ? 1 : 0
 *       })
 *     }],
 *   });
 * } catch (error) {
 *   // Handle by error name (no instanceof needed)
 *   if (error.name === "EvaluationFetchError") {
 *     console.error("Failed to connect to server:", error.cause);
 *   } else if (error.name === "EvaluationAbortedError") {
 *     console.error("Evaluation stopped due to error:", error.cause);
 *   } else {
 *     console.error("Unexpected error:", error);
 *   }
 * }
 *
 * // Stop on first error (useful for debugging)
 * await resumeEvaluation({
 *   experimentId: "exp_123",
 *   evaluators: [myEvaluator],
 *   stopOnFirstError: true, // Exit immediately on first failure
 * });
 * ```
 */
export async function resumeEvaluation({
  client: _client,
  experimentId,
  evaluators: _evaluators,
  logger = console,
  concurrency = 5,
  setGlobalTracerProvider = true,
  useBatchSpanProcessor = true,
  diagLogLevel,
  stopOnFirstError = false,
}: ResumeEvaluationParams): Promise<void> {
  const client = _client ?? createClient();
  const pageSize = DEFAULT_PAGE_SIZE;

  // Normalize evaluators to array
  const evaluators = Array.isArray(_evaluators) ? _evaluators : [_evaluators];

  // Validate inputs
  invariant(evaluators.length > 0, "Must specify at least one evaluator");

  // Get experiment info
  logger.info(`🔍 Checking for incomplete evaluations...`);
  const experiment = await getExperimentInfo({ client, experimentId });

  // Initialize tracer (only if experiment has a project_name)
  const baseUrl = client.config.baseUrl;
  invariant(
    baseUrl,
    "Phoenix base URL not found. Please set PHOENIX_HOST or set baseUrl on the client."
  );

  const tracerSetup = setupEvaluationTracer({
    projectName: experiment.projectName,
    baseUrl,
    headers: client.config.headers
      ? toObjectHeaders(client.config.headers)
      : undefined,
    useBatchSpanProcessor,
    diagLogLevel,
    setGlobalTracerProvider,
  });

  const provider = tracerSetup?.provider ?? null;
  const evalTracer = tracerSetup?.tracer ?? null;

  // Build evaluation names list for query - derive from evaluator names
  const evaluationNamesList = evaluators.map((e) => e.name);

  // Create a CSP-style bounded buffer for evaluation distribution.
  // Capacity = pageSize * multiplier so the producer can stay one page ahead
  // of the workers (see CHANNEL_CAPACITY_MULTIPLIER).
  const evalChannel = new Channel<EvalItem>(
    pageSize * CHANNEL_CAPACITY_MULTIPLIER
  );

  // Abort controller for stopOnFirstError coordination
  const abortController = new AbortController();
  const { signal } = abortController;

  // Shared counters, mutated by the producer (totalProcessed) and the
  // workers (totalCompleted / totalFailed) below.
  let totalProcessed = 0;
  let totalCompleted = 0;
  let totalFailed = 0;

  // Producer: Fetch incomplete evaluations and send to channel
  async function fetchIncompleteEvaluations(): Promise<void> {
    let cursor: string | null = null;

    try {
      do {
        // Stop fetching if abort signal received
        if (signal.aborted) {
          logger.info("🛑 Stopping fetch due to error in evaluation");
          break;
        }

        let res: {
          data?: components["schemas"]["GetIncompleteEvaluationsResponseBody"];
          error?: unknown;
        };

        try {
          res = await client.GET(
            "/v1/experiments/{experiment_id}/incomplete-evaluations",
            {
              params: {
                path: {
                  experiment_id: experimentId,
                },
                query: {
                  cursor,
                  limit: pageSize,
                  evaluation_name: evaluationNamesList,
                },
              },
            }
          );
        } catch (error: unknown) {
          // Check for version compatibility issues and throw helpful error
          try {
            await handleEvaluationFetchError(
              error,
              client,
              "resume_evaluation"
            );
            // TypeScript: handleEvaluationFetchError never returns, but add throw for safety
            throw new Error("handleEvaluationFetchError should never return");
          } catch (handledError) {
            // Wrap the error (from handleEvaluationFetchError or original) in semantic error type
            throw new EvaluationFetchError(
              "Failed to fetch incomplete evaluations from server",
              handledError instanceof Error ? handledError : undefined
            );
          }
        }

        // Check for API errors
        if (res.error) {
          throw new EvaluationFetchError(
            `Failed to fetch incomplete evaluations: ${ensureString(res.error)}`
          );
        }

        cursor = res.data?.next_cursor ?? null;
        const batchIncomplete = res.data?.data;
        invariant(batchIncomplete, "Failed to fetch incomplete evaluations");

        if (batchIncomplete.length === 0) {
          if (totalProcessed === 0) {
            logger.info(
              "✅ No incomplete evaluations found. All evaluations are complete."
            );
          }
          break;
        }

        if (totalProcessed === 0) {
          logger.info("🧠 Resuming evaluations...");
        }

        // Build evaluation tasks and send to channel
        let batchCount = 0;
        for (const incomplete of batchIncomplete) {
          // Stop sending items if abort signal received
          if (signal.aborted) {
            break;
          }

          const incompleteEval = buildIncompleteEvaluation(incomplete);

          const evaluatorsToRun = evaluators.filter((evaluator) =>
            shouldRunEvaluator(evaluator, incompleteEval)
          );

          // Flatten: Send one channel item per evaluator
          for (const evaluator of evaluatorsToRun) {
            // Stop sending items if abort signal received
            if (signal.aborted) {
              break;
            }

            // send() blocks when the channel is full — this is the
            // backpressure that keeps memory bounded.
            await evalChannel.send({ incompleteEval, evaluator });
            batchCount++;
            totalProcessed++;
          }
        }

        logger.info(
          `Fetched batch of ${batchCount} evaluation tasks (channel buffer: ${evalChannel.length})`
        );
      } while (cursor !== null && !signal.aborted);
    } catch (error) {
      // Re-throw with context preservation
      if (error instanceof EvaluationFetchError) {
        throw error;
      }
      // ChannelError from blocked send() should bubble up naturally
      // (happens when channel closes while producer is blocked)
      if (error instanceof ChannelError) {
        throw error;
      }
      // Wrap any unexpected errors from channel operations
      throw new EvaluationFetchError(
        "Unexpected error during evaluation fetch",
        error instanceof Error ? error : undefined
      );
    } finally {
      evalChannel.close(); // Signal workers we're done
    }
  }

  // Worker: Process evaluations from channel
  async function processEvaluationsFromChannel(): Promise<void> {
    for await (const item of evalChannel) {
      // Stop processing if abort signal received
      if (signal.aborted) {
        break;
      }

      try {
        await runSingleEvaluation({
          client,
          experimentId,
          evaluator: item.evaluator,
          experimentRun: item.incompleteEval.experimentRun,
          datasetExample: item.incompleteEval.datasetExample,
          tracer: evalTracer,
        });
        totalCompleted++;
      } catch (error) {
        totalFailed++;
        logger.error(
          `Failed to run evaluator "${item.evaluator.name}" for run ${item.incompleteEval.experimentRun.id}: ${error}`
        );

        // If stopOnFirstError is enabled, abort and re-throw
        if (stopOnFirstError) {
          logger.error("🛑 Stopping on first error");
          abortController.abort();
          throw error;
        }
      }
    }
  }

  // Start concurrent execution
  // Wrap in try-finally to ensure channel is always closed, even if Promise.all throws
  let executionError: Error | null = null;
  try {
    const producerTask = fetchIncompleteEvaluations();
    const workerTasks = Array.from({ length: concurrency }, () =>
      processEvaluationsFromChannel()
    );

    // Wait for producer and all workers to finish
    await Promise.all([producerTask, ...workerTasks]);
  } catch (error) {
    // Classify and handle errors based on their nature
    const err = error instanceof Error ? error : new Error(String(error));

    // Always surface producer/infrastructure errors
    if (error instanceof EvaluationFetchError) {
      // Producer failed - this is ALWAYS critical regardless of stopOnFirstError
      logger.error(`❌ Critical: Failed to fetch evaluations from server`);
      executionError = err;
    } else if (error instanceof ChannelError && signal.aborted) {
      // Channel closed due to intentional abort - wrap in semantic error
      executionError = new EvaluationAbortedError(
        "Evaluation stopped due to error in concurrent evaluator",
        err
      );
    } else if (stopOnFirstError) {
      // Worker error in stopOnFirstError mode - already logged by worker
      executionError = err;
    } else {
      // Unexpected error (not from worker, not from producer fetch)
      // This could be a bug in our code or infrastructure failure
      logger.error(`❌ Unexpected error during evaluation: ${err.message}`);
      executionError = err;
    }
  } finally {
    // Ensure channel is closed even if there are unexpected errors
    // This is a safety net in case producer's finally block didn't execute
    if (!evalChannel.isClosed) {
      evalChannel.close();
    }
  }

  // Only show completion message if we didn't stop on error
  if (!executionError) {
    logger.info(`✅ Evaluations completed.`);
  }

  if (totalFailed > 0 && !executionError) {
    logger.info(
      `⚠️ Warning: ${totalFailed} out of ${totalProcessed} evaluations failed.`
    );
  }

  // Print summary
  printEvaluationSummary({
    logger,
    experimentId: experiment.id,
    totalProcessed,
    totalCompleted,
  });

  // Flush spans (if tracer was initialized)
  if (provider) {
    await provider.forceFlush();
  }

  // Re-throw error if evaluation failed
  if (executionError) {
    throw executionError;
  }
}
603
+
604
+ /**
605
+ * Record evaluation results to API.
606
+ */
607
+ async function recordEvaluationResults({
608
+ client,
609
+ evaluator,
610
+ experimentRun,
611
+ results,
612
+ error,
613
+ startTime,
614
+ endTime,
615
+ traceId = null,
616
+ }: {
617
+ readonly client: PhoenixClient;
618
+ readonly evaluator: Evaluator;
619
+ readonly experimentRun: IncompleteEvaluation["experimentRun"];
620
+ readonly results?: readonly EvaluationResult[];
621
+ readonly error?: string;
622
+ readonly startTime: Date;
623
+ readonly endTime: Date;
624
+ readonly traceId?: string | null;
625
+ }): Promise<void> {
626
+ if (results) {
627
+ // Success case: record each evaluation result
628
+ for (const singleResult of results) {
629
+ await client.POST("/v1/experiment_evaluations", {
630
+ body: {
631
+ experiment_run_id: experimentRun.id,
632
+ name: evaluator.name,
633
+ annotator_kind: evaluator.kind,
634
+ result: {
635
+ score: singleResult.score ?? null,
636
+ label: singleResult.label ?? null,
637
+ explanation: singleResult.explanation ?? null,
638
+ metadata: singleResult.metadata ?? {},
639
+ },
640
+ start_time: startTime.toISOString(),
641
+ end_time: endTime.toISOString(),
642
+ error: null,
643
+ trace_id: traceId,
644
+ },
645
+ });
646
+ }
647
+ } else if (error) {
648
+ // Error case: record failed evaluation with evaluator name
649
+ await client.POST("/v1/experiment_evaluations", {
650
+ body: {
651
+ experiment_run_id: experimentRun.id,
652
+ name: evaluator.name,
653
+ annotator_kind: evaluator.kind,
654
+ result: null,
655
+ start_time: startTime.toISOString(),
656
+ end_time: endTime.toISOString(),
657
+ error,
658
+ trace_id: traceId,
659
+ },
660
+ });
661
+ }
662
+ }
663
+
664
/**
 * Run a single evaluation and record the result.
 *
 * Executes `evaluator.evaluate` against the stored experiment-run output and
 * dataset example, then persists the outcome — success results or the failure
 * message — via `recordEvaluationResults`. When a tracer is provided, the
 * execution is wrapped in an EVALUATOR span and the span's traceId is attached
 * to the recorded evaluation. On evaluator failure the error is both recorded
 * and rethrown so the caller can count failures or abort.
 *
 * NOTE(review): the `await recordEvaluationResults(...)` inside each `finally`
 * can itself reject, which would mask the original evaluator error — confirm
 * this is the intended trade-off.
 */
async function runSingleEvaluation({
  client,
  experimentId,
  evaluator,
  experimentRun,
  datasetExample,
  tracer,
}: {
  readonly client: PhoenixClient;
  readonly experimentId: string;
  readonly evaluator: Evaluator;
  readonly experimentRun: IncompleteEvaluation["experimentRun"];
  readonly datasetExample: IncompleteEvaluation["datasetExample"];
  readonly tracer: Tracer | null;
}): Promise<void> {
  const startTime = new Date();

  // Prepare evaluator inputs
  const taskOutput: TaskOutput = experimentRun.output ?? null;
  const expectedOutput = datasetExample.output ?? undefined;

  // If no tracer (no project_name), execute without tracing
  if (!tracer) {
    let results: readonly EvaluationResult[] | undefined;
    let error: string | undefined;

    try {
      // Promise.resolve() accommodates both sync and async evaluators.
      const result = await Promise.resolve(
        evaluator.evaluate({
          input: datasetExample.input,
          output: taskOutput,
          expected: expectedOutput,
          metadata: datasetExample.metadata,
        })
      );
      results = Array.isArray(result) ? result : [result];
    } catch (err) {
      error = err instanceof Error ? err.message : String(err);
      // Rethrow so the caller observes the failure; the finally below still
      // records the error to the API first.
      throw err;
    } finally {
      const endTime = new Date();
      await recordEvaluationResults({
        client,
        evaluator,
        experimentRun,
        results,
        error,
        startTime,
        endTime,
      });
    }
    return;
  }

  // With tracer: wrap execution in a span for observability
  return tracer.startActiveSpan(
    `Evaluation: ${evaluator.name}`,
    async (span) => {
      // Set span attributes for input
      span.setAttributes({
        [SemanticConventions.OPENINFERENCE_SPAN_KIND]:
          OpenInferenceSpanKind.EVALUATOR,
        [SemanticConventions.INPUT_MIME_TYPE]: MimeType.JSON,
        [SemanticConventions.INPUT_VALUE]: ensureString({
          input: datasetExample.input,
          output: experimentRun.output,
          expected: datasetExample.output,
          metadata: datasetExample.metadata,
        }),
        ...objectAsAttributes({
          experiment_id: experimentId,
          experiment_run_id: experimentRun.id,
          dataset_example_id: datasetExample.id,
        }),
      });

      let results: readonly EvaluationResult[] | undefined;
      let error: string | undefined;

      try {
        // Execute the evaluator (only once!)
        const result = await Promise.resolve(
          evaluator.evaluate({
            input: datasetExample.input,
            output: taskOutput,
            expected: expectedOutput,
            metadata: datasetExample.metadata,
          })
        );

        results = Array.isArray(result) ? result : [result];

        // Set output span attributes
        span.setAttributes({
          [SemanticConventions.OUTPUT_MIME_TYPE]: MimeType.JSON,
          [SemanticConventions.OUTPUT_VALUE]: ensureString(result),
        });

        // Set attributes from first result for span metadata
        if (results[0]) {
          span.setAttributes(objectAsAttributes(results[0]));
        }

        span.setStatus({ code: SpanStatusCode.OK });
      } catch (err) {
        error = err instanceof Error ? err.message : String(err);

        span.setStatus({
          code: SpanStatusCode.ERROR,
          message: error,
        });
        span.recordException(err as Error);

        // Rethrow after marking the span; the finally below records to the API.
        throw err;
      } finally {
        const endTime = new Date();
        // End the span before the API call so span timing excludes recording.
        span.end();

        // Record results to API
        await recordEvaluationResults({
          client,
          evaluator,
          experimentRun,
          results,
          error,
          startTime,
          endTime,
          traceId: span.spanContext().traceId,
        });
      }
    }
  );
}