@arizeai/phoenix-client 5.2.1 → 5.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (151)
  1. package/README.md +3 -3
  2. package/dist/esm/__generated__/api/v1.d.ts +321 -9
  3. package/dist/esm/__generated__/api/v1.d.ts.map +1 -1
  4. package/dist/esm/experiments/createExperiment.d.ts +39 -0
  5. package/dist/esm/experiments/createExperiment.d.ts.map +1 -0
  6. package/dist/esm/experiments/createExperiment.js +43 -0
  7. package/dist/esm/experiments/createExperiment.js.map +1 -0
  8. package/dist/esm/experiments/deleteExperiment.d.ts +36 -0
  9. package/dist/esm/experiments/deleteExperiment.d.ts.map +1 -0
  10. package/dist/esm/experiments/deleteExperiment.js +49 -0
  11. package/dist/esm/experiments/deleteExperiment.js.map +1 -0
  12. package/dist/esm/experiments/getExperimentInfo.d.ts.map +1 -1
  13. package/dist/esm/experiments/getExperimentInfo.js +9 -2
  14. package/dist/esm/experiments/getExperimentInfo.js.map +1 -1
  15. package/dist/esm/experiments/helpers/asExperimentEvaluator.d.ts +19 -0
  16. package/dist/esm/experiments/helpers/asExperimentEvaluator.d.ts.map +1 -0
  17. package/dist/esm/experiments/helpers/asExperimentEvaluator.js +19 -0
  18. package/dist/esm/experiments/helpers/asExperimentEvaluator.js.map +1 -0
  19. package/dist/esm/experiments/helpers/fromPhoenixLLMEvaluator.d.ts +9 -0
  20. package/dist/esm/experiments/helpers/fromPhoenixLLMEvaluator.d.ts.map +1 -0
  21. package/dist/esm/experiments/helpers/fromPhoenixLLMEvaluator.js +18 -0
  22. package/dist/esm/experiments/helpers/fromPhoenixLLMEvaluator.js.map +1 -0
  23. package/dist/esm/experiments/helpers/getExperimentEvaluators.d.ts +6 -0
  24. package/dist/esm/experiments/helpers/getExperimentEvaluators.d.ts.map +1 -0
  25. package/dist/esm/experiments/helpers/getExperimentEvaluators.js +58 -0
  26. package/dist/esm/experiments/helpers/getExperimentEvaluators.js.map +1 -0
  27. package/dist/esm/experiments/helpers/index.d.ts +4 -0
  28. package/dist/esm/experiments/helpers/index.d.ts.map +1 -0
  29. package/dist/esm/experiments/helpers/index.js +4 -0
  30. package/dist/esm/experiments/helpers/index.js.map +1 -0
  31. package/dist/esm/experiments/index.d.ts +6 -0
  32. package/dist/esm/experiments/index.d.ts.map +1 -1
  33. package/dist/esm/experiments/index.js +6 -0
  34. package/dist/esm/experiments/index.js.map +1 -1
  35. package/dist/esm/experiments/listExperiments.d.ts +29 -0
  36. package/dist/esm/experiments/listExperiments.d.ts.map +1 -0
  37. package/dist/esm/experiments/listExperiments.js +59 -0
  38. package/dist/esm/experiments/listExperiments.js.map +1 -0
  39. package/dist/esm/experiments/resumeEvaluation.d.ts +105 -0
  40. package/dist/esm/experiments/resumeEvaluation.d.ts.map +1 -0
  41. package/dist/esm/experiments/resumeEvaluation.js +559 -0
  42. package/dist/esm/experiments/resumeEvaluation.js.map +1 -0
  43. package/dist/esm/experiments/resumeExperiment.d.ts +102 -0
  44. package/dist/esm/experiments/resumeExperiment.d.ts.map +1 -0
  45. package/dist/esm/experiments/resumeExperiment.js +517 -0
  46. package/dist/esm/experiments/resumeExperiment.js.map +1 -0
  47. package/dist/esm/experiments/runExperiment.d.ts +4 -3
  48. package/dist/esm/experiments/runExperiment.d.ts.map +1 -1
  49. package/dist/esm/experiments/runExperiment.js +32 -3
  50. package/dist/esm/experiments/runExperiment.js.map +1 -1
  51. package/dist/esm/prompts/createPrompt.d.ts +19 -1
  52. package/dist/esm/prompts/createPrompt.d.ts.map +1 -1
  53. package/dist/esm/prompts/createPrompt.js +14 -1
  54. package/dist/esm/prompts/createPrompt.js.map +1 -1
  55. package/dist/esm/tsconfig.esm.tsbuildinfo +1 -1
  56. package/dist/esm/types/experiments.d.ts +66 -3
  57. package/dist/esm/types/experiments.d.ts.map +1 -1
  58. package/dist/esm/utils/channel.d.ts +229 -0
  59. package/dist/esm/utils/channel.d.ts.map +1 -0
  60. package/dist/esm/utils/channel.js +352 -0
  61. package/dist/esm/utils/channel.js.map +1 -0
  62. package/dist/esm/utils/formatPromptMessages.d.ts.map +1 -1
  63. package/dist/esm/utils/getPromptBySelector.d.ts.map +1 -1
  64. package/dist/esm/utils/isHttpError.d.ts +21 -0
  65. package/dist/esm/utils/isHttpError.d.ts.map +1 -0
  66. package/dist/esm/utils/isHttpError.js +33 -0
  67. package/dist/esm/utils/isHttpError.js.map +1 -0
  68. package/dist/src/__generated__/api/v1.d.ts +321 -9
  69. package/dist/src/__generated__/api/v1.d.ts.map +1 -1
  70. package/dist/src/experiments/createExperiment.d.ts +39 -0
  71. package/dist/src/experiments/createExperiment.d.ts.map +1 -0
  72. package/dist/src/experiments/createExperiment.js +43 -0
  73. package/dist/src/experiments/createExperiment.js.map +1 -0
  74. package/dist/src/experiments/deleteExperiment.d.ts +36 -0
  75. package/dist/src/experiments/deleteExperiment.d.ts.map +1 -0
  76. package/dist/src/experiments/deleteExperiment.js +52 -0
  77. package/dist/src/experiments/deleteExperiment.js.map +1 -0
  78. package/dist/src/experiments/getExperimentInfo.d.ts.map +1 -1
  79. package/dist/src/experiments/getExperimentInfo.js +9 -2
  80. package/dist/src/experiments/getExperimentInfo.js.map +1 -1
  81. package/dist/src/experiments/helpers/asExperimentEvaluator.d.ts +19 -0
  82. package/dist/src/experiments/helpers/asExperimentEvaluator.d.ts.map +1 -0
  83. package/dist/src/experiments/helpers/asExperimentEvaluator.js +22 -0
  84. package/dist/src/experiments/helpers/asExperimentEvaluator.js.map +1 -0
  85. package/dist/src/experiments/helpers/fromPhoenixLLMEvaluator.d.ts +9 -0
  86. package/dist/src/experiments/helpers/fromPhoenixLLMEvaluator.d.ts.map +1 -0
  87. package/dist/src/experiments/helpers/fromPhoenixLLMEvaluator.js +21 -0
  88. package/dist/src/experiments/helpers/fromPhoenixLLMEvaluator.js.map +1 -0
  89. package/dist/src/experiments/helpers/getExperimentEvaluators.d.ts +6 -0
  90. package/dist/src/experiments/helpers/getExperimentEvaluators.d.ts.map +1 -0
  91. package/dist/src/experiments/helpers/getExperimentEvaluators.js +61 -0
  92. package/dist/src/experiments/helpers/getExperimentEvaluators.js.map +1 -0
  93. package/dist/src/experiments/helpers/index.d.ts +4 -0
  94. package/dist/src/experiments/helpers/index.d.ts.map +1 -0
  95. package/dist/src/experiments/helpers/index.js +20 -0
  96. package/dist/src/experiments/helpers/index.js.map +1 -0
  97. package/dist/src/experiments/index.d.ts +6 -0
  98. package/dist/src/experiments/index.d.ts.map +1 -1
  99. package/dist/src/experiments/index.js +6 -0
  100. package/dist/src/experiments/index.js.map +1 -1
  101. package/dist/src/experiments/listExperiments.d.ts +29 -0
  102. package/dist/src/experiments/listExperiments.d.ts.map +1 -0
  103. package/dist/src/experiments/listExperiments.js +66 -0
  104. package/dist/src/experiments/listExperiments.js.map +1 -0
  105. package/dist/src/experiments/resumeEvaluation.d.ts +105 -0
  106. package/dist/src/experiments/resumeEvaluation.d.ts.map +1 -0
  107. package/dist/src/experiments/resumeEvaluation.js +585 -0
  108. package/dist/src/experiments/resumeEvaluation.js.map +1 -0
  109. package/dist/src/experiments/resumeExperiment.d.ts +102 -0
  110. package/dist/src/experiments/resumeExperiment.d.ts.map +1 -0
  111. package/dist/src/experiments/resumeExperiment.js +540 -0
  112. package/dist/src/experiments/resumeExperiment.js.map +1 -0
  113. package/dist/src/experiments/runExperiment.d.ts +4 -3
  114. package/dist/src/experiments/runExperiment.d.ts.map +1 -1
  115. package/dist/src/experiments/runExperiment.js +32 -3
  116. package/dist/src/experiments/runExperiment.js.map +1 -1
  117. package/dist/src/prompts/createPrompt.d.ts +19 -1
  118. package/dist/src/prompts/createPrompt.d.ts.map +1 -1
  119. package/dist/src/prompts/createPrompt.js +14 -1
  120. package/dist/src/prompts/createPrompt.js.map +1 -1
  121. package/dist/src/types/experiments.d.ts +66 -3
  122. package/dist/src/types/experiments.d.ts.map +1 -1
  123. package/dist/src/utils/channel.d.ts +229 -0
  124. package/dist/src/utils/channel.d.ts.map +1 -0
  125. package/dist/src/utils/channel.js +385 -0
  126. package/dist/src/utils/channel.js.map +1 -0
  127. package/dist/src/utils/formatPromptMessages.d.ts.map +1 -1
  128. package/dist/src/utils/getPromptBySelector.d.ts.map +1 -1
  129. package/dist/src/utils/isHttpError.d.ts +21 -0
  130. package/dist/src/utils/isHttpError.d.ts.map +1 -0
  131. package/dist/src/utils/isHttpError.js +37 -0
  132. package/dist/src/utils/isHttpError.js.map +1 -0
  133. package/dist/tsconfig.tsbuildinfo +1 -1
  134. package/package.json +6 -5
  135. package/src/__generated__/api/v1.ts +321 -9
  136. package/src/experiments/createExperiment.ts +90 -0
  137. package/src/experiments/deleteExperiment.ts +67 -0
  138. package/src/experiments/getExperimentInfo.ts +9 -2
  139. package/src/experiments/helpers/asExperimentEvaluator.ts +29 -0
  140. package/src/experiments/helpers/fromPhoenixLLMEvaluator.ts +24 -0
  141. package/src/experiments/helpers/getExperimentEvaluators.ts +74 -0
  142. package/src/experiments/helpers/index.ts +3 -0
  143. package/src/experiments/index.ts +6 -0
  144. package/src/experiments/listExperiments.ts +83 -0
  145. package/src/experiments/resumeEvaluation.ts +804 -0
  146. package/src/experiments/resumeExperiment.ts +745 -0
  147. package/src/experiments/runExperiment.ts +37 -5
  148. package/src/prompts/createPrompt.ts +19 -1
  149. package/src/types/experiments.ts +72 -3
  150. package/src/utils/channel.ts +397 -0
  151. package/src/utils/isHttpError.ts +45 -0
@@ -0,0 +1,804 @@
1
+ import {
2
+ MimeType,
3
+ OpenInferenceSpanKind,
4
+ SemanticConventions,
5
+ } from "@arizeai/openinference-semantic-conventions";
6
+ import {
7
+ type DiagLogLevel,
8
+ NodeTracerProvider,
9
+ objectAsAttributes,
10
+ register,
11
+ SpanStatusCode,
12
+ Tracer,
13
+ } from "@arizeai/phoenix-otel";
14
+
15
+ import { components } from "../__generated__/api/v1";
16
+ import { createClient, type PhoenixClient } from "../client";
17
+ import { ClientFn } from "../types/core";
18
+ import type {
19
+ EvaluationResult,
20
+ Evaluator,
21
+ ExperimentEvaluatorLike,
22
+ IncompleteEvaluation,
23
+ TaskOutput,
24
+ } from "../types/experiments";
25
+ import { type Logger } from "../types/logger";
26
+ import { Channel, ChannelError } from "../utils/channel";
27
+ import { ensureString } from "../utils/ensureString";
28
+ import { toObjectHeaders } from "../utils/toObjectHeaders";
29
+
30
+ import { getExperimentInfo } from "./getExperimentInfo.js";
31
+ import { getExperimentEvaluators } from "./helpers";
32
+
33
+ import invariant from "tiny-invariant";
34
+
35
+ /**
36
+ * Error thrown when evaluation is aborted due to a failure in stopOnFirstError mode.
37
+ * This provides semantic context that the abort was intentional, not an infrastructure failure.
38
+ * @internal - Not exported to minimize API surface area
39
+ */
40
+ class EvaluationAbortedError extends Error {
41
+ constructor(message: string, cause?: Error) {
42
+ super(message);
43
+ this.name = "EvaluationAbortedError";
44
+ this.cause = cause;
45
+ }
46
+ }
47
+
48
+ /**
49
+ * Error thrown when the producer fails to fetch incomplete evaluations from the server.
50
+ * This is a critical error that should always be surfaced, even in stopOnFirstError=false mode.
51
+ * @internal - Not exported to minimize API surface area
52
+ */
53
+ class EvaluationFetchError extends Error {
54
+ constructor(message: string, cause?: Error) {
55
+ super(message);
56
+ this.name = "EvaluationFetchError";
57
+ this.cause = cause;
58
+ }
59
+ }
60
+
61
+ export type ResumeEvaluationParams = ClientFn & {
62
+ /**
63
+ * The ID of the experiment to resume evaluations for
64
+ */
65
+ readonly experimentId: string;
66
+ /**
67
+ * A single evaluator or list of evaluators to run on incomplete evaluations
68
+ */
69
+ readonly evaluators:
70
+ | ExperimentEvaluatorLike
71
+ | readonly ExperimentEvaluatorLike[];
72
+ /**
73
+ * The logger to use
74
+ * @default console
75
+ */
76
+ readonly logger?: Logger;
77
+ /**
78
+ * The number of concurrent evaluation executions
79
+ * @default 5
80
+ */
81
+ readonly concurrency?: number;
82
+ /**
83
+ * Whether to set the global tracer provider when running evaluators.
84
+ * @default true
85
+ */
86
+ readonly setGlobalTracerProvider?: boolean;
87
+ /**
88
+ * Whether to use batch span processor for tracing.
89
+ * @default true
90
+ */
91
+ readonly useBatchSpanProcessor?: boolean;
92
+ /**
93
+ * Log level to set for the default DiagConsoleLogger when tracing.
94
+ */
95
+ readonly diagLogLevel?: DiagLogLevel;
96
+ /**
97
+ * Stop processing and exit as soon as any evaluation fails.
98
+ * @default false
99
+ */
100
+ readonly stopOnFirstError?: boolean;
101
+ };
102
+
103
+ const DEFAULT_PAGE_SIZE = 50 as const;
104
+ /**
105
+ * Channel capacity multiplier for producer-consumer buffering.
106
+ * A value of 2 enables pipeline efficiency: workers process batch N while
107
+ * the producer fetches batch N+1, eliminating idle time without excessive
108
+ * memory usage. The channel blocks when full, providing natural backpressure.
109
+ */
110
+ const CHANNEL_CAPACITY_MULTIPLIER = 2 as const;
111
+
112
+ /**
113
+ * Evaluation item for the producer-consumer channel
114
+ */
115
+ type EvalItem = {
116
+ readonly incompleteEval: IncompleteEvaluation;
117
+ readonly evaluator: Evaluator;
118
+ };
119
+
120
+ /**
121
+ * Transforms API incomplete evaluation response to IncompleteEvaluation
122
+ */
123
+ function buildIncompleteEvaluation(
124
+ apiResponse: components["schemas"]["IncompleteExperimentEvaluation"]
125
+ ): IncompleteEvaluation {
126
+ return {
127
+ experimentRun: {
128
+ id: apiResponse.experiment_run.id,
129
+ experimentId: apiResponse.experiment_run.experiment_id,
130
+ datasetExampleId: apiResponse.experiment_run.dataset_example_id,
131
+ output: apiResponse.experiment_run.output ?? null,
132
+ startTime: new Date(apiResponse.experiment_run.start_time),
133
+ endTime: new Date(apiResponse.experiment_run.end_time),
134
+ error: apiResponse.experiment_run.error ?? null,
135
+ traceId: apiResponse.experiment_run.trace_id ?? null,
136
+ },
137
+ datasetExample: {
138
+ id: apiResponse.dataset_example.id,
139
+ input: apiResponse.dataset_example.input,
140
+ output: apiResponse.dataset_example.output ?? null,
141
+ metadata: apiResponse.dataset_example.metadata || {},
142
+ updatedAt: new Date(apiResponse.dataset_example.updated_at),
143
+ },
144
+ evaluationNames: apiResponse.evaluation_names,
145
+ };
146
+ }
147
+
148
+ /**
149
+ * Determines if an evaluator should run for an incomplete evaluation
150
+ */
151
+ function shouldRunEvaluator(
152
+ evaluator: Evaluator,
153
+ incompleteEval: IncompleteEvaluation
154
+ ): boolean {
155
+ // Match evaluator name directly
156
+ return incompleteEval.evaluationNames.includes(evaluator.name);
157
+ }
158
+
159
+ /**
160
+ * Handles fetch errors with helpful version information for unsupported features
161
+ */
162
+ async function handleEvaluationFetchError(
163
+ error: unknown,
164
+ client: PhoenixClient,
165
+ featureName: string
166
+ ): Promise<never> {
167
+ // Check if this is a JSON parse error (likely 404 HTML response from old server)
168
+ const isJsonError =
169
+ error instanceof SyntaxError &&
170
+ error.message.toLowerCase().includes("json");
171
+
172
+ if (isJsonError) {
173
+ // Fetch server version to provide helpful context
174
+ let versionInfo = "";
175
+ try {
176
+ const baseUrl = client.config.baseUrl || "";
177
+ const versionRes = await fetch(`${baseUrl}/arize_phoenix_version`);
178
+ if (versionRes.ok) {
179
+ const version = await versionRes.text();
180
+ versionInfo = ` Your current server version is ${version}.`;
181
+ }
182
+ } catch {
183
+ // Ignore errors fetching version
184
+ }
185
+
186
+ throw new Error(
187
+ `The ${featureName} feature is not available on this Phoenix server. ` +
188
+ "Please upgrade your Phoenix server to use this feature." +
189
+ versionInfo
190
+ );
191
+ }
192
+ throw error;
193
+ }
194
+
195
+ /**
196
+ * Sets up OpenTelemetry tracer for evaluation tracing
197
+ */
198
+ function setupEvaluationTracer({
199
+ projectName,
200
+ baseUrl,
201
+ headers,
202
+ useBatchSpanProcessor,
203
+ diagLogLevel,
204
+ setGlobalTracerProvider,
205
+ }: {
206
+ projectName: string | null;
207
+ baseUrl: string;
208
+ headers?: Record<string, string>;
209
+ useBatchSpanProcessor: boolean;
210
+ diagLogLevel?: DiagLogLevel;
211
+ setGlobalTracerProvider: boolean;
212
+ }): { provider: NodeTracerProvider; tracer: Tracer } | null {
213
+ if (!projectName) {
214
+ return null;
215
+ }
216
+
217
+ const provider = register({
218
+ projectName,
219
+ url: baseUrl,
220
+ headers,
221
+ batch: useBatchSpanProcessor,
222
+ diagLogLevel,
223
+ global: setGlobalTracerProvider,
224
+ });
225
+
226
+ const tracer = provider.getTracer(projectName);
227
+ return { provider, tracer };
228
+ }
229
+
230
+ /**
231
+ * Prints evaluation summary to logger
232
+ */
233
+ function printEvaluationSummary({
234
+ logger,
235
+ experimentId,
236
+ totalProcessed,
237
+ totalCompleted,
238
+ }: {
239
+ logger: Logger;
240
+ experimentId: string;
241
+ totalProcessed: number;
242
+ totalCompleted: number;
243
+ }): void {
244
+ logger.info("\n" + "=".repeat(70));
245
+ logger.info("📊 Evaluation Resume Summary");
246
+ logger.info("=".repeat(70));
247
+ logger.info(`Experiment ID: ${experimentId}`);
248
+ logger.info(`Runs processed: ${totalProcessed}`);
249
+ logger.info(`Evaluations completed: ${totalCompleted}`);
250
+ logger.info("=".repeat(70));
251
+ }
252
+
253
+ /**
254
+ * Resume incomplete evaluations for an experiment.
255
+ *
256
+ * This function identifies which evaluations have not been completed (either missing or failed)
257
+ * and runs the evaluators only for those runs. This is useful for:
258
+ * - Recovering from transient evaluator failures
259
+ * - Adding new evaluators to completed experiments
260
+ * - Completing partially evaluated experiments
261
+ *
262
+ * The function processes incomplete evaluations in batches using pagination to minimize memory usage.
263
+ *
264
+ * Evaluation names are matched to evaluator names. For example, if you pass
265
+ * an evaluator with name "accuracy", it will check for and resume any runs missing the "accuracy" evaluation.
266
+ *
267
+ * **Note:** Multi-output evaluators (evaluators that return an array of results) are not
268
+ * supported for resume operations. Each evaluator should produce a single evaluation
269
+ * result with a name matching the evaluator's name.
270
+ *
271
+ * @throws {Error} Throws different error types based on failure:
272
+ * - "EvaluationFetchError": Unable to fetch incomplete evaluations from the server.
273
+ * Always thrown regardless of stopOnFirstError, as it indicates critical infrastructure failure.
274
+ * - "EvaluationAbortedError": stopOnFirstError=true and an evaluator failed.
275
+ * Original error preserved in `cause` property.
276
+ * - Generic Error: Other evaluator execution errors or unexpected failures.
277
+ *
278
+ * @example
279
+ * ```ts
280
+ * import { resumeEvaluation } from "@arizeai/phoenix-client/experiments";
281
+ *
282
+ * // Standard usage: evaluation name matches evaluator name
283
+ * try {
284
+ * await resumeEvaluation({
285
+ * experimentId: "exp_123",
286
+ * evaluators: [{
287
+ * name: "correctness",
288
+ * kind: "CODE",
289
+ * evaluate: async ({ output, expected }) => ({
290
+ * score: output === expected ? 1 : 0
291
+ * })
292
+ * }],
293
+ * });
294
+ * } catch (error) {
295
+ * // Handle by error name (no instanceof needed)
296
+ * if (error.name === "EvaluationFetchError") {
297
+ * console.error("Failed to connect to server:", error.cause);
298
+ * } else if (error.name === "EvaluationAbortedError") {
299
+ * console.error("Evaluation stopped due to error:", error.cause);
300
+ * } else {
301
+ * console.error("Unexpected error:", error);
302
+ * }
303
+ * }
304
+ *
305
+ * // Stop on first error (useful for debugging)
306
+ * await resumeEvaluation({
307
+ * experimentId: "exp_123",
308
+ * evaluators: [myEvaluator],
309
+ * stopOnFirstError: true, // Exit immediately on first failure
310
+ * });
311
+ * ```
312
+ */
313
+ export async function resumeEvaluation({
314
+ client: _client,
315
+ experimentId,
316
+ evaluators: _evaluators,
317
+ logger = console,
318
+ concurrency = 5,
319
+ setGlobalTracerProvider = true,
320
+ useBatchSpanProcessor = true,
321
+ diagLogLevel,
322
+ stopOnFirstError = false,
323
+ }: ResumeEvaluationParams): Promise<void> {
324
+ const client = _client ?? createClient();
325
+ const pageSize = DEFAULT_PAGE_SIZE;
326
+
327
+ // Normalize evaluators to array
328
+ const evaluators = getExperimentEvaluators(
329
+ Array.isArray(_evaluators) ? _evaluators : [_evaluators]
330
+ );
331
+ // Validate inputs
332
+ invariant(evaluators.length > 0, "Must specify at least one evaluator");
333
+
334
+ // Get experiment info
335
+ logger.info(`🔍 Checking for incomplete evaluations...`);
336
+ const experiment = await getExperimentInfo({ client, experimentId });
337
+
338
+ // Initialize tracer (only if experiment has a project_name)
339
+ const baseUrl = client.config.baseUrl;
340
+ invariant(
341
+ baseUrl,
342
+ "Phoenix base URL not found. Please set PHOENIX_HOST or set baseUrl on the client."
343
+ );
344
+
345
+ const tracerSetup = setupEvaluationTracer({
346
+ projectName: experiment.projectName,
347
+ baseUrl,
348
+ headers: client.config.headers
349
+ ? toObjectHeaders(client.config.headers)
350
+ : undefined,
351
+ useBatchSpanProcessor,
352
+ diagLogLevel,
353
+ setGlobalTracerProvider,
354
+ });
355
+
356
+ const provider = tracerSetup?.provider ?? null;
357
+ const evalTracer = tracerSetup?.tracer ?? null;
358
+
359
+ // Build evaluation names list for query - derive from evaluator names
360
+ const evaluationNamesList = evaluators.map((e) => e.name);
361
+
362
+ // Create a CSP-style bounded buffer for evaluation distribution
363
+ const evalChannel = new Channel<EvalItem>(
364
+ pageSize * CHANNEL_CAPACITY_MULTIPLIER
365
+ );
366
+
367
+ // Abort controller for stopOnFirstError coordination
368
+ const abortController = new AbortController();
369
+ const { signal } = abortController;
370
+
371
+ let totalProcessed = 0;
372
+ let totalCompleted = 0;
373
+ let totalFailed = 0;
374
+
375
+ // Producer: Fetch incomplete evaluations and send to channel
376
+ async function fetchIncompleteEvaluations(): Promise<void> {
377
+ let cursor: string | null = null;
378
+
379
+ try {
380
+ do {
381
+ // Stop fetching if abort signal received
382
+ if (signal.aborted) {
383
+ logger.info("🛑 Stopping fetch due to error in evaluation");
384
+ break;
385
+ }
386
+
387
+ let res: {
388
+ data?: components["schemas"]["GetIncompleteEvaluationsResponseBody"];
389
+ error?: unknown;
390
+ };
391
+
392
+ try {
393
+ res = await client.GET(
394
+ "/v1/experiments/{experiment_id}/incomplete-evaluations",
395
+ {
396
+ params: {
397
+ path: {
398
+ experiment_id: experimentId,
399
+ },
400
+ query: {
401
+ cursor,
402
+ limit: pageSize,
403
+ evaluation_name: evaluationNamesList,
404
+ },
405
+ },
406
+ }
407
+ );
408
+ } catch (error: unknown) {
409
+ // Check for version compatibility issues and throw helpful error
410
+ try {
411
+ await handleEvaluationFetchError(
412
+ error,
413
+ client,
414
+ "resume_evaluation"
415
+ );
416
+ // TypeScript: handleEvaluationFetchError never returns, but add throw for safety
417
+ throw new Error("handleEvaluationFetchError should never return");
418
+ } catch (handledError) {
419
+ // Wrap the error (from handleEvaluationFetchError or original) in semantic error type
420
+ throw new EvaluationFetchError(
421
+ "Failed to fetch incomplete evaluations from server",
422
+ handledError instanceof Error ? handledError : undefined
423
+ );
424
+ }
425
+ }
426
+
427
+ // Check for API errors
428
+ if (res.error) {
429
+ throw new EvaluationFetchError(
430
+ `Failed to fetch incomplete evaluations: ${ensureString(res.error)}`
431
+ );
432
+ }
433
+
434
+ cursor = res.data?.next_cursor ?? null;
435
+ const batchIncomplete = res.data?.data;
436
+ invariant(batchIncomplete, "Failed to fetch incomplete evaluations");
437
+
438
+ if (batchIncomplete.length === 0) {
439
+ if (totalProcessed === 0) {
440
+ logger.info(
441
+ "✅ No incomplete evaluations found. All evaluations are complete."
442
+ );
443
+ }
444
+ break;
445
+ }
446
+
447
+ if (totalProcessed === 0) {
448
+ logger.info("🧠 Resuming evaluations...");
449
+ }
450
+
451
+ // Build evaluation tasks and send to channel
452
+ let batchCount = 0;
453
+ for (const incomplete of batchIncomplete) {
454
+ // Stop sending items if abort signal received
455
+ if (signal.aborted) {
456
+ break;
457
+ }
458
+
459
+ const incompleteEval = buildIncompleteEvaluation(incomplete);
460
+
461
+ const evaluatorsToRun = evaluators.filter((evaluator) =>
462
+ shouldRunEvaluator(evaluator, incompleteEval)
463
+ );
464
+
465
+ // Flatten: Send one channel item per evaluator
466
+ for (const evaluator of evaluatorsToRun) {
467
+ // Stop sending items if abort signal received
468
+ if (signal.aborted) {
469
+ break;
470
+ }
471
+
472
+ await evalChannel.send({ incompleteEval, evaluator });
473
+ batchCount++;
474
+ totalProcessed++;
475
+ }
476
+ }
477
+
478
+ logger.info(
479
+ `Fetched batch of ${batchCount} evaluation tasks (channel buffer: ${evalChannel.length})`
480
+ );
481
+ } while (cursor !== null && !signal.aborted);
482
+ } catch (error) {
483
+ // Re-throw with context preservation
484
+ if (error instanceof EvaluationFetchError) {
485
+ throw error;
486
+ }
487
+ // ChannelError from blocked send() should bubble up naturally
488
+ // (happens when channel closes while producer is blocked)
489
+ if (error instanceof ChannelError) {
490
+ throw error;
491
+ }
492
+ // Wrap any unexpected errors from channel operations
493
+ throw new EvaluationFetchError(
494
+ "Unexpected error during evaluation fetch",
495
+ error instanceof Error ? error : undefined
496
+ );
497
+ } finally {
498
+ evalChannel.close(); // Signal workers we're done
499
+ }
500
+ }
501
+
502
+ // Worker: Process evaluations from channel
503
+ async function processEvaluationsFromChannel(): Promise<void> {
504
+ for await (const item of evalChannel) {
505
+ // Stop processing if abort signal received
506
+ if (signal.aborted) {
507
+ break;
508
+ }
509
+
510
+ try {
511
+ await runSingleEvaluation({
512
+ client,
513
+ experimentId,
514
+ evaluator: item.evaluator,
515
+ experimentRun: item.incompleteEval.experimentRun,
516
+ datasetExample: item.incompleteEval.datasetExample,
517
+ tracer: evalTracer,
518
+ });
519
+ totalCompleted++;
520
+ } catch (error) {
521
+ totalFailed++;
522
+ logger.error(
523
+ `Failed to run evaluator "${item.evaluator.name}" for run ${item.incompleteEval.experimentRun.id}: ${error}`
524
+ );
525
+
526
+ // If stopOnFirstError is enabled, abort and re-throw
527
+ if (stopOnFirstError) {
528
+ logger.error("🛑 Stopping on first error");
529
+ abortController.abort();
530
+ throw error;
531
+ }
532
+ }
533
+ }
534
+ }
535
+
536
+ // Start concurrent execution
537
+ // Wrap in try-finally to ensure channel is always closed, even if Promise.all throws
538
+ let executionError: Error | null = null;
539
+ try {
540
+ const producerTask = fetchIncompleteEvaluations();
541
+ const workerTasks = Array.from({ length: concurrency }, () =>
542
+ processEvaluationsFromChannel()
543
+ );
544
+
545
+ // Wait for producer and all workers to finish
546
+ await Promise.all([producerTask, ...workerTasks]);
547
+ } catch (error) {
548
+ // Classify and handle errors based on their nature
549
+ const err = error instanceof Error ? error : new Error(String(error));
550
+
551
+ // Always surface producer/infrastructure errors
552
+ if (error instanceof EvaluationFetchError) {
553
+ // Producer failed - this is ALWAYS critical regardless of stopOnFirstError
554
+ logger.error(`❌ Critical: Failed to fetch evaluations from server`);
555
+ executionError = err;
556
+ } else if (error instanceof ChannelError && signal.aborted) {
557
+ // Channel closed due to intentional abort - wrap in semantic error
558
+ executionError = new EvaluationAbortedError(
559
+ "Evaluation stopped due to error in concurrent evaluator",
560
+ err
561
+ );
562
+ } else if (stopOnFirstError) {
563
+ // Worker error in stopOnFirstError mode - already logged by worker
564
+ executionError = err;
565
+ } else {
566
+ // Unexpected error (not from worker, not from producer fetch)
567
+ // This could be a bug in our code or infrastructure failure
568
+ logger.error(`❌ Unexpected error during evaluation: ${err.message}`);
569
+ executionError = err;
570
+ }
571
+ } finally {
572
+ // Ensure channel is closed even if there are unexpected errors
573
+ // This is a safety net in case producer's finally block didn't execute
574
+ if (!evalChannel.isClosed) {
575
+ evalChannel.close();
576
+ }
577
+ }
578
+
579
+ // Only show completion message if we didn't stop on error
580
+ if (!executionError) {
581
+ logger.info(`✅ Evaluations completed.`);
582
+ }
583
+
584
+ if (totalFailed > 0 && !executionError) {
585
+ logger.info(
586
+ `⚠️ Warning: ${totalFailed} out of ${totalProcessed} evaluations failed.`
587
+ );
588
+ }
589
+
590
+ // Print summary
591
+ printEvaluationSummary({
592
+ logger,
593
+ experimentId: experiment.id,
594
+ totalProcessed,
595
+ totalCompleted,
596
+ });
597
+
598
+ // Flush spans (if tracer was initialized)
599
+ if (provider) {
600
+ await provider.forceFlush();
601
+ }
602
+
603
+ // Re-throw error if evaluation failed
604
+ if (executionError) {
605
+ throw executionError;
606
+ }
607
+ }
608
+
609
+ /**
610
+ * Record evaluation results to API.
611
+ */
612
+ async function recordEvaluationResults({
613
+ client,
614
+ evaluator,
615
+ experimentRun,
616
+ results,
617
+ error,
618
+ startTime,
619
+ endTime,
620
+ traceId = null,
621
+ }: {
622
+ readonly client: PhoenixClient;
623
+ readonly evaluator: Evaluator;
624
+ readonly experimentRun: IncompleteEvaluation["experimentRun"];
625
+ readonly results?: readonly EvaluationResult[];
626
+ readonly error?: string;
627
+ readonly startTime: Date;
628
+ readonly endTime: Date;
629
+ readonly traceId?: string | null;
630
+ }): Promise<void> {
631
+ if (results) {
632
+ // Success case: record each evaluation result
633
+ for (const singleResult of results) {
634
+ await client.POST("/v1/experiment_evaluations", {
635
+ body: {
636
+ experiment_run_id: experimentRun.id,
637
+ name: evaluator.name,
638
+ annotator_kind: evaluator.kind,
639
+ result: {
640
+ score: singleResult.score ?? null,
641
+ label: singleResult.label ?? null,
642
+ explanation: singleResult.explanation ?? null,
643
+ metadata: singleResult.metadata ?? {},
644
+ },
645
+ start_time: startTime.toISOString(),
646
+ end_time: endTime.toISOString(),
647
+ error: null,
648
+ trace_id: traceId,
649
+ },
650
+ });
651
+ }
652
+ } else if (error) {
653
+ // Error case: record failed evaluation with evaluator name
654
+ await client.POST("/v1/experiment_evaluations", {
655
+ body: {
656
+ experiment_run_id: experimentRun.id,
657
+ name: evaluator.name,
658
+ annotator_kind: evaluator.kind,
659
+ result: null,
660
+ start_time: startTime.toISOString(),
661
+ end_time: endTime.toISOString(),
662
+ error,
663
+ trace_id: traceId,
664
+ },
665
+ });
666
+ }
667
+ }
668
+
669
+ /**
670
+ * Run a single evaluation and record the result.
671
+ */
672
+ async function runSingleEvaluation({
673
+ client,
674
+ experimentId,
675
+ evaluator,
676
+ experimentRun,
677
+ datasetExample,
678
+ tracer,
679
+ }: {
680
+ readonly client: PhoenixClient;
681
+ readonly experimentId: string;
682
+ readonly evaluator: Evaluator;
683
+ readonly experimentRun: IncompleteEvaluation["experimentRun"];
684
+ readonly datasetExample: IncompleteEvaluation["datasetExample"];
685
+ readonly tracer: Tracer | null;
686
+ }): Promise<void> {
687
+ const startTime = new Date();
688
+
689
+ // Prepare evaluator inputs
690
+ const taskOutput: TaskOutput = experimentRun.output ?? null;
691
+ const expectedOutput = datasetExample.output ?? undefined;
692
+
693
+ // If no tracer (no project_name), execute without tracing
694
+ if (!tracer) {
695
+ let results: readonly EvaluationResult[] | undefined;
696
+ let error: string | undefined;
697
+
698
+ try {
699
+ const result = await Promise.resolve(
700
+ evaluator.evaluate({
701
+ input: datasetExample.input,
702
+ output: taskOutput,
703
+ expected: expectedOutput,
704
+ metadata: datasetExample.metadata,
705
+ })
706
+ );
707
+ results = Array.isArray(result) ? result : [result];
708
+ } catch (err) {
709
+ error = err instanceof Error ? err.message : String(err);
710
+ throw err;
711
+ } finally {
712
+ const endTime = new Date();
713
+ await recordEvaluationResults({
714
+ client,
715
+ evaluator,
716
+ experimentRun,
717
+ results,
718
+ error,
719
+ startTime,
720
+ endTime,
721
+ });
722
+ }
723
+ return;
724
+ }
725
+
726
+ // With tracer: wrap execution in a span for observability
727
+ return tracer.startActiveSpan(
728
+ `Evaluation: ${evaluator.name}`,
729
+ async (span) => {
730
+ // Set span attributes for input
731
+ span.setAttributes({
732
+ [SemanticConventions.OPENINFERENCE_SPAN_KIND]:
733
+ OpenInferenceSpanKind.EVALUATOR,
734
+ [SemanticConventions.INPUT_MIME_TYPE]: MimeType.JSON,
735
+ [SemanticConventions.INPUT_VALUE]: ensureString({
736
+ input: datasetExample.input,
737
+ output: experimentRun.output,
738
+ expected: datasetExample.output,
739
+ metadata: datasetExample.metadata,
740
+ }),
741
+ ...objectAsAttributes({
742
+ experiment_id: experimentId,
743
+ experiment_run_id: experimentRun.id,
744
+ dataset_example_id: datasetExample.id,
745
+ }),
746
+ });
747
+
748
+ let results: readonly EvaluationResult[] | undefined;
749
+ let error: string | undefined;
750
+
751
+ try {
752
+ // Execute the evaluator (only once!)
753
+ const result = await Promise.resolve(
754
+ evaluator.evaluate({
755
+ input: datasetExample.input,
756
+ output: taskOutput,
757
+ expected: expectedOutput,
758
+ metadata: datasetExample.metadata,
759
+ })
760
+ );
761
+
762
+ results = Array.isArray(result) ? result : [result];
763
+
764
+ // Set output span attributes
765
+ span.setAttributes({
766
+ [SemanticConventions.OUTPUT_MIME_TYPE]: MimeType.JSON,
767
+ [SemanticConventions.OUTPUT_VALUE]: ensureString(result),
768
+ });
769
+
770
+ // Set attributes from first result for span metadata
771
+ if (results[0]) {
772
+ span.setAttributes(objectAsAttributes(results[0]));
773
+ }
774
+
775
+ span.setStatus({ code: SpanStatusCode.OK });
776
+ } catch (err) {
777
+ error = err instanceof Error ? err.message : String(err);
778
+
779
+ span.setStatus({
780
+ code: SpanStatusCode.ERROR,
781
+ message: error,
782
+ });
783
+ span.recordException(err as Error);
784
+
785
+ throw err;
786
+ } finally {
787
+ const endTime = new Date();
788
+ span.end();
789
+
790
+ // Record results to API
791
+ await recordEvaluationResults({
792
+ client,
793
+ evaluator,
794
+ experimentRun,
795
+ results,
796
+ error,
797
+ startTime,
798
+ endTime,
799
+ traceId: span.spanContext().traceId,
800
+ });
801
+ }
802
+ }
803
+ );
804
+ }