@arizeai/phoenix-client 6.5.4 → 6.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/esm/__generated__/api/v1.d.ts +244 -0
- package/dist/esm/__generated__/api/v1.d.ts.map +1 -1
- package/dist/esm/experiments/resumeEvaluation.d.ts.map +1 -1
- package/dist/esm/experiments/resumeEvaluation.js +181 -170
- package/dist/esm/experiments/resumeEvaluation.js.map +1 -1
- package/dist/esm/experiments/resumeExperiment.d.ts.map +1 -1
- package/dist/esm/experiments/resumeExperiment.js +201 -185
- package/dist/esm/experiments/resumeExperiment.js.map +1 -1
- package/dist/esm/experiments/runExperiment.d.ts.map +1 -1
- package/dist/esm/experiments/runExperiment.js +239 -207
- package/dist/esm/experiments/runExperiment.js.map +1 -1
- package/dist/esm/experiments/tracing.d.ts +10 -0
- package/dist/esm/experiments/tracing.d.ts.map +1 -0
- package/dist/esm/experiments/tracing.js +21 -0
- package/dist/esm/experiments/tracing.js.map +1 -0
- package/dist/esm/prompts/sdks/toSDK.d.ts +2 -2
- package/dist/esm/tsconfig.esm.tsbuildinfo +1 -1
- package/dist/esm/types/experiments.d.ts +6 -0
- package/dist/esm/types/experiments.d.ts.map +1 -1
- package/dist/esm/utils/formatPromptMessages.d.ts.map +1 -1
- package/dist/esm/utils/getPromptBySelector.d.ts.map +1 -1
- package/dist/src/__generated__/api/v1.d.ts +244 -0
- package/dist/src/__generated__/api/v1.d.ts.map +1 -1
- package/dist/src/experiments/resumeEvaluation.d.ts.map +1 -1
- package/dist/src/experiments/resumeEvaluation.js +194 -183
- package/dist/src/experiments/resumeEvaluation.js.map +1 -1
- package/dist/src/experiments/resumeExperiment.d.ts.map +1 -1
- package/dist/src/experiments/resumeExperiment.js +214 -198
- package/dist/src/experiments/resumeExperiment.js.map +1 -1
- package/dist/src/experiments/runExperiment.d.ts.map +1 -1
- package/dist/src/experiments/runExperiment.js +229 -197
- package/dist/src/experiments/runExperiment.js.map +1 -1
- package/dist/src/experiments/tracing.d.ts +10 -0
- package/dist/src/experiments/tracing.d.ts.map +1 -0
- package/dist/src/experiments/tracing.js +24 -0
- package/dist/src/experiments/tracing.js.map +1 -0
- package/dist/src/types/experiments.d.ts +6 -0
- package/dist/src/types/experiments.d.ts.map +1 -1
- package/dist/src/utils/formatPromptMessages.d.ts.map +1 -1
- package/dist/src/utils/getPromptBySelector.d.ts.map +1 -1
- package/dist/tsconfig.tsbuildinfo +1 -1
- package/docs/annotations.mdx +83 -0
- package/docs/datasets.mdx +77 -0
- package/docs/document-annotations.mdx +208 -0
- package/docs/experiments.mdx +376 -0
- package/docs/overview.mdx +176 -0
- package/docs/prompts.mdx +73 -0
- package/docs/session-annotations.mdx +158 -0
- package/docs/sessions.mdx +87 -0
- package/docs/span-annotations.mdx +283 -0
- package/docs/spans.mdx +76 -0
- package/docs/traces.mdx +63 -0
- package/package.json +9 -3
- package/src/__generated__/api/v1.ts +244 -0
- package/src/experiments/resumeEvaluation.ts +226 -206
- package/src/experiments/resumeExperiment.ts +237 -213
- package/src/experiments/runExperiment.ts +282 -243
- package/src/experiments/tracing.ts +30 -0
- package/src/types/experiments.ts +6 -0
|
@@ -3,8 +3,13 @@ import {
|
|
|
3
3
|
OpenInferenceSpanKind,
|
|
4
4
|
SemanticConventions,
|
|
5
5
|
} from "@arizeai/openinference-semantic-conventions";
|
|
6
|
-
import type {
|
|
6
|
+
import type {
|
|
7
|
+
GlobalTracerProviderRegistration,
|
|
8
|
+
NodeTracerProvider,
|
|
9
|
+
Tracer,
|
|
10
|
+
} from "@arizeai/phoenix-otel";
|
|
7
11
|
import {
|
|
12
|
+
attachGlobalTracerProvider,
|
|
8
13
|
type DiagLogLevel,
|
|
9
14
|
objectAsAttributes,
|
|
10
15
|
register,
|
|
@@ -29,6 +34,7 @@ import { toObjectHeaders } from "../utils/toObjectHeaders";
|
|
|
29
34
|
import { getExperimentInfo } from "./getExperimentInfo.js";
|
|
30
35
|
import { getExperimentEvaluators } from "./helpers";
|
|
31
36
|
import { logEvalResumeSummary, PROGRESS_PREFIX } from "./logging";
|
|
37
|
+
import { cleanupOwnedTracerProvider } from "./tracing";
|
|
32
38
|
|
|
33
39
|
/**
|
|
34
40
|
* Error thrown when evaluation is aborted due to a failure in stopOnFirstError mode.
|
|
@@ -207,7 +213,11 @@ function setupEvaluationTracer({
|
|
|
207
213
|
useBatchSpanProcessor: boolean;
|
|
208
214
|
diagLogLevel?: DiagLogLevel;
|
|
209
215
|
setGlobalTracerProvider: boolean;
|
|
210
|
-
}): {
|
|
216
|
+
}): {
|
|
217
|
+
provider: NodeTracerProvider;
|
|
218
|
+
tracer: Tracer;
|
|
219
|
+
globalRegistration: GlobalTracerProviderRegistration | null;
|
|
220
|
+
} | null {
|
|
211
221
|
if (!projectName) {
|
|
212
222
|
return null;
|
|
213
223
|
}
|
|
@@ -218,11 +228,14 @@ function setupEvaluationTracer({
|
|
|
218
228
|
headers,
|
|
219
229
|
batch: useBatchSpanProcessor,
|
|
220
230
|
diagLogLevel,
|
|
221
|
-
global:
|
|
231
|
+
global: false,
|
|
222
232
|
});
|
|
233
|
+
const globalRegistration = setGlobalTracerProvider
|
|
234
|
+
? attachGlobalTracerProvider(provider)
|
|
235
|
+
: null;
|
|
223
236
|
|
|
224
237
|
const tracer = provider.getTracer(projectName);
|
|
225
|
-
return { provider, tracer };
|
|
238
|
+
return { provider, tracer, globalRegistration };
|
|
226
239
|
}
|
|
227
240
|
|
|
228
241
|
/**
|
|
@@ -329,252 +342,257 @@ export async function resumeEvaluation({
|
|
|
329
342
|
});
|
|
330
343
|
|
|
331
344
|
const provider = tracerSetup?.provider ?? null;
|
|
345
|
+
const globalRegistration = tracerSetup?.globalRegistration ?? null;
|
|
332
346
|
const evalTracer = tracerSetup?.tracer ?? null;
|
|
333
347
|
|
|
334
|
-
|
|
335
|
-
|
|
348
|
+
try {
|
|
349
|
+
// Build evaluation names list for query - derive from evaluator names
|
|
350
|
+
const evaluationNamesList = evaluators.map((e) => e.name);
|
|
336
351
|
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
352
|
+
// Create a CSP-style bounded buffer for evaluation distribution
|
|
353
|
+
const evalChannel = new Channel<EvalItem>(
|
|
354
|
+
pageSize * CHANNEL_CAPACITY_MULTIPLIER
|
|
355
|
+
);
|
|
341
356
|
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
357
|
+
// Abort controller for stopOnFirstError coordination
|
|
358
|
+
const abortController = new AbortController();
|
|
359
|
+
const { signal } = abortController;
|
|
345
360
|
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
361
|
+
let totalProcessed = 0;
|
|
362
|
+
let totalCompleted = 0;
|
|
363
|
+
let totalFailed = 0;
|
|
349
364
|
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
365
|
+
// Producer: Fetch incomplete evaluations and send to channel
|
|
366
|
+
async function fetchIncompleteEvaluations(): Promise<void> {
|
|
367
|
+
let cursor: string | null = null;
|
|
353
368
|
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
369
|
+
try {
|
|
370
|
+
do {
|
|
371
|
+
// Stop fetching if abort signal received
|
|
372
|
+
if (signal.aborted) {
|
|
373
|
+
logger.debug(`${PROGRESS_PREFIX.progress}Stopping fetch.`);
|
|
374
|
+
break;
|
|
375
|
+
}
|
|
361
376
|
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
377
|
+
let res: {
|
|
378
|
+
data?: components["schemas"]["GetIncompleteEvaluationsResponseBody"];
|
|
379
|
+
error?: unknown;
|
|
380
|
+
};
|
|
366
381
|
|
|
367
|
-
try {
|
|
368
|
-
res = await client.GET(
|
|
369
|
-
"/v1/experiments/{experiment_id}/incomplete-evaluations",
|
|
370
|
-
{
|
|
371
|
-
params: {
|
|
372
|
-
path: {
|
|
373
|
-
experiment_id: experimentId,
|
|
374
|
-
},
|
|
375
|
-
query: {
|
|
376
|
-
cursor,
|
|
377
|
-
limit: pageSize,
|
|
378
|
-
evaluation_name: evaluationNamesList,
|
|
379
|
-
},
|
|
380
|
-
},
|
|
381
|
-
}
|
|
382
|
-
);
|
|
383
|
-
} catch (error: unknown) {
|
|
384
|
-
// Check for version compatibility issues and throw helpful error
|
|
385
382
|
try {
|
|
386
|
-
await
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
383
|
+
res = await client.GET(
|
|
384
|
+
"/v1/experiments/{experiment_id}/incomplete-evaluations",
|
|
385
|
+
{
|
|
386
|
+
params: {
|
|
387
|
+
path: {
|
|
388
|
+
experiment_id: experimentId,
|
|
389
|
+
},
|
|
390
|
+
query: {
|
|
391
|
+
cursor,
|
|
392
|
+
limit: pageSize,
|
|
393
|
+
evaluation_name: evaluationNamesList,
|
|
394
|
+
},
|
|
395
|
+
},
|
|
396
|
+
}
|
|
398
397
|
);
|
|
398
|
+
} catch (error: unknown) {
|
|
399
|
+
// Check for version compatibility issues and throw helpful error
|
|
400
|
+
try {
|
|
401
|
+
await handleEvaluationFetchError(
|
|
402
|
+
error,
|
|
403
|
+
client,
|
|
404
|
+
"resume_evaluation"
|
|
405
|
+
);
|
|
406
|
+
// TypeScript: handleEvaluationFetchError never returns, but add throw for safety
|
|
407
|
+
throw new Error("handleEvaluationFetchError should never return");
|
|
408
|
+
} catch (handledError) {
|
|
409
|
+
// Wrap the error (from handleEvaluationFetchError or original) in semantic error type
|
|
410
|
+
throw new EvaluationFetchError(
|
|
411
|
+
"Failed to fetch incomplete evaluations from server",
|
|
412
|
+
handledError instanceof Error ? handledError : undefined
|
|
413
|
+
);
|
|
414
|
+
}
|
|
399
415
|
}
|
|
400
|
-
}
|
|
401
416
|
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
);
|
|
407
|
-
}
|
|
408
|
-
|
|
409
|
-
cursor = res.data?.next_cursor ?? null;
|
|
410
|
-
const batchIncomplete = res.data?.data;
|
|
411
|
-
invariant(batchIncomplete, "Failed to fetch incomplete evaluations");
|
|
412
|
-
|
|
413
|
-
if (batchIncomplete.length === 0) {
|
|
414
|
-
if (totalProcessed === 0) {
|
|
415
|
-
logger.info(
|
|
416
|
-
`${PROGRESS_PREFIX.completed}No incomplete evaluations found.`
|
|
417
|
+
// Check for API errors
|
|
418
|
+
if (res.error) {
|
|
419
|
+
throw new EvaluationFetchError(
|
|
420
|
+
`Failed to fetch incomplete evaluations: ${ensureString(res.error)}`
|
|
417
421
|
);
|
|
418
422
|
}
|
|
419
|
-
break;
|
|
420
|
-
}
|
|
421
423
|
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
424
|
+
cursor = res.data?.next_cursor ?? null;
|
|
425
|
+
const batchIncomplete = res.data?.data;
|
|
426
|
+
invariant(batchIncomplete, "Failed to fetch incomplete evaluations");
|
|
425
427
|
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
428
|
+
if (batchIncomplete.length === 0) {
|
|
429
|
+
if (totalProcessed === 0) {
|
|
430
|
+
logger.info(
|
|
431
|
+
`${PROGRESS_PREFIX.completed}No incomplete evaluations found.`
|
|
432
|
+
);
|
|
433
|
+
}
|
|
431
434
|
break;
|
|
432
435
|
}
|
|
433
436
|
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
shouldRunEvaluator(evaluator, incompleteEval)
|
|
438
|
-
);
|
|
437
|
+
if (totalProcessed === 0) {
|
|
438
|
+
logger.info(`${PROGRESS_PREFIX.start}Resuming evaluations.`);
|
|
439
|
+
}
|
|
439
440
|
|
|
440
|
-
//
|
|
441
|
-
|
|
441
|
+
// Build evaluation tasks and send to channel
|
|
442
|
+
let batchCount = 0;
|
|
443
|
+
for (const incomplete of batchIncomplete) {
|
|
442
444
|
// Stop sending items if abort signal received
|
|
443
445
|
if (signal.aborted) {
|
|
444
446
|
break;
|
|
445
447
|
}
|
|
446
448
|
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
449
|
+
const incompleteEval = buildIncompleteEvaluation(incomplete);
|
|
450
|
+
|
|
451
|
+
const evaluatorsToRun = evaluators.filter((evaluator) =>
|
|
452
|
+
shouldRunEvaluator(evaluator, incompleteEval)
|
|
453
|
+
);
|
|
454
|
+
|
|
455
|
+
// Flatten: Send one channel item per evaluator
|
|
456
|
+
for (const evaluator of evaluatorsToRun) {
|
|
457
|
+
// Stop sending items if abort signal received
|
|
458
|
+
if (signal.aborted) {
|
|
459
|
+
break;
|
|
460
|
+
}
|
|
461
|
+
|
|
462
|
+
await evalChannel.send({ incompleteEval, evaluator });
|
|
463
|
+
batchCount++;
|
|
464
|
+
totalProcessed++;
|
|
465
|
+
}
|
|
450
466
|
}
|
|
451
|
-
}
|
|
452
467
|
|
|
453
|
-
|
|
454
|
-
|
|
468
|
+
logger.debug(
|
|
469
|
+
`${PROGRESS_PREFIX.progress}Fetched batch of ${batchCount} evaluation tasks.`
|
|
470
|
+
);
|
|
471
|
+
} while (cursor !== null && !signal.aborted);
|
|
472
|
+
} catch (error) {
|
|
473
|
+
// Re-throw with context preservation
|
|
474
|
+
if (error instanceof EvaluationFetchError) {
|
|
475
|
+
throw error;
|
|
476
|
+
}
|
|
477
|
+
// ChannelError from blocked send() should bubble up naturally
|
|
478
|
+
// (happens when channel closes while producer is blocked)
|
|
479
|
+
if (error instanceof ChannelError) {
|
|
480
|
+
throw error;
|
|
481
|
+
}
|
|
482
|
+
// Wrap any unexpected errors from channel operations
|
|
483
|
+
throw new EvaluationFetchError(
|
|
484
|
+
"Unexpected error during evaluation fetch",
|
|
485
|
+
error instanceof Error ? error : undefined
|
|
455
486
|
);
|
|
456
|
-
}
|
|
457
|
-
|
|
458
|
-
// Re-throw with context preservation
|
|
459
|
-
if (error instanceof EvaluationFetchError) {
|
|
460
|
-
throw error;
|
|
461
|
-
}
|
|
462
|
-
// ChannelError from blocked send() should bubble up naturally
|
|
463
|
-
// (happens when channel closes while producer is blocked)
|
|
464
|
-
if (error instanceof ChannelError) {
|
|
465
|
-
throw error;
|
|
487
|
+
} finally {
|
|
488
|
+
evalChannel.close(); // Signal workers we're done
|
|
466
489
|
}
|
|
467
|
-
// Wrap any unexpected errors from channel operations
|
|
468
|
-
throw new EvaluationFetchError(
|
|
469
|
-
"Unexpected error during evaluation fetch",
|
|
470
|
-
error instanceof Error ? error : undefined
|
|
471
|
-
);
|
|
472
|
-
} finally {
|
|
473
|
-
evalChannel.close(); // Signal workers we're done
|
|
474
490
|
}
|
|
475
|
-
}
|
|
476
491
|
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
492
|
+
// Worker: Process evaluations from channel
|
|
493
|
+
async function processEvaluationsFromChannel(): Promise<void> {
|
|
494
|
+
for await (const item of evalChannel) {
|
|
495
|
+
// Stop processing if abort signal received
|
|
496
|
+
if (signal.aborted) {
|
|
497
|
+
break;
|
|
498
|
+
}
|
|
484
499
|
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
+
try {
|
|
501
|
+
await runSingleEvaluation({
|
|
502
|
+
client,
|
|
503
|
+
experimentId,
|
|
504
|
+
evaluator: item.evaluator,
|
|
505
|
+
experimentRun: item.incompleteEval.experimentRun,
|
|
506
|
+
datasetExample: item.incompleteEval.datasetExample,
|
|
507
|
+
tracer: evalTracer,
|
|
508
|
+
});
|
|
509
|
+
totalCompleted++;
|
|
510
|
+
} catch (error) {
|
|
511
|
+
totalFailed++;
|
|
512
|
+
logger.error(
|
|
513
|
+
`Failed to run evaluator "${item.evaluator.name}" for run ${item.incompleteEval.experimentRun.id}: ${error}`
|
|
514
|
+
);
|
|
500
515
|
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
516
|
+
// If stopOnFirstError is enabled, abort and re-throw
|
|
517
|
+
if (stopOnFirstError) {
|
|
518
|
+
logger.warn("Stopping on first error");
|
|
519
|
+
abortController.abort();
|
|
520
|
+
throw error;
|
|
521
|
+
}
|
|
506
522
|
}
|
|
507
523
|
}
|
|
508
524
|
}
|
|
509
|
-
}
|
|
510
|
-
|
|
511
|
-
// Start concurrent execution
|
|
512
|
-
// Wrap in try-finally to ensure channel is always closed, even if Promise.all throws
|
|
513
|
-
let executionError: Error | null = null;
|
|
514
|
-
try {
|
|
515
|
-
const producerTask = fetchIncompleteEvaluations();
|
|
516
|
-
const workerTasks = Array.from({ length: concurrency }, () =>
|
|
517
|
-
processEvaluationsFromChannel()
|
|
518
|
-
);
|
|
519
525
|
|
|
520
|
-
//
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
if (error instanceof EvaluationFetchError) {
|
|
528
|
-
// Producer failed - this is ALWAYS critical regardless of stopOnFirstError
|
|
529
|
-
logger.error(`Critical: Failed to fetch evaluations from server`);
|
|
530
|
-
executionError = err;
|
|
531
|
-
} else if (error instanceof ChannelError && signal.aborted) {
|
|
532
|
-
// Channel closed due to intentional abort - wrap in semantic error
|
|
533
|
-
executionError = new EvaluationAbortedError(
|
|
534
|
-
"Evaluation stopped due to error in concurrent evaluator",
|
|
535
|
-
err
|
|
526
|
+
// Start concurrent execution
|
|
527
|
+
// Wrap in try-finally to ensure channel is always closed, even if Promise.all throws
|
|
528
|
+
let executionError: Error | null = null;
|
|
529
|
+
try {
|
|
530
|
+
const producerTask = fetchIncompleteEvaluations();
|
|
531
|
+
const workerTasks = Array.from({ length: concurrency }, () =>
|
|
532
|
+
processEvaluationsFromChannel()
|
|
536
533
|
);
|
|
537
|
-
} else if (stopOnFirstError) {
|
|
538
|
-
// Worker error in stopOnFirstError mode - already logged by worker
|
|
539
|
-
executionError = err;
|
|
540
|
-
} else {
|
|
541
|
-
// Unexpected error (not from worker, not from producer fetch)
|
|
542
|
-
// This could be a bug in our code or infrastructure failure
|
|
543
|
-
logger.error(`Unexpected error during evaluation: ${err.message}`);
|
|
544
|
-
executionError = err;
|
|
545
|
-
}
|
|
546
|
-
} finally {
|
|
547
|
-
// Ensure channel is closed even if there are unexpected errors
|
|
548
|
-
// This is a safety net in case producer's finally block didn't execute
|
|
549
|
-
if (!evalChannel.isClosed) {
|
|
550
|
-
evalChannel.close();
|
|
551
|
-
}
|
|
552
|
-
}
|
|
553
534
|
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
535
|
+
// Wait for producer and all workers to finish
|
|
536
|
+
await Promise.all([producerTask, ...workerTasks]);
|
|
537
|
+
} catch (error) {
|
|
538
|
+
// Classify and handle errors based on their nature
|
|
539
|
+
const err = error instanceof Error ? error : new Error(String(error));
|
|
558
540
|
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
541
|
+
// Always surface producer/infrastructure errors
|
|
542
|
+
if (error instanceof EvaluationFetchError) {
|
|
543
|
+
// Producer failed - this is ALWAYS critical regardless of stopOnFirstError
|
|
544
|
+
logger.error(`Critical: Failed to fetch evaluations from server`);
|
|
545
|
+
executionError = err;
|
|
546
|
+
} else if (error instanceof ChannelError && signal.aborted) {
|
|
547
|
+
// Channel closed due to intentional abort - wrap in semantic error
|
|
548
|
+
executionError = new EvaluationAbortedError(
|
|
549
|
+
"Evaluation stopped due to error in concurrent evaluator",
|
|
550
|
+
err
|
|
551
|
+
);
|
|
552
|
+
} else if (stopOnFirstError) {
|
|
553
|
+
// Worker error in stopOnFirstError mode - already logged by worker
|
|
554
|
+
executionError = err;
|
|
555
|
+
} else {
|
|
556
|
+
// Unexpected error (not from worker, not from producer fetch)
|
|
557
|
+
// This could be a bug in our code or infrastructure failure
|
|
558
|
+
logger.error(`Unexpected error during evaluation: ${err.message}`);
|
|
559
|
+
executionError = err;
|
|
560
|
+
}
|
|
561
|
+
} finally {
|
|
562
|
+
// Ensure channel is closed even if there are unexpected errors
|
|
563
|
+
// This is a safety net in case producer's finally block didn't execute
|
|
564
|
+
if (!evalChannel.isClosed) {
|
|
565
|
+
evalChannel.close();
|
|
566
|
+
}
|
|
567
|
+
}
|
|
562
568
|
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
failed: totalFailed,
|
|
568
|
-
});
|
|
569
|
+
// Only show completion message if we didn't stop on error
|
|
570
|
+
if (!executionError) {
|
|
571
|
+
logger.info(`${PROGRESS_PREFIX.completed}Evaluations completed.`);
|
|
572
|
+
}
|
|
569
573
|
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
|
|
574
|
+
if (totalFailed > 0 && !executionError) {
|
|
575
|
+
logger.warn(
|
|
576
|
+
`${totalFailed} out of ${totalProcessed} evaluations failed.`
|
|
577
|
+
);
|
|
578
|
+
}
|
|
579
|
+
|
|
580
|
+
logEvalResumeSummary(logger, {
|
|
581
|
+
experimentId: experiment.id,
|
|
582
|
+
processed: totalProcessed,
|
|
583
|
+
completed: totalCompleted,
|
|
584
|
+
failed: totalFailed,
|
|
585
|
+
});
|
|
574
586
|
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
587
|
+
// Re-throw error if evaluation failed
|
|
588
|
+
if (executionError) {
|
|
589
|
+
throw executionError;
|
|
590
|
+
}
|
|
591
|
+
} finally {
|
|
592
|
+
await cleanupOwnedTracerProvider({
|
|
593
|
+
provider,
|
|
594
|
+
globalRegistration,
|
|
595
|
+
});
|
|
578
596
|
}
|
|
579
597
|
}
|
|
580
598
|
|
|
@@ -674,6 +692,7 @@ async function runSingleEvaluation({
|
|
|
674
692
|
output: taskOutput,
|
|
675
693
|
expected: expectedOutput,
|
|
676
694
|
metadata: datasetExample.metadata,
|
|
695
|
+
traceId: experimentRun.traceId,
|
|
677
696
|
})
|
|
678
697
|
);
|
|
679
698
|
results = Array.isArray(result) ? result : [result];
|
|
@@ -728,6 +747,7 @@ async function runSingleEvaluation({
|
|
|
728
747
|
output: taskOutput,
|
|
729
748
|
expected: expectedOutput,
|
|
730
749
|
metadata: datasetExample.metadata,
|
|
750
|
+
traceId: experimentRun.traceId,
|
|
731
751
|
})
|
|
732
752
|
);
|
|
733
753
|
|