@m4trix/evals 0.12.0 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +599 -224
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +600 -225
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +214 -105
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +215 -106
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +217 -104
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +23 -5
- package/dist/index.js +218 -105
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli-simple.cjs
CHANGED
|
@@ -56,7 +56,8 @@ var defaultRunnerConfig = {
|
|
|
56
56
|
],
|
|
57
57
|
excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
|
|
58
58
|
},
|
|
59
|
-
artifactDirectory: ".eval-results"
|
|
59
|
+
artifactDirectory: ".eval-results",
|
|
60
|
+
maxConcurrency: 1
|
|
60
61
|
};
|
|
61
62
|
function toRunnerConfigOverrides(config) {
|
|
62
63
|
if (!config) {
|
|
@@ -89,6 +90,9 @@ function toRunnerConfigOverrides(config) {
|
|
|
89
90
|
if (config.artifactDirectory !== void 0) {
|
|
90
91
|
overrides.artifactDirectory = config.artifactDirectory;
|
|
91
92
|
}
|
|
93
|
+
if (config.maxConcurrency !== void 0) {
|
|
94
|
+
overrides.maxConcurrency = config.maxConcurrency;
|
|
95
|
+
}
|
|
92
96
|
if (Object.keys(discovery).length > 0) {
|
|
93
97
|
overrides.discovery = discovery;
|
|
94
98
|
}
|
|
@@ -313,6 +317,7 @@ var Metric = {
|
|
|
313
317
|
const def = {
|
|
314
318
|
id: config.id,
|
|
315
319
|
name: config.name,
|
|
320
|
+
aggregate: config.aggregate,
|
|
316
321
|
format: config.format,
|
|
317
322
|
make: (data) => ({ id: config.id, data })
|
|
318
323
|
};
|
|
@@ -332,6 +337,7 @@ var Score = {
|
|
|
332
337
|
id: config.id,
|
|
333
338
|
name: config.name,
|
|
334
339
|
displayStrategy: config.displayStrategy,
|
|
340
|
+
aggregate: config.aggregate,
|
|
335
341
|
format: config.format,
|
|
336
342
|
make: (data, options) => {
|
|
337
343
|
const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
|
|
@@ -350,23 +356,62 @@ function getScoreById(id) {
|
|
|
350
356
|
return registry2.get(id);
|
|
351
357
|
}
|
|
352
358
|
|
|
359
|
+
// src/evals/aggregators.ts
|
|
360
|
+
function aggregateAverage(values) {
|
|
361
|
+
if (values.length === 0) {
|
|
362
|
+
return { value: 0 };
|
|
363
|
+
}
|
|
364
|
+
const sum = values.reduce((s, v) => s + v.value, 0);
|
|
365
|
+
return { value: sum / values.length };
|
|
366
|
+
}
|
|
367
|
+
function aggregateAll(values) {
|
|
368
|
+
return { passed: values.length > 0 && values.every((v) => v.passed) };
|
|
369
|
+
}
|
|
370
|
+
function aggregateTokenCountSum(values) {
|
|
371
|
+
const initial = {
|
|
372
|
+
input: 0,
|
|
373
|
+
output: 0,
|
|
374
|
+
inputCached: 0,
|
|
375
|
+
outputCached: 0
|
|
376
|
+
};
|
|
377
|
+
return values.reduce(
|
|
378
|
+
(acc, v) => ({
|
|
379
|
+
input: acc.input + (v.input ?? 0),
|
|
380
|
+
output: acc.output + (v.output ?? 0),
|
|
381
|
+
inputCached: acc.inputCached + (v.inputCached ?? 0),
|
|
382
|
+
outputCached: acc.outputCached + (v.outputCached ?? 0)
|
|
383
|
+
}),
|
|
384
|
+
initial
|
|
385
|
+
);
|
|
386
|
+
}
|
|
387
|
+
function aggregateLatencyAverage(values) {
|
|
388
|
+
if (values.length === 0) {
|
|
389
|
+
return { ms: 0 };
|
|
390
|
+
}
|
|
391
|
+
const sum = values.reduce((s, v) => s + v.ms, 0);
|
|
392
|
+
return { ms: sum / values.length };
|
|
393
|
+
}
|
|
394
|
+
|
|
353
395
|
// src/evals/metrics/standard.ts
|
|
354
396
|
Metric.of({
|
|
355
397
|
id: "token-count",
|
|
356
398
|
name: "Tokens",
|
|
357
|
-
|
|
399
|
+
aggregate: aggregateTokenCountSum,
|
|
400
|
+
format: (data, options) => {
|
|
358
401
|
const input = data.input ?? 0;
|
|
359
402
|
const output = data.output ?? 0;
|
|
360
403
|
const inputCached = data.inputCached ?? 0;
|
|
361
404
|
const outputCached = data.outputCached ?? 0;
|
|
362
405
|
const cached = inputCached + outputCached;
|
|
363
|
-
|
|
406
|
+
const base = `in:${input} out:${output} cached:${cached}`;
|
|
407
|
+
return options?.isAggregated ? `Total: ${base}` : base;
|
|
364
408
|
}
|
|
365
409
|
});
|
|
366
410
|
Metric.of({
|
|
367
411
|
id: "latency",
|
|
368
412
|
name: "Latency",
|
|
369
|
-
|
|
413
|
+
aggregate: aggregateLatencyAverage,
|
|
414
|
+
format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
|
|
370
415
|
});
|
|
371
416
|
|
|
372
417
|
// src/evals/scores/standard.ts
|
|
@@ -374,16 +419,36 @@ Score.of({
|
|
|
374
419
|
id: "percent",
|
|
375
420
|
name: "Score",
|
|
376
421
|
displayStrategy: "bar",
|
|
377
|
-
format: (data) => data.value.toFixed(2)
|
|
422
|
+
format: (data, options) => options?.isAggregated ? `Avg: ${data.value.toFixed(2)}` : data.value.toFixed(2),
|
|
423
|
+
aggregate: aggregateAverage
|
|
378
424
|
});
|
|
379
425
|
Score.of({
|
|
380
426
|
id: "binary",
|
|
381
427
|
name: "Result",
|
|
382
428
|
displayStrategy: "passFail",
|
|
383
|
-
format: (data) => data.passed ? "PASSED" : "NOT PASSED"
|
|
429
|
+
format: (data, options) => options?.isAggregated ? data.passed ? "All: PASSED" : "Some: FAILED" : data.passed ? "PASSED" : "NOT PASSED",
|
|
430
|
+
aggregate: aggregateAll
|
|
384
431
|
});
|
|
385
432
|
|
|
386
433
|
// src/runner/score-utils.ts
|
|
434
|
+
function aggregateScoreItems(items) {
|
|
435
|
+
if (items.length === 0)
|
|
436
|
+
return void 0;
|
|
437
|
+
const def = getScoreById(items[0].id);
|
|
438
|
+
if (!def?.aggregate)
|
|
439
|
+
return items[items.length - 1];
|
|
440
|
+
const aggregated = def.aggregate(items.map((i) => i.data));
|
|
441
|
+
return { ...items[0], data: aggregated };
|
|
442
|
+
}
|
|
443
|
+
function aggregateMetricItems(items) {
|
|
444
|
+
if (items.length === 0)
|
|
445
|
+
return void 0;
|
|
446
|
+
const def = getMetricById(items[0].id);
|
|
447
|
+
if (!def?.aggregate)
|
|
448
|
+
return items[items.length - 1];
|
|
449
|
+
const aggregated = def.aggregate(items.map((i) => i.data));
|
|
450
|
+
return { ...items[0], data: aggregated };
|
|
451
|
+
}
|
|
387
452
|
function toNumericScoreFromScores(scores) {
|
|
388
453
|
for (const item of scores) {
|
|
389
454
|
const def = getScoreById(item.id);
|
|
@@ -462,6 +527,105 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
|
|
|
462
527
|
`${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
|
|
463
528
|
);
|
|
464
529
|
}
|
|
530
|
+
function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, completedRef, passedRef, failedRef) {
|
|
531
|
+
return effect.Effect.gen(function* () {
|
|
532
|
+
const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
|
|
533
|
+
const rerunPassed = [];
|
|
534
|
+
for (let r = 0; r < reruns; r++) {
|
|
535
|
+
const started = Date.now();
|
|
536
|
+
const evaluatorScores = [];
|
|
537
|
+
let testCaseError;
|
|
538
|
+
const output = readOutput(testCaseItem.testCase);
|
|
539
|
+
for (const { id: evaluatorId, evaluator } of task.evaluators) {
|
|
540
|
+
const evaluateFn = evaluator.getEvaluateFn();
|
|
541
|
+
if (!evaluateFn) {
|
|
542
|
+
continue;
|
|
543
|
+
}
|
|
544
|
+
try {
|
|
545
|
+
const logs = [];
|
|
546
|
+
const logDiff = (expected, actual, options) => {
|
|
547
|
+
logs.push(createDiffLogEntry(expected, actual, options));
|
|
548
|
+
};
|
|
549
|
+
const ctx = yield* effect.Effect.promise(
|
|
550
|
+
() => Promise.resolve(evaluator.resolveContext())
|
|
551
|
+
);
|
|
552
|
+
const result = yield* effect.Effect.promise(
|
|
553
|
+
() => Promise.resolve(
|
|
554
|
+
evaluateFn({
|
|
555
|
+
input: testCaseItem.testCase.getInput(),
|
|
556
|
+
ctx,
|
|
557
|
+
output,
|
|
558
|
+
logDiff
|
|
559
|
+
})
|
|
560
|
+
)
|
|
561
|
+
);
|
|
562
|
+
const { scores, metrics } = normalizeResult(result);
|
|
563
|
+
const passed2 = computeEvaluatorPassed(evaluator, result, scores);
|
|
564
|
+
evaluatorScores.push({
|
|
565
|
+
evaluatorId,
|
|
566
|
+
scores,
|
|
567
|
+
passed: passed2,
|
|
568
|
+
metrics,
|
|
569
|
+
logs: logs.length > 0 ? logs : void 0
|
|
570
|
+
});
|
|
571
|
+
} catch (error) {
|
|
572
|
+
testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
|
|
573
|
+
evaluatorScores.push({
|
|
574
|
+
evaluatorId,
|
|
575
|
+
scores: [],
|
|
576
|
+
passed: false
|
|
577
|
+
});
|
|
578
|
+
}
|
|
579
|
+
}
|
|
580
|
+
const rerunPassedThis = evaluatorScores.every((s) => s.passed);
|
|
581
|
+
rerunPassed.push(rerunPassedThis);
|
|
582
|
+
const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [
|
|
583
|
+
n + 1,
|
|
584
|
+
n + 1
|
|
585
|
+
]);
|
|
586
|
+
const progressEvent = {
|
|
587
|
+
type: "TestCaseProgress",
|
|
588
|
+
runId: task.runId,
|
|
589
|
+
testCaseId: testCaseItem.id,
|
|
590
|
+
testCaseName: testCaseItem.testCase.getName(),
|
|
591
|
+
completedTestCases: completedEvaluations,
|
|
592
|
+
totalTestCases: totalEvaluations,
|
|
593
|
+
rerunIndex: r + 1,
|
|
594
|
+
rerunTotal: reruns,
|
|
595
|
+
passed: rerunPassedThis,
|
|
596
|
+
durationMs: Date.now() - started,
|
|
597
|
+
evaluatorScores,
|
|
598
|
+
output,
|
|
599
|
+
errorMessage: testCaseError
|
|
600
|
+
};
|
|
601
|
+
updateSnapshot(task.runId, (snapshot) => ({
|
|
602
|
+
...snapshot,
|
|
603
|
+
completedTestCases: completedEvaluations
|
|
604
|
+
}));
|
|
605
|
+
yield* publishEvent(progressEvent);
|
|
606
|
+
yield* effect.Queue.offer(persistenceQueue, {
|
|
607
|
+
runId: task.runId,
|
|
608
|
+
artifactPath: task.snapshot.artifactPath,
|
|
609
|
+
payload: progressEvent
|
|
610
|
+
});
|
|
611
|
+
}
|
|
612
|
+
const testCasePassed = rerunPassed.every(Boolean);
|
|
613
|
+
if (testCasePassed) {
|
|
614
|
+
yield* effect.Ref.update(passedRef, (n) => n + 1);
|
|
615
|
+
} else {
|
|
616
|
+
yield* effect.Ref.update(failedRef, (n) => n + 1);
|
|
617
|
+
}
|
|
618
|
+
const [passed, failed] = yield* effect.Effect.all([
|
|
619
|
+
effect.Ref.get(passedRef),
|
|
620
|
+
effect.Ref.get(failedRef)
|
|
621
|
+
]);
|
|
622
|
+
updateSnapshot(task.runId, (snapshot) => ({
|
|
623
|
+
...snapshot,
|
|
624
|
+
passedTestCases: passed,
|
|
625
|
+
failedTestCases: failed
|
|
626
|
+
}));
|
|
627
|
+
});
|
|
628
|
+
}
|
|
465
629
|
var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => effect.Effect.gen(function* () {
|
|
466
630
|
const startedAt = Date.now();
|
|
467
631
|
updateSnapshot(task.runId, (snapshot) => ({
|
|
@@ -474,104 +638,51 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
474
638
|
runId: task.runId,
|
|
475
639
|
startedAt
|
|
476
640
|
});
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
)
|
|
507
|
-
);
|
|
508
|
-
const { scores, metrics } = normalizeResult(result);
|
|
509
|
-
const passed = computeEvaluatorPassed(evaluator, result, scores);
|
|
510
|
-
evaluatorScores.push({
|
|
511
|
-
evaluatorId,
|
|
512
|
-
scores,
|
|
513
|
-
passed,
|
|
514
|
-
metrics,
|
|
515
|
-
logs: logs.length > 0 ? logs : void 0
|
|
516
|
-
});
|
|
517
|
-
} catch (error) {
|
|
518
|
-
testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
|
|
519
|
-
evaluatorScores.push({
|
|
520
|
-
evaluatorId,
|
|
521
|
-
scores: [],
|
|
522
|
-
passed: false
|
|
523
|
-
});
|
|
524
|
-
}
|
|
525
|
-
}
|
|
526
|
-
const testCasePassed = evaluatorScores.every((s) => s.passed);
|
|
527
|
-
completedTestCases += 1;
|
|
528
|
-
if (testCasePassed) {
|
|
529
|
-
passedTestCases += 1;
|
|
530
|
-
} else {
|
|
531
|
-
failedTestCases += 1;
|
|
532
|
-
}
|
|
533
|
-
const progressEvent = {
|
|
534
|
-
type: "TestCaseProgress",
|
|
535
|
-
runId: task.runId,
|
|
536
|
-
testCaseId: testCaseItem.id,
|
|
537
|
-
testCaseName: testCaseItem.testCase.getName(),
|
|
538
|
-
completedTestCases,
|
|
539
|
-
totalTestCases: task.testCases.length,
|
|
540
|
-
passed: testCasePassed,
|
|
541
|
-
durationMs: Date.now() - started,
|
|
542
|
-
evaluatorScores,
|
|
543
|
-
output,
|
|
544
|
-
errorMessage: testCaseError
|
|
545
|
-
};
|
|
546
|
-
updateSnapshot(task.runId, (snapshot) => ({
|
|
547
|
-
...snapshot,
|
|
548
|
-
completedTestCases,
|
|
549
|
-
passedTestCases,
|
|
550
|
-
failedTestCases
|
|
551
|
-
}));
|
|
552
|
-
yield* publishEvent(progressEvent);
|
|
553
|
-
yield* effect.Queue.offer(persistenceQueue, {
|
|
554
|
-
runId: task.runId,
|
|
555
|
-
artifactPath: task.snapshot.artifactPath,
|
|
556
|
-
payload: progressEvent
|
|
557
|
-
});
|
|
558
|
-
}
|
|
641
|
+
const totalEvaluations = task.testCases.reduce(
|
|
642
|
+
(sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
|
|
643
|
+
0
|
|
644
|
+
);
|
|
645
|
+
const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
|
|
646
|
+
const completedRef = yield* effect.Ref.make(0);
|
|
647
|
+
const passedRef = yield* effect.Ref.make(0);
|
|
648
|
+
const failedRef = yield* effect.Ref.make(0);
|
|
649
|
+
const processTestCase = (testCaseItem) => processOneTestCase(
|
|
650
|
+
task,
|
|
651
|
+
testCaseItem,
|
|
652
|
+
totalEvaluations,
|
|
653
|
+
publishEvent,
|
|
654
|
+
persistenceQueue,
|
|
655
|
+
updateSnapshot,
|
|
656
|
+
completedRef,
|
|
657
|
+
passedRef,
|
|
658
|
+
failedRef
|
|
659
|
+
);
|
|
660
|
+
yield* effect.Effect.forEach(
|
|
661
|
+
task.testCases,
|
|
662
|
+
processTestCase,
|
|
663
|
+
maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
|
|
664
|
+
);
|
|
665
|
+
const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* effect.Effect.all([
|
|
666
|
+
effect.Ref.get(completedRef),
|
|
667
|
+
effect.Ref.get(passedRef),
|
|
668
|
+
effect.Ref.get(failedRef)
|
|
669
|
+
]);
|
|
559
670
|
const finishedAt = Date.now();
|
|
560
671
|
const completedEvent = {
|
|
561
672
|
type: "RunCompleted",
|
|
562
673
|
runId: task.runId,
|
|
563
674
|
finishedAt,
|
|
564
|
-
passedTestCases,
|
|
565
|
-
failedTestCases,
|
|
675
|
+
passedTestCases: passedUniqueTestCases,
|
|
676
|
+
failedTestCases: failedUniqueTestCases,
|
|
566
677
|
totalTestCases: task.testCases.length,
|
|
567
678
|
artifactPath: task.snapshot.artifactPath
|
|
568
679
|
};
|
|
569
680
|
updateSnapshot(task.runId, (snapshot) => ({
|
|
570
681
|
...snapshot,
|
|
571
682
|
status: "completed",
|
|
572
|
-
completedTestCases,
|
|
573
|
-
passedTestCases,
|
|
574
|
-
failedTestCases,
|
|
683
|
+
completedTestCases: completedEvaluations,
|
|
684
|
+
passedTestCases: passedUniqueTestCases,
|
|
685
|
+
failedTestCases: failedUniqueTestCases,
|
|
575
686
|
finishedAt
|
|
576
687
|
}));
|
|
577
688
|
yield* publishEvent(completedEvent);
|
|
@@ -659,7 +770,7 @@ async function parseArtifactToSnapshot(filePath, _config) {
|
|
|
659
770
|
const artifactPath = filePath;
|
|
660
771
|
const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
|
|
661
772
|
const progress = aggregateTestCaseProgress(lines);
|
|
662
|
-
const completedTestCases = runCompleted
|
|
773
|
+
const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
|
|
663
774
|
const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
|
|
664
775
|
const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
|
|
665
776
|
return {
|
|
@@ -681,23 +792,29 @@ async function parseArtifactToSnapshot(filePath, _config) {
|
|
|
681
792
|
}
|
|
682
793
|
function aggregateTestCaseProgress(lines) {
|
|
683
794
|
let completedTestCases = 0;
|
|
684
|
-
|
|
685
|
-
let failedTestCases = 0;
|
|
795
|
+
const testCasePassedBy = /* @__PURE__ */ new Map();
|
|
686
796
|
for (const line of lines) {
|
|
687
797
|
try {
|
|
688
798
|
const event = JSON.parse(line);
|
|
689
799
|
if (event.type === "TestCaseProgress") {
|
|
690
800
|
const ev = event;
|
|
691
801
|
completedTestCases = ev.completedTestCases ?? completedTestCases;
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
failedTestCases += 1;
|
|
696
|
-
}
|
|
802
|
+
const id = ev.testCaseId;
|
|
803
|
+
const current = testCasePassedBy.get(id);
|
|
804
|
+
testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
|
|
697
805
|
}
|
|
698
806
|
} catch {
|
|
699
807
|
}
|
|
700
808
|
}
|
|
809
|
+
let passedTestCases = 0;
|
|
810
|
+
let failedTestCases = 0;
|
|
811
|
+
for (const passed of testCasePassedBy.values()) {
|
|
812
|
+
if (passed) {
|
|
813
|
+
passedTestCases += 1;
|
|
814
|
+
} else {
|
|
815
|
+
failedTestCases += 1;
|
|
816
|
+
}
|
|
817
|
+
}
|
|
701
818
|
return { completedTestCases, passedTestCases, failedTestCases };
|
|
702
819
|
}
|
|
703
820
|
async function appendJsonLine(artifactPath, payload) {
|
|
@@ -892,6 +1009,10 @@ var EffectRunner = class {
|
|
|
892
1009
|
throw new Error("No evaluators selected for run");
|
|
893
1010
|
}
|
|
894
1011
|
const selectedTestCases = await this.collectDatasetTestCases(request.datasetId);
|
|
1012
|
+
const totalEvaluations = selectedTestCases.reduce(
|
|
1013
|
+
(sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
|
|
1014
|
+
0
|
|
1015
|
+
);
|
|
895
1016
|
const runId = `run-${crypto.randomUUID()}`;
|
|
896
1017
|
const artifactPath = createArtifactPath(
|
|
897
1018
|
this.config.artifactDirectory,
|
|
@@ -904,7 +1025,7 @@ var EffectRunner = class {
|
|
|
904
1025
|
datasetName: dataset.dataset.getName(),
|
|
905
1026
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
906
1027
|
queuedAt: Date.now(),
|
|
907
|
-
totalTestCases:
|
|
1028
|
+
totalTestCases: totalEvaluations,
|
|
908
1029
|
completedTestCases: 0,
|
|
909
1030
|
passedTestCases: 0,
|
|
910
1031
|
failedTestCases: 0,
|
|
@@ -918,7 +1039,7 @@ var EffectRunner = class {
|
|
|
918
1039
|
datasetId: request.datasetId,
|
|
919
1040
|
datasetName: dataset.dataset.getName(),
|
|
920
1041
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
921
|
-
totalTestCases:
|
|
1042
|
+
totalTestCases: totalEvaluations,
|
|
922
1043
|
artifactPath
|
|
923
1044
|
};
|
|
924
1045
|
await effect.Effect.runPromise(this.publishEvent(queuedEvent));
|
|
@@ -929,6 +1050,7 @@ var EffectRunner = class {
|
|
|
929
1050
|
payload: queuedEvent
|
|
930
1051
|
})
|
|
931
1052
|
);
|
|
1053
|
+
const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
|
|
932
1054
|
await effect.Effect.runPromise(
|
|
933
1055
|
effect.Queue.offer(this.runQueue, {
|
|
934
1056
|
runId,
|
|
@@ -936,7 +1058,8 @@ var EffectRunner = class {
|
|
|
936
1058
|
dataset: dataset.dataset,
|
|
937
1059
|
evaluators: selectedEvaluators,
|
|
938
1060
|
testCases: selectedTestCases,
|
|
939
|
-
snapshot
|
|
1061
|
+
snapshot,
|
|
1062
|
+
maxConcurrency
|
|
940
1063
|
})
|
|
941
1064
|
);
|
|
942
1065
|
return snapshot;
|
|
@@ -1254,13 +1377,62 @@ function createBar(value, max = 100, width = 20) {
|
|
|
1254
1377
|
const filled = Math.round(safe / max * width);
|
|
1255
1378
|
return "\u2588".repeat(filled) + "\u2591".repeat(width - filled);
|
|
1256
1379
|
}
|
|
1257
|
-
function
|
|
1380
|
+
function aggregateEvaluatorScores(events, nameById) {
|
|
1381
|
+
if (events.length === 0)
|
|
1382
|
+
return [];
|
|
1383
|
+
const evaluatorIds = new Set(
|
|
1384
|
+
events.flatMap((e) => e.evaluatorScores.map((x) => x.evaluatorId))
|
|
1385
|
+
);
|
|
1386
|
+
const result = [];
|
|
1387
|
+
for (const evaluatorId of evaluatorIds) {
|
|
1388
|
+
const scoreIdToItems = /* @__PURE__ */ new Map();
|
|
1389
|
+
const metricIdToItems = /* @__PURE__ */ new Map();
|
|
1390
|
+
for (const ev of events) {
|
|
1391
|
+
const es = ev.evaluatorScores.find((x) => x.evaluatorId === evaluatorId);
|
|
1392
|
+
for (const s of es?.scores ?? []) {
|
|
1393
|
+
const list = scoreIdToItems.get(s.id) ?? [];
|
|
1394
|
+
list.push(s);
|
|
1395
|
+
scoreIdToItems.set(s.id, list);
|
|
1396
|
+
}
|
|
1397
|
+
for (const m of es?.metrics ?? []) {
|
|
1398
|
+
const list = metricIdToItems.get(m.id) ?? [];
|
|
1399
|
+
list.push(m);
|
|
1400
|
+
metricIdToItems.set(m.id, list);
|
|
1401
|
+
}
|
|
1402
|
+
}
|
|
1403
|
+
const aggregatedScores = [];
|
|
1404
|
+
for (const items of scoreIdToItems.values()) {
|
|
1405
|
+
const agg = aggregateScoreItems(items);
|
|
1406
|
+
if (agg)
|
|
1407
|
+
aggregatedScores.push(agg);
|
|
1408
|
+
}
|
|
1409
|
+
const aggregatedMetrics = Array.from(metricIdToItems.entries()).map(([id, items]) => aggregateMetricItems(items)).filter((m) => m !== void 0);
|
|
1410
|
+
const passed = events.every((ev) => {
|
|
1411
|
+
const es = ev.evaluatorScores.find((x) => x.evaluatorId === evaluatorId);
|
|
1412
|
+
return es?.passed ?? false;
|
|
1413
|
+
});
|
|
1414
|
+
const lastEvent = events[events.length - 1];
|
|
1415
|
+
const lastEs = lastEvent?.evaluatorScores.find(
|
|
1416
|
+
(x) => x.evaluatorId === evaluatorId
|
|
1417
|
+
);
|
|
1418
|
+
result.push({
|
|
1419
|
+
evaluatorId,
|
|
1420
|
+
evaluatorName: nameById.get(evaluatorId) ?? evaluatorId,
|
|
1421
|
+
scores: aggregatedScores,
|
|
1422
|
+
passed,
|
|
1423
|
+
metrics: aggregatedMetrics.length > 0 ? aggregatedMetrics : void 0,
|
|
1424
|
+
logs: lastEs?.logs
|
|
1425
|
+
});
|
|
1426
|
+
}
|
|
1427
|
+
return result;
|
|
1428
|
+
}
|
|
1429
|
+
function formatScorePart(item, scoreToColor2, options) {
|
|
1258
1430
|
const def = getScoreById(item.id);
|
|
1259
1431
|
if (!def) {
|
|
1260
1432
|
const numeric = toNumericScore(item.data);
|
|
1261
1433
|
return numeric !== void 0 ? `${numeric.toFixed(2)}` : "n/a";
|
|
1262
1434
|
}
|
|
1263
|
-
const formatted = def.format(item.data);
|
|
1435
|
+
const formatted = def.format(item.data, options);
|
|
1264
1436
|
if (def.displayStrategy === "bar") {
|
|
1265
1437
|
const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
|
|
1266
1438
|
if (typeof numeric === "number" && Number.isFinite(numeric)) {
|
|
@@ -1280,6 +1452,7 @@ function RunView({
|
|
|
1280
1452
|
);
|
|
1281
1453
|
const [runInfo, setRunInfo] = React2.useState(null);
|
|
1282
1454
|
const [testCases, setTestCases] = React2.useState([]);
|
|
1455
|
+
const [completedEvaluations, setCompletedEvaluations] = React2.useState(0);
|
|
1283
1456
|
const [summary, setSummary] = React2.useState(null);
|
|
1284
1457
|
const [evaluatorNameById, setEvaluatorNameById] = React2.useState(/* @__PURE__ */ new Map());
|
|
1285
1458
|
const runEval = React2.useCallback(async () => {
|
|
@@ -1306,10 +1479,7 @@ function RunView({
|
|
|
1306
1479
|
return;
|
|
1307
1480
|
}
|
|
1308
1481
|
const nameById = new Map(
|
|
1309
|
-
evaluators.map((item) => [
|
|
1310
|
-
item.id,
|
|
1311
|
-
item.evaluator.getName() ?? item.id
|
|
1312
|
-
])
|
|
1482
|
+
evaluators.map((item) => [item.id, item.evaluator.getName() ?? item.id])
|
|
1313
1483
|
);
|
|
1314
1484
|
setEvaluatorNameById(nameById);
|
|
1315
1485
|
const aggregates = /* @__PURE__ */ new Map();
|
|
@@ -1319,7 +1489,7 @@ function RunView({
|
|
|
1319
1489
|
const unsubscribe = runner.subscribeRunEvents((event) => {
|
|
1320
1490
|
if (event.type === "TestCaseProgress") {
|
|
1321
1491
|
const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
|
|
1322
|
-
|
|
1492
|
+
numericScores.length > 0 ? numericScores.reduce((sum, v) => sum + v, 0) / numericScores.length : void 0;
|
|
1323
1493
|
for (const item of event.evaluatorScores) {
|
|
1324
1494
|
const numeric = toNumericScoreFromScores(item.scores);
|
|
1325
1495
|
if (numeric !== void 0) {
|
|
@@ -1339,15 +1509,10 @@ function RunView({
|
|
|
1339
1509
|
overallScoreCount += 1;
|
|
1340
1510
|
}
|
|
1341
1511
|
}
|
|
1342
|
-
setTestCases((prev) =>
|
|
1343
|
-
|
|
1344
|
-
|
|
1345
|
-
|
|
1346
|
-
completedTestCases: event.completedTestCases,
|
|
1347
|
-
totalTestCases: event.totalTestCases,
|
|
1348
|
-
durationMs: event.durationMs,
|
|
1349
|
-
passed: event.passed,
|
|
1350
|
-
averageScore,
|
|
1512
|
+
setTestCases((prev) => {
|
|
1513
|
+
const byId = new Map(prev.map((tc) => [tc.testCaseId, tc]));
|
|
1514
|
+
const existing = byId.get(event.testCaseId);
|
|
1515
|
+
const newEvent = {
|
|
1351
1516
|
evaluatorScores: event.evaluatorScores.map((item) => ({
|
|
1352
1517
|
evaluatorId: item.evaluatorId,
|
|
1353
1518
|
evaluatorName: nameById.get(item.evaluatorId) ?? item.evaluatorId,
|
|
@@ -1355,9 +1520,33 @@ function RunView({
|
|
|
1355
1520
|
passed: item.passed,
|
|
1356
1521
|
metrics: item.metrics,
|
|
1357
1522
|
logs: item.logs
|
|
1358
|
-
}))
|
|
1359
|
-
|
|
1360
|
-
|
|
1523
|
+
})),
|
|
1524
|
+
passed: event.passed,
|
|
1525
|
+
durationMs: event.durationMs
|
|
1526
|
+
};
|
|
1527
|
+
const events = existing ? [...existing.events, newEvent] : [newEvent];
|
|
1528
|
+
const isAggregated = events.length > 1;
|
|
1529
|
+
const aggregatedEvaluatorScores = aggregateEvaluatorScores(
|
|
1530
|
+
events,
|
|
1531
|
+
nameById
|
|
1532
|
+
);
|
|
1533
|
+
const merged = {
|
|
1534
|
+
name: event.testCaseName,
|
|
1535
|
+
testCaseId: event.testCaseId,
|
|
1536
|
+
completedTestCases: event.completedTestCases,
|
|
1537
|
+
totalTestCases: event.totalTestCases,
|
|
1538
|
+
rerunIndex: event.rerunIndex,
|
|
1539
|
+
rerunTotal: event.rerunTotal,
|
|
1540
|
+
durationMs: events.reduce((s, e) => s + e.durationMs, 0),
|
|
1541
|
+
passed: events.every((e) => e.passed),
|
|
1542
|
+
events,
|
|
1543
|
+
aggregatedEvaluatorScores,
|
|
1544
|
+
isAggregated
|
|
1545
|
+
};
|
|
1546
|
+
byId.set(event.testCaseId, merged);
|
|
1547
|
+
setCompletedEvaluations(event.completedTestCases);
|
|
1548
|
+
return Array.from(byId.values());
|
|
1549
|
+
});
|
|
1361
1550
|
}
|
|
1362
1551
|
if (event.type === "RunCompleted" || event.type === "RunFailed") {
|
|
1363
1552
|
unsubscribe();
|
|
@@ -1372,9 +1561,7 @@ function RunView({
|
|
|
1372
1561
|
setRunInfo({
|
|
1373
1562
|
runId: snapshot.runId,
|
|
1374
1563
|
datasetName: snapshot.datasetName,
|
|
1375
|
-
evaluatorNames: evaluators.map(
|
|
1376
|
-
(e) => e.evaluator.getName() ?? e.id
|
|
1377
|
-
),
|
|
1564
|
+
evaluatorNames: evaluators.map((e) => e.evaluator.getName() ?? e.id),
|
|
1378
1565
|
totalTestCases: snapshot.totalTestCases
|
|
1379
1566
|
});
|
|
1380
1567
|
setPhase("running");
|
|
@@ -1402,29 +1589,41 @@ function RunView({
|
|
|
1402
1589
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginBottom: 1, children: /* @__PURE__ */ jsxRuntime.jsx(Banner, {}) }),
|
|
1403
1590
|
runInfo && /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginBottom: 1, children: [
|
|
1404
1591
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
1405
|
-
/* @__PURE__ */ jsxRuntime.
|
|
1592
|
+
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", bold: true, children: [
|
|
1593
|
+
"Run",
|
|
1594
|
+
" "
|
|
1595
|
+
] }),
|
|
1406
1596
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: runInfo.runId })
|
|
1407
1597
|
] }),
|
|
1408
1598
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
1409
|
-
/* @__PURE__ */ jsxRuntime.
|
|
1599
|
+
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", bold: true, children: [
|
|
1600
|
+
"Dataset",
|
|
1601
|
+
" "
|
|
1602
|
+
] }),
|
|
1410
1603
|
runInfo.datasetName
|
|
1411
1604
|
] }),
|
|
1412
1605
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
1413
|
-
/* @__PURE__ */ jsxRuntime.
|
|
1606
|
+
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", bold: true, children: [
|
|
1607
|
+
"Evaluators",
|
|
1608
|
+
" "
|
|
1609
|
+
] }),
|
|
1414
1610
|
runInfo.evaluatorNames.join(", ")
|
|
1415
1611
|
] }),
|
|
1416
1612
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
1417
|
-
/* @__PURE__ */ jsxRuntime.
|
|
1613
|
+
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", bold: true, children: [
|
|
1614
|
+
"Test cases",
|
|
1615
|
+
" "
|
|
1616
|
+
] }),
|
|
1418
1617
|
runInfo.totalTestCases
|
|
1419
1618
|
] })
|
|
1420
1619
|
] }),
|
|
1421
1620
|
phase === "running" && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginBottom: 1, children: /* @__PURE__ */ jsxRuntime.jsx(
|
|
1422
1621
|
Spinner,
|
|
1423
1622
|
{
|
|
1424
|
-
label: `Evaluations ${
|
|
1623
|
+
label: `Evaluations ${completedEvaluations}/${runInfo?.totalTestCases ?? 0}`
|
|
1425
1624
|
}
|
|
1426
1625
|
) }),
|
|
1427
|
-
testCases.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc
|
|
1626
|
+
testCases.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginBottom: 0, children: [
|
|
1428
1627
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
1429
1628
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", children: [
|
|
1430
1629
|
"[",
|
|
@@ -1435,49 +1634,78 @@ function RunView({
|
|
|
1435
1634
|
] }),
|
|
1436
1635
|
" ",
|
|
1437
1636
|
tc.name,
|
|
1637
|
+
" ",
|
|
1638
|
+
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", children: [
|
|
1639
|
+
"(",
|
|
1640
|
+
tc.rerunIndex,
|
|
1641
|
+
"/",
|
|
1642
|
+
tc.rerunTotal,
|
|
1643
|
+
")"
|
|
1644
|
+
] }),
|
|
1438
1645
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
1439
1646
|
" (",
|
|
1440
1647
|
tc.durationMs,
|
|
1441
1648
|
"ms)"
|
|
1442
1649
|
] })
|
|
1443
1650
|
] }),
|
|
1444
|
-
tc.
|
|
1445
|
-
|
|
1446
|
-
|
|
1447
|
-
|
|
1448
|
-
|
|
1449
|
-
|
|
1450
|
-
|
|
1451
|
-
|
|
1452
|
-
|
|
1453
|
-
|
|
1454
|
-
|
|
1455
|
-
|
|
1456
|
-
|
|
1457
|
-
|
|
1458
|
-
|
|
1459
|
-
|
|
1460
|
-
|
|
1461
|
-
|
|
1462
|
-
|
|
1463
|
-
|
|
1464
|
-
|
|
1465
|
-
|
|
1466
|
-
|
|
1467
|
-
|
|
1468
|
-
|
|
1469
|
-
|
|
1470
|
-
|
|
1471
|
-
|
|
1472
|
-
|
|
1473
|
-
|
|
1474
|
-
|
|
1475
|
-
|
|
1476
|
-
|
|
1477
|
-
|
|
1478
|
-
|
|
1479
|
-
|
|
1480
|
-
|
|
1651
|
+
tc.aggregatedEvaluatorScores.map((item) => /* @__PURE__ */ jsxRuntime.jsxs(
|
|
1652
|
+
ink.Box,
|
|
1653
|
+
{
|
|
1654
|
+
flexDirection: "column",
|
|
1655
|
+
marginLeft: 2,
|
|
1656
|
+
children: [
|
|
1657
|
+
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
1658
|
+
item.evaluatorName,
|
|
1659
|
+
":",
|
|
1660
|
+
" ",
|
|
1661
|
+
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
|
|
1662
|
+
" ",
|
|
1663
|
+
item.scores.map((s) => /* @__PURE__ */ jsxRuntime.jsxs(
|
|
1664
|
+
ink.Text,
|
|
1665
|
+
{
|
|
1666
|
+
color: scoreColor(toNumericScore(s.data) ?? 0),
|
|
1667
|
+
children: [
|
|
1668
|
+
formatScorePart(s, scoreColor, {
|
|
1669
|
+
isAggregated: tc.isAggregated
|
|
1670
|
+
}),
|
|
1671
|
+
" "
|
|
1672
|
+
]
|
|
1673
|
+
},
|
|
1674
|
+
s.id
|
|
1675
|
+
)),
|
|
1676
|
+
item.metrics?.map((m) => {
|
|
1677
|
+
const def = getMetricById(m.id);
|
|
1678
|
+
if (!def)
|
|
1679
|
+
return null;
|
|
1680
|
+
const formatted = def.format(m.data, {
|
|
1681
|
+
isAggregated: tc.isAggregated
|
|
1682
|
+
});
|
|
1683
|
+
return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
1684
|
+
"[",
|
|
1685
|
+
def.name ? `${def.name}: ` : "",
|
|
1686
|
+
formatted,
|
|
1687
|
+
"]",
|
|
1688
|
+
" "
|
|
1689
|
+
] }, m.id);
|
|
1690
|
+
})
|
|
1691
|
+
] }),
|
|
1692
|
+
!item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
|
|
1693
|
+
(log, logIdx) => log.type === "diff" ? /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: getDiffLines(log).map(
|
|
1694
|
+
({ type, line }, lineIdx) => /* @__PURE__ */ jsxRuntime.jsx(
|
|
1695
|
+
ink.Text,
|
|
1696
|
+
{
|
|
1697
|
+
color: type === "remove" ? "red" : type === "add" ? "green" : "gray",
|
|
1698
|
+
children: line
|
|
1699
|
+
},
|
|
1700
|
+
lineIdx
|
|
1701
|
+
)
|
|
1702
|
+
) }, logIdx) : null
|
|
1703
|
+
) })
|
|
1704
|
+
]
|
|
1705
|
+
},
|
|
1706
|
+
item.evaluatorId
|
|
1707
|
+
))
|
|
1708
|
+
] }, tc.testCaseId)) }),
|
|
1481
1709
|
phase === "completed" && summary && /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", children: [
|
|
1482
1710
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children: "Run Summary" }),
|
|
1483
1711
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { marginTop: 1, children: [
|
|
@@ -1524,7 +1752,8 @@ function RunView({
|
|
|
1524
1752
|
name.padEnd(28),
|
|
1525
1753
|
" avg=",
|
|
1526
1754
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: scoreColor(mean), children: mean.toFixed(2) }),
|
|
1527
|
-
"
|
|
1755
|
+
" ",
|
|
1756
|
+
"passed=",
|
|
1528
1757
|
agg.passed,
|
|
1529
1758
|
" failed=",
|
|
1530
1759
|
agg.failed
|
|
@@ -1533,28 +1762,38 @@ function RunView({
|
|
|
1533
1762
|
] }),
|
|
1534
1763
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { marginTop: 1, flexDirection: "column", children: [
|
|
1535
1764
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "magenta", children: "test case scores" }),
|
|
1536
|
-
testCases.map((tc
|
|
1537
|
-
|
|
1538
|
-
|
|
1539
|
-
|
|
1540
|
-
|
|
1541
|
-
]
|
|
1542
|
-
|
|
1543
|
-
|
|
1544
|
-
|
|
1545
|
-
|
|
1765
|
+
testCases.map((tc) => {
|
|
1766
|
+
const numericScores = tc.aggregatedEvaluatorScores.flatMap(
|
|
1767
|
+
(item) => item.scores.map((s) => toNumericScoreFromScores([s])).filter((n) => n !== void 0)
|
|
1768
|
+
);
|
|
1769
|
+
const averageScore = numericScores.length > 0 ? numericScores.reduce((a, b) => a + b, 0) / numericScores.length : void 0;
|
|
1770
|
+
const firstScore = tc.aggregatedEvaluatorScores[0]?.scores[0];
|
|
1771
|
+
const scoreLabel = firstScore && tc.isAggregated ? formatScorePart(firstScore, scoreColor, {
|
|
1772
|
+
isAggregated: true
|
|
1773
|
+
}) : averageScore !== void 0 ? averageScore.toFixed(2) : "n/a";
|
|
1774
|
+
return /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { children: [
|
|
1775
|
+
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: tc.passed ? "green" : "red", children: tc.passed ? "PASS" : "FAIL" }),
|
|
1776
|
+
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
1777
|
+
" ",
|
|
1778
|
+
tc.name.padEnd(24)
|
|
1546
1779
|
] }),
|
|
1780
|
+
averageScore !== void 0 ? /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
|
|
1781
|
+
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: scoreColor(averageScore), children: [
|
|
1782
|
+
"score=",
|
|
1783
|
+
scoreLabel
|
|
1784
|
+
] }),
|
|
1785
|
+
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
1786
|
+
" ",
|
|
1787
|
+
createBar(averageScore, 100, 14)
|
|
1788
|
+
] })
|
|
1789
|
+
] }) : /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "score=n/a" }),
|
|
1547
1790
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
1548
|
-
" ",
|
|
1549
|
-
|
|
1791
|
+
" (",
|
|
1792
|
+
tc.durationMs,
|
|
1793
|
+
"ms)"
|
|
1550
1794
|
] })
|
|
1551
|
-
] }
|
|
1552
|
-
|
|
1553
|
-
" (",
|
|
1554
|
-
tc.durationMs,
|
|
1555
|
-
"ms)"
|
|
1556
|
-
] })
|
|
1557
|
-
] }, i))
|
|
1795
|
+
] }, tc.testCaseId);
|
|
1796
|
+
})
|
|
1558
1797
|
] }),
|
|
1559
1798
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginTop: 1, children: /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
1560
1799
|
"artifact: ",
|
|
@@ -1565,6 +1804,51 @@ function RunView({
|
|
|
1565
1804
|
}
|
|
1566
1805
|
|
|
1567
1806
|
// src/cli-simple/run.ts
|
|
1807
|
+
/**
 * Builds one summary row per test case from the run events collected for it.
 *
 * For every value in `byId` (read for `name` and `events`) this computes:
 *  - `passed`: true only when every event reports `passed`;
 *  - `durationMs`: the sum of `durationMs` over all events;
 *  - `isAggregated`: true when more than one event was recorded;
 *  - `averageScore`: the mean of the numeric values of all aggregated scores,
 *    or `undefined` when no aggregated score yields a numeric value;
 *  - `aggregatedScoreItem`: the first aggregated score that yielded a numeric
 *    value, or `undefined` when there is none.
 *
 * Score items are grouped per evaluator and per score id across all events,
 * then each group is reduced with `aggregateScoreItems` and converted with
 * `toNumericScoreFromScores`.
 *
 * @param {Map} byId Map whose values carry `name` and an `events` array; each
 *   event is read for `passed`, `durationMs` and `evaluatorScores`.
 * @returns {Array<object>} One summary object per map entry, in map order.
 */
function buildTestCaseSummaries(byId) {
  const summaries = [];
  for (const { name, events } of byId.values()) {
    const passed = events.every((e) => e.passed);
    const durationMs = events.reduce((sum, e) => sum + e.durationMs, 0);
    const isAggregated = events.length > 1;
    const numericScores = [];
    let firstAggregatedScore;
    // Enumerate evaluators across every event (first-appearance order) rather
    // than only the first event, so an evaluator that first reports in a later
    // event still contributes. This matches how
    // aggregateEvaluatorScoresFromEvents collects evaluator ids.
    const evaluatorIds = new Set(
      events.flatMap((ev) => (ev.evaluatorScores ?? []).map((x) => x.evaluatorId))
    );
    for (const evaluatorId of evaluatorIds) {
      const scoreIdToItems = /* @__PURE__ */ new Map();
      for (const ev of events) {
        const es = (ev.evaluatorScores ?? []).find(
          (x) => x.evaluatorId === evaluatorId
        );
        for (const s of es?.scores ?? []) {
          const list = scoreIdToItems.get(s.id) ?? [];
          list.push(s);
          scoreIdToItems.set(s.id, list);
        }
      }
      for (const items of scoreIdToItems.values()) {
        const agg = aggregateScoreItems(items);
        if (!agg) {
          continue;
        }
        const n = toNumericScoreFromScores([agg]);
        if (n === void 0) {
          continue;
        }
        numericScores.push(n);
        // Remember the first aggregate that has a numeric value.
        if (firstAggregatedScore === void 0) {
          firstAggregatedScore = agg;
        }
      }
    }
    const averageScore = numericScores.length > 0 ? numericScores.reduce((a, b) => a + b, 0) / numericScores.length : void 0;
    summaries.push({
      name,
      averageScore,
      aggregatedScoreItem: firstAggregatedScore,
      isAggregated,
      durationMs,
      passed
    });
  }
  return summaries;
}
|
|
1568
1852
|
var ansi2 = {
|
|
1569
1853
|
reset: "\x1B[0m",
|
|
1570
1854
|
bold: "\x1B[1m",
|
|
@@ -1599,7 +1883,50 @@ function createBar2(value, max = 100, width = 20) {
|
|
|
1599
1883
|
const filled = Math.round(safe / max * width);
|
|
1600
1884
|
return `${"\u2588".repeat(filled)}${"\u2591".repeat(width - filled)}`;
|
|
1601
1885
|
}
|
|
1602
|
-
function
|
|
1886
|
+
/**
 * Collapses the evaluator results of several events into one entry per
 * evaluator id.
 *
 * Evaluator ids are taken from all events in first-appearance order. For each
 * id, score items and metric items are grouped by their own `id` across the
 * events, and every group is reduced with `aggregateScoreItems` /
 * `aggregateMetricItems`. Falsy score aggregates and `undefined` metric
 * aggregates are dropped. An evaluator counts as passed only when every event
 * contains an entry for it with a non-nullish `passed` that is truthy; an
 * event without an entry for the evaluator makes it fail.
 *
 * @param {Array<object>} events Events, each read for `evaluatorScores`.
 * @param {*} evaluatorNameById Not read by this function.
 * @returns {Array<object>} Entries of the form
 *   `{ evaluatorId, scores, passed, metrics }`; `metrics` is `undefined` when
 *   no metric aggregate was produced. Empty when `events` is empty.
 */
function aggregateEvaluatorScoresFromEvents(events, evaluatorNameById) {
  if (events.length === 0) {
    return [];
  }
  // First entry of an event that belongs to the given evaluator, if any.
  const entryFor = (event, id) => event.evaluatorScores.find((entry) => entry.evaluatorId === id);
  // Append `item` to the bucket keyed by its own id.
  const bucketById = (buckets, item) => {
    const bucket = buckets.get(item.id) ?? [];
    bucket.push(item);
    buckets.set(item.id, bucket);
  };
  const uniqueEvaluatorIds = new Set();
  for (const event of events) {
    for (const entry of event.evaluatorScores) {
      uniqueEvaluatorIds.add(entry.evaluatorId);
    }
  }
  const merged = [];
  uniqueEvaluatorIds.forEach((evaluatorId) => {
    const scoreBuckets = /* @__PURE__ */ new Map();
    const metricBuckets = /* @__PURE__ */ new Map();
    events.forEach((event) => {
      const entry = entryFor(event, evaluatorId);
      (entry?.scores ?? []).forEach((score) => bucketById(scoreBuckets, score));
      (entry?.metrics ?? []).forEach((metric) => bucketById(metricBuckets, metric));
    });
    const scores = [];
    scoreBuckets.forEach((items) => {
      const combined = aggregateScoreItems(items);
      if (combined) {
        scores.push(combined);
      }
    });
    const metrics = [];
    metricBuckets.forEach((items) => {
      const combined = aggregateMetricItems(items);
      if (combined !== void 0) {
        metrics.push(combined);
      }
    });
    let passed = true;
    for (const event of events) {
      if (!(entryFor(event, evaluatorId)?.passed ?? false)) {
        passed = false;
        break;
      }
    }
    merged.push({
      evaluatorId,
      scores,
      passed,
      metrics: metrics.length > 0 ? metrics : void 0
    });
  });
  return merged;
}
|
|
1929
|
+
function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
|
|
1603
1930
|
const passLabel = passed ? colorize("PASS", `${ansi2.bold}${ansi2.green}`) : colorize("FAIL", `${ansi2.bold}${ansi2.red}`);
|
|
1604
1931
|
const scoreParts = [];
|
|
1605
1932
|
for (const item of scores) {
|
|
@@ -1611,7 +1938,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics) {
|
|
|
1611
1938
|
);
|
|
1612
1939
|
continue;
|
|
1613
1940
|
}
|
|
1614
|
-
const formatted = def.format(item.data);
|
|
1941
|
+
const formatted = def.format(item.data, options);
|
|
1615
1942
|
switch (def.displayStrategy) {
|
|
1616
1943
|
case "bar": {
|
|
1617
1944
|
const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
|
|
@@ -1644,7 +1971,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics) {
|
|
|
1644
1971
|
for (const { id, data } of metrics) {
|
|
1645
1972
|
const def = getMetricById(id);
|
|
1646
1973
|
if (def) {
|
|
1647
|
-
const formatted = def.format(data);
|
|
1974
|
+
const formatted = def.format(data, options);
|
|
1648
1975
|
metricParts.push(
|
|
1649
1976
|
def.name ? `[${def.name}: ${formatted}]` : `[${formatted}]`
|
|
1650
1977
|
);
|
|
@@ -1677,7 +2004,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
1677
2004
|
evaluators.map((item) => [item.id, item.evaluator.getName() ?? item.id])
|
|
1678
2005
|
);
|
|
1679
2006
|
const aggregates = /* @__PURE__ */ new Map();
|
|
1680
|
-
const
|
|
2007
|
+
const testCaseByTestId = /* @__PURE__ */ new Map();
|
|
1681
2008
|
let overallScoreTotal = 0;
|
|
1682
2009
|
let overallScoreCount = 0;
|
|
1683
2010
|
let completedCount = 0;
|
|
@@ -1691,6 +2018,11 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
1691
2018
|
}
|
|
1692
2019
|
process.stdout.write("\r\x1B[2K");
|
|
1693
2020
|
}
|
|
2021
|
+
/**
 * Writes the escape sequence `ESC [ n A` to stdout.
 * Does nothing when stdout is not a TTY or when `n <= 0`.
 *
 * @param {number} n Count interpolated into the escape sequence.
 */
function cursorUp(n) {
  const out = process.stdout;
  if (!out.isTTY) {
    return;
  }
  if (n <= 0) {
    return;
  }
  out.write(`\x1B[${n}A`);
}
|
|
1694
2026
|
function drawSpinner() {
|
|
1695
2027
|
if (!process.stdout.isTTY || runFinished) {
|
|
1696
2028
|
return;
|
|
@@ -1704,6 +2036,8 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
1704
2036
|
)} ${colorize("(live)", ansi2.dim)}`
|
|
1705
2037
|
);
|
|
1706
2038
|
}
|
|
2039
|
+
let lastPrintedTestCaseId = null;
|
|
2040
|
+
let lastPrintedLineCount = 0;
|
|
1707
2041
|
let spinnerTimer;
|
|
1708
2042
|
const done = new Promise((resolve5) => {
|
|
1709
2043
|
const unsubscribe = runner.subscribeRunEvents((event) => {
|
|
@@ -1711,31 +2045,19 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
1711
2045
|
completedCount = event.completedTestCases;
|
|
1712
2046
|
const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
|
|
1713
2047
|
const averageScore = numericScores.length > 0 ? numericScores.reduce((sum, value) => sum + value, 0) / numericScores.length : void 0;
|
|
1714
|
-
|
|
1715
|
-
|
|
1716
|
-
|
|
1717
|
-
|
|
2048
|
+
const testCaseId = event.testCaseId;
|
|
2049
|
+
const existing = testCaseByTestId.get(testCaseId) ?? {
|
|
2050
|
+
name: event.testCaseName,
|
|
2051
|
+
events: []
|
|
2052
|
+
};
|
|
2053
|
+
existing.events.push({
|
|
2054
|
+
averageScore,
|
|
2055
|
+
passed: event.passed,
|
|
2056
|
+
durationMs: event.durationMs,
|
|
2057
|
+
evaluatorScores: event.evaluatorScores
|
|
2058
|
+
});
|
|
2059
|
+
testCaseByTestId.set(testCaseId, existing);
|
|
1718
2060
|
for (const item of event.evaluatorScores) {
|
|
1719
|
-
const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
|
|
1720
|
-
console.log(
|
|
1721
|
-
formatEvaluatorScoreLine(
|
|
1722
|
-
name,
|
|
1723
|
-
item.scores,
|
|
1724
|
-
item.passed,
|
|
1725
|
-
item.metrics
|
|
1726
|
-
)
|
|
1727
|
-
);
|
|
1728
|
-
if (!item.passed && item.logs && item.logs.length > 0) {
|
|
1729
|
-
for (const log of item.logs) {
|
|
1730
|
-
if (log.type === "diff") {
|
|
1731
|
-
const useColor = process.stdout.isTTY;
|
|
1732
|
-
for (const { type, line } of getDiffLines(log)) {
|
|
1733
|
-
const colored = useColor && type === "remove" ? colorize(` ${line}`, ansi2.red) : useColor && type === "add" ? colorize(` ${line}`, ansi2.green) : ` ${line}`;
|
|
1734
|
-
console.log(colored);
|
|
1735
|
-
}
|
|
1736
|
-
}
|
|
1737
|
-
}
|
|
1738
|
-
}
|
|
1739
2061
|
const numeric = toNumericScoreFromScores(item.scores);
|
|
1740
2062
|
if (numeric !== void 0) {
|
|
1741
2063
|
const current = aggregates.get(item.evaluatorId) ?? {
|
|
@@ -1754,12 +2076,60 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
1754
2076
|
overallScoreCount += 1;
|
|
1755
2077
|
}
|
|
1756
2078
|
}
|
|
1757
|
-
|
|
1758
|
-
|
|
1759
|
-
|
|
1760
|
-
|
|
1761
|
-
|
|
1762
|
-
|
|
2079
|
+
const isSameTestCase = lastPrintedTestCaseId === testCaseId;
|
|
2080
|
+
const isLastRerun = event.rerunIndex >= event.rerunTotal;
|
|
2081
|
+
const isNonTty = !process.stdout.isTTY;
|
|
2082
|
+
const skipPrintNonTty = isNonTty && event.rerunTotal > 1 && !isLastRerun;
|
|
2083
|
+
if (isSameTestCase && lastPrintedLineCount > 0 && !skipPrintNonTty) {
|
|
2084
|
+
cursorUp(lastPrintedLineCount);
|
|
2085
|
+
}
|
|
2086
|
+
const aggregatedScores = aggregateEvaluatorScoresFromEvents(
|
|
2087
|
+
existing.events);
|
|
2088
|
+
const isAggregated = existing.events.length > 1;
|
|
2089
|
+
const durationMs = existing.events.reduce(
|
|
2090
|
+
(s, e) => s + e.durationMs,
|
|
2091
|
+
0
|
|
2092
|
+
);
|
|
2093
|
+
existing.events.every((e) => e.passed);
|
|
2094
|
+
const lines = [];
|
|
2095
|
+
lines.push(
|
|
2096
|
+
`${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.rerunIndex}/${event.rerunTotal})`, ansi2.cyan)} ${colorize(`(${durationMs}ms)`, ansi2.dim)}`
|
|
2097
|
+
);
|
|
2098
|
+
for (const item of aggregatedScores) {
|
|
2099
|
+
const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
|
|
2100
|
+
lines.push(
|
|
2101
|
+
formatEvaluatorScoreLine(
|
|
2102
|
+
name,
|
|
2103
|
+
item.scores,
|
|
2104
|
+
item.passed,
|
|
2105
|
+
item.metrics,
|
|
2106
|
+
{ isAggregated }
|
|
2107
|
+
)
|
|
2108
|
+
);
|
|
2109
|
+
const lastEvent = existing.events[existing.events.length - 1];
|
|
2110
|
+
const lastEs = lastEvent?.evaluatorScores.find(
|
|
2111
|
+
(x) => x.evaluatorId === item.evaluatorId
|
|
2112
|
+
);
|
|
2113
|
+
if (!item.passed && lastEs?.logs && lastEs.logs.length > 0) {
|
|
2114
|
+
for (const log of lastEs.logs) {
|
|
2115
|
+
if (log.type === "diff") {
|
|
2116
|
+
const useColor = process.stdout.isTTY;
|
|
2117
|
+
for (const { type, line } of getDiffLines(log)) {
|
|
2118
|
+
const colored = useColor && type === "remove" ? colorize(` ${line}`, ansi2.red) : useColor && type === "add" ? colorize(` ${line}`, ansi2.green) : ` ${line}`;
|
|
2119
|
+
lines.push(colored);
|
|
2120
|
+
}
|
|
2121
|
+
}
|
|
2122
|
+
}
|
|
2123
|
+
}
|
|
2124
|
+
}
|
|
2125
|
+
if (!skipPrintNonTty) {
|
|
2126
|
+
for (let i = 0; i < lines.length; i++) {
|
|
2127
|
+
process.stdout.write(`\r\x1B[2K${lines[i]}
|
|
2128
|
+
`);
|
|
2129
|
+
}
|
|
2130
|
+
lastPrintedTestCaseId = testCaseId;
|
|
2131
|
+
lastPrintedLineCount = lines.length;
|
|
2132
|
+
}
|
|
1763
2133
|
drawSpinner();
|
|
1764
2134
|
}
|
|
1765
2135
|
if (event.type === "RunCompleted" || event.type === "RunFailed") {
|
|
@@ -1823,6 +2193,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
1823
2193
|
getEvaluatorSummaryLine(evaluatorName, aggregates.get(evaluatorId))
|
|
1824
2194
|
);
|
|
1825
2195
|
}
|
|
2196
|
+
const testCaseSummaries = buildTestCaseSummaries(testCaseByTestId);
|
|
1826
2197
|
if (testCaseSummaries.length > 0) {
|
|
1827
2198
|
console.log(colorize("- test case scores:", ansi2.magenta));
|
|
1828
2199
|
for (const summary of testCaseSummaries) {
|
|
@@ -1833,9 +2204,13 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
1833
2204
|
);
|
|
1834
2205
|
continue;
|
|
1835
2206
|
}
|
|
2207
|
+
const scoreLabel = summary.isAggregated && summary.aggregatedScoreItem ? getScoreById(summary.aggregatedScoreItem.id)?.format(
|
|
2208
|
+
summary.aggregatedScoreItem.data,
|
|
2209
|
+
{ isAggregated: true }
|
|
2210
|
+
) ?? summary.averageScore.toFixed(2) : summary.averageScore.toFixed(2);
|
|
1836
2211
|
console.log(
|
|
1837
2212
|
` ${status} ${summary.name.padEnd(24)} score=${colorize(
|
|
1838
|
-
|
|
2213
|
+
scoreLabel,
|
|
1839
2214
|
scoreToColor(summary.averageScore)
|
|
1840
2215
|
)} ${colorize(createBar2(summary.averageScore, 100, 14), ansi2.dim)} ${colorize(`(${summary.durationMs}ms)`, ansi2.dim)}`
|
|
1841
2216
|
);
|