@m4trix/evals 0.11.0 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +719 -227
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +721 -229
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +1320 -928
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +1322 -930
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +335 -99
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +24 -5
- package/dist/index.js +337 -101
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli-simple.cjs
CHANGED
|
@@ -56,7 +56,8 @@ var defaultRunnerConfig = {
|
|
|
56
56
|
],
|
|
57
57
|
excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
|
|
58
58
|
},
|
|
59
|
-
artifactDirectory: ".eval-results"
|
|
59
|
+
artifactDirectory: ".eval-results",
|
|
60
|
+
maxConcurrency: 1
|
|
60
61
|
};
|
|
61
62
|
function toRunnerConfigOverrides(config) {
|
|
62
63
|
if (!config) {
|
|
@@ -89,6 +90,9 @@ function toRunnerConfigOverrides(config) {
|
|
|
89
90
|
if (config.artifactDirectory !== void 0) {
|
|
90
91
|
overrides.artifactDirectory = config.artifactDirectory;
|
|
91
92
|
}
|
|
93
|
+
if (config.maxConcurrency !== void 0) {
|
|
94
|
+
overrides.maxConcurrency = config.maxConcurrency;
|
|
95
|
+
}
|
|
92
96
|
if (Object.keys(discovery).length > 0) {
|
|
93
97
|
overrides.discovery = discovery;
|
|
94
98
|
}
|
|
@@ -313,6 +317,7 @@ var Metric = {
|
|
|
313
317
|
const def = {
|
|
314
318
|
id: config.id,
|
|
315
319
|
name: config.name,
|
|
320
|
+
aggregate: config.aggregate,
|
|
316
321
|
format: config.format,
|
|
317
322
|
make: (data) => ({ id: config.id, data })
|
|
318
323
|
};
|
|
@@ -332,6 +337,7 @@ var Score = {
|
|
|
332
337
|
id: config.id,
|
|
333
338
|
name: config.name,
|
|
334
339
|
displayStrategy: config.displayStrategy,
|
|
340
|
+
aggregate: config.aggregate,
|
|
335
341
|
format: config.format,
|
|
336
342
|
make: (data, options) => {
|
|
337
343
|
const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
|
|
@@ -350,23 +356,62 @@ function getScoreById(id) {
|
|
|
350
356
|
return registry2.get(id);
|
|
351
357
|
}
|
|
352
358
|
|
|
359
|
+
// src/evals/aggregators.ts
|
|
360
|
+
/**
 * Averages the `value` field of a list of score payloads.
 *
 * @param {Array<{ value: number }>} values - Payloads whose `value` is averaged.
 * @returns {{ value: number }} The arithmetic mean, or `{ value: 0 }` when the
 *   list is empty (avoids dividing by zero).
 */
function aggregateAverage(values) {
  const count = values.length;
  if (count === 0) {
    return { value: 0 };
  }
  let total = 0;
  for (const entry of values) {
    total = total + entry.value;
  }
  return { value: total / count };
}
|
|
367
|
+
/**
 * Combines pass/fail payloads: the result passes only when there is at least
 * one payload and every payload has a truthy `passed`.
 *
 * @param {Array<{ passed: boolean }>} values - Payloads to combine.
 * @returns {{ passed: boolean }} Combined outcome; an empty list does not pass.
 */
function aggregateAll(values) {
  if (values.length === 0) {
    return { passed: false };
  }
  const everyPassed = values.every((entry) => entry.passed);
  return { passed: everyPassed };
}
|
|
370
|
+
/**
 * Sums token counters field by field across a list of payloads.
 * A counter that is `null` or `undefined` on a payload contributes 0.
 *
 * @param {Array<{ input?: number, output?: number, inputCached?: number, outputCached?: number }>} values
 *   Payloads to add up.
 * @returns {{ input: number, output: number, inputCached: number, outputCached: number }}
 *   Per-field totals (all zero for an empty list).
 */
function aggregateTokenCountSum(values) {
  let totals = {
    input: 0,
    output: 0,
    inputCached: 0,
    outputCached: 0
  };
  for (const entry of values) {
    totals = {
      input: totals.input + (entry.input ?? 0),
      output: totals.output + (entry.output ?? 0),
      inputCached: totals.inputCached + (entry.inputCached ?? 0),
      outputCached: totals.outputCached + (entry.outputCached ?? 0)
    };
  }
  return totals;
}
|
|
387
|
+
/**
 * Averages the `ms` field of a list of latency payloads.
 *
 * @param {Array<{ ms: number }>} values - Latency payloads.
 * @returns {{ ms: number }} The arithmetic mean, or `{ ms: 0 }` when the list
 *   is empty (avoids dividing by zero).
 */
function aggregateLatencyAverage(values) {
  const count = values.length;
  if (count === 0) {
    return { ms: 0 };
  }
  let totalMs = 0;
  for (const entry of values) {
    totalMs = totalMs + entry.ms;
  }
  return { ms: totalMs / count };
}
|
|
394
|
+
|
|
353
395
|
// src/evals/metrics/standard.ts
|
|
354
396
|
// Registers the "token-count" metric. Reruns are combined with
// aggregateTokenCountSum; the label reads "in:<n> out:<n> cached:<n>", where
// "cached" is inputCached + outputCached, prefixed with "Total: " when the
// value being formatted is an aggregate.
Metric.of({
  id: "token-count",
  name: "Tokens",
  aggregate: aggregateTokenCountSum,
  format: (data, options) => {
    const inTokens = data.input ?? 0;
    const outTokens = data.output ?? 0;
    const cachedTokens = (data.inputCached ?? 0) + (data.outputCached ?? 0);
    const summary = `in:${inTokens} out:${outTokens} cached:${cachedTokens}`;
    if (options?.isAggregated) {
      return `Total: ${summary}`;
    }
    return summary;
  }
});
|
|
366
410
|
// Registers the "latency" metric. Reruns are combined with
// aggregateLatencyAverage, i.e. the aggregated value is a mean.
Metric.of({
  id: "latency",
  name: "Latency",
  aggregate: aggregateLatencyAverage,
  format: (data, options) => {
    if (options?.isAggregated) {
      // A mean of integer latencies is usually fractional (for example
      // 131.33333333333334), so the aggregated label is rounded to at most two
      // decimals. Non-numeric values are printed untouched.
      const shown = typeof data.ms === "number" ? Number(data.ms.toFixed(2)) : data.ms;
      return `Avg: ${shown}ms`;
    }
    // Single-run values are printed exactly as recorded.
    return `${data.ms}ms`;
  }
});
|
|
371
416
|
|
|
372
417
|
// src/evals/scores/standard.ts
|
|
@@ -374,16 +419,36 @@ Score.of({
|
|
|
374
419
|
id: "percent",
|
|
375
420
|
name: "Score",
|
|
376
421
|
displayStrategy: "bar",
|
|
377
|
-
format: (data) => data.value.toFixed(2)
|
|
422
|
+
format: (data, options) => options?.isAggregated ? `Avg: ${data.value.toFixed(2)}` : data.value.toFixed(2),
|
|
423
|
+
aggregate: aggregateAverage
|
|
378
424
|
});
|
|
379
425
|
// Registers the "binary" pass/fail score. Reruns are combined with
// aggregateAll; an aggregated value is labelled "All: PASSED" / "Some: FAILED",
// a single value "PASSED" / "NOT PASSED".
Score.of({
  id: "binary",
  name: "Result",
  displayStrategy: "passFail",
  format: (data, options) => {
    if (options?.isAggregated) {
      return data.passed ? "All: PASSED" : "Some: FAILED";
    }
    return data.passed ? "PASSED" : "NOT PASSED";
  },
  aggregate: aggregateAll
});
|
|
385
432
|
|
|
386
433
|
// src/runner/score-utils.ts
|
|
434
|
+
/**
 * Collapses several score items that share an id into one item.
 *
 * The score definition is looked up from the first item's id. When that
 * definition provides an `aggregate` function, it is applied to the items'
 * `data` payloads; otherwise the last item is returned unchanged.
 *
 * NOTE(review): apart from `data`, every field of the aggregated item is copied
 * from the first item — confirm that is intended for any per-run fields.
 *
 * @param {Array<{ id: string, data: unknown }>} items - Score items for one score id.
 * @returns {object | undefined} The combined item, or `undefined` for an empty list.
 */
function aggregateScoreItems(items) {
  if (items.length === 0) {
    return undefined;
  }
  const first = items[0];
  const definition = getScoreById(first.id);
  if (definition?.aggregate) {
    const payloads = items.map((item) => item.data);
    const combined = definition.aggregate(payloads);
    return { ...first, data: combined };
  }
  return items[items.length - 1];
}
|
|
443
|
+
/**
 * Collapses several metric items that share an id into one item.
 *
 * The metric definition is looked up from the first item's id. When that
 * definition provides an `aggregate` function, it is applied to the items'
 * `data` payloads and the result replaces `data` on a copy of the first item;
 * otherwise the last item is returned unchanged.
 *
 * @param {Array<{ id: string, data: unknown }>} items - Metric items for one metric id.
 * @returns {object | undefined} The combined item, or `undefined` for an empty list.
 */
function aggregateMetricItems(items) {
  if (items.length === 0) {
    return undefined;
  }
  const first = items[0];
  const definition = getMetricById(first.id);
  if (definition?.aggregate) {
    const payloads = items.map((item) => item.data);
    const combined = definition.aggregate(payloads);
    return { ...first, data: combined };
  }
  return items[items.length - 1];
}
|
|
387
452
|
function toNumericScoreFromScores(scores) {
|
|
388
453
|
for (const item of scores) {
|
|
389
454
|
const def = getScoreById(item.id);
|
|
@@ -462,6 +527,105 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
|
|
|
462
527
|
`${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
|
|
463
528
|
);
|
|
464
529
|
}
|
|
530
|
+
/**
 * Builds the effect that evaluates one test case: every evaluator of the task
 * is run once per rerun, each rerun is reported as a `TestCaseProgress` event
 * (published and queued for persistence), and the shared counters are updated.
 *
 * The test case counts as passed only when every rerun passed; a rerun passes
 * only when every evaluator result in it passed.
 *
 * @param {object} task - Run task; reads `runId`, `evaluators` and `snapshot.artifactPath`.
 * @param {object} testCaseItem - Item with `id` and `testCase` (`getName`, `getInput`, optional `getReruns`).
 * @param {number} totalEvaluations - Reported as `totalTestCases` on every progress event.
 * @param {Function} publishEvent - Returns an effect that publishes a run event.
 * @param {object} persistenceQueue - Queue that receives `{ runId, artifactPath, payload }`.
 * @param {Function} updateSnapshot - `(runId, updater)` snapshot mutator.
 * @param {object} completedRef - Ref counting finished evaluations (one per rerun).
 * @param {object} passedRef - Ref counting test cases whose reruns all passed.
 * @param {object} failedRef - Ref counting test cases with at least one failed rerun.
 * @returns The effect described above.
 */
function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, completedRef, passedRef, failedRef) {
  return effect.Effect.gen(function* () {
    const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
    const rerunPassed = [];
    for (let r = 0; r < reruns; r++) {
      const started = Date.now();
      const evaluatorScores = [];
      let testCaseError;
      const output = readOutput(testCaseItem.testCase);
      for (const { id: evaluatorId, evaluator } of task.evaluators) {
        const evaluateFn = evaluator.getEvaluateFn();
        if (!evaluateFn) {
          continue;
        }
        const logs = [];
        const logDiff = (expected, actual, options) => {
          logs.push(createDiffLogEntry(expected, actual, options));
        };
        // Effect.promise turns a rejected promise into a defect, and a failed
        // effect yielded inside Effect.gen is not thrown back into the
        // generator, so a try/catch around `yield*` never sees an evaluator
        // failure. The failure is therefore caught inside the async thunk and
        // handed back as a plain value, which keeps one broken evaluator from
        // aborting the whole run.
        const outcome = yield* effect.Effect.promise(async () => {
          try {
            const ctx = await evaluator.resolveContext();
            const result = await evaluateFn({
              input: testCaseItem.testCase.getInput(),
              ctx,
              output,
              logDiff
            });
            const { scores, metrics } = normalizeResult(result);
            const passed2 = computeEvaluatorPassed(evaluator, result, scores);
            return { ok: true, scores, metrics, passed: passed2 };
          } catch (error) {
            return { ok: false, error };
          }
        });
        if (outcome.ok) {
          evaluatorScores.push({
            evaluatorId,
            scores: outcome.scores,
            passed: outcome.passed,
            metrics: outcome.metrics,
            logs: logs.length > 0 ? logs : void 0
          });
        } else {
          // Record the failure on the test case and mark this evaluator failed.
          testCaseError = outcome.error instanceof Error ? outcome.error.message : "Evaluator execution failed";
          evaluatorScores.push({
            evaluatorId,
            scores: [],
            passed: false
          });
        }
      }
      const rerunPassedThis = evaluatorScores.every((s) => s.passed);
      rerunPassed.push(rerunPassedThis);
      // Increment and read the shared counter in a single Ref operation.
      const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [
        n + 1,
        n + 1
      ]);
      const progressEvent = {
        type: "TestCaseProgress",
        runId: task.runId,
        testCaseId: testCaseItem.id,
        testCaseName: testCaseItem.testCase.getName(),
        completedTestCases: completedEvaluations,
        totalTestCases: totalEvaluations,
        rerunIndex: r + 1,
        rerunTotal: reruns,
        passed: rerunPassedThis,
        durationMs: Date.now() - started,
        evaluatorScores,
        output,
        errorMessage: testCaseError
      };
      updateSnapshot(task.runId, (snapshot) => ({
        ...snapshot,
        completedTestCases: completedEvaluations
      }));
      yield* publishEvent(progressEvent);
      yield* effect.Queue.offer(persistenceQueue, {
        runId: task.runId,
        artifactPath: task.snapshot.artifactPath,
        payload: progressEvent
      });
    }
    // Passed/failed counters are per test case, not per rerun.
    const testCasePassed = rerunPassed.every(Boolean);
    if (testCasePassed) {
      yield* effect.Ref.update(passedRef, (n) => n + 1);
    } else {
      yield* effect.Ref.update(failedRef, (n) => n + 1);
    }
    const [passed, failed] = yield* effect.Effect.all([
      effect.Ref.get(passedRef),
      effect.Ref.get(failedRef)
    ]);
    updateSnapshot(task.runId, (snapshot) => ({
      ...snapshot,
      passedTestCases: passed,
      failedTestCases: failed
    }));
  });
}
|
|
465
629
|
var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => effect.Effect.gen(function* () {
|
|
466
630
|
const startedAt = Date.now();
|
|
467
631
|
updateSnapshot(task.runId, (snapshot) => ({
|
|
@@ -474,104 +638,51 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
474
638
|
runId: task.runId,
|
|
475
639
|
startedAt
|
|
476
640
|
});
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
)
|
|
507
|
-
);
|
|
508
|
-
const { scores, metrics } = normalizeResult(result);
|
|
509
|
-
const passed = computeEvaluatorPassed(evaluator, result, scores);
|
|
510
|
-
evaluatorScores.push({
|
|
511
|
-
evaluatorId,
|
|
512
|
-
scores,
|
|
513
|
-
passed,
|
|
514
|
-
metrics,
|
|
515
|
-
logs: logs.length > 0 ? logs : void 0
|
|
516
|
-
});
|
|
517
|
-
} catch (error) {
|
|
518
|
-
testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
|
|
519
|
-
evaluatorScores.push({
|
|
520
|
-
evaluatorId,
|
|
521
|
-
scores: [],
|
|
522
|
-
passed: false
|
|
523
|
-
});
|
|
524
|
-
}
|
|
525
|
-
}
|
|
526
|
-
const testCasePassed = evaluatorScores.every((s) => s.passed);
|
|
527
|
-
completedTestCases += 1;
|
|
528
|
-
if (testCasePassed) {
|
|
529
|
-
passedTestCases += 1;
|
|
530
|
-
} else {
|
|
531
|
-
failedTestCases += 1;
|
|
532
|
-
}
|
|
533
|
-
const progressEvent = {
|
|
534
|
-
type: "TestCaseProgress",
|
|
535
|
-
runId: task.runId,
|
|
536
|
-
testCaseId: testCaseItem.id,
|
|
537
|
-
testCaseName: testCaseItem.testCase.getName(),
|
|
538
|
-
completedTestCases,
|
|
539
|
-
totalTestCases: task.testCases.length,
|
|
540
|
-
passed: testCasePassed,
|
|
541
|
-
durationMs: Date.now() - started,
|
|
542
|
-
evaluatorScores,
|
|
543
|
-
output,
|
|
544
|
-
errorMessage: testCaseError
|
|
545
|
-
};
|
|
546
|
-
updateSnapshot(task.runId, (snapshot) => ({
|
|
547
|
-
...snapshot,
|
|
548
|
-
completedTestCases,
|
|
549
|
-
passedTestCases,
|
|
550
|
-
failedTestCases
|
|
551
|
-
}));
|
|
552
|
-
yield* publishEvent(progressEvent);
|
|
553
|
-
yield* effect.Queue.offer(persistenceQueue, {
|
|
554
|
-
runId: task.runId,
|
|
555
|
-
artifactPath: task.snapshot.artifactPath,
|
|
556
|
-
payload: progressEvent
|
|
557
|
-
});
|
|
558
|
-
}
|
|
641
|
+
const totalEvaluations = task.testCases.reduce(
|
|
642
|
+
(sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
|
|
643
|
+
0
|
|
644
|
+
);
|
|
645
|
+
const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
|
|
646
|
+
const completedRef = yield* effect.Ref.make(0);
|
|
647
|
+
const passedRef = yield* effect.Ref.make(0);
|
|
648
|
+
const failedRef = yield* effect.Ref.make(0);
|
|
649
|
+
const processTestCase = (testCaseItem) => processOneTestCase(
|
|
650
|
+
task,
|
|
651
|
+
testCaseItem,
|
|
652
|
+
totalEvaluations,
|
|
653
|
+
publishEvent,
|
|
654
|
+
persistenceQueue,
|
|
655
|
+
updateSnapshot,
|
|
656
|
+
completedRef,
|
|
657
|
+
passedRef,
|
|
658
|
+
failedRef
|
|
659
|
+
);
|
|
660
|
+
yield* effect.Effect.forEach(
|
|
661
|
+
task.testCases,
|
|
662
|
+
processTestCase,
|
|
663
|
+
maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
|
|
664
|
+
);
|
|
665
|
+
const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* effect.Effect.all([
|
|
666
|
+
effect.Ref.get(completedRef),
|
|
667
|
+
effect.Ref.get(passedRef),
|
|
668
|
+
effect.Ref.get(failedRef)
|
|
669
|
+
]);
|
|
559
670
|
const finishedAt = Date.now();
|
|
560
671
|
const completedEvent = {
|
|
561
672
|
type: "RunCompleted",
|
|
562
673
|
runId: task.runId,
|
|
563
674
|
finishedAt,
|
|
564
|
-
passedTestCases,
|
|
565
|
-
failedTestCases,
|
|
675
|
+
passedTestCases: passedUniqueTestCases,
|
|
676
|
+
failedTestCases: failedUniqueTestCases,
|
|
566
677
|
totalTestCases: task.testCases.length,
|
|
567
678
|
artifactPath: task.snapshot.artifactPath
|
|
568
679
|
};
|
|
569
680
|
updateSnapshot(task.runId, (snapshot) => ({
|
|
570
681
|
...snapshot,
|
|
571
682
|
status: "completed",
|
|
572
|
-
completedTestCases,
|
|
573
|
-
passedTestCases,
|
|
574
|
-
failedTestCases,
|
|
683
|
+
completedTestCases: completedEvaluations,
|
|
684
|
+
passedTestCases: passedUniqueTestCases,
|
|
685
|
+
failedTestCases: failedUniqueTestCases,
|
|
575
686
|
finishedAt
|
|
576
687
|
}));
|
|
577
688
|
yield* publishEvent(completedEvent);
|
|
@@ -586,6 +697,126 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
586
697
|
artifactPath: task.snapshot.artifactPath
|
|
587
698
|
});
|
|
588
699
|
});
|
|
700
|
+
/**
 * Rebuilds run snapshots from the `.jsonl` artifacts in the configured
 * artifact directory.
 *
 * Best-effort by design: an unreadable directory yields an empty list, and an
 * artifact that cannot be parsed is left out of the result.
 *
 * @param {{ artifactDirectory: string }} config - Runner configuration.
 * @returns {Promise<object[]>} Snapshots ordered by `queuedAt`, newest first.
 */
async function loadRunSnapshotsFromArtifacts(config) {
  const artifactRoot = path.resolve(config.artifactDirectory);
  let names;
  try {
    names = await promises.readdir(artifactRoot);
  } catch {
    // Directory missing or unreadable: nothing to load.
    return [];
  }
  const loaded = [];
  for (const name of names.filter((candidate) => candidate.endsWith(".jsonl"))) {
    let parsed = null;
    try {
      parsed = await parseArtifactToSnapshot(path.join(artifactRoot, name), config);
    } catch {
      // Unreadable artifact: skip it and keep going.
    }
    if (parsed) {
      loaded.push(parsed);
    }
  }
  loaded.sort((left, right) => right.queuedAt - left.queuedAt);
  return loaded;
}
|
|
722
|
+
/**
 * Reads one `.jsonl` artifact and folds its run events into a snapshot.
 *
 * Each non-blank line is parsed as JSON; lines that fail to parse are ignored.
 * For every event type the last occurrence wins. A file without a `RunQueued`
 * event (or without any non-blank line) yields `null`.
 *
 * @param {string} filePath - Path of the artifact; also reported as `artifactPath`.
 * @param {object} _config - Unused.
 * @returns {Promise<object | null>} The reconstructed snapshot, or `null`.
 */
async function parseArtifactToSnapshot(filePath, _config) {
  const raw = await promises.readFile(filePath, "utf8");
  const lines = raw.split("\n").filter((text) => text.trim().length > 0);
  if (lines.length === 0) {
    return null;
  }
  let runQueued = null;
  let runCompleted = null;
  let runFailed = null;
  let runStarted = null;
  for (const text of lines) {
    try {
      const event = JSON.parse(text);
      switch (event.type) {
        case "RunQueued":
          runQueued = {
            runId: event.runId,
            datasetId: event.datasetId,
            datasetName: event.datasetName,
            evaluatorIds: event.evaluatorIds,
            totalTestCases: event.totalTestCases ?? 0,
            artifactPath: event.artifactPath ?? filePath,
            ts: event.ts
          };
          break;
        case "RunStarted":
          runStarted = { startedAt: event.startedAt };
          break;
        case "RunCompleted":
          runCompleted = {
            passedTestCases: event.passedTestCases,
            failedTestCases: event.failedTestCases,
            totalTestCases: event.totalTestCases,
            finishedAt: event.finishedAt
          };
          break;
        case "RunFailed":
          runFailed = {
            finishedAt: event.finishedAt,
            errorMessage: event.errorMessage
          };
          break;
        default:
          break;
      }
    } catch {
      // Malformed line: ignore it.
    }
  }
  if (!runQueued) {
    return null;
  }
  // Precedence: failed > completed > running > queued.
  let status = "queued";
  if (runFailed) {
    status = "failed";
  } else if (runCompleted) {
    status = "completed";
  } else if (runStarted) {
    status = "running";
  }
  const progress = aggregateTestCaseProgress(lines);
  // A completed run reports the queued total; otherwise fall back to the
  // counters derived from the progress events.
  const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
  const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
  const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
  return {
    runId: runQueued.runId,
    datasetId: runQueued.datasetId,
    datasetName: runQueued.datasetName,
    evaluatorIds: runQueued.evaluatorIds,
    queuedAt: runQueued.ts ?? 0,
    startedAt: runStarted?.startedAt,
    finishedAt: runCompleted?.finishedAt ?? runFailed?.finishedAt,
    totalTestCases: runQueued.totalTestCases,
    completedTestCases,
    passedTestCases,
    failedTestCases,
    status,
    artifactPath: filePath,
    errorMessage: runFailed?.errorMessage
  };
}
|
|
793
|
+
/**
 * Derives progress counters from the raw JSON lines of a run artifact.
 *
 * Only `TestCaseProgress` events are considered; lines that are not valid JSON
 * are ignored. A test case counts as passed when every progress event recorded
 * for its `testCaseId` passed, and as failed otherwise.
 *
 * @param {string[]} lines - Non-blank lines of an artifact file.
 * @returns {{ completedTestCases: number, passedTestCases: number, failedTestCases: number }}
 *   `completedTestCases` is the highest counter seen; passed/failed count
 *   distinct test case ids.
 */
function aggregateTestCaseProgress(lines) {
  let completedTestCases = 0;
  const testCasePassedBy = /* @__PURE__ */ new Map();
  for (const line of lines) {
    let event;
    try {
      event = JSON.parse(line);
    } catch {
      // Malformed line: ignore it.
      continue;
    }
    if (event?.type !== "TestCaseProgress") {
      continue;
    }
    // Take the highest counter rather than the last one: when test cases run
    // concurrently the persisted order of progress events is not guaranteed to
    // be monotonic, and the count must never move backwards.
    if (Number.isFinite(event.completedTestCases)) {
      completedTestCases = Math.max(completedTestCases, event.completedTestCases);
    }
    const id = event.testCaseId;
    const current = testCasePassedBy.get(id);
    testCasePassedBy.set(id, current === void 0 ? event.passed : current && event.passed);
  }
  let passedTestCases = 0;
  let failedTestCases = 0;
  for (const passed of testCasePassedBy.values()) {
    if (passed) {
      passedTestCases += 1;
    } else {
      failedTestCases += 1;
    }
  }
  return { completedTestCases, passedTestCases, failedTestCases };
}
|
|
589
820
|
async function appendJsonLine(artifactPath, payload) {
|
|
590
821
|
await promises.mkdir(path.dirname(artifactPath), { recursive: true });
|
|
591
822
|
await promises.appendFile(artifactPath, `${JSON.stringify(payload)}
|
|
@@ -778,6 +1009,10 @@ var EffectRunner = class {
|
|
|
778
1009
|
throw new Error("No evaluators selected for run");
|
|
779
1010
|
}
|
|
780
1011
|
const selectedTestCases = await this.collectDatasetTestCases(request.datasetId);
|
|
1012
|
+
const totalEvaluations = selectedTestCases.reduce(
|
|
1013
|
+
(sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
|
|
1014
|
+
0
|
|
1015
|
+
);
|
|
781
1016
|
const runId = `run-${crypto.randomUUID()}`;
|
|
782
1017
|
const artifactPath = createArtifactPath(
|
|
783
1018
|
this.config.artifactDirectory,
|
|
@@ -790,7 +1025,7 @@ var EffectRunner = class {
|
|
|
790
1025
|
datasetName: dataset.dataset.getName(),
|
|
791
1026
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
792
1027
|
queuedAt: Date.now(),
|
|
793
|
-
totalTestCases:
|
|
1028
|
+
totalTestCases: totalEvaluations,
|
|
794
1029
|
completedTestCases: 0,
|
|
795
1030
|
passedTestCases: 0,
|
|
796
1031
|
failedTestCases: 0,
|
|
@@ -804,7 +1039,7 @@ var EffectRunner = class {
|
|
|
804
1039
|
datasetId: request.datasetId,
|
|
805
1040
|
datasetName: dataset.dataset.getName(),
|
|
806
1041
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
807
|
-
totalTestCases:
|
|
1042
|
+
totalTestCases: totalEvaluations,
|
|
808
1043
|
artifactPath
|
|
809
1044
|
};
|
|
810
1045
|
await effect.Effect.runPromise(this.publishEvent(queuedEvent));
|
|
@@ -815,6 +1050,7 @@ var EffectRunner = class {
|
|
|
815
1050
|
payload: queuedEvent
|
|
816
1051
|
})
|
|
817
1052
|
);
|
|
1053
|
+
const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
|
|
818
1054
|
await effect.Effect.runPromise(
|
|
819
1055
|
effect.Queue.offer(this.runQueue, {
|
|
820
1056
|
runId,
|
|
@@ -822,7 +1058,8 @@ var EffectRunner = class {
|
|
|
822
1058
|
dataset: dataset.dataset,
|
|
823
1059
|
evaluators: selectedEvaluators,
|
|
824
1060
|
testCases: selectedTestCases,
|
|
825
|
-
snapshot
|
|
1061
|
+
snapshot,
|
|
1062
|
+
maxConcurrency
|
|
826
1063
|
})
|
|
827
1064
|
);
|
|
828
1065
|
return snapshot;
|
|
@@ -842,6 +1079,9 @@ var EffectRunner = class {
|
|
|
842
1079
|
(a, b) => b.queuedAt - a.queuedAt
|
|
843
1080
|
);
|
|
844
1081
|
}
|
|
1082
|
+
async loadRunSnapshotsFromArtifacts() {
|
|
1083
|
+
return loadRunSnapshotsFromArtifacts(this.config);
|
|
1084
|
+
}
|
|
845
1085
|
async shutdown() {
|
|
846
1086
|
await effect.Effect.runPromise(effect.Fiber.interrupt(this.schedulerFiber));
|
|
847
1087
|
await effect.Effect.runPromise(effect.Fiber.interrupt(this.persistenceFiber));
|
|
@@ -973,7 +1213,7 @@ function GenerateView({
|
|
|
973
1213
|
return;
|
|
974
1214
|
}
|
|
975
1215
|
const { writeFile: writeFile2 } = await import('fs/promises');
|
|
976
|
-
const { join:
|
|
1216
|
+
const { join: join4, parse: parse2, resolve: resolve5 } = await import('path');
|
|
977
1217
|
const testCases = await runner.collectDatasetTestCases(dataset.id);
|
|
978
1218
|
const payload = testCases.map((item) => {
|
|
979
1219
|
const tc = item.testCase;
|
|
@@ -983,9 +1223,9 @@ function GenerateView({
|
|
|
983
1223
|
output: typeof tc.getOutput === "function" ? tc.getOutput() : void 0
|
|
984
1224
|
};
|
|
985
1225
|
});
|
|
986
|
-
const absoluteDatasetPath =
|
|
1226
|
+
const absoluteDatasetPath = resolve5(process.cwd(), dataset.filePath);
|
|
987
1227
|
const parsed = parse2(absoluteDatasetPath);
|
|
988
|
-
const outputPath =
|
|
1228
|
+
const outputPath = join4(parsed.dir, `${parsed.name}.cases.json`);
|
|
989
1229
|
await writeFile2(
|
|
990
1230
|
outputPath,
|
|
991
1231
|
`${JSON.stringify(payload, null, 2)}
|
|
@@ -1060,7 +1300,7 @@ async function generateDatasetJsonCommandPlain(runner, datasetName) {
|
|
|
1060
1300
|
console.log(`Wrote ${outputPath}`);
|
|
1061
1301
|
}
|
|
1062
1302
|
async function generateDatasetJsonCommandInk(runner, datasetName) {
|
|
1063
|
-
return new Promise((
|
|
1303
|
+
return new Promise((resolve5, reject) => {
|
|
1064
1304
|
const app = ink.render(
|
|
1065
1305
|
React2__default.default.createElement(GenerateView, {
|
|
1066
1306
|
runner,
|
|
@@ -1070,7 +1310,7 @@ async function generateDatasetJsonCommandInk(runner, datasetName) {
|
|
|
1070
1310
|
if (err) {
|
|
1071
1311
|
reject(err);
|
|
1072
1312
|
} else {
|
|
1073
|
-
|
|
1313
|
+
resolve5();
|
|
1074
1314
|
}
|
|
1075
1315
|
}
|
|
1076
1316
|
})
|
|
@@ -1137,13 +1377,62 @@ function createBar(value, max = 100, width = 20) {
|
|
|
1137
1377
|
const filled = Math.round(safe / max * width);
|
|
1138
1378
|
return "\u2588".repeat(filled) + "\u2591".repeat(width - filled);
|
|
1139
1379
|
}
|
|
1140
|
-
function
|
|
1380
|
+
/**
 * Merges the evaluator results of several rerun events of one test case into a
 * single entry per evaluator.
 *
 * For each evaluator id (in order of first appearance) scores and metrics are
 * grouped by id and collapsed with `aggregateScoreItems` /
 * `aggregateMetricItems`. The evaluator passes only when it is present and
 * passed in every event. Logs are taken from the last event.
 *
 * @param {Array<{ evaluatorScores: object[] }>} events - Rerun events of one test case.
 * @param {Map<string, string>} nameById - Display names keyed by evaluator id.
 * @returns {object[]} One `{ evaluatorId, evaluatorName, scores, passed, metrics, logs }` per evaluator.
 */
function aggregateEvaluatorScores(events, nameById) {
  if (events.length === 0) {
    return [];
  }
  const evaluatorIds = new Set(
    events.flatMap((e) => e.evaluatorScores.map((x) => x.evaluatorId))
  );
  // Appends `item` to the list stored under its id, creating the list on demand.
  const addToGroup = (groups, item) => {
    const list = groups.get(item.id);
    if (list) {
      list.push(item);
    } else {
      groups.set(item.id, [item]);
    }
  };
  const result = [];
  for (const evaluatorId of evaluatorIds) {
    // Resolve this evaluator's entry once per event; it is reused below for
    // scores, metrics, the pass flag and the logs.
    const perEvent = events.map(
      (ev) => ev.evaluatorScores.find((x) => x.evaluatorId === evaluatorId)
    );
    const scoreIdToItems = /* @__PURE__ */ new Map();
    const metricIdToItems = /* @__PURE__ */ new Map();
    for (const es of perEvent) {
      for (const s of es?.scores ?? []) {
        addToGroup(scoreIdToItems, s);
      }
      for (const m of es?.metrics ?? []) {
        addToGroup(metricIdToItems, m);
      }
    }
    const aggregatedScores = [];
    for (const items of scoreIdToItems.values()) {
      const agg = aggregateScoreItems(items);
      if (agg) {
        aggregatedScores.push(agg);
      }
    }
    const aggregatedMetrics = [];
    for (const items of metricIdToItems.values()) {
      const agg = aggregateMetricItems(items);
      if (agg !== void 0) {
        aggregatedMetrics.push(agg);
      }
    }
    // An evaluator that is missing from any event counts as not passed.
    const passed = perEvent.every((es) => es?.passed ?? false);
    const lastEs = perEvent[perEvent.length - 1];
    result.push({
      evaluatorId,
      evaluatorName: nameById.get(evaluatorId) ?? evaluatorId,
      scores: aggregatedScores,
      passed,
      metrics: aggregatedMetrics.length > 0 ? aggregatedMetrics : void 0,
      logs: lastEs?.logs
    });
  }
  return result;
}
|
|
1429
|
+
function formatScorePart(item, scoreToColor2, options) {
|
|
1141
1430
|
const def = getScoreById(item.id);
|
|
1142
1431
|
if (!def) {
|
|
1143
1432
|
const numeric = toNumericScore(item.data);
|
|
1144
1433
|
return numeric !== void 0 ? `${numeric.toFixed(2)}` : "n/a";
|
|
1145
1434
|
}
|
|
1146
|
-
const formatted = def.format(item.data);
|
|
1435
|
+
const formatted = def.format(item.data, options);
|
|
1147
1436
|
if (def.displayStrategy === "bar") {
|
|
1148
1437
|
const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
|
|
1149
1438
|
if (typeof numeric === "number" && Number.isFinite(numeric)) {
|
|
@@ -1163,6 +1452,7 @@ function RunView({
|
|
|
1163
1452
|
);
|
|
1164
1453
|
const [runInfo, setRunInfo] = React2.useState(null);
|
|
1165
1454
|
const [testCases, setTestCases] = React2.useState([]);
|
|
1455
|
+
const [completedEvaluations, setCompletedEvaluations] = React2.useState(0);
|
|
1166
1456
|
const [summary, setSummary] = React2.useState(null);
|
|
1167
1457
|
const [evaluatorNameById, setEvaluatorNameById] = React2.useState(/* @__PURE__ */ new Map());
|
|
1168
1458
|
const runEval = React2.useCallback(async () => {
|
|
@@ -1189,20 +1479,17 @@ function RunView({
|
|
|
1189
1479
|
return;
|
|
1190
1480
|
}
|
|
1191
1481
|
const nameById = new Map(
|
|
1192
|
-
evaluators.map((item) => [
|
|
1193
|
-
item.id,
|
|
1194
|
-
item.evaluator.getName() ?? item.id
|
|
1195
|
-
])
|
|
1482
|
+
evaluators.map((item) => [item.id, item.evaluator.getName() ?? item.id])
|
|
1196
1483
|
);
|
|
1197
1484
|
setEvaluatorNameById(nameById);
|
|
1198
1485
|
const aggregates = /* @__PURE__ */ new Map();
|
|
1199
1486
|
let overallScoreTotal = 0;
|
|
1200
1487
|
let overallScoreCount = 0;
|
|
1201
|
-
const done = new Promise((
|
|
1488
|
+
const done = new Promise((resolve5) => {
|
|
1202
1489
|
const unsubscribe = runner.subscribeRunEvents((event) => {
|
|
1203
1490
|
if (event.type === "TestCaseProgress") {
|
|
1204
1491
|
const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
|
|
1205
|
-
|
|
1492
|
+
numericScores.length > 0 ? numericScores.reduce((sum, v) => sum + v, 0) / numericScores.length : void 0;
|
|
1206
1493
|
for (const item of event.evaluatorScores) {
|
|
1207
1494
|
const numeric = toNumericScoreFromScores(item.scores);
|
|
1208
1495
|
if (numeric !== void 0) {
|
|
@@ -1222,15 +1509,10 @@ function RunView({
|
|
|
1222
1509
|
overallScoreCount += 1;
|
|
1223
1510
|
}
|
|
1224
1511
|
}
|
|
1225
|
-
setTestCases((prev) =>
|
|
1226
|
-
|
|
1227
|
-
|
|
1228
|
-
|
|
1229
|
-
completedTestCases: event.completedTestCases,
|
|
1230
|
-
totalTestCases: event.totalTestCases,
|
|
1231
|
-
durationMs: event.durationMs,
|
|
1232
|
-
passed: event.passed,
|
|
1233
|
-
averageScore,
|
|
1512
|
+
setTestCases((prev) => {
|
|
1513
|
+
const byId = new Map(prev.map((tc) => [tc.testCaseId, tc]));
|
|
1514
|
+
const existing = byId.get(event.testCaseId);
|
|
1515
|
+
const newEvent = {
|
|
1234
1516
|
evaluatorScores: event.evaluatorScores.map((item) => ({
|
|
1235
1517
|
evaluatorId: item.evaluatorId,
|
|
1236
1518
|
evaluatorName: nameById.get(item.evaluatorId) ?? item.evaluatorId,
|
|
@@ -1238,13 +1520,37 @@ function RunView({
|
|
|
1238
1520
|
passed: item.passed,
|
|
1239
1521
|
metrics: item.metrics,
|
|
1240
1522
|
logs: item.logs
|
|
1241
|
-
}))
|
|
1242
|
-
|
|
1243
|
-
|
|
1523
|
+
})),
|
|
1524
|
+
passed: event.passed,
|
|
1525
|
+
durationMs: event.durationMs
|
|
1526
|
+
};
|
|
1527
|
+
const events = existing ? [...existing.events, newEvent] : [newEvent];
|
|
1528
|
+
const isAggregated = events.length > 1;
|
|
1529
|
+
const aggregatedEvaluatorScores = aggregateEvaluatorScores(
|
|
1530
|
+
events,
|
|
1531
|
+
nameById
|
|
1532
|
+
);
|
|
1533
|
+
const merged = {
|
|
1534
|
+
name: event.testCaseName,
|
|
1535
|
+
testCaseId: event.testCaseId,
|
|
1536
|
+
completedTestCases: event.completedTestCases,
|
|
1537
|
+
totalTestCases: event.totalTestCases,
|
|
1538
|
+
rerunIndex: event.rerunIndex,
|
|
1539
|
+
rerunTotal: event.rerunTotal,
|
|
1540
|
+
durationMs: events.reduce((s, e) => s + e.durationMs, 0),
|
|
1541
|
+
passed: events.every((e) => e.passed),
|
|
1542
|
+
events,
|
|
1543
|
+
aggregatedEvaluatorScores,
|
|
1544
|
+
isAggregated
|
|
1545
|
+
};
|
|
1546
|
+
byId.set(event.testCaseId, merged);
|
|
1547
|
+
setCompletedEvaluations(event.completedTestCases);
|
|
1548
|
+
return Array.from(byId.values());
|
|
1549
|
+
});
|
|
1244
1550
|
}
|
|
1245
1551
|
if (event.type === "RunCompleted" || event.type === "RunFailed") {
|
|
1246
1552
|
unsubscribe();
|
|
1247
|
-
|
|
1553
|
+
resolve5(event);
|
|
1248
1554
|
}
|
|
1249
1555
|
});
|
|
1250
1556
|
});
|
|
@@ -1255,9 +1561,7 @@ function RunView({
|
|
|
1255
1561
|
setRunInfo({
|
|
1256
1562
|
runId: snapshot.runId,
|
|
1257
1563
|
datasetName: snapshot.datasetName,
|
|
1258
|
-
evaluatorNames: evaluators.map(
|
|
1259
|
-
(e) => e.evaluator.getName() ?? e.id
|
|
1260
|
-
),
|
|
1564
|
+
evaluatorNames: evaluators.map((e) => e.evaluator.getName() ?? e.id),
|
|
1261
1565
|
totalTestCases: snapshot.totalTestCases
|
|
1262
1566
|
});
|
|
1263
1567
|
setPhase("running");
|
|
@@ -1285,29 +1589,41 @@ function RunView({
|
|
|
1285
1589
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginBottom: 1, children: /* @__PURE__ */ jsxRuntime.jsx(Banner, {}) }),
|
|
1286
1590
|
runInfo && /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginBottom: 1, children: [
|
|
1287
1591
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
1288
|
-
/* @__PURE__ */ jsxRuntime.
|
|
1592
|
+
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", bold: true, children: [
|
|
1593
|
+
"Run",
|
|
1594
|
+
" "
|
|
1595
|
+
] }),
|
|
1289
1596
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: runInfo.runId })
|
|
1290
1597
|
] }),
|
|
1291
1598
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
1292
|
-
/* @__PURE__ */ jsxRuntime.
|
|
1599
|
+
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", bold: true, children: [
|
|
1600
|
+
"Dataset",
|
|
1601
|
+
" "
|
|
1602
|
+
] }),
|
|
1293
1603
|
runInfo.datasetName
|
|
1294
1604
|
] }),
|
|
1295
1605
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
1296
|
-
/* @__PURE__ */ jsxRuntime.
|
|
1606
|
+
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", bold: true, children: [
|
|
1607
|
+
"Evaluators",
|
|
1608
|
+
" "
|
|
1609
|
+
] }),
|
|
1297
1610
|
runInfo.evaluatorNames.join(", ")
|
|
1298
1611
|
] }),
|
|
1299
1612
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
1300
|
-
/* @__PURE__ */ jsxRuntime.
|
|
1613
|
+
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", bold: true, children: [
|
|
1614
|
+
"Test cases",
|
|
1615
|
+
" "
|
|
1616
|
+
] }),
|
|
1301
1617
|
runInfo.totalTestCases
|
|
1302
1618
|
] })
|
|
1303
1619
|
] }),
|
|
1304
1620
|
phase === "running" && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginBottom: 1, children: /* @__PURE__ */ jsxRuntime.jsx(
|
|
1305
1621
|
Spinner,
|
|
1306
1622
|
{
|
|
1307
|
-
label: `Evaluations ${
|
|
1623
|
+
label: `Evaluations ${completedEvaluations}/${runInfo?.totalTestCases ?? 0}`
|
|
1308
1624
|
}
|
|
1309
1625
|
) }),
|
|
1310
|
-
testCases.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc
|
|
1626
|
+
testCases.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginBottom: 0, children: [
|
|
1311
1627
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
1312
1628
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", children: [
|
|
1313
1629
|
"[",
|
|
@@ -1318,49 +1634,78 @@ function RunView({
|
|
|
1318
1634
|
] }),
|
|
1319
1635
|
" ",
|
|
1320
1636
|
tc.name,
|
|
1637
|
+
" ",
|
|
1638
|
+
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", children: [
|
|
1639
|
+
"(",
|
|
1640
|
+
tc.rerunIndex,
|
|
1641
|
+
"/",
|
|
1642
|
+
tc.rerunTotal,
|
|
1643
|
+
")"
|
|
1644
|
+
] }),
|
|
1321
1645
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
1322
1646
|
" (",
|
|
1323
1647
|
tc.durationMs,
|
|
1324
1648
|
"ms)"
|
|
1325
1649
|
] })
|
|
1326
1650
|
] }),
|
|
1327
|
-
tc.
|
|
1328
|
-
|
|
1329
|
-
|
|
1330
|
-
|
|
1331
|
-
|
|
1332
|
-
|
|
1333
|
-
|
|
1334
|
-
|
|
1335
|
-
|
|
1336
|
-
|
|
1337
|
-
|
|
1338
|
-
|
|
1339
|
-
|
|
1340
|
-
|
|
1341
|
-
|
|
1342
|
-
|
|
1343
|
-
|
|
1344
|
-
|
|
1345
|
-
|
|
1346
|
-
|
|
1347
|
-
|
|
1348
|
-
|
|
1349
|
-
|
|
1350
|
-
|
|
1351
|
-
|
|
1352
|
-
|
|
1353
|
-
|
|
1354
|
-
|
|
1355
|
-
|
|
1356
|
-
|
|
1357
|
-
|
|
1358
|
-
|
|
1359
|
-
|
|
1360
|
-
|
|
1361
|
-
|
|
1362
|
-
|
|
1363
|
-
|
|
1651
|
+
tc.aggregatedEvaluatorScores.map((item) => /* @__PURE__ */ jsxRuntime.jsxs(
|
|
1652
|
+
ink.Box,
|
|
1653
|
+
{
|
|
1654
|
+
flexDirection: "column",
|
|
1655
|
+
marginLeft: 2,
|
|
1656
|
+
children: [
|
|
1657
|
+
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
1658
|
+
item.evaluatorName,
|
|
1659
|
+
":",
|
|
1660
|
+
" ",
|
|
1661
|
+
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
|
|
1662
|
+
" ",
|
|
1663
|
+
item.scores.map((s) => /* @__PURE__ */ jsxRuntime.jsxs(
|
|
1664
|
+
ink.Text,
|
|
1665
|
+
{
|
|
1666
|
+
color: scoreColor(toNumericScore(s.data) ?? 0),
|
|
1667
|
+
children: [
|
|
1668
|
+
formatScorePart(s, scoreColor, {
|
|
1669
|
+
isAggregated: tc.isAggregated
|
|
1670
|
+
}),
|
|
1671
|
+
" "
|
|
1672
|
+
]
|
|
1673
|
+
},
|
|
1674
|
+
s.id
|
|
1675
|
+
)),
|
|
1676
|
+
item.metrics?.map((m) => {
|
|
1677
|
+
const def = getMetricById(m.id);
|
|
1678
|
+
if (!def)
|
|
1679
|
+
return null;
|
|
1680
|
+
const formatted = def.format(m.data, {
|
|
1681
|
+
isAggregated: tc.isAggregated
|
|
1682
|
+
});
|
|
1683
|
+
return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
1684
|
+
"[",
|
|
1685
|
+
def.name ? `${def.name}: ` : "",
|
|
1686
|
+
formatted,
|
|
1687
|
+
"]",
|
|
1688
|
+
" "
|
|
1689
|
+
] }, m.id);
|
|
1690
|
+
})
|
|
1691
|
+
] }),
|
|
1692
|
+
!item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
|
|
1693
|
+
(log, logIdx) => log.type === "diff" ? /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: getDiffLines(log).map(
|
|
1694
|
+
({ type, line }, lineIdx) => /* @__PURE__ */ jsxRuntime.jsx(
|
|
1695
|
+
ink.Text,
|
|
1696
|
+
{
|
|
1697
|
+
color: type === "remove" ? "red" : type === "add" ? "green" : "gray",
|
|
1698
|
+
children: line
|
|
1699
|
+
},
|
|
1700
|
+
lineIdx
|
|
1701
|
+
)
|
|
1702
|
+
) }, logIdx) : null
|
|
1703
|
+
) })
|
|
1704
|
+
]
|
|
1705
|
+
},
|
|
1706
|
+
item.evaluatorId
|
|
1707
|
+
))
|
|
1708
|
+
] }, tc.testCaseId)) }),
|
|
1364
1709
|
phase === "completed" && summary && /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", children: [
|
|
1365
1710
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children: "Run Summary" }),
|
|
1366
1711
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { marginTop: 1, children: [
|
|
@@ -1407,7 +1752,8 @@ function RunView({
|
|
|
1407
1752
|
name.padEnd(28),
|
|
1408
1753
|
" avg=",
|
|
1409
1754
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: scoreColor(mean), children: mean.toFixed(2) }),
|
|
1410
|
-
"
|
|
1755
|
+
" ",
|
|
1756
|
+
"passed=",
|
|
1411
1757
|
agg.passed,
|
|
1412
1758
|
" failed=",
|
|
1413
1759
|
agg.failed
|
|
@@ -1416,28 +1762,38 @@ function RunView({
|
|
|
1416
1762
|
] }),
|
|
1417
1763
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { marginTop: 1, flexDirection: "column", children: [
|
|
1418
1764
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "magenta", children: "test case scores" }),
|
|
1419
|
-
testCases.map((tc
|
|
1420
|
-
|
|
1421
|
-
|
|
1422
|
-
|
|
1423
|
-
|
|
1424
|
-
]
|
|
1425
|
-
|
|
1426
|
-
|
|
1427
|
-
|
|
1428
|
-
|
|
1765
|
+
testCases.map((tc) => {
|
|
1766
|
+
const numericScores = tc.aggregatedEvaluatorScores.flatMap(
|
|
1767
|
+
(item) => item.scores.map((s) => toNumericScoreFromScores([s])).filter((n) => n !== void 0)
|
|
1768
|
+
);
|
|
1769
|
+
const averageScore = numericScores.length > 0 ? numericScores.reduce((a, b) => a + b, 0) / numericScores.length : void 0;
|
|
1770
|
+
const firstScore = tc.aggregatedEvaluatorScores[0]?.scores[0];
|
|
1771
|
+
const scoreLabel = firstScore && tc.isAggregated ? formatScorePart(firstScore, scoreColor, {
|
|
1772
|
+
isAggregated: true
|
|
1773
|
+
}) : averageScore !== void 0 ? averageScore.toFixed(2) : "n/a";
|
|
1774
|
+
return /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { children: [
|
|
1775
|
+
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: tc.passed ? "green" : "red", children: tc.passed ? "PASS" : "FAIL" }),
|
|
1776
|
+
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
1777
|
+
" ",
|
|
1778
|
+
tc.name.padEnd(24)
|
|
1429
1779
|
] }),
|
|
1780
|
+
averageScore !== void 0 ? /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
|
|
1781
|
+
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: scoreColor(averageScore), children: [
|
|
1782
|
+
"score=",
|
|
1783
|
+
scoreLabel
|
|
1784
|
+
] }),
|
|
1785
|
+
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
1786
|
+
" ",
|
|
1787
|
+
createBar(averageScore, 100, 14)
|
|
1788
|
+
] })
|
|
1789
|
+
] }) : /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "score=n/a" }),
|
|
1430
1790
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
1431
|
-
" ",
|
|
1432
|
-
|
|
1791
|
+
" (",
|
|
1792
|
+
tc.durationMs,
|
|
1793
|
+
"ms)"
|
|
1433
1794
|
] })
|
|
1434
|
-
] }
|
|
1435
|
-
|
|
1436
|
-
" (",
|
|
1437
|
-
tc.durationMs,
|
|
1438
|
-
"ms)"
|
|
1439
|
-
] })
|
|
1440
|
-
] }, i))
|
|
1795
|
+
] }, tc.testCaseId);
|
|
1796
|
+
})
|
|
1441
1797
|
] }),
|
|
1442
1798
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginTop: 1, children: /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
1443
1799
|
"artifact: ",
|
|
@@ -1448,6 +1804,51 @@ function RunView({
|
|
|
1448
1804
|
}
|
|
1449
1805
|
|
|
1450
1806
|
// src/cli-simple/run.ts
|
|
1807
|
+
/**
 * Builds one summary row per test case from the events recorded for it.
 *
 * For every entry of `byId` the events (one per run/rerun) are folded into:
 *  - `passed`: true only when every event passed,
 *  - `durationMs`: the sum of all event durations,
 *  - `isAggregated`: true when more than one event was recorded,
 *  - `averageScore`: the mean of the numeric values obtained from each
 *    aggregated score, or `undefined` when no numeric score exists,
 *  - `aggregatedScoreItem`: the first aggregated score that produced a
 *    numeric value (used by the caller as the display label).
 *
 * Evaluators are taken from the union of all events, in first-seen order, so
 * an evaluator that only reports in a later rerun still contributes. This
 * matches how `aggregateEvaluatorScoresFromEvents` selects evaluators.
 *
 * @param {Map<string, {name: string, events: Array<object>}>} byId
 *   Test-case records keyed by test-case id. Each event is read for
 *   `passed`, `durationMs` and `evaluatorScores[].{evaluatorId, scores}`.
 * @returns {Array<object>} Summaries in the iteration order of `byId`.
 */
function buildTestCaseSummaries(byId) {
  const summaries = [];
  for (const { name, events } of byId.values()) {
    const passed = events.every((e) => e.passed);
    const durationMs = events.reduce((sum, e) => sum + e.durationMs, 0);
    const isAggregated = events.length > 1;

    // A Set keeps first-seen order and removes duplicates, so the first
    // event's evaluators still come first and none is counted twice.
    const evaluatorIds = new Set(
      events.flatMap((ev) => ev.evaluatorScores.map((x) => x.evaluatorId))
    );

    const numericScores = [];
    let firstAggregatedScore;
    for (const evaluatorId of evaluatorIds) {
      // Group this evaluator's score items by score id across all events.
      const scoreIdToItems = /* @__PURE__ */ new Map();
      for (const ev of events) {
        const es = ev.evaluatorScores.find((x) => x.evaluatorId === evaluatorId);
        for (const s of es?.scores ?? []) {
          const list = scoreIdToItems.get(s.id) ?? [];
          list.push(s);
          scoreIdToItems.set(s.id, list);
        }
      }
      for (const items of scoreIdToItems.values()) {
        const agg = aggregateScoreItems(items);
        if (!agg) {
          continue;
        }
        const n = toNumericScoreFromScores([agg]);
        if (n === void 0) {
          continue;
        }
        numericScores.push(n);
        if (firstAggregatedScore === void 0) {
          firstAggregatedScore = agg;
        }
      }
    }

    const averageScore = numericScores.length > 0
      ? numericScores.reduce((a, b) => a + b, 0) / numericScores.length
      : void 0;
    summaries.push({
      name,
      averageScore,
      aggregatedScoreItem: firstAggregatedScore,
      isAggregated,
      durationMs,
      passed
    });
  }
  return summaries;
}
|
|
1451
1852
|
var ansi2 = {
|
|
1452
1853
|
reset: "\x1B[0m",
|
|
1453
1854
|
bold: "\x1B[1m",
|
|
@@ -1482,7 +1883,50 @@ function createBar2(value, max = 100, width = 20) {
|
|
|
1482
1883
|
const filled = Math.round(safe / max * width);
|
|
1483
1884
|
return `${"\u2588".repeat(filled)}${"\u2591".repeat(width - filled)}`;
|
|
1484
1885
|
}
|
|
1485
|
-
function
|
|
1886
|
+
/**
 * Collapses the evaluator results of several events (runs/reruns of one test
 * case) into a single entry per evaluator.
 *
 * For each evaluator id found in any event, in first-seen order:
 *  - score items are grouped by score id and passed to `aggregateScoreItems`;
 *    falsy results are dropped,
 *  - metric items are grouped by metric id and passed to
 *    `aggregateMetricItems`; `undefined` results are dropped,
 *  - `passed` is true only when the evaluator is present and passed in every
 *    event (an event without the evaluator counts as a failure).
 *
 * @param {Array<object>} events Events read for
 *   `evaluatorScores[].{evaluatorId, passed, scores, metrics}`.
 * @param {Map<string, string>} [evaluatorNameById] Not read by this function;
 *   kept so the signature stays unchanged.
 * @returns {Array<{evaluatorId: string, scores: Array<object>, passed: boolean, metrics: (Array<object>|undefined)}>}
 *   One entry per evaluator; `metrics` is `undefined` when no aggregated
 *   metric exists. Returns `[]` for an empty `events` array.
 */
function aggregateEvaluatorScoresFromEvents(events, evaluatorNameById) {
  if (events.length === 0) {
    return [];
  }
  const evaluatorIds = new Set(
    events.flatMap((e) => e.evaluatorScores.map((x) => x.evaluatorId))
  );
  const result = [];
  for (const evaluatorId of evaluatorIds) {
    const scoreIdToItems = /* @__PURE__ */ new Map();
    const metricIdToItems = /* @__PURE__ */ new Map();
    let passed = true;
    // One pass per event: locate the evaluator's entry once and use it for
    // the pass flag, the scores and the metrics.
    for (const ev of events) {
      const es = ev.evaluatorScores.find((x) => x.evaluatorId === evaluatorId);
      if (!(es?.passed ?? false)) {
        passed = false;
      }
      for (const s of es?.scores ?? []) {
        const list = scoreIdToItems.get(s.id) ?? [];
        list.push(s);
        scoreIdToItems.set(s.id, list);
      }
      for (const m of es?.metrics ?? []) {
        const list = metricIdToItems.get(m.id) ?? [];
        list.push(m);
        metricIdToItems.set(m.id, list);
      }
    }
    const aggregatedScores = [];
    for (const items of scoreIdToItems.values()) {
      const agg = aggregateScoreItems(items);
      if (agg) {
        aggregatedScores.push(agg);
      }
    }
    const aggregatedMetrics = [];
    for (const items of metricIdToItems.values()) {
      const agg = aggregateMetricItems(items);
      if (agg !== void 0) {
        aggregatedMetrics.push(agg);
      }
    }
    result.push({
      evaluatorId,
      scores: aggregatedScores,
      passed,
      metrics: aggregatedMetrics.length > 0 ? aggregatedMetrics : void 0
    });
  }
  return result;
}
|
|
1929
|
+
function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
|
|
1486
1930
|
const passLabel = passed ? colorize("PASS", `${ansi2.bold}${ansi2.green}`) : colorize("FAIL", `${ansi2.bold}${ansi2.red}`);
|
|
1487
1931
|
const scoreParts = [];
|
|
1488
1932
|
for (const item of scores) {
|
|
@@ -1494,7 +1938,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics) {
|
|
|
1494
1938
|
);
|
|
1495
1939
|
continue;
|
|
1496
1940
|
}
|
|
1497
|
-
const formatted = def.format(item.data);
|
|
1941
|
+
const formatted = def.format(item.data, options);
|
|
1498
1942
|
switch (def.displayStrategy) {
|
|
1499
1943
|
case "bar": {
|
|
1500
1944
|
const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
|
|
@@ -1527,7 +1971,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics) {
|
|
|
1527
1971
|
for (const { id, data } of metrics) {
|
|
1528
1972
|
const def = getMetricById(id);
|
|
1529
1973
|
if (def) {
|
|
1530
|
-
const formatted = def.format(data);
|
|
1974
|
+
const formatted = def.format(data, options);
|
|
1531
1975
|
metricParts.push(
|
|
1532
1976
|
def.name ? `[${def.name}: ${formatted}]` : `[${formatted}]`
|
|
1533
1977
|
);
|
|
@@ -1560,7 +2004,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
1560
2004
|
evaluators.map((item) => [item.id, item.evaluator.getName() ?? item.id])
|
|
1561
2005
|
);
|
|
1562
2006
|
const aggregates = /* @__PURE__ */ new Map();
|
|
1563
|
-
const
|
|
2007
|
+
const testCaseByTestId = /* @__PURE__ */ new Map();
|
|
1564
2008
|
let overallScoreTotal = 0;
|
|
1565
2009
|
let overallScoreCount = 0;
|
|
1566
2010
|
let completedCount = 0;
|
|
@@ -1574,6 +2018,11 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
1574
2018
|
}
|
|
1575
2019
|
process.stdout.write("\r\x1B[2K");
|
|
1576
2020
|
}
|
|
2021
|
+
/**
 * Writes the ANSI "cursor up" sequence (`ESC [ n A`) to stdout so the next
 * output overwrites the previous `n` lines.
 *
 * Nothing is written when stdout is not a TTY or when `n <= 0`.
 *
 * @param {number} n Number of rows to move the cursor up.
 */
function cursorUp(n) {
  const { stdout } = process;
  // Kept as the negation of `n <= 0` (rather than `n > 0`) so the check
  // behaves the same for every input, including NaN.
  if (stdout.isTTY && !(n <= 0)) {
    stdout.write(`\x1B[${n}A`);
  }
}
|
|
1577
2026
|
function drawSpinner() {
|
|
1578
2027
|
if (!process.stdout.isTTY || runFinished) {
|
|
1579
2028
|
return;
|
|
@@ -1587,38 +2036,28 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
1587
2036
|
)} ${colorize("(live)", ansi2.dim)}`
|
|
1588
2037
|
);
|
|
1589
2038
|
}
|
|
2039
|
+
let lastPrintedTestCaseId = null;
|
|
2040
|
+
let lastPrintedLineCount = 0;
|
|
1590
2041
|
let spinnerTimer;
|
|
1591
|
-
const done = new Promise((
|
|
2042
|
+
const done = new Promise((resolve5) => {
|
|
1592
2043
|
const unsubscribe = runner.subscribeRunEvents((event) => {
|
|
1593
2044
|
if (event.type === "TestCaseProgress") {
|
|
1594
2045
|
completedCount = event.completedTestCases;
|
|
1595
2046
|
const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
|
|
1596
2047
|
const averageScore = numericScores.length > 0 ? numericScores.reduce((sum, value) => sum + value, 0) / numericScores.length : void 0;
|
|
1597
|
-
|
|
1598
|
-
|
|
1599
|
-
|
|
1600
|
-
|
|
2048
|
+
const testCaseId = event.testCaseId;
|
|
2049
|
+
const existing = testCaseByTestId.get(testCaseId) ?? {
|
|
2050
|
+
name: event.testCaseName,
|
|
2051
|
+
events: []
|
|
2052
|
+
};
|
|
2053
|
+
existing.events.push({
|
|
2054
|
+
averageScore,
|
|
2055
|
+
passed: event.passed,
|
|
2056
|
+
durationMs: event.durationMs,
|
|
2057
|
+
evaluatorScores: event.evaluatorScores
|
|
2058
|
+
});
|
|
2059
|
+
testCaseByTestId.set(testCaseId, existing);
|
|
1601
2060
|
for (const item of event.evaluatorScores) {
|
|
1602
|
-
const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
|
|
1603
|
-
console.log(
|
|
1604
|
-
formatEvaluatorScoreLine(
|
|
1605
|
-
name,
|
|
1606
|
-
item.scores,
|
|
1607
|
-
item.passed,
|
|
1608
|
-
item.metrics
|
|
1609
|
-
)
|
|
1610
|
-
);
|
|
1611
|
-
if (!item.passed && item.logs && item.logs.length > 0) {
|
|
1612
|
-
for (const log of item.logs) {
|
|
1613
|
-
if (log.type === "diff") {
|
|
1614
|
-
const useColor = process.stdout.isTTY;
|
|
1615
|
-
for (const { type, line } of getDiffLines(log)) {
|
|
1616
|
-
const colored = useColor && type === "remove" ? colorize(` ${line}`, ansi2.red) : useColor && type === "add" ? colorize(` ${line}`, ansi2.green) : ` ${line}`;
|
|
1617
|
-
console.log(colored);
|
|
1618
|
-
}
|
|
1619
|
-
}
|
|
1620
|
-
}
|
|
1621
|
-
}
|
|
1622
2061
|
const numeric = toNumericScoreFromScores(item.scores);
|
|
1623
2062
|
if (numeric !== void 0) {
|
|
1624
2063
|
const current = aggregates.get(item.evaluatorId) ?? {
|
|
@@ -1637,19 +2076,67 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
1637
2076
|
overallScoreCount += 1;
|
|
1638
2077
|
}
|
|
1639
2078
|
}
|
|
1640
|
-
|
|
1641
|
-
|
|
1642
|
-
|
|
1643
|
-
|
|
1644
|
-
|
|
1645
|
-
|
|
2079
|
+
const isSameTestCase = lastPrintedTestCaseId === testCaseId;
|
|
2080
|
+
const isLastRerun = event.rerunIndex >= event.rerunTotal;
|
|
2081
|
+
const isNonTty = !process.stdout.isTTY;
|
|
2082
|
+
const skipPrintNonTty = isNonTty && event.rerunTotal > 1 && !isLastRerun;
|
|
2083
|
+
if (isSameTestCase && lastPrintedLineCount > 0 && !skipPrintNonTty) {
|
|
2084
|
+
cursorUp(lastPrintedLineCount);
|
|
2085
|
+
}
|
|
2086
|
+
const aggregatedScores = aggregateEvaluatorScoresFromEvents(
|
|
2087
|
+
existing.events);
|
|
2088
|
+
const isAggregated = existing.events.length > 1;
|
|
2089
|
+
const durationMs = existing.events.reduce(
|
|
2090
|
+
(s, e) => s + e.durationMs,
|
|
2091
|
+
0
|
|
2092
|
+
);
|
|
2093
|
+
existing.events.every((e) => e.passed);
|
|
2094
|
+
const lines = [];
|
|
2095
|
+
lines.push(
|
|
2096
|
+
`${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.rerunIndex}/${event.rerunTotal})`, ansi2.cyan)} ${colorize(`(${durationMs}ms)`, ansi2.dim)}`
|
|
2097
|
+
);
|
|
2098
|
+
for (const item of aggregatedScores) {
|
|
2099
|
+
const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
|
|
2100
|
+
lines.push(
|
|
2101
|
+
formatEvaluatorScoreLine(
|
|
2102
|
+
name,
|
|
2103
|
+
item.scores,
|
|
2104
|
+
item.passed,
|
|
2105
|
+
item.metrics,
|
|
2106
|
+
{ isAggregated }
|
|
2107
|
+
)
|
|
2108
|
+
);
|
|
2109
|
+
const lastEvent = existing.events[existing.events.length - 1];
|
|
2110
|
+
const lastEs = lastEvent?.evaluatorScores.find(
|
|
2111
|
+
(x) => x.evaluatorId === item.evaluatorId
|
|
2112
|
+
);
|
|
2113
|
+
if (!item.passed && lastEs?.logs && lastEs.logs.length > 0) {
|
|
2114
|
+
for (const log of lastEs.logs) {
|
|
2115
|
+
if (log.type === "diff") {
|
|
2116
|
+
const useColor = process.stdout.isTTY;
|
|
2117
|
+
for (const { type, line } of getDiffLines(log)) {
|
|
2118
|
+
const colored = useColor && type === "remove" ? colorize(` ${line}`, ansi2.red) : useColor && type === "add" ? colorize(` ${line}`, ansi2.green) : ` ${line}`;
|
|
2119
|
+
lines.push(colored);
|
|
2120
|
+
}
|
|
2121
|
+
}
|
|
2122
|
+
}
|
|
2123
|
+
}
|
|
2124
|
+
}
|
|
2125
|
+
if (!skipPrintNonTty) {
|
|
2126
|
+
for (let i = 0; i < lines.length; i++) {
|
|
2127
|
+
process.stdout.write(`\r\x1B[2K${lines[i]}
|
|
2128
|
+
`);
|
|
2129
|
+
}
|
|
2130
|
+
lastPrintedTestCaseId = testCaseId;
|
|
2131
|
+
lastPrintedLineCount = lines.length;
|
|
2132
|
+
}
|
|
1646
2133
|
drawSpinner();
|
|
1647
2134
|
}
|
|
1648
2135
|
if (event.type === "RunCompleted" || event.type === "RunFailed") {
|
|
1649
2136
|
runFinished = true;
|
|
1650
2137
|
clearLine();
|
|
1651
2138
|
unsubscribe();
|
|
1652
|
-
|
|
2139
|
+
resolve5(event);
|
|
1653
2140
|
}
|
|
1654
2141
|
});
|
|
1655
2142
|
});
|
|
@@ -1706,6 +2193,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
1706
2193
|
getEvaluatorSummaryLine(evaluatorName, aggregates.get(evaluatorId))
|
|
1707
2194
|
);
|
|
1708
2195
|
}
|
|
2196
|
+
const testCaseSummaries = buildTestCaseSummaries(testCaseByTestId);
|
|
1709
2197
|
if (testCaseSummaries.length > 0) {
|
|
1710
2198
|
console.log(colorize("- test case scores:", ansi2.magenta));
|
|
1711
2199
|
for (const summary of testCaseSummaries) {
|
|
@@ -1716,9 +2204,13 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
1716
2204
|
);
|
|
1717
2205
|
continue;
|
|
1718
2206
|
}
|
|
2207
|
+
const scoreLabel = summary.isAggregated && summary.aggregatedScoreItem ? getScoreById(summary.aggregatedScoreItem.id)?.format(
|
|
2208
|
+
summary.aggregatedScoreItem.data,
|
|
2209
|
+
{ isAggregated: true }
|
|
2210
|
+
) ?? summary.averageScore.toFixed(2) : summary.averageScore.toFixed(2);
|
|
1719
2211
|
console.log(
|
|
1720
2212
|
` ${status} ${summary.name.padEnd(24)} score=${colorize(
|
|
1721
|
-
|
|
2213
|
+
scoreLabel,
|
|
1722
2214
|
scoreToColor(summary.averageScore)
|
|
1723
2215
|
)} ${colorize(createBar2(summary.averageScore, 100, 14), ansi2.dim)} ${colorize(`(${summary.durationMs}ms)`, ansi2.dim)}`
|
|
1724
2216
|
);
|
|
@@ -1727,7 +2219,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
1727
2219
|
console.log(`- artifact: ${colorize(finalEvent.artifactPath, ansi2.dim)}`);
|
|
1728
2220
|
}
|
|
1729
2221
|
async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern) {
|
|
1730
|
-
return new Promise((
|
|
2222
|
+
return new Promise((resolve5, reject) => {
|
|
1731
2223
|
const app = ink.render(
|
|
1732
2224
|
React2__default.default.createElement(RunView, {
|
|
1733
2225
|
runner,
|
|
@@ -1738,7 +2230,7 @@ async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern) {
|
|
|
1738
2230
|
if (err) {
|
|
1739
2231
|
reject(err);
|
|
1740
2232
|
} else {
|
|
1741
|
-
|
|
2233
|
+
resolve5();
|
|
1742
2234
|
}
|
|
1743
2235
|
}
|
|
1744
2236
|
})
|