@m4trix/evals 0.12.0 → 0.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +706 -231
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +707 -232
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +710 -390
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +702 -382
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +289 -108
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +28 -5
- package/dist/index.js +290 -109
- package/dist/index.js.map +1 -1
- package/package.json +3 -2
package/dist/cli-simple.cjs
CHANGED
|
@@ -8,7 +8,7 @@ var path = require('path');
|
|
|
8
8
|
var jitiModule = require('jiti');
|
|
9
9
|
var promises = require('fs/promises');
|
|
10
10
|
var url = require('url');
|
|
11
|
-
var
|
|
11
|
+
var diff = require('diff');
|
|
12
12
|
var React2 = require('react');
|
|
13
13
|
var ink = require('ink');
|
|
14
14
|
var jsxRuntime = require('react/jsx-runtime');
|
|
@@ -56,7 +56,8 @@ var defaultRunnerConfig = {
|
|
|
56
56
|
],
|
|
57
57
|
excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
|
|
58
58
|
},
|
|
59
|
-
artifactDirectory: ".eval-results"
|
|
59
|
+
artifactDirectory: ".eval-results",
|
|
60
|
+
maxConcurrency: 1
|
|
60
61
|
};
|
|
61
62
|
function toRunnerConfigOverrides(config) {
|
|
62
63
|
if (!config) {
|
|
@@ -89,6 +90,9 @@ function toRunnerConfigOverrides(config) {
|
|
|
89
90
|
if (config.artifactDirectory !== void 0) {
|
|
90
91
|
overrides.artifactDirectory = config.artifactDirectory;
|
|
91
92
|
}
|
|
93
|
+
if (config.maxConcurrency !== void 0) {
|
|
94
|
+
overrides.maxConcurrency = config.maxConcurrency;
|
|
95
|
+
}
|
|
92
96
|
if (Object.keys(discovery).length > 0) {
|
|
93
97
|
overrides.discovery = discovery;
|
|
94
98
|
}
|
|
@@ -282,8 +286,35 @@ async function collectTestCasesFromFiles(config) {
|
|
|
282
286
|
);
|
|
283
287
|
return found.flat();
|
|
284
288
|
}
|
|
289
|
+
function toJsonLines(value) {
|
|
290
|
+
try {
|
|
291
|
+
return JSON.stringify(value, null, 2);
|
|
292
|
+
} catch {
|
|
293
|
+
return String(value);
|
|
294
|
+
}
|
|
295
|
+
}
|
|
296
|
+
function formatDiffString(changes) {
|
|
297
|
+
const lines = [];
|
|
298
|
+
for (const part of changes) {
|
|
299
|
+
const prefix = part.added ? "+" : part.removed ? "-" : " ";
|
|
300
|
+
const partLines = part.value.split("\n");
|
|
301
|
+
if (partLines[partLines.length - 1] === "") {
|
|
302
|
+
partLines.pop();
|
|
303
|
+
}
|
|
304
|
+
for (const line of partLines) {
|
|
305
|
+
lines.push(`${prefix} ${line}`);
|
|
306
|
+
}
|
|
307
|
+
}
|
|
308
|
+
return lines.join("\n");
|
|
309
|
+
}
|
|
310
|
+
function createDiffString(expected, actual) {
|
|
311
|
+
const expectedStr = toJsonLines(expected);
|
|
312
|
+
const actualStr = toJsonLines(actual);
|
|
313
|
+
const changes = diff.diffLines(expectedStr, actualStr);
|
|
314
|
+
return formatDiffString(changes);
|
|
315
|
+
}
|
|
285
316
|
function createDiffLogEntry(expected, actual, options) {
|
|
286
|
-
const diff =
|
|
317
|
+
const diff = createDiffString(expected, actual);
|
|
287
318
|
return {
|
|
288
319
|
type: "diff",
|
|
289
320
|
label: options?.label,
|
|
@@ -293,7 +324,7 @@ function createDiffLogEntry(expected, actual, options) {
|
|
|
293
324
|
};
|
|
294
325
|
}
|
|
295
326
|
function getDiffLines(entry) {
|
|
296
|
-
const raw =
|
|
327
|
+
const raw = createDiffString(entry.expected, entry.actual) || "(no differences)";
|
|
297
328
|
return raw.split("\n").map((line) => {
|
|
298
329
|
const trimmed = line.trimStart();
|
|
299
330
|
if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
|
|
@@ -313,6 +344,7 @@ var Metric = {
|
|
|
313
344
|
const def = {
|
|
314
345
|
id: config.id,
|
|
315
346
|
name: config.name,
|
|
347
|
+
aggregate: config.aggregate,
|
|
316
348
|
format: config.format,
|
|
317
349
|
make: (data) => ({ id: config.id, data })
|
|
318
350
|
};
|
|
@@ -332,6 +364,7 @@ var Score = {
|
|
|
332
364
|
id: config.id,
|
|
333
365
|
name: config.name,
|
|
334
366
|
displayStrategy: config.displayStrategy,
|
|
367
|
+
aggregate: config.aggregate,
|
|
335
368
|
format: config.format,
|
|
336
369
|
make: (data, options) => {
|
|
337
370
|
const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
|
|
@@ -350,23 +383,75 @@ function getScoreById(id) {
|
|
|
350
383
|
return registry2.get(id);
|
|
351
384
|
}
|
|
352
385
|
|
|
386
|
+
// src/evals/aggregators.ts
|
|
387
|
+
function aggregateAverageWithVariance(values) {
|
|
388
|
+
if (values.length === 0) {
|
|
389
|
+
return { value: 0, count: 0 };
|
|
390
|
+
}
|
|
391
|
+
const sum = values.reduce((s, v) => s + v.value, 0);
|
|
392
|
+
const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
|
|
393
|
+
const mean = sum / values.length;
|
|
394
|
+
let stdDev;
|
|
395
|
+
if (values.length >= 2) {
|
|
396
|
+
const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
|
|
397
|
+
stdDev = variance > 0 ? Math.sqrt(variance) : 0;
|
|
398
|
+
}
|
|
399
|
+
return { value: mean, stdDev, count: values.length };
|
|
400
|
+
}
|
|
401
|
+
function aggregateAll(values) {
|
|
402
|
+
const total = values.length;
|
|
403
|
+
const passedCount = values.filter((v) => v.passed).length;
|
|
404
|
+
return {
|
|
405
|
+
passed: total > 0 && values.every((v) => v.passed),
|
|
406
|
+
passedCount,
|
|
407
|
+
totalCount: total
|
|
408
|
+
};
|
|
409
|
+
}
|
|
410
|
+
function aggregateTokenCountSum(values) {
|
|
411
|
+
const initial = {
|
|
412
|
+
input: 0,
|
|
413
|
+
output: 0,
|
|
414
|
+
inputCached: 0,
|
|
415
|
+
outputCached: 0
|
|
416
|
+
};
|
|
417
|
+
return values.reduce(
|
|
418
|
+
(acc, v) => ({
|
|
419
|
+
input: acc.input + (v.input ?? 0),
|
|
420
|
+
output: acc.output + (v.output ?? 0),
|
|
421
|
+
inputCached: acc.inputCached + (v.inputCached ?? 0),
|
|
422
|
+
outputCached: acc.outputCached + (v.outputCached ?? 0)
|
|
423
|
+
}),
|
|
424
|
+
initial
|
|
425
|
+
);
|
|
426
|
+
}
|
|
427
|
+
function aggregateLatencyAverage(values) {
|
|
428
|
+
if (values.length === 0) {
|
|
429
|
+
return { ms: 0 };
|
|
430
|
+
}
|
|
431
|
+
const sum = values.reduce((s, v) => s + v.ms, 0);
|
|
432
|
+
return { ms: sum / values.length };
|
|
433
|
+
}
|
|
434
|
+
|
|
353
435
|
// src/evals/metrics/standard.ts
|
|
354
436
|
Metric.of({
|
|
355
437
|
id: "token-count",
|
|
356
438
|
name: "Tokens",
|
|
357
|
-
|
|
439
|
+
aggregate: aggregateTokenCountSum,
|
|
440
|
+
format: (data, options) => {
|
|
358
441
|
const input = data.input ?? 0;
|
|
359
442
|
const output = data.output ?? 0;
|
|
360
443
|
const inputCached = data.inputCached ?? 0;
|
|
361
444
|
const outputCached = data.outputCached ?? 0;
|
|
362
445
|
const cached = inputCached + outputCached;
|
|
363
|
-
|
|
446
|
+
const base = `in:${input} out:${output} cached:${cached}`;
|
|
447
|
+
return options?.isAggregated ? `Total: ${base}` : base;
|
|
364
448
|
}
|
|
365
449
|
});
|
|
366
450
|
Metric.of({
|
|
367
451
|
id: "latency",
|
|
368
452
|
name: "Latency",
|
|
369
|
-
|
|
453
|
+
aggregate: aggregateLatencyAverage,
|
|
454
|
+
format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
|
|
370
455
|
});
|
|
371
456
|
|
|
372
457
|
// src/evals/scores/standard.ts
|
|
@@ -374,16 +459,50 @@ Score.of({
|
|
|
374
459
|
id: "percent",
|
|
375
460
|
name: "Score",
|
|
376
461
|
displayStrategy: "bar",
|
|
377
|
-
format: (data) =>
|
|
462
|
+
format: (data, options) => {
|
|
463
|
+
if (options?.isAggregated) {
|
|
464
|
+
return data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`;
|
|
465
|
+
}
|
|
466
|
+
return data.value.toFixed(2);
|
|
467
|
+
},
|
|
468
|
+
aggregate: aggregateAverageWithVariance
|
|
378
469
|
});
|
|
379
470
|
Score.of({
|
|
380
471
|
id: "binary",
|
|
381
472
|
name: "Result",
|
|
382
473
|
displayStrategy: "passFail",
|
|
383
|
-
format: (data) =>
|
|
474
|
+
format: (data, options) => {
|
|
475
|
+
if (options?.isAggregated) {
|
|
476
|
+
const base = data.passed ? "All: PASSED" : "Some: FAILED";
|
|
477
|
+
if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
|
|
478
|
+
return `${base} (${data.passedCount}/${data.totalCount})`;
|
|
479
|
+
}
|
|
480
|
+
return base;
|
|
481
|
+
}
|
|
482
|
+
return data.passed ? "PASSED" : "NOT PASSED";
|
|
483
|
+
},
|
|
484
|
+
aggregate: aggregateAll
|
|
384
485
|
});
|
|
385
486
|
|
|
386
487
|
// src/runner/score-utils.ts
|
|
488
|
+
function aggregateScoreItems(items) {
|
|
489
|
+
if (items.length === 0)
|
|
490
|
+
return void 0;
|
|
491
|
+
const def = getScoreById(items[0].id);
|
|
492
|
+
if (!def?.aggregate)
|
|
493
|
+
return items[items.length - 1];
|
|
494
|
+
const aggregated = def.aggregate(items.map((i) => i.data));
|
|
495
|
+
return { ...items[0], data: aggregated };
|
|
496
|
+
}
|
|
497
|
+
function aggregateMetricItems(items) {
|
|
498
|
+
if (items.length === 0)
|
|
499
|
+
return void 0;
|
|
500
|
+
const def = getMetricById(items[0].id);
|
|
501
|
+
if (!def?.aggregate)
|
|
502
|
+
return items[items.length - 1];
|
|
503
|
+
const aggregated = def.aggregate(items.map((i) => i.data));
|
|
504
|
+
return { ...items[0], data: aggregated };
|
|
505
|
+
}
|
|
387
506
|
function toNumericScoreFromScores(scores) {
|
|
388
507
|
for (const item of scores) {
|
|
389
508
|
const def = getScoreById(item.id);
|
|
@@ -462,6 +581,105 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
|
|
|
462
581
|
`${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
|
|
463
582
|
);
|
|
464
583
|
}
|
|
584
|
+
function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, completedRef, passedRef, failedRef) {
|
|
585
|
+
return effect.Effect.gen(function* () {
|
|
586
|
+
const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
|
|
587
|
+
const rerunPassed = [];
|
|
588
|
+
for (let r = 0; r < reruns; r++) {
|
|
589
|
+
const started = Date.now();
|
|
590
|
+
const evaluatorScores = [];
|
|
591
|
+
let testCaseError;
|
|
592
|
+
const output = readOutput(testCaseItem.testCase);
|
|
593
|
+
for (const { id: evaluatorId, evaluator } of task.evaluators) {
|
|
594
|
+
const evaluateFn = evaluator.getEvaluateFn();
|
|
595
|
+
if (!evaluateFn) {
|
|
596
|
+
continue;
|
|
597
|
+
}
|
|
598
|
+
try {
|
|
599
|
+
const logs = [];
|
|
600
|
+
const logDiff = (expected, actual, options) => {
|
|
601
|
+
logs.push(createDiffLogEntry(expected, actual, options));
|
|
602
|
+
};
|
|
603
|
+
const ctx = yield* effect.Effect.promise(
|
|
604
|
+
() => Promise.resolve(evaluator.resolveContext())
|
|
605
|
+
);
|
|
606
|
+
const result = yield* effect.Effect.promise(
|
|
607
|
+
() => Promise.resolve(
|
|
608
|
+
evaluateFn({
|
|
609
|
+
input: testCaseItem.testCase.getInput(),
|
|
610
|
+
ctx,
|
|
611
|
+
output,
|
|
612
|
+
logDiff
|
|
613
|
+
})
|
|
614
|
+
)
|
|
615
|
+
);
|
|
616
|
+
const { scores, metrics } = normalizeResult(result);
|
|
617
|
+
const passed2 = computeEvaluatorPassed(evaluator, result, scores);
|
|
618
|
+
evaluatorScores.push({
|
|
619
|
+
evaluatorId,
|
|
620
|
+
scores,
|
|
621
|
+
passed: passed2,
|
|
622
|
+
metrics,
|
|
623
|
+
logs: logs.length > 0 ? logs : void 0
|
|
624
|
+
});
|
|
625
|
+
} catch (error) {
|
|
626
|
+
testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
|
|
627
|
+
evaluatorScores.push({
|
|
628
|
+
evaluatorId,
|
|
629
|
+
scores: [],
|
|
630
|
+
passed: false
|
|
631
|
+
});
|
|
632
|
+
}
|
|
633
|
+
}
|
|
634
|
+
const rerunPassedThis = evaluatorScores.every((s) => s.passed);
|
|
635
|
+
rerunPassed.push(rerunPassedThis);
|
|
636
|
+
const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [
|
|
637
|
+
n + 1,
|
|
638
|
+
n + 1
|
|
639
|
+
]);
|
|
640
|
+
const progressEvent = {
|
|
641
|
+
type: "TestCaseProgress",
|
|
642
|
+
runId: task.runId,
|
|
643
|
+
testCaseId: testCaseItem.id,
|
|
644
|
+
testCaseName: testCaseItem.testCase.getName(),
|
|
645
|
+
completedTestCases: completedEvaluations,
|
|
646
|
+
totalTestCases: totalEvaluations,
|
|
647
|
+
rerunIndex: r + 1,
|
|
648
|
+
rerunTotal: reruns,
|
|
649
|
+
passed: rerunPassedThis,
|
|
650
|
+
durationMs: Date.now() - started,
|
|
651
|
+
evaluatorScores,
|
|
652
|
+
output,
|
|
653
|
+
errorMessage: testCaseError
|
|
654
|
+
};
|
|
655
|
+
updateSnapshot(task.runId, (snapshot) => ({
|
|
656
|
+
...snapshot,
|
|
657
|
+
completedTestCases: completedEvaluations
|
|
658
|
+
}));
|
|
659
|
+
yield* publishEvent(progressEvent);
|
|
660
|
+
yield* effect.Queue.offer(persistenceQueue, {
|
|
661
|
+
runId: task.runId,
|
|
662
|
+
artifactPath: task.snapshot.artifactPath,
|
|
663
|
+
payload: progressEvent
|
|
664
|
+
});
|
|
665
|
+
}
|
|
666
|
+
const testCasePassed = rerunPassed.every(Boolean);
|
|
667
|
+
if (testCasePassed) {
|
|
668
|
+
yield* effect.Ref.update(passedRef, (n) => n + 1);
|
|
669
|
+
} else {
|
|
670
|
+
yield* effect.Ref.update(failedRef, (n) => n + 1);
|
|
671
|
+
}
|
|
672
|
+
const [passed, failed] = yield* effect.Effect.all([
|
|
673
|
+
effect.Ref.get(passedRef),
|
|
674
|
+
effect.Ref.get(failedRef)
|
|
675
|
+
]);
|
|
676
|
+
updateSnapshot(task.runId, (snapshot) => ({
|
|
677
|
+
...snapshot,
|
|
678
|
+
passedTestCases: passed,
|
|
679
|
+
failedTestCases: failed
|
|
680
|
+
}));
|
|
681
|
+
});
|
|
682
|
+
}
|
|
465
683
|
var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => effect.Effect.gen(function* () {
|
|
466
684
|
const startedAt = Date.now();
|
|
467
685
|
updateSnapshot(task.runId, (snapshot) => ({
|
|
@@ -474,104 +692,51 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
474
692
|
runId: task.runId,
|
|
475
693
|
startedAt
|
|
476
694
|
});
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
)
|
|
507
|
-
);
|
|
508
|
-
const { scores, metrics } = normalizeResult(result);
|
|
509
|
-
const passed = computeEvaluatorPassed(evaluator, result, scores);
|
|
510
|
-
evaluatorScores.push({
|
|
511
|
-
evaluatorId,
|
|
512
|
-
scores,
|
|
513
|
-
passed,
|
|
514
|
-
metrics,
|
|
515
|
-
logs: logs.length > 0 ? logs : void 0
|
|
516
|
-
});
|
|
517
|
-
} catch (error) {
|
|
518
|
-
testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
|
|
519
|
-
evaluatorScores.push({
|
|
520
|
-
evaluatorId,
|
|
521
|
-
scores: [],
|
|
522
|
-
passed: false
|
|
523
|
-
});
|
|
524
|
-
}
|
|
525
|
-
}
|
|
526
|
-
const testCasePassed = evaluatorScores.every((s) => s.passed);
|
|
527
|
-
completedTestCases += 1;
|
|
528
|
-
if (testCasePassed) {
|
|
529
|
-
passedTestCases += 1;
|
|
530
|
-
} else {
|
|
531
|
-
failedTestCases += 1;
|
|
532
|
-
}
|
|
533
|
-
const progressEvent = {
|
|
534
|
-
type: "TestCaseProgress",
|
|
535
|
-
runId: task.runId,
|
|
536
|
-
testCaseId: testCaseItem.id,
|
|
537
|
-
testCaseName: testCaseItem.testCase.getName(),
|
|
538
|
-
completedTestCases,
|
|
539
|
-
totalTestCases: task.testCases.length,
|
|
540
|
-
passed: testCasePassed,
|
|
541
|
-
durationMs: Date.now() - started,
|
|
542
|
-
evaluatorScores,
|
|
543
|
-
output,
|
|
544
|
-
errorMessage: testCaseError
|
|
545
|
-
};
|
|
546
|
-
updateSnapshot(task.runId, (snapshot) => ({
|
|
547
|
-
...snapshot,
|
|
548
|
-
completedTestCases,
|
|
549
|
-
passedTestCases,
|
|
550
|
-
failedTestCases
|
|
551
|
-
}));
|
|
552
|
-
yield* publishEvent(progressEvent);
|
|
553
|
-
yield* effect.Queue.offer(persistenceQueue, {
|
|
554
|
-
runId: task.runId,
|
|
555
|
-
artifactPath: task.snapshot.artifactPath,
|
|
556
|
-
payload: progressEvent
|
|
557
|
-
});
|
|
558
|
-
}
|
|
695
|
+
const totalEvaluations = task.testCases.reduce(
|
|
696
|
+
(sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
|
|
697
|
+
0
|
|
698
|
+
);
|
|
699
|
+
const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
|
|
700
|
+
const completedRef = yield* effect.Ref.make(0);
|
|
701
|
+
const passedRef = yield* effect.Ref.make(0);
|
|
702
|
+
const failedRef = yield* effect.Ref.make(0);
|
|
703
|
+
const processTestCase = (testCaseItem) => processOneTestCase(
|
|
704
|
+
task,
|
|
705
|
+
testCaseItem,
|
|
706
|
+
totalEvaluations,
|
|
707
|
+
publishEvent,
|
|
708
|
+
persistenceQueue,
|
|
709
|
+
updateSnapshot,
|
|
710
|
+
completedRef,
|
|
711
|
+
passedRef,
|
|
712
|
+
failedRef
|
|
713
|
+
);
|
|
714
|
+
yield* effect.Effect.forEach(
|
|
715
|
+
task.testCases,
|
|
716
|
+
processTestCase,
|
|
717
|
+
maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
|
|
718
|
+
);
|
|
719
|
+
const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* effect.Effect.all([
|
|
720
|
+
effect.Ref.get(completedRef),
|
|
721
|
+
effect.Ref.get(passedRef),
|
|
722
|
+
effect.Ref.get(failedRef)
|
|
723
|
+
]);
|
|
559
724
|
const finishedAt = Date.now();
|
|
560
725
|
const completedEvent = {
|
|
561
726
|
type: "RunCompleted",
|
|
562
727
|
runId: task.runId,
|
|
563
728
|
finishedAt,
|
|
564
|
-
passedTestCases,
|
|
565
|
-
failedTestCases,
|
|
729
|
+
passedTestCases: passedUniqueTestCases,
|
|
730
|
+
failedTestCases: failedUniqueTestCases,
|
|
566
731
|
totalTestCases: task.testCases.length,
|
|
567
732
|
artifactPath: task.snapshot.artifactPath
|
|
568
733
|
};
|
|
569
734
|
updateSnapshot(task.runId, (snapshot) => ({
|
|
570
735
|
...snapshot,
|
|
571
736
|
status: "completed",
|
|
572
|
-
completedTestCases,
|
|
573
|
-
passedTestCases,
|
|
574
|
-
failedTestCases,
|
|
737
|
+
completedTestCases: completedEvaluations,
|
|
738
|
+
passedTestCases: passedUniqueTestCases,
|
|
739
|
+
failedTestCases: failedUniqueTestCases,
|
|
575
740
|
finishedAt
|
|
576
741
|
}));
|
|
577
742
|
yield* publishEvent(completedEvent);
|
|
@@ -659,7 +824,7 @@ async function parseArtifactToSnapshot(filePath, _config) {
|
|
|
659
824
|
const artifactPath = filePath;
|
|
660
825
|
const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
|
|
661
826
|
const progress = aggregateTestCaseProgress(lines);
|
|
662
|
-
const completedTestCases = runCompleted
|
|
827
|
+
const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
|
|
663
828
|
const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
|
|
664
829
|
const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
|
|
665
830
|
return {
|
|
@@ -681,23 +846,29 @@ async function parseArtifactToSnapshot(filePath, _config) {
|
|
|
681
846
|
}
|
|
682
847
|
function aggregateTestCaseProgress(lines) {
|
|
683
848
|
let completedTestCases = 0;
|
|
684
|
-
|
|
685
|
-
let failedTestCases = 0;
|
|
849
|
+
const testCasePassedBy = /* @__PURE__ */ new Map();
|
|
686
850
|
for (const line of lines) {
|
|
687
851
|
try {
|
|
688
852
|
const event = JSON.parse(line);
|
|
689
853
|
if (event.type === "TestCaseProgress") {
|
|
690
854
|
const ev = event;
|
|
691
855
|
completedTestCases = ev.completedTestCases ?? completedTestCases;
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
failedTestCases += 1;
|
|
696
|
-
}
|
|
856
|
+
const id = ev.testCaseId;
|
|
857
|
+
const current = testCasePassedBy.get(id);
|
|
858
|
+
testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
|
|
697
859
|
}
|
|
698
860
|
} catch {
|
|
699
861
|
}
|
|
700
862
|
}
|
|
863
|
+
let passedTestCases = 0;
|
|
864
|
+
let failedTestCases = 0;
|
|
865
|
+
for (const passed of testCasePassedBy.values()) {
|
|
866
|
+
if (passed) {
|
|
867
|
+
passedTestCases += 1;
|
|
868
|
+
} else {
|
|
869
|
+
failedTestCases += 1;
|
|
870
|
+
}
|
|
871
|
+
}
|
|
701
872
|
return { completedTestCases, passedTestCases, failedTestCases };
|
|
702
873
|
}
|
|
703
874
|
async function appendJsonLine(artifactPath, payload) {
|
|
@@ -892,6 +1063,10 @@ var EffectRunner = class {
|
|
|
892
1063
|
throw new Error("No evaluators selected for run");
|
|
893
1064
|
}
|
|
894
1065
|
const selectedTestCases = await this.collectDatasetTestCases(request.datasetId);
|
|
1066
|
+
const totalEvaluations = selectedTestCases.reduce(
|
|
1067
|
+
(sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
|
|
1068
|
+
0
|
|
1069
|
+
);
|
|
895
1070
|
const runId = `run-${crypto.randomUUID()}`;
|
|
896
1071
|
const artifactPath = createArtifactPath(
|
|
897
1072
|
this.config.artifactDirectory,
|
|
@@ -904,7 +1079,7 @@ var EffectRunner = class {
|
|
|
904
1079
|
datasetName: dataset.dataset.getName(),
|
|
905
1080
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
906
1081
|
queuedAt: Date.now(),
|
|
907
|
-
totalTestCases:
|
|
1082
|
+
totalTestCases: totalEvaluations,
|
|
908
1083
|
completedTestCases: 0,
|
|
909
1084
|
passedTestCases: 0,
|
|
910
1085
|
failedTestCases: 0,
|
|
@@ -918,7 +1093,7 @@ var EffectRunner = class {
|
|
|
918
1093
|
datasetId: request.datasetId,
|
|
919
1094
|
datasetName: dataset.dataset.getName(),
|
|
920
1095
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
921
|
-
totalTestCases:
|
|
1096
|
+
totalTestCases: totalEvaluations,
|
|
922
1097
|
artifactPath
|
|
923
1098
|
};
|
|
924
1099
|
await effect.Effect.runPromise(this.publishEvent(queuedEvent));
|
|
@@ -929,6 +1104,7 @@ var EffectRunner = class {
|
|
|
929
1104
|
payload: queuedEvent
|
|
930
1105
|
})
|
|
931
1106
|
);
|
|
1107
|
+
const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
|
|
932
1108
|
await effect.Effect.runPromise(
|
|
933
1109
|
effect.Queue.offer(this.runQueue, {
|
|
934
1110
|
runId,
|
|
@@ -936,7 +1112,8 @@ var EffectRunner = class {
|
|
|
936
1112
|
dataset: dataset.dataset,
|
|
937
1113
|
evaluators: selectedEvaluators,
|
|
938
1114
|
testCases: selectedTestCases,
|
|
939
|
-
snapshot
|
|
1115
|
+
snapshot,
|
|
1116
|
+
maxConcurrency
|
|
940
1117
|
})
|
|
941
1118
|
);
|
|
942
1119
|
return snapshot;
|
|
@@ -1242,6 +1419,13 @@ function Spinner({ label = "Running" }) {
|
|
|
1242
1419
|
label
|
|
1243
1420
|
] });
|
|
1244
1421
|
}
|
|
1422
|
+
function sampleStdDev(sum, sumSq, n) {
|
|
1423
|
+
if (n < 2)
|
|
1424
|
+
return void 0;
|
|
1425
|
+
const mean = sum / n;
|
|
1426
|
+
const variance = (sumSq - n * mean * mean) / (n - 1);
|
|
1427
|
+
return variance > 0 ? Math.sqrt(variance) : 0;
|
|
1428
|
+
}
|
|
1245
1429
|
function scoreColor(score) {
|
|
1246
1430
|
if (score >= 80)
|
|
1247
1431
|
return "green";
|
|
@@ -1254,13 +1438,62 @@ function createBar(value, max = 100, width = 20) {
|
|
|
1254
1438
|
const filled = Math.round(safe / max * width);
|
|
1255
1439
|
return "\u2588".repeat(filled) + "\u2591".repeat(width - filled);
|
|
1256
1440
|
}
|
|
1257
|
-
function
|
|
1441
|
+
function aggregateEvaluatorScores(events, nameById) {
|
|
1442
|
+
if (events.length === 0)
|
|
1443
|
+
return [];
|
|
1444
|
+
const evaluatorIds = new Set(
|
|
1445
|
+
events.flatMap((e) => e.evaluatorScores.map((x) => x.evaluatorId))
|
|
1446
|
+
);
|
|
1447
|
+
const result = [];
|
|
1448
|
+
for (const evaluatorId of evaluatorIds) {
|
|
1449
|
+
const scoreIdToItems = /* @__PURE__ */ new Map();
|
|
1450
|
+
const metricIdToItems = /* @__PURE__ */ new Map();
|
|
1451
|
+
for (const ev of events) {
|
|
1452
|
+
const es = ev.evaluatorScores.find((x) => x.evaluatorId === evaluatorId);
|
|
1453
|
+
for (const s of es?.scores ?? []) {
|
|
1454
|
+
const list = scoreIdToItems.get(s.id) ?? [];
|
|
1455
|
+
list.push(s);
|
|
1456
|
+
scoreIdToItems.set(s.id, list);
|
|
1457
|
+
}
|
|
1458
|
+
for (const m of es?.metrics ?? []) {
|
|
1459
|
+
const list = metricIdToItems.get(m.id) ?? [];
|
|
1460
|
+
list.push(m);
|
|
1461
|
+
metricIdToItems.set(m.id, list);
|
|
1462
|
+
}
|
|
1463
|
+
}
|
|
1464
|
+
const aggregatedScores = [];
|
|
1465
|
+
for (const items of scoreIdToItems.values()) {
|
|
1466
|
+
const agg = aggregateScoreItems(items);
|
|
1467
|
+
if (agg)
|
|
1468
|
+
aggregatedScores.push(agg);
|
|
1469
|
+
}
|
|
1470
|
+
const aggregatedMetrics = Array.from(metricIdToItems.entries()).map(([id, items]) => aggregateMetricItems(items)).filter((m) => m !== void 0);
|
|
1471
|
+
const passed = events.every((ev) => {
|
|
1472
|
+
const es = ev.evaluatorScores.find((x) => x.evaluatorId === evaluatorId);
|
|
1473
|
+
return es?.passed ?? false;
|
|
1474
|
+
});
|
|
1475
|
+
const lastEvent = events[events.length - 1];
|
|
1476
|
+
const lastEs = lastEvent?.evaluatorScores.find(
|
|
1477
|
+
(x) => x.evaluatorId === evaluatorId
|
|
1478
|
+
);
|
|
1479
|
+
result.push({
|
|
1480
|
+
evaluatorId,
|
|
1481
|
+
evaluatorName: nameById.get(evaluatorId) ?? evaluatorId,
|
|
1482
|
+
scores: aggregatedScores,
|
|
1483
|
+
passed,
|
|
1484
|
+
metrics: aggregatedMetrics.length > 0 ? aggregatedMetrics : void 0,
|
|
1485
|
+
logs: lastEs?.logs
|
|
1486
|
+
});
|
|
1487
|
+
}
|
|
1488
|
+
return result;
|
|
1489
|
+
}
|
|
1490
|
+
function formatScorePart(item, scoreToColor2, options) {
|
|
1258
1491
|
const def = getScoreById(item.id);
|
|
1259
1492
|
if (!def) {
|
|
1260
1493
|
const numeric = toNumericScore(item.data);
|
|
1261
1494
|
return numeric !== void 0 ? `${numeric.toFixed(2)}` : "n/a";
|
|
1262
1495
|
}
|
|
1263
|
-
const formatted = def.format(item.data);
|
|
1496
|
+
const formatted = def.format(item.data, options);
|
|
1264
1497
|
if (def.displayStrategy === "bar") {
|
|
1265
1498
|
const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
|
|
1266
1499
|
if (typeof numeric === "number" && Number.isFinite(numeric)) {
|
|
@@ -1280,6 +1513,7 @@ function RunView({
|
|
|
1280
1513
|
);
|
|
1281
1514
|
const [runInfo, setRunInfo] = React2.useState(null);
|
|
1282
1515
|
const [testCases, setTestCases] = React2.useState([]);
|
|
1516
|
+
const [completedEvaluations, setCompletedEvaluations] = React2.useState(0);
|
|
1283
1517
|
const [summary, setSummary] = React2.useState(null);
|
|
1284
1518
|
const [evaluatorNameById, setEvaluatorNameById] = React2.useState(/* @__PURE__ */ new Map());
|
|
1285
1519
|
const runEval = React2.useCallback(async () => {
|
|
@@ -1306,48 +1540,44 @@ function RunView({
|
|
|
1306
1540
|
return;
|
|
1307
1541
|
}
|
|
1308
1542
|
const nameById = new Map(
|
|
1309
|
-
evaluators.map((item) => [
|
|
1310
|
-
item.id,
|
|
1311
|
-
item.evaluator.getName() ?? item.id
|
|
1312
|
-
])
|
|
1543
|
+
evaluators.map((item) => [item.id, item.evaluator.getName() ?? item.id])
|
|
1313
1544
|
);
|
|
1314
1545
|
setEvaluatorNameById(nameById);
|
|
1315
1546
|
const aggregates = /* @__PURE__ */ new Map();
|
|
1316
1547
|
let overallScoreTotal = 0;
|
|
1548
|
+
let overallScoreSumSq = 0;
|
|
1317
1549
|
let overallScoreCount = 0;
|
|
1318
1550
|
const done = new Promise((resolve5) => {
|
|
1319
1551
|
const unsubscribe = runner.subscribeRunEvents((event) => {
|
|
1320
1552
|
if (event.type === "TestCaseProgress") {
|
|
1321
1553
|
const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
|
|
1322
|
-
|
|
1554
|
+
numericScores.length > 0 ? numericScores.reduce((sum, v) => sum + v, 0) / numericScores.length : void 0;
|
|
1323
1555
|
for (const item of event.evaluatorScores) {
|
|
1324
1556
|
const numeric = toNumericScoreFromScores(item.scores);
|
|
1325
1557
|
if (numeric !== void 0) {
|
|
1326
1558
|
const current = aggregates.get(item.evaluatorId) ?? {
|
|
1327
1559
|
total: 0,
|
|
1560
|
+
sumSq: 0,
|
|
1328
1561
|
count: 0,
|
|
1329
1562
|
passed: 0,
|
|
1330
1563
|
failed: 0
|
|
1331
1564
|
};
|
|
1332
1565
|
aggregates.set(item.evaluatorId, {
|
|
1333
1566
|
total: current.total + numeric,
|
|
1567
|
+
sumSq: current.sumSq + numeric * numeric,
|
|
1334
1568
|
count: current.count + 1,
|
|
1335
1569
|
passed: current.passed + (item.passed ? 1 : 0),
|
|
1336
1570
|
failed: current.failed + (item.passed ? 0 : 1)
|
|
1337
1571
|
});
|
|
1338
1572
|
overallScoreTotal += numeric;
|
|
1573
|
+
overallScoreSumSq += numeric * numeric;
|
|
1339
1574
|
overallScoreCount += 1;
|
|
1340
1575
|
}
|
|
1341
1576
|
}
|
|
1342
|
-
setTestCases((prev) =>
|
|
1343
|
-
|
|
1344
|
-
|
|
1345
|
-
|
|
1346
|
-
completedTestCases: event.completedTestCases,
|
|
1347
|
-
totalTestCases: event.totalTestCases,
|
|
1348
|
-
durationMs: event.durationMs,
|
|
1349
|
-
passed: event.passed,
|
|
1350
|
-
averageScore,
|
|
1577
|
+
setTestCases((prev) => {
|
|
1578
|
+
const byId = new Map(prev.map((tc) => [tc.testCaseId, tc]));
|
|
1579
|
+
const existing = byId.get(event.testCaseId);
|
|
1580
|
+
const newEvent = {
|
|
1351
1581
|
evaluatorScores: event.evaluatorScores.map((item) => ({
|
|
1352
1582
|
evaluatorId: item.evaluatorId,
|
|
1353
1583
|
evaluatorName: nameById.get(item.evaluatorId) ?? item.evaluatorId,
|
|
@@ -1355,9 +1585,33 @@ function RunView({
|
|
|
1355
1585
|
passed: item.passed,
|
|
1356
1586
|
metrics: item.metrics,
|
|
1357
1587
|
logs: item.logs
|
|
1358
|
-
}))
|
|
1359
|
-
|
|
1360
|
-
|
|
1588
|
+
})),
|
|
1589
|
+
passed: event.passed,
|
|
1590
|
+
durationMs: event.durationMs
|
|
1591
|
+
};
|
|
1592
|
+
const events = existing ? [...existing.events, newEvent] : [newEvent];
|
|
1593
|
+
const isAggregated = events.length > 1;
|
|
1594
|
+
const aggregatedEvaluatorScores = aggregateEvaluatorScores(
|
|
1595
|
+
events,
|
|
1596
|
+
nameById
|
|
1597
|
+
);
|
|
1598
|
+
const merged = {
|
|
1599
|
+
name: event.testCaseName,
|
|
1600
|
+
testCaseId: event.testCaseId,
|
|
1601
|
+
completedTestCases: event.completedTestCases,
|
|
1602
|
+
totalTestCases: event.totalTestCases,
|
|
1603
|
+
rerunIndex: event.rerunIndex,
|
|
1604
|
+
rerunTotal: event.rerunTotal,
|
|
1605
|
+
durationMs: events.reduce((s, e) => s + e.durationMs, 0),
|
|
1606
|
+
passed: events.every((e) => e.passed),
|
|
1607
|
+
events,
|
|
1608
|
+
aggregatedEvaluatorScores,
|
|
1609
|
+
isAggregated
|
|
1610
|
+
};
|
|
1611
|
+
byId.set(event.testCaseId, merged);
|
|
1612
|
+
setCompletedEvaluations(event.completedTestCases);
|
|
1613
|
+
return Array.from(byId.values());
|
|
1614
|
+
});
|
|
1361
1615
|
}
|
|
1362
1616
|
if (event.type === "RunCompleted" || event.type === "RunFailed") {
|
|
1363
1617
|
unsubscribe();
|
|
@@ -1372,9 +1626,7 @@ function RunView({
|
|
|
1372
1626
|
setRunInfo({
|
|
1373
1627
|
runId: snapshot.runId,
|
|
1374
1628
|
datasetName: snapshot.datasetName,
|
|
1375
|
-
evaluatorNames: evaluators.map(
|
|
1376
|
-
(e) => e.evaluator.getName() ?? e.id
|
|
1377
|
-
),
|
|
1629
|
+
evaluatorNames: evaluators.map((e) => e.evaluator.getName() ?? e.id),
|
|
1378
1630
|
totalTestCases: snapshot.totalTestCases
|
|
1379
1631
|
});
|
|
1380
1632
|
setPhase("running");
|
|
@@ -1388,6 +1640,7 @@ function RunView({
|
|
|
1388
1640
|
failedTestCases: finalEvent.failedTestCases,
|
|
1389
1641
|
totalTestCases: finalEvent.totalTestCases,
|
|
1390
1642
|
overallScoreTotal,
|
|
1643
|
+
overallScoreSumSq,
|
|
1391
1644
|
overallScoreCount,
|
|
1392
1645
|
aggregates: new Map(aggregates),
|
|
1393
1646
|
artifactPath: finalEvent.artifactPath
|
|
@@ -1402,29 +1655,41 @@ function RunView({
|
|
|
1402
1655
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginBottom: 1, children: /* @__PURE__ */ jsxRuntime.jsx(Banner, {}) }),
|
|
1403
1656
|
runInfo && /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginBottom: 1, children: [
|
|
1404
1657
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
1405
|
-
/* @__PURE__ */ jsxRuntime.
|
|
1658
|
+
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", bold: true, children: [
|
|
1659
|
+
"Run",
|
|
1660
|
+
" "
|
|
1661
|
+
] }),
|
|
1406
1662
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: runInfo.runId })
|
|
1407
1663
|
] }),
|
|
1408
1664
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
1409
|
-
/* @__PURE__ */ jsxRuntime.
|
|
1665
|
+
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", bold: true, children: [
|
|
1666
|
+
"Dataset",
|
|
1667
|
+
" "
|
|
1668
|
+
] }),
|
|
1410
1669
|
runInfo.datasetName
|
|
1411
1670
|
] }),
|
|
1412
1671
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
1413
|
-
/* @__PURE__ */ jsxRuntime.
|
|
1672
|
+
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", bold: true, children: [
|
|
1673
|
+
"Evaluators",
|
|
1674
|
+
" "
|
|
1675
|
+
] }),
|
|
1414
1676
|
runInfo.evaluatorNames.join(", ")
|
|
1415
1677
|
] }),
|
|
1416
1678
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
1417
|
-
/* @__PURE__ */ jsxRuntime.
|
|
1679
|
+
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", bold: true, children: [
|
|
1680
|
+
"Test cases",
|
|
1681
|
+
" "
|
|
1682
|
+
] }),
|
|
1418
1683
|
runInfo.totalTestCases
|
|
1419
1684
|
] })
|
|
1420
1685
|
] }),
|
|
1421
1686
|
phase === "running" && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginBottom: 1, children: /* @__PURE__ */ jsxRuntime.jsx(
|
|
1422
1687
|
Spinner,
|
|
1423
1688
|
{
|
|
1424
|
-
label: `Evaluations ${
|
|
1689
|
+
label: `Evaluations ${completedEvaluations}/${runInfo?.totalTestCases ?? 0}`
|
|
1425
1690
|
}
|
|
1426
1691
|
) }),
|
|
1427
|
-
testCases.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc
|
|
1692
|
+
testCases.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginBottom: 0, children: [
|
|
1428
1693
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
1429
1694
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", children: [
|
|
1430
1695
|
"[",
|
|
@@ -1435,49 +1700,78 @@ function RunView({
|
|
|
1435
1700
|
] }),
|
|
1436
1701
|
" ",
|
|
1437
1702
|
tc.name,
|
|
1703
|
+
" ",
|
|
1704
|
+
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", children: [
|
|
1705
|
+
"(",
|
|
1706
|
+
tc.rerunIndex,
|
|
1707
|
+
"/",
|
|
1708
|
+
tc.rerunTotal,
|
|
1709
|
+
")"
|
|
1710
|
+
] }),
|
|
1438
1711
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
1439
1712
|
" (",
|
|
1440
1713
|
tc.durationMs,
|
|
1441
1714
|
"ms)"
|
|
1442
1715
|
] })
|
|
1443
1716
|
] }),
|
|
1444
|
-
tc.
|
|
1445
|
-
|
|
1446
|
-
|
|
1447
|
-
|
|
1448
|
-
|
|
1449
|
-
|
|
1450
|
-
|
|
1451
|
-
|
|
1452
|
-
|
|
1453
|
-
|
|
1454
|
-
|
|
1455
|
-
|
|
1456
|
-
|
|
1457
|
-
|
|
1458
|
-
|
|
1459
|
-
|
|
1460
|
-
|
|
1461
|
-
|
|
1462
|
-
|
|
1463
|
-
|
|
1464
|
-
|
|
1465
|
-
|
|
1466
|
-
|
|
1467
|
-
|
|
1468
|
-
|
|
1469
|
-
|
|
1470
|
-
|
|
1471
|
-
|
|
1472
|
-
|
|
1473
|
-
|
|
1474
|
-
|
|
1475
|
-
|
|
1476
|
-
|
|
1477
|
-
|
|
1478
|
-
|
|
1479
|
-
|
|
1480
|
-
|
|
1717
|
+
tc.aggregatedEvaluatorScores.map((item) => /* @__PURE__ */ jsxRuntime.jsxs(
|
|
1718
|
+
ink.Box,
|
|
1719
|
+
{
|
|
1720
|
+
flexDirection: "column",
|
|
1721
|
+
marginLeft: 2,
|
|
1722
|
+
children: [
|
|
1723
|
+
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
1724
|
+
item.evaluatorName,
|
|
1725
|
+
":",
|
|
1726
|
+
" ",
|
|
1727
|
+
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
|
|
1728
|
+
" ",
|
|
1729
|
+
item.scores.map((s) => /* @__PURE__ */ jsxRuntime.jsxs(
|
|
1730
|
+
ink.Text,
|
|
1731
|
+
{
|
|
1732
|
+
color: scoreColor(toNumericScore(s.data) ?? 0),
|
|
1733
|
+
children: [
|
|
1734
|
+
formatScorePart(s, scoreColor, {
|
|
1735
|
+
isAggregated: tc.isAggregated
|
|
1736
|
+
}),
|
|
1737
|
+
" "
|
|
1738
|
+
]
|
|
1739
|
+
},
|
|
1740
|
+
s.id
|
|
1741
|
+
)),
|
|
1742
|
+
item.metrics?.map((m) => {
|
|
1743
|
+
const def = getMetricById(m.id);
|
|
1744
|
+
if (!def)
|
|
1745
|
+
return null;
|
|
1746
|
+
const formatted = def.format(m.data, {
|
|
1747
|
+
isAggregated: tc.isAggregated
|
|
1748
|
+
});
|
|
1749
|
+
return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
1750
|
+
"[",
|
|
1751
|
+
def.name ? `${def.name}: ` : "",
|
|
1752
|
+
formatted,
|
|
1753
|
+
"]",
|
|
1754
|
+
" "
|
|
1755
|
+
] }, m.id);
|
|
1756
|
+
})
|
|
1757
|
+
] }),
|
|
1758
|
+
!item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
|
|
1759
|
+
(log, logIdx) => log.type === "diff" ? /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: getDiffLines(log).map(
|
|
1760
|
+
({ type, line }, lineIdx) => /* @__PURE__ */ jsxRuntime.jsx(
|
|
1761
|
+
ink.Text,
|
|
1762
|
+
{
|
|
1763
|
+
color: type === "remove" ? "red" : type === "add" ? "green" : "gray",
|
|
1764
|
+
children: line
|
|
1765
|
+
},
|
|
1766
|
+
lineIdx
|
|
1767
|
+
)
|
|
1768
|
+
) }, logIdx) : null
|
|
1769
|
+
) })
|
|
1770
|
+
]
|
|
1771
|
+
},
|
|
1772
|
+
item.evaluatorId
|
|
1773
|
+
))
|
|
1774
|
+
] }, tc.testCaseId)) }),
|
|
1481
1775
|
phase === "completed" && summary && /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", children: [
|
|
1482
1776
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children: "Run Summary" }),
|
|
1483
1777
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { marginTop: 1, children: [
|
|
@@ -1504,7 +1798,14 @@ function RunView({
|
|
|
1504
1798
|
label: "overall avg",
|
|
1505
1799
|
value: summary.overallScoreTotal / summary.overallScoreCount,
|
|
1506
1800
|
barWidth: 20,
|
|
1507
|
-
format: (v) =>
|
|
1801
|
+
format: (v) => {
|
|
1802
|
+
const sd = sampleStdDev(
|
|
1803
|
+
summary.overallScoreTotal,
|
|
1804
|
+
summary.overallScoreSumSq,
|
|
1805
|
+
summary.overallScoreCount
|
|
1806
|
+
);
|
|
1807
|
+
return sd !== void 0 ? `${v.toFixed(2)} \xB1 ${sd.toFixed(2)}` : v.toFixed(2);
|
|
1808
|
+
}
|
|
1508
1809
|
}
|
|
1509
1810
|
) }),
|
|
1510
1811
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { marginTop: 1, flexDirection: "column", children: [
|
|
@@ -1519,12 +1820,15 @@ function RunView({
|
|
|
1519
1820
|
] }, id);
|
|
1520
1821
|
}
|
|
1521
1822
|
const mean = agg.total / agg.count;
|
|
1823
|
+
const sd = sampleStdDev(agg.total, agg.sumSq, agg.count);
|
|
1824
|
+
const meanStr = sd !== void 0 ? `${mean.toFixed(2)} \xB1 ${sd.toFixed(2)}` : mean.toFixed(2);
|
|
1522
1825
|
return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
1523
1826
|
"- ",
|
|
1524
1827
|
name.padEnd(28),
|
|
1525
1828
|
" avg=",
|
|
1526
|
-
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: scoreColor(mean), children:
|
|
1527
|
-
"
|
|
1829
|
+
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: scoreColor(mean), children: meanStr }),
|
|
1830
|
+
" ",
|
|
1831
|
+
"passed=",
|
|
1528
1832
|
agg.passed,
|
|
1529
1833
|
" failed=",
|
|
1530
1834
|
agg.failed
|
|
@@ -1533,28 +1837,41 @@ function RunView({
|
|
|
1533
1837
|
] }),
|
|
1534
1838
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { marginTop: 1, flexDirection: "column", children: [
|
|
1535
1839
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "magenta", children: "test case scores" }),
|
|
1536
|
-
testCases.map((tc
|
|
1537
|
-
|
|
1538
|
-
|
|
1539
|
-
|
|
1540
|
-
|
|
1541
|
-
|
|
1542
|
-
|
|
1543
|
-
|
|
1544
|
-
|
|
1545
|
-
|
|
1840
|
+
testCases.map((tc) => {
|
|
1841
|
+
const allScores = tc.events.flatMap(
|
|
1842
|
+
(ev) => ev.evaluatorScores.map((es) => toNumericScoreFromScores(es.scores)).filter((n) => n !== void 0)
|
|
1843
|
+
);
|
|
1844
|
+
const averageScore = allScores.length > 0 ? allScores.reduce((a, b) => a + b, 0) / allScores.length : void 0;
|
|
1845
|
+
const sumSq = allScores.length > 0 ? allScores.reduce((s, v) => s + v * v, 0) : 0;
|
|
1846
|
+
const total = allScores.reduce((a, b) => a + b, 0);
|
|
1847
|
+
const tcStdDev = sampleStdDev(total, sumSq, allScores.length);
|
|
1848
|
+
const firstScore = tc.aggregatedEvaluatorScores[0]?.scores[0];
|
|
1849
|
+
const scoreLabel = firstScore && tc.isAggregated ? formatScorePart(firstScore, scoreColor, {
|
|
1850
|
+
isAggregated: true
|
|
1851
|
+
}) : averageScore !== void 0 ? tcStdDev !== void 0 && tc.isAggregated ? `${averageScore.toFixed(2)} \xB1 ${tcStdDev.toFixed(2)}` : averageScore.toFixed(2) : "n/a";
|
|
1852
|
+
return /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { children: [
|
|
1853
|
+
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: tc.passed ? "green" : "red", children: tc.passed ? "PASS" : "FAIL" }),
|
|
1854
|
+
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
1855
|
+
" ",
|
|
1856
|
+
tc.name.padEnd(24)
|
|
1546
1857
|
] }),
|
|
1858
|
+
averageScore !== void 0 ? /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
|
|
1859
|
+
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: scoreColor(averageScore), children: [
|
|
1860
|
+
"score=",
|
|
1861
|
+
scoreLabel
|
|
1862
|
+
] }),
|
|
1863
|
+
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
1864
|
+
" ",
|
|
1865
|
+
createBar(averageScore, 100, 14)
|
|
1866
|
+
] })
|
|
1867
|
+
] }) : /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "score=n/a" }),
|
|
1547
1868
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
1548
|
-
" ",
|
|
1549
|
-
|
|
1869
|
+
" (",
|
|
1870
|
+
tc.durationMs,
|
|
1871
|
+
"ms)"
|
|
1550
1872
|
] })
|
|
1551
|
-
] }
|
|
1552
|
-
|
|
1553
|
-
" (",
|
|
1554
|
-
tc.durationMs,
|
|
1555
|
-
"ms)"
|
|
1556
|
-
] })
|
|
1557
|
-
] }, i))
|
|
1873
|
+
] }, tc.testCaseId);
|
|
1874
|
+
})
|
|
1558
1875
|
] }),
|
|
1559
1876
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginTop: 1, children: /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
1560
1877
|
"artifact: ",
|
|
@@ -1565,6 +1882,61 @@ function RunView({
|
|
|
1565
1882
|
}
|
|
1566
1883
|
|
|
1567
1884
|
// src/cli-simple/run.ts
|
|
1885
|
+
/**
 * Sample (n - 1 denominator) standard deviation computed from running totals.
 *
 * @param {number} sum - Sum of the observed values.
 * @param {number} sumSq - Sum of the squared observed values.
 * @param {number} n - Number of observed values.
 * @returns {number|undefined} The standard deviation, or `undefined` when it
 *   cannot be computed: fewer than two samples, or inputs that produce NaN.
 */
function sampleStdDev2(sum, sumSq, n) {
  // `!(n >= 2)` rejects NaN as well as small counts; `n < 2` would let NaN through.
  if (!(n >= 2)) {
    return void 0;
  }
  const mean = sum / n;
  const variance = (sumSq - n * mean * mean) / (n - 1);
  // Invalid inputs must not be reported as "zero spread".
  if (Number.isNaN(variance)) {
    return void 0;
  }
  // Rounding in the sum-of-squares form can leave a tiny negative variance; clamp to 0.
  return variance > 0 ? Math.sqrt(variance) : 0;
}
|
|
1892
|
+
/**
 * Finds the first aggregated score item for a test case.
 *
 * Evaluators are visited in the order they appear on the first event; for each
 * one, score items with the same id are collected across all events and passed
 * to `aggregateScoreItems`. The first truthy aggregate is returned.
 *
 * @param {Array<object>} events - Recorded events of one test case.
 * @returns {object|undefined} The first aggregate, or `undefined` if none.
 */
function findFirstAggregatedScore(events) {
  for (const { evaluatorId } of events[0]?.evaluatorScores ?? []) {
    const itemsByScoreId = new Map();
    for (const ev of events) {
      const match = ev.evaluatorScores.find((x) => x.evaluatorId === evaluatorId);
      for (const score of match?.scores ?? []) {
        const bucket = itemsByScoreId.get(score.id);
        if (bucket) {
          bucket.push(score);
        } else {
          itemsByScoreId.set(score.id, [score]);
        }
      }
    }
    for (const items of itemsByScoreId.values()) {
      const aggregated = aggregateScoreItems(items);
      if (aggregated) {
        return aggregated;
      }
    }
  }
  return void 0;
}

/**
 * Builds one summary row per test case from the events recorded for it.
 *
 * @param {Map<string, {name: string, events: Array<object>}>} byId - Recorded
 *   events per test case; each event carries `passed`, `durationMs` and
 *   `evaluatorScores`.
 * @returns {Array<object>} Rows with `name`, `averageScore`, `stdDev`,
 *   `aggregatedScoreItem`, `isAggregated`, `durationMs` and `passed`.
 */
function buildTestCaseSummaries(byId) {
  const summaries = [];
  for (const { name, events } of byId.values()) {
    // One numeric score per evaluator per event; evaluators without one are dropped.
    const allScores = events.flatMap(
      (ev) => ev.evaluatorScores
        .map((es) => toNumericScoreFromScores(es.scores))
        .filter((n) => n !== void 0)
    );
    // Single pass each for the sum and the sum of squares (both are 0 when empty).
    const total = allScores.reduce((a, b) => a + b, 0);
    const sumSq = allScores.reduce((s, v) => s + v * v, 0);
    summaries.push({
      name,
      averageScore: allScores.length > 0 ? total / allScores.length : void 0,
      stdDev: sampleStdDev2(total, sumSq, allScores.length),
      aggregatedScoreItem: findFirstAggregatedScore(events),
      isAggregated: events.length > 1,
      durationMs: events.reduce((sum, e) => sum + e.durationMs, 0),
      passed: events.every((e) => e.passed)
    });
  }
  return summaries;
}
|
|
1568
1940
|
var ansi2 = {
|
|
1569
1941
|
reset: "\x1B[0m",
|
|
1570
1942
|
bold: "\x1B[1m",
|
|
@@ -1592,14 +1964,59 @@ function getEvaluatorSummaryLine(evaluatorName, aggregate) {
|
|
|
1592
1964
|
return `- ${evaluatorName.padEnd(28)} no numeric scores`;
|
|
1593
1965
|
}
|
|
1594
1966
|
const mean = aggregate.total / aggregate.count;
|
|
1595
|
-
|
|
1967
|
+
const sd = sampleStdDev2(aggregate.total, aggregate.sumSq, aggregate.count);
|
|
1968
|
+
const meanStr = sd !== void 0 ? `${mean.toFixed(2)} \xB1 ${sd.toFixed(2)}` : mean.toFixed(2);
|
|
1969
|
+
return `- ${evaluatorName.padEnd(28)} avg=${colorize(meanStr, scoreToColor(mean))} passed=${aggregate.passed} failed=${aggregate.failed}`;
|
|
1596
1970
|
}
|
|
1597
1971
|
/**
 * Renders a fixed-width text progress bar.
 *
 * @param {number} value - Current value; clamped into the range [0, max].
 * @param {number} [max=100] - Value that corresponds to a completely filled bar.
 * @param {number} [width=20] - Total number of cells in the bar.
 * @returns {string} `width` characters: filled cells followed by empty cells.
 */
function createBar2(value, max = 100, width = 20) {
  const safe = Math.max(0, Math.min(max, value));
  const ratio = safe / max;
  // A NaN value or max === 0 gives a non-finite ratio; draw an empty bar
  // rather than letting repeat(NaN) collapse the whole bar to "".
  const filled = Number.isFinite(ratio) ? Math.round(ratio * width) : 0;
  return `${"\u2588".repeat(filled)}${"\u2591".repeat(width - filled)}`;
}
|
|
1602
|
-
function
|
|
1976
|
+
/**
 * Merges the evaluator results of several events into one entry per evaluator.
 *
 * For each evaluator id seen in any event, score items and metric items are
 * grouped by id across events and reduced with `aggregateScoreItems` /
 * `aggregateMetricItems`. An evaluator counts as passed only if it is present
 * and passed in every event.
 *
 * @param {Array<object>} events - Events carrying an `evaluatorScores` array.
 * @param {Map<string, string>} [evaluatorNameById] - Not read by this function;
 *   kept so existing call sites remain valid.
 * @returns {Array<object>} One `{ evaluatorId, scores, passed, metrics }` entry
 *   per evaluator; `metrics` is `undefined` when nothing was aggregated.
 */
function aggregateEvaluatorScoresFromEvents(events, evaluatorNameById) {
  if (events.length === 0) {
    return [];
  }
  const evaluatorIds = new Set(
    events.flatMap((e) => e.evaluatorScores.map((x) => x.evaluatorId))
  );
  const appendTo = (groups, key, item) => {
    const bucket = groups.get(key);
    if (bucket) {
      bucket.push(item);
    } else {
      groups.set(key, [item]);
    }
  };
  const result = [];
  for (const evaluatorId of evaluatorIds) {
    const scoreIdToItems = new Map();
    const metricIdToItems = new Map();
    let passed = true;
    for (const ev of events) {
      // Single lookup per event feeds scores, metrics and the pass flag.
      const es = ev.evaluatorScores.find((x) => x.evaluatorId === evaluatorId);
      for (const s of es?.scores ?? []) {
        appendTo(scoreIdToItems, s.id, s);
      }
      for (const m of es?.metrics ?? []) {
        appendTo(metricIdToItems, m.id, m);
      }
      // An evaluator missing from an event is treated as not passed.
      if (!(es?.passed ?? false)) {
        passed = false;
      }
    }
    const aggregatedScores = [];
    for (const items of scoreIdToItems.values()) {
      const agg = aggregateScoreItems(items);
      if (agg) {
        aggregatedScores.push(agg);
      }
    }
    const aggregatedMetrics = [];
    for (const items of metricIdToItems.values()) {
      const agg = aggregateMetricItems(items);
      if (agg !== void 0) {
        aggregatedMetrics.push(agg);
      }
    }
    result.push({
      evaluatorId,
      scores: aggregatedScores,
      passed,
      metrics: aggregatedMetrics.length > 0 ? aggregatedMetrics : void 0
    });
  }
  return result;
}
|
|
2019
|
+
function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
|
|
1603
2020
|
const passLabel = passed ? colorize("PASS", `${ansi2.bold}${ansi2.green}`) : colorize("FAIL", `${ansi2.bold}${ansi2.red}`);
|
|
1604
2021
|
const scoreParts = [];
|
|
1605
2022
|
for (const item of scores) {
|
|
@@ -1611,7 +2028,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics) {
|
|
|
1611
2028
|
);
|
|
1612
2029
|
continue;
|
|
1613
2030
|
}
|
|
1614
|
-
const formatted = def.format(item.data);
|
|
2031
|
+
const formatted = def.format(item.data, options);
|
|
1615
2032
|
switch (def.displayStrategy) {
|
|
1616
2033
|
case "bar": {
|
|
1617
2034
|
const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
|
|
@@ -1644,7 +2061,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics) {
|
|
|
1644
2061
|
for (const { id, data } of metrics) {
|
|
1645
2062
|
const def = getMetricById(id);
|
|
1646
2063
|
if (def) {
|
|
1647
|
-
const formatted = def.format(data);
|
|
2064
|
+
const formatted = def.format(data, options);
|
|
1648
2065
|
metricParts.push(
|
|
1649
2066
|
def.name ? `[${def.name}: ${formatted}]` : `[${formatted}]`
|
|
1650
2067
|
);
|
|
@@ -1677,8 +2094,9 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
1677
2094
|
evaluators.map((item) => [item.id, item.evaluator.getName() ?? item.id])
|
|
1678
2095
|
);
|
|
1679
2096
|
const aggregates = /* @__PURE__ */ new Map();
|
|
1680
|
-
const
|
|
2097
|
+
const testCaseByTestId = /* @__PURE__ */ new Map();
|
|
1681
2098
|
let overallScoreTotal = 0;
|
|
2099
|
+
let overallScoreSumSq = 0;
|
|
1682
2100
|
let overallScoreCount = 0;
|
|
1683
2101
|
let completedCount = 0;
|
|
1684
2102
|
let totalCount = 0;
|
|
@@ -1691,6 +2109,11 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
1691
2109
|
}
|
|
1692
2110
|
process.stdout.write("\r\x1B[2K");
|
|
1693
2111
|
}
|
|
2112
|
+
/**
 * Moves the terminal cursor up by `n` lines.
 *
 * Writes nothing when stdout is not a TTY or when `n <= 0`.
 *
 * @param {number} n - Number of lines to move up.
 */
function cursorUp(n) {
  const out = process.stdout;
  const shouldMove = out.isTTY && !(n <= 0);
  if (shouldMove) {
    out.write(`\x1B[${n}A`);
  }
}
|
|
1694
2117
|
function drawSpinner() {
|
|
1695
2118
|
if (!process.stdout.isTTY || runFinished) {
|
|
1696
2119
|
return;
|
|
@@ -1704,6 +2127,8 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
1704
2127
|
)} ${colorize("(live)", ansi2.dim)}`
|
|
1705
2128
|
);
|
|
1706
2129
|
}
|
|
2130
|
+
let lastPrintedTestCaseId = null;
|
|
2131
|
+
let lastPrintedLineCount = 0;
|
|
1707
2132
|
let spinnerTimer;
|
|
1708
2133
|
const done = new Promise((resolve5) => {
|
|
1709
2134
|
const unsubscribe = runner.subscribeRunEvents((event) => {
|
|
@@ -1711,55 +2136,94 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
1711
2136
|
completedCount = event.completedTestCases;
|
|
1712
2137
|
const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
|
|
1713
2138
|
const averageScore = numericScores.length > 0 ? numericScores.reduce((sum, value) => sum + value, 0) / numericScores.length : void 0;
|
|
1714
|
-
|
|
1715
|
-
|
|
1716
|
-
|
|
1717
|
-
|
|
2139
|
+
const testCaseId = event.testCaseId;
|
|
2140
|
+
const existing = testCaseByTestId.get(testCaseId) ?? {
|
|
2141
|
+
name: event.testCaseName,
|
|
2142
|
+
events: []
|
|
2143
|
+
};
|
|
2144
|
+
existing.events.push({
|
|
2145
|
+
averageScore,
|
|
2146
|
+
passed: event.passed,
|
|
2147
|
+
durationMs: event.durationMs,
|
|
2148
|
+
evaluatorScores: event.evaluatorScores
|
|
2149
|
+
});
|
|
2150
|
+
testCaseByTestId.set(testCaseId, existing);
|
|
1718
2151
|
for (const item of event.evaluatorScores) {
|
|
1719
|
-
const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
|
|
1720
|
-
console.log(
|
|
1721
|
-
formatEvaluatorScoreLine(
|
|
1722
|
-
name,
|
|
1723
|
-
item.scores,
|
|
1724
|
-
item.passed,
|
|
1725
|
-
item.metrics
|
|
1726
|
-
)
|
|
1727
|
-
);
|
|
1728
|
-
if (!item.passed && item.logs && item.logs.length > 0) {
|
|
1729
|
-
for (const log of item.logs) {
|
|
1730
|
-
if (log.type === "diff") {
|
|
1731
|
-
const useColor = process.stdout.isTTY;
|
|
1732
|
-
for (const { type, line } of getDiffLines(log)) {
|
|
1733
|
-
const colored = useColor && type === "remove" ? colorize(` ${line}`, ansi2.red) : useColor && type === "add" ? colorize(` ${line}`, ansi2.green) : ` ${line}`;
|
|
1734
|
-
console.log(colored);
|
|
1735
|
-
}
|
|
1736
|
-
}
|
|
1737
|
-
}
|
|
1738
|
-
}
|
|
1739
2152
|
const numeric = toNumericScoreFromScores(item.scores);
|
|
1740
2153
|
if (numeric !== void 0) {
|
|
1741
2154
|
const current = aggregates.get(item.evaluatorId) ?? {
|
|
1742
2155
|
total: 0,
|
|
2156
|
+
sumSq: 0,
|
|
1743
2157
|
count: 0,
|
|
1744
2158
|
passed: 0,
|
|
1745
2159
|
failed: 0
|
|
1746
2160
|
};
|
|
1747
2161
|
aggregates.set(item.evaluatorId, {
|
|
1748
2162
|
total: current.total + numeric,
|
|
2163
|
+
sumSq: current.sumSq + numeric * numeric,
|
|
1749
2164
|
count: current.count + 1,
|
|
1750
2165
|
passed: current.passed + (item.passed ? 1 : 0),
|
|
1751
2166
|
failed: current.failed + (item.passed ? 0 : 1)
|
|
1752
2167
|
});
|
|
1753
2168
|
overallScoreTotal += numeric;
|
|
2169
|
+
overallScoreSumSq += numeric * numeric;
|
|
1754
2170
|
overallScoreCount += 1;
|
|
1755
2171
|
}
|
|
1756
2172
|
}
|
|
1757
|
-
|
|
1758
|
-
|
|
1759
|
-
|
|
1760
|
-
|
|
1761
|
-
|
|
1762
|
-
|
|
2173
|
+
const isSameTestCase = lastPrintedTestCaseId === testCaseId;
|
|
2174
|
+
const isLastRerun = event.rerunIndex >= event.rerunTotal;
|
|
2175
|
+
const isNonTty = !process.stdout.isTTY;
|
|
2176
|
+
const skipPrintNonTty = isNonTty && event.rerunTotal > 1 && !isLastRerun;
|
|
2177
|
+
if (isSameTestCase && lastPrintedLineCount > 0 && !skipPrintNonTty) {
|
|
2178
|
+
cursorUp(lastPrintedLineCount);
|
|
2179
|
+
}
|
|
2180
|
+
const aggregatedScores = aggregateEvaluatorScoresFromEvents(
|
|
2181
|
+
existing.events);
|
|
2182
|
+
const isAggregated = existing.events.length > 1;
|
|
2183
|
+
const durationMs = existing.events.reduce(
|
|
2184
|
+
(s, e) => s + e.durationMs,
|
|
2185
|
+
0
|
|
2186
|
+
);
|
|
2187
|
+
existing.events.every((e) => e.passed);
|
|
2188
|
+
const lines = [];
|
|
2189
|
+
lines.push(
|
|
2190
|
+
`${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.rerunIndex}/${event.rerunTotal})`, ansi2.cyan)} ${colorize(`(${durationMs}ms)`, ansi2.dim)}`
|
|
2191
|
+
);
|
|
2192
|
+
for (const item of aggregatedScores) {
|
|
2193
|
+
const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
|
|
2194
|
+
lines.push(
|
|
2195
|
+
formatEvaluatorScoreLine(
|
|
2196
|
+
name,
|
|
2197
|
+
item.scores,
|
|
2198
|
+
item.passed,
|
|
2199
|
+
item.metrics,
|
|
2200
|
+
{ isAggregated }
|
|
2201
|
+
)
|
|
2202
|
+
);
|
|
2203
|
+
const lastEvent = existing.events[existing.events.length - 1];
|
|
2204
|
+
const lastEs = lastEvent?.evaluatorScores.find(
|
|
2205
|
+
(x) => x.evaluatorId === item.evaluatorId
|
|
2206
|
+
);
|
|
2207
|
+
if (!item.passed && lastEs?.logs && lastEs.logs.length > 0) {
|
|
2208
|
+
for (const log of lastEs.logs) {
|
|
2209
|
+
if (log.type === "diff") {
|
|
2210
|
+
const useColor = process.stdout.isTTY;
|
|
2211
|
+
for (const { type, line } of getDiffLines(log)) {
|
|
2212
|
+
const colored = useColor && type === "remove" ? colorize(` ${line}`, ansi2.red) : useColor && type === "add" ? colorize(` ${line}`, ansi2.green) : ` ${line}`;
|
|
2213
|
+
lines.push(colored);
|
|
2214
|
+
}
|
|
2215
|
+
}
|
|
2216
|
+
}
|
|
2217
|
+
}
|
|
2218
|
+
}
|
|
2219
|
+
if (!skipPrintNonTty) {
|
|
2220
|
+
for (let i = 0; i < lines.length; i++) {
|
|
2221
|
+
process.stdout.write(`\r\x1B[2K${lines[i]}
|
|
2222
|
+
`);
|
|
2223
|
+
}
|
|
2224
|
+
lastPrintedTestCaseId = testCaseId;
|
|
2225
|
+
lastPrintedLineCount = lines.length;
|
|
2226
|
+
}
|
|
1763
2227
|
drawSpinner();
|
|
1764
2228
|
}
|
|
1765
2229
|
if (event.type === "RunCompleted" || event.type === "RunFailed") {
|
|
@@ -1810,9 +2274,15 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
1810
2274
|
);
|
|
1811
2275
|
if (overallScoreCount > 0) {
|
|
1812
2276
|
const overallAverage = overallScoreTotal / overallScoreCount;
|
|
2277
|
+
const overallSd = sampleStdDev2(
|
|
2278
|
+
overallScoreTotal,
|
|
2279
|
+
overallScoreSumSq,
|
|
2280
|
+
overallScoreCount
|
|
2281
|
+
);
|
|
2282
|
+
const avgStr = overallSd !== void 0 ? `${overallAverage.toFixed(2)} \xB1 ${overallSd.toFixed(2)}` : overallAverage.toFixed(2);
|
|
1813
2283
|
console.log(
|
|
1814
2284
|
`- overall avg score: ${colorize(
|
|
1815
|
-
|
|
2285
|
+
avgStr,
|
|
1816
2286
|
scoreToColor(overallAverage)
|
|
1817
2287
|
)} ${colorize(createBar2(overallAverage), ansi2.dim)}`
|
|
1818
2288
|
);
|
|
@@ -1823,6 +2293,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
1823
2293
|
getEvaluatorSummaryLine(evaluatorName, aggregates.get(evaluatorId))
|
|
1824
2294
|
);
|
|
1825
2295
|
}
|
|
2296
|
+
const testCaseSummaries = buildTestCaseSummaries(testCaseByTestId);
|
|
1826
2297
|
if (testCaseSummaries.length > 0) {
|
|
1827
2298
|
console.log(colorize("- test case scores:", ansi2.magenta));
|
|
1828
2299
|
for (const summary of testCaseSummaries) {
|
|
@@ -1833,9 +2304,13 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
1833
2304
|
);
|
|
1834
2305
|
continue;
|
|
1835
2306
|
}
|
|
2307
|
+
const scoreLabel = summary.isAggregated && summary.aggregatedScoreItem ? getScoreById(summary.aggregatedScoreItem.id)?.format(
|
|
2308
|
+
summary.aggregatedScoreItem.data,
|
|
2309
|
+
{ isAggregated: true }
|
|
2310
|
+
) ?? summary.averageScore.toFixed(2) : summary.stdDev !== void 0 && summary.isAggregated ? `${summary.averageScore.toFixed(2)} \xB1 ${summary.stdDev.toFixed(2)}` : summary.averageScore.toFixed(2);
|
|
1836
2311
|
console.log(
|
|
1837
2312
|
` ${status} ${summary.name.padEnd(24)} score=${colorize(
|
|
1838
|
-
|
|
2313
|
+
scoreLabel,
|
|
1839
2314
|
scoreToColor(summary.averageScore)
|
|
1840
2315
|
)} ${colorize(createBar2(summary.averageScore, 100, 14), ansi2.dim)} ${colorize(`(${summary.durationMs}ms)`, ansi2.dim)}`
|
|
1841
2316
|
);
|