@m4trix/evals 0.17.0 → 0.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +179 -88
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +179 -88
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +124 -50
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +124 -50
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +120 -45
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +42 -6
- package/dist/index.js +119 -46
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli-simple.cjs
CHANGED
|
@@ -347,7 +347,11 @@ var Metric = {
|
|
|
347
347
|
name: config.name,
|
|
348
348
|
aggregate: config.aggregate,
|
|
349
349
|
format: config.format,
|
|
350
|
-
make: (data) => ({
|
|
350
|
+
make: (data, options) => ({
|
|
351
|
+
id: config.id,
|
|
352
|
+
data,
|
|
353
|
+
...options?.name !== void 0 && { name: options.name }
|
|
354
|
+
})
|
|
351
355
|
};
|
|
352
356
|
registry.set(config.id, def);
|
|
353
357
|
return def;
|
|
@@ -359,20 +363,107 @@ function getMetricById(id) {
|
|
|
359
363
|
|
|
360
364
|
// src/evals/score.ts
|
|
361
365
|
var registry2 = /* @__PURE__ */ new Map();
|
|
366
|
+
function formatScoreData(def, data, options) {
|
|
367
|
+
return options?.isAggregated ? def.formatAggregate(data) : def.formatValue(data);
|
|
368
|
+
}
|
|
369
|
+
var ScoreAggregate = {
|
|
370
|
+
/** Average numeric fields. Use for scores like { value, delta }. */
|
|
371
|
+
averageFields(fields) {
|
|
372
|
+
return (values) => {
|
|
373
|
+
const count = values.length || 1;
|
|
374
|
+
const result = {};
|
|
375
|
+
for (const field of fields) {
|
|
376
|
+
result[field] = values.reduce(
|
|
377
|
+
(s, v) => s + (v[field] ?? 0),
|
|
378
|
+
0
|
|
379
|
+
) / count;
|
|
380
|
+
}
|
|
381
|
+
return result;
|
|
382
|
+
};
|
|
383
|
+
},
|
|
384
|
+
/** Average selected numeric fields, with sample std dev tracked for `value`. */
|
|
385
|
+
averageWithVariance(fields) {
|
|
386
|
+
return (values) => {
|
|
387
|
+
const count = values.length;
|
|
388
|
+
const result = {};
|
|
389
|
+
for (const field of fields) {
|
|
390
|
+
result[field] = count === 0 ? 0 : values.reduce(
|
|
391
|
+
(sum, item) => sum + (item[field] ?? 0),
|
|
392
|
+
0
|
|
393
|
+
) / count;
|
|
394
|
+
}
|
|
395
|
+
const valueField = "value";
|
|
396
|
+
const hasValueField = fields.includes(valueField);
|
|
397
|
+
if (count === 0) {
|
|
398
|
+
if (hasValueField) {
|
|
399
|
+
result[valueField] = 0;
|
|
400
|
+
}
|
|
401
|
+
return {
|
|
402
|
+
...result,
|
|
403
|
+
stdDev: void 0,
|
|
404
|
+
count: 0
|
|
405
|
+
};
|
|
406
|
+
}
|
|
407
|
+
let stdDev;
|
|
408
|
+
if (hasValueField && count >= 2) {
|
|
409
|
+
const sum = values.reduce(
|
|
410
|
+
(s, v) => s + (v[valueField] ?? 0),
|
|
411
|
+
0
|
|
412
|
+
);
|
|
413
|
+
const sumSq = values.reduce(
|
|
414
|
+
(s, v) => {
|
|
415
|
+
const value = v[valueField] ?? 0;
|
|
416
|
+
return s + value * value;
|
|
417
|
+
},
|
|
418
|
+
0
|
|
419
|
+
);
|
|
420
|
+
const mean = sum / count;
|
|
421
|
+
const variance = (sumSq - count * mean * mean) / (count - 1);
|
|
422
|
+
stdDev = variance > 0 ? Math.sqrt(variance) : 0;
|
|
423
|
+
}
|
|
424
|
+
return {
|
|
425
|
+
...values[0],
|
|
426
|
+
...result,
|
|
427
|
+
stdDev,
|
|
428
|
+
count
|
|
429
|
+
};
|
|
430
|
+
};
|
|
431
|
+
},
|
|
432
|
+
/** All runs must pass. Use for binary scores. */
|
|
433
|
+
all(values) {
|
|
434
|
+
const total = values.length;
|
|
435
|
+
const passedCount = values.filter((v) => v.passed).length;
|
|
436
|
+
return {
|
|
437
|
+
...values[0],
|
|
438
|
+
passed: total > 0 && values.every((v) => v.passed),
|
|
439
|
+
passedCount,
|
|
440
|
+
totalCount: total
|
|
441
|
+
};
|
|
442
|
+
},
|
|
443
|
+
/** Take last value (no aggregation). Use when aggregation is not meaningful. */
|
|
444
|
+
last(values) {
|
|
445
|
+
return values[values.length - 1] ?? {};
|
|
446
|
+
}
|
|
447
|
+
};
|
|
362
448
|
var Score = {
|
|
449
|
+
aggregate: ScoreAggregate,
|
|
363
450
|
of(config) {
|
|
364
451
|
const def = {
|
|
365
452
|
id: config.id,
|
|
366
453
|
name: config.name,
|
|
367
454
|
displayStrategy: config.displayStrategy,
|
|
368
|
-
|
|
369
|
-
|
|
455
|
+
formatValue: config.formatValue,
|
|
456
|
+
formatAggregate: config.formatAggregate,
|
|
457
|
+
aggregateValues: config.aggregateValues,
|
|
370
458
|
make: (data, options) => {
|
|
371
459
|
const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
|
|
372
460
|
return {
|
|
373
461
|
id: config.id,
|
|
374
462
|
data,
|
|
375
|
-
...passed !== void 0 && { passed }
|
|
463
|
+
...passed !== void 0 && { passed },
|
|
464
|
+
...options?.name !== void 0 && { name: options.name },
|
|
465
|
+
def
|
|
466
|
+
// Attach def so rendering/aggregation works without registry lookup
|
|
376
467
|
};
|
|
377
468
|
}
|
|
378
469
|
};
|
|
@@ -385,29 +476,6 @@ function getScoreById(id) {
|
|
|
385
476
|
}
|
|
386
477
|
|
|
387
478
|
// src/evals/aggregators.ts
|
|
388
|
-
function aggregateAverageWithVariance(values) {
|
|
389
|
-
if (values.length === 0) {
|
|
390
|
-
return { value: 0, count: 0 };
|
|
391
|
-
}
|
|
392
|
-
const sum = values.reduce((s, v) => s + v.value, 0);
|
|
393
|
-
const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
|
|
394
|
-
const mean = sum / values.length;
|
|
395
|
-
let stdDev;
|
|
396
|
-
if (values.length >= 2) {
|
|
397
|
-
const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
|
|
398
|
-
stdDev = variance > 0 ? Math.sqrt(variance) : 0;
|
|
399
|
-
}
|
|
400
|
-
return { value: mean, stdDev, count: values.length };
|
|
401
|
-
}
|
|
402
|
-
function aggregateAll(values) {
|
|
403
|
-
const total = values.length;
|
|
404
|
-
const passedCount = values.filter((v) => v.passed).length;
|
|
405
|
-
return {
|
|
406
|
-
passed: total > 0 && values.every((v) => v.passed),
|
|
407
|
-
passedCount,
|
|
408
|
-
totalCount: total
|
|
409
|
-
};
|
|
410
|
-
}
|
|
411
479
|
function aggregateTokenCountSum(values) {
|
|
412
480
|
const initial = {
|
|
413
481
|
input: 0,
|
|
@@ -460,40 +528,59 @@ Score.of({
|
|
|
460
528
|
id: "percent",
|
|
461
529
|
name: "Score",
|
|
462
530
|
displayStrategy: "bar",
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
531
|
+
formatValue: (data) => data.value.toFixed(2),
|
|
532
|
+
formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
|
|
533
|
+
aggregateValues: Score.aggregate.averageWithVariance(["value"])
|
|
534
|
+
});
|
|
535
|
+
Score.of({
|
|
536
|
+
id: "delta",
|
|
537
|
+
name: "Delta",
|
|
538
|
+
displayStrategy: "number",
|
|
539
|
+
formatValue: (data) => `${data.value.toFixed(2)} (${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)} vs baseline)`,
|
|
540
|
+
formatAggregate: (data) => `Avg: ${data.value.toFixed(2)} (Delta: ${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)})`,
|
|
541
|
+
aggregateValues: Score.aggregate.averageFields(["value", "delta"])
|
|
470
542
|
});
|
|
471
543
|
Score.of({
|
|
472
544
|
id: "binary",
|
|
473
545
|
name: "Result",
|
|
474
546
|
displayStrategy: "passFail",
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
}
|
|
481
|
-
return base;
|
|
547
|
+
formatValue: (data) => data.passed ? "PASSED" : "NOT PASSED",
|
|
548
|
+
formatAggregate: (data) => {
|
|
549
|
+
const base = data.passed ? "All: PASSED" : "Some: FAILED";
|
|
550
|
+
if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
|
|
551
|
+
return `${base} (${data.passedCount}/${data.totalCount})`;
|
|
482
552
|
}
|
|
483
|
-
return
|
|
553
|
+
return base;
|
|
484
554
|
},
|
|
485
|
-
|
|
555
|
+
aggregateValues: Score.aggregate.all
|
|
486
556
|
});
|
|
487
557
|
|
|
488
558
|
// src/runner/score-utils.ts
|
|
559
|
+
function getScoreDef(item) {
|
|
560
|
+
return item.def ?? getScoreById(item.id);
|
|
561
|
+
}
|
|
562
|
+
function lastNonEmptyName(items) {
|
|
563
|
+
for (let i = items.length - 1; i >= 0; i--) {
|
|
564
|
+
const n = items[i].name;
|
|
565
|
+
if (n != null && n.trim().length > 0)
|
|
566
|
+
return n;
|
|
567
|
+
}
|
|
568
|
+
return void 0;
|
|
569
|
+
}
|
|
489
570
|
function aggregateScoreItems(items) {
|
|
490
571
|
if (items.length === 0)
|
|
491
572
|
return void 0;
|
|
492
|
-
const def =
|
|
493
|
-
if (!def?.
|
|
573
|
+
const def = getScoreDef(items[0]);
|
|
574
|
+
if (!def?.aggregateValues)
|
|
494
575
|
return items[items.length - 1];
|
|
495
|
-
const aggregated = def.
|
|
496
|
-
|
|
576
|
+
const aggregated = def.aggregateValues(items.map((i) => i.data));
|
|
577
|
+
const nameOverride = lastNonEmptyName(items);
|
|
578
|
+
return {
|
|
579
|
+
...items[0],
|
|
580
|
+
data: aggregated,
|
|
581
|
+
def,
|
|
582
|
+
...nameOverride !== void 0 && { name: nameOverride }
|
|
583
|
+
};
|
|
497
584
|
}
|
|
498
585
|
function aggregateMetricItems(items) {
|
|
499
586
|
if (items.length === 0)
|
|
@@ -502,11 +589,16 @@ function aggregateMetricItems(items) {
|
|
|
502
589
|
if (!def?.aggregate)
|
|
503
590
|
return items[items.length - 1];
|
|
504
591
|
const aggregated = def.aggregate(items.map((i) => i.data));
|
|
505
|
-
|
|
592
|
+
const nameOverride = lastNonEmptyName(items);
|
|
593
|
+
return {
|
|
594
|
+
...items[0],
|
|
595
|
+
data: aggregated,
|
|
596
|
+
...nameOverride !== void 0 && { name: nameOverride }
|
|
597
|
+
};
|
|
506
598
|
}
|
|
507
599
|
function toNumericScoreFromScores(scores) {
|
|
508
600
|
for (const item of scores) {
|
|
509
|
-
const def =
|
|
601
|
+
const def = getScoreDef(item);
|
|
510
602
|
if (def && def.displayStrategy === "bar" && typeof item.data === "object" && item.data !== null && "value" in item.data) {
|
|
511
603
|
const value = item.data.value;
|
|
512
604
|
if (typeof value === "number" && Number.isFinite(value)) {
|
|
@@ -894,7 +986,7 @@ var createPersistenceWorker = (queue) => effect.Effect.forever(
|
|
|
894
986
|
() => appendJsonLine(message.artifactPath, {
|
|
895
987
|
runId: message.runId,
|
|
896
988
|
ts: Date.now(),
|
|
897
|
-
...message.payload
|
|
989
|
+
...typeof message.payload === "object" && message.payload !== null && !Array.isArray(message.payload) ? message.payload : {}
|
|
898
990
|
})
|
|
899
991
|
);
|
|
900
992
|
})
|
|
@@ -1480,7 +1572,7 @@ function aggregateEvaluatorScores(events, nameById) {
|
|
|
1480
1572
|
if (agg)
|
|
1481
1573
|
aggregatedScores.push(agg);
|
|
1482
1574
|
}
|
|
1483
|
-
const aggregatedMetrics = Array.from(metricIdToItems.entries()).map(([
|
|
1575
|
+
const aggregatedMetrics = Array.from(metricIdToItems.entries()).map(([, items]) => aggregateMetricItems(items)).filter((m) => m !== void 0);
|
|
1484
1576
|
const passed = events.every((ev) => {
|
|
1485
1577
|
const es = ev.evaluatorScores.find((x) => x.evaluatorId === evaluatorId);
|
|
1486
1578
|
return es?.passed ?? false;
|
|
@@ -1500,13 +1592,13 @@ function aggregateEvaluatorScores(events, nameById) {
|
|
|
1500
1592
|
}
|
|
1501
1593
|
return result;
|
|
1502
1594
|
}
|
|
1503
|
-
function formatScorePart(item,
|
|
1504
|
-
const def = getScoreById(item.id);
|
|
1595
|
+
function formatScorePart(item, _scoreToColor, options) {
|
|
1596
|
+
const def = item.def ?? getScoreById(item.id);
|
|
1505
1597
|
if (!def) {
|
|
1506
1598
|
const numeric = toNumericScore(item.data);
|
|
1507
1599
|
return numeric !== void 0 ? `${numeric.toFixed(2)}` : "n/a";
|
|
1508
1600
|
}
|
|
1509
|
-
const formatted = def
|
|
1601
|
+
const formatted = formatScoreData(def, item.data, options);
|
|
1510
1602
|
if (def.displayStrategy === "bar") {
|
|
1511
1603
|
const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
|
|
1512
1604
|
if (typeof numeric === "number" && Number.isFinite(numeric)) {
|
|
@@ -1564,8 +1656,6 @@ function RunView({
|
|
|
1564
1656
|
const done = new Promise((resolve5) => {
|
|
1565
1657
|
const unsubscribe = runner.subscribeRunEvents((event) => {
|
|
1566
1658
|
if (event.type === "TestCaseProgress") {
|
|
1567
|
-
const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
|
|
1568
|
-
numericScores.length > 0 ? numericScores.reduce((sum, v) => sum + v, 0) / numericScores.length : void 0;
|
|
1569
1659
|
for (const item of event.evaluatorScores) {
|
|
1570
1660
|
const numeric = toNumericScoreFromScores(item.scores);
|
|
1571
1661
|
if (numeric !== void 0) {
|
|
@@ -1655,16 +1745,17 @@ function RunView({
|
|
|
1655
1745
|
onComplete(new Error(`Run failed: ${finalEvent.errorMessage}`));
|
|
1656
1746
|
return;
|
|
1657
1747
|
}
|
|
1748
|
+
const completed = finalEvent;
|
|
1658
1749
|
setSummary({
|
|
1659
|
-
passedTestCases:
|
|
1660
|
-
failedTestCases:
|
|
1661
|
-
totalTestCases:
|
|
1750
|
+
passedTestCases: completed.passedTestCases,
|
|
1751
|
+
failedTestCases: completed.failedTestCases,
|
|
1752
|
+
totalTestCases: completed.totalTestCases,
|
|
1662
1753
|
overallScoreTotal,
|
|
1663
1754
|
overallScoreSumSq,
|
|
1664
1755
|
overallScoreCount,
|
|
1665
1756
|
aggregates: new Map(aggregates),
|
|
1666
1757
|
scoreItemsByEvaluatorScore: new Map(scoreItemsByEvaluatorScore),
|
|
1667
|
-
artifactPath:
|
|
1758
|
+
artifactPath: completed.artifactPath
|
|
1668
1759
|
});
|
|
1669
1760
|
setPhase("completed");
|
|
1670
1761
|
setTimeout(() => onComplete(), 200);
|
|
@@ -1755,9 +1846,10 @@ function RunView({
|
|
|
1755
1846
|
const formatted = def.format(m.data, {
|
|
1756
1847
|
isAggregated: tc.isAggregated
|
|
1757
1848
|
});
|
|
1849
|
+
const label = m.name ?? def.name;
|
|
1758
1850
|
return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
1759
1851
|
"[",
|
|
1760
|
-
|
|
1852
|
+
label ? `${label}: ` : "",
|
|
1761
1853
|
formatted,
|
|
1762
1854
|
"]",
|
|
1763
1855
|
" "
|
|
@@ -1766,8 +1858,8 @@ function RunView({
|
|
|
1766
1858
|
] }) : null
|
|
1767
1859
|
] }),
|
|
1768
1860
|
item.scores.length > 0 ? item.scores.map((s, idx) => {
|
|
1769
|
-
const def = getScoreById(s.id);
|
|
1770
|
-
const scoreLabel =
|
|
1861
|
+
const def = s.def ?? getScoreById(s.id);
|
|
1862
|
+
const scoreLabel = s.name ?? def?.name ?? def?.id ?? s.id;
|
|
1771
1863
|
return /* @__PURE__ */ jsxRuntime.jsxs(
|
|
1772
1864
|
ink.Text,
|
|
1773
1865
|
{
|
|
@@ -1870,11 +1962,9 @@ function RunView({
|
|
|
1870
1962
|
const aggregated = aggregateScoreItems(items);
|
|
1871
1963
|
if (!aggregated)
|
|
1872
1964
|
return null;
|
|
1873
|
-
const def = getScoreById(aggregated.id);
|
|
1874
|
-
const label =
|
|
1875
|
-
const formatted = def
|
|
1876
|
-
isAggregated: true
|
|
1877
|
-
}) ?? "n/a";
|
|
1965
|
+
const def = aggregated.def ?? getScoreById(aggregated.id);
|
|
1966
|
+
const label = aggregated.name ?? def?.name ?? def?.id ?? aggregated.id;
|
|
1967
|
+
const formatted = def ? def.formatAggregate(aggregated.data) : "n/a";
|
|
1878
1968
|
const numeric = toNumericScore(aggregated.data);
|
|
1879
1969
|
return /* @__PURE__ */ jsxRuntime.jsxs(
|
|
1880
1970
|
ink.Text,
|
|
@@ -2033,9 +2123,9 @@ function getEvaluatorSummaryLines(evaluatorId, evaluatorName, aggregate, scoreIt
|
|
|
2033
2123
|
const agg = aggregateScoreItems(items);
|
|
2034
2124
|
if (!agg)
|
|
2035
2125
|
continue;
|
|
2036
|
-
const def = getScoreById(agg.id);
|
|
2037
|
-
const label =
|
|
2038
|
-
const formatted = def
|
|
2126
|
+
const def = agg.def ?? getScoreById(agg.id);
|
|
2127
|
+
const label = agg.name ?? def?.name ?? def?.id ?? agg.id;
|
|
2128
|
+
const formatted = def ? def.formatAggregate(agg.data) : "n/a";
|
|
2039
2129
|
const numeric = toNumericScore(agg.data);
|
|
2040
2130
|
const colored = numeric !== void 0 ? colorize(formatted, scoreToColor(numeric)) : formatted;
|
|
2041
2131
|
scoreLines.push(` ${label}: ${colored}`);
|
|
@@ -2053,7 +2143,7 @@ function createBar2(value, max = 100, width = 20) {
|
|
|
2053
2143
|
const filled = Math.round(safe / max * width);
|
|
2054
2144
|
return `${"\u2588".repeat(filled)}${"\u2591".repeat(width - filled)}`;
|
|
2055
2145
|
}
|
|
2056
|
-
function aggregateEvaluatorScoresFromEvents(events,
|
|
2146
|
+
function aggregateEvaluatorScoresFromEvents(events, _evaluatorNameById) {
|
|
2057
2147
|
if (events.length === 0)
|
|
2058
2148
|
return [];
|
|
2059
2149
|
const evaluatorIds = new Set(
|
|
@@ -2100,26 +2190,27 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
|
|
|
2100
2190
|
const passLabel = passed ? colorize("PASS", `${ansi2.bold}${ansi2.green}`) : colorize("FAIL", `${ansi2.bold}${ansi2.red}`);
|
|
2101
2191
|
const metricParts = [];
|
|
2102
2192
|
if (metrics && metrics.length > 0) {
|
|
2103
|
-
for (const
|
|
2104
|
-
const def = getMetricById(id);
|
|
2193
|
+
for (const m of metrics) {
|
|
2194
|
+
const def = getMetricById(m.id);
|
|
2105
2195
|
if (def) {
|
|
2106
|
-
const formatted = def.format(data, options);
|
|
2196
|
+
const formatted = def.format(m.data, options);
|
|
2197
|
+
const label = m.name ?? def.name;
|
|
2107
2198
|
metricParts.push(
|
|
2108
|
-
|
|
2199
|
+
label ? `[${label}: ${formatted}]` : `[${formatted}]`
|
|
2109
2200
|
);
|
|
2110
2201
|
}
|
|
2111
2202
|
}
|
|
2112
2203
|
}
|
|
2113
2204
|
const scoreLines = [];
|
|
2114
2205
|
for (const item of scores) {
|
|
2115
|
-
const def = getScoreById(item.id);
|
|
2116
|
-
const scoreLabel =
|
|
2206
|
+
const def = item.def ?? getScoreById(item.id);
|
|
2207
|
+
const scoreLabel = item.name ?? def?.name ?? def?.id ?? item.id;
|
|
2117
2208
|
let formatted;
|
|
2118
2209
|
if (!def) {
|
|
2119
2210
|
const numeric = toNumericScore(item.data);
|
|
2120
2211
|
formatted = numeric !== void 0 ? colorize(numeric.toFixed(2), scoreToColor(numeric)) : "n/a";
|
|
2121
2212
|
} else {
|
|
2122
|
-
const raw = def
|
|
2213
|
+
const raw = formatScoreData(def, item.data, options);
|
|
2123
2214
|
switch (def.displayStrategy) {
|
|
2124
2215
|
case "bar": {
|
|
2125
2216
|
const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
|
|
@@ -2271,7 +2362,6 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2271
2362
|
(s, e) => s + e.durationMs,
|
|
2272
2363
|
0
|
|
2273
2364
|
);
|
|
2274
|
-
existing.events.every((e) => e.passed);
|
|
2275
2365
|
const lines = [];
|
|
2276
2366
|
lines.push(
|
|
2277
2367
|
`${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.rerunIndex}/${event.rerunTotal})`, ansi2.cyan)} ${colorize(`(${durationMs}ms)`, ansi2.dim)}`
|
|
@@ -2349,18 +2439,19 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2349
2439
|
if (finalEvent.type === "RunFailed") {
|
|
2350
2440
|
throw new Error(`Run failed: ${finalEvent.errorMessage}`);
|
|
2351
2441
|
}
|
|
2442
|
+
const completed = finalEvent;
|
|
2352
2443
|
console.log("");
|
|
2353
2444
|
console.log(colorize("=== Run Summary ===", `${ansi2.bold}${ansi2.cyan}`));
|
|
2354
2445
|
console.log(
|
|
2355
2446
|
`- passed: ${colorize(
|
|
2356
|
-
`${
|
|
2447
|
+
`${completed.passedTestCases}/${completed.totalTestCases}`,
|
|
2357
2448
|
ansi2.green
|
|
2358
2449
|
)}`
|
|
2359
2450
|
);
|
|
2360
2451
|
console.log(
|
|
2361
2452
|
`- failed: ${colorize(
|
|
2362
|
-
`${
|
|
2363
|
-
|
|
2453
|
+
`${completed.failedTestCases}/${completed.totalTestCases}`,
|
|
2454
|
+
completed.failedTestCases > 0 ? ansi2.red : ansi2.dim
|
|
2364
2455
|
)}`
|
|
2365
2456
|
);
|
|
2366
2457
|
if (overallScoreCount > 0) {
|
|
@@ -2401,10 +2492,10 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2401
2492
|
);
|
|
2402
2493
|
continue;
|
|
2403
2494
|
}
|
|
2404
|
-
const scoreLabel = summary.isAggregated && summary.aggregatedScoreItem ?
|
|
2405
|
-
summary.aggregatedScoreItem.
|
|
2406
|
-
|
|
2407
|
-
)
|
|
2495
|
+
const scoreLabel = summary.isAggregated && summary.aggregatedScoreItem ? (() => {
|
|
2496
|
+
const def = summary.aggregatedScoreItem.def ?? getScoreById(summary.aggregatedScoreItem.id);
|
|
2497
|
+
return def ? def.formatAggregate(summary.aggregatedScoreItem.data) : summary.averageScore.toFixed(2);
|
|
2498
|
+
})() : summary.stdDev !== void 0 && summary.isAggregated ? `${summary.averageScore.toFixed(2)} \xB1 ${summary.stdDev.toFixed(2)}` : summary.averageScore.toFixed(2);
|
|
2408
2499
|
console.log(
|
|
2409
2500
|
` ${status} ${summary.name.padEnd(24)} score=${colorize(
|
|
2410
2501
|
scoreLabel,
|
|
@@ -2413,7 +2504,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2413
2504
|
);
|
|
2414
2505
|
}
|
|
2415
2506
|
}
|
|
2416
|
-
console.log(`- artifact: ${colorize(
|
|
2507
|
+
console.log(`- artifact: ${colorize(completed.artifactPath, ansi2.dim)}`);
|
|
2417
2508
|
}
|
|
2418
2509
|
async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern) {
|
|
2419
2510
|
return new Promise((resolve5, reject) => {
|