@m4trix/evals 0.17.0 → 0.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +179 -88
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +179 -88
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +124 -50
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +124 -50
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +120 -45
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +42 -6
- package/dist/index.js +119 -46
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli-simple.js
CHANGED
|
@@ -321,7 +321,11 @@ var Metric = {
|
|
|
321
321
|
name: config.name,
|
|
322
322
|
aggregate: config.aggregate,
|
|
323
323
|
format: config.format,
|
|
324
|
-
make: (data) => ({
|
|
324
|
+
make: (data, options) => ({
|
|
325
|
+
id: config.id,
|
|
326
|
+
data,
|
|
327
|
+
...options?.name !== void 0 && { name: options.name }
|
|
328
|
+
})
|
|
325
329
|
};
|
|
326
330
|
registry.set(config.id, def);
|
|
327
331
|
return def;
|
|
@@ -333,20 +337,107 @@ function getMetricById(id) {
|
|
|
333
337
|
|
|
334
338
|
// src/evals/score.ts
|
|
335
339
|
var registry2 = /* @__PURE__ */ new Map();
|
|
340
|
+
function formatScoreData(def, data, options) {
|
|
341
|
+
return options?.isAggregated ? def.formatAggregate(data) : def.formatValue(data);
|
|
342
|
+
}
|
|
343
|
+
var ScoreAggregate = {
|
|
344
|
+
/** Average numeric fields. Use for scores like { value, delta }. */
|
|
345
|
+
averageFields(fields) {
|
|
346
|
+
return (values) => {
|
|
347
|
+
const count = values.length || 1;
|
|
348
|
+
const result = {};
|
|
349
|
+
for (const field of fields) {
|
|
350
|
+
result[field] = values.reduce(
|
|
351
|
+
(s, v) => s + (v[field] ?? 0),
|
|
352
|
+
0
|
|
353
|
+
) / count;
|
|
354
|
+
}
|
|
355
|
+
return result;
|
|
356
|
+
};
|
|
357
|
+
},
|
|
358
|
+
/** Average selected numeric fields, with sample std dev tracked for `value`. */
|
|
359
|
+
averageWithVariance(fields) {
|
|
360
|
+
return (values) => {
|
|
361
|
+
const count = values.length;
|
|
362
|
+
const result = {};
|
|
363
|
+
for (const field of fields) {
|
|
364
|
+
result[field] = count === 0 ? 0 : values.reduce(
|
|
365
|
+
(sum, item) => sum + (item[field] ?? 0),
|
|
366
|
+
0
|
|
367
|
+
) / count;
|
|
368
|
+
}
|
|
369
|
+
const valueField = "value";
|
|
370
|
+
const hasValueField = fields.includes(valueField);
|
|
371
|
+
if (count === 0) {
|
|
372
|
+
if (hasValueField) {
|
|
373
|
+
result[valueField] = 0;
|
|
374
|
+
}
|
|
375
|
+
return {
|
|
376
|
+
...result,
|
|
377
|
+
stdDev: void 0,
|
|
378
|
+
count: 0
|
|
379
|
+
};
|
|
380
|
+
}
|
|
381
|
+
let stdDev;
|
|
382
|
+
if (hasValueField && count >= 2) {
|
|
383
|
+
const sum = values.reduce(
|
|
384
|
+
(s, v) => s + (v[valueField] ?? 0),
|
|
385
|
+
0
|
|
386
|
+
);
|
|
387
|
+
const sumSq = values.reduce(
|
|
388
|
+
(s, v) => {
|
|
389
|
+
const value = v[valueField] ?? 0;
|
|
390
|
+
return s + value * value;
|
|
391
|
+
},
|
|
392
|
+
0
|
|
393
|
+
);
|
|
394
|
+
const mean = sum / count;
|
|
395
|
+
const variance = (sumSq - count * mean * mean) / (count - 1);
|
|
396
|
+
stdDev = variance > 0 ? Math.sqrt(variance) : 0;
|
|
397
|
+
}
|
|
398
|
+
return {
|
|
399
|
+
...values[0],
|
|
400
|
+
...result,
|
|
401
|
+
stdDev,
|
|
402
|
+
count
|
|
403
|
+
};
|
|
404
|
+
};
|
|
405
|
+
},
|
|
406
|
+
/** All runs must pass. Use for binary scores. */
|
|
407
|
+
all(values) {
|
|
408
|
+
const total = values.length;
|
|
409
|
+
const passedCount = values.filter((v) => v.passed).length;
|
|
410
|
+
return {
|
|
411
|
+
...values[0],
|
|
412
|
+
passed: total > 0 && values.every((v) => v.passed),
|
|
413
|
+
passedCount,
|
|
414
|
+
totalCount: total
|
|
415
|
+
};
|
|
416
|
+
},
|
|
417
|
+
/** Take last value (no aggregation). Use when aggregation is not meaningful. */
|
|
418
|
+
last(values) {
|
|
419
|
+
return values[values.length - 1] ?? {};
|
|
420
|
+
}
|
|
421
|
+
};
|
|
336
422
|
var Score = {
|
|
423
|
+
aggregate: ScoreAggregate,
|
|
337
424
|
of(config) {
|
|
338
425
|
const def = {
|
|
339
426
|
id: config.id,
|
|
340
427
|
name: config.name,
|
|
341
428
|
displayStrategy: config.displayStrategy,
|
|
342
|
-
|
|
343
|
-
|
|
429
|
+
formatValue: config.formatValue,
|
|
430
|
+
formatAggregate: config.formatAggregate,
|
|
431
|
+
aggregateValues: config.aggregateValues,
|
|
344
432
|
make: (data, options) => {
|
|
345
433
|
const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
|
|
346
434
|
return {
|
|
347
435
|
id: config.id,
|
|
348
436
|
data,
|
|
349
|
-
...passed !== void 0 && { passed }
|
|
437
|
+
...passed !== void 0 && { passed },
|
|
438
|
+
...options?.name !== void 0 && { name: options.name },
|
|
439
|
+
def
|
|
440
|
+
// Attach def so rendering/aggregation works without registry lookup
|
|
350
441
|
};
|
|
351
442
|
}
|
|
352
443
|
};
|
|
@@ -359,29 +450,6 @@ function getScoreById(id) {
|
|
|
359
450
|
}
|
|
360
451
|
|
|
361
452
|
// src/evals/aggregators.ts
|
|
362
|
-
function aggregateAverageWithVariance(values) {
|
|
363
|
-
if (values.length === 0) {
|
|
364
|
-
return { value: 0, count: 0 };
|
|
365
|
-
}
|
|
366
|
-
const sum = values.reduce((s, v) => s + v.value, 0);
|
|
367
|
-
const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
|
|
368
|
-
const mean = sum / values.length;
|
|
369
|
-
let stdDev;
|
|
370
|
-
if (values.length >= 2) {
|
|
371
|
-
const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
|
|
372
|
-
stdDev = variance > 0 ? Math.sqrt(variance) : 0;
|
|
373
|
-
}
|
|
374
|
-
return { value: mean, stdDev, count: values.length };
|
|
375
|
-
}
|
|
376
|
-
function aggregateAll(values) {
|
|
377
|
-
const total = values.length;
|
|
378
|
-
const passedCount = values.filter((v) => v.passed).length;
|
|
379
|
-
return {
|
|
380
|
-
passed: total > 0 && values.every((v) => v.passed),
|
|
381
|
-
passedCount,
|
|
382
|
-
totalCount: total
|
|
383
|
-
};
|
|
384
|
-
}
|
|
385
453
|
function aggregateTokenCountSum(values) {
|
|
386
454
|
const initial = {
|
|
387
455
|
input: 0,
|
|
@@ -434,40 +502,59 @@ Score.of({
|
|
|
434
502
|
id: "percent",
|
|
435
503
|
name: "Score",
|
|
436
504
|
displayStrategy: "bar",
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
505
|
+
formatValue: (data) => data.value.toFixed(2),
|
|
506
|
+
formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
|
|
507
|
+
aggregateValues: Score.aggregate.averageWithVariance(["value"])
|
|
508
|
+
});
|
|
509
|
+
Score.of({
|
|
510
|
+
id: "delta",
|
|
511
|
+
name: "Delta",
|
|
512
|
+
displayStrategy: "number",
|
|
513
|
+
formatValue: (data) => `${data.value.toFixed(2)} (${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)} vs baseline)`,
|
|
514
|
+
formatAggregate: (data) => `Avg: ${data.value.toFixed(2)} (Delta: ${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)})`,
|
|
515
|
+
aggregateValues: Score.aggregate.averageFields(["value", "delta"])
|
|
444
516
|
});
|
|
445
517
|
Score.of({
|
|
446
518
|
id: "binary",
|
|
447
519
|
name: "Result",
|
|
448
520
|
displayStrategy: "passFail",
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
}
|
|
455
|
-
return base;
|
|
521
|
+
formatValue: (data) => data.passed ? "PASSED" : "NOT PASSED",
|
|
522
|
+
formatAggregate: (data) => {
|
|
523
|
+
const base = data.passed ? "All: PASSED" : "Some: FAILED";
|
|
524
|
+
if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
|
|
525
|
+
return `${base} (${data.passedCount}/${data.totalCount})`;
|
|
456
526
|
}
|
|
457
|
-
return
|
|
527
|
+
return base;
|
|
458
528
|
},
|
|
459
|
-
|
|
529
|
+
aggregateValues: Score.aggregate.all
|
|
460
530
|
});
|
|
461
531
|
|
|
462
532
|
// src/runner/score-utils.ts
|
|
533
|
+
function getScoreDef(item) {
|
|
534
|
+
return item.def ?? getScoreById(item.id);
|
|
535
|
+
}
|
|
536
|
+
function lastNonEmptyName(items) {
|
|
537
|
+
for (let i = items.length - 1; i >= 0; i--) {
|
|
538
|
+
const n = items[i].name;
|
|
539
|
+
if (n != null && n.trim().length > 0)
|
|
540
|
+
return n;
|
|
541
|
+
}
|
|
542
|
+
return void 0;
|
|
543
|
+
}
|
|
463
544
|
function aggregateScoreItems(items) {
|
|
464
545
|
if (items.length === 0)
|
|
465
546
|
return void 0;
|
|
466
|
-
const def =
|
|
467
|
-
if (!def?.
|
|
547
|
+
const def = getScoreDef(items[0]);
|
|
548
|
+
if (!def?.aggregateValues)
|
|
468
549
|
return items[items.length - 1];
|
|
469
|
-
const aggregated = def.
|
|
470
|
-
|
|
550
|
+
const aggregated = def.aggregateValues(items.map((i) => i.data));
|
|
551
|
+
const nameOverride = lastNonEmptyName(items);
|
|
552
|
+
return {
|
|
553
|
+
...items[0],
|
|
554
|
+
data: aggregated,
|
|
555
|
+
def,
|
|
556
|
+
...nameOverride !== void 0 && { name: nameOverride }
|
|
557
|
+
};
|
|
471
558
|
}
|
|
472
559
|
function aggregateMetricItems(items) {
|
|
473
560
|
if (items.length === 0)
|
|
@@ -476,11 +563,16 @@ function aggregateMetricItems(items) {
|
|
|
476
563
|
if (!def?.aggregate)
|
|
477
564
|
return items[items.length - 1];
|
|
478
565
|
const aggregated = def.aggregate(items.map((i) => i.data));
|
|
479
|
-
|
|
566
|
+
const nameOverride = lastNonEmptyName(items);
|
|
567
|
+
return {
|
|
568
|
+
...items[0],
|
|
569
|
+
data: aggregated,
|
|
570
|
+
...nameOverride !== void 0 && { name: nameOverride }
|
|
571
|
+
};
|
|
480
572
|
}
|
|
481
573
|
function toNumericScoreFromScores(scores) {
|
|
482
574
|
for (const item of scores) {
|
|
483
|
-
const def =
|
|
575
|
+
const def = getScoreDef(item);
|
|
484
576
|
if (def && def.displayStrategy === "bar" && typeof item.data === "object" && item.data !== null && "value" in item.data) {
|
|
485
577
|
const value = item.data.value;
|
|
486
578
|
if (typeof value === "number" && Number.isFinite(value)) {
|
|
@@ -868,7 +960,7 @@ var createPersistenceWorker = (queue) => Effect.forever(
|
|
|
868
960
|
() => appendJsonLine(message.artifactPath, {
|
|
869
961
|
runId: message.runId,
|
|
870
962
|
ts: Date.now(),
|
|
871
|
-
...message.payload
|
|
963
|
+
...typeof message.payload === "object" && message.payload !== null && !Array.isArray(message.payload) ? message.payload : {}
|
|
872
964
|
})
|
|
873
965
|
);
|
|
874
966
|
})
|
|
@@ -1454,7 +1546,7 @@ function aggregateEvaluatorScores(events, nameById) {
|
|
|
1454
1546
|
if (agg)
|
|
1455
1547
|
aggregatedScores.push(agg);
|
|
1456
1548
|
}
|
|
1457
|
-
const aggregatedMetrics = Array.from(metricIdToItems.entries()).map(([
|
|
1549
|
+
const aggregatedMetrics = Array.from(metricIdToItems.entries()).map(([, items]) => aggregateMetricItems(items)).filter((m) => m !== void 0);
|
|
1458
1550
|
const passed = events.every((ev) => {
|
|
1459
1551
|
const es = ev.evaluatorScores.find((x) => x.evaluatorId === evaluatorId);
|
|
1460
1552
|
return es?.passed ?? false;
|
|
@@ -1474,13 +1566,13 @@ function aggregateEvaluatorScores(events, nameById) {
|
|
|
1474
1566
|
}
|
|
1475
1567
|
return result;
|
|
1476
1568
|
}
|
|
1477
|
-
function formatScorePart(item,
|
|
1478
|
-
const def = getScoreById(item.id);
|
|
1569
|
+
function formatScorePart(item, _scoreToColor, options) {
|
|
1570
|
+
const def = item.def ?? getScoreById(item.id);
|
|
1479
1571
|
if (!def) {
|
|
1480
1572
|
const numeric = toNumericScore(item.data);
|
|
1481
1573
|
return numeric !== void 0 ? `${numeric.toFixed(2)}` : "n/a";
|
|
1482
1574
|
}
|
|
1483
|
-
const formatted = def
|
|
1575
|
+
const formatted = formatScoreData(def, item.data, options);
|
|
1484
1576
|
if (def.displayStrategy === "bar") {
|
|
1485
1577
|
const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
|
|
1486
1578
|
if (typeof numeric === "number" && Number.isFinite(numeric)) {
|
|
@@ -1538,8 +1630,6 @@ function RunView({
|
|
|
1538
1630
|
const done = new Promise((resolve5) => {
|
|
1539
1631
|
const unsubscribe = runner.subscribeRunEvents((event) => {
|
|
1540
1632
|
if (event.type === "TestCaseProgress") {
|
|
1541
|
-
const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
|
|
1542
|
-
numericScores.length > 0 ? numericScores.reduce((sum, v) => sum + v, 0) / numericScores.length : void 0;
|
|
1543
1633
|
for (const item of event.evaluatorScores) {
|
|
1544
1634
|
const numeric = toNumericScoreFromScores(item.scores);
|
|
1545
1635
|
if (numeric !== void 0) {
|
|
@@ -1629,16 +1719,17 @@ function RunView({
|
|
|
1629
1719
|
onComplete(new Error(`Run failed: ${finalEvent.errorMessage}`));
|
|
1630
1720
|
return;
|
|
1631
1721
|
}
|
|
1722
|
+
const completed = finalEvent;
|
|
1632
1723
|
setSummary({
|
|
1633
|
-
passedTestCases:
|
|
1634
|
-
failedTestCases:
|
|
1635
|
-
totalTestCases:
|
|
1724
|
+
passedTestCases: completed.passedTestCases,
|
|
1725
|
+
failedTestCases: completed.failedTestCases,
|
|
1726
|
+
totalTestCases: completed.totalTestCases,
|
|
1636
1727
|
overallScoreTotal,
|
|
1637
1728
|
overallScoreSumSq,
|
|
1638
1729
|
overallScoreCount,
|
|
1639
1730
|
aggregates: new Map(aggregates),
|
|
1640
1731
|
scoreItemsByEvaluatorScore: new Map(scoreItemsByEvaluatorScore),
|
|
1641
|
-
artifactPath:
|
|
1732
|
+
artifactPath: completed.artifactPath
|
|
1642
1733
|
});
|
|
1643
1734
|
setPhase("completed");
|
|
1644
1735
|
setTimeout(() => onComplete(), 200);
|
|
@@ -1729,9 +1820,10 @@ function RunView({
|
|
|
1729
1820
|
const formatted = def.format(m.data, {
|
|
1730
1821
|
isAggregated: tc.isAggregated
|
|
1731
1822
|
});
|
|
1823
|
+
const label = m.name ?? def.name;
|
|
1732
1824
|
return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
1733
1825
|
"[",
|
|
1734
|
-
|
|
1826
|
+
label ? `${label}: ` : "",
|
|
1735
1827
|
formatted,
|
|
1736
1828
|
"]",
|
|
1737
1829
|
" "
|
|
@@ -1740,8 +1832,8 @@ function RunView({
|
|
|
1740
1832
|
] }) : null
|
|
1741
1833
|
] }),
|
|
1742
1834
|
item.scores.length > 0 ? item.scores.map((s, idx) => {
|
|
1743
|
-
const def = getScoreById(s.id);
|
|
1744
|
-
const scoreLabel =
|
|
1835
|
+
const def = s.def ?? getScoreById(s.id);
|
|
1836
|
+
const scoreLabel = s.name ?? def?.name ?? def?.id ?? s.id;
|
|
1745
1837
|
return /* @__PURE__ */ jsxs(
|
|
1746
1838
|
Text,
|
|
1747
1839
|
{
|
|
@@ -1844,11 +1936,9 @@ function RunView({
|
|
|
1844
1936
|
const aggregated = aggregateScoreItems(items);
|
|
1845
1937
|
if (!aggregated)
|
|
1846
1938
|
return null;
|
|
1847
|
-
const def = getScoreById(aggregated.id);
|
|
1848
|
-
const label =
|
|
1849
|
-
const formatted = def
|
|
1850
|
-
isAggregated: true
|
|
1851
|
-
}) ?? "n/a";
|
|
1939
|
+
const def = aggregated.def ?? getScoreById(aggregated.id);
|
|
1940
|
+
const label = aggregated.name ?? def?.name ?? def?.id ?? aggregated.id;
|
|
1941
|
+
const formatted = def ? def.formatAggregate(aggregated.data) : "n/a";
|
|
1852
1942
|
const numeric = toNumericScore(aggregated.data);
|
|
1853
1943
|
return /* @__PURE__ */ jsxs(
|
|
1854
1944
|
Text,
|
|
@@ -2007,9 +2097,9 @@ function getEvaluatorSummaryLines(evaluatorId, evaluatorName, aggregate, scoreIt
|
|
|
2007
2097
|
const agg = aggregateScoreItems(items);
|
|
2008
2098
|
if (!agg)
|
|
2009
2099
|
continue;
|
|
2010
|
-
const def = getScoreById(agg.id);
|
|
2011
|
-
const label =
|
|
2012
|
-
const formatted = def
|
|
2100
|
+
const def = agg.def ?? getScoreById(agg.id);
|
|
2101
|
+
const label = agg.name ?? def?.name ?? def?.id ?? agg.id;
|
|
2102
|
+
const formatted = def ? def.formatAggregate(agg.data) : "n/a";
|
|
2013
2103
|
const numeric = toNumericScore(agg.data);
|
|
2014
2104
|
const colored = numeric !== void 0 ? colorize(formatted, scoreToColor(numeric)) : formatted;
|
|
2015
2105
|
scoreLines.push(` ${label}: ${colored}`);
|
|
@@ -2027,7 +2117,7 @@ function createBar2(value, max = 100, width = 20) {
|
|
|
2027
2117
|
const filled = Math.round(safe / max * width);
|
|
2028
2118
|
return `${"\u2588".repeat(filled)}${"\u2591".repeat(width - filled)}`;
|
|
2029
2119
|
}
|
|
2030
|
-
function aggregateEvaluatorScoresFromEvents(events,
|
|
2120
|
+
function aggregateEvaluatorScoresFromEvents(events, _evaluatorNameById) {
|
|
2031
2121
|
if (events.length === 0)
|
|
2032
2122
|
return [];
|
|
2033
2123
|
const evaluatorIds = new Set(
|
|
@@ -2074,26 +2164,27 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
|
|
|
2074
2164
|
const passLabel = passed ? colorize("PASS", `${ansi2.bold}${ansi2.green}`) : colorize("FAIL", `${ansi2.bold}${ansi2.red}`);
|
|
2075
2165
|
const metricParts = [];
|
|
2076
2166
|
if (metrics && metrics.length > 0) {
|
|
2077
|
-
for (const
|
|
2078
|
-
const def = getMetricById(id);
|
|
2167
|
+
for (const m of metrics) {
|
|
2168
|
+
const def = getMetricById(m.id);
|
|
2079
2169
|
if (def) {
|
|
2080
|
-
const formatted = def.format(data, options);
|
|
2170
|
+
const formatted = def.format(m.data, options);
|
|
2171
|
+
const label = m.name ?? def.name;
|
|
2081
2172
|
metricParts.push(
|
|
2082
|
-
|
|
2173
|
+
label ? `[${label}: ${formatted}]` : `[${formatted}]`
|
|
2083
2174
|
);
|
|
2084
2175
|
}
|
|
2085
2176
|
}
|
|
2086
2177
|
}
|
|
2087
2178
|
const scoreLines = [];
|
|
2088
2179
|
for (const item of scores) {
|
|
2089
|
-
const def = getScoreById(item.id);
|
|
2090
|
-
const scoreLabel =
|
|
2180
|
+
const def = item.def ?? getScoreById(item.id);
|
|
2181
|
+
const scoreLabel = item.name ?? def?.name ?? def?.id ?? item.id;
|
|
2091
2182
|
let formatted;
|
|
2092
2183
|
if (!def) {
|
|
2093
2184
|
const numeric = toNumericScore(item.data);
|
|
2094
2185
|
formatted = numeric !== void 0 ? colorize(numeric.toFixed(2), scoreToColor(numeric)) : "n/a";
|
|
2095
2186
|
} else {
|
|
2096
|
-
const raw = def
|
|
2187
|
+
const raw = formatScoreData(def, item.data, options);
|
|
2097
2188
|
switch (def.displayStrategy) {
|
|
2098
2189
|
case "bar": {
|
|
2099
2190
|
const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
|
|
@@ -2245,7 +2336,6 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2245
2336
|
(s, e) => s + e.durationMs,
|
|
2246
2337
|
0
|
|
2247
2338
|
);
|
|
2248
|
-
existing.events.every((e) => e.passed);
|
|
2249
2339
|
const lines = [];
|
|
2250
2340
|
lines.push(
|
|
2251
2341
|
`${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.rerunIndex}/${event.rerunTotal})`, ansi2.cyan)} ${colorize(`(${durationMs}ms)`, ansi2.dim)}`
|
|
@@ -2323,18 +2413,19 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2323
2413
|
if (finalEvent.type === "RunFailed") {
|
|
2324
2414
|
throw new Error(`Run failed: ${finalEvent.errorMessage}`);
|
|
2325
2415
|
}
|
|
2416
|
+
const completed = finalEvent;
|
|
2326
2417
|
console.log("");
|
|
2327
2418
|
console.log(colorize("=== Run Summary ===", `${ansi2.bold}${ansi2.cyan}`));
|
|
2328
2419
|
console.log(
|
|
2329
2420
|
`- passed: ${colorize(
|
|
2330
|
-
`${
|
|
2421
|
+
`${completed.passedTestCases}/${completed.totalTestCases}`,
|
|
2331
2422
|
ansi2.green
|
|
2332
2423
|
)}`
|
|
2333
2424
|
);
|
|
2334
2425
|
console.log(
|
|
2335
2426
|
`- failed: ${colorize(
|
|
2336
|
-
`${
|
|
2337
|
-
|
|
2427
|
+
`${completed.failedTestCases}/${completed.totalTestCases}`,
|
|
2428
|
+
completed.failedTestCases > 0 ? ansi2.red : ansi2.dim
|
|
2338
2429
|
)}`
|
|
2339
2430
|
);
|
|
2340
2431
|
if (overallScoreCount > 0) {
|
|
@@ -2375,10 +2466,10 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2375
2466
|
);
|
|
2376
2467
|
continue;
|
|
2377
2468
|
}
|
|
2378
|
-
const scoreLabel = summary.isAggregated && summary.aggregatedScoreItem ?
|
|
2379
|
-
summary.aggregatedScoreItem.
|
|
2380
|
-
|
|
2381
|
-
)
|
|
2469
|
+
const scoreLabel = summary.isAggregated && summary.aggregatedScoreItem ? (() => {
|
|
2470
|
+
const def = summary.aggregatedScoreItem.def ?? getScoreById(summary.aggregatedScoreItem.id);
|
|
2471
|
+
return def ? def.formatAggregate(summary.aggregatedScoreItem.data) : summary.averageScore.toFixed(2);
|
|
2472
|
+
})() : summary.stdDev !== void 0 && summary.isAggregated ? `${summary.averageScore.toFixed(2)} \xB1 ${summary.stdDev.toFixed(2)}` : summary.averageScore.toFixed(2);
|
|
2382
2473
|
console.log(
|
|
2383
2474
|
` ${status} ${summary.name.padEnd(24)} score=${colorize(
|
|
2384
2475
|
scoreLabel,
|
|
@@ -2387,7 +2478,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2387
2478
|
);
|
|
2388
2479
|
}
|
|
2389
2480
|
}
|
|
2390
|
-
console.log(`- artifact: ${colorize(
|
|
2481
|
+
console.log(`- artifact: ${colorize(completed.artifactPath, ansi2.dim)}`);
|
|
2391
2482
|
}
|
|
2392
2483
|
async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern) {
|
|
2393
2484
|
return new Promise((resolve5, reject) => {
|