@m4trix/evals 0.18.0 → 0.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +91 -29
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +91 -29
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +61 -19
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +61 -19
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +58 -17
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +13 -8
- package/dist/index.js +58 -17
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli-simple.cjs
CHANGED
|
@@ -347,7 +347,11 @@ var Metric = {
|
|
|
347
347
|
name: config.name,
|
|
348
348
|
aggregate: config.aggregate,
|
|
349
349
|
format: config.format,
|
|
350
|
-
make: (data) => ({
|
|
350
|
+
make: (data, options) => ({
|
|
351
|
+
id: config.id,
|
|
352
|
+
data,
|
|
353
|
+
...options?.name !== void 0 && { name: options.name }
|
|
354
|
+
})
|
|
351
355
|
};
|
|
352
356
|
registry.set(config.id, def);
|
|
353
357
|
return def;
|
|
@@ -369,25 +373,61 @@ var ScoreAggregate = {
|
|
|
369
373
|
const count = values.length || 1;
|
|
370
374
|
const result = {};
|
|
371
375
|
for (const field of fields) {
|
|
372
|
-
result[field] = values.reduce(
|
|
376
|
+
result[field] = values.reduce(
|
|
377
|
+
(s, v) => s + (v[field] ?? 0),
|
|
378
|
+
0
|
|
379
|
+
) / count;
|
|
373
380
|
}
|
|
374
381
|
return result;
|
|
375
382
|
};
|
|
376
383
|
},
|
|
377
|
-
/** Average
|
|
378
|
-
averageWithVariance(
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
384
|
+
/** Average selected numeric fields, with sample std dev tracked for `value`. */
|
|
385
|
+
averageWithVariance(fields) {
|
|
386
|
+
return (values) => {
|
|
387
|
+
const count = values.length;
|
|
388
|
+
const result = {};
|
|
389
|
+
for (const field of fields) {
|
|
390
|
+
result[field] = count === 0 ? 0 : values.reduce(
|
|
391
|
+
(sum, item) => sum + (item[field] ?? 0),
|
|
392
|
+
0
|
|
393
|
+
) / count;
|
|
394
|
+
}
|
|
395
|
+
const valueField = "value";
|
|
396
|
+
const hasValueField = fields.includes(valueField);
|
|
397
|
+
if (count === 0) {
|
|
398
|
+
if (hasValueField) {
|
|
399
|
+
result[valueField] = 0;
|
|
400
|
+
}
|
|
401
|
+
return {
|
|
402
|
+
...result,
|
|
403
|
+
stdDev: void 0,
|
|
404
|
+
count: 0
|
|
405
|
+
};
|
|
406
|
+
}
|
|
407
|
+
let stdDev;
|
|
408
|
+
if (hasValueField && count >= 2) {
|
|
409
|
+
const sum = values.reduce(
|
|
410
|
+
(s, v) => s + (v[valueField] ?? 0),
|
|
411
|
+
0
|
|
412
|
+
);
|
|
413
|
+
const sumSq = values.reduce(
|
|
414
|
+
(s, v) => {
|
|
415
|
+
const value = v[valueField] ?? 0;
|
|
416
|
+
return s + value * value;
|
|
417
|
+
},
|
|
418
|
+
0
|
|
419
|
+
);
|
|
420
|
+
const mean = sum / count;
|
|
421
|
+
const variance = (sumSq - count * mean * mean) / (count - 1);
|
|
422
|
+
stdDev = variance > 0 ? Math.sqrt(variance) : 0;
|
|
423
|
+
}
|
|
424
|
+
return {
|
|
425
|
+
...values[0],
|
|
426
|
+
...result,
|
|
427
|
+
stdDev,
|
|
428
|
+
count
|
|
429
|
+
};
|
|
430
|
+
};
|
|
391
431
|
},
|
|
392
432
|
/** All runs must pass. Use for binary scores. */
|
|
393
433
|
all(values) {
|
|
@@ -421,6 +461,7 @@ var Score = {
|
|
|
421
461
|
id: config.id,
|
|
422
462
|
data,
|
|
423
463
|
...passed !== void 0 && { passed },
|
|
464
|
+
...options?.name !== void 0 && { name: options.name },
|
|
424
465
|
def
|
|
425
466
|
// Attach def so rendering/aggregation works without registry lookup
|
|
426
467
|
};
|
|
@@ -489,7 +530,7 @@ Score.of({
|
|
|
489
530
|
displayStrategy: "bar",
|
|
490
531
|
formatValue: (data) => data.value.toFixed(2),
|
|
491
532
|
formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
|
|
492
|
-
aggregateValues: Score.aggregate.averageWithVariance
|
|
533
|
+
aggregateValues: Score.aggregate.averageWithVariance(["value"])
|
|
493
534
|
});
|
|
494
535
|
Score.of({
|
|
495
536
|
id: "delta",
|
|
@@ -518,6 +559,14 @@ Score.of({
|
|
|
518
559
|
function getScoreDef(item) {
|
|
519
560
|
return item.def ?? getScoreById(item.id);
|
|
520
561
|
}
|
|
562
|
+
function lastNonEmptyName(items) {
|
|
563
|
+
for (let i = items.length - 1; i >= 0; i--) {
|
|
564
|
+
const n = items[i].name;
|
|
565
|
+
if (n != null && n.trim().length > 0)
|
|
566
|
+
return n;
|
|
567
|
+
}
|
|
568
|
+
return void 0;
|
|
569
|
+
}
|
|
521
570
|
function aggregateScoreItems(items) {
|
|
522
571
|
if (items.length === 0)
|
|
523
572
|
return void 0;
|
|
@@ -525,7 +574,13 @@ function aggregateScoreItems(items) {
|
|
|
525
574
|
if (!def?.aggregateValues)
|
|
526
575
|
return items[items.length - 1];
|
|
527
576
|
const aggregated = def.aggregateValues(items.map((i) => i.data));
|
|
528
|
-
|
|
577
|
+
const nameOverride = lastNonEmptyName(items);
|
|
578
|
+
return {
|
|
579
|
+
...items[0],
|
|
580
|
+
data: aggregated,
|
|
581
|
+
def,
|
|
582
|
+
...nameOverride !== void 0 && { name: nameOverride }
|
|
583
|
+
};
|
|
529
584
|
}
|
|
530
585
|
function aggregateMetricItems(items) {
|
|
531
586
|
if (items.length === 0)
|
|
@@ -534,7 +589,12 @@ function aggregateMetricItems(items) {
|
|
|
534
589
|
if (!def?.aggregate)
|
|
535
590
|
return items[items.length - 1];
|
|
536
591
|
const aggregated = def.aggregate(items.map((i) => i.data));
|
|
537
|
-
|
|
592
|
+
const nameOverride = lastNonEmptyName(items);
|
|
593
|
+
return {
|
|
594
|
+
...items[0],
|
|
595
|
+
data: aggregated,
|
|
596
|
+
...nameOverride !== void 0 && { name: nameOverride }
|
|
597
|
+
};
|
|
538
598
|
}
|
|
539
599
|
function toNumericScoreFromScores(scores) {
|
|
540
600
|
for (const item of scores) {
|
|
@@ -1786,9 +1846,10 @@ function RunView({
|
|
|
1786
1846
|
const formatted = def.format(m.data, {
|
|
1787
1847
|
isAggregated: tc.isAggregated
|
|
1788
1848
|
});
|
|
1849
|
+
const label = m.name ?? def.name;
|
|
1789
1850
|
return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
1790
1851
|
"[",
|
|
1791
|
-
|
|
1852
|
+
label ? `${label}: ` : "",
|
|
1792
1853
|
formatted,
|
|
1793
1854
|
"]",
|
|
1794
1855
|
" "
|
|
@@ -1797,8 +1858,8 @@ function RunView({
|
|
|
1797
1858
|
] }) : null
|
|
1798
1859
|
] }),
|
|
1799
1860
|
item.scores.length > 0 ? item.scores.map((s, idx) => {
|
|
1800
|
-
const def = getScoreById(s.id);
|
|
1801
|
-
const scoreLabel =
|
|
1861
|
+
const def = s.def ?? getScoreById(s.id);
|
|
1862
|
+
const scoreLabel = s.name ?? def?.name ?? def?.id ?? s.id;
|
|
1802
1863
|
return /* @__PURE__ */ jsxRuntime.jsxs(
|
|
1803
1864
|
ink.Text,
|
|
1804
1865
|
{
|
|
@@ -1902,7 +1963,7 @@ function RunView({
|
|
|
1902
1963
|
if (!aggregated)
|
|
1903
1964
|
return null;
|
|
1904
1965
|
const def = aggregated.def ?? getScoreById(aggregated.id);
|
|
1905
|
-
const label =
|
|
1966
|
+
const label = aggregated.name ?? def?.name ?? def?.id ?? aggregated.id;
|
|
1906
1967
|
const formatted = def ? def.formatAggregate(aggregated.data) : "n/a";
|
|
1907
1968
|
const numeric = toNumericScore(aggregated.data);
|
|
1908
1969
|
return /* @__PURE__ */ jsxRuntime.jsxs(
|
|
@@ -2063,7 +2124,7 @@ function getEvaluatorSummaryLines(evaluatorId, evaluatorName, aggregate, scoreIt
|
|
|
2063
2124
|
if (!agg)
|
|
2064
2125
|
continue;
|
|
2065
2126
|
const def = agg.def ?? getScoreById(agg.id);
|
|
2066
|
-
const label =
|
|
2127
|
+
const label = agg.name ?? def?.name ?? def?.id ?? agg.id;
|
|
2067
2128
|
const formatted = def ? def.formatAggregate(agg.data) : "n/a";
|
|
2068
2129
|
const numeric = toNumericScore(agg.data);
|
|
2069
2130
|
const colored = numeric !== void 0 ? colorize(formatted, scoreToColor(numeric)) : formatted;
|
|
@@ -2129,12 +2190,13 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
|
|
|
2129
2190
|
const passLabel = passed ? colorize("PASS", `${ansi2.bold}${ansi2.green}`) : colorize("FAIL", `${ansi2.bold}${ansi2.red}`);
|
|
2130
2191
|
const metricParts = [];
|
|
2131
2192
|
if (metrics && metrics.length > 0) {
|
|
2132
|
-
for (const
|
|
2133
|
-
const def = getMetricById(id);
|
|
2193
|
+
for (const m of metrics) {
|
|
2194
|
+
const def = getMetricById(m.id);
|
|
2134
2195
|
if (def) {
|
|
2135
|
-
const formatted = def.format(data, options);
|
|
2196
|
+
const formatted = def.format(m.data, options);
|
|
2197
|
+
const label = m.name ?? def.name;
|
|
2136
2198
|
metricParts.push(
|
|
2137
|
-
|
|
2199
|
+
label ? `[${label}: ${formatted}]` : `[${formatted}]`
|
|
2138
2200
|
);
|
|
2139
2201
|
}
|
|
2140
2202
|
}
|
|
@@ -2142,7 +2204,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
|
|
|
2142
2204
|
const scoreLines = [];
|
|
2143
2205
|
for (const item of scores) {
|
|
2144
2206
|
const def = item.def ?? getScoreById(item.id);
|
|
2145
|
-
const scoreLabel =
|
|
2207
|
+
const scoreLabel = item.name ?? def?.name ?? def?.id ?? item.id;
|
|
2146
2208
|
let formatted;
|
|
2147
2209
|
if (!def) {
|
|
2148
2210
|
const numeric = toNumericScore(item.data);
|