@m4trix/evals 0.18.0 → 0.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +91 -29
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +91 -29
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +61 -19
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +61 -19
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +58 -17
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +13 -8
- package/dist/index.js +58 -17
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli-simple.js
CHANGED
|
@@ -321,7 +321,11 @@ var Metric = {
|
|
|
321
321
|
name: config.name,
|
|
322
322
|
aggregate: config.aggregate,
|
|
323
323
|
format: config.format,
|
|
324
|
-
make: (data) => ({
|
|
324
|
+
make: (data, options) => ({
|
|
325
|
+
id: config.id,
|
|
326
|
+
data,
|
|
327
|
+
...options?.name !== void 0 && { name: options.name }
|
|
328
|
+
})
|
|
325
329
|
};
|
|
326
330
|
registry.set(config.id, def);
|
|
327
331
|
return def;
|
|
@@ -343,25 +347,61 @@ var ScoreAggregate = {
|
|
|
343
347
|
const count = values.length || 1;
|
|
344
348
|
const result = {};
|
|
345
349
|
for (const field of fields) {
|
|
346
|
-
result[field] = values.reduce(
|
|
350
|
+
result[field] = values.reduce(
|
|
351
|
+
(s, v) => s + (v[field] ?? 0),
|
|
352
|
+
0
|
|
353
|
+
) / count;
|
|
347
354
|
}
|
|
348
355
|
return result;
|
|
349
356
|
};
|
|
350
357
|
},
|
|
351
|
-
/** Average
|
|
352
|
-
averageWithVariance(
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
358
|
+
/** Average selected numeric fields, with sample std dev tracked for `value`. */
|
|
359
|
+
averageWithVariance(fields) {
|
|
360
|
+
return (values) => {
|
|
361
|
+
const count = values.length;
|
|
362
|
+
const result = {};
|
|
363
|
+
for (const field of fields) {
|
|
364
|
+
result[field] = count === 0 ? 0 : values.reduce(
|
|
365
|
+
(sum, item) => sum + (item[field] ?? 0),
|
|
366
|
+
0
|
|
367
|
+
) / count;
|
|
368
|
+
}
|
|
369
|
+
const valueField = "value";
|
|
370
|
+
const hasValueField = fields.includes(valueField);
|
|
371
|
+
if (count === 0) {
|
|
372
|
+
if (hasValueField) {
|
|
373
|
+
result[valueField] = 0;
|
|
374
|
+
}
|
|
375
|
+
return {
|
|
376
|
+
...result,
|
|
377
|
+
stdDev: void 0,
|
|
378
|
+
count: 0
|
|
379
|
+
};
|
|
380
|
+
}
|
|
381
|
+
let stdDev;
|
|
382
|
+
if (hasValueField && count >= 2) {
|
|
383
|
+
const sum = values.reduce(
|
|
384
|
+
(s, v) => s + (v[valueField] ?? 0),
|
|
385
|
+
0
|
|
386
|
+
);
|
|
387
|
+
const sumSq = values.reduce(
|
|
388
|
+
(s, v) => {
|
|
389
|
+
const value = v[valueField] ?? 0;
|
|
390
|
+
return s + value * value;
|
|
391
|
+
},
|
|
392
|
+
0
|
|
393
|
+
);
|
|
394
|
+
const mean = sum / count;
|
|
395
|
+
const variance = (sumSq - count * mean * mean) / (count - 1);
|
|
396
|
+
stdDev = variance > 0 ? Math.sqrt(variance) : 0;
|
|
397
|
+
}
|
|
398
|
+
return {
|
|
399
|
+
...values[0],
|
|
400
|
+
...result,
|
|
401
|
+
stdDev,
|
|
402
|
+
count
|
|
403
|
+
};
|
|
404
|
+
};
|
|
365
405
|
},
|
|
366
406
|
/** All runs must pass. Use for binary scores. */
|
|
367
407
|
all(values) {
|
|
@@ -395,6 +435,7 @@ var Score = {
|
|
|
395
435
|
id: config.id,
|
|
396
436
|
data,
|
|
397
437
|
...passed !== void 0 && { passed },
|
|
438
|
+
...options?.name !== void 0 && { name: options.name },
|
|
398
439
|
def
|
|
399
440
|
// Attach def so rendering/aggregation works without registry lookup
|
|
400
441
|
};
|
|
@@ -463,7 +504,7 @@ Score.of({
|
|
|
463
504
|
displayStrategy: "bar",
|
|
464
505
|
formatValue: (data) => data.value.toFixed(2),
|
|
465
506
|
formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
|
|
466
|
-
aggregateValues: Score.aggregate.averageWithVariance
|
|
507
|
+
aggregateValues: Score.aggregate.averageWithVariance(["value"])
|
|
467
508
|
});
|
|
468
509
|
Score.of({
|
|
469
510
|
id: "delta",
|
|
@@ -492,6 +533,14 @@ Score.of({
|
|
|
492
533
|
function getScoreDef(item) {
|
|
493
534
|
return item.def ?? getScoreById(item.id);
|
|
494
535
|
}
|
|
536
|
+
function lastNonEmptyName(items) {
|
|
537
|
+
for (let i = items.length - 1; i >= 0; i--) {
|
|
538
|
+
const n = items[i].name;
|
|
539
|
+
if (n != null && n.trim().length > 0)
|
|
540
|
+
return n;
|
|
541
|
+
}
|
|
542
|
+
return void 0;
|
|
543
|
+
}
|
|
495
544
|
function aggregateScoreItems(items) {
|
|
496
545
|
if (items.length === 0)
|
|
497
546
|
return void 0;
|
|
@@ -499,7 +548,13 @@ function aggregateScoreItems(items) {
|
|
|
499
548
|
if (!def?.aggregateValues)
|
|
500
549
|
return items[items.length - 1];
|
|
501
550
|
const aggregated = def.aggregateValues(items.map((i) => i.data));
|
|
502
|
-
|
|
551
|
+
const nameOverride = lastNonEmptyName(items);
|
|
552
|
+
return {
|
|
553
|
+
...items[0],
|
|
554
|
+
data: aggregated,
|
|
555
|
+
def,
|
|
556
|
+
...nameOverride !== void 0 && { name: nameOverride }
|
|
557
|
+
};
|
|
503
558
|
}
|
|
504
559
|
function aggregateMetricItems(items) {
|
|
505
560
|
if (items.length === 0)
|
|
@@ -508,7 +563,12 @@ function aggregateMetricItems(items) {
|
|
|
508
563
|
if (!def?.aggregate)
|
|
509
564
|
return items[items.length - 1];
|
|
510
565
|
const aggregated = def.aggregate(items.map((i) => i.data));
|
|
511
|
-
|
|
566
|
+
const nameOverride = lastNonEmptyName(items);
|
|
567
|
+
return {
|
|
568
|
+
...items[0],
|
|
569
|
+
data: aggregated,
|
|
570
|
+
...nameOverride !== void 0 && { name: nameOverride }
|
|
571
|
+
};
|
|
512
572
|
}
|
|
513
573
|
function toNumericScoreFromScores(scores) {
|
|
514
574
|
for (const item of scores) {
|
|
@@ -1760,9 +1820,10 @@ function RunView({
|
|
|
1760
1820
|
const formatted = def.format(m.data, {
|
|
1761
1821
|
isAggregated: tc.isAggregated
|
|
1762
1822
|
});
|
|
1823
|
+
const label = m.name ?? def.name;
|
|
1763
1824
|
return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
1764
1825
|
"[",
|
|
1765
|
-
|
|
1826
|
+
label ? `${label}: ` : "",
|
|
1766
1827
|
formatted,
|
|
1767
1828
|
"]",
|
|
1768
1829
|
" "
|
|
@@ -1771,8 +1832,8 @@ function RunView({
|
|
|
1771
1832
|
] }) : null
|
|
1772
1833
|
] }),
|
|
1773
1834
|
item.scores.length > 0 ? item.scores.map((s, idx) => {
|
|
1774
|
-
const def = getScoreById(s.id);
|
|
1775
|
-
const scoreLabel =
|
|
1835
|
+
const def = s.def ?? getScoreById(s.id);
|
|
1836
|
+
const scoreLabel = s.name ?? def?.name ?? def?.id ?? s.id;
|
|
1776
1837
|
return /* @__PURE__ */ jsxs(
|
|
1777
1838
|
Text,
|
|
1778
1839
|
{
|
|
@@ -1876,7 +1937,7 @@ function RunView({
|
|
|
1876
1937
|
if (!aggregated)
|
|
1877
1938
|
return null;
|
|
1878
1939
|
const def = aggregated.def ?? getScoreById(aggregated.id);
|
|
1879
|
-
const label =
|
|
1940
|
+
const label = aggregated.name ?? def?.name ?? def?.id ?? aggregated.id;
|
|
1880
1941
|
const formatted = def ? def.formatAggregate(aggregated.data) : "n/a";
|
|
1881
1942
|
const numeric = toNumericScore(aggregated.data);
|
|
1882
1943
|
return /* @__PURE__ */ jsxs(
|
|
@@ -2037,7 +2098,7 @@ function getEvaluatorSummaryLines(evaluatorId, evaluatorName, aggregate, scoreIt
|
|
|
2037
2098
|
if (!agg)
|
|
2038
2099
|
continue;
|
|
2039
2100
|
const def = agg.def ?? getScoreById(agg.id);
|
|
2040
|
-
const label =
|
|
2101
|
+
const label = agg.name ?? def?.name ?? def?.id ?? agg.id;
|
|
2041
2102
|
const formatted = def ? def.formatAggregate(agg.data) : "n/a";
|
|
2042
2103
|
const numeric = toNumericScore(agg.data);
|
|
2043
2104
|
const colored = numeric !== void 0 ? colorize(formatted, scoreToColor(numeric)) : formatted;
|
|
@@ -2103,12 +2164,13 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
|
|
|
2103
2164
|
const passLabel = passed ? colorize("PASS", `${ansi2.bold}${ansi2.green}`) : colorize("FAIL", `${ansi2.bold}${ansi2.red}`);
|
|
2104
2165
|
const metricParts = [];
|
|
2105
2166
|
if (metrics && metrics.length > 0) {
|
|
2106
|
-
for (const
|
|
2107
|
-
const def = getMetricById(id);
|
|
2167
|
+
for (const m of metrics) {
|
|
2168
|
+
const def = getMetricById(m.id);
|
|
2108
2169
|
if (def) {
|
|
2109
|
-
const formatted = def.format(data, options);
|
|
2170
|
+
const formatted = def.format(m.data, options);
|
|
2171
|
+
const label = m.name ?? def.name;
|
|
2110
2172
|
metricParts.push(
|
|
2111
|
-
|
|
2173
|
+
label ? `[${label}: ${formatted}]` : `[${formatted}]`
|
|
2112
2174
|
);
|
|
2113
2175
|
}
|
|
2114
2176
|
}
|
|
@@ -2116,7 +2178,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
|
|
|
2116
2178
|
const scoreLines = [];
|
|
2117
2179
|
for (const item of scores) {
|
|
2118
2180
|
const def = item.def ?? getScoreById(item.id);
|
|
2119
|
-
const scoreLabel =
|
|
2181
|
+
const scoreLabel = item.name ?? def?.name ?? def?.id ?? item.id;
|
|
2120
2182
|
let formatted;
|
|
2121
2183
|
if (!def) {
|
|
2122
2184
|
const numeric = toNumericScore(item.data);
|