@m4trix/evals 0.16.0 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -359,20 +359,70 @@ function getMetricById(id) {
359
359
 
360
360
  // src/evals/score.ts
361
361
  var registry2 = /* @__PURE__ */ new Map();
362
/**
 * Render a score's data using the formatter that matches its aggregation state.
 *
 * @param {object} def - Score definition exposing `formatValue` and, optionally, `formatAggregate`.
 * @param {*} data - Score payload handed to the chosen formatter unchanged.
 * @param {{ isAggregated?: boolean }} [options] - Set `isAggregated` when `data` is an aggregate.
 * @returns {*} Whatever the selected formatter returns.
 */
function formatScoreData(def, data, options) {
  // A definition may omit `formatAggregate` (Score.of copies the config field
  // unchecked); fall back to the per-value formatter instead of throwing
  // "def.formatAggregate is not a function" while rendering.
  const useAggregateFormatter = Boolean(options?.isAggregated) && typeof def.formatAggregate === "function";
  return useAggregateFormatter ? def.formatAggregate(data) : def.formatValue(data);
}
365
/**
 * Built-in aggregation strategies for combining the `data` payloads of
 * repeated runs of the same score into a single payload.
 */
var ScoreAggregate = {
  /**
   * Average numeric fields. Use for scores like { value, delta }.
   *
   * @param {string[]} fields - Names of the fields to average.
   * @returns {(values: object[]) => object} Aggregator producing an object with
   *   one averaged entry per field. A field missing (null/undefined) on a value
   *   counts as 0; an empty `values` array yields 0 for every field.
   */
  averageFields(fields) {
    return (values) => {
      // Divide by 1 for an empty list so the result is 0 rather than NaN.
      const count = values.length || 1;
      const result = {};
      for (const field of fields) {
        result[field] = values.reduce((s, v) => s + (v[field] ?? 0), 0) / count;
      }
      return result;
    };
  },
  /**
   * Average `value` with sample std dev. Use for percent-style scores.
   *
   * @param {{ value: number }[]} values - Per-run score payloads.
   * @returns {object} The first payload's other fields plus `value` (mean),
   *   `stdDev` (sample standard deviation, `undefined` with fewer than two
   *   values) and `count`. An empty input yields `{ value: 0, stdDev: undefined, count: 0 }`.
   */
  averageWithVariance(values) {
    const count = values.length;
    if (count === 0) {
      return { value: 0, stdDev: void 0, count: 0 };
    }
    const mean = values.reduce((s, v) => s + v.value, 0) / count;
    let stdDev;
    if (count >= 2) {
      // Two-pass sum of squared deviations: the one-pass `sumSq - n * mean^2`
      // form cancels catastrophically when values are large relative to their
      // spread and can report a badly wrong (or zero) deviation.
      const squaredDeviations = values.reduce((s, v) => {
        const deviation = v.value - mean;
        return s + deviation * deviation;
      }, 0);
      const variance = squaredDeviations / (count - 1);
      // Non-positive or NaN variance is reported as 0.
      stdDev = variance > 0 ? Math.sqrt(variance) : 0;
    }
    return { ...values[0], value: mean, stdDev, count };
  },
  /**
   * All runs must pass. Use for binary scores.
   *
   * @param {{ passed?: boolean }[]} values - Per-run score payloads.
   * @returns {object} The first payload's other fields plus `passed` (true only
   *   when there is at least one value and every value passed), `passedCount`
   *   and `totalCount`.
   */
  all(values) {
    const total = values.length;
    const passedCount = values.filter((v) => v.passed).length;
    return {
      ...values[0],
      passed: total > 0 && passedCount === total,
      passedCount,
      totalCount: total
    };
  },
  /**
   * Take last value (no aggregation). Use when aggregation is not meaningful.
   *
   * @param {Array} values - Per-run score payloads.
   * @returns {*} The final element, or `{}` when it is missing or nullish.
   */
  last(values) {
    return values[values.length - 1] ?? {};
  }
};
362
408
  var Score = {
409
+ aggregate: ScoreAggregate,
363
410
  of(config) {
364
411
  const def = {
365
412
  id: config.id,
366
413
  name: config.name,
367
414
  displayStrategy: config.displayStrategy,
368
- aggregate: config.aggregate,
369
- format: config.format,
415
+ formatValue: config.formatValue,
416
+ formatAggregate: config.formatAggregate,
417
+ aggregateValues: config.aggregateValues,
370
418
  make: (data, options) => {
371
419
  const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
372
420
  return {
373
421
  id: config.id,
374
422
  data,
375
- ...passed !== void 0 && { passed }
423
+ ...passed !== void 0 && { passed },
424
+ def
425
+ // Attach def so rendering/aggregation works without registry lookup
376
426
  };
377
427
  }
378
428
  };
@@ -385,29 +435,6 @@ function getScoreById(id) {
385
435
  }
386
436
 
387
437
  // src/evals/aggregators.ts
388
- function aggregateAverageWithVariance(values) {
389
- if (values.length === 0) {
390
- return { value: 0, count: 0 };
391
- }
392
- const sum = values.reduce((s, v) => s + v.value, 0);
393
- const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
394
- const mean = sum / values.length;
395
- let stdDev;
396
- if (values.length >= 2) {
397
- const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
398
- stdDev = variance > 0 ? Math.sqrt(variance) : 0;
399
- }
400
- return { value: mean, stdDev, count: values.length };
401
- }
402
- function aggregateAll(values) {
403
- const total = values.length;
404
- const passedCount = values.filter((v) => v.passed).length;
405
- return {
406
- passed: total > 0 && values.every((v) => v.passed),
407
- passedCount,
408
- totalCount: total
409
- };
410
- }
411
438
  function aggregateTokenCountSum(values) {
412
439
  const initial = {
413
440
  input: 0,
@@ -460,40 +487,45 @@ Score.of({
460
487
  id: "percent",
461
488
  name: "Score",
462
489
  displayStrategy: "bar",
463
- format: (data, options) => {
464
- if (options?.isAggregated) {
465
- return data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`;
466
- }
467
- return data.value.toFixed(2);
468
- },
469
- aggregate: aggregateAverageWithVariance
490
+ formatValue: (data) => data.value.toFixed(2),
491
+ formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
492
+ aggregateValues: Score.aggregate.averageWithVariance
493
+ });
494
+ Score.of({
495
+ id: "delta",
496
+ name: "Delta",
497
+ displayStrategy: "number",
498
+ formatValue: (data) => `${data.value.toFixed(2)} (${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)} vs baseline)`,
499
+ formatAggregate: (data) => `Avg: ${data.value.toFixed(2)} (Delta: ${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)})`,
500
+ aggregateValues: Score.aggregate.averageFields(["value", "delta"])
470
501
  });
471
502
  Score.of({
472
503
  id: "binary",
473
504
  name: "Result",
474
505
  displayStrategy: "passFail",
475
- format: (data, options) => {
476
- if (options?.isAggregated) {
477
- const base = data.passed ? "All: PASSED" : "Some: FAILED";
478
- if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
479
- return `${base} (${data.passedCount}/${data.totalCount})`;
480
- }
481
- return base;
506
+ formatValue: (data) => data.passed ? "PASSED" : "NOT PASSED",
507
+ formatAggregate: (data) => {
508
+ const base = data.passed ? "All: PASSED" : "Some: FAILED";
509
+ if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
510
+ return `${base} (${data.passedCount}/${data.totalCount})`;
482
511
  }
483
- return data.passed ? "PASSED" : "NOT PASSED";
512
+ return base;
484
513
  },
485
- aggregate: aggregateAll
514
+ aggregateValues: Score.aggregate.all
486
515
  });
487
516
 
488
517
  // src/runner/score-utils.ts
518
/**
 * Resolve the score definition for a score item.
 *
 * Prefers the definition carried on the item itself and only falls back to
 * looking the item's id up via `getScoreById` when `item.def` is null or
 * undefined.
 *
 * @param {{ id: string, def?: object }} item - Score item to resolve.
 * @returns {object | undefined} The attached definition, or whatever
 *   `getScoreById` returns for the item's id.
 */
function getScoreDef(item) {
  const attachedDef = item.def;
  if (attachedDef !== null && attachedDef !== void 0) {
    return attachedDef;
  }
  return getScoreById(item.id);
}
489
521
  function aggregateScoreItems(items) {
490
522
  if (items.length === 0)
491
523
  return void 0;
492
- const def = getScoreById(items[0].id);
493
- if (!def?.aggregate)
524
+ const def = getScoreDef(items[0]);
525
+ if (!def?.aggregateValues)
494
526
  return items[items.length - 1];
495
- const aggregated = def.aggregate(items.map((i) => i.data));
496
- return { ...items[0], data: aggregated };
527
+ const aggregated = def.aggregateValues(items.map((i) => i.data));
528
+ return { ...items[0], data: aggregated, def };
497
529
  }
498
530
  function aggregateMetricItems(items) {
499
531
  if (items.length === 0)
@@ -506,7 +538,7 @@ function aggregateMetricItems(items) {
506
538
  }
507
539
  function toNumericScoreFromScores(scores) {
508
540
  for (const item of scores) {
509
- const def = getScoreById(item.id);
541
+ const def = getScoreDef(item);
510
542
  if (def && def.displayStrategy === "bar" && typeof item.data === "object" && item.data !== null && "value" in item.data) {
511
543
  const value = item.data.value;
512
544
  if (typeof value === "number" && Number.isFinite(value)) {
@@ -587,6 +619,7 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
587
619
  const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
588
620
  const rerunPassed = [];
589
621
  for (let r = 0; r < reruns; r++) {
622
+ const evaluatorRunId = `run-${crypto.randomUUID()}`;
590
623
  const started = Date.now();
591
624
  const evaluatorScores = [];
592
625
  let testCaseError;
@@ -613,6 +646,11 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
613
646
  input: testCaseItem.testCase.getInput(),
614
647
  ctx,
615
648
  output,
649
+ meta: {
650
+ triggerId: task.triggerId,
651
+ runId: evaluatorRunId,
652
+ datasetId: task.datasetId
653
+ },
616
654
  logDiff,
617
655
  log
618
656
  })
@@ -888,7 +926,7 @@ var createPersistenceWorker = (queue) => effect.Effect.forever(
888
926
  () => appendJsonLine(message.artifactPath, {
889
927
  runId: message.runId,
890
928
  ts: Date.now(),
891
- ...message.payload
929
+ ...typeof message.payload === "object" && message.payload !== null && !Array.isArray(message.payload) ? message.payload : {}
892
930
  })
893
931
  );
894
932
  })
@@ -1072,6 +1110,7 @@ var EffectRunner = class {
1072
1110
  (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
1073
1111
  0
1074
1112
  );
1113
+ const triggerId = request.triggerId ?? `trg-${crypto.randomUUID()}`;
1075
1114
  const runId = `run-${crypto.randomUUID()}`;
1076
1115
  const artifactPath = createArtifactPath(
1077
1116
  this.config.artifactDirectory,
@@ -1113,6 +1152,7 @@ var EffectRunner = class {
1113
1152
  await effect.Effect.runPromise(
1114
1153
  effect.Queue.offer(this.runQueue, {
1115
1154
  runId,
1155
+ triggerId,
1116
1156
  datasetId: request.datasetId,
1117
1157
  dataset: dataset.dataset,
1118
1158
  evaluators: selectedEvaluators,
@@ -1472,7 +1512,7 @@ function aggregateEvaluatorScores(events, nameById) {
1472
1512
  if (agg)
1473
1513
  aggregatedScores.push(agg);
1474
1514
  }
1475
- const aggregatedMetrics = Array.from(metricIdToItems.entries()).map(([id, items]) => aggregateMetricItems(items)).filter((m) => m !== void 0);
1515
+ const aggregatedMetrics = Array.from(metricIdToItems.entries()).map(([, items]) => aggregateMetricItems(items)).filter((m) => m !== void 0);
1476
1516
  const passed = events.every((ev) => {
1477
1517
  const es = ev.evaluatorScores.find((x) => x.evaluatorId === evaluatorId);
1478
1518
  return es?.passed ?? false;
@@ -1492,13 +1532,13 @@ function aggregateEvaluatorScores(events, nameById) {
1492
1532
  }
1493
1533
  return result;
1494
1534
  }
1495
- function formatScorePart(item, scoreToColor2, options) {
1496
- const def = getScoreById(item.id);
1535
+ function formatScorePart(item, _scoreToColor, options) {
1536
+ const def = item.def ?? getScoreById(item.id);
1497
1537
  if (!def) {
1498
1538
  const numeric = toNumericScore(item.data);
1499
1539
  return numeric !== void 0 ? `${numeric.toFixed(2)}` : "n/a";
1500
1540
  }
1501
- const formatted = def.format(item.data, options);
1541
+ const formatted = formatScoreData(def, item.data, options);
1502
1542
  if (def.displayStrategy === "bar") {
1503
1543
  const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
1504
1544
  if (typeof numeric === "number" && Number.isFinite(numeric)) {
@@ -1556,8 +1596,6 @@ function RunView({
1556
1596
  const done = new Promise((resolve5) => {
1557
1597
  const unsubscribe = runner.subscribeRunEvents((event) => {
1558
1598
  if (event.type === "TestCaseProgress") {
1559
- const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
1560
- numericScores.length > 0 ? numericScores.reduce((sum, v) => sum + v, 0) / numericScores.length : void 0;
1561
1599
  for (const item of event.evaluatorScores) {
1562
1600
  const numeric = toNumericScoreFromScores(item.scores);
1563
1601
  if (numeric !== void 0) {
@@ -1647,16 +1685,17 @@ function RunView({
1647
1685
  onComplete(new Error(`Run failed: ${finalEvent.errorMessage}`));
1648
1686
  return;
1649
1687
  }
1688
+ const completed = finalEvent;
1650
1689
  setSummary({
1651
- passedTestCases: finalEvent.passedTestCases,
1652
- failedTestCases: finalEvent.failedTestCases,
1653
- totalTestCases: finalEvent.totalTestCases,
1690
+ passedTestCases: completed.passedTestCases,
1691
+ failedTestCases: completed.failedTestCases,
1692
+ totalTestCases: completed.totalTestCases,
1654
1693
  overallScoreTotal,
1655
1694
  overallScoreSumSq,
1656
1695
  overallScoreCount,
1657
1696
  aggregates: new Map(aggregates),
1658
1697
  scoreItemsByEvaluatorScore: new Map(scoreItemsByEvaluatorScore),
1659
- artifactPath: finalEvent.artifactPath
1698
+ artifactPath: completed.artifactPath
1660
1699
  });
1661
1700
  setPhase("completed");
1662
1701
  setTimeout(() => onComplete(), 200);
@@ -1862,11 +1901,9 @@ function RunView({
1862
1901
  const aggregated = aggregateScoreItems(items);
1863
1902
  if (!aggregated)
1864
1903
  return null;
1865
- const def = getScoreById(aggregated.id);
1904
+ const def = aggregated.def ?? getScoreById(aggregated.id);
1866
1905
  const label = def ? def.name ?? def.id : aggregated.id;
1867
- const formatted = def?.format(aggregated.data, {
1868
- isAggregated: true
1869
- }) ?? "n/a";
1906
+ const formatted = def ? def.formatAggregate(aggregated.data) : "n/a";
1870
1907
  const numeric = toNumericScore(aggregated.data);
1871
1908
  return /* @__PURE__ */ jsxRuntime.jsxs(
1872
1909
  ink.Text,
@@ -2025,9 +2062,9 @@ function getEvaluatorSummaryLines(evaluatorId, evaluatorName, aggregate, scoreIt
2025
2062
  const agg = aggregateScoreItems(items);
2026
2063
  if (!agg)
2027
2064
  continue;
2028
- const def = getScoreById(agg.id);
2065
+ const def = agg.def ?? getScoreById(agg.id);
2029
2066
  const label = def ? def.name ?? def.id : agg.id;
2030
- const formatted = def?.format(agg.data, { isAggregated: true }) ?? "n/a";
2067
+ const formatted = def ? def.formatAggregate(agg.data) : "n/a";
2031
2068
  const numeric = toNumericScore(agg.data);
2032
2069
  const colored = numeric !== void 0 ? colorize(formatted, scoreToColor(numeric)) : formatted;
2033
2070
  scoreLines.push(` ${label}: ${colored}`);
@@ -2045,7 +2082,7 @@ function createBar2(value, max = 100, width = 20) {
2045
2082
  const filled = Math.round(safe / max * width);
2046
2083
  return `${"\u2588".repeat(filled)}${"\u2591".repeat(width - filled)}`;
2047
2084
  }
2048
- function aggregateEvaluatorScoresFromEvents(events, evaluatorNameById) {
2085
+ function aggregateEvaluatorScoresFromEvents(events, _evaluatorNameById) {
2049
2086
  if (events.length === 0)
2050
2087
  return [];
2051
2088
  const evaluatorIds = new Set(
@@ -2104,14 +2141,14 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
2104
2141
  }
2105
2142
  const scoreLines = [];
2106
2143
  for (const item of scores) {
2107
- const def = getScoreById(item.id);
2144
+ const def = item.def ?? getScoreById(item.id);
2108
2145
  const scoreLabel = def ? def.name ?? def.id : item.id;
2109
2146
  let formatted;
2110
2147
  if (!def) {
2111
2148
  const numeric = toNumericScore(item.data);
2112
2149
  formatted = numeric !== void 0 ? colorize(numeric.toFixed(2), scoreToColor(numeric)) : "n/a";
2113
2150
  } else {
2114
- const raw = def.format(item.data, options);
2151
+ const raw = formatScoreData(def, item.data, options);
2115
2152
  switch (def.displayStrategy) {
2116
2153
  case "bar": {
2117
2154
  const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
@@ -2263,7 +2300,6 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2263
2300
  (s, e) => s + e.durationMs,
2264
2301
  0
2265
2302
  );
2266
- existing.events.every((e) => e.passed);
2267
2303
  const lines = [];
2268
2304
  lines.push(
2269
2305
  `${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.rerunIndex}/${event.rerunTotal})`, ansi2.cyan)} ${colorize(`(${durationMs}ms)`, ansi2.dim)}`
@@ -2341,18 +2377,19 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2341
2377
  if (finalEvent.type === "RunFailed") {
2342
2378
  throw new Error(`Run failed: ${finalEvent.errorMessage}`);
2343
2379
  }
2380
+ const completed = finalEvent;
2344
2381
  console.log("");
2345
2382
  console.log(colorize("=== Run Summary ===", `${ansi2.bold}${ansi2.cyan}`));
2346
2383
  console.log(
2347
2384
  `- passed: ${colorize(
2348
- `${finalEvent.passedTestCases}/${finalEvent.totalTestCases}`,
2385
+ `${completed.passedTestCases}/${completed.totalTestCases}`,
2349
2386
  ansi2.green
2350
2387
  )}`
2351
2388
  );
2352
2389
  console.log(
2353
2390
  `- failed: ${colorize(
2354
- `${finalEvent.failedTestCases}/${finalEvent.totalTestCases}`,
2355
- finalEvent.failedTestCases > 0 ? ansi2.red : ansi2.dim
2391
+ `${completed.failedTestCases}/${completed.totalTestCases}`,
2392
+ completed.failedTestCases > 0 ? ansi2.red : ansi2.dim
2356
2393
  )}`
2357
2394
  );
2358
2395
  if (overallScoreCount > 0) {
@@ -2393,10 +2430,10 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2393
2430
  );
2394
2431
  continue;
2395
2432
  }
2396
- const scoreLabel = summary.isAggregated && summary.aggregatedScoreItem ? getScoreById(summary.aggregatedScoreItem.id)?.format(
2397
- summary.aggregatedScoreItem.data,
2398
- { isAggregated: true }
2399
- ) ?? summary.averageScore.toFixed(2) : summary.stdDev !== void 0 && summary.isAggregated ? `${summary.averageScore.toFixed(2)} \xB1 ${summary.stdDev.toFixed(2)}` : summary.averageScore.toFixed(2);
2433
+ const scoreLabel = summary.isAggregated && summary.aggregatedScoreItem ? (() => {
2434
+ const def = summary.aggregatedScoreItem.def ?? getScoreById(summary.aggregatedScoreItem.id);
2435
+ return def ? def.formatAggregate(summary.aggregatedScoreItem.data) : summary.averageScore.toFixed(2);
2436
+ })() : summary.stdDev !== void 0 && summary.isAggregated ? `${summary.averageScore.toFixed(2)} \xB1 ${summary.stdDev.toFixed(2)}` : summary.averageScore.toFixed(2);
2400
2437
  console.log(
2401
2438
  ` ${status} ${summary.name.padEnd(24)} score=${colorize(
2402
2439
  scoreLabel,
@@ -2405,7 +2442,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2405
2442
  );
2406
2443
  }
2407
2444
  }
2408
- console.log(`- artifact: ${colorize(finalEvent.artifactPath, ansi2.dim)}`);
2445
+ console.log(`- artifact: ${colorize(completed.artifactPath, ansi2.dim)}`);
2409
2446
  }
2410
2447
  async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern) {
2411
2448
  return new Promise((resolve5, reject) => {