@m4trix/evals 0.16.0 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -333,20 +333,70 @@ function getMetricById(id) {
333
333
 
334
334
  // src/evals/score.ts
335
335
  var registry2 = /* @__PURE__ */ new Map();
336
+ function formatScoreData(def, data, options) {
337
+ return options?.isAggregated ? def.formatAggregate(data) : def.formatValue(data);
338
+ }
339
+ var ScoreAggregate = {
340
+ /** Average numeric fields. Use for scores like { value, delta }. */
341
+ averageFields(fields) {
342
+ return (values) => {
343
+ const count = values.length || 1;
344
+ const result = {};
345
+ for (const field of fields) {
346
+ result[field] = values.reduce((s, v) => s + (v[field] ?? 0), 0) / count;
347
+ }
348
+ return result;
349
+ };
350
+ },
351
+ /** Average `value` with sample std dev. Use for percent-style scores. */
352
+ averageWithVariance(values) {
353
+ if (values.length === 0) {
354
+ return { value: 0, stdDev: void 0, count: 0 };
355
+ }
356
+ const sum = values.reduce((s, v) => s + v.value, 0);
357
+ const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
358
+ const mean = sum / values.length;
359
+ let stdDev;
360
+ if (values.length >= 2) {
361
+ const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
362
+ stdDev = variance > 0 ? Math.sqrt(variance) : 0;
363
+ }
364
+ return { ...values[0], value: mean, stdDev, count: values.length };
365
+ },
366
+ /** All runs must pass. Use for binary scores. */
367
+ all(values) {
368
+ const total = values.length;
369
+ const passedCount = values.filter((v) => v.passed).length;
370
+ return {
371
+ ...values[0],
372
+ passed: total > 0 && values.every((v) => v.passed),
373
+ passedCount,
374
+ totalCount: total
375
+ };
376
+ },
377
+ /** Take last value (no aggregation). Use when aggregation is not meaningful. */
378
+ last(values) {
379
+ return values[values.length - 1] ?? {};
380
+ }
381
+ };
336
382
  var Score = {
383
+ aggregate: ScoreAggregate,
337
384
  of(config) {
338
385
  const def = {
339
386
  id: config.id,
340
387
  name: config.name,
341
388
  displayStrategy: config.displayStrategy,
342
- aggregate: config.aggregate,
343
- format: config.format,
389
+ formatValue: config.formatValue,
390
+ formatAggregate: config.formatAggregate,
391
+ aggregateValues: config.aggregateValues,
344
392
  make: (data, options) => {
345
393
  const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
346
394
  return {
347
395
  id: config.id,
348
396
  data,
349
- ...passed !== void 0 && { passed }
397
+ ...passed !== void 0 && { passed },
398
+ def
399
+ // Attach def so rendering/aggregation works without registry lookup
350
400
  };
351
401
  }
352
402
  };
@@ -359,29 +409,6 @@ function getScoreById(id) {
359
409
  }
360
410
 
361
411
  // src/evals/aggregators.ts
362
- function aggregateAverageWithVariance(values) {
363
- if (values.length === 0) {
364
- return { value: 0, count: 0 };
365
- }
366
- const sum = values.reduce((s, v) => s + v.value, 0);
367
- const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
368
- const mean = sum / values.length;
369
- let stdDev;
370
- if (values.length >= 2) {
371
- const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
372
- stdDev = variance > 0 ? Math.sqrt(variance) : 0;
373
- }
374
- return { value: mean, stdDev, count: values.length };
375
- }
376
- function aggregateAll(values) {
377
- const total = values.length;
378
- const passedCount = values.filter((v) => v.passed).length;
379
- return {
380
- passed: total > 0 && values.every((v) => v.passed),
381
- passedCount,
382
- totalCount: total
383
- };
384
- }
385
412
  function aggregateTokenCountSum(values) {
386
413
  const initial = {
387
414
  input: 0,
@@ -434,40 +461,45 @@ Score.of({
434
461
  id: "percent",
435
462
  name: "Score",
436
463
  displayStrategy: "bar",
437
- format: (data, options) => {
438
- if (options?.isAggregated) {
439
- return data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`;
440
- }
441
- return data.value.toFixed(2);
442
- },
443
- aggregate: aggregateAverageWithVariance
464
+ formatValue: (data) => data.value.toFixed(2),
465
+ formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
466
+ aggregateValues: Score.aggregate.averageWithVariance
467
+ });
468
+ Score.of({
469
+ id: "delta",
470
+ name: "Delta",
471
+ displayStrategy: "number",
472
+ formatValue: (data) => `${data.value.toFixed(2)} (${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)} vs baseline)`,
473
+ formatAggregate: (data) => `Avg: ${data.value.toFixed(2)} (Delta: ${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)})`,
474
+ aggregateValues: Score.aggregate.averageFields(["value", "delta"])
444
475
  });
445
476
  Score.of({
446
477
  id: "binary",
447
478
  name: "Result",
448
479
  displayStrategy: "passFail",
449
- format: (data, options) => {
450
- if (options?.isAggregated) {
451
- const base = data.passed ? "All: PASSED" : "Some: FAILED";
452
- if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
453
- return `${base} (${data.passedCount}/${data.totalCount})`;
454
- }
455
- return base;
480
+ formatValue: (data) => data.passed ? "PASSED" : "NOT PASSED",
481
+ formatAggregate: (data) => {
482
+ const base = data.passed ? "All: PASSED" : "Some: FAILED";
483
+ if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
484
+ return `${base} (${data.passedCount}/${data.totalCount})`;
456
485
  }
457
- return data.passed ? "PASSED" : "NOT PASSED";
486
+ return base;
458
487
  },
459
- aggregate: aggregateAll
488
+ aggregateValues: Score.aggregate.all
460
489
  });
461
490
 
462
491
  // src/runner/score-utils.ts
492
+ function getScoreDef(item) {
493
+ return item.def ?? getScoreById(item.id);
494
+ }
463
495
  function aggregateScoreItems(items) {
464
496
  if (items.length === 0)
465
497
  return void 0;
466
- const def = getScoreById(items[0].id);
467
- if (!def?.aggregate)
498
+ const def = getScoreDef(items[0]);
499
+ if (!def?.aggregateValues)
468
500
  return items[items.length - 1];
469
- const aggregated = def.aggregate(items.map((i) => i.data));
470
- return { ...items[0], data: aggregated };
501
+ const aggregated = def.aggregateValues(items.map((i) => i.data));
502
+ return { ...items[0], data: aggregated, def };
471
503
  }
472
504
  function aggregateMetricItems(items) {
473
505
  if (items.length === 0)
@@ -480,7 +512,7 @@ function aggregateMetricItems(items) {
480
512
  }
481
513
  function toNumericScoreFromScores(scores) {
482
514
  for (const item of scores) {
483
- const def = getScoreById(item.id);
515
+ const def = getScoreDef(item);
484
516
  if (def && def.displayStrategy === "bar" && typeof item.data === "object" && item.data !== null && "value" in item.data) {
485
517
  const value = item.data.value;
486
518
  if (typeof value === "number" && Number.isFinite(value)) {
@@ -561,6 +593,7 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
561
593
  const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
562
594
  const rerunPassed = [];
563
595
  for (let r = 0; r < reruns; r++) {
596
+ const evaluatorRunId = `run-${randomUUID()}`;
564
597
  const started = Date.now();
565
598
  const evaluatorScores = [];
566
599
  let testCaseError;
@@ -587,6 +620,11 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
587
620
  input: testCaseItem.testCase.getInput(),
588
621
  ctx,
589
622
  output,
623
+ meta: {
624
+ triggerId: task.triggerId,
625
+ runId: evaluatorRunId,
626
+ datasetId: task.datasetId
627
+ },
590
628
  logDiff,
591
629
  log
592
630
  })
@@ -862,7 +900,7 @@ var createPersistenceWorker = (queue) => Effect.forever(
862
900
  () => appendJsonLine(message.artifactPath, {
863
901
  runId: message.runId,
864
902
  ts: Date.now(),
865
- ...message.payload
903
+ ...typeof message.payload === "object" && message.payload !== null && !Array.isArray(message.payload) ? message.payload : {}
866
904
  })
867
905
  );
868
906
  })
@@ -1046,6 +1084,7 @@ var EffectRunner = class {
1046
1084
  (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
1047
1085
  0
1048
1086
  );
1087
+ const triggerId = request.triggerId ?? `trg-${randomUUID()}`;
1049
1088
  const runId = `run-${randomUUID()}`;
1050
1089
  const artifactPath = createArtifactPath(
1051
1090
  this.config.artifactDirectory,
@@ -1087,6 +1126,7 @@ var EffectRunner = class {
1087
1126
  await Effect.runPromise(
1088
1127
  Queue.offer(this.runQueue, {
1089
1128
  runId,
1129
+ triggerId,
1090
1130
  datasetId: request.datasetId,
1091
1131
  dataset: dataset.dataset,
1092
1132
  evaluators: selectedEvaluators,
@@ -1446,7 +1486,7 @@ function aggregateEvaluatorScores(events, nameById) {
1446
1486
  if (agg)
1447
1487
  aggregatedScores.push(agg);
1448
1488
  }
1449
- const aggregatedMetrics = Array.from(metricIdToItems.entries()).map(([id, items]) => aggregateMetricItems(items)).filter((m) => m !== void 0);
1489
+ const aggregatedMetrics = Array.from(metricIdToItems.entries()).map(([, items]) => aggregateMetricItems(items)).filter((m) => m !== void 0);
1450
1490
  const passed = events.every((ev) => {
1451
1491
  const es = ev.evaluatorScores.find((x) => x.evaluatorId === evaluatorId);
1452
1492
  return es?.passed ?? false;
@@ -1466,13 +1506,13 @@ function aggregateEvaluatorScores(events, nameById) {
1466
1506
  }
1467
1507
  return result;
1468
1508
  }
1469
- function formatScorePart(item, scoreToColor2, options) {
1470
- const def = getScoreById(item.id);
1509
+ function formatScorePart(item, _scoreToColor, options) {
1510
+ const def = item.def ?? getScoreById(item.id);
1471
1511
  if (!def) {
1472
1512
  const numeric = toNumericScore(item.data);
1473
1513
  return numeric !== void 0 ? `${numeric.toFixed(2)}` : "n/a";
1474
1514
  }
1475
- const formatted = def.format(item.data, options);
1515
+ const formatted = formatScoreData(def, item.data, options);
1476
1516
  if (def.displayStrategy === "bar") {
1477
1517
  const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
1478
1518
  if (typeof numeric === "number" && Number.isFinite(numeric)) {
@@ -1530,8 +1570,6 @@ function RunView({
1530
1570
  const done = new Promise((resolve5) => {
1531
1571
  const unsubscribe = runner.subscribeRunEvents((event) => {
1532
1572
  if (event.type === "TestCaseProgress") {
1533
- const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
1534
- numericScores.length > 0 ? numericScores.reduce((sum, v) => sum + v, 0) / numericScores.length : void 0;
1535
1573
  for (const item of event.evaluatorScores) {
1536
1574
  const numeric = toNumericScoreFromScores(item.scores);
1537
1575
  if (numeric !== void 0) {
@@ -1621,16 +1659,17 @@ function RunView({
1621
1659
  onComplete(new Error(`Run failed: ${finalEvent.errorMessage}`));
1622
1660
  return;
1623
1661
  }
1662
+ const completed = finalEvent;
1624
1663
  setSummary({
1625
- passedTestCases: finalEvent.passedTestCases,
1626
- failedTestCases: finalEvent.failedTestCases,
1627
- totalTestCases: finalEvent.totalTestCases,
1664
+ passedTestCases: completed.passedTestCases,
1665
+ failedTestCases: completed.failedTestCases,
1666
+ totalTestCases: completed.totalTestCases,
1628
1667
  overallScoreTotal,
1629
1668
  overallScoreSumSq,
1630
1669
  overallScoreCount,
1631
1670
  aggregates: new Map(aggregates),
1632
1671
  scoreItemsByEvaluatorScore: new Map(scoreItemsByEvaluatorScore),
1633
- artifactPath: finalEvent.artifactPath
1672
+ artifactPath: completed.artifactPath
1634
1673
  });
1635
1674
  setPhase("completed");
1636
1675
  setTimeout(() => onComplete(), 200);
@@ -1836,11 +1875,9 @@ function RunView({
1836
1875
  const aggregated = aggregateScoreItems(items);
1837
1876
  if (!aggregated)
1838
1877
  return null;
1839
- const def = getScoreById(aggregated.id);
1878
+ const def = aggregated.def ?? getScoreById(aggregated.id);
1840
1879
  const label = def ? def.name ?? def.id : aggregated.id;
1841
- const formatted = def?.format(aggregated.data, {
1842
- isAggregated: true
1843
- }) ?? "n/a";
1880
+ const formatted = def ? def.formatAggregate(aggregated.data) : "n/a";
1844
1881
  const numeric = toNumericScore(aggregated.data);
1845
1882
  return /* @__PURE__ */ jsxs(
1846
1883
  Text,
@@ -1999,9 +2036,9 @@ function getEvaluatorSummaryLines(evaluatorId, evaluatorName, aggregate, scoreIt
1999
2036
  const agg = aggregateScoreItems(items);
2000
2037
  if (!agg)
2001
2038
  continue;
2002
- const def = getScoreById(agg.id);
2039
+ const def = agg.def ?? getScoreById(agg.id);
2003
2040
  const label = def ? def.name ?? def.id : agg.id;
2004
- const formatted = def?.format(agg.data, { isAggregated: true }) ?? "n/a";
2041
+ const formatted = def ? def.formatAggregate(agg.data) : "n/a";
2005
2042
  const numeric = toNumericScore(agg.data);
2006
2043
  const colored = numeric !== void 0 ? colorize(formatted, scoreToColor(numeric)) : formatted;
2007
2044
  scoreLines.push(` ${label}: ${colored}`);
@@ -2019,7 +2056,7 @@ function createBar2(value, max = 100, width = 20) {
2019
2056
  const filled = Math.round(safe / max * width);
2020
2057
  return `${"\u2588".repeat(filled)}${"\u2591".repeat(width - filled)}`;
2021
2058
  }
2022
- function aggregateEvaluatorScoresFromEvents(events, evaluatorNameById) {
2059
+ function aggregateEvaluatorScoresFromEvents(events, _evaluatorNameById) {
2023
2060
  if (events.length === 0)
2024
2061
  return [];
2025
2062
  const evaluatorIds = new Set(
@@ -2078,14 +2115,14 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
2078
2115
  }
2079
2116
  const scoreLines = [];
2080
2117
  for (const item of scores) {
2081
- const def = getScoreById(item.id);
2118
+ const def = item.def ?? getScoreById(item.id);
2082
2119
  const scoreLabel = def ? def.name ?? def.id : item.id;
2083
2120
  let formatted;
2084
2121
  if (!def) {
2085
2122
  const numeric = toNumericScore(item.data);
2086
2123
  formatted = numeric !== void 0 ? colorize(numeric.toFixed(2), scoreToColor(numeric)) : "n/a";
2087
2124
  } else {
2088
- const raw = def.format(item.data, options);
2125
+ const raw = formatScoreData(def, item.data, options);
2089
2126
  switch (def.displayStrategy) {
2090
2127
  case "bar": {
2091
2128
  const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
@@ -2237,7 +2274,6 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2237
2274
  (s, e) => s + e.durationMs,
2238
2275
  0
2239
2276
  );
2240
- existing.events.every((e) => e.passed);
2241
2277
  const lines = [];
2242
2278
  lines.push(
2243
2279
  `${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.rerunIndex}/${event.rerunTotal})`, ansi2.cyan)} ${colorize(`(${durationMs}ms)`, ansi2.dim)}`
@@ -2315,18 +2351,19 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2315
2351
  if (finalEvent.type === "RunFailed") {
2316
2352
  throw new Error(`Run failed: ${finalEvent.errorMessage}`);
2317
2353
  }
2354
+ const completed = finalEvent;
2318
2355
  console.log("");
2319
2356
  console.log(colorize("=== Run Summary ===", `${ansi2.bold}${ansi2.cyan}`));
2320
2357
  console.log(
2321
2358
  `- passed: ${colorize(
2322
- `${finalEvent.passedTestCases}/${finalEvent.totalTestCases}`,
2359
+ `${completed.passedTestCases}/${completed.totalTestCases}`,
2323
2360
  ansi2.green
2324
2361
  )}`
2325
2362
  );
2326
2363
  console.log(
2327
2364
  `- failed: ${colorize(
2328
- `${finalEvent.failedTestCases}/${finalEvent.totalTestCases}`,
2329
- finalEvent.failedTestCases > 0 ? ansi2.red : ansi2.dim
2365
+ `${completed.failedTestCases}/${completed.totalTestCases}`,
2366
+ completed.failedTestCases > 0 ? ansi2.red : ansi2.dim
2330
2367
  )}`
2331
2368
  );
2332
2369
  if (overallScoreCount > 0) {
@@ -2367,10 +2404,10 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2367
2404
  );
2368
2405
  continue;
2369
2406
  }
2370
- const scoreLabel = summary.isAggregated && summary.aggregatedScoreItem ? getScoreById(summary.aggregatedScoreItem.id)?.format(
2371
- summary.aggregatedScoreItem.data,
2372
- { isAggregated: true }
2373
- ) ?? summary.averageScore.toFixed(2) : summary.stdDev !== void 0 && summary.isAggregated ? `${summary.averageScore.toFixed(2)} \xB1 ${summary.stdDev.toFixed(2)}` : summary.averageScore.toFixed(2);
2407
+ const scoreLabel = summary.isAggregated && summary.aggregatedScoreItem ? (() => {
2408
+ const def = summary.aggregatedScoreItem.def ?? getScoreById(summary.aggregatedScoreItem.id);
2409
+ return def ? def.formatAggregate(summary.aggregatedScoreItem.data) : summary.averageScore.toFixed(2);
2410
+ })() : summary.stdDev !== void 0 && summary.isAggregated ? `${summary.averageScore.toFixed(2)} \xB1 ${summary.stdDev.toFixed(2)}` : summary.averageScore.toFixed(2);
2374
2411
  console.log(
2375
2412
  ` ${status} ${summary.name.padEnd(24)} score=${colorize(
2376
2413
  scoreLabel,
@@ -2379,7 +2416,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2379
2416
  );
2380
2417
  }
2381
2418
  }
2382
- console.log(`- artifact: ${colorize(finalEvent.artifactPath, ansi2.dim)}`);
2419
+ console.log(`- artifact: ${colorize(completed.artifactPath, ansi2.dim)}`);
2383
2420
  }
2384
2421
  async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern) {
2385
2422
  return new Promise((resolve5, reject) => {