@m4trix/evals 0.16.0 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -535,20 +535,70 @@ function getMetricById(id) {
535
535
 
536
536
  // src/evals/score.ts
537
537
  var registry2 = /* @__PURE__ */ new Map();
538
/**
 * Render a score payload as display text using the formatters carried by its
 * score definition.
 *
 * @param {object} def - Score definition exposing `formatValue` and `formatAggregate`.
 * @param {*} data - Score payload; passed to the selected formatter unchanged.
 * @param {{ isAggregated?: boolean }} [options] - When `isAggregated` is truthy,
 *   `formatAggregate` is used; otherwise `formatValue`.
 * @returns {*} Whatever the selected formatter returns.
 * @throws {TypeError} If `def` is missing or does not provide the selected formatter.
 */
function formatScoreData(def, data, options) {
  const formatterName = options?.isAggregated ? "formatAggregate" : "formatValue";
  const formatter = def?.[formatterName];
  if (typeof formatter !== "function") {
    // Fail with a message that names the score and the missing formatter
    // instead of an anonymous "x is not a function".
    const label = def?.id !== undefined ? `"${String(def.id)}"` : "(unknown)";
    throw new TypeError(`Score definition ${label} has no ${formatterName}() formatter`);
  }
  // Call as a method of `def` so formatters that read `this` keep working.
  return formatter.call(def, data);
}
541
/**
 * Built-in strategies for combining the score payloads of several runs into
 * one aggregated payload. None of the functions read `this`, so they can be
 * passed around unbound (e.g. `aggregateValues: ScoreAggregate.all`).
 */
var ScoreAggregate = {
  /**
   * Average numeric fields. Use for scores like { value, delta }.
   *
   * @param {string[]} fields - Names of the fields to average.
   * @returns {(values: object[]) => object} Aggregator returning an object with
   *   one averaged entry per requested field. A field that is null/undefined on
   *   a run counts as 0; an empty `values` array yields 0 for every field.
   */
  averageFields(fields) {
    return (values) => {
      // Divide by 1 for an empty list so the result is 0 rather than NaN.
      const count = values.length || 1;
      const result = {};
      for (const field of fields) {
        result[field] = values.reduce((s, v) => s + (v[field] ?? 0), 0) / count;
      }
      return result;
    };
  },
  /**
   * Average `value` with sample std dev. Use for percent-style scores.
   *
   * @param {Array<{ value: number }>} values - Per-run payloads.
   * @returns {object} The first payload's fields overlaid with `value` (mean),
   *   `stdDev` (sample standard deviation, `undefined` with fewer than two
   *   runs) and `count`. An empty list yields `{ value: 0, stdDev: undefined, count: 0 }`.
   */
  averageWithVariance(values) {
    const count = values.length;
    if (count === 0) {
      return { value: 0, stdDev: void 0, count: 0 };
    }
    const mean = values.reduce((s, v) => s + v.value, 0) / count;
    let stdDev;
    if (count >= 2) {
      // Two-pass sum of squared deviations. The textbook form
      // (sum(x^2) - n * mean^2) subtracts two nearly equal large numbers and
      // loses most significant digits when values are large relative to
      // their spread.
      const squaredDeviations = values.reduce((s, v) => {
        const deviation = v.value - mean;
        return s + deviation * deviation;
      }, 0);
      const variance = squaredDeviations / (count - 1);
      // Guard kept so a non-positive or NaN variance still reports 0.
      stdDev = variance > 0 ? Math.sqrt(variance) : 0;
    }
    return { ...values[0], value: mean, stdDev, count };
  },
  /**
   * All runs must pass. Use for binary scores.
   *
   * @param {Array<{ passed?: boolean }>} values - Per-run payloads.
   * @returns {object} The first payload's fields overlaid with `passed` (true
   *   only when there is at least one run and every run passed), `passedCount`
   *   and `totalCount`.
   */
  all(values) {
    const total = values.length;
    const passedCount = values.filter((v) => v.passed).length;
    return {
      ...values[0],
      passed: total > 0 && values.every((v) => v.passed),
      passedCount,
      totalCount: total
    };
  },
  /**
   * Take last value (no aggregation). Use when aggregation is not meaningful.
   *
   * @param {object[]} values - Per-run payloads.
   * @returns {object} The last payload, or `{}` when the list is empty.
   */
  last(values) {
    return values[values.length - 1] ?? {};
  }
};
538
584
  var Score = {
585
+ aggregate: ScoreAggregate,
539
586
  of(config) {
540
587
  const def = {
541
588
  id: config.id,
542
589
  name: config.name,
543
590
  displayStrategy: config.displayStrategy,
544
- aggregate: config.aggregate,
545
- format: config.format,
591
+ formatValue: config.formatValue,
592
+ formatAggregate: config.formatAggregate,
593
+ aggregateValues: config.aggregateValues,
546
594
  make: (data, options) => {
547
595
  const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
548
596
  return {
549
597
  id: config.id,
550
598
  data,
551
- ...passed !== void 0 && { passed }
599
+ ...passed !== void 0 && { passed },
600
+ def
601
+ // Attach def so rendering/aggregation works without registry lookup
552
602
  };
553
603
  }
554
604
  };
@@ -561,29 +611,6 @@ function getScoreById(id) {
561
611
  }
562
612
 
563
613
  // src/evals/aggregators.ts
564
- function aggregateAverageWithVariance(values) {
565
- if (values.length === 0) {
566
- return { value: 0, count: 0 };
567
- }
568
- const sum = values.reduce((s, v) => s + v.value, 0);
569
- const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
570
- const mean = sum / values.length;
571
- let stdDev;
572
- if (values.length >= 2) {
573
- const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
574
- stdDev = variance > 0 ? Math.sqrt(variance) : 0;
575
- }
576
- return { value: mean, stdDev, count: values.length };
577
- }
578
- function aggregateAll(values) {
579
- const total = values.length;
580
- const passedCount = values.filter((v) => v.passed).length;
581
- return {
582
- passed: total > 0 && values.every((v) => v.passed),
583
- passedCount,
584
- totalCount: total
585
- };
586
- }
587
614
  function aggregateTokenCountSum(values) {
588
615
  const initial = {
589
616
  input: 0,
@@ -636,29 +663,31 @@ var percentScore = Score.of({
636
663
  id: "percent",
637
664
  name: "Score",
638
665
  displayStrategy: "bar",
639
- format: (data, options) => {
640
- if (options?.isAggregated) {
641
- return data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`;
642
- }
643
- return data.value.toFixed(2);
644
- },
645
- aggregate: aggregateAverageWithVariance
666
+ formatValue: (data) => data.value.toFixed(2),
667
+ formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
668
+ aggregateValues: Score.aggregate.averageWithVariance
669
+ });
670
/**
 * Score definition with id "delta": a numeric `value` shown together with its
 * signed `delta`. Aggregation averages both fields across runs.
 */
var deltaScore = Score.of({
  id: "delta",
  name: "Delta",
  displayStrategy: "number",
  // Single run: value followed by the signed delta.
  formatValue: (data) => {
    const current = data.value.toFixed(2);
    const sign = data.delta >= 0 ? "+" : "";
    return `${current} (${sign}${data.delta.toFixed(2)} vs baseline)`;
  },
  // Aggregated runs: averaged value followed by the averaged, signed delta.
  formatAggregate: (data) => {
    const average = data.value.toFixed(2);
    const sign = data.delta >= 0 ? "+" : "";
    return `Avg: ${average} (Delta: ${sign}${data.delta.toFixed(2)})`;
  },
  aggregateValues: Score.aggregate.averageFields(["value", "delta"])
});
647
678
  var binaryScore = Score.of({
648
679
  id: "binary",
649
680
  name: "Result",
650
681
  displayStrategy: "passFail",
651
- format: (data, options) => {
652
- if (options?.isAggregated) {
653
- const base = data.passed ? "All: PASSED" : "Some: FAILED";
654
- if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
655
- return `${base} (${data.passedCount}/${data.totalCount})`;
656
- }
657
- return base;
682
+ formatValue: (data) => data.passed ? "PASSED" : "NOT PASSED",
683
+ formatAggregate: (data) => {
684
+ const base = data.passed ? "All: PASSED" : "Some: FAILED";
685
+ if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
686
+ return `${base} (${data.passedCount}/${data.totalCount})`;
658
687
  }
659
- return data.passed ? "PASSED" : "NOT PASSED";
688
+ return base;
660
689
  },
661
- aggregate: aggregateAll
690
+ aggregateValues: Score.aggregate.all
662
691
  });
663
692
  function createDiffString(expected, actual, diffOptions) {
664
693
  const opts = { ...diffOptions, color: false };
@@ -974,9 +1003,12 @@ async function collectTestCasesFromFiles(config) {
974
1003
  }
975
1004
 
976
1005
  // src/runner/score-utils.ts
1006
/**
 * Resolve the score definition for a score item.
 *
 * @param {{ id: *, def?: object }} item - Score item.
 * @returns {*} `item.def` when it is neither null nor undefined; otherwise the
 *   result of `getScoreById(item.id)`.
 */
function getScoreDef(item) {
  const attached = item.def;
  if (attached !== null && attached !== undefined) {
    return attached;
  }
  return getScoreById(item.id);
}
977
1009
  function toNumericScoreFromScores(scores) {
978
1010
  for (const item of scores) {
979
- const def = getScoreById(item.id);
1011
+ const def = getScoreDef(item);
980
1012
  if (def && def.displayStrategy === "bar" && typeof item.data === "object" && item.data !== null && "value" in item.data) {
981
1013
  const value = item.data.value;
982
1014
  if (typeof value === "number" && Number.isFinite(value)) {
@@ -1057,6 +1089,7 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
1057
1089
  const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
1058
1090
  const rerunPassed = [];
1059
1091
  for (let r = 0; r < reruns; r++) {
1092
+ const evaluatorRunId = `run-${crypto.randomUUID()}`;
1060
1093
  const started = Date.now();
1061
1094
  const evaluatorScores = [];
1062
1095
  let testCaseError;
@@ -1083,6 +1116,11 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
1083
1116
  input: testCaseItem.testCase.getInput(),
1084
1117
  ctx,
1085
1118
  output,
1119
+ meta: {
1120
+ triggerId: task.triggerId,
1121
+ runId: evaluatorRunId,
1122
+ datasetId: task.datasetId
1123
+ },
1086
1124
  logDiff,
1087
1125
  log
1088
1126
  })
@@ -1358,7 +1396,7 @@ var createPersistenceWorker = (queue) => effect.Effect.forever(
1358
1396
  () => appendJsonLine(message.artifactPath, {
1359
1397
  runId: message.runId,
1360
1398
  ts: Date.now(),
1361
- ...message.payload
1399
+ ...typeof message.payload === "object" && message.payload !== null && !Array.isArray(message.payload) ? message.payload : {}
1362
1400
  })
1363
1401
  );
1364
1402
  })
@@ -1542,6 +1580,7 @@ var EffectRunner = class {
1542
1580
  (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
1543
1581
  0
1544
1582
  );
1583
+ const triggerId = request.triggerId ?? `trg-${crypto.randomUUID()}`;
1545
1584
  const runId = `run-${crypto.randomUUID()}`;
1546
1585
  const artifactPath = createArtifactPath(
1547
1586
  this.config.artifactDirectory,
@@ -1583,6 +1622,7 @@ var EffectRunner = class {
1583
1622
  await effect.Effect.runPromise(
1584
1623
  effect.Queue.offer(this.runQueue, {
1585
1624
  runId,
1625
+ triggerId,
1586
1626
  datasetId: request.datasetId,
1587
1627
  dataset: dataset.dataset,
1588
1628
  evaluators: selectedEvaluators,
@@ -1670,6 +1710,8 @@ exports.createLogEntry = createLogEntry;
1670
1710
  exports.createRunner = createRunner;
1671
1711
  exports.defaultRunnerConfig = defaultRunnerConfig;
1672
1712
  exports.defineConfig = defineConfig;
1713
+ exports.deltaScore = deltaScore;
1714
+ exports.formatScoreData = formatScoreData;
1673
1715
  exports.getLogLines = getLogLines;
1674
1716
  exports.getMetricById = getMetricById;
1675
1717
  exports.getScoreById = getScoreById;