@m4trix/evals 0.16.0 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -234,10 +234,23 @@ interface EvalMiddleware<TCtx> {
234
234
  name: string;
235
235
  resolve: () => TCtx | Promise<TCtx>;
236
236
  }
237
+ interface EvaluateMeta {
238
+ /** Identifier of the trigger that started the run (for example, a CLI invocation). */
239
+ triggerId: string;
240
+ /**
241
+ * Identifier of the current test-case execution shared across all evaluators
242
+ * for this specific test-case run.
243
+ */
244
+ runId: string;
245
+ /** Identifier of the dataset currently being evaluated. */
246
+ datasetId: string;
247
+ }
237
248
  interface EvaluateArgs<TInput, TOutput = unknown, TCtx = Record<string, never>> {
238
249
  input: TInput;
239
250
  ctx: TCtx;
240
251
  output?: TOutput;
252
+ /** Metadata about the current evaluator invocation. */
253
+ meta: EvaluateMeta;
241
254
  /** Records a diff for this test case; stored in run artifact and shown by CLI */
242
255
  logDiff: (expected: unknown, actual: unknown, options?: CreateDiffLogEntryOptions) => void;
243
256
  /** Logs a message or object for this test case; stored in run artifact and shown by CLI */
@@ -302,6 +315,8 @@ interface ScoreItem<TData = unknown> {
302
315
  readonly id: string;
303
316
  readonly data: TData;
304
317
  readonly passed?: boolean;
318
+ /** Attached def for formatting/aggregation without registry lookup (avoids n/a across module boundaries) */
319
+ readonly def?: ScoreDef<TData>;
305
320
  }
306
321
  interface FormatScoreOptions {
307
322
  isAggregated?: boolean;
@@ -310,19 +325,43 @@ interface ScoreDef<TData = unknown> {
310
325
  readonly id: string;
311
326
  readonly name?: string;
312
327
  readonly displayStrategy: ScoreDisplayStrategy;
313
- readonly aggregate?: (values: ReadonlyArray<TData>) => TData;
314
- format(data: TData, options?: FormatScoreOptions): string;
328
+ readonly formatValue: (data: TData) => string;
329
+ readonly formatAggregate: (data: TData) => string;
330
+ readonly aggregateValues: (values: ReadonlyArray<TData>) => TData;
315
331
  make(data: TData, options?: {
316
332
  definePassed?: (data: TData) => boolean;
317
333
  }): ScoreItem<TData>;
318
334
  }
335
+ /** Helper to format using the right method based on isAggregated (for consumers that need a single entry point) */
336
+ declare function formatScoreData<TData>(def: ScoreDef<TData>, data: TData, options?: FormatScoreOptions): string;
319
337
  declare const Score: {
338
+ aggregate: {
339
+ /** Average numeric fields. Use for scores like { value, delta }. */
340
+ averageFields<K extends string>(fields: readonly K[]): (values: readonly Record<K, number>[]) => Record<K, number>;
341
+ /** Average `value` with sample std dev. Use for percent-style scores. */
342
+ averageWithVariance<T extends {
343
+ value: number;
344
+ }>(values: readonly T[]): T & {
345
+ stdDev?: number | undefined;
346
+ count: number;
347
+ };
348
+ /** All runs must pass. Use for binary scores. */
349
+ all<T_1 extends {
350
+ passed: boolean;
351
+ }>(values: readonly T_1[]): T_1 & {
352
+ passedCount?: number | undefined;
353
+ totalCount?: number | undefined;
354
+ };
355
+ /** Take last value (no aggregation). Use when aggregation is not meaningful. */
356
+ last<T_2>(values: readonly T_2[]): T_2;
357
+ };
320
358
  of<TData>(config: {
321
359
  id: string;
322
360
  name?: string | undefined;
323
361
  displayStrategy: ScoreDisplayStrategy;
324
- format: (data: TData, options?: FormatScoreOptions) => string;
325
- aggregate?: ((values: readonly TData[]) => TData) | undefined;
362
+ formatValue: (data: TData) => string;
363
+ formatAggregate: (data: TData) => string;
364
+ aggregateValues: (values: readonly TData[]) => TData;
326
365
  }): ScoreDef<TData>;
327
366
  };
328
367
  declare function getScoreById(id: string): ScoreDef<unknown> | undefined;
@@ -349,6 +388,11 @@ interface SearchTestCasesQuery {
349
388
  excludedPaths?: ReadonlyArray<string | RegExp>;
350
389
  }
351
390
  interface RunDatasetRequest {
391
+ /**
392
+ * Identifier for what triggered the run request (for example, a CLI command).
393
+ * When omitted, the runner generates one in the format `trg-[uuid]`.
394
+ */
395
+ triggerId?: string;
352
396
  datasetId: string;
353
397
  evaluatorIds: ReadonlyArray<string>;
354
398
  concurrency?: number;
@@ -462,6 +506,11 @@ interface PercentScoreData {
462
506
  count?: number;
463
507
  }
464
508
  declare const percentScore: ScoreDef<PercentScoreData>;
509
+ interface DeltaScoreData {
510
+ value: number;
511
+ delta: number;
512
+ }
513
+ declare const deltaScore: ScoreDef<DeltaScoreData>;
465
514
  interface BinaryScoreData {
466
515
  passed: boolean;
467
516
  passedCount?: number;
@@ -469,4 +518,4 @@ interface BinaryScoreData {
469
518
  }
470
519
  declare const binaryScore: ScoreDef<BinaryScoreData>;
471
520
 
472
- export { BinaryScoreData, CliState, CollectedDataset, CollectedEvaluator, CollectedTestCase, ConfigType, CreateDiffLogEntryOptions, Dataset, DiffLogEntry, EvalDataset, EvalMiddleware, EvalRun, EvalsData, EvaluateArgs, Evaluator, EvaluatorLogEntry, EvaluatorOption, FormatMetricOptions, FormatScoreOptions, JsonDiffOptions, LatencyData, LogEntry, M4trixEvalConfig, M4trixEvalConfigDiscovery, Metric, MetricDef, MetricItem, PathMatcher, PercentScoreData, PrintJsonDiffOptions, RunDatasetRequest, RunSnapshot, RunnerApi, RunnerConfig, RunnerConfigOverrides, RunnerDiscoveryConfig, RunnerEvent, Score, ScoreDef, ScoreDisplayStrategy, ScoreItem, SearchTestCasesQuery, StartupArgs, TagMatcher, TestCase, TokenCountData, ViewLevel, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, getLogLines, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, withRunnerConfig };
521
+ export { BinaryScoreData, CliState, CollectedDataset, CollectedEvaluator, CollectedTestCase, ConfigType, CreateDiffLogEntryOptions, Dataset, DeltaScoreData, DiffLogEntry, EvalDataset, EvalMiddleware, EvalRun, EvalsData, EvaluateArgs, EvaluateMeta, Evaluator, EvaluatorLogEntry, EvaluatorOption, FormatMetricOptions, FormatScoreOptions, JsonDiffOptions, LatencyData, LogEntry, M4trixEvalConfig, M4trixEvalConfigDiscovery, Metric, MetricDef, MetricItem, PathMatcher, PercentScoreData, PrintJsonDiffOptions, RunDatasetRequest, RunSnapshot, RunnerApi, RunnerConfig, RunnerConfigOverrides, RunnerDiscoveryConfig, RunnerEvent, Score, ScoreDef, ScoreDisplayStrategy, ScoreItem, SearchTestCasesQuery, StartupArgs, TagMatcher, TestCase, TokenCountData, ViewLevel, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, deltaScore, formatScoreData, getLogLines, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, withRunnerConfig };
package/dist/index.js CHANGED
@@ -513,20 +513,70 @@ function getMetricById(id) {
513
513
 
514
514
  // src/evals/score.ts
515
515
  var registry2 = /* @__PURE__ */ new Map();
516
+ function formatScoreData(def, data, options) {
517
+ return options?.isAggregated ? def.formatAggregate(data) : def.formatValue(data);
518
+ }
519
+ var ScoreAggregate = {
520
+ /** Average numeric fields. Use for scores like { value, delta }. */
521
+ averageFields(fields) {
522
+ return (values) => {
523
+ const count = values.length || 1;
524
+ const result = {};
525
+ for (const field of fields) {
526
+ result[field] = values.reduce((s, v) => s + (v[field] ?? 0), 0) / count;
527
+ }
528
+ return result;
529
+ };
530
+ },
531
+ /** Average `value` with sample std dev. Use for percent-style scores. */
532
+ averageWithVariance(values) {
533
+ if (values.length === 0) {
534
+ return { value: 0, stdDev: void 0, count: 0 };
535
+ }
536
+ const sum = values.reduce((s, v) => s + v.value, 0);
537
+ const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
538
+ const mean = sum / values.length;
539
+ let stdDev;
540
+ if (values.length >= 2) {
541
+ const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
542
+ stdDev = variance > 0 ? Math.sqrt(variance) : 0;
543
+ }
544
+ return { ...values[0], value: mean, stdDev, count: values.length };
545
+ },
546
+ /** All runs must pass. Use for binary scores. */
547
+ all(values) {
548
+ const total = values.length;
549
+ const passedCount = values.filter((v) => v.passed).length;
550
+ return {
551
+ ...values[0],
552
+ passed: total > 0 && values.every((v) => v.passed),
553
+ passedCount,
554
+ totalCount: total
555
+ };
556
+ },
557
+ /** Take last value (no aggregation). Use when aggregation is not meaningful. */
558
+ last(values) {
559
+ return values[values.length - 1] ?? {};
560
+ }
561
+ };
516
562
  var Score = {
563
+ aggregate: ScoreAggregate,
517
564
  of(config) {
518
565
  const def = {
519
566
  id: config.id,
520
567
  name: config.name,
521
568
  displayStrategy: config.displayStrategy,
522
- aggregate: config.aggregate,
523
- format: config.format,
569
+ formatValue: config.formatValue,
570
+ formatAggregate: config.formatAggregate,
571
+ aggregateValues: config.aggregateValues,
524
572
  make: (data, options) => {
525
573
  const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
526
574
  return {
527
575
  id: config.id,
528
576
  data,
529
- ...passed !== void 0 && { passed }
577
+ ...passed !== void 0 && { passed },
578
+ def
579
+ // Attach def so rendering/aggregation works without registry lookup
530
580
  };
531
581
  }
532
582
  };
@@ -539,29 +589,6 @@ function getScoreById(id) {
539
589
  }
540
590
 
541
591
  // src/evals/aggregators.ts
542
- function aggregateAverageWithVariance(values) {
543
- if (values.length === 0) {
544
- return { value: 0, count: 0 };
545
- }
546
- const sum = values.reduce((s, v) => s + v.value, 0);
547
- const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
548
- const mean = sum / values.length;
549
- let stdDev;
550
- if (values.length >= 2) {
551
- const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
552
- stdDev = variance > 0 ? Math.sqrt(variance) : 0;
553
- }
554
- return { value: mean, stdDev, count: values.length };
555
- }
556
- function aggregateAll(values) {
557
- const total = values.length;
558
- const passedCount = values.filter((v) => v.passed).length;
559
- return {
560
- passed: total > 0 && values.every((v) => v.passed),
561
- passedCount,
562
- totalCount: total
563
- };
564
- }
565
592
  function aggregateTokenCountSum(values) {
566
593
  const initial = {
567
594
  input: 0,
@@ -614,29 +641,31 @@ var percentScore = Score.of({
614
641
  id: "percent",
615
642
  name: "Score",
616
643
  displayStrategy: "bar",
617
- format: (data, options) => {
618
- if (options?.isAggregated) {
619
- return data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`;
620
- }
621
- return data.value.toFixed(2);
622
- },
623
- aggregate: aggregateAverageWithVariance
644
+ formatValue: (data) => data.value.toFixed(2),
645
+ formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
646
+ aggregateValues: Score.aggregate.averageWithVariance
647
+ });
648
+ var deltaScore = Score.of({
649
+ id: "delta",
650
+ name: "Delta",
651
+ displayStrategy: "number",
652
+ formatValue: (data) => `${data.value.toFixed(2)} (${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)} vs baseline)`,
653
+ formatAggregate: (data) => `Avg: ${data.value.toFixed(2)} (Delta: ${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)})`,
654
+ aggregateValues: Score.aggregate.averageFields(["value", "delta"])
624
655
  });
625
656
  var binaryScore = Score.of({
626
657
  id: "binary",
627
658
  name: "Result",
628
659
  displayStrategy: "passFail",
629
- format: (data, options) => {
630
- if (options?.isAggregated) {
631
- const base = data.passed ? "All: PASSED" : "Some: FAILED";
632
- if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
633
- return `${base} (${data.passedCount}/${data.totalCount})`;
634
- }
635
- return base;
660
+ formatValue: (data) => data.passed ? "PASSED" : "NOT PASSED",
661
+ formatAggregate: (data) => {
662
+ const base = data.passed ? "All: PASSED" : "Some: FAILED";
663
+ if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
664
+ return `${base} (${data.passedCount}/${data.totalCount})`;
636
665
  }
637
- return data.passed ? "PASSED" : "NOT PASSED";
666
+ return base;
638
667
  },
639
- aggregate: aggregateAll
668
+ aggregateValues: Score.aggregate.all
640
669
  });
641
670
  function createDiffString(expected, actual, diffOptions) {
642
671
  const opts = { ...diffOptions, color: false };
@@ -952,9 +981,12 @@ async function collectTestCasesFromFiles(config) {
952
981
  }
953
982
 
954
983
  // src/runner/score-utils.ts
984
+ function getScoreDef(item) {
985
+ return item.def ?? getScoreById(item.id);
986
+ }
955
987
  function toNumericScoreFromScores(scores) {
956
988
  for (const item of scores) {
957
- const def = getScoreById(item.id);
989
+ const def = getScoreDef(item);
958
990
  if (def && def.displayStrategy === "bar" && typeof item.data === "object" && item.data !== null && "value" in item.data) {
959
991
  const value = item.data.value;
960
992
  if (typeof value === "number" && Number.isFinite(value)) {
@@ -1035,6 +1067,7 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
1035
1067
  const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
1036
1068
  const rerunPassed = [];
1037
1069
  for (let r = 0; r < reruns; r++) {
1070
+ const evaluatorRunId = `run-${randomUUID()}`;
1038
1071
  const started = Date.now();
1039
1072
  const evaluatorScores = [];
1040
1073
  let testCaseError;
@@ -1061,6 +1094,11 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
1061
1094
  input: testCaseItem.testCase.getInput(),
1062
1095
  ctx,
1063
1096
  output,
1097
+ meta: {
1098
+ triggerId: task.triggerId,
1099
+ runId: evaluatorRunId,
1100
+ datasetId: task.datasetId
1101
+ },
1064
1102
  logDiff,
1065
1103
  log
1066
1104
  })
@@ -1336,7 +1374,7 @@ var createPersistenceWorker = (queue) => Effect.forever(
1336
1374
  () => appendJsonLine(message.artifactPath, {
1337
1375
  runId: message.runId,
1338
1376
  ts: Date.now(),
1339
- ...message.payload
1377
+ ...typeof message.payload === "object" && message.payload !== null && !Array.isArray(message.payload) ? message.payload : {}
1340
1378
  })
1341
1379
  );
1342
1380
  })
@@ -1520,6 +1558,7 @@ var EffectRunner = class {
1520
1558
  (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
1521
1559
  0
1522
1560
  );
1561
+ const triggerId = request.triggerId ?? `trg-${randomUUID()}`;
1523
1562
  const runId = `run-${randomUUID()}`;
1524
1563
  const artifactPath = createArtifactPath(
1525
1564
  this.config.artifactDirectory,
@@ -1561,6 +1600,7 @@ var EffectRunner = class {
1561
1600
  await Effect.runPromise(
1562
1601
  Queue.offer(this.runQueue, {
1563
1602
  runId,
1603
+ triggerId,
1564
1604
  datasetId: request.datasetId,
1565
1605
  dataset: dataset.dataset,
1566
1606
  evaluators: selectedEvaluators,
@@ -1634,6 +1674,6 @@ var EffectRunner = class {
1634
1674
  }
1635
1675
  };
1636
1676
 
1637
- export { Dataset, Evaluator, Metric, Score, TestCase, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, getLogLines, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, withRunnerConfig };
1677
+ export { Dataset, Evaluator, Metric, Score, TestCase, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, deltaScore, formatScoreData, getLogLines, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, withRunnerConfig };
1638
1678
  //# sourceMappingURL=out.js.map
1639
1679
  //# sourceMappingURL=index.js.map