@m4trix/evals 0.17.0 → 0.18.0

This diff compares publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -315,6 +315,8 @@ interface ScoreItem<TData = unknown> {
315
315
  readonly id: string;
316
316
  readonly data: TData;
317
317
  readonly passed?: boolean;
318
+ /** Attached def for formatting/aggregation without registry lookup (avoids n/a across module boundaries) */
319
+ readonly def?: ScoreDef<TData>;
318
320
  }
319
321
  interface FormatScoreOptions {
320
322
  isAggregated?: boolean;
@@ -323,19 +325,43 @@ interface ScoreDef<TData = unknown> {
323
325
  readonly id: string;
324
326
  readonly name?: string;
325
327
  readonly displayStrategy: ScoreDisplayStrategy;
326
- readonly aggregate?: (values: ReadonlyArray<TData>) => TData;
327
- format(data: TData, options?: FormatScoreOptions): string;
328
+ readonly formatValue: (data: TData) => string;
329
+ readonly formatAggregate: (data: TData) => string;
330
+ readonly aggregateValues: (values: ReadonlyArray<TData>) => TData;
328
331
  make(data: TData, options?: {
329
332
  definePassed?: (data: TData) => boolean;
330
333
  }): ScoreItem<TData>;
331
334
  }
335
+ /** Helper to format using the right method based on isAggregated (for consumers that need a single entry point) */
336
+ declare function formatScoreData<TData>(def: ScoreDef<TData>, data: TData, options?: FormatScoreOptions): string;
332
337
  declare const Score: {
338
+ aggregate: {
339
+ /** Average numeric fields. Use for scores like { value, delta }. */
340
+ averageFields<K extends string>(fields: readonly K[]): (values: readonly Record<K, number>[]) => Record<K, number>;
341
+ /** Average `value` with sample std dev. Use for percent-style scores. */
342
+ averageWithVariance<T extends {
343
+ value: number;
344
+ }>(values: readonly T[]): T & {
345
+ stdDev?: number | undefined;
346
+ count: number;
347
+ };
348
+ /** All runs must pass. Use for binary scores. */
349
+ all<T_1 extends {
350
+ passed: boolean;
351
+ }>(values: readonly T_1[]): T_1 & {
352
+ passedCount?: number | undefined;
353
+ totalCount?: number | undefined;
354
+ };
355
+ /** Take last value (no aggregation). Use when aggregation is not meaningful. */
356
+ last<T_2>(values: readonly T_2[]): T_2;
357
+ };
333
358
  of<TData>(config: {
334
359
  id: string;
335
360
  name?: string | undefined;
336
361
  displayStrategy: ScoreDisplayStrategy;
337
- format: (data: TData, options?: FormatScoreOptions) => string;
338
- aggregate?: ((values: readonly TData[]) => TData) | undefined;
362
+ formatValue: (data: TData) => string;
363
+ formatAggregate: (data: TData) => string;
364
+ aggregateValues: (values: readonly TData[]) => TData;
339
365
  }): ScoreDef<TData>;
340
366
  };
341
367
  declare function getScoreById(id: string): ScoreDef<unknown> | undefined;
@@ -480,6 +506,11 @@ interface PercentScoreData {
480
506
  count?: number;
481
507
  }
482
508
  declare const percentScore: ScoreDef<PercentScoreData>;
509
+ interface DeltaScoreData {
510
+ value: number;
511
+ delta: number;
512
+ }
513
+ declare const deltaScore: ScoreDef<DeltaScoreData>;
483
514
  interface BinaryScoreData {
484
515
  passed: boolean;
485
516
  passedCount?: number;
@@ -487,4 +518,4 @@ interface BinaryScoreData {
487
518
  }
488
519
  declare const binaryScore: ScoreDef<BinaryScoreData>;
489
520
 
490
- export { BinaryScoreData, CliState, CollectedDataset, CollectedEvaluator, CollectedTestCase, ConfigType, CreateDiffLogEntryOptions, Dataset, DiffLogEntry, EvalDataset, EvalMiddleware, EvalRun, EvalsData, EvaluateArgs, EvaluateMeta, Evaluator, EvaluatorLogEntry, EvaluatorOption, FormatMetricOptions, FormatScoreOptions, JsonDiffOptions, LatencyData, LogEntry, M4trixEvalConfig, M4trixEvalConfigDiscovery, Metric, MetricDef, MetricItem, PathMatcher, PercentScoreData, PrintJsonDiffOptions, RunDatasetRequest, RunSnapshot, RunnerApi, RunnerConfig, RunnerConfigOverrides, RunnerDiscoveryConfig, RunnerEvent, Score, ScoreDef, ScoreDisplayStrategy, ScoreItem, SearchTestCasesQuery, StartupArgs, TagMatcher, TestCase, TokenCountData, ViewLevel, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, getLogLines, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, withRunnerConfig };
521
+ export { BinaryScoreData, CliState, CollectedDataset, CollectedEvaluator, CollectedTestCase, ConfigType, CreateDiffLogEntryOptions, Dataset, DeltaScoreData, DiffLogEntry, EvalDataset, EvalMiddleware, EvalRun, EvalsData, EvaluateArgs, EvaluateMeta, Evaluator, EvaluatorLogEntry, EvaluatorOption, FormatMetricOptions, FormatScoreOptions, JsonDiffOptions, LatencyData, LogEntry, M4trixEvalConfig, M4trixEvalConfigDiscovery, Metric, MetricDef, MetricItem, PathMatcher, PercentScoreData, PrintJsonDiffOptions, RunDatasetRequest, RunSnapshot, RunnerApi, RunnerConfig, RunnerConfigOverrides, RunnerDiscoveryConfig, RunnerEvent, Score, ScoreDef, ScoreDisplayStrategy, ScoreItem, SearchTestCasesQuery, StartupArgs, TagMatcher, TestCase, TokenCountData, ViewLevel, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, deltaScore, formatScoreData, getLogLines, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, withRunnerConfig };
package/dist/index.js CHANGED
@@ -513,20 +513,70 @@ function getMetricById(id) {
513
513
 
514
514
  // src/evals/score.ts
515
515
  var registry2 = /* @__PURE__ */ new Map();
516
+ function formatScoreData(def, data, options) {
517
+ return options?.isAggregated ? def.formatAggregate(data) : def.formatValue(data);
518
+ }
519
+ var ScoreAggregate = {
520
+ /** Average numeric fields. Use for scores like { value, delta }. */
521
+ averageFields(fields) {
522
+ return (values) => {
523
+ const count = values.length || 1;
524
+ const result = {};
525
+ for (const field of fields) {
526
+ result[field] = values.reduce((s, v) => s + (v[field] ?? 0), 0) / count;
527
+ }
528
+ return result;
529
+ };
530
+ },
531
+ /** Average `value` with sample std dev. Use for percent-style scores. */
532
+ averageWithVariance(values) {
533
+ if (values.length === 0) {
534
+ return { value: 0, stdDev: void 0, count: 0 };
535
+ }
536
+ const sum = values.reduce((s, v) => s + v.value, 0);
537
+ const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
538
+ const mean = sum / values.length;
539
+ let stdDev;
540
+ if (values.length >= 2) {
541
+ const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
542
+ stdDev = variance > 0 ? Math.sqrt(variance) : 0;
543
+ }
544
+ return { ...values[0], value: mean, stdDev, count: values.length };
545
+ },
546
+ /** All runs must pass. Use for binary scores. */
547
+ all(values) {
548
+ const total = values.length;
549
+ const passedCount = values.filter((v) => v.passed).length;
550
+ return {
551
+ ...values[0],
552
+ passed: total > 0 && values.every((v) => v.passed),
553
+ passedCount,
554
+ totalCount: total
555
+ };
556
+ },
557
+ /** Take last value (no aggregation). Use when aggregation is not meaningful. */
558
+ last(values) {
559
+ return values[values.length - 1] ?? {};
560
+ }
561
+ };
516
562
  var Score = {
563
+ aggregate: ScoreAggregate,
517
564
  of(config) {
518
565
  const def = {
519
566
  id: config.id,
520
567
  name: config.name,
521
568
  displayStrategy: config.displayStrategy,
522
- aggregate: config.aggregate,
523
- format: config.format,
569
+ formatValue: config.formatValue,
570
+ formatAggregate: config.formatAggregate,
571
+ aggregateValues: config.aggregateValues,
524
572
  make: (data, options) => {
525
573
  const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
526
574
  return {
527
575
  id: config.id,
528
576
  data,
529
- ...passed !== void 0 && { passed }
577
+ ...passed !== void 0 && { passed },
578
+ def
579
+ // Attach def so rendering/aggregation works without registry lookup
530
580
  };
531
581
  }
532
582
  };
@@ -539,29 +589,6 @@ function getScoreById(id) {
539
589
  }
540
590
 
541
591
  // src/evals/aggregators.ts
542
- function aggregateAverageWithVariance(values) {
543
- if (values.length === 0) {
544
- return { value: 0, count: 0 };
545
- }
546
- const sum = values.reduce((s, v) => s + v.value, 0);
547
- const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
548
- const mean = sum / values.length;
549
- let stdDev;
550
- if (values.length >= 2) {
551
- const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
552
- stdDev = variance > 0 ? Math.sqrt(variance) : 0;
553
- }
554
- return { value: mean, stdDev, count: values.length };
555
- }
556
- function aggregateAll(values) {
557
- const total = values.length;
558
- const passedCount = values.filter((v) => v.passed).length;
559
- return {
560
- passed: total > 0 && values.every((v) => v.passed),
561
- passedCount,
562
- totalCount: total
563
- };
564
- }
565
592
  function aggregateTokenCountSum(values) {
566
593
  const initial = {
567
594
  input: 0,
@@ -614,29 +641,31 @@ var percentScore = Score.of({
614
641
  id: "percent",
615
642
  name: "Score",
616
643
  displayStrategy: "bar",
617
- format: (data, options) => {
618
- if (options?.isAggregated) {
619
- return data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`;
620
- }
621
- return data.value.toFixed(2);
622
- },
623
- aggregate: aggregateAverageWithVariance
644
+ formatValue: (data) => data.value.toFixed(2),
645
+ formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
646
+ aggregateValues: Score.aggregate.averageWithVariance
647
+ });
648
+ var deltaScore = Score.of({
649
+ id: "delta",
650
+ name: "Delta",
651
+ displayStrategy: "number",
652
+ formatValue: (data) => `${data.value.toFixed(2)} (${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)} vs baseline)`,
653
+ formatAggregate: (data) => `Avg: ${data.value.toFixed(2)} (Delta: ${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)})`,
654
+ aggregateValues: Score.aggregate.averageFields(["value", "delta"])
624
655
  });
625
656
  var binaryScore = Score.of({
626
657
  id: "binary",
627
658
  name: "Result",
628
659
  displayStrategy: "passFail",
629
- format: (data, options) => {
630
- if (options?.isAggregated) {
631
- const base = data.passed ? "All: PASSED" : "Some: FAILED";
632
- if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
633
- return `${base} (${data.passedCount}/${data.totalCount})`;
634
- }
635
- return base;
660
+ formatValue: (data) => data.passed ? "PASSED" : "NOT PASSED",
661
+ formatAggregate: (data) => {
662
+ const base = data.passed ? "All: PASSED" : "Some: FAILED";
663
+ if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
664
+ return `${base} (${data.passedCount}/${data.totalCount})`;
636
665
  }
637
- return data.passed ? "PASSED" : "NOT PASSED";
666
+ return base;
638
667
  },
639
- aggregate: aggregateAll
668
+ aggregateValues: Score.aggregate.all
640
669
  });
641
670
  function createDiffString(expected, actual, diffOptions) {
642
671
  const opts = { ...diffOptions, color: false };
@@ -952,9 +981,12 @@ async function collectTestCasesFromFiles(config) {
952
981
  }
953
982
 
954
983
  // src/runner/score-utils.ts
984
+ function getScoreDef(item) {
985
+ return item.def ?? getScoreById(item.id);
986
+ }
955
987
  function toNumericScoreFromScores(scores) {
956
988
  for (const item of scores) {
957
- const def = getScoreById(item.id);
989
+ const def = getScoreDef(item);
958
990
  if (def && def.displayStrategy === "bar" && typeof item.data === "object" && item.data !== null && "value" in item.data) {
959
991
  const value = item.data.value;
960
992
  if (typeof value === "number" && Number.isFinite(value)) {
@@ -1342,7 +1374,7 @@ var createPersistenceWorker = (queue) => Effect.forever(
1342
1374
  () => appendJsonLine(message.artifactPath, {
1343
1375
  runId: message.runId,
1344
1376
  ts: Date.now(),
1345
- ...message.payload
1377
+ ...typeof message.payload === "object" && message.payload !== null && !Array.isArray(message.payload) ? message.payload : {}
1346
1378
  })
1347
1379
  );
1348
1380
  })
@@ -1642,6 +1674,6 @@ var EffectRunner = class {
1642
1674
  }
1643
1675
  };
1644
1676
 
1645
- export { Dataset, Evaluator, Metric, Score, TestCase, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, getLogLines, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, withRunnerConfig };
1677
+ export { Dataset, Evaluator, Metric, Score, TestCase, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, deltaScore, formatScoreData, getLogLines, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, withRunnerConfig };
1646
1678
  //# sourceMappingURL=out.js.map
1647
1679
  //# sourceMappingURL=index.js.map