@m4trix/evals 0.17.0 → 0.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -289,6 +289,8 @@ declare class Evaluator<TInput = unknown, TOutput = unknown, TScore = unknown, T
289
289
  interface MetricItem<TData = unknown> {
290
290
  readonly id: string;
291
291
  readonly data: TData;
292
+ /** Per-item display name override (wins over def.name in rendering) */
293
+ readonly name?: string;
292
294
  }
293
295
  interface FormatMetricOptions {
294
296
  isAggregated?: boolean;
@@ -298,7 +300,9 @@ interface MetricDef<TData = unknown> {
298
300
  readonly name?: string;
299
301
  readonly aggregate?: (values: ReadonlyArray<TData>) => TData;
300
302
  format(data: TData, options?: FormatMetricOptions): string;
301
- make(data: TData): MetricItem<TData>;
303
+ make(data: TData, options?: {
304
+ name?: string;
305
+ }): MetricItem<TData>;
302
306
  }
303
307
  declare const Metric: {
304
308
  of<TData>(config: {
@@ -315,6 +319,10 @@ interface ScoreItem<TData = unknown> {
315
319
  readonly id: string;
316
320
  readonly data: TData;
317
321
  readonly passed?: boolean;
322
+ /** Per-item display name override (wins over def.name in rendering) */
323
+ readonly name?: string;
324
+ /** Attached def for formatting/aggregation without registry lookup (avoids n/a across module boundaries) */
325
+ readonly def?: ScoreDef<TData>;
318
326
  }
319
327
  interface FormatScoreOptions {
320
328
  isAggregated?: boolean;
@@ -323,19 +331,42 @@ interface ScoreDef<TData = unknown> {
323
331
  readonly id: string;
324
332
  readonly name?: string;
325
333
  readonly displayStrategy: ScoreDisplayStrategy;
326
- readonly aggregate?: (values: ReadonlyArray<TData>) => TData;
327
- format(data: TData, options?: FormatScoreOptions): string;
334
+ readonly formatValue: (data: TData) => string;
335
+ readonly formatAggregate: (data: TData) => string;
336
+ readonly aggregateValues: (values: ReadonlyArray<TData>) => TData;
328
337
  make(data: TData, options?: {
329
338
  definePassed?: (data: TData) => boolean;
339
+ name?: string;
330
340
  }): ScoreItem<TData>;
331
341
  }
342
+ /** Helper to format using the right method based on isAggregated (for consumers that need a single entry point) */
343
+ declare function formatScoreData<TData>(def: ScoreDef<TData>, data: TData, options?: FormatScoreOptions): string;
332
344
  declare const Score: {
345
+ aggregate: {
346
+ /** Average numeric fields. Use for scores like { value, delta }. */
347
+ averageFields<K extends string>(fields: readonly K[]): (values: readonly Record<K, number>[]) => Record<K, number>;
348
+ /** Average selected numeric fields, with sample std dev tracked for `value`. */
349
+ averageWithVariance<K_1 extends string>(fields: readonly K_1[]): (values: readonly Record<K_1, number>[]) => Record<K_1, number> & {
350
+ stdDev?: number | undefined;
351
+ count: number;
352
+ };
353
+ /** All runs must pass. Use for binary scores. */
354
+ all<T extends {
355
+ passed: boolean;
356
+ }>(values: readonly T[]): T & {
357
+ passedCount?: number | undefined;
358
+ totalCount?: number | undefined;
359
+ };
360
+ /** Take last value (no aggregation). Use when aggregation is not meaningful. */
361
+ last<T_1>(values: readonly T_1[]): T_1;
362
+ };
333
363
  of<TData>(config: {
334
364
  id: string;
335
365
  name?: string | undefined;
336
366
  displayStrategy: ScoreDisplayStrategy;
337
- format: (data: TData, options?: FormatScoreOptions) => string;
338
- aggregate?: ((values: readonly TData[]) => TData) | undefined;
367
+ formatValue: (data: TData) => string;
368
+ formatAggregate: (data: TData) => string;
369
+ aggregateValues: (values: readonly TData[]) => TData;
339
370
  }): ScoreDef<TData>;
340
371
  };
341
372
  declare function getScoreById(id: string): ScoreDef<unknown> | undefined;
@@ -480,6 +511,11 @@ interface PercentScoreData {
480
511
  count?: number;
481
512
  }
482
513
  declare const percentScore: ScoreDef<PercentScoreData>;
514
+ interface DeltaScoreData {
515
+ value: number;
516
+ delta: number;
517
+ }
518
+ declare const deltaScore: ScoreDef<DeltaScoreData>;
483
519
  interface BinaryScoreData {
484
520
  passed: boolean;
485
521
  passedCount?: number;
@@ -487,4 +523,4 @@ interface BinaryScoreData {
487
523
  }
488
524
  declare const binaryScore: ScoreDef<BinaryScoreData>;
489
525
 
490
- export { BinaryScoreData, CliState, CollectedDataset, CollectedEvaluator, CollectedTestCase, ConfigType, CreateDiffLogEntryOptions, Dataset, DiffLogEntry, EvalDataset, EvalMiddleware, EvalRun, EvalsData, EvaluateArgs, EvaluateMeta, Evaluator, EvaluatorLogEntry, EvaluatorOption, FormatMetricOptions, FormatScoreOptions, JsonDiffOptions, LatencyData, LogEntry, M4trixEvalConfig, M4trixEvalConfigDiscovery, Metric, MetricDef, MetricItem, PathMatcher, PercentScoreData, PrintJsonDiffOptions, RunDatasetRequest, RunSnapshot, RunnerApi, RunnerConfig, RunnerConfigOverrides, RunnerDiscoveryConfig, RunnerEvent, Score, ScoreDef, ScoreDisplayStrategy, ScoreItem, SearchTestCasesQuery, StartupArgs, TagMatcher, TestCase, TokenCountData, ViewLevel, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, getLogLines, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, withRunnerConfig };
526
+ export { BinaryScoreData, CliState, CollectedDataset, CollectedEvaluator, CollectedTestCase, ConfigType, CreateDiffLogEntryOptions, Dataset, DeltaScoreData, DiffLogEntry, EvalDataset, EvalMiddleware, EvalRun, EvalsData, EvaluateArgs, EvaluateMeta, Evaluator, EvaluatorLogEntry, EvaluatorOption, FormatMetricOptions, FormatScoreOptions, JsonDiffOptions, LatencyData, LogEntry, M4trixEvalConfig, M4trixEvalConfigDiscovery, Metric, MetricDef, MetricItem, PathMatcher, PercentScoreData, PrintJsonDiffOptions, RunDatasetRequest, RunSnapshot, RunnerApi, RunnerConfig, RunnerConfigOverrides, RunnerDiscoveryConfig, RunnerEvent, Score, ScoreDef, ScoreDisplayStrategy, ScoreItem, SearchTestCasesQuery, StartupArgs, TagMatcher, TestCase, TokenCountData, ViewLevel, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, deltaScore, formatScoreData, getLogLines, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, withRunnerConfig };
package/dist/index.js CHANGED
@@ -501,7 +501,11 @@ var Metric = {
501
501
  name: config.name,
502
502
  aggregate: config.aggregate,
503
503
  format: config.format,
504
- make: (data) => ({ id: config.id, data })
504
+ make: (data, options) => ({
505
+ id: config.id,
506
+ data,
507
+ ...options?.name !== void 0 && { name: options.name }
508
+ })
505
509
  };
506
510
  registry.set(config.id, def);
507
511
  return def;
@@ -513,20 +517,107 @@ function getMetricById(id) {
513
517
 
514
518
  // src/evals/score.ts
515
519
  var registry2 = /* @__PURE__ */ new Map();
520
+ function formatScoreData(def, data, options) {
521
+ return options?.isAggregated ? def.formatAggregate(data) : def.formatValue(data);
522
+ }
523
+ var ScoreAggregate = {
524
+ /** Average numeric fields. Use for scores like { value, delta }. */
525
+ averageFields(fields) {
526
+ return (values) => {
527
+ const count = values.length || 1;
528
+ const result = {};
529
+ for (const field of fields) {
530
+ result[field] = values.reduce(
531
+ (s, v) => s + (v[field] ?? 0),
532
+ 0
533
+ ) / count;
534
+ }
535
+ return result;
536
+ };
537
+ },
538
+ /** Average selected numeric fields, with sample std dev tracked for `value`. */
539
+ averageWithVariance(fields) {
540
+ return (values) => {
541
+ const count = values.length;
542
+ const result = {};
543
+ for (const field of fields) {
544
+ result[field] = count === 0 ? 0 : values.reduce(
545
+ (sum, item) => sum + (item[field] ?? 0),
546
+ 0
547
+ ) / count;
548
+ }
549
+ const valueField = "value";
550
+ const hasValueField = fields.includes(valueField);
551
+ if (count === 0) {
552
+ if (hasValueField) {
553
+ result[valueField] = 0;
554
+ }
555
+ return {
556
+ ...result,
557
+ stdDev: void 0,
558
+ count: 0
559
+ };
560
+ }
561
+ let stdDev;
562
+ if (hasValueField && count >= 2) {
563
+ const sum = values.reduce(
564
+ (s, v) => s + (v[valueField] ?? 0),
565
+ 0
566
+ );
567
+ const sumSq = values.reduce(
568
+ (s, v) => {
569
+ const value = v[valueField] ?? 0;
570
+ return s + value * value;
571
+ },
572
+ 0
573
+ );
574
+ const mean = sum / count;
575
+ const variance = (sumSq - count * mean * mean) / (count - 1);
576
+ stdDev = variance > 0 ? Math.sqrt(variance) : 0;
577
+ }
578
+ return {
579
+ ...values[0],
580
+ ...result,
581
+ stdDev,
582
+ count
583
+ };
584
+ };
585
+ },
586
+ /** All runs must pass. Use for binary scores. */
587
+ all(values) {
588
+ const total = values.length;
589
+ const passedCount = values.filter((v) => v.passed).length;
590
+ return {
591
+ ...values[0],
592
+ passed: total > 0 && values.every((v) => v.passed),
593
+ passedCount,
594
+ totalCount: total
595
+ };
596
+ },
597
+ /** Take last value (no aggregation). Use when aggregation is not meaningful. */
598
+ last(values) {
599
+ return values[values.length - 1] ?? {};
600
+ }
601
+ };
516
602
  var Score = {
603
+ aggregate: ScoreAggregate,
517
604
  of(config) {
518
605
  const def = {
519
606
  id: config.id,
520
607
  name: config.name,
521
608
  displayStrategy: config.displayStrategy,
522
- aggregate: config.aggregate,
523
- format: config.format,
609
+ formatValue: config.formatValue,
610
+ formatAggregate: config.formatAggregate,
611
+ aggregateValues: config.aggregateValues,
524
612
  make: (data, options) => {
525
613
  const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
526
614
  return {
527
615
  id: config.id,
528
616
  data,
529
- ...passed !== void 0 && { passed }
617
+ ...passed !== void 0 && { passed },
618
+ ...options?.name !== void 0 && { name: options.name },
619
+ def
620
+ // Attach def so rendering/aggregation works without registry lookup
530
621
  };
531
622
  }
532
623
  };
@@ -539,29 +630,6 @@ function getScoreById(id) {
539
630
  }
540
631
 
541
632
  // src/evals/aggregators.ts
542
- function aggregateAverageWithVariance(values) {
543
- if (values.length === 0) {
544
- return { value: 0, count: 0 };
545
- }
546
- const sum = values.reduce((s, v) => s + v.value, 0);
547
- const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
548
- const mean = sum / values.length;
549
- let stdDev;
550
- if (values.length >= 2) {
551
- const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
552
- stdDev = variance > 0 ? Math.sqrt(variance) : 0;
553
- }
554
- return { value: mean, stdDev, count: values.length };
555
- }
556
- function aggregateAll(values) {
557
- const total = values.length;
558
- const passedCount = values.filter((v) => v.passed).length;
559
- return {
560
- passed: total > 0 && values.every((v) => v.passed),
561
- passedCount,
562
- totalCount: total
563
- };
564
- }
565
633
  function aggregateTokenCountSum(values) {
566
634
  const initial = {
567
635
  input: 0,
@@ -614,29 +682,31 @@ var percentScore = Score.of({
614
682
  id: "percent",
615
683
  name: "Score",
616
684
  displayStrategy: "bar",
617
- format: (data, options) => {
618
- if (options?.isAggregated) {
619
- return data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`;
620
- }
621
- return data.value.toFixed(2);
622
- },
623
- aggregate: aggregateAverageWithVariance
685
+ formatValue: (data) => data.value.toFixed(2),
686
+ formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
687
+ aggregateValues: Score.aggregate.averageWithVariance(["value"])
688
+ });
689
+ var deltaScore = Score.of({
690
+ id: "delta",
691
+ name: "Delta",
692
+ displayStrategy: "number",
693
+ formatValue: (data) => `${data.value.toFixed(2)} (${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)} vs baseline)`,
694
+ formatAggregate: (data) => `Avg: ${data.value.toFixed(2)} (Delta: ${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)})`,
695
+ aggregateValues: Score.aggregate.averageFields(["value", "delta"])
624
696
  });
625
697
  var binaryScore = Score.of({
626
698
  id: "binary",
627
699
  name: "Result",
628
700
  displayStrategy: "passFail",
629
- format: (data, options) => {
630
- if (options?.isAggregated) {
631
- const base = data.passed ? "All: PASSED" : "Some: FAILED";
632
- if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
633
- return `${base} (${data.passedCount}/${data.totalCount})`;
634
- }
635
- return base;
701
+ formatValue: (data) => data.passed ? "PASSED" : "NOT PASSED",
702
+ formatAggregate: (data) => {
703
+ const base = data.passed ? "All: PASSED" : "Some: FAILED";
704
+ if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
705
+ return `${base} (${data.passedCount}/${data.totalCount})`;
636
706
  }
637
- return data.passed ? "PASSED" : "NOT PASSED";
707
+ return base;
638
708
  },
639
- aggregate: aggregateAll
709
+ aggregateValues: Score.aggregate.all
640
710
  });
641
711
  function createDiffString(expected, actual, diffOptions) {
642
712
  const opts = { ...diffOptions, color: false };
@@ -952,9 +1022,12 @@ async function collectTestCasesFromFiles(config) {
952
1022
  }
953
1023
 
954
1024
  // src/runner/score-utils.ts
1025
+ function getScoreDef(item) {
1026
+ return item.def ?? getScoreById(item.id);
1027
+ }
955
1028
  function toNumericScoreFromScores(scores) {
956
1029
  for (const item of scores) {
957
- const def = getScoreById(item.id);
1030
+ const def = getScoreDef(item);
958
1031
  if (def && def.displayStrategy === "bar" && typeof item.data === "object" && item.data !== null && "value" in item.data) {
959
1032
  const value = item.data.value;
960
1033
  if (typeof value === "number" && Number.isFinite(value)) {
@@ -1342,7 +1415,7 @@ var createPersistenceWorker = (queue) => Effect.forever(
1342
1415
  () => appendJsonLine(message.artifactPath, {
1343
1416
  runId: message.runId,
1344
1417
  ts: Date.now(),
1345
- ...message.payload
1418
+ ...typeof message.payload === "object" && message.payload !== null && !Array.isArray(message.payload) ? message.payload : {}
1346
1419
  })
1347
1420
  );
1348
1421
  })
@@ -1642,6 +1715,6 @@ var EffectRunner = class {
1642
1715
  }
1643
1716
  };
1644
1717
 
1645
- export { Dataset, Evaluator, Metric, Score, TestCase, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, getLogLines, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, withRunnerConfig };
1718
+ export { Dataset, Evaluator, Metric, Score, TestCase, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, deltaScore, formatScoreData, getLogLines, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, withRunnerConfig };
1646
1719
  //# sourceMappingURL=out.js.map
1647
1720
  //# sourceMappingURL=index.js.map