@m4trix/evals 0.13.0 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -74,6 +74,7 @@ interface CliState {
74
74
  datasetMenuIndex: number;
75
75
  runMenuIndex: number;
76
76
  detailsScrollOffset: number;
77
+ overviewScrollOffset: number;
77
78
  selectedEvaluatorIds: string[];
78
79
  evaluatorMenuIndex: number;
79
80
  searchQuery: string;
@@ -412,10 +413,14 @@ declare const latencyMetric: MetricDef<LatencyData>;
412
413
 
413
414
  interface PercentScoreData {
414
415
  value: number;
416
+ stdDev?: number;
417
+ count?: number;
415
418
  }
416
419
  declare const percentScore: ScoreDef<PercentScoreData>;
417
420
  interface BinaryScoreData {
418
421
  passed: boolean;
422
+ passedCount?: number;
423
+ totalCount?: number;
419
424
  }
420
425
  declare const binaryScore: ScoreDef<BinaryScoreData>;
421
426
 
package/dist/index.js CHANGED
@@ -1,6 +1,6 @@
1
1
  import { Effect, PubSub, Queue, Fiber, Ref } from 'effect';
2
2
  export { Schema as S } from 'effect';
3
- import { diffString } from 'json-diff';
3
+ import { diffLines } from 'diff';
4
4
  import { randomUUID } from 'crypto';
5
5
  import { existsSync } from 'fs';
6
6
  import { resolve as resolve$1, relative, join, dirname } from 'path';
@@ -539,15 +539,28 @@ function getScoreById(id) {
539
539
  }
540
540
 
541
541
  // src/evals/aggregators.ts
542
- function aggregateAverage(values) {
542
+ function aggregateAverageWithVariance(values) {
543
543
  if (values.length === 0) {
544
- return { value: 0 };
544
+ return { value: 0, count: 0 };
545
545
  }
546
546
  const sum = values.reduce((s, v) => s + v.value, 0);
547
- return { value: sum / values.length };
547
+ const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
548
+ const mean = sum / values.length;
549
+ let stdDev;
550
+ if (values.length >= 2) {
551
+ const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
552
+ stdDev = variance > 0 ? Math.sqrt(variance) : 0;
553
+ }
554
+ return { value: mean, stdDev, count: values.length };
548
555
  }
549
556
  function aggregateAll(values) {
550
- return { passed: values.length > 0 && values.every((v) => v.passed) };
557
+ const total = values.length;
558
+ const passedCount = values.filter((v) => v.passed).length;
559
+ return {
560
+ passed: total > 0 && values.every((v) => v.passed),
561
+ passedCount,
562
+ totalCount: total
563
+ };
551
564
  }
552
565
  function aggregateTokenCountSum(values) {
553
566
  const initial = {
@@ -601,18 +614,59 @@ var percentScore = Score.of({
601
614
  id: "percent",
602
615
  name: "Score",
603
616
  displayStrategy: "bar",
604
- format: (data, options) => options?.isAggregated ? `Avg: ${data.value.toFixed(2)}` : data.value.toFixed(2),
605
- aggregate: aggregateAverage
617
+ format: (data, options) => {
618
+ if (options?.isAggregated) {
619
+ return data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`;
620
+ }
621
+ return data.value.toFixed(2);
622
+ },
623
+ aggregate: aggregateAverageWithVariance
606
624
  });
607
625
  var binaryScore = Score.of({
608
626
  id: "binary",
609
627
  name: "Result",
610
628
  displayStrategy: "passFail",
611
- format: (data, options) => options?.isAggregated ? data.passed ? "All: PASSED" : "Some: FAILED" : data.passed ? "PASSED" : "NOT PASSED",
629
+ format: (data, options) => {
630
+ if (options?.isAggregated) {
631
+ const base = data.passed ? "All: PASSED" : "Some: FAILED";
632
+ if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
633
+ return `${base} (${data.passedCount}/${data.totalCount})`;
634
+ }
635
+ return base;
636
+ }
637
+ return data.passed ? "PASSED" : "NOT PASSED";
638
+ },
612
639
  aggregate: aggregateAll
613
640
  });
641
+ function toJsonLines(value) {
642
+ try {
643
+ return JSON.stringify(value, null, 2);
644
+ } catch {
645
+ return String(value);
646
+ }
647
+ }
648
+ function formatDiffString(changes) {
649
+ const lines = [];
650
+ for (const part of changes) {
651
+ const prefix = part.added ? "+" : part.removed ? "-" : " ";
652
+ const partLines = part.value.split("\n");
653
+ if (partLines[partLines.length - 1] === "") {
654
+ partLines.pop();
655
+ }
656
+ for (const line of partLines) {
657
+ lines.push(`${prefix} ${line}`);
658
+ }
659
+ }
660
+ return lines.join("\n");
661
+ }
662
+ function createDiffString(expected, actual) {
663
+ const expectedStr = toJsonLines(expected);
664
+ const actualStr = toJsonLines(actual);
665
+ const changes = diffLines(expectedStr, actualStr);
666
+ return formatDiffString(changes);
667
+ }
614
668
  function createDiffLogEntry(expected, actual, options) {
615
- const diff = diffString(expected, actual, { color: false });
669
+ const diff = createDiffString(expected, actual);
616
670
  return {
617
671
  type: "diff",
618
672
  label: options?.label,
@@ -622,8 +676,22 @@ function createDiffLogEntry(expected, actual, options) {
622
676
  };
623
677
  }
624
678
  function printJsonDiff(expected, actual, options = {}) {
625
- const { color = true } = options;
626
- const diff = diffString(expected, actual, { color });
679
+ const diff = createDiffString(expected, actual);
680
+ if (options.color) {
681
+ const lines = diff.split("\n").map((line) => {
682
+ const trimmed = line.trimStart();
683
+ if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
684
+ return `\x1B[31m${line}\x1B[0m`;
685
+ }
686
+ if (trimmed.startsWith("+") && !trimmed.startsWith("+++")) {
687
+ return `\x1B[32m${line}\x1B[0m`;
688
+ }
689
+ return line;
690
+ });
691
+ const colored = lines.join("\n");
692
+ console.log(colored || "(no differences)");
693
+ return colored;
694
+ }
627
695
  console.log(diff || "(no differences)");
628
696
  return diff;
629
697
  }