@m4trix/evals 0.15.0 → 0.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -167,6 +167,30 @@ declare class Dataset {
167
167
  matchesTestCase(testCase: TestCase<unknown>, filePath: string): boolean;
168
168
  }
169
169
 
170
+ /**
171
+ * Options for customizing JSON diff output. Passed to logDiff, createDiffLogEntry, and printJsonDiff.
172
+ * @see https://www.npmjs.com/package/json-diff
173
+ */
174
+ interface JsonDiffOptions {
175
+ /** Include equal sections of the document, not just deltas */
176
+ full?: boolean;
177
+ /** Sort primitive values in arrays before comparing */
178
+ sort?: boolean;
179
+ /** Compare only keys, ignore value differences */
180
+ keysOnly?: boolean;
181
+ /** Always output these keys when their parent object has any diff (comma-separated or array) */
182
+ outputKeys?: string | string[];
183
+ /** Output only new/updated values (no - lines) */
184
+ outputNewOnly?: boolean;
185
+ /** Exclude these keys from comparison (comma-separated or array) */
186
+ excludeKeys?: string | string[];
187
+ /** Include unchanged values in output */
188
+ keepUnchangedValues?: boolean;
189
+ /** Round floats to this many decimals before comparing */
190
+ precision?: number;
191
+ /** Max ... elisions in a row before collapsing */
192
+ maxElisions?: number;
193
+ }
170
194
  interface DiffLogEntry {
171
195
  type: 'diff';
172
196
  label?: string;
@@ -174,7 +198,26 @@ interface DiffLogEntry {
174
198
  actual: unknown;
175
199
  diff: string;
176
200
  }
177
- interface PrintJsonDiffOptions {
201
+ interface LogEntry {
202
+ type: 'log';
203
+ label?: string;
204
+ message: string;
205
+ }
206
+ type EvaluatorLogEntry = DiffLogEntry | LogEntry;
207
+ /**
208
+ * Creates a LogEntry for storage in run artifacts. Use for logging objects or text.
209
+ */
210
+ declare function createLogEntry(message: unknown, options?: {
211
+ label?: string;
212
+ }): LogEntry;
213
+ /**
214
+ * Returns lines from a log entry for display.
215
+ */
216
+ declare function getLogLines(entry: LogEntry): string[];
217
+ interface CreateDiffLogEntryOptions extends JsonDiffOptions {
218
+ label?: string;
219
+ }
220
+ interface PrintJsonDiffOptions extends JsonDiffOptions {
178
221
  /** Enable ANSI colors (default: true) */
179
222
  color?: boolean;
180
223
  }
@@ -196,7 +239,9 @@ interface EvaluateArgs<TInput, TOutput = unknown, TCtx = Record<string, never>>
196
239
  ctx: TCtx;
197
240
  output?: TOutput;
198
241
  /** Records a diff for this test case; stored in run artifact and shown by CLI */
199
- logDiff: (expected: unknown, actual: unknown, options?: {
242
+ logDiff: (expected: unknown, actual: unknown, options?: CreateDiffLogEntryOptions) => void;
243
+ /** Logs a message or object for this test case; stored in run artifact and shown by CLI */
244
+ log: (message: unknown, options?: {
200
245
  label?: string;
201
246
  }) => void;
202
247
  }
@@ -352,7 +397,7 @@ type RunnerEvent = {
352
397
  scores: ReadonlyArray<ScoreItem>;
353
398
  passed: boolean;
354
399
  metrics?: ReadonlyArray<MetricItem>;
355
- logs?: ReadonlyArray<DiffLogEntry>;
400
+ logs?: ReadonlyArray<EvaluatorLogEntry>;
356
401
  }>;
357
402
  output?: unknown;
358
403
  errorMessage?: string;
@@ -424,4 +469,4 @@ interface BinaryScoreData {
424
469
  }
425
470
  declare const binaryScore: ScoreDef<BinaryScoreData>;
426
471
 
427
- export { BinaryScoreData, CliState, CollectedDataset, CollectedEvaluator, CollectedTestCase, ConfigType, Dataset, EvalDataset, EvalMiddleware, EvalRun, EvalsData, EvaluateArgs, Evaluator, EvaluatorOption, FormatMetricOptions, FormatScoreOptions, LatencyData, M4trixEvalConfig, M4trixEvalConfigDiscovery, Metric, MetricDef, MetricItem, PathMatcher, PercentScoreData, PrintJsonDiffOptions, RunDatasetRequest, RunSnapshot, RunnerApi, RunnerConfig, RunnerConfigOverrides, RunnerDiscoveryConfig, RunnerEvent, Score, ScoreDef, ScoreDisplayStrategy, ScoreItem, SearchTestCasesQuery, StartupArgs, TagMatcher, TestCase, TokenCountData, ViewLevel, binaryScore, createRunner, defaultRunnerConfig, defineConfig, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, withRunnerConfig };
472
+ export { BinaryScoreData, CliState, CollectedDataset, CollectedEvaluator, CollectedTestCase, ConfigType, CreateDiffLogEntryOptions, Dataset, DiffLogEntry, EvalDataset, EvalMiddleware, EvalRun, EvalsData, EvaluateArgs, Evaluator, EvaluatorLogEntry, EvaluatorOption, FormatMetricOptions, FormatScoreOptions, JsonDiffOptions, LatencyData, LogEntry, M4trixEvalConfig, M4trixEvalConfigDiscovery, Metric, MetricDef, MetricItem, PathMatcher, PercentScoreData, PrintJsonDiffOptions, RunDatasetRequest, RunSnapshot, RunnerApi, RunnerConfig, RunnerConfigOverrides, RunnerDiscoveryConfig, RunnerEvent, Score, ScoreDef, ScoreDisplayStrategy, ScoreItem, SearchTestCasesQuery, StartupArgs, TagMatcher, TestCase, TokenCountData, ViewLevel, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, getLogLines, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, withRunnerConfig };
package/dist/index.js CHANGED
@@ -1,6 +1,6 @@
1
1
  import { Effect, PubSub, Queue, Fiber, Ref } from 'effect';
2
2
  export { Schema as S } from 'effect';
3
- import { diffLines } from 'diff';
3
+ import { diffString } from 'json-diff';
4
4
  import { randomUUID } from 'crypto';
5
5
  import { existsSync } from 'fs';
6
6
  import { resolve as resolve$1, relative, join, dirname } from 'path';
@@ -638,46 +638,48 @@ var binaryScore = Score.of({
638
638
  },
639
639
  aggregate: aggregateAll
640
640
  });
641
- function toJsonLines(value) {
641
+ function createDiffString(expected, actual, diffOptions) {
642
+ const opts = { ...diffOptions, color: false };
643
+ const result = diffString(expected, actual, opts);
644
+ return typeof result === "string" ? result : "";
645
+ }
646
+ function formatLogMessage(msg) {
647
+ if (typeof msg === "string")
648
+ return msg;
642
649
  try {
643
- return JSON.stringify(value, null, 2);
650
+ if (msg !== null && typeof msg === "object") {
651
+ return JSON.stringify(msg, null, 2);
652
+ }
653
+ return String(msg);
644
654
  } catch {
645
- return String(value);
655
+ return String(msg);
646
656
  }
647
657
  }
648
- function formatDiffString(changes) {
649
- const lines = [];
650
- for (const part of changes) {
651
- const prefix = part.added ? "+" : part.removed ? "-" : " ";
652
- const partLines = part.value.split("\n");
653
- if (partLines[partLines.length - 1] === "") {
654
- partLines.pop();
655
- }
656
- for (const line of partLines) {
657
- lines.push(`${prefix} ${line}`);
658
- }
659
- }
660
- return lines.join("\n");
658
+ function createLogEntry(message, options) {
659
+ return {
660
+ type: "log",
661
+ label: options?.label,
662
+ message: formatLogMessage(message)
663
+ };
661
664
  }
662
- function createDiffString(expected, actual) {
663
- const expectedStr = toJsonLines(expected);
664
- const actualStr = toJsonLines(actual);
665
- const changes = diffLines(expectedStr, actualStr);
666
- return formatDiffString(changes);
665
+ function getLogLines(entry) {
666
+ return entry.message.split("\n");
667
667
  }
668
668
  function createDiffLogEntry(expected, actual, options) {
669
- const diff = createDiffString(expected, actual);
669
+ const { label, ...diffOpts } = options ?? {};
670
+ const diff = createDiffString(expected, actual, diffOpts);
670
671
  return {
671
672
  type: "diff",
672
- label: options?.label,
673
+ label,
673
674
  expected,
674
675
  actual,
675
676
  diff: diff || "(no differences)"
676
677
  };
677
678
  }
678
679
  function printJsonDiff(expected, actual, options = {}) {
679
- const diff = createDiffString(expected, actual);
680
- if (options.color) {
680
+ const { color = true, ...diffOpts } = options;
681
+ const diff = createDiffString(expected, actual, diffOpts);
682
+ if (color) {
681
683
  const lines = diff.split("\n").map((line) => {
682
684
  const trimmed = line.trimStart();
683
685
  if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
@@ -1047,6 +1049,9 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
1047
1049
  const logDiff = (expected, actual, options) => {
1048
1050
  logs.push(createDiffLogEntry(expected, actual, options));
1049
1051
  };
1052
+ const log = (message, options) => {
1053
+ logs.push(createLogEntry(message, options));
1054
+ };
1050
1055
  const ctx = yield* Effect.promise(
1051
1056
  () => Promise.resolve(evaluator.resolveContext())
1052
1057
  );
@@ -1056,7 +1061,8 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
1056
1061
  input: testCaseItem.testCase.getInput(),
1057
1062
  ctx,
1058
1063
  output,
1059
- logDiff
1064
+ logDiff,
1065
+ log
1060
1066
  })
1061
1067
  )
1062
1068
  );
@@ -1628,6 +1634,6 @@ var EffectRunner = class {
1628
1634
  }
1629
1635
  };
1630
1636
 
1631
- export { Dataset, Evaluator, Metric, Score, TestCase, binaryScore, createRunner, defaultRunnerConfig, defineConfig, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, withRunnerConfig };
1637
+ export { Dataset, Evaluator, Metric, Score, TestCase, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, getLogLines, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, withRunnerConfig };
1632
1638
  //# sourceMappingURL=out.js.map
1633
1639
  //# sourceMappingURL=index.js.map