@m4trix/evals 0.15.0 → 0.17.0

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -167,6 +167,30 @@ declare class Dataset {
167
167
  matchesTestCase(testCase: TestCase<unknown>, filePath: string): boolean;
168
168
  }
169
169
 
170
+ /**
171
+ * Options for customizing JSON diff output. Passed to logDiff, createDiffLogEntry, and printJsonDiff.
172
+ * @see https://www.npmjs.com/package/json-diff
173
+ */
174
+ interface JsonDiffOptions {
175
+ /** Include equal sections of the document, not just deltas */
176
+ full?: boolean;
177
+ /** Sort primitive values in arrays before comparing */
178
+ sort?: boolean;
179
+ /** Compare only keys, ignore value differences */
180
+ keysOnly?: boolean;
181
+ /** Always output these keys when their parent object has any diff (comma-separated or array) */
182
+ outputKeys?: string | string[];
183
+ /** Output only new/updated values (no - lines) */
184
+ outputNewOnly?: boolean;
185
+ /** Exclude these keys from comparison (comma-separated or array) */
186
+ excludeKeys?: string | string[];
187
+ /** Include unchanged values in output */
188
+ keepUnchangedValues?: boolean;
189
+ /** Round floats to this many decimals before comparing */
190
+ precision?: number;
191
+ /** Max ... elisions in a row before collapsing */
192
+ maxElisions?: number;
193
+ }
170
194
  interface DiffLogEntry {
171
195
  type: 'diff';
172
196
  label?: string;
@@ -174,7 +198,26 @@ interface DiffLogEntry {
174
198
  actual: unknown;
175
199
  diff: string;
176
200
  }
177
- interface PrintJsonDiffOptions {
201
+ interface LogEntry {
202
+ type: 'log';
203
+ label?: string;
204
+ message: string;
205
+ }
206
+ type EvaluatorLogEntry = DiffLogEntry | LogEntry;
207
+ /**
208
+ * Creates a LogEntry for storage in run artifacts. Use for logging objects or text.
209
+ */
210
+ declare function createLogEntry(message: unknown, options?: {
211
+ label?: string;
212
+ }): LogEntry;
213
+ /**
214
+ * Returns lines from a log entry for display.
215
+ */
216
+ declare function getLogLines(entry: LogEntry): string[];
217
+ interface CreateDiffLogEntryOptions extends JsonDiffOptions {
218
+ label?: string;
219
+ }
220
+ interface PrintJsonDiffOptions extends JsonDiffOptions {
178
221
  /** Enable ANSI colors (default: true) */
179
222
  color?: boolean;
180
223
  }
@@ -191,12 +234,27 @@ interface EvalMiddleware<TCtx> {
191
234
  name: string;
192
235
  resolve: () => TCtx | Promise<TCtx>;
193
236
  }
237
+ interface EvaluateMeta {
238
+ /** Identifier of the trigger that started the run (for example, a CLI invocation). */
239
+ triggerId: string;
240
+ /**
241
+ * Identifier of the current test-case execution shared across all evaluators
242
+ * for this specific test-case run.
243
+ */
244
+ runId: string;
245
+ /** Identifier of the dataset currently being evaluated. */
246
+ datasetId: string;
247
+ }
194
248
  interface EvaluateArgs<TInput, TOutput = unknown, TCtx = Record<string, never>> {
195
249
  input: TInput;
196
250
  ctx: TCtx;
197
251
  output?: TOutput;
252
+ /** Metadata about the current evaluator invocation. */
253
+ meta: EvaluateMeta;
198
254
  /** Records a diff for this test case; stored in run artifact and shown by CLI */
199
- logDiff: (expected: unknown, actual: unknown, options?: {
255
+ logDiff: (expected: unknown, actual: unknown, options?: CreateDiffLogEntryOptions) => void;
256
+ /** Logs a message or object for this test case; stored in run artifact and shown by CLI */
257
+ log: (message: unknown, options?: {
200
258
  label?: string;
201
259
  }) => void;
202
260
  }
@@ -304,6 +362,11 @@ interface SearchTestCasesQuery {
304
362
  excludedPaths?: ReadonlyArray<string | RegExp>;
305
363
  }
306
364
  interface RunDatasetRequest {
365
+ /**
366
+ * Identifier for what triggered the run request (for example, a CLI command).
367
+ * When omitted, the runner generates one in the format `trg-[uuid]`.
368
+ */
369
+ triggerId?: string;
307
370
  datasetId: string;
308
371
  evaluatorIds: ReadonlyArray<string>;
309
372
  concurrency?: number;
@@ -352,7 +415,7 @@ type RunnerEvent = {
352
415
  scores: ReadonlyArray<ScoreItem>;
353
416
  passed: boolean;
354
417
  metrics?: ReadonlyArray<MetricItem>;
355
- logs?: ReadonlyArray<DiffLogEntry>;
418
+ logs?: ReadonlyArray<EvaluatorLogEntry>;
356
419
  }>;
357
420
  output?: unknown;
358
421
  errorMessage?: string;
@@ -424,4 +487,4 @@ interface BinaryScoreData {
424
487
  }
425
488
  declare const binaryScore: ScoreDef<BinaryScoreData>;
426
489
 
427
- export { BinaryScoreData, CliState, CollectedDataset, CollectedEvaluator, CollectedTestCase, ConfigType, Dataset, EvalDataset, EvalMiddleware, EvalRun, EvalsData, EvaluateArgs, Evaluator, EvaluatorOption, FormatMetricOptions, FormatScoreOptions, LatencyData, M4trixEvalConfig, M4trixEvalConfigDiscovery, Metric, MetricDef, MetricItem, PathMatcher, PercentScoreData, PrintJsonDiffOptions, RunDatasetRequest, RunSnapshot, RunnerApi, RunnerConfig, RunnerConfigOverrides, RunnerDiscoveryConfig, RunnerEvent, Score, ScoreDef, ScoreDisplayStrategy, ScoreItem, SearchTestCasesQuery, StartupArgs, TagMatcher, TestCase, TokenCountData, ViewLevel, binaryScore, createRunner, defaultRunnerConfig, defineConfig, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, withRunnerConfig };
490
+ export { BinaryScoreData, CliState, CollectedDataset, CollectedEvaluator, CollectedTestCase, ConfigType, CreateDiffLogEntryOptions, Dataset, DiffLogEntry, EvalDataset, EvalMiddleware, EvalRun, EvalsData, EvaluateArgs, EvaluateMeta, Evaluator, EvaluatorLogEntry, EvaluatorOption, FormatMetricOptions, FormatScoreOptions, JsonDiffOptions, LatencyData, LogEntry, M4trixEvalConfig, M4trixEvalConfigDiscovery, Metric, MetricDef, MetricItem, PathMatcher, PercentScoreData, PrintJsonDiffOptions, RunDatasetRequest, RunSnapshot, RunnerApi, RunnerConfig, RunnerConfigOverrides, RunnerDiscoveryConfig, RunnerEvent, Score, ScoreDef, ScoreDisplayStrategy, ScoreItem, SearchTestCasesQuery, StartupArgs, TagMatcher, TestCase, TokenCountData, ViewLevel, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, getLogLines, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, withRunnerConfig };
package/dist/index.js CHANGED
@@ -1,6 +1,6 @@
1
1
  import { Effect, PubSub, Queue, Fiber, Ref } from 'effect';
2
2
  export { Schema as S } from 'effect';
3
- import { diffLines } from 'diff';
3
+ import { diffString } from 'json-diff';
4
4
  import { randomUUID } from 'crypto';
5
5
  import { existsSync } from 'fs';
6
6
  import { resolve as resolve$1, relative, join, dirname } from 'path';
@@ -638,46 +638,48 @@ var binaryScore = Score.of({
638
638
  },
639
639
  aggregate: aggregateAll
640
640
  });
641
- function toJsonLines(value) {
641
+ function createDiffString(expected, actual, diffOptions) {
642
+ const opts = { ...diffOptions, color: false };
643
+ const result = diffString(expected, actual, opts);
644
+ return typeof result === "string" ? result : "";
645
+ }
646
+ function formatLogMessage(msg) {
647
+ if (typeof msg === "string")
648
+ return msg;
642
649
  try {
643
- return JSON.stringify(value, null, 2);
650
+ if (msg !== null && typeof msg === "object") {
651
+ return JSON.stringify(msg, null, 2);
652
+ }
653
+ return String(msg);
644
654
  } catch {
645
- return String(value);
655
+ return String(msg);
646
656
  }
647
657
  }
648
- function formatDiffString(changes) {
649
- const lines = [];
650
- for (const part of changes) {
651
- const prefix = part.added ? "+" : part.removed ? "-" : " ";
652
- const partLines = part.value.split("\n");
653
- if (partLines[partLines.length - 1] === "") {
654
- partLines.pop();
655
- }
656
- for (const line of partLines) {
657
- lines.push(`${prefix} ${line}`);
658
- }
659
- }
660
- return lines.join("\n");
658
+ function createLogEntry(message, options) {
659
+ return {
660
+ type: "log",
661
+ label: options?.label,
662
+ message: formatLogMessage(message)
663
+ };
661
664
  }
662
- function createDiffString(expected, actual) {
663
- const expectedStr = toJsonLines(expected);
664
- const actualStr = toJsonLines(actual);
665
- const changes = diffLines(expectedStr, actualStr);
666
- return formatDiffString(changes);
665
+ function getLogLines(entry) {
666
+ return entry.message.split("\n");
667
667
  }
668
668
  function createDiffLogEntry(expected, actual, options) {
669
- const diff = createDiffString(expected, actual);
669
+ const { label, ...diffOpts } = options ?? {};
670
+ const diff = createDiffString(expected, actual, diffOpts);
670
671
  return {
671
672
  type: "diff",
672
- label: options?.label,
673
+ label,
673
674
  expected,
674
675
  actual,
675
676
  diff: diff || "(no differences)"
676
677
  };
677
678
  }
678
679
  function printJsonDiff(expected, actual, options = {}) {
679
- const diff = createDiffString(expected, actual);
680
- if (options.color) {
680
+ const { color = true, ...diffOpts } = options;
681
+ const diff = createDiffString(expected, actual, diffOpts);
682
+ if (color) {
681
683
  const lines = diff.split("\n").map((line) => {
682
684
  const trimmed = line.trimStart();
683
685
  if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
@@ -1033,6 +1035,7 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
1033
1035
  const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
1034
1036
  const rerunPassed = [];
1035
1037
  for (let r = 0; r < reruns; r++) {
1038
+ const evaluatorRunId = `run-${randomUUID()}`;
1036
1039
  const started = Date.now();
1037
1040
  const evaluatorScores = [];
1038
1041
  let testCaseError;
@@ -1047,6 +1050,9 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
1047
1050
  const logDiff = (expected, actual, options) => {
1048
1051
  logs.push(createDiffLogEntry(expected, actual, options));
1049
1052
  };
1053
+ const log = (message, options) => {
1054
+ logs.push(createLogEntry(message, options));
1055
+ };
1050
1056
  const ctx = yield* Effect.promise(
1051
1057
  () => Promise.resolve(evaluator.resolveContext())
1052
1058
  );
@@ -1056,7 +1062,13 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
1056
1062
  input: testCaseItem.testCase.getInput(),
1057
1063
  ctx,
1058
1064
  output,
1059
- logDiff
1065
+ meta: {
1066
+ triggerId: task.triggerId,
1067
+ runId: evaluatorRunId,
1068
+ datasetId: task.datasetId
1069
+ },
1070
+ logDiff,
1071
+ log
1060
1072
  })
1061
1073
  )
1062
1074
  );
@@ -1514,6 +1526,7 @@ var EffectRunner = class {
1514
1526
  (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
1515
1527
  0
1516
1528
  );
1529
+ const triggerId = request.triggerId ?? `trg-${randomUUID()}`;
1517
1530
  const runId = `run-${randomUUID()}`;
1518
1531
  const artifactPath = createArtifactPath(
1519
1532
  this.config.artifactDirectory,
@@ -1555,6 +1568,7 @@ var EffectRunner = class {
1555
1568
  await Effect.runPromise(
1556
1569
  Queue.offer(this.runQueue, {
1557
1570
  runId,
1571
+ triggerId,
1558
1572
  datasetId: request.datasetId,
1559
1573
  dataset: dataset.dataset,
1560
1574
  evaluators: selectedEvaluators,
@@ -1628,6 +1642,6 @@ var EffectRunner = class {
1628
1642
  }
1629
1643
  };
1630
1644
 
1631
- export { Dataset, Evaluator, Metric, Score, TestCase, binaryScore, createRunner, defaultRunnerConfig, defineConfig, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, withRunnerConfig };
1645
+ export { Dataset, Evaluator, Metric, Score, TestCase, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, getLogLines, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, withRunnerConfig };
1632
1646
  //# sourceMappingURL=out.js.map
1633
1647
  //# sourceMappingURL=index.js.map