@m4trix/evals 0.24.0 → 0.25.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -184,9 +184,7 @@ var data_mock_default = {
184
184
  { name: "contract_match", score: 100 },
185
185
  { name: "arg_validity", score: 100 }
186
186
  ],
187
- checks: [
188
- { name: "tool_calls", passed: true, detail: "0 unexpected" }
189
- ],
187
+ checks: [{ name: "tool_calls", passed: true, detail: "0 unexpected" }],
190
188
  failures: [],
191
189
  meta: {
192
190
  model: "gpt-4o-mini",
@@ -209,9 +207,21 @@ var data_mock_default = {
209
207
  }
210
208
  ],
211
209
  evaluators: [
212
- { id: "json-schema-validator", name: "JSON Schema Validator", configPreview: "strict=true" },
213
- { id: "tool-call-contract-checker", name: "Tool-call Contract Checker", configPreview: "unexpectedCalls=error" },
214
- { id: "rubric-judge", name: "Rubric Judge (LLM)", configPreview: "model=gpt-4o-mini; scale=0-100" },
210
+ {
211
+ id: "json-schema-validator",
212
+ name: "JSON Schema Validator",
213
+ configPreview: "strict=true"
214
+ },
215
+ {
216
+ id: "tool-call-contract-checker",
217
+ name: "Tool-call Contract Checker",
218
+ configPreview: "unexpectedCalls=error"
219
+ },
220
+ {
221
+ id: "rubric-judge",
222
+ name: "Rubric Judge (LLM)",
223
+ configPreview: "model=gpt-4o-mini; scale=0-100"
224
+ },
215
225
  { id: "pii-leak-detector", name: "PII Leak Detector", configPreview: "redact=false" }
216
226
  ]
217
227
  };
@@ -291,9 +301,7 @@ async function loadRunnerData(runner) {
291
301
  const memSnapshots = runner.getAllRunSnapshots();
292
302
  const seen = new Set(memSnapshots.map((s) => s.runId));
293
303
  const fromDisk = diskSnapshots.filter((s) => !seen.has(s.runId));
294
- const snapshots = [...memSnapshots, ...fromDisk].sort(
295
- (a, b) => b.queuedAt - a.queuedAt
296
- );
304
+ const snapshots = [...memSnapshots, ...fromDisk].sort((a, b) => b.queuedAt - a.queuedAt);
297
305
  if (datasets.length === 0 && evaluators.length === 0) {
298
306
  return loadMockData();
299
307
  }
@@ -447,9 +455,7 @@ var Evaluator = class _Evaluator {
447
455
  return this._config.passCriterion;
448
456
  }
449
457
  async resolveContext() {
450
- const parts = await Promise.all(
451
- this._config.middlewares.map((mw) => mw.resolve())
452
- );
458
+ const parts = await Promise.all(this._config.middlewares.map((mw) => mw.resolve()));
453
459
  return Object.assign({}, ...parts);
454
460
  }
455
461
  };
@@ -553,10 +559,7 @@ var ScoreAggregate = {
553
559
  const count = values.length || 1;
554
560
  const result = {};
555
561
  for (const field of fields) {
556
- result[field] = values.reduce(
557
- (s, v) => s + (v[field] ?? 0),
558
- 0
559
- ) / count;
562
+ result[field] = values.reduce((s, v) => s + (v[field] ?? 0), 0) / count;
560
563
  }
561
564
  return result;
562
565
  };
@@ -590,13 +593,10 @@ var ScoreAggregate = {
590
593
  (s, v) => s + (v[valueField] ?? 0),
591
594
  0
592
595
  );
593
- const sumSq = values.reduce(
594
- (s, v) => {
595
- const value = v[valueField] ?? 0;
596
- return s + value * value;
597
- },
598
- 0
599
- );
596
+ const sumSq = values.reduce((s, v) => {
597
+ const value = v[valueField] ?? 0;
598
+ return s + value * value;
599
+ }, 0);
600
600
  const mean = sum / count;
601
601
  const variance = (sumSq - count * mean * mean) / (count - 1);
602
602
  stdDev = variance > 0 ? Math.sqrt(variance) : 0;
@@ -791,16 +791,8 @@ function createDiffString(expected, actual, diffOptions) {
791
791
  const expectedProcessed = preprocessForDiff(expected, diffOptions);
792
792
  const actualProcessed = preprocessForDiff(actual, diffOptions);
793
793
  if (diffOptions?.keysOnly) {
794
- const expectedKeys = JSON.stringify(
795
- extractKeys(expectedProcessed),
796
- null,
797
- 2
798
- );
799
- const actualKeys = JSON.stringify(
800
- extractKeys(actualProcessed),
801
- null,
802
- 2
803
- );
794
+ const expectedKeys = JSON.stringify(extractKeys(expectedProcessed), null, 2);
795
+ const actualKeys = JSON.stringify(extractKeys(actualProcessed), null, 2);
804
796
  const parts2 = diff.diffLines(expectedKeys, actualKeys);
805
797
  return formatDiffParts(parts2);
806
798
  }
@@ -811,9 +803,7 @@ function createDiffString(expected, actual, diffOptions) {
811
803
  }
812
804
  const parts = diff.diffLines(expectedStr, actualStr);
813
805
  if (diffOptions?.outputNewOnly) {
814
- const filtered = parts.filter(
815
- (p) => p.added === true
816
- );
806
+ const filtered = parts.filter((p) => p.added === true);
817
807
  return formatDiffParts(filtered);
818
808
  }
819
809
  return formatDiffParts(parts);
@@ -896,18 +886,8 @@ var defaultRunnerConfig = {
896
886
  discovery: {
897
887
  rootDir: process.cwd(),
898
888
  datasetSuffixes: [".dataset.ts", ".dataset.tsx", ".dataset.js", ".dataset.mjs"],
899
- evaluatorSuffixes: [
900
- ".evaluator.ts",
901
- ".evaluator.tsx",
902
- ".evaluator.js",
903
- ".evaluator.mjs"
904
- ],
905
- testCaseSuffixes: [
906
- ".test-case.ts",
907
- ".test-case.tsx",
908
- ".test-case.js",
909
- ".test-case.mjs"
910
- ],
889
+ evaluatorSuffixes: [".evaluator.ts", ".evaluator.tsx", ".evaluator.js", ".evaluator.mjs"],
890
+ testCaseSuffixes: [".test-case.ts", ".test-case.tsx", ".test-case.js", ".test-case.mjs"],
911
891
  excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
912
892
  },
913
893
  artifactDirectory: ".eval-results",
@@ -974,14 +954,15 @@ function getJitiLoader() {
974
954
  }
975
955
  const createJiti2 = jitiModule__namespace.createJiti ?? jitiModule__namespace.default;
976
956
  if (typeof createJiti2 !== "function") {
977
- throw new Error(
978
- "Failed to initialize jiti for m4trix eval config loading."
979
- );
957
+ throw new Error("Failed to initialize jiti for m4trix eval config loading.");
980
958
  }
981
- cachedLoader = createJiti2((typeof document === 'undefined' ? require('u' + 'rl').pathToFileURL(__filename).href : (_documentCurrentScript && _documentCurrentScript.tagName.toUpperCase() === 'SCRIPT' && _documentCurrentScript.src || new URL('out.js', document.baseURI).href)), {
982
- interopDefault: true,
983
- moduleCache: true
984
- });
959
+ cachedLoader = createJiti2(
960
+ (typeof document === 'undefined' ? require('u' + 'rl').pathToFileURL(__filename).href : (_documentCurrentScript && _documentCurrentScript.tagName.toUpperCase() === 'SCRIPT' && _documentCurrentScript.src || new URL('out.js', document.baseURI).href)),
961
+ {
962
+ interopDefault: true,
963
+ moduleCache: true
964
+ }
965
+ );
985
966
  return cachedLoader;
986
967
  }
987
968
  function resolveConfigModuleExport(loadedModule) {
@@ -1085,9 +1066,7 @@ async function loadModuleExports(filePath) {
1085
1066
  }
1086
1067
  async function collectDatasetsFromFiles(config) {
1087
1068
  const files = await walkDirectory(config.rootDir, config.excludeDirectories);
1088
- const matched = files.filter(
1089
- (filePath) => hasOneSuffix(filePath, config.datasetSuffixes)
1090
- );
1069
+ const matched = files.filter((filePath) => hasOneSuffix(filePath, config.datasetSuffixes));
1091
1070
  const found = await Promise.all(
1092
1071
  matched.map(async (absolutePath) => {
1093
1072
  const exports = await loadModuleExports(absolutePath);
@@ -1104,9 +1083,7 @@ async function collectDatasetsFromFiles(config) {
1104
1083
  }
1105
1084
  async function collectEvaluatorsFromFiles(config) {
1106
1085
  const files = await walkDirectory(config.rootDir, config.excludeDirectories);
1107
- const matched = files.filter(
1108
- (filePath) => hasOneSuffix(filePath, config.evaluatorSuffixes)
1109
- );
1086
+ const matched = files.filter((filePath) => hasOneSuffix(filePath, config.evaluatorSuffixes));
1110
1087
  const found = await Promise.all(
1111
1088
  matched.map(async (absolutePath) => {
1112
1089
  const exports = await loadModuleExports(absolutePath);
@@ -1123,9 +1100,7 @@ async function collectEvaluatorsFromFiles(config) {
1123
1100
  }
1124
1101
  async function collectTestCasesFromFiles(config) {
1125
1102
  const files = await walkDirectory(config.rootDir, config.excludeDirectories);
1126
- const matched = files.filter(
1127
- (filePath) => hasOneSuffix(filePath, config.testCaseSuffixes)
1128
- );
1103
+ const matched = files.filter((filePath) => hasOneSuffix(filePath, config.testCaseSuffixes));
1129
1104
  const found = await Promise.all(
1130
1105
  matched.map(async (absolutePath) => {
1131
1106
  const exports = await loadModuleExports(absolutePath);
@@ -1233,20 +1208,14 @@ function nowIsoForFile() {
1233
1208
  return (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
1234
1209
  }
1235
1210
  function createArtifactPath(artifactDirectory, datasetId, runId) {
1236
- return path.join(
1237
- artifactDirectory,
1238
- `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
1239
- );
1211
+ return path.join(artifactDirectory, `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`);
1240
1212
  }
1241
1213
  function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
1242
1214
  const { testCaseItem, rerunIndex, rerunTotal } = unit;
1243
1215
  return effect.Effect.gen(function* () {
1244
1216
  const evaluatorRunId = `run-${crypto.randomUUID()}`;
1245
1217
  const started = Date.now();
1246
- const startedEvaluations = yield* effect.Ref.modify(startedRef, (n) => [
1247
- n + 1,
1248
- n + 1
1249
- ]);
1218
+ const startedEvaluations = yield* effect.Ref.modify(startedRef, (n) => [n + 1, n + 1]);
1250
1219
  yield* publishEvent({
1251
1220
  type: "TestCaseStarted",
1252
1221
  runId: task.runId,
@@ -1279,9 +1248,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1279
1248
  return error;
1280
1249
  };
1281
1250
  try {
1282
- const ctx = yield* effect.Effect.promise(
1283
- () => Promise.resolve(evaluator.resolveContext())
1284
- );
1251
+ const ctx = yield* effect.Effect.promise(() => Promise.resolve(evaluator.resolveContext()));
1285
1252
  const result = yield* effect.Effect.promise(
1286
1253
  () => Promise.resolve().then(
1287
1254
  () => evaluateFn({
@@ -1336,10 +1303,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1336
1303
  }
1337
1304
  }
1338
1305
  const rerunPassedThis = evaluatorScores.every((s) => s.passed);
1339
- const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [
1340
- n + 1,
1341
- n + 1
1342
- ]);
1306
+ const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [n + 1, n + 1]);
1343
1307
  const progressEvent = {
1344
1308
  type: "TestCaseProgress",
1345
1309
  runId: task.runId,
@@ -1388,10 +1352,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1388
1352
  } else {
1389
1353
  yield* effect.Ref.update(failedRef, (n) => n + 1);
1390
1354
  }
1391
- const [passed, failed] = yield* effect.Effect.all([
1392
- effect.Ref.get(passedRef),
1393
- effect.Ref.get(failedRef)
1394
- ]);
1355
+ const [passed, failed] = yield* effect.Effect.all([effect.Ref.get(passedRef), effect.Ref.get(failedRef)]);
1395
1356
  yield* updateSnapshot(task.runId, (snapshot) => ({
1396
1357
  ...snapshot,
1397
1358
  passedTestCases: passed,
@@ -1711,15 +1672,11 @@ var EffectRunner = class {
1711
1672
  this.persistenceQueue = effect.Effect.runSync(
1712
1673
  effect.Queue.unbounded()
1713
1674
  );
1714
- this.snapshotsRef = effect.Effect.runSync(
1715
- effect.Ref.make(/* @__PURE__ */ new Map())
1716
- );
1675
+ this.snapshotsRef = effect.Effect.runSync(effect.Ref.make(/* @__PURE__ */ new Map()));
1717
1676
  this.listeners = /* @__PURE__ */ new Set();
1718
1677
  this.datasetsById = /* @__PURE__ */ new Map();
1719
1678
  this.evaluatorsById = /* @__PURE__ */ new Map();
1720
- this.schedulerFiber = effect.Effect.runFork(
1721
- this.createSchedulerEffect()
1722
- );
1679
+ this.schedulerFiber = effect.Effect.runFork(this.createSchedulerEffect());
1723
1680
  this.persistenceFiber = effect.Effect.runFork(
1724
1681
  createPersistenceWorker(this.persistenceQueue)
1725
1682
  );
@@ -1866,9 +1823,9 @@ var EffectRunner = class {
1866
1823
  return effect.Effect.runSync(effect.Ref.get(this.snapshotsRef)).get(runId);
1867
1824
  }
1868
1825
  getAllRunSnapshots() {
1869
- return Array.from(
1870
- effect.Effect.runSync(effect.Ref.get(this.snapshotsRef)).values()
1871
- ).sort((a, b) => b.queuedAt - a.queuedAt);
1826
+ return Array.from(effect.Effect.runSync(effect.Ref.get(this.snapshotsRef)).values()).sort(
1827
+ (a, b) => b.queuedAt - a.queuedAt
1828
+ );
1872
1829
  }
1873
1830
  async loadRunSnapshotsFromArtifacts() {
1874
1831
  return loadRunSnapshotsFromArtifacts(this.config);