@m4trix/evals 0.25.0 → 0.25.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -159,9 +159,7 @@ var data_mock_default = {
159
159
  { name: "contract_match", score: 100 },
160
160
  { name: "arg_validity", score: 100 }
161
161
  ],
162
- checks: [
163
- { name: "tool_calls", passed: true, detail: "0 unexpected" }
164
- ],
162
+ checks: [{ name: "tool_calls", passed: true, detail: "0 unexpected" }],
165
163
  failures: [],
166
164
  meta: {
167
165
  model: "gpt-4o-mini",
@@ -184,9 +182,21 @@ var data_mock_default = {
184
182
  }
185
183
  ],
186
184
  evaluators: [
187
- { id: "json-schema-validator", name: "JSON Schema Validator", configPreview: "strict=true" },
188
- { id: "tool-call-contract-checker", name: "Tool-call Contract Checker", configPreview: "unexpectedCalls=error" },
189
- { id: "rubric-judge", name: "Rubric Judge (LLM)", configPreview: "model=gpt-4o-mini; scale=0-100" },
185
+ {
186
+ id: "json-schema-validator",
187
+ name: "JSON Schema Validator",
188
+ configPreview: "strict=true"
189
+ },
190
+ {
191
+ id: "tool-call-contract-checker",
192
+ name: "Tool-call Contract Checker",
193
+ configPreview: "unexpectedCalls=error"
194
+ },
195
+ {
196
+ id: "rubric-judge",
197
+ name: "Rubric Judge (LLM)",
198
+ configPreview: "model=gpt-4o-mini; scale=0-100"
199
+ },
190
200
  { id: "pii-leak-detector", name: "PII Leak Detector", configPreview: "redact=false" }
191
201
  ]
192
202
  };
@@ -266,9 +276,7 @@ async function loadRunnerData(runner) {
266
276
  const memSnapshots = runner.getAllRunSnapshots();
267
277
  const seen = new Set(memSnapshots.map((s) => s.runId));
268
278
  const fromDisk = diskSnapshots.filter((s) => !seen.has(s.runId));
269
- const snapshots = [...memSnapshots, ...fromDisk].sort(
270
- (a, b) => b.queuedAt - a.queuedAt
271
- );
279
+ const snapshots = [...memSnapshots, ...fromDisk].sort((a, b) => b.queuedAt - a.queuedAt);
272
280
  if (datasets.length === 0 && evaluators.length === 0) {
273
281
  return loadMockData();
274
282
  }
@@ -422,9 +430,7 @@ var Evaluator = class _Evaluator {
422
430
  return this._config.passCriterion;
423
431
  }
424
432
  async resolveContext() {
425
- const parts = await Promise.all(
426
- this._config.middlewares.map((mw) => mw.resolve())
427
- );
433
+ const parts = await Promise.all(this._config.middlewares.map((mw) => mw.resolve()));
428
434
  return Object.assign({}, ...parts);
429
435
  }
430
436
  };
@@ -528,10 +534,7 @@ var ScoreAggregate = {
528
534
  const count = values.length || 1;
529
535
  const result = {};
530
536
  for (const field of fields) {
531
- result[field] = values.reduce(
532
- (s, v) => s + (v[field] ?? 0),
533
- 0
534
- ) / count;
537
+ result[field] = values.reduce((s, v) => s + (v[field] ?? 0), 0) / count;
535
538
  }
536
539
  return result;
537
540
  };
@@ -565,13 +568,10 @@ var ScoreAggregate = {
565
568
  (s, v) => s + (v[valueField] ?? 0),
566
569
  0
567
570
  );
568
- const sumSq = values.reduce(
569
- (s, v) => {
570
- const value = v[valueField] ?? 0;
571
- return s + value * value;
572
- },
573
- 0
574
- );
571
+ const sumSq = values.reduce((s, v) => {
572
+ const value = v[valueField] ?? 0;
573
+ return s + value * value;
574
+ }, 0);
575
575
  const mean = sum / count;
576
576
  const variance = (sumSq - count * mean * mean) / (count - 1);
577
577
  stdDev = variance > 0 ? Math.sqrt(variance) : 0;
@@ -766,16 +766,8 @@ function createDiffString(expected, actual, diffOptions) {
766
766
  const expectedProcessed = preprocessForDiff(expected, diffOptions);
767
767
  const actualProcessed = preprocessForDiff(actual, diffOptions);
768
768
  if (diffOptions?.keysOnly) {
769
- const expectedKeys = JSON.stringify(
770
- extractKeys(expectedProcessed),
771
- null,
772
- 2
773
- );
774
- const actualKeys = JSON.stringify(
775
- extractKeys(actualProcessed),
776
- null,
777
- 2
778
- );
769
+ const expectedKeys = JSON.stringify(extractKeys(expectedProcessed), null, 2);
770
+ const actualKeys = JSON.stringify(extractKeys(actualProcessed), null, 2);
779
771
  const parts2 = diffLines(expectedKeys, actualKeys);
780
772
  return formatDiffParts(parts2);
781
773
  }
@@ -786,9 +778,7 @@ function createDiffString(expected, actual, diffOptions) {
786
778
  }
787
779
  const parts = diffLines(expectedStr, actualStr);
788
780
  if (diffOptions?.outputNewOnly) {
789
- const filtered = parts.filter(
790
- (p) => p.added === true
791
- );
781
+ const filtered = parts.filter((p) => p.added === true);
792
782
  return formatDiffParts(filtered);
793
783
  }
794
784
  return formatDiffParts(parts);
@@ -871,18 +861,8 @@ var defaultRunnerConfig = {
871
861
  discovery: {
872
862
  rootDir: process.cwd(),
873
863
  datasetSuffixes: [".dataset.ts", ".dataset.tsx", ".dataset.js", ".dataset.mjs"],
874
- evaluatorSuffixes: [
875
- ".evaluator.ts",
876
- ".evaluator.tsx",
877
- ".evaluator.js",
878
- ".evaluator.mjs"
879
- ],
880
- testCaseSuffixes: [
881
- ".test-case.ts",
882
- ".test-case.tsx",
883
- ".test-case.js",
884
- ".test-case.mjs"
885
- ],
864
+ evaluatorSuffixes: [".evaluator.ts", ".evaluator.tsx", ".evaluator.js", ".evaluator.mjs"],
865
+ testCaseSuffixes: [".test-case.ts", ".test-case.tsx", ".test-case.js", ".test-case.mjs"],
886
866
  excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
887
867
  },
888
868
  artifactDirectory: ".eval-results",
@@ -949,14 +929,15 @@ function getJitiLoader() {
949
929
  }
950
930
  const createJiti2 = jitiModule.createJiti ?? jitiModule.default;
951
931
  if (typeof createJiti2 !== "function") {
952
- throw new Error(
953
- "Failed to initialize jiti for m4trix eval config loading."
954
- );
932
+ throw new Error("Failed to initialize jiti for m4trix eval config loading.");
955
933
  }
956
- cachedLoader = createJiti2(import.meta.url, {
957
- interopDefault: true,
958
- moduleCache: true
959
- });
934
+ cachedLoader = createJiti2(
935
+ import.meta.url,
936
+ {
937
+ interopDefault: true,
938
+ moduleCache: true
939
+ }
940
+ );
960
941
  return cachedLoader;
961
942
  }
962
943
  function resolveConfigModuleExport(loadedModule) {
@@ -1060,9 +1041,7 @@ async function loadModuleExports(filePath) {
1060
1041
  }
1061
1042
  async function collectDatasetsFromFiles(config) {
1062
1043
  const files = await walkDirectory(config.rootDir, config.excludeDirectories);
1063
- const matched = files.filter(
1064
- (filePath) => hasOneSuffix(filePath, config.datasetSuffixes)
1065
- );
1044
+ const matched = files.filter((filePath) => hasOneSuffix(filePath, config.datasetSuffixes));
1066
1045
  const found = await Promise.all(
1067
1046
  matched.map(async (absolutePath) => {
1068
1047
  const exports = await loadModuleExports(absolutePath);
@@ -1079,9 +1058,7 @@ async function collectDatasetsFromFiles(config) {
1079
1058
  }
1080
1059
  async function collectEvaluatorsFromFiles(config) {
1081
1060
  const files = await walkDirectory(config.rootDir, config.excludeDirectories);
1082
- const matched = files.filter(
1083
- (filePath) => hasOneSuffix(filePath, config.evaluatorSuffixes)
1084
- );
1061
+ const matched = files.filter((filePath) => hasOneSuffix(filePath, config.evaluatorSuffixes));
1085
1062
  const found = await Promise.all(
1086
1063
  matched.map(async (absolutePath) => {
1087
1064
  const exports = await loadModuleExports(absolutePath);
@@ -1098,9 +1075,7 @@ async function collectEvaluatorsFromFiles(config) {
1098
1075
  }
1099
1076
  async function collectTestCasesFromFiles(config) {
1100
1077
  const files = await walkDirectory(config.rootDir, config.excludeDirectories);
1101
- const matched = files.filter(
1102
- (filePath) => hasOneSuffix(filePath, config.testCaseSuffixes)
1103
- );
1078
+ const matched = files.filter((filePath) => hasOneSuffix(filePath, config.testCaseSuffixes));
1104
1079
  const found = await Promise.all(
1105
1080
  matched.map(async (absolutePath) => {
1106
1081
  const exports = await loadModuleExports(absolutePath);
@@ -1208,20 +1183,14 @@ function nowIsoForFile() {
1208
1183
  return (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
1209
1184
  }
1210
1185
  function createArtifactPath(artifactDirectory, datasetId, runId) {
1211
- return join(
1212
- artifactDirectory,
1213
- `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
1214
- );
1186
+ return join(artifactDirectory, `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`);
1215
1187
  }
1216
1188
  function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
1217
1189
  const { testCaseItem, rerunIndex, rerunTotal } = unit;
1218
1190
  return Effect.gen(function* () {
1219
1191
  const evaluatorRunId = `run-${randomUUID()}`;
1220
1192
  const started = Date.now();
1221
- const startedEvaluations = yield* Ref.modify(startedRef, (n) => [
1222
- n + 1,
1223
- n + 1
1224
- ]);
1193
+ const startedEvaluations = yield* Ref.modify(startedRef, (n) => [n + 1, n + 1]);
1225
1194
  yield* publishEvent({
1226
1195
  type: "TestCaseStarted",
1227
1196
  runId: task.runId,
@@ -1254,9 +1223,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1254
1223
  return error;
1255
1224
  };
1256
1225
  try {
1257
- const ctx = yield* Effect.promise(
1258
- () => Promise.resolve(evaluator.resolveContext())
1259
- );
1226
+ const ctx = yield* Effect.promise(() => Promise.resolve(evaluator.resolveContext()));
1260
1227
  const result = yield* Effect.promise(
1261
1228
  () => Promise.resolve().then(
1262
1229
  () => evaluateFn({
@@ -1311,10 +1278,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1311
1278
  }
1312
1279
  }
1313
1280
  const rerunPassedThis = evaluatorScores.every((s) => s.passed);
1314
- const completedEvaluations = yield* Ref.modify(completedRef, (n) => [
1315
- n + 1,
1316
- n + 1
1317
- ]);
1281
+ const completedEvaluations = yield* Ref.modify(completedRef, (n) => [n + 1, n + 1]);
1318
1282
  const progressEvent = {
1319
1283
  type: "TestCaseProgress",
1320
1284
  runId: task.runId,
@@ -1363,10 +1327,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1363
1327
  } else {
1364
1328
  yield* Ref.update(failedRef, (n) => n + 1);
1365
1329
  }
1366
- const [passed, failed] = yield* Effect.all([
1367
- Ref.get(passedRef),
1368
- Ref.get(failedRef)
1369
- ]);
1330
+ const [passed, failed] = yield* Effect.all([Ref.get(passedRef), Ref.get(failedRef)]);
1370
1331
  yield* updateSnapshot(task.runId, (snapshot) => ({
1371
1332
  ...snapshot,
1372
1333
  passedTestCases: passed,
@@ -1686,15 +1647,11 @@ var EffectRunner = class {
1686
1647
  this.persistenceQueue = Effect.runSync(
1687
1648
  Queue.unbounded()
1688
1649
  );
1689
- this.snapshotsRef = Effect.runSync(
1690
- Ref.make(/* @__PURE__ */ new Map())
1691
- );
1650
+ this.snapshotsRef = Effect.runSync(Ref.make(/* @__PURE__ */ new Map()));
1692
1651
  this.listeners = /* @__PURE__ */ new Set();
1693
1652
  this.datasetsById = /* @__PURE__ */ new Map();
1694
1653
  this.evaluatorsById = /* @__PURE__ */ new Map();
1695
- this.schedulerFiber = Effect.runFork(
1696
- this.createSchedulerEffect()
1697
- );
1654
+ this.schedulerFiber = Effect.runFork(this.createSchedulerEffect());
1698
1655
  this.persistenceFiber = Effect.runFork(
1699
1656
  createPersistenceWorker(this.persistenceQueue)
1700
1657
  );
@@ -1841,9 +1798,9 @@ var EffectRunner = class {
1841
1798
  return Effect.runSync(Ref.get(this.snapshotsRef)).get(runId);
1842
1799
  }
1843
1800
  getAllRunSnapshots() {
1844
- return Array.from(
1845
- Effect.runSync(Ref.get(this.snapshotsRef)).values()
1846
- ).sort((a, b) => b.queuedAt - a.queuedAt);
1801
+ return Array.from(Effect.runSync(Ref.get(this.snapshotsRef)).values()).sort(
1802
+ (a, b) => b.queuedAt - a.queuedAt
1803
+ );
1847
1804
  }
1848
1805
  async loadRunSnapshotsFromArtifacts() {
1849
1806
  return loadRunSnapshotsFromArtifacts(this.config);