@m4trix/evals 0.24.0 → 0.25.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +147 -260
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +133 -246
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +143 -291
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +130 -278
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +49 -92
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +49 -92
- package/dist/index.js.map +1 -1
- package/package.json +3 -5
package/dist/index.js
CHANGED
|
@@ -159,9 +159,7 @@ var data_mock_default = {
|
|
|
159
159
|
{ name: "contract_match", score: 100 },
|
|
160
160
|
{ name: "arg_validity", score: 100 }
|
|
161
161
|
],
|
|
162
|
-
checks: [
|
|
163
|
-
{ name: "tool_calls", passed: true, detail: "0 unexpected" }
|
|
164
|
-
],
|
|
162
|
+
checks: [{ name: "tool_calls", passed: true, detail: "0 unexpected" }],
|
|
165
163
|
failures: [],
|
|
166
164
|
meta: {
|
|
167
165
|
model: "gpt-4o-mini",
|
|
@@ -184,9 +182,21 @@ var data_mock_default = {
|
|
|
184
182
|
}
|
|
185
183
|
],
|
|
186
184
|
evaluators: [
|
|
187
|
-
{
|
|
188
|
-
|
|
189
|
-
|
|
185
|
+
{
|
|
186
|
+
id: "json-schema-validator",
|
|
187
|
+
name: "JSON Schema Validator",
|
|
188
|
+
configPreview: "strict=true"
|
|
189
|
+
},
|
|
190
|
+
{
|
|
191
|
+
id: "tool-call-contract-checker",
|
|
192
|
+
name: "Tool-call Contract Checker",
|
|
193
|
+
configPreview: "unexpectedCalls=error"
|
|
194
|
+
},
|
|
195
|
+
{
|
|
196
|
+
id: "rubric-judge",
|
|
197
|
+
name: "Rubric Judge (LLM)",
|
|
198
|
+
configPreview: "model=gpt-4o-mini; scale=0-100"
|
|
199
|
+
},
|
|
190
200
|
{ id: "pii-leak-detector", name: "PII Leak Detector", configPreview: "redact=false" }
|
|
191
201
|
]
|
|
192
202
|
};
|
|
@@ -266,9 +276,7 @@ async function loadRunnerData(runner) {
|
|
|
266
276
|
const memSnapshots = runner.getAllRunSnapshots();
|
|
267
277
|
const seen = new Set(memSnapshots.map((s) => s.runId));
|
|
268
278
|
const fromDisk = diskSnapshots.filter((s) => !seen.has(s.runId));
|
|
269
|
-
const snapshots = [...memSnapshots, ...fromDisk].sort(
|
|
270
|
-
(a, b) => b.queuedAt - a.queuedAt
|
|
271
|
-
);
|
|
279
|
+
const snapshots = [...memSnapshots, ...fromDisk].sort((a, b) => b.queuedAt - a.queuedAt);
|
|
272
280
|
if (datasets.length === 0 && evaluators.length === 0) {
|
|
273
281
|
return loadMockData();
|
|
274
282
|
}
|
|
@@ -422,9 +430,7 @@ var Evaluator = class _Evaluator {
|
|
|
422
430
|
return this._config.passCriterion;
|
|
423
431
|
}
|
|
424
432
|
async resolveContext() {
|
|
425
|
-
const parts = await Promise.all(
|
|
426
|
-
this._config.middlewares.map((mw) => mw.resolve())
|
|
427
|
-
);
|
|
433
|
+
const parts = await Promise.all(this._config.middlewares.map((mw) => mw.resolve()));
|
|
428
434
|
return Object.assign({}, ...parts);
|
|
429
435
|
}
|
|
430
436
|
};
|
|
@@ -528,10 +534,7 @@ var ScoreAggregate = {
|
|
|
528
534
|
const count = values.length || 1;
|
|
529
535
|
const result = {};
|
|
530
536
|
for (const field of fields) {
|
|
531
|
-
result[field] = values.reduce(
|
|
532
|
-
(s, v) => s + (v[field] ?? 0),
|
|
533
|
-
0
|
|
534
|
-
) / count;
|
|
537
|
+
result[field] = values.reduce((s, v) => s + (v[field] ?? 0), 0) / count;
|
|
535
538
|
}
|
|
536
539
|
return result;
|
|
537
540
|
};
|
|
@@ -565,13 +568,10 @@ var ScoreAggregate = {
|
|
|
565
568
|
(s, v) => s + (v[valueField] ?? 0),
|
|
566
569
|
0
|
|
567
570
|
);
|
|
568
|
-
const sumSq = values.reduce(
|
|
569
|
-
(s, v) => {
|
|
570
|
-
const value = v[valueField] ?? 0;
|
|
571
|
-
return s + value * value;
|
|
572
|
-
},
|
|
573
|
-
0
|
|
574
|
-
);
|
|
571
|
+
const sumSq = values.reduce((s, v) => {
|
|
572
|
+
const value = v[valueField] ?? 0;
|
|
573
|
+
return s + value * value;
|
|
574
|
+
}, 0);
|
|
575
575
|
const mean = sum / count;
|
|
576
576
|
const variance = (sumSq - count * mean * mean) / (count - 1);
|
|
577
577
|
stdDev = variance > 0 ? Math.sqrt(variance) : 0;
|
|
@@ -766,16 +766,8 @@ function createDiffString(expected, actual, diffOptions) {
|
|
|
766
766
|
const expectedProcessed = preprocessForDiff(expected, diffOptions);
|
|
767
767
|
const actualProcessed = preprocessForDiff(actual, diffOptions);
|
|
768
768
|
if (diffOptions?.keysOnly) {
|
|
769
|
-
const expectedKeys = JSON.stringify(
|
|
770
|
-
extractKeys(expectedProcessed),
|
|
771
|
-
null,
|
|
772
|
-
2
|
|
773
|
-
);
|
|
774
|
-
const actualKeys = JSON.stringify(
|
|
775
|
-
extractKeys(actualProcessed),
|
|
776
|
-
null,
|
|
777
|
-
2
|
|
778
|
-
);
|
|
769
|
+
const expectedKeys = JSON.stringify(extractKeys(expectedProcessed), null, 2);
|
|
770
|
+
const actualKeys = JSON.stringify(extractKeys(actualProcessed), null, 2);
|
|
779
771
|
const parts2 = diffLines(expectedKeys, actualKeys);
|
|
780
772
|
return formatDiffParts(parts2);
|
|
781
773
|
}
|
|
@@ -786,9 +778,7 @@ function createDiffString(expected, actual, diffOptions) {
|
|
|
786
778
|
}
|
|
787
779
|
const parts = diffLines(expectedStr, actualStr);
|
|
788
780
|
if (diffOptions?.outputNewOnly) {
|
|
789
|
-
const filtered = parts.filter(
|
|
790
|
-
(p) => p.added === true
|
|
791
|
-
);
|
|
781
|
+
const filtered = parts.filter((p) => p.added === true);
|
|
792
782
|
return formatDiffParts(filtered);
|
|
793
783
|
}
|
|
794
784
|
return formatDiffParts(parts);
|
|
@@ -871,18 +861,8 @@ var defaultRunnerConfig = {
|
|
|
871
861
|
discovery: {
|
|
872
862
|
rootDir: process.cwd(),
|
|
873
863
|
datasetSuffixes: [".dataset.ts", ".dataset.tsx", ".dataset.js", ".dataset.mjs"],
|
|
874
|
-
evaluatorSuffixes: [
|
|
875
|
-
".evaluator.ts",
|
|
876
|
-
".evaluator.tsx",
|
|
877
|
-
".evaluator.js",
|
|
878
|
-
".evaluator.mjs"
|
|
879
|
-
],
|
|
880
|
-
testCaseSuffixes: [
|
|
881
|
-
".test-case.ts",
|
|
882
|
-
".test-case.tsx",
|
|
883
|
-
".test-case.js",
|
|
884
|
-
".test-case.mjs"
|
|
885
|
-
],
|
|
864
|
+
evaluatorSuffixes: [".evaluator.ts", ".evaluator.tsx", ".evaluator.js", ".evaluator.mjs"],
|
|
865
|
+
testCaseSuffixes: [".test-case.ts", ".test-case.tsx", ".test-case.js", ".test-case.mjs"],
|
|
886
866
|
excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
|
|
887
867
|
},
|
|
888
868
|
artifactDirectory: ".eval-results",
|
|
@@ -949,14 +929,15 @@ function getJitiLoader() {
|
|
|
949
929
|
}
|
|
950
930
|
const createJiti2 = jitiModule.createJiti ?? jitiModule.default;
|
|
951
931
|
if (typeof createJiti2 !== "function") {
|
|
952
|
-
throw new Error(
|
|
953
|
-
"Failed to initialize jiti for m4trix eval config loading."
|
|
954
|
-
);
|
|
932
|
+
throw new Error("Failed to initialize jiti for m4trix eval config loading.");
|
|
955
933
|
}
|
|
956
|
-
cachedLoader = createJiti2(
|
|
957
|
-
|
|
958
|
-
|
|
959
|
-
|
|
934
|
+
cachedLoader = createJiti2(
|
|
935
|
+
import.meta.url,
|
|
936
|
+
{
|
|
937
|
+
interopDefault: true,
|
|
938
|
+
moduleCache: true
|
|
939
|
+
}
|
|
940
|
+
);
|
|
960
941
|
return cachedLoader;
|
|
961
942
|
}
|
|
962
943
|
function resolveConfigModuleExport(loadedModule) {
|
|
@@ -1060,9 +1041,7 @@ async function loadModuleExports(filePath) {
|
|
|
1060
1041
|
}
|
|
1061
1042
|
async function collectDatasetsFromFiles(config) {
|
|
1062
1043
|
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
1063
|
-
const matched = files.filter(
|
|
1064
|
-
(filePath) => hasOneSuffix(filePath, config.datasetSuffixes)
|
|
1065
|
-
);
|
|
1044
|
+
const matched = files.filter((filePath) => hasOneSuffix(filePath, config.datasetSuffixes));
|
|
1066
1045
|
const found = await Promise.all(
|
|
1067
1046
|
matched.map(async (absolutePath) => {
|
|
1068
1047
|
const exports = await loadModuleExports(absolutePath);
|
|
@@ -1079,9 +1058,7 @@ async function collectDatasetsFromFiles(config) {
|
|
|
1079
1058
|
}
|
|
1080
1059
|
async function collectEvaluatorsFromFiles(config) {
|
|
1081
1060
|
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
1082
|
-
const matched = files.filter(
|
|
1083
|
-
(filePath) => hasOneSuffix(filePath, config.evaluatorSuffixes)
|
|
1084
|
-
);
|
|
1061
|
+
const matched = files.filter((filePath) => hasOneSuffix(filePath, config.evaluatorSuffixes));
|
|
1085
1062
|
const found = await Promise.all(
|
|
1086
1063
|
matched.map(async (absolutePath) => {
|
|
1087
1064
|
const exports = await loadModuleExports(absolutePath);
|
|
@@ -1098,9 +1075,7 @@ async function collectEvaluatorsFromFiles(config) {
|
|
|
1098
1075
|
}
|
|
1099
1076
|
async function collectTestCasesFromFiles(config) {
|
|
1100
1077
|
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
1101
|
-
const matched = files.filter(
|
|
1102
|
-
(filePath) => hasOneSuffix(filePath, config.testCaseSuffixes)
|
|
1103
|
-
);
|
|
1078
|
+
const matched = files.filter((filePath) => hasOneSuffix(filePath, config.testCaseSuffixes));
|
|
1104
1079
|
const found = await Promise.all(
|
|
1105
1080
|
matched.map(async (absolutePath) => {
|
|
1106
1081
|
const exports = await loadModuleExports(absolutePath);
|
|
@@ -1208,20 +1183,14 @@ function nowIsoForFile() {
|
|
|
1208
1183
|
return (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
1209
1184
|
}
|
|
1210
1185
|
function createArtifactPath(artifactDirectory, datasetId, runId) {
|
|
1211
|
-
return join(
|
|
1212
|
-
artifactDirectory,
|
|
1213
|
-
`${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
|
|
1214
|
-
);
|
|
1186
|
+
return join(artifactDirectory, `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`);
|
|
1215
1187
|
}
|
|
1216
1188
|
function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
|
|
1217
1189
|
const { testCaseItem, rerunIndex, rerunTotal } = unit;
|
|
1218
1190
|
return Effect.gen(function* () {
|
|
1219
1191
|
const evaluatorRunId = `run-${randomUUID()}`;
|
|
1220
1192
|
const started = Date.now();
|
|
1221
|
-
const startedEvaluations = yield* Ref.modify(startedRef, (n) => [
|
|
1222
|
-
n + 1,
|
|
1223
|
-
n + 1
|
|
1224
|
-
]);
|
|
1193
|
+
const startedEvaluations = yield* Ref.modify(startedRef, (n) => [n + 1, n + 1]);
|
|
1225
1194
|
yield* publishEvent({
|
|
1226
1195
|
type: "TestCaseStarted",
|
|
1227
1196
|
runId: task.runId,
|
|
@@ -1254,9 +1223,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1254
1223
|
return error;
|
|
1255
1224
|
};
|
|
1256
1225
|
try {
|
|
1257
|
-
const ctx = yield* Effect.promise(
|
|
1258
|
-
() => Promise.resolve(evaluator.resolveContext())
|
|
1259
|
-
);
|
|
1226
|
+
const ctx = yield* Effect.promise(() => Promise.resolve(evaluator.resolveContext()));
|
|
1260
1227
|
const result = yield* Effect.promise(
|
|
1261
1228
|
() => Promise.resolve().then(
|
|
1262
1229
|
() => evaluateFn({
|
|
@@ -1311,10 +1278,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1311
1278
|
}
|
|
1312
1279
|
}
|
|
1313
1280
|
const rerunPassedThis = evaluatorScores.every((s) => s.passed);
|
|
1314
|
-
const completedEvaluations = yield* Ref.modify(completedRef, (n) => [
|
|
1315
|
-
n + 1,
|
|
1316
|
-
n + 1
|
|
1317
|
-
]);
|
|
1281
|
+
const completedEvaluations = yield* Ref.modify(completedRef, (n) => [n + 1, n + 1]);
|
|
1318
1282
|
const progressEvent = {
|
|
1319
1283
|
type: "TestCaseProgress",
|
|
1320
1284
|
runId: task.runId,
|
|
@@ -1363,10 +1327,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1363
1327
|
} else {
|
|
1364
1328
|
yield* Ref.update(failedRef, (n) => n + 1);
|
|
1365
1329
|
}
|
|
1366
|
-
const [passed, failed] = yield* Effect.all([
|
|
1367
|
-
Ref.get(passedRef),
|
|
1368
|
-
Ref.get(failedRef)
|
|
1369
|
-
]);
|
|
1330
|
+
const [passed, failed] = yield* Effect.all([Ref.get(passedRef), Ref.get(failedRef)]);
|
|
1370
1331
|
yield* updateSnapshot(task.runId, (snapshot) => ({
|
|
1371
1332
|
...snapshot,
|
|
1372
1333
|
passedTestCases: passed,
|
|
@@ -1686,15 +1647,11 @@ var EffectRunner = class {
|
|
|
1686
1647
|
this.persistenceQueue = Effect.runSync(
|
|
1687
1648
|
Queue.unbounded()
|
|
1688
1649
|
);
|
|
1689
|
-
this.snapshotsRef = Effect.runSync(
|
|
1690
|
-
Ref.make(/* @__PURE__ */ new Map())
|
|
1691
|
-
);
|
|
1650
|
+
this.snapshotsRef = Effect.runSync(Ref.make(/* @__PURE__ */ new Map()));
|
|
1692
1651
|
this.listeners = /* @__PURE__ */ new Set();
|
|
1693
1652
|
this.datasetsById = /* @__PURE__ */ new Map();
|
|
1694
1653
|
this.evaluatorsById = /* @__PURE__ */ new Map();
|
|
1695
|
-
this.schedulerFiber = Effect.runFork(
|
|
1696
|
-
this.createSchedulerEffect()
|
|
1697
|
-
);
|
|
1654
|
+
this.schedulerFiber = Effect.runFork(this.createSchedulerEffect());
|
|
1698
1655
|
this.persistenceFiber = Effect.runFork(
|
|
1699
1656
|
createPersistenceWorker(this.persistenceQueue)
|
|
1700
1657
|
);
|
|
@@ -1841,9 +1798,9 @@ var EffectRunner = class {
|
|
|
1841
1798
|
return Effect.runSync(Ref.get(this.snapshotsRef)).get(runId);
|
|
1842
1799
|
}
|
|
1843
1800
|
getAllRunSnapshots() {
|
|
1844
|
-
return Array.from(
|
|
1845
|
-
|
|
1846
|
-
)
|
|
1801
|
+
return Array.from(Effect.runSync(Ref.get(this.snapshotsRef)).values()).sort(
|
|
1802
|
+
(a, b) => b.queuedAt - a.queuedAt
|
|
1803
|
+
);
|
|
1847
1804
|
}
|
|
1848
1805
|
async loadRunSnapshotsFromArtifacts() {
|
|
1849
1806
|
return loadRunSnapshotsFromArtifacts(this.config);
|