@m4trix/evals 0.25.0 → 0.25.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +147 -260
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +133 -246
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +143 -291
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +130 -278
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +49 -92
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +49 -92
- package/dist/index.js.map +1 -1
- package/package.json +3 -5
package/dist/index.cjs
CHANGED
|
@@ -184,9 +184,7 @@ var data_mock_default = {
|
|
|
184
184
|
{ name: "contract_match", score: 100 },
|
|
185
185
|
{ name: "arg_validity", score: 100 }
|
|
186
186
|
],
|
|
187
|
-
checks: [
|
|
188
|
-
{ name: "tool_calls", passed: true, detail: "0 unexpected" }
|
|
189
|
-
],
|
|
187
|
+
checks: [{ name: "tool_calls", passed: true, detail: "0 unexpected" }],
|
|
190
188
|
failures: [],
|
|
191
189
|
meta: {
|
|
192
190
|
model: "gpt-4o-mini",
|
|
@@ -209,9 +207,21 @@ var data_mock_default = {
|
|
|
209
207
|
}
|
|
210
208
|
],
|
|
211
209
|
evaluators: [
|
|
212
|
-
{
|
|
213
|
-
|
|
214
|
-
|
|
210
|
+
{
|
|
211
|
+
id: "json-schema-validator",
|
|
212
|
+
name: "JSON Schema Validator",
|
|
213
|
+
configPreview: "strict=true"
|
|
214
|
+
},
|
|
215
|
+
{
|
|
216
|
+
id: "tool-call-contract-checker",
|
|
217
|
+
name: "Tool-call Contract Checker",
|
|
218
|
+
configPreview: "unexpectedCalls=error"
|
|
219
|
+
},
|
|
220
|
+
{
|
|
221
|
+
id: "rubric-judge",
|
|
222
|
+
name: "Rubric Judge (LLM)",
|
|
223
|
+
configPreview: "model=gpt-4o-mini; scale=0-100"
|
|
224
|
+
},
|
|
215
225
|
{ id: "pii-leak-detector", name: "PII Leak Detector", configPreview: "redact=false" }
|
|
216
226
|
]
|
|
217
227
|
};
|
|
@@ -291,9 +301,7 @@ async function loadRunnerData(runner) {
|
|
|
291
301
|
const memSnapshots = runner.getAllRunSnapshots();
|
|
292
302
|
const seen = new Set(memSnapshots.map((s) => s.runId));
|
|
293
303
|
const fromDisk = diskSnapshots.filter((s) => !seen.has(s.runId));
|
|
294
|
-
const snapshots = [...memSnapshots, ...fromDisk].sort(
|
|
295
|
-
(a, b) => b.queuedAt - a.queuedAt
|
|
296
|
-
);
|
|
304
|
+
const snapshots = [...memSnapshots, ...fromDisk].sort((a, b) => b.queuedAt - a.queuedAt);
|
|
297
305
|
if (datasets.length === 0 && evaluators.length === 0) {
|
|
298
306
|
return loadMockData();
|
|
299
307
|
}
|
|
@@ -447,9 +455,7 @@ var Evaluator = class _Evaluator {
|
|
|
447
455
|
return this._config.passCriterion;
|
|
448
456
|
}
|
|
449
457
|
async resolveContext() {
|
|
450
|
-
const parts = await Promise.all(
|
|
451
|
-
this._config.middlewares.map((mw) => mw.resolve())
|
|
452
|
-
);
|
|
458
|
+
const parts = await Promise.all(this._config.middlewares.map((mw) => mw.resolve()));
|
|
453
459
|
return Object.assign({}, ...parts);
|
|
454
460
|
}
|
|
455
461
|
};
|
|
@@ -553,10 +559,7 @@ var ScoreAggregate = {
|
|
|
553
559
|
const count = values.length || 1;
|
|
554
560
|
const result = {};
|
|
555
561
|
for (const field of fields) {
|
|
556
|
-
result[field] = values.reduce(
|
|
557
|
-
(s, v) => s + (v[field] ?? 0),
|
|
558
|
-
0
|
|
559
|
-
) / count;
|
|
562
|
+
result[field] = values.reduce((s, v) => s + (v[field] ?? 0), 0) / count;
|
|
560
563
|
}
|
|
561
564
|
return result;
|
|
562
565
|
};
|
|
@@ -590,13 +593,10 @@ var ScoreAggregate = {
|
|
|
590
593
|
(s, v) => s + (v[valueField] ?? 0),
|
|
591
594
|
0
|
|
592
595
|
);
|
|
593
|
-
const sumSq = values.reduce(
|
|
594
|
-
(s, v) => {
|
|
595
|
-
const value = v[valueField] ?? 0;
|
|
596
|
-
return s + value * value;
|
|
597
|
-
},
|
|
598
|
-
0
|
|
599
|
-
);
|
|
596
|
+
const sumSq = values.reduce((s, v) => {
|
|
597
|
+
const value = v[valueField] ?? 0;
|
|
598
|
+
return s + value * value;
|
|
599
|
+
}, 0);
|
|
600
600
|
const mean = sum / count;
|
|
601
601
|
const variance = (sumSq - count * mean * mean) / (count - 1);
|
|
602
602
|
stdDev = variance > 0 ? Math.sqrt(variance) : 0;
|
|
@@ -791,16 +791,8 @@ function createDiffString(expected, actual, diffOptions) {
|
|
|
791
791
|
const expectedProcessed = preprocessForDiff(expected, diffOptions);
|
|
792
792
|
const actualProcessed = preprocessForDiff(actual, diffOptions);
|
|
793
793
|
if (diffOptions?.keysOnly) {
|
|
794
|
-
const expectedKeys = JSON.stringify(
|
|
795
|
-
extractKeys(expectedProcessed),
|
|
796
|
-
null,
|
|
797
|
-
2
|
|
798
|
-
);
|
|
799
|
-
const actualKeys = JSON.stringify(
|
|
800
|
-
extractKeys(actualProcessed),
|
|
801
|
-
null,
|
|
802
|
-
2
|
|
803
|
-
);
|
|
794
|
+
const expectedKeys = JSON.stringify(extractKeys(expectedProcessed), null, 2);
|
|
795
|
+
const actualKeys = JSON.stringify(extractKeys(actualProcessed), null, 2);
|
|
804
796
|
const parts2 = diff.diffLines(expectedKeys, actualKeys);
|
|
805
797
|
return formatDiffParts(parts2);
|
|
806
798
|
}
|
|
@@ -811,9 +803,7 @@ function createDiffString(expected, actual, diffOptions) {
|
|
|
811
803
|
}
|
|
812
804
|
const parts = diff.diffLines(expectedStr, actualStr);
|
|
813
805
|
if (diffOptions?.outputNewOnly) {
|
|
814
|
-
const filtered = parts.filter(
|
|
815
|
-
(p) => p.added === true
|
|
816
|
-
);
|
|
806
|
+
const filtered = parts.filter((p) => p.added === true);
|
|
817
807
|
return formatDiffParts(filtered);
|
|
818
808
|
}
|
|
819
809
|
return formatDiffParts(parts);
|
|
@@ -896,18 +886,8 @@ var defaultRunnerConfig = {
|
|
|
896
886
|
discovery: {
|
|
897
887
|
rootDir: process.cwd(),
|
|
898
888
|
datasetSuffixes: [".dataset.ts", ".dataset.tsx", ".dataset.js", ".dataset.mjs"],
|
|
899
|
-
evaluatorSuffixes: [
|
|
900
|
-
".evaluator.ts",
|
|
901
|
-
".evaluator.tsx",
|
|
902
|
-
".evaluator.js",
|
|
903
|
-
".evaluator.mjs"
|
|
904
|
-
],
|
|
905
|
-
testCaseSuffixes: [
|
|
906
|
-
".test-case.ts",
|
|
907
|
-
".test-case.tsx",
|
|
908
|
-
".test-case.js",
|
|
909
|
-
".test-case.mjs"
|
|
910
|
-
],
|
|
889
|
+
evaluatorSuffixes: [".evaluator.ts", ".evaluator.tsx", ".evaluator.js", ".evaluator.mjs"],
|
|
890
|
+
testCaseSuffixes: [".test-case.ts", ".test-case.tsx", ".test-case.js", ".test-case.mjs"],
|
|
911
891
|
excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
|
|
912
892
|
},
|
|
913
893
|
artifactDirectory: ".eval-results",
|
|
@@ -974,14 +954,15 @@ function getJitiLoader() {
|
|
|
974
954
|
}
|
|
975
955
|
const createJiti2 = jitiModule__namespace.createJiti ?? jitiModule__namespace.default;
|
|
976
956
|
if (typeof createJiti2 !== "function") {
|
|
977
|
-
throw new Error(
|
|
978
|
-
"Failed to initialize jiti for m4trix eval config loading."
|
|
979
|
-
);
|
|
957
|
+
throw new Error("Failed to initialize jiti for m4trix eval config loading.");
|
|
980
958
|
}
|
|
981
|
-
cachedLoader = createJiti2(
|
|
982
|
-
|
|
983
|
-
|
|
984
|
-
|
|
959
|
+
cachedLoader = createJiti2(
|
|
960
|
+
(typeof document === 'undefined' ? require('u' + 'rl').pathToFileURL(__filename).href : (_documentCurrentScript && _documentCurrentScript.tagName.toUpperCase() === 'SCRIPT' && _documentCurrentScript.src || new URL('out.js', document.baseURI).href)),
|
|
961
|
+
{
|
|
962
|
+
interopDefault: true,
|
|
963
|
+
moduleCache: true
|
|
964
|
+
}
|
|
965
|
+
);
|
|
985
966
|
return cachedLoader;
|
|
986
967
|
}
|
|
987
968
|
function resolveConfigModuleExport(loadedModule) {
|
|
@@ -1085,9 +1066,7 @@ async function loadModuleExports(filePath) {
|
|
|
1085
1066
|
}
|
|
1086
1067
|
async function collectDatasetsFromFiles(config) {
|
|
1087
1068
|
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
1088
|
-
const matched = files.filter(
|
|
1089
|
-
(filePath) => hasOneSuffix(filePath, config.datasetSuffixes)
|
|
1090
|
-
);
|
|
1069
|
+
const matched = files.filter((filePath) => hasOneSuffix(filePath, config.datasetSuffixes));
|
|
1091
1070
|
const found = await Promise.all(
|
|
1092
1071
|
matched.map(async (absolutePath) => {
|
|
1093
1072
|
const exports = await loadModuleExports(absolutePath);
|
|
@@ -1104,9 +1083,7 @@ async function collectDatasetsFromFiles(config) {
|
|
|
1104
1083
|
}
|
|
1105
1084
|
async function collectEvaluatorsFromFiles(config) {
|
|
1106
1085
|
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
1107
|
-
const matched = files.filter(
|
|
1108
|
-
(filePath) => hasOneSuffix(filePath, config.evaluatorSuffixes)
|
|
1109
|
-
);
|
|
1086
|
+
const matched = files.filter((filePath) => hasOneSuffix(filePath, config.evaluatorSuffixes));
|
|
1110
1087
|
const found = await Promise.all(
|
|
1111
1088
|
matched.map(async (absolutePath) => {
|
|
1112
1089
|
const exports = await loadModuleExports(absolutePath);
|
|
@@ -1123,9 +1100,7 @@ async function collectEvaluatorsFromFiles(config) {
|
|
|
1123
1100
|
}
|
|
1124
1101
|
async function collectTestCasesFromFiles(config) {
|
|
1125
1102
|
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
1126
|
-
const matched = files.filter(
|
|
1127
|
-
(filePath) => hasOneSuffix(filePath, config.testCaseSuffixes)
|
|
1128
|
-
);
|
|
1103
|
+
const matched = files.filter((filePath) => hasOneSuffix(filePath, config.testCaseSuffixes));
|
|
1129
1104
|
const found = await Promise.all(
|
|
1130
1105
|
matched.map(async (absolutePath) => {
|
|
1131
1106
|
const exports = await loadModuleExports(absolutePath);
|
|
@@ -1233,20 +1208,14 @@ function nowIsoForFile() {
|
|
|
1233
1208
|
return (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
1234
1209
|
}
|
|
1235
1210
|
function createArtifactPath(artifactDirectory, datasetId, runId) {
|
|
1236
|
-
return path.join(
|
|
1237
|
-
artifactDirectory,
|
|
1238
|
-
`${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
|
|
1239
|
-
);
|
|
1211
|
+
return path.join(artifactDirectory, `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`);
|
|
1240
1212
|
}
|
|
1241
1213
|
function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
|
|
1242
1214
|
const { testCaseItem, rerunIndex, rerunTotal } = unit;
|
|
1243
1215
|
return effect.Effect.gen(function* () {
|
|
1244
1216
|
const evaluatorRunId = `run-${crypto.randomUUID()}`;
|
|
1245
1217
|
const started = Date.now();
|
|
1246
|
-
const startedEvaluations = yield* effect.Ref.modify(startedRef, (n) => [
|
|
1247
|
-
n + 1,
|
|
1248
|
-
n + 1
|
|
1249
|
-
]);
|
|
1218
|
+
const startedEvaluations = yield* effect.Ref.modify(startedRef, (n) => [n + 1, n + 1]);
|
|
1250
1219
|
yield* publishEvent({
|
|
1251
1220
|
type: "TestCaseStarted",
|
|
1252
1221
|
runId: task.runId,
|
|
@@ -1279,9 +1248,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1279
1248
|
return error;
|
|
1280
1249
|
};
|
|
1281
1250
|
try {
|
|
1282
|
-
const ctx = yield* effect.Effect.promise(
|
|
1283
|
-
() => Promise.resolve(evaluator.resolveContext())
|
|
1284
|
-
);
|
|
1251
|
+
const ctx = yield* effect.Effect.promise(() => Promise.resolve(evaluator.resolveContext()));
|
|
1285
1252
|
const result = yield* effect.Effect.promise(
|
|
1286
1253
|
() => Promise.resolve().then(
|
|
1287
1254
|
() => evaluateFn({
|
|
@@ -1336,10 +1303,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1336
1303
|
}
|
|
1337
1304
|
}
|
|
1338
1305
|
const rerunPassedThis = evaluatorScores.every((s) => s.passed);
|
|
1339
|
-
const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [
|
|
1340
|
-
n + 1,
|
|
1341
|
-
n + 1
|
|
1342
|
-
]);
|
|
1306
|
+
const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [n + 1, n + 1]);
|
|
1343
1307
|
const progressEvent = {
|
|
1344
1308
|
type: "TestCaseProgress",
|
|
1345
1309
|
runId: task.runId,
|
|
@@ -1388,10 +1352,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1388
1352
|
} else {
|
|
1389
1353
|
yield* effect.Ref.update(failedRef, (n) => n + 1);
|
|
1390
1354
|
}
|
|
1391
|
-
const [passed, failed] = yield* effect.Effect.all([
|
|
1392
|
-
effect.Ref.get(passedRef),
|
|
1393
|
-
effect.Ref.get(failedRef)
|
|
1394
|
-
]);
|
|
1355
|
+
const [passed, failed] = yield* effect.Effect.all([effect.Ref.get(passedRef), effect.Ref.get(failedRef)]);
|
|
1395
1356
|
yield* updateSnapshot(task.runId, (snapshot) => ({
|
|
1396
1357
|
...snapshot,
|
|
1397
1358
|
passedTestCases: passed,
|
|
@@ -1711,15 +1672,11 @@ var EffectRunner = class {
|
|
|
1711
1672
|
this.persistenceQueue = effect.Effect.runSync(
|
|
1712
1673
|
effect.Queue.unbounded()
|
|
1713
1674
|
);
|
|
1714
|
-
this.snapshotsRef = effect.Effect.runSync(
|
|
1715
|
-
effect.Ref.make(/* @__PURE__ */ new Map())
|
|
1716
|
-
);
|
|
1675
|
+
this.snapshotsRef = effect.Effect.runSync(effect.Ref.make(/* @__PURE__ */ new Map()));
|
|
1717
1676
|
this.listeners = /* @__PURE__ */ new Set();
|
|
1718
1677
|
this.datasetsById = /* @__PURE__ */ new Map();
|
|
1719
1678
|
this.evaluatorsById = /* @__PURE__ */ new Map();
|
|
1720
|
-
this.schedulerFiber = effect.Effect.runFork(
|
|
1721
|
-
this.createSchedulerEffect()
|
|
1722
|
-
);
|
|
1679
|
+
this.schedulerFiber = effect.Effect.runFork(this.createSchedulerEffect());
|
|
1723
1680
|
this.persistenceFiber = effect.Effect.runFork(
|
|
1724
1681
|
createPersistenceWorker(this.persistenceQueue)
|
|
1725
1682
|
);
|
|
@@ -1866,9 +1823,9 @@ var EffectRunner = class {
|
|
|
1866
1823
|
return effect.Effect.runSync(effect.Ref.get(this.snapshotsRef)).get(runId);
|
|
1867
1824
|
}
|
|
1868
1825
|
getAllRunSnapshots() {
|
|
1869
|
-
return Array.from(
|
|
1870
|
-
effect.Effect.runSync(effect.Ref.get(this.snapshotsRef)).values()
|
|
1871
|
-
)
|
|
1826
|
+
return Array.from(effect.Effect.runSync(effect.Ref.get(this.snapshotsRef)).values()).sort(
|
|
1827
|
+
(a, b) => b.queuedAt - a.queuedAt
|
|
1828
|
+
);
|
|
1872
1829
|
}
|
|
1873
1830
|
async loadRunSnapshotsFromArtifacts() {
|
|
1874
1831
|
return loadRunSnapshotsFromArtifacts(this.config);
|