@m4trix/evals 0.25.1 → 0.27.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +32 -9
- package/dist/cli-simple.cjs +845 -455
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +846 -456
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +543 -273
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +543 -273
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +948 -545
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +228 -14
- package/dist/index.js +933 -547
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli.cjs
CHANGED
|
@@ -5,16 +5,16 @@ var fullscreenInk = require('fullscreen-ink');
|
|
|
5
5
|
var React = require('react');
|
|
6
6
|
var ink = require('ink');
|
|
7
7
|
var jsxRuntime = require('react/jsx-runtime');
|
|
8
|
-
var path = require('path');
|
|
9
|
-
var inkChart = require('@pppp606/ink-chart');
|
|
10
|
-
var crypto = require('crypto');
|
|
11
8
|
var effect = require('effect');
|
|
9
|
+
var crypto = require('crypto');
|
|
10
|
+
var promises = require('fs/promises');
|
|
11
|
+
var path = require('path');
|
|
12
12
|
var fs = require('fs');
|
|
13
13
|
var jitiModule = require('jiti');
|
|
14
|
-
var promises = require('fs/promises');
|
|
15
14
|
var url = require('url');
|
|
16
15
|
var diff = require('diff');
|
|
17
16
|
var stringify = require('fast-json-stable-stringify');
|
|
17
|
+
var inkChart = require('@pppp606/ink-chart');
|
|
18
18
|
|
|
19
19
|
var _documentCurrentScript = typeof document !== 'undefined' ? document.currentScript : null;
|
|
20
20
|
function _interopDefault (e) { return e && e.__esModule ? e : { default: e }; }
|
|
@@ -264,6 +264,59 @@ function isPrintableCharacter(input) {
|
|
|
264
264
|
function isBackKey(key) {
|
|
265
265
|
return key.backspace || key.delete;
|
|
266
266
|
}
|
|
267
|
+
/** Accepts one or more ASCII letters, digits, underscores or hyphens and nothing else. */
var ENTITY_ID_PATTERN = /^[a-zA-Z0-9_-]+$/;

/**
 * Builds a branded Effect string schema for an entity identifier.
 *
 * The schema pipes `Schema.String` through Effect's `trimmed` filter, a
 * minimum-length-of-one filter, the {@link ENTITY_ID_PATTERN} filter and finally
 * a brand.
 *
 * @param brand Brand name passed to `Schema.brand`.
 * @param label Human-readable entity name interpolated into the error messages.
 * @returns The composed schema.
 */
function makeEntityIdSchema(brand, label) {
  const trimmedFilter = effect.Schema.trimmed();
  const nonEmptyFilter = effect.Schema.minLength(1, {
    // Messages are thunks so they are only rendered when validation fails.
    message: () => `${label} must be non-empty.`
  });
  const allowedCharactersFilter = effect.Schema.pattern(ENTITY_ID_PATTERN, {
    message: () => `${label} may only contain letters, digits, underscores, and hyphens (no spaces). Examples: "my-nightly", "my_nightly", "myNightly".`
  });
  const brandFilter = effect.Schema.brand(brand);
  return effect.Schema.String.pipe(
    trimmedFilter,
    nonEmptyFilter,
    allowedCharactersFilter,
    brandFilter
  );
}

var RunConfigNameSchema = makeEntityIdSchema("RunConfigName", "RunConfig name");
// NOTE(review): the three results below are discarded in this bundle —
// presumably residue of bundling schemas that this entry point never reads.
makeEntityIdSchema("EvaluatorName", "Evaluator name");
makeEntityIdSchema("TestCaseName", "Test case name");
makeEntityIdSchema("DatasetName", "Dataset name");
|
|
284
|
+
/**
 * Trims `raw`, decodes it with `schema`, and returns the decoded value.
 *
 * @param schema Effect schema handed to `Schema.decodeUnknownEither`.
 * @param raw String to validate; it is trimmed before decoding.
 * @param context Prefix placed in front of the formatted parse error.
 * @returns The decoded (right) value.
 * @throws {Error} `"<context>: <formatted parse error>"` when decoding fails.
 */
function validateWithSchema(schema, raw, context) {
  const candidate = raw.trim();
  const outcome = effect.Schema.decodeUnknownEither(
    schema
  )(candidate);
  if (!effect.Either.isLeft(outcome)) {
    return outcome.right;
  }
  const details = effect.ParseResult.TreeFormatter.formatErrorSync(outcome.left);
  throw new Error(`${context}: ${details}`);
}

/**
 * Validates a run-config name against `RunConfigNameSchema`.
 *
 * @param raw Candidate name.
 * @param context Prefix for the error message on failure.
 * @returns The validated name.
 * @throws {Error} When the name does not satisfy the schema.
 */
function validateRunConfigName(raw, context) {
  const validatedName = validateWithSchema(RunConfigNameSchema, raw, context);
  return validatedName;
}
|
|
298
|
+
|
|
299
|
+
// src/evals/dataset.ts
|
|
300
|
+
/**
 * Resolves the label to show for a dataset.
 *
 * Preference order: `getDisplayLabel()` when the dataset has that method,
 * otherwise `getName()` when present, otherwise the empty string.
 *
 * @param dataset Dataset-like object.
 * @returns Whatever the chosen method returns, or `""` when neither exists.
 */
function getDatasetDisplayLabel(dataset) {
  const hasDisplayLabel = typeof dataset.getDisplayLabel === "function";
  if (hasDisplayLabel) {
    return dataset.getDisplayLabel();
  }
  if (typeof dataset.getName !== "function") {
    return "";
  }
  return dataset.getName();
}
|
|
306
|
+
|
|
307
|
+
// src/evals/evaluator.ts
|
|
308
|
+
/**
 * Resolves the label to show for an evaluator.
 *
 * Uses `getDisplayLabel()` when that method exists and returns something other
 * than `undefined`; otherwise falls back to `getName()` when present.
 *
 * @param evaluator Evaluator-like object.
 * @returns The resolved label, or `undefined` when nothing is available.
 */
function getEvaluatorDisplayLabel(evaluator) {
  const explicitLabel = typeof evaluator.getDisplayLabel === "function" ? evaluator.getDisplayLabel() : void 0;
  if (explicitLabel !== void 0) {
    return explicitLabel;
  }
  if (typeof evaluator.getName !== "function") {
    return void 0;
  }
  return evaluator.getName();
}
|
|
317
|
+
/**
 * Returns a fresh array holding the evaluator's tags.
 *
 * @param evaluator Evaluator-like object.
 * @returns A new array spread from `getTags()`, or `[]` when the method is absent.
 */
function getEvaluatorTagList(evaluator) {
  if (typeof evaluator.getTags !== "function") {
    return [];
  }
  const tags = evaluator.getTags();
  return [...tags];
}
|
|
267
320
|
|
|
268
321
|
// src/cli/data.mock.json
|
|
269
322
|
var data_mock_default = {
|
|
@@ -511,7 +564,7 @@ function toEvalDataset(item, snapshots) {
|
|
|
511
564
|
const runs = snapshots.filter((snapshot) => snapshot.datasetId === item.id).sort((a, b) => b.queuedAt - a.queuedAt).map(toEvalRun);
|
|
512
565
|
return {
|
|
513
566
|
id: item.id,
|
|
514
|
-
name: item.dataset
|
|
567
|
+
name: getDatasetDisplayLabel(item.dataset),
|
|
515
568
|
overview: `Discovered from ${item.filePath}`,
|
|
516
569
|
runs
|
|
517
570
|
};
|
|
@@ -519,7 +572,7 @@ function toEvalDataset(item, snapshots) {
|
|
|
519
572
|
function toEvaluatorOption(item) {
|
|
520
573
|
return {
|
|
521
574
|
id: item.id,
|
|
522
|
-
name: item.evaluator
|
|
575
|
+
name: getEvaluatorDisplayLabel(item.evaluator) ?? toSlug(item.id),
|
|
523
576
|
configPreview: `Source: ${item.filePath}`
|
|
524
577
|
};
|
|
525
578
|
}
|
|
@@ -762,6 +815,159 @@ function reduceCliState(state, action) {
|
|
|
762
815
|
}
|
|
763
816
|
return state;
|
|
764
817
|
}
|
|
818
|
+
/**
 * Reads every `.jsonl` artifact in `config.artifactDirectory` and turns each
 * one into a run snapshot.
 *
 * Best-effort by design: an unreadable directory yields `[]`, and any file
 * whose parsing throws is skipped.
 *
 * @param config Object providing `artifactDirectory`; also forwarded to
 *   `parseArtifactToSnapshot`.
 * @returns Snapshots sorted by `queuedAt`, newest first.
 */
async function loadRunSnapshotsFromArtifacts(config) {
  const baseDir = path.resolve(config.artifactDirectory);
  let directoryEntries;
  try {
    directoryEntries = await promises.readdir(baseDir);
  } catch {
    // Directory missing or unreadable: treated as "no runs yet".
    return [];
  }
  const collected = [];
  // Files are processed one at a time, in directory-listing order.
  for (const fileName of directoryEntries) {
    if (!fileName.endsWith(".jsonl")) {
      continue;
    }
    const filePath = path.join(baseDir, fileName);
    let snapshot;
    try {
      snapshot = await parseArtifactToSnapshot(filePath, config);
    } catch {
      // A single broken artifact must not hide the others.
      continue;
    }
    if (snapshot) {
      collected.push(snapshot);
    }
  }
  collected.sort((a, b) => b.queuedAt - a.queuedAt);
  return collected;
}
|
|
840
|
+
/**
 * Builds a run snapshot from one JSONL artifact file.
 *
 * Each non-blank line is parsed as a JSON event. The last `RunQueued`,
 * `RunStarted`, `RunCompleted` and `RunFailed` events seen are retained; lines
 * that fail to parse are ignored. Progress counters come from
 * `aggregateTestCaseProgress` unless a `RunCompleted` event supplies them.
 *
 * @param filePath Path of the artifact; also reported as `artifactPath`.
 * @param _config Unused.
 * @returns The snapshot, or `null` when the file has no non-blank lines or no
 *   `RunQueued` event.
 * @throws Propagates the `readFile` rejection when the file cannot be read.
 */
async function parseArtifactToSnapshot(filePath, _config) {
  const rawContent = await promises.readFile(filePath, "utf8");
  const lines = rawContent.split("\n").filter((line) => line.trim().length > 0);
  if (lines.length === 0) {
    return null;
  }

  let runQueued = null;
  let runCompleted = null;
  let runFailed = null;
  let runStarted = null;
  for (const line of lines) {
    try {
      const event = JSON.parse(line);
      switch (event.type) {
        case "RunQueued":
          runQueued = {
            runId: event.runId,
            datasetId: event.datasetId,
            datasetName: event.datasetName,
            evaluatorIds: event.evaluatorIds,
            totalTestCases: event.totalTestCases ?? 0,
            artifactPath: event.artifactPath ?? filePath,
            ts: event.ts
          };
          break;
        case "RunStarted":
          runStarted = { startedAt: event.startedAt };
          break;
        case "RunCompleted":
          runCompleted = {
            passedTestCases: event.passedTestCases,
            failedTestCases: event.failedTestCases,
            totalTestCases: event.totalTestCases,
            finishedAt: event.finishedAt
          };
          break;
        case "RunFailed":
          runFailed = {
            finishedAt: event.finishedAt,
            errorMessage: event.errorMessage
          };
          break;
        default:
          break;
      }
    } catch {
      // Unparseable line (or a JSON `null`): skip it and keep scanning.
    }
  }
  if (!runQueued) {
    return null;
  }

  // Failure wins over completion, which wins over "started".
  let status = "queued";
  if (runFailed) {
    status = "failed";
  } else if (runCompleted) {
    status = "completed";
  } else if (runStarted) {
    status = "running";
  }

  const progress = aggregateTestCaseProgress(lines);
  return {
    runId: runQueued.runId,
    datasetId: runQueued.datasetId,
    datasetName: runQueued.datasetName,
    evaluatorIds: runQueued.evaluatorIds,
    queuedAt: runQueued.ts ?? 0,
    startedAt: runStarted?.startedAt,
    finishedAt: runCompleted?.finishedAt ?? runFailed?.finishedAt,
    totalTestCases: runQueued.totalTestCases,
    // A completed run reports the queued total rather than the last progress count.
    completedTestCases: runCompleted ? runQueued.totalTestCases : progress.completedTestCases,
    passedTestCases: runCompleted?.passedTestCases ?? progress.passedTestCases,
    failedTestCases: runCompleted?.failedTestCases ?? progress.failedTestCases,
    status,
    artifactPath: filePath,
    errorMessage: runFailed?.errorMessage
  };
}
|
|
911
|
+
/**
 * Derives progress counters from the `TestCaseProgress` events in an artifact.
 *
 * `completedTestCases` is the value carried by the last progress event that
 * has one. A test case counts as passed only if every progress event recorded
 * for its `testCaseId` passed; otherwise it counts as failed.
 *
 * @param lines Raw JSONL lines; lines that fail to parse are ignored.
 * @returns `{ completedTestCases, passedTestCases, failedTestCases }`.
 */
function aggregateTestCaseProgress(lines) {
  const verdictByTestCase = /* @__PURE__ */ new Map();
  let completedTestCases = 0;
  for (const line of lines) {
    try {
      const event = JSON.parse(line);
      if (event.type !== "TestCaseProgress") {
        continue;
      }
      completedTestCases = event.completedTestCases ?? completedTestCases;
      const previous = verdictByTestCase.get(event.testCaseId);
      // First sighting stores the verdict as-is; later ones AND into it.
      const verdict = previous === void 0 ? event.passed : previous && event.passed;
      verdictByTestCase.set(event.testCaseId, verdict);
    } catch {
      // Unparseable line: ignore.
    }
  }
  const verdicts = [...verdictByTestCase.values()];
  const passedTestCases = verdicts.filter(Boolean).length;
  const failedTestCases = verdicts.length - passedTestCases;
  return { completedTestCases, passedTestCases, failedTestCases };
}
|
|
938
|
+
/**
 * Extracts every `TestCaseProgress` event from a JSONL artifact.
 *
 * Repetition fields fall back to the older `rerunIndex` / `rerunTotal` keys
 * when `repetitionIndex` / `repetitionCount` are absent, and
 * `evaluatorScores` defaults to `[]`.
 *
 * @param artifactPath Path of the artifact file.
 * @returns One result per progress event, in file order; `[]` when the file
 *   cannot be read. Lines that fail to parse are skipped.
 */
async function parseArtifactFile(artifactPath) {
  // Converts one line to a result record, or null when it is not a usable progress event.
  const toProgressResult = (line) => {
    try {
      const event = JSON.parse(line);
      if (event.type !== "TestCaseProgress") {
        return null;
      }
      return {
        testCaseId: event.testCaseId,
        testCaseName: event.testCaseName,
        completedTestCases: event.completedTestCases,
        totalTestCases: event.totalTestCases,
        repetitionId: event.repetitionId,
        repetitionIndex: event.repetitionIndex ?? event.rerunIndex,
        repetitionCount: event.repetitionCount ?? event.rerunTotal,
        passed: event.passed,
        durationMs: event.durationMs,
        evaluatorScores: event.evaluatorScores ?? []
      };
    } catch {
      return null;
    }
  };

  try {
    const content = await promises.readFile(artifactPath, "utf8");
    const results = [];
    for (const line of content.split("\n")) {
      if (line.trim().length === 0) {
        continue;
      }
      const result = toProgressResult(line);
      if (result !== null) {
        results.push(result);
      }
    }
    return results;
  } catch {
    // Unreadable artifact: report no results rather than failing the caller.
    return [];
  }
}
|
|
765
971
|
|
|
766
972
|
// src/runner/config.ts
|
|
767
973
|
var defaultRunnerConfig = {
|
|
@@ -769,6 +975,7 @@ var defaultRunnerConfig = {
|
|
|
769
975
|
rootDir: process.cwd(),
|
|
770
976
|
datasetSuffixes: [".dataset.ts", ".dataset.tsx", ".dataset.js", ".dataset.mjs"],
|
|
771
977
|
evaluatorSuffixes: [".evaluator.ts", ".evaluator.tsx", ".evaluator.js", ".evaluator.mjs"],
|
|
978
|
+
runConfigSuffixes: [".run-config.ts", ".run-config.tsx", ".run-config.js", ".run-config.mjs"],
|
|
772
979
|
testCaseSuffixes: [".test-case.ts", ".test-case.tsx", ".test-case.js", ".test-case.mjs"],
|
|
773
980
|
excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
|
|
774
981
|
},
|
|
@@ -794,6 +1001,11 @@ function toRunnerConfigOverrides(config) {
|
|
|
794
1001
|
} else if (rawDiscovery?.evaluatorSuffixes !== void 0) {
|
|
795
1002
|
discovery.evaluatorSuffixes = rawDiscovery.evaluatorSuffixes;
|
|
796
1003
|
}
|
|
1004
|
+
if (rawDiscovery?.runConfigFilePatterns !== void 0) {
|
|
1005
|
+
discovery.runConfigSuffixes = rawDiscovery.runConfigFilePatterns;
|
|
1006
|
+
} else if (rawDiscovery?.runConfigSuffixes !== void 0) {
|
|
1007
|
+
discovery.runConfigSuffixes = rawDiscovery.runConfigSuffixes;
|
|
1008
|
+
}
|
|
797
1009
|
if (rawDiscovery?.testCaseFilePatterns !== void 0) {
|
|
798
1010
|
discovery.testCaseSuffixes = rawDiscovery.testCaseFilePatterns;
|
|
799
1011
|
} else if (rawDiscovery?.testCaseSuffixes !== void 0) {
|
|
@@ -892,6 +1104,9 @@ function isDatasetLike(value) {
|
|
|
892
1104
|
function isEvaluatorLike(value) {
|
|
893
1105
|
return hasMethod(value, "getName") && hasMethod(value, "resolveContext") && hasMethod(value, "getEvaluateFn");
|
|
894
1106
|
}
|
|
1107
|
+
/**
 * Duck-type check for run-config objects: the value must pass `hasMethod` for
 * both `getName` and `getRuns`, and `getRuns` must be a function.
 *
 * @param value Candidate module export.
 * @returns `true` when the value looks like a run config.
 */
function isRunConfigLike(value) {
  if (!hasMethod(value, "getName")) {
    return false;
  }
  if (!hasMethod(value, "getRuns")) {
    return false;
  }
  return typeof value.getRuns === "function";
}
|
|
895
1110
|
function isTestCaseLike(value) {
|
|
896
1111
|
return hasMethod(value, "getName") && hasMethod(value, "getTags") && hasMethod(value, "getInput");
|
|
897
1112
|
}
|
|
@@ -980,6 +1195,23 @@ async function collectEvaluatorsFromFiles(config) {
|
|
|
980
1195
|
);
|
|
981
1196
|
return found.flat();
|
|
982
1197
|
}
|
|
1198
|
+
/**
 * Discovers run configs exported by files under `config.rootDir`.
 *
 * Walks the directory tree (skipping `config.excludeDirectories`), keeps files
 * matching one of `config.runConfigSuffixes`, loads each module's exports and
 * keeps those that pass `isRunConfigLike`. All matching files are loaded
 * concurrently via `Promise.all`.
 *
 * @param config Discovery settings: `rootDir`, `excludeDirectories`,
 *   `runConfigSuffixes`.
 * @returns A flat list of `{ id, filePath, runConfig }`, where `id` is the run
 *   config's `getName()` and `filePath` is relative to `rootDir`.
 */
async function collectRunConfigsFromFiles(config) {
  const allFiles = await walkDirectory(config.rootDir, config.excludeDirectories);
  const runConfigFiles = allFiles.filter((filePath) => hasOneSuffix(filePath, config.runConfigSuffixes));

  // Loads one file and describes every run config it exports.
  const collectFromFile = async (absolutePath) => {
    const moduleExports = await loadModuleExports(absolutePath);
    const runConfigs = moduleExports.filter(isRunConfigLike);
    const filePath = path.relative(config.rootDir, absolutePath);
    return runConfigs.map((runConfig) => ({
      id: runConfig.getName(),
      filePath,
      runConfig
    }));
  };

  const perFile = await Promise.all(
    runConfigFiles.map((absolutePath) => collectFromFile(absolutePath))
  );
  return perFile.flat();
}
|
|
983
1215
|
async function collectTestCasesFromFiles(config) {
|
|
984
1216
|
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
985
1217
|
const matched = files.filter((filePath) => hasOneSuffix(filePath, config.testCaseSuffixes));
|
|
@@ -1133,6 +1365,17 @@ function getDiffLines(entry) {
|
|
|
1133
1365
|
});
|
|
1134
1366
|
}
|
|
1135
1367
|
|
|
1368
|
+
// src/evals/test-case.ts
|
|
1369
|
+
/**
 * Resolves the label to show for a test case.
 *
 * Preference order: `getDisplayLabel()` when the test case has that method,
 * otherwise `getName()` when present, otherwise the empty string.
 *
 * @param testCase Test-case-like object.
 * @returns Whatever the chosen method returns, or `""` when neither exists.
 */
function getTestCaseDisplayLabel(testCase) {
  const hasDisplayLabel = typeof testCase.getDisplayLabel === "function";
  if (hasDisplayLabel) {
    return testCase.getDisplayLabel();
  }
  if (typeof testCase.getName !== "function") {
    return "";
  }
  return testCase.getName();
}
|
|
1375
|
+
/**
 * Returns a fresh array holding the test case's tags.
 *
 * @param testCase Test-case-like object.
 * @returns A new array spread from `getTags()`, or `[]` when the method is absent.
 */
function getTestCaseTagList(testCase) {
  if (typeof testCase.getTags !== "function") {
    return [];
  }
  const tags = testCase.getTags();
  return [...tags];
}
|
|
1378
|
+
|
|
1136
1379
|
// src/evals/metric.ts
|
|
1137
1380
|
var registry = /* @__PURE__ */ new Map();
|
|
1138
1381
|
var Metric = {
|
|
@@ -1156,6 +1399,54 @@ function getMetricById(id) {
|
|
|
1156
1399
|
return registry.get(id);
|
|
1157
1400
|
}
|
|
1158
1401
|
|
|
1402
|
+
// src/evals/aggregators.ts
|
|
1403
|
+
/**
 * Sums token-usage records field by field.
 *
 * @param values Array of records with optional `input`, `output`,
 *   `inputCached` and `outputCached` numbers; a nullish field counts as 0.
 * @returns A new record holding the four totals (all zero for an empty array).
 */
function aggregateTokenCountSum(values) {
  let input = 0;
  let output = 0;
  let inputCached = 0;
  let outputCached = 0;
  values.forEach((usage) => {
    input += usage.input ?? 0;
    output += usage.output ?? 0;
    inputCached += usage.inputCached ?? 0;
    outputCached += usage.outputCached ?? 0;
  });
  return { input, output, inputCached, outputCached };
}
|
|
1420
|
+
/**
 * Averages the `ms` field of latency records.
 *
 * @param values Array of `{ ms }` records.
 * @returns `{ ms }` holding the arithmetic mean, or `{ ms: 0 }` for an empty array.
 */
function aggregateLatencyAverage(values) {
  const count = values.length;
  if (count === 0) {
    return { ms: 0 };
  }
  let totalMs = 0;
  values.forEach((latency) => {
    totalMs += latency.ms;
  });
  return { ms: totalMs / count };
}
|
|
1427
|
+
|
|
1428
|
+
// src/evals/metrics/standard.ts
|
|
1429
|
+
// Token usage metric: aggregated by summing, rendered as "in/out/cached" counts.
Metric.of({
  id: "token-count",
  name: "Tokens",
  aggregate: aggregateTokenCountSum,
  format: (data, options) => {
    // Cached input and output tokens are reported as a single combined figure.
    const cached = (data.inputCached ?? 0) + (data.outputCached ?? 0);
    const summary = `in:${data.input ?? 0} out:${data.output ?? 0} cached:${cached}`;
    if (options?.isAggregated) {
      return `Total: ${summary}`;
    }
    return summary;
  }
});

// Latency metric: aggregated by averaging, rendered in milliseconds.
Metric.of({
  id: "latency",
  name: "Latency",
  aggregate: aggregateLatencyAverage,
  format: (data, options) => {
    const rendered = `${data.ms}ms`;
    return options?.isAggregated ? `Avg: ${rendered}` : rendered;
  }
});
|
|
1449
|
+
|
|
1159
1450
|
// src/evals/score.ts
|
|
1160
1451
|
var registry2 = /* @__PURE__ */ new Map();
|
|
1161
1452
|
function formatScoreData(def, data, options) {
|
|
@@ -1264,54 +1555,6 @@ function getScoreById(id) {
|
|
|
1264
1555
|
return registry2.get(id);
|
|
1265
1556
|
}
|
|
1266
1557
|
|
|
1267
|
-
// src/evals/aggregators.ts
|
|
1268
|
-
function aggregateTokenCountSum(values) {
|
|
1269
|
-
const initial = {
|
|
1270
|
-
input: 0,
|
|
1271
|
-
output: 0,
|
|
1272
|
-
inputCached: 0,
|
|
1273
|
-
outputCached: 0
|
|
1274
|
-
};
|
|
1275
|
-
return values.reduce(
|
|
1276
|
-
(acc, v) => ({
|
|
1277
|
-
input: acc.input + (v.input ?? 0),
|
|
1278
|
-
output: acc.output + (v.output ?? 0),
|
|
1279
|
-
inputCached: acc.inputCached + (v.inputCached ?? 0),
|
|
1280
|
-
outputCached: acc.outputCached + (v.outputCached ?? 0)
|
|
1281
|
-
}),
|
|
1282
|
-
initial
|
|
1283
|
-
);
|
|
1284
|
-
}
|
|
1285
|
-
function aggregateLatencyAverage(values) {
|
|
1286
|
-
if (values.length === 0) {
|
|
1287
|
-
return { ms: 0 };
|
|
1288
|
-
}
|
|
1289
|
-
const sum = values.reduce((s, v) => s + v.ms, 0);
|
|
1290
|
-
return { ms: sum / values.length };
|
|
1291
|
-
}
|
|
1292
|
-
|
|
1293
|
-
// src/evals/metrics/standard.ts
|
|
1294
|
-
Metric.of({
|
|
1295
|
-
id: "token-count",
|
|
1296
|
-
name: "Tokens",
|
|
1297
|
-
aggregate: aggregateTokenCountSum,
|
|
1298
|
-
format: (data, options) => {
|
|
1299
|
-
const input = data.input ?? 0;
|
|
1300
|
-
const output = data.output ?? 0;
|
|
1301
|
-
const inputCached = data.inputCached ?? 0;
|
|
1302
|
-
const outputCached = data.outputCached ?? 0;
|
|
1303
|
-
const cached = inputCached + outputCached;
|
|
1304
|
-
const base = `in:${input} out:${output} cached:${cached}`;
|
|
1305
|
-
return options?.isAggregated ? `Total: ${base}` : base;
|
|
1306
|
-
}
|
|
1307
|
-
});
|
|
1308
|
-
Metric.of({
|
|
1309
|
-
id: "latency",
|
|
1310
|
-
name: "Latency",
|
|
1311
|
-
aggregate: aggregateLatencyAverage,
|
|
1312
|
-
format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
|
|
1313
|
-
});
|
|
1314
|
-
|
|
1315
1558
|
// src/evals/scores/standard.ts
|
|
1316
1559
|
Score.of({
|
|
1317
1560
|
id: "percent",
|
|
@@ -1418,15 +1661,17 @@ function readOutput(testCase) {
|
|
|
1418
1661
|
}
|
|
1419
1662
|
return candidate.getOutput();
|
|
1420
1663
|
}
|
|
1421
|
-
function buildEvaluationUnits(testCases) {
|
|
1664
|
+
function buildEvaluationUnits(testCases, repetitionCount) {
|
|
1665
|
+
const count = Math.max(1, repetitionCount);
|
|
1422
1666
|
const units = [];
|
|
1423
1667
|
for (const testCaseItem of testCases) {
|
|
1424
|
-
const
|
|
1425
|
-
for (let r = 0; r <
|
|
1668
|
+
const repetitionId = `rep-${crypto.randomUUID()}`;
|
|
1669
|
+
for (let r = 0; r < count; r++) {
|
|
1426
1670
|
units.push({
|
|
1427
1671
|
testCaseItem,
|
|
1428
|
-
|
|
1429
|
-
|
|
1672
|
+
repetitionId,
|
|
1673
|
+
repetitionIndex: r + 1,
|
|
1674
|
+
repetitionCount: count
|
|
1430
1675
|
});
|
|
1431
1676
|
}
|
|
1432
1677
|
}
|
|
@@ -1439,7 +1684,7 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
|
|
|
1439
1684
|
return path.join(artifactDirectory, `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`);
|
|
1440
1685
|
}
|
|
1441
1686
|
function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
|
|
1442
|
-
const { testCaseItem,
|
|
1687
|
+
const { testCaseItem, repetitionId, repetitionIndex, repetitionCount } = unit;
|
|
1443
1688
|
return effect.Effect.gen(function* () {
|
|
1444
1689
|
const evaluatorRunId = `run-${crypto.randomUUID()}`;
|
|
1445
1690
|
const started = Date.now();
|
|
@@ -1448,11 +1693,12 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1448
1693
|
type: "TestCaseStarted",
|
|
1449
1694
|
runId: task.runId,
|
|
1450
1695
|
testCaseId: testCaseItem.id,
|
|
1451
|
-
testCaseName: testCaseItem.testCase
|
|
1696
|
+
testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
|
|
1452
1697
|
startedTestCases: startedEvaluations,
|
|
1453
1698
|
totalTestCases: totalEvaluations,
|
|
1454
|
-
|
|
1455
|
-
|
|
1699
|
+
repetitionId,
|
|
1700
|
+
repetitionIndex,
|
|
1701
|
+
repetitionCount
|
|
1456
1702
|
});
|
|
1457
1703
|
const evaluatorScores = [];
|
|
1458
1704
|
let testCaseError;
|
|
@@ -1486,8 +1732,15 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1486
1732
|
meta: {
|
|
1487
1733
|
triggerId: task.triggerId,
|
|
1488
1734
|
runId: evaluatorRunId,
|
|
1489
|
-
|
|
1735
|
+
datasetName: task.dataset.getDisplayLabel(),
|
|
1736
|
+
repetitionId,
|
|
1737
|
+
repetitionIndex,
|
|
1738
|
+
repetitionCount,
|
|
1739
|
+
runConfigName: task.runConfigName
|
|
1490
1740
|
},
|
|
1741
|
+
testCaseTags: getTestCaseTagList(testCaseItem.testCase),
|
|
1742
|
+
runConfigTags: task.runConfigTags,
|
|
1743
|
+
evaluatorTags: getEvaluatorTagList(evaluator),
|
|
1491
1744
|
logDiff,
|
|
1492
1745
|
log,
|
|
1493
1746
|
createError
|
|
@@ -1530,18 +1783,19 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1530
1783
|
});
|
|
1531
1784
|
}
|
|
1532
1785
|
}
|
|
1533
|
-
const
|
|
1786
|
+
const repetitionPassedThis = evaluatorScores.every((s) => s.passed);
|
|
1534
1787
|
const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [n + 1, n + 1]);
|
|
1535
1788
|
const progressEvent = {
|
|
1536
1789
|
type: "TestCaseProgress",
|
|
1537
1790
|
runId: task.runId,
|
|
1538
1791
|
testCaseId: testCaseItem.id,
|
|
1539
|
-
testCaseName: testCaseItem.testCase
|
|
1792
|
+
testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
|
|
1540
1793
|
completedTestCases: completedEvaluations,
|
|
1541
1794
|
totalTestCases: totalEvaluations,
|
|
1542
|
-
|
|
1543
|
-
|
|
1544
|
-
|
|
1795
|
+
repetitionId,
|
|
1796
|
+
repetitionIndex,
|
|
1797
|
+
repetitionCount,
|
|
1798
|
+
passed: repetitionPassedThis,
|
|
1545
1799
|
durationMs: Date.now() - started,
|
|
1546
1800
|
evaluatorScores,
|
|
1547
1801
|
output,
|
|
@@ -1562,9 +1816,9 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1562
1816
|
(map) => {
|
|
1563
1817
|
const key = testCaseItem.id;
|
|
1564
1818
|
const existing = map.get(key) ?? { completedCount: 0, results: [] };
|
|
1565
|
-
const newResults = [...existing.results,
|
|
1819
|
+
const newResults = [...existing.results, repetitionPassedThis];
|
|
1566
1820
|
const newCompletedCount = existing.completedCount + 1;
|
|
1567
|
-
const isLast = newCompletedCount ===
|
|
1821
|
+
const isLast = newCompletedCount === repetitionCount;
|
|
1568
1822
|
const newMap = new Map(map);
|
|
1569
1823
|
newMap.set(key, {
|
|
1570
1824
|
completedCount: newCompletedCount,
|
|
@@ -1601,10 +1855,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
1601
1855
|
runId: task.runId,
|
|
1602
1856
|
startedAt
|
|
1603
1857
|
});
|
|
1604
|
-
const totalEvaluations = task.testCases.
|
|
1605
|
-
(sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
|
|
1606
|
-
0
|
|
1607
|
-
);
|
|
1858
|
+
const totalEvaluations = task.testCases.length * Math.max(1, task.repetitions);
|
|
1608
1859
|
const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
|
|
1609
1860
|
const completedRef = yield* effect.Ref.make(0);
|
|
1610
1861
|
const startedRef = yield* effect.Ref.make(0);
|
|
@@ -1613,7 +1864,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
1613
1864
|
const testCaseResultsRef = yield* effect.Ref.make(
|
|
1614
1865
|
/* @__PURE__ */ new Map()
|
|
1615
1866
|
);
|
|
1616
|
-
const evaluationUnits = buildEvaluationUnits(task.testCases);
|
|
1867
|
+
const evaluationUnits = buildEvaluationUnits(task.testCases, task.repetitions);
|
|
1617
1868
|
const processEvaluation = (unit) => processOneEvaluation(
|
|
1618
1869
|
task,
|
|
1619
1870
|
unit,
|
|
@@ -1627,11 +1878,20 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
1627
1878
|
failedRef,
|
|
1628
1879
|
testCaseResultsRef
|
|
1629
1880
|
);
|
|
1630
|
-
|
|
1631
|
-
|
|
1632
|
-
|
|
1633
|
-
|
|
1634
|
-
|
|
1881
|
+
const globalSem = task.globalEvaluationSemaphore;
|
|
1882
|
+
if (globalSem !== void 0) {
|
|
1883
|
+
yield* effect.Effect.forEach(
|
|
1884
|
+
evaluationUnits,
|
|
1885
|
+
(unit) => globalSem.withPermits(1)(processEvaluation(unit)),
|
|
1886
|
+
{ concurrency: "unbounded", discard: true }
|
|
1887
|
+
);
|
|
1888
|
+
} else {
|
|
1889
|
+
yield* effect.Effect.forEach(
|
|
1890
|
+
evaluationUnits,
|
|
1891
|
+
processEvaluation,
|
|
1892
|
+
maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
|
|
1893
|
+
);
|
|
1894
|
+
}
|
|
1635
1895
|
const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* effect.Effect.all([
|
|
1636
1896
|
effect.Ref.get(completedRef),
|
|
1637
1897
|
effect.Ref.get(passedRef),
|
|
@@ -1667,155 +1927,34 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
1667
1927
|
artifactPath: task.snapshot.artifactPath
|
|
1668
1928
|
});
|
|
1669
1929
|
});
|
|
1670
|
-
|
|
1671
|
-
|
|
1672
|
-
|
|
1673
|
-
|
|
1674
|
-
|
|
1675
|
-
} catch {
|
|
1676
|
-
return [];
|
|
1677
|
-
}
|
|
1678
|
-
const jsonlFiles = entries.filter((name) => name.endsWith(".jsonl"));
|
|
1679
|
-
const snapshots = [];
|
|
1680
|
-
for (const fileName of jsonlFiles) {
|
|
1681
|
-
const filePath = path.join(baseDir, fileName);
|
|
1682
|
-
try {
|
|
1683
|
-
const snapshot = await parseArtifactToSnapshot(filePath, config);
|
|
1684
|
-
if (snapshot) {
|
|
1685
|
-
snapshots.push(snapshot);
|
|
1686
|
-
}
|
|
1687
|
-
} catch {
|
|
1688
|
-
}
|
|
1689
|
-
}
|
|
1690
|
-
return snapshots.sort((a, b) => b.queuedAt - a.queuedAt);
|
|
1691
|
-
}
|
|
1692
|
-
async function parseArtifactToSnapshot(filePath, _config) {
|
|
1693
|
-
const content = await promises.readFile(filePath, "utf8");
|
|
1694
|
-
const lines = content.split("\n").filter((line) => line.trim().length > 0);
|
|
1695
|
-
if (lines.length === 0) {
|
|
1696
|
-
return null;
|
|
1697
|
-
}
|
|
1698
|
-
let runQueued = null;
|
|
1699
|
-
let runCompleted = null;
|
|
1700
|
-
let runFailed = null;
|
|
1701
|
-
let runStarted = null;
|
|
1702
|
-
for (const line of lines) {
|
|
1703
|
-
try {
|
|
1704
|
-
const event = JSON.parse(line);
|
|
1705
|
-
const type = event.type;
|
|
1706
|
-
if (type === "RunQueued") {
|
|
1707
|
-
runQueued = {
|
|
1708
|
-
runId: event.runId,
|
|
1709
|
-
datasetId: event.datasetId,
|
|
1710
|
-
datasetName: event.datasetName,
|
|
1711
|
-
evaluatorIds: event.evaluatorIds,
|
|
1712
|
-
totalTestCases: event.totalTestCases ?? 0,
|
|
1713
|
-
artifactPath: event.artifactPath ?? filePath,
|
|
1714
|
-
ts: event.ts
|
|
1715
|
-
};
|
|
1716
|
-
}
|
|
1717
|
-
if (type === "RunStarted") {
|
|
1718
|
-
runStarted = { startedAt: event.startedAt };
|
|
1719
|
-
}
|
|
1720
|
-
if (type === "RunCompleted") {
|
|
1721
|
-
runCompleted = {
|
|
1722
|
-
passedTestCases: event.passedTestCases,
|
|
1723
|
-
failedTestCases: event.failedTestCases,
|
|
1724
|
-
totalTestCases: event.totalTestCases,
|
|
1725
|
-
finishedAt: event.finishedAt
|
|
1726
|
-
};
|
|
1727
|
-
}
|
|
1728
|
-
if (type === "RunFailed") {
|
|
1729
|
-
runFailed = {
|
|
1730
|
-
finishedAt: event.finishedAt,
|
|
1731
|
-
errorMessage: event.errorMessage
|
|
1732
|
-
};
|
|
1733
|
-
}
|
|
1734
|
-
} catch {
|
|
1735
|
-
}
|
|
1930
|
+
|
|
1931
|
+
// src/runner/name-pattern.ts
|
|
1932
|
+
/**
 * Splits a `/source/flags` string into its parts.
 *
 * @param pattern Candidate regex literal, e.g. `"/^nightly-/i"`.
 * @returns `{ source, flags }`, or `undefined` when `pattern` does not start
 *   with `/` or contains no second `/`.
 */
function parseRegexLiteral(pattern) {
  if (!pattern.startsWith("/")) {
    return void 0;
  }
  const lastSlash = pattern.lastIndexOf("/");
  // lastSlash === 0 means the only slash is the leading one.
  if (lastSlash <= 0) {
    return void 0;
  }
  return {
    source: pattern.slice(1, lastSlash),
    flags: pattern.slice(lastSlash + 1)
  };
}

/**
 * Builds a predicate that tests names against a user-supplied pattern.
 *
 * The trimmed pattern is interpreted as, in order:
 * 1. a regex literal (`/source/flags`), used as written;
 * 2. a wildcard pattern when it contains `*` — `*` matches any run of
 *    characters, every other character is literal, and the match is anchored
 *    and case-insensitive;
 * 3. otherwise a case-insensitive exact match.
 *
 * @param pattern Pattern text.
 * @returns `(value) => boolean`.
 * @throws {SyntaxError} When a regex literal has an invalid source or flags.
 */
function createNameMatcher(pattern) {
  const normalizedPattern = pattern.trim();
  const regexLiteral = parseRegexLiteral(normalizedPattern);
  if (regexLiteral) {
    const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
    return (value) => {
      // With the `g` or `y` flag, `test` resumes from `lastIndex`; reset it so
      // every call is evaluated independently of earlier ones.
      regex.lastIndex = 0;
      return regex.test(value);
    };
  }
  if (normalizedPattern.includes("*")) {
    // Escape every regex metacharacter except `*` (including `?`, which would
    // otherwise make the preceding character optional), then expand `*`.
    const escaped = normalizedPattern.replace(/[.+?^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
    const regex = new RegExp(`^${escaped}$`, "i");
    return (value) => regex.test(value);
  }
  const expected = normalizedPattern.toLowerCase();
  return (value) => value.toLowerCase() === expected;
}
|
|
1820
1959
|
async function appendJsonLine(artifactPath, payload) {
|
|
1821
1960
|
await promises.mkdir(path.dirname(artifactPath), { recursive: true });
|
|
@@ -1874,32 +2013,12 @@ function searchCollectedTestCases(all, query) {
|
|
|
1874
2013
|
}
|
|
1875
2014
|
|
|
1876
2015
|
// src/runner/api.ts
|
|
1877
|
-
function
|
|
1878
|
-
|
|
1879
|
-
|
|
1880
|
-
|
|
1881
|
-
const lastSlash = pattern.lastIndexOf("/");
|
|
1882
|
-
if (lastSlash <= 0) {
|
|
1883
|
-
return void 0;
|
|
2016
|
+
function normalizeRunRepetitions(value) {
|
|
2017
|
+
const n = value ?? 1;
|
|
2018
|
+
if (!Number.isInteger(n) || n < 1) {
|
|
2019
|
+
throw new Error(`repetitions must be a positive integer, got ${String(value)}`);
|
|
1884
2020
|
}
|
|
1885
|
-
return
|
|
1886
|
-
source: pattern.slice(1, lastSlash),
|
|
1887
|
-
flags: pattern.slice(lastSlash + 1)
|
|
1888
|
-
};
|
|
1889
|
-
}
|
|
1890
|
-
function createNameMatcher(pattern) {
|
|
1891
|
-
const normalizedPattern = pattern.trim();
|
|
1892
|
-
const regexLiteral = parseRegexLiteral(normalizedPattern);
|
|
1893
|
-
if (regexLiteral) {
|
|
1894
|
-
const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
|
|
1895
|
-
return (value) => regex.test(value);
|
|
1896
|
-
}
|
|
1897
|
-
if (normalizedPattern.includes("*")) {
|
|
1898
|
-
const escaped = normalizedPattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
|
|
1899
|
-
const regex = new RegExp(`^${escaped}$`, "i");
|
|
1900
|
-
return (value) => regex.test(value);
|
|
1901
|
-
}
|
|
1902
|
-
return (value) => value.toLowerCase() === normalizedPattern.toLowerCase();
|
|
2021
|
+
return n;
|
|
1903
2022
|
}
|
|
1904
2023
|
function mergeRunnerOverrides(base, next) {
|
|
1905
2024
|
if (!base) {
|
|
@@ -1934,6 +2053,7 @@ var EffectRunner = class {
|
|
|
1934
2053
|
this.listeners = /* @__PURE__ */ new Set();
|
|
1935
2054
|
this.datasetsById = /* @__PURE__ */ new Map();
|
|
1936
2055
|
this.evaluatorsById = /* @__PURE__ */ new Map();
|
|
2056
|
+
this.runConfigsById = /* @__PURE__ */ new Map();
|
|
1937
2057
|
this.schedulerFiber = effect.Effect.runFork(this.createSchedulerEffect());
|
|
1938
2058
|
this.persistenceFiber = effect.Effect.runFork(
|
|
1939
2059
|
createPersistenceWorker(this.persistenceQueue)
|
|
@@ -1974,6 +2094,137 @@ var EffectRunner = class {
|
|
|
1974
2094
|
(item) => matcher(item.evaluator.getName() ?? "")
|
|
1975
2095
|
);
|
|
1976
2096
|
}
|
|
2097
|
+
async collectRunConfigs() {
|
|
2098
|
+
const runConfigs = await collectRunConfigsFromFiles(this.config.discovery);
|
|
2099
|
+
this.runConfigsById.clear();
|
|
2100
|
+
const byNameLower = /* @__PURE__ */ new Map();
|
|
2101
|
+
for (const item of runConfigs) {
|
|
2102
|
+
const id = item.runConfig.getName();
|
|
2103
|
+
const lower = id.toLowerCase();
|
|
2104
|
+
const prev = byNameLower.get(lower);
|
|
2105
|
+
if (prev !== void 0 && prev.filePath !== item.filePath) {
|
|
2106
|
+
throw new Error(
|
|
2107
|
+
`Duplicate RunConfig name "${id}" (matches "${prev.runConfig.getName()}" case-insensitively): ${prev.filePath} and ${item.filePath}`
|
|
2108
|
+
);
|
|
2109
|
+
}
|
|
2110
|
+
byNameLower.set(lower, item);
|
|
2111
|
+
this.runConfigsById.set(id, item);
|
|
2112
|
+
}
|
|
2113
|
+
return runConfigs;
|
|
2114
|
+
}
|
|
2115
|
+
async resolveRunConfigByName(name) {
|
|
2116
|
+
if (this.runConfigsById.size === 0) {
|
|
2117
|
+
await this.collectRunConfigs();
|
|
2118
|
+
}
|
|
2119
|
+
const key = validateRunConfigName(name, `RunConfig "${name.trim()}"`);
|
|
2120
|
+
const keyLower = key.toLowerCase();
|
|
2121
|
+
const matches = Array.from(this.runConfigsById.values()).filter(
|
|
2122
|
+
(item) => item.runConfig.getName().toLowerCase() === keyLower
|
|
2123
|
+
);
|
|
2124
|
+
if (matches.length === 0) {
|
|
2125
|
+
return void 0;
|
|
2126
|
+
}
|
|
2127
|
+
if (matches.length > 1) {
|
|
2128
|
+
throw new Error(
|
|
2129
|
+
`Multiple RunConfigs named "${name}": ${matches.map((m) => m.filePath).join(", ")}`
|
|
2130
|
+
);
|
|
2131
|
+
}
|
|
2132
|
+
return matches[0];
|
|
2133
|
+
}
|
|
2134
|
+
async expandRunConfigToJobs(collected) {
|
|
2135
|
+
if (this.datasetsById.size === 0) {
|
|
2136
|
+
await this.collectDatasets();
|
|
2137
|
+
}
|
|
2138
|
+
if (this.evaluatorsById.size === 0) {
|
|
2139
|
+
await this.collectEvaluators();
|
|
2140
|
+
}
|
|
2141
|
+
const rcName = collected.runConfig.getName();
|
|
2142
|
+
const jobs = [];
|
|
2143
|
+
const runs = collected.runConfig.getRuns();
|
|
2144
|
+
for (const [i, row] of runs.entries()) {
|
|
2145
|
+
const dsCollected = Array.from(this.datasetsById.values()).find(
|
|
2146
|
+
(d) => d.dataset === row.dataset
|
|
2147
|
+
);
|
|
2148
|
+
if (!dsCollected) {
|
|
2149
|
+
throw new Error(
|
|
2150
|
+
`RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getDisplayLabel()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
|
|
2151
|
+
);
|
|
2152
|
+
}
|
|
2153
|
+
let evaluatorIds;
|
|
2154
|
+
if ("evaluatorPattern" in row && typeof row.evaluatorPattern === "string") {
|
|
2155
|
+
const matcher = createNameMatcher(row.evaluatorPattern);
|
|
2156
|
+
const matched = Array.from(this.evaluatorsById.values()).filter(
|
|
2157
|
+
(item) => matcher(item.evaluator.getName() ?? "")
|
|
2158
|
+
);
|
|
2159
|
+
if (matched.length === 0) {
|
|
2160
|
+
throw new Error(
|
|
2161
|
+
`RunConfig "${rcName}" run[${i}]: no evaluator matched pattern "${row.evaluatorPattern}"`
|
|
2162
|
+
);
|
|
2163
|
+
}
|
|
2164
|
+
evaluatorIds = matched.map((item) => item.id);
|
|
2165
|
+
} else {
|
|
2166
|
+
const evaluators = row.evaluators;
|
|
2167
|
+
evaluatorIds = [];
|
|
2168
|
+
for (const ev of evaluators) {
|
|
2169
|
+
const found = Array.from(this.evaluatorsById.values()).find(
|
|
2170
|
+
(item) => item.evaluator === ev
|
|
2171
|
+
);
|
|
2172
|
+
if (!found) {
|
|
2173
|
+
throw new Error(
|
|
2174
|
+
`RunConfig "${rcName}" run[${i}]: evaluator "${getEvaluatorDisplayLabel(ev) ?? "unknown"}" was not found among discovered evaluator exports`
|
|
2175
|
+
);
|
|
2176
|
+
}
|
|
2177
|
+
evaluatorIds.push(found.id);
|
|
2178
|
+
}
|
|
2179
|
+
}
|
|
2180
|
+
const repetitions = "repetitions" in row && row.repetitions !== void 0 ? row.repetitions : 1;
|
|
2181
|
+
jobs.push({
|
|
2182
|
+
datasetId: dsCollected.id,
|
|
2183
|
+
evaluatorIds,
|
|
2184
|
+
runConfigName: rcName,
|
|
2185
|
+
runConfigDisplayLabel: collected.runConfig.getDisplayLabel(),
|
|
2186
|
+
runConfigTags: collected.runConfig.getTags(),
|
|
2187
|
+
repetitions
|
|
2188
|
+
});
|
|
2189
|
+
}
|
|
2190
|
+
return jobs;
|
|
2191
|
+
}
|
|
2192
|
+
async expandRunConfigNamesToJobs(names) {
|
|
2193
|
+
const jobs = [];
|
|
2194
|
+
for (const name of names) {
|
|
2195
|
+
const collected = await this.resolveRunConfigByName(name);
|
|
2196
|
+
if (!collected) {
|
|
2197
|
+
const known = await this.collectRunConfigs();
|
|
2198
|
+
const available = known.map((r) => r.runConfig.getName()).sort();
|
|
2199
|
+
throw new Error(
|
|
2200
|
+
available.length > 0 ? `RunConfig "${name}" not found. Available RunConfigs: ${available.join(", ")}` : `RunConfig "${name}" not found and no RunConfigs were discovered.`
|
|
2201
|
+
);
|
|
2202
|
+
}
|
|
2203
|
+
jobs.push(...await this.expandRunConfigToJobs(collected));
|
|
2204
|
+
}
|
|
2205
|
+
return jobs;
|
|
2206
|
+
}
|
|
2207
|
+
async runDatasetJobsWithSharedConcurrency(request) {
|
|
2208
|
+
const globalConcurrency = Math.max(1, request.globalConcurrency);
|
|
2209
|
+
const sem = effect.Effect.unsafeMakeSemaphore(globalConcurrency);
|
|
2210
|
+
const triggerId = request.triggerId ?? `trg-${crypto.randomUUID()}`;
|
|
2211
|
+
const snapshots = [];
|
|
2212
|
+
for (const job of request.jobs) {
|
|
2213
|
+
snapshots.push(
|
|
2214
|
+
await this.startDatasetRun({
|
|
2215
|
+
datasetId: job.datasetId,
|
|
2216
|
+
evaluatorIds: job.evaluatorIds,
|
|
2217
|
+
triggerId,
|
|
2218
|
+
maxConcurrency: this.config.maxConcurrency ?? 1,
|
|
2219
|
+
globalEvaluationSemaphore: sem,
|
|
2220
|
+
runConfigName: job.runConfigName,
|
|
2221
|
+
runConfigTags: job.runConfigTags,
|
|
2222
|
+
repetitions: job.repetitions
|
|
2223
|
+
})
|
|
2224
|
+
);
|
|
2225
|
+
}
|
|
2226
|
+
return snapshots;
|
|
2227
|
+
}
|
|
1977
2228
|
async searchTestCases(query) {
|
|
1978
2229
|
const testCases = await collectTestCasesFromFiles(this.config.discovery);
|
|
1979
2230
|
return searchCollectedTestCases(testCases, query);
|
|
@@ -1992,36 +2243,46 @@ var EffectRunner = class {
|
|
|
1992
2243
|
);
|
|
1993
2244
|
}
|
|
1994
2245
|
async runDatasetWith(request) {
|
|
2246
|
+
const runConfigName = validateRunConfigName(
|
|
2247
|
+
request.runConfigName,
|
|
2248
|
+
"runDatasetWith.runConfigName"
|
|
2249
|
+
);
|
|
2250
|
+
return this.startDatasetRun({
|
|
2251
|
+
datasetId: request.datasetId,
|
|
2252
|
+
evaluatorIds: request.evaluatorIds,
|
|
2253
|
+
triggerId: request.triggerId,
|
|
2254
|
+
maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
|
|
2255
|
+
repetitions: request.repetitions,
|
|
2256
|
+
runConfigName,
|
|
2257
|
+
runConfigTags: request.runConfigTags
|
|
2258
|
+
});
|
|
2259
|
+
}
|
|
2260
|
+
async startDatasetRun(params) {
|
|
1995
2261
|
if (this.datasetsById.size === 0) {
|
|
1996
2262
|
await this.collectDatasets();
|
|
1997
2263
|
}
|
|
1998
2264
|
if (this.evaluatorsById.size === 0) {
|
|
1999
2265
|
await this.collectEvaluators();
|
|
2000
2266
|
}
|
|
2001
|
-
const dataset = this.datasetsById.get(
|
|
2267
|
+
const dataset = this.datasetsById.get(params.datasetId);
|
|
2002
2268
|
if (!dataset) {
|
|
2003
|
-
throw new Error(`Unknown dataset: ${
|
|
2269
|
+
throw new Error(`Unknown dataset: ${params.datasetId}`);
|
|
2004
2270
|
}
|
|
2005
|
-
const selectedEvaluators =
|
|
2271
|
+
const selectedEvaluators = params.evaluatorIds.map((id) => this.evaluatorsById.get(id)).filter((value) => Boolean(value)).map((value) => ({ id: value.id, evaluator: value.evaluator }));
|
|
2006
2272
|
if (selectedEvaluators.length === 0) {
|
|
2007
2273
|
throw new Error("No evaluators selected for run");
|
|
2008
2274
|
}
|
|
2009
|
-
const selectedTestCases = await this.collectDatasetTestCases(
|
|
2010
|
-
const
|
|
2011
|
-
|
|
2012
|
-
|
|
2013
|
-
)
|
|
2014
|
-
const triggerId = request.triggerId ?? `trg-${crypto.randomUUID()}`;
|
|
2275
|
+
const selectedTestCases = await this.collectDatasetTestCases(params.datasetId);
|
|
2276
|
+
const repetitions = normalizeRunRepetitions(params.repetitions);
|
|
2277
|
+
const totalEvaluations = selectedTestCases.length * repetitions;
|
|
2278
|
+
const runConfigTags = [...params.runConfigTags ?? []];
|
|
2279
|
+
const triggerId = params.triggerId ?? `trg-${crypto.randomUUID()}`;
|
|
2015
2280
|
const runId = `run-${crypto.randomUUID()}`;
|
|
2016
|
-
const artifactPath = createArtifactPath(
|
|
2017
|
-
this.config.artifactDirectory,
|
|
2018
|
-
request.datasetId,
|
|
2019
|
-
runId
|
|
2020
|
-
);
|
|
2281
|
+
const artifactPath = createArtifactPath(this.config.artifactDirectory, params.datasetId, runId);
|
|
2021
2282
|
const snapshot = {
|
|
2022
2283
|
runId,
|
|
2023
|
-
datasetId:
|
|
2024
|
-
datasetName: dataset.dataset.
|
|
2284
|
+
datasetId: params.datasetId,
|
|
2285
|
+
datasetName: dataset.dataset.getDisplayLabel(),
|
|
2025
2286
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
2026
2287
|
queuedAt: Date.now(),
|
|
2027
2288
|
totalTestCases: totalEvaluations,
|
|
@@ -2041,8 +2302,8 @@ var EffectRunner = class {
|
|
|
2041
2302
|
const queuedEvent = {
|
|
2042
2303
|
type: "RunQueued",
|
|
2043
2304
|
runId,
|
|
2044
|
-
datasetId:
|
|
2045
|
-
datasetName: dataset.dataset.
|
|
2305
|
+
datasetId: params.datasetId,
|
|
2306
|
+
datasetName: dataset.dataset.getDisplayLabel(),
|
|
2046
2307
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
2047
2308
|
totalTestCases: totalEvaluations,
|
|
2048
2309
|
artifactPath
|
|
@@ -2055,17 +2316,20 @@ var EffectRunner = class {
|
|
|
2055
2316
|
payload: queuedEvent
|
|
2056
2317
|
})
|
|
2057
2318
|
);
|
|
2058
|
-
const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
|
|
2059
2319
|
await effect.Effect.runPromise(
|
|
2060
2320
|
effect.Queue.offer(this.runQueue, {
|
|
2061
2321
|
runId,
|
|
2062
2322
|
triggerId,
|
|
2063
|
-
datasetId:
|
|
2323
|
+
datasetId: params.datasetId,
|
|
2064
2324
|
dataset: dataset.dataset,
|
|
2065
2325
|
evaluators: selectedEvaluators,
|
|
2066
2326
|
testCases: selectedTestCases,
|
|
2067
2327
|
snapshot,
|
|
2068
|
-
maxConcurrency
|
|
2328
|
+
maxConcurrency: params.maxConcurrency,
|
|
2329
|
+
globalEvaluationSemaphore: params.globalEvaluationSemaphore,
|
|
2330
|
+
runConfigName: params.runConfigName,
|
|
2331
|
+
runConfigTags,
|
|
2332
|
+
repetitions
|
|
2069
2333
|
})
|
|
2070
2334
|
);
|
|
2071
2335
|
return snapshot;
|
|
@@ -2136,6 +2400,11 @@ var EffectRunner = class {
|
|
|
2136
2400
|
);
|
|
2137
2401
|
}
|
|
2138
2402
|
};
|
|
2403
|
+
|
|
2404
|
+
// src/runner/events.ts
|
|
2405
|
+
var PROGRAMMATIC_RUN_CONFIG = {
|
|
2406
|
+
runConfigName: "programmatic"
|
|
2407
|
+
};
|
|
2139
2408
|
var LEFT_PANE_WIDTH2 = 44;
|
|
2140
2409
|
var MAX_RUNS_FOR_CHART = 12;
|
|
2141
2410
|
var MAX_RUNS_FOR_TREND = 20;
|
|
@@ -2483,7 +2752,7 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
2483
2752
|
rows.push(/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }, "sp6"));
|
|
2484
2753
|
rows.push(/* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Test cases" }, "tc-h"));
|
|
2485
2754
|
for (const tc of testCases) {
|
|
2486
|
-
const
|
|
2755
|
+
const repetitionPart = tc.repetitionCount != null && tc.repetitionCount > 1 && tc.repetitionIndex != null ? ` (${tc.repetitionIndex}/${tc.repetitionCount})` : "";
|
|
2487
2756
|
rows.push(
|
|
2488
2757
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
2489
2758
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", children: [
|
|
@@ -2495,13 +2764,13 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
2495
2764
|
] }),
|
|
2496
2765
|
" ",
|
|
2497
2766
|
tc.testCaseName,
|
|
2498
|
-
|
|
2767
|
+
repetitionPart ? /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", children: repetitionPart }) : null,
|
|
2499
2768
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
2500
2769
|
" (",
|
|
2501
2770
|
tc.durationMs,
|
|
2502
2771
|
"ms)"
|
|
2503
2772
|
] })
|
|
2504
|
-
] }, `tc-${tc.testCaseId}-${tc.
|
|
2773
|
+
] }, `tc-${tc.testCaseId}-${tc.repetitionId ?? "x"}-${tc.repetitionIndex ?? 0}`)
|
|
2505
2774
|
);
|
|
2506
2775
|
for (const item of tc.evaluatorScores) {
|
|
2507
2776
|
const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
|
|
@@ -2827,7 +3096,8 @@ function EvalsCliApp({ data, args, runner }) {
|
|
|
2827
3096
|
}
|
|
2828
3097
|
void runner.runDatasetWith({
|
|
2829
3098
|
datasetId: selectedDataset.id,
|
|
2830
|
-
evaluatorIds: clampedState.selectedEvaluatorIds
|
|
3099
|
+
evaluatorIds: clampedState.selectedEvaluatorIds,
|
|
3100
|
+
...PROGRAMMATIC_RUN_CONFIG
|
|
2831
3101
|
}).then((snapshot) => {
|
|
2832
3102
|
setRuntimeMessage(
|
|
2833
3103
|
`Started ${snapshot.runId} on ${selectedDataset.name} (${snapshot.totalTestCases} cases).`
|