@m4trix/evals 0.25.1 → 0.26.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +29 -7
- package/dist/cli-simple.cjs +831 -450
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +832 -451
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +531 -270
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +531 -270
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +888 -509
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +201 -7
- package/dist/index.js +878 -513
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli.cjs
CHANGED
|
@@ -5,16 +5,16 @@ var fullscreenInk = require('fullscreen-ink');
|
|
|
5
5
|
var React = require('react');
|
|
6
6
|
var ink = require('ink');
|
|
7
7
|
var jsxRuntime = require('react/jsx-runtime');
|
|
8
|
-
var path = require('path');
|
|
9
|
-
var inkChart = require('@pppp606/ink-chart');
|
|
10
|
-
var crypto = require('crypto');
|
|
11
8
|
var effect = require('effect');
|
|
9
|
+
var crypto = require('crypto');
|
|
10
|
+
var promises = require('fs/promises');
|
|
11
|
+
var path = require('path');
|
|
12
12
|
var fs = require('fs');
|
|
13
13
|
var jitiModule = require('jiti');
|
|
14
|
-
var promises = require('fs/promises');
|
|
15
14
|
var url = require('url');
|
|
16
15
|
var diff = require('diff');
|
|
17
16
|
var stringify = require('fast-json-stable-stringify');
|
|
17
|
+
var inkChart = require('@pppp606/ink-chart');
|
|
18
18
|
|
|
19
19
|
var _documentCurrentScript = typeof document !== 'undefined' ? document.currentScript : null;
|
|
20
20
|
function _interopDefault (e) { return e && e.__esModule ? e : { default: e }; }
|
|
@@ -264,6 +264,50 @@ function isPrintableCharacter(input) {
|
|
|
264
264
|
function isBackKey(key) {
|
|
265
265
|
return key.backspace || key.delete;
|
|
266
266
|
}
|
|
267
|
+
var ENTITY_ID_PATTERN = /^[a-zA-Z0-9_-]+$/;
|
|
268
|
+
function makeEntityIdSchema(brand, label) {
|
|
269
|
+
return effect.Schema.String.pipe(
|
|
270
|
+
effect.Schema.trimmed(),
|
|
271
|
+
effect.Schema.minLength(1, {
|
|
272
|
+
message: () => `${label} must be non-empty.`
|
|
273
|
+
}),
|
|
274
|
+
effect.Schema.pattern(ENTITY_ID_PATTERN, {
|
|
275
|
+
message: () => `${label} may only contain letters, digits, underscores, and hyphens (no spaces). Examples: "my-nightly", "my_nightly", "myNightly".`
|
|
276
|
+
}),
|
|
277
|
+
effect.Schema.brand(brand)
|
|
278
|
+
);
|
|
279
|
+
}
|
|
280
|
+
var RunConfigNameSchema = makeEntityIdSchema("RunConfigName", "RunConfig name");
|
|
281
|
+
makeEntityIdSchema("EvaluatorName", "Evaluator name");
|
|
282
|
+
makeEntityIdSchema("TestCaseName", "Test case name");
|
|
283
|
+
function validateWithSchema(schema, raw, context) {
|
|
284
|
+
const trimmed = raw.trim();
|
|
285
|
+
const decode = effect.Schema.decodeUnknownEither(
|
|
286
|
+
schema
|
|
287
|
+
);
|
|
288
|
+
const result = decode(trimmed);
|
|
289
|
+
if (effect.Either.isLeft(result)) {
|
|
290
|
+
throw new Error(`${context}: ${effect.ParseResult.TreeFormatter.formatErrorSync(result.left)}`);
|
|
291
|
+
}
|
|
292
|
+
return result.right;
|
|
293
|
+
}
|
|
294
|
+
function validateRunConfigName(raw, context) {
|
|
295
|
+
return validateWithSchema(RunConfigNameSchema, raw, context);
|
|
296
|
+
}
|
|
297
|
+
|
|
298
|
+
// src/evals/evaluator.ts
|
|
299
|
+
function getEvaluatorDisplayLabel(evaluator) {
|
|
300
|
+
if (typeof evaluator.getDisplayLabel === "function") {
|
|
301
|
+
const label = evaluator.getDisplayLabel();
|
|
302
|
+
if (label !== void 0) {
|
|
303
|
+
return label;
|
|
304
|
+
}
|
|
305
|
+
}
|
|
306
|
+
return typeof evaluator.getName === "function" ? evaluator.getName() : void 0;
|
|
307
|
+
}
|
|
308
|
+
function getEvaluatorTagList(evaluator) {
|
|
309
|
+
return typeof evaluator.getTags === "function" ? [...evaluator.getTags()] : [];
|
|
310
|
+
}
|
|
267
311
|
|
|
268
312
|
// src/cli/data.mock.json
|
|
269
313
|
var data_mock_default = {
|
|
@@ -519,7 +563,7 @@ function toEvalDataset(item, snapshots) {
|
|
|
519
563
|
function toEvaluatorOption(item) {
|
|
520
564
|
return {
|
|
521
565
|
id: item.id,
|
|
522
|
-
name: item.evaluator
|
|
566
|
+
name: getEvaluatorDisplayLabel(item.evaluator) ?? toSlug(item.id),
|
|
523
567
|
configPreview: `Source: ${item.filePath}`
|
|
524
568
|
};
|
|
525
569
|
}
|
|
@@ -762,6 +806,159 @@ function reduceCliState(state, action) {
|
|
|
762
806
|
}
|
|
763
807
|
return state;
|
|
764
808
|
}
|
|
809
|
+
async function loadRunSnapshotsFromArtifacts(config) {
|
|
810
|
+
const baseDir = path.resolve(config.artifactDirectory);
|
|
811
|
+
let entries;
|
|
812
|
+
try {
|
|
813
|
+
entries = await promises.readdir(baseDir);
|
|
814
|
+
} catch {
|
|
815
|
+
return [];
|
|
816
|
+
}
|
|
817
|
+
const jsonlFiles = entries.filter((name) => name.endsWith(".jsonl"));
|
|
818
|
+
const snapshots = [];
|
|
819
|
+
for (const fileName of jsonlFiles) {
|
|
820
|
+
const filePath = path.join(baseDir, fileName);
|
|
821
|
+
try {
|
|
822
|
+
const snapshot = await parseArtifactToSnapshot(filePath, config);
|
|
823
|
+
if (snapshot) {
|
|
824
|
+
snapshots.push(snapshot);
|
|
825
|
+
}
|
|
826
|
+
} catch {
|
|
827
|
+
}
|
|
828
|
+
}
|
|
829
|
+
return snapshots.sort((a, b) => b.queuedAt - a.queuedAt);
|
|
830
|
+
}
|
|
831
|
+
async function parseArtifactToSnapshot(filePath, _config) {
|
|
832
|
+
const content = await promises.readFile(filePath, "utf8");
|
|
833
|
+
const lines = content.split("\n").filter((line) => line.trim().length > 0);
|
|
834
|
+
if (lines.length === 0) {
|
|
835
|
+
return null;
|
|
836
|
+
}
|
|
837
|
+
let runQueued = null;
|
|
838
|
+
let runCompleted = null;
|
|
839
|
+
let runFailed = null;
|
|
840
|
+
let runStarted = null;
|
|
841
|
+
for (const line of lines) {
|
|
842
|
+
try {
|
|
843
|
+
const event = JSON.parse(line);
|
|
844
|
+
const type = event.type;
|
|
845
|
+
if (type === "RunQueued") {
|
|
846
|
+
runQueued = {
|
|
847
|
+
runId: event.runId,
|
|
848
|
+
datasetId: event.datasetId,
|
|
849
|
+
datasetName: event.datasetName,
|
|
850
|
+
evaluatorIds: event.evaluatorIds,
|
|
851
|
+
totalTestCases: event.totalTestCases ?? 0,
|
|
852
|
+
artifactPath: event.artifactPath ?? filePath,
|
|
853
|
+
ts: event.ts
|
|
854
|
+
};
|
|
855
|
+
}
|
|
856
|
+
if (type === "RunStarted") {
|
|
857
|
+
runStarted = { startedAt: event.startedAt };
|
|
858
|
+
}
|
|
859
|
+
if (type === "RunCompleted") {
|
|
860
|
+
runCompleted = {
|
|
861
|
+
passedTestCases: event.passedTestCases,
|
|
862
|
+
failedTestCases: event.failedTestCases,
|
|
863
|
+
totalTestCases: event.totalTestCases,
|
|
864
|
+
finishedAt: event.finishedAt
|
|
865
|
+
};
|
|
866
|
+
}
|
|
867
|
+
if (type === "RunFailed") {
|
|
868
|
+
runFailed = {
|
|
869
|
+
finishedAt: event.finishedAt,
|
|
870
|
+
errorMessage: event.errorMessage
|
|
871
|
+
};
|
|
872
|
+
}
|
|
873
|
+
} catch {
|
|
874
|
+
}
|
|
875
|
+
}
|
|
876
|
+
if (!runQueued) {
|
|
877
|
+
return null;
|
|
878
|
+
}
|
|
879
|
+
const artifactPath = filePath;
|
|
880
|
+
const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
|
|
881
|
+
const progress = aggregateTestCaseProgress(lines);
|
|
882
|
+
const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
|
|
883
|
+
const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
|
|
884
|
+
const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
|
|
885
|
+
return {
|
|
886
|
+
runId: runQueued.runId,
|
|
887
|
+
datasetId: runQueued.datasetId,
|
|
888
|
+
datasetName: runQueued.datasetName,
|
|
889
|
+
evaluatorIds: runQueued.evaluatorIds,
|
|
890
|
+
queuedAt: runQueued.ts ?? 0,
|
|
891
|
+
startedAt: runStarted?.startedAt,
|
|
892
|
+
finishedAt: runCompleted?.finishedAt ?? runFailed?.finishedAt,
|
|
893
|
+
totalTestCases: runQueued.totalTestCases,
|
|
894
|
+
completedTestCases,
|
|
895
|
+
passedTestCases,
|
|
896
|
+
failedTestCases,
|
|
897
|
+
status,
|
|
898
|
+
artifactPath,
|
|
899
|
+
errorMessage: runFailed?.errorMessage
|
|
900
|
+
};
|
|
901
|
+
}
|
|
902
|
+
function aggregateTestCaseProgress(lines) {
|
|
903
|
+
let completedTestCases = 0;
|
|
904
|
+
const testCasePassedBy = /* @__PURE__ */ new Map();
|
|
905
|
+
for (const line of lines) {
|
|
906
|
+
try {
|
|
907
|
+
const event = JSON.parse(line);
|
|
908
|
+
if (event.type === "TestCaseProgress") {
|
|
909
|
+
const ev = event;
|
|
910
|
+
completedTestCases = ev.completedTestCases ?? completedTestCases;
|
|
911
|
+
const id = ev.testCaseId;
|
|
912
|
+
const current = testCasePassedBy.get(id);
|
|
913
|
+
testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
|
|
914
|
+
}
|
|
915
|
+
} catch {
|
|
916
|
+
}
|
|
917
|
+
}
|
|
918
|
+
let passedTestCases = 0;
|
|
919
|
+
let failedTestCases = 0;
|
|
920
|
+
for (const passed of testCasePassedBy.values()) {
|
|
921
|
+
if (passed) {
|
|
922
|
+
passedTestCases += 1;
|
|
923
|
+
} else {
|
|
924
|
+
failedTestCases += 1;
|
|
925
|
+
}
|
|
926
|
+
}
|
|
927
|
+
return { completedTestCases, passedTestCases, failedTestCases };
|
|
928
|
+
}
|
|
929
|
+
async function parseArtifactFile(artifactPath) {
|
|
930
|
+
try {
|
|
931
|
+
const content = await promises.readFile(artifactPath, "utf8");
|
|
932
|
+
const lines = content.split("\n").filter((line) => line.trim().length > 0);
|
|
933
|
+
const results = [];
|
|
934
|
+
for (const line of lines) {
|
|
935
|
+
try {
|
|
936
|
+
const event = JSON.parse(line);
|
|
937
|
+
if (event.type === "TestCaseProgress") {
|
|
938
|
+
const ev = event;
|
|
939
|
+
const repetitionIndex = ev.repetitionIndex ?? ev.rerunIndex;
|
|
940
|
+
const repetitionCount = ev.repetitionCount ?? ev.rerunTotal;
|
|
941
|
+
results.push({
|
|
942
|
+
testCaseId: ev.testCaseId,
|
|
943
|
+
testCaseName: ev.testCaseName,
|
|
944
|
+
completedTestCases: ev.completedTestCases,
|
|
945
|
+
totalTestCases: ev.totalTestCases,
|
|
946
|
+
repetitionId: ev.repetitionId,
|
|
947
|
+
repetitionIndex,
|
|
948
|
+
repetitionCount,
|
|
949
|
+
passed: ev.passed,
|
|
950
|
+
durationMs: ev.durationMs,
|
|
951
|
+
evaluatorScores: ev.evaluatorScores ?? []
|
|
952
|
+
});
|
|
953
|
+
}
|
|
954
|
+
} catch {
|
|
955
|
+
}
|
|
956
|
+
}
|
|
957
|
+
return results;
|
|
958
|
+
} catch {
|
|
959
|
+
return [];
|
|
960
|
+
}
|
|
961
|
+
}
|
|
765
962
|
|
|
766
963
|
// src/runner/config.ts
|
|
767
964
|
var defaultRunnerConfig = {
|
|
@@ -769,6 +966,7 @@ var defaultRunnerConfig = {
|
|
|
769
966
|
rootDir: process.cwd(),
|
|
770
967
|
datasetSuffixes: [".dataset.ts", ".dataset.tsx", ".dataset.js", ".dataset.mjs"],
|
|
771
968
|
evaluatorSuffixes: [".evaluator.ts", ".evaluator.tsx", ".evaluator.js", ".evaluator.mjs"],
|
|
969
|
+
runConfigSuffixes: [".run-config.ts", ".run-config.tsx", ".run-config.js", ".run-config.mjs"],
|
|
772
970
|
testCaseSuffixes: [".test-case.ts", ".test-case.tsx", ".test-case.js", ".test-case.mjs"],
|
|
773
971
|
excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
|
|
774
972
|
},
|
|
@@ -794,6 +992,11 @@ function toRunnerConfigOverrides(config) {
|
|
|
794
992
|
} else if (rawDiscovery?.evaluatorSuffixes !== void 0) {
|
|
795
993
|
discovery.evaluatorSuffixes = rawDiscovery.evaluatorSuffixes;
|
|
796
994
|
}
|
|
995
|
+
if (rawDiscovery?.runConfigFilePatterns !== void 0) {
|
|
996
|
+
discovery.runConfigSuffixes = rawDiscovery.runConfigFilePatterns;
|
|
997
|
+
} else if (rawDiscovery?.runConfigSuffixes !== void 0) {
|
|
998
|
+
discovery.runConfigSuffixes = rawDiscovery.runConfigSuffixes;
|
|
999
|
+
}
|
|
797
1000
|
if (rawDiscovery?.testCaseFilePatterns !== void 0) {
|
|
798
1001
|
discovery.testCaseSuffixes = rawDiscovery.testCaseFilePatterns;
|
|
799
1002
|
} else if (rawDiscovery?.testCaseSuffixes !== void 0) {
|
|
@@ -892,6 +1095,9 @@ function isDatasetLike(value) {
|
|
|
892
1095
|
function isEvaluatorLike(value) {
|
|
893
1096
|
return hasMethod(value, "getName") && hasMethod(value, "resolveContext") && hasMethod(value, "getEvaluateFn");
|
|
894
1097
|
}
|
|
1098
|
+
function isRunConfigLike(value) {
|
|
1099
|
+
return hasMethod(value, "getName") && hasMethod(value, "getRuns") && typeof value.getRuns === "function";
|
|
1100
|
+
}
|
|
895
1101
|
function isTestCaseLike(value) {
|
|
896
1102
|
return hasMethod(value, "getName") && hasMethod(value, "getTags") && hasMethod(value, "getInput");
|
|
897
1103
|
}
|
|
@@ -980,6 +1186,23 @@ async function collectEvaluatorsFromFiles(config) {
|
|
|
980
1186
|
);
|
|
981
1187
|
return found.flat();
|
|
982
1188
|
}
|
|
1189
|
+
async function collectRunConfigsFromFiles(config) {
|
|
1190
|
+
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
1191
|
+
const matched = files.filter((filePath) => hasOneSuffix(filePath, config.runConfigSuffixes));
|
|
1192
|
+
const found = await Promise.all(
|
|
1193
|
+
matched.map(async (absolutePath) => {
|
|
1194
|
+
const exports = await loadModuleExports(absolutePath);
|
|
1195
|
+
const runConfigs = exports.filter(isRunConfigLike);
|
|
1196
|
+
const relPath = path.relative(config.rootDir, absolutePath);
|
|
1197
|
+
return runConfigs.map((runConfig) => ({
|
|
1198
|
+
id: runConfig.getName(),
|
|
1199
|
+
filePath: relPath,
|
|
1200
|
+
runConfig
|
|
1201
|
+
}));
|
|
1202
|
+
})
|
|
1203
|
+
);
|
|
1204
|
+
return found.flat();
|
|
1205
|
+
}
|
|
983
1206
|
async function collectTestCasesFromFiles(config) {
|
|
984
1207
|
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
985
1208
|
const matched = files.filter((filePath) => hasOneSuffix(filePath, config.testCaseSuffixes));
|
|
@@ -1133,6 +1356,17 @@ function getDiffLines(entry) {
|
|
|
1133
1356
|
});
|
|
1134
1357
|
}
|
|
1135
1358
|
|
|
1359
|
+
// src/evals/test-case.ts
|
|
1360
|
+
function getTestCaseDisplayLabel(testCase) {
|
|
1361
|
+
if (typeof testCase.getDisplayLabel === "function") {
|
|
1362
|
+
return testCase.getDisplayLabel();
|
|
1363
|
+
}
|
|
1364
|
+
return typeof testCase.getName === "function" ? testCase.getName() : "";
|
|
1365
|
+
}
|
|
1366
|
+
function getTestCaseTagList(testCase) {
|
|
1367
|
+
return typeof testCase.getTags === "function" ? [...testCase.getTags()] : [];
|
|
1368
|
+
}
|
|
1369
|
+
|
|
1136
1370
|
// src/evals/metric.ts
|
|
1137
1371
|
var registry = /* @__PURE__ */ new Map();
|
|
1138
1372
|
var Metric = {
|
|
@@ -1156,6 +1390,54 @@ function getMetricById(id) {
|
|
|
1156
1390
|
return registry.get(id);
|
|
1157
1391
|
}
|
|
1158
1392
|
|
|
1393
|
+
// src/evals/aggregators.ts
|
|
1394
|
+
function aggregateTokenCountSum(values) {
|
|
1395
|
+
const initial = {
|
|
1396
|
+
input: 0,
|
|
1397
|
+
output: 0,
|
|
1398
|
+
inputCached: 0,
|
|
1399
|
+
outputCached: 0
|
|
1400
|
+
};
|
|
1401
|
+
return values.reduce(
|
|
1402
|
+
(acc, v) => ({
|
|
1403
|
+
input: acc.input + (v.input ?? 0),
|
|
1404
|
+
output: acc.output + (v.output ?? 0),
|
|
1405
|
+
inputCached: acc.inputCached + (v.inputCached ?? 0),
|
|
1406
|
+
outputCached: acc.outputCached + (v.outputCached ?? 0)
|
|
1407
|
+
}),
|
|
1408
|
+
initial
|
|
1409
|
+
);
|
|
1410
|
+
}
|
|
1411
|
+
function aggregateLatencyAverage(values) {
|
|
1412
|
+
if (values.length === 0) {
|
|
1413
|
+
return { ms: 0 };
|
|
1414
|
+
}
|
|
1415
|
+
const sum = values.reduce((s, v) => s + v.ms, 0);
|
|
1416
|
+
return { ms: sum / values.length };
|
|
1417
|
+
}
|
|
1418
|
+
|
|
1419
|
+
// src/evals/metrics/standard.ts
|
|
1420
|
+
Metric.of({
|
|
1421
|
+
id: "token-count",
|
|
1422
|
+
name: "Tokens",
|
|
1423
|
+
aggregate: aggregateTokenCountSum,
|
|
1424
|
+
format: (data, options) => {
|
|
1425
|
+
const input = data.input ?? 0;
|
|
1426
|
+
const output = data.output ?? 0;
|
|
1427
|
+
const inputCached = data.inputCached ?? 0;
|
|
1428
|
+
const outputCached = data.outputCached ?? 0;
|
|
1429
|
+
const cached = inputCached + outputCached;
|
|
1430
|
+
const base = `in:${input} out:${output} cached:${cached}`;
|
|
1431
|
+
return options?.isAggregated ? `Total: ${base}` : base;
|
|
1432
|
+
}
|
|
1433
|
+
});
|
|
1434
|
+
Metric.of({
|
|
1435
|
+
id: "latency",
|
|
1436
|
+
name: "Latency",
|
|
1437
|
+
aggregate: aggregateLatencyAverage,
|
|
1438
|
+
format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
|
|
1439
|
+
});
|
|
1440
|
+
|
|
1159
1441
|
// src/evals/score.ts
|
|
1160
1442
|
var registry2 = /* @__PURE__ */ new Map();
|
|
1161
1443
|
function formatScoreData(def, data, options) {
|
|
@@ -1264,54 +1546,6 @@ function getScoreById(id) {
|
|
|
1264
1546
|
return registry2.get(id);
|
|
1265
1547
|
}
|
|
1266
1548
|
|
|
1267
|
-
// src/evals/aggregators.ts
|
|
1268
|
-
function aggregateTokenCountSum(values) {
|
|
1269
|
-
const initial = {
|
|
1270
|
-
input: 0,
|
|
1271
|
-
output: 0,
|
|
1272
|
-
inputCached: 0,
|
|
1273
|
-
outputCached: 0
|
|
1274
|
-
};
|
|
1275
|
-
return values.reduce(
|
|
1276
|
-
(acc, v) => ({
|
|
1277
|
-
input: acc.input + (v.input ?? 0),
|
|
1278
|
-
output: acc.output + (v.output ?? 0),
|
|
1279
|
-
inputCached: acc.inputCached + (v.inputCached ?? 0),
|
|
1280
|
-
outputCached: acc.outputCached + (v.outputCached ?? 0)
|
|
1281
|
-
}),
|
|
1282
|
-
initial
|
|
1283
|
-
);
|
|
1284
|
-
}
|
|
1285
|
-
function aggregateLatencyAverage(values) {
|
|
1286
|
-
if (values.length === 0) {
|
|
1287
|
-
return { ms: 0 };
|
|
1288
|
-
}
|
|
1289
|
-
const sum = values.reduce((s, v) => s + v.ms, 0);
|
|
1290
|
-
return { ms: sum / values.length };
|
|
1291
|
-
}
|
|
1292
|
-
|
|
1293
|
-
// src/evals/metrics/standard.ts
|
|
1294
|
-
Metric.of({
|
|
1295
|
-
id: "token-count",
|
|
1296
|
-
name: "Tokens",
|
|
1297
|
-
aggregate: aggregateTokenCountSum,
|
|
1298
|
-
format: (data, options) => {
|
|
1299
|
-
const input = data.input ?? 0;
|
|
1300
|
-
const output = data.output ?? 0;
|
|
1301
|
-
const inputCached = data.inputCached ?? 0;
|
|
1302
|
-
const outputCached = data.outputCached ?? 0;
|
|
1303
|
-
const cached = inputCached + outputCached;
|
|
1304
|
-
const base = `in:${input} out:${output} cached:${cached}`;
|
|
1305
|
-
return options?.isAggregated ? `Total: ${base}` : base;
|
|
1306
|
-
}
|
|
1307
|
-
});
|
|
1308
|
-
Metric.of({
|
|
1309
|
-
id: "latency",
|
|
1310
|
-
name: "Latency",
|
|
1311
|
-
aggregate: aggregateLatencyAverage,
|
|
1312
|
-
format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
|
|
1313
|
-
});
|
|
1314
|
-
|
|
1315
1549
|
// src/evals/scores/standard.ts
|
|
1316
1550
|
Score.of({
|
|
1317
1551
|
id: "percent",
|
|
@@ -1418,15 +1652,17 @@ function readOutput(testCase) {
|
|
|
1418
1652
|
}
|
|
1419
1653
|
return candidate.getOutput();
|
|
1420
1654
|
}
|
|
1421
|
-
function buildEvaluationUnits(testCases) {
|
|
1655
|
+
function buildEvaluationUnits(testCases, repetitionCount) {
|
|
1656
|
+
const count = Math.max(1, repetitionCount);
|
|
1422
1657
|
const units = [];
|
|
1423
1658
|
for (const testCaseItem of testCases) {
|
|
1424
|
-
const
|
|
1425
|
-
for (let r = 0; r <
|
|
1659
|
+
const repetitionId = `rep-${crypto.randomUUID()}`;
|
|
1660
|
+
for (let r = 0; r < count; r++) {
|
|
1426
1661
|
units.push({
|
|
1427
1662
|
testCaseItem,
|
|
1428
|
-
|
|
1429
|
-
|
|
1663
|
+
repetitionId,
|
|
1664
|
+
repetitionIndex: r + 1,
|
|
1665
|
+
repetitionCount: count
|
|
1430
1666
|
});
|
|
1431
1667
|
}
|
|
1432
1668
|
}
|
|
@@ -1439,7 +1675,7 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
|
|
|
1439
1675
|
return path.join(artifactDirectory, `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`);
|
|
1440
1676
|
}
|
|
1441
1677
|
function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
|
|
1442
|
-
const { testCaseItem,
|
|
1678
|
+
const { testCaseItem, repetitionId, repetitionIndex, repetitionCount } = unit;
|
|
1443
1679
|
return effect.Effect.gen(function* () {
|
|
1444
1680
|
const evaluatorRunId = `run-${crypto.randomUUID()}`;
|
|
1445
1681
|
const started = Date.now();
|
|
@@ -1448,11 +1684,12 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1448
1684
|
type: "TestCaseStarted",
|
|
1449
1685
|
runId: task.runId,
|
|
1450
1686
|
testCaseId: testCaseItem.id,
|
|
1451
|
-
testCaseName: testCaseItem.testCase
|
|
1687
|
+
testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
|
|
1452
1688
|
startedTestCases: startedEvaluations,
|
|
1453
1689
|
totalTestCases: totalEvaluations,
|
|
1454
|
-
|
|
1455
|
-
|
|
1690
|
+
repetitionId,
|
|
1691
|
+
repetitionIndex,
|
|
1692
|
+
repetitionCount
|
|
1456
1693
|
});
|
|
1457
1694
|
const evaluatorScores = [];
|
|
1458
1695
|
let testCaseError;
|
|
@@ -1486,8 +1723,15 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1486
1723
|
meta: {
|
|
1487
1724
|
triggerId: task.triggerId,
|
|
1488
1725
|
runId: evaluatorRunId,
|
|
1489
|
-
datasetId: task.datasetId
|
|
1726
|
+
datasetId: task.datasetId,
|
|
1727
|
+
repetitionId,
|
|
1728
|
+
repetitionIndex,
|
|
1729
|
+
repetitionCount,
|
|
1730
|
+
runConfigName: task.runConfigName
|
|
1490
1731
|
},
|
|
1732
|
+
testCaseTags: getTestCaseTagList(testCaseItem.testCase),
|
|
1733
|
+
runConfigTags: task.runConfigTags,
|
|
1734
|
+
evaluatorTags: getEvaluatorTagList(evaluator),
|
|
1491
1735
|
logDiff,
|
|
1492
1736
|
log,
|
|
1493
1737
|
createError
|
|
@@ -1530,18 +1774,19 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1530
1774
|
});
|
|
1531
1775
|
}
|
|
1532
1776
|
}
|
|
1533
|
-
const
|
|
1777
|
+
const repetitionPassedThis = evaluatorScores.every((s) => s.passed);
|
|
1534
1778
|
const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [n + 1, n + 1]);
|
|
1535
1779
|
const progressEvent = {
|
|
1536
1780
|
type: "TestCaseProgress",
|
|
1537
1781
|
runId: task.runId,
|
|
1538
1782
|
testCaseId: testCaseItem.id,
|
|
1539
|
-
testCaseName: testCaseItem.testCase
|
|
1783
|
+
testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
|
|
1540
1784
|
completedTestCases: completedEvaluations,
|
|
1541
1785
|
totalTestCases: totalEvaluations,
|
|
1542
|
-
|
|
1543
|
-
|
|
1544
|
-
|
|
1786
|
+
repetitionId,
|
|
1787
|
+
repetitionIndex,
|
|
1788
|
+
repetitionCount,
|
|
1789
|
+
passed: repetitionPassedThis,
|
|
1545
1790
|
durationMs: Date.now() - started,
|
|
1546
1791
|
evaluatorScores,
|
|
1547
1792
|
output,
|
|
@@ -1562,9 +1807,9 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1562
1807
|
(map) => {
|
|
1563
1808
|
const key = testCaseItem.id;
|
|
1564
1809
|
const existing = map.get(key) ?? { completedCount: 0, results: [] };
|
|
1565
|
-
const newResults = [...existing.results,
|
|
1810
|
+
const newResults = [...existing.results, repetitionPassedThis];
|
|
1566
1811
|
const newCompletedCount = existing.completedCount + 1;
|
|
1567
|
-
const isLast = newCompletedCount ===
|
|
1812
|
+
const isLast = newCompletedCount === repetitionCount;
|
|
1568
1813
|
const newMap = new Map(map);
|
|
1569
1814
|
newMap.set(key, {
|
|
1570
1815
|
completedCount: newCompletedCount,
|
|
@@ -1601,10 +1846,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
1601
1846
|
runId: task.runId,
|
|
1602
1847
|
startedAt
|
|
1603
1848
|
});
|
|
1604
|
-
const totalEvaluations = task.testCases.
|
|
1605
|
-
(sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
|
|
1606
|
-
0
|
|
1607
|
-
);
|
|
1849
|
+
const totalEvaluations = task.testCases.length * Math.max(1, task.repetitions);
|
|
1608
1850
|
const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
|
|
1609
1851
|
const completedRef = yield* effect.Ref.make(0);
|
|
1610
1852
|
const startedRef = yield* effect.Ref.make(0);
|
|
@@ -1613,7 +1855,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
1613
1855
|
const testCaseResultsRef = yield* effect.Ref.make(
|
|
1614
1856
|
/* @__PURE__ */ new Map()
|
|
1615
1857
|
);
|
|
1616
|
-
const evaluationUnits = buildEvaluationUnits(task.testCases);
|
|
1858
|
+
const evaluationUnits = buildEvaluationUnits(task.testCases, task.repetitions);
|
|
1617
1859
|
const processEvaluation = (unit) => processOneEvaluation(
|
|
1618
1860
|
task,
|
|
1619
1861
|
unit,
|
|
@@ -1627,11 +1869,20 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
1627
1869
|
failedRef,
|
|
1628
1870
|
testCaseResultsRef
|
|
1629
1871
|
);
|
|
1630
|
-
|
|
1631
|
-
|
|
1632
|
-
|
|
1633
|
-
|
|
1634
|
-
|
|
1872
|
+
const globalSem = task.globalEvaluationSemaphore;
|
|
1873
|
+
if (globalSem !== void 0) {
|
|
1874
|
+
yield* effect.Effect.forEach(
|
|
1875
|
+
evaluationUnits,
|
|
1876
|
+
(unit) => globalSem.withPermits(1)(processEvaluation(unit)),
|
|
1877
|
+
{ concurrency: "unbounded", discard: true }
|
|
1878
|
+
);
|
|
1879
|
+
} else {
|
|
1880
|
+
yield* effect.Effect.forEach(
|
|
1881
|
+
evaluationUnits,
|
|
1882
|
+
processEvaluation,
|
|
1883
|
+
maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
|
|
1884
|
+
);
|
|
1885
|
+
}
|
|
1635
1886
|
const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* effect.Effect.all([
|
|
1636
1887
|
effect.Ref.get(completedRef),
|
|
1637
1888
|
effect.Ref.get(passedRef),
|
|
@@ -1667,155 +1918,34 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
1667
1918
|
artifactPath: task.snapshot.artifactPath
|
|
1668
1919
|
});
|
|
1669
1920
|
});
|
|
1670
|
-
|
|
1671
|
-
|
|
1672
|
-
|
|
1673
|
-
|
|
1674
|
-
|
|
1675
|
-
} catch {
|
|
1676
|
-
return [];
|
|
1677
|
-
}
|
|
1678
|
-
const jsonlFiles = entries.filter((name) => name.endsWith(".jsonl"));
|
|
1679
|
-
const snapshots = [];
|
|
1680
|
-
for (const fileName of jsonlFiles) {
|
|
1681
|
-
const filePath = path.join(baseDir, fileName);
|
|
1682
|
-
try {
|
|
1683
|
-
const snapshot = await parseArtifactToSnapshot(filePath, config);
|
|
1684
|
-
if (snapshot) {
|
|
1685
|
-
snapshots.push(snapshot);
|
|
1686
|
-
}
|
|
1687
|
-
} catch {
|
|
1688
|
-
}
|
|
1689
|
-
}
|
|
1690
|
-
return snapshots.sort((a, b) => b.queuedAt - a.queuedAt);
|
|
1691
|
-
}
|
|
1692
|
-
async function parseArtifactToSnapshot(filePath, _config) {
|
|
1693
|
-
const content = await promises.readFile(filePath, "utf8");
|
|
1694
|
-
const lines = content.split("\n").filter((line) => line.trim().length > 0);
|
|
1695
|
-
if (lines.length === 0) {
|
|
1696
|
-
return null;
|
|
1697
|
-
}
|
|
1698
|
-
let runQueued = null;
|
|
1699
|
-
let runCompleted = null;
|
|
1700
|
-
let runFailed = null;
|
|
1701
|
-
let runStarted = null;
|
|
1702
|
-
for (const line of lines) {
|
|
1703
|
-
try {
|
|
1704
|
-
const event = JSON.parse(line);
|
|
1705
|
-
const type = event.type;
|
|
1706
|
-
if (type === "RunQueued") {
|
|
1707
|
-
runQueued = {
|
|
1708
|
-
runId: event.runId,
|
|
1709
|
-
datasetId: event.datasetId,
|
|
1710
|
-
datasetName: event.datasetName,
|
|
1711
|
-
evaluatorIds: event.evaluatorIds,
|
|
1712
|
-
totalTestCases: event.totalTestCases ?? 0,
|
|
1713
|
-
artifactPath: event.artifactPath ?? filePath,
|
|
1714
|
-
ts: event.ts
|
|
1715
|
-
};
|
|
1716
|
-
}
|
|
1717
|
-
if (type === "RunStarted") {
|
|
1718
|
-
runStarted = { startedAt: event.startedAt };
|
|
1719
|
-
}
|
|
1720
|
-
if (type === "RunCompleted") {
|
|
1721
|
-
runCompleted = {
|
|
1722
|
-
passedTestCases: event.passedTestCases,
|
|
1723
|
-
failedTestCases: event.failedTestCases,
|
|
1724
|
-
totalTestCases: event.totalTestCases,
|
|
1725
|
-
finishedAt: event.finishedAt
|
|
1726
|
-
};
|
|
1727
|
-
}
|
|
1728
|
-
if (type === "RunFailed") {
|
|
1729
|
-
runFailed = {
|
|
1730
|
-
finishedAt: event.finishedAt,
|
|
1731
|
-
errorMessage: event.errorMessage
|
|
1732
|
-
};
|
|
1733
|
-
}
|
|
1734
|
-
} catch {
|
|
1735
|
-
}
|
|
1921
|
+
|
|
1922
|
+
// src/runner/name-pattern.ts
|
|
1923
|
+
function parseRegexLiteral(pattern) {
|
|
1924
|
+
if (!pattern.startsWith("/")) {
|
|
1925
|
+
return void 0;
|
|
1736
1926
|
}
|
|
1737
|
-
|
|
1738
|
-
|
|
1927
|
+
const lastSlash = pattern.lastIndexOf("/");
|
|
1928
|
+
if (lastSlash <= 0) {
|
|
1929
|
+
return void 0;
|
|
1739
1930
|
}
|
|
1740
|
-
const artifactPath = filePath;
|
|
1741
|
-
const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
|
|
1742
|
-
const progress = aggregateTestCaseProgress(lines);
|
|
1743
|
-
const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
|
|
1744
|
-
const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
|
|
1745
|
-
const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
|
|
1746
1931
|
return {
|
|
1747
|
-
|
|
1748
|
-
|
|
1749
|
-
datasetName: runQueued.datasetName,
|
|
1750
|
-
evaluatorIds: runQueued.evaluatorIds,
|
|
1751
|
-
queuedAt: runQueued.ts ?? 0,
|
|
1752
|
-
startedAt: runStarted?.startedAt,
|
|
1753
|
-
finishedAt: runCompleted?.finishedAt ?? runFailed?.finishedAt,
|
|
1754
|
-
totalTestCases: runQueued.totalTestCases,
|
|
1755
|
-
completedTestCases,
|
|
1756
|
-
passedTestCases,
|
|
1757
|
-
failedTestCases,
|
|
1758
|
-
status,
|
|
1759
|
-
artifactPath,
|
|
1760
|
-
errorMessage: runFailed?.errorMessage
|
|
1932
|
+
source: pattern.slice(1, lastSlash),
|
|
1933
|
+
flags: pattern.slice(lastSlash + 1)
|
|
1761
1934
|
};
|
|
1762
1935
|
}
|
|
1763
|
-
function
|
|
1764
|
-
|
|
1765
|
-
const
|
|
1766
|
-
|
|
1767
|
-
|
|
1768
|
-
|
|
1769
|
-
if (event.type === "TestCaseProgress") {
|
|
1770
|
-
const ev = event;
|
|
1771
|
-
completedTestCases = ev.completedTestCases ?? completedTestCases;
|
|
1772
|
-
const id = ev.testCaseId;
|
|
1773
|
-
const current = testCasePassedBy.get(id);
|
|
1774
|
-
testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
|
|
1775
|
-
}
|
|
1776
|
-
} catch {
|
|
1777
|
-
}
|
|
1778
|
-
}
|
|
1779
|
-
let passedTestCases = 0;
|
|
1780
|
-
let failedTestCases = 0;
|
|
1781
|
-
for (const passed of testCasePassedBy.values()) {
|
|
1782
|
-
if (passed) {
|
|
1783
|
-
passedTestCases += 1;
|
|
1784
|
-
} else {
|
|
1785
|
-
failedTestCases += 1;
|
|
1786
|
-
}
|
|
1936
|
+
function createNameMatcher(pattern) {
|
|
1937
|
+
const normalizedPattern = pattern.trim();
|
|
1938
|
+
const regexLiteral = parseRegexLiteral(normalizedPattern);
|
|
1939
|
+
if (regexLiteral) {
|
|
1940
|
+
const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
|
|
1941
|
+
return (value) => regex.test(value);
|
|
1787
1942
|
}
|
|
1788
|
-
|
|
1789
|
-
}
|
|
1790
|
-
|
|
1791
|
-
|
|
1792
|
-
const content = await promises.readFile(artifactPath, "utf8");
|
|
1793
|
-
const lines = content.split("\n").filter((line) => line.trim().length > 0);
|
|
1794
|
-
const results = [];
|
|
1795
|
-
for (const line of lines) {
|
|
1796
|
-
try {
|
|
1797
|
-
const event = JSON.parse(line);
|
|
1798
|
-
if (event.type === "TestCaseProgress") {
|
|
1799
|
-
const ev = event;
|
|
1800
|
-
results.push({
|
|
1801
|
-
testCaseId: ev.testCaseId,
|
|
1802
|
-
testCaseName: ev.testCaseName,
|
|
1803
|
-
completedTestCases: ev.completedTestCases,
|
|
1804
|
-
totalTestCases: ev.totalTestCases,
|
|
1805
|
-
rerunIndex: ev.rerunIndex,
|
|
1806
|
-
rerunTotal: ev.rerunTotal,
|
|
1807
|
-
passed: ev.passed,
|
|
1808
|
-
durationMs: ev.durationMs,
|
|
1809
|
-
evaluatorScores: ev.evaluatorScores ?? []
|
|
1810
|
-
});
|
|
1811
|
-
}
|
|
1812
|
-
} catch {
|
|
1813
|
-
}
|
|
1814
|
-
}
|
|
1815
|
-
return results;
|
|
1816
|
-
} catch {
|
|
1817
|
-
return [];
|
|
1943
|
+
if (normalizedPattern.includes("*")) {
|
|
1944
|
+
const escaped = normalizedPattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
|
|
1945
|
+
const regex = new RegExp(`^${escaped}$`, "i");
|
|
1946
|
+
return (value) => regex.test(value);
|
|
1818
1947
|
}
|
|
1948
|
+
return (value) => value.toLowerCase() === normalizedPattern.toLowerCase();
|
|
1819
1949
|
}
|
|
1820
1950
|
async function appendJsonLine(artifactPath, payload) {
|
|
1821
1951
|
await promises.mkdir(path.dirname(artifactPath), { recursive: true });
|
|
@@ -1874,32 +2004,12 @@ function searchCollectedTestCases(all, query) {
|
|
|
1874
2004
|
}
|
|
1875
2005
|
|
|
1876
2006
|
// src/runner/api.ts
|
|
1877
|
-
function
|
|
1878
|
-
|
|
1879
|
-
|
|
1880
|
-
|
|
1881
|
-
const lastSlash = pattern.lastIndexOf("/");
|
|
1882
|
-
if (lastSlash <= 0) {
|
|
1883
|
-
return void 0;
|
|
2007
|
+
function normalizeRunRepetitions(value) {
|
|
2008
|
+
const n = value ?? 1;
|
|
2009
|
+
if (!Number.isInteger(n) || n < 1) {
|
|
2010
|
+
throw new Error(`repetitions must be a positive integer, got ${String(value)}`);
|
|
1884
2011
|
}
|
|
1885
|
-
return
|
|
1886
|
-
source: pattern.slice(1, lastSlash),
|
|
1887
|
-
flags: pattern.slice(lastSlash + 1)
|
|
1888
|
-
};
|
|
1889
|
-
}
|
|
1890
|
-
function createNameMatcher(pattern) {
|
|
1891
|
-
const normalizedPattern = pattern.trim();
|
|
1892
|
-
const regexLiteral = parseRegexLiteral(normalizedPattern);
|
|
1893
|
-
if (regexLiteral) {
|
|
1894
|
-
const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
|
|
1895
|
-
return (value) => regex.test(value);
|
|
1896
|
-
}
|
|
1897
|
-
if (normalizedPattern.includes("*")) {
|
|
1898
|
-
const escaped = normalizedPattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
|
|
1899
|
-
const regex = new RegExp(`^${escaped}$`, "i");
|
|
1900
|
-
return (value) => regex.test(value);
|
|
1901
|
-
}
|
|
1902
|
-
return (value) => value.toLowerCase() === normalizedPattern.toLowerCase();
|
|
2012
|
+
return n;
|
|
1903
2013
|
}
|
|
1904
2014
|
function mergeRunnerOverrides(base, next) {
|
|
1905
2015
|
if (!base) {
|
|
@@ -1934,6 +2044,7 @@ var EffectRunner = class {
|
|
|
1934
2044
|
this.listeners = /* @__PURE__ */ new Set();
|
|
1935
2045
|
this.datasetsById = /* @__PURE__ */ new Map();
|
|
1936
2046
|
this.evaluatorsById = /* @__PURE__ */ new Map();
|
|
2047
|
+
this.runConfigsById = /* @__PURE__ */ new Map();
|
|
1937
2048
|
this.schedulerFiber = effect.Effect.runFork(this.createSchedulerEffect());
|
|
1938
2049
|
this.persistenceFiber = effect.Effect.runFork(
|
|
1939
2050
|
createPersistenceWorker(this.persistenceQueue)
|
|
@@ -1974,6 +2085,137 @@ var EffectRunner = class {
|
|
|
1974
2085
|
(item) => matcher(item.evaluator.getName() ?? "")
|
|
1975
2086
|
);
|
|
1976
2087
|
}
|
|
2088
|
+
async collectRunConfigs() {
|
|
2089
|
+
const runConfigs = await collectRunConfigsFromFiles(this.config.discovery);
|
|
2090
|
+
this.runConfigsById.clear();
|
|
2091
|
+
const byNameLower = /* @__PURE__ */ new Map();
|
|
2092
|
+
for (const item of runConfigs) {
|
|
2093
|
+
const id = item.runConfig.getName();
|
|
2094
|
+
const lower = id.toLowerCase();
|
|
2095
|
+
const prev = byNameLower.get(lower);
|
|
2096
|
+
if (prev !== void 0 && prev.filePath !== item.filePath) {
|
|
2097
|
+
throw new Error(
|
|
2098
|
+
`Duplicate RunConfig name "${id}" (matches "${prev.runConfig.getName()}" case-insensitively): ${prev.filePath} and ${item.filePath}`
|
|
2099
|
+
);
|
|
2100
|
+
}
|
|
2101
|
+
byNameLower.set(lower, item);
|
|
2102
|
+
this.runConfigsById.set(id, item);
|
|
2103
|
+
}
|
|
2104
|
+
return runConfigs;
|
|
2105
|
+
}
|
|
2106
|
+
async resolveRunConfigByName(name) {
|
|
2107
|
+
if (this.runConfigsById.size === 0) {
|
|
2108
|
+
await this.collectRunConfigs();
|
|
2109
|
+
}
|
|
2110
|
+
const key = validateRunConfigName(name, `RunConfig "${name.trim()}"`);
|
|
2111
|
+
const keyLower = key.toLowerCase();
|
|
2112
|
+
const matches = Array.from(this.runConfigsById.values()).filter(
|
|
2113
|
+
(item) => item.runConfig.getName().toLowerCase() === keyLower
|
|
2114
|
+
);
|
|
2115
|
+
if (matches.length === 0) {
|
|
2116
|
+
return void 0;
|
|
2117
|
+
}
|
|
2118
|
+
if (matches.length > 1) {
|
|
2119
|
+
throw new Error(
|
|
2120
|
+
`Multiple RunConfigs named "${name}": ${matches.map((m) => m.filePath).join(", ")}`
|
|
2121
|
+
);
|
|
2122
|
+
}
|
|
2123
|
+
return matches[0];
|
|
2124
|
+
}
|
|
2125
|
+
async expandRunConfigToJobs(collected) {
|
|
2126
|
+
if (this.datasetsById.size === 0) {
|
|
2127
|
+
await this.collectDatasets();
|
|
2128
|
+
}
|
|
2129
|
+
if (this.evaluatorsById.size === 0) {
|
|
2130
|
+
await this.collectEvaluators();
|
|
2131
|
+
}
|
|
2132
|
+
const rcName = collected.runConfig.getName();
|
|
2133
|
+
const jobs = [];
|
|
2134
|
+
const runs = collected.runConfig.getRuns();
|
|
2135
|
+
for (const [i, row] of runs.entries()) {
|
|
2136
|
+
const dsCollected = Array.from(this.datasetsById.values()).find(
|
|
2137
|
+
(d) => d.dataset === row.dataset
|
|
2138
|
+
);
|
|
2139
|
+
if (!dsCollected) {
|
|
2140
|
+
throw new Error(
|
|
2141
|
+
`RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getName()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
|
|
2142
|
+
);
|
|
2143
|
+
}
|
|
2144
|
+
let evaluatorIds;
|
|
2145
|
+
if ("evaluatorPattern" in row && typeof row.evaluatorPattern === "string") {
|
|
2146
|
+
const matcher = createNameMatcher(row.evaluatorPattern);
|
|
2147
|
+
const matched = Array.from(this.evaluatorsById.values()).filter(
|
|
2148
|
+
(item) => matcher(item.evaluator.getName() ?? "")
|
|
2149
|
+
);
|
|
2150
|
+
if (matched.length === 0) {
|
|
2151
|
+
throw new Error(
|
|
2152
|
+
`RunConfig "${rcName}" run[${i}]: no evaluator matched pattern "${row.evaluatorPattern}"`
|
|
2153
|
+
);
|
|
2154
|
+
}
|
|
2155
|
+
evaluatorIds = matched.map((item) => item.id);
|
|
2156
|
+
} else {
|
|
2157
|
+
const evaluators = row.evaluators;
|
|
2158
|
+
evaluatorIds = [];
|
|
2159
|
+
for (const ev of evaluators) {
|
|
2160
|
+
const found = Array.from(this.evaluatorsById.values()).find(
|
|
2161
|
+
(item) => item.evaluator === ev
|
|
2162
|
+
);
|
|
2163
|
+
if (!found) {
|
|
2164
|
+
throw new Error(
|
|
2165
|
+
`RunConfig "${rcName}" run[${i}]: evaluator "${getEvaluatorDisplayLabel(ev) ?? "unknown"}" was not found among discovered evaluator exports`
|
|
2166
|
+
);
|
|
2167
|
+
}
|
|
2168
|
+
evaluatorIds.push(found.id);
|
|
2169
|
+
}
|
|
2170
|
+
}
|
|
2171
|
+
const repetitions = "repetitions" in row && row.repetitions !== void 0 ? row.repetitions : 1;
|
|
2172
|
+
jobs.push({
|
|
2173
|
+
datasetId: dsCollected.id,
|
|
2174
|
+
evaluatorIds,
|
|
2175
|
+
runConfigName: rcName,
|
|
2176
|
+
runConfigDisplayLabel: collected.runConfig.getDisplayLabel(),
|
|
2177
|
+
runConfigTags: collected.runConfig.getTags(),
|
|
2178
|
+
repetitions
|
|
2179
|
+
});
|
|
2180
|
+
}
|
|
2181
|
+
return jobs;
|
|
2182
|
+
}
|
|
2183
|
+
async expandRunConfigNamesToJobs(names) {
|
|
2184
|
+
const jobs = [];
|
|
2185
|
+
for (const name of names) {
|
|
2186
|
+
const collected = await this.resolveRunConfigByName(name);
|
|
2187
|
+
if (!collected) {
|
|
2188
|
+
const known = await this.collectRunConfigs();
|
|
2189
|
+
const available = known.map((r) => r.runConfig.getName()).sort();
|
|
2190
|
+
throw new Error(
|
|
2191
|
+
available.length > 0 ? `RunConfig "${name}" not found. Available RunConfigs: ${available.join(", ")}` : `RunConfig "${name}" not found and no RunConfigs were discovered.`
|
|
2192
|
+
);
|
|
2193
|
+
}
|
|
2194
|
+
jobs.push(...await this.expandRunConfigToJobs(collected));
|
|
2195
|
+
}
|
|
2196
|
+
return jobs;
|
|
2197
|
+
}
|
|
2198
|
+
async runDatasetJobsWithSharedConcurrency(request) {
|
|
2199
|
+
const globalConcurrency = Math.max(1, request.globalConcurrency);
|
|
2200
|
+
const sem = effect.Effect.unsafeMakeSemaphore(globalConcurrency);
|
|
2201
|
+
const triggerId = request.triggerId ?? `trg-${crypto.randomUUID()}`;
|
|
2202
|
+
const snapshots = [];
|
|
2203
|
+
for (const job of request.jobs) {
|
|
2204
|
+
snapshots.push(
|
|
2205
|
+
await this.startDatasetRun({
|
|
2206
|
+
datasetId: job.datasetId,
|
|
2207
|
+
evaluatorIds: job.evaluatorIds,
|
|
2208
|
+
triggerId,
|
|
2209
|
+
maxConcurrency: this.config.maxConcurrency ?? 1,
|
|
2210
|
+
globalEvaluationSemaphore: sem,
|
|
2211
|
+
runConfigName: job.runConfigName,
|
|
2212
|
+
runConfigTags: job.runConfigTags,
|
|
2213
|
+
repetitions: job.repetitions
|
|
2214
|
+
})
|
|
2215
|
+
);
|
|
2216
|
+
}
|
|
2217
|
+
return snapshots;
|
|
2218
|
+
}
|
|
1977
2219
|
async searchTestCases(query) {
|
|
1978
2220
|
const testCases = await collectTestCasesFromFiles(this.config.discovery);
|
|
1979
2221
|
return searchCollectedTestCases(testCases, query);
|
|
@@ -1992,35 +2234,45 @@ var EffectRunner = class {
|
|
|
1992
2234
|
);
|
|
1993
2235
|
}
|
|
1994
2236
|
async runDatasetWith(request) {
|
|
2237
|
+
const runConfigName = validateRunConfigName(
|
|
2238
|
+
request.runConfigName,
|
|
2239
|
+
"runDatasetWith.runConfigName"
|
|
2240
|
+
);
|
|
2241
|
+
return this.startDatasetRun({
|
|
2242
|
+
datasetId: request.datasetId,
|
|
2243
|
+
evaluatorIds: request.evaluatorIds,
|
|
2244
|
+
triggerId: request.triggerId,
|
|
2245
|
+
maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
|
|
2246
|
+
repetitions: request.repetitions,
|
|
2247
|
+
runConfigName,
|
|
2248
|
+
runConfigTags: request.runConfigTags
|
|
2249
|
+
});
|
|
2250
|
+
}
|
|
2251
|
+
async startDatasetRun(params) {
|
|
1995
2252
|
if (this.datasetsById.size === 0) {
|
|
1996
2253
|
await this.collectDatasets();
|
|
1997
2254
|
}
|
|
1998
2255
|
if (this.evaluatorsById.size === 0) {
|
|
1999
2256
|
await this.collectEvaluators();
|
|
2000
2257
|
}
|
|
2001
|
-
const dataset = this.datasetsById.get(
|
|
2258
|
+
const dataset = this.datasetsById.get(params.datasetId);
|
|
2002
2259
|
if (!dataset) {
|
|
2003
|
-
throw new Error(`Unknown dataset: ${
|
|
2260
|
+
throw new Error(`Unknown dataset: ${params.datasetId}`);
|
|
2004
2261
|
}
|
|
2005
|
-
const selectedEvaluators =
|
|
2262
|
+
const selectedEvaluators = params.evaluatorIds.map((id) => this.evaluatorsById.get(id)).filter((value) => Boolean(value)).map((value) => ({ id: value.id, evaluator: value.evaluator }));
|
|
2006
2263
|
if (selectedEvaluators.length === 0) {
|
|
2007
2264
|
throw new Error("No evaluators selected for run");
|
|
2008
2265
|
}
|
|
2009
|
-
const selectedTestCases = await this.collectDatasetTestCases(
|
|
2010
|
-
const
|
|
2011
|
-
|
|
2012
|
-
|
|
2013
|
-
)
|
|
2014
|
-
const triggerId = request.triggerId ?? `trg-${crypto.randomUUID()}`;
|
|
2266
|
+
const selectedTestCases = await this.collectDatasetTestCases(params.datasetId);
|
|
2267
|
+
const repetitions = normalizeRunRepetitions(params.repetitions);
|
|
2268
|
+
const totalEvaluations = selectedTestCases.length * repetitions;
|
|
2269
|
+
const runConfigTags = [...params.runConfigTags ?? []];
|
|
2270
|
+
const triggerId = params.triggerId ?? `trg-${crypto.randomUUID()}`;
|
|
2015
2271
|
const runId = `run-${crypto.randomUUID()}`;
|
|
2016
|
-
const artifactPath = createArtifactPath(
|
|
2017
|
-
this.config.artifactDirectory,
|
|
2018
|
-
request.datasetId,
|
|
2019
|
-
runId
|
|
2020
|
-
);
|
|
2272
|
+
const artifactPath = createArtifactPath(this.config.artifactDirectory, params.datasetId, runId);
|
|
2021
2273
|
const snapshot = {
|
|
2022
2274
|
runId,
|
|
2023
|
-
datasetId:
|
|
2275
|
+
datasetId: params.datasetId,
|
|
2024
2276
|
datasetName: dataset.dataset.getName(),
|
|
2025
2277
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
2026
2278
|
queuedAt: Date.now(),
|
|
@@ -2041,7 +2293,7 @@ var EffectRunner = class {
|
|
|
2041
2293
|
const queuedEvent = {
|
|
2042
2294
|
type: "RunQueued",
|
|
2043
2295
|
runId,
|
|
2044
|
-
datasetId:
|
|
2296
|
+
datasetId: params.datasetId,
|
|
2045
2297
|
datasetName: dataset.dataset.getName(),
|
|
2046
2298
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
2047
2299
|
totalTestCases: totalEvaluations,
|
|
@@ -2055,17 +2307,20 @@ var EffectRunner = class {
|
|
|
2055
2307
|
payload: queuedEvent
|
|
2056
2308
|
})
|
|
2057
2309
|
);
|
|
2058
|
-
const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
|
|
2059
2310
|
await effect.Effect.runPromise(
|
|
2060
2311
|
effect.Queue.offer(this.runQueue, {
|
|
2061
2312
|
runId,
|
|
2062
2313
|
triggerId,
|
|
2063
|
-
datasetId:
|
|
2314
|
+
datasetId: params.datasetId,
|
|
2064
2315
|
dataset: dataset.dataset,
|
|
2065
2316
|
evaluators: selectedEvaluators,
|
|
2066
2317
|
testCases: selectedTestCases,
|
|
2067
2318
|
snapshot,
|
|
2068
|
-
maxConcurrency
|
|
2319
|
+
maxConcurrency: params.maxConcurrency,
|
|
2320
|
+
globalEvaluationSemaphore: params.globalEvaluationSemaphore,
|
|
2321
|
+
runConfigName: params.runConfigName,
|
|
2322
|
+
runConfigTags,
|
|
2323
|
+
repetitions
|
|
2069
2324
|
})
|
|
2070
2325
|
);
|
|
2071
2326
|
return snapshot;
|
|
@@ -2136,6 +2391,11 @@ var EffectRunner = class {
|
|
|
2136
2391
|
);
|
|
2137
2392
|
}
|
|
2138
2393
|
};
|
|
2394
|
+
|
|
2395
|
+
// src/runner/events.ts
|
|
2396
|
+
var PROGRAMMATIC_RUN_CONFIG = {
|
|
2397
|
+
runConfigName: "programmatic"
|
|
2398
|
+
};
|
|
2139
2399
|
var LEFT_PANE_WIDTH2 = 44;
|
|
2140
2400
|
var MAX_RUNS_FOR_CHART = 12;
|
|
2141
2401
|
var MAX_RUNS_FOR_TREND = 20;
|
|
@@ -2483,7 +2743,7 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
2483
2743
|
rows.push(/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }, "sp6"));
|
|
2484
2744
|
rows.push(/* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Test cases" }, "tc-h"));
|
|
2485
2745
|
for (const tc of testCases) {
|
|
2486
|
-
const
|
|
2746
|
+
const repetitionPart = tc.repetitionCount != null && tc.repetitionCount > 1 && tc.repetitionIndex != null ? ` (${tc.repetitionIndex}/${tc.repetitionCount})` : "";
|
|
2487
2747
|
rows.push(
|
|
2488
2748
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
2489
2749
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", children: [
|
|
@@ -2495,13 +2755,13 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
2495
2755
|
] }),
|
|
2496
2756
|
" ",
|
|
2497
2757
|
tc.testCaseName,
|
|
2498
|
-
|
|
2758
|
+
repetitionPart ? /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", children: repetitionPart }) : null,
|
|
2499
2759
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
2500
2760
|
" (",
|
|
2501
2761
|
tc.durationMs,
|
|
2502
2762
|
"ms)"
|
|
2503
2763
|
] })
|
|
2504
|
-
] }, `tc-${tc.testCaseId}-${tc.
|
|
2764
|
+
] }, `tc-${tc.testCaseId}-${tc.repetitionId ?? "x"}-${tc.repetitionIndex ?? 0}`)
|
|
2505
2765
|
);
|
|
2506
2766
|
for (const item of tc.evaluatorScores) {
|
|
2507
2767
|
const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
|
|
@@ -2827,7 +3087,8 @@ function EvalsCliApp({ data, args, runner }) {
|
|
|
2827
3087
|
}
|
|
2828
3088
|
void runner.runDatasetWith({
|
|
2829
3089
|
datasetId: selectedDataset.id,
|
|
2830
|
-
evaluatorIds: clampedState.selectedEvaluatorIds
|
|
3090
|
+
evaluatorIds: clampedState.selectedEvaluatorIds,
|
|
3091
|
+
...PROGRAMMATIC_RUN_CONFIG
|
|
2831
3092
|
}).then((snapshot) => {
|
|
2832
3093
|
setRuntimeMessage(
|
|
2833
3094
|
`Started ${snapshot.runId} on ${selectedDataset.name} (${snapshot.totalTestCases} cases).`
|