@m4trix/evals 0.25.1 → 0.27.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +32 -9
- package/dist/cli-simple.cjs +845 -455
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +846 -456
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +543 -273
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +543 -273
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +948 -545
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +228 -14
- package/dist/index.js +933 -547
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli.js
CHANGED
|
@@ -3,16 +3,16 @@ import { withFullScreen, useScreenSize } from 'fullscreen-ink';
|
|
|
3
3
|
import React, { useState, useRef, useReducer, useEffect, useMemo } from 'react';
|
|
4
4
|
import { useApp, useInput, Box, Text } from 'ink';
|
|
5
5
|
import { jsx, jsxs, Fragment } from 'react/jsx-runtime';
|
|
6
|
-
import {
|
|
7
|
-
import { LineGraph } from '@pppp606/ink-chart';
|
|
6
|
+
import { Schema, Effect, PubSub, Queue, Ref, Fiber, Either, ParseResult } from 'effect';
|
|
8
7
|
import { randomUUID } from 'crypto';
|
|
9
|
-
import {
|
|
8
|
+
import { readdir, readFile, mkdir, appendFile } from 'fs/promises';
|
|
9
|
+
import { resolve, join, relative, dirname } from 'path';
|
|
10
10
|
import { existsSync } from 'fs';
|
|
11
11
|
import * as jitiModule from 'jiti';
|
|
12
|
-
import { readdir, readFile, mkdir, appendFile } from 'fs/promises';
|
|
13
12
|
import { pathToFileURL } from 'url';
|
|
14
13
|
import { diffLines } from 'diff';
|
|
15
14
|
import stringify from 'fast-json-stable-stringify';
|
|
15
|
+
import { LineGraph } from '@pppp606/ink-chart';
|
|
16
16
|
|
|
17
17
|
var SEP = " ";
|
|
18
18
|
var ARROW = "\u203A";
|
|
@@ -237,6 +237,59 @@ function isPrintableCharacter(input) {
|
|
|
237
237
|
function isBackKey(key) {
|
|
238
238
|
return key.backspace || key.delete;
|
|
239
239
|
}
|
|
240
|
+
var ENTITY_ID_PATTERN = /^[a-zA-Z0-9_-]+$/;
|
|
241
|
+
function makeEntityIdSchema(brand, label) {
|
|
242
|
+
return Schema.String.pipe(
|
|
243
|
+
Schema.trimmed(),
|
|
244
|
+
Schema.minLength(1, {
|
|
245
|
+
message: () => `${label} must be non-empty.`
|
|
246
|
+
}),
|
|
247
|
+
Schema.pattern(ENTITY_ID_PATTERN, {
|
|
248
|
+
message: () => `${label} may only contain letters, digits, underscores, and hyphens (no spaces). Examples: "my-nightly", "my_nightly", "myNightly".`
|
|
249
|
+
}),
|
|
250
|
+
Schema.brand(brand)
|
|
251
|
+
);
|
|
252
|
+
}
|
|
253
|
+
var RunConfigNameSchema = makeEntityIdSchema("RunConfigName", "RunConfig name");
|
|
254
|
+
makeEntityIdSchema("EvaluatorName", "Evaluator name");
|
|
255
|
+
makeEntityIdSchema("TestCaseName", "Test case name");
|
|
256
|
+
makeEntityIdSchema("DatasetName", "Dataset name");
|
|
257
|
+
function validateWithSchema(schema, raw, context) {
|
|
258
|
+
const trimmed = raw.trim();
|
|
259
|
+
const decode = Schema.decodeUnknownEither(
|
|
260
|
+
schema
|
|
261
|
+
);
|
|
262
|
+
const result = decode(trimmed);
|
|
263
|
+
if (Either.isLeft(result)) {
|
|
264
|
+
throw new Error(`${context}: ${ParseResult.TreeFormatter.formatErrorSync(result.left)}`);
|
|
265
|
+
}
|
|
266
|
+
return result.right;
|
|
267
|
+
}
|
|
268
|
+
function validateRunConfigName(raw, context) {
|
|
269
|
+
return validateWithSchema(RunConfigNameSchema, raw, context);
|
|
270
|
+
}
|
|
271
|
+
|
|
272
|
+
// src/evals/dataset.ts
|
|
273
|
+
function getDatasetDisplayLabel(dataset) {
|
|
274
|
+
if (typeof dataset.getDisplayLabel === "function") {
|
|
275
|
+
return dataset.getDisplayLabel();
|
|
276
|
+
}
|
|
277
|
+
return typeof dataset.getName === "function" ? dataset.getName() : "";
|
|
278
|
+
}
|
|
279
|
+
|
|
280
|
+
// src/evals/evaluator.ts
|
|
281
|
+
function getEvaluatorDisplayLabel(evaluator) {
|
|
282
|
+
if (typeof evaluator.getDisplayLabel === "function") {
|
|
283
|
+
const label = evaluator.getDisplayLabel();
|
|
284
|
+
if (label !== void 0) {
|
|
285
|
+
return label;
|
|
286
|
+
}
|
|
287
|
+
}
|
|
288
|
+
return typeof evaluator.getName === "function" ? evaluator.getName() : void 0;
|
|
289
|
+
}
|
|
290
|
+
function getEvaluatorTagList(evaluator) {
|
|
291
|
+
return typeof evaluator.getTags === "function" ? [...evaluator.getTags()] : [];
|
|
292
|
+
}
|
|
240
293
|
|
|
241
294
|
// src/cli/data.mock.json
|
|
242
295
|
var data_mock_default = {
|
|
@@ -484,7 +537,7 @@ function toEvalDataset(item, snapshots) {
|
|
|
484
537
|
const runs = snapshots.filter((snapshot) => snapshot.datasetId === item.id).sort((a, b) => b.queuedAt - a.queuedAt).map(toEvalRun);
|
|
485
538
|
return {
|
|
486
539
|
id: item.id,
|
|
487
|
-
name: item.dataset
|
|
540
|
+
name: getDatasetDisplayLabel(item.dataset),
|
|
488
541
|
overview: `Discovered from ${item.filePath}`,
|
|
489
542
|
runs
|
|
490
543
|
};
|
|
@@ -492,7 +545,7 @@ function toEvalDataset(item, snapshots) {
|
|
|
492
545
|
function toEvaluatorOption(item) {
|
|
493
546
|
return {
|
|
494
547
|
id: item.id,
|
|
495
|
-
name: item.evaluator
|
|
548
|
+
name: getEvaluatorDisplayLabel(item.evaluator) ?? toSlug(item.id),
|
|
496
549
|
configPreview: `Source: ${item.filePath}`
|
|
497
550
|
};
|
|
498
551
|
}
|
|
@@ -735,6 +788,159 @@ function reduceCliState(state, action) {
|
|
|
735
788
|
}
|
|
736
789
|
return state;
|
|
737
790
|
}
|
|
791
|
+
async function loadRunSnapshotsFromArtifacts(config) {
|
|
792
|
+
const baseDir = resolve(config.artifactDirectory);
|
|
793
|
+
let entries;
|
|
794
|
+
try {
|
|
795
|
+
entries = await readdir(baseDir);
|
|
796
|
+
} catch {
|
|
797
|
+
return [];
|
|
798
|
+
}
|
|
799
|
+
const jsonlFiles = entries.filter((name) => name.endsWith(".jsonl"));
|
|
800
|
+
const snapshots = [];
|
|
801
|
+
for (const fileName of jsonlFiles) {
|
|
802
|
+
const filePath = join(baseDir, fileName);
|
|
803
|
+
try {
|
|
804
|
+
const snapshot = await parseArtifactToSnapshot(filePath, config);
|
|
805
|
+
if (snapshot) {
|
|
806
|
+
snapshots.push(snapshot);
|
|
807
|
+
}
|
|
808
|
+
} catch {
|
|
809
|
+
}
|
|
810
|
+
}
|
|
811
|
+
return snapshots.sort((a, b) => b.queuedAt - a.queuedAt);
|
|
812
|
+
}
|
|
813
|
+
async function parseArtifactToSnapshot(filePath, _config) {
|
|
814
|
+
const content = await readFile(filePath, "utf8");
|
|
815
|
+
const lines = content.split("\n").filter((line) => line.trim().length > 0);
|
|
816
|
+
if (lines.length === 0) {
|
|
817
|
+
return null;
|
|
818
|
+
}
|
|
819
|
+
let runQueued = null;
|
|
820
|
+
let runCompleted = null;
|
|
821
|
+
let runFailed = null;
|
|
822
|
+
let runStarted = null;
|
|
823
|
+
for (const line of lines) {
|
|
824
|
+
try {
|
|
825
|
+
const event = JSON.parse(line);
|
|
826
|
+
const type = event.type;
|
|
827
|
+
if (type === "RunQueued") {
|
|
828
|
+
runQueued = {
|
|
829
|
+
runId: event.runId,
|
|
830
|
+
datasetId: event.datasetId,
|
|
831
|
+
datasetName: event.datasetName,
|
|
832
|
+
evaluatorIds: event.evaluatorIds,
|
|
833
|
+
totalTestCases: event.totalTestCases ?? 0,
|
|
834
|
+
artifactPath: event.artifactPath ?? filePath,
|
|
835
|
+
ts: event.ts
|
|
836
|
+
};
|
|
837
|
+
}
|
|
838
|
+
if (type === "RunStarted") {
|
|
839
|
+
runStarted = { startedAt: event.startedAt };
|
|
840
|
+
}
|
|
841
|
+
if (type === "RunCompleted") {
|
|
842
|
+
runCompleted = {
|
|
843
|
+
passedTestCases: event.passedTestCases,
|
|
844
|
+
failedTestCases: event.failedTestCases,
|
|
845
|
+
totalTestCases: event.totalTestCases,
|
|
846
|
+
finishedAt: event.finishedAt
|
|
847
|
+
};
|
|
848
|
+
}
|
|
849
|
+
if (type === "RunFailed") {
|
|
850
|
+
runFailed = {
|
|
851
|
+
finishedAt: event.finishedAt,
|
|
852
|
+
errorMessage: event.errorMessage
|
|
853
|
+
};
|
|
854
|
+
}
|
|
855
|
+
} catch {
|
|
856
|
+
}
|
|
857
|
+
}
|
|
858
|
+
if (!runQueued) {
|
|
859
|
+
return null;
|
|
860
|
+
}
|
|
861
|
+
const artifactPath = filePath;
|
|
862
|
+
const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
|
|
863
|
+
const progress = aggregateTestCaseProgress(lines);
|
|
864
|
+
const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
|
|
865
|
+
const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
|
|
866
|
+
const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
|
|
867
|
+
return {
|
|
868
|
+
runId: runQueued.runId,
|
|
869
|
+
datasetId: runQueued.datasetId,
|
|
870
|
+
datasetName: runQueued.datasetName,
|
|
871
|
+
evaluatorIds: runQueued.evaluatorIds,
|
|
872
|
+
queuedAt: runQueued.ts ?? 0,
|
|
873
|
+
startedAt: runStarted?.startedAt,
|
|
874
|
+
finishedAt: runCompleted?.finishedAt ?? runFailed?.finishedAt,
|
|
875
|
+
totalTestCases: runQueued.totalTestCases,
|
|
876
|
+
completedTestCases,
|
|
877
|
+
passedTestCases,
|
|
878
|
+
failedTestCases,
|
|
879
|
+
status,
|
|
880
|
+
artifactPath,
|
|
881
|
+
errorMessage: runFailed?.errorMessage
|
|
882
|
+
};
|
|
883
|
+
}
|
|
884
|
+
function aggregateTestCaseProgress(lines) {
|
|
885
|
+
let completedTestCases = 0;
|
|
886
|
+
const testCasePassedBy = /* @__PURE__ */ new Map();
|
|
887
|
+
for (const line of lines) {
|
|
888
|
+
try {
|
|
889
|
+
const event = JSON.parse(line);
|
|
890
|
+
if (event.type === "TestCaseProgress") {
|
|
891
|
+
const ev = event;
|
|
892
|
+
completedTestCases = ev.completedTestCases ?? completedTestCases;
|
|
893
|
+
const id = ev.testCaseId;
|
|
894
|
+
const current = testCasePassedBy.get(id);
|
|
895
|
+
testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
|
|
896
|
+
}
|
|
897
|
+
} catch {
|
|
898
|
+
}
|
|
899
|
+
}
|
|
900
|
+
let passedTestCases = 0;
|
|
901
|
+
let failedTestCases = 0;
|
|
902
|
+
for (const passed of testCasePassedBy.values()) {
|
|
903
|
+
if (passed) {
|
|
904
|
+
passedTestCases += 1;
|
|
905
|
+
} else {
|
|
906
|
+
failedTestCases += 1;
|
|
907
|
+
}
|
|
908
|
+
}
|
|
909
|
+
return { completedTestCases, passedTestCases, failedTestCases };
|
|
910
|
+
}
|
|
911
|
+
async function parseArtifactFile(artifactPath) {
|
|
912
|
+
try {
|
|
913
|
+
const content = await readFile(artifactPath, "utf8");
|
|
914
|
+
const lines = content.split("\n").filter((line) => line.trim().length > 0);
|
|
915
|
+
const results = [];
|
|
916
|
+
for (const line of lines) {
|
|
917
|
+
try {
|
|
918
|
+
const event = JSON.parse(line);
|
|
919
|
+
if (event.type === "TestCaseProgress") {
|
|
920
|
+
const ev = event;
|
|
921
|
+
const repetitionIndex = ev.repetitionIndex ?? ev.rerunIndex;
|
|
922
|
+
const repetitionCount = ev.repetitionCount ?? ev.rerunTotal;
|
|
923
|
+
results.push({
|
|
924
|
+
testCaseId: ev.testCaseId,
|
|
925
|
+
testCaseName: ev.testCaseName,
|
|
926
|
+
completedTestCases: ev.completedTestCases,
|
|
927
|
+
totalTestCases: ev.totalTestCases,
|
|
928
|
+
repetitionId: ev.repetitionId,
|
|
929
|
+
repetitionIndex,
|
|
930
|
+
repetitionCount,
|
|
931
|
+
passed: ev.passed,
|
|
932
|
+
durationMs: ev.durationMs,
|
|
933
|
+
evaluatorScores: ev.evaluatorScores ?? []
|
|
934
|
+
});
|
|
935
|
+
}
|
|
936
|
+
} catch {
|
|
937
|
+
}
|
|
938
|
+
}
|
|
939
|
+
return results;
|
|
940
|
+
} catch {
|
|
941
|
+
return [];
|
|
942
|
+
}
|
|
943
|
+
}
|
|
738
944
|
|
|
739
945
|
// src/runner/config.ts
|
|
740
946
|
var defaultRunnerConfig = {
|
|
@@ -742,6 +948,7 @@ var defaultRunnerConfig = {
|
|
|
742
948
|
rootDir: process.cwd(),
|
|
743
949
|
datasetSuffixes: [".dataset.ts", ".dataset.tsx", ".dataset.js", ".dataset.mjs"],
|
|
744
950
|
evaluatorSuffixes: [".evaluator.ts", ".evaluator.tsx", ".evaluator.js", ".evaluator.mjs"],
|
|
951
|
+
runConfigSuffixes: [".run-config.ts", ".run-config.tsx", ".run-config.js", ".run-config.mjs"],
|
|
745
952
|
testCaseSuffixes: [".test-case.ts", ".test-case.tsx", ".test-case.js", ".test-case.mjs"],
|
|
746
953
|
excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
|
|
747
954
|
},
|
|
@@ -767,6 +974,11 @@ function toRunnerConfigOverrides(config) {
|
|
|
767
974
|
} else if (rawDiscovery?.evaluatorSuffixes !== void 0) {
|
|
768
975
|
discovery.evaluatorSuffixes = rawDiscovery.evaluatorSuffixes;
|
|
769
976
|
}
|
|
977
|
+
if (rawDiscovery?.runConfigFilePatterns !== void 0) {
|
|
978
|
+
discovery.runConfigSuffixes = rawDiscovery.runConfigFilePatterns;
|
|
979
|
+
} else if (rawDiscovery?.runConfigSuffixes !== void 0) {
|
|
980
|
+
discovery.runConfigSuffixes = rawDiscovery.runConfigSuffixes;
|
|
981
|
+
}
|
|
770
982
|
if (rawDiscovery?.testCaseFilePatterns !== void 0) {
|
|
771
983
|
discovery.testCaseSuffixes = rawDiscovery.testCaseFilePatterns;
|
|
772
984
|
} else if (rawDiscovery?.testCaseSuffixes !== void 0) {
|
|
@@ -865,6 +1077,9 @@ function isDatasetLike(value) {
|
|
|
865
1077
|
function isEvaluatorLike(value) {
|
|
866
1078
|
return hasMethod(value, "getName") && hasMethod(value, "resolveContext") && hasMethod(value, "getEvaluateFn");
|
|
867
1079
|
}
|
|
1080
|
+
function isRunConfigLike(value) {
|
|
1081
|
+
return hasMethod(value, "getName") && hasMethod(value, "getRuns") && typeof value.getRuns === "function";
|
|
1082
|
+
}
|
|
868
1083
|
function isTestCaseLike(value) {
|
|
869
1084
|
return hasMethod(value, "getName") && hasMethod(value, "getTags") && hasMethod(value, "getInput");
|
|
870
1085
|
}
|
|
@@ -953,6 +1168,23 @@ async function collectEvaluatorsFromFiles(config) {
|
|
|
953
1168
|
);
|
|
954
1169
|
return found.flat();
|
|
955
1170
|
}
|
|
1171
|
+
async function collectRunConfigsFromFiles(config) {
|
|
1172
|
+
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
1173
|
+
const matched = files.filter((filePath) => hasOneSuffix(filePath, config.runConfigSuffixes));
|
|
1174
|
+
const found = await Promise.all(
|
|
1175
|
+
matched.map(async (absolutePath) => {
|
|
1176
|
+
const exports = await loadModuleExports(absolutePath);
|
|
1177
|
+
const runConfigs = exports.filter(isRunConfigLike);
|
|
1178
|
+
const relPath = relative(config.rootDir, absolutePath);
|
|
1179
|
+
return runConfigs.map((runConfig) => ({
|
|
1180
|
+
id: runConfig.getName(),
|
|
1181
|
+
filePath: relPath,
|
|
1182
|
+
runConfig
|
|
1183
|
+
}));
|
|
1184
|
+
})
|
|
1185
|
+
);
|
|
1186
|
+
return found.flat();
|
|
1187
|
+
}
|
|
956
1188
|
async function collectTestCasesFromFiles(config) {
|
|
957
1189
|
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
958
1190
|
const matched = files.filter((filePath) => hasOneSuffix(filePath, config.testCaseSuffixes));
|
|
@@ -1106,6 +1338,17 @@ function getDiffLines(entry) {
|
|
|
1106
1338
|
});
|
|
1107
1339
|
}
|
|
1108
1340
|
|
|
1341
|
+
// src/evals/test-case.ts
|
|
1342
|
+
function getTestCaseDisplayLabel(testCase) {
|
|
1343
|
+
if (typeof testCase.getDisplayLabel === "function") {
|
|
1344
|
+
return testCase.getDisplayLabel();
|
|
1345
|
+
}
|
|
1346
|
+
return typeof testCase.getName === "function" ? testCase.getName() : "";
|
|
1347
|
+
}
|
|
1348
|
+
function getTestCaseTagList(testCase) {
|
|
1349
|
+
return typeof testCase.getTags === "function" ? [...testCase.getTags()] : [];
|
|
1350
|
+
}
|
|
1351
|
+
|
|
1109
1352
|
// src/evals/metric.ts
|
|
1110
1353
|
var registry = /* @__PURE__ */ new Map();
|
|
1111
1354
|
var Metric = {
|
|
@@ -1129,6 +1372,54 @@ function getMetricById(id) {
|
|
|
1129
1372
|
return registry.get(id);
|
|
1130
1373
|
}
|
|
1131
1374
|
|
|
1375
|
+
// src/evals/aggregators.ts
|
|
1376
|
+
function aggregateTokenCountSum(values) {
|
|
1377
|
+
const initial = {
|
|
1378
|
+
input: 0,
|
|
1379
|
+
output: 0,
|
|
1380
|
+
inputCached: 0,
|
|
1381
|
+
outputCached: 0
|
|
1382
|
+
};
|
|
1383
|
+
return values.reduce(
|
|
1384
|
+
(acc, v) => ({
|
|
1385
|
+
input: acc.input + (v.input ?? 0),
|
|
1386
|
+
output: acc.output + (v.output ?? 0),
|
|
1387
|
+
inputCached: acc.inputCached + (v.inputCached ?? 0),
|
|
1388
|
+
outputCached: acc.outputCached + (v.outputCached ?? 0)
|
|
1389
|
+
}),
|
|
1390
|
+
initial
|
|
1391
|
+
);
|
|
1392
|
+
}
|
|
1393
|
+
function aggregateLatencyAverage(values) {
|
|
1394
|
+
if (values.length === 0) {
|
|
1395
|
+
return { ms: 0 };
|
|
1396
|
+
}
|
|
1397
|
+
const sum = values.reduce((s, v) => s + v.ms, 0);
|
|
1398
|
+
return { ms: sum / values.length };
|
|
1399
|
+
}
|
|
1400
|
+
|
|
1401
|
+
// src/evals/metrics/standard.ts
|
|
1402
|
+
Metric.of({
|
|
1403
|
+
id: "token-count",
|
|
1404
|
+
name: "Tokens",
|
|
1405
|
+
aggregate: aggregateTokenCountSum,
|
|
1406
|
+
format: (data, options) => {
|
|
1407
|
+
const input = data.input ?? 0;
|
|
1408
|
+
const output = data.output ?? 0;
|
|
1409
|
+
const inputCached = data.inputCached ?? 0;
|
|
1410
|
+
const outputCached = data.outputCached ?? 0;
|
|
1411
|
+
const cached = inputCached + outputCached;
|
|
1412
|
+
const base = `in:${input} out:${output} cached:${cached}`;
|
|
1413
|
+
return options?.isAggregated ? `Total: ${base}` : base;
|
|
1414
|
+
}
|
|
1415
|
+
});
|
|
1416
|
+
Metric.of({
|
|
1417
|
+
id: "latency",
|
|
1418
|
+
name: "Latency",
|
|
1419
|
+
aggregate: aggregateLatencyAverage,
|
|
1420
|
+
format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
|
|
1421
|
+
});
|
|
1422
|
+
|
|
1132
1423
|
// src/evals/score.ts
|
|
1133
1424
|
var registry2 = /* @__PURE__ */ new Map();
|
|
1134
1425
|
function formatScoreData(def, data, options) {
|
|
@@ -1237,54 +1528,6 @@ function getScoreById(id) {
|
|
|
1237
1528
|
return registry2.get(id);
|
|
1238
1529
|
}
|
|
1239
1530
|
|
|
1240
|
-
// src/evals/aggregators.ts
|
|
1241
|
-
function aggregateTokenCountSum(values) {
|
|
1242
|
-
const initial = {
|
|
1243
|
-
input: 0,
|
|
1244
|
-
output: 0,
|
|
1245
|
-
inputCached: 0,
|
|
1246
|
-
outputCached: 0
|
|
1247
|
-
};
|
|
1248
|
-
return values.reduce(
|
|
1249
|
-
(acc, v) => ({
|
|
1250
|
-
input: acc.input + (v.input ?? 0),
|
|
1251
|
-
output: acc.output + (v.output ?? 0),
|
|
1252
|
-
inputCached: acc.inputCached + (v.inputCached ?? 0),
|
|
1253
|
-
outputCached: acc.outputCached + (v.outputCached ?? 0)
|
|
1254
|
-
}),
|
|
1255
|
-
initial
|
|
1256
|
-
);
|
|
1257
|
-
}
|
|
1258
|
-
function aggregateLatencyAverage(values) {
|
|
1259
|
-
if (values.length === 0) {
|
|
1260
|
-
return { ms: 0 };
|
|
1261
|
-
}
|
|
1262
|
-
const sum = values.reduce((s, v) => s + v.ms, 0);
|
|
1263
|
-
return { ms: sum / values.length };
|
|
1264
|
-
}
|
|
1265
|
-
|
|
1266
|
-
// src/evals/metrics/standard.ts
|
|
1267
|
-
Metric.of({
|
|
1268
|
-
id: "token-count",
|
|
1269
|
-
name: "Tokens",
|
|
1270
|
-
aggregate: aggregateTokenCountSum,
|
|
1271
|
-
format: (data, options) => {
|
|
1272
|
-
const input = data.input ?? 0;
|
|
1273
|
-
const output = data.output ?? 0;
|
|
1274
|
-
const inputCached = data.inputCached ?? 0;
|
|
1275
|
-
const outputCached = data.outputCached ?? 0;
|
|
1276
|
-
const cached = inputCached + outputCached;
|
|
1277
|
-
const base = `in:${input} out:${output} cached:${cached}`;
|
|
1278
|
-
return options?.isAggregated ? `Total: ${base}` : base;
|
|
1279
|
-
}
|
|
1280
|
-
});
|
|
1281
|
-
Metric.of({
|
|
1282
|
-
id: "latency",
|
|
1283
|
-
name: "Latency",
|
|
1284
|
-
aggregate: aggregateLatencyAverage,
|
|
1285
|
-
format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
|
|
1286
|
-
});
|
|
1287
|
-
|
|
1288
1531
|
// src/evals/scores/standard.ts
|
|
1289
1532
|
Score.of({
|
|
1290
1533
|
id: "percent",
|
|
@@ -1391,15 +1634,17 @@ function readOutput(testCase) {
|
|
|
1391
1634
|
}
|
|
1392
1635
|
return candidate.getOutput();
|
|
1393
1636
|
}
|
|
1394
|
-
function buildEvaluationUnits(testCases) {
|
|
1637
|
+
function buildEvaluationUnits(testCases, repetitionCount) {
|
|
1638
|
+
const count = Math.max(1, repetitionCount);
|
|
1395
1639
|
const units = [];
|
|
1396
1640
|
for (const testCaseItem of testCases) {
|
|
1397
|
-
const
|
|
1398
|
-
for (let r = 0; r <
|
|
1641
|
+
const repetitionId = `rep-${randomUUID()}`;
|
|
1642
|
+
for (let r = 0; r < count; r++) {
|
|
1399
1643
|
units.push({
|
|
1400
1644
|
testCaseItem,
|
|
1401
|
-
|
|
1402
|
-
|
|
1645
|
+
repetitionId,
|
|
1646
|
+
repetitionIndex: r + 1,
|
|
1647
|
+
repetitionCount: count
|
|
1403
1648
|
});
|
|
1404
1649
|
}
|
|
1405
1650
|
}
|
|
@@ -1412,7 +1657,7 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
|
|
|
1412
1657
|
return join(artifactDirectory, `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`);
|
|
1413
1658
|
}
|
|
1414
1659
|
function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
|
|
1415
|
-
const { testCaseItem,
|
|
1660
|
+
const { testCaseItem, repetitionId, repetitionIndex, repetitionCount } = unit;
|
|
1416
1661
|
return Effect.gen(function* () {
|
|
1417
1662
|
const evaluatorRunId = `run-${randomUUID()}`;
|
|
1418
1663
|
const started = Date.now();
|
|
@@ -1421,11 +1666,12 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1421
1666
|
type: "TestCaseStarted",
|
|
1422
1667
|
runId: task.runId,
|
|
1423
1668
|
testCaseId: testCaseItem.id,
|
|
1424
|
-
testCaseName: testCaseItem.testCase
|
|
1669
|
+
testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
|
|
1425
1670
|
startedTestCases: startedEvaluations,
|
|
1426
1671
|
totalTestCases: totalEvaluations,
|
|
1427
|
-
|
|
1428
|
-
|
|
1672
|
+
repetitionId,
|
|
1673
|
+
repetitionIndex,
|
|
1674
|
+
repetitionCount
|
|
1429
1675
|
});
|
|
1430
1676
|
const evaluatorScores = [];
|
|
1431
1677
|
let testCaseError;
|
|
@@ -1459,8 +1705,15 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1459
1705
|
meta: {
|
|
1460
1706
|
triggerId: task.triggerId,
|
|
1461
1707
|
runId: evaluatorRunId,
|
|
1462
|
-
|
|
1708
|
+
datasetName: task.dataset.getDisplayLabel(),
|
|
1709
|
+
repetitionId,
|
|
1710
|
+
repetitionIndex,
|
|
1711
|
+
repetitionCount,
|
|
1712
|
+
runConfigName: task.runConfigName
|
|
1463
1713
|
},
|
|
1714
|
+
testCaseTags: getTestCaseTagList(testCaseItem.testCase),
|
|
1715
|
+
runConfigTags: task.runConfigTags,
|
|
1716
|
+
evaluatorTags: getEvaluatorTagList(evaluator),
|
|
1464
1717
|
logDiff,
|
|
1465
1718
|
log,
|
|
1466
1719
|
createError
|
|
@@ -1503,18 +1756,19 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1503
1756
|
});
|
|
1504
1757
|
}
|
|
1505
1758
|
}
|
|
1506
|
-
const
|
|
1759
|
+
const repetitionPassedThis = evaluatorScores.every((s) => s.passed);
|
|
1507
1760
|
const completedEvaluations = yield* Ref.modify(completedRef, (n) => [n + 1, n + 1]);
|
|
1508
1761
|
const progressEvent = {
|
|
1509
1762
|
type: "TestCaseProgress",
|
|
1510
1763
|
runId: task.runId,
|
|
1511
1764
|
testCaseId: testCaseItem.id,
|
|
1512
|
-
testCaseName: testCaseItem.testCase
|
|
1765
|
+
testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
|
|
1513
1766
|
completedTestCases: completedEvaluations,
|
|
1514
1767
|
totalTestCases: totalEvaluations,
|
|
1515
|
-
|
|
1516
|
-
|
|
1517
|
-
|
|
1768
|
+
repetitionId,
|
|
1769
|
+
repetitionIndex,
|
|
1770
|
+
repetitionCount,
|
|
1771
|
+
passed: repetitionPassedThis,
|
|
1518
1772
|
durationMs: Date.now() - started,
|
|
1519
1773
|
evaluatorScores,
|
|
1520
1774
|
output,
|
|
@@ -1535,9 +1789,9 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1535
1789
|
(map) => {
|
|
1536
1790
|
const key = testCaseItem.id;
|
|
1537
1791
|
const existing = map.get(key) ?? { completedCount: 0, results: [] };
|
|
1538
|
-
const newResults = [...existing.results,
|
|
1792
|
+
const newResults = [...existing.results, repetitionPassedThis];
|
|
1539
1793
|
const newCompletedCount = existing.completedCount + 1;
|
|
1540
|
-
const isLast = newCompletedCount ===
|
|
1794
|
+
const isLast = newCompletedCount === repetitionCount;
|
|
1541
1795
|
const newMap = new Map(map);
|
|
1542
1796
|
newMap.set(key, {
|
|
1543
1797
|
completedCount: newCompletedCount,
|
|
@@ -1574,10 +1828,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
1574
1828
|
runId: task.runId,
|
|
1575
1829
|
startedAt
|
|
1576
1830
|
});
|
|
1577
|
-
const totalEvaluations = task.testCases.
|
|
1578
|
-
(sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
|
|
1579
|
-
0
|
|
1580
|
-
);
|
|
1831
|
+
const totalEvaluations = task.testCases.length * Math.max(1, task.repetitions);
|
|
1581
1832
|
const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
|
|
1582
1833
|
const completedRef = yield* Ref.make(0);
|
|
1583
1834
|
const startedRef = yield* Ref.make(0);
|
|
@@ -1586,7 +1837,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
1586
1837
|
const testCaseResultsRef = yield* Ref.make(
|
|
1587
1838
|
/* @__PURE__ */ new Map()
|
|
1588
1839
|
);
|
|
1589
|
-
const evaluationUnits = buildEvaluationUnits(task.testCases);
|
|
1840
|
+
const evaluationUnits = buildEvaluationUnits(task.testCases, task.repetitions);
|
|
1590
1841
|
const processEvaluation = (unit) => processOneEvaluation(
|
|
1591
1842
|
task,
|
|
1592
1843
|
unit,
|
|
@@ -1600,11 +1851,20 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
1600
1851
|
failedRef,
|
|
1601
1852
|
testCaseResultsRef
|
|
1602
1853
|
);
|
|
1603
|
-
|
|
1604
|
-
|
|
1605
|
-
|
|
1606
|
-
|
|
1607
|
-
|
|
1854
|
+
const globalSem = task.globalEvaluationSemaphore;
|
|
1855
|
+
if (globalSem !== void 0) {
|
|
1856
|
+
yield* Effect.forEach(
|
|
1857
|
+
evaluationUnits,
|
|
1858
|
+
(unit) => globalSem.withPermits(1)(processEvaluation(unit)),
|
|
1859
|
+
{ concurrency: "unbounded", discard: true }
|
|
1860
|
+
);
|
|
1861
|
+
} else {
|
|
1862
|
+
yield* Effect.forEach(
|
|
1863
|
+
evaluationUnits,
|
|
1864
|
+
processEvaluation,
|
|
1865
|
+
maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
|
|
1866
|
+
);
|
|
1867
|
+
}
|
|
1608
1868
|
const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* Effect.all([
|
|
1609
1869
|
Ref.get(completedRef),
|
|
1610
1870
|
Ref.get(passedRef),
|
|
@@ -1640,155 +1900,34 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
1640
1900
|
artifactPath: task.snapshot.artifactPath
|
|
1641
1901
|
});
|
|
1642
1902
|
});
|
|
1643
|
-
|
|
1644
|
-
|
|
1645
|
-
|
|
1646
|
-
|
|
1647
|
-
|
|
1648
|
-
} catch {
|
|
1649
|
-
return [];
|
|
1650
|
-
}
|
|
1651
|
-
const jsonlFiles = entries.filter((name) => name.endsWith(".jsonl"));
|
|
1652
|
-
const snapshots = [];
|
|
1653
|
-
for (const fileName of jsonlFiles) {
|
|
1654
|
-
const filePath = join(baseDir, fileName);
|
|
1655
|
-
try {
|
|
1656
|
-
const snapshot = await parseArtifactToSnapshot(filePath, config);
|
|
1657
|
-
if (snapshot) {
|
|
1658
|
-
snapshots.push(snapshot);
|
|
1659
|
-
}
|
|
1660
|
-
} catch {
|
|
1661
|
-
}
|
|
1662
|
-
}
|
|
1663
|
-
return snapshots.sort((a, b) => b.queuedAt - a.queuedAt);
|
|
1664
|
-
}
|
|
1665
|
-
async function parseArtifactToSnapshot(filePath, _config) {
|
|
1666
|
-
const content = await readFile(filePath, "utf8");
|
|
1667
|
-
const lines = content.split("\n").filter((line) => line.trim().length > 0);
|
|
1668
|
-
if (lines.length === 0) {
|
|
1669
|
-
return null;
|
|
1670
|
-
}
|
|
1671
|
-
let runQueued = null;
|
|
1672
|
-
let runCompleted = null;
|
|
1673
|
-
let runFailed = null;
|
|
1674
|
-
let runStarted = null;
|
|
1675
|
-
for (const line of lines) {
|
|
1676
|
-
try {
|
|
1677
|
-
const event = JSON.parse(line);
|
|
1678
|
-
const type = event.type;
|
|
1679
|
-
if (type === "RunQueued") {
|
|
1680
|
-
runQueued = {
|
|
1681
|
-
runId: event.runId,
|
|
1682
|
-
datasetId: event.datasetId,
|
|
1683
|
-
datasetName: event.datasetName,
|
|
1684
|
-
evaluatorIds: event.evaluatorIds,
|
|
1685
|
-
totalTestCases: event.totalTestCases ?? 0,
|
|
1686
|
-
artifactPath: event.artifactPath ?? filePath,
|
|
1687
|
-
ts: event.ts
|
|
1688
|
-
};
|
|
1689
|
-
}
|
|
1690
|
-
if (type === "RunStarted") {
|
|
1691
|
-
runStarted = { startedAt: event.startedAt };
|
|
1692
|
-
}
|
|
1693
|
-
if (type === "RunCompleted") {
|
|
1694
|
-
runCompleted = {
|
|
1695
|
-
passedTestCases: event.passedTestCases,
|
|
1696
|
-
failedTestCases: event.failedTestCases,
|
|
1697
|
-
totalTestCases: event.totalTestCases,
|
|
1698
|
-
finishedAt: event.finishedAt
|
|
1699
|
-
};
|
|
1700
|
-
}
|
|
1701
|
-
if (type === "RunFailed") {
|
|
1702
|
-
runFailed = {
|
|
1703
|
-
finishedAt: event.finishedAt,
|
|
1704
|
-
errorMessage: event.errorMessage
|
|
1705
|
-
};
|
|
1706
|
-
}
|
|
1707
|
-
} catch {
|
|
1708
|
-
}
|
|
1903
|
+
|
|
1904
|
+
// src/runner/name-pattern.ts
|
|
1905
|
+
function parseRegexLiteral(pattern) {
|
|
1906
|
+
if (!pattern.startsWith("/")) {
|
|
1907
|
+
return void 0;
|
|
1709
1908
|
}
|
|
1710
|
-
|
|
1711
|
-
|
|
1909
|
+
const lastSlash = pattern.lastIndexOf("/");
|
|
1910
|
+
if (lastSlash <= 0) {
|
|
1911
|
+
return void 0;
|
|
1712
1912
|
}
|
|
1713
|
-
const artifactPath = filePath;
|
|
1714
|
-
const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
|
|
1715
|
-
const progress = aggregateTestCaseProgress(lines);
|
|
1716
|
-
const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
|
|
1717
|
-
const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
|
|
1718
|
-
const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
|
|
1719
1913
|
return {
|
|
1720
|
-
|
|
1721
|
-
|
|
1722
|
-
datasetName: runQueued.datasetName,
|
|
1723
|
-
evaluatorIds: runQueued.evaluatorIds,
|
|
1724
|
-
queuedAt: runQueued.ts ?? 0,
|
|
1725
|
-
startedAt: runStarted?.startedAt,
|
|
1726
|
-
finishedAt: runCompleted?.finishedAt ?? runFailed?.finishedAt,
|
|
1727
|
-
totalTestCases: runQueued.totalTestCases,
|
|
1728
|
-
completedTestCases,
|
|
1729
|
-
passedTestCases,
|
|
1730
|
-
failedTestCases,
|
|
1731
|
-
status,
|
|
1732
|
-
artifactPath,
|
|
1733
|
-
errorMessage: runFailed?.errorMessage
|
|
1914
|
+
source: pattern.slice(1, lastSlash),
|
|
1915
|
+
flags: pattern.slice(lastSlash + 1)
|
|
1734
1916
|
};
|
|
1735
1917
|
}
|
|
1736
|
-
function
|
|
1737
|
-
|
|
1738
|
-
const
|
|
1739
|
-
|
|
1740
|
-
|
|
1741
|
-
|
|
1742
|
-
if (event.type === "TestCaseProgress") {
|
|
1743
|
-
const ev = event;
|
|
1744
|
-
completedTestCases = ev.completedTestCases ?? completedTestCases;
|
|
1745
|
-
const id = ev.testCaseId;
|
|
1746
|
-
const current = testCasePassedBy.get(id);
|
|
1747
|
-
testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
|
|
1748
|
-
}
|
|
1749
|
-
} catch {
|
|
1750
|
-
}
|
|
1751
|
-
}
|
|
1752
|
-
let passedTestCases = 0;
|
|
1753
|
-
let failedTestCases = 0;
|
|
1754
|
-
for (const passed of testCasePassedBy.values()) {
|
|
1755
|
-
if (passed) {
|
|
1756
|
-
passedTestCases += 1;
|
|
1757
|
-
} else {
|
|
1758
|
-
failedTestCases += 1;
|
|
1759
|
-
}
|
|
1918
|
+
function createNameMatcher(pattern) {
|
|
1919
|
+
const normalizedPattern = pattern.trim();
|
|
1920
|
+
const regexLiteral = parseRegexLiteral(normalizedPattern);
|
|
1921
|
+
if (regexLiteral) {
|
|
1922
|
+
const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
|
|
1923
|
+
return (value) => regex.test(value);
|
|
1760
1924
|
}
|
|
1761
|
-
|
|
1762
|
-
}
|
|
1763
|
-
|
|
1764
|
-
|
|
1765
|
-
const content = await readFile(artifactPath, "utf8");
|
|
1766
|
-
const lines = content.split("\n").filter((line) => line.trim().length > 0);
|
|
1767
|
-
const results = [];
|
|
1768
|
-
for (const line of lines) {
|
|
1769
|
-
try {
|
|
1770
|
-
const event = JSON.parse(line);
|
|
1771
|
-
if (event.type === "TestCaseProgress") {
|
|
1772
|
-
const ev = event;
|
|
1773
|
-
results.push({
|
|
1774
|
-
testCaseId: ev.testCaseId,
|
|
1775
|
-
testCaseName: ev.testCaseName,
|
|
1776
|
-
completedTestCases: ev.completedTestCases,
|
|
1777
|
-
totalTestCases: ev.totalTestCases,
|
|
1778
|
-
rerunIndex: ev.rerunIndex,
|
|
1779
|
-
rerunTotal: ev.rerunTotal,
|
|
1780
|
-
passed: ev.passed,
|
|
1781
|
-
durationMs: ev.durationMs,
|
|
1782
|
-
evaluatorScores: ev.evaluatorScores ?? []
|
|
1783
|
-
});
|
|
1784
|
-
}
|
|
1785
|
-
} catch {
|
|
1786
|
-
}
|
|
1787
|
-
}
|
|
1788
|
-
return results;
|
|
1789
|
-
} catch {
|
|
1790
|
-
return [];
|
|
1925
|
+
if (normalizedPattern.includes("*")) {
|
|
1926
|
+
const escaped = normalizedPattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
|
|
1927
|
+
const regex = new RegExp(`^${escaped}$`, "i");
|
|
1928
|
+
return (value) => regex.test(value);
|
|
1791
1929
|
}
|
|
1930
|
+
return (value) => value.toLowerCase() === normalizedPattern.toLowerCase();
|
|
1792
1931
|
}
|
|
1793
1932
|
async function appendJsonLine(artifactPath, payload) {
|
|
1794
1933
|
await mkdir(dirname(artifactPath), { recursive: true });
|
|
@@ -1847,32 +1986,12 @@ function searchCollectedTestCases(all, query) {
|
|
|
1847
1986
|
}
|
|
1848
1987
|
|
|
1849
1988
|
// src/runner/api.ts
|
|
1850
|
-
function
|
|
1851
|
-
|
|
1852
|
-
|
|
1853
|
-
|
|
1854
|
-
const lastSlash = pattern.lastIndexOf("/");
|
|
1855
|
-
if (lastSlash <= 0) {
|
|
1856
|
-
return void 0;
|
|
1989
|
+
function normalizeRunRepetitions(value) {
|
|
1990
|
+
const n = value ?? 1;
|
|
1991
|
+
if (!Number.isInteger(n) || n < 1) {
|
|
1992
|
+
throw new Error(`repetitions must be a positive integer, got ${String(value)}`);
|
|
1857
1993
|
}
|
|
1858
|
-
return
|
|
1859
|
-
source: pattern.slice(1, lastSlash),
|
|
1860
|
-
flags: pattern.slice(lastSlash + 1)
|
|
1861
|
-
};
|
|
1862
|
-
}
|
|
1863
|
-
function createNameMatcher(pattern) {
|
|
1864
|
-
const normalizedPattern = pattern.trim();
|
|
1865
|
-
const regexLiteral = parseRegexLiteral(normalizedPattern);
|
|
1866
|
-
if (regexLiteral) {
|
|
1867
|
-
const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
|
|
1868
|
-
return (value) => regex.test(value);
|
|
1869
|
-
}
|
|
1870
|
-
if (normalizedPattern.includes("*")) {
|
|
1871
|
-
const escaped = normalizedPattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
|
|
1872
|
-
const regex = new RegExp(`^${escaped}$`, "i");
|
|
1873
|
-
return (value) => regex.test(value);
|
|
1874
|
-
}
|
|
1875
|
-
return (value) => value.toLowerCase() === normalizedPattern.toLowerCase();
|
|
1994
|
+
return n;
|
|
1876
1995
|
}
|
|
1877
1996
|
function mergeRunnerOverrides(base, next) {
|
|
1878
1997
|
if (!base) {
|
|
@@ -1907,6 +2026,7 @@ var EffectRunner = class {
|
|
|
1907
2026
|
this.listeners = /* @__PURE__ */ new Set();
|
|
1908
2027
|
this.datasetsById = /* @__PURE__ */ new Map();
|
|
1909
2028
|
this.evaluatorsById = /* @__PURE__ */ new Map();
|
|
2029
|
+
this.runConfigsById = /* @__PURE__ */ new Map();
|
|
1910
2030
|
this.schedulerFiber = Effect.runFork(this.createSchedulerEffect());
|
|
1911
2031
|
this.persistenceFiber = Effect.runFork(
|
|
1912
2032
|
createPersistenceWorker(this.persistenceQueue)
|
|
@@ -1947,6 +2067,137 @@ var EffectRunner = class {
|
|
|
1947
2067
|
(item) => matcher(item.evaluator.getName() ?? "")
|
|
1948
2068
|
);
|
|
1949
2069
|
}
|
|
2070
|
+
async collectRunConfigs() {
|
|
2071
|
+
const runConfigs = await collectRunConfigsFromFiles(this.config.discovery);
|
|
2072
|
+
this.runConfigsById.clear();
|
|
2073
|
+
const byNameLower = /* @__PURE__ */ new Map();
|
|
2074
|
+
for (const item of runConfigs) {
|
|
2075
|
+
const id = item.runConfig.getName();
|
|
2076
|
+
const lower = id.toLowerCase();
|
|
2077
|
+
const prev = byNameLower.get(lower);
|
|
2078
|
+
if (prev !== void 0 && prev.filePath !== item.filePath) {
|
|
2079
|
+
throw new Error(
|
|
2080
|
+
`Duplicate RunConfig name "${id}" (matches "${prev.runConfig.getName()}" case-insensitively): ${prev.filePath} and ${item.filePath}`
|
|
2081
|
+
);
|
|
2082
|
+
}
|
|
2083
|
+
byNameLower.set(lower, item);
|
|
2084
|
+
this.runConfigsById.set(id, item);
|
|
2085
|
+
}
|
|
2086
|
+
return runConfigs;
|
|
2087
|
+
}
|
|
2088
|
+
async resolveRunConfigByName(name) {
|
|
2089
|
+
if (this.runConfigsById.size === 0) {
|
|
2090
|
+
await this.collectRunConfigs();
|
|
2091
|
+
}
|
|
2092
|
+
const key = validateRunConfigName(name, `RunConfig "${name.trim()}"`);
|
|
2093
|
+
const keyLower = key.toLowerCase();
|
|
2094
|
+
const matches = Array.from(this.runConfigsById.values()).filter(
|
|
2095
|
+
(item) => item.runConfig.getName().toLowerCase() === keyLower
|
|
2096
|
+
);
|
|
2097
|
+
if (matches.length === 0) {
|
|
2098
|
+
return void 0;
|
|
2099
|
+
}
|
|
2100
|
+
if (matches.length > 1) {
|
|
2101
|
+
throw new Error(
|
|
2102
|
+
`Multiple RunConfigs named "${name}": ${matches.map((m) => m.filePath).join(", ")}`
|
|
2103
|
+
);
|
|
2104
|
+
}
|
|
2105
|
+
return matches[0];
|
|
2106
|
+
}
|
|
2107
|
+
async expandRunConfigToJobs(collected) {
|
|
2108
|
+
if (this.datasetsById.size === 0) {
|
|
2109
|
+
await this.collectDatasets();
|
|
2110
|
+
}
|
|
2111
|
+
if (this.evaluatorsById.size === 0) {
|
|
2112
|
+
await this.collectEvaluators();
|
|
2113
|
+
}
|
|
2114
|
+
const rcName = collected.runConfig.getName();
|
|
2115
|
+
const jobs = [];
|
|
2116
|
+
const runs = collected.runConfig.getRuns();
|
|
2117
|
+
for (const [i, row] of runs.entries()) {
|
|
2118
|
+
const dsCollected = Array.from(this.datasetsById.values()).find(
|
|
2119
|
+
(d) => d.dataset === row.dataset
|
|
2120
|
+
);
|
|
2121
|
+
if (!dsCollected) {
|
|
2122
|
+
throw new Error(
|
|
2123
|
+
`RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getDisplayLabel()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
|
|
2124
|
+
);
|
|
2125
|
+
}
|
|
2126
|
+
let evaluatorIds;
|
|
2127
|
+
if ("evaluatorPattern" in row && typeof row.evaluatorPattern === "string") {
|
|
2128
|
+
const matcher = createNameMatcher(row.evaluatorPattern);
|
|
2129
|
+
const matched = Array.from(this.evaluatorsById.values()).filter(
|
|
2130
|
+
(item) => matcher(item.evaluator.getName() ?? "")
|
|
2131
|
+
);
|
|
2132
|
+
if (matched.length === 0) {
|
|
2133
|
+
throw new Error(
|
|
2134
|
+
`RunConfig "${rcName}" run[${i}]: no evaluator matched pattern "${row.evaluatorPattern}"`
|
|
2135
|
+
);
|
|
2136
|
+
}
|
|
2137
|
+
evaluatorIds = matched.map((item) => item.id);
|
|
2138
|
+
} else {
|
|
2139
|
+
const evaluators = row.evaluators;
|
|
2140
|
+
evaluatorIds = [];
|
|
2141
|
+
for (const ev of evaluators) {
|
|
2142
|
+
const found = Array.from(this.evaluatorsById.values()).find(
|
|
2143
|
+
(item) => item.evaluator === ev
|
|
2144
|
+
);
|
|
2145
|
+
if (!found) {
|
|
2146
|
+
throw new Error(
|
|
2147
|
+
`RunConfig "${rcName}" run[${i}]: evaluator "${getEvaluatorDisplayLabel(ev) ?? "unknown"}" was not found among discovered evaluator exports`
|
|
2148
|
+
);
|
|
2149
|
+
}
|
|
2150
|
+
evaluatorIds.push(found.id);
|
|
2151
|
+
}
|
|
2152
|
+
}
|
|
2153
|
+
const repetitions = "repetitions" in row && row.repetitions !== void 0 ? row.repetitions : 1;
|
|
2154
|
+
jobs.push({
|
|
2155
|
+
datasetId: dsCollected.id,
|
|
2156
|
+
evaluatorIds,
|
|
2157
|
+
runConfigName: rcName,
|
|
2158
|
+
runConfigDisplayLabel: collected.runConfig.getDisplayLabel(),
|
|
2159
|
+
runConfigTags: collected.runConfig.getTags(),
|
|
2160
|
+
repetitions
|
|
2161
|
+
});
|
|
2162
|
+
}
|
|
2163
|
+
return jobs;
|
|
2164
|
+
}
|
|
2165
|
+
async expandRunConfigNamesToJobs(names) {
|
|
2166
|
+
const jobs = [];
|
|
2167
|
+
for (const name of names) {
|
|
2168
|
+
const collected = await this.resolveRunConfigByName(name);
|
|
2169
|
+
if (!collected) {
|
|
2170
|
+
const known = await this.collectRunConfigs();
|
|
2171
|
+
const available = known.map((r) => r.runConfig.getName()).sort();
|
|
2172
|
+
throw new Error(
|
|
2173
|
+
available.length > 0 ? `RunConfig "${name}" not found. Available RunConfigs: ${available.join(", ")}` : `RunConfig "${name}" not found and no RunConfigs were discovered.`
|
|
2174
|
+
);
|
|
2175
|
+
}
|
|
2176
|
+
jobs.push(...await this.expandRunConfigToJobs(collected));
|
|
2177
|
+
}
|
|
2178
|
+
return jobs;
|
|
2179
|
+
}
|
|
2180
|
+
async runDatasetJobsWithSharedConcurrency(request) {
|
|
2181
|
+
const globalConcurrency = Math.max(1, request.globalConcurrency);
|
|
2182
|
+
const sem = Effect.unsafeMakeSemaphore(globalConcurrency);
|
|
2183
|
+
const triggerId = request.triggerId ?? `trg-${randomUUID()}`;
|
|
2184
|
+
const snapshots = [];
|
|
2185
|
+
for (const job of request.jobs) {
|
|
2186
|
+
snapshots.push(
|
|
2187
|
+
await this.startDatasetRun({
|
|
2188
|
+
datasetId: job.datasetId,
|
|
2189
|
+
evaluatorIds: job.evaluatorIds,
|
|
2190
|
+
triggerId,
|
|
2191
|
+
maxConcurrency: this.config.maxConcurrency ?? 1,
|
|
2192
|
+
globalEvaluationSemaphore: sem,
|
|
2193
|
+
runConfigName: job.runConfigName,
|
|
2194
|
+
runConfigTags: job.runConfigTags,
|
|
2195
|
+
repetitions: job.repetitions
|
|
2196
|
+
})
|
|
2197
|
+
);
|
|
2198
|
+
}
|
|
2199
|
+
return snapshots;
|
|
2200
|
+
}
|
|
1950
2201
|
async searchTestCases(query) {
|
|
1951
2202
|
const testCases = await collectTestCasesFromFiles(this.config.discovery);
|
|
1952
2203
|
return searchCollectedTestCases(testCases, query);
|
|
@@ -1965,36 +2216,46 @@ var EffectRunner = class {
|
|
|
1965
2216
|
);
|
|
1966
2217
|
}
|
|
1967
2218
|
async runDatasetWith(request) {
|
|
2219
|
+
const runConfigName = validateRunConfigName(
|
|
2220
|
+
request.runConfigName,
|
|
2221
|
+
"runDatasetWith.runConfigName"
|
|
2222
|
+
);
|
|
2223
|
+
return this.startDatasetRun({
|
|
2224
|
+
datasetId: request.datasetId,
|
|
2225
|
+
evaluatorIds: request.evaluatorIds,
|
|
2226
|
+
triggerId: request.triggerId,
|
|
2227
|
+
maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
|
|
2228
|
+
repetitions: request.repetitions,
|
|
2229
|
+
runConfigName,
|
|
2230
|
+
runConfigTags: request.runConfigTags
|
|
2231
|
+
});
|
|
2232
|
+
}
|
|
2233
|
+
async startDatasetRun(params) {
|
|
1968
2234
|
if (this.datasetsById.size === 0) {
|
|
1969
2235
|
await this.collectDatasets();
|
|
1970
2236
|
}
|
|
1971
2237
|
if (this.evaluatorsById.size === 0) {
|
|
1972
2238
|
await this.collectEvaluators();
|
|
1973
2239
|
}
|
|
1974
|
-
const dataset = this.datasetsById.get(
|
|
2240
|
+
const dataset = this.datasetsById.get(params.datasetId);
|
|
1975
2241
|
if (!dataset) {
|
|
1976
|
-
throw new Error(`Unknown dataset: ${
|
|
2242
|
+
throw new Error(`Unknown dataset: ${params.datasetId}`);
|
|
1977
2243
|
}
|
|
1978
|
-
const selectedEvaluators =
|
|
2244
|
+
const selectedEvaluators = params.evaluatorIds.map((id) => this.evaluatorsById.get(id)).filter((value) => Boolean(value)).map((value) => ({ id: value.id, evaluator: value.evaluator }));
|
|
1979
2245
|
if (selectedEvaluators.length === 0) {
|
|
1980
2246
|
throw new Error("No evaluators selected for run");
|
|
1981
2247
|
}
|
|
1982
|
-
const selectedTestCases = await this.collectDatasetTestCases(
|
|
1983
|
-
const
|
|
1984
|
-
|
|
1985
|
-
|
|
1986
|
-
)
|
|
1987
|
-
const triggerId = request.triggerId ?? `trg-${randomUUID()}`;
|
|
2248
|
+
const selectedTestCases = await this.collectDatasetTestCases(params.datasetId);
|
|
2249
|
+
const repetitions = normalizeRunRepetitions(params.repetitions);
|
|
2250
|
+
const totalEvaluations = selectedTestCases.length * repetitions;
|
|
2251
|
+
const runConfigTags = [...params.runConfigTags ?? []];
|
|
2252
|
+
const triggerId = params.triggerId ?? `trg-${randomUUID()}`;
|
|
1988
2253
|
const runId = `run-${randomUUID()}`;
|
|
1989
|
-
const artifactPath = createArtifactPath(
|
|
1990
|
-
this.config.artifactDirectory,
|
|
1991
|
-
request.datasetId,
|
|
1992
|
-
runId
|
|
1993
|
-
);
|
|
2254
|
+
const artifactPath = createArtifactPath(this.config.artifactDirectory, params.datasetId, runId);
|
|
1994
2255
|
const snapshot = {
|
|
1995
2256
|
runId,
|
|
1996
|
-
datasetId:
|
|
1997
|
-
datasetName: dataset.dataset.
|
|
2257
|
+
datasetId: params.datasetId,
|
|
2258
|
+
datasetName: dataset.dataset.getDisplayLabel(),
|
|
1998
2259
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
1999
2260
|
queuedAt: Date.now(),
|
|
2000
2261
|
totalTestCases: totalEvaluations,
|
|
@@ -2014,8 +2275,8 @@ var EffectRunner = class {
|
|
|
2014
2275
|
const queuedEvent = {
|
|
2015
2276
|
type: "RunQueued",
|
|
2016
2277
|
runId,
|
|
2017
|
-
datasetId:
|
|
2018
|
-
datasetName: dataset.dataset.
|
|
2278
|
+
datasetId: params.datasetId,
|
|
2279
|
+
datasetName: dataset.dataset.getDisplayLabel(),
|
|
2019
2280
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
2020
2281
|
totalTestCases: totalEvaluations,
|
|
2021
2282
|
artifactPath
|
|
@@ -2028,17 +2289,20 @@ var EffectRunner = class {
|
|
|
2028
2289
|
payload: queuedEvent
|
|
2029
2290
|
})
|
|
2030
2291
|
);
|
|
2031
|
-
const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
|
|
2032
2292
|
await Effect.runPromise(
|
|
2033
2293
|
Queue.offer(this.runQueue, {
|
|
2034
2294
|
runId,
|
|
2035
2295
|
triggerId,
|
|
2036
|
-
datasetId:
|
|
2296
|
+
datasetId: params.datasetId,
|
|
2037
2297
|
dataset: dataset.dataset,
|
|
2038
2298
|
evaluators: selectedEvaluators,
|
|
2039
2299
|
testCases: selectedTestCases,
|
|
2040
2300
|
snapshot,
|
|
2041
|
-
maxConcurrency
|
|
2301
|
+
maxConcurrency: params.maxConcurrency,
|
|
2302
|
+
globalEvaluationSemaphore: params.globalEvaluationSemaphore,
|
|
2303
|
+
runConfigName: params.runConfigName,
|
|
2304
|
+
runConfigTags,
|
|
2305
|
+
repetitions
|
|
2042
2306
|
})
|
|
2043
2307
|
);
|
|
2044
2308
|
return snapshot;
|
|
@@ -2109,6 +2373,11 @@ var EffectRunner = class {
|
|
|
2109
2373
|
);
|
|
2110
2374
|
}
|
|
2111
2375
|
};
|
|
2376
|
+
|
|
2377
|
+
// src/runner/events.ts
|
|
2378
|
+
var PROGRAMMATIC_RUN_CONFIG = {
|
|
2379
|
+
runConfigName: "programmatic"
|
|
2380
|
+
};
|
|
2112
2381
|
var LEFT_PANE_WIDTH2 = 44;
|
|
2113
2382
|
var MAX_RUNS_FOR_CHART = 12;
|
|
2114
2383
|
var MAX_RUNS_FOR_TREND = 20;
|
|
@@ -2456,7 +2725,7 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
2456
2725
|
rows.push(/* @__PURE__ */ jsx(Text, { children: " " }, "sp6"));
|
|
2457
2726
|
rows.push(/* @__PURE__ */ jsx(SectionHeader, { children: "Test cases" }, "tc-h"));
|
|
2458
2727
|
for (const tc of testCases) {
|
|
2459
|
-
const
|
|
2728
|
+
const repetitionPart = tc.repetitionCount != null && tc.repetitionCount > 1 && tc.repetitionIndex != null ? ` (${tc.repetitionIndex}/${tc.repetitionCount})` : "";
|
|
2460
2729
|
rows.push(
|
|
2461
2730
|
/* @__PURE__ */ jsxs(Text, { children: [
|
|
2462
2731
|
/* @__PURE__ */ jsxs(Text, { color: "cyan", children: [
|
|
@@ -2468,13 +2737,13 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
2468
2737
|
] }),
|
|
2469
2738
|
" ",
|
|
2470
2739
|
tc.testCaseName,
|
|
2471
|
-
|
|
2740
|
+
repetitionPart ? /* @__PURE__ */ jsx(Text, { color: "cyan", children: repetitionPart }) : null,
|
|
2472
2741
|
/* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
2473
2742
|
" (",
|
|
2474
2743
|
tc.durationMs,
|
|
2475
2744
|
"ms)"
|
|
2476
2745
|
] })
|
|
2477
|
-
] }, `tc-${tc.testCaseId}-${tc.
|
|
2746
|
+
] }, `tc-${tc.testCaseId}-${tc.repetitionId ?? "x"}-${tc.repetitionIndex ?? 0}`)
|
|
2478
2747
|
);
|
|
2479
2748
|
for (const item of tc.evaluatorScores) {
|
|
2480
2749
|
const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
|
|
@@ -2800,7 +3069,8 @@ function EvalsCliApp({ data, args, runner }) {
|
|
|
2800
3069
|
}
|
|
2801
3070
|
void runner.runDatasetWith({
|
|
2802
3071
|
datasetId: selectedDataset.id,
|
|
2803
|
-
evaluatorIds: clampedState.selectedEvaluatorIds
|
|
3072
|
+
evaluatorIds: clampedState.selectedEvaluatorIds,
|
|
3073
|
+
...PROGRAMMATIC_RUN_CONFIG
|
|
2804
3074
|
}).then((snapshot) => {
|
|
2805
3075
|
setRuntimeMessage(
|
|
2806
3076
|
`Started ${snapshot.runId} on ${selectedDataset.name} (${snapshot.totalTestCases} cases).`
|