@m4trix/evals 0.25.1 → 0.26.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +29 -7
- package/dist/cli-simple.cjs +831 -450
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +832 -451
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +531 -270
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +531 -270
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +888 -509
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +201 -7
- package/dist/index.js +878 -513
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli.js
CHANGED
|
@@ -3,16 +3,16 @@ import { withFullScreen, useScreenSize } from 'fullscreen-ink';
|
|
|
3
3
|
import React, { useState, useRef, useReducer, useEffect, useMemo } from 'react';
|
|
4
4
|
import { useApp, useInput, Box, Text } from 'ink';
|
|
5
5
|
import { jsx, jsxs, Fragment } from 'react/jsx-runtime';
|
|
6
|
-
import {
|
|
7
|
-
import { LineGraph } from '@pppp606/ink-chart';
|
|
6
|
+
import { Schema, Effect, PubSub, Queue, Ref, Fiber, Either, ParseResult } from 'effect';
|
|
8
7
|
import { randomUUID } from 'crypto';
|
|
9
|
-
import {
|
|
8
|
+
import { readdir, readFile, mkdir, appendFile } from 'fs/promises';
|
|
9
|
+
import { resolve, join, relative, dirname } from 'path';
|
|
10
10
|
import { existsSync } from 'fs';
|
|
11
11
|
import * as jitiModule from 'jiti';
|
|
12
|
-
import { readdir, readFile, mkdir, appendFile } from 'fs/promises';
|
|
13
12
|
import { pathToFileURL } from 'url';
|
|
14
13
|
import { diffLines } from 'diff';
|
|
15
14
|
import stringify from 'fast-json-stable-stringify';
|
|
15
|
+
import { LineGraph } from '@pppp606/ink-chart';
|
|
16
16
|
|
|
17
17
|
var SEP = " ";
|
|
18
18
|
var ARROW = "\u203A";
|
|
@@ -237,6 +237,50 @@ function isPrintableCharacter(input) {
|
|
|
237
237
|
function isBackKey(key) {
|
|
238
238
|
return key.backspace || key.delete;
|
|
239
239
|
}
|
|
240
|
+
// Identifiers may contain only ASCII letters, digits, underscores and hyphens.
var ENTITY_ID_PATTERN = /^[a-zA-Z0-9_-]+$/;

/**
 * Builds a branded string schema for an entity identifier.
 *
 * The schema is `Schema.String` piped through, in order: a trimmed check,
 * a minimum length of 1, the `ENTITY_ID_PATTERN` check, and the brand.
 *
 * @param {string} brand - Brand name handed to `Schema.brand`.
 * @param {string} label - Entity label used as the prefix of both validation messages.
 * @returns The composed schema.
 */
function makeEntityIdSchema(brand, label) {
  const nonEmptyMessage = () => `${label} must be non-empty.`;
  const patternMessage = () => `${label} may only contain letters, digits, underscores, and hyphens (no spaces). Examples: "my-nightly", "my_nightly", "myNightly".`;
  return Schema.String.pipe(
    Schema.trimmed(),
    Schema.minLength(1, { message: nonEmptyMessage }),
    Schema.pattern(ENTITY_ID_PATTERN, { message: patternMessage }),
    Schema.brand(brand)
  );
}

var RunConfigNameSchema = makeEntityIdSchema("RunConfigName", "RunConfig name");
// The two schemas below are built but their results are not kept in this bundle.
makeEntityIdSchema("EvaluatorName", "Evaluator name");
makeEntityIdSchema("TestCaseName", "Test case name");
|
|
256
|
+
/**
 * Decodes `raw` with `schema` and returns the decoded value.
 *
 * String input is trimmed before decoding. Non-string input is passed to the
 * decoder untouched, so the schema itself decides whether to reject it and the
 * failure is reported through the same `${context}: …` error as any other
 * validation problem (instead of a bare TypeError from calling `.trim()`).
 *
 * @param schema - Schema passed to `Schema.decodeUnknownEither`.
 * @param {unknown} raw - Candidate value, normally a string.
 * @param {string} context - Prefix for the thrown error message.
 * @returns The decoded (right) value.
 * @throws {Error} When decoding fails; the message is the context followed by
 *   the tree-formatted parse error.
 */
function validateWithSchema(schema, raw, context) {
  const candidate = typeof raw === "string" ? raw.trim() : raw;
  const outcome = Schema.decodeUnknownEither(schema)(candidate);
  if (Either.isLeft(outcome)) {
    throw new Error(`${context}: ${ParseResult.TreeFormatter.formatErrorSync(outcome.left)}`);
  }
  return outcome.right;
}
|
|
267
|
+
/**
 * Validates a run-config name against `RunConfigNameSchema`.
 *
 * @param raw - Candidate name.
 * @param {string} context - Prefix used in the error thrown on invalid input.
 * @returns Whatever `validateWithSchema` returns for this schema.
 */
function validateRunConfigName(raw, context) {
  const validated = validateWithSchema(RunConfigNameSchema, raw, context);
  return validated;
}
|
|
270
|
+
|
|
271
|
+
// src/evals/evaluator.ts
|
|
272
|
+
/**
 * Picks the label to show for an evaluator.
 *
 * Uses `getDisplayLabel()` when that method exists and returns anything other
 * than `undefined`; otherwise uses `getName()` when that method exists.
 *
 * @param evaluator - Object that may expose `getDisplayLabel` and/or `getName`.
 * @returns The chosen label, or `undefined` when neither source yields one.
 */
function getEvaluatorDisplayLabel(evaluator) {
  const displayLabel = typeof evaluator.getDisplayLabel === "function" ? evaluator.getDisplayLabel() : void 0;
  if (displayLabel !== void 0) {
    return displayLabel;
  }
  if (typeof evaluator.getName !== "function") {
    return void 0;
  }
  return evaluator.getName();
}
|
|
281
|
+
/**
 * Returns the evaluator's tags as a fresh array.
 *
 * @param evaluator - Object that may expose `getTags()`.
 * @returns A copy of the tags; an empty array when `getTags` is missing or
 *   returns `null`/`undefined` (spreading a nullish value would throw).
 */
function getEvaluatorTagList(evaluator) {
  if (typeof evaluator.getTags !== "function") {
    return [];
  }
  const tags = evaluator.getTags();
  return tags == null ? [] : [...tags];
}
|
|
240
284
|
|
|
241
285
|
// src/cli/data.mock.json
|
|
242
286
|
var data_mock_default = {
|
|
@@ -492,7 +536,7 @@ function toEvalDataset(item, snapshots) {
|
|
|
492
536
|
/**
 * Maps a collected evaluator entry to the option shape used by the CLI.
 *
 * @param item - Entry with `id`, `filePath` and `evaluator`.
 * @returns `{ id, name, configPreview }`, where `name` is the evaluator's
 *   display label or, when that is nullish, `toSlug(item.id)`.
 */
function toEvaluatorOption(item) {
  const { id, filePath, evaluator } = item;
  const label = getEvaluatorDisplayLabel(evaluator);
  return {
    id,
    name: label ?? toSlug(id),
    configPreview: `Source: ${filePath}`
  };
}
|
|
@@ -735,6 +779,159 @@ function reduceCliState(state, action) {
|
|
|
735
779
|
}
|
|
736
780
|
return state;
|
|
737
781
|
}
|
|
782
|
+
/**
 * Builds run snapshots from every `.jsonl` file directly inside the artifact directory.
 *
 * Best-effort: an unreadable directory yields an empty list, and any artifact
 * whose parsing throws is skipped.
 *
 * @param config - Runner config; `artifactDirectory` is resolved to an absolute path.
 * @returns Snapshots ordered by `queuedAt`, newest first.
 */
async function loadRunSnapshotsFromArtifacts(config) {
  const artifactRoot = resolve(config.artifactDirectory);
  let names;
  try {
    names = await readdir(artifactRoot);
  } catch {
    // Directory missing or unreadable: nothing to report.
    return [];
  }
  const collected = [];
  for (const name of names) {
    if (!name.endsWith(".jsonl")) {
      continue;
    }
    try {
      const snapshot = await parseArtifactToSnapshot(join(artifactRoot, name), config);
      if (snapshot) {
        collected.push(snapshot);
      }
    } catch {
      // A single bad artifact must not hide the others.
    }
  }
  collected.sort((left, right) => right.queuedAt - left.queuedAt);
  return collected;
}
|
|
804
|
+
/**
 * Reads one JSONL run artifact and condenses its lifecycle events into a snapshot.
 *
 * Each non-blank line is parsed as JSON; lines that fail to parse are ignored.
 * For each of the `RunQueued`, `RunStarted`, `RunCompleted` and `RunFailed`
 * types, the last event seen wins.
 *
 * Status precedence is failed > completed > running > queued. When a
 * `RunCompleted` event exists, the completed count is the queued total and the
 * pass/fail counts come from that event; otherwise all three come from the
 * `TestCaseProgress` events.
 *
 * @param {string} filePath - Artifact path; also reported as the snapshot's `artifactPath`.
 * @param _config - Unused.
 * @returns The snapshot, or `null` when the file has no non-blank lines or no `RunQueued` event.
 */
async function parseArtifactToSnapshot(filePath, _config) {
  const text = await readFile(filePath, "utf8");
  const lines = text.split("\n").filter((line) => line.trim().length > 0);
  if (lines.length === 0) {
    return null;
  }
  let queued = null;
  let started = null;
  let completed = null;
  let failed = null;
  for (const line of lines) {
    try {
      const event = JSON.parse(line);
      switch (event.type) {
        case "RunQueued":
          queued = event;
          break;
        case "RunStarted":
          started = event;
          break;
        case "RunCompleted":
          completed = event;
          break;
        case "RunFailed":
          failed = event;
          break;
        default:
          break;
      }
    } catch {
      // Malformed line (or a JSON `null`): ignore it and keep scanning.
    }
  }
  if (!queued) {
    return null;
  }
  let status = "queued";
  if (failed) {
    status = "failed";
  } else if (completed) {
    status = "completed";
  } else if (started) {
    status = "running";
  }
  const totalTestCases = queued.totalTestCases ?? 0;
  const progress = aggregateTestCaseProgress(lines);
  return {
    runId: queued.runId,
    datasetId: queued.datasetId,
    datasetName: queued.datasetName,
    evaluatorIds: queued.evaluatorIds,
    queuedAt: queued.ts ?? 0,
    startedAt: started?.startedAt,
    finishedAt: completed?.finishedAt ?? failed?.finishedAt,
    totalTestCases,
    completedTestCases: completed ? totalTestCases : progress.completedTestCases,
    passedTestCases: completed?.passedTestCases ?? progress.passedTestCases,
    failedTestCases: completed?.failedTestCases ?? progress.failedTestCases,
    status,
    artifactPath: filePath,
    errorMessage: failed?.errorMessage
  };
}
|
|
875
|
+
/**
 * Summarises the `TestCaseProgress` events found in a list of JSONL lines.
 *
 * `completedTestCases` is taken from the last progress event that carries one.
 * Verdicts are AND-ed per `testCaseId`, so an id counts as passed only when
 * every progress event recorded for it passed.
 *
 * @param {string[]} lines - Raw JSON lines; unparsable lines are ignored.
 * @returns `{ completedTestCases, passedTestCases, failedTestCases }`.
 */
function aggregateTestCaseProgress(lines) {
  let completedTestCases = 0;
  const verdictById = /* @__PURE__ */ new Map();
  for (const line of lines) {
    try {
      const event = JSON.parse(line);
      if (event.type !== "TestCaseProgress") {
        continue;
      }
      completedTestCases = event.completedTestCases ?? completedTestCases;
      const previous = verdictById.get(event.testCaseId);
      verdictById.set(event.testCaseId, previous === void 0 ? event.passed : previous && event.passed);
    } catch {
      // Not valid JSON (or not an object): skip the line.
    }
  }
  const verdicts = [...verdictById.values()];
  const passedTestCases = verdicts.filter((verdict) => Boolean(verdict)).length;
  const failedTestCases = verdicts.length - passedTestCases;
  return { completedTestCases, passedTestCases, failedTestCases };
}
|
|
902
|
+
/**
 * Extracts every `TestCaseProgress` event from a JSONL artifact file.
 *
 * Repetition fields fall back to the `rerunIndex` / `rerunTotal` keys when
 * `repetitionIndex` / `repetitionCount` are absent. Missing `evaluatorScores`
 * become an empty array.
 *
 * @param {string} artifactPath - Path of the artifact to read.
 * @returns One result object per progress event, in file order; an empty array
 *   when the file cannot be read. Lines that fail to parse are skipped.
 */
async function parseArtifactFile(artifactPath) {
  // Turns one line into zero or one result so the caller can flatMap over lines.
  const toResults = (line) => {
    try {
      const event = JSON.parse(line);
      if (event.type !== "TestCaseProgress") {
        return [];
      }
      return [
        {
          testCaseId: event.testCaseId,
          testCaseName: event.testCaseName,
          completedTestCases: event.completedTestCases,
          totalTestCases: event.totalTestCases,
          repetitionId: event.repetitionId,
          repetitionIndex: event.repetitionIndex ?? event.rerunIndex,
          repetitionCount: event.repetitionCount ?? event.rerunTotal,
          passed: event.passed,
          durationMs: event.durationMs,
          evaluatorScores: event.evaluatorScores ?? []
        }
      ];
    } catch {
      return [];
    }
  };
  try {
    const text = await readFile(artifactPath, "utf8");
    return text
      .split("\n")
      .filter((line) => line.trim().length > 0)
      .flatMap(toResults);
  } catch {
    return [];
  }
}
|
|
738
935
|
|
|
739
936
|
// src/runner/config.ts
|
|
740
937
|
var defaultRunnerConfig = {
|
|
@@ -742,6 +939,7 @@ var defaultRunnerConfig = {
|
|
|
742
939
|
rootDir: process.cwd(),
|
|
743
940
|
datasetSuffixes: [".dataset.ts", ".dataset.tsx", ".dataset.js", ".dataset.mjs"],
|
|
744
941
|
evaluatorSuffixes: [".evaluator.ts", ".evaluator.tsx", ".evaluator.js", ".evaluator.mjs"],
|
|
942
|
+
runConfigSuffixes: [".run-config.ts", ".run-config.tsx", ".run-config.js", ".run-config.mjs"],
|
|
745
943
|
testCaseSuffixes: [".test-case.ts", ".test-case.tsx", ".test-case.js", ".test-case.mjs"],
|
|
746
944
|
excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
|
|
747
945
|
},
|
|
@@ -767,6 +965,11 @@ function toRunnerConfigOverrides(config) {
|
|
|
767
965
|
} else if (rawDiscovery?.evaluatorSuffixes !== void 0) {
|
|
768
966
|
discovery.evaluatorSuffixes = rawDiscovery.evaluatorSuffixes;
|
|
769
967
|
}
|
|
968
|
+
if (rawDiscovery?.runConfigFilePatterns !== void 0) {
|
|
969
|
+
discovery.runConfigSuffixes = rawDiscovery.runConfigFilePatterns;
|
|
970
|
+
} else if (rawDiscovery?.runConfigSuffixes !== void 0) {
|
|
971
|
+
discovery.runConfigSuffixes = rawDiscovery.runConfigSuffixes;
|
|
972
|
+
}
|
|
770
973
|
if (rawDiscovery?.testCaseFilePatterns !== void 0) {
|
|
771
974
|
discovery.testCaseSuffixes = rawDiscovery.testCaseFilePatterns;
|
|
772
975
|
} else if (rawDiscovery?.testCaseSuffixes !== void 0) {
|
|
@@ -865,6 +1068,9 @@ function isDatasetLike(value) {
|
|
|
865
1068
|
/**
 * Duck-type check for an evaluator export.
 *
 * @param value - Candidate module export.
 * @returns `true` when `hasMethod` succeeds for `getName`, `resolveContext` and `getEvaluateFn`.
 */
function isEvaluatorLike(value) {
  const requiredMethods = ["getName", "resolveContext", "getEvaluateFn"];
  return requiredMethods.every((methodName) => hasMethod(value, methodName));
}
|
|
1071
|
+
/**
 * Duck-type check for a run-config export.
 *
 * @param value - Candidate module export.
 * @returns `true` when `hasMethod` succeeds for `getName` and `getRuns`, and
 *   `value.getRuns` is a function.
 */
function isRunConfigLike(value) {
  const requiredMethods = ["getName", "getRuns"];
  if (!requiredMethods.every((methodName) => hasMethod(value, methodName))) {
    return false;
  }
  return typeof value.getRuns === "function";
}
|
|
868
1074
|
/**
 * Duck-type check for a test-case export.
 *
 * @param value - Candidate module export.
 * @returns `true` when `hasMethod` succeeds for `getName`, `getTags` and `getInput`.
 */
function isTestCaseLike(value) {
  const requiredMethods = ["getName", "getTags", "getInput"];
  return requiredMethods.every((methodName) => hasMethod(value, methodName));
}
|
|
@@ -953,6 +1159,23 @@ async function collectEvaluatorsFromFiles(config) {
|
|
|
953
1159
|
);
|
|
954
1160
|
return found.flat();
|
|
955
1161
|
}
|
|
1162
|
+
/**
 * Discovers run configs exported by files under the configured root directory.
 *
 * Walks `config.rootDir` (passing `config.excludeDirectories` to the walker),
 * keeps files matching one of `config.runConfigSuffixes`, loads each module's
 * exports concurrently, and keeps the exports that pass `isRunConfigLike`.
 *
 * @param config - Discovery settings: `rootDir`, `excludeDirectories`, `runConfigSuffixes`.
 * @returns A flat list of `{ id, filePath, runConfig }`, where `id` is
 *   `runConfig.getName()` and `filePath` is relative to `rootDir`.
 */
async function collectRunConfigsFromFiles(config) {
  const { rootDir, excludeDirectories, runConfigSuffixes } = config;
  const allFiles = await walkDirectory(rootDir, excludeDirectories);
  const candidateFiles = allFiles.filter((candidate) => hasOneSuffix(candidate, runConfigSuffixes));
  const loadFromFile = async (absolutePath) => {
    const moduleExports = await loadModuleExports(absolutePath);
    const filePath = relative(rootDir, absolutePath);
    return moduleExports.filter(isRunConfigLike).map((runConfig) => ({
      id: runConfig.getName(),
      filePath,
      runConfig
    }));
  };
  const perFile = await Promise.all(candidateFiles.map((absolutePath) => loadFromFile(absolutePath)));
  return perFile.flat();
}
|
|
956
1179
|
async function collectTestCasesFromFiles(config) {
|
|
957
1180
|
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
958
1181
|
const matched = files.filter((filePath) => hasOneSuffix(filePath, config.testCaseSuffixes));
|
|
@@ -1106,6 +1329,17 @@ function getDiffLines(entry) {
|
|
|
1106
1329
|
});
|
|
1107
1330
|
}
|
|
1108
1331
|
|
|
1332
|
+
// src/evals/test-case.ts
|
|
1333
|
+
/**
 * Picks the label to show for a test case.
 *
 * Mirrors `getEvaluatorDisplayLabel`: `getDisplayLabel()` is used when the
 * method exists and returns anything other than `undefined`; otherwise the
 * label falls back to `getName()`, so an object whose display label is unset
 * still gets a usable name.
 *
 * @param testCase - Object that may expose `getDisplayLabel` and/or `getName`.
 * @returns The chosen label, or `""` when neither source yields one.
 */
function getTestCaseDisplayLabel(testCase) {
  if (typeof testCase.getDisplayLabel === "function") {
    const label = testCase.getDisplayLabel();
    if (label !== void 0) {
      return label;
    }
  }
  return typeof testCase.getName === "function" ? testCase.getName() : "";
}
|
|
1339
|
+
/**
 * Returns the test case's tags as a fresh array.
 *
 * @param testCase - Object that may expose `getTags()`.
 * @returns A copy of the tags; an empty array when `getTags` is missing or
 *   returns `null`/`undefined` (spreading a nullish value would throw).
 */
function getTestCaseTagList(testCase) {
  if (typeof testCase.getTags !== "function") {
    return [];
  }
  const tags = testCase.getTags();
  return tags == null ? [] : [...tags];
}
|
|
1342
|
+
|
|
1109
1343
|
// src/evals/metric.ts
|
|
1110
1344
|
var registry = /* @__PURE__ */ new Map();
|
|
1111
1345
|
var Metric = {
|
|
@@ -1129,6 +1363,54 @@ function getMetricById(id) {
|
|
|
1129
1363
|
return registry.get(id);
|
|
1130
1364
|
}
|
|
1131
1365
|
|
|
1366
|
+
// src/evals/aggregators.ts
|
|
1367
|
+
/**
 * Sums token-count records field by field.
 *
 * @param values - Array of records with optional `input`, `output`,
 *   `inputCached` and `outputCached` numbers; a nullish field counts as 0.
 * @returns A new record holding the four totals (all zero for an empty array).
 */
function aggregateTokenCountSum(values) {
  let input = 0;
  let output = 0;
  let inputCached = 0;
  let outputCached = 0;
  values.forEach((entry) => {
    input += entry.input ?? 0;
    output += entry.output ?? 0;
    inputCached += entry.inputCached ?? 0;
    outputCached += entry.outputCached ?? 0;
  });
  return { input, output, inputCached, outputCached };
}
|
|
1384
|
+
/**
 * Averages the `ms` field of latency records.
 *
 * @param values - Array of `{ ms }` records.
 * @returns `{ ms }` holding the arithmetic mean, or `{ ms: 0 }` for an empty array.
 */
function aggregateLatencyAverage(values) {
  const sampleCount = values.length;
  if (sampleCount === 0) {
    return { ms: 0 };
  }
  let total = 0;
  values.forEach((sample) => {
    total += sample.ms;
  });
  return { ms: total / sampleCount };
}
|
|
1391
|
+
|
|
1392
|
+
// src/evals/metrics/standard.ts
|
|
1393
|
+
// Token-count metric: aggregated by summing; the cached figure shown is
// inputCached + outputCached, and aggregated output is prefixed with "Total: ".
Metric.of({
  id: "token-count",
  name: "Tokens",
  aggregate: aggregateTokenCountSum,
  format: (data, options) => {
    const cached = (data.inputCached ?? 0) + (data.outputCached ?? 0);
    const summary = `in:${data.input ?? 0} out:${data.output ?? 0} cached:${cached}`;
    if (options?.isAggregated) {
      return `Total: ${summary}`;
    }
    return summary;
  }
});
// Latency metric: aggregated by averaging; aggregated output is prefixed with "Avg: ".
Metric.of({
  id: "latency",
  name: "Latency",
  aggregate: aggregateLatencyAverage,
  format: (data, options) => {
    const reading = `${data.ms}ms`;
    return options?.isAggregated ? `Avg: ${reading}` : reading;
  }
});
|
|
1413
|
+
|
|
1132
1414
|
// src/evals/score.ts
|
|
1133
1415
|
var registry2 = /* @__PURE__ */ new Map();
|
|
1134
1416
|
function formatScoreData(def, data, options) {
|
|
@@ -1237,54 +1519,6 @@ function getScoreById(id) {
|
|
|
1237
1519
|
return registry2.get(id);
|
|
1238
1520
|
}
|
|
1239
1521
|
|
|
1240
|
-
// src/evals/aggregators.ts
|
|
1241
|
-
function aggregateTokenCountSum(values) {
|
|
1242
|
-
const initial = {
|
|
1243
|
-
input: 0,
|
|
1244
|
-
output: 0,
|
|
1245
|
-
inputCached: 0,
|
|
1246
|
-
outputCached: 0
|
|
1247
|
-
};
|
|
1248
|
-
return values.reduce(
|
|
1249
|
-
(acc, v) => ({
|
|
1250
|
-
input: acc.input + (v.input ?? 0),
|
|
1251
|
-
output: acc.output + (v.output ?? 0),
|
|
1252
|
-
inputCached: acc.inputCached + (v.inputCached ?? 0),
|
|
1253
|
-
outputCached: acc.outputCached + (v.outputCached ?? 0)
|
|
1254
|
-
}),
|
|
1255
|
-
initial
|
|
1256
|
-
);
|
|
1257
|
-
}
|
|
1258
|
-
function aggregateLatencyAverage(values) {
|
|
1259
|
-
if (values.length === 0) {
|
|
1260
|
-
return { ms: 0 };
|
|
1261
|
-
}
|
|
1262
|
-
const sum = values.reduce((s, v) => s + v.ms, 0);
|
|
1263
|
-
return { ms: sum / values.length };
|
|
1264
|
-
}
|
|
1265
|
-
|
|
1266
|
-
// src/evals/metrics/standard.ts
|
|
1267
|
-
Metric.of({
|
|
1268
|
-
id: "token-count",
|
|
1269
|
-
name: "Tokens",
|
|
1270
|
-
aggregate: aggregateTokenCountSum,
|
|
1271
|
-
format: (data, options) => {
|
|
1272
|
-
const input = data.input ?? 0;
|
|
1273
|
-
const output = data.output ?? 0;
|
|
1274
|
-
const inputCached = data.inputCached ?? 0;
|
|
1275
|
-
const outputCached = data.outputCached ?? 0;
|
|
1276
|
-
const cached = inputCached + outputCached;
|
|
1277
|
-
const base = `in:${input} out:${output} cached:${cached}`;
|
|
1278
|
-
return options?.isAggregated ? `Total: ${base}` : base;
|
|
1279
|
-
}
|
|
1280
|
-
});
|
|
1281
|
-
Metric.of({
|
|
1282
|
-
id: "latency",
|
|
1283
|
-
name: "Latency",
|
|
1284
|
-
aggregate: aggregateLatencyAverage,
|
|
1285
|
-
format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
|
|
1286
|
-
});
|
|
1287
|
-
|
|
1288
1522
|
// src/evals/scores/standard.ts
|
|
1289
1523
|
Score.of({
|
|
1290
1524
|
id: "percent",
|
|
@@ -1391,15 +1625,17 @@ function readOutput(testCase) {
|
|
|
1391
1625
|
}
|
|
1392
1626
|
return candidate.getOutput();
|
|
1393
1627
|
}
|
|
1394
|
-
function buildEvaluationUnits(testCases) {
|
|
1628
|
+
function buildEvaluationUnits(testCases, repetitionCount) {
|
|
1629
|
+
const count = Math.max(1, repetitionCount);
|
|
1395
1630
|
const units = [];
|
|
1396
1631
|
for (const testCaseItem of testCases) {
|
|
1397
|
-
const
|
|
1398
|
-
for (let r = 0; r <
|
|
1632
|
+
const repetitionId = `rep-${randomUUID()}`;
|
|
1633
|
+
for (let r = 0; r < count; r++) {
|
|
1399
1634
|
units.push({
|
|
1400
1635
|
testCaseItem,
|
|
1401
|
-
|
|
1402
|
-
|
|
1636
|
+
repetitionId,
|
|
1637
|
+
repetitionIndex: r + 1,
|
|
1638
|
+
repetitionCount: count
|
|
1403
1639
|
});
|
|
1404
1640
|
}
|
|
1405
1641
|
}
|
|
@@ -1412,7 +1648,7 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
|
|
|
1412
1648
|
return join(artifactDirectory, `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`);
|
|
1413
1649
|
}
|
|
1414
1650
|
function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
|
|
1415
|
-
const { testCaseItem,
|
|
1651
|
+
const { testCaseItem, repetitionId, repetitionIndex, repetitionCount } = unit;
|
|
1416
1652
|
return Effect.gen(function* () {
|
|
1417
1653
|
const evaluatorRunId = `run-${randomUUID()}`;
|
|
1418
1654
|
const started = Date.now();
|
|
@@ -1421,11 +1657,12 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1421
1657
|
type: "TestCaseStarted",
|
|
1422
1658
|
runId: task.runId,
|
|
1423
1659
|
testCaseId: testCaseItem.id,
|
|
1424
|
-
testCaseName: testCaseItem.testCase
|
|
1660
|
+
testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
|
|
1425
1661
|
startedTestCases: startedEvaluations,
|
|
1426
1662
|
totalTestCases: totalEvaluations,
|
|
1427
|
-
|
|
1428
|
-
|
|
1663
|
+
repetitionId,
|
|
1664
|
+
repetitionIndex,
|
|
1665
|
+
repetitionCount
|
|
1429
1666
|
});
|
|
1430
1667
|
const evaluatorScores = [];
|
|
1431
1668
|
let testCaseError;
|
|
@@ -1459,8 +1696,15 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1459
1696
|
meta: {
|
|
1460
1697
|
triggerId: task.triggerId,
|
|
1461
1698
|
runId: evaluatorRunId,
|
|
1462
|
-
datasetId: task.datasetId
|
|
1699
|
+
datasetId: task.datasetId,
|
|
1700
|
+
repetitionId,
|
|
1701
|
+
repetitionIndex,
|
|
1702
|
+
repetitionCount,
|
|
1703
|
+
runConfigName: task.runConfigName
|
|
1463
1704
|
},
|
|
1705
|
+
testCaseTags: getTestCaseTagList(testCaseItem.testCase),
|
|
1706
|
+
runConfigTags: task.runConfigTags,
|
|
1707
|
+
evaluatorTags: getEvaluatorTagList(evaluator),
|
|
1464
1708
|
logDiff,
|
|
1465
1709
|
log,
|
|
1466
1710
|
createError
|
|
@@ -1503,18 +1747,19 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1503
1747
|
});
|
|
1504
1748
|
}
|
|
1505
1749
|
}
|
|
1506
|
-
const
|
|
1750
|
+
const repetitionPassedThis = evaluatorScores.every((s) => s.passed);
|
|
1507
1751
|
const completedEvaluations = yield* Ref.modify(completedRef, (n) => [n + 1, n + 1]);
|
|
1508
1752
|
const progressEvent = {
|
|
1509
1753
|
type: "TestCaseProgress",
|
|
1510
1754
|
runId: task.runId,
|
|
1511
1755
|
testCaseId: testCaseItem.id,
|
|
1512
|
-
testCaseName: testCaseItem.testCase
|
|
1756
|
+
testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
|
|
1513
1757
|
completedTestCases: completedEvaluations,
|
|
1514
1758
|
totalTestCases: totalEvaluations,
|
|
1515
|
-
|
|
1516
|
-
|
|
1517
|
-
|
|
1759
|
+
repetitionId,
|
|
1760
|
+
repetitionIndex,
|
|
1761
|
+
repetitionCount,
|
|
1762
|
+
passed: repetitionPassedThis,
|
|
1518
1763
|
durationMs: Date.now() - started,
|
|
1519
1764
|
evaluatorScores,
|
|
1520
1765
|
output,
|
|
@@ -1535,9 +1780,9 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1535
1780
|
(map) => {
|
|
1536
1781
|
const key = testCaseItem.id;
|
|
1537
1782
|
const existing = map.get(key) ?? { completedCount: 0, results: [] };
|
|
1538
|
-
const newResults = [...existing.results,
|
|
1783
|
+
const newResults = [...existing.results, repetitionPassedThis];
|
|
1539
1784
|
const newCompletedCount = existing.completedCount + 1;
|
|
1540
|
-
const isLast = newCompletedCount ===
|
|
1785
|
+
const isLast = newCompletedCount === repetitionCount;
|
|
1541
1786
|
const newMap = new Map(map);
|
|
1542
1787
|
newMap.set(key, {
|
|
1543
1788
|
completedCount: newCompletedCount,
|
|
@@ -1574,10 +1819,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
1574
1819
|
runId: task.runId,
|
|
1575
1820
|
startedAt
|
|
1576
1821
|
});
|
|
1577
|
-
const totalEvaluations = task.testCases.
|
|
1578
|
-
(sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
|
|
1579
|
-
0
|
|
1580
|
-
);
|
|
1822
|
+
const totalEvaluations = task.testCases.length * Math.max(1, task.repetitions);
|
|
1581
1823
|
const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
|
|
1582
1824
|
const completedRef = yield* Ref.make(0);
|
|
1583
1825
|
const startedRef = yield* Ref.make(0);
|
|
@@ -1586,7 +1828,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
1586
1828
|
const testCaseResultsRef = yield* Ref.make(
|
|
1587
1829
|
/* @__PURE__ */ new Map()
|
|
1588
1830
|
);
|
|
1589
|
-
const evaluationUnits = buildEvaluationUnits(task.testCases);
|
|
1831
|
+
const evaluationUnits = buildEvaluationUnits(task.testCases, task.repetitions);
|
|
1590
1832
|
const processEvaluation = (unit) => processOneEvaluation(
|
|
1591
1833
|
task,
|
|
1592
1834
|
unit,
|
|
@@ -1600,11 +1842,20 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
1600
1842
|
failedRef,
|
|
1601
1843
|
testCaseResultsRef
|
|
1602
1844
|
);
|
|
1603
|
-
|
|
1604
|
-
|
|
1605
|
-
|
|
1606
|
-
|
|
1607
|
-
|
|
1845
|
+
const globalSem = task.globalEvaluationSemaphore;
|
|
1846
|
+
if (globalSem !== void 0) {
|
|
1847
|
+
yield* Effect.forEach(
|
|
1848
|
+
evaluationUnits,
|
|
1849
|
+
(unit) => globalSem.withPermits(1)(processEvaluation(unit)),
|
|
1850
|
+
{ concurrency: "unbounded", discard: true }
|
|
1851
|
+
);
|
|
1852
|
+
} else {
|
|
1853
|
+
yield* Effect.forEach(
|
|
1854
|
+
evaluationUnits,
|
|
1855
|
+
processEvaluation,
|
|
1856
|
+
maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
|
|
1857
|
+
);
|
|
1858
|
+
}
|
|
1608
1859
|
const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* Effect.all([
|
|
1609
1860
|
Ref.get(completedRef),
|
|
1610
1861
|
Ref.get(passedRef),
|
|
@@ -1640,155 +1891,34 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
1640
1891
|
artifactPath: task.snapshot.artifactPath
|
|
1641
1892
|
});
|
|
1642
1893
|
});
|
|
1643
|
-
|
|
1644
|
-
|
|
1645
|
-
|
|
1646
|
-
|
|
1647
|
-
|
|
1648
|
-
} catch {
|
|
1649
|
-
return [];
|
|
1650
|
-
}
|
|
1651
|
-
const jsonlFiles = entries.filter((name) => name.endsWith(".jsonl"));
|
|
1652
|
-
const snapshots = [];
|
|
1653
|
-
for (const fileName of jsonlFiles) {
|
|
1654
|
-
const filePath = join(baseDir, fileName);
|
|
1655
|
-
try {
|
|
1656
|
-
const snapshot = await parseArtifactToSnapshot(filePath, config);
|
|
1657
|
-
if (snapshot) {
|
|
1658
|
-
snapshots.push(snapshot);
|
|
1659
|
-
}
|
|
1660
|
-
} catch {
|
|
1661
|
-
}
|
|
1662
|
-
}
|
|
1663
|
-
return snapshots.sort((a, b) => b.queuedAt - a.queuedAt);
|
|
1664
|
-
}
|
|
1665
|
-
async function parseArtifactToSnapshot(filePath, _config) {
|
|
1666
|
-
const content = await readFile(filePath, "utf8");
|
|
1667
|
-
const lines = content.split("\n").filter((line) => line.trim().length > 0);
|
|
1668
|
-
if (lines.length === 0) {
|
|
1669
|
-
return null;
|
|
1670
|
-
}
|
|
1671
|
-
let runQueued = null;
|
|
1672
|
-
let runCompleted = null;
|
|
1673
|
-
let runFailed = null;
|
|
1674
|
-
let runStarted = null;
|
|
1675
|
-
for (const line of lines) {
|
|
1676
|
-
try {
|
|
1677
|
-
const event = JSON.parse(line);
|
|
1678
|
-
const type = event.type;
|
|
1679
|
-
if (type === "RunQueued") {
|
|
1680
|
-
runQueued = {
|
|
1681
|
-
runId: event.runId,
|
|
1682
|
-
datasetId: event.datasetId,
|
|
1683
|
-
datasetName: event.datasetName,
|
|
1684
|
-
evaluatorIds: event.evaluatorIds,
|
|
1685
|
-
totalTestCases: event.totalTestCases ?? 0,
|
|
1686
|
-
artifactPath: event.artifactPath ?? filePath,
|
|
1687
|
-
ts: event.ts
|
|
1688
|
-
};
|
|
1689
|
-
}
|
|
1690
|
-
if (type === "RunStarted") {
|
|
1691
|
-
runStarted = { startedAt: event.startedAt };
|
|
1692
|
-
}
|
|
1693
|
-
if (type === "RunCompleted") {
|
|
1694
|
-
runCompleted = {
|
|
1695
|
-
passedTestCases: event.passedTestCases,
|
|
1696
|
-
failedTestCases: event.failedTestCases,
|
|
1697
|
-
totalTestCases: event.totalTestCases,
|
|
1698
|
-
finishedAt: event.finishedAt
|
|
1699
|
-
};
|
|
1700
|
-
}
|
|
1701
|
-
if (type === "RunFailed") {
|
|
1702
|
-
runFailed = {
|
|
1703
|
-
finishedAt: event.finishedAt,
|
|
1704
|
-
errorMessage: event.errorMessage
|
|
1705
|
-
};
|
|
1706
|
-
}
|
|
1707
|
-
} catch {
|
|
1708
|
-
}
|
|
1894
|
+
|
|
1895
|
+
// src/runner/name-pattern.ts
|
|
1896
|
+
/**
 * Splits a `/source/flags` style string into its parts.
 *
 * @param {string} pattern - Candidate pattern.
 * @returns {{ source: string, flags: string } | undefined} The text between the
 *   first and last slash plus everything after the last slash, or `undefined`
 *   when the pattern does not start with `/` or has no second slash.
 */
function parseRegexLiteral(pattern) {
  if (!pattern.startsWith("/")) {
    return void 0;
  }
  const lastSlash = pattern.lastIndexOf("/");
  if (lastSlash <= 0) {
    return void 0;
  }
  return {
    source: pattern.slice(1, lastSlash),
    flags: pattern.slice(lastSlash + 1)
  };
}

/**
 * Builds a predicate that tests names against a user-supplied pattern.
 *
 * The trimmed pattern is interpreted, in order, as:
 *  1. a regex literal (`/source/flags`), used with exactly those flags;
 *  2. a wildcard pattern when it contains `*` — `*` matches any run of
 *     characters, everything else is literal, and the whole name must match,
 *     case-insensitively;
 *  3. otherwise a case-insensitive exact match.
 *
 * @param {string} pattern - Pattern text.
 * @returns {(value: string) => boolean} Matcher that gives the same answer no
 *   matter how many times it is called.
 * @throws {SyntaxError} When a regex literal has an invalid source or flags.
 */
function createNameMatcher(pattern) {
  const normalizedPattern = pattern.trim();
  const regexLiteral = parseRegexLiteral(normalizedPattern);
  if (regexLiteral) {
    const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
    return (value) => {
      // With the `g` or `y` flag, `test` resumes from `lastIndex`; reset it so
      // each call starts at the beginning of the value.
      regex.lastIndex = 0;
      return regex.test(value);
    };
  }
  if (normalizedPattern.includes("*")) {
    // Escape every regex metacharacter except `*` (including `?`, which would
    // otherwise act as a quantifier), then expand `*` into `.*`.
    const escaped = normalizedPattern.replace(/[.+?^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
    const regex = new RegExp(`^${escaped}$`, "i");
    return (value) => regex.test(value);
  }
  const loweredPattern = normalizedPattern.toLowerCase();
  return (value) => value.toLowerCase() === loweredPattern;
}
|
|
1793
1923
|
async function appendJsonLine(artifactPath, payload) {
|
|
1794
1924
|
await mkdir(dirname(artifactPath), { recursive: true });
|
|
@@ -1847,32 +1977,12 @@ function searchCollectedTestCases(all, query) {
|
|
|
1847
1977
|
}
|
|
1848
1978
|
|
|
1849
1979
|
// src/runner/api.ts
|
|
1850
|
-
function
|
|
1851
|
-
|
|
1852
|
-
|
|
1853
|
-
|
|
1854
|
-
const lastSlash = pattern.lastIndexOf("/");
|
|
1855
|
-
if (lastSlash <= 0) {
|
|
1856
|
-
return void 0;
|
|
1980
|
+
function normalizeRunRepetitions(value) {
|
|
1981
|
+
const n = value ?? 1;
|
|
1982
|
+
if (!Number.isInteger(n) || n < 1) {
|
|
1983
|
+
throw new Error(`repetitions must be a positive integer, got ${String(value)}`);
|
|
1857
1984
|
}
|
|
1858
|
-
return
|
|
1859
|
-
source: pattern.slice(1, lastSlash),
|
|
1860
|
-
flags: pattern.slice(lastSlash + 1)
|
|
1861
|
-
};
|
|
1862
|
-
}
|
|
1863
|
-
function createNameMatcher(pattern) {
|
|
1864
|
-
const normalizedPattern = pattern.trim();
|
|
1865
|
-
const regexLiteral = parseRegexLiteral(normalizedPattern);
|
|
1866
|
-
if (regexLiteral) {
|
|
1867
|
-
const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
|
|
1868
|
-
return (value) => regex.test(value);
|
|
1869
|
-
}
|
|
1870
|
-
if (normalizedPattern.includes("*")) {
|
|
1871
|
-
const escaped = normalizedPattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
|
|
1872
|
-
const regex = new RegExp(`^${escaped}$`, "i");
|
|
1873
|
-
return (value) => regex.test(value);
|
|
1874
|
-
}
|
|
1875
|
-
return (value) => value.toLowerCase() === normalizedPattern.toLowerCase();
|
|
1985
|
+
return n;
|
|
1876
1986
|
}
|
|
1877
1987
|
function mergeRunnerOverrides(base, next) {
|
|
1878
1988
|
if (!base) {
|
|
@@ -1907,6 +2017,7 @@ var EffectRunner = class {
|
|
|
1907
2017
|
this.listeners = /* @__PURE__ */ new Set();
|
|
1908
2018
|
this.datasetsById = /* @__PURE__ */ new Map();
|
|
1909
2019
|
this.evaluatorsById = /* @__PURE__ */ new Map();
|
|
2020
|
+
this.runConfigsById = /* @__PURE__ */ new Map();
|
|
1910
2021
|
this.schedulerFiber = Effect.runFork(this.createSchedulerEffect());
|
|
1911
2022
|
this.persistenceFiber = Effect.runFork(
|
|
1912
2023
|
createPersistenceWorker(this.persistenceQueue)
|
|
@@ -1947,6 +2058,137 @@ var EffectRunner = class {
|
|
|
1947
2058
|
(item) => matcher(item.evaluator.getName() ?? "")
|
|
1948
2059
|
);
|
|
1949
2060
|
}
|
|
2061
|
+
async collectRunConfigs() {
|
|
2062
|
+
const runConfigs = await collectRunConfigsFromFiles(this.config.discovery);
|
|
2063
|
+
this.runConfigsById.clear();
|
|
2064
|
+
const byNameLower = /* @__PURE__ */ new Map();
|
|
2065
|
+
for (const item of runConfigs) {
|
|
2066
|
+
const id = item.runConfig.getName();
|
|
2067
|
+
const lower = id.toLowerCase();
|
|
2068
|
+
const prev = byNameLower.get(lower);
|
|
2069
|
+
if (prev !== void 0 && prev.filePath !== item.filePath) {
|
|
2070
|
+
throw new Error(
|
|
2071
|
+
`Duplicate RunConfig name "${id}" (matches "${prev.runConfig.getName()}" case-insensitively): ${prev.filePath} and ${item.filePath}`
|
|
2072
|
+
);
|
|
2073
|
+
}
|
|
2074
|
+
byNameLower.set(lower, item);
|
|
2075
|
+
this.runConfigsById.set(id, item);
|
|
2076
|
+
}
|
|
2077
|
+
return runConfigs;
|
|
2078
|
+
}
|
|
2079
|
+
async resolveRunConfigByName(name) {
|
|
2080
|
+
if (this.runConfigsById.size === 0) {
|
|
2081
|
+
await this.collectRunConfigs();
|
|
2082
|
+
}
|
|
2083
|
+
const key = validateRunConfigName(name, `RunConfig "${name.trim()}"`);
|
|
2084
|
+
const keyLower = key.toLowerCase();
|
|
2085
|
+
const matches = Array.from(this.runConfigsById.values()).filter(
|
|
2086
|
+
(item) => item.runConfig.getName().toLowerCase() === keyLower
|
|
2087
|
+
);
|
|
2088
|
+
if (matches.length === 0) {
|
|
2089
|
+
return void 0;
|
|
2090
|
+
}
|
|
2091
|
+
if (matches.length > 1) {
|
|
2092
|
+
throw new Error(
|
|
2093
|
+
`Multiple RunConfigs named "${name}": ${matches.map((m) => m.filePath).join(", ")}`
|
|
2094
|
+
);
|
|
2095
|
+
}
|
|
2096
|
+
return matches[0];
|
|
2097
|
+
}
|
|
2098
|
+
async expandRunConfigToJobs(collected) {
|
|
2099
|
+
if (this.datasetsById.size === 0) {
|
|
2100
|
+
await this.collectDatasets();
|
|
2101
|
+
}
|
|
2102
|
+
if (this.evaluatorsById.size === 0) {
|
|
2103
|
+
await this.collectEvaluators();
|
|
2104
|
+
}
|
|
2105
|
+
const rcName = collected.runConfig.getName();
|
|
2106
|
+
const jobs = [];
|
|
2107
|
+
const runs = collected.runConfig.getRuns();
|
|
2108
|
+
for (const [i, row] of runs.entries()) {
|
|
2109
|
+
const dsCollected = Array.from(this.datasetsById.values()).find(
|
|
2110
|
+
(d) => d.dataset === row.dataset
|
|
2111
|
+
);
|
|
2112
|
+
if (!dsCollected) {
|
|
2113
|
+
throw new Error(
|
|
2114
|
+
`RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getName()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
|
|
2115
|
+
);
|
|
2116
|
+
}
|
|
2117
|
+
let evaluatorIds;
|
|
2118
|
+
if ("evaluatorPattern" in row && typeof row.evaluatorPattern === "string") {
|
|
2119
|
+
const matcher = createNameMatcher(row.evaluatorPattern);
|
|
2120
|
+
const matched = Array.from(this.evaluatorsById.values()).filter(
|
|
2121
|
+
(item) => matcher(item.evaluator.getName() ?? "")
|
|
2122
|
+
);
|
|
2123
|
+
if (matched.length === 0) {
|
|
2124
|
+
throw new Error(
|
|
2125
|
+
`RunConfig "${rcName}" run[${i}]: no evaluator matched pattern "${row.evaluatorPattern}"`
|
|
2126
|
+
);
|
|
2127
|
+
}
|
|
2128
|
+
evaluatorIds = matched.map((item) => item.id);
|
|
2129
|
+
} else {
|
|
2130
|
+
const evaluators = row.evaluators;
|
|
2131
|
+
evaluatorIds = [];
|
|
2132
|
+
for (const ev of evaluators) {
|
|
2133
|
+
const found = Array.from(this.evaluatorsById.values()).find(
|
|
2134
|
+
(item) => item.evaluator === ev
|
|
2135
|
+
);
|
|
2136
|
+
if (!found) {
|
|
2137
|
+
throw new Error(
|
|
2138
|
+
`RunConfig "${rcName}" run[${i}]: evaluator "${getEvaluatorDisplayLabel(ev) ?? "unknown"}" was not found among discovered evaluator exports`
|
|
2139
|
+
);
|
|
2140
|
+
}
|
|
2141
|
+
evaluatorIds.push(found.id);
|
|
2142
|
+
}
|
|
2143
|
+
}
|
|
2144
|
+
const repetitions = "repetitions" in row && row.repetitions !== void 0 ? row.repetitions : 1;
|
|
2145
|
+
jobs.push({
|
|
2146
|
+
datasetId: dsCollected.id,
|
|
2147
|
+
evaluatorIds,
|
|
2148
|
+
runConfigName: rcName,
|
|
2149
|
+
runConfigDisplayLabel: collected.runConfig.getDisplayLabel(),
|
|
2150
|
+
runConfigTags: collected.runConfig.getTags(),
|
|
2151
|
+
repetitions
|
|
2152
|
+
});
|
|
2153
|
+
}
|
|
2154
|
+
return jobs;
|
|
2155
|
+
}
|
|
2156
|
+
async expandRunConfigNamesToJobs(names) {
|
|
2157
|
+
const jobs = [];
|
|
2158
|
+
for (const name of names) {
|
|
2159
|
+
const collected = await this.resolveRunConfigByName(name);
|
|
2160
|
+
if (!collected) {
|
|
2161
|
+
const known = await this.collectRunConfigs();
|
|
2162
|
+
const available = known.map((r) => r.runConfig.getName()).sort();
|
|
2163
|
+
throw new Error(
|
|
2164
|
+
available.length > 0 ? `RunConfig "${name}" not found. Available RunConfigs: ${available.join(", ")}` : `RunConfig "${name}" not found and no RunConfigs were discovered.`
|
|
2165
|
+
);
|
|
2166
|
+
}
|
|
2167
|
+
jobs.push(...await this.expandRunConfigToJobs(collected));
|
|
2168
|
+
}
|
|
2169
|
+
return jobs;
|
|
2170
|
+
}
|
|
2171
|
+
async runDatasetJobsWithSharedConcurrency(request) {
|
|
2172
|
+
const globalConcurrency = Math.max(1, request.globalConcurrency);
|
|
2173
|
+
const sem = Effect.unsafeMakeSemaphore(globalConcurrency);
|
|
2174
|
+
const triggerId = request.triggerId ?? `trg-${randomUUID()}`;
|
|
2175
|
+
const snapshots = [];
|
|
2176
|
+
for (const job of request.jobs) {
|
|
2177
|
+
snapshots.push(
|
|
2178
|
+
await this.startDatasetRun({
|
|
2179
|
+
datasetId: job.datasetId,
|
|
2180
|
+
evaluatorIds: job.evaluatorIds,
|
|
2181
|
+
triggerId,
|
|
2182
|
+
maxConcurrency: this.config.maxConcurrency ?? 1,
|
|
2183
|
+
globalEvaluationSemaphore: sem,
|
|
2184
|
+
runConfigName: job.runConfigName,
|
|
2185
|
+
runConfigTags: job.runConfigTags,
|
|
2186
|
+
repetitions: job.repetitions
|
|
2187
|
+
})
|
|
2188
|
+
);
|
|
2189
|
+
}
|
|
2190
|
+
return snapshots;
|
|
2191
|
+
}
|
|
1950
2192
|
async searchTestCases(query) {
|
|
1951
2193
|
const testCases = await collectTestCasesFromFiles(this.config.discovery);
|
|
1952
2194
|
return searchCollectedTestCases(testCases, query);
|
|
@@ -1965,35 +2207,45 @@ var EffectRunner = class {
|
|
|
1965
2207
|
);
|
|
1966
2208
|
}
|
|
1967
2209
|
async runDatasetWith(request) {
|
|
2210
|
+
const runConfigName = validateRunConfigName(
|
|
2211
|
+
request.runConfigName,
|
|
2212
|
+
"runDatasetWith.runConfigName"
|
|
2213
|
+
);
|
|
2214
|
+
return this.startDatasetRun({
|
|
2215
|
+
datasetId: request.datasetId,
|
|
2216
|
+
evaluatorIds: request.evaluatorIds,
|
|
2217
|
+
triggerId: request.triggerId,
|
|
2218
|
+
maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
|
|
2219
|
+
repetitions: request.repetitions,
|
|
2220
|
+
runConfigName,
|
|
2221
|
+
runConfigTags: request.runConfigTags
|
|
2222
|
+
});
|
|
2223
|
+
}
|
|
2224
|
+
async startDatasetRun(params) {
|
|
1968
2225
|
if (this.datasetsById.size === 0) {
|
|
1969
2226
|
await this.collectDatasets();
|
|
1970
2227
|
}
|
|
1971
2228
|
if (this.evaluatorsById.size === 0) {
|
|
1972
2229
|
await this.collectEvaluators();
|
|
1973
2230
|
}
|
|
1974
|
-
const dataset = this.datasetsById.get(
|
|
2231
|
+
const dataset = this.datasetsById.get(params.datasetId);
|
|
1975
2232
|
if (!dataset) {
|
|
1976
|
-
throw new Error(`Unknown dataset: ${
|
|
2233
|
+
throw new Error(`Unknown dataset: ${params.datasetId}`);
|
|
1977
2234
|
}
|
|
1978
|
-
const selectedEvaluators =
|
|
2235
|
+
const selectedEvaluators = params.evaluatorIds.map((id) => this.evaluatorsById.get(id)).filter((value) => Boolean(value)).map((value) => ({ id: value.id, evaluator: value.evaluator }));
|
|
1979
2236
|
if (selectedEvaluators.length === 0) {
|
|
1980
2237
|
throw new Error("No evaluators selected for run");
|
|
1981
2238
|
}
|
|
1982
|
-
const selectedTestCases = await this.collectDatasetTestCases(
|
|
1983
|
-
const
|
|
1984
|
-
|
|
1985
|
-
|
|
1986
|
-
)
|
|
1987
|
-
const triggerId = request.triggerId ?? `trg-${randomUUID()}`;
|
|
2239
|
+
const selectedTestCases = await this.collectDatasetTestCases(params.datasetId);
|
|
2240
|
+
const repetitions = normalizeRunRepetitions(params.repetitions);
|
|
2241
|
+
const totalEvaluations = selectedTestCases.length * repetitions;
|
|
2242
|
+
const runConfigTags = [...params.runConfigTags ?? []];
|
|
2243
|
+
const triggerId = params.triggerId ?? `trg-${randomUUID()}`;
|
|
1988
2244
|
const runId = `run-${randomUUID()}`;
|
|
1989
|
-
const artifactPath = createArtifactPath(
|
|
1990
|
-
this.config.artifactDirectory,
|
|
1991
|
-
request.datasetId,
|
|
1992
|
-
runId
|
|
1993
|
-
);
|
|
2245
|
+
const artifactPath = createArtifactPath(this.config.artifactDirectory, params.datasetId, runId);
|
|
1994
2246
|
const snapshot = {
|
|
1995
2247
|
runId,
|
|
1996
|
-
datasetId:
|
|
2248
|
+
datasetId: params.datasetId,
|
|
1997
2249
|
datasetName: dataset.dataset.getName(),
|
|
1998
2250
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
1999
2251
|
queuedAt: Date.now(),
|
|
@@ -2014,7 +2266,7 @@ var EffectRunner = class {
|
|
|
2014
2266
|
const queuedEvent = {
|
|
2015
2267
|
type: "RunQueued",
|
|
2016
2268
|
runId,
|
|
2017
|
-
datasetId:
|
|
2269
|
+
datasetId: params.datasetId,
|
|
2018
2270
|
datasetName: dataset.dataset.getName(),
|
|
2019
2271
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
2020
2272
|
totalTestCases: totalEvaluations,
|
|
@@ -2028,17 +2280,20 @@ var EffectRunner = class {
|
|
|
2028
2280
|
payload: queuedEvent
|
|
2029
2281
|
})
|
|
2030
2282
|
);
|
|
2031
|
-
const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
|
|
2032
2283
|
await Effect.runPromise(
|
|
2033
2284
|
Queue.offer(this.runQueue, {
|
|
2034
2285
|
runId,
|
|
2035
2286
|
triggerId,
|
|
2036
|
-
datasetId:
|
|
2287
|
+
datasetId: params.datasetId,
|
|
2037
2288
|
dataset: dataset.dataset,
|
|
2038
2289
|
evaluators: selectedEvaluators,
|
|
2039
2290
|
testCases: selectedTestCases,
|
|
2040
2291
|
snapshot,
|
|
2041
|
-
maxConcurrency
|
|
2292
|
+
maxConcurrency: params.maxConcurrency,
|
|
2293
|
+
globalEvaluationSemaphore: params.globalEvaluationSemaphore,
|
|
2294
|
+
runConfigName: params.runConfigName,
|
|
2295
|
+
runConfigTags,
|
|
2296
|
+
repetitions
|
|
2042
2297
|
})
|
|
2043
2298
|
);
|
|
2044
2299
|
return snapshot;
|
|
@@ -2109,6 +2364,11 @@ var EffectRunner = class {
|
|
|
2109
2364
|
);
|
|
2110
2365
|
}
|
|
2111
2366
|
};
|
|
2367
|
+
|
|
2368
|
+
// src/runner/events.ts
|
|
2369
|
+
var PROGRAMMATIC_RUN_CONFIG = {
|
|
2370
|
+
runConfigName: "programmatic"
|
|
2371
|
+
};
|
|
2112
2372
|
var LEFT_PANE_WIDTH2 = 44;
|
|
2113
2373
|
var MAX_RUNS_FOR_CHART = 12;
|
|
2114
2374
|
var MAX_RUNS_FOR_TREND = 20;
|
|
@@ -2456,7 +2716,7 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
2456
2716
|
rows.push(/* @__PURE__ */ jsx(Text, { children: " " }, "sp6"));
|
|
2457
2717
|
rows.push(/* @__PURE__ */ jsx(SectionHeader, { children: "Test cases" }, "tc-h"));
|
|
2458
2718
|
for (const tc of testCases) {
|
|
2459
|
-
const
|
|
2719
|
+
const repetitionPart = tc.repetitionCount != null && tc.repetitionCount > 1 && tc.repetitionIndex != null ? ` (${tc.repetitionIndex}/${tc.repetitionCount})` : "";
|
|
2460
2720
|
rows.push(
|
|
2461
2721
|
/* @__PURE__ */ jsxs(Text, { children: [
|
|
2462
2722
|
/* @__PURE__ */ jsxs(Text, { color: "cyan", children: [
|
|
@@ -2468,13 +2728,13 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
2468
2728
|
] }),
|
|
2469
2729
|
" ",
|
|
2470
2730
|
tc.testCaseName,
|
|
2471
|
-
|
|
2731
|
+
repetitionPart ? /* @__PURE__ */ jsx(Text, { color: "cyan", children: repetitionPart }) : null,
|
|
2472
2732
|
/* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
2473
2733
|
" (",
|
|
2474
2734
|
tc.durationMs,
|
|
2475
2735
|
"ms)"
|
|
2476
2736
|
] })
|
|
2477
|
-
] }, `tc-${tc.testCaseId}-${tc.
|
|
2737
|
+
] }, `tc-${tc.testCaseId}-${tc.repetitionId ?? "x"}-${tc.repetitionIndex ?? 0}`)
|
|
2478
2738
|
);
|
|
2479
2739
|
for (const item of tc.evaluatorScores) {
|
|
2480
2740
|
const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
|
|
@@ -2800,7 +3060,8 @@ function EvalsCliApp({ data, args, runner }) {
|
|
|
2800
3060
|
}
|
|
2801
3061
|
void runner.runDatasetWith({
|
|
2802
3062
|
datasetId: selectedDataset.id,
|
|
2803
|
-
evaluatorIds: clampedState.selectedEvaluatorIds
|
|
3063
|
+
evaluatorIds: clampedState.selectedEvaluatorIds,
|
|
3064
|
+
...PROGRAMMATIC_RUN_CONFIG
|
|
2804
3065
|
}).then((snapshot) => {
|
|
2805
3066
|
setRuntimeMessage(
|
|
2806
3067
|
`Started ${snapshot.runId} on ${selectedDataset.name} (${snapshot.totalTestCases} cases).`
|