@m4trix/evals 0.25.0 → 0.26.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +29 -7
- package/dist/cli-simple.cjs +911 -643
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +898 -630
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +688 -575
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +679 -566
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +959 -623
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +201 -7
- package/dist/index.js +947 -625
- package/dist/index.js.map +1 -1
- package/package.json +3 -5
package/dist/cli-simple.cjs
CHANGED
|
@@ -3,14 +3,14 @@
|
|
|
3
3
|
|
|
4
4
|
var crypto = require('crypto');
|
|
5
5
|
var effect = require('effect');
|
|
6
|
-
var
|
|
6
|
+
var promises = require('fs/promises');
|
|
7
7
|
var path = require('path');
|
|
8
|
+
var fs = require('fs');
|
|
8
9
|
var jitiModule = require('jiti');
|
|
9
|
-
var promises = require('fs/promises');
|
|
10
10
|
var url = require('url');
|
|
11
11
|
var diff = require('diff');
|
|
12
12
|
var stringify = require('fast-json-stable-stringify');
|
|
13
|
-
var
|
|
13
|
+
var React = require('react');
|
|
14
14
|
var ink = require('ink');
|
|
15
15
|
var jsxRuntime = require('react/jsx-runtime');
|
|
16
16
|
|
|
@@ -37,25 +37,181 @@ function _interopNamespace(e) {
|
|
|
37
37
|
|
|
38
38
|
var jitiModule__namespace = /*#__PURE__*/_interopNamespace(jitiModule);
|
|
39
39
|
var stringify__default = /*#__PURE__*/_interopDefault(stringify);
|
|
40
|
-
var
|
|
40
|
+
var React__namespace = /*#__PURE__*/_interopNamespace(React);
|
|
41
|
+
|
|
42
|
+
var ENTITY_ID_PATTERN = /^[a-zA-Z0-9_-]+$/;
|
|
43
|
+
function makeEntityIdSchema(brand, label) {
|
|
44
|
+
return effect.Schema.String.pipe(
|
|
45
|
+
effect.Schema.trimmed(),
|
|
46
|
+
effect.Schema.minLength(1, {
|
|
47
|
+
message: () => `${label} must be non-empty.`
|
|
48
|
+
}),
|
|
49
|
+
effect.Schema.pattern(ENTITY_ID_PATTERN, {
|
|
50
|
+
message: () => `${label} may only contain letters, digits, underscores, and hyphens (no spaces). Examples: "my-nightly", "my_nightly", "myNightly".`
|
|
51
|
+
}),
|
|
52
|
+
effect.Schema.brand(brand)
|
|
53
|
+
);
|
|
54
|
+
}
|
|
55
|
+
var RunConfigNameSchema = makeEntityIdSchema("RunConfigName", "RunConfig name");
|
|
56
|
+
makeEntityIdSchema("EvaluatorName", "Evaluator name");
|
|
57
|
+
makeEntityIdSchema("TestCaseName", "Test case name");
|
|
58
|
+
function validateWithSchema(schema, raw, context) {
|
|
59
|
+
const trimmed = raw.trim();
|
|
60
|
+
const decode = effect.Schema.decodeUnknownEither(
|
|
61
|
+
schema
|
|
62
|
+
);
|
|
63
|
+
const result = decode(trimmed);
|
|
64
|
+
if (effect.Either.isLeft(result)) {
|
|
65
|
+
throw new Error(`${context}: ${effect.ParseResult.TreeFormatter.formatErrorSync(result.left)}`);
|
|
66
|
+
}
|
|
67
|
+
return result.right;
|
|
68
|
+
}
|
|
69
|
+
function validateRunConfigName(raw, context) {
|
|
70
|
+
return validateWithSchema(RunConfigNameSchema, raw, context);
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
// src/evals/evaluator.ts
|
|
74
|
+
function getEvaluatorDisplayLabel(evaluator) {
|
|
75
|
+
if (typeof evaluator.getDisplayLabel === "function") {
|
|
76
|
+
const label = evaluator.getDisplayLabel();
|
|
77
|
+
if (label !== void 0) {
|
|
78
|
+
return label;
|
|
79
|
+
}
|
|
80
|
+
}
|
|
81
|
+
return typeof evaluator.getName === "function" ? evaluator.getName() : void 0;
|
|
82
|
+
}
|
|
83
|
+
function getEvaluatorTagList(evaluator) {
|
|
84
|
+
return typeof evaluator.getTags === "function" ? [...evaluator.getTags()] : [];
|
|
85
|
+
}
|
|
86
|
+
async function loadRunSnapshotsFromArtifacts(config) {
|
|
87
|
+
const baseDir = path.resolve(config.artifactDirectory);
|
|
88
|
+
let entries;
|
|
89
|
+
try {
|
|
90
|
+
entries = await promises.readdir(baseDir);
|
|
91
|
+
} catch {
|
|
92
|
+
return [];
|
|
93
|
+
}
|
|
94
|
+
const jsonlFiles = entries.filter((name) => name.endsWith(".jsonl"));
|
|
95
|
+
const snapshots = [];
|
|
96
|
+
for (const fileName of jsonlFiles) {
|
|
97
|
+
const filePath = path.join(baseDir, fileName);
|
|
98
|
+
try {
|
|
99
|
+
const snapshot = await parseArtifactToSnapshot(filePath, config);
|
|
100
|
+
if (snapshot) {
|
|
101
|
+
snapshots.push(snapshot);
|
|
102
|
+
}
|
|
103
|
+
} catch {
|
|
104
|
+
}
|
|
105
|
+
}
|
|
106
|
+
return snapshots.sort((a, b) => b.queuedAt - a.queuedAt);
|
|
107
|
+
}
|
|
108
|
+
async function parseArtifactToSnapshot(filePath, _config) {
|
|
109
|
+
const content = await promises.readFile(filePath, "utf8");
|
|
110
|
+
const lines = content.split("\n").filter((line) => line.trim().length > 0);
|
|
111
|
+
if (lines.length === 0) {
|
|
112
|
+
return null;
|
|
113
|
+
}
|
|
114
|
+
let runQueued = null;
|
|
115
|
+
let runCompleted = null;
|
|
116
|
+
let runFailed = null;
|
|
117
|
+
let runStarted = null;
|
|
118
|
+
for (const line of lines) {
|
|
119
|
+
try {
|
|
120
|
+
const event = JSON.parse(line);
|
|
121
|
+
const type = event.type;
|
|
122
|
+
if (type === "RunQueued") {
|
|
123
|
+
runQueued = {
|
|
124
|
+
runId: event.runId,
|
|
125
|
+
datasetId: event.datasetId,
|
|
126
|
+
datasetName: event.datasetName,
|
|
127
|
+
evaluatorIds: event.evaluatorIds,
|
|
128
|
+
totalTestCases: event.totalTestCases ?? 0,
|
|
129
|
+
artifactPath: event.artifactPath ?? filePath,
|
|
130
|
+
ts: event.ts
|
|
131
|
+
};
|
|
132
|
+
}
|
|
133
|
+
if (type === "RunStarted") {
|
|
134
|
+
runStarted = { startedAt: event.startedAt };
|
|
135
|
+
}
|
|
136
|
+
if (type === "RunCompleted") {
|
|
137
|
+
runCompleted = {
|
|
138
|
+
passedTestCases: event.passedTestCases,
|
|
139
|
+
failedTestCases: event.failedTestCases,
|
|
140
|
+
totalTestCases: event.totalTestCases,
|
|
141
|
+
finishedAt: event.finishedAt
|
|
142
|
+
};
|
|
143
|
+
}
|
|
144
|
+
if (type === "RunFailed") {
|
|
145
|
+
runFailed = {
|
|
146
|
+
finishedAt: event.finishedAt,
|
|
147
|
+
errorMessage: event.errorMessage
|
|
148
|
+
};
|
|
149
|
+
}
|
|
150
|
+
} catch {
|
|
151
|
+
}
|
|
152
|
+
}
|
|
153
|
+
if (!runQueued) {
|
|
154
|
+
return null;
|
|
155
|
+
}
|
|
156
|
+
const artifactPath = filePath;
|
|
157
|
+
const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
|
|
158
|
+
const progress = aggregateTestCaseProgress(lines);
|
|
159
|
+
const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
|
|
160
|
+
const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
|
|
161
|
+
const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
|
|
162
|
+
return {
|
|
163
|
+
runId: runQueued.runId,
|
|
164
|
+
datasetId: runQueued.datasetId,
|
|
165
|
+
datasetName: runQueued.datasetName,
|
|
166
|
+
evaluatorIds: runQueued.evaluatorIds,
|
|
167
|
+
queuedAt: runQueued.ts ?? 0,
|
|
168
|
+
startedAt: runStarted?.startedAt,
|
|
169
|
+
finishedAt: runCompleted?.finishedAt ?? runFailed?.finishedAt,
|
|
170
|
+
totalTestCases: runQueued.totalTestCases,
|
|
171
|
+
completedTestCases,
|
|
172
|
+
passedTestCases,
|
|
173
|
+
failedTestCases,
|
|
174
|
+
status,
|
|
175
|
+
artifactPath,
|
|
176
|
+
errorMessage: runFailed?.errorMessage
|
|
177
|
+
};
|
|
178
|
+
}
|
|
179
|
+
function aggregateTestCaseProgress(lines) {
|
|
180
|
+
let completedTestCases = 0;
|
|
181
|
+
const testCasePassedBy = /* @__PURE__ */ new Map();
|
|
182
|
+
for (const line of lines) {
|
|
183
|
+
try {
|
|
184
|
+
const event = JSON.parse(line);
|
|
185
|
+
if (event.type === "TestCaseProgress") {
|
|
186
|
+
const ev = event;
|
|
187
|
+
completedTestCases = ev.completedTestCases ?? completedTestCases;
|
|
188
|
+
const id = ev.testCaseId;
|
|
189
|
+
const current = testCasePassedBy.get(id);
|
|
190
|
+
testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
|
|
191
|
+
}
|
|
192
|
+
} catch {
|
|
193
|
+
}
|
|
194
|
+
}
|
|
195
|
+
let passedTestCases = 0;
|
|
196
|
+
let failedTestCases = 0;
|
|
197
|
+
for (const passed of testCasePassedBy.values()) {
|
|
198
|
+
if (passed) {
|
|
199
|
+
passedTestCases += 1;
|
|
200
|
+
} else {
|
|
201
|
+
failedTestCases += 1;
|
|
202
|
+
}
|
|
203
|
+
}
|
|
204
|
+
return { completedTestCases, passedTestCases, failedTestCases };
|
|
205
|
+
}
|
|
41
206
|
|
|
42
207
|
// src/runner/config.ts
|
|
43
208
|
var defaultRunnerConfig = {
|
|
44
209
|
discovery: {
|
|
45
210
|
rootDir: process.cwd(),
|
|
46
211
|
datasetSuffixes: [".dataset.ts", ".dataset.tsx", ".dataset.js", ".dataset.mjs"],
|
|
47
|
-
evaluatorSuffixes: [
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
".evaluator.js",
|
|
51
|
-
".evaluator.mjs"
|
|
52
|
-
],
|
|
53
|
-
testCaseSuffixes: [
|
|
54
|
-
".test-case.ts",
|
|
55
|
-
".test-case.tsx",
|
|
56
|
-
".test-case.js",
|
|
57
|
-
".test-case.mjs"
|
|
58
|
-
],
|
|
212
|
+
evaluatorSuffixes: [".evaluator.ts", ".evaluator.tsx", ".evaluator.js", ".evaluator.mjs"],
|
|
213
|
+
runConfigSuffixes: [".run-config.ts", ".run-config.tsx", ".run-config.js", ".run-config.mjs"],
|
|
214
|
+
testCaseSuffixes: [".test-case.ts", ".test-case.tsx", ".test-case.js", ".test-case.mjs"],
|
|
59
215
|
excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
|
|
60
216
|
},
|
|
61
217
|
artifactDirectory: ".eval-results",
|
|
@@ -80,6 +236,11 @@ function toRunnerConfigOverrides(config) {
|
|
|
80
236
|
} else if (rawDiscovery?.evaluatorSuffixes !== void 0) {
|
|
81
237
|
discovery.evaluatorSuffixes = rawDiscovery.evaluatorSuffixes;
|
|
82
238
|
}
|
|
239
|
+
if (rawDiscovery?.runConfigFilePatterns !== void 0) {
|
|
240
|
+
discovery.runConfigSuffixes = rawDiscovery.runConfigFilePatterns;
|
|
241
|
+
} else if (rawDiscovery?.runConfigSuffixes !== void 0) {
|
|
242
|
+
discovery.runConfigSuffixes = rawDiscovery.runConfigSuffixes;
|
|
243
|
+
}
|
|
83
244
|
if (rawDiscovery?.testCaseFilePatterns !== void 0) {
|
|
84
245
|
discovery.testCaseSuffixes = rawDiscovery.testCaseFilePatterns;
|
|
85
246
|
} else if (rawDiscovery?.testCaseSuffixes !== void 0) {
|
|
@@ -122,14 +283,15 @@ function getJitiLoader() {
|
|
|
122
283
|
}
|
|
123
284
|
const createJiti2 = jitiModule__namespace.createJiti ?? jitiModule__namespace.default;
|
|
124
285
|
if (typeof createJiti2 !== "function") {
|
|
125
|
-
throw new Error(
|
|
126
|
-
"Failed to initialize jiti for m4trix eval config loading."
|
|
127
|
-
);
|
|
286
|
+
throw new Error("Failed to initialize jiti for m4trix eval config loading.");
|
|
128
287
|
}
|
|
129
|
-
cachedLoader = createJiti2(
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
288
|
+
cachedLoader = createJiti2(
|
|
289
|
+
(typeof document === 'undefined' ? require('u' + 'rl').pathToFileURL(__filename).href : (_documentCurrentScript && _documentCurrentScript.tagName.toUpperCase() === 'SCRIPT' && _documentCurrentScript.src || new URL('out.js', document.baseURI).href)),
|
|
290
|
+
{
|
|
291
|
+
interopDefault: true,
|
|
292
|
+
moduleCache: true
|
|
293
|
+
}
|
|
294
|
+
);
|
|
133
295
|
return cachedLoader;
|
|
134
296
|
}
|
|
135
297
|
function resolveConfigModuleExport(loadedModule) {
|
|
@@ -177,6 +339,9 @@ function isDatasetLike(value) {
|
|
|
177
339
|
function isEvaluatorLike(value) {
|
|
178
340
|
return hasMethod(value, "getName") && hasMethod(value, "resolveContext") && hasMethod(value, "getEvaluateFn");
|
|
179
341
|
}
|
|
342
|
+
function isRunConfigLike(value) {
|
|
343
|
+
return hasMethod(value, "getName") && hasMethod(value, "getRuns") && typeof value.getRuns === "function";
|
|
344
|
+
}
|
|
180
345
|
function isTestCaseLike(value) {
|
|
181
346
|
return hasMethod(value, "getName") && hasMethod(value, "getTags") && hasMethod(value, "getInput");
|
|
182
347
|
}
|
|
@@ -233,9 +398,7 @@ async function loadModuleExports(filePath) {
|
|
|
233
398
|
}
|
|
234
399
|
async function collectDatasetsFromFiles(config) {
|
|
235
400
|
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
236
|
-
const matched = files.filter(
|
|
237
|
-
(filePath) => hasOneSuffix(filePath, config.datasetSuffixes)
|
|
238
|
-
);
|
|
401
|
+
const matched = files.filter((filePath) => hasOneSuffix(filePath, config.datasetSuffixes));
|
|
239
402
|
const found = await Promise.all(
|
|
240
403
|
matched.map(async (absolutePath) => {
|
|
241
404
|
const exports = await loadModuleExports(absolutePath);
|
|
@@ -252,9 +415,7 @@ async function collectDatasetsFromFiles(config) {
|
|
|
252
415
|
}
|
|
253
416
|
async function collectEvaluatorsFromFiles(config) {
|
|
254
417
|
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
255
|
-
const matched = files.filter(
|
|
256
|
-
(filePath) => hasOneSuffix(filePath, config.evaluatorSuffixes)
|
|
257
|
-
);
|
|
418
|
+
const matched = files.filter((filePath) => hasOneSuffix(filePath, config.evaluatorSuffixes));
|
|
258
419
|
const found = await Promise.all(
|
|
259
420
|
matched.map(async (absolutePath) => {
|
|
260
421
|
const exports = await loadModuleExports(absolutePath);
|
|
@@ -269,11 +430,26 @@ async function collectEvaluatorsFromFiles(config) {
|
|
|
269
430
|
);
|
|
270
431
|
return found.flat();
|
|
271
432
|
}
|
|
272
|
-
async function
|
|
433
|
+
async function collectRunConfigsFromFiles(config) {
|
|
273
434
|
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
274
|
-
const matched = files.filter(
|
|
275
|
-
|
|
435
|
+
const matched = files.filter((filePath) => hasOneSuffix(filePath, config.runConfigSuffixes));
|
|
436
|
+
const found = await Promise.all(
|
|
437
|
+
matched.map(async (absolutePath) => {
|
|
438
|
+
const exports = await loadModuleExports(absolutePath);
|
|
439
|
+
const runConfigs = exports.filter(isRunConfigLike);
|
|
440
|
+
const relPath = path.relative(config.rootDir, absolutePath);
|
|
441
|
+
return runConfigs.map((runConfig) => ({
|
|
442
|
+
id: runConfig.getName(),
|
|
443
|
+
filePath: relPath,
|
|
444
|
+
runConfig
|
|
445
|
+
}));
|
|
446
|
+
})
|
|
276
447
|
);
|
|
448
|
+
return found.flat();
|
|
449
|
+
}
|
|
450
|
+
async function collectTestCasesFromFiles(config) {
|
|
451
|
+
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
452
|
+
const matched = files.filter((filePath) => hasOneSuffix(filePath, config.testCaseSuffixes));
|
|
277
453
|
const found = await Promise.all(
|
|
278
454
|
matched.map(async (absolutePath) => {
|
|
279
455
|
const exports = await loadModuleExports(absolutePath);
|
|
@@ -345,16 +521,8 @@ function createDiffString(expected, actual, diffOptions) {
|
|
|
345
521
|
const expectedProcessed = preprocessForDiff(expected, diffOptions);
|
|
346
522
|
const actualProcessed = preprocessForDiff(actual, diffOptions);
|
|
347
523
|
if (diffOptions?.keysOnly) {
|
|
348
|
-
const expectedKeys = JSON.stringify(
|
|
349
|
-
|
|
350
|
-
null,
|
|
351
|
-
2
|
|
352
|
-
);
|
|
353
|
-
const actualKeys = JSON.stringify(
|
|
354
|
-
extractKeys(actualProcessed),
|
|
355
|
-
null,
|
|
356
|
-
2
|
|
357
|
-
);
|
|
524
|
+
const expectedKeys = JSON.stringify(extractKeys(expectedProcessed), null, 2);
|
|
525
|
+
const actualKeys = JSON.stringify(extractKeys(actualProcessed), null, 2);
|
|
358
526
|
const parts2 = diff.diffLines(expectedKeys, actualKeys);
|
|
359
527
|
return formatDiffParts(parts2);
|
|
360
528
|
}
|
|
@@ -365,9 +533,7 @@ function createDiffString(expected, actual, diffOptions) {
|
|
|
365
533
|
}
|
|
366
534
|
const parts = diff.diffLines(expectedStr, actualStr);
|
|
367
535
|
if (diffOptions?.outputNewOnly) {
|
|
368
|
-
const filtered = parts.filter(
|
|
369
|
-
(p) => p.added === true
|
|
370
|
-
);
|
|
536
|
+
const filtered = parts.filter((p) => p.added === true);
|
|
371
537
|
return formatDiffParts(filtered);
|
|
372
538
|
}
|
|
373
539
|
return formatDiffParts(parts);
|
|
@@ -434,6 +600,17 @@ function getDiffLines(entry) {
|
|
|
434
600
|
});
|
|
435
601
|
}
|
|
436
602
|
|
|
603
|
+
// src/evals/test-case.ts
|
|
604
|
+
function getTestCaseDisplayLabel(testCase) {
|
|
605
|
+
if (typeof testCase.getDisplayLabel === "function") {
|
|
606
|
+
return testCase.getDisplayLabel();
|
|
607
|
+
}
|
|
608
|
+
return typeof testCase.getName === "function" ? testCase.getName() : "";
|
|
609
|
+
}
|
|
610
|
+
function getTestCaseTagList(testCase) {
|
|
611
|
+
return typeof testCase.getTags === "function" ? [...testCase.getTags()] : [];
|
|
612
|
+
}
|
|
613
|
+
|
|
437
614
|
// src/evals/metric.ts
|
|
438
615
|
var registry = /* @__PURE__ */ new Map();
|
|
439
616
|
var Metric = {
|
|
@@ -457,6 +634,54 @@ function getMetricById(id) {
|
|
|
457
634
|
return registry.get(id);
|
|
458
635
|
}
|
|
459
636
|
|
|
637
|
+
// src/evals/aggregators.ts
|
|
638
|
+
function aggregateTokenCountSum(values) {
|
|
639
|
+
const initial = {
|
|
640
|
+
input: 0,
|
|
641
|
+
output: 0,
|
|
642
|
+
inputCached: 0,
|
|
643
|
+
outputCached: 0
|
|
644
|
+
};
|
|
645
|
+
return values.reduce(
|
|
646
|
+
(acc, v) => ({
|
|
647
|
+
input: acc.input + (v.input ?? 0),
|
|
648
|
+
output: acc.output + (v.output ?? 0),
|
|
649
|
+
inputCached: acc.inputCached + (v.inputCached ?? 0),
|
|
650
|
+
outputCached: acc.outputCached + (v.outputCached ?? 0)
|
|
651
|
+
}),
|
|
652
|
+
initial
|
|
653
|
+
);
|
|
654
|
+
}
|
|
655
|
+
function aggregateLatencyAverage(values) {
|
|
656
|
+
if (values.length === 0) {
|
|
657
|
+
return { ms: 0 };
|
|
658
|
+
}
|
|
659
|
+
const sum = values.reduce((s, v) => s + v.ms, 0);
|
|
660
|
+
return { ms: sum / values.length };
|
|
661
|
+
}
|
|
662
|
+
|
|
663
|
+
// src/evals/metrics/standard.ts
|
|
664
|
+
Metric.of({
|
|
665
|
+
id: "token-count",
|
|
666
|
+
name: "Tokens",
|
|
667
|
+
aggregate: aggregateTokenCountSum,
|
|
668
|
+
format: (data, options) => {
|
|
669
|
+
const input = data.input ?? 0;
|
|
670
|
+
const output = data.output ?? 0;
|
|
671
|
+
const inputCached = data.inputCached ?? 0;
|
|
672
|
+
const outputCached = data.outputCached ?? 0;
|
|
673
|
+
const cached = inputCached + outputCached;
|
|
674
|
+
const base = `in:${input} out:${output} cached:${cached}`;
|
|
675
|
+
return options?.isAggregated ? `Total: ${base}` : base;
|
|
676
|
+
}
|
|
677
|
+
});
|
|
678
|
+
Metric.of({
|
|
679
|
+
id: "latency",
|
|
680
|
+
name: "Latency",
|
|
681
|
+
aggregate: aggregateLatencyAverage,
|
|
682
|
+
format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
|
|
683
|
+
});
|
|
684
|
+
|
|
460
685
|
// src/evals/score.ts
|
|
461
686
|
var registry2 = /* @__PURE__ */ new Map();
|
|
462
687
|
function formatScoreData(def, data, options) {
|
|
@@ -469,10 +694,7 @@ var ScoreAggregate = {
|
|
|
469
694
|
const count = values.length || 1;
|
|
470
695
|
const result = {};
|
|
471
696
|
for (const field of fields) {
|
|
472
|
-
result[field] = values.reduce(
|
|
473
|
-
(s, v) => s + (v[field] ?? 0),
|
|
474
|
-
0
|
|
475
|
-
) / count;
|
|
697
|
+
result[field] = values.reduce((s, v) => s + (v[field] ?? 0), 0) / count;
|
|
476
698
|
}
|
|
477
699
|
return result;
|
|
478
700
|
};
|
|
@@ -506,13 +728,10 @@ var ScoreAggregate = {
|
|
|
506
728
|
(s, v) => s + (v[valueField] ?? 0),
|
|
507
729
|
0
|
|
508
730
|
);
|
|
509
|
-
const sumSq = values.reduce(
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
},
|
|
514
|
-
0
|
|
515
|
-
);
|
|
731
|
+
const sumSq = values.reduce((s, v) => {
|
|
732
|
+
const value = v[valueField] ?? 0;
|
|
733
|
+
return s + value * value;
|
|
734
|
+
}, 0);
|
|
516
735
|
const mean = sum / count;
|
|
517
736
|
const variance = (sumSq - count * mean * mean) / (count - 1);
|
|
518
737
|
stdDev = variance > 0 ? Math.sqrt(variance) : 0;
|
|
@@ -571,54 +790,6 @@ function getScoreById(id) {
|
|
|
571
790
|
return registry2.get(id);
|
|
572
791
|
}
|
|
573
792
|
|
|
574
|
-
// src/evals/aggregators.ts
|
|
575
|
-
function aggregateTokenCountSum(values) {
|
|
576
|
-
const initial = {
|
|
577
|
-
input: 0,
|
|
578
|
-
output: 0,
|
|
579
|
-
inputCached: 0,
|
|
580
|
-
outputCached: 0
|
|
581
|
-
};
|
|
582
|
-
return values.reduce(
|
|
583
|
-
(acc, v) => ({
|
|
584
|
-
input: acc.input + (v.input ?? 0),
|
|
585
|
-
output: acc.output + (v.output ?? 0),
|
|
586
|
-
inputCached: acc.inputCached + (v.inputCached ?? 0),
|
|
587
|
-
outputCached: acc.outputCached + (v.outputCached ?? 0)
|
|
588
|
-
}),
|
|
589
|
-
initial
|
|
590
|
-
);
|
|
591
|
-
}
|
|
592
|
-
function aggregateLatencyAverage(values) {
|
|
593
|
-
if (values.length === 0) {
|
|
594
|
-
return { ms: 0 };
|
|
595
|
-
}
|
|
596
|
-
const sum = values.reduce((s, v) => s + v.ms, 0);
|
|
597
|
-
return { ms: sum / values.length };
|
|
598
|
-
}
|
|
599
|
-
|
|
600
|
-
// src/evals/metrics/standard.ts
|
|
601
|
-
Metric.of({
|
|
602
|
-
id: "token-count",
|
|
603
|
-
name: "Tokens",
|
|
604
|
-
aggregate: aggregateTokenCountSum,
|
|
605
|
-
format: (data, options) => {
|
|
606
|
-
const input = data.input ?? 0;
|
|
607
|
-
const output = data.output ?? 0;
|
|
608
|
-
const inputCached = data.inputCached ?? 0;
|
|
609
|
-
const outputCached = data.outputCached ?? 0;
|
|
610
|
-
const cached = inputCached + outputCached;
|
|
611
|
-
const base = `in:${input} out:${output} cached:${cached}`;
|
|
612
|
-
return options?.isAggregated ? `Total: ${base}` : base;
|
|
613
|
-
}
|
|
614
|
-
});
|
|
615
|
-
Metric.of({
|
|
616
|
-
id: "latency",
|
|
617
|
-
name: "Latency",
|
|
618
|
-
aggregate: aggregateLatencyAverage,
|
|
619
|
-
format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
|
|
620
|
-
});
|
|
621
|
-
|
|
622
793
|
// src/evals/scores/standard.ts
|
|
623
794
|
Score.of({
|
|
624
795
|
id: "percent",
|
|
@@ -762,15 +933,17 @@ function readOutput(testCase) {
|
|
|
762
933
|
}
|
|
763
934
|
return candidate.getOutput();
|
|
764
935
|
}
|
|
765
|
-
function buildEvaluationUnits(testCases) {
|
|
936
|
+
function buildEvaluationUnits(testCases, repetitionCount) {
|
|
937
|
+
const count = Math.max(1, repetitionCount);
|
|
766
938
|
const units = [];
|
|
767
939
|
for (const testCaseItem of testCases) {
|
|
768
|
-
const
|
|
769
|
-
for (let r = 0; r <
|
|
940
|
+
const repetitionId = `rep-${crypto.randomUUID()}`;
|
|
941
|
+
for (let r = 0; r < count; r++) {
|
|
770
942
|
units.push({
|
|
771
943
|
testCaseItem,
|
|
772
|
-
|
|
773
|
-
|
|
944
|
+
repetitionId,
|
|
945
|
+
repetitionIndex: r + 1,
|
|
946
|
+
repetitionCount: count
|
|
774
947
|
});
|
|
775
948
|
}
|
|
776
949
|
}
|
|
@@ -780,29 +953,24 @@ function nowIsoForFile() {
|
|
|
780
953
|
return (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
781
954
|
}
|
|
782
955
|
function createArtifactPath(artifactDirectory, datasetId, runId) {
|
|
783
|
-
return path.join(
|
|
784
|
-
artifactDirectory,
|
|
785
|
-
`${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
|
|
786
|
-
);
|
|
956
|
+
return path.join(artifactDirectory, `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`);
|
|
787
957
|
}
|
|
788
958
|
function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
|
|
789
|
-
const { testCaseItem,
|
|
959
|
+
const { testCaseItem, repetitionId, repetitionIndex, repetitionCount } = unit;
|
|
790
960
|
return effect.Effect.gen(function* () {
|
|
791
961
|
const evaluatorRunId = `run-${crypto.randomUUID()}`;
|
|
792
962
|
const started = Date.now();
|
|
793
|
-
const startedEvaluations = yield* effect.Ref.modify(startedRef, (n) => [
|
|
794
|
-
n + 1,
|
|
795
|
-
n + 1
|
|
796
|
-
]);
|
|
963
|
+
const startedEvaluations = yield* effect.Ref.modify(startedRef, (n) => [n + 1, n + 1]);
|
|
797
964
|
yield* publishEvent({
|
|
798
965
|
type: "TestCaseStarted",
|
|
799
966
|
runId: task.runId,
|
|
800
967
|
testCaseId: testCaseItem.id,
|
|
801
|
-
testCaseName: testCaseItem.testCase
|
|
968
|
+
testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
|
|
802
969
|
startedTestCases: startedEvaluations,
|
|
803
970
|
totalTestCases: totalEvaluations,
|
|
804
|
-
|
|
805
|
-
|
|
971
|
+
repetitionId,
|
|
972
|
+
repetitionIndex,
|
|
973
|
+
repetitionCount
|
|
806
974
|
});
|
|
807
975
|
const evaluatorScores = [];
|
|
808
976
|
let testCaseError;
|
|
@@ -826,9 +994,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
826
994
|
return error;
|
|
827
995
|
};
|
|
828
996
|
try {
|
|
829
|
-
const ctx = yield* effect.Effect.promise(
|
|
830
|
-
() => Promise.resolve(evaluator.resolveContext())
|
|
831
|
-
);
|
|
997
|
+
const ctx = yield* effect.Effect.promise(() => Promise.resolve(evaluator.resolveContext()));
|
|
832
998
|
const result = yield* effect.Effect.promise(
|
|
833
999
|
() => Promise.resolve().then(
|
|
834
1000
|
() => evaluateFn({
|
|
@@ -838,8 +1004,15 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
838
1004
|
meta: {
|
|
839
1005
|
triggerId: task.triggerId,
|
|
840
1006
|
runId: evaluatorRunId,
|
|
841
|
-
datasetId: task.datasetId
|
|
1007
|
+
datasetId: task.datasetId,
|
|
1008
|
+
repetitionId,
|
|
1009
|
+
repetitionIndex,
|
|
1010
|
+
repetitionCount,
|
|
1011
|
+
runConfigName: task.runConfigName
|
|
842
1012
|
},
|
|
1013
|
+
testCaseTags: getTestCaseTagList(testCaseItem.testCase),
|
|
1014
|
+
runConfigTags: task.runConfigTags,
|
|
1015
|
+
evaluatorTags: getEvaluatorTagList(evaluator),
|
|
843
1016
|
logDiff,
|
|
844
1017
|
log,
|
|
845
1018
|
createError
|
|
@@ -882,21 +1055,19 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
882
1055
|
});
|
|
883
1056
|
}
|
|
884
1057
|
}
|
|
885
|
-
const
|
|
886
|
-
const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [
|
|
887
|
-
n + 1,
|
|
888
|
-
n + 1
|
|
889
|
-
]);
|
|
1058
|
+
const repetitionPassedThis = evaluatorScores.every((s) => s.passed);
|
|
1059
|
+
const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [n + 1, n + 1]);
|
|
890
1060
|
const progressEvent = {
|
|
891
1061
|
type: "TestCaseProgress",
|
|
892
1062
|
runId: task.runId,
|
|
893
1063
|
testCaseId: testCaseItem.id,
|
|
894
|
-
testCaseName: testCaseItem.testCase
|
|
1064
|
+
testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
|
|
895
1065
|
completedTestCases: completedEvaluations,
|
|
896
1066
|
totalTestCases: totalEvaluations,
|
|
897
|
-
|
|
898
|
-
|
|
899
|
-
|
|
1067
|
+
repetitionId,
|
|
1068
|
+
repetitionIndex,
|
|
1069
|
+
repetitionCount,
|
|
1070
|
+
passed: repetitionPassedThis,
|
|
900
1071
|
durationMs: Date.now() - started,
|
|
901
1072
|
evaluatorScores,
|
|
902
1073
|
output,
|
|
@@ -917,9 +1088,9 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
917
1088
|
(map) => {
|
|
918
1089
|
const key = testCaseItem.id;
|
|
919
1090
|
const existing = map.get(key) ?? { completedCount: 0, results: [] };
|
|
920
|
-
const newResults = [...existing.results,
|
|
1091
|
+
const newResults = [...existing.results, repetitionPassedThis];
|
|
921
1092
|
const newCompletedCount = existing.completedCount + 1;
|
|
922
|
-
const isLast = newCompletedCount ===
|
|
1093
|
+
const isLast = newCompletedCount === repetitionCount;
|
|
923
1094
|
const newMap = new Map(map);
|
|
924
1095
|
newMap.set(key, {
|
|
925
1096
|
completedCount: newCompletedCount,
|
|
@@ -935,10 +1106,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
935
1106
|
} else {
|
|
936
1107
|
yield* effect.Ref.update(failedRef, (n) => n + 1);
|
|
937
1108
|
}
|
|
938
|
-
const [passed, failed] = yield* effect.Effect.all([
|
|
939
|
-
effect.Ref.get(passedRef),
|
|
940
|
-
effect.Ref.get(failedRef)
|
|
941
|
-
]);
|
|
1109
|
+
const [passed, failed] = yield* effect.Effect.all([effect.Ref.get(passedRef), effect.Ref.get(failedRef)]);
|
|
942
1110
|
yield* updateSnapshot(task.runId, (snapshot) => ({
|
|
943
1111
|
...snapshot,
|
|
944
1112
|
passedTestCases: passed,
|
|
@@ -959,10 +1127,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
959
1127
|
runId: task.runId,
|
|
960
1128
|
startedAt
|
|
961
1129
|
});
|
|
962
|
-
const totalEvaluations = task.testCases.
|
|
963
|
-
(sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
|
|
964
|
-
0
|
|
965
|
-
);
|
|
1130
|
+
const totalEvaluations = task.testCases.length * Math.max(1, task.repetitions);
|
|
966
1131
|
const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
|
|
967
1132
|
const completedRef = yield* effect.Ref.make(0);
|
|
968
1133
|
const startedRef = yield* effect.Ref.make(0);
|
|
@@ -971,7 +1136,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
971
1136
|
const testCaseResultsRef = yield* effect.Ref.make(
|
|
972
1137
|
/* @__PURE__ */ new Map()
|
|
973
1138
|
);
|
|
974
|
-
const evaluationUnits = buildEvaluationUnits(task.testCases);
|
|
1139
|
+
const evaluationUnits = buildEvaluationUnits(task.testCases, task.repetitions);
|
|
975
1140
|
const processEvaluation = (unit) => processOneEvaluation(
|
|
976
1141
|
task,
|
|
977
1142
|
unit,
|
|
@@ -985,11 +1150,20 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
985
1150
|
failedRef,
|
|
986
1151
|
testCaseResultsRef
|
|
987
1152
|
);
|
|
988
|
-
|
|
989
|
-
|
|
990
|
-
|
|
991
|
-
|
|
992
|
-
|
|
1153
|
+
const globalSem = task.globalEvaluationSemaphore;
|
|
1154
|
+
if (globalSem !== void 0) {
|
|
1155
|
+
yield* effect.Effect.forEach(
|
|
1156
|
+
evaluationUnits,
|
|
1157
|
+
(unit) => globalSem.withPermits(1)(processEvaluation(unit)),
|
|
1158
|
+
{ concurrency: "unbounded", discard: true }
|
|
1159
|
+
);
|
|
1160
|
+
} else {
|
|
1161
|
+
yield* effect.Effect.forEach(
|
|
1162
|
+
evaluationUnits,
|
|
1163
|
+
processEvaluation,
|
|
1164
|
+
maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
|
|
1165
|
+
);
|
|
1166
|
+
}
|
|
993
1167
|
const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* effect.Effect.all([
|
|
994
1168
|
effect.Ref.get(completedRef),
|
|
995
1169
|
effect.Ref.get(passedRef),
|
|
@@ -1025,125 +1199,34 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
1025
1199
|
artifactPath: task.snapshot.artifactPath
|
|
1026
1200
|
});
|
|
1027
1201
|
});
|
|
1028
|
-
|
|
1029
|
-
|
|
1030
|
-
|
|
1031
|
-
|
|
1032
|
-
|
|
1033
|
-
} catch {
|
|
1034
|
-
return [];
|
|
1035
|
-
}
|
|
1036
|
-
const jsonlFiles = entries.filter((name) => name.endsWith(".jsonl"));
|
|
1037
|
-
const snapshots = [];
|
|
1038
|
-
for (const fileName of jsonlFiles) {
|
|
1039
|
-
const filePath = path.join(baseDir, fileName);
|
|
1040
|
-
try {
|
|
1041
|
-
const snapshot = await parseArtifactToSnapshot(filePath, config);
|
|
1042
|
-
if (snapshot) {
|
|
1043
|
-
snapshots.push(snapshot);
|
|
1044
|
-
}
|
|
1045
|
-
} catch {
|
|
1046
|
-
}
|
|
1047
|
-
}
|
|
1048
|
-
return snapshots.sort((a, b) => b.queuedAt - a.queuedAt);
|
|
1049
|
-
}
|
|
1050
|
-
async function parseArtifactToSnapshot(filePath, _config) {
|
|
1051
|
-
const content = await promises.readFile(filePath, "utf8");
|
|
1052
|
-
const lines = content.split("\n").filter((line) => line.trim().length > 0);
|
|
1053
|
-
if (lines.length === 0) {
|
|
1054
|
-
return null;
|
|
1055
|
-
}
|
|
1056
|
-
let runQueued = null;
|
|
1057
|
-
let runCompleted = null;
|
|
1058
|
-
let runFailed = null;
|
|
1059
|
-
let runStarted = null;
|
|
1060
|
-
for (const line of lines) {
|
|
1061
|
-
try {
|
|
1062
|
-
const event = JSON.parse(line);
|
|
1063
|
-
const type = event.type;
|
|
1064
|
-
if (type === "RunQueued") {
|
|
1065
|
-
runQueued = {
|
|
1066
|
-
runId: event.runId,
|
|
1067
|
-
datasetId: event.datasetId,
|
|
1068
|
-
datasetName: event.datasetName,
|
|
1069
|
-
evaluatorIds: event.evaluatorIds,
|
|
1070
|
-
totalTestCases: event.totalTestCases ?? 0,
|
|
1071
|
-
artifactPath: event.artifactPath ?? filePath,
|
|
1072
|
-
ts: event.ts
|
|
1073
|
-
};
|
|
1074
|
-
}
|
|
1075
|
-
if (type === "RunStarted") {
|
|
1076
|
-
runStarted = { startedAt: event.startedAt };
|
|
1077
|
-
}
|
|
1078
|
-
if (type === "RunCompleted") {
|
|
1079
|
-
runCompleted = {
|
|
1080
|
-
passedTestCases: event.passedTestCases,
|
|
1081
|
-
failedTestCases: event.failedTestCases,
|
|
1082
|
-
totalTestCases: event.totalTestCases,
|
|
1083
|
-
finishedAt: event.finishedAt
|
|
1084
|
-
};
|
|
1085
|
-
}
|
|
1086
|
-
if (type === "RunFailed") {
|
|
1087
|
-
runFailed = {
|
|
1088
|
-
finishedAt: event.finishedAt,
|
|
1089
|
-
errorMessage: event.errorMessage
|
|
1090
|
-
};
|
|
1091
|
-
}
|
|
1092
|
-
} catch {
|
|
1093
|
-
}
|
|
1202
|
+
|
|
1203
|
+
// src/runner/name-pattern.ts
|
|
1204
|
+
function parseRegexLiteral(pattern) {
|
|
1205
|
+
if (!pattern.startsWith("/")) {
|
|
1206
|
+
return void 0;
|
|
1094
1207
|
}
|
|
1095
|
-
|
|
1096
|
-
|
|
1208
|
+
const lastSlash = pattern.lastIndexOf("/");
|
|
1209
|
+
if (lastSlash <= 0) {
|
|
1210
|
+
return void 0;
|
|
1097
1211
|
}
|
|
1098
|
-
const artifactPath = filePath;
|
|
1099
|
-
const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
|
|
1100
|
-
const progress = aggregateTestCaseProgress(lines);
|
|
1101
|
-
const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
|
|
1102
|
-
const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
|
|
1103
|
-
const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
|
|
1104
1212
|
return {
|
|
1105
|
-
|
|
1106
|
-
|
|
1107
|
-
datasetName: runQueued.datasetName,
|
|
1108
|
-
evaluatorIds: runQueued.evaluatorIds,
|
|
1109
|
-
queuedAt: runQueued.ts ?? 0,
|
|
1110
|
-
startedAt: runStarted?.startedAt,
|
|
1111
|
-
finishedAt: runCompleted?.finishedAt ?? runFailed?.finishedAt,
|
|
1112
|
-
totalTestCases: runQueued.totalTestCases,
|
|
1113
|
-
completedTestCases,
|
|
1114
|
-
passedTestCases,
|
|
1115
|
-
failedTestCases,
|
|
1116
|
-
status,
|
|
1117
|
-
artifactPath,
|
|
1118
|
-
errorMessage: runFailed?.errorMessage
|
|
1213
|
+
source: pattern.slice(1, lastSlash),
|
|
1214
|
+
flags: pattern.slice(lastSlash + 1)
|
|
1119
1215
|
};
|
|
1120
1216
|
}
|
|
1121
|
-
function
|
|
1122
|
-
|
|
1123
|
-
const
|
|
1124
|
-
|
|
1125
|
-
|
|
1126
|
-
|
|
1127
|
-
if (event.type === "TestCaseProgress") {
|
|
1128
|
-
const ev = event;
|
|
1129
|
-
completedTestCases = ev.completedTestCases ?? completedTestCases;
|
|
1130
|
-
const id = ev.testCaseId;
|
|
1131
|
-
const current = testCasePassedBy.get(id);
|
|
1132
|
-
testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
|
|
1133
|
-
}
|
|
1134
|
-
} catch {
|
|
1135
|
-
}
|
|
1217
|
+
function createNameMatcher(pattern) {
|
|
1218
|
+
const normalizedPattern = pattern.trim();
|
|
1219
|
+
const regexLiteral = parseRegexLiteral(normalizedPattern);
|
|
1220
|
+
if (regexLiteral) {
|
|
1221
|
+
const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
|
|
1222
|
+
return (value) => regex.test(value);
|
|
1136
1223
|
}
|
|
1137
|
-
|
|
1138
|
-
|
|
1139
|
-
|
|
1140
|
-
|
|
1141
|
-
passedTestCases += 1;
|
|
1142
|
-
} else {
|
|
1143
|
-
failedTestCases += 1;
|
|
1144
|
-
}
|
|
1224
|
+
if (normalizedPattern.includes("*")) {
|
|
1225
|
+
const escaped = normalizedPattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
|
|
1226
|
+
const regex = new RegExp(`^${escaped}$`, "i");
|
|
1227
|
+
return (value) => regex.test(value);
|
|
1145
1228
|
}
|
|
1146
|
-
return
|
|
1229
|
+
return (value) => value.toLowerCase() === normalizedPattern.toLowerCase();
|
|
1147
1230
|
}
|
|
1148
1231
|
async function appendJsonLine(artifactPath, payload) {
|
|
1149
1232
|
await promises.mkdir(path.dirname(artifactPath), { recursive: true });
|
|
@@ -1202,32 +1285,12 @@ function searchCollectedTestCases(all, query) {
|
|
|
1202
1285
|
}
|
|
1203
1286
|
|
|
1204
1287
|
// src/runner/api.ts
|
|
1205
|
-
function
|
|
1206
|
-
|
|
1207
|
-
|
|
1208
|
-
|
|
1209
|
-
const lastSlash = pattern.lastIndexOf("/");
|
|
1210
|
-
if (lastSlash <= 0) {
|
|
1211
|
-
return void 0;
|
|
1212
|
-
}
|
|
1213
|
-
return {
|
|
1214
|
-
source: pattern.slice(1, lastSlash),
|
|
1215
|
-
flags: pattern.slice(lastSlash + 1)
|
|
1216
|
-
};
|
|
1217
|
-
}
|
|
1218
|
-
function createNameMatcher(pattern) {
|
|
1219
|
-
const normalizedPattern = pattern.trim();
|
|
1220
|
-
const regexLiteral = parseRegexLiteral(normalizedPattern);
|
|
1221
|
-
if (regexLiteral) {
|
|
1222
|
-
const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
|
|
1223
|
-
return (value) => regex.test(value);
|
|
1224
|
-
}
|
|
1225
|
-
if (normalizedPattern.includes("*")) {
|
|
1226
|
-
const escaped = normalizedPattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
|
|
1227
|
-
const regex = new RegExp(`^${escaped}$`, "i");
|
|
1228
|
-
return (value) => regex.test(value);
|
|
1288
|
+
function normalizeRunRepetitions(value) {
|
|
1289
|
+
const n = value ?? 1;
|
|
1290
|
+
if (!Number.isInteger(n) || n < 1) {
|
|
1291
|
+
throw new Error(`repetitions must be a positive integer, got ${String(value)}`);
|
|
1229
1292
|
}
|
|
1230
|
-
return
|
|
1293
|
+
return n;
|
|
1231
1294
|
}
|
|
1232
1295
|
function mergeRunnerOverrides(base, next) {
|
|
1233
1296
|
if (!base) {
|
|
@@ -1258,15 +1321,12 @@ var EffectRunner = class {
|
|
|
1258
1321
|
this.persistenceQueue = effect.Effect.runSync(
|
|
1259
1322
|
effect.Queue.unbounded()
|
|
1260
1323
|
);
|
|
1261
|
-
this.snapshotsRef = effect.Effect.runSync(
|
|
1262
|
-
effect.Ref.make(/* @__PURE__ */ new Map())
|
|
1263
|
-
);
|
|
1324
|
+
this.snapshotsRef = effect.Effect.runSync(effect.Ref.make(/* @__PURE__ */ new Map()));
|
|
1264
1325
|
this.listeners = /* @__PURE__ */ new Set();
|
|
1265
1326
|
this.datasetsById = /* @__PURE__ */ new Map();
|
|
1266
1327
|
this.evaluatorsById = /* @__PURE__ */ new Map();
|
|
1267
|
-
this.
|
|
1268
|
-
|
|
1269
|
-
);
|
|
1328
|
+
this.runConfigsById = /* @__PURE__ */ new Map();
|
|
1329
|
+
this.schedulerFiber = effect.Effect.runFork(this.createSchedulerEffect());
|
|
1270
1330
|
this.persistenceFiber = effect.Effect.runFork(
|
|
1271
1331
|
createPersistenceWorker(this.persistenceQueue)
|
|
1272
1332
|
);
|
|
@@ -1306,6 +1366,137 @@ var EffectRunner = class {
|
|
|
1306
1366
|
(item) => matcher(item.evaluator.getName() ?? "")
|
|
1307
1367
|
);
|
|
1308
1368
|
}
|
|
1369
|
+
async collectRunConfigs() {
|
|
1370
|
+
const runConfigs = await collectRunConfigsFromFiles(this.config.discovery);
|
|
1371
|
+
this.runConfigsById.clear();
|
|
1372
|
+
const byNameLower = /* @__PURE__ */ new Map();
|
|
1373
|
+
for (const item of runConfigs) {
|
|
1374
|
+
const id = item.runConfig.getName();
|
|
1375
|
+
const lower = id.toLowerCase();
|
|
1376
|
+
const prev = byNameLower.get(lower);
|
|
1377
|
+
if (prev !== void 0 && prev.filePath !== item.filePath) {
|
|
1378
|
+
throw new Error(
|
|
1379
|
+
`Duplicate RunConfig name "${id}" (matches "${prev.runConfig.getName()}" case-insensitively): ${prev.filePath} and ${item.filePath}`
|
|
1380
|
+
);
|
|
1381
|
+
}
|
|
1382
|
+
byNameLower.set(lower, item);
|
|
1383
|
+
this.runConfigsById.set(id, item);
|
|
1384
|
+
}
|
|
1385
|
+
return runConfigs;
|
|
1386
|
+
}
|
|
1387
|
+
async resolveRunConfigByName(name) {
|
|
1388
|
+
if (this.runConfigsById.size === 0) {
|
|
1389
|
+
await this.collectRunConfigs();
|
|
1390
|
+
}
|
|
1391
|
+
const key = validateRunConfigName(name, `RunConfig "${name.trim()}"`);
|
|
1392
|
+
const keyLower = key.toLowerCase();
|
|
1393
|
+
const matches = Array.from(this.runConfigsById.values()).filter(
|
|
1394
|
+
(item) => item.runConfig.getName().toLowerCase() === keyLower
|
|
1395
|
+
);
|
|
1396
|
+
if (matches.length === 0) {
|
|
1397
|
+
return void 0;
|
|
1398
|
+
}
|
|
1399
|
+
if (matches.length > 1) {
|
|
1400
|
+
throw new Error(
|
|
1401
|
+
`Multiple RunConfigs named "${name}": ${matches.map((m) => m.filePath).join(", ")}`
|
|
1402
|
+
);
|
|
1403
|
+
}
|
|
1404
|
+
return matches[0];
|
|
1405
|
+
}
|
|
1406
|
+
async expandRunConfigToJobs(collected) {
|
|
1407
|
+
if (this.datasetsById.size === 0) {
|
|
1408
|
+
await this.collectDatasets();
|
|
1409
|
+
}
|
|
1410
|
+
if (this.evaluatorsById.size === 0) {
|
|
1411
|
+
await this.collectEvaluators();
|
|
1412
|
+
}
|
|
1413
|
+
const rcName = collected.runConfig.getName();
|
|
1414
|
+
const jobs = [];
|
|
1415
|
+
const runs = collected.runConfig.getRuns();
|
|
1416
|
+
for (const [i, row] of runs.entries()) {
|
|
1417
|
+
const dsCollected = Array.from(this.datasetsById.values()).find(
|
|
1418
|
+
(d) => d.dataset === row.dataset
|
|
1419
|
+
);
|
|
1420
|
+
if (!dsCollected) {
|
|
1421
|
+
throw new Error(
|
|
1422
|
+
`RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getName()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
|
|
1423
|
+
);
|
|
1424
|
+
}
|
|
1425
|
+
let evaluatorIds;
|
|
1426
|
+
if ("evaluatorPattern" in row && typeof row.evaluatorPattern === "string") {
|
|
1427
|
+
const matcher = createNameMatcher(row.evaluatorPattern);
|
|
1428
|
+
const matched = Array.from(this.evaluatorsById.values()).filter(
|
|
1429
|
+
(item) => matcher(item.evaluator.getName() ?? "")
|
|
1430
|
+
);
|
|
1431
|
+
if (matched.length === 0) {
|
|
1432
|
+
throw new Error(
|
|
1433
|
+
`RunConfig "${rcName}" run[${i}]: no evaluator matched pattern "${row.evaluatorPattern}"`
|
|
1434
|
+
);
|
|
1435
|
+
}
|
|
1436
|
+
evaluatorIds = matched.map((item) => item.id);
|
|
1437
|
+
} else {
|
|
1438
|
+
const evaluators = row.evaluators;
|
|
1439
|
+
evaluatorIds = [];
|
|
1440
|
+
for (const ev of evaluators) {
|
|
1441
|
+
const found = Array.from(this.evaluatorsById.values()).find(
|
|
1442
|
+
(item) => item.evaluator === ev
|
|
1443
|
+
);
|
|
1444
|
+
if (!found) {
|
|
1445
|
+
throw new Error(
|
|
1446
|
+
`RunConfig "${rcName}" run[${i}]: evaluator "${getEvaluatorDisplayLabel(ev) ?? "unknown"}" was not found among discovered evaluator exports`
|
|
1447
|
+
);
|
|
1448
|
+
}
|
|
1449
|
+
evaluatorIds.push(found.id);
|
|
1450
|
+
}
|
|
1451
|
+
}
|
|
1452
|
+
const repetitions = "repetitions" in row && row.repetitions !== void 0 ? row.repetitions : 1;
|
|
1453
|
+
jobs.push({
|
|
1454
|
+
datasetId: dsCollected.id,
|
|
1455
|
+
evaluatorIds,
|
|
1456
|
+
runConfigName: rcName,
|
|
1457
|
+
runConfigDisplayLabel: collected.runConfig.getDisplayLabel(),
|
|
1458
|
+
runConfigTags: collected.runConfig.getTags(),
|
|
1459
|
+
repetitions
|
|
1460
|
+
});
|
|
1461
|
+
}
|
|
1462
|
+
return jobs;
|
|
1463
|
+
}
|
|
1464
|
+
async expandRunConfigNamesToJobs(names) {
|
|
1465
|
+
const jobs = [];
|
|
1466
|
+
for (const name of names) {
|
|
1467
|
+
const collected = await this.resolveRunConfigByName(name);
|
|
1468
|
+
if (!collected) {
|
|
1469
|
+
const known = await this.collectRunConfigs();
|
|
1470
|
+
const available = known.map((r) => r.runConfig.getName()).sort();
|
|
1471
|
+
throw new Error(
|
|
1472
|
+
available.length > 0 ? `RunConfig "${name}" not found. Available RunConfigs: ${available.join(", ")}` : `RunConfig "${name}" not found and no RunConfigs were discovered.`
|
|
1473
|
+
);
|
|
1474
|
+
}
|
|
1475
|
+
jobs.push(...await this.expandRunConfigToJobs(collected));
|
|
1476
|
+
}
|
|
1477
|
+
return jobs;
|
|
1478
|
+
}
|
|
1479
|
+
async runDatasetJobsWithSharedConcurrency(request) {
|
|
1480
|
+
const globalConcurrency = Math.max(1, request.globalConcurrency);
|
|
1481
|
+
const sem = effect.Effect.unsafeMakeSemaphore(globalConcurrency);
|
|
1482
|
+
const triggerId = request.triggerId ?? `trg-${crypto.randomUUID()}`;
|
|
1483
|
+
const snapshots = [];
|
|
1484
|
+
for (const job of request.jobs) {
|
|
1485
|
+
snapshots.push(
|
|
1486
|
+
await this.startDatasetRun({
|
|
1487
|
+
datasetId: job.datasetId,
|
|
1488
|
+
evaluatorIds: job.evaluatorIds,
|
|
1489
|
+
triggerId,
|
|
1490
|
+
maxConcurrency: this.config.maxConcurrency ?? 1,
|
|
1491
|
+
globalEvaluationSemaphore: sem,
|
|
1492
|
+
runConfigName: job.runConfigName,
|
|
1493
|
+
runConfigTags: job.runConfigTags,
|
|
1494
|
+
repetitions: job.repetitions
|
|
1495
|
+
})
|
|
1496
|
+
);
|
|
1497
|
+
}
|
|
1498
|
+
return snapshots;
|
|
1499
|
+
}
|
|
1309
1500
|
async searchTestCases(query) {
|
|
1310
1501
|
const testCases = await collectTestCasesFromFiles(this.config.discovery);
|
|
1311
1502
|
return searchCollectedTestCases(testCases, query);
|
|
@@ -1324,35 +1515,45 @@ var EffectRunner = class {
|
|
|
1324
1515
|
);
|
|
1325
1516
|
}
|
|
1326
1517
|
async runDatasetWith(request) {
|
|
1518
|
+
const runConfigName = validateRunConfigName(
|
|
1519
|
+
request.runConfigName,
|
|
1520
|
+
"runDatasetWith.runConfigName"
|
|
1521
|
+
);
|
|
1522
|
+
return this.startDatasetRun({
|
|
1523
|
+
datasetId: request.datasetId,
|
|
1524
|
+
evaluatorIds: request.evaluatorIds,
|
|
1525
|
+
triggerId: request.triggerId,
|
|
1526
|
+
maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
|
|
1527
|
+
repetitions: request.repetitions,
|
|
1528
|
+
runConfigName,
|
|
1529
|
+
runConfigTags: request.runConfigTags
|
|
1530
|
+
});
|
|
1531
|
+
}
|
|
1532
|
+
async startDatasetRun(params) {
|
|
1327
1533
|
if (this.datasetsById.size === 0) {
|
|
1328
1534
|
await this.collectDatasets();
|
|
1329
1535
|
}
|
|
1330
1536
|
if (this.evaluatorsById.size === 0) {
|
|
1331
1537
|
await this.collectEvaluators();
|
|
1332
1538
|
}
|
|
1333
|
-
const dataset = this.datasetsById.get(
|
|
1539
|
+
const dataset = this.datasetsById.get(params.datasetId);
|
|
1334
1540
|
if (!dataset) {
|
|
1335
|
-
throw new Error(`Unknown dataset: ${
|
|
1541
|
+
throw new Error(`Unknown dataset: ${params.datasetId}`);
|
|
1336
1542
|
}
|
|
1337
|
-
const selectedEvaluators =
|
|
1543
|
+
const selectedEvaluators = params.evaluatorIds.map((id) => this.evaluatorsById.get(id)).filter((value) => Boolean(value)).map((value) => ({ id: value.id, evaluator: value.evaluator }));
|
|
1338
1544
|
if (selectedEvaluators.length === 0) {
|
|
1339
1545
|
throw new Error("No evaluators selected for run");
|
|
1340
1546
|
}
|
|
1341
|
-
const selectedTestCases = await this.collectDatasetTestCases(
|
|
1342
|
-
const
|
|
1343
|
-
|
|
1344
|
-
|
|
1345
|
-
)
|
|
1346
|
-
const triggerId = request.triggerId ?? `trg-${crypto.randomUUID()}`;
|
|
1547
|
+
const selectedTestCases = await this.collectDatasetTestCases(params.datasetId);
|
|
1548
|
+
const repetitions = normalizeRunRepetitions(params.repetitions);
|
|
1549
|
+
const totalEvaluations = selectedTestCases.length * repetitions;
|
|
1550
|
+
const runConfigTags = [...params.runConfigTags ?? []];
|
|
1551
|
+
const triggerId = params.triggerId ?? `trg-${crypto.randomUUID()}`;
|
|
1347
1552
|
const runId = `run-${crypto.randomUUID()}`;
|
|
1348
|
-
const artifactPath = createArtifactPath(
|
|
1349
|
-
this.config.artifactDirectory,
|
|
1350
|
-
request.datasetId,
|
|
1351
|
-
runId
|
|
1352
|
-
);
|
|
1553
|
+
const artifactPath = createArtifactPath(this.config.artifactDirectory, params.datasetId, runId);
|
|
1353
1554
|
const snapshot = {
|
|
1354
1555
|
runId,
|
|
1355
|
-
datasetId:
|
|
1556
|
+
datasetId: params.datasetId,
|
|
1356
1557
|
datasetName: dataset.dataset.getName(),
|
|
1357
1558
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
1358
1559
|
queuedAt: Date.now(),
|
|
@@ -1373,7 +1574,7 @@ var EffectRunner = class {
|
|
|
1373
1574
|
const queuedEvent = {
|
|
1374
1575
|
type: "RunQueued",
|
|
1375
1576
|
runId,
|
|
1376
|
-
datasetId:
|
|
1577
|
+
datasetId: params.datasetId,
|
|
1377
1578
|
datasetName: dataset.dataset.getName(),
|
|
1378
1579
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
1379
1580
|
totalTestCases: totalEvaluations,
|
|
@@ -1387,17 +1588,20 @@ var EffectRunner = class {
|
|
|
1387
1588
|
payload: queuedEvent
|
|
1388
1589
|
})
|
|
1389
1590
|
);
|
|
1390
|
-
const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
|
|
1391
1591
|
await effect.Effect.runPromise(
|
|
1392
1592
|
effect.Queue.offer(this.runQueue, {
|
|
1393
1593
|
runId,
|
|
1394
1594
|
triggerId,
|
|
1395
|
-
datasetId:
|
|
1595
|
+
datasetId: params.datasetId,
|
|
1396
1596
|
dataset: dataset.dataset,
|
|
1397
1597
|
evaluators: selectedEvaluators,
|
|
1398
1598
|
testCases: selectedTestCases,
|
|
1399
1599
|
snapshot,
|
|
1400
|
-
maxConcurrency
|
|
1600
|
+
maxConcurrency: params.maxConcurrency,
|
|
1601
|
+
globalEvaluationSemaphore: params.globalEvaluationSemaphore,
|
|
1602
|
+
runConfigName: params.runConfigName,
|
|
1603
|
+
runConfigTags,
|
|
1604
|
+
repetitions
|
|
1401
1605
|
})
|
|
1402
1606
|
);
|
|
1403
1607
|
return snapshot;
|
|
@@ -1413,9 +1617,9 @@ var EffectRunner = class {
|
|
|
1413
1617
|
return effect.Effect.runSync(effect.Ref.get(this.snapshotsRef)).get(runId);
|
|
1414
1618
|
}
|
|
1415
1619
|
getAllRunSnapshots() {
|
|
1416
|
-
return Array.from(
|
|
1417
|
-
|
|
1418
|
-
)
|
|
1620
|
+
return Array.from(effect.Effect.runSync(effect.Ref.get(this.snapshotsRef)).values()).sort(
|
|
1621
|
+
(a, b) => b.queuedAt - a.queuedAt
|
|
1622
|
+
);
|
|
1419
1623
|
}
|
|
1420
1624
|
async loadRunSnapshotsFromArtifacts() {
|
|
1421
1625
|
return loadRunSnapshotsFromArtifacts(this.config);
|
|
@@ -1476,6 +1680,8 @@ function getDefaultConcurrency() {
|
|
|
1476
1680
|
function parseSimpleCliArgs(argv) {
|
|
1477
1681
|
const args = {
|
|
1478
1682
|
help: false,
|
|
1683
|
+
ci: false,
|
|
1684
|
+
runConfigNames: [],
|
|
1479
1685
|
unknownArgs: []
|
|
1480
1686
|
};
|
|
1481
1687
|
let index = 0;
|
|
@@ -1489,18 +1695,26 @@ function parseSimpleCliArgs(argv) {
|
|
|
1489
1695
|
args.help = true;
|
|
1490
1696
|
continue;
|
|
1491
1697
|
}
|
|
1698
|
+
if (token === "--ci") {
|
|
1699
|
+
args.ci = true;
|
|
1700
|
+
continue;
|
|
1701
|
+
}
|
|
1492
1702
|
if ((token === "--dataset" || token === "--datasetName") && argv[index + 1]) {
|
|
1493
1703
|
args.datasetName = argv[index + 1];
|
|
1494
1704
|
index += 1;
|
|
1495
1705
|
continue;
|
|
1496
1706
|
}
|
|
1497
|
-
if ((token === "--
|
|
1498
|
-
|
|
1707
|
+
if ((token === "--run-config" || token === "--runConfig") && argv[index + 1]) {
|
|
1708
|
+
const next = argv[index + 1];
|
|
1709
|
+
if (typeof next === "string") {
|
|
1710
|
+
args.runConfigNames.push(next);
|
|
1711
|
+
}
|
|
1499
1712
|
index += 1;
|
|
1500
1713
|
continue;
|
|
1501
1714
|
}
|
|
1502
1715
|
if ((token === "--concurrency" || token === "-c") && argv[index + 1]) {
|
|
1503
|
-
const
|
|
1716
|
+
const nextConc = argv[index + 1];
|
|
1717
|
+
const n = typeof nextConc === "string" ? parseInt(nextConc, 10) : Number.NaN;
|
|
1504
1718
|
if (!Number.isNaN(n) && n >= 1) {
|
|
1505
1719
|
args.concurrency = n;
|
|
1506
1720
|
}
|
|
@@ -1514,16 +1728,12 @@ function parseSimpleCliArgs(argv) {
|
|
|
1514
1728
|
function getSimpleCliUsage() {
|
|
1515
1729
|
return [
|
|
1516
1730
|
"Usage:",
|
|
1517
|
-
" eval-agents-simple run --
|
|
1731
|
+
" eval-agents-simple run --run-config <name> [--run-config <name> ...] [--concurrency N] [--ci]",
|
|
1518
1732
|
" eval-agents-simple generate --dataset <datasetName>",
|
|
1519
1733
|
"",
|
|
1520
1734
|
"Options:",
|
|
1521
|
-
" --
|
|
1522
|
-
""
|
|
1523
|
-
"Pattern examples for --evaluator:",
|
|
1524
|
-
" score-evaluator exact name (case-insensitive)",
|
|
1525
|
-
' "*score*" wildcard pattern',
|
|
1526
|
-
' "/score/i" regex literal'
|
|
1735
|
+
" --ci With run: exit with code 1 if any test case fails.",
|
|
1736
|
+
" --concurrency, -c N Max concurrent evaluations (default: 4). Use 1 for sequential."
|
|
1527
1737
|
].join("\n");
|
|
1528
1738
|
}
|
|
1529
1739
|
|
|
@@ -1557,9 +1767,9 @@ function GenerateView({
|
|
|
1557
1767
|
datasetName,
|
|
1558
1768
|
onComplete
|
|
1559
1769
|
}) {
|
|
1560
|
-
const [result, setResult] =
|
|
1561
|
-
const [error, setError] =
|
|
1562
|
-
|
|
1770
|
+
const [result, setResult] = React.useState(null);
|
|
1771
|
+
const [error, setError] = React.useState(null);
|
|
1772
|
+
React.useEffect(() => {
|
|
1563
1773
|
let cancelled = false;
|
|
1564
1774
|
async function run() {
|
|
1565
1775
|
const dataset = await runner.resolveDatasetByName(datasetName);
|
|
@@ -1574,7 +1784,7 @@ function GenerateView({
|
|
|
1574
1784
|
const payload = testCases.map((item) => {
|
|
1575
1785
|
const tc = item.testCase;
|
|
1576
1786
|
return {
|
|
1577
|
-
name: item.testCase
|
|
1787
|
+
name: getTestCaseDisplayLabel(item.testCase),
|
|
1578
1788
|
input: item.testCase.getInput(),
|
|
1579
1789
|
output: typeof tc.getOutput === "function" ? tc.getOutput() : void 0
|
|
1580
1790
|
};
|
|
@@ -1582,12 +1792,8 @@ function GenerateView({
|
|
|
1582
1792
|
const absoluteDatasetPath = resolve5(process.cwd(), dataset.filePath);
|
|
1583
1793
|
const parsed = parse2(absoluteDatasetPath);
|
|
1584
1794
|
const outputPath = join4(parsed.dir, `${parsed.name}.cases.json`);
|
|
1585
|
-
await writeFile2(
|
|
1586
|
-
|
|
1587
|
-
`${JSON.stringify(payload, null, 2)}
|
|
1588
|
-
`,
|
|
1589
|
-
"utf8"
|
|
1590
|
-
);
|
|
1795
|
+
await writeFile2(outputPath, `${JSON.stringify(payload, null, 2)}
|
|
1796
|
+
`, "utf8");
|
|
1591
1797
|
if (!cancelled) {
|
|
1592
1798
|
setResult({
|
|
1593
1799
|
count: payload.length,
|
|
@@ -1644,7 +1850,7 @@ async function generateDatasetJsonCommandPlain(runner, datasetName) {
|
|
|
1644
1850
|
}
|
|
1645
1851
|
const testCases = await runner.collectDatasetTestCases(dataset.id);
|
|
1646
1852
|
const payload = testCases.map((item) => ({
|
|
1647
|
-
name: item.testCase
|
|
1853
|
+
name: getTestCaseDisplayLabel(item.testCase),
|
|
1648
1854
|
input: item.testCase.getInput(),
|
|
1649
1855
|
output: readOutput2(item.testCase)
|
|
1650
1856
|
}));
|
|
@@ -1658,7 +1864,7 @@ async function generateDatasetJsonCommandPlain(runner, datasetName) {
|
|
|
1658
1864
|
async function generateDatasetJsonCommandInk(runner, datasetName) {
|
|
1659
1865
|
return new Promise((resolve5, reject) => {
|
|
1660
1866
|
const app = ink.render(
|
|
1661
|
-
|
|
1867
|
+
React__namespace.default.createElement(GenerateView, {
|
|
1662
1868
|
runner,
|
|
1663
1869
|
datasetName,
|
|
1664
1870
|
onComplete: (err) => {
|
|
@@ -1708,8 +1914,8 @@ function TextBar({
|
|
|
1708
1914
|
}
|
|
1709
1915
|
var FRAMES = ["\u280B", "\u2819", "\u2838", "\u2834", "\u2826", "\u2807"];
|
|
1710
1916
|
function Spinner({ label = "Running" }) {
|
|
1711
|
-
const [frame, setFrame] =
|
|
1712
|
-
|
|
1917
|
+
const [frame, setFrame] = React.useState(0);
|
|
1918
|
+
React.useEffect(() => {
|
|
1713
1919
|
const timer = setInterval(() => {
|
|
1714
1920
|
setFrame((f) => (f + 1) % FRAMES.length);
|
|
1715
1921
|
}, 100);
|
|
@@ -1743,9 +1949,7 @@ function createBar(value, max = 100, width = 20) {
|
|
|
1743
1949
|
function aggregateEvaluatorScores(events, nameById) {
|
|
1744
1950
|
if (events.length === 0)
|
|
1745
1951
|
return [];
|
|
1746
|
-
const evaluatorIds = new Set(
|
|
1747
|
-
events.flatMap((e) => e.evaluatorScores.map((x) => x.evaluatorId))
|
|
1748
|
-
);
|
|
1952
|
+
const evaluatorIds = new Set(events.flatMap((e) => e.evaluatorScores.map((x) => x.evaluatorId)));
|
|
1749
1953
|
const result = [];
|
|
1750
1954
|
for (const evaluatorId of evaluatorIds) {
|
|
1751
1955
|
const scoreIdToItems = /* @__PURE__ */ new Map();
|
|
@@ -1775,9 +1979,7 @@ function aggregateEvaluatorScores(events, nameById) {
|
|
|
1775
1979
|
return es?.passed ?? false;
|
|
1776
1980
|
});
|
|
1777
1981
|
const lastEvent = events[events.length - 1];
|
|
1778
|
-
const lastEs = lastEvent?.evaluatorScores.find(
|
|
1779
|
-
(x) => x.evaluatorId === evaluatorId
|
|
1780
|
-
);
|
|
1982
|
+
const lastEs = lastEvent?.evaluatorScores.find((x) => x.evaluatorId === evaluatorId);
|
|
1781
1983
|
result.push({
|
|
1782
1984
|
evaluatorId,
|
|
1783
1985
|
evaluatorName: nameById.get(evaluatorId) ?? evaluatorId,
|
|
@@ -1806,46 +2008,43 @@ function formatScorePart(item, _scoreToColor, options) {
|
|
|
1806
2008
|
}
|
|
1807
2009
|
function RunView({
|
|
1808
2010
|
runner,
|
|
1809
|
-
|
|
1810
|
-
evaluatorPattern,
|
|
2011
|
+
runConfigNames,
|
|
1811
2012
|
concurrency,
|
|
1812
2013
|
onComplete
|
|
1813
2014
|
}) {
|
|
1814
|
-
const [phase, setPhase] =
|
|
1815
|
-
|
|
1816
|
-
);
|
|
1817
|
-
const [
|
|
1818
|
-
const [
|
|
1819
|
-
const [
|
|
1820
|
-
const [
|
|
1821
|
-
const [
|
|
1822
|
-
const
|
|
1823
|
-
|
|
1824
|
-
|
|
1825
|
-
|
|
1826
|
-
if (!dataset) {
|
|
1827
|
-
const known = await runner.collectDatasets();
|
|
1828
|
-
const available = known.map((item) => item.dataset.getName()).sort();
|
|
1829
|
-
onComplete(
|
|
1830
|
-
new Error(
|
|
1831
|
-
available.length > 0 ? `Dataset "${datasetName}" not found. Available: ${available.join(", ")}` : `Dataset "${datasetName}" not found.`
|
|
1832
|
-
)
|
|
1833
|
-
);
|
|
2015
|
+
const [phase, setPhase] = React.useState("loading");
|
|
2016
|
+
const [runInfo, setRunInfo] = React.useState(null);
|
|
2017
|
+
const [testCases, setTestCases] = React.useState([]);
|
|
2018
|
+
const [startedEvaluations, setStartedEvaluations] = React.useState(0);
|
|
2019
|
+
const [completedEvaluations, setCompletedEvaluations] = React.useState(0);
|
|
2020
|
+
const [runningEvaluations, setRunningEvaluations] = React.useState([]);
|
|
2021
|
+
const [summary, setSummary] = React.useState(null);
|
|
2022
|
+
const [evaluatorNameById, setEvaluatorNameById] = React.useState(/* @__PURE__ */ new Map());
|
|
2023
|
+
const runEval = React.useCallback(async () => {
|
|
2024
|
+
const rcList = runConfigNames.filter((n) => n.trim().length > 0);
|
|
2025
|
+
if (rcList.length === 0) {
|
|
2026
|
+
onComplete(new Error("At least one RunConfig name is required."));
|
|
1834
2027
|
return;
|
|
1835
2028
|
}
|
|
1836
|
-
|
|
1837
|
-
|
|
1838
|
-
|
|
1839
|
-
|
|
1840
|
-
|
|
1841
|
-
|
|
1842
|
-
|
|
1843
|
-
|
|
1844
|
-
|
|
2029
|
+
setStartedEvaluations(0);
|
|
2030
|
+
setCompletedEvaluations(0);
|
|
2031
|
+
setTestCases([]);
|
|
2032
|
+
setRunningEvaluations([]);
|
|
2033
|
+
setSummary(null);
|
|
2034
|
+
let jobs;
|
|
2035
|
+
try {
|
|
2036
|
+
jobs = await runner.expandRunConfigNamesToJobs(rcList);
|
|
2037
|
+
} catch (err) {
|
|
2038
|
+
onComplete(err instanceof Error ? err : new Error(String(err)));
|
|
2039
|
+
return;
|
|
2040
|
+
}
|
|
2041
|
+
if (jobs.length === 0) {
|
|
2042
|
+
onComplete(new Error("No jobs expanded from RunConfigs."));
|
|
1845
2043
|
return;
|
|
1846
2044
|
}
|
|
2045
|
+
const allEvaluators = await runner.collectEvaluators();
|
|
1847
2046
|
const nameById = new Map(
|
|
1848
|
-
|
|
2047
|
+
allEvaluators.map((item) => [item.id, getEvaluatorDisplayLabel(item.evaluator) ?? item.id])
|
|
1849
2048
|
);
|
|
1850
2049
|
setEvaluatorNameById(nameById);
|
|
1851
2050
|
const aggregates = /* @__PURE__ */ new Map();
|
|
@@ -1853,21 +2052,30 @@ function RunView({
|
|
|
1853
2052
|
let overallScoreTotal = 0;
|
|
1854
2053
|
let overallScoreSumSq = 0;
|
|
1855
2054
|
let overallScoreCount = 0;
|
|
1856
|
-
const
|
|
2055
|
+
const batchPendingRunIds = /* @__PURE__ */ new Set();
|
|
2056
|
+
const runIdToLabel = /* @__PURE__ */ new Map();
|
|
2057
|
+
let batchReady = false;
|
|
2058
|
+
const completedRuns = /* @__PURE__ */ new Map();
|
|
2059
|
+
const done = new Promise((resolve5, reject) => {
|
|
1857
2060
|
const unsubscribe = runner.subscribeRunEvents((event) => {
|
|
2061
|
+
if (batchReady && "runId" in event && typeof event.runId === "string" && !batchPendingRunIds.has(event.runId)) {
|
|
2062
|
+
return;
|
|
2063
|
+
}
|
|
1858
2064
|
if (event.type === "TestCaseStarted") {
|
|
1859
|
-
setStartedEvaluations(
|
|
2065
|
+
setStartedEvaluations((c) => c + 1);
|
|
1860
2066
|
setRunningEvaluations((prev) => {
|
|
1861
2067
|
const withoutDuplicate = prev.filter(
|
|
1862
|
-
(item) => !(item.testCaseId === event.testCaseId && item.
|
|
2068
|
+
(item) => !(item.testCaseId === event.testCaseId && item.repetitionIndex === event.repetitionIndex && item.runId === event.runId)
|
|
1863
2069
|
);
|
|
1864
2070
|
return [
|
|
1865
2071
|
...withoutDuplicate,
|
|
1866
2072
|
{
|
|
2073
|
+
runId: event.runId,
|
|
1867
2074
|
testCaseId: event.testCaseId,
|
|
1868
2075
|
name: event.testCaseName,
|
|
1869
|
-
|
|
1870
|
-
|
|
2076
|
+
repetitionId: event.repetitionId,
|
|
2077
|
+
repetitionIndex: event.repetitionIndex,
|
|
2078
|
+
repetitionCount: event.repetitionCount,
|
|
1871
2079
|
startedTestCases: event.startedTestCases,
|
|
1872
2080
|
totalTestCases: event.totalTestCases
|
|
1873
2081
|
}
|
|
@@ -1903,9 +2111,12 @@ function RunView({
|
|
|
1903
2111
|
scoreItemsByEvaluatorScore.set(key, list);
|
|
1904
2112
|
}
|
|
1905
2113
|
}
|
|
2114
|
+
const label = runIdToLabel.get(event.runId);
|
|
2115
|
+
const compositeId = `${event.runId}:${event.testCaseId}`;
|
|
2116
|
+
const displayName = label !== void 0 ? `${label} \u203A ${event.testCaseName}` : event.testCaseName;
|
|
1906
2117
|
setTestCases((prev) => {
|
|
1907
2118
|
const byId = new Map(prev.map((tc) => [tc.testCaseId, tc]));
|
|
1908
|
-
const existing = byId.get(
|
|
2119
|
+
const existing = byId.get(compositeId);
|
|
1909
2120
|
const newEvent = {
|
|
1910
2121
|
evaluatorScores: event.evaluatorScores.map((item) => ({
|
|
1911
2122
|
evaluatorId: item.evaluatorId,
|
|
@@ -1920,17 +2131,14 @@ function RunView({
|
|
|
1920
2131
|
};
|
|
1921
2132
|
const events = existing ? [...existing.events, newEvent] : [newEvent];
|
|
1922
2133
|
const isAggregated = events.length > 1;
|
|
1923
|
-
const aggregatedEvaluatorScores = aggregateEvaluatorScores(
|
|
1924
|
-
events,
|
|
1925
|
-
nameById
|
|
1926
|
-
);
|
|
2134
|
+
const aggregatedEvaluatorScores = aggregateEvaluatorScores(events, nameById);
|
|
1927
2135
|
const merged = {
|
|
1928
|
-
name:
|
|
1929
|
-
testCaseId:
|
|
2136
|
+
name: displayName,
|
|
2137
|
+
testCaseId: compositeId,
|
|
1930
2138
|
completedTestCases: event.completedTestCases,
|
|
1931
2139
|
totalTestCases: event.totalTestCases,
|
|
1932
|
-
|
|
1933
|
-
|
|
2140
|
+
repetitionIndex: event.repetitionIndex,
|
|
2141
|
+
repetitionCount: event.repetitionCount,
|
|
1934
2142
|
durationMs: events.reduce((s, e) => s + e.durationMs, 0),
|
|
1935
2143
|
passed: events.every((e) => e.passed),
|
|
1936
2144
|
errorMessage: event.errorMessage,
|
|
@@ -1938,84 +2146,118 @@ function RunView({
|
|
|
1938
2146
|
aggregatedEvaluatorScores,
|
|
1939
2147
|
isAggregated
|
|
1940
2148
|
};
|
|
1941
|
-
byId.set(
|
|
1942
|
-
setCompletedEvaluations(event.completedTestCases);
|
|
1943
|
-
setRunningEvaluations(
|
|
1944
|
-
(running) => running.filter(
|
|
1945
|
-
(item) => !(item.testCaseId === event.testCaseId && item.rerunIndex === event.rerunIndex)
|
|
1946
|
-
)
|
|
1947
|
-
);
|
|
2149
|
+
byId.set(compositeId, merged);
|
|
1948
2150
|
return Array.from(byId.values());
|
|
1949
2151
|
});
|
|
2152
|
+
setCompletedEvaluations((c) => c + 1);
|
|
2153
|
+
setRunningEvaluations(
|
|
2154
|
+
(running) => running.filter(
|
|
2155
|
+
(item) => !(item.testCaseId === event.testCaseId && item.repetitionIndex === event.repetitionIndex && item.runId === event.runId)
|
|
2156
|
+
)
|
|
2157
|
+
);
|
|
1950
2158
|
}
|
|
1951
|
-
if (event.type === "
|
|
2159
|
+
if (event.type === "RunFailed") {
|
|
2160
|
+
if (batchReady && !batchPendingRunIds.has(event.runId)) {
|
|
2161
|
+
return;
|
|
2162
|
+
}
|
|
1952
2163
|
unsubscribe();
|
|
1953
|
-
|
|
2164
|
+
reject(new Error(`Run failed: ${event.errorMessage}`));
|
|
2165
|
+
return;
|
|
2166
|
+
}
|
|
2167
|
+
if (event.type === "RunCompleted") {
|
|
2168
|
+
if (!batchPendingRunIds.has(event.runId)) {
|
|
2169
|
+
return;
|
|
2170
|
+
}
|
|
2171
|
+
completedRuns.set(event.runId, event);
|
|
2172
|
+
batchPendingRunIds.delete(event.runId);
|
|
2173
|
+
if (batchPendingRunIds.size === 0) {
|
|
2174
|
+
unsubscribe();
|
|
2175
|
+
resolve5();
|
|
2176
|
+
}
|
|
1954
2177
|
}
|
|
1955
2178
|
});
|
|
1956
2179
|
});
|
|
1957
|
-
const
|
|
1958
|
-
|
|
1959
|
-
|
|
1960
|
-
concurrency
|
|
2180
|
+
const snapshots = await runner.runDatasetJobsWithSharedConcurrency({
|
|
2181
|
+
jobs,
|
|
2182
|
+
globalConcurrency: concurrency
|
|
1961
2183
|
});
|
|
2184
|
+
for (let i = 0; i < snapshots.length; i += 1) {
|
|
2185
|
+
const snap = snapshots[i];
|
|
2186
|
+
const job = jobs[i];
|
|
2187
|
+
if (snap && job) {
|
|
2188
|
+
runIdToLabel.set(
|
|
2189
|
+
snap.runId,
|
|
2190
|
+
`${job.runConfigDisplayLabel ?? job.runConfigName} \xB7 ${snap.datasetName}`
|
|
2191
|
+
);
|
|
2192
|
+
batchPendingRunIds.add(snap.runId);
|
|
2193
|
+
}
|
|
2194
|
+
}
|
|
2195
|
+
const totalUnits = snapshots.reduce((sum, s) => sum + s.totalTestCases, 0);
|
|
2196
|
+
batchReady = true;
|
|
2197
|
+
const runConfigLabels = await Promise.all(
|
|
2198
|
+
rcList.map(async (n) => {
|
|
2199
|
+
const collected = await runner.resolveRunConfigByName(n);
|
|
2200
|
+
return collected?.runConfig.getDisplayLabel() ?? n;
|
|
2201
|
+
})
|
|
2202
|
+
);
|
|
1962
2203
|
setRunInfo({
|
|
1963
|
-
|
|
1964
|
-
|
|
1965
|
-
|
|
1966
|
-
totalTestCases: snapshot.totalTestCases
|
|
2204
|
+
names: runConfigLabels,
|
|
2205
|
+
jobs: jobs.length,
|
|
2206
|
+
totalTestCases: totalUnits
|
|
1967
2207
|
});
|
|
1968
2208
|
setPhase("running");
|
|
1969
|
-
|
|
1970
|
-
|
|
1971
|
-
|
|
2209
|
+
try {
|
|
2210
|
+
await done;
|
|
2211
|
+
} catch (err) {
|
|
2212
|
+
onComplete(err instanceof Error ? err : new Error(String(err)));
|
|
1972
2213
|
return;
|
|
1973
2214
|
}
|
|
1974
|
-
|
|
2215
|
+
let passedTestCases = 0;
|
|
2216
|
+
let failedTestCases = 0;
|
|
2217
|
+
let totalTestCases = 0;
|
|
2218
|
+
const artifacts = [];
|
|
2219
|
+
for (const ev of completedRuns.values()) {
|
|
2220
|
+
passedTestCases += ev.passedTestCases;
|
|
2221
|
+
failedTestCases += ev.failedTestCases;
|
|
2222
|
+
totalTestCases += ev.totalTestCases;
|
|
2223
|
+
artifacts.push(ev.artifactPath);
|
|
2224
|
+
}
|
|
1975
2225
|
setSummary({
|
|
1976
|
-
passedTestCases
|
|
1977
|
-
failedTestCases
|
|
1978
|
-
totalTestCases
|
|
2226
|
+
passedTestCases,
|
|
2227
|
+
failedTestCases,
|
|
2228
|
+
totalTestCases,
|
|
1979
2229
|
overallScoreTotal,
|
|
1980
2230
|
overallScoreSumSq,
|
|
1981
2231
|
overallScoreCount,
|
|
1982
2232
|
aggregates: new Map(aggregates),
|
|
1983
2233
|
scoreItemsByEvaluatorScore: new Map(scoreItemsByEvaluatorScore),
|
|
1984
|
-
artifactPath:
|
|
2234
|
+
artifactPath: artifacts.join("\n")
|
|
1985
2235
|
});
|
|
1986
2236
|
setPhase("completed");
|
|
1987
|
-
|
|
1988
|
-
|
|
1989
|
-
|
|
2237
|
+
const exitCode = failedTestCases > 0 ? 1 : 0;
|
|
2238
|
+
setTimeout(() => onComplete(void 0, exitCode), 200);
|
|
2239
|
+
}, [runner, runConfigNames, concurrency, onComplete]);
|
|
2240
|
+
React.useEffect(() => {
|
|
1990
2241
|
void runEval();
|
|
1991
2242
|
}, [runEval]);
|
|
1992
2243
|
return /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", padding: 1, children: [
|
|
1993
2244
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginBottom: 1, children: /* @__PURE__ */ jsxRuntime.jsx(Banner, {}) }),
|
|
1994
2245
|
runInfo && /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginBottom: 1, children: [
|
|
1995
|
-
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
1996
|
-
|
|
1997
|
-
|
|
1998
|
-
" "
|
|
1999
|
-
] }),
|
|
2000
|
-
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: runInfo.runId })
|
|
2001
|
-
] }),
|
|
2002
|
-
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
2003
|
-
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", bold: true, children: [
|
|
2004
|
-
"Dataset",
|
|
2005
|
-
" "
|
|
2006
|
-
] }),
|
|
2007
|
-
runInfo.datasetName
|
|
2246
|
+
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", bold: true, children: [
|
|
2247
|
+
"RunConfigs",
|
|
2248
|
+
" "
|
|
2008
2249
|
] }),
|
|
2250
|
+
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: runInfo.names.join(", ") }),
|
|
2009
2251
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
2010
2252
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", bold: true, children: [
|
|
2011
|
-
"
|
|
2253
|
+
"Jobs",
|
|
2012
2254
|
" "
|
|
2013
2255
|
] }),
|
|
2014
|
-
runInfo.
|
|
2256
|
+
runInfo.jobs
|
|
2015
2257
|
] }),
|
|
2016
2258
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
2017
2259
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", bold: true, children: [
|
|
2018
|
-
"
|
|
2260
|
+
"Evaluation units",
|
|
2019
2261
|
" "
|
|
2020
2262
|
] }),
|
|
2021
2263
|
runInfo.totalTestCases
|
|
@@ -2037,20 +2279,19 @@ function RunView({
|
|
|
2037
2279
|
item.startedTestCases,
|
|
2038
2280
|
"/",
|
|
2039
2281
|
item.totalTestCases,
|
|
2040
|
-
"]",
|
|
2041
|
-
" ",
|
|
2282
|
+
"] ",
|
|
2042
2283
|
item.name,
|
|
2043
2284
|
" ",
|
|
2044
2285
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
2045
2286
|
"(",
|
|
2046
|
-
item.
|
|
2287
|
+
item.repetitionIndex,
|
|
2047
2288
|
"/",
|
|
2048
|
-
item.
|
|
2289
|
+
item.repetitionCount,
|
|
2049
2290
|
")"
|
|
2050
2291
|
] })
|
|
2051
2292
|
]
|
|
2052
2293
|
},
|
|
2053
|
-
`${item.testCaseId}:${item.
|
|
2294
|
+
`${item.runId ?? ""}:${item.testCaseId}:${item.repetitionId}:${item.repetitionIndex}`
|
|
2054
2295
|
)) })
|
|
2055
2296
|
] }),
|
|
2056
2297
|
testCases.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginBottom: 0, children: [
|
|
@@ -2067,9 +2308,9 @@ function RunView({
|
|
|
2067
2308
|
" ",
|
|
2068
2309
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", children: [
|
|
2069
2310
|
"(",
|
|
2070
|
-
tc.
|
|
2311
|
+
tc.repetitionIndex,
|
|
2071
2312
|
"/",
|
|
2072
|
-
tc.
|
|
2313
|
+
tc.repetitionCount,
|
|
2073
2314
|
")"
|
|
2074
2315
|
] }),
|
|
2075
2316
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
@@ -2083,73 +2324,70 @@ function RunView({
|
|
|
2083
2324
|
] }) : null
|
|
2084
2325
|
] }),
|
|
2085
2326
|
tc.errorMessage ? /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "red", children: tc.errorMessage }) : null,
|
|
2086
|
-
tc.aggregatedEvaluatorScores.map((item) => /* @__PURE__ */ jsxRuntime.jsxs(
|
|
2087
|
-
ink.
|
|
2088
|
-
|
|
2089
|
-
|
|
2090
|
-
|
|
2091
|
-
children:
|
|
2092
|
-
|
|
2093
|
-
|
|
2094
|
-
|
|
2095
|
-
|
|
2096
|
-
|
|
2097
|
-
|
|
2327
|
+
tc.aggregatedEvaluatorScores.map((item) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginLeft: 2, children: [
|
|
2328
|
+
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
2329
|
+
item.evaluatorName,
|
|
2330
|
+
":",
|
|
2331
|
+
" ",
|
|
2332
|
+
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
|
|
2333
|
+
item.metrics && item.metrics.length > 0 ? /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
|
|
2334
|
+
" ",
|
|
2335
|
+
item.metrics.map((m) => {
|
|
2336
|
+
const def = getMetricById(m.id);
|
|
2337
|
+
if (!def)
|
|
2338
|
+
return null;
|
|
2339
|
+
const formatted = def.format(m.data, {
|
|
2340
|
+
isAggregated: tc.isAggregated
|
|
2341
|
+
});
|
|
2342
|
+
const label = m.name ?? def.name;
|
|
2343
|
+
return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
2344
|
+
"[",
|
|
2345
|
+
label ? `${label}: ` : "",
|
|
2346
|
+
formatted,
|
|
2347
|
+
"]",
|
|
2348
|
+
" "
|
|
2349
|
+
] }, m.id);
|
|
2350
|
+
})
|
|
2351
|
+
] }) : null
|
|
2352
|
+
] }),
|
|
2353
|
+
item.scores.length > 0 ? item.scores.map((s) => {
|
|
2354
|
+
const def = s.def ?? getScoreById(s.id);
|
|
2355
|
+
const scoreLabel = s.name ?? def?.name ?? def?.id ?? s.id;
|
|
2356
|
+
return /* @__PURE__ */ jsxRuntime.jsxs(
|
|
2357
|
+
ink.Text,
|
|
2358
|
+
{
|
|
2359
|
+
color: scoreColor(toNumericScore(s.data) ?? 0),
|
|
2360
|
+
children: [
|
|
2361
|
+
" ",
|
|
2362
|
+
scoreLabel,
|
|
2363
|
+
":",
|
|
2098
2364
|
" ",
|
|
2099
|
-
|
|
2100
|
-
|
|
2101
|
-
if (!def)
|
|
2102
|
-
return null;
|
|
2103
|
-
const formatted = def.format(m.data, {
|
|
2104
|
-
isAggregated: tc.isAggregated
|
|
2105
|
-
});
|
|
2106
|
-
const label = m.name ?? def.name;
|
|
2107
|
-
return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
2108
|
-
"[",
|
|
2109
|
-
label ? `${label}: ` : "",
|
|
2110
|
-
formatted,
|
|
2111
|
-
"]",
|
|
2112
|
-
" "
|
|
2113
|
-
] }, m.id);
|
|
2365
|
+
formatScorePart(s, scoreColor, {
|
|
2366
|
+
isAggregated: tc.isAggregated
|
|
2114
2367
|
})
|
|
2115
|
-
]
|
|
2116
|
-
|
|
2117
|
-
item.
|
|
2118
|
-
|
|
2119
|
-
|
|
2120
|
-
|
|
2368
|
+
]
|
|
2369
|
+
},
|
|
2370
|
+
`${item.evaluatorId}-${s.id}-${scoreLabel}`
|
|
2371
|
+
);
|
|
2372
|
+
}) : /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: " n/a" }),
|
|
2373
|
+
!item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
|
|
2374
|
+
(log) => log.type === "diff" ? /* @__PURE__ */ jsxRuntime.jsx(
|
|
2375
|
+
ink.Box,
|
|
2376
|
+
{
|
|
2377
|
+
flexDirection: "column",
|
|
2378
|
+
children: getDiffLines(log).map(({ type, line }) => /* @__PURE__ */ jsxRuntime.jsx(
|
|
2121
2379
|
ink.Text,
|
|
2122
2380
|
{
|
|
2123
|
-
color:
|
|
2124
|
-
children:
|
|
2125
|
-
" ",
|
|
2126
|
-
scoreLabel,
|
|
2127
|
-
":",
|
|
2128
|
-
" ",
|
|
2129
|
-
formatScorePart(s, scoreColor, {
|
|
2130
|
-
isAggregated: tc.isAggregated
|
|
2131
|
-
})
|
|
2132
|
-
]
|
|
2381
|
+
color: type === "remove" ? "red" : type === "add" ? "green" : "gray",
|
|
2382
|
+
children: line
|
|
2133
2383
|
},
|
|
2134
|
-
`${
|
|
2135
|
-
)
|
|
2136
|
-
}
|
|
2137
|
-
|
|
2138
|
-
|
|
2139
|
-
|
|
2140
|
-
|
|
2141
|
-
{
|
|
2142
|
-
color: type === "remove" ? "red" : type === "add" ? "green" : "gray",
|
|
2143
|
-
children: line
|
|
2144
|
-
},
|
|
2145
|
-
lineIdx
|
|
2146
|
-
)
|
|
2147
|
-
) }, logIdx) : log.type === "log" ? /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: getLogLines(log).map((line, lineIdx) => /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: line }, lineIdx)) }, logIdx) : null
|
|
2148
|
-
) })
|
|
2149
|
-
]
|
|
2150
|
-
},
|
|
2151
|
-
item.evaluatorId
|
|
2152
|
-
))
|
|
2384
|
+
`${type}:${line}`
|
|
2385
|
+
))
|
|
2386
|
+
},
|
|
2387
|
+
`diff:${getDiffLines(log).map((x) => x.line).join("|")}`
|
|
2388
|
+
) : log.type === "log" ? /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: getLogLines(log).map((line) => /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: line }, line)) }, `log:${getLogLines(log).join("\n")}`) : null
|
|
2389
|
+
) })
|
|
2390
|
+
] }, item.evaluatorId))
|
|
2153
2391
|
] }, tc.testCaseId)) }),
|
|
2154
2392
|
phase === "completed" && summary && /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", children: [
|
|
2155
2393
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children: "Run Summary" }),
|
|
@@ -2191,9 +2429,9 @@ function RunView({
|
|
|
2191
2429
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "magenta", children: "evaluator averages" }),
|
|
2192
2430
|
Array.from(evaluatorNameById.entries()).map(([id, name]) => {
|
|
2193
2431
|
const agg = summary.aggregates.get(id);
|
|
2194
|
-
const scoreKeys = [
|
|
2195
|
-
|
|
2196
|
-
|
|
2432
|
+
const scoreKeys = [...summary.scoreItemsByEvaluatorScore?.keys() ?? []].filter(
|
|
2433
|
+
(k) => k.startsWith(`${id}:`)
|
|
2434
|
+
);
|
|
2197
2435
|
if (scoreKeys.length === 0) {
|
|
2198
2436
|
return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
2199
2437
|
"- ",
|
|
@@ -2223,19 +2461,12 @@ function RunView({
|
|
|
2223
2461
|
const label = aggregated.name ?? def?.name ?? def?.id ?? aggregated.id;
|
|
2224
2462
|
const formatted = def ? def.formatAggregate(aggregated.data) : "n/a";
|
|
2225
2463
|
const numeric = toNumericScore(aggregated.data);
|
|
2226
|
-
return /* @__PURE__ */ jsxRuntime.jsxs(
|
|
2227
|
-
|
|
2228
|
-
|
|
2229
|
-
|
|
2230
|
-
|
|
2231
|
-
|
|
2232
|
-
label,
|
|
2233
|
-
": ",
|
|
2234
|
-
formatted
|
|
2235
|
-
]
|
|
2236
|
-
},
|
|
2237
|
-
key
|
|
2238
|
-
);
|
|
2464
|
+
return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: numeric !== void 0 ? scoreColor(numeric) : "gray", children: [
|
|
2465
|
+
" ",
|
|
2466
|
+
label,
|
|
2467
|
+
": ",
|
|
2468
|
+
formatted
|
|
2469
|
+
] }, key);
|
|
2239
2470
|
})
|
|
2240
2471
|
] }, id);
|
|
2241
2472
|
})
|
|
@@ -2278,10 +2509,10 @@ function RunView({
|
|
|
2278
2509
|
] }, tc.testCaseId);
|
|
2279
2510
|
})
|
|
2280
2511
|
] }),
|
|
2281
|
-
/* @__PURE__ */ jsxRuntime.
|
|
2282
|
-
"artifact:
|
|
2283
|
-
summary.artifactPath
|
|
2284
|
-
] })
|
|
2512
|
+
/* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { marginTop: 1, flexDirection: "column", children: [
|
|
2513
|
+
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "artifact(s):" }),
|
|
2514
|
+
summary.artifactPath.split("\n").map((line) => /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: line }, line))
|
|
2515
|
+
] })
|
|
2285
2516
|
] })
|
|
2286
2517
|
] });
|
|
2287
2518
|
}
|
|
@@ -2311,9 +2542,7 @@ function buildTestCaseSummaries(byId) {
|
|
|
2311
2542
|
for (const evaluatorScores of events[0]?.evaluatorScores ?? []) {
|
|
2312
2543
|
const scoreIdToItems = /* @__PURE__ */ new Map();
|
|
2313
2544
|
for (const ev of events) {
|
|
2314
|
-
const es = ev.evaluatorScores.find(
|
|
2315
|
-
(x) => x.evaluatorId === evaluatorScores.evaluatorId
|
|
2316
|
-
);
|
|
2545
|
+
const es = ev.evaluatorScores.find((x) => x.evaluatorId === evaluatorScores.evaluatorId);
|
|
2317
2546
|
for (const s of es?.scores ?? []) {
|
|
2318
2547
|
const list = scoreIdToItems.get(s.id) ?? [];
|
|
2319
2548
|
list.push(s);
|
|
@@ -2366,9 +2595,7 @@ function scoreToColor(score) {
|
|
|
2366
2595
|
}
|
|
2367
2596
|
function getEvaluatorSummaryLines(evaluatorId, evaluatorName, aggregate, scoreItemsByKey) {
|
|
2368
2597
|
const lines = [];
|
|
2369
|
-
const scoreKeys = [...scoreItemsByKey.keys()].filter(
|
|
2370
|
-
(k) => k.startsWith(`${evaluatorId}:`)
|
|
2371
|
-
);
|
|
2598
|
+
const scoreKeys = [...scoreItemsByKey.keys()].filter((k) => k.startsWith(`${evaluatorId}:`));
|
|
2372
2599
|
if (scoreKeys.length === 0) {
|
|
2373
2600
|
lines.push(`- ${evaluatorName.padEnd(28)} no scores`);
|
|
2374
2601
|
return lines;
|
|
@@ -2403,9 +2630,7 @@ function createBar2(value, max = 100, width = 20) {
|
|
|
2403
2630
|
function aggregateEvaluatorScoresFromEvents(events, _evaluatorNameById) {
|
|
2404
2631
|
if (events.length === 0)
|
|
2405
2632
|
return [];
|
|
2406
|
-
const evaluatorIds = new Set(
|
|
2407
|
-
events.flatMap((e) => e.evaluatorScores.map((x) => x.evaluatorId))
|
|
2408
|
-
);
|
|
2633
|
+
const evaluatorIds = new Set(events.flatMap((e) => e.evaluatorScores.map((x) => x.evaluatorId)));
|
|
2409
2634
|
const result = [];
|
|
2410
2635
|
for (const evaluatorId of evaluatorIds) {
|
|
2411
2636
|
const scoreIdToItems = /* @__PURE__ */ new Map();
|
|
@@ -2452,9 +2677,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
|
|
|
2452
2677
|
if (def) {
|
|
2453
2678
|
const formatted = def.format(m.data, options);
|
|
2454
2679
|
const label = m.name ?? def.name;
|
|
2455
|
-
metricParts.push(
|
|
2456
|
-
label ? `[${label}: ${formatted}]` : `[${formatted}]`
|
|
2457
|
-
);
|
|
2680
|
+
metricParts.push(label ? `[${label}: ${formatted}]` : `[${formatted}]`);
|
|
2458
2681
|
}
|
|
2459
2682
|
}
|
|
2460
2683
|
}
|
|
@@ -2501,25 +2724,14 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
|
|
|
2501
2724
|
}
|
|
2502
2725
|
return lines;
|
|
2503
2726
|
}
|
|
2504
|
-
async function
|
|
2505
|
-
const
|
|
2506
|
-
if (
|
|
2507
|
-
|
|
2508
|
-
const available = known.map((item) => item.dataset.getName()).sort();
|
|
2509
|
-
throw new Error(
|
|
2510
|
-
available.length > 0 ? `Dataset "${datasetName}" not found. Available datasets: ${available.join(", ")}` : `Dataset "${datasetName}" not found and no datasets were discovered.`
|
|
2511
|
-
);
|
|
2512
|
-
}
|
|
2513
|
-
const evaluators = await runner.resolveEvaluatorsByNamePattern(evaluatorPattern);
|
|
2514
|
-
if (evaluators.length === 0) {
|
|
2515
|
-
const known = await runner.collectEvaluators();
|
|
2516
|
-
const available = known.map((item) => item.evaluator.getName()).filter((name) => typeof name === "string").sort();
|
|
2517
|
-
throw new Error(
|
|
2518
|
-
available.length > 0 ? `No evaluator matched "${evaluatorPattern}". Available evaluators: ${available.join(", ")}` : `No evaluator matched "${evaluatorPattern}" and no evaluators were discovered.`
|
|
2519
|
-
);
|
|
2727
|
+
async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency) {
|
|
2728
|
+
const jobs = await runner.expandRunConfigNamesToJobs(runConfigNames);
|
|
2729
|
+
if (jobs.length === 0) {
|
|
2730
|
+
throw new Error("No jobs expanded from RunConfigs.");
|
|
2520
2731
|
}
|
|
2732
|
+
const evaluators = await runner.collectEvaluators();
|
|
2521
2733
|
const evaluatorNameById = new Map(
|
|
2522
|
-
evaluators.map((item) => [item.id, item.evaluator
|
|
2734
|
+
evaluators.map((item) => [item.id, getEvaluatorDisplayLabel(item.evaluator) ?? item.id])
|
|
2523
2735
|
);
|
|
2524
2736
|
const aggregates = /* @__PURE__ */ new Map();
|
|
2525
2737
|
const scoreItemsByEvaluatorScore = /* @__PURE__ */ new Map();
|
|
@@ -2527,11 +2739,11 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
|
|
|
2527
2739
|
let overallScoreTotal = 0;
|
|
2528
2740
|
let overallScoreSumSq = 0;
|
|
2529
2741
|
let overallScoreCount = 0;
|
|
2530
|
-
let
|
|
2531
|
-
let
|
|
2742
|
+
let globalStartedUnits = 0;
|
|
2743
|
+
let globalCompletedUnits = 0;
|
|
2532
2744
|
let totalCount = 0;
|
|
2533
2745
|
let runFinished = false;
|
|
2534
|
-
const
|
|
2746
|
+
const inFlightRepetitions = /* @__PURE__ */ new Set();
|
|
2535
2747
|
const spinnerFrames = ["\u280B", "\u2819", "\u2838", "\u2834", "\u2826", "\u2807"];
|
|
2536
2748
|
let spinnerIndex = 0;
|
|
2537
2749
|
function clearLine() {
|
|
@@ -2553,33 +2765,46 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
|
|
|
2553
2765
|
spinnerIndex += 1;
|
|
2554
2766
|
process.stdout.write(
|
|
2555
2767
|
`\r${colorize(frame, ansi2.cyan)} Running evaluations ${colorize(
|
|
2556
|
-
`${
|
|
2768
|
+
`${globalCompletedUnits}/${totalCount}`,
|
|
2557
2769
|
ansi2.bold
|
|
2558
|
-
)} completed ${colorize(`${
|
|
2770
|
+
)} completed ${colorize(`${globalStartedUnits}/${totalCount}`, ansi2.bold)} started ${colorize(`(${inFlightRepetitions.size} running)`, ansi2.dim)}`
|
|
2559
2771
|
);
|
|
2560
2772
|
}
|
|
2561
2773
|
let lastPrintedTestCaseId = null;
|
|
2562
2774
|
let lastPrintedLineCount = 0;
|
|
2563
2775
|
let spinnerTimer;
|
|
2564
|
-
const
|
|
2776
|
+
const batchPendingRunIds = /* @__PURE__ */ new Set();
|
|
2777
|
+
const runIdToLabel = /* @__PURE__ */ new Map();
|
|
2778
|
+
let batchReady = false;
|
|
2779
|
+
const completedRuns = /* @__PURE__ */ new Map();
|
|
2780
|
+
const done = new Promise((resolve5, reject) => {
|
|
2565
2781
|
const unsubscribe = runner.subscribeRunEvents((event) => {
|
|
2782
|
+
if (batchReady && "runId" in event && typeof event.runId === "string" && !batchPendingRunIds.has(event.runId)) {
|
|
2783
|
+
return;
|
|
2784
|
+
}
|
|
2785
|
+
const rowPrefix = typeof event.runId === "string" ? runIdToLabel.get(event.runId) : void 0;
|
|
2786
|
+
const pfx = rowPrefix !== void 0 ? `${colorize(`[${rowPrefix}]`, ansi2.dim)} ` : "";
|
|
2566
2787
|
if (event.type === "TestCaseStarted") {
|
|
2567
|
-
|
|
2568
|
-
|
|
2788
|
+
globalStartedUnits += 1;
|
|
2789
|
+
inFlightRepetitions.add(
|
|
2790
|
+
`${event.runId}:${event.testCaseId}:${event.repetitionId}:${event.repetitionIndex}`
|
|
2791
|
+
);
|
|
2569
2792
|
clearLine();
|
|
2570
2793
|
process.stdout.write(
|
|
2571
|
-
`${colorize(`[started ${event.startedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.
|
|
2794
|
+
`${pfx}${colorize(`[started ${event.startedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.repetitionIndex}/${event.repetitionCount})`, ansi2.cyan)} ${colorize("(running)", ansi2.dim)}
|
|
2572
2795
|
`
|
|
2573
2796
|
);
|
|
2574
2797
|
drawSpinner();
|
|
2575
2798
|
}
|
|
2576
2799
|
if (event.type === "TestCaseProgress") {
|
|
2577
|
-
|
|
2578
|
-
|
|
2800
|
+
globalCompletedUnits += 1;
|
|
2801
|
+
inFlightRepetitions.delete(
|
|
2802
|
+
`${event.runId}:${event.testCaseId}:${event.repetitionId}:${event.repetitionIndex}`
|
|
2803
|
+
);
|
|
2579
2804
|
const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
|
|
2580
2805
|
const averageScore = numericScores.length > 0 ? numericScores.reduce((sum, value) => sum + value, 0) / numericScores.length : void 0;
|
|
2581
|
-
const
|
|
2582
|
-
const existing = testCaseByTestId.get(
|
|
2806
|
+
const compositeId = `${event.runId}:${event.testCaseId}`;
|
|
2807
|
+
const existing = testCaseByTestId.get(compositeId) ?? {
|
|
2583
2808
|
name: event.testCaseName,
|
|
2584
2809
|
events: []
|
|
2585
2810
|
};
|
|
@@ -2589,7 +2814,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
|
|
|
2589
2814
|
durationMs: event.durationMs,
|
|
2590
2815
|
evaluatorScores: event.evaluatorScores
|
|
2591
2816
|
});
|
|
2592
|
-
testCaseByTestId.set(
|
|
2817
|
+
testCaseByTestId.set(compositeId, existing);
|
|
2593
2818
|
for (const item of event.evaluatorScores) {
|
|
2594
2819
|
const numeric = toNumericScoreFromScores(item.scores);
|
|
2595
2820
|
if (numeric !== void 0) {
|
|
@@ -2618,24 +2843,21 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
|
|
|
2618
2843
|
scoreItemsByEvaluatorScore.set(key, list);
|
|
2619
2844
|
}
|
|
2620
2845
|
}
|
|
2621
|
-
const isSameTestCase = lastPrintedTestCaseId ===
|
|
2622
|
-
const
|
|
2846
|
+
const isSameTestCase = lastPrintedTestCaseId === compositeId;
|
|
2847
|
+
const isLastRepetition = event.repetitionIndex >= event.repetitionCount;
|
|
2623
2848
|
const isNonTty = !process.stdout.isTTY;
|
|
2624
|
-
const skipPrintNonTty = isNonTty && event.
|
|
2849
|
+
const skipPrintNonTty = isNonTty && event.repetitionCount > 1 && !isLastRepetition;
|
|
2625
2850
|
if (isSameTestCase && lastPrintedLineCount > 0 && !skipPrintNonTty) {
|
|
2626
2851
|
cursorUp(lastPrintedLineCount);
|
|
2627
2852
|
}
|
|
2628
2853
|
const aggregatedScores = aggregateEvaluatorScoresFromEvents(
|
|
2629
2854
|
existing.events);
|
|
2630
2855
|
const isAggregated = existing.events.length > 1;
|
|
2631
|
-
const durationMs = existing.events.reduce(
|
|
2632
|
-
(s, e) => s + e.durationMs,
|
|
2633
|
-
0
|
|
2634
|
-
);
|
|
2856
|
+
const durationMs = existing.events.reduce((s, e) => s + e.durationMs, 0);
|
|
2635
2857
|
const lines = [];
|
|
2636
2858
|
const statusSuffix = event.errorMessage ? ` ${colorize("ERROR", `${ansi2.bold}${ansi2.red}`)}` : "";
|
|
2637
2859
|
lines.push(
|
|
2638
|
-
`${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.
|
|
2860
|
+
`${pfx}${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.repetitionIndex}/${event.repetitionCount})`, ansi2.cyan)} ${colorize(`(${durationMs}ms)`, ansi2.dim)}${statusSuffix}`
|
|
2639
2861
|
);
|
|
2640
2862
|
if (event.errorMessage) {
|
|
2641
2863
|
lines.push(colorize(event.errorMessage, ansi2.red));
|
|
@@ -2643,18 +2865,12 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
|
|
|
2643
2865
|
for (const item of aggregatedScores) {
|
|
2644
2866
|
const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
|
|
2645
2867
|
lines.push(
|
|
2646
|
-
...formatEvaluatorScoreLine(
|
|
2647
|
-
|
|
2648
|
-
|
|
2649
|
-
item.passed,
|
|
2650
|
-
item.metrics,
|
|
2651
|
-
{ isAggregated }
|
|
2652
|
-
)
|
|
2868
|
+
...formatEvaluatorScoreLine(name, item.scores, item.passed, item.metrics, {
|
|
2869
|
+
isAggregated
|
|
2870
|
+
})
|
|
2653
2871
|
);
|
|
2654
2872
|
const lastEvent = existing.events[existing.events.length - 1];
|
|
2655
|
-
const lastEs = lastEvent?.evaluatorScores.find(
|
|
2656
|
-
(x) => x.evaluatorId === item.evaluatorId
|
|
2657
|
-
);
|
|
2873
|
+
const lastEs = lastEvent?.evaluatorScores.find((x) => x.evaluatorId === item.evaluatorId);
|
|
2658
2874
|
if (!item.passed && lastEs?.logs && lastEs.logs.length > 0) {
|
|
2659
2875
|
for (const log of lastEs.logs) {
|
|
2660
2876
|
if (log.type === "diff") {
|
|
@@ -2672,73 +2888,102 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
|
|
|
2672
2888
|
}
|
|
2673
2889
|
}
|
|
2674
2890
|
if (!skipPrintNonTty) {
|
|
2675
|
-
for (let i = 0; i < lines.length; i
|
|
2891
|
+
for (let i = 0; i < lines.length; i += 1) {
|
|
2676
2892
|
process.stdout.write(`\r\x1B[2K${lines[i]}
|
|
2677
2893
|
`);
|
|
2678
2894
|
}
|
|
2679
|
-
lastPrintedTestCaseId =
|
|
2895
|
+
lastPrintedTestCaseId = compositeId;
|
|
2680
2896
|
lastPrintedLineCount = lines.length;
|
|
2681
2897
|
}
|
|
2682
2898
|
drawSpinner();
|
|
2683
2899
|
}
|
|
2684
|
-
if (event.type === "
|
|
2900
|
+
if (event.type === "RunFailed") {
|
|
2901
|
+
if (batchReady && !batchPendingRunIds.has(event.runId)) {
|
|
2902
|
+
return;
|
|
2903
|
+
}
|
|
2685
2904
|
runFinished = true;
|
|
2686
2905
|
clearLine();
|
|
2687
2906
|
unsubscribe();
|
|
2688
|
-
|
|
2907
|
+
reject(new Error(`Run failed: ${event.errorMessage}`));
|
|
2908
|
+
return;
|
|
2909
|
+
}
|
|
2910
|
+
if (event.type === "RunCompleted") {
|
|
2911
|
+
if (!batchPendingRunIds.has(event.runId)) {
|
|
2912
|
+
return;
|
|
2913
|
+
}
|
|
2914
|
+
completedRuns.set(event.runId, event);
|
|
2915
|
+
batchPendingRunIds.delete(event.runId);
|
|
2916
|
+
if (batchPendingRunIds.size === 0) {
|
|
2917
|
+
runFinished = true;
|
|
2918
|
+
clearLine();
|
|
2919
|
+
unsubscribe();
|
|
2920
|
+
resolve5();
|
|
2921
|
+
}
|
|
2689
2922
|
}
|
|
2690
2923
|
});
|
|
2691
2924
|
});
|
|
2692
|
-
|
|
2693
|
-
|
|
2694
|
-
|
|
2695
|
-
|
|
2925
|
+
console.log(colorize("=== Eval Run Started (RunConfigs) ===", `${ansi2.bold}${ansi2.cyan}`));
|
|
2926
|
+
for (const name of runConfigNames) {
|
|
2927
|
+
const collected = await runner.resolveRunConfigByName(name);
|
|
2928
|
+
const label = collected?.runConfig.getDisplayLabel() ?? name;
|
|
2929
|
+
console.log(`RunConfig: ${colorize(label, ansi2.bold)}`);
|
|
2930
|
+
}
|
|
2931
|
+
console.log(`Jobs: ${colorize(String(jobs.length), ansi2.bold)}`);
|
|
2932
|
+
console.log(`Shared concurrency: ${colorize(String(concurrency), ansi2.bold)}`);
|
|
2933
|
+
console.log("");
|
|
2934
|
+
const snapshots = await runner.runDatasetJobsWithSharedConcurrency({
|
|
2935
|
+
jobs,
|
|
2936
|
+
globalConcurrency: concurrency
|
|
2696
2937
|
});
|
|
2697
|
-
|
|
2698
|
-
|
|
2699
|
-
|
|
2700
|
-
|
|
2701
|
-
|
|
2702
|
-
|
|
2703
|
-
|
|
2704
|
-
|
|
2705
|
-
|
|
2706
|
-
|
|
2938
|
+
for (let i = 0; i < snapshots.length; i += 1) {
|
|
2939
|
+
const snap = snapshots[i];
|
|
2940
|
+
const job = jobs[i];
|
|
2941
|
+
if (snap && job) {
|
|
2942
|
+
runIdToLabel.set(
|
|
2943
|
+
snap.runId,
|
|
2944
|
+
`${job.runConfigDisplayLabel ?? job.runConfigName} \xB7 ${snap.datasetName}`
|
|
2945
|
+
);
|
|
2946
|
+
batchPendingRunIds.add(snap.runId);
|
|
2947
|
+
}
|
|
2948
|
+
}
|
|
2949
|
+
totalCount = snapshots.reduce((sum, s) => sum + s.totalTestCases, 0);
|
|
2950
|
+
console.log(`Total evaluation units: ${colorize(String(totalCount), ansi2.bold)}`);
|
|
2707
2951
|
console.log("");
|
|
2952
|
+
batchReady = true;
|
|
2708
2953
|
drawSpinner();
|
|
2709
2954
|
spinnerTimer = setInterval(drawSpinner, 100);
|
|
2710
|
-
|
|
2955
|
+
await done;
|
|
2711
2956
|
if (spinnerTimer) {
|
|
2712
2957
|
clearInterval(spinnerTimer);
|
|
2713
2958
|
}
|
|
2714
|
-
if (finalEvent.type === "RunFailed") {
|
|
2715
|
-
throw new Error(`Run failed: ${finalEvent.errorMessage}`);
|
|
2716
|
-
}
|
|
2717
|
-
const completed = finalEvent;
|
|
2718
2959
|
console.log("");
|
|
2719
|
-
console.log(colorize("=== Run Summary ===", `${ansi2.bold}${ansi2.cyan}`));
|
|
2720
|
-
|
|
2721
|
-
|
|
2722
|
-
|
|
2723
|
-
|
|
2724
|
-
|
|
2725
|
-
|
|
2726
|
-
|
|
2727
|
-
|
|
2728
|
-
|
|
2729
|
-
|
|
2730
|
-
)
|
|
2731
|
-
|
|
2960
|
+
console.log(colorize("=== Run Summary (all jobs) ===", `${ansi2.bold}${ansi2.cyan}`));
|
|
2961
|
+
for (const snap of snapshots) {
|
|
2962
|
+
const completed = completedRuns.get(snap.runId);
|
|
2963
|
+
if (!completed) {
|
|
2964
|
+
continue;
|
|
2965
|
+
}
|
|
2966
|
+
const label = runIdToLabel.get(snap.runId) ?? snap.runId;
|
|
2967
|
+
console.log("");
|
|
2968
|
+
console.log(colorize(`\u2014 ${label}`, ansi2.magenta));
|
|
2969
|
+
console.log(
|
|
2970
|
+
`- passed: ${colorize(`${completed.passedTestCases}/${completed.totalTestCases}`, ansi2.green)}`
|
|
2971
|
+
);
|
|
2972
|
+
console.log(
|
|
2973
|
+
`- failed: ${colorize(
|
|
2974
|
+
`${completed.failedTestCases}/${completed.totalTestCases}`,
|
|
2975
|
+
completed.failedTestCases > 0 ? ansi2.red : ansi2.dim
|
|
2976
|
+
)}`
|
|
2977
|
+
);
|
|
2978
|
+
console.log(`- artifact: ${colorize(completed.artifactPath, ansi2.dim)}`);
|
|
2979
|
+
}
|
|
2732
2980
|
if (overallScoreCount > 0) {
|
|
2733
2981
|
const overallAverage = overallScoreTotal / overallScoreCount;
|
|
2734
|
-
const overallSd = sampleStdDev2(
|
|
2735
|
-
overallScoreTotal,
|
|
2736
|
-
overallScoreSumSq,
|
|
2737
|
-
overallScoreCount
|
|
2738
|
-
);
|
|
2982
|
+
const overallSd = sampleStdDev2(overallScoreTotal, overallScoreSumSq, overallScoreCount);
|
|
2739
2983
|
const avgStr = overallSd !== void 0 ? `${overallAverage.toFixed(2)} \xB1 ${overallSd.toFixed(2)}` : overallAverage.toFixed(2);
|
|
2984
|
+
console.log("");
|
|
2740
2985
|
console.log(
|
|
2741
|
-
`- overall avg score: ${colorize(
|
|
2986
|
+
`- overall avg score (all jobs): ${colorize(
|
|
2742
2987
|
avgStr,
|
|
2743
2988
|
scoreToColor(overallAverage)
|
|
2744
2989
|
)} ${colorize(createBar2(overallAverage), ansi2.dim)}`
|
|
@@ -2779,22 +3024,28 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
|
|
|
2779
3024
|
);
|
|
2780
3025
|
}
|
|
2781
3026
|
}
|
|
2782
|
-
|
|
3027
|
+
let failedTestCasesTotal = 0;
|
|
3028
|
+
for (const snap of snapshots) {
|
|
3029
|
+
const completed = completedRuns.get(snap.runId);
|
|
3030
|
+
if (completed) {
|
|
3031
|
+
failedTestCasesTotal += completed.failedTestCases;
|
|
3032
|
+
}
|
|
3033
|
+
}
|
|
3034
|
+
return failedTestCasesTotal > 0 ? 1 : 0;
|
|
2783
3035
|
}
|
|
2784
|
-
async function
|
|
3036
|
+
async function runSimpleEvalRunConfigsInk(runner, runConfigNames, concurrency) {
|
|
2785
3037
|
return new Promise((resolve5, reject) => {
|
|
2786
3038
|
const app = ink.render(
|
|
2787
|
-
|
|
3039
|
+
React__namespace.createElement(RunView, {
|
|
2788
3040
|
runner,
|
|
2789
|
-
|
|
2790
|
-
evaluatorPattern,
|
|
3041
|
+
runConfigNames,
|
|
2791
3042
|
concurrency,
|
|
2792
|
-
onComplete: (err) => {
|
|
3043
|
+
onComplete: (err, exitCode) => {
|
|
2793
3044
|
app.unmount();
|
|
2794
3045
|
if (err) {
|
|
2795
3046
|
reject(err);
|
|
2796
3047
|
} else {
|
|
2797
|
-
resolve5();
|
|
3048
|
+
resolve5(exitCode ?? 0);
|
|
2798
3049
|
}
|
|
2799
3050
|
}
|
|
2800
3051
|
})
|
|
@@ -2820,12 +3071,22 @@ async function main() {
|
|
|
2820
3071
|
if (!args.command) {
|
|
2821
3072
|
printUsageAndExit(1);
|
|
2822
3073
|
}
|
|
2823
|
-
if (
|
|
2824
|
-
|
|
2825
|
-
|
|
3074
|
+
if (args.command === "run") {
|
|
3075
|
+
if (args.runConfigNames.length === 0) {
|
|
3076
|
+
console.error(
|
|
3077
|
+
"Missing required --run-config <name> (repeat the flag to queue multiple RunConfigs)."
|
|
3078
|
+
);
|
|
3079
|
+
printUsageAndExit(1);
|
|
3080
|
+
}
|
|
3081
|
+
if (args.datasetName !== void 0) {
|
|
3082
|
+
console.error(
|
|
3083
|
+
"The run command no longer accepts --dataset; use --run-config <RunConfig name>."
|
|
3084
|
+
);
|
|
3085
|
+
printUsageAndExit(1);
|
|
3086
|
+
}
|
|
2826
3087
|
}
|
|
2827
|
-
if (args.command === "
|
|
2828
|
-
console.error("
|
|
3088
|
+
if (args.command === "generate" && args.runConfigNames.length > 0) {
|
|
3089
|
+
console.error("generate does not accept --run-config.");
|
|
2829
3090
|
printUsageAndExit(1);
|
|
2830
3091
|
}
|
|
2831
3092
|
const useInk = process.stdout.isTTY === true;
|
|
@@ -2836,17 +3097,24 @@ async function main() {
|
|
|
2836
3097
|
try {
|
|
2837
3098
|
if (args.command === "run") {
|
|
2838
3099
|
const concurrency = args.concurrency ?? getDefaultConcurrency();
|
|
2839
|
-
await (useInk ?
|
|
3100
|
+
const exitCode = await (useInk ? runSimpleEvalRunConfigsInk : runSimpleEvalRunConfigsPlain)(
|
|
2840
3101
|
runner,
|
|
2841
|
-
args.
|
|
2842
|
-
args.evaluatorPattern,
|
|
3102
|
+
args.runConfigNames,
|
|
2843
3103
|
concurrency
|
|
2844
3104
|
);
|
|
3105
|
+
if (args.ci && exitCode !== 0) {
|
|
3106
|
+
process.exit(1);
|
|
3107
|
+
}
|
|
2845
3108
|
return;
|
|
2846
3109
|
}
|
|
3110
|
+
const genDataset = args.datasetName;
|
|
3111
|
+
if (!genDataset) {
|
|
3112
|
+
console.error("Missing required --dataset <datasetName> argument.");
|
|
3113
|
+
printUsageAndExit(1);
|
|
3114
|
+
}
|
|
2847
3115
|
await (useInk ? generateDatasetJsonCommandInk : generateDatasetJsonCommandPlain)(
|
|
2848
3116
|
runner,
|
|
2849
|
-
|
|
3117
|
+
genDataset
|
|
2850
3118
|
);
|
|
2851
3119
|
} finally {
|
|
2852
3120
|
await runner.shutdown();
|