@m4trix/evals 0.25.0 → 0.26.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +29 -7
- package/dist/cli-simple.cjs +911 -643
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +898 -630
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +688 -575
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +679 -566
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +959 -623
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +201 -7
- package/dist/index.js +947 -625
- package/dist/index.js.map +1 -1
- package/package.json +3 -5
package/dist/cli-simple.js
CHANGED
|
@@ -1,35 +1,191 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
import { randomUUID } from 'crypto';
|
|
3
|
-
import { Effect, PubSub, Queue, Ref, Fiber } from 'effect';
|
|
3
|
+
import { Schema, Effect, PubSub, Queue, Ref, Fiber, Either, ParseResult } from 'effect';
|
|
4
|
+
import { writeFile, readdir, readFile, mkdir, appendFile } from 'fs/promises';
|
|
5
|
+
import { resolve, join, relative, parse, dirname } from 'path';
|
|
4
6
|
import { existsSync } from 'fs';
|
|
5
|
-
import { resolve, relative, join, parse, dirname } from 'path';
|
|
6
7
|
import * as jitiModule from 'jiti';
|
|
7
|
-
import { writeFile, readdir, readFile, mkdir, appendFile } from 'fs/promises';
|
|
8
8
|
import { pathToFileURL } from 'url';
|
|
9
9
|
import { diffLines } from 'diff';
|
|
10
10
|
import stringify from 'fast-json-stable-stringify';
|
|
11
|
-
import * as
|
|
12
|
-
import
|
|
11
|
+
import * as React from 'react';
|
|
12
|
+
import React__default, { useState, useEffect, useCallback } from 'react';
|
|
13
13
|
import { render, Box, Text } from 'ink';
|
|
14
14
|
import { jsxs, jsx, Fragment } from 'react/jsx-runtime';
|
|
15
15
|
|
|
16
|
+
var ENTITY_ID_PATTERN = /^[a-zA-Z0-9_-]+$/;
|
|
17
|
+
/**
 * Builds a branded string schema for entity identifiers.
 *
 * The schema requires a trimmed, non-empty string that matches
 * ENTITY_ID_PATTERN, and brands the result with `brand`.
 *
 * @param brand - Brand name attached to the decoded string.
 * @param label - Human-readable entity label used in error messages.
 * @returns The composed schema.
 */
function makeEntityIdSchema(brand, label) {
  const nonEmpty = Schema.minLength(1, {
    message: () => `${label} must be non-empty.`
  });
  const allowedCharacters = Schema.pattern(ENTITY_ID_PATTERN, {
    message: () => `${label} may only contain letters, digits, underscores, and hyphens (no spaces). Examples: "my-nightly", "my_nightly", "myNightly".`
  });
  return Schema.String.pipe(Schema.trimmed(), nonEmpty, allowedCharacters, Schema.brand(brand));
}
|
|
29
|
+
var RunConfigNameSchema = makeEntityIdSchema("RunConfigName", "RunConfig name");
|
|
30
|
+
makeEntityIdSchema("EvaluatorName", "Evaluator name");
|
|
31
|
+
makeEntityIdSchema("TestCaseName", "Test case name");
|
|
32
|
+
/**
 * Decodes `raw` with `schema` and returns the decoded value.
 *
 * String input is trimmed before decoding. Non-string input is handed to the
 * schema untouched so that the failure is reported through the same
 * `${context}: ...` error as any other invalid value, instead of a bare
 * TypeError from calling `.trim()` on a non-string.
 *
 * @param schema - Schema used to decode the value.
 * @param raw - Candidate value (normally a string).
 * @param context - Prefix for the error message on failure.
 * @returns The decoded value.
 * @throws {Error} When decoding fails; the message is the context followed by
 *   the tree-formatted parse error.
 */
function validateWithSchema(schema, raw, context) {
  const candidate = typeof raw === "string" ? raw.trim() : raw;
  const decoded = Schema.decodeUnknownEither(schema)(candidate);
  if (Either.isLeft(decoded)) {
    throw new Error(`${context}: ${ParseResult.TreeFormatter.formatErrorSync(decoded.left)}`);
  }
  return decoded.right;
}
|
|
43
|
+
/**
 * Validates a run-config name against RunConfigNameSchema.
 *
 * @param raw - Candidate name.
 * @param context - Prefix for the error message on failure.
 * @returns The validated name as returned by validateWithSchema.
 */
function validateRunConfigName(raw, context) {
  const validatedName = validateWithSchema(RunConfigNameSchema, raw, context);
  return validatedName;
}
|
|
46
|
+
|
|
47
|
+
// src/evals/evaluator.ts
|
|
48
|
+
/**
 * Resolves the label to show for an evaluator.
 *
 * Uses `getDisplayLabel()` when that method exists and returns something
 * other than `undefined`; otherwise falls back to `getName()` when present.
 *
 * @param evaluator - Object that may expose `getDisplayLabel` and/or `getName`.
 * @returns The label, or `undefined` when neither method yields one.
 */
function getEvaluatorDisplayLabel(evaluator) {
  const hasLabelGetter = typeof evaluator.getDisplayLabel === "function";
  const explicitLabel = hasLabelGetter ? evaluator.getDisplayLabel() : undefined;
  if (explicitLabel !== undefined) {
    return explicitLabel;
  }
  if (typeof evaluator.getName !== "function") {
    return undefined;
  }
  return evaluator.getName();
}
|
|
57
|
+
/**
 * Returns a fresh array copy of an evaluator's tags.
 *
 * @param evaluator - Object that may expose `getTags()`.
 * @returns A new array with the tags, or an empty array when `getTags` is
 *   not a function.
 */
function getEvaluatorTagList(evaluator) {
  if (typeof evaluator.getTags !== "function") {
    return [];
  }
  const tags = evaluator.getTags();
  return [...tags];
}
|
|
60
|
+
/**
 * Rebuilds run snapshots from the `.jsonl` files in the artifact directory.
 *
 * Best-effort: if the directory cannot be listed the result is an empty
 * array, and any artifact whose parsing throws is skipped.
 *
 * @param config - Runner config; `artifactDirectory` is resolved to find the files.
 * @returns Snapshots sorted by `queuedAt`, newest first.
 */
async function loadRunSnapshotsFromArtifacts(config) {
  const artifactRoot = resolve(config.artifactDirectory);
  let fileNames;
  try {
    fileNames = await readdir(artifactRoot);
  } catch {
    // Directory missing or unreadable: nothing to load.
    return [];
  }
  const collected = [];
  for (const name of fileNames) {
    if (!name.endsWith(".jsonl")) {
      continue;
    }
    const artifactFile = join(artifactRoot, name);
    try {
      const parsed = await parseArtifactToSnapshot(artifactFile, config);
      if (parsed) {
        collected.push(parsed);
      }
    } catch {
      // Skip artifacts that cannot be read or parsed.
    }
  }
  collected.sort((left, right) => right.queuedAt - left.queuedAt);
  return collected;
}
|
|
82
|
+
/**
 * Reads one JSONL artifact and condenses its run events into a snapshot.
 *
 * Each non-blank line is parsed as JSON; lines that fail to parse are ignored.
 * For each of the RunQueued / RunStarted / RunCompleted / RunFailed event
 * types, the last occurrence in the file wins.
 *
 * @param filePath - Path of the artifact file; also reported as `artifactPath`.
 * @param _config - Unused.
 * @returns The snapshot, or `null` when the file has no non-blank lines or no
 *   RunQueued event.
 */
async function parseArtifactToSnapshot(filePath, _config) {
  const raw = await readFile(filePath, "utf8");
  const lines = raw.split("\n").filter((text) => text.trim().length > 0);
  if (lines.length === 0) {
    return null;
  }
  const trackedTypes = new Set(["RunQueued", "RunStarted", "RunCompleted", "RunFailed"]);
  const latestByType = new Map();
  for (const text of lines) {
    try {
      const event = JSON.parse(text);
      if (trackedTypes.has(event.type)) {
        latestByType.set(event.type, event);
      }
    } catch {
      // Ignore lines that are not valid JSON events.
    }
  }
  const queued = latestByType.get("RunQueued");
  if (!queued) {
    return null;
  }
  const started = latestByType.get("RunStarted");
  const completed = latestByType.get("RunCompleted");
  const failed = latestByType.get("RunFailed");

  // A failure takes precedence over completion, which takes precedence over start.
  let status = "queued";
  if (failed) {
    status = "failed";
  } else if (completed) {
    status = "completed";
  } else if (started) {
    status = "running";
  }

  const totalTestCases = queued.totalTestCases ?? 0;
  const progress = aggregateTestCaseProgress(lines);
  return {
    runId: queued.runId,
    datasetId: queued.datasetId,
    datasetName: queued.datasetName,
    evaluatorIds: queued.evaluatorIds,
    queuedAt: queued.ts ?? 0,
    startedAt: started?.startedAt,
    finishedAt: completed?.finishedAt ?? failed?.finishedAt,
    totalTestCases,
    completedTestCases: completed ? totalTestCases : progress.completedTestCases,
    passedTestCases: completed?.passedTestCases ?? progress.passedTestCases,
    failedTestCases: completed?.failedTestCases ?? progress.failedTestCases,
    status,
    artifactPath: filePath,
    errorMessage: failed?.errorMessage
  };
}
|
|
153
|
+
/**
 * Derives progress counters from the TestCaseProgress events in JSONL lines.
 *
 * `completedTestCases` is the value carried by the last progress event that
 * provides one. A test case counts as passed only if every progress event
 * recorded for its `testCaseId` passed. Unparseable lines are ignored.
 *
 * @param lines - Raw JSONL lines.
 * @returns `{ completedTestCases, passedTestCases, failedTestCases }`.
 */
function aggregateTestCaseProgress(lines) {
  let latestCompleted = 0;
  const verdictById = /* @__PURE__ */ new Map();
  for (const rawLine of lines) {
    try {
      const parsed = JSON.parse(rawLine);
      if (parsed.type !== "TestCaseProgress") {
        continue;
      }
      latestCompleted = parsed.completedTestCases ?? latestCompleted;
      const previous = verdictById.get(parsed.testCaseId);
      const combined = previous === undefined ? parsed.passed : previous && parsed.passed;
      verdictById.set(parsed.testCaseId, combined);
    } catch {
      // Ignore lines that are not valid JSON events.
    }
  }
  const verdicts = [...verdictById.values()];
  const passedTestCases = verdicts.filter((verdict) => verdict).length;
  return {
    completedTestCases: latestCompleted,
    passedTestCases,
    failedTestCases: verdicts.length - passedTestCases
  };
}
|
|
180
|
+
|
|
16
181
|
// src/runner/config.ts
|
|
17
182
|
var defaultRunnerConfig = {
|
|
18
183
|
discovery: {
|
|
19
184
|
rootDir: process.cwd(),
|
|
20
185
|
datasetSuffixes: [".dataset.ts", ".dataset.tsx", ".dataset.js", ".dataset.mjs"],
|
|
21
|
-
evaluatorSuffixes: [
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
".evaluator.js",
|
|
25
|
-
".evaluator.mjs"
|
|
26
|
-
],
|
|
27
|
-
testCaseSuffixes: [
|
|
28
|
-
".test-case.ts",
|
|
29
|
-
".test-case.tsx",
|
|
30
|
-
".test-case.js",
|
|
31
|
-
".test-case.mjs"
|
|
32
|
-
],
|
|
186
|
+
evaluatorSuffixes: [".evaluator.ts", ".evaluator.tsx", ".evaluator.js", ".evaluator.mjs"],
|
|
187
|
+
runConfigSuffixes: [".run-config.ts", ".run-config.tsx", ".run-config.js", ".run-config.mjs"],
|
|
188
|
+
testCaseSuffixes: [".test-case.ts", ".test-case.tsx", ".test-case.js", ".test-case.mjs"],
|
|
33
189
|
excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
|
|
34
190
|
},
|
|
35
191
|
artifactDirectory: ".eval-results",
|
|
@@ -54,6 +210,11 @@ function toRunnerConfigOverrides(config) {
|
|
|
54
210
|
} else if (rawDiscovery?.evaluatorSuffixes !== void 0) {
|
|
55
211
|
discovery.evaluatorSuffixes = rawDiscovery.evaluatorSuffixes;
|
|
56
212
|
}
|
|
213
|
+
if (rawDiscovery?.runConfigFilePatterns !== void 0) {
|
|
214
|
+
discovery.runConfigSuffixes = rawDiscovery.runConfigFilePatterns;
|
|
215
|
+
} else if (rawDiscovery?.runConfigSuffixes !== void 0) {
|
|
216
|
+
discovery.runConfigSuffixes = rawDiscovery.runConfigSuffixes;
|
|
217
|
+
}
|
|
57
218
|
if (rawDiscovery?.testCaseFilePatterns !== void 0) {
|
|
58
219
|
discovery.testCaseSuffixes = rawDiscovery.testCaseFilePatterns;
|
|
59
220
|
} else if (rawDiscovery?.testCaseSuffixes !== void 0) {
|
|
@@ -96,14 +257,15 @@ function getJitiLoader() {
|
|
|
96
257
|
}
|
|
97
258
|
const createJiti2 = jitiModule.createJiti ?? jitiModule.default;
|
|
98
259
|
if (typeof createJiti2 !== "function") {
|
|
99
|
-
throw new Error(
|
|
100
|
-
"Failed to initialize jiti for m4trix eval config loading."
|
|
101
|
-
);
|
|
260
|
+
throw new Error("Failed to initialize jiti for m4trix eval config loading.");
|
|
102
261
|
}
|
|
103
|
-
cachedLoader = createJiti2(
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
262
|
+
cachedLoader = createJiti2(
|
|
263
|
+
import.meta.url,
|
|
264
|
+
{
|
|
265
|
+
interopDefault: true,
|
|
266
|
+
moduleCache: true
|
|
267
|
+
}
|
|
268
|
+
);
|
|
107
269
|
return cachedLoader;
|
|
108
270
|
}
|
|
109
271
|
function resolveConfigModuleExport(loadedModule) {
|
|
@@ -151,6 +313,9 @@ function isDatasetLike(value) {
|
|
|
151
313
|
/**
 * Duck-type check for evaluators: the value must pass `hasMethod` for
 * `getName`, `resolveContext` and `getEvaluateFn`.
 */
function isEvaluatorLike(value) {
  const requiredMethods = ["getName", "resolveContext", "getEvaluateFn"];
  return requiredMethods.every((methodName) => hasMethod(value, methodName));
}
|
|
316
|
+
/**
 * Duck-type check for run configs: the value must pass `hasMethod` for
 * `getName` and `getRuns`, and `getRuns` must be a function.
 */
function isRunConfigLike(value) {
  const requiredMethods = ["getName", "getRuns"];
  if (!requiredMethods.every((methodName) => hasMethod(value, methodName))) {
    return false;
  }
  return typeof value.getRuns === "function";
}
|
|
154
319
|
/**
 * Duck-type check for test cases: the value must pass `hasMethod` for
 * `getName`, `getTags` and `getInput`.
 */
function isTestCaseLike(value) {
  const requiredMethods = ["getName", "getTags", "getInput"];
  return requiredMethods.every((methodName) => hasMethod(value, methodName));
}
|
|
@@ -207,9 +372,7 @@ async function loadModuleExports(filePath) {
|
|
|
207
372
|
}
|
|
208
373
|
async function collectDatasetsFromFiles(config) {
|
|
209
374
|
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
210
|
-
const matched = files.filter(
|
|
211
|
-
(filePath) => hasOneSuffix(filePath, config.datasetSuffixes)
|
|
212
|
-
);
|
|
375
|
+
const matched = files.filter((filePath) => hasOneSuffix(filePath, config.datasetSuffixes));
|
|
213
376
|
const found = await Promise.all(
|
|
214
377
|
matched.map(async (absolutePath) => {
|
|
215
378
|
const exports = await loadModuleExports(absolutePath);
|
|
@@ -226,9 +389,7 @@ async function collectDatasetsFromFiles(config) {
|
|
|
226
389
|
}
|
|
227
390
|
async function collectEvaluatorsFromFiles(config) {
|
|
228
391
|
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
229
|
-
const matched = files.filter(
|
|
230
|
-
(filePath) => hasOneSuffix(filePath, config.evaluatorSuffixes)
|
|
231
|
-
);
|
|
392
|
+
const matched = files.filter((filePath) => hasOneSuffix(filePath, config.evaluatorSuffixes));
|
|
232
393
|
const found = await Promise.all(
|
|
233
394
|
matched.map(async (absolutePath) => {
|
|
234
395
|
const exports = await loadModuleExports(absolutePath);
|
|
@@ -243,11 +404,26 @@ async function collectEvaluatorsFromFiles(config) {
|
|
|
243
404
|
);
|
|
244
405
|
return found.flat();
|
|
245
406
|
}
|
|
246
|
-
async function
|
|
407
|
+
/**
 * Discovers run configs exported from files under `config.rootDir`.
 *
 * Files are selected by `config.runConfigSuffixes`; every export that passes
 * `isRunConfigLike` is collected.
 *
 * @param config - Discovery config (`rootDir`, `excludeDirectories`, `runConfigSuffixes`).
 * @returns A flat list of `{ id, filePath, runConfig }`, where `id` is the run
 *   config's name and `filePath` is relative to `rootDir`.
 */
async function collectRunConfigsFromFiles(config) {
  const allFiles = await walkDirectory(config.rootDir, config.excludeDirectories);
  const runConfigFiles = allFiles.filter(
    (candidate) => hasOneSuffix(candidate, config.runConfigSuffixes)
  );
  const loadFromFile = async (absolutePath) => {
    const moduleExports = await loadModuleExports(absolutePath);
    const discovered = moduleExports.filter(isRunConfigLike);
    const filePath = relative(config.rootDir, absolutePath);
    return discovered.map((runConfig) => ({
      id: runConfig.getName(),
      filePath,
      runConfig
    }));
  };
  const perFile = await Promise.all(runConfigFiles.map((absolutePath) => loadFromFile(absolutePath)));
  return perFile.flat();
}
|
|
424
|
+
async function collectTestCasesFromFiles(config) {
|
|
425
|
+
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
426
|
+
const matched = files.filter((filePath) => hasOneSuffix(filePath, config.testCaseSuffixes));
|
|
251
427
|
const found = await Promise.all(
|
|
252
428
|
matched.map(async (absolutePath) => {
|
|
253
429
|
const exports = await loadModuleExports(absolutePath);
|
|
@@ -319,16 +495,8 @@ function createDiffString(expected, actual, diffOptions) {
|
|
|
319
495
|
const expectedProcessed = preprocessForDiff(expected, diffOptions);
|
|
320
496
|
const actualProcessed = preprocessForDiff(actual, diffOptions);
|
|
321
497
|
if (diffOptions?.keysOnly) {
|
|
322
|
-
const expectedKeys = JSON.stringify(
|
|
323
|
-
|
|
324
|
-
null,
|
|
325
|
-
2
|
|
326
|
-
);
|
|
327
|
-
const actualKeys = JSON.stringify(
|
|
328
|
-
extractKeys(actualProcessed),
|
|
329
|
-
null,
|
|
330
|
-
2
|
|
331
|
-
);
|
|
498
|
+
const expectedKeys = JSON.stringify(extractKeys(expectedProcessed), null, 2);
|
|
499
|
+
const actualKeys = JSON.stringify(extractKeys(actualProcessed), null, 2);
|
|
332
500
|
const parts2 = diffLines(expectedKeys, actualKeys);
|
|
333
501
|
return formatDiffParts(parts2);
|
|
334
502
|
}
|
|
@@ -339,9 +507,7 @@ function createDiffString(expected, actual, diffOptions) {
|
|
|
339
507
|
}
|
|
340
508
|
const parts = diffLines(expectedStr, actualStr);
|
|
341
509
|
if (diffOptions?.outputNewOnly) {
|
|
342
|
-
const filtered = parts.filter(
|
|
343
|
-
(p) => p.added === true
|
|
344
|
-
);
|
|
510
|
+
const filtered = parts.filter((p) => p.added === true);
|
|
345
511
|
return formatDiffParts(filtered);
|
|
346
512
|
}
|
|
347
513
|
return formatDiffParts(parts);
|
|
@@ -408,6 +574,17 @@ function getDiffLines(entry) {
|
|
|
408
574
|
});
|
|
409
575
|
}
|
|
410
576
|
|
|
577
|
+
// src/evals/test-case.ts
|
|
578
|
+
/**
 * Resolves the label to show for a test case.
 *
 * Uses `getDisplayLabel()` when that method exists and returns something
 * other than `undefined`. Otherwise it falls back to `getName()`, and to an
 * empty string when that is unavailable too. Falling back on an undefined
 * label matches how evaluator labels are resolved in this file.
 *
 * @param testCase - Object that may expose `getDisplayLabel` and/or `getName`.
 * @returns The display label, the name, or `""`.
 */
function getTestCaseDisplayLabel(testCase) {
  if (typeof testCase.getDisplayLabel === "function") {
    const label = testCase.getDisplayLabel();
    if (label !== undefined) {
      return label;
    }
  }
  return typeof testCase.getName === "function" ? testCase.getName() : "";
}
|
|
584
|
+
/**
 * Returns a fresh array copy of a test case's tags.
 *
 * @param testCase - Object that may expose `getTags()`.
 * @returns A new array with the tags, or an empty array when `getTags` is
 *   not a function.
 */
function getTestCaseTagList(testCase) {
  if (typeof testCase.getTags !== "function") {
    return [];
  }
  const tags = testCase.getTags();
  return [...tags];
}
|
|
587
|
+
|
|
411
588
|
// src/evals/metric.ts
|
|
412
589
|
var registry = /* @__PURE__ */ new Map();
|
|
413
590
|
var Metric = {
|
|
@@ -431,6 +608,54 @@ function getMetricById(id) {
|
|
|
431
608
|
return registry.get(id);
|
|
432
609
|
}
|
|
433
610
|
|
|
611
|
+
// src/evals/aggregators.ts
|
|
612
|
+
/**
 * Sums token counts field by field.
 *
 * Fields that are `null`/`undefined` on an entry contribute 0.
 *
 * @param values - Array of `{ input, output, inputCached, outputCached }` entries.
 * @returns A new object holding the four totals.
 */
function aggregateTokenCountSum(values) {
  const totals = {
    input: 0,
    output: 0,
    inputCached: 0,
    outputCached: 0
  };
  values.forEach((entry) => {
    totals.input += entry.input ?? 0;
    totals.output += entry.output ?? 0;
    totals.inputCached += entry.inputCached ?? 0;
    totals.outputCached += entry.outputCached ?? 0;
  });
  return totals;
}
|
|
629
|
+
/**
 * Averages latency samples.
 *
 * A sample whose `ms` is `null`/`undefined` counts as 0 rather than turning
 * the whole average into NaN; this matches the `?? 0` handling used by the
 * token-count aggregator in this file.
 *
 * @param values - Array of `{ ms }` samples.
 * @returns `{ ms }` with the arithmetic mean, or `{ ms: 0 }` for an empty array.
 */
function aggregateLatencyAverage(values) {
  if (values.length === 0) {
    return { ms: 0 };
  }
  const totalMs = values.reduce((sum, sample) => sum + (sample.ms ?? 0), 0);
  return { ms: totalMs / values.length };
}
|
|
636
|
+
|
|
637
|
+
// src/evals/metrics/standard.ts
|
|
638
|
+
// Standard "token-count" metric: summed across entries; formatted as
// "in:<n> out:<n> cached:<n>", prefixed with "Total: " when aggregated.
Metric.of({
  id: "token-count",
  name: "Tokens",
  aggregate: aggregateTokenCountSum,
  format: (data, options) => {
    const cachedTotal = (data.inputCached ?? 0) + (data.outputCached ?? 0);
    const summary = `in:${data.input ?? 0} out:${data.output ?? 0} cached:${cachedTotal}`;
    if (options?.isAggregated) {
      return `Total: ${summary}`;
    }
    return summary;
  }
});
// Standard "latency" metric: averaged across entries; formatted as "<n>ms",
// prefixed with "Avg: " when aggregated.
Metric.of({
  id: "latency",
  name: "Latency",
  aggregate: aggregateLatencyAverage,
  format: (data, options) => {
    const rendered = `${data.ms}ms`;
    return options?.isAggregated ? `Avg: ${rendered}` : rendered;
  }
});
|
|
658
|
+
|
|
434
659
|
// src/evals/score.ts
|
|
435
660
|
var registry2 = /* @__PURE__ */ new Map();
|
|
436
661
|
function formatScoreData(def, data, options) {
|
|
@@ -443,10 +668,7 @@ var ScoreAggregate = {
|
|
|
443
668
|
const count = values.length || 1;
|
|
444
669
|
const result = {};
|
|
445
670
|
for (const field of fields) {
|
|
446
|
-
result[field] = values.reduce(
|
|
447
|
-
(s, v) => s + (v[field] ?? 0),
|
|
448
|
-
0
|
|
449
|
-
) / count;
|
|
671
|
+
result[field] = values.reduce((s, v) => s + (v[field] ?? 0), 0) / count;
|
|
450
672
|
}
|
|
451
673
|
return result;
|
|
452
674
|
};
|
|
@@ -480,13 +702,10 @@ var ScoreAggregate = {
|
|
|
480
702
|
(s, v) => s + (v[valueField] ?? 0),
|
|
481
703
|
0
|
|
482
704
|
);
|
|
483
|
-
const sumSq = values.reduce(
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
},
|
|
488
|
-
0
|
|
489
|
-
);
|
|
705
|
+
const sumSq = values.reduce((s, v) => {
|
|
706
|
+
const value = v[valueField] ?? 0;
|
|
707
|
+
return s + value * value;
|
|
708
|
+
}, 0);
|
|
490
709
|
const mean = sum / count;
|
|
491
710
|
const variance = (sumSq - count * mean * mean) / (count - 1);
|
|
492
711
|
stdDev = variance > 0 ? Math.sqrt(variance) : 0;
|
|
@@ -545,54 +764,6 @@ function getScoreById(id) {
|
|
|
545
764
|
return registry2.get(id);
|
|
546
765
|
}
|
|
547
766
|
|
|
548
|
-
// src/evals/aggregators.ts
|
|
549
|
-
function aggregateTokenCountSum(values) {
|
|
550
|
-
const initial = {
|
|
551
|
-
input: 0,
|
|
552
|
-
output: 0,
|
|
553
|
-
inputCached: 0,
|
|
554
|
-
outputCached: 0
|
|
555
|
-
};
|
|
556
|
-
return values.reduce(
|
|
557
|
-
(acc, v) => ({
|
|
558
|
-
input: acc.input + (v.input ?? 0),
|
|
559
|
-
output: acc.output + (v.output ?? 0),
|
|
560
|
-
inputCached: acc.inputCached + (v.inputCached ?? 0),
|
|
561
|
-
outputCached: acc.outputCached + (v.outputCached ?? 0)
|
|
562
|
-
}),
|
|
563
|
-
initial
|
|
564
|
-
);
|
|
565
|
-
}
|
|
566
|
-
function aggregateLatencyAverage(values) {
|
|
567
|
-
if (values.length === 0) {
|
|
568
|
-
return { ms: 0 };
|
|
569
|
-
}
|
|
570
|
-
const sum = values.reduce((s, v) => s + v.ms, 0);
|
|
571
|
-
return { ms: sum / values.length };
|
|
572
|
-
}
|
|
573
|
-
|
|
574
|
-
// src/evals/metrics/standard.ts
|
|
575
|
-
Metric.of({
|
|
576
|
-
id: "token-count",
|
|
577
|
-
name: "Tokens",
|
|
578
|
-
aggregate: aggregateTokenCountSum,
|
|
579
|
-
format: (data, options) => {
|
|
580
|
-
const input = data.input ?? 0;
|
|
581
|
-
const output = data.output ?? 0;
|
|
582
|
-
const inputCached = data.inputCached ?? 0;
|
|
583
|
-
const outputCached = data.outputCached ?? 0;
|
|
584
|
-
const cached = inputCached + outputCached;
|
|
585
|
-
const base = `in:${input} out:${output} cached:${cached}`;
|
|
586
|
-
return options?.isAggregated ? `Total: ${base}` : base;
|
|
587
|
-
}
|
|
588
|
-
});
|
|
589
|
-
Metric.of({
|
|
590
|
-
id: "latency",
|
|
591
|
-
name: "Latency",
|
|
592
|
-
aggregate: aggregateLatencyAverage,
|
|
593
|
-
format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
|
|
594
|
-
});
|
|
595
|
-
|
|
596
767
|
// src/evals/scores/standard.ts
|
|
597
768
|
Score.of({
|
|
598
769
|
id: "percent",
|
|
@@ -736,15 +907,17 @@ function readOutput(testCase) {
|
|
|
736
907
|
}
|
|
737
908
|
return candidate.getOutput();
|
|
738
909
|
}
|
|
739
|
-
function buildEvaluationUnits(testCases) {
|
|
910
|
+
function buildEvaluationUnits(testCases, repetitionCount) {
|
|
911
|
+
const count = Math.max(1, repetitionCount);
|
|
740
912
|
const units = [];
|
|
741
913
|
for (const testCaseItem of testCases) {
|
|
742
|
-
const
|
|
743
|
-
for (let r = 0; r <
|
|
914
|
+
const repetitionId = `rep-${randomUUID()}`;
|
|
915
|
+
for (let r = 0; r < count; r++) {
|
|
744
916
|
units.push({
|
|
745
917
|
testCaseItem,
|
|
746
|
-
|
|
747
|
-
|
|
918
|
+
repetitionId,
|
|
919
|
+
repetitionIndex: r + 1,
|
|
920
|
+
repetitionCount: count
|
|
748
921
|
});
|
|
749
922
|
}
|
|
750
923
|
}
|
|
@@ -754,29 +927,24 @@ function nowIsoForFile() {
|
|
|
754
927
|
return (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
755
928
|
}
|
|
756
929
|
/**
 * Builds the artifact file path for a run.
 *
 * @param artifactDirectory - Directory the artifact is placed in.
 * @param datasetId - Dataset identifier, first part of the file name.
 * @param runId - Run identifier, second part of the file name.
 * @returns `<artifactDirectory>/<datasetId>_<runId>_<timestamp>.jsonl`, where
 *   the timestamp comes from `nowIsoForFile()`.
 */
function createArtifactPath(artifactDirectory, datasetId, runId) {
  const fileName = `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`;
  return join(artifactDirectory, fileName);
}
|
|
762
932
|
function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
|
|
763
|
-
const { testCaseItem,
|
|
933
|
+
const { testCaseItem, repetitionId, repetitionIndex, repetitionCount } = unit;
|
|
764
934
|
return Effect.gen(function* () {
|
|
765
935
|
const evaluatorRunId = `run-${randomUUID()}`;
|
|
766
936
|
const started = Date.now();
|
|
767
|
-
const startedEvaluations = yield* Ref.modify(startedRef, (n) => [
|
|
768
|
-
n + 1,
|
|
769
|
-
n + 1
|
|
770
|
-
]);
|
|
937
|
+
const startedEvaluations = yield* Ref.modify(startedRef, (n) => [n + 1, n + 1]);
|
|
771
938
|
yield* publishEvent({
|
|
772
939
|
type: "TestCaseStarted",
|
|
773
940
|
runId: task.runId,
|
|
774
941
|
testCaseId: testCaseItem.id,
|
|
775
|
-
testCaseName: testCaseItem.testCase
|
|
942
|
+
testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
|
|
776
943
|
startedTestCases: startedEvaluations,
|
|
777
944
|
totalTestCases: totalEvaluations,
|
|
778
|
-
|
|
779
|
-
|
|
945
|
+
repetitionId,
|
|
946
|
+
repetitionIndex,
|
|
947
|
+
repetitionCount
|
|
780
948
|
});
|
|
781
949
|
const evaluatorScores = [];
|
|
782
950
|
let testCaseError;
|
|
@@ -800,9 +968,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
800
968
|
return error;
|
|
801
969
|
};
|
|
802
970
|
try {
|
|
803
|
-
const ctx = yield* Effect.promise(
|
|
804
|
-
() => Promise.resolve(evaluator.resolveContext())
|
|
805
|
-
);
|
|
971
|
+
const ctx = yield* Effect.promise(() => Promise.resolve(evaluator.resolveContext()));
|
|
806
972
|
const result = yield* Effect.promise(
|
|
807
973
|
() => Promise.resolve().then(
|
|
808
974
|
() => evaluateFn({
|
|
@@ -812,8 +978,15 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
812
978
|
meta: {
|
|
813
979
|
triggerId: task.triggerId,
|
|
814
980
|
runId: evaluatorRunId,
|
|
815
|
-
datasetId: task.datasetId
|
|
981
|
+
datasetId: task.datasetId,
|
|
982
|
+
repetitionId,
|
|
983
|
+
repetitionIndex,
|
|
984
|
+
repetitionCount,
|
|
985
|
+
runConfigName: task.runConfigName
|
|
816
986
|
},
|
|
987
|
+
testCaseTags: getTestCaseTagList(testCaseItem.testCase),
|
|
988
|
+
runConfigTags: task.runConfigTags,
|
|
989
|
+
evaluatorTags: getEvaluatorTagList(evaluator),
|
|
817
990
|
logDiff,
|
|
818
991
|
log,
|
|
819
992
|
createError
|
|
@@ -856,21 +1029,19 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
856
1029
|
});
|
|
857
1030
|
}
|
|
858
1031
|
}
|
|
859
|
-
const
|
|
860
|
-
const completedEvaluations = yield* Ref.modify(completedRef, (n) => [
|
|
861
|
-
n + 1,
|
|
862
|
-
n + 1
|
|
863
|
-
]);
|
|
1032
|
+
const repetitionPassedThis = evaluatorScores.every((s) => s.passed);
|
|
1033
|
+
const completedEvaluations = yield* Ref.modify(completedRef, (n) => [n + 1, n + 1]);
|
|
864
1034
|
const progressEvent = {
|
|
865
1035
|
type: "TestCaseProgress",
|
|
866
1036
|
runId: task.runId,
|
|
867
1037
|
testCaseId: testCaseItem.id,
|
|
868
|
-
testCaseName: testCaseItem.testCase
|
|
1038
|
+
testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
|
|
869
1039
|
completedTestCases: completedEvaluations,
|
|
870
1040
|
totalTestCases: totalEvaluations,
|
|
871
|
-
|
|
872
|
-
|
|
873
|
-
|
|
1041
|
+
repetitionId,
|
|
1042
|
+
repetitionIndex,
|
|
1043
|
+
repetitionCount,
|
|
1044
|
+
passed: repetitionPassedThis,
|
|
874
1045
|
durationMs: Date.now() - started,
|
|
875
1046
|
evaluatorScores,
|
|
876
1047
|
output,
|
|
@@ -891,9 +1062,9 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
891
1062
|
(map) => {
|
|
892
1063
|
const key = testCaseItem.id;
|
|
893
1064
|
const existing = map.get(key) ?? { completedCount: 0, results: [] };
|
|
894
|
-
const newResults = [...existing.results,
|
|
1065
|
+
const newResults = [...existing.results, repetitionPassedThis];
|
|
895
1066
|
const newCompletedCount = existing.completedCount + 1;
|
|
896
|
-
const isLast = newCompletedCount ===
|
|
1067
|
+
const isLast = newCompletedCount === repetitionCount;
|
|
897
1068
|
const newMap = new Map(map);
|
|
898
1069
|
newMap.set(key, {
|
|
899
1070
|
completedCount: newCompletedCount,
|
|
@@ -909,10 +1080,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
909
1080
|
} else {
|
|
910
1081
|
yield* Ref.update(failedRef, (n) => n + 1);
|
|
911
1082
|
}
|
|
912
|
-
const [passed, failed] = yield* Effect.all([
|
|
913
|
-
Ref.get(passedRef),
|
|
914
|
-
Ref.get(failedRef)
|
|
915
|
-
]);
|
|
1083
|
+
const [passed, failed] = yield* Effect.all([Ref.get(passedRef), Ref.get(failedRef)]);
|
|
916
1084
|
yield* updateSnapshot(task.runId, (snapshot) => ({
|
|
917
1085
|
...snapshot,
|
|
918
1086
|
passedTestCases: passed,
|
|
@@ -933,10 +1101,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
933
1101
|
runId: task.runId,
|
|
934
1102
|
startedAt
|
|
935
1103
|
});
|
|
936
|
-
const totalEvaluations = task.testCases.
|
|
937
|
-
(sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
|
|
938
|
-
0
|
|
939
|
-
);
|
|
1104
|
+
const totalEvaluations = task.testCases.length * Math.max(1, task.repetitions);
|
|
940
1105
|
const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
|
|
941
1106
|
const completedRef = yield* Ref.make(0);
|
|
942
1107
|
const startedRef = yield* Ref.make(0);
|
|
@@ -945,7 +1110,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
945
1110
|
const testCaseResultsRef = yield* Ref.make(
|
|
946
1111
|
/* @__PURE__ */ new Map()
|
|
947
1112
|
);
|
|
948
|
-
const evaluationUnits = buildEvaluationUnits(task.testCases);
|
|
1113
|
+
const evaluationUnits = buildEvaluationUnits(task.testCases, task.repetitions);
|
|
949
1114
|
const processEvaluation = (unit) => processOneEvaluation(
|
|
950
1115
|
task,
|
|
951
1116
|
unit,
|
|
@@ -959,11 +1124,20 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
959
1124
|
failedRef,
|
|
960
1125
|
testCaseResultsRef
|
|
961
1126
|
);
|
|
962
|
-
|
|
963
|
-
|
|
964
|
-
|
|
965
|
-
|
|
966
|
-
|
|
1127
|
+
const globalSem = task.globalEvaluationSemaphore;
|
|
1128
|
+
if (globalSem !== void 0) {
|
|
1129
|
+
yield* Effect.forEach(
|
|
1130
|
+
evaluationUnits,
|
|
1131
|
+
(unit) => globalSem.withPermits(1)(processEvaluation(unit)),
|
|
1132
|
+
{ concurrency: "unbounded", discard: true }
|
|
1133
|
+
);
|
|
1134
|
+
} else {
|
|
1135
|
+
yield* Effect.forEach(
|
|
1136
|
+
evaluationUnits,
|
|
1137
|
+
processEvaluation,
|
|
1138
|
+
maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
|
|
1139
|
+
);
|
|
1140
|
+
}
|
|
967
1141
|
const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* Effect.all([
|
|
968
1142
|
Ref.get(completedRef),
|
|
969
1143
|
Ref.get(passedRef),
|
|
@@ -999,125 +1173,34 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
999
1173
|
artifactPath: task.snapshot.artifactPath
|
|
1000
1174
|
});
|
|
1001
1175
|
});
|
|
1002
|
-
|
|
1003
|
-
|
|
1004
|
-
|
|
1005
|
-
|
|
1006
|
-
|
|
1007
|
-
} catch {
|
|
1008
|
-
return [];
|
|
1009
|
-
}
|
|
1010
|
-
const jsonlFiles = entries.filter((name) => name.endsWith(".jsonl"));
|
|
1011
|
-
const snapshots = [];
|
|
1012
|
-
for (const fileName of jsonlFiles) {
|
|
1013
|
-
const filePath = join(baseDir, fileName);
|
|
1014
|
-
try {
|
|
1015
|
-
const snapshot = await parseArtifactToSnapshot(filePath, config);
|
|
1016
|
-
if (snapshot) {
|
|
1017
|
-
snapshots.push(snapshot);
|
|
1018
|
-
}
|
|
1019
|
-
} catch {
|
|
1020
|
-
}
|
|
1021
|
-
}
|
|
1022
|
-
return snapshots.sort((a, b) => b.queuedAt - a.queuedAt);
|
|
1023
|
-
}
|
|
1024
|
-
async function parseArtifactToSnapshot(filePath, _config) {
|
|
1025
|
-
const content = await readFile(filePath, "utf8");
|
|
1026
|
-
const lines = content.split("\n").filter((line) => line.trim().length > 0);
|
|
1027
|
-
if (lines.length === 0) {
|
|
1028
|
-
return null;
|
|
1029
|
-
}
|
|
1030
|
-
let runQueued = null;
|
|
1031
|
-
let runCompleted = null;
|
|
1032
|
-
let runFailed = null;
|
|
1033
|
-
let runStarted = null;
|
|
1034
|
-
for (const line of lines) {
|
|
1035
|
-
try {
|
|
1036
|
-
const event = JSON.parse(line);
|
|
1037
|
-
const type = event.type;
|
|
1038
|
-
if (type === "RunQueued") {
|
|
1039
|
-
runQueued = {
|
|
1040
|
-
runId: event.runId,
|
|
1041
|
-
datasetId: event.datasetId,
|
|
1042
|
-
datasetName: event.datasetName,
|
|
1043
|
-
evaluatorIds: event.evaluatorIds,
|
|
1044
|
-
totalTestCases: event.totalTestCases ?? 0,
|
|
1045
|
-
artifactPath: event.artifactPath ?? filePath,
|
|
1046
|
-
ts: event.ts
|
|
1047
|
-
};
|
|
1048
|
-
}
|
|
1049
|
-
if (type === "RunStarted") {
|
|
1050
|
-
runStarted = { startedAt: event.startedAt };
|
|
1051
|
-
}
|
|
1052
|
-
if (type === "RunCompleted") {
|
|
1053
|
-
runCompleted = {
|
|
1054
|
-
passedTestCases: event.passedTestCases,
|
|
1055
|
-
failedTestCases: event.failedTestCases,
|
|
1056
|
-
totalTestCases: event.totalTestCases,
|
|
1057
|
-
finishedAt: event.finishedAt
|
|
1058
|
-
};
|
|
1059
|
-
}
|
|
1060
|
-
if (type === "RunFailed") {
|
|
1061
|
-
runFailed = {
|
|
1062
|
-
finishedAt: event.finishedAt,
|
|
1063
|
-
errorMessage: event.errorMessage
|
|
1064
|
-
};
|
|
1065
|
-
}
|
|
1066
|
-
} catch {
|
|
1067
|
-
}
|
|
1176
|
+
|
|
1177
|
+
// src/runner/name-pattern.ts
|
|
1178
|
+
function parseRegexLiteral(pattern) {
|
|
1179
|
+
if (!pattern.startsWith("/")) {
|
|
1180
|
+
return void 0;
|
|
1068
1181
|
}
|
|
1069
|
-
|
|
1070
|
-
|
|
1182
|
+
const lastSlash = pattern.lastIndexOf("/");
|
|
1183
|
+
if (lastSlash <= 0) {
|
|
1184
|
+
return void 0;
|
|
1071
1185
|
}
|
|
1072
|
-
const artifactPath = filePath;
|
|
1073
|
-
const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
|
|
1074
|
-
const progress = aggregateTestCaseProgress(lines);
|
|
1075
|
-
const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
|
|
1076
|
-
const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
|
|
1077
|
-
const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
|
|
1078
1186
|
return {
|
|
1079
|
-
|
|
1080
|
-
|
|
1081
|
-
datasetName: runQueued.datasetName,
|
|
1082
|
-
evaluatorIds: runQueued.evaluatorIds,
|
|
1083
|
-
queuedAt: runQueued.ts ?? 0,
|
|
1084
|
-
startedAt: runStarted?.startedAt,
|
|
1085
|
-
finishedAt: runCompleted?.finishedAt ?? runFailed?.finishedAt,
|
|
1086
|
-
totalTestCases: runQueued.totalTestCases,
|
|
1087
|
-
completedTestCases,
|
|
1088
|
-
passedTestCases,
|
|
1089
|
-
failedTestCases,
|
|
1090
|
-
status,
|
|
1091
|
-
artifactPath,
|
|
1092
|
-
errorMessage: runFailed?.errorMessage
|
|
1187
|
+
source: pattern.slice(1, lastSlash),
|
|
1188
|
+
flags: pattern.slice(lastSlash + 1)
|
|
1093
1189
|
};
|
|
1094
1190
|
}
|
|
1095
|
-
function
|
|
1096
|
-
|
|
1097
|
-
const
|
|
1098
|
-
|
|
1099
|
-
|
|
1100
|
-
|
|
1101
|
-
if (event.type === "TestCaseProgress") {
|
|
1102
|
-
const ev = event;
|
|
1103
|
-
completedTestCases = ev.completedTestCases ?? completedTestCases;
|
|
1104
|
-
const id = ev.testCaseId;
|
|
1105
|
-
const current = testCasePassedBy.get(id);
|
|
1106
|
-
testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
|
|
1107
|
-
}
|
|
1108
|
-
} catch {
|
|
1109
|
-
}
|
|
1191
|
+
function createNameMatcher(pattern) {
|
|
1192
|
+
const normalizedPattern = pattern.trim();
|
|
1193
|
+
const regexLiteral = parseRegexLiteral(normalizedPattern);
|
|
1194
|
+
if (regexLiteral) {
|
|
1195
|
+
const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
|
|
1196
|
+
return (value) => regex.test(value);
|
|
1110
1197
|
}
|
|
1111
|
-
|
|
1112
|
-
|
|
1113
|
-
|
|
1114
|
-
|
|
1115
|
-
passedTestCases += 1;
|
|
1116
|
-
} else {
|
|
1117
|
-
failedTestCases += 1;
|
|
1118
|
-
}
|
|
1198
|
+
if (normalizedPattern.includes("*")) {
|
|
1199
|
+
const escaped = normalizedPattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
|
|
1200
|
+
const regex = new RegExp(`^${escaped}$`, "i");
|
|
1201
|
+
return (value) => regex.test(value);
|
|
1119
1202
|
}
|
|
1120
|
-
return
|
|
1203
|
+
return (value) => value.toLowerCase() === normalizedPattern.toLowerCase();
|
|
1121
1204
|
}
|
|
1122
1205
|
async function appendJsonLine(artifactPath, payload) {
|
|
1123
1206
|
await mkdir(dirname(artifactPath), { recursive: true });
|
|
@@ -1176,32 +1259,12 @@ function searchCollectedTestCases(all, query) {
|
|
|
1176
1259
|
}
|
|
1177
1260
|
|
|
1178
1261
|
// src/runner/api.ts
|
|
1179
|
-
function
|
|
1180
|
-
|
|
1181
|
-
|
|
1182
|
-
|
|
1183
|
-
const lastSlash = pattern.lastIndexOf("/");
|
|
1184
|
-
if (lastSlash <= 0) {
|
|
1185
|
-
return void 0;
|
|
1186
|
-
}
|
|
1187
|
-
return {
|
|
1188
|
-
source: pattern.slice(1, lastSlash),
|
|
1189
|
-
flags: pattern.slice(lastSlash + 1)
|
|
1190
|
-
};
|
|
1191
|
-
}
|
|
1192
|
-
function createNameMatcher(pattern) {
|
|
1193
|
-
const normalizedPattern = pattern.trim();
|
|
1194
|
-
const regexLiteral = parseRegexLiteral(normalizedPattern);
|
|
1195
|
-
if (regexLiteral) {
|
|
1196
|
-
const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
|
|
1197
|
-
return (value) => regex.test(value);
|
|
1198
|
-
}
|
|
1199
|
-
if (normalizedPattern.includes("*")) {
|
|
1200
|
-
const escaped = normalizedPattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
|
|
1201
|
-
const regex = new RegExp(`^${escaped}$`, "i");
|
|
1202
|
-
return (value) => regex.test(value);
|
|
1262
|
+
function normalizeRunRepetitions(value) {
|
|
1263
|
+
const n = value ?? 1;
|
|
1264
|
+
if (!Number.isInteger(n) || n < 1) {
|
|
1265
|
+
throw new Error(`repetitions must be a positive integer, got ${String(value)}`);
|
|
1203
1266
|
}
|
|
1204
|
-
return
|
|
1267
|
+
return n;
|
|
1205
1268
|
}
|
|
1206
1269
|
function mergeRunnerOverrides(base, next) {
|
|
1207
1270
|
if (!base) {
|
|
@@ -1232,15 +1295,12 @@ var EffectRunner = class {
|
|
|
1232
1295
|
this.persistenceQueue = Effect.runSync(
|
|
1233
1296
|
Queue.unbounded()
|
|
1234
1297
|
);
|
|
1235
|
-
this.snapshotsRef = Effect.runSync(
|
|
1236
|
-
Ref.make(/* @__PURE__ */ new Map())
|
|
1237
|
-
);
|
|
1298
|
+
this.snapshotsRef = Effect.runSync(Ref.make(/* @__PURE__ */ new Map()));
|
|
1238
1299
|
this.listeners = /* @__PURE__ */ new Set();
|
|
1239
1300
|
this.datasetsById = /* @__PURE__ */ new Map();
|
|
1240
1301
|
this.evaluatorsById = /* @__PURE__ */ new Map();
|
|
1241
|
-
this.
|
|
1242
|
-
|
|
1243
|
-
);
|
|
1302
|
+
this.runConfigsById = /* @__PURE__ */ new Map();
|
|
1303
|
+
this.schedulerFiber = Effect.runFork(this.createSchedulerEffect());
|
|
1244
1304
|
this.persistenceFiber = Effect.runFork(
|
|
1245
1305
|
createPersistenceWorker(this.persistenceQueue)
|
|
1246
1306
|
);
|
|
@@ -1280,6 +1340,137 @@ var EffectRunner = class {
|
|
|
1280
1340
|
(item) => matcher(item.evaluator.getName() ?? "")
|
|
1281
1341
|
);
|
|
1282
1342
|
}
|
|
1343
|
+
async collectRunConfigs() {
|
|
1344
|
+
const runConfigs = await collectRunConfigsFromFiles(this.config.discovery);
|
|
1345
|
+
this.runConfigsById.clear();
|
|
1346
|
+
const byNameLower = /* @__PURE__ */ new Map();
|
|
1347
|
+
for (const item of runConfigs) {
|
|
1348
|
+
const id = item.runConfig.getName();
|
|
1349
|
+
const lower = id.toLowerCase();
|
|
1350
|
+
const prev = byNameLower.get(lower);
|
|
1351
|
+
if (prev !== void 0 && prev.filePath !== item.filePath) {
|
|
1352
|
+
throw new Error(
|
|
1353
|
+
`Duplicate RunConfig name "${id}" (matches "${prev.runConfig.getName()}" case-insensitively): ${prev.filePath} and ${item.filePath}`
|
|
1354
|
+
);
|
|
1355
|
+
}
|
|
1356
|
+
byNameLower.set(lower, item);
|
|
1357
|
+
this.runConfigsById.set(id, item);
|
|
1358
|
+
}
|
|
1359
|
+
return runConfigs;
|
|
1360
|
+
}
|
|
1361
|
+
async resolveRunConfigByName(name) {
|
|
1362
|
+
if (this.runConfigsById.size === 0) {
|
|
1363
|
+
await this.collectRunConfigs();
|
|
1364
|
+
}
|
|
1365
|
+
const key = validateRunConfigName(name, `RunConfig "${name.trim()}"`);
|
|
1366
|
+
const keyLower = key.toLowerCase();
|
|
1367
|
+
const matches = Array.from(this.runConfigsById.values()).filter(
|
|
1368
|
+
(item) => item.runConfig.getName().toLowerCase() === keyLower
|
|
1369
|
+
);
|
|
1370
|
+
if (matches.length === 0) {
|
|
1371
|
+
return void 0;
|
|
1372
|
+
}
|
|
1373
|
+
if (matches.length > 1) {
|
|
1374
|
+
throw new Error(
|
|
1375
|
+
`Multiple RunConfigs named "${name}": ${matches.map((m) => m.filePath).join(", ")}`
|
|
1376
|
+
);
|
|
1377
|
+
}
|
|
1378
|
+
return matches[0];
|
|
1379
|
+
}
|
|
1380
|
+
async expandRunConfigToJobs(collected) {
|
|
1381
|
+
if (this.datasetsById.size === 0) {
|
|
1382
|
+
await this.collectDatasets();
|
|
1383
|
+
}
|
|
1384
|
+
if (this.evaluatorsById.size === 0) {
|
|
1385
|
+
await this.collectEvaluators();
|
|
1386
|
+
}
|
|
1387
|
+
const rcName = collected.runConfig.getName();
|
|
1388
|
+
const jobs = [];
|
|
1389
|
+
const runs = collected.runConfig.getRuns();
|
|
1390
|
+
for (const [i, row] of runs.entries()) {
|
|
1391
|
+
const dsCollected = Array.from(this.datasetsById.values()).find(
|
|
1392
|
+
(d) => d.dataset === row.dataset
|
|
1393
|
+
);
|
|
1394
|
+
if (!dsCollected) {
|
|
1395
|
+
throw new Error(
|
|
1396
|
+
`RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getName()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
|
|
1397
|
+
);
|
|
1398
|
+
}
|
|
1399
|
+
let evaluatorIds;
|
|
1400
|
+
if ("evaluatorPattern" in row && typeof row.evaluatorPattern === "string") {
|
|
1401
|
+
const matcher = createNameMatcher(row.evaluatorPattern);
|
|
1402
|
+
const matched = Array.from(this.evaluatorsById.values()).filter(
|
|
1403
|
+
(item) => matcher(item.evaluator.getName() ?? "")
|
|
1404
|
+
);
|
|
1405
|
+
if (matched.length === 0) {
|
|
1406
|
+
throw new Error(
|
|
1407
|
+
`RunConfig "${rcName}" run[${i}]: no evaluator matched pattern "${row.evaluatorPattern}"`
|
|
1408
|
+
);
|
|
1409
|
+
}
|
|
1410
|
+
evaluatorIds = matched.map((item) => item.id);
|
|
1411
|
+
} else {
|
|
1412
|
+
const evaluators = row.evaluators;
|
|
1413
|
+
evaluatorIds = [];
|
|
1414
|
+
for (const ev of evaluators) {
|
|
1415
|
+
const found = Array.from(this.evaluatorsById.values()).find(
|
|
1416
|
+
(item) => item.evaluator === ev
|
|
1417
|
+
);
|
|
1418
|
+
if (!found) {
|
|
1419
|
+
throw new Error(
|
|
1420
|
+
`RunConfig "${rcName}" run[${i}]: evaluator "${getEvaluatorDisplayLabel(ev) ?? "unknown"}" was not found among discovered evaluator exports`
|
|
1421
|
+
);
|
|
1422
|
+
}
|
|
1423
|
+
evaluatorIds.push(found.id);
|
|
1424
|
+
}
|
|
1425
|
+
}
|
|
1426
|
+
const repetitions = "repetitions" in row && row.repetitions !== void 0 ? row.repetitions : 1;
|
|
1427
|
+
jobs.push({
|
|
1428
|
+
datasetId: dsCollected.id,
|
|
1429
|
+
evaluatorIds,
|
|
1430
|
+
runConfigName: rcName,
|
|
1431
|
+
runConfigDisplayLabel: collected.runConfig.getDisplayLabel(),
|
|
1432
|
+
runConfigTags: collected.runConfig.getTags(),
|
|
1433
|
+
repetitions
|
|
1434
|
+
});
|
|
1435
|
+
}
|
|
1436
|
+
return jobs;
|
|
1437
|
+
}
|
|
1438
|
+
async expandRunConfigNamesToJobs(names) {
|
|
1439
|
+
const jobs = [];
|
|
1440
|
+
for (const name of names) {
|
|
1441
|
+
const collected = await this.resolveRunConfigByName(name);
|
|
1442
|
+
if (!collected) {
|
|
1443
|
+
const known = await this.collectRunConfigs();
|
|
1444
|
+
const available = known.map((r) => r.runConfig.getName()).sort();
|
|
1445
|
+
throw new Error(
|
|
1446
|
+
available.length > 0 ? `RunConfig "${name}" not found. Available RunConfigs: ${available.join(", ")}` : `RunConfig "${name}" not found and no RunConfigs were discovered.`
|
|
1447
|
+
);
|
|
1448
|
+
}
|
|
1449
|
+
jobs.push(...await this.expandRunConfigToJobs(collected));
|
|
1450
|
+
}
|
|
1451
|
+
return jobs;
|
|
1452
|
+
}
|
|
1453
|
+
async runDatasetJobsWithSharedConcurrency(request) {
|
|
1454
|
+
const globalConcurrency = Math.max(1, request.globalConcurrency);
|
|
1455
|
+
const sem = Effect.unsafeMakeSemaphore(globalConcurrency);
|
|
1456
|
+
const triggerId = request.triggerId ?? `trg-${randomUUID()}`;
|
|
1457
|
+
const snapshots = [];
|
|
1458
|
+
for (const job of request.jobs) {
|
|
1459
|
+
snapshots.push(
|
|
1460
|
+
await this.startDatasetRun({
|
|
1461
|
+
datasetId: job.datasetId,
|
|
1462
|
+
evaluatorIds: job.evaluatorIds,
|
|
1463
|
+
triggerId,
|
|
1464
|
+
maxConcurrency: this.config.maxConcurrency ?? 1,
|
|
1465
|
+
globalEvaluationSemaphore: sem,
|
|
1466
|
+
runConfigName: job.runConfigName,
|
|
1467
|
+
runConfigTags: job.runConfigTags,
|
|
1468
|
+
repetitions: job.repetitions
|
|
1469
|
+
})
|
|
1470
|
+
);
|
|
1471
|
+
}
|
|
1472
|
+
return snapshots;
|
|
1473
|
+
}
|
|
1283
1474
|
async searchTestCases(query) {
|
|
1284
1475
|
const testCases = await collectTestCasesFromFiles(this.config.discovery);
|
|
1285
1476
|
return searchCollectedTestCases(testCases, query);
|
|
@@ -1298,35 +1489,45 @@ var EffectRunner = class {
|
|
|
1298
1489
|
);
|
|
1299
1490
|
}
|
|
1300
1491
|
async runDatasetWith(request) {
|
|
1492
|
+
const runConfigName = validateRunConfigName(
|
|
1493
|
+
request.runConfigName,
|
|
1494
|
+
"runDatasetWith.runConfigName"
|
|
1495
|
+
);
|
|
1496
|
+
return this.startDatasetRun({
|
|
1497
|
+
datasetId: request.datasetId,
|
|
1498
|
+
evaluatorIds: request.evaluatorIds,
|
|
1499
|
+
triggerId: request.triggerId,
|
|
1500
|
+
maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
|
|
1501
|
+
repetitions: request.repetitions,
|
|
1502
|
+
runConfigName,
|
|
1503
|
+
runConfigTags: request.runConfigTags
|
|
1504
|
+
});
|
|
1505
|
+
}
|
|
1506
|
+
async startDatasetRun(params) {
|
|
1301
1507
|
if (this.datasetsById.size === 0) {
|
|
1302
1508
|
await this.collectDatasets();
|
|
1303
1509
|
}
|
|
1304
1510
|
if (this.evaluatorsById.size === 0) {
|
|
1305
1511
|
await this.collectEvaluators();
|
|
1306
1512
|
}
|
|
1307
|
-
const dataset = this.datasetsById.get(
|
|
1513
|
+
const dataset = this.datasetsById.get(params.datasetId);
|
|
1308
1514
|
if (!dataset) {
|
|
1309
|
-
throw new Error(`Unknown dataset: ${
|
|
1515
|
+
throw new Error(`Unknown dataset: ${params.datasetId}`);
|
|
1310
1516
|
}
|
|
1311
|
-
const selectedEvaluators =
|
|
1517
|
+
const selectedEvaluators = params.evaluatorIds.map((id) => this.evaluatorsById.get(id)).filter((value) => Boolean(value)).map((value) => ({ id: value.id, evaluator: value.evaluator }));
|
|
1312
1518
|
if (selectedEvaluators.length === 0) {
|
|
1313
1519
|
throw new Error("No evaluators selected for run");
|
|
1314
1520
|
}
|
|
1315
|
-
const selectedTestCases = await this.collectDatasetTestCases(
|
|
1316
|
-
const
|
|
1317
|
-
|
|
1318
|
-
|
|
1319
|
-
)
|
|
1320
|
-
const triggerId = request.triggerId ?? `trg-${randomUUID()}`;
|
|
1521
|
+
const selectedTestCases = await this.collectDatasetTestCases(params.datasetId);
|
|
1522
|
+
const repetitions = normalizeRunRepetitions(params.repetitions);
|
|
1523
|
+
const totalEvaluations = selectedTestCases.length * repetitions;
|
|
1524
|
+
const runConfigTags = [...params.runConfigTags ?? []];
|
|
1525
|
+
const triggerId = params.triggerId ?? `trg-${randomUUID()}`;
|
|
1321
1526
|
const runId = `run-${randomUUID()}`;
|
|
1322
|
-
const artifactPath = createArtifactPath(
|
|
1323
|
-
this.config.artifactDirectory,
|
|
1324
|
-
request.datasetId,
|
|
1325
|
-
runId
|
|
1326
|
-
);
|
|
1527
|
+
const artifactPath = createArtifactPath(this.config.artifactDirectory, params.datasetId, runId);
|
|
1327
1528
|
const snapshot = {
|
|
1328
1529
|
runId,
|
|
1329
|
-
datasetId:
|
|
1530
|
+
datasetId: params.datasetId,
|
|
1330
1531
|
datasetName: dataset.dataset.getName(),
|
|
1331
1532
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
1332
1533
|
queuedAt: Date.now(),
|
|
@@ -1347,7 +1548,7 @@ var EffectRunner = class {
|
|
|
1347
1548
|
const queuedEvent = {
|
|
1348
1549
|
type: "RunQueued",
|
|
1349
1550
|
runId,
|
|
1350
|
-
datasetId:
|
|
1551
|
+
datasetId: params.datasetId,
|
|
1351
1552
|
datasetName: dataset.dataset.getName(),
|
|
1352
1553
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
1353
1554
|
totalTestCases: totalEvaluations,
|
|
@@ -1361,17 +1562,20 @@ var EffectRunner = class {
|
|
|
1361
1562
|
payload: queuedEvent
|
|
1362
1563
|
})
|
|
1363
1564
|
);
|
|
1364
|
-
const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
|
|
1365
1565
|
await Effect.runPromise(
|
|
1366
1566
|
Queue.offer(this.runQueue, {
|
|
1367
1567
|
runId,
|
|
1368
1568
|
triggerId,
|
|
1369
|
-
datasetId:
|
|
1569
|
+
datasetId: params.datasetId,
|
|
1370
1570
|
dataset: dataset.dataset,
|
|
1371
1571
|
evaluators: selectedEvaluators,
|
|
1372
1572
|
testCases: selectedTestCases,
|
|
1373
1573
|
snapshot,
|
|
1374
|
-
maxConcurrency
|
|
1574
|
+
maxConcurrency: params.maxConcurrency,
|
|
1575
|
+
globalEvaluationSemaphore: params.globalEvaluationSemaphore,
|
|
1576
|
+
runConfigName: params.runConfigName,
|
|
1577
|
+
runConfigTags,
|
|
1578
|
+
repetitions
|
|
1375
1579
|
})
|
|
1376
1580
|
);
|
|
1377
1581
|
return snapshot;
|
|
@@ -1387,9 +1591,9 @@ var EffectRunner = class {
|
|
|
1387
1591
|
return Effect.runSync(Ref.get(this.snapshotsRef)).get(runId);
|
|
1388
1592
|
}
|
|
1389
1593
|
getAllRunSnapshots() {
|
|
1390
|
-
return Array.from(
|
|
1391
|
-
|
|
1392
|
-
)
|
|
1594
|
+
return Array.from(Effect.runSync(Ref.get(this.snapshotsRef)).values()).sort(
|
|
1595
|
+
(a, b) => b.queuedAt - a.queuedAt
|
|
1596
|
+
);
|
|
1393
1597
|
}
|
|
1394
1598
|
async loadRunSnapshotsFromArtifacts() {
|
|
1395
1599
|
return loadRunSnapshotsFromArtifacts(this.config);
|
|
@@ -1450,6 +1654,8 @@ function getDefaultConcurrency() {
|
|
|
1450
1654
|
function parseSimpleCliArgs(argv) {
|
|
1451
1655
|
const args = {
|
|
1452
1656
|
help: false,
|
|
1657
|
+
ci: false,
|
|
1658
|
+
runConfigNames: [],
|
|
1453
1659
|
unknownArgs: []
|
|
1454
1660
|
};
|
|
1455
1661
|
let index = 0;
|
|
@@ -1463,18 +1669,26 @@ function parseSimpleCliArgs(argv) {
|
|
|
1463
1669
|
args.help = true;
|
|
1464
1670
|
continue;
|
|
1465
1671
|
}
|
|
1672
|
+
if (token === "--ci") {
|
|
1673
|
+
args.ci = true;
|
|
1674
|
+
continue;
|
|
1675
|
+
}
|
|
1466
1676
|
if ((token === "--dataset" || token === "--datasetName") && argv[index + 1]) {
|
|
1467
1677
|
args.datasetName = argv[index + 1];
|
|
1468
1678
|
index += 1;
|
|
1469
1679
|
continue;
|
|
1470
1680
|
}
|
|
1471
|
-
if ((token === "--
|
|
1472
|
-
|
|
1681
|
+
if ((token === "--run-config" || token === "--runConfig") && argv[index + 1]) {
|
|
1682
|
+
const next = argv[index + 1];
|
|
1683
|
+
if (typeof next === "string") {
|
|
1684
|
+
args.runConfigNames.push(next);
|
|
1685
|
+
}
|
|
1473
1686
|
index += 1;
|
|
1474
1687
|
continue;
|
|
1475
1688
|
}
|
|
1476
1689
|
if ((token === "--concurrency" || token === "-c") && argv[index + 1]) {
|
|
1477
|
-
const
|
|
1690
|
+
const nextConc = argv[index + 1];
|
|
1691
|
+
const n = typeof nextConc === "string" ? parseInt(nextConc, 10) : Number.NaN;
|
|
1478
1692
|
if (!Number.isNaN(n) && n >= 1) {
|
|
1479
1693
|
args.concurrency = n;
|
|
1480
1694
|
}
|
|
@@ -1488,16 +1702,12 @@ function parseSimpleCliArgs(argv) {
|
|
|
1488
1702
|
function getSimpleCliUsage() {
|
|
1489
1703
|
return [
|
|
1490
1704
|
"Usage:",
|
|
1491
|
-
" eval-agents-simple run --
|
|
1705
|
+
" eval-agents-simple run --run-config <name> [--run-config <name> ...] [--concurrency N] [--ci]",
|
|
1492
1706
|
" eval-agents-simple generate --dataset <datasetName>",
|
|
1493
1707
|
"",
|
|
1494
1708
|
"Options:",
|
|
1495
|
-
" --
|
|
1496
|
-
""
|
|
1497
|
-
"Pattern examples for --evaluator:",
|
|
1498
|
-
" score-evaluator exact name (case-insensitive)",
|
|
1499
|
-
' "*score*" wildcard pattern',
|
|
1500
|
-
' "/score/i" regex literal'
|
|
1709
|
+
" --ci With run: exit with code 1 if any test case fails.",
|
|
1710
|
+
" --concurrency, -c N Max concurrent evaluations (default: 4). Use 1 for sequential."
|
|
1501
1711
|
].join("\n");
|
|
1502
1712
|
}
|
|
1503
1713
|
|
|
@@ -1548,7 +1758,7 @@ function GenerateView({
|
|
|
1548
1758
|
const payload = testCases.map((item) => {
|
|
1549
1759
|
const tc = item.testCase;
|
|
1550
1760
|
return {
|
|
1551
|
-
name: item.testCase
|
|
1761
|
+
name: getTestCaseDisplayLabel(item.testCase),
|
|
1552
1762
|
input: item.testCase.getInput(),
|
|
1553
1763
|
output: typeof tc.getOutput === "function" ? tc.getOutput() : void 0
|
|
1554
1764
|
};
|
|
@@ -1556,12 +1766,8 @@ function GenerateView({
|
|
|
1556
1766
|
const absoluteDatasetPath = resolve5(process.cwd(), dataset.filePath);
|
|
1557
1767
|
const parsed = parse2(absoluteDatasetPath);
|
|
1558
1768
|
const outputPath = join4(parsed.dir, `${parsed.name}.cases.json`);
|
|
1559
|
-
await writeFile2(
|
|
1560
|
-
|
|
1561
|
-
`${JSON.stringify(payload, null, 2)}
|
|
1562
|
-
`,
|
|
1563
|
-
"utf8"
|
|
1564
|
-
);
|
|
1769
|
+
await writeFile2(outputPath, `${JSON.stringify(payload, null, 2)}
|
|
1770
|
+
`, "utf8");
|
|
1565
1771
|
if (!cancelled) {
|
|
1566
1772
|
setResult({
|
|
1567
1773
|
count: payload.length,
|
|
@@ -1618,7 +1824,7 @@ async function generateDatasetJsonCommandPlain(runner, datasetName) {
|
|
|
1618
1824
|
}
|
|
1619
1825
|
const testCases = await runner.collectDatasetTestCases(dataset.id);
|
|
1620
1826
|
const payload = testCases.map((item) => ({
|
|
1621
|
-
name: item.testCase
|
|
1827
|
+
name: getTestCaseDisplayLabel(item.testCase),
|
|
1622
1828
|
input: item.testCase.getInput(),
|
|
1623
1829
|
output: readOutput2(item.testCase)
|
|
1624
1830
|
}));
|
|
@@ -1632,7 +1838,7 @@ async function generateDatasetJsonCommandPlain(runner, datasetName) {
|
|
|
1632
1838
|
async function generateDatasetJsonCommandInk(runner, datasetName) {
|
|
1633
1839
|
return new Promise((resolve5, reject) => {
|
|
1634
1840
|
const app = render(
|
|
1635
|
-
|
|
1841
|
+
React__default.createElement(GenerateView, {
|
|
1636
1842
|
runner,
|
|
1637
1843
|
datasetName,
|
|
1638
1844
|
onComplete: (err) => {
|
|
@@ -1717,9 +1923,7 @@ function createBar(value, max = 100, width = 20) {
|
|
|
1717
1923
|
function aggregateEvaluatorScores(events, nameById) {
|
|
1718
1924
|
if (events.length === 0)
|
|
1719
1925
|
return [];
|
|
1720
|
-
const evaluatorIds = new Set(
|
|
1721
|
-
events.flatMap((e) => e.evaluatorScores.map((x) => x.evaluatorId))
|
|
1722
|
-
);
|
|
1926
|
+
const evaluatorIds = new Set(events.flatMap((e) => e.evaluatorScores.map((x) => x.evaluatorId)));
|
|
1723
1927
|
const result = [];
|
|
1724
1928
|
for (const evaluatorId of evaluatorIds) {
|
|
1725
1929
|
const scoreIdToItems = /* @__PURE__ */ new Map();
|
|
@@ -1749,9 +1953,7 @@ function aggregateEvaluatorScores(events, nameById) {
|
|
|
1749
1953
|
return es?.passed ?? false;
|
|
1750
1954
|
});
|
|
1751
1955
|
const lastEvent = events[events.length - 1];
|
|
1752
|
-
const lastEs = lastEvent?.evaluatorScores.find(
|
|
1753
|
-
(x) => x.evaluatorId === evaluatorId
|
|
1754
|
-
);
|
|
1956
|
+
const lastEs = lastEvent?.evaluatorScores.find((x) => x.evaluatorId === evaluatorId);
|
|
1755
1957
|
result.push({
|
|
1756
1958
|
evaluatorId,
|
|
1757
1959
|
evaluatorName: nameById.get(evaluatorId) ?? evaluatorId,
|
|
@@ -1780,14 +1982,11 @@ function formatScorePart(item, _scoreToColor, options) {
|
|
|
1780
1982
|
}
|
|
1781
1983
|
function RunView({
|
|
1782
1984
|
runner,
|
|
1783
|
-
|
|
1784
|
-
evaluatorPattern,
|
|
1985
|
+
runConfigNames,
|
|
1785
1986
|
concurrency,
|
|
1786
1987
|
onComplete
|
|
1787
1988
|
}) {
|
|
1788
|
-
const [phase, setPhase] = useState(
|
|
1789
|
-
"loading"
|
|
1790
|
-
);
|
|
1989
|
+
const [phase, setPhase] = useState("loading");
|
|
1791
1990
|
const [runInfo, setRunInfo] = useState(null);
|
|
1792
1991
|
const [testCases, setTestCases] = useState([]);
|
|
1793
1992
|
const [startedEvaluations, setStartedEvaluations] = useState(0);
|
|
@@ -1796,30 +1995,30 @@ function RunView({
|
|
|
1796
1995
|
const [summary, setSummary] = useState(null);
|
|
1797
1996
|
const [evaluatorNameById, setEvaluatorNameById] = useState(/* @__PURE__ */ new Map());
|
|
1798
1997
|
const runEval = useCallback(async () => {
|
|
1799
|
-
const
|
|
1800
|
-
if (
|
|
1801
|
-
|
|
1802
|
-
const available = known.map((item) => item.dataset.getName()).sort();
|
|
1803
|
-
onComplete(
|
|
1804
|
-
new Error(
|
|
1805
|
-
available.length > 0 ? `Dataset "${datasetName}" not found. Available: ${available.join(", ")}` : `Dataset "${datasetName}" not found.`
|
|
1806
|
-
)
|
|
1807
|
-
);
|
|
1998
|
+
const rcList = runConfigNames.filter((n) => n.trim().length > 0);
|
|
1999
|
+
if (rcList.length === 0) {
|
|
2000
|
+
onComplete(new Error("At least one RunConfig name is required."));
|
|
1808
2001
|
return;
|
|
1809
2002
|
}
|
|
1810
|
-
|
|
1811
|
-
|
|
1812
|
-
|
|
1813
|
-
|
|
1814
|
-
|
|
1815
|
-
|
|
1816
|
-
|
|
1817
|
-
|
|
1818
|
-
|
|
2003
|
+
setStartedEvaluations(0);
|
|
2004
|
+
setCompletedEvaluations(0);
|
|
2005
|
+
setTestCases([]);
|
|
2006
|
+
setRunningEvaluations([]);
|
|
2007
|
+
setSummary(null);
|
|
2008
|
+
let jobs;
|
|
2009
|
+
try {
|
|
2010
|
+
jobs = await runner.expandRunConfigNamesToJobs(rcList);
|
|
2011
|
+
} catch (err) {
|
|
2012
|
+
onComplete(err instanceof Error ? err : new Error(String(err)));
|
|
2013
|
+
return;
|
|
2014
|
+
}
|
|
2015
|
+
if (jobs.length === 0) {
|
|
2016
|
+
onComplete(new Error("No jobs expanded from RunConfigs."));
|
|
1819
2017
|
return;
|
|
1820
2018
|
}
|
|
2019
|
+
const allEvaluators = await runner.collectEvaluators();
|
|
1821
2020
|
const nameById = new Map(
|
|
1822
|
-
|
|
2021
|
+
allEvaluators.map((item) => [item.id, getEvaluatorDisplayLabel(item.evaluator) ?? item.id])
|
|
1823
2022
|
);
|
|
1824
2023
|
setEvaluatorNameById(nameById);
|
|
1825
2024
|
const aggregates = /* @__PURE__ */ new Map();
|
|
@@ -1827,21 +2026,30 @@ function RunView({
|
|
|
1827
2026
|
let overallScoreTotal = 0;
|
|
1828
2027
|
let overallScoreSumSq = 0;
|
|
1829
2028
|
let overallScoreCount = 0;
|
|
1830
|
-
const
|
|
2029
|
+
const batchPendingRunIds = /* @__PURE__ */ new Set();
|
|
2030
|
+
const runIdToLabel = /* @__PURE__ */ new Map();
|
|
2031
|
+
let batchReady = false;
|
|
2032
|
+
const completedRuns = /* @__PURE__ */ new Map();
|
|
2033
|
+
const done = new Promise((resolve5, reject) => {
|
|
1831
2034
|
const unsubscribe = runner.subscribeRunEvents((event) => {
|
|
2035
|
+
if (batchReady && "runId" in event && typeof event.runId === "string" && !batchPendingRunIds.has(event.runId)) {
|
|
2036
|
+
return;
|
|
2037
|
+
}
|
|
1832
2038
|
if (event.type === "TestCaseStarted") {
|
|
1833
|
-
setStartedEvaluations(
|
|
2039
|
+
setStartedEvaluations((c) => c + 1);
|
|
1834
2040
|
setRunningEvaluations((prev) => {
|
|
1835
2041
|
const withoutDuplicate = prev.filter(
|
|
1836
|
-
(item) => !(item.testCaseId === event.testCaseId && item.
|
|
2042
|
+
(item) => !(item.testCaseId === event.testCaseId && item.repetitionIndex === event.repetitionIndex && item.runId === event.runId)
|
|
1837
2043
|
);
|
|
1838
2044
|
return [
|
|
1839
2045
|
...withoutDuplicate,
|
|
1840
2046
|
{
|
|
2047
|
+
runId: event.runId,
|
|
1841
2048
|
testCaseId: event.testCaseId,
|
|
1842
2049
|
name: event.testCaseName,
|
|
1843
|
-
|
|
1844
|
-
|
|
2050
|
+
repetitionId: event.repetitionId,
|
|
2051
|
+
repetitionIndex: event.repetitionIndex,
|
|
2052
|
+
repetitionCount: event.repetitionCount,
|
|
1845
2053
|
startedTestCases: event.startedTestCases,
|
|
1846
2054
|
totalTestCases: event.totalTestCases
|
|
1847
2055
|
}
|
|
@@ -1877,9 +2085,12 @@ function RunView({
|
|
|
1877
2085
|
scoreItemsByEvaluatorScore.set(key, list);
|
|
1878
2086
|
}
|
|
1879
2087
|
}
|
|
2088
|
+
const label = runIdToLabel.get(event.runId);
|
|
2089
|
+
const compositeId = `${event.runId}:${event.testCaseId}`;
|
|
2090
|
+
const displayName = label !== void 0 ? `${label} \u203A ${event.testCaseName}` : event.testCaseName;
|
|
1880
2091
|
setTestCases((prev) => {
|
|
1881
2092
|
const byId = new Map(prev.map((tc) => [tc.testCaseId, tc]));
|
|
1882
|
-
const existing = byId.get(
|
|
2093
|
+
const existing = byId.get(compositeId);
|
|
1883
2094
|
const newEvent = {
|
|
1884
2095
|
evaluatorScores: event.evaluatorScores.map((item) => ({
|
|
1885
2096
|
evaluatorId: item.evaluatorId,
|
|
@@ -1894,17 +2105,14 @@ function RunView({
|
|
|
1894
2105
|
};
|
|
1895
2106
|
const events = existing ? [...existing.events, newEvent] : [newEvent];
|
|
1896
2107
|
const isAggregated = events.length > 1;
|
|
1897
|
-
const aggregatedEvaluatorScores = aggregateEvaluatorScores(
|
|
1898
|
-
events,
|
|
1899
|
-
nameById
|
|
1900
|
-
);
|
|
2108
|
+
const aggregatedEvaluatorScores = aggregateEvaluatorScores(events, nameById);
|
|
1901
2109
|
const merged = {
|
|
1902
|
-
name:
|
|
1903
|
-
testCaseId:
|
|
2110
|
+
name: displayName,
|
|
2111
|
+
testCaseId: compositeId,
|
|
1904
2112
|
completedTestCases: event.completedTestCases,
|
|
1905
2113
|
totalTestCases: event.totalTestCases,
|
|
1906
|
-
|
|
1907
|
-
|
|
2114
|
+
repetitionIndex: event.repetitionIndex,
|
|
2115
|
+
repetitionCount: event.repetitionCount,
|
|
1908
2116
|
durationMs: events.reduce((s, e) => s + e.durationMs, 0),
|
|
1909
2117
|
passed: events.every((e) => e.passed),
|
|
1910
2118
|
errorMessage: event.errorMessage,
|
|
@@ -1912,84 +2120,118 @@ function RunView({
|
|
|
1912
2120
|
aggregatedEvaluatorScores,
|
|
1913
2121
|
isAggregated
|
|
1914
2122
|
};
|
|
1915
|
-
byId.set(
|
|
1916
|
-
setCompletedEvaluations(event.completedTestCases);
|
|
1917
|
-
setRunningEvaluations(
|
|
1918
|
-
(running) => running.filter(
|
|
1919
|
-
(item) => !(item.testCaseId === event.testCaseId && item.rerunIndex === event.rerunIndex)
|
|
1920
|
-
)
|
|
1921
|
-
);
|
|
2123
|
+
byId.set(compositeId, merged);
|
|
1922
2124
|
return Array.from(byId.values());
|
|
1923
2125
|
});
|
|
2126
|
+
setCompletedEvaluations((c) => c + 1);
|
|
2127
|
+
setRunningEvaluations(
|
|
2128
|
+
(running) => running.filter(
|
|
2129
|
+
(item) => !(item.testCaseId === event.testCaseId && item.repetitionIndex === event.repetitionIndex && item.runId === event.runId)
|
|
2130
|
+
)
|
|
2131
|
+
);
|
|
1924
2132
|
}
|
|
1925
|
-
if (event.type === "
|
|
2133
|
+
if (event.type === "RunFailed") {
|
|
2134
|
+
if (batchReady && !batchPendingRunIds.has(event.runId)) {
|
|
2135
|
+
return;
|
|
2136
|
+
}
|
|
1926
2137
|
unsubscribe();
|
|
1927
|
-
|
|
2138
|
+
reject(new Error(`Run failed: ${event.errorMessage}`));
|
|
2139
|
+
return;
|
|
2140
|
+
}
|
|
2141
|
+
if (event.type === "RunCompleted") {
|
|
2142
|
+
if (!batchPendingRunIds.has(event.runId)) {
|
|
2143
|
+
return;
|
|
2144
|
+
}
|
|
2145
|
+
completedRuns.set(event.runId, event);
|
|
2146
|
+
batchPendingRunIds.delete(event.runId);
|
|
2147
|
+
if (batchPendingRunIds.size === 0) {
|
|
2148
|
+
unsubscribe();
|
|
2149
|
+
resolve5();
|
|
2150
|
+
}
|
|
1928
2151
|
}
|
|
1929
2152
|
});
|
|
1930
2153
|
});
|
|
1931
|
-
const
|
|
1932
|
-
|
|
1933
|
-
|
|
1934
|
-
concurrency
|
|
2154
|
+
const snapshots = await runner.runDatasetJobsWithSharedConcurrency({
|
|
2155
|
+
jobs,
|
|
2156
|
+
globalConcurrency: concurrency
|
|
1935
2157
|
});
|
|
2158
|
+
for (let i = 0; i < snapshots.length; i += 1) {
|
|
2159
|
+
const snap = snapshots[i];
|
|
2160
|
+
const job = jobs[i];
|
|
2161
|
+
if (snap && job) {
|
|
2162
|
+
runIdToLabel.set(
|
|
2163
|
+
snap.runId,
|
|
2164
|
+
`${job.runConfigDisplayLabel ?? job.runConfigName} \xB7 ${snap.datasetName}`
|
|
2165
|
+
);
|
|
2166
|
+
batchPendingRunIds.add(snap.runId);
|
|
2167
|
+
}
|
|
2168
|
+
}
|
|
2169
|
+
const totalUnits = snapshots.reduce((sum, s) => sum + s.totalTestCases, 0);
|
|
2170
|
+
batchReady = true;
|
|
2171
|
+
const runConfigLabels = await Promise.all(
|
|
2172
|
+
rcList.map(async (n) => {
|
|
2173
|
+
const collected = await runner.resolveRunConfigByName(n);
|
|
2174
|
+
return collected?.runConfig.getDisplayLabel() ?? n;
|
|
2175
|
+
})
|
|
2176
|
+
);
|
|
1936
2177
|
setRunInfo({
|
|
1937
|
-
|
|
1938
|
-
|
|
1939
|
-
|
|
1940
|
-
totalTestCases: snapshot.totalTestCases
|
|
2178
|
+
names: runConfigLabels,
|
|
2179
|
+
jobs: jobs.length,
|
|
2180
|
+
totalTestCases: totalUnits
|
|
1941
2181
|
});
|
|
1942
2182
|
setPhase("running");
|
|
1943
|
-
|
|
1944
|
-
|
|
1945
|
-
|
|
2183
|
+
try {
|
|
2184
|
+
await done;
|
|
2185
|
+
} catch (err) {
|
|
2186
|
+
onComplete(err instanceof Error ? err : new Error(String(err)));
|
|
1946
2187
|
return;
|
|
1947
2188
|
}
|
|
1948
|
-
|
|
2189
|
+
let passedTestCases = 0;
|
|
2190
|
+
let failedTestCases = 0;
|
|
2191
|
+
let totalTestCases = 0;
|
|
2192
|
+
const artifacts = [];
|
|
2193
|
+
for (const ev of completedRuns.values()) {
|
|
2194
|
+
passedTestCases += ev.passedTestCases;
|
|
2195
|
+
failedTestCases += ev.failedTestCases;
|
|
2196
|
+
totalTestCases += ev.totalTestCases;
|
|
2197
|
+
artifacts.push(ev.artifactPath);
|
|
2198
|
+
}
|
|
1949
2199
|
setSummary({
|
|
1950
|
-
passedTestCases
|
|
1951
|
-
failedTestCases
|
|
1952
|
-
totalTestCases
|
|
2200
|
+
passedTestCases,
|
|
2201
|
+
failedTestCases,
|
|
2202
|
+
totalTestCases,
|
|
1953
2203
|
overallScoreTotal,
|
|
1954
2204
|
overallScoreSumSq,
|
|
1955
2205
|
overallScoreCount,
|
|
1956
2206
|
aggregates: new Map(aggregates),
|
|
1957
2207
|
scoreItemsByEvaluatorScore: new Map(scoreItemsByEvaluatorScore),
|
|
1958
|
-
artifactPath:
|
|
2208
|
+
artifactPath: artifacts.join("\n")
|
|
1959
2209
|
});
|
|
1960
2210
|
setPhase("completed");
|
|
1961
|
-
|
|
1962
|
-
|
|
2211
|
+
const exitCode = failedTestCases > 0 ? 1 : 0;
|
|
2212
|
+
setTimeout(() => onComplete(void 0, exitCode), 200);
|
|
2213
|
+
}, [runner, runConfigNames, concurrency, onComplete]);
|
|
1963
2214
|
useEffect(() => {
|
|
1964
2215
|
void runEval();
|
|
1965
2216
|
}, [runEval]);
|
|
1966
2217
|
return /* @__PURE__ */ jsxs(Box, { flexDirection: "column", padding: 1, children: [
|
|
1967
2218
|
/* @__PURE__ */ jsx(Box, { marginBottom: 1, children: /* @__PURE__ */ jsx(Banner, {}) }),
|
|
1968
2219
|
runInfo && /* @__PURE__ */ jsxs(Box, { flexDirection: "column", marginBottom: 1, children: [
|
|
1969
|
-
/* @__PURE__ */ jsxs(Text, { children: [
|
|
1970
|
-
|
|
1971
|
-
|
|
1972
|
-
" "
|
|
1973
|
-
] }),
|
|
1974
|
-
/* @__PURE__ */ jsx(Text, { color: "gray", children: runInfo.runId })
|
|
1975
|
-
] }),
|
|
1976
|
-
/* @__PURE__ */ jsxs(Text, { children: [
|
|
1977
|
-
/* @__PURE__ */ jsxs(Text, { color: "cyan", bold: true, children: [
|
|
1978
|
-
"Dataset",
|
|
1979
|
-
" "
|
|
1980
|
-
] }),
|
|
1981
|
-
runInfo.datasetName
|
|
2220
|
+
/* @__PURE__ */ jsxs(Text, { color: "cyan", bold: true, children: [
|
|
2221
|
+
"RunConfigs",
|
|
2222
|
+
" "
|
|
1982
2223
|
] }),
|
|
2224
|
+
/* @__PURE__ */ jsx(Text, { color: "gray", children: runInfo.names.join(", ") }),
|
|
1983
2225
|
/* @__PURE__ */ jsxs(Text, { children: [
|
|
1984
2226
|
/* @__PURE__ */ jsxs(Text, { color: "cyan", bold: true, children: [
|
|
1985
|
-
"
|
|
2227
|
+
"Jobs",
|
|
1986
2228
|
" "
|
|
1987
2229
|
] }),
|
|
1988
|
-
runInfo.
|
|
2230
|
+
runInfo.jobs
|
|
1989
2231
|
] }),
|
|
1990
2232
|
/* @__PURE__ */ jsxs(Text, { children: [
|
|
1991
2233
|
/* @__PURE__ */ jsxs(Text, { color: "cyan", bold: true, children: [
|
|
1992
|
-
"
|
|
2234
|
+
"Evaluation units",
|
|
1993
2235
|
" "
|
|
1994
2236
|
] }),
|
|
1995
2237
|
runInfo.totalTestCases
|
|
@@ -2011,20 +2253,19 @@ function RunView({
|
|
|
2011
2253
|
item.startedTestCases,
|
|
2012
2254
|
"/",
|
|
2013
2255
|
item.totalTestCases,
|
|
2014
|
-
"]",
|
|
2015
|
-
" ",
|
|
2256
|
+
"] ",
|
|
2016
2257
|
item.name,
|
|
2017
2258
|
" ",
|
|
2018
2259
|
/* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
2019
2260
|
"(",
|
|
2020
|
-
item.
|
|
2261
|
+
item.repetitionIndex,
|
|
2021
2262
|
"/",
|
|
2022
|
-
item.
|
|
2263
|
+
item.repetitionCount,
|
|
2023
2264
|
")"
|
|
2024
2265
|
] })
|
|
2025
2266
|
]
|
|
2026
2267
|
},
|
|
2027
|
-
`${item.testCaseId}:${item.
|
|
2268
|
+
`${item.runId ?? ""}:${item.testCaseId}:${item.repetitionId}:${item.repetitionIndex}`
|
|
2028
2269
|
)) })
|
|
2029
2270
|
] }),
|
|
2030
2271
|
testCases.length > 0 && /* @__PURE__ */ jsx(Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc) => /* @__PURE__ */ jsxs(Box, { flexDirection: "column", marginBottom: 0, children: [
|
|
@@ -2041,9 +2282,9 @@ function RunView({
|
|
|
2041
2282
|
" ",
|
|
2042
2283
|
/* @__PURE__ */ jsxs(Text, { color: "cyan", children: [
|
|
2043
2284
|
"(",
|
|
2044
|
-
tc.
|
|
2285
|
+
tc.repetitionIndex,
|
|
2045
2286
|
"/",
|
|
2046
|
-
tc.
|
|
2287
|
+
tc.repetitionCount,
|
|
2047
2288
|
")"
|
|
2048
2289
|
] }),
|
|
2049
2290
|
/* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
@@ -2057,73 +2298,70 @@ function RunView({
|
|
|
2057
2298
|
] }) : null
|
|
2058
2299
|
] }),
|
|
2059
2300
|
tc.errorMessage ? /* @__PURE__ */ jsx(Text, { color: "red", children: tc.errorMessage }) : null,
|
|
2060
|
-
tc.aggregatedEvaluatorScores.map((item) => /* @__PURE__ */ jsxs(
|
|
2061
|
-
|
|
2062
|
-
|
|
2063
|
-
|
|
2064
|
-
|
|
2065
|
-
children:
|
|
2066
|
-
|
|
2067
|
-
|
|
2068
|
-
|
|
2069
|
-
|
|
2070
|
-
|
|
2071
|
-
|
|
2301
|
+
tc.aggregatedEvaluatorScores.map((item) => /* @__PURE__ */ jsxs(Box, { flexDirection: "column", marginLeft: 2, children: [
|
|
2302
|
+
/* @__PURE__ */ jsxs(Text, { children: [
|
|
2303
|
+
item.evaluatorName,
|
|
2304
|
+
":",
|
|
2305
|
+
" ",
|
|
2306
|
+
/* @__PURE__ */ jsx(Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
|
|
2307
|
+
item.metrics && item.metrics.length > 0 ? /* @__PURE__ */ jsxs(Fragment, { children: [
|
|
2308
|
+
" ",
|
|
2309
|
+
item.metrics.map((m) => {
|
|
2310
|
+
const def = getMetricById(m.id);
|
|
2311
|
+
if (!def)
|
|
2312
|
+
return null;
|
|
2313
|
+
const formatted = def.format(m.data, {
|
|
2314
|
+
isAggregated: tc.isAggregated
|
|
2315
|
+
});
|
|
2316
|
+
const label = m.name ?? def.name;
|
|
2317
|
+
return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
2318
|
+
"[",
|
|
2319
|
+
label ? `${label}: ` : "",
|
|
2320
|
+
formatted,
|
|
2321
|
+
"]",
|
|
2322
|
+
" "
|
|
2323
|
+
] }, m.id);
|
|
2324
|
+
})
|
|
2325
|
+
] }) : null
|
|
2326
|
+
] }),
|
|
2327
|
+
item.scores.length > 0 ? item.scores.map((s) => {
|
|
2328
|
+
const def = s.def ?? getScoreById(s.id);
|
|
2329
|
+
const scoreLabel = s.name ?? def?.name ?? def?.id ?? s.id;
|
|
2330
|
+
return /* @__PURE__ */ jsxs(
|
|
2331
|
+
Text,
|
|
2332
|
+
{
|
|
2333
|
+
color: scoreColor(toNumericScore(s.data) ?? 0),
|
|
2334
|
+
children: [
|
|
2335
|
+
" ",
|
|
2336
|
+
scoreLabel,
|
|
2337
|
+
":",
|
|
2072
2338
|
" ",
|
|
2073
|
-
|
|
2074
|
-
|
|
2075
|
-
if (!def)
|
|
2076
|
-
return null;
|
|
2077
|
-
const formatted = def.format(m.data, {
|
|
2078
|
-
isAggregated: tc.isAggregated
|
|
2079
|
-
});
|
|
2080
|
-
const label = m.name ?? def.name;
|
|
2081
|
-
return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
2082
|
-
"[",
|
|
2083
|
-
label ? `${label}: ` : "",
|
|
2084
|
-
formatted,
|
|
2085
|
-
"]",
|
|
2086
|
-
" "
|
|
2087
|
-
] }, m.id);
|
|
2339
|
+
formatScorePart(s, scoreColor, {
|
|
2340
|
+
isAggregated: tc.isAggregated
|
|
2088
2341
|
})
|
|
2089
|
-
]
|
|
2090
|
-
|
|
2091
|
-
item.
|
|
2092
|
-
|
|
2093
|
-
|
|
2094
|
-
|
|
2342
|
+
]
|
|
2343
|
+
},
|
|
2344
|
+
`${item.evaluatorId}-${s.id}-${scoreLabel}`
|
|
2345
|
+
);
|
|
2346
|
+
}) : /* @__PURE__ */ jsx(Text, { color: "gray", children: " n/a" }),
|
|
2347
|
+
!item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsx(Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
|
|
2348
|
+
(log) => log.type === "diff" ? /* @__PURE__ */ jsx(
|
|
2349
|
+
Box,
|
|
2350
|
+
{
|
|
2351
|
+
flexDirection: "column",
|
|
2352
|
+
children: getDiffLines(log).map(({ type, line }) => /* @__PURE__ */ jsx(
|
|
2095
2353
|
Text,
|
|
2096
2354
|
{
|
|
2097
|
-
color:
|
|
2098
|
-
children:
|
|
2099
|
-
" ",
|
|
2100
|
-
scoreLabel,
|
|
2101
|
-
":",
|
|
2102
|
-
" ",
|
|
2103
|
-
formatScorePart(s, scoreColor, {
|
|
2104
|
-
isAggregated: tc.isAggregated
|
|
2105
|
-
})
|
|
2106
|
-
]
|
|
2355
|
+
color: type === "remove" ? "red" : type === "add" ? "green" : "gray",
|
|
2356
|
+
children: line
|
|
2107
2357
|
},
|
|
2108
|
-
`${
|
|
2109
|
-
)
|
|
2110
|
-
}
|
|
2111
|
-
|
|
2112
|
-
|
|
2113
|
-
|
|
2114
|
-
|
|
2115
|
-
{
|
|
2116
|
-
color: type === "remove" ? "red" : type === "add" ? "green" : "gray",
|
|
2117
|
-
children: line
|
|
2118
|
-
},
|
|
2119
|
-
lineIdx
|
|
2120
|
-
)
|
|
2121
|
-
) }, logIdx) : log.type === "log" ? /* @__PURE__ */ jsx(Box, { flexDirection: "column", children: getLogLines(log).map((line, lineIdx) => /* @__PURE__ */ jsx(Text, { color: "gray", children: line }, lineIdx)) }, logIdx) : null
|
|
2122
|
-
) })
|
|
2123
|
-
]
|
|
2124
|
-
},
|
|
2125
|
-
item.evaluatorId
|
|
2126
|
-
))
|
|
2358
|
+
`${type}:${line}`
|
|
2359
|
+
))
|
|
2360
|
+
},
|
|
2361
|
+
`diff:${getDiffLines(log).map((x) => x.line).join("|")}`
|
|
2362
|
+
) : log.type === "log" ? /* @__PURE__ */ jsx(Box, { flexDirection: "column", children: getLogLines(log).map((line) => /* @__PURE__ */ jsx(Text, { color: "gray", children: line }, line)) }, `log:${getLogLines(log).join("\n")}`) : null
|
|
2363
|
+
) })
|
|
2364
|
+
] }, item.evaluatorId))
|
|
2127
2365
|
] }, tc.testCaseId)) }),
|
|
2128
2366
|
phase === "completed" && summary && /* @__PURE__ */ jsxs(Box, { flexDirection: "column", children: [
|
|
2129
2367
|
/* @__PURE__ */ jsx(Text, { color: "cyan", bold: true, children: "Run Summary" }),
|
|
@@ -2165,9 +2403,9 @@ function RunView({
|
|
|
2165
2403
|
/* @__PURE__ */ jsx(Text, { color: "magenta", children: "evaluator averages" }),
|
|
2166
2404
|
Array.from(evaluatorNameById.entries()).map(([id, name]) => {
|
|
2167
2405
|
const agg = summary.aggregates.get(id);
|
|
2168
|
-
const scoreKeys = [
|
|
2169
|
-
|
|
2170
|
-
|
|
2406
|
+
const scoreKeys = [...summary.scoreItemsByEvaluatorScore?.keys() ?? []].filter(
|
|
2407
|
+
(k) => k.startsWith(`${id}:`)
|
|
2408
|
+
);
|
|
2171
2409
|
if (scoreKeys.length === 0) {
|
|
2172
2410
|
return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
2173
2411
|
"- ",
|
|
@@ -2197,19 +2435,12 @@ function RunView({
|
|
|
2197
2435
|
const label = aggregated.name ?? def?.name ?? def?.id ?? aggregated.id;
|
|
2198
2436
|
const formatted = def ? def.formatAggregate(aggregated.data) : "n/a";
|
|
2199
2437
|
const numeric = toNumericScore(aggregated.data);
|
|
2200
|
-
return /* @__PURE__ */ jsxs(
|
|
2201
|
-
|
|
2202
|
-
|
|
2203
|
-
|
|
2204
|
-
|
|
2205
|
-
|
|
2206
|
-
label,
|
|
2207
|
-
": ",
|
|
2208
|
-
formatted
|
|
2209
|
-
]
|
|
2210
|
-
},
|
|
2211
|
-
key
|
|
2212
|
-
);
|
|
2438
|
+
return /* @__PURE__ */ jsxs(Text, { color: numeric !== void 0 ? scoreColor(numeric) : "gray", children: [
|
|
2439
|
+
" ",
|
|
2440
|
+
label,
|
|
2441
|
+
": ",
|
|
2442
|
+
formatted
|
|
2443
|
+
] }, key);
|
|
2213
2444
|
})
|
|
2214
2445
|
] }, id);
|
|
2215
2446
|
})
|
|
@@ -2252,10 +2483,10 @@ function RunView({
|
|
|
2252
2483
|
] }, tc.testCaseId);
|
|
2253
2484
|
})
|
|
2254
2485
|
] }),
|
|
2255
|
-
/* @__PURE__ */
|
|
2256
|
-
"artifact:
|
|
2257
|
-
summary.artifactPath
|
|
2258
|
-
] })
|
|
2486
|
+
/* @__PURE__ */ jsxs(Box, { marginTop: 1, flexDirection: "column", children: [
|
|
2487
|
+
/* @__PURE__ */ jsx(Text, { color: "gray", children: "artifact(s):" }),
|
|
2488
|
+
summary.artifactPath.split("\n").map((line) => /* @__PURE__ */ jsx(Text, { color: "gray", children: line }, line))
|
|
2489
|
+
] })
|
|
2259
2490
|
] })
|
|
2260
2491
|
] });
|
|
2261
2492
|
}
|
|
@@ -2285,9 +2516,7 @@ function buildTestCaseSummaries(byId) {
|
|
|
2285
2516
|
for (const evaluatorScores of events[0]?.evaluatorScores ?? []) {
|
|
2286
2517
|
const scoreIdToItems = /* @__PURE__ */ new Map();
|
|
2287
2518
|
for (const ev of events) {
|
|
2288
|
-
const es = ev.evaluatorScores.find(
|
|
2289
|
-
(x) => x.evaluatorId === evaluatorScores.evaluatorId
|
|
2290
|
-
);
|
|
2519
|
+
const es = ev.evaluatorScores.find((x) => x.evaluatorId === evaluatorScores.evaluatorId);
|
|
2291
2520
|
for (const s of es?.scores ?? []) {
|
|
2292
2521
|
const list = scoreIdToItems.get(s.id) ?? [];
|
|
2293
2522
|
list.push(s);
|
|
@@ -2340,9 +2569,7 @@ function scoreToColor(score) {
|
|
|
2340
2569
|
}
|
|
2341
2570
|
function getEvaluatorSummaryLines(evaluatorId, evaluatorName, aggregate, scoreItemsByKey) {
|
|
2342
2571
|
const lines = [];
|
|
2343
|
-
const scoreKeys = [...scoreItemsByKey.keys()].filter(
|
|
2344
|
-
(k) => k.startsWith(`${evaluatorId}:`)
|
|
2345
|
-
);
|
|
2572
|
+
const scoreKeys = [...scoreItemsByKey.keys()].filter((k) => k.startsWith(`${evaluatorId}:`));
|
|
2346
2573
|
if (scoreKeys.length === 0) {
|
|
2347
2574
|
lines.push(`- ${evaluatorName.padEnd(28)} no scores`);
|
|
2348
2575
|
return lines;
|
|
@@ -2377,9 +2604,7 @@ function createBar2(value, max = 100, width = 20) {
|
|
|
2377
2604
|
function aggregateEvaluatorScoresFromEvents(events, _evaluatorNameById) {
|
|
2378
2605
|
if (events.length === 0)
|
|
2379
2606
|
return [];
|
|
2380
|
-
const evaluatorIds = new Set(
|
|
2381
|
-
events.flatMap((e) => e.evaluatorScores.map((x) => x.evaluatorId))
|
|
2382
|
-
);
|
|
2607
|
+
const evaluatorIds = new Set(events.flatMap((e) => e.evaluatorScores.map((x) => x.evaluatorId)));
|
|
2383
2608
|
const result = [];
|
|
2384
2609
|
for (const evaluatorId of evaluatorIds) {
|
|
2385
2610
|
const scoreIdToItems = /* @__PURE__ */ new Map();
|
|
@@ -2426,9 +2651,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
|
|
|
2426
2651
|
if (def) {
|
|
2427
2652
|
const formatted = def.format(m.data, options);
|
|
2428
2653
|
const label = m.name ?? def.name;
|
|
2429
|
-
metricParts.push(
|
|
2430
|
-
label ? `[${label}: ${formatted}]` : `[${formatted}]`
|
|
2431
|
-
);
|
|
2654
|
+
metricParts.push(label ? `[${label}: ${formatted}]` : `[${formatted}]`);
|
|
2432
2655
|
}
|
|
2433
2656
|
}
|
|
2434
2657
|
}
|
|
@@ -2475,25 +2698,14 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
|
|
|
2475
2698
|
}
|
|
2476
2699
|
return lines;
|
|
2477
2700
|
}
|
|
2478
|
-
async function
|
|
2479
|
-
const
|
|
2480
|
-
if (
|
|
2481
|
-
|
|
2482
|
-
const available = known.map((item) => item.dataset.getName()).sort();
|
|
2483
|
-
throw new Error(
|
|
2484
|
-
available.length > 0 ? `Dataset "${datasetName}" not found. Available datasets: ${available.join(", ")}` : `Dataset "${datasetName}" not found and no datasets were discovered.`
|
|
2485
|
-
);
|
|
2486
|
-
}
|
|
2487
|
-
const evaluators = await runner.resolveEvaluatorsByNamePattern(evaluatorPattern);
|
|
2488
|
-
if (evaluators.length === 0) {
|
|
2489
|
-
const known = await runner.collectEvaluators();
|
|
2490
|
-
const available = known.map((item) => item.evaluator.getName()).filter((name) => typeof name === "string").sort();
|
|
2491
|
-
throw new Error(
|
|
2492
|
-
available.length > 0 ? `No evaluator matched "${evaluatorPattern}". Available evaluators: ${available.join(", ")}` : `No evaluator matched "${evaluatorPattern}" and no evaluators were discovered.`
|
|
2493
|
-
);
|
|
2701
|
+
async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency) {
|
|
2702
|
+
const jobs = await runner.expandRunConfigNamesToJobs(runConfigNames);
|
|
2703
|
+
if (jobs.length === 0) {
|
|
2704
|
+
throw new Error("No jobs expanded from RunConfigs.");
|
|
2494
2705
|
}
|
|
2706
|
+
const evaluators = await runner.collectEvaluators();
|
|
2495
2707
|
const evaluatorNameById = new Map(
|
|
2496
|
-
evaluators.map((item) => [item.id, item.evaluator
|
|
2708
|
+
evaluators.map((item) => [item.id, getEvaluatorDisplayLabel(item.evaluator) ?? item.id])
|
|
2497
2709
|
);
|
|
2498
2710
|
const aggregates = /* @__PURE__ */ new Map();
|
|
2499
2711
|
const scoreItemsByEvaluatorScore = /* @__PURE__ */ new Map();
|
|
@@ -2501,11 +2713,11 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
|
|
|
2501
2713
|
let overallScoreTotal = 0;
|
|
2502
2714
|
let overallScoreSumSq = 0;
|
|
2503
2715
|
let overallScoreCount = 0;
|
|
2504
|
-
let
|
|
2505
|
-
let
|
|
2716
|
+
let globalStartedUnits = 0;
|
|
2717
|
+
let globalCompletedUnits = 0;
|
|
2506
2718
|
let totalCount = 0;
|
|
2507
2719
|
let runFinished = false;
|
|
2508
|
-
const
|
|
2720
|
+
const inFlightRepetitions = /* @__PURE__ */ new Set();
|
|
2509
2721
|
const spinnerFrames = ["\u280B", "\u2819", "\u2838", "\u2834", "\u2826", "\u2807"];
|
|
2510
2722
|
let spinnerIndex = 0;
|
|
2511
2723
|
function clearLine() {
|
|
@@ -2527,33 +2739,46 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
|
|
|
2527
2739
|
spinnerIndex += 1;
|
|
2528
2740
|
process.stdout.write(
|
|
2529
2741
|
`\r${colorize(frame, ansi2.cyan)} Running evaluations ${colorize(
|
|
2530
|
-
`${
|
|
2742
|
+
`${globalCompletedUnits}/${totalCount}`,
|
|
2531
2743
|
ansi2.bold
|
|
2532
|
-
)} completed ${colorize(`${
|
|
2744
|
+
)} completed ${colorize(`${globalStartedUnits}/${totalCount}`, ansi2.bold)} started ${colorize(`(${inFlightRepetitions.size} running)`, ansi2.dim)}`
|
|
2533
2745
|
);
|
|
2534
2746
|
}
|
|
2535
2747
|
let lastPrintedTestCaseId = null;
|
|
2536
2748
|
let lastPrintedLineCount = 0;
|
|
2537
2749
|
let spinnerTimer;
|
|
2538
|
-
const
|
|
2750
|
+
const batchPendingRunIds = /* @__PURE__ */ new Set();
|
|
2751
|
+
const runIdToLabel = /* @__PURE__ */ new Map();
|
|
2752
|
+
let batchReady = false;
|
|
2753
|
+
const completedRuns = /* @__PURE__ */ new Map();
|
|
2754
|
+
const done = new Promise((resolve5, reject) => {
|
|
2539
2755
|
const unsubscribe = runner.subscribeRunEvents((event) => {
|
|
2756
|
+
if (batchReady && "runId" in event && typeof event.runId === "string" && !batchPendingRunIds.has(event.runId)) {
|
|
2757
|
+
return;
|
|
2758
|
+
}
|
|
2759
|
+
const rowPrefix = typeof event.runId === "string" ? runIdToLabel.get(event.runId) : void 0;
|
|
2760
|
+
const pfx = rowPrefix !== void 0 ? `${colorize(`[${rowPrefix}]`, ansi2.dim)} ` : "";
|
|
2540
2761
|
if (event.type === "TestCaseStarted") {
|
|
2541
|
-
|
|
2542
|
-
|
|
2762
|
+
globalStartedUnits += 1;
|
|
2763
|
+
inFlightRepetitions.add(
|
|
2764
|
+
`${event.runId}:${event.testCaseId}:${event.repetitionId}:${event.repetitionIndex}`
|
|
2765
|
+
);
|
|
2543
2766
|
clearLine();
|
|
2544
2767
|
process.stdout.write(
|
|
2545
|
-
`${colorize(`[started ${event.startedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.
|
|
2768
|
+
`${pfx}${colorize(`[started ${event.startedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.repetitionIndex}/${event.repetitionCount})`, ansi2.cyan)} ${colorize("(running)", ansi2.dim)}
|
|
2546
2769
|
`
|
|
2547
2770
|
);
|
|
2548
2771
|
drawSpinner();
|
|
2549
2772
|
}
|
|
2550
2773
|
if (event.type === "TestCaseProgress") {
|
|
2551
|
-
|
|
2552
|
-
|
|
2774
|
+
globalCompletedUnits += 1;
|
|
2775
|
+
inFlightRepetitions.delete(
|
|
2776
|
+
`${event.runId}:${event.testCaseId}:${event.repetitionId}:${event.repetitionIndex}`
|
|
2777
|
+
);
|
|
2553
2778
|
const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
|
|
2554
2779
|
const averageScore = numericScores.length > 0 ? numericScores.reduce((sum, value) => sum + value, 0) / numericScores.length : void 0;
|
|
2555
|
-
const
|
|
2556
|
-
const existing = testCaseByTestId.get(
|
|
2780
|
+
const compositeId = `${event.runId}:${event.testCaseId}`;
|
|
2781
|
+
const existing = testCaseByTestId.get(compositeId) ?? {
|
|
2557
2782
|
name: event.testCaseName,
|
|
2558
2783
|
events: []
|
|
2559
2784
|
};
|
|
@@ -2563,7 +2788,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
|
|
|
2563
2788
|
durationMs: event.durationMs,
|
|
2564
2789
|
evaluatorScores: event.evaluatorScores
|
|
2565
2790
|
});
|
|
2566
|
-
testCaseByTestId.set(
|
|
2791
|
+
testCaseByTestId.set(compositeId, existing);
|
|
2567
2792
|
for (const item of event.evaluatorScores) {
|
|
2568
2793
|
const numeric = toNumericScoreFromScores(item.scores);
|
|
2569
2794
|
if (numeric !== void 0) {
|
|
@@ -2592,24 +2817,21 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
|
|
|
2592
2817
|
scoreItemsByEvaluatorScore.set(key, list);
|
|
2593
2818
|
}
|
|
2594
2819
|
}
|
|
2595
|
-
const isSameTestCase = lastPrintedTestCaseId ===
|
|
2596
|
-
const
|
|
2820
|
+
const isSameTestCase = lastPrintedTestCaseId === compositeId;
|
|
2821
|
+
const isLastRepetition = event.repetitionIndex >= event.repetitionCount;
|
|
2597
2822
|
const isNonTty = !process.stdout.isTTY;
|
|
2598
|
-
const skipPrintNonTty = isNonTty && event.
|
|
2823
|
+
const skipPrintNonTty = isNonTty && event.repetitionCount > 1 && !isLastRepetition;
|
|
2599
2824
|
if (isSameTestCase && lastPrintedLineCount > 0 && !skipPrintNonTty) {
|
|
2600
2825
|
cursorUp(lastPrintedLineCount);
|
|
2601
2826
|
}
|
|
2602
2827
|
const aggregatedScores = aggregateEvaluatorScoresFromEvents(
|
|
2603
2828
|
existing.events);
|
|
2604
2829
|
const isAggregated = existing.events.length > 1;
|
|
2605
|
-
const durationMs = existing.events.reduce(
|
|
2606
|
-
(s, e) => s + e.durationMs,
|
|
2607
|
-
0
|
|
2608
|
-
);
|
|
2830
|
+
const durationMs = existing.events.reduce((s, e) => s + e.durationMs, 0);
|
|
2609
2831
|
const lines = [];
|
|
2610
2832
|
const statusSuffix = event.errorMessage ? ` ${colorize("ERROR", `${ansi2.bold}${ansi2.red}`)}` : "";
|
|
2611
2833
|
lines.push(
|
|
2612
|
-
`${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.
|
|
2834
|
+
`${pfx}${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.repetitionIndex}/${event.repetitionCount})`, ansi2.cyan)} ${colorize(`(${durationMs}ms)`, ansi2.dim)}${statusSuffix}`
|
|
2613
2835
|
);
|
|
2614
2836
|
if (event.errorMessage) {
|
|
2615
2837
|
lines.push(colorize(event.errorMessage, ansi2.red));
|
|
@@ -2617,18 +2839,12 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
|
|
|
2617
2839
|
for (const item of aggregatedScores) {
|
|
2618
2840
|
const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
|
|
2619
2841
|
lines.push(
|
|
2620
|
-
...formatEvaluatorScoreLine(
|
|
2621
|
-
|
|
2622
|
-
|
|
2623
|
-
item.passed,
|
|
2624
|
-
item.metrics,
|
|
2625
|
-
{ isAggregated }
|
|
2626
|
-
)
|
|
2842
|
+
...formatEvaluatorScoreLine(name, item.scores, item.passed, item.metrics, {
|
|
2843
|
+
isAggregated
|
|
2844
|
+
})
|
|
2627
2845
|
);
|
|
2628
2846
|
const lastEvent = existing.events[existing.events.length - 1];
|
|
2629
|
-
const lastEs = lastEvent?.evaluatorScores.find(
|
|
2630
|
-
(x) => x.evaluatorId === item.evaluatorId
|
|
2631
|
-
);
|
|
2847
|
+
const lastEs = lastEvent?.evaluatorScores.find((x) => x.evaluatorId === item.evaluatorId);
|
|
2632
2848
|
if (!item.passed && lastEs?.logs && lastEs.logs.length > 0) {
|
|
2633
2849
|
for (const log of lastEs.logs) {
|
|
2634
2850
|
if (log.type === "diff") {
|
|
@@ -2646,73 +2862,102 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
|
|
|
2646
2862
|
}
|
|
2647
2863
|
}
|
|
2648
2864
|
if (!skipPrintNonTty) {
|
|
2649
|
-
for (let i = 0; i < lines.length; i
|
|
2865
|
+
for (let i = 0; i < lines.length; i += 1) {
|
|
2650
2866
|
process.stdout.write(`\r\x1B[2K${lines[i]}
|
|
2651
2867
|
`);
|
|
2652
2868
|
}
|
|
2653
|
-
lastPrintedTestCaseId =
|
|
2869
|
+
lastPrintedTestCaseId = compositeId;
|
|
2654
2870
|
lastPrintedLineCount = lines.length;
|
|
2655
2871
|
}
|
|
2656
2872
|
drawSpinner();
|
|
2657
2873
|
}
|
|
2658
|
-
if (event.type === "
|
|
2874
|
+
if (event.type === "RunFailed") {
|
|
2875
|
+
if (batchReady && !batchPendingRunIds.has(event.runId)) {
|
|
2876
|
+
return;
|
|
2877
|
+
}
|
|
2659
2878
|
runFinished = true;
|
|
2660
2879
|
clearLine();
|
|
2661
2880
|
unsubscribe();
|
|
2662
|
-
|
|
2881
|
+
reject(new Error(`Run failed: ${event.errorMessage}`));
|
|
2882
|
+
return;
|
|
2883
|
+
}
|
|
2884
|
+
if (event.type === "RunCompleted") {
|
|
2885
|
+
if (!batchPendingRunIds.has(event.runId)) {
|
|
2886
|
+
return;
|
|
2887
|
+
}
|
|
2888
|
+
completedRuns.set(event.runId, event);
|
|
2889
|
+
batchPendingRunIds.delete(event.runId);
|
|
2890
|
+
if (batchPendingRunIds.size === 0) {
|
|
2891
|
+
runFinished = true;
|
|
2892
|
+
clearLine();
|
|
2893
|
+
unsubscribe();
|
|
2894
|
+
resolve5();
|
|
2895
|
+
}
|
|
2663
2896
|
}
|
|
2664
2897
|
});
|
|
2665
2898
|
});
|
|
2666
|
-
|
|
2667
|
-
|
|
2668
|
-
|
|
2669
|
-
|
|
2899
|
+
console.log(colorize("=== Eval Run Started (RunConfigs) ===", `${ansi2.bold}${ansi2.cyan}`));
|
|
2900
|
+
for (const name of runConfigNames) {
|
|
2901
|
+
const collected = await runner.resolveRunConfigByName(name);
|
|
2902
|
+
const label = collected?.runConfig.getDisplayLabel() ?? name;
|
|
2903
|
+
console.log(`RunConfig: ${colorize(label, ansi2.bold)}`);
|
|
2904
|
+
}
|
|
2905
|
+
console.log(`Jobs: ${colorize(String(jobs.length), ansi2.bold)}`);
|
|
2906
|
+
console.log(`Shared concurrency: ${colorize(String(concurrency), ansi2.bold)}`);
|
|
2907
|
+
console.log("");
|
|
2908
|
+
const snapshots = await runner.runDatasetJobsWithSharedConcurrency({
|
|
2909
|
+
jobs,
|
|
2910
|
+
globalConcurrency: concurrency
|
|
2670
2911
|
});
|
|
2671
|
-
|
|
2672
|
-
|
|
2673
|
-
|
|
2674
|
-
|
|
2675
|
-
|
|
2676
|
-
|
|
2677
|
-
|
|
2678
|
-
|
|
2679
|
-
|
|
2680
|
-
|
|
2912
|
+
for (let i = 0; i < snapshots.length; i += 1) {
|
|
2913
|
+
const snap = snapshots[i];
|
|
2914
|
+
const job = jobs[i];
|
|
2915
|
+
if (snap && job) {
|
|
2916
|
+
runIdToLabel.set(
|
|
2917
|
+
snap.runId,
|
|
2918
|
+
`${job.runConfigDisplayLabel ?? job.runConfigName} \xB7 ${snap.datasetName}`
|
|
2919
|
+
);
|
|
2920
|
+
batchPendingRunIds.add(snap.runId);
|
|
2921
|
+
}
|
|
2922
|
+
}
|
|
2923
|
+
totalCount = snapshots.reduce((sum, s) => sum + s.totalTestCases, 0);
|
|
2924
|
+
console.log(`Total evaluation units: ${colorize(String(totalCount), ansi2.bold)}`);
|
|
2681
2925
|
console.log("");
|
|
2926
|
+
batchReady = true;
|
|
2682
2927
|
drawSpinner();
|
|
2683
2928
|
spinnerTimer = setInterval(drawSpinner, 100);
|
|
2684
|
-
|
|
2929
|
+
await done;
|
|
2685
2930
|
if (spinnerTimer) {
|
|
2686
2931
|
clearInterval(spinnerTimer);
|
|
2687
2932
|
}
|
|
2688
|
-
if (finalEvent.type === "RunFailed") {
|
|
2689
|
-
throw new Error(`Run failed: ${finalEvent.errorMessage}`);
|
|
2690
|
-
}
|
|
2691
|
-
const completed = finalEvent;
|
|
2692
2933
|
console.log("");
|
|
2693
|
-
console.log(colorize("=== Run Summary ===", `${ansi2.bold}${ansi2.cyan}`));
|
|
2694
|
-
|
|
2695
|
-
|
|
2696
|
-
|
|
2697
|
-
|
|
2698
|
-
|
|
2699
|
-
|
|
2700
|
-
|
|
2701
|
-
|
|
2702
|
-
|
|
2703
|
-
|
|
2704
|
-
)
|
|
2705
|
-
|
|
2934
|
+
console.log(colorize("=== Run Summary (all jobs) ===", `${ansi2.bold}${ansi2.cyan}`));
|
|
2935
|
+
for (const snap of snapshots) {
|
|
2936
|
+
const completed = completedRuns.get(snap.runId);
|
|
2937
|
+
if (!completed) {
|
|
2938
|
+
continue;
|
|
2939
|
+
}
|
|
2940
|
+
const label = runIdToLabel.get(snap.runId) ?? snap.runId;
|
|
2941
|
+
console.log("");
|
|
2942
|
+
console.log(colorize(`\u2014 ${label}`, ansi2.magenta));
|
|
2943
|
+
console.log(
|
|
2944
|
+
`- passed: ${colorize(`${completed.passedTestCases}/${completed.totalTestCases}`, ansi2.green)}`
|
|
2945
|
+
);
|
|
2946
|
+
console.log(
|
|
2947
|
+
`- failed: ${colorize(
|
|
2948
|
+
`${completed.failedTestCases}/${completed.totalTestCases}`,
|
|
2949
|
+
completed.failedTestCases > 0 ? ansi2.red : ansi2.dim
|
|
2950
|
+
)}`
|
|
2951
|
+
);
|
|
2952
|
+
console.log(`- artifact: ${colorize(completed.artifactPath, ansi2.dim)}`);
|
|
2953
|
+
}
|
|
2706
2954
|
if (overallScoreCount > 0) {
|
|
2707
2955
|
const overallAverage = overallScoreTotal / overallScoreCount;
|
|
2708
|
-
const overallSd = sampleStdDev2(
|
|
2709
|
-
overallScoreTotal,
|
|
2710
|
-
overallScoreSumSq,
|
|
2711
|
-
overallScoreCount
|
|
2712
|
-
);
|
|
2956
|
+
const overallSd = sampleStdDev2(overallScoreTotal, overallScoreSumSq, overallScoreCount);
|
|
2713
2957
|
const avgStr = overallSd !== void 0 ? `${overallAverage.toFixed(2)} \xB1 ${overallSd.toFixed(2)}` : overallAverage.toFixed(2);
|
|
2958
|
+
console.log("");
|
|
2714
2959
|
console.log(
|
|
2715
|
-
`- overall avg score: ${colorize(
|
|
2960
|
+
`- overall avg score (all jobs): ${colorize(
|
|
2716
2961
|
avgStr,
|
|
2717
2962
|
scoreToColor(overallAverage)
|
|
2718
2963
|
)} ${colorize(createBar2(overallAverage), ansi2.dim)}`
|
|
@@ -2753,22 +2998,28 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
|
|
|
2753
2998
|
);
|
|
2754
2999
|
}
|
|
2755
3000
|
}
|
|
2756
|
-
|
|
3001
|
+
let failedTestCasesTotal = 0;
|
|
3002
|
+
for (const snap of snapshots) {
|
|
3003
|
+
const completed = completedRuns.get(snap.runId);
|
|
3004
|
+
if (completed) {
|
|
3005
|
+
failedTestCasesTotal += completed.failedTestCases;
|
|
3006
|
+
}
|
|
3007
|
+
}
|
|
3008
|
+
return failedTestCasesTotal > 0 ? 1 : 0;
|
|
2757
3009
|
}
|
|
2758
|
-
async function
|
|
3010
|
+
async function runSimpleEvalRunConfigsInk(runner, runConfigNames, concurrency) {
|
|
2759
3011
|
return new Promise((resolve5, reject) => {
|
|
2760
3012
|
const app = render(
|
|
2761
|
-
|
|
3013
|
+
React.createElement(RunView, {
|
|
2762
3014
|
runner,
|
|
2763
|
-
|
|
2764
|
-
evaluatorPattern,
|
|
3015
|
+
runConfigNames,
|
|
2765
3016
|
concurrency,
|
|
2766
|
-
onComplete: (err) => {
|
|
3017
|
+
onComplete: (err, exitCode) => {
|
|
2767
3018
|
app.unmount();
|
|
2768
3019
|
if (err) {
|
|
2769
3020
|
reject(err);
|
|
2770
3021
|
} else {
|
|
2771
|
-
resolve5();
|
|
3022
|
+
resolve5(exitCode ?? 0);
|
|
2772
3023
|
}
|
|
2773
3024
|
}
|
|
2774
3025
|
})
|
|
@@ -2794,12 +3045,22 @@ async function main() {
|
|
|
2794
3045
|
if (!args.command) {
|
|
2795
3046
|
printUsageAndExit(1);
|
|
2796
3047
|
}
|
|
2797
|
-
if (
|
|
2798
|
-
|
|
2799
|
-
|
|
3048
|
+
if (args.command === "run") {
|
|
3049
|
+
if (args.runConfigNames.length === 0) {
|
|
3050
|
+
console.error(
|
|
3051
|
+
"Missing required --run-config <name> (repeat the flag to queue multiple RunConfigs)."
|
|
3052
|
+
);
|
|
3053
|
+
printUsageAndExit(1);
|
|
3054
|
+
}
|
|
3055
|
+
if (args.datasetName !== void 0) {
|
|
3056
|
+
console.error(
|
|
3057
|
+
"The run command no longer accepts --dataset; use --run-config <RunConfig name>."
|
|
3058
|
+
);
|
|
3059
|
+
printUsageAndExit(1);
|
|
3060
|
+
}
|
|
2800
3061
|
}
|
|
2801
|
-
if (args.command === "
|
|
2802
|
-
console.error("
|
|
3062
|
+
if (args.command === "generate" && args.runConfigNames.length > 0) {
|
|
3063
|
+
console.error("generate does not accept --run-config.");
|
|
2803
3064
|
printUsageAndExit(1);
|
|
2804
3065
|
}
|
|
2805
3066
|
const useInk = process.stdout.isTTY === true;
|
|
@@ -2810,17 +3071,24 @@ async function main() {
|
|
|
2810
3071
|
try {
|
|
2811
3072
|
if (args.command === "run") {
|
|
2812
3073
|
const concurrency = args.concurrency ?? getDefaultConcurrency();
|
|
2813
|
-
await (useInk ?
|
|
3074
|
+
const exitCode = await (useInk ? runSimpleEvalRunConfigsInk : runSimpleEvalRunConfigsPlain)(
|
|
2814
3075
|
runner,
|
|
2815
|
-
args.
|
|
2816
|
-
args.evaluatorPattern,
|
|
3076
|
+
args.runConfigNames,
|
|
2817
3077
|
concurrency
|
|
2818
3078
|
);
|
|
3079
|
+
if (args.ci && exitCode !== 0) {
|
|
3080
|
+
process.exit(1);
|
|
3081
|
+
}
|
|
2819
3082
|
return;
|
|
2820
3083
|
}
|
|
3084
|
+
const genDataset = args.datasetName;
|
|
3085
|
+
if (!genDataset) {
|
|
3086
|
+
console.error("Missing required --dataset <datasetName> argument.");
|
|
3087
|
+
printUsageAndExit(1);
|
|
3088
|
+
}
|
|
2821
3089
|
await (useInk ? generateDatasetJsonCommandInk : generateDatasetJsonCommandPlain)(
|
|
2822
3090
|
runner,
|
|
2823
|
-
|
|
3091
|
+
genDataset
|
|
2824
3092
|
);
|
|
2825
3093
|
} finally {
|
|
2826
3094
|
await runner.shutdown();
|