@m4trix/evals 0.25.1 → 0.27.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +32 -9
- package/dist/cli-simple.cjs +845 -455
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +846 -456
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +543 -273
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +543 -273
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +948 -545
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +228 -14
- package/dist/index.js +933 -547
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli-simple.cjs
CHANGED
|
@@ -3,10 +3,10 @@
|
|
|
3
3
|
|
|
4
4
|
var crypto = require('crypto');
|
|
5
5
|
var effect = require('effect');
|
|
6
|
-
var
|
|
6
|
+
var promises = require('fs/promises');
|
|
7
7
|
var path = require('path');
|
|
8
|
+
var fs = require('fs');
|
|
8
9
|
var jitiModule = require('jiti');
|
|
9
|
-
var promises = require('fs/promises');
|
|
10
10
|
var url = require('url');
|
|
11
11
|
var diff = require('diff');
|
|
12
12
|
var stringify = require('fast-json-stable-stringify');
|
|
@@ -39,12 +39,179 @@ var jitiModule__namespace = /*#__PURE__*/_interopNamespace(jitiModule);
|
|
|
39
39
|
var stringify__default = /*#__PURE__*/_interopDefault(stringify);
|
|
40
40
|
var React__namespace = /*#__PURE__*/_interopNamespace(React);
|
|
41
41
|
|
|
42
|
+
var ENTITY_ID_PATTERN = /^[a-zA-Z0-9_-]+$/;
|
|
43
|
+
function makeEntityIdSchema(brand, label) {
|
|
44
|
+
return effect.Schema.String.pipe(
|
|
45
|
+
effect.Schema.trimmed(),
|
|
46
|
+
effect.Schema.minLength(1, {
|
|
47
|
+
message: () => `${label} must be non-empty.`
|
|
48
|
+
}),
|
|
49
|
+
effect.Schema.pattern(ENTITY_ID_PATTERN, {
|
|
50
|
+
message: () => `${label} may only contain letters, digits, underscores, and hyphens (no spaces). Examples: "my-nightly", "my_nightly", "myNightly".`
|
|
51
|
+
}),
|
|
52
|
+
effect.Schema.brand(brand)
|
|
53
|
+
);
|
|
54
|
+
}
|
|
55
|
+
var RunConfigNameSchema = makeEntityIdSchema("RunConfigName", "RunConfig name");
|
|
56
|
+
makeEntityIdSchema("EvaluatorName", "Evaluator name");
|
|
57
|
+
makeEntityIdSchema("TestCaseName", "Test case name");
|
|
58
|
+
makeEntityIdSchema("DatasetName", "Dataset name");
|
|
59
|
+
function validateWithSchema(schema, raw, context) {
|
|
60
|
+
const trimmed = raw.trim();
|
|
61
|
+
const decode = effect.Schema.decodeUnknownEither(
|
|
62
|
+
schema
|
|
63
|
+
);
|
|
64
|
+
const result = decode(trimmed);
|
|
65
|
+
if (effect.Either.isLeft(result)) {
|
|
66
|
+
throw new Error(`${context}: ${effect.ParseResult.TreeFormatter.formatErrorSync(result.left)}`);
|
|
67
|
+
}
|
|
68
|
+
return result.right;
|
|
69
|
+
}
|
|
70
|
+
function validateRunConfigName(raw, context) {
|
|
71
|
+
return validateWithSchema(RunConfigNameSchema, raw, context);
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
// src/evals/evaluator.ts
|
|
75
|
+
function getEvaluatorDisplayLabel(evaluator) {
|
|
76
|
+
if (typeof evaluator.getDisplayLabel === "function") {
|
|
77
|
+
const label = evaluator.getDisplayLabel();
|
|
78
|
+
if (label !== void 0) {
|
|
79
|
+
return label;
|
|
80
|
+
}
|
|
81
|
+
}
|
|
82
|
+
return typeof evaluator.getName === "function" ? evaluator.getName() : void 0;
|
|
83
|
+
}
|
|
84
|
+
function getEvaluatorTagList(evaluator) {
|
|
85
|
+
return typeof evaluator.getTags === "function" ? [...evaluator.getTags()] : [];
|
|
86
|
+
}
|
|
87
|
+
async function loadRunSnapshotsFromArtifacts(config) {
|
|
88
|
+
const baseDir = path.resolve(config.artifactDirectory);
|
|
89
|
+
let entries;
|
|
90
|
+
try {
|
|
91
|
+
entries = await promises.readdir(baseDir);
|
|
92
|
+
} catch {
|
|
93
|
+
return [];
|
|
94
|
+
}
|
|
95
|
+
const jsonlFiles = entries.filter((name) => name.endsWith(".jsonl"));
|
|
96
|
+
const snapshots = [];
|
|
97
|
+
for (const fileName of jsonlFiles) {
|
|
98
|
+
const filePath = path.join(baseDir, fileName);
|
|
99
|
+
try {
|
|
100
|
+
const snapshot = await parseArtifactToSnapshot(filePath, config);
|
|
101
|
+
if (snapshot) {
|
|
102
|
+
snapshots.push(snapshot);
|
|
103
|
+
}
|
|
104
|
+
} catch {
|
|
105
|
+
}
|
|
106
|
+
}
|
|
107
|
+
return snapshots.sort((a, b) => b.queuedAt - a.queuedAt);
|
|
108
|
+
}
|
|
109
|
+
async function parseArtifactToSnapshot(filePath, _config) {
|
|
110
|
+
const content = await promises.readFile(filePath, "utf8");
|
|
111
|
+
const lines = content.split("\n").filter((line) => line.trim().length > 0);
|
|
112
|
+
if (lines.length === 0) {
|
|
113
|
+
return null;
|
|
114
|
+
}
|
|
115
|
+
let runQueued = null;
|
|
116
|
+
let runCompleted = null;
|
|
117
|
+
let runFailed = null;
|
|
118
|
+
let runStarted = null;
|
|
119
|
+
for (const line of lines) {
|
|
120
|
+
try {
|
|
121
|
+
const event = JSON.parse(line);
|
|
122
|
+
const type = event.type;
|
|
123
|
+
if (type === "RunQueued") {
|
|
124
|
+
runQueued = {
|
|
125
|
+
runId: event.runId,
|
|
126
|
+
datasetId: event.datasetId,
|
|
127
|
+
datasetName: event.datasetName,
|
|
128
|
+
evaluatorIds: event.evaluatorIds,
|
|
129
|
+
totalTestCases: event.totalTestCases ?? 0,
|
|
130
|
+
artifactPath: event.artifactPath ?? filePath,
|
|
131
|
+
ts: event.ts
|
|
132
|
+
};
|
|
133
|
+
}
|
|
134
|
+
if (type === "RunStarted") {
|
|
135
|
+
runStarted = { startedAt: event.startedAt };
|
|
136
|
+
}
|
|
137
|
+
if (type === "RunCompleted") {
|
|
138
|
+
runCompleted = {
|
|
139
|
+
passedTestCases: event.passedTestCases,
|
|
140
|
+
failedTestCases: event.failedTestCases,
|
|
141
|
+
totalTestCases: event.totalTestCases,
|
|
142
|
+
finishedAt: event.finishedAt
|
|
143
|
+
};
|
|
144
|
+
}
|
|
145
|
+
if (type === "RunFailed") {
|
|
146
|
+
runFailed = {
|
|
147
|
+
finishedAt: event.finishedAt,
|
|
148
|
+
errorMessage: event.errorMessage
|
|
149
|
+
};
|
|
150
|
+
}
|
|
151
|
+
} catch {
|
|
152
|
+
}
|
|
153
|
+
}
|
|
154
|
+
if (!runQueued) {
|
|
155
|
+
return null;
|
|
156
|
+
}
|
|
157
|
+
const artifactPath = filePath;
|
|
158
|
+
const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
|
|
159
|
+
const progress = aggregateTestCaseProgress(lines);
|
|
160
|
+
const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
|
|
161
|
+
const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
|
|
162
|
+
const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
|
|
163
|
+
return {
|
|
164
|
+
runId: runQueued.runId,
|
|
165
|
+
datasetId: runQueued.datasetId,
|
|
166
|
+
datasetName: runQueued.datasetName,
|
|
167
|
+
evaluatorIds: runQueued.evaluatorIds,
|
|
168
|
+
queuedAt: runQueued.ts ?? 0,
|
|
169
|
+
startedAt: runStarted?.startedAt,
|
|
170
|
+
finishedAt: runCompleted?.finishedAt ?? runFailed?.finishedAt,
|
|
171
|
+
totalTestCases: runQueued.totalTestCases,
|
|
172
|
+
completedTestCases,
|
|
173
|
+
passedTestCases,
|
|
174
|
+
failedTestCases,
|
|
175
|
+
status,
|
|
176
|
+
artifactPath,
|
|
177
|
+
errorMessage: runFailed?.errorMessage
|
|
178
|
+
};
|
|
179
|
+
}
|
|
180
|
+
function aggregateTestCaseProgress(lines) {
|
|
181
|
+
let completedTestCases = 0;
|
|
182
|
+
const testCasePassedBy = /* @__PURE__ */ new Map();
|
|
183
|
+
for (const line of lines) {
|
|
184
|
+
try {
|
|
185
|
+
const event = JSON.parse(line);
|
|
186
|
+
if (event.type === "TestCaseProgress") {
|
|
187
|
+
const ev = event;
|
|
188
|
+
completedTestCases = ev.completedTestCases ?? completedTestCases;
|
|
189
|
+
const id = ev.testCaseId;
|
|
190
|
+
const current = testCasePassedBy.get(id);
|
|
191
|
+
testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
|
|
192
|
+
}
|
|
193
|
+
} catch {
|
|
194
|
+
}
|
|
195
|
+
}
|
|
196
|
+
let passedTestCases = 0;
|
|
197
|
+
let failedTestCases = 0;
|
|
198
|
+
for (const passed of testCasePassedBy.values()) {
|
|
199
|
+
if (passed) {
|
|
200
|
+
passedTestCases += 1;
|
|
201
|
+
} else {
|
|
202
|
+
failedTestCases += 1;
|
|
203
|
+
}
|
|
204
|
+
}
|
|
205
|
+
return { completedTestCases, passedTestCases, failedTestCases };
|
|
206
|
+
}
|
|
207
|
+
|
|
42
208
|
// src/runner/config.ts
|
|
43
209
|
var defaultRunnerConfig = {
|
|
44
210
|
discovery: {
|
|
45
211
|
rootDir: process.cwd(),
|
|
46
212
|
datasetSuffixes: [".dataset.ts", ".dataset.tsx", ".dataset.js", ".dataset.mjs"],
|
|
47
213
|
evaluatorSuffixes: [".evaluator.ts", ".evaluator.tsx", ".evaluator.js", ".evaluator.mjs"],
|
|
214
|
+
runConfigSuffixes: [".run-config.ts", ".run-config.tsx", ".run-config.js", ".run-config.mjs"],
|
|
48
215
|
testCaseSuffixes: [".test-case.ts", ".test-case.tsx", ".test-case.js", ".test-case.mjs"],
|
|
49
216
|
excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
|
|
50
217
|
},
|
|
@@ -70,6 +237,11 @@ function toRunnerConfigOverrides(config) {
|
|
|
70
237
|
} else if (rawDiscovery?.evaluatorSuffixes !== void 0) {
|
|
71
238
|
discovery.evaluatorSuffixes = rawDiscovery.evaluatorSuffixes;
|
|
72
239
|
}
|
|
240
|
+
if (rawDiscovery?.runConfigFilePatterns !== void 0) {
|
|
241
|
+
discovery.runConfigSuffixes = rawDiscovery.runConfigFilePatterns;
|
|
242
|
+
} else if (rawDiscovery?.runConfigSuffixes !== void 0) {
|
|
243
|
+
discovery.runConfigSuffixes = rawDiscovery.runConfigSuffixes;
|
|
244
|
+
}
|
|
73
245
|
if (rawDiscovery?.testCaseFilePatterns !== void 0) {
|
|
74
246
|
discovery.testCaseSuffixes = rawDiscovery.testCaseFilePatterns;
|
|
75
247
|
} else if (rawDiscovery?.testCaseSuffixes !== void 0) {
|
|
@@ -168,6 +340,9 @@ function isDatasetLike(value) {
|
|
|
168
340
|
function isEvaluatorLike(value) {
|
|
169
341
|
return hasMethod(value, "getName") && hasMethod(value, "resolveContext") && hasMethod(value, "getEvaluateFn");
|
|
170
342
|
}
|
|
343
|
+
function isRunConfigLike(value) {
|
|
344
|
+
return hasMethod(value, "getName") && hasMethod(value, "getRuns") && typeof value.getRuns === "function";
|
|
345
|
+
}
|
|
171
346
|
function isTestCaseLike(value) {
|
|
172
347
|
return hasMethod(value, "getName") && hasMethod(value, "getTags") && hasMethod(value, "getInput");
|
|
173
348
|
}
|
|
@@ -256,6 +431,23 @@ async function collectEvaluatorsFromFiles(config) {
|
|
|
256
431
|
);
|
|
257
432
|
return found.flat();
|
|
258
433
|
}
|
|
434
|
+
async function collectRunConfigsFromFiles(config) {
|
|
435
|
+
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
436
|
+
const matched = files.filter((filePath) => hasOneSuffix(filePath, config.runConfigSuffixes));
|
|
437
|
+
const found = await Promise.all(
|
|
438
|
+
matched.map(async (absolutePath) => {
|
|
439
|
+
const exports = await loadModuleExports(absolutePath);
|
|
440
|
+
const runConfigs = exports.filter(isRunConfigLike);
|
|
441
|
+
const relPath = path.relative(config.rootDir, absolutePath);
|
|
442
|
+
return runConfigs.map((runConfig) => ({
|
|
443
|
+
id: runConfig.getName(),
|
|
444
|
+
filePath: relPath,
|
|
445
|
+
runConfig
|
|
446
|
+
}));
|
|
447
|
+
})
|
|
448
|
+
);
|
|
449
|
+
return found.flat();
|
|
450
|
+
}
|
|
259
451
|
async function collectTestCasesFromFiles(config) {
|
|
260
452
|
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
261
453
|
const matched = files.filter((filePath) => hasOneSuffix(filePath, config.testCaseSuffixes));
|
|
@@ -409,6 +601,25 @@ function getDiffLines(entry) {
|
|
|
409
601
|
});
|
|
410
602
|
}
|
|
411
603
|
|
|
604
|
+
// src/evals/test-case.ts
|
|
605
|
+
function getTestCaseDisplayLabel(testCase) {
|
|
606
|
+
if (typeof testCase.getDisplayLabel === "function") {
|
|
607
|
+
return testCase.getDisplayLabel();
|
|
608
|
+
}
|
|
609
|
+
return typeof testCase.getName === "function" ? testCase.getName() : "";
|
|
610
|
+
}
|
|
611
|
+
function getTestCaseTagList(testCase) {
|
|
612
|
+
return typeof testCase.getTags === "function" ? [...testCase.getTags()] : [];
|
|
613
|
+
}
|
|
614
|
+
|
|
615
|
+
// src/evals/dataset.ts
|
|
616
|
+
function getDatasetDisplayLabel(dataset) {
|
|
617
|
+
if (typeof dataset.getDisplayLabel === "function") {
|
|
618
|
+
return dataset.getDisplayLabel();
|
|
619
|
+
}
|
|
620
|
+
return typeof dataset.getName === "function" ? dataset.getName() : "";
|
|
621
|
+
}
|
|
622
|
+
|
|
412
623
|
// src/evals/metric.ts
|
|
413
624
|
var registry = /* @__PURE__ */ new Map();
|
|
414
625
|
var Metric = {
|
|
@@ -432,6 +643,54 @@ function getMetricById(id) {
|
|
|
432
643
|
return registry.get(id);
|
|
433
644
|
}
|
|
434
645
|
|
|
646
|
+
// src/evals/aggregators.ts
|
|
647
|
+
function aggregateTokenCountSum(values) {
|
|
648
|
+
const initial = {
|
|
649
|
+
input: 0,
|
|
650
|
+
output: 0,
|
|
651
|
+
inputCached: 0,
|
|
652
|
+
outputCached: 0
|
|
653
|
+
};
|
|
654
|
+
return values.reduce(
|
|
655
|
+
(acc, v) => ({
|
|
656
|
+
input: acc.input + (v.input ?? 0),
|
|
657
|
+
output: acc.output + (v.output ?? 0),
|
|
658
|
+
inputCached: acc.inputCached + (v.inputCached ?? 0),
|
|
659
|
+
outputCached: acc.outputCached + (v.outputCached ?? 0)
|
|
660
|
+
}),
|
|
661
|
+
initial
|
|
662
|
+
);
|
|
663
|
+
}
|
|
664
|
+
function aggregateLatencyAverage(values) {
|
|
665
|
+
if (values.length === 0) {
|
|
666
|
+
return { ms: 0 };
|
|
667
|
+
}
|
|
668
|
+
const sum = values.reduce((s, v) => s + v.ms, 0);
|
|
669
|
+
return { ms: sum / values.length };
|
|
670
|
+
}
|
|
671
|
+
|
|
672
|
+
// src/evals/metrics/standard.ts
|
|
673
|
+
Metric.of({
|
|
674
|
+
id: "token-count",
|
|
675
|
+
name: "Tokens",
|
|
676
|
+
aggregate: aggregateTokenCountSum,
|
|
677
|
+
format: (data, options) => {
|
|
678
|
+
const input = data.input ?? 0;
|
|
679
|
+
const output = data.output ?? 0;
|
|
680
|
+
const inputCached = data.inputCached ?? 0;
|
|
681
|
+
const outputCached = data.outputCached ?? 0;
|
|
682
|
+
const cached = inputCached + outputCached;
|
|
683
|
+
const base = `in:${input} out:${output} cached:${cached}`;
|
|
684
|
+
return options?.isAggregated ? `Total: ${base}` : base;
|
|
685
|
+
}
|
|
686
|
+
});
|
|
687
|
+
Metric.of({
|
|
688
|
+
id: "latency",
|
|
689
|
+
name: "Latency",
|
|
690
|
+
aggregate: aggregateLatencyAverage,
|
|
691
|
+
format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
|
|
692
|
+
});
|
|
693
|
+
|
|
435
694
|
// src/evals/score.ts
|
|
436
695
|
var registry2 = /* @__PURE__ */ new Map();
|
|
437
696
|
function formatScoreData(def, data, options) {
|
|
@@ -540,54 +799,6 @@ function getScoreById(id) {
|
|
|
540
799
|
return registry2.get(id);
|
|
541
800
|
}
|
|
542
801
|
|
|
543
|
-
// src/evals/aggregators.ts
|
|
544
|
-
function aggregateTokenCountSum(values) {
|
|
545
|
-
const initial = {
|
|
546
|
-
input: 0,
|
|
547
|
-
output: 0,
|
|
548
|
-
inputCached: 0,
|
|
549
|
-
outputCached: 0
|
|
550
|
-
};
|
|
551
|
-
return values.reduce(
|
|
552
|
-
(acc, v) => ({
|
|
553
|
-
input: acc.input + (v.input ?? 0),
|
|
554
|
-
output: acc.output + (v.output ?? 0),
|
|
555
|
-
inputCached: acc.inputCached + (v.inputCached ?? 0),
|
|
556
|
-
outputCached: acc.outputCached + (v.outputCached ?? 0)
|
|
557
|
-
}),
|
|
558
|
-
initial
|
|
559
|
-
);
|
|
560
|
-
}
|
|
561
|
-
function aggregateLatencyAverage(values) {
|
|
562
|
-
if (values.length === 0) {
|
|
563
|
-
return { ms: 0 };
|
|
564
|
-
}
|
|
565
|
-
const sum = values.reduce((s, v) => s + v.ms, 0);
|
|
566
|
-
return { ms: sum / values.length };
|
|
567
|
-
}
|
|
568
|
-
|
|
569
|
-
// src/evals/metrics/standard.ts
|
|
570
|
-
Metric.of({
|
|
571
|
-
id: "token-count",
|
|
572
|
-
name: "Tokens",
|
|
573
|
-
aggregate: aggregateTokenCountSum,
|
|
574
|
-
format: (data, options) => {
|
|
575
|
-
const input = data.input ?? 0;
|
|
576
|
-
const output = data.output ?? 0;
|
|
577
|
-
const inputCached = data.inputCached ?? 0;
|
|
578
|
-
const outputCached = data.outputCached ?? 0;
|
|
579
|
-
const cached = inputCached + outputCached;
|
|
580
|
-
const base = `in:${input} out:${output} cached:${cached}`;
|
|
581
|
-
return options?.isAggregated ? `Total: ${base}` : base;
|
|
582
|
-
}
|
|
583
|
-
});
|
|
584
|
-
Metric.of({
|
|
585
|
-
id: "latency",
|
|
586
|
-
name: "Latency",
|
|
587
|
-
aggregate: aggregateLatencyAverage,
|
|
588
|
-
format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
|
|
589
|
-
});
|
|
590
|
-
|
|
591
802
|
// src/evals/scores/standard.ts
|
|
592
803
|
Score.of({
|
|
593
804
|
id: "percent",
|
|
@@ -731,15 +942,17 @@ function readOutput(testCase) {
|
|
|
731
942
|
}
|
|
732
943
|
return candidate.getOutput();
|
|
733
944
|
}
|
|
734
|
-
function buildEvaluationUnits(testCases) {
|
|
945
|
+
function buildEvaluationUnits(testCases, repetitionCount) {
|
|
946
|
+
const count = Math.max(1, repetitionCount);
|
|
735
947
|
const units = [];
|
|
736
948
|
for (const testCaseItem of testCases) {
|
|
737
|
-
const
|
|
738
|
-
for (let r = 0; r <
|
|
949
|
+
const repetitionId = `rep-${crypto.randomUUID()}`;
|
|
950
|
+
for (let r = 0; r < count; r++) {
|
|
739
951
|
units.push({
|
|
740
952
|
testCaseItem,
|
|
741
|
-
|
|
742
|
-
|
|
953
|
+
repetitionId,
|
|
954
|
+
repetitionIndex: r + 1,
|
|
955
|
+
repetitionCount: count
|
|
743
956
|
});
|
|
744
957
|
}
|
|
745
958
|
}
|
|
@@ -752,7 +965,7 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
|
|
|
752
965
|
return path.join(artifactDirectory, `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`);
|
|
753
966
|
}
|
|
754
967
|
function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
|
|
755
|
-
const { testCaseItem,
|
|
968
|
+
const { testCaseItem, repetitionId, repetitionIndex, repetitionCount } = unit;
|
|
756
969
|
return effect.Effect.gen(function* () {
|
|
757
970
|
const evaluatorRunId = `run-${crypto.randomUUID()}`;
|
|
758
971
|
const started = Date.now();
|
|
@@ -761,11 +974,12 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
761
974
|
type: "TestCaseStarted",
|
|
762
975
|
runId: task.runId,
|
|
763
976
|
testCaseId: testCaseItem.id,
|
|
764
|
-
testCaseName: testCaseItem.testCase
|
|
977
|
+
testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
|
|
765
978
|
startedTestCases: startedEvaluations,
|
|
766
979
|
totalTestCases: totalEvaluations,
|
|
767
|
-
|
|
768
|
-
|
|
980
|
+
repetitionId,
|
|
981
|
+
repetitionIndex,
|
|
982
|
+
repetitionCount
|
|
769
983
|
});
|
|
770
984
|
const evaluatorScores = [];
|
|
771
985
|
let testCaseError;
|
|
@@ -799,8 +1013,15 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
799
1013
|
meta: {
|
|
800
1014
|
triggerId: task.triggerId,
|
|
801
1015
|
runId: evaluatorRunId,
|
|
802
|
-
|
|
1016
|
+
datasetName: task.dataset.getDisplayLabel(),
|
|
1017
|
+
repetitionId,
|
|
1018
|
+
repetitionIndex,
|
|
1019
|
+
repetitionCount,
|
|
1020
|
+
runConfigName: task.runConfigName
|
|
803
1021
|
},
|
|
1022
|
+
testCaseTags: getTestCaseTagList(testCaseItem.testCase),
|
|
1023
|
+
runConfigTags: task.runConfigTags,
|
|
1024
|
+
evaluatorTags: getEvaluatorTagList(evaluator),
|
|
804
1025
|
logDiff,
|
|
805
1026
|
log,
|
|
806
1027
|
createError
|
|
@@ -843,18 +1064,19 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
843
1064
|
});
|
|
844
1065
|
}
|
|
845
1066
|
}
|
|
846
|
-
const
|
|
1067
|
+
const repetitionPassedThis = evaluatorScores.every((s) => s.passed);
|
|
847
1068
|
const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [n + 1, n + 1]);
|
|
848
1069
|
const progressEvent = {
|
|
849
1070
|
type: "TestCaseProgress",
|
|
850
1071
|
runId: task.runId,
|
|
851
1072
|
testCaseId: testCaseItem.id,
|
|
852
|
-
testCaseName: testCaseItem.testCase
|
|
1073
|
+
testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
|
|
853
1074
|
completedTestCases: completedEvaluations,
|
|
854
1075
|
totalTestCases: totalEvaluations,
|
|
855
|
-
|
|
856
|
-
|
|
857
|
-
|
|
1076
|
+
repetitionId,
|
|
1077
|
+
repetitionIndex,
|
|
1078
|
+
repetitionCount,
|
|
1079
|
+
passed: repetitionPassedThis,
|
|
858
1080
|
durationMs: Date.now() - started,
|
|
859
1081
|
evaluatorScores,
|
|
860
1082
|
output,
|
|
@@ -875,9 +1097,9 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
875
1097
|
(map) => {
|
|
876
1098
|
const key = testCaseItem.id;
|
|
877
1099
|
const existing = map.get(key) ?? { completedCount: 0, results: [] };
|
|
878
|
-
const newResults = [...existing.results,
|
|
1100
|
+
const newResults = [...existing.results, repetitionPassedThis];
|
|
879
1101
|
const newCompletedCount = existing.completedCount + 1;
|
|
880
|
-
const isLast = newCompletedCount ===
|
|
1102
|
+
const isLast = newCompletedCount === repetitionCount;
|
|
881
1103
|
const newMap = new Map(map);
|
|
882
1104
|
newMap.set(key, {
|
|
883
1105
|
completedCount: newCompletedCount,
|
|
@@ -914,10 +1136,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
914
1136
|
runId: task.runId,
|
|
915
1137
|
startedAt
|
|
916
1138
|
});
|
|
917
|
-
const totalEvaluations = task.testCases.
|
|
918
|
-
(sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
|
|
919
|
-
0
|
|
920
|
-
);
|
|
1139
|
+
const totalEvaluations = task.testCases.length * Math.max(1, task.repetitions);
|
|
921
1140
|
const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
|
|
922
1141
|
const completedRef = yield* effect.Ref.make(0);
|
|
923
1142
|
const startedRef = yield* effect.Ref.make(0);
|
|
@@ -926,7 +1145,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
926
1145
|
const testCaseResultsRef = yield* effect.Ref.make(
|
|
927
1146
|
/* @__PURE__ */ new Map()
|
|
928
1147
|
);
|
|
929
|
-
const evaluationUnits = buildEvaluationUnits(task.testCases);
|
|
1148
|
+
const evaluationUnits = buildEvaluationUnits(task.testCases, task.repetitions);
|
|
930
1149
|
const processEvaluation = (unit) => processOneEvaluation(
|
|
931
1150
|
task,
|
|
932
1151
|
unit,
|
|
@@ -940,11 +1159,20 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
940
1159
|
failedRef,
|
|
941
1160
|
testCaseResultsRef
|
|
942
1161
|
);
|
|
943
|
-
|
|
944
|
-
|
|
945
|
-
|
|
946
|
-
|
|
947
|
-
|
|
1162
|
+
const globalSem = task.globalEvaluationSemaphore;
|
|
1163
|
+
if (globalSem !== void 0) {
|
|
1164
|
+
yield* effect.Effect.forEach(
|
|
1165
|
+
evaluationUnits,
|
|
1166
|
+
(unit) => globalSem.withPermits(1)(processEvaluation(unit)),
|
|
1167
|
+
{ concurrency: "unbounded", discard: true }
|
|
1168
|
+
);
|
|
1169
|
+
} else {
|
|
1170
|
+
yield* effect.Effect.forEach(
|
|
1171
|
+
evaluationUnits,
|
|
1172
|
+
processEvaluation,
|
|
1173
|
+
maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
|
|
1174
|
+
);
|
|
1175
|
+
}
|
|
948
1176
|
const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* effect.Effect.all([
|
|
949
1177
|
effect.Ref.get(completedRef),
|
|
950
1178
|
effect.Ref.get(passedRef),
|
|
@@ -961,144 +1189,53 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
961
1189
|
artifactPath: task.snapshot.artifactPath
|
|
962
1190
|
};
|
|
963
1191
|
yield* updateSnapshot(task.runId, (snapshot) => ({
|
|
964
|
-
...snapshot,
|
|
965
|
-
status: "completed",
|
|
966
|
-
completedTestCases: completedEvaluations,
|
|
967
|
-
passedTestCases: passedUniqueTestCases,
|
|
968
|
-
failedTestCases: failedUniqueTestCases,
|
|
969
|
-
finishedAt
|
|
970
|
-
}));
|
|
971
|
-
yield* publishEvent(completedEvent);
|
|
972
|
-
yield* effect.Queue.offer(persistenceQueue, {
|
|
973
|
-
runId: task.runId,
|
|
974
|
-
artifactPath: task.snapshot.artifactPath,
|
|
975
|
-
payload: completedEvent
|
|
976
|
-
});
|
|
977
|
-
yield* publishEvent({
|
|
978
|
-
type: "ArtifactFlushed",
|
|
979
|
-
runId: task.runId,
|
|
980
|
-
artifactPath: task.snapshot.artifactPath
|
|
981
|
-
});
|
|
982
|
-
});
|
|
983
|
-
|
|
984
|
-
|
|
985
|
-
|
|
986
|
-
|
|
987
|
-
|
|
988
|
-
} catch {
|
|
989
|
-
return [];
|
|
990
|
-
}
|
|
991
|
-
const jsonlFiles = entries.filter((name) => name.endsWith(".jsonl"));
|
|
992
|
-
const snapshots = [];
|
|
993
|
-
for (const fileName of jsonlFiles) {
|
|
994
|
-
const filePath = path.join(baseDir, fileName);
|
|
995
|
-
try {
|
|
996
|
-
const snapshot = await parseArtifactToSnapshot(filePath, config);
|
|
997
|
-
if (snapshot) {
|
|
998
|
-
snapshots.push(snapshot);
|
|
999
|
-
}
|
|
1000
|
-
} catch {
|
|
1001
|
-
}
|
|
1002
|
-
}
|
|
1003
|
-
return snapshots.sort((a, b) => b.queuedAt - a.queuedAt);
|
|
1004
|
-
}
|
|
1005
|
-
async function parseArtifactToSnapshot(filePath, _config) {
|
|
1006
|
-
const content = await promises.readFile(filePath, "utf8");
|
|
1007
|
-
const lines = content.split("\n").filter((line) => line.trim().length > 0);
|
|
1008
|
-
if (lines.length === 0) {
|
|
1009
|
-
return null;
|
|
1010
|
-
}
|
|
1011
|
-
let runQueued = null;
|
|
1012
|
-
let runCompleted = null;
|
|
1013
|
-
let runFailed = null;
|
|
1014
|
-
let runStarted = null;
|
|
1015
|
-
for (const line of lines) {
|
|
1016
|
-
try {
|
|
1017
|
-
const event = JSON.parse(line);
|
|
1018
|
-
const type = event.type;
|
|
1019
|
-
if (type === "RunQueued") {
|
|
1020
|
-
runQueued = {
|
|
1021
|
-
runId: event.runId,
|
|
1022
|
-
datasetId: event.datasetId,
|
|
1023
|
-
datasetName: event.datasetName,
|
|
1024
|
-
evaluatorIds: event.evaluatorIds,
|
|
1025
|
-
totalTestCases: event.totalTestCases ?? 0,
|
|
1026
|
-
artifactPath: event.artifactPath ?? filePath,
|
|
1027
|
-
ts: event.ts
|
|
1028
|
-
};
|
|
1029
|
-
}
|
|
1030
|
-
if (type === "RunStarted") {
|
|
1031
|
-
runStarted = { startedAt: event.startedAt };
|
|
1032
|
-
}
|
|
1033
|
-
if (type === "RunCompleted") {
|
|
1034
|
-
runCompleted = {
|
|
1035
|
-
passedTestCases: event.passedTestCases,
|
|
1036
|
-
failedTestCases: event.failedTestCases,
|
|
1037
|
-
totalTestCases: event.totalTestCases,
|
|
1038
|
-
finishedAt: event.finishedAt
|
|
1039
|
-
};
|
|
1040
|
-
}
|
|
1041
|
-
if (type === "RunFailed") {
|
|
1042
|
-
runFailed = {
|
|
1043
|
-
finishedAt: event.finishedAt,
|
|
1044
|
-
errorMessage: event.errorMessage
|
|
1045
|
-
};
|
|
1046
|
-
}
|
|
1047
|
-
} catch {
|
|
1048
|
-
}
|
|
1192
|
+
...snapshot,
|
|
1193
|
+
status: "completed",
|
|
1194
|
+
completedTestCases: completedEvaluations,
|
|
1195
|
+
passedTestCases: passedUniqueTestCases,
|
|
1196
|
+
failedTestCases: failedUniqueTestCases,
|
|
1197
|
+
finishedAt
|
|
1198
|
+
}));
|
|
1199
|
+
yield* publishEvent(completedEvent);
|
|
1200
|
+
yield* effect.Queue.offer(persistenceQueue, {
|
|
1201
|
+
runId: task.runId,
|
|
1202
|
+
artifactPath: task.snapshot.artifactPath,
|
|
1203
|
+
payload: completedEvent
|
|
1204
|
+
});
|
|
1205
|
+
yield* publishEvent({
|
|
1206
|
+
type: "ArtifactFlushed",
|
|
1207
|
+
runId: task.runId,
|
|
1208
|
+
artifactPath: task.snapshot.artifactPath
|
|
1209
|
+
});
|
|
1210
|
+
});
|
|
1211
|
+
|
|
1212
|
+
// src/runner/name-pattern.ts
|
|
1213
|
+
function parseRegexLiteral(pattern) {
|
|
1214
|
+
if (!pattern.startsWith("/")) {
|
|
1215
|
+
return void 0;
|
|
1049
1216
|
}
|
|
1050
|
-
|
|
1051
|
-
|
|
1217
|
+
const lastSlash = pattern.lastIndexOf("/");
|
|
1218
|
+
if (lastSlash <= 0) {
|
|
1219
|
+
return void 0;
|
|
1052
1220
|
}
|
|
1053
|
-
const artifactPath = filePath;
|
|
1054
|
-
const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
|
|
1055
|
-
const progress = aggregateTestCaseProgress(lines);
|
|
1056
|
-
const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
|
|
1057
|
-
const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
|
|
1058
|
-
const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
|
|
1059
1221
|
return {
|
|
1060
|
-
|
|
1061
|
-
|
|
1062
|
-
datasetName: runQueued.datasetName,
|
|
1063
|
-
evaluatorIds: runQueued.evaluatorIds,
|
|
1064
|
-
queuedAt: runQueued.ts ?? 0,
|
|
1065
|
-
startedAt: runStarted?.startedAt,
|
|
1066
|
-
finishedAt: runCompleted?.finishedAt ?? runFailed?.finishedAt,
|
|
1067
|
-
totalTestCases: runQueued.totalTestCases,
|
|
1068
|
-
completedTestCases,
|
|
1069
|
-
passedTestCases,
|
|
1070
|
-
failedTestCases,
|
|
1071
|
-
status,
|
|
1072
|
-
artifactPath,
|
|
1073
|
-
errorMessage: runFailed?.errorMessage
|
|
1222
|
+
source: pattern.slice(1, lastSlash),
|
|
1223
|
+
flags: pattern.slice(lastSlash + 1)
|
|
1074
1224
|
};
|
|
1075
1225
|
}
|
|
1076
|
-
function
|
|
1077
|
-
|
|
1078
|
-
const
|
|
1079
|
-
|
|
1080
|
-
|
|
1081
|
-
|
|
1082
|
-
if (event.type === "TestCaseProgress") {
|
|
1083
|
-
const ev = event;
|
|
1084
|
-
completedTestCases = ev.completedTestCases ?? completedTestCases;
|
|
1085
|
-
const id = ev.testCaseId;
|
|
1086
|
-
const current = testCasePassedBy.get(id);
|
|
1087
|
-
testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
|
|
1088
|
-
}
|
|
1089
|
-
} catch {
|
|
1090
|
-
}
|
|
1226
|
+
function createNameMatcher(pattern) {
|
|
1227
|
+
const normalizedPattern = pattern.trim();
|
|
1228
|
+
const regexLiteral = parseRegexLiteral(normalizedPattern);
|
|
1229
|
+
if (regexLiteral) {
|
|
1230
|
+
const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
|
|
1231
|
+
return (value) => regex.test(value);
|
|
1091
1232
|
}
|
|
1092
|
-
|
|
1093
|
-
|
|
1094
|
-
|
|
1095
|
-
|
|
1096
|
-
passedTestCases += 1;
|
|
1097
|
-
} else {
|
|
1098
|
-
failedTestCases += 1;
|
|
1099
|
-
}
|
|
1233
|
+
if (normalizedPattern.includes("*")) {
|
|
1234
|
+
const escaped = normalizedPattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
|
|
1235
|
+
const regex = new RegExp(`^${escaped}$`, "i");
|
|
1236
|
+
return (value) => regex.test(value);
|
|
1100
1237
|
}
|
|
1101
|
-
return
|
|
1238
|
+
return (value) => value.toLowerCase() === normalizedPattern.toLowerCase();
|
|
1102
1239
|
}
|
|
1103
1240
|
async function appendJsonLine(artifactPath, payload) {
|
|
1104
1241
|
await promises.mkdir(path.dirname(artifactPath), { recursive: true });
|
|
@@ -1157,32 +1294,12 @@ function searchCollectedTestCases(all, query) {
|
|
|
1157
1294
|
}
|
|
1158
1295
|
|
|
1159
1296
|
// src/runner/api.ts
|
|
1160
|
-
function
|
|
1161
|
-
|
|
1162
|
-
|
|
1163
|
-
|
|
1164
|
-
const lastSlash = pattern.lastIndexOf("/");
|
|
1165
|
-
if (lastSlash <= 0) {
|
|
1166
|
-
return void 0;
|
|
1167
|
-
}
|
|
1168
|
-
return {
|
|
1169
|
-
source: pattern.slice(1, lastSlash),
|
|
1170
|
-
flags: pattern.slice(lastSlash + 1)
|
|
1171
|
-
};
|
|
1172
|
-
}
|
|
1173
|
-
function createNameMatcher(pattern) {
|
|
1174
|
-
const normalizedPattern = pattern.trim();
|
|
1175
|
-
const regexLiteral = parseRegexLiteral(normalizedPattern);
|
|
1176
|
-
if (regexLiteral) {
|
|
1177
|
-
const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
|
|
1178
|
-
return (value) => regex.test(value);
|
|
1297
|
+
function normalizeRunRepetitions(value) {
|
|
1298
|
+
const n = value ?? 1;
|
|
1299
|
+
if (!Number.isInteger(n) || n < 1) {
|
|
1300
|
+
throw new Error(`repetitions must be a positive integer, got ${String(value)}`);
|
|
1179
1301
|
}
|
|
1180
|
-
|
|
1181
|
-
const escaped = normalizedPattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
|
|
1182
|
-
const regex = new RegExp(`^${escaped}$`, "i");
|
|
1183
|
-
return (value) => regex.test(value);
|
|
1184
|
-
}
|
|
1185
|
-
return (value) => value.toLowerCase() === normalizedPattern.toLowerCase();
|
|
1302
|
+
return n;
|
|
1186
1303
|
}
|
|
1187
1304
|
function mergeRunnerOverrides(base, next) {
|
|
1188
1305
|
if (!base) {
|
|
@@ -1217,6 +1334,7 @@ var EffectRunner = class {
|
|
|
1217
1334
|
this.listeners = /* @__PURE__ */ new Set();
|
|
1218
1335
|
this.datasetsById = /* @__PURE__ */ new Map();
|
|
1219
1336
|
this.evaluatorsById = /* @__PURE__ */ new Map();
|
|
1337
|
+
this.runConfigsById = /* @__PURE__ */ new Map();
|
|
1220
1338
|
this.schedulerFiber = effect.Effect.runFork(this.createSchedulerEffect());
|
|
1221
1339
|
this.persistenceFiber = effect.Effect.runFork(
|
|
1222
1340
|
createPersistenceWorker(this.persistenceQueue)
|
|
@@ -1257,6 +1375,137 @@ var EffectRunner = class {
|
|
|
1257
1375
|
(item) => matcher(item.evaluator.getName() ?? "")
|
|
1258
1376
|
);
|
|
1259
1377
|
}
|
|
1378
|
+
async collectRunConfigs() {
|
|
1379
|
+
const runConfigs = await collectRunConfigsFromFiles(this.config.discovery);
|
|
1380
|
+
this.runConfigsById.clear();
|
|
1381
|
+
const byNameLower = /* @__PURE__ */ new Map();
|
|
1382
|
+
for (const item of runConfigs) {
|
|
1383
|
+
const id = item.runConfig.getName();
|
|
1384
|
+
const lower = id.toLowerCase();
|
|
1385
|
+
const prev = byNameLower.get(lower);
|
|
1386
|
+
if (prev !== void 0 && prev.filePath !== item.filePath) {
|
|
1387
|
+
throw new Error(
|
|
1388
|
+
`Duplicate RunConfig name "${id}" (matches "${prev.runConfig.getName()}" case-insensitively): ${prev.filePath} and ${item.filePath}`
|
|
1389
|
+
);
|
|
1390
|
+
}
|
|
1391
|
+
byNameLower.set(lower, item);
|
|
1392
|
+
this.runConfigsById.set(id, item);
|
|
1393
|
+
}
|
|
1394
|
+
return runConfigs;
|
|
1395
|
+
}
|
|
1396
|
+
async resolveRunConfigByName(name) {
|
|
1397
|
+
if (this.runConfigsById.size === 0) {
|
|
1398
|
+
await this.collectRunConfigs();
|
|
1399
|
+
}
|
|
1400
|
+
const key = validateRunConfigName(name, `RunConfig "${name.trim()}"`);
|
|
1401
|
+
const keyLower = key.toLowerCase();
|
|
1402
|
+
const matches = Array.from(this.runConfigsById.values()).filter(
|
|
1403
|
+
(item) => item.runConfig.getName().toLowerCase() === keyLower
|
|
1404
|
+
);
|
|
1405
|
+
if (matches.length === 0) {
|
|
1406
|
+
return void 0;
|
|
1407
|
+
}
|
|
1408
|
+
if (matches.length > 1) {
|
|
1409
|
+
throw new Error(
|
|
1410
|
+
`Multiple RunConfigs named "${name}": ${matches.map((m) => m.filePath).join(", ")}`
|
|
1411
|
+
);
|
|
1412
|
+
}
|
|
1413
|
+
return matches[0];
|
|
1414
|
+
}
|
|
1415
|
+
async expandRunConfigToJobs(collected) {
|
|
1416
|
+
if (this.datasetsById.size === 0) {
|
|
1417
|
+
await this.collectDatasets();
|
|
1418
|
+
}
|
|
1419
|
+
if (this.evaluatorsById.size === 0) {
|
|
1420
|
+
await this.collectEvaluators();
|
|
1421
|
+
}
|
|
1422
|
+
const rcName = collected.runConfig.getName();
|
|
1423
|
+
const jobs = [];
|
|
1424
|
+
const runs = collected.runConfig.getRuns();
|
|
1425
|
+
for (const [i, row] of runs.entries()) {
|
|
1426
|
+
const dsCollected = Array.from(this.datasetsById.values()).find(
|
|
1427
|
+
(d) => d.dataset === row.dataset
|
|
1428
|
+
);
|
|
1429
|
+
if (!dsCollected) {
|
|
1430
|
+
throw new Error(
|
|
1431
|
+
`RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getDisplayLabel()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
|
|
1432
|
+
);
|
|
1433
|
+
}
|
|
1434
|
+
let evaluatorIds;
|
|
1435
|
+
if ("evaluatorPattern" in row && typeof row.evaluatorPattern === "string") {
|
|
1436
|
+
const matcher = createNameMatcher(row.evaluatorPattern);
|
|
1437
|
+
const matched = Array.from(this.evaluatorsById.values()).filter(
|
|
1438
|
+
(item) => matcher(item.evaluator.getName() ?? "")
|
|
1439
|
+
);
|
|
1440
|
+
if (matched.length === 0) {
|
|
1441
|
+
throw new Error(
|
|
1442
|
+
`RunConfig "${rcName}" run[${i}]: no evaluator matched pattern "${row.evaluatorPattern}"`
|
|
1443
|
+
);
|
|
1444
|
+
}
|
|
1445
|
+
evaluatorIds = matched.map((item) => item.id);
|
|
1446
|
+
} else {
|
|
1447
|
+
const evaluators = row.evaluators;
|
|
1448
|
+
evaluatorIds = [];
|
|
1449
|
+
for (const ev of evaluators) {
|
|
1450
|
+
const found = Array.from(this.evaluatorsById.values()).find(
|
|
1451
|
+
(item) => item.evaluator === ev
|
|
1452
|
+
);
|
|
1453
|
+
if (!found) {
|
|
1454
|
+
throw new Error(
|
|
1455
|
+
`RunConfig "${rcName}" run[${i}]: evaluator "${getEvaluatorDisplayLabel(ev) ?? "unknown"}" was not found among discovered evaluator exports`
|
|
1456
|
+
);
|
|
1457
|
+
}
|
|
1458
|
+
evaluatorIds.push(found.id);
|
|
1459
|
+
}
|
|
1460
|
+
}
|
|
1461
|
+
const repetitions = "repetitions" in row && row.repetitions !== void 0 ? row.repetitions : 1;
|
|
1462
|
+
jobs.push({
|
|
1463
|
+
datasetId: dsCollected.id,
|
|
1464
|
+
evaluatorIds,
|
|
1465
|
+
runConfigName: rcName,
|
|
1466
|
+
runConfigDisplayLabel: collected.runConfig.getDisplayLabel(),
|
|
1467
|
+
runConfigTags: collected.runConfig.getTags(),
|
|
1468
|
+
repetitions
|
|
1469
|
+
});
|
|
1470
|
+
}
|
|
1471
|
+
return jobs;
|
|
1472
|
+
}
|
|
1473
|
+
async expandRunConfigNamesToJobs(names) {
|
|
1474
|
+
const jobs = [];
|
|
1475
|
+
for (const name of names) {
|
|
1476
|
+
const collected = await this.resolveRunConfigByName(name);
|
|
1477
|
+
if (!collected) {
|
|
1478
|
+
const known = await this.collectRunConfigs();
|
|
1479
|
+
const available = known.map((r) => r.runConfig.getName()).sort();
|
|
1480
|
+
throw new Error(
|
|
1481
|
+
available.length > 0 ? `RunConfig "${name}" not found. Available RunConfigs: ${available.join(", ")}` : `RunConfig "${name}" not found and no RunConfigs were discovered.`
|
|
1482
|
+
);
|
|
1483
|
+
}
|
|
1484
|
+
jobs.push(...await this.expandRunConfigToJobs(collected));
|
|
1485
|
+
}
|
|
1486
|
+
return jobs;
|
|
1487
|
+
}
|
|
1488
|
+
async runDatasetJobsWithSharedConcurrency(request) {
|
|
1489
|
+
const globalConcurrency = Math.max(1, request.globalConcurrency);
|
|
1490
|
+
const sem = effect.Effect.unsafeMakeSemaphore(globalConcurrency);
|
|
1491
|
+
const triggerId = request.triggerId ?? `trg-${crypto.randomUUID()}`;
|
|
1492
|
+
const snapshots = [];
|
|
1493
|
+
for (const job of request.jobs) {
|
|
1494
|
+
snapshots.push(
|
|
1495
|
+
await this.startDatasetRun({
|
|
1496
|
+
datasetId: job.datasetId,
|
|
1497
|
+
evaluatorIds: job.evaluatorIds,
|
|
1498
|
+
triggerId,
|
|
1499
|
+
maxConcurrency: this.config.maxConcurrency ?? 1,
|
|
1500
|
+
globalEvaluationSemaphore: sem,
|
|
1501
|
+
runConfigName: job.runConfigName,
|
|
1502
|
+
runConfigTags: job.runConfigTags,
|
|
1503
|
+
repetitions: job.repetitions
|
|
1504
|
+
})
|
|
1505
|
+
);
|
|
1506
|
+
}
|
|
1507
|
+
return snapshots;
|
|
1508
|
+
}
|
|
1260
1509
|
async searchTestCases(query) {
|
|
1261
1510
|
const testCases = await collectTestCasesFromFiles(this.config.discovery);
|
|
1262
1511
|
return searchCollectedTestCases(testCases, query);
|
|
@@ -1275,36 +1524,46 @@ var EffectRunner = class {
|
|
|
1275
1524
|
);
|
|
1276
1525
|
}
|
|
1277
1526
|
async runDatasetWith(request) {
|
|
1527
|
+
const runConfigName = validateRunConfigName(
|
|
1528
|
+
request.runConfigName,
|
|
1529
|
+
"runDatasetWith.runConfigName"
|
|
1530
|
+
);
|
|
1531
|
+
return this.startDatasetRun({
|
|
1532
|
+
datasetId: request.datasetId,
|
|
1533
|
+
evaluatorIds: request.evaluatorIds,
|
|
1534
|
+
triggerId: request.triggerId,
|
|
1535
|
+
maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
|
|
1536
|
+
repetitions: request.repetitions,
|
|
1537
|
+
runConfigName,
|
|
1538
|
+
runConfigTags: request.runConfigTags
|
|
1539
|
+
});
|
|
1540
|
+
}
|
|
1541
|
+
async startDatasetRun(params) {
|
|
1278
1542
|
if (this.datasetsById.size === 0) {
|
|
1279
1543
|
await this.collectDatasets();
|
|
1280
1544
|
}
|
|
1281
1545
|
if (this.evaluatorsById.size === 0) {
|
|
1282
1546
|
await this.collectEvaluators();
|
|
1283
1547
|
}
|
|
1284
|
-
const dataset = this.datasetsById.get(
|
|
1548
|
+
const dataset = this.datasetsById.get(params.datasetId);
|
|
1285
1549
|
if (!dataset) {
|
|
1286
|
-
throw new Error(`Unknown dataset: ${
|
|
1550
|
+
throw new Error(`Unknown dataset: ${params.datasetId}`);
|
|
1287
1551
|
}
|
|
1288
|
-
const selectedEvaluators =
|
|
1552
|
+
const selectedEvaluators = params.evaluatorIds.map((id) => this.evaluatorsById.get(id)).filter((value) => Boolean(value)).map((value) => ({ id: value.id, evaluator: value.evaluator }));
|
|
1289
1553
|
if (selectedEvaluators.length === 0) {
|
|
1290
1554
|
throw new Error("No evaluators selected for run");
|
|
1291
1555
|
}
|
|
1292
|
-
const selectedTestCases = await this.collectDatasetTestCases(
|
|
1293
|
-
const
|
|
1294
|
-
|
|
1295
|
-
|
|
1296
|
-
)
|
|
1297
|
-
const triggerId = request.triggerId ?? `trg-${crypto.randomUUID()}`;
|
|
1556
|
+
const selectedTestCases = await this.collectDatasetTestCases(params.datasetId);
|
|
1557
|
+
const repetitions = normalizeRunRepetitions(params.repetitions);
|
|
1558
|
+
const totalEvaluations = selectedTestCases.length * repetitions;
|
|
1559
|
+
const runConfigTags = [...params.runConfigTags ?? []];
|
|
1560
|
+
const triggerId = params.triggerId ?? `trg-${crypto.randomUUID()}`;
|
|
1298
1561
|
const runId = `run-${crypto.randomUUID()}`;
|
|
1299
|
-
const artifactPath = createArtifactPath(
|
|
1300
|
-
this.config.artifactDirectory,
|
|
1301
|
-
request.datasetId,
|
|
1302
|
-
runId
|
|
1303
|
-
);
|
|
1562
|
+
const artifactPath = createArtifactPath(this.config.artifactDirectory, params.datasetId, runId);
|
|
1304
1563
|
const snapshot = {
|
|
1305
1564
|
runId,
|
|
1306
|
-
datasetId:
|
|
1307
|
-
datasetName: dataset.dataset.
|
|
1565
|
+
datasetId: params.datasetId,
|
|
1566
|
+
datasetName: dataset.dataset.getDisplayLabel(),
|
|
1308
1567
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
1309
1568
|
queuedAt: Date.now(),
|
|
1310
1569
|
totalTestCases: totalEvaluations,
|
|
@@ -1324,8 +1583,8 @@ var EffectRunner = class {
|
|
|
1324
1583
|
const queuedEvent = {
|
|
1325
1584
|
type: "RunQueued",
|
|
1326
1585
|
runId,
|
|
1327
|
-
datasetId:
|
|
1328
|
-
datasetName: dataset.dataset.
|
|
1586
|
+
datasetId: params.datasetId,
|
|
1587
|
+
datasetName: dataset.dataset.getDisplayLabel(),
|
|
1329
1588
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
1330
1589
|
totalTestCases: totalEvaluations,
|
|
1331
1590
|
artifactPath
|
|
@@ -1338,17 +1597,20 @@ var EffectRunner = class {
|
|
|
1338
1597
|
payload: queuedEvent
|
|
1339
1598
|
})
|
|
1340
1599
|
);
|
|
1341
|
-
const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
|
|
1342
1600
|
await effect.Effect.runPromise(
|
|
1343
1601
|
effect.Queue.offer(this.runQueue, {
|
|
1344
1602
|
runId,
|
|
1345
1603
|
triggerId,
|
|
1346
|
-
datasetId:
|
|
1604
|
+
datasetId: params.datasetId,
|
|
1347
1605
|
dataset: dataset.dataset,
|
|
1348
1606
|
evaluators: selectedEvaluators,
|
|
1349
1607
|
testCases: selectedTestCases,
|
|
1350
1608
|
snapshot,
|
|
1351
|
-
maxConcurrency
|
|
1609
|
+
maxConcurrency: params.maxConcurrency,
|
|
1610
|
+
globalEvaluationSemaphore: params.globalEvaluationSemaphore,
|
|
1611
|
+
runConfigName: params.runConfigName,
|
|
1612
|
+
runConfigTags,
|
|
1613
|
+
repetitions
|
|
1352
1614
|
})
|
|
1353
1615
|
);
|
|
1354
1616
|
return snapshot;
|
|
@@ -1427,6 +1689,8 @@ function getDefaultConcurrency() {
|
|
|
1427
1689
|
function parseSimpleCliArgs(argv) {
|
|
1428
1690
|
const args = {
|
|
1429
1691
|
help: false,
|
|
1692
|
+
ci: false,
|
|
1693
|
+
runConfigNames: [],
|
|
1430
1694
|
unknownArgs: []
|
|
1431
1695
|
};
|
|
1432
1696
|
let index = 0;
|
|
@@ -1440,18 +1704,26 @@ function parseSimpleCliArgs(argv) {
|
|
|
1440
1704
|
args.help = true;
|
|
1441
1705
|
continue;
|
|
1442
1706
|
}
|
|
1707
|
+
if (token === "--ci") {
|
|
1708
|
+
args.ci = true;
|
|
1709
|
+
continue;
|
|
1710
|
+
}
|
|
1443
1711
|
if ((token === "--dataset" || token === "--datasetName") && argv[index + 1]) {
|
|
1444
1712
|
args.datasetName = argv[index + 1];
|
|
1445
1713
|
index += 1;
|
|
1446
1714
|
continue;
|
|
1447
1715
|
}
|
|
1448
|
-
if ((token === "--
|
|
1449
|
-
|
|
1716
|
+
if ((token === "--run-config" || token === "--runConfig") && argv[index + 1]) {
|
|
1717
|
+
const next = argv[index + 1];
|
|
1718
|
+
if (typeof next === "string") {
|
|
1719
|
+
args.runConfigNames.push(next);
|
|
1720
|
+
}
|
|
1450
1721
|
index += 1;
|
|
1451
1722
|
continue;
|
|
1452
1723
|
}
|
|
1453
1724
|
if ((token === "--concurrency" || token === "-c") && argv[index + 1]) {
|
|
1454
|
-
const
|
|
1725
|
+
const nextConc = argv[index + 1];
|
|
1726
|
+
const n = typeof nextConc === "string" ? parseInt(nextConc, 10) : Number.NaN;
|
|
1455
1727
|
if (!Number.isNaN(n) && n >= 1) {
|
|
1456
1728
|
args.concurrency = n;
|
|
1457
1729
|
}
|
|
@@ -1465,16 +1737,12 @@ function parseSimpleCliArgs(argv) {
|
|
|
1465
1737
|
function getSimpleCliUsage() {
|
|
1466
1738
|
return [
|
|
1467
1739
|
"Usage:",
|
|
1468
|
-
" eval-agents-simple run --
|
|
1469
|
-
" eval-agents-simple generate --dataset <
|
|
1740
|
+
" eval-agents-simple run --run-config <name> [--run-config <name> ...] [--concurrency N] [--ci]",
|
|
1741
|
+
" eval-agents-simple generate --dataset <datasetId>",
|
|
1470
1742
|
"",
|
|
1471
1743
|
"Options:",
|
|
1472
|
-
" --
|
|
1473
|
-
""
|
|
1474
|
-
"Pattern examples for --evaluator:",
|
|
1475
|
-
" score-evaluator exact name (case-insensitive)",
|
|
1476
|
-
' "*score*" wildcard pattern',
|
|
1477
|
-
' "/score/i" regex literal'
|
|
1744
|
+
" --ci With run: exit with code 1 if any test case fails.",
|
|
1745
|
+
" --concurrency, -c N Max concurrent evaluations (default: 4). Use 1 for sequential."
|
|
1478
1746
|
].join("\n");
|
|
1479
1747
|
}
|
|
1480
1748
|
|
|
@@ -1525,7 +1793,7 @@ function GenerateView({
|
|
|
1525
1793
|
const payload = testCases.map((item) => {
|
|
1526
1794
|
const tc = item.testCase;
|
|
1527
1795
|
return {
|
|
1528
|
-
name: item.testCase
|
|
1796
|
+
name: getTestCaseDisplayLabel(item.testCase),
|
|
1529
1797
|
input: item.testCase.getInput(),
|
|
1530
1798
|
output: typeof tc.getOutput === "function" ? tc.getOutput() : void 0
|
|
1531
1799
|
};
|
|
@@ -1538,7 +1806,7 @@ function GenerateView({
|
|
|
1538
1806
|
if (!cancelled) {
|
|
1539
1807
|
setResult({
|
|
1540
1808
|
count: payload.length,
|
|
1541
|
-
datasetName: dataset.dataset
|
|
1809
|
+
datasetName: getDatasetDisplayLabel(dataset.dataset),
|
|
1542
1810
|
outputPath
|
|
1543
1811
|
});
|
|
1544
1812
|
setTimeout(() => onComplete(), 200);
|
|
@@ -1591,7 +1859,7 @@ async function generateDatasetJsonCommandPlain(runner, datasetName) {
|
|
|
1591
1859
|
}
|
|
1592
1860
|
const testCases = await runner.collectDatasetTestCases(dataset.id);
|
|
1593
1861
|
const payload = testCases.map((item) => ({
|
|
1594
|
-
name: item.testCase
|
|
1862
|
+
name: getTestCaseDisplayLabel(item.testCase),
|
|
1595
1863
|
input: item.testCase.getInput(),
|
|
1596
1864
|
output: readOutput2(item.testCase)
|
|
1597
1865
|
}));
|
|
@@ -1599,7 +1867,7 @@ async function generateDatasetJsonCommandPlain(runner, datasetName) {
|
|
|
1599
1867
|
const outputPath = createOutputPath(absoluteDatasetPath);
|
|
1600
1868
|
await promises.writeFile(outputPath, `${JSON.stringify(payload, null, 2)}
|
|
1601
1869
|
`, "utf8");
|
|
1602
|
-
console.log(`Generated ${payload.length} test cases for dataset "${dataset.dataset
|
|
1870
|
+
console.log(`Generated ${payload.length} test cases for dataset "${getDatasetDisplayLabel(dataset.dataset)}".`);
|
|
1603
1871
|
console.log(`Wrote ${outputPath}`);
|
|
1604
1872
|
}
|
|
1605
1873
|
async function generateDatasetJsonCommandInk(runner, datasetName) {
|
|
@@ -1749,8 +2017,7 @@ function formatScorePart(item, _scoreToColor, options) {
|
|
|
1749
2017
|
}
|
|
1750
2018
|
function RunView({
|
|
1751
2019
|
runner,
|
|
1752
|
-
|
|
1753
|
-
evaluatorPattern,
|
|
2020
|
+
runConfigNames,
|
|
1754
2021
|
concurrency,
|
|
1755
2022
|
onComplete
|
|
1756
2023
|
}) {
|
|
@@ -1763,30 +2030,30 @@ function RunView({
|
|
|
1763
2030
|
const [summary, setSummary] = React.useState(null);
|
|
1764
2031
|
const [evaluatorNameById, setEvaluatorNameById] = React.useState(/* @__PURE__ */ new Map());
|
|
1765
2032
|
const runEval = React.useCallback(async () => {
|
|
1766
|
-
const
|
|
1767
|
-
if (
|
|
1768
|
-
|
|
1769
|
-
const available = known.map((item) => item.dataset.getName()).sort();
|
|
1770
|
-
onComplete(
|
|
1771
|
-
new Error(
|
|
1772
|
-
available.length > 0 ? `Dataset "${datasetName}" not found. Available: ${available.join(", ")}` : `Dataset "${datasetName}" not found.`
|
|
1773
|
-
)
|
|
1774
|
-
);
|
|
2033
|
+
const rcList = runConfigNames.filter((n) => n.trim().length > 0);
|
|
2034
|
+
if (rcList.length === 0) {
|
|
2035
|
+
onComplete(new Error("At least one RunConfig name is required."));
|
|
1775
2036
|
return;
|
|
1776
2037
|
}
|
|
1777
|
-
|
|
1778
|
-
|
|
1779
|
-
|
|
1780
|
-
|
|
1781
|
-
|
|
1782
|
-
|
|
1783
|
-
|
|
1784
|
-
|
|
1785
|
-
|
|
2038
|
+
setStartedEvaluations(0);
|
|
2039
|
+
setCompletedEvaluations(0);
|
|
2040
|
+
setTestCases([]);
|
|
2041
|
+
setRunningEvaluations([]);
|
|
2042
|
+
setSummary(null);
|
|
2043
|
+
let jobs;
|
|
2044
|
+
try {
|
|
2045
|
+
jobs = await runner.expandRunConfigNamesToJobs(rcList);
|
|
2046
|
+
} catch (err) {
|
|
2047
|
+
onComplete(err instanceof Error ? err : new Error(String(err)));
|
|
2048
|
+
return;
|
|
2049
|
+
}
|
|
2050
|
+
if (jobs.length === 0) {
|
|
2051
|
+
onComplete(new Error("No jobs expanded from RunConfigs."));
|
|
1786
2052
|
return;
|
|
1787
2053
|
}
|
|
2054
|
+
const allEvaluators = await runner.collectEvaluators();
|
|
1788
2055
|
const nameById = new Map(
|
|
1789
|
-
|
|
2056
|
+
allEvaluators.map((item) => [item.id, getEvaluatorDisplayLabel(item.evaluator) ?? item.id])
|
|
1790
2057
|
);
|
|
1791
2058
|
setEvaluatorNameById(nameById);
|
|
1792
2059
|
const aggregates = /* @__PURE__ */ new Map();
|
|
@@ -1794,21 +2061,30 @@ function RunView({
|
|
|
1794
2061
|
let overallScoreTotal = 0;
|
|
1795
2062
|
let overallScoreSumSq = 0;
|
|
1796
2063
|
let overallScoreCount = 0;
|
|
1797
|
-
const
|
|
2064
|
+
const batchPendingRunIds = /* @__PURE__ */ new Set();
|
|
2065
|
+
const runIdToLabel = /* @__PURE__ */ new Map();
|
|
2066
|
+
let batchReady = false;
|
|
2067
|
+
const completedRuns = /* @__PURE__ */ new Map();
|
|
2068
|
+
const done = new Promise((resolve5, reject) => {
|
|
1798
2069
|
const unsubscribe = runner.subscribeRunEvents((event) => {
|
|
2070
|
+
if (batchReady && "runId" in event && typeof event.runId === "string" && !batchPendingRunIds.has(event.runId)) {
|
|
2071
|
+
return;
|
|
2072
|
+
}
|
|
1799
2073
|
if (event.type === "TestCaseStarted") {
|
|
1800
|
-
setStartedEvaluations(
|
|
2074
|
+
setStartedEvaluations((c) => c + 1);
|
|
1801
2075
|
setRunningEvaluations((prev) => {
|
|
1802
2076
|
const withoutDuplicate = prev.filter(
|
|
1803
|
-
(item) => !(item.testCaseId === event.testCaseId && item.
|
|
2077
|
+
(item) => !(item.testCaseId === event.testCaseId && item.repetitionIndex === event.repetitionIndex && item.runId === event.runId)
|
|
1804
2078
|
);
|
|
1805
2079
|
return [
|
|
1806
2080
|
...withoutDuplicate,
|
|
1807
2081
|
{
|
|
2082
|
+
runId: event.runId,
|
|
1808
2083
|
testCaseId: event.testCaseId,
|
|
1809
2084
|
name: event.testCaseName,
|
|
1810
|
-
|
|
1811
|
-
|
|
2085
|
+
repetitionId: event.repetitionId,
|
|
2086
|
+
repetitionIndex: event.repetitionIndex,
|
|
2087
|
+
repetitionCount: event.repetitionCount,
|
|
1812
2088
|
startedTestCases: event.startedTestCases,
|
|
1813
2089
|
totalTestCases: event.totalTestCases
|
|
1814
2090
|
}
|
|
@@ -1844,9 +2120,12 @@ function RunView({
|
|
|
1844
2120
|
scoreItemsByEvaluatorScore.set(key, list);
|
|
1845
2121
|
}
|
|
1846
2122
|
}
|
|
2123
|
+
const label = runIdToLabel.get(event.runId);
|
|
2124
|
+
const compositeId = `${event.runId}:${event.testCaseId}`;
|
|
2125
|
+
const displayName = label !== void 0 ? `${label} \u203A ${event.testCaseName}` : event.testCaseName;
|
|
1847
2126
|
setTestCases((prev) => {
|
|
1848
2127
|
const byId = new Map(prev.map((tc) => [tc.testCaseId, tc]));
|
|
1849
|
-
const existing = byId.get(
|
|
2128
|
+
const existing = byId.get(compositeId);
|
|
1850
2129
|
const newEvent = {
|
|
1851
2130
|
evaluatorScores: event.evaluatorScores.map((item) => ({
|
|
1852
2131
|
evaluatorId: item.evaluatorId,
|
|
@@ -1863,12 +2142,12 @@ function RunView({
|
|
|
1863
2142
|
const isAggregated = events.length > 1;
|
|
1864
2143
|
const aggregatedEvaluatorScores = aggregateEvaluatorScores(events, nameById);
|
|
1865
2144
|
const merged = {
|
|
1866
|
-
name:
|
|
1867
|
-
testCaseId:
|
|
2145
|
+
name: displayName,
|
|
2146
|
+
testCaseId: compositeId,
|
|
1868
2147
|
completedTestCases: event.completedTestCases,
|
|
1869
2148
|
totalTestCases: event.totalTestCases,
|
|
1870
|
-
|
|
1871
|
-
|
|
2149
|
+
repetitionIndex: event.repetitionIndex,
|
|
2150
|
+
repetitionCount: event.repetitionCount,
|
|
1872
2151
|
durationMs: events.reduce((s, e) => s + e.durationMs, 0),
|
|
1873
2152
|
passed: events.every((e) => e.passed),
|
|
1874
2153
|
errorMessage: event.errorMessage,
|
|
@@ -1876,84 +2155,118 @@ function RunView({
|
|
|
1876
2155
|
aggregatedEvaluatorScores,
|
|
1877
2156
|
isAggregated
|
|
1878
2157
|
};
|
|
1879
|
-
byId.set(
|
|
1880
|
-
setCompletedEvaluations(event.completedTestCases);
|
|
1881
|
-
setRunningEvaluations(
|
|
1882
|
-
(running) => running.filter(
|
|
1883
|
-
(item) => !(item.testCaseId === event.testCaseId && item.rerunIndex === event.rerunIndex)
|
|
1884
|
-
)
|
|
1885
|
-
);
|
|
2158
|
+
byId.set(compositeId, merged);
|
|
1886
2159
|
return Array.from(byId.values());
|
|
1887
2160
|
});
|
|
2161
|
+
setCompletedEvaluations((c) => c + 1);
|
|
2162
|
+
setRunningEvaluations(
|
|
2163
|
+
(running) => running.filter(
|
|
2164
|
+
(item) => !(item.testCaseId === event.testCaseId && item.repetitionIndex === event.repetitionIndex && item.runId === event.runId)
|
|
2165
|
+
)
|
|
2166
|
+
);
|
|
1888
2167
|
}
|
|
1889
|
-
if (event.type === "
|
|
2168
|
+
if (event.type === "RunFailed") {
|
|
2169
|
+
if (batchReady && !batchPendingRunIds.has(event.runId)) {
|
|
2170
|
+
return;
|
|
2171
|
+
}
|
|
1890
2172
|
unsubscribe();
|
|
1891
|
-
|
|
2173
|
+
reject(new Error(`Run failed: ${event.errorMessage}`));
|
|
2174
|
+
return;
|
|
2175
|
+
}
|
|
2176
|
+
if (event.type === "RunCompleted") {
|
|
2177
|
+
if (!batchPendingRunIds.has(event.runId)) {
|
|
2178
|
+
return;
|
|
2179
|
+
}
|
|
2180
|
+
completedRuns.set(event.runId, event);
|
|
2181
|
+
batchPendingRunIds.delete(event.runId);
|
|
2182
|
+
if (batchPendingRunIds.size === 0) {
|
|
2183
|
+
unsubscribe();
|
|
2184
|
+
resolve5();
|
|
2185
|
+
}
|
|
1892
2186
|
}
|
|
1893
2187
|
});
|
|
1894
2188
|
});
|
|
1895
|
-
const
|
|
1896
|
-
|
|
1897
|
-
|
|
1898
|
-
concurrency
|
|
2189
|
+
const snapshots = await runner.runDatasetJobsWithSharedConcurrency({
|
|
2190
|
+
jobs,
|
|
2191
|
+
globalConcurrency: concurrency
|
|
1899
2192
|
});
|
|
2193
|
+
for (let i = 0; i < snapshots.length; i += 1) {
|
|
2194
|
+
const snap = snapshots[i];
|
|
2195
|
+
const job = jobs[i];
|
|
2196
|
+
if (snap && job) {
|
|
2197
|
+
runIdToLabel.set(
|
|
2198
|
+
snap.runId,
|
|
2199
|
+
`${job.runConfigDisplayLabel ?? job.runConfigName} \xB7 ${snap.datasetName}`
|
|
2200
|
+
);
|
|
2201
|
+
batchPendingRunIds.add(snap.runId);
|
|
2202
|
+
}
|
|
2203
|
+
}
|
|
2204
|
+
const totalUnits = snapshots.reduce((sum, s) => sum + s.totalTestCases, 0);
|
|
2205
|
+
batchReady = true;
|
|
2206
|
+
const runConfigLabels = await Promise.all(
|
|
2207
|
+
rcList.map(async (n) => {
|
|
2208
|
+
const collected = await runner.resolveRunConfigByName(n);
|
|
2209
|
+
return collected?.runConfig.getDisplayLabel() ?? n;
|
|
2210
|
+
})
|
|
2211
|
+
);
|
|
1900
2212
|
setRunInfo({
|
|
1901
|
-
|
|
1902
|
-
|
|
1903
|
-
|
|
1904
|
-
totalTestCases: snapshot.totalTestCases
|
|
2213
|
+
names: runConfigLabels,
|
|
2214
|
+
jobs: jobs.length,
|
|
2215
|
+
totalTestCases: totalUnits
|
|
1905
2216
|
});
|
|
1906
2217
|
setPhase("running");
|
|
1907
|
-
|
|
1908
|
-
|
|
1909
|
-
|
|
2218
|
+
try {
|
|
2219
|
+
await done;
|
|
2220
|
+
} catch (err) {
|
|
2221
|
+
onComplete(err instanceof Error ? err : new Error(String(err)));
|
|
1910
2222
|
return;
|
|
1911
2223
|
}
|
|
1912
|
-
|
|
2224
|
+
let passedTestCases = 0;
|
|
2225
|
+
let failedTestCases = 0;
|
|
2226
|
+
let totalTestCases = 0;
|
|
2227
|
+
const artifacts = [];
|
|
2228
|
+
for (const ev of completedRuns.values()) {
|
|
2229
|
+
passedTestCases += ev.passedTestCases;
|
|
2230
|
+
failedTestCases += ev.failedTestCases;
|
|
2231
|
+
totalTestCases += ev.totalTestCases;
|
|
2232
|
+
artifacts.push(ev.artifactPath);
|
|
2233
|
+
}
|
|
1913
2234
|
setSummary({
|
|
1914
|
-
passedTestCases
|
|
1915
|
-
failedTestCases
|
|
1916
|
-
totalTestCases
|
|
2235
|
+
passedTestCases,
|
|
2236
|
+
failedTestCases,
|
|
2237
|
+
totalTestCases,
|
|
1917
2238
|
overallScoreTotal,
|
|
1918
2239
|
overallScoreSumSq,
|
|
1919
2240
|
overallScoreCount,
|
|
1920
2241
|
aggregates: new Map(aggregates),
|
|
1921
2242
|
scoreItemsByEvaluatorScore: new Map(scoreItemsByEvaluatorScore),
|
|
1922
|
-
artifactPath:
|
|
2243
|
+
artifactPath: artifacts.join("\n")
|
|
1923
2244
|
});
|
|
1924
2245
|
setPhase("completed");
|
|
1925
|
-
|
|
1926
|
-
|
|
2246
|
+
const exitCode = failedTestCases > 0 ? 1 : 0;
|
|
2247
|
+
setTimeout(() => onComplete(void 0, exitCode), 200);
|
|
2248
|
+
}, [runner, runConfigNames, concurrency, onComplete]);
|
|
1927
2249
|
React.useEffect(() => {
|
|
1928
2250
|
void runEval();
|
|
1929
2251
|
}, [runEval]);
|
|
1930
2252
|
return /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", padding: 1, children: [
|
|
1931
2253
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginBottom: 1, children: /* @__PURE__ */ jsxRuntime.jsx(Banner, {}) }),
|
|
1932
2254
|
runInfo && /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginBottom: 1, children: [
|
|
1933
|
-
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
1934
|
-
|
|
1935
|
-
|
|
1936
|
-
" "
|
|
1937
|
-
] }),
|
|
1938
|
-
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: runInfo.runId })
|
|
2255
|
+
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", bold: true, children: [
|
|
2256
|
+
"RunConfigs",
|
|
2257
|
+
" "
|
|
1939
2258
|
] }),
|
|
2259
|
+
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: runInfo.names.join(", ") }),
|
|
1940
2260
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
1941
2261
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", bold: true, children: [
|
|
1942
|
-
"
|
|
2262
|
+
"Jobs",
|
|
1943
2263
|
" "
|
|
1944
2264
|
] }),
|
|
1945
|
-
runInfo.
|
|
2265
|
+
runInfo.jobs
|
|
1946
2266
|
] }),
|
|
1947
2267
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
1948
2268
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", bold: true, children: [
|
|
1949
|
-
"
|
|
1950
|
-
" "
|
|
1951
|
-
] }),
|
|
1952
|
-
runInfo.evaluatorNames.join(", ")
|
|
1953
|
-
] }),
|
|
1954
|
-
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
1955
|
-
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", bold: true, children: [
|
|
1956
|
-
"Test cases",
|
|
2269
|
+
"Evaluation units",
|
|
1957
2270
|
" "
|
|
1958
2271
|
] }),
|
|
1959
2272
|
runInfo.totalTestCases
|
|
@@ -1966,22 +2279,29 @@ function RunView({
|
|
|
1966
2279
|
label: `Evaluations ${completedEvaluations}/${runInfo?.totalTestCases ?? 0} completed \u2022 ${startedEvaluations}/${runInfo?.totalTestCases ?? 0} started`
|
|
1967
2280
|
}
|
|
1968
2281
|
),
|
|
1969
|
-
runningEvaluations.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", marginTop: 1, children: runningEvaluations.map((item) => /* @__PURE__ */ jsxRuntime.jsxs(
|
|
1970
|
-
|
|
1971
|
-
|
|
1972
|
-
|
|
1973
|
-
|
|
1974
|
-
|
|
1975
|
-
|
|
1976
|
-
|
|
1977
|
-
|
|
1978
|
-
|
|
1979
|
-
|
|
1980
|
-
|
|
1981
|
-
|
|
1982
|
-
|
|
1983
|
-
|
|
1984
|
-
|
|
2282
|
+
runningEvaluations.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", marginTop: 1, children: runningEvaluations.map((item) => /* @__PURE__ */ jsxRuntime.jsxs(
|
|
2283
|
+
ink.Text,
|
|
2284
|
+
{
|
|
2285
|
+
color: "yellow",
|
|
2286
|
+
children: [
|
|
2287
|
+
"[running ",
|
|
2288
|
+
item.startedTestCases,
|
|
2289
|
+
"/",
|
|
2290
|
+
item.totalTestCases,
|
|
2291
|
+
"] ",
|
|
2292
|
+
item.name,
|
|
2293
|
+
" ",
|
|
2294
|
+
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
2295
|
+
"(",
|
|
2296
|
+
item.repetitionIndex,
|
|
2297
|
+
"/",
|
|
2298
|
+
item.repetitionCount,
|
|
2299
|
+
")"
|
|
2300
|
+
] })
|
|
2301
|
+
]
|
|
2302
|
+
},
|
|
2303
|
+
`${item.runId ?? ""}:${item.testCaseId}:${item.repetitionId}:${item.repetitionIndex}`
|
|
2304
|
+
)) })
|
|
1985
2305
|
] }),
|
|
1986
2306
|
testCases.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginBottom: 0, children: [
|
|
1987
2307
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
@@ -1997,9 +2317,9 @@ function RunView({
|
|
|
1997
2317
|
" ",
|
|
1998
2318
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", children: [
|
|
1999
2319
|
"(",
|
|
2000
|
-
tc.
|
|
2320
|
+
tc.repetitionIndex,
|
|
2001
2321
|
"/",
|
|
2002
|
-
tc.
|
|
2322
|
+
tc.repetitionCount,
|
|
2003
2323
|
")"
|
|
2004
2324
|
] }),
|
|
2005
2325
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
@@ -2039,7 +2359,7 @@ function RunView({
|
|
|
2039
2359
|
})
|
|
2040
2360
|
] }) : null
|
|
2041
2361
|
] }),
|
|
2042
|
-
item.scores.length > 0 ? item.scores.map((s
|
|
2362
|
+
item.scores.length > 0 ? item.scores.map((s) => {
|
|
2043
2363
|
const def = s.def ?? getScoreById(s.id);
|
|
2044
2364
|
const scoreLabel = s.name ?? def?.name ?? def?.id ?? s.id;
|
|
2045
2365
|
return /* @__PURE__ */ jsxRuntime.jsxs(
|
|
@@ -2056,18 +2376,25 @@ function RunView({
|
|
|
2056
2376
|
})
|
|
2057
2377
|
]
|
|
2058
2378
|
},
|
|
2059
|
-
`${item.evaluatorId}-${s.id}-${
|
|
2379
|
+
`${item.evaluatorId}-${s.id}-${scoreLabel}`
|
|
2060
2380
|
);
|
|
2061
2381
|
}) : /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: " n/a" }),
|
|
2062
2382
|
!item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
|
|
2063
|
-
(log
|
|
2064
|
-
ink.
|
|
2383
|
+
(log) => log.type === "diff" ? /* @__PURE__ */ jsxRuntime.jsx(
|
|
2384
|
+
ink.Box,
|
|
2065
2385
|
{
|
|
2066
|
-
|
|
2067
|
-
children: line
|
|
2386
|
+
flexDirection: "column",
|
|
2387
|
+
children: getDiffLines(log).map(({ type, line }) => /* @__PURE__ */ jsxRuntime.jsx(
|
|
2388
|
+
ink.Text,
|
|
2389
|
+
{
|
|
2390
|
+
color: type === "remove" ? "red" : type === "add" ? "green" : "gray",
|
|
2391
|
+
children: line
|
|
2392
|
+
},
|
|
2393
|
+
`${type}:${line}`
|
|
2394
|
+
))
|
|
2068
2395
|
},
|
|
2069
|
-
|
|
2070
|
-
)
|
|
2396
|
+
`diff:${getDiffLines(log).map((x) => x.line).join("|")}`
|
|
2397
|
+
) : log.type === "log" ? /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: getLogLines(log).map((line) => /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: line }, line)) }, `log:${getLogLines(log).join("\n")}`) : null
|
|
2071
2398
|
) })
|
|
2072
2399
|
] }, item.evaluatorId))
|
|
2073
2400
|
] }, tc.testCaseId)) }),
|
|
@@ -2191,10 +2518,10 @@ function RunView({
|
|
|
2191
2518
|
] }, tc.testCaseId);
|
|
2192
2519
|
})
|
|
2193
2520
|
] }),
|
|
2194
|
-
/* @__PURE__ */ jsxRuntime.
|
|
2195
|
-
"artifact:
|
|
2196
|
-
summary.artifactPath
|
|
2197
|
-
] })
|
|
2521
|
+
/* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { marginTop: 1, flexDirection: "column", children: [
|
|
2522
|
+
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "artifact(s):" }),
|
|
2523
|
+
summary.artifactPath.split("\n").map((line) => /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: line }, line))
|
|
2524
|
+
] })
|
|
2198
2525
|
] })
|
|
2199
2526
|
] });
|
|
2200
2527
|
}
|
|
@@ -2406,25 +2733,14 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
|
|
|
2406
2733
|
}
|
|
2407
2734
|
return lines;
|
|
2408
2735
|
}
|
|
2409
|
-
async function
|
|
2410
|
-
const
|
|
2411
|
-
if (
|
|
2412
|
-
|
|
2413
|
-
const available = known.map((item) => item.dataset.getName()).sort();
|
|
2414
|
-
throw new Error(
|
|
2415
|
-
available.length > 0 ? `Dataset "${datasetName}" not found. Available datasets: ${available.join(", ")}` : `Dataset "${datasetName}" not found and no datasets were discovered.`
|
|
2416
|
-
);
|
|
2417
|
-
}
|
|
2418
|
-
const evaluators = await runner.resolveEvaluatorsByNamePattern(evaluatorPattern);
|
|
2419
|
-
if (evaluators.length === 0) {
|
|
2420
|
-
const known = await runner.collectEvaluators();
|
|
2421
|
-
const available = known.map((item) => item.evaluator.getName()).filter((name) => typeof name === "string").sort();
|
|
2422
|
-
throw new Error(
|
|
2423
|
-
available.length > 0 ? `No evaluator matched "${evaluatorPattern}". Available evaluators: ${available.join(", ")}` : `No evaluator matched "${evaluatorPattern}" and no evaluators were discovered.`
|
|
2424
|
-
);
|
|
2736
|
+
async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency) {
|
|
2737
|
+
const jobs = await runner.expandRunConfigNamesToJobs(runConfigNames);
|
|
2738
|
+
if (jobs.length === 0) {
|
|
2739
|
+
throw new Error("No jobs expanded from RunConfigs.");
|
|
2425
2740
|
}
|
|
2741
|
+
const evaluators = await runner.collectEvaluators();
|
|
2426
2742
|
const evaluatorNameById = new Map(
|
|
2427
|
-
evaluators.map((item) => [item.id, item.evaluator
|
|
2743
|
+
evaluators.map((item) => [item.id, getEvaluatorDisplayLabel(item.evaluator) ?? item.id])
|
|
2428
2744
|
);
|
|
2429
2745
|
const aggregates = /* @__PURE__ */ new Map();
|
|
2430
2746
|
const scoreItemsByEvaluatorScore = /* @__PURE__ */ new Map();
|
|
@@ -2432,11 +2748,11 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
|
|
|
2432
2748
|
let overallScoreTotal = 0;
|
|
2433
2749
|
let overallScoreSumSq = 0;
|
|
2434
2750
|
let overallScoreCount = 0;
|
|
2435
|
-
let
|
|
2436
|
-
let
|
|
2751
|
+
let globalStartedUnits = 0;
|
|
2752
|
+
let globalCompletedUnits = 0;
|
|
2437
2753
|
let totalCount = 0;
|
|
2438
2754
|
let runFinished = false;
|
|
2439
|
-
const
|
|
2755
|
+
const inFlightRepetitions = /* @__PURE__ */ new Set();
|
|
2440
2756
|
const spinnerFrames = ["\u280B", "\u2819", "\u2838", "\u2834", "\u2826", "\u2807"];
|
|
2441
2757
|
let spinnerIndex = 0;
|
|
2442
2758
|
function clearLine() {
|
|
@@ -2458,33 +2774,46 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
|
|
|
2458
2774
|
spinnerIndex += 1;
|
|
2459
2775
|
process.stdout.write(
|
|
2460
2776
|
`\r${colorize(frame, ansi2.cyan)} Running evaluations ${colorize(
|
|
2461
|
-
`${
|
|
2777
|
+
`${globalCompletedUnits}/${totalCount}`,
|
|
2462
2778
|
ansi2.bold
|
|
2463
|
-
)} completed ${colorize(`${
|
|
2779
|
+
)} completed ${colorize(`${globalStartedUnits}/${totalCount}`, ansi2.bold)} started ${colorize(`(${inFlightRepetitions.size} running)`, ansi2.dim)}`
|
|
2464
2780
|
);
|
|
2465
2781
|
}
|
|
2466
2782
|
let lastPrintedTestCaseId = null;
|
|
2467
2783
|
let lastPrintedLineCount = 0;
|
|
2468
2784
|
let spinnerTimer;
|
|
2469
|
-
const
|
|
2785
|
+
const batchPendingRunIds = /* @__PURE__ */ new Set();
|
|
2786
|
+
const runIdToLabel = /* @__PURE__ */ new Map();
|
|
2787
|
+
let batchReady = false;
|
|
2788
|
+
const completedRuns = /* @__PURE__ */ new Map();
|
|
2789
|
+
const done = new Promise((resolve5, reject) => {
|
|
2470
2790
|
const unsubscribe = runner.subscribeRunEvents((event) => {
|
|
2791
|
+
if (batchReady && "runId" in event && typeof event.runId === "string" && !batchPendingRunIds.has(event.runId)) {
|
|
2792
|
+
return;
|
|
2793
|
+
}
|
|
2794
|
+
const rowPrefix = typeof event.runId === "string" ? runIdToLabel.get(event.runId) : void 0;
|
|
2795
|
+
const pfx = rowPrefix !== void 0 ? `${colorize(`[${rowPrefix}]`, ansi2.dim)} ` : "";
|
|
2471
2796
|
if (event.type === "TestCaseStarted") {
|
|
2472
|
-
|
|
2473
|
-
|
|
2797
|
+
globalStartedUnits += 1;
|
|
2798
|
+
inFlightRepetitions.add(
|
|
2799
|
+
`${event.runId}:${event.testCaseId}:${event.repetitionId}:${event.repetitionIndex}`
|
|
2800
|
+
);
|
|
2474
2801
|
clearLine();
|
|
2475
2802
|
process.stdout.write(
|
|
2476
|
-
`${colorize(`[started ${event.startedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.
|
|
2803
|
+
`${pfx}${colorize(`[started ${event.startedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.repetitionIndex}/${event.repetitionCount})`, ansi2.cyan)} ${colorize("(running)", ansi2.dim)}
|
|
2477
2804
|
`
|
|
2478
2805
|
);
|
|
2479
2806
|
drawSpinner();
|
|
2480
2807
|
}
|
|
2481
2808
|
if (event.type === "TestCaseProgress") {
|
|
2482
|
-
|
|
2483
|
-
|
|
2809
|
+
globalCompletedUnits += 1;
|
|
2810
|
+
inFlightRepetitions.delete(
|
|
2811
|
+
`${event.runId}:${event.testCaseId}:${event.repetitionId}:${event.repetitionIndex}`
|
|
2812
|
+
);
|
|
2484
2813
|
const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
|
|
2485
2814
|
const averageScore = numericScores.length > 0 ? numericScores.reduce((sum, value) => sum + value, 0) / numericScores.length : void 0;
|
|
2486
|
-
const
|
|
2487
|
-
const existing = testCaseByTestId.get(
|
|
2815
|
+
const compositeId = `${event.runId}:${event.testCaseId}`;
|
|
2816
|
+
const existing = testCaseByTestId.get(compositeId) ?? {
|
|
2488
2817
|
name: event.testCaseName,
|
|
2489
2818
|
events: []
|
|
2490
2819
|
};
|
|
@@ -2494,7 +2823,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
|
|
|
2494
2823
|
durationMs: event.durationMs,
|
|
2495
2824
|
evaluatorScores: event.evaluatorScores
|
|
2496
2825
|
});
|
|
2497
|
-
testCaseByTestId.set(
|
|
2826
|
+
testCaseByTestId.set(compositeId, existing);
|
|
2498
2827
|
for (const item of event.evaluatorScores) {
|
|
2499
2828
|
const numeric = toNumericScoreFromScores(item.scores);
|
|
2500
2829
|
if (numeric !== void 0) {
|
|
@@ -2523,10 +2852,10 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
|
|
|
2523
2852
|
scoreItemsByEvaluatorScore.set(key, list);
|
|
2524
2853
|
}
|
|
2525
2854
|
}
|
|
2526
|
-
const isSameTestCase = lastPrintedTestCaseId ===
|
|
2527
|
-
const
|
|
2855
|
+
const isSameTestCase = lastPrintedTestCaseId === compositeId;
|
|
2856
|
+
const isLastRepetition = event.repetitionIndex >= event.repetitionCount;
|
|
2528
2857
|
const isNonTty = !process.stdout.isTTY;
|
|
2529
|
-
const skipPrintNonTty = isNonTty && event.
|
|
2858
|
+
const skipPrintNonTty = isNonTty && event.repetitionCount > 1 && !isLastRepetition;
|
|
2530
2859
|
if (isSameTestCase && lastPrintedLineCount > 0 && !skipPrintNonTty) {
|
|
2531
2860
|
cursorUp(lastPrintedLineCount);
|
|
2532
2861
|
}
|
|
@@ -2537,7 +2866,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
|
|
|
2537
2866
|
const lines = [];
|
|
2538
2867
|
const statusSuffix = event.errorMessage ? ` ${colorize("ERROR", `${ansi2.bold}${ansi2.red}`)}` : "";
|
|
2539
2868
|
lines.push(
|
|
2540
|
-
`${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.
|
|
2869
|
+
`${pfx}${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.repetitionIndex}/${event.repetitionCount})`, ansi2.cyan)} ${colorize(`(${durationMs}ms)`, ansi2.dim)}${statusSuffix}`
|
|
2541
2870
|
);
|
|
2542
2871
|
if (event.errorMessage) {
|
|
2543
2872
|
lines.push(colorize(event.errorMessage, ansi2.red));
|
|
@@ -2568,64 +2897,102 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
|
|
|
2568
2897
|
}
|
|
2569
2898
|
}
|
|
2570
2899
|
if (!skipPrintNonTty) {
|
|
2571
|
-
for (let i = 0; i < lines.length; i
|
|
2900
|
+
for (let i = 0; i < lines.length; i += 1) {
|
|
2572
2901
|
process.stdout.write(`\r\x1B[2K${lines[i]}
|
|
2573
2902
|
`);
|
|
2574
2903
|
}
|
|
2575
|
-
lastPrintedTestCaseId =
|
|
2904
|
+
lastPrintedTestCaseId = compositeId;
|
|
2576
2905
|
lastPrintedLineCount = lines.length;
|
|
2577
2906
|
}
|
|
2578
2907
|
drawSpinner();
|
|
2579
2908
|
}
|
|
2580
|
-
if (event.type === "
|
|
2909
|
+
if (event.type === "RunFailed") {
|
|
2910
|
+
if (batchReady && !batchPendingRunIds.has(event.runId)) {
|
|
2911
|
+
return;
|
|
2912
|
+
}
|
|
2581
2913
|
runFinished = true;
|
|
2582
2914
|
clearLine();
|
|
2583
2915
|
unsubscribe();
|
|
2584
|
-
|
|
2916
|
+
reject(new Error(`Run failed: ${event.errorMessage}`));
|
|
2917
|
+
return;
|
|
2918
|
+
}
|
|
2919
|
+
if (event.type === "RunCompleted") {
|
|
2920
|
+
if (!batchPendingRunIds.has(event.runId)) {
|
|
2921
|
+
return;
|
|
2922
|
+
}
|
|
2923
|
+
completedRuns.set(event.runId, event);
|
|
2924
|
+
batchPendingRunIds.delete(event.runId);
|
|
2925
|
+
if (batchPendingRunIds.size === 0) {
|
|
2926
|
+
runFinished = true;
|
|
2927
|
+
clearLine();
|
|
2928
|
+
unsubscribe();
|
|
2929
|
+
resolve5();
|
|
2930
|
+
}
|
|
2585
2931
|
}
|
|
2586
2932
|
});
|
|
2587
2933
|
});
|
|
2588
|
-
|
|
2589
|
-
|
|
2590
|
-
|
|
2591
|
-
|
|
2934
|
+
console.log(colorize("=== Eval Run Started (RunConfigs) ===", `${ansi2.bold}${ansi2.cyan}`));
|
|
2935
|
+
for (const name of runConfigNames) {
|
|
2936
|
+
const collected = await runner.resolveRunConfigByName(name);
|
|
2937
|
+
const label = collected?.runConfig.getDisplayLabel() ?? name;
|
|
2938
|
+
console.log(`RunConfig: ${colorize(label, ansi2.bold)}`);
|
|
2939
|
+
}
|
|
2940
|
+
console.log(`Jobs: ${colorize(String(jobs.length), ansi2.bold)}`);
|
|
2941
|
+
console.log(`Shared concurrency: ${colorize(String(concurrency), ansi2.bold)}`);
|
|
2942
|
+
console.log("");
|
|
2943
|
+
const snapshots = await runner.runDatasetJobsWithSharedConcurrency({
|
|
2944
|
+
jobs,
|
|
2945
|
+
globalConcurrency: concurrency
|
|
2592
2946
|
});
|
|
2593
|
-
|
|
2594
|
-
|
|
2595
|
-
|
|
2596
|
-
|
|
2597
|
-
|
|
2598
|
-
|
|
2599
|
-
|
|
2600
|
-
|
|
2947
|
+
for (let i = 0; i < snapshots.length; i += 1) {
|
|
2948
|
+
const snap = snapshots[i];
|
|
2949
|
+
const job = jobs[i];
|
|
2950
|
+
if (snap && job) {
|
|
2951
|
+
runIdToLabel.set(
|
|
2952
|
+
snap.runId,
|
|
2953
|
+
`${job.runConfigDisplayLabel ?? job.runConfigName} \xB7 ${snap.datasetName}`
|
|
2954
|
+
);
|
|
2955
|
+
batchPendingRunIds.add(snap.runId);
|
|
2956
|
+
}
|
|
2957
|
+
}
|
|
2958
|
+
totalCount = snapshots.reduce((sum, s) => sum + s.totalTestCases, 0);
|
|
2959
|
+
console.log(`Total evaluation units: ${colorize(String(totalCount), ansi2.bold)}`);
|
|
2601
2960
|
console.log("");
|
|
2961
|
+
batchReady = true;
|
|
2602
2962
|
drawSpinner();
|
|
2603
2963
|
spinnerTimer = setInterval(drawSpinner, 100);
|
|
2604
|
-
|
|
2964
|
+
await done;
|
|
2605
2965
|
if (spinnerTimer) {
|
|
2606
2966
|
clearInterval(spinnerTimer);
|
|
2607
2967
|
}
|
|
2608
|
-
if (finalEvent.type === "RunFailed") {
|
|
2609
|
-
throw new Error(`Run failed: ${finalEvent.errorMessage}`);
|
|
2610
|
-
}
|
|
2611
|
-
const completed = finalEvent;
|
|
2612
2968
|
console.log("");
|
|
2613
|
-
console.log(colorize("=== Run Summary ===", `${ansi2.bold}${ansi2.cyan}`));
|
|
2614
|
-
|
|
2615
|
-
|
|
2616
|
-
|
|
2617
|
-
|
|
2618
|
-
|
|
2619
|
-
|
|
2620
|
-
|
|
2621
|
-
)
|
|
2622
|
-
|
|
2969
|
+
console.log(colorize("=== Run Summary (all jobs) ===", `${ansi2.bold}${ansi2.cyan}`));
|
|
2970
|
+
for (const snap of snapshots) {
|
|
2971
|
+
const completed = completedRuns.get(snap.runId);
|
|
2972
|
+
if (!completed) {
|
|
2973
|
+
continue;
|
|
2974
|
+
}
|
|
2975
|
+
const label = runIdToLabel.get(snap.runId) ?? snap.runId;
|
|
2976
|
+
console.log("");
|
|
2977
|
+
console.log(colorize(`\u2014 ${label}`, ansi2.magenta));
|
|
2978
|
+
console.log(
|
|
2979
|
+
`- passed: ${colorize(`${completed.passedTestCases}/${completed.totalTestCases}`, ansi2.green)}`
|
|
2980
|
+
);
|
|
2981
|
+
console.log(
|
|
2982
|
+
`- failed: ${colorize(
|
|
2983
|
+
`${completed.failedTestCases}/${completed.totalTestCases}`,
|
|
2984
|
+
completed.failedTestCases > 0 ? ansi2.red : ansi2.dim
|
|
2985
|
+
)}`
|
|
2986
|
+
);
|
|
2987
|
+
console.log(`- artifact: ${colorize(completed.artifactPath, ansi2.dim)}`);
|
|
2988
|
+
}
|
|
2623
2989
|
if (overallScoreCount > 0) {
|
|
2624
2990
|
const overallAverage = overallScoreTotal / overallScoreCount;
|
|
2625
2991
|
const overallSd = sampleStdDev2(overallScoreTotal, overallScoreSumSq, overallScoreCount);
|
|
2626
2992
|
const avgStr = overallSd !== void 0 ? `${overallAverage.toFixed(2)} \xB1 ${overallSd.toFixed(2)}` : overallAverage.toFixed(2);
|
|
2993
|
+
console.log("");
|
|
2627
2994
|
console.log(
|
|
2628
|
-
`- overall avg score: ${colorize(
|
|
2995
|
+
`- overall avg score (all jobs): ${colorize(
|
|
2629
2996
|
avgStr,
|
|
2630
2997
|
scoreToColor(overallAverage)
|
|
2631
2998
|
)} ${colorize(createBar2(overallAverage), ansi2.dim)}`
|
|
@@ -2666,22 +3033,28 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
|
|
|
2666
3033
|
);
|
|
2667
3034
|
}
|
|
2668
3035
|
}
|
|
2669
|
-
|
|
3036
|
+
let failedTestCasesTotal = 0;
|
|
3037
|
+
for (const snap of snapshots) {
|
|
3038
|
+
const completed = completedRuns.get(snap.runId);
|
|
3039
|
+
if (completed) {
|
|
3040
|
+
failedTestCasesTotal += completed.failedTestCases;
|
|
3041
|
+
}
|
|
3042
|
+
}
|
|
3043
|
+
return failedTestCasesTotal > 0 ? 1 : 0;
|
|
2670
3044
|
}
|
|
2671
|
-
async function
|
|
3045
|
+
async function runSimpleEvalRunConfigsInk(runner, runConfigNames, concurrency) {
|
|
2672
3046
|
return new Promise((resolve5, reject) => {
|
|
2673
3047
|
const app = ink.render(
|
|
2674
3048
|
React__namespace.createElement(RunView, {
|
|
2675
3049
|
runner,
|
|
2676
|
-
|
|
2677
|
-
evaluatorPattern,
|
|
3050
|
+
runConfigNames,
|
|
2678
3051
|
concurrency,
|
|
2679
|
-
onComplete: (err) => {
|
|
3052
|
+
onComplete: (err, exitCode) => {
|
|
2680
3053
|
app.unmount();
|
|
2681
3054
|
if (err) {
|
|
2682
3055
|
reject(err);
|
|
2683
3056
|
} else {
|
|
2684
|
-
resolve5();
|
|
3057
|
+
resolve5(exitCode ?? 0);
|
|
2685
3058
|
}
|
|
2686
3059
|
}
|
|
2687
3060
|
})
|
|
@@ -2707,12 +3080,22 @@ async function main() {
|
|
|
2707
3080
|
if (!args.command) {
|
|
2708
3081
|
printUsageAndExit(1);
|
|
2709
3082
|
}
|
|
2710
|
-
if (
|
|
2711
|
-
|
|
2712
|
-
|
|
3083
|
+
if (args.command === "run") {
|
|
3084
|
+
if (args.runConfigNames.length === 0) {
|
|
3085
|
+
console.error(
|
|
3086
|
+
"Missing required --run-config <name> (repeat the flag to queue multiple RunConfigs)."
|
|
3087
|
+
);
|
|
3088
|
+
printUsageAndExit(1);
|
|
3089
|
+
}
|
|
3090
|
+
if (args.datasetName !== void 0) {
|
|
3091
|
+
console.error(
|
|
3092
|
+
"The run command no longer accepts --dataset; use --run-config <RunConfig name>."
|
|
3093
|
+
);
|
|
3094
|
+
printUsageAndExit(1);
|
|
3095
|
+
}
|
|
2713
3096
|
}
|
|
2714
|
-
if (args.command === "
|
|
2715
|
-
console.error("
|
|
3097
|
+
if (args.command === "generate" && args.runConfigNames.length > 0) {
|
|
3098
|
+
console.error("generate does not accept --run-config.");
|
|
2716
3099
|
printUsageAndExit(1);
|
|
2717
3100
|
}
|
|
2718
3101
|
const useInk = process.stdout.isTTY === true;
|
|
@@ -2723,17 +3106,24 @@ async function main() {
|
|
|
2723
3106
|
try {
|
|
2724
3107
|
if (args.command === "run") {
|
|
2725
3108
|
const concurrency = args.concurrency ?? getDefaultConcurrency();
|
|
2726
|
-
await (useInk ?
|
|
3109
|
+
const exitCode = await (useInk ? runSimpleEvalRunConfigsInk : runSimpleEvalRunConfigsPlain)(
|
|
2727
3110
|
runner,
|
|
2728
|
-
args.
|
|
2729
|
-
args.evaluatorPattern,
|
|
3111
|
+
args.runConfigNames,
|
|
2730
3112
|
concurrency
|
|
2731
3113
|
);
|
|
3114
|
+
if (args.ci && exitCode !== 0) {
|
|
3115
|
+
process.exit(1);
|
|
3116
|
+
}
|
|
2732
3117
|
return;
|
|
2733
3118
|
}
|
|
3119
|
+
const genDataset = args.datasetName;
|
|
3120
|
+
if (!genDataset) {
|
|
3121
|
+
console.error("Missing required --dataset <datasetId> argument.");
|
|
3122
|
+
printUsageAndExit(1);
|
|
3123
|
+
}
|
|
2734
3124
|
await (useInk ? generateDatasetJsonCommandInk : generateDatasetJsonCommandPlain)(
|
|
2735
3125
|
runner,
|
|
2736
|
-
|
|
3126
|
+
genDataset
|
|
2737
3127
|
);
|
|
2738
3128
|
} finally {
|
|
2739
3129
|
await runner.shutdown();
|