@m4trix/evals 0.25.1 → 0.26.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +29 -7
- package/dist/cli-simple.cjs +831 -450
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +832 -451
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +531 -270
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +531 -270
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +888 -509
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +201 -7
- package/dist/index.js +878 -513
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli-simple.cjs
CHANGED
|
@@ -3,10 +3,10 @@
|
|
|
3
3
|
|
|
4
4
|
var crypto = require('crypto');
|
|
5
5
|
var effect = require('effect');
|
|
6
|
-
var
|
|
6
|
+
var promises = require('fs/promises');
|
|
7
7
|
var path = require('path');
|
|
8
|
+
var fs = require('fs');
|
|
8
9
|
var jitiModule = require('jiti');
|
|
9
|
-
var promises = require('fs/promises');
|
|
10
10
|
var url = require('url');
|
|
11
11
|
var diff = require('diff');
|
|
12
12
|
var stringify = require('fast-json-stable-stringify');
|
|
@@ -39,12 +39,178 @@ var jitiModule__namespace = /*#__PURE__*/_interopNamespace(jitiModule);
|
|
|
39
39
|
var stringify__default = /*#__PURE__*/_interopDefault(stringify);
|
|
40
40
|
var React__namespace = /*#__PURE__*/_interopNamespace(React);
|
|
41
41
|
|
|
42
|
+
var ENTITY_ID_PATTERN = /^[a-zA-Z0-9_-]+$/;
|
|
43
|
+
function makeEntityIdSchema(brand, label) {
|
|
44
|
+
return effect.Schema.String.pipe(
|
|
45
|
+
effect.Schema.trimmed(),
|
|
46
|
+
effect.Schema.minLength(1, {
|
|
47
|
+
message: () => `${label} must be non-empty.`
|
|
48
|
+
}),
|
|
49
|
+
effect.Schema.pattern(ENTITY_ID_PATTERN, {
|
|
50
|
+
message: () => `${label} may only contain letters, digits, underscores, and hyphens (no spaces). Examples: "my-nightly", "my_nightly", "myNightly".`
|
|
51
|
+
}),
|
|
52
|
+
effect.Schema.brand(brand)
|
|
53
|
+
);
|
|
54
|
+
}
|
|
55
|
+
var RunConfigNameSchema = makeEntityIdSchema("RunConfigName", "RunConfig name");
|
|
56
|
+
makeEntityIdSchema("EvaluatorName", "Evaluator name");
|
|
57
|
+
makeEntityIdSchema("TestCaseName", "Test case name");
|
|
58
|
+
function validateWithSchema(schema, raw, context) {
|
|
59
|
+
const trimmed = raw.trim();
|
|
60
|
+
const decode = effect.Schema.decodeUnknownEither(
|
|
61
|
+
schema
|
|
62
|
+
);
|
|
63
|
+
const result = decode(trimmed);
|
|
64
|
+
if (effect.Either.isLeft(result)) {
|
|
65
|
+
throw new Error(`${context}: ${effect.ParseResult.TreeFormatter.formatErrorSync(result.left)}`);
|
|
66
|
+
}
|
|
67
|
+
return result.right;
|
|
68
|
+
}
|
|
69
|
+
function validateRunConfigName(raw, context) {
|
|
70
|
+
return validateWithSchema(RunConfigNameSchema, raw, context);
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
// src/evals/evaluator.ts
|
|
74
|
+
function getEvaluatorDisplayLabel(evaluator) {
|
|
75
|
+
if (typeof evaluator.getDisplayLabel === "function") {
|
|
76
|
+
const label = evaluator.getDisplayLabel();
|
|
77
|
+
if (label !== void 0) {
|
|
78
|
+
return label;
|
|
79
|
+
}
|
|
80
|
+
}
|
|
81
|
+
return typeof evaluator.getName === "function" ? evaluator.getName() : void 0;
|
|
82
|
+
}
|
|
83
|
+
function getEvaluatorTagList(evaluator) {
|
|
84
|
+
return typeof evaluator.getTags === "function" ? [...evaluator.getTags()] : [];
|
|
85
|
+
}
|
|
86
|
+
async function loadRunSnapshotsFromArtifacts(config) {
|
|
87
|
+
const baseDir = path.resolve(config.artifactDirectory);
|
|
88
|
+
let entries;
|
|
89
|
+
try {
|
|
90
|
+
entries = await promises.readdir(baseDir);
|
|
91
|
+
} catch {
|
|
92
|
+
return [];
|
|
93
|
+
}
|
|
94
|
+
const jsonlFiles = entries.filter((name) => name.endsWith(".jsonl"));
|
|
95
|
+
const snapshots = [];
|
|
96
|
+
for (const fileName of jsonlFiles) {
|
|
97
|
+
const filePath = path.join(baseDir, fileName);
|
|
98
|
+
try {
|
|
99
|
+
const snapshot = await parseArtifactToSnapshot(filePath, config);
|
|
100
|
+
if (snapshot) {
|
|
101
|
+
snapshots.push(snapshot);
|
|
102
|
+
}
|
|
103
|
+
} catch {
|
|
104
|
+
}
|
|
105
|
+
}
|
|
106
|
+
return snapshots.sort((a, b) => b.queuedAt - a.queuedAt);
|
|
107
|
+
}
|
|
108
|
+
async function parseArtifactToSnapshot(filePath, _config) {
|
|
109
|
+
const content = await promises.readFile(filePath, "utf8");
|
|
110
|
+
const lines = content.split("\n").filter((line) => line.trim().length > 0);
|
|
111
|
+
if (lines.length === 0) {
|
|
112
|
+
return null;
|
|
113
|
+
}
|
|
114
|
+
let runQueued = null;
|
|
115
|
+
let runCompleted = null;
|
|
116
|
+
let runFailed = null;
|
|
117
|
+
let runStarted = null;
|
|
118
|
+
for (const line of lines) {
|
|
119
|
+
try {
|
|
120
|
+
const event = JSON.parse(line);
|
|
121
|
+
const type = event.type;
|
|
122
|
+
if (type === "RunQueued") {
|
|
123
|
+
runQueued = {
|
|
124
|
+
runId: event.runId,
|
|
125
|
+
datasetId: event.datasetId,
|
|
126
|
+
datasetName: event.datasetName,
|
|
127
|
+
evaluatorIds: event.evaluatorIds,
|
|
128
|
+
totalTestCases: event.totalTestCases ?? 0,
|
|
129
|
+
artifactPath: event.artifactPath ?? filePath,
|
|
130
|
+
ts: event.ts
|
|
131
|
+
};
|
|
132
|
+
}
|
|
133
|
+
if (type === "RunStarted") {
|
|
134
|
+
runStarted = { startedAt: event.startedAt };
|
|
135
|
+
}
|
|
136
|
+
if (type === "RunCompleted") {
|
|
137
|
+
runCompleted = {
|
|
138
|
+
passedTestCases: event.passedTestCases,
|
|
139
|
+
failedTestCases: event.failedTestCases,
|
|
140
|
+
totalTestCases: event.totalTestCases,
|
|
141
|
+
finishedAt: event.finishedAt
|
|
142
|
+
};
|
|
143
|
+
}
|
|
144
|
+
if (type === "RunFailed") {
|
|
145
|
+
runFailed = {
|
|
146
|
+
finishedAt: event.finishedAt,
|
|
147
|
+
errorMessage: event.errorMessage
|
|
148
|
+
};
|
|
149
|
+
}
|
|
150
|
+
} catch {
|
|
151
|
+
}
|
|
152
|
+
}
|
|
153
|
+
if (!runQueued) {
|
|
154
|
+
return null;
|
|
155
|
+
}
|
|
156
|
+
const artifactPath = filePath;
|
|
157
|
+
const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
|
|
158
|
+
const progress = aggregateTestCaseProgress(lines);
|
|
159
|
+
const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
|
|
160
|
+
const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
|
|
161
|
+
const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
|
|
162
|
+
return {
|
|
163
|
+
runId: runQueued.runId,
|
|
164
|
+
datasetId: runQueued.datasetId,
|
|
165
|
+
datasetName: runQueued.datasetName,
|
|
166
|
+
evaluatorIds: runQueued.evaluatorIds,
|
|
167
|
+
queuedAt: runQueued.ts ?? 0,
|
|
168
|
+
startedAt: runStarted?.startedAt,
|
|
169
|
+
finishedAt: runCompleted?.finishedAt ?? runFailed?.finishedAt,
|
|
170
|
+
totalTestCases: runQueued.totalTestCases,
|
|
171
|
+
completedTestCases,
|
|
172
|
+
passedTestCases,
|
|
173
|
+
failedTestCases,
|
|
174
|
+
status,
|
|
175
|
+
artifactPath,
|
|
176
|
+
errorMessage: runFailed?.errorMessage
|
|
177
|
+
};
|
|
178
|
+
}
|
|
179
|
+
function aggregateTestCaseProgress(lines) {
|
|
180
|
+
let completedTestCases = 0;
|
|
181
|
+
const testCasePassedBy = /* @__PURE__ */ new Map();
|
|
182
|
+
for (const line of lines) {
|
|
183
|
+
try {
|
|
184
|
+
const event = JSON.parse(line);
|
|
185
|
+
if (event.type === "TestCaseProgress") {
|
|
186
|
+
const ev = event;
|
|
187
|
+
completedTestCases = ev.completedTestCases ?? completedTestCases;
|
|
188
|
+
const id = ev.testCaseId;
|
|
189
|
+
const current = testCasePassedBy.get(id);
|
|
190
|
+
testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
|
|
191
|
+
}
|
|
192
|
+
} catch {
|
|
193
|
+
}
|
|
194
|
+
}
|
|
195
|
+
let passedTestCases = 0;
|
|
196
|
+
let failedTestCases = 0;
|
|
197
|
+
for (const passed of testCasePassedBy.values()) {
|
|
198
|
+
if (passed) {
|
|
199
|
+
passedTestCases += 1;
|
|
200
|
+
} else {
|
|
201
|
+
failedTestCases += 1;
|
|
202
|
+
}
|
|
203
|
+
}
|
|
204
|
+
return { completedTestCases, passedTestCases, failedTestCases };
|
|
205
|
+
}
|
|
206
|
+
|
|
42
207
|
// src/runner/config.ts
|
|
43
208
|
var defaultRunnerConfig = {
|
|
44
209
|
discovery: {
|
|
45
210
|
rootDir: process.cwd(),
|
|
46
211
|
datasetSuffixes: [".dataset.ts", ".dataset.tsx", ".dataset.js", ".dataset.mjs"],
|
|
47
212
|
evaluatorSuffixes: [".evaluator.ts", ".evaluator.tsx", ".evaluator.js", ".evaluator.mjs"],
|
|
213
|
+
runConfigSuffixes: [".run-config.ts", ".run-config.tsx", ".run-config.js", ".run-config.mjs"],
|
|
48
214
|
testCaseSuffixes: [".test-case.ts", ".test-case.tsx", ".test-case.js", ".test-case.mjs"],
|
|
49
215
|
excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
|
|
50
216
|
},
|
|
@@ -70,6 +236,11 @@ function toRunnerConfigOverrides(config) {
|
|
|
70
236
|
} else if (rawDiscovery?.evaluatorSuffixes !== void 0) {
|
|
71
237
|
discovery.evaluatorSuffixes = rawDiscovery.evaluatorSuffixes;
|
|
72
238
|
}
|
|
239
|
+
if (rawDiscovery?.runConfigFilePatterns !== void 0) {
|
|
240
|
+
discovery.runConfigSuffixes = rawDiscovery.runConfigFilePatterns;
|
|
241
|
+
} else if (rawDiscovery?.runConfigSuffixes !== void 0) {
|
|
242
|
+
discovery.runConfigSuffixes = rawDiscovery.runConfigSuffixes;
|
|
243
|
+
}
|
|
73
244
|
if (rawDiscovery?.testCaseFilePatterns !== void 0) {
|
|
74
245
|
discovery.testCaseSuffixes = rawDiscovery.testCaseFilePatterns;
|
|
75
246
|
} else if (rawDiscovery?.testCaseSuffixes !== void 0) {
|
|
@@ -168,6 +339,9 @@ function isDatasetLike(value) {
|
|
|
168
339
|
function isEvaluatorLike(value) {
|
|
169
340
|
return hasMethod(value, "getName") && hasMethod(value, "resolveContext") && hasMethod(value, "getEvaluateFn");
|
|
170
341
|
}
|
|
342
|
+
function isRunConfigLike(value) {
|
|
343
|
+
return hasMethod(value, "getName") && hasMethod(value, "getRuns") && typeof value.getRuns === "function";
|
|
344
|
+
}
|
|
171
345
|
function isTestCaseLike(value) {
|
|
172
346
|
return hasMethod(value, "getName") && hasMethod(value, "getTags") && hasMethod(value, "getInput");
|
|
173
347
|
}
|
|
@@ -256,6 +430,23 @@ async function collectEvaluatorsFromFiles(config) {
|
|
|
256
430
|
);
|
|
257
431
|
return found.flat();
|
|
258
432
|
}
|
|
433
|
+
async function collectRunConfigsFromFiles(config) {
|
|
434
|
+
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
435
|
+
const matched = files.filter((filePath) => hasOneSuffix(filePath, config.runConfigSuffixes));
|
|
436
|
+
const found = await Promise.all(
|
|
437
|
+
matched.map(async (absolutePath) => {
|
|
438
|
+
const exports = await loadModuleExports(absolutePath);
|
|
439
|
+
const runConfigs = exports.filter(isRunConfigLike);
|
|
440
|
+
const relPath = path.relative(config.rootDir, absolutePath);
|
|
441
|
+
return runConfigs.map((runConfig) => ({
|
|
442
|
+
id: runConfig.getName(),
|
|
443
|
+
filePath: relPath,
|
|
444
|
+
runConfig
|
|
445
|
+
}));
|
|
446
|
+
})
|
|
447
|
+
);
|
|
448
|
+
return found.flat();
|
|
449
|
+
}
|
|
259
450
|
async function collectTestCasesFromFiles(config) {
|
|
260
451
|
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
261
452
|
const matched = files.filter((filePath) => hasOneSuffix(filePath, config.testCaseSuffixes));
|
|
@@ -409,6 +600,17 @@ function getDiffLines(entry) {
|
|
|
409
600
|
});
|
|
410
601
|
}
|
|
411
602
|
|
|
603
|
+
// src/evals/test-case.ts
|
|
604
|
+
function getTestCaseDisplayLabel(testCase) {
|
|
605
|
+
if (typeof testCase.getDisplayLabel === "function") {
|
|
606
|
+
return testCase.getDisplayLabel();
|
|
607
|
+
}
|
|
608
|
+
return typeof testCase.getName === "function" ? testCase.getName() : "";
|
|
609
|
+
}
|
|
610
|
+
function getTestCaseTagList(testCase) {
|
|
611
|
+
return typeof testCase.getTags === "function" ? [...testCase.getTags()] : [];
|
|
612
|
+
}
|
|
613
|
+
|
|
412
614
|
// src/evals/metric.ts
|
|
413
615
|
var registry = /* @__PURE__ */ new Map();
|
|
414
616
|
var Metric = {
|
|
@@ -432,6 +634,54 @@ function getMetricById(id) {
|
|
|
432
634
|
return registry.get(id);
|
|
433
635
|
}
|
|
434
636
|
|
|
637
|
+
// src/evals/aggregators.ts
|
|
638
|
+
function aggregateTokenCountSum(values) {
|
|
639
|
+
const initial = {
|
|
640
|
+
input: 0,
|
|
641
|
+
output: 0,
|
|
642
|
+
inputCached: 0,
|
|
643
|
+
outputCached: 0
|
|
644
|
+
};
|
|
645
|
+
return values.reduce(
|
|
646
|
+
(acc, v) => ({
|
|
647
|
+
input: acc.input + (v.input ?? 0),
|
|
648
|
+
output: acc.output + (v.output ?? 0),
|
|
649
|
+
inputCached: acc.inputCached + (v.inputCached ?? 0),
|
|
650
|
+
outputCached: acc.outputCached + (v.outputCached ?? 0)
|
|
651
|
+
}),
|
|
652
|
+
initial
|
|
653
|
+
);
|
|
654
|
+
}
|
|
655
|
+
function aggregateLatencyAverage(values) {
|
|
656
|
+
if (values.length === 0) {
|
|
657
|
+
return { ms: 0 };
|
|
658
|
+
}
|
|
659
|
+
const sum = values.reduce((s, v) => s + v.ms, 0);
|
|
660
|
+
return { ms: sum / values.length };
|
|
661
|
+
}
|
|
662
|
+
|
|
663
|
+
// src/evals/metrics/standard.ts
|
|
664
|
+
Metric.of({
|
|
665
|
+
id: "token-count",
|
|
666
|
+
name: "Tokens",
|
|
667
|
+
aggregate: aggregateTokenCountSum,
|
|
668
|
+
format: (data, options) => {
|
|
669
|
+
const input = data.input ?? 0;
|
|
670
|
+
const output = data.output ?? 0;
|
|
671
|
+
const inputCached = data.inputCached ?? 0;
|
|
672
|
+
const outputCached = data.outputCached ?? 0;
|
|
673
|
+
const cached = inputCached + outputCached;
|
|
674
|
+
const base = `in:${input} out:${output} cached:${cached}`;
|
|
675
|
+
return options?.isAggregated ? `Total: ${base}` : base;
|
|
676
|
+
}
|
|
677
|
+
});
|
|
678
|
+
Metric.of({
|
|
679
|
+
id: "latency",
|
|
680
|
+
name: "Latency",
|
|
681
|
+
aggregate: aggregateLatencyAverage,
|
|
682
|
+
format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
|
|
683
|
+
});
|
|
684
|
+
|
|
435
685
|
// src/evals/score.ts
|
|
436
686
|
var registry2 = /* @__PURE__ */ new Map();
|
|
437
687
|
function formatScoreData(def, data, options) {
|
|
@@ -540,54 +790,6 @@ function getScoreById(id) {
|
|
|
540
790
|
return registry2.get(id);
|
|
541
791
|
}
|
|
542
792
|
|
|
543
|
-
// src/evals/aggregators.ts
|
|
544
|
-
function aggregateTokenCountSum(values) {
|
|
545
|
-
const initial = {
|
|
546
|
-
input: 0,
|
|
547
|
-
output: 0,
|
|
548
|
-
inputCached: 0,
|
|
549
|
-
outputCached: 0
|
|
550
|
-
};
|
|
551
|
-
return values.reduce(
|
|
552
|
-
(acc, v) => ({
|
|
553
|
-
input: acc.input + (v.input ?? 0),
|
|
554
|
-
output: acc.output + (v.output ?? 0),
|
|
555
|
-
inputCached: acc.inputCached + (v.inputCached ?? 0),
|
|
556
|
-
outputCached: acc.outputCached + (v.outputCached ?? 0)
|
|
557
|
-
}),
|
|
558
|
-
initial
|
|
559
|
-
);
|
|
560
|
-
}
|
|
561
|
-
function aggregateLatencyAverage(values) {
|
|
562
|
-
if (values.length === 0) {
|
|
563
|
-
return { ms: 0 };
|
|
564
|
-
}
|
|
565
|
-
const sum = values.reduce((s, v) => s + v.ms, 0);
|
|
566
|
-
return { ms: sum / values.length };
|
|
567
|
-
}
|
|
568
|
-
|
|
569
|
-
// src/evals/metrics/standard.ts
|
|
570
|
-
Metric.of({
|
|
571
|
-
id: "token-count",
|
|
572
|
-
name: "Tokens",
|
|
573
|
-
aggregate: aggregateTokenCountSum,
|
|
574
|
-
format: (data, options) => {
|
|
575
|
-
const input = data.input ?? 0;
|
|
576
|
-
const output = data.output ?? 0;
|
|
577
|
-
const inputCached = data.inputCached ?? 0;
|
|
578
|
-
const outputCached = data.outputCached ?? 0;
|
|
579
|
-
const cached = inputCached + outputCached;
|
|
580
|
-
const base = `in:${input} out:${output} cached:${cached}`;
|
|
581
|
-
return options?.isAggregated ? `Total: ${base}` : base;
|
|
582
|
-
}
|
|
583
|
-
});
|
|
584
|
-
Metric.of({
|
|
585
|
-
id: "latency",
|
|
586
|
-
name: "Latency",
|
|
587
|
-
aggregate: aggregateLatencyAverage,
|
|
588
|
-
format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
|
|
589
|
-
});
|
|
590
|
-
|
|
591
793
|
// src/evals/scores/standard.ts
|
|
592
794
|
Score.of({
|
|
593
795
|
id: "percent",
|
|
@@ -731,15 +933,17 @@ function readOutput(testCase) {
|
|
|
731
933
|
}
|
|
732
934
|
return candidate.getOutput();
|
|
733
935
|
}
|
|
734
|
-
function buildEvaluationUnits(testCases) {
|
|
936
|
+
function buildEvaluationUnits(testCases, repetitionCount) {
|
|
937
|
+
const count = Math.max(1, repetitionCount);
|
|
735
938
|
const units = [];
|
|
736
939
|
for (const testCaseItem of testCases) {
|
|
737
|
-
const
|
|
738
|
-
for (let r = 0; r <
|
|
940
|
+
const repetitionId = `rep-${crypto.randomUUID()}`;
|
|
941
|
+
for (let r = 0; r < count; r++) {
|
|
739
942
|
units.push({
|
|
740
943
|
testCaseItem,
|
|
741
|
-
|
|
742
|
-
|
|
944
|
+
repetitionId,
|
|
945
|
+
repetitionIndex: r + 1,
|
|
946
|
+
repetitionCount: count
|
|
743
947
|
});
|
|
744
948
|
}
|
|
745
949
|
}
|
|
@@ -752,7 +956,7 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
|
|
|
752
956
|
return path.join(artifactDirectory, `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`);
|
|
753
957
|
}
|
|
754
958
|
function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
|
|
755
|
-
const { testCaseItem,
|
|
959
|
+
const { testCaseItem, repetitionId, repetitionIndex, repetitionCount } = unit;
|
|
756
960
|
return effect.Effect.gen(function* () {
|
|
757
961
|
const evaluatorRunId = `run-${crypto.randomUUID()}`;
|
|
758
962
|
const started = Date.now();
|
|
@@ -761,11 +965,12 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
761
965
|
type: "TestCaseStarted",
|
|
762
966
|
runId: task.runId,
|
|
763
967
|
testCaseId: testCaseItem.id,
|
|
764
|
-
testCaseName: testCaseItem.testCase
|
|
968
|
+
testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
|
|
765
969
|
startedTestCases: startedEvaluations,
|
|
766
970
|
totalTestCases: totalEvaluations,
|
|
767
|
-
|
|
768
|
-
|
|
971
|
+
repetitionId,
|
|
972
|
+
repetitionIndex,
|
|
973
|
+
repetitionCount
|
|
769
974
|
});
|
|
770
975
|
const evaluatorScores = [];
|
|
771
976
|
let testCaseError;
|
|
@@ -799,8 +1004,15 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
799
1004
|
meta: {
|
|
800
1005
|
triggerId: task.triggerId,
|
|
801
1006
|
runId: evaluatorRunId,
|
|
802
|
-
datasetId: task.datasetId
|
|
1007
|
+
datasetId: task.datasetId,
|
|
1008
|
+
repetitionId,
|
|
1009
|
+
repetitionIndex,
|
|
1010
|
+
repetitionCount,
|
|
1011
|
+
runConfigName: task.runConfigName
|
|
803
1012
|
},
|
|
1013
|
+
testCaseTags: getTestCaseTagList(testCaseItem.testCase),
|
|
1014
|
+
runConfigTags: task.runConfigTags,
|
|
1015
|
+
evaluatorTags: getEvaluatorTagList(evaluator),
|
|
804
1016
|
logDiff,
|
|
805
1017
|
log,
|
|
806
1018
|
createError
|
|
@@ -843,18 +1055,19 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
843
1055
|
});
|
|
844
1056
|
}
|
|
845
1057
|
}
|
|
846
|
-
const
|
|
1058
|
+
const repetitionPassedThis = evaluatorScores.every((s) => s.passed);
|
|
847
1059
|
const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [n + 1, n + 1]);
|
|
848
1060
|
const progressEvent = {
|
|
849
1061
|
type: "TestCaseProgress",
|
|
850
1062
|
runId: task.runId,
|
|
851
1063
|
testCaseId: testCaseItem.id,
|
|
852
|
-
testCaseName: testCaseItem.testCase
|
|
1064
|
+
testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
|
|
853
1065
|
completedTestCases: completedEvaluations,
|
|
854
1066
|
totalTestCases: totalEvaluations,
|
|
855
|
-
|
|
856
|
-
|
|
857
|
-
|
|
1067
|
+
repetitionId,
|
|
1068
|
+
repetitionIndex,
|
|
1069
|
+
repetitionCount,
|
|
1070
|
+
passed: repetitionPassedThis,
|
|
858
1071
|
durationMs: Date.now() - started,
|
|
859
1072
|
evaluatorScores,
|
|
860
1073
|
output,
|
|
@@ -875,9 +1088,9 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
875
1088
|
(map) => {
|
|
876
1089
|
const key = testCaseItem.id;
|
|
877
1090
|
const existing = map.get(key) ?? { completedCount: 0, results: [] };
|
|
878
|
-
const newResults = [...existing.results,
|
|
1091
|
+
const newResults = [...existing.results, repetitionPassedThis];
|
|
879
1092
|
const newCompletedCount = existing.completedCount + 1;
|
|
880
|
-
const isLast = newCompletedCount ===
|
|
1093
|
+
const isLast = newCompletedCount === repetitionCount;
|
|
881
1094
|
const newMap = new Map(map);
|
|
882
1095
|
newMap.set(key, {
|
|
883
1096
|
completedCount: newCompletedCount,
|
|
@@ -914,10 +1127,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
914
1127
|
runId: task.runId,
|
|
915
1128
|
startedAt
|
|
916
1129
|
});
|
|
917
|
-
const totalEvaluations = task.testCases.
|
|
918
|
-
(sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
|
|
919
|
-
0
|
|
920
|
-
);
|
|
1130
|
+
const totalEvaluations = task.testCases.length * Math.max(1, task.repetitions);
|
|
921
1131
|
const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
|
|
922
1132
|
const completedRef = yield* effect.Ref.make(0);
|
|
923
1133
|
const startedRef = yield* effect.Ref.make(0);
|
|
@@ -926,7 +1136,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
926
1136
|
const testCaseResultsRef = yield* effect.Ref.make(
|
|
927
1137
|
/* @__PURE__ */ new Map()
|
|
928
1138
|
);
|
|
929
|
-
const evaluationUnits = buildEvaluationUnits(task.testCases);
|
|
1139
|
+
const evaluationUnits = buildEvaluationUnits(task.testCases, task.repetitions);
|
|
930
1140
|
const processEvaluation = (unit) => processOneEvaluation(
|
|
931
1141
|
task,
|
|
932
1142
|
unit,
|
|
@@ -940,11 +1150,20 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
940
1150
|
failedRef,
|
|
941
1151
|
testCaseResultsRef
|
|
942
1152
|
);
|
|
943
|
-
|
|
944
|
-
|
|
945
|
-
|
|
946
|
-
|
|
947
|
-
|
|
1153
|
+
const globalSem = task.globalEvaluationSemaphore;
|
|
1154
|
+
if (globalSem !== void 0) {
|
|
1155
|
+
yield* effect.Effect.forEach(
|
|
1156
|
+
evaluationUnits,
|
|
1157
|
+
(unit) => globalSem.withPermits(1)(processEvaluation(unit)),
|
|
1158
|
+
{ concurrency: "unbounded", discard: true }
|
|
1159
|
+
);
|
|
1160
|
+
} else {
|
|
1161
|
+
yield* effect.Effect.forEach(
|
|
1162
|
+
evaluationUnits,
|
|
1163
|
+
processEvaluation,
|
|
1164
|
+
maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
|
|
1165
|
+
);
|
|
1166
|
+
}
|
|
948
1167
|
const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* effect.Effect.all([
|
|
949
1168
|
effect.Ref.get(completedRef),
|
|
950
1169
|
effect.Ref.get(passedRef),
|
|
@@ -961,144 +1180,53 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
961
1180
|
artifactPath: task.snapshot.artifactPath
|
|
962
1181
|
};
|
|
963
1182
|
yield* updateSnapshot(task.runId, (snapshot) => ({
|
|
964
|
-
...snapshot,
|
|
965
|
-
status: "completed",
|
|
966
|
-
completedTestCases: completedEvaluations,
|
|
967
|
-
passedTestCases: passedUniqueTestCases,
|
|
968
|
-
failedTestCases: failedUniqueTestCases,
|
|
969
|
-
finishedAt
|
|
970
|
-
}));
|
|
971
|
-
yield* publishEvent(completedEvent);
|
|
972
|
-
yield* effect.Queue.offer(persistenceQueue, {
|
|
973
|
-
runId: task.runId,
|
|
974
|
-
artifactPath: task.snapshot.artifactPath,
|
|
975
|
-
payload: completedEvent
|
|
976
|
-
});
|
|
977
|
-
yield* publishEvent({
|
|
978
|
-
type: "ArtifactFlushed",
|
|
979
|
-
runId: task.runId,
|
|
980
|
-
artifactPath: task.snapshot.artifactPath
|
|
981
|
-
});
|
|
982
|
-
});
|
|
983
|
-
|
|
984
|
-
|
|
985
|
-
|
|
986
|
-
|
|
987
|
-
|
|
988
|
-
} catch {
|
|
989
|
-
return [];
|
|
990
|
-
}
|
|
991
|
-
const jsonlFiles = entries.filter((name) => name.endsWith(".jsonl"));
|
|
992
|
-
const snapshots = [];
|
|
993
|
-
for (const fileName of jsonlFiles) {
|
|
994
|
-
const filePath = path.join(baseDir, fileName);
|
|
995
|
-
try {
|
|
996
|
-
const snapshot = await parseArtifactToSnapshot(filePath, config);
|
|
997
|
-
if (snapshot) {
|
|
998
|
-
snapshots.push(snapshot);
|
|
999
|
-
}
|
|
1000
|
-
} catch {
|
|
1001
|
-
}
|
|
1002
|
-
}
|
|
1003
|
-
return snapshots.sort((a, b) => b.queuedAt - a.queuedAt);
|
|
1004
|
-
}
|
|
1005
|
-
async function parseArtifactToSnapshot(filePath, _config) {
|
|
1006
|
-
const content = await promises.readFile(filePath, "utf8");
|
|
1007
|
-
const lines = content.split("\n").filter((line) => line.trim().length > 0);
|
|
1008
|
-
if (lines.length === 0) {
|
|
1009
|
-
return null;
|
|
1010
|
-
}
|
|
1011
|
-
let runQueued = null;
|
|
1012
|
-
let runCompleted = null;
|
|
1013
|
-
let runFailed = null;
|
|
1014
|
-
let runStarted = null;
|
|
1015
|
-
for (const line of lines) {
|
|
1016
|
-
try {
|
|
1017
|
-
const event = JSON.parse(line);
|
|
1018
|
-
const type = event.type;
|
|
1019
|
-
if (type === "RunQueued") {
|
|
1020
|
-
runQueued = {
|
|
1021
|
-
runId: event.runId,
|
|
1022
|
-
datasetId: event.datasetId,
|
|
1023
|
-
datasetName: event.datasetName,
|
|
1024
|
-
evaluatorIds: event.evaluatorIds,
|
|
1025
|
-
totalTestCases: event.totalTestCases ?? 0,
|
|
1026
|
-
artifactPath: event.artifactPath ?? filePath,
|
|
1027
|
-
ts: event.ts
|
|
1028
|
-
};
|
|
1029
|
-
}
|
|
1030
|
-
if (type === "RunStarted") {
|
|
1031
|
-
runStarted = { startedAt: event.startedAt };
|
|
1032
|
-
}
|
|
1033
|
-
if (type === "RunCompleted") {
|
|
1034
|
-
runCompleted = {
|
|
1035
|
-
passedTestCases: event.passedTestCases,
|
|
1036
|
-
failedTestCases: event.failedTestCases,
|
|
1037
|
-
totalTestCases: event.totalTestCases,
|
|
1038
|
-
finishedAt: event.finishedAt
|
|
1039
|
-
};
|
|
1040
|
-
}
|
|
1041
|
-
if (type === "RunFailed") {
|
|
1042
|
-
runFailed = {
|
|
1043
|
-
finishedAt: event.finishedAt,
|
|
1044
|
-
errorMessage: event.errorMessage
|
|
1045
|
-
};
|
|
1046
|
-
}
|
|
1047
|
-
} catch {
|
|
1048
|
-
}
|
|
1183
|
+
...snapshot,
|
|
1184
|
+
status: "completed",
|
|
1185
|
+
completedTestCases: completedEvaluations,
|
|
1186
|
+
passedTestCases: passedUniqueTestCases,
|
|
1187
|
+
failedTestCases: failedUniqueTestCases,
|
|
1188
|
+
finishedAt
|
|
1189
|
+
}));
|
|
1190
|
+
yield* publishEvent(completedEvent);
|
|
1191
|
+
yield* effect.Queue.offer(persistenceQueue, {
|
|
1192
|
+
runId: task.runId,
|
|
1193
|
+
artifactPath: task.snapshot.artifactPath,
|
|
1194
|
+
payload: completedEvent
|
|
1195
|
+
});
|
|
1196
|
+
yield* publishEvent({
|
|
1197
|
+
type: "ArtifactFlushed",
|
|
1198
|
+
runId: task.runId,
|
|
1199
|
+
artifactPath: task.snapshot.artifactPath
|
|
1200
|
+
});
|
|
1201
|
+
});
|
|
1202
|
+
|
|
1203
|
+
// src/runner/name-pattern.ts
|
|
1204
|
+
function parseRegexLiteral(pattern) {
|
|
1205
|
+
if (!pattern.startsWith("/")) {
|
|
1206
|
+
return void 0;
|
|
1049
1207
|
}
|
|
1050
|
-
|
|
1051
|
-
|
|
1208
|
+
const lastSlash = pattern.lastIndexOf("/");
|
|
1209
|
+
if (lastSlash <= 0) {
|
|
1210
|
+
return void 0;
|
|
1052
1211
|
}
|
|
1053
|
-
const artifactPath = filePath;
|
|
1054
|
-
const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
|
|
1055
|
-
const progress = aggregateTestCaseProgress(lines);
|
|
1056
|
-
const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
|
|
1057
|
-
const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
|
|
1058
|
-
const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
|
|
1059
1212
|
return {
|
|
1060
|
-
|
|
1061
|
-
|
|
1062
|
-
datasetName: runQueued.datasetName,
|
|
1063
|
-
evaluatorIds: runQueued.evaluatorIds,
|
|
1064
|
-
queuedAt: runQueued.ts ?? 0,
|
|
1065
|
-
startedAt: runStarted?.startedAt,
|
|
1066
|
-
finishedAt: runCompleted?.finishedAt ?? runFailed?.finishedAt,
|
|
1067
|
-
totalTestCases: runQueued.totalTestCases,
|
|
1068
|
-
completedTestCases,
|
|
1069
|
-
passedTestCases,
|
|
1070
|
-
failedTestCases,
|
|
1071
|
-
status,
|
|
1072
|
-
artifactPath,
|
|
1073
|
-
errorMessage: runFailed?.errorMessage
|
|
1213
|
+
source: pattern.slice(1, lastSlash),
|
|
1214
|
+
flags: pattern.slice(lastSlash + 1)
|
|
1074
1215
|
};
|
|
1075
1216
|
}
|
|
1076
|
-
function
|
|
1077
|
-
|
|
1078
|
-
const
|
|
1079
|
-
|
|
1080
|
-
|
|
1081
|
-
|
|
1082
|
-
if (event.type === "TestCaseProgress") {
|
|
1083
|
-
const ev = event;
|
|
1084
|
-
completedTestCases = ev.completedTestCases ?? completedTestCases;
|
|
1085
|
-
const id = ev.testCaseId;
|
|
1086
|
-
const current = testCasePassedBy.get(id);
|
|
1087
|
-
testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
|
|
1088
|
-
}
|
|
1089
|
-
} catch {
|
|
1090
|
-
}
|
|
1217
|
+
function createNameMatcher(pattern) {
|
|
1218
|
+
const normalizedPattern = pattern.trim();
|
|
1219
|
+
const regexLiteral = parseRegexLiteral(normalizedPattern);
|
|
1220
|
+
if (regexLiteral) {
|
|
1221
|
+
const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
|
|
1222
|
+
return (value) => regex.test(value);
|
|
1091
1223
|
}
|
|
1092
|
-
|
|
1093
|
-
|
|
1094
|
-
|
|
1095
|
-
|
|
1096
|
-
passedTestCases += 1;
|
|
1097
|
-
} else {
|
|
1098
|
-
failedTestCases += 1;
|
|
1099
|
-
}
|
|
1224
|
+
if (normalizedPattern.includes("*")) {
|
|
1225
|
+
const escaped = normalizedPattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
|
|
1226
|
+
const regex = new RegExp(`^${escaped}$`, "i");
|
|
1227
|
+
return (value) => regex.test(value);
|
|
1100
1228
|
}
|
|
1101
|
-
return
|
|
1229
|
+
return (value) => value.toLowerCase() === normalizedPattern.toLowerCase();
|
|
1102
1230
|
}
|
|
1103
1231
|
async function appendJsonLine(artifactPath, payload) {
|
|
1104
1232
|
await promises.mkdir(path.dirname(artifactPath), { recursive: true });
|
|
@@ -1157,32 +1285,12 @@ function searchCollectedTestCases(all, query) {
|
|
|
1157
1285
|
}
|
|
1158
1286
|
|
|
1159
1287
|
// src/runner/api.ts
|
|
1160
|
-
function
|
|
1161
|
-
|
|
1162
|
-
|
|
1163
|
-
|
|
1164
|
-
const lastSlash = pattern.lastIndexOf("/");
|
|
1165
|
-
if (lastSlash <= 0) {
|
|
1166
|
-
return void 0;
|
|
1167
|
-
}
|
|
1168
|
-
return {
|
|
1169
|
-
source: pattern.slice(1, lastSlash),
|
|
1170
|
-
flags: pattern.slice(lastSlash + 1)
|
|
1171
|
-
};
|
|
1172
|
-
}
|
|
1173
|
-
function createNameMatcher(pattern) {
|
|
1174
|
-
const normalizedPattern = pattern.trim();
|
|
1175
|
-
const regexLiteral = parseRegexLiteral(normalizedPattern);
|
|
1176
|
-
if (regexLiteral) {
|
|
1177
|
-
const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
|
|
1178
|
-
return (value) => regex.test(value);
|
|
1288
|
+
function normalizeRunRepetitions(value) {
|
|
1289
|
+
const n = value ?? 1;
|
|
1290
|
+
if (!Number.isInteger(n) || n < 1) {
|
|
1291
|
+
throw new Error(`repetitions must be a positive integer, got ${String(value)}`);
|
|
1179
1292
|
}
|
|
1180
|
-
|
|
1181
|
-
const escaped = normalizedPattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
|
|
1182
|
-
const regex = new RegExp(`^${escaped}$`, "i");
|
|
1183
|
-
return (value) => regex.test(value);
|
|
1184
|
-
}
|
|
1185
|
-
return (value) => value.toLowerCase() === normalizedPattern.toLowerCase();
|
|
1293
|
+
return n;
|
|
1186
1294
|
}
|
|
1187
1295
|
function mergeRunnerOverrides(base, next) {
|
|
1188
1296
|
if (!base) {
|
|
@@ -1217,6 +1325,7 @@ var EffectRunner = class {
|
|
|
1217
1325
|
this.listeners = /* @__PURE__ */ new Set();
|
|
1218
1326
|
this.datasetsById = /* @__PURE__ */ new Map();
|
|
1219
1327
|
this.evaluatorsById = /* @__PURE__ */ new Map();
|
|
1328
|
+
this.runConfigsById = /* @__PURE__ */ new Map();
|
|
1220
1329
|
this.schedulerFiber = effect.Effect.runFork(this.createSchedulerEffect());
|
|
1221
1330
|
this.persistenceFiber = effect.Effect.runFork(
|
|
1222
1331
|
createPersistenceWorker(this.persistenceQueue)
|
|
@@ -1257,6 +1366,137 @@ var EffectRunner = class {
|
|
|
1257
1366
|
(item) => matcher(item.evaluator.getName() ?? "")
|
|
1258
1367
|
);
|
|
1259
1368
|
}
|
|
1369
|
+
async collectRunConfigs() {
|
|
1370
|
+
const runConfigs = await collectRunConfigsFromFiles(this.config.discovery);
|
|
1371
|
+
this.runConfigsById.clear();
|
|
1372
|
+
const byNameLower = /* @__PURE__ */ new Map();
|
|
1373
|
+
for (const item of runConfigs) {
|
|
1374
|
+
const id = item.runConfig.getName();
|
|
1375
|
+
const lower = id.toLowerCase();
|
|
1376
|
+
const prev = byNameLower.get(lower);
|
|
1377
|
+
if (prev !== void 0 && prev.filePath !== item.filePath) {
|
|
1378
|
+
throw new Error(
|
|
1379
|
+
`Duplicate RunConfig name "${id}" (matches "${prev.runConfig.getName()}" case-insensitively): ${prev.filePath} and ${item.filePath}`
|
|
1380
|
+
);
|
|
1381
|
+
}
|
|
1382
|
+
byNameLower.set(lower, item);
|
|
1383
|
+
this.runConfigsById.set(id, item);
|
|
1384
|
+
}
|
|
1385
|
+
return runConfigs;
|
|
1386
|
+
}
|
|
1387
|
+
async resolveRunConfigByName(name) {
|
|
1388
|
+
if (this.runConfigsById.size === 0) {
|
|
1389
|
+
await this.collectRunConfigs();
|
|
1390
|
+
}
|
|
1391
|
+
const key = validateRunConfigName(name, `RunConfig "${name.trim()}"`);
|
|
1392
|
+
const keyLower = key.toLowerCase();
|
|
1393
|
+
const matches = Array.from(this.runConfigsById.values()).filter(
|
|
1394
|
+
(item) => item.runConfig.getName().toLowerCase() === keyLower
|
|
1395
|
+
);
|
|
1396
|
+
if (matches.length === 0) {
|
|
1397
|
+
return void 0;
|
|
1398
|
+
}
|
|
1399
|
+
if (matches.length > 1) {
|
|
1400
|
+
throw new Error(
|
|
1401
|
+
`Multiple RunConfigs named "${name}": ${matches.map((m) => m.filePath).join(", ")}`
|
|
1402
|
+
);
|
|
1403
|
+
}
|
|
1404
|
+
return matches[0];
|
|
1405
|
+
}
|
|
1406
|
+
async expandRunConfigToJobs(collected) {
|
|
1407
|
+
if (this.datasetsById.size === 0) {
|
|
1408
|
+
await this.collectDatasets();
|
|
1409
|
+
}
|
|
1410
|
+
if (this.evaluatorsById.size === 0) {
|
|
1411
|
+
await this.collectEvaluators();
|
|
1412
|
+
}
|
|
1413
|
+
const rcName = collected.runConfig.getName();
|
|
1414
|
+
const jobs = [];
|
|
1415
|
+
const runs = collected.runConfig.getRuns();
|
|
1416
|
+
for (const [i, row] of runs.entries()) {
|
|
1417
|
+
const dsCollected = Array.from(this.datasetsById.values()).find(
|
|
1418
|
+
(d) => d.dataset === row.dataset
|
|
1419
|
+
);
|
|
1420
|
+
if (!dsCollected) {
|
|
1421
|
+
throw new Error(
|
|
1422
|
+
`RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getName()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
|
|
1423
|
+
);
|
|
1424
|
+
}
|
|
1425
|
+
let evaluatorIds;
|
|
1426
|
+
if ("evaluatorPattern" in row && typeof row.evaluatorPattern === "string") {
|
|
1427
|
+
const matcher = createNameMatcher(row.evaluatorPattern);
|
|
1428
|
+
const matched = Array.from(this.evaluatorsById.values()).filter(
|
|
1429
|
+
(item) => matcher(item.evaluator.getName() ?? "")
|
|
1430
|
+
);
|
|
1431
|
+
if (matched.length === 0) {
|
|
1432
|
+
throw new Error(
|
|
1433
|
+
`RunConfig "${rcName}" run[${i}]: no evaluator matched pattern "${row.evaluatorPattern}"`
|
|
1434
|
+
);
|
|
1435
|
+
}
|
|
1436
|
+
evaluatorIds = matched.map((item) => item.id);
|
|
1437
|
+
} else {
|
|
1438
|
+
const evaluators = row.evaluators;
|
|
1439
|
+
evaluatorIds = [];
|
|
1440
|
+
for (const ev of evaluators) {
|
|
1441
|
+
const found = Array.from(this.evaluatorsById.values()).find(
|
|
1442
|
+
(item) => item.evaluator === ev
|
|
1443
|
+
);
|
|
1444
|
+
if (!found) {
|
|
1445
|
+
throw new Error(
|
|
1446
|
+
`RunConfig "${rcName}" run[${i}]: evaluator "${getEvaluatorDisplayLabel(ev) ?? "unknown"}" was not found among discovered evaluator exports`
|
|
1447
|
+
);
|
|
1448
|
+
}
|
|
1449
|
+
evaluatorIds.push(found.id);
|
|
1450
|
+
}
|
|
1451
|
+
}
|
|
1452
|
+
const repetitions = "repetitions" in row && row.repetitions !== void 0 ? row.repetitions : 1;
|
|
1453
|
+
jobs.push({
|
|
1454
|
+
datasetId: dsCollected.id,
|
|
1455
|
+
evaluatorIds,
|
|
1456
|
+
runConfigName: rcName,
|
|
1457
|
+
runConfigDisplayLabel: collected.runConfig.getDisplayLabel(),
|
|
1458
|
+
runConfigTags: collected.runConfig.getTags(),
|
|
1459
|
+
repetitions
|
|
1460
|
+
});
|
|
1461
|
+
}
|
|
1462
|
+
return jobs;
|
|
1463
|
+
}
|
|
1464
|
+
async expandRunConfigNamesToJobs(names) {
|
|
1465
|
+
const jobs = [];
|
|
1466
|
+
for (const name of names) {
|
|
1467
|
+
const collected = await this.resolveRunConfigByName(name);
|
|
1468
|
+
if (!collected) {
|
|
1469
|
+
const known = await this.collectRunConfigs();
|
|
1470
|
+
const available = known.map((r) => r.runConfig.getName()).sort();
|
|
1471
|
+
throw new Error(
|
|
1472
|
+
available.length > 0 ? `RunConfig "${name}" not found. Available RunConfigs: ${available.join(", ")}` : `RunConfig "${name}" not found and no RunConfigs were discovered.`
|
|
1473
|
+
);
|
|
1474
|
+
}
|
|
1475
|
+
jobs.push(...await this.expandRunConfigToJobs(collected));
|
|
1476
|
+
}
|
|
1477
|
+
return jobs;
|
|
1478
|
+
}
|
|
1479
|
+
async runDatasetJobsWithSharedConcurrency(request) {
|
|
1480
|
+
const globalConcurrency = Math.max(1, request.globalConcurrency);
|
|
1481
|
+
const sem = effect.Effect.unsafeMakeSemaphore(globalConcurrency);
|
|
1482
|
+
const triggerId = request.triggerId ?? `trg-${crypto.randomUUID()}`;
|
|
1483
|
+
const snapshots = [];
|
|
1484
|
+
for (const job of request.jobs) {
|
|
1485
|
+
snapshots.push(
|
|
1486
|
+
await this.startDatasetRun({
|
|
1487
|
+
datasetId: job.datasetId,
|
|
1488
|
+
evaluatorIds: job.evaluatorIds,
|
|
1489
|
+
triggerId,
|
|
1490
|
+
maxConcurrency: this.config.maxConcurrency ?? 1,
|
|
1491
|
+
globalEvaluationSemaphore: sem,
|
|
1492
|
+
runConfigName: job.runConfigName,
|
|
1493
|
+
runConfigTags: job.runConfigTags,
|
|
1494
|
+
repetitions: job.repetitions
|
|
1495
|
+
})
|
|
1496
|
+
);
|
|
1497
|
+
}
|
|
1498
|
+
return snapshots;
|
|
1499
|
+
}
|
|
1260
1500
|
async searchTestCases(query) {
|
|
1261
1501
|
const testCases = await collectTestCasesFromFiles(this.config.discovery);
|
|
1262
1502
|
return searchCollectedTestCases(testCases, query);
|
|
@@ -1275,35 +1515,45 @@ var EffectRunner = class {
|
|
|
1275
1515
|
);
|
|
1276
1516
|
}
|
|
1277
1517
|
async runDatasetWith(request) {
|
|
1518
|
+
const runConfigName = validateRunConfigName(
|
|
1519
|
+
request.runConfigName,
|
|
1520
|
+
"runDatasetWith.runConfigName"
|
|
1521
|
+
);
|
|
1522
|
+
return this.startDatasetRun({
|
|
1523
|
+
datasetId: request.datasetId,
|
|
1524
|
+
evaluatorIds: request.evaluatorIds,
|
|
1525
|
+
triggerId: request.triggerId,
|
|
1526
|
+
maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
|
|
1527
|
+
repetitions: request.repetitions,
|
|
1528
|
+
runConfigName,
|
|
1529
|
+
runConfigTags: request.runConfigTags
|
|
1530
|
+
});
|
|
1531
|
+
}
|
|
1532
|
+
async startDatasetRun(params) {
|
|
1278
1533
|
if (this.datasetsById.size === 0) {
|
|
1279
1534
|
await this.collectDatasets();
|
|
1280
1535
|
}
|
|
1281
1536
|
if (this.evaluatorsById.size === 0) {
|
|
1282
1537
|
await this.collectEvaluators();
|
|
1283
1538
|
}
|
|
1284
|
-
const dataset = this.datasetsById.get(
|
|
1539
|
+
const dataset = this.datasetsById.get(params.datasetId);
|
|
1285
1540
|
if (!dataset) {
|
|
1286
|
-
throw new Error(`Unknown dataset: ${
|
|
1541
|
+
throw new Error(`Unknown dataset: ${params.datasetId}`);
|
|
1287
1542
|
}
|
|
1288
|
-
const selectedEvaluators =
|
|
1543
|
+
const selectedEvaluators = params.evaluatorIds.map((id) => this.evaluatorsById.get(id)).filter((value) => Boolean(value)).map((value) => ({ id: value.id, evaluator: value.evaluator }));
|
|
1289
1544
|
if (selectedEvaluators.length === 0) {
|
|
1290
1545
|
throw new Error("No evaluators selected for run");
|
|
1291
1546
|
}
|
|
1292
|
-
const selectedTestCases = await this.collectDatasetTestCases(
|
|
1293
|
-
const
|
|
1294
|
-
|
|
1295
|
-
|
|
1296
|
-
)
|
|
1297
|
-
const triggerId = request.triggerId ?? `trg-${crypto.randomUUID()}`;
|
|
1547
|
+
const selectedTestCases = await this.collectDatasetTestCases(params.datasetId);
|
|
1548
|
+
const repetitions = normalizeRunRepetitions(params.repetitions);
|
|
1549
|
+
const totalEvaluations = selectedTestCases.length * repetitions;
|
|
1550
|
+
const runConfigTags = [...params.runConfigTags ?? []];
|
|
1551
|
+
const triggerId = params.triggerId ?? `trg-${crypto.randomUUID()}`;
|
|
1298
1552
|
const runId = `run-${crypto.randomUUID()}`;
|
|
1299
|
-
const artifactPath = createArtifactPath(
|
|
1300
|
-
this.config.artifactDirectory,
|
|
1301
|
-
request.datasetId,
|
|
1302
|
-
runId
|
|
1303
|
-
);
|
|
1553
|
+
const artifactPath = createArtifactPath(this.config.artifactDirectory, params.datasetId, runId);
|
|
1304
1554
|
const snapshot = {
|
|
1305
1555
|
runId,
|
|
1306
|
-
datasetId:
|
|
1556
|
+
datasetId: params.datasetId,
|
|
1307
1557
|
datasetName: dataset.dataset.getName(),
|
|
1308
1558
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
1309
1559
|
queuedAt: Date.now(),
|
|
@@ -1324,7 +1574,7 @@ var EffectRunner = class {
|
|
|
1324
1574
|
const queuedEvent = {
|
|
1325
1575
|
type: "RunQueued",
|
|
1326
1576
|
runId,
|
|
1327
|
-
datasetId:
|
|
1577
|
+
datasetId: params.datasetId,
|
|
1328
1578
|
datasetName: dataset.dataset.getName(),
|
|
1329
1579
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
1330
1580
|
totalTestCases: totalEvaluations,
|
|
@@ -1338,17 +1588,20 @@ var EffectRunner = class {
|
|
|
1338
1588
|
payload: queuedEvent
|
|
1339
1589
|
})
|
|
1340
1590
|
);
|
|
1341
|
-
const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
|
|
1342
1591
|
await effect.Effect.runPromise(
|
|
1343
1592
|
effect.Queue.offer(this.runQueue, {
|
|
1344
1593
|
runId,
|
|
1345
1594
|
triggerId,
|
|
1346
|
-
datasetId:
|
|
1595
|
+
datasetId: params.datasetId,
|
|
1347
1596
|
dataset: dataset.dataset,
|
|
1348
1597
|
evaluators: selectedEvaluators,
|
|
1349
1598
|
testCases: selectedTestCases,
|
|
1350
1599
|
snapshot,
|
|
1351
|
-
maxConcurrency
|
|
1600
|
+
maxConcurrency: params.maxConcurrency,
|
|
1601
|
+
globalEvaluationSemaphore: params.globalEvaluationSemaphore,
|
|
1602
|
+
runConfigName: params.runConfigName,
|
|
1603
|
+
runConfigTags,
|
|
1604
|
+
repetitions
|
|
1352
1605
|
})
|
|
1353
1606
|
);
|
|
1354
1607
|
return snapshot;
|
|
@@ -1427,6 +1680,8 @@ function getDefaultConcurrency() {
|
|
|
1427
1680
|
function parseSimpleCliArgs(argv) {
|
|
1428
1681
|
const args = {
|
|
1429
1682
|
help: false,
|
|
1683
|
+
ci: false,
|
|
1684
|
+
runConfigNames: [],
|
|
1430
1685
|
unknownArgs: []
|
|
1431
1686
|
};
|
|
1432
1687
|
let index = 0;
|
|
@@ -1440,18 +1695,26 @@ function parseSimpleCliArgs(argv) {
|
|
|
1440
1695
|
args.help = true;
|
|
1441
1696
|
continue;
|
|
1442
1697
|
}
|
|
1698
|
+
if (token === "--ci") {
|
|
1699
|
+
args.ci = true;
|
|
1700
|
+
continue;
|
|
1701
|
+
}
|
|
1443
1702
|
if ((token === "--dataset" || token === "--datasetName") && argv[index + 1]) {
|
|
1444
1703
|
args.datasetName = argv[index + 1];
|
|
1445
1704
|
index += 1;
|
|
1446
1705
|
continue;
|
|
1447
1706
|
}
|
|
1448
|
-
if ((token === "--
|
|
1449
|
-
|
|
1707
|
+
if ((token === "--run-config" || token === "--runConfig") && argv[index + 1]) {
|
|
1708
|
+
const next = argv[index + 1];
|
|
1709
|
+
if (typeof next === "string") {
|
|
1710
|
+
args.runConfigNames.push(next);
|
|
1711
|
+
}
|
|
1450
1712
|
index += 1;
|
|
1451
1713
|
continue;
|
|
1452
1714
|
}
|
|
1453
1715
|
if ((token === "--concurrency" || token === "-c") && argv[index + 1]) {
|
|
1454
|
-
const
|
|
1716
|
+
const nextConc = argv[index + 1];
|
|
1717
|
+
const n = typeof nextConc === "string" ? parseInt(nextConc, 10) : Number.NaN;
|
|
1455
1718
|
if (!Number.isNaN(n) && n >= 1) {
|
|
1456
1719
|
args.concurrency = n;
|
|
1457
1720
|
}
|
|
@@ -1465,16 +1728,12 @@ function parseSimpleCliArgs(argv) {
|
|
|
1465
1728
|
function getSimpleCliUsage() {
|
|
1466
1729
|
return [
|
|
1467
1730
|
"Usage:",
|
|
1468
|
-
" eval-agents-simple run --
|
|
1731
|
+
" eval-agents-simple run --run-config <name> [--run-config <name> ...] [--concurrency N] [--ci]",
|
|
1469
1732
|
" eval-agents-simple generate --dataset <datasetName>",
|
|
1470
1733
|
"",
|
|
1471
1734
|
"Options:",
|
|
1472
|
-
" --
|
|
1473
|
-
""
|
|
1474
|
-
"Pattern examples for --evaluator:",
|
|
1475
|
-
" score-evaluator exact name (case-insensitive)",
|
|
1476
|
-
' "*score*" wildcard pattern',
|
|
1477
|
-
' "/score/i" regex literal'
|
|
1735
|
+
" --ci With run: exit with code 1 if any test case fails.",
|
|
1736
|
+
" --concurrency, -c N Max concurrent evaluations (default: 4). Use 1 for sequential."
|
|
1478
1737
|
].join("\n");
|
|
1479
1738
|
}
|
|
1480
1739
|
|
|
@@ -1525,7 +1784,7 @@ function GenerateView({
|
|
|
1525
1784
|
const payload = testCases.map((item) => {
|
|
1526
1785
|
const tc = item.testCase;
|
|
1527
1786
|
return {
|
|
1528
|
-
name: item.testCase
|
|
1787
|
+
name: getTestCaseDisplayLabel(item.testCase),
|
|
1529
1788
|
input: item.testCase.getInput(),
|
|
1530
1789
|
output: typeof tc.getOutput === "function" ? tc.getOutput() : void 0
|
|
1531
1790
|
};
|
|
@@ -1591,7 +1850,7 @@ async function generateDatasetJsonCommandPlain(runner, datasetName) {
|
|
|
1591
1850
|
}
|
|
1592
1851
|
const testCases = await runner.collectDatasetTestCases(dataset.id);
|
|
1593
1852
|
const payload = testCases.map((item) => ({
|
|
1594
|
-
name: item.testCase
|
|
1853
|
+
name: getTestCaseDisplayLabel(item.testCase),
|
|
1595
1854
|
input: item.testCase.getInput(),
|
|
1596
1855
|
output: readOutput2(item.testCase)
|
|
1597
1856
|
}));
|
|
@@ -1749,8 +2008,7 @@ function formatScorePart(item, _scoreToColor, options) {
|
|
|
1749
2008
|
}
|
|
1750
2009
|
function RunView({
|
|
1751
2010
|
runner,
|
|
1752
|
-
|
|
1753
|
-
evaluatorPattern,
|
|
2011
|
+
runConfigNames,
|
|
1754
2012
|
concurrency,
|
|
1755
2013
|
onComplete
|
|
1756
2014
|
}) {
|
|
@@ -1763,30 +2021,30 @@ function RunView({
|
|
|
1763
2021
|
const [summary, setSummary] = React.useState(null);
|
|
1764
2022
|
const [evaluatorNameById, setEvaluatorNameById] = React.useState(/* @__PURE__ */ new Map());
|
|
1765
2023
|
const runEval = React.useCallback(async () => {
|
|
1766
|
-
const
|
|
1767
|
-
if (
|
|
1768
|
-
|
|
1769
|
-
const available = known.map((item) => item.dataset.getName()).sort();
|
|
1770
|
-
onComplete(
|
|
1771
|
-
new Error(
|
|
1772
|
-
available.length > 0 ? `Dataset "${datasetName}" not found. Available: ${available.join(", ")}` : `Dataset "${datasetName}" not found.`
|
|
1773
|
-
)
|
|
1774
|
-
);
|
|
2024
|
+
const rcList = runConfigNames.filter((n) => n.trim().length > 0);
|
|
2025
|
+
if (rcList.length === 0) {
|
|
2026
|
+
onComplete(new Error("At least one RunConfig name is required."));
|
|
1775
2027
|
return;
|
|
1776
2028
|
}
|
|
1777
|
-
|
|
1778
|
-
|
|
1779
|
-
|
|
1780
|
-
|
|
1781
|
-
|
|
1782
|
-
|
|
1783
|
-
|
|
1784
|
-
|
|
1785
|
-
|
|
2029
|
+
setStartedEvaluations(0);
|
|
2030
|
+
setCompletedEvaluations(0);
|
|
2031
|
+
setTestCases([]);
|
|
2032
|
+
setRunningEvaluations([]);
|
|
2033
|
+
setSummary(null);
|
|
2034
|
+
let jobs;
|
|
2035
|
+
try {
|
|
2036
|
+
jobs = await runner.expandRunConfigNamesToJobs(rcList);
|
|
2037
|
+
} catch (err) {
|
|
2038
|
+
onComplete(err instanceof Error ? err : new Error(String(err)));
|
|
2039
|
+
return;
|
|
2040
|
+
}
|
|
2041
|
+
if (jobs.length === 0) {
|
|
2042
|
+
onComplete(new Error("No jobs expanded from RunConfigs."));
|
|
1786
2043
|
return;
|
|
1787
2044
|
}
|
|
2045
|
+
const allEvaluators = await runner.collectEvaluators();
|
|
1788
2046
|
const nameById = new Map(
|
|
1789
|
-
|
|
2047
|
+
allEvaluators.map((item) => [item.id, getEvaluatorDisplayLabel(item.evaluator) ?? item.id])
|
|
1790
2048
|
);
|
|
1791
2049
|
setEvaluatorNameById(nameById);
|
|
1792
2050
|
const aggregates = /* @__PURE__ */ new Map();
|
|
@@ -1794,21 +2052,30 @@ function RunView({
|
|
|
1794
2052
|
let overallScoreTotal = 0;
|
|
1795
2053
|
let overallScoreSumSq = 0;
|
|
1796
2054
|
let overallScoreCount = 0;
|
|
1797
|
-
const
|
|
2055
|
+
const batchPendingRunIds = /* @__PURE__ */ new Set();
|
|
2056
|
+
const runIdToLabel = /* @__PURE__ */ new Map();
|
|
2057
|
+
let batchReady = false;
|
|
2058
|
+
const completedRuns = /* @__PURE__ */ new Map();
|
|
2059
|
+
const done = new Promise((resolve5, reject) => {
|
|
1798
2060
|
const unsubscribe = runner.subscribeRunEvents((event) => {
|
|
2061
|
+
if (batchReady && "runId" in event && typeof event.runId === "string" && !batchPendingRunIds.has(event.runId)) {
|
|
2062
|
+
return;
|
|
2063
|
+
}
|
|
1799
2064
|
if (event.type === "TestCaseStarted") {
|
|
1800
|
-
setStartedEvaluations(
|
|
2065
|
+
setStartedEvaluations((c) => c + 1);
|
|
1801
2066
|
setRunningEvaluations((prev) => {
|
|
1802
2067
|
const withoutDuplicate = prev.filter(
|
|
1803
|
-
(item) => !(item.testCaseId === event.testCaseId && item.
|
|
2068
|
+
(item) => !(item.testCaseId === event.testCaseId && item.repetitionIndex === event.repetitionIndex && item.runId === event.runId)
|
|
1804
2069
|
);
|
|
1805
2070
|
return [
|
|
1806
2071
|
...withoutDuplicate,
|
|
1807
2072
|
{
|
|
2073
|
+
runId: event.runId,
|
|
1808
2074
|
testCaseId: event.testCaseId,
|
|
1809
2075
|
name: event.testCaseName,
|
|
1810
|
-
|
|
1811
|
-
|
|
2076
|
+
repetitionId: event.repetitionId,
|
|
2077
|
+
repetitionIndex: event.repetitionIndex,
|
|
2078
|
+
repetitionCount: event.repetitionCount,
|
|
1812
2079
|
startedTestCases: event.startedTestCases,
|
|
1813
2080
|
totalTestCases: event.totalTestCases
|
|
1814
2081
|
}
|
|
@@ -1844,9 +2111,12 @@ function RunView({
|
|
|
1844
2111
|
scoreItemsByEvaluatorScore.set(key, list);
|
|
1845
2112
|
}
|
|
1846
2113
|
}
|
|
2114
|
+
const label = runIdToLabel.get(event.runId);
|
|
2115
|
+
const compositeId = `${event.runId}:${event.testCaseId}`;
|
|
2116
|
+
const displayName = label !== void 0 ? `${label} \u203A ${event.testCaseName}` : event.testCaseName;
|
|
1847
2117
|
setTestCases((prev) => {
|
|
1848
2118
|
const byId = new Map(prev.map((tc) => [tc.testCaseId, tc]));
|
|
1849
|
-
const existing = byId.get(
|
|
2119
|
+
const existing = byId.get(compositeId);
|
|
1850
2120
|
const newEvent = {
|
|
1851
2121
|
evaluatorScores: event.evaluatorScores.map((item) => ({
|
|
1852
2122
|
evaluatorId: item.evaluatorId,
|
|
@@ -1863,12 +2133,12 @@ function RunView({
|
|
|
1863
2133
|
const isAggregated = events.length > 1;
|
|
1864
2134
|
const aggregatedEvaluatorScores = aggregateEvaluatorScores(events, nameById);
|
|
1865
2135
|
const merged = {
|
|
1866
|
-
name:
|
|
1867
|
-
testCaseId:
|
|
2136
|
+
name: displayName,
|
|
2137
|
+
testCaseId: compositeId,
|
|
1868
2138
|
completedTestCases: event.completedTestCases,
|
|
1869
2139
|
totalTestCases: event.totalTestCases,
|
|
1870
|
-
|
|
1871
|
-
|
|
2140
|
+
repetitionIndex: event.repetitionIndex,
|
|
2141
|
+
repetitionCount: event.repetitionCount,
|
|
1872
2142
|
durationMs: events.reduce((s, e) => s + e.durationMs, 0),
|
|
1873
2143
|
passed: events.every((e) => e.passed),
|
|
1874
2144
|
errorMessage: event.errorMessage,
|
|
@@ -1876,84 +2146,118 @@ function RunView({
|
|
|
1876
2146
|
aggregatedEvaluatorScores,
|
|
1877
2147
|
isAggregated
|
|
1878
2148
|
};
|
|
1879
|
-
byId.set(
|
|
1880
|
-
setCompletedEvaluations(event.completedTestCases);
|
|
1881
|
-
setRunningEvaluations(
|
|
1882
|
-
(running) => running.filter(
|
|
1883
|
-
(item) => !(item.testCaseId === event.testCaseId && item.rerunIndex === event.rerunIndex)
|
|
1884
|
-
)
|
|
1885
|
-
);
|
|
2149
|
+
byId.set(compositeId, merged);
|
|
1886
2150
|
return Array.from(byId.values());
|
|
1887
2151
|
});
|
|
2152
|
+
setCompletedEvaluations((c) => c + 1);
|
|
2153
|
+
setRunningEvaluations(
|
|
2154
|
+
(running) => running.filter(
|
|
2155
|
+
(item) => !(item.testCaseId === event.testCaseId && item.repetitionIndex === event.repetitionIndex && item.runId === event.runId)
|
|
2156
|
+
)
|
|
2157
|
+
);
|
|
1888
2158
|
}
|
|
1889
|
-
if (event.type === "
|
|
2159
|
+
if (event.type === "RunFailed") {
|
|
2160
|
+
if (batchReady && !batchPendingRunIds.has(event.runId)) {
|
|
2161
|
+
return;
|
|
2162
|
+
}
|
|
1890
2163
|
unsubscribe();
|
|
1891
|
-
|
|
2164
|
+
reject(new Error(`Run failed: ${event.errorMessage}`));
|
|
2165
|
+
return;
|
|
2166
|
+
}
|
|
2167
|
+
if (event.type === "RunCompleted") {
|
|
2168
|
+
if (!batchPendingRunIds.has(event.runId)) {
|
|
2169
|
+
return;
|
|
2170
|
+
}
|
|
2171
|
+
completedRuns.set(event.runId, event);
|
|
2172
|
+
batchPendingRunIds.delete(event.runId);
|
|
2173
|
+
if (batchPendingRunIds.size === 0) {
|
|
2174
|
+
unsubscribe();
|
|
2175
|
+
resolve5();
|
|
2176
|
+
}
|
|
1892
2177
|
}
|
|
1893
2178
|
});
|
|
1894
2179
|
});
|
|
1895
|
-
const
|
|
1896
|
-
|
|
1897
|
-
|
|
1898
|
-
concurrency
|
|
2180
|
+
const snapshots = await runner.runDatasetJobsWithSharedConcurrency({
|
|
2181
|
+
jobs,
|
|
2182
|
+
globalConcurrency: concurrency
|
|
1899
2183
|
});
|
|
2184
|
+
for (let i = 0; i < snapshots.length; i += 1) {
|
|
2185
|
+
const snap = snapshots[i];
|
|
2186
|
+
const job = jobs[i];
|
|
2187
|
+
if (snap && job) {
|
|
2188
|
+
runIdToLabel.set(
|
|
2189
|
+
snap.runId,
|
|
2190
|
+
`${job.runConfigDisplayLabel ?? job.runConfigName} \xB7 ${snap.datasetName}`
|
|
2191
|
+
);
|
|
2192
|
+
batchPendingRunIds.add(snap.runId);
|
|
2193
|
+
}
|
|
2194
|
+
}
|
|
2195
|
+
const totalUnits = snapshots.reduce((sum, s) => sum + s.totalTestCases, 0);
|
|
2196
|
+
batchReady = true;
|
|
2197
|
+
const runConfigLabels = await Promise.all(
|
|
2198
|
+
rcList.map(async (n) => {
|
|
2199
|
+
const collected = await runner.resolveRunConfigByName(n);
|
|
2200
|
+
return collected?.runConfig.getDisplayLabel() ?? n;
|
|
2201
|
+
})
|
|
2202
|
+
);
|
|
1900
2203
|
setRunInfo({
|
|
1901
|
-
|
|
1902
|
-
|
|
1903
|
-
|
|
1904
|
-
totalTestCases: snapshot.totalTestCases
|
|
2204
|
+
names: runConfigLabels,
|
|
2205
|
+
jobs: jobs.length,
|
|
2206
|
+
totalTestCases: totalUnits
|
|
1905
2207
|
});
|
|
1906
2208
|
setPhase("running");
|
|
1907
|
-
|
|
1908
|
-
|
|
1909
|
-
|
|
2209
|
+
try {
|
|
2210
|
+
await done;
|
|
2211
|
+
} catch (err) {
|
|
2212
|
+
onComplete(err instanceof Error ? err : new Error(String(err)));
|
|
1910
2213
|
return;
|
|
1911
2214
|
}
|
|
1912
|
-
|
|
2215
|
+
let passedTestCases = 0;
|
|
2216
|
+
let failedTestCases = 0;
|
|
2217
|
+
let totalTestCases = 0;
|
|
2218
|
+
const artifacts = [];
|
|
2219
|
+
for (const ev of completedRuns.values()) {
|
|
2220
|
+
passedTestCases += ev.passedTestCases;
|
|
2221
|
+
failedTestCases += ev.failedTestCases;
|
|
2222
|
+
totalTestCases += ev.totalTestCases;
|
|
2223
|
+
artifacts.push(ev.artifactPath);
|
|
2224
|
+
}
|
|
1913
2225
|
setSummary({
|
|
1914
|
-
passedTestCases
|
|
1915
|
-
failedTestCases
|
|
1916
|
-
totalTestCases
|
|
2226
|
+
passedTestCases,
|
|
2227
|
+
failedTestCases,
|
|
2228
|
+
totalTestCases,
|
|
1917
2229
|
overallScoreTotal,
|
|
1918
2230
|
overallScoreSumSq,
|
|
1919
2231
|
overallScoreCount,
|
|
1920
2232
|
aggregates: new Map(aggregates),
|
|
1921
2233
|
scoreItemsByEvaluatorScore: new Map(scoreItemsByEvaluatorScore),
|
|
1922
|
-
artifactPath:
|
|
2234
|
+
artifactPath: artifacts.join("\n")
|
|
1923
2235
|
});
|
|
1924
2236
|
setPhase("completed");
|
|
1925
|
-
|
|
1926
|
-
|
|
2237
|
+
const exitCode = failedTestCases > 0 ? 1 : 0;
|
|
2238
|
+
setTimeout(() => onComplete(void 0, exitCode), 200);
|
|
2239
|
+
}, [runner, runConfigNames, concurrency, onComplete]);
|
|
1927
2240
|
React.useEffect(() => {
|
|
1928
2241
|
void runEval();
|
|
1929
2242
|
}, [runEval]);
|
|
1930
2243
|
return /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", padding: 1, children: [
|
|
1931
2244
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginBottom: 1, children: /* @__PURE__ */ jsxRuntime.jsx(Banner, {}) }),
|
|
1932
2245
|
runInfo && /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginBottom: 1, children: [
|
|
1933
|
-
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
1934
|
-
|
|
1935
|
-
|
|
1936
|
-
" "
|
|
1937
|
-
] }),
|
|
1938
|
-
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: runInfo.runId })
|
|
2246
|
+
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", bold: true, children: [
|
|
2247
|
+
"RunConfigs",
|
|
2248
|
+
" "
|
|
1939
2249
|
] }),
|
|
2250
|
+
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: runInfo.names.join(", ") }),
|
|
1940
2251
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
1941
2252
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", bold: true, children: [
|
|
1942
|
-
"
|
|
2253
|
+
"Jobs",
|
|
1943
2254
|
" "
|
|
1944
2255
|
] }),
|
|
1945
|
-
runInfo.
|
|
2256
|
+
runInfo.jobs
|
|
1946
2257
|
] }),
|
|
1947
2258
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
1948
2259
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", bold: true, children: [
|
|
1949
|
-
"
|
|
1950
|
-
" "
|
|
1951
|
-
] }),
|
|
1952
|
-
runInfo.evaluatorNames.join(", ")
|
|
1953
|
-
] }),
|
|
1954
|
-
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
1955
|
-
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", bold: true, children: [
|
|
1956
|
-
"Test cases",
|
|
2260
|
+
"Evaluation units",
|
|
1957
2261
|
" "
|
|
1958
2262
|
] }),
|
|
1959
2263
|
runInfo.totalTestCases
|
|
@@ -1966,22 +2270,29 @@ function RunView({
|
|
|
1966
2270
|
label: `Evaluations ${completedEvaluations}/${runInfo?.totalTestCases ?? 0} completed \u2022 ${startedEvaluations}/${runInfo?.totalTestCases ?? 0} started`
|
|
1967
2271
|
}
|
|
1968
2272
|
),
|
|
1969
|
-
runningEvaluations.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", marginTop: 1, children: runningEvaluations.map((item) => /* @__PURE__ */ jsxRuntime.jsxs(
|
|
1970
|
-
|
|
1971
|
-
|
|
1972
|
-
|
|
1973
|
-
|
|
1974
|
-
|
|
1975
|
-
|
|
1976
|
-
|
|
1977
|
-
|
|
1978
|
-
|
|
1979
|
-
|
|
1980
|
-
|
|
1981
|
-
|
|
1982
|
-
|
|
1983
|
-
|
|
1984
|
-
|
|
2273
|
+
runningEvaluations.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", marginTop: 1, children: runningEvaluations.map((item) => /* @__PURE__ */ jsxRuntime.jsxs(
|
|
2274
|
+
ink.Text,
|
|
2275
|
+
{
|
|
2276
|
+
color: "yellow",
|
|
2277
|
+
children: [
|
|
2278
|
+
"[running ",
|
|
2279
|
+
item.startedTestCases,
|
|
2280
|
+
"/",
|
|
2281
|
+
item.totalTestCases,
|
|
2282
|
+
"] ",
|
|
2283
|
+
item.name,
|
|
2284
|
+
" ",
|
|
2285
|
+
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
2286
|
+
"(",
|
|
2287
|
+
item.repetitionIndex,
|
|
2288
|
+
"/",
|
|
2289
|
+
item.repetitionCount,
|
|
2290
|
+
")"
|
|
2291
|
+
] })
|
|
2292
|
+
]
|
|
2293
|
+
},
|
|
2294
|
+
`${item.runId ?? ""}:${item.testCaseId}:${item.repetitionId}:${item.repetitionIndex}`
|
|
2295
|
+
)) })
|
|
1985
2296
|
] }),
|
|
1986
2297
|
testCases.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginBottom: 0, children: [
|
|
1987
2298
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
@@ -1997,9 +2308,9 @@ function RunView({
|
|
|
1997
2308
|
" ",
|
|
1998
2309
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", children: [
|
|
1999
2310
|
"(",
|
|
2000
|
-
tc.
|
|
2311
|
+
tc.repetitionIndex,
|
|
2001
2312
|
"/",
|
|
2002
|
-
tc.
|
|
2313
|
+
tc.repetitionCount,
|
|
2003
2314
|
")"
|
|
2004
2315
|
] }),
|
|
2005
2316
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
@@ -2039,7 +2350,7 @@ function RunView({
|
|
|
2039
2350
|
})
|
|
2040
2351
|
] }) : null
|
|
2041
2352
|
] }),
|
|
2042
|
-
item.scores.length > 0 ? item.scores.map((s
|
|
2353
|
+
item.scores.length > 0 ? item.scores.map((s) => {
|
|
2043
2354
|
const def = s.def ?? getScoreById(s.id);
|
|
2044
2355
|
const scoreLabel = s.name ?? def?.name ?? def?.id ?? s.id;
|
|
2045
2356
|
return /* @__PURE__ */ jsxRuntime.jsxs(
|
|
@@ -2056,18 +2367,25 @@ function RunView({
|
|
|
2056
2367
|
})
|
|
2057
2368
|
]
|
|
2058
2369
|
},
|
|
2059
|
-
`${item.evaluatorId}-${s.id}-${
|
|
2370
|
+
`${item.evaluatorId}-${s.id}-${scoreLabel}`
|
|
2060
2371
|
);
|
|
2061
2372
|
}) : /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: " n/a" }),
|
|
2062
2373
|
!item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
|
|
2063
|
-
(log
|
|
2064
|
-
ink.
|
|
2374
|
+
(log) => log.type === "diff" ? /* @__PURE__ */ jsxRuntime.jsx(
|
|
2375
|
+
ink.Box,
|
|
2065
2376
|
{
|
|
2066
|
-
|
|
2067
|
-
children: line
|
|
2377
|
+
flexDirection: "column",
|
|
2378
|
+
children: getDiffLines(log).map(({ type, line }) => /* @__PURE__ */ jsxRuntime.jsx(
|
|
2379
|
+
ink.Text,
|
|
2380
|
+
{
|
|
2381
|
+
color: type === "remove" ? "red" : type === "add" ? "green" : "gray",
|
|
2382
|
+
children: line
|
|
2383
|
+
},
|
|
2384
|
+
`${type}:${line}`
|
|
2385
|
+
))
|
|
2068
2386
|
},
|
|
2069
|
-
|
|
2070
|
-
)
|
|
2387
|
+
`diff:${getDiffLines(log).map((x) => x.line).join("|")}`
|
|
2388
|
+
) : log.type === "log" ? /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: getLogLines(log).map((line) => /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: line }, line)) }, `log:${getLogLines(log).join("\n")}`) : null
|
|
2071
2389
|
) })
|
|
2072
2390
|
] }, item.evaluatorId))
|
|
2073
2391
|
] }, tc.testCaseId)) }),
|
|
@@ -2191,10 +2509,10 @@ function RunView({
|
|
|
2191
2509
|
] }, tc.testCaseId);
|
|
2192
2510
|
})
|
|
2193
2511
|
] }),
|
|
2194
|
-
/* @__PURE__ */ jsxRuntime.
|
|
2195
|
-
"artifact:
|
|
2196
|
-
summary.artifactPath
|
|
2197
|
-
] })
|
|
2512
|
+
/* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { marginTop: 1, flexDirection: "column", children: [
|
|
2513
|
+
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "artifact(s):" }),
|
|
2514
|
+
summary.artifactPath.split("\n").map((line) => /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: line }, line))
|
|
2515
|
+
] })
|
|
2198
2516
|
] })
|
|
2199
2517
|
] });
|
|
2200
2518
|
}
|
|
@@ -2406,25 +2724,14 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
|
|
|
2406
2724
|
}
|
|
2407
2725
|
return lines;
|
|
2408
2726
|
}
|
|
2409
|
-
async function
|
|
2410
|
-
const
|
|
2411
|
-
if (
|
|
2412
|
-
|
|
2413
|
-
const available = known.map((item) => item.dataset.getName()).sort();
|
|
2414
|
-
throw new Error(
|
|
2415
|
-
available.length > 0 ? `Dataset "${datasetName}" not found. Available datasets: ${available.join(", ")}` : `Dataset "${datasetName}" not found and no datasets were discovered.`
|
|
2416
|
-
);
|
|
2417
|
-
}
|
|
2418
|
-
const evaluators = await runner.resolveEvaluatorsByNamePattern(evaluatorPattern);
|
|
2419
|
-
if (evaluators.length === 0) {
|
|
2420
|
-
const known = await runner.collectEvaluators();
|
|
2421
|
-
const available = known.map((item) => item.evaluator.getName()).filter((name) => typeof name === "string").sort();
|
|
2422
|
-
throw new Error(
|
|
2423
|
-
available.length > 0 ? `No evaluator matched "${evaluatorPattern}". Available evaluators: ${available.join(", ")}` : `No evaluator matched "${evaluatorPattern}" and no evaluators were discovered.`
|
|
2424
|
-
);
|
|
2727
|
+
async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency) {
|
|
2728
|
+
const jobs = await runner.expandRunConfigNamesToJobs(runConfigNames);
|
|
2729
|
+
if (jobs.length === 0) {
|
|
2730
|
+
throw new Error("No jobs expanded from RunConfigs.");
|
|
2425
2731
|
}
|
|
2732
|
+
const evaluators = await runner.collectEvaluators();
|
|
2426
2733
|
const evaluatorNameById = new Map(
|
|
2427
|
-
evaluators.map((item) => [item.id, item.evaluator
|
|
2734
|
+
evaluators.map((item) => [item.id, getEvaluatorDisplayLabel(item.evaluator) ?? item.id])
|
|
2428
2735
|
);
|
|
2429
2736
|
const aggregates = /* @__PURE__ */ new Map();
|
|
2430
2737
|
const scoreItemsByEvaluatorScore = /* @__PURE__ */ new Map();
|
|
@@ -2432,11 +2739,11 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
|
|
|
2432
2739
|
let overallScoreTotal = 0;
|
|
2433
2740
|
let overallScoreSumSq = 0;
|
|
2434
2741
|
let overallScoreCount = 0;
|
|
2435
|
-
let
|
|
2436
|
-
let
|
|
2742
|
+
let globalStartedUnits = 0;
|
|
2743
|
+
let globalCompletedUnits = 0;
|
|
2437
2744
|
let totalCount = 0;
|
|
2438
2745
|
let runFinished = false;
|
|
2439
|
-
const
|
|
2746
|
+
const inFlightRepetitions = /* @__PURE__ */ new Set();
|
|
2440
2747
|
const spinnerFrames = ["\u280B", "\u2819", "\u2838", "\u2834", "\u2826", "\u2807"];
|
|
2441
2748
|
let spinnerIndex = 0;
|
|
2442
2749
|
function clearLine() {
|
|
@@ -2458,33 +2765,46 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
|
|
|
2458
2765
|
spinnerIndex += 1;
|
|
2459
2766
|
process.stdout.write(
|
|
2460
2767
|
`\r${colorize(frame, ansi2.cyan)} Running evaluations ${colorize(
|
|
2461
|
-
`${
|
|
2768
|
+
`${globalCompletedUnits}/${totalCount}`,
|
|
2462
2769
|
ansi2.bold
|
|
2463
|
-
)} completed ${colorize(`${
|
|
2770
|
+
)} completed ${colorize(`${globalStartedUnits}/${totalCount}`, ansi2.bold)} started ${colorize(`(${inFlightRepetitions.size} running)`, ansi2.dim)}`
|
|
2464
2771
|
);
|
|
2465
2772
|
}
|
|
2466
2773
|
let lastPrintedTestCaseId = null;
|
|
2467
2774
|
let lastPrintedLineCount = 0;
|
|
2468
2775
|
let spinnerTimer;
|
|
2469
|
-
const
|
|
2776
|
+
const batchPendingRunIds = /* @__PURE__ */ new Set();
|
|
2777
|
+
const runIdToLabel = /* @__PURE__ */ new Map();
|
|
2778
|
+
let batchReady = false;
|
|
2779
|
+
const completedRuns = /* @__PURE__ */ new Map();
|
|
2780
|
+
const done = new Promise((resolve5, reject) => {
|
|
2470
2781
|
const unsubscribe = runner.subscribeRunEvents((event) => {
|
|
2782
|
+
if (batchReady && "runId" in event && typeof event.runId === "string" && !batchPendingRunIds.has(event.runId)) {
|
|
2783
|
+
return;
|
|
2784
|
+
}
|
|
2785
|
+
const rowPrefix = typeof event.runId === "string" ? runIdToLabel.get(event.runId) : void 0;
|
|
2786
|
+
const pfx = rowPrefix !== void 0 ? `${colorize(`[${rowPrefix}]`, ansi2.dim)} ` : "";
|
|
2471
2787
|
if (event.type === "TestCaseStarted") {
|
|
2472
|
-
|
|
2473
|
-
|
|
2788
|
+
globalStartedUnits += 1;
|
|
2789
|
+
inFlightRepetitions.add(
|
|
2790
|
+
`${event.runId}:${event.testCaseId}:${event.repetitionId}:${event.repetitionIndex}`
|
|
2791
|
+
);
|
|
2474
2792
|
clearLine();
|
|
2475
2793
|
process.stdout.write(
|
|
2476
|
-
`${colorize(`[started ${event.startedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.
|
|
2794
|
+
`${pfx}${colorize(`[started ${event.startedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.repetitionIndex}/${event.repetitionCount})`, ansi2.cyan)} ${colorize("(running)", ansi2.dim)}
|
|
2477
2795
|
`
|
|
2478
2796
|
);
|
|
2479
2797
|
drawSpinner();
|
|
2480
2798
|
}
|
|
2481
2799
|
if (event.type === "TestCaseProgress") {
|
|
2482
|
-
|
|
2483
|
-
|
|
2800
|
+
globalCompletedUnits += 1;
|
|
2801
|
+
inFlightRepetitions.delete(
|
|
2802
|
+
`${event.runId}:${event.testCaseId}:${event.repetitionId}:${event.repetitionIndex}`
|
|
2803
|
+
);
|
|
2484
2804
|
const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
|
|
2485
2805
|
const averageScore = numericScores.length > 0 ? numericScores.reduce((sum, value) => sum + value, 0) / numericScores.length : void 0;
|
|
2486
|
-
const
|
|
2487
|
-
const existing = testCaseByTestId.get(
|
|
2806
|
+
const compositeId = `${event.runId}:${event.testCaseId}`;
|
|
2807
|
+
const existing = testCaseByTestId.get(compositeId) ?? {
|
|
2488
2808
|
name: event.testCaseName,
|
|
2489
2809
|
events: []
|
|
2490
2810
|
};
|
|
@@ -2494,7 +2814,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
|
|
|
2494
2814
|
durationMs: event.durationMs,
|
|
2495
2815
|
evaluatorScores: event.evaluatorScores
|
|
2496
2816
|
});
|
|
2497
|
-
testCaseByTestId.set(
|
|
2817
|
+
testCaseByTestId.set(compositeId, existing);
|
|
2498
2818
|
for (const item of event.evaluatorScores) {
|
|
2499
2819
|
const numeric = toNumericScoreFromScores(item.scores);
|
|
2500
2820
|
if (numeric !== void 0) {
|
|
@@ -2523,10 +2843,10 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
|
|
|
2523
2843
|
scoreItemsByEvaluatorScore.set(key, list);
|
|
2524
2844
|
}
|
|
2525
2845
|
}
|
|
2526
|
-
const isSameTestCase = lastPrintedTestCaseId ===
|
|
2527
|
-
const
|
|
2846
|
+
const isSameTestCase = lastPrintedTestCaseId === compositeId;
|
|
2847
|
+
const isLastRepetition = event.repetitionIndex >= event.repetitionCount;
|
|
2528
2848
|
const isNonTty = !process.stdout.isTTY;
|
|
2529
|
-
const skipPrintNonTty = isNonTty && event.
|
|
2849
|
+
const skipPrintNonTty = isNonTty && event.repetitionCount > 1 && !isLastRepetition;
|
|
2530
2850
|
if (isSameTestCase && lastPrintedLineCount > 0 && !skipPrintNonTty) {
|
|
2531
2851
|
cursorUp(lastPrintedLineCount);
|
|
2532
2852
|
}
|
|
@@ -2537,7 +2857,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
|
|
|
2537
2857
|
const lines = [];
|
|
2538
2858
|
const statusSuffix = event.errorMessage ? ` ${colorize("ERROR", `${ansi2.bold}${ansi2.red}`)}` : "";
|
|
2539
2859
|
lines.push(
|
|
2540
|
-
`${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.
|
|
2860
|
+
`${pfx}${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.repetitionIndex}/${event.repetitionCount})`, ansi2.cyan)} ${colorize(`(${durationMs}ms)`, ansi2.dim)}${statusSuffix}`
|
|
2541
2861
|
);
|
|
2542
2862
|
if (event.errorMessage) {
|
|
2543
2863
|
lines.push(colorize(event.errorMessage, ansi2.red));
|
|
@@ -2568,64 +2888,102 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
|
|
|
2568
2888
|
}
|
|
2569
2889
|
}
|
|
2570
2890
|
if (!skipPrintNonTty) {
|
|
2571
|
-
for (let i = 0; i < lines.length; i
|
|
2891
|
+
for (let i = 0; i < lines.length; i += 1) {
|
|
2572
2892
|
process.stdout.write(`\r\x1B[2K${lines[i]}
|
|
2573
2893
|
`);
|
|
2574
2894
|
}
|
|
2575
|
-
lastPrintedTestCaseId =
|
|
2895
|
+
lastPrintedTestCaseId = compositeId;
|
|
2576
2896
|
lastPrintedLineCount = lines.length;
|
|
2577
2897
|
}
|
|
2578
2898
|
drawSpinner();
|
|
2579
2899
|
}
|
|
2580
|
-
if (event.type === "
|
|
2900
|
+
if (event.type === "RunFailed") {
|
|
2901
|
+
if (batchReady && !batchPendingRunIds.has(event.runId)) {
|
|
2902
|
+
return;
|
|
2903
|
+
}
|
|
2581
2904
|
runFinished = true;
|
|
2582
2905
|
clearLine();
|
|
2583
2906
|
unsubscribe();
|
|
2584
|
-
|
|
2907
|
+
reject(new Error(`Run failed: ${event.errorMessage}`));
|
|
2908
|
+
return;
|
|
2909
|
+
}
|
|
2910
|
+
if (event.type === "RunCompleted") {
|
|
2911
|
+
if (!batchPendingRunIds.has(event.runId)) {
|
|
2912
|
+
return;
|
|
2913
|
+
}
|
|
2914
|
+
completedRuns.set(event.runId, event);
|
|
2915
|
+
batchPendingRunIds.delete(event.runId);
|
|
2916
|
+
if (batchPendingRunIds.size === 0) {
|
|
2917
|
+
runFinished = true;
|
|
2918
|
+
clearLine();
|
|
2919
|
+
unsubscribe();
|
|
2920
|
+
resolve5();
|
|
2921
|
+
}
|
|
2585
2922
|
}
|
|
2586
2923
|
});
|
|
2587
2924
|
});
|
|
2588
|
-
|
|
2589
|
-
|
|
2590
|
-
|
|
2591
|
-
|
|
2925
|
+
console.log(colorize("=== Eval Run Started (RunConfigs) ===", `${ansi2.bold}${ansi2.cyan}`));
|
|
2926
|
+
for (const name of runConfigNames) {
|
|
2927
|
+
const collected = await runner.resolveRunConfigByName(name);
|
|
2928
|
+
const label = collected?.runConfig.getDisplayLabel() ?? name;
|
|
2929
|
+
console.log(`RunConfig: ${colorize(label, ansi2.bold)}`);
|
|
2930
|
+
}
|
|
2931
|
+
console.log(`Jobs: ${colorize(String(jobs.length), ansi2.bold)}`);
|
|
2932
|
+
console.log(`Shared concurrency: ${colorize(String(concurrency), ansi2.bold)}`);
|
|
2933
|
+
console.log("");
|
|
2934
|
+
const snapshots = await runner.runDatasetJobsWithSharedConcurrency({
|
|
2935
|
+
jobs,
|
|
2936
|
+
globalConcurrency: concurrency
|
|
2592
2937
|
});
|
|
2593
|
-
|
|
2594
|
-
|
|
2595
|
-
|
|
2596
|
-
|
|
2597
|
-
|
|
2598
|
-
|
|
2599
|
-
|
|
2600
|
-
|
|
2938
|
+
for (let i = 0; i < snapshots.length; i += 1) {
|
|
2939
|
+
const snap = snapshots[i];
|
|
2940
|
+
const job = jobs[i];
|
|
2941
|
+
if (snap && job) {
|
|
2942
|
+
runIdToLabel.set(
|
|
2943
|
+
snap.runId,
|
|
2944
|
+
`${job.runConfigDisplayLabel ?? job.runConfigName} \xB7 ${snap.datasetName}`
|
|
2945
|
+
);
|
|
2946
|
+
batchPendingRunIds.add(snap.runId);
|
|
2947
|
+
}
|
|
2948
|
+
}
|
|
2949
|
+
totalCount = snapshots.reduce((sum, s) => sum + s.totalTestCases, 0);
|
|
2950
|
+
console.log(`Total evaluation units: ${colorize(String(totalCount), ansi2.bold)}`);
|
|
2601
2951
|
console.log("");
|
|
2952
|
+
batchReady = true;
|
|
2602
2953
|
drawSpinner();
|
|
2603
2954
|
spinnerTimer = setInterval(drawSpinner, 100);
|
|
2604
|
-
|
|
2955
|
+
await done;
|
|
2605
2956
|
if (spinnerTimer) {
|
|
2606
2957
|
clearInterval(spinnerTimer);
|
|
2607
2958
|
}
|
|
2608
|
-
if (finalEvent.type === "RunFailed") {
|
|
2609
|
-
throw new Error(`Run failed: ${finalEvent.errorMessage}`);
|
|
2610
|
-
}
|
|
2611
|
-
const completed = finalEvent;
|
|
2612
2959
|
console.log("");
|
|
2613
|
-
console.log(colorize("=== Run Summary ===", `${ansi2.bold}${ansi2.cyan}`));
|
|
2614
|
-
|
|
2615
|
-
|
|
2616
|
-
|
|
2617
|
-
|
|
2618
|
-
|
|
2619
|
-
|
|
2620
|
-
|
|
2621
|
-
)
|
|
2622
|
-
|
|
2960
|
+
console.log(colorize("=== Run Summary (all jobs) ===", `${ansi2.bold}${ansi2.cyan}`));
|
|
2961
|
+
for (const snap of snapshots) {
|
|
2962
|
+
const completed = completedRuns.get(snap.runId);
|
|
2963
|
+
if (!completed) {
|
|
2964
|
+
continue;
|
|
2965
|
+
}
|
|
2966
|
+
const label = runIdToLabel.get(snap.runId) ?? snap.runId;
|
|
2967
|
+
console.log("");
|
|
2968
|
+
console.log(colorize(`\u2014 ${label}`, ansi2.magenta));
|
|
2969
|
+
console.log(
|
|
2970
|
+
`- passed: ${colorize(`${completed.passedTestCases}/${completed.totalTestCases}`, ansi2.green)}`
|
|
2971
|
+
);
|
|
2972
|
+
console.log(
|
|
2973
|
+
`- failed: ${colorize(
|
|
2974
|
+
`${completed.failedTestCases}/${completed.totalTestCases}`,
|
|
2975
|
+
completed.failedTestCases > 0 ? ansi2.red : ansi2.dim
|
|
2976
|
+
)}`
|
|
2977
|
+
);
|
|
2978
|
+
console.log(`- artifact: ${colorize(completed.artifactPath, ansi2.dim)}`);
|
|
2979
|
+
}
|
|
2623
2980
|
if (overallScoreCount > 0) {
|
|
2624
2981
|
const overallAverage = overallScoreTotal / overallScoreCount;
|
|
2625
2982
|
const overallSd = sampleStdDev2(overallScoreTotal, overallScoreSumSq, overallScoreCount);
|
|
2626
2983
|
const avgStr = overallSd !== void 0 ? `${overallAverage.toFixed(2)} \xB1 ${overallSd.toFixed(2)}` : overallAverage.toFixed(2);
|
|
2984
|
+
console.log("");
|
|
2627
2985
|
console.log(
|
|
2628
|
-
`- overall avg score: ${colorize(
|
|
2986
|
+
`- overall avg score (all jobs): ${colorize(
|
|
2629
2987
|
avgStr,
|
|
2630
2988
|
scoreToColor(overallAverage)
|
|
2631
2989
|
)} ${colorize(createBar2(overallAverage), ansi2.dim)}`
|
|
@@ -2666,22 +3024,28 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
|
|
|
2666
3024
|
);
|
|
2667
3025
|
}
|
|
2668
3026
|
}
|
|
2669
|
-
|
|
3027
|
+
let failedTestCasesTotal = 0;
|
|
3028
|
+
for (const snap of snapshots) {
|
|
3029
|
+
const completed = completedRuns.get(snap.runId);
|
|
3030
|
+
if (completed) {
|
|
3031
|
+
failedTestCasesTotal += completed.failedTestCases;
|
|
3032
|
+
}
|
|
3033
|
+
}
|
|
3034
|
+
return failedTestCasesTotal > 0 ? 1 : 0;
|
|
2670
3035
|
}
|
|
2671
|
-
async function
|
|
3036
|
+
async function runSimpleEvalRunConfigsInk(runner, runConfigNames, concurrency) {
|
|
2672
3037
|
return new Promise((resolve5, reject) => {
|
|
2673
3038
|
const app = ink.render(
|
|
2674
3039
|
React__namespace.createElement(RunView, {
|
|
2675
3040
|
runner,
|
|
2676
|
-
|
|
2677
|
-
evaluatorPattern,
|
|
3041
|
+
runConfigNames,
|
|
2678
3042
|
concurrency,
|
|
2679
|
-
onComplete: (err) => {
|
|
3043
|
+
onComplete: (err, exitCode) => {
|
|
2680
3044
|
app.unmount();
|
|
2681
3045
|
if (err) {
|
|
2682
3046
|
reject(err);
|
|
2683
3047
|
} else {
|
|
2684
|
-
resolve5();
|
|
3048
|
+
resolve5(exitCode ?? 0);
|
|
2685
3049
|
}
|
|
2686
3050
|
}
|
|
2687
3051
|
})
|
|
@@ -2707,12 +3071,22 @@ async function main() {
|
|
|
2707
3071
|
if (!args.command) {
|
|
2708
3072
|
printUsageAndExit(1);
|
|
2709
3073
|
}
|
|
2710
|
-
if (
|
|
2711
|
-
|
|
2712
|
-
|
|
3074
|
+
if (args.command === "run") {
|
|
3075
|
+
if (args.runConfigNames.length === 0) {
|
|
3076
|
+
console.error(
|
|
3077
|
+
"Missing required --run-config <name> (repeat the flag to queue multiple RunConfigs)."
|
|
3078
|
+
);
|
|
3079
|
+
printUsageAndExit(1);
|
|
3080
|
+
}
|
|
3081
|
+
if (args.datasetName !== void 0) {
|
|
3082
|
+
console.error(
|
|
3083
|
+
"The run command no longer accepts --dataset; use --run-config <RunConfig name>."
|
|
3084
|
+
);
|
|
3085
|
+
printUsageAndExit(1);
|
|
3086
|
+
}
|
|
2713
3087
|
}
|
|
2714
|
-
if (args.command === "
|
|
2715
|
-
console.error("
|
|
3088
|
+
if (args.command === "generate" && args.runConfigNames.length > 0) {
|
|
3089
|
+
console.error("generate does not accept --run-config.");
|
|
2716
3090
|
printUsageAndExit(1);
|
|
2717
3091
|
}
|
|
2718
3092
|
const useInk = process.stdout.isTTY === true;
|
|
@@ -2723,17 +3097,24 @@ async function main() {
|
|
|
2723
3097
|
try {
|
|
2724
3098
|
if (args.command === "run") {
|
|
2725
3099
|
const concurrency = args.concurrency ?? getDefaultConcurrency();
|
|
2726
|
-
await (useInk ?
|
|
3100
|
+
const exitCode = await (useInk ? runSimpleEvalRunConfigsInk : runSimpleEvalRunConfigsPlain)(
|
|
2727
3101
|
runner,
|
|
2728
|
-
args.
|
|
2729
|
-
args.evaluatorPattern,
|
|
3102
|
+
args.runConfigNames,
|
|
2730
3103
|
concurrency
|
|
2731
3104
|
);
|
|
3105
|
+
if (args.ci && exitCode !== 0) {
|
|
3106
|
+
process.exit(1);
|
|
3107
|
+
}
|
|
2732
3108
|
return;
|
|
2733
3109
|
}
|
|
3110
|
+
const genDataset = args.datasetName;
|
|
3111
|
+
if (!genDataset) {
|
|
3112
|
+
console.error("Missing required --dataset <datasetName> argument.");
|
|
3113
|
+
printUsageAndExit(1);
|
|
3114
|
+
}
|
|
2734
3115
|
await (useInk ? generateDatasetJsonCommandInk : generateDatasetJsonCommandPlain)(
|
|
2735
3116
|
runner,
|
|
2736
|
-
|
|
3117
|
+
genDataset
|
|
2737
3118
|
);
|
|
2738
3119
|
} finally {
|
|
2739
3120
|
await runner.shutdown();
|