@m4trix/evals 0.25.1 → 0.27.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +32 -9
- package/dist/cli-simple.cjs +845 -455
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +846 -456
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +543 -273
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +543 -273
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +948 -545
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +228 -14
- package/dist/index.js +933 -547
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli-simple.js
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
import { randomUUID } from 'crypto';
|
|
3
|
-
import { Effect, PubSub, Queue, Ref, Fiber } from 'effect';
|
|
3
|
+
import { Schema, Effect, PubSub, Queue, Ref, Fiber, Either, ParseResult } from 'effect';
|
|
4
|
+
import { writeFile, readdir, readFile, mkdir, appendFile } from 'fs/promises';
|
|
5
|
+
import { resolve, join, relative, parse, dirname } from 'path';
|
|
4
6
|
import { existsSync } from 'fs';
|
|
5
|
-
import { resolve, relative, join, parse, dirname } from 'path';
|
|
6
7
|
import * as jitiModule from 'jiti';
|
|
7
|
-
import { writeFile, readdir, readFile, mkdir, appendFile } from 'fs/promises';
|
|
8
8
|
import { pathToFileURL } from 'url';
|
|
9
9
|
import { diffLines } from 'diff';
|
|
10
10
|
import stringify from 'fast-json-stable-stringify';
|
|
@@ -13,12 +13,179 @@ import React__default, { useState, useEffect, useCallback } from 'react';
|
|
|
13
13
|
import { render, Box, Text } from 'ink';
|
|
14
14
|
import { jsxs, jsx, Fragment } from 'react/jsx-runtime';
|
|
15
15
|
|
|
16
|
+
var ENTITY_ID_PATTERN = /^[a-zA-Z0-9_-]+$/;
|
|
17
|
+
function makeEntityIdSchema(brand, label) {
|
|
18
|
+
return Schema.String.pipe(
|
|
19
|
+
Schema.trimmed(),
|
|
20
|
+
Schema.minLength(1, {
|
|
21
|
+
message: () => `${label} must be non-empty.`
|
|
22
|
+
}),
|
|
23
|
+
Schema.pattern(ENTITY_ID_PATTERN, {
|
|
24
|
+
message: () => `${label} may only contain letters, digits, underscores, and hyphens (no spaces). Examples: "my-nightly", "my_nightly", "myNightly".`
|
|
25
|
+
}),
|
|
26
|
+
Schema.brand(brand)
|
|
27
|
+
);
|
|
28
|
+
}
|
|
29
|
+
var RunConfigNameSchema = makeEntityIdSchema("RunConfigName", "RunConfig name");
|
|
30
|
+
makeEntityIdSchema("EvaluatorName", "Evaluator name");
|
|
31
|
+
makeEntityIdSchema("TestCaseName", "Test case name");
|
|
32
|
+
makeEntityIdSchema("DatasetName", "Dataset name");
|
|
33
|
+
function validateWithSchema(schema, raw, context) {
|
|
34
|
+
const trimmed = raw.trim();
|
|
35
|
+
const decode = Schema.decodeUnknownEither(
|
|
36
|
+
schema
|
|
37
|
+
);
|
|
38
|
+
const result = decode(trimmed);
|
|
39
|
+
if (Either.isLeft(result)) {
|
|
40
|
+
throw new Error(`${context}: ${ParseResult.TreeFormatter.formatErrorSync(result.left)}`);
|
|
41
|
+
}
|
|
42
|
+
return result.right;
|
|
43
|
+
}
|
|
44
|
+
function validateRunConfigName(raw, context) {
|
|
45
|
+
return validateWithSchema(RunConfigNameSchema, raw, context);
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
// src/evals/evaluator.ts
|
|
49
|
+
function getEvaluatorDisplayLabel(evaluator) {
|
|
50
|
+
if (typeof evaluator.getDisplayLabel === "function") {
|
|
51
|
+
const label = evaluator.getDisplayLabel();
|
|
52
|
+
if (label !== void 0) {
|
|
53
|
+
return label;
|
|
54
|
+
}
|
|
55
|
+
}
|
|
56
|
+
return typeof evaluator.getName === "function" ? evaluator.getName() : void 0;
|
|
57
|
+
}
|
|
58
|
+
function getEvaluatorTagList(evaluator) {
|
|
59
|
+
return typeof evaluator.getTags === "function" ? [...evaluator.getTags()] : [];
|
|
60
|
+
}
|
|
61
|
+
async function loadRunSnapshotsFromArtifacts(config) {
|
|
62
|
+
const baseDir = resolve(config.artifactDirectory);
|
|
63
|
+
let entries;
|
|
64
|
+
try {
|
|
65
|
+
entries = await readdir(baseDir);
|
|
66
|
+
} catch {
|
|
67
|
+
return [];
|
|
68
|
+
}
|
|
69
|
+
const jsonlFiles = entries.filter((name) => name.endsWith(".jsonl"));
|
|
70
|
+
const snapshots = [];
|
|
71
|
+
for (const fileName of jsonlFiles) {
|
|
72
|
+
const filePath = join(baseDir, fileName);
|
|
73
|
+
try {
|
|
74
|
+
const snapshot = await parseArtifactToSnapshot(filePath, config);
|
|
75
|
+
if (snapshot) {
|
|
76
|
+
snapshots.push(snapshot);
|
|
77
|
+
}
|
|
78
|
+
} catch {
|
|
79
|
+
}
|
|
80
|
+
}
|
|
81
|
+
return snapshots.sort((a, b) => b.queuedAt - a.queuedAt);
|
|
82
|
+
}
|
|
83
|
+
async function parseArtifactToSnapshot(filePath, _config) {
|
|
84
|
+
const content = await readFile(filePath, "utf8");
|
|
85
|
+
const lines = content.split("\n").filter((line) => line.trim().length > 0);
|
|
86
|
+
if (lines.length === 0) {
|
|
87
|
+
return null;
|
|
88
|
+
}
|
|
89
|
+
let runQueued = null;
|
|
90
|
+
let runCompleted = null;
|
|
91
|
+
let runFailed = null;
|
|
92
|
+
let runStarted = null;
|
|
93
|
+
for (const line of lines) {
|
|
94
|
+
try {
|
|
95
|
+
const event = JSON.parse(line);
|
|
96
|
+
const type = event.type;
|
|
97
|
+
if (type === "RunQueued") {
|
|
98
|
+
runQueued = {
|
|
99
|
+
runId: event.runId,
|
|
100
|
+
datasetId: event.datasetId,
|
|
101
|
+
datasetName: event.datasetName,
|
|
102
|
+
evaluatorIds: event.evaluatorIds,
|
|
103
|
+
totalTestCases: event.totalTestCases ?? 0,
|
|
104
|
+
artifactPath: event.artifactPath ?? filePath,
|
|
105
|
+
ts: event.ts
|
|
106
|
+
};
|
|
107
|
+
}
|
|
108
|
+
if (type === "RunStarted") {
|
|
109
|
+
runStarted = { startedAt: event.startedAt };
|
|
110
|
+
}
|
|
111
|
+
if (type === "RunCompleted") {
|
|
112
|
+
runCompleted = {
|
|
113
|
+
passedTestCases: event.passedTestCases,
|
|
114
|
+
failedTestCases: event.failedTestCases,
|
|
115
|
+
totalTestCases: event.totalTestCases,
|
|
116
|
+
finishedAt: event.finishedAt
|
|
117
|
+
};
|
|
118
|
+
}
|
|
119
|
+
if (type === "RunFailed") {
|
|
120
|
+
runFailed = {
|
|
121
|
+
finishedAt: event.finishedAt,
|
|
122
|
+
errorMessage: event.errorMessage
|
|
123
|
+
};
|
|
124
|
+
}
|
|
125
|
+
} catch {
|
|
126
|
+
}
|
|
127
|
+
}
|
|
128
|
+
if (!runQueued) {
|
|
129
|
+
return null;
|
|
130
|
+
}
|
|
131
|
+
const artifactPath = filePath;
|
|
132
|
+
const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
|
|
133
|
+
const progress = aggregateTestCaseProgress(lines);
|
|
134
|
+
const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
|
|
135
|
+
const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
|
|
136
|
+
const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
|
|
137
|
+
return {
|
|
138
|
+
runId: runQueued.runId,
|
|
139
|
+
datasetId: runQueued.datasetId,
|
|
140
|
+
datasetName: runQueued.datasetName,
|
|
141
|
+
evaluatorIds: runQueued.evaluatorIds,
|
|
142
|
+
queuedAt: runQueued.ts ?? 0,
|
|
143
|
+
startedAt: runStarted?.startedAt,
|
|
144
|
+
finishedAt: runCompleted?.finishedAt ?? runFailed?.finishedAt,
|
|
145
|
+
totalTestCases: runQueued.totalTestCases,
|
|
146
|
+
completedTestCases,
|
|
147
|
+
passedTestCases,
|
|
148
|
+
failedTestCases,
|
|
149
|
+
status,
|
|
150
|
+
artifactPath,
|
|
151
|
+
errorMessage: runFailed?.errorMessage
|
|
152
|
+
};
|
|
153
|
+
}
|
|
154
|
+
function aggregateTestCaseProgress(lines) {
|
|
155
|
+
let completedTestCases = 0;
|
|
156
|
+
const testCasePassedBy = /* @__PURE__ */ new Map();
|
|
157
|
+
for (const line of lines) {
|
|
158
|
+
try {
|
|
159
|
+
const event = JSON.parse(line);
|
|
160
|
+
if (event.type === "TestCaseProgress") {
|
|
161
|
+
const ev = event;
|
|
162
|
+
completedTestCases = ev.completedTestCases ?? completedTestCases;
|
|
163
|
+
const id = ev.testCaseId;
|
|
164
|
+
const current = testCasePassedBy.get(id);
|
|
165
|
+
testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
|
|
166
|
+
}
|
|
167
|
+
} catch {
|
|
168
|
+
}
|
|
169
|
+
}
|
|
170
|
+
let passedTestCases = 0;
|
|
171
|
+
let failedTestCases = 0;
|
|
172
|
+
for (const passed of testCasePassedBy.values()) {
|
|
173
|
+
if (passed) {
|
|
174
|
+
passedTestCases += 1;
|
|
175
|
+
} else {
|
|
176
|
+
failedTestCases += 1;
|
|
177
|
+
}
|
|
178
|
+
}
|
|
179
|
+
return { completedTestCases, passedTestCases, failedTestCases };
|
|
180
|
+
}
|
|
181
|
+
|
|
16
182
|
// src/runner/config.ts
|
|
17
183
|
var defaultRunnerConfig = {
|
|
18
184
|
discovery: {
|
|
19
185
|
rootDir: process.cwd(),
|
|
20
186
|
datasetSuffixes: [".dataset.ts", ".dataset.tsx", ".dataset.js", ".dataset.mjs"],
|
|
21
187
|
evaluatorSuffixes: [".evaluator.ts", ".evaluator.tsx", ".evaluator.js", ".evaluator.mjs"],
|
|
188
|
+
runConfigSuffixes: [".run-config.ts", ".run-config.tsx", ".run-config.js", ".run-config.mjs"],
|
|
22
189
|
testCaseSuffixes: [".test-case.ts", ".test-case.tsx", ".test-case.js", ".test-case.mjs"],
|
|
23
190
|
excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
|
|
24
191
|
},
|
|
@@ -44,6 +211,11 @@ function toRunnerConfigOverrides(config) {
|
|
|
44
211
|
} else if (rawDiscovery?.evaluatorSuffixes !== void 0) {
|
|
45
212
|
discovery.evaluatorSuffixes = rawDiscovery.evaluatorSuffixes;
|
|
46
213
|
}
|
|
214
|
+
if (rawDiscovery?.runConfigFilePatterns !== void 0) {
|
|
215
|
+
discovery.runConfigSuffixes = rawDiscovery.runConfigFilePatterns;
|
|
216
|
+
} else if (rawDiscovery?.runConfigSuffixes !== void 0) {
|
|
217
|
+
discovery.runConfigSuffixes = rawDiscovery.runConfigSuffixes;
|
|
218
|
+
}
|
|
47
219
|
if (rawDiscovery?.testCaseFilePatterns !== void 0) {
|
|
48
220
|
discovery.testCaseSuffixes = rawDiscovery.testCaseFilePatterns;
|
|
49
221
|
} else if (rawDiscovery?.testCaseSuffixes !== void 0) {
|
|
@@ -142,6 +314,9 @@ function isDatasetLike(value) {
|
|
|
142
314
|
function isEvaluatorLike(value) {
|
|
143
315
|
return hasMethod(value, "getName") && hasMethod(value, "resolveContext") && hasMethod(value, "getEvaluateFn");
|
|
144
316
|
}
|
|
317
|
+
function isRunConfigLike(value) {
|
|
318
|
+
return hasMethod(value, "getName") && hasMethod(value, "getRuns") && typeof value.getRuns === "function";
|
|
319
|
+
}
|
|
145
320
|
function isTestCaseLike(value) {
|
|
146
321
|
return hasMethod(value, "getName") && hasMethod(value, "getTags") && hasMethod(value, "getInput");
|
|
147
322
|
}
|
|
@@ -230,6 +405,23 @@ async function collectEvaluatorsFromFiles(config) {
|
|
|
230
405
|
);
|
|
231
406
|
return found.flat();
|
|
232
407
|
}
|
|
408
|
+
async function collectRunConfigsFromFiles(config) {
|
|
409
|
+
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
410
|
+
const matched = files.filter((filePath) => hasOneSuffix(filePath, config.runConfigSuffixes));
|
|
411
|
+
const found = await Promise.all(
|
|
412
|
+
matched.map(async (absolutePath) => {
|
|
413
|
+
const exports = await loadModuleExports(absolutePath);
|
|
414
|
+
const runConfigs = exports.filter(isRunConfigLike);
|
|
415
|
+
const relPath = relative(config.rootDir, absolutePath);
|
|
416
|
+
return runConfigs.map((runConfig) => ({
|
|
417
|
+
id: runConfig.getName(),
|
|
418
|
+
filePath: relPath,
|
|
419
|
+
runConfig
|
|
420
|
+
}));
|
|
421
|
+
})
|
|
422
|
+
);
|
|
423
|
+
return found.flat();
|
|
424
|
+
}
|
|
233
425
|
async function collectTestCasesFromFiles(config) {
|
|
234
426
|
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
235
427
|
const matched = files.filter((filePath) => hasOneSuffix(filePath, config.testCaseSuffixes));
|
|
@@ -383,6 +575,25 @@ function getDiffLines(entry) {
|
|
|
383
575
|
});
|
|
384
576
|
}
|
|
385
577
|
|
|
578
|
+
// src/evals/test-case.ts
|
|
579
|
+
function getTestCaseDisplayLabel(testCase) {
|
|
580
|
+
if (typeof testCase.getDisplayLabel === "function") {
|
|
581
|
+
return testCase.getDisplayLabel();
|
|
582
|
+
}
|
|
583
|
+
return typeof testCase.getName === "function" ? testCase.getName() : "";
|
|
584
|
+
}
|
|
585
|
+
function getTestCaseTagList(testCase) {
|
|
586
|
+
return typeof testCase.getTags === "function" ? [...testCase.getTags()] : [];
|
|
587
|
+
}
|
|
588
|
+
|
|
589
|
+
// src/evals/dataset.ts
|
|
590
|
+
function getDatasetDisplayLabel(dataset) {
|
|
591
|
+
if (typeof dataset.getDisplayLabel === "function") {
|
|
592
|
+
return dataset.getDisplayLabel();
|
|
593
|
+
}
|
|
594
|
+
return typeof dataset.getName === "function" ? dataset.getName() : "";
|
|
595
|
+
}
|
|
596
|
+
|
|
386
597
|
// src/evals/metric.ts
|
|
387
598
|
var registry = /* @__PURE__ */ new Map();
|
|
388
599
|
var Metric = {
|
|
@@ -406,6 +617,54 @@ function getMetricById(id) {
|
|
|
406
617
|
return registry.get(id);
|
|
407
618
|
}
|
|
408
619
|
|
|
620
|
+
// src/evals/aggregators.ts
|
|
621
|
+
function aggregateTokenCountSum(values) {
|
|
622
|
+
const initial = {
|
|
623
|
+
input: 0,
|
|
624
|
+
output: 0,
|
|
625
|
+
inputCached: 0,
|
|
626
|
+
outputCached: 0
|
|
627
|
+
};
|
|
628
|
+
return values.reduce(
|
|
629
|
+
(acc, v) => ({
|
|
630
|
+
input: acc.input + (v.input ?? 0),
|
|
631
|
+
output: acc.output + (v.output ?? 0),
|
|
632
|
+
inputCached: acc.inputCached + (v.inputCached ?? 0),
|
|
633
|
+
outputCached: acc.outputCached + (v.outputCached ?? 0)
|
|
634
|
+
}),
|
|
635
|
+
initial
|
|
636
|
+
);
|
|
637
|
+
}
|
|
638
|
+
function aggregateLatencyAverage(values) {
|
|
639
|
+
if (values.length === 0) {
|
|
640
|
+
return { ms: 0 };
|
|
641
|
+
}
|
|
642
|
+
const sum = values.reduce((s, v) => s + v.ms, 0);
|
|
643
|
+
return { ms: sum / values.length };
|
|
644
|
+
}
|
|
645
|
+
|
|
646
|
+
// src/evals/metrics/standard.ts
|
|
647
|
+
Metric.of({
|
|
648
|
+
id: "token-count",
|
|
649
|
+
name: "Tokens",
|
|
650
|
+
aggregate: aggregateTokenCountSum,
|
|
651
|
+
format: (data, options) => {
|
|
652
|
+
const input = data.input ?? 0;
|
|
653
|
+
const output = data.output ?? 0;
|
|
654
|
+
const inputCached = data.inputCached ?? 0;
|
|
655
|
+
const outputCached = data.outputCached ?? 0;
|
|
656
|
+
const cached = inputCached + outputCached;
|
|
657
|
+
const base = `in:${input} out:${output} cached:${cached}`;
|
|
658
|
+
return options?.isAggregated ? `Total: ${base}` : base;
|
|
659
|
+
}
|
|
660
|
+
});
|
|
661
|
+
Metric.of({
|
|
662
|
+
id: "latency",
|
|
663
|
+
name: "Latency",
|
|
664
|
+
aggregate: aggregateLatencyAverage,
|
|
665
|
+
format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
|
|
666
|
+
});
|
|
667
|
+
|
|
409
668
|
// src/evals/score.ts
|
|
410
669
|
var registry2 = /* @__PURE__ */ new Map();
|
|
411
670
|
function formatScoreData(def, data, options) {
|
|
@@ -514,54 +773,6 @@ function getScoreById(id) {
|
|
|
514
773
|
return registry2.get(id);
|
|
515
774
|
}
|
|
516
775
|
|
|
517
|
-
// src/evals/aggregators.ts
|
|
518
|
-
function aggregateTokenCountSum(values) {
|
|
519
|
-
const initial = {
|
|
520
|
-
input: 0,
|
|
521
|
-
output: 0,
|
|
522
|
-
inputCached: 0,
|
|
523
|
-
outputCached: 0
|
|
524
|
-
};
|
|
525
|
-
return values.reduce(
|
|
526
|
-
(acc, v) => ({
|
|
527
|
-
input: acc.input + (v.input ?? 0),
|
|
528
|
-
output: acc.output + (v.output ?? 0),
|
|
529
|
-
inputCached: acc.inputCached + (v.inputCached ?? 0),
|
|
530
|
-
outputCached: acc.outputCached + (v.outputCached ?? 0)
|
|
531
|
-
}),
|
|
532
|
-
initial
|
|
533
|
-
);
|
|
534
|
-
}
|
|
535
|
-
function aggregateLatencyAverage(values) {
|
|
536
|
-
if (values.length === 0) {
|
|
537
|
-
return { ms: 0 };
|
|
538
|
-
}
|
|
539
|
-
const sum = values.reduce((s, v) => s + v.ms, 0);
|
|
540
|
-
return { ms: sum / values.length };
|
|
541
|
-
}
|
|
542
|
-
|
|
543
|
-
// src/evals/metrics/standard.ts
|
|
544
|
-
Metric.of({
|
|
545
|
-
id: "token-count",
|
|
546
|
-
name: "Tokens",
|
|
547
|
-
aggregate: aggregateTokenCountSum,
|
|
548
|
-
format: (data, options) => {
|
|
549
|
-
const input = data.input ?? 0;
|
|
550
|
-
const output = data.output ?? 0;
|
|
551
|
-
const inputCached = data.inputCached ?? 0;
|
|
552
|
-
const outputCached = data.outputCached ?? 0;
|
|
553
|
-
const cached = inputCached + outputCached;
|
|
554
|
-
const base = `in:${input} out:${output} cached:${cached}`;
|
|
555
|
-
return options?.isAggregated ? `Total: ${base}` : base;
|
|
556
|
-
}
|
|
557
|
-
});
|
|
558
|
-
Metric.of({
|
|
559
|
-
id: "latency",
|
|
560
|
-
name: "Latency",
|
|
561
|
-
aggregate: aggregateLatencyAverage,
|
|
562
|
-
format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
|
|
563
|
-
});
|
|
564
|
-
|
|
565
776
|
// src/evals/scores/standard.ts
|
|
566
777
|
Score.of({
|
|
567
778
|
id: "percent",
|
|
@@ -705,15 +916,17 @@ function readOutput(testCase) {
|
|
|
705
916
|
}
|
|
706
917
|
return candidate.getOutput();
|
|
707
918
|
}
|
|
708
|
-
function buildEvaluationUnits(testCases) {
|
|
919
|
+
function buildEvaluationUnits(testCases, repetitionCount) {
|
|
920
|
+
const count = Math.max(1, repetitionCount);
|
|
709
921
|
const units = [];
|
|
710
922
|
for (const testCaseItem of testCases) {
|
|
711
|
-
const
|
|
712
|
-
for (let r = 0; r <
|
|
923
|
+
const repetitionId = `rep-${randomUUID()}`;
|
|
924
|
+
for (let r = 0; r < count; r++) {
|
|
713
925
|
units.push({
|
|
714
926
|
testCaseItem,
|
|
715
|
-
|
|
716
|
-
|
|
927
|
+
repetitionId,
|
|
928
|
+
repetitionIndex: r + 1,
|
|
929
|
+
repetitionCount: count
|
|
717
930
|
});
|
|
718
931
|
}
|
|
719
932
|
}
|
|
@@ -726,7 +939,7 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
|
|
|
726
939
|
return join(artifactDirectory, `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`);
|
|
727
940
|
}
|
|
728
941
|
function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
|
|
729
|
-
const { testCaseItem,
|
|
942
|
+
const { testCaseItem, repetitionId, repetitionIndex, repetitionCount } = unit;
|
|
730
943
|
return Effect.gen(function* () {
|
|
731
944
|
const evaluatorRunId = `run-${randomUUID()}`;
|
|
732
945
|
const started = Date.now();
|
|
@@ -735,11 +948,12 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
735
948
|
type: "TestCaseStarted",
|
|
736
949
|
runId: task.runId,
|
|
737
950
|
testCaseId: testCaseItem.id,
|
|
738
|
-
testCaseName: testCaseItem.testCase
|
|
951
|
+
testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
|
|
739
952
|
startedTestCases: startedEvaluations,
|
|
740
953
|
totalTestCases: totalEvaluations,
|
|
741
|
-
|
|
742
|
-
|
|
954
|
+
repetitionId,
|
|
955
|
+
repetitionIndex,
|
|
956
|
+
repetitionCount
|
|
743
957
|
});
|
|
744
958
|
const evaluatorScores = [];
|
|
745
959
|
let testCaseError;
|
|
@@ -773,8 +987,15 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
773
987
|
meta: {
|
|
774
988
|
triggerId: task.triggerId,
|
|
775
989
|
runId: evaluatorRunId,
|
|
776
|
-
|
|
990
|
+
datasetName: task.dataset.getDisplayLabel(),
|
|
991
|
+
repetitionId,
|
|
992
|
+
repetitionIndex,
|
|
993
|
+
repetitionCount,
|
|
994
|
+
runConfigName: task.runConfigName
|
|
777
995
|
},
|
|
996
|
+
testCaseTags: getTestCaseTagList(testCaseItem.testCase),
|
|
997
|
+
runConfigTags: task.runConfigTags,
|
|
998
|
+
evaluatorTags: getEvaluatorTagList(evaluator),
|
|
778
999
|
logDiff,
|
|
779
1000
|
log,
|
|
780
1001
|
createError
|
|
@@ -817,18 +1038,19 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
817
1038
|
});
|
|
818
1039
|
}
|
|
819
1040
|
}
|
|
820
|
-
const
|
|
1041
|
+
const repetitionPassedThis = evaluatorScores.every((s) => s.passed);
|
|
821
1042
|
const completedEvaluations = yield* Ref.modify(completedRef, (n) => [n + 1, n + 1]);
|
|
822
1043
|
const progressEvent = {
|
|
823
1044
|
type: "TestCaseProgress",
|
|
824
1045
|
runId: task.runId,
|
|
825
1046
|
testCaseId: testCaseItem.id,
|
|
826
|
-
testCaseName: testCaseItem.testCase
|
|
1047
|
+
testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
|
|
827
1048
|
completedTestCases: completedEvaluations,
|
|
828
1049
|
totalTestCases: totalEvaluations,
|
|
829
|
-
|
|
830
|
-
|
|
831
|
-
|
|
1050
|
+
repetitionId,
|
|
1051
|
+
repetitionIndex,
|
|
1052
|
+
repetitionCount,
|
|
1053
|
+
passed: repetitionPassedThis,
|
|
832
1054
|
durationMs: Date.now() - started,
|
|
833
1055
|
evaluatorScores,
|
|
834
1056
|
output,
|
|
@@ -849,9 +1071,9 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
849
1071
|
(map) => {
|
|
850
1072
|
const key = testCaseItem.id;
|
|
851
1073
|
const existing = map.get(key) ?? { completedCount: 0, results: [] };
|
|
852
|
-
const newResults = [...existing.results,
|
|
1074
|
+
const newResults = [...existing.results, repetitionPassedThis];
|
|
853
1075
|
const newCompletedCount = existing.completedCount + 1;
|
|
854
|
-
const isLast = newCompletedCount ===
|
|
1076
|
+
const isLast = newCompletedCount === repetitionCount;
|
|
855
1077
|
const newMap = new Map(map);
|
|
856
1078
|
newMap.set(key, {
|
|
857
1079
|
completedCount: newCompletedCount,
|
|
@@ -888,10 +1110,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
888
1110
|
runId: task.runId,
|
|
889
1111
|
startedAt
|
|
890
1112
|
});
|
|
891
|
-
const totalEvaluations = task.testCases.
|
|
892
|
-
(sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
|
|
893
|
-
0
|
|
894
|
-
);
|
|
1113
|
+
const totalEvaluations = task.testCases.length * Math.max(1, task.repetitions);
|
|
895
1114
|
const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
|
|
896
1115
|
const completedRef = yield* Ref.make(0);
|
|
897
1116
|
const startedRef = yield* Ref.make(0);
|
|
@@ -900,7 +1119,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
900
1119
|
const testCaseResultsRef = yield* Ref.make(
|
|
901
1120
|
/* @__PURE__ */ new Map()
|
|
902
1121
|
);
|
|
903
|
-
const evaluationUnits = buildEvaluationUnits(task.testCases);
|
|
1122
|
+
const evaluationUnits = buildEvaluationUnits(task.testCases, task.repetitions);
|
|
904
1123
|
const processEvaluation = (unit) => processOneEvaluation(
|
|
905
1124
|
task,
|
|
906
1125
|
unit,
|
|
@@ -914,11 +1133,20 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
914
1133
|
failedRef,
|
|
915
1134
|
testCaseResultsRef
|
|
916
1135
|
);
|
|
917
|
-
|
|
918
|
-
|
|
919
|
-
|
|
920
|
-
|
|
921
|
-
|
|
1136
|
+
const globalSem = task.globalEvaluationSemaphore;
|
|
1137
|
+
if (globalSem !== void 0) {
|
|
1138
|
+
yield* Effect.forEach(
|
|
1139
|
+
evaluationUnits,
|
|
1140
|
+
(unit) => globalSem.withPermits(1)(processEvaluation(unit)),
|
|
1141
|
+
{ concurrency: "unbounded", discard: true }
|
|
1142
|
+
);
|
|
1143
|
+
} else {
|
|
1144
|
+
yield* Effect.forEach(
|
|
1145
|
+
evaluationUnits,
|
|
1146
|
+
processEvaluation,
|
|
1147
|
+
maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
|
|
1148
|
+
);
|
|
1149
|
+
}
|
|
922
1150
|
const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* Effect.all([
|
|
923
1151
|
Ref.get(completedRef),
|
|
924
1152
|
Ref.get(passedRef),
|
|
@@ -935,144 +1163,53 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
935
1163
|
artifactPath: task.snapshot.artifactPath
|
|
936
1164
|
};
|
|
937
1165
|
yield* updateSnapshot(task.runId, (snapshot) => ({
|
|
938
|
-
...snapshot,
|
|
939
|
-
status: "completed",
|
|
940
|
-
completedTestCases: completedEvaluations,
|
|
941
|
-
passedTestCases: passedUniqueTestCases,
|
|
942
|
-
failedTestCases: failedUniqueTestCases,
|
|
943
|
-
finishedAt
|
|
944
|
-
}));
|
|
945
|
-
yield* publishEvent(completedEvent);
|
|
946
|
-
yield* Queue.offer(persistenceQueue, {
|
|
947
|
-
runId: task.runId,
|
|
948
|
-
artifactPath: task.snapshot.artifactPath,
|
|
949
|
-
payload: completedEvent
|
|
950
|
-
});
|
|
951
|
-
yield* publishEvent({
|
|
952
|
-
type: "ArtifactFlushed",
|
|
953
|
-
runId: task.runId,
|
|
954
|
-
artifactPath: task.snapshot.artifactPath
|
|
955
|
-
});
|
|
956
|
-
});
|
|
957
|
-
|
|
958
|
-
|
|
959
|
-
|
|
960
|
-
|
|
961
|
-
|
|
962
|
-
} catch {
|
|
963
|
-
return [];
|
|
964
|
-
}
|
|
965
|
-
const jsonlFiles = entries.filter((name) => name.endsWith(".jsonl"));
|
|
966
|
-
const snapshots = [];
|
|
967
|
-
for (const fileName of jsonlFiles) {
|
|
968
|
-
const filePath = join(baseDir, fileName);
|
|
969
|
-
try {
|
|
970
|
-
const snapshot = await parseArtifactToSnapshot(filePath, config);
|
|
971
|
-
if (snapshot) {
|
|
972
|
-
snapshots.push(snapshot);
|
|
973
|
-
}
|
|
974
|
-
} catch {
|
|
975
|
-
}
|
|
976
|
-
}
|
|
977
|
-
return snapshots.sort((a, b) => b.queuedAt - a.queuedAt);
|
|
978
|
-
}
|
|
979
|
-
async function parseArtifactToSnapshot(filePath, _config) {
|
|
980
|
-
const content = await readFile(filePath, "utf8");
|
|
981
|
-
const lines = content.split("\n").filter((line) => line.trim().length > 0);
|
|
982
|
-
if (lines.length === 0) {
|
|
983
|
-
return null;
|
|
984
|
-
}
|
|
985
|
-
let runQueued = null;
|
|
986
|
-
let runCompleted = null;
|
|
987
|
-
let runFailed = null;
|
|
988
|
-
let runStarted = null;
|
|
989
|
-
for (const line of lines) {
|
|
990
|
-
try {
|
|
991
|
-
const event = JSON.parse(line);
|
|
992
|
-
const type = event.type;
|
|
993
|
-
if (type === "RunQueued") {
|
|
994
|
-
runQueued = {
|
|
995
|
-
runId: event.runId,
|
|
996
|
-
datasetId: event.datasetId,
|
|
997
|
-
datasetName: event.datasetName,
|
|
998
|
-
evaluatorIds: event.evaluatorIds,
|
|
999
|
-
totalTestCases: event.totalTestCases ?? 0,
|
|
1000
|
-
artifactPath: event.artifactPath ?? filePath,
|
|
1001
|
-
ts: event.ts
|
|
1002
|
-
};
|
|
1003
|
-
}
|
|
1004
|
-
if (type === "RunStarted") {
|
|
1005
|
-
runStarted = { startedAt: event.startedAt };
|
|
1006
|
-
}
|
|
1007
|
-
if (type === "RunCompleted") {
|
|
1008
|
-
runCompleted = {
|
|
1009
|
-
passedTestCases: event.passedTestCases,
|
|
1010
|
-
failedTestCases: event.failedTestCases,
|
|
1011
|
-
totalTestCases: event.totalTestCases,
|
|
1012
|
-
finishedAt: event.finishedAt
|
|
1013
|
-
};
|
|
1014
|
-
}
|
|
1015
|
-
if (type === "RunFailed") {
|
|
1016
|
-
runFailed = {
|
|
1017
|
-
finishedAt: event.finishedAt,
|
|
1018
|
-
errorMessage: event.errorMessage
|
|
1019
|
-
};
|
|
1020
|
-
}
|
|
1021
|
-
} catch {
|
|
1022
|
-
}
|
|
1166
|
+
...snapshot,
|
|
1167
|
+
status: "completed",
|
|
1168
|
+
completedTestCases: completedEvaluations,
|
|
1169
|
+
passedTestCases: passedUniqueTestCases,
|
|
1170
|
+
failedTestCases: failedUniqueTestCases,
|
|
1171
|
+
finishedAt
|
|
1172
|
+
}));
|
|
1173
|
+
yield* publishEvent(completedEvent);
|
|
1174
|
+
yield* Queue.offer(persistenceQueue, {
|
|
1175
|
+
runId: task.runId,
|
|
1176
|
+
artifactPath: task.snapshot.artifactPath,
|
|
1177
|
+
payload: completedEvent
|
|
1178
|
+
});
|
|
1179
|
+
yield* publishEvent({
|
|
1180
|
+
type: "ArtifactFlushed",
|
|
1181
|
+
runId: task.runId,
|
|
1182
|
+
artifactPath: task.snapshot.artifactPath
|
|
1183
|
+
});
|
|
1184
|
+
});
|
|
1185
|
+
|
|
1186
|
+
// src/runner/name-pattern.ts
|
|
1187
|
+
function parseRegexLiteral(pattern) {
|
|
1188
|
+
if (!pattern.startsWith("/")) {
|
|
1189
|
+
return void 0;
|
|
1023
1190
|
}
|
|
1024
|
-
|
|
1025
|
-
|
|
1191
|
+
const lastSlash = pattern.lastIndexOf("/");
|
|
1192
|
+
if (lastSlash <= 0) {
|
|
1193
|
+
return void 0;
|
|
1026
1194
|
}
|
|
1027
|
-
const artifactPath = filePath;
|
|
1028
|
-
const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
|
|
1029
|
-
const progress = aggregateTestCaseProgress(lines);
|
|
1030
|
-
const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
|
|
1031
|
-
const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
|
|
1032
|
-
const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
|
|
1033
1195
|
return {
|
|
1034
|
-
|
|
1035
|
-
|
|
1036
|
-
datasetName: runQueued.datasetName,
|
|
1037
|
-
evaluatorIds: runQueued.evaluatorIds,
|
|
1038
|
-
queuedAt: runQueued.ts ?? 0,
|
|
1039
|
-
startedAt: runStarted?.startedAt,
|
|
1040
|
-
finishedAt: runCompleted?.finishedAt ?? runFailed?.finishedAt,
|
|
1041
|
-
totalTestCases: runQueued.totalTestCases,
|
|
1042
|
-
completedTestCases,
|
|
1043
|
-
passedTestCases,
|
|
1044
|
-
failedTestCases,
|
|
1045
|
-
status,
|
|
1046
|
-
artifactPath,
|
|
1047
|
-
errorMessage: runFailed?.errorMessage
|
|
1196
|
+
source: pattern.slice(1, lastSlash),
|
|
1197
|
+
flags: pattern.slice(lastSlash + 1)
|
|
1048
1198
|
};
|
|
1049
1199
|
}
|
|
1050
|
-
function
|
|
1051
|
-
|
|
1052
|
-
const
|
|
1053
|
-
|
|
1054
|
-
|
|
1055
|
-
|
|
1056
|
-
if (event.type === "TestCaseProgress") {
|
|
1057
|
-
const ev = event;
|
|
1058
|
-
completedTestCases = ev.completedTestCases ?? completedTestCases;
|
|
1059
|
-
const id = ev.testCaseId;
|
|
1060
|
-
const current = testCasePassedBy.get(id);
|
|
1061
|
-
testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
|
|
1062
|
-
}
|
|
1063
|
-
} catch {
|
|
1064
|
-
}
|
|
1200
|
+
function createNameMatcher(pattern) {
|
|
1201
|
+
const normalizedPattern = pattern.trim();
|
|
1202
|
+
const regexLiteral = parseRegexLiteral(normalizedPattern);
|
|
1203
|
+
if (regexLiteral) {
|
|
1204
|
+
const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
|
|
1205
|
+
return (value) => regex.test(value);
|
|
1065
1206
|
}
|
|
1066
|
-
|
|
1067
|
-
|
|
1068
|
-
|
|
1069
|
-
|
|
1070
|
-
passedTestCases += 1;
|
|
1071
|
-
} else {
|
|
1072
|
-
failedTestCases += 1;
|
|
1073
|
-
}
|
|
1207
|
+
if (normalizedPattern.includes("*")) {
|
|
1208
|
+
const escaped = normalizedPattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
|
|
1209
|
+
const regex = new RegExp(`^${escaped}$`, "i");
|
|
1210
|
+
return (value) => regex.test(value);
|
|
1074
1211
|
}
|
|
1075
|
-
return
|
|
1212
|
+
return (value) => value.toLowerCase() === normalizedPattern.toLowerCase();
|
|
1076
1213
|
}
|
|
1077
1214
|
async function appendJsonLine(artifactPath, payload) {
|
|
1078
1215
|
await mkdir(dirname(artifactPath), { recursive: true });
|
|
@@ -1131,32 +1268,12 @@ function searchCollectedTestCases(all, query) {
|
|
|
1131
1268
|
}
|
|
1132
1269
|
|
|
1133
1270
|
// src/runner/api.ts
|
|
1134
|
-
function
|
|
1135
|
-
|
|
1136
|
-
|
|
1137
|
-
|
|
1138
|
-
const lastSlash = pattern.lastIndexOf("/");
|
|
1139
|
-
if (lastSlash <= 0) {
|
|
1140
|
-
return void 0;
|
|
1141
|
-
}
|
|
1142
|
-
return {
|
|
1143
|
-
source: pattern.slice(1, lastSlash),
|
|
1144
|
-
flags: pattern.slice(lastSlash + 1)
|
|
1145
|
-
};
|
|
1146
|
-
}
|
|
1147
|
-
function createNameMatcher(pattern) {
|
|
1148
|
-
const normalizedPattern = pattern.trim();
|
|
1149
|
-
const regexLiteral = parseRegexLiteral(normalizedPattern);
|
|
1150
|
-
if (regexLiteral) {
|
|
1151
|
-
const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
|
|
1152
|
-
return (value) => regex.test(value);
|
|
1271
|
+
function normalizeRunRepetitions(value) {
|
|
1272
|
+
const n = value ?? 1;
|
|
1273
|
+
if (!Number.isInteger(n) || n < 1) {
|
|
1274
|
+
throw new Error(`repetitions must be a positive integer, got ${String(value)}`);
|
|
1153
1275
|
}
|
|
1154
|
-
|
|
1155
|
-
const escaped = normalizedPattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
|
|
1156
|
-
const regex = new RegExp(`^${escaped}$`, "i");
|
|
1157
|
-
return (value) => regex.test(value);
|
|
1158
|
-
}
|
|
1159
|
-
return (value) => value.toLowerCase() === normalizedPattern.toLowerCase();
|
|
1276
|
+
return n;
|
|
1160
1277
|
}
|
|
1161
1278
|
function mergeRunnerOverrides(base, next) {
|
|
1162
1279
|
if (!base) {
|
|
@@ -1191,6 +1308,7 @@ var EffectRunner = class {
|
|
|
1191
1308
|
this.listeners = /* @__PURE__ */ new Set();
|
|
1192
1309
|
this.datasetsById = /* @__PURE__ */ new Map();
|
|
1193
1310
|
this.evaluatorsById = /* @__PURE__ */ new Map();
|
|
1311
|
+
this.runConfigsById = /* @__PURE__ */ new Map();
|
|
1194
1312
|
this.schedulerFiber = Effect.runFork(this.createSchedulerEffect());
|
|
1195
1313
|
this.persistenceFiber = Effect.runFork(
|
|
1196
1314
|
createPersistenceWorker(this.persistenceQueue)
|
|
@@ -1231,6 +1349,137 @@ var EffectRunner = class {
|
|
|
1231
1349
|
(item) => matcher(item.evaluator.getName() ?? "")
|
|
1232
1350
|
);
|
|
1233
1351
|
}
|
|
1352
|
+
async collectRunConfigs() {
|
|
1353
|
+
const runConfigs = await collectRunConfigsFromFiles(this.config.discovery);
|
|
1354
|
+
this.runConfigsById.clear();
|
|
1355
|
+
const byNameLower = /* @__PURE__ */ new Map();
|
|
1356
|
+
for (const item of runConfigs) {
|
|
1357
|
+
const id = item.runConfig.getName();
|
|
1358
|
+
const lower = id.toLowerCase();
|
|
1359
|
+
const prev = byNameLower.get(lower);
|
|
1360
|
+
if (prev !== void 0 && prev.filePath !== item.filePath) {
|
|
1361
|
+
throw new Error(
|
|
1362
|
+
`Duplicate RunConfig name "${id}" (matches "${prev.runConfig.getName()}" case-insensitively): ${prev.filePath} and ${item.filePath}`
|
|
1363
|
+
);
|
|
1364
|
+
}
|
|
1365
|
+
byNameLower.set(lower, item);
|
|
1366
|
+
this.runConfigsById.set(id, item);
|
|
1367
|
+
}
|
|
1368
|
+
return runConfigs;
|
|
1369
|
+
}
|
|
1370
|
+
async resolveRunConfigByName(name) {
|
|
1371
|
+
if (this.runConfigsById.size === 0) {
|
|
1372
|
+
await this.collectRunConfigs();
|
|
1373
|
+
}
|
|
1374
|
+
const key = validateRunConfigName(name, `RunConfig "${name.trim()}"`);
|
|
1375
|
+
const keyLower = key.toLowerCase();
|
|
1376
|
+
const matches = Array.from(this.runConfigsById.values()).filter(
|
|
1377
|
+
(item) => item.runConfig.getName().toLowerCase() === keyLower
|
|
1378
|
+
);
|
|
1379
|
+
if (matches.length === 0) {
|
|
1380
|
+
return void 0;
|
|
1381
|
+
}
|
|
1382
|
+
if (matches.length > 1) {
|
|
1383
|
+
throw new Error(
|
|
1384
|
+
`Multiple RunConfigs named "${name}": ${matches.map((m) => m.filePath).join(", ")}`
|
|
1385
|
+
);
|
|
1386
|
+
}
|
|
1387
|
+
return matches[0];
|
|
1388
|
+
}
|
|
1389
|
+
async expandRunConfigToJobs(collected) {
|
|
1390
|
+
if (this.datasetsById.size === 0) {
|
|
1391
|
+
await this.collectDatasets();
|
|
1392
|
+
}
|
|
1393
|
+
if (this.evaluatorsById.size === 0) {
|
|
1394
|
+
await this.collectEvaluators();
|
|
1395
|
+
}
|
|
1396
|
+
const rcName = collected.runConfig.getName();
|
|
1397
|
+
const jobs = [];
|
|
1398
|
+
const runs = collected.runConfig.getRuns();
|
|
1399
|
+
for (const [i, row] of runs.entries()) {
|
|
1400
|
+
const dsCollected = Array.from(this.datasetsById.values()).find(
|
|
1401
|
+
(d) => d.dataset === row.dataset
|
|
1402
|
+
);
|
|
1403
|
+
if (!dsCollected) {
|
|
1404
|
+
throw new Error(
|
|
1405
|
+
`RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getDisplayLabel()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
|
|
1406
|
+
);
|
|
1407
|
+
}
|
|
1408
|
+
let evaluatorIds;
|
|
1409
|
+
if ("evaluatorPattern" in row && typeof row.evaluatorPattern === "string") {
|
|
1410
|
+
const matcher = createNameMatcher(row.evaluatorPattern);
|
|
1411
|
+
const matched = Array.from(this.evaluatorsById.values()).filter(
|
|
1412
|
+
(item) => matcher(item.evaluator.getName() ?? "")
|
|
1413
|
+
);
|
|
1414
|
+
if (matched.length === 0) {
|
|
1415
|
+
throw new Error(
|
|
1416
|
+
`RunConfig "${rcName}" run[${i}]: no evaluator matched pattern "${row.evaluatorPattern}"`
|
|
1417
|
+
);
|
|
1418
|
+
}
|
|
1419
|
+
evaluatorIds = matched.map((item) => item.id);
|
|
1420
|
+
} else {
|
|
1421
|
+
const evaluators = row.evaluators;
|
|
1422
|
+
evaluatorIds = [];
|
|
1423
|
+
for (const ev of evaluators) {
|
|
1424
|
+
const found = Array.from(this.evaluatorsById.values()).find(
|
|
1425
|
+
(item) => item.evaluator === ev
|
|
1426
|
+
);
|
|
1427
|
+
if (!found) {
|
|
1428
|
+
throw new Error(
|
|
1429
|
+
`RunConfig "${rcName}" run[${i}]: evaluator "${getEvaluatorDisplayLabel(ev) ?? "unknown"}" was not found among discovered evaluator exports`
|
|
1430
|
+
);
|
|
1431
|
+
}
|
|
1432
|
+
evaluatorIds.push(found.id);
|
|
1433
|
+
}
|
|
1434
|
+
}
|
|
1435
|
+
const repetitions = "repetitions" in row && row.repetitions !== void 0 ? row.repetitions : 1;
|
|
1436
|
+
jobs.push({
|
|
1437
|
+
datasetId: dsCollected.id,
|
|
1438
|
+
evaluatorIds,
|
|
1439
|
+
runConfigName: rcName,
|
|
1440
|
+
runConfigDisplayLabel: collected.runConfig.getDisplayLabel(),
|
|
1441
|
+
runConfigTags: collected.runConfig.getTags(),
|
|
1442
|
+
repetitions
|
|
1443
|
+
});
|
|
1444
|
+
}
|
|
1445
|
+
return jobs;
|
|
1446
|
+
}
|
|
1447
|
+
async expandRunConfigNamesToJobs(names) {
|
|
1448
|
+
const jobs = [];
|
|
1449
|
+
for (const name of names) {
|
|
1450
|
+
const collected = await this.resolveRunConfigByName(name);
|
|
1451
|
+
if (!collected) {
|
|
1452
|
+
const known = await this.collectRunConfigs();
|
|
1453
|
+
const available = known.map((r) => r.runConfig.getName()).sort();
|
|
1454
|
+
throw new Error(
|
|
1455
|
+
available.length > 0 ? `RunConfig "${name}" not found. Available RunConfigs: ${available.join(", ")}` : `RunConfig "${name}" not found and no RunConfigs were discovered.`
|
|
1456
|
+
);
|
|
1457
|
+
}
|
|
1458
|
+
jobs.push(...await this.expandRunConfigToJobs(collected));
|
|
1459
|
+
}
|
|
1460
|
+
return jobs;
|
|
1461
|
+
}
|
|
1462
|
+
async runDatasetJobsWithSharedConcurrency(request) {
|
|
1463
|
+
const globalConcurrency = Math.max(1, request.globalConcurrency);
|
|
1464
|
+
const sem = Effect.unsafeMakeSemaphore(globalConcurrency);
|
|
1465
|
+
const triggerId = request.triggerId ?? `trg-${randomUUID()}`;
|
|
1466
|
+
const snapshots = [];
|
|
1467
|
+
for (const job of request.jobs) {
|
|
1468
|
+
snapshots.push(
|
|
1469
|
+
await this.startDatasetRun({
|
|
1470
|
+
datasetId: job.datasetId,
|
|
1471
|
+
evaluatorIds: job.evaluatorIds,
|
|
1472
|
+
triggerId,
|
|
1473
|
+
maxConcurrency: this.config.maxConcurrency ?? 1,
|
|
1474
|
+
globalEvaluationSemaphore: sem,
|
|
1475
|
+
runConfigName: job.runConfigName,
|
|
1476
|
+
runConfigTags: job.runConfigTags,
|
|
1477
|
+
repetitions: job.repetitions
|
|
1478
|
+
})
|
|
1479
|
+
);
|
|
1480
|
+
}
|
|
1481
|
+
return snapshots;
|
|
1482
|
+
}
|
|
1234
1483
|
async searchTestCases(query) {
|
|
1235
1484
|
const testCases = await collectTestCasesFromFiles(this.config.discovery);
|
|
1236
1485
|
return searchCollectedTestCases(testCases, query);
|
|
@@ -1249,36 +1498,46 @@ var EffectRunner = class {
|
|
|
1249
1498
|
);
|
|
1250
1499
|
}
|
|
1251
1500
|
async runDatasetWith(request) {
|
|
1501
|
+
const runConfigName = validateRunConfigName(
|
|
1502
|
+
request.runConfigName,
|
|
1503
|
+
"runDatasetWith.runConfigName"
|
|
1504
|
+
);
|
|
1505
|
+
return this.startDatasetRun({
|
|
1506
|
+
datasetId: request.datasetId,
|
|
1507
|
+
evaluatorIds: request.evaluatorIds,
|
|
1508
|
+
triggerId: request.triggerId,
|
|
1509
|
+
maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
|
|
1510
|
+
repetitions: request.repetitions,
|
|
1511
|
+
runConfigName,
|
|
1512
|
+
runConfigTags: request.runConfigTags
|
|
1513
|
+
});
|
|
1514
|
+
}
|
|
1515
|
+
async startDatasetRun(params) {
|
|
1252
1516
|
if (this.datasetsById.size === 0) {
|
|
1253
1517
|
await this.collectDatasets();
|
|
1254
1518
|
}
|
|
1255
1519
|
if (this.evaluatorsById.size === 0) {
|
|
1256
1520
|
await this.collectEvaluators();
|
|
1257
1521
|
}
|
|
1258
|
-
const dataset = this.datasetsById.get(
|
|
1522
|
+
const dataset = this.datasetsById.get(params.datasetId);
|
|
1259
1523
|
if (!dataset) {
|
|
1260
|
-
throw new Error(`Unknown dataset: ${
|
|
1524
|
+
throw new Error(`Unknown dataset: ${params.datasetId}`);
|
|
1261
1525
|
}
|
|
1262
|
-
const selectedEvaluators =
|
|
1526
|
+
const selectedEvaluators = params.evaluatorIds.map((id) => this.evaluatorsById.get(id)).filter((value) => Boolean(value)).map((value) => ({ id: value.id, evaluator: value.evaluator }));
|
|
1263
1527
|
if (selectedEvaluators.length === 0) {
|
|
1264
1528
|
throw new Error("No evaluators selected for run");
|
|
1265
1529
|
}
|
|
1266
|
-
const selectedTestCases = await this.collectDatasetTestCases(
|
|
1267
|
-
const
|
|
1268
|
-
|
|
1269
|
-
|
|
1270
|
-
)
|
|
1271
|
-
const triggerId = request.triggerId ?? `trg-${randomUUID()}`;
|
|
1530
|
+
const selectedTestCases = await this.collectDatasetTestCases(params.datasetId);
|
|
1531
|
+
const repetitions = normalizeRunRepetitions(params.repetitions);
|
|
1532
|
+
const totalEvaluations = selectedTestCases.length * repetitions;
|
|
1533
|
+
const runConfigTags = [...params.runConfigTags ?? []];
|
|
1534
|
+
const triggerId = params.triggerId ?? `trg-${randomUUID()}`;
|
|
1272
1535
|
const runId = `run-${randomUUID()}`;
|
|
1273
|
-
const artifactPath = createArtifactPath(
|
|
1274
|
-
this.config.artifactDirectory,
|
|
1275
|
-
request.datasetId,
|
|
1276
|
-
runId
|
|
1277
|
-
);
|
|
1536
|
+
const artifactPath = createArtifactPath(this.config.artifactDirectory, params.datasetId, runId);
|
|
1278
1537
|
const snapshot = {
|
|
1279
1538
|
runId,
|
|
1280
|
-
datasetId:
|
|
1281
|
-
datasetName: dataset.dataset.
|
|
1539
|
+
datasetId: params.datasetId,
|
|
1540
|
+
datasetName: dataset.dataset.getDisplayLabel(),
|
|
1282
1541
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
1283
1542
|
queuedAt: Date.now(),
|
|
1284
1543
|
totalTestCases: totalEvaluations,
|
|
@@ -1298,8 +1557,8 @@ var EffectRunner = class {
|
|
|
1298
1557
|
const queuedEvent = {
|
|
1299
1558
|
type: "RunQueued",
|
|
1300
1559
|
runId,
|
|
1301
|
-
datasetId:
|
|
1302
|
-
datasetName: dataset.dataset.
|
|
1560
|
+
datasetId: params.datasetId,
|
|
1561
|
+
datasetName: dataset.dataset.getDisplayLabel(),
|
|
1303
1562
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
1304
1563
|
totalTestCases: totalEvaluations,
|
|
1305
1564
|
artifactPath
|
|
@@ -1312,17 +1571,20 @@ var EffectRunner = class {
|
|
|
1312
1571
|
payload: queuedEvent
|
|
1313
1572
|
})
|
|
1314
1573
|
);
|
|
1315
|
-
const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
|
|
1316
1574
|
await Effect.runPromise(
|
|
1317
1575
|
Queue.offer(this.runQueue, {
|
|
1318
1576
|
runId,
|
|
1319
1577
|
triggerId,
|
|
1320
|
-
datasetId:
|
|
1578
|
+
datasetId: params.datasetId,
|
|
1321
1579
|
dataset: dataset.dataset,
|
|
1322
1580
|
evaluators: selectedEvaluators,
|
|
1323
1581
|
testCases: selectedTestCases,
|
|
1324
1582
|
snapshot,
|
|
1325
|
-
maxConcurrency
|
|
1583
|
+
maxConcurrency: params.maxConcurrency,
|
|
1584
|
+
globalEvaluationSemaphore: params.globalEvaluationSemaphore,
|
|
1585
|
+
runConfigName: params.runConfigName,
|
|
1586
|
+
runConfigTags,
|
|
1587
|
+
repetitions
|
|
1326
1588
|
})
|
|
1327
1589
|
);
|
|
1328
1590
|
return snapshot;
|
|
@@ -1401,6 +1663,8 @@ function getDefaultConcurrency() {
|
|
|
1401
1663
|
function parseSimpleCliArgs(argv) {
|
|
1402
1664
|
const args = {
|
|
1403
1665
|
help: false,
|
|
1666
|
+
ci: false,
|
|
1667
|
+
runConfigNames: [],
|
|
1404
1668
|
unknownArgs: []
|
|
1405
1669
|
};
|
|
1406
1670
|
let index = 0;
|
|
@@ -1414,18 +1678,26 @@ function parseSimpleCliArgs(argv) {
|
|
|
1414
1678
|
args.help = true;
|
|
1415
1679
|
continue;
|
|
1416
1680
|
}
|
|
1681
|
+
if (token === "--ci") {
|
|
1682
|
+
args.ci = true;
|
|
1683
|
+
continue;
|
|
1684
|
+
}
|
|
1417
1685
|
if ((token === "--dataset" || token === "--datasetName") && argv[index + 1]) {
|
|
1418
1686
|
args.datasetName = argv[index + 1];
|
|
1419
1687
|
index += 1;
|
|
1420
1688
|
continue;
|
|
1421
1689
|
}
|
|
1422
|
-
if ((token === "--
|
|
1423
|
-
|
|
1690
|
+
if ((token === "--run-config" || token === "--runConfig") && argv[index + 1]) {
|
|
1691
|
+
const next = argv[index + 1];
|
|
1692
|
+
if (typeof next === "string") {
|
|
1693
|
+
args.runConfigNames.push(next);
|
|
1694
|
+
}
|
|
1424
1695
|
index += 1;
|
|
1425
1696
|
continue;
|
|
1426
1697
|
}
|
|
1427
1698
|
if ((token === "--concurrency" || token === "-c") && argv[index + 1]) {
|
|
1428
|
-
const
|
|
1699
|
+
const nextConc = argv[index + 1];
|
|
1700
|
+
const n = typeof nextConc === "string" ? parseInt(nextConc, 10) : Number.NaN;
|
|
1429
1701
|
if (!Number.isNaN(n) && n >= 1) {
|
|
1430
1702
|
args.concurrency = n;
|
|
1431
1703
|
}
|
|
@@ -1439,16 +1711,12 @@ function parseSimpleCliArgs(argv) {
|
|
|
1439
1711
|
function getSimpleCliUsage() {
|
|
1440
1712
|
return [
|
|
1441
1713
|
"Usage:",
|
|
1442
|
-
" eval-agents-simple run --
|
|
1443
|
-
" eval-agents-simple generate --dataset <
|
|
1714
|
+
" eval-agents-simple run --run-config <name> [--run-config <name> ...] [--concurrency N] [--ci]",
|
|
1715
|
+
" eval-agents-simple generate --dataset <datasetId>",
|
|
1444
1716
|
"",
|
|
1445
1717
|
"Options:",
|
|
1446
|
-
" --
|
|
1447
|
-
""
|
|
1448
|
-
"Pattern examples for --evaluator:",
|
|
1449
|
-
" score-evaluator exact name (case-insensitive)",
|
|
1450
|
-
' "*score*" wildcard pattern',
|
|
1451
|
-
' "/score/i" regex literal'
|
|
1718
|
+
" --ci With run: exit with code 1 if any test case fails.",
|
|
1719
|
+
" --concurrency, -c N Max concurrent evaluations (default: 4). Use 1 for sequential."
|
|
1452
1720
|
].join("\n");
|
|
1453
1721
|
}
|
|
1454
1722
|
|
|
@@ -1499,7 +1767,7 @@ function GenerateView({
|
|
|
1499
1767
|
const payload = testCases.map((item) => {
|
|
1500
1768
|
const tc = item.testCase;
|
|
1501
1769
|
return {
|
|
1502
|
-
name: item.testCase
|
|
1770
|
+
name: getTestCaseDisplayLabel(item.testCase),
|
|
1503
1771
|
input: item.testCase.getInput(),
|
|
1504
1772
|
output: typeof tc.getOutput === "function" ? tc.getOutput() : void 0
|
|
1505
1773
|
};
|
|
@@ -1512,7 +1780,7 @@ function GenerateView({
|
|
|
1512
1780
|
if (!cancelled) {
|
|
1513
1781
|
setResult({
|
|
1514
1782
|
count: payload.length,
|
|
1515
|
-
datasetName: dataset.dataset
|
|
1783
|
+
datasetName: getDatasetDisplayLabel(dataset.dataset),
|
|
1516
1784
|
outputPath
|
|
1517
1785
|
});
|
|
1518
1786
|
setTimeout(() => onComplete(), 200);
|
|
@@ -1565,7 +1833,7 @@ async function generateDatasetJsonCommandPlain(runner, datasetName) {
|
|
|
1565
1833
|
}
|
|
1566
1834
|
const testCases = await runner.collectDatasetTestCases(dataset.id);
|
|
1567
1835
|
const payload = testCases.map((item) => ({
|
|
1568
|
-
name: item.testCase
|
|
1836
|
+
name: getTestCaseDisplayLabel(item.testCase),
|
|
1569
1837
|
input: item.testCase.getInput(),
|
|
1570
1838
|
output: readOutput2(item.testCase)
|
|
1571
1839
|
}));
|
|
@@ -1573,7 +1841,7 @@ async function generateDatasetJsonCommandPlain(runner, datasetName) {
|
|
|
1573
1841
|
const outputPath = createOutputPath(absoluteDatasetPath);
|
|
1574
1842
|
await writeFile(outputPath, `${JSON.stringify(payload, null, 2)}
|
|
1575
1843
|
`, "utf8");
|
|
1576
|
-
console.log(`Generated ${payload.length} test cases for dataset "${dataset.dataset
|
|
1844
|
+
console.log(`Generated ${payload.length} test cases for dataset "${getDatasetDisplayLabel(dataset.dataset)}".`);
|
|
1577
1845
|
console.log(`Wrote ${outputPath}`);
|
|
1578
1846
|
}
|
|
1579
1847
|
async function generateDatasetJsonCommandInk(runner, datasetName) {
|
|
@@ -1723,8 +1991,7 @@ function formatScorePart(item, _scoreToColor, options) {
|
|
|
1723
1991
|
}
|
|
1724
1992
|
function RunView({
|
|
1725
1993
|
runner,
|
|
1726
|
-
|
|
1727
|
-
evaluatorPattern,
|
|
1994
|
+
runConfigNames,
|
|
1728
1995
|
concurrency,
|
|
1729
1996
|
onComplete
|
|
1730
1997
|
}) {
|
|
@@ -1737,30 +2004,30 @@ function RunView({
|
|
|
1737
2004
|
const [summary, setSummary] = useState(null);
|
|
1738
2005
|
const [evaluatorNameById, setEvaluatorNameById] = useState(/* @__PURE__ */ new Map());
|
|
1739
2006
|
const runEval = useCallback(async () => {
|
|
1740
|
-
const
|
|
1741
|
-
if (
|
|
1742
|
-
|
|
1743
|
-
const available = known.map((item) => item.dataset.getName()).sort();
|
|
1744
|
-
onComplete(
|
|
1745
|
-
new Error(
|
|
1746
|
-
available.length > 0 ? `Dataset "${datasetName}" not found. Available: ${available.join(", ")}` : `Dataset "${datasetName}" not found.`
|
|
1747
|
-
)
|
|
1748
|
-
);
|
|
2007
|
+
const rcList = runConfigNames.filter((n) => n.trim().length > 0);
|
|
2008
|
+
if (rcList.length === 0) {
|
|
2009
|
+
onComplete(new Error("At least one RunConfig name is required."));
|
|
1749
2010
|
return;
|
|
1750
2011
|
}
|
|
1751
|
-
|
|
1752
|
-
|
|
1753
|
-
|
|
1754
|
-
|
|
1755
|
-
|
|
1756
|
-
|
|
1757
|
-
|
|
1758
|
-
|
|
1759
|
-
|
|
2012
|
+
setStartedEvaluations(0);
|
|
2013
|
+
setCompletedEvaluations(0);
|
|
2014
|
+
setTestCases([]);
|
|
2015
|
+
setRunningEvaluations([]);
|
|
2016
|
+
setSummary(null);
|
|
2017
|
+
let jobs;
|
|
2018
|
+
try {
|
|
2019
|
+
jobs = await runner.expandRunConfigNamesToJobs(rcList);
|
|
2020
|
+
} catch (err) {
|
|
2021
|
+
onComplete(err instanceof Error ? err : new Error(String(err)));
|
|
2022
|
+
return;
|
|
2023
|
+
}
|
|
2024
|
+
if (jobs.length === 0) {
|
|
2025
|
+
onComplete(new Error("No jobs expanded from RunConfigs."));
|
|
1760
2026
|
return;
|
|
1761
2027
|
}
|
|
2028
|
+
const allEvaluators = await runner.collectEvaluators();
|
|
1762
2029
|
const nameById = new Map(
|
|
1763
|
-
|
|
2030
|
+
allEvaluators.map((item) => [item.id, getEvaluatorDisplayLabel(item.evaluator) ?? item.id])
|
|
1764
2031
|
);
|
|
1765
2032
|
setEvaluatorNameById(nameById);
|
|
1766
2033
|
const aggregates = /* @__PURE__ */ new Map();
|
|
@@ -1768,21 +2035,30 @@ function RunView({
|
|
|
1768
2035
|
let overallScoreTotal = 0;
|
|
1769
2036
|
let overallScoreSumSq = 0;
|
|
1770
2037
|
let overallScoreCount = 0;
|
|
1771
|
-
const
|
|
2038
|
+
const batchPendingRunIds = /* @__PURE__ */ new Set();
|
|
2039
|
+
const runIdToLabel = /* @__PURE__ */ new Map();
|
|
2040
|
+
let batchReady = false;
|
|
2041
|
+
const completedRuns = /* @__PURE__ */ new Map();
|
|
2042
|
+
const done = new Promise((resolve5, reject) => {
|
|
1772
2043
|
const unsubscribe = runner.subscribeRunEvents((event) => {
|
|
2044
|
+
if (batchReady && "runId" in event && typeof event.runId === "string" && !batchPendingRunIds.has(event.runId)) {
|
|
2045
|
+
return;
|
|
2046
|
+
}
|
|
1773
2047
|
if (event.type === "TestCaseStarted") {
|
|
1774
|
-
setStartedEvaluations(
|
|
2048
|
+
setStartedEvaluations((c) => c + 1);
|
|
1775
2049
|
setRunningEvaluations((prev) => {
|
|
1776
2050
|
const withoutDuplicate = prev.filter(
|
|
1777
|
-
(item) => !(item.testCaseId === event.testCaseId && item.
|
|
2051
|
+
(item) => !(item.testCaseId === event.testCaseId && item.repetitionIndex === event.repetitionIndex && item.runId === event.runId)
|
|
1778
2052
|
);
|
|
1779
2053
|
return [
|
|
1780
2054
|
...withoutDuplicate,
|
|
1781
2055
|
{
|
|
2056
|
+
runId: event.runId,
|
|
1782
2057
|
testCaseId: event.testCaseId,
|
|
1783
2058
|
name: event.testCaseName,
|
|
1784
|
-
|
|
1785
|
-
|
|
2059
|
+
repetitionId: event.repetitionId,
|
|
2060
|
+
repetitionIndex: event.repetitionIndex,
|
|
2061
|
+
repetitionCount: event.repetitionCount,
|
|
1786
2062
|
startedTestCases: event.startedTestCases,
|
|
1787
2063
|
totalTestCases: event.totalTestCases
|
|
1788
2064
|
}
|
|
@@ -1818,9 +2094,12 @@ function RunView({
|
|
|
1818
2094
|
scoreItemsByEvaluatorScore.set(key, list);
|
|
1819
2095
|
}
|
|
1820
2096
|
}
|
|
2097
|
+
const label = runIdToLabel.get(event.runId);
|
|
2098
|
+
const compositeId = `${event.runId}:${event.testCaseId}`;
|
|
2099
|
+
const displayName = label !== void 0 ? `${label} \u203A ${event.testCaseName}` : event.testCaseName;
|
|
1821
2100
|
setTestCases((prev) => {
|
|
1822
2101
|
const byId = new Map(prev.map((tc) => [tc.testCaseId, tc]));
|
|
1823
|
-
const existing = byId.get(
|
|
2102
|
+
const existing = byId.get(compositeId);
|
|
1824
2103
|
const newEvent = {
|
|
1825
2104
|
evaluatorScores: event.evaluatorScores.map((item) => ({
|
|
1826
2105
|
evaluatorId: item.evaluatorId,
|
|
@@ -1837,12 +2116,12 @@ function RunView({
|
|
|
1837
2116
|
const isAggregated = events.length > 1;
|
|
1838
2117
|
const aggregatedEvaluatorScores = aggregateEvaluatorScores(events, nameById);
|
|
1839
2118
|
const merged = {
|
|
1840
|
-
name:
|
|
1841
|
-
testCaseId:
|
|
2119
|
+
name: displayName,
|
|
2120
|
+
testCaseId: compositeId,
|
|
1842
2121
|
completedTestCases: event.completedTestCases,
|
|
1843
2122
|
totalTestCases: event.totalTestCases,
|
|
1844
|
-
|
|
1845
|
-
|
|
2123
|
+
repetitionIndex: event.repetitionIndex,
|
|
2124
|
+
repetitionCount: event.repetitionCount,
|
|
1846
2125
|
durationMs: events.reduce((s, e) => s + e.durationMs, 0),
|
|
1847
2126
|
passed: events.every((e) => e.passed),
|
|
1848
2127
|
errorMessage: event.errorMessage,
|
|
@@ -1850,84 +2129,118 @@ function RunView({
|
|
|
1850
2129
|
aggregatedEvaluatorScores,
|
|
1851
2130
|
isAggregated
|
|
1852
2131
|
};
|
|
1853
|
-
byId.set(
|
|
1854
|
-
setCompletedEvaluations(event.completedTestCases);
|
|
1855
|
-
setRunningEvaluations(
|
|
1856
|
-
(running) => running.filter(
|
|
1857
|
-
(item) => !(item.testCaseId === event.testCaseId && item.rerunIndex === event.rerunIndex)
|
|
1858
|
-
)
|
|
1859
|
-
);
|
|
2132
|
+
byId.set(compositeId, merged);
|
|
1860
2133
|
return Array.from(byId.values());
|
|
1861
2134
|
});
|
|
2135
|
+
setCompletedEvaluations((c) => c + 1);
|
|
2136
|
+
setRunningEvaluations(
|
|
2137
|
+
(running) => running.filter(
|
|
2138
|
+
(item) => !(item.testCaseId === event.testCaseId && item.repetitionIndex === event.repetitionIndex && item.runId === event.runId)
|
|
2139
|
+
)
|
|
2140
|
+
);
|
|
1862
2141
|
}
|
|
1863
|
-
if (event.type === "
|
|
2142
|
+
if (event.type === "RunFailed") {
|
|
2143
|
+
if (batchReady && !batchPendingRunIds.has(event.runId)) {
|
|
2144
|
+
return;
|
|
2145
|
+
}
|
|
1864
2146
|
unsubscribe();
|
|
1865
|
-
|
|
2147
|
+
reject(new Error(`Run failed: ${event.errorMessage}`));
|
|
2148
|
+
return;
|
|
2149
|
+
}
|
|
2150
|
+
if (event.type === "RunCompleted") {
|
|
2151
|
+
if (!batchPendingRunIds.has(event.runId)) {
|
|
2152
|
+
return;
|
|
2153
|
+
}
|
|
2154
|
+
completedRuns.set(event.runId, event);
|
|
2155
|
+
batchPendingRunIds.delete(event.runId);
|
|
2156
|
+
if (batchPendingRunIds.size === 0) {
|
|
2157
|
+
unsubscribe();
|
|
2158
|
+
resolve5();
|
|
2159
|
+
}
|
|
1866
2160
|
}
|
|
1867
2161
|
});
|
|
1868
2162
|
});
|
|
1869
|
-
const
|
|
1870
|
-
|
|
1871
|
-
|
|
1872
|
-
concurrency
|
|
2163
|
+
const snapshots = await runner.runDatasetJobsWithSharedConcurrency({
|
|
2164
|
+
jobs,
|
|
2165
|
+
globalConcurrency: concurrency
|
|
1873
2166
|
});
|
|
2167
|
+
for (let i = 0; i < snapshots.length; i += 1) {
|
|
2168
|
+
const snap = snapshots[i];
|
|
2169
|
+
const job = jobs[i];
|
|
2170
|
+
if (snap && job) {
|
|
2171
|
+
runIdToLabel.set(
|
|
2172
|
+
snap.runId,
|
|
2173
|
+
`${job.runConfigDisplayLabel ?? job.runConfigName} \xB7 ${snap.datasetName}`
|
|
2174
|
+
);
|
|
2175
|
+
batchPendingRunIds.add(snap.runId);
|
|
2176
|
+
}
|
|
2177
|
+
}
|
|
2178
|
+
const totalUnits = snapshots.reduce((sum, s) => sum + s.totalTestCases, 0);
|
|
2179
|
+
batchReady = true;
|
|
2180
|
+
const runConfigLabels = await Promise.all(
|
|
2181
|
+
rcList.map(async (n) => {
|
|
2182
|
+
const collected = await runner.resolveRunConfigByName(n);
|
|
2183
|
+
return collected?.runConfig.getDisplayLabel() ?? n;
|
|
2184
|
+
})
|
|
2185
|
+
);
|
|
1874
2186
|
setRunInfo({
|
|
1875
|
-
|
|
1876
|
-
|
|
1877
|
-
|
|
1878
|
-
totalTestCases: snapshot.totalTestCases
|
|
2187
|
+
names: runConfigLabels,
|
|
2188
|
+
jobs: jobs.length,
|
|
2189
|
+
totalTestCases: totalUnits
|
|
1879
2190
|
});
|
|
1880
2191
|
setPhase("running");
|
|
1881
|
-
|
|
1882
|
-
|
|
1883
|
-
|
|
2192
|
+
try {
|
|
2193
|
+
await done;
|
|
2194
|
+
} catch (err) {
|
|
2195
|
+
onComplete(err instanceof Error ? err : new Error(String(err)));
|
|
1884
2196
|
return;
|
|
1885
2197
|
}
|
|
1886
|
-
|
|
2198
|
+
let passedTestCases = 0;
|
|
2199
|
+
let failedTestCases = 0;
|
|
2200
|
+
let totalTestCases = 0;
|
|
2201
|
+
const artifacts = [];
|
|
2202
|
+
for (const ev of completedRuns.values()) {
|
|
2203
|
+
passedTestCases += ev.passedTestCases;
|
|
2204
|
+
failedTestCases += ev.failedTestCases;
|
|
2205
|
+
totalTestCases += ev.totalTestCases;
|
|
2206
|
+
artifacts.push(ev.artifactPath);
|
|
2207
|
+
}
|
|
1887
2208
|
setSummary({
|
|
1888
|
-
passedTestCases
|
|
1889
|
-
failedTestCases
|
|
1890
|
-
totalTestCases
|
|
2209
|
+
passedTestCases,
|
|
2210
|
+
failedTestCases,
|
|
2211
|
+
totalTestCases,
|
|
1891
2212
|
overallScoreTotal,
|
|
1892
2213
|
overallScoreSumSq,
|
|
1893
2214
|
overallScoreCount,
|
|
1894
2215
|
aggregates: new Map(aggregates),
|
|
1895
2216
|
scoreItemsByEvaluatorScore: new Map(scoreItemsByEvaluatorScore),
|
|
1896
|
-
artifactPath:
|
|
2217
|
+
artifactPath: artifacts.join("\n")
|
|
1897
2218
|
});
|
|
1898
2219
|
setPhase("completed");
|
|
1899
|
-
|
|
1900
|
-
|
|
2220
|
+
const exitCode = failedTestCases > 0 ? 1 : 0;
|
|
2221
|
+
setTimeout(() => onComplete(void 0, exitCode), 200);
|
|
2222
|
+
}, [runner, runConfigNames, concurrency, onComplete]);
|
|
1901
2223
|
useEffect(() => {
|
|
1902
2224
|
void runEval();
|
|
1903
2225
|
}, [runEval]);
|
|
1904
2226
|
return /* @__PURE__ */ jsxs(Box, { flexDirection: "column", padding: 1, children: [
|
|
1905
2227
|
/* @__PURE__ */ jsx(Box, { marginBottom: 1, children: /* @__PURE__ */ jsx(Banner, {}) }),
|
|
1906
2228
|
runInfo && /* @__PURE__ */ jsxs(Box, { flexDirection: "column", marginBottom: 1, children: [
|
|
1907
|
-
/* @__PURE__ */ jsxs(Text, { children: [
|
|
1908
|
-
|
|
1909
|
-
|
|
1910
|
-
" "
|
|
1911
|
-
] }),
|
|
1912
|
-
/* @__PURE__ */ jsx(Text, { color: "gray", children: runInfo.runId })
|
|
2229
|
+
/* @__PURE__ */ jsxs(Text, { color: "cyan", bold: true, children: [
|
|
2230
|
+
"RunConfigs",
|
|
2231
|
+
" "
|
|
1913
2232
|
] }),
|
|
2233
|
+
/* @__PURE__ */ jsx(Text, { color: "gray", children: runInfo.names.join(", ") }),
|
|
1914
2234
|
/* @__PURE__ */ jsxs(Text, { children: [
|
|
1915
2235
|
/* @__PURE__ */ jsxs(Text, { color: "cyan", bold: true, children: [
|
|
1916
|
-
"
|
|
2236
|
+
"Jobs",
|
|
1917
2237
|
" "
|
|
1918
2238
|
] }),
|
|
1919
|
-
runInfo.
|
|
2239
|
+
runInfo.jobs
|
|
1920
2240
|
] }),
|
|
1921
2241
|
/* @__PURE__ */ jsxs(Text, { children: [
|
|
1922
2242
|
/* @__PURE__ */ jsxs(Text, { color: "cyan", bold: true, children: [
|
|
1923
|
-
"
|
|
1924
|
-
" "
|
|
1925
|
-
] }),
|
|
1926
|
-
runInfo.evaluatorNames.join(", ")
|
|
1927
|
-
] }),
|
|
1928
|
-
/* @__PURE__ */ jsxs(Text, { children: [
|
|
1929
|
-
/* @__PURE__ */ jsxs(Text, { color: "cyan", bold: true, children: [
|
|
1930
|
-
"Test cases",
|
|
2243
|
+
"Evaluation units",
|
|
1931
2244
|
" "
|
|
1932
2245
|
] }),
|
|
1933
2246
|
runInfo.totalTestCases
|
|
@@ -1940,22 +2253,29 @@ function RunView({
|
|
|
1940
2253
|
label: `Evaluations ${completedEvaluations}/${runInfo?.totalTestCases ?? 0} completed \u2022 ${startedEvaluations}/${runInfo?.totalTestCases ?? 0} started`
|
|
1941
2254
|
}
|
|
1942
2255
|
),
|
|
1943
|
-
runningEvaluations.length > 0 && /* @__PURE__ */ jsx(Box, { flexDirection: "column", marginTop: 1, children: runningEvaluations.map((item) => /* @__PURE__ */ jsxs(
|
|
1944
|
-
|
|
1945
|
-
|
|
1946
|
-
|
|
1947
|
-
|
|
1948
|
-
|
|
1949
|
-
|
|
1950
|
-
|
|
1951
|
-
|
|
1952
|
-
|
|
1953
|
-
|
|
1954
|
-
|
|
1955
|
-
|
|
1956
|
-
|
|
1957
|
-
|
|
1958
|
-
|
|
2256
|
+
runningEvaluations.length > 0 && /* @__PURE__ */ jsx(Box, { flexDirection: "column", marginTop: 1, children: runningEvaluations.map((item) => /* @__PURE__ */ jsxs(
|
|
2257
|
+
Text,
|
|
2258
|
+
{
|
|
2259
|
+
color: "yellow",
|
|
2260
|
+
children: [
|
|
2261
|
+
"[running ",
|
|
2262
|
+
item.startedTestCases,
|
|
2263
|
+
"/",
|
|
2264
|
+
item.totalTestCases,
|
|
2265
|
+
"] ",
|
|
2266
|
+
item.name,
|
|
2267
|
+
" ",
|
|
2268
|
+
/* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
2269
|
+
"(",
|
|
2270
|
+
item.repetitionIndex,
|
|
2271
|
+
"/",
|
|
2272
|
+
item.repetitionCount,
|
|
2273
|
+
")"
|
|
2274
|
+
] })
|
|
2275
|
+
]
|
|
2276
|
+
},
|
|
2277
|
+
`${item.runId ?? ""}:${item.testCaseId}:${item.repetitionId}:${item.repetitionIndex}`
|
|
2278
|
+
)) })
|
|
1959
2279
|
] }),
|
|
1960
2280
|
testCases.length > 0 && /* @__PURE__ */ jsx(Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc) => /* @__PURE__ */ jsxs(Box, { flexDirection: "column", marginBottom: 0, children: [
|
|
1961
2281
|
/* @__PURE__ */ jsxs(Text, { children: [
|
|
@@ -1971,9 +2291,9 @@ function RunView({
|
|
|
1971
2291
|
" ",
|
|
1972
2292
|
/* @__PURE__ */ jsxs(Text, { color: "cyan", children: [
|
|
1973
2293
|
"(",
|
|
1974
|
-
tc.
|
|
2294
|
+
tc.repetitionIndex,
|
|
1975
2295
|
"/",
|
|
1976
|
-
tc.
|
|
2296
|
+
tc.repetitionCount,
|
|
1977
2297
|
")"
|
|
1978
2298
|
] }),
|
|
1979
2299
|
/* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
@@ -2013,7 +2333,7 @@ function RunView({
|
|
|
2013
2333
|
})
|
|
2014
2334
|
] }) : null
|
|
2015
2335
|
] }),
|
|
2016
|
-
item.scores.length > 0 ? item.scores.map((s
|
|
2336
|
+
item.scores.length > 0 ? item.scores.map((s) => {
|
|
2017
2337
|
const def = s.def ?? getScoreById(s.id);
|
|
2018
2338
|
const scoreLabel = s.name ?? def?.name ?? def?.id ?? s.id;
|
|
2019
2339
|
return /* @__PURE__ */ jsxs(
|
|
@@ -2030,18 +2350,25 @@ function RunView({
|
|
|
2030
2350
|
})
|
|
2031
2351
|
]
|
|
2032
2352
|
},
|
|
2033
|
-
`${item.evaluatorId}-${s.id}-${
|
|
2353
|
+
`${item.evaluatorId}-${s.id}-${scoreLabel}`
|
|
2034
2354
|
);
|
|
2035
2355
|
}) : /* @__PURE__ */ jsx(Text, { color: "gray", children: " n/a" }),
|
|
2036
2356
|
!item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsx(Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
|
|
2037
|
-
(log
|
|
2038
|
-
|
|
2357
|
+
(log) => log.type === "diff" ? /* @__PURE__ */ jsx(
|
|
2358
|
+
Box,
|
|
2039
2359
|
{
|
|
2040
|
-
|
|
2041
|
-
children: line
|
|
2360
|
+
flexDirection: "column",
|
|
2361
|
+
children: getDiffLines(log).map(({ type, line }) => /* @__PURE__ */ jsx(
|
|
2362
|
+
Text,
|
|
2363
|
+
{
|
|
2364
|
+
color: type === "remove" ? "red" : type === "add" ? "green" : "gray",
|
|
2365
|
+
children: line
|
|
2366
|
+
},
|
|
2367
|
+
`${type}:${line}`
|
|
2368
|
+
))
|
|
2042
2369
|
},
|
|
2043
|
-
|
|
2044
|
-
)
|
|
2370
|
+
`diff:${getDiffLines(log).map((x) => x.line).join("|")}`
|
|
2371
|
+
) : log.type === "log" ? /* @__PURE__ */ jsx(Box, { flexDirection: "column", children: getLogLines(log).map((line) => /* @__PURE__ */ jsx(Text, { color: "gray", children: line }, line)) }, `log:${getLogLines(log).join("\n")}`) : null
|
|
2045
2372
|
) })
|
|
2046
2373
|
] }, item.evaluatorId))
|
|
2047
2374
|
] }, tc.testCaseId)) }),
|
|
@@ -2165,10 +2492,10 @@ function RunView({
|
|
|
2165
2492
|
] }, tc.testCaseId);
|
|
2166
2493
|
})
|
|
2167
2494
|
] }),
|
|
2168
|
-
/* @__PURE__ */
|
|
2169
|
-
"artifact:
|
|
2170
|
-
summary.artifactPath
|
|
2171
|
-
] })
|
|
2495
|
+
/* @__PURE__ */ jsxs(Box, { marginTop: 1, flexDirection: "column", children: [
|
|
2496
|
+
/* @__PURE__ */ jsx(Text, { color: "gray", children: "artifact(s):" }),
|
|
2497
|
+
summary.artifactPath.split("\n").map((line) => /* @__PURE__ */ jsx(Text, { color: "gray", children: line }, line))
|
|
2498
|
+
] })
|
|
2172
2499
|
] })
|
|
2173
2500
|
] });
|
|
2174
2501
|
}
|
|
@@ -2380,25 +2707,14 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
|
|
|
2380
2707
|
}
|
|
2381
2708
|
return lines;
|
|
2382
2709
|
}
|
|
2383
|
-
async function
|
|
2384
|
-
const
|
|
2385
|
-
if (
|
|
2386
|
-
|
|
2387
|
-
const available = known.map((item) => item.dataset.getName()).sort();
|
|
2388
|
-
throw new Error(
|
|
2389
|
-
available.length > 0 ? `Dataset "${datasetName}" not found. Available datasets: ${available.join(", ")}` : `Dataset "${datasetName}" not found and no datasets were discovered.`
|
|
2390
|
-
);
|
|
2391
|
-
}
|
|
2392
|
-
const evaluators = await runner.resolveEvaluatorsByNamePattern(evaluatorPattern);
|
|
2393
|
-
if (evaluators.length === 0) {
|
|
2394
|
-
const known = await runner.collectEvaluators();
|
|
2395
|
-
const available = known.map((item) => item.evaluator.getName()).filter((name) => typeof name === "string").sort();
|
|
2396
|
-
throw new Error(
|
|
2397
|
-
available.length > 0 ? `No evaluator matched "${evaluatorPattern}". Available evaluators: ${available.join(", ")}` : `No evaluator matched "${evaluatorPattern}" and no evaluators were discovered.`
|
|
2398
|
-
);
|
|
2710
|
+
async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency) {
|
|
2711
|
+
const jobs = await runner.expandRunConfigNamesToJobs(runConfigNames);
|
|
2712
|
+
if (jobs.length === 0) {
|
|
2713
|
+
throw new Error("No jobs expanded from RunConfigs.");
|
|
2399
2714
|
}
|
|
2715
|
+
const evaluators = await runner.collectEvaluators();
|
|
2400
2716
|
const evaluatorNameById = new Map(
|
|
2401
|
-
evaluators.map((item) => [item.id, item.evaluator
|
|
2717
|
+
evaluators.map((item) => [item.id, getEvaluatorDisplayLabel(item.evaluator) ?? item.id])
|
|
2402
2718
|
);
|
|
2403
2719
|
const aggregates = /* @__PURE__ */ new Map();
|
|
2404
2720
|
const scoreItemsByEvaluatorScore = /* @__PURE__ */ new Map();
|
|
@@ -2406,11 +2722,11 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
|
|
|
2406
2722
|
let overallScoreTotal = 0;
|
|
2407
2723
|
let overallScoreSumSq = 0;
|
|
2408
2724
|
let overallScoreCount = 0;
|
|
2409
|
-
let
|
|
2410
|
-
let
|
|
2725
|
+
let globalStartedUnits = 0;
|
|
2726
|
+
let globalCompletedUnits = 0;
|
|
2411
2727
|
let totalCount = 0;
|
|
2412
2728
|
let runFinished = false;
|
|
2413
|
-
const
|
|
2729
|
+
const inFlightRepetitions = /* @__PURE__ */ new Set();
|
|
2414
2730
|
const spinnerFrames = ["\u280B", "\u2819", "\u2838", "\u2834", "\u2826", "\u2807"];
|
|
2415
2731
|
let spinnerIndex = 0;
|
|
2416
2732
|
function clearLine() {
|
|
@@ -2432,33 +2748,46 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
|
|
|
2432
2748
|
spinnerIndex += 1;
|
|
2433
2749
|
process.stdout.write(
|
|
2434
2750
|
`\r${colorize(frame, ansi2.cyan)} Running evaluations ${colorize(
|
|
2435
|
-
`${
|
|
2751
|
+
`${globalCompletedUnits}/${totalCount}`,
|
|
2436
2752
|
ansi2.bold
|
|
2437
|
-
)} completed ${colorize(`${
|
|
2753
|
+
)} completed ${colorize(`${globalStartedUnits}/${totalCount}`, ansi2.bold)} started ${colorize(`(${inFlightRepetitions.size} running)`, ansi2.dim)}`
|
|
2438
2754
|
);
|
|
2439
2755
|
}
|
|
2440
2756
|
let lastPrintedTestCaseId = null;
|
|
2441
2757
|
let lastPrintedLineCount = 0;
|
|
2442
2758
|
let spinnerTimer;
|
|
2443
|
-
const
|
|
2759
|
+
const batchPendingRunIds = /* @__PURE__ */ new Set();
|
|
2760
|
+
const runIdToLabel = /* @__PURE__ */ new Map();
|
|
2761
|
+
let batchReady = false;
|
|
2762
|
+
const completedRuns = /* @__PURE__ */ new Map();
|
|
2763
|
+
const done = new Promise((resolve5, reject) => {
|
|
2444
2764
|
const unsubscribe = runner.subscribeRunEvents((event) => {
|
|
2765
|
+
if (batchReady && "runId" in event && typeof event.runId === "string" && !batchPendingRunIds.has(event.runId)) {
|
|
2766
|
+
return;
|
|
2767
|
+
}
|
|
2768
|
+
const rowPrefix = typeof event.runId === "string" ? runIdToLabel.get(event.runId) : void 0;
|
|
2769
|
+
const pfx = rowPrefix !== void 0 ? `${colorize(`[${rowPrefix}]`, ansi2.dim)} ` : "";
|
|
2445
2770
|
if (event.type === "TestCaseStarted") {
|
|
2446
|
-
|
|
2447
|
-
|
|
2771
|
+
globalStartedUnits += 1;
|
|
2772
|
+
inFlightRepetitions.add(
|
|
2773
|
+
`${event.runId}:${event.testCaseId}:${event.repetitionId}:${event.repetitionIndex}`
|
|
2774
|
+
);
|
|
2448
2775
|
clearLine();
|
|
2449
2776
|
process.stdout.write(
|
|
2450
|
-
`${colorize(`[started ${event.startedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.
|
|
2777
|
+
`${pfx}${colorize(`[started ${event.startedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.repetitionIndex}/${event.repetitionCount})`, ansi2.cyan)} ${colorize("(running)", ansi2.dim)}
|
|
2451
2778
|
`
|
|
2452
2779
|
);
|
|
2453
2780
|
drawSpinner();
|
|
2454
2781
|
}
|
|
2455
2782
|
if (event.type === "TestCaseProgress") {
|
|
2456
|
-
|
|
2457
|
-
|
|
2783
|
+
globalCompletedUnits += 1;
|
|
2784
|
+
inFlightRepetitions.delete(
|
|
2785
|
+
`${event.runId}:${event.testCaseId}:${event.repetitionId}:${event.repetitionIndex}`
|
|
2786
|
+
);
|
|
2458
2787
|
const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
|
|
2459
2788
|
const averageScore = numericScores.length > 0 ? numericScores.reduce((sum, value) => sum + value, 0) / numericScores.length : void 0;
|
|
2460
|
-
const
|
|
2461
|
-
const existing = testCaseByTestId.get(
|
|
2789
|
+
const compositeId = `${event.runId}:${event.testCaseId}`;
|
|
2790
|
+
const existing = testCaseByTestId.get(compositeId) ?? {
|
|
2462
2791
|
name: event.testCaseName,
|
|
2463
2792
|
events: []
|
|
2464
2793
|
};
|
|
@@ -2468,7 +2797,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
|
|
|
2468
2797
|
durationMs: event.durationMs,
|
|
2469
2798
|
evaluatorScores: event.evaluatorScores
|
|
2470
2799
|
});
|
|
2471
|
-
testCaseByTestId.set(
|
|
2800
|
+
testCaseByTestId.set(compositeId, existing);
|
|
2472
2801
|
for (const item of event.evaluatorScores) {
|
|
2473
2802
|
const numeric = toNumericScoreFromScores(item.scores);
|
|
2474
2803
|
if (numeric !== void 0) {
|
|
@@ -2497,10 +2826,10 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
|
|
|
2497
2826
|
scoreItemsByEvaluatorScore.set(key, list);
|
|
2498
2827
|
}
|
|
2499
2828
|
}
|
|
2500
|
-
const isSameTestCase = lastPrintedTestCaseId ===
|
|
2501
|
-
const
|
|
2829
|
+
const isSameTestCase = lastPrintedTestCaseId === compositeId;
|
|
2830
|
+
const isLastRepetition = event.repetitionIndex >= event.repetitionCount;
|
|
2502
2831
|
const isNonTty = !process.stdout.isTTY;
|
|
2503
|
-
const skipPrintNonTty = isNonTty && event.
|
|
2832
|
+
const skipPrintNonTty = isNonTty && event.repetitionCount > 1 && !isLastRepetition;
|
|
2504
2833
|
if (isSameTestCase && lastPrintedLineCount > 0 && !skipPrintNonTty) {
|
|
2505
2834
|
cursorUp(lastPrintedLineCount);
|
|
2506
2835
|
}
|
|
@@ -2511,7 +2840,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
|
|
|
2511
2840
|
const lines = [];
|
|
2512
2841
|
const statusSuffix = event.errorMessage ? ` ${colorize("ERROR", `${ansi2.bold}${ansi2.red}`)}` : "";
|
|
2513
2842
|
lines.push(
|
|
2514
|
-
`${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.
|
|
2843
|
+
`${pfx}${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.repetitionIndex}/${event.repetitionCount})`, ansi2.cyan)} ${colorize(`(${durationMs}ms)`, ansi2.dim)}${statusSuffix}`
|
|
2515
2844
|
);
|
|
2516
2845
|
if (event.errorMessage) {
|
|
2517
2846
|
lines.push(colorize(event.errorMessage, ansi2.red));
|
|
@@ -2542,64 +2871,102 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
|
|
|
2542
2871
|
}
|
|
2543
2872
|
}
|
|
2544
2873
|
if (!skipPrintNonTty) {
|
|
2545
|
-
for (let i = 0; i < lines.length; i
|
|
2874
|
+
for (let i = 0; i < lines.length; i += 1) {
|
|
2546
2875
|
process.stdout.write(`\r\x1B[2K${lines[i]}
|
|
2547
2876
|
`);
|
|
2548
2877
|
}
|
|
2549
|
-
lastPrintedTestCaseId =
|
|
2878
|
+
lastPrintedTestCaseId = compositeId;
|
|
2550
2879
|
lastPrintedLineCount = lines.length;
|
|
2551
2880
|
}
|
|
2552
2881
|
drawSpinner();
|
|
2553
2882
|
}
|
|
2554
|
-
if (event.type === "
|
|
2883
|
+
if (event.type === "RunFailed") {
|
|
2884
|
+
if (batchReady && !batchPendingRunIds.has(event.runId)) {
|
|
2885
|
+
return;
|
|
2886
|
+
}
|
|
2555
2887
|
runFinished = true;
|
|
2556
2888
|
clearLine();
|
|
2557
2889
|
unsubscribe();
|
|
2558
|
-
|
|
2890
|
+
reject(new Error(`Run failed: ${event.errorMessage}`));
|
|
2891
|
+
return;
|
|
2892
|
+
}
|
|
2893
|
+
if (event.type === "RunCompleted") {
|
|
2894
|
+
if (!batchPendingRunIds.has(event.runId)) {
|
|
2895
|
+
return;
|
|
2896
|
+
}
|
|
2897
|
+
completedRuns.set(event.runId, event);
|
|
2898
|
+
batchPendingRunIds.delete(event.runId);
|
|
2899
|
+
if (batchPendingRunIds.size === 0) {
|
|
2900
|
+
runFinished = true;
|
|
2901
|
+
clearLine();
|
|
2902
|
+
unsubscribe();
|
|
2903
|
+
resolve5();
|
|
2904
|
+
}
|
|
2559
2905
|
}
|
|
2560
2906
|
});
|
|
2561
2907
|
});
|
|
2562
|
-
|
|
2563
|
-
|
|
2564
|
-
|
|
2565
|
-
|
|
2908
|
+
console.log(colorize("=== Eval Run Started (RunConfigs) ===", `${ansi2.bold}${ansi2.cyan}`));
|
|
2909
|
+
for (const name of runConfigNames) {
|
|
2910
|
+
const collected = await runner.resolveRunConfigByName(name);
|
|
2911
|
+
const label = collected?.runConfig.getDisplayLabel() ?? name;
|
|
2912
|
+
console.log(`RunConfig: ${colorize(label, ansi2.bold)}`);
|
|
2913
|
+
}
|
|
2914
|
+
console.log(`Jobs: ${colorize(String(jobs.length), ansi2.bold)}`);
|
|
2915
|
+
console.log(`Shared concurrency: ${colorize(String(concurrency), ansi2.bold)}`);
|
|
2916
|
+
console.log("");
|
|
2917
|
+
const snapshots = await runner.runDatasetJobsWithSharedConcurrency({
|
|
2918
|
+
jobs,
|
|
2919
|
+
globalConcurrency: concurrency
|
|
2566
2920
|
});
|
|
2567
|
-
|
|
2568
|
-
|
|
2569
|
-
|
|
2570
|
-
|
|
2571
|
-
|
|
2572
|
-
|
|
2573
|
-
|
|
2574
|
-
|
|
2921
|
+
for (let i = 0; i < snapshots.length; i += 1) {
|
|
2922
|
+
const snap = snapshots[i];
|
|
2923
|
+
const job = jobs[i];
|
|
2924
|
+
if (snap && job) {
|
|
2925
|
+
runIdToLabel.set(
|
|
2926
|
+
snap.runId,
|
|
2927
|
+
`${job.runConfigDisplayLabel ?? job.runConfigName} \xB7 ${snap.datasetName}`
|
|
2928
|
+
);
|
|
2929
|
+
batchPendingRunIds.add(snap.runId);
|
|
2930
|
+
}
|
|
2931
|
+
}
|
|
2932
|
+
totalCount = snapshots.reduce((sum, s) => sum + s.totalTestCases, 0);
|
|
2933
|
+
console.log(`Total evaluation units: ${colorize(String(totalCount), ansi2.bold)}`);
|
|
2575
2934
|
console.log("");
|
|
2935
|
+
batchReady = true;
|
|
2576
2936
|
drawSpinner();
|
|
2577
2937
|
spinnerTimer = setInterval(drawSpinner, 100);
|
|
2578
|
-
|
|
2938
|
+
await done;
|
|
2579
2939
|
if (spinnerTimer) {
|
|
2580
2940
|
clearInterval(spinnerTimer);
|
|
2581
2941
|
}
|
|
2582
|
-
if (finalEvent.type === "RunFailed") {
|
|
2583
|
-
throw new Error(`Run failed: ${finalEvent.errorMessage}`);
|
|
2584
|
-
}
|
|
2585
|
-
const completed = finalEvent;
|
|
2586
2942
|
console.log("");
|
|
2587
|
-
console.log(colorize("=== Run Summary ===", `${ansi2.bold}${ansi2.cyan}`));
|
|
2588
|
-
|
|
2589
|
-
|
|
2590
|
-
|
|
2591
|
-
|
|
2592
|
-
|
|
2593
|
-
|
|
2594
|
-
|
|
2595
|
-
)
|
|
2596
|
-
|
|
2943
|
+
console.log(colorize("=== Run Summary (all jobs) ===", `${ansi2.bold}${ansi2.cyan}`));
|
|
2944
|
+
for (const snap of snapshots) {
|
|
2945
|
+
const completed = completedRuns.get(snap.runId);
|
|
2946
|
+
if (!completed) {
|
|
2947
|
+
continue;
|
|
2948
|
+
}
|
|
2949
|
+
const label = runIdToLabel.get(snap.runId) ?? snap.runId;
|
|
2950
|
+
console.log("");
|
|
2951
|
+
console.log(colorize(`\u2014 ${label}`, ansi2.magenta));
|
|
2952
|
+
console.log(
|
|
2953
|
+
`- passed: ${colorize(`${completed.passedTestCases}/${completed.totalTestCases}`, ansi2.green)}`
|
|
2954
|
+
);
|
|
2955
|
+
console.log(
|
|
2956
|
+
`- failed: ${colorize(
|
|
2957
|
+
`${completed.failedTestCases}/${completed.totalTestCases}`,
|
|
2958
|
+
completed.failedTestCases > 0 ? ansi2.red : ansi2.dim
|
|
2959
|
+
)}`
|
|
2960
|
+
);
|
|
2961
|
+
console.log(`- artifact: ${colorize(completed.artifactPath, ansi2.dim)}`);
|
|
2962
|
+
}
|
|
2597
2963
|
if (overallScoreCount > 0) {
|
|
2598
2964
|
const overallAverage = overallScoreTotal / overallScoreCount;
|
|
2599
2965
|
const overallSd = sampleStdDev2(overallScoreTotal, overallScoreSumSq, overallScoreCount);
|
|
2600
2966
|
const avgStr = overallSd !== void 0 ? `${overallAverage.toFixed(2)} \xB1 ${overallSd.toFixed(2)}` : overallAverage.toFixed(2);
|
|
2967
|
+
console.log("");
|
|
2601
2968
|
console.log(
|
|
2602
|
-
`- overall avg score: ${colorize(
|
|
2969
|
+
`- overall avg score (all jobs): ${colorize(
|
|
2603
2970
|
avgStr,
|
|
2604
2971
|
scoreToColor(overallAverage)
|
|
2605
2972
|
)} ${colorize(createBar2(overallAverage), ansi2.dim)}`
|
|
@@ -2640,22 +3007,28 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
|
|
|
2640
3007
|
);
|
|
2641
3008
|
}
|
|
2642
3009
|
}
|
|
2643
|
-
|
|
3010
|
+
let failedTestCasesTotal = 0;
|
|
3011
|
+
for (const snap of snapshots) {
|
|
3012
|
+
const completed = completedRuns.get(snap.runId);
|
|
3013
|
+
if (completed) {
|
|
3014
|
+
failedTestCasesTotal += completed.failedTestCases;
|
|
3015
|
+
}
|
|
3016
|
+
}
|
|
3017
|
+
return failedTestCasesTotal > 0 ? 1 : 0;
|
|
2644
3018
|
}
|
|
2645
|
-
async function
|
|
3019
|
+
async function runSimpleEvalRunConfigsInk(runner, runConfigNames, concurrency) {
|
|
2646
3020
|
return new Promise((resolve5, reject) => {
|
|
2647
3021
|
const app = render(
|
|
2648
3022
|
React.createElement(RunView, {
|
|
2649
3023
|
runner,
|
|
2650
|
-
|
|
2651
|
-
evaluatorPattern,
|
|
3024
|
+
runConfigNames,
|
|
2652
3025
|
concurrency,
|
|
2653
|
-
onComplete: (err) => {
|
|
3026
|
+
onComplete: (err, exitCode) => {
|
|
2654
3027
|
app.unmount();
|
|
2655
3028
|
if (err) {
|
|
2656
3029
|
reject(err);
|
|
2657
3030
|
} else {
|
|
2658
|
-
resolve5();
|
|
3031
|
+
resolve5(exitCode ?? 0);
|
|
2659
3032
|
}
|
|
2660
3033
|
}
|
|
2661
3034
|
})
|
|
@@ -2681,12 +3054,22 @@ async function main() {
|
|
|
2681
3054
|
if (!args.command) {
|
|
2682
3055
|
printUsageAndExit(1);
|
|
2683
3056
|
}
|
|
2684
|
-
if (
|
|
2685
|
-
|
|
2686
|
-
|
|
3057
|
+
if (args.command === "run") {
|
|
3058
|
+
if (args.runConfigNames.length === 0) {
|
|
3059
|
+
console.error(
|
|
3060
|
+
"Missing required --run-config <name> (repeat the flag to queue multiple RunConfigs)."
|
|
3061
|
+
);
|
|
3062
|
+
printUsageAndExit(1);
|
|
3063
|
+
}
|
|
3064
|
+
if (args.datasetName !== void 0) {
|
|
3065
|
+
console.error(
|
|
3066
|
+
"The run command no longer accepts --dataset; use --run-config <RunConfig name>."
|
|
3067
|
+
);
|
|
3068
|
+
printUsageAndExit(1);
|
|
3069
|
+
}
|
|
2687
3070
|
}
|
|
2688
|
-
if (args.command === "
|
|
2689
|
-
console.error("
|
|
3071
|
+
if (args.command === "generate" && args.runConfigNames.length > 0) {
|
|
3072
|
+
console.error("generate does not accept --run-config.");
|
|
2690
3073
|
printUsageAndExit(1);
|
|
2691
3074
|
}
|
|
2692
3075
|
const useInk = process.stdout.isTTY === true;
|
|
@@ -2697,17 +3080,24 @@ async function main() {
|
|
|
2697
3080
|
try {
|
|
2698
3081
|
if (args.command === "run") {
|
|
2699
3082
|
const concurrency = args.concurrency ?? getDefaultConcurrency();
|
|
2700
|
-
await (useInk ?
|
|
3083
|
+
const exitCode = await (useInk ? runSimpleEvalRunConfigsInk : runSimpleEvalRunConfigsPlain)(
|
|
2701
3084
|
runner,
|
|
2702
|
-
args.
|
|
2703
|
-
args.evaluatorPattern,
|
|
3085
|
+
args.runConfigNames,
|
|
2704
3086
|
concurrency
|
|
2705
3087
|
);
|
|
3088
|
+
if (args.ci && exitCode !== 0) {
|
|
3089
|
+
process.exit(1);
|
|
3090
|
+
}
|
|
2706
3091
|
return;
|
|
2707
3092
|
}
|
|
3093
|
+
const genDataset = args.datasetName;
|
|
3094
|
+
if (!genDataset) {
|
|
3095
|
+
console.error("Missing required --dataset <datasetId> argument.");
|
|
3096
|
+
printUsageAndExit(1);
|
|
3097
|
+
}
|
|
2708
3098
|
await (useInk ? generateDatasetJsonCommandInk : generateDatasetJsonCommandPlain)(
|
|
2709
3099
|
runner,
|
|
2710
|
-
|
|
3100
|
+
genDataset
|
|
2711
3101
|
);
|
|
2712
3102
|
} finally {
|
|
2713
3103
|
await runner.shutdown();
|