@m4trix/evals 0.25.1 → 0.26.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +29 -7
- package/dist/cli-simple.cjs +831 -450
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +832 -451
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +531 -270
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +531 -270
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +888 -509
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +201 -7
- package/dist/index.js +878 -513
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli-simple.js
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
import { randomUUID } from 'crypto';
|
|
3
|
-
import { Effect, PubSub, Queue, Ref, Fiber } from 'effect';
|
|
3
|
+
import { Schema, Effect, PubSub, Queue, Ref, Fiber, Either, ParseResult } from 'effect';
|
|
4
|
+
import { writeFile, readdir, readFile, mkdir, appendFile } from 'fs/promises';
|
|
5
|
+
import { resolve, join, relative, parse, dirname } from 'path';
|
|
4
6
|
import { existsSync } from 'fs';
|
|
5
|
-
import { resolve, relative, join, parse, dirname } from 'path';
|
|
6
7
|
import * as jitiModule from 'jiti';
|
|
7
|
-
import { writeFile, readdir, readFile, mkdir, appendFile } from 'fs/promises';
|
|
8
8
|
import { pathToFileURL } from 'url';
|
|
9
9
|
import { diffLines } from 'diff';
|
|
10
10
|
import stringify from 'fast-json-stable-stringify';
|
|
@@ -13,12 +13,178 @@ import React__default, { useState, useEffect, useCallback } from 'react';
|
|
|
13
13
|
import { render, Box, Text } from 'ink';
|
|
14
14
|
import { jsxs, jsx, Fragment } from 'react/jsx-runtime';
|
|
15
15
|
|
|
16
|
+
var ENTITY_ID_PATTERN = /^[a-zA-Z0-9_-]+$/;
|
|
17
|
+
function makeEntityIdSchema(brand, label) {
|
|
18
|
+
return Schema.String.pipe(
|
|
19
|
+
Schema.trimmed(),
|
|
20
|
+
Schema.minLength(1, {
|
|
21
|
+
message: () => `${label} must be non-empty.`
|
|
22
|
+
}),
|
|
23
|
+
Schema.pattern(ENTITY_ID_PATTERN, {
|
|
24
|
+
message: () => `${label} may only contain letters, digits, underscores, and hyphens (no spaces). Examples: "my-nightly", "my_nightly", "myNightly".`
|
|
25
|
+
}),
|
|
26
|
+
Schema.brand(brand)
|
|
27
|
+
);
|
|
28
|
+
}
|
|
29
|
+
var RunConfigNameSchema = makeEntityIdSchema("RunConfigName", "RunConfig name");
|
|
30
|
+
makeEntityIdSchema("EvaluatorName", "Evaluator name");
|
|
31
|
+
makeEntityIdSchema("TestCaseName", "Test case name");
|
|
32
|
+
function validateWithSchema(schema, raw, context) {
|
|
33
|
+
const trimmed = raw.trim();
|
|
34
|
+
const decode = Schema.decodeUnknownEither(
|
|
35
|
+
schema
|
|
36
|
+
);
|
|
37
|
+
const result = decode(trimmed);
|
|
38
|
+
if (Either.isLeft(result)) {
|
|
39
|
+
throw new Error(`${context}: ${ParseResult.TreeFormatter.formatErrorSync(result.left)}`);
|
|
40
|
+
}
|
|
41
|
+
return result.right;
|
|
42
|
+
}
|
|
43
|
+
function validateRunConfigName(raw, context) {
|
|
44
|
+
return validateWithSchema(RunConfigNameSchema, raw, context);
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
// src/evals/evaluator.ts
|
|
48
|
+
function getEvaluatorDisplayLabel(evaluator) {
|
|
49
|
+
if (typeof evaluator.getDisplayLabel === "function") {
|
|
50
|
+
const label = evaluator.getDisplayLabel();
|
|
51
|
+
if (label !== void 0) {
|
|
52
|
+
return label;
|
|
53
|
+
}
|
|
54
|
+
}
|
|
55
|
+
return typeof evaluator.getName === "function" ? evaluator.getName() : void 0;
|
|
56
|
+
}
|
|
57
|
+
function getEvaluatorTagList(evaluator) {
|
|
58
|
+
return typeof evaluator.getTags === "function" ? [...evaluator.getTags()] : [];
|
|
59
|
+
}
|
|
60
|
+
async function loadRunSnapshotsFromArtifacts(config) {
|
|
61
|
+
const baseDir = resolve(config.artifactDirectory);
|
|
62
|
+
let entries;
|
|
63
|
+
try {
|
|
64
|
+
entries = await readdir(baseDir);
|
|
65
|
+
} catch {
|
|
66
|
+
return [];
|
|
67
|
+
}
|
|
68
|
+
const jsonlFiles = entries.filter((name) => name.endsWith(".jsonl"));
|
|
69
|
+
const snapshots = [];
|
|
70
|
+
for (const fileName of jsonlFiles) {
|
|
71
|
+
const filePath = join(baseDir, fileName);
|
|
72
|
+
try {
|
|
73
|
+
const snapshot = await parseArtifactToSnapshot(filePath, config);
|
|
74
|
+
if (snapshot) {
|
|
75
|
+
snapshots.push(snapshot);
|
|
76
|
+
}
|
|
77
|
+
} catch {
|
|
78
|
+
}
|
|
79
|
+
}
|
|
80
|
+
return snapshots.sort((a, b) => b.queuedAt - a.queuedAt);
|
|
81
|
+
}
|
|
82
|
+
async function parseArtifactToSnapshot(filePath, _config) {
|
|
83
|
+
const content = await readFile(filePath, "utf8");
|
|
84
|
+
const lines = content.split("\n").filter((line) => line.trim().length > 0);
|
|
85
|
+
if (lines.length === 0) {
|
|
86
|
+
return null;
|
|
87
|
+
}
|
|
88
|
+
let runQueued = null;
|
|
89
|
+
let runCompleted = null;
|
|
90
|
+
let runFailed = null;
|
|
91
|
+
let runStarted = null;
|
|
92
|
+
for (const line of lines) {
|
|
93
|
+
try {
|
|
94
|
+
const event = JSON.parse(line);
|
|
95
|
+
const type = event.type;
|
|
96
|
+
if (type === "RunQueued") {
|
|
97
|
+
runQueued = {
|
|
98
|
+
runId: event.runId,
|
|
99
|
+
datasetId: event.datasetId,
|
|
100
|
+
datasetName: event.datasetName,
|
|
101
|
+
evaluatorIds: event.evaluatorIds,
|
|
102
|
+
totalTestCases: event.totalTestCases ?? 0,
|
|
103
|
+
artifactPath: event.artifactPath ?? filePath,
|
|
104
|
+
ts: event.ts
|
|
105
|
+
};
|
|
106
|
+
}
|
|
107
|
+
if (type === "RunStarted") {
|
|
108
|
+
runStarted = { startedAt: event.startedAt };
|
|
109
|
+
}
|
|
110
|
+
if (type === "RunCompleted") {
|
|
111
|
+
runCompleted = {
|
|
112
|
+
passedTestCases: event.passedTestCases,
|
|
113
|
+
failedTestCases: event.failedTestCases,
|
|
114
|
+
totalTestCases: event.totalTestCases,
|
|
115
|
+
finishedAt: event.finishedAt
|
|
116
|
+
};
|
|
117
|
+
}
|
|
118
|
+
if (type === "RunFailed") {
|
|
119
|
+
runFailed = {
|
|
120
|
+
finishedAt: event.finishedAt,
|
|
121
|
+
errorMessage: event.errorMessage
|
|
122
|
+
};
|
|
123
|
+
}
|
|
124
|
+
} catch {
|
|
125
|
+
}
|
|
126
|
+
}
|
|
127
|
+
if (!runQueued) {
|
|
128
|
+
return null;
|
|
129
|
+
}
|
|
130
|
+
const artifactPath = filePath;
|
|
131
|
+
const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
|
|
132
|
+
const progress = aggregateTestCaseProgress(lines);
|
|
133
|
+
const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
|
|
134
|
+
const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
|
|
135
|
+
const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
|
|
136
|
+
return {
|
|
137
|
+
runId: runQueued.runId,
|
|
138
|
+
datasetId: runQueued.datasetId,
|
|
139
|
+
datasetName: runQueued.datasetName,
|
|
140
|
+
evaluatorIds: runQueued.evaluatorIds,
|
|
141
|
+
queuedAt: runQueued.ts ?? 0,
|
|
142
|
+
startedAt: runStarted?.startedAt,
|
|
143
|
+
finishedAt: runCompleted?.finishedAt ?? runFailed?.finishedAt,
|
|
144
|
+
totalTestCases: runQueued.totalTestCases,
|
|
145
|
+
completedTestCases,
|
|
146
|
+
passedTestCases,
|
|
147
|
+
failedTestCases,
|
|
148
|
+
status,
|
|
149
|
+
artifactPath,
|
|
150
|
+
errorMessage: runFailed?.errorMessage
|
|
151
|
+
};
|
|
152
|
+
}
|
|
153
|
+
function aggregateTestCaseProgress(lines) {
|
|
154
|
+
let completedTestCases = 0;
|
|
155
|
+
const testCasePassedBy = /* @__PURE__ */ new Map();
|
|
156
|
+
for (const line of lines) {
|
|
157
|
+
try {
|
|
158
|
+
const event = JSON.parse(line);
|
|
159
|
+
if (event.type === "TestCaseProgress") {
|
|
160
|
+
const ev = event;
|
|
161
|
+
completedTestCases = ev.completedTestCases ?? completedTestCases;
|
|
162
|
+
const id = ev.testCaseId;
|
|
163
|
+
const current = testCasePassedBy.get(id);
|
|
164
|
+
testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
|
|
165
|
+
}
|
|
166
|
+
} catch {
|
|
167
|
+
}
|
|
168
|
+
}
|
|
169
|
+
let passedTestCases = 0;
|
|
170
|
+
let failedTestCases = 0;
|
|
171
|
+
for (const passed of testCasePassedBy.values()) {
|
|
172
|
+
if (passed) {
|
|
173
|
+
passedTestCases += 1;
|
|
174
|
+
} else {
|
|
175
|
+
failedTestCases += 1;
|
|
176
|
+
}
|
|
177
|
+
}
|
|
178
|
+
return { completedTestCases, passedTestCases, failedTestCases };
|
|
179
|
+
}
|
|
180
|
+
|
|
16
181
|
// src/runner/config.ts
|
|
17
182
|
var defaultRunnerConfig = {
|
|
18
183
|
discovery: {
|
|
19
184
|
rootDir: process.cwd(),
|
|
20
185
|
datasetSuffixes: [".dataset.ts", ".dataset.tsx", ".dataset.js", ".dataset.mjs"],
|
|
21
186
|
evaluatorSuffixes: [".evaluator.ts", ".evaluator.tsx", ".evaluator.js", ".evaluator.mjs"],
|
|
187
|
+
runConfigSuffixes: [".run-config.ts", ".run-config.tsx", ".run-config.js", ".run-config.mjs"],
|
|
22
188
|
testCaseSuffixes: [".test-case.ts", ".test-case.tsx", ".test-case.js", ".test-case.mjs"],
|
|
23
189
|
excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
|
|
24
190
|
},
|
|
@@ -44,6 +210,11 @@ function toRunnerConfigOverrides(config) {
|
|
|
44
210
|
} else if (rawDiscovery?.evaluatorSuffixes !== void 0) {
|
|
45
211
|
discovery.evaluatorSuffixes = rawDiscovery.evaluatorSuffixes;
|
|
46
212
|
}
|
|
213
|
+
if (rawDiscovery?.runConfigFilePatterns !== void 0) {
|
|
214
|
+
discovery.runConfigSuffixes = rawDiscovery.runConfigFilePatterns;
|
|
215
|
+
} else if (rawDiscovery?.runConfigSuffixes !== void 0) {
|
|
216
|
+
discovery.runConfigSuffixes = rawDiscovery.runConfigSuffixes;
|
|
217
|
+
}
|
|
47
218
|
if (rawDiscovery?.testCaseFilePatterns !== void 0) {
|
|
48
219
|
discovery.testCaseSuffixes = rawDiscovery.testCaseFilePatterns;
|
|
49
220
|
} else if (rawDiscovery?.testCaseSuffixes !== void 0) {
|
|
@@ -142,6 +313,9 @@ function isDatasetLike(value) {
|
|
|
142
313
|
function isEvaluatorLike(value) {
|
|
143
314
|
return hasMethod(value, "getName") && hasMethod(value, "resolveContext") && hasMethod(value, "getEvaluateFn");
|
|
144
315
|
}
|
|
316
|
+
function isRunConfigLike(value) {
|
|
317
|
+
return hasMethod(value, "getName") && hasMethod(value, "getRuns") && typeof value.getRuns === "function";
|
|
318
|
+
}
|
|
145
319
|
function isTestCaseLike(value) {
|
|
146
320
|
return hasMethod(value, "getName") && hasMethod(value, "getTags") && hasMethod(value, "getInput");
|
|
147
321
|
}
|
|
@@ -230,6 +404,23 @@ async function collectEvaluatorsFromFiles(config) {
|
|
|
230
404
|
);
|
|
231
405
|
return found.flat();
|
|
232
406
|
}
|
|
407
|
+
async function collectRunConfigsFromFiles(config) {
|
|
408
|
+
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
409
|
+
const matched = files.filter((filePath) => hasOneSuffix(filePath, config.runConfigSuffixes));
|
|
410
|
+
const found = await Promise.all(
|
|
411
|
+
matched.map(async (absolutePath) => {
|
|
412
|
+
const exports = await loadModuleExports(absolutePath);
|
|
413
|
+
const runConfigs = exports.filter(isRunConfigLike);
|
|
414
|
+
const relPath = relative(config.rootDir, absolutePath);
|
|
415
|
+
return runConfigs.map((runConfig) => ({
|
|
416
|
+
id: runConfig.getName(),
|
|
417
|
+
filePath: relPath,
|
|
418
|
+
runConfig
|
|
419
|
+
}));
|
|
420
|
+
})
|
|
421
|
+
);
|
|
422
|
+
return found.flat();
|
|
423
|
+
}
|
|
233
424
|
async function collectTestCasesFromFiles(config) {
|
|
234
425
|
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
235
426
|
const matched = files.filter((filePath) => hasOneSuffix(filePath, config.testCaseSuffixes));
|
|
@@ -383,6 +574,17 @@ function getDiffLines(entry) {
|
|
|
383
574
|
});
|
|
384
575
|
}
|
|
385
576
|
|
|
577
|
+
// src/evals/test-case.ts
|
|
578
|
+
function getTestCaseDisplayLabel(testCase) {
|
|
579
|
+
if (typeof testCase.getDisplayLabel === "function") {
|
|
580
|
+
return testCase.getDisplayLabel();
|
|
581
|
+
}
|
|
582
|
+
return typeof testCase.getName === "function" ? testCase.getName() : "";
|
|
583
|
+
}
|
|
584
|
+
function getTestCaseTagList(testCase) {
|
|
585
|
+
return typeof testCase.getTags === "function" ? [...testCase.getTags()] : [];
|
|
586
|
+
}
|
|
587
|
+
|
|
386
588
|
// src/evals/metric.ts
|
|
387
589
|
var registry = /* @__PURE__ */ new Map();
|
|
388
590
|
var Metric = {
|
|
@@ -406,6 +608,54 @@ function getMetricById(id) {
|
|
|
406
608
|
return registry.get(id);
|
|
407
609
|
}
|
|
408
610
|
|
|
611
|
+
// src/evals/aggregators.ts
|
|
612
|
+
function aggregateTokenCountSum(values) {
|
|
613
|
+
const initial = {
|
|
614
|
+
input: 0,
|
|
615
|
+
output: 0,
|
|
616
|
+
inputCached: 0,
|
|
617
|
+
outputCached: 0
|
|
618
|
+
};
|
|
619
|
+
return values.reduce(
|
|
620
|
+
(acc, v) => ({
|
|
621
|
+
input: acc.input + (v.input ?? 0),
|
|
622
|
+
output: acc.output + (v.output ?? 0),
|
|
623
|
+
inputCached: acc.inputCached + (v.inputCached ?? 0),
|
|
624
|
+
outputCached: acc.outputCached + (v.outputCached ?? 0)
|
|
625
|
+
}),
|
|
626
|
+
initial
|
|
627
|
+
);
|
|
628
|
+
}
|
|
629
|
+
function aggregateLatencyAverage(values) {
|
|
630
|
+
if (values.length === 0) {
|
|
631
|
+
return { ms: 0 };
|
|
632
|
+
}
|
|
633
|
+
const sum = values.reduce((s, v) => s + v.ms, 0);
|
|
634
|
+
return { ms: sum / values.length };
|
|
635
|
+
}
|
|
636
|
+
|
|
637
|
+
// src/evals/metrics/standard.ts
|
|
638
|
+
Metric.of({
|
|
639
|
+
id: "token-count",
|
|
640
|
+
name: "Tokens",
|
|
641
|
+
aggregate: aggregateTokenCountSum,
|
|
642
|
+
format: (data, options) => {
|
|
643
|
+
const input = data.input ?? 0;
|
|
644
|
+
const output = data.output ?? 0;
|
|
645
|
+
const inputCached = data.inputCached ?? 0;
|
|
646
|
+
const outputCached = data.outputCached ?? 0;
|
|
647
|
+
const cached = inputCached + outputCached;
|
|
648
|
+
const base = `in:${input} out:${output} cached:${cached}`;
|
|
649
|
+
return options?.isAggregated ? `Total: ${base}` : base;
|
|
650
|
+
}
|
|
651
|
+
});
|
|
652
|
+
Metric.of({
|
|
653
|
+
id: "latency",
|
|
654
|
+
name: "Latency",
|
|
655
|
+
aggregate: aggregateLatencyAverage,
|
|
656
|
+
format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
|
|
657
|
+
});
|
|
658
|
+
|
|
409
659
|
// src/evals/score.ts
|
|
410
660
|
var registry2 = /* @__PURE__ */ new Map();
|
|
411
661
|
function formatScoreData(def, data, options) {
|
|
@@ -514,54 +764,6 @@ function getScoreById(id) {
|
|
|
514
764
|
return registry2.get(id);
|
|
515
765
|
}
|
|
516
766
|
|
|
517
|
-
// src/evals/aggregators.ts
|
|
518
|
-
function aggregateTokenCountSum(values) {
|
|
519
|
-
const initial = {
|
|
520
|
-
input: 0,
|
|
521
|
-
output: 0,
|
|
522
|
-
inputCached: 0,
|
|
523
|
-
outputCached: 0
|
|
524
|
-
};
|
|
525
|
-
return values.reduce(
|
|
526
|
-
(acc, v) => ({
|
|
527
|
-
input: acc.input + (v.input ?? 0),
|
|
528
|
-
output: acc.output + (v.output ?? 0),
|
|
529
|
-
inputCached: acc.inputCached + (v.inputCached ?? 0),
|
|
530
|
-
outputCached: acc.outputCached + (v.outputCached ?? 0)
|
|
531
|
-
}),
|
|
532
|
-
initial
|
|
533
|
-
);
|
|
534
|
-
}
|
|
535
|
-
function aggregateLatencyAverage(values) {
|
|
536
|
-
if (values.length === 0) {
|
|
537
|
-
return { ms: 0 };
|
|
538
|
-
}
|
|
539
|
-
const sum = values.reduce((s, v) => s + v.ms, 0);
|
|
540
|
-
return { ms: sum / values.length };
|
|
541
|
-
}
|
|
542
|
-
|
|
543
|
-
// src/evals/metrics/standard.ts
|
|
544
|
-
Metric.of({
|
|
545
|
-
id: "token-count",
|
|
546
|
-
name: "Tokens",
|
|
547
|
-
aggregate: aggregateTokenCountSum,
|
|
548
|
-
format: (data, options) => {
|
|
549
|
-
const input = data.input ?? 0;
|
|
550
|
-
const output = data.output ?? 0;
|
|
551
|
-
const inputCached = data.inputCached ?? 0;
|
|
552
|
-
const outputCached = data.outputCached ?? 0;
|
|
553
|
-
const cached = inputCached + outputCached;
|
|
554
|
-
const base = `in:${input} out:${output} cached:${cached}`;
|
|
555
|
-
return options?.isAggregated ? `Total: ${base}` : base;
|
|
556
|
-
}
|
|
557
|
-
});
|
|
558
|
-
Metric.of({
|
|
559
|
-
id: "latency",
|
|
560
|
-
name: "Latency",
|
|
561
|
-
aggregate: aggregateLatencyAverage,
|
|
562
|
-
format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
|
|
563
|
-
});
|
|
564
|
-
|
|
565
767
|
// src/evals/scores/standard.ts
|
|
566
768
|
Score.of({
|
|
567
769
|
id: "percent",
|
|
@@ -705,15 +907,17 @@ function readOutput(testCase) {
|
|
|
705
907
|
}
|
|
706
908
|
return candidate.getOutput();
|
|
707
909
|
}
|
|
708
|
-
function buildEvaluationUnits(testCases) {
|
|
910
|
+
function buildEvaluationUnits(testCases, repetitionCount) {
|
|
911
|
+
const count = Math.max(1, repetitionCount);
|
|
709
912
|
const units = [];
|
|
710
913
|
for (const testCaseItem of testCases) {
|
|
711
|
-
const
|
|
712
|
-
for (let r = 0; r <
|
|
914
|
+
const repetitionId = `rep-${randomUUID()}`;
|
|
915
|
+
for (let r = 0; r < count; r++) {
|
|
713
916
|
units.push({
|
|
714
917
|
testCaseItem,
|
|
715
|
-
|
|
716
|
-
|
|
918
|
+
repetitionId,
|
|
919
|
+
repetitionIndex: r + 1,
|
|
920
|
+
repetitionCount: count
|
|
717
921
|
});
|
|
718
922
|
}
|
|
719
923
|
}
|
|
@@ -726,7 +930,7 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
|
|
|
726
930
|
return join(artifactDirectory, `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`);
|
|
727
931
|
}
|
|
728
932
|
function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
|
|
729
|
-
const { testCaseItem,
|
|
933
|
+
const { testCaseItem, repetitionId, repetitionIndex, repetitionCount } = unit;
|
|
730
934
|
return Effect.gen(function* () {
|
|
731
935
|
const evaluatorRunId = `run-${randomUUID()}`;
|
|
732
936
|
const started = Date.now();
|
|
@@ -735,11 +939,12 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
735
939
|
type: "TestCaseStarted",
|
|
736
940
|
runId: task.runId,
|
|
737
941
|
testCaseId: testCaseItem.id,
|
|
738
|
-
testCaseName: testCaseItem.testCase
|
|
942
|
+
testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
|
|
739
943
|
startedTestCases: startedEvaluations,
|
|
740
944
|
totalTestCases: totalEvaluations,
|
|
741
|
-
|
|
742
|
-
|
|
945
|
+
repetitionId,
|
|
946
|
+
repetitionIndex,
|
|
947
|
+
repetitionCount
|
|
743
948
|
});
|
|
744
949
|
const evaluatorScores = [];
|
|
745
950
|
let testCaseError;
|
|
@@ -773,8 +978,15 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
773
978
|
meta: {
|
|
774
979
|
triggerId: task.triggerId,
|
|
775
980
|
runId: evaluatorRunId,
|
|
776
|
-
datasetId: task.datasetId
|
|
981
|
+
datasetId: task.datasetId,
|
|
982
|
+
repetitionId,
|
|
983
|
+
repetitionIndex,
|
|
984
|
+
repetitionCount,
|
|
985
|
+
runConfigName: task.runConfigName
|
|
777
986
|
},
|
|
987
|
+
testCaseTags: getTestCaseTagList(testCaseItem.testCase),
|
|
988
|
+
runConfigTags: task.runConfigTags,
|
|
989
|
+
evaluatorTags: getEvaluatorTagList(evaluator),
|
|
778
990
|
logDiff,
|
|
779
991
|
log,
|
|
780
992
|
createError
|
|
@@ -817,18 +1029,19 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
817
1029
|
});
|
|
818
1030
|
}
|
|
819
1031
|
}
|
|
820
|
-
const
|
|
1032
|
+
const repetitionPassedThis = evaluatorScores.every((s) => s.passed);
|
|
821
1033
|
const completedEvaluations = yield* Ref.modify(completedRef, (n) => [n + 1, n + 1]);
|
|
822
1034
|
const progressEvent = {
|
|
823
1035
|
type: "TestCaseProgress",
|
|
824
1036
|
runId: task.runId,
|
|
825
1037
|
testCaseId: testCaseItem.id,
|
|
826
|
-
testCaseName: testCaseItem.testCase
|
|
1038
|
+
testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
|
|
827
1039
|
completedTestCases: completedEvaluations,
|
|
828
1040
|
totalTestCases: totalEvaluations,
|
|
829
|
-
|
|
830
|
-
|
|
831
|
-
|
|
1041
|
+
repetitionId,
|
|
1042
|
+
repetitionIndex,
|
|
1043
|
+
repetitionCount,
|
|
1044
|
+
passed: repetitionPassedThis,
|
|
832
1045
|
durationMs: Date.now() - started,
|
|
833
1046
|
evaluatorScores,
|
|
834
1047
|
output,
|
|
@@ -849,9 +1062,9 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
849
1062
|
(map) => {
|
|
850
1063
|
const key = testCaseItem.id;
|
|
851
1064
|
const existing = map.get(key) ?? { completedCount: 0, results: [] };
|
|
852
|
-
const newResults = [...existing.results,
|
|
1065
|
+
const newResults = [...existing.results, repetitionPassedThis];
|
|
853
1066
|
const newCompletedCount = existing.completedCount + 1;
|
|
854
|
-
const isLast = newCompletedCount ===
|
|
1067
|
+
const isLast = newCompletedCount === repetitionCount;
|
|
855
1068
|
const newMap = new Map(map);
|
|
856
1069
|
newMap.set(key, {
|
|
857
1070
|
completedCount: newCompletedCount,
|
|
@@ -888,10 +1101,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
888
1101
|
runId: task.runId,
|
|
889
1102
|
startedAt
|
|
890
1103
|
});
|
|
891
|
-
const totalEvaluations = task.testCases.
|
|
892
|
-
(sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
|
|
893
|
-
0
|
|
894
|
-
);
|
|
1104
|
+
const totalEvaluations = task.testCases.length * Math.max(1, task.repetitions);
|
|
895
1105
|
const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
|
|
896
1106
|
const completedRef = yield* Ref.make(0);
|
|
897
1107
|
const startedRef = yield* Ref.make(0);
|
|
@@ -900,7 +1110,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
900
1110
|
const testCaseResultsRef = yield* Ref.make(
|
|
901
1111
|
/* @__PURE__ */ new Map()
|
|
902
1112
|
);
|
|
903
|
-
const evaluationUnits = buildEvaluationUnits(task.testCases);
|
|
1113
|
+
const evaluationUnits = buildEvaluationUnits(task.testCases, task.repetitions);
|
|
904
1114
|
const processEvaluation = (unit) => processOneEvaluation(
|
|
905
1115
|
task,
|
|
906
1116
|
unit,
|
|
@@ -914,11 +1124,20 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
914
1124
|
failedRef,
|
|
915
1125
|
testCaseResultsRef
|
|
916
1126
|
);
|
|
917
|
-
|
|
918
|
-
|
|
919
|
-
|
|
920
|
-
|
|
921
|
-
|
|
1127
|
+
const globalSem = task.globalEvaluationSemaphore;
|
|
1128
|
+
if (globalSem !== void 0) {
|
|
1129
|
+
yield* Effect.forEach(
|
|
1130
|
+
evaluationUnits,
|
|
1131
|
+
(unit) => globalSem.withPermits(1)(processEvaluation(unit)),
|
|
1132
|
+
{ concurrency: "unbounded", discard: true }
|
|
1133
|
+
);
|
|
1134
|
+
} else {
|
|
1135
|
+
yield* Effect.forEach(
|
|
1136
|
+
evaluationUnits,
|
|
1137
|
+
processEvaluation,
|
|
1138
|
+
maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
|
|
1139
|
+
);
|
|
1140
|
+
}
|
|
922
1141
|
const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* Effect.all([
|
|
923
1142
|
Ref.get(completedRef),
|
|
924
1143
|
Ref.get(passedRef),
|
|
@@ -935,144 +1154,53 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
935
1154
|
artifactPath: task.snapshot.artifactPath
|
|
936
1155
|
};
|
|
937
1156
|
yield* updateSnapshot(task.runId, (snapshot) => ({
|
|
938
|
-
...snapshot,
|
|
939
|
-
status: "completed",
|
|
940
|
-
completedTestCases: completedEvaluations,
|
|
941
|
-
passedTestCases: passedUniqueTestCases,
|
|
942
|
-
failedTestCases: failedUniqueTestCases,
|
|
943
|
-
finishedAt
|
|
944
|
-
}));
|
|
945
|
-
yield* publishEvent(completedEvent);
|
|
946
|
-
yield* Queue.offer(persistenceQueue, {
|
|
947
|
-
runId: task.runId,
|
|
948
|
-
artifactPath: task.snapshot.artifactPath,
|
|
949
|
-
payload: completedEvent
|
|
950
|
-
});
|
|
951
|
-
yield* publishEvent({
|
|
952
|
-
type: "ArtifactFlushed",
|
|
953
|
-
runId: task.runId,
|
|
954
|
-
artifactPath: task.snapshot.artifactPath
|
|
955
|
-
});
|
|
956
|
-
});
|
|
957
|
-
|
|
958
|
-
|
|
959
|
-
|
|
960
|
-
|
|
961
|
-
|
|
962
|
-
} catch {
|
|
963
|
-
return [];
|
|
964
|
-
}
|
|
965
|
-
const jsonlFiles = entries.filter((name) => name.endsWith(".jsonl"));
|
|
966
|
-
const snapshots = [];
|
|
967
|
-
for (const fileName of jsonlFiles) {
|
|
968
|
-
const filePath = join(baseDir, fileName);
|
|
969
|
-
try {
|
|
970
|
-
const snapshot = await parseArtifactToSnapshot(filePath, config);
|
|
971
|
-
if (snapshot) {
|
|
972
|
-
snapshots.push(snapshot);
|
|
973
|
-
}
|
|
974
|
-
} catch {
|
|
975
|
-
}
|
|
976
|
-
}
|
|
977
|
-
return snapshots.sort((a, b) => b.queuedAt - a.queuedAt);
|
|
978
|
-
}
|
|
979
|
-
async function parseArtifactToSnapshot(filePath, _config) {
|
|
980
|
-
const content = await readFile(filePath, "utf8");
|
|
981
|
-
const lines = content.split("\n").filter((line) => line.trim().length > 0);
|
|
982
|
-
if (lines.length === 0) {
|
|
983
|
-
return null;
|
|
984
|
-
}
|
|
985
|
-
let runQueued = null;
|
|
986
|
-
let runCompleted = null;
|
|
987
|
-
let runFailed = null;
|
|
988
|
-
let runStarted = null;
|
|
989
|
-
for (const line of lines) {
|
|
990
|
-
try {
|
|
991
|
-
const event = JSON.parse(line);
|
|
992
|
-
const type = event.type;
|
|
993
|
-
if (type === "RunQueued") {
|
|
994
|
-
runQueued = {
|
|
995
|
-
runId: event.runId,
|
|
996
|
-
datasetId: event.datasetId,
|
|
997
|
-
datasetName: event.datasetName,
|
|
998
|
-
evaluatorIds: event.evaluatorIds,
|
|
999
|
-
totalTestCases: event.totalTestCases ?? 0,
|
|
1000
|
-
artifactPath: event.artifactPath ?? filePath,
|
|
1001
|
-
ts: event.ts
|
|
1002
|
-
};
|
|
1003
|
-
}
|
|
1004
|
-
if (type === "RunStarted") {
|
|
1005
|
-
runStarted = { startedAt: event.startedAt };
|
|
1006
|
-
}
|
|
1007
|
-
if (type === "RunCompleted") {
|
|
1008
|
-
runCompleted = {
|
|
1009
|
-
passedTestCases: event.passedTestCases,
|
|
1010
|
-
failedTestCases: event.failedTestCases,
|
|
1011
|
-
totalTestCases: event.totalTestCases,
|
|
1012
|
-
finishedAt: event.finishedAt
|
|
1013
|
-
};
|
|
1014
|
-
}
|
|
1015
|
-
if (type === "RunFailed") {
|
|
1016
|
-
runFailed = {
|
|
1017
|
-
finishedAt: event.finishedAt,
|
|
1018
|
-
errorMessage: event.errorMessage
|
|
1019
|
-
};
|
|
1020
|
-
}
|
|
1021
|
-
} catch {
|
|
1022
|
-
}
|
|
1157
|
+
...snapshot,
|
|
1158
|
+
status: "completed",
|
|
1159
|
+
completedTestCases: completedEvaluations,
|
|
1160
|
+
passedTestCases: passedUniqueTestCases,
|
|
1161
|
+
failedTestCases: failedUniqueTestCases,
|
|
1162
|
+
finishedAt
|
|
1163
|
+
}));
|
|
1164
|
+
yield* publishEvent(completedEvent);
|
|
1165
|
+
yield* Queue.offer(persistenceQueue, {
|
|
1166
|
+
runId: task.runId,
|
|
1167
|
+
artifactPath: task.snapshot.artifactPath,
|
|
1168
|
+
payload: completedEvent
|
|
1169
|
+
});
|
|
1170
|
+
yield* publishEvent({
|
|
1171
|
+
type: "ArtifactFlushed",
|
|
1172
|
+
runId: task.runId,
|
|
1173
|
+
artifactPath: task.snapshot.artifactPath
|
|
1174
|
+
});
|
|
1175
|
+
});
|
|
1176
|
+
|
|
1177
|
+
// src/runner/name-pattern.ts
|
|
1178
|
+
function parseRegexLiteral(pattern) {
|
|
1179
|
+
if (!pattern.startsWith("/")) {
|
|
1180
|
+
return void 0;
|
|
1023
1181
|
}
|
|
1024
|
-
|
|
1025
|
-
|
|
1182
|
+
const lastSlash = pattern.lastIndexOf("/");
|
|
1183
|
+
if (lastSlash <= 0) {
|
|
1184
|
+
return void 0;
|
|
1026
1185
|
}
|
|
1027
|
-
const artifactPath = filePath;
|
|
1028
|
-
const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
|
|
1029
|
-
const progress = aggregateTestCaseProgress(lines);
|
|
1030
|
-
const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
|
|
1031
|
-
const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
|
|
1032
|
-
const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
|
|
1033
1186
|
return {
|
|
1034
|
-
|
|
1035
|
-
|
|
1036
|
-
datasetName: runQueued.datasetName,
|
|
1037
|
-
evaluatorIds: runQueued.evaluatorIds,
|
|
1038
|
-
queuedAt: runQueued.ts ?? 0,
|
|
1039
|
-
startedAt: runStarted?.startedAt,
|
|
1040
|
-
finishedAt: runCompleted?.finishedAt ?? runFailed?.finishedAt,
|
|
1041
|
-
totalTestCases: runQueued.totalTestCases,
|
|
1042
|
-
completedTestCases,
|
|
1043
|
-
passedTestCases,
|
|
1044
|
-
failedTestCases,
|
|
1045
|
-
status,
|
|
1046
|
-
artifactPath,
|
|
1047
|
-
errorMessage: runFailed?.errorMessage
|
|
1187
|
+
source: pattern.slice(1, lastSlash),
|
|
1188
|
+
flags: pattern.slice(lastSlash + 1)
|
|
1048
1189
|
};
|
|
1049
1190
|
}
|
|
1050
|
-
function
|
|
1051
|
-
|
|
1052
|
-
const
|
|
1053
|
-
|
|
1054
|
-
|
|
1055
|
-
|
|
1056
|
-
if (event.type === "TestCaseProgress") {
|
|
1057
|
-
const ev = event;
|
|
1058
|
-
completedTestCases = ev.completedTestCases ?? completedTestCases;
|
|
1059
|
-
const id = ev.testCaseId;
|
|
1060
|
-
const current = testCasePassedBy.get(id);
|
|
1061
|
-
testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
|
|
1062
|
-
}
|
|
1063
|
-
} catch {
|
|
1064
|
-
}
|
|
1191
|
+
function createNameMatcher(pattern) {
|
|
1192
|
+
const normalizedPattern = pattern.trim();
|
|
1193
|
+
const regexLiteral = parseRegexLiteral(normalizedPattern);
|
|
1194
|
+
if (regexLiteral) {
|
|
1195
|
+
const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
|
|
1196
|
+
return (value) => regex.test(value);
|
|
1065
1197
|
}
|
|
1066
|
-
|
|
1067
|
-
|
|
1068
|
-
|
|
1069
|
-
|
|
1070
|
-
passedTestCases += 1;
|
|
1071
|
-
} else {
|
|
1072
|
-
failedTestCases += 1;
|
|
1073
|
-
}
|
|
1198
|
+
if (normalizedPattern.includes("*")) {
|
|
1199
|
+
const escaped = normalizedPattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
|
|
1200
|
+
const regex = new RegExp(`^${escaped}$`, "i");
|
|
1201
|
+
return (value) => regex.test(value);
|
|
1074
1202
|
}
|
|
1075
|
-
return
|
|
1203
|
+
return (value) => value.toLowerCase() === normalizedPattern.toLowerCase();
|
|
1076
1204
|
}
|
|
1077
1205
|
async function appendJsonLine(artifactPath, payload) {
|
|
1078
1206
|
await mkdir(dirname(artifactPath), { recursive: true });
|
|
@@ -1131,32 +1259,12 @@ function searchCollectedTestCases(all, query) {
|
|
|
1131
1259
|
}
|
|
1132
1260
|
|
|
1133
1261
|
// src/runner/api.ts
|
|
1134
|
-
function
|
|
1135
|
-
|
|
1136
|
-
|
|
1137
|
-
|
|
1138
|
-
const lastSlash = pattern.lastIndexOf("/");
|
|
1139
|
-
if (lastSlash <= 0) {
|
|
1140
|
-
return void 0;
|
|
1141
|
-
}
|
|
1142
|
-
return {
|
|
1143
|
-
source: pattern.slice(1, lastSlash),
|
|
1144
|
-
flags: pattern.slice(lastSlash + 1)
|
|
1145
|
-
};
|
|
1146
|
-
}
|
|
1147
|
-
function createNameMatcher(pattern) {
|
|
1148
|
-
const normalizedPattern = pattern.trim();
|
|
1149
|
-
const regexLiteral = parseRegexLiteral(normalizedPattern);
|
|
1150
|
-
if (regexLiteral) {
|
|
1151
|
-
const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
|
|
1152
|
-
return (value) => regex.test(value);
|
|
1262
|
+
function normalizeRunRepetitions(value) {
|
|
1263
|
+
const n = value ?? 1;
|
|
1264
|
+
if (!Number.isInteger(n) || n < 1) {
|
|
1265
|
+
throw new Error(`repetitions must be a positive integer, got ${String(value)}`);
|
|
1153
1266
|
}
|
|
1154
|
-
|
|
1155
|
-
const escaped = normalizedPattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
|
|
1156
|
-
const regex = new RegExp(`^${escaped}$`, "i");
|
|
1157
|
-
return (value) => regex.test(value);
|
|
1158
|
-
}
|
|
1159
|
-
return (value) => value.toLowerCase() === normalizedPattern.toLowerCase();
|
|
1267
|
+
return n;
|
|
1160
1268
|
}
|
|
1161
1269
|
function mergeRunnerOverrides(base, next) {
|
|
1162
1270
|
if (!base) {
|
|
@@ -1191,6 +1299,7 @@ var EffectRunner = class {
|
|
|
1191
1299
|
this.listeners = /* @__PURE__ */ new Set();
|
|
1192
1300
|
this.datasetsById = /* @__PURE__ */ new Map();
|
|
1193
1301
|
this.evaluatorsById = /* @__PURE__ */ new Map();
|
|
1302
|
+
this.runConfigsById = /* @__PURE__ */ new Map();
|
|
1194
1303
|
this.schedulerFiber = Effect.runFork(this.createSchedulerEffect());
|
|
1195
1304
|
this.persistenceFiber = Effect.runFork(
|
|
1196
1305
|
createPersistenceWorker(this.persistenceQueue)
|
|
@@ -1231,6 +1340,137 @@ var EffectRunner = class {
|
|
|
1231
1340
|
(item) => matcher(item.evaluator.getName() ?? "")
|
|
1232
1341
|
);
|
|
1233
1342
|
}
|
|
1343
|
+
async collectRunConfigs() {
|
|
1344
|
+
const runConfigs = await collectRunConfigsFromFiles(this.config.discovery);
|
|
1345
|
+
this.runConfigsById.clear();
|
|
1346
|
+
const byNameLower = /* @__PURE__ */ new Map();
|
|
1347
|
+
for (const item of runConfigs) {
|
|
1348
|
+
const id = item.runConfig.getName();
|
|
1349
|
+
const lower = id.toLowerCase();
|
|
1350
|
+
const prev = byNameLower.get(lower);
|
|
1351
|
+
if (prev !== void 0 && prev.filePath !== item.filePath) {
|
|
1352
|
+
throw new Error(
|
|
1353
|
+
`Duplicate RunConfig name "${id}" (matches "${prev.runConfig.getName()}" case-insensitively): ${prev.filePath} and ${item.filePath}`
|
|
1354
|
+
);
|
|
1355
|
+
}
|
|
1356
|
+
byNameLower.set(lower, item);
|
|
1357
|
+
this.runConfigsById.set(id, item);
|
|
1358
|
+
}
|
|
1359
|
+
return runConfigs;
|
|
1360
|
+
}
|
|
1361
|
+
async resolveRunConfigByName(name) {
|
|
1362
|
+
if (this.runConfigsById.size === 0) {
|
|
1363
|
+
await this.collectRunConfigs();
|
|
1364
|
+
}
|
|
1365
|
+
const key = validateRunConfigName(name, `RunConfig "${name.trim()}"`);
|
|
1366
|
+
const keyLower = key.toLowerCase();
|
|
1367
|
+
const matches = Array.from(this.runConfigsById.values()).filter(
|
|
1368
|
+
(item) => item.runConfig.getName().toLowerCase() === keyLower
|
|
1369
|
+
);
|
|
1370
|
+
if (matches.length === 0) {
|
|
1371
|
+
return void 0;
|
|
1372
|
+
}
|
|
1373
|
+
if (matches.length > 1) {
|
|
1374
|
+
throw new Error(
|
|
1375
|
+
`Multiple RunConfigs named "${name}": ${matches.map((m) => m.filePath).join(", ")}`
|
|
1376
|
+
);
|
|
1377
|
+
}
|
|
1378
|
+
return matches[0];
|
|
1379
|
+
}
|
|
1380
|
+
async expandRunConfigToJobs(collected) {
|
|
1381
|
+
if (this.datasetsById.size === 0) {
|
|
1382
|
+
await this.collectDatasets();
|
|
1383
|
+
}
|
|
1384
|
+
if (this.evaluatorsById.size === 0) {
|
|
1385
|
+
await this.collectEvaluators();
|
|
1386
|
+
}
|
|
1387
|
+
const rcName = collected.runConfig.getName();
|
|
1388
|
+
const jobs = [];
|
|
1389
|
+
const runs = collected.runConfig.getRuns();
|
|
1390
|
+
for (const [i, row] of runs.entries()) {
|
|
1391
|
+
const dsCollected = Array.from(this.datasetsById.values()).find(
|
|
1392
|
+
(d) => d.dataset === row.dataset
|
|
1393
|
+
);
|
|
1394
|
+
if (!dsCollected) {
|
|
1395
|
+
throw new Error(
|
|
1396
|
+
`RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getName()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
|
|
1397
|
+
);
|
|
1398
|
+
}
|
|
1399
|
+
let evaluatorIds;
|
|
1400
|
+
if ("evaluatorPattern" in row && typeof row.evaluatorPattern === "string") {
|
|
1401
|
+
const matcher = createNameMatcher(row.evaluatorPattern);
|
|
1402
|
+
const matched = Array.from(this.evaluatorsById.values()).filter(
|
|
1403
|
+
(item) => matcher(item.evaluator.getName() ?? "")
|
|
1404
|
+
);
|
|
1405
|
+
if (matched.length === 0) {
|
|
1406
|
+
throw new Error(
|
|
1407
|
+
`RunConfig "${rcName}" run[${i}]: no evaluator matched pattern "${row.evaluatorPattern}"`
|
|
1408
|
+
);
|
|
1409
|
+
}
|
|
1410
|
+
evaluatorIds = matched.map((item) => item.id);
|
|
1411
|
+
} else {
|
|
1412
|
+
const evaluators = row.evaluators;
|
|
1413
|
+
evaluatorIds = [];
|
|
1414
|
+
for (const ev of evaluators) {
|
|
1415
|
+
const found = Array.from(this.evaluatorsById.values()).find(
|
|
1416
|
+
(item) => item.evaluator === ev
|
|
1417
|
+
);
|
|
1418
|
+
if (!found) {
|
|
1419
|
+
throw new Error(
|
|
1420
|
+
`RunConfig "${rcName}" run[${i}]: evaluator "${getEvaluatorDisplayLabel(ev) ?? "unknown"}" was not found among discovered evaluator exports`
|
|
1421
|
+
);
|
|
1422
|
+
}
|
|
1423
|
+
evaluatorIds.push(found.id);
|
|
1424
|
+
}
|
|
1425
|
+
}
|
|
1426
|
+
const repetitions = "repetitions" in row && row.repetitions !== void 0 ? row.repetitions : 1;
|
|
1427
|
+
jobs.push({
|
|
1428
|
+
datasetId: dsCollected.id,
|
|
1429
|
+
evaluatorIds,
|
|
1430
|
+
runConfigName: rcName,
|
|
1431
|
+
runConfigDisplayLabel: collected.runConfig.getDisplayLabel(),
|
|
1432
|
+
runConfigTags: collected.runConfig.getTags(),
|
|
1433
|
+
repetitions
|
|
1434
|
+
});
|
|
1435
|
+
}
|
|
1436
|
+
return jobs;
|
|
1437
|
+
}
|
|
1438
|
+
async expandRunConfigNamesToJobs(names) {
|
|
1439
|
+
const jobs = [];
|
|
1440
|
+
for (const name of names) {
|
|
1441
|
+
const collected = await this.resolveRunConfigByName(name);
|
|
1442
|
+
if (!collected) {
|
|
1443
|
+
const known = await this.collectRunConfigs();
|
|
1444
|
+
const available = known.map((r) => r.runConfig.getName()).sort();
|
|
1445
|
+
throw new Error(
|
|
1446
|
+
available.length > 0 ? `RunConfig "${name}" not found. Available RunConfigs: ${available.join(", ")}` : `RunConfig "${name}" not found and no RunConfigs were discovered.`
|
|
1447
|
+
);
|
|
1448
|
+
}
|
|
1449
|
+
jobs.push(...await this.expandRunConfigToJobs(collected));
|
|
1450
|
+
}
|
|
1451
|
+
return jobs;
|
|
1452
|
+
}
|
|
1453
|
+
async runDatasetJobsWithSharedConcurrency(request) {
|
|
1454
|
+
const globalConcurrency = Math.max(1, request.globalConcurrency);
|
|
1455
|
+
const sem = Effect.unsafeMakeSemaphore(globalConcurrency);
|
|
1456
|
+
const triggerId = request.triggerId ?? `trg-${randomUUID()}`;
|
|
1457
|
+
const snapshots = [];
|
|
1458
|
+
for (const job of request.jobs) {
|
|
1459
|
+
snapshots.push(
|
|
1460
|
+
await this.startDatasetRun({
|
|
1461
|
+
datasetId: job.datasetId,
|
|
1462
|
+
evaluatorIds: job.evaluatorIds,
|
|
1463
|
+
triggerId,
|
|
1464
|
+
maxConcurrency: this.config.maxConcurrency ?? 1,
|
|
1465
|
+
globalEvaluationSemaphore: sem,
|
|
1466
|
+
runConfigName: job.runConfigName,
|
|
1467
|
+
runConfigTags: job.runConfigTags,
|
|
1468
|
+
repetitions: job.repetitions
|
|
1469
|
+
})
|
|
1470
|
+
);
|
|
1471
|
+
}
|
|
1472
|
+
return snapshots;
|
|
1473
|
+
}
|
|
1234
1474
|
async searchTestCases(query) {
|
|
1235
1475
|
const testCases = await collectTestCasesFromFiles(this.config.discovery);
|
|
1236
1476
|
return searchCollectedTestCases(testCases, query);
|
|
@@ -1249,35 +1489,45 @@ var EffectRunner = class {
|
|
|
1249
1489
|
);
|
|
1250
1490
|
}
|
|
1251
1491
|
async runDatasetWith(request) {
|
|
1492
|
+
const runConfigName = validateRunConfigName(
|
|
1493
|
+
request.runConfigName,
|
|
1494
|
+
"runDatasetWith.runConfigName"
|
|
1495
|
+
);
|
|
1496
|
+
return this.startDatasetRun({
|
|
1497
|
+
datasetId: request.datasetId,
|
|
1498
|
+
evaluatorIds: request.evaluatorIds,
|
|
1499
|
+
triggerId: request.triggerId,
|
|
1500
|
+
maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
|
|
1501
|
+
repetitions: request.repetitions,
|
|
1502
|
+
runConfigName,
|
|
1503
|
+
runConfigTags: request.runConfigTags
|
|
1504
|
+
});
|
|
1505
|
+
}
|
|
1506
|
+
async startDatasetRun(params) {
|
|
1252
1507
|
if (this.datasetsById.size === 0) {
|
|
1253
1508
|
await this.collectDatasets();
|
|
1254
1509
|
}
|
|
1255
1510
|
if (this.evaluatorsById.size === 0) {
|
|
1256
1511
|
await this.collectEvaluators();
|
|
1257
1512
|
}
|
|
1258
|
-
const dataset = this.datasetsById.get(
|
|
1513
|
+
const dataset = this.datasetsById.get(params.datasetId);
|
|
1259
1514
|
if (!dataset) {
|
|
1260
|
-
throw new Error(`Unknown dataset: ${
|
|
1515
|
+
throw new Error(`Unknown dataset: ${params.datasetId}`);
|
|
1261
1516
|
}
|
|
1262
|
-
const selectedEvaluators =
|
|
1517
|
+
const selectedEvaluators = params.evaluatorIds.map((id) => this.evaluatorsById.get(id)).filter((value) => Boolean(value)).map((value) => ({ id: value.id, evaluator: value.evaluator }));
|
|
1263
1518
|
if (selectedEvaluators.length === 0) {
|
|
1264
1519
|
throw new Error("No evaluators selected for run");
|
|
1265
1520
|
}
|
|
1266
|
-
const selectedTestCases = await this.collectDatasetTestCases(
|
|
1267
|
-
const
|
|
1268
|
-
|
|
1269
|
-
|
|
1270
|
-
)
|
|
1271
|
-
const triggerId = request.triggerId ?? `trg-${randomUUID()}`;
|
|
1521
|
+
const selectedTestCases = await this.collectDatasetTestCases(params.datasetId);
|
|
1522
|
+
const repetitions = normalizeRunRepetitions(params.repetitions);
|
|
1523
|
+
const totalEvaluations = selectedTestCases.length * repetitions;
|
|
1524
|
+
const runConfigTags = [...params.runConfigTags ?? []];
|
|
1525
|
+
const triggerId = params.triggerId ?? `trg-${randomUUID()}`;
|
|
1272
1526
|
const runId = `run-${randomUUID()}`;
|
|
1273
|
-
const artifactPath = createArtifactPath(
|
|
1274
|
-
this.config.artifactDirectory,
|
|
1275
|
-
request.datasetId,
|
|
1276
|
-
runId
|
|
1277
|
-
);
|
|
1527
|
+
const artifactPath = createArtifactPath(this.config.artifactDirectory, params.datasetId, runId);
|
|
1278
1528
|
const snapshot = {
|
|
1279
1529
|
runId,
|
|
1280
|
-
datasetId:
|
|
1530
|
+
datasetId: params.datasetId,
|
|
1281
1531
|
datasetName: dataset.dataset.getName(),
|
|
1282
1532
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
1283
1533
|
queuedAt: Date.now(),
|
|
@@ -1298,7 +1548,7 @@ var EffectRunner = class {
|
|
|
1298
1548
|
const queuedEvent = {
|
|
1299
1549
|
type: "RunQueued",
|
|
1300
1550
|
runId,
|
|
1301
|
-
datasetId:
|
|
1551
|
+
datasetId: params.datasetId,
|
|
1302
1552
|
datasetName: dataset.dataset.getName(),
|
|
1303
1553
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
1304
1554
|
totalTestCases: totalEvaluations,
|
|
@@ -1312,17 +1562,20 @@ var EffectRunner = class {
|
|
|
1312
1562
|
payload: queuedEvent
|
|
1313
1563
|
})
|
|
1314
1564
|
);
|
|
1315
|
-
const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
|
|
1316
1565
|
await Effect.runPromise(
|
|
1317
1566
|
Queue.offer(this.runQueue, {
|
|
1318
1567
|
runId,
|
|
1319
1568
|
triggerId,
|
|
1320
|
-
datasetId:
|
|
1569
|
+
datasetId: params.datasetId,
|
|
1321
1570
|
dataset: dataset.dataset,
|
|
1322
1571
|
evaluators: selectedEvaluators,
|
|
1323
1572
|
testCases: selectedTestCases,
|
|
1324
1573
|
snapshot,
|
|
1325
|
-
maxConcurrency
|
|
1574
|
+
maxConcurrency: params.maxConcurrency,
|
|
1575
|
+
globalEvaluationSemaphore: params.globalEvaluationSemaphore,
|
|
1576
|
+
runConfigName: params.runConfigName,
|
|
1577
|
+
runConfigTags,
|
|
1578
|
+
repetitions
|
|
1326
1579
|
})
|
|
1327
1580
|
);
|
|
1328
1581
|
return snapshot;
|
|
@@ -1401,6 +1654,8 @@ function getDefaultConcurrency() {
|
|
|
1401
1654
|
function parseSimpleCliArgs(argv) {
|
|
1402
1655
|
const args = {
|
|
1403
1656
|
help: false,
|
|
1657
|
+
ci: false,
|
|
1658
|
+
runConfigNames: [],
|
|
1404
1659
|
unknownArgs: []
|
|
1405
1660
|
};
|
|
1406
1661
|
let index = 0;
|
|
@@ -1414,18 +1669,26 @@ function parseSimpleCliArgs(argv) {
|
|
|
1414
1669
|
args.help = true;
|
|
1415
1670
|
continue;
|
|
1416
1671
|
}
|
|
1672
|
+
if (token === "--ci") {
|
|
1673
|
+
args.ci = true;
|
|
1674
|
+
continue;
|
|
1675
|
+
}
|
|
1417
1676
|
if ((token === "--dataset" || token === "--datasetName") && argv[index + 1]) {
|
|
1418
1677
|
args.datasetName = argv[index + 1];
|
|
1419
1678
|
index += 1;
|
|
1420
1679
|
continue;
|
|
1421
1680
|
}
|
|
1422
|
-
if ((token === "--
|
|
1423
|
-
|
|
1681
|
+
if ((token === "--run-config" || token === "--runConfig") && argv[index + 1]) {
|
|
1682
|
+
const next = argv[index + 1];
|
|
1683
|
+
if (typeof next === "string") {
|
|
1684
|
+
args.runConfigNames.push(next);
|
|
1685
|
+
}
|
|
1424
1686
|
index += 1;
|
|
1425
1687
|
continue;
|
|
1426
1688
|
}
|
|
1427
1689
|
if ((token === "--concurrency" || token === "-c") && argv[index + 1]) {
|
|
1428
|
-
const
|
|
1690
|
+
const nextConc = argv[index + 1];
|
|
1691
|
+
const n = typeof nextConc === "string" ? parseInt(nextConc, 10) : Number.NaN;
|
|
1429
1692
|
if (!Number.isNaN(n) && n >= 1) {
|
|
1430
1693
|
args.concurrency = n;
|
|
1431
1694
|
}
|
|
@@ -1439,16 +1702,12 @@ function parseSimpleCliArgs(argv) {
|
|
|
1439
1702
|
function getSimpleCliUsage() {
|
|
1440
1703
|
return [
|
|
1441
1704
|
"Usage:",
|
|
1442
|
-
" eval-agents-simple run --
|
|
1705
|
+
" eval-agents-simple run --run-config <name> [--run-config <name> ...] [--concurrency N] [--ci]",
|
|
1443
1706
|
" eval-agents-simple generate --dataset <datasetName>",
|
|
1444
1707
|
"",
|
|
1445
1708
|
"Options:",
|
|
1446
|
-
" --
|
|
1447
|
-
""
|
|
1448
|
-
"Pattern examples for --evaluator:",
|
|
1449
|
-
" score-evaluator exact name (case-insensitive)",
|
|
1450
|
-
' "*score*" wildcard pattern',
|
|
1451
|
-
' "/score/i" regex literal'
|
|
1709
|
+
" --ci With run: exit with code 1 if any test case fails.",
|
|
1710
|
+
" --concurrency, -c N Max concurrent evaluations (default: 4). Use 1 for sequential."
|
|
1452
1711
|
].join("\n");
|
|
1453
1712
|
}
|
|
1454
1713
|
|
|
@@ -1499,7 +1758,7 @@ function GenerateView({
|
|
|
1499
1758
|
const payload = testCases.map((item) => {
|
|
1500
1759
|
const tc = item.testCase;
|
|
1501
1760
|
return {
|
|
1502
|
-
name: item.testCase
|
|
1761
|
+
name: getTestCaseDisplayLabel(item.testCase),
|
|
1503
1762
|
input: item.testCase.getInput(),
|
|
1504
1763
|
output: typeof tc.getOutput === "function" ? tc.getOutput() : void 0
|
|
1505
1764
|
};
|
|
@@ -1565,7 +1824,7 @@ async function generateDatasetJsonCommandPlain(runner, datasetName) {
|
|
|
1565
1824
|
}
|
|
1566
1825
|
const testCases = await runner.collectDatasetTestCases(dataset.id);
|
|
1567
1826
|
const payload = testCases.map((item) => ({
|
|
1568
|
-
name: item.testCase
|
|
1827
|
+
name: getTestCaseDisplayLabel(item.testCase),
|
|
1569
1828
|
input: item.testCase.getInput(),
|
|
1570
1829
|
output: readOutput2(item.testCase)
|
|
1571
1830
|
}));
|
|
@@ -1723,8 +1982,7 @@ function formatScorePart(item, _scoreToColor, options) {
|
|
|
1723
1982
|
}
|
|
1724
1983
|
function RunView({
|
|
1725
1984
|
runner,
|
|
1726
|
-
|
|
1727
|
-
evaluatorPattern,
|
|
1985
|
+
runConfigNames,
|
|
1728
1986
|
concurrency,
|
|
1729
1987
|
onComplete
|
|
1730
1988
|
}) {
|
|
@@ -1737,30 +1995,30 @@ function RunView({
|
|
|
1737
1995
|
const [summary, setSummary] = useState(null);
|
|
1738
1996
|
const [evaluatorNameById, setEvaluatorNameById] = useState(/* @__PURE__ */ new Map());
|
|
1739
1997
|
const runEval = useCallback(async () => {
|
|
1740
|
-
const
|
|
1741
|
-
if (
|
|
1742
|
-
|
|
1743
|
-
const available = known.map((item) => item.dataset.getName()).sort();
|
|
1744
|
-
onComplete(
|
|
1745
|
-
new Error(
|
|
1746
|
-
available.length > 0 ? `Dataset "${datasetName}" not found. Available: ${available.join(", ")}` : `Dataset "${datasetName}" not found.`
|
|
1747
|
-
)
|
|
1748
|
-
);
|
|
1998
|
+
const rcList = runConfigNames.filter((n) => n.trim().length > 0);
|
|
1999
|
+
if (rcList.length === 0) {
|
|
2000
|
+
onComplete(new Error("At least one RunConfig name is required."));
|
|
1749
2001
|
return;
|
|
1750
2002
|
}
|
|
1751
|
-
|
|
1752
|
-
|
|
1753
|
-
|
|
1754
|
-
|
|
1755
|
-
|
|
1756
|
-
|
|
1757
|
-
|
|
1758
|
-
|
|
1759
|
-
|
|
2003
|
+
setStartedEvaluations(0);
|
|
2004
|
+
setCompletedEvaluations(0);
|
|
2005
|
+
setTestCases([]);
|
|
2006
|
+
setRunningEvaluations([]);
|
|
2007
|
+
setSummary(null);
|
|
2008
|
+
let jobs;
|
|
2009
|
+
try {
|
|
2010
|
+
jobs = await runner.expandRunConfigNamesToJobs(rcList);
|
|
2011
|
+
} catch (err) {
|
|
2012
|
+
onComplete(err instanceof Error ? err : new Error(String(err)));
|
|
2013
|
+
return;
|
|
2014
|
+
}
|
|
2015
|
+
if (jobs.length === 0) {
|
|
2016
|
+
onComplete(new Error("No jobs expanded from RunConfigs."));
|
|
1760
2017
|
return;
|
|
1761
2018
|
}
|
|
2019
|
+
const allEvaluators = await runner.collectEvaluators();
|
|
1762
2020
|
const nameById = new Map(
|
|
1763
|
-
|
|
2021
|
+
allEvaluators.map((item) => [item.id, getEvaluatorDisplayLabel(item.evaluator) ?? item.id])
|
|
1764
2022
|
);
|
|
1765
2023
|
setEvaluatorNameById(nameById);
|
|
1766
2024
|
const aggregates = /* @__PURE__ */ new Map();
|
|
@@ -1768,21 +2026,30 @@ function RunView({
|
|
|
1768
2026
|
let overallScoreTotal = 0;
|
|
1769
2027
|
let overallScoreSumSq = 0;
|
|
1770
2028
|
let overallScoreCount = 0;
|
|
1771
|
-
const
|
|
2029
|
+
const batchPendingRunIds = /* @__PURE__ */ new Set();
|
|
2030
|
+
const runIdToLabel = /* @__PURE__ */ new Map();
|
|
2031
|
+
let batchReady = false;
|
|
2032
|
+
const completedRuns = /* @__PURE__ */ new Map();
|
|
2033
|
+
const done = new Promise((resolve5, reject) => {
|
|
1772
2034
|
const unsubscribe = runner.subscribeRunEvents((event) => {
|
|
2035
|
+
if (batchReady && "runId" in event && typeof event.runId === "string" && !batchPendingRunIds.has(event.runId)) {
|
|
2036
|
+
return;
|
|
2037
|
+
}
|
|
1773
2038
|
if (event.type === "TestCaseStarted") {
|
|
1774
|
-
setStartedEvaluations(
|
|
2039
|
+
setStartedEvaluations((c) => c + 1);
|
|
1775
2040
|
setRunningEvaluations((prev) => {
|
|
1776
2041
|
const withoutDuplicate = prev.filter(
|
|
1777
|
-
(item) => !(item.testCaseId === event.testCaseId && item.
|
|
2042
|
+
(item) => !(item.testCaseId === event.testCaseId && item.repetitionIndex === event.repetitionIndex && item.runId === event.runId)
|
|
1778
2043
|
);
|
|
1779
2044
|
return [
|
|
1780
2045
|
...withoutDuplicate,
|
|
1781
2046
|
{
|
|
2047
|
+
runId: event.runId,
|
|
1782
2048
|
testCaseId: event.testCaseId,
|
|
1783
2049
|
name: event.testCaseName,
|
|
1784
|
-
|
|
1785
|
-
|
|
2050
|
+
repetitionId: event.repetitionId,
|
|
2051
|
+
repetitionIndex: event.repetitionIndex,
|
|
2052
|
+
repetitionCount: event.repetitionCount,
|
|
1786
2053
|
startedTestCases: event.startedTestCases,
|
|
1787
2054
|
totalTestCases: event.totalTestCases
|
|
1788
2055
|
}
|
|
@@ -1818,9 +2085,12 @@ function RunView({
|
|
|
1818
2085
|
scoreItemsByEvaluatorScore.set(key, list);
|
|
1819
2086
|
}
|
|
1820
2087
|
}
|
|
2088
|
+
const label = runIdToLabel.get(event.runId);
|
|
2089
|
+
const compositeId = `${event.runId}:${event.testCaseId}`;
|
|
2090
|
+
const displayName = label !== void 0 ? `${label} \u203A ${event.testCaseName}` : event.testCaseName;
|
|
1821
2091
|
setTestCases((prev) => {
|
|
1822
2092
|
const byId = new Map(prev.map((tc) => [tc.testCaseId, tc]));
|
|
1823
|
-
const existing = byId.get(
|
|
2093
|
+
const existing = byId.get(compositeId);
|
|
1824
2094
|
const newEvent = {
|
|
1825
2095
|
evaluatorScores: event.evaluatorScores.map((item) => ({
|
|
1826
2096
|
evaluatorId: item.evaluatorId,
|
|
@@ -1837,12 +2107,12 @@ function RunView({
|
|
|
1837
2107
|
const isAggregated = events.length > 1;
|
|
1838
2108
|
const aggregatedEvaluatorScores = aggregateEvaluatorScores(events, nameById);
|
|
1839
2109
|
const merged = {
|
|
1840
|
-
name:
|
|
1841
|
-
testCaseId:
|
|
2110
|
+
name: displayName,
|
|
2111
|
+
testCaseId: compositeId,
|
|
1842
2112
|
completedTestCases: event.completedTestCases,
|
|
1843
2113
|
totalTestCases: event.totalTestCases,
|
|
1844
|
-
|
|
1845
|
-
|
|
2114
|
+
repetitionIndex: event.repetitionIndex,
|
|
2115
|
+
repetitionCount: event.repetitionCount,
|
|
1846
2116
|
durationMs: events.reduce((s, e) => s + e.durationMs, 0),
|
|
1847
2117
|
passed: events.every((e) => e.passed),
|
|
1848
2118
|
errorMessage: event.errorMessage,
|
|
@@ -1850,84 +2120,118 @@ function RunView({
|
|
|
1850
2120
|
aggregatedEvaluatorScores,
|
|
1851
2121
|
isAggregated
|
|
1852
2122
|
};
|
|
1853
|
-
byId.set(
|
|
1854
|
-
setCompletedEvaluations(event.completedTestCases);
|
|
1855
|
-
setRunningEvaluations(
|
|
1856
|
-
(running) => running.filter(
|
|
1857
|
-
(item) => !(item.testCaseId === event.testCaseId && item.rerunIndex === event.rerunIndex)
|
|
1858
|
-
)
|
|
1859
|
-
);
|
|
2123
|
+
byId.set(compositeId, merged);
|
|
1860
2124
|
return Array.from(byId.values());
|
|
1861
2125
|
});
|
|
2126
|
+
setCompletedEvaluations((c) => c + 1);
|
|
2127
|
+
setRunningEvaluations(
|
|
2128
|
+
(running) => running.filter(
|
|
2129
|
+
(item) => !(item.testCaseId === event.testCaseId && item.repetitionIndex === event.repetitionIndex && item.runId === event.runId)
|
|
2130
|
+
)
|
|
2131
|
+
);
|
|
1862
2132
|
}
|
|
1863
|
-
if (event.type === "
|
|
2133
|
+
if (event.type === "RunFailed") {
|
|
2134
|
+
if (batchReady && !batchPendingRunIds.has(event.runId)) {
|
|
2135
|
+
return;
|
|
2136
|
+
}
|
|
1864
2137
|
unsubscribe();
|
|
1865
|
-
|
|
2138
|
+
reject(new Error(`Run failed: ${event.errorMessage}`));
|
|
2139
|
+
return;
|
|
2140
|
+
}
|
|
2141
|
+
if (event.type === "RunCompleted") {
|
|
2142
|
+
if (!batchPendingRunIds.has(event.runId)) {
|
|
2143
|
+
return;
|
|
2144
|
+
}
|
|
2145
|
+
completedRuns.set(event.runId, event);
|
|
2146
|
+
batchPendingRunIds.delete(event.runId);
|
|
2147
|
+
if (batchPendingRunIds.size === 0) {
|
|
2148
|
+
unsubscribe();
|
|
2149
|
+
resolve5();
|
|
2150
|
+
}
|
|
1866
2151
|
}
|
|
1867
2152
|
});
|
|
1868
2153
|
});
|
|
1869
|
-
const
|
|
1870
|
-
|
|
1871
|
-
|
|
1872
|
-
concurrency
|
|
2154
|
+
const snapshots = await runner.runDatasetJobsWithSharedConcurrency({
|
|
2155
|
+
jobs,
|
|
2156
|
+
globalConcurrency: concurrency
|
|
1873
2157
|
});
|
|
2158
|
+
for (let i = 0; i < snapshots.length; i += 1) {
|
|
2159
|
+
const snap = snapshots[i];
|
|
2160
|
+
const job = jobs[i];
|
|
2161
|
+
if (snap && job) {
|
|
2162
|
+
runIdToLabel.set(
|
|
2163
|
+
snap.runId,
|
|
2164
|
+
`${job.runConfigDisplayLabel ?? job.runConfigName} \xB7 ${snap.datasetName}`
|
|
2165
|
+
);
|
|
2166
|
+
batchPendingRunIds.add(snap.runId);
|
|
2167
|
+
}
|
|
2168
|
+
}
|
|
2169
|
+
const totalUnits = snapshots.reduce((sum, s) => sum + s.totalTestCases, 0);
|
|
2170
|
+
batchReady = true;
|
|
2171
|
+
const runConfigLabels = await Promise.all(
|
|
2172
|
+
rcList.map(async (n) => {
|
|
2173
|
+
const collected = await runner.resolveRunConfigByName(n);
|
|
2174
|
+
return collected?.runConfig.getDisplayLabel() ?? n;
|
|
2175
|
+
})
|
|
2176
|
+
);
|
|
1874
2177
|
setRunInfo({
|
|
1875
|
-
|
|
1876
|
-
|
|
1877
|
-
|
|
1878
|
-
totalTestCases: snapshot.totalTestCases
|
|
2178
|
+
names: runConfigLabels,
|
|
2179
|
+
jobs: jobs.length,
|
|
2180
|
+
totalTestCases: totalUnits
|
|
1879
2181
|
});
|
|
1880
2182
|
setPhase("running");
|
|
1881
|
-
|
|
1882
|
-
|
|
1883
|
-
|
|
2183
|
+
try {
|
|
2184
|
+
await done;
|
|
2185
|
+
} catch (err) {
|
|
2186
|
+
onComplete(err instanceof Error ? err : new Error(String(err)));
|
|
1884
2187
|
return;
|
|
1885
2188
|
}
|
|
1886
|
-
|
|
2189
|
+
let passedTestCases = 0;
|
|
2190
|
+
let failedTestCases = 0;
|
|
2191
|
+
let totalTestCases = 0;
|
|
2192
|
+
const artifacts = [];
|
|
2193
|
+
for (const ev of completedRuns.values()) {
|
|
2194
|
+
passedTestCases += ev.passedTestCases;
|
|
2195
|
+
failedTestCases += ev.failedTestCases;
|
|
2196
|
+
totalTestCases += ev.totalTestCases;
|
|
2197
|
+
artifacts.push(ev.artifactPath);
|
|
2198
|
+
}
|
|
1887
2199
|
setSummary({
|
|
1888
|
-
passedTestCases
|
|
1889
|
-
failedTestCases
|
|
1890
|
-
totalTestCases
|
|
2200
|
+
passedTestCases,
|
|
2201
|
+
failedTestCases,
|
|
2202
|
+
totalTestCases,
|
|
1891
2203
|
overallScoreTotal,
|
|
1892
2204
|
overallScoreSumSq,
|
|
1893
2205
|
overallScoreCount,
|
|
1894
2206
|
aggregates: new Map(aggregates),
|
|
1895
2207
|
scoreItemsByEvaluatorScore: new Map(scoreItemsByEvaluatorScore),
|
|
1896
|
-
artifactPath:
|
|
2208
|
+
artifactPath: artifacts.join("\n")
|
|
1897
2209
|
});
|
|
1898
2210
|
setPhase("completed");
|
|
1899
|
-
|
|
1900
|
-
|
|
2211
|
+
const exitCode = failedTestCases > 0 ? 1 : 0;
|
|
2212
|
+
setTimeout(() => onComplete(void 0, exitCode), 200);
|
|
2213
|
+
}, [runner, runConfigNames, concurrency, onComplete]);
|
|
1901
2214
|
useEffect(() => {
|
|
1902
2215
|
void runEval();
|
|
1903
2216
|
}, [runEval]);
|
|
1904
2217
|
return /* @__PURE__ */ jsxs(Box, { flexDirection: "column", padding: 1, children: [
|
|
1905
2218
|
/* @__PURE__ */ jsx(Box, { marginBottom: 1, children: /* @__PURE__ */ jsx(Banner, {}) }),
|
|
1906
2219
|
runInfo && /* @__PURE__ */ jsxs(Box, { flexDirection: "column", marginBottom: 1, children: [
|
|
1907
|
-
/* @__PURE__ */ jsxs(Text, { children: [
|
|
1908
|
-
|
|
1909
|
-
|
|
1910
|
-
" "
|
|
1911
|
-
] }),
|
|
1912
|
-
/* @__PURE__ */ jsx(Text, { color: "gray", children: runInfo.runId })
|
|
2220
|
+
/* @__PURE__ */ jsxs(Text, { color: "cyan", bold: true, children: [
|
|
2221
|
+
"RunConfigs",
|
|
2222
|
+
" "
|
|
1913
2223
|
] }),
|
|
2224
|
+
/* @__PURE__ */ jsx(Text, { color: "gray", children: runInfo.names.join(", ") }),
|
|
1914
2225
|
/* @__PURE__ */ jsxs(Text, { children: [
|
|
1915
2226
|
/* @__PURE__ */ jsxs(Text, { color: "cyan", bold: true, children: [
|
|
1916
|
-
"
|
|
2227
|
+
"Jobs",
|
|
1917
2228
|
" "
|
|
1918
2229
|
] }),
|
|
1919
|
-
runInfo.
|
|
2230
|
+
runInfo.jobs
|
|
1920
2231
|
] }),
|
|
1921
2232
|
/* @__PURE__ */ jsxs(Text, { children: [
|
|
1922
2233
|
/* @__PURE__ */ jsxs(Text, { color: "cyan", bold: true, children: [
|
|
1923
|
-
"
|
|
1924
|
-
" "
|
|
1925
|
-
] }),
|
|
1926
|
-
runInfo.evaluatorNames.join(", ")
|
|
1927
|
-
] }),
|
|
1928
|
-
/* @__PURE__ */ jsxs(Text, { children: [
|
|
1929
|
-
/* @__PURE__ */ jsxs(Text, { color: "cyan", bold: true, children: [
|
|
1930
|
-
"Test cases",
|
|
2234
|
+
"Evaluation units",
|
|
1931
2235
|
" "
|
|
1932
2236
|
] }),
|
|
1933
2237
|
runInfo.totalTestCases
|
|
@@ -1940,22 +2244,29 @@ function RunView({
|
|
|
1940
2244
|
label: `Evaluations ${completedEvaluations}/${runInfo?.totalTestCases ?? 0} completed \u2022 ${startedEvaluations}/${runInfo?.totalTestCases ?? 0} started`
|
|
1941
2245
|
}
|
|
1942
2246
|
),
|
|
1943
|
-
runningEvaluations.length > 0 && /* @__PURE__ */ jsx(Box, { flexDirection: "column", marginTop: 1, children: runningEvaluations.map((item) => /* @__PURE__ */ jsxs(
|
|
1944
|
-
|
|
1945
|
-
|
|
1946
|
-
|
|
1947
|
-
|
|
1948
|
-
|
|
1949
|
-
|
|
1950
|
-
|
|
1951
|
-
|
|
1952
|
-
|
|
1953
|
-
|
|
1954
|
-
|
|
1955
|
-
|
|
1956
|
-
|
|
1957
|
-
|
|
1958
|
-
|
|
2247
|
+
runningEvaluations.length > 0 && /* @__PURE__ */ jsx(Box, { flexDirection: "column", marginTop: 1, children: runningEvaluations.map((item) => /* @__PURE__ */ jsxs(
|
|
2248
|
+
Text,
|
|
2249
|
+
{
|
|
2250
|
+
color: "yellow",
|
|
2251
|
+
children: [
|
|
2252
|
+
"[running ",
|
|
2253
|
+
item.startedTestCases,
|
|
2254
|
+
"/",
|
|
2255
|
+
item.totalTestCases,
|
|
2256
|
+
"] ",
|
|
2257
|
+
item.name,
|
|
2258
|
+
" ",
|
|
2259
|
+
/* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
2260
|
+
"(",
|
|
2261
|
+
item.repetitionIndex,
|
|
2262
|
+
"/",
|
|
2263
|
+
item.repetitionCount,
|
|
2264
|
+
")"
|
|
2265
|
+
] })
|
|
2266
|
+
]
|
|
2267
|
+
},
|
|
2268
|
+
`${item.runId ?? ""}:${item.testCaseId}:${item.repetitionId}:${item.repetitionIndex}`
|
|
2269
|
+
)) })
|
|
1959
2270
|
] }),
|
|
1960
2271
|
testCases.length > 0 && /* @__PURE__ */ jsx(Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc) => /* @__PURE__ */ jsxs(Box, { flexDirection: "column", marginBottom: 0, children: [
|
|
1961
2272
|
/* @__PURE__ */ jsxs(Text, { children: [
|
|
@@ -1971,9 +2282,9 @@ function RunView({
|
|
|
1971
2282
|
" ",
|
|
1972
2283
|
/* @__PURE__ */ jsxs(Text, { color: "cyan", children: [
|
|
1973
2284
|
"(",
|
|
1974
|
-
tc.
|
|
2285
|
+
tc.repetitionIndex,
|
|
1975
2286
|
"/",
|
|
1976
|
-
tc.
|
|
2287
|
+
tc.repetitionCount,
|
|
1977
2288
|
")"
|
|
1978
2289
|
] }),
|
|
1979
2290
|
/* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
@@ -2013,7 +2324,7 @@ function RunView({
|
|
|
2013
2324
|
})
|
|
2014
2325
|
] }) : null
|
|
2015
2326
|
] }),
|
|
2016
|
-
item.scores.length > 0 ? item.scores.map((s
|
|
2327
|
+
item.scores.length > 0 ? item.scores.map((s) => {
|
|
2017
2328
|
const def = s.def ?? getScoreById(s.id);
|
|
2018
2329
|
const scoreLabel = s.name ?? def?.name ?? def?.id ?? s.id;
|
|
2019
2330
|
return /* @__PURE__ */ jsxs(
|
|
@@ -2030,18 +2341,25 @@ function RunView({
|
|
|
2030
2341
|
})
|
|
2031
2342
|
]
|
|
2032
2343
|
},
|
|
2033
|
-
`${item.evaluatorId}-${s.id}-${
|
|
2344
|
+
`${item.evaluatorId}-${s.id}-${scoreLabel}`
|
|
2034
2345
|
);
|
|
2035
2346
|
}) : /* @__PURE__ */ jsx(Text, { color: "gray", children: " n/a" }),
|
|
2036
2347
|
!item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsx(Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
|
|
2037
|
-
(log
|
|
2038
|
-
|
|
2348
|
+
(log) => log.type === "diff" ? /* @__PURE__ */ jsx(
|
|
2349
|
+
Box,
|
|
2039
2350
|
{
|
|
2040
|
-
|
|
2041
|
-
children: line
|
|
2351
|
+
flexDirection: "column",
|
|
2352
|
+
children: getDiffLines(log).map(({ type, line }) => /* @__PURE__ */ jsx(
|
|
2353
|
+
Text,
|
|
2354
|
+
{
|
|
2355
|
+
color: type === "remove" ? "red" : type === "add" ? "green" : "gray",
|
|
2356
|
+
children: line
|
|
2357
|
+
},
|
|
2358
|
+
`${type}:${line}`
|
|
2359
|
+
))
|
|
2042
2360
|
},
|
|
2043
|
-
|
|
2044
|
-
)
|
|
2361
|
+
`diff:${getDiffLines(log).map((x) => x.line).join("|")}`
|
|
2362
|
+
) : log.type === "log" ? /* @__PURE__ */ jsx(Box, { flexDirection: "column", children: getLogLines(log).map((line) => /* @__PURE__ */ jsx(Text, { color: "gray", children: line }, line)) }, `log:${getLogLines(log).join("\n")}`) : null
|
|
2045
2363
|
) })
|
|
2046
2364
|
] }, item.evaluatorId))
|
|
2047
2365
|
] }, tc.testCaseId)) }),
|
|
@@ -2165,10 +2483,10 @@ function RunView({
|
|
|
2165
2483
|
] }, tc.testCaseId);
|
|
2166
2484
|
})
|
|
2167
2485
|
] }),
|
|
2168
|
-
/* @__PURE__ */
|
|
2169
|
-
"artifact:
|
|
2170
|
-
summary.artifactPath
|
|
2171
|
-
] })
|
|
2486
|
+
/* @__PURE__ */ jsxs(Box, { marginTop: 1, flexDirection: "column", children: [
|
|
2487
|
+
/* @__PURE__ */ jsx(Text, { color: "gray", children: "artifact(s):" }),
|
|
2488
|
+
summary.artifactPath.split("\n").map((line) => /* @__PURE__ */ jsx(Text, { color: "gray", children: line }, line))
|
|
2489
|
+
] })
|
|
2172
2490
|
] })
|
|
2173
2491
|
] });
|
|
2174
2492
|
}
|
|
@@ -2380,25 +2698,14 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
|
|
|
2380
2698
|
}
|
|
2381
2699
|
return lines;
|
|
2382
2700
|
}
|
|
2383
|
-
async function
|
|
2384
|
-
const
|
|
2385
|
-
if (
|
|
2386
|
-
|
|
2387
|
-
const available = known.map((item) => item.dataset.getName()).sort();
|
|
2388
|
-
throw new Error(
|
|
2389
|
-
available.length > 0 ? `Dataset "${datasetName}" not found. Available datasets: ${available.join(", ")}` : `Dataset "${datasetName}" not found and no datasets were discovered.`
|
|
2390
|
-
);
|
|
2391
|
-
}
|
|
2392
|
-
const evaluators = await runner.resolveEvaluatorsByNamePattern(evaluatorPattern);
|
|
2393
|
-
if (evaluators.length === 0) {
|
|
2394
|
-
const known = await runner.collectEvaluators();
|
|
2395
|
-
const available = known.map((item) => item.evaluator.getName()).filter((name) => typeof name === "string").sort();
|
|
2396
|
-
throw new Error(
|
|
2397
|
-
available.length > 0 ? `No evaluator matched "${evaluatorPattern}". Available evaluators: ${available.join(", ")}` : `No evaluator matched "${evaluatorPattern}" and no evaluators were discovered.`
|
|
2398
|
-
);
|
|
2701
|
+
async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency) {
|
|
2702
|
+
const jobs = await runner.expandRunConfigNamesToJobs(runConfigNames);
|
|
2703
|
+
if (jobs.length === 0) {
|
|
2704
|
+
throw new Error("No jobs expanded from RunConfigs.");
|
|
2399
2705
|
}
|
|
2706
|
+
const evaluators = await runner.collectEvaluators();
|
|
2400
2707
|
const evaluatorNameById = new Map(
|
|
2401
|
-
evaluators.map((item) => [item.id, item.evaluator
|
|
2708
|
+
evaluators.map((item) => [item.id, getEvaluatorDisplayLabel(item.evaluator) ?? item.id])
|
|
2402
2709
|
);
|
|
2403
2710
|
const aggregates = /* @__PURE__ */ new Map();
|
|
2404
2711
|
const scoreItemsByEvaluatorScore = /* @__PURE__ */ new Map();
|
|
@@ -2406,11 +2713,11 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
|
|
|
2406
2713
|
let overallScoreTotal = 0;
|
|
2407
2714
|
let overallScoreSumSq = 0;
|
|
2408
2715
|
let overallScoreCount = 0;
|
|
2409
|
-
let
|
|
2410
|
-
let
|
|
2716
|
+
let globalStartedUnits = 0;
|
|
2717
|
+
let globalCompletedUnits = 0;
|
|
2411
2718
|
let totalCount = 0;
|
|
2412
2719
|
let runFinished = false;
|
|
2413
|
-
const
|
|
2720
|
+
const inFlightRepetitions = /* @__PURE__ */ new Set();
|
|
2414
2721
|
const spinnerFrames = ["\u280B", "\u2819", "\u2838", "\u2834", "\u2826", "\u2807"];
|
|
2415
2722
|
let spinnerIndex = 0;
|
|
2416
2723
|
function clearLine() {
|
|
@@ -2432,33 +2739,46 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
|
|
|
2432
2739
|
spinnerIndex += 1;
|
|
2433
2740
|
process.stdout.write(
|
|
2434
2741
|
`\r${colorize(frame, ansi2.cyan)} Running evaluations ${colorize(
|
|
2435
|
-
`${
|
|
2742
|
+
`${globalCompletedUnits}/${totalCount}`,
|
|
2436
2743
|
ansi2.bold
|
|
2437
|
-
)} completed ${colorize(`${
|
|
2744
|
+
)} completed ${colorize(`${globalStartedUnits}/${totalCount}`, ansi2.bold)} started ${colorize(`(${inFlightRepetitions.size} running)`, ansi2.dim)}`
|
|
2438
2745
|
);
|
|
2439
2746
|
}
|
|
2440
2747
|
let lastPrintedTestCaseId = null;
|
|
2441
2748
|
let lastPrintedLineCount = 0;
|
|
2442
2749
|
let spinnerTimer;
|
|
2443
|
-
const
|
|
2750
|
+
const batchPendingRunIds = /* @__PURE__ */ new Set();
|
|
2751
|
+
const runIdToLabel = /* @__PURE__ */ new Map();
|
|
2752
|
+
let batchReady = false;
|
|
2753
|
+
const completedRuns = /* @__PURE__ */ new Map();
|
|
2754
|
+
const done = new Promise((resolve5, reject) => {
|
|
2444
2755
|
const unsubscribe = runner.subscribeRunEvents((event) => {
|
|
2756
|
+
if (batchReady && "runId" in event && typeof event.runId === "string" && !batchPendingRunIds.has(event.runId)) {
|
|
2757
|
+
return;
|
|
2758
|
+
}
|
|
2759
|
+
const rowPrefix = typeof event.runId === "string" ? runIdToLabel.get(event.runId) : void 0;
|
|
2760
|
+
const pfx = rowPrefix !== void 0 ? `${colorize(`[${rowPrefix}]`, ansi2.dim)} ` : "";
|
|
2445
2761
|
if (event.type === "TestCaseStarted") {
|
|
2446
|
-
|
|
2447
|
-
|
|
2762
|
+
globalStartedUnits += 1;
|
|
2763
|
+
inFlightRepetitions.add(
|
|
2764
|
+
`${event.runId}:${event.testCaseId}:${event.repetitionId}:${event.repetitionIndex}`
|
|
2765
|
+
);
|
|
2448
2766
|
clearLine();
|
|
2449
2767
|
process.stdout.write(
|
|
2450
|
-
`${colorize(`[started ${event.startedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.
|
|
2768
|
+
`${pfx}${colorize(`[started ${event.startedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.repetitionIndex}/${event.repetitionCount})`, ansi2.cyan)} ${colorize("(running)", ansi2.dim)}
|
|
2451
2769
|
`
|
|
2452
2770
|
);
|
|
2453
2771
|
drawSpinner();
|
|
2454
2772
|
}
|
|
2455
2773
|
if (event.type === "TestCaseProgress") {
|
|
2456
|
-
|
|
2457
|
-
|
|
2774
|
+
globalCompletedUnits += 1;
|
|
2775
|
+
inFlightRepetitions.delete(
|
|
2776
|
+
`${event.runId}:${event.testCaseId}:${event.repetitionId}:${event.repetitionIndex}`
|
|
2777
|
+
);
|
|
2458
2778
|
const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
|
|
2459
2779
|
const averageScore = numericScores.length > 0 ? numericScores.reduce((sum, value) => sum + value, 0) / numericScores.length : void 0;
|
|
2460
|
-
const
|
|
2461
|
-
const existing = testCaseByTestId.get(
|
|
2780
|
+
const compositeId = `${event.runId}:${event.testCaseId}`;
|
|
2781
|
+
const existing = testCaseByTestId.get(compositeId) ?? {
|
|
2462
2782
|
name: event.testCaseName,
|
|
2463
2783
|
events: []
|
|
2464
2784
|
};
|
|
@@ -2468,7 +2788,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
|
|
|
2468
2788
|
durationMs: event.durationMs,
|
|
2469
2789
|
evaluatorScores: event.evaluatorScores
|
|
2470
2790
|
});
|
|
2471
|
-
testCaseByTestId.set(
|
|
2791
|
+
testCaseByTestId.set(compositeId, existing);
|
|
2472
2792
|
for (const item of event.evaluatorScores) {
|
|
2473
2793
|
const numeric = toNumericScoreFromScores(item.scores);
|
|
2474
2794
|
if (numeric !== void 0) {
|
|
@@ -2497,10 +2817,10 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
|
|
|
2497
2817
|
scoreItemsByEvaluatorScore.set(key, list);
|
|
2498
2818
|
}
|
|
2499
2819
|
}
|
|
2500
|
-
const isSameTestCase = lastPrintedTestCaseId ===
|
|
2501
|
-
const
|
|
2820
|
+
const isSameTestCase = lastPrintedTestCaseId === compositeId;
|
|
2821
|
+
const isLastRepetition = event.repetitionIndex >= event.repetitionCount;
|
|
2502
2822
|
const isNonTty = !process.stdout.isTTY;
|
|
2503
|
-
const skipPrintNonTty = isNonTty && event.
|
|
2823
|
+
const skipPrintNonTty = isNonTty && event.repetitionCount > 1 && !isLastRepetition;
|
|
2504
2824
|
if (isSameTestCase && lastPrintedLineCount > 0 && !skipPrintNonTty) {
|
|
2505
2825
|
cursorUp(lastPrintedLineCount);
|
|
2506
2826
|
}
|
|
@@ -2511,7 +2831,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
|
|
|
2511
2831
|
const lines = [];
|
|
2512
2832
|
const statusSuffix = event.errorMessage ? ` ${colorize("ERROR", `${ansi2.bold}${ansi2.red}`)}` : "";
|
|
2513
2833
|
lines.push(
|
|
2514
|
-
`${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.
|
|
2834
|
+
`${pfx}${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.repetitionIndex}/${event.repetitionCount})`, ansi2.cyan)} ${colorize(`(${durationMs}ms)`, ansi2.dim)}${statusSuffix}`
|
|
2515
2835
|
);
|
|
2516
2836
|
if (event.errorMessage) {
|
|
2517
2837
|
lines.push(colorize(event.errorMessage, ansi2.red));
|
|
@@ -2542,64 +2862,102 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
|
|
|
2542
2862
|
}
|
|
2543
2863
|
}
|
|
2544
2864
|
if (!skipPrintNonTty) {
|
|
2545
|
-
for (let i = 0; i < lines.length; i
|
|
2865
|
+
for (let i = 0; i < lines.length; i += 1) {
|
|
2546
2866
|
process.stdout.write(`\r\x1B[2K${lines[i]}
|
|
2547
2867
|
`);
|
|
2548
2868
|
}
|
|
2549
|
-
lastPrintedTestCaseId =
|
|
2869
|
+
lastPrintedTestCaseId = compositeId;
|
|
2550
2870
|
lastPrintedLineCount = lines.length;
|
|
2551
2871
|
}
|
|
2552
2872
|
drawSpinner();
|
|
2553
2873
|
}
|
|
2554
|
-
if (event.type === "
|
|
2874
|
+
if (event.type === "RunFailed") {
|
|
2875
|
+
if (batchReady && !batchPendingRunIds.has(event.runId)) {
|
|
2876
|
+
return;
|
|
2877
|
+
}
|
|
2555
2878
|
runFinished = true;
|
|
2556
2879
|
clearLine();
|
|
2557
2880
|
unsubscribe();
|
|
2558
|
-
|
|
2881
|
+
reject(new Error(`Run failed: ${event.errorMessage}`));
|
|
2882
|
+
return;
|
|
2883
|
+
}
|
|
2884
|
+
if (event.type === "RunCompleted") {
|
|
2885
|
+
if (!batchPendingRunIds.has(event.runId)) {
|
|
2886
|
+
return;
|
|
2887
|
+
}
|
|
2888
|
+
completedRuns.set(event.runId, event);
|
|
2889
|
+
batchPendingRunIds.delete(event.runId);
|
|
2890
|
+
if (batchPendingRunIds.size === 0) {
|
|
2891
|
+
runFinished = true;
|
|
2892
|
+
clearLine();
|
|
2893
|
+
unsubscribe();
|
|
2894
|
+
resolve5();
|
|
2895
|
+
}
|
|
2559
2896
|
}
|
|
2560
2897
|
});
|
|
2561
2898
|
});
|
|
2562
|
-
|
|
2563
|
-
|
|
2564
|
-
|
|
2565
|
-
|
|
2899
|
+
console.log(colorize("=== Eval Run Started (RunConfigs) ===", `${ansi2.bold}${ansi2.cyan}`));
|
|
2900
|
+
for (const name of runConfigNames) {
|
|
2901
|
+
const collected = await runner.resolveRunConfigByName(name);
|
|
2902
|
+
const label = collected?.runConfig.getDisplayLabel() ?? name;
|
|
2903
|
+
console.log(`RunConfig: ${colorize(label, ansi2.bold)}`);
|
|
2904
|
+
}
|
|
2905
|
+
console.log(`Jobs: ${colorize(String(jobs.length), ansi2.bold)}`);
|
|
2906
|
+
console.log(`Shared concurrency: ${colorize(String(concurrency), ansi2.bold)}`);
|
|
2907
|
+
console.log("");
|
|
2908
|
+
const snapshots = await runner.runDatasetJobsWithSharedConcurrency({
|
|
2909
|
+
jobs,
|
|
2910
|
+
globalConcurrency: concurrency
|
|
2566
2911
|
});
|
|
2567
|
-
|
|
2568
|
-
|
|
2569
|
-
|
|
2570
|
-
|
|
2571
|
-
|
|
2572
|
-
|
|
2573
|
-
|
|
2574
|
-
|
|
2912
|
+
for (let i = 0; i < snapshots.length; i += 1) {
|
|
2913
|
+
const snap = snapshots[i];
|
|
2914
|
+
const job = jobs[i];
|
|
2915
|
+
if (snap && job) {
|
|
2916
|
+
runIdToLabel.set(
|
|
2917
|
+
snap.runId,
|
|
2918
|
+
`${job.runConfigDisplayLabel ?? job.runConfigName} \xB7 ${snap.datasetName}`
|
|
2919
|
+
);
|
|
2920
|
+
batchPendingRunIds.add(snap.runId);
|
|
2921
|
+
}
|
|
2922
|
+
}
|
|
2923
|
+
totalCount = snapshots.reduce((sum, s) => sum + s.totalTestCases, 0);
|
|
2924
|
+
console.log(`Total evaluation units: ${colorize(String(totalCount), ansi2.bold)}`);
|
|
2575
2925
|
console.log("");
|
|
2926
|
+
batchReady = true;
|
|
2576
2927
|
drawSpinner();
|
|
2577
2928
|
spinnerTimer = setInterval(drawSpinner, 100);
|
|
2578
|
-
|
|
2929
|
+
await done;
|
|
2579
2930
|
if (spinnerTimer) {
|
|
2580
2931
|
clearInterval(spinnerTimer);
|
|
2581
2932
|
}
|
|
2582
|
-
if (finalEvent.type === "RunFailed") {
|
|
2583
|
-
throw new Error(`Run failed: ${finalEvent.errorMessage}`);
|
|
2584
|
-
}
|
|
2585
|
-
const completed = finalEvent;
|
|
2586
2933
|
console.log("");
|
|
2587
|
-
console.log(colorize("=== Run Summary ===", `${ansi2.bold}${ansi2.cyan}`));
|
|
2588
|
-
|
|
2589
|
-
|
|
2590
|
-
|
|
2591
|
-
|
|
2592
|
-
|
|
2593
|
-
|
|
2594
|
-
|
|
2595
|
-
)
|
|
2596
|
-
|
|
2934
|
+
console.log(colorize("=== Run Summary (all jobs) ===", `${ansi2.bold}${ansi2.cyan}`));
|
|
2935
|
+
for (const snap of snapshots) {
|
|
2936
|
+
const completed = completedRuns.get(snap.runId);
|
|
2937
|
+
if (!completed) {
|
|
2938
|
+
continue;
|
|
2939
|
+
}
|
|
2940
|
+
const label = runIdToLabel.get(snap.runId) ?? snap.runId;
|
|
2941
|
+
console.log("");
|
|
2942
|
+
console.log(colorize(`\u2014 ${label}`, ansi2.magenta));
|
|
2943
|
+
console.log(
|
|
2944
|
+
`- passed: ${colorize(`${completed.passedTestCases}/${completed.totalTestCases}`, ansi2.green)}`
|
|
2945
|
+
);
|
|
2946
|
+
console.log(
|
|
2947
|
+
`- failed: ${colorize(
|
|
2948
|
+
`${completed.failedTestCases}/${completed.totalTestCases}`,
|
|
2949
|
+
completed.failedTestCases > 0 ? ansi2.red : ansi2.dim
|
|
2950
|
+
)}`
|
|
2951
|
+
);
|
|
2952
|
+
console.log(`- artifact: ${colorize(completed.artifactPath, ansi2.dim)}`);
|
|
2953
|
+
}
|
|
2597
2954
|
if (overallScoreCount > 0) {
|
|
2598
2955
|
const overallAverage = overallScoreTotal / overallScoreCount;
|
|
2599
2956
|
const overallSd = sampleStdDev2(overallScoreTotal, overallScoreSumSq, overallScoreCount);
|
|
2600
2957
|
const avgStr = overallSd !== void 0 ? `${overallAverage.toFixed(2)} \xB1 ${overallSd.toFixed(2)}` : overallAverage.toFixed(2);
|
|
2958
|
+
console.log("");
|
|
2601
2959
|
console.log(
|
|
2602
|
-
`- overall avg score: ${colorize(
|
|
2960
|
+
`- overall avg score (all jobs): ${colorize(
|
|
2603
2961
|
avgStr,
|
|
2604
2962
|
scoreToColor(overallAverage)
|
|
2605
2963
|
)} ${colorize(createBar2(overallAverage), ansi2.dim)}`
|
|
@@ -2640,22 +2998,28 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
|
|
|
2640
2998
|
);
|
|
2641
2999
|
}
|
|
2642
3000
|
}
|
|
2643
|
-
|
|
3001
|
+
let failedTestCasesTotal = 0;
|
|
3002
|
+
for (const snap of snapshots) {
|
|
3003
|
+
const completed = completedRuns.get(snap.runId);
|
|
3004
|
+
if (completed) {
|
|
3005
|
+
failedTestCasesTotal += completed.failedTestCases;
|
|
3006
|
+
}
|
|
3007
|
+
}
|
|
3008
|
+
return failedTestCasesTotal > 0 ? 1 : 0;
|
|
2644
3009
|
}
|
|
2645
|
-
async function
|
|
3010
|
+
async function runSimpleEvalRunConfigsInk(runner, runConfigNames, concurrency) {
|
|
2646
3011
|
return new Promise((resolve5, reject) => {
|
|
2647
3012
|
const app = render(
|
|
2648
3013
|
React.createElement(RunView, {
|
|
2649
3014
|
runner,
|
|
2650
|
-
|
|
2651
|
-
evaluatorPattern,
|
|
3015
|
+
runConfigNames,
|
|
2652
3016
|
concurrency,
|
|
2653
|
-
onComplete: (err) => {
|
|
3017
|
+
onComplete: (err, exitCode) => {
|
|
2654
3018
|
app.unmount();
|
|
2655
3019
|
if (err) {
|
|
2656
3020
|
reject(err);
|
|
2657
3021
|
} else {
|
|
2658
|
-
resolve5();
|
|
3022
|
+
resolve5(exitCode ?? 0);
|
|
2659
3023
|
}
|
|
2660
3024
|
}
|
|
2661
3025
|
})
|
|
@@ -2681,12 +3045,22 @@ async function main() {
|
|
|
2681
3045
|
if (!args.command) {
|
|
2682
3046
|
printUsageAndExit(1);
|
|
2683
3047
|
}
|
|
2684
|
-
if (
|
|
2685
|
-
|
|
2686
|
-
|
|
3048
|
+
if (args.command === "run") {
|
|
3049
|
+
if (args.runConfigNames.length === 0) {
|
|
3050
|
+
console.error(
|
|
3051
|
+
"Missing required --run-config <name> (repeat the flag to queue multiple RunConfigs)."
|
|
3052
|
+
);
|
|
3053
|
+
printUsageAndExit(1);
|
|
3054
|
+
}
|
|
3055
|
+
if (args.datasetName !== void 0) {
|
|
3056
|
+
console.error(
|
|
3057
|
+
"The run command no longer accepts --dataset; use --run-config <RunConfig name>."
|
|
3058
|
+
);
|
|
3059
|
+
printUsageAndExit(1);
|
|
3060
|
+
}
|
|
2687
3061
|
}
|
|
2688
|
-
if (args.command === "
|
|
2689
|
-
console.error("
|
|
3062
|
+
if (args.command === "generate" && args.runConfigNames.length > 0) {
|
|
3063
|
+
console.error("generate does not accept --run-config.");
|
|
2690
3064
|
printUsageAndExit(1);
|
|
2691
3065
|
}
|
|
2692
3066
|
const useInk = process.stdout.isTTY === true;
|
|
@@ -2697,17 +3071,24 @@ async function main() {
|
|
|
2697
3071
|
try {
|
|
2698
3072
|
if (args.command === "run") {
|
|
2699
3073
|
const concurrency = args.concurrency ?? getDefaultConcurrency();
|
|
2700
|
-
await (useInk ?
|
|
3074
|
+
const exitCode = await (useInk ? runSimpleEvalRunConfigsInk : runSimpleEvalRunConfigsPlain)(
|
|
2701
3075
|
runner,
|
|
2702
|
-
args.
|
|
2703
|
-
args.evaluatorPattern,
|
|
3076
|
+
args.runConfigNames,
|
|
2704
3077
|
concurrency
|
|
2705
3078
|
);
|
|
3079
|
+
if (args.ci && exitCode !== 0) {
|
|
3080
|
+
process.exit(1);
|
|
3081
|
+
}
|
|
2706
3082
|
return;
|
|
2707
3083
|
}
|
|
3084
|
+
const genDataset = args.datasetName;
|
|
3085
|
+
if (!genDataset) {
|
|
3086
|
+
console.error("Missing required --dataset <datasetName> argument.");
|
|
3087
|
+
printUsageAndExit(1);
|
|
3088
|
+
}
|
|
2708
3089
|
await (useInk ? generateDatasetJsonCommandInk : generateDatasetJsonCommandPlain)(
|
|
2709
3090
|
runner,
|
|
2710
|
-
|
|
3091
|
+
genDataset
|
|
2711
3092
|
);
|
|
2712
3093
|
} finally {
|
|
2713
3094
|
await runner.shutdown();
|