@m4trix/evals 0.25.1 → 0.27.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +32 -9
- package/dist/cli-simple.cjs +845 -455
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +846 -456
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +543 -273
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +543 -273
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +948 -545
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +228 -14
- package/dist/index.js +933 -547
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -1,14 +1,257 @@
|
|
|
1
|
-
import { Effect, PubSub, Queue, Ref, Fiber } from 'effect';
|
|
1
|
+
import { Schema, Effect, PubSub, Queue, Ref, Fiber, Either, ParseResult } from 'effect';
|
|
2
2
|
export { Schema as S } from 'effect';
|
|
3
3
|
import { diffLines } from 'diff';
|
|
4
4
|
import stringify from 'fast-json-stable-stringify';
|
|
5
5
|
import { randomUUID } from 'crypto';
|
|
6
|
+
import { readdir, readFile, mkdir, appendFile } from 'fs/promises';
|
|
7
|
+
import { resolve as resolve$1, join, relative, dirname } from 'path';
|
|
6
8
|
import { existsSync } from 'fs';
|
|
7
|
-
import { resolve as resolve$1, relative, join, dirname } from 'path';
|
|
8
9
|
import * as jitiModule from 'jiti';
|
|
9
|
-
import { readdir, readFile, mkdir, appendFile } from 'fs/promises';
|
|
10
10
|
import { pathToFileURL } from 'url';
|
|
11
11
|
|
|
12
|
+
// src/index.ts
|
|
13
|
+
// Characters allowed in an entity id: ASCII letters, digits, underscores and hyphens.
var ENTITY_ID_PATTERN = /^[a-zA-Z0-9_-]+$/;

/**
 * Builds the branded string schema used for entity ids.
 *
 * The schema is `Schema.String` piped through `Schema.trimmed()`, a minimum
 * length of 1, the {@link ENTITY_ID_PATTERN} check, and finally `Schema.brand`.
 *
 * @param brand - Brand name attached to the resulting schema.
 * @param label - Human-readable entity name used at the start of both error messages.
 * @returns The composed schema.
 */
function makeEntityIdSchema(brand, label) {
  const trimmedFilter = Schema.trimmed();
  const nonEmptyFilter = Schema.minLength(1, {
    message: () => `${label} must be non-empty.`
  });
  const idShapeFilter = Schema.pattern(ENTITY_ID_PATTERN, {
    message: () => `${label} may only contain letters, digits, underscores, and hyphens (no spaces). Examples: "my-nightly", "my_nightly", "myNightly".`
  });
  return Schema.String.pipe(trimmedFilter, nonEmptyFilter, idShapeFilter, Schema.brand(brand));
}
|
|
26
|
+
// One branded id schema per entity kind, built from [brand, label] pairs.
// The label is the prefix of the validation messages produced by makeEntityIdSchema.
var [RunConfigNameSchema, EvaluatorNameSchema, TestCaseNameSchema, DatasetNameSchema] = [
  ["RunConfigName", "RunConfig name"],
  ["EvaluatorName", "Evaluator name"],
  ["TestCaseName", "Test case name"],
  ["DatasetName", "Dataset name"]
].map(([brand, label]) => makeEntityIdSchema(brand, label));
|
|
30
|
+
/**
 * Decodes `raw` against `schema` and returns the decoded value.
 *
 * String input is trimmed before decoding. Any other input is handed to the
 * decoder unchanged, so a missing or mistyped value is reported through the
 * contextual error below rather than as a bare `TypeError` from `.trim()`.
 *
 * @param schema - Schema to decode against.
 * @param raw - Candidate value, normally a string.
 * @param context - Prefix for the error message (e.g. `"Dataset.define"`).
 * @returns The decoded value.
 * @throws {Error} `"<context>: <formatted parse error>"` when decoding fails.
 */
function validateWithSchema(schema, raw, context) {
  const candidate = typeof raw === "string" ? raw.trim() : raw;
  const decode = Schema.decodeUnknownEither(schema);
  const result = decode(candidate);
  if (Either.isLeft(result)) {
    throw new Error(`${context}: ${ParseResult.TreeFormatter.formatErrorSync(result.left)}`);
  }
  return result.right;
}
|
|
41
|
+
/**
 * Validates a RunConfig name against `RunConfigNameSchema`.
 *
 * @param raw - Candidate name.
 * @param context - Passed through to `validateWithSchema` for its error message.
 * @returns Whatever `validateWithSchema` returns for this schema.
 */
function validateRunConfigName(raw, context) {
  const validated = validateWithSchema(RunConfigNameSchema, raw, context);
  return validated;
}
|
|
44
|
+
/**
 * Validates an evaluator name against `EvaluatorNameSchema`.
 *
 * @param raw - Candidate name.
 * @param context - Passed through to `validateWithSchema` for its error message.
 * @returns Whatever `validateWithSchema` returns for this schema.
 */
function validateEvaluatorName(raw, context) {
  const validated = validateWithSchema(EvaluatorNameSchema, raw, context);
  return validated;
}
|
|
47
|
+
/**
 * Validates a test case name against `TestCaseNameSchema`.
 *
 * @param raw - Candidate name.
 * @param context - Passed through to `validateWithSchema` for its error message.
 * @returns Whatever `validateWithSchema` returns for this schema.
 */
function validateTestCaseName(raw, context) {
  const validated = validateWithSchema(TestCaseNameSchema, raw, context);
  return validated;
}
|
|
50
|
+
/**
 * Validates a dataset name against `DatasetNameSchema`.
 *
 * @param raw - Candidate name.
 * @param context - Passed through to `validateWithSchema` for its error message.
 * @returns Whatever `validateWithSchema` returns for this schema.
 */
function validateDatasetName(raw, context) {
  const validated = validateWithSchema(DatasetNameSchema, raw, context);
  return validated;
}
|
|
53
|
+
/**
 * Normalizes an optional display name.
 *
 * @param raw - Display name as supplied by the caller; may be `undefined` or `null`.
 * @returns The trimmed name, or `undefined` when the input is absent
 *   (`undefined`/`null`) or contains only whitespace.
 */
function normalizeOptionalDisplayName(raw) {
  // `== null` covers both undefined and null, so an explicit null no longer
  // crashes on `.trim()`.
  if (raw == null) {
    return void 0;
  }
  const trimmed = raw.trim();
  return trimmed.length === 0 ? void 0 : trimmed;
}
|
|
60
|
+
|
|
61
|
+
// src/evals/dataset.ts
|
|
62
|
+
/**
 * Tells whether `value` satisfies at least one matcher.
 *
 * @param value - String to test.
 * @param matchers - Matchers; a string must equal `value` exactly, a RegExp must match it.
 * @returns `true` if any matcher accepts `value`, otherwise `false`.
 */
function matchesAny(value, matchers) {
  return matchers.some((matcher) => {
    if (typeof matcher === "string") {
      return value === matcher;
    }
    // A global/sticky RegExp resumes from `lastIndex`, which would make
    // repeated calls with the same matcher alternate between true and false.
    matcher.lastIndex = 0;
    return matcher.test(value);
  });
}
|
|
67
|
+
/**
 * Tells whether `filePath` satisfies at least one path matcher.
 *
 * @param filePath - Path to test.
 * @param matchers - Matchers; a string is treated as a glob and passed to
 *   `simpleGlobMatch`, a RegExp is tested directly against the path.
 * @returns `true` if any matcher accepts `filePath`, otherwise `false`.
 */
function matchesAnyPath(filePath, matchers) {
  return matchers.some((matcher) => {
    if (typeof matcher === "string") {
      return simpleGlobMatch(matcher, filePath);
    }
    // A global/sticky RegExp resumes from `lastIndex`, which would make
    // repeated calls with the same matcher alternate between true and false.
    matcher.lastIndex = 0;
    return matcher.test(filePath);
  });
}
|
|
75
|
+
/**
 * Matches `value` against a minimal glob `pattern`.
 *
 * Supported tokens:
 * - `**` + `/` — zero or more leading directory segments
 * - `**`      — any characters, including `/`
 * - `*`       — any run of characters other than `/`
 * - `?`       — exactly one character other than `/`
 * Every other character matches itself.
 *
 * @param pattern - Glob pattern.
 * @param value - Path to test; the whole string must match.
 * @returns `true` when `value` matches `pattern`.
 */
function simpleGlobMatch(pattern, value) {
  // Translate in a single pass. Chaining replaces would let the later `*` rule
  // rewrite the `*` inside the regex already emitted for `**`, which stops
  // `**` from spanning more than one directory.
  const source = pattern.replace(/\*\*\/|\*\*|\*|\?|[.+^${}()|[\]\\]/g, (token) => {
    switch (token) {
      case "**/":
        return "(?:.*/)?";
      case "**":
        return ".*";
      case "*":
        return "[^/]*";
      case "?":
        return "[^/]";
      default:
        // Regex metacharacter that must match literally.
        return `\\${token}`;
    }
  });
  return new RegExp(`^${source}$`).test(value);
}
|
|
79
|
+
/**
 * A named selection of test cases, filtered by tag and by file path.
 * Instances are normally created through {@link Dataset.define}.
 */
var Dataset = class _Dataset {
  constructor(config) {
    this._config = config;
  }

  /**
   * Validates `config.name`, normalizes `config.displayName`, and defaults
   * every missing filter list to an empty array.
   */
  static define(config) {
    const orEmpty = (list) => list ?? [];
    return new _Dataset({
      name: validateDatasetName(config.name, "Dataset.define"),
      displayName: normalizeOptionalDisplayName(config.displayName),
      includedTags: orEmpty(config.includedTags),
      excludedTags: orEmpty(config.excludedTags),
      includedPaths: orEmpty(config.includedPaths),
      excludedPaths: orEmpty(config.excludedPaths)
    });
  }

  /** Validated dataset id. */
  getName() {
    const { name } = this._config;
    return name;
  }

  /** Optional display name; `undefined` when none was configured. */
  getDisplayName() {
    const { displayName } = this._config;
    return displayName;
  }

  /** Display name when set, otherwise the dataset id. */
  getDisplayLabel() {
    const { displayName, name } = this._config;
    return displayName ?? name;
  }

  getIncludedTags() {
    const { includedTags } = this._config;
    return includedTags;
  }

  getExcludedTags() {
    const { excludedTags } = this._config;
    return excludedTags;
  }

  getIncludedPaths() {
    const { includedPaths } = this._config;
    return includedPaths;
  }

  getExcludedPaths() {
    const { excludedPaths } = this._config;
    return excludedPaths;
  }

  /**
   * Decides whether a test case belongs to this dataset.
   *
   * Exclusions win: a hit on any excluded tag or excluded path rejects the
   * test case. Otherwise each non-empty include list must produce a hit;
   * an empty include list places no restriction.
   *
   * @param testCase - Object exposing `getTags()`.
   * @param filePath - Path of the file the test case came from.
   */
  matchesTestCase(testCase, filePath) {
    const tags = testCase.getTags();
    const { includedTags, excludedTags, includedPaths, excludedPaths } = this._config;
    const anyTagIn = (matchers) => tags.some((tag) => matchesAny(tag, matchers));

    if (excludedTags.length > 0 && anyTagIn(excludedTags)) {
      return false;
    }
    if (excludedPaths.length > 0 && matchesAnyPath(filePath, excludedPaths)) {
      return false;
    }
    const tagsAccepted = includedTags.length === 0 || anyTagIn(includedTags);
    const pathAccepted = includedPaths.length === 0 || matchesAnyPath(filePath, includedPaths);
    return tagsAccepted && pathAccepted;
  }
};
|
|
135
|
+
/**
 * Duck-typed label lookup for a dataset-like object.
 *
 * @param dataset - Object that may expose `getDisplayLabel()` and/or `getName()`.
 * @returns `getDisplayLabel()` when available, else `getName()` when available, else `""`.
 */
function getDatasetDisplayLabel(dataset) {
  const supports = (method) => typeof dataset[method] === "function";
  if (supports("getDisplayLabel")) {
    return dataset.getDisplayLabel();
  }
  if (supports("getName")) {
    return dataset.getName();
  }
  return "";
}
|
|
141
|
+
|
|
142
|
+
// src/evals/evaluator.ts
|
|
143
|
+
/**
 * Immutable evaluator builder: `use`, `define` and `evaluate` each return a
 * new instance instead of modifying the receiver.
 */
var Evaluator = class _Evaluator {
  constructor(config) {
    this._config = config;
  }

  /** Snapshot of every configuration field (unset fields are `undefined`). */
  getState() {
    const {
      name,
      displayName,
      tags,
      inputSchema,
      outputSchema,
      scoreSchema,
      middlewares,
      evaluateFn,
      passThreshold,
      passCriterion
    } = this._config;
    return {
      name,
      displayName,
      tags,
      inputSchema,
      outputSchema,
      scoreSchema,
      middlewares,
      evaluateFn,
      passThreshold,
      passCriterion
    };
  }

  /** Starts a chain holding a single middleware and no tags. */
  static use(middleware) {
    return new _Evaluator({ middlewares: [middleware], tags: [] });
  }

  /** Returns a copy of this evaluator with `middleware` appended. */
  use(middleware) {
    const current = this.getState();
    const middlewares = [...current.middlewares, middleware];
    return new _Evaluator({ ...current, middlewares });
  }

  /**
   * Returns a new evaluator described by `config`, keeping only this
   * instance's middlewares. The name is validated, the display name is
   * normalized, and the tags are copied.
   */
  define(config) {
    const { middlewares } = this.getState();
    const name = validateEvaluatorName(config.name, "Evaluator.define");
    const displayName = normalizeOptionalDisplayName(config.displayName);
    const tags = config.tags === void 0 ? [] : [...config.tags];
    const { inputSchema, outputSchema, scoreSchema, passThreshold, passCriterion } = config;
    return new _Evaluator({
      name,
      displayName,
      tags,
      inputSchema,
      outputSchema,
      scoreSchema,
      middlewares,
      passThreshold,
      passCriterion
    });
  }

  /** Returns a copy of this evaluator with `fn` as its evaluate function. */
  evaluate(fn) {
    const current = this.getState();
    return new _Evaluator({ ...current, evaluateFn: fn });
  }

  /** Evaluator id, or `undefined` when no name has been set on this instance. */
  getName() {
    const { name } = this._config;
    return name;
  }

  getDisplayName() {
    const { displayName } = this._config;
    return displayName;
  }

  /** Display name if set, otherwise the id; `undefined` when there is no id. */
  getDisplayLabel() {
    const { name, displayName } = this._config;
    return name === void 0 ? void 0 : displayName ?? name;
  }

  /** Copy of the configured tags. */
  getTags() {
    const { tags } = this._config;
    return [...tags];
  }

  getInputSchema() {
    const { inputSchema } = this._config;
    return inputSchema;
  }

  getOutputSchema() {
    const { outputSchema } = this._config;
    return outputSchema;
  }

  getScoreSchema() {
    const { scoreSchema } = this._config;
    return scoreSchema;
  }

  getMiddlewares() {
    const { middlewares } = this._config;
    return middlewares;
  }

  getEvaluateFn() {
    const { evaluateFn } = this._config;
    return evaluateFn;
  }

  getPassThreshold() {
    const { passThreshold } = this._config;
    return passThreshold;
  }

  getPassCriterion() {
    const { passCriterion } = this._config;
    return passCriterion;
  }

  /**
   * Calls `resolve()` on every middleware, awaits them all together, and
   * merges the results into one object; later middlewares win on key clashes.
   */
  async resolveContext() {
    const pending = this._config.middlewares.map((mw) => mw.resolve());
    const resolved = await Promise.all(pending);
    return resolved.reduce((merged, part) => Object.assign(merged, part), {});
  }
};
|
|
242
|
+
/**
 * Duck-typed label lookup for an evaluator-like object.
 *
 * @param evaluator - Object that may expose `getDisplayLabel()` and/or `getName()`.
 * @returns The display label when it is defined, otherwise `getName()` when
 *   that method exists, otherwise `undefined`.
 */
function getEvaluatorDisplayLabel(evaluator) {
  const label = typeof evaluator.getDisplayLabel === "function" ? evaluator.getDisplayLabel() : void 0;
  if (label !== void 0) {
    return label;
  }
  if (typeof evaluator.getName !== "function") {
    return void 0;
  }
  return evaluator.getName();
}
|
|
251
|
+
/**
 * Returns a fresh array of an evaluator's tags.
 *
 * @param evaluator - Object that may expose `getTags()`.
 * @returns A copy of `getTags()`'s result, or `[]` when the method is missing.
 */
function getEvaluatorTagList(evaluator) {
  if (typeof evaluator.getTags !== "function") {
    return [];
  }
  const tags = evaluator.getTags();
  return [...tags];
}
|
|
254
|
+
|
|
12
255
|
// src/cli/data.mock.json
|
|
13
256
|
var data_mock_default = {
|
|
14
257
|
datasets: [
|
|
@@ -255,7 +498,7 @@ function toEvalDataset(item, snapshots) {
|
|
|
255
498
|
const runs = snapshots.filter((snapshot) => snapshot.datasetId === item.id).sort((a, b) => b.queuedAt - a.queuedAt).map(toEvalRun);
|
|
256
499
|
return {
|
|
257
500
|
id: item.id,
|
|
258
|
-
name: item.dataset
|
|
501
|
+
name: getDatasetDisplayLabel(item.dataset),
|
|
259
502
|
overview: `Discovered from ${item.filePath}`,
|
|
260
503
|
runs
|
|
261
504
|
};
|
|
@@ -263,7 +506,7 @@ function toEvalDataset(item, snapshots) {
|
|
|
263
506
|
/**
 * Maps a discovered evaluator record to the option object shown in the CLI.
 *
 * @param item - Record with `id`, `filePath` and `evaluator`.
 * @returns `{ id, name, configPreview }`, where `name` is the evaluator's
 *   display label, or `toSlug(item.id)` when the evaluator has no label.
 */
function toEvaluatorOption(item) {
  const { id, filePath, evaluator } = item;
  const name = getEvaluatorDisplayLabel(evaluator) ?? toSlug(id);
  return {
    id,
    name,
    configPreview: `Source: ${filePath}`
  };
}
|
|
@@ -308,196 +551,149 @@ function parseStartupArgs(argv) {
|
|
|
308
551
|
}
|
|
309
552
|
return args;
|
|
310
553
|
}
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
this._config = config;
|
|
554
|
+
/**
 * Recursively normalizes a value before it is rendered for diffing.
 *
 * - Arrays: every element is processed; with `options.sort` the processed
 *   elements are then ordered by their stable JSON string.
 * - Plain objects: every entry is processed, minus keys in `options.excludeKeys`.
 * - Numbers: rounded to `options.precision` decimals when that option is set.
 * - Anything else is returned unchanged.
 *
 * Elements of arrays are processed whether or not `sort` is set, so
 * `excludeKeys` and `precision` also reach values nested inside arrays.
 *
 * @param value - Value to normalize.
 * @param options - Optional `{ sort, excludeKeys, precision }`; `excludeKeys`
 *   is an array of keys or a comma-separated string.
 * @returns A normalized copy (primitives are returned as-is).
 */
function preprocessForDiff(value, options) {
  if (Array.isArray(value)) {
    const items = value.map((item) => preprocessForDiff(item, options));
    if (!options?.sort) {
      return items;
    }
    // Compute each sort key once instead of re-stringifying inside the comparator.
    return items
      .map((item) => ({ item, key: stringify(item) ?? "" }))
      .sort((a, b) => a.key.localeCompare(b.key))
      .map(({ item }) => item);
  }
  if (value !== null && typeof value === "object") {
    const rawExcluded = options?.excludeKeys;
    let excluded = [];
    if (rawExcluded) {
      excluded = Array.isArray(rawExcluded) ? rawExcluded : rawExcluded.split(",").map((k) => k.trim());
    }
    const result = {};
    for (const [k, v] of Object.entries(value)) {
      if (!excluded.includes(k)) {
        result[k] = preprocessForDiff(v, options);
      }
    }
    return result;
  }
  if (typeof value === "number" && options?.precision !== void 0) {
    return Number(value.toFixed(options.precision));
  }
  return value;
}
|
|
584
|
+
/**
 * Renders `value` as 2-space-indented JSON, going through `stringify` first
 * and re-parsing its output.
 *
 * @param value - Value to render.
 * @returns The indented JSON, or the raw `stringify` output when that output
 *   cannot be parsed back as JSON.
 */
function toPrettyJson(value) {
  const compact = stringify(value);
  try {
    return JSON.stringify(JSON.parse(compact), null, 2);
  } catch {
    return compact;
  }
}
|
|
593
|
+
/**
 * Flattens diff parts into one newline-joined string.
 *
 * Lines from added parts are prefixed with `"+ "`, lines from removed parts
 * with `"- "`, and unchanged lines get no prefix. The empty segment left by a
 * part's trailing newline is dropped.
 *
 * @param parts - Objects with a string `value` and optional `added` / `removed` flags.
 * @returns The formatted diff text.
 */
function formatDiffParts(parts) {
  const rendered = parts.flatMap((part) => {
    let marker = "";
    if (part.added) {
      marker = "+ ";
    } else if (part.removed) {
      marker = "- ";
    }
    const segments = part.value.split("\n");
    if (segments[segments.length - 1] === "") {
      segments.pop();
    }
    return segments.map((segment) => marker + segment);
  });
  return rendered.join("\n");
}
|
|
607
|
+
/**
 * Produces a line diff between two values.
 *
 * Both values are first normalized with `preprocessForDiff`. With
 * `diffOptions.keysOnly` the diff is computed on the key structure
 * (`extractKeys`) only. Otherwise the pretty-printed JSON of both values is
 * compared; identical output yields `""`. With `diffOptions.outputNewOnly`
 * only added parts are kept.
 *
 * @param expected - Left-hand value.
 * @param actual - Right-hand value.
 * @param diffOptions - Optional diff options, also forwarded to `preprocessForDiff`.
 * @returns The diff text as formatted by `formatDiffParts`.
 */
function createDiffString(expected, actual, diffOptions) {
  const left = preprocessForDiff(expected, diffOptions);
  const right = preprocessForDiff(actual, diffOptions);

  if (diffOptions?.keysOnly) {
    const keyShape = (processed) => JSON.stringify(extractKeys(processed), null, 2);
    return formatDiffParts(diffLines(keyShape(left), keyShape(right)));
  }

  const leftJson = toPrettyJson(left);
  const rightJson = toPrettyJson(right);
  if (leftJson === rightJson) {
    return "";
  }
  const changes = diffLines(leftJson, rightJson);
  const visible = diffOptions?.outputNewOnly ? changes.filter((change) => change.added === true) : changes;
  return formatDiffParts(visible);
}
|
|
628
|
+
/**
 * Reduces a value to its key structure: every leaf (anything that is `null`
 * or not an object) becomes the placeholder `"\xB7"`, while arrays and
 * objects keep their shape.
 *
 * @param value - Value to reduce.
 * @returns The placeholder, an array of reduced elements, or an object of reduced entries.
 */
function extractKeys(value) {
  const isContainer = value !== null && typeof value === "object";
  if (!isContainer) {
    return "\xB7";
  }
  if (Array.isArray(value)) {
    return value.map(extractKeys);
  }
  const shape = {};
  Object.entries(value).forEach(([key, child]) => {
    shape[key] = extractKeys(child);
  });
  return shape;
}
|
|
444
|
-
function
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
641
|
+
/**
 * Converts an arbitrary log payload to text.
 *
 * @param msg - Payload to format.
 * @returns The string itself for strings; `stack` (or `message` when there is
 *   no stack) for errors; 2-space-indented JSON for other objects, falling
 *   back to `String(msg)` when serialization throws; `String(msg)` for
 *   everything else.
 */
function formatLogMessage(msg) {
  if (typeof msg === "string") {
    return msg;
  }
  if (msg instanceof Error) {
    return msg.stack ?? msg.message;
  }
  const isObject = msg !== null && typeof msg === "object";
  if (!isObject) {
    return String(msg);
  }
  try {
    return JSON.stringify(msg, null, 2);
  } catch {
    return String(msg);
  }
}
|
|
452
|
-
function
|
|
453
|
-
|
|
454
|
-
|
|
655
|
+
/**
 * Builds a `"log"` entry.
 *
 * @param message - Payload; converted to text with `formatLogMessage`.
 * @param options - Optional `{ label }`.
 * @returns `{ type: "log", label, message }`.
 */
function createLogEntry(message, options) {
  const label = options?.label;
  const text = formatLogMessage(message);
  return { type: "log", label, message: text };
}
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
}
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
}
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
getIncludedPaths() {
|
|
479
|
-
return this._config.includedPaths;
|
|
480
|
-
}
|
|
481
|
-
getExcludedPaths() {
|
|
482
|
-
return this._config.excludedPaths;
|
|
483
|
-
}
|
|
484
|
-
matchesTestCase(testCase, filePath) {
|
|
485
|
-
const tags = testCase.getTags();
|
|
486
|
-
if (this._config.excludedTags.length > 0) {
|
|
487
|
-
if (tags.some((tag) => matchesAny(tag, this._config.excludedTags))) {
|
|
488
|
-
return false;
|
|
662
|
+
/**
 * Splits a log entry's message into its individual lines.
 *
 * @param entry - Entry with a string `message`.
 * @returns The message split on `"\n"`.
 */
function getLogLines(entry) {
  const { message } = entry;
  return message.split("\n");
}
|
|
665
|
+
/**
 * Builds a `"diff"` entry comparing `expected` with `actual`.
 *
 * @param expected - Left-hand value, stored on the entry as-is.
 * @param actual - Right-hand value, stored on the entry as-is.
 * @param options - Optional `{ label, ...diffOptions }`; everything except
 *   `label` is forwarded to `createDiffString`.
 * @returns `{ type: "diff", label, expected, actual, diff }`, where `diff` is
 *   `"(no differences)"` when the computed diff is empty.
 */
function createDiffLogEntry(expected, actual, options) {
  const { label, ...diffOpts } = options ?? {};
  const rendered = createDiffString(expected, actual, diffOpts);
  return {
    type: "diff",
    label,
    expected,
    actual,
    diff: rendered || "(no differences)"
  };
}
|
|
676
|
+
/**
 * Prints the diff between `expected` and `actual` to the console and returns it.
 *
 * @param expected - Left-hand value.
 * @param actual - Right-hand value.
 * @param options - `{ color = true, ...diffOptions }`; everything except
 *   `color` is forwarded to `createDiffString`.
 * @returns The printed diff (with ANSI colour codes when `color` is on). An
 *   empty diff prints `"(no differences)"` but still returns `""`.
 */
function printJsonDiff(expected, actual, options = {}) {
  const { color = true, ...diffOpts } = options;
  const diff = createDiffString(expected, actual, diffOpts);
  if (!color) {
    console.log(diff || "(no differences)");
    return diff;
  }
  const colored = diff.split("\n").map((line) => {
    // Diff markers are always "- " / "+ " at column 0. Testing the raw line
    // (not a trimmed copy) keeps unchanged, indented lines that merely start
    // with "-" or "+" (e.g. a negative number inside an array) uncoloured.
    if (line.startsWith("- ")) {
      return `\x1B[31m${line}\x1B[0m`;
    }
    if (line.startsWith("+ ")) {
      return `\x1B[32m${line}\x1B[0m`;
    }
    return line;
  }).join("\n");
  console.log(colored || "(no differences)");
  return colored;
}
|
|
501
697
|
|
|
502
698
|
// src/evals/metric.ts
|
|
503
699
|
var registry = /* @__PURE__ */ new Map();
|
|
@@ -522,6 +718,113 @@ function getMetricById(id) {
|
|
|
522
718
|
return registry.get(id);
|
|
523
719
|
}
|
|
524
720
|
|
|
721
|
+
// src/evals/aggregators.ts
|
|
722
|
+
/**
 * Sums token counts field by field.
 *
 * @param values - Array of token counts; a missing (`null`/`undefined`) field counts as 0.
 * @returns `{ input, output, inputCached, outputCached }` totals (all 0 for an empty array).
 */
function aggregateTokenCountSum(values) {
  let input = 0;
  let output = 0;
  let inputCached = 0;
  let outputCached = 0;
  values.forEach((entry) => {
    input += entry.input ?? 0;
    output += entry.output ?? 0;
    inputCached += entry.inputCached ?? 0;
    outputCached += entry.outputCached ?? 0;
  });
  return { input, output, inputCached, outputCached };
}
|
|
739
|
+
/**
 * Averages latency samples.
 *
 * @param values - Array of `{ ms }` samples.
 * @returns `{ ms }` holding the arithmetic mean, or `{ ms: 0 }` for an empty array.
 */
function aggregateLatencyAverage(values) {
  const count = values.length;
  if (count === 0) {
    return { ms: 0 };
  }
  let total = 0;
  for (const sample of values) {
    total += sample.ms;
  }
  return { ms: total / count };
}
|
|
746
|
+
|
|
747
|
+
// src/evals/metrics/standard.ts
|
|
748
|
+
// Token usage metric: aggregated by summing; rendered as "in:… out:… cached:…",
// where "cached" adds the cached input and cached output counts together.
var tokenCountMetric = Metric.of({
  id: "token-count",
  name: "Tokens",
  aggregate: aggregateTokenCountSum,
  format: (data, options) => {
    const input = data.input ?? 0;
    const output = data.output ?? 0;
    const cached = (data.inputCached ?? 0) + (data.outputCached ?? 0);
    const base = `in:${input} out:${output} cached:${cached}`;
    if (options?.isAggregated) {
      return `Total: ${base}`;
    }
    return base;
  }
});
|
|
762
|
+
// Latency metric: aggregated by averaging; rendered in milliseconds, with an
// "Avg: " prefix when the value is an aggregate.
var latencyMetric = Metric.of({
  id: "latency",
  name: "Latency",
  aggregate: aggregateLatencyAverage,
  format: (data, options) => {
    if (options?.isAggregated) {
      return `Avg: ${data.ms}ms`;
    }
    return `${data.ms}ms`;
  }
});
|
|
768
|
+
|
|
769
|
+
// src/evals/run-config.ts
|
|
770
|
+
/**
 * Validates one entry of `RunConfig` `runs`.
 *
 * A row must set exactly one of `evaluators` (a non-empty list) or
 * `evaluatorPattern` (a non-blank string). `repetitions`, when present, must
 * be a positive integer; it defaults to 1.
 *
 * @param row - Run row to validate.
 * @param index - Position of the row, used in error messages.
 * @throws {Error} When the row breaks any of the rules above.
 */
function validateRow(row, index) {
  // `!= null` rejects both undefined and null, so `evaluators: null` is
  // treated as "not set" instead of crashing on `.length` below.
  const hasEvaluators = "evaluators" in row && row.evaluators != null;
  const hasPattern = "evaluatorPattern" in row && typeof row.evaluatorPattern === "string" && row.evaluatorPattern.trim().length > 0;
  if (hasEvaluators && hasPattern) {
    throw new Error(`RunConfig run[${index}] must not set both evaluators and evaluatorPattern`);
  }
  if (!hasEvaluators && !hasPattern) {
    throw new Error(`RunConfig run[${index}] must set either evaluators or evaluatorPattern`);
  }
  if (hasEvaluators && row.evaluators.length === 0) {
    throw new Error(`RunConfig run[${index}]: evaluators must be non-empty`);
  }
  const rawRep = "repetitions" in row ? row.repetitions : void 0;
  const repetitions = rawRep ?? 1;
  if (!Number.isInteger(repetitions) || repetitions < 1) {
    throw new Error(
      `RunConfig run[${index}]: repetitions must be a positive integer, got ${String(rawRep)}`
    );
  }
}
|
|
790
|
+
/**
 * A named set of run rows. Instances are normally created through
 * {@link RunConfig.define}, which validates the input.
 */
var RunConfig = class _RunConfig {
  constructor(name, displayName, tags, runs) {
    Object.assign(this, {
      _name: name,
      _displayName: displayName,
      _tags: tags,
      _runs: runs
    });
  }

  /**
   * Validates and builds a RunConfig.
   *
   * `runs` must be non-empty and every row must pass `validateRow`. The name
   * is validated, the display name is normalized, and the tags are copied.
   *
   * @throws {Error} When `runs` is empty, a row is invalid, or the name is invalid.
   */
  static define(config) {
    const { runs } = config;
    if (runs.length === 0) {
      throw new Error("RunConfig runs must be non-empty");
    }
    runs.forEach(validateRow);
    const name = validateRunConfigName(config.name, "RunConfig.define");
    const displayName = normalizeOptionalDisplayName(config.displayName);
    const tags = config.tags === void 0 ? [] : [...config.tags];
    return new _RunConfig(name, displayName, tags, runs);
  }

  /** Validated id of this run config. */
  getName() {
    const { _name: name } = this;
    return name;
  }

  /** Optional display name; `undefined` when none was configured. */
  getDisplayName() {
    const { _displayName: displayName } = this;
    return displayName;
  }

  /** Display name when set, otherwise the id. */
  getDisplayLabel() {
    const { _displayName: displayName, _name: name } = this;
    return displayName ?? name;
  }

  /** Copy of the configured tags. */
  getTags() {
    const { _tags: tags } = this;
    return [...tags];
  }

  /** The run rows exactly as they were passed in. */
  getRuns() {
    const { _runs: runs } = this;
    return runs;
  }
};
|
|
827
|
+
|
|
525
828
|
// src/evals/score.ts
|
|
526
829
|
var registry2 = /* @__PURE__ */ new Map();
|
|
527
830
|
function formatScoreData(def, data, options) {
|
|
@@ -612,71 +915,23 @@ var Score = {
|
|
|
612
915
|
aggregateValues: config.aggregateValues,
|
|
613
916
|
make: (data, options) => {
|
|
614
917
|
const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
|
|
615
|
-
return {
|
|
616
|
-
id: config.id,
|
|
617
|
-
data,
|
|
618
|
-
...passed !== void 0 && { passed },
|
|
619
|
-
...options?.name !== void 0 && { name: options.name },
|
|
620
|
-
def
|
|
621
|
-
// Attach def so rendering/aggregation works without registry lookup
|
|
622
|
-
};
|
|
623
|
-
}
|
|
624
|
-
};
|
|
625
|
-
registry2.set(config.id, def);
|
|
626
|
-
return def;
|
|
627
|
-
}
|
|
628
|
-
};
|
|
629
|
-
function getScoreById(id) {
|
|
630
|
-
return registry2.get(id);
|
|
631
|
-
}
|
|
632
|
-
|
|
633
|
-
// src/evals/aggregators.ts
|
|
634
|
-
function aggregateTokenCountSum(values) {
|
|
635
|
-
const initial = {
|
|
636
|
-
input: 0,
|
|
637
|
-
output: 0,
|
|
638
|
-
inputCached: 0,
|
|
639
|
-
outputCached: 0
|
|
640
|
-
};
|
|
641
|
-
return values.reduce(
|
|
642
|
-
(acc, v) => ({
|
|
643
|
-
input: acc.input + (v.input ?? 0),
|
|
644
|
-
output: acc.output + (v.output ?? 0),
|
|
645
|
-
inputCached: acc.inputCached + (v.inputCached ?? 0),
|
|
646
|
-
outputCached: acc.outputCached + (v.outputCached ?? 0)
|
|
647
|
-
}),
|
|
648
|
-
initial
|
|
649
|
-
);
|
|
650
|
-
}
|
|
651
|
-
function aggregateLatencyAverage(values) {
|
|
652
|
-
if (values.length === 0) {
|
|
653
|
-
return { ms: 0 };
|
|
654
|
-
}
|
|
655
|
-
const sum = values.reduce((s, v) => s + v.ms, 0);
|
|
656
|
-
return { ms: sum / values.length };
|
|
657
|
-
}
|
|
658
|
-
|
|
659
|
-
// src/evals/metrics/standard.ts
|
|
660
|
-
var tokenCountMetric = Metric.of({
|
|
661
|
-
id: "token-count",
|
|
662
|
-
name: "Tokens",
|
|
663
|
-
aggregate: aggregateTokenCountSum,
|
|
664
|
-
format: (data, options) => {
|
|
665
|
-
const input = data.input ?? 0;
|
|
666
|
-
const output = data.output ?? 0;
|
|
667
|
-
const inputCached = data.inputCached ?? 0;
|
|
668
|
-
const outputCached = data.outputCached ?? 0;
|
|
669
|
-
const cached = inputCached + outputCached;
|
|
670
|
-
const base = `in:${input} out:${output} cached:${cached}`;
|
|
671
|
-
return options?.isAggregated ? `Total: ${base}` : base;
|
|
918
|
+
return {
|
|
919
|
+
id: config.id,
|
|
920
|
+
data,
|
|
921
|
+
...passed !== void 0 && { passed },
|
|
922
|
+
...options?.name !== void 0 && { name: options.name },
|
|
923
|
+
def
|
|
924
|
+
// Attach def so rendering/aggregation works without registry lookup
|
|
925
|
+
};
|
|
926
|
+
}
|
|
927
|
+
};
|
|
928
|
+
registry2.set(config.id, def);
|
|
929
|
+
return def;
|
|
672
930
|
}
|
|
673
|
-
}
|
|
674
|
-
|
|
675
|
-
id
|
|
676
|
-
|
|
677
|
-
aggregate: aggregateLatencyAverage,
|
|
678
|
-
format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
|
|
679
|
-
});
|
|
931
|
+
};
|
|
932
|
+
/**
 * Looks up an entry in the score registry (`registry2`).
 *
 * @param id - Key to look up.
 * @returns The registered entry, or `undefined` when the id is unknown.
 */
function getScoreById(id) {
  const definition = registry2.get(id);
  return definition;
}
|
|
680
935
|
|
|
681
936
|
// src/evals/scores/standard.ts
|
|
682
937
|
var percentScore = Score.of({
|
|
@@ -709,148 +964,197 @@ var binaryScore = Score.of({
|
|
|
709
964
|
},
|
|
710
965
|
aggregateValues: Score.aggregate.all
|
|
711
966
|
});
|
|
712
|
-
|
|
713
|
-
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
const keys = Array.isArray(options.excludeKeys) ? options.excludeKeys : options.excludeKeys.split(",").map((k) => k.trim());
|
|
722
|
-
const filtered = {};
|
|
723
|
-
for (const [k, v] of Object.entries(value)) {
|
|
724
|
-
if (!keys.includes(k)) {
|
|
725
|
-
filtered[k] = preprocessForDiff(v, options);
|
|
726
|
-
}
|
|
967
|
+
|
|
968
|
+
// src/evals/tag-set.ts
|
|
969
|
+
/** Namespace-style class for building tag lookup objects. */
var TagSet = class {
  /**
   * Builds an object in which every tag is both key and value,
   * e.g. `["a", "b"]` becomes `{ a: "a", b: "b" }`.
   *
   * @param tags - Iterable of tag strings.
   * @returns The tag lookup object.
   */
  static define(tags) {
    const lookup = {};
    [...tags].forEach((tag) => {
      lookup[tag] = tag;
    });
    return lookup;
  }
};
|
|
980
|
+
|
|
981
|
+
// src/evals/test-case.ts
|
|
982
|
+
/**
 * Unwraps a value that may be given lazily.
 *
 * @param value - Either a plain value or a zero-argument function producing one.
 * @returns The result of calling `value` when it is a function, otherwise `value` itself.
 */
function resolve(value) {
  if (typeof value !== "function") {
    return value;
  }
  return value();
}
|
|
985
|
+
var TestCase = class _TestCase {
|
|
986
|
+
constructor(config) {
|
|
987
|
+
this._config = config;
|
|
736
988
|
}
|
|
737
|
-
|
|
738
|
-
|
|
989
|
+
static describe(config) {
|
|
990
|
+
const name = validateTestCaseName(config.name, "TestCase.describe");
|
|
991
|
+
const displayName = normalizeOptionalDisplayName(config.displayName);
|
|
992
|
+
return new _TestCase({
|
|
993
|
+
name,
|
|
994
|
+
displayName,
|
|
995
|
+
tags: config.tags,
|
|
996
|
+
inputSchema: config.inputSchema,
|
|
997
|
+
input: config.input,
|
|
998
|
+
outputSchema: config.outputSchema,
|
|
999
|
+
output: config.output
|
|
1000
|
+
});
|
|
739
1001
|
}
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
function toPrettyJson(value) {
|
|
743
|
-
const str = stringify(value);
|
|
744
|
-
try {
|
|
745
|
-
const parsed = JSON.parse(str);
|
|
746
|
-
return JSON.stringify(parsed, null, 2);
|
|
747
|
-
} catch {
|
|
748
|
-
return str;
|
|
1002
|
+
getName() {
|
|
1003
|
+
return this._config.name;
|
|
749
1004
|
}
|
|
750
|
-
|
|
751
|
-
|
|
752
|
-
const lines = [];
|
|
753
|
-
for (const part of parts) {
|
|
754
|
-
const prefix = part.added ? "+ " : part.removed ? "- " : "";
|
|
755
|
-
const partLines = part.value.split("\n");
|
|
756
|
-
for (let i = 0; i < partLines.length; i++) {
|
|
757
|
-
const line = partLines[i];
|
|
758
|
-
if (i === partLines.length - 1 && line === "")
|
|
759
|
-
continue;
|
|
760
|
-
lines.push(prefix + line);
|
|
761
|
-
}
|
|
1005
|
+
getDisplayName() {
|
|
1006
|
+
return this._config.displayName;
|
|
762
1007
|
}
|
|
763
|
-
|
|
764
|
-
|
|
765
|
-
function createDiffString(expected, actual, diffOptions) {
|
|
766
|
-
const expectedProcessed = preprocessForDiff(expected, diffOptions);
|
|
767
|
-
const actualProcessed = preprocessForDiff(actual, diffOptions);
|
|
768
|
-
if (diffOptions?.keysOnly) {
|
|
769
|
-
const expectedKeys = JSON.stringify(extractKeys(expectedProcessed), null, 2);
|
|
770
|
-
const actualKeys = JSON.stringify(extractKeys(actualProcessed), null, 2);
|
|
771
|
-
const parts2 = diffLines(expectedKeys, actualKeys);
|
|
772
|
-
return formatDiffParts(parts2);
|
|
1008
|
+
getDisplayLabel() {
|
|
1009
|
+
return this._config.displayName ?? this._config.name;
|
|
773
1010
|
}
|
|
774
|
-
|
|
775
|
-
|
|
776
|
-
if (expectedStr === actualStr) {
|
|
777
|
-
return "";
|
|
1011
|
+
getTags() {
|
|
1012
|
+
return this._config.tags;
|
|
778
1013
|
}
|
|
779
|
-
|
|
780
|
-
|
|
781
|
-
const filtered = parts.filter((p) => p.added === true);
|
|
782
|
-
return formatDiffParts(filtered);
|
|
1014
|
+
getInputSchema() {
|
|
1015
|
+
return this._config.inputSchema;
|
|
783
1016
|
}
|
|
784
|
-
|
|
785
|
-
|
|
786
|
-
function extractKeys(value) {
|
|
787
|
-
if (value === null || typeof value !== "object") {
|
|
788
|
-
return "\xB7";
|
|
1017
|
+
getInput() {
|
|
1018
|
+
return resolve(this._config.input);
|
|
789
1019
|
}
|
|
790
|
-
|
|
791
|
-
return
|
|
1020
|
+
getOutputSchema() {
|
|
1021
|
+
return this._config.outputSchema;
|
|
792
1022
|
}
|
|
793
|
-
|
|
794
|
-
|
|
795
|
-
|
|
1023
|
+
getOutput() {
|
|
1024
|
+
if (this._config.output === void 0) {
|
|
1025
|
+
return void 0;
|
|
1026
|
+
}
|
|
1027
|
+
return resolve(this._config.output);
|
|
796
1028
|
}
|
|
797
|
-
|
|
1029
|
+
};
|
|
1030
|
+
function getTestCaseDisplayLabel(testCase) {
|
|
1031
|
+
if (typeof testCase.getDisplayLabel === "function") {
|
|
1032
|
+
return testCase.getDisplayLabel();
|
|
1033
|
+
}
|
|
1034
|
+
return typeof testCase.getName === "function" ? testCase.getName() : "";
|
|
798
1035
|
}
|
|
799
|
-
function
|
|
800
|
-
|
|
801
|
-
|
|
802
|
-
|
|
803
|
-
|
|
1036
|
+
function getTestCaseTagList(testCase) {
|
|
1037
|
+
return typeof testCase.getTags === "function" ? [...testCase.getTags()] : [];
|
|
1038
|
+
}
|
|
1039
|
+
async function loadRunSnapshotsFromArtifacts(config) {
|
|
1040
|
+
const baseDir = resolve$1(config.artifactDirectory);
|
|
1041
|
+
let entries;
|
|
804
1042
|
try {
|
|
805
|
-
|
|
806
|
-
return JSON.stringify(msg, null, 2);
|
|
807
|
-
}
|
|
808
|
-
return String(msg);
|
|
1043
|
+
entries = await readdir(baseDir);
|
|
809
1044
|
} catch {
|
|
810
|
-
return
|
|
1045
|
+
return [];
|
|
811
1046
|
}
|
|
1047
|
+
const jsonlFiles = entries.filter((name) => name.endsWith(".jsonl"));
|
|
1048
|
+
const snapshots = [];
|
|
1049
|
+
for (const fileName of jsonlFiles) {
|
|
1050
|
+
const filePath = join(baseDir, fileName);
|
|
1051
|
+
try {
|
|
1052
|
+
const snapshot = await parseArtifactToSnapshot(filePath, config);
|
|
1053
|
+
if (snapshot) {
|
|
1054
|
+
snapshots.push(snapshot);
|
|
1055
|
+
}
|
|
1056
|
+
} catch {
|
|
1057
|
+
}
|
|
1058
|
+
}
|
|
1059
|
+
return snapshots.sort((a, b) => b.queuedAt - a.queuedAt);
|
|
812
1060
|
}
|
|
813
|
-
function
|
|
814
|
-
|
|
815
|
-
|
|
816
|
-
|
|
817
|
-
|
|
818
|
-
}
|
|
819
|
-
|
|
820
|
-
|
|
821
|
-
|
|
822
|
-
|
|
823
|
-
|
|
824
|
-
|
|
825
|
-
|
|
826
|
-
|
|
827
|
-
|
|
828
|
-
|
|
829
|
-
|
|
830
|
-
|
|
831
|
-
|
|
832
|
-
|
|
833
|
-
|
|
834
|
-
|
|
835
|
-
|
|
836
|
-
|
|
837
|
-
if (color) {
|
|
838
|
-
const lines = diff.split("\n").map((line) => {
|
|
839
|
-
const trimmed = line.trimStart();
|
|
840
|
-
if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
|
|
841
|
-
return `\x1B[31m${line}\x1B[0m`;
|
|
1061
|
+
async function parseArtifactToSnapshot(filePath, _config) {
|
|
1062
|
+
const content = await readFile(filePath, "utf8");
|
|
1063
|
+
const lines = content.split("\n").filter((line) => line.trim().length > 0);
|
|
1064
|
+
if (lines.length === 0) {
|
|
1065
|
+
return null;
|
|
1066
|
+
}
|
|
1067
|
+
let runQueued = null;
|
|
1068
|
+
let runCompleted = null;
|
|
1069
|
+
let runFailed = null;
|
|
1070
|
+
let runStarted = null;
|
|
1071
|
+
for (const line of lines) {
|
|
1072
|
+
try {
|
|
1073
|
+
const event = JSON.parse(line);
|
|
1074
|
+
const type = event.type;
|
|
1075
|
+
if (type === "RunQueued") {
|
|
1076
|
+
runQueued = {
|
|
1077
|
+
runId: event.runId,
|
|
1078
|
+
datasetId: event.datasetId,
|
|
1079
|
+
datasetName: event.datasetName,
|
|
1080
|
+
evaluatorIds: event.evaluatorIds,
|
|
1081
|
+
totalTestCases: event.totalTestCases ?? 0,
|
|
1082
|
+
artifactPath: event.artifactPath ?? filePath,
|
|
1083
|
+
ts: event.ts
|
|
1084
|
+
};
|
|
842
1085
|
}
|
|
843
|
-
if (
|
|
844
|
-
|
|
1086
|
+
if (type === "RunStarted") {
|
|
1087
|
+
runStarted = { startedAt: event.startedAt };
|
|
1088
|
+
}
|
|
1089
|
+
if (type === "RunCompleted") {
|
|
1090
|
+
runCompleted = {
|
|
1091
|
+
passedTestCases: event.passedTestCases,
|
|
1092
|
+
failedTestCases: event.failedTestCases,
|
|
1093
|
+
totalTestCases: event.totalTestCases,
|
|
1094
|
+
finishedAt: event.finishedAt
|
|
1095
|
+
};
|
|
1096
|
+
}
|
|
1097
|
+
if (type === "RunFailed") {
|
|
1098
|
+
runFailed = {
|
|
1099
|
+
finishedAt: event.finishedAt,
|
|
1100
|
+
errorMessage: event.errorMessage
|
|
1101
|
+
};
|
|
1102
|
+
}
|
|
1103
|
+
} catch {
|
|
1104
|
+
}
|
|
1105
|
+
}
|
|
1106
|
+
if (!runQueued) {
|
|
1107
|
+
return null;
|
|
1108
|
+
}
|
|
1109
|
+
const artifactPath = filePath;
|
|
1110
|
+
const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
|
|
1111
|
+
const progress = aggregateTestCaseProgress(lines);
|
|
1112
|
+
const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
|
|
1113
|
+
const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
|
|
1114
|
+
const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
|
|
1115
|
+
return {
|
|
1116
|
+
runId: runQueued.runId,
|
|
1117
|
+
datasetId: runQueued.datasetId,
|
|
1118
|
+
datasetName: runQueued.datasetName,
|
|
1119
|
+
evaluatorIds: runQueued.evaluatorIds,
|
|
1120
|
+
queuedAt: runQueued.ts ?? 0,
|
|
1121
|
+
startedAt: runStarted?.startedAt,
|
|
1122
|
+
finishedAt: runCompleted?.finishedAt ?? runFailed?.finishedAt,
|
|
1123
|
+
totalTestCases: runQueued.totalTestCases,
|
|
1124
|
+
completedTestCases,
|
|
1125
|
+
passedTestCases,
|
|
1126
|
+
failedTestCases,
|
|
1127
|
+
status,
|
|
1128
|
+
artifactPath,
|
|
1129
|
+
errorMessage: runFailed?.errorMessage
|
|
1130
|
+
};
|
|
1131
|
+
}
|
|
1132
|
+
function aggregateTestCaseProgress(lines) {
|
|
1133
|
+
let completedTestCases = 0;
|
|
1134
|
+
const testCasePassedBy = /* @__PURE__ */ new Map();
|
|
1135
|
+
for (const line of lines) {
|
|
1136
|
+
try {
|
|
1137
|
+
const event = JSON.parse(line);
|
|
1138
|
+
if (event.type === "TestCaseProgress") {
|
|
1139
|
+
const ev = event;
|
|
1140
|
+
completedTestCases = ev.completedTestCases ?? completedTestCases;
|
|
1141
|
+
const id = ev.testCaseId;
|
|
1142
|
+
const current = testCasePassedBy.get(id);
|
|
1143
|
+
testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
|
|
845
1144
|
}
|
|
846
|
-
|
|
847
|
-
}
|
|
848
|
-
const colored = lines.join("\n");
|
|
849
|
-
console.log(colored || "(no differences)");
|
|
850
|
-
return colored;
|
|
1145
|
+
} catch {
|
|
1146
|
+
}
|
|
851
1147
|
}
|
|
852
|
-
|
|
853
|
-
|
|
1148
|
+
let passedTestCases = 0;
|
|
1149
|
+
let failedTestCases = 0;
|
|
1150
|
+
for (const passed of testCasePassedBy.values()) {
|
|
1151
|
+
if (passed) {
|
|
1152
|
+
passedTestCases += 1;
|
|
1153
|
+
} else {
|
|
1154
|
+
failedTestCases += 1;
|
|
1155
|
+
}
|
|
1156
|
+
}
|
|
1157
|
+
return { completedTestCases, passedTestCases, failedTestCases };
|
|
854
1158
|
}
|
|
855
1159
|
|
|
856
1160
|
// src/runner/config.ts
|
|
@@ -862,6 +1166,7 @@ var defaultRunnerConfig = {
|
|
|
862
1166
|
rootDir: process.cwd(),
|
|
863
1167
|
datasetSuffixes: [".dataset.ts", ".dataset.tsx", ".dataset.js", ".dataset.mjs"],
|
|
864
1168
|
evaluatorSuffixes: [".evaluator.ts", ".evaluator.tsx", ".evaluator.js", ".evaluator.mjs"],
|
|
1169
|
+
runConfigSuffixes: [".run-config.ts", ".run-config.tsx", ".run-config.js", ".run-config.mjs"],
|
|
865
1170
|
testCaseSuffixes: [".test-case.ts", ".test-case.tsx", ".test-case.js", ".test-case.mjs"],
|
|
866
1171
|
excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
|
|
867
1172
|
},
|
|
@@ -887,6 +1192,11 @@ function toRunnerConfigOverrides(config) {
|
|
|
887
1192
|
} else if (rawDiscovery?.evaluatorSuffixes !== void 0) {
|
|
888
1193
|
discovery.evaluatorSuffixes = rawDiscovery.evaluatorSuffixes;
|
|
889
1194
|
}
|
|
1195
|
+
if (rawDiscovery?.runConfigFilePatterns !== void 0) {
|
|
1196
|
+
discovery.runConfigSuffixes = rawDiscovery.runConfigFilePatterns;
|
|
1197
|
+
} else if (rawDiscovery?.runConfigSuffixes !== void 0) {
|
|
1198
|
+
discovery.runConfigSuffixes = rawDiscovery.runConfigSuffixes;
|
|
1199
|
+
}
|
|
890
1200
|
if (rawDiscovery?.testCaseFilePatterns !== void 0) {
|
|
891
1201
|
discovery.testCaseSuffixes = rawDiscovery.testCaseFilePatterns;
|
|
892
1202
|
} else if (rawDiscovery?.testCaseSuffixes !== void 0) {
|
|
@@ -985,6 +1295,9 @@ function isDatasetLike(value) {
|
|
|
985
1295
|
function isEvaluatorLike(value) {
|
|
986
1296
|
return hasMethod(value, "getName") && hasMethod(value, "resolveContext") && hasMethod(value, "getEvaluateFn");
|
|
987
1297
|
}
|
|
1298
|
+
function isRunConfigLike(value) {
|
|
1299
|
+
return hasMethod(value, "getName") && hasMethod(value, "getRuns") && typeof value.getRuns === "function";
|
|
1300
|
+
}
|
|
988
1301
|
function isTestCaseLike(value) {
|
|
989
1302
|
return hasMethod(value, "getName") && hasMethod(value, "getTags") && hasMethod(value, "getInput");
|
|
990
1303
|
}
|
|
@@ -1073,6 +1386,23 @@ async function collectEvaluatorsFromFiles(config) {
|
|
|
1073
1386
|
);
|
|
1074
1387
|
return found.flat();
|
|
1075
1388
|
}
|
|
1389
|
+
async function collectRunConfigsFromFiles(config) {
|
|
1390
|
+
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
1391
|
+
const matched = files.filter((filePath) => hasOneSuffix(filePath, config.runConfigSuffixes));
|
|
1392
|
+
const found = await Promise.all(
|
|
1393
|
+
matched.map(async (absolutePath) => {
|
|
1394
|
+
const exports = await loadModuleExports(absolutePath);
|
|
1395
|
+
const runConfigs = exports.filter(isRunConfigLike);
|
|
1396
|
+
const relPath = relative(config.rootDir, absolutePath);
|
|
1397
|
+
return runConfigs.map((runConfig) => ({
|
|
1398
|
+
id: runConfig.getName(),
|
|
1399
|
+
filePath: relPath,
|
|
1400
|
+
runConfig
|
|
1401
|
+
}));
|
|
1402
|
+
})
|
|
1403
|
+
);
|
|
1404
|
+
return found.flat();
|
|
1405
|
+
}
|
|
1076
1406
|
async function collectTestCasesFromFiles(config) {
|
|
1077
1407
|
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
1078
1408
|
const matched = files.filter((filePath) => hasOneSuffix(filePath, config.testCaseSuffixes));
|
|
@@ -1165,15 +1495,17 @@ function readOutput(testCase) {
|
|
|
1165
1495
|
}
|
|
1166
1496
|
return candidate.getOutput();
|
|
1167
1497
|
}
|
|
1168
|
-
function buildEvaluationUnits(testCases) {
|
|
1498
|
+
function buildEvaluationUnits(testCases, repetitionCount) {
|
|
1499
|
+
const count = Math.max(1, repetitionCount);
|
|
1169
1500
|
const units = [];
|
|
1170
1501
|
for (const testCaseItem of testCases) {
|
|
1171
|
-
const
|
|
1172
|
-
for (let r = 0; r <
|
|
1502
|
+
const repetitionId = `rep-${randomUUID()}`;
|
|
1503
|
+
for (let r = 0; r < count; r++) {
|
|
1173
1504
|
units.push({
|
|
1174
1505
|
testCaseItem,
|
|
1175
|
-
|
|
1176
|
-
|
|
1506
|
+
repetitionId,
|
|
1507
|
+
repetitionIndex: r + 1,
|
|
1508
|
+
repetitionCount: count
|
|
1177
1509
|
});
|
|
1178
1510
|
}
|
|
1179
1511
|
}
|
|
@@ -1186,7 +1518,7 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
|
|
|
1186
1518
|
return join(artifactDirectory, `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`);
|
|
1187
1519
|
}
|
|
1188
1520
|
function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
|
|
1189
|
-
const { testCaseItem,
|
|
1521
|
+
const { testCaseItem, repetitionId, repetitionIndex, repetitionCount } = unit;
|
|
1190
1522
|
return Effect.gen(function* () {
|
|
1191
1523
|
const evaluatorRunId = `run-${randomUUID()}`;
|
|
1192
1524
|
const started = Date.now();
|
|
@@ -1195,11 +1527,12 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1195
1527
|
type: "TestCaseStarted",
|
|
1196
1528
|
runId: task.runId,
|
|
1197
1529
|
testCaseId: testCaseItem.id,
|
|
1198
|
-
testCaseName: testCaseItem.testCase
|
|
1530
|
+
testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
|
|
1199
1531
|
startedTestCases: startedEvaluations,
|
|
1200
1532
|
totalTestCases: totalEvaluations,
|
|
1201
|
-
|
|
1202
|
-
|
|
1533
|
+
repetitionId,
|
|
1534
|
+
repetitionIndex,
|
|
1535
|
+
repetitionCount
|
|
1203
1536
|
});
|
|
1204
1537
|
const evaluatorScores = [];
|
|
1205
1538
|
let testCaseError;
|
|
@@ -1233,8 +1566,15 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1233
1566
|
meta: {
|
|
1234
1567
|
triggerId: task.triggerId,
|
|
1235
1568
|
runId: evaluatorRunId,
|
|
1236
|
-
|
|
1569
|
+
datasetName: task.dataset.getDisplayLabel(),
|
|
1570
|
+
repetitionId,
|
|
1571
|
+
repetitionIndex,
|
|
1572
|
+
repetitionCount,
|
|
1573
|
+
runConfigName: task.runConfigName
|
|
1237
1574
|
},
|
|
1575
|
+
testCaseTags: getTestCaseTagList(testCaseItem.testCase),
|
|
1576
|
+
runConfigTags: task.runConfigTags,
|
|
1577
|
+
evaluatorTags: getEvaluatorTagList(evaluator),
|
|
1238
1578
|
logDiff,
|
|
1239
1579
|
log,
|
|
1240
1580
|
createError
|
|
@@ -1277,18 +1617,19 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1277
1617
|
});
|
|
1278
1618
|
}
|
|
1279
1619
|
}
|
|
1280
|
-
const
|
|
1620
|
+
const repetitionPassedThis = evaluatorScores.every((s) => s.passed);
|
|
1281
1621
|
const completedEvaluations = yield* Ref.modify(completedRef, (n) => [n + 1, n + 1]);
|
|
1282
1622
|
const progressEvent = {
|
|
1283
1623
|
type: "TestCaseProgress",
|
|
1284
1624
|
runId: task.runId,
|
|
1285
1625
|
testCaseId: testCaseItem.id,
|
|
1286
|
-
testCaseName: testCaseItem.testCase
|
|
1626
|
+
testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
|
|
1287
1627
|
completedTestCases: completedEvaluations,
|
|
1288
1628
|
totalTestCases: totalEvaluations,
|
|
1289
|
-
|
|
1290
|
-
|
|
1291
|
-
|
|
1629
|
+
repetitionId,
|
|
1630
|
+
repetitionIndex,
|
|
1631
|
+
repetitionCount,
|
|
1632
|
+
passed: repetitionPassedThis,
|
|
1292
1633
|
durationMs: Date.now() - started,
|
|
1293
1634
|
evaluatorScores,
|
|
1294
1635
|
output,
|
|
@@ -1309,9 +1650,9 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1309
1650
|
(map) => {
|
|
1310
1651
|
const key = testCaseItem.id;
|
|
1311
1652
|
const existing = map.get(key) ?? { completedCount: 0, results: [] };
|
|
1312
|
-
const newResults = [...existing.results,
|
|
1653
|
+
const newResults = [...existing.results, repetitionPassedThis];
|
|
1313
1654
|
const newCompletedCount = existing.completedCount + 1;
|
|
1314
|
-
const isLast = newCompletedCount ===
|
|
1655
|
+
const isLast = newCompletedCount === repetitionCount;
|
|
1315
1656
|
const newMap = new Map(map);
|
|
1316
1657
|
newMap.set(key, {
|
|
1317
1658
|
completedCount: newCompletedCount,
|
|
@@ -1348,10 +1689,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
1348
1689
|
runId: task.runId,
|
|
1349
1690
|
startedAt
|
|
1350
1691
|
});
|
|
1351
|
-
const totalEvaluations = task.testCases.
|
|
1352
|
-
(sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
|
|
1353
|
-
0
|
|
1354
|
-
);
|
|
1692
|
+
const totalEvaluations = task.testCases.length * Math.max(1, task.repetitions);
|
|
1355
1693
|
const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
|
|
1356
1694
|
const completedRef = yield* Ref.make(0);
|
|
1357
1695
|
const startedRef = yield* Ref.make(0);
|
|
@@ -1360,7 +1698,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
1360
1698
|
const testCaseResultsRef = yield* Ref.make(
|
|
1361
1699
|
/* @__PURE__ */ new Map()
|
|
1362
1700
|
);
|
|
1363
|
-
const evaluationUnits = buildEvaluationUnits(task.testCases);
|
|
1701
|
+
const evaluationUnits = buildEvaluationUnits(task.testCases, task.repetitions);
|
|
1364
1702
|
const processEvaluation = (unit) => processOneEvaluation(
|
|
1365
1703
|
task,
|
|
1366
1704
|
unit,
|
|
@@ -1374,11 +1712,20 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
1374
1712
|
failedRef,
|
|
1375
1713
|
testCaseResultsRef
|
|
1376
1714
|
);
|
|
1377
|
-
|
|
1378
|
-
|
|
1379
|
-
|
|
1380
|
-
|
|
1381
|
-
|
|
1715
|
+
const globalSem = task.globalEvaluationSemaphore;
|
|
1716
|
+
if (globalSem !== void 0) {
|
|
1717
|
+
yield* Effect.forEach(
|
|
1718
|
+
evaluationUnits,
|
|
1719
|
+
(unit) => globalSem.withPermits(1)(processEvaluation(unit)),
|
|
1720
|
+
{ concurrency: "unbounded", discard: true }
|
|
1721
|
+
);
|
|
1722
|
+
} else {
|
|
1723
|
+
yield* Effect.forEach(
|
|
1724
|
+
evaluationUnits,
|
|
1725
|
+
processEvaluation,
|
|
1726
|
+
maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
|
|
1727
|
+
);
|
|
1728
|
+
}
|
|
1382
1729
|
const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* Effect.all([
|
|
1383
1730
|
Ref.get(completedRef),
|
|
1384
1731
|
Ref.get(passedRef),
|
|
@@ -1414,125 +1761,34 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
1414
1761
|
artifactPath: task.snapshot.artifactPath
|
|
1415
1762
|
});
|
|
1416
1763
|
});
|
|
1417
|
-
|
|
1418
|
-
|
|
1419
|
-
|
|
1420
|
-
|
|
1421
|
-
|
|
1422
|
-
} catch {
|
|
1423
|
-
return [];
|
|
1424
|
-
}
|
|
1425
|
-
const jsonlFiles = entries.filter((name) => name.endsWith(".jsonl"));
|
|
1426
|
-
const snapshots = [];
|
|
1427
|
-
for (const fileName of jsonlFiles) {
|
|
1428
|
-
const filePath = join(baseDir, fileName);
|
|
1429
|
-
try {
|
|
1430
|
-
const snapshot = await parseArtifactToSnapshot(filePath, config);
|
|
1431
|
-
if (snapshot) {
|
|
1432
|
-
snapshots.push(snapshot);
|
|
1433
|
-
}
|
|
1434
|
-
} catch {
|
|
1435
|
-
}
|
|
1436
|
-
}
|
|
1437
|
-
return snapshots.sort((a, b) => b.queuedAt - a.queuedAt);
|
|
1438
|
-
}
|
|
1439
|
-
async function parseArtifactToSnapshot(filePath, _config) {
|
|
1440
|
-
const content = await readFile(filePath, "utf8");
|
|
1441
|
-
const lines = content.split("\n").filter((line) => line.trim().length > 0);
|
|
1442
|
-
if (lines.length === 0) {
|
|
1443
|
-
return null;
|
|
1444
|
-
}
|
|
1445
|
-
let runQueued = null;
|
|
1446
|
-
let runCompleted = null;
|
|
1447
|
-
let runFailed = null;
|
|
1448
|
-
let runStarted = null;
|
|
1449
|
-
for (const line of lines) {
|
|
1450
|
-
try {
|
|
1451
|
-
const event = JSON.parse(line);
|
|
1452
|
-
const type = event.type;
|
|
1453
|
-
if (type === "RunQueued") {
|
|
1454
|
-
runQueued = {
|
|
1455
|
-
runId: event.runId,
|
|
1456
|
-
datasetId: event.datasetId,
|
|
1457
|
-
datasetName: event.datasetName,
|
|
1458
|
-
evaluatorIds: event.evaluatorIds,
|
|
1459
|
-
totalTestCases: event.totalTestCases ?? 0,
|
|
1460
|
-
artifactPath: event.artifactPath ?? filePath,
|
|
1461
|
-
ts: event.ts
|
|
1462
|
-
};
|
|
1463
|
-
}
|
|
1464
|
-
if (type === "RunStarted") {
|
|
1465
|
-
runStarted = { startedAt: event.startedAt };
|
|
1466
|
-
}
|
|
1467
|
-
if (type === "RunCompleted") {
|
|
1468
|
-
runCompleted = {
|
|
1469
|
-
passedTestCases: event.passedTestCases,
|
|
1470
|
-
failedTestCases: event.failedTestCases,
|
|
1471
|
-
totalTestCases: event.totalTestCases,
|
|
1472
|
-
finishedAt: event.finishedAt
|
|
1473
|
-
};
|
|
1474
|
-
}
|
|
1475
|
-
if (type === "RunFailed") {
|
|
1476
|
-
runFailed = {
|
|
1477
|
-
finishedAt: event.finishedAt,
|
|
1478
|
-
errorMessage: event.errorMessage
|
|
1479
|
-
};
|
|
1480
|
-
}
|
|
1481
|
-
} catch {
|
|
1482
|
-
}
|
|
1764
|
+
|
|
1765
|
+
// src/runner/name-pattern.ts
|
|
1766
|
+
function parseRegexLiteral(pattern) {
|
|
1767
|
+
if (!pattern.startsWith("/")) {
|
|
1768
|
+
return void 0;
|
|
1483
1769
|
}
|
|
1484
|
-
|
|
1485
|
-
|
|
1770
|
+
const lastSlash = pattern.lastIndexOf("/");
|
|
1771
|
+
if (lastSlash <= 0) {
|
|
1772
|
+
return void 0;
|
|
1486
1773
|
}
|
|
1487
|
-
const artifactPath = filePath;
|
|
1488
|
-
const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
|
|
1489
|
-
const progress = aggregateTestCaseProgress(lines);
|
|
1490
|
-
const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
|
|
1491
|
-
const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
|
|
1492
|
-
const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
|
|
1493
1774
|
return {
|
|
1494
|
-
|
|
1495
|
-
|
|
1496
|
-
datasetName: runQueued.datasetName,
|
|
1497
|
-
evaluatorIds: runQueued.evaluatorIds,
|
|
1498
|
-
queuedAt: runQueued.ts ?? 0,
|
|
1499
|
-
startedAt: runStarted?.startedAt,
|
|
1500
|
-
finishedAt: runCompleted?.finishedAt ?? runFailed?.finishedAt,
|
|
1501
|
-
totalTestCases: runQueued.totalTestCases,
|
|
1502
|
-
completedTestCases,
|
|
1503
|
-
passedTestCases,
|
|
1504
|
-
failedTestCases,
|
|
1505
|
-
status,
|
|
1506
|
-
artifactPath,
|
|
1507
|
-
errorMessage: runFailed?.errorMessage
|
|
1775
|
+
source: pattern.slice(1, lastSlash),
|
|
1776
|
+
flags: pattern.slice(lastSlash + 1)
|
|
1508
1777
|
};
|
|
1509
1778
|
}
|
|
1510
|
-
function
|
|
1511
|
-
|
|
1512
|
-
const
|
|
1513
|
-
|
|
1514
|
-
|
|
1515
|
-
|
|
1516
|
-
if (event.type === "TestCaseProgress") {
|
|
1517
|
-
const ev = event;
|
|
1518
|
-
completedTestCases = ev.completedTestCases ?? completedTestCases;
|
|
1519
|
-
const id = ev.testCaseId;
|
|
1520
|
-
const current = testCasePassedBy.get(id);
|
|
1521
|
-
testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
|
|
1522
|
-
}
|
|
1523
|
-
} catch {
|
|
1524
|
-
}
|
|
1779
|
+
function createNameMatcher(pattern) {
|
|
1780
|
+
const normalizedPattern = pattern.trim();
|
|
1781
|
+
const regexLiteral = parseRegexLiteral(normalizedPattern);
|
|
1782
|
+
if (regexLiteral) {
|
|
1783
|
+
const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
|
|
1784
|
+
return (value) => regex.test(value);
|
|
1525
1785
|
}
|
|
1526
|
-
|
|
1527
|
-
|
|
1528
|
-
|
|
1529
|
-
|
|
1530
|
-
passedTestCases += 1;
|
|
1531
|
-
} else {
|
|
1532
|
-
failedTestCases += 1;
|
|
1533
|
-
}
|
|
1786
|
+
if (normalizedPattern.includes("*")) {
|
|
1787
|
+
const escaped = normalizedPattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
|
|
1788
|
+
const regex = new RegExp(`^${escaped}$`, "i");
|
|
1789
|
+
return (value) => regex.test(value);
|
|
1534
1790
|
}
|
|
1535
|
-
return
|
|
1791
|
+
return (value) => value.toLowerCase() === normalizedPattern.toLowerCase();
|
|
1536
1792
|
}
|
|
1537
1793
|
async function appendJsonLine(artifactPath, payload) {
|
|
1538
1794
|
await mkdir(dirname(artifactPath), { recursive: true });
|
|
@@ -1591,32 +1847,12 @@ function searchCollectedTestCases(all, query) {
|
|
|
1591
1847
|
}
|
|
1592
1848
|
|
|
1593
1849
|
// src/runner/api.ts
|
|
1594
|
-
function
|
|
1595
|
-
|
|
1596
|
-
|
|
1597
|
-
|
|
1598
|
-
const lastSlash = pattern.lastIndexOf("/");
|
|
1599
|
-
if (lastSlash <= 0) {
|
|
1600
|
-
return void 0;
|
|
1601
|
-
}
|
|
1602
|
-
return {
|
|
1603
|
-
source: pattern.slice(1, lastSlash),
|
|
1604
|
-
flags: pattern.slice(lastSlash + 1)
|
|
1605
|
-
};
|
|
1606
|
-
}
|
|
1607
|
-
function createNameMatcher(pattern) {
|
|
1608
|
-
const normalizedPattern = pattern.trim();
|
|
1609
|
-
const regexLiteral = parseRegexLiteral(normalizedPattern);
|
|
1610
|
-
if (regexLiteral) {
|
|
1611
|
-
const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
|
|
1612
|
-
return (value) => regex.test(value);
|
|
1613
|
-
}
|
|
1614
|
-
if (normalizedPattern.includes("*")) {
|
|
1615
|
-
const escaped = normalizedPattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
|
|
1616
|
-
const regex = new RegExp(`^${escaped}$`, "i");
|
|
1617
|
-
return (value) => regex.test(value);
|
|
1850
|
+
function normalizeRunRepetitions(value) {
|
|
1851
|
+
const n = value ?? 1;
|
|
1852
|
+
if (!Number.isInteger(n) || n < 1) {
|
|
1853
|
+
throw new Error(`repetitions must be a positive integer, got ${String(value)}`);
|
|
1618
1854
|
}
|
|
1619
|
-
return
|
|
1855
|
+
return n;
|
|
1620
1856
|
}
|
|
1621
1857
|
function mergeRunnerOverrides(base, next) {
|
|
1622
1858
|
if (!base) {
|
|
@@ -1651,6 +1887,7 @@ var EffectRunner = class {
|
|
|
1651
1887
|
this.listeners = /* @__PURE__ */ new Set();
|
|
1652
1888
|
this.datasetsById = /* @__PURE__ */ new Map();
|
|
1653
1889
|
this.evaluatorsById = /* @__PURE__ */ new Map();
|
|
1890
|
+
this.runConfigsById = /* @__PURE__ */ new Map();
|
|
1654
1891
|
this.schedulerFiber = Effect.runFork(this.createSchedulerEffect());
|
|
1655
1892
|
this.persistenceFiber = Effect.runFork(
|
|
1656
1893
|
createPersistenceWorker(this.persistenceQueue)
|
|
@@ -1691,6 +1928,137 @@ var EffectRunner = class {
|
|
|
1691
1928
|
(item) => matcher(item.evaluator.getName() ?? "")
|
|
1692
1929
|
);
|
|
1693
1930
|
}
|
|
1931
|
+
async collectRunConfigs() {
|
|
1932
|
+
const runConfigs = await collectRunConfigsFromFiles(this.config.discovery);
|
|
1933
|
+
this.runConfigsById.clear();
|
|
1934
|
+
const byNameLower = /* @__PURE__ */ new Map();
|
|
1935
|
+
for (const item of runConfigs) {
|
|
1936
|
+
const id = item.runConfig.getName();
|
|
1937
|
+
const lower = id.toLowerCase();
|
|
1938
|
+
const prev = byNameLower.get(lower);
|
|
1939
|
+
if (prev !== void 0 && prev.filePath !== item.filePath) {
|
|
1940
|
+
throw new Error(
|
|
1941
|
+
`Duplicate RunConfig name "${id}" (matches "${prev.runConfig.getName()}" case-insensitively): ${prev.filePath} and ${item.filePath}`
|
|
1942
|
+
);
|
|
1943
|
+
}
|
|
1944
|
+
byNameLower.set(lower, item);
|
|
1945
|
+
this.runConfigsById.set(id, item);
|
|
1946
|
+
}
|
|
1947
|
+
return runConfigs;
|
|
1948
|
+
}
|
|
1949
|
+
async resolveRunConfigByName(name) {
|
|
1950
|
+
if (this.runConfigsById.size === 0) {
|
|
1951
|
+
await this.collectRunConfigs();
|
|
1952
|
+
}
|
|
1953
|
+
const key = validateRunConfigName(name, `RunConfig "${name.trim()}"`);
|
|
1954
|
+
const keyLower = key.toLowerCase();
|
|
1955
|
+
const matches = Array.from(this.runConfigsById.values()).filter(
|
|
1956
|
+
(item) => item.runConfig.getName().toLowerCase() === keyLower
|
|
1957
|
+
);
|
|
1958
|
+
if (matches.length === 0) {
|
|
1959
|
+
return void 0;
|
|
1960
|
+
}
|
|
1961
|
+
if (matches.length > 1) {
|
|
1962
|
+
throw new Error(
|
|
1963
|
+
`Multiple RunConfigs named "${name}": ${matches.map((m) => m.filePath).join(", ")}`
|
|
1964
|
+
);
|
|
1965
|
+
}
|
|
1966
|
+
return matches[0];
|
|
1967
|
+
}
|
|
1968
|
+
async expandRunConfigToJobs(collected) {
|
|
1969
|
+
if (this.datasetsById.size === 0) {
|
|
1970
|
+
await this.collectDatasets();
|
|
1971
|
+
}
|
|
1972
|
+
if (this.evaluatorsById.size === 0) {
|
|
1973
|
+
await this.collectEvaluators();
|
|
1974
|
+
}
|
|
1975
|
+
const rcName = collected.runConfig.getName();
|
|
1976
|
+
const jobs = [];
|
|
1977
|
+
const runs = collected.runConfig.getRuns();
|
|
1978
|
+
for (const [i, row] of runs.entries()) {
|
|
1979
|
+
const dsCollected = Array.from(this.datasetsById.values()).find(
|
|
1980
|
+
(d) => d.dataset === row.dataset
|
|
1981
|
+
);
|
|
1982
|
+
if (!dsCollected) {
|
|
1983
|
+
throw new Error(
|
|
1984
|
+
`RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getDisplayLabel()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
|
|
1985
|
+
);
|
|
1986
|
+
}
|
|
1987
|
+
let evaluatorIds;
|
|
1988
|
+
if ("evaluatorPattern" in row && typeof row.evaluatorPattern === "string") {
|
|
1989
|
+
const matcher = createNameMatcher(row.evaluatorPattern);
|
|
1990
|
+
const matched = Array.from(this.evaluatorsById.values()).filter(
|
|
1991
|
+
(item) => matcher(item.evaluator.getName() ?? "")
|
|
1992
|
+
);
|
|
1993
|
+
if (matched.length === 0) {
|
|
1994
|
+
throw new Error(
|
|
1995
|
+
`RunConfig "${rcName}" run[${i}]: no evaluator matched pattern "${row.evaluatorPattern}"`
|
|
1996
|
+
);
|
|
1997
|
+
}
|
|
1998
|
+
evaluatorIds = matched.map((item) => item.id);
|
|
1999
|
+
} else {
|
|
2000
|
+
const evaluators = row.evaluators;
|
|
2001
|
+
evaluatorIds = [];
|
|
2002
|
+
for (const ev of evaluators) {
|
|
2003
|
+
const found = Array.from(this.evaluatorsById.values()).find(
|
|
2004
|
+
(item) => item.evaluator === ev
|
|
2005
|
+
);
|
|
2006
|
+
if (!found) {
|
|
2007
|
+
throw new Error(
|
|
2008
|
+
`RunConfig "${rcName}" run[${i}]: evaluator "${getEvaluatorDisplayLabel(ev) ?? "unknown"}" was not found among discovered evaluator exports`
|
|
2009
|
+
);
|
|
2010
|
+
}
|
|
2011
|
+
evaluatorIds.push(found.id);
|
|
2012
|
+
}
|
|
2013
|
+
}
|
|
2014
|
+
const repetitions = "repetitions" in row && row.repetitions !== void 0 ? row.repetitions : 1;
|
|
2015
|
+
jobs.push({
|
|
2016
|
+
datasetId: dsCollected.id,
|
|
2017
|
+
evaluatorIds,
|
|
2018
|
+
runConfigName: rcName,
|
|
2019
|
+
runConfigDisplayLabel: collected.runConfig.getDisplayLabel(),
|
|
2020
|
+
runConfigTags: collected.runConfig.getTags(),
|
|
2021
|
+
repetitions
|
|
2022
|
+
});
|
|
2023
|
+
}
|
|
2024
|
+
return jobs;
|
|
2025
|
+
}
|
|
2026
|
+
async expandRunConfigNamesToJobs(names) {
|
|
2027
|
+
const jobs = [];
|
|
2028
|
+
for (const name of names) {
|
|
2029
|
+
const collected = await this.resolveRunConfigByName(name);
|
|
2030
|
+
if (!collected) {
|
|
2031
|
+
const known = await this.collectRunConfigs();
|
|
2032
|
+
const available = known.map((r) => r.runConfig.getName()).sort();
|
|
2033
|
+
throw new Error(
|
|
2034
|
+
available.length > 0 ? `RunConfig "${name}" not found. Available RunConfigs: ${available.join(", ")}` : `RunConfig "${name}" not found and no RunConfigs were discovered.`
|
|
2035
|
+
);
|
|
2036
|
+
}
|
|
2037
|
+
jobs.push(...await this.expandRunConfigToJobs(collected));
|
|
2038
|
+
}
|
|
2039
|
+
return jobs;
|
|
2040
|
+
}
|
|
2041
|
+
async runDatasetJobsWithSharedConcurrency(request) {
|
|
2042
|
+
const globalConcurrency = Math.max(1, request.globalConcurrency);
|
|
2043
|
+
const sem = Effect.unsafeMakeSemaphore(globalConcurrency);
|
|
2044
|
+
const triggerId = request.triggerId ?? `trg-${randomUUID()}`;
|
|
2045
|
+
const snapshots = [];
|
|
2046
|
+
for (const job of request.jobs) {
|
|
2047
|
+
snapshots.push(
|
|
2048
|
+
await this.startDatasetRun({
|
|
2049
|
+
datasetId: job.datasetId,
|
|
2050
|
+
evaluatorIds: job.evaluatorIds,
|
|
2051
|
+
triggerId,
|
|
2052
|
+
maxConcurrency: this.config.maxConcurrency ?? 1,
|
|
2053
|
+
globalEvaluationSemaphore: sem,
|
|
2054
|
+
runConfigName: job.runConfigName,
|
|
2055
|
+
runConfigTags: job.runConfigTags,
|
|
2056
|
+
repetitions: job.repetitions
|
|
2057
|
+
})
|
|
2058
|
+
);
|
|
2059
|
+
}
|
|
2060
|
+
return snapshots;
|
|
2061
|
+
}
|
|
1694
2062
|
async searchTestCases(query) {
|
|
1695
2063
|
const testCases = await collectTestCasesFromFiles(this.config.discovery);
|
|
1696
2064
|
return searchCollectedTestCases(testCases, query);
|
|
@@ -1709,36 +2077,46 @@ var EffectRunner = class {
|
|
|
1709
2077
|
);
|
|
1710
2078
|
}
|
|
1711
2079
|
async runDatasetWith(request) {
|
|
2080
|
+
const runConfigName = validateRunConfigName(
|
|
2081
|
+
request.runConfigName,
|
|
2082
|
+
"runDatasetWith.runConfigName"
|
|
2083
|
+
);
|
|
2084
|
+
return this.startDatasetRun({
|
|
2085
|
+
datasetId: request.datasetId,
|
|
2086
|
+
evaluatorIds: request.evaluatorIds,
|
|
2087
|
+
triggerId: request.triggerId,
|
|
2088
|
+
maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
|
|
2089
|
+
repetitions: request.repetitions,
|
|
2090
|
+
runConfigName,
|
|
2091
|
+
runConfigTags: request.runConfigTags
|
|
2092
|
+
});
|
|
2093
|
+
}
|
|
2094
|
+
async startDatasetRun(params) {
|
|
1712
2095
|
if (this.datasetsById.size === 0) {
|
|
1713
2096
|
await this.collectDatasets();
|
|
1714
2097
|
}
|
|
1715
2098
|
if (this.evaluatorsById.size === 0) {
|
|
1716
2099
|
await this.collectEvaluators();
|
|
1717
2100
|
}
|
|
1718
|
-
const dataset = this.datasetsById.get(
|
|
2101
|
+
const dataset = this.datasetsById.get(params.datasetId);
|
|
1719
2102
|
if (!dataset) {
|
|
1720
|
-
throw new Error(`Unknown dataset: ${
|
|
2103
|
+
throw new Error(`Unknown dataset: ${params.datasetId}`);
|
|
1721
2104
|
}
|
|
1722
|
-
const selectedEvaluators =
|
|
2105
|
+
const selectedEvaluators = params.evaluatorIds.map((id) => this.evaluatorsById.get(id)).filter((value) => Boolean(value)).map((value) => ({ id: value.id, evaluator: value.evaluator }));
|
|
1723
2106
|
if (selectedEvaluators.length === 0) {
|
|
1724
2107
|
throw new Error("No evaluators selected for run");
|
|
1725
2108
|
}
|
|
1726
|
-
const selectedTestCases = await this.collectDatasetTestCases(
|
|
1727
|
-
const
|
|
1728
|
-
|
|
1729
|
-
|
|
1730
|
-
)
|
|
1731
|
-
const triggerId = request.triggerId ?? `trg-${randomUUID()}`;
|
|
2109
|
+
const selectedTestCases = await this.collectDatasetTestCases(params.datasetId);
|
|
2110
|
+
const repetitions = normalizeRunRepetitions(params.repetitions);
|
|
2111
|
+
const totalEvaluations = selectedTestCases.length * repetitions;
|
|
2112
|
+
const runConfigTags = [...params.runConfigTags ?? []];
|
|
2113
|
+
const triggerId = params.triggerId ?? `trg-${randomUUID()}`;
|
|
1732
2114
|
const runId = `run-${randomUUID()}`;
|
|
1733
|
-
const artifactPath = createArtifactPath(
|
|
1734
|
-
this.config.artifactDirectory,
|
|
1735
|
-
request.datasetId,
|
|
1736
|
-
runId
|
|
1737
|
-
);
|
|
2115
|
+
const artifactPath = createArtifactPath(this.config.artifactDirectory, params.datasetId, runId);
|
|
1738
2116
|
const snapshot = {
|
|
1739
2117
|
runId,
|
|
1740
|
-
datasetId:
|
|
1741
|
-
datasetName: dataset.dataset.
|
|
2118
|
+
datasetId: params.datasetId,
|
|
2119
|
+
datasetName: dataset.dataset.getDisplayLabel(),
|
|
1742
2120
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
1743
2121
|
queuedAt: Date.now(),
|
|
1744
2122
|
totalTestCases: totalEvaluations,
|
|
@@ -1758,8 +2136,8 @@ var EffectRunner = class {
|
|
|
1758
2136
|
const queuedEvent = {
|
|
1759
2137
|
type: "RunQueued",
|
|
1760
2138
|
runId,
|
|
1761
|
-
datasetId:
|
|
1762
|
-
datasetName: dataset.dataset.
|
|
2139
|
+
datasetId: params.datasetId,
|
|
2140
|
+
datasetName: dataset.dataset.getDisplayLabel(),
|
|
1763
2141
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
1764
2142
|
totalTestCases: totalEvaluations,
|
|
1765
2143
|
artifactPath
|
|
@@ -1772,17 +2150,20 @@ var EffectRunner = class {
|
|
|
1772
2150
|
payload: queuedEvent
|
|
1773
2151
|
})
|
|
1774
2152
|
);
|
|
1775
|
-
const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
|
|
1776
2153
|
await Effect.runPromise(
|
|
1777
2154
|
Queue.offer(this.runQueue, {
|
|
1778
2155
|
runId,
|
|
1779
2156
|
triggerId,
|
|
1780
|
-
datasetId:
|
|
2157
|
+
datasetId: params.datasetId,
|
|
1781
2158
|
dataset: dataset.dataset,
|
|
1782
2159
|
evaluators: selectedEvaluators,
|
|
1783
2160
|
testCases: selectedTestCases,
|
|
1784
2161
|
snapshot,
|
|
1785
|
-
maxConcurrency
|
|
2162
|
+
maxConcurrency: params.maxConcurrency,
|
|
2163
|
+
globalEvaluationSemaphore: params.globalEvaluationSemaphore,
|
|
2164
|
+
runConfigName: params.runConfigName,
|
|
2165
|
+
runConfigTags,
|
|
2166
|
+
repetitions
|
|
1786
2167
|
})
|
|
1787
2168
|
);
|
|
1788
2169
|
return snapshot;
|
|
@@ -1854,6 +2235,11 @@ var EffectRunner = class {
|
|
|
1854
2235
|
}
|
|
1855
2236
|
};
|
|
1856
2237
|
|
|
1857
|
-
|
|
2238
|
+
// src/runner/events.ts
|
|
2239
|
+
var PROGRAMMATIC_RUN_CONFIG = {
|
|
2240
|
+
runConfigName: "programmatic"
|
|
2241
|
+
};
|
|
2242
|
+
|
|
2243
|
+
export { Dataset, DatasetNameSchema, Evaluator, EvaluatorNameSchema, Metric, PROGRAMMATIC_RUN_CONFIG, RunConfig, RunConfigNameSchema, Score, TagSet, TestCase, TestCaseNameSchema, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, deltaScore, formatScoreData, getDatasetDisplayLabel, getEvaluatorDisplayLabel, getEvaluatorTagList, getLogLines, getMetricById, getScoreById, getTestCaseDisplayLabel, getTestCaseTagList, latencyMetric, loadMockData, loadRunnerData, normalizeOptionalDisplayName, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, validateDatasetName, validateEvaluatorName, validateRunConfigName, validateTestCaseName, withRunnerConfig };
|
|
1858
2244
|
//# sourceMappingURL=out.js.map
|
|
1859
2245
|
//# sourceMappingURL=index.js.map
|