@m4trix/evals 0.25.1 → 0.27.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +32 -9
- package/dist/cli-simple.cjs +845 -455
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +846 -456
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +543 -273
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +543 -273
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +948 -545
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +228 -14
- package/dist/index.js +933 -547
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.cjs
CHANGED
|
@@ -4,10 +4,10 @@ var effect = require('effect');
|
|
|
4
4
|
var diff = require('diff');
|
|
5
5
|
var stringify = require('fast-json-stable-stringify');
|
|
6
6
|
var crypto = require('crypto');
|
|
7
|
-
var
|
|
7
|
+
var promises = require('fs/promises');
|
|
8
8
|
var path = require('path');
|
|
9
|
+
var fs = require('fs');
|
|
9
10
|
var jitiModule = require('jiti');
|
|
10
|
-
var promises = require('fs/promises');
|
|
11
11
|
var url = require('url');
|
|
12
12
|
|
|
13
13
|
var _documentCurrentScript = typeof document !== 'undefined' ? document.currentScript : null;
|
|
@@ -34,6 +34,249 @@ function _interopNamespace(e) {
|
|
|
34
34
|
var stringify__default = /*#__PURE__*/_interopDefault(stringify);
|
|
35
35
|
var jitiModule__namespace = /*#__PURE__*/_interopNamespace(jitiModule);
|
|
36
36
|
|
|
37
|
+
// src/index.ts
|
|
38
|
+
var ENTITY_ID_PATTERN = /^[a-zA-Z0-9_-]+$/;
|
|
39
|
+
/**
 * Builds the string schema used for canonical entity ids.
 *
 * The returned schema is `Schema.String` piped through, in order: a trimmed
 * check, a minimum length of 1, the `ENTITY_ID_PATTERN` character check, and
 * a brand.
 *
 * @param brand - Brand name applied to the resulting schema.
 * @param label - Human-readable entity label interpolated into the messages.
 * @returns The branded string schema.
 */
function makeEntityIdSchema(brand, label) {
  const { Schema } = effect;
  const trimmedCheck = Schema.trimmed();
  const nonEmptyCheck = Schema.minLength(1, {
    message: () => `${label} must be non-empty.`
  });
  const charsetCheck = Schema.pattern(ENTITY_ID_PATTERN, {
    message: () => `${label} may only contain letters, digits, underscores, and hyphens (no spaces). Examples: "my-nightly", "my_nightly", "myNightly".`
  });
  const brandStep = Schema.brand(brand);
  return Schema.String.pipe(trimmedCheck, nonEmptyCheck, charsetCheck, brandStep);
}
|
|
51
|
+
var RunConfigNameSchema = makeEntityIdSchema("RunConfigName", "RunConfig name");
|
|
52
|
+
var EvaluatorNameSchema = makeEntityIdSchema("EvaluatorName", "Evaluator name");
|
|
53
|
+
var TestCaseNameSchema = makeEntityIdSchema("TestCaseName", "Test case name");
|
|
54
|
+
var DatasetNameSchema = makeEntityIdSchema("DatasetName", "Dataset name");
|
|
55
|
+
/**
 * Decodes `raw` with `schema` and returns the decoded value, or throws an
 * `Error` whose message is `context` followed by the formatted parse error.
 *
 * String input is trimmed before decoding. Non-string input (for example a
 * missing `name`) is passed to the schema unchanged so that the failure is
 * reported through the same context-prefixed error path instead of a bare
 * `TypeError` from calling `.trim()` on a non-string.
 * NOTE(review): this relies on the schema itself rejecting non-string input;
 * the schemas built by `makeEntityIdSchema` start from `Schema.String`.
 *
 * @param schema - Schema used to decode the value.
 * @param raw - Candidate value, normally a string.
 * @param context - Prefix for the error message (e.g. `"Dataset.define"`).
 * @returns The decoded value (`result.right`).
 * @throws {Error} When decoding fails.
 */
function validateWithSchema(schema, raw, context) {
  const candidate = typeof raw === "string" ? raw.trim() : raw;
  const decode = effect.Schema.decodeUnknownEither(schema);
  const result = decode(candidate);
  if (effect.Either.isLeft(result)) {
    throw new Error(`${context}: ${effect.ParseResult.TreeFormatter.formatErrorSync(result.left)}`);
  }
  return result.right;
}
|
|
66
|
+
/**
 * Validates a RunConfig name against `RunConfigNameSchema`.
 *
 * @param raw - Candidate name.
 * @param context - Prefix used in the error message on failure.
 * @returns Whatever `validateWithSchema` returns for this schema.
 */
function validateRunConfigName(raw, context) {
  const validated = validateWithSchema(RunConfigNameSchema, raw, context);
  return validated;
}
|
|
69
|
+
/**
 * Validates an evaluator name against `EvaluatorNameSchema`.
 *
 * @param raw - Candidate name.
 * @param context - Prefix used in the error message on failure.
 * @returns Whatever `validateWithSchema` returns for this schema.
 */
function validateEvaluatorName(raw, context) {
  const validated = validateWithSchema(EvaluatorNameSchema, raw, context);
  return validated;
}
|
|
72
|
+
/**
 * Validates a test case name against `TestCaseNameSchema`.
 *
 * @param raw - Candidate name.
 * @param context - Prefix used in the error message on failure.
 * @returns Whatever `validateWithSchema` returns for this schema.
 */
function validateTestCaseName(raw, context) {
  const validated = validateWithSchema(TestCaseNameSchema, raw, context);
  return validated;
}
|
|
75
|
+
/**
 * Validates a dataset name against `DatasetNameSchema`.
 *
 * @param raw - Candidate name.
 * @param context - Prefix used in the error message on failure.
 * @returns Whatever `validateWithSchema` returns for this schema.
 */
function validateDatasetName(raw, context) {
  const validated = validateWithSchema(DatasetNameSchema, raw, context);
  return validated;
}
|
|
78
|
+
/**
 * Normalizes an optional display name.
 *
 * @param raw - Display name, or `undefined`/`null` when none was given.
 * @returns The trimmed name, or `undefined` when the input is absent or
 *   contains only whitespace.
 */
function normalizeOptionalDisplayName(raw) {
  // `== null` covers both undefined and null; null previously crashed on `.trim()`.
  if (raw == null) {
    return void 0;
  }
  const trimmed = raw.trim();
  return trimmed.length === 0 ? void 0 : trimmed;
}
|
|
85
|
+
|
|
86
|
+
// src/evals/dataset.ts
|
|
87
|
+
/**
 * Reports whether `value` matches at least one matcher.
 *
 * String matchers are compared with strict equality; any other matcher is
 * called through its `test` method (RegExp-style).
 *
 * @param value - String to check.
 * @param matchers - Array of strings and/or RegExp-like objects.
 * @returns `true` if any matcher accepts `value`.
 */
function matchesAny(value, matchers) {
  return matchers.some((matcher) => {
    if (typeof matcher === "string") {
      return value === matcher;
    }
    // Global/sticky regexes remember `lastIndex` between `test` calls, which
    // would make repeated checks against the same matcher alternate results.
    if (matcher.global || matcher.sticky) {
      matcher.lastIndex = 0;
    }
    return matcher.test(value);
  });
}
|
|
92
|
+
/**
 * Reports whether `filePath` matches at least one path matcher.
 *
 * String matchers are treated as glob patterns and delegated to
 * `simpleGlobMatch`; any other matcher is called through its `test` method
 * (RegExp-style).
 *
 * @param filePath - Path to check.
 * @param matchers - Array of glob strings and/or RegExp-like objects.
 * @returns `true` if any matcher accepts `filePath`.
 */
function matchesAnyPath(filePath, matchers) {
  return matchers.some((matcher) => {
    if (typeof matcher === "string") {
      return simpleGlobMatch(matcher, filePath);
    }
    // Global/sticky regexes remember `lastIndex` between `test` calls, which
    // would make repeated checks against the same matcher alternate results.
    if (matcher.global || matcher.sticky) {
      matcher.lastIndex = 0;
    }
    return matcher.test(filePath);
  });
}
|
|
100
|
+
/**
 * Matches `value` against a minimal glob `pattern`, anchored at both ends.
 *
 * Supported tokens:
 * - a double star followed by a slash: zero or more leading directory segments
 * - a double star: any characters, including `/`
 * - a single star: any run of characters except `/`
 * - `?`: exactly one character except `/`
 * Every other regex metacharacter is matched literally.
 *
 * The pattern is translated in a single pass. Chaining separate replaces
 * would let the single-star rule rewrite the `*` inside regex fragments
 * already emitted for double-star tokens, which stops them from spanning
 * more than one path segment.
 *
 * @param pattern - Glob pattern.
 * @param value - String (typically a file path) to test.
 * @returns `true` when the whole of `value` matches the pattern.
 */
function simpleGlobMatch(pattern, value) {
  const source = pattern.replace(/\*\*\/|\*\*|\*|\?|[.+^${}()|[\]\\]/g, (token) => {
    switch (token) {
      case "**/":
        return "(?:.*/)?";
      case "**":
        return ".*";
      case "*":
        return "[^/]*";
      case "?":
        return "[^/]";
      default:
        // Regex metacharacter that must be matched literally.
        return `\\${token}`;
    }
  });
  return new RegExp(`^${source}$`).test(value);
}
|
|
104
|
+
/**
 * A named selection of test cases, expressed as included/excluded tag and
 * path matchers. Instances are normally created with `Dataset.define`.
 */
var Dataset = class _Dataset {
  /** Stores the already-normalized configuration object as-is. */
  constructor(config) {
    this._config = config;
  }
  /**
   * Validates the name, normalizes the display name, and defaults every
   * missing matcher list to an empty array.
   */
  static define(config) {
    const name = validateDatasetName(config.name, "Dataset.define");
    const displayName = normalizeOptionalDisplayName(config.displayName);
    const includedTags = config.includedTags ?? [];
    const excludedTags = config.excludedTags ?? [];
    const includedPaths = config.includedPaths ?? [];
    const excludedPaths = config.excludedPaths ?? [];
    return new _Dataset({
      name,
      displayName,
      includedTags,
      excludedTags,
      includedPaths,
      excludedPaths
    });
  }
  /** Canonical dataset id (same rules as `RunConfig` / `TestCase` `name`). */
  getName() {
    const { name } = this._config;
    return name;
  }
  /** Optional display name; `undefined` when none is configured. */
  getDisplayName() {
    const { displayName } = this._config;
    return displayName;
  }
  /** Label for CLI/TUI and evaluator `meta.datasetName`: {@link getDisplayName} if set, otherwise {@link getName}. */
  getDisplayLabel() {
    const { displayName, name } = this._config;
    return displayName ?? name;
  }
  /** Tag matchers a test case must satisfy (empty means no tag restriction). */
  getIncludedTags() {
    const { includedTags } = this._config;
    return includedTags;
  }
  /** Tag matchers that exclude a test case. */
  getExcludedTags() {
    const { excludedTags } = this._config;
    return excludedTags;
  }
  /** Path matchers a file must satisfy (empty means no path restriction). */
  getIncludedPaths() {
    const { includedPaths } = this._config;
    return includedPaths;
  }
  /** Path matchers that exclude a file. */
  getExcludedPaths() {
    const { excludedPaths } = this._config;
    return excludedPaths;
  }
  /**
   * Decides whether a test case defined in `filePath` belongs to this dataset.
   * Exclusions are checked first (tags, then paths); afterwards the test case
   * must satisfy both the included-tag and the included-path constraints,
   * where an empty list counts as satisfied.
   */
  matchesTestCase(testCase, filePath) {
    const tags = testCase.getTags();
    const { includedTags, excludedTags, includedPaths, excludedPaths } = this._config;
    const anyTagMatches = (matchers) => tags.some((tag) => matchesAny(tag, matchers));
    if (excludedTags.length > 0 && anyTagMatches(excludedTags)) {
      return false;
    }
    if (excludedPaths.length > 0 && matchesAnyPath(filePath, excludedPaths)) {
      return false;
    }
    const tagMatch = includedTags.length === 0 || anyTagMatches(includedTags);
    const pathMatch = includedPaths.length === 0 || matchesAnyPath(filePath, includedPaths);
    return tagMatch && pathMatch;
  }
};
|
|
160
|
+
/**
 * Returns a display label for a dataset-like object.
 *
 * Prefers `getDisplayLabel()` when that method exists, falls back to
 * `getName()`, and finally to the empty string when neither is a function.
 *
 * @param dataset - Object that may expose `getDisplayLabel` and/or `getName`.
 * @returns The label string.
 */
function getDatasetDisplayLabel(dataset) {
  if (typeof dataset.getDisplayLabel === "function") {
    return dataset.getDisplayLabel();
  }
  if (typeof dataset.getName !== "function") {
    return "";
  }
  return dataset.getName();
}
|
|
166
|
+
|
|
167
|
+
// src/evals/evaluator.ts
|
|
168
|
+
/**
 * Immutable-style evaluator builder: every builder method (`use`, `define`,
 * `evaluate`) returns a new instance rather than modifying the receiver.
 */
var Evaluator = class _Evaluator {
  /** Stores the configuration object as-is. */
  constructor(config) {
    this._config = config;
  }
  /** Returns a shallow snapshot of the fields that builder methods carry over. */
  getState() {
    const {
      name,
      displayName,
      tags,
      inputSchema,
      outputSchema,
      scoreSchema,
      middlewares,
      evaluateFn,
      passThreshold,
      passCriterion
    } = this._config;
    return {
      name,
      displayName,
      tags,
      inputSchema,
      outputSchema,
      scoreSchema,
      middlewares,
      evaluateFn,
      passThreshold,
      passCriterion
    };
  }
  /** Starts a new chain holding a single middleware and no tags. */
  static use(middleware) {
    const initial = { middlewares: [middleware], tags: [] };
    return new _Evaluator(initial);
  }
  /** Returns a copy of this evaluator with `middleware` appended. */
  use(middleware) {
    const snapshot = this.getState();
    const middlewares = [...snapshot.middlewares, middleware];
    return new _Evaluator({ ...snapshot, middlewares });
  }
  /**
   * Returns a new evaluator built from `config`, keeping only the middlewares
   * of the current chain. The name is validated, the display name normalized,
   * and the tags copied (defaulting to an empty array).
   */
  define(config) {
    const { middlewares } = this.getState();
    const name = validateEvaluatorName(config.name, "Evaluator.define");
    const displayName = normalizeOptionalDisplayName(config.displayName);
    const tags = config.tags === void 0 ? [] : [...config.tags];
    const { inputSchema, outputSchema, scoreSchema, passThreshold, passCriterion } = config;
    return new _Evaluator({
      name,
      displayName,
      tags,
      inputSchema,
      outputSchema,
      scoreSchema,
      middlewares,
      passThreshold,
      passCriterion
    });
  }
  /** Returns a copy of this evaluator with `fn` set as the evaluate function. */
  evaluate(fn) {
    const snapshot = this.getState();
    return new _Evaluator({ ...snapshot, evaluateFn: fn });
  }
  /** Canonical evaluator id when defined; otherwise undefined (middleware-only chain). */
  getName() {
    const { name } = this._config;
    return name;
  }
  /** Optional display name; `undefined` when none is configured. */
  getDisplayName() {
    const { displayName } = this._config;
    return displayName;
  }
  /** Label for CLI/TUI: {@link getDisplayName} if set, otherwise {@link getName}. Undefined if not yet defined. */
  getDisplayLabel() {
    const { name, displayName } = this._config;
    return name === void 0 ? void 0 : displayName ?? name;
  }
  /** Tags from `Evaluator.define({ tags })`; empty until defined. Returns a fresh copy on every call. */
  getTags() {
    const { tags } = this._config;
    return [...tags];
  }
  getInputSchema() {
    const { inputSchema } = this._config;
    return inputSchema;
  }
  getOutputSchema() {
    const { outputSchema } = this._config;
    return outputSchema;
  }
  getScoreSchema() {
    const { scoreSchema } = this._config;
    return scoreSchema;
  }
  getMiddlewares() {
    const { middlewares } = this._config;
    return middlewares;
  }
  getEvaluateFn() {
    const { evaluateFn } = this._config;
    return evaluateFn;
  }
  getPassThreshold() {
    const { passThreshold } = this._config;
    return passThreshold;
  }
  getPassCriterion() {
    const { passCriterion } = this._config;
    return passCriterion;
  }
  /**
   * Calls `resolve()` on every middleware, awaits all results together, and
   * merges them into one object; later middlewares overwrite earlier keys.
   */
  async resolveContext() {
    const pending = this._config.middlewares.map((mw) => mw.resolve());
    const resolved = await Promise.all(pending);
    return Object.assign({}, ...resolved);
  }
};
|
|
267
|
+
/**
 * Returns a display label for an evaluator-like object.
 *
 * Uses `getDisplayLabel()` when that method exists and yields something other
 * than `undefined`; otherwise falls back to `getName()`, or `undefined` when
 * `getName` is not a function either.
 *
 * @param evaluator - Object that may expose `getDisplayLabel` and/or `getName`.
 * @returns The label, or `undefined` when none can be determined.
 */
function getEvaluatorDisplayLabel(evaluator) {
  const preferred = typeof evaluator.getDisplayLabel === "function" ? evaluator.getDisplayLabel() : void 0;
  if (preferred !== void 0) {
    return preferred;
  }
  if (typeof evaluator.getName !== "function") {
    return void 0;
  }
  return evaluator.getName();
}
|
|
276
|
+
/**
 * Returns a fresh array with the evaluator's tags.
 *
 * @param evaluator - Object that may expose a `getTags` method.
 * @returns A copy of `getTags()`'s result, or an empty array when `getTags`
 *   is not a function.
 */
function getEvaluatorTagList(evaluator) {
  if (typeof evaluator.getTags !== "function") {
    return [];
  }
  const tags = evaluator.getTags();
  return [...tags];
}
|
|
279
|
+
|
|
37
280
|
// src/cli/data.mock.json
|
|
38
281
|
var data_mock_default = {
|
|
39
282
|
datasets: [
|
|
@@ -280,7 +523,7 @@ function toEvalDataset(item, snapshots) {
|
|
|
280
523
|
const runs = snapshots.filter((snapshot) => snapshot.datasetId === item.id).sort((a, b) => b.queuedAt - a.queuedAt).map(toEvalRun);
|
|
281
524
|
return {
|
|
282
525
|
id: item.id,
|
|
283
|
-
name: item.dataset
|
|
526
|
+
name: getDatasetDisplayLabel(item.dataset),
|
|
284
527
|
overview: `Discovered from ${item.filePath}`,
|
|
285
528
|
runs
|
|
286
529
|
};
|
|
@@ -288,7 +531,7 @@ function toEvalDataset(item, snapshots) {
|
|
|
288
531
|
function toEvaluatorOption(item) {
|
|
289
532
|
return {
|
|
290
533
|
id: item.id,
|
|
291
|
-
name: item.evaluator
|
|
534
|
+
name: getEvaluatorDisplayLabel(item.evaluator) ?? toSlug(item.id),
|
|
292
535
|
configPreview: `Source: ${item.filePath}`
|
|
293
536
|
};
|
|
294
537
|
}
|
|
@@ -333,196 +576,149 @@ function parseStartupArgs(argv) {
|
|
|
333
576
|
}
|
|
334
577
|
return args;
|
|
335
578
|
}
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
this._config = config;
|
|
579
|
+
function preprocessForDiff(value, options) {
|
|
580
|
+
if (options?.sort && Array.isArray(value)) {
|
|
581
|
+
return [...value].sort((a, b) => {
|
|
582
|
+
const aStr = stringify__default.default(preprocessForDiff(a, options));
|
|
583
|
+
const bStr = stringify__default.default(preprocessForDiff(b, options));
|
|
584
|
+
return aStr.localeCompare(bStr);
|
|
585
|
+
}).map((item) => preprocessForDiff(item, options));
|
|
344
586
|
}
|
|
345
|
-
|
|
346
|
-
const
|
|
347
|
-
|
|
348
|
-
|
|
587
|
+
if (value !== null && typeof value === "object" && !Array.isArray(value) && options?.excludeKeys) {
|
|
588
|
+
const keys = Array.isArray(options.excludeKeys) ? options.excludeKeys : options.excludeKeys.split(",").map((k) => k.trim());
|
|
589
|
+
const filtered = {};
|
|
590
|
+
for (const [k, v] of Object.entries(value)) {
|
|
591
|
+
if (!keys.includes(k)) {
|
|
592
|
+
filtered[k] = preprocessForDiff(v, options);
|
|
593
|
+
}
|
|
349
594
|
}
|
|
350
|
-
return
|
|
351
|
-
name: config.name,
|
|
352
|
-
tags: config.tags,
|
|
353
|
-
reruns,
|
|
354
|
-
inputSchema: config.inputSchema,
|
|
355
|
-
input: config.input,
|
|
356
|
-
outputSchema: config.outputSchema,
|
|
357
|
-
output: config.output
|
|
358
|
-
});
|
|
595
|
+
return filtered;
|
|
359
596
|
}
|
|
360
|
-
|
|
361
|
-
|
|
597
|
+
if (value !== null && typeof value === "object" && !Array.isArray(value)) {
|
|
598
|
+
const result = {};
|
|
599
|
+
for (const [k, v] of Object.entries(value)) {
|
|
600
|
+
result[k] = preprocessForDiff(v, options);
|
|
601
|
+
}
|
|
602
|
+
return result;
|
|
362
603
|
}
|
|
363
|
-
|
|
364
|
-
return
|
|
604
|
+
if (typeof value === "number" && options?.precision !== void 0) {
|
|
605
|
+
return Number(value.toFixed(options.precision));
|
|
365
606
|
}
|
|
366
|
-
|
|
367
|
-
|
|
607
|
+
return value;
|
|
608
|
+
}
|
|
609
|
+
function toPrettyJson(value) {
|
|
610
|
+
const str = stringify__default.default(value);
|
|
611
|
+
try {
|
|
612
|
+
const parsed = JSON.parse(str);
|
|
613
|
+
return JSON.stringify(parsed, null, 2);
|
|
614
|
+
} catch {
|
|
615
|
+
return str;
|
|
368
616
|
}
|
|
369
|
-
|
|
370
|
-
|
|
617
|
+
}
|
|
618
|
+
function formatDiffParts(parts) {
|
|
619
|
+
const lines = [];
|
|
620
|
+
for (const part of parts) {
|
|
621
|
+
const prefix = part.added ? "+ " : part.removed ? "- " : "";
|
|
622
|
+
const partLines = part.value.split("\n");
|
|
623
|
+
for (let i = 0; i < partLines.length; i++) {
|
|
624
|
+
const line = partLines[i];
|
|
625
|
+
if (i === partLines.length - 1 && line === "")
|
|
626
|
+
continue;
|
|
627
|
+
lines.push(prefix + line);
|
|
628
|
+
}
|
|
371
629
|
}
|
|
372
|
-
|
|
373
|
-
|
|
630
|
+
return lines.join("\n");
|
|
631
|
+
}
|
|
632
|
+
function createDiffString(expected, actual, diffOptions) {
|
|
633
|
+
const expectedProcessed = preprocessForDiff(expected, diffOptions);
|
|
634
|
+
const actualProcessed = preprocessForDiff(actual, diffOptions);
|
|
635
|
+
if (diffOptions?.keysOnly) {
|
|
636
|
+
const expectedKeys = JSON.stringify(extractKeys(expectedProcessed), null, 2);
|
|
637
|
+
const actualKeys = JSON.stringify(extractKeys(actualProcessed), null, 2);
|
|
638
|
+
const parts2 = diff.diffLines(expectedKeys, actualKeys);
|
|
639
|
+
return formatDiffParts(parts2);
|
|
374
640
|
}
|
|
375
|
-
|
|
376
|
-
|
|
641
|
+
const expectedStr = toPrettyJson(expectedProcessed);
|
|
642
|
+
const actualStr = toPrettyJson(actualProcessed);
|
|
643
|
+
if (expectedStr === actualStr) {
|
|
644
|
+
return "";
|
|
377
645
|
}
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
return resolve(this._config.output);
|
|
383
|
-
}
|
|
384
|
-
};
|
|
385
|
-
|
|
386
|
-
// src/evals/evaluator.ts
|
|
387
|
-
var Evaluator = class _Evaluator {
|
|
388
|
-
constructor(config) {
|
|
389
|
-
this._config = config;
|
|
390
|
-
}
|
|
391
|
-
getState() {
|
|
392
|
-
return {
|
|
393
|
-
name: this._config.name,
|
|
394
|
-
inputSchema: this._config.inputSchema,
|
|
395
|
-
outputSchema: this._config.outputSchema,
|
|
396
|
-
scoreSchema: this._config.scoreSchema,
|
|
397
|
-
middlewares: this._config.middlewares,
|
|
398
|
-
evaluateFn: this._config.evaluateFn,
|
|
399
|
-
passThreshold: this._config.passThreshold,
|
|
400
|
-
passCriterion: this._config.passCriterion
|
|
401
|
-
};
|
|
402
|
-
}
|
|
403
|
-
static use(middleware) {
|
|
404
|
-
return new _Evaluator({
|
|
405
|
-
middlewares: [middleware]
|
|
406
|
-
});
|
|
407
|
-
}
|
|
408
|
-
use(middleware) {
|
|
409
|
-
const state = this.getState();
|
|
410
|
-
return new _Evaluator({
|
|
411
|
-
...state,
|
|
412
|
-
middlewares: [...state.middlewares, middleware]
|
|
413
|
-
});
|
|
414
|
-
}
|
|
415
|
-
define(config) {
|
|
416
|
-
const { middlewares } = this.getState();
|
|
417
|
-
return new _Evaluator({
|
|
418
|
-
name: config.name,
|
|
419
|
-
inputSchema: config.inputSchema,
|
|
420
|
-
outputSchema: config.outputSchema,
|
|
421
|
-
scoreSchema: config.scoreSchema,
|
|
422
|
-
middlewares,
|
|
423
|
-
passThreshold: config.passThreshold,
|
|
424
|
-
passCriterion: config.passCriterion
|
|
425
|
-
});
|
|
426
|
-
}
|
|
427
|
-
evaluate(fn) {
|
|
428
|
-
return new _Evaluator({
|
|
429
|
-
...this.getState(),
|
|
430
|
-
evaluateFn: fn
|
|
431
|
-
});
|
|
432
|
-
}
|
|
433
|
-
getName() {
|
|
434
|
-
return this._config.name;
|
|
435
|
-
}
|
|
436
|
-
getInputSchema() {
|
|
437
|
-
return this._config.inputSchema;
|
|
438
|
-
}
|
|
439
|
-
getOutputSchema() {
|
|
440
|
-
return this._config.outputSchema;
|
|
441
|
-
}
|
|
442
|
-
getScoreSchema() {
|
|
443
|
-
return this._config.scoreSchema;
|
|
444
|
-
}
|
|
445
|
-
getMiddlewares() {
|
|
446
|
-
return this._config.middlewares;
|
|
447
|
-
}
|
|
448
|
-
getEvaluateFn() {
|
|
449
|
-
return this._config.evaluateFn;
|
|
646
|
+
const parts = diff.diffLines(expectedStr, actualStr);
|
|
647
|
+
if (diffOptions?.outputNewOnly) {
|
|
648
|
+
const filtered = parts.filter((p) => p.added === true);
|
|
649
|
+
return formatDiffParts(filtered);
|
|
450
650
|
}
|
|
451
|
-
|
|
452
|
-
|
|
651
|
+
return formatDiffParts(parts);
|
|
652
|
+
}
|
|
653
|
+
function extractKeys(value) {
|
|
654
|
+
if (value === null || typeof value !== "object") {
|
|
655
|
+
return "\xB7";
|
|
453
656
|
}
|
|
454
|
-
|
|
455
|
-
return
|
|
657
|
+
if (Array.isArray(value)) {
|
|
658
|
+
return value.map(extractKeys);
|
|
456
659
|
}
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
660
|
+
const result = {};
|
|
661
|
+
for (const [k, v] of Object.entries(value)) {
|
|
662
|
+
result[k] = extractKeys(v);
|
|
460
663
|
}
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
// src/evals/dataset.ts
|
|
464
|
-
function matchesAny(value, matchers) {
|
|
465
|
-
return matchers.some(
|
|
466
|
-
(matcher) => typeof matcher === "string" ? value === matcher : matcher.test(value)
|
|
467
|
-
);
|
|
664
|
+
return result;
|
|
468
665
|
}
|
|
469
|
-
function
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
666
|
+
function formatLogMessage(msg) {
|
|
667
|
+
if (typeof msg === "string")
|
|
668
|
+
return msg;
|
|
669
|
+
if (msg instanceof Error)
|
|
670
|
+
return msg.stack ?? msg.message;
|
|
671
|
+
try {
|
|
672
|
+
if (msg !== null && typeof msg === "object") {
|
|
673
|
+
return JSON.stringify(msg, null, 2);
|
|
473
674
|
}
|
|
474
|
-
return
|
|
475
|
-
}
|
|
675
|
+
return String(msg);
|
|
676
|
+
} catch {
|
|
677
|
+
return String(msg);
|
|
678
|
+
}
|
|
476
679
|
}
|
|
477
|
-
function
|
|
478
|
-
|
|
479
|
-
|
|
680
|
+
/**
 * Builds a `"log"` entry.
 *
 * @param message - Value to log; converted to text by `formatLogMessage`.
 * @param options - Optional settings; only `label` is read.
 * @returns An object with `type: "log"`, the label, and the formatted message.
 */
function createLogEntry(message, options) {
  const label = options?.label;
  const text = formatLogMessage(message);
  return { type: "log", label, message: text };
}
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
}
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
}
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
getIncludedPaths() {
|
|
504
|
-
return this._config.includedPaths;
|
|
505
|
-
}
|
|
506
|
-
getExcludedPaths() {
|
|
507
|
-
return this._config.excludedPaths;
|
|
508
|
-
}
|
|
509
|
-
matchesTestCase(testCase, filePath) {
|
|
510
|
-
const tags = testCase.getTags();
|
|
511
|
-
if (this._config.excludedTags.length > 0) {
|
|
512
|
-
if (tags.some((tag) => matchesAny(tag, this._config.excludedTags))) {
|
|
513
|
-
return false;
|
|
687
|
+
/**
 * Splits a log entry's message into individual lines.
 *
 * @param entry - Object with a string `message` property.
 * @returns The message split on `"\n"`.
 */
function getLogLines(entry) {
  const { message } = entry;
  return message.split("\n");
}
|
|
690
|
+
/**
 * Builds a `"diff"` entry comparing `expected` with `actual`.
 *
 * `label` is taken out of `options`; every remaining option is forwarded to
 * `createDiffString`. When that yields an empty (falsy) string, the entry's
 * `diff` is the placeholder `"(no differences)"`.
 *
 * @param expected - Expected value, stored on the entry unchanged.
 * @param actual - Actual value, stored on the entry unchanged.
 * @param options - Optional `label` plus diff options.
 * @returns An object with `type`, `label`, `expected`, `actual`, and `diff`.
 */
function createDiffLogEntry(expected, actual, options) {
  const { label, ...diffOptions } = options ?? {};
  const rendered = createDiffString(expected, actual, diffOptions);
  return {
    type: "diff",
    label,
    expected,
    actual,
    diff: rendered || "(no differences)"
  };
}
|
|
701
|
+
function printJsonDiff(expected, actual, options = {}) {
|
|
702
|
+
const { color = true, ...diffOpts } = options;
|
|
703
|
+
const diff = createDiffString(expected, actual, diffOpts);
|
|
704
|
+
if (color) {
|
|
705
|
+
const lines = diff.split("\n").map((line) => {
|
|
706
|
+
const trimmed = line.trimStart();
|
|
707
|
+
if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
|
|
708
|
+
return `\x1B[31m${line}\x1B[0m`;
|
|
514
709
|
}
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
if (matchesAnyPath(filePath, this._config.excludedPaths)) {
|
|
518
|
-
return false;
|
|
710
|
+
if (trimmed.startsWith("+") && !trimmed.startsWith("+++")) {
|
|
711
|
+
return `\x1B[32m${line}\x1B[0m`;
|
|
519
712
|
}
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
const
|
|
523
|
-
|
|
713
|
+
return line;
|
|
714
|
+
});
|
|
715
|
+
const colored = lines.join("\n");
|
|
716
|
+
console.log(colored || "(no differences)");
|
|
717
|
+
return colored;
|
|
524
718
|
}
|
|
525
|
-
|
|
719
|
+
console.log(diff || "(no differences)");
|
|
720
|
+
return diff;
|
|
721
|
+
}
|
|
526
722
|
|
|
527
723
|
// src/evals/metric.ts
|
|
528
724
|
var registry = /* @__PURE__ */ new Map();
|
|
@@ -547,6 +743,113 @@ function getMetricById(id) {
|
|
|
547
743
|
return registry.get(id);
|
|
548
744
|
}
|
|
549
745
|
|
|
746
|
+
// src/evals/aggregators.ts
|
|
747
|
+
/**
 * Sums token counts field by field.
 *
 * @param values - Array of objects with optional `input`, `output`,
 *   `inputCached`, and `outputCached` fields; a missing (nullish) field
 *   counts as 0.
 * @returns An object holding the four totals (all 0 for an empty array).
 */
function aggregateTokenCountSum(values) {
  const totals = {
    input: 0,
    output: 0,
    inputCached: 0,
    outputCached: 0
  };
  for (const entry of values) {
    totals.input += entry.input ?? 0;
    totals.output += entry.output ?? 0;
    totals.inputCached += entry.inputCached ?? 0;
    totals.outputCached += entry.outputCached ?? 0;
  }
  return totals;
}
|
|
764
|
+
/**
 * Averages the `ms` field of the given latency values.
 *
 * @param values - Array of objects with a numeric `ms` field.
 * @returns `{ ms }` holding the arithmetic mean, or `{ ms: 0 }` for an empty
 *   array.
 */
function aggregateLatencyAverage(values) {
  const count = values.length;
  if (count === 0) {
    return { ms: 0 };
  }
  let total = 0;
  for (const entry of values) {
    total = total + entry.ms;
  }
  return { ms: total / count };
}
|
|
771
|
+
|
|
772
|
+
// src/evals/metrics/standard.ts
|
|
773
|
+
// Token-count metric: aggregated with `aggregateTokenCountSum` and rendered as
// `in:<n> out:<n> cached:<n>`, where cached is inputCached + outputCached and
// missing fields count as 0. Aggregated values get a `Total: ` prefix.
var tokenCountMetric = Metric.of({
  id: "token-count",
  name: "Tokens",
  aggregate: aggregateTokenCountSum,
  format: (data, options) => {
    const tokensIn = data.input ?? 0;
    const tokensOut = data.output ?? 0;
    const tokensCached = (data.inputCached ?? 0) + (data.outputCached ?? 0);
    const summary = `in:${tokensIn} out:${tokensOut} cached:${tokensCached}`;
    if (options?.isAggregated) {
      return `Total: ${summary}`;
    }
    return summary;
  }
});
|
|
787
|
+
// Latency metric: aggregated with `aggregateLatencyAverage` and rendered as
// `<ms>ms`, with an `Avg: ` prefix for aggregated values.
var latencyMetric = Metric.of({
  id: "latency",
  name: "Latency",
  aggregate: aggregateLatencyAverage,
  format: (data, options) => {
    if (options?.isAggregated) {
      return `Avg: ${data.ms}ms`;
    }
    return `${data.ms}ms`;
  }
});
|
|
793
|
+
|
|
794
|
+
// src/evals/run-config.ts
|
|
795
|
+
/**
 * Validates one entry of a RunConfig's `runs` array.
 *
 * A row must set exactly one of `evaluators` (a non-empty array) or
 * `evaluatorPattern` (a string that is non-empty after trimming), and
 * `repetitions`, when present, must be a positive integer (it defaults to 1).
 *
 * @param row - The run row to validate.
 * @param index - Position of the row in `runs`, used in error messages.
 * @throws {Error} When the row violates any of the rules above.
 */
function validateRow(row, index) {
  // `!= null` treats both undefined and null as "not set". The previous check
  // tested `!== void 0` twice, so a null `evaluators` counted as set and then
  // crashed on `.length` below.
  const hasEvaluators = "evaluators" in row && row.evaluators != null;
  const hasPattern = "evaluatorPattern" in row && typeof row.evaluatorPattern === "string" && row.evaluatorPattern.trim().length > 0;
  if (hasEvaluators && hasPattern) {
    throw new Error(`RunConfig run[${index}] must not set both evaluators and evaluatorPattern`);
  }
  if (!hasEvaluators && !hasPattern) {
    throw new Error(`RunConfig run[${index}] must set either evaluators or evaluatorPattern`);
  }
  if (hasEvaluators && row.evaluators.length === 0) {
    throw new Error(`RunConfig run[${index}]: evaluators must be non-empty`);
  }
  const rawRep = "repetitions" in row ? row.repetitions : void 0;
  const repetitions = rawRep ?? 1;
  if (!Number.isInteger(repetitions) || repetitions < 1) {
    throw new Error(
      `RunConfig run[${index}]: repetitions must be a positive integer, got ${String(rawRep)}`
    );
  }
}
|
|
815
|
+
/**
 * A named run configuration: a canonical name, an optional display name,
 * tags, and a list of run rows. Instances are normally created with
 * `RunConfig.define`.
 */
var RunConfig = class _RunConfig {
  /** Stores the four fields as-is; validation happens in `define`. */
  constructor(name, displayName, tags, runs) {
    this._name = name;
    this._displayName = displayName;
    this._tags = tags;
    this._runs = runs;
  }
  /**
   * Validates `config` and builds a RunConfig: `runs` must be non-empty and
   * every row must pass `validateRow`; the name is validated, the display
   * name normalized, and the tags copied (defaulting to an empty array).
   */
  static define(config) {
    const { runs } = config;
    if (runs.length === 0) {
      throw new Error("RunConfig runs must be non-empty");
    }
    runs.forEach((row, index) => validateRow(row, index));
    const name = validateRunConfigName(config.name, "RunConfig.define");
    const displayName = normalizeOptionalDisplayName(config.displayName);
    const tags = config.tags === void 0 ? [] : [...config.tags];
    return new _RunConfig(name, displayName, tags, config.runs);
  }
  /** Canonical id (branded {@link RunConfigName} at runtime; typed as `string` for ergonomics). */
  getName() {
    const { _name: name } = this;
    return name;
  }
  /** Optional unrestricted display label. */
  getDisplayName() {
    const { _displayName: displayName } = this;
    return displayName;
  }
  /** Label for CLI/TUI: {@link getDisplayName} if set, otherwise {@link getName}. */
  getDisplayLabel() {
    const { _displayName: displayName, _name: name } = this;
    return displayName ?? name;
  }
  /** Tags from `RunConfig.define({ tags })`; surfaced as `runConfigTags` on evaluator callbacks. Returns a fresh copy on every call. */
  getTags() {
    const { _tags: tags } = this;
    return [...tags];
  }
  /** The run rows, returned as the stored reference (not copied). */
  getRuns() {
    const { _runs: runs } = this;
    return runs;
  }
};
|
|
852
|
+
|
|
550
853
|
// src/evals/score.ts
|
|
551
854
|
var registry2 = /* @__PURE__ */ new Map();
|
|
552
855
|
function formatScoreData(def, data, options) {
|
|
@@ -637,71 +940,23 @@ var Score = {
|
|
|
637
940
|
aggregateValues: config.aggregateValues,
|
|
638
941
|
make: (data, options) => {
|
|
639
942
|
const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
|
|
640
|
-
return {
|
|
641
|
-
id: config.id,
|
|
642
|
-
data,
|
|
643
|
-
...passed !== void 0 && { passed },
|
|
644
|
-
...options?.name !== void 0 && { name: options.name },
|
|
645
|
-
def
|
|
646
|
-
// Attach def so rendering/aggregation works without registry lookup
|
|
647
|
-
};
|
|
648
|
-
}
|
|
649
|
-
};
|
|
650
|
-
registry2.set(config.id, def);
|
|
651
|
-
return def;
|
|
652
|
-
}
|
|
653
|
-
};
|
|
654
|
-
function getScoreById(id) {
|
|
655
|
-
return registry2.get(id);
|
|
656
|
-
}
|
|
657
|
-
|
|
658
|
-
// src/evals/aggregators.ts
|
|
659
|
-
function aggregateTokenCountSum(values) {
|
|
660
|
-
const initial = {
|
|
661
|
-
input: 0,
|
|
662
|
-
output: 0,
|
|
663
|
-
inputCached: 0,
|
|
664
|
-
outputCached: 0
|
|
665
|
-
};
|
|
666
|
-
return values.reduce(
|
|
667
|
-
(acc, v) => ({
|
|
668
|
-
input: acc.input + (v.input ?? 0),
|
|
669
|
-
output: acc.output + (v.output ?? 0),
|
|
670
|
-
inputCached: acc.inputCached + (v.inputCached ?? 0),
|
|
671
|
-
outputCached: acc.outputCached + (v.outputCached ?? 0)
|
|
672
|
-
}),
|
|
673
|
-
initial
|
|
674
|
-
);
|
|
675
|
-
}
|
|
676
|
-
function aggregateLatencyAverage(values) {
|
|
677
|
-
if (values.length === 0) {
|
|
678
|
-
return { ms: 0 };
|
|
679
|
-
}
|
|
680
|
-
const sum = values.reduce((s, v) => s + v.ms, 0);
|
|
681
|
-
return { ms: sum / values.length };
|
|
682
|
-
}
|
|
683
|
-
|
|
684
|
-
// src/evals/metrics/standard.ts
|
|
685
|
-
var tokenCountMetric = Metric.of({
|
|
686
|
-
id: "token-count",
|
|
687
|
-
name: "Tokens",
|
|
688
|
-
aggregate: aggregateTokenCountSum,
|
|
689
|
-
format: (data, options) => {
|
|
690
|
-
const input = data.input ?? 0;
|
|
691
|
-
const output = data.output ?? 0;
|
|
692
|
-
const inputCached = data.inputCached ?? 0;
|
|
693
|
-
const outputCached = data.outputCached ?? 0;
|
|
694
|
-
const cached = inputCached + outputCached;
|
|
695
|
-
const base = `in:${input} out:${output} cached:${cached}`;
|
|
696
|
-
return options?.isAggregated ? `Total: ${base}` : base;
|
|
943
|
+
return {
|
|
944
|
+
id: config.id,
|
|
945
|
+
data,
|
|
946
|
+
...passed !== void 0 && { passed },
|
|
947
|
+
...options?.name !== void 0 && { name: options.name },
|
|
948
|
+
def
|
|
949
|
+
// Attach def so rendering/aggregation works without registry lookup
|
|
950
|
+
};
|
|
951
|
+
}
|
|
952
|
+
};
|
|
953
|
+
registry2.set(config.id, def);
|
|
954
|
+
return def;
|
|
697
955
|
}
|
|
698
|
-
}
|
|
699
|
-
|
|
700
|
-
id
|
|
701
|
-
|
|
702
|
-
aggregate: aggregateLatencyAverage,
|
|
703
|
-
format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
|
|
704
|
-
});
|
|
956
|
+
};
|
|
957
|
+
function getScoreById(id) {
|
|
958
|
+
return registry2.get(id);
|
|
959
|
+
}
|
|
705
960
|
|
|
706
961
|
// src/evals/scores/standard.ts
|
|
707
962
|
var percentScore = Score.of({
|
|
@@ -734,148 +989,197 @@ var binaryScore = Score.of({
|
|
|
734
989
|
},
|
|
735
990
|
aggregateValues: Score.aggregate.all
|
|
736
991
|
});
|
|
737
|
-
|
|
738
|
-
|
|
739
|
-
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
|
|
743
|
-
|
|
744
|
-
|
|
745
|
-
|
|
746
|
-
const keys = Array.isArray(options.excludeKeys) ? options.excludeKeys : options.excludeKeys.split(",").map((k) => k.trim());
|
|
747
|
-
const filtered = {};
|
|
748
|
-
for (const [k, v] of Object.entries(value)) {
|
|
749
|
-
if (!keys.includes(k)) {
|
|
750
|
-
filtered[k] = preprocessForDiff(v, options);
|
|
751
|
-
}
|
|
992
|
+
|
|
993
|
+
// src/evals/tag-set.ts
|
|
994
|
+
var TagSet = class {
|
|
995
|
+
constructor() {
|
|
996
|
+
}
|
|
997
|
+
static define(tags) {
|
|
998
|
+
const out = {};
|
|
999
|
+
for (const tag of tags) {
|
|
1000
|
+
out[tag] = tag;
|
|
752
1001
|
}
|
|
753
|
-
return
|
|
1002
|
+
return out;
|
|
754
1003
|
}
|
|
755
|
-
|
|
756
|
-
|
|
757
|
-
|
|
758
|
-
|
|
759
|
-
|
|
760
|
-
|
|
1004
|
+
};
|
|
1005
|
+
|
|
1006
|
+
// src/evals/test-case.ts
|
|
1007
|
+
/**
 * Unwraps a value that may be given lazily.
 *
 * @param value - Either a plain value or a zero-argument function producing it.
 * @returns The result of calling `value` when it is a function, otherwise
 *   `value` itself.
 */
function resolve(value) {
  if (typeof value === "function") {
    return value();
  }
  return value;
}
|
|
1010
|
+
var TestCase = class _TestCase {
|
|
1011
|
+
constructor(config) {
|
|
1012
|
+
this._config = config;
|
|
761
1013
|
}
|
|
762
|
-
|
|
763
|
-
|
|
1014
|
+
static describe(config) {
|
|
1015
|
+
const name = validateTestCaseName(config.name, "TestCase.describe");
|
|
1016
|
+
const displayName = normalizeOptionalDisplayName(config.displayName);
|
|
1017
|
+
return new _TestCase({
|
|
1018
|
+
name,
|
|
1019
|
+
displayName,
|
|
1020
|
+
tags: config.tags,
|
|
1021
|
+
inputSchema: config.inputSchema,
|
|
1022
|
+
input: config.input,
|
|
1023
|
+
outputSchema: config.outputSchema,
|
|
1024
|
+
output: config.output
|
|
1025
|
+
});
|
|
764
1026
|
}
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
function toPrettyJson(value) {
|
|
768
|
-
const str = stringify__default.default(value);
|
|
769
|
-
try {
|
|
770
|
-
const parsed = JSON.parse(str);
|
|
771
|
-
return JSON.stringify(parsed, null, 2);
|
|
772
|
-
} catch {
|
|
773
|
-
return str;
|
|
1027
|
+
getName() {
|
|
1028
|
+
return this._config.name;
|
|
774
1029
|
}
|
|
775
|
-
|
|
776
|
-
|
|
777
|
-
const lines = [];
|
|
778
|
-
for (const part of parts) {
|
|
779
|
-
const prefix = part.added ? "+ " : part.removed ? "- " : "";
|
|
780
|
-
const partLines = part.value.split("\n");
|
|
781
|
-
for (let i = 0; i < partLines.length; i++) {
|
|
782
|
-
const line = partLines[i];
|
|
783
|
-
if (i === partLines.length - 1 && line === "")
|
|
784
|
-
continue;
|
|
785
|
-
lines.push(prefix + line);
|
|
786
|
-
}
|
|
1030
|
+
getDisplayName() {
|
|
1031
|
+
return this._config.displayName;
|
|
787
1032
|
}
|
|
788
|
-
|
|
789
|
-
|
|
790
|
-
function createDiffString(expected, actual, diffOptions) {
|
|
791
|
-
const expectedProcessed = preprocessForDiff(expected, diffOptions);
|
|
792
|
-
const actualProcessed = preprocessForDiff(actual, diffOptions);
|
|
793
|
-
if (diffOptions?.keysOnly) {
|
|
794
|
-
const expectedKeys = JSON.stringify(extractKeys(expectedProcessed), null, 2);
|
|
795
|
-
const actualKeys = JSON.stringify(extractKeys(actualProcessed), null, 2);
|
|
796
|
-
const parts2 = diff.diffLines(expectedKeys, actualKeys);
|
|
797
|
-
return formatDiffParts(parts2);
|
|
1033
|
+
getDisplayLabel() {
|
|
1034
|
+
return this._config.displayName ?? this._config.name;
|
|
798
1035
|
}
|
|
799
|
-
|
|
800
|
-
|
|
801
|
-
if (expectedStr === actualStr) {
|
|
802
|
-
return "";
|
|
1036
|
+
getTags() {
|
|
1037
|
+
return this._config.tags;
|
|
803
1038
|
}
|
|
804
|
-
|
|
805
|
-
|
|
806
|
-
const filtered = parts.filter((p) => p.added === true);
|
|
807
|
-
return formatDiffParts(filtered);
|
|
1039
|
+
getInputSchema() {
|
|
1040
|
+
return this._config.inputSchema;
|
|
808
1041
|
}
|
|
809
|
-
|
|
810
|
-
|
|
811
|
-
function extractKeys(value) {
|
|
812
|
-
if (value === null || typeof value !== "object") {
|
|
813
|
-
return "\xB7";
|
|
1042
|
+
getInput() {
|
|
1043
|
+
return resolve(this._config.input);
|
|
814
1044
|
}
|
|
815
|
-
|
|
816
|
-
return
|
|
1045
|
+
getOutputSchema() {
|
|
1046
|
+
return this._config.outputSchema;
|
|
817
1047
|
}
|
|
818
|
-
|
|
819
|
-
|
|
820
|
-
|
|
1048
|
+
getOutput() {
|
|
1049
|
+
if (this._config.output === void 0) {
|
|
1050
|
+
return void 0;
|
|
1051
|
+
}
|
|
1052
|
+
return resolve(this._config.output);
|
|
821
1053
|
}
|
|
822
|
-
|
|
1054
|
+
};
|
|
1055
|
+
function getTestCaseDisplayLabel(testCase) {
|
|
1056
|
+
if (typeof testCase.getDisplayLabel === "function") {
|
|
1057
|
+
return testCase.getDisplayLabel();
|
|
1058
|
+
}
|
|
1059
|
+
return typeof testCase.getName === "function" ? testCase.getName() : "";
|
|
823
1060
|
}
|
|
824
|
-
function
|
|
825
|
-
|
|
826
|
-
|
|
827
|
-
|
|
828
|
-
|
|
1061
|
+
function getTestCaseTagList(testCase) {
|
|
1062
|
+
return typeof testCase.getTags === "function" ? [...testCase.getTags()] : [];
|
|
1063
|
+
}
|
|
1064
|
+
async function loadRunSnapshotsFromArtifacts(config) {
|
|
1065
|
+
const baseDir = path.resolve(config.artifactDirectory);
|
|
1066
|
+
let entries;
|
|
829
1067
|
try {
|
|
830
|
-
|
|
831
|
-
return JSON.stringify(msg, null, 2);
|
|
832
|
-
}
|
|
833
|
-
return String(msg);
|
|
1068
|
+
entries = await promises.readdir(baseDir);
|
|
834
1069
|
} catch {
|
|
835
|
-
return
|
|
1070
|
+
return [];
|
|
836
1071
|
}
|
|
1072
|
+
const jsonlFiles = entries.filter((name) => name.endsWith(".jsonl"));
|
|
1073
|
+
const snapshots = [];
|
|
1074
|
+
for (const fileName of jsonlFiles) {
|
|
1075
|
+
const filePath = path.join(baseDir, fileName);
|
|
1076
|
+
try {
|
|
1077
|
+
const snapshot = await parseArtifactToSnapshot(filePath, config);
|
|
1078
|
+
if (snapshot) {
|
|
1079
|
+
snapshots.push(snapshot);
|
|
1080
|
+
}
|
|
1081
|
+
} catch {
|
|
1082
|
+
}
|
|
1083
|
+
}
|
|
1084
|
+
return snapshots.sort((a, b) => b.queuedAt - a.queuedAt);
|
|
837
1085
|
}
|
|
838
|
-
function
|
|
839
|
-
|
|
840
|
-
|
|
841
|
-
|
|
842
|
-
|
|
843
|
-
}
|
|
844
|
-
|
|
845
|
-
|
|
846
|
-
|
|
847
|
-
|
|
848
|
-
|
|
849
|
-
|
|
850
|
-
|
|
851
|
-
|
|
852
|
-
|
|
853
|
-
|
|
854
|
-
|
|
855
|
-
|
|
856
|
-
|
|
857
|
-
|
|
858
|
-
|
|
859
|
-
|
|
860
|
-
|
|
861
|
-
|
|
862
|
-
if (color) {
|
|
863
|
-
const lines = diff.split("\n").map((line) => {
|
|
864
|
-
const trimmed = line.trimStart();
|
|
865
|
-
if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
|
|
866
|
-
return `\x1B[31m${line}\x1B[0m`;
|
|
1086
|
+
async function parseArtifactToSnapshot(filePath, _config) {
|
|
1087
|
+
const content = await promises.readFile(filePath, "utf8");
|
|
1088
|
+
const lines = content.split("\n").filter((line) => line.trim().length > 0);
|
|
1089
|
+
if (lines.length === 0) {
|
|
1090
|
+
return null;
|
|
1091
|
+
}
|
|
1092
|
+
let runQueued = null;
|
|
1093
|
+
let runCompleted = null;
|
|
1094
|
+
let runFailed = null;
|
|
1095
|
+
let runStarted = null;
|
|
1096
|
+
for (const line of lines) {
|
|
1097
|
+
try {
|
|
1098
|
+
const event = JSON.parse(line);
|
|
1099
|
+
const type = event.type;
|
|
1100
|
+
if (type === "RunQueued") {
|
|
1101
|
+
runQueued = {
|
|
1102
|
+
runId: event.runId,
|
|
1103
|
+
datasetId: event.datasetId,
|
|
1104
|
+
datasetName: event.datasetName,
|
|
1105
|
+
evaluatorIds: event.evaluatorIds,
|
|
1106
|
+
totalTestCases: event.totalTestCases ?? 0,
|
|
1107
|
+
artifactPath: event.artifactPath ?? filePath,
|
|
1108
|
+
ts: event.ts
|
|
1109
|
+
};
|
|
867
1110
|
}
|
|
868
|
-
if (
|
|
869
|
-
|
|
1111
|
+
if (type === "RunStarted") {
|
|
1112
|
+
runStarted = { startedAt: event.startedAt };
|
|
1113
|
+
}
|
|
1114
|
+
if (type === "RunCompleted") {
|
|
1115
|
+
runCompleted = {
|
|
1116
|
+
passedTestCases: event.passedTestCases,
|
|
1117
|
+
failedTestCases: event.failedTestCases,
|
|
1118
|
+
totalTestCases: event.totalTestCases,
|
|
1119
|
+
finishedAt: event.finishedAt
|
|
1120
|
+
};
|
|
1121
|
+
}
|
|
1122
|
+
if (type === "RunFailed") {
|
|
1123
|
+
runFailed = {
|
|
1124
|
+
finishedAt: event.finishedAt,
|
|
1125
|
+
errorMessage: event.errorMessage
|
|
1126
|
+
};
|
|
1127
|
+
}
|
|
1128
|
+
} catch {
|
|
1129
|
+
}
|
|
1130
|
+
}
|
|
1131
|
+
if (!runQueued) {
|
|
1132
|
+
return null;
|
|
1133
|
+
}
|
|
1134
|
+
const artifactPath = filePath;
|
|
1135
|
+
const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
|
|
1136
|
+
const progress = aggregateTestCaseProgress(lines);
|
|
1137
|
+
const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
|
|
1138
|
+
const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
|
|
1139
|
+
const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
|
|
1140
|
+
return {
|
|
1141
|
+
runId: runQueued.runId,
|
|
1142
|
+
datasetId: runQueued.datasetId,
|
|
1143
|
+
datasetName: runQueued.datasetName,
|
|
1144
|
+
evaluatorIds: runQueued.evaluatorIds,
|
|
1145
|
+
queuedAt: runQueued.ts ?? 0,
|
|
1146
|
+
startedAt: runStarted?.startedAt,
|
|
1147
|
+
finishedAt: runCompleted?.finishedAt ?? runFailed?.finishedAt,
|
|
1148
|
+
totalTestCases: runQueued.totalTestCases,
|
|
1149
|
+
completedTestCases,
|
|
1150
|
+
passedTestCases,
|
|
1151
|
+
failedTestCases,
|
|
1152
|
+
status,
|
|
1153
|
+
artifactPath,
|
|
1154
|
+
errorMessage: runFailed?.errorMessage
|
|
1155
|
+
};
|
|
1156
|
+
}
|
|
1157
|
+
function aggregateTestCaseProgress(lines) {
|
|
1158
|
+
let completedTestCases = 0;
|
|
1159
|
+
const testCasePassedBy = /* @__PURE__ */ new Map();
|
|
1160
|
+
for (const line of lines) {
|
|
1161
|
+
try {
|
|
1162
|
+
const event = JSON.parse(line);
|
|
1163
|
+
if (event.type === "TestCaseProgress") {
|
|
1164
|
+
const ev = event;
|
|
1165
|
+
completedTestCases = ev.completedTestCases ?? completedTestCases;
|
|
1166
|
+
const id = ev.testCaseId;
|
|
1167
|
+
const current = testCasePassedBy.get(id);
|
|
1168
|
+
testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
|
|
870
1169
|
}
|
|
871
|
-
|
|
872
|
-
}
|
|
873
|
-
const colored = lines.join("\n");
|
|
874
|
-
console.log(colored || "(no differences)");
|
|
875
|
-
return colored;
|
|
1170
|
+
} catch {
|
|
1171
|
+
}
|
|
876
1172
|
}
|
|
877
|
-
|
|
878
|
-
|
|
1173
|
+
let passedTestCases = 0;
|
|
1174
|
+
let failedTestCases = 0;
|
|
1175
|
+
for (const passed of testCasePassedBy.values()) {
|
|
1176
|
+
if (passed) {
|
|
1177
|
+
passedTestCases += 1;
|
|
1178
|
+
} else {
|
|
1179
|
+
failedTestCases += 1;
|
|
1180
|
+
}
|
|
1181
|
+
}
|
|
1182
|
+
return { completedTestCases, passedTestCases, failedTestCases };
|
|
879
1183
|
}
|
|
880
1184
|
|
|
881
1185
|
// src/runner/config.ts
|
|
@@ -887,6 +1191,7 @@ var defaultRunnerConfig = {
|
|
|
887
1191
|
rootDir: process.cwd(),
|
|
888
1192
|
datasetSuffixes: [".dataset.ts", ".dataset.tsx", ".dataset.js", ".dataset.mjs"],
|
|
889
1193
|
evaluatorSuffixes: [".evaluator.ts", ".evaluator.tsx", ".evaluator.js", ".evaluator.mjs"],
|
|
1194
|
+
runConfigSuffixes: [".run-config.ts", ".run-config.tsx", ".run-config.js", ".run-config.mjs"],
|
|
890
1195
|
testCaseSuffixes: [".test-case.ts", ".test-case.tsx", ".test-case.js", ".test-case.mjs"],
|
|
891
1196
|
excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
|
|
892
1197
|
},
|
|
@@ -912,6 +1217,11 @@ function toRunnerConfigOverrides(config) {
|
|
|
912
1217
|
} else if (rawDiscovery?.evaluatorSuffixes !== void 0) {
|
|
913
1218
|
discovery.evaluatorSuffixes = rawDiscovery.evaluatorSuffixes;
|
|
914
1219
|
}
|
|
1220
|
+
if (rawDiscovery?.runConfigFilePatterns !== void 0) {
|
|
1221
|
+
discovery.runConfigSuffixes = rawDiscovery.runConfigFilePatterns;
|
|
1222
|
+
} else if (rawDiscovery?.runConfigSuffixes !== void 0) {
|
|
1223
|
+
discovery.runConfigSuffixes = rawDiscovery.runConfigSuffixes;
|
|
1224
|
+
}
|
|
915
1225
|
if (rawDiscovery?.testCaseFilePatterns !== void 0) {
|
|
916
1226
|
discovery.testCaseSuffixes = rawDiscovery.testCaseFilePatterns;
|
|
917
1227
|
} else if (rawDiscovery?.testCaseSuffixes !== void 0) {
|
|
@@ -1010,6 +1320,9 @@ function isDatasetLike(value) {
|
|
|
1010
1320
|
function isEvaluatorLike(value) {
|
|
1011
1321
|
return hasMethod(value, "getName") && hasMethod(value, "resolveContext") && hasMethod(value, "getEvaluateFn");
|
|
1012
1322
|
}
|
|
1323
|
+
function isRunConfigLike(value) {
|
|
1324
|
+
return hasMethod(value, "getName") && hasMethod(value, "getRuns") && typeof value.getRuns === "function";
|
|
1325
|
+
}
|
|
1013
1326
|
function isTestCaseLike(value) {
|
|
1014
1327
|
return hasMethod(value, "getName") && hasMethod(value, "getTags") && hasMethod(value, "getInput");
|
|
1015
1328
|
}
|
|
@@ -1098,6 +1411,23 @@ async function collectEvaluatorsFromFiles(config) {
|
|
|
1098
1411
|
);
|
|
1099
1412
|
return found.flat();
|
|
1100
1413
|
}
|
|
1414
|
+
async function collectRunConfigsFromFiles(config) {
|
|
1415
|
+
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
1416
|
+
const matched = files.filter((filePath) => hasOneSuffix(filePath, config.runConfigSuffixes));
|
|
1417
|
+
const found = await Promise.all(
|
|
1418
|
+
matched.map(async (absolutePath) => {
|
|
1419
|
+
const exports = await loadModuleExports(absolutePath);
|
|
1420
|
+
const runConfigs = exports.filter(isRunConfigLike);
|
|
1421
|
+
const relPath = path.relative(config.rootDir, absolutePath);
|
|
1422
|
+
return runConfigs.map((runConfig) => ({
|
|
1423
|
+
id: runConfig.getName(),
|
|
1424
|
+
filePath: relPath,
|
|
1425
|
+
runConfig
|
|
1426
|
+
}));
|
|
1427
|
+
})
|
|
1428
|
+
);
|
|
1429
|
+
return found.flat();
|
|
1430
|
+
}
|
|
1101
1431
|
async function collectTestCasesFromFiles(config) {
|
|
1102
1432
|
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
1103
1433
|
const matched = files.filter((filePath) => hasOneSuffix(filePath, config.testCaseSuffixes));
|
|
@@ -1190,15 +1520,17 @@ function readOutput(testCase) {
|
|
|
1190
1520
|
}
|
|
1191
1521
|
return candidate.getOutput();
|
|
1192
1522
|
}
|
|
1193
|
-
function buildEvaluationUnits(testCases) {
|
|
1523
|
+
function buildEvaluationUnits(testCases, repetitionCount) {
|
|
1524
|
+
const count = Math.max(1, repetitionCount);
|
|
1194
1525
|
const units = [];
|
|
1195
1526
|
for (const testCaseItem of testCases) {
|
|
1196
|
-
const
|
|
1197
|
-
for (let r = 0; r <
|
|
1527
|
+
const repetitionId = `rep-${crypto.randomUUID()}`;
|
|
1528
|
+
for (let r = 0; r < count; r++) {
|
|
1198
1529
|
units.push({
|
|
1199
1530
|
testCaseItem,
|
|
1200
|
-
|
|
1201
|
-
|
|
1531
|
+
repetitionId,
|
|
1532
|
+
repetitionIndex: r + 1,
|
|
1533
|
+
repetitionCount: count
|
|
1202
1534
|
});
|
|
1203
1535
|
}
|
|
1204
1536
|
}
|
|
@@ -1211,7 +1543,7 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
|
|
|
1211
1543
|
return path.join(artifactDirectory, `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`);
|
|
1212
1544
|
}
|
|
1213
1545
|
function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
|
|
1214
|
-
const { testCaseItem,
|
|
1546
|
+
const { testCaseItem, repetitionId, repetitionIndex, repetitionCount } = unit;
|
|
1215
1547
|
return effect.Effect.gen(function* () {
|
|
1216
1548
|
const evaluatorRunId = `run-${crypto.randomUUID()}`;
|
|
1217
1549
|
const started = Date.now();
|
|
@@ -1220,11 +1552,12 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1220
1552
|
type: "TestCaseStarted",
|
|
1221
1553
|
runId: task.runId,
|
|
1222
1554
|
testCaseId: testCaseItem.id,
|
|
1223
|
-
testCaseName: testCaseItem.testCase
|
|
1555
|
+
testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
|
|
1224
1556
|
startedTestCases: startedEvaluations,
|
|
1225
1557
|
totalTestCases: totalEvaluations,
|
|
1226
|
-
|
|
1227
|
-
|
|
1558
|
+
repetitionId,
|
|
1559
|
+
repetitionIndex,
|
|
1560
|
+
repetitionCount
|
|
1228
1561
|
});
|
|
1229
1562
|
const evaluatorScores = [];
|
|
1230
1563
|
let testCaseError;
|
|
@@ -1258,8 +1591,15 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1258
1591
|
meta: {
|
|
1259
1592
|
triggerId: task.triggerId,
|
|
1260
1593
|
runId: evaluatorRunId,
|
|
1261
|
-
|
|
1594
|
+
datasetName: task.dataset.getDisplayLabel(),
|
|
1595
|
+
repetitionId,
|
|
1596
|
+
repetitionIndex,
|
|
1597
|
+
repetitionCount,
|
|
1598
|
+
runConfigName: task.runConfigName
|
|
1262
1599
|
},
|
|
1600
|
+
testCaseTags: getTestCaseTagList(testCaseItem.testCase),
|
|
1601
|
+
runConfigTags: task.runConfigTags,
|
|
1602
|
+
evaluatorTags: getEvaluatorTagList(evaluator),
|
|
1263
1603
|
logDiff,
|
|
1264
1604
|
log,
|
|
1265
1605
|
createError
|
|
@@ -1302,18 +1642,19 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1302
1642
|
});
|
|
1303
1643
|
}
|
|
1304
1644
|
}
|
|
1305
|
-
const
|
|
1645
|
+
const repetitionPassedThis = evaluatorScores.every((s) => s.passed);
|
|
1306
1646
|
const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [n + 1, n + 1]);
|
|
1307
1647
|
const progressEvent = {
|
|
1308
1648
|
type: "TestCaseProgress",
|
|
1309
1649
|
runId: task.runId,
|
|
1310
1650
|
testCaseId: testCaseItem.id,
|
|
1311
|
-
testCaseName: testCaseItem.testCase
|
|
1651
|
+
testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
|
|
1312
1652
|
completedTestCases: completedEvaluations,
|
|
1313
1653
|
totalTestCases: totalEvaluations,
|
|
1314
|
-
|
|
1315
|
-
|
|
1316
|
-
|
|
1654
|
+
repetitionId,
|
|
1655
|
+
repetitionIndex,
|
|
1656
|
+
repetitionCount,
|
|
1657
|
+
passed: repetitionPassedThis,
|
|
1317
1658
|
durationMs: Date.now() - started,
|
|
1318
1659
|
evaluatorScores,
|
|
1319
1660
|
output,
|
|
@@ -1334,9 +1675,9 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1334
1675
|
(map) => {
|
|
1335
1676
|
const key = testCaseItem.id;
|
|
1336
1677
|
const existing = map.get(key) ?? { completedCount: 0, results: [] };
|
|
1337
|
-
const newResults = [...existing.results,
|
|
1678
|
+
const newResults = [...existing.results, repetitionPassedThis];
|
|
1338
1679
|
const newCompletedCount = existing.completedCount + 1;
|
|
1339
|
-
const isLast = newCompletedCount ===
|
|
1680
|
+
const isLast = newCompletedCount === repetitionCount;
|
|
1340
1681
|
const newMap = new Map(map);
|
|
1341
1682
|
newMap.set(key, {
|
|
1342
1683
|
completedCount: newCompletedCount,
|
|
@@ -1373,10 +1714,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
1373
1714
|
runId: task.runId,
|
|
1374
1715
|
startedAt
|
|
1375
1716
|
});
|
|
1376
|
-
const totalEvaluations = task.testCases.
|
|
1377
|
-
(sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
|
|
1378
|
-
0
|
|
1379
|
-
);
|
|
1717
|
+
const totalEvaluations = task.testCases.length * Math.max(1, task.repetitions);
|
|
1380
1718
|
const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
|
|
1381
1719
|
const completedRef = yield* effect.Ref.make(0);
|
|
1382
1720
|
const startedRef = yield* effect.Ref.make(0);
|
|
@@ -1385,7 +1723,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
1385
1723
|
const testCaseResultsRef = yield* effect.Ref.make(
|
|
1386
1724
|
/* @__PURE__ */ new Map()
|
|
1387
1725
|
);
|
|
1388
|
-
const evaluationUnits = buildEvaluationUnits(task.testCases);
|
|
1726
|
+
const evaluationUnits = buildEvaluationUnits(task.testCases, task.repetitions);
|
|
1389
1727
|
const processEvaluation = (unit) => processOneEvaluation(
|
|
1390
1728
|
task,
|
|
1391
1729
|
unit,
|
|
@@ -1399,11 +1737,20 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
1399
1737
|
failedRef,
|
|
1400
1738
|
testCaseResultsRef
|
|
1401
1739
|
);
|
|
1402
|
-
|
|
1403
|
-
|
|
1404
|
-
|
|
1405
|
-
|
|
1406
|
-
|
|
1740
|
+
const globalSem = task.globalEvaluationSemaphore;
|
|
1741
|
+
if (globalSem !== void 0) {
|
|
1742
|
+
yield* effect.Effect.forEach(
|
|
1743
|
+
evaluationUnits,
|
|
1744
|
+
(unit) => globalSem.withPermits(1)(processEvaluation(unit)),
|
|
1745
|
+
{ concurrency: "unbounded", discard: true }
|
|
1746
|
+
);
|
|
1747
|
+
} else {
|
|
1748
|
+
yield* effect.Effect.forEach(
|
|
1749
|
+
evaluationUnits,
|
|
1750
|
+
processEvaluation,
|
|
1751
|
+
maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
|
|
1752
|
+
);
|
|
1753
|
+
}
|
|
1407
1754
|
const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* effect.Effect.all([
|
|
1408
1755
|
effect.Ref.get(completedRef),
|
|
1409
1756
|
effect.Ref.get(passedRef),
|
|
@@ -1439,125 +1786,34 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
1439
1786
|
artifactPath: task.snapshot.artifactPath
|
|
1440
1787
|
});
|
|
1441
1788
|
});
|
|
1442
|
-
|
|
1443
|
-
|
|
1444
|
-
|
|
1445
|
-
|
|
1446
|
-
|
|
1447
|
-
} catch {
|
|
1448
|
-
return [];
|
|
1449
|
-
}
|
|
1450
|
-
const jsonlFiles = entries.filter((name) => name.endsWith(".jsonl"));
|
|
1451
|
-
const snapshots = [];
|
|
1452
|
-
for (const fileName of jsonlFiles) {
|
|
1453
|
-
const filePath = path.join(baseDir, fileName);
|
|
1454
|
-
try {
|
|
1455
|
-
const snapshot = await parseArtifactToSnapshot(filePath, config);
|
|
1456
|
-
if (snapshot) {
|
|
1457
|
-
snapshots.push(snapshot);
|
|
1458
|
-
}
|
|
1459
|
-
} catch {
|
|
1460
|
-
}
|
|
1461
|
-
}
|
|
1462
|
-
return snapshots.sort((a, b) => b.queuedAt - a.queuedAt);
|
|
1463
|
-
}
|
|
1464
|
-
async function parseArtifactToSnapshot(filePath, _config) {
|
|
1465
|
-
const content = await promises.readFile(filePath, "utf8");
|
|
1466
|
-
const lines = content.split("\n").filter((line) => line.trim().length > 0);
|
|
1467
|
-
if (lines.length === 0) {
|
|
1468
|
-
return null;
|
|
1469
|
-
}
|
|
1470
|
-
let runQueued = null;
|
|
1471
|
-
let runCompleted = null;
|
|
1472
|
-
let runFailed = null;
|
|
1473
|
-
let runStarted = null;
|
|
1474
|
-
for (const line of lines) {
|
|
1475
|
-
try {
|
|
1476
|
-
const event = JSON.parse(line);
|
|
1477
|
-
const type = event.type;
|
|
1478
|
-
if (type === "RunQueued") {
|
|
1479
|
-
runQueued = {
|
|
1480
|
-
runId: event.runId,
|
|
1481
|
-
datasetId: event.datasetId,
|
|
1482
|
-
datasetName: event.datasetName,
|
|
1483
|
-
evaluatorIds: event.evaluatorIds,
|
|
1484
|
-
totalTestCases: event.totalTestCases ?? 0,
|
|
1485
|
-
artifactPath: event.artifactPath ?? filePath,
|
|
1486
|
-
ts: event.ts
|
|
1487
|
-
};
|
|
1488
|
-
}
|
|
1489
|
-
if (type === "RunStarted") {
|
|
1490
|
-
runStarted = { startedAt: event.startedAt };
|
|
1491
|
-
}
|
|
1492
|
-
if (type === "RunCompleted") {
|
|
1493
|
-
runCompleted = {
|
|
1494
|
-
passedTestCases: event.passedTestCases,
|
|
1495
|
-
failedTestCases: event.failedTestCases,
|
|
1496
|
-
totalTestCases: event.totalTestCases,
|
|
1497
|
-
finishedAt: event.finishedAt
|
|
1498
|
-
};
|
|
1499
|
-
}
|
|
1500
|
-
if (type === "RunFailed") {
|
|
1501
|
-
runFailed = {
|
|
1502
|
-
finishedAt: event.finishedAt,
|
|
1503
|
-
errorMessage: event.errorMessage
|
|
1504
|
-
};
|
|
1505
|
-
}
|
|
1506
|
-
} catch {
|
|
1507
|
-
}
|
|
1789
|
+
|
|
1790
|
+
// src/runner/name-pattern.ts
|
|
1791
|
+
function parseRegexLiteral(pattern) {
|
|
1792
|
+
if (!pattern.startsWith("/")) {
|
|
1793
|
+
return void 0;
|
|
1508
1794
|
}
|
|
1509
|
-
|
|
1510
|
-
|
|
1795
|
+
const lastSlash = pattern.lastIndexOf("/");
|
|
1796
|
+
if (lastSlash <= 0) {
|
|
1797
|
+
return void 0;
|
|
1511
1798
|
}
|
|
1512
|
-
const artifactPath = filePath;
|
|
1513
|
-
const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
|
|
1514
|
-
const progress = aggregateTestCaseProgress(lines);
|
|
1515
|
-
const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
|
|
1516
|
-
const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
|
|
1517
|
-
const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
|
|
1518
1799
|
return {
|
|
1519
|
-
|
|
1520
|
-
|
|
1521
|
-
datasetName: runQueued.datasetName,
|
|
1522
|
-
evaluatorIds: runQueued.evaluatorIds,
|
|
1523
|
-
queuedAt: runQueued.ts ?? 0,
|
|
1524
|
-
startedAt: runStarted?.startedAt,
|
|
1525
|
-
finishedAt: runCompleted?.finishedAt ?? runFailed?.finishedAt,
|
|
1526
|
-
totalTestCases: runQueued.totalTestCases,
|
|
1527
|
-
completedTestCases,
|
|
1528
|
-
passedTestCases,
|
|
1529
|
-
failedTestCases,
|
|
1530
|
-
status,
|
|
1531
|
-
artifactPath,
|
|
1532
|
-
errorMessage: runFailed?.errorMessage
|
|
1800
|
+
source: pattern.slice(1, lastSlash),
|
|
1801
|
+
flags: pattern.slice(lastSlash + 1)
|
|
1533
1802
|
};
|
|
1534
1803
|
}
|
|
1535
|
-
function
|
|
1536
|
-
|
|
1537
|
-
const
|
|
1538
|
-
|
|
1539
|
-
|
|
1540
|
-
|
|
1541
|
-
if (event.type === "TestCaseProgress") {
|
|
1542
|
-
const ev = event;
|
|
1543
|
-
completedTestCases = ev.completedTestCases ?? completedTestCases;
|
|
1544
|
-
const id = ev.testCaseId;
|
|
1545
|
-
const current = testCasePassedBy.get(id);
|
|
1546
|
-
testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
|
|
1547
|
-
}
|
|
1548
|
-
} catch {
|
|
1549
|
-
}
|
|
1804
|
+
function createNameMatcher(pattern) {
|
|
1805
|
+
const normalizedPattern = pattern.trim();
|
|
1806
|
+
const regexLiteral = parseRegexLiteral(normalizedPattern);
|
|
1807
|
+
if (regexLiteral) {
|
|
1808
|
+
const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
|
|
1809
|
+
return (value) => regex.test(value);
|
|
1550
1810
|
}
|
|
1551
|
-
|
|
1552
|
-
|
|
1553
|
-
|
|
1554
|
-
|
|
1555
|
-
passedTestCases += 1;
|
|
1556
|
-
} else {
|
|
1557
|
-
failedTestCases += 1;
|
|
1558
|
-
}
|
|
1811
|
+
if (normalizedPattern.includes("*")) {
|
|
1812
|
+
const escaped = normalizedPattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
|
|
1813
|
+
const regex = new RegExp(`^${escaped}$`, "i");
|
|
1814
|
+
return (value) => regex.test(value);
|
|
1559
1815
|
}
|
|
1560
|
-
return
|
|
1816
|
+
return (value) => value.toLowerCase() === normalizedPattern.toLowerCase();
|
|
1561
1817
|
}
|
|
1562
1818
|
async function appendJsonLine(artifactPath, payload) {
|
|
1563
1819
|
await promises.mkdir(path.dirname(artifactPath), { recursive: true });
|
|
@@ -1616,32 +1872,12 @@ function searchCollectedTestCases(all, query) {
|
|
|
1616
1872
|
}
|
|
1617
1873
|
|
|
1618
1874
|
// src/runner/api.ts
|
|
1619
|
-
function
|
|
1620
|
-
|
|
1621
|
-
|
|
1622
|
-
|
|
1623
|
-
const lastSlash = pattern.lastIndexOf("/");
|
|
1624
|
-
if (lastSlash <= 0) {
|
|
1625
|
-
return void 0;
|
|
1626
|
-
}
|
|
1627
|
-
return {
|
|
1628
|
-
source: pattern.slice(1, lastSlash),
|
|
1629
|
-
flags: pattern.slice(lastSlash + 1)
|
|
1630
|
-
};
|
|
1631
|
-
}
|
|
1632
|
-
function createNameMatcher(pattern) {
|
|
1633
|
-
const normalizedPattern = pattern.trim();
|
|
1634
|
-
const regexLiteral = parseRegexLiteral(normalizedPattern);
|
|
1635
|
-
if (regexLiteral) {
|
|
1636
|
-
const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
|
|
1637
|
-
return (value) => regex.test(value);
|
|
1638
|
-
}
|
|
1639
|
-
if (normalizedPattern.includes("*")) {
|
|
1640
|
-
const escaped = normalizedPattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
|
|
1641
|
-
const regex = new RegExp(`^${escaped}$`, "i");
|
|
1642
|
-
return (value) => regex.test(value);
|
|
1875
|
+
function normalizeRunRepetitions(value) {
|
|
1876
|
+
const n = value ?? 1;
|
|
1877
|
+
if (!Number.isInteger(n) || n < 1) {
|
|
1878
|
+
throw new Error(`repetitions must be a positive integer, got ${String(value)}`);
|
|
1643
1879
|
}
|
|
1644
|
-
return
|
|
1880
|
+
return n;
|
|
1645
1881
|
}
|
|
1646
1882
|
function mergeRunnerOverrides(base, next) {
|
|
1647
1883
|
if (!base) {
|
|
@@ -1676,6 +1912,7 @@ var EffectRunner = class {
|
|
|
1676
1912
|
this.listeners = /* @__PURE__ */ new Set();
|
|
1677
1913
|
this.datasetsById = /* @__PURE__ */ new Map();
|
|
1678
1914
|
this.evaluatorsById = /* @__PURE__ */ new Map();
|
|
1915
|
+
this.runConfigsById = /* @__PURE__ */ new Map();
|
|
1679
1916
|
this.schedulerFiber = effect.Effect.runFork(this.createSchedulerEffect());
|
|
1680
1917
|
this.persistenceFiber = effect.Effect.runFork(
|
|
1681
1918
|
createPersistenceWorker(this.persistenceQueue)
|
|
@@ -1716,6 +1953,137 @@ var EffectRunner = class {
|
|
|
1716
1953
|
(item) => matcher(item.evaluator.getName() ?? "")
|
|
1717
1954
|
);
|
|
1718
1955
|
}
|
|
1956
|
+
async collectRunConfigs() {
|
|
1957
|
+
const runConfigs = await collectRunConfigsFromFiles(this.config.discovery);
|
|
1958
|
+
this.runConfigsById.clear();
|
|
1959
|
+
const byNameLower = /* @__PURE__ */ new Map();
|
|
1960
|
+
for (const item of runConfigs) {
|
|
1961
|
+
const id = item.runConfig.getName();
|
|
1962
|
+
const lower = id.toLowerCase();
|
|
1963
|
+
const prev = byNameLower.get(lower);
|
|
1964
|
+
if (prev !== void 0 && prev.filePath !== item.filePath) {
|
|
1965
|
+
throw new Error(
|
|
1966
|
+
`Duplicate RunConfig name "${id}" (matches "${prev.runConfig.getName()}" case-insensitively): ${prev.filePath} and ${item.filePath}`
|
|
1967
|
+
);
|
|
1968
|
+
}
|
|
1969
|
+
byNameLower.set(lower, item);
|
|
1970
|
+
this.runConfigsById.set(id, item);
|
|
1971
|
+
}
|
|
1972
|
+
return runConfigs;
|
|
1973
|
+
}
|
|
1974
|
+
async resolveRunConfigByName(name) {
|
|
1975
|
+
if (this.runConfigsById.size === 0) {
|
|
1976
|
+
await this.collectRunConfigs();
|
|
1977
|
+
}
|
|
1978
|
+
const key = validateRunConfigName(name, `RunConfig "${name.trim()}"`);
|
|
1979
|
+
const keyLower = key.toLowerCase();
|
|
1980
|
+
const matches = Array.from(this.runConfigsById.values()).filter(
|
|
1981
|
+
(item) => item.runConfig.getName().toLowerCase() === keyLower
|
|
1982
|
+
);
|
|
1983
|
+
if (matches.length === 0) {
|
|
1984
|
+
return void 0;
|
|
1985
|
+
}
|
|
1986
|
+
if (matches.length > 1) {
|
|
1987
|
+
throw new Error(
|
|
1988
|
+
`Multiple RunConfigs named "${name}": ${matches.map((m) => m.filePath).join(", ")}`
|
|
1989
|
+
);
|
|
1990
|
+
}
|
|
1991
|
+
return matches[0];
|
|
1992
|
+
}
|
|
1993
|
+
async expandRunConfigToJobs(collected) {
|
|
1994
|
+
if (this.datasetsById.size === 0) {
|
|
1995
|
+
await this.collectDatasets();
|
|
1996
|
+
}
|
|
1997
|
+
if (this.evaluatorsById.size === 0) {
|
|
1998
|
+
await this.collectEvaluators();
|
|
1999
|
+
}
|
|
2000
|
+
const rcName = collected.runConfig.getName();
|
|
2001
|
+
const jobs = [];
|
|
2002
|
+
const runs = collected.runConfig.getRuns();
|
|
2003
|
+
for (const [i, row] of runs.entries()) {
|
|
2004
|
+
const dsCollected = Array.from(this.datasetsById.values()).find(
|
|
2005
|
+
(d) => d.dataset === row.dataset
|
|
2006
|
+
);
|
|
2007
|
+
if (!dsCollected) {
|
|
2008
|
+
throw new Error(
|
|
2009
|
+
`RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getDisplayLabel()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
|
|
2010
|
+
);
|
|
2011
|
+
}
|
|
2012
|
+
let evaluatorIds;
|
|
2013
|
+
if ("evaluatorPattern" in row && typeof row.evaluatorPattern === "string") {
|
|
2014
|
+
const matcher = createNameMatcher(row.evaluatorPattern);
|
|
2015
|
+
const matched = Array.from(this.evaluatorsById.values()).filter(
|
|
2016
|
+
(item) => matcher(item.evaluator.getName() ?? "")
|
|
2017
|
+
);
|
|
2018
|
+
if (matched.length === 0) {
|
|
2019
|
+
throw new Error(
|
|
2020
|
+
`RunConfig "${rcName}" run[${i}]: no evaluator matched pattern "${row.evaluatorPattern}"`
|
|
2021
|
+
);
|
|
2022
|
+
}
|
|
2023
|
+
evaluatorIds = matched.map((item) => item.id);
|
|
2024
|
+
} else {
|
|
2025
|
+
const evaluators = row.evaluators;
|
|
2026
|
+
evaluatorIds = [];
|
|
2027
|
+
for (const ev of evaluators) {
|
|
2028
|
+
const found = Array.from(this.evaluatorsById.values()).find(
|
|
2029
|
+
(item) => item.evaluator === ev
|
|
2030
|
+
);
|
|
2031
|
+
if (!found) {
|
|
2032
|
+
throw new Error(
|
|
2033
|
+
`RunConfig "${rcName}" run[${i}]: evaluator "${getEvaluatorDisplayLabel(ev) ?? "unknown"}" was not found among discovered evaluator exports`
|
|
2034
|
+
);
|
|
2035
|
+
}
|
|
2036
|
+
evaluatorIds.push(found.id);
|
|
2037
|
+
}
|
|
2038
|
+
}
|
|
2039
|
+
const repetitions = "repetitions" in row && row.repetitions !== void 0 ? row.repetitions : 1;
|
|
2040
|
+
jobs.push({
|
|
2041
|
+
datasetId: dsCollected.id,
|
|
2042
|
+
evaluatorIds,
|
|
2043
|
+
runConfigName: rcName,
|
|
2044
|
+
runConfigDisplayLabel: collected.runConfig.getDisplayLabel(),
|
|
2045
|
+
runConfigTags: collected.runConfig.getTags(),
|
|
2046
|
+
repetitions
|
|
2047
|
+
});
|
|
2048
|
+
}
|
|
2049
|
+
return jobs;
|
|
2050
|
+
}
|
|
2051
|
+
async expandRunConfigNamesToJobs(names) {
|
|
2052
|
+
const jobs = [];
|
|
2053
|
+
for (const name of names) {
|
|
2054
|
+
const collected = await this.resolveRunConfigByName(name);
|
|
2055
|
+
if (!collected) {
|
|
2056
|
+
const known = await this.collectRunConfigs();
|
|
2057
|
+
const available = known.map((r) => r.runConfig.getName()).sort();
|
|
2058
|
+
throw new Error(
|
|
2059
|
+
available.length > 0 ? `RunConfig "${name}" not found. Available RunConfigs: ${available.join(", ")}` : `RunConfig "${name}" not found and no RunConfigs were discovered.`
|
|
2060
|
+
);
|
|
2061
|
+
}
|
|
2062
|
+
jobs.push(...await this.expandRunConfigToJobs(collected));
|
|
2063
|
+
}
|
|
2064
|
+
return jobs;
|
|
2065
|
+
}
|
|
2066
|
+
async runDatasetJobsWithSharedConcurrency(request) {
|
|
2067
|
+
const globalConcurrency = Math.max(1, request.globalConcurrency);
|
|
2068
|
+
const sem = effect.Effect.unsafeMakeSemaphore(globalConcurrency);
|
|
2069
|
+
const triggerId = request.triggerId ?? `trg-${crypto.randomUUID()}`;
|
|
2070
|
+
const snapshots = [];
|
|
2071
|
+
for (const job of request.jobs) {
|
|
2072
|
+
snapshots.push(
|
|
2073
|
+
await this.startDatasetRun({
|
|
2074
|
+
datasetId: job.datasetId,
|
|
2075
|
+
evaluatorIds: job.evaluatorIds,
|
|
2076
|
+
triggerId,
|
|
2077
|
+
maxConcurrency: this.config.maxConcurrency ?? 1,
|
|
2078
|
+
globalEvaluationSemaphore: sem,
|
|
2079
|
+
runConfigName: job.runConfigName,
|
|
2080
|
+
runConfigTags: job.runConfigTags,
|
|
2081
|
+
repetitions: job.repetitions
|
|
2082
|
+
})
|
|
2083
|
+
);
|
|
2084
|
+
}
|
|
2085
|
+
return snapshots;
|
|
2086
|
+
}
|
|
1719
2087
|
async searchTestCases(query) {
|
|
1720
2088
|
const testCases = await collectTestCasesFromFiles(this.config.discovery);
|
|
1721
2089
|
return searchCollectedTestCases(testCases, query);
|
|
@@ -1734,36 +2102,46 @@ var EffectRunner = class {
|
|
|
1734
2102
|
);
|
|
1735
2103
|
}
|
|
1736
2104
|
async runDatasetWith(request) {
|
|
2105
|
+
const runConfigName = validateRunConfigName(
|
|
2106
|
+
request.runConfigName,
|
|
2107
|
+
"runDatasetWith.runConfigName"
|
|
2108
|
+
);
|
|
2109
|
+
return this.startDatasetRun({
|
|
2110
|
+
datasetId: request.datasetId,
|
|
2111
|
+
evaluatorIds: request.evaluatorIds,
|
|
2112
|
+
triggerId: request.triggerId,
|
|
2113
|
+
maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
|
|
2114
|
+
repetitions: request.repetitions,
|
|
2115
|
+
runConfigName,
|
|
2116
|
+
runConfigTags: request.runConfigTags
|
|
2117
|
+
});
|
|
2118
|
+
}
|
|
2119
|
+
async startDatasetRun(params) {
|
|
1737
2120
|
if (this.datasetsById.size === 0) {
|
|
1738
2121
|
await this.collectDatasets();
|
|
1739
2122
|
}
|
|
1740
2123
|
if (this.evaluatorsById.size === 0) {
|
|
1741
2124
|
await this.collectEvaluators();
|
|
1742
2125
|
}
|
|
1743
|
-
const dataset = this.datasetsById.get(
|
|
2126
|
+
const dataset = this.datasetsById.get(params.datasetId);
|
|
1744
2127
|
if (!dataset) {
|
|
1745
|
-
throw new Error(`Unknown dataset: ${
|
|
2128
|
+
throw new Error(`Unknown dataset: ${params.datasetId}`);
|
|
1746
2129
|
}
|
|
1747
|
-
const selectedEvaluators =
|
|
2130
|
+
const selectedEvaluators = params.evaluatorIds.map((id) => this.evaluatorsById.get(id)).filter((value) => Boolean(value)).map((value) => ({ id: value.id, evaluator: value.evaluator }));
|
|
1748
2131
|
if (selectedEvaluators.length === 0) {
|
|
1749
2132
|
throw new Error("No evaluators selected for run");
|
|
1750
2133
|
}
|
|
1751
|
-
const selectedTestCases = await this.collectDatasetTestCases(
|
|
1752
|
-
const
|
|
1753
|
-
|
|
1754
|
-
|
|
1755
|
-
)
|
|
1756
|
-
const triggerId = request.triggerId ?? `trg-${crypto.randomUUID()}`;
|
|
2134
|
+
const selectedTestCases = await this.collectDatasetTestCases(params.datasetId);
|
|
2135
|
+
const repetitions = normalizeRunRepetitions(params.repetitions);
|
|
2136
|
+
const totalEvaluations = selectedTestCases.length * repetitions;
|
|
2137
|
+
const runConfigTags = [...params.runConfigTags ?? []];
|
|
2138
|
+
const triggerId = params.triggerId ?? `trg-${crypto.randomUUID()}`;
|
|
1757
2139
|
const runId = `run-${crypto.randomUUID()}`;
|
|
1758
|
-
const artifactPath = createArtifactPath(
|
|
1759
|
-
this.config.artifactDirectory,
|
|
1760
|
-
request.datasetId,
|
|
1761
|
-
runId
|
|
1762
|
-
);
|
|
2140
|
+
const artifactPath = createArtifactPath(this.config.artifactDirectory, params.datasetId, runId);
|
|
1763
2141
|
const snapshot = {
|
|
1764
2142
|
runId,
|
|
1765
|
-
datasetId:
|
|
1766
|
-
datasetName: dataset.dataset.
|
|
2143
|
+
datasetId: params.datasetId,
|
|
2144
|
+
datasetName: dataset.dataset.getDisplayLabel(),
|
|
1767
2145
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
1768
2146
|
queuedAt: Date.now(),
|
|
1769
2147
|
totalTestCases: totalEvaluations,
|
|
@@ -1783,8 +2161,8 @@ var EffectRunner = class {
|
|
|
1783
2161
|
const queuedEvent = {
|
|
1784
2162
|
type: "RunQueued",
|
|
1785
2163
|
runId,
|
|
1786
|
-
datasetId:
|
|
1787
|
-
datasetName: dataset.dataset.
|
|
2164
|
+
datasetId: params.datasetId,
|
|
2165
|
+
datasetName: dataset.dataset.getDisplayLabel(),
|
|
1788
2166
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
1789
2167
|
totalTestCases: totalEvaluations,
|
|
1790
2168
|
artifactPath
|
|
@@ -1797,17 +2175,20 @@ var EffectRunner = class {
|
|
|
1797
2175
|
payload: queuedEvent
|
|
1798
2176
|
})
|
|
1799
2177
|
);
|
|
1800
|
-
const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
|
|
1801
2178
|
await effect.Effect.runPromise(
|
|
1802
2179
|
effect.Queue.offer(this.runQueue, {
|
|
1803
2180
|
runId,
|
|
1804
2181
|
triggerId,
|
|
1805
|
-
datasetId:
|
|
2182
|
+
datasetId: params.datasetId,
|
|
1806
2183
|
dataset: dataset.dataset,
|
|
1807
2184
|
evaluators: selectedEvaluators,
|
|
1808
2185
|
testCases: selectedTestCases,
|
|
1809
2186
|
snapshot,
|
|
1810
|
-
maxConcurrency
|
|
2187
|
+
maxConcurrency: params.maxConcurrency,
|
|
2188
|
+
globalEvaluationSemaphore: params.globalEvaluationSemaphore,
|
|
2189
|
+
runConfigName: params.runConfigName,
|
|
2190
|
+
runConfigTags,
|
|
2191
|
+
repetitions
|
|
1811
2192
|
})
|
|
1812
2193
|
);
|
|
1813
2194
|
return snapshot;
|
|
@@ -1879,15 +2260,27 @@ var EffectRunner = class {
|
|
|
1879
2260
|
}
|
|
1880
2261
|
};
|
|
1881
2262
|
|
|
2263
|
+
// src/runner/events.ts
|
|
2264
|
+
var PROGRAMMATIC_RUN_CONFIG = {
|
|
2265
|
+
runConfigName: "programmatic"
|
|
2266
|
+
};
|
|
2267
|
+
|
|
1882
2268
|
Object.defineProperty(exports, 'S', {
|
|
1883
2269
|
enumerable: true,
|
|
1884
2270
|
get: function () { return effect.Schema; }
|
|
1885
2271
|
});
|
|
1886
2272
|
exports.Dataset = Dataset;
|
|
2273
|
+
exports.DatasetNameSchema = DatasetNameSchema;
|
|
1887
2274
|
exports.Evaluator = Evaluator;
|
|
2275
|
+
exports.EvaluatorNameSchema = EvaluatorNameSchema;
|
|
1888
2276
|
exports.Metric = Metric;
|
|
2277
|
+
exports.PROGRAMMATIC_RUN_CONFIG = PROGRAMMATIC_RUN_CONFIG;
|
|
2278
|
+
exports.RunConfig = RunConfig;
|
|
2279
|
+
exports.RunConfigNameSchema = RunConfigNameSchema;
|
|
1889
2280
|
exports.Score = Score;
|
|
2281
|
+
exports.TagSet = TagSet;
|
|
1890
2282
|
exports.TestCase = TestCase;
|
|
2283
|
+
exports.TestCaseNameSchema = TestCaseNameSchema;
|
|
1891
2284
|
exports.binaryScore = binaryScore;
|
|
1892
2285
|
exports.createLogEntry = createLogEntry;
|
|
1893
2286
|
exports.createRunner = createRunner;
|
|
@@ -1895,16 +2288,26 @@ exports.defaultRunnerConfig = defaultRunnerConfig;
|
|
|
1895
2288
|
exports.defineConfig = defineConfig;
|
|
1896
2289
|
exports.deltaScore = deltaScore;
|
|
1897
2290
|
exports.formatScoreData = formatScoreData;
|
|
2291
|
+
exports.getDatasetDisplayLabel = getDatasetDisplayLabel;
|
|
2292
|
+
exports.getEvaluatorDisplayLabel = getEvaluatorDisplayLabel;
|
|
2293
|
+
exports.getEvaluatorTagList = getEvaluatorTagList;
|
|
1898
2294
|
exports.getLogLines = getLogLines;
|
|
1899
2295
|
exports.getMetricById = getMetricById;
|
|
1900
2296
|
exports.getScoreById = getScoreById;
|
|
2297
|
+
exports.getTestCaseDisplayLabel = getTestCaseDisplayLabel;
|
|
2298
|
+
exports.getTestCaseTagList = getTestCaseTagList;
|
|
1901
2299
|
exports.latencyMetric = latencyMetric;
|
|
1902
2300
|
exports.loadMockData = loadMockData;
|
|
1903
2301
|
exports.loadRunnerData = loadRunnerData;
|
|
2302
|
+
exports.normalizeOptionalDisplayName = normalizeOptionalDisplayName;
|
|
1904
2303
|
exports.parseStartupArgs = parseStartupArgs;
|
|
1905
2304
|
exports.percentScore = percentScore;
|
|
1906
2305
|
exports.printJsonDiff = printJsonDiff;
|
|
1907
2306
|
exports.tokenCountMetric = tokenCountMetric;
|
|
2307
|
+
exports.validateDatasetName = validateDatasetName;
|
|
2308
|
+
exports.validateEvaluatorName = validateEvaluatorName;
|
|
2309
|
+
exports.validateRunConfigName = validateRunConfigName;
|
|
2310
|
+
exports.validateTestCaseName = validateTestCaseName;
|
|
1908
2311
|
exports.withRunnerConfig = withRunnerConfig;
|
|
1909
2312
|
//# sourceMappingURL=out.js.map
|
|
1910
2313
|
//# sourceMappingURL=index.cjs.map
|