@m4trix/evals 0.25.1 → 0.26.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +29 -7
- package/dist/cli-simple.cjs +831 -450
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +832 -451
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +531 -270
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +531 -270
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +888 -509
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +201 -7
- package/dist/index.js +878 -513
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.cjs
CHANGED
|
@@ -4,10 +4,10 @@ var effect = require('effect');
|
|
|
4
4
|
var diff = require('diff');
|
|
5
5
|
var stringify = require('fast-json-stable-stringify');
|
|
6
6
|
var crypto = require('crypto');
|
|
7
|
-
var
|
|
7
|
+
var promises = require('fs/promises');
|
|
8
8
|
var path = require('path');
|
|
9
|
+
var fs = require('fs');
|
|
9
10
|
var jitiModule = require('jiti');
|
|
10
|
-
var promises = require('fs/promises');
|
|
11
11
|
var url = require('url');
|
|
12
12
|
|
|
13
13
|
var _documentCurrentScript = typeof document !== 'undefined' ? document.currentScript : null;
|
|
@@ -34,6 +34,164 @@ function _interopNamespace(e) {
|
|
|
34
34
|
var stringify__default = /*#__PURE__*/_interopDefault(stringify);
|
|
35
35
|
var jitiModule__namespace = /*#__PURE__*/_interopNamespace(jitiModule);
|
|
36
36
|
|
|
37
|
+
// src/index.ts
|
|
38
|
+
var ENTITY_ID_PATTERN = /^[a-zA-Z0-9_-]+$/;
|
|
39
|
+
function makeEntityIdSchema(brand, label) {
|
|
40
|
+
return effect.Schema.String.pipe(
|
|
41
|
+
effect.Schema.trimmed(),
|
|
42
|
+
effect.Schema.minLength(1, {
|
|
43
|
+
message: () => `${label} must be non-empty.`
|
|
44
|
+
}),
|
|
45
|
+
effect.Schema.pattern(ENTITY_ID_PATTERN, {
|
|
46
|
+
message: () => `${label} may only contain letters, digits, underscores, and hyphens (no spaces). Examples: "my-nightly", "my_nightly", "myNightly".`
|
|
47
|
+
}),
|
|
48
|
+
effect.Schema.brand(brand)
|
|
49
|
+
);
|
|
50
|
+
}
|
|
51
|
+
var RunConfigNameSchema = makeEntityIdSchema("RunConfigName", "RunConfig name");
|
|
52
|
+
var EvaluatorNameSchema = makeEntityIdSchema("EvaluatorName", "Evaluator name");
|
|
53
|
+
var TestCaseNameSchema = makeEntityIdSchema("TestCaseName", "Test case name");
|
|
54
|
+
function validateWithSchema(schema, raw, context) {
|
|
55
|
+
const trimmed = raw.trim();
|
|
56
|
+
const decode = effect.Schema.decodeUnknownEither(
|
|
57
|
+
schema
|
|
58
|
+
);
|
|
59
|
+
const result = decode(trimmed);
|
|
60
|
+
if (effect.Either.isLeft(result)) {
|
|
61
|
+
throw new Error(`${context}: ${effect.ParseResult.TreeFormatter.formatErrorSync(result.left)}`);
|
|
62
|
+
}
|
|
63
|
+
return result.right;
|
|
64
|
+
}
|
|
65
|
+
function validateRunConfigName(raw, context) {
|
|
66
|
+
return validateWithSchema(RunConfigNameSchema, raw, context);
|
|
67
|
+
}
|
|
68
|
+
function validateEvaluatorName(raw, context) {
|
|
69
|
+
return validateWithSchema(EvaluatorNameSchema, raw, context);
|
|
70
|
+
}
|
|
71
|
+
function validateTestCaseName(raw, context) {
|
|
72
|
+
return validateWithSchema(TestCaseNameSchema, raw, context);
|
|
73
|
+
}
|
|
74
|
+
function normalizeOptionalDisplayName(raw) {
|
|
75
|
+
if (raw === void 0) {
|
|
76
|
+
return void 0;
|
|
77
|
+
}
|
|
78
|
+
const t = raw.trim();
|
|
79
|
+
return t.length === 0 ? void 0 : t;
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
// src/evals/evaluator.ts
|
|
83
|
+
var Evaluator = class _Evaluator {
|
|
84
|
+
constructor(config) {
|
|
85
|
+
this._config = config;
|
|
86
|
+
}
|
|
87
|
+
getState() {
|
|
88
|
+
return {
|
|
89
|
+
name: this._config.name,
|
|
90
|
+
displayName: this._config.displayName,
|
|
91
|
+
tags: this._config.tags,
|
|
92
|
+
inputSchema: this._config.inputSchema,
|
|
93
|
+
outputSchema: this._config.outputSchema,
|
|
94
|
+
scoreSchema: this._config.scoreSchema,
|
|
95
|
+
middlewares: this._config.middlewares,
|
|
96
|
+
evaluateFn: this._config.evaluateFn,
|
|
97
|
+
passThreshold: this._config.passThreshold,
|
|
98
|
+
passCriterion: this._config.passCriterion
|
|
99
|
+
};
|
|
100
|
+
}
|
|
101
|
+
static use(middleware) {
|
|
102
|
+
return new _Evaluator({
|
|
103
|
+
middlewares: [middleware],
|
|
104
|
+
tags: []
|
|
105
|
+
});
|
|
106
|
+
}
|
|
107
|
+
use(middleware) {
|
|
108
|
+
const state = this.getState();
|
|
109
|
+
return new _Evaluator({
|
|
110
|
+
...state,
|
|
111
|
+
middlewares: [...state.middlewares, middleware]
|
|
112
|
+
});
|
|
113
|
+
}
|
|
114
|
+
define(config) {
|
|
115
|
+
const { middlewares } = this.getState();
|
|
116
|
+
const name = validateEvaluatorName(config.name, "Evaluator.define");
|
|
117
|
+
const displayName = normalizeOptionalDisplayName(config.displayName);
|
|
118
|
+
const tags = config.tags !== void 0 ? [...config.tags] : [];
|
|
119
|
+
return new _Evaluator({
|
|
120
|
+
name,
|
|
121
|
+
displayName,
|
|
122
|
+
tags,
|
|
123
|
+
inputSchema: config.inputSchema,
|
|
124
|
+
outputSchema: config.outputSchema,
|
|
125
|
+
scoreSchema: config.scoreSchema,
|
|
126
|
+
middlewares,
|
|
127
|
+
passThreshold: config.passThreshold,
|
|
128
|
+
passCriterion: config.passCriterion
|
|
129
|
+
});
|
|
130
|
+
}
|
|
131
|
+
evaluate(fn) {
|
|
132
|
+
return new _Evaluator({
|
|
133
|
+
...this.getState(),
|
|
134
|
+
evaluateFn: fn
|
|
135
|
+
});
|
|
136
|
+
}
|
|
137
|
+
/** Canonical evaluator id when defined; otherwise undefined (middleware-only chain). */
|
|
138
|
+
getName() {
|
|
139
|
+
return this._config.name;
|
|
140
|
+
}
|
|
141
|
+
getDisplayName() {
|
|
142
|
+
return this._config.displayName;
|
|
143
|
+
}
|
|
144
|
+
/** Label for CLI/TUI: {@link getDisplayName} if set, otherwise {@link getName}. Undefined if not yet defined. */
|
|
145
|
+
getDisplayLabel() {
|
|
146
|
+
const id = this._config.name;
|
|
147
|
+
if (id === void 0) {
|
|
148
|
+
return void 0;
|
|
149
|
+
}
|
|
150
|
+
return this._config.displayName ?? id;
|
|
151
|
+
}
|
|
152
|
+
/** Tags from `Evaluator.define({ tags })`; empty until defined. */
|
|
153
|
+
getTags() {
|
|
154
|
+
return [...this._config.tags];
|
|
155
|
+
}
|
|
156
|
+
getInputSchema() {
|
|
157
|
+
return this._config.inputSchema;
|
|
158
|
+
}
|
|
159
|
+
getOutputSchema() {
|
|
160
|
+
return this._config.outputSchema;
|
|
161
|
+
}
|
|
162
|
+
getScoreSchema() {
|
|
163
|
+
return this._config.scoreSchema;
|
|
164
|
+
}
|
|
165
|
+
getMiddlewares() {
|
|
166
|
+
return this._config.middlewares;
|
|
167
|
+
}
|
|
168
|
+
getEvaluateFn() {
|
|
169
|
+
return this._config.evaluateFn;
|
|
170
|
+
}
|
|
171
|
+
getPassThreshold() {
|
|
172
|
+
return this._config.passThreshold;
|
|
173
|
+
}
|
|
174
|
+
getPassCriterion() {
|
|
175
|
+
return this._config.passCriterion;
|
|
176
|
+
}
|
|
177
|
+
async resolveContext() {
|
|
178
|
+
const parts = await Promise.all(this._config.middlewares.map((mw) => mw.resolve()));
|
|
179
|
+
return Object.assign({}, ...parts);
|
|
180
|
+
}
|
|
181
|
+
};
|
|
182
|
+
function getEvaluatorDisplayLabel(evaluator) {
|
|
183
|
+
if (typeof evaluator.getDisplayLabel === "function") {
|
|
184
|
+
const label = evaluator.getDisplayLabel();
|
|
185
|
+
if (label !== void 0) {
|
|
186
|
+
return label;
|
|
187
|
+
}
|
|
188
|
+
}
|
|
189
|
+
return typeof evaluator.getName === "function" ? evaluator.getName() : void 0;
|
|
190
|
+
}
|
|
191
|
+
function getEvaluatorTagList(evaluator) {
|
|
192
|
+
return typeof evaluator.getTags === "function" ? [...evaluator.getTags()] : [];
|
|
193
|
+
}
|
|
194
|
+
|
|
37
195
|
// src/cli/data.mock.json
|
|
38
196
|
var data_mock_default = {
|
|
39
197
|
datasets: [
|
|
@@ -288,7 +446,7 @@ function toEvalDataset(item, snapshots) {
|
|
|
288
446
|
function toEvaluatorOption(item) {
|
|
289
447
|
return {
|
|
290
448
|
id: item.id,
|
|
291
|
-
name: item.evaluator
|
|
449
|
+
name: getEvaluatorDisplayLabel(item.evaluator) ?? toSlug(item.id),
|
|
292
450
|
configPreview: `Source: ${item.filePath}`
|
|
293
451
|
};
|
|
294
452
|
}
|
|
@@ -334,132 +492,6 @@ function parseStartupArgs(argv) {
|
|
|
334
492
|
return args;
|
|
335
493
|
}
|
|
336
494
|
|
|
337
|
-
// src/evals/test-case.ts
|
|
338
|
-
function resolve(value) {
|
|
339
|
-
return typeof value === "function" ? value() : value;
|
|
340
|
-
}
|
|
341
|
-
var TestCase = class _TestCase {
|
|
342
|
-
constructor(config) {
|
|
343
|
-
this._config = config;
|
|
344
|
-
}
|
|
345
|
-
static describe(config) {
|
|
346
|
-
const reruns = config.reruns ?? 1;
|
|
347
|
-
if (reruns < 1 || !Number.isInteger(reruns)) {
|
|
348
|
-
throw new Error(`TestCase reruns must be a positive integer, got ${reruns}`);
|
|
349
|
-
}
|
|
350
|
-
return new _TestCase({
|
|
351
|
-
name: config.name,
|
|
352
|
-
tags: config.tags,
|
|
353
|
-
reruns,
|
|
354
|
-
inputSchema: config.inputSchema,
|
|
355
|
-
input: config.input,
|
|
356
|
-
outputSchema: config.outputSchema,
|
|
357
|
-
output: config.output
|
|
358
|
-
});
|
|
359
|
-
}
|
|
360
|
-
getReruns() {
|
|
361
|
-
return this._config.reruns;
|
|
362
|
-
}
|
|
363
|
-
getName() {
|
|
364
|
-
return this._config.name;
|
|
365
|
-
}
|
|
366
|
-
getTags() {
|
|
367
|
-
return this._config.tags;
|
|
368
|
-
}
|
|
369
|
-
getInputSchema() {
|
|
370
|
-
return this._config.inputSchema;
|
|
371
|
-
}
|
|
372
|
-
getInput() {
|
|
373
|
-
return resolve(this._config.input);
|
|
374
|
-
}
|
|
375
|
-
getOutputSchema() {
|
|
376
|
-
return this._config.outputSchema;
|
|
377
|
-
}
|
|
378
|
-
getOutput() {
|
|
379
|
-
if (this._config.output === void 0) {
|
|
380
|
-
return void 0;
|
|
381
|
-
}
|
|
382
|
-
return resolve(this._config.output);
|
|
383
|
-
}
|
|
384
|
-
};
|
|
385
|
-
|
|
386
|
-
// src/evals/evaluator.ts
|
|
387
|
-
var Evaluator = class _Evaluator {
|
|
388
|
-
constructor(config) {
|
|
389
|
-
this._config = config;
|
|
390
|
-
}
|
|
391
|
-
getState() {
|
|
392
|
-
return {
|
|
393
|
-
name: this._config.name,
|
|
394
|
-
inputSchema: this._config.inputSchema,
|
|
395
|
-
outputSchema: this._config.outputSchema,
|
|
396
|
-
scoreSchema: this._config.scoreSchema,
|
|
397
|
-
middlewares: this._config.middlewares,
|
|
398
|
-
evaluateFn: this._config.evaluateFn,
|
|
399
|
-
passThreshold: this._config.passThreshold,
|
|
400
|
-
passCriterion: this._config.passCriterion
|
|
401
|
-
};
|
|
402
|
-
}
|
|
403
|
-
static use(middleware) {
|
|
404
|
-
return new _Evaluator({
|
|
405
|
-
middlewares: [middleware]
|
|
406
|
-
});
|
|
407
|
-
}
|
|
408
|
-
use(middleware) {
|
|
409
|
-
const state = this.getState();
|
|
410
|
-
return new _Evaluator({
|
|
411
|
-
...state,
|
|
412
|
-
middlewares: [...state.middlewares, middleware]
|
|
413
|
-
});
|
|
414
|
-
}
|
|
415
|
-
define(config) {
|
|
416
|
-
const { middlewares } = this.getState();
|
|
417
|
-
return new _Evaluator({
|
|
418
|
-
name: config.name,
|
|
419
|
-
inputSchema: config.inputSchema,
|
|
420
|
-
outputSchema: config.outputSchema,
|
|
421
|
-
scoreSchema: config.scoreSchema,
|
|
422
|
-
middlewares,
|
|
423
|
-
passThreshold: config.passThreshold,
|
|
424
|
-
passCriterion: config.passCriterion
|
|
425
|
-
});
|
|
426
|
-
}
|
|
427
|
-
evaluate(fn) {
|
|
428
|
-
return new _Evaluator({
|
|
429
|
-
...this.getState(),
|
|
430
|
-
evaluateFn: fn
|
|
431
|
-
});
|
|
432
|
-
}
|
|
433
|
-
getName() {
|
|
434
|
-
return this._config.name;
|
|
435
|
-
}
|
|
436
|
-
getInputSchema() {
|
|
437
|
-
return this._config.inputSchema;
|
|
438
|
-
}
|
|
439
|
-
getOutputSchema() {
|
|
440
|
-
return this._config.outputSchema;
|
|
441
|
-
}
|
|
442
|
-
getScoreSchema() {
|
|
443
|
-
return this._config.scoreSchema;
|
|
444
|
-
}
|
|
445
|
-
getMiddlewares() {
|
|
446
|
-
return this._config.middlewares;
|
|
447
|
-
}
|
|
448
|
-
getEvaluateFn() {
|
|
449
|
-
return this._config.evaluateFn;
|
|
450
|
-
}
|
|
451
|
-
getPassThreshold() {
|
|
452
|
-
return this._config.passThreshold;
|
|
453
|
-
}
|
|
454
|
-
getPassCriterion() {
|
|
455
|
-
return this._config.passCriterion;
|
|
456
|
-
}
|
|
457
|
-
async resolveContext() {
|
|
458
|
-
const parts = await Promise.all(this._config.middlewares.map((mw) => mw.resolve()));
|
|
459
|
-
return Object.assign({}, ...parts);
|
|
460
|
-
}
|
|
461
|
-
};
|
|
462
|
-
|
|
463
495
|
// src/evals/dataset.ts
|
|
464
496
|
function matchesAny(value, matchers) {
|
|
465
497
|
return matchers.some(
|
|
@@ -523,34 +555,284 @@ var Dataset = class _Dataset {
|
|
|
523
555
|
return tagMatch && pathMatch;
|
|
524
556
|
}
|
|
525
557
|
};
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
name: config.name,
|
|
534
|
-
aggregate: config.aggregate,
|
|
535
|
-
format: config.format,
|
|
536
|
-
make: (data, options) => ({
|
|
537
|
-
id: config.id,
|
|
538
|
-
data,
|
|
539
|
-
...options?.name !== void 0 && { name: options.name }
|
|
540
|
-
})
|
|
541
|
-
};
|
|
542
|
-
registry.set(config.id, def);
|
|
543
|
-
return def;
|
|
558
|
+
function preprocessForDiff(value, options) {
|
|
559
|
+
if (options?.sort && Array.isArray(value)) {
|
|
560
|
+
return [...value].sort((a, b) => {
|
|
561
|
+
const aStr = stringify__default.default(preprocessForDiff(a, options));
|
|
562
|
+
const bStr = stringify__default.default(preprocessForDiff(b, options));
|
|
563
|
+
return aStr.localeCompare(bStr);
|
|
564
|
+
}).map((item) => preprocessForDiff(item, options));
|
|
544
565
|
}
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
566
|
+
if (value !== null && typeof value === "object" && !Array.isArray(value) && options?.excludeKeys) {
|
|
567
|
+
const keys = Array.isArray(options.excludeKeys) ? options.excludeKeys : options.excludeKeys.split(",").map((k) => k.trim());
|
|
568
|
+
const filtered = {};
|
|
569
|
+
for (const [k, v] of Object.entries(value)) {
|
|
570
|
+
if (!keys.includes(k)) {
|
|
571
|
+
filtered[k] = preprocessForDiff(v, options);
|
|
572
|
+
}
|
|
573
|
+
}
|
|
574
|
+
return filtered;
|
|
575
|
+
}
|
|
576
|
+
if (value !== null && typeof value === "object" && !Array.isArray(value)) {
|
|
577
|
+
const result = {};
|
|
578
|
+
for (const [k, v] of Object.entries(value)) {
|
|
579
|
+
result[k] = preprocessForDiff(v, options);
|
|
580
|
+
}
|
|
581
|
+
return result;
|
|
582
|
+
}
|
|
583
|
+
if (typeof value === "number" && options?.precision !== void 0) {
|
|
584
|
+
return Number(value.toFixed(options.precision));
|
|
585
|
+
}
|
|
586
|
+
return value;
|
|
587
|
+
}
|
|
588
|
+
function toPrettyJson(value) {
|
|
589
|
+
const str = stringify__default.default(value);
|
|
590
|
+
try {
|
|
591
|
+
const parsed = JSON.parse(str);
|
|
592
|
+
return JSON.stringify(parsed, null, 2);
|
|
593
|
+
} catch {
|
|
594
|
+
return str;
|
|
595
|
+
}
|
|
596
|
+
}
|
|
597
|
+
function formatDiffParts(parts) {
|
|
598
|
+
const lines = [];
|
|
599
|
+
for (const part of parts) {
|
|
600
|
+
const prefix = part.added ? "+ " : part.removed ? "- " : "";
|
|
601
|
+
const partLines = part.value.split("\n");
|
|
602
|
+
for (let i = 0; i < partLines.length; i++) {
|
|
603
|
+
const line = partLines[i];
|
|
604
|
+
if (i === partLines.length - 1 && line === "")
|
|
605
|
+
continue;
|
|
606
|
+
lines.push(prefix + line);
|
|
607
|
+
}
|
|
608
|
+
}
|
|
609
|
+
return lines.join("\n");
|
|
610
|
+
}
|
|
611
|
+
function createDiffString(expected, actual, diffOptions) {
|
|
612
|
+
const expectedProcessed = preprocessForDiff(expected, diffOptions);
|
|
613
|
+
const actualProcessed = preprocessForDiff(actual, diffOptions);
|
|
614
|
+
if (diffOptions?.keysOnly) {
|
|
615
|
+
const expectedKeys = JSON.stringify(extractKeys(expectedProcessed), null, 2);
|
|
616
|
+
const actualKeys = JSON.stringify(extractKeys(actualProcessed), null, 2);
|
|
617
|
+
const parts2 = diff.diffLines(expectedKeys, actualKeys);
|
|
618
|
+
return formatDiffParts(parts2);
|
|
619
|
+
}
|
|
620
|
+
const expectedStr = toPrettyJson(expectedProcessed);
|
|
621
|
+
const actualStr = toPrettyJson(actualProcessed);
|
|
622
|
+
if (expectedStr === actualStr) {
|
|
623
|
+
return "";
|
|
624
|
+
}
|
|
625
|
+
const parts = diff.diffLines(expectedStr, actualStr);
|
|
626
|
+
if (diffOptions?.outputNewOnly) {
|
|
627
|
+
const filtered = parts.filter((p) => p.added === true);
|
|
628
|
+
return formatDiffParts(filtered);
|
|
629
|
+
}
|
|
630
|
+
return formatDiffParts(parts);
|
|
631
|
+
}
|
|
632
|
+
function extractKeys(value) {
|
|
633
|
+
if (value === null || typeof value !== "object") {
|
|
634
|
+
return "\xB7";
|
|
635
|
+
}
|
|
636
|
+
if (Array.isArray(value)) {
|
|
637
|
+
return value.map(extractKeys);
|
|
638
|
+
}
|
|
639
|
+
const result = {};
|
|
640
|
+
for (const [k, v] of Object.entries(value)) {
|
|
641
|
+
result[k] = extractKeys(v);
|
|
642
|
+
}
|
|
643
|
+
return result;
|
|
644
|
+
}
|
|
645
|
+
function formatLogMessage(msg) {
|
|
646
|
+
if (typeof msg === "string")
|
|
647
|
+
return msg;
|
|
648
|
+
if (msg instanceof Error)
|
|
649
|
+
return msg.stack ?? msg.message;
|
|
650
|
+
try {
|
|
651
|
+
if (msg !== null && typeof msg === "object") {
|
|
652
|
+
return JSON.stringify(msg, null, 2);
|
|
653
|
+
}
|
|
654
|
+
return String(msg);
|
|
655
|
+
} catch {
|
|
656
|
+
return String(msg);
|
|
657
|
+
}
|
|
658
|
+
}
|
|
659
|
+
function createLogEntry(message, options) {
|
|
660
|
+
return {
|
|
661
|
+
type: "log",
|
|
662
|
+
label: options?.label,
|
|
663
|
+
message: formatLogMessage(message)
|
|
664
|
+
};
|
|
665
|
+
}
|
|
666
|
+
function getLogLines(entry) {
|
|
667
|
+
return entry.message.split("\n");
|
|
668
|
+
}
|
|
669
|
+
function createDiffLogEntry(expected, actual, options) {
|
|
670
|
+
const { label, ...diffOpts } = options ?? {};
|
|
671
|
+
const diff = createDiffString(expected, actual, diffOpts);
|
|
672
|
+
return {
|
|
673
|
+
type: "diff",
|
|
674
|
+
label,
|
|
675
|
+
expected,
|
|
676
|
+
actual,
|
|
677
|
+
diff: diff || "(no differences)"
|
|
678
|
+
};
|
|
679
|
+
}
|
|
680
|
+
function printJsonDiff(expected, actual, options = {}) {
|
|
681
|
+
const { color = true, ...diffOpts } = options;
|
|
682
|
+
const diff = createDiffString(expected, actual, diffOpts);
|
|
683
|
+
if (color) {
|
|
684
|
+
const lines = diff.split("\n").map((line) => {
|
|
685
|
+
const trimmed = line.trimStart();
|
|
686
|
+
if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
|
|
687
|
+
return `\x1B[31m${line}\x1B[0m`;
|
|
688
|
+
}
|
|
689
|
+
if (trimmed.startsWith("+") && !trimmed.startsWith("+++")) {
|
|
690
|
+
return `\x1B[32m${line}\x1B[0m`;
|
|
691
|
+
}
|
|
692
|
+
return line;
|
|
693
|
+
});
|
|
694
|
+
const colored = lines.join("\n");
|
|
695
|
+
console.log(colored || "(no differences)");
|
|
696
|
+
return colored;
|
|
697
|
+
}
|
|
698
|
+
console.log(diff || "(no differences)");
|
|
699
|
+
return diff;
|
|
700
|
+
}
|
|
701
|
+
|
|
702
|
+
// src/evals/metric.ts
|
|
703
|
+
var registry = /* @__PURE__ */ new Map();
|
|
704
|
+
var Metric = {
|
|
705
|
+
of(config) {
|
|
706
|
+
const def = {
|
|
707
|
+
id: config.id,
|
|
708
|
+
name: config.name,
|
|
709
|
+
aggregate: config.aggregate,
|
|
710
|
+
format: config.format,
|
|
711
|
+
make: (data, options) => ({
|
|
712
|
+
id: config.id,
|
|
713
|
+
data,
|
|
714
|
+
...options?.name !== void 0 && { name: options.name }
|
|
715
|
+
})
|
|
716
|
+
};
|
|
717
|
+
registry.set(config.id, def);
|
|
718
|
+
return def;
|
|
719
|
+
}
|
|
720
|
+
};
|
|
721
|
+
function getMetricById(id) {
|
|
722
|
+
return registry.get(id);
|
|
723
|
+
}
|
|
724
|
+
|
|
725
|
+
// src/evals/aggregators.ts
|
|
726
|
+
function aggregateTokenCountSum(values) {
|
|
727
|
+
const initial = {
|
|
728
|
+
input: 0,
|
|
729
|
+
output: 0,
|
|
730
|
+
inputCached: 0,
|
|
731
|
+
outputCached: 0
|
|
732
|
+
};
|
|
733
|
+
return values.reduce(
|
|
734
|
+
(acc, v) => ({
|
|
735
|
+
input: acc.input + (v.input ?? 0),
|
|
736
|
+
output: acc.output + (v.output ?? 0),
|
|
737
|
+
inputCached: acc.inputCached + (v.inputCached ?? 0),
|
|
738
|
+
outputCached: acc.outputCached + (v.outputCached ?? 0)
|
|
739
|
+
}),
|
|
740
|
+
initial
|
|
741
|
+
);
|
|
742
|
+
}
|
|
743
|
+
function aggregateLatencyAverage(values) {
|
|
744
|
+
if (values.length === 0) {
|
|
745
|
+
return { ms: 0 };
|
|
746
|
+
}
|
|
747
|
+
const sum = values.reduce((s, v) => s + v.ms, 0);
|
|
748
|
+
return { ms: sum / values.length };
|
|
749
|
+
}
|
|
750
|
+
|
|
751
|
+
// src/evals/metrics/standard.ts
|
|
752
|
+
var tokenCountMetric = Metric.of({
|
|
753
|
+
id: "token-count",
|
|
754
|
+
name: "Tokens",
|
|
755
|
+
aggregate: aggregateTokenCountSum,
|
|
756
|
+
format: (data, options) => {
|
|
757
|
+
const input = data.input ?? 0;
|
|
758
|
+
const output = data.output ?? 0;
|
|
759
|
+
const inputCached = data.inputCached ?? 0;
|
|
760
|
+
const outputCached = data.outputCached ?? 0;
|
|
761
|
+
const cached = inputCached + outputCached;
|
|
762
|
+
const base = `in:${input} out:${output} cached:${cached}`;
|
|
763
|
+
return options?.isAggregated ? `Total: ${base}` : base;
|
|
764
|
+
}
|
|
765
|
+
});
|
|
766
|
+
var latencyMetric = Metric.of({
|
|
767
|
+
id: "latency",
|
|
768
|
+
name: "Latency",
|
|
769
|
+
aggregate: aggregateLatencyAverage,
|
|
770
|
+
format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
|
|
771
|
+
});
|
|
772
|
+
|
|
773
|
+
// src/evals/run-config.ts
|
|
774
|
+
function validateRow(row, index) {
|
|
775
|
+
const hasEvaluators = "evaluators" in row && row.evaluators !== void 0 && row.evaluators !== void 0;
|
|
776
|
+
const hasPattern = "evaluatorPattern" in row && typeof row.evaluatorPattern === "string" && row.evaluatorPattern.trim().length > 0;
|
|
777
|
+
if (hasEvaluators && hasPattern) {
|
|
778
|
+
throw new Error(`RunConfig run[${index}] must not set both evaluators and evaluatorPattern`);
|
|
779
|
+
}
|
|
780
|
+
if (!hasEvaluators && !hasPattern) {
|
|
781
|
+
throw new Error(`RunConfig run[${index}] must set either evaluators or evaluatorPattern`);
|
|
782
|
+
}
|
|
783
|
+
if (hasEvaluators && row.evaluators.length === 0) {
|
|
784
|
+
throw new Error(`RunConfig run[${index}]: evaluators must be non-empty`);
|
|
785
|
+
}
|
|
786
|
+
const rawRep = "repetitions" in row ? row.repetitions : void 0;
|
|
787
|
+
const repetitions = rawRep ?? 1;
|
|
788
|
+
if (!Number.isInteger(repetitions) || repetitions < 1) {
|
|
789
|
+
throw new Error(
|
|
790
|
+
`RunConfig run[${index}]: repetitions must be a positive integer, got ${String(rawRep)}`
|
|
791
|
+
);
|
|
792
|
+
}
|
|
793
|
+
}
|
|
794
|
+
var RunConfig = class _RunConfig {
|
|
795
|
+
constructor(name, displayName, tags, runs) {
|
|
796
|
+
this._name = name;
|
|
797
|
+
this._displayName = displayName;
|
|
798
|
+
this._tags = tags;
|
|
799
|
+
this._runs = runs;
|
|
800
|
+
}
|
|
801
|
+
static define(config) {
|
|
802
|
+
if (config.runs.length === 0) {
|
|
803
|
+
throw new Error("RunConfig runs must be non-empty");
|
|
804
|
+
}
|
|
805
|
+
config.runs.forEach(validateRow);
|
|
806
|
+
const name = validateRunConfigName(config.name, "RunConfig.define");
|
|
807
|
+
const displayName = normalizeOptionalDisplayName(config.displayName);
|
|
808
|
+
const tags = config.tags !== void 0 ? [...config.tags] : [];
|
|
809
|
+
return new _RunConfig(name, displayName, tags, config.runs);
|
|
810
|
+
}
|
|
811
|
+
/** Canonical id (branded {@link RunConfigName} at runtime; typed as `string` for ergonomics). */
|
|
812
|
+
getName() {
|
|
813
|
+
return this._name;
|
|
814
|
+
}
|
|
815
|
+
/** Optional unrestricted display label. */
|
|
816
|
+
getDisplayName() {
|
|
817
|
+
return this._displayName;
|
|
818
|
+
}
|
|
819
|
+
/** Label for CLI/TUI: {@link getDisplayName} if set, otherwise {@link getName}. */
|
|
820
|
+
getDisplayLabel() {
|
|
821
|
+
return this._displayName ?? this._name;
|
|
822
|
+
}
|
|
823
|
+
/** Tags from `RunConfig.define({ tags })`; surfaced as `runConfigTags` on evaluator callbacks. */
|
|
824
|
+
getTags() {
|
|
825
|
+
return [...this._tags];
|
|
826
|
+
}
|
|
827
|
+
getRuns() {
|
|
828
|
+
return this._runs;
|
|
829
|
+
}
|
|
830
|
+
};
|
|
831
|
+
|
|
832
|
+
// src/evals/score.ts
|
|
833
|
+
var registry2 = /* @__PURE__ */ new Map();
|
|
834
|
+
function formatScoreData(def, data, options) {
|
|
835
|
+
return options?.isAggregated ? def.formatAggregate(data) : def.formatValue(data);
|
|
554
836
|
}
|
|
555
837
|
var ScoreAggregate = {
|
|
556
838
|
/** Average numeric fields. Use for scores like { value, delta }. */
|
|
@@ -655,54 +937,6 @@ function getScoreById(id) {
|
|
|
655
937
|
return registry2.get(id);
|
|
656
938
|
}
|
|
657
939
|
|
|
658
|
-
// src/evals/aggregators.ts
|
|
659
|
-
function aggregateTokenCountSum(values) {
|
|
660
|
-
const initial = {
|
|
661
|
-
input: 0,
|
|
662
|
-
output: 0,
|
|
663
|
-
inputCached: 0,
|
|
664
|
-
outputCached: 0
|
|
665
|
-
};
|
|
666
|
-
return values.reduce(
|
|
667
|
-
(acc, v) => ({
|
|
668
|
-
input: acc.input + (v.input ?? 0),
|
|
669
|
-
output: acc.output + (v.output ?? 0),
|
|
670
|
-
inputCached: acc.inputCached + (v.inputCached ?? 0),
|
|
671
|
-
outputCached: acc.outputCached + (v.outputCached ?? 0)
|
|
672
|
-
}),
|
|
673
|
-
initial
|
|
674
|
-
);
|
|
675
|
-
}
|
|
676
|
-
function aggregateLatencyAverage(values) {
|
|
677
|
-
if (values.length === 0) {
|
|
678
|
-
return { ms: 0 };
|
|
679
|
-
}
|
|
680
|
-
const sum = values.reduce((s, v) => s + v.ms, 0);
|
|
681
|
-
return { ms: sum / values.length };
|
|
682
|
-
}
|
|
683
|
-
|
|
684
|
-
// src/evals/metrics/standard.ts
|
|
685
|
-
var tokenCountMetric = Metric.of({
|
|
686
|
-
id: "token-count",
|
|
687
|
-
name: "Tokens",
|
|
688
|
-
aggregate: aggregateTokenCountSum,
|
|
689
|
-
format: (data, options) => {
|
|
690
|
-
const input = data.input ?? 0;
|
|
691
|
-
const output = data.output ?? 0;
|
|
692
|
-
const inputCached = data.inputCached ?? 0;
|
|
693
|
-
const outputCached = data.outputCached ?? 0;
|
|
694
|
-
const cached = inputCached + outputCached;
|
|
695
|
-
const base = `in:${input} out:${output} cached:${cached}`;
|
|
696
|
-
return options?.isAggregated ? `Total: ${base}` : base;
|
|
697
|
-
}
|
|
698
|
-
});
|
|
699
|
-
var latencyMetric = Metric.of({
|
|
700
|
-
id: "latency",
|
|
701
|
-
name: "Latency",
|
|
702
|
-
aggregate: aggregateLatencyAverage,
|
|
703
|
-
format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
|
|
704
|
-
});
|
|
705
|
-
|
|
706
940
|
// src/evals/scores/standard.ts
|
|
707
941
|
var percentScore = Score.of({
|
|
708
942
|
id: "percent",
|
|
@@ -734,148 +968,197 @@ var binaryScore = Score.of({
|
|
|
734
968
|
},
|
|
735
969
|
aggregateValues: Score.aggregate.all
|
|
736
970
|
});
|
|
737
|
-
|
|
738
|
-
|
|
739
|
-
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
|
|
743
|
-
|
|
744
|
-
|
|
745
|
-
|
|
746
|
-
const keys = Array.isArray(options.excludeKeys) ? options.excludeKeys : options.excludeKeys.split(",").map((k) => k.trim());
|
|
747
|
-
const filtered = {};
|
|
748
|
-
for (const [k, v] of Object.entries(value)) {
|
|
749
|
-
if (!keys.includes(k)) {
|
|
750
|
-
filtered[k] = preprocessForDiff(v, options);
|
|
751
|
-
}
|
|
971
|
+
|
|
972
|
+
// src/evals/tag-set.ts
|
|
973
|
+
var TagSet = class {
|
|
974
|
+
constructor() {
|
|
975
|
+
}
|
|
976
|
+
static define(tags) {
|
|
977
|
+
const out = {};
|
|
978
|
+
for (const tag of tags) {
|
|
979
|
+
out[tag] = tag;
|
|
752
980
|
}
|
|
753
|
-
return
|
|
981
|
+
return out;
|
|
754
982
|
}
|
|
755
|
-
|
|
756
|
-
|
|
757
|
-
|
|
758
|
-
|
|
759
|
-
|
|
760
|
-
|
|
983
|
+
};
|
|
984
|
+
|
|
985
|
+
// src/evals/test-case.ts
|
|
986
|
+
function resolve(value) {
|
|
987
|
+
return typeof value === "function" ? value() : value;
|
|
988
|
+
}
|
|
989
|
+
var TestCase = class _TestCase {
|
|
990
|
+
constructor(config) {
|
|
991
|
+
this._config = config;
|
|
761
992
|
}
|
|
762
|
-
|
|
763
|
-
|
|
993
|
+
static describe(config) {
|
|
994
|
+
const name = validateTestCaseName(config.name, "TestCase.describe");
|
|
995
|
+
const displayName = normalizeOptionalDisplayName(config.displayName);
|
|
996
|
+
return new _TestCase({
|
|
997
|
+
name,
|
|
998
|
+
displayName,
|
|
999
|
+
tags: config.tags,
|
|
1000
|
+
inputSchema: config.inputSchema,
|
|
1001
|
+
input: config.input,
|
|
1002
|
+
outputSchema: config.outputSchema,
|
|
1003
|
+
output: config.output
|
|
1004
|
+
});
|
|
764
1005
|
}
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
function toPrettyJson(value) {
|
|
768
|
-
const str = stringify__default.default(value);
|
|
769
|
-
try {
|
|
770
|
-
const parsed = JSON.parse(str);
|
|
771
|
-
return JSON.stringify(parsed, null, 2);
|
|
772
|
-
} catch {
|
|
773
|
-
return str;
|
|
1006
|
+
getName() {
|
|
1007
|
+
return this._config.name;
|
|
774
1008
|
}
|
|
775
|
-
|
|
776
|
-
|
|
777
|
-
const lines = [];
|
|
778
|
-
for (const part of parts) {
|
|
779
|
-
const prefix = part.added ? "+ " : part.removed ? "- " : "";
|
|
780
|
-
const partLines = part.value.split("\n");
|
|
781
|
-
for (let i = 0; i < partLines.length; i++) {
|
|
782
|
-
const line = partLines[i];
|
|
783
|
-
if (i === partLines.length - 1 && line === "")
|
|
784
|
-
continue;
|
|
785
|
-
lines.push(prefix + line);
|
|
786
|
-
}
|
|
1009
|
+
getDisplayName() {
|
|
1010
|
+
return this._config.displayName;
|
|
787
1011
|
}
|
|
788
|
-
|
|
789
|
-
|
|
790
|
-
function createDiffString(expected, actual, diffOptions) {
|
|
791
|
-
const expectedProcessed = preprocessForDiff(expected, diffOptions);
|
|
792
|
-
const actualProcessed = preprocessForDiff(actual, diffOptions);
|
|
793
|
-
if (diffOptions?.keysOnly) {
|
|
794
|
-
const expectedKeys = JSON.stringify(extractKeys(expectedProcessed), null, 2);
|
|
795
|
-
const actualKeys = JSON.stringify(extractKeys(actualProcessed), null, 2);
|
|
796
|
-
const parts2 = diff.diffLines(expectedKeys, actualKeys);
|
|
797
|
-
return formatDiffParts(parts2);
|
|
1012
|
+
getDisplayLabel() {
|
|
1013
|
+
return this._config.displayName ?? this._config.name;
|
|
798
1014
|
}
|
|
799
|
-
|
|
800
|
-
|
|
801
|
-
if (expectedStr === actualStr) {
|
|
802
|
-
return "";
|
|
1015
|
+
getTags() {
|
|
1016
|
+
return this._config.tags;
|
|
803
1017
|
}
|
|
804
|
-
|
|
805
|
-
|
|
806
|
-
const filtered = parts.filter((p) => p.added === true);
|
|
807
|
-
return formatDiffParts(filtered);
|
|
1018
|
+
getInputSchema() {
|
|
1019
|
+
return this._config.inputSchema;
|
|
808
1020
|
}
|
|
809
|
-
|
|
810
|
-
|
|
811
|
-
function extractKeys(value) {
|
|
812
|
-
if (value === null || typeof value !== "object") {
|
|
813
|
-
return "\xB7";
|
|
1021
|
+
getInput() {
|
|
1022
|
+
return resolve(this._config.input);
|
|
814
1023
|
}
|
|
815
|
-
|
|
816
|
-
return
|
|
1024
|
+
getOutputSchema() {
|
|
1025
|
+
return this._config.outputSchema;
|
|
817
1026
|
}
|
|
818
|
-
|
|
819
|
-
|
|
820
|
-
|
|
1027
|
+
getOutput() {
|
|
1028
|
+
if (this._config.output === void 0) {
|
|
1029
|
+
return void 0;
|
|
1030
|
+
}
|
|
1031
|
+
return resolve(this._config.output);
|
|
821
1032
|
}
|
|
822
|
-
|
|
1033
|
+
};
|
|
1034
|
+
function getTestCaseDisplayLabel(testCase) {
|
|
1035
|
+
if (typeof testCase.getDisplayLabel === "function") {
|
|
1036
|
+
return testCase.getDisplayLabel();
|
|
1037
|
+
}
|
|
1038
|
+
return typeof testCase.getName === "function" ? testCase.getName() : "";
|
|
823
1039
|
}
|
|
824
|
-
function
|
|
825
|
-
|
|
826
|
-
|
|
827
|
-
|
|
828
|
-
|
|
1040
|
+
function getTestCaseTagList(testCase) {
|
|
1041
|
+
return typeof testCase.getTags === "function" ? [...testCase.getTags()] : [];
|
|
1042
|
+
}
|
|
1043
|
+
async function loadRunSnapshotsFromArtifacts(config) {
|
|
1044
|
+
const baseDir = path.resolve(config.artifactDirectory);
|
|
1045
|
+
let entries;
|
|
829
1046
|
try {
|
|
830
|
-
|
|
831
|
-
return JSON.stringify(msg, null, 2);
|
|
832
|
-
}
|
|
833
|
-
return String(msg);
|
|
1047
|
+
entries = await promises.readdir(baseDir);
|
|
834
1048
|
} catch {
|
|
835
|
-
return
|
|
1049
|
+
return [];
|
|
836
1050
|
}
|
|
1051
|
+
const jsonlFiles = entries.filter((name) => name.endsWith(".jsonl"));
|
|
1052
|
+
const snapshots = [];
|
|
1053
|
+
for (const fileName of jsonlFiles) {
|
|
1054
|
+
const filePath = path.join(baseDir, fileName);
|
|
1055
|
+
try {
|
|
1056
|
+
const snapshot = await parseArtifactToSnapshot(filePath, config);
|
|
1057
|
+
if (snapshot) {
|
|
1058
|
+
snapshots.push(snapshot);
|
|
1059
|
+
}
|
|
1060
|
+
} catch {
|
|
1061
|
+
}
|
|
1062
|
+
}
|
|
1063
|
+
return snapshots.sort((a, b) => b.queuedAt - a.queuedAt);
|
|
837
1064
|
}
|
|
838
|
-
function
|
|
839
|
-
|
|
840
|
-
|
|
841
|
-
|
|
842
|
-
|
|
843
|
-
}
|
|
844
|
-
|
|
845
|
-
|
|
846
|
-
|
|
847
|
-
|
|
848
|
-
|
|
849
|
-
|
|
850
|
-
|
|
851
|
-
|
|
852
|
-
|
|
853
|
-
|
|
854
|
-
|
|
855
|
-
|
|
856
|
-
|
|
857
|
-
|
|
858
|
-
|
|
859
|
-
|
|
860
|
-
|
|
861
|
-
|
|
862
|
-
if (color) {
|
|
863
|
-
const lines = diff.split("\n").map((line) => {
|
|
864
|
-
const trimmed = line.trimStart();
|
|
865
|
-
if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
|
|
866
|
-
return `\x1B[31m${line}\x1B[0m`;
|
|
1065
|
+
async function parseArtifactToSnapshot(filePath, _config) {
|
|
1066
|
+
const content = await promises.readFile(filePath, "utf8");
|
|
1067
|
+
const lines = content.split("\n").filter((line) => line.trim().length > 0);
|
|
1068
|
+
if (lines.length === 0) {
|
|
1069
|
+
return null;
|
|
1070
|
+
}
|
|
1071
|
+
let runQueued = null;
|
|
1072
|
+
let runCompleted = null;
|
|
1073
|
+
let runFailed = null;
|
|
1074
|
+
let runStarted = null;
|
|
1075
|
+
for (const line of lines) {
|
|
1076
|
+
try {
|
|
1077
|
+
const event = JSON.parse(line);
|
|
1078
|
+
const type = event.type;
|
|
1079
|
+
if (type === "RunQueued") {
|
|
1080
|
+
runQueued = {
|
|
1081
|
+
runId: event.runId,
|
|
1082
|
+
datasetId: event.datasetId,
|
|
1083
|
+
datasetName: event.datasetName,
|
|
1084
|
+
evaluatorIds: event.evaluatorIds,
|
|
1085
|
+
totalTestCases: event.totalTestCases ?? 0,
|
|
1086
|
+
artifactPath: event.artifactPath ?? filePath,
|
|
1087
|
+
ts: event.ts
|
|
1088
|
+
};
|
|
867
1089
|
}
|
|
868
|
-
if (
|
|
869
|
-
|
|
1090
|
+
if (type === "RunStarted") {
|
|
1091
|
+
runStarted = { startedAt: event.startedAt };
|
|
1092
|
+
}
|
|
1093
|
+
if (type === "RunCompleted") {
|
|
1094
|
+
runCompleted = {
|
|
1095
|
+
passedTestCases: event.passedTestCases,
|
|
1096
|
+
failedTestCases: event.failedTestCases,
|
|
1097
|
+
totalTestCases: event.totalTestCases,
|
|
1098
|
+
finishedAt: event.finishedAt
|
|
1099
|
+
};
|
|
1100
|
+
}
|
|
1101
|
+
if (type === "RunFailed") {
|
|
1102
|
+
runFailed = {
|
|
1103
|
+
finishedAt: event.finishedAt,
|
|
1104
|
+
errorMessage: event.errorMessage
|
|
1105
|
+
};
|
|
1106
|
+
}
|
|
1107
|
+
} catch {
|
|
1108
|
+
}
|
|
1109
|
+
}
|
|
1110
|
+
if (!runQueued) {
|
|
1111
|
+
return null;
|
|
1112
|
+
}
|
|
1113
|
+
const artifactPath = filePath;
|
|
1114
|
+
const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
|
|
1115
|
+
const progress = aggregateTestCaseProgress(lines);
|
|
1116
|
+
const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
|
|
1117
|
+
const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
|
|
1118
|
+
const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
|
|
1119
|
+
return {
|
|
1120
|
+
runId: runQueued.runId,
|
|
1121
|
+
datasetId: runQueued.datasetId,
|
|
1122
|
+
datasetName: runQueued.datasetName,
|
|
1123
|
+
evaluatorIds: runQueued.evaluatorIds,
|
|
1124
|
+
queuedAt: runQueued.ts ?? 0,
|
|
1125
|
+
startedAt: runStarted?.startedAt,
|
|
1126
|
+
finishedAt: runCompleted?.finishedAt ?? runFailed?.finishedAt,
|
|
1127
|
+
totalTestCases: runQueued.totalTestCases,
|
|
1128
|
+
completedTestCases,
|
|
1129
|
+
passedTestCases,
|
|
1130
|
+
failedTestCases,
|
|
1131
|
+
status,
|
|
1132
|
+
artifactPath,
|
|
1133
|
+
errorMessage: runFailed?.errorMessage
|
|
1134
|
+
};
|
|
1135
|
+
}
|
|
1136
|
+
function aggregateTestCaseProgress(lines) {
|
|
1137
|
+
let completedTestCases = 0;
|
|
1138
|
+
const testCasePassedBy = /* @__PURE__ */ new Map();
|
|
1139
|
+
for (const line of lines) {
|
|
1140
|
+
try {
|
|
1141
|
+
const event = JSON.parse(line);
|
|
1142
|
+
if (event.type === "TestCaseProgress") {
|
|
1143
|
+
const ev = event;
|
|
1144
|
+
completedTestCases = ev.completedTestCases ?? completedTestCases;
|
|
1145
|
+
const id = ev.testCaseId;
|
|
1146
|
+
const current = testCasePassedBy.get(id);
|
|
1147
|
+
testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
|
|
870
1148
|
}
|
|
871
|
-
|
|
872
|
-
}
|
|
873
|
-
const colored = lines.join("\n");
|
|
874
|
-
console.log(colored || "(no differences)");
|
|
875
|
-
return colored;
|
|
1149
|
+
} catch {
|
|
1150
|
+
}
|
|
876
1151
|
}
|
|
877
|
-
|
|
878
|
-
|
|
1152
|
+
let passedTestCases = 0;
|
|
1153
|
+
let failedTestCases = 0;
|
|
1154
|
+
for (const passed of testCasePassedBy.values()) {
|
|
1155
|
+
if (passed) {
|
|
1156
|
+
passedTestCases += 1;
|
|
1157
|
+
} else {
|
|
1158
|
+
failedTestCases += 1;
|
|
1159
|
+
}
|
|
1160
|
+
}
|
|
1161
|
+
return { completedTestCases, passedTestCases, failedTestCases };
|
|
879
1162
|
}
|
|
880
1163
|
|
|
881
1164
|
// src/runner/config.ts
|
|
@@ -887,6 +1170,7 @@ var defaultRunnerConfig = {
|
|
|
887
1170
|
rootDir: process.cwd(),
|
|
888
1171
|
datasetSuffixes: [".dataset.ts", ".dataset.tsx", ".dataset.js", ".dataset.mjs"],
|
|
889
1172
|
evaluatorSuffixes: [".evaluator.ts", ".evaluator.tsx", ".evaluator.js", ".evaluator.mjs"],
|
|
1173
|
+
runConfigSuffixes: [".run-config.ts", ".run-config.tsx", ".run-config.js", ".run-config.mjs"],
|
|
890
1174
|
testCaseSuffixes: [".test-case.ts", ".test-case.tsx", ".test-case.js", ".test-case.mjs"],
|
|
891
1175
|
excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
|
|
892
1176
|
},
|
|
@@ -912,6 +1196,11 @@ function toRunnerConfigOverrides(config) {
|
|
|
912
1196
|
} else if (rawDiscovery?.evaluatorSuffixes !== void 0) {
|
|
913
1197
|
discovery.evaluatorSuffixes = rawDiscovery.evaluatorSuffixes;
|
|
914
1198
|
}
|
|
1199
|
+
if (rawDiscovery?.runConfigFilePatterns !== void 0) {
|
|
1200
|
+
discovery.runConfigSuffixes = rawDiscovery.runConfigFilePatterns;
|
|
1201
|
+
} else if (rawDiscovery?.runConfigSuffixes !== void 0) {
|
|
1202
|
+
discovery.runConfigSuffixes = rawDiscovery.runConfigSuffixes;
|
|
1203
|
+
}
|
|
915
1204
|
if (rawDiscovery?.testCaseFilePatterns !== void 0) {
|
|
916
1205
|
discovery.testCaseSuffixes = rawDiscovery.testCaseFilePatterns;
|
|
917
1206
|
} else if (rawDiscovery?.testCaseSuffixes !== void 0) {
|
|
@@ -1010,6 +1299,9 @@ function isDatasetLike(value) {
|
|
|
1010
1299
|
function isEvaluatorLike(value) {
|
|
1011
1300
|
return hasMethod(value, "getName") && hasMethod(value, "resolveContext") && hasMethod(value, "getEvaluateFn");
|
|
1012
1301
|
}
|
|
1302
|
+
function isRunConfigLike(value) {
|
|
1303
|
+
return hasMethod(value, "getName") && hasMethod(value, "getRuns") && typeof value.getRuns === "function";
|
|
1304
|
+
}
|
|
1013
1305
|
function isTestCaseLike(value) {
|
|
1014
1306
|
return hasMethod(value, "getName") && hasMethod(value, "getTags") && hasMethod(value, "getInput");
|
|
1015
1307
|
}
|
|
@@ -1098,6 +1390,23 @@ async function collectEvaluatorsFromFiles(config) {
|
|
|
1098
1390
|
);
|
|
1099
1391
|
return found.flat();
|
|
1100
1392
|
}
|
|
1393
|
+
async function collectRunConfigsFromFiles(config) {
|
|
1394
|
+
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
1395
|
+
const matched = files.filter((filePath) => hasOneSuffix(filePath, config.runConfigSuffixes));
|
|
1396
|
+
const found = await Promise.all(
|
|
1397
|
+
matched.map(async (absolutePath) => {
|
|
1398
|
+
const exports = await loadModuleExports(absolutePath);
|
|
1399
|
+
const runConfigs = exports.filter(isRunConfigLike);
|
|
1400
|
+
const relPath = path.relative(config.rootDir, absolutePath);
|
|
1401
|
+
return runConfigs.map((runConfig) => ({
|
|
1402
|
+
id: runConfig.getName(),
|
|
1403
|
+
filePath: relPath,
|
|
1404
|
+
runConfig
|
|
1405
|
+
}));
|
|
1406
|
+
})
|
|
1407
|
+
);
|
|
1408
|
+
return found.flat();
|
|
1409
|
+
}
|
|
1101
1410
|
async function collectTestCasesFromFiles(config) {
|
|
1102
1411
|
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
1103
1412
|
const matched = files.filter((filePath) => hasOneSuffix(filePath, config.testCaseSuffixes));
|
|
@@ -1190,15 +1499,17 @@ function readOutput(testCase) {
|
|
|
1190
1499
|
}
|
|
1191
1500
|
return candidate.getOutput();
|
|
1192
1501
|
}
|
|
1193
|
-
function buildEvaluationUnits(testCases) {
|
|
1502
|
+
function buildEvaluationUnits(testCases, repetitionCount) {
|
|
1503
|
+
const count = Math.max(1, repetitionCount);
|
|
1194
1504
|
const units = [];
|
|
1195
1505
|
for (const testCaseItem of testCases) {
|
|
1196
|
-
const
|
|
1197
|
-
for (let r = 0; r <
|
|
1506
|
+
const repetitionId = `rep-${crypto.randomUUID()}`;
|
|
1507
|
+
for (let r = 0; r < count; r++) {
|
|
1198
1508
|
units.push({
|
|
1199
1509
|
testCaseItem,
|
|
1200
|
-
|
|
1201
|
-
|
|
1510
|
+
repetitionId,
|
|
1511
|
+
repetitionIndex: r + 1,
|
|
1512
|
+
repetitionCount: count
|
|
1202
1513
|
});
|
|
1203
1514
|
}
|
|
1204
1515
|
}
|
|
@@ -1211,7 +1522,7 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
|
|
|
1211
1522
|
return path.join(artifactDirectory, `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`);
|
|
1212
1523
|
}
|
|
1213
1524
|
function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
|
|
1214
|
-
const { testCaseItem,
|
|
1525
|
+
const { testCaseItem, repetitionId, repetitionIndex, repetitionCount } = unit;
|
|
1215
1526
|
return effect.Effect.gen(function* () {
|
|
1216
1527
|
const evaluatorRunId = `run-${crypto.randomUUID()}`;
|
|
1217
1528
|
const started = Date.now();
|
|
@@ -1220,11 +1531,12 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1220
1531
|
type: "TestCaseStarted",
|
|
1221
1532
|
runId: task.runId,
|
|
1222
1533
|
testCaseId: testCaseItem.id,
|
|
1223
|
-
testCaseName: testCaseItem.testCase
|
|
1534
|
+
testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
|
|
1224
1535
|
startedTestCases: startedEvaluations,
|
|
1225
1536
|
totalTestCases: totalEvaluations,
|
|
1226
|
-
|
|
1227
|
-
|
|
1537
|
+
repetitionId,
|
|
1538
|
+
repetitionIndex,
|
|
1539
|
+
repetitionCount
|
|
1228
1540
|
});
|
|
1229
1541
|
const evaluatorScores = [];
|
|
1230
1542
|
let testCaseError;
|
|
@@ -1258,8 +1570,15 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1258
1570
|
meta: {
|
|
1259
1571
|
triggerId: task.triggerId,
|
|
1260
1572
|
runId: evaluatorRunId,
|
|
1261
|
-
datasetId: task.datasetId
|
|
1573
|
+
datasetId: task.datasetId,
|
|
1574
|
+
repetitionId,
|
|
1575
|
+
repetitionIndex,
|
|
1576
|
+
repetitionCount,
|
|
1577
|
+
runConfigName: task.runConfigName
|
|
1262
1578
|
},
|
|
1579
|
+
testCaseTags: getTestCaseTagList(testCaseItem.testCase),
|
|
1580
|
+
runConfigTags: task.runConfigTags,
|
|
1581
|
+
evaluatorTags: getEvaluatorTagList(evaluator),
|
|
1263
1582
|
logDiff,
|
|
1264
1583
|
log,
|
|
1265
1584
|
createError
|
|
@@ -1302,18 +1621,19 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1302
1621
|
});
|
|
1303
1622
|
}
|
|
1304
1623
|
}
|
|
1305
|
-
const
|
|
1624
|
+
const repetitionPassedThis = evaluatorScores.every((s) => s.passed);
|
|
1306
1625
|
const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [n + 1, n + 1]);
|
|
1307
1626
|
const progressEvent = {
|
|
1308
1627
|
type: "TestCaseProgress",
|
|
1309
1628
|
runId: task.runId,
|
|
1310
1629
|
testCaseId: testCaseItem.id,
|
|
1311
|
-
testCaseName: testCaseItem.testCase
|
|
1630
|
+
testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
|
|
1312
1631
|
completedTestCases: completedEvaluations,
|
|
1313
1632
|
totalTestCases: totalEvaluations,
|
|
1314
|
-
|
|
1315
|
-
|
|
1316
|
-
|
|
1633
|
+
repetitionId,
|
|
1634
|
+
repetitionIndex,
|
|
1635
|
+
repetitionCount,
|
|
1636
|
+
passed: repetitionPassedThis,
|
|
1317
1637
|
durationMs: Date.now() - started,
|
|
1318
1638
|
evaluatorScores,
|
|
1319
1639
|
output,
|
|
@@ -1334,9 +1654,9 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1334
1654
|
(map) => {
|
|
1335
1655
|
const key = testCaseItem.id;
|
|
1336
1656
|
const existing = map.get(key) ?? { completedCount: 0, results: [] };
|
|
1337
|
-
const newResults = [...existing.results,
|
|
1657
|
+
const newResults = [...existing.results, repetitionPassedThis];
|
|
1338
1658
|
const newCompletedCount = existing.completedCount + 1;
|
|
1339
|
-
const isLast = newCompletedCount ===
|
|
1659
|
+
const isLast = newCompletedCount === repetitionCount;
|
|
1340
1660
|
const newMap = new Map(map);
|
|
1341
1661
|
newMap.set(key, {
|
|
1342
1662
|
completedCount: newCompletedCount,
|
|
@@ -1373,10 +1693,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
1373
1693
|
runId: task.runId,
|
|
1374
1694
|
startedAt
|
|
1375
1695
|
});
|
|
1376
|
-
const totalEvaluations = task.testCases.
|
|
1377
|
-
(sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
|
|
1378
|
-
0
|
|
1379
|
-
);
|
|
1696
|
+
const totalEvaluations = task.testCases.length * Math.max(1, task.repetitions);
|
|
1380
1697
|
const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
|
|
1381
1698
|
const completedRef = yield* effect.Ref.make(0);
|
|
1382
1699
|
const startedRef = yield* effect.Ref.make(0);
|
|
@@ -1385,7 +1702,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
1385
1702
|
const testCaseResultsRef = yield* effect.Ref.make(
|
|
1386
1703
|
/* @__PURE__ */ new Map()
|
|
1387
1704
|
);
|
|
1388
|
-
const evaluationUnits = buildEvaluationUnits(task.testCases);
|
|
1705
|
+
const evaluationUnits = buildEvaluationUnits(task.testCases, task.repetitions);
|
|
1389
1706
|
const processEvaluation = (unit) => processOneEvaluation(
|
|
1390
1707
|
task,
|
|
1391
1708
|
unit,
|
|
@@ -1399,11 +1716,20 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
1399
1716
|
failedRef,
|
|
1400
1717
|
testCaseResultsRef
|
|
1401
1718
|
);
|
|
1402
|
-
|
|
1403
|
-
|
|
1404
|
-
|
|
1405
|
-
|
|
1406
|
-
|
|
1719
|
+
const globalSem = task.globalEvaluationSemaphore;
|
|
1720
|
+
if (globalSem !== void 0) {
|
|
1721
|
+
yield* effect.Effect.forEach(
|
|
1722
|
+
evaluationUnits,
|
|
1723
|
+
(unit) => globalSem.withPermits(1)(processEvaluation(unit)),
|
|
1724
|
+
{ concurrency: "unbounded", discard: true }
|
|
1725
|
+
);
|
|
1726
|
+
} else {
|
|
1727
|
+
yield* effect.Effect.forEach(
|
|
1728
|
+
evaluationUnits,
|
|
1729
|
+
processEvaluation,
|
|
1730
|
+
maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
|
|
1731
|
+
);
|
|
1732
|
+
}
|
|
1407
1733
|
const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* effect.Effect.all([
|
|
1408
1734
|
effect.Ref.get(completedRef),
|
|
1409
1735
|
effect.Ref.get(passedRef),
|
|
@@ -1439,125 +1765,34 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
1439
1765
|
artifactPath: task.snapshot.artifactPath
|
|
1440
1766
|
});
|
|
1441
1767
|
});
|
|
1442
|
-
|
|
1443
|
-
|
|
1444
|
-
|
|
1445
|
-
|
|
1446
|
-
|
|
1447
|
-
} catch {
|
|
1448
|
-
return [];
|
|
1449
|
-
}
|
|
1450
|
-
const jsonlFiles = entries.filter((name) => name.endsWith(".jsonl"));
|
|
1451
|
-
const snapshots = [];
|
|
1452
|
-
for (const fileName of jsonlFiles) {
|
|
1453
|
-
const filePath = path.join(baseDir, fileName);
|
|
1454
|
-
try {
|
|
1455
|
-
const snapshot = await parseArtifactToSnapshot(filePath, config);
|
|
1456
|
-
if (snapshot) {
|
|
1457
|
-
snapshots.push(snapshot);
|
|
1458
|
-
}
|
|
1459
|
-
} catch {
|
|
1460
|
-
}
|
|
1461
|
-
}
|
|
1462
|
-
return snapshots.sort((a, b) => b.queuedAt - a.queuedAt);
|
|
1463
|
-
}
|
|
1464
|
-
async function parseArtifactToSnapshot(filePath, _config) {
|
|
1465
|
-
const content = await promises.readFile(filePath, "utf8");
|
|
1466
|
-
const lines = content.split("\n").filter((line) => line.trim().length > 0);
|
|
1467
|
-
if (lines.length === 0) {
|
|
1468
|
-
return null;
|
|
1469
|
-
}
|
|
1470
|
-
let runQueued = null;
|
|
1471
|
-
let runCompleted = null;
|
|
1472
|
-
let runFailed = null;
|
|
1473
|
-
let runStarted = null;
|
|
1474
|
-
for (const line of lines) {
|
|
1475
|
-
try {
|
|
1476
|
-
const event = JSON.parse(line);
|
|
1477
|
-
const type = event.type;
|
|
1478
|
-
if (type === "RunQueued") {
|
|
1479
|
-
runQueued = {
|
|
1480
|
-
runId: event.runId,
|
|
1481
|
-
datasetId: event.datasetId,
|
|
1482
|
-
datasetName: event.datasetName,
|
|
1483
|
-
evaluatorIds: event.evaluatorIds,
|
|
1484
|
-
totalTestCases: event.totalTestCases ?? 0,
|
|
1485
|
-
artifactPath: event.artifactPath ?? filePath,
|
|
1486
|
-
ts: event.ts
|
|
1487
|
-
};
|
|
1488
|
-
}
|
|
1489
|
-
if (type === "RunStarted") {
|
|
1490
|
-
runStarted = { startedAt: event.startedAt };
|
|
1491
|
-
}
|
|
1492
|
-
if (type === "RunCompleted") {
|
|
1493
|
-
runCompleted = {
|
|
1494
|
-
passedTestCases: event.passedTestCases,
|
|
1495
|
-
failedTestCases: event.failedTestCases,
|
|
1496
|
-
totalTestCases: event.totalTestCases,
|
|
1497
|
-
finishedAt: event.finishedAt
|
|
1498
|
-
};
|
|
1499
|
-
}
|
|
1500
|
-
if (type === "RunFailed") {
|
|
1501
|
-
runFailed = {
|
|
1502
|
-
finishedAt: event.finishedAt,
|
|
1503
|
-
errorMessage: event.errorMessage
|
|
1504
|
-
};
|
|
1505
|
-
}
|
|
1506
|
-
} catch {
|
|
1507
|
-
}
|
|
1768
|
+
|
|
1769
|
+
// src/runner/name-pattern.ts
|
|
1770
|
+
function parseRegexLiteral(pattern) {
|
|
1771
|
+
if (!pattern.startsWith("/")) {
|
|
1772
|
+
return void 0;
|
|
1508
1773
|
}
|
|
1509
|
-
|
|
1510
|
-
|
|
1774
|
+
const lastSlash = pattern.lastIndexOf("/");
|
|
1775
|
+
if (lastSlash <= 0) {
|
|
1776
|
+
return void 0;
|
|
1511
1777
|
}
|
|
1512
|
-
const artifactPath = filePath;
|
|
1513
|
-
const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
|
|
1514
|
-
const progress = aggregateTestCaseProgress(lines);
|
|
1515
|
-
const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
|
|
1516
|
-
const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
|
|
1517
|
-
const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
|
|
1518
1778
|
return {
|
|
1519
|
-
|
|
1520
|
-
|
|
1521
|
-
datasetName: runQueued.datasetName,
|
|
1522
|
-
evaluatorIds: runQueued.evaluatorIds,
|
|
1523
|
-
queuedAt: runQueued.ts ?? 0,
|
|
1524
|
-
startedAt: runStarted?.startedAt,
|
|
1525
|
-
finishedAt: runCompleted?.finishedAt ?? runFailed?.finishedAt,
|
|
1526
|
-
totalTestCases: runQueued.totalTestCases,
|
|
1527
|
-
completedTestCases,
|
|
1528
|
-
passedTestCases,
|
|
1529
|
-
failedTestCases,
|
|
1530
|
-
status,
|
|
1531
|
-
artifactPath,
|
|
1532
|
-
errorMessage: runFailed?.errorMessage
|
|
1779
|
+
source: pattern.slice(1, lastSlash),
|
|
1780
|
+
flags: pattern.slice(lastSlash + 1)
|
|
1533
1781
|
};
|
|
1534
1782
|
}
|
|
1535
|
-
function
|
|
1536
|
-
|
|
1537
|
-
const
|
|
1538
|
-
|
|
1539
|
-
|
|
1540
|
-
|
|
1541
|
-
if (event.type === "TestCaseProgress") {
|
|
1542
|
-
const ev = event;
|
|
1543
|
-
completedTestCases = ev.completedTestCases ?? completedTestCases;
|
|
1544
|
-
const id = ev.testCaseId;
|
|
1545
|
-
const current = testCasePassedBy.get(id);
|
|
1546
|
-
testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
|
|
1547
|
-
}
|
|
1548
|
-
} catch {
|
|
1549
|
-
}
|
|
1783
|
+
function createNameMatcher(pattern) {
|
|
1784
|
+
const normalizedPattern = pattern.trim();
|
|
1785
|
+
const regexLiteral = parseRegexLiteral(normalizedPattern);
|
|
1786
|
+
if (regexLiteral) {
|
|
1787
|
+
const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
|
|
1788
|
+
return (value) => regex.test(value);
|
|
1550
1789
|
}
|
|
1551
|
-
|
|
1552
|
-
|
|
1553
|
-
|
|
1554
|
-
|
|
1555
|
-
passedTestCases += 1;
|
|
1556
|
-
} else {
|
|
1557
|
-
failedTestCases += 1;
|
|
1558
|
-
}
|
|
1790
|
+
if (normalizedPattern.includes("*")) {
|
|
1791
|
+
const escaped = normalizedPattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
|
|
1792
|
+
const regex = new RegExp(`^${escaped}$`, "i");
|
|
1793
|
+
return (value) => regex.test(value);
|
|
1559
1794
|
}
|
|
1560
|
-
return
|
|
1795
|
+
return (value) => value.toLowerCase() === normalizedPattern.toLowerCase();
|
|
1561
1796
|
}
|
|
1562
1797
|
async function appendJsonLine(artifactPath, payload) {
|
|
1563
1798
|
await promises.mkdir(path.dirname(artifactPath), { recursive: true });
|
|
@@ -1616,32 +1851,12 @@ function searchCollectedTestCases(all, query) {
|
|
|
1616
1851
|
}
|
|
1617
1852
|
|
|
1618
1853
|
// src/runner/api.ts
|
|
1619
|
-
function
|
|
1620
|
-
|
|
1621
|
-
|
|
1622
|
-
|
|
1623
|
-
const lastSlash = pattern.lastIndexOf("/");
|
|
1624
|
-
if (lastSlash <= 0) {
|
|
1625
|
-
return void 0;
|
|
1626
|
-
}
|
|
1627
|
-
return {
|
|
1628
|
-
source: pattern.slice(1, lastSlash),
|
|
1629
|
-
flags: pattern.slice(lastSlash + 1)
|
|
1630
|
-
};
|
|
1631
|
-
}
|
|
1632
|
-
function createNameMatcher(pattern) {
|
|
1633
|
-
const normalizedPattern = pattern.trim();
|
|
1634
|
-
const regexLiteral = parseRegexLiteral(normalizedPattern);
|
|
1635
|
-
if (regexLiteral) {
|
|
1636
|
-
const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
|
|
1637
|
-
return (value) => regex.test(value);
|
|
1638
|
-
}
|
|
1639
|
-
if (normalizedPattern.includes("*")) {
|
|
1640
|
-
const escaped = normalizedPattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
|
|
1641
|
-
const regex = new RegExp(`^${escaped}$`, "i");
|
|
1642
|
-
return (value) => regex.test(value);
|
|
1854
|
+
function normalizeRunRepetitions(value) {
|
|
1855
|
+
const n = value ?? 1;
|
|
1856
|
+
if (!Number.isInteger(n) || n < 1) {
|
|
1857
|
+
throw new Error(`repetitions must be a positive integer, got ${String(value)}`);
|
|
1643
1858
|
}
|
|
1644
|
-
return
|
|
1859
|
+
return n;
|
|
1645
1860
|
}
|
|
1646
1861
|
function mergeRunnerOverrides(base, next) {
|
|
1647
1862
|
if (!base) {
|
|
@@ -1676,6 +1891,7 @@ var EffectRunner = class {
|
|
|
1676
1891
|
this.listeners = /* @__PURE__ */ new Set();
|
|
1677
1892
|
this.datasetsById = /* @__PURE__ */ new Map();
|
|
1678
1893
|
this.evaluatorsById = /* @__PURE__ */ new Map();
|
|
1894
|
+
this.runConfigsById = /* @__PURE__ */ new Map();
|
|
1679
1895
|
this.schedulerFiber = effect.Effect.runFork(this.createSchedulerEffect());
|
|
1680
1896
|
this.persistenceFiber = effect.Effect.runFork(
|
|
1681
1897
|
createPersistenceWorker(this.persistenceQueue)
|
|
@@ -1716,6 +1932,137 @@ var EffectRunner = class {
|
|
|
1716
1932
|
(item) => matcher(item.evaluator.getName() ?? "")
|
|
1717
1933
|
);
|
|
1718
1934
|
}
|
|
1935
|
+
async collectRunConfigs() {
|
|
1936
|
+
const runConfigs = await collectRunConfigsFromFiles(this.config.discovery);
|
|
1937
|
+
this.runConfigsById.clear();
|
|
1938
|
+
const byNameLower = /* @__PURE__ */ new Map();
|
|
1939
|
+
for (const item of runConfigs) {
|
|
1940
|
+
const id = item.runConfig.getName();
|
|
1941
|
+
const lower = id.toLowerCase();
|
|
1942
|
+
const prev = byNameLower.get(lower);
|
|
1943
|
+
if (prev !== void 0 && prev.filePath !== item.filePath) {
|
|
1944
|
+
throw new Error(
|
|
1945
|
+
`Duplicate RunConfig name "${id}" (matches "${prev.runConfig.getName()}" case-insensitively): ${prev.filePath} and ${item.filePath}`
|
|
1946
|
+
);
|
|
1947
|
+
}
|
|
1948
|
+
byNameLower.set(lower, item);
|
|
1949
|
+
this.runConfigsById.set(id, item);
|
|
1950
|
+
}
|
|
1951
|
+
return runConfigs;
|
|
1952
|
+
}
|
|
1953
|
+
async resolveRunConfigByName(name) {
|
|
1954
|
+
if (this.runConfigsById.size === 0) {
|
|
1955
|
+
await this.collectRunConfigs();
|
|
1956
|
+
}
|
|
1957
|
+
const key = validateRunConfigName(name, `RunConfig "${name.trim()}"`);
|
|
1958
|
+
const keyLower = key.toLowerCase();
|
|
1959
|
+
const matches = Array.from(this.runConfigsById.values()).filter(
|
|
1960
|
+
(item) => item.runConfig.getName().toLowerCase() === keyLower
|
|
1961
|
+
);
|
|
1962
|
+
if (matches.length === 0) {
|
|
1963
|
+
return void 0;
|
|
1964
|
+
}
|
|
1965
|
+
if (matches.length > 1) {
|
|
1966
|
+
throw new Error(
|
|
1967
|
+
`Multiple RunConfigs named "${name}": ${matches.map((m) => m.filePath).join(", ")}`
|
|
1968
|
+
);
|
|
1969
|
+
}
|
|
1970
|
+
return matches[0];
|
|
1971
|
+
}
|
|
1972
|
+
async expandRunConfigToJobs(collected) {
|
|
1973
|
+
if (this.datasetsById.size === 0) {
|
|
1974
|
+
await this.collectDatasets();
|
|
1975
|
+
}
|
|
1976
|
+
if (this.evaluatorsById.size === 0) {
|
|
1977
|
+
await this.collectEvaluators();
|
|
1978
|
+
}
|
|
1979
|
+
const rcName = collected.runConfig.getName();
|
|
1980
|
+
const jobs = [];
|
|
1981
|
+
const runs = collected.runConfig.getRuns();
|
|
1982
|
+
for (const [i, row] of runs.entries()) {
|
|
1983
|
+
const dsCollected = Array.from(this.datasetsById.values()).find(
|
|
1984
|
+
(d) => d.dataset === row.dataset
|
|
1985
|
+
);
|
|
1986
|
+
if (!dsCollected) {
|
|
1987
|
+
throw new Error(
|
|
1988
|
+
`RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getName()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
|
|
1989
|
+
);
|
|
1990
|
+
}
|
|
1991
|
+
let evaluatorIds;
|
|
1992
|
+
if ("evaluatorPattern" in row && typeof row.evaluatorPattern === "string") {
|
|
1993
|
+
const matcher = createNameMatcher(row.evaluatorPattern);
|
|
1994
|
+
const matched = Array.from(this.evaluatorsById.values()).filter(
|
|
1995
|
+
(item) => matcher(item.evaluator.getName() ?? "")
|
|
1996
|
+
);
|
|
1997
|
+
if (matched.length === 0) {
|
|
1998
|
+
throw new Error(
|
|
1999
|
+
`RunConfig "${rcName}" run[${i}]: no evaluator matched pattern "${row.evaluatorPattern}"`
|
|
2000
|
+
);
|
|
2001
|
+
}
|
|
2002
|
+
evaluatorIds = matched.map((item) => item.id);
|
|
2003
|
+
} else {
|
|
2004
|
+
const evaluators = row.evaluators;
|
|
2005
|
+
evaluatorIds = [];
|
|
2006
|
+
for (const ev of evaluators) {
|
|
2007
|
+
const found = Array.from(this.evaluatorsById.values()).find(
|
|
2008
|
+
(item) => item.evaluator === ev
|
|
2009
|
+
);
|
|
2010
|
+
if (!found) {
|
|
2011
|
+
throw new Error(
|
|
2012
|
+
`RunConfig "${rcName}" run[${i}]: evaluator "${getEvaluatorDisplayLabel(ev) ?? "unknown"}" was not found among discovered evaluator exports`
|
|
2013
|
+
);
|
|
2014
|
+
}
|
|
2015
|
+
evaluatorIds.push(found.id);
|
|
2016
|
+
}
|
|
2017
|
+
}
|
|
2018
|
+
const repetitions = "repetitions" in row && row.repetitions !== void 0 ? row.repetitions : 1;
|
|
2019
|
+
jobs.push({
|
|
2020
|
+
datasetId: dsCollected.id,
|
|
2021
|
+
evaluatorIds,
|
|
2022
|
+
runConfigName: rcName,
|
|
2023
|
+
runConfigDisplayLabel: collected.runConfig.getDisplayLabel(),
|
|
2024
|
+
runConfigTags: collected.runConfig.getTags(),
|
|
2025
|
+
repetitions
|
|
2026
|
+
});
|
|
2027
|
+
}
|
|
2028
|
+
return jobs;
|
|
2029
|
+
}
|
|
2030
|
+
async expandRunConfigNamesToJobs(names) {
|
|
2031
|
+
const jobs = [];
|
|
2032
|
+
for (const name of names) {
|
|
2033
|
+
const collected = await this.resolveRunConfigByName(name);
|
|
2034
|
+
if (!collected) {
|
|
2035
|
+
const known = await this.collectRunConfigs();
|
|
2036
|
+
const available = known.map((r) => r.runConfig.getName()).sort();
|
|
2037
|
+
throw new Error(
|
|
2038
|
+
available.length > 0 ? `RunConfig "${name}" not found. Available RunConfigs: ${available.join(", ")}` : `RunConfig "${name}" not found and no RunConfigs were discovered.`
|
|
2039
|
+
);
|
|
2040
|
+
}
|
|
2041
|
+
jobs.push(...await this.expandRunConfigToJobs(collected));
|
|
2042
|
+
}
|
|
2043
|
+
return jobs;
|
|
2044
|
+
}
|
|
2045
|
+
async runDatasetJobsWithSharedConcurrency(request) {
|
|
2046
|
+
const globalConcurrency = Math.max(1, request.globalConcurrency);
|
|
2047
|
+
const sem = effect.Effect.unsafeMakeSemaphore(globalConcurrency);
|
|
2048
|
+
const triggerId = request.triggerId ?? `trg-${crypto.randomUUID()}`;
|
|
2049
|
+
const snapshots = [];
|
|
2050
|
+
for (const job of request.jobs) {
|
|
2051
|
+
snapshots.push(
|
|
2052
|
+
await this.startDatasetRun({
|
|
2053
|
+
datasetId: job.datasetId,
|
|
2054
|
+
evaluatorIds: job.evaluatorIds,
|
|
2055
|
+
triggerId,
|
|
2056
|
+
maxConcurrency: this.config.maxConcurrency ?? 1,
|
|
2057
|
+
globalEvaluationSemaphore: sem,
|
|
2058
|
+
runConfigName: job.runConfigName,
|
|
2059
|
+
runConfigTags: job.runConfigTags,
|
|
2060
|
+
repetitions: job.repetitions
|
|
2061
|
+
})
|
|
2062
|
+
);
|
|
2063
|
+
}
|
|
2064
|
+
return snapshots;
|
|
2065
|
+
}
|
|
1719
2066
|
async searchTestCases(query) {
|
|
1720
2067
|
const testCases = await collectTestCasesFromFiles(this.config.discovery);
|
|
1721
2068
|
return searchCollectedTestCases(testCases, query);
|
|
@@ -1734,35 +2081,45 @@ var EffectRunner = class {
|
|
|
1734
2081
|
);
|
|
1735
2082
|
}
|
|
1736
2083
|
async runDatasetWith(request) {
|
|
2084
|
+
const runConfigName = validateRunConfigName(
|
|
2085
|
+
request.runConfigName,
|
|
2086
|
+
"runDatasetWith.runConfigName"
|
|
2087
|
+
);
|
|
2088
|
+
return this.startDatasetRun({
|
|
2089
|
+
datasetId: request.datasetId,
|
|
2090
|
+
evaluatorIds: request.evaluatorIds,
|
|
2091
|
+
triggerId: request.triggerId,
|
|
2092
|
+
maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
|
|
2093
|
+
repetitions: request.repetitions,
|
|
2094
|
+
runConfigName,
|
|
2095
|
+
runConfigTags: request.runConfigTags
|
|
2096
|
+
});
|
|
2097
|
+
}
|
|
2098
|
+
async startDatasetRun(params) {
|
|
1737
2099
|
if (this.datasetsById.size === 0) {
|
|
1738
2100
|
await this.collectDatasets();
|
|
1739
2101
|
}
|
|
1740
2102
|
if (this.evaluatorsById.size === 0) {
|
|
1741
2103
|
await this.collectEvaluators();
|
|
1742
2104
|
}
|
|
1743
|
-
const dataset = this.datasetsById.get(
|
|
2105
|
+
const dataset = this.datasetsById.get(params.datasetId);
|
|
1744
2106
|
if (!dataset) {
|
|
1745
|
-
throw new Error(`Unknown dataset: ${
|
|
2107
|
+
throw new Error(`Unknown dataset: ${params.datasetId}`);
|
|
1746
2108
|
}
|
|
1747
|
-
const selectedEvaluators =
|
|
2109
|
+
const selectedEvaluators = params.evaluatorIds.map((id) => this.evaluatorsById.get(id)).filter((value) => Boolean(value)).map((value) => ({ id: value.id, evaluator: value.evaluator }));
|
|
1748
2110
|
if (selectedEvaluators.length === 0) {
|
|
1749
2111
|
throw new Error("No evaluators selected for run");
|
|
1750
2112
|
}
|
|
1751
|
-
const selectedTestCases = await this.collectDatasetTestCases(
|
|
1752
|
-
const
|
|
1753
|
-
|
|
1754
|
-
|
|
1755
|
-
)
|
|
1756
|
-
const triggerId = request.triggerId ?? `trg-${crypto.randomUUID()}`;
|
|
2113
|
+
const selectedTestCases = await this.collectDatasetTestCases(params.datasetId);
|
|
2114
|
+
const repetitions = normalizeRunRepetitions(params.repetitions);
|
|
2115
|
+
const totalEvaluations = selectedTestCases.length * repetitions;
|
|
2116
|
+
const runConfigTags = [...params.runConfigTags ?? []];
|
|
2117
|
+
const triggerId = params.triggerId ?? `trg-${crypto.randomUUID()}`;
|
|
1757
2118
|
const runId = `run-${crypto.randomUUID()}`;
|
|
1758
|
-
const artifactPath = createArtifactPath(
|
|
1759
|
-
this.config.artifactDirectory,
|
|
1760
|
-
request.datasetId,
|
|
1761
|
-
runId
|
|
1762
|
-
);
|
|
2119
|
+
const artifactPath = createArtifactPath(this.config.artifactDirectory, params.datasetId, runId);
|
|
1763
2120
|
const snapshot = {
|
|
1764
2121
|
runId,
|
|
1765
|
-
datasetId:
|
|
2122
|
+
datasetId: params.datasetId,
|
|
1766
2123
|
datasetName: dataset.dataset.getName(),
|
|
1767
2124
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
1768
2125
|
queuedAt: Date.now(),
|
|
@@ -1783,7 +2140,7 @@ var EffectRunner = class {
|
|
|
1783
2140
|
const queuedEvent = {
|
|
1784
2141
|
type: "RunQueued",
|
|
1785
2142
|
runId,
|
|
1786
|
-
datasetId:
|
|
2143
|
+
datasetId: params.datasetId,
|
|
1787
2144
|
datasetName: dataset.dataset.getName(),
|
|
1788
2145
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
1789
2146
|
totalTestCases: totalEvaluations,
|
|
@@ -1797,17 +2154,20 @@ var EffectRunner = class {
|
|
|
1797
2154
|
payload: queuedEvent
|
|
1798
2155
|
})
|
|
1799
2156
|
);
|
|
1800
|
-
const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
|
|
1801
2157
|
await effect.Effect.runPromise(
|
|
1802
2158
|
effect.Queue.offer(this.runQueue, {
|
|
1803
2159
|
runId,
|
|
1804
2160
|
triggerId,
|
|
1805
|
-
datasetId:
|
|
2161
|
+
datasetId: params.datasetId,
|
|
1806
2162
|
dataset: dataset.dataset,
|
|
1807
2163
|
evaluators: selectedEvaluators,
|
|
1808
2164
|
testCases: selectedTestCases,
|
|
1809
2165
|
snapshot,
|
|
1810
|
-
maxConcurrency
|
|
2166
|
+
maxConcurrency: params.maxConcurrency,
|
|
2167
|
+
globalEvaluationSemaphore: params.globalEvaluationSemaphore,
|
|
2168
|
+
runConfigName: params.runConfigName,
|
|
2169
|
+
runConfigTags,
|
|
2170
|
+
repetitions
|
|
1811
2171
|
})
|
|
1812
2172
|
);
|
|
1813
2173
|
return snapshot;
|
|
@@ -1879,15 +2239,26 @@ var EffectRunner = class {
|
|
|
1879
2239
|
}
|
|
1880
2240
|
};
|
|
1881
2241
|
|
|
2242
|
+
// src/runner/events.ts
|
|
2243
|
+
var PROGRAMMATIC_RUN_CONFIG = {
|
|
2244
|
+
runConfigName: "programmatic"
|
|
2245
|
+
};
|
|
2246
|
+
|
|
1882
2247
|
Object.defineProperty(exports, 'S', {
|
|
1883
2248
|
enumerable: true,
|
|
1884
2249
|
get: function () { return effect.Schema; }
|
|
1885
2250
|
});
|
|
1886
2251
|
exports.Dataset = Dataset;
|
|
1887
2252
|
exports.Evaluator = Evaluator;
|
|
2253
|
+
exports.EvaluatorNameSchema = EvaluatorNameSchema;
|
|
1888
2254
|
exports.Metric = Metric;
|
|
2255
|
+
exports.PROGRAMMATIC_RUN_CONFIG = PROGRAMMATIC_RUN_CONFIG;
|
|
2256
|
+
exports.RunConfig = RunConfig;
|
|
2257
|
+
exports.RunConfigNameSchema = RunConfigNameSchema;
|
|
1889
2258
|
exports.Score = Score;
|
|
2259
|
+
exports.TagSet = TagSet;
|
|
1890
2260
|
exports.TestCase = TestCase;
|
|
2261
|
+
exports.TestCaseNameSchema = TestCaseNameSchema;
|
|
1891
2262
|
exports.binaryScore = binaryScore;
|
|
1892
2263
|
exports.createLogEntry = createLogEntry;
|
|
1893
2264
|
exports.createRunner = createRunner;
|
|
@@ -1895,16 +2266,24 @@ exports.defaultRunnerConfig = defaultRunnerConfig;
|
|
|
1895
2266
|
exports.defineConfig = defineConfig;
|
|
1896
2267
|
exports.deltaScore = deltaScore;
|
|
1897
2268
|
exports.formatScoreData = formatScoreData;
|
|
2269
|
+
exports.getEvaluatorDisplayLabel = getEvaluatorDisplayLabel;
|
|
2270
|
+
exports.getEvaluatorTagList = getEvaluatorTagList;
|
|
1898
2271
|
exports.getLogLines = getLogLines;
|
|
1899
2272
|
exports.getMetricById = getMetricById;
|
|
1900
2273
|
exports.getScoreById = getScoreById;
|
|
2274
|
+
exports.getTestCaseDisplayLabel = getTestCaseDisplayLabel;
|
|
2275
|
+
exports.getTestCaseTagList = getTestCaseTagList;
|
|
1901
2276
|
exports.latencyMetric = latencyMetric;
|
|
1902
2277
|
exports.loadMockData = loadMockData;
|
|
1903
2278
|
exports.loadRunnerData = loadRunnerData;
|
|
2279
|
+
exports.normalizeOptionalDisplayName = normalizeOptionalDisplayName;
|
|
1904
2280
|
exports.parseStartupArgs = parseStartupArgs;
|
|
1905
2281
|
exports.percentScore = percentScore;
|
|
1906
2282
|
exports.printJsonDiff = printJsonDiff;
|
|
1907
2283
|
exports.tokenCountMetric = tokenCountMetric;
|
|
2284
|
+
exports.validateEvaluatorName = validateEvaluatorName;
|
|
2285
|
+
exports.validateRunConfigName = validateRunConfigName;
|
|
2286
|
+
exports.validateTestCaseName = validateTestCaseName;
|
|
1908
2287
|
exports.withRunnerConfig = withRunnerConfig;
|
|
1909
2288
|
//# sourceMappingURL=out.js.map
|
|
1910
2289
|
//# sourceMappingURL=index.cjs.map
|