@m4trix/evals 0.25.1 → 0.26.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +29 -7
- package/dist/cli-simple.cjs +831 -450
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +832 -451
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +531 -270
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +531 -270
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +888 -509
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +201 -7
- package/dist/index.js +878 -513
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -1,14 +1,172 @@
|
|
|
1
|
-
import { Effect, PubSub, Queue, Ref, Fiber } from 'effect';
|
|
1
|
+
import { Schema, Effect, PubSub, Queue, Ref, Fiber, Either, ParseResult } from 'effect';
|
|
2
2
|
export { Schema as S } from 'effect';
|
|
3
3
|
import { diffLines } from 'diff';
|
|
4
4
|
import stringify from 'fast-json-stable-stringify';
|
|
5
5
|
import { randomUUID } from 'crypto';
|
|
6
|
+
import { readdir, readFile, mkdir, appendFile } from 'fs/promises';
|
|
7
|
+
import { resolve as resolve$1, join, relative, dirname } from 'path';
|
|
6
8
|
import { existsSync } from 'fs';
|
|
7
|
-
import { resolve as resolve$1, relative, join, dirname } from 'path';
|
|
8
9
|
import * as jitiModule from 'jiti';
|
|
9
|
-
import { readdir, readFile, mkdir, appendFile } from 'fs/promises';
|
|
10
10
|
import { pathToFileURL } from 'url';
|
|
11
11
|
|
|
12
|
+
// src/index.ts
|
|
13
|
+
var ENTITY_ID_PATTERN = /^[a-zA-Z0-9_-]+$/;
|
|
14
|
+
function makeEntityIdSchema(brand, label) {
|
|
15
|
+
return Schema.String.pipe(
|
|
16
|
+
Schema.trimmed(),
|
|
17
|
+
Schema.minLength(1, {
|
|
18
|
+
message: () => `${label} must be non-empty.`
|
|
19
|
+
}),
|
|
20
|
+
Schema.pattern(ENTITY_ID_PATTERN, {
|
|
21
|
+
message: () => `${label} may only contain letters, digits, underscores, and hyphens (no spaces). Examples: "my-nightly", "my_nightly", "myNightly".`
|
|
22
|
+
}),
|
|
23
|
+
Schema.brand(brand)
|
|
24
|
+
);
|
|
25
|
+
}
|
|
26
|
+
var RunConfigNameSchema = makeEntityIdSchema("RunConfigName", "RunConfig name");
|
|
27
|
+
var EvaluatorNameSchema = makeEntityIdSchema("EvaluatorName", "Evaluator name");
|
|
28
|
+
var TestCaseNameSchema = makeEntityIdSchema("TestCaseName", "Test case name");
|
|
29
|
+
function validateWithSchema(schema, raw, context) {
|
|
30
|
+
const trimmed = raw.trim();
|
|
31
|
+
const decode = Schema.decodeUnknownEither(
|
|
32
|
+
schema
|
|
33
|
+
);
|
|
34
|
+
const result = decode(trimmed);
|
|
35
|
+
if (Either.isLeft(result)) {
|
|
36
|
+
throw new Error(`${context}: ${ParseResult.TreeFormatter.formatErrorSync(result.left)}`);
|
|
37
|
+
}
|
|
38
|
+
return result.right;
|
|
39
|
+
}
|
|
40
|
+
function validateRunConfigName(raw, context) {
|
|
41
|
+
return validateWithSchema(RunConfigNameSchema, raw, context);
|
|
42
|
+
}
|
|
43
|
+
function validateEvaluatorName(raw, context) {
|
|
44
|
+
return validateWithSchema(EvaluatorNameSchema, raw, context);
|
|
45
|
+
}
|
|
46
|
+
function validateTestCaseName(raw, context) {
|
|
47
|
+
return validateWithSchema(TestCaseNameSchema, raw, context);
|
|
48
|
+
}
|
|
49
|
+
function normalizeOptionalDisplayName(raw) {
|
|
50
|
+
if (raw === void 0) {
|
|
51
|
+
return void 0;
|
|
52
|
+
}
|
|
53
|
+
const t = raw.trim();
|
|
54
|
+
return t.length === 0 ? void 0 : t;
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
// src/evals/evaluator.ts
|
|
58
|
+
var Evaluator = class _Evaluator {
|
|
59
|
+
constructor(config) {
|
|
60
|
+
this._config = config;
|
|
61
|
+
}
|
|
62
|
+
getState() {
|
|
63
|
+
return {
|
|
64
|
+
name: this._config.name,
|
|
65
|
+
displayName: this._config.displayName,
|
|
66
|
+
tags: this._config.tags,
|
|
67
|
+
inputSchema: this._config.inputSchema,
|
|
68
|
+
outputSchema: this._config.outputSchema,
|
|
69
|
+
scoreSchema: this._config.scoreSchema,
|
|
70
|
+
middlewares: this._config.middlewares,
|
|
71
|
+
evaluateFn: this._config.evaluateFn,
|
|
72
|
+
passThreshold: this._config.passThreshold,
|
|
73
|
+
passCriterion: this._config.passCriterion
|
|
74
|
+
};
|
|
75
|
+
}
|
|
76
|
+
static use(middleware) {
|
|
77
|
+
return new _Evaluator({
|
|
78
|
+
middlewares: [middleware],
|
|
79
|
+
tags: []
|
|
80
|
+
});
|
|
81
|
+
}
|
|
82
|
+
use(middleware) {
|
|
83
|
+
const state = this.getState();
|
|
84
|
+
return new _Evaluator({
|
|
85
|
+
...state,
|
|
86
|
+
middlewares: [...state.middlewares, middleware]
|
|
87
|
+
});
|
|
88
|
+
}
|
|
89
|
+
define(config) {
|
|
90
|
+
const { middlewares } = this.getState();
|
|
91
|
+
const name = validateEvaluatorName(config.name, "Evaluator.define");
|
|
92
|
+
const displayName = normalizeOptionalDisplayName(config.displayName);
|
|
93
|
+
const tags = config.tags !== void 0 ? [...config.tags] : [];
|
|
94
|
+
return new _Evaluator({
|
|
95
|
+
name,
|
|
96
|
+
displayName,
|
|
97
|
+
tags,
|
|
98
|
+
inputSchema: config.inputSchema,
|
|
99
|
+
outputSchema: config.outputSchema,
|
|
100
|
+
scoreSchema: config.scoreSchema,
|
|
101
|
+
middlewares,
|
|
102
|
+
passThreshold: config.passThreshold,
|
|
103
|
+
passCriterion: config.passCriterion
|
|
104
|
+
});
|
|
105
|
+
}
|
|
106
|
+
evaluate(fn) {
|
|
107
|
+
return new _Evaluator({
|
|
108
|
+
...this.getState(),
|
|
109
|
+
evaluateFn: fn
|
|
110
|
+
});
|
|
111
|
+
}
|
|
112
|
+
/** Canonical evaluator id when defined; otherwise undefined (middleware-only chain). */
|
|
113
|
+
getName() {
|
|
114
|
+
return this._config.name;
|
|
115
|
+
}
|
|
116
|
+
getDisplayName() {
|
|
117
|
+
return this._config.displayName;
|
|
118
|
+
}
|
|
119
|
+
/** Label for CLI/TUI: {@link getDisplayName} if set, otherwise {@link getName}. Undefined if not yet defined. */
|
|
120
|
+
getDisplayLabel() {
|
|
121
|
+
const id = this._config.name;
|
|
122
|
+
if (id === void 0) {
|
|
123
|
+
return void 0;
|
|
124
|
+
}
|
|
125
|
+
return this._config.displayName ?? id;
|
|
126
|
+
}
|
|
127
|
+
/** Tags from `Evaluator.define({ tags })`; empty until defined. */
|
|
128
|
+
getTags() {
|
|
129
|
+
return [...this._config.tags];
|
|
130
|
+
}
|
|
131
|
+
getInputSchema() {
|
|
132
|
+
return this._config.inputSchema;
|
|
133
|
+
}
|
|
134
|
+
getOutputSchema() {
|
|
135
|
+
return this._config.outputSchema;
|
|
136
|
+
}
|
|
137
|
+
getScoreSchema() {
|
|
138
|
+
return this._config.scoreSchema;
|
|
139
|
+
}
|
|
140
|
+
getMiddlewares() {
|
|
141
|
+
return this._config.middlewares;
|
|
142
|
+
}
|
|
143
|
+
getEvaluateFn() {
|
|
144
|
+
return this._config.evaluateFn;
|
|
145
|
+
}
|
|
146
|
+
getPassThreshold() {
|
|
147
|
+
return this._config.passThreshold;
|
|
148
|
+
}
|
|
149
|
+
getPassCriterion() {
|
|
150
|
+
return this._config.passCriterion;
|
|
151
|
+
}
|
|
152
|
+
async resolveContext() {
|
|
153
|
+
const parts = await Promise.all(this._config.middlewares.map((mw) => mw.resolve()));
|
|
154
|
+
return Object.assign({}, ...parts);
|
|
155
|
+
}
|
|
156
|
+
};
|
|
157
|
+
function getEvaluatorDisplayLabel(evaluator) {
|
|
158
|
+
if (typeof evaluator.getDisplayLabel === "function") {
|
|
159
|
+
const label = evaluator.getDisplayLabel();
|
|
160
|
+
if (label !== void 0) {
|
|
161
|
+
return label;
|
|
162
|
+
}
|
|
163
|
+
}
|
|
164
|
+
return typeof evaluator.getName === "function" ? evaluator.getName() : void 0;
|
|
165
|
+
}
|
|
166
|
+
function getEvaluatorTagList(evaluator) {
|
|
167
|
+
return typeof evaluator.getTags === "function" ? [...evaluator.getTags()] : [];
|
|
168
|
+
}
|
|
169
|
+
|
|
12
170
|
// src/cli/data.mock.json
|
|
13
171
|
var data_mock_default = {
|
|
14
172
|
datasets: [
|
|
@@ -263,7 +421,7 @@ function toEvalDataset(item, snapshots) {
|
|
|
263
421
|
function toEvaluatorOption(item) {
|
|
264
422
|
return {
|
|
265
423
|
id: item.id,
|
|
266
|
-
name: item.evaluator
|
|
424
|
+
name: getEvaluatorDisplayLabel(item.evaluator) ?? toSlug(item.id),
|
|
267
425
|
configPreview: `Source: ${item.filePath}`
|
|
268
426
|
};
|
|
269
427
|
}
|
|
@@ -309,132 +467,6 @@ function parseStartupArgs(argv) {
|
|
|
309
467
|
return args;
|
|
310
468
|
}
|
|
311
469
|
|
|
312
|
-
// src/evals/test-case.ts
|
|
313
|
-
function resolve(value) {
|
|
314
|
-
return typeof value === "function" ? value() : value;
|
|
315
|
-
}
|
|
316
|
-
var TestCase = class _TestCase {
|
|
317
|
-
constructor(config) {
|
|
318
|
-
this._config = config;
|
|
319
|
-
}
|
|
320
|
-
static describe(config) {
|
|
321
|
-
const reruns = config.reruns ?? 1;
|
|
322
|
-
if (reruns < 1 || !Number.isInteger(reruns)) {
|
|
323
|
-
throw new Error(`TestCase reruns must be a positive integer, got ${reruns}`);
|
|
324
|
-
}
|
|
325
|
-
return new _TestCase({
|
|
326
|
-
name: config.name,
|
|
327
|
-
tags: config.tags,
|
|
328
|
-
reruns,
|
|
329
|
-
inputSchema: config.inputSchema,
|
|
330
|
-
input: config.input,
|
|
331
|
-
outputSchema: config.outputSchema,
|
|
332
|
-
output: config.output
|
|
333
|
-
});
|
|
334
|
-
}
|
|
335
|
-
getReruns() {
|
|
336
|
-
return this._config.reruns;
|
|
337
|
-
}
|
|
338
|
-
getName() {
|
|
339
|
-
return this._config.name;
|
|
340
|
-
}
|
|
341
|
-
getTags() {
|
|
342
|
-
return this._config.tags;
|
|
343
|
-
}
|
|
344
|
-
getInputSchema() {
|
|
345
|
-
return this._config.inputSchema;
|
|
346
|
-
}
|
|
347
|
-
getInput() {
|
|
348
|
-
return resolve(this._config.input);
|
|
349
|
-
}
|
|
350
|
-
getOutputSchema() {
|
|
351
|
-
return this._config.outputSchema;
|
|
352
|
-
}
|
|
353
|
-
getOutput() {
|
|
354
|
-
if (this._config.output === void 0) {
|
|
355
|
-
return void 0;
|
|
356
|
-
}
|
|
357
|
-
return resolve(this._config.output);
|
|
358
|
-
}
|
|
359
|
-
};
|
|
360
|
-
|
|
361
|
-
// src/evals/evaluator.ts
|
|
362
|
-
var Evaluator = class _Evaluator {
|
|
363
|
-
constructor(config) {
|
|
364
|
-
this._config = config;
|
|
365
|
-
}
|
|
366
|
-
getState() {
|
|
367
|
-
return {
|
|
368
|
-
name: this._config.name,
|
|
369
|
-
inputSchema: this._config.inputSchema,
|
|
370
|
-
outputSchema: this._config.outputSchema,
|
|
371
|
-
scoreSchema: this._config.scoreSchema,
|
|
372
|
-
middlewares: this._config.middlewares,
|
|
373
|
-
evaluateFn: this._config.evaluateFn,
|
|
374
|
-
passThreshold: this._config.passThreshold,
|
|
375
|
-
passCriterion: this._config.passCriterion
|
|
376
|
-
};
|
|
377
|
-
}
|
|
378
|
-
static use(middleware) {
|
|
379
|
-
return new _Evaluator({
|
|
380
|
-
middlewares: [middleware]
|
|
381
|
-
});
|
|
382
|
-
}
|
|
383
|
-
use(middleware) {
|
|
384
|
-
const state = this.getState();
|
|
385
|
-
return new _Evaluator({
|
|
386
|
-
...state,
|
|
387
|
-
middlewares: [...state.middlewares, middleware]
|
|
388
|
-
});
|
|
389
|
-
}
|
|
390
|
-
define(config) {
|
|
391
|
-
const { middlewares } = this.getState();
|
|
392
|
-
return new _Evaluator({
|
|
393
|
-
name: config.name,
|
|
394
|
-
inputSchema: config.inputSchema,
|
|
395
|
-
outputSchema: config.outputSchema,
|
|
396
|
-
scoreSchema: config.scoreSchema,
|
|
397
|
-
middlewares,
|
|
398
|
-
passThreshold: config.passThreshold,
|
|
399
|
-
passCriterion: config.passCriterion
|
|
400
|
-
});
|
|
401
|
-
}
|
|
402
|
-
evaluate(fn) {
|
|
403
|
-
return new _Evaluator({
|
|
404
|
-
...this.getState(),
|
|
405
|
-
evaluateFn: fn
|
|
406
|
-
});
|
|
407
|
-
}
|
|
408
|
-
getName() {
|
|
409
|
-
return this._config.name;
|
|
410
|
-
}
|
|
411
|
-
getInputSchema() {
|
|
412
|
-
return this._config.inputSchema;
|
|
413
|
-
}
|
|
414
|
-
getOutputSchema() {
|
|
415
|
-
return this._config.outputSchema;
|
|
416
|
-
}
|
|
417
|
-
getScoreSchema() {
|
|
418
|
-
return this._config.scoreSchema;
|
|
419
|
-
}
|
|
420
|
-
getMiddlewares() {
|
|
421
|
-
return this._config.middlewares;
|
|
422
|
-
}
|
|
423
|
-
getEvaluateFn() {
|
|
424
|
-
return this._config.evaluateFn;
|
|
425
|
-
}
|
|
426
|
-
getPassThreshold() {
|
|
427
|
-
return this._config.passThreshold;
|
|
428
|
-
}
|
|
429
|
-
getPassCriterion() {
|
|
430
|
-
return this._config.passCriterion;
|
|
431
|
-
}
|
|
432
|
-
async resolveContext() {
|
|
433
|
-
const parts = await Promise.all(this._config.middlewares.map((mw) => mw.resolve()));
|
|
434
|
-
return Object.assign({}, ...parts);
|
|
435
|
-
}
|
|
436
|
-
};
|
|
437
|
-
|
|
438
470
|
// src/evals/dataset.ts
|
|
439
471
|
function matchesAny(value, matchers) {
|
|
440
472
|
return matchers.some(
|
|
@@ -498,36 +530,286 @@ var Dataset = class _Dataset {
|
|
|
498
530
|
return tagMatch && pathMatch;
|
|
499
531
|
}
|
|
500
532
|
};
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
name: config.name,
|
|
509
|
-
aggregate: config.aggregate,
|
|
510
|
-
format: config.format,
|
|
511
|
-
make: (data, options) => ({
|
|
512
|
-
id: config.id,
|
|
513
|
-
data,
|
|
514
|
-
...options?.name !== void 0 && { name: options.name }
|
|
515
|
-
})
|
|
516
|
-
};
|
|
517
|
-
registry.set(config.id, def);
|
|
518
|
-
return def;
|
|
533
|
+
function preprocessForDiff(value, options) {
|
|
534
|
+
if (options?.sort && Array.isArray(value)) {
|
|
535
|
+
return [...value].sort((a, b) => {
|
|
536
|
+
const aStr = stringify(preprocessForDiff(a, options));
|
|
537
|
+
const bStr = stringify(preprocessForDiff(b, options));
|
|
538
|
+
return aStr.localeCompare(bStr);
|
|
539
|
+
}).map((item) => preprocessForDiff(item, options));
|
|
519
540
|
}
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
}
|
|
530
|
-
|
|
541
|
+
if (value !== null && typeof value === "object" && !Array.isArray(value) && options?.excludeKeys) {
|
|
542
|
+
const keys = Array.isArray(options.excludeKeys) ? options.excludeKeys : options.excludeKeys.split(",").map((k) => k.trim());
|
|
543
|
+
const filtered = {};
|
|
544
|
+
for (const [k, v] of Object.entries(value)) {
|
|
545
|
+
if (!keys.includes(k)) {
|
|
546
|
+
filtered[k] = preprocessForDiff(v, options);
|
|
547
|
+
}
|
|
548
|
+
}
|
|
549
|
+
return filtered;
|
|
550
|
+
}
|
|
551
|
+
if (value !== null && typeof value === "object" && !Array.isArray(value)) {
|
|
552
|
+
const result = {};
|
|
553
|
+
for (const [k, v] of Object.entries(value)) {
|
|
554
|
+
result[k] = preprocessForDiff(v, options);
|
|
555
|
+
}
|
|
556
|
+
return result;
|
|
557
|
+
}
|
|
558
|
+
if (typeof value === "number" && options?.precision !== void 0) {
|
|
559
|
+
return Number(value.toFixed(options.precision));
|
|
560
|
+
}
|
|
561
|
+
return value;
|
|
562
|
+
}
|
|
563
|
+
function toPrettyJson(value) {
|
|
564
|
+
const str = stringify(value);
|
|
565
|
+
try {
|
|
566
|
+
const parsed = JSON.parse(str);
|
|
567
|
+
return JSON.stringify(parsed, null, 2);
|
|
568
|
+
} catch {
|
|
569
|
+
return str;
|
|
570
|
+
}
|
|
571
|
+
}
|
|
572
|
+
function formatDiffParts(parts) {
|
|
573
|
+
const lines = [];
|
|
574
|
+
for (const part of parts) {
|
|
575
|
+
const prefix = part.added ? "+ " : part.removed ? "- " : "";
|
|
576
|
+
const partLines = part.value.split("\n");
|
|
577
|
+
for (let i = 0; i < partLines.length; i++) {
|
|
578
|
+
const line = partLines[i];
|
|
579
|
+
if (i === partLines.length - 1 && line === "")
|
|
580
|
+
continue;
|
|
581
|
+
lines.push(prefix + line);
|
|
582
|
+
}
|
|
583
|
+
}
|
|
584
|
+
return lines.join("\n");
|
|
585
|
+
}
|
|
586
|
+
function createDiffString(expected, actual, diffOptions) {
|
|
587
|
+
const expectedProcessed = preprocessForDiff(expected, diffOptions);
|
|
588
|
+
const actualProcessed = preprocessForDiff(actual, diffOptions);
|
|
589
|
+
if (diffOptions?.keysOnly) {
|
|
590
|
+
const expectedKeys = JSON.stringify(extractKeys(expectedProcessed), null, 2);
|
|
591
|
+
const actualKeys = JSON.stringify(extractKeys(actualProcessed), null, 2);
|
|
592
|
+
const parts2 = diffLines(expectedKeys, actualKeys);
|
|
593
|
+
return formatDiffParts(parts2);
|
|
594
|
+
}
|
|
595
|
+
const expectedStr = toPrettyJson(expectedProcessed);
|
|
596
|
+
const actualStr = toPrettyJson(actualProcessed);
|
|
597
|
+
if (expectedStr === actualStr) {
|
|
598
|
+
return "";
|
|
599
|
+
}
|
|
600
|
+
const parts = diffLines(expectedStr, actualStr);
|
|
601
|
+
if (diffOptions?.outputNewOnly) {
|
|
602
|
+
const filtered = parts.filter((p) => p.added === true);
|
|
603
|
+
return formatDiffParts(filtered);
|
|
604
|
+
}
|
|
605
|
+
return formatDiffParts(parts);
|
|
606
|
+
}
|
|
607
|
+
function extractKeys(value) {
|
|
608
|
+
if (value === null || typeof value !== "object") {
|
|
609
|
+
return "\xB7";
|
|
610
|
+
}
|
|
611
|
+
if (Array.isArray(value)) {
|
|
612
|
+
return value.map(extractKeys);
|
|
613
|
+
}
|
|
614
|
+
const result = {};
|
|
615
|
+
for (const [k, v] of Object.entries(value)) {
|
|
616
|
+
result[k] = extractKeys(v);
|
|
617
|
+
}
|
|
618
|
+
return result;
|
|
619
|
+
}
|
|
620
|
+
function formatLogMessage(msg) {
|
|
621
|
+
if (typeof msg === "string")
|
|
622
|
+
return msg;
|
|
623
|
+
if (msg instanceof Error)
|
|
624
|
+
return msg.stack ?? msg.message;
|
|
625
|
+
try {
|
|
626
|
+
if (msg !== null && typeof msg === "object") {
|
|
627
|
+
return JSON.stringify(msg, null, 2);
|
|
628
|
+
}
|
|
629
|
+
return String(msg);
|
|
630
|
+
} catch {
|
|
631
|
+
return String(msg);
|
|
632
|
+
}
|
|
633
|
+
}
|
|
634
|
+
function createLogEntry(message, options) {
|
|
635
|
+
return {
|
|
636
|
+
type: "log",
|
|
637
|
+
label: options?.label,
|
|
638
|
+
message: formatLogMessage(message)
|
|
639
|
+
};
|
|
640
|
+
}
|
|
641
|
+
function getLogLines(entry) {
|
|
642
|
+
return entry.message.split("\n");
|
|
643
|
+
}
|
|
644
|
+
function createDiffLogEntry(expected, actual, options) {
|
|
645
|
+
const { label, ...diffOpts } = options ?? {};
|
|
646
|
+
const diff = createDiffString(expected, actual, diffOpts);
|
|
647
|
+
return {
|
|
648
|
+
type: "diff",
|
|
649
|
+
label,
|
|
650
|
+
expected,
|
|
651
|
+
actual,
|
|
652
|
+
diff: diff || "(no differences)"
|
|
653
|
+
};
|
|
654
|
+
}
|
|
655
|
+
function printJsonDiff(expected, actual, options = {}) {
|
|
656
|
+
const { color = true, ...diffOpts } = options;
|
|
657
|
+
const diff = createDiffString(expected, actual, diffOpts);
|
|
658
|
+
if (color) {
|
|
659
|
+
const lines = diff.split("\n").map((line) => {
|
|
660
|
+
const trimmed = line.trimStart();
|
|
661
|
+
if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
|
|
662
|
+
return `\x1B[31m${line}\x1B[0m`;
|
|
663
|
+
}
|
|
664
|
+
if (trimmed.startsWith("+") && !trimmed.startsWith("+++")) {
|
|
665
|
+
return `\x1B[32m${line}\x1B[0m`;
|
|
666
|
+
}
|
|
667
|
+
return line;
|
|
668
|
+
});
|
|
669
|
+
const colored = lines.join("\n");
|
|
670
|
+
console.log(colored || "(no differences)");
|
|
671
|
+
return colored;
|
|
672
|
+
}
|
|
673
|
+
console.log(diff || "(no differences)");
|
|
674
|
+
return diff;
|
|
675
|
+
}
|
|
676
|
+
|
|
677
|
+
// src/evals/metric.ts
|
|
678
|
+
var registry = /* @__PURE__ */ new Map();
|
|
679
|
+
var Metric = {
|
|
680
|
+
of(config) {
|
|
681
|
+
const def = {
|
|
682
|
+
id: config.id,
|
|
683
|
+
name: config.name,
|
|
684
|
+
aggregate: config.aggregate,
|
|
685
|
+
format: config.format,
|
|
686
|
+
make: (data, options) => ({
|
|
687
|
+
id: config.id,
|
|
688
|
+
data,
|
|
689
|
+
...options?.name !== void 0 && { name: options.name }
|
|
690
|
+
})
|
|
691
|
+
};
|
|
692
|
+
registry.set(config.id, def);
|
|
693
|
+
return def;
|
|
694
|
+
}
|
|
695
|
+
};
|
|
696
|
+
function getMetricById(id) {
|
|
697
|
+
return registry.get(id);
|
|
698
|
+
}
|
|
699
|
+
|
|
700
|
+
// src/evals/aggregators.ts
|
|
701
|
+
function aggregateTokenCountSum(values) {
|
|
702
|
+
const initial = {
|
|
703
|
+
input: 0,
|
|
704
|
+
output: 0,
|
|
705
|
+
inputCached: 0,
|
|
706
|
+
outputCached: 0
|
|
707
|
+
};
|
|
708
|
+
return values.reduce(
|
|
709
|
+
(acc, v) => ({
|
|
710
|
+
input: acc.input + (v.input ?? 0),
|
|
711
|
+
output: acc.output + (v.output ?? 0),
|
|
712
|
+
inputCached: acc.inputCached + (v.inputCached ?? 0),
|
|
713
|
+
outputCached: acc.outputCached + (v.outputCached ?? 0)
|
|
714
|
+
}),
|
|
715
|
+
initial
|
|
716
|
+
);
|
|
717
|
+
}
|
|
718
|
+
function aggregateLatencyAverage(values) {
|
|
719
|
+
if (values.length === 0) {
|
|
720
|
+
return { ms: 0 };
|
|
721
|
+
}
|
|
722
|
+
const sum = values.reduce((s, v) => s + v.ms, 0);
|
|
723
|
+
return { ms: sum / values.length };
|
|
724
|
+
}
|
|
725
|
+
|
|
726
|
+
// src/evals/metrics/standard.ts
|
|
727
|
+
var tokenCountMetric = Metric.of({
|
|
728
|
+
id: "token-count",
|
|
729
|
+
name: "Tokens",
|
|
730
|
+
aggregate: aggregateTokenCountSum,
|
|
731
|
+
format: (data, options) => {
|
|
732
|
+
const input = data.input ?? 0;
|
|
733
|
+
const output = data.output ?? 0;
|
|
734
|
+
const inputCached = data.inputCached ?? 0;
|
|
735
|
+
const outputCached = data.outputCached ?? 0;
|
|
736
|
+
const cached = inputCached + outputCached;
|
|
737
|
+
const base = `in:${input} out:${output} cached:${cached}`;
|
|
738
|
+
return options?.isAggregated ? `Total: ${base}` : base;
|
|
739
|
+
}
|
|
740
|
+
});
|
|
741
|
+
var latencyMetric = Metric.of({
|
|
742
|
+
id: "latency",
|
|
743
|
+
name: "Latency",
|
|
744
|
+
aggregate: aggregateLatencyAverage,
|
|
745
|
+
format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
|
|
746
|
+
});
|
|
747
|
+
|
|
748
|
+
// src/evals/run-config.ts
|
|
749
|
+
function validateRow(row, index) {
|
|
750
|
+
const hasEvaluators = "evaluators" in row && row.evaluators !== void 0 && row.evaluators !== void 0;
|
|
751
|
+
const hasPattern = "evaluatorPattern" in row && typeof row.evaluatorPattern === "string" && row.evaluatorPattern.trim().length > 0;
|
|
752
|
+
if (hasEvaluators && hasPattern) {
|
|
753
|
+
throw new Error(`RunConfig run[${index}] must not set both evaluators and evaluatorPattern`);
|
|
754
|
+
}
|
|
755
|
+
if (!hasEvaluators && !hasPattern) {
|
|
756
|
+
throw new Error(`RunConfig run[${index}] must set either evaluators or evaluatorPattern`);
|
|
757
|
+
}
|
|
758
|
+
if (hasEvaluators && row.evaluators.length === 0) {
|
|
759
|
+
throw new Error(`RunConfig run[${index}]: evaluators must be non-empty`);
|
|
760
|
+
}
|
|
761
|
+
const rawRep = "repetitions" in row ? row.repetitions : void 0;
|
|
762
|
+
const repetitions = rawRep ?? 1;
|
|
763
|
+
if (!Number.isInteger(repetitions) || repetitions < 1) {
|
|
764
|
+
throw new Error(
|
|
765
|
+
`RunConfig run[${index}]: repetitions must be a positive integer, got ${String(rawRep)}`
|
|
766
|
+
);
|
|
767
|
+
}
|
|
768
|
+
}
|
|
769
|
+
var RunConfig = class _RunConfig {
|
|
770
|
+
constructor(name, displayName, tags, runs) {
|
|
771
|
+
this._name = name;
|
|
772
|
+
this._displayName = displayName;
|
|
773
|
+
this._tags = tags;
|
|
774
|
+
this._runs = runs;
|
|
775
|
+
}
|
|
776
|
+
static define(config) {
|
|
777
|
+
if (config.runs.length === 0) {
|
|
778
|
+
throw new Error("RunConfig runs must be non-empty");
|
|
779
|
+
}
|
|
780
|
+
config.runs.forEach(validateRow);
|
|
781
|
+
const name = validateRunConfigName(config.name, "RunConfig.define");
|
|
782
|
+
const displayName = normalizeOptionalDisplayName(config.displayName);
|
|
783
|
+
const tags = config.tags !== void 0 ? [...config.tags] : [];
|
|
784
|
+
return new _RunConfig(name, displayName, tags, config.runs);
|
|
785
|
+
}
|
|
786
|
+
/** Canonical id (branded {@link RunConfigName} at runtime; typed as `string` for ergonomics). */
|
|
787
|
+
getName() {
|
|
788
|
+
return this._name;
|
|
789
|
+
}
|
|
790
|
+
/** Optional unrestricted display label. */
|
|
791
|
+
getDisplayName() {
|
|
792
|
+
return this._displayName;
|
|
793
|
+
}
|
|
794
|
+
/** Label for CLI/TUI: {@link getDisplayName} if set, otherwise {@link getName}. */
|
|
795
|
+
getDisplayLabel() {
|
|
796
|
+
return this._displayName ?? this._name;
|
|
797
|
+
}
|
|
798
|
+
/** Tags from `RunConfig.define({ tags })`; surfaced as `runConfigTags` on evaluator callbacks. */
|
|
799
|
+
getTags() {
|
|
800
|
+
return [...this._tags];
|
|
801
|
+
}
|
|
802
|
+
getRuns() {
|
|
803
|
+
return this._runs;
|
|
804
|
+
}
|
|
805
|
+
};
|
|
806
|
+
|
|
807
|
+
// src/evals/score.ts
|
|
808
|
+
var registry2 = /* @__PURE__ */ new Map();
|
|
809
|
+
function formatScoreData(def, data, options) {
|
|
810
|
+
return options?.isAggregated ? def.formatAggregate(data) : def.formatValue(data);
|
|
811
|
+
}
|
|
812
|
+
var ScoreAggregate = {
|
|
531
813
|
/** Average numeric fields. Use for scores like { value, delta }. */
|
|
532
814
|
averageFields(fields) {
|
|
533
815
|
return (values) => {
|
|
@@ -630,54 +912,6 @@ function getScoreById(id) {
|
|
|
630
912
|
return registry2.get(id);
|
|
631
913
|
}
|
|
632
914
|
|
|
633
|
-
// src/evals/aggregators.ts
|
|
634
|
-
function aggregateTokenCountSum(values) {
|
|
635
|
-
const initial = {
|
|
636
|
-
input: 0,
|
|
637
|
-
output: 0,
|
|
638
|
-
inputCached: 0,
|
|
639
|
-
outputCached: 0
|
|
640
|
-
};
|
|
641
|
-
return values.reduce(
|
|
642
|
-
(acc, v) => ({
|
|
643
|
-
input: acc.input + (v.input ?? 0),
|
|
644
|
-
output: acc.output + (v.output ?? 0),
|
|
645
|
-
inputCached: acc.inputCached + (v.inputCached ?? 0),
|
|
646
|
-
outputCached: acc.outputCached + (v.outputCached ?? 0)
|
|
647
|
-
}),
|
|
648
|
-
initial
|
|
649
|
-
);
|
|
650
|
-
}
|
|
651
|
-
function aggregateLatencyAverage(values) {
|
|
652
|
-
if (values.length === 0) {
|
|
653
|
-
return { ms: 0 };
|
|
654
|
-
}
|
|
655
|
-
const sum = values.reduce((s, v) => s + v.ms, 0);
|
|
656
|
-
return { ms: sum / values.length };
|
|
657
|
-
}
|
|
658
|
-
|
|
659
|
-
// src/evals/metrics/standard.ts
|
|
660
|
-
var tokenCountMetric = Metric.of({
|
|
661
|
-
id: "token-count",
|
|
662
|
-
name: "Tokens",
|
|
663
|
-
aggregate: aggregateTokenCountSum,
|
|
664
|
-
format: (data, options) => {
|
|
665
|
-
const input = data.input ?? 0;
|
|
666
|
-
const output = data.output ?? 0;
|
|
667
|
-
const inputCached = data.inputCached ?? 0;
|
|
668
|
-
const outputCached = data.outputCached ?? 0;
|
|
669
|
-
const cached = inputCached + outputCached;
|
|
670
|
-
const base = `in:${input} out:${output} cached:${cached}`;
|
|
671
|
-
return options?.isAggregated ? `Total: ${base}` : base;
|
|
672
|
-
}
|
|
673
|
-
});
|
|
674
|
-
var latencyMetric = Metric.of({
|
|
675
|
-
id: "latency",
|
|
676
|
-
name: "Latency",
|
|
677
|
-
aggregate: aggregateLatencyAverage,
|
|
678
|
-
format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
|
|
679
|
-
});
|
|
680
|
-
|
|
681
915
|
// src/evals/scores/standard.ts
|
|
682
916
|
var percentScore = Score.of({
|
|
683
917
|
id: "percent",
|
|
@@ -709,148 +943,197 @@ var binaryScore = Score.of({
|
|
|
709
943
|
},
|
|
710
944
|
aggregateValues: Score.aggregate.all
|
|
711
945
|
});
|
|
712
|
-
|
|
713
|
-
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
const keys = Array.isArray(options.excludeKeys) ? options.excludeKeys : options.excludeKeys.split(",").map((k) => k.trim());
|
|
722
|
-
const filtered = {};
|
|
723
|
-
for (const [k, v] of Object.entries(value)) {
|
|
724
|
-
if (!keys.includes(k)) {
|
|
725
|
-
filtered[k] = preprocessForDiff(v, options);
|
|
726
|
-
}
|
|
946
|
+
|
|
947
|
+
// src/evals/tag-set.ts
|
|
948
|
+
var TagSet = class {
|
|
949
|
+
constructor() {
|
|
950
|
+
}
|
|
951
|
+
static define(tags) {
|
|
952
|
+
const out = {};
|
|
953
|
+
for (const tag of tags) {
|
|
954
|
+
out[tag] = tag;
|
|
727
955
|
}
|
|
728
|
-
return
|
|
956
|
+
return out;
|
|
729
957
|
}
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
733
|
-
|
|
734
|
-
|
|
735
|
-
|
|
958
|
+
};
|
|
959
|
+
|
|
960
|
+
// src/evals/test-case.ts
|
|
961
|
+
function resolve(value) {
|
|
962
|
+
return typeof value === "function" ? value() : value;
|
|
963
|
+
}
|
|
964
|
+
var TestCase = class _TestCase {
|
|
965
|
+
constructor(config) {
|
|
966
|
+
this._config = config;
|
|
736
967
|
}
|
|
737
|
-
|
|
738
|
-
|
|
968
|
+
static describe(config) {
|
|
969
|
+
const name = validateTestCaseName(config.name, "TestCase.describe");
|
|
970
|
+
const displayName = normalizeOptionalDisplayName(config.displayName);
|
|
971
|
+
return new _TestCase({
|
|
972
|
+
name,
|
|
973
|
+
displayName,
|
|
974
|
+
tags: config.tags,
|
|
975
|
+
inputSchema: config.inputSchema,
|
|
976
|
+
input: config.input,
|
|
977
|
+
outputSchema: config.outputSchema,
|
|
978
|
+
output: config.output
|
|
979
|
+
});
|
|
739
980
|
}
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
function toPrettyJson(value) {
|
|
743
|
-
const str = stringify(value);
|
|
744
|
-
try {
|
|
745
|
-
const parsed = JSON.parse(str);
|
|
746
|
-
return JSON.stringify(parsed, null, 2);
|
|
747
|
-
} catch {
|
|
748
|
-
return str;
|
|
981
|
+
getName() {
|
|
982
|
+
return this._config.name;
|
|
749
983
|
}
|
|
750
|
-
|
|
751
|
-
|
|
752
|
-
const lines = [];
|
|
753
|
-
for (const part of parts) {
|
|
754
|
-
const prefix = part.added ? "+ " : part.removed ? "- " : "";
|
|
755
|
-
const partLines = part.value.split("\n");
|
|
756
|
-
for (let i = 0; i < partLines.length; i++) {
|
|
757
|
-
const line = partLines[i];
|
|
758
|
-
if (i === partLines.length - 1 && line === "")
|
|
759
|
-
continue;
|
|
760
|
-
lines.push(prefix + line);
|
|
761
|
-
}
|
|
984
|
+
getDisplayName() {
|
|
985
|
+
return this._config.displayName;
|
|
762
986
|
}
|
|
763
|
-
|
|
764
|
-
|
|
765
|
-
function createDiffString(expected, actual, diffOptions) {
|
|
766
|
-
const expectedProcessed = preprocessForDiff(expected, diffOptions);
|
|
767
|
-
const actualProcessed = preprocessForDiff(actual, diffOptions);
|
|
768
|
-
if (diffOptions?.keysOnly) {
|
|
769
|
-
const expectedKeys = JSON.stringify(extractKeys(expectedProcessed), null, 2);
|
|
770
|
-
const actualKeys = JSON.stringify(extractKeys(actualProcessed), null, 2);
|
|
771
|
-
const parts2 = diffLines(expectedKeys, actualKeys);
|
|
772
|
-
return formatDiffParts(parts2);
|
|
987
|
+
getDisplayLabel() {
|
|
988
|
+
return this._config.displayName ?? this._config.name;
|
|
773
989
|
}
|
|
774
|
-
|
|
775
|
-
|
|
776
|
-
if (expectedStr === actualStr) {
|
|
777
|
-
return "";
|
|
990
|
+
getTags() {
|
|
991
|
+
return this._config.tags;
|
|
778
992
|
}
|
|
779
|
-
|
|
780
|
-
|
|
781
|
-
const filtered = parts.filter((p) => p.added === true);
|
|
782
|
-
return formatDiffParts(filtered);
|
|
993
|
+
getInputSchema() {
|
|
994
|
+
return this._config.inputSchema;
|
|
783
995
|
}
|
|
784
|
-
|
|
785
|
-
|
|
786
|
-
function extractKeys(value) {
|
|
787
|
-
if (value === null || typeof value !== "object") {
|
|
788
|
-
return "\xB7";
|
|
996
|
+
getInput() {
|
|
997
|
+
return resolve(this._config.input);
|
|
789
998
|
}
|
|
790
|
-
|
|
791
|
-
return
|
|
999
|
+
getOutputSchema() {
|
|
1000
|
+
return this._config.outputSchema;
|
|
792
1001
|
}
|
|
793
|
-
|
|
794
|
-
|
|
795
|
-
|
|
1002
|
+
getOutput() {
|
|
1003
|
+
if (this._config.output === void 0) {
|
|
1004
|
+
return void 0;
|
|
1005
|
+
}
|
|
1006
|
+
return resolve(this._config.output);
|
|
796
1007
|
}
|
|
797
|
-
|
|
1008
|
+
};
|
|
1009
|
+
function getTestCaseDisplayLabel(testCase) {
|
|
1010
|
+
if (typeof testCase.getDisplayLabel === "function") {
|
|
1011
|
+
return testCase.getDisplayLabel();
|
|
1012
|
+
}
|
|
1013
|
+
return typeof testCase.getName === "function" ? testCase.getName() : "";
|
|
798
1014
|
}
|
|
799
|
-
function
|
|
800
|
-
|
|
801
|
-
|
|
802
|
-
|
|
803
|
-
|
|
1015
|
+
function getTestCaseTagList(testCase) {
|
|
1016
|
+
return typeof testCase.getTags === "function" ? [...testCase.getTags()] : [];
|
|
1017
|
+
}
|
|
1018
|
+
async function loadRunSnapshotsFromArtifacts(config) {
|
|
1019
|
+
const baseDir = resolve$1(config.artifactDirectory);
|
|
1020
|
+
let entries;
|
|
804
1021
|
try {
|
|
805
|
-
|
|
806
|
-
return JSON.stringify(msg, null, 2);
|
|
807
|
-
}
|
|
808
|
-
return String(msg);
|
|
1022
|
+
entries = await readdir(baseDir);
|
|
809
1023
|
} catch {
|
|
810
|
-
return
|
|
1024
|
+
return [];
|
|
811
1025
|
}
|
|
1026
|
+
const jsonlFiles = entries.filter((name) => name.endsWith(".jsonl"));
|
|
1027
|
+
const snapshots = [];
|
|
1028
|
+
for (const fileName of jsonlFiles) {
|
|
1029
|
+
const filePath = join(baseDir, fileName);
|
|
1030
|
+
try {
|
|
1031
|
+
const snapshot = await parseArtifactToSnapshot(filePath, config);
|
|
1032
|
+
if (snapshot) {
|
|
1033
|
+
snapshots.push(snapshot);
|
|
1034
|
+
}
|
|
1035
|
+
} catch {
|
|
1036
|
+
}
|
|
1037
|
+
}
|
|
1038
|
+
return snapshots.sort((a, b) => b.queuedAt - a.queuedAt);
|
|
812
1039
|
}
|
|
813
|
-
function
|
|
814
|
-
|
|
815
|
-
|
|
816
|
-
|
|
817
|
-
|
|
818
|
-
}
|
|
819
|
-
|
|
820
|
-
|
|
821
|
-
|
|
822
|
-
|
|
823
|
-
|
|
824
|
-
|
|
825
|
-
|
|
826
|
-
|
|
827
|
-
|
|
828
|
-
|
|
829
|
-
|
|
830
|
-
|
|
831
|
-
|
|
832
|
-
|
|
833
|
-
|
|
834
|
-
|
|
835
|
-
|
|
836
|
-
|
|
837
|
-
if (color) {
|
|
838
|
-
const lines = diff.split("\n").map((line) => {
|
|
839
|
-
const trimmed = line.trimStart();
|
|
840
|
-
if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
|
|
841
|
-
return `\x1B[31m${line}\x1B[0m`;
|
|
1040
|
+
async function parseArtifactToSnapshot(filePath, _config) {
|
|
1041
|
+
const content = await readFile(filePath, "utf8");
|
|
1042
|
+
const lines = content.split("\n").filter((line) => line.trim().length > 0);
|
|
1043
|
+
if (lines.length === 0) {
|
|
1044
|
+
return null;
|
|
1045
|
+
}
|
|
1046
|
+
let runQueued = null;
|
|
1047
|
+
let runCompleted = null;
|
|
1048
|
+
let runFailed = null;
|
|
1049
|
+
let runStarted = null;
|
|
1050
|
+
for (const line of lines) {
|
|
1051
|
+
try {
|
|
1052
|
+
const event = JSON.parse(line);
|
|
1053
|
+
const type = event.type;
|
|
1054
|
+
if (type === "RunQueued") {
|
|
1055
|
+
runQueued = {
|
|
1056
|
+
runId: event.runId,
|
|
1057
|
+
datasetId: event.datasetId,
|
|
1058
|
+
datasetName: event.datasetName,
|
|
1059
|
+
evaluatorIds: event.evaluatorIds,
|
|
1060
|
+
totalTestCases: event.totalTestCases ?? 0,
|
|
1061
|
+
artifactPath: event.artifactPath ?? filePath,
|
|
1062
|
+
ts: event.ts
|
|
1063
|
+
};
|
|
842
1064
|
}
|
|
843
|
-
if (
|
|
844
|
-
|
|
1065
|
+
if (type === "RunStarted") {
|
|
1066
|
+
runStarted = { startedAt: event.startedAt };
|
|
1067
|
+
}
|
|
1068
|
+
if (type === "RunCompleted") {
|
|
1069
|
+
runCompleted = {
|
|
1070
|
+
passedTestCases: event.passedTestCases,
|
|
1071
|
+
failedTestCases: event.failedTestCases,
|
|
1072
|
+
totalTestCases: event.totalTestCases,
|
|
1073
|
+
finishedAt: event.finishedAt
|
|
1074
|
+
};
|
|
1075
|
+
}
|
|
1076
|
+
if (type === "RunFailed") {
|
|
1077
|
+
runFailed = {
|
|
1078
|
+
finishedAt: event.finishedAt,
|
|
1079
|
+
errorMessage: event.errorMessage
|
|
1080
|
+
};
|
|
1081
|
+
}
|
|
1082
|
+
} catch {
|
|
1083
|
+
}
|
|
1084
|
+
}
|
|
1085
|
+
if (!runQueued) {
|
|
1086
|
+
return null;
|
|
1087
|
+
}
|
|
1088
|
+
const artifactPath = filePath;
|
|
1089
|
+
const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
|
|
1090
|
+
const progress = aggregateTestCaseProgress(lines);
|
|
1091
|
+
const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
|
|
1092
|
+
const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
|
|
1093
|
+
const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
|
|
1094
|
+
return {
|
|
1095
|
+
runId: runQueued.runId,
|
|
1096
|
+
datasetId: runQueued.datasetId,
|
|
1097
|
+
datasetName: runQueued.datasetName,
|
|
1098
|
+
evaluatorIds: runQueued.evaluatorIds,
|
|
1099
|
+
queuedAt: runQueued.ts ?? 0,
|
|
1100
|
+
startedAt: runStarted?.startedAt,
|
|
1101
|
+
finishedAt: runCompleted?.finishedAt ?? runFailed?.finishedAt,
|
|
1102
|
+
totalTestCases: runQueued.totalTestCases,
|
|
1103
|
+
completedTestCases,
|
|
1104
|
+
passedTestCases,
|
|
1105
|
+
failedTestCases,
|
|
1106
|
+
status,
|
|
1107
|
+
artifactPath,
|
|
1108
|
+
errorMessage: runFailed?.errorMessage
|
|
1109
|
+
};
|
|
1110
|
+
}
|
|
1111
|
+
function aggregateTestCaseProgress(lines) {
|
|
1112
|
+
let completedTestCases = 0;
|
|
1113
|
+
const testCasePassedBy = /* @__PURE__ */ new Map();
|
|
1114
|
+
for (const line of lines) {
|
|
1115
|
+
try {
|
|
1116
|
+
const event = JSON.parse(line);
|
|
1117
|
+
if (event.type === "TestCaseProgress") {
|
|
1118
|
+
const ev = event;
|
|
1119
|
+
completedTestCases = ev.completedTestCases ?? completedTestCases;
|
|
1120
|
+
const id = ev.testCaseId;
|
|
1121
|
+
const current = testCasePassedBy.get(id);
|
|
1122
|
+
testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
|
|
845
1123
|
}
|
|
846
|
-
|
|
847
|
-
}
|
|
848
|
-
const colored = lines.join("\n");
|
|
849
|
-
console.log(colored || "(no differences)");
|
|
850
|
-
return colored;
|
|
1124
|
+
} catch {
|
|
1125
|
+
}
|
|
851
1126
|
}
|
|
852
|
-
|
|
853
|
-
|
|
1127
|
+
let passedTestCases = 0;
|
|
1128
|
+
let failedTestCases = 0;
|
|
1129
|
+
for (const passed of testCasePassedBy.values()) {
|
|
1130
|
+
if (passed) {
|
|
1131
|
+
passedTestCases += 1;
|
|
1132
|
+
} else {
|
|
1133
|
+
failedTestCases += 1;
|
|
1134
|
+
}
|
|
1135
|
+
}
|
|
1136
|
+
return { completedTestCases, passedTestCases, failedTestCases };
|
|
854
1137
|
}
|
|
855
1138
|
|
|
856
1139
|
// src/runner/config.ts
|
|
@@ -862,6 +1145,7 @@ var defaultRunnerConfig = {
|
|
|
862
1145
|
rootDir: process.cwd(),
|
|
863
1146
|
datasetSuffixes: [".dataset.ts", ".dataset.tsx", ".dataset.js", ".dataset.mjs"],
|
|
864
1147
|
evaluatorSuffixes: [".evaluator.ts", ".evaluator.tsx", ".evaluator.js", ".evaluator.mjs"],
|
|
1148
|
+
runConfigSuffixes: [".run-config.ts", ".run-config.tsx", ".run-config.js", ".run-config.mjs"],
|
|
865
1149
|
testCaseSuffixes: [".test-case.ts", ".test-case.tsx", ".test-case.js", ".test-case.mjs"],
|
|
866
1150
|
excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
|
|
867
1151
|
},
|
|
@@ -887,6 +1171,11 @@ function toRunnerConfigOverrides(config) {
|
|
|
887
1171
|
} else if (rawDiscovery?.evaluatorSuffixes !== void 0) {
|
|
888
1172
|
discovery.evaluatorSuffixes = rawDiscovery.evaluatorSuffixes;
|
|
889
1173
|
}
|
|
1174
|
+
if (rawDiscovery?.runConfigFilePatterns !== void 0) {
|
|
1175
|
+
discovery.runConfigSuffixes = rawDiscovery.runConfigFilePatterns;
|
|
1176
|
+
} else if (rawDiscovery?.runConfigSuffixes !== void 0) {
|
|
1177
|
+
discovery.runConfigSuffixes = rawDiscovery.runConfigSuffixes;
|
|
1178
|
+
}
|
|
890
1179
|
if (rawDiscovery?.testCaseFilePatterns !== void 0) {
|
|
891
1180
|
discovery.testCaseSuffixes = rawDiscovery.testCaseFilePatterns;
|
|
892
1181
|
} else if (rawDiscovery?.testCaseSuffixes !== void 0) {
|
|
@@ -985,6 +1274,9 @@ function isDatasetLike(value) {
|
|
|
985
1274
|
function isEvaluatorLike(value) {
|
|
986
1275
|
return hasMethod(value, "getName") && hasMethod(value, "resolveContext") && hasMethod(value, "getEvaluateFn");
|
|
987
1276
|
}
|
|
1277
|
+
function isRunConfigLike(value) {
|
|
1278
|
+
return hasMethod(value, "getName") && hasMethod(value, "getRuns") && typeof value.getRuns === "function";
|
|
1279
|
+
}
|
|
988
1280
|
function isTestCaseLike(value) {
|
|
989
1281
|
return hasMethod(value, "getName") && hasMethod(value, "getTags") && hasMethod(value, "getInput");
|
|
990
1282
|
}
|
|
@@ -1073,6 +1365,23 @@ async function collectEvaluatorsFromFiles(config) {
|
|
|
1073
1365
|
);
|
|
1074
1366
|
return found.flat();
|
|
1075
1367
|
}
|
|
1368
|
+
async function collectRunConfigsFromFiles(config) {
|
|
1369
|
+
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
1370
|
+
const matched = files.filter((filePath) => hasOneSuffix(filePath, config.runConfigSuffixes));
|
|
1371
|
+
const found = await Promise.all(
|
|
1372
|
+
matched.map(async (absolutePath) => {
|
|
1373
|
+
const exports = await loadModuleExports(absolutePath);
|
|
1374
|
+
const runConfigs = exports.filter(isRunConfigLike);
|
|
1375
|
+
const relPath = relative(config.rootDir, absolutePath);
|
|
1376
|
+
return runConfigs.map((runConfig) => ({
|
|
1377
|
+
id: runConfig.getName(),
|
|
1378
|
+
filePath: relPath,
|
|
1379
|
+
runConfig
|
|
1380
|
+
}));
|
|
1381
|
+
})
|
|
1382
|
+
);
|
|
1383
|
+
return found.flat();
|
|
1384
|
+
}
|
|
1076
1385
|
async function collectTestCasesFromFiles(config) {
|
|
1077
1386
|
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
1078
1387
|
const matched = files.filter((filePath) => hasOneSuffix(filePath, config.testCaseSuffixes));
|
|
@@ -1165,15 +1474,17 @@ function readOutput(testCase) {
|
|
|
1165
1474
|
}
|
|
1166
1475
|
return candidate.getOutput();
|
|
1167
1476
|
}
|
|
1168
|
-
function buildEvaluationUnits(testCases) {
|
|
1477
|
+
function buildEvaluationUnits(testCases, repetitionCount) {
|
|
1478
|
+
const count = Math.max(1, repetitionCount);
|
|
1169
1479
|
const units = [];
|
|
1170
1480
|
for (const testCaseItem of testCases) {
|
|
1171
|
-
const
|
|
1172
|
-
for (let r = 0; r <
|
|
1481
|
+
const repetitionId = `rep-${randomUUID()}`;
|
|
1482
|
+
for (let r = 0; r < count; r++) {
|
|
1173
1483
|
units.push({
|
|
1174
1484
|
testCaseItem,
|
|
1175
|
-
|
|
1176
|
-
|
|
1485
|
+
repetitionId,
|
|
1486
|
+
repetitionIndex: r + 1,
|
|
1487
|
+
repetitionCount: count
|
|
1177
1488
|
});
|
|
1178
1489
|
}
|
|
1179
1490
|
}
|
|
@@ -1186,7 +1497,7 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
|
|
|
1186
1497
|
return join(artifactDirectory, `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`);
|
|
1187
1498
|
}
|
|
1188
1499
|
function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
|
|
1189
|
-
const { testCaseItem,
|
|
1500
|
+
const { testCaseItem, repetitionId, repetitionIndex, repetitionCount } = unit;
|
|
1190
1501
|
return Effect.gen(function* () {
|
|
1191
1502
|
const evaluatorRunId = `run-${randomUUID()}`;
|
|
1192
1503
|
const started = Date.now();
|
|
@@ -1195,11 +1506,12 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1195
1506
|
type: "TestCaseStarted",
|
|
1196
1507
|
runId: task.runId,
|
|
1197
1508
|
testCaseId: testCaseItem.id,
|
|
1198
|
-
testCaseName: testCaseItem.testCase
|
|
1509
|
+
testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
|
|
1199
1510
|
startedTestCases: startedEvaluations,
|
|
1200
1511
|
totalTestCases: totalEvaluations,
|
|
1201
|
-
|
|
1202
|
-
|
|
1512
|
+
repetitionId,
|
|
1513
|
+
repetitionIndex,
|
|
1514
|
+
repetitionCount
|
|
1203
1515
|
});
|
|
1204
1516
|
const evaluatorScores = [];
|
|
1205
1517
|
let testCaseError;
|
|
@@ -1233,8 +1545,15 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1233
1545
|
meta: {
|
|
1234
1546
|
triggerId: task.triggerId,
|
|
1235
1547
|
runId: evaluatorRunId,
|
|
1236
|
-
datasetId: task.datasetId
|
|
1548
|
+
datasetId: task.datasetId,
|
|
1549
|
+
repetitionId,
|
|
1550
|
+
repetitionIndex,
|
|
1551
|
+
repetitionCount,
|
|
1552
|
+
runConfigName: task.runConfigName
|
|
1237
1553
|
},
|
|
1554
|
+
testCaseTags: getTestCaseTagList(testCaseItem.testCase),
|
|
1555
|
+
runConfigTags: task.runConfigTags,
|
|
1556
|
+
evaluatorTags: getEvaluatorTagList(evaluator),
|
|
1238
1557
|
logDiff,
|
|
1239
1558
|
log,
|
|
1240
1559
|
createError
|
|
@@ -1277,18 +1596,19 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1277
1596
|
});
|
|
1278
1597
|
}
|
|
1279
1598
|
}
|
|
1280
|
-
const
|
|
1599
|
+
const repetitionPassedThis = evaluatorScores.every((s) => s.passed);
|
|
1281
1600
|
const completedEvaluations = yield* Ref.modify(completedRef, (n) => [n + 1, n + 1]);
|
|
1282
1601
|
const progressEvent = {
|
|
1283
1602
|
type: "TestCaseProgress",
|
|
1284
1603
|
runId: task.runId,
|
|
1285
1604
|
testCaseId: testCaseItem.id,
|
|
1286
|
-
testCaseName: testCaseItem.testCase
|
|
1605
|
+
testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
|
|
1287
1606
|
completedTestCases: completedEvaluations,
|
|
1288
1607
|
totalTestCases: totalEvaluations,
|
|
1289
|
-
|
|
1290
|
-
|
|
1291
|
-
|
|
1608
|
+
repetitionId,
|
|
1609
|
+
repetitionIndex,
|
|
1610
|
+
repetitionCount,
|
|
1611
|
+
passed: repetitionPassedThis,
|
|
1292
1612
|
durationMs: Date.now() - started,
|
|
1293
1613
|
evaluatorScores,
|
|
1294
1614
|
output,
|
|
@@ -1309,9 +1629,9 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1309
1629
|
(map) => {
|
|
1310
1630
|
const key = testCaseItem.id;
|
|
1311
1631
|
const existing = map.get(key) ?? { completedCount: 0, results: [] };
|
|
1312
|
-
const newResults = [...existing.results,
|
|
1632
|
+
const newResults = [...existing.results, repetitionPassedThis];
|
|
1313
1633
|
const newCompletedCount = existing.completedCount + 1;
|
|
1314
|
-
const isLast = newCompletedCount ===
|
|
1634
|
+
const isLast = newCompletedCount === repetitionCount;
|
|
1315
1635
|
const newMap = new Map(map);
|
|
1316
1636
|
newMap.set(key, {
|
|
1317
1637
|
completedCount: newCompletedCount,
|
|
@@ -1348,10 +1668,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
1348
1668
|
runId: task.runId,
|
|
1349
1669
|
startedAt
|
|
1350
1670
|
});
|
|
1351
|
-
const totalEvaluations = task.testCases.
|
|
1352
|
-
(sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
|
|
1353
|
-
0
|
|
1354
|
-
);
|
|
1671
|
+
const totalEvaluations = task.testCases.length * Math.max(1, task.repetitions);
|
|
1355
1672
|
const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
|
|
1356
1673
|
const completedRef = yield* Ref.make(0);
|
|
1357
1674
|
const startedRef = yield* Ref.make(0);
|
|
@@ -1360,7 +1677,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
1360
1677
|
const testCaseResultsRef = yield* Ref.make(
|
|
1361
1678
|
/* @__PURE__ */ new Map()
|
|
1362
1679
|
);
|
|
1363
|
-
const evaluationUnits = buildEvaluationUnits(task.testCases);
|
|
1680
|
+
const evaluationUnits = buildEvaluationUnits(task.testCases, task.repetitions);
|
|
1364
1681
|
const processEvaluation = (unit) => processOneEvaluation(
|
|
1365
1682
|
task,
|
|
1366
1683
|
unit,
|
|
@@ -1374,11 +1691,20 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
1374
1691
|
failedRef,
|
|
1375
1692
|
testCaseResultsRef
|
|
1376
1693
|
);
|
|
1377
|
-
|
|
1378
|
-
|
|
1379
|
-
|
|
1380
|
-
|
|
1381
|
-
|
|
1694
|
+
const globalSem = task.globalEvaluationSemaphore;
|
|
1695
|
+
if (globalSem !== void 0) {
|
|
1696
|
+
yield* Effect.forEach(
|
|
1697
|
+
evaluationUnits,
|
|
1698
|
+
(unit) => globalSem.withPermits(1)(processEvaluation(unit)),
|
|
1699
|
+
{ concurrency: "unbounded", discard: true }
|
|
1700
|
+
);
|
|
1701
|
+
} else {
|
|
1702
|
+
yield* Effect.forEach(
|
|
1703
|
+
evaluationUnits,
|
|
1704
|
+
processEvaluation,
|
|
1705
|
+
maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
|
|
1706
|
+
);
|
|
1707
|
+
}
|
|
1382
1708
|
const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* Effect.all([
|
|
1383
1709
|
Ref.get(completedRef),
|
|
1384
1710
|
Ref.get(passedRef),
|
|
@@ -1414,125 +1740,34 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
1414
1740
|
artifactPath: task.snapshot.artifactPath
|
|
1415
1741
|
});
|
|
1416
1742
|
});
|
|
1417
|
-
|
|
1418
|
-
|
|
1419
|
-
|
|
1420
|
-
|
|
1421
|
-
|
|
1422
|
-
} catch {
|
|
1423
|
-
return [];
|
|
1424
|
-
}
|
|
1425
|
-
const jsonlFiles = entries.filter((name) => name.endsWith(".jsonl"));
|
|
1426
|
-
const snapshots = [];
|
|
1427
|
-
for (const fileName of jsonlFiles) {
|
|
1428
|
-
const filePath = join(baseDir, fileName);
|
|
1429
|
-
try {
|
|
1430
|
-
const snapshot = await parseArtifactToSnapshot(filePath, config);
|
|
1431
|
-
if (snapshot) {
|
|
1432
|
-
snapshots.push(snapshot);
|
|
1433
|
-
}
|
|
1434
|
-
} catch {
|
|
1435
|
-
}
|
|
1436
|
-
}
|
|
1437
|
-
return snapshots.sort((a, b) => b.queuedAt - a.queuedAt);
|
|
1438
|
-
}
|
|
1439
|
-
async function parseArtifactToSnapshot(filePath, _config) {
|
|
1440
|
-
const content = await readFile(filePath, "utf8");
|
|
1441
|
-
const lines = content.split("\n").filter((line) => line.trim().length > 0);
|
|
1442
|
-
if (lines.length === 0) {
|
|
1443
|
-
return null;
|
|
1444
|
-
}
|
|
1445
|
-
let runQueued = null;
|
|
1446
|
-
let runCompleted = null;
|
|
1447
|
-
let runFailed = null;
|
|
1448
|
-
let runStarted = null;
|
|
1449
|
-
for (const line of lines) {
|
|
1450
|
-
try {
|
|
1451
|
-
const event = JSON.parse(line);
|
|
1452
|
-
const type = event.type;
|
|
1453
|
-
if (type === "RunQueued") {
|
|
1454
|
-
runQueued = {
|
|
1455
|
-
runId: event.runId,
|
|
1456
|
-
datasetId: event.datasetId,
|
|
1457
|
-
datasetName: event.datasetName,
|
|
1458
|
-
evaluatorIds: event.evaluatorIds,
|
|
1459
|
-
totalTestCases: event.totalTestCases ?? 0,
|
|
1460
|
-
artifactPath: event.artifactPath ?? filePath,
|
|
1461
|
-
ts: event.ts
|
|
1462
|
-
};
|
|
1463
|
-
}
|
|
1464
|
-
if (type === "RunStarted") {
|
|
1465
|
-
runStarted = { startedAt: event.startedAt };
|
|
1466
|
-
}
|
|
1467
|
-
if (type === "RunCompleted") {
|
|
1468
|
-
runCompleted = {
|
|
1469
|
-
passedTestCases: event.passedTestCases,
|
|
1470
|
-
failedTestCases: event.failedTestCases,
|
|
1471
|
-
totalTestCases: event.totalTestCases,
|
|
1472
|
-
finishedAt: event.finishedAt
|
|
1473
|
-
};
|
|
1474
|
-
}
|
|
1475
|
-
if (type === "RunFailed") {
|
|
1476
|
-
runFailed = {
|
|
1477
|
-
finishedAt: event.finishedAt,
|
|
1478
|
-
errorMessage: event.errorMessage
|
|
1479
|
-
};
|
|
1480
|
-
}
|
|
1481
|
-
} catch {
|
|
1482
|
-
}
|
|
1743
|
+
|
|
1744
|
+
// src/runner/name-pattern.ts
|
|
1745
|
+
function parseRegexLiteral(pattern) {
|
|
1746
|
+
if (!pattern.startsWith("/")) {
|
|
1747
|
+
return void 0;
|
|
1483
1748
|
}
|
|
1484
|
-
|
|
1485
|
-
|
|
1749
|
+
const lastSlash = pattern.lastIndexOf("/");
|
|
1750
|
+
if (lastSlash <= 0) {
|
|
1751
|
+
return void 0;
|
|
1486
1752
|
}
|
|
1487
|
-
const artifactPath = filePath;
|
|
1488
|
-
const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
|
|
1489
|
-
const progress = aggregateTestCaseProgress(lines);
|
|
1490
|
-
const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
|
|
1491
|
-
const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
|
|
1492
|
-
const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
|
|
1493
1753
|
return {
|
|
1494
|
-
|
|
1495
|
-
|
|
1496
|
-
datasetName: runQueued.datasetName,
|
|
1497
|
-
evaluatorIds: runQueued.evaluatorIds,
|
|
1498
|
-
queuedAt: runQueued.ts ?? 0,
|
|
1499
|
-
startedAt: runStarted?.startedAt,
|
|
1500
|
-
finishedAt: runCompleted?.finishedAt ?? runFailed?.finishedAt,
|
|
1501
|
-
totalTestCases: runQueued.totalTestCases,
|
|
1502
|
-
completedTestCases,
|
|
1503
|
-
passedTestCases,
|
|
1504
|
-
failedTestCases,
|
|
1505
|
-
status,
|
|
1506
|
-
artifactPath,
|
|
1507
|
-
errorMessage: runFailed?.errorMessage
|
|
1754
|
+
source: pattern.slice(1, lastSlash),
|
|
1755
|
+
flags: pattern.slice(lastSlash + 1)
|
|
1508
1756
|
};
|
|
1509
1757
|
}
|
|
1510
|
-
function
|
|
1511
|
-
|
|
1512
|
-
const
|
|
1513
|
-
|
|
1514
|
-
|
|
1515
|
-
|
|
1516
|
-
if (event.type === "TestCaseProgress") {
|
|
1517
|
-
const ev = event;
|
|
1518
|
-
completedTestCases = ev.completedTestCases ?? completedTestCases;
|
|
1519
|
-
const id = ev.testCaseId;
|
|
1520
|
-
const current = testCasePassedBy.get(id);
|
|
1521
|
-
testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
|
|
1522
|
-
}
|
|
1523
|
-
} catch {
|
|
1524
|
-
}
|
|
1758
|
+
function createNameMatcher(pattern) {
|
|
1759
|
+
const normalizedPattern = pattern.trim();
|
|
1760
|
+
const regexLiteral = parseRegexLiteral(normalizedPattern);
|
|
1761
|
+
if (regexLiteral) {
|
|
1762
|
+
const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
|
|
1763
|
+
return (value) => regex.test(value);
|
|
1525
1764
|
}
|
|
1526
|
-
|
|
1527
|
-
|
|
1528
|
-
|
|
1529
|
-
|
|
1530
|
-
passedTestCases += 1;
|
|
1531
|
-
} else {
|
|
1532
|
-
failedTestCases += 1;
|
|
1533
|
-
}
|
|
1765
|
+
if (normalizedPattern.includes("*")) {
|
|
1766
|
+
const escaped = normalizedPattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
|
|
1767
|
+
const regex = new RegExp(`^${escaped}$`, "i");
|
|
1768
|
+
return (value) => regex.test(value);
|
|
1534
1769
|
}
|
|
1535
|
-
return
|
|
1770
|
+
return (value) => value.toLowerCase() === normalizedPattern.toLowerCase();
|
|
1536
1771
|
}
|
|
1537
1772
|
async function appendJsonLine(artifactPath, payload) {
|
|
1538
1773
|
await mkdir(dirname(artifactPath), { recursive: true });
|
|
@@ -1591,32 +1826,12 @@ function searchCollectedTestCases(all, query) {
|
|
|
1591
1826
|
}
|
|
1592
1827
|
|
|
1593
1828
|
// src/runner/api.ts
|
|
1594
|
-
function
|
|
1595
|
-
|
|
1596
|
-
|
|
1597
|
-
|
|
1598
|
-
const lastSlash = pattern.lastIndexOf("/");
|
|
1599
|
-
if (lastSlash <= 0) {
|
|
1600
|
-
return void 0;
|
|
1601
|
-
}
|
|
1602
|
-
return {
|
|
1603
|
-
source: pattern.slice(1, lastSlash),
|
|
1604
|
-
flags: pattern.slice(lastSlash + 1)
|
|
1605
|
-
};
|
|
1606
|
-
}
|
|
1607
|
-
function createNameMatcher(pattern) {
|
|
1608
|
-
const normalizedPattern = pattern.trim();
|
|
1609
|
-
const regexLiteral = parseRegexLiteral(normalizedPattern);
|
|
1610
|
-
if (regexLiteral) {
|
|
1611
|
-
const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
|
|
1612
|
-
return (value) => regex.test(value);
|
|
1613
|
-
}
|
|
1614
|
-
if (normalizedPattern.includes("*")) {
|
|
1615
|
-
const escaped = normalizedPattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
|
|
1616
|
-
const regex = new RegExp(`^${escaped}$`, "i");
|
|
1617
|
-
return (value) => regex.test(value);
|
|
1829
|
+
function normalizeRunRepetitions(value) {
|
|
1830
|
+
const n = value ?? 1;
|
|
1831
|
+
if (!Number.isInteger(n) || n < 1) {
|
|
1832
|
+
throw new Error(`repetitions must be a positive integer, got ${String(value)}`);
|
|
1618
1833
|
}
|
|
1619
|
-
return
|
|
1834
|
+
return n;
|
|
1620
1835
|
}
|
|
1621
1836
|
function mergeRunnerOverrides(base, next) {
|
|
1622
1837
|
if (!base) {
|
|
@@ -1651,6 +1866,7 @@ var EffectRunner = class {
|
|
|
1651
1866
|
this.listeners = /* @__PURE__ */ new Set();
|
|
1652
1867
|
this.datasetsById = /* @__PURE__ */ new Map();
|
|
1653
1868
|
this.evaluatorsById = /* @__PURE__ */ new Map();
|
|
1869
|
+
this.runConfigsById = /* @__PURE__ */ new Map();
|
|
1654
1870
|
this.schedulerFiber = Effect.runFork(this.createSchedulerEffect());
|
|
1655
1871
|
this.persistenceFiber = Effect.runFork(
|
|
1656
1872
|
createPersistenceWorker(this.persistenceQueue)
|
|
@@ -1691,6 +1907,137 @@ var EffectRunner = class {
|
|
|
1691
1907
|
(item) => matcher(item.evaluator.getName() ?? "")
|
|
1692
1908
|
);
|
|
1693
1909
|
}
|
|
1910
|
+
async collectRunConfigs() {
|
|
1911
|
+
const runConfigs = await collectRunConfigsFromFiles(this.config.discovery);
|
|
1912
|
+
this.runConfigsById.clear();
|
|
1913
|
+
const byNameLower = /* @__PURE__ */ new Map();
|
|
1914
|
+
for (const item of runConfigs) {
|
|
1915
|
+
const id = item.runConfig.getName();
|
|
1916
|
+
const lower = id.toLowerCase();
|
|
1917
|
+
const prev = byNameLower.get(lower);
|
|
1918
|
+
if (prev !== void 0 && prev.filePath !== item.filePath) {
|
|
1919
|
+
throw new Error(
|
|
1920
|
+
`Duplicate RunConfig name "${id}" (matches "${prev.runConfig.getName()}" case-insensitively): ${prev.filePath} and ${item.filePath}`
|
|
1921
|
+
);
|
|
1922
|
+
}
|
|
1923
|
+
byNameLower.set(lower, item);
|
|
1924
|
+
this.runConfigsById.set(id, item);
|
|
1925
|
+
}
|
|
1926
|
+
return runConfigs;
|
|
1927
|
+
}
|
|
1928
|
+
async resolveRunConfigByName(name) {
|
|
1929
|
+
if (this.runConfigsById.size === 0) {
|
|
1930
|
+
await this.collectRunConfigs();
|
|
1931
|
+
}
|
|
1932
|
+
const key = validateRunConfigName(name, `RunConfig "${name.trim()}"`);
|
|
1933
|
+
const keyLower = key.toLowerCase();
|
|
1934
|
+
const matches = Array.from(this.runConfigsById.values()).filter(
|
|
1935
|
+
(item) => item.runConfig.getName().toLowerCase() === keyLower
|
|
1936
|
+
);
|
|
1937
|
+
if (matches.length === 0) {
|
|
1938
|
+
return void 0;
|
|
1939
|
+
}
|
|
1940
|
+
if (matches.length > 1) {
|
|
1941
|
+
throw new Error(
|
|
1942
|
+
`Multiple RunConfigs named "${name}": ${matches.map((m) => m.filePath).join(", ")}`
|
|
1943
|
+
);
|
|
1944
|
+
}
|
|
1945
|
+
return matches[0];
|
|
1946
|
+
}
|
|
1947
|
+
async expandRunConfigToJobs(collected) {
|
|
1948
|
+
if (this.datasetsById.size === 0) {
|
|
1949
|
+
await this.collectDatasets();
|
|
1950
|
+
}
|
|
1951
|
+
if (this.evaluatorsById.size === 0) {
|
|
1952
|
+
await this.collectEvaluators();
|
|
1953
|
+
}
|
|
1954
|
+
const rcName = collected.runConfig.getName();
|
|
1955
|
+
const jobs = [];
|
|
1956
|
+
const runs = collected.runConfig.getRuns();
|
|
1957
|
+
for (const [i, row] of runs.entries()) {
|
|
1958
|
+
const dsCollected = Array.from(this.datasetsById.values()).find(
|
|
1959
|
+
(d) => d.dataset === row.dataset
|
|
1960
|
+
);
|
|
1961
|
+
if (!dsCollected) {
|
|
1962
|
+
throw new Error(
|
|
1963
|
+
`RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getName()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
|
|
1964
|
+
);
|
|
1965
|
+
}
|
|
1966
|
+
let evaluatorIds;
|
|
1967
|
+
if ("evaluatorPattern" in row && typeof row.evaluatorPattern === "string") {
|
|
1968
|
+
const matcher = createNameMatcher(row.evaluatorPattern);
|
|
1969
|
+
const matched = Array.from(this.evaluatorsById.values()).filter(
|
|
1970
|
+
(item) => matcher(item.evaluator.getName() ?? "")
|
|
1971
|
+
);
|
|
1972
|
+
if (matched.length === 0) {
|
|
1973
|
+
throw new Error(
|
|
1974
|
+
`RunConfig "${rcName}" run[${i}]: no evaluator matched pattern "${row.evaluatorPattern}"`
|
|
1975
|
+
);
|
|
1976
|
+
}
|
|
1977
|
+
evaluatorIds = matched.map((item) => item.id);
|
|
1978
|
+
} else {
|
|
1979
|
+
const evaluators = row.evaluators;
|
|
1980
|
+
evaluatorIds = [];
|
|
1981
|
+
for (const ev of evaluators) {
|
|
1982
|
+
const found = Array.from(this.evaluatorsById.values()).find(
|
|
1983
|
+
(item) => item.evaluator === ev
|
|
1984
|
+
);
|
|
1985
|
+
if (!found) {
|
|
1986
|
+
throw new Error(
|
|
1987
|
+
`RunConfig "${rcName}" run[${i}]: evaluator "${getEvaluatorDisplayLabel(ev) ?? "unknown"}" was not found among discovered evaluator exports`
|
|
1988
|
+
);
|
|
1989
|
+
}
|
|
1990
|
+
evaluatorIds.push(found.id);
|
|
1991
|
+
}
|
|
1992
|
+
}
|
|
1993
|
+
const repetitions = "repetitions" in row && row.repetitions !== void 0 ? row.repetitions : 1;
|
|
1994
|
+
jobs.push({
|
|
1995
|
+
datasetId: dsCollected.id,
|
|
1996
|
+
evaluatorIds,
|
|
1997
|
+
runConfigName: rcName,
|
|
1998
|
+
runConfigDisplayLabel: collected.runConfig.getDisplayLabel(),
|
|
1999
|
+
runConfigTags: collected.runConfig.getTags(),
|
|
2000
|
+
repetitions
|
|
2001
|
+
});
|
|
2002
|
+
}
|
|
2003
|
+
return jobs;
|
|
2004
|
+
}
|
|
2005
|
+
async expandRunConfigNamesToJobs(names) {
|
|
2006
|
+
const jobs = [];
|
|
2007
|
+
for (const name of names) {
|
|
2008
|
+
const collected = await this.resolveRunConfigByName(name);
|
|
2009
|
+
if (!collected) {
|
|
2010
|
+
const known = await this.collectRunConfigs();
|
|
2011
|
+
const available = known.map((r) => r.runConfig.getName()).sort();
|
|
2012
|
+
throw new Error(
|
|
2013
|
+
available.length > 0 ? `RunConfig "${name}" not found. Available RunConfigs: ${available.join(", ")}` : `RunConfig "${name}" not found and no RunConfigs were discovered.`
|
|
2014
|
+
);
|
|
2015
|
+
}
|
|
2016
|
+
jobs.push(...await this.expandRunConfigToJobs(collected));
|
|
2017
|
+
}
|
|
2018
|
+
return jobs;
|
|
2019
|
+
}
|
|
2020
|
+
async runDatasetJobsWithSharedConcurrency(request) {
|
|
2021
|
+
const globalConcurrency = Math.max(1, request.globalConcurrency);
|
|
2022
|
+
const sem = Effect.unsafeMakeSemaphore(globalConcurrency);
|
|
2023
|
+
const triggerId = request.triggerId ?? `trg-${randomUUID()}`;
|
|
2024
|
+
const snapshots = [];
|
|
2025
|
+
for (const job of request.jobs) {
|
|
2026
|
+
snapshots.push(
|
|
2027
|
+
await this.startDatasetRun({
|
|
2028
|
+
datasetId: job.datasetId,
|
|
2029
|
+
evaluatorIds: job.evaluatorIds,
|
|
2030
|
+
triggerId,
|
|
2031
|
+
maxConcurrency: this.config.maxConcurrency ?? 1,
|
|
2032
|
+
globalEvaluationSemaphore: sem,
|
|
2033
|
+
runConfigName: job.runConfigName,
|
|
2034
|
+
runConfigTags: job.runConfigTags,
|
|
2035
|
+
repetitions: job.repetitions
|
|
2036
|
+
})
|
|
2037
|
+
);
|
|
2038
|
+
}
|
|
2039
|
+
return snapshots;
|
|
2040
|
+
}
|
|
1694
2041
|
async searchTestCases(query) {
|
|
1695
2042
|
const testCases = await collectTestCasesFromFiles(this.config.discovery);
|
|
1696
2043
|
return searchCollectedTestCases(testCases, query);
|
|
@@ -1709,35 +2056,45 @@ var EffectRunner = class {
|
|
|
1709
2056
|
);
|
|
1710
2057
|
}
|
|
1711
2058
|
async runDatasetWith(request) {
|
|
2059
|
+
const runConfigName = validateRunConfigName(
|
|
2060
|
+
request.runConfigName,
|
|
2061
|
+
"runDatasetWith.runConfigName"
|
|
2062
|
+
);
|
|
2063
|
+
return this.startDatasetRun({
|
|
2064
|
+
datasetId: request.datasetId,
|
|
2065
|
+
evaluatorIds: request.evaluatorIds,
|
|
2066
|
+
triggerId: request.triggerId,
|
|
2067
|
+
maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
|
|
2068
|
+
repetitions: request.repetitions,
|
|
2069
|
+
runConfigName,
|
|
2070
|
+
runConfigTags: request.runConfigTags
|
|
2071
|
+
});
|
|
2072
|
+
}
|
|
2073
|
+
async startDatasetRun(params) {
|
|
1712
2074
|
if (this.datasetsById.size === 0) {
|
|
1713
2075
|
await this.collectDatasets();
|
|
1714
2076
|
}
|
|
1715
2077
|
if (this.evaluatorsById.size === 0) {
|
|
1716
2078
|
await this.collectEvaluators();
|
|
1717
2079
|
}
|
|
1718
|
-
const dataset = this.datasetsById.get(
|
|
2080
|
+
const dataset = this.datasetsById.get(params.datasetId);
|
|
1719
2081
|
if (!dataset) {
|
|
1720
|
-
throw new Error(`Unknown dataset: ${
|
|
2082
|
+
throw new Error(`Unknown dataset: ${params.datasetId}`);
|
|
1721
2083
|
}
|
|
1722
|
-
const selectedEvaluators =
|
|
2084
|
+
const selectedEvaluators = params.evaluatorIds.map((id) => this.evaluatorsById.get(id)).filter((value) => Boolean(value)).map((value) => ({ id: value.id, evaluator: value.evaluator }));
|
|
1723
2085
|
if (selectedEvaluators.length === 0) {
|
|
1724
2086
|
throw new Error("No evaluators selected for run");
|
|
1725
2087
|
}
|
|
1726
|
-
const selectedTestCases = await this.collectDatasetTestCases(
|
|
1727
|
-
const
|
|
1728
|
-
|
|
1729
|
-
|
|
1730
|
-
)
|
|
1731
|
-
const triggerId = request.triggerId ?? `trg-${randomUUID()}`;
|
|
2088
|
+
const selectedTestCases = await this.collectDatasetTestCases(params.datasetId);
|
|
2089
|
+
const repetitions = normalizeRunRepetitions(params.repetitions);
|
|
2090
|
+
const totalEvaluations = selectedTestCases.length * repetitions;
|
|
2091
|
+
const runConfigTags = [...params.runConfigTags ?? []];
|
|
2092
|
+
const triggerId = params.triggerId ?? `trg-${randomUUID()}`;
|
|
1732
2093
|
const runId = `run-${randomUUID()}`;
|
|
1733
|
-
const artifactPath = createArtifactPath(
|
|
1734
|
-
this.config.artifactDirectory,
|
|
1735
|
-
request.datasetId,
|
|
1736
|
-
runId
|
|
1737
|
-
);
|
|
2094
|
+
const artifactPath = createArtifactPath(this.config.artifactDirectory, params.datasetId, runId);
|
|
1738
2095
|
const snapshot = {
|
|
1739
2096
|
runId,
|
|
1740
|
-
datasetId: request.datasetId,
|
|
2097
|
+
datasetId: params.datasetId,
|
|
1741
2098
|
datasetName: dataset.dataset.getName(),
|
|
1742
2099
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
1743
2100
|
queuedAt: Date.now(),
|
|
@@ -1758,7 +2115,7 @@ var EffectRunner = class {
|
|
|
1758
2115
|
const queuedEvent = {
|
|
1759
2116
|
type: "RunQueued",
|
|
1760
2117
|
runId,
|
|
1761
|
-
datasetId: request.datasetId,
|
|
2118
|
+
datasetId: params.datasetId,
|
|
1762
2119
|
datasetName: dataset.dataset.getName(),
|
|
1763
2120
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
1764
2121
|
totalTestCases: totalEvaluations,
|
|
@@ -1772,17 +2129,20 @@ var EffectRunner = class {
|
|
|
1772
2129
|
payload: queuedEvent
|
|
1773
2130
|
})
|
|
1774
2131
|
);
|
|
1775
|
-
const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
|
|
1776
2132
|
await Effect.runPromise(
|
|
1777
2133
|
Queue.offer(this.runQueue, {
|
|
1778
2134
|
runId,
|
|
1779
2135
|
triggerId,
|
|
1780
|
-
datasetId: request.datasetId,
|
|
2136
|
+
datasetId: params.datasetId,
|
|
1781
2137
|
dataset: dataset.dataset,
|
|
1782
2138
|
evaluators: selectedEvaluators,
|
|
1783
2139
|
testCases: selectedTestCases,
|
|
1784
2140
|
snapshot,
|
|
1785
|
-
maxConcurrency
|
|
2141
|
+
maxConcurrency: params.maxConcurrency,
|
|
2142
|
+
globalEvaluationSemaphore: params.globalEvaluationSemaphore,
|
|
2143
|
+
runConfigName: params.runConfigName,
|
|
2144
|
+
runConfigTags,
|
|
2145
|
+
repetitions
|
|
1786
2146
|
})
|
|
1787
2147
|
);
|
|
1788
2148
|
return snapshot;
|
|
@@ -1854,6 +2214,11 @@ var EffectRunner = class {
|
|
|
1854
2214
|
}
|
|
1855
2215
|
};
|
|
1856
2216
|
|
|
1857
|
-
|
|
2217
|
+
// src/runner/events.ts
|
|
2218
|
+
var PROGRAMMATIC_RUN_CONFIG = {
|
|
2219
|
+
runConfigName: "programmatic"
|
|
2220
|
+
};
|
|
2221
|
+
|
|
2222
|
+
export { Dataset, Evaluator, EvaluatorNameSchema, Metric, PROGRAMMATIC_RUN_CONFIG, RunConfig, RunConfigNameSchema, Score, TagSet, TestCase, TestCaseNameSchema, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, deltaScore, formatScoreData, getEvaluatorDisplayLabel, getEvaluatorTagList, getLogLines, getMetricById, getScoreById, getTestCaseDisplayLabel, getTestCaseTagList, latencyMetric, loadMockData, loadRunnerData, normalizeOptionalDisplayName, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, validateEvaluatorName, validateRunConfigName, validateTestCaseName, withRunnerConfig };
|
|
1858
2223
|
//# sourceMappingURL=out.js.map
|
|
1859
2224
|
//# sourceMappingURL=index.js.map
|