@m4trix/evals 0.25.0 → 0.26.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +29 -7
- package/dist/cli-simple.cjs +911 -643
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +898 -630
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +688 -575
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +679 -566
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +959 -623
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +201 -7
- package/dist/index.js +947 -625
- package/dist/index.js.map +1 -1
- package/package.json +3 -5
package/dist/index.cjs
CHANGED
|
@@ -4,10 +4,10 @@ var effect = require('effect');
|
|
|
4
4
|
var diff = require('diff');
|
|
5
5
|
var stringify = require('fast-json-stable-stringify');
|
|
6
6
|
var crypto = require('crypto');
|
|
7
|
-
var
|
|
7
|
+
var promises = require('fs/promises');
|
|
8
8
|
var path = require('path');
|
|
9
|
+
var fs = require('fs');
|
|
9
10
|
var jitiModule = require('jiti');
|
|
10
|
-
var promises = require('fs/promises');
|
|
11
11
|
var url = require('url');
|
|
12
12
|
|
|
13
13
|
var _documentCurrentScript = typeof document !== 'undefined' ? document.currentScript : null;
|
|
@@ -34,6 +34,164 @@ function _interopNamespace(e) {
|
|
|
34
34
|
var stringify__default = /*#__PURE__*/_interopDefault(stringify);
|
|
35
35
|
var jitiModule__namespace = /*#__PURE__*/_interopNamespace(jitiModule);
|
|
36
36
|
|
|
37
|
+
// src/index.ts
|
|
38
|
+
var ENTITY_ID_PATTERN = /^[a-zA-Z0-9_-]+$/;
|
|
39
|
+
function makeEntityIdSchema(brand, label) {
|
|
40
|
+
return effect.Schema.String.pipe(
|
|
41
|
+
effect.Schema.trimmed(),
|
|
42
|
+
effect.Schema.minLength(1, {
|
|
43
|
+
message: () => `${label} must be non-empty.`
|
|
44
|
+
}),
|
|
45
|
+
effect.Schema.pattern(ENTITY_ID_PATTERN, {
|
|
46
|
+
message: () => `${label} may only contain letters, digits, underscores, and hyphens (no spaces). Examples: "my-nightly", "my_nightly", "myNightly".`
|
|
47
|
+
}),
|
|
48
|
+
effect.Schema.brand(brand)
|
|
49
|
+
);
|
|
50
|
+
}
|
|
51
|
+
var RunConfigNameSchema = makeEntityIdSchema("RunConfigName", "RunConfig name");
|
|
52
|
+
var EvaluatorNameSchema = makeEntityIdSchema("EvaluatorName", "Evaluator name");
|
|
53
|
+
var TestCaseNameSchema = makeEntityIdSchema("TestCaseName", "Test case name");
|
|
54
|
+
function validateWithSchema(schema, raw, context) {
|
|
55
|
+
const trimmed = raw.trim();
|
|
56
|
+
const decode = effect.Schema.decodeUnknownEither(
|
|
57
|
+
schema
|
|
58
|
+
);
|
|
59
|
+
const result = decode(trimmed);
|
|
60
|
+
if (effect.Either.isLeft(result)) {
|
|
61
|
+
throw new Error(`${context}: ${effect.ParseResult.TreeFormatter.formatErrorSync(result.left)}`);
|
|
62
|
+
}
|
|
63
|
+
return result.right;
|
|
64
|
+
}
|
|
65
|
+
function validateRunConfigName(raw, context) {
|
|
66
|
+
return validateWithSchema(RunConfigNameSchema, raw, context);
|
|
67
|
+
}
|
|
68
|
+
function validateEvaluatorName(raw, context) {
|
|
69
|
+
return validateWithSchema(EvaluatorNameSchema, raw, context);
|
|
70
|
+
}
|
|
71
|
+
function validateTestCaseName(raw, context) {
|
|
72
|
+
return validateWithSchema(TestCaseNameSchema, raw, context);
|
|
73
|
+
}
|
|
74
|
+
function normalizeOptionalDisplayName(raw) {
|
|
75
|
+
if (raw === void 0) {
|
|
76
|
+
return void 0;
|
|
77
|
+
}
|
|
78
|
+
const t = raw.trim();
|
|
79
|
+
return t.length === 0 ? void 0 : t;
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
// src/evals/evaluator.ts
|
|
83
|
+
var Evaluator = class _Evaluator {
|
|
84
|
+
constructor(config) {
|
|
85
|
+
this._config = config;
|
|
86
|
+
}
|
|
87
|
+
getState() {
|
|
88
|
+
return {
|
|
89
|
+
name: this._config.name,
|
|
90
|
+
displayName: this._config.displayName,
|
|
91
|
+
tags: this._config.tags,
|
|
92
|
+
inputSchema: this._config.inputSchema,
|
|
93
|
+
outputSchema: this._config.outputSchema,
|
|
94
|
+
scoreSchema: this._config.scoreSchema,
|
|
95
|
+
middlewares: this._config.middlewares,
|
|
96
|
+
evaluateFn: this._config.evaluateFn,
|
|
97
|
+
passThreshold: this._config.passThreshold,
|
|
98
|
+
passCriterion: this._config.passCriterion
|
|
99
|
+
};
|
|
100
|
+
}
|
|
101
|
+
static use(middleware) {
|
|
102
|
+
return new _Evaluator({
|
|
103
|
+
middlewares: [middleware],
|
|
104
|
+
tags: []
|
|
105
|
+
});
|
|
106
|
+
}
|
|
107
|
+
use(middleware) {
|
|
108
|
+
const state = this.getState();
|
|
109
|
+
return new _Evaluator({
|
|
110
|
+
...state,
|
|
111
|
+
middlewares: [...state.middlewares, middleware]
|
|
112
|
+
});
|
|
113
|
+
}
|
|
114
|
+
define(config) {
|
|
115
|
+
const { middlewares } = this.getState();
|
|
116
|
+
const name = validateEvaluatorName(config.name, "Evaluator.define");
|
|
117
|
+
const displayName = normalizeOptionalDisplayName(config.displayName);
|
|
118
|
+
const tags = config.tags !== void 0 ? [...config.tags] : [];
|
|
119
|
+
return new _Evaluator({
|
|
120
|
+
name,
|
|
121
|
+
displayName,
|
|
122
|
+
tags,
|
|
123
|
+
inputSchema: config.inputSchema,
|
|
124
|
+
outputSchema: config.outputSchema,
|
|
125
|
+
scoreSchema: config.scoreSchema,
|
|
126
|
+
middlewares,
|
|
127
|
+
passThreshold: config.passThreshold,
|
|
128
|
+
passCriterion: config.passCriterion
|
|
129
|
+
});
|
|
130
|
+
}
|
|
131
|
+
evaluate(fn) {
|
|
132
|
+
return new _Evaluator({
|
|
133
|
+
...this.getState(),
|
|
134
|
+
evaluateFn: fn
|
|
135
|
+
});
|
|
136
|
+
}
|
|
137
|
+
/** Canonical evaluator id when defined; otherwise undefined (middleware-only chain). */
|
|
138
|
+
getName() {
|
|
139
|
+
return this._config.name;
|
|
140
|
+
}
|
|
141
|
+
getDisplayName() {
|
|
142
|
+
return this._config.displayName;
|
|
143
|
+
}
|
|
144
|
+
/** Label for CLI/TUI: {@link getDisplayName} if set, otherwise {@link getName}. Undefined if not yet defined. */
|
|
145
|
+
getDisplayLabel() {
|
|
146
|
+
const id = this._config.name;
|
|
147
|
+
if (id === void 0) {
|
|
148
|
+
return void 0;
|
|
149
|
+
}
|
|
150
|
+
return this._config.displayName ?? id;
|
|
151
|
+
}
|
|
152
|
+
/** Tags from `Evaluator.define({ tags })`; empty until defined. */
|
|
153
|
+
getTags() {
|
|
154
|
+
return [...this._config.tags];
|
|
155
|
+
}
|
|
156
|
+
getInputSchema() {
|
|
157
|
+
return this._config.inputSchema;
|
|
158
|
+
}
|
|
159
|
+
getOutputSchema() {
|
|
160
|
+
return this._config.outputSchema;
|
|
161
|
+
}
|
|
162
|
+
getScoreSchema() {
|
|
163
|
+
return this._config.scoreSchema;
|
|
164
|
+
}
|
|
165
|
+
getMiddlewares() {
|
|
166
|
+
return this._config.middlewares;
|
|
167
|
+
}
|
|
168
|
+
getEvaluateFn() {
|
|
169
|
+
return this._config.evaluateFn;
|
|
170
|
+
}
|
|
171
|
+
getPassThreshold() {
|
|
172
|
+
return this._config.passThreshold;
|
|
173
|
+
}
|
|
174
|
+
getPassCriterion() {
|
|
175
|
+
return this._config.passCriterion;
|
|
176
|
+
}
|
|
177
|
+
async resolveContext() {
|
|
178
|
+
const parts = await Promise.all(this._config.middlewares.map((mw) => mw.resolve()));
|
|
179
|
+
return Object.assign({}, ...parts);
|
|
180
|
+
}
|
|
181
|
+
};
|
|
182
|
+
function getEvaluatorDisplayLabel(evaluator) {
|
|
183
|
+
if (typeof evaluator.getDisplayLabel === "function") {
|
|
184
|
+
const label = evaluator.getDisplayLabel();
|
|
185
|
+
if (label !== void 0) {
|
|
186
|
+
return label;
|
|
187
|
+
}
|
|
188
|
+
}
|
|
189
|
+
return typeof evaluator.getName === "function" ? evaluator.getName() : void 0;
|
|
190
|
+
}
|
|
191
|
+
function getEvaluatorTagList(evaluator) {
|
|
192
|
+
return typeof evaluator.getTags === "function" ? [...evaluator.getTags()] : [];
|
|
193
|
+
}
|
|
194
|
+
|
|
37
195
|
// src/cli/data.mock.json
|
|
38
196
|
var data_mock_default = {
|
|
39
197
|
datasets: [
|
|
@@ -184,9 +342,7 @@ var data_mock_default = {
|
|
|
184
342
|
{ name: "contract_match", score: 100 },
|
|
185
343
|
{ name: "arg_validity", score: 100 }
|
|
186
344
|
],
|
|
187
|
-
checks: [
|
|
188
|
-
{ name: "tool_calls", passed: true, detail: "0 unexpected" }
|
|
189
|
-
],
|
|
345
|
+
checks: [{ name: "tool_calls", passed: true, detail: "0 unexpected" }],
|
|
190
346
|
failures: [],
|
|
191
347
|
meta: {
|
|
192
348
|
model: "gpt-4o-mini",
|
|
@@ -209,9 +365,21 @@ var data_mock_default = {
|
|
|
209
365
|
}
|
|
210
366
|
],
|
|
211
367
|
evaluators: [
|
|
212
|
-
{
|
|
213
|
-
|
|
214
|
-
|
|
368
|
+
{
|
|
369
|
+
id: "json-schema-validator",
|
|
370
|
+
name: "JSON Schema Validator",
|
|
371
|
+
configPreview: "strict=true"
|
|
372
|
+
},
|
|
373
|
+
{
|
|
374
|
+
id: "tool-call-contract-checker",
|
|
375
|
+
name: "Tool-call Contract Checker",
|
|
376
|
+
configPreview: "unexpectedCalls=error"
|
|
377
|
+
},
|
|
378
|
+
{
|
|
379
|
+
id: "rubric-judge",
|
|
380
|
+
name: "Rubric Judge (LLM)",
|
|
381
|
+
configPreview: "model=gpt-4o-mini; scale=0-100"
|
|
382
|
+
},
|
|
215
383
|
{ id: "pii-leak-detector", name: "PII Leak Detector", configPreview: "redact=false" }
|
|
216
384
|
]
|
|
217
385
|
};
|
|
@@ -278,7 +446,7 @@ function toEvalDataset(item, snapshots) {
|
|
|
278
446
|
function toEvaluatorOption(item) {
|
|
279
447
|
return {
|
|
280
448
|
id: item.id,
|
|
281
|
-
name: item.evaluator
|
|
449
|
+
name: getEvaluatorDisplayLabel(item.evaluator) ?? toSlug(item.id),
|
|
282
450
|
configPreview: `Source: ${item.filePath}`
|
|
283
451
|
};
|
|
284
452
|
}
|
|
@@ -291,9 +459,7 @@ async function loadRunnerData(runner) {
|
|
|
291
459
|
const memSnapshots = runner.getAllRunSnapshots();
|
|
292
460
|
const seen = new Set(memSnapshots.map((s) => s.runId));
|
|
293
461
|
const fromDisk = diskSnapshots.filter((s) => !seen.has(s.runId));
|
|
294
|
-
const snapshots = [...memSnapshots, ...fromDisk].sort(
|
|
295
|
-
(a, b) => b.queuedAt - a.queuedAt
|
|
296
|
-
);
|
|
462
|
+
const snapshots = [...memSnapshots, ...fromDisk].sort((a, b) => b.queuedAt - a.queuedAt);
|
|
297
463
|
if (datasets.length === 0 && evaluators.length === 0) {
|
|
298
464
|
return loadMockData();
|
|
299
465
|
}
|
|
@@ -326,134 +492,6 @@ function parseStartupArgs(argv) {
|
|
|
326
492
|
return args;
|
|
327
493
|
}
|
|
328
494
|
|
|
329
|
-
// src/evals/test-case.ts
|
|
330
|
-
function resolve(value) {
|
|
331
|
-
return typeof value === "function" ? value() : value;
|
|
332
|
-
}
|
|
333
|
-
var TestCase = class _TestCase {
|
|
334
|
-
constructor(config) {
|
|
335
|
-
this._config = config;
|
|
336
|
-
}
|
|
337
|
-
static describe(config) {
|
|
338
|
-
const reruns = config.reruns ?? 1;
|
|
339
|
-
if (reruns < 1 || !Number.isInteger(reruns)) {
|
|
340
|
-
throw new Error(`TestCase reruns must be a positive integer, got ${reruns}`);
|
|
341
|
-
}
|
|
342
|
-
return new _TestCase({
|
|
343
|
-
name: config.name,
|
|
344
|
-
tags: config.tags,
|
|
345
|
-
reruns,
|
|
346
|
-
inputSchema: config.inputSchema,
|
|
347
|
-
input: config.input,
|
|
348
|
-
outputSchema: config.outputSchema,
|
|
349
|
-
output: config.output
|
|
350
|
-
});
|
|
351
|
-
}
|
|
352
|
-
getReruns() {
|
|
353
|
-
return this._config.reruns;
|
|
354
|
-
}
|
|
355
|
-
getName() {
|
|
356
|
-
return this._config.name;
|
|
357
|
-
}
|
|
358
|
-
getTags() {
|
|
359
|
-
return this._config.tags;
|
|
360
|
-
}
|
|
361
|
-
getInputSchema() {
|
|
362
|
-
return this._config.inputSchema;
|
|
363
|
-
}
|
|
364
|
-
getInput() {
|
|
365
|
-
return resolve(this._config.input);
|
|
366
|
-
}
|
|
367
|
-
getOutputSchema() {
|
|
368
|
-
return this._config.outputSchema;
|
|
369
|
-
}
|
|
370
|
-
getOutput() {
|
|
371
|
-
if (this._config.output === void 0) {
|
|
372
|
-
return void 0;
|
|
373
|
-
}
|
|
374
|
-
return resolve(this._config.output);
|
|
375
|
-
}
|
|
376
|
-
};
|
|
377
|
-
|
|
378
|
-
// src/evals/evaluator.ts
|
|
379
|
-
var Evaluator = class _Evaluator {
|
|
380
|
-
constructor(config) {
|
|
381
|
-
this._config = config;
|
|
382
|
-
}
|
|
383
|
-
getState() {
|
|
384
|
-
return {
|
|
385
|
-
name: this._config.name,
|
|
386
|
-
inputSchema: this._config.inputSchema,
|
|
387
|
-
outputSchema: this._config.outputSchema,
|
|
388
|
-
scoreSchema: this._config.scoreSchema,
|
|
389
|
-
middlewares: this._config.middlewares,
|
|
390
|
-
evaluateFn: this._config.evaluateFn,
|
|
391
|
-
passThreshold: this._config.passThreshold,
|
|
392
|
-
passCriterion: this._config.passCriterion
|
|
393
|
-
};
|
|
394
|
-
}
|
|
395
|
-
static use(middleware) {
|
|
396
|
-
return new _Evaluator({
|
|
397
|
-
middlewares: [middleware]
|
|
398
|
-
});
|
|
399
|
-
}
|
|
400
|
-
use(middleware) {
|
|
401
|
-
const state = this.getState();
|
|
402
|
-
return new _Evaluator({
|
|
403
|
-
...state,
|
|
404
|
-
middlewares: [...state.middlewares, middleware]
|
|
405
|
-
});
|
|
406
|
-
}
|
|
407
|
-
define(config) {
|
|
408
|
-
const { middlewares } = this.getState();
|
|
409
|
-
return new _Evaluator({
|
|
410
|
-
name: config.name,
|
|
411
|
-
inputSchema: config.inputSchema,
|
|
412
|
-
outputSchema: config.outputSchema,
|
|
413
|
-
scoreSchema: config.scoreSchema,
|
|
414
|
-
middlewares,
|
|
415
|
-
passThreshold: config.passThreshold,
|
|
416
|
-
passCriterion: config.passCriterion
|
|
417
|
-
});
|
|
418
|
-
}
|
|
419
|
-
evaluate(fn) {
|
|
420
|
-
return new _Evaluator({
|
|
421
|
-
...this.getState(),
|
|
422
|
-
evaluateFn: fn
|
|
423
|
-
});
|
|
424
|
-
}
|
|
425
|
-
getName() {
|
|
426
|
-
return this._config.name;
|
|
427
|
-
}
|
|
428
|
-
getInputSchema() {
|
|
429
|
-
return this._config.inputSchema;
|
|
430
|
-
}
|
|
431
|
-
getOutputSchema() {
|
|
432
|
-
return this._config.outputSchema;
|
|
433
|
-
}
|
|
434
|
-
getScoreSchema() {
|
|
435
|
-
return this._config.scoreSchema;
|
|
436
|
-
}
|
|
437
|
-
getMiddlewares() {
|
|
438
|
-
return this._config.middlewares;
|
|
439
|
-
}
|
|
440
|
-
getEvaluateFn() {
|
|
441
|
-
return this._config.evaluateFn;
|
|
442
|
-
}
|
|
443
|
-
getPassThreshold() {
|
|
444
|
-
return this._config.passThreshold;
|
|
445
|
-
}
|
|
446
|
-
getPassCriterion() {
|
|
447
|
-
return this._config.passCriterion;
|
|
448
|
-
}
|
|
449
|
-
async resolveContext() {
|
|
450
|
-
const parts = await Promise.all(
|
|
451
|
-
this._config.middlewares.map((mw) => mw.resolve())
|
|
452
|
-
);
|
|
453
|
-
return Object.assign({}, ...parts);
|
|
454
|
-
}
|
|
455
|
-
};
|
|
456
|
-
|
|
457
495
|
// src/evals/dataset.ts
|
|
458
496
|
function matchesAny(value, matchers) {
|
|
459
497
|
return matchers.some(
|
|
@@ -517,230 +555,13 @@ var Dataset = class _Dataset {
|
|
|
517
555
|
return tagMatch && pathMatch;
|
|
518
556
|
}
|
|
519
557
|
};
|
|
520
|
-
|
|
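521
|
-
// src/evals/metric.ts
|
|
522
|
-
var registry = /* @__PURE__ */ new Map();
|
|
523
|
-
var Metric = {
|
|
524
|
-
of(config) {
|
|
525
|
-
const def = {
|
|
526
|
-
id: config.id,
|
|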
527
|
-
name: config.name,
|
|
528
|
-
aggregate: config.aggregate,
|
|
529
|
-
format: config.format,
|
|
530
|
-
make: (data, options) => ({
|
|
531
|
-
id: config.id,
|
|
532
|
-
data,
|
|
533
|
-
...options?.name !== void 0 && { name: options.name }
|
|
534
|
-
})
|
|
535
|
-
};
|
|
536
|
-
registry.set(config.id, def);
|
|
537
|
-
return def;
|
|
538
|
-
}
|
|
539
|
-
};
|
|
540
|
-
function getMetricById(id) {
|
|
541
|
-
return registry.get(id);
|
|
542
|
-
}
|
|
543
|
-
|
|
544
|
-
// src/evals/score.ts
|
|
545
|
-
var registry2 = /* @__PURE__ */ new Map();
|
|
546
|
-
function formatScoreData(def, data, options) {
|
|
547
|
-
return options?.isAggregated ? def.formatAggregate(data) : def.formatValue(data);
|
|
548
|
-
}
|
|
549
|
-
var ScoreAggregate = {
|
|
550
|
-
/** Average numeric fields. Use for scores like { value, delta }. */
|
|
551
|
-
averageFields(fields) {
|
|
552
|
-
return (values) => {
|
|
553
|
-
const count = values.length || 1;
|
|
554
|
-
const result = {};
|
|
555
|
-
for (const field of fields) {
|
|
556
|
-
result[field] = values.reduce(
|
|
557
|
-
(s, v) => s + (v[field] ?? 0),
|
|
558
|
-
0
|
|
559
|
-
) / count;
|
|
560
|
-
}
|
|
561
|
-
return result;
|
|
562
|
-
};
|
|
563
|
-
},
|
|
564
|
-
/** Average selected numeric fields, with sample std dev tracked for `value`. */
|
|
565
|
-
averageWithVariance(fields) {
|
|
566
|
-
return (values) => {
|
|
567
|
-
const count = values.length;
|
|
568
|
-
const result = {};
|
|
569
|
-
for (const field of fields) {
|
|
570
|
-
result[field] = count === 0 ? 0 : values.reduce(
|
|
571
|
-
(sum, item) => sum + (item[field] ?? 0),
|
|
572
|
-
0
|
|
573
|
-
) / count;
|
|
574
|
-
}
|
|
575
|
-
const valueField = "value";
|
|
576
|
-
const hasValueField = fields.includes(valueField);
|
|
577
|
-
if (count === 0) {
|
|
578
|
-
if (hasValueField) {
|
|
579
|
-
result[valueField] = 0;
|
|
580
|
-
}
|
|
581
|
-
return {
|
|
582
|
-
...result,
|
|
583
|
-
stdDev: void 0,
|
|
584
|
-
count: 0
|
|
585
|
-
};
|
|
586
|
-
}
|
|
587
|
-
let stdDev;
|
|
588
|
-
if (hasValueField && count >= 2) {
|
|
589
|
-
const sum = values.reduce(
|
|
590
|
-
(s, v) => s + (v[valueField] ?? 0),
|
|
591
|
-
0
|
|
592
|
-
);
|
|
593
|
-
const sumSq = values.reduce(
|
|
594
|
-
(s, v) => {
|
|
595
|
-
const value = v[valueField] ?? 0;
|
|
596
|
-
return s + value * value;
|
|
597
|
-
},
|
|
598
|
-
0
|
|
599
|
-
);
|
|
600
|
-
const mean = sum / count;
|
|
601
|
-
const variance = (sumSq - count * mean * mean) / (count - 1);
|
|
602
|
-
stdDev = variance > 0 ? Math.sqrt(variance) : 0;
|
|
603
|
-
}
|
|
604
|
-
return {
|
|
605
|
-
...values[0],
|
|
606
|
-
...result,
|
|
607
|
-
stdDev,
|
|
608
|
-
count
|
|
609
|
-
};
|
|
610
|
-
};
|
|
611
|
-
},
|
|
612
|
-
/** All runs must pass. Use for binary scores. */
|
|
613
|
-
all(values) {
|
|
614
|
-
const total = values.length;
|
|
615
|
-
const passedCount = values.filter((v) => v.passed).length;
|
|
616
|
-
return {
|
|
617
|
-
...values[0],
|
|
618
|
-
passed: total > 0 && values.every((v) => v.passed),
|
|
619
|
-
passedCount,
|
|
620
|
-
totalCount: total
|
|
621
|
-
};
|
|
622
|
-
},
|
|
623
|
-
/** Take last value (no aggregation). Use when aggregation is not meaningful. */
|
|
624
|
-
last(values) {
|
|
625
|
-
return values[values.length - 1] ?? {};
|
|
626
|
-
}
|
|
627
|
-
};
|
|
628
|
-
var Score = {
|
|
629
|
-
aggregate: ScoreAggregate,
|
|
630
|
-
of(config) {
|
|
631
|
-
const def = {
|
|
632
|
-
id: config.id,
|
|
633
|
-
name: config.name,
|
|
634
|
-
displayStrategy: config.displayStrategy,
|
|
635
|
-
formatValue: config.formatValue,
|
|
636
|
-
formatAggregate: config.formatAggregate,
|
|
637
|
-
aggregateValues: config.aggregateValues,
|
|
638
|
-
make: (data, options) => {
|
|
639
|
-
const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
|
|
640
|
-
return {
|
|
641
|
-
id: config.id,
|
|
642
|
-
data,
|
|
643
|
-
...passed !== void 0 && { passed },
|
|
644
|
-
...options?.name !== void 0 && { name: options.name },
|
|
645
|
-
def
|
|
646
|
-
// Attach def so rendering/aggregation works without registry lookup
|
|
647
|
-
};
|
|
648
|
-
}
|
|
649
|
-
};
|
|
650
|
-
registry2.set(config.id, def);
|
|
651
|
-
return def;
|
|
652
|
-
}
|
|
653
|
-
};
|
|
654
|
-
function getScoreById(id) {
|
|
655
|
-
return registry2.get(id);
|
|
656
|
-
}
|
|
657
|
-
|
|
658
|
-
// src/evals/aggregators.ts
|
|
659
|
-
function aggregateTokenCountSum(values) {
|
|
660
|
-
const initial = {
|
|
661
|
-
input: 0,
|
|
662
|
-
output: 0,
|
|
663
|
-
inputCached: 0,
|
|
664
|
-
outputCached: 0
|
|
665
|
-
};
|
|
666
|
-
return values.reduce(
|
|
667
|
-
(acc, v) => ({
|
|
668
|
-
input: acc.input + (v.input ?? 0),
|
|
669
|
-
output: acc.output + (v.output ?? 0),
|
|
670
|
-
inputCached: acc.inputCached + (v.inputCached ?? 0),
|
|
671
|
-
outputCached: acc.outputCached + (v.outputCached ?? 0)
|
|
672
|
-
}),
|
|
673
|
-
initial
|
|
674
|
-
);
|
|
675
|
-
}
|
|
676
|
-
function aggregateLatencyAverage(values) {
|
|
677
|
-
if (values.length === 0) {
|
|
678
|
-
return { ms: 0 };
|
|
679
|
-
}
|
|
680
|
-
const sum = values.reduce((s, v) => s + v.ms, 0);
|
|
681
|
-
return { ms: sum / values.length };
|
|
682
|
-
}
|
|
683
|
-
|
|
684
|
-
// src/evals/metrics/standard.ts
|
|
685
|
-
var tokenCountMetric = Metric.of({
|
|
686
|
-
id: "token-count",
|
|
687
|
-
name: "Tokens",
|
|
688
|
-
aggregate: aggregateTokenCountSum,
|
|
689
|
-
format: (data, options) => {
|
|
690
|
-
const input = data.input ?? 0;
|
|
691
|
-
const output = data.output ?? 0;
|
|
692
|
-
const inputCached = data.inputCached ?? 0;
|
|
693
|
-
const outputCached = data.outputCached ?? 0;
|
|
694
|
-
const cached = inputCached + outputCached;
|
|
695
|
-
const base = `in:${input} out:${output} cached:${cached}`;
|
|
696
|
-
return options?.isAggregated ? `Total: ${base}` : base;
|
|
697
|
-
}
|
|
698
|
-
});
|
|
699
|
-
var latencyMetric = Metric.of({
|
|
700
|
-
id: "latency",
|
|
701
|
-
name: "Latency",
|
|
702
|
-
aggregate: aggregateLatencyAverage,
|
|
703
|
-
format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
|
|
704
|
-
});
|
|
705
|
-
|
|
706
|
-
// src/evals/scores/standard.ts
|
|
707
|
-
var percentScore = Score.of({
|
|
708
|
-
id: "percent",
|
|
709
|
-
name: "Score",
|
|
710
|
-
displayStrategy: "bar",
|
|
711
|
-
formatValue: (data) => data.value.toFixed(2),
|
|
712
|
-
formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
|
|
713
|
-
aggregateValues: Score.aggregate.averageWithVariance(["value"])
|
|
714
|
-
});
|
|
715
|
-
var deltaScore = Score.of({
|
|
716
|
-
id: "delta",
|
|
717
|
-
name: "Delta",
|
|
718
|
-
displayStrategy: "number",
|
|
719
|
-
formatValue: (data) => `${data.value.toFixed(2)} (${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)} vs baseline)`,
|
|
720
|
-
formatAggregate: (data) => `Avg: ${data.value.toFixed(2)} (Delta: ${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)})`,
|
|
721
|
-
aggregateValues: Score.aggregate.averageFields(["value", "delta"])
|
|
722
|
-
});
|
|
723
|
-
var binaryScore = Score.of({
|
|
724
|
-
id: "binary",
|
|
725
|
-
name: "Result",
|
|
726
|
-
displayStrategy: "passFail",
|
|
727
|
-
formatValue: (data) => data.passed ? "PASSED" : "NOT PASSED",
|
|
728
|
-
formatAggregate: (data) => {
|
|
729
|
-
const base = data.passed ? "All: PASSED" : "Some: FAILED";
|
|
730
|
-
if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
|
|
731
|
-
return `${base} (${data.passedCount}/${data.totalCount})`;
|
|
732
|
-
}
|
|
733
|
-
return base;
|
|
734
|
-
},
|
|
735
|
-
aggregateValues: Score.aggregate.all
|
|
736
|
-
});
|
|
737
|
-
function preprocessForDiff(value, options) {
|
|
738
|
-
if (options?.sort && Array.isArray(value)) {
|
|
739
|
-
return [...value].sort((a, b) => {
|
|
740
|
-
const aStr = stringify__default.default(preprocessForDiff(a, options));
|
|
741
|
-
const bStr = stringify__default.default(preprocessForDiff(b, options));
|
|
742
|
-
return aStr.localeCompare(bStr);
|
|
743
|
-
}).map((item) => preprocessForDiff(item, options));
|
|
558
|
+
function preprocessForDiff(value, options) {
|
|
559
|
+
if (options?.sort && Array.isArray(value)) {
|
|
560
|
+
return [...value].sort((a, b) => {
|
|
561
|
+
const aStr = stringify__default.default(preprocessForDiff(a, options));
|
|
562
|
+
const bStr = stringify__default.default(preprocessForDiff(b, options));
|
|
563
|
+
return aStr.localeCompare(bStr);
|
|
564
|
+
}).map((item) => preprocessForDiff(item, options));
|
|
744
565
|
}
|
|
745
566
|
if (value !== null && typeof value === "object" && !Array.isArray(value) && options?.excludeKeys) {
|
|
746
567
|
const keys = Array.isArray(options.excludeKeys) ? options.excludeKeys : options.excludeKeys.split(",").map((k) => k.trim());
|
|
@@ -791,16 +612,8 @@ function createDiffString(expected, actual, diffOptions) {
|
|
|
791
612
|
const expectedProcessed = preprocessForDiff(expected, diffOptions);
|
|
792
613
|
const actualProcessed = preprocessForDiff(actual, diffOptions);
|
|
793
614
|
if (diffOptions?.keysOnly) {
|
|
794
|
-
const expectedKeys = JSON.stringify(
|
|
795
|
-
extractKeys(expectedProcessed),
|
|
796
|
-
null,
|
|
797
|
-
2
|
|
798
|
-
);
|
|
799
|
-
const actualKeys = JSON.stringify(
|
|
800
|
-
extractKeys(actualProcessed),
|
|
801
|
-
null,
|
|
802
|
-
2
|
|
803
|
-
);
|
|
615
|
+
const expectedKeys = JSON.stringify(extractKeys(expectedProcessed), null, 2);
|
|
616
|
+
const actualKeys = JSON.stringify(extractKeys(actualProcessed), null, 2);
|
|
804
617
|
const parts2 = diff.diffLines(expectedKeys, actualKeys);
|
|
805
618
|
return formatDiffParts(parts2);
|
|
806
619
|
}
|
|
@@ -811,9 +624,7 @@ function createDiffString(expected, actual, diffOptions) {
|
|
|
811
624
|
}
|
|
812
625
|
const parts = diff.diffLines(expectedStr, actualStr);
|
|
813
626
|
if (diffOptions?.outputNewOnly) {
|
|
814
|
-
const filtered = parts.filter(
|
|
815
|
-
(p) => p.added === true
|
|
816
|
-
);
|
|
627
|
+
const filtered = parts.filter((p) => p.added === true);
|
|
817
628
|
return formatDiffParts(filtered);
|
|
818
629
|
}
|
|
819
630
|
return formatDiffParts(parts);
|
|
@@ -878,14 +689,476 @@ function printJsonDiff(expected, actual, options = {}) {
|
|
|
878
689
|
if (trimmed.startsWith("+") && !trimmed.startsWith("+++")) {
|
|
879
690
|
return `\x1B[32m${line}\x1B[0m`;
|
|
880
691
|
}
|
|
881
|
-
return line;
|
|
882
|
-
});
|
|
883
|
-
const colored = lines.join("\n");
|
|
884
|
-
console.log(colored || "(no differences)");
|
|
885
|
-
return colored;
|
|
692
|
+
return line;
|
|
693
|
+
});
|
|
694
|
+
const colored = lines.join("\n");
|
|
695
|
+
console.log(colored || "(no differences)");
|
|
696
|
+
return colored;
|
|
697
|
+
}
|
|
698
|
+
console.log(diff || "(no differences)");
|
|
699
|
+
return diff;
|
|
700
|
+
}
|
|
701
|
+
|
|
702
|
+
// src/evals/metric.ts
|
|
703
|
+
var registry = /* @__PURE__ */ new Map();
|
|
704
|
+
var Metric = {
|
|
705
|
+
of(config) {
|
|
706
|
+
const def = {
|
|
707
|
+
id: config.id,
|
|
708
|
+
name: config.name,
|
|
709
|
+
aggregate: config.aggregate,
|
|
710
|
+
format: config.format,
|
|
711
|
+
make: (data, options) => ({
|
|
712
|
+
id: config.id,
|
|
713
|
+
data,
|
|
714
|
+
...options?.name !== void 0 && { name: options.name }
|
|
715
|
+
})
|
|
716
|
+
};
|
|
717
|
+
registry.set(config.id, def);
|
|
718
|
+
return def;
|
|
719
|
+
}
|
|
720
|
+
};
|
|
721
|
+
function getMetricById(id) {
|
|
722
|
+
return registry.get(id);
|
|
723
|
+
}
|
|
724
|
+
|
|
725
|
+
// src/evals/aggregators.ts
|
|
726
|
+
function aggregateTokenCountSum(values) {
|
|
727
|
+
const initial = {
|
|
728
|
+
input: 0,
|
|
729
|
+
output: 0,
|
|
730
|
+
inputCached: 0,
|
|
731
|
+
outputCached: 0
|
|
732
|
+
};
|
|
733
|
+
return values.reduce(
|
|
734
|
+
(acc, v) => ({
|
|
735
|
+
input: acc.input + (v.input ?? 0),
|
|
736
|
+
output: acc.output + (v.output ?? 0),
|
|
737
|
+
inputCached: acc.inputCached + (v.inputCached ?? 0),
|
|
738
|
+
outputCached: acc.outputCached + (v.outputCached ?? 0)
|
|
739
|
+
}),
|
|
740
|
+
initial
|
|
741
|
+
);
|
|
742
|
+
}
|
|
743
|
+
function aggregateLatencyAverage(values) {
|
|
744
|
+
if (values.length === 0) {
|
|
745
|
+
return { ms: 0 };
|
|
746
|
+
}
|
|
747
|
+
const sum = values.reduce((s, v) => s + v.ms, 0);
|
|
748
|
+
return { ms: sum / values.length };
|
|
749
|
+
}
|
|
750
|
+
|
|
751
|
+
// src/evals/metrics/standard.ts
|
|
752
|
+
var tokenCountMetric = Metric.of({
|
|
753
|
+
id: "token-count",
|
|
754
|
+
name: "Tokens",
|
|
755
|
+
aggregate: aggregateTokenCountSum,
|
|
756
|
+
format: (data, options) => {
|
|
757
|
+
const input = data.input ?? 0;
|
|
758
|
+
const output = data.output ?? 0;
|
|
759
|
+
const inputCached = data.inputCached ?? 0;
|
|
760
|
+
const outputCached = data.outputCached ?? 0;
|
|
761
|
+
const cached = inputCached + outputCached;
|
|
762
|
+
const base = `in:${input} out:${output} cached:${cached}`;
|
|
763
|
+
return options?.isAggregated ? `Total: ${base}` : base;
|
|
764
|
+
}
|
|
765
|
+
});
|
|
766
|
+
var latencyMetric = Metric.of({
|
|
767
|
+
id: "latency",
|
|
768
|
+
name: "Latency",
|
|
769
|
+
aggregate: aggregateLatencyAverage,
|
|
770
|
+
format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
|
|
771
|
+
});
|
|
772
|
+
|
|
773
|
+
// src/evals/run-config.ts
|
|
774
|
+
function validateRow(row, index) {
|
|
775
|
+
const hasEvaluators = "evaluators" in row && row.evaluators !== void 0 && row.evaluators !== void 0;
|
|
776
|
+
const hasPattern = "evaluatorPattern" in row && typeof row.evaluatorPattern === "string" && row.evaluatorPattern.trim().length > 0;
|
|
777
|
+
if (hasEvaluators && hasPattern) {
|
|
778
|
+
throw new Error(`RunConfig run[${index}] must not set both evaluators and evaluatorPattern`);
|
|
779
|
+
}
|
|
780
|
+
if (!hasEvaluators && !hasPattern) {
|
|
781
|
+
throw new Error(`RunConfig run[${index}] must set either evaluators or evaluatorPattern`);
|
|
782
|
+
}
|
|
783
|
+
if (hasEvaluators && row.evaluators.length === 0) {
|
|
784
|
+
throw new Error(`RunConfig run[${index}]: evaluators must be non-empty`);
|
|
785
|
+
}
|
|
786
|
+
const rawRep = "repetitions" in row ? row.repetitions : void 0;
|
|
787
|
+
const repetitions = rawRep ?? 1;
|
|
788
|
+
if (!Number.isInteger(repetitions) || repetitions < 1) {
|
|
789
|
+
throw new Error(
|
|
790
|
+
`RunConfig run[${index}]: repetitions must be a positive integer, got ${String(rawRep)}`
|
|
791
|
+
);
|
|
792
|
+
}
|
|
793
|
+
}
|
|
794
|
+
var RunConfig = class _RunConfig {
|
|
795
|
+
constructor(name, displayName, tags, runs) {
|
|
796
|
+
this._name = name;
|
|
797
|
+
this._displayName = displayName;
|
|
798
|
+
this._tags = tags;
|
|
799
|
+
this._runs = runs;
|
|
800
|
+
}
|
|
801
|
+
static define(config) {
|
|
802
|
+
if (config.runs.length === 0) {
|
|
803
|
+
throw new Error("RunConfig runs must be non-empty");
|
|
804
|
+
}
|
|
805
|
+
config.runs.forEach(validateRow);
|
|
806
|
+
const name = validateRunConfigName(config.name, "RunConfig.define");
|
|
807
|
+
const displayName = normalizeOptionalDisplayName(config.displayName);
|
|
808
|
+
const tags = config.tags !== void 0 ? [...config.tags] : [];
|
|
809
|
+
return new _RunConfig(name, displayName, tags, config.runs);
|
|
810
|
+
}
|
|
811
|
+
/** Canonical id (branded {@link RunConfigName} at runtime; typed as `string` for ergonomics). */
|
|
812
|
+
getName() {
|
|
813
|
+
return this._name;
|
|
814
|
+
}
|
|
815
|
+
/** Optional unrestricted display label. */
|
|
816
|
+
getDisplayName() {
|
|
817
|
+
return this._displayName;
|
|
818
|
+
}
|
|
819
|
+
/** Label for CLI/TUI: {@link getDisplayName} if set, otherwise {@link getName}. */
|
|
820
|
+
getDisplayLabel() {
|
|
821
|
+
return this._displayName ?? this._name;
|
|
822
|
+
}
|
|
823
|
+
/** Tags from `RunConfig.define({ tags })`; surfaced as `runConfigTags` on evaluator callbacks. */
|
|
824
|
+
getTags() {
|
|
825
|
+
return [...this._tags];
|
|
826
|
+
}
|
|
827
|
+
getRuns() {
|
|
828
|
+
return this._runs;
|
|
829
|
+
}
|
|
830
|
+
};
|
|
831
|
+
|
|
832
|
+
// src/evals/score.ts
|
|
833
|
+
var registry2 = /* @__PURE__ */ new Map();
|
|
834
|
+
function formatScoreData(def, data, options) {
|
|
835
|
+
return options?.isAggregated ? def.formatAggregate(data) : def.formatValue(data);
|
|
836
|
+
}
|
|
837
|
+
/** Built-in aggregation strategies that combine the score data of several runs into one value. */
var ScoreAggregate = {
  /**
   * Average numeric fields. Use for scores like { value, delta }.
   *
   * @param fields Names of the fields to average; a missing/nullish field counts as 0.
   * @returns Aggregator mapping an array of score data to `{ [field]: mean }`.
   *   For an empty array every listed field is 0 (the divisor falls back to 1).
   */
  averageFields(fields) {
    return (values) => {
      const count = values.length || 1;
      const result = {};
      for (const field of fields) {
        result[field] = values.reduce((s, v) => s + (v[field] ?? 0), 0) / count;
      }
      return result;
    };
  },
  /**
   * Average selected numeric fields, with sample std dev tracked for `value`.
   *
   * @param fields Names of the fields to average; a missing/nullish field counts as 0.
   * @returns Aggregator producing `{ ...values[0], ...means, stdDev, count }`.
   *   `stdDev` is only computed when `fields` contains "value" and there are at
   *   least two items; otherwise it is `undefined`. For an empty array the
   *   result is the zeroed means plus `stdDev: undefined` and `count: 0`.
   */
  averageWithVariance(fields) {
    return (values) => {
      const count = values.length;
      const result = {};
      for (const field of fields) {
        result[field] = count === 0 ? 0 : values.reduce((sum, item) => sum + (item[field] ?? 0), 0) / count;
      }
      if (count === 0) {
        // Every requested field (including "value") is already 0 from the loop above.
        return {
          ...result,
          stdDev: void 0,
          count: 0
        };
      }
      const valueField = "value";
      let stdDev;
      if (fields.includes(valueField) && count >= 2) {
        // Two-pass sample variance: summing squared deviations from the mean
        // avoids the catastrophic cancellation of `sumSq - n * mean^2` when the
        // values are large relative to their spread.
        const mean = values.reduce((s, v) => s + (v[valueField] ?? 0), 0) / count;
        const sumSquaredDeviations = values.reduce((s, v) => {
          const deviation = (v[valueField] ?? 0) - mean;
          return s + deviation * deviation;
        }, 0);
        const variance = sumSquaredDeviations / (count - 1);
        // The comparison also maps a NaN variance (non-finite inputs) to 0.
        stdDev = variance > 0 ? Math.sqrt(variance) : 0;
      }
      return {
        // Carry over non-aggregated fields from the first item; averaged fields override them.
        ...values[0],
        ...result,
        stdDev,
        count
      };
    };
  },
  /**
   * All runs must pass. Use for binary scores.
   *
   * @param values Array of score data carrying a `passed` flag.
   * @returns `{ ...values[0], passed, passedCount, totalCount }`; `passed` is
   *   false for an empty array.
   */
  all(values) {
    const total = values.length;
    const passedCount = values.filter((v) => v.passed).length;
    return {
      ...values[0],
      passed: total > 0 && values.every((v) => v.passed),
      passedCount,
      totalCount: total
    };
  },
  /**
   * Take last value (no aggregation). Use when aggregation is not meaningful.
   *
   * @param values Array of score data.
   * @returns The final element, or `{}` when the array is empty (or its last entry is nullish).
   */
  last(values) {
    return values[values.length - 1] ?? {};
  }
};
|
|
910
|
+
/** Factory namespace for score definitions. */
var Score = {
  aggregate: ScoreAggregate,
  /**
   * Create a score definition from `config` and register it under `config.id`.
   *
   * @param config Object supplying `id`, `name`, `displayStrategy`,
   *   `formatValue`, `formatAggregate` and `aggregateValues`.
   * @returns The definition, which additionally exposes `make(data, options)`.
   */
  of(config) {
    const def = {
      id: config.id,
      name: config.name,
      displayStrategy: config.displayStrategy,
      formatValue: config.formatValue,
      formatAggregate: config.formatAggregate,
      aggregateValues: config.aggregateValues,
      /**
       * Build a score item for `data`.
       * `options.definePassed(data)` (when given) supplies `passed`, and
       * `options.name` (when given) supplies `name`; both keys are omitted
       * when their value is `undefined`.
       */
      make(data, options) {
        const definePassed = options?.definePassed;
        const passed = definePassed !== void 0 ? options.definePassed(data) : void 0;
        const item = { id: config.id, data };
        if (passed !== void 0) {
          item.passed = passed;
        }
        if (options?.name !== void 0) {
          item.name = options.name;
        }
        // Attach def so rendering/aggregation works without registry lookup
        item.def = def;
        return item;
      }
    };
    registry2.set(config.id, def);
    return def;
  }
};
|
|
936
|
+
/**
 * Look up a score definition in the module registry.
 *
 * @param id Score id used as the registry key.
 * @returns The registered definition, or `undefined` when no entry exists for `id`.
 */
function getScoreById(id) {
  const definition = registry2.get(id);
  return definition;
}
|
|
939
|
+
|
|
940
|
+
// src/evals/scores/standard.ts
|
|
941
|
+
/** Standard "percent" score: shown as a bar, aggregated as the mean of `value` with its sample std dev. */
var percentScore = Score.of({
  id: "percent",
  name: "Score",
  displayStrategy: "bar",
  formatValue: (data) => data.value.toFixed(2),
  formatAggregate: (data) => {
    // Append the spread only when a std dev is available (null/undefined means "not computed").
    if (data.stdDev != null) {
      return `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}`;
    }
    return `Avg: ${data.value.toFixed(2)}`;
  },
  aggregateValues: Score.aggregate.averageWithVariance(["value"])
});
|
|
949
|
+
/** Standard "delta" score: a value plus its signed difference, aggregated by averaging both fields. */
var deltaScore = Score.of({
  id: "delta",
  name: "Delta",
  displayStrategy: "number",
  formatValue: (data) => {
    const value = data.value.toFixed(2);
    // Non-negative deltas get an explicit "+"; negative ones already carry their sign.
    const sign = data.delta >= 0 ? "+" : "";
    const delta = data.delta.toFixed(2);
    return `${value} (${sign}${delta} vs baseline)`;
  },
  formatAggregate: (data) => {
    const value = data.value.toFixed(2);
    const sign = data.delta >= 0 ? "+" : "";
    const delta = data.delta.toFixed(2);
    return `Avg: ${value} (Delta: ${sign}${delta})`;
  },
  aggregateValues: Score.aggregate.averageFields(["value", "delta"])
});
|
|
957
|
+
/** Standard "binary" score: pass/fail display, aggregated with `Score.aggregate.all`. */
var binaryScore = Score.of({
  id: "binary",
  name: "Result",
  displayStrategy: "passFail",
  formatValue: (data) => {
    if (data.passed) {
      return "PASSED";
    }
    return "NOT PASSED";
  },
  formatAggregate: (data) => {
    const base = data.passed ? "All: PASSED" : "Some: FAILED";
    // The "(passed/total)" suffix is only added when both counts exist and more than one item was aggregated.
    const hasCounts = data.passedCount != null && data.totalCount != null && data.totalCount > 1;
    return hasCounts ? `${base} (${data.passedCount}/${data.totalCount})` : base;
  },
  aggregateValues: Score.aggregate.all
});
|
|
971
|
+
|
|
972
|
+
// src/evals/tag-set.ts
|
|
973
|
+
/** Helper for declaring a fixed set of tag names as an object whose keys equal their values. */
var TagSet = class {
  constructor() {
  }
  /**
   * Build an object mapping every tag to itself.
   *
   * @param tags Iterable of tag strings.
   * @returns Plain object where `out[tag] === tag` for each tag supplied.
   */
  static define(tags) {
    return [...tags].reduce((out, tag) => {
      out[tag] = tag;
      return out;
    }, {});
  }
};
|
|
984
|
+
|
|
985
|
+
// src/evals/test-case.ts
|
|
986
|
+
/**
 * Unwrap a possibly-lazy value.
 *
 * @param value Either a plain value or a zero-argument function producing one.
 * @returns The result of calling `value` when it is a function, otherwise `value` itself.
 */
function resolve(value) {
  if (typeof value === "function") {
    return value();
  }
  return value;
}
|
|
989
|
+
/** A single test case: a name, optional display name, tags, and (possibly lazy) input/output values. */
var TestCase = class _TestCase {
  constructor(config) {
    this._config = config;
  }
  /**
   * Create a test case from `config`.
   * The name is passed through `validateTestCaseName` and the display name
   * through `normalizeOptionalDisplayName`; all other fields are stored as given.
   */
  static describe(config) {
    const name = validateTestCaseName(config.name, "TestCase.describe");
    const displayName = normalizeOptionalDisplayName(config.displayName);
    const normalized = {
      name,
      displayName,
      tags: config.tags,
      inputSchema: config.inputSchema,
      input: config.input,
      outputSchema: config.outputSchema,
      output: config.output
    };
    return new _TestCase(normalized);
  }
  /** Stored name. */
  getName() {
    const { name } = this._config;
    return name;
  }
  /** Stored display name (may be `undefined`). */
  getDisplayName() {
    const { displayName } = this._config;
    return displayName;
  }
  /** Display name when set (non-nullish), otherwise the name. */
  getDisplayLabel() {
    const { displayName, name } = this._config;
    return displayName ?? name;
  }
  /** Tags exactly as stored (no copy is made). */
  getTags() {
    const { tags } = this._config;
    return tags;
  }
  getInputSchema() {
    const { inputSchema } = this._config;
    return inputSchema;
  }
  /** Input value; a function-valued input is invoked on every call. */
  getInput() {
    return resolve(this._config.input);
  }
  getOutputSchema() {
    const { outputSchema } = this._config;
    return outputSchema;
  }
  /** Output value, or `undefined` when none was configured; a function-valued output is invoked on every call. */
  getOutput() {
    const { output } = this._config;
    return output === void 0 ? void 0 : resolve(output);
  }
};
|
|
1034
|
+
/**
 * Best-effort label for a test-case-like object.
 *
 * @param testCase Object that may expose `getDisplayLabel()` and/or `getName()`.
 * @returns `getDisplayLabel()` when that method exists, else `getName()` when
 *   that method exists, else the empty string.
 */
function getTestCaseDisplayLabel(testCase) {
  if (typeof testCase.getDisplayLabel === "function") {
    return testCase.getDisplayLabel();
  }
  if (typeof testCase.getName === "function") {
    return testCase.getName();
  }
  return "";
}
|
|
1040
|
+
/**
 * Tags of a test-case-like object as a fresh array.
 *
 * @param testCase Object that may expose `getTags()` returning an iterable.
 * @returns A new array with the iterated tags, or `[]` when `getTags` is not a function.
 */
function getTestCaseTagList(testCase) {
  if (typeof testCase.getTags !== "function") {
    return [];
  }
  const tags = testCase.getTags();
  return [...tags];
}
|
|
1043
|
+
/**
 * Rebuild run snapshots from the `.jsonl` artifacts in `config.artifactDirectory`.
 *
 * Best effort by design: an unreadable directory yields `[]`, and any file
 * whose parsing throws is skipped silently.
 *
 * @param config Object providing `artifactDirectory`; also forwarded to `parseArtifactToSnapshot`.
 * @returns Snapshots sorted by `queuedAt`, highest first.
 */
async function loadRunSnapshotsFromArtifacts(config) {
  const baseDir = path.resolve(config.artifactDirectory);
  let directoryEntries;
  try {
    directoryEntries = await promises.readdir(baseDir);
  } catch {
    // Directory missing or unreadable: treat as "no runs recorded".
    return [];
  }
  const snapshots = [];
  const artifactNames = directoryEntries.filter((name) => name.endsWith(".jsonl"));
  // Files are parsed one at a time, in directory-listing order.
  for (const artifactName of artifactNames) {
    const artifactFile = path.join(baseDir, artifactName);
    try {
      const parsed = await parseArtifactToSnapshot(artifactFile, config);
      if (parsed) {
        snapshots.push(parsed);
      }
    } catch {
      // Skip artifacts that cannot be read or parsed.
    }
  }
  const byQueuedAtDescending = (a, b) => b.queuedAt - a.queuedAt;
  return snapshots.sort(byQueuedAtDescending);
}
|
|
1065
|
+
/**
 * Read one JSON-lines artifact and fold its run-level events into a snapshot.
 *
 * Lines that are blank or fail to parse are ignored. When an event type occurs
 * more than once, the last occurrence wins.
 *
 * @param filePath Path of the artifact; also reported as the snapshot's `artifactPath`.
 * @param _config Unused.
 * @returns The snapshot, or `null` when the file has no non-blank lines or no "RunQueued" event.
 */
async function parseArtifactToSnapshot(filePath, _config) {
  const raw = await promises.readFile(filePath, "utf8");
  const lines = raw.split("\n").filter((line) => line.trim().length > 0);
  if (lines.length === 0) {
    return null;
  }
  let runQueued = null;
  let runCompleted = null;
  let runFailed = null;
  let runStarted = null;
  for (const line of lines) {
    try {
      const event = JSON.parse(line);
      switch (event.type) {
        case "RunQueued":
          runQueued = {
            runId: event.runId,
            datasetId: event.datasetId,
            datasetName: event.datasetName,
            evaluatorIds: event.evaluatorIds,
            totalTestCases: event.totalTestCases ?? 0,
            artifactPath: event.artifactPath ?? filePath,
            ts: event.ts
          };
          break;
        case "RunStarted":
          runStarted = { startedAt: event.startedAt };
          break;
        case "RunCompleted":
          runCompleted = {
            passedTestCases: event.passedTestCases,
            failedTestCases: event.failedTestCases,
            totalTestCases: event.totalTestCases,
            finishedAt: event.finishedAt
          };
          break;
        case "RunFailed":
          runFailed = {
            finishedAt: event.finishedAt,
            errorMessage: event.errorMessage
          };
          break;
        default:
          break;
      }
    } catch {
      // Malformed line (or a JSON value without a readable `type`): skip it.
    }
  }
  if (!runQueued) {
    return null;
  }
  // Status precedence: failed > completed > running > queued.
  let status = "queued";
  if (runFailed) {
    status = "failed";
  } else if (runCompleted) {
    status = "completed";
  } else if (runStarted) {
    status = "running";
  }
  const progress = aggregateTestCaseProgress(lines);
  // A completed run reports the queued total as completed; otherwise fall back to progress events.
  const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
  return {
    runId: runQueued.runId,
    datasetId: runQueued.datasetId,
    datasetName: runQueued.datasetName,
    evaluatorIds: runQueued.evaluatorIds,
    queuedAt: runQueued.ts ?? 0,
    startedAt: runStarted?.startedAt,
    finishedAt: runCompleted?.finishedAt ?? runFailed?.finishedAt,
    totalTestCases: runQueued.totalTestCases,
    completedTestCases,
    passedTestCases: runCompleted?.passedTestCases ?? progress.passedTestCases,
    failedTestCases: runCompleted?.failedTestCases ?? progress.failedTestCases,
    status,
    artifactPath: filePath,
    errorMessage: runFailed?.errorMessage
  };
}
|
|
1136
|
+
/**
 * Summarise "TestCaseProgress" events found in raw JSON lines.
 *
 * @param lines Array of JSON strings; unparsable lines are ignored.
 * @returns `{ completedTestCases, passedTestCases, failedTestCases }` where
 *   `completedTestCases` is the last non-nullish value reported by a progress
 *   event (0 if none), and pass/fail counts are per distinct `testCaseId`: a
 *   test case counts as passed only if every progress event for it passed.
 */
function aggregateTestCaseProgress(lines) {
  let completedTestCases = 0;
  const verdictByTestCase = /* @__PURE__ */ new Map();
  lines.forEach((line) => {
    try {
      const event = JSON.parse(line);
      if (event.type !== "TestCaseProgress") {
        return;
      }
      completedTestCases = event.completedTestCases ?? completedTestCases;
      const previous = verdictByTestCase.get(event.testCaseId);
      // First sighting stores the verdict as-is; later ones are AND-ed in.
      const merged = previous === void 0 ? event.passed : previous && event.passed;
      verdictByTestCase.set(event.testCaseId, merged);
    } catch {
      // Not valid JSON (or not an object with a readable `type`): skip.
    }
  });
  const verdicts = [...verdictByTestCase.values()];
  const passedTestCases = verdicts.filter(Boolean).length;
  const failedTestCases = verdicts.length - passedTestCases;
  return { completedTestCases, passedTestCases, failedTestCases };
}
|
|
890
1163
|
|
|
891
1164
|
// src/runner/config.ts
|
|
@@ -896,18 +1169,9 @@ var defaultRunnerConfig = {
|
|
|
896
1169
|
discovery: {
|
|
897
1170
|
rootDir: process.cwd(),
|
|
898
1171
|
datasetSuffixes: [".dataset.ts", ".dataset.tsx", ".dataset.js", ".dataset.mjs"],
|
|
899
|
-
evaluatorSuffixes: [
|
|
900
|
-
|
|
901
|
-
|
|
902
|
-
".evaluator.js",
|
|
903
|
-
".evaluator.mjs"
|
|
904
|
-
],
|
|
905
|
-
testCaseSuffixes: [
|
|
906
|
-
".test-case.ts",
|
|
907
|
-
".test-case.tsx",
|
|
908
|
-
".test-case.js",
|
|
909
|
-
".test-case.mjs"
|
|
910
|
-
],
|
|
1172
|
+
evaluatorSuffixes: [".evaluator.ts", ".evaluator.tsx", ".evaluator.js", ".evaluator.mjs"],
|
|
1173
|
+
runConfigSuffixes: [".run-config.ts", ".run-config.tsx", ".run-config.js", ".run-config.mjs"],
|
|
1174
|
+
testCaseSuffixes: [".test-case.ts", ".test-case.tsx", ".test-case.js", ".test-case.mjs"],
|
|
911
1175
|
excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
|
|
912
1176
|
},
|
|
913
1177
|
artifactDirectory: ".eval-results",
|
|
@@ -932,6 +1196,11 @@ function toRunnerConfigOverrides(config) {
|
|
|
932
1196
|
} else if (rawDiscovery?.evaluatorSuffixes !== void 0) {
|
|
933
1197
|
discovery.evaluatorSuffixes = rawDiscovery.evaluatorSuffixes;
|
|
934
1198
|
}
|
|
1199
|
+
if (rawDiscovery?.runConfigFilePatterns !== void 0) {
|
|
1200
|
+
discovery.runConfigSuffixes = rawDiscovery.runConfigFilePatterns;
|
|
1201
|
+
} else if (rawDiscovery?.runConfigSuffixes !== void 0) {
|
|
1202
|
+
discovery.runConfigSuffixes = rawDiscovery.runConfigSuffixes;
|
|
1203
|
+
}
|
|
935
1204
|
if (rawDiscovery?.testCaseFilePatterns !== void 0) {
|
|
936
1205
|
discovery.testCaseSuffixes = rawDiscovery.testCaseFilePatterns;
|
|
937
1206
|
} else if (rawDiscovery?.testCaseSuffixes !== void 0) {
|
|
@@ -974,14 +1243,15 @@ function getJitiLoader() {
|
|
|
974
1243
|
}
|
|
975
1244
|
const createJiti2 = jitiModule__namespace.createJiti ?? jitiModule__namespace.default;
|
|
976
1245
|
if (typeof createJiti2 !== "function") {
|
|
977
|
-
throw new Error(
|
|
978
|
-
"Failed to initialize jiti for m4trix eval config loading."
|
|
979
|
-
);
|
|
1246
|
+
throw new Error("Failed to initialize jiti for m4trix eval config loading.");
|
|
980
1247
|
}
|
|
981
|
-
cachedLoader = createJiti2(
|
|
982
|
-
|
|
983
|
-
|
|
984
|
-
|
|
1248
|
+
cachedLoader = createJiti2(
|
|
1249
|
+
(typeof document === 'undefined' ? require('u' + 'rl').pathToFileURL(__filename).href : (_documentCurrentScript && _documentCurrentScript.tagName.toUpperCase() === 'SCRIPT' && _documentCurrentScript.src || new URL('out.js', document.baseURI).href)),
|
|
1250
|
+
{
|
|
1251
|
+
interopDefault: true,
|
|
1252
|
+
moduleCache: true
|
|
1253
|
+
}
|
|
1254
|
+
);
|
|
985
1255
|
return cachedLoader;
|
|
986
1256
|
}
|
|
987
1257
|
function resolveConfigModuleExport(loadedModule) {
|
|
@@ -1029,6 +1299,9 @@ function isDatasetLike(value) {
|
|
|
1029
1299
|
function isEvaluatorLike(value) {
|
|
1030
1300
|
return hasMethod(value, "getName") && hasMethod(value, "resolveContext") && hasMethod(value, "getEvaluateFn");
|
|
1031
1301
|
}
|
|
1302
|
+
/**
 * Duck-type check for run-config exports.
 *
 * @param value Any module export.
 * @returns `true` when `hasMethod` reports both `getName` and `getRuns` and
 *   `value.getRuns` is a function.
 */
function isRunConfigLike(value) {
  if (!hasMethod(value, "getName")) {
    return false;
  }
  if (!hasMethod(value, "getRuns")) {
    return false;
  }
  return typeof value.getRuns === "function";
}
|
|
1032
1305
|
function isTestCaseLike(value) {
|
|
1033
1306
|
return hasMethod(value, "getName") && hasMethod(value, "getTags") && hasMethod(value, "getInput");
|
|
1034
1307
|
}
|
|
@@ -1085,9 +1358,7 @@ async function loadModuleExports(filePath) {
|
|
|
1085
1358
|
}
|
|
1086
1359
|
async function collectDatasetsFromFiles(config) {
|
|
1087
1360
|
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
1088
|
-
const matched = files.filter(
|
|
1089
|
-
(filePath) => hasOneSuffix(filePath, config.datasetSuffixes)
|
|
1090
|
-
);
|
|
1361
|
+
const matched = files.filter((filePath) => hasOneSuffix(filePath, config.datasetSuffixes));
|
|
1091
1362
|
const found = await Promise.all(
|
|
1092
1363
|
matched.map(async (absolutePath) => {
|
|
1093
1364
|
const exports = await loadModuleExports(absolutePath);
|
|
@@ -1104,9 +1375,7 @@ async function collectDatasetsFromFiles(config) {
|
|
|
1104
1375
|
}
|
|
1105
1376
|
async function collectEvaluatorsFromFiles(config) {
|
|
1106
1377
|
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
1107
|
-
const matched = files.filter(
|
|
1108
|
-
(filePath) => hasOneSuffix(filePath, config.evaluatorSuffixes)
|
|
1109
|
-
);
|
|
1378
|
+
const matched = files.filter((filePath) => hasOneSuffix(filePath, config.evaluatorSuffixes));
|
|
1110
1379
|
const found = await Promise.all(
|
|
1111
1380
|
matched.map(async (absolutePath) => {
|
|
1112
1381
|
const exports = await loadModuleExports(absolutePath);
|
|
@@ -1121,11 +1390,26 @@ async function collectEvaluatorsFromFiles(config) {
|
|
|
1121
1390
|
);
|
|
1122
1391
|
return found.flat();
|
|
1123
1392
|
}
|
|
1124
|
-
async function
|
|
1393
|
+
/**
 * Discover run configs exported from files under `config.rootDir`.
 *
 * Files come from `walkDirectory(rootDir, excludeDirectories)` and are kept
 * when `hasOneSuffix` accepts them for `config.runConfigSuffixes`; each kept
 * file is loaded and its exports filtered with `isRunConfigLike`.
 *
 * @param config Discovery settings (`rootDir`, `excludeDirectories`, `runConfigSuffixes`).
 * @returns Flat list of `{ id, filePath, runConfig }`, where `id` is
 *   `runConfig.getName()` and `filePath` is relative to `rootDir`.
 */
async function collectRunConfigsFromFiles(config) {
  const allFiles = await walkDirectory(config.rootDir, config.excludeDirectories);
  const runConfigFiles = allFiles.filter((filePath) => hasOneSuffix(filePath, config.runConfigSuffixes));
  // Load all matching modules concurrently; result order follows the file list.
  const perFile = await Promise.all(
    runConfigFiles.map(async (absolutePath) => {
      const moduleExports = await loadModuleExports(absolutePath);
      const discovered = moduleExports.filter(isRunConfigLike);
      const filePath = path.relative(config.rootDir, absolutePath);
      return discovered.map((runConfig) => {
        const id = runConfig.getName();
        return { id, filePath, runConfig };
      });
    })
  );
  return perFile.flat();
}
|
|
1410
|
+
async function collectTestCasesFromFiles(config) {
|
|
1411
|
+
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
1412
|
+
const matched = files.filter((filePath) => hasOneSuffix(filePath, config.testCaseSuffixes));
|
|
1129
1413
|
const found = await Promise.all(
|
|
1130
1414
|
matched.map(async (absolutePath) => {
|
|
1131
1415
|
const exports = await loadModuleExports(absolutePath);
|
|
@@ -1215,15 +1499,17 @@ function readOutput(testCase) {
|
|
|
1215
1499
|
}
|
|
1216
1500
|
return candidate.getOutput();
|
|
1217
1501
|
}
|
|
1218
|
-
function buildEvaluationUnits(testCases) {
|
|
1502
|
+
function buildEvaluationUnits(testCases, repetitionCount) {
|
|
1503
|
+
const count = Math.max(1, repetitionCount);
|
|
1219
1504
|
const units = [];
|
|
1220
1505
|
for (const testCaseItem of testCases) {
|
|
1221
|
-
const
|
|
1222
|
-
for (let r = 0; r <
|
|
1506
|
+
const repetitionId = `rep-${crypto.randomUUID()}`;
|
|
1507
|
+
for (let r = 0; r < count; r++) {
|
|
1223
1508
|
units.push({
|
|
1224
1509
|
testCaseItem,
|
|
1225
|
-
|
|
1226
|
-
|
|
1510
|
+
repetitionId,
|
|
1511
|
+
repetitionIndex: r + 1,
|
|
1512
|
+
repetitionCount: count
|
|
1227
1513
|
});
|
|
1228
1514
|
}
|
|
1229
1515
|
}
|
|
@@ -1233,29 +1519,24 @@ function nowIsoForFile() {
|
|
|
1233
1519
|
return (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
1234
1520
|
}
|
|
1235
1521
|
/**
 * Build the artifact file path for a run.
 *
 * @param artifactDirectory Directory the artifact lives in.
 * @param datasetId Dataset id, first segment of the file name.
 * @param runId Run id, second segment of the file name.
 * @returns `<artifactDirectory>/<datasetId>_<runId>_<nowIsoForFile()>.jsonl`, joined with `path.join`.
 */
function createArtifactPath(artifactDirectory, datasetId, runId) {
  const fileName = `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`;
  return path.join(artifactDirectory, fileName);
}
|
|
1241
1524
|
function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
|
|
1242
|
-
const { testCaseItem,
|
|
1525
|
+
const { testCaseItem, repetitionId, repetitionIndex, repetitionCount } = unit;
|
|
1243
1526
|
return effect.Effect.gen(function* () {
|
|
1244
1527
|
const evaluatorRunId = `run-${crypto.randomUUID()}`;
|
|
1245
1528
|
const started = Date.now();
|
|
1246
|
-
const startedEvaluations = yield* effect.Ref.modify(startedRef, (n) => [
|
|
1247
|
-
n + 1,
|
|
1248
|
-
n + 1
|
|
1249
|
-
]);
|
|
1529
|
+
const startedEvaluations = yield* effect.Ref.modify(startedRef, (n) => [n + 1, n + 1]);
|
|
1250
1530
|
yield* publishEvent({
|
|
1251
1531
|
type: "TestCaseStarted",
|
|
1252
1532
|
runId: task.runId,
|
|
1253
1533
|
testCaseId: testCaseItem.id,
|
|
1254
|
-
testCaseName: testCaseItem.testCase
|
|
1534
|
+
testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
|
|
1255
1535
|
startedTestCases: startedEvaluations,
|
|
1256
1536
|
totalTestCases: totalEvaluations,
|
|
1257
|
-
|
|
1258
|
-
|
|
1537
|
+
repetitionId,
|
|
1538
|
+
repetitionIndex,
|
|
1539
|
+
repetitionCount
|
|
1259
1540
|
});
|
|
1260
1541
|
const evaluatorScores = [];
|
|
1261
1542
|
let testCaseError;
|
|
@@ -1279,9 +1560,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1279
1560
|
return error;
|
|
1280
1561
|
};
|
|
1281
1562
|
try {
|
|
1282
|
-
const ctx = yield* effect.Effect.promise(
|
|
1283
|
-
() => Promise.resolve(evaluator.resolveContext())
|
|
1284
|
-
);
|
|
1563
|
+
const ctx = yield* effect.Effect.promise(() => Promise.resolve(evaluator.resolveContext()));
|
|
1285
1564
|
const result = yield* effect.Effect.promise(
|
|
1286
1565
|
() => Promise.resolve().then(
|
|
1287
1566
|
() => evaluateFn({
|
|
@@ -1291,8 +1570,15 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1291
1570
|
meta: {
|
|
1292
1571
|
triggerId: task.triggerId,
|
|
1293
1572
|
runId: evaluatorRunId,
|
|
1294
|
-
datasetId: task.datasetId
|
|
1573
|
+
datasetId: task.datasetId,
|
|
1574
|
+
repetitionId,
|
|
1575
|
+
repetitionIndex,
|
|
1576
|
+
repetitionCount,
|
|
1577
|
+
runConfigName: task.runConfigName
|
|
1295
1578
|
},
|
|
1579
|
+
testCaseTags: getTestCaseTagList(testCaseItem.testCase),
|
|
1580
|
+
runConfigTags: task.runConfigTags,
|
|
1581
|
+
evaluatorTags: getEvaluatorTagList(evaluator),
|
|
1296
1582
|
logDiff,
|
|
1297
1583
|
log,
|
|
1298
1584
|
createError
|
|
@@ -1335,21 +1621,19 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1335
1621
|
});
|
|
1336
1622
|
}
|
|
1337
1623
|
}
|
|
1338
|
-
const
|
|
1339
|
-
const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [
|
|
1340
|
-
n + 1,
|
|
1341
|
-
n + 1
|
|
1342
|
-
]);
|
|
1624
|
+
const repetitionPassedThis = evaluatorScores.every((s) => s.passed);
|
|
1625
|
+
const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [n + 1, n + 1]);
|
|
1343
1626
|
const progressEvent = {
|
|
1344
1627
|
type: "TestCaseProgress",
|
|
1345
1628
|
runId: task.runId,
|
|
1346
1629
|
testCaseId: testCaseItem.id,
|
|
1347
|
-
testCaseName: testCaseItem.testCase
|
|
1630
|
+
testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
|
|
1348
1631
|
completedTestCases: completedEvaluations,
|
|
1349
1632
|
totalTestCases: totalEvaluations,
|
|
1350
|
-
|
|
1351
|
-
|
|
1352
|
-
|
|
1633
|
+
repetitionId,
|
|
1634
|
+
repetitionIndex,
|
|
1635
|
+
repetitionCount,
|
|
1636
|
+
passed: repetitionPassedThis,
|
|
1353
1637
|
durationMs: Date.now() - started,
|
|
1354
1638
|
evaluatorScores,
|
|
1355
1639
|
output,
|
|
@@ -1370,9 +1654,9 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1370
1654
|
(map) => {
|
|
1371
1655
|
const key = testCaseItem.id;
|
|
1372
1656
|
const existing = map.get(key) ?? { completedCount: 0, results: [] };
|
|
1373
|
-
const newResults = [...existing.results,
|
|
1657
|
+
const newResults = [...existing.results, repetitionPassedThis];
|
|
1374
1658
|
const newCompletedCount = existing.completedCount + 1;
|
|
1375
|
-
const isLast = newCompletedCount ===
|
|
1659
|
+
const isLast = newCompletedCount === repetitionCount;
|
|
1376
1660
|
const newMap = new Map(map);
|
|
1377
1661
|
newMap.set(key, {
|
|
1378
1662
|
completedCount: newCompletedCount,
|
|
@@ -1388,10 +1672,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1388
1672
|
} else {
|
|
1389
1673
|
yield* effect.Ref.update(failedRef, (n) => n + 1);
|
|
1390
1674
|
}
|
|
1391
|
-
const [passed, failed] = yield* effect.Effect.all([
|
|
1392
|
-
effect.Ref.get(passedRef),
|
|
1393
|
-
effect.Ref.get(failedRef)
|
|
1394
|
-
]);
|
|
1675
|
+
const [passed, failed] = yield* effect.Effect.all([effect.Ref.get(passedRef), effect.Ref.get(failedRef)]);
|
|
1395
1676
|
yield* updateSnapshot(task.runId, (snapshot) => ({
|
|
1396
1677
|
...snapshot,
|
|
1397
1678
|
passedTestCases: passed,
|
|
@@ -1412,10 +1693,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
1412
1693
|
runId: task.runId,
|
|
1413
1694
|
startedAt
|
|
1414
1695
|
});
|
|
1415
|
-
const totalEvaluations = task.testCases.
|
|
1416
|
-
(sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
|
|
1417
|
-
0
|
|
1418
|
-
);
|
|
1696
|
+
const totalEvaluations = task.testCases.length * Math.max(1, task.repetitions);
|
|
1419
1697
|
const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
|
|
1420
1698
|
const completedRef = yield* effect.Ref.make(0);
|
|
1421
1699
|
const startedRef = yield* effect.Ref.make(0);
|
|
@@ -1424,7 +1702,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
1424
1702
|
const testCaseResultsRef = yield* effect.Ref.make(
|
|
1425
1703
|
/* @__PURE__ */ new Map()
|
|
1426
1704
|
);
|
|
1427
|
-
const evaluationUnits = buildEvaluationUnits(task.testCases);
|
|
1705
|
+
const evaluationUnits = buildEvaluationUnits(task.testCases, task.repetitions);
|
|
1428
1706
|
const processEvaluation = (unit) => processOneEvaluation(
|
|
1429
1707
|
task,
|
|
1430
1708
|
unit,
|
|
@@ -1438,11 +1716,20 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
1438
1716
|
failedRef,
|
|
1439
1717
|
testCaseResultsRef
|
|
1440
1718
|
);
|
|
1441
|
-
|
|
1442
|
-
|
|
1443
|
-
|
|
1444
|
-
|
|
1445
|
-
|
|
1719
|
+
const globalSem = task.globalEvaluationSemaphore;
|
|
1720
|
+
if (globalSem !== void 0) {
|
|
1721
|
+
yield* effect.Effect.forEach(
|
|
1722
|
+
evaluationUnits,
|
|
1723
|
+
(unit) => globalSem.withPermits(1)(processEvaluation(unit)),
|
|
1724
|
+
{ concurrency: "unbounded", discard: true }
|
|
1725
|
+
);
|
|
1726
|
+
} else {
|
|
1727
|
+
yield* effect.Effect.forEach(
|
|
1728
|
+
evaluationUnits,
|
|
1729
|
+
processEvaluation,
|
|
1730
|
+
maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
|
|
1731
|
+
);
|
|
1732
|
+
}
|
|
1446
1733
|
const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* effect.Effect.all([
|
|
1447
1734
|
effect.Ref.get(completedRef),
|
|
1448
1735
|
effect.Ref.get(passedRef),
|
|
@@ -1478,125 +1765,34 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
1478
1765
|
artifactPath: task.snapshot.artifactPath
|
|
1479
1766
|
});
|
|
1480
1767
|
});
|
|
1481
|
-
|
|
1482
|
-
|
|
1483
|
-
|
|
1484
|
-
|
|
1485
|
-
|
|
1486
|
-
} catch {
|
|
1487
|
-
return [];
|
|
1488
|
-
}
|
|
1489
|
-
const jsonlFiles = entries.filter((name) => name.endsWith(".jsonl"));
|
|
1490
|
-
const snapshots = [];
|
|
1491
|
-
for (const fileName of jsonlFiles) {
|
|
1492
|
-
const filePath = path.join(baseDir, fileName);
|
|
1493
|
-
try {
|
|
1494
|
-
const snapshot = await parseArtifactToSnapshot(filePath, config);
|
|
1495
|
-
if (snapshot) {
|
|
1496
|
-
snapshots.push(snapshot);
|
|
1497
|
-
}
|
|
1498
|
-
} catch {
|
|
1499
|
-
}
|
|
1500
|
-
}
|
|
1501
|
-
return snapshots.sort((a, b) => b.queuedAt - a.queuedAt);
|
|
1502
|
-
}
|
|
1503
|
-
async function parseArtifactToSnapshot(filePath, _config) {
|
|
1504
|
-
const content = await promises.readFile(filePath, "utf8");
|
|
1505
|
-
const lines = content.split("\n").filter((line) => line.trim().length > 0);
|
|
1506
|
-
if (lines.length === 0) {
|
|
1507
|
-
return null;
|
|
1508
|
-
}
|
|
1509
|
-
let runQueued = null;
|
|
1510
|
-
let runCompleted = null;
|
|
1511
|
-
let runFailed = null;
|
|
1512
|
-
let runStarted = null;
|
|
1513
|
-
for (const line of lines) {
|
|
1514
|
-
try {
|
|
1515
|
-
const event = JSON.parse(line);
|
|
1516
|
-
const type = event.type;
|
|
1517
|
-
if (type === "RunQueued") {
|
|
1518
|
-
runQueued = {
|
|
1519
|
-
runId: event.runId,
|
|
1520
|
-
datasetId: event.datasetId,
|
|
1521
|
-
datasetName: event.datasetName,
|
|
1522
|
-
evaluatorIds: event.evaluatorIds,
|
|
1523
|
-
totalTestCases: event.totalTestCases ?? 0,
|
|
1524
|
-
artifactPath: event.artifactPath ?? filePath,
|
|
1525
|
-
ts: event.ts
|
|
1526
|
-
};
|
|
1527
|
-
}
|
|
1528
|
-
if (type === "RunStarted") {
|
|
1529
|
-
runStarted = { startedAt: event.startedAt };
|
|
1530
|
-
}
|
|
1531
|
-
if (type === "RunCompleted") {
|
|
1532
|
-
runCompleted = {
|
|
1533
|
-
passedTestCases: event.passedTestCases,
|
|
1534
|
-
failedTestCases: event.failedTestCases,
|
|
1535
|
-
totalTestCases: event.totalTestCases,
|
|
1536
|
-
finishedAt: event.finishedAt
|
|
1537
|
-
};
|
|
1538
|
-
}
|
|
1539
|
-
if (type === "RunFailed") {
|
|
1540
|
-
runFailed = {
|
|
1541
|
-
finishedAt: event.finishedAt,
|
|
1542
|
-
errorMessage: event.errorMessage
|
|
1543
|
-
};
|
|
1544
|
-
}
|
|
1545
|
-
} catch {
|
|
1546
|
-
}
|
|
1768
|
+
|
|
1769
|
+
// src/runner/name-pattern.ts
|
|
1770
|
+
/**
 * Split a `/source/flags` string into its parts.
 *
 * @param pattern Candidate regex-literal text.
 * @returns `{ source, flags }` when `pattern` starts with "/" and has a
 *   further "/" after the first character; otherwise `undefined`. The last "/"
 *   is treated as the closing delimiter, so slashes inside the source are kept.
 */
function parseRegexLiteral(pattern) {
  const closingSlash = pattern.startsWith("/") ? pattern.lastIndexOf("/") : -1;
  if (closingSlash <= 0) {
    // Either no leading slash, or the leading slash is the only one.
    return void 0;
  }
  const source = pattern.slice(1, closingSlash);
  const flags = pattern.slice(closingSlash + 1);
  return { source, flags };
}
|
|
1574
|
-
function
|
|
1575
|
-
|
|
1576
|
-
const
|
|
1577
|
-
|
|
1578
|
-
|
|
1579
|
-
|
|
1580
|
-
if (event.type === "TestCaseProgress") {
|
|
1581
|
-
const ev = event;
|
|
1582
|
-
completedTestCases = ev.completedTestCases ?? completedTestCases;
|
|
1583
|
-
const id = ev.testCaseId;
|
|
1584
|
-
const current = testCasePassedBy.get(id);
|
|
1585
|
-
testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
|
|
1586
|
-
}
|
|
1587
|
-
} catch {
|
|
1588
|
-
}
|
|
1783
|
+
/**
 * Compile a user-supplied name pattern into a predicate.
 *
 * Three forms are recognised after trimming, in this order:
 *  1. `/source/flags` (as recognised by `parseRegexLiteral`): used as a regular expression.
 *  2. A pattern containing `*`: case-insensitive whole-string match in which
 *     `*` stands for any run of characters and every other character,
 *     including `?`, is literal.
 *  3. Anything else: case-insensitive exact match.
 *
 * @param pattern Pattern text.
 * @returns Function `(value) => boolean`. The result depends only on `value`,
 *   even for regex literals carrying the `g` or `y` flag.
 * @throws SyntaxError from `RegExp` when a regex literal has an invalid source or flags.
 */
function createNameMatcher(pattern) {
  const normalizedPattern = pattern.trim();
  const regexLiteral = parseRegexLiteral(normalizedPattern);
  if (regexLiteral) {
    const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
    return (value) => {
      // With the g/y flags `test` resumes from `lastIndex`; reset it so the
      // shared regex gives the same answer on every call.
      regex.lastIndex = 0;
      return regex.test(value);
    };
  }
  if (normalizedPattern.includes("*")) {
    // Escape every regex metacharacter except `*` (`?` included, so it cannot
    // act as a quantifier or trigger "Nothing to repeat"), then expand `*`.
    const escaped = normalizedPattern.replace(/[.+?^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
    const regex = new RegExp(`^${escaped}$`, "i");
    return (value) => regex.test(value);
  }
  const expected = normalizedPattern.toLowerCase();
  return (value) => value.toLowerCase() === expected;
}
|
|
1601
1797
|
async function appendJsonLine(artifactPath, payload) {
|
|
1602
1798
|
await promises.mkdir(path.dirname(artifactPath), { recursive: true });
|
|
@@ -1655,32 +1851,12 @@ function searchCollectedTestCases(all, query) {
|
|
|
1655
1851
|
}
|
|
1656
1852
|
|
|
1657
1853
|
// src/runner/api.ts
|
|
1658
|
-
function
|
|
1659
|
-
|
|
1660
|
-
|
|
1661
|
-
|
|
1662
|
-
const lastSlash = pattern.lastIndexOf("/");
|
|
1663
|
-
if (lastSlash <= 0) {
|
|
1664
|
-
return void 0;
|
|
1665
|
-
}
|
|
1666
|
-
return {
|
|
1667
|
-
source: pattern.slice(1, lastSlash),
|
|
1668
|
-
flags: pattern.slice(lastSlash + 1)
|
|
1669
|
-
};
|
|
1670
|
-
}
|
|
1671
|
-
function createNameMatcher(pattern) {
|
|
1672
|
-
const normalizedPattern = pattern.trim();
|
|
1673
|
-
const regexLiteral = parseRegexLiteral(normalizedPattern);
|
|
1674
|
-
if (regexLiteral) {
|
|
1675
|
-
const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
|
|
1676
|
-
return (value) => regex.test(value);
|
|
1677
|
-
}
|
|
1678
|
-
if (normalizedPattern.includes("*")) {
|
|
1679
|
-
const escaped = normalizedPattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
|
|
1680
|
-
const regex = new RegExp(`^${escaped}$`, "i");
|
|
1681
|
-
return (value) => regex.test(value);
|
|
1854
|
+
function normalizeRunRepetitions(value) {
|
|
1855
|
+
const n = value ?? 1;
|
|
1856
|
+
if (!Number.isInteger(n) || n < 1) {
|
|
1857
|
+
throw new Error(`repetitions must be a positive integer, got ${String(value)}`);
|
|
1682
1858
|
}
|
|
1683
|
-
return
|
|
1859
|
+
return n;
|
|
1684
1860
|
}
|
|
1685
1861
|
function mergeRunnerOverrides(base, next) {
|
|
1686
1862
|
if (!base) {
|
|
@@ -1711,15 +1887,12 @@ var EffectRunner = class {
|
|
|
1711
1887
|
this.persistenceQueue = effect.Effect.runSync(
|
|
1712
1888
|
effect.Queue.unbounded()
|
|
1713
1889
|
);
|
|
1714
|
-
this.snapshotsRef = effect.Effect.runSync(
|
|
1715
|
-
effect.Ref.make(/* @__PURE__ */ new Map())
|
|
1716
|
-
);
|
|
1890
|
+
this.snapshotsRef = effect.Effect.runSync(effect.Ref.make(/* @__PURE__ */ new Map()));
|
|
1717
1891
|
this.listeners = /* @__PURE__ */ new Set();
|
|
1718
1892
|
this.datasetsById = /* @__PURE__ */ new Map();
|
|
1719
1893
|
this.evaluatorsById = /* @__PURE__ */ new Map();
|
|
1720
|
-
this.
|
|
1721
|
-
|
|
1722
|
-
);
|
|
1894
|
+
this.runConfigsById = /* @__PURE__ */ new Map();
|
|
1895
|
+
this.schedulerFiber = effect.Effect.runFork(this.createSchedulerEffect());
|
|
1723
1896
|
this.persistenceFiber = effect.Effect.runFork(
|
|
1724
1897
|
createPersistenceWorker(this.persistenceQueue)
|
|
1725
1898
|
);
|
|
@@ -1759,6 +1932,137 @@ var EffectRunner = class {
|
|
|
1759
1932
|
(item) => matcher(item.evaluator.getName() ?? "")
|
|
1760
1933
|
);
|
|
1761
1934
|
}
|
|
1935
|
+
async collectRunConfigs() {
|
|
1936
|
+
const runConfigs = await collectRunConfigsFromFiles(this.config.discovery);
|
|
1937
|
+
this.runConfigsById.clear();
|
|
1938
|
+
const byNameLower = /* @__PURE__ */ new Map();
|
|
1939
|
+
for (const item of runConfigs) {
|
|
1940
|
+
const id = item.runConfig.getName();
|
|
1941
|
+
const lower = id.toLowerCase();
|
|
1942
|
+
const prev = byNameLower.get(lower);
|
|
1943
|
+
if (prev !== void 0 && prev.filePath !== item.filePath) {
|
|
1944
|
+
throw new Error(
|
|
1945
|
+
`Duplicate RunConfig name "${id}" (matches "${prev.runConfig.getName()}" case-insensitively): ${prev.filePath} and ${item.filePath}`
|
|
1946
|
+
);
|
|
1947
|
+
}
|
|
1948
|
+
byNameLower.set(lower, item);
|
|
1949
|
+
this.runConfigsById.set(id, item);
|
|
1950
|
+
}
|
|
1951
|
+
return runConfigs;
|
|
1952
|
+
}
|
|
1953
|
+
async resolveRunConfigByName(name) {
|
|
1954
|
+
if (this.runConfigsById.size === 0) {
|
|
1955
|
+
await this.collectRunConfigs();
|
|
1956
|
+
}
|
|
1957
|
+
const key = validateRunConfigName(name, `RunConfig "${name.trim()}"`);
|
|
1958
|
+
const keyLower = key.toLowerCase();
|
|
1959
|
+
const matches = Array.from(this.runConfigsById.values()).filter(
|
|
1960
|
+
(item) => item.runConfig.getName().toLowerCase() === keyLower
|
|
1961
|
+
);
|
|
1962
|
+
if (matches.length === 0) {
|
|
1963
|
+
return void 0;
|
|
1964
|
+
}
|
|
1965
|
+
if (matches.length > 1) {
|
|
1966
|
+
throw new Error(
|
|
1967
|
+
`Multiple RunConfigs named "${name}": ${matches.map((m) => m.filePath).join(", ")}`
|
|
1968
|
+
);
|
|
1969
|
+
}
|
|
1970
|
+
return matches[0];
|
|
1971
|
+
}
|
|
1972
|
+
async expandRunConfigToJobs(collected) {
|
|
1973
|
+
if (this.datasetsById.size === 0) {
|
|
1974
|
+
await this.collectDatasets();
|
|
1975
|
+
}
|
|
1976
|
+
if (this.evaluatorsById.size === 0) {
|
|
1977
|
+
await this.collectEvaluators();
|
|
1978
|
+
}
|
|
1979
|
+
const rcName = collected.runConfig.getName();
|
|
1980
|
+
const jobs = [];
|
|
1981
|
+
const runs = collected.runConfig.getRuns();
|
|
1982
|
+
for (const [i, row] of runs.entries()) {
|
|
1983
|
+
const dsCollected = Array.from(this.datasetsById.values()).find(
|
|
1984
|
+
(d) => d.dataset === row.dataset
|
|
1985
|
+
);
|
|
1986
|
+
if (!dsCollected) {
|
|
1987
|
+
throw new Error(
|
|
1988
|
+
`RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getName()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
|
|
1989
|
+
);
|
|
1990
|
+
}
|
|
1991
|
+
let evaluatorIds;
|
|
1992
|
+
if ("evaluatorPattern" in row && typeof row.evaluatorPattern === "string") {
|
|
1993
|
+
const matcher = createNameMatcher(row.evaluatorPattern);
|
|
1994
|
+
const matched = Array.from(this.evaluatorsById.values()).filter(
|
|
1995
|
+
(item) => matcher(item.evaluator.getName() ?? "")
|
|
1996
|
+
);
|
|
1997
|
+
if (matched.length === 0) {
|
|
1998
|
+
throw new Error(
|
|
1999
|
+
`RunConfig "${rcName}" run[${i}]: no evaluator matched pattern "${row.evaluatorPattern}"`
|
|
2000
|
+
);
|
|
2001
|
+
}
|
|
2002
|
+
evaluatorIds = matched.map((item) => item.id);
|
|
2003
|
+
} else {
|
|
2004
|
+
const evaluators = row.evaluators;
|
|
2005
|
+
evaluatorIds = [];
|
|
2006
|
+
for (const ev of evaluators) {
|
|
2007
|
+
const found = Array.from(this.evaluatorsById.values()).find(
|
|
2008
|
+
(item) => item.evaluator === ev
|
|
2009
|
+
);
|
|
2010
|
+
if (!found) {
|
|
2011
|
+
throw new Error(
|
|
2012
|
+
`RunConfig "${rcName}" run[${i}]: evaluator "${getEvaluatorDisplayLabel(ev) ?? "unknown"}" was not found among discovered evaluator exports`
|
|
2013
|
+
);
|
|
2014
|
+
}
|
|
2015
|
+
evaluatorIds.push(found.id);
|
|
2016
|
+
}
|
|
2017
|
+
}
|
|
2018
|
+
const repetitions = "repetitions" in row && row.repetitions !== void 0 ? row.repetitions : 1;
|
|
2019
|
+
jobs.push({
|
|
2020
|
+
datasetId: dsCollected.id,
|
|
2021
|
+
evaluatorIds,
|
|
2022
|
+
runConfigName: rcName,
|
|
2023
|
+
runConfigDisplayLabel: collected.runConfig.getDisplayLabel(),
|
|
2024
|
+
runConfigTags: collected.runConfig.getTags(),
|
|
2025
|
+
repetitions
|
|
2026
|
+
});
|
|
2027
|
+
}
|
|
2028
|
+
return jobs;
|
|
2029
|
+
}
|
|
2030
|
+
async expandRunConfigNamesToJobs(names) {
|
|
2031
|
+
const jobs = [];
|
|
2032
|
+
for (const name of names) {
|
|
2033
|
+
const collected = await this.resolveRunConfigByName(name);
|
|
2034
|
+
if (!collected) {
|
|
2035
|
+
const known = await this.collectRunConfigs();
|
|
2036
|
+
const available = known.map((r) => r.runConfig.getName()).sort();
|
|
2037
|
+
throw new Error(
|
|
2038
|
+
available.length > 0 ? `RunConfig "${name}" not found. Available RunConfigs: ${available.join(", ")}` : `RunConfig "${name}" not found and no RunConfigs were discovered.`
|
|
2039
|
+
);
|
|
2040
|
+
}
|
|
2041
|
+
jobs.push(...await this.expandRunConfigToJobs(collected));
|
|
2042
|
+
}
|
|
2043
|
+
return jobs;
|
|
2044
|
+
}
|
|
2045
|
+
async runDatasetJobsWithSharedConcurrency(request) {
|
|
2046
|
+
const globalConcurrency = Math.max(1, request.globalConcurrency);
|
|
2047
|
+
const sem = effect.Effect.unsafeMakeSemaphore(globalConcurrency);
|
|
2048
|
+
const triggerId = request.triggerId ?? `trg-${crypto.randomUUID()}`;
|
|
2049
|
+
const snapshots = [];
|
|
2050
|
+
for (const job of request.jobs) {
|
|
2051
|
+
snapshots.push(
|
|
2052
|
+
await this.startDatasetRun({
|
|
2053
|
+
datasetId: job.datasetId,
|
|
2054
|
+
evaluatorIds: job.evaluatorIds,
|
|
2055
|
+
triggerId,
|
|
2056
|
+
maxConcurrency: this.config.maxConcurrency ?? 1,
|
|
2057
|
+
globalEvaluationSemaphore: sem,
|
|
2058
|
+
runConfigName: job.runConfigName,
|
|
2059
|
+
runConfigTags: job.runConfigTags,
|
|
2060
|
+
repetitions: job.repetitions
|
|
2061
|
+
})
|
|
2062
|
+
);
|
|
2063
|
+
}
|
|
2064
|
+
return snapshots;
|
|
2065
|
+
}
|
|
1762
2066
|
async searchTestCases(query) {
|
|
1763
2067
|
const testCases = await collectTestCasesFromFiles(this.config.discovery);
|
|
1764
2068
|
return searchCollectedTestCases(testCases, query);
|
|
@@ -1777,35 +2081,45 @@ var EffectRunner = class {
|
|
|
1777
2081
|
);
|
|
1778
2082
|
}
|
|
1779
2083
|
async runDatasetWith(request) {
|
|
2084
|
+
const runConfigName = validateRunConfigName(
|
|
2085
|
+
request.runConfigName,
|
|
2086
|
+
"runDatasetWith.runConfigName"
|
|
2087
|
+
);
|
|
2088
|
+
return this.startDatasetRun({
|
|
2089
|
+
datasetId: request.datasetId,
|
|
2090
|
+
evaluatorIds: request.evaluatorIds,
|
|
2091
|
+
triggerId: request.triggerId,
|
|
2092
|
+
maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
|
|
2093
|
+
repetitions: request.repetitions,
|
|
2094
|
+
runConfigName,
|
|
2095
|
+
runConfigTags: request.runConfigTags
|
|
2096
|
+
});
|
|
2097
|
+
}
|
|
2098
|
+
async startDatasetRun(params) {
|
|
1780
2099
|
if (this.datasetsById.size === 0) {
|
|
1781
2100
|
await this.collectDatasets();
|
|
1782
2101
|
}
|
|
1783
2102
|
if (this.evaluatorsById.size === 0) {
|
|
1784
2103
|
await this.collectEvaluators();
|
|
1785
2104
|
}
|
|
1786
|
-
const dataset = this.datasetsById.get(
|
|
2105
|
+
const dataset = this.datasetsById.get(params.datasetId);
|
|
1787
2106
|
if (!dataset) {
|
|
1788
|
-
throw new Error(`Unknown dataset: ${
|
|
2107
|
+
throw new Error(`Unknown dataset: ${params.datasetId}`);
|
|
1789
2108
|
}
|
|
1790
|
-
const selectedEvaluators =
|
|
2109
|
+
const selectedEvaluators = params.evaluatorIds.map((id) => this.evaluatorsById.get(id)).filter((value) => Boolean(value)).map((value) => ({ id: value.id, evaluator: value.evaluator }));
|
|
1791
2110
|
if (selectedEvaluators.length === 0) {
|
|
1792
2111
|
throw new Error("No evaluators selected for run");
|
|
1793
2112
|
}
|
|
1794
|
-
const selectedTestCases = await this.collectDatasetTestCases(
|
|
1795
|
-
const
|
|
1796
|
-
|
|
1797
|
-
|
|
1798
|
-
)
|
|
1799
|
-
const triggerId = request.triggerId ?? `trg-${crypto.randomUUID()}`;
|
|
2113
|
+
const selectedTestCases = await this.collectDatasetTestCases(params.datasetId);
|
|
2114
|
+
const repetitions = normalizeRunRepetitions(params.repetitions);
|
|
2115
|
+
const totalEvaluations = selectedTestCases.length * repetitions;
|
|
2116
|
+
const runConfigTags = [...params.runConfigTags ?? []];
|
|
2117
|
+
const triggerId = params.triggerId ?? `trg-${crypto.randomUUID()}`;
|
|
1800
2118
|
const runId = `run-${crypto.randomUUID()}`;
|
|
1801
|
-
const artifactPath = createArtifactPath(
|
|
1802
|
-
this.config.artifactDirectory,
|
|
1803
|
-
request.datasetId,
|
|
1804
|
-
runId
|
|
1805
|
-
);
|
|
2119
|
+
const artifactPath = createArtifactPath(this.config.artifactDirectory, params.datasetId, runId);
|
|
1806
2120
|
const snapshot = {
|
|
1807
2121
|
runId,
|
|
1808
|
-
datasetId:
|
|
2122
|
+
datasetId: params.datasetId,
|
|
1809
2123
|
datasetName: dataset.dataset.getName(),
|
|
1810
2124
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
1811
2125
|
queuedAt: Date.now(),
|
|
@@ -1826,7 +2140,7 @@ var EffectRunner = class {
|
|
|
1826
2140
|
const queuedEvent = {
|
|
1827
2141
|
type: "RunQueued",
|
|
1828
2142
|
runId,
|
|
1829
|
-
datasetId:
|
|
2143
|
+
datasetId: params.datasetId,
|
|
1830
2144
|
datasetName: dataset.dataset.getName(),
|
|
1831
2145
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
1832
2146
|
totalTestCases: totalEvaluations,
|
|
@@ -1840,17 +2154,20 @@ var EffectRunner = class {
|
|
|
1840
2154
|
payload: queuedEvent
|
|
1841
2155
|
})
|
|
1842
2156
|
);
|
|
1843
|
-
const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
|
|
1844
2157
|
await effect.Effect.runPromise(
|
|
1845
2158
|
effect.Queue.offer(this.runQueue, {
|
|
1846
2159
|
runId,
|
|
1847
2160
|
triggerId,
|
|
1848
|
-
datasetId:
|
|
2161
|
+
datasetId: params.datasetId,
|
|
1849
2162
|
dataset: dataset.dataset,
|
|
1850
2163
|
evaluators: selectedEvaluators,
|
|
1851
2164
|
testCases: selectedTestCases,
|
|
1852
2165
|
snapshot,
|
|
1853
|
-
maxConcurrency
|
|
2166
|
+
maxConcurrency: params.maxConcurrency,
|
|
2167
|
+
globalEvaluationSemaphore: params.globalEvaluationSemaphore,
|
|
2168
|
+
runConfigName: params.runConfigName,
|
|
2169
|
+
runConfigTags,
|
|
2170
|
+
repetitions
|
|
1854
2171
|
})
|
|
1855
2172
|
);
|
|
1856
2173
|
return snapshot;
|
|
@@ -1866,9 +2183,9 @@ var EffectRunner = class {
|
|
|
1866
2183
|
return effect.Effect.runSync(effect.Ref.get(this.snapshotsRef)).get(runId);
|
|
1867
2184
|
}
|
|
1868
2185
|
getAllRunSnapshots() {
|
|
1869
|
-
return Array.from(
|
|
1870
|
-
|
|
1871
|
-
)
|
|
2186
|
+
return Array.from(effect.Effect.runSync(effect.Ref.get(this.snapshotsRef)).values()).sort(
|
|
2187
|
+
(a, b) => b.queuedAt - a.queuedAt
|
|
2188
|
+
);
|
|
1872
2189
|
}
|
|
1873
2190
|
async loadRunSnapshotsFromArtifacts() {
|
|
1874
2191
|
return loadRunSnapshotsFromArtifacts(this.config);
|
|
@@ -1922,15 +2239,26 @@ var EffectRunner = class {
|
|
|
1922
2239
|
}
|
|
1923
2240
|
};
|
|
1924
2241
|
|
|
2242
|
+
// src/runner/events.ts
|
|
2243
|
+
var PROGRAMMATIC_RUN_CONFIG = {
|
|
2244
|
+
runConfigName: "programmatic"
|
|
2245
|
+
};
|
|
2246
|
+
|
|
1925
2247
|
Object.defineProperty(exports, 'S', {
|
|
1926
2248
|
enumerable: true,
|
|
1927
2249
|
get: function () { return effect.Schema; }
|
|
1928
2250
|
});
|
|
1929
2251
|
exports.Dataset = Dataset;
|
|
1930
2252
|
exports.Evaluator = Evaluator;
|
|
2253
|
+
exports.EvaluatorNameSchema = EvaluatorNameSchema;
|
|
1931
2254
|
exports.Metric = Metric;
|
|
2255
|
+
exports.PROGRAMMATIC_RUN_CONFIG = PROGRAMMATIC_RUN_CONFIG;
|
|
2256
|
+
exports.RunConfig = RunConfig;
|
|
2257
|
+
exports.RunConfigNameSchema = RunConfigNameSchema;
|
|
1932
2258
|
exports.Score = Score;
|
|
2259
|
+
exports.TagSet = TagSet;
|
|
1933
2260
|
exports.TestCase = TestCase;
|
|
2261
|
+
exports.TestCaseNameSchema = TestCaseNameSchema;
|
|
1934
2262
|
exports.binaryScore = binaryScore;
|
|
1935
2263
|
exports.createLogEntry = createLogEntry;
|
|
1936
2264
|
exports.createRunner = createRunner;
|
|
@@ -1938,16 +2266,24 @@ exports.defaultRunnerConfig = defaultRunnerConfig;
|
|
|
1938
2266
|
exports.defineConfig = defineConfig;
|
|
1939
2267
|
exports.deltaScore = deltaScore;
|
|
1940
2268
|
exports.formatScoreData = formatScoreData;
|
|
2269
|
+
exports.getEvaluatorDisplayLabel = getEvaluatorDisplayLabel;
|
|
2270
|
+
exports.getEvaluatorTagList = getEvaluatorTagList;
|
|
1941
2271
|
exports.getLogLines = getLogLines;
|
|
1942
2272
|
exports.getMetricById = getMetricById;
|
|
1943
2273
|
exports.getScoreById = getScoreById;
|
|
2274
|
+
exports.getTestCaseDisplayLabel = getTestCaseDisplayLabel;
|
|
2275
|
+
exports.getTestCaseTagList = getTestCaseTagList;
|
|
1944
2276
|
exports.latencyMetric = latencyMetric;
|
|
1945
2277
|
exports.loadMockData = loadMockData;
|
|
1946
2278
|
exports.loadRunnerData = loadRunnerData;
|
|
2279
|
+
exports.normalizeOptionalDisplayName = normalizeOptionalDisplayName;
|
|
1947
2280
|
exports.parseStartupArgs = parseStartupArgs;
|
|
1948
2281
|
exports.percentScore = percentScore;
|
|
1949
2282
|
exports.printJsonDiff = printJsonDiff;
|
|
1950
2283
|
exports.tokenCountMetric = tokenCountMetric;
|
|
2284
|
+
exports.validateEvaluatorName = validateEvaluatorName;
|
|
2285
|
+
exports.validateRunConfigName = validateRunConfigName;
|
|
2286
|
+
exports.validateTestCaseName = validateTestCaseName;
|
|
1951
2287
|
exports.withRunnerConfig = withRunnerConfig;
|
|
1952
2288
|
//# sourceMappingURL=out.js.map
|
|
1953
2289
|
//# sourceMappingURL=index.cjs.map
|