@m4trix/evals 0.25.0 → 0.26.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +29 -7
- package/dist/cli-simple.cjs +911 -643
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +898 -630
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +688 -575
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +679 -566
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +959 -623
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +201 -7
- package/dist/index.js +947 -625
- package/dist/index.js.map +1 -1
- package/package.json +3 -5
package/dist/index.js
CHANGED
|
@@ -1,14 +1,172 @@
|
|
|
1
|
-
import { Effect, PubSub, Queue, Ref, Fiber } from 'effect';
|
|
1
|
+
import { Schema, Effect, PubSub, Queue, Ref, Fiber, Either, ParseResult } from 'effect';
|
|
2
2
|
export { Schema as S } from 'effect';
|
|
3
3
|
import { diffLines } from 'diff';
|
|
4
4
|
import stringify from 'fast-json-stable-stringify';
|
|
5
5
|
import { randomUUID } from 'crypto';
|
|
6
|
+
import { readdir, readFile, mkdir, appendFile } from 'fs/promises';
|
|
7
|
+
import { resolve as resolve$1, join, relative, dirname } from 'path';
|
|
6
8
|
import { existsSync } from 'fs';
|
|
7
|
-
import { resolve as resolve$1, relative, join, dirname } from 'path';
|
|
8
9
|
import * as jitiModule from 'jiti';
|
|
9
|
-
import { readdir, readFile, mkdir, appendFile } from 'fs/promises';
|
|
10
10
|
import { pathToFileURL } from 'url';
|
|
11
11
|
|
|
12
|
+
// src/index.ts
|
|
13
|
+
var ENTITY_ID_PATTERN = /^[a-zA-Z0-9_-]+$/;
|
|
14
|
+
function makeEntityIdSchema(brand, label) {
|
|
15
|
+
return Schema.String.pipe(
|
|
16
|
+
Schema.trimmed(),
|
|
17
|
+
Schema.minLength(1, {
|
|
18
|
+
message: () => `${label} must be non-empty.`
|
|
19
|
+
}),
|
|
20
|
+
Schema.pattern(ENTITY_ID_PATTERN, {
|
|
21
|
+
message: () => `${label} may only contain letters, digits, underscores, and hyphens (no spaces). Examples: "my-nightly", "my_nightly", "myNightly".`
|
|
22
|
+
}),
|
|
23
|
+
Schema.brand(brand)
|
|
24
|
+
);
|
|
25
|
+
}
|
|
26
|
+
var RunConfigNameSchema = makeEntityIdSchema("RunConfigName", "RunConfig name");
|
|
27
|
+
var EvaluatorNameSchema = makeEntityIdSchema("EvaluatorName", "Evaluator name");
|
|
28
|
+
var TestCaseNameSchema = makeEntityIdSchema("TestCaseName", "Test case name");
|
|
29
|
+
/**
 * Decodes `raw` with the given Effect schema and returns the decoded value.
 *
 * String input is trimmed before decoding. Non-string input is handed to the
 * decoder untouched so that it is rejected through the schema (and reported
 * with the `context` prefix) instead of crashing on `.trim()` with a bare
 * TypeError.
 *
 * @param schema - Effect schema passed to `Schema.decodeUnknownEither`.
 * @param raw - Candidate value; expected to be a string.
 * @param context - Prefix for the error message (the callers in this file pass
 *   an API name such as "Evaluator.define").
 * @returns The decoded value (`result.right`).
 * @throws {Error} `${context}: <formatted parse error>` when decoding fails.
 */
function validateWithSchema(schema, raw, context) {
  // Only strings can be trimmed; anything else is left for the schema to reject.
  const candidate = typeof raw === "string" ? raw.trim() : raw;
  const result = Schema.decodeUnknownEither(schema)(candidate);
  if (Either.isLeft(result)) {
    throw new Error(`${context}: ${ParseResult.TreeFormatter.formatErrorSync(result.left)}`);
  }
  return result.right;
}
|
|
40
|
+
function validateRunConfigName(raw, context) {
|
|
41
|
+
return validateWithSchema(RunConfigNameSchema, raw, context);
|
|
42
|
+
}
|
|
43
|
+
function validateEvaluatorName(raw, context) {
|
|
44
|
+
return validateWithSchema(EvaluatorNameSchema, raw, context);
|
|
45
|
+
}
|
|
46
|
+
function validateTestCaseName(raw, context) {
|
|
47
|
+
return validateWithSchema(TestCaseNameSchema, raw, context);
|
|
48
|
+
}
|
|
49
|
+
/**
 * Normalizes an optional display name.
 *
 * @param raw - Display name, or `undefined`/`null` when none was provided.
 * @returns The trimmed name, or `undefined` when `raw` is absent or contains
 *   only whitespace.
 */
function normalizeOptionalDisplayName(raw) {
  // `null` is treated like `undefined`: both mean "no display name" and must
  // not reach `.trim()`.
  if (raw === void 0 || raw === null) {
    return void 0;
  }
  const trimmed = raw.trim();
  return trimmed.length === 0 ? void 0 : trimmed;
}
|
|
56
|
+
|
|
57
|
+
// src/evals/evaluator.ts
|
|
58
|
+
var Evaluator = class _Evaluator {
|
|
59
|
+
constructor(config) {
|
|
60
|
+
this._config = config;
|
|
61
|
+
}
|
|
62
|
+
getState() {
|
|
63
|
+
return {
|
|
64
|
+
name: this._config.name,
|
|
65
|
+
displayName: this._config.displayName,
|
|
66
|
+
tags: this._config.tags,
|
|
67
|
+
inputSchema: this._config.inputSchema,
|
|
68
|
+
outputSchema: this._config.outputSchema,
|
|
69
|
+
scoreSchema: this._config.scoreSchema,
|
|
70
|
+
middlewares: this._config.middlewares,
|
|
71
|
+
evaluateFn: this._config.evaluateFn,
|
|
72
|
+
passThreshold: this._config.passThreshold,
|
|
73
|
+
passCriterion: this._config.passCriterion
|
|
74
|
+
};
|
|
75
|
+
}
|
|
76
|
+
static use(middleware) {
|
|
77
|
+
return new _Evaluator({
|
|
78
|
+
middlewares: [middleware],
|
|
79
|
+
tags: []
|
|
80
|
+
});
|
|
81
|
+
}
|
|
82
|
+
use(middleware) {
|
|
83
|
+
const state = this.getState();
|
|
84
|
+
return new _Evaluator({
|
|
85
|
+
...state,
|
|
86
|
+
middlewares: [...state.middlewares, middleware]
|
|
87
|
+
});
|
|
88
|
+
}
|
|
89
|
+
define(config) {
|
|
90
|
+
const { middlewares } = this.getState();
|
|
91
|
+
const name = validateEvaluatorName(config.name, "Evaluator.define");
|
|
92
|
+
const displayName = normalizeOptionalDisplayName(config.displayName);
|
|
93
|
+
const tags = config.tags !== void 0 ? [...config.tags] : [];
|
|
94
|
+
return new _Evaluator({
|
|
95
|
+
name,
|
|
96
|
+
displayName,
|
|
97
|
+
tags,
|
|
98
|
+
inputSchema: config.inputSchema,
|
|
99
|
+
outputSchema: config.outputSchema,
|
|
100
|
+
scoreSchema: config.scoreSchema,
|
|
101
|
+
middlewares,
|
|
102
|
+
passThreshold: config.passThreshold,
|
|
103
|
+
passCriterion: config.passCriterion
|
|
104
|
+
});
|
|
105
|
+
}
|
|
106
|
+
evaluate(fn) {
|
|
107
|
+
return new _Evaluator({
|
|
108
|
+
...this.getState(),
|
|
109
|
+
evaluateFn: fn
|
|
110
|
+
});
|
|
111
|
+
}
|
|
112
|
+
/** Canonical evaluator id when defined; otherwise undefined (middleware-only chain). */
|
|
113
|
+
getName() {
|
|
114
|
+
return this._config.name;
|
|
115
|
+
}
|
|
116
|
+
getDisplayName() {
|
|
117
|
+
return this._config.displayName;
|
|
118
|
+
}
|
|
119
|
+
/** Label for CLI/TUI: {@link getDisplayName} if set, otherwise {@link getName}. Undefined if not yet defined. */
|
|
120
|
+
getDisplayLabel() {
|
|
121
|
+
const id = this._config.name;
|
|
122
|
+
if (id === void 0) {
|
|
123
|
+
return void 0;
|
|
124
|
+
}
|
|
125
|
+
return this._config.displayName ?? id;
|
|
126
|
+
}
|
|
127
|
+
/** Tags from `Evaluator.define({ tags })`; empty until defined. */
|
|
128
|
+
getTags() {
|
|
129
|
+
return [...this._config.tags];
|
|
130
|
+
}
|
|
131
|
+
getInputSchema() {
|
|
132
|
+
return this._config.inputSchema;
|
|
133
|
+
}
|
|
134
|
+
getOutputSchema() {
|
|
135
|
+
return this._config.outputSchema;
|
|
136
|
+
}
|
|
137
|
+
getScoreSchema() {
|
|
138
|
+
return this._config.scoreSchema;
|
|
139
|
+
}
|
|
140
|
+
getMiddlewares() {
|
|
141
|
+
return this._config.middlewares;
|
|
142
|
+
}
|
|
143
|
+
getEvaluateFn() {
|
|
144
|
+
return this._config.evaluateFn;
|
|
145
|
+
}
|
|
146
|
+
getPassThreshold() {
|
|
147
|
+
return this._config.passThreshold;
|
|
148
|
+
}
|
|
149
|
+
getPassCriterion() {
|
|
150
|
+
return this._config.passCriterion;
|
|
151
|
+
}
|
|
152
|
+
async resolveContext() {
|
|
153
|
+
const parts = await Promise.all(this._config.middlewares.map((mw) => mw.resolve()));
|
|
154
|
+
return Object.assign({}, ...parts);
|
|
155
|
+
}
|
|
156
|
+
};
|
|
157
|
+
function getEvaluatorDisplayLabel(evaluator) {
|
|
158
|
+
if (typeof evaluator.getDisplayLabel === "function") {
|
|
159
|
+
const label = evaluator.getDisplayLabel();
|
|
160
|
+
if (label !== void 0) {
|
|
161
|
+
return label;
|
|
162
|
+
}
|
|
163
|
+
}
|
|
164
|
+
return typeof evaluator.getName === "function" ? evaluator.getName() : void 0;
|
|
165
|
+
}
|
|
166
|
+
function getEvaluatorTagList(evaluator) {
|
|
167
|
+
return typeof evaluator.getTags === "function" ? [...evaluator.getTags()] : [];
|
|
168
|
+
}
|
|
169
|
+
|
|
12
170
|
// src/cli/data.mock.json
|
|
13
171
|
var data_mock_default = {
|
|
14
172
|
datasets: [
|
|
@@ -159,9 +317,7 @@ var data_mock_default = {
|
|
|
159
317
|
{ name: "contract_match", score: 100 },
|
|
160
318
|
{ name: "arg_validity", score: 100 }
|
|
161
319
|
],
|
|
162
|
-
checks: [
|
|
163
|
-
{ name: "tool_calls", passed: true, detail: "0 unexpected" }
|
|
164
|
-
],
|
|
320
|
+
checks: [{ name: "tool_calls", passed: true, detail: "0 unexpected" }],
|
|
165
321
|
failures: [],
|
|
166
322
|
meta: {
|
|
167
323
|
model: "gpt-4o-mini",
|
|
@@ -184,9 +340,21 @@ var data_mock_default = {
|
|
|
184
340
|
}
|
|
185
341
|
],
|
|
186
342
|
evaluators: [
|
|
187
|
-
{
|
|
188
|
-
|
|
189
|
-
|
|
343
|
+
{
|
|
344
|
+
id: "json-schema-validator",
|
|
345
|
+
name: "JSON Schema Validator",
|
|
346
|
+
configPreview: "strict=true"
|
|
347
|
+
},
|
|
348
|
+
{
|
|
349
|
+
id: "tool-call-contract-checker",
|
|
350
|
+
name: "Tool-call Contract Checker",
|
|
351
|
+
configPreview: "unexpectedCalls=error"
|
|
352
|
+
},
|
|
353
|
+
{
|
|
354
|
+
id: "rubric-judge",
|
|
355
|
+
name: "Rubric Judge (LLM)",
|
|
356
|
+
configPreview: "model=gpt-4o-mini; scale=0-100"
|
|
357
|
+
},
|
|
190
358
|
{ id: "pii-leak-detector", name: "PII Leak Detector", configPreview: "redact=false" }
|
|
191
359
|
]
|
|
192
360
|
};
|
|
@@ -253,7 +421,7 @@ function toEvalDataset(item, snapshots) {
|
|
|
253
421
|
function toEvaluatorOption(item) {
|
|
254
422
|
return {
|
|
255
423
|
id: item.id,
|
|
256
|
-
name: item.evaluator
|
|
424
|
+
name: getEvaluatorDisplayLabel(item.evaluator) ?? toSlug(item.id),
|
|
257
425
|
configPreview: `Source: ${item.filePath}`
|
|
258
426
|
};
|
|
259
427
|
}
|
|
@@ -266,9 +434,7 @@ async function loadRunnerData(runner) {
|
|
|
266
434
|
const memSnapshots = runner.getAllRunSnapshots();
|
|
267
435
|
const seen = new Set(memSnapshots.map((s) => s.runId));
|
|
268
436
|
const fromDisk = diskSnapshots.filter((s) => !seen.has(s.runId));
|
|
269
|
-
const snapshots = [...memSnapshots, ...fromDisk].sort(
|
|
270
|
-
(a, b) => b.queuedAt - a.queuedAt
|
|
271
|
-
);
|
|
437
|
+
const snapshots = [...memSnapshots, ...fromDisk].sort((a, b) => b.queuedAt - a.queuedAt);
|
|
272
438
|
if (datasets.length === 0 && evaluators.length === 0) {
|
|
273
439
|
return loadMockData();
|
|
274
440
|
}
|
|
@@ -301,134 +467,6 @@ function parseStartupArgs(argv) {
|
|
|
301
467
|
return args;
|
|
302
468
|
}
|
|
303
469
|
|
|
304
|
-
// src/evals/test-case.ts
|
|
305
|
-
function resolve(value) {
|
|
306
|
-
return typeof value === "function" ? value() : value;
|
|
307
|
-
}
|
|
308
|
-
var TestCase = class _TestCase {
|
|
309
|
-
constructor(config) {
|
|
310
|
-
this._config = config;
|
|
311
|
-
}
|
|
312
|
-
static describe(config) {
|
|
313
|
-
const reruns = config.reruns ?? 1;
|
|
314
|
-
if (reruns < 1 || !Number.isInteger(reruns)) {
|
|
315
|
-
throw new Error(`TestCase reruns must be a positive integer, got ${reruns}`);
|
|
316
|
-
}
|
|
317
|
-
return new _TestCase({
|
|
318
|
-
name: config.name,
|
|
319
|
-
tags: config.tags,
|
|
320
|
-
reruns,
|
|
321
|
-
inputSchema: config.inputSchema,
|
|
322
|
-
input: config.input,
|
|
323
|
-
outputSchema: config.outputSchema,
|
|
324
|
-
output: config.output
|
|
325
|
-
});
|
|
326
|
-
}
|
|
327
|
-
getReruns() {
|
|
328
|
-
return this._config.reruns;
|
|
329
|
-
}
|
|
330
|
-
getName() {
|
|
331
|
-
return this._config.name;
|
|
332
|
-
}
|
|
333
|
-
getTags() {
|
|
334
|
-
return this._config.tags;
|
|
335
|
-
}
|
|
336
|
-
getInputSchema() {
|
|
337
|
-
return this._config.inputSchema;
|
|
338
|
-
}
|
|
339
|
-
getInput() {
|
|
340
|
-
return resolve(this._config.input);
|
|
341
|
-
}
|
|
342
|
-
getOutputSchema() {
|
|
343
|
-
return this._config.outputSchema;
|
|
344
|
-
}
|
|
345
|
-
getOutput() {
|
|
346
|
-
if (this._config.output === void 0) {
|
|
347
|
-
return void 0;
|
|
348
|
-
}
|
|
349
|
-
return resolve(this._config.output);
|
|
350
|
-
}
|
|
351
|
-
};
|
|
352
|
-
|
|
353
|
-
// src/evals/evaluator.ts
|
|
354
|
-
var Evaluator = class _Evaluator {
|
|
355
|
-
constructor(config) {
|
|
356
|
-
this._config = config;
|
|
357
|
-
}
|
|
358
|
-
getState() {
|
|
359
|
-
return {
|
|
360
|
-
name: this._config.name,
|
|
361
|
-
inputSchema: this._config.inputSchema,
|
|
362
|
-
outputSchema: this._config.outputSchema,
|
|
363
|
-
scoreSchema: this._config.scoreSchema,
|
|
364
|
-
middlewares: this._config.middlewares,
|
|
365
|
-
evaluateFn: this._config.evaluateFn,
|
|
366
|
-
passThreshold: this._config.passThreshold,
|
|
367
|
-
passCriterion: this._config.passCriterion
|
|
368
|
-
};
|
|
369
|
-
}
|
|
370
|
-
static use(middleware) {
|
|
371
|
-
return new _Evaluator({
|
|
372
|
-
middlewares: [middleware]
|
|
373
|
-
});
|
|
374
|
-
}
|
|
375
|
-
use(middleware) {
|
|
376
|
-
const state = this.getState();
|
|
377
|
-
return new _Evaluator({
|
|
378
|
-
...state,
|
|
379
|
-
middlewares: [...state.middlewares, middleware]
|
|
380
|
-
});
|
|
381
|
-
}
|
|
382
|
-
define(config) {
|
|
383
|
-
const { middlewares } = this.getState();
|
|
384
|
-
return new _Evaluator({
|
|
385
|
-
name: config.name,
|
|
386
|
-
inputSchema: config.inputSchema,
|
|
387
|
-
outputSchema: config.outputSchema,
|
|
388
|
-
scoreSchema: config.scoreSchema,
|
|
389
|
-
middlewares,
|
|
390
|
-
passThreshold: config.passThreshold,
|
|
391
|
-
passCriterion: config.passCriterion
|
|
392
|
-
});
|
|
393
|
-
}
|
|
394
|
-
evaluate(fn) {
|
|
395
|
-
return new _Evaluator({
|
|
396
|
-
...this.getState(),
|
|
397
|
-
evaluateFn: fn
|
|
398
|
-
});
|
|
399
|
-
}
|
|
400
|
-
getName() {
|
|
401
|
-
return this._config.name;
|
|
402
|
-
}
|
|
403
|
-
getInputSchema() {
|
|
404
|
-
return this._config.inputSchema;
|
|
405
|
-
}
|
|
406
|
-
getOutputSchema() {
|
|
407
|
-
return this._config.outputSchema;
|
|
408
|
-
}
|
|
409
|
-
getScoreSchema() {
|
|
410
|
-
return this._config.scoreSchema;
|
|
411
|
-
}
|
|
412
|
-
getMiddlewares() {
|
|
413
|
-
return this._config.middlewares;
|
|
414
|
-
}
|
|
415
|
-
getEvaluateFn() {
|
|
416
|
-
return this._config.evaluateFn;
|
|
417
|
-
}
|
|
418
|
-
getPassThreshold() {
|
|
419
|
-
return this._config.passThreshold;
|
|
420
|
-
}
|
|
421
|
-
getPassCriterion() {
|
|
422
|
-
return this._config.passCriterion;
|
|
423
|
-
}
|
|
424
|
-
async resolveContext() {
|
|
425
|
-
const parts = await Promise.all(
|
|
426
|
-
this._config.middlewares.map((mw) => mw.resolve())
|
|
427
|
-
);
|
|
428
|
-
return Object.assign({}, ...parts);
|
|
429
|
-
}
|
|
430
|
-
};
|
|
431
|
-
|
|
432
470
|
// src/evals/dataset.ts
|
|
433
471
|
function matchesAny(value, matchers) {
|
|
434
472
|
return matchers.some(
|
|
@@ -492,230 +530,13 @@ var Dataset = class _Dataset {
|
|
|
492
530
|
return tagMatch && pathMatch;
|
|
493
531
|
}
|
|
494
532
|
};
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
name: config.name,
|
|
503
|
-
aggregate: config.aggregate,
|
|
504
|
-
format: config.format,
|
|
505
|
-
make: (data, options) => ({
|
|
506
|
-
id: config.id,
|
|
507
|
-
data,
|
|
508
|
-
...options?.name !== void 0 && { name: options.name }
|
|
509
|
-
})
|
|
510
|
-
};
|
|
511
|
-
registry.set(config.id, def);
|
|
512
|
-
return def;
|
|
513
|
-
}
|
|
514
|
-
};
|
|
515
|
-
function getMetricById(id) {
|
|
516
|
-
return registry.get(id);
|
|
517
|
-
}
|
|
518
|
-
|
|
519
|
-
// src/evals/score.ts
|
|
520
|
-
var registry2 = /* @__PURE__ */ new Map();
|
|
521
|
-
function formatScoreData(def, data, options) {
|
|
522
|
-
return options?.isAggregated ? def.formatAggregate(data) : def.formatValue(data);
|
|
523
|
-
}
|
|
524
|
-
var ScoreAggregate = {
|
|
525
|
-
/** Average numeric fields. Use for scores like { value, delta }. */
|
|
526
|
-
averageFields(fields) {
|
|
527
|
-
return (values) => {
|
|
528
|
-
const count = values.length || 1;
|
|
529
|
-
const result = {};
|
|
530
|
-
for (const field of fields) {
|
|
531
|
-
result[field] = values.reduce(
|
|
532
|
-
(s, v) => s + (v[field] ?? 0),
|
|
533
|
-
0
|
|
534
|
-
) / count;
|
|
535
|
-
}
|
|
536
|
-
return result;
|
|
537
|
-
};
|
|
538
|
-
},
|
|
539
|
-
/** Average selected numeric fields, with sample std dev tracked for `value`. */
|
|
540
|
-
averageWithVariance(fields) {
|
|
541
|
-
return (values) => {
|
|
542
|
-
const count = values.length;
|
|
543
|
-
const result = {};
|
|
544
|
-
for (const field of fields) {
|
|
545
|
-
result[field] = count === 0 ? 0 : values.reduce(
|
|
546
|
-
(sum, item) => sum + (item[field] ?? 0),
|
|
547
|
-
0
|
|
548
|
-
) / count;
|
|
549
|
-
}
|
|
550
|
-
const valueField = "value";
|
|
551
|
-
const hasValueField = fields.includes(valueField);
|
|
552
|
-
if (count === 0) {
|
|
553
|
-
if (hasValueField) {
|
|
554
|
-
result[valueField] = 0;
|
|
555
|
-
}
|
|
556
|
-
return {
|
|
557
|
-
...result,
|
|
558
|
-
stdDev: void 0,
|
|
559
|
-
count: 0
|
|
560
|
-
};
|
|
561
|
-
}
|
|
562
|
-
let stdDev;
|
|
563
|
-
if (hasValueField && count >= 2) {
|
|
564
|
-
const sum = values.reduce(
|
|
565
|
-
(s, v) => s + (v[valueField] ?? 0),
|
|
566
|
-
0
|
|
567
|
-
);
|
|
568
|
-
const sumSq = values.reduce(
|
|
569
|
-
(s, v) => {
|
|
570
|
-
const value = v[valueField] ?? 0;
|
|
571
|
-
return s + value * value;
|
|
572
|
-
},
|
|
573
|
-
0
|
|
574
|
-
);
|
|
575
|
-
const mean = sum / count;
|
|
576
|
-
const variance = (sumSq - count * mean * mean) / (count - 1);
|
|
577
|
-
stdDev = variance > 0 ? Math.sqrt(variance) : 0;
|
|
578
|
-
}
|
|
579
|
-
return {
|
|
580
|
-
...values[0],
|
|
581
|
-
...result,
|
|
582
|
-
stdDev,
|
|
583
|
-
count
|
|
584
|
-
};
|
|
585
|
-
};
|
|
586
|
-
},
|
|
587
|
-
/** All runs must pass. Use for binary scores. */
|
|
588
|
-
all(values) {
|
|
589
|
-
const total = values.length;
|
|
590
|
-
const passedCount = values.filter((v) => v.passed).length;
|
|
591
|
-
return {
|
|
592
|
-
...values[0],
|
|
593
|
-
passed: total > 0 && values.every((v) => v.passed),
|
|
594
|
-
passedCount,
|
|
595
|
-
totalCount: total
|
|
596
|
-
};
|
|
597
|
-
},
|
|
598
|
-
/** Take last value (no aggregation). Use when aggregation is not meaningful. */
|
|
599
|
-
last(values) {
|
|
600
|
-
return values[values.length - 1] ?? {};
|
|
601
|
-
}
|
|
602
|
-
};
|
|
603
|
-
var Score = {
|
|
604
|
-
aggregate: ScoreAggregate,
|
|
605
|
-
of(config) {
|
|
606
|
-
const def = {
|
|
607
|
-
id: config.id,
|
|
608
|
-
name: config.name,
|
|
609
|
-
displayStrategy: config.displayStrategy,
|
|
610
|
-
formatValue: config.formatValue,
|
|
611
|
-
formatAggregate: config.formatAggregate,
|
|
612
|
-
aggregateValues: config.aggregateValues,
|
|
613
|
-
make: (data, options) => {
|
|
614
|
-
const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
|
|
615
|
-
return {
|
|
616
|
-
id: config.id,
|
|
617
|
-
data,
|
|
618
|
-
...passed !== void 0 && { passed },
|
|
619
|
-
...options?.name !== void 0 && { name: options.name },
|
|
620
|
-
def
|
|
621
|
-
// Attach def so rendering/aggregation works without registry lookup
|
|
622
|
-
};
|
|
623
|
-
}
|
|
624
|
-
};
|
|
625
|
-
registry2.set(config.id, def);
|
|
626
|
-
return def;
|
|
627
|
-
}
|
|
628
|
-
};
|
|
629
|
-
function getScoreById(id) {
|
|
630
|
-
return registry2.get(id);
|
|
631
|
-
}
|
|
632
|
-
|
|
633
|
-
// src/evals/aggregators.ts
|
|
634
|
-
function aggregateTokenCountSum(values) {
|
|
635
|
-
const initial = {
|
|
636
|
-
input: 0,
|
|
637
|
-
output: 0,
|
|
638
|
-
inputCached: 0,
|
|
639
|
-
outputCached: 0
|
|
640
|
-
};
|
|
641
|
-
return values.reduce(
|
|
642
|
-
(acc, v) => ({
|
|
643
|
-
input: acc.input + (v.input ?? 0),
|
|
644
|
-
output: acc.output + (v.output ?? 0),
|
|
645
|
-
inputCached: acc.inputCached + (v.inputCached ?? 0),
|
|
646
|
-
outputCached: acc.outputCached + (v.outputCached ?? 0)
|
|
647
|
-
}),
|
|
648
|
-
initial
|
|
649
|
-
);
|
|
650
|
-
}
|
|
651
|
-
function aggregateLatencyAverage(values) {
|
|
652
|
-
if (values.length === 0) {
|
|
653
|
-
return { ms: 0 };
|
|
654
|
-
}
|
|
655
|
-
const sum = values.reduce((s, v) => s + v.ms, 0);
|
|
656
|
-
return { ms: sum / values.length };
|
|
657
|
-
}
|
|
658
|
-
|
|
659
|
-
// src/evals/metrics/standard.ts
|
|
660
|
-
var tokenCountMetric = Metric.of({
|
|
661
|
-
id: "token-count",
|
|
662
|
-
name: "Tokens",
|
|
663
|
-
aggregate: aggregateTokenCountSum,
|
|
664
|
-
format: (data, options) => {
|
|
665
|
-
const input = data.input ?? 0;
|
|
666
|
-
const output = data.output ?? 0;
|
|
667
|
-
const inputCached = data.inputCached ?? 0;
|
|
668
|
-
const outputCached = data.outputCached ?? 0;
|
|
669
|
-
const cached = inputCached + outputCached;
|
|
670
|
-
const base = `in:${input} out:${output} cached:${cached}`;
|
|
671
|
-
return options?.isAggregated ? `Total: ${base}` : base;
|
|
672
|
-
}
|
|
673
|
-
});
|
|
674
|
-
var latencyMetric = Metric.of({
|
|
675
|
-
id: "latency",
|
|
676
|
-
name: "Latency",
|
|
677
|
-
aggregate: aggregateLatencyAverage,
|
|
678
|
-
format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
|
|
679
|
-
});
|
|
680
|
-
|
|
681
|
-
// src/evals/scores/standard.ts
|
|
682
|
-
var percentScore = Score.of({
|
|
683
|
-
id: "percent",
|
|
684
|
-
name: "Score",
|
|
685
|
-
displayStrategy: "bar",
|
|
686
|
-
formatValue: (data) => data.value.toFixed(2),
|
|
687
|
-
formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
|
|
688
|
-
aggregateValues: Score.aggregate.averageWithVariance(["value"])
|
|
689
|
-
});
|
|
690
|
-
var deltaScore = Score.of({
|
|
691
|
-
id: "delta",
|
|
692
|
-
name: "Delta",
|
|
693
|
-
displayStrategy: "number",
|
|
694
|
-
formatValue: (data) => `${data.value.toFixed(2)} (${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)} vs baseline)`,
|
|
695
|
-
formatAggregate: (data) => `Avg: ${data.value.toFixed(2)} (Delta: ${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)})`,
|
|
696
|
-
aggregateValues: Score.aggregate.averageFields(["value", "delta"])
|
|
697
|
-
});
|
|
698
|
-
var binaryScore = Score.of({
|
|
699
|
-
id: "binary",
|
|
700
|
-
name: "Result",
|
|
701
|
-
displayStrategy: "passFail",
|
|
702
|
-
formatValue: (data) => data.passed ? "PASSED" : "NOT PASSED",
|
|
703
|
-
formatAggregate: (data) => {
|
|
704
|
-
const base = data.passed ? "All: PASSED" : "Some: FAILED";
|
|
705
|
-
if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
|
|
706
|
-
return `${base} (${data.passedCount}/${data.totalCount})`;
|
|
707
|
-
}
|
|
708
|
-
return base;
|
|
709
|
-
},
|
|
710
|
-
aggregateValues: Score.aggregate.all
|
|
711
|
-
});
|
|
712
|
-
function preprocessForDiff(value, options) {
|
|
713
|
-
if (options?.sort && Array.isArray(value)) {
|
|
714
|
-
return [...value].sort((a, b) => {
|
|
715
|
-
const aStr = stringify(preprocessForDiff(a, options));
|
|
716
|
-
const bStr = stringify(preprocessForDiff(b, options));
|
|
717
|
-
return aStr.localeCompare(bStr);
|
|
718
|
-
}).map((item) => preprocessForDiff(item, options));
|
|
533
|
+
function preprocessForDiff(value, options) {
|
|
534
|
+
if (options?.sort && Array.isArray(value)) {
|
|
535
|
+
return [...value].sort((a, b) => {
|
|
536
|
+
const aStr = stringify(preprocessForDiff(a, options));
|
|
537
|
+
const bStr = stringify(preprocessForDiff(b, options));
|
|
538
|
+
return aStr.localeCompare(bStr);
|
|
539
|
+
}).map((item) => preprocessForDiff(item, options));
|
|
719
540
|
}
|
|
720
541
|
if (value !== null && typeof value === "object" && !Array.isArray(value) && options?.excludeKeys) {
|
|
721
542
|
const keys = Array.isArray(options.excludeKeys) ? options.excludeKeys : options.excludeKeys.split(",").map((k) => k.trim());
|
|
@@ -766,16 +587,8 @@ function createDiffString(expected, actual, diffOptions) {
|
|
|
766
587
|
const expectedProcessed = preprocessForDiff(expected, diffOptions);
|
|
767
588
|
const actualProcessed = preprocessForDiff(actual, diffOptions);
|
|
768
589
|
if (diffOptions?.keysOnly) {
|
|
769
|
-
const expectedKeys = JSON.stringify(
|
|
770
|
-
|
|
771
|
-
null,
|
|
772
|
-
2
|
|
773
|
-
);
|
|
774
|
-
const actualKeys = JSON.stringify(
|
|
775
|
-
extractKeys(actualProcessed),
|
|
776
|
-
null,
|
|
777
|
-
2
|
|
778
|
-
);
|
|
590
|
+
const expectedKeys = JSON.stringify(extractKeys(expectedProcessed), null, 2);
|
|
591
|
+
const actualKeys = JSON.stringify(extractKeys(actualProcessed), null, 2);
|
|
779
592
|
const parts2 = diffLines(expectedKeys, actualKeys);
|
|
780
593
|
return formatDiffParts(parts2);
|
|
781
594
|
}
|
|
@@ -786,9 +599,7 @@ function createDiffString(expected, actual, diffOptions) {
|
|
|
786
599
|
}
|
|
787
600
|
const parts = diffLines(expectedStr, actualStr);
|
|
788
601
|
if (diffOptions?.outputNewOnly) {
|
|
789
|
-
const filtered = parts.filter(
|
|
790
|
-
(p) => p.added === true
|
|
791
|
-
);
|
|
602
|
+
const filtered = parts.filter((p) => p.added === true);
|
|
792
603
|
return formatDiffParts(filtered);
|
|
793
604
|
}
|
|
794
605
|
return formatDiffParts(parts);
|
|
@@ -853,14 +664,476 @@ function printJsonDiff(expected, actual, options = {}) {
|
|
|
853
664
|
if (trimmed.startsWith("+") && !trimmed.startsWith("+++")) {
|
|
854
665
|
return `\x1B[32m${line}\x1B[0m`;
|
|
855
666
|
}
|
|
856
|
-
return line;
|
|
857
|
-
});
|
|
858
|
-
const colored = lines.join("\n");
|
|
859
|
-
console.log(colored || "(no differences)");
|
|
860
|
-
return colored;
|
|
667
|
+
return line;
|
|
668
|
+
});
|
|
669
|
+
const colored = lines.join("\n");
|
|
670
|
+
console.log(colored || "(no differences)");
|
|
671
|
+
return colored;
|
|
672
|
+
}
|
|
673
|
+
console.log(diff || "(no differences)");
|
|
674
|
+
return diff;
|
|
675
|
+
}
|
|
676
|
+
|
|
677
|
+
// src/evals/metric.ts
|
|
678
|
+
var registry = /* @__PURE__ */ new Map();
|
|
679
|
+
var Metric = {
|
|
680
|
+
of(config) {
|
|
681
|
+
const def = {
|
|
682
|
+
id: config.id,
|
|
683
|
+
name: config.name,
|
|
684
|
+
aggregate: config.aggregate,
|
|
685
|
+
format: config.format,
|
|
686
|
+
make: (data, options) => ({
|
|
687
|
+
id: config.id,
|
|
688
|
+
data,
|
|
689
|
+
...options?.name !== void 0 && { name: options.name }
|
|
690
|
+
})
|
|
691
|
+
};
|
|
692
|
+
registry.set(config.id, def);
|
|
693
|
+
return def;
|
|
694
|
+
}
|
|
695
|
+
};
|
|
696
|
+
function getMetricById(id) {
|
|
697
|
+
return registry.get(id);
|
|
698
|
+
}
|
|
699
|
+
|
|
700
|
+
// src/evals/aggregators.ts
|
|
701
|
+
function aggregateTokenCountSum(values) {
|
|
702
|
+
const initial = {
|
|
703
|
+
input: 0,
|
|
704
|
+
output: 0,
|
|
705
|
+
inputCached: 0,
|
|
706
|
+
outputCached: 0
|
|
707
|
+
};
|
|
708
|
+
return values.reduce(
|
|
709
|
+
(acc, v) => ({
|
|
710
|
+
input: acc.input + (v.input ?? 0),
|
|
711
|
+
output: acc.output + (v.output ?? 0),
|
|
712
|
+
inputCached: acc.inputCached + (v.inputCached ?? 0),
|
|
713
|
+
outputCached: acc.outputCached + (v.outputCached ?? 0)
|
|
714
|
+
}),
|
|
715
|
+
initial
|
|
716
|
+
);
|
|
717
|
+
}
|
|
718
|
+
/**
 * Averages the `ms` field over a list of latency samples.
 *
 * A sample without `ms` contributes 0 (it still counts toward the divisor),
 * matching how `aggregateTokenCountSum` treats missing fields; without the
 * default a single incomplete sample would turn the whole average into NaN.
 *
 * @param values - Array of objects carrying a numeric `ms` field.
 * @returns `{ ms }` holding the arithmetic mean, or `{ ms: 0 }` for an empty array.
 */
function aggregateLatencyAverage(values) {
  if (values.length === 0) {
    return { ms: 0 };
  }
  const totalMs = values.reduce((sum, sample) => sum + (sample.ms ?? 0), 0);
  return { ms: totalMs / values.length };
}
|
|
725
|
+
|
|
726
|
+
// src/evals/metrics/standard.ts
|
|
727
|
+
var tokenCountMetric = Metric.of({
|
|
728
|
+
id: "token-count",
|
|
729
|
+
name: "Tokens",
|
|
730
|
+
aggregate: aggregateTokenCountSum,
|
|
731
|
+
format: (data, options) => {
|
|
732
|
+
const input = data.input ?? 0;
|
|
733
|
+
const output = data.output ?? 0;
|
|
734
|
+
const inputCached = data.inputCached ?? 0;
|
|
735
|
+
const outputCached = data.outputCached ?? 0;
|
|
736
|
+
const cached = inputCached + outputCached;
|
|
737
|
+
const base = `in:${input} out:${output} cached:${cached}`;
|
|
738
|
+
return options?.isAggregated ? `Total: ${base}` : base;
|
|
739
|
+
}
|
|
740
|
+
});
|
|
741
|
+
var latencyMetric = Metric.of({
|
|
742
|
+
id: "latency",
|
|
743
|
+
name: "Latency",
|
|
744
|
+
aggregate: aggregateLatencyAverage,
|
|
745
|
+
format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
|
|
746
|
+
});
|
|
747
|
+
|
|
748
|
+
// src/evals/run-config.ts
|
|
749
|
+
/**
 * Validates one entry of a RunConfig `runs` array.
 *
 * Rules enforced:
 *  - exactly one of `evaluators` / `evaluatorPattern` must be set
 *    (`evaluatorPattern` counts only when it is a non-blank string);
 *  - `evaluators`, when set, must be non-empty;
 *  - `repetitions`, when given, must be a positive integer (defaults to 1).
 *
 * @param row - Run row object to check.
 * @param index - Position of the row, used in error messages.
 * @throws {Error} Describing the first rule the row violates.
 */
function validateRow(row, index) {
  // `null` is treated the same as `undefined` ("not set"); otherwise the
  // `.length` access below would fail with a TypeError instead of a
  // descriptive RunConfig error.
  const hasEvaluators = "evaluators" in row && row.evaluators !== void 0 && row.evaluators !== null;
  const hasPattern = "evaluatorPattern" in row && typeof row.evaluatorPattern === "string" && row.evaluatorPattern.trim().length > 0;
  if (hasEvaluators && hasPattern) {
    throw new Error(`RunConfig run[${index}] must not set both evaluators and evaluatorPattern`);
  }
  if (!hasEvaluators && !hasPattern) {
    throw new Error(`RunConfig run[${index}] must set either evaluators or evaluatorPattern`);
  }
  if (hasEvaluators && row.evaluators.length === 0) {
    throw new Error(`RunConfig run[${index}]: evaluators must be non-empty`);
  }
  const rawRep = "repetitions" in row ? row.repetitions : void 0;
  const repetitions = rawRep ?? 1;
  if (!Number.isInteger(repetitions) || repetitions < 1) {
    throw new Error(
      `RunConfig run[${index}]: repetitions must be a positive integer, got ${String(rawRep)}`
    );
  }
}
|
|
769
|
+
var RunConfig = class _RunConfig {
|
|
770
|
+
constructor(name, displayName, tags, runs) {
|
|
771
|
+
this._name = name;
|
|
772
|
+
this._displayName = displayName;
|
|
773
|
+
this._tags = tags;
|
|
774
|
+
this._runs = runs;
|
|
775
|
+
}
|
|
776
|
+
static define(config) {
|
|
777
|
+
if (config.runs.length === 0) {
|
|
778
|
+
throw new Error("RunConfig runs must be non-empty");
|
|
779
|
+
}
|
|
780
|
+
config.runs.forEach(validateRow);
|
|
781
|
+
const name = validateRunConfigName(config.name, "RunConfig.define");
|
|
782
|
+
const displayName = normalizeOptionalDisplayName(config.displayName);
|
|
783
|
+
const tags = config.tags !== void 0 ? [...config.tags] : [];
|
|
784
|
+
return new _RunConfig(name, displayName, tags, config.runs);
|
|
785
|
+
}
|
|
786
|
+
/** Canonical id (branded {@link RunConfigName} at runtime; typed as `string` for ergonomics). */
|
|
787
|
+
getName() {
|
|
788
|
+
return this._name;
|
|
789
|
+
}
|
|
790
|
+
/** Optional unrestricted display label. */
|
|
791
|
+
getDisplayName() {
|
|
792
|
+
return this._displayName;
|
|
793
|
+
}
|
|
794
|
+
/** Label for CLI/TUI: {@link getDisplayName} if set, otherwise {@link getName}. */
|
|
795
|
+
getDisplayLabel() {
|
|
796
|
+
return this._displayName ?? this._name;
|
|
797
|
+
}
|
|
798
|
+
/** Tags from `RunConfig.define({ tags })`; surfaced as `runConfigTags` on evaluator callbacks. */
|
|
799
|
+
getTags() {
|
|
800
|
+
return [...this._tags];
|
|
801
|
+
}
|
|
802
|
+
getRuns() {
|
|
803
|
+
return this._runs;
|
|
804
|
+
}
|
|
805
|
+
};
|
|
806
|
+
|
|
807
|
+
// src/evals/score.ts
|
|
808
|
+
var registry2 = /* @__PURE__ */ new Map();
|
|
809
|
+
function formatScoreData(def, data, options) {
|
|
810
|
+
return options?.isAggregated ? def.formatAggregate(data) : def.formatValue(data);
|
|
811
|
+
}
|
|
812
|
+
/** Built-in strategies for combining the score data of several runs into one value. */
var ScoreAggregate = {
  /**
   * Average numeric fields. Use for scores like { value, delta }.
   *
   * @param fields - Names of the numeric fields to average.
   * @returns Aggregator mapping an array of score objects to an object that
   *   contains only the averaged fields. A missing field counts as 0; an
   *   empty array yields 0 for every field.
   */
  averageFields(fields) {
    return (values) => {
      // `|| 1` keeps the division defined for an empty array (the sum is 0 then).
      const count = values.length || 1;
      const result = {};
      for (const field of fields) {
        result[field] = values.reduce((s, v) => s + (v[field] ?? 0), 0) / count;
      }
      return result;
    };
  },
  /**
   * Average selected numeric fields, with sample std dev tracked for `value`.
   *
   * @param fields - Names of the numeric fields to average.
   * @returns Aggregator producing the first item's fields overlaid with the
   *   averages, plus `count` and `stdDev`. `stdDev` is the sample standard
   *   deviation (n - 1 divisor) of `value`; it is `undefined` unless `value`
   *   is one of `fields` and there are at least two items.
   */
  averageWithVariance(fields) {
    return (values) => {
      const count = values.length;
      const result = {};
      for (const field of fields) {
        result[field] = count === 0 ? 0 : values.reduce((sum, item) => sum + (item[field] ?? 0), 0) / count;
      }
      if (count === 0) {
        // Every requested field is already 0 from the loop above.
        return {
          ...result,
          stdDev: void 0,
          count: 0
        };
      }
      const valueField = "value";
      const hasValueField = fields.includes(valueField);
      let stdDev;
      if (hasValueField && count >= 2) {
        const mean = values.reduce((s, v) => s + (v[valueField] ?? 0), 0) / count;
        // Two-pass sum of squared deviations: the single-pass form
        // (sumSq - n * mean^2) subtracts two nearly equal large numbers and
        // loses precision when the values are large relative to their spread.
        const squaredDeviations = values.reduce((s, v) => {
          const deviation = (v[valueField] ?? 0) - mean;
          return s + deviation * deviation;
        }, 0);
        const variance = squaredDeviations / (count - 1);
        // The comparison also maps a NaN variance (non-numeric input) to 0.
        stdDev = variance > 0 ? Math.sqrt(variance) : 0;
      }
      return {
        ...values[0],
        ...result,
        stdDev,
        count
      };
    };
  },
  /**
   * All runs must pass. Use for binary scores.
   *
   * @param values - Score objects carrying a `passed` flag.
   * @returns The first item's fields with `passed` (true only when the array
   *   is non-empty and every item passed), `passedCount` and `totalCount`.
   */
  all(values) {
    const total = values.length;
    const passedCount = values.filter((v) => v.passed).length;
    return {
      ...values[0],
      passed: total > 0 && values.every((v) => v.passed),
      passedCount,
      totalCount: total
    };
  },
  /**
   * Take last value (no aggregation). Use when aggregation is not meaningful.
   *
   * @param values - Score objects.
   * @returns The last item, or an empty object when the array is empty.
   */
  last(values) {
    return values[values.length - 1] ?? {};
  }
};
|
|
885
|
+
/**
 * Factory namespace for score definitions.
 * `Score.aggregate` exposes the built-in aggregation strategies.
 */
var Score = {
  aggregate: ScoreAggregate,
  /**
   * Create a score definition and record it in the module registry under `config.id`.
   *
   * @param config - `{ id, name, displayStrategy, formatValue, formatAggregate, aggregateValues }`.
   * @returns The definition, including a `make(data, options)` factory for score items.
   */
  of(config) {
    const { id, name, displayStrategy, formatValue, formatAggregate, aggregateValues } = config;
    const def = {
      id,
      name,
      displayStrategy,
      formatValue,
      formatAggregate,
      aggregateValues,
      /**
       * Build a score item for `data`.
       * `options.definePassed(data)` (when provided) decides the `passed` flag;
       * `options.name` (when provided) is copied onto the item.
       */
      make: (data, options) => {
        let passed;
        if (options?.definePassed !== void 0) {
          passed = options.definePassed(data);
        }
        const item = { id: config.id, data };
        if (passed !== void 0) {
          item.passed = passed;
        }
        if (options?.name !== void 0) {
          item.name = options.name;
        }
        // Attach def so rendering/aggregation works without registry lookup
        item.def = def;
        return item;
      }
    };
    registry2.set(config.id, def);
    return def;
  }
};
|
|
911
|
+
/**
 * Look up a score definition in the module registry.
 *
 * @param id - Score id to look up.
 * @returns The stored definition, or `undefined` when nothing is stored under `id`.
 */
function getScoreById(id) {
  const definition = registry2.get(id);
  return definition;
}
|
|
914
|
+
|
|
915
|
+
// src/evals/scores/standard.ts
|
|
916
|
+
/**
 * Standard "percent" score: `value` rendered with two decimals, aggregated as a
 * mean with sample standard deviation.
 */
var percentScore = Score.of({
  id: "percent",
  name: "Score",
  displayStrategy: "bar",
  formatValue: (data) => {
    return data.value.toFixed(2);
  },
  formatAggregate: (data) => {
    // Only show the spread when a standard deviation is available.
    if (data.stdDev != null) {
      return `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}`;
    }
    return `Avg: ${data.value.toFixed(2)}`;
  },
  aggregateValues: Score.aggregate.averageWithVariance(["value"])
});
|
|
924
|
+
/**
 * Standard "delta" score: a `value` with a signed `delta`, both averaged across
 * runs when aggregated.
 */
var deltaScore = Score.of({
  id: "delta",
  name: "Delta",
  displayStrategy: "number",
  formatValue: (data) => {
    // Non-negative deltas get an explicit "+" prefix.
    const sign = data.delta >= 0 ? "+" : "";
    return `${data.value.toFixed(2)} (${sign}${data.delta.toFixed(2)} vs baseline)`;
  },
  formatAggregate: (data) => {
    const sign = data.delta >= 0 ? "+" : "";
    return `Avg: ${data.value.toFixed(2)} (Delta: ${sign}${data.delta.toFixed(2)})`;
  },
  aggregateValues: Score.aggregate.averageFields(["value", "delta"])
});
|
|
932
|
+
/**
 * Standard pass/fail score, aggregated with the "all runs must pass" strategy.
 */
var binaryScore = Score.of({
  id: "binary",
  name: "Result",
  displayStrategy: "passFail",
  formatValue: (data) => {
    if (data.passed) {
      return "PASSED";
    }
    return "NOT PASSED";
  },
  formatAggregate: (data) => {
    const summary = data.passed ? "All: PASSED" : "Some: FAILED";
    // The pass ratio is only appended when both counts exist and there was more than one run.
    const hasCounts = data.passedCount != null && data.totalCount != null;
    if (!hasCounts || !(data.totalCount > 1)) {
      return summary;
    }
    return `${summary} (${data.passedCount}/${data.totalCount})`;
  },
  aggregateValues: Score.aggregate.all
});
|
|
946
|
+
|
|
947
|
+
// src/evals/tag-set.ts
|
|
948
|
+
/**
 * Helper for declaring a fixed set of tag names.
 * Only the static `define` helper is used in this file.
 */
var TagSet = class {
  constructor() {
  }
  /**
   * Build an object whose keys and values are both the given tag names.
   *
   * @param tags - Iterable of tag names.
   * @returns `{ [tag]: tag }` for every tag in `tags`.
   */
  static define(tags) {
    const lookup = {};
    [...tags].forEach((tag) => {
      lookup[tag] = tag;
    });
    return lookup;
  }
};
|
|
959
|
+
|
|
960
|
+
// src/evals/test-case.ts
|
|
961
|
+
/**
 * Unwrap a value that may be supplied lazily.
 *
 * @param value - Either a plain value or a zero-argument function producing it.
 * @returns `value()` when `value` is a function, otherwise `value` unchanged.
 */
function resolve(value) {
  if (typeof value !== "function") {
    return value;
  }
  return value();
}
|
|
964
|
+
/**
 * A single evaluation test case: a name, optional display name, tags, and
 * (possibly lazily supplied) input/output with their schemas.
 */
var TestCase = class _TestCase {
  constructor(config) {
    this._config = config;
  }
  /**
   * Create a test case from a plain config object.
   * The name goes through `validateTestCaseName` and the display name through
   * `normalizeOptionalDisplayName`; every other field is stored as given.
   */
  static describe(config) {
    const validatedName = validateTestCaseName(config.name, "TestCase.describe");
    const normalizedDisplayName = normalizeOptionalDisplayName(config.displayName);
    const stored = {
      name: validatedName,
      displayName: normalizedDisplayName,
      tags: config.tags,
      inputSchema: config.inputSchema,
      input: config.input,
      outputSchema: config.outputSchema,
      output: config.output
    };
    return new _TestCase(stored);
  }
  /** Name stored at construction time. */
  getName() {
    const { name } = this._config;
    return name;
  }
  /** Display name, or `undefined` when none was stored. */
  getDisplayName() {
    const { displayName } = this._config;
    return displayName;
  }
  /** Display name when it is non-nullish, otherwise the name. */
  getDisplayLabel() {
    const { displayName, name } = this._config;
    return displayName ?? name;
  }
  /** Tags exactly as stored (no defensive copy). */
  getTags() {
    const { tags } = this._config;
    return tags;
  }
  /** Schema stored for the input. */
  getInputSchema() {
    const { inputSchema } = this._config;
    return inputSchema;
  }
  /** Input value; a function-valued input is invoked on every call. */
  getInput() {
    return resolve(this._config.input);
  }
  /** Schema stored for the output. */
  getOutputSchema() {
    const { outputSchema } = this._config;
    return outputSchema;
  }
  /** Output value, or `undefined` when none was stored; a function-valued output is invoked on every call. */
  getOutput() {
    const { output } = this._config;
    return output === void 0 ? void 0 : resolve(output);
  }
};
|
|
1009
|
+
/**
 * Best-effort label for a test-case-like object.
 *
 * @param testCase - Object that may expose `getDisplayLabel()` and/or `getName()`.
 * @returns `getDisplayLabel()` when available, else `getName()` when available, else `""`.
 */
function getTestCaseDisplayLabel(testCase) {
  // Accessors in priority order; the first one that exists wins.
  for (const accessor of ["getDisplayLabel", "getName"]) {
    if (typeof testCase[accessor] === "function") {
      return testCase[accessor]();
    }
  }
  return "";
}
|
|
1015
|
+
/**
 * Copy the tags of a test-case-like object into a fresh array.
 *
 * @param testCase - Object that may expose `getTags()`.
 * @returns A new array with the tags, or `[]` when `getTags` is missing or
 *   returns a nullish value (for example a test case stored without tags).
 */
function getTestCaseTagList(testCase) {
  if (typeof testCase.getTags !== "function") {
    return [];
  }
  // Spreading undefined/null throws, so fall back to an empty list first.
  return [...(testCase.getTags() ?? [])];
}
|
|
1018
|
+
/**
 * Rebuild run snapshots from the `.jsonl` artifact files in `config.artifactDirectory`.
 *
 * @param config - Runner config; `artifactDirectory` is resolved to an absolute path.
 * @returns Snapshots sorted by `queuedAt`, newest first. An unreadable directory
 *   yields `[]`; files that fail to parse or yield no snapshot are skipped.
 */
async function loadRunSnapshotsFromArtifacts(config) {
  const artifactRoot = resolve$1(config.artifactDirectory);
  let directoryEntries;
  try {
    directoryEntries = await readdir(artifactRoot);
  } catch {
    // Directory is missing or unreadable: report no runs.
    return [];
  }
  const collected = [];
  // Files are parsed one at a time, in directory-listing order.
  for (const entryName of directoryEntries) {
    if (!entryName.endsWith(".jsonl")) {
      continue;
    }
    const artifactFile = join(artifactRoot, entryName);
    try {
      const parsed = await parseArtifactToSnapshot(artifactFile, config);
      if (parsed) {
        collected.push(parsed);
      }
    } catch {
      // Best effort: a broken artifact must not hide the others.
    }
  }
  return collected.sort((left, right) => right.queuedAt - left.queuedAt);
}
|
|
1040
|
+
/**
 * Read one JSONL artifact and fold its run-level events into a snapshot.
 *
 * @param filePath - Path of the artifact file; also used as the snapshot's `artifactPath`.
 * @param _config - Unused.
 * @returns The snapshot, or `null` when the file has no non-blank lines or no
 *   `RunQueued` event. Lines that are not valid JSON (or not event objects) are ignored.
 */
async function parseArtifactToSnapshot(filePath, _config) {
  const raw = await readFile(filePath, "utf8");
  const lines = raw.split("\n").filter((line) => line.trim().length > 0);
  if (lines.length === 0) {
    return null;
  }
  let runQueued = null;
  let runStarted = null;
  let runCompleted = null;
  let runFailed = null;
  // A later event of the same type replaces the earlier one.
  for (const line of lines) {
    try {
      const event = JSON.parse(line);
      switch (event.type) {
        case "RunQueued":
          runQueued = {
            runId: event.runId,
            datasetId: event.datasetId,
            datasetName: event.datasetName,
            evaluatorIds: event.evaluatorIds,
            totalTestCases: event.totalTestCases ?? 0,
            artifactPath: event.artifactPath ?? filePath,
            ts: event.ts
          };
          break;
        case "RunStarted":
          runStarted = { startedAt: event.startedAt };
          break;
        case "RunCompleted":
          runCompleted = {
            passedTestCases: event.passedTestCases,
            failedTestCases: event.failedTestCases,
            totalTestCases: event.totalTestCases,
            finishedAt: event.finishedAt
          };
          break;
        case "RunFailed":
          runFailed = {
            finishedAt: event.finishedAt,
            errorMessage: event.errorMessage
          };
          break;
        default:
          break;
      }
    } catch {
      // Skip lines that cannot be parsed or inspected.
    }
  }
  if (!runQueued) {
    return null;
  }
  // Failure outranks completion, which outranks a mere start.
  let status = "queued";
  if (runFailed) {
    status = "failed";
  } else if (runCompleted) {
    status = "completed";
  } else if (runStarted) {
    status = "running";
  }
  const progress = aggregateTestCaseProgress(lines);
  return {
    runId: runQueued.runId,
    datasetId: runQueued.datasetId,
    datasetName: runQueued.datasetName,
    evaluatorIds: runQueued.evaluatorIds,
    queuedAt: runQueued.ts ?? 0,
    startedAt: runStarted?.startedAt,
    finishedAt: runCompleted?.finishedAt ?? runFailed?.finishedAt,
    totalTestCases: runQueued.totalTestCases,
    // A completed run counts every queued test case as completed.
    completedTestCases: runCompleted ? runQueued.totalTestCases : progress.completedTestCases,
    passedTestCases: runCompleted?.passedTestCases ?? progress.passedTestCases,
    failedTestCases: runCompleted?.failedTestCases ?? progress.failedTestCases,
    status,
    artifactPath: filePath,
    errorMessage: runFailed?.errorMessage
  };
}
|
|
1111
|
+
/**
 * Derive progress counters from the `TestCaseProgress` events of an artifact.
 *
 * @param lines - Raw JSONL lines; unparsable lines and other event types are ignored.
 * @returns `{ completedTestCases, passedTestCases, failedTestCases }`.
 *   `completedTestCases` is the last reported non-nullish value (0 if none).
 *   A test case id counts as passed only when its folded verdict is truthy.
 */
function aggregateTestCaseProgress(lines) {
  let completedTestCases = 0;
  const verdictByTestCase = /* @__PURE__ */ new Map();
  for (const line of lines) {
    try {
      const event = JSON.parse(line);
      if (event.type !== "TestCaseProgress") {
        continue;
      }
      completedTestCases = event.completedTestCases ?? completedTestCases;
      const testCaseId = event.testCaseId;
      const previous = verdictByTestCase.get(testCaseId);
      // The first verdict is taken as-is; later ones are AND-ed into it.
      const folded = previous === void 0 ? event.passed : previous && event.passed;
      verdictByTestCase.set(testCaseId, folded);
    } catch {
      // Ignore lines that cannot be parsed or inspected.
    }
  }
  const verdicts = [...verdictByTestCase.values()];
  const passedTestCases = verdicts.filter(Boolean).length;
  const failedTestCases = verdicts.length - passedTestCases;
  return { completedTestCases, passedTestCases, failedTestCases };
}
|
|
865
1138
|
|
|
866
1139
|
// src/runner/config.ts
|
|
@@ -871,18 +1144,9 @@ var defaultRunnerConfig = {
|
|
|
871
1144
|
discovery: {
|
|
872
1145
|
rootDir: process.cwd(),
|
|
873
1146
|
datasetSuffixes: [".dataset.ts", ".dataset.tsx", ".dataset.js", ".dataset.mjs"],
|
|
874
|
-
evaluatorSuffixes: [
|
|
875
|
-
|
|
876
|
-
|
|
877
|
-
".evaluator.js",
|
|
878
|
-
".evaluator.mjs"
|
|
879
|
-
],
|
|
880
|
-
testCaseSuffixes: [
|
|
881
|
-
".test-case.ts",
|
|
882
|
-
".test-case.tsx",
|
|
883
|
-
".test-case.js",
|
|
884
|
-
".test-case.mjs"
|
|
885
|
-
],
|
|
1147
|
+
evaluatorSuffixes: [".evaluator.ts", ".evaluator.tsx", ".evaluator.js", ".evaluator.mjs"],
|
|
1148
|
+
runConfigSuffixes: [".run-config.ts", ".run-config.tsx", ".run-config.js", ".run-config.mjs"],
|
|
1149
|
+
testCaseSuffixes: [".test-case.ts", ".test-case.tsx", ".test-case.js", ".test-case.mjs"],
|
|
886
1150
|
excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
|
|
887
1151
|
},
|
|
888
1152
|
artifactDirectory: ".eval-results",
|
|
@@ -907,6 +1171,11 @@ function toRunnerConfigOverrides(config) {
|
|
|
907
1171
|
} else if (rawDiscovery?.evaluatorSuffixes !== void 0) {
|
|
908
1172
|
discovery.evaluatorSuffixes = rawDiscovery.evaluatorSuffixes;
|
|
909
1173
|
}
|
|
1174
|
+
if (rawDiscovery?.runConfigFilePatterns !== void 0) {
|
|
1175
|
+
discovery.runConfigSuffixes = rawDiscovery.runConfigFilePatterns;
|
|
1176
|
+
} else if (rawDiscovery?.runConfigSuffixes !== void 0) {
|
|
1177
|
+
discovery.runConfigSuffixes = rawDiscovery.runConfigSuffixes;
|
|
1178
|
+
}
|
|
910
1179
|
if (rawDiscovery?.testCaseFilePatterns !== void 0) {
|
|
911
1180
|
discovery.testCaseSuffixes = rawDiscovery.testCaseFilePatterns;
|
|
912
1181
|
} else if (rawDiscovery?.testCaseSuffixes !== void 0) {
|
|
@@ -949,14 +1218,15 @@ function getJitiLoader() {
|
|
|
949
1218
|
}
|
|
950
1219
|
const createJiti2 = jitiModule.createJiti ?? jitiModule.default;
|
|
951
1220
|
if (typeof createJiti2 !== "function") {
|
|
952
|
-
throw new Error(
|
|
953
|
-
"Failed to initialize jiti for m4trix eval config loading."
|
|
954
|
-
);
|
|
1221
|
+
throw new Error("Failed to initialize jiti for m4trix eval config loading.");
|
|
955
1222
|
}
|
|
956
|
-
cachedLoader = createJiti2(
|
|
957
|
-
|
|
958
|
-
|
|
959
|
-
|
|
1223
|
+
cachedLoader = createJiti2(
|
|
1224
|
+
import.meta.url,
|
|
1225
|
+
{
|
|
1226
|
+
interopDefault: true,
|
|
1227
|
+
moduleCache: true
|
|
1228
|
+
}
|
|
1229
|
+
);
|
|
960
1230
|
return cachedLoader;
|
|
961
1231
|
}
|
|
962
1232
|
function resolveConfigModuleExport(loadedModule) {
|
|
@@ -1004,6 +1274,9 @@ function isDatasetLike(value) {
|
|
|
1004
1274
|
/**
 * Duck-type check for evaluator exports: `hasMethod` must succeed for
 * `getName`, `resolveContext` and `getEvaluateFn`, checked in that order.
 */
function isEvaluatorLike(value) {
  // Each step short-circuits exactly like a chained `&&`.
  let verdict = hasMethod(value, "getName");
  verdict = verdict && hasMethod(value, "resolveContext");
  verdict = verdict && hasMethod(value, "getEvaluateFn");
  return verdict;
}
|
|
1277
|
+
/**
 * Duck-type check for run-config exports: `hasMethod` must succeed for
 * `getName` and `getRuns`, and `value.getRuns` must be a function.
 */
function isRunConfigLike(value) {
  // Each step short-circuits exactly like a chained `&&`.
  let verdict = hasMethod(value, "getName");
  verdict = verdict && hasMethod(value, "getRuns");
  verdict = verdict && typeof value.getRuns === "function";
  return verdict;
}
|
|
1007
1280
|
/**
 * Duck-type check for test-case exports: `hasMethod` must succeed for
 * `getName`, `getTags` and `getInput`, checked in that order.
 */
function isTestCaseLike(value) {
  // Each step short-circuits exactly like a chained `&&`.
  let verdict = hasMethod(value, "getName");
  verdict = verdict && hasMethod(value, "getTags");
  verdict = verdict && hasMethod(value, "getInput");
  return verdict;
}
|
|
@@ -1060,9 +1333,7 @@ async function loadModuleExports(filePath) {
|
|
|
1060
1333
|
}
|
|
1061
1334
|
async function collectDatasetsFromFiles(config) {
|
|
1062
1335
|
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
1063
|
-
const matched = files.filter(
|
|
1064
|
-
(filePath) => hasOneSuffix(filePath, config.datasetSuffixes)
|
|
1065
|
-
);
|
|
1336
|
+
const matched = files.filter((filePath) => hasOneSuffix(filePath, config.datasetSuffixes));
|
|
1066
1337
|
const found = await Promise.all(
|
|
1067
1338
|
matched.map(async (absolutePath) => {
|
|
1068
1339
|
const exports = await loadModuleExports(absolutePath);
|
|
@@ -1079,9 +1350,7 @@ async function collectDatasetsFromFiles(config) {
|
|
|
1079
1350
|
}
|
|
1080
1351
|
async function collectEvaluatorsFromFiles(config) {
|
|
1081
1352
|
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
1082
|
-
const matched = files.filter(
|
|
1083
|
-
(filePath) => hasOneSuffix(filePath, config.evaluatorSuffixes)
|
|
1084
|
-
);
|
|
1353
|
+
const matched = files.filter((filePath) => hasOneSuffix(filePath, config.evaluatorSuffixes));
|
|
1085
1354
|
const found = await Promise.all(
|
|
1086
1355
|
matched.map(async (absolutePath) => {
|
|
1087
1356
|
const exports = await loadModuleExports(absolutePath);
|
|
@@ -1096,11 +1365,26 @@ async function collectEvaluatorsFromFiles(config) {
|
|
|
1096
1365
|
);
|
|
1097
1366
|
return found.flat();
|
|
1098
1367
|
}
|
|
1099
|
-
async function
|
|
1368
|
+
/**
 * Discover run configs exported from files whose names match `config.runConfigSuffixes`.
 *
 * @param config - Discovery settings: `rootDir`, `excludeDirectories`, `runConfigSuffixes`.
 * @returns One `{ id, filePath, runConfig }` entry per run-config-like export, where
 *   `id` is `runConfig.getName()` and `filePath` is relative to `config.rootDir`.
 */
async function collectRunConfigsFromFiles(config) {
  const allFiles = await walkDirectory(config.rootDir, config.excludeDirectories);
  const runConfigFiles = allFiles.filter((filePath) => hasOneSuffix(filePath, config.runConfigSuffixes));
  // Load one module and describe every run-config-like export it contains.
  const describeFile = async (absolutePath) => {
    const moduleExports = await loadModuleExports(absolutePath);
    const runConfigs = moduleExports.filter(isRunConfigLike);
    const filePath = relative(config.rootDir, absolutePath);
    return runConfigs.map((runConfig) => ({
      id: runConfig.getName(),
      filePath,
      runConfig
    }));
  };
  // All matching files are loaded concurrently.
  const perFile = await Promise.all(runConfigFiles.map((absolutePath) => describeFile(absolutePath)));
  return perFile.flat();
}
|
|
1385
|
+
async function collectTestCasesFromFiles(config) {
|
|
1386
|
+
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
1387
|
+
const matched = files.filter((filePath) => hasOneSuffix(filePath, config.testCaseSuffixes));
|
|
1104
1388
|
const found = await Promise.all(
|
|
1105
1389
|
matched.map(async (absolutePath) => {
|
|
1106
1390
|
const exports = await loadModuleExports(absolutePath);
|
|
@@ -1190,15 +1474,17 @@ function readOutput(testCase) {
|
|
|
1190
1474
|
}
|
|
1191
1475
|
return candidate.getOutput();
|
|
1192
1476
|
}
|
|
1193
|
-
function buildEvaluationUnits(testCases) {
|
|
1477
|
+
function buildEvaluationUnits(testCases, repetitionCount) {
|
|
1478
|
+
const count = Math.max(1, repetitionCount);
|
|
1194
1479
|
const units = [];
|
|
1195
1480
|
for (const testCaseItem of testCases) {
|
|
1196
|
-
const
|
|
1197
|
-
for (let r = 0; r <
|
|
1481
|
+
const repetitionId = `rep-${randomUUID()}`;
|
|
1482
|
+
for (let r = 0; r < count; r++) {
|
|
1198
1483
|
units.push({
|
|
1199
1484
|
testCaseItem,
|
|
1200
|
-
|
|
1201
|
-
|
|
1485
|
+
repetitionId,
|
|
1486
|
+
repetitionIndex: r + 1,
|
|
1487
|
+
repetitionCount: count
|
|
1202
1488
|
});
|
|
1203
1489
|
}
|
|
1204
1490
|
}
|
|
@@ -1208,29 +1494,24 @@ function nowIsoForFile() {
|
|
|
1208
1494
|
return (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
1209
1495
|
}
|
|
1210
1496
|
/**
 * Build the artifact file path for a run.
 *
 * @param artifactDirectory - Directory that holds artifact files.
 * @param datasetId - Dataset id, first part of the file name.
 * @param runId - Run id, second part of the file name.
 * @returns `<artifactDirectory>/<datasetId>_<runId>_<timestamp>.jsonl`, where the
 *   timestamp comes from `nowIsoForFile()`.
 */
function createArtifactPath(artifactDirectory, datasetId, runId) {
  const fileName = `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`;
  return join(artifactDirectory, fileName);
}
|
|
1216
1499
|
function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
|
|
1217
|
-
const { testCaseItem,
|
|
1500
|
+
const { testCaseItem, repetitionId, repetitionIndex, repetitionCount } = unit;
|
|
1218
1501
|
return Effect.gen(function* () {
|
|
1219
1502
|
const evaluatorRunId = `run-${randomUUID()}`;
|
|
1220
1503
|
const started = Date.now();
|
|
1221
|
-
const startedEvaluations = yield* Ref.modify(startedRef, (n) => [
|
|
1222
|
-
n + 1,
|
|
1223
|
-
n + 1
|
|
1224
|
-
]);
|
|
1504
|
+
const startedEvaluations = yield* Ref.modify(startedRef, (n) => [n + 1, n + 1]);
|
|
1225
1505
|
yield* publishEvent({
|
|
1226
1506
|
type: "TestCaseStarted",
|
|
1227
1507
|
runId: task.runId,
|
|
1228
1508
|
testCaseId: testCaseItem.id,
|
|
1229
|
-
testCaseName: testCaseItem.testCase
|
|
1509
|
+
testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
|
|
1230
1510
|
startedTestCases: startedEvaluations,
|
|
1231
1511
|
totalTestCases: totalEvaluations,
|
|
1232
|
-
|
|
1233
|
-
|
|
1512
|
+
repetitionId,
|
|
1513
|
+
repetitionIndex,
|
|
1514
|
+
repetitionCount
|
|
1234
1515
|
});
|
|
1235
1516
|
const evaluatorScores = [];
|
|
1236
1517
|
let testCaseError;
|
|
@@ -1254,9 +1535,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1254
1535
|
return error;
|
|
1255
1536
|
};
|
|
1256
1537
|
try {
|
|
1257
|
-
const ctx = yield* Effect.promise(
|
|
1258
|
-
() => Promise.resolve(evaluator.resolveContext())
|
|
1259
|
-
);
|
|
1538
|
+
const ctx = yield* Effect.promise(() => Promise.resolve(evaluator.resolveContext()));
|
|
1260
1539
|
const result = yield* Effect.promise(
|
|
1261
1540
|
() => Promise.resolve().then(
|
|
1262
1541
|
() => evaluateFn({
|
|
@@ -1266,8 +1545,15 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1266
1545
|
meta: {
|
|
1267
1546
|
triggerId: task.triggerId,
|
|
1268
1547
|
runId: evaluatorRunId,
|
|
1269
|
-
datasetId: task.datasetId
|
|
1548
|
+
datasetId: task.datasetId,
|
|
1549
|
+
repetitionId,
|
|
1550
|
+
repetitionIndex,
|
|
1551
|
+
repetitionCount,
|
|
1552
|
+
runConfigName: task.runConfigName
|
|
1270
1553
|
},
|
|
1554
|
+
testCaseTags: getTestCaseTagList(testCaseItem.testCase),
|
|
1555
|
+
runConfigTags: task.runConfigTags,
|
|
1556
|
+
evaluatorTags: getEvaluatorTagList(evaluator),
|
|
1271
1557
|
logDiff,
|
|
1272
1558
|
log,
|
|
1273
1559
|
createError
|
|
@@ -1310,21 +1596,19 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1310
1596
|
});
|
|
1311
1597
|
}
|
|
1312
1598
|
}
|
|
1313
|
-
const
|
|
1314
|
-
const completedEvaluations = yield* Ref.modify(completedRef, (n) => [
|
|
1315
|
-
n + 1,
|
|
1316
|
-
n + 1
|
|
1317
|
-
]);
|
|
1599
|
+
const repetitionPassedThis = evaluatorScores.every((s) => s.passed);
|
|
1600
|
+
const completedEvaluations = yield* Ref.modify(completedRef, (n) => [n + 1, n + 1]);
|
|
1318
1601
|
const progressEvent = {
|
|
1319
1602
|
type: "TestCaseProgress",
|
|
1320
1603
|
runId: task.runId,
|
|
1321
1604
|
testCaseId: testCaseItem.id,
|
|
1322
|
-
testCaseName: testCaseItem.testCase
|
|
1605
|
+
testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
|
|
1323
1606
|
completedTestCases: completedEvaluations,
|
|
1324
1607
|
totalTestCases: totalEvaluations,
|
|
1325
|
-
|
|
1326
|
-
|
|
1327
|
-
|
|
1608
|
+
repetitionId,
|
|
1609
|
+
repetitionIndex,
|
|
1610
|
+
repetitionCount,
|
|
1611
|
+
passed: repetitionPassedThis,
|
|
1328
1612
|
durationMs: Date.now() - started,
|
|
1329
1613
|
evaluatorScores,
|
|
1330
1614
|
output,
|
|
@@ -1345,9 +1629,9 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1345
1629
|
(map) => {
|
|
1346
1630
|
const key = testCaseItem.id;
|
|
1347
1631
|
const existing = map.get(key) ?? { completedCount: 0, results: [] };
|
|
1348
|
-
const newResults = [...existing.results,
|
|
1632
|
+
const newResults = [...existing.results, repetitionPassedThis];
|
|
1349
1633
|
const newCompletedCount = existing.completedCount + 1;
|
|
1350
|
-
const isLast = newCompletedCount ===
|
|
1634
|
+
const isLast = newCompletedCount === repetitionCount;
|
|
1351
1635
|
const newMap = new Map(map);
|
|
1352
1636
|
newMap.set(key, {
|
|
1353
1637
|
completedCount: newCompletedCount,
|
|
@@ -1363,10 +1647,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1363
1647
|
} else {
|
|
1364
1648
|
yield* Ref.update(failedRef, (n) => n + 1);
|
|
1365
1649
|
}
|
|
1366
|
-
const [passed, failed] = yield* Effect.all([
|
|
1367
|
-
Ref.get(passedRef),
|
|
1368
|
-
Ref.get(failedRef)
|
|
1369
|
-
]);
|
|
1650
|
+
const [passed, failed] = yield* Effect.all([Ref.get(passedRef), Ref.get(failedRef)]);
|
|
1370
1651
|
yield* updateSnapshot(task.runId, (snapshot) => ({
|
|
1371
1652
|
...snapshot,
|
|
1372
1653
|
passedTestCases: passed,
|
|
@@ -1387,10 +1668,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
1387
1668
|
runId: task.runId,
|
|
1388
1669
|
startedAt
|
|
1389
1670
|
});
|
|
1390
|
-
const totalEvaluations = task.testCases.
|
|
1391
|
-
(sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
|
|
1392
|
-
0
|
|
1393
|
-
);
|
|
1671
|
+
const totalEvaluations = task.testCases.length * Math.max(1, task.repetitions);
|
|
1394
1672
|
const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
|
|
1395
1673
|
const completedRef = yield* Ref.make(0);
|
|
1396
1674
|
const startedRef = yield* Ref.make(0);
|
|
@@ -1399,7 +1677,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
1399
1677
|
const testCaseResultsRef = yield* Ref.make(
|
|
1400
1678
|
/* @__PURE__ */ new Map()
|
|
1401
1679
|
);
|
|
1402
|
-
const evaluationUnits = buildEvaluationUnits(task.testCases);
|
|
1680
|
+
const evaluationUnits = buildEvaluationUnits(task.testCases, task.repetitions);
|
|
1403
1681
|
const processEvaluation = (unit) => processOneEvaluation(
|
|
1404
1682
|
task,
|
|
1405
1683
|
unit,
|
|
@@ -1413,11 +1691,20 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
1413
1691
|
failedRef,
|
|
1414
1692
|
testCaseResultsRef
|
|
1415
1693
|
);
|
|
1416
|
-
|
|
1417
|
-
|
|
1418
|
-
|
|
1419
|
-
|
|
1420
|
-
|
|
1694
|
+
const globalSem = task.globalEvaluationSemaphore;
|
|
1695
|
+
if (globalSem !== void 0) {
|
|
1696
|
+
yield* Effect.forEach(
|
|
1697
|
+
evaluationUnits,
|
|
1698
|
+
(unit) => globalSem.withPermits(1)(processEvaluation(unit)),
|
|
1699
|
+
{ concurrency: "unbounded", discard: true }
|
|
1700
|
+
);
|
|
1701
|
+
} else {
|
|
1702
|
+
yield* Effect.forEach(
|
|
1703
|
+
evaluationUnits,
|
|
1704
|
+
processEvaluation,
|
|
1705
|
+
maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
|
|
1706
|
+
);
|
|
1707
|
+
}
|
|
1421
1708
|
const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* Effect.all([
|
|
1422
1709
|
Ref.get(completedRef),
|
|
1423
1710
|
Ref.get(passedRef),
|
|
@@ -1453,125 +1740,34 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
1453
1740
|
artifactPath: task.snapshot.artifactPath
|
|
1454
1741
|
});
|
|
1455
1742
|
});
|
|
1456
|
-
|
|
1457
|
-
|
|
1458
|
-
|
|
1459
|
-
|
|
1460
|
-
|
|
1461
|
-
} catch {
|
|
1462
|
-
return [];
|
|
1463
|
-
}
|
|
1464
|
-
const jsonlFiles = entries.filter((name) => name.endsWith(".jsonl"));
|
|
1465
|
-
const snapshots = [];
|
|
1466
|
-
for (const fileName of jsonlFiles) {
|
|
1467
|
-
const filePath = join(baseDir, fileName);
|
|
1468
|
-
try {
|
|
1469
|
-
const snapshot = await parseArtifactToSnapshot(filePath, config);
|
|
1470
|
-
if (snapshot) {
|
|
1471
|
-
snapshots.push(snapshot);
|
|
1472
|
-
}
|
|
1473
|
-
} catch {
|
|
1474
|
-
}
|
|
1475
|
-
}
|
|
1476
|
-
return snapshots.sort((a, b) => b.queuedAt - a.queuedAt);
|
|
1477
|
-
}
|
|
1478
|
-
async function parseArtifactToSnapshot(filePath, _config) {
|
|
1479
|
-
const content = await readFile(filePath, "utf8");
|
|
1480
|
-
const lines = content.split("\n").filter((line) => line.trim().length > 0);
|
|
1481
|
-
if (lines.length === 0) {
|
|
1482
|
-
return null;
|
|
1483
|
-
}
|
|
1484
|
-
let runQueued = null;
|
|
1485
|
-
let runCompleted = null;
|
|
1486
|
-
let runFailed = null;
|
|
1487
|
-
let runStarted = null;
|
|
1488
|
-
for (const line of lines) {
|
|
1489
|
-
try {
|
|
1490
|
-
const event = JSON.parse(line);
|
|
1491
|
-
const type = event.type;
|
|
1492
|
-
if (type === "RunQueued") {
|
|
1493
|
-
runQueued = {
|
|
1494
|
-
runId: event.runId,
|
|
1495
|
-
datasetId: event.datasetId,
|
|
1496
|
-
datasetName: event.datasetName,
|
|
1497
|
-
evaluatorIds: event.evaluatorIds,
|
|
1498
|
-
totalTestCases: event.totalTestCases ?? 0,
|
|
1499
|
-
artifactPath: event.artifactPath ?? filePath,
|
|
1500
|
-
ts: event.ts
|
|
1501
|
-
};
|
|
1502
|
-
}
|
|
1503
|
-
if (type === "RunStarted") {
|
|
1504
|
-
runStarted = { startedAt: event.startedAt };
|
|
1505
|
-
}
|
|
1506
|
-
if (type === "RunCompleted") {
|
|
1507
|
-
runCompleted = {
|
|
1508
|
-
passedTestCases: event.passedTestCases,
|
|
1509
|
-
failedTestCases: event.failedTestCases,
|
|
1510
|
-
totalTestCases: event.totalTestCases,
|
|
1511
|
-
finishedAt: event.finishedAt
|
|
1512
|
-
};
|
|
1513
|
-
}
|
|
1514
|
-
if (type === "RunFailed") {
|
|
1515
|
-
runFailed = {
|
|
1516
|
-
finishedAt: event.finishedAt,
|
|
1517
|
-
errorMessage: event.errorMessage
|
|
1518
|
-
};
|
|
1519
|
-
}
|
|
1520
|
-
} catch {
|
|
1521
|
-
}
|
|
1743
|
+
|
|
1744
|
+
// src/runner/name-pattern.ts
|
|
1745
|
+
/**
 * Split a `/source/flags` string into its parts.
 *
 * @param pattern - Candidate regex literal text.
 * @returns `{ source, flags }`, or `undefined` when `pattern` does not start with
 *   `/` or has no second `/` closing the literal.
 */
function parseRegexLiteral(pattern) {
  if (!pattern.startsWith("/")) {
    return void 0;
  }
  const lastSlash = pattern.lastIndexOf("/");
  // lastSlash === 0 means the only slash is the opening one.
  if (lastSlash <= 0) {
    return void 0;
  }
  return {
    source: pattern.slice(1, lastSlash),
    flags: pattern.slice(lastSlash + 1)
  };
}
/**
 * Build a predicate that tests names against a user-supplied pattern.
 *
 * Supported forms, checked in this order after trimming the pattern:
 *   1. `/source/flags` — used as a regular expression as written.
 *   2. Text containing `*` — case-insensitive whole-string match where `*`
 *      matches any run of characters and everything else is literal.
 *   3. Anything else — case-insensitive exact match.
 *
 * @param pattern - Pattern text.
 * @returns `(value) => boolean`.
 * @throws SyntaxError when a regex literal has an invalid source or flags.
 */
function createNameMatcher(pattern) {
  const normalizedPattern = pattern.trim();
  const regexLiteral = parseRegexLiteral(normalizedPattern);
  if (regexLiteral) {
    const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
    return (value) => {
      // With the g or y flag, test() resumes from lastIndex; reset it so the
      // matcher returns the same answer every time it is called.
      regex.lastIndex = 0;
      return regex.test(value);
    };
  }
  if (normalizedPattern.includes("*")) {
    // Escape every regex metacharacter except `*` (including `?`, which would
    // otherwise act as a quantifier), then turn `*` into `.*`.
    const escaped = normalizedPattern.replace(/[.+?^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
    const regex = new RegExp(`^${escaped}$`, "i");
    return (value) => regex.test(value);
  }
  const expected = normalizedPattern.toLowerCase();
  return (value) => value.toLowerCase() === expected;
}
|
|
1576
1772
|
async function appendJsonLine(artifactPath, payload) {
|
|
1577
1773
|
await mkdir(dirname(artifactPath), { recursive: true });
|
|
@@ -1630,32 +1826,12 @@ function searchCollectedTestCases(all, query) {
|
|
|
1630
1826
|
}
|
|
1631
1827
|
|
|
1632
1828
|
// src/runner/api.ts
|
|
1633
|
-
function
|
|
1634
|
-
|
|
1635
|
-
|
|
1636
|
-
|
|
1637
|
-
const lastSlash = pattern.lastIndexOf("/");
|
|
1638
|
-
if (lastSlash <= 0) {
|
|
1639
|
-
return void 0;
|
|
1640
|
-
}
|
|
1641
|
-
return {
|
|
1642
|
-
source: pattern.slice(1, lastSlash),
|
|
1643
|
-
flags: pattern.slice(lastSlash + 1)
|
|
1644
|
-
};
|
|
1645
|
-
}
|
|
1646
|
-
function createNameMatcher(pattern) {
|
|
1647
|
-
const normalizedPattern = pattern.trim();
|
|
1648
|
-
const regexLiteral = parseRegexLiteral(normalizedPattern);
|
|
1649
|
-
if (regexLiteral) {
|
|
1650
|
-
const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
|
|
1651
|
-
return (value) => regex.test(value);
|
|
1652
|
-
}
|
|
1653
|
-
if (normalizedPattern.includes("*")) {
|
|
1654
|
-
const escaped = normalizedPattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
|
|
1655
|
-
const regex = new RegExp(`^${escaped}$`, "i");
|
|
1656
|
-
return (value) => regex.test(value);
|
|
1829
|
+
/**
 * Resolves the repetition count for a run.
 *
 * @param {number | null | undefined} value - Requested repetitions; a nullish
 *   value means "not specified" and resolves to 1.
 * @returns {number} The repetition count, guaranteed to be an integer >= 1.
 * @throws {Error} If the value is present but is not a positive integer.
 */
function normalizeRunRepetitions(value) {
  const requested = value === null || value === undefined ? 1 : value;
  const isPositiveInteger = Number.isInteger(requested) && requested >= 1;
  if (isPositiveInteger) {
    return requested;
  }
  throw new Error(`repetitions must be a positive integer, got ${String(value)}`);
}
|
|
1660
1836
|
function mergeRunnerOverrides(base, next) {
|
|
1661
1837
|
if (!base) {
|
|
@@ -1686,15 +1862,12 @@ var EffectRunner = class {
|
|
|
1686
1862
|
this.persistenceQueue = Effect.runSync(
|
|
1687
1863
|
Queue.unbounded()
|
|
1688
1864
|
);
|
|
1689
|
-
this.snapshotsRef = Effect.runSync(
|
|
1690
|
-
Ref.make(/* @__PURE__ */ new Map())
|
|
1691
|
-
);
|
|
1865
|
+
this.snapshotsRef = Effect.runSync(Ref.make(/* @__PURE__ */ new Map()));
|
|
1692
1866
|
this.listeners = /* @__PURE__ */ new Set();
|
|
1693
1867
|
this.datasetsById = /* @__PURE__ */ new Map();
|
|
1694
1868
|
this.evaluatorsById = /* @__PURE__ */ new Map();
|
|
1695
|
-
this.
|
|
1696
|
-
|
|
1697
|
-
);
|
|
1869
|
+
this.runConfigsById = /* @__PURE__ */ new Map();
|
|
1870
|
+
this.schedulerFiber = Effect.runFork(this.createSchedulerEffect());
|
|
1698
1871
|
this.persistenceFiber = Effect.runFork(
|
|
1699
1872
|
createPersistenceWorker(this.persistenceQueue)
|
|
1700
1873
|
);
|
|
@@ -1734,6 +1907,137 @@ var EffectRunner = class {
|
|
|
1734
1907
|
(item) => matcher(item.evaluator.getName() ?? "")
|
|
1735
1908
|
);
|
|
1736
1909
|
}
|
|
1910
|
+
async collectRunConfigs() {
|
|
1911
|
+
const runConfigs = await collectRunConfigsFromFiles(this.config.discovery);
|
|
1912
|
+
this.runConfigsById.clear();
|
|
1913
|
+
const byNameLower = /* @__PURE__ */ new Map();
|
|
1914
|
+
for (const item of runConfigs) {
|
|
1915
|
+
const id = item.runConfig.getName();
|
|
1916
|
+
const lower = id.toLowerCase();
|
|
1917
|
+
const prev = byNameLower.get(lower);
|
|
1918
|
+
if (prev !== void 0 && prev.filePath !== item.filePath) {
|
|
1919
|
+
throw new Error(
|
|
1920
|
+
`Duplicate RunConfig name "${id}" (matches "${prev.runConfig.getName()}" case-insensitively): ${prev.filePath} and ${item.filePath}`
|
|
1921
|
+
);
|
|
1922
|
+
}
|
|
1923
|
+
byNameLower.set(lower, item);
|
|
1924
|
+
this.runConfigsById.set(id, item);
|
|
1925
|
+
}
|
|
1926
|
+
return runConfigs;
|
|
1927
|
+
}
|
|
1928
|
+
async resolveRunConfigByName(name) {
|
|
1929
|
+
if (this.runConfigsById.size === 0) {
|
|
1930
|
+
await this.collectRunConfigs();
|
|
1931
|
+
}
|
|
1932
|
+
const key = validateRunConfigName(name, `RunConfig "${name.trim()}"`);
|
|
1933
|
+
const keyLower = key.toLowerCase();
|
|
1934
|
+
const matches = Array.from(this.runConfigsById.values()).filter(
|
|
1935
|
+
(item) => item.runConfig.getName().toLowerCase() === keyLower
|
|
1936
|
+
);
|
|
1937
|
+
if (matches.length === 0) {
|
|
1938
|
+
return void 0;
|
|
1939
|
+
}
|
|
1940
|
+
if (matches.length > 1) {
|
|
1941
|
+
throw new Error(
|
|
1942
|
+
`Multiple RunConfigs named "${name}": ${matches.map((m) => m.filePath).join(", ")}`
|
|
1943
|
+
);
|
|
1944
|
+
}
|
|
1945
|
+
return matches[0];
|
|
1946
|
+
}
|
|
1947
|
+
async expandRunConfigToJobs(collected) {
|
|
1948
|
+
if (this.datasetsById.size === 0) {
|
|
1949
|
+
await this.collectDatasets();
|
|
1950
|
+
}
|
|
1951
|
+
if (this.evaluatorsById.size === 0) {
|
|
1952
|
+
await this.collectEvaluators();
|
|
1953
|
+
}
|
|
1954
|
+
const rcName = collected.runConfig.getName();
|
|
1955
|
+
const jobs = [];
|
|
1956
|
+
const runs = collected.runConfig.getRuns();
|
|
1957
|
+
for (const [i, row] of runs.entries()) {
|
|
1958
|
+
const dsCollected = Array.from(this.datasetsById.values()).find(
|
|
1959
|
+
(d) => d.dataset === row.dataset
|
|
1960
|
+
);
|
|
1961
|
+
if (!dsCollected) {
|
|
1962
|
+
throw new Error(
|
|
1963
|
+
`RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getName()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
|
|
1964
|
+
);
|
|
1965
|
+
}
|
|
1966
|
+
let evaluatorIds;
|
|
1967
|
+
if ("evaluatorPattern" in row && typeof row.evaluatorPattern === "string") {
|
|
1968
|
+
const matcher = createNameMatcher(row.evaluatorPattern);
|
|
1969
|
+
const matched = Array.from(this.evaluatorsById.values()).filter(
|
|
1970
|
+
(item) => matcher(item.evaluator.getName() ?? "")
|
|
1971
|
+
);
|
|
1972
|
+
if (matched.length === 0) {
|
|
1973
|
+
throw new Error(
|
|
1974
|
+
`RunConfig "${rcName}" run[${i}]: no evaluator matched pattern "${row.evaluatorPattern}"`
|
|
1975
|
+
);
|
|
1976
|
+
}
|
|
1977
|
+
evaluatorIds = matched.map((item) => item.id);
|
|
1978
|
+
} else {
|
|
1979
|
+
const evaluators = row.evaluators;
|
|
1980
|
+
evaluatorIds = [];
|
|
1981
|
+
for (const ev of evaluators) {
|
|
1982
|
+
const found = Array.from(this.evaluatorsById.values()).find(
|
|
1983
|
+
(item) => item.evaluator === ev
|
|
1984
|
+
);
|
|
1985
|
+
if (!found) {
|
|
1986
|
+
throw new Error(
|
|
1987
|
+
`RunConfig "${rcName}" run[${i}]: evaluator "${getEvaluatorDisplayLabel(ev) ?? "unknown"}" was not found among discovered evaluator exports`
|
|
1988
|
+
);
|
|
1989
|
+
}
|
|
1990
|
+
evaluatorIds.push(found.id);
|
|
1991
|
+
}
|
|
1992
|
+
}
|
|
1993
|
+
const repetitions = "repetitions" in row && row.repetitions !== void 0 ? row.repetitions : 1;
|
|
1994
|
+
jobs.push({
|
|
1995
|
+
datasetId: dsCollected.id,
|
|
1996
|
+
evaluatorIds,
|
|
1997
|
+
runConfigName: rcName,
|
|
1998
|
+
runConfigDisplayLabel: collected.runConfig.getDisplayLabel(),
|
|
1999
|
+
runConfigTags: collected.runConfig.getTags(),
|
|
2000
|
+
repetitions
|
|
2001
|
+
});
|
|
2002
|
+
}
|
|
2003
|
+
return jobs;
|
|
2004
|
+
}
|
|
2005
|
+
async expandRunConfigNamesToJobs(names) {
|
|
2006
|
+
const jobs = [];
|
|
2007
|
+
for (const name of names) {
|
|
2008
|
+
const collected = await this.resolveRunConfigByName(name);
|
|
2009
|
+
if (!collected) {
|
|
2010
|
+
const known = await this.collectRunConfigs();
|
|
2011
|
+
const available = known.map((r) => r.runConfig.getName()).sort();
|
|
2012
|
+
throw new Error(
|
|
2013
|
+
available.length > 0 ? `RunConfig "${name}" not found. Available RunConfigs: ${available.join(", ")}` : `RunConfig "${name}" not found and no RunConfigs were discovered.`
|
|
2014
|
+
);
|
|
2015
|
+
}
|
|
2016
|
+
jobs.push(...await this.expandRunConfigToJobs(collected));
|
|
2017
|
+
}
|
|
2018
|
+
return jobs;
|
|
2019
|
+
}
|
|
2020
|
+
async runDatasetJobsWithSharedConcurrency(request) {
|
|
2021
|
+
const globalConcurrency = Math.max(1, request.globalConcurrency);
|
|
2022
|
+
const sem = Effect.unsafeMakeSemaphore(globalConcurrency);
|
|
2023
|
+
const triggerId = request.triggerId ?? `trg-${randomUUID()}`;
|
|
2024
|
+
const snapshots = [];
|
|
2025
|
+
for (const job of request.jobs) {
|
|
2026
|
+
snapshots.push(
|
|
2027
|
+
await this.startDatasetRun({
|
|
2028
|
+
datasetId: job.datasetId,
|
|
2029
|
+
evaluatorIds: job.evaluatorIds,
|
|
2030
|
+
triggerId,
|
|
2031
|
+
maxConcurrency: this.config.maxConcurrency ?? 1,
|
|
2032
|
+
globalEvaluationSemaphore: sem,
|
|
2033
|
+
runConfigName: job.runConfigName,
|
|
2034
|
+
runConfigTags: job.runConfigTags,
|
|
2035
|
+
repetitions: job.repetitions
|
|
2036
|
+
})
|
|
2037
|
+
);
|
|
2038
|
+
}
|
|
2039
|
+
return snapshots;
|
|
2040
|
+
}
|
|
1737
2041
|
async searchTestCases(query) {
|
|
1738
2042
|
const testCases = await collectTestCasesFromFiles(this.config.discovery);
|
|
1739
2043
|
return searchCollectedTestCases(testCases, query);
|
|
@@ -1752,35 +2056,45 @@ var EffectRunner = class {
|
|
|
1752
2056
|
);
|
|
1753
2057
|
}
|
|
1754
2058
|
async runDatasetWith(request) {
|
|
2059
|
+
const runConfigName = validateRunConfigName(
|
|
2060
|
+
request.runConfigName,
|
|
2061
|
+
"runDatasetWith.runConfigName"
|
|
2062
|
+
);
|
|
2063
|
+
return this.startDatasetRun({
|
|
2064
|
+
datasetId: request.datasetId,
|
|
2065
|
+
evaluatorIds: request.evaluatorIds,
|
|
2066
|
+
triggerId: request.triggerId,
|
|
2067
|
+
maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
|
|
2068
|
+
repetitions: request.repetitions,
|
|
2069
|
+
runConfigName,
|
|
2070
|
+
runConfigTags: request.runConfigTags
|
|
2071
|
+
});
|
|
2072
|
+
}
|
|
2073
|
+
async startDatasetRun(params) {
|
|
1755
2074
|
if (this.datasetsById.size === 0) {
|
|
1756
2075
|
await this.collectDatasets();
|
|
1757
2076
|
}
|
|
1758
2077
|
if (this.evaluatorsById.size === 0) {
|
|
1759
2078
|
await this.collectEvaluators();
|
|
1760
2079
|
}
|
|
1761
|
-
const dataset = this.datasetsById.get(
|
|
2080
|
+
const dataset = this.datasetsById.get(params.datasetId);
|
|
1762
2081
|
if (!dataset) {
|
|
1763
|
-
throw new Error(`Unknown dataset: ${
|
|
2082
|
+
throw new Error(`Unknown dataset: ${params.datasetId}`);
|
|
1764
2083
|
}
|
|
1765
|
-
const selectedEvaluators =
|
|
2084
|
+
const selectedEvaluators = params.evaluatorIds.map((id) => this.evaluatorsById.get(id)).filter((value) => Boolean(value)).map((value) => ({ id: value.id, evaluator: value.evaluator }));
|
|
1766
2085
|
if (selectedEvaluators.length === 0) {
|
|
1767
2086
|
throw new Error("No evaluators selected for run");
|
|
1768
2087
|
}
|
|
1769
|
-
const selectedTestCases = await this.collectDatasetTestCases(
|
|
1770
|
-
const
|
|
1771
|
-
|
|
1772
|
-
|
|
1773
|
-
)
|
|
1774
|
-
const triggerId = request.triggerId ?? `trg-${randomUUID()}`;
|
|
2088
|
+
const selectedTestCases = await this.collectDatasetTestCases(params.datasetId);
|
|
2089
|
+
const repetitions = normalizeRunRepetitions(params.repetitions);
|
|
2090
|
+
const totalEvaluations = selectedTestCases.length * repetitions;
|
|
2091
|
+
const runConfigTags = [...params.runConfigTags ?? []];
|
|
2092
|
+
const triggerId = params.triggerId ?? `trg-${randomUUID()}`;
|
|
1775
2093
|
const runId = `run-${randomUUID()}`;
|
|
1776
|
-
const artifactPath = createArtifactPath(
|
|
1777
|
-
this.config.artifactDirectory,
|
|
1778
|
-
request.datasetId,
|
|
1779
|
-
runId
|
|
1780
|
-
);
|
|
2094
|
+
const artifactPath = createArtifactPath(this.config.artifactDirectory, params.datasetId, runId);
|
|
1781
2095
|
const snapshot = {
|
|
1782
2096
|
runId,
|
|
1783
|
-
datasetId:
|
|
2097
|
+
datasetId: params.datasetId,
|
|
1784
2098
|
datasetName: dataset.dataset.getName(),
|
|
1785
2099
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
1786
2100
|
queuedAt: Date.now(),
|
|
@@ -1801,7 +2115,7 @@ var EffectRunner = class {
|
|
|
1801
2115
|
const queuedEvent = {
|
|
1802
2116
|
type: "RunQueued",
|
|
1803
2117
|
runId,
|
|
1804
|
-
datasetId:
|
|
2118
|
+
datasetId: params.datasetId,
|
|
1805
2119
|
datasetName: dataset.dataset.getName(),
|
|
1806
2120
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
1807
2121
|
totalTestCases: totalEvaluations,
|
|
@@ -1815,17 +2129,20 @@ var EffectRunner = class {
|
|
|
1815
2129
|
payload: queuedEvent
|
|
1816
2130
|
})
|
|
1817
2131
|
);
|
|
1818
|
-
const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
|
|
1819
2132
|
await Effect.runPromise(
|
|
1820
2133
|
Queue.offer(this.runQueue, {
|
|
1821
2134
|
runId,
|
|
1822
2135
|
triggerId,
|
|
1823
|
-
datasetId:
|
|
2136
|
+
datasetId: params.datasetId,
|
|
1824
2137
|
dataset: dataset.dataset,
|
|
1825
2138
|
evaluators: selectedEvaluators,
|
|
1826
2139
|
testCases: selectedTestCases,
|
|
1827
2140
|
snapshot,
|
|
1828
|
-
maxConcurrency
|
|
2141
|
+
maxConcurrency: params.maxConcurrency,
|
|
2142
|
+
globalEvaluationSemaphore: params.globalEvaluationSemaphore,
|
|
2143
|
+
runConfigName: params.runConfigName,
|
|
2144
|
+
runConfigTags,
|
|
2145
|
+
repetitions
|
|
1829
2146
|
})
|
|
1830
2147
|
);
|
|
1831
2148
|
return snapshot;
|
|
@@ -1841,9 +2158,9 @@ var EffectRunner = class {
|
|
|
1841
2158
|
return Effect.runSync(Ref.get(this.snapshotsRef)).get(runId);
|
|
1842
2159
|
}
|
|
1843
2160
|
getAllRunSnapshots() {
|
|
1844
|
-
return Array.from(
|
|
1845
|
-
|
|
1846
|
-
)
|
|
2161
|
+
return Array.from(Effect.runSync(Ref.get(this.snapshotsRef)).values()).sort(
|
|
2162
|
+
(a, b) => b.queuedAt - a.queuedAt
|
|
2163
|
+
);
|
|
1847
2164
|
}
|
|
1848
2165
|
async loadRunSnapshotsFromArtifacts() {
|
|
1849
2166
|
return loadRunSnapshotsFromArtifacts(this.config);
|
|
@@ -1897,6 +2214,11 @@ var EffectRunner = class {
|
|
|
1897
2214
|
}
|
|
1898
2215
|
};
|
|
1899
2216
|
|
|
1900
|
-
|
|
2217
|
+
// src/runner/events.ts
|
|
2218
|
+
// Constant run-config descriptor whose only field is the name "programmatic".
// NOTE(review): presumably the identity used for runs started directly from
// code rather than from a discovered RunConfig file — confirm against callers.
var PROGRAMMATIC_RUN_CONFIG = {
  runConfigName: "programmatic"
};
|
|
2221
|
+
|
|
2222
|
+
export { Dataset, Evaluator, EvaluatorNameSchema, Metric, PROGRAMMATIC_RUN_CONFIG, RunConfig, RunConfigNameSchema, Score, TagSet, TestCase, TestCaseNameSchema, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, deltaScore, formatScoreData, getEvaluatorDisplayLabel, getEvaluatorTagList, getLogLines, getMetricById, getScoreById, getTestCaseDisplayLabel, getTestCaseTagList, latencyMetric, loadMockData, loadRunnerData, normalizeOptionalDisplayName, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, validateEvaluatorName, validateRunConfigName, validateTestCaseName, withRunnerConfig };
|
|
1901
2223
|
//# sourceMappingURL=out.js.map
|
|
1902
2224
|
//# sourceMappingURL=index.js.map
|