@m4trix/evals 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +1075 -0
- package/dist/cli-simple.cjs.map +1 -0
- package/dist/cli-simple.d.cts +1 -0
- package/dist/cli-simple.d.ts +1 -0
- package/dist/cli-simple.js +1072 -0
- package/dist/cli-simple.js.map +1 -0
- package/dist/cli.cjs +1981 -0
- package/dist/cli.cjs.map +1 -0
- package/dist/cli.d.cts +1 -0
- package/dist/cli.d.ts +1 -0
- package/dist/cli.js +1974 -0
- package/dist/cli.js.map +1 -0
- package/dist/index.cjs +1184 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +347 -0
- package/dist/index.d.ts +347 -0
- package/dist/index.js +1165 -0
- package/dist/index.js.map +1 -0
- package/package.json +53 -0
package/dist/index.js
ADDED
|
@@ -0,0 +1,1165 @@
|
|
|
1
|
+
import { randomUUID } from 'crypto';
|
|
2
|
+
import { Effect, PubSub, Queue, Fiber } from 'effect';
|
|
3
|
+
import { mkdir, appendFile, readdir } from 'fs/promises';
|
|
4
|
+
import { relative, join, dirname, resolve as resolve$1 } from 'path';
|
|
5
|
+
import { pathToFileURL } from 'url';
|
|
6
|
+
|
|
7
|
+
// src/cli/data.mock.json
|
|
8
|
+
var data_mock_default = {
|
|
9
|
+
datasets: [
|
|
10
|
+
{
|
|
11
|
+
id: "onboarding-flows",
|
|
12
|
+
name: "onboarding-flows",
|
|
13
|
+
overview: "Evaluate first-user journeys and schema compliance for generated onboarding payloads.",
|
|
14
|
+
runs: [
|
|
15
|
+
{
|
|
16
|
+
id: "run_2026-02-17_2044",
|
|
17
|
+
label: "2026-02-17 20:44",
|
|
18
|
+
status: "FAILED",
|
|
19
|
+
performance: {
|
|
20
|
+
passRate: 96,
|
|
21
|
+
avgScore: 0.91,
|
|
22
|
+
latencyP95Ms: 710,
|
|
23
|
+
latencyAvgMs: 502,
|
|
24
|
+
tokensAvg: 171,
|
|
25
|
+
tokensP95: 230,
|
|
26
|
+
costUsd: 24e-4,
|
|
27
|
+
latencyHistoryMs: [380, 420, 510, 480, 550, 620, 590, 710, 520, 480, 530, 600]
|
|
28
|
+
},
|
|
29
|
+
dimensions: [
|
|
30
|
+
{ name: "correctness", score: 82 },
|
|
31
|
+
{ name: "faithfulness", score: 79 },
|
|
32
|
+
{ name: "brevity", score: 68 },
|
|
33
|
+
{ name: "style", score: 90 }
|
|
34
|
+
],
|
|
35
|
+
checks: [
|
|
36
|
+
{ name: "json_schema", passed: false, detail: "3 violations" },
|
|
37
|
+
{ name: "tool_calls", passed: true, detail: "0 unexpected" },
|
|
38
|
+
{ name: "pii_leak", passed: true },
|
|
39
|
+
{ name: "jailbreak", passed: true }
|
|
40
|
+
],
|
|
41
|
+
failures: [
|
|
42
|
+
{ title: "product_parser \u203A conforms to schema (price: string)" },
|
|
43
|
+
{ title: "checkout \u203A tool-call count mismatch" }
|
|
44
|
+
],
|
|
45
|
+
meta: {
|
|
46
|
+
model: "gpt-4o-mini",
|
|
47
|
+
provider: "OpenAI",
|
|
48
|
+
commit: "2f3c1a9",
|
|
49
|
+
branch: "main",
|
|
50
|
+
seed: 42,
|
|
51
|
+
concurrency: 4,
|
|
52
|
+
duration: "00:01:12",
|
|
53
|
+
artifact: "./eval-results/run_2026-02-17.jsonl"
|
|
54
|
+
}
|
|
55
|
+
},
|
|
56
|
+
{
|
|
57
|
+
id: "run_2026-02-16_1112",
|
|
58
|
+
label: "2026-02-16 11:12",
|
|
59
|
+
status: "PASS",
|
|
60
|
+
performance: {
|
|
61
|
+
passRate: 99,
|
|
62
|
+
avgScore: 0.95,
|
|
63
|
+
latencyP95Ms: 650,
|
|
64
|
+
latencyAvgMs: 488,
|
|
65
|
+
tokensAvg: 168,
|
|
66
|
+
tokensP95: 220,
|
|
67
|
+
costUsd: 2e-3,
|
|
68
|
+
latencyHistoryMs: [420, 450, 480, 460, 520, 490, 510, 650, 440, 470, 500, 480]
|
|
69
|
+
},
|
|
70
|
+
dimensions: [
|
|
71
|
+
{ name: "correctness", score: 89 },
|
|
72
|
+
{ name: "faithfulness", score: 88 },
|
|
73
|
+
{ name: "brevity", score: 72 },
|
|
74
|
+
{ name: "style", score: 93 }
|
|
75
|
+
],
|
|
76
|
+
checks: [
|
|
77
|
+
{ name: "json_schema", passed: true, detail: "0 violations" },
|
|
78
|
+
{ name: "tool_calls", passed: true, detail: "0 unexpected" },
|
|
79
|
+
{ name: "pii_leak", passed: true },
|
|
80
|
+
{ name: "jailbreak", passed: true }
|
|
81
|
+
],
|
|
82
|
+
failures: [],
|
|
83
|
+
meta: {
|
|
84
|
+
model: "gpt-4o-mini",
|
|
85
|
+
provider: "OpenAI",
|
|
86
|
+
commit: "0d24f8f",
|
|
87
|
+
branch: "main",
|
|
88
|
+
seed: 42,
|
|
89
|
+
concurrency: 4,
|
|
90
|
+
duration: "00:01:06",
|
|
91
|
+
artifact: "./eval-results/run_2026-02-16.jsonl"
|
|
92
|
+
}
|
|
93
|
+
},
|
|
94
|
+
{
|
|
95
|
+
id: "run_2026-02-15_0921",
|
|
96
|
+
label: "2026-02-15 09:21",
|
|
97
|
+
status: "PASS",
|
|
98
|
+
performance: {
|
|
99
|
+
passRate: 98,
|
|
100
|
+
avgScore: 0.93,
|
|
101
|
+
latencyP95Ms: 680,
|
|
102
|
+
latencyAvgMs: 495,
|
|
103
|
+
tokensAvg: 175,
|
|
104
|
+
tokensP95: 235,
|
|
105
|
+
costUsd: 22e-4,
|
|
106
|
+
latencyHistoryMs: [450, 480, 520, 490, 550, 580, 620, 680, 510, 470, 530, 560]
|
|
107
|
+
},
|
|
108
|
+
dimensions: [
|
|
109
|
+
{ name: "correctness", score: 86 },
|
|
110
|
+
{ name: "faithfulness", score: 84 },
|
|
111
|
+
{ name: "brevity", score: 70 },
|
|
112
|
+
{ name: "style", score: 91 }
|
|
113
|
+
],
|
|
114
|
+
checks: [
|
|
115
|
+
{ name: "json_schema", passed: true, detail: "0 violations" },
|
|
116
|
+
{ name: "tool_calls", passed: true, detail: "0 unexpected" },
|
|
117
|
+
{ name: "pii_leak", passed: true },
|
|
118
|
+
{ name: "jailbreak", passed: true }
|
|
119
|
+
],
|
|
120
|
+
failures: [],
|
|
121
|
+
meta: {
|
|
122
|
+
model: "gpt-4o-mini",
|
|
123
|
+
provider: "OpenAI",
|
|
124
|
+
commit: "a1b2c3d",
|
|
125
|
+
branch: "main",
|
|
126
|
+
seed: 42,
|
|
127
|
+
concurrency: 4,
|
|
128
|
+
duration: "00:01:08",
|
|
129
|
+
artifact: "./eval-results/run_2026-02-15.jsonl"
|
|
130
|
+
}
|
|
131
|
+
}
|
|
132
|
+
]
|
|
133
|
+
},
|
|
134
|
+
{
|
|
135
|
+
id: "tool-calls",
|
|
136
|
+
name: "tool-calls",
|
|
137
|
+
overview: "Validate function-call conformance and unexpected tool invocation behavior.",
|
|
138
|
+
runs: [
|
|
139
|
+
{
|
|
140
|
+
id: "run_2026-02-14_1530",
|
|
141
|
+
label: "2026-02-14 15:30",
|
|
142
|
+
status: "PASS",
|
|
143
|
+
performance: {
|
|
144
|
+
passRate: 100,
|
|
145
|
+
avgScore: 1,
|
|
146
|
+
latencyP95Ms: 320,
|
|
147
|
+
latencyAvgMs: 280,
|
|
148
|
+
tokensAvg: 45,
|
|
149
|
+
tokensP95: 62,
|
|
150
|
+
costUsd: 8e-4,
|
|
151
|
+
latencyHistoryMs: [250, 270, 290, 280, 310, 320, 265, 290, 300, 275]
|
|
152
|
+
},
|
|
153
|
+
dimensions: [
|
|
154
|
+
{ name: "contract_match", score: 100 },
|
|
155
|
+
{ name: "arg_validity", score: 100 }
|
|
156
|
+
],
|
|
157
|
+
checks: [
|
|
158
|
+
{ name: "tool_calls", passed: true, detail: "0 unexpected" }
|
|
159
|
+
],
|
|
160
|
+
failures: [],
|
|
161
|
+
meta: {
|
|
162
|
+
model: "gpt-4o-mini",
|
|
163
|
+
provider: "OpenAI",
|
|
164
|
+
commit: "e4f5g6h",
|
|
165
|
+
branch: "feat/tools",
|
|
166
|
+
seed: 42,
|
|
167
|
+
concurrency: 8,
|
|
168
|
+
duration: "00:00:45",
|
|
169
|
+
artifact: "./eval-results/tool-calls_2026-02-14.jsonl"
|
|
170
|
+
}
|
|
171
|
+
}
|
|
172
|
+
]
|
|
173
|
+
},
|
|
174
|
+
{
|
|
175
|
+
id: "json-schema",
|
|
176
|
+
name: "json-schema",
|
|
177
|
+
overview: "Stress-test schema fidelity across generated extraction payloads.",
|
|
178
|
+
runs: []
|
|
179
|
+
}
|
|
180
|
+
],
|
|
181
|
+
evaluators: [
|
|
182
|
+
{ id: "json-schema-validator", name: "JSON Schema Validator", configPreview: "strict=true" },
|
|
183
|
+
{ id: "tool-call-contract-checker", name: "Tool-call Contract Checker", configPreview: "unexpectedCalls=error" },
|
|
184
|
+
{ id: "rubric-judge", name: "Rubric Judge (LLM)", configPreview: "model=gpt-4o-mini; scale=0-100" },
|
|
185
|
+
{ id: "pii-leak-detector", name: "PII Leak Detector", configPreview: "redact=false" }
|
|
186
|
+
]
|
|
187
|
+
};
|
|
188
|
+
|
|
189
|
+
// src/cli/state.ts
|
|
190
|
+
/**
 * Returns the bundled mock fixture used when nothing real is available.
 *
 * @returns {object} The module-level `data_mock_default` object itself
 *   (the same reference on every call, not a copy).
 */
function loadMockData() {
  const mockData = data_mock_default;
  return mockData;
}
|
|
193
|
+
/**
 * Converts a string into a lowercase, dash-separated slug.
 *
 * @param {string} input - Text to slugify.
 * @returns {string} Lowercased text where every run of characters outside
 *   `a-z0-9` is collapsed to a single `-`, with leading/trailing dashes removed.
 */
function toSlug(input) {
  const lowered = input.toLowerCase();
  const dashed = lowered.replace(/[^a-z0-9]+/g, "-");
  return dashed.replace(/^-+|-+$/g, "");
}
|
|
196
|
+
/**
 * Maps a runner snapshot onto the run shape used by the CLI views.
 *
 * Ratios are computed against `totalTestCases`, substituting 1 when it is 0
 * so the divisions stay defined. The duration is measured from `startedAt`
 * (or `queuedAt` when the run has not started) to `finishedAt`, or to the
 * current time while the run has no `finishedAt`.
 *
 * @param {object} snapshot - Run snapshot; the fields read here are `runId`,
 *   `status`, `totalTestCases`, `passedTestCases`, `failedTestCases`,
 *   `queuedAt`, `startedAt`, `finishedAt`, `errorMessage` and `artifactPath`.
 * @returns {object} Run record with `id`, `label`, `status`, `performance`,
 *   `dimensions`, `checks`, `failures` and `meta`.
 */
function toEvalRun(snapshot) {
  const total = snapshot.totalTestCases === 0 ? 1 : snapshot.totalTestCases;
  const toPercent = (count) => Math.round(count / total * 100);

  const begin = snapshot.startedAt ?? snapshot.queuedAt;
  const end = snapshot.finishedAt ? snapshot.finishedAt : Date.now();
  const durationMs = end - begin;
  // Only a per-test-case average is available, so it feeds both latency fields.
  const perCaseMs = Math.max(1, Math.floor(durationMs / Math.max(1, total)));

  let status = "RUNNING";
  if (snapshot.status === "completed") {
    status = "PASS";
  } else if (snapshot.status === "failed") {
    status = "FAILED";
  }

  const hasError = Boolean(snapshot.errorMessage) && snapshot.errorMessage.length > 0;

  return {
    id: snapshot.runId,
    label: snapshot.runId.slice(0, 12),
    status,
    performance: {
      passRate: toPercent(snapshot.passedTestCases),
      avgScore: snapshot.passedTestCases / total,
      latencyP95Ms: perCaseMs,
      latencyAvgMs: perCaseMs,
      tokensAvg: 0,
      tokensP95: 0,
      costUsd: 0,
      latencyHistoryMs: [durationMs]
    },
    dimensions: [
      { name: "passed", score: toPercent(snapshot.passedTestCases) },
      { name: "failed", score: toPercent(snapshot.failedTestCases) }
    ],
    checks: [
      {
        name: "run_status",
        passed: snapshot.status === "completed",
        detail: snapshot.status
      }
    ],
    failures: hasError ? [{ title: snapshot.errorMessage }] : [],
    meta: {
      model: "n/a",
      provider: "runner",
      commit: "local",
      branch: "local",
      seed: 0,
      concurrency: 1,
      duration: `${durationMs}ms`,
      artifact: snapshot.artifactPath
    }
  };
}
|
|
239
|
+
/**
 * Builds the CLI dataset record for a discovered dataset, attaching its runs.
 *
 * @param {object} item - Discovered dataset entry; reads `id`, `filePath`
 *   and `dataset.getName()`.
 * @param {object[]} snapshots - All run snapshots; only those whose
 *   `datasetId` equals `item.id` are kept.
 * @returns {object} `{ id, name, overview, runs }`, with runs ordered by
 *   descending `queuedAt` and converted through `toEvalRun`.
 */
function toEvalDataset(item, snapshots) {
  const ownSnapshots = snapshots.filter((snapshot) => snapshot.datasetId === item.id);
  // `filter` returned a fresh array, so sorting in place leaves the caller's list alone.
  ownSnapshots.sort((left, right) => right.queuedAt - left.queuedAt);
  const runs = ownSnapshots.map(toEvalRun);
  const name = item.dataset.getName();
  const overview = `Discovered from ${item.filePath}`;
  return { id: item.id, name, overview, runs };
}
|
|
248
|
+
/**
 * Builds the CLI option record for a discovered evaluator.
 *
 * @param {object} item - Discovered evaluator entry; reads `id`, `filePath`
 *   and `evaluator.getName()`.
 * @returns {object} `{ id, name, configPreview }`. `name` is the evaluator's
 *   own name, or a slug of `item.id` when `getName()` returns null/undefined.
 */
function toEvaluatorOption(item) {
  const declaredName = item.evaluator.getName();
  const option = {
    id: item.id,
    name: declaredName ?? toSlug(item.id),
    configPreview: `Source: ${item.filePath}`
  };
  return option;
}
|
|
255
|
+
/**
 * Loads the datasets and evaluators known to a runner, shaped for the CLI.
 *
 * Datasets and evaluators are collected concurrently. When both collections
 * come back empty the bundled mock data is returned instead.
 *
 * @param {object} runner - Must provide `collectDatasets()`,
 *   `collectEvaluators()` and `getAllRunSnapshots()`.
 * @returns {Promise<object>} `{ datasets, evaluators }`, or the mock data.
 */
async function loadRunnerData(runner) {
  const pending = [runner.collectDatasets(), runner.collectEvaluators()];
  const [datasets, evaluators] = await Promise.all(pending);
  const snapshots = runner.getAllRunSnapshots();

  const nothingDiscovered = datasets.length === 0 && evaluators.length === 0;
  if (nothingDiscovered) {
    return loadMockData();
  }

  const datasetViews = datasets.map((dataset) => toEvalDataset(dataset, snapshots));
  const evaluatorViews = evaluators.map(toEvaluatorOption);
  return { datasets: datasetViews, evaluators: evaluatorViews };
}
|
|
269
|
+
/**
 * Parses CLI startup arguments.
 *
 * Recognizes `--dataset <id>`, `--run <id>` and `--search <text>`. A flag is
 * only consumed when it is followed by a truthy value; otherwise the flag
 * itself is recorded as unknown. Every other token goes to `unknownArgs`.
 *
 * @param {string[]} argv - Argument tokens.
 * @returns {object} `{ unknownArgs, datasetId?, runId?, search? }`.
 */
function parseStartupArgs(argv) {
  const valueFlags = new Map([
    ["--dataset", "datasetId"],
    ["--run", "runId"],
    ["--search", "search"]
  ]);
  const args = { unknownArgs: [] };

  let index = 0;
  while (index < argv.length) {
    const token = argv[index];
    const value = argv[index + 1];
    const target = valueFlags.get(token);
    if (target !== undefined && value) {
      args[target] = value;
      // Skip over both the flag and the value it consumed.
      index += 2;
    } else {
      args.unknownArgs.push(token);
      index += 1;
    }
  }
  return args;
}
|
|
292
|
+
|
|
293
|
+
// src/evals/test-case.ts
|
|
294
|
+
/**
 * Unwraps a lazily-provided value.
 *
 * @param {*} value - Either a plain value or a zero-argument function producing one.
 * @returns {*} The result of calling `value` when it is a function, otherwise `value`.
 */
function resolve(value) {
  if (typeof value === "function") {
    return value();
  }
  return value;
}
|
|
297
|
+
/**
 * A named test case with tags, an optional input schema and an input that
 * may be given directly or as a factory function.
 */
var TestCase = class _TestCase {
  /**
   * @param {object} config - Stored as-is; prefer `TestCase.describe`.
   */
  constructor(config) {
    this._config = config;
  }
  /**
   * Creates a test case from `name`, `tags`, `inputSchema` and `input`.
   * Other keys on `config` are not carried over.
   *
   * @param {object} config
   * @returns {TestCase}
   */
  static describe(config) {
    return new _TestCase({
      name: config.name,
      // Default to an empty list so tag matching can always iterate the tags.
      tags: config.tags ?? [],
      inputSchema: config.inputSchema,
      input: config.input
    });
  }
  /** @returns {string} The configured name. */
  getName() {
    return this._config.name;
  }
  /** @returns {Array} The configured tags, or an empty array when none were set. */
  getTags() {
    return this._config.tags ?? [];
  }
  /** @returns {*} The configured input schema (may be undefined). */
  getInputSchema() {
    return this._config.inputSchema;
  }
  /**
   * @returns {*} The input; when it was configured as a function, the
   *   function is invoked on every call and its result returned.
   */
  getInput() {
    return resolve(this._config.input);
  }
};
|
|
322
|
+
|
|
323
|
+
// src/evals/evaluator.ts
|
|
324
|
+
/**
 * Immutable builder describing an evaluator: its schemas, middlewares,
 * evaluate function and pass rules. Every builder method returns a new
 * instance instead of mutating the current one.
 */
var Evaluator = class _Evaluator {
  /**
   * @param {object} config - Stored as-is; `middlewares` may be omitted.
   */
  constructor(config) {
    this._config = config;
  }
  /**
   * @returns {object} A flat copy of the configuration. `middlewares` is
   *   always an array (empty when none were configured).
   */
  getState() {
    return {
      name: this._config.name,
      inputSchema: this._config.inputSchema,
      outputSchema: this._config.outputSchema,
      scoreSchema: this._config.scoreSchema,
      middlewares: this.getMiddlewares(),
      evaluateFn: this._config.evaluateFn,
      passThreshold: this._config.passThreshold,
      passCriterion: this._config.passCriterion
    };
  }
  /**
   * Starts a new evaluator with a single middleware.
   *
   * @param {object} middleware
   * @returns {Evaluator}
   */
  static use(middleware) {
    return new _Evaluator({
      middlewares: [middleware]
    });
  }
  /**
   * @param {object} middleware - Appended after the existing middlewares.
   * @returns {Evaluator} A copy with the extra middleware.
   */
  use(middleware) {
    const state = this.getState();
    return new _Evaluator({
      ...state,
      middlewares: [...state.middlewares, middleware]
    });
  }
  /**
   * Sets name, schemas and pass rules, keeping the current middlewares.
   * Note: an `evaluateFn` set earlier is not carried into the result.
   *
   * @param {object} config
   * @returns {Evaluator}
   */
  define(config) {
    const { middlewares } = this.getState();
    return new _Evaluator({
      name: config.name,
      inputSchema: config.inputSchema,
      outputSchema: config.outputSchema,
      scoreSchema: config.scoreSchema,
      middlewares,
      passThreshold: config.passThreshold,
      passCriterion: config.passCriterion
    });
  }
  /**
   * @param {Function} fn - The evaluate function.
   * @returns {Evaluator} A copy with `evaluateFn` set to `fn`.
   */
  evaluate(fn) {
    return new _Evaluator({
      ...this.getState(),
      evaluateFn: fn
    });
  }
  getName() {
    return this._config.name;
  }
  getInputSchema() {
    return this._config.inputSchema;
  }
  getOutputSchema() {
    return this._config.outputSchema;
  }
  getScoreSchema() {
    return this._config.scoreSchema;
  }
  /** @returns {Array} Configured middlewares; an empty array when none were set. */
  getMiddlewares() {
    // Without this fallback, use() and resolveContext() would throw a
    // TypeError for an evaluator constructed without a middleware list.
    return this._config.middlewares ?? [];
  }
  getEvaluateFn() {
    return this._config.evaluateFn;
  }
  getPassThreshold() {
    return this._config.passThreshold;
  }
  getPassCriterion() {
    return this._config.passCriterion;
  }
  /**
   * Calls `resolve()` on every middleware concurrently and shallow-merges
   * the results in middleware order, so later middlewares win on key clashes.
   *
   * @returns {Promise<object>} The merged context (`{}` without middlewares).
   */
  async resolveContext() {
    const parts = await Promise.all(
      this.getMiddlewares().map((mw) => mw.resolve())
    );
    return Object.assign({}, ...parts);
  }
};
|
|
401
|
+
|
|
402
|
+
// src/evals/dataset.ts
|
|
403
|
+
/**
 * Tests whether a value matches at least one matcher.
 *
 * @param {string} value - Value to test.
 * @param {Array<string|RegExp>} matchers - Strings are compared with `===`;
 *   anything else is used through its `test` method.
 * @returns {boolean} True when any matcher accepts `value`.
 */
function matchesAny(value, matchers) {
  return matchers.some((matcher) => {
    if (typeof matcher === "string") {
      return value === matcher;
    }
    // A global/sticky RegExp remembers `lastIndex` between calls; reset it so
    // the same matcher gives the same answer for every value it is tested on.
    matcher.lastIndex = 0;
    return matcher.test(value);
  });
}
|
|
408
|
+
/**
 * Tests whether a file path matches at least one path matcher.
 *
 * @param {string} filePath - Path to test.
 * @param {Array<string|RegExp>} matchers - Strings are treated as glob
 *   patterns (via `simpleGlobMatch`); anything else is used through `test`.
 * @returns {boolean} True when any matcher accepts `filePath`.
 */
function matchesAnyPath(filePath, matchers) {
  return matchers.some((matcher) => {
    if (typeof matcher === "string") {
      return simpleGlobMatch(matcher, filePath);
    }
    // A global/sticky RegExp remembers `lastIndex` between calls; reset it so
    // the same matcher gives the same answer for every path it is tested on.
    matcher.lastIndex = 0;
    return matcher.test(filePath);
  });
}
|
|
416
|
+
/**
 * Matches a value against a minimal glob pattern.
 *
 * Supported syntax:
 *   - `**` followed by `/` : zero or more leading directories
 *   - `**`                 : any characters, including `/`
 *   - `*`                  : any characters except `/`
 *   - `?`                  : exactly one character except `/`
 * Every other character matches literally. The whole value must match.
 *
 * @param {string} pattern - Glob pattern.
 * @param {string} value - Value (typically a `/`-separated path) to test.
 * @returns {boolean} True when `value` matches `pattern` in full.
 */
function simpleGlobMatch(pattern, value) {
  // Translate in a single pass. Chaining separate replace() calls would
  // re-process the regex text emitted by earlier steps (e.g. the `*` inside
  // an emitted `.*`), which breaks `**` matching across directories.
  const source = pattern.replace(
    /\*\*\/|\*\*|\*|\?|[.+^${}()|[\]\\]/g,
    (token) => {
      switch (token) {
        case "**/":
          return "(?:.*/)?";
        case "**":
          return ".*";
        case "*":
          return "[^/]*";
        case "?":
          return "[^/]";
        default:
          // Regex metacharacter: escape it so it matches literally.
          return `\\${token}`;
      }
    }
  );
  return new RegExp(`^${source}$`).test(value);
}
|
|
420
|
+
/**
 * A named selection of test cases, defined by tag and path include/exclude
 * matchers.
 */
var Dataset = class _Dataset {
  /**
   * @param {object} config - Stored as-is; prefer `Dataset.define`, which
   *   fills in empty matcher lists.
   */
  constructor(config) {
    this._config = config;
  }
  /**
   * Creates a dataset; every matcher list that is omitted defaults to `[]`.
   *
   * @param {object} config - `name` plus optional `includedTags`,
   *   `excludedTags`, `includedPaths` and `excludedPaths`.
   * @returns {Dataset}
   */
  static define(config) {
    return new _Dataset({
      name: config.name,
      includedTags: config.includedTags ?? [],
      excludedTags: config.excludedTags ?? [],
      includedPaths: config.includedPaths ?? [],
      excludedPaths: config.excludedPaths ?? []
    });
  }
  getName() {
    return this._config.name;
  }
  getIncludedTags() {
    return this._config.includedTags;
  }
  getExcludedTags() {
    return this._config.excludedTags;
  }
  getIncludedPaths() {
    return this._config.includedPaths;
  }
  getExcludedPaths() {
    return this._config.excludedPaths;
  }
  /**
   * Decides whether a test case belongs to this dataset.
   *
   * Exclusions are checked first: a test case with any excluded tag, or at
   * an excluded path, never matches. Otherwise it must satisfy the included
   * tags and the included paths; an empty include list accepts everything.
   *
   * @param {object} testCase - Must provide `getTags()`.
   * @param {string} filePath - Path of the file that declares the test case.
   * @returns {boolean}
   */
  matchesTestCase(testCase, filePath) {
    // A test case declared without tags is treated as having none, rather
    // than failing on `undefined.some(...)`.
    const tags = testCase.getTags() ?? [];
    if (this._config.excludedTags.length > 0) {
      if (tags.some((tag) => matchesAny(tag, this._config.excludedTags))) {
        return false;
      }
    }
    if (this._config.excludedPaths.length > 0) {
      if (matchesAnyPath(filePath, this._config.excludedPaths)) {
        return false;
      }
    }
    const tagMatch = this._config.includedTags.length === 0 || tags.some((tag) => matchesAny(tag, this._config.includedTags));
    const pathMatch = this._config.includedPaths.length === 0 || matchesAnyPath(filePath, this._config.includedPaths);
    return tagMatch && pathMatch;
  }
};
|
|
465
|
+
|
|
466
|
+
// src/evals/metric.ts
|
|
467
|
+
// Metric definitions registered so far, keyed by metric id.
var registry = /* @__PURE__ */ new Map();

var Metric = {
  /**
   * Creates a metric definition and registers it under `config.id`,
   * replacing any definition previously registered with the same id.
   *
   * @param {object} config - `{ id, name, format }`.
   * @returns {object} `{ id, name, format, make }`, where `make(data)`
   *   produces `{ id, data }`.
   */
  of(config) {
    const make = (data) => {
      return { id: config.id, data };
    };
    const definition = {
      id: config.id,
      name: config.name,
      format: config.format,
      make
    };
    registry.set(config.id, definition);
    return definition;
  }
};

/**
 * Looks up a registered metric definition.
 *
 * @param {string} id - Metric id.
 * @returns {object|undefined} The definition, or undefined when not registered.
 */
function getMetricById(id) {
  const definition = registry.get(id);
  return definition;
}
|
|
483
|
+
|
|
484
|
+
// src/evals/score.ts
|
|
485
|
+
// Score definitions registered so far, keyed by score id.
var registry2 = /* @__PURE__ */ new Map();

var Score = {
  /**
   * Creates a score definition and registers it under `config.id`,
   * replacing any definition previously registered with the same id.
   *
   * @param {object} config - `{ id, name, displayStrategy, format }`.
   * @returns {object} The definition, including `make(data, options)`.
   *   `make` produces `{ id, data }` and adds `passed` only when
   *   `options.definePassed` is supplied and returns something other than
   *   undefined.
   */
  of(config) {
    const make = (data, options) => {
      const made = { id: config.id, data };
      if (options?.definePassed !== void 0) {
        const passed = options.definePassed(data);
        if (passed !== void 0) {
          made.passed = passed;
        }
      }
      return made;
    };
    const definition = {
      id: config.id,
      name: config.name,
      displayStrategy: config.displayStrategy,
      format: config.format,
      make
    };
    registry2.set(config.id, definition);
    return definition;
  }
};

/**
 * Looks up a registered score definition.
 *
 * @param {string} id - Score id.
 * @returns {object|undefined} The definition, or undefined when not registered.
 */
function getScoreById(id) {
  const definition = registry2.get(id);
  return definition;
}
|
|
509
|
+
|
|
510
|
+
// src/evals/metrics/standard.ts
|
|
511
|
+
/**
 * Token usage metric. Formats as `in:<input> out:<output> cached:<n>`, where
 * `cached` is `inputCached + outputCached`; null/undefined fields count as 0.
 */
var tokenCountMetric = Metric.of({
  id: "token-count",
  name: "Tokens",
  format: (data) => {
    const { input, output, inputCached, outputCached } = data;
    const cachedTotal = (inputCached ?? 0) + (outputCached ?? 0);
    return `in:${input ?? 0} out:${output ?? 0} cached:${cachedTotal}`;
  }
});

/** Latency metric. Formats `data.ms` followed by the `ms` suffix. */
var latencyMetric = Metric.of({
  id: "latency",
  name: "Latency",
  format: (data) => {
    return `${data.ms}ms`;
  }
});
|
|
528
|
+
|
|
529
|
+
// src/evals/scores/standard.ts
|
|
530
|
+
/** Numeric score shown as a bar. Formats `data.value` with two decimals. */
var percentScore = Score.of({
  id: "percent",
  name: "Score",
  displayStrategy: "bar",
  format: (data) => {
    return data.value.toFixed(2);
  }
});

/** Pass/fail score. Formats as "PASSED" when `data.passed` is truthy, else "NOT PASSED". */
var binaryScore = Score.of({
  id: "binary",
  name: "Result",
  displayStrategy: "passFail",
  format: (data) => {
    if (data.passed) {
      return "PASSED";
    }
    return "NOT PASSED";
  }
});
|
|
542
|
+
|
|
543
|
+
// src/runner/config.ts
|
|
544
|
+
/**
 * Runner defaults: where and how to discover dataset/evaluator/test-case
 * files, and the directory run artifacts are written to.
 */
var defaultRunnerConfig = {
  discovery: {
    // Captured once, when this module is evaluated.
    rootDir: process.cwd(),
    // A file is picked up when its path ends with one of these suffixes.
    datasetSuffixes: [
      ".dataset.ts",
      ".dataset.tsx",
      ".dataset.js",
      ".dataset.mjs"
    ],
    evaluatorSuffixes: [".evaluator.ts", ".evaluator.tsx", ".evaluator.js", ".evaluator.mjs"],
    testCaseSuffixes: [".test-case.ts", ".test-case.tsx", ".test-case.js", ".test-case.mjs"],
    // Directory names that are skipped while walking the tree.
    excludeDirectories: [
      "node_modules",
      "dist",
      ".next",
      ".git",
      ".pnpm-store"
    ]
  },
  artifactDirectory: ".eval-results"
};
|
|
564
|
+
/**
 * Merges user overrides over the default runner configuration.
 *
 * Top-level keys are merged shallowly, and `discovery` is additionally
 * merged key-by-key so a partial `discovery` override keeps the other
 * defaults.
 *
 * @param {object} [overrides] - Partial configuration.
 * @returns {object} The merged configuration. When `overrides` is falsy the
 *   shared `defaultRunnerConfig` object itself is returned (not a copy).
 */
function withRunnerConfig(overrides) {
  if (!overrides) {
    return defaultRunnerConfig;
  }

  let discovery = defaultRunnerConfig.discovery;
  if (overrides.discovery) {
    discovery = { ...defaultRunnerConfig.discovery, ...overrides.discovery };
  }

  const merged = { ...defaultRunnerConfig, ...overrides, discovery };
  return merged;
}
|
|
578
|
+
var jitiLoader;
|
|
579
|
+
/**
 * Builds a slug-style identifier for a discovered item.
 *
 * @param {string} prefix - Kind of item (e.g. "dataset").
 * @param {string} filePath - Used as the basis when `name` is missing or blank.
 * @param {string} [name] - Preferred basis for the identifier.
 * @returns {string} `<prefix>:<name or filePath>` lowercased, with every run
 *   of characters outside `a-z0-9` collapsed to `-` and outer dashes trimmed.
 */
function toId(prefix, filePath, name) {
  const hasName = Boolean(name) && name.trim().length > 0;
  const stable = hasName ? name : filePath;
  const raw = `${prefix}:${stable}`.toLowerCase();
  const dashed = raw.replace(/[^a-z0-9]+/g, "-");
  return dashed.replace(/^-+|-+$/g, "");
}
|
|
583
|
+
/**
 * Checks whether a value is a non-null object exposing a callable member.
 *
 * @param {*} value - Candidate value.
 * @param {string} methodName - Member name to look for.
 * @returns {boolean} True when `value` is an object (not a function or
 *   primitive) and `value[methodName]` is a function.
 */
function hasMethod(value, methodName) {
  if (value === null || typeof value !== "object") {
    return false;
  }
  if (!(methodName in value)) {
    return false;
  }
  return typeof value[methodName] === "function";
}
|
|
586
|
+
/**
 * Duck-type check for dataset exports.
 *
 * @param {*} value - Candidate export.
 * @returns {boolean} True when `value` has both `getName` and
 *   `matchesTestCase` methods.
 */
function isDatasetLike(value) {
  const requiredMethods = ["getName", "matchesTestCase"];
  return requiredMethods.every((methodName) => hasMethod(value, methodName));
}
|
|
589
|
+
/**
 * Duck-type check for evaluator exports.
 *
 * @param {*} value - Candidate export.
 * @returns {boolean} True when `value` has `getName`, `resolveContext` and
 *   `getEvaluateFn` methods.
 */
function isEvaluatorLike(value) {
  const requiredMethods = ["getName", "resolveContext", "getEvaluateFn"];
  return requiredMethods.every((methodName) => hasMethod(value, methodName));
}
|
|
592
|
+
/**
 * Duck-type check for test-case exports.
 *
 * @param {*} value - Candidate export.
 * @returns {boolean} True when `value` has `getName`, `getTags` and
 *   `getInput` methods.
 */
function isTestCaseLike(value) {
  const requiredMethods = ["getName", "getTags", "getInput"];
  return requiredMethods.every((methodName) => hasMethod(value, methodName));
}
|
|
595
|
+
/**
 * Recursively lists the files under a directory.
 *
 * Sub-directories are traversed concurrently. Directories whose name is
 * listed in `excludeDirectories` are skipped, and directories that cannot be
 * read are silently ignored (best-effort discovery). Only entries reported
 * as regular files are returned.
 *
 * @param {string} rootDir - Directory to start from.
 * @param {string[]} excludeDirectories - Directory names (not paths) to skip.
 * @returns {Promise<string[]>} Absolute file paths in ascending string order.
 */
async function walkDirectory(rootDir, excludeDirectories) {
  const out = [];
  async function walk(currentDir) {
    let entries;
    try {
      entries = await readdir(currentDir, { withFileTypes: true });
    } catch {
      // Unreadable or missing directory: skip it instead of failing the walk.
      return;
    }
    await Promise.all(
      entries.map(async (entry) => {
        const absolute = resolve$1(currentDir, entry.name);
        if (entry.isDirectory()) {
          if (excludeDirectories.includes(entry.name)) {
            return;
          }
          await walk(absolute);
          return;
        }
        if (entry.isFile()) {
          out.push(absolute);
        }
      })
    );
  }
  await walk(rootDir);
  // Concurrent traversal appends in whatever order the I/O completes; sort so
  // callers see the same order on every run.
  out.sort();
  return out;
}
|
|
623
|
+
/**
 * Checks whether a path ends with any of the given suffixes.
 *
 * @param {string} filePath - Path to test.
 * @param {string[]} suffixes - Candidate suffixes.
 * @returns {boolean} True when at least one suffix matches the end of `filePath`.
 */
function hasOneSuffix(filePath, suffixes) {
  for (const suffix of suffixes) {
    if (filePath.endsWith(suffix)) {
      return true;
    }
  }
  return false;
}
|
|
626
|
+
/**
 * Loads a module from disk and returns the values of all its exports.
 *
 * Paths ending in `.ts`/`.tsx` are loaded through a jiti loader that is
 * created on first use and kept in the module-level `jitiLoader`; every
 * other path is loaded with a native dynamic `import()` of its file URL.
 *
 * @param {string} filePath - Path of the module to load.
 * @returns {Promise<Array>} `Object.values` of the loaded module.
 * @throws {Error} When the `jiti` package exposes neither `createJiti` nor a
 *   default export.
 */
async function loadModuleExports(filePath) {
  const isTypeScript = filePath.endsWith(".ts") || filePath.endsWith(".tsx");
  if (!isTypeScript) {
    const namespace = await import(pathToFileURL(filePath).href);
    return Object.values(namespace);
  }

  if (!jitiLoader) {
    const jitiModule = await import('jiti');
    const createJiti = jitiModule.createJiti ?? jitiModule.default;
    if (!createJiti) {
      throw new Error("Failed to initialize jiti TypeScript loader");
    }
    jitiLoader = createJiti(import.meta.url, {
      interopDefault: true,
      moduleCache: true
    });
  }

  // Use the loader's `import` method when it has one; otherwise call the
  // loader directly and wrap its result in a promise.
  let loaded;
  if (jitiLoader.import) {
    loaded = await jitiLoader.import(filePath);
  } else {
    loaded = await Promise.resolve(jitiLoader(filePath));
  }
  return Object.values(loaded);
}
|
|
646
|
+
/**
 * Discovers dataset definitions under the configured root directory.
 *
 * Every file whose path ends with one of `config.datasetSuffixes` is loaded,
 * and each export that looks like a dataset is returned.
 *
 * @param {object} config - Discovery settings; reads `rootDir`,
 *   `excludeDirectories` and `datasetSuffixes`.
 * @returns {Promise<object[]>} Entries of the form `{ id, filePath, dataset }`,
 *   with `filePath` relative to `rootDir`.
 */
async function collectDatasetsFromFiles(config) {
  const allFiles = await walkDirectory(config.rootDir, config.excludeDirectories);
  const datasetFiles = allFiles.filter(
    (candidate) => hasOneSuffix(candidate, config.datasetSuffixes)
  );

  const loadOne = async (absolutePath) => {
    const moduleExports = await loadModuleExports(absolutePath);
    const relPath = relative(config.rootDir, absolutePath);
    return moduleExports.filter(isDatasetLike).map((dataset) => {
      const id = toId("dataset", relPath, dataset.getName());
      return { id, filePath: relPath, dataset };
    });
  };

  const perFile = await Promise.all(datasetFiles.map((file) => loadOne(file)));
  return perFile.flat();
}
|
|
665
|
+
/**
 * Discovers evaluator definitions under the configured root directory.
 *
 * Every file whose path ends with one of `config.evaluatorSuffixes` is
 * loaded, and each export that looks like an evaluator is returned.
 *
 * @param {object} config - Discovery settings; reads `rootDir`,
 *   `excludeDirectories` and `evaluatorSuffixes`.
 * @returns {Promise<object[]>} Entries of the form
 *   `{ id, filePath, evaluator }`, with `filePath` relative to `rootDir`.
 */
async function collectEvaluatorsFromFiles(config) {
  const allFiles = await walkDirectory(config.rootDir, config.excludeDirectories);
  const evaluatorFiles = allFiles.filter(
    (candidate) => hasOneSuffix(candidate, config.evaluatorSuffixes)
  );

  const loadOne = async (absolutePath) => {
    const moduleExports = await loadModuleExports(absolutePath);
    const relPath = relative(config.rootDir, absolutePath);
    return moduleExports.filter(isEvaluatorLike).map((evaluator) => {
      const id = toId("evaluator", relPath, evaluator.getName());
      return { id, filePath: relPath, evaluator };
    });
  };

  const perFile = await Promise.all(evaluatorFiles.map((file) => loadOne(file)));
  return perFile.flat();
}
|
|
684
|
+
/**
 * Discovers test-case definitions under the configured root directory.
 *
 * Every file whose path ends with one of `config.testCaseSuffixes` is
 * loaded, and each export that looks like a test case is returned.
 *
 * @param {object} config - Discovery settings; reads `rootDir`,
 *   `excludeDirectories` and `testCaseSuffixes`.
 * @returns {Promise<object[]>} Entries of the form
 *   `{ id, filePath, testCase }`, with `filePath` relative to `rootDir`.
 */
async function collectTestCasesFromFiles(config) {
  const allFiles = await walkDirectory(config.rootDir, config.excludeDirectories);
  const testCaseFiles = allFiles.filter(
    (candidate) => hasOneSuffix(candidate, config.testCaseSuffixes)
  );

  const loadOne = async (absolutePath) => {
    const moduleExports = await loadModuleExports(absolutePath);
    const relPath = relative(config.rootDir, absolutePath);
    return moduleExports.filter(isTestCaseLike).map((testCase) => {
      const id = toId("test-case", relPath, testCase.getName());
      return { id, filePath: relPath, testCase };
    });
  };

  const perFile = await Promise.all(testCaseFiles.map((file) => loadOne(file)));
  return perFile.flat();
}
|
|
703
|
+
|
|
704
|
+
// src/runner/score-utils.ts
|
|
705
|
+
/**
 * Extracts a single number from a list of score items.
 *
 * Items are examined in order and the first one that yields a number wins.
 * For an item whose registered definition uses the "bar" display strategy,
 * a finite `data.value` is used directly; otherwise (or when that value is
 * not a finite number) the item's data goes through `toNumericScore`.
 *
 * @param {Array<{id: string, data: *}>} scores - Score items.
 * @returns {number|undefined} The first number found, or undefined.
 */
function toNumericScoreFromScores(scores) {
  for (const item of scores) {
    const definition = getScoreById(item.id);
    const data = item.data;
    const isBarWithValue = definition && definition.displayStrategy === "bar" && typeof data === "object" && data !== null && "value" in data;
    if (isBarWithValue) {
      const barValue = data.value;
      if (typeof barValue === "number" && Number.isFinite(barValue)) {
        return barValue;
      }
    }
    const fallback = toNumericScore(data);
    if (fallback !== void 0) {
      return fallback;
    }
  }
  return void 0;
}
|
|
721
|
+
/**
 * Reduces an arbitrary score payload to a single number.
 *
 * - A finite number is returned unchanged.
 * - An object with a finite numeric `score` yields that `score`.
 * - Any other object yields the mean of its finite numeric property values.
 *
 * @param {*} value - Score payload.
 * @returns {number|undefined} The derived number, or undefined when `value`
 *   is neither a finite number nor an object holding finite numbers.
 */
function toNumericScore(value) {
  const isFiniteNumber = (candidate) => typeof candidate === "number" && Number.isFinite(candidate);

  if (isFiniteNumber(value)) {
    return value;
  }
  if (value === null || typeof value !== "object") {
    return void 0;
  }
  if ("score" in value && isFiniteNumber(value.score)) {
    return value.score;
  }

  const numbers = Object.values(value).filter((entry) => isFiniteNumber(entry));
  if (numbers.length === 0) {
    return void 0;
  }
  let sum = 0;
  for (const entry of numbers) {
    sum += entry;
  }
  return sum / numbers.length;
}
|
|
740
|
+
|
|
741
|
+
// src/runner/execution.ts
|
|
742
|
+
/**
 * Decides whether an evaluator's result counts as passed.
 *
 * Rules, in priority order:
 *   1. If any score carries an explicit `passed` flag, all flagged scores
 *      must be `true`.
 *   2. Otherwise the evaluator's pass criterion, if set, decides.
 *   3. Otherwise, if a pass threshold is set, the numeric score derived from
 *      `scores` must exist and be at least the threshold.
 *   4. Otherwise the result passes.
 *
 * @param {object} evaluator - Provides `getPassCriterion()` and `getPassThreshold()`.
 * @param {*} result - Raw evaluate result, handed to the pass criterion.
 * @param {object[]} scores - Normalized score items.
 * @returns {*} A boolean for rules 1, 3 and 4; whatever the criterion returns for rule 2.
 */
function computeEvaluatorPassed(evaluator, result, scores) {
  const flagged = scores.filter((score) => "passed" in score && score.passed !== void 0);
  if (flagged.length > 0) {
    return !flagged.some((score) => score.passed !== true);
  }

  const criterion = evaluator.getPassCriterion();
  if (criterion) {
    return criterion(result);
  }

  const threshold = evaluator.getPassThreshold();
  if (threshold === void 0) {
    return true;
  }
  const numeric = toNumericScoreFromScores(scores);
  return numeric !== void 0 && numeric >= threshold;
}
|
|
758
|
+
/**
 * Coerces an evaluate-function result into `{ scores, metrics }`.
 *
 * @param {*} result - Raw result returned by an evaluate function.
 * @returns {object} For a non-object or null result, `{ scores: [] }`.
 *   Otherwise `scores` is `result.scores` when that is an array (else `[]`)
 *   and `metrics` is `result.metrics` when that is an array (else undefined).
 */
function normalizeResult(result) {
  if (result === null || typeof result !== "object") {
    return { scores: [] };
  }
  let scores = [];
  if (Array.isArray(result.scores)) {
    scores = result.scores;
  }
  let metrics = void 0;
  if (Array.isArray(result.metrics)) {
    metrics = result.metrics;
  }
  return { scores, metrics };
}
|
|
767
|
+
/**
 * Produces a timestamp for use in file names.
 *
 * @returns {string} The current time in ISO-8601 form with every `:` and `.`
 *   replaced by `-`.
 */
function nowIsoForFile() {
  const iso = (/* @__PURE__ */ new Date()).toISOString();
  return iso.split(/[:.]/).join("-");
}
|
|
770
|
+
/**
 * Builds the artifact file path for a run.
 *
 * @param {string} artifactDirectory - Directory that holds run artifacts.
 * @param {string} datasetId - Dataset identifier.
 * @param {string} runId - Run identifier.
 * @returns {string} `<artifactDirectory>/<datasetId>_<runId>_<timestamp>.jsonl`,
 *   joined with the platform path separator; the timestamp is taken when
 *   this function is called.
 */
function createArtifactPath(artifactDirectory, datasetId, runId) {
  const fileName = `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`;
  return join(artifactDirectory, fileName);
}
|
|
776
|
+
/**
 * Builds the effect that executes one evaluation run.
 *
 * Test cases run sequentially, and within a test case every evaluator that
 * has an evaluate function runs sequentially. A test case passes when all of
 * its evaluator entries passed. After each test case the run snapshot is
 * updated, a "TestCaseProgress" event is published and the same event is
 * queued for persistence. When all test cases are done the snapshot is
 * marked "completed", a "RunCompleted" event is published and queued, and an
 * "ArtifactFlushed" event is published.
 *
 * An evaluator that throws or rejects (while resolving its context, building
 * the input, evaluating, or judging the result) is recorded as a failed
 * entry with no scores; the run continues with the next evaluator.
 *
 * @param {object} task - Reads `runId`, `testCases`, `evaluators` and
 *   `snapshot.artifactPath`.
 * @param {Function} publishEvent - Called with an event object; must return
 *   an effect.
 * @param {object} persistenceQueue - Queue that receives
 *   `{ runId, artifactPath, payload }` items.
 * @param {Function} updateSnapshot - Called as `(runId, updater)`, where
 *   `updater` maps the previous snapshot to the next one.
 * @returns {object} The effect describing the run.
 */
var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => Effect.gen(function* () {
  const startedAt = Date.now();
  updateSnapshot(task.runId, (snapshot) => ({
    ...snapshot,
    status: "running",
    startedAt
  }));
  yield* publishEvent({
    type: "RunStarted",
    runId: task.runId,
    startedAt
  });
  let completedTestCases = 0;
  let passedTestCases = 0;
  let failedTestCases = 0;
  for (const testCaseItem of task.testCases) {
    const started = Date.now();
    const evaluatorScores = [];
    let testCaseError;
    for (const { id: evaluatorId, evaluator } of task.evaluators) {
      const evaluateFn = evaluator.getEvaluateFn();
      if (!evaluateFn) {
        continue;
      }
      // Failures must be caught INSIDE the promise: a rejection surfacing
      // from Effect.promise is treated as a defect and is never thrown back
      // into this generator, so a try/catch around `yield*` cannot see it.
      const outcome = yield* Effect.promise(async () => {
        try {
          const ctx = await evaluator.resolveContext();
          const result = await evaluateFn(testCaseItem.testCase.getInput(), ctx);
          const { scores, metrics } = normalizeResult(result);
          const passed = computeEvaluatorPassed(evaluator, result, scores);
          return { ok: true, entry: { evaluatorId, scores, passed, metrics } };
        } catch (error) {
          return {
            ok: false,
            message: error instanceof Error ? error.message : "Evaluator execution failed"
          };
        }
      });
      if (outcome.ok) {
        evaluatorScores.push(outcome.entry);
      } else {
        // When several evaluators fail, the last message is the one reported.
        testCaseError = outcome.message;
        evaluatorScores.push({
          evaluatorId,
          scores: [],
          passed: false
        });
      }
    }
    const testCasePassed = evaluatorScores.every((s) => s.passed);
    completedTestCases += 1;
    if (testCasePassed) {
      passedTestCases += 1;
    } else {
      failedTestCases += 1;
    }
    const progressEvent = {
      type: "TestCaseProgress",
      runId: task.runId,
      testCaseId: testCaseItem.id,
      testCaseName: testCaseItem.testCase.getName(),
      completedTestCases,
      totalTestCases: task.testCases.length,
      passed: testCasePassed,
      durationMs: Date.now() - started,
      evaluatorScores,
      errorMessage: testCaseError
    };
    updateSnapshot(task.runId, (snapshot) => ({
      ...snapshot,
      completedTestCases,
      passedTestCases,
      failedTestCases
    }));
    yield* publishEvent(progressEvent);
    yield* Queue.offer(persistenceQueue, {
      runId: task.runId,
      artifactPath: task.snapshot.artifactPath,
      payload: progressEvent
    });
  }
  const finishedAt = Date.now();
  const completedEvent = {
    type: "RunCompleted",
    runId: task.runId,
    finishedAt,
    passedTestCases,
    failedTestCases,
    totalTestCases: task.testCases.length,
    artifactPath: task.snapshot.artifactPath
  };
  updateSnapshot(task.runId, (snapshot) => ({
    ...snapshot,
    status: "completed",
    completedTestCases,
    passedTestCases,
    failedTestCases,
    finishedAt
  }));
  yield* publishEvent(completedEvent);
  yield* Queue.offer(persistenceQueue, {
    runId: task.runId,
    artifactPath: task.snapshot.artifactPath,
    payload: completedEvent
  });
  yield* publishEvent({
    type: "ArtifactFlushed",
    runId: task.runId,
    artifactPath: task.snapshot.artifactPath
  });
});
|
|
881
|
+
/**
 * Appends `payload` to `artifactPath` as one JSON document followed by a
 * newline, creating the parent directory first when it does not exist.
 *
 * @param {string} artifactPath - File to append to.
 * @param {unknown} payload - Value serialized with `JSON.stringify`.
 * @returns {Promise<void>} Resolves once the line has been appended.
 */
async function appendJsonLine(artifactPath, payload) {
  const parentDirectory = dirname(artifactPath);
  await mkdir(parentDirectory, { recursive: true });
  const serialized = JSON.stringify(payload);
  await appendFile(artifactPath, `${serialized}\n`, "utf8");
}
|
|
886
|
+
/**
 * Builds a never-ending Effect that takes messages from `queue` and appends
 * each one to its `artifactPath` as a JSON line containing `runId`, a `ts`
 * timestamp taken at write time, and the fields of `message.payload`.
 *
 * A failed write is logged and the worker moves on to the next message, so a
 * single I/O error does not stop persistence for every later record.
 *
 * @param queue - Queue of `{ runId, artifactPath, payload }` messages.
 * @returns An Effect that loops until its fiber is interrupted.
 */
var createPersistenceWorker = (queue) => Effect.forever(
  Effect.gen(function* () {
    const message = yield* Queue.take(queue);
    yield* Effect.tryPromise({
      try: () => appendJsonLine(message.artifactPath, {
        runId: message.runId,
        ts: Date.now(),
        ...message.payload
      }),
      catch: (error) => error
    }).pipe(
      // Recover here: letting the failure escape would terminate the
      // surrounding Effect.forever loop and drop all subsequent messages.
      Effect.catchAll(
        (error) => Effect.logError(
          `Failed to persist run artifact: ${message.artifactPath}`,
          error
        )
      )
    );
  })
);
|
|
898
|
+
|
|
899
|
+
// src/runner/search.ts
|
|
900
|
+
/**
 * Tests `value` against a list of matchers.
 *
 * String matchers must equal `value` exactly; any other matcher is used via
 * its `test` method (e.g. a RegExp). A missing or empty list matches everything.
 *
 * @param {string} value - Candidate value.
 * @param {Array<string|RegExp>|undefined} matchers - Matchers to try in order.
 * @returns {boolean} `true` when the list is missing/empty or any matcher accepts `value`.
 */
function matchesAny2(value, matchers) {
  const hasMatchers = Boolean(matchers) && matchers.length !== 0;
  if (!hasMatchers) {
    return true;
  }
  for (const candidate of matchers) {
    const accepted = typeof candidate === "string" ? candidate === value : candidate.test(value);
    if (accepted) {
      return true;
    }
  }
  return false;
}
|
|
908
|
+
/**
 * Tests a path against a list of matchers.
 *
 * String matchers match when they occur anywhere inside `value` (substring);
 * any other matcher is used via its `test` method. A missing or empty list
 * matches everything.
 *
 * @param {string} value - Path to check.
 * @param {Array<string|RegExp>|undefined} matchers - Matchers to try in order.
 * @returns {boolean} `true` when the list is missing/empty or any matcher accepts `value`.
 */
function matchesPath(value, matchers) {
  if (!matchers || matchers.length === 0) {
    return true;
  }
  const accepts = (candidate) => typeof candidate === "string" ? value.includes(candidate) : candidate.test(value);
  return matchers.some(accepts);
}
|
|
919
|
+
/**
 * Filters collected test cases by tag and file-path criteria.
 *
 * A test case is dropped when any of its tags matches `query.excludedTags` or
 * its file path matches `query.excludedPaths`. Otherwise it is kept when it
 * satisfies both inclusion lists; a missing or empty inclusion list accepts
 * everything. Missing or empty exclusion lists exclude nothing.
 *
 * @param {Array<{testCase: {getTags(): string[]}, filePath: string}>} all - Collected test cases.
 * @param {{includedTags?: Array<string|RegExp>, excludedTags?: Array<string|RegExp>,
 *          includedPaths?: Array<string|RegExp>, excludedPaths?: Array<string|RegExp>}} [query]
 *   Search criteria; when omitted, `all` is returned unchanged.
 * @returns {Array} The same array when `query` is falsy, otherwise a filtered copy.
 */
function searchCollectedTestCases(all, query) {
  if (!query) {
    return all;
  }
  // The matcher helpers treat an empty list as "match everything", which is the
  // right default for inclusion but would wrongly exclude every item for
  // exclusion. Exclusions are therefore applied only for non-empty lists.
  const hasEntries = (list) => list != null && list.length > 0;
  const { includedTags, excludedTags, includedPaths, excludedPaths } = query;
  return all.filter((item) => {
    const tags = item.testCase.getTags();
    if (hasEntries(excludedTags) && tags.some((tag) => matchesAny2(tag, excludedTags))) {
      return false;
    }
    if (hasEntries(excludedPaths) && matchesPath(item.filePath, excludedPaths)) {
      return false;
    }
    const includedTagsMatch = !hasEntries(includedTags) || tags.some((tag) => matchesAny2(tag, includedTags));
    const includedPathsMatch = !hasEntries(includedPaths) || matchesPath(item.filePath, includedPaths);
    return includedTagsMatch && includedPathsMatch;
  });
}
|
|
936
|
+
|
|
937
|
+
// src/runner/api.ts
|
|
938
|
+
/**
 * Splits a `/source/flags` style string into its parts.
 *
 * The string must start with `/` and contain a second `/` later on; everything
 * between the first and last slash is the source, everything after the last
 * slash is the flags. No validation of source or flags is performed.
 *
 * @param {string} pattern - Candidate regex literal text.
 * @returns {{source: string, flags: string}|undefined} The parts, or `undefined`
 *   when `pattern` is not shaped like a regex literal.
 */
function parseRegexLiteral(pattern) {
  const closingSlash = pattern.startsWith("/") ? pattern.lastIndexOf("/") : -1;
  if (closingSlash <= 0) {
    return void 0;
  }
  const source = pattern.slice(1, closingSlash);
  const flags = pattern.slice(closingSlash + 1);
  return { source, flags };
}
|
|
951
|
+
/**
 * Creates a predicate that tests names against `pattern`.
 *
 * The trimmed pattern is interpreted in this order:
 *  1. `/source/flags` - compiled as a regular expression with the given flags.
 *  2. Contains a star - treated as a case-insensitive glob where each star
 *     matches any run of characters and every other character is literal; the
 *     whole name must match.
 *  3. Anything else - case-insensitive exact comparison.
 *
 * @param {string} pattern - Regex literal, glob, or plain name.
 * @returns {(value: string) => boolean} Predicate over candidate names.
 * @throws {SyntaxError} When a regex-literal pattern has an invalid source or flags.
 */
function createNameMatcher(pattern) {
  const normalizedPattern = pattern.trim();
  const regexLiteral = parseRegexLiteral(normalizedPattern);
  if (regexLiteral) {
    const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
    return (value) => {
      // With the g or y flag, test() resumes from lastIndex; reset it so the
      // predicate gives the same answer no matter how often it is called.
      regex.lastIndex = 0;
      return regex.test(value);
    };
  }
  if (normalizedPattern.includes("*")) {
    // Escape every regex metacharacter except the star (including "?", which
    // would otherwise act as a quantifier), then expand stars to ".*".
    const escaped = normalizedPattern.replace(/[.+?^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
    const regex = new RegExp(`^${escaped}$`, "i");
    return (value) => regex.test(value);
  }
  const expected = normalizedPattern.toLowerCase();
  return (value) => value.toLowerCase() === expected;
}
|
|
965
|
+
/**
 * Creates a runner instance.
 *
 * @param overrides - Passed to `withRunnerConfig` to produce the configuration
 *   handed to the `EffectRunner` constructor.
 * @returns A new `EffectRunner`.
 */
function createRunner(overrides) {
  const config = withRunnerConfig(overrides);
  return new EffectRunner(config);
}
|
|
968
|
+
/**
 * Runner backed by Effect queues and fibers.
 *
 * It caches discovered datasets and evaluators by id, queues runs onto an
 * unbounded run queue consumed by a scheduler fiber, hands persistence records
 * to a separate worker fiber, and keeps an in-memory snapshot per run. Events
 * are delivered to registered listeners and published on an internal PubSub.
 */
var EffectRunner = class {
  /**
   * Allocates the queues, event bus and lookup tables, then starts the
   * scheduler and persistence fibers.
   *
   * @param config - Runner configuration; `discovery` and `artifactDirectory`
   *   are read by the methods of this class.
   */
  constructor(config) {
    this.eventBus = Effect.runSync(PubSub.unbounded());
    this.runQueue = Effect.runSync(Queue.unbounded());
    this.persistenceQueue = Effect.runSync(Queue.unbounded());
    this.snapshots = new Map();
    this.listeners = new Set();
    this.datasetsById = new Map();
    this.evaluatorsById = new Map();
    this.schedulerFiber = Effect.runFork(this.createSchedulerEffect());
    this.persistenceFiber = Effect.runFork(
      createPersistenceWorker(this.persistenceQueue)
    );
    this.config = config;
  }

  /** Re-discovers datasets, replaces the id cache with them, and returns them. */
  async collectDatasets() {
    const discovered = await collectDatasetsFromFiles(this.config.discovery);
    this.datasetsById.clear();
    for (const entry of discovered) {
      this.datasetsById.set(entry.id, entry);
    }
    return discovered;
  }

  /** Re-discovers evaluators, replaces the id cache with them, and returns them. */
  async collectEvaluators() {
    const discovered = await collectEvaluatorsFromFiles(this.config.discovery);
    this.evaluatorsById.clear();
    for (const entry of discovered) {
      this.evaluatorsById.set(entry.id, entry);
    }
    return discovered;
  }

  /**
   * Finds the first cached dataset whose name equals `name`, ignoring case and
   * surrounding whitespace in `name`. Discovery runs first when the cache is empty.
   *
   * @returns The cached dataset entry, or `undefined` when none matches.
   */
  async resolveDatasetByName(name) {
    if (this.datasetsById.size === 0) {
      await this.collectDatasets();
    }
    const wanted = name.trim().toLowerCase();
    for (const candidate of this.datasetsById.values()) {
      if (candidate.dataset.getName().toLowerCase() === wanted) {
        return candidate;
      }
    }
    return void 0;
  }

  /**
   * Returns every cached evaluator whose name satisfies the matcher built by
   * `createNameMatcher(pattern)`; evaluators without a name are matched as "".
   * Discovery runs first when the cache is empty.
   */
  async resolveEvaluatorsByNamePattern(pattern) {
    if (this.evaluatorsById.size === 0) {
      await this.collectEvaluators();
    }
    const matchesName = createNameMatcher(pattern);
    const matched = [];
    for (const candidate of this.evaluatorsById.values()) {
      if (matchesName(candidate.evaluator.getName() ?? "")) {
        matched.push(candidate);
      }
    }
    return matched;
  }

  /** Discovers all test cases and filters them with `searchCollectedTestCases`. */
  async searchTestCases(query) {
    const discovered = await collectTestCasesFromFiles(this.config.discovery);
    return searchCollectedTestCases(discovered, query);
  }

  /**
   * Returns the discovered test cases that the dataset with `datasetId` accepts
   * via `matchesTestCase`.
   *
   * @throws {Error} When no dataset with `datasetId` is cached after discovery.
   */
  async collectDatasetTestCases(datasetId) {
    if (this.datasetsById.size === 0) {
      await this.collectDatasets();
    }
    const entry = this.datasetsById.get(datasetId);
    if (!entry) {
      throw new Error(`Unknown dataset: ${datasetId}`);
    }
    const discovered = await collectTestCasesFromFiles(this.config.discovery);
    return discovered.filter(
      ({ testCase, filePath }) => entry.dataset.matchesTestCase(testCase, filePath)
    );
  }

  /**
   * Queues a run of `request.datasetId` with the evaluators named in
   * `request.evaluatorIds` (ids that are not cached are ignored).
   *
   * Stores a "queued" snapshot, emits and persists a `RunQueued` event, and
   * offers the task to the run queue.
   *
   * @returns The initial snapshot of the queued run.
   * @throws {Error} When the dataset is unknown or no requested evaluator is cached.
   */
  async runDatasetWith(request) {
    if (this.datasetsById.size === 0) {
      await this.collectDatasets();
    }
    if (this.evaluatorsById.size === 0) {
      await this.collectEvaluators();
    }
    const datasetEntry = this.datasetsById.get(request.datasetId);
    if (!datasetEntry) {
      throw new Error(`Unknown dataset: ${request.datasetId}`);
    }
    const selectedEvaluators = [];
    for (const evaluatorId of request.evaluatorIds) {
      const registered = this.evaluatorsById.get(evaluatorId);
      if (registered) {
        selectedEvaluators.push({ id: registered.id, evaluator: registered.evaluator });
      }
    }
    if (selectedEvaluators.length === 0) {
      throw new Error("No evaluators selected for run");
    }
    const selectedTestCases = await this.collectDatasetTestCases(request.datasetId);
    const runId = `run-${randomUUID()}`;
    const artifactPath = createArtifactPath(
      this.config.artifactDirectory,
      request.datasetId,
      runId
    );
    // Snapshot and event each get their own id array so they never share state.
    const listEvaluatorIds = () => selectedEvaluators.map((item) => item.id);
    const snapshot = {
      runId,
      datasetId: request.datasetId,
      datasetName: datasetEntry.dataset.getName(),
      evaluatorIds: listEvaluatorIds(),
      queuedAt: Date.now(),
      totalTestCases: selectedTestCases.length,
      completedTestCases: 0,
      passedTestCases: 0,
      failedTestCases: 0,
      status: "queued",
      artifactPath
    };
    this.snapshots.set(runId, snapshot);
    const queuedEvent = {
      type: "RunQueued",
      runId,
      datasetId: request.datasetId,
      datasetName: datasetEntry.dataset.getName(),
      evaluatorIds: listEvaluatorIds(),
      totalTestCases: selectedTestCases.length,
      artifactPath
    };
    const persistQueuedEvent = Queue.offer(this.persistenceQueue, {
      runId,
      artifactPath,
      payload: queuedEvent
    });
    const enqueueRun = Queue.offer(this.runQueue, {
      runId,
      datasetId: request.datasetId,
      dataset: datasetEntry.dataset,
      evaluators: selectedEvaluators,
      testCases: selectedTestCases,
      snapshot
    });
    // Effects are lazy descriptions; they execute here, one after another.
    await Effect.runPromise(this.publishEvent(queuedEvent));
    await Effect.runPromise(persistQueuedEvent);
    await Effect.runPromise(enqueueRun);
    return snapshot;
  }

  /**
   * Registers `listener` for run events, optionally restricted to
   * `options.runId`.
   *
   * @returns A function that removes the registration.
   */
  subscribeRunEvents(listener, options) {
    const subscription = { runId: options?.runId, listener };
    this.listeners.add(subscription);
    return () => {
      this.listeners.delete(subscription);
    };
  }

  /** Returns the stored snapshot for `runId`, or `undefined` when unknown. */
  getRunSnapshot(runId) {
    return this.snapshots.get(runId);
  }

  /** Returns all stored snapshots, most recently queued first. */
  getAllRunSnapshots() {
    return [...this.snapshots.values()].sort(
      (left, right) => right.queuedAt - left.queuedAt
    );
  }

  /**
   * Interrupts the scheduler and persistence fibers, then shuts down the run
   * queue, the persistence queue and the event bus, in that order.
   */
  async shutdown() {
    const teardownSteps = [
      Fiber.interrupt(this.schedulerFiber),
      Fiber.interrupt(this.persistenceFiber),
      Queue.shutdown(this.runQueue),
      Queue.shutdown(this.persistenceQueue),
      PubSub.shutdown(this.eventBus)
    ];
    for (const step of teardownSteps) {
      await Effect.runPromise(step);
    }
  }

  /**
   * Builds the scheduler loop: take the next task from the run queue and fork
   * `executeRunTask` for it, forever.
   */
  createSchedulerEffect() {
    const runner = this;
    const publish = (event) => runner.publishEvent(event);
    const update = (runId, updater) => runner.updateSnapshot(runId, updater);
    const handleNextTask = Effect.gen(function* () {
      const task = yield* Queue.take(runner.runQueue);
      yield* Effect.fork(
        executeRunTask(task, publish, runner.persistenceQueue, update)
      );
    });
    return Effect.forever(handleNextTask);
  }

  /** Replaces the snapshot of `runId` with `updater(current)`; unknown ids are ignored. */
  updateSnapshot(runId, updater) {
    const current = this.snapshots.get(runId);
    if (current) {
      this.snapshots.set(runId, updater(current));
    }
  }

  /**
   * Returns an Effect that calls every listener subscribed to `event.runId`
   * (or to all runs) and then publishes the event on the event bus.
   */
  publishEvent(event) {
    const notifyListeners = Effect.sync(() => {
      for (const subscription of this.listeners) {
        const targetsOtherRun = subscription.runId && subscription.runId !== event.runId;
        if (!targetsOtherRun) {
          subscription.listener(event);
        }
      }
    });
    return notifyListeners.pipe(
      Effect.flatMap(() => PubSub.publish(this.eventBus, event)),
      Effect.asVoid
    );
  }
};
|
|
1162
|
+
|
|
1163
|
+
export { Dataset, Evaluator, Metric, Score, TestCase, binaryScore, createRunner, defaultRunnerConfig, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, tokenCountMetric, withRunnerConfig };
|
|
1164
|
+
//# sourceMappingURL=out.js.map
|
|
1165
|
+
//# sourceMappingURL=index.js.map
|