@m4trix/evals 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +1075 -0
- package/dist/cli-simple.cjs.map +1 -0
- package/dist/cli-simple.d.cts +1 -0
- package/dist/cli-simple.d.ts +1 -0
- package/dist/cli-simple.js +1072 -0
- package/dist/cli-simple.js.map +1 -0
- package/dist/cli.cjs +1981 -0
- package/dist/cli.cjs.map +1 -0
- package/dist/cli.d.cts +1 -0
- package/dist/cli.d.ts +1 -0
- package/dist/cli.js +1974 -0
- package/dist/cli.js.map +1 -0
- package/dist/index.cjs +1184 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +347 -0
- package/dist/index.d.ts +347 -0
- package/dist/index.js +1165 -0
- package/dist/index.js.map +1 -0
- package/package.json +53 -0
|
@@ -0,0 +1,1075 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
'use strict';
|
|
3
|
+
|
|
4
|
+
var crypto = require('crypto');
|
|
5
|
+
var effect = require('effect');
|
|
6
|
+
var promises = require('fs/promises');
|
|
7
|
+
var path = require('path');
|
|
8
|
+
var url = require('url');
|
|
9
|
+
|
|
10
|
+
var _documentCurrentScript = typeof document !== 'undefined' ? document.currentScript : null;
|
|
11
|
+
// src/runner/config.ts
|
|
12
|
+
/**
 * Built-in runner configuration: the discovery root, the file suffixes that
 * identify dataset / evaluator / test-case modules, the directory names that
 * are skipped while walking, and the directory run artifacts are written to.
 */
var defaultRunnerConfig = {
  discovery: {
    rootDir: process.cwd(),
    datasetSuffixes: [".dataset.ts", ".dataset.tsx", ".dataset.js", ".dataset.mjs"],
    evaluatorSuffixes: [
      ".evaluator.ts",
      ".evaluator.tsx",
      ".evaluator.js",
      ".evaluator.mjs"
    ],
    testCaseSuffixes: [
      ".test-case.ts",
      ".test-case.tsx",
      ".test-case.js",
      ".test-case.mjs"
    ],
    excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
  },
  artifactDirectory: ".eval-results"
};

/**
 * Resolves the effective runner configuration.
 *
 * @param {object} [overrides] - Optional partial configuration. Top-level keys
 *   replace the defaults; `discovery` is merged key by key so a caller can
 *   override e.g. only `rootDir`.
 * @returns {object} `defaultRunnerConfig` itself when no overrides are given,
 *   otherwise a new merged configuration object.
 */
function withRunnerConfig(overrides) {
  if (overrides == null) {
    return defaultRunnerConfig;
  }
  return {
    ...defaultRunnerConfig,
    ...overrides,
    // Spreading `undefined` is a no-op, so a missing `discovery` keeps the defaults.
    discovery: { ...defaultRunnerConfig.discovery, ...overrides.discovery }
  };
}
|
|
37
|
+
// Cached jiti loader instance; created on first TypeScript import in loadModuleExports and reused afterwards.
var jitiLoader;
|
|
38
|
+
/**
 * Builds a slug-style identifier from a prefix and a display name, falling
 * back to the file path when the name is missing or blank.
 *
 * @param {string} prefix - Kind marker placed in front of the slug.
 * @param {string} filePath - Fallback source for the slug.
 * @param {string} [name] - Preferred source for the slug.
 * @returns {string} Lower-case id with every non-alphanumeric run collapsed to
 *   a single dash and no leading or trailing dashes.
 */
function toId(prefix, filePath, name) {
  const hasUsableName = Boolean(name) && name.trim().length > 0;
  const base = hasUsableName ? name : filePath;
  const lowered = `${prefix}:${base}`.toLowerCase();
  const dashed = lowered.replace(/[^a-z0-9]+/g, "-");
  return dashed.replace(/^-+|-+$/g, "");
}
|
|
42
|
+
/**
 * Reports whether `value` is a non-null object exposing a callable member
 * named `methodName` (own or inherited).
 *
 * @param {unknown} value - Candidate to inspect.
 * @param {string} methodName - Member name to look for.
 * @returns {boolean} True only for objects whose member is a function.
 */
function hasMethod(value, methodName) {
  if (value === null || typeof value !== "object") {
    return false;
  }
  if (!(methodName in value)) {
    return false;
  }
  return typeof value[methodName] === "function";
}
|
|
45
|
+
/**
 * Duck-type check for dataset exports: the value must expose both
 * `getName` and `matchesTestCase` as methods.
 *
 * @param {unknown} value - Module export to classify.
 * @returns {boolean} True when both methods are present.
 */
function isDatasetLike(value) {
  const required = ["getName", "matchesTestCase"];
  return required.every((method) => hasMethod(value, method));
}
|
|
48
|
+
/**
 * Duck-type check for evaluator exports: the value must expose `getName`,
 * `resolveContext` and `getEvaluateFn` as methods.
 *
 * @param {unknown} value - Module export to classify.
 * @returns {boolean} True when all three methods are present.
 */
function isEvaluatorLike(value) {
  const required = ["getName", "resolveContext", "getEvaluateFn"];
  return required.every((method) => hasMethod(value, method));
}
|
|
51
|
+
/**
 * Duck-type check for test-case exports: the value must expose `getName`,
 * `getTags` and `getInput` as methods.
 *
 * @param {unknown} value - Module export to classify.
 * @returns {boolean} True when all three methods are present.
 */
function isTestCaseLike(value) {
  const required = ["getName", "getTags", "getInput"];
  return required.every((method) => hasMethod(value, method));
}
|
|
54
|
+
/**
 * Recursively lists every regular file below `rootDir`.
 *
 * Sibling entries are visited concurrently, so the raw collection order
 * depends on I/O completion timing; the result is sorted before returning so
 * discovery is deterministic from run to run.
 *
 * @param {string} rootDir - Directory to start from.
 * @param {string[]} excludeDirectories - Directory names (not paths) that are
 *   never descended into.
 * @returns {Promise<string[]>} Sorted absolute file paths. Directories that
 *   cannot be read (including a missing root) contribute nothing.
 */
async function walkDirectory(rootDir, excludeDirectories) {
  const out = [];
  async function walk(currentDir) {
    let entries;
    try {
      entries = await promises.readdir(currentDir, { withFileTypes: true });
    } catch {
      // Best-effort discovery: unreadable or missing directories are skipped.
      return;
    }
    await Promise.all(
      entries.map(async (entry) => {
        const absolute = path.resolve(currentDir, entry.name);
        if (entry.isDirectory()) {
          if (!excludeDirectories.includes(entry.name)) {
            await walk(absolute);
          }
          return;
        }
        if (entry.isFile()) {
          out.push(absolute);
        }
      })
    );
  }
  await walk(rootDir);
  return out.sort();
}
|
|
82
|
+
/**
 * Tests whether `filePath` ends with at least one of the given suffixes.
 *
 * @param {string} filePath - Path to test.
 * @param {string[]} suffixes - Candidate suffixes.
 * @returns {boolean} True on the first matching suffix, false if none match.
 */
function hasOneSuffix(filePath, suffixes) {
  for (const suffix of suffixes) {
    if (filePath.endsWith(suffix)) {
      return true;
    }
  }
  return false;
}
|
|
85
|
+
/**
 * Imports a module file and returns the values of all of its exports.
 *
 * `.ts` / `.tsx` files go through a lazily created, cached jiti loader; every
 * other file is loaded with a native dynamic `import()` of its file URL.
 *
 * @param {string} filePath - Path of the module to load.
 * @returns {Promise<unknown[]>} The export values of the loaded module.
 * @throws {Error} When the jiti module exposes no usable factory.
 */
async function loadModuleExports(filePath) {
  const isTypeScript = filePath.endsWith(".ts") || filePath.endsWith(".tsx");
  if (!isTypeScript) {
    const moduleUrl = url.pathToFileURL(filePath).href;
    const loaded = await import(moduleUrl);
    return Object.values(loaded);
  }
  if (!jitiLoader) {
    const jitiModule = await import('jiti');
    const createJiti = jitiModule.createJiti ?? jitiModule.default;
    if (!createJiti) {
      throw new Error("Failed to initialize jiti TypeScript loader");
    }
    // Bundler-generated stand-in for `import.meta.url`, kept verbatim.
    const selfUrl = (typeof document === 'undefined' ? require('u' + 'rl').pathToFileURL(__filename).href : (_documentCurrentScript && _documentCurrentScript.tagName.toUpperCase() === 'SCRIPT' && _documentCurrentScript.src || new URL('out.js', document.baseURI).href));
    jitiLoader = createJiti(selfUrl, {
      interopDefault: true,
      moduleCache: true
    });
  }
  // Use the async `import` method when the loader has one, else call the loader directly.
  const loaded2 = jitiLoader.import ? await jitiLoader.import(filePath) : await Promise.resolve(jitiLoader(filePath));
  return Object.values(loaded2);
}
|
|
105
|
+
/**
 * Discovers dataset definitions below `config.rootDir`.
 *
 * Walks the tree, keeps files whose name ends with one of
 * `config.datasetSuffixes`, loads each one, and keeps the exports that pass
 * `isDatasetLike`.
 *
 * @param {object} config - Discovery settings (`rootDir`, `excludeDirectories`,
 *   `datasetSuffixes`).
 * @returns {Promise<Array<{id: string, filePath: string, dataset: object}>>}
 *   One record per dataset export; `filePath` is relative to `rootDir`.
 */
async function collectDatasetsFromFiles(config) {
  const allFiles = await walkDirectory(config.rootDir, config.excludeDirectories);
  const datasetFiles = allFiles.filter((candidate) => hasOneSuffix(candidate, config.datasetSuffixes));
  const loadOne = async (absolutePath) => {
    const moduleExports = await loadModuleExports(absolutePath);
    const relPath = path.relative(config.rootDir, absolutePath);
    return moduleExports.filter(isDatasetLike).map((dataset) => {
      const id = toId("dataset", relPath, dataset.getName());
      return { id, filePath: relPath, dataset };
    });
  };
  const perFile = await Promise.all(datasetFiles.map((absolutePath) => loadOne(absolutePath)));
  return perFile.flat();
}
|
|
124
|
+
/**
 * Discovers evaluator definitions below `config.rootDir`.
 *
 * Walks the tree, keeps files whose name ends with one of
 * `config.evaluatorSuffixes`, loads each one, and keeps the exports that pass
 * `isEvaluatorLike`.
 *
 * @param {object} config - Discovery settings (`rootDir`, `excludeDirectories`,
 *   `evaluatorSuffixes`).
 * @returns {Promise<Array<{id: string, filePath: string, evaluator: object}>>}
 *   One record per evaluator export; `filePath` is relative to `rootDir`.
 */
async function collectEvaluatorsFromFiles(config) {
  const allFiles = await walkDirectory(config.rootDir, config.excludeDirectories);
  const evaluatorFiles = allFiles.filter((candidate) => hasOneSuffix(candidate, config.evaluatorSuffixes));
  const loadOne = async (absolutePath) => {
    const moduleExports = await loadModuleExports(absolutePath);
    const relPath = path.relative(config.rootDir, absolutePath);
    return moduleExports.filter(isEvaluatorLike).map((evaluator) => {
      const id = toId("evaluator", relPath, evaluator.getName());
      return { id, filePath: relPath, evaluator };
    });
  };
  const perFile = await Promise.all(evaluatorFiles.map((absolutePath) => loadOne(absolutePath)));
  return perFile.flat();
}
|
|
143
|
+
/**
 * Discovers test-case definitions below `config.rootDir`.
 *
 * Walks the tree, keeps files whose name ends with one of
 * `config.testCaseSuffixes`, loads each one, and keeps the exports that pass
 * `isTestCaseLike`.
 *
 * @param {object} config - Discovery settings (`rootDir`, `excludeDirectories`,
 *   `testCaseSuffixes`).
 * @returns {Promise<Array<{id: string, filePath: string, testCase: object}>>}
 *   One record per test-case export; `filePath` is relative to `rootDir`.
 */
async function collectTestCasesFromFiles(config) {
  const allFiles = await walkDirectory(config.rootDir, config.excludeDirectories);
  const testCaseFiles = allFiles.filter((candidate) => hasOneSuffix(candidate, config.testCaseSuffixes));
  const loadOne = async (absolutePath) => {
    const moduleExports = await loadModuleExports(absolutePath);
    const relPath = path.relative(config.rootDir, absolutePath);
    return moduleExports.filter(isTestCaseLike).map((testCase) => {
      const id = toId("test-case", relPath, testCase.getName());
      return { id, filePath: relPath, testCase };
    });
  };
  const perFile = await Promise.all(testCaseFiles.map((absolutePath) => loadOne(absolutePath)));
  return perFile.flat();
}
|
|
162
|
+
|
|
163
|
+
// src/evals/metric.ts
|
|
164
|
+
// Metric definitions registered so far, keyed by metric id.
var registry = /* @__PURE__ */ new Map();

var Metric = {
  /**
   * Creates a metric definition and registers it under `config.id`
   * (a later registration with the same id replaces the earlier one).
   *
   * @param {{id: string, name: string, format: Function}} config
   * @returns {{id: string, name: string, format: Function, make: Function}}
   *   The definition; `make(data)` wraps a payload as `{ id, data }`.
   */
  of(config) {
    const { id, name, format } = config;
    const make = (data) => {
      return { id: config.id, data };
    };
    const def = { id, name, format, make };
    registry.set(config.id, def);
    return def;
  }
};

/**
 * Looks up a registered metric definition.
 *
 * @param {string} id - Metric id.
 * @returns {object|undefined} The definition, or undefined when unknown.
 */
function getMetricById(id) {
  return registry.get(id);
}
|
|
180
|
+
|
|
181
|
+
// src/evals/score.ts
|
|
182
|
+
// Score definitions registered so far, keyed by score id.
var registry2 = /* @__PURE__ */ new Map();

var Score = {
  /**
   * Creates a score definition and registers it under `config.id`
   * (a later registration with the same id replaces the earlier one).
   *
   * @param {{id: string, name: string, displayStrategy: string, format: Function}} config
   * @returns {object} The definition. `make(data, options)` wraps a payload as
   *   `{ id, data }` and adds `passed` only when `options.definePassed` is
   *   supplied and returns something other than undefined.
   */
  of(config) {
    const { id, name, displayStrategy, format } = config;
    const make = (data, options) => {
      const built = { id: config.id, data };
      if (options?.definePassed !== void 0) {
        const passed = options.definePassed(data);
        if (passed !== void 0) {
          built.passed = passed;
        }
      }
      return built;
    };
    const def = { id, name, displayStrategy, format, make };
    registry2.set(config.id, def);
    return def;
  }
};

/**
 * Looks up a registered score definition.
 *
 * @param {string} id - Score id.
 * @returns {object|undefined} The definition, or undefined when unknown.
 */
function getScoreById(id) {
  return registry2.get(id);
}
|
|
206
|
+
|
|
207
|
+
// src/evals/metrics/standard.ts
|
|
208
|
+
// Built-in "token-count" metric: renders input, output and combined cached
// token counts, treating any missing field as 0.
Metric.of({
  id: "token-count",
  name: "Tokens",
  format: (data) => {
    const orZero = (count) => count ?? 0;
    const cached = orZero(data.inputCached) + orZero(data.outputCached);
    return `in:${orZero(data.input)} out:${orZero(data.output)} cached:${cached}`;
  }
});

// Built-in "latency" metric: renders `data.ms` with an "ms" suffix.
Metric.of({
  id: "latency",
  name: "Latency",
  format: (data) => {
    return `${data.ms}ms`;
  }
});
|
|
225
|
+
|
|
226
|
+
// src/evals/scores/standard.ts
|
|
227
|
+
// Built-in "percent" score: a numeric `value` shown with two decimals and a bar.
Score.of({
  id: "percent",
  name: "Score",
  displayStrategy: "bar",
  format: (data) => {
    return data.value.toFixed(2);
  }
});

// Built-in "binary" score: a pass/fail flag rendered as text.
Score.of({
  id: "binary",
  name: "Result",
  displayStrategy: "passFail",
  format: (data) => {
    if (data.passed) {
      return "PASSED";
    }
    return "NOT PASSED";
  }
});
|
|
239
|
+
|
|
240
|
+
// src/runner/score-utils.ts
|
|
241
|
+
/**
 * Returns the first numeric value that can be derived from a list of scores.
 *
 * For each score in order: if its registered definition uses the "bar"
 * display strategy and its data object holds a finite `value`, that value
 * wins; otherwise the generic `toNumericScore` extraction is tried.
 *
 * @param {Array<{id: string, data: unknown}>} scores - Scores to inspect.
 * @returns {number|undefined} First numeric score found, or undefined.
 */
function toNumericScoreFromScores(scores) {
  for (const item of scores) {
    const definition = getScoreById(item.id);
    const payload = item.data;
    const isBarPayload = Boolean(definition) && definition.displayStrategy === "bar" && typeof payload === "object" && payload !== null && "value" in payload;
    if (isBarPayload) {
      const barValue = payload.value;
      if (typeof barValue === "number" && Number.isFinite(barValue)) {
        return barValue;
      }
    }
    const fallback = toNumericScore(payload);
    if (fallback !== void 0) {
      return fallback;
    }
  }
  return void 0;
}
|
|
257
|
+
/**
 * Extracts a single number from an arbitrary score payload.
 *
 * Resolution order: a finite number is returned as is; an object with a
 * finite `score` property yields that property; otherwise the mean of all
 * finite numeric property values is returned.
 *
 * @param {unknown} value - Score payload.
 * @returns {number|undefined} The extracted number, or undefined when the
 *   payload is not a number/object or holds no finite numbers.
 */
function toNumericScore(value) {
  const isFiniteNumber = (candidate) => typeof candidate === "number" && Number.isFinite(candidate);
  if (isFiniteNumber(value)) {
    return value;
  }
  if (value === null || typeof value !== "object") {
    return void 0;
  }
  if ("score" in value && isFiniteNumber(value.score)) {
    return value.score;
  }
  let total = 0;
  let count = 0;
  for (const entry of Object.values(value)) {
    if (isFiniteNumber(entry)) {
      total += entry;
      count += 1;
    }
  }
  return count === 0 ? void 0 : total / count;
}
|
|
276
|
+
|
|
277
|
+
// src/runner/execution.ts
|
|
278
|
+
/**
 * Decides whether one evaluator passed for one test case.
 *
 * Precedence: (1) if any score carries a defined `passed` flag, all such
 * flags must be exactly `true`; (2) else the evaluator's pass criterion, if
 * any, is applied to the raw result; (3) else the evaluator's pass threshold,
 * if any, is compared against the numeric score; (4) else the evaluator passes.
 *
 * @param {object} evaluator - Provides `getPassCriterion()` and `getPassThreshold()`.
 * @param {unknown} result - Raw evaluator result handed to the criterion.
 * @param {Array<object>} scores - Normalized scores from the result.
 * @returns {*} The pass decision (whatever the criterion returns in case 2).
 */
function computeEvaluatorPassed(evaluator, result, scores) {
  let sawExplicitFlag = false;
  let allFlagsTrue = true;
  for (const score of scores) {
    if (!("passed" in score) || score.passed === void 0) {
      continue;
    }
    sawExplicitFlag = true;
    if (score.passed !== true) {
      allFlagsTrue = false;
    }
  }
  if (sawExplicitFlag) {
    return allFlagsTrue;
  }
  const criterion = evaluator.getPassCriterion();
  if (criterion) {
    return criterion(result);
  }
  const threshold = evaluator.getPassThreshold();
  if (threshold === void 0) {
    return true;
  }
  const numeric = toNumericScoreFromScores(scores);
  return numeric !== void 0 && numeric >= threshold;
}
|
|
294
|
+
/**
 * Coerces a raw evaluator result into `{ scores, metrics }`.
 *
 * @param {unknown} result - Whatever the evaluate function returned.
 * @returns {{scores: Array, metrics?: Array}} `scores` is always an array;
 *   `metrics` is the result's array or undefined. Non-object results yield
 *   `{ scores: [] }` with no `metrics` key at all.
 */
function normalizeResult(result) {
  if (result === null || typeof result !== "object") {
    return { scores: [] };
  }
  const { scores: rawScores, metrics: rawMetrics } = result;
  return {
    scores: Array.isArray(rawScores) ? rawScores : [],
    metrics: Array.isArray(rawMetrics) ? rawMetrics : void 0
  };
}
|
|
303
|
+
/**
 * Returns the current time as an ISO-8601 string with every ":" and "."
 * replaced by "-", so it can be embedded in a file name.
 *
 * @returns {string} e.g. "2024-01-02T03-04-05-678Z".
 */
function nowIsoForFile() {
  const iso = (/* @__PURE__ */ new Date()).toISOString();
  return iso.split(":").join("-").split(".").join("-");
}
|
|
306
|
+
/**
 * Builds the path of the JSONL artifact file for one run:
 * `<artifactDirectory>/<datasetId>_<runId>_<timestamp>.jsonl`.
 *
 * @param {string} artifactDirectory - Directory the file lives in.
 * @param {string} datasetId - Dataset id used as the first name segment.
 * @param {string} runId - Run id used as the second name segment.
 * @returns {string} The joined path.
 */
function createArtifactPath(artifactDirectory, datasetId, runId) {
  const timestamp = nowIsoForFile();
  const fileName = `${datasetId}_${runId}_${timestamp}.jsonl`;
  return path.join(artifactDirectory, fileName);
}
|
|
312
|
+
/**
 * Effect that executes one queued run: every evaluator is applied to every
 * test case sequentially, progress is mirrored into the run snapshot,
 * broadcast through `publishEvent`, and enqueued for persistence.
 *
 * Evaluator failures (a throwing or rejecting `resolveContext`, evaluate
 * function, pass criterion, ...) are captured per evaluator and recorded as a
 * failed entry. `Effect.promise` treats a rejected promise as a defect rather
 * than a catchable exception, so the evaluator work runs inside a promise
 * that never rejects instead of relying on `try/catch` around `yield*`.
 *
 * @param {object} task - Queued run: `runId`, `testCases`, `evaluators`, `snapshot`.
 * @param {Function} publishEvent - `(event) => Effect` used to broadcast events.
 * @param {object} persistenceQueue - Effect queue receiving `{ runId, artifactPath, payload }`.
 * @param {Function} updateSnapshot - `(runId, updater) => void`.
 * @returns {object} The Effect describing the run.
 */
var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => effect.Effect.gen(function* () {
  const startedAt = Date.now();
  updateSnapshot(task.runId, (snapshot) => ({
    ...snapshot,
    status: "running",
    startedAt
  }));
  yield* publishEvent({
    type: "RunStarted",
    runId: task.runId,
    startedAt
  });
  let completedTestCases = 0;
  let passedTestCases = 0;
  let failedTestCases = 0;
  for (const testCaseItem of task.testCases) {
    const started = Date.now();
    const evaluatorScores = [];
    let testCaseError;
    for (const { id: evaluatorId, evaluator } of task.evaluators) {
      const evaluateFn = evaluator.getEvaluateFn();
      if (!evaluateFn) {
        continue;
      }
      // The async thunk converts every failure into a value, so the promise
      // handed to Effect.promise always fulfils and the run fiber survives.
      const outcome = yield* effect.Effect.promise(async () => {
        try {
          const ctx = await evaluator.resolveContext();
          const result = await evaluateFn(testCaseItem.testCase.getInput(), ctx);
          const { scores, metrics } = normalizeResult(result);
          const passed = computeEvaluatorPassed(evaluator, result, scores);
          return { ok: true, entry: { evaluatorId, scores, passed, metrics } };
        } catch (error) {
          return { ok: false, error };
        }
      });
      if (outcome.ok) {
        evaluatorScores.push(outcome.entry);
      } else {
        const { error } = outcome;
        testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
        evaluatorScores.push({
          evaluatorId,
          scores: [],
          passed: false
        });
      }
    }
    const testCasePassed = evaluatorScores.every((s) => s.passed);
    completedTestCases += 1;
    if (testCasePassed) {
      passedTestCases += 1;
    } else {
      failedTestCases += 1;
    }
    const progressEvent = {
      type: "TestCaseProgress",
      runId: task.runId,
      testCaseId: testCaseItem.id,
      testCaseName: testCaseItem.testCase.getName(),
      completedTestCases,
      totalTestCases: task.testCases.length,
      passed: testCasePassed,
      durationMs: Date.now() - started,
      evaluatorScores,
      errorMessage: testCaseError
    };
    updateSnapshot(task.runId, (snapshot) => ({
      ...snapshot,
      completedTestCases,
      passedTestCases,
      failedTestCases
    }));
    yield* publishEvent(progressEvent);
    yield* effect.Queue.offer(persistenceQueue, {
      runId: task.runId,
      artifactPath: task.snapshot.artifactPath,
      payload: progressEvent
    });
  }
  const finishedAt = Date.now();
  const completedEvent = {
    type: "RunCompleted",
    runId: task.runId,
    finishedAt,
    passedTestCases,
    failedTestCases,
    totalTestCases: task.testCases.length,
    artifactPath: task.snapshot.artifactPath
  };
  updateSnapshot(task.runId, (snapshot) => ({
    ...snapshot,
    status: "completed",
    completedTestCases,
    passedTestCases,
    failedTestCases,
    finishedAt
  }));
  yield* publishEvent(completedEvent);
  yield* effect.Queue.offer(persistenceQueue, {
    runId: task.runId,
    artifactPath: task.snapshot.artifactPath,
    payload: completedEvent
  });
  // NOTE(review): this is published right after the final payload is enqueued,
  // not after the persistence worker has written it.
  yield* publishEvent({
    type: "ArtifactFlushed",
    runId: task.runId,
    artifactPath: task.snapshot.artifactPath
  });
});
|
|
417
|
+
/**
 * Appends `payload` as one JSON line to `artifactPath`, creating the parent
 * directory first if needed.
 *
 * @param {string} artifactPath - Target JSONL file.
 * @param {unknown} payload - Value serialized with `JSON.stringify`.
 * @returns {Promise<void>}
 */
async function appendJsonLine(artifactPath, payload) {
  const directory = path.dirname(artifactPath);
  await promises.mkdir(directory, { recursive: true });
  const line = `${JSON.stringify(payload)}\n`;
  await promises.appendFile(artifactPath, line, "utf8");
}
|
|
422
|
+
/**
 * Builds the endless worker effect that drains the persistence queue and
 * appends each message, stamped with `runId` and a `ts` timestamp, to the
 * message's artifact file.
 *
 * A failed write is logged and skipped. `Effect.promise` treats a rejection
 * as a defect, so without the `.catch` a single I/O error would terminate the
 * worker and silently stop all later persistence.
 *
 * @param {object} queue - Effect queue of `{ runId, artifactPath, payload }` messages.
 * @returns {object} An Effect that loops forever.
 */
var createPersistenceWorker = (queue) => effect.Effect.forever(
  effect.Effect.gen(function* () {
    const message = yield* effect.Queue.take(queue);
    yield* effect.Effect.promise(
      () => appendJsonLine(message.artifactPath, {
        runId: message.runId,
        ts: Date.now(),
        ...message.payload
      }).catch((error) => {
        console.error(`Failed to persist event for run ${message.runId} to ${message.artifactPath}:`, error);
      })
    );
  })
);
|
|
434
|
+
|
|
435
|
+
// src/runner/search.ts
|
|
436
|
+
/**
 * Tests a value against a list of matchers.
 *
 * @param {string} value - Value to test.
 * @param {Array<string|RegExp>} [matchers] - Strings match by strict equality,
 *   regular expressions by `test`.
 * @returns {boolean} True when any matcher matches, or when the list is
 *   missing or empty.
 */
function matchesAny(value, matchers) {
  if (!matchers || matchers.length === 0) {
    return true;
  }
  return matchers.some((matcher) => {
    if (typeof matcher === "string") {
      return matcher === value;
    }
    // Global/sticky regexes keep `lastIndex` between calls; reset it so a
    // reused matcher gives the same answer every time.
    matcher.lastIndex = 0;
    return matcher.test(value);
  });
}
|
|
444
|
+
/**
 * Tests a path against a list of matchers.
 *
 * @param {string} value - Path to test.
 * @param {Array<string|RegExp>} [matchers] - Strings match as substrings,
 *   regular expressions by `test`.
 * @returns {boolean} True when any matcher matches, or when the list is
 *   missing or empty.
 */
function matchesPath(value, matchers) {
  if (!matchers || matchers.length === 0) {
    return true;
  }
  return matchers.some((matcher) => {
    if (typeof matcher === "string") {
      return value.includes(matcher);
    }
    // Global/sticky regexes keep `lastIndex` between calls; reset it so a
    // reused matcher gives the same answer every time.
    matcher.lastIndex = 0;
    return matcher.test(value);
  });
}
|
|
455
|
+
/**
 * Filters collected test cases by tag and path criteria.
 *
 * An item is dropped when any of its tags matches `excludedTags` or its path
 * matches `excludedPaths`. Of the rest, an item is kept when it satisfies
 * `includedTags` (any tag matches) and `includedPaths`. A missing or empty
 * list imposes no constraint, for exclusions as well as inclusions.
 *
 * @param {Array<{filePath: string, testCase: object}>} all - Collected test cases.
 * @param {object} [query] - Optional `includedTags`, `excludedTags`,
 *   `includedPaths`, `excludedPaths` lists of strings or RegExps.
 * @returns {Array} `all` itself when no query is given, otherwise a filtered copy.
 */
function searchCollectedTestCases(all, query) {
  if (!query) {
    return all;
  }
  // The matchers return true for an empty list (right for inclusions), so an
  // exclusion list must be checked for emptiness before it is consulted.
  const hasEntries = (list) => list != null && list.length > 0;
  return all.filter((item) => {
    const tags = item.testCase.getTags();
    if (hasEntries(query.excludedTags) && tags.some((tag) => matchesAny(tag, query.excludedTags))) {
      return false;
    }
    if (hasEntries(query.excludedPaths) && matchesPath(item.filePath, query.excludedPaths)) {
      return false;
    }
    const includedTagsMatch = !hasEntries(query.includedTags) || tags.some((tag) => matchesAny(tag, query.includedTags));
    const includedPathsMatch = !hasEntries(query.includedPaths) || matchesPath(item.filePath, query.includedPaths);
    return includedTagsMatch && includedPathsMatch;
  });
}
|
|
472
|
+
|
|
473
|
+
// src/runner/api.ts
|
|
474
|
+
/**
 * Splits a `/source/flags` string into its parts.
 *
 * @param {string} pattern - Candidate regex literal.
 * @returns {{source: string, flags: string}|undefined} The parts, or undefined
 *   when the string does not start with "/" or has no second "/".
 */
function parseRegexLiteral(pattern) {
  if (!pattern.startsWith("/")) {
    return void 0;
  }
  const lastSlash = pattern.lastIndexOf("/");
  if (lastSlash <= 0) {
    return void 0;
  }
  return {
    source: pattern.slice(1, lastSlash),
    flags: pattern.slice(lastSlash + 1)
  };
}

/**
 * Compiles a name pattern into a predicate.
 *
 * Supported forms, checked in this order after trimming:
 *  - `/source/flags`: a regular expression literal;
 *  - anything containing `*`: a case-insensitive wildcard over the whole
 *    name, with every other character matched literally;
 *  - otherwise: case-insensitive exact match.
 *
 * @param {string} pattern - User-supplied pattern.
 * @returns {(value: string) => boolean} Predicate over names.
 * @throws {SyntaxError} When a regex literal has an invalid source or flags.
 */
function createNameMatcher(pattern) {
  const normalizedPattern = pattern.trim();
  const regexLiteral = parseRegexLiteral(normalizedPattern);
  if (regexLiteral) {
    const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
    return (value) => {
      // A user-supplied "g"/"y" flag makes test() stateful; reset per call.
      regex.lastIndex = 0;
      return regex.test(value);
    };
  }
  if (normalizedPattern.includes("*")) {
    // Escape every regex metacharacter except "*", including "?", so only the
    // wildcard itself has a special meaning.
    const escaped = normalizedPattern.replace(/[.+?^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
    const regex = new RegExp(`^${escaped}$`, "i");
    return (value) => regex.test(value);
  }
  const lowered = normalizedPattern.toLowerCase();
  return (value) => value.toLowerCase() === lowered;
}
|
|
501
|
+
/**
 * Creates a runner instance.
 *
 * @param {object} [overrides] - Optional configuration overrides, forwarded to
 *   `withRunnerConfig` instead of being dropped.
 * @returns {EffectRunner} A runner built from the resolved configuration.
 */
function createRunner(overrides) {
  return new EffectRunner(withRunnerConfig(overrides));
}
|
|
504
|
+
/**
 * Runner backed by Effect queues and fibers. It discovers datasets,
 * evaluators and test cases on disk, queues runs, executes them on a
 * background scheduler fiber, persists run events through a second worker
 * fiber, and notifies subscribed listeners.
 */
var EffectRunner = class {
  /**
   * @param {object} config - Resolved runner configuration (`discovery`, `artifactDirectory`).
   */
  constructor(config) {
    this.eventBus = effect.Effect.runSync(effect.PubSub.unbounded());
    this.runQueue = effect.Effect.runSync(effect.Queue.unbounded());
    this.persistenceQueue = effect.Effect.runSync(
      effect.Queue.unbounded()
    );
    this.snapshots = /* @__PURE__ */ new Map();
    this.listeners = /* @__PURE__ */ new Set();
    this.datasetsById = /* @__PURE__ */ new Map();
    this.evaluatorsById = /* @__PURE__ */ new Map();
    // Background fibers: one dispatches queued runs, one writes artifact lines.
    this.schedulerFiber = effect.Effect.runFork(
      this.createSchedulerEffect()
    );
    this.persistenceFiber = effect.Effect.runFork(
      createPersistenceWorker(this.persistenceQueue)
    );
    this.config = config;
  }

  /** Re-discovers datasets and replaces the id index. Returns the collected list. */
  async collectDatasets() {
    const datasets = await collectDatasetsFromFiles(this.config.discovery);
    this.datasetsById.clear();
    for (const dataset of datasets) {
      this.datasetsById.set(dataset.id, dataset);
    }
    return datasets;
  }

  /** Re-discovers evaluators and replaces the id index. Returns the collected list. */
  async collectEvaluators() {
    const evaluators = await collectEvaluatorsFromFiles(this.config.discovery);
    this.evaluatorsById.clear();
    for (const evaluator of evaluators) {
      this.evaluatorsById.set(evaluator.id, evaluator);
    }
    return evaluators;
  }

  /**
   * Finds a dataset by name, ignoring case and surrounding whitespace in the
   * query. Datasets are collected first when none are indexed yet.
   *
   * @param {string} name - Dataset name to look for.
   * @returns {Promise<object|undefined>} The first matching collected dataset.
   */
  async resolveDatasetByName(name) {
    if (this.datasetsById.size === 0) {
      await this.collectDatasets();
    }
    const normalized = name.trim().toLowerCase();
    return Array.from(this.datasetsById.values()).find(
      // A dataset without a name must not abort the lookup; the evaluator
      // lookup applies the same empty-string fallback.
      (item) => (item.dataset.getName() ?? "").toLowerCase() === normalized
    );
  }

  /**
   * Returns every collected evaluator whose name matches `pattern`
   * (exact, wildcard or regex literal, see `createNameMatcher`).
   *
   * @param {string} pattern - Name pattern.
   * @returns {Promise<object[]>} Matching collected evaluators.
   */
  async resolveEvaluatorsByNamePattern(pattern) {
    if (this.evaluatorsById.size === 0) {
      await this.collectEvaluators();
    }
    const matcher = createNameMatcher(pattern);
    return Array.from(this.evaluatorsById.values()).filter(
      (item) => matcher(item.evaluator.getName() ?? "")
    );
  }

  /** Discovers all test cases and filters them with `query`. */
  async searchTestCases(query) {
    const testCases = await collectTestCasesFromFiles(this.config.discovery);
    return searchCollectedTestCases(testCases, query);
  }

  /**
   * Returns the discovered test cases accepted by the dataset's `matchesTestCase`.
   *
   * @param {string} datasetId - Id of a collected dataset.
   * @returns {Promise<object[]>} Matching collected test cases.
   * @throws {Error} When the dataset id is unknown.
   */
  async collectDatasetTestCases(datasetId) {
    if (this.datasetsById.size === 0) {
      await this.collectDatasets();
    }
    const dataset = this.datasetsById.get(datasetId);
    if (!dataset) {
      throw new Error(`Unknown dataset: ${datasetId}`);
    }
    const allTestCases = await collectTestCasesFromFiles(this.config.discovery);
    return allTestCases.filter(
      (testCase) => dataset.dataset.matchesTestCase(testCase.testCase, testCase.filePath)
    );
  }

  /**
   * Queues a run of the given dataset with the given evaluators.
   *
   * @param {{datasetId: string, evaluatorIds: string[]}} request - Unknown
   *   evaluator ids are ignored.
   * @returns {Promise<object>} The initial ("queued") run snapshot.
   * @throws {Error} When the dataset is unknown or no evaluator id resolves.
   */
  async runDatasetWith(request) {
    if (this.datasetsById.size === 0) {
      await this.collectDatasets();
    }
    if (this.evaluatorsById.size === 0) {
      await this.collectEvaluators();
    }
    const dataset = this.datasetsById.get(request.datasetId);
    if (!dataset) {
      throw new Error(`Unknown dataset: ${request.datasetId}`);
    }
    const selectedEvaluators = request.evaluatorIds.map((id) => this.evaluatorsById.get(id)).filter((value) => Boolean(value)).map((value) => ({ id: value.id, evaluator: value.evaluator }));
    if (selectedEvaluators.length === 0) {
      throw new Error("No evaluators selected for run");
    }
    const selectedTestCases = await this.collectDatasetTestCases(request.datasetId);
    const runId = `run-${crypto.randomUUID()}`;
    const artifactPath = createArtifactPath(
      this.config.artifactDirectory,
      request.datasetId,
      runId
    );
    const snapshot = {
      runId,
      datasetId: request.datasetId,
      datasetName: dataset.dataset.getName(),
      evaluatorIds: selectedEvaluators.map((item) => item.id),
      queuedAt: Date.now(),
      totalTestCases: selectedTestCases.length,
      completedTestCases: 0,
      passedTestCases: 0,
      failedTestCases: 0,
      status: "queued",
      artifactPath
    };
    this.snapshots.set(runId, snapshot);
    const queuedEvent = {
      type: "RunQueued",
      runId,
      datasetId: request.datasetId,
      datasetName: dataset.dataset.getName(),
      evaluatorIds: selectedEvaluators.map((item) => item.id),
      totalTestCases: selectedTestCases.length,
      artifactPath
    };
    await effect.Effect.runPromise(this.publishEvent(queuedEvent));
    await effect.Effect.runPromise(
      effect.Queue.offer(this.persistenceQueue, {
        runId,
        artifactPath,
        payload: queuedEvent
      })
    );
    await effect.Effect.runPromise(
      effect.Queue.offer(this.runQueue, {
        runId,
        datasetId: request.datasetId,
        dataset: dataset.dataset,
        evaluators: selectedEvaluators,
        testCases: selectedTestCases,
        snapshot
      })
    );
    return snapshot;
  }

  /**
   * Registers a listener for run events.
   *
   * @param {Function} listener - Called with each event.
   * @param {{runId?: string}} [options] - When `runId` is set, only that run's
   *   events are delivered.
   * @returns {Function} Unsubscribe function.
   */
  subscribeRunEvents(listener, options) {
    const entry = { runId: options?.runId, listener };
    this.listeners.add(entry);
    return () => {
      this.listeners.delete(entry);
    };
  }

  /** Returns the current snapshot of a run, or undefined when unknown. */
  getRunSnapshot(runId) {
    return this.snapshots.get(runId);
  }

  /** Returns all run snapshots, most recently queued first. */
  getAllRunSnapshots() {
    return Array.from(this.snapshots.values()).sort(
      (a, b) => b.queuedAt - a.queuedAt
    );
  }

  /** Interrupts both background fibers, then shuts down the queues and the event bus. */
  async shutdown() {
    await effect.Effect.runPromise(effect.Fiber.interrupt(this.schedulerFiber));
    await effect.Effect.runPromise(effect.Fiber.interrupt(this.persistenceFiber));
    await effect.Effect.runPromise(effect.Queue.shutdown(this.runQueue));
    await effect.Effect.runPromise(effect.Queue.shutdown(this.persistenceQueue));
    await effect.Effect.runPromise(effect.PubSub.shutdown(this.eventBus));
  }

  /** Builds the endless effect that takes queued runs and forks their execution. */
  createSchedulerEffect() {
    const self = this;
    return effect.Effect.forever(
      effect.Effect.gen(function* () {
        const task = yield* effect.Queue.take(self.runQueue);
        yield* effect.Effect.fork(
          executeRunTask(
            task,
            self.publishEvent.bind(self),
            self.persistenceQueue,
            self.updateSnapshot.bind(self)
          )
        );
      })
    );
  }

  /** Replaces a run's snapshot with `updater(existing)`; unknown run ids are ignored. */
  updateSnapshot(runId, updater) {
    const existing = this.snapshots.get(runId);
    if (!existing) {
      return;
    }
    this.snapshots.set(runId, updater(existing));
  }

  /**
   * Builds an effect that delivers `event` to every matching listener and
   * then publishes it on the event bus.
   *
   * A throwing listener is logged and skipped, so one faulty subscriber can
   * neither starve the remaining listeners nor fail the fiber that publishes.
   *
   * @param {object} event - Run event carrying at least `type` and `runId`.
   * @returns {object} Effect resolving to void.
   */
  publishEvent(event) {
    return effect.Effect.sync(() => {
      for (const entry of this.listeners) {
        if (entry.runId && entry.runId !== event.runId) {
          continue;
        }
        try {
          entry.listener(event);
        } catch (error) {
          console.error(`Run event listener failed while handling "${event.type}":`, error);
        }
      }
    }).pipe(
      effect.Effect.flatMap(() => effect.PubSub.publish(this.eventBus, event)),
      effect.Effect.asVoid
    );
  }
};
|
|
698
|
+
|
|
699
|
+
// src/cli-simple/args.ts
|
|
700
|
+
/**
 * Parses the simple CLI's argument vector.
 *
 * A leading "run" or "generate" becomes `command`. `--help`/`-h` sets `help`;
 * `--dataset`/`--datasetName` and `--evaluator`/`--name` consume the following
 * token as their value when it is non-empty. Everything else, including a
 * value flag with no value, is collected in `unknownArgs`.
 *
 * @param {string[]} argv - Arguments after the executable and script name.
 * @returns {{help: boolean, unknownArgs: string[], command?: string,
 *   datasetName?: string, evaluatorPattern?: string}} Parsed arguments.
 */
function parseSimpleCliArgs(argv) {
  const args = {
    help: false,
    unknownArgs: []
  };
  const hasCommand = argv[0] === "run" || argv[0] === "generate";
  if (hasCommand) {
    args.command = argv[0];
  }
  let index = hasCommand ? 1 : 0;
  while (index < argv.length) {
    const token = argv[index];
    const next = argv[index + 1];
    index += 1;
    switch (token) {
      case "--help":
      case "-h":
        args.help = true;
        break;
      case "--dataset":
      case "--datasetName":
        if (next) {
          args.datasetName = next;
          index += 1;
        } else {
          args.unknownArgs.push(token);
        }
        break;
      case "--evaluator":
      case "--name":
        if (next) {
          args.evaluatorPattern = next;
          index += 1;
        } else {
          args.unknownArgs.push(token);
        }
        break;
      default:
        args.unknownArgs.push(token);
    }
  }
  return args;
}
|
|
730
|
+
/**
 * Returns the multi-line usage text of the simple CLI.
 *
 * @returns {string} Usage text with lines separated by "\n" and no trailing newline.
 */
function getSimpleCliUsage() {
  const commandSection = "Usage:\n" +
    " eval-agents-simple run --dataset <datasetName> --evaluator <name-or-pattern>\n" +
    " eval-agents-simple generate --dataset <datasetName>\n";
  const patternSection = "Pattern examples for --evaluator:\n" +
    " score-evaluator exact name (case-insensitive)\n" +
    ' "*score*" wildcard pattern\n' +
    ' "/score/i" regex literal';
  // A blank line separates the two sections.
  return commandSection + "\n" + patternSection;
}
|
|
742
|
+
/**
 * Derives the generated-cases file path for a dataset file: same directory,
 * same base name without its last extension, plus ".cases.json".
 *
 * @param {string} datasetFilePath - Path of the dataset module.
 * @returns {string} Path of the JSON output file.
 */
function createOutputPath(datasetFilePath) {
  const { dir, name } = path.parse(datasetFilePath);
  const fileName = `${name}.cases.json`;
  return path.join(dir, fileName);
}
|
|
746
|
+
/**
 * Implements the `generate` command: writes the name and input of every test
 * case of a dataset to a `.cases.json` file next to the dataset module and
 * logs a short summary.
 *
 * @param {object} runner - Provides `resolveDatasetByName` and `collectDatasetTestCases`.
 * @param {string} datasetName - Name of the dataset to export.
 * @returns {Promise<void>}
 * @throws {Error} When no dataset with that name is found.
 */
async function generateDatasetJsonCommand(runner, datasetName) {
  const dataset = await runner.resolveDatasetByName(datasetName);
  if (!dataset) {
    throw new Error(`Dataset "${datasetName}" not found.`);
  }
  const testCases = await runner.collectDatasetTestCases(dataset.id);
  const payload = [];
  for (const item of testCases) {
    payload.push({
      name: item.testCase.getName(),
      input: item.testCase.getInput()
    });
  }
  // The collected dataset path is relative, so resolve it against the working directory.
  const absoluteDatasetPath = path.resolve(process.cwd(), dataset.filePath);
  const outputPath = createOutputPath(absoluteDatasetPath);
  const serialized = `${JSON.stringify(payload, null, 2)}\n`;
  await promises.writeFile(outputPath, serialized, "utf8");
  console.log(`Generated ${payload.length} test cases for dataset "${dataset.dataset.getName()}".`);
  console.log(`Wrote ${outputPath}`);
}
|
|
763
|
+
|
|
764
|
+
// src/cli-simple/run.ts
|
|
765
|
+
// ANSI SGR escape sequences used for terminal styling.
var ansi = (() => {
  const sgr = (code) => `\x1B[${code}m`;
  return {
    reset: sgr(0),
    bold: sgr(1),
    dim: sgr(2),
    green: sgr(32),
    yellow: sgr(33),
    red: sgr(31),
    cyan: sgr(36),
    magenta: sgr(35)
  };
})();

/**
 * Wraps text in a styling sequence followed by the reset sequence.
 *
 * @param {string} text - Text to style.
 * @param {string} color - One or more concatenated escape sequences.
 * @returns {string} `color + text + reset`.
 */
function colorize(text, color) {
  return "".concat(color, text, ansi.reset);
}
|
|
778
|
+
/**
 * Maps a score to a terminal colour: green from 80, yellow from 50, red otherwise.
 *
 * @param {number} score - Score to classify.
 * @returns {string} The matching escape sequence from `ansi`.
 */
function scoreToColor(score) {
  if (score >= 80) {
    return ansi.green;
  }
  return score >= 50 ? ansi.yellow : ansi.red;
}
|
|
787
|
+
/**
 * Builds the per-evaluator line shown in the run summary.
 *
 * @param {string} evaluatorName - Display name, padded to 28 characters.
 * @param {{ total: number, count: number, passed: number, failed: number } | undefined} aggregate
 *   Accumulated numeric scores and pass/fail counts for the evaluator.
 * @returns {string} A line with the colored mean score and pass/fail counts,
 *   or a "no numeric scores" line when nothing was aggregated.
 */
function getEvaluatorSummaryLine(evaluatorName, aggregate) {
  const label = evaluatorName.padEnd(28);
  const hasScores = aggregate ? aggregate.count !== 0 : false;
  if (!hasScores) {
    return `- ${label} no numeric scores`;
  }

  const { total, count, passed, failed } = aggregate;
  const mean = total / count;
  const renderedMean = colorize(mean.toFixed(2), scoreToColor(mean));
  return `- ${label} avg=${renderedMean} passed=${passed} failed=${failed}`;
}
|
|
794
|
+
/**
 * Renders a fixed-width text progress bar made of full and light block glyphs.
 *
 * The value is clamped to `[0, max]` before scaling. Inputs that cannot be
 * turned into a meaningful ratio (NaN value, zero or non-finite `max`) render
 * as an empty bar of the requested width rather than a zero-length string, so
 * columns printed after the bar stay aligned. A non-finite or non-positive
 * `width` renders as an empty string instead of throwing.
 *
 * @param {number} value - Value to display.
 * @param {number} [max=100] - Value that corresponds to a completely full bar.
 * @param {number} [width=20] - Number of character cells in the bar.
 * @returns {string} The bar, always `width` cells long for a valid width.
 */
function createBar(value, max = 100, width = 20) {
  // String.prototype.repeat throws a RangeError for negative or infinite counts.
  const cells = Number.isFinite(width) && width > 0 ? Math.floor(width) : 0;
  const ratio = Math.max(0, Math.min(max, value)) / max;
  // NaN (from a NaN value, 0/0, or Infinity/Infinity) counts as "nothing filled".
  const filled = Number.isFinite(ratio) && ratio > 0 ? Math.round(Math.min(1, ratio) * cells) : 0;
  return `${"\u2588".repeat(filled)}${"\u2591".repeat(cells - filled)}`;
}
|
|
799
|
+
/**
 * Formats the one-line result of a single evaluator for a single test case.
 *
 * The line contains the evaluator name, a PASS/FAIL label, every score
 * rendered according to its registered definition, and any metrics that have
 * a registered definition.
 *
 * Score rendering:
 * - no registered definition: the numeric value with two decimals, or "n/a";
 * - "bar": the formatted value colored by score, followed by a dimmed bar
 *   when a finite number is available;
 * - "passFail": the formatted value colored by the score's own `passed` flag;
 * - "number" and any other strategy: the formatted value as plain text.
 *   (Unrecognised strategies were previously dropped from the line.)
 *
 * @param {string} name - Evaluator display name.
 * @param {Iterable<{ id: string, data: *, passed?: boolean }>} scores - Scores to render.
 * @param {boolean} passed - Whether the evaluator passed overall.
 * @param {Array<{ id: string, data: * }>} [metrics] - Optional metrics; entries
 *   without a registered definition are skipped.
 * @returns {string} The formatted line.
 */
function formatEvaluatorScoreLine(name, scores, passed, metrics) {
  const passLabel = passed
    ? colorize("PASS", `${ansi.bold}${ansi.green}`)
    : colorize("FAIL", `${ansi.bold}${ansi.red}`);

  const scoreParts = [];
  for (const item of scores) {
    const def = getScoreById(item.id);
    if (!def) {
      const numeric = toNumericScore(item.data);
      scoreParts.push(
        numeric !== void 0 ? colorize(numeric.toFixed(2), scoreToColor(numeric)) : "n/a"
      );
      continue;
    }

    const formatted = def.format(item.data);
    switch (def.displayStrategy) {
      case "bar": {
        // Prefer an explicit `value` field; otherwise fall back to the generic conversion.
        const numeric =
          typeof item.data === "object" && item.data !== null && "value" in item.data
            ? item.data.value
            : toNumericScore(item.data);
        if (typeof numeric === "number" && Number.isFinite(numeric)) {
          scoreParts.push(
            `${colorize(formatted, scoreToColor(numeric))} ${colorize(createBar(numeric), ansi.dim)}`
          );
        } else {
          scoreParts.push(formatted);
        }
        break;
      }
      case "passFail": {
        let color = ansi.dim;
        if (item.passed === true) {
          color = `${ansi.bold}${ansi.green}`;
        } else if (item.passed === false) {
          color = `${ansi.bold}${ansi.red}`;
        }
        scoreParts.push(colorize(formatted, color));
        break;
      }
      case "number":
      default:
        // Unknown strategies fall back to plain text so the score is never lost.
        scoreParts.push(formatted);
        break;
    }
  }

  const scoreStr = scoreParts.length > 0 ? scoreParts.join(" ") : "n/a";
  let line = ` ${name}: ${passLabel} ${scoreStr}`;

  if (metrics && metrics.length > 0) {
    const metricParts = [];
    for (const { id, data } of metrics) {
      const def = getMetricById(id);
      if (def) {
        const formatted = def.format(data);
        metricParts.push(def.name ? `[${def.name}: ${formatted}]` : `[${formatted}]`);
      }
    }
    if (metricParts.length > 0) {
      line += ` ${metricParts.join(" ")}`;
    }
  }
  return line;
}
|
|
856
|
+
/**
 * Runs the evaluators matching a name pattern against a dataset and prints
 * live progress followed by a summary.
 *
 * Flow:
 * 1. Resolve the dataset and evaluators; fail with the list of available
 *    names when either lookup comes back empty.
 * 2. Subscribe to run events. Each "TestCaseProgress" event is printed and
 *    folded into per-evaluator and overall score aggregates; "RunCompleted"
 *    or "RunFailed" ends the wait.
 * 3. Start the run, print the header, and show a spinner on TTY outputs.
 * 4. Print the summary (pass/fail counts, averages, per-test-case scores,
 *    artifact path).
 *
 * The event subscription and the spinner timer are always released, including
 * when starting the run throws.
 *
 * @param {object} runner - Runner exposing the dataset/evaluator lookups,
 *   `subscribeRunEvents` and `runDatasetWith`.
 * @param {string} datasetName - Name of the dataset to run.
 * @param {string} evaluatorPattern - Evaluator name or pattern.
 * @returns {Promise<void>}
 * @throws {Error} When the dataset or evaluators cannot be resolved, or when
 *   the run ends with a "RunFailed" event.
 */
async function runSimpleEvalCommand(runner, datasetName, evaluatorPattern) {
  const dataset = await runner.resolveDatasetByName(datasetName);
  if (!dataset) {
    const known = await runner.collectDatasets();
    const available = known.map((item) => item.dataset.getName()).sort();
    throw new Error(
      available.length > 0 ? `Dataset "${datasetName}" not found. Available datasets: ${available.join(", ")}` : `Dataset "${datasetName}" not found and no datasets were discovered.`
    );
  }

  const evaluators = await runner.resolveEvaluatorsByNamePattern(evaluatorPattern);
  if (evaluators.length === 0) {
    const known = await runner.collectEvaluators();
    const available = known.map((item) => item.evaluator.getName()).filter((name) => typeof name === "string").sort();
    throw new Error(
      available.length > 0 ? `No evaluator matched "${evaluatorPattern}". Available evaluators: ${available.join(", ")}` : `No evaluator matched "${evaluatorPattern}" and no evaluators were discovered.`
    );
  }

  // Evaluators without a name are shown by id.
  const evaluatorNameById = new Map(
    evaluators.map((item) => [item.id, item.evaluator.getName() ?? item.id])
  );
  const aggregates = /* @__PURE__ */ new Map();
  const testCaseSummaries = [];
  let overallScoreTotal = 0;
  let overallScoreCount = 0;
  let completedCount = 0;
  let totalCount = 0;
  let runFinished = false;
  const spinnerFrames = ["\u280B", "\u2819", "\u2838", "\u2834", "\u2826", "\u2807"];
  let spinnerIndex = 0;

  // Erases the spinner line; a no-op when stdout is not a TTY.
  function clearLine() {
    if (!process.stdout.isTTY) {
      return;
    }
    process.stdout.write("\r\x1B[2K");
  }

  // Redraws the spinner with the current progress; a no-op once the run ended.
  function drawSpinner() {
    if (!process.stdout.isTTY || runFinished) {
      return;
    }
    const frame = spinnerFrames[spinnerIndex % spinnerFrames.length];
    spinnerIndex += 1;
    process.stdout.write(
      `\r${colorize(frame, ansi.cyan)} Running evaluations ${colorize(
        `${completedCount}/${totalCount}`,
        ansi.bold
      )} ${colorize("(live)", ansi.dim)}`
    );
  }

  // Folds one evaluator result into the per-evaluator and overall aggregates.
  function recordScore(item) {
    const numeric = toNumericScoreFromScores(item.scores);
    if (numeric === void 0) {
      return;
    }
    const current = aggregates.get(item.evaluatorId) ?? {
      total: 0,
      count: 0,
      passed: 0,
      failed: 0
    };
    aggregates.set(item.evaluatorId, {
      total: current.total + numeric,
      count: current.count + 1,
      passed: current.passed + (item.passed ? 1 : 0),
      failed: current.failed + (item.passed ? 0 : 1)
    });
    overallScoreTotal += numeric;
    overallScoreCount += 1;
  }

  let unsubscribe;
  let listening = true;
  // Releases the event subscription at most once. Safe to call before the
  // subscription handle has been assigned (the release is then deferred).
  function stopListening() {
    listening = false;
    if (unsubscribe) {
      const release = unsubscribe;
      unsubscribe = void 0;
      release();
    }
  }

  const done = new Promise((resolve) => {
    unsubscribe = runner.subscribeRunEvents((event) => {
      if (event.type === "TestCaseProgress") {
        completedCount = event.completedTestCases;
        const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
        const averageScore = numericScores.length > 0 ? numericScores.reduce((sum, value) => sum + value, 0) / numericScores.length : void 0;
        clearLine();
        console.log(
          `${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi.cyan)} ${event.testCaseName} ${colorize(`(${event.durationMs}ms)`, ansi.dim)}`
        );
        for (const item of event.evaluatorScores) {
          const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
          console.log(
            formatEvaluatorScoreLine(
              name,
              item.scores,
              item.passed,
              item.metrics
            )
          );
          recordScore(item);
        }
        testCaseSummaries.push({
          name: event.testCaseName,
          averageScore,
          durationMs: event.durationMs,
          passed: event.passed
        });
        drawSpinner();
      }
      if (event.type === "RunCompleted" || event.type === "RunFailed") {
        runFinished = true;
        clearLine();
        stopListening();
        resolve(event);
      }
    });
    // A terminal event delivered synchronously during subscription could not
    // release the handle yet; do it now.
    if (!listening) {
      stopListening();
    }
  });

  let spinnerTimer;
  let finalEvent;
  try {
    const snapshot = await runner.runDatasetWith({
      datasetId: dataset.id,
      evaluatorIds: evaluators.map((item) => item.id)
    });
    totalCount = snapshot.totalTestCases;
    console.log(colorize("=== Eval Run Started ===", `${ansi.bold}${ansi.cyan}`));
    console.log(`Run: ${colorize(snapshot.runId, ansi.cyan)}`);
    console.log(`Dataset: ${colorize(snapshot.datasetName, ansi.bold)}`);
    console.log(
      `Evaluators: ${evaluators.map((item) => item.evaluator.getName() ?? item.id).join(", ")}`
    );
    console.log(
      `Total test cases: ${colorize(String(snapshot.totalTestCases), ansi.bold)}`
    );
    console.log("");
    drawSpinner();
    spinnerTimer = setInterval(drawSpinner, 100);
    finalEvent = await done;
  } finally {
    // Always release the timer and the subscription, even if the run could
    // not be started, so nothing keeps the process or the runner busy.
    runFinished = true;
    if (spinnerTimer) {
      clearInterval(spinnerTimer);
    }
    stopListening();
  }

  if (finalEvent.type === "RunFailed") {
    throw new Error(`Run failed: ${finalEvent.errorMessage}`);
  }

  console.log("");
  console.log(colorize("=== Run Summary ===", `${ansi.bold}${ansi.cyan}`));
  console.log(
    `- passed: ${colorize(
      `${finalEvent.passedTestCases}/${finalEvent.totalTestCases}`,
      ansi.green
    )}`
  );
  console.log(
    `- failed: ${colorize(
      `${finalEvent.failedTestCases}/${finalEvent.totalTestCases}`,
      finalEvent.failedTestCases > 0 ? ansi.red : ansi.dim
    )}`
  );
  if (overallScoreCount > 0) {
    const overallAverage = overallScoreTotal / overallScoreCount;
    console.log(
      `- overall avg score: ${colorize(
        overallAverage.toFixed(2),
        scoreToColor(overallAverage)
      )} ${colorize(createBar(overallAverage), ansi.dim)}`
    );
  }
  console.log(colorize("- evaluator averages:", ansi.magenta));
  for (const [evaluatorId, evaluatorName] of evaluatorNameById.entries()) {
    console.log(
      getEvaluatorSummaryLine(evaluatorName, aggregates.get(evaluatorId))
    );
  }
  if (testCaseSummaries.length > 0) {
    console.log(colorize("- test case scores:", ansi.magenta));
    for (const summary of testCaseSummaries) {
      const status = summary.passed ? colorize("PASS", ansi.green) : colorize("FAIL", ansi.red);
      if (summary.averageScore === void 0) {
        console.log(
          ` ${status} ${summary.name.padEnd(24)} score=n/a ${colorize(`(${summary.durationMs}ms)`, ansi.dim)}`
        );
        continue;
      }
      console.log(
        ` ${status} ${summary.name.padEnd(24)} score=${colorize(
          summary.averageScore.toFixed(2),
          scoreToColor(summary.averageScore)
        )} ${colorize(createBar(summary.averageScore, 100, 14), ansi.dim)} ${colorize(`(${summary.durationMs}ms)`, ansi.dim)}`
      );
    }
  }
  console.log(`- artifact: ${colorize(finalEvent.artifactPath, ansi.dim)}`);
}
|
|
1032
|
+
|
|
1033
|
+
// src/cli-simple/index.ts
|
|
1034
|
+
/**
 * Prints the CLI usage text and terminates the process.
 *
 * The usage goes to stdout when exiting successfully and to stderr otherwise.
 *
 * @param {number} exitCode - Process exit code; 0 means success.
 */
function printUsageAndExit(exitCode) {
  const usage = getSimpleCliUsage();
  if (exitCode === 0) {
    console.log(usage);
  } else {
    console.error(usage);
  }
  process.exit(exitCode);
}
|
|
1039
|
+
/**
 * CLI entry point: parses the arguments, validates them, and dispatches to
 * the "run" command or to the dataset JSON generation command.
 *
 * All argument validation happens before the runner is created. The usage
 * helper terminates the process, which would skip the `finally` block below,
 * so exiting after the runner exists would leave it without a shutdown.
 *
 * @returns {Promise<void>} Resolves once the command finished and the runner
 *   has been shut down.
 */
async function main() {
  const args = parseSimpleCliArgs(process.argv.slice(2));
  if (args.help) {
    printUsageAndExit(0);
  }
  if (args.unknownArgs.length > 0) {
    console.error(`Unknown arguments: ${args.unknownArgs.join(", ")}`);
    printUsageAndExit(1);
  }
  if (!args.command) {
    printUsageAndExit(1);
  }
  if (!args.datasetName) {
    console.error("Missing required --dataset <datasetName> argument.");
    printUsageAndExit(1);
  }

  const isRunCommand = args.command === "run";
  if (isRunCommand && !args.evaluatorPattern) {
    console.error("Missing required --evaluator <name-or-pattern> argument.");
    printUsageAndExit(1);
  }

  const runner = createRunner();
  try {
    if (isRunCommand) {
      await runSimpleEvalCommand(runner, args.datasetName, args.evaluatorPattern);
      return;
    }
    // Any other command generates the dataset JSON file.
    await generateDatasetJsonCommand(runner, args.datasetName);
  } finally {
    await runner.shutdown();
  }
}
|
|
1070
|
+
// Start the CLI. Any failure is reported on stderr (the error's message, or a
// generic text for non-Error values) and the process exits with code 1.
void main().catch((error) => {
  const message = error instanceof Error ? error.message : "Command failed";
  console.error(message);
  process.exit(1);
});
|
|
1074
|
+
//# sourceMappingURL=out.js.map
|
|
1075
|
+
//# sourceMappingURL=cli-simple.cjs.map
|