vieval 0.0.8 → 0.0.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +51 -4
- package/dist/bin/vieval.mjs +1 -1
- package/dist/cli/index.mjs +1 -1
- package/dist/{cli-Dao25VxV.mjs → cli-DTDgaqeI.mjs} +669 -599
- package/dist/cli-DTDgaqeI.mjs.map +1 -0
- package/dist/config.d.mts +1 -1
- package/dist/core/assertions/index.d.mts +1 -1
- package/dist/core/inference-executors/index.d.mts +1 -1
- package/dist/core/inference-executors/index.mjs +10 -4
- package/dist/core/inference-executors/index.mjs.map +1 -1
- package/dist/core/processors/results/index.d.mts +1 -1
- package/dist/core/runner/index.d.mts +2 -2
- package/dist/core/runner/index.mjs +2 -2
- package/dist/core/scheduler/index.d.mts +1 -1
- package/dist/core/scheduler/index.mjs +2 -2
- package/dist/core/scheduler/index.mjs.map +1 -1
- package/dist/{env-BeHv_5mo.d.mts → env-DfWZy_n4.d.mts} +14 -9
- package/dist/env-nV5rVErX.mjs +35 -0
- package/dist/env-nV5rVErX.mjs.map +1 -0
- package/dist/{index-fakXoZEe.d.mts → index-Bg0atWBF.d.mts} +4 -3
- package/dist/{index-BkjyCInx.d.mts → index-D_aMeWqO.d.mts} +2 -2
- package/dist/index.d.mts +2 -2
- package/dist/index.mjs +21 -26
- package/dist/index.mjs.map +1 -1
- package/dist/plugins/chat-models/index.d.mts +1 -1
- package/dist/plugins/chat-models/index.mjs +15 -13
- package/dist/plugins/chat-models/index.mjs.map +1 -1
- package/dist/{registry-BHGMxjpA.mjs → registry-DMnwE_mY.mjs} +54 -10
- package/dist/registry-DMnwE_mY.mjs.map +1 -0
- package/package.json +1 -1
- package/dist/cli-Dao25VxV.mjs.map +0 -1
- package/dist/env-BFSjny07.mjs +0 -41
- package/dist/env-BFSjny07.mjs.map +0 -1
- package/dist/registry-BHGMxjpA.mjs.map +0 -1
|
@@ -1,16 +1,17 @@
|
|
|
1
|
-
import { a as createOpenTelemetryRuntime, c as detectCliConfigMode, d as loadVievalCliConfig, n as consumeModuleRegistrations, o as createNoopTelemetryRuntime, r as endModuleRegistration, t as beginModuleRegistration, u as loadRawVievalConfig } from "./registry-
|
|
1
|
+
import { a as createOpenTelemetryRuntime, c as detectCliConfigMode, d as loadVievalCliConfig, n as consumeModuleRegistrations, o as createNoopTelemetryRuntime, r as endModuleRegistration, t as beginModuleRegistration, u as loadRawVievalConfig } from "./registry-DMnwE_mY.mjs";
|
|
2
2
|
import { createSchedulerRuntime } from "./core/scheduler/index.mjs";
|
|
3
3
|
import { RunnerExecutionError, collectEvalEntries, createFilesystemTaskCacheRuntime, createRunnerRuntimeContext, createRunnerSchedule, createTaskExecutionContext, runScheduledTasks } from "./core/runner/index.mjs";
|
|
4
4
|
import process from "node:process";
|
|
5
5
|
import { errorMessageFrom } from "@moeru/std";
|
|
6
6
|
import meow from "meow";
|
|
7
|
+
import { access, mkdir, mkdtemp, writeFile } from "node:fs/promises";
|
|
8
|
+
import { tmpdir } from "node:os";
|
|
7
9
|
import { basename, dirname, isAbsolute, join, relative, resolve } from "node:path";
|
|
8
|
-
import { access, mkdir, writeFile } from "node:fs/promises";
|
|
9
10
|
import { glob } from "tinyglobby";
|
|
10
11
|
import { pathToFileURL } from "node:url";
|
|
12
|
+
import { existsSync, readFileSync } from "node:fs";
|
|
11
13
|
import { randomUUID } from "node:crypto";
|
|
12
14
|
import c from "tinyrainbow";
|
|
13
|
-
import { existsSync, readFileSync } from "node:fs";
|
|
14
15
|
import { uniq } from "es-toolkit";
|
|
15
16
|
import { createVitest } from "vitest/node";
|
|
16
17
|
import { formatDuration, intervalToDuration } from "date-fns";
|
|
@@ -156,393 +157,99 @@ async function loadVievalComparisonConfig(options = {}) {
|
|
|
156
157
|
}
|
|
157
158
|
}
|
|
158
159
|
//#endregion
|
|
159
|
-
//#region src/cli/report-
|
|
160
|
-
/**
|
|
161
|
-
* Builds a compact compare report sorted by hybrid/exact score.
|
|
162
|
-
*/
|
|
163
|
-
function buildCompareReportArtifact(args) {
|
|
164
|
-
const rows = args.methods.map((method) => {
|
|
165
|
-
const overall = method.output.projects[0]?.result?.overall;
|
|
166
|
-
return {
|
|
167
|
-
exactAverage: overall?.exactAverage ?? null,
|
|
168
|
-
hybridAverage: overall?.hybridAverage ?? null,
|
|
169
|
-
methodId: method.methodId,
|
|
170
|
-
runCount: overall?.runCount ?? 0
|
|
171
|
-
};
|
|
172
|
-
});
|
|
173
|
-
rows.sort((left, right) => {
|
|
174
|
-
const leftHybrid = left.hybridAverage ?? Number.NEGATIVE_INFINITY;
|
|
175
|
-
const rightHybrid = right.hybridAverage ?? Number.NEGATIVE_INFINITY;
|
|
176
|
-
if (leftHybrid !== rightHybrid) return rightHybrid - leftHybrid;
|
|
177
|
-
const leftExact = left.exactAverage ?? Number.NEGATIVE_INFINITY;
|
|
178
|
-
return (right.exactAverage ?? Number.NEGATIVE_INFINITY) - leftExact;
|
|
179
|
-
});
|
|
180
|
-
return {
|
|
181
|
-
benchmarkId: args.benchmarkId,
|
|
182
|
-
methods: rows,
|
|
183
|
-
reportPath: args.reportPath
|
|
184
|
-
};
|
|
185
|
-
}
|
|
186
|
-
/**
|
|
187
|
-
* Writes compare report artifact as JSON.
|
|
188
|
-
*/
|
|
189
|
-
async function writeCompareReportArtifact(args) {
|
|
190
|
-
const outputPath = resolve(args.outputPath);
|
|
191
|
-
await mkdir(dirname(outputPath), { recursive: true });
|
|
192
|
-
await writeFile(outputPath, `${JSON.stringify(args.artifact, null, 2)}\n`, "utf-8");
|
|
193
|
-
return outputPath;
|
|
194
|
-
}
|
|
195
|
-
//#endregion
|
|
196
|
-
//#region src/cli/discovery.ts
|
|
197
|
-
/**
|
|
198
|
-
* Discovers eval files using include/exclude globs relative to project root.
|
|
199
|
-
*
|
|
200
|
-
* Before:
|
|
201
|
-
* - Absolute path file list from recursive filesystem walk
|
|
202
|
-
*
|
|
203
|
-
* After:
|
|
204
|
-
* - Filtered absolute path list matching include/exclude rules
|
|
205
|
-
*/
|
|
206
|
-
async function discoverEvalFiles(options) {
|
|
207
|
-
return uniq(await glob([...options.include], {
|
|
208
|
-
absolute: true,
|
|
209
|
-
cwd: options.root,
|
|
210
|
-
ignore: [...options.exclude],
|
|
211
|
-
onlyFiles: true
|
|
212
|
-
})).sort((left, right) => left.localeCompare(right));
|
|
213
|
-
}
|
|
214
|
-
//#endregion
|
|
215
|
-
//#region src/cli/module-runtime.ts
|
|
160
|
+
//#region src/cli/report-records.ts
|
|
216
161
|
/**
|
|
217
|
-
*
|
|
162
|
+
* Builds normalized case records from lifecycle, metric, and score events.
|
|
218
163
|
*
|
|
219
164
|
* Use when:
|
|
220
|
-
* -
|
|
221
|
-
* -
|
|
165
|
+
* - `events.jsonl` should be projected into `cases.jsonl`
|
|
166
|
+
* - report commands need one final record per observed case outcome
|
|
222
167
|
*
|
|
223
168
|
* Expects:
|
|
224
|
-
* -
|
|
225
|
-
* -
|
|
169
|
+
* - events are ordered by occurrence where possible
|
|
170
|
+
* - lifecycle events use either `task.case.start`/`task.case.end` or current CLI `CaseStarted`/`CaseEnded` names
|
|
226
171
|
*
|
|
227
172
|
* Returns:
|
|
228
|
-
* -
|
|
173
|
+
* - records for cases that emitted an end lifecycle event
|
|
229
174
|
*/
|
|
230
|
-
|
|
231
|
-
const
|
|
232
|
-
const
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
const definitions = [...registeredDefinitions, ...defaultDefinition == null ? [] : [defaultDefinition]];
|
|
248
|
-
const deduplicatedDefinitions = definitions.filter((definition, index) => {
|
|
249
|
-
const key = `${definition.name}::${definition.description}::${definition.task?.id ?? ""}`;
|
|
250
|
-
return definitions.findIndex((candidate) => `${candidate.name}::${candidate.description}::${candidate.task?.id ?? ""}` === key) === index;
|
|
251
|
-
});
|
|
252
|
-
if (deduplicatedDefinitions.length === 0) continue;
|
|
253
|
-
for (const [definitionIndex, definition] of deduplicatedDefinitions.entries()) {
|
|
254
|
-
const moduleKey = definitionIndex === 0 ? moduleHref : `${moduleHref}#registration-${definitionIndex + 1}`;
|
|
255
|
-
loadedModules[moduleKey] = { default: definition };
|
|
256
|
-
}
|
|
257
|
-
} finally {
|
|
258
|
-
endModuleRegistration();
|
|
259
|
-
}
|
|
175
|
+
function buildCaseRecords(args) {
	// One mutable draft per case, filled in incrementally as its events arrive.
	const drafts = /* @__PURE__ */ new Map();
	// A Set iterates in first-insertion order, so the output order is the same as
	// the previous array + includes() bookkeeping, without rescanning the list
	// on every end event.
	const completedKeys = /* @__PURE__ */ new Set();
	for (const event of args.events) {
		const normalizedEvent = normalizeCaseEventName(event.event);
		// Events that are not case lifecycle/metric/score events are ignored.
		if (normalizedEvent == null) continue;
		const ids = extractEventIds(event, args);
		// A case can only be tracked when both identifiers are non-empty.
		if (ids.caseId.length === 0 || ids.taskId.length === 0) continue;
		const draft = getOrCreateDraft(drafts, ids, event, args);
		applyIdentity(draft, ids, event, args);
		if (normalizedEvent === "start") applyCaseStart(draft, event);
		else if (normalizedEvent === "metric") applyCaseMetric(draft, event);
		else if (normalizedEvent === "score") applyCaseScore(draft, event);
		else {
			applyCaseEnd(draft, event);
			completedKeys.add(createCaseKey(ids.taskId, ids.caseId));
		}
	}
	// Only cases that saw an end event, and whose draft has an end time, become records.
	return [...completedKeys].map((key) => drafts.get(key)).filter((draft) => draft != null && draft.endedAt != null).map(toCaseRecord);
}
|
|
266
|
-
//#endregion
|
|
267
|
-
//#region src/cli/report-selectors.ts
|
|
268
196
|
/**
|
|
269
|
-
*
|
|
197
|
+
* Builds generic score summaries overall and grouped by arbitrary keys.
|
|
270
198
|
*
|
|
271
199
|
* Use when:
|
|
272
|
-
* - report
|
|
273
|
-
* -
|
|
200
|
+
* - report artifacts need benchmark-neutral aggregate score views
|
|
201
|
+
* - callers want to group by metrics such as `benchmark.category` or direct record fields such as `taskId`
|
|
274
202
|
*
|
|
275
203
|
* Expects:
|
|
276
|
-
* - `
|
|
204
|
+
* - `groupByKeys` are stable metric names or direct `CaseRecord` field names
|
|
205
|
+
* - record score values are normalized numeric scores
|
|
277
206
|
*
|
|
278
207
|
* Returns:
|
|
279
|
-
* -
|
|
208
|
+
* - overall score buckets and group buckets keyed by `<key>=<value>`
|
|
280
209
|
*/
|
|
281
|
-
function
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
}
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
210
|
+
function buildMetricsSummary(records, groupByKeys) {
|
|
211
|
+
const overall = {};
|
|
212
|
+
const groups = {};
|
|
213
|
+
for (const record of records) {
|
|
214
|
+
addRecordScores(overall, record);
|
|
215
|
+
for (const groupByKey of groupByKeys) {
|
|
216
|
+
const groupValue = getGroupValue(record, groupByKey);
|
|
217
|
+
if (!groupValue.exists) continue;
|
|
218
|
+
const groupKey = `${groupByKey}=${String(groupValue.value)}`;
|
|
219
|
+
groups[groupKey] ??= {};
|
|
220
|
+
addRecordScores(groups[groupKey], record);
|
|
221
|
+
}
|
|
222
|
+
}
|
|
223
|
+
return {
|
|
224
|
+
groups: finalizeSummaryGroups(groups),
|
|
225
|
+
overall: finalizeScoreSummary(overall)
|
|
297
226
|
};
|
|
298
|
-
return { exists: false };
|
|
299
|
-
}
|
|
300
|
-
/**
|
|
301
|
-
* Stable-stringifies JSON-like values for report comparisons.
|
|
302
|
-
*
|
|
303
|
-
* Before:
|
|
304
|
-
* - `{ b: 1, a: true }`
|
|
305
|
-
*
|
|
306
|
-
* After:
|
|
307
|
-
* - `{"a":true,"b":1}`
|
|
308
|
-
*/
|
|
309
|
-
function stableStringify(value) {
|
|
310
|
-
if (value == null || typeof value !== "object") return JSON.stringify(value);
|
|
311
|
-
if (Array.isArray(value)) return `[${value.map((item) => stableStringify(item)).join(",")}]`;
|
|
312
|
-
const record = value;
|
|
313
|
-
return `{${Object.keys(record).sort((left, right) => left.localeCompare(right)).map((key) => `${JSON.stringify(key)}:${stableStringify(record[key])}`).join(",")}}`;
|
|
314
227
|
}
|
|
315
|
-
//#endregion
|
|
316
|
-
//#region src/cli/report-otlp.ts
|
|
317
228
|
/**
|
|
318
|
-
*
|
|
229
|
+
* Encodes records as newline-delimited JSON.
|
|
319
230
|
*
|
|
320
231
|
* Use when:
|
|
321
|
-
* - writing
|
|
322
|
-
* -
|
|
232
|
+
* - writing `cases.jsonl` for command-line tools, dataframes, or streaming parsers
|
|
233
|
+
* - each record should occupy exactly one JSON line
|
|
323
234
|
*
|
|
324
235
|
* Expects:
|
|
325
|
-
* - records
|
|
236
|
+
* - records are JSON-serializable case records
|
|
326
237
|
*
|
|
327
238
|
* Returns:
|
|
328
|
-
* -
|
|
239
|
+
* - one JSON object per line with a trailing newline for non-empty input
|
|
329
240
|
*/
|
|
330
|
-
function
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
"vieval.task.id": task.taskId
|
|
343
|
-
}),
|
|
344
|
-
name: "vieval.task"
|
|
345
|
-
}));
|
|
346
|
-
const caseSpans = args.records.map((record) => ({
|
|
347
|
-
attributes: toAttributes({
|
|
348
|
-
...record.metrics,
|
|
349
|
-
"vieval.case.duration_ms": record.durationMs,
|
|
350
|
-
"vieval.case.id": record.caseId,
|
|
351
|
-
"vieval.case.name": record.caseName,
|
|
352
|
-
"vieval.case.retry_count": record.retryCount,
|
|
353
|
-
"vieval.case.state": record.state,
|
|
354
|
-
"vieval.project.name": record.projectName,
|
|
355
|
-
"vieval.task.id": record.taskId
|
|
356
|
-
}),
|
|
357
|
-
endTimeUnixNano: isoToUnixNano(record.endedAt),
|
|
358
|
-
name: "vieval.case",
|
|
359
|
-
startTimeUnixNano: isoToUnixNano(record.startedAt)
|
|
360
|
-
}));
|
|
361
|
-
return {
|
|
362
|
-
logs: { resourceLogs: [{ scopeLogs: [{
|
|
363
|
-
logRecords: args.records.map((record) => ({
|
|
364
|
-
attributes: toAttributes(record.metrics),
|
|
365
|
-
body: { stringValue: JSON.stringify({
|
|
366
|
-
caseId: record.caseId,
|
|
367
|
-
scores: record.scores,
|
|
368
|
-
state: record.state
|
|
369
|
-
}) },
|
|
370
|
-
eventName: "vieval.case",
|
|
371
|
-
timeUnixNano: isoToUnixNano(record.endedAt)
|
|
372
|
-
})),
|
|
373
|
-
scope: { name: "vieval" }
|
|
374
|
-
}] }] },
|
|
375
|
-
metrics: { resourceMetrics: [{ scopeMetrics: [{
|
|
376
|
-
metrics: collectScoreKinds(args.records).map((kind) => ({
|
|
377
|
-
gauge: { dataPoints: args.records.filter((record) => typeof record.scores[kind] === "number").map((record) => ({
|
|
378
|
-
asDouble: record.scores[kind],
|
|
379
|
-
attributes: toAttributes({
|
|
380
|
-
...record.metrics,
|
|
381
|
-
"vieval.case.id": record.caseId,
|
|
382
|
-
"vieval.task.id": record.taskId
|
|
383
|
-
}),
|
|
384
|
-
timeUnixNano: isoToUnixNano(record.endedAt)
|
|
385
|
-
})) },
|
|
386
|
-
name: `vieval.score.${kind}`
|
|
387
|
-
})),
|
|
388
|
-
scope: { name: "vieval" }
|
|
389
|
-
}] }] },
|
|
390
|
-
traces: { resourceSpans: [{ scopeSpans: [{
|
|
391
|
-
scope: { name: "vieval" },
|
|
392
|
-
spans: [
|
|
393
|
-
{
|
|
394
|
-
attributes: toAttributes({ "vieval.run.id": args.runId }),
|
|
395
|
-
name: "vieval.run"
|
|
396
|
-
},
|
|
397
|
-
...projectSpans,
|
|
398
|
-
...taskSpans,
|
|
399
|
-
...caseSpans
|
|
400
|
-
]
|
|
401
|
-
}] }] }
|
|
402
|
-
};
|
|
403
|
-
}
|
|
404
|
-
function toAttributes(attributes) {
|
|
405
|
-
return Object.entries(attributes).filter(([, value]) => value !== void 0).sort(([leftKey], [rightKey]) => leftKey.localeCompare(rightKey)).map(([key, value]) => ({
|
|
406
|
-
key,
|
|
407
|
-
value: toAnyValue(value)
|
|
408
|
-
}));
|
|
409
|
-
}
|
|
410
|
-
function toAnyValue(value) {
|
|
411
|
-
if (Array.isArray(value)) return { arrayValue: { values: value.map((item) => toAnyValue(item)) } };
|
|
412
|
-
if (isAttributeScalar(value)) {
|
|
413
|
-
if (typeof value === "boolean") return { boolValue: value };
|
|
414
|
-
if (typeof value === "number") return Number.isFinite(value) ? { doubleValue: value } : { stringValue: String(value) };
|
|
415
|
-
if (value == null) return { stringValue: "null" };
|
|
416
|
-
return { stringValue: value };
|
|
417
|
-
}
|
|
418
|
-
return { stringValue: stableStringify(value) };
|
|
419
|
-
}
|
|
420
|
-
function isAttributeScalar(value) {
|
|
421
|
-
return value == null || typeof value === "boolean" || typeof value === "number" || typeof value === "string";
|
|
422
|
-
}
|
|
423
|
-
function isoToUnixNano(value) {
|
|
424
|
-
const preciseMatch = /^(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2})(?:\.(\d{1,9}))?(Z|[+-]\d{2}:\d{2})$/.exec(value);
|
|
425
|
-
if (preciseMatch != null) {
|
|
426
|
-
const [, secondsPart, fraction = "", zone] = preciseMatch;
|
|
427
|
-
const unixMilliseconds = Date.parse(`${secondsPart}.000${zone}`);
|
|
428
|
-
if (!Number.isFinite(unixMilliseconds)) return "0";
|
|
429
|
-
return String(BigInt(unixMilliseconds) * 1000000n + BigInt(fraction.padEnd(9, "0").slice(0, 9)));
|
|
430
|
-
}
|
|
431
|
-
const unixMilliseconds = Date.parse(value);
|
|
432
|
-
if (!Number.isFinite(unixMilliseconds)) return "0";
|
|
433
|
-
return String(BigInt(unixMilliseconds) * 1000000n);
|
|
434
|
-
}
|
|
435
|
-
function collectScoreKinds(records) {
|
|
436
|
-
return [...new Set(records.flatMap((record) => Object.keys(record.scores)))].sort((left, right) => left.localeCompare(right));
|
|
437
|
-
}
|
|
438
|
-
function collectProjectNames(records) {
|
|
439
|
-
return [...new Set(records.map((record) => record.projectName))].sort((left, right) => left.localeCompare(right));
|
|
440
|
-
}
|
|
441
|
-
function collectTasks(records) {
|
|
442
|
-
const tasks = /* @__PURE__ */ new Map();
|
|
443
|
-
for (const record of records) tasks.set(`${record.projectName}\0${record.taskId}`, {
|
|
444
|
-
projectName: record.projectName,
|
|
445
|
-
taskId: record.taskId
|
|
446
|
-
});
|
|
447
|
-
return [...tasks.values()].sort((left, right) => {
|
|
448
|
-
const projectOrder = left.projectName.localeCompare(right.projectName);
|
|
449
|
-
return projectOrder === 0 ? left.taskId.localeCompare(right.taskId) : projectOrder;
|
|
450
|
-
});
|
|
451
|
-
}
|
|
452
|
-
//#endregion
|
|
453
|
-
//#region src/cli/report-records.ts
|
|
454
|
-
/**
|
|
455
|
-
* Builds normalized case records from lifecycle, metric, and score events.
|
|
456
|
-
*
|
|
457
|
-
* Use when:
|
|
458
|
-
* - `events.jsonl` should be projected into `cases.jsonl`
|
|
459
|
-
* - report commands need one final record per observed case outcome
|
|
460
|
-
*
|
|
461
|
-
* Expects:
|
|
462
|
-
* - events are ordered by occurrence where possible
|
|
463
|
-
* - lifecycle events use either `task.case.start`/`task.case.end` or current CLI `CaseStarted`/`CaseEnded` names
|
|
464
|
-
*
|
|
465
|
-
* Returns:
|
|
466
|
-
* - records for cases that emitted an end lifecycle event
|
|
467
|
-
*/
|
|
468
|
-
function buildCaseRecords(args) {
|
|
469
|
-
const drafts = /* @__PURE__ */ new Map();
|
|
470
|
-
const completedKeys = [];
|
|
471
|
-
for (const event of args.events) {
|
|
472
|
-
const normalizedEvent = normalizeCaseEventName(event.event);
|
|
473
|
-
if (normalizedEvent == null) continue;
|
|
474
|
-
const ids = extractEventIds(event, args);
|
|
475
|
-
if (ids.caseId.length === 0 || ids.taskId.length === 0) continue;
|
|
476
|
-
const draft = getOrCreateDraft(drafts, ids, event, args);
|
|
477
|
-
applyIdentity(draft, ids, event, args);
|
|
478
|
-
if (normalizedEvent === "start") applyCaseStart(draft, event);
|
|
479
|
-
else if (normalizedEvent === "metric") applyCaseMetric(draft, event);
|
|
480
|
-
else if (normalizedEvent === "score") applyCaseScore(draft, event);
|
|
481
|
-
else {
|
|
482
|
-
applyCaseEnd(draft, event);
|
|
483
|
-
const key = createCaseKey(ids.taskId, ids.caseId);
|
|
484
|
-
if (!completedKeys.includes(key)) completedKeys.push(key);
|
|
485
|
-
}
|
|
486
|
-
}
|
|
487
|
-
return completedKeys.map((key) => drafts.get(key)).filter((draft) => draft != null && draft.endedAt != null).map(toCaseRecord);
|
|
488
|
-
}
|
|
489
|
-
/**
|
|
490
|
-
* Builds generic score summaries overall and grouped by arbitrary keys.
|
|
491
|
-
*
|
|
492
|
-
* Use when:
|
|
493
|
-
* - report artifacts need benchmark-neutral aggregate score views
|
|
494
|
-
* - callers want to group by metrics such as `benchmark.category` or direct record fields such as `taskId`
|
|
495
|
-
*
|
|
496
|
-
* Expects:
|
|
497
|
-
* - `groupByKeys` are stable metric names or direct `CaseRecord` field names
|
|
498
|
-
* - record score values are normalized numeric scores
|
|
499
|
-
*
|
|
500
|
-
* Returns:
|
|
501
|
-
* - overall score buckets and group buckets keyed by `<key>=<value>`
|
|
502
|
-
*/
|
|
503
|
-
function buildMetricsSummary(records, groupByKeys) {
|
|
504
|
-
const overall = {};
|
|
505
|
-
const groups = {};
|
|
506
|
-
for (const record of records) {
|
|
507
|
-
addRecordScores(overall, record);
|
|
508
|
-
for (const groupByKey of groupByKeys) {
|
|
509
|
-
const groupValue = getGroupValue(record, groupByKey);
|
|
510
|
-
if (!groupValue.exists) continue;
|
|
511
|
-
const groupKey = `${groupByKey}=${String(groupValue.value)}`;
|
|
512
|
-
groups[groupKey] ??= {};
|
|
513
|
-
addRecordScores(groups[groupKey], record);
|
|
514
|
-
}
|
|
515
|
-
}
|
|
516
|
-
return {
|
|
517
|
-
groups: finalizeSummaryGroups(groups),
|
|
518
|
-
overall: finalizeScoreSummary(overall)
|
|
519
|
-
};
|
|
520
|
-
}
|
|
521
|
-
/**
|
|
522
|
-
* Encodes records as newline-delimited JSON.
|
|
523
|
-
*
|
|
524
|
-
* Use when:
|
|
525
|
-
* - writing `cases.jsonl` for command-line tools, dataframes, or streaming parsers
|
|
526
|
-
* - each record should occupy exactly one JSON line
|
|
527
|
-
*
|
|
528
|
-
* Expects:
|
|
529
|
-
* - records are JSON-serializable case records
|
|
530
|
-
*
|
|
531
|
-
* Returns:
|
|
532
|
-
* - one JSON object per line with a trailing newline for non-empty input
|
|
533
|
-
*/
|
|
534
|
-
function encodeJsonl(records) {
|
|
535
|
-
if (records.length === 0) return "";
|
|
536
|
-
return `${records.map((record) => JSON.stringify(record)).join("\n")}\n`;
|
|
537
|
-
}
|
|
538
|
-
function normalizeCaseEventName(eventName) {
|
|
539
|
-
if (eventName === "task.case.start" || eventName === "CaseStarted") return "start";
|
|
540
|
-
if (eventName === "task.case.metric") return "metric";
|
|
541
|
-
if (eventName === "task.case.score") return "score";
|
|
542
|
-
if (eventName === "task.case.end" || eventName === "CaseEnded") return "end";
|
|
543
|
-
}
|
|
544
|
-
function extractEventIds(event, args) {
|
|
545
|
-
const data = asRecord(event.data);
|
|
241
|
+
function encodeJsonl(records) {
	// Serialize each record on its own line; empty input yields an empty string
	// rather than a lone newline.
	const lines = records.map((record) => JSON.stringify(record));
	return lines.length > 0 ? `${lines.join("\n")}\n` : "";
}
|
|
245
|
+
function normalizeCaseEventName(eventName) {
	// Maps both the dotted event names and the CLI `CaseStarted`/`CaseEnded`
	// names onto one of four phases; anything else yields undefined.
	switch (eventName) {
		case "task.case.start":
		case "CaseStarted":
			return "start";
		case "task.case.metric":
			return "metric";
		case "task.case.score":
			return "score";
		case "task.case.end":
		case "CaseEnded":
			return "end";
		default:
			return void 0;
	}
}
|
|
251
|
+
function extractEventIds(event, args) {
|
|
252
|
+
const data = asRecord(event.data);
|
|
546
253
|
return {
|
|
547
254
|
attemptId: stringFrom(data?.attemptId) ?? event.attemptId ?? args.attemptId,
|
|
548
255
|
caseId: stringFrom(data?.caseId) ?? event.caseId ?? "",
|
|
@@ -713,15 +420,560 @@ function isCaseMetricValue(value) {
|
|
|
713
420
|
if (value == null || typeof value === "boolean" || typeof value === "number" || typeof value === "string") return true;
|
|
714
421
|
return Array.isArray(value);
|
|
715
422
|
}
|
|
716
|
-
function asRecord(value) {
|
|
717
|
-
if (value == null || typeof value !== "object" || Array.isArray(value)) return;
|
|
718
|
-
return value;
|
|
423
|
+
function asRecord(value) {
	// Pass through non-null, non-array objects; everything else becomes undefined.
	const isObjectLike = typeof value === "object" && value !== null && !Array.isArray(value);
	return isObjectLike ? value : void 0;
}
|
|
427
|
+
function stringFrom(value) {
	// Strings (including the empty string) pass through; other types yield undefined.
	if (typeof value !== "string") return void 0;
	return value;
}
|
|
430
|
+
function numberFrom(value) {
	// Number.isFinite does not coerce, so it already rejects non-numbers as
	// well as NaN and +/-Infinity.
	if (Number.isFinite(value)) return value;
	return void 0;
}
|
|
433
|
+
//#endregion
|
|
434
|
+
//#region src/cli/report-selectors.ts
|
|
435
|
+
/**
* Resolves a generic case selector from metrics, scores, then direct fields.
*
* Use when:
* - report commands accept benchmark-neutral selectors such as `benchmark.case.id`
* - comparisons need the same lookup semantics as filtering and grouping
*
* Expects:
* - `key` is a direct `CaseRecord` field, score key, `scores.<key>`, or metric key
* - `record.metrics` / `record.scores` may be absent; a missing container is treated as empty
*
* Returns:
* - `{ exists: true, value }` for the first match in lookup order, otherwise `{ exists: false }`
*/
function getCaseSelectorValue(record, key) {
	// Records are read back from JSONL without schema validation, so a missing
	// `metrics` or `scores` object must not make Object.hasOwn throw.
	const metrics = record.metrics ?? {};
	const scores = record.scores ?? {};
	// Lookup order: metric name, explicit `scores.<key>`, bare score name, direct field.
	if (Object.hasOwn(metrics, key)) return {
		exists: true,
		value: metrics[key]
	};
	if (key.startsWith("scores.")) {
		const scoreKey = key.slice(7);
		if (Object.hasOwn(scores, scoreKey)) return {
			exists: true,
			value: scores[scoreKey]
		};
	}
	if (Object.hasOwn(scores, key)) return {
		exists: true,
		value: scores[key]
	};
	if (Object.hasOwn(record, key)) return {
		exists: true,
		value: record[key]
	};
	return { exists: false };
}
|
|
467
|
+
/**
* Stable-stringifies JSON-like values for report comparisons.
*
* Before:
* - `{ b: 1, a: true }`
*
* After:
* - `{"a":true,"b":1}`
*
* Values that `JSON.stringify` cannot encode (`undefined`, functions, symbols)
* are handled the way `JSON.stringify` handles them: omitted from objects and
* written as `null` inside arrays, so nested output is always valid JSON.
*/
function stableStringify(value) {
	// Primitives and null are delegated as-is; this returns undefined for a
	// top-level undefined/function/symbol, matching JSON.stringify.
	if (value == null || typeof value !== "object") return JSON.stringify(value);
	if (Array.isArray(value)) return `[${value.map((item) => stableStringify(item) ?? "null").join(",")}]`;
	const record = value;
	const members = [];
	// Keys are sorted so that property insertion order does not affect the output.
	for (const key of Object.keys(record).sort((left, right) => left.localeCompare(right))) {
		const encoded = stableStringify(record[key]);
		// Skip members with no JSON representation instead of emitting `"key":undefined`.
		if (encoded !== void 0) members.push(`${JSON.stringify(key)}:${encoded}`);
	}
	return `{${members.join(",")}}`;
}
|
|
482
|
+
//#endregion
|
|
483
|
+
//#region src/cli/report-cases.ts
|
|
484
|
+
const reportCasesHelpText = `
|
|
485
|
+
Inspect normalized case records from generated vieval report artifacts.
|
|
486
|
+
|
|
487
|
+
Usage
|
|
488
|
+
$ vieval report cases <reportPath> [options]
|
|
489
|
+
|
|
490
|
+
Options
|
|
491
|
+
--format Output format: table | json | jsonl (default: table)
|
|
492
|
+
--where Equality filter "key=value"; repeatable
|
|
493
|
+
--group-by Case field, score name, or metric name used for grouped score summaries
|
|
494
|
+
`;
|
|
495
|
+
/**
* Reads normalized case records from one report run directory or report root.
*
* Use when:
* - CLI tools need case-level inspection from local report artifacts
* - callers may pass a run directory, a `cases.jsonl` file, or a report root
*
* Expects:
* - discovered `cases.jsonl` files contain one `CaseRecord` JSON object per line
* - blank lines are allowed and skipped
*
* Returns:
* - all parsed case records, in discovered file path order
*
* Throws:
* - when no `cases.jsonl` file is found
* - when a line is not valid JSON (the parse error is attached as `cause`)
* - when a line parses to something other than a JSON object
*/
async function readCaseRecordsFromReport(reportPath) {
	const caseFilePaths = await resolveCaseRecordPaths(reportPath);
	if (caseFilePaths.length === 0) throw new Error(`No cases.jsonl files found under "${resolve(reportPath)}".`);
	const records = [];
	for (const caseFilePath of caseFilePaths) {
		const lines = readFileSync(caseFilePath, "utf-8").split("\n");
		for (const [index, line] of lines.entries()) {
			// trim() also strips the `\r` left behind by CRLF line endings.
			const trimmed = line.trim();
			if (trimmed.length === 0) continue;
			let parsed;
			try {
				parsed = JSON.parse(trimmed);
			} catch (error) {
				throw new Error(`Invalid cases.jsonl line ${index + 1} in "${caseFilePath}": ${errorMessageFrom(error) ?? "Unknown JSON parse failure."}`, { cause: error });
			}
			// Valid JSON that is not an object (null, numbers, arrays, ...) cannot be a
			// case record; reject it here, where the offending line is known.
			if (parsed == null || typeof parsed !== "object" || Array.isArray(parsed)) throw new Error(`Invalid cases.jsonl line ${index + 1} in "${caseFilePath}": expected a JSON object.`);
			records.push(parsed);
		}
	}
	return records;
}
|
|
526
|
+
/**
* Filters case records and optionally summarizes them per group.
*
* Use when:
* - `vieval report cases` needs deterministic JSON/table output
* - tests need pure filtering and grouping behavior without process I/O
*
* Expects:
* - every `options.where` entry has the form `key=value`
* - `options.groupBy`, when set, is a selector key resolved per record
*
* Returns:
* - `records`: a copy of the records matching every filter
* - `groups`: grouped summaries of those records, or undefined without `groupBy`
*/
function buildReportCasesOutput(records, options) {
	const selectors = (options.where ?? []).map(parseSelector);
	const matching = records.filter((record) => matchesWhereFilters(record, selectors));
	let groups;
	if (options.groupBy != null) groups = buildCaseGroups(matching, options.groupBy);
	return {
		groups,
		records: [...matching]
	};
}
|
|
548
|
+
/**
* Entry point for the `vieval report cases` command.
*
* Call stack:
*
* published executable (`../bin/vieval`)
*   -> {@link import('./index').runTopLevelCli}
*     -> {@link runReportCasesCli}
*       -> {@link readCaseRecordsFromReport}
*
* Use when:
* - the top-level CLI dispatches local case artifact inspection
*
* Expects:
* - argv is either `cases <reportPath> ...` or `<reportPath> ...`
*
* Returns:
* - resolves after writing the requested output to stdout; on any failure the
*   message goes to stderr and `process.exitCode` is set to 1
*/
async function runReportCasesCli(argv) {
	try {
		const options = parseReportCasesCliArguments(argv);
		const records = await readCaseRecordsFromReport(options.reportPath);
		const output = buildReportCasesOutput(records, options);
		switch (options.format) {
			case "json":
				process.stdout.write(`${JSON.stringify(output, null, 2)}\n`);
				break;
			case "jsonl":
				// JSONL carries the records only; group summaries are not emitted.
				process.stdout.write(encodeJsonl(output.records));
				break;
			default:
				process.stdout.write(`${formatCasesTable(output)}\n`);
		}
	} catch (error) {
		const errorMessage = errorMessageFrom(error) ?? "Unknown report cases failure.";
		process.stderr.write(`[vieval report cases] ${errorMessage}\n`);
		process.exitCode = 1;
	}
}
|
|
586
|
+
function normalizeCliArgv$6(argv) {
	// Drop a leading `--`, then strip the `report cases` / `cases` command prefix
	// so only the arguments for this sub-command remain. Always returns a new array.
	const args = argv[0] === "--" ? argv.slice(1) : [...argv];
	const first = args[0];
	if (first === "report" && args[1] === "cases") return args.slice(2);
	return first === "cases" ? args.slice(1) : args;
}
|
|
592
|
+
function parseReportCasesCliArguments(argv) {
	// Flag schema: --format (string, default "table"), --group-by (string),
	// --where (string, repeatable).
	const flagSchema = {
		format: {
			default: "table",
			type: "string"
		},
		groupBy: { type: "string" },
		where: {
			isMultiple: true,
			type: "string"
		}
	};
	const { flags, input } = meow(reportCasesHelpText, {
		argv: normalizeCliArgv$6(argv),
		flags: flagSchema,
		importMeta: import.meta
	});
	// The first positional argument is the report path and is mandatory.
	const reportPath = input[0];
	if (reportPath == null || reportPath.length === 0) throw new Error("Missing required <reportPath> argument.");
	return {
		format: normalizeReportCasesFormat(flags.format),
		groupBy: flags.groupBy,
		reportPath,
		where: flags.where
	};
}
|
|
617
|
+
/**
 * Maps a user-supplied format flag onto a supported output format.
 *
 * Matching is case-insensitive; anything other than `json` or `jsonl`
 * falls back to `table`.
 */
function normalizeReportCasesFormat(value) {
	switch (value.toLowerCase()) {
		case "json": return "json";
		case "jsonl": return "jsonl";
		default: return "table";
	}
}
|
|
623
|
+
/**
 * Resolves which `cases.jsonl` files a report path refers to.
 *
 * Resolution order:
 * - an existing path ending in `.jsonl` is used as-is
 * - otherwise a `cases.jsonl` directly inside the path is used
 * - otherwise every nested `cases.jsonl` below the path is returned, sorted
 */
async function resolveCaseRecordPaths(reportPath) {
	const reportRoot = resolve(reportPath);
	if (existsSync(reportRoot) && reportRoot.endsWith(".jsonl")) return [reportRoot];
	const nestedCaseFile = resolve(reportRoot, "cases.jsonl");
	if (existsSync(nestedCaseFile)) return [nestedCaseFile];
	const discovered = await glob("**/cases.jsonl", {
		absolute: true,
		cwd: reportRoot
	});
	return discovered.sort((left, right) => left.localeCompare(right));
}
|
|
633
|
+
/**
 * Checks whether a case record satisfies every equality filter.
 *
 * A filter matches when the selector resolves on the record and the
 * stringified resolved value equals the filter value.
 */
function matchesWhereFilters(record, whereFilters) {
	for (const filter of whereFilters) {
		const lookup = getCaseSelectorValue(record, filter.key);
		if (!lookup.exists) return false;
		if (String(lookup.value) !== filter.value) return false;
	}
	return true;
}
|
|
639
|
+
/**
 * Parses one `key=value` selector.
 *
 * The selector is split at the first `=`, so the value may itself contain
 * `=`. Both sides are trimmed before validation, which means a key or value
 * made only of whitespace is rejected just like an empty one.
 *
 * Returns:
 * - `{ key, value }` with surrounding whitespace removed
 *
 * Throws when there is no `=`, or when the key or value is empty after trimming.
 */
function parseSelector(selector) {
	const separatorIndex = selector.indexOf("=");
	const key = separatorIndex < 0 ? "" : selector.slice(0, separatorIndex).trim();
	const value = separatorIndex < 0 ? "" : selector.slice(separatorIndex + 1).trim();
	if (key.length === 0 || value.length === 0) throw new Error(`Invalid selector "${selector}". Expected "key=value".`);
	return {
		key,
		value
	};
}
|
|
647
|
+
/**
 * Groups case records by one selector and summarizes their scores.
 *
 * Records where the selector does not resolve are left out. Group keys have
 * the form `<groupBy>=<value>` and are emitted in locale-sorted order.
 */
function buildCaseGroups(records, groupBy) {
	const buckets = /* @__PURE__ */ new Map();
	for (const record of records) {
		const lookup = getCaseSelectorValue(record, groupBy);
		if (!lookup.exists) continue;
		const label = `${groupBy}=${String(lookup.value)}`;
		let bucket = buckets.get(label);
		if (bucket == null) {
			bucket = {
				count: 0,
				scores: {}
			};
			buckets.set(label, bucket);
		}
		bucket.count += 1;
		addScores(bucket.scores, record.scores);
	}
	const ordered = [...buckets.entries()].sort(([left], [right]) => left.localeCompare(right));
	return Object.fromEntries(ordered.map(([label, bucket]) => [label, {
		count: bucket.count,
		scores: finalizeScores(bucket.scores)
	}]));
}
|
|
665
|
+
/**
 * Accumulates one record's scores into a running per-score summary.
 *
 * Each score name gets a `{ average, count, sum }` bucket on `summary`;
 * `average` stays 0 here and is computed later from `sum` and `count`.
 *
 * Buckets are looked up and created as OWN properties only. A plain
 * `summary[name] ??= ...` would find inherited members such as `toString`
 * or `constructor`, skip bucket creation, and write `count`/`sum` onto the
 * shared prototype member instead.
 */
function addScores(summary, scores) {
	for (const [scoreName, value] of Object.entries(scores)) {
		if (!Object.hasOwn(summary, scoreName) || summary[scoreName] == null) {
			// defineProperty (not assignment) so that even "__proto__" becomes an own data property.
			Object.defineProperty(summary, scoreName, {
				configurable: true,
				enumerable: true,
				value: {
					average: 0,
					count: 0,
					sum: 0
				},
				writable: true
			});
		}
		const bucket = summary[scoreName];
		bucket.count += 1;
		bucket.sum += value;
	}
}
|
|
676
|
+
/**
 * Produces the final score summary with averages filled in.
 *
 * Score names are emitted in locale-sorted order; a bucket with no samples
 * reports an average of 0.
 */
function finalizeScores(summary) {
	const orderedEntries = Object.entries(summary).sort(([left], [right]) => left.localeCompare(right));
	const finalized = [];
	for (const [scoreName, bucket] of orderedEntries) {
		const average = bucket.count === 0 ? 0 : bucket.sum / bucket.count;
		finalized.push([scoreName, {
			average,
			count: bucket.count,
			sum: bucket.sum
		}]);
	}
	return Object.fromEntries(finalized);
}
|
|
683
|
+
/**
 * Renders the plain-text view of a `report cases` result.
 *
 * Always prints the header and case count; when groups are present, adds a
 * `Groups` section with one line per group showing its count and each score
 * average to three decimals.
 */
function formatCasesTable(output) {
	const header = ["CASES vieval report", `Case count ${output.records.length}`];
	if (output.groups == null) return header.join("\n");
	const groupLines = Object.entries(output.groups).map(([groupKey, group]) => {
		const scoreParts = [];
		for (const [scoreName, bucket] of Object.entries(group.scores)) scoreParts.push(`${scoreName}=${bucket.average.toFixed(3)}`);
		const scoreText = scoreParts.join(" ");
		const suffix = scoreText.length > 0 ? ` ${scoreText}` : "";
		return `${groupKey} count=${group.count}${suffix}`;
	});
	return [
		...header,
		"Groups",
		...groupLines
	].join("\n");
}
|
|
694
|
+
//#endregion
|
|
695
|
+
//#region src/cli/report-compare.ts
|
|
696
|
+
/**
 * Builds a compact compare report sorted by hybrid/exact score.
 *
 * Each method becomes one row holding per-project details plus totals:
 * case counts come from the method's case records, averages are weighted by
 * project run count, and run/task counts are summed across projects.
 *
 * Rows are ordered by descending hybrid average, then descending exact
 * average; a missing average ranks below any number.
 */
function buildCompareReportArtifact(args) {
	const orLowest = (score) => score ?? Number.NEGATIVE_INFINITY;
	const methods = [];
	for (const method of args.methods) {
		const caseRecords = method.caseRecords ?? [];
		const projects = [];
		for (const project of method.output.projects) projects.push({
			caseCount: countCasesForProject(caseRecords, project.name),
			distinctCaseCount: countDistinctCasesForProject(caseRecords, project.name),
			exactAverage: project.result?.overall.exactAverage ?? null,
			executed: project.executed,
			hybridAverage: project.result?.overall.hybridAverage ?? null,
			name: project.name,
			runCount: project.result?.overall.runCount ?? 0,
			taskCount: project.taskCount
		});
		let executedProjectCount = 0;
		let runCount = 0;
		let taskCount = 0;
		for (const project of projects) {
			if (project.executed) executedProjectCount += 1;
			runCount += project.runCount;
			taskCount += project.taskCount;
		}
		methods.push({
			caseCount: caseRecords.length,
			distinctCaseCount: countDistinctCases(caseRecords),
			exactAverage: createWeightedAverage(projects, (project) => project.exactAverage),
			executedProjectCount,
			hybridAverage: createWeightedAverage(projects, (project) => project.hybridAverage),
			methodId: method.methodId,
			projectCount: projects.length,
			projects,
			runCount,
			taskCount
		});
	}
	methods.sort((left, right) => {
		const leftHybrid = orLowest(left.hybridAverage);
		const rightHybrid = orLowest(right.hybridAverage);
		// Equal hybrid scores (including both missing) fall through to the exact score.
		if (leftHybrid !== rightHybrid) return rightHybrid - leftHybrid;
		return orLowest(right.exactAverage) - orLowest(left.exactAverage);
	});
	return {
		benchmarkId: args.benchmarkId,
		methods,
		reportPath: args.reportPath
	};
}
|
|
738
|
+
/**
 * Counts the case records that belong to the named project.
 */
function countCasesForProject(caseRecords, projectName) {
	let matches = 0;
	for (const record of caseRecords) if (record.projectName === projectName) matches += 1;
	return matches;
}
|
|
741
|
+
/**
 * Counts the distinct cases recorded for the named project.
 */
function countDistinctCasesForProject(caseRecords, projectName) {
	const projectRecords = [];
	for (const record of caseRecords) if (record.projectName === projectName) projectRecords.push(record);
	return countDistinctCases(projectRecords);
}
|
|
744
|
+
/**
 * Counts distinct cases, identified by project name, task id, and case id.
 *
 * The identity key joins the three parts with a NUL character rather than
 * `:`. Ids may themselves contain `:`, and with a `:` separator two
 * different cases (for example project `a:b` / task `c` and project `a` /
 * task `b:c`) would collapse into one key and be under-counted.
 */
function countDistinctCases(caseRecords) {
	const caseKeys = /* @__PURE__ */ new Set();
	for (const record of caseRecords) caseKeys.add(`${record.projectName}\0${record.taskId}\0${record.caseId}`);
	return caseKeys.size;
}
|
|
749
|
+
/**
 * Computes a run-count-weighted average across projects.
 *
 * Projects with a missing average or a non-positive run count contribute
 * nothing. Returns `null` when no project contributes any weight.
 */
function createWeightedAverage(projects, selectAverage) {
	const totals = projects.reduce((running, project) => {
		const average = selectAverage(project);
		if (average == null || project.runCount <= 0) return running;
		running.weighted += average * project.runCount;
		running.weight += project.runCount;
		return running;
	}, {
		weight: 0,
		weighted: 0
	});
	return totals.weight === 0 ? null : totals.weighted / totals.weight;
}
|
|
761
|
+
/**
 * Writes compare report artifact as JSON.
 *
 * Creates the parent directory when needed, writes the artifact as
 * two-space-indented JSON followed by a newline, and returns the resolved
 * output path.
 */
async function writeCompareReportArtifact(args) {
	const destination = resolve(args.outputPath);
	const parentDirectory = dirname(destination);
	await mkdir(parentDirectory, { recursive: true });
	const serialized = `${JSON.stringify(args.artifact, null, 2)}\n`;
	await writeFile(destination, serialized, "utf-8");
	return destination;
}
|
|
770
|
+
//#endregion
|
|
771
|
+
//#region src/cli/discovery.ts
|
|
772
|
+
/**
 * Discovers eval files using include/exclude globs relative to project root.
 *
 * Expects:
 * - `options.include` and `options.exclude` are iterable glob pattern lists
 * - `options.root` is the directory the patterns are resolved against
 *
 * Returns:
 * - de-duplicated absolute file paths in locale-sorted order
 */
async function discoverEvalFiles(options) {
	const matches = await glob([...options.include], {
		absolute: true,
		cwd: options.root,
		ignore: [...options.exclude],
		onlyFiles: true
	});
	const distinctMatches = uniq(matches);
	return distinctMatches.sort((left, right) => left.localeCompare(right));
}
|
|
789
|
+
//#endregion
|
|
790
|
+
//#region src/cli/module-runtime.ts
|
|
791
|
+
/**
 * Loads eval modules and returns a normalized eval-module map.
 *
 * Use when:
 * - CLI collection needs Vite/Vitest-powered module resolution and transforms
 * - eval files should be imported with the same runtime semantics as Vitest
 *
 * Expects:
 * - `projectRoot` points at the project that owns the eval files
 * - each `evalFilePaths` entry is an absolute file path
 *
 * Returns:
 * - eval modules keyed by stable file href + optional registration suffixes
 *
 * Definitions registered during import come first, followed by the module's
 * default export when present. Definitions sharing the same name,
 * description, and task id are collapsed to their first occurrence. The
 * first surviving definition is keyed by the bare href; later ones get a
 * `#registration-N` suffix.
 */
async function loadEvalModulesWithVitestRuntime(evalFilePaths, projectRoot) {
	const identityOf = (definition) => `${definition.name}::${definition.description}::${definition.task?.id ?? ""}`;
	const loadedModules = {};
	const runtime = await createVitest("test", {
		config: false,
		root: projectRoot,
		run: false,
		silent: true,
		watch: false
	});
	try {
		for (const evalFilePath of evalFilePaths) {
			const moduleHref = pathToFileURL(evalFilePath).href;
			beginModuleRegistration(moduleHref);
			try {
				const moduleValue = await runtime.import(moduleHref);
				const definitions = [...consumeModuleRegistrations(moduleHref)];
				const defaultDefinition = moduleValue.default;
				if (defaultDefinition != null) definitions.push(defaultDefinition);
				const seenIdentities = /* @__PURE__ */ new Set();
				let keptCount = 0;
				for (const definition of definitions) {
					const identity = identityOf(definition);
					if (seenIdentities.has(identity)) continue;
					seenIdentities.add(identity);
					const moduleKey = keptCount === 0 ? moduleHref : `${moduleHref}#registration-${keptCount + 1}`;
					loadedModules[moduleKey] = { default: definition };
					keptCount += 1;
				}
			} finally {
				// Always close the registration scope, even when the import throws.
				endModuleRegistration();
			}
		}
	} finally {
		await runtime.close();
	}
	return loadedModules;
}
|
|
841
|
+
//#endregion
|
|
842
|
+
//#region src/cli/report-otlp.ts
|
|
843
|
+
/**
 * Builds local OTLP-shaped JSON projections from normalized case records.
 *
 * Use when:
 * - writing deterministic report artifacts without requiring an OpenTelemetry Collector
 * - future tools need trace/log/metric-shaped JSON files
 *
 * Expects:
 * - records belong to one Vieval run
 *
 * Returns:
 * - trace, log, and metric containers shaped after OTLP JSON concepts
 *
 * Traces hold one run span, one span per project, one per task, and one per
 * case. Logs hold one record per case. Metrics hold one gauge per score
 * kind, with a data point for every record whose score of that kind is a
 * number.
 */
function buildLocalOtlpProjection(args) {
	const { records, runId } = args;
	const projectSpans = collectProjectNames(records).map((projectName) => ({
		attributes: toAttributes({
			"vieval.project.name": projectName,
			"vieval.run.id": runId
		}),
		name: "vieval.project"
	}));
	const taskSpans = collectTasks(records).map((task) => ({
		attributes: toAttributes({
			"vieval.project.name": task.projectName,
			"vieval.run.id": runId,
			"vieval.task.id": task.taskId
		}),
		name: "vieval.task"
	}));
	const caseSpans = records.map((record) => {
		// Fixed vieval.* keys are spread last so they win over same-named metrics.
		const attributes = toAttributes({
			...record.metrics,
			"vieval.case.duration_ms": record.durationMs,
			"vieval.case.id": record.caseId,
			"vieval.case.name": record.caseName,
			"vieval.case.retry_count": record.retryCount,
			"vieval.case.state": record.state,
			"vieval.project.name": record.projectName,
			"vieval.task.id": record.taskId
		});
		return {
			attributes,
			endTimeUnixNano: isoToUnixNano(record.endedAt),
			name: "vieval.case",
			startTimeUnixNano: isoToUnixNano(record.startedAt)
		};
	});
	const logRecords = records.map((record) => {
		const payload = JSON.stringify({
			caseId: record.caseId,
			scores: record.scores,
			state: record.state
		});
		return {
			attributes: toAttributes(record.metrics),
			body: { stringValue: payload },
			eventName: "vieval.case",
			timeUnixNano: isoToUnixNano(record.endedAt)
		};
	});
	const scoreMetrics = collectScoreKinds(records).map((kind) => {
		const dataPoints = [];
		for (const record of records) {
			if (typeof record.scores[kind] !== "number") continue;
			dataPoints.push({
				asDouble: record.scores[kind],
				attributes: toAttributes({
					...record.metrics,
					"vieval.case.id": record.caseId,
					"vieval.task.id": record.taskId
				}),
				timeUnixNano: isoToUnixNano(record.endedAt)
			});
		}
		return {
			gauge: { dataPoints },
			name: `vieval.score.${kind}`
		};
	});
	const runSpan = {
		attributes: toAttributes({ "vieval.run.id": runId }),
		name: "vieval.run"
	};
	return {
		logs: { resourceLogs: [{ scopeLogs: [{
			logRecords,
			scope: { name: "vieval" }
		}] }] },
		metrics: { resourceMetrics: [{ scopeMetrics: [{
			metrics: scoreMetrics,
			scope: { name: "vieval" }
		}] }] },
		traces: { resourceSpans: [{ scopeSpans: [{
			scope: { name: "vieval" },
			spans: [
				runSpan,
				...projectSpans,
				...taskSpans,
				...caseSpans
			]
		}] }] }
	};
}
|
|
930
|
+
/**
 * Converts a plain attribute object into a key-sorted attribute list.
 *
 * Entries whose value is `undefined` are dropped; every remaining value is
 * wrapped through `toAnyValue`.
 */
function toAttributes(attributes) {
	const definedEntries = [];
	for (const entry of Object.entries(attributes)) if (entry[1] !== void 0) definedEntries.push(entry);
	definedEntries.sort(([leftKey], [rightKey]) => leftKey.localeCompare(rightKey));
	return definedEntries.map(([key, value]) => ({
		key,
		value: toAnyValue(value)
	}));
}
|
|
936
|
+
/**
 * Wraps a JavaScript value in a typed value container.
 *
 * - arrays become `arrayValue`, with each item wrapped recursively
 * - booleans become `boolValue`
 * - finite numbers become `doubleValue`; non-finite numbers are stringified
 * - `null`/`undefined` become the string "null"
 * - strings become `stringValue`
 * - anything else is serialized with `stableStringify`
 */
function toAnyValue(value) {
	if (Array.isArray(value)) {
		const values = value.map((item) => toAnyValue(item));
		return { arrayValue: { values } };
	}
	if (!isAttributeScalar(value)) return { stringValue: stableStringify(value) };
	switch (typeof value) {
		case "boolean": return { boolValue: value };
		case "number": return Number.isFinite(value) ? { doubleValue: value } : { stringValue: String(value) };
		case "string": return { stringValue: value };
		// The only scalars left are null and undefined.
		default: return { stringValue: "null" };
	}
}
|
|
946
|
+
/**
 * Reports whether a value is nullish, a boolean, a number, or a string.
 */
function isAttributeScalar(value) {
	if (value == null) return true;
	const kind = typeof value;
	return kind === "boolean" || kind === "number" || kind === "string";
}
|
|
949
|
+
/**
 * Converts an ISO timestamp into a Unix-epoch nanosecond decimal string.
 *
 * Timestamps in the strict `YYYY-MM-DDTHH:MM:SS[.fraction](Z|±HH:MM)` form
 * keep up to nine fractional digits: the whole-second part goes through
 * `Date.parse` and the fraction is added separately as nanoseconds. Any
 * other input goes through `Date.parse` directly, at millisecond precision.
 *
 * Returns "0" when the timestamp cannot be parsed.
 */
function isoToUnixNano(value) {
	const toNanoString = (unixMilliseconds, extraNanos) => {
		if (!Number.isFinite(unixMilliseconds)) return "0";
		return String(BigInt(unixMilliseconds) * 1000000n + extraNanos);
	};
	const preciseMatch = /^(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2})(?:\.(\d{1,9}))?(Z|[+-]\d{2}:\d{2})$/.exec(value);
	if (preciseMatch == null) return toNanoString(Date.parse(value), 0n);
	const [, secondsPart, fraction = "", zone] = preciseMatch;
	// Right-pad the fraction to nine digits so it reads directly as nanoseconds.
	const fractionNanos = BigInt(fraction.padEnd(9, "0").slice(0, 9));
	return toNanoString(Date.parse(`${secondsPart}.000${zone}`), fractionNanos);
}
|
|
961
|
+
/**
 * Collects every distinct score name across the records, locale-sorted.
 */
function collectScoreKinds(records) {
	const kinds = /* @__PURE__ */ new Set();
	for (const record of records) for (const kind of Object.keys(record.scores)) kinds.add(kind);
	return Array.from(kinds).sort((left, right) => left.localeCompare(right));
}
|
|
720
|
-
function
|
|
721
|
-
return
|
|
964
|
+
/**
 * Collects every distinct project name across the records, locale-sorted.
 */
function collectProjectNames(records) {
	const names = /* @__PURE__ */ new Set();
	for (const record of records) names.add(record.projectName);
	return Array.from(names).sort((left, right) => left.localeCompare(right));
}
|
|
723
|
-
function
|
|
724
|
-
|
|
967
|
+
/**
 * Collects the distinct (project, task) pairs found in the records.
 *
 * Pairs are de-duplicated on a NUL-joined key and returned sorted by
 * project name, then by task id.
 */
function collectTasks(records) {
	const tasksByKey = /* @__PURE__ */ new Map();
	for (const { projectName, taskId } of records) {
		tasksByKey.set(`${projectName}\0${taskId}`, {
			projectName,
			taskId
		});
	}
	return Array.from(tasksByKey.values()).sort((left, right) => {
		return left.projectName.localeCompare(right.projectName) || left.taskId.localeCompare(right.taskId);
	});
}
|
|
726
978
|
//#endregion
|
|
727
979
|
//#region src/cli/report-artifacts.ts
|
|
@@ -1893,9 +2145,30 @@ function sanitizeIdentitySegment(value) {
|
|
|
1893
2145
|
if (normalized.length === 0) return "default";
|
|
1894
2146
|
return normalized.replace(/[^\w.-]+/g, "-");
|
|
1895
2147
|
}
|
|
1896
|
-
function
|
|
2148
|
+
/**
 * Lists the distinct non-default matrix rows used by the given tasks.
 *
 * Each task contributes `run:<id>`, `eval:<id>`, or `run:<id>+eval:<id>`
 * depending on which of its row ids differ from "default"; a task with both
 * ids at "default" contributes nothing. The result is locale-sorted.
 */
function createExperimentMatrixRows(tasks) {
	const rows = /* @__PURE__ */ new Set();
	for (const task of tasks) {
		const { runRowId, evalRowId } = task.matrix.meta;
		const parts = [];
		if (runRowId !== "default") parts.push(`run:${runRowId}`);
		if (evalRowId !== "default") parts.push(`eval:${evalRowId}`);
		if (parts.length > 0) rows.add(parts.join("+"));
	}
	return Array.from(rows).sort((left, right) => left.localeCompare(right));
}
|
|
2162
|
+
/**
 * Picks the experiment id for a run.
 *
 * An explicit `options.experiment` wins. Otherwise the id is derived from
 * the distinct matrix rows of all prepared projects, or is
 * "default-experiment" when there are none.
 */
function resolveExperimentId(options, preparedProjects) {
	if (options.experiment != null) return sanitizeIdentitySegment(options.experiment);
	const distinctRows = /* @__PURE__ */ new Set();
	for (const project of preparedProjects) {
		for (const row of project.experimentMatrixRows) distinctRows.add(row);
	}
	if (distinctRows.size === 0) return "default-experiment";
	const joinedRows = Array.from(distinctRows).sort().join("--");
	return sanitizeIdentitySegment(`matrix-${joinedRows}`);
}
|
|
2169
|
+
function createRunIdentity(options, preparedProjects) {
|
|
1897
2170
|
const workspaceId = sanitizeIdentitySegment(options.workspace ?? "default-workspace");
|
|
1898
|
-
const experimentId =
|
|
2171
|
+
const experimentId = resolveExperimentId(options, preparedProjects);
|
|
1899
2172
|
return {
|
|
1900
2173
|
attemptId: sanitizeIdentitySegment(options.attempt ?? `attempt-${(/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-")}`),
|
|
1901
2174
|
experimentId,
|
|
@@ -2251,6 +2524,7 @@ async function prepareProject(project) {
|
|
|
2251
2524
|
});
|
|
2252
2525
|
const canAutoExecuteEntryTasks = entries.some((entry) => entry.task != null) && project.models.length > 0;
|
|
2253
2526
|
if (project.executor == null && !canAutoExecuteEntryTasks) return {
|
|
2527
|
+
experimentMatrixRows: createExperimentMatrixRows(tasks),
|
|
2254
2528
|
kind: "summary",
|
|
2255
2529
|
summary: {
|
|
2256
2530
|
caseSummary: null,
|
|
@@ -2267,6 +2541,7 @@ async function prepareProject(project) {
|
|
|
2267
2541
|
}
|
|
2268
2542
|
};
|
|
2269
2543
|
return {
|
|
2544
|
+
experimentMatrixRows: createExperimentMatrixRows(tasks),
|
|
2270
2545
|
kind: "prepared",
|
|
2271
2546
|
prepared: {
|
|
2272
2547
|
discoveredEvalFileCount: evalFilePaths.length,
|
|
@@ -2279,6 +2554,7 @@ async function prepareProject(project) {
|
|
|
2279
2554
|
};
|
|
2280
2555
|
} catch (error) {
|
|
2281
2556
|
return {
|
|
2557
|
+
experimentMatrixRows: [],
|
|
2282
2558
|
kind: "summary",
|
|
2283
2559
|
summary: {
|
|
2284
2560
|
caseSummary: null,
|
|
@@ -2439,7 +2715,6 @@ async function executePreparedProject(prepared, identity, cacheProjectName, tele
|
|
|
2439
2715
|
* - keeping business-agent eval files near their implementation packages
|
|
2440
2716
|
*/
|
|
2441
2717
|
async function runVievalCli(options = {}) {
|
|
2442
|
-
const identity = createRunIdentity(options);
|
|
2443
2718
|
const loadedConfig = await loadVievalCliConfig({
|
|
2444
2719
|
configFilePath: options.configFilePath,
|
|
2445
2720
|
cwd: options.cwd
|
|
@@ -2447,21 +2722,24 @@ async function runVievalCli(options = {}) {
|
|
|
2447
2722
|
const telemetry = loadedConfig.reporting?.openTelemetry?.enabled === true ? createOpenTelemetryRuntime() : createNoopTelemetryRuntime();
|
|
2448
2723
|
const onOpenTelemetryRunEnd = loadedConfig.reporting?.openTelemetry?.enabled === true ? loadedConfig.reporting.openTelemetry.onRunEnd : void 0;
|
|
2449
2724
|
const restoreEnvironment = applyRunEnvironment(loadedConfig.env);
|
|
2450
|
-
const eventRecorder = createEventRecorder(identity);
|
|
2451
|
-
const reporter = createReporterWithEventCapture(createRunReporter(options.reporter), eventRecorder.record);
|
|
2452
2725
|
let runError;
|
|
2453
2726
|
let runEndError;
|
|
2454
2727
|
let output;
|
|
2728
|
+
let reporter;
|
|
2455
2729
|
try {
|
|
2730
|
+
const selectedProjects = filterProjectsByName(loadedConfig.projects, options.project ?? []);
|
|
2731
|
+
const preparedProjects = await Promise.all(selectedProjects.map(async (project) => prepareProject(project)));
|
|
2732
|
+
const identity = createRunIdentity(options, preparedProjects);
|
|
2733
|
+
const eventRecorder = createEventRecorder(identity);
|
|
2734
|
+
const runReporter = createReporterWithEventCapture(createRunReporter(options.reporter), eventRecorder.record);
|
|
2735
|
+
reporter = runReporter;
|
|
2456
2736
|
output = await telemetry.withSpan("vieval.run", {
|
|
2457
2737
|
"vieval.attempt.id": identity.attemptId,
|
|
2458
2738
|
"vieval.experiment.id": identity.experimentId,
|
|
2459
2739
|
"vieval.run.id": identity.runId,
|
|
2460
2740
|
"vieval.workspace.id": identity.workspaceId
|
|
2461
2741
|
}, async () => {
|
|
2462
|
-
const selectedProjects = filterProjectsByName(loadedConfig.projects, options.project ?? []);
|
|
2463
2742
|
const workspaceScheduler = createSchedulerRuntime({ concurrency: { workspace: resolveWorkspaceConcurrency(loadedConfig, options) } });
|
|
2464
|
-
const preparedProjects = await Promise.all(selectedProjects.map(async (project) => prepareProject(project)));
|
|
2465
2743
|
const executableProjects = preparedProjects.filter((project) => project.kind === "prepared").map((project) => project.prepared);
|
|
2466
2744
|
const totalTasks = preparedProjects.reduce((sum, project) => {
|
|
2467
2745
|
if (project.kind === "prepared") return sum + project.prepared.tasks.length;
|
|
@@ -2476,8 +2754,8 @@ async function runVievalCli(options = {}) {
|
|
|
2476
2754
|
passedTasks: 0,
|
|
2477
2755
|
skippedTasks: 0
|
|
2478
2756
|
};
|
|
2479
|
-
|
|
2480
|
-
for (const project of executableProjects) for (const task of project.tasks)
|
|
2757
|
+
runReporter.onRunStart({ totalTasks });
|
|
2758
|
+
for (const project of executableProjects) for (const task of project.tasks) runReporter.onTaskQueued(createTaskQueuePayload(task, project.name));
|
|
2481
2759
|
const projectSummaries = (await Promise.all(preparedProjects.map(async (preparedProject, index) => {
|
|
2482
2760
|
if (preparedProject.kind === "summary") return {
|
|
2483
2761
|
index,
|
|
@@ -2493,10 +2771,10 @@ async function runVievalCli(options = {}) {
|
|
|
2493
2771
|
projectName: preparedProject.prepared.name,
|
|
2494
2772
|
scope: "workspace",
|
|
2495
2773
|
workspaceId: identity.workspaceId
|
|
2496
|
-
}, async () => executePreparedProject(preparedProject.prepared, identity, options.cacheProjectName, telemetry,
|
|
2774
|
+
}, async () => executePreparedProject(preparedProject.prepared, identity, options.cacheProjectName, telemetry, runReporter, reporterCounters, eventRecorder.record, options)))
|
|
2497
2775
|
};
|
|
2498
2776
|
}))).sort((left, right) => left.index - right.index).map((item) => item.summary);
|
|
2499
|
-
|
|
2777
|
+
runReporter.onRunEnd({
|
|
2500
2778
|
failedTasks: reporterCounters.failedTasks,
|
|
2501
2779
|
passedTasks: reporterCounters.passedTasks,
|
|
2502
2780
|
skippedTasks: reporterCounters.skippedTasks + skippedSummaryTasks,
|
|
@@ -2522,7 +2800,7 @@ async function runVievalCli(options = {}) {
|
|
|
2522
2800
|
} catch (error) {
|
|
2523
2801
|
if (runError == null) runEndError = error;
|
|
2524
2802
|
}
|
|
2525
|
-
reporter
|
|
2803
|
+
reporter?.dispose();
|
|
2526
2804
|
restoreEnvironment();
|
|
2527
2805
|
}
|
|
2528
2806
|
if (runError != null) throw runError;
|
|
@@ -2640,14 +2918,14 @@ const compareHelpText = `
|
|
|
2640
2918
|
--output Optional output artifact path
|
|
2641
2919
|
--format Console output format: table | json (default: table)
|
|
2642
2920
|
`;
|
|
2643
|
-
function normalizeCliArgv$
|
|
2921
|
+
/**
 * Strips an optional leading `--` and an optional `compare` token from argv.
 *
 * Returns a fresh array with the remaining arguments.
 */
function normalizeCliArgv$5(argv) {
	const args = argv[0] === "--" ? argv.slice(1) : [...argv];
	return args[0] === "compare" ? args.slice(1) : args;
}
|
|
2648
2926
|
function parseCompareCliArguments(argv) {
|
|
2649
2927
|
const cli = meow(compareHelpText, {
|
|
2650
|
-
argv: normalizeCliArgv$
|
|
2928
|
+
argv: normalizeCliArgv$5(argv),
|
|
2651
2929
|
flags: {
|
|
2652
2930
|
config: { type: "string" },
|
|
2653
2931
|
comparison: { type: "string" },
|
|
@@ -2677,18 +2955,22 @@ async function runCompareCli(argv) {
|
|
|
2677
2955
|
cwd: parsed.cwd
|
|
2678
2956
|
});
|
|
2679
2957
|
const methodResults = [];
|
|
2958
|
+
const reportRoot = await mkdtemp(join(tmpdir(), "vieval-compare-"));
|
|
2680
2959
|
for (const method of loaded.config.methods) {
|
|
2681
2960
|
const methodWorkspace = resolve(method.workspace);
|
|
2961
|
+
const methodReportOut = join(reportRoot, method.id);
|
|
2682
2962
|
const output = await runVievalCli({
|
|
2683
2963
|
cacheProjectName: loaded.config.benchmark.sharedCaseNamespace,
|
|
2684
2964
|
configFilePath: method.configFilePath ?? resolve(methodWorkspace, "vieval.config.ts"),
|
|
2685
2965
|
cwd: methodWorkspace,
|
|
2686
2966
|
project: [method.project],
|
|
2967
|
+
reportOut: methodReportOut,
|
|
2687
2968
|
workspace: loaded.config.benchmark.id
|
|
2688
2969
|
});
|
|
2689
2970
|
const failedProject = output.projects.find((project) => project.errorMessage != null);
|
|
2690
2971
|
if (failedProject != null) throw new Error(`Comparison method "${method.id}" failed: ${failedProject.errorMessage}`);
|
|
2691
2972
|
methodResults.push({
|
|
2973
|
+
caseRecords: await readCaseRecordsFromReport(methodReportOut),
|
|
2692
2974
|
methodId: method.id,
|
|
2693
2975
|
output
|
|
2694
2976
|
});
|
|
@@ -2752,7 +3034,7 @@ const evalRunHelpText = `
|
|
|
2752
3034
|
--report-out Report output root directory
|
|
2753
3035
|
--json Print machine-readable JSON output
|
|
2754
3036
|
`;
|
|
2755
|
-
function normalizeCliArgv$
|
|
3037
|
+
/**
 * Strips an optional leading `--` and an optional `run` token from argv.
 *
 * Returns a fresh array with the remaining arguments.
 */
function normalizeCliArgv$4(argv) {
	const args = argv[0] === "--" ? argv.slice(1) : [...argv];
	if (args[0] === "run") return args.slice(1);
	return args;
}
|
|
@@ -2775,7 +3057,7 @@ function normalizeProjectNames(projectNames) {
|
|
|
2775
3057
|
*/
|
|
2776
3058
|
function parseCliArguments(argv) {
|
|
2777
3059
|
const cli = meow(evalRunHelpText, {
|
|
2778
|
-
argv: normalizeCliArgv$
|
|
3060
|
+
argv: normalizeCliArgv$4(argv),
|
|
2779
3061
|
importMeta: import.meta,
|
|
2780
3062
|
flags: {
|
|
2781
3063
|
config: { type: "string" },
|
|
@@ -2892,7 +3174,7 @@ const reportAnalyzeHelpText = `
|
|
|
2892
3174
|
--run-matrix Keep runs matching run-matrix selector "key=value[,key=value]"
|
|
2893
3175
|
--eval-matrix Keep runs matching eval-matrix selector "key=value[,key=value]"
|
|
2894
3176
|
`;
|
|
2895
|
-
function normalizeCliArgv$
|
|
3177
|
+
function normalizeCliArgv$3(argv) {
|
|
2896
3178
|
const normalizedArgv = argv[0] === "--" ? argv.slice(1) : [...argv];
|
|
2897
3179
|
if (normalizedArgv[0] === "report" && normalizedArgv[1] === "analyze") return normalizedArgv.slice(2);
|
|
2898
3180
|
if (normalizedArgv[0] === "analyze") return normalizedArgv.slice(1);
|
|
@@ -2900,7 +3182,7 @@ function normalizeCliArgv$4(argv) {
|
|
|
2900
3182
|
}
|
|
2901
3183
|
function parseReportAnalyzeCliArguments(argv) {
|
|
2902
3184
|
const cli = meow(reportAnalyzeHelpText, {
|
|
2903
|
-
argv: normalizeCliArgv$
|
|
3185
|
+
argv: normalizeCliArgv$3(argv),
|
|
2904
3186
|
flags: {
|
|
2905
3187
|
attempt: { type: "string" },
|
|
2906
3188
|
caseState: { type: "string" },
|
|
@@ -3189,218 +3471,6 @@ async function runReportAnalyzeCli(argv) {
|
|
|
3189
3471
|
}
|
|
3190
3472
|
}
|
|
3191
3473
|
//#endregion
|
|
3192
|
-
//#region src/cli/report-cases.ts
|
|
3193
|
-
const reportCasesHelpText = `
|
|
3194
|
-
Inspect normalized case records from generated vieval report artifacts.
|
|
3195
|
-
|
|
3196
|
-
Usage
|
|
3197
|
-
$ vieval report cases <reportPath> [options]
|
|
3198
|
-
|
|
3199
|
-
Options
|
|
3200
|
-
--format Output format: table | json | jsonl (default: table)
|
|
3201
|
-
--where Equality filter "key=value"; repeatable
|
|
3202
|
-
--group-by Case field, score name, or metric name used for grouped score summaries
|
|
3203
|
-
`;
|
|
3204
|
-
/**
|
|
3205
|
-
* Reads normalized case records from one report run directory or report root.
|
|
3206
|
-
*
|
|
3207
|
-
* Use when:
|
|
3208
|
-
* - CLI tools need case-level inspection from local report artifacts
|
|
3209
|
-
* - callers may pass a run directory, a `cases.jsonl` file, or a report root
|
|
3210
|
-
*
|
|
3211
|
-
* Expects:
|
|
3212
|
-
* - discovered `cases.jsonl` files contain one `CaseRecord` JSON object per line
|
|
3213
|
-
*
|
|
3214
|
-
* Returns:
|
|
3215
|
-
* - all parsed case records sorted by discovered file path order
|
|
3216
|
-
*/
|
|
3217
|
-
async function readCaseRecordsFromReport(reportPath) {
|
|
3218
|
-
const caseFilePaths = await resolveCaseRecordPaths(reportPath);
|
|
3219
|
-
if (caseFilePaths.length === 0) throw new Error(`No cases.jsonl files found under "${resolve(reportPath)}".`);
|
|
3220
|
-
const records = [];
|
|
3221
|
-
for (const caseFilePath of caseFilePaths) {
|
|
3222
|
-
const lines = readFileSync(caseFilePath, "utf-8").split("\n");
|
|
3223
|
-
for (const [index, line] of lines.entries()) {
|
|
3224
|
-
const trimmed = line.trim();
|
|
3225
|
-
if (trimmed.length === 0) continue;
|
|
3226
|
-
try {
|
|
3227
|
-
records.push(JSON.parse(trimmed));
|
|
3228
|
-
} catch (error) {
|
|
3229
|
-
throw new Error(`Invalid cases.jsonl line ${index + 1} in "${caseFilePath}": ${errorMessageFrom(error) ?? "Unknown JSON parse failure."}`);
|
|
3230
|
-
}
|
|
3231
|
-
}
|
|
3232
|
-
}
|
|
3233
|
-
return records;
|
|
3234
|
-
}
|
|
3235
|
-
/**
 * Produces the data structure rendered by `vieval report cases`.
 *
 * Use when:
 * - the CLI needs deterministic JSON/table output
 * - tests want filtering and grouping behavior without process I/O
 *
 * Expects:
 * - every entry of `options.where` to be a `key=value` selector
 * - `options.groupBy`, when present, to be a lookup key for case records
 *
 * Returns:
 * - `records`: a copy of the records that satisfy every `where` filter
 * - `groups`: grouped score summaries, or `undefined` when `groupBy` is absent
 */
function buildReportCasesOutput(records, options) {
	const parsedFilters = (options.where ?? []).map(parseSelector);
	const matchingRecords = records.filter((candidate) => matchesWhereFilters(candidate, parsedFilters));
	// `groups` stays `undefined` unless a grouping key was requested.
	let groups;
	if (options.groupBy != null) groups = buildCaseGroups(matchingRecords, options.groupBy);
	return {
		groups,
		records: [...matchingRecords]
	};
}
|
|
3257
|
-
/**
 * Entry point for the `vieval report cases` command.
 *
 * Call stack:
 *
 * published executable (`../bin/vieval`)
 *   -> {@link import('./index').runTopLevelCli}
 *     -> {@link runReportCasesCli}
 *       -> {@link readCaseRecordsFromReport}
 *
 * Use when:
 * - the top-level CLI dispatches local case artifact inspection
 *
 * Expects:
 * - argv is either `cases <reportPath> ...` or `<reportPath> ...`
 *
 * Returns:
 * - resolves after writing the requested output to stdout; on failure the
 *   message is written to stderr and `process.exitCode` is set to 1
 */
async function runReportCasesCli(argv) {
	try {
		const parsed = parseReportCasesCliArguments(argv);
		const records = await readCaseRecordsFromReport(parsed.reportPath);
		const output = buildReportCasesOutput(records, parsed);
		let rendered;
		switch (parsed.format) {
			case "json":
				rendered = `${JSON.stringify(output, null, 2)}\n`;
				break;
			case "jsonl":
				// JSONL output carries the filtered records only.
				rendered = encodeJsonl(output.records);
				break;
			default:
				rendered = `${formatCasesTable(output)}\n`;
				break;
		}
		process.stdout.write(rendered);
	} catch (error) {
		const errorMessage = errorMessageFrom(error) ?? "Unknown report cases failure.";
		process.stderr.write(`[vieval report cases] ${errorMessage}\n`);
		process.exitCode = 1;
	}
}
|
|
3295
|
-
/**
 * Removes wrapper tokens that may precede the `report cases` arguments.
 *
 * Expects:
 * - `argv` is an array of CLI tokens
 *
 * Returns:
 * - a new array without a leading `--`, and without a leading
 *   `report cases` or `cases` command prefix
 */
function normalizeCliArgv$3(argv) {
	const args = argv[0] === "--" ? argv.slice(1) : [...argv];
	const [first, second] = args;
	if (first === "cases") return args.slice(1);
	if (first === "report" && second === "cases") return args.slice(2);
	return args;
}
|
|
3301
|
-
/**
 * Parses `vieval report cases` arguments with meow.
 *
 * Expects:
 * - `argv` holds the raw CLI tokens; command prefixes are stripped first
 *
 * Returns:
 * - `{ format, groupBy, reportPath, where }` where `format` is normalized
 *   and `reportPath` is the first positional argument
 *
 * Throws:
 * - `Error` when no report path was supplied
 */
function parseReportCasesCliArguments(argv) {
	const flagDefinitions = {
		format: {
			default: "table",
			type: "string"
		},
		groupBy: { type: "string" },
		where: {
			isMultiple: true,
			type: "string"
		}
	};
	const cli = meow(reportCasesHelpText, {
		argv: normalizeCliArgv$3(argv),
		flags: flagDefinitions,
		importMeta: import.meta
	});
	const [reportPath] = cli.input;
	const hasReportPath = reportPath != null && reportPath.length !== 0;
	if (!hasReportPath) throw new Error("Missing required <reportPath> argument.");
	const { format, groupBy, where } = cli.flags;
	return {
		format: normalizeReportCasesFormat(format),
		groupBy,
		reportPath,
		where
	};
}
|
|
3326
|
-
/**
 * Maps a user-supplied format string onto a supported output format.
 *
 * Expects:
 * - `value` is a string; matching is case-insensitive
 *
 * Returns:
 * - `"json"` or `"jsonl"` when requested, otherwise `"table"`
 */
function normalizeReportCasesFormat(value) {
	const lowered = value.toLowerCase();
	switch (lowered) {
		case "json":
		case "jsonl":
			return lowered;
		default:
			// Any unrecognized format falls back to the table renderer.
			return "table";
	}
}
|
|
3332
|
-
/**
 * Locates the `cases.jsonl` files that belong to a report path.
 *
 * Expects:
 * - `reportPath` is a `.jsonl` file, a directory containing `cases.jsonl`,
 *   or a directory to search recursively
 *
 * Returns:
 * - absolute file paths; recursive matches are sorted with `localeCompare`
 */
async function resolveCaseRecordPaths(reportPath) {
	const reportRoot = resolve(reportPath);
	// An existing path ending in ".jsonl" is used as the case file itself.
	if (existsSync(reportRoot) && reportRoot.endsWith(".jsonl")) return [reportRoot];
	const nestedCaseFile = resolve(reportRoot, "cases.jsonl");
	if (existsSync(nestedCaseFile)) return [nestedCaseFile];
	// Otherwise search the whole tree below the report root.
	const discovered = await glob("**/cases.jsonl", {
		absolute: true,
		cwd: reportRoot
	});
	discovered.sort((left, right) => left.localeCompare(right));
	return discovered;
}
|
|
3342
|
-
/**
 * Checks a case record against every parsed `where` filter.
 *
 * Expects:
 * - `whereFilters` entries have `key` and `value` string properties
 *
 * Returns:
 * - `true` when each filter key resolves on the record and its stringified
 *   value equals the filter value; an empty filter list matches everything
 */
function matchesWhereFilters(record, whereFilters) {
	for (const filter of whereFilters) {
		const lookup = getCaseSelectorValue(record, filter.key);
		if (!lookup.exists) return false;
		// Values are compared as strings because CLI filter values are strings.
		if (String(lookup.value) !== filter.value) return false;
	}
	return true;
}
|
|
3348
|
-
/**
 * Splits a `key=value` selector at its first `=` into a trimmed key and value.
 *
 * Expects:
 * - `selector` is a string such as `model=gpt`; the value may itself contain `=`
 *
 * Returns:
 * - `{ key, value }` with surrounding whitespace removed from both parts
 *
 * Throws:
 * - `Error` when the separator is missing or when the key or value is empty
 *   after trimming
 */
function parseSelector(selector) {
	const separatorIndex = selector.indexOf("=");
	const key = separatorIndex < 0 ? "" : selector.slice(0, separatorIndex).trim();
	const value = separatorIndex < 0 ? "" : selector.slice(separatorIndex + 1).trim();
	// Validate the trimmed parts so whitespace-only sides (" =x", "k= ") are
	// rejected exactly like empty ones ("=x", "k=").
	if (key.length === 0 || value.length === 0) throw new Error(`Invalid selector "${selector}". Expected "key=value".`);
	return {
		key,
		value
	};
}
|
|
3356
|
-
/**
 * Buckets case records by the value found under `groupBy` and summarizes
 * each bucket's scores.
 *
 * Expects:
 * - `records` entries expose a `scores` map
 * - `groupBy` is a lookup key for case records
 *
 * Returns:
 * - an object keyed by `<groupBy>=<value>`, ordered with `localeCompare`,
 *   whose values are `{ count, scores }`; records where the key does not
 *   resolve are left out
 */
function buildCaseGroups(records, groupBy) {
	const buckets = {};
	for (const record of records) {
		const lookup = getCaseSelectorValue(record, groupBy);
		if (!lookup.exists) continue;
		const label = `${groupBy}=${String(lookup.value)}`;
		const bucket = buckets[label] ??= {
			count: 0,
			scores: {}
		};
		bucket.count += 1;
		addScores(bucket.scores, record.scores);
	}
	const orderedEntries = Object.entries(buckets).sort(([left], [right]) => left.localeCompare(right));
	const finalizedEntries = orderedEntries.map(([label, bucket]) => [label, {
		count: bucket.count,
		scores: finalizeScores(bucket.scores)
	}]);
	return Object.fromEntries(finalizedEntries);
}
|
|
3374
|
-
/**
 * Accumulates one record's scores into a running per-score summary.
 *
 * Expects:
 * - `summary` is a mutable object keyed by score name
 * - `scores` maps score names to numeric values; a missing (`null` or
 *   `undefined`) map contributes nothing
 *
 * Returns:
 * - nothing; `summary` is updated in place (`count` and `sum` per score,
 *   `average` is left at its initial 0)
 */
function addScores(summary, scores) {
	for (const [scoreName, value] of Object.entries(scores ?? {})) {
		// Only an own property counts as an existing bucket. Inherited members
		// such as `constructor` or `toString` must not be mistaken for one,
		// otherwise the score is lost and a built-in object gets mutated.
		const hasBucket = Object.prototype.hasOwnProperty.call(summary, scoreName) && summary[scoreName] != null;
		if (!hasBucket) {
			// defineProperty also creates a real own property for "__proto__".
			Object.defineProperty(summary, scoreName, {
				configurable: true,
				enumerable: true,
				value: {
					average: 0,
					count: 0,
					sum: 0
				},
				writable: true
			});
		}
		const bucket = summary[scoreName];
		bucket.count += 1;
		bucket.sum += value;
	}
}
|
|
3385
|
-
/**
 * Converts accumulated score buckets into their final form.
 *
 * Expects:
 * - `summary` maps score names to buckets with numeric `count` and `sum`
 *
 * Returns:
 * - a new object ordered by score name (`localeCompare`) where each bucket
 *   carries `average` (0 when `count` is 0), `count`, and `sum`
 */
function finalizeScores(summary) {
	const orderedEntries = Object.entries(summary).sort(([left], [right]) => left.localeCompare(right));
	const finalizedEntries = orderedEntries.map(([scoreName, { count, sum }]) => {
		// Guard the division so an empty bucket reports 0 instead of NaN.
		const average = count === 0 ? 0 : sum / count;
		return [scoreName, {
			average,
			count,
			sum
		}];
	});
	return Object.fromEntries(finalizedEntries);
}
|
|
3392
|
-
/**
 * Renders the plain-text view of a report cases result.
 *
 * Expects:
 * - `output.records` is an array
 * - `output.groups`, when present, maps group keys to `{ count, scores }`
 *   where each score bucket has a numeric `average`
 *
 * Returns:
 * - newline-joined text: a title, the case count, and — when groups exist —
 *   one line per group with its count and three-decimal score averages
 */
function formatCasesTable(output) {
	const header = ["CASES vieval report", `Case count ${output.records.length}`];
	if (output.groups == null) return header.join("\n");
	const groupLines = Object.entries(output.groups).map(([groupKey, group]) => {
		const scoreText = Object.entries(group.scores).map(([scoreName, bucket]) => `${scoreName}=${bucket.average.toFixed(3)}`).join(" ");
		// Append the score summary only when the group has at least one score.
		const suffix = scoreText.length > 0 ? ` ${scoreText}` : "";
		return `${groupKey} count=${group.count}${suffix}`;
	});
	return [
		...header,
		"Groups",
		...groupLines
	].join("\n");
}
|
|
3403
|
-
//#endregion
|
|
3404
3474
|
//#region src/cli/report-case-compare.ts
|
|
3405
3475
|
const reportCompareHelpText = `
|
|
3406
3476
|
Compare normalized case records from two generated vieval reports.
|
|
@@ -3842,4 +3912,4 @@ async function runTopLevelCli(argv) {
|
|
|
3842
3912
|
//#endregion
|
|
3843
3913
|
export { runTopLevelCli as n, parseTopLevelCliArguments as t };
|
|
3844
3914
|
|
|
3845
|
-
//# sourceMappingURL=cli-
|
|
3915
|
+
//# sourceMappingURL=cli-DTDgaqeI.mjs.map
|