agentv 2.18.4 → 3.0.0-next.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +62 -36
- package/dist/agentv-provider-5CJVBBGG-2XVZBW7L.js +9 -0
- package/dist/{chunk-RMUVJ44Z.js → chunk-5WIB7A27.js} +598 -403
- package/dist/chunk-5WIB7A27.js.map +1 -0
- package/dist/chunk-6GSYTMXD.js +31520 -0
- package/dist/chunk-6GSYTMXD.js.map +1 -0
- package/dist/{chunk-KSUL3F3R.js → chunk-DY4ZDTTO.js} +1018 -140
- package/dist/chunk-DY4ZDTTO.js.map +1 -0
- package/dist/chunk-HF4X7ALN.js +24299 -0
- package/dist/chunk-HF4X7ALN.js.map +1 -0
- package/dist/{chunk-FV32QHPB.js → chunk-XOSNETAV.js} +1 -1
- package/dist/cli.js +5 -4
- package/dist/cli.js.map +1 -1
- package/dist/{dist-EDQZMZH2.js → dist-WN2QIOQR.js} +27 -11
- package/dist/{esm-DX3WQKEN.js → esm-CZAWIY6F.js} +2 -2
- package/dist/esm-CZAWIY6F.js.map +1 -0
- package/dist/index.js +5 -4
- package/dist/{interactive-J4IBXJF7.js → interactive-B432TCRZ.js} +5 -4
- package/dist/{interactive-J4IBXJF7.js.map → interactive-B432TCRZ.js.map} +1 -1
- package/dist/{src-2N5EJ2N6.js → src-ML4D2MC2.js} +2 -2
- package/dist/templates/.agentv/config.yaml +0 -5
- package/dist/templates/.agentv/targets.yaml +8 -11
- package/package.json +2 -2
- package/dist/chunk-KSUL3F3R.js.map +0 -1
- package/dist/chunk-RMUVJ44Z.js.map +0 -1
- package/dist/chunk-YTHTGLMT.js +0 -49786
- package/dist/chunk-YTHTGLMT.js.map +0 -1
- /package/dist/{dist-EDQZMZH2.js.map → agentv-provider-5CJVBBGG-2XVZBW7L.js.map} +0 -0
- /package/dist/{chunk-FV32QHPB.js.map → chunk-XOSNETAV.js.map} +0 -0
- /package/dist/{esm-DX3WQKEN.js.map → dist-WN2QIOQR.js.map} +0 -0
- /package/dist/{src-2N5EJ2N6.js.map → src-ML4D2MC2.js.map} +0 -0
|
@@ -25,12 +25,12 @@ import {
|
|
|
25
25
|
subscribeToCopilotCliLogEntries,
|
|
26
26
|
subscribeToCopilotSdkLogEntries,
|
|
27
27
|
subscribeToPiLogEntries
|
|
28
|
-
} from "./chunk-
|
|
28
|
+
} from "./chunk-HF4X7ALN.js";
|
|
29
29
|
|
|
30
30
|
// package.json
|
|
31
31
|
var package_default = {
|
|
32
32
|
name: "agentv",
|
|
33
|
-
version: "2.18.4",
|
|
33
|
+
version: "3.0.0-next.1",
|
|
34
34
|
description: "CLI entry point for AgentV",
|
|
35
35
|
type: "module",
|
|
36
36
|
repository: {
|
|
@@ -95,7 +95,7 @@ async function resolveEvalPaths(evalPaths, cwd) {
|
|
|
95
95
|
const candidatePath = path.isAbsolute(pattern) ? path.normalize(pattern) : path.resolve(cwd, pattern);
|
|
96
96
|
try {
|
|
97
97
|
const stats = await stat(candidatePath);
|
|
98
|
-
if (stats.isFile() && /\.(ya?ml|jsonl)$/i.test(candidatePath)) {
|
|
98
|
+
if (stats.isFile() && /\.(ya?ml|jsonl|json)$/i.test(candidatePath)) {
|
|
99
99
|
results.add(candidatePath);
|
|
100
100
|
continue;
|
|
101
101
|
}
|
|
@@ -110,7 +110,7 @@ async function resolveEvalPaths(evalPaths, cwd) {
|
|
|
110
110
|
dot: true,
|
|
111
111
|
followSymbolicLinks: true
|
|
112
112
|
});
|
|
113
|
-
const yamlMatches = matches.filter((filePath) => /\.(ya?ml|jsonl)$/i.test(filePath));
|
|
113
|
+
const yamlMatches = matches.filter((filePath) => /\.(ya?ml|jsonl|json)$/i.test(filePath));
|
|
114
114
|
if (yamlMatches.length === 0) {
|
|
115
115
|
unmatched.push(pattern);
|
|
116
116
|
continue;
|
|
@@ -123,7 +123,7 @@ async function resolveEvalPaths(evalPaths, cwd) {
|
|
|
123
123
|
throw new Error(
|
|
124
124
|
`No eval files matched: ${unmatched.join(
|
|
125
125
|
", "
|
|
126
|
-
)}. Provide YAML or
|
|
126
|
+
)}. Provide YAML, JSONL, or JSON paths or globs (e.g., "evals/**/*.yaml", "evals/**/*.jsonl", "evals.json").`
|
|
127
127
|
);
|
|
128
128
|
}
|
|
129
129
|
const sorted = Array.from(results);
|
|
@@ -201,7 +201,7 @@ async function discoverTargetsFile(options) {
|
|
|
201
201
|
// src/commands/eval/run-eval.ts
|
|
202
202
|
import { constants as constants4 } from "node:fs";
|
|
203
203
|
import { access as access4 } from "node:fs/promises";
|
|
204
|
-
import
|
|
204
|
+
import path12 from "node:path";
|
|
205
205
|
import { pathToFileURL } from "node:url";
|
|
206
206
|
|
|
207
207
|
// src/version-check.ts
|
|
@@ -258,16 +258,316 @@ async function promptContinue() {
|
|
|
258
258
|
return confirm({ message: "Continue anyway?", default: false });
|
|
259
259
|
}
|
|
260
260
|
|
|
261
|
+
// src/commands/eval/artifact-writer.ts
|
|
262
|
+
import { mkdir, readFile, writeFile } from "node:fs/promises";
|
|
263
|
+
import path3 from "node:path";
|
|
264
|
+
var PASS_THRESHOLD = 0.8;
|
|
265
|
+
function computeStats(values) {
|
|
266
|
+
if (values.length === 0) {
|
|
267
|
+
return { mean: 0, stddev: 0 };
|
|
268
|
+
}
|
|
269
|
+
const mean = values.reduce((sum, v) => sum + v, 0) / values.length;
|
|
270
|
+
const variance = values.reduce((sum, v) => sum + (v - mean) ** 2, 0) / values.length;
|
|
271
|
+
return {
|
|
272
|
+
mean: Math.round(mean * 1e3) / 1e3,
|
|
273
|
+
stddev: Math.round(Math.sqrt(variance) * 1e3) / 1e3
|
|
274
|
+
};
|
|
275
|
+
}
|
|
276
|
+
function computePassRate(result) {
|
|
277
|
+
const scores = result.scores;
|
|
278
|
+
if (scores && scores.length > 0) {
|
|
279
|
+
const passed = scores.filter((s) => s.score >= PASS_THRESHOLD).length;
|
|
280
|
+
return passed / scores.length;
|
|
281
|
+
}
|
|
282
|
+
return result.score >= PASS_THRESHOLD ? 1 : 0;
|
|
283
|
+
}
|
|
284
|
+
function countToolCalls(result) {
|
|
285
|
+
const toolCalls = {};
|
|
286
|
+
let total = 0;
|
|
287
|
+
const trace = result.trace;
|
|
288
|
+
if (trace?.steps) {
|
|
289
|
+
for (const step of trace.steps) {
|
|
290
|
+
if (step.toolName || step.type === "tool") {
|
|
291
|
+
const name = step.toolName ?? "unknown";
|
|
292
|
+
toolCalls[name] = (toolCalls[name] ?? 0) + 1;
|
|
293
|
+
total += 1;
|
|
294
|
+
}
|
|
295
|
+
}
|
|
296
|
+
}
|
|
297
|
+
return { toolCalls, total };
|
|
298
|
+
}
|
|
299
|
+
function parseWorkspaceChanges(fileChanges) {
|
|
300
|
+
if (!fileChanges) {
|
|
301
|
+
return void 0;
|
|
302
|
+
}
|
|
303
|
+
let filesModified = 0;
|
|
304
|
+
let filesCreated = 0;
|
|
305
|
+
const lines = fileChanges.split("\n");
|
|
306
|
+
for (const line of lines) {
|
|
307
|
+
if (line.startsWith("--- /dev/null")) {
|
|
308
|
+
filesCreated += 1;
|
|
309
|
+
} else if (line.startsWith("--- a/")) {
|
|
310
|
+
filesModified += 1;
|
|
311
|
+
}
|
|
312
|
+
}
|
|
313
|
+
const summaryLines = lines.slice(0, 20);
|
|
314
|
+
const diffSummary = lines.length > 20 ? `${summaryLines.join("\n")}
|
|
315
|
+
... (${lines.length - 20} more lines)` : fileChanges;
|
|
316
|
+
return {
|
|
317
|
+
files_modified: filesModified,
|
|
318
|
+
files_created: filesCreated,
|
|
319
|
+
diff_summary: diffSummary
|
|
320
|
+
};
|
|
321
|
+
}
|
|
322
|
+
function buildExpectations(result) {
|
|
323
|
+
const expectations = [];
|
|
324
|
+
if (result.scores && result.scores.length > 0) {
|
|
325
|
+
for (const evaluator of result.scores) {
|
|
326
|
+
for (const hit of evaluator.hits) {
|
|
327
|
+
expectations.push({
|
|
328
|
+
text: hit,
|
|
329
|
+
passed: true,
|
|
330
|
+
evidence: evaluator.reasoning ?? ""
|
|
331
|
+
});
|
|
332
|
+
}
|
|
333
|
+
for (const miss of evaluator.misses) {
|
|
334
|
+
expectations.push({
|
|
335
|
+
text: miss,
|
|
336
|
+
passed: false,
|
|
337
|
+
evidence: evaluator.reasoning ?? ""
|
|
338
|
+
});
|
|
339
|
+
}
|
|
340
|
+
}
|
|
341
|
+
} else {
|
|
342
|
+
for (const hit of result.hits) {
|
|
343
|
+
expectations.push({ text: hit, passed: true, evidence: result.reasoning ?? "" });
|
|
344
|
+
}
|
|
345
|
+
for (const miss of result.misses) {
|
|
346
|
+
expectations.push({ text: miss, passed: false, evidence: result.reasoning ?? "" });
|
|
347
|
+
}
|
|
348
|
+
}
|
|
349
|
+
return expectations;
|
|
350
|
+
}
|
|
351
|
+
function buildEvaluators(scores) {
|
|
352
|
+
if (!scores || scores.length === 0) {
|
|
353
|
+
return void 0;
|
|
354
|
+
}
|
|
355
|
+
return scores.map((s) => ({
|
|
356
|
+
name: s.name,
|
|
357
|
+
type: s.type,
|
|
358
|
+
score: s.score,
|
|
359
|
+
reasoning: s.reasoning ?? "",
|
|
360
|
+
weight: s.weight,
|
|
361
|
+
verdict: s.verdict,
|
|
362
|
+
hits: s.hits,
|
|
363
|
+
misses: s.misses,
|
|
364
|
+
details: s.details
|
|
365
|
+
}));
|
|
366
|
+
}
|
|
367
|
+
function buildGradingArtifact(result) {
|
|
368
|
+
const expectations = buildExpectations(result);
|
|
369
|
+
const passed = expectations.filter((e) => e.passed).length;
|
|
370
|
+
const failed = expectations.filter((e) => !e.passed).length;
|
|
371
|
+
const total = expectations.length;
|
|
372
|
+
const { toolCalls, total: totalToolCalls } = countToolCalls(result);
|
|
373
|
+
const errorsEncountered = result.error ? 1 : 0;
|
|
374
|
+
return {
|
|
375
|
+
expectations,
|
|
376
|
+
summary: {
|
|
377
|
+
passed,
|
|
378
|
+
failed,
|
|
379
|
+
total,
|
|
380
|
+
pass_rate: total > 0 ? Math.round(passed / total * 1e3) / 1e3 : 0
|
|
381
|
+
},
|
|
382
|
+
execution_metrics: {
|
|
383
|
+
tool_calls: toolCalls,
|
|
384
|
+
total_tool_calls: totalToolCalls,
|
|
385
|
+
errors_encountered: errorsEncountered
|
|
386
|
+
},
|
|
387
|
+
evaluators: buildEvaluators(result.scores),
|
|
388
|
+
workspace_changes: parseWorkspaceChanges(result.fileChanges),
|
|
389
|
+
conversation: result.conversationId ? {
|
|
390
|
+
turns: result.trace ? result.trace.steps?.length ?? 0 : 0,
|
|
391
|
+
conversation_id: result.conversationId
|
|
392
|
+
} : void 0
|
|
393
|
+
};
|
|
394
|
+
}
|
|
395
|
+
function buildTimingArtifact(results) {
|
|
396
|
+
let totalInput = 0;
|
|
397
|
+
let totalOutput = 0;
|
|
398
|
+
let totalDurationMs = 0;
|
|
399
|
+
for (const result of results) {
|
|
400
|
+
const usage = result.tokenUsage;
|
|
401
|
+
if (usage) {
|
|
402
|
+
totalInput += usage.input ?? 0;
|
|
403
|
+
totalOutput += usage.output ?? 0;
|
|
404
|
+
}
|
|
405
|
+
if (result.durationMs != null) {
|
|
406
|
+
totalDurationMs += result.durationMs;
|
|
407
|
+
}
|
|
408
|
+
}
|
|
409
|
+
return {
|
|
410
|
+
total_tokens: totalInput + totalOutput,
|
|
411
|
+
duration_ms: totalDurationMs,
|
|
412
|
+
total_duration_seconds: Math.round(totalDurationMs / 1e3 * 1e3) / 1e3,
|
|
413
|
+
token_usage: {
|
|
414
|
+
input: totalInput,
|
|
415
|
+
output: totalOutput
|
|
416
|
+
}
|
|
417
|
+
};
|
|
418
|
+
}
|
|
419
|
+
function buildBenchmarkArtifact(results, evalFile = "") {
|
|
420
|
+
const targetSet = /* @__PURE__ */ new Set();
|
|
421
|
+
const testIdSet = /* @__PURE__ */ new Set();
|
|
422
|
+
for (const result of results) {
|
|
423
|
+
targetSet.add(result.target);
|
|
424
|
+
testIdSet.add(result.testId);
|
|
425
|
+
}
|
|
426
|
+
const targets = [...targetSet].sort();
|
|
427
|
+
const testIds = [...testIdSet].sort();
|
|
428
|
+
const runSummary = {};
|
|
429
|
+
const notes = [];
|
|
430
|
+
for (const target of targets) {
|
|
431
|
+
const targetResults = results.filter((r) => r.target === target);
|
|
432
|
+
const passRates = targetResults.map(computePassRate);
|
|
433
|
+
const timings = targetResults.filter((r) => r.durationMs != null).map((r) => r.durationMs / 1e3);
|
|
434
|
+
const tokens = targetResults.filter((r) => r.tokenUsage != null).map((r) => {
|
|
435
|
+
const usage = r.tokenUsage;
|
|
436
|
+
return (usage.input ?? 0) + (usage.output ?? 0);
|
|
437
|
+
});
|
|
438
|
+
const entry = {
|
|
439
|
+
pass_rate: computeStats(passRates),
|
|
440
|
+
time_seconds: computeStats(timings),
|
|
441
|
+
tokens: computeStats(tokens)
|
|
442
|
+
};
|
|
443
|
+
const toolCallCounts = targetResults.map((r) => countToolCalls(r).total);
|
|
444
|
+
if (toolCallCounts.some((c) => c > 0)) {
|
|
445
|
+
entry.tool_calls = computeStats(toolCallCounts);
|
|
446
|
+
}
|
|
447
|
+
const costs = targetResults.filter((r) => r.costUsd != null).map((r) => r.costUsd);
|
|
448
|
+
if (costs.length > 0) {
|
|
449
|
+
entry.cost_usd = computeStats(costs);
|
|
450
|
+
}
|
|
451
|
+
runSummary[target] = entry;
|
|
452
|
+
}
|
|
453
|
+
const evaluatorScores = /* @__PURE__ */ new Map();
|
|
454
|
+
for (const result of results) {
|
|
455
|
+
if (result.scores) {
|
|
456
|
+
for (const score of result.scores) {
|
|
457
|
+
const key = `${score.name}:${score.type}`;
|
|
458
|
+
if (!evaluatorScores.has(key)) {
|
|
459
|
+
evaluatorScores.set(key, []);
|
|
460
|
+
}
|
|
461
|
+
evaluatorScores.get(key)?.push(score.score);
|
|
462
|
+
}
|
|
463
|
+
}
|
|
464
|
+
}
|
|
465
|
+
let perEvaluatorSummary;
|
|
466
|
+
if (evaluatorScores.size > 0) {
|
|
467
|
+
perEvaluatorSummary = {};
|
|
468
|
+
for (const [key, scores] of evaluatorScores) {
|
|
469
|
+
perEvaluatorSummary[key] = computeStats(scores);
|
|
470
|
+
}
|
|
471
|
+
}
|
|
472
|
+
const errorCount = results.filter((r) => r.executionStatus === "execution_error").length;
|
|
473
|
+
if (errorCount > 0) {
|
|
474
|
+
notes.push(
|
|
475
|
+
`${errorCount} test(s) had execution errors and are included in pass_rate as failures`
|
|
476
|
+
);
|
|
477
|
+
}
|
|
478
|
+
if (results.length === 0) {
|
|
479
|
+
notes.push("No results to summarize");
|
|
480
|
+
}
|
|
481
|
+
const firstResult = results[0];
|
|
482
|
+
const timestamp = firstResult?.timestamp ?? (/* @__PURE__ */ new Date()).toISOString();
|
|
483
|
+
return {
|
|
484
|
+
metadata: {
|
|
485
|
+
eval_file: evalFile,
|
|
486
|
+
timestamp,
|
|
487
|
+
targets,
|
|
488
|
+
tests_run: testIds
|
|
489
|
+
},
|
|
490
|
+
run_summary: runSummary,
|
|
491
|
+
per_evaluator_summary: perEvaluatorSummary,
|
|
492
|
+
notes
|
|
493
|
+
};
|
|
494
|
+
}
|
|
495
|
+
async function writeArtifactsFromResults(results, outputDir, options) {
|
|
496
|
+
const gradingDir = path3.join(outputDir, "grading");
|
|
497
|
+
const timingPath = path3.join(outputDir, "timing.json");
|
|
498
|
+
const benchmarkPath = path3.join(outputDir, "benchmark.json");
|
|
499
|
+
await mkdir(gradingDir, { recursive: true });
|
|
500
|
+
for (const result of results) {
|
|
501
|
+
const grading = buildGradingArtifact(result);
|
|
502
|
+
const safeTestId = result.testId.replace(/[/\\:*?"<>|]/g, "_");
|
|
503
|
+
const gradingPath = path3.join(gradingDir, `${safeTestId}.json`);
|
|
504
|
+
await writeFile(gradingPath, `${JSON.stringify(grading, null, 2)}
|
|
505
|
+
`, "utf8");
|
|
506
|
+
}
|
|
507
|
+
const timing = buildTimingArtifact(results);
|
|
508
|
+
await writeFile(timingPath, `${JSON.stringify(timing, null, 2)}
|
|
509
|
+
`, "utf8");
|
|
510
|
+
const benchmark = buildBenchmarkArtifact(results, options?.evalFile);
|
|
511
|
+
await writeFile(benchmarkPath, `${JSON.stringify(benchmark, null, 2)}
|
|
512
|
+
`, "utf8");
|
|
513
|
+
return { gradingDir, timingPath, benchmarkPath };
|
|
514
|
+
}
|
|
515
|
+
|
|
516
|
+
// src/commands/eval/benchmark-writer.ts
|
|
517
|
+
import { writeFile as writeFile2 } from "node:fs/promises";
|
|
518
|
+
var PASS_THRESHOLD2 = 0.8;
|
|
519
|
+
function computeStats2(values) {
|
|
520
|
+
if (values.length === 0) {
|
|
521
|
+
return { mean: 0, stddev: 0 };
|
|
522
|
+
}
|
|
523
|
+
const mean = values.reduce((sum, v) => sum + v, 0) / values.length;
|
|
524
|
+
const variance = values.reduce((sum, v) => sum + (v - mean) ** 2, 0) / values.length;
|
|
525
|
+
return {
|
|
526
|
+
mean: Math.round(mean * 1e3) / 1e3,
|
|
527
|
+
stddev: Math.round(Math.sqrt(variance) * 1e3) / 1e3
|
|
528
|
+
};
|
|
529
|
+
}
|
|
530
|
+
function computePassRate2(result) {
|
|
531
|
+
const scores = result.scores;
|
|
532
|
+
if (scores && scores.length > 0) {
|
|
533
|
+
const passed = scores.filter((s) => s.score >= PASS_THRESHOLD2).length;
|
|
534
|
+
return passed / scores.length;
|
|
535
|
+
}
|
|
536
|
+
return result.score >= PASS_THRESHOLD2 ? 1 : 0;
|
|
537
|
+
}
|
|
538
|
+
function buildBenchmarkJson(results) {
|
|
539
|
+
const passRates = results.map(computePassRate2);
|
|
540
|
+
const timings = results.filter((r) => r.durationMs != null).map((r) => r.durationMs / 1e3);
|
|
541
|
+
const tokens = results.filter((r) => r.tokenUsage != null).map((r) => {
|
|
542
|
+
const usage = r.tokenUsage;
|
|
543
|
+
return (usage.input ?? 0) + (usage.output ?? 0);
|
|
544
|
+
});
|
|
545
|
+
return {
|
|
546
|
+
run_summary: {
|
|
547
|
+
with_skill: {
|
|
548
|
+
pass_rate: computeStats2(passRates),
|
|
549
|
+
time_seconds: computeStats2(timings),
|
|
550
|
+
tokens: computeStats2(tokens)
|
|
551
|
+
}
|
|
552
|
+
}
|
|
553
|
+
};
|
|
554
|
+
}
|
|
555
|
+
async function writeBenchmarkJson(outputPath, results) {
|
|
556
|
+
const benchmark = buildBenchmarkJson(results);
|
|
557
|
+
await writeFile2(outputPath, `${JSON.stringify(benchmark, null, 2)}
|
|
558
|
+
`, "utf8");
|
|
559
|
+
}
|
|
560
|
+
|
|
261
561
|
// src/commands/eval/env.ts
|
|
262
562
|
import { constants as constants3 } from "node:fs";
|
|
263
563
|
import { access as access3 } from "node:fs/promises";
|
|
264
|
-
import
|
|
564
|
+
import path4 from "node:path";
|
|
265
565
|
import { config as loadDotenv } from "dotenv";
|
|
266
566
|
function uniqueDirs(directories) {
|
|
267
567
|
const seen = /* @__PURE__ */ new Set();
|
|
268
568
|
const result = [];
|
|
269
569
|
for (const dir of directories) {
|
|
270
|
-
const absolute =
|
|
570
|
+
const absolute = path4.resolve(dir);
|
|
271
571
|
if (seen.has(absolute)) {
|
|
272
572
|
continue;
|
|
273
573
|
}
|
|
@@ -286,14 +586,14 @@ async function fileExists2(filePath) {
|
|
|
286
586
|
}
|
|
287
587
|
function collectAncestorDirectories(start, boundary) {
|
|
288
588
|
const directories = [];
|
|
289
|
-
const boundaryDir =
|
|
290
|
-
let current =
|
|
589
|
+
const boundaryDir = path4.resolve(boundary);
|
|
590
|
+
let current = path4.resolve(start);
|
|
291
591
|
while (current !== void 0) {
|
|
292
592
|
directories.push(current);
|
|
293
593
|
if (current === boundaryDir) {
|
|
294
594
|
break;
|
|
295
595
|
}
|
|
296
|
-
const parent =
|
|
596
|
+
const parent = path4.dirname(current);
|
|
297
597
|
if (parent === current) {
|
|
298
598
|
break;
|
|
299
599
|
}
|
|
@@ -303,12 +603,12 @@ function collectAncestorDirectories(start, boundary) {
|
|
|
303
603
|
}
|
|
304
604
|
async function loadEnvFromHierarchy(options) {
|
|
305
605
|
const { testFilePath, repoRoot, verbose } = options;
|
|
306
|
-
const testDir =
|
|
606
|
+
const testDir = path4.dirname(path4.resolve(testFilePath));
|
|
307
607
|
const cwd = process.cwd();
|
|
308
608
|
const searchDirs = uniqueDirs([...collectAncestorDirectories(testDir, repoRoot), repoRoot, cwd]);
|
|
309
609
|
const envFiles = [];
|
|
310
610
|
for (const dir of searchDirs) {
|
|
311
|
-
const candidate =
|
|
611
|
+
const candidate = path4.join(dir, ".env");
|
|
312
612
|
if (await fileExists2(candidate)) {
|
|
313
613
|
envFiles.push(candidate);
|
|
314
614
|
}
|
|
@@ -319,7 +619,7 @@ async function loadEnvFromHierarchy(options) {
|
|
|
319
619
|
}
|
|
320
620
|
return void 0;
|
|
321
621
|
}
|
|
322
|
-
for (let i =
|
|
622
|
+
for (let i = 0; i < envFiles.length; i++) {
|
|
323
623
|
const envFile = envFiles[i];
|
|
324
624
|
loadDotenv({ path: envFile, override: false });
|
|
325
625
|
if (verbose) {
|
|
@@ -330,83 +630,11 @@ async function loadEnvFromHierarchy(options) {
|
|
|
330
630
|
}
|
|
331
631
|
|
|
332
632
|
// src/commands/eval/output-writer.ts
|
|
333
|
-
import
|
|
334
|
-
|
|
335
|
-
// src/commands/eval/json-writer.ts
|
|
336
|
-
import { mkdir, writeFile } from "node:fs/promises";
|
|
337
|
-
import path4 from "node:path";
|
|
338
|
-
|
|
339
|
-
// src/utils/case-conversion.ts
|
|
340
|
-
function toSnakeCase(str) {
|
|
341
|
-
if (/^[A-Z]/.test(str)) {
|
|
342
|
-
return str;
|
|
343
|
-
}
|
|
344
|
-
return str.replace(/[A-Z]/g, (letter) => `_${letter.toLowerCase()}`);
|
|
345
|
-
}
|
|
346
|
-
function toSnakeCaseDeep(obj) {
|
|
347
|
-
if (obj === null || obj === void 0) {
|
|
348
|
-
return obj;
|
|
349
|
-
}
|
|
350
|
-
if (Array.isArray(obj)) {
|
|
351
|
-
return obj.map((item) => toSnakeCaseDeep(item));
|
|
352
|
-
}
|
|
353
|
-
if (typeof obj === "object") {
|
|
354
|
-
const result = {};
|
|
355
|
-
for (const [key, value] of Object.entries(obj)) {
|
|
356
|
-
const snakeKey = toSnakeCase(key);
|
|
357
|
-
result[snakeKey] = toSnakeCaseDeep(value);
|
|
358
|
-
}
|
|
359
|
-
return result;
|
|
360
|
-
}
|
|
361
|
-
return obj;
|
|
362
|
-
}
|
|
363
|
-
|
|
364
|
-
// src/commands/eval/json-writer.ts
|
|
365
|
-
var JsonWriter = class _JsonWriter {
|
|
366
|
-
filePath;
|
|
367
|
-
results = [];
|
|
368
|
-
closed = false;
|
|
369
|
-
constructor(filePath) {
|
|
370
|
-
this.filePath = filePath;
|
|
371
|
-
}
|
|
372
|
-
static async open(filePath) {
|
|
373
|
-
await mkdir(path4.dirname(filePath), { recursive: true });
|
|
374
|
-
return new _JsonWriter(filePath);
|
|
375
|
-
}
|
|
376
|
-
async append(result) {
|
|
377
|
-
if (this.closed) {
|
|
378
|
-
throw new Error("Cannot write to closed JSON writer");
|
|
379
|
-
}
|
|
380
|
-
this.results.push(result);
|
|
381
|
-
}
|
|
382
|
-
async close() {
|
|
383
|
-
if (this.closed) {
|
|
384
|
-
return;
|
|
385
|
-
}
|
|
386
|
-
this.closed = true;
|
|
387
|
-
const passed = this.results.filter((r) => r.score >= 0.5).length;
|
|
388
|
-
const failed = this.results.length - passed;
|
|
389
|
-
const total = this.results.length;
|
|
390
|
-
const output = {
|
|
391
|
-
stats: {
|
|
392
|
-
total,
|
|
393
|
-
passed,
|
|
394
|
-
failed,
|
|
395
|
-
passRate: total > 0 ? passed / total : 0
|
|
396
|
-
},
|
|
397
|
-
results: this.results
|
|
398
|
-
};
|
|
399
|
-
const snakeCaseOutput = toSnakeCaseDeep(output);
|
|
400
|
-
await writeFile(this.filePath, `${JSON.stringify(snakeCaseOutput, null, 2)}
|
|
401
|
-
`, "utf8");
|
|
402
|
-
}
|
|
403
|
-
};
|
|
633
|
+
import path10 from "node:path";
|
|
404
634
|
|
|
405
|
-
// src/commands/eval/
|
|
406
|
-
import {
|
|
407
|
-
import { mkdir as mkdir2 } from "node:fs/promises";
|
|
635
|
+
// src/commands/eval/html-writer.ts
|
|
636
|
+
import { mkdir as mkdir2, writeFile as writeFile3 } from "node:fs/promises";
|
|
408
637
|
import path5 from "node:path";
|
|
409
|
-
import { finished } from "node:stream/promises";
|
|
410
638
|
|
|
411
639
|
// ../../node_modules/.bun/async-mutex@0.5.0/node_modules/async-mutex/index.mjs
|
|
412
640
|
var E_TIMEOUT = new Error("timeout while waiting for mutex to become available");
|
|
@@ -614,7 +842,597 @@ var Mutex = class {
|
|
|
614
842
|
}
|
|
615
843
|
};
|
|
616
844
|
|
|
845
|
+
// src/commands/eval/html-writer.ts
|
|
846
|
+
var HtmlWriter = class _HtmlWriter {
|
|
847
|
+
filePath;
|
|
848
|
+
results = [];
|
|
849
|
+
mutex = new Mutex();
|
|
850
|
+
closed = false;
|
|
851
|
+
isLive = true;
|
|
852
|
+
constructor(filePath) {
|
|
853
|
+
this.filePath = filePath;
|
|
854
|
+
}
|
|
855
|
+
static async open(filePath) {
|
|
856
|
+
await mkdir2(path5.dirname(filePath), { recursive: true });
|
|
857
|
+
const writer = new _HtmlWriter(filePath);
|
|
858
|
+
await writer.writeHtml();
|
|
859
|
+
return writer;
|
|
860
|
+
}
|
|
861
|
+
async append(result) {
|
|
862
|
+
await this.mutex.runExclusive(async () => {
|
|
863
|
+
if (this.closed) {
|
|
864
|
+
throw new Error("Cannot write to closed HTML writer");
|
|
865
|
+
}
|
|
866
|
+
this.results.push(result);
|
|
867
|
+
await this.writeHtml();
|
|
868
|
+
});
|
|
869
|
+
}
|
|
870
|
+
async close() {
|
|
871
|
+
await this.mutex.runExclusive(async () => {
|
|
872
|
+
if (this.closed) {
|
|
873
|
+
return;
|
|
874
|
+
}
|
|
875
|
+
this.closed = true;
|
|
876
|
+
this.isLive = false;
|
|
877
|
+
await this.writeHtml();
|
|
878
|
+
});
|
|
879
|
+
}
|
|
880
|
+
async writeHtml() {
|
|
881
|
+
const html = generateHtml(this.results, this.isLive);
|
|
882
|
+
await writeFile3(this.filePath, html, "utf8");
|
|
883
|
+
}
|
|
884
|
+
};
|
|
885
|
+
function generateHtml(results, isLive) {
|
|
886
|
+
const lightResults = results.map((r) => {
|
|
887
|
+
const { requests, trace, ...rest } = r;
|
|
888
|
+
return rest;
|
|
889
|
+
});
|
|
890
|
+
const dataJson = JSON.stringify(lightResults).replace(/<\//g, "<\\/");
|
|
891
|
+
const metaRefresh = isLive ? ' <meta http-equiv="refresh" content="2">\n' : "";
|
|
892
|
+
const liveIndicator = isLive ? '<span class="live-badge">\u25CF LIVE</span>' : `<span class="timestamp">${escapeHtml((/* @__PURE__ */ new Date()).toISOString())}</span>`;
|
|
893
|
+
return `<!DOCTYPE html>
|
|
894
|
+
<html lang="en">
|
|
895
|
+
<head>
|
|
896
|
+
<meta charset="utf-8">
|
|
897
|
+
<meta name="viewport" content="width=device-width, initial-scale=1">
|
|
898
|
+
${metaRefresh} <title>AgentV Evaluation Report</title>
|
|
899
|
+
<style>
|
|
900
|
+
${STYLES}
|
|
901
|
+
</style>
|
|
902
|
+
</head>
|
|
903
|
+
<body>
|
|
904
|
+
<header class="header">
|
|
905
|
+
<div class="header-left">
|
|
906
|
+
<h1 class="header-title">AgentV</h1>
|
|
907
|
+
<span class="header-subtitle">Evaluation Report</span>
|
|
908
|
+
</div>
|
|
909
|
+
<div class="header-right">${liveIndicator}</div>
|
|
910
|
+
</header>
|
|
911
|
+
<nav class="tabs" id="tabs">
|
|
912
|
+
<button class="tab active" data-tab="overview">Overview</button>
|
|
913
|
+
<button class="tab" data-tab="tests">Test Cases</button>
|
|
914
|
+
</nav>
|
|
915
|
+
<main id="app"></main>
|
|
916
|
+
<script>
|
|
917
|
+
var DATA = ${dataJson};
|
|
918
|
+
var IS_LIVE = ${String(isLive)};
|
|
919
|
+
${SCRIPT}
|
|
920
|
+
</script>
|
|
921
|
+
</body>
|
|
922
|
+
</html>`;
|
|
923
|
+
}
|
|
924
|
+
function escapeHtml(s) {
|
|
925
|
+
return s.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;");
|
|
926
|
+
}
|
|
927
|
+
var STYLES = `
|
|
928
|
+
*{margin:0;padding:0;box-sizing:border-box}
|
|
929
|
+
:root{
|
|
930
|
+
--bg:#f6f8fa;--surface:#fff;--border:#d0d7de;--border-light:#e8ebee;
|
|
931
|
+
--text:#1f2328;--text-muted:#656d76;
|
|
932
|
+
--primary:#0969da;--primary-bg:#ddf4ff;
|
|
933
|
+
--success:#1a7f37;--success-bg:#dafbe1;
|
|
934
|
+
--danger:#cf222e;--danger-bg:#ffebe9;
|
|
935
|
+
--warning:#9a6700;--warning-bg:#fff8c5;
|
|
936
|
+
--radius:6px;
|
|
937
|
+
--shadow:0 1px 3px rgba(31,35,40,.04),0 1px 2px rgba(31,35,40,.06);
|
|
938
|
+
--font:-apple-system,BlinkMacSystemFont,"Segoe UI","Noto Sans",Helvetica,Arial,sans-serif;
|
|
939
|
+
--mono:ui-monospace,SFMono-Regular,"SF Mono",Menlo,Consolas,monospace;
|
|
940
|
+
}
|
|
941
|
+
body{font-family:var(--font);background:var(--bg);color:var(--text);line-height:1.5;font-size:14px}
|
|
942
|
+
|
|
943
|
+
/* Header */
|
|
944
|
+
.header{background:var(--surface);border-bottom:1px solid var(--border);padding:12px 24px;display:flex;align-items:center;justify-content:space-between}
|
|
945
|
+
.header-left{display:flex;align-items:baseline;gap:12px}
|
|
946
|
+
.header-title{font-size:18px;font-weight:600}
|
|
947
|
+
.header-subtitle{font-size:14px;color:var(--text-muted)}
|
|
948
|
+
.live-badge{color:var(--success);font-size:12px;font-weight:600;animation:pulse 2s infinite}
|
|
949
|
+
@keyframes pulse{0%,100%{opacity:1}50%{opacity:.4}}
|
|
950
|
+
.timestamp{font-size:12px;color:var(--text-muted);font-family:var(--mono)}
|
|
951
|
+
|
|
952
|
+
/* Tabs */
|
|
953
|
+
.tabs{background:var(--surface);border-bottom:1px solid var(--border);padding:0 24px;display:flex}
|
|
954
|
+
.tab{background:none;border:none;padding:10px 16px;font-size:14px;color:var(--text-muted);cursor:pointer;border-bottom:2px solid transparent;font-family:var(--font);transition:color .15s,border-color .15s}
|
|
955
|
+
.tab:hover{color:var(--text)}
|
|
956
|
+
.tab.active{color:var(--text);font-weight:600;border-bottom-color:var(--primary)}
|
|
957
|
+
|
|
958
|
+
#app{max-width:1280px;margin:0 auto;padding:24px}
|
|
959
|
+
|
|
960
|
+
/* Stat cards */
|
|
961
|
+
.stats-grid{display:grid;grid-template-columns:repeat(auto-fit,minmax(140px,1fr));gap:12px;margin-bottom:24px}
|
|
962
|
+
.stat-card{background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);padding:16px;text-align:center;box-shadow:var(--shadow)}
|
|
963
|
+
.stat-card.pass .stat-value{color:var(--success)}
|
|
964
|
+
.stat-card.fail .stat-value{color:var(--danger)}
|
|
965
|
+
.stat-card.error .stat-value{color:var(--danger)}
|
|
966
|
+
.stat-card.warn .stat-value{color:var(--warning)}
|
|
967
|
+
.stat-card.total .stat-value{color:var(--primary)}
|
|
968
|
+
.stat-value{font-size:28px;font-weight:700;line-height:1.2}
|
|
969
|
+
.stat-label{font-size:12px;color:var(--text-muted);text-transform:uppercase;letter-spacing:.5px;margin-top:4px}
|
|
970
|
+
|
|
971
|
+
/* Sections */
|
|
972
|
+
.section{margin-bottom:24px}
|
|
973
|
+
.section-title{font-size:16px;font-weight:600;margin-bottom:12px}
|
|
974
|
+
|
|
975
|
+
/* Tables */
|
|
976
|
+
.table-wrap{overflow-x:auto;background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);box-shadow:var(--shadow)}
|
|
977
|
+
.data-table{width:100%;border-collapse:collapse;font-size:13px}
|
|
978
|
+
.data-table th{background:var(--bg);border-bottom:1px solid var(--border);padding:8px 12px;text-align:left;font-weight:600;font-size:12px;color:var(--text-muted);text-transform:uppercase;letter-spacing:.3px;white-space:nowrap}
|
|
979
|
+
.data-table th.sortable{cursor:pointer;user-select:none}
|
|
980
|
+
.data-table th.sortable:hover{color:var(--text)}
|
|
981
|
+
.data-table td{padding:8px 12px;border-bottom:1px solid var(--border-light);vertical-align:middle}
|
|
982
|
+
.data-table tbody tr:last-child td{border-bottom:none}
|
|
983
|
+
|
|
984
|
+
/* Status icons */
|
|
985
|
+
.status-icon{display:inline-flex;align-items:center;justify-content:center;width:22px;height:22px;border-radius:50%;font-size:12px;font-weight:700}
|
|
986
|
+
.status-icon.pass{background:var(--success-bg);color:var(--success)}
|
|
987
|
+
.status-icon.fail{background:var(--danger-bg);color:var(--danger)}
|
|
988
|
+
.status-icon.error{background:var(--warning-bg);color:var(--warning)}
|
|
989
|
+
|
|
990
|
+
/* Score colors */
|
|
991
|
+
.score-high{color:var(--success);font-weight:600}
|
|
992
|
+
.score-mid{color:var(--warning);font-weight:600}
|
|
993
|
+
.score-low{color:var(--danger);font-weight:600}
|
|
994
|
+
|
|
995
|
+
/* Pass-rate bar */
|
|
996
|
+
.bar-bg{width:100px;height:8px;background:var(--border-light);border-radius:4px;overflow:hidden}
|
|
997
|
+
.bar-fill{height:100%;border-radius:4px;transition:width .3s}
|
|
998
|
+
.bar-fill.score-high{background:var(--success)}
|
|
999
|
+
.bar-fill.score-mid{background:var(--warning)}
|
|
1000
|
+
.bar-fill.score-low{background:var(--danger)}
|
|
1001
|
+
|
|
1002
|
+
/* Histogram */
|
|
1003
|
+
.histogram{background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);padding:16px;box-shadow:var(--shadow)}
|
|
1004
|
+
.hist-row{display:flex;align-items:center;gap:12px;margin-bottom:8px}
|
|
1005
|
+
.hist-row:last-child{margin-bottom:0}
|
|
1006
|
+
.hist-label{width:60px;font-size:12px;color:var(--text-muted);text-align:right;flex-shrink:0}
|
|
1007
|
+
.hist-bar-bg{flex:1;height:20px;background:var(--border-light);border-radius:3px;overflow:hidden}
|
|
1008
|
+
.hist-bar{height:100%;border-radius:3px;transition:width .3s}
|
|
1009
|
+
.hist-count{width:30px;font-size:12px;color:var(--text-muted);text-align:right;flex-shrink:0}
|
|
1010
|
+
|
|
1011
|
+
/* Filters */
|
|
1012
|
+
.filter-bar{display:flex;gap:8px;margin-bottom:16px;align-items:center;flex-wrap:wrap}
|
|
1013
|
+
.filter-select,.filter-search{padding:6px 10px;border:1px solid var(--border);border-radius:var(--radius);font-size:13px;background:var(--surface);color:var(--text);font-family:var(--font)}
|
|
1014
|
+
.filter-search{flex:1;min-width:200px}
|
|
1015
|
+
.filter-count{font-size:12px;color:var(--text-muted);margin-left:auto}
|
|
1016
|
+
|
|
1017
|
+
/* Test rows */
|
|
1018
|
+
.test-row{cursor:pointer;transition:background .1s}
|
|
1019
|
+
.test-row:hover{background:var(--bg)!important}
|
|
1020
|
+
.test-row.expanded{background:var(--primary-bg)!important}
|
|
1021
|
+
.expand-col{width:32px;text-align:center}
|
|
1022
|
+
.expand-icon{color:var(--text-muted);font-size:12px}
|
|
1023
|
+
.fw-medium{font-weight:500}
|
|
1024
|
+
.text-pass{color:var(--success)}.text-fail{color:var(--danger)}.text-error{color:var(--warning)}
|
|
1025
|
+
|
|
1026
|
+
/* Detail panel */
|
|
1027
|
+
.detail-row td{padding:0!important;background:var(--bg)!important}
|
|
1028
|
+
.detail-panel{padding:16px 24px}
|
|
1029
|
+
.detail-grid{display:grid;grid-template-columns:1fr 1fr;gap:16px;margin-bottom:16px}
|
|
1030
|
+
.detail-block h4{font-size:12px;color:var(--text-muted);text-transform:uppercase;letter-spacing:.3px;margin-bottom:6px}
|
|
1031
|
+
.detail-pre{background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);padding:12px;font-family:var(--mono);font-size:12px;white-space:pre-wrap;word-break:break-word;max-height:300px;overflow-y:auto;line-height:1.6}
|
|
1032
|
+
.detail-panel h4{font-size:13px;font-weight:600;margin:16px 0 8px}
|
|
1033
|
+
.eval-table{width:100%;border-collapse:collapse;font-size:13px;background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);margin-bottom:12px}
|
|
1034
|
+
.eval-table th{background:var(--bg);padding:6px 10px;text-align:left;font-size:11px;font-weight:600;color:var(--text-muted);text-transform:uppercase;border-bottom:1px solid var(--border)}
|
|
1035
|
+
.eval-table td{padding:8px 10px;border-bottom:1px solid var(--border-light)}
|
|
1036
|
+
.reasoning-cell{max-width:500px;font-size:12px;color:var(--text-muted)}
|
|
1037
|
+
.expect-list{list-style:none;padding:0;margin-bottom:12px}
|
|
1038
|
+
.expect-list li{padding:4px 8px 4px 24px;position:relative;font-size:13px}
|
|
1039
|
+
.expect-list.pass li::before{content:"\\2713";position:absolute;left:4px;color:var(--success);font-weight:700}
|
|
1040
|
+
.expect-list.fail li::before{content:"\\2717";position:absolute;left:4px;color:var(--danger);font-weight:700}
|
|
1041
|
+
.error-box{background:var(--danger-bg);border:1px solid var(--danger);border-radius:var(--radius);padding:12px;margin-bottom:12px}
|
|
1042
|
+
.error-box h4{color:var(--danger);margin:0 0 6px}
|
|
1043
|
+
.error-box pre{font-family:var(--mono);font-size:12px;white-space:pre-wrap;word-break:break-word}
|
|
1044
|
+
.detail-meta{font-size:12px;color:var(--text-muted);margin-top:12px;padding-top:12px;border-top:1px solid var(--border-light)}
|
|
1045
|
+
.empty-state{text-align:center;padding:48px 24px;color:var(--text-muted)}
|
|
1046
|
+
.empty-state h3{font-size:16px;margin-bottom:8px;color:var(--text)}
|
|
1047
|
+
`;
|
|
1048
|
+
var SCRIPT = `
|
|
1049
|
+
(function(){
|
|
1050
|
+
/* ---- helpers ---- */
|
|
1051
|
+
/* Escape a value for insertion into HTML text or a double-quoted attribute.
   null/undefined become the empty string; everything else is stringified first. */
function esc(s){
if(s==null)return"";
/* "&" is replaced first so the entities produced by the later replacements are not escaped again. */
return String(s).replace(/&/g,"&amp;").replace(/</g,"&lt;").replace(/>/g,"&gt;").replace(/"/g,"&quot;");
}
|
|
1055
|
+
function getStatus(r){
|
|
1056
|
+
if(r.executionStatus==="execution_error")return"error";
|
|
1057
|
+
if(r.executionStatus==="quality_failure")return"fail";
|
|
1058
|
+
if(r.executionStatus==="ok")return"pass";
|
|
1059
|
+
if(r.error)return"error";
|
|
1060
|
+
return r.score>=0.5?"pass":"fail";
|
|
1061
|
+
}
|
|
1062
|
+
function sIcon(s){
|
|
1063
|
+
if(s==="pass")return'<span class="status-icon pass">\\u2713</span>';
|
|
1064
|
+
if(s==="fail")return'<span class="status-icon fail">\\u2717</span>';
|
|
1065
|
+
return'<span class="status-icon error">!</span>';
|
|
1066
|
+
}
|
|
1067
|
+
/* Format a duration in milliseconds as "Nms", "N.Ns" or "Nm Ns"; null/undefined yields the dash placeholder. */
function fmtDur(ms){
if(ms==null)return"\\u2014";
if(ms<1000)return ms+"ms";
/* Values from 59950ms up would display as "60.0s" after one-decimal rounding, so they use the minutes form instead. */
if(ms<59950)return(ms/1000).toFixed(1)+"s";
/* Round to whole seconds before splitting, otherwise the seconds part can round up to 60 (e.g. "1m 60s"). */
var sec=Math.round(ms/1000);
return Math.floor(sec/60)+"m "+(sec%60)+"s";
}
|
|
1073
|
+
function fmtTok(n){
|
|
1074
|
+
if(n==null)return"\\u2014";
|
|
1075
|
+
if(n>=1e6)return(n/1e6).toFixed(1)+"M";
|
|
1076
|
+
if(n>=1e3)return(n/1e3).toFixed(1)+"K";
|
|
1077
|
+
return String(n);
|
|
1078
|
+
}
|
|
1079
|
+
function fmtCost(u){if(u==null)return"\\u2014";if(u<0.01)return"<$0.01";return"$"+u.toFixed(2);}
|
|
1080
|
+
function fmtPct(v){if(v==null)return"\\u2014";return(v*100).toFixed(1)+"%";}
|
|
1081
|
+
function sCls(v){if(v==null)return"";if(v>=0.9)return"score-high";if(v>=0.5)return"score-mid";return"score-low";}
|
|
1082
|
+
|
|
1083
|
+
/* ---- compute stats ---- */
|
|
1084
|
+
function computeStats(d){
|
|
1085
|
+
var t=d.length,p=0,f=0,e=0,dur=0,ti=0,to=0,cost=0,sc=[];
|
|
1086
|
+
for(var i=0;i<d.length;i++){
|
|
1087
|
+
var r=d[i],s=getStatus(r);
|
|
1088
|
+
if(s==="pass")p++;else if(s==="fail")f++;else e++;
|
|
1089
|
+
if(r.durationMs)dur+=r.durationMs;
|
|
1090
|
+
if(r.tokenUsage){ti+=(r.tokenUsage.input||0);to+=(r.tokenUsage.output||0);}
|
|
1091
|
+
if(r.costUsd)cost+=r.costUsd;
|
|
1092
|
+
if(s!=="error")sc.push(r.score);
|
|
1093
|
+
}
|
|
1094
|
+
var g=t-e;
|
|
1095
|
+
return{total:t,passed:p,failed:f,errors:e,passRate:g>0?p/g:0,dur:dur,tokens:ti+to,inTok:ti,outTok:to,cost:cost,scores:sc};
|
|
1096
|
+
}
|
|
1097
|
+
function computeTargets(d){
|
|
1098
|
+
var m={};
|
|
1099
|
+
for(var i=0;i<d.length;i++){
|
|
1100
|
+
var r=d[i],tgt=r.target||"unknown";
|
|
1101
|
+
if(!m[tgt])m[tgt]={target:tgt,results:[],p:0,f:0,e:0,ts:0,sc:0,dur:0,tok:0,cost:0};
|
|
1102
|
+
var o=m[tgt];o.results.push(r);
|
|
1103
|
+
var s=getStatus(r);
|
|
1104
|
+
if(s==="pass")o.p++;else if(s==="fail")o.f++;else o.e++;
|
|
1105
|
+
if(s!=="error"){o.ts+=r.score;o.sc++;}
|
|
1106
|
+
if(r.durationMs)o.dur+=r.durationMs;
|
|
1107
|
+
if(r.tokenUsage)o.tok+=(r.tokenUsage.input||0)+(r.tokenUsage.output||0);
|
|
1108
|
+
if(r.costUsd)o.cost+=r.costUsd;
|
|
1109
|
+
}
|
|
1110
|
+
var a=[];for(var k in m)a.push(m[k]);return a;
|
|
1111
|
+
}
|
|
1112
|
+
function getEvalNames(){
|
|
1113
|
+
var n={};
|
|
1114
|
+
for(var i=0;i<DATA.length;i++){
|
|
1115
|
+
var sc=DATA[i].scores;
|
|
1116
|
+
if(sc)for(var j=0;j<sc.length;j++)n[sc[j].name]=true;
|
|
1117
|
+
}
|
|
1118
|
+
return Object.keys(n);
|
|
1119
|
+
}
|
|
1120
|
+
function getEvalScore(r,name){
|
|
1121
|
+
if(!r.scores)return null;
|
|
1122
|
+
for(var i=0;i<r.scores.length;i++)if(r.scores[i].name===name)return r.scores[i].score;
|
|
1123
|
+
return null;
|
|
1124
|
+
}
|
|
1125
|
+
|
|
1126
|
+
var stats=computeStats(DATA);
|
|
1127
|
+
var tgtStats=computeTargets(DATA);
|
|
1128
|
+
var tgtNames=tgtStats.map(function(t){return t.target;});
|
|
1129
|
+
|
|
1130
|
+
/* ---- state ---- */
|
|
1131
|
+
var state={tab:"overview",filter:{status:"all",target:"all",search:""},sort:{col:"testId",dir:"asc"},expanded:{}};
|
|
1132
|
+
|
|
1133
|
+
/* ---- DOM refs ---- */
|
|
1134
|
+
var app=document.getElementById("app");
|
|
1135
|
+
var tabBtns=document.querySelectorAll(".tab");
|
|
1136
|
+
|
|
1137
|
+
/* ---- tabs ---- */
|
|
1138
|
+
function setTab(t){
|
|
1139
|
+
state.tab=t;
|
|
1140
|
+
for(var i=0;i<tabBtns.length;i++)tabBtns[i].classList.toggle("active",tabBtns[i].getAttribute("data-tab")===t);
|
|
1141
|
+
render();
|
|
1142
|
+
}
|
|
1143
|
+
for(var i=0;i<tabBtns.length;i++){
|
|
1144
|
+
tabBtns[i].addEventListener("click",(function(b){return function(){setTab(b.getAttribute("data-tab"));};})(tabBtns[i]));
|
|
1145
|
+
}
|
|
1146
|
+
|
|
1147
|
+
/* ---- render ---- */
|
|
1148
|
+
function render(){
|
|
1149
|
+
if(DATA.length===0){app.innerHTML='<div class="empty-state"><h3>No results yet</h3><p>'+(IS_LIVE?"Waiting for evaluation results\\u2026 Page will auto-refresh.":"Run an evaluation to generate results.")+"</p></div>";return;}
|
|
1150
|
+
if(state.tab==="overview")renderOverview();else renderTests();
|
|
1151
|
+
}
|
|
1152
|
+
|
|
1153
|
+
/* ---- stat card helper ---- */
|
|
1154
|
+
function card(label,value,type){
|
|
1155
|
+
return'<div class="stat-card '+type+'"><div class="stat-value">'+value+'</div><div class="stat-label">'+label+"</div></div>";
|
|
1156
|
+
}
|
|
1157
|
+
|
|
1158
|
+
/* ---- overview ---- */
|
|
1159
|
+
function renderOverview(){
|
|
1160
|
+
var h='<div class="stats-grid">';
|
|
1161
|
+
h+=card("Total Tests",stats.total,"total");
|
|
1162
|
+
h+=card("Passed",stats.passed,"pass");
|
|
1163
|
+
h+=card("Failed",stats.failed,"fail");
|
|
1164
|
+
h+=card("Errors",stats.errors,"error");
|
|
1165
|
+
var prCls=stats.passRate>=0.9?"pass":stats.passRate>=0.5?"warn":"fail";
|
|
1166
|
+
h+=card("Pass Rate",fmtPct(stats.passRate),prCls);
|
|
1167
|
+
h+=card("Duration",fmtDur(stats.dur),"neutral");
|
|
1168
|
+
h+=card("Tokens",fmtTok(stats.tokens),"neutral");
|
|
1169
|
+
h+=card("Est. Cost",fmtCost(stats.cost),"neutral");
|
|
1170
|
+
h+="</div>";
|
|
1171
|
+
|
|
1172
|
+
/* targets table */
|
|
1173
|
+
if(tgtStats.length>1){
|
|
1174
|
+
h+='<div class="section"><h2 class="section-title">Targets</h2><div class="table-wrap"><table class="data-table">';
|
|
1175
|
+
h+="<thead><tr><th>Target</th><th>Pass Rate</th><th></th><th>Passed</th><th>Failed</th><th>Errors</th><th>Avg Score</th><th>Duration</th><th>Tokens</th><th>Cost</th></tr></thead><tbody>";
|
|
1176
|
+
for(var i=0;i<tgtStats.length;i++){
|
|
1177
|
+
var t=tgtStats[i],g=t.p+t.f,pr=g>0?t.p/g:0,avg=t.sc>0?t.ts/t.sc:0;
|
|
1178
|
+
h+="<tr><td class=\\"fw-medium\\">"+esc(t.target)+"</td><td>"+fmtPct(pr)+'</td><td><div class="bar-bg"><div class="bar-fill '+sCls(pr)+'" style="width:'+(pr*100)+'%"></div></div></td>';
|
|
1179
|
+
h+='<td class="text-pass">'+t.p+'</td><td class="text-fail">'+t.f+'</td><td class="text-error">'+t.e+"</td>";
|
|
1180
|
+
h+='<td class="'+sCls(avg)+'">'+fmtPct(avg)+"</td><td>"+fmtDur(t.dur)+"</td><td>"+fmtTok(t.tok)+"</td><td>"+fmtCost(t.cost)+"</td></tr>";
|
|
1181
|
+
}
|
|
1182
|
+
h+="</tbody></table></div></div>";
|
|
1183
|
+
}
|
|
1184
|
+
|
|
1185
|
+
/* histogram */
|
|
1186
|
+
if(stats.scores.length>0){
|
|
1187
|
+
var bk=[0,0,0,0,0];
|
|
1188
|
+
for(var i=0;i<stats.scores.length;i++){var idx=Math.min(Math.floor(stats.scores[i]*5),4);bk[idx]++;}
|
|
1189
|
+
var mx=Math.max.apply(null,bk);
|
|
1190
|
+
var lb=["0\\u201320%","20\\u201340%","40\\u201360%","60\\u201380%","80\\u2013100%"];
|
|
1191
|
+
h+='<div class="section"><h2 class="section-title">Score Distribution</h2><div class="histogram">';
|
|
1192
|
+
for(var i=0;i<bk.length;i++){
|
|
1193
|
+
var pct=mx>0?(bk[i]/mx*100):0;
|
|
1194
|
+
h+='<div class="hist-row"><span class="hist-label">'+lb[i]+'</span><div class="hist-bar-bg"><div class="hist-bar '+(i>=4?"score-high":i>=2?"score-mid":"score-low")+'" style="width:'+pct+'%"></div></div><span class="hist-count">'+bk[i]+"</span></div>";
|
|
1195
|
+
}
|
|
1196
|
+
h+="</div></div>";
|
|
1197
|
+
}
|
|
1198
|
+
app.innerHTML=h;
|
|
1199
|
+
}
|
|
1200
|
+
|
|
1201
|
+
/* ---- test cases ---- */
|
|
1202
|
+
function renderTests(){
|
|
1203
|
+
var evalNames=getEvalNames();
|
|
1204
|
+
var h='<div class="filter-bar">';
|
|
1205
|
+
h+='<select id="flt-status" class="filter-select"><option value="all">All Status</option><option value="pass">Passed</option><option value="fail">Failed</option><option value="error">Errors</option></select>';
|
|
1206
|
+
if(tgtNames.length>1){
|
|
1207
|
+
h+='<select id="flt-target" class="filter-select"><option value="all">All Targets</option>';
|
|
1208
|
+
for(var i=0;i<tgtNames.length;i++)h+='<option value="'+esc(tgtNames[i])+'">'+esc(tgtNames[i])+"</option>";
|
|
1209
|
+
h+="</select>";
|
|
1210
|
+
}
|
|
1211
|
+
h+='<input type="text" id="flt-search" class="filter-search" placeholder="Search tests..." value="'+esc(state.filter.search)+'">';
|
|
1212
|
+
h+='<span class="filter-count" id="flt-count"></span></div>';
|
|
1213
|
+
|
|
1214
|
+
h+='<div class="table-wrap"><table class="data-table" id="test-tbl"><thead><tr>';
|
|
1215
|
+
h+='<th class="expand-col"></th>';
|
|
1216
|
+
h+=sHdr("Status","status");
|
|
1217
|
+
h+=sHdr("Test ID","testId");
|
|
1218
|
+
if(tgtNames.length>1)h+=sHdr("Target","target");
|
|
1219
|
+
h+=sHdr("Score","score");
|
|
1220
|
+
for(var i=0;i<evalNames.length;i++)h+="<th>"+esc(evalNames[i])+"</th>";
|
|
1221
|
+
h+=sHdr("Duration","durationMs");
|
|
1222
|
+
h+=sHdr("Cost","costUsd");
|
|
1223
|
+
h+="</tr></thead><tbody id=\\"test-body\\"></tbody></table></div>";
|
|
1224
|
+
app.innerHTML=h;
|
|
1225
|
+
|
|
1226
|
+
/* wire events */
|
|
1227
|
+
var selS=document.getElementById("flt-status");
|
|
1228
|
+
selS.value=state.filter.status;
|
|
1229
|
+
selS.addEventListener("change",function(e){state.filter.status=e.target.value;renderRows();});
|
|
1230
|
+
var selT=document.getElementById("flt-target");
|
|
1231
|
+
if(selT){selT.value=state.filter.target;selT.addEventListener("change",function(e){state.filter.target=e.target.value;renderRows();});}
|
|
1232
|
+
document.getElementById("flt-search").addEventListener("input",function(e){state.filter.search=e.target.value;renderRows();});
|
|
1233
|
+
var ths=document.querySelectorAll("th[data-sort]");
|
|
1234
|
+
for(var i=0;i<ths.length;i++){
|
|
1235
|
+
ths[i].addEventListener("click",(function(th){return function(){
|
|
1236
|
+
var c=th.getAttribute("data-sort");
|
|
1237
|
+
if(state.sort.col===c)state.sort.dir=state.sort.dir==="asc"?"desc":"asc";
|
|
1238
|
+
else{state.sort.col=c;state.sort.dir="asc";}
|
|
1239
|
+
renderTests();
|
|
1240
|
+
};})(ths[i]));
|
|
1241
|
+
}
|
|
1242
|
+
renderRows();
|
|
1243
|
+
}
|
|
1244
|
+
|
|
1245
|
+
function sHdr(label,col){
|
|
1246
|
+
var arrow="";
|
|
1247
|
+
if(state.sort.col===col)arrow=state.sort.dir==="asc"?" \\u2191":" \\u2193";
|
|
1248
|
+
return'<th class="sortable" data-sort="'+col+'">'+label+arrow+"</th>";
|
|
1249
|
+
}
|
|
1250
|
+
|
|
1251
|
+
function filtered(){
|
|
1252
|
+
var out=[];
|
|
1253
|
+
for(var i=0;i<DATA.length;i++){
|
|
1254
|
+
var r=DATA[i],s=getStatus(r);
|
|
1255
|
+
if(state.filter.status!=="all"&&s!==state.filter.status)continue;
|
|
1256
|
+
if(state.filter.target!=="all"&&r.target!==state.filter.target)continue;
|
|
1257
|
+
if(state.filter.search&&r.testId.toLowerCase().indexOf(state.filter.search.toLowerCase())===-1)continue;
|
|
1258
|
+
out.push(r);
|
|
1259
|
+
}
|
|
1260
|
+
var col=state.sort.col,dir=state.sort.dir==="asc"?1:-1;
|
|
1261
|
+
out.sort(function(a,b){
|
|
1262
|
+
var va=col==="status"?getStatus(a):a[col],vb=col==="status"?getStatus(b):b[col];
|
|
1263
|
+
if(va==null&&vb==null)return 0;if(va==null)return 1;if(vb==null)return-1;
|
|
1264
|
+
if(typeof va==="string")return va.localeCompare(vb)*dir;
|
|
1265
|
+
return(va-vb)*dir;
|
|
1266
|
+
});
|
|
1267
|
+
return out;
|
|
1268
|
+
}
|
|
1269
|
+
|
|
1270
|
+
function renderRows(){
|
|
1271
|
+
var rows=filtered(),evalNames=getEvalNames();
|
|
1272
|
+
var tbody=document.getElementById("test-body");
|
|
1273
|
+
var colSpan=5+evalNames.length+(tgtNames.length>1?1:0);
|
|
1274
|
+
document.getElementById("flt-count").textContent=rows.length+" of "+DATA.length+" tests";
|
|
1275
|
+
var h="";
|
|
1276
|
+
for(var i=0;i<rows.length;i++){
|
|
1277
|
+
var r=rows[i],s=getStatus(r),key=r.testId+":"+r.target,exp=!!state.expanded[key];
|
|
1278
|
+
h+='<tr class="test-row '+s+(exp?" expanded":"")+'" data-key="'+esc(key)+'">';
|
|
1279
|
+
h+='<td class="expand-col"><span class="expand-icon">'+(exp?"\\u25BE":"\\u25B8")+"</span></td>";
|
|
1280
|
+
h+="<td>"+sIcon(s)+"</td>";
|
|
1281
|
+
h+='<td class="fw-medium">'+esc(r.testId)+"</td>";
|
|
1282
|
+
if(tgtNames.length>1)h+="<td>"+esc(r.target)+"</td>";
|
|
1283
|
+
h+='<td class="'+sCls(r.score)+'">'+fmtPct(r.score)+"</td>";
|
|
1284
|
+
for(var j=0;j<evalNames.length;j++){
|
|
1285
|
+
var es=getEvalScore(r,evalNames[j]);
|
|
1286
|
+
h+='<td class="'+sCls(es)+'">'+(es!=null?fmtPct(es):"\\u2014")+"</td>";
|
|
1287
|
+
}
|
|
1288
|
+
h+="<td>"+fmtDur(r.durationMs)+"</td><td>"+fmtCost(r.costUsd)+"</td></tr>";
|
|
1289
|
+
if(exp)h+='<tr class="detail-row"><td colspan="'+colSpan+'">'+renderDetail(r)+"</td></tr>";
|
|
1290
|
+
}
|
|
1291
|
+
if(rows.length===0)h+='<tr><td colspan="'+colSpan+'" class="empty-state">No matching tests</td></tr>';
|
|
1292
|
+
tbody.innerHTML=h;
|
|
1293
|
+
|
|
1294
|
+
/* row click */
|
|
1295
|
+
var trs=tbody.querySelectorAll(".test-row");
|
|
1296
|
+
for(var k=0;k<trs.length;k++){
|
|
1297
|
+
trs[k].addEventListener("click",(function(tr){return function(){
|
|
1298
|
+
var key=tr.getAttribute("data-key");
|
|
1299
|
+
state.expanded[key]=!state.expanded[key];
|
|
1300
|
+
renderRows();
|
|
1301
|
+
};})(trs[k]));
|
|
1302
|
+
}
|
|
1303
|
+
}
|
|
1304
|
+
|
|
1305
|
+
/* ---- detail panel ---- */
|
|
1306
|
+
function renderDetail(r){
|
|
1307
|
+
var h='<div class="detail-panel">';
|
|
1308
|
+
|
|
1309
|
+
/* input / output */
|
|
1310
|
+
h+='<div class="detail-grid">';
|
|
1311
|
+
if(r.input!=null){
|
|
1312
|
+
h+='<div class="detail-block"><h4>Input</h4><pre class="detail-pre">'+esc(typeof r.input==="string"?r.input:JSON.stringify(r.input,null,2))+"</pre></div>";
|
|
1313
|
+
}
|
|
1314
|
+
h+='<div class="detail-block"><h4>Output</h4><pre class="detail-pre">'+esc(r.answer||"")+"</pre></div>";
|
|
1315
|
+
h+="</div>";
|
|
1316
|
+
|
|
1317
|
+
/* evaluator results */
|
|
1318
|
+
if(r.scores&&r.scores.length>0){
|
|
1319
|
+
h+="<h4>Evaluator Results</h4>";
|
|
1320
|
+
h+='<table class="eval-table"><thead><tr><th>Evaluator</th><th>Score</th><th>Status</th><th>Reasoning</th></tr></thead><tbody>';
|
|
1321
|
+
for(var i=0;i<r.scores.length;i++){
|
|
1322
|
+
var ev=r.scores[i],evS=ev.score>=0.5?"pass":"fail";
|
|
1323
|
+
h+="<tr><td class=\\"fw-medium\\">"+esc(ev.name)+'</td><td class="'+sCls(ev.score)+'">'+fmtPct(ev.score)+"</td><td>"+sIcon(evS)+'</td><td class="reasoning-cell">'+esc(ev.reasoning||"")+"</td></tr>";
|
|
1324
|
+
}
|
|
1325
|
+
h+="</tbody></table>";
|
|
1326
|
+
}
|
|
1327
|
+
|
|
1328
|
+
/* hits / misses */
|
|
1329
|
+
if(r.hits&&r.hits.length>0){
|
|
1330
|
+
h+='<h4>Passed Expectations</h4><ul class="expect-list pass">';
|
|
1331
|
+
for(var i=0;i<r.hits.length;i++)h+="<li>"+esc(r.hits[i])+"</li>";
|
|
1332
|
+
h+="</ul>";
|
|
1333
|
+
}
|
|
1334
|
+
if(r.misses&&r.misses.length>0){
|
|
1335
|
+
h+='<h4>Failed Expectations</h4><ul class="expect-list fail">';
|
|
1336
|
+
for(var i=0;i<r.misses.length;i++)h+="<li>"+esc(r.misses[i])+"</li>";
|
|
1337
|
+
h+="</ul>";
|
|
1338
|
+
}
|
|
1339
|
+
|
|
1340
|
+
/* error */
|
|
1341
|
+
if(r.error)h+='<div class="error-box"><h4>Error</h4><pre>'+esc(r.error)+"</pre></div>";
|
|
1342
|
+
|
|
1343
|
+
/* metadata */
|
|
1344
|
+
h+='<div class="detail-meta">';
|
|
1345
|
+
var m=[];
|
|
1346
|
+
if(r.tokenUsage)m.push(fmtTok(r.tokenUsage.input)+" in / "+fmtTok(r.tokenUsage.output)+" out tokens");
|
|
1347
|
+
if(r.durationMs)m.push(fmtDur(r.durationMs));
|
|
1348
|
+
if(r.target)m.push(r.target);
|
|
1349
|
+
if(r.costUsd)m.push(fmtCost(r.costUsd));
|
|
1350
|
+
if(r.timestamp)m.push(r.timestamp);
|
|
1351
|
+
h+=esc(m.join(" \\u00B7 "));
|
|
1352
|
+
h+="</div></div>";
|
|
1353
|
+
return h;
|
|
1354
|
+
}
|
|
1355
|
+
|
|
1356
|
+
/* ---- init ---- */
|
|
1357
|
+
render();
|
|
1358
|
+
})();
|
|
1359
|
+
`;
|
|
1360
|
+
|
|
1361
|
+
// src/commands/eval/json-writer.ts
|
|
1362
|
+
import { mkdir as mkdir3, writeFile as writeFile4 } from "node:fs/promises";
|
|
1363
|
+
import path6 from "node:path";
|
|
1364
|
+
|
|
1365
|
+
// src/utils/case-conversion.ts
|
|
1366
|
+
/**
 * Convert a camelCase identifier to snake_case.
 *
 * Identifiers that begin with an uppercase ASCII letter are returned
 * unchanged; otherwise every uppercase ASCII letter is replaced by an
 * underscore followed by its lowercase form.
 *
 * @param {string} str - Identifier to convert.
 * @returns {string} The converted identifier.
 */
function toSnakeCase(str) {
  const startsUppercase = /^[A-Z]/.test(str);
  return startsUppercase
    ? str
    : str.replace(/[A-Z]/g, (upper) => `_${upper.toLowerCase()}`);
}
|
|
1372
|
+
/**
 * Recursively rewrite object keys with toSnakeCase.
 *
 * null and undefined are returned as-is, arrays are mapped element by
 * element, and any other value whose typeof is "object" is rebuilt as a
 * fresh plain object from its own enumerable entries. All remaining
 * values are returned untouched.
 *
 * @param {*} obj - Value to convert.
 * @returns {*} The converted value.
 */
function toSnakeCaseDeep(obj) {
  if (obj === null || obj === undefined) {
    return obj;
  }
  if (Array.isArray(obj)) {
    return obj.map((element) => toSnakeCaseDeep(element));
  }
  if (typeof obj !== "object") {
    return obj;
  }
  const converted = {};
  Object.entries(obj).forEach(([name, inner]) => {
    const snakeName = toSnakeCase(name);
    converted[snakeName] = toSnakeCaseDeep(inner);
  });
  return converted;
}
|
|
1389
|
+
|
|
1390
|
+
// src/commands/eval/json-writer.ts
|
|
1391
|
+
/**
 * Output writer that buffers results in memory and writes them, together
 * with summary statistics, as a single snake_case JSON document on close().
 */
var JsonWriter = class _JsonWriter {
  filePath;
  results = [];
  closed = false;

  /** @param {string} filePath - Destination of the JSON report. */
  constructor(filePath) {
    this.filePath = filePath;
  }

  /**
   * Create the destination's parent directory (recursively) and return a writer for it.
   * @param {string} filePath - Destination of the JSON report.
   */
  static async open(filePath) {
    const parentDir = path6.dirname(filePath);
    await mkdir3(parentDir, { recursive: true });
    return new _JsonWriter(filePath);
  }

  /**
   * Buffer one result; nothing is written until close().
   * @throws {Error} If the writer has already been closed.
   */
  async append(result) {
    if (!this.closed) {
      this.results.push(result);
      return;
    }
    throw new Error("Cannot write to closed JSON writer");
  }

  /**
   * Write the buffered results and their stats to filePath.
   * A result counts as passed when its score is at least 0.5.
   * Calling close() again after the first call is a no-op.
   */
  async close() {
    if (this.closed) {
      return;
    }
    this.closed = true;
    const total = this.results.length;
    let passed = 0;
    for (const entry of this.results) {
      if (entry.score >= 0.5) {
        passed += 1;
      }
    }
    const report = {
      stats: {
        total,
        passed,
        failed: total - passed,
        passRate: total === 0 ? 0 : passed / total
      },
      results: this.results
    };
    const serialized = JSON.stringify(toSnakeCaseDeep(report), null, 2);
    await writeFile4(this.filePath, serialized + "\n", "utf8");
  }
};
|
|
1430
|
+
|
|
617
1431
|
// src/commands/eval/jsonl-writer.ts
|
|
1432
|
+
import { createWriteStream } from "node:fs";
|
|
1433
|
+
import { mkdir as mkdir4 } from "node:fs/promises";
|
|
1434
|
+
import path7 from "node:path";
|
|
1435
|
+
import { finished } from "node:stream/promises";
|
|
618
1436
|
var JsonlWriter = class _JsonlWriter {
|
|
619
1437
|
stream;
|
|
620
1438
|
mutex = new Mutex();
|
|
@@ -623,7 +1441,7 @@ var JsonlWriter = class _JsonlWriter {
|
|
|
623
1441
|
this.stream = stream;
|
|
624
1442
|
}
|
|
625
1443
|
static async open(filePath) {
|
|
626
|
-
await
|
|
1444
|
+
await mkdir4(path7.dirname(filePath), { recursive: true });
|
|
627
1445
|
const stream = createWriteStream(filePath, { flags: "w", encoding: "utf8" });
|
|
628
1446
|
return new _JsonlWriter(stream);
|
|
629
1447
|
}
|
|
@@ -654,8 +1472,8 @@ var JsonlWriter = class _JsonlWriter {
|
|
|
654
1472
|
};
|
|
655
1473
|
|
|
656
1474
|
// src/commands/eval/junit-writer.ts
|
|
657
|
-
import { mkdir as
|
|
658
|
-
import
|
|
1475
|
+
import { mkdir as mkdir5, writeFile as writeFile5 } from "node:fs/promises";
|
|
1476
|
+
import path8 from "node:path";
|
|
659
1477
|
/**
 * Escape the five XML special characters so the text can be embedded in
 * element content or attribute values.
 *
 * @param {string} str - Raw text.
 * @returns {string} Text with &, <, >, " and ' replaced by their predefined entities.
 */
function escapeXml(str) {
  // "&" is handled first so the entities emitted by the later replacements are not escaped again.
  return str
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&apos;");
}
|
|
@@ -667,7 +1485,7 @@ var JunitWriter = class _JunitWriter {
|
|
|
667
1485
|
this.filePath = filePath;
|
|
668
1486
|
}
|
|
669
1487
|
static async open(filePath) {
|
|
670
|
-
await
|
|
1488
|
+
await mkdir5(path8.dirname(filePath), { recursive: true });
|
|
671
1489
|
return new _JunitWriter(filePath);
|
|
672
1490
|
}
|
|
673
1491
|
async append(result) {
|
|
@@ -729,14 +1547,14 @@ ${testCases.join("\n")}
|
|
|
729
1547
|
${suiteXmls.join("\n")}
|
|
730
1548
|
</testsuites>
|
|
731
1549
|
`;
|
|
732
|
-
await
|
|
1550
|
+
await writeFile5(this.filePath, xml, "utf8");
|
|
733
1551
|
}
|
|
734
1552
|
};
|
|
735
1553
|
|
|
736
1554
|
// src/commands/eval/yaml-writer.ts
|
|
737
1555
|
import { createWriteStream as createWriteStream2 } from "node:fs";
|
|
738
|
-
import { mkdir as
|
|
739
|
-
import
|
|
1556
|
+
import { mkdir as mkdir6 } from "node:fs/promises";
|
|
1557
|
+
import path9 from "node:path";
|
|
740
1558
|
import { finished as finished2 } from "node:stream/promises";
|
|
741
1559
|
import { stringify as stringifyYaml } from "yaml";
|
|
742
1560
|
var YamlWriter = class _YamlWriter {
|
|
@@ -748,7 +1566,7 @@ var YamlWriter = class _YamlWriter {
|
|
|
748
1566
|
this.stream = stream;
|
|
749
1567
|
}
|
|
750
1568
|
static async open(filePath) {
|
|
751
|
-
await
|
|
1569
|
+
await mkdir6(path9.dirname(filePath), { recursive: true });
|
|
752
1570
|
const stream = createWriteStream2(filePath, { flags: "w", encoding: "utf8" });
|
|
753
1571
|
return new _YamlWriter(stream);
|
|
754
1572
|
}
|
|
@@ -794,6 +1612,8 @@ async function createOutputWriter(filePath, format) {
|
|
|
794
1612
|
return JsonlWriter.open(filePath);
|
|
795
1613
|
case "yaml":
|
|
796
1614
|
return YamlWriter.open(filePath);
|
|
1615
|
+
case "html":
|
|
1616
|
+
return HtmlWriter.open(filePath);
|
|
797
1617
|
default: {
|
|
798
1618
|
const exhaustiveCheck = format;
|
|
799
1619
|
throw new Error(`Unsupported output format: ${exhaustiveCheck}`);
|
|
@@ -806,15 +1626,17 @@ function getDefaultExtension(format) {
|
|
|
806
1626
|
return ".jsonl";
|
|
807
1627
|
case "yaml":
|
|
808
1628
|
return ".yaml";
|
|
1629
|
+
case "html":
|
|
1630
|
+
return ".html";
|
|
809
1631
|
default: {
|
|
810
1632
|
const exhaustiveCheck = format;
|
|
811
1633
|
throw new Error(`Unsupported output format: ${exhaustiveCheck}`);
|
|
812
1634
|
}
|
|
813
1635
|
}
|
|
814
1636
|
}
|
|
815
|
-
var SUPPORTED_EXTENSIONS = /* @__PURE__ */ new Set([".jsonl", ".json", ".xml", ".yaml", ".yml"]);
|
|
1637
|
+
// Output file extensions listed in the "Unsupported output file extension" error thrown by createWriterFromPath.
var SUPPORTED_EXTENSIONS = /* @__PURE__ */ new Set([".jsonl", ".json", ".xml", ".yaml", ".yml", ".html", ".htm"]);
|
|
816
1638
|
function createWriterFromPath(filePath) {
|
|
817
|
-
const ext =
|
|
1639
|
+
const ext = path10.extname(filePath).toLowerCase();
|
|
818
1640
|
switch (ext) {
|
|
819
1641
|
case ".jsonl":
|
|
820
1642
|
return JsonlWriter.open(filePath);
|
|
@@ -825,6 +1647,9 @@ function createWriterFromPath(filePath) {
|
|
|
825
1647
|
case ".yaml":
|
|
826
1648
|
case ".yml":
|
|
827
1649
|
return YamlWriter.open(filePath);
|
|
1650
|
+
case ".html":
|
|
1651
|
+
case ".htm":
|
|
1652
|
+
return HtmlWriter.open(filePath);
|
|
828
1653
|
default:
|
|
829
1654
|
throw new Error(
|
|
830
1655
|
`Unsupported output file extension "${ext}". Supported: ${[...SUPPORTED_EXTENSIONS].join(", ")}`
|
|
@@ -898,12 +1723,12 @@ var ProgressDisplay = class {
|
|
|
898
1723
|
}
|
|
899
1724
|
addLogPaths(paths, provider) {
|
|
900
1725
|
const newPaths = [];
|
|
901
|
-
for (const
|
|
902
|
-
if (this.logPathSet.has(
|
|
1726
|
+
for (const path13 of paths) {
|
|
1727
|
+
if (this.logPathSet.has(path13)) {
|
|
903
1728
|
continue;
|
|
904
1729
|
}
|
|
905
|
-
this.logPathSet.add(
|
|
906
|
-
newPaths.push(
|
|
1730
|
+
this.logPathSet.add(path13);
|
|
1731
|
+
newPaths.push(path13);
|
|
907
1732
|
}
|
|
908
1733
|
if (newPaths.length === 0) {
|
|
909
1734
|
return;
|
|
@@ -916,8 +1741,8 @@ var ProgressDisplay = class {
|
|
|
916
1741
|
this.hasPrintedLogHeader = true;
|
|
917
1742
|
}
|
|
918
1743
|
const startIndex = this.logPaths.length - newPaths.length;
|
|
919
|
-
newPaths.forEach((
|
|
920
|
-
console.log(`${startIndex + offset + 1}. ${
|
|
1744
|
+
newPaths.forEach((path13, offset) => {
|
|
1745
|
+
console.log(`${startIndex + offset + 1}. ${path13}`);
|
|
921
1746
|
});
|
|
922
1747
|
}
|
|
923
1748
|
finish() {
|
|
@@ -1207,10 +2032,10 @@ function formatMatrixSummary(results) {
|
|
|
1207
2032
|
}
|
|
1208
2033
|
|
|
1209
2034
|
// ../../packages/core/dist/evaluation/validation/index.js
|
|
1210
|
-
import { readFile } from "node:fs/promises";
|
|
1211
|
-
import path9 from "node:path";
|
|
1212
|
-
import { parse } from "yaml";
|
|
1213
2035
|
import { readFile as readFile2 } from "node:fs/promises";
|
|
2036
|
+
import path11 from "node:path";
|
|
2037
|
+
import { parse } from "yaml";
|
|
2038
|
+
import { readFile as readFile22 } from "node:fs/promises";
|
|
1214
2039
|
import path22 from "node:path";
|
|
1215
2040
|
import { parse as parse2 } from "yaml";
|
|
1216
2041
|
import { readFile as readFile3 } from "node:fs/promises";
|
|
@@ -1226,7 +2051,7 @@ var SCHEMA_TARGETS_V2 = "agentv-targets-v2.2";
|
|
|
1226
2051
|
var SCHEMA_CONFIG_V2 = "agentv-config-v2";
|
|
1227
2052
|
async function detectFileType(filePath) {
|
|
1228
2053
|
try {
|
|
1229
|
-
const content = await
|
|
2054
|
+
const content = await readFile2(filePath, "utf8");
|
|
1230
2055
|
const parsed = parse(content);
|
|
1231
2056
|
if (typeof parsed !== "object" || parsed === null) {
|
|
1232
2057
|
return inferFileTypeFromPath(filePath);
|
|
@@ -1251,8 +2076,8 @@ async function detectFileType(filePath) {
|
|
|
1251
2076
|
}
|
|
1252
2077
|
}
|
|
1253
2078
|
function inferFileTypeFromPath(filePath) {
|
|
1254
|
-
const normalized =
|
|
1255
|
-
const basename =
|
|
2079
|
+
const normalized = path11.normalize(filePath).replace(/\\/g, "/");
|
|
2080
|
+
const basename = path11.basename(filePath);
|
|
1256
2081
|
if (normalized.includes("/.agentv/")) {
|
|
1257
2082
|
if (basename === "config.yaml" || basename === "config.yml") {
|
|
1258
2083
|
return "config";
|
|
@@ -1287,7 +2112,7 @@ async function validateEvalFile(filePath) {
|
|
|
1287
2112
|
const absolutePath = path22.resolve(filePath);
|
|
1288
2113
|
let parsed;
|
|
1289
2114
|
try {
|
|
1290
|
-
const content = await
|
|
2115
|
+
const content = await readFile22(absolutePath, "utf8");
|
|
1291
2116
|
parsed = parse2(content);
|
|
1292
2117
|
} catch (error) {
|
|
1293
2118
|
errors.push({
|
|
@@ -1454,7 +2279,7 @@ async function validateEvalFile(filePath) {
|
|
|
1454
2279
|
});
|
|
1455
2280
|
}
|
|
1456
2281
|
}
|
|
1457
|
-
const assertField = evalCase.assert;
|
|
2282
|
+
const assertField = evalCase.assertions ?? evalCase.assert;
|
|
1458
2283
|
if (assertField !== void 0) {
|
|
1459
2284
|
validateAssertArray(assertField, location, absolutePath, errors);
|
|
1460
2285
|
}
|
|
@@ -1625,14 +2450,14 @@ function validateAssertArray(assertField, parentLocation, filePath, errors) {
|
|
|
1625
2450
|
errors.push({
|
|
1626
2451
|
severity: "warning",
|
|
1627
2452
|
filePath,
|
|
1628
|
-
location: `${parentLocation}.
|
|
1629
|
-
message: "'
|
|
2453
|
+
location: `${parentLocation}.assertions`,
|
|
2454
|
+
message: "'assertions' must be an array of assertion objects."
|
|
1630
2455
|
});
|
|
1631
2456
|
return;
|
|
1632
2457
|
}
|
|
1633
2458
|
for (let i = 0; i < assertField.length; i++) {
|
|
1634
2459
|
const item = assertField[i];
|
|
1635
|
-
const location = `${parentLocation}.
|
|
2460
|
+
const location = `${parentLocation}.assertions[${i}]`;
|
|
1636
2461
|
if (!isObject(item)) {
|
|
1637
2462
|
errors.push({
|
|
1638
2463
|
severity: "warning",
|
|
@@ -1931,6 +2756,7 @@ function getKnownSettings(provider) {
|
|
|
1931
2756
|
return COPILOT_CLI_SETTINGS;
|
|
1932
2757
|
case "claude":
|
|
1933
2758
|
case "claude-code":
|
|
2759
|
+
case "claude-cli":
|
|
1934
2760
|
case "claude-sdk":
|
|
1935
2761
|
return CLAUDE_SETTINGS;
|
|
1936
2762
|
case "vscode":
|
|
@@ -1950,7 +2776,15 @@ function validateUnknownSettings(target, provider, absolutePath, location, error
|
|
|
1950
2776
|
if (!knownSettings) {
|
|
1951
2777
|
return;
|
|
1952
2778
|
}
|
|
1953
|
-
const baseFields = /* @__PURE__ */ new Set([
|
|
2779
|
+
const baseFields = /* @__PURE__ */ new Set([
|
|
2780
|
+
"name",
|
|
2781
|
+
"provider",
|
|
2782
|
+
"grader_target",
|
|
2783
|
+
"judge_target",
|
|
2784
|
+
"workers",
|
|
2785
|
+
"$schema",
|
|
2786
|
+
"targets"
|
|
2787
|
+
]);
|
|
1954
2788
|
for (const key of Object.keys(target)) {
|
|
1955
2789
|
if (removedTargetFields.has(key)) {
|
|
1956
2790
|
errors.push({
|
|
@@ -2157,13 +2991,13 @@ async function validateTargetsFile(filePath) {
|
|
|
2157
2991
|
if (typeof provider === "string") {
|
|
2158
2992
|
validateUnknownSettings(target, provider, absolutePath, location, errors);
|
|
2159
2993
|
}
|
|
2160
|
-
const
|
|
2161
|
-
if (
|
|
2994
|
+
const graderTarget = target.grader_target ?? target.judge_target;
|
|
2995
|
+
if (graderTarget !== void 0 && typeof graderTarget !== "string") {
|
|
2162
2996
|
errors.push({
|
|
2163
2997
|
severity: "error",
|
|
2164
2998
|
filePath: absolutePath,
|
|
2165
|
-
location: `${location}.
|
|
2166
|
-
message: "Invalid '
|
|
2999
|
+
location: `${location}.grader_target`,
|
|
3000
|
+
message: "Invalid 'grader_target' field (must be a string)"
|
|
2167
3001
|
});
|
|
2168
3002
|
}
|
|
2169
3003
|
}
|
|
@@ -2473,7 +3307,7 @@ Errors in ${targetsFilePath}:`);
|
|
|
2473
3307
|
const mockTarget = {
|
|
2474
3308
|
kind: "mock",
|
|
2475
3309
|
name: `${targetDefinition.name}-dry-run`,
|
|
2476
|
-
|
|
3310
|
+
graderTarget: void 0,
|
|
2477
3311
|
config: {
|
|
2478
3312
|
response: '{"answer":"Mock dry-run response"}',
|
|
2479
3313
|
delayMs: dryRunDelay,
|
|
@@ -2564,7 +3398,7 @@ Errors in ${targetsFilePath}:`);
|
|
|
2564
3398
|
const mockTarget = {
|
|
2565
3399
|
kind: "mock",
|
|
2566
3400
|
name: `${targetDefinition.name}-dry-run`,
|
|
2567
|
-
|
|
3401
|
+
graderTarget: void 0,
|
|
2568
3402
|
config: {
|
|
2569
3403
|
response: '{"answer":"Mock dry-run response"}',
|
|
2570
3404
|
delayMs: dryRunDelay,
|
|
@@ -2719,7 +3553,11 @@ function normalizeOptions(rawOptions, config, yamlExecution) {
|
|
|
2719
3553
|
otelGroupTurns: normalizeBoolean(rawOptions.otelGroupTurns),
|
|
2720
3554
|
retryErrors: normalizeString(rawOptions.retryErrors),
|
|
2721
3555
|
workspaceMode,
|
|
2722
|
-
workspacePath
|
|
3556
|
+
workspacePath,
|
|
3557
|
+
benchmarkJson: normalizeString(rawOptions.benchmarkJson),
|
|
3558
|
+
artifacts: normalizeString(rawOptions.artifacts),
|
|
3559
|
+
graderTarget: normalizeString(rawOptions.graderTarget),
|
|
3560
|
+
model: normalizeString(rawOptions.model)
|
|
2723
3561
|
};
|
|
2724
3562
|
}
|
|
2725
3563
|
async function ensureFileExists(filePath, description) {
|
|
@@ -2733,7 +3571,7 @@ function buildDefaultOutputPath(cwd, format) {
|
|
|
2733
3571
|
const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
2734
3572
|
const baseName = "eval";
|
|
2735
3573
|
const extension = getDefaultExtension(format);
|
|
2736
|
-
return
|
|
3574
|
+
return path12.join(cwd, ".agentv", "results", `${baseName}_${timestamp}${extension}`);
|
|
2737
3575
|
}
|
|
2738
3576
|
function createProgressReporter(maxWorkers, options) {
|
|
2739
3577
|
const display = new ProgressDisplay(maxWorkers, options);
|
|
@@ -2747,7 +3585,7 @@ function createProgressReporter(maxWorkers, options) {
|
|
|
2747
3585
|
};
|
|
2748
3586
|
}
|
|
2749
3587
|
/**
 * Build the key for an eval case: the resolved absolute path of its test
 * file and the eval id, joined by "::".
 *
 * @param {string} testFilePath - Path to the eval file (resolved against the current working directory).
 * @param {string} evalId - Identifier of the eval case.
 * @returns {string} Key of the form "<absolute path>::<evalId>".
 */
function makeEvalKey(testFilePath, evalId) {
  const absoluteTestPath = path12.resolve(testFilePath);
  return `${absoluteTestPath}::${evalId}`;
}
|
|
2752
3590
|
function createDisplayIdTracker() {
|
|
2753
3591
|
const map = /* @__PURE__ */ new Map();
|
|
@@ -2952,6 +3790,8 @@ async function runSingleEvalFile(params) {
|
|
|
2952
3790
|
trials: trialsConfig,
|
|
2953
3791
|
totalBudgetUsd,
|
|
2954
3792
|
failOnError,
|
|
3793
|
+
graderTarget: options.graderTarget,
|
|
3794
|
+
model: options.model,
|
|
2955
3795
|
streamCallbacks: streamingObserver?.getStreamCallbacks(),
|
|
2956
3796
|
onResult: async (result) => {
|
|
2957
3797
|
streamingObserver?.finalizeEvalCase(result.score, result.error);
|
|
@@ -3004,16 +3844,19 @@ async function runEvalCommand(input) {
|
|
|
3004
3844
|
);
|
|
3005
3845
|
}
|
|
3006
3846
|
const repoRoot = await findRepoRoot(cwd);
|
|
3007
|
-
const yamlConfig = await loadConfig(
|
|
3847
|
+
const yamlConfig = await loadConfig(path12.join(cwd, "_"), repoRoot);
|
|
3008
3848
|
if (yamlConfig?.required_version) {
|
|
3009
3849
|
await enforceRequiredVersion(yamlConfig.required_version, {
|
|
3010
3850
|
strict: normalizeBoolean(input.rawOptions.strict)
|
|
3011
3851
|
});
|
|
3012
3852
|
}
|
|
3013
3853
|
let options = normalizeOptions(input.rawOptions, config, yamlConfig?.execution);
|
|
3854
|
+
if (options.graderTarget === "agentv" && !options.model) {
|
|
3855
|
+
throw new Error("--grader-target agentv requires --model (e.g., --model openai:gpt-5-mini)");
|
|
3856
|
+
}
|
|
3014
3857
|
let retryNonErrorResults;
|
|
3015
3858
|
if (options.retryErrors) {
|
|
3016
|
-
const retryPath =
|
|
3859
|
+
const retryPath = path12.resolve(options.retryErrors);
|
|
3017
3860
|
await ensureFileExists(retryPath, "Retry-errors JSONL file");
|
|
3018
3861
|
const errorIds = await loadErrorTestIds(retryPath);
|
|
3019
3862
|
if (errorIds.length === 0) {
|
|
@@ -3026,7 +3869,7 @@ async function runEvalCommand(input) {
|
|
|
3026
3869
|
retryNonErrorResults = await loadNonErrorResults(retryPath);
|
|
3027
3870
|
}
|
|
3028
3871
|
if (options.workspacePath) {
|
|
3029
|
-
const resolvedWorkspace =
|
|
3872
|
+
const resolvedWorkspace = path12.resolve(options.workspacePath);
|
|
3030
3873
|
try {
|
|
3031
3874
|
const { stat: stat2 } = await import("node:fs/promises");
|
|
3032
3875
|
const stats = await stat2(resolvedWorkspace);
|
|
@@ -3048,7 +3891,7 @@ async function runEvalCommand(input) {
|
|
|
3048
3891
|
const useFileExport = !!(options.otelFile || options.traceFile);
|
|
3049
3892
|
if (options.exportOtel || useFileExport) {
|
|
3050
3893
|
try {
|
|
3051
|
-
const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-
|
|
3894
|
+
const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-WN2QIOQR.js");
|
|
3052
3895
|
let endpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT;
|
|
3053
3896
|
let headers = {};
|
|
3054
3897
|
if (options.otelBackend) {
|
|
@@ -3072,8 +3915,8 @@ async function runEvalCommand(input) {
|
|
|
3072
3915
|
headers,
|
|
3073
3916
|
captureContent,
|
|
3074
3917
|
groupTurns: options.otelGroupTurns,
|
|
3075
|
-
otlpFilePath: options.otelFile ?
|
|
3076
|
-
traceFilePath: options.traceFile ?
|
|
3918
|
+
otlpFilePath: options.otelFile ? path12.resolve(options.otelFile) : void 0,
|
|
3919
|
+
traceFilePath: options.traceFile ? path12.resolve(options.traceFile) : void 0
|
|
3077
3920
|
});
|
|
3078
3921
|
const initialized = await otelExporter.init();
|
|
3079
3922
|
if (!initialized) {
|
|
@@ -3089,8 +3932,8 @@ async function runEvalCommand(input) {
|
|
|
3089
3932
|
otelExporter = null;
|
|
3090
3933
|
}
|
|
3091
3934
|
}
|
|
3092
|
-
const outputPath = options.outPath ?
|
|
3093
|
-
const extraOutputPaths = options.outputPaths.map((p) =>
|
|
3935
|
+
const outputPath = options.outPath ? path12.resolve(options.outPath) : buildDefaultOutputPath(cwd, options.format);
|
|
3936
|
+
const extraOutputPaths = options.outputPaths.map((p) => path12.resolve(p));
|
|
3094
3937
|
const allOutputPaths = extraOutputPaths.length > 0 ? [outputPath, ...extraOutputPaths] : [outputPath];
|
|
3095
3938
|
const uniqueOutputPaths = [...new Set(allOutputPaths)];
|
|
3096
3939
|
let outputWriter;
|
|
@@ -3104,12 +3947,12 @@ async function runEvalCommand(input) {
|
|
|
3104
3947
|
console.log(` ${p}`);
|
|
3105
3948
|
}
|
|
3106
3949
|
}
|
|
3107
|
-
const resolvedTestFiles = input.testFiles.map((file) =>
|
|
3950
|
+
const resolvedTestFiles = input.testFiles.map((file) => path12.resolve(file));
|
|
3108
3951
|
if (options.otelFile) {
|
|
3109
|
-
console.log(`OTLP JSON file: ${
|
|
3952
|
+
console.log(`OTLP JSON file: ${path12.resolve(options.otelFile)}`);
|
|
3110
3953
|
}
|
|
3111
3954
|
if (options.traceFile) {
|
|
3112
|
-
console.log(`Trace file: ${
|
|
3955
|
+
console.log(`Trace file: ${path12.resolve(options.traceFile)}`);
|
|
3113
3956
|
}
|
|
3114
3957
|
const evaluationRunner = await resolveEvaluationRunner();
|
|
3115
3958
|
const allResults = [];
|
|
@@ -3122,7 +3965,23 @@ async function runEvalCommand(input) {
|
|
|
3122
3965
|
);
|
|
3123
3966
|
const perFileWorkers = options.workers ? Math.max(1, Math.floor(totalWorkers / fileConcurrency)) : void 0;
|
|
3124
3967
|
const fileMetadata = /* @__PURE__ */ new Map();
|
|
3968
|
+
const tsFiles = [];
|
|
3969
|
+
const yamlFiles = [];
|
|
3125
3970
|
for (const testFilePath of resolvedTestFiles) {
|
|
3971
|
+
if (/\.(ts|js|mts|mjs)$/.test(testFilePath)) {
|
|
3972
|
+
tsFiles.push(testFilePath);
|
|
3973
|
+
} else {
|
|
3974
|
+
yamlFiles.push(testFilePath);
|
|
3975
|
+
}
|
|
3976
|
+
}
|
|
3977
|
+
for (const tsFile of tsFiles) {
|
|
3978
|
+
await ensureFileExists(tsFile, "TypeScript eval file");
|
|
3979
|
+
await import(pathToFileURL(tsFile).href);
|
|
3980
|
+
}
|
|
3981
|
+
if (yamlFiles.length === 0 && tsFiles.length > 0) {
|
|
3982
|
+
return;
|
|
3983
|
+
}
|
|
3984
|
+
for (const testFilePath of yamlFiles) {
|
|
3126
3985
|
const meta = await prepareFileMetadata({
|
|
3127
3986
|
testFilePath,
|
|
3128
3987
|
repoRoot,
|
|
@@ -3139,7 +3998,7 @@ async function runEvalCommand(input) {
|
|
|
3139
3998
|
cliNoCache: options.noCache,
|
|
3140
3999
|
yamlCache: yamlCacheEnabled
|
|
3141
4000
|
});
|
|
3142
|
-
const cache = cacheEnabled ? new ResponseCache(yamlCachePath ?
|
|
4001
|
+
const cache = cacheEnabled ? new ResponseCache(yamlCachePath ? path12.resolve(yamlCachePath) : void 0) : void 0;
|
|
3143
4002
|
const useCache = cacheEnabled;
|
|
3144
4003
|
if (cacheEnabled) {
|
|
3145
4004
|
console.log(`Response cache: enabled${yamlCachePath ? ` (${yamlCachePath})` : ""}`);
|
|
@@ -3269,6 +4128,24 @@ async function runEvalCommand(input) {
|
|
|
3269
4128
|
if (isMatrixMode && allResults.length > 0) {
|
|
3270
4129
|
console.log(formatMatrixSummary(allResults));
|
|
3271
4130
|
}
|
|
4131
|
+
if (options.benchmarkJson && allResults.length > 0) {
|
|
4132
|
+
const benchmarkPath = path12.resolve(options.benchmarkJson);
|
|
4133
|
+
await writeBenchmarkJson(benchmarkPath, allResults);
|
|
4134
|
+
console.log(`Benchmark written to: ${benchmarkPath}`);
|
|
4135
|
+
}
|
|
4136
|
+
if (options.artifacts && allResults.length > 0) {
|
|
4137
|
+
const artifactsDir = path12.resolve(options.artifacts);
|
|
4138
|
+
const evalFile = resolvedTestFiles.length === 1 ? resolvedTestFiles[0] : "";
|
|
4139
|
+
const {
|
|
4140
|
+
gradingDir,
|
|
4141
|
+
timingPath,
|
|
4142
|
+
benchmarkPath: abp
|
|
4143
|
+
} = await writeArtifactsFromResults(allResults, artifactsDir, { evalFile });
|
|
4144
|
+
console.log(`Artifacts written to: ${artifactsDir}`);
|
|
4145
|
+
console.log(` Grading: ${gradingDir} (${allResults.length} files)`);
|
|
4146
|
+
console.log(` Timing: ${timingPath}`);
|
|
4147
|
+
console.log(` Benchmark: ${abp}`);
|
|
4148
|
+
}
|
|
3272
4149
|
const failedWithWorkspaces = allResults.filter(
|
|
3273
4150
|
(r) => r.workspacePath && (r.error || r.score < 0.5)
|
|
3274
4151
|
);
|
|
@@ -3308,7 +4185,7 @@ async function resolveEvaluationRunner() {
|
|
|
3308
4185
|
if (!overridePath) {
|
|
3309
4186
|
return runEvaluation;
|
|
3310
4187
|
}
|
|
3311
|
-
const resolved =
|
|
4188
|
+
const resolved = path12.isAbsolute(overridePath) ? overridePath : path12.resolve(process.cwd(), overridePath);
|
|
3312
4189
|
const moduleUrl = pathToFileURL(resolved).href;
|
|
3313
4190
|
const mod = await import(moduleUrl);
|
|
3314
4191
|
const candidate = mod.runEvaluation;
|
|
@@ -3323,6 +4200,7 @@ async function resolveEvaluationRunner() {
|
|
|
3323
4200
|
export {
|
|
3324
4201
|
package_default,
|
|
3325
4202
|
toSnakeCaseDeep,
|
|
4203
|
+
HtmlWriter,
|
|
3326
4204
|
resolveEvalPaths,
|
|
3327
4205
|
findRepoRoot,
|
|
3328
4206
|
detectFileType,
|
|
@@ -3335,4 +4213,4 @@ export {
|
|
|
3335
4213
|
selectTarget,
|
|
3336
4214
|
runEvalCommand
|
|
3337
4215
|
};
|
|
3338
|
-
//# sourceMappingURL=chunk-
|
|
4216
|
+
//# sourceMappingURL=chunk-DY4ZDTTO.js.map
|