agentv 3.3.0 → 3.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +7 -9
- package/dist/{agentv-provider-HDSAUUEF-LUBMM7TH.js → agentv-provider-NFFLXG5M-TJAWCWCX.js} +2 -2
- package/dist/{chunk-6LP5Z5Y4.js → chunk-5GG6DDP5.js} +256 -128
- package/dist/chunk-5GG6DDP5.js.map +1 -0
- package/dist/{chunk-AR3QEKXH.js → chunk-BJV6MDBE.js} +3 -3
- package/dist/{chunk-AR3QEKXH.js.map → chunk-BJV6MDBE.js.map} +1 -1
- package/dist/{chunk-5M3K2DMV.js → chunk-D6G4N2H2.js} +550 -516
- package/dist/chunk-D6G4N2H2.js.map +1 -0
- package/dist/{chunk-4ZMSAQWS.js → chunk-RLL4QGNL.js} +172 -81
- package/dist/chunk-RLL4QGNL.js.map +1 -0
- package/dist/cli.js +4 -4
- package/dist/{dist-OC53WD3P.js → dist-MZFXE6B5.js} +3 -5
- package/dist/index.js +4 -4
- package/dist/{interactive-NA6SAIAG.js → interactive-J7SUWZH2.js} +45 -5
- package/dist/interactive-J7SUWZH2.js.map +1 -0
- package/dist/templates/.agentv/.env.example +11 -9
- package/dist/templates/.agentv/config.yaml +5 -0
- package/dist/templates/.agentv/targets.yaml +0 -16
- package/package.json +2 -2
- package/dist/chunk-4ZMSAQWS.js.map +0 -1
- package/dist/chunk-5M3K2DMV.js.map +0 -1
- package/dist/chunk-6LP5Z5Y4.js.map +0 -1
- package/dist/interactive-NA6SAIAG.js.map +0 -1
- /package/dist/{agentv-provider-HDSAUUEF-LUBMM7TH.js.map → agentv-provider-NFFLXG5M-TJAWCWCX.js.map} +0 -0
- /package/dist/{dist-OC53WD3P.js.map → dist-MZFXE6B5.js.map} +0 -0
|
@@ -25,12 +25,12 @@ import {
|
|
|
25
25
|
subscribeToCopilotCliLogEntries,
|
|
26
26
|
subscribeToCopilotSdkLogEntries,
|
|
27
27
|
subscribeToPiLogEntries
|
|
28
|
-
} from "./chunk-5M3K2DMV.js";
|
|
28
|
+
} from "./chunk-D6G4N2H2.js";
|
|
29
29
|
|
|
30
30
|
// package.json
|
|
31
31
|
var package_default = {
|
|
32
32
|
name: "agentv",
|
|
33
|
-
version: "3.3.0",
|
|
33
|
+
version: "3.5.0",
|
|
34
34
|
description: "CLI entry point for AgentV",
|
|
35
35
|
type: "module",
|
|
36
36
|
repository: {
|
|
@@ -320,34 +320,12 @@ function parseWorkspaceChanges(fileChanges) {
|
|
|
320
320
|
diff_summary: diffSummary
|
|
321
321
|
};
|
|
322
322
|
}
|
|
323
|
-
function
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
text: hit,
|
|
330
|
-
passed: true,
|
|
331
|
-
evidence: evaluator.reasoning ?? ""
|
|
332
|
-
});
|
|
333
|
-
}
|
|
334
|
-
for (const miss of evaluator.misses) {
|
|
335
|
-
expectations.push({
|
|
336
|
-
text: miss,
|
|
337
|
-
passed: false,
|
|
338
|
-
evidence: evaluator.reasoning ?? ""
|
|
339
|
-
});
|
|
340
|
-
}
|
|
341
|
-
}
|
|
342
|
-
} else {
|
|
343
|
-
for (const hit of result.hits) {
|
|
344
|
-
expectations.push({ text: hit, passed: true, evidence: result.reasoning ?? "" });
|
|
345
|
-
}
|
|
346
|
-
for (const miss of result.misses) {
|
|
347
|
-
expectations.push({ text: miss, passed: false, evidence: result.reasoning ?? "" });
|
|
348
|
-
}
|
|
349
|
-
}
|
|
350
|
-
return expectations;
|
|
323
|
+
function buildAssertions(result) {
|
|
324
|
+
return result.assertions.map((a) => ({
|
|
325
|
+
text: a.text,
|
|
326
|
+
passed: a.passed,
|
|
327
|
+
evidence: a.evidence ?? ""
|
|
328
|
+
}));
|
|
351
329
|
}
|
|
352
330
|
function buildEvaluators(scores) {
|
|
353
331
|
if (!scores || scores.length === 0) {
|
|
@@ -357,23 +335,22 @@ function buildEvaluators(scores) {
|
|
|
357
335
|
name: s.name,
|
|
358
336
|
type: s.type,
|
|
359
337
|
score: s.score,
|
|
360
|
-
reasoning:
|
|
338
|
+
reasoning: "",
|
|
361
339
|
weight: s.weight,
|
|
362
340
|
verdict: s.verdict,
|
|
363
|
-
|
|
364
|
-
misses: s.misses,
|
|
341
|
+
assertions: s.assertions,
|
|
365
342
|
details: s.details
|
|
366
343
|
}));
|
|
367
344
|
}
|
|
368
345
|
function buildGradingArtifact(result) {
|
|
369
|
-
const
|
|
370
|
-
const passed =
|
|
371
|
-
const failed =
|
|
372
|
-
const total =
|
|
346
|
+
const assertions = buildAssertions(result);
|
|
347
|
+
const passed = assertions.filter((e) => e.passed).length;
|
|
348
|
+
const failed = assertions.filter((e) => !e.passed).length;
|
|
349
|
+
const total = assertions.length;
|
|
373
350
|
const { toolCalls, total: totalToolCalls } = countToolCalls(result);
|
|
374
351
|
const errorsEncountered = result.error ? 1 : 0;
|
|
375
352
|
return {
|
|
376
|
-
|
|
353
|
+
assertions,
|
|
377
354
|
summary: {
|
|
378
355
|
passed,
|
|
379
356
|
failed,
|
|
@@ -496,6 +473,42 @@ function buildBenchmarkArtifact(results, evalFile = "") {
|
|
|
496
473
|
notes
|
|
497
474
|
};
|
|
498
475
|
}
|
|
476
|
+
function toCamelCase(str) {
|
|
477
|
+
return str.replace(/_([a-z])/g, (_, letter) => letter.toUpperCase());
|
|
478
|
+
}
|
|
479
|
+
function toCamelCaseDeep(obj) {
|
|
480
|
+
if (obj === null || obj === void 0) {
|
|
481
|
+
return obj;
|
|
482
|
+
}
|
|
483
|
+
if (Array.isArray(obj)) {
|
|
484
|
+
return obj.map((item) => toCamelCaseDeep(item));
|
|
485
|
+
}
|
|
486
|
+
if (typeof obj === "object") {
|
|
487
|
+
const result = {};
|
|
488
|
+
for (const [key, value] of Object.entries(obj)) {
|
|
489
|
+
result[toCamelCase(key)] = toCamelCaseDeep(value);
|
|
490
|
+
}
|
|
491
|
+
return result;
|
|
492
|
+
}
|
|
493
|
+
return obj;
|
|
494
|
+
}
|
|
495
|
+
function parseJsonlResults(content) {
|
|
496
|
+
const results = [];
|
|
497
|
+
const lines = content.split("\n");
|
|
498
|
+
for (const line of lines) {
|
|
499
|
+
const trimmed = line.trim();
|
|
500
|
+
if (trimmed.length === 0) {
|
|
501
|
+
continue;
|
|
502
|
+
}
|
|
503
|
+
try {
|
|
504
|
+
const parsed = JSON.parse(trimmed);
|
|
505
|
+
const camelCased = toCamelCaseDeep(parsed);
|
|
506
|
+
results.push(camelCased);
|
|
507
|
+
} catch {
|
|
508
|
+
}
|
|
509
|
+
}
|
|
510
|
+
return results;
|
|
511
|
+
}
|
|
499
512
|
async function writeArtifactsFromResults(results, outputDir, options) {
|
|
500
513
|
const gradingDir = path3.join(outputDir, "grading");
|
|
501
514
|
const timingPath = path3.join(outputDir, "timing.json");
|
|
@@ -1321,23 +1334,27 @@ var SCRIPT = `
|
|
|
1321
1334
|
/* evaluator results */
|
|
1322
1335
|
if(r.scores&&r.scores.length>0){
|
|
1323
1336
|
h+="<h4>Evaluator Results</h4>";
|
|
1324
|
-
h+='<table class="eval-table"><thead><tr><th>Evaluator</th><th>Score</th><th>Status</th><th>
|
|
1337
|
+
h+='<table class="eval-table"><thead><tr><th>Evaluator</th><th>Score</th><th>Status</th><th>Assertions</th></tr></thead><tbody>';
|
|
1325
1338
|
for(var i=0;i<r.scores.length;i++){
|
|
1326
1339
|
var ev=r.scores[i],evS=ev.score>=0.5?"pass":"fail";
|
|
1327
|
-
|
|
1340
|
+
var evAssertions=ev.assertions||[];
|
|
1341
|
+
var evSummary=evAssertions.map(function(a){return (a.passed?"\u2713 ":"\u2717 ")+a.text;}).join("; ");
|
|
1342
|
+
h+="<tr><td class=\\"fw-medium\\">"+esc(ev.name)+'</td><td class="'+sCls(ev.score)+'">'+fmtPct(ev.score)+"</td><td>"+sIcon(evS)+'</td><td class="reasoning-cell">'+esc(evSummary)+"</td></tr>";
|
|
1328
1343
|
}
|
|
1329
1344
|
h+="</tbody></table>";
|
|
1330
1345
|
}
|
|
1331
1346
|
|
|
1332
|
-
/*
|
|
1333
|
-
|
|
1334
|
-
|
|
1335
|
-
|
|
1347
|
+
/* assertions */
|
|
1348
|
+
var passedA=r.assertions?r.assertions.filter(function(a){return a.passed;}):[];
|
|
1349
|
+
var failedA=r.assertions?r.assertions.filter(function(a){return !a.passed;}):[];
|
|
1350
|
+
if(passedA.length>0){
|
|
1351
|
+
h+='<h4>Passed Assertions</h4><ul class="expect-list pass">';
|
|
1352
|
+
for(var i=0;i<passedA.length;i++)h+="<li>"+esc(passedA[i].text)+(passedA[i].evidence?" <span class=\\"reasoning-cell\\">("+esc(passedA[i].evidence)+")</span>":"")+"</li>";
|
|
1336
1353
|
h+="</ul>";
|
|
1337
1354
|
}
|
|
1338
|
-
if(
|
|
1339
|
-
h+='<h4>Failed
|
|
1340
|
-
for(var i=0;i<
|
|
1355
|
+
if(failedA.length>0){
|
|
1356
|
+
h+='<h4>Failed Assertions</h4><ul class="expect-list fail">';
|
|
1357
|
+
for(var i=0;i<failedA.length;i++)h+="<li>"+esc(failedA[i].text)+(failedA[i].evidence?" <span class=\\"reasoning-cell\\">("+esc(failedA[i].evidence)+")</span>":"")+"</li>";
|
|
1341
1358
|
h+="</ul>";
|
|
1342
1359
|
}
|
|
1343
1360
|
|
|
@@ -1526,10 +1543,10 @@ var JunitWriter = class _JunitWriter {
|
|
|
1526
1543
|
`;
|
|
1527
1544
|
} else if (r.score < 0.5) {
|
|
1528
1545
|
const message = `score=${r.score.toFixed(3)}`;
|
|
1546
|
+
const failedAssertions = r.assertions.filter((a) => !a.passed);
|
|
1529
1547
|
const detail = [
|
|
1530
1548
|
`Score: ${r.score.toFixed(3)}`,
|
|
1531
|
-
|
|
1532
|
-
r.misses.length > 0 ? `Misses: ${r.misses.join(", ")}` : ""
|
|
1549
|
+
failedAssertions.length > 0 ? `Failed: ${failedAssertions.map((a) => a.text).join(", ")}` : ""
|
|
1533
1550
|
].filter(Boolean).join("\n");
|
|
1534
1551
|
inner = `
|
|
1535
1552
|
<failure message="${escapeXml(message)}">${escapeXml(detail)}</failure>
|
|
@@ -1673,6 +1690,24 @@ async function createMultiWriter(filePaths) {
|
|
|
1673
1690
|
}
|
|
1674
1691
|
|
|
1675
1692
|
// src/commands/eval/progress-display.ts
|
|
1693
|
+
var ANSI_BOLD = "\x1B[1m";
|
|
1694
|
+
var ANSI_GREEN = "\x1B[32m";
|
|
1695
|
+
var ANSI_RED2 = "\x1B[31m";
|
|
1696
|
+
var ANSI_YELLOW2 = "\x1B[33m";
|
|
1697
|
+
var ANSI_RESET2 = "\x1B[0m";
|
|
1698
|
+
function useColors() {
|
|
1699
|
+
if (process.env.NO_COLOR !== void 0) return false;
|
|
1700
|
+
return process.stdout.isTTY ?? false;
|
|
1701
|
+
}
|
|
1702
|
+
function formatVerdict(score, verdict) {
|
|
1703
|
+
if (verdict === void 0) return "";
|
|
1704
|
+
const colors = useColors();
|
|
1705
|
+
const scoreStr = score !== void 0 ? score.toFixed(3) : "";
|
|
1706
|
+
const verdictLabel = verdict === "ERROR" ? "ERROR" : `${scoreStr} ${verdict}`;
|
|
1707
|
+
if (!colors) return ` | ${verdictLabel}`;
|
|
1708
|
+
const color = verdict === "PASS" ? ANSI_GREEN : verdict === "FAIL" ? ANSI_RED2 : ANSI_YELLOW2;
|
|
1709
|
+
return ` | ${color}${ANSI_BOLD}${verdictLabel}${ANSI_RESET2}`;
|
|
1710
|
+
}
|
|
1676
1711
|
var ProgressDisplay = class {
|
|
1677
1712
|
workers = /* @__PURE__ */ new Map();
|
|
1678
1713
|
totalTests = 0;
|
|
@@ -1716,11 +1751,13 @@ var ProgressDisplay = class {
|
|
|
1716
1751
|
}
|
|
1717
1752
|
break;
|
|
1718
1753
|
case "completed":
|
|
1719
|
-
console.log(
|
|
1754
|
+
console.log(
|
|
1755
|
+
`${countPrefix} \u2705 ${progress.testId}${targetSuffix}${formatVerdict(progress.score, progress.verdict)}`
|
|
1756
|
+
);
|
|
1720
1757
|
break;
|
|
1721
1758
|
case "failed":
|
|
1722
1759
|
console.log(
|
|
1723
|
-
`${countPrefix} \u274C ${progress.testId}${targetSuffix}${progress.error ? `: ${progress.error}` : ""}`
|
|
1760
|
+
`${countPrefix} \u274C ${progress.testId}${targetSuffix}${formatVerdict(progress.score, progress.verdict)}${progress.error ? `: ${progress.error}` : ""}`
|
|
1724
1761
|
);
|
|
1725
1762
|
break;
|
|
1726
1763
|
}
|
|
@@ -1760,6 +1797,22 @@ var ProgressDisplay = class {
|
|
|
1760
1797
|
// src/commands/eval/retry-errors.ts
|
|
1761
1798
|
import { createReadStream } from "node:fs";
|
|
1762
1799
|
import { createInterface } from "node:readline";
|
|
1800
|
+
function getTestId(result) {
|
|
1801
|
+
return result.testId ?? result.test_id;
|
|
1802
|
+
}
|
|
1803
|
+
function getExecutionStatus(result) {
|
|
1804
|
+
return result.executionStatus ?? result.execution_status;
|
|
1805
|
+
}
|
|
1806
|
+
function toEvaluationResult(result) {
|
|
1807
|
+
if (result.testId !== void 0 && result.executionStatus !== void 0) {
|
|
1808
|
+
return result;
|
|
1809
|
+
}
|
|
1810
|
+
return {
|
|
1811
|
+
...result,
|
|
1812
|
+
testId: getTestId(result) ?? "",
|
|
1813
|
+
executionStatus: getExecutionStatus(result)
|
|
1814
|
+
};
|
|
1815
|
+
}
|
|
1763
1816
|
async function loadErrorTestIds(jsonlPath) {
|
|
1764
1817
|
const ids = [];
|
|
1765
1818
|
const rl = createInterface({
|
|
@@ -1771,8 +1824,10 @@ async function loadErrorTestIds(jsonlPath) {
|
|
|
1771
1824
|
if (!trimmed) continue;
|
|
1772
1825
|
try {
|
|
1773
1826
|
const parsed = JSON.parse(trimmed);
|
|
1774
|
-
|
|
1775
|
-
|
|
1827
|
+
const executionStatus = getExecutionStatus(parsed);
|
|
1828
|
+
const testId = getTestId(parsed);
|
|
1829
|
+
if (executionStatus === "execution_error" && testId) {
|
|
1830
|
+
ids.push(testId);
|
|
1776
1831
|
}
|
|
1777
1832
|
} catch {
|
|
1778
1833
|
}
|
|
@@ -1790,9 +1845,11 @@ async function loadNonErrorResults(jsonlPath) {
|
|
|
1790
1845
|
if (!trimmed) continue;
|
|
1791
1846
|
try {
|
|
1792
1847
|
const parsed = JSON.parse(trimmed);
|
|
1793
|
-
|
|
1794
|
-
|
|
1795
|
-
|
|
1848
|
+
const testId = getTestId(parsed);
|
|
1849
|
+
const executionStatus = getExecutionStatus(parsed);
|
|
1850
|
+
if (!testId || parsed.score === void 0) continue;
|
|
1851
|
+
if (executionStatus !== "execution_error") {
|
|
1852
|
+
results.push(toEvaluationResult(parsed));
|
|
1796
1853
|
}
|
|
1797
1854
|
} catch {
|
|
1798
1855
|
}
|
|
@@ -1936,7 +1993,19 @@ function formatEvaluationSummary(summary) {
|
|
|
1936
1993
|
}
|
|
1937
1994
|
lines.push("");
|
|
1938
1995
|
}
|
|
1996
|
+
const overallPassed = summary.passedCount === summary.total - summary.executionErrorCount || summary.qualityFailureCount === 0 && summary.executionErrorCount === 0;
|
|
1997
|
+
const overallVerdict = overallPassed ? "PASS" : "FAIL";
|
|
1998
|
+
const useColor = !(process.env.NO_COLOR !== void 0) && (process.stdout.isTTY ?? false);
|
|
1999
|
+
const verdictColor = overallPassed ? "\x1B[32m" : "\x1B[31m";
|
|
2000
|
+
const verdictText = `RESULT: ${overallVerdict} (${summary.passedCount}/${summary.total} passed, mean score: ${formatScore(summary.mean)})`;
|
|
1939
2001
|
lines.push("\n==================================================");
|
|
2002
|
+
if (useColor) {
|
|
2003
|
+
lines.push(`\x1B[1m${verdictColor}${verdictText}\x1B[0m`);
|
|
2004
|
+
} else {
|
|
2005
|
+
lines.push(verdictText);
|
|
2006
|
+
}
|
|
2007
|
+
lines.push("==================================================");
|
|
2008
|
+
lines.push("");
|
|
1940
2009
|
lines.push("EVALUATION SUMMARY");
|
|
1941
2010
|
lines.push("==================================================");
|
|
1942
2011
|
lines.push(`Total tests: ${summary.total}`);
|
|
@@ -3292,9 +3361,9 @@ async function validateMessagesFileRefs(messages, location, searchRoots, filePat
|
|
|
3292
3361
|
}
|
|
3293
3362
|
|
|
3294
3363
|
// src/commands/eval/targets.ts
|
|
3295
|
-
var
|
|
3296
|
-
var
|
|
3297
|
-
var
|
|
3364
|
+
var ANSI_YELLOW3 = "\x1B[33m";
|
|
3365
|
+
var ANSI_RED3 = "\x1B[31m";
|
|
3366
|
+
var ANSI_RESET3 = "\x1B[0m";
|
|
3298
3367
|
function isTTY() {
|
|
3299
3368
|
return process.stdout.isTTY ?? false;
|
|
3300
3369
|
}
|
|
@@ -3334,14 +3403,14 @@ async function selectTarget(options) {
|
|
|
3334
3403
|
});
|
|
3335
3404
|
const validationResult = await validateTargetsFile(targetsFilePath);
|
|
3336
3405
|
const warnings = validationResult.errors.filter((e) => e.severity === "warning");
|
|
3337
|
-
const
|
|
3406
|
+
const useColors2 = isTTY();
|
|
3338
3407
|
if (warnings.length > 0) {
|
|
3339
3408
|
console.warn(`
|
|
3340
3409
|
Warnings in ${targetsFilePath}:`);
|
|
3341
3410
|
for (const warning of warnings) {
|
|
3342
3411
|
const location = warning.location ? ` [${warning.location}]` : "";
|
|
3343
|
-
const prefix =
|
|
3344
|
-
const message =
|
|
3412
|
+
const prefix = useColors2 ? `${ANSI_YELLOW3} \u26A0${ANSI_RESET3}` : " \u26A0";
|
|
3413
|
+
const message = useColors2 ? `${ANSI_YELLOW3}${warning.message}${ANSI_RESET3}` : warning.message;
|
|
3345
3414
|
console.warn(`${prefix}${location} ${message}`);
|
|
3346
3415
|
}
|
|
3347
3416
|
console.warn("");
|
|
@@ -3352,8 +3421,8 @@ Warnings in ${targetsFilePath}:`);
|
|
|
3352
3421
|
Errors in ${targetsFilePath}:`);
|
|
3353
3422
|
for (const error of errors) {
|
|
3354
3423
|
const location = error.location ? ` [${error.location}]` : "";
|
|
3355
|
-
const prefix =
|
|
3356
|
-
const message =
|
|
3424
|
+
const prefix = useColors2 ? `${ANSI_RED3} \u2717${ANSI_RESET3}` : " \u2717";
|
|
3425
|
+
const message = useColors2 ? `${ANSI_RED3}${error.message}${ANSI_RESET3}` : error.message;
|
|
3357
3426
|
console.error(`${prefix}${location} ${message}`);
|
|
3358
3427
|
}
|
|
3359
3428
|
throw new Error(`Targets file validation failed with ${errors.length} error(s)`);
|
|
@@ -3425,14 +3494,14 @@ async function selectMultipleTargets(options) {
|
|
|
3425
3494
|
});
|
|
3426
3495
|
const validationResult = await validateTargetsFile(targetsFilePath);
|
|
3427
3496
|
const warnings = validationResult.errors.filter((e) => e.severity === "warning");
|
|
3428
|
-
const
|
|
3497
|
+
const useColors2 = isTTY();
|
|
3429
3498
|
if (warnings.length > 0) {
|
|
3430
3499
|
console.warn(`
|
|
3431
3500
|
Warnings in ${targetsFilePath}:`);
|
|
3432
3501
|
for (const warning of warnings) {
|
|
3433
3502
|
const location = warning.location ? ` [${warning.location}]` : "";
|
|
3434
|
-
const prefix =
|
|
3435
|
-
const message =
|
|
3503
|
+
const prefix = useColors2 ? `${ANSI_YELLOW3} \u26A0${ANSI_RESET3}` : " \u26A0";
|
|
3504
|
+
const message = useColors2 ? `${ANSI_YELLOW3}${warning.message}${ANSI_RESET3}` : warning.message;
|
|
3436
3505
|
console.warn(`${prefix}${location} ${message}`);
|
|
3437
3506
|
}
|
|
3438
3507
|
console.warn("");
|
|
@@ -3443,8 +3512,8 @@ Warnings in ${targetsFilePath}:`);
|
|
|
3443
3512
|
Errors in ${targetsFilePath}:`);
|
|
3444
3513
|
for (const error of errors) {
|
|
3445
3514
|
const location = error.location ? ` [${error.location}]` : "";
|
|
3446
|
-
const prefix =
|
|
3447
|
-
const message =
|
|
3515
|
+
const prefix = useColors2 ? `${ANSI_RED3} \u2717${ANSI_RESET3}` : " \u2717";
|
|
3516
|
+
const message = useColors2 ? `${ANSI_RED3}${error.message}${ANSI_RESET3}` : error.message;
|
|
3448
3517
|
console.error(`${prefix}${location} ${message}`);
|
|
3449
3518
|
}
|
|
3450
3519
|
throw new Error(`Targets file validation failed with ${errors.length} error(s)`);
|
|
@@ -3737,13 +3806,10 @@ async function prepareFileMetadata(params) {
|
|
|
3737
3806
|
env: process.env,
|
|
3738
3807
|
targetNames
|
|
3739
3808
|
});
|
|
3740
|
-
selections = multiSelections.map((sel) => {
|
|
3741
|
-
|
|
3742
|
-
|
|
3743
|
-
|
|
3744
|
-
inlineTargetLabel: `${sel.targetName} ${buildTargetLabelSuffix(providerLabel, sel.resolvedTarget)}`
|
|
3745
|
-
};
|
|
3746
|
-
});
|
|
3809
|
+
selections = multiSelections.map((sel) => ({
|
|
3810
|
+
selection: sel,
|
|
3811
|
+
inlineTargetLabel: sel.targetName
|
|
3812
|
+
}));
|
|
3747
3813
|
} else {
|
|
3748
3814
|
const selection = await selectTarget({
|
|
3749
3815
|
testFilePath,
|
|
@@ -3757,11 +3823,10 @@ async function prepareFileMetadata(params) {
|
|
|
3757
3823
|
dryRunDelayMax: options.dryRunDelayMax,
|
|
3758
3824
|
env: process.env
|
|
3759
3825
|
});
|
|
3760
|
-
const providerLabel = options.dryRun ? `${selection.resolvedTarget.kind} (dry-run)` : selection.resolvedTarget.kind;
|
|
3761
3826
|
selections = [
|
|
3762
3827
|
{
|
|
3763
3828
|
selection,
|
|
3764
|
-
inlineTargetLabel:
|
|
3829
|
+
inlineTargetLabel: selection.targetName
|
|
3765
3830
|
}
|
|
3766
3831
|
];
|
|
3767
3832
|
}
|
|
@@ -3902,6 +3967,10 @@ async function runSingleEvalFile(params) {
|
|
|
3902
3967
|
if (event.status === "running" && streamingObserver) {
|
|
3903
3968
|
streamingObserver.startEvalCase(event.testId, targetName, testFilePath);
|
|
3904
3969
|
}
|
|
3970
|
+
let verdict;
|
|
3971
|
+
if (event.executionStatus === "ok") verdict = "PASS";
|
|
3972
|
+
else if (event.executionStatus === "quality_failure") verdict = "FAIL";
|
|
3973
|
+
else if (event.executionStatus === "execution_error") verdict = "ERROR";
|
|
3905
3974
|
progressReporter.update(displayId, {
|
|
3906
3975
|
workerId: displayId,
|
|
3907
3976
|
testId: matrixMode ? `${event.testId}@${targetName}` : event.testId,
|
|
@@ -3909,7 +3978,9 @@ async function runSingleEvalFile(params) {
|
|
|
3909
3978
|
startedAt: event.startedAt,
|
|
3910
3979
|
completedAt: event.completedAt,
|
|
3911
3980
|
error: event.error,
|
|
3912
|
-
targetLabel: inlineTargetLabel
|
|
3981
|
+
targetLabel: inlineTargetLabel,
|
|
3982
|
+
score: event.score,
|
|
3983
|
+
verdict
|
|
3913
3984
|
});
|
|
3914
3985
|
}
|
|
3915
3986
|
});
|
|
@@ -3973,7 +4044,7 @@ async function runEvalCommand(input) {
|
|
|
3973
4044
|
const useFileExport = !!(options.otelFile || options.traceFile);
|
|
3974
4045
|
if (options.exportOtel || useFileExport) {
|
|
3975
4046
|
try {
|
|
3976
|
-
const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-OC53WD3P.js");
|
|
4047
|
+
const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-MZFXE6B5.js");
|
|
3977
4048
|
let endpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT;
|
|
3978
4049
|
let headers = {};
|
|
3979
4050
|
if (options.otelBackend) {
|
|
@@ -4248,6 +4319,22 @@ Results written to: ${outputPath}`);
|
|
|
4248
4319
|
}
|
|
4249
4320
|
}
|
|
4250
4321
|
}
|
|
4322
|
+
if (summary.executionErrorCount > 0 && !options.retryErrors) {
|
|
4323
|
+
const evalFileArgs = resolvedTestFiles.map((f) => path12.relative(cwd, f)).join(" ");
|
|
4324
|
+
const targetFlag = options.target ? ` --target ${options.target}` : "";
|
|
4325
|
+
const relativeOutputPath = path12.relative(cwd, outputPath);
|
|
4326
|
+
console.log(
|
|
4327
|
+
`
|
|
4328
|
+
Tip: ${summary.executionErrorCount} execution error(s) detected. Re-run failed tests with:
|
|
4329
|
+
agentv eval run ${evalFileArgs}${targetFlag} --retry-errors ${relativeOutputPath} -o ${relativeOutputPath}`
|
|
4330
|
+
);
|
|
4331
|
+
}
|
|
4332
|
+
return {
|
|
4333
|
+
executionErrorCount: summary.executionErrorCount,
|
|
4334
|
+
outputPath,
|
|
4335
|
+
testFiles: resolvedTestFiles,
|
|
4336
|
+
target: options.target
|
|
4337
|
+
};
|
|
4251
4338
|
} finally {
|
|
4252
4339
|
unsubscribeCodexLogs();
|
|
4253
4340
|
unsubscribePiLogs();
|
|
@@ -4285,6 +4372,10 @@ export {
|
|
|
4285
4372
|
HtmlWriter,
|
|
4286
4373
|
resolveEvalPaths,
|
|
4287
4374
|
findRepoRoot,
|
|
4375
|
+
buildGradingArtifact,
|
|
4376
|
+
buildTimingArtifact,
|
|
4377
|
+
buildBenchmarkArtifact,
|
|
4378
|
+
parseJsonlResults,
|
|
4288
4379
|
detectFileType,
|
|
4289
4380
|
validateEvalFile,
|
|
4290
4381
|
validateTargetsFile,
|
|
@@ -4295,4 +4386,4 @@ export {
|
|
|
4295
4386
|
selectTarget,
|
|
4296
4387
|
runEvalCommand
|
|
4297
4388
|
};
|
|
4298
|
-
//# sourceMappingURL=chunk-4ZMSAQWS.js.map
|
|
4389
|
+
//# sourceMappingURL=chunk-RLL4QGNL.js.map
|