agentv 4.35.1 → 4.37.0-next.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{artifact-writer-G57MG52C.js → artifact-writer-GFNKYREE.js} +4 -4
- package/dist/{chunk-INOKS5LF.js → chunk-M7AMFWBZ.js} +275 -58
- package/dist/chunk-M7AMFWBZ.js.map +1 -0
- package/dist/{chunk-KJGYL3M3.js → chunk-N6E5XFOM.js} +213 -85
- package/dist/chunk-N6E5XFOM.js.map +1 -0
- package/dist/{chunk-KNF3AGCI.js → chunk-OYI35QFW.js} +314 -49
- package/dist/chunk-OYI35QFW.js.map +1 -0
- package/dist/{chunk-CRMGUVRZ.js → chunk-P4LSNFZR.js} +85 -19
- package/dist/chunk-P4LSNFZR.js.map +1 -0
- package/dist/{chunk-6QEIZ33V.js → chunk-RL4S2FBZ.js} +2700 -456
- package/dist/chunk-RL4S2FBZ.js.map +1 -0
- package/dist/cli.js +5 -5
- package/dist/dashboard/assets/index-9tV-u4HJ.css +1 -0
- package/dist/dashboard/assets/{index-Bdk-9a_8.js → index-BDRYJsGF.js} +1 -1
- package/dist/dashboard/assets/index-DuESU7zZ.js +118 -0
- package/dist/dashboard/index.html +2 -2
- package/dist/{dist-M4B77IW4.js → dist-OY3JSP6Z.js} +125 -3
- package/dist/index.js +5 -5
- package/dist/{interactive-VYQ5SYMR.js → interactive-CQELHITQ.js} +5 -5
- package/dist/skills/agentv-eval-writer/SKILL.md +6 -0
- package/dist/{ts-eval-loader-EQJX3OLT-THE7D3GR.js → ts-eval-loader-RBTB2HG2-H5TRXZLO.js} +2 -2
- package/package.json +1 -1
- package/dist/chunk-6QEIZ33V.js.map +0 -1
- package/dist/chunk-CRMGUVRZ.js.map +0 -1
- package/dist/chunk-INOKS5LF.js.map +0 -1
- package/dist/chunk-KJGYL3M3.js.map +0 -1
- package/dist/chunk-KNF3AGCI.js.map +0 -1
- package/dist/dashboard/assets/index-BPMAZqjE.css +0 -1
- package/dist/dashboard/assets/index-BWO0UcxG.js +0 -118
- /package/dist/{artifact-writer-G57MG52C.js.map → artifact-writer-GFNKYREE.js.map} +0 -0
- /package/dist/{dist-M4B77IW4.js.map → dist-OY3JSP6Z.js.map} +0 -0
- /package/dist/{interactive-VYQ5SYMR.js.map → interactive-CQELHITQ.js.map} +0 -0
- /package/dist/{ts-eval-loader-EQJX3OLT-THE7D3GR.js.map → ts-eval-loader-RBTB2HG2-H5TRXZLO.js.map} +0 -0
|
@@ -53,7 +53,7 @@ import {
|
|
|
53
53
|
validateTargetsFile,
|
|
54
54
|
validateWorkspacePaths,
|
|
55
55
|
writeRunTags
|
|
56
|
-
} from "./chunk-INOKS5LF.js";
|
|
56
|
+
} from "./chunk-M7AMFWBZ.js";
|
|
57
57
|
import {
|
|
58
58
|
RESULT_INDEX_FILENAME,
|
|
59
59
|
aggregateRunDir,
|
|
@@ -65,7 +65,7 @@ import {
|
|
|
65
65
|
resolveRunManifestPath,
|
|
66
66
|
toSnakeCaseDeep as toSnakeCaseDeep2,
|
|
67
67
|
writeArtifactsFromResults
|
|
68
|
-
} from "./chunk-KJGYL3M3.js";
|
|
68
|
+
} from "./chunk-N6E5XFOM.js";
|
|
69
69
|
import {
|
|
70
70
|
DEFAULT_CATEGORY,
|
|
71
71
|
addProject,
|
|
@@ -86,9 +86,10 @@ import {
|
|
|
86
86
|
touchProject,
|
|
87
87
|
transpileEvalYamlFile,
|
|
88
88
|
trimBaselineResult
|
|
89
|
-
} from "./chunk-KNF3AGCI.js";
|
|
89
|
+
} from "./chunk-OYI35QFW.js";
|
|
90
90
|
import {
|
|
91
91
|
DEFAULT_THRESHOLD,
|
|
92
|
+
buildTraceFromMessages,
|
|
92
93
|
createBuiltinRegistry,
|
|
93
94
|
discoverCopilotSessions,
|
|
94
95
|
executeScript,
|
|
@@ -116,7 +117,7 @@ import {
|
|
|
116
117
|
runStartsWithAssertion,
|
|
117
118
|
toCamelCaseDeep,
|
|
118
119
|
toSnakeCaseDeep
|
|
119
|
-
} from "./chunk-6QEIZ33V.js";
|
|
120
|
+
} from "./chunk-RL4S2FBZ.js";
|
|
120
121
|
import {
|
|
121
122
|
__commonJS,
|
|
122
123
|
__require,
|
|
@@ -4325,16 +4326,25 @@ var evalAssertCommand = command({
|
|
|
4325
4326
|
);
|
|
4326
4327
|
process.exit(1);
|
|
4327
4328
|
}
|
|
4329
|
+
const messages = [{ role: "assistant", content: resolvedOutput }];
|
|
4330
|
+
const inputMessages = [{ role: "user", content: resolvedInput }];
|
|
4331
|
+
const trace = buildTraceFromMessages({
|
|
4332
|
+
input: inputMessages,
|
|
4333
|
+
output: messages,
|
|
4334
|
+
finalOutput: resolvedOutput
|
|
4335
|
+
});
|
|
4328
4336
|
const payload = JSON.stringify(
|
|
4329
4337
|
{
|
|
4330
|
-
output:
|
|
4331
|
-
|
|
4338
|
+
output: resolvedOutput,
|
|
4339
|
+
answer: resolvedOutput,
|
|
4340
|
+
messages,
|
|
4341
|
+
input: inputMessages,
|
|
4332
4342
|
question: resolvedInput,
|
|
4333
4343
|
criteria: "",
|
|
4334
4344
|
expected_output: [],
|
|
4335
4345
|
reference_answer: "",
|
|
4336
4346
|
input_files: [],
|
|
4337
|
-
trace
|
|
4347
|
+
trace,
|
|
4338
4348
|
token_usage: null,
|
|
4339
4349
|
cost_usd: null,
|
|
4340
4350
|
duration_ms: null,
|
|
@@ -4581,11 +4591,21 @@ var evalRunCommand = command({
|
|
|
4581
4591
|
type: optional(string),
|
|
4582
4592
|
long: "transcript",
|
|
4583
4593
|
description: "Grade a pre-recorded transcript JSONL instead of invoking a live provider. Ignores targets."
|
|
4594
|
+
}),
|
|
4595
|
+
recordReplay: option({
|
|
4596
|
+
type: optional(string),
|
|
4597
|
+
long: "record-replay",
|
|
4598
|
+
description: "Append live target outputs to a replay fixture JSONL file. Graders still run normally."
|
|
4599
|
+
}),
|
|
4600
|
+
recordReplayVariant: option({
|
|
4601
|
+
type: optional(string),
|
|
4602
|
+
long: "record-replay-variant",
|
|
4603
|
+
description: "Optional variant key to store with --record-replay fixture rows."
|
|
4584
4604
|
})
|
|
4585
4605
|
},
|
|
4586
4606
|
handler: async (args) => {
|
|
4587
4607
|
if (args.evalPaths.length === 0 && process.stdin.isTTY) {
|
|
4588
|
-
const { launchInteractiveWizard } = await import("./interactive-VYQ5SYMR.js");
|
|
4608
|
+
const { launchInteractiveWizard } = await import("./interactive-CQELHITQ.js");
|
|
4589
4609
|
await launchInteractiveWizard();
|
|
4590
4610
|
return;
|
|
4591
4611
|
}
|
|
@@ -4634,7 +4654,9 @@ var evalRunCommand = command({
|
|
|
4634
4654
|
budgetUsd: args.budgetUsd,
|
|
4635
4655
|
tag: args.tag,
|
|
4636
4656
|
excludeTag: args.excludeTag,
|
|
4637
|
-
transcript: args.transcript
|
|
4657
|
+
transcript: args.transcript,
|
|
4658
|
+
recordReplay: args.recordReplay,
|
|
4659
|
+
recordReplayVariant: args.recordReplayVariant
|
|
4638
4660
|
};
|
|
4639
4661
|
const result = await runEvalCommand({ testFiles: resolvedPaths, rawOptions });
|
|
4640
4662
|
if (result?.allExecutionErrors) {
|
|
@@ -6730,9 +6752,16 @@ async function runScore(results, evaluatorConfig, testIdFilter) {
|
|
|
6730
6752
|
const scored = [];
|
|
6731
6753
|
for (const raw of results) {
|
|
6732
6754
|
if (testIdFilter && raw.test_id !== testIdFilter) continue;
|
|
6733
|
-
const trace = toTraceSummary(raw);
|
|
6734
6755
|
const candidate = extractCandidate(raw);
|
|
6735
|
-
const output = raw.output;
|
|
6756
|
+
const output = raw.trace?.messages ?? (Array.isArray(raw.output) ? raw.output : void 0);
|
|
6757
|
+
const outputMessages = Array.isArray(output) ? toCamelCaseDeep(output) : void 0;
|
|
6758
|
+
const trace = raw.trace && Array.isArray(raw.trace.messages) && Array.isArray(raw.trace.events) ? toCamelCaseDeep(raw.trace) : buildTraceFromMessages({
|
|
6759
|
+
output: outputMessages,
|
|
6760
|
+
finalOutput: candidate,
|
|
6761
|
+
summary: toTraceSummary(raw),
|
|
6762
|
+
target: raw.target,
|
|
6763
|
+
testId: raw.test_id
|
|
6764
|
+
});
|
|
6736
6765
|
const evalContext = {
|
|
6737
6766
|
evalCase: buildTestCase(raw),
|
|
6738
6767
|
candidate,
|
|
@@ -6741,7 +6770,7 @@ async function runScore(results, evaluatorConfig, testIdFilter) {
|
|
|
6741
6770
|
attempt: 1,
|
|
6742
6771
|
promptInputs: { question: "" },
|
|
6743
6772
|
now: /* @__PURE__ */ new Date(),
|
|
6744
|
-
output:
|
|
6773
|
+
output: outputMessages,
|
|
6745
6774
|
trace,
|
|
6746
6775
|
tokenUsage: raw.token_usage ? toCamelCaseDeep(raw.token_usage) : void 0,
|
|
6747
6776
|
costUsd: raw.cost_usd,
|
|
@@ -7102,7 +7131,7 @@ function renderScores(scores) {
|
|
|
7102
7131
|
}).join(" | ");
|
|
7103
7132
|
}
|
|
7104
7133
|
function renderTree(result) {
|
|
7105
|
-
const messages = result.output;
|
|
7134
|
+
const messages = result.trace?.messages ?? (Array.isArray(result.output) ? result.output : void 0);
|
|
7106
7135
|
const spans = getTraceSpans(result);
|
|
7107
7136
|
if (!messages || messages.length === 0) {
|
|
7108
7137
|
if (spans.length > 0) {
|
|
@@ -7759,13 +7788,22 @@ async function runCodeGraders(tasks, concurrency) {
|
|
|
7759
7788
|
const { testId, resultsDir, responseText, inputData } = task;
|
|
7760
7789
|
const graderName = graderConfig.name;
|
|
7761
7790
|
const inputText = extractInputText(inputData.input);
|
|
7791
|
+
const messages = [{ role: "assistant", content: responseText }];
|
|
7792
|
+
const trace = buildTraceFromMessages({
|
|
7793
|
+
input: inputData.input,
|
|
7794
|
+
output: messages,
|
|
7795
|
+
finalOutput: responseText,
|
|
7796
|
+
testId
|
|
7797
|
+
});
|
|
7762
7798
|
const payload = JSON.stringify({
|
|
7763
|
-
output:
|
|
7799
|
+
output: responseText,
|
|
7800
|
+
answer: responseText,
|
|
7801
|
+
messages,
|
|
7764
7802
|
input: inputData.input,
|
|
7765
7803
|
criteria: "",
|
|
7766
7804
|
expected_output: [],
|
|
7767
7805
|
input_files: inputData.input_files ?? [],
|
|
7768
|
-
trace
|
|
7806
|
+
trace,
|
|
7769
7807
|
token_usage: null,
|
|
7770
7808
|
cost_usd: null,
|
|
7771
7809
|
duration_ms: null,
|
|
@@ -10939,7 +10977,7 @@ function renderResultsReport(results, sourceFile, records, benchmarkEvalFile) {
|
|
|
10939
10977
|
(result, index) => serializeReportResult(result, sourceFile, records[index], benchmarkEvalFile)
|
|
10940
10978
|
);
|
|
10941
10979
|
const dataJson = JSON.stringify(rows).replace(/<\//g, "<\\/");
|
|
10942
|
-
return RESULTS_REPORT_TEMPLATE.replace("__DATA_PLACEHOLDER__", dataJson);
|
|
10980
|
+
return RESULTS_REPORT_TEMPLATE.replace("__DATA_PLACEHOLDER__", () => dataJson);
|
|
10943
10981
|
}
|
|
10944
10982
|
async function writeResultsReport(source, outputPath, cwd) {
|
|
10945
10983
|
const { sourceFile, results, records, benchmarkEvalFile } = await loadReportSource(source, cwd);
|
|
@@ -10999,7 +11037,7 @@ function formatInput(result) {
|
|
|
10999
11037
|
}
|
|
11000
11038
|
function formatOutput(result) {
|
|
11001
11039
|
if (!result.output || result.output.length === 0) return "(no output)";
|
|
11002
|
-
return result.output
|
|
11040
|
+
return result.output;
|
|
11003
11041
|
}
|
|
11004
11042
|
function formatShow(result) {
|
|
11005
11043
|
const usage = result.tokenUsage;
|
|
@@ -12075,6 +12113,16 @@ function inferLanguage(filePath) {
|
|
|
12075
12113
|
};
|
|
12076
12114
|
return langMap[ext] ?? "plaintext";
|
|
12077
12115
|
}
|
|
12116
|
+
function inferRawContentType(filePath) {
|
|
12117
|
+
const ext = path23.extname(filePath).toLowerCase();
|
|
12118
|
+
if (ext === ".json") return "application/json; charset=utf-8";
|
|
12119
|
+
if (ext === ".jsonl") return "text/plain; charset=utf-8";
|
|
12120
|
+
if (ext === ".md") return "text/markdown; charset=utf-8";
|
|
12121
|
+
return "text/plain; charset=utf-8";
|
|
12122
|
+
}
|
|
12123
|
+
function contentDispositionFilename(filePath) {
|
|
12124
|
+
return path23.basename(filePath).replace(/["\\\r\n]/g, "_");
|
|
12125
|
+
}
|
|
12078
12126
|
function stripHeavyFields(results) {
|
|
12079
12127
|
return results.map((r) => {
|
|
12080
12128
|
const { requests, trace, ...rest } = r;
|
|
@@ -12475,6 +12523,8 @@ async function handleEvalFiles(c4, { searchDir, projectId }) {
|
|
|
12475
12523
|
record.input_path,
|
|
12476
12524
|
record.output_path,
|
|
12477
12525
|
record.response_path,
|
|
12526
|
+
record.answer_path,
|
|
12527
|
+
record.transcript_path,
|
|
12478
12528
|
record.task_dir,
|
|
12479
12529
|
record.eval_path,
|
|
12480
12530
|
record.targets_path,
|
|
@@ -12502,7 +12552,13 @@ async function handleEvalFileContent(c4, { searchDir, projectId }) {
|
|
|
12502
12552
|
if (!meta) return c4.json({ error: "Run not found" }, 404);
|
|
12503
12553
|
const marker = "/files/";
|
|
12504
12554
|
const markerIdx = c4.req.path.indexOf(marker);
|
|
12505
|
-
const
|
|
12555
|
+
const encodedFilePath = markerIdx >= 0 ? c4.req.path.slice(markerIdx + marker.length) : "";
|
|
12556
|
+
let filePath = "";
|
|
12557
|
+
try {
|
|
12558
|
+
filePath = encodedFilePath ? decodeURIComponent(encodedFilePath) : "";
|
|
12559
|
+
} catch {
|
|
12560
|
+
return c4.json({ error: "Invalid file path encoding" }, 400);
|
|
12561
|
+
}
|
|
12506
12562
|
if (!filePath) return c4.json({ error: "No file path specified" }, 400);
|
|
12507
12563
|
await ensureRunReadable(searchDir, meta, projectId);
|
|
12508
12564
|
const baseDir = path23.dirname(meta.path);
|
|
@@ -12515,6 +12571,16 @@ async function handleEvalFileContent(c4, { searchDir, projectId }) {
|
|
|
12515
12571
|
}
|
|
12516
12572
|
try {
|
|
12517
12573
|
const fileContent = readFileSync12(absolutePath, "utf8");
|
|
12574
|
+
if (c4.req.query("raw") === "1" || c4.req.query("download") === "1") {
|
|
12575
|
+
c4.header("Content-Type", inferRawContentType(absolutePath));
|
|
12576
|
+
if (c4.req.query("download") === "1") {
|
|
12577
|
+
c4.header(
|
|
12578
|
+
"Content-Disposition",
|
|
12579
|
+
`attachment; filename="${contentDispositionFilename(absolutePath)}"`
|
|
12580
|
+
);
|
|
12581
|
+
}
|
|
12582
|
+
return c4.body(fileContent);
|
|
12583
|
+
}
|
|
12518
12584
|
const language = inferLanguage(absolutePath);
|
|
12519
12585
|
return c4.json({ content: fileContent, language });
|
|
12520
12586
|
} catch {
|
|
@@ -15426,4 +15492,4 @@ export {
|
|
|
15426
15492
|
preprocessArgv,
|
|
15427
15493
|
runCli
|
|
15428
15494
|
};
|
|
15429
|
-
//# sourceMappingURL=chunk-CRMGUVRZ.js.map
|
|
15495
|
+
//# sourceMappingURL=chunk-P4LSNFZR.js.map
|