agentv 4.35.1 → 4.37.0-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. package/dist/{artifact-writer-G57MG52C.js → artifact-writer-GFNKYREE.js} +4 -4
  2. package/dist/{chunk-INOKS5LF.js → chunk-M7AMFWBZ.js} +275 -58
  3. package/dist/chunk-M7AMFWBZ.js.map +1 -0
  4. package/dist/{chunk-KJGYL3M3.js → chunk-N6E5XFOM.js} +213 -85
  5. package/dist/chunk-N6E5XFOM.js.map +1 -0
  6. package/dist/{chunk-KNF3AGCI.js → chunk-OYI35QFW.js} +314 -49
  7. package/dist/chunk-OYI35QFW.js.map +1 -0
  8. package/dist/{chunk-CRMGUVRZ.js → chunk-P4LSNFZR.js} +85 -19
  9. package/dist/chunk-P4LSNFZR.js.map +1 -0
  10. package/dist/{chunk-6QEIZ33V.js → chunk-RL4S2FBZ.js} +2700 -456
  11. package/dist/chunk-RL4S2FBZ.js.map +1 -0
  12. package/dist/cli.js +5 -5
  13. package/dist/dashboard/assets/index-9tV-u4HJ.css +1 -0
  14. package/dist/dashboard/assets/{index-Bdk-9a_8.js → index-BDRYJsGF.js} +1 -1
  15. package/dist/dashboard/assets/index-DuESU7zZ.js +118 -0
  16. package/dist/dashboard/index.html +2 -2
  17. package/dist/{dist-M4B77IW4.js → dist-OY3JSP6Z.js} +125 -3
  18. package/dist/index.js +5 -5
  19. package/dist/{interactive-VYQ5SYMR.js → interactive-CQELHITQ.js} +5 -5
  20. package/dist/skills/agentv-eval-writer/SKILL.md +6 -0
  21. package/dist/{ts-eval-loader-EQJX3OLT-THE7D3GR.js → ts-eval-loader-RBTB2HG2-H5TRXZLO.js} +2 -2
  22. package/package.json +1 -1
  23. package/dist/chunk-6QEIZ33V.js.map +0 -1
  24. package/dist/chunk-CRMGUVRZ.js.map +0 -1
  25. package/dist/chunk-INOKS5LF.js.map +0 -1
  26. package/dist/chunk-KJGYL3M3.js.map +0 -1
  27. package/dist/chunk-KNF3AGCI.js.map +0 -1
  28. package/dist/dashboard/assets/index-BPMAZqjE.css +0 -1
  29. package/dist/dashboard/assets/index-BWO0UcxG.js +0 -118
  30. /package/dist/{artifact-writer-G57MG52C.js.map → artifact-writer-GFNKYREE.js.map} +0 -0
  31. /package/dist/{dist-M4B77IW4.js.map → dist-OY3JSP6Z.js.map} +0 -0
  32. /package/dist/{interactive-VYQ5SYMR.js.map → interactive-CQELHITQ.js.map} +0 -0
  33. /package/dist/{ts-eval-loader-EQJX3OLT-THE7D3GR.js.map → ts-eval-loader-RBTB2HG2-H5TRXZLO.js.map} +0 -0
@@ -53,7 +53,7 @@ import {
53
53
  validateTargetsFile,
54
54
  validateWorkspacePaths,
55
55
  writeRunTags
56
- } from "./chunk-INOKS5LF.js";
56
+ } from "./chunk-M7AMFWBZ.js";
57
57
  import {
58
58
  RESULT_INDEX_FILENAME,
59
59
  aggregateRunDir,
@@ -65,7 +65,7 @@ import {
65
65
  resolveRunManifestPath,
66
66
  toSnakeCaseDeep as toSnakeCaseDeep2,
67
67
  writeArtifactsFromResults
68
- } from "./chunk-KJGYL3M3.js";
68
+ } from "./chunk-N6E5XFOM.js";
69
69
  import {
70
70
  DEFAULT_CATEGORY,
71
71
  addProject,
@@ -86,9 +86,10 @@ import {
86
86
  touchProject,
87
87
  transpileEvalYamlFile,
88
88
  trimBaselineResult
89
- } from "./chunk-KNF3AGCI.js";
89
+ } from "./chunk-OYI35QFW.js";
90
90
  import {
91
91
  DEFAULT_THRESHOLD,
92
+ buildTraceFromMessages,
92
93
  createBuiltinRegistry,
93
94
  discoverCopilotSessions,
94
95
  executeScript,
@@ -116,7 +117,7 @@ import {
116
117
  runStartsWithAssertion,
117
118
  toCamelCaseDeep,
118
119
  toSnakeCaseDeep
119
- } from "./chunk-6QEIZ33V.js";
120
+ } from "./chunk-RL4S2FBZ.js";
120
121
  import {
121
122
  __commonJS,
122
123
  __require,
@@ -4325,16 +4326,25 @@ var evalAssertCommand = command({
4325
4326
  );
4326
4327
  process.exit(1);
4327
4328
  }
4329
+ const messages = [{ role: "assistant", content: resolvedOutput }];
4330
+ const inputMessages = [{ role: "user", content: resolvedInput }];
4331
+ const trace = buildTraceFromMessages({
4332
+ input: inputMessages,
4333
+ output: messages,
4334
+ finalOutput: resolvedOutput
4335
+ });
4328
4336
  const payload = JSON.stringify(
4329
4337
  {
4330
- output: [{ role: "assistant", content: resolvedOutput }],
4331
- input: [{ role: "user", content: resolvedInput }],
4338
+ output: resolvedOutput,
4339
+ answer: resolvedOutput,
4340
+ messages,
4341
+ input: inputMessages,
4332
4342
  question: resolvedInput,
4333
4343
  criteria: "",
4334
4344
  expected_output: [],
4335
4345
  reference_answer: "",
4336
4346
  input_files: [],
4337
- trace: null,
4347
+ trace,
4338
4348
  token_usage: null,
4339
4349
  cost_usd: null,
4340
4350
  duration_ms: null,
@@ -4581,11 +4591,21 @@ var evalRunCommand = command({
4581
4591
  type: optional(string),
4582
4592
  long: "transcript",
4583
4593
  description: "Grade a pre-recorded transcript JSONL instead of invoking a live provider. Ignores targets."
4594
+ }),
4595
+ recordReplay: option({
4596
+ type: optional(string),
4597
+ long: "record-replay",
4598
+ description: "Append live target outputs to a replay fixture JSONL file. Graders still run normally."
4599
+ }),
4600
+ recordReplayVariant: option({
4601
+ type: optional(string),
4602
+ long: "record-replay-variant",
4603
+ description: "Optional variant key to store with --record-replay fixture rows."
4584
4604
  })
4585
4605
  },
4586
4606
  handler: async (args) => {
4587
4607
  if (args.evalPaths.length === 0 && process.stdin.isTTY) {
4588
- const { launchInteractiveWizard } = await import("./interactive-VYQ5SYMR.js");
4608
+ const { launchInteractiveWizard } = await import("./interactive-CQELHITQ.js");
4589
4609
  await launchInteractiveWizard();
4590
4610
  return;
4591
4611
  }
@@ -4634,7 +4654,9 @@ var evalRunCommand = command({
4634
4654
  budgetUsd: args.budgetUsd,
4635
4655
  tag: args.tag,
4636
4656
  excludeTag: args.excludeTag,
4637
- transcript: args.transcript
4657
+ transcript: args.transcript,
4658
+ recordReplay: args.recordReplay,
4659
+ recordReplayVariant: args.recordReplayVariant
4638
4660
  };
4639
4661
  const result = await runEvalCommand({ testFiles: resolvedPaths, rawOptions });
4640
4662
  if (result?.allExecutionErrors) {
@@ -6730,9 +6752,16 @@ async function runScore(results, evaluatorConfig, testIdFilter) {
6730
6752
  const scored = [];
6731
6753
  for (const raw of results) {
6732
6754
  if (testIdFilter && raw.test_id !== testIdFilter) continue;
6733
- const trace = toTraceSummary(raw);
6734
6755
  const candidate = extractCandidate(raw);
6735
- const output = raw.output;
6756
+ const output = raw.trace?.messages ?? (Array.isArray(raw.output) ? raw.output : void 0);
6757
+ const outputMessages = Array.isArray(output) ? toCamelCaseDeep(output) : void 0;
6758
+ const trace = raw.trace && Array.isArray(raw.trace.messages) && Array.isArray(raw.trace.events) ? toCamelCaseDeep(raw.trace) : buildTraceFromMessages({
6759
+ output: outputMessages,
6760
+ finalOutput: candidate,
6761
+ summary: toTraceSummary(raw),
6762
+ target: raw.target,
6763
+ testId: raw.test_id
6764
+ });
6736
6765
  const evalContext = {
6737
6766
  evalCase: buildTestCase(raw),
6738
6767
  candidate,
@@ -6741,7 +6770,7 @@ async function runScore(results, evaluatorConfig, testIdFilter) {
6741
6770
  attempt: 1,
6742
6771
  promptInputs: { question: "" },
6743
6772
  now: /* @__PURE__ */ new Date(),
6744
- output: Array.isArray(output) ? output : void 0,
6773
+ output: outputMessages,
6745
6774
  trace,
6746
6775
  tokenUsage: raw.token_usage ? toCamelCaseDeep(raw.token_usage) : void 0,
6747
6776
  costUsd: raw.cost_usd,
@@ -7102,7 +7131,7 @@ function renderScores(scores) {
7102
7131
  }).join(" | ");
7103
7132
  }
7104
7133
  function renderTree(result) {
7105
- const messages = result.output;
7134
+ const messages = result.trace?.messages ?? (Array.isArray(result.output) ? result.output : void 0);
7106
7135
  const spans = getTraceSpans(result);
7107
7136
  if (!messages || messages.length === 0) {
7108
7137
  if (spans.length > 0) {
@@ -7759,13 +7788,22 @@ async function runCodeGraders(tasks, concurrency) {
7759
7788
  const { testId, resultsDir, responseText, inputData } = task;
7760
7789
  const graderName = graderConfig.name;
7761
7790
  const inputText = extractInputText(inputData.input);
7791
+ const messages = [{ role: "assistant", content: responseText }];
7792
+ const trace = buildTraceFromMessages({
7793
+ input: inputData.input,
7794
+ output: messages,
7795
+ finalOutput: responseText,
7796
+ testId
7797
+ });
7762
7798
  const payload = JSON.stringify({
7763
- output: [{ role: "assistant", content: responseText }],
7799
+ output: responseText,
7800
+ answer: responseText,
7801
+ messages,
7764
7802
  input: inputData.input,
7765
7803
  criteria: "",
7766
7804
  expected_output: [],
7767
7805
  input_files: inputData.input_files ?? [],
7768
- trace: null,
7806
+ trace,
7769
7807
  token_usage: null,
7770
7808
  cost_usd: null,
7771
7809
  duration_ms: null,
@@ -10939,7 +10977,7 @@ function renderResultsReport(results, sourceFile, records, benchmarkEvalFile) {
10939
10977
  (result, index) => serializeReportResult(result, sourceFile, records[index], benchmarkEvalFile)
10940
10978
  );
10941
10979
  const dataJson = JSON.stringify(rows).replace(/<\//g, "<\\/");
10942
- return RESULTS_REPORT_TEMPLATE.replace("__DATA_PLACEHOLDER__", dataJson);
10980
+ return RESULTS_REPORT_TEMPLATE.replace("__DATA_PLACEHOLDER__", () => dataJson);
10943
10981
  }
10944
10982
  async function writeResultsReport(source, outputPath, cwd) {
10945
10983
  const { sourceFile, results, records, benchmarkEvalFile } = await loadReportSource(source, cwd);
@@ -10999,7 +11037,7 @@ function formatInput(result) {
10999
11037
  }
11000
11038
  function formatOutput(result) {
11001
11039
  if (!result.output || result.output.length === 0) return "(no output)";
11002
- return result.output.map((msg) => String(msg.content ?? "")).join("\n");
11040
+ return result.output;
11003
11041
  }
11004
11042
  function formatShow(result) {
11005
11043
  const usage = result.tokenUsage;
@@ -12075,6 +12113,16 @@ function inferLanguage(filePath) {
12075
12113
  };
12076
12114
  return langMap[ext] ?? "plaintext";
12077
12115
  }
12116
+ function inferRawContentType(filePath) {
12117
+ const ext = path23.extname(filePath).toLowerCase();
12118
+ if (ext === ".json") return "application/json; charset=utf-8";
12119
+ if (ext === ".jsonl") return "text/plain; charset=utf-8";
12120
+ if (ext === ".md") return "text/markdown; charset=utf-8";
12121
+ return "text/plain; charset=utf-8";
12122
+ }
12123
+ function contentDispositionFilename(filePath) {
12124
+ return path23.basename(filePath).replace(/["\\\r\n]/g, "_");
12125
+ }
12078
12126
  function stripHeavyFields(results) {
12079
12127
  return results.map((r) => {
12080
12128
  const { requests, trace, ...rest } = r;
@@ -12475,6 +12523,8 @@ async function handleEvalFiles(c4, { searchDir, projectId }) {
12475
12523
  record.input_path,
12476
12524
  record.output_path,
12477
12525
  record.response_path,
12526
+ record.answer_path,
12527
+ record.transcript_path,
12478
12528
  record.task_dir,
12479
12529
  record.eval_path,
12480
12530
  record.targets_path,
@@ -12502,7 +12552,13 @@ async function handleEvalFileContent(c4, { searchDir, projectId }) {
12502
12552
  if (!meta) return c4.json({ error: "Run not found" }, 404);
12503
12553
  const marker = "/files/";
12504
12554
  const markerIdx = c4.req.path.indexOf(marker);
12505
- const filePath = markerIdx >= 0 ? c4.req.path.slice(markerIdx + marker.length) : "";
12555
+ const encodedFilePath = markerIdx >= 0 ? c4.req.path.slice(markerIdx + marker.length) : "";
12556
+ let filePath = "";
12557
+ try {
12558
+ filePath = encodedFilePath ? decodeURIComponent(encodedFilePath) : "";
12559
+ } catch {
12560
+ return c4.json({ error: "Invalid file path encoding" }, 400);
12561
+ }
12506
12562
  if (!filePath) return c4.json({ error: "No file path specified" }, 400);
12507
12563
  await ensureRunReadable(searchDir, meta, projectId);
12508
12564
  const baseDir = path23.dirname(meta.path);
@@ -12515,6 +12571,16 @@ async function handleEvalFileContent(c4, { searchDir, projectId }) {
12515
12571
  }
12516
12572
  try {
12517
12573
  const fileContent = readFileSync12(absolutePath, "utf8");
12574
+ if (c4.req.query("raw") === "1" || c4.req.query("download") === "1") {
12575
+ c4.header("Content-Type", inferRawContentType(absolutePath));
12576
+ if (c4.req.query("download") === "1") {
12577
+ c4.header(
12578
+ "Content-Disposition",
12579
+ `attachment; filename="${contentDispositionFilename(absolutePath)}"`
12580
+ );
12581
+ }
12582
+ return c4.body(fileContent);
12583
+ }
12518
12584
  const language = inferLanguage(absolutePath);
12519
12585
  return c4.json({ content: fileContent, language });
12520
12586
  } catch {
@@ -15426,4 +15492,4 @@ export {
15426
15492
  preprocessArgv,
15427
15493
  runCli
15428
15494
  };
15429
- //# sourceMappingURL=chunk-CRMGUVRZ.js.map
15495
+ //# sourceMappingURL=chunk-P4LSNFZR.js.map