agentv 3.13.3 → 3.14.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -22,7 +22,7 @@ import {
22
22
  validateFileReferences,
23
23
  validateTargetsFile,
24
24
  writeArtifactsFromResults
25
- } from "./chunk-PACTPWEN.js";
25
+ } from "./chunk-75PQBKLR.js";
26
26
  import {
27
27
  createBuiltinRegistry,
28
28
  executeScript,
@@ -39,7 +39,7 @@ import {
39
39
  toSnakeCaseDeep as toSnakeCaseDeep2,
40
40
  transpileEvalYamlFile,
41
41
  trimBaselineResult
42
- } from "./chunk-D3LNJUUB.js";
42
+ } from "./chunk-ELQEFMGO.js";
43
43
  import {
44
44
  __commonJS,
45
45
  __esm,
@@ -4185,7 +4185,7 @@ var evalRunCommand = command({
4185
4185
  },
4186
4186
  handler: async (args) => {
4187
4187
  if (args.evalPaths.length === 0 && process.stdin.isTTY) {
4188
- const { launchInteractiveWizard } = await import("./interactive-OMJAMCQP.js");
4188
+ const { launchInteractiveWizard } = await import("./interactive-Q563ULAR.js");
4189
4189
  await launchInteractiveWizard();
4190
4190
  return;
4191
4191
  }
@@ -4408,13 +4408,23 @@ var evalBenchCommand = command({
4408
4408
  type: string,
4409
4409
  displayName: "export-dir",
4410
4410
  description: "Export directory from pipeline input/grade"
4411
+ }),
4412
+ llmScores: option({
4413
+ type: optional(string),
4414
+ long: "llm-scores",
4415
+ description: "Path to LLM scores JSON file (reads from stdin if omitted)"
4411
4416
  })
4412
4417
  },
4413
- handler: async ({ exportDir }) => {
4418
+ handler: async ({ exportDir, llmScores: llmScoresPath }) => {
4414
4419
  const manifest = JSON.parse(await readFile(join(exportDir, "manifest.json"), "utf8"));
4415
4420
  const testIds = manifest.test_ids;
4416
4421
  const targetName = manifest.target?.name ?? "unknown";
4417
- const stdinData = await readStdin();
4422
+ let stdinData;
4423
+ if (llmScoresPath) {
4424
+ stdinData = await readFile(llmScoresPath, "utf8");
4425
+ } else {
4426
+ stdinData = await readStdin();
4427
+ }
4418
4428
  const llmScores = stdinData ? JSON.parse(stdinData) : {};
4419
4429
  const indexLines = [];
4420
4430
  const allPassRates = [];
@@ -4814,6 +4824,351 @@ async function writeJson(filePath, data) {
4814
4824
  `, "utf8");
4815
4825
  }
4816
4826
 
4827
+ // src/commands/pipeline/run.ts
4828
+ import { execSync } from "node:child_process";
4829
+ import { existsSync as existsSync2, readFileSync as readFileSync4, unlinkSync } from "node:fs";
4830
+ import { mkdir as mkdir4, readFile as readFile4, readdir as readdir3, writeFile as writeFile5 } from "node:fs/promises";
4831
+ import { tmpdir } from "node:os";
4832
+ import { dirname as dirname2, join as join4, resolve as resolve2 } from "node:path";
4833
/**
 * Locate the nearest `.env` file by walking from `dir` up to the filesystem
 * root and parse it into a plain key/value object.
 *
 * Parsing rules:
 *  - blank lines and lines starting with `#` are ignored;
 *  - lines without `=` or with an empty key are ignored;
 *  - an optional leading `export ` is dropped from the key;
 *  - a value wrapped in matching single or double quotes has the quotes removed.
 *
 * Only the first `.env` found is read; parent directories are not merged.
 *
 * @param {string} dir - Directory to start searching from.
 * @returns {Record<string, string>} Parsed variables, or `{}` when no `.env` exists.
 */
function loadEnvFile(dir) {
  let current = resolve2(dir);
  while (true) {
    const candidate = join4(current, ".env");
    if (existsSync2(candidate)) {
      const env3 = {};
      for (const line of readFileSync4(candidate, "utf8").split("\n")) {
        // trim() also removes the trailing "\r" of CRLF files.
        const trimmed = line.trim();
        if (!trimmed || trimmed.startsWith("#")) continue;
        const eqIdx = trimmed.indexOf("=");
        if (eqIdx === -1) continue;
        // Accept shell-style `export KEY=value` lines.
        const key = trimmed.slice(0, eqIdx).trim().replace(/^export\s+/, "");
        if (!key) continue;
        let value = trimmed.slice(eqIdx + 1).trim();
        const quote = value[0];
        // Strip one pair of matching surrounding quotes so KEY="a b" yields `a b`.
        if (value.length >= 2 && (quote === '"' || quote === "'") && value.endsWith(quote)) {
          value = value.slice(1, -1);
        }
        env3[key] = value;
      }
      return env3;
    }
    const parent = dirname2(current);
    // dirname() of the root returns the root itself: nothing left to search.
    if (parent === current) break;
    current = parent;
  }
  return {};
}
4854
/**
 * `pipeline run` command: extracts every test of an eval file into an output
 * directory, invokes the CLI target for each test (when the selected target is
 * of kind "cli"), and then runs the code graders written for each test.
 *
 * Output layout (per test, under `<out>/<test.id>/`):
 *   input.json, invoke.json, criteria.md, expected_output.json (optional),
 *   code_graders/*.json, llm_graders/*.json, response.md, timing.json,
 *   code_grader_results/*.json
 * plus `<out>/manifest.json` listing the test ids and target.
 */
var evalRunCommand2 = command({
  name: "run",
  description: "Extract inputs, invoke CLI targets, and run code graders in one step",
  args: {
    evalPath: positional({
      type: string,
      displayName: "eval-path",
      description: "Path to eval YAML file"
    }),
    out: option({
      type: string,
      long: "out",
      description: "Output directory for results"
    }),
    workers: option({
      type: optional(number),
      long: "workers",
      description: "Parallel workers for target invocation (default: all tests)"
    })
  },
  handler: async ({ evalPath, out, workers }) => {
    const resolvedEvalPath = resolve2(evalPath);
    const outDir = resolve2(out);
    const repoRoot = await findRepoRoot(dirname2(resolvedEvalPath));
    const evalDir = dirname2(resolvedEvalPath);
    const suite = await loadTestSuite(resolvedEvalPath, repoRoot);
    const tests = suite.tests;
    if (tests.length === 0) {
      console.error("No tests found in eval file.");
      process.exit(1);
    }

    // ---- Resolve the target; anything that is not a CLI target means agent-as-target.
    let targetInfo = null;
    let targetName = "agent";
    let targetKind = "agent";
    try {
      const selection = await selectTarget({
        testFilePath: resolvedEvalPath,
        repoRoot,
        cwd: evalDir,
        dryRun: false,
        dryRunDelay: 0,
        dryRunDelayMin: 0,
        dryRunDelayMax: 0,
        env: process.env
      });
      targetName = selection.targetName;
      if (selection.resolvedTarget.kind === "cli") {
        targetKind = "cli";
        const config = selection.resolvedTarget.config;
        targetInfo = {
          kind: "cli",
          command: config.command,
          cwd: config.cwd ?? evalDir,
          timeoutMs: config.timeoutMs ?? 3e4
        };
      }
    } catch (error) {
      // Falling back to agent mode is deliberate, but say why so that a
      // misconfigured CLI target is not silently skipped.
      const reason = error instanceof Error ? error.message : String(error);
      console.warn(`Could not resolve a CLI target (${reason}); falling back to agent-as-target mode.`);
    }

    // ---- Extract one directory per test.
    const testIds = [];
    for (const test of tests) {
      const testDir = join4(outDir, test.id);
      await mkdir4(testDir, { recursive: true });
      testIds.push(test.id);
      const inputText = test.question;
      const inputMessages = test.input.map((m) => ({
        role: m.role,
        content: m.content
      }));
      await writeJson2(join4(testDir, "input.json"), {
        input_text: inputText,
        input_messages: inputMessages,
        file_paths: test.file_paths,
        metadata: test.metadata ?? {}
      });
      if (targetInfo) {
        await writeJson2(join4(testDir, "invoke.json"), {
          kind: "cli",
          command: targetInfo.command,
          cwd: targetInfo.cwd,
          timeout_ms: targetInfo.timeoutMs,
          env: {}
        });
      } else {
        await writeJson2(join4(testDir, "invoke.json"), {
          kind: "agent",
          instructions: "Execute this task in the current workspace. The agent IS the target."
        });
      }
      await writeFile5(join4(testDir, "criteria.md"), test.criteria ?? "", "utf8");
      if (test.expected_output.length > 0 || test.reference_answer !== void 0 && test.reference_answer !== "") {
        await writeJson2(join4(testDir, "expected_output.json"), {
          expected_output: test.expected_output,
          reference_answer: test.reference_answer ?? ""
        });
      }
      await writeGraderConfigs2(testDir, test.assertions ?? [], evalDir);
    }
    await writeJson2(join4(outDir, "manifest.json"), {
      eval_file: resolvedEvalPath,
      timestamp: (/* @__PURE__ */ new Date()).toISOString(),
      target: { name: targetName, kind: targetKind },
      test_ids: testIds
    });
    console.log(`Extracted ${testIds.length} test(s) to ${outDir}`);

    // ---- Invoke the CLI target for each test.
    if (targetInfo) {
      const envVars = loadEnvFile(evalDir);
      const mergedEnv = { ...process.env, ...envVars };
      const maxWorkers = workers ?? testIds.length;
      console.log(`Invoking ${testIds.length} CLI target(s) (${maxWorkers} workers)...`);
      // Literal substitution of every occurrence. split/join is used instead of
      // String.replace because replace() would interpret "$&", "$'" or "$1"
      // sequences that happen to appear inside the prompt text.
      const substitute = (text, token, value) => text.split(token).join(value);
      const invokeTarget = async (testId) => {
        const testDir = join4(outDir, testId);
        const invoke = JSON.parse(await readFile4(join4(testDir, "invoke.json"), "utf8"));
        if (invoke.kind !== "cli") return;
        const inputData = JSON.parse(await readFile4(join4(testDir, "input.json"), "utf8"));
        const template = invoke.command;
        const cwd = invoke.cwd;
        const timeoutMs = invoke.timeout_ms ?? 12e4;
        // Test ids may contain path separators; keep the temp files flat in tmpdir.
        const safeId = String(testId).replace(/[^A-Za-z0-9._-]/g, "_");
        const promptFile = join4(tmpdir(), `agentv-prompt-${safeId}-${Date.now()}.txt`);
        const outputFile = join4(tmpdir(), `agentv-output-${safeId}-${Date.now()}.txt`);
        // NOTE(review): {PROMPT} is inserted into a shell command without quoting;
        // templates should prefer {PROMPT_FILE} for untrusted or multi-line prompts.
        let rendered = template;
        rendered = substitute(rendered, "{PROMPT_FILE}", promptFile);
        rendered = substitute(rendered, "{OUTPUT_FILE}", outputFile);
        rendered = substitute(rendered, "{PROMPT}", inputData.input_text);
        let start = performance.now();
        try {
          // Inside the try so a failed write is recorded for this test only.
          await writeFile5(promptFile, inputData.input_text, "utf8");
          start = performance.now();
          // NOTE: execSync blocks the event loop, so invocations effectively run
          // one after another regardless of the requested worker count.
          execSync(rendered, {
            cwd,
            timeout: timeoutMs,
            env: mergedEnv,
            stdio: ["pipe", "pipe", "pipe"],
            maxBuffer: 10 * 1024 * 1024
          });
          const durationMs = Math.round(performance.now() - start);
          let response;
          if (existsSync2(outputFile)) {
            response = readFileSync4(outputFile, "utf8");
          } else {
            response = "ERROR: No output file generated";
          }
          await writeFile5(join4(testDir, "response.md"), response, "utf8");
          await writeJson2(join4(testDir, "timing.json"), {
            duration_ms: durationMs,
            total_duration_seconds: Math.round(durationMs / 10) / 100
          });
          console.log(`  ${testId}: OK (${durationMs}ms, ${response.length} chars)`);
        } catch (error) {
          const durationMs = Math.round(performance.now() - start);
          const message = error instanceof Error ? error.message : String(error);
          const response = `ERROR: target failed \u2014 ${message}`;
          await writeFile5(join4(testDir, "response.md"), response, "utf8");
          await writeJson2(join4(testDir, "timing.json"), {
            duration_ms: durationMs,
            total_duration_seconds: Math.round(durationMs / 10) / 100
          });
          console.error(`  ${testId}: FAILED (${durationMs}ms) \u2014 ${message.slice(0, 200)}`);
        } finally {
          // Best-effort cleanup of the temp files; failures here are not actionable.
          try {
            if (existsSync2(promptFile)) unlinkSync(promptFile);
            if (existsSync2(outputFile)) unlinkSync(outputFile);
          } catch {
          }
        }
      };
      const allTasks = testIds.map((testId) => invokeTarget(testId));
      await Promise.all(allTasks);
    } else {
      console.log("Agent-as-target mode \u2014 skipping CLI invocation.");
    }

    // ---- Run code graders against each response.
    let totalGraders = 0;
    let totalPassed = 0;
    for (const testId of testIds) {
      const testDir = join4(outDir, testId);
      const codeGradersDir = join4(testDir, "code_graders");
      const resultsDir = join4(testDir, "code_grader_results");
      let graderFiles;
      try {
        graderFiles = (await readdir3(codeGradersDir)).filter((f) => f.endsWith(".json"));
      } catch {
        // No code_graders directory: this test has no code graders.
        continue;
      }
      if (graderFiles.length === 0) continue;
      let responseText;
      try {
        responseText = await readFile4(join4(testDir, "response.md"), "utf8");
      } catch (error) {
        // In agent-as-target mode no response exists yet; skip instead of crashing.
        if (!error || error.code !== "ENOENT") throw error;
        console.warn(`  ${testId}: no response.md found \u2014 skipping code graders`);
        continue;
      }
      await mkdir4(resultsDir, { recursive: true });
      const inputData = JSON.parse(await readFile4(join4(testDir, "input.json"), "utf8"));
      for (const graderFile of graderFiles) {
        const graderConfig = JSON.parse(await readFile4(join4(codeGradersDir, graderFile), "utf8"));
        const graderName = graderConfig.name;
        const resultPath = join4(resultsDir, `${graderName}.json`);
        const payload = JSON.stringify({
          output: [{ role: "assistant", content: responseText }],
          input: inputData.input_messages,
          question: inputData.input_text,
          criteria: "",
          expected_output: [],
          reference_answer: "",
          input_files: [],
          trace: null,
          token_usage: null,
          cost_usd: null,
          duration_ms: null,
          start_time: null,
          end_time: null,
          file_changes: null,
          workspace_path: null,
          config: graderConfig.config ?? null,
          metadata: {},
          input_text: inputData.input_text,
          output_text: responseText,
          expected_output_text: ""
        });
        try {
          const stdout = await executeScript(
            graderConfig.command,
            payload,
            void 0,
            graderConfig.cwd
          );
          const parsed = JSON.parse(stdout);
          const score = typeof parsed.score === "number" ? parsed.score : 0;
          const assertions = Array.isArray(parsed.assertions) ? parsed.assertions : [];
          await writeJson2(resultPath, {
            name: graderName,
            type: "code-grader",
            score,
            weight: graderConfig.weight ?? 1,
            assertions,
            details: parsed.details ?? {}
          });
          totalGraders++;
          if (score >= 0.5) totalPassed++;
        } catch (error) {
          // A failing grader is recorded as score 0 rather than aborting the run.
          const message = error instanceof Error ? error.message : String(error);
          console.error(`  ${testId}/${graderName}: ERROR \u2014 ${message}`);
          await writeJson2(resultPath, {
            name: graderName,
            type: "code-grader",
            score: 0,
            weight: graderConfig.weight ?? 1,
            assertions: [{ text: `Error: ${message}`, passed: false }],
            details: { error: message }
          });
          totalGraders++;
        }
      }
    }
    console.log(`Graded ${totalGraders} code-grader(s): ${totalPassed} passed`);
    console.log(`\nDone. Agent can now perform LLM grading on responses in ${outDir}`);
  }
});
5122
/**
 * Serialize `data` as 2-space-indented JSON followed by a trailing newline
 * and write it to `filePath` as UTF-8.
 *
 * @param {string} filePath - Destination file.
 * @param {unknown} data - Value to serialize.
 * @returns {Promise<void>}
 */
async function writeJson2(filePath, data) {
  const serialized = JSON.stringify(data, null, 2);
  const contents = `${serialized}\n`;
  await writeFile5(filePath, contents, "utf8");
}
5126
/**
 * Write one JSON config file per grader assertion of a test.
 *
 * "code-grader" assertions go to `<testDir>/code_graders/<name>.json` and
 * "llm-grader" assertions to `<testDir>/llm_graders/<name>.json`. Each
 * directory is created only when the first assertion of that type is seen;
 * assertions of any other type are ignored.
 *
 * @param {string} testDir - Per-test output directory.
 * @param {Array<object>} assertions - Assertion configs of the test.
 * @param {string} evalDir - Fallback working directory for code graders.
 * @returns {Promise<void>}
 */
async function writeGraderConfigs2(testDir, assertions, evalDir) {
  const targetDirs = {
    "code-grader": join4(testDir, "code_graders"),
    "llm-grader": join4(testDir, "llm_graders")
  };
  const createdKinds = new Set();

  // Create the directory for a grader kind on first use and return its path.
  const dirFor = async (kind) => {
    if (!createdKinds.has(kind)) {
      await mkdir4(targetDirs[kind], { recursive: true });
      createdKinds.add(kind);
    }
    return targetDirs[kind];
  };

  // Pretty-printed JSON with a trailing newline.
  const dumpJson = (target, value) => writeFile5(target, `${JSON.stringify(value, null, 2)}\n`, "utf8");

  // Prefer the prompt file on disk; fall back to the inline prompt string.
  const resolvePrompt = (grader) => {
    const inlinePrompt = typeof grader.prompt === "string" ? grader.prompt : "";
    if (!grader.resolvedPromptPath) {
      return inlinePrompt;
    }
    try {
      return readFileSync4(grader.resolvedPromptPath, "utf8");
    } catch {
      return inlinePrompt;
    }
  };

  for (const grader of assertions) {
    switch (grader.type) {
      case "code-grader": {
        const dir = await dirFor("code-grader");
        await dumpJson(join4(dir, `${grader.name}.json`), {
          name: grader.name,
          command: grader.command,
          cwd: grader.resolvedCwd ?? grader.cwd ?? evalDir,
          weight: grader.weight ?? 1,
          config: grader.config ?? {}
        });
        break;
      }
      case "llm-grader": {
        const dir = await dirFor("llm-grader");
        const promptContent = resolvePrompt(grader);
        await dumpJson(join4(dir, `${grader.name}.json`), {
          name: grader.name,
          prompt_content: promptContent,
          weight: grader.weight ?? 1,
          threshold: 0.5,
          config: {}
        });
        break;
      }
      default:
        break;
    }
  }
}
5171
+
4817
5172
  // src/commands/pipeline/index.ts
4818
5173
  var pipelineCommand = subcommands({
4819
5174
  name: "pipeline",
@@ -4821,7 +5176,8 @@ var pipelineCommand = subcommands({
4821
5176
  cmds: {
4822
5177
  input: evalInputCommand,
4823
5178
  grade: evalGradeCommand,
4824
- bench: evalBenchCommand
5179
+ bench: evalBenchCommand,
5180
+ run: evalRunCommand2
4825
5181
  }
4826
5182
  });
4827
5183
 
@@ -4829,10 +5185,10 @@ var pipelineCommand = subcommands({
4829
5185
  import path7 from "node:path";
4830
5186
 
4831
5187
  // src/commands/results/shared.ts
4832
- import { existsSync as existsSync2 } from "node:fs";
5188
+ import { existsSync as existsSync3 } from "node:fs";
4833
5189
 
4834
5190
  // src/commands/trace/utils.ts
4835
- import { readFileSync as readFileSync4, readdirSync as readdirSync2, statSync as statSync2 } from "node:fs";
5191
+ import { readFileSync as readFileSync5, readdirSync as readdirSync2, statSync as statSync2 } from "node:fs";
4836
5192
  import path6 from "node:path";
4837
5193
  var colors2 = {
4838
5194
  reset: "\x1B[0m",
@@ -4872,7 +5228,7 @@ function resolveTraceResultPath(filePath) {
4872
5228
  return resolveWorkspaceOrFilePath(filePath);
4873
5229
  }
4874
5230
  function loadJsonlRecords(filePath) {
4875
- const content = readFileSync4(filePath, "utf8");
5231
+ const content = readFileSync5(filePath, "utf8");
4876
5232
  const lines = content.trim().split("\n").filter((line) => line.trim());
4877
5233
  return lines.map((line, i) => {
4878
5234
  const record = JSON.parse(line);
@@ -4925,7 +5281,7 @@ function toRawResult(result) {
4925
5281
  };
4926
5282
  }
4927
5283
  function loadOtlpTraceFile(filePath) {
4928
- const parsed = JSON.parse(readFileSync4(filePath, "utf8"));
5284
+ const parsed = JSON.parse(readFileSync5(filePath, "utf8"));
4929
5285
  const spans = parsed.resourceSpans?.flatMap((resource) => resource.scopeSpans ?? []).flatMap((scope) => scope.spans ?? []);
4930
5286
  if (!spans || spans.length === 0) {
4931
5287
  return [];
@@ -5243,14 +5599,14 @@ async function resolveSourceFile(source, cwd) {
5243
5599
  let sourceFile;
5244
5600
  if (source) {
5245
5601
  sourceFile = resolveResultSourcePath(source, cwd);
5246
- if (!existsSync2(sourceFile)) {
5602
+ if (!existsSync3(sourceFile)) {
5247
5603
  console.error(`Error: File not found: ${sourceFile}`);
5248
5604
  process.exit(1);
5249
5605
  }
5250
5606
  } else {
5251
5607
  const cache = await loadRunCache(cwd);
5252
5608
  const cachedFile = cache ? resolveRunCacheFile(cache) : "";
5253
- if (cachedFile && existsSync2(cachedFile)) {
5609
+ if (cachedFile && existsSync3(cachedFile)) {
5254
5610
  sourceFile = cachedFile;
5255
5611
  } else {
5256
5612
  const metas = listResultFiles(cwd, 1);
@@ -5462,7 +5818,7 @@ var resultsShowCommand = command({
5462
5818
  });
5463
5819
 
5464
5820
  // src/commands/results/summary.ts
5465
- import { existsSync as existsSync3, readFileSync as readFileSync5 } from "node:fs";
5821
+ import { existsSync as existsSync4, readFileSync as readFileSync6 } from "node:fs";
5466
5822
  function formatSummary(results, grading) {
5467
5823
  const total = results.length;
5468
5824
  let passed;
@@ -5513,9 +5869,9 @@ var resultsSummaryCommand = command({
5513
5869
  const { results, sourceFile } = await loadResults(source, cwd);
5514
5870
  let grading;
5515
5871
  const gradingPath = sourceFile.replace(/\.jsonl$/, ".grading.json");
5516
- if (existsSync3(gradingPath)) {
5872
+ if (existsSync4(gradingPath)) {
5517
5873
  try {
5518
- grading = JSON.parse(readFileSync5(gradingPath, "utf8"));
5874
+ grading = JSON.parse(readFileSync6(gradingPath, "utf8"));
5519
5875
  } catch {
5520
5876
  }
5521
5877
  }
@@ -5540,7 +5896,7 @@ var resultsCommand = subcommands({
5540
5896
  });
5541
5897
 
5542
5898
  // src/commands/results/serve.ts
5543
- import { existsSync as existsSync4, readFileSync as readFileSync6, writeFileSync as writeFileSync3 } from "node:fs";
5899
+ import { existsSync as existsSync5, readFileSync as readFileSync7, writeFileSync as writeFileSync3 } from "node:fs";
5544
5900
  import path8 from "node:path";
5545
5901
  import { Hono } from "hono";
5546
5902
  function feedbackPath(resultDir) {
@@ -5548,11 +5904,11 @@ function feedbackPath(resultDir) {
5548
5904
  }
5549
5905
  function readFeedback(cwd) {
5550
5906
  const fp = feedbackPath(cwd);
5551
- if (!existsSync4(fp)) {
5907
+ if (!existsSync5(fp)) {
5552
5908
  return { reviews: [] };
5553
5909
  }
5554
5910
  try {
5555
- return JSON.parse(readFileSync6(fp, "utf8"));
5911
+ return JSON.parse(readFileSync7(fp, "utf8"));
5556
5912
  } catch (err2) {
5557
5913
  console.error(`Warning: could not parse ${fp}, starting fresh: ${err2.message}`);
5558
5914
  return { reviews: [] };
@@ -5562,10 +5918,40 @@ function writeFeedback(cwd, data) {
5562
5918
  writeFileSync3(feedbackPath(cwd), `${JSON.stringify(data, null, 2)}
5563
5919
  `, "utf8");
5564
5920
  }
5565
- function createApp(results, resultDir) {
5921
+ function createApp(results, resultDir, cwd, sourceFile) {
5922
+ const searchDir = cwd ?? resultDir;
5566
5923
  const app2 = new Hono();
5567
5924
  app2.get("/", (c3) => {
5568
- return c3.html(generateServeHtml(results));
5925
+ return c3.html(generateServeHtml(results, sourceFile));
5926
+ });
5927
+ app2.get("/api/runs", (c3) => {
5928
+ const metas = listResultFiles(searchDir);
5929
+ return c3.json({
5930
+ runs: metas.map((m) => ({
5931
+ filename: m.filename,
5932
+ path: m.path,
5933
+ timestamp: m.timestamp,
5934
+ test_count: m.testCount,
5935
+ pass_rate: m.passRate,
5936
+ avg_score: m.avgScore,
5937
+ size_bytes: m.sizeBytes
5938
+ }))
5939
+ });
5940
+ });
5941
+ app2.get("/api/runs/:filename", (c3) => {
5942
+ const filename = c3.req.param("filename");
5943
+ const metas = listResultFiles(searchDir);
5944
+ const meta = metas.find((m) => m.filename === filename);
5945
+ if (!meta) {
5946
+ return c3.json({ error: "Run not found" }, 404);
5947
+ }
5948
+ try {
5949
+ const loaded = patchTestIds(loadManifestResults(meta.path));
5950
+ const lightResults = stripHeavyFields(loaded);
5951
+ return c3.json({ results: lightResults, source: meta.filename });
5952
+ } catch (err2) {
5953
+ return c3.json({ error: "Failed to load run" }, 500);
5954
+ }
5569
5955
  });
5570
5956
  app2.get("/api/feedback", (c3) => {
5571
5957
  const data = readFeedback(resultDir);
@@ -5611,11 +5997,8 @@ function createApp(results, resultDir) {
5611
5997
  });
5612
5998
  return app2;
5613
5999
  }
5614
- function escapeHtml(s) {
5615
- return s.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;");
5616
- }
5617
- function generateServeHtml(results) {
5618
- const lightResults = results.map((r) => {
6000
+ function stripHeavyFields(results) {
6001
+ return results.map((r) => {
5619
6002
  const { requests, trace, ...rest } = r;
5620
6003
  const toolCalls = trace?.toolCalls && Object.keys(trace.toolCalls).length > 0 ? trace.toolCalls : void 0;
5621
6004
  const graderDurationMs = (r.scores ?? []).reduce((sum, s) => sum + (s.durationMs ?? 0), 0);
@@ -5625,6 +6008,12 @@ function generateServeHtml(results) {
5625
6008
  ...graderDurationMs > 0 && { _graderDurationMs: graderDurationMs }
5626
6009
  };
5627
6010
  });
6011
+ }
6012
/**
 * Escape the characters that are significant in HTML text and attribute
 * values: `&`, `<`, `>`, `"` and `'`.
 *
 * The ampersand is replaced first so entities produced by the later
 * replacements are not double-escaped. Single quotes are escaped as well so
 * the result is safe inside single-quoted attributes.
 *
 * @param {string} s - Raw text.
 * @returns {string} HTML-safe text.
 */
function escapeHtml(s) {
  return s.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;").replace(/'/g, "&#39;");
}
6015
+ function generateServeHtml(results, sourceFile) {
6016
+ const lightResults = stripHeavyFields(results);
5628
6017
  const dataJson = JSON.stringify(lightResults).replace(/</g, "\\u003c").replace(/>/g, "\\u003e").replace(/\u2028/g, "\\u2028").replace(/\u2029/g, "\\u2029");
5629
6018
  return `<!DOCTYPE html>
5630
6019
  <html lang="en">
@@ -5642,6 +6031,11 @@ ${SERVE_STYLES}
5642
6031
  <h1 class="header-title">AgentV</h1>
5643
6032
  <span class="header-subtitle">Results Review</span>
5644
6033
  </div>
6034
+ <div class="header-center">
6035
+ <select id="run-picker" class="run-picker" title="Switch result file">
6036
+ <option value="">Loading runs...</option>
6037
+ </select>
6038
+ </div>
5645
6039
  <div class="header-right">
5646
6040
  <span class="timestamp">${escapeHtml((/* @__PURE__ */ new Date()).toISOString())}</span>
5647
6041
  </div>
@@ -5653,6 +6047,7 @@ ${SERVE_STYLES}
5653
6047
  <main id="app"></main>
5654
6048
  <script>
5655
6049
  var DATA = ${dataJson};
6050
+ var INITIAL_SOURCE = ${sourceFile ? JSON.stringify(path8.basename(sourceFile)).replace(/</g, "\\u003c").replace(/>/g, "\\u003e") : "null"};
5656
6051
  ${SERVE_SCRIPT}
5657
6052
  </script>
5658
6053
  </body>
@@ -5679,6 +6074,10 @@ body{font-family:var(--font);background:var(--bg);color:var(--text);line-height:
5679
6074
  .header-left{display:flex;align-items:baseline;gap:12px}
5680
6075
  .header-title{font-size:18px;font-weight:600}
5681
6076
  .header-subtitle{font-size:14px;color:var(--text-muted)}
6077
+ .header-center{flex:1;display:flex;justify-content:center;padding:0 16px}
6078
+ .run-picker{padding:6px 10px;border:1px solid var(--border);border-radius:var(--radius);font-size:13px;background:var(--surface);color:var(--text);font-family:var(--font);max-width:400px;width:100%;cursor:pointer}
6079
+ .run-picker:hover{border-color:var(--primary)}
6080
+ .run-picker:focus{outline:none;border-color:var(--primary);box-shadow:0 0 0 3px var(--primary-bg)}
5682
6081
  .timestamp{font-size:12px;color:var(--text-muted);font-family:var(--mono)}
5683
6082
 
5684
6083
  /* Tabs */
@@ -5778,6 +6177,11 @@ body{font-family:var(--font);background:var(--bg);color:var(--text);line-height:
5778
6177
  .tool-tag{display:inline-block;padding:2px 10px;font-size:12px;font-family:var(--mono);background:var(--primary-bg);color:var(--primary);border:1px solid var(--border);border-radius:12px}
5779
6178
  .empty-state{text-align:center;padding:48px 24px;color:var(--text-muted)}
5780
6179
  .empty-state h3{font-size:16px;margin-bottom:8px;color:var(--text)}
6180
+ .welcome-state{text-align:center;padding:80px 24px;color:var(--text-muted)}
6181
+ .welcome-state h2{font-size:24px;margin-bottom:12px;color:var(--text);font-weight:600}
6182
+ .welcome-state p{font-size:15px;margin-bottom:8px;max-width:500px;margin-left:auto;margin-right:auto}
6183
+ .welcome-state code{font-family:var(--mono);background:var(--surface);border:1px solid var(--border);border-radius:3px;padding:2px 6px;font-size:13px}
6184
+ .welcome-state .hint{margin-top:24px;font-size:13px;color:var(--text-muted)}
5781
6185
 
5782
6186
  /* Feedback */
5783
6187
  .feedback-section{margin-top:16px;padding-top:16px;border-top:1px solid var(--border-light)}
@@ -5935,7 +6339,15 @@ var SERVE_SCRIPT = `
5935
6339
 
5936
6340
  /* ---- render ---- */
5937
6341
  function render(){
5938
- if(DATA.length===0){app.innerHTML='<div class="empty-state"><h3>No results</h3><p>No evaluation results to display.</p></div>';return;}
6342
+ if(DATA.length===0){
6343
+ app.innerHTML='<div class="welcome-state">'
6344
+ +'<h2>No results yet</h2>'
6345
+ +'<p>Run an evaluation or mount a results directory to see results here.</p>'
6346
+ +'<p><code>agentv eval &lt;eval-file&gt;</code></p>'
6347
+ +'<p class="hint">The dashboard will automatically detect new result files.</p>'
6348
+ +'</div>';
6349
+ return;
6350
+ }
5939
6351
  if(state.tab==="overview")renderOverview();else renderTests();
5940
6352
  }
5941
6353
 
@@ -6198,6 +6610,69 @@ var SERVE_SCRIPT = `
6198
6610
  return h;
6199
6611
  }
6200
6612
 
6613
+ /* ---- run picker ---- */
6614
+ var runPicker=document.getElementById("run-picker");
6615
+ var knownRunFilenames=[];
6616
+
6617
/* Fetch the list of result files, auto-load the newest one when the dashboard
   is still empty, and rebuild the picker without losing the current selection. */
function refreshRunList(){
  fetch("/api/runs").then(function(r){return r.json();}).then(function(d){
    if(!d||!d.runs)return;
    var runs=d.runs;
    var newFilenames=runs.map(function(r){return r.filename;});

    /* Detect new runs that appeared since last poll. A flag on the function
       (not the length of the known list) marks the first poll, so a dashboard
       that started with zero result files still notices its first run. */
    var isFirstPoll=!refreshRunList.hasPolled;
    refreshRunList.hasPolled=true;
    if(!isFirstPoll&&DATA.length===0&&runs.length>0){
      var hasNew=newFilenames.some(function(f){return knownRunFilenames.indexOf(f)===-1;});
      if(hasNew){
        /* Auto-load the first (most recent) run when starting from empty state */
        loadRun(runs[0].filename);
      }
    }
    knownRunFilenames=newFilenames;

    /* Rebuild picker options */
    var h='<option value="">Select a result file...</option>';
    if(runs.length===0){
      h='<option value="">No result files</option>';
    }
    for(var i=0;i<runs.length;i++){
      var r=runs[i];
      var label=r.filename+" ("+r.test_count+" tests, "+(r.pass_rate*100).toFixed(0)+"% pass)";
      h+='<option value="'+esc(r.filename)+'">'+esc(label)+"</option>";
    }
    /* Remember what is selected before the options are replaced */
    var previous=runPicker.value;
    runPicker.innerHTML=h;
    if(previous&&newFilenames.indexOf(previous)!==-1){
      /* Keep the run the user picked instead of snapping back on every poll */
      runPicker.value=previous;
    }else if(INITIAL_SOURCE&&runs.length>0){
      /* Pre-select the initially loaded run */
      runPicker.value=INITIAL_SOURCE;
    }
  }).catch(function(err){console.warn("Failed to refresh run list:",err);});
}
6650
+
6651
/* Load one result file from the server and re-render the dashboard with it. */
function loadRun(filename){
  /* Number each request so a slow response for an earlier selection cannot
     overwrite the data of a newer one. */
  var requestId=(loadRun.latestRequest||0)+1;
  loadRun.latestRequest=requestId;
  fetch("/api/runs/"+encodeURIComponent(filename)).then(function(r){return r.json();}).then(function(d){
    if(requestId!==loadRun.latestRequest)return;
    if(d.error){console.error(d.error);return;}
    /* Never replace DATA with something the renderers cannot iterate */
    if(!Array.isArray(d.results)){console.error("Failed to load run: malformed response for "+filename);return;}
    DATA=d.results;
    stats=computeStats(DATA);
    tgtStats=computeTargets(DATA);
    tgtNames=tgtStats.map(function(t){return t.target;});
    state.expanded={};
    feedbackCache={};
    loadFeedback();
    render();
    /* Update picker selection */
    runPicker.value=filename;
  }).catch(function(err){console.error("Failed to load run:",err);});
}
6666
+
6667
+ runPicker.addEventListener("change",function(){
6668
+ var val=runPicker.value;
6669
+ if(val)loadRun(val);
6670
+ });
6671
+
6672
+ /* Poll for new result files every 5 seconds */
6673
+ refreshRunList();
6674
+ setInterval(refreshRunList,5000);
6675
+
6201
6676
  /* ---- init ---- */
6202
6677
  loadFeedback();
6203
6678
  render();
@@ -6216,7 +6691,7 @@ var resultsServeCommand = command({
6216
6691
  type: optional(number),
6217
6692
  long: "port",
6218
6693
  short: "p",
6219
- description: "Port to listen on (default: 3117)"
6694
+ description: "Port to listen on (flag \u2192 PORT env var \u2192 3117)"
6220
6695
  }),
6221
6696
  dir: option({
6222
6697
  type: optional(string),
@@ -6227,14 +6702,43 @@ var resultsServeCommand = command({
6227
6702
  },
6228
6703
  handler: async ({ source, port, dir }) => {
6229
6704
  const cwd = dir ?? process.cwd();
6230
- const listenPort = port ?? 3117;
6705
+ const listenPort = port ?? (process.env.PORT ? Number(process.env.PORT) : 3117);
6231
6706
  try {
6232
- const { results, sourceFile } = await loadResults(source, cwd);
6233
- const resultDir = path8.dirname(path8.resolve(sourceFile));
6234
- const app2 = createApp(results, resultDir);
6235
- console.log(`Serving ${results.length} result(s) from ${sourceFile}`);
6707
+ let results = [];
6708
+ let sourceFile;
6709
+ if (source) {
6710
+ const resolved = resolveResultSourcePath(source, cwd);
6711
+ if (!existsSync5(resolved)) {
6712
+ console.error(`Error: Source file not found: ${resolved}`);
6713
+ process.exit(1);
6714
+ }
6715
+ sourceFile = resolved;
6716
+ results = patchTestIds(loadManifestResults(resolved));
6717
+ } else {
6718
+ const cache = await loadRunCache(cwd);
6719
+ const cachedFile = cache ? resolveRunCacheFile(cache) : "";
6720
+ if (cachedFile && existsSync5(cachedFile)) {
6721
+ sourceFile = cachedFile;
6722
+ results = patchTestIds(loadManifestResults(cachedFile));
6723
+ } else {
6724
+ const metas = listResultFiles(cwd, 1);
6725
+ if (metas.length > 0) {
6726
+ sourceFile = metas[0].path;
6727
+ results = patchTestIds(loadManifestResults(metas[0].path));
6728
+ }
6729
+ }
6730
+ }
6731
+ const resultDir = sourceFile ? path8.dirname(path8.resolve(sourceFile)) : cwd;
6732
+ const app2 = createApp(results, resultDir, cwd, sourceFile);
6733
+ if (results.length > 0 && sourceFile) {
6734
+ console.log(`Serving ${results.length} result(s) from ${sourceFile}`);
6735
+ } else {
6736
+ console.log("No results found. Dashboard will show an empty state.");
6737
+ console.log("Run an evaluation to see results: agentv eval <eval-file>");
6738
+ }
6236
6739
  console.log(`Dashboard: http://localhost:${listenPort}`);
6237
6740
  console.log(`Feedback API: http://localhost:${listenPort}/api/feedback`);
6741
+ console.log(`Result picker API: http://localhost:${listenPort}/api/runs`);
6238
6742
  console.log(`Feedback file: ${feedbackPath(resultDir)}`);
6239
6743
  console.log("Press Ctrl+C to stop");
6240
6744
  const { serve: startServer } = await import("@hono/node-server");
@@ -6263,7 +6767,7 @@ function detectPackageManager() {
6263
6767
  return detectPackageManagerFromPath(process.argv[1] ?? "");
6264
6768
  }
6265
6769
  function runCommand(cmd, args) {
6266
- return new Promise((resolve2, reject) => {
6770
+ return new Promise((resolve3, reject) => {
6267
6771
  const child = spawn(cmd, args, { stdio: ["inherit", "pipe", "inherit"], shell: true });
6268
6772
  let stdout = "";
6269
6773
  child.stdout?.on("data", (data) => {
@@ -6271,7 +6775,7 @@ function runCommand(cmd, args) {
6271
6775
  stdout += data.toString();
6272
6776
  });
6273
6777
  child.on("error", reject);
6274
- child.on("close", (code) => resolve2({ exitCode: code ?? 1, stdout }));
6778
+ child.on("close", (code) => resolve3({ exitCode: code ?? 1, stdout }));
6275
6779
  });
6276
6780
  }
6277
6781
  var updateCommand = command({
@@ -7179,7 +7683,7 @@ var transpileCommand = command({
7179
7683
  });
7180
7684
 
7181
7685
  // src/commands/trim/index.ts
7182
- import { readFileSync as readFileSync7, writeFileSync as writeFileSync5 } from "node:fs";
7686
+ import { readFileSync as readFileSync8, writeFileSync as writeFileSync5 } from "node:fs";
7183
7687
  var trimCommand = command({
7184
7688
  name: "trim",
7185
7689
  description: "Trim evaluation results for baseline storage (strips debug/audit fields)",
@@ -7198,7 +7702,7 @@ var trimCommand = command({
7198
7702
  },
7199
7703
  handler: async ({ input, out }) => {
7200
7704
  try {
7201
- const content = readFileSync7(input, "utf8");
7705
+ const content = readFileSync8(input, "utf8");
7202
7706
  const lines = content.trim().split("\n").filter((line) => line.trim());
7203
7707
  const trimmedLines = lines.map((line) => {
7204
7708
  const record = JSON.parse(line);
@@ -7304,7 +7808,7 @@ function isTTY() {
7304
7808
 
7305
7809
  // src/commands/validate/validate-files.ts
7306
7810
  import { constants } from "node:fs";
7307
- import { access, readdir as readdir3, stat } from "node:fs/promises";
7811
+ import { access, readdir as readdir4, stat } from "node:fs/promises";
7308
7812
  import path10 from "node:path";
7309
7813
  async function validateFiles(paths) {
7310
7814
  const filePaths = await expandPaths(paths);
@@ -7370,7 +7874,7 @@ async function expandPaths(paths) {
7370
7874
  async function findYamlFiles(dirPath) {
7371
7875
  const results = [];
7372
7876
  try {
7373
- const entries2 = await readdir3(dirPath, { withFileTypes: true });
7877
+ const entries2 = await readdir4(dirPath, { withFileTypes: true });
7374
7878
  for (const entry of entries2) {
7375
7879
  const fullPath = path10.join(dirPath, entry.name);
7376
7880
  if (entry.isDirectory()) {
@@ -7427,14 +7931,14 @@ var validateCommand = command({
7427
7931
  });
7428
7932
 
7429
7933
  // src/commands/workspace/clean.ts
7430
- import { existsSync as existsSync5 } from "node:fs";
7431
- import { readFile as readFile4, readdir as readdir4, rm } from "node:fs/promises";
7934
+ import { existsSync as existsSync6 } from "node:fs";
7935
+ import { readFile as readFile5, readdir as readdir5, rm } from "node:fs/promises";
7432
7936
  import path11 from "node:path";
7433
7937
  async function confirm(message) {
7434
7938
  const readline2 = await import("node:readline");
7435
7939
  const rl = readline2.createInterface({ input: process.stdin, output: process.stdout });
7436
- const answer = await new Promise((resolve2) => {
7437
- rl.question(`${message} [y/N] `, resolve2);
7940
+ const answer = await new Promise((resolve3) => {
7941
+ rl.question(`${message} [y/N] `, resolve3);
7438
7942
  });
7439
7943
  rl.close();
7440
7944
  return answer.toLowerCase() === "y";
@@ -7456,19 +7960,19 @@ var cleanCommand = command({
7456
7960
  },
7457
7961
  handler: async ({ repo, force }) => {
7458
7962
  const poolRoot = getWorkspacePoolRoot();
7459
- if (!existsSync5(poolRoot)) {
7963
+ if (!existsSync6(poolRoot)) {
7460
7964
  console.log("No workspace pool entries found.");
7461
7965
  return;
7462
7966
  }
7463
7967
  if (repo) {
7464
- const entries2 = await readdir4(poolRoot, { withFileTypes: true });
7968
+ const entries2 = await readdir5(poolRoot, { withFileTypes: true });
7465
7969
  const poolDirs = entries2.filter((e) => e.isDirectory());
7466
7970
  const matchingDirs = [];
7467
7971
  for (const dir of poolDirs) {
7468
7972
  const poolDir = path11.join(poolRoot, dir.name);
7469
7973
  const metadataPath = path11.join(poolDir, "metadata.json");
7470
7974
  try {
7471
- const raw = await readFile4(metadataPath, "utf-8");
7975
+ const raw = await readFile5(metadataPath, "utf-8");
7472
7976
  const metadata = JSON.parse(raw);
7473
7977
  const hasRepo = metadata.repos?.some((r) => {
7474
7978
  if (r.source.type === "git" && r.source.url) {
@@ -7515,13 +8019,13 @@ var cleanCommand = command({
7515
8019
  });
7516
8020
 
7517
8021
  // src/commands/workspace/list.ts
7518
- import { existsSync as existsSync6 } from "node:fs";
7519
- import { readFile as readFile5, readdir as readdir5, stat as stat2 } from "node:fs/promises";
8022
+ import { existsSync as existsSync7 } from "node:fs";
8023
+ import { readFile as readFile6, readdir as readdir6, stat as stat2 } from "node:fs/promises";
7520
8024
  import path12 from "node:path";
7521
8025
  async function getDirectorySize(dirPath) {
7522
8026
  let totalSize = 0;
7523
8027
  try {
7524
- const entries2 = await readdir5(dirPath, { withFileTypes: true });
8028
+ const entries2 = await readdir6(dirPath, { withFileTypes: true });
7525
8029
  for (const entry of entries2) {
7526
8030
  const fullPath = path12.join(dirPath, entry.name);
7527
8031
  if (entry.isDirectory()) {
@@ -7547,11 +8051,11 @@ var listCommand = command({
7547
8051
  args: {},
7548
8052
  handler: async () => {
7549
8053
  const poolRoot = getWorkspacePoolRoot();
7550
- if (!existsSync6(poolRoot)) {
8054
+ if (!existsSync7(poolRoot)) {
7551
8055
  console.log("No workspace pool entries found.");
7552
8056
  return;
7553
8057
  }
7554
- const entries2 = await readdir5(poolRoot, { withFileTypes: true });
8058
+ const entries2 = await readdir6(poolRoot, { withFileTypes: true });
7555
8059
  const poolDirs = entries2.filter((e) => e.isDirectory());
7556
8060
  if (poolDirs.length === 0) {
7557
8061
  console.log("No workspace pool entries found.");
@@ -7560,12 +8064,12 @@ var listCommand = command({
7560
8064
  for (const dir of poolDirs) {
7561
8065
  const poolDir = path12.join(poolRoot, dir.name);
7562
8066
  const fingerprint = dir.name;
7563
- const poolEntries = await readdir5(poolDir, { withFileTypes: true });
8067
+ const poolEntries = await readdir6(poolDir, { withFileTypes: true });
7564
8068
  const slots = poolEntries.filter((e) => e.isDirectory() && e.name.startsWith("slot-"));
7565
8069
  const metadataPath = path12.join(poolDir, "metadata.json");
7566
8070
  let metadata = null;
7567
8071
  try {
7568
- const raw = await readFile5(metadataPath, "utf-8");
8072
+ const raw = await readFile6(metadataPath, "utf-8");
7569
8073
  metadata = JSON.parse(raw);
7570
8074
  } catch {
7571
8075
  }
@@ -7602,16 +8106,16 @@ var workspaceCommand = subcommands({
7602
8106
 
7603
8107
  // src/update-check.ts
7604
8108
  import { spawn as spawn2 } from "node:child_process";
7605
- import { readFile as readFile6 } from "node:fs/promises";
7606
- import { join as join4 } from "node:path";
8109
+ import { readFile as readFile7 } from "node:fs/promises";
8110
+ import { join as join5 } from "node:path";
7607
8111
  var CHECK_INTERVAL_MS = 24 * 60 * 60 * 1e3;
7608
8112
  var AGENTV_DIR = getAgentvHome();
7609
8113
  var CACHE_FILE = "version-check.json";
7610
8114
  var NPM_REGISTRY_URL = "https://registry.npmjs.org/agentv/latest";
7611
8115
  async function getCachedUpdateInfo(path13) {
7612
- const filePath = path13 ?? join4(AGENTV_DIR, CACHE_FILE);
8116
+ const filePath = path13 ?? join5(AGENTV_DIR, CACHE_FILE);
7613
8117
  try {
7614
- const raw = await readFile6(filePath, "utf-8");
8118
+ const raw = await readFile7(filePath, "utf-8");
7615
8119
  const data = JSON.parse(raw);
7616
8120
  if (typeof data.latestVersion === "string" && typeof data.lastCheckedAt === "string") {
7617
8121
  return data;
@@ -7643,7 +8147,7 @@ function buildNotice(currentVersion, latestVersion) {
7643
8147
  }
7644
8148
  function backgroundUpdateCheck() {
7645
8149
  const dir = AGENTV_DIR;
7646
- const filePath = join4(dir, CACHE_FILE);
8150
+ const filePath = join5(dir, CACHE_FILE);
7647
8151
  const script = `
7648
8152
  const https = require('https');
7649
8153
  const fs = require('fs');
@@ -7766,4 +8270,4 @@ export {
7766
8270
  preprocessArgv,
7767
8271
  runCli
7768
8272
  };
7769
- //# sourceMappingURL=chunk-TGCWIHBH.js.map
8273
+ //# sourceMappingURL=chunk-3UW7KUQ3.js.map