npm - agentv - Versions diffs - 3.13.3 → 3.14.2 - Mend

agentv 3.13.3 → 3.14.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

package/dist/{chunk-TGCWIHBH.js → chunk-3UW7KUQ3.js} +562 -58
package/dist/chunk-3UW7KUQ3.js.map +1 -0
package/dist/{chunk-PACTPWEN.js → chunk-75PQBKLR.js} +4 -4
package/dist/{chunk-PACTPWEN.js.map → chunk-75PQBKLR.js.map} +1 -1
package/dist/{chunk-D3LNJUUB.js → chunk-ELQEFMGO.js} +773 -339
package/dist/chunk-ELQEFMGO.js.map +1 -0
package/dist/cli.js +3 -3
package/dist/{dist-KPMR7RBT.js → dist-5EEXTTC3.js} +4 -2
package/dist/index.js +3 -3
package/dist/{interactive-OMJAMCQP.js → interactive-Q563ULAR.js} +3 -3
package/dist/templates/.agentv/config.yaml +4 -13
package/dist/templates/.agentv/targets.yaml +0 -16
package/package.json +1 -1
package/dist/chunk-D3LNJUUB.js.map +0 -1
package/dist/chunk-TGCWIHBH.js.map +0 -1
package/dist/templates/.agentv/.env.example +0 -23
/package/dist/{dist-KPMR7RBT.js.map → dist-5EEXTTC3.js.map} +0 -0
/package/dist/{interactive-OMJAMCQP.js.map → interactive-Q563ULAR.js.map} +0 -0

package/dist/{chunk-TGCWIHBH.js → chunk-3UW7KUQ3.js} RENAMED Viewed

@@ -22,7 +22,7 @@ import {
   validateFileReferences,
   validateTargetsFile,
   writeArtifactsFromResults
-} from "./chunk-PACTPWEN.js";
+} from "./chunk-75PQBKLR.js";
 import {
   createBuiltinRegistry,
   executeScript,
@@ -39,7 +39,7 @@ import {
   toSnakeCaseDeep as toSnakeCaseDeep2,
   transpileEvalYamlFile,
   trimBaselineResult
-} from "./chunk-D3LNJUUB.js";
+} from "./chunk-ELQEFMGO.js";
 import {
   __commonJS,
   __esm,
@@ -4185,7 +4185,7 @@ var evalRunCommand = command({
   },
   handler: async (args) => {
     if (args.evalPaths.length === 0 && process.stdin.isTTY) {
-      const { launchInteractiveWizard } = await import("./interactive-OMJAMCQP.js");
+      const { launchInteractiveWizard } = await import("./interactive-Q563ULAR.js");
       await launchInteractiveWizard();
       return;
     }
@@ -4408,13 +4408,23 @@ var evalBenchCommand = command({
       type: string,
       displayName: "export-dir",
       description: "Export directory from pipeline input/grade"
+    }),
+    llmScores: option({
+      type: optional(string),
+      long: "llm-scores",
+      description: "Path to LLM scores JSON file (reads from stdin if omitted)"
     })
   },
-  handler: async ({ exportDir }) => {
+  handler: async ({ exportDir, llmScores: llmScoresPath }) => {
     const manifest = JSON.parse(await readFile(join(exportDir, "manifest.json"), "utf8"));
     const testIds = manifest.test_ids;
     const targetName = manifest.target?.name ?? "unknown";
-    const stdinData = await readStdin();
+    let stdinData;
+    if (llmScoresPath) {
+      stdinData = await readFile(llmScoresPath, "utf8");
+    } else {
+      stdinData = await readStdin();
+    }
     const llmScores = stdinData ? JSON.parse(stdinData) : {};
     const indexLines = [];
     const allPassRates = [];
@@ -4814,6 +4824,351 @@ async function writeJson(filePath, data) {
 `, "utf8");
 }
+// src/commands/pipeline/run.ts
+import { execSync } from "node:child_process";
+import { existsSync as existsSync2, readFileSync as readFileSync4, unlinkSync } from "node:fs";
+import { mkdir as mkdir4, readFile as readFile4, readdir as readdir3, writeFile as writeFile5 } from "node:fs/promises";
+import { tmpdir } from "node:os";
+import { dirname as dirname2, join as join4, resolve as resolve2 } from "node:path";
+function loadEnvFile(dir) { // Finds the nearest .env file at or above dir and parses it into a plain key/value object.
+  let current = resolve2(dir);
+  while (true) {
+    const candidate = join4(current, ".env");
+    if (existsSync2(candidate)) {
+      const env3 = {};
+      for (const line of readFileSync4(candidate, "utf8").split("\n")) {
+        const trimmed = line.trim();
+        if (!trimmed || trimmed.startsWith("#")) continue; // skip blank lines and lines starting with #
+        const eqIdx = trimmed.indexOf("=");
+        if (eqIdx === -1) continue; // ignore lines that have no equals sign
+        env3[trimmed.slice(0, eqIdx).trim()] = trimmed.slice(eqIdx + 1).trim(); // split on the first equals sign; value is trimmed, surrounding quotes are kept verbatim
+      }
+      return env3; // only the first .env found is used; files in parent directories are not merged
+    }
+    const parent = dirname2(current);
+    if (parent === current) break; // dirname returned the same path, so there is no higher directory to check
+    current = parent;
+  }
+  return {}; // no .env found on the way up
+}
+var evalRunCommand2 = command({ // pipeline run command: writes per-test input files, invokes the CLI target when one resolves, then runs code graders
+  name: "run",
+  description: "Extract inputs, invoke CLI targets, and run code graders in one step",
+  args: {
+    evalPath: positional({
+      type: string,
+      displayName: "eval-path",
+      description: "Path to eval YAML file"
+    }),
+    out: option({
+      type: string,
+      long: "out",
+      description: "Output directory for results"
+    }),
+    workers: option({ // NOTE(review): this value is only printed below; the handler never uses it to limit concurrency
+      type: optional(number),
+      long: "workers",
+      description: "Parallel workers for target invocation (default: all tests)"
+    })
+  },
+  handler: async ({ evalPath, out, workers }) => {
+    const resolvedEvalPath = resolve2(evalPath);
+    const outDir = resolve2(out);
+    const repoRoot = await findRepoRoot(dirname2(resolvedEvalPath));
+    const evalDir = dirname2(resolvedEvalPath);
+    const suite = await loadTestSuite(resolvedEvalPath, repoRoot);
+    const tests = suite.tests;
+    if (tests.length === 0) {
+      console.error("No tests found in eval file.");
+      process.exit(1);
+    }
+    let targetInfo = null; // stays null unless a target of kind cli is resolved below
+    let targetName = "agent";
+    let targetKind = "agent";
+    try {
+      const selection = await selectTarget({
+        testFilePath: resolvedEvalPath,
+        repoRoot,
+        cwd: evalDir,
+        dryRun: false,
+        dryRunDelay: 0,
+        dryRunDelayMin: 0,
+        dryRunDelayMax: 0,
+        env: process.env
+      });
+      targetName = selection.targetName;
+      if (selection.resolvedTarget.kind === "cli") {
+        targetKind = "cli";
+        const config = selection.resolvedTarget.config;
+        targetInfo = {
+          kind: "cli",
+          command: config.command,
+          cwd: config.cwd ?? evalDir,
+          timeoutMs: config.timeoutMs ?? 3e4 // 30000 ms when the target config has no timeout
+        };
+      }
+    } catch { // any selectTarget failure is discarded and the agent defaults above are kept
+    }
+    const testIds = [];
+    for (const test of tests) { // phase 1: write one directory of input files per test
+      const testDir = join4(outDir, test.id);
+      await mkdir4(testDir, { recursive: true });
+      testIds.push(test.id);
+      const inputText = test.question;
+      const inputMessages = test.input.map((m) => ({
+        role: m.role,
+        content: typeof m.content === "string" ? m.content : m.content // NOTE(review): both branches are identical, so content passes through unchanged
+      }));
+      await writeJson2(join4(testDir, "input.json"), {
+        input_text: inputText,
+        input_messages: inputMessages,
+        file_paths: test.file_paths,
+        metadata: test.metadata ?? {}
+      });
+      if (targetInfo) {
+        await writeJson2(join4(testDir, "invoke.json"), {
+          kind: "cli",
+          command: targetInfo.command,
+          cwd: targetInfo.cwd,
+          timeout_ms: targetInfo.timeoutMs,
+          env: {}
+        });
+      } else {
+        await writeJson2(join4(testDir, "invoke.json"), {
+          kind: "agent",
+          instructions: "Execute this task in the current workspace. The agent IS the target."
+        });
+      }
+      await writeFile5(join4(testDir, "criteria.md"), test.criteria ?? "", "utf8");
+      if (test.expected_output.length > 0 || test.reference_answer !== void 0 && test.reference_answer !== "") { // written only when there is expected output or a non-empty reference answer
+        await writeJson2(join4(testDir, "expected_output.json"), {
+          expected_output: test.expected_output,
+          reference_answer: test.reference_answer ?? ""
+        });
+      }
+      await writeGraderConfigs2(testDir, test.assertions ?? [], evalDir);
+    }
+    await writeJson2(join4(outDir, "manifest.json"), {
+      eval_file: resolvedEvalPath,
+      timestamp: (/* @__PURE__ */ new Date()).toISOString(),
+      target: { name: targetName, kind: targetKind },
+      test_ids: testIds
+    });
+    console.log(`Extracted ${testIds.length} test(s) to ${outDir}`);
+    if (targetInfo) { // phase 2: invoke the CLI target once per test
+      const envVars = loadEnvFile(evalDir);
+      const mergedEnv = { ...process.env, ...envVars }; // values from the .env file override the current process environment
+      const maxWorkers = workers ?? testIds.length; // used only in the log line below
+      console.log(`Invoking ${testIds.length} CLI target(s) (${maxWorkers} workers)...`);
+      const invokeTarget = async (testId) => {
+        const testDir = join4(outDir, testId);
+        const invoke = JSON.parse(await readFile4(join4(testDir, "invoke.json"), "utf8"));
+        if (invoke.kind !== "cli") return;
+        const inputData = JSON.parse(await readFile4(join4(testDir, "input.json"), "utf8"));
+        const template = invoke.command;
+        const cwd = invoke.cwd;
+        const timeoutMs = invoke.timeout_ms ?? 12e4; // fallback of 120000 ms; the invoke.json written above for cli targets always carries timeout_ms
+        const promptFile = join4(tmpdir(), `agentv-prompt-${testId}-${Date.now()}.txt`);
+        const outputFile = join4(tmpdir(), `agentv-output-${testId}-${Date.now()}.txt`);
+        await writeFile5(promptFile, inputData.input_text, "utf8");
+        let rendered = template;
+        rendered = rendered.replace("{PROMPT_FILE}", promptFile); // string-pattern replace substitutes only the first occurrence of each placeholder
+        rendered = rendered.replace("{OUTPUT_FILE}", outputFile);
+        rendered = rendered.replace("{PROMPT}", inputData.input_text); // NOTE(review): prompt text is spliced into the shell command without escaping
+        const start = performance.now();
+        try {
+          execSync(rendered, { // synchronous call: it blocks the event loop, so target commands never overlap despite the Promise.all below
+            cwd,
+            timeout: timeoutMs,
+            env: mergedEnv,
+            stdio: ["pipe", "pipe", "pipe"],
+            maxBuffer: 10 * 1024 * 1024
+          });
+          const durationMs = Math.round(performance.now() - start);
+          let response;
+          if (existsSync2(outputFile)) { // the response is taken from the output file, not from the command stdout
+            response = readFileSync4(outputFile, "utf8");
+          } else {
+            response = "ERROR: No output file generated";
+          }
+          await writeFile5(join4(testDir, "response.md"), response, "utf8");
+          await writeJson2(join4(testDir, "timing.json"), {
+            duration_ms: durationMs,
+            total_duration_seconds: Math.round(durationMs / 10) / 100 // seconds rounded to two decimals
+          });
+          console.log(`  ${testId}: OK (${durationMs}ms, ${response.length} chars)`);
+        } catch (error) { // a failed target still gets response.md and timing.json so later steps find the files
+          const durationMs = Math.round(performance.now() - start);
+          const message = error instanceof Error ? error.message : String(error);
+          const response = `ERROR: target failed \u2014 ${message}`;
+          await writeFile5(join4(testDir, "response.md"), response, "utf8");
+          await writeJson2(join4(testDir, "timing.json"), {
+            duration_ms: durationMs,
+            total_duration_seconds: Math.round(durationMs / 10) / 100
+          });
+          console.error(`  ${testId}: FAILED (${durationMs}ms) \u2014 ${message.slice(0, 200)}`);
+        } finally {
+          try {
+            if (existsSync2(promptFile)) unlinkSync(promptFile);
+            if (existsSync2(outputFile)) unlinkSync(outputFile);
+          } catch { // best-effort temp file cleanup; failures are ignored
+          }
+        }
+      };
+      const allTasks = testIds.map((testId) => invokeTarget(testId));
+      await Promise.all(allTasks);
+    } else {
+      console.log("Agent-as-target mode \u2014 skipping CLI invocation.");
+    }
+    let totalGraders = 0;
+    let totalPassed = 0;
+    for (const testId of testIds) { // phase 3: run every code grader config found under each test directory
+      const testDir = join4(outDir, testId);
+      const codeGradersDir = join4(testDir, "code_graders");
+      const resultsDir = join4(testDir, "code_grader_results");
+      let graderFiles;
+      try {
+        graderFiles = (await readdir3(codeGradersDir)).filter((f) => f.endsWith(".json"));
+      } catch { // readdir failed (for example the directory does not exist): skip this test
+        continue;
+      }
+      if (graderFiles.length === 0) continue;
+      await mkdir4(resultsDir, { recursive: true });
+      const responseText = await readFile4(join4(testDir, "response.md"), "utf8"); // NOTE(review): this handler writes response.md only in the CLI branch; in agent-as-target mode the read relies on the file already existing
+      const inputData = JSON.parse(await readFile4(join4(testDir, "input.json"), "utf8"));
+      for (const graderFile of graderFiles) {
+        const graderConfig = JSON.parse(await readFile4(join4(codeGradersDir, graderFile), "utf8"));
+        const graderName = graderConfig.name;
+        const payload = JSON.stringify({ // JSON document passed to executeScript as its second argument
+          output: [{ role: "assistant", content: responseText }],
+          input: inputData.input_messages,
+          question: inputData.input_text,
+          criteria: "", // criteria and expected-output fields are sent empty here
+          expected_output: [],
+          reference_answer: "",
+          input_files: [],
+          trace: null,
+          token_usage: null,
+          cost_usd: null,
+          duration_ms: null,
+          start_time: null,
+          end_time: null,
+          file_changes: null,
+          workspace_path: null,
+          config: graderConfig.config ?? null,
+          metadata: {},
+          input_text: inputData.input_text,
+          output_text: responseText,
+          expected_output_text: ""
+        });
+        try {
+          const stdout = await executeScript(
+            graderConfig.command,
+            payload,
+            void 0,
+            graderConfig.cwd
+          );
+          const parsed = JSON.parse(stdout);
+          const score = typeof parsed.score === "number" ? parsed.score : 0; // a missing or non-numeric score counts as 0
+          const assertions = Array.isArray(parsed.assertions) ? parsed.assertions : [];
+          await writeFile5(
+            join4(resultsDir, `${graderName}.json`),
+            `${JSON.stringify(
+              {
+                name: graderName,
+                type: "code-grader",
+                score,
+                weight: graderConfig.weight ?? 1,
+                assertions,
+                details: parsed.details ?? {}
+              },
+              null,
+              2
+            )}
+`,
+            "utf8"
+          );
+          totalGraders++;
+          if (score >= 0.5) totalPassed++; // a grader counts as passed at score 0.5 or higher
+        } catch (error) { // script failure or unparseable stdout: record a zero-score result instead of aborting the run
+          const message = error instanceof Error ? error.message : String(error);
+          console.error(`  ${testId}/${graderName}: ERROR \u2014 ${message}`);
+          await writeFile5(
+            join4(resultsDir, `${graderName}.json`),
+            `${JSON.stringify(
+              {
+                name: graderName,
+                type: "code-grader",
+                score: 0,
+                weight: graderConfig.weight ?? 1,
+                assertions: [{ text: `Error: ${message}`, passed: false }],
+                details: { error: message }
+              },
+              null,
+              2
+            )}
+`,
+            "utf8"
+          );
+          totalGraders++; // errored graders are counted in the total but never as passed
+        }
+      }
+    }
+    console.log(`Graded ${totalGraders} code-grader(s): ${totalPassed} passed`);
+    console.log(`
+Done. Agent can now perform LLM grading on responses in ${outDir}`);
+  }
+});
+async function writeJson2(filePath, data) { // Writes data to filePath as 2-space-indented JSON followed by a newline, encoded as UTF-8.
+  await writeFile5(filePath, `${JSON.stringify(data, null, 2)}
+`, "utf8");
+}
+async function writeGraderConfigs2(testDir, assertions, evalDir) { // Writes one JSON config per code-grader and llm-grader assertion under testDir; other assertion types are ignored.
+  const codeGradersDir = join4(testDir, "code_graders");
+  const llmGradersDir = join4(testDir, "llm_graders");
+  let hasCodeGraders = false; // these flags make each directory get created once, and only when needed
+  let hasLlmGraders = false;
+  for (const assertion of assertions) {
+    if (assertion.type === "code-grader") {
+      if (!hasCodeGraders) {
+        await mkdir4(codeGradersDir, { recursive: true });
+        hasCodeGraders = true;
+      }
+      const config = assertion;
+      await writeJson2(join4(codeGradersDir, `${config.name}.json`), { // NOTE(review): the assertion name is used directly as the file name
+        name: config.name,
+        command: config.command,
+        cwd: config.resolvedCwd ?? config.cwd ?? evalDir, // precedence: resolvedCwd, then cwd, then the eval file directory
+        weight: config.weight ?? 1,
+        config: config.config ?? {}
+      });
+    } else if (assertion.type === "llm-grader") {
+      if (!hasLlmGraders) {
+        await mkdir4(llmGradersDir, { recursive: true });
+        hasLlmGraders = true;
+      }
+      const config = assertion;
+      let promptContent = "";
+      if (config.resolvedPromptPath) {
+        try {
+          promptContent = readFileSync4(config.resolvedPromptPath, "utf8");
+        } catch { // prompt file unreadable: fall back to the inline prompt when it is a string
+          promptContent = typeof config.prompt === "string" ? config.prompt : "";
+        }
+      } else if (typeof config.prompt === "string") {
+        promptContent = config.prompt;
+      }
+      await writeJson2(join4(llmGradersDir, `${config.name}.json`), {
+        name: config.name,
+        prompt_content: promptContent,
+        weight: config.weight ?? 1,
+        threshold: 0.5, // threshold and config are fixed values here, not read from the assertion
+        config: {}
+      });
+    }
+  }
+}
 // src/commands/pipeline/index.ts
 var pipelineCommand = subcommands({
   name: "pipeline",
@@ -4821,7 +5176,8 @@ var pipelineCommand = subcommands({
   cmds: {
     input: evalInputCommand,
     grade: evalGradeCommand,
-    bench: evalBenchCommand
+    bench: evalBenchCommand,
+    run: evalRunCommand2
   }
 });
@@ -4829,10 +5185,10 @@ var pipelineCommand = subcommands({
 import path7 from "node:path";
 // src/commands/results/shared.ts
-import { existsSync as existsSync2 } from "node:fs";
+import { existsSync as existsSync3 } from "node:fs";
 // src/commands/trace/utils.ts
-import { readFileSync as readFileSync4, readdirSync as readdirSync2, statSync as statSync2 } from "node:fs";
+import { readFileSync as readFileSync5, readdirSync as readdirSync2, statSync as statSync2 } from "node:fs";
 import path6 from "node:path";
 var colors2 = {
   reset: "\x1B[0m",
@@ -4872,7 +5228,7 @@ function resolveTraceResultPath(filePath) {
   return resolveWorkspaceOrFilePath(filePath);
 }
 function loadJsonlRecords(filePath) {
-  const content = readFileSync4(filePath, "utf8");
+  const content = readFileSync5(filePath, "utf8");
   const lines = content.trim().split("\n").filter((line) => line.trim());
   return lines.map((line, i) => {
     const record = JSON.parse(line);
@@ -4925,7 +5281,7 @@ function toRawResult(result) {
   };
 }
 function loadOtlpTraceFile(filePath) {
-  const parsed = JSON.parse(readFileSync4(filePath, "utf8"));
+  const parsed = JSON.parse(readFileSync5(filePath, "utf8"));
   const spans = parsed.resourceSpans?.flatMap((resource) => resource.scopeSpans ?? []).flatMap((scope) => scope.spans ?? []);
   if (!spans || spans.length === 0) {
     return [];
@@ -5243,14 +5599,14 @@ async function resolveSourceFile(source, cwd) {
   let sourceFile;
   if (source) {
     sourceFile = resolveResultSourcePath(source, cwd);
-    if (!existsSync2(sourceFile)) {
+    if (!existsSync3(sourceFile)) {
       console.error(`Error: File not found: ${sourceFile}`);
       process.exit(1);
     }
   } else {
     const cache = await loadRunCache(cwd);
     const cachedFile = cache ? resolveRunCacheFile(cache) : "";
-    if (cachedFile && existsSync2(cachedFile)) {
+    if (cachedFile && existsSync3(cachedFile)) {
       sourceFile = cachedFile;
     } else {
       const metas = listResultFiles(cwd, 1);
@@ -5462,7 +5818,7 @@ var resultsShowCommand = command({
 });
 // src/commands/results/summary.ts
-import { existsSync as existsSync3, readFileSync as readFileSync5 } from "node:fs";
+import { existsSync as existsSync4, readFileSync as readFileSync6 } from "node:fs";
 function formatSummary(results, grading) {
   const total = results.length;
   let passed;
@@ -5513,9 +5869,9 @@ var resultsSummaryCommand = command({
       const { results, sourceFile } = await loadResults(source, cwd);
       let grading;
       const gradingPath = sourceFile.replace(/\.jsonl$/, ".grading.json");
-      if (existsSync3(gradingPath)) {
+      if (existsSync4(gradingPath)) {
         try {
-          grading = JSON.parse(readFileSync5(gradingPath, "utf8"));
+          grading = JSON.parse(readFileSync6(gradingPath, "utf8"));
         } catch {
         }
       }
@@ -5540,7 +5896,7 @@ var resultsCommand = subcommands({
 });
 // src/commands/results/serve.ts
-import { existsSync as existsSync4, readFileSync as readFileSync6, writeFileSync as writeFileSync3 } from "node:fs";
+import { existsSync as existsSync5, readFileSync as readFileSync7, writeFileSync as writeFileSync3 } from "node:fs";
 import path8 from "node:path";
 import { Hono } from "hono";
 function feedbackPath(resultDir) {
@@ -5548,11 +5904,11 @@ function feedbackPath(resultDir) {
 }
 function readFeedback(cwd) {
   const fp = feedbackPath(cwd);
-  if (!existsSync4(fp)) {
+  if (!existsSync5(fp)) {
     return { reviews: [] };
   }
   try {
-    return JSON.parse(readFileSync6(fp, "utf8"));
+    return JSON.parse(readFileSync7(fp, "utf8"));
   } catch (err2) {
     console.error(`Warning: could not parse ${fp}, starting fresh: ${err2.message}`);
     return { reviews: [] };
@@ -5562,10 +5918,40 @@ function writeFeedback(cwd, data) {
   writeFileSync3(feedbackPath(cwd), `${JSON.stringify(data, null, 2)}
 `, "utf8");
 }
-function createApp(results, resultDir) {
+function createApp(results, resultDir, cwd, sourceFile) {
+  const searchDir = cwd ?? resultDir;
   const app2 = new Hono();
   app2.get("/", (c3) => {
-    return c3.html(generateServeHtml(results));
+    return c3.html(generateServeHtml(results, sourceFile));
+  });
+  app2.get("/api/runs", (c3) => {
+    const metas = listResultFiles(searchDir);
+    return c3.json({
+      runs: metas.map((m) => ({
+        filename: m.filename,
+        path: m.path,
+        timestamp: m.timestamp,
+        test_count: m.testCount,
+        pass_rate: m.passRate,
+        avg_score: m.avgScore,
+        size_bytes: m.sizeBytes
+      }))
+    });
+  });
+  app2.get("/api/runs/:filename", (c3) => {
+    const filename = c3.req.param("filename");
+    const metas = listResultFiles(searchDir);
+    const meta = metas.find((m) => m.filename === filename);
+    if (!meta) {
+      return c3.json({ error: "Run not found" }, 404);
+    }
+    try {
+      const loaded = patchTestIds(loadManifestResults(meta.path));
+      const lightResults = stripHeavyFields(loaded);
+      return c3.json({ results: lightResults, source: meta.filename });
+    } catch (err2) {
+      return c3.json({ error: "Failed to load run" }, 500);
+    }
   });
   app2.get("/api/feedback", (c3) => {
     const data = readFeedback(resultDir);
@@ -5611,11 +5997,8 @@ function createApp(results, resultDir) {
   });
   return app2;
 }
-function escapeHtml(s) {
-  return s.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;");
-}
-function generateServeHtml(results) {
-  const lightResults = results.map((r) => {
+function stripHeavyFields(results) {
+  return results.map((r) => {
     const { requests, trace, ...rest } = r;
     const toolCalls = trace?.toolCalls && Object.keys(trace.toolCalls).length > 0 ? trace.toolCalls : void 0;
     const graderDurationMs = (r.scores ?? []).reduce((sum, s) => sum + (s.durationMs ?? 0), 0);
@@ -5625,6 +6008,12 @@ function generateServeHtml(results) {
       ...graderDurationMs > 0 && { _graderDurationMs: graderDurationMs }
     };
   });
+}
+function escapeHtml(s) { // Replaces &, <, > and the double-quote character in s with HTML entities; single quotes are left as-is.
+  return s.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;");
+}
+function generateServeHtml(results, sourceFile) {
+  const lightResults = stripHeavyFields(results);
   const dataJson = JSON.stringify(lightResults).replace(/</g, "\\u003c").replace(/>/g, "\\u003e").replace(/\u2028/g, "\\u2028").replace(/\u2029/g, "\\u2029");
   return `<!DOCTYPE html>
 <html lang="en">
@@ -5642,6 +6031,11 @@ ${SERVE_STYLES}
             <h1 class="header-title">AgentV</h1>
             <span class="header-subtitle">Results Review</span>
         </div>
+        <div class="header-center">
+            <select id="run-picker" class="run-picker" title="Switch result file">
+                <option value="">Loading runs...</option>
+            </select>
+        </div>
         <div class="header-right">
             <span class="timestamp">${escapeHtml((/* @__PURE__ */ new Date()).toISOString())}</span>
         </div>
@@ -5653,6 +6047,7 @@ ${SERVE_STYLES}
     <main id="app"></main>
     <script>
     var DATA = ${dataJson};
+    var INITIAL_SOURCE = ${sourceFile ? JSON.stringify(path8.basename(sourceFile)).replace(/</g, "\\u003c").replace(/>/g, "\\u003e") : "null"};
 ${SERVE_SCRIPT}
     </script>
 </body>
@@ -5679,6 +6074,10 @@ body{font-family:var(--font);background:var(--bg);color:var(--text);line-height:
 .header-left{display:flex;align-items:baseline;gap:12px}
 .header-title{font-size:18px;font-weight:600}
 .header-subtitle{font-size:14px;color:var(--text-muted)}
+.header-center{flex:1;display:flex;justify-content:center;padding:0 16px}
+.run-picker{padding:6px 10px;border:1px solid var(--border);border-radius:var(--radius);font-size:13px;background:var(--surface);color:var(--text);font-family:var(--font);max-width:400px;width:100%;cursor:pointer}
+.run-picker:hover{border-color:var(--primary)}
+.run-picker:focus{outline:none;border-color:var(--primary);box-shadow:0 0 0 3px var(--primary-bg)}
 .timestamp{font-size:12px;color:var(--text-muted);font-family:var(--mono)}
 /* Tabs */
@@ -5778,6 +6177,11 @@ body{font-family:var(--font);background:var(--bg);color:var(--text);line-height:
 .tool-tag{display:inline-block;padding:2px 10px;font-size:12px;font-family:var(--mono);background:var(--primary-bg);color:var(--primary);border:1px solid var(--border);border-radius:12px}
 .empty-state{text-align:center;padding:48px 24px;color:var(--text-muted)}
 .empty-state h3{font-size:16px;margin-bottom:8px;color:var(--text)}
+.welcome-state{text-align:center;padding:80px 24px;color:var(--text-muted)}
+.welcome-state h2{font-size:24px;margin-bottom:12px;color:var(--text);font-weight:600}
+.welcome-state p{font-size:15px;margin-bottom:8px;max-width:500px;margin-left:auto;margin-right:auto}
+.welcome-state code{font-family:var(--mono);background:var(--surface);border:1px solid var(--border);border-radius:3px;padding:2px 6px;font-size:13px}
+.welcome-state .hint{margin-top:24px;font-size:13px;color:var(--text-muted)}
 /* Feedback */
 .feedback-section{margin-top:16px;padding-top:16px;border-top:1px solid var(--border-light)}
@@ -5935,7 +6339,15 @@ var SERVE_SCRIPT = `
   /* ---- render ---- */
   function render(){
-    if(DATA.length===0){app.innerHTML='<div class="empty-state"><h3>No results</h3><p>No evaluation results to display.</p></div>';return;}
+    if(DATA.length===0){
+      app.innerHTML='<div class="welcome-state">'
+        +'<h2>No results yet</h2>'
+        +'<p>Run an evaluation or mount a results directory to see results here.</p>'
+        +'<p><code>agentv eval &lt;eval-file&gt;</code></p>'
+        +'<p class="hint">The dashboard will automatically detect new result files.</p>'
+        +'</div>';
+      return;
+    }
     if(state.tab==="overview")renderOverview();else renderTests();
   }
@@ -6198,6 +6610,69 @@ var SERVE_SCRIPT = `
     return h;
   }
+  /* ---- run picker ---- */
+  var runPicker=document.getElementById("run-picker");
+  var knownRunFilenames=[];
+  function refreshRunList(){
+    fetch("/api/runs").then(function(r){return r.json();}).then(function(d){
+      if(!d||!d.runs)return;
+      var runs=d.runs;
+      var newFilenames=runs.map(function(r){return r.filename;});
+      /* Detect new runs that appeared since last poll */
+      if(knownRunFilenames.length>0){
+        var hasNew=newFilenames.some(function(f){return knownRunFilenames.indexOf(f)===-1;});
+        if(hasNew&&DATA.length===0){
+          /* Auto-load the first (most recent) run when starting from empty state */
+          loadRun(runs[0].filename);
+        }
+      }
+      knownRunFilenames=newFilenames;
+      /* Rebuild picker options */
+      var h='<option value="">Select a result file...</option>';
+      if(runs.length===0){
+        h='<option value="">No result files</option>';
+      }
+      for(var i=0;i<runs.length;i++){
+        var r=runs[i];
+        var label=r.filename+" ("+r.test_count+" tests, "+(r.pass_rate*100).toFixed(0)+"% pass)";
+        h+='<option value="'+esc(r.filename)+'">'+esc(label)+"</option>";
+      }
+      runPicker.innerHTML=h;
+      /* Pre-select the initially loaded run */
+      if(INITIAL_SOURCE&&runs.length>0){
+        runPicker.value=INITIAL_SOURCE;
+      }
+    }).catch(function(err){console.warn("Failed to refresh run list:",err);});
+  }
+  function loadRun(filename){
+    fetch("/api/runs/"+encodeURIComponent(filename)).then(function(r){return r.json();}).then(function(d){
+      if(d.error){console.error(d.error);return;}
+      DATA=d.results;
+      stats=computeStats(DATA);
+      tgtStats=computeTargets(DATA);
+      tgtNames=tgtStats.map(function(t){return t.target;});
+      state.expanded={};
+      feedbackCache={};
+      loadFeedback();
+      render();
+      /* Update picker selection */
+      runPicker.value=filename;
+    }).catch(function(err){console.error("Failed to load run:",err);});
+  }
+  runPicker.addEventListener("change",function(){
+    var val=runPicker.value;
+    if(val)loadRun(val);
+  });
+  /* Poll for new result files every 5 seconds */
+  refreshRunList();
+  setInterval(refreshRunList,5000);
   /* ---- init ---- */
   loadFeedback();
   render();
@@ -6216,7 +6691,7 @@ var resultsServeCommand = command({
       type: optional(number),
       long: "port",
       short: "p",
-      description: "Port to listen on (default: 3117)"
+      description: "Port to listen on (flag \u2192 PORT env var \u2192 3117)"
     }),
     dir: option({
       type: optional(string),
@@ -6227,14 +6702,43 @@ var resultsServeCommand = command({
   },
   handler: async ({ source, port, dir }) => {
     const cwd = dir ?? process.cwd();
-    const listenPort = port ?? 3117;
+    const listenPort = port ?? (process.env.PORT ? Number(process.env.PORT) : 3117);
     try {
-      const { results, sourceFile } = await loadResults(source, cwd);
-      const resultDir = path8.dirname(path8.resolve(sourceFile));
-      const app2 = createApp(results, resultDir);
-      console.log(`Serving ${results.length} result(s) from ${sourceFile}`);
+      let results = [];
+      let sourceFile;
+      if (source) {
+        const resolved = resolveResultSourcePath(source, cwd);
+        if (!existsSync5(resolved)) {
+          console.error(`Error: Source file not found: ${resolved}`);
+          process.exit(1);
+        }
+        sourceFile = resolved;
+        results = patchTestIds(loadManifestResults(resolved));
+      } else {
+        const cache = await loadRunCache(cwd);
+        const cachedFile = cache ? resolveRunCacheFile(cache) : "";
+        if (cachedFile && existsSync5(cachedFile)) {
+          sourceFile = cachedFile;
+          results = patchTestIds(loadManifestResults(cachedFile));
+        } else {
+          const metas = listResultFiles(cwd, 1);
+          if (metas.length > 0) {
+            sourceFile = metas[0].path;
+            results = patchTestIds(loadManifestResults(metas[0].path));
+          }
+        }
+      }
+      const resultDir = sourceFile ? path8.dirname(path8.resolve(sourceFile)) : cwd;
+      const app2 = createApp(results, resultDir, cwd, sourceFile);
+      if (results.length > 0 && sourceFile) {
+        console.log(`Serving ${results.length} result(s) from ${sourceFile}`);
+      } else {
+        console.log("No results found. Dashboard will show an empty state.");
+        console.log("Run an evaluation to see results: agentv eval <eval-file>");
+      }
       console.log(`Dashboard: http://localhost:${listenPort}`);
       console.log(`Feedback API: http://localhost:${listenPort}/api/feedback`);
+      console.log(`Result picker API: http://localhost:${listenPort}/api/runs`);
       console.log(`Feedback file: ${feedbackPath(resultDir)}`);
       console.log("Press Ctrl+C to stop");
       const { serve: startServer } = await import("@hono/node-server");
@@ -6263,7 +6767,7 @@ function detectPackageManager() {
   return detectPackageManagerFromPath(process.argv[1] ?? "");
 }
 function runCommand(cmd, args) {
-  return new Promise((resolve2, reject) => {
+  return new Promise((resolve3, reject) => {
     const child = spawn(cmd, args, { stdio: ["inherit", "pipe", "inherit"], shell: true });
     let stdout = "";
     child.stdout?.on("data", (data) => {
@@ -6271,7 +6775,7 @@ function runCommand(cmd, args) {
       stdout += data.toString();
     });
     child.on("error", reject);
-    child.on("close", (code) => resolve2({ exitCode: code ?? 1, stdout }));
+    child.on("close", (code) => resolve3({ exitCode: code ?? 1, stdout }));
   });
 }
 var updateCommand = command({
@@ -7179,7 +7683,7 @@ var transpileCommand = command({
 });
 // src/commands/trim/index.ts
-import { readFileSync as readFileSync7, writeFileSync as writeFileSync5 } from "node:fs";
+import { readFileSync as readFileSync8, writeFileSync as writeFileSync5 } from "node:fs";
 var trimCommand = command({
   name: "trim",
   description: "Trim evaluation results for baseline storage (strips debug/audit fields)",
@@ -7198,7 +7702,7 @@ var trimCommand = command({
   },
   handler: async ({ input, out }) => {
     try {
-      const content = readFileSync7(input, "utf8");
+      const content = readFileSync8(input, "utf8");
       const lines = content.trim().split("\n").filter((line) => line.trim());
       const trimmedLines = lines.map((line) => {
         const record = JSON.parse(line);
@@ -7304,7 +7808,7 @@ function isTTY() {
 // src/commands/validate/validate-files.ts
 import { constants } from "node:fs";
-import { access, readdir as readdir3, stat } from "node:fs/promises";
+import { access, readdir as readdir4, stat } from "node:fs/promises";
 import path10 from "node:path";
 async function validateFiles(paths) {
   const filePaths = await expandPaths(paths);
@@ -7370,7 +7874,7 @@ async function expandPaths(paths) {
 async function findYamlFiles(dirPath) {
   const results = [];
   try {
-    const entries2 = await readdir3(dirPath, { withFileTypes: true });
+    const entries2 = await readdir4(dirPath, { withFileTypes: true });
     for (const entry of entries2) {
       const fullPath = path10.join(dirPath, entry.name);
       if (entry.isDirectory()) {
@@ -7427,14 +7931,14 @@ var validateCommand = command({
 });
 // src/commands/workspace/clean.ts
-import { existsSync as existsSync5 } from "node:fs";
-import { readFile as readFile4, readdir as readdir4, rm } from "node:fs/promises";
+import { existsSync as existsSync6 } from "node:fs";
+import { readFile as readFile5, readdir as readdir5, rm } from "node:fs/promises";
 import path11 from "node:path";
 async function confirm(message) {
   const readline2 = await import("node:readline");
   const rl = readline2.createInterface({ input: process.stdin, output: process.stdout });
-  const answer = await new Promise((resolve2) => {
-    rl.question(`${message} [y/N] `, resolve2);
+  const answer = await new Promise((resolve3) => {
+    rl.question(`${message} [y/N] `, resolve3);
   });
   rl.close();
   return answer.toLowerCase() === "y";
@@ -7456,19 +7960,19 @@ var cleanCommand = command({
   },
   handler: async ({ repo, force }) => {
     const poolRoot = getWorkspacePoolRoot();
-    if (!existsSync5(poolRoot)) {
+    if (!existsSync6(poolRoot)) {
       console.log("No workspace pool entries found.");
       return;
     }
     if (repo) {
-      const entries2 = await readdir4(poolRoot, { withFileTypes: true });
+      const entries2 = await readdir5(poolRoot, { withFileTypes: true });
       const poolDirs = entries2.filter((e) => e.isDirectory());
       const matchingDirs = [];
       for (const dir of poolDirs) {
         const poolDir = path11.join(poolRoot, dir.name);
         const metadataPath = path11.join(poolDir, "metadata.json");
         try {
-          const raw = await readFile4(metadataPath, "utf-8");
+          const raw = await readFile5(metadataPath, "utf-8");
           const metadata = JSON.parse(raw);
           const hasRepo = metadata.repos?.some((r) => {
             if (r.source.type === "git" && r.source.url) {
@@ -7515,13 +8019,13 @@ var cleanCommand = command({
 });
 // src/commands/workspace/list.ts
-import { existsSync as existsSync6 } from "node:fs";
-import { readFile as readFile5, readdir as readdir5, stat as stat2 } from "node:fs/promises";
+import { existsSync as existsSync7 } from "node:fs";
+import { readFile as readFile6, readdir as readdir6, stat as stat2 } from "node:fs/promises";
 import path12 from "node:path";
 async function getDirectorySize(dirPath) {
   let totalSize = 0;
   try {
-    const entries2 = await readdir5(dirPath, { withFileTypes: true });
+    const entries2 = await readdir6(dirPath, { withFileTypes: true });
     for (const entry of entries2) {
       const fullPath = path12.join(dirPath, entry.name);
       if (entry.isDirectory()) {
@@ -7547,11 +8051,11 @@ var listCommand = command({
   args: {},
   handler: async () => {
     const poolRoot = getWorkspacePoolRoot();
-    if (!existsSync6(poolRoot)) {
+    if (!existsSync7(poolRoot)) {
       console.log("No workspace pool entries found.");
       return;
     }
-    const entries2 = await readdir5(poolRoot, { withFileTypes: true });
+    const entries2 = await readdir6(poolRoot, { withFileTypes: true });
     const poolDirs = entries2.filter((e) => e.isDirectory());
     if (poolDirs.length === 0) {
       console.log("No workspace pool entries found.");
@@ -7560,12 +8064,12 @@ var listCommand = command({
     for (const dir of poolDirs) {
       const poolDir = path12.join(poolRoot, dir.name);
       const fingerprint = dir.name;
-      const poolEntries = await readdir5(poolDir, { withFileTypes: true });
+      const poolEntries = await readdir6(poolDir, { withFileTypes: true });
       const slots = poolEntries.filter((e) => e.isDirectory() && e.name.startsWith("slot-"));
       const metadataPath = path12.join(poolDir, "metadata.json");
       let metadata = null;
       try {
-        const raw = await readFile5(metadataPath, "utf-8");
+        const raw = await readFile6(metadataPath, "utf-8");
         metadata = JSON.parse(raw);
       } catch {
       }
@@ -7602,16 +8106,16 @@ var workspaceCommand = subcommands({
 // src/update-check.ts
 import { spawn as spawn2 } from "node:child_process";
-import { readFile as readFile6 } from "node:fs/promises";
-import { join as join4 } from "node:path";
+import { readFile as readFile7 } from "node:fs/promises";
+import { join as join5 } from "node:path";
 var CHECK_INTERVAL_MS = 24 * 60 * 60 * 1e3;
 var AGENTV_DIR = getAgentvHome();
 var CACHE_FILE = "version-check.json";
 var NPM_REGISTRY_URL = "https://registry.npmjs.org/agentv/latest";
 async function getCachedUpdateInfo(path13) {
-  const filePath = path13 ?? join4(AGENTV_DIR, CACHE_FILE);
+  const filePath = path13 ?? join5(AGENTV_DIR, CACHE_FILE);
   try {
-    const raw = await readFile6(filePath, "utf-8");
+    const raw = await readFile7(filePath, "utf-8");
     const data = JSON.parse(raw);
     if (typeof data.latestVersion === "string" && typeof data.lastCheckedAt === "string") {
       return data;
@@ -7643,7 +8147,7 @@ function buildNotice(currentVersion, latestVersion) {
 }
 function backgroundUpdateCheck() {
   const dir = AGENTV_DIR;
-  const filePath = join4(dir, CACHE_FILE);
+  const filePath = join5(dir, CACHE_FILE);
   const script = `
     const https = require('https');
     const fs = require('fs');
@@ -7766,4 +8270,4 @@ export {
   preprocessArgv,
   runCli
 };
-//# sourceMappingURL=chunk-TGCWIHBH.js.map
+//# sourceMappingURL=chunk-3UW7KUQ3.js.map