npm - agentv - Versions diffs - 4.10.0 → 4.11.0 - Mend

agentv 4.10.0 → 4.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37) hide show

package/dist/{chunk-XOSNETAV.js → chunk-BAUNAXHT.js} +1 -1
package/dist/chunk-BPGJ4HBU.js +183 -0
package/dist/chunk-BPGJ4HBU.js.map +1 -0
package/dist/{chunk-KF6BABQ5.js → chunk-FH24D7XW.js} +1090 -303
package/dist/chunk-FH24D7XW.js.map +1 -0
package/dist/{chunk-SE73HJZG.js → chunk-FQGY6QXQ.js} +780 -346
package/dist/chunk-FQGY6QXQ.js.map +1 -0
package/dist/chunk-NPVGBFF6.js +151 -0
package/dist/chunk-NPVGBFF6.js.map +1 -0
package/dist/{chunk-VA64NETD.js → chunk-QRYAMYT7.js} +1120 -731
package/dist/chunk-QRYAMYT7.js.map +1 -0
package/dist/cli.js +6 -4
package/dist/cli.js.map +1 -1
package/dist/{dist-XDNB4WDT.js → dist-HNSXNRVK.js} +36 -3
package/dist/docker-workspace-RPPXBT27-B4AQHVWA.js +11 -0
package/dist/{esm-CZAWIY6F.js → esm-UYZ3HJBU.js} +2 -2
package/dist/esm-UYZ3HJBU.js.map +1 -0
package/dist/exec-AR6JUUN5-6MBPURPR.js +11 -0
package/dist/exec-AR6JUUN5-6MBPURPR.js.map +1 -0
package/dist/index.js +6 -4
package/dist/{interactive-SNKK6VCV.js → interactive-SIOZB665.js} +6 -4
package/dist/{interactive-SNKK6VCV.js.map → interactive-SIOZB665.js.map} +1 -1
package/dist/{src-ML4D2MC2.js → src-PXDA7QIS.js} +2 -2
package/dist/studio/assets/index-Bi-KHfNm.js +65 -0
package/dist/studio/assets/index-D_j-w4UO.css +1 -0
package/dist/studio/assets/{index-DcwjOyrk.js → index-VyDFrnoK.js} +1 -1
package/dist/studio/index.html +2 -2
package/package.json +1 -1
package/dist/chunk-KF6BABQ5.js.map +0 -1
package/dist/chunk-SE73HJZG.js.map +0 -1
package/dist/chunk-VA64NETD.js.map +0 -1
package/dist/studio/assets/index-DHxVz6M9.css +0 -1
package/dist/studio/assets/index-Y5InSvcS.js +0 -65
/package/dist/{chunk-XOSNETAV.js.map → chunk-BAUNAXHT.js.map} +0 -0
/package/dist/{dist-XDNB4WDT.js.map → dist-HNSXNRVK.js.map} +0 -0
/package/dist/{esm-CZAWIY6F.js.map → docker-workspace-RPPXBT27-B4AQHVWA.js.map} +0 -0
/package/dist/{src-ML4D2MC2.js.map → src-PXDA7QIS.js.map} +0 -0

package/dist/{chunk-KF6BABQ5.js → chunk-FH24D7XW.js} RENAMED Viewed

@@ -9,10 +9,14 @@ import {
   ResponseCache,
   buildDirectoryChain,
   buildSearchRoots,
+  commitAndPushResultsBranch,
+  createDraftResultsPr,
   deriveCategory,
+  directorySizeBytes,
   ensureVSCodeSubagents,
   findDeprecatedCamelCaseTargetWarnings,
   findGitRoot,
+  getResultsRepoStatus,
   interpolateEnv,
   isEvaluatorKind,
   listTargetNames,
@@ -21,23 +25,28 @@ import {
   loadTestSuite,
   loadTsConfig,
   normalizeLineEndings,
+  prepareResultsRepoBranch,
   readTargetDefinitions,
   readTestSuiteMetadata,
   resolveFileReference,
+  resolveResultsRepoRunsDir,
   resolveTargetDefinition,
   runEvaluation,
   shouldEnableCache,
   shouldSkipCacheForTemperature,
+  stageResultsArtifacts,
   subscribeToCodexLogEntries,
   subscribeToCopilotCliLogEntries,
   subscribeToCopilotSdkLogEntries,
-  subscribeToPiLogEntries
-} from "./chunk-SE73HJZG.js";
+  subscribeToPiLogEntries,
+  syncResultsRepo,
+  toCamelCaseDeep
+} from "./chunk-FQGY6QXQ.js";
 // package.json
 var package_default = {
   name: "agentv",
-  version: "4.10.0",
+  version: "4.11.0",
   description: "CLI entry point for AgentV",
   type: "module",
   repository: {
@@ -249,7 +258,7 @@ async function discoverTargetsFile(options) {
 // src/commands/eval/run-eval.ts
 import { constants as constants4, mkdirSync } from "node:fs";
 import { access as access4 } from "node:fs/promises";
-import path15 from "node:path";
+import path17 from "node:path";
 import { pathToFileURL } from "node:url";
 // src/version-check.ts
@@ -306,45 +315,43 @@ async function promptContinue() {
   return confirm({ message: "Continue anyway?", default: false });
 }
-// src/commands/eval/artifact-writer.ts
-import { mkdir, readFile, writeFile } from "node:fs/promises";
-import path4 from "node:path";
+// src/commands/results/remote.ts
+import path6 from "node:path";
-// src/utils/case-conversion.ts
-function toSnakeCase(str) {
-  if (/^[A-Z]/.test(str)) {
-    return str;
-  }
-  return str.replace(/[A-Z]/g, (letter) => `_${letter.toLowerCase()}`);
-}
-function toSnakeCaseDeep(obj) {
-  if (obj === null || obj === void 0) {
-    return obj;
-  }
-  if (Array.isArray(obj)) {
-    return obj.map((item) => toSnakeCaseDeep(item));
-  }
-  if (typeof obj === "object") {
-    const result = {};
-    for (const [key, value] of Object.entries(obj)) {
-      const snakeKey = toSnakeCase(key);
-      result[snakeKey] = toSnakeCaseDeep(value);
-    }
-    return result;
-  }
-  return obj;
-}
+// src/commands/inspect/utils.ts
+import { readFileSync as readFileSync2, readdirSync, statSync as statSync2 } from "node:fs";
+import path5 from "node:path";
 // src/commands/eval/result-layout.ts
 import { existsSync, statSync } from "node:fs";
 import path3 from "node:path";
 var RESULT_INDEX_FILENAME = "index.jsonl";
 var RESULT_RUNS_DIRNAME = "runs";
+var DEFAULT_EXPERIMENT_NAME = "default";
+function normalizeExperimentName(experiment) {
+  const trimmed = experiment?.trim();
+  if (!trimmed) {
+    return DEFAULT_EXPERIMENT_NAME;
+  }
+  if (!/^[A-Za-z0-9._-]+$/.test(trimmed)) {
+    throw new Error(
+      `Invalid experiment name "${trimmed}". Use only letters, numbers, ".", "_" and "-".`
+    );
+  }
+  return trimmed;
+}
 function createRunDirName(timestamp = /* @__PURE__ */ new Date()) {
   return timestamp.toISOString().replace(/[:.]/g, "-");
 }
-function buildDefaultRunDir(cwd) {
-  return path3.join(cwd, ".agentv", "results", RESULT_RUNS_DIRNAME, createRunDirName());
+function buildDefaultRunDir(cwd, experiment, timestamp = /* @__PURE__ */ new Date()) {
+  return path3.join(
+    cwd,
+    ".agentv",
+    "results",
+    RESULT_RUNS_DIRNAME,
+    normalizeExperimentName(experiment),
+    createRunDirName(timestamp)
+  );
 }
 function resolveRunIndexPath(runDir) {
   return path3.join(runDir, RESULT_INDEX_FILENAME);
@@ -366,26 +373,794 @@ function isDirectoryPath(filePath) {
     return false;
   }
 }
-function resolveWorkspaceOrFilePath(filePath) {
-  if (!isDirectoryPath(filePath)) {
-    return filePath;
-  }
-  const existing = resolveExistingRunPrimaryPath(filePath);
-  if (!existing) {
-    throw new Error(`Result workspace is missing ${RESULT_INDEX_FILENAME}: ${filePath}`);
+function resolveWorkspaceOrFilePath(filePath) {
+  if (!isDirectoryPath(filePath)) {
+    return filePath;
+  }
+  const existing = resolveExistingRunPrimaryPath(filePath);
+  if (!existing) {
+    throw new Error(`Result workspace is missing ${RESULT_INDEX_FILENAME}: ${filePath}`);
+  }
+  return existing;
+}
+function resolveRunManifestPath(filePath) {
+  if (isDirectoryPath(filePath)) {
+    return resolveWorkspaceOrFilePath(filePath);
+  }
+  if (!isRunManifestPath(filePath)) {
+    throw new Error(
+      `Expected a run workspace directory or ${RESULT_INDEX_FILENAME} manifest: ${filePath}`
+    );
+  }
+  return filePath;
+}
+// src/commands/results/manifest.ts
+import { existsSync as existsSync2, readFileSync } from "node:fs";
+import path4 from "node:path";
+function parseJsonlLines(content) {
+  return content.split(/\r?\n/).map((line) => line.trim()).filter((line) => line.length > 0).map((line) => JSON.parse(line));
+}
+function parseMarkdownMessages(content) {
+  const trimmed = content.trim();
+  if (!trimmed.startsWith("@[")) {
+    return [];
+  }
+  const matches = [...trimmed.matchAll(/^@\[(.+?)\]:\n([\s\S]*?)(?=^@\[(.+?)\]:\n|\s*$)/gm)];
+  return matches.map((match) => ({
+    role: match[1],
+    content: match[2].trimEnd()
+  }));
+}
+function readOptionalText(baseDir, relativePath) {
+  if (!relativePath) {
+    return void 0;
+  }
+  const absolutePath = path4.join(baseDir, relativePath);
+  if (!existsSync2(absolutePath)) {
+    return void 0;
+  }
+  return readFileSync(absolutePath, "utf8");
+}
+function readOptionalJson(baseDir, relativePath) {
+  const text = readOptionalText(baseDir, relativePath);
+  if (!text) {
+    return void 0;
+  }
+  try {
+    return JSON.parse(text);
+  } catch {
+    return void 0;
+  }
+}
+function hydrateInput(baseDir, record) {
+  const inputText = readOptionalText(baseDir, record.input_path);
+  if (!inputText) {
+    return void 0;
+  }
+  const messages = parseMarkdownMessages(inputText);
+  return messages.length > 0 ? messages : [{ role: "user", content: inputText.trimEnd() }];
+}
+function hydrateOutput(baseDir, record) {
+  const responseText = readOptionalText(baseDir, record.output_path ?? record.response_path);
+  if (!responseText) {
+    return void 0;
+  }
+  const messages = parseMarkdownMessages(responseText);
+  if (messages.length > 0) {
+    return messages.map((message) => ({
+      role: message.role,
+      content: message.content
+    }));
+  }
+  return [{ role: "assistant", content: responseText.trimEnd() }];
+}
+function hydrateManifestRecord(baseDir, record) {
+  const grading = readOptionalJson(baseDir, record.grading_path);
+  const timing = readOptionalJson(baseDir, record.timing_path);
+  const testId = record.test_id ?? "unknown";
+  return {
+    timestamp: record.timestamp,
+    testId,
+    suite: record.suite,
+    category: record.category,
+    target: record.target,
+    score: record.score,
+    executionStatus: record.execution_status,
+    error: record.error,
+    assertions: grading?.assertions.map((assertion) => ({
+      text: assertion.text,
+      passed: assertion.passed,
+      evidence: assertion.evidence
+    })),
+    scores: grading?.evaluators?.map((evaluator) => ({
+      name: evaluator.name,
+      type: evaluator.type,
+      score: evaluator.score,
+      assertions: Array.isArray(evaluator.assertions) ? evaluator.assertions.map((assertion) => ({
+        text: String(assertion.text ?? ""),
+        passed: Boolean(assertion.passed),
+        evidence: typeof assertion.evidence === "string" ? String(assertion.evidence) : void 0
+      })) : void 0,
+      weight: typeof evaluator.weight === "number" ? evaluator.weight : void 0,
+      verdict: typeof evaluator.verdict === "string" ? evaluator.verdict : void 0,
+      details: evaluator.details
+    })) ?? record.scores,
+    tokenUsage: timing?.token_usage ? {
+      input: timing.token_usage.input,
+      output: timing.token_usage.output,
+      reasoning: timing.token_usage.reasoning
+    } : record.token_usage,
+    durationMs: timing?.duration_ms ?? record.duration_ms,
+    costUsd: record.cost_usd,
+    input: hydrateInput(baseDir, record),
+    output: hydrateOutput(baseDir, record)
+  };
+}
+function parseResultManifest(content) {
+  return parseJsonlLines(content);
+}
+function resolveResultSourcePath(source, cwd) {
+  const resolved = path4.isAbsolute(source) ? source : path4.resolve(cwd ?? process.cwd(), source);
+  if (isDirectoryPath(resolved) || path4.basename(resolved) === RESULT_INDEX_FILENAME) {
+    return resolveRunManifestPath(resolved);
+  }
+  return resolved;
+}
+function loadManifestResults(sourceFile) {
+  const resolvedSourceFile = resolveRunManifestPath(sourceFile);
+  const content = readFileSync(resolvedSourceFile, "utf8");
+  const records = parseResultManifest(content);
+  const baseDir = path4.dirname(resolvedSourceFile);
+  return records.map((record) => hydrateManifestRecord(baseDir, record));
+}
+function loadLightweightResults(sourceFile) {
+  const resolvedSourceFile = resolveRunManifestPath(sourceFile);
+  const content = readFileSync(resolvedSourceFile, "utf8");
+  return parseResultManifest(content).map((record) => ({
+    testId: record.test_id ?? "unknown",
+    suite: record.suite,
+    target: record.target,
+    experiment: record.experiment,
+    score: record.score,
+    scores: record.scores,
+    executionStatus: record.execution_status,
+    error: record.error,
+    timestamp: record.timestamp
+  }));
+}
+// src/commands/inspect/utils.ts
+var colors = {
+  reset: "\x1B[0m",
+  bold: "\x1B[1m",
+  dim: "\x1B[2m",
+  green: "\x1B[32m",
+  red: "\x1B[31m",
+  yellow: "\x1B[33m",
+  cyan: "\x1B[36m",
+  gray: "\x1B[90m"
+};
+var noColor = process.env.NO_COLOR !== void 0 || !process.stdout.isTTY;
+var c = noColor ? Object.fromEntries(Object.keys(colors).map((k) => [k, ""])) : colors;
+var ansiPattern = new RegExp(`${String.fromCharCode(27)}\\[[0-9;]*m`, "g");
+function stripAnsi(str) {
+  return str.replace(ansiPattern, "");
+}
+function padRight(str, len) {
+  const plainLen = stripAnsi(str).length;
+  return str + " ".repeat(Math.max(0, len - plainLen));
+}
+function padLeft(str, len) {
+  const plainLen = stripAnsi(str).length;
+  return " ".repeat(Math.max(0, len - plainLen)) + str;
+}
+function loadResultFile(filePath) {
+  const resolvedFilePath = resolveTraceResultPath(filePath);
+  if (path5.extname(resolvedFilePath) === ".json") {
+    return loadOtlpTraceFile(resolvedFilePath);
+  }
+  if (path5.basename(resolvedFilePath) === RESULT_INDEX_FILENAME) {
+    return loadManifestAsRawResults(resolvedFilePath);
+  }
+  return loadJsonlRecords(resolvedFilePath);
+}
+function resolveTraceResultPath(filePath) {
+  return resolveWorkspaceOrFilePath(filePath);
+}
+function loadJsonlRecords(filePath) {
+  const content = readFileSync2(filePath, "utf8");
+  const lines = content.trim().split("\n").filter((line) => line.trim());
+  return lines.map((line, i) => {
+    const record = JSON.parse(line);
+    if (typeof record.score !== "number") {
+      throw new Error(`Missing or invalid score in result at line ${i + 1}: ${line.slice(0, 100)}`);
+    }
+    return record;
+  });
+}
+function loadManifestAsRawResults(filePath) {
+  return loadManifestResults(filePath).map(toRawResult);
+}
+function toRawResult(result) {
+  return {
+    timestamp: result.timestamp,
+    test_id: result.testId,
+    suite: result.suite,
+    conversation_id: result.conversationId,
+    score: result.score,
+    assertions: result.assertions?.map((assertion) => ({
+      text: assertion.text,
+      passed: assertion.passed,
+      evidence: assertion.evidence
+    })),
+    target: result.target,
+    error: result.error,
+    scores: result.scores?.map((score) => ({
+      name: score.name,
+      type: score.type,
+      score: score.score,
+      assertions: score.assertions?.map((assertion) => ({
+        text: assertion.text,
+        passed: assertion.passed,
+        evidence: assertion.evidence
+      })),
+      weight: score.weight
+    })),
+    token_usage: result.tokenUsage ? {
+      input: result.tokenUsage.input,
+      output: result.tokenUsage.output,
+      cached: result.tokenUsage.cached
+    } : void 0,
+    cost_usd: result.costUsd,
+    duration_ms: result.durationMs,
+    start_time: result.startTime,
+    end_time: result.endTime,
+    input: result.input,
+    output: result.output,
+    file_changes: result.fileChanges
+  };
+}
+function loadOtlpTraceFile(filePath) {
+  const parsed = JSON.parse(readFileSync2(filePath, "utf8"));
+  const spans = parsed.resourceSpans?.flatMap((resource) => resource.scopeSpans ?? []).flatMap((scope) => scope.spans ?? []);
+  if (!spans || spans.length === 0) {
+    return [];
+  }
+  const spanMap = /* @__PURE__ */ new Map();
+  const childMap = /* @__PURE__ */ new Map();
+  for (const span of spans) {
+    if (!span.spanId) continue;
+    spanMap.set(span.spanId, span);
+    if (span.parentSpanId) {
+      const siblings = childMap.get(span.parentSpanId) ?? [];
+      siblings.push(span);
+      childMap.set(span.parentSpanId, siblings);
+    }
+  }
+  const roots = spans.filter((span) => !span.parentSpanId || !spanMap.has(span.parentSpanId));
+  const supportedRoots = roots.filter(isAgentvEvalRoot);
+  const candidateRoots = supportedRoots.length > 0 ? supportedRoots : roots;
+  return candidateRoots.map((root, index) => {
+    const descendants = collectChildSpans(root.spanId, childMap);
+    const rootAttrs = parseOtlpAttributes(root.attributes);
+    const parsedDescendants = descendants.map((span) => ({
+      ...span,
+      parsedAttributes: parseOtlpAttributes(span.attributes)
+    }));
+    const toolSpans = parsedDescendants.filter(
+      (span) => typeof span.parsedAttributes.gen_ai_tool_name === "string"
+    );
+    const llmSpans = parsedDescendants.filter(
+      (span) => span.parsedAttributes.gen_ai_operation_name === "chat" || typeof span.name === "string" && span.name.startsWith("chat ")
+    );
+    const tokenUsage = descendants.reduce(
+      (acc, span) => {
+        const attrs = parseOtlpAttributes(span.attributes);
+        acc.input += numberAttr(attrs.gen_ai_usage_input_tokens) ?? 0;
+        acc.output += numberAttr(attrs.gen_ai_usage_output_tokens) ?? 0;
+        const cached = numberAttr(attrs.gen_ai_usage_cache_read_input_tokens);
+        if (cached !== void 0 && cached > 0) {
+          acc.cached = (acc.cached ?? 0) + cached;
+        }
+        return acc;
+      },
+      { input: 0, output: 0, cached: void 0 }
+    );
+    const traceSummary = buildDerivedTraceSummary({
+      trace: {
+        event_count: numberAttr(rootAttrs.agentv_trace_event_count) ?? (toolSpans.length > 0 ? toolSpans.length : void 0),
+        tool_calls: countRawSpanNames(
+          toolSpans.map((span) => ({
+            type: "tool",
+            name: String(span.parsedAttributes.gen_ai_tool_name)
+          }))
+        ),
+        error_count: descendants.filter((span) => span.status?.code === 2).length || void 0,
+        llm_call_count: numberAttr(rootAttrs.agentv_trace_llm_call_count) ?? (llmSpans.length > 0 ? llmSpans.length : void 0)
+      },
+      spans: [
+        ...llmSpans.map((span) => ({
+          type: "llm",
+          name: span.name ?? "chat",
+          duration_ms: durationFromSpan(span)
+        })),
+        ...toolSpans.map((span) => ({
+          type: "tool",
+          name: String(span.parsedAttributes.gen_ai_tool_name),
+          duration_ms: durationFromSpan(span)
+        }))
+      ],
+      duration_ms: numberAttr(rootAttrs.agentv_trace_duration_ms) ?? durationFromSpan(root),
+      cost_usd: numberAttr(rootAttrs.agentv_trace_cost_usd),
+      token_usage: tokenUsage.input || tokenUsage.output || tokenUsage.cached || numberAttr(rootAttrs.agentv_trace_token_input) || numberAttr(rootAttrs.agentv_trace_token_output) || numberAttr(rootAttrs.agentv_trace_token_cached) ? {
+        input: tokenUsage.input || numberAttr(rootAttrs.agentv_trace_token_input) || 0,
+        output: tokenUsage.output || numberAttr(rootAttrs.agentv_trace_token_output) || 0,
+        ...tokenUsage.cached || numberAttr(rootAttrs.agentv_trace_token_cached) ? {
+          cached: tokenUsage.cached || numberAttr(rootAttrs.agentv_trace_token_cached) || 0
+        } : {}
+      } : void 0
+    });
+    const score = numberAttr(rootAttrs.agentv_score);
+    if (score === void 0) {
+      throw new Error(
+        `Unsupported OTLP trace root span at index ${index + 1}: missing agentv.score attribute`
+      );
+    }
+    return {
+      test_id: stringAttr(rootAttrs.agentv_test_id) ?? stringAttr(rootAttrs.agentv_eval_id) ?? `trace-${index + 1}`,
+      suite: stringAttr(rootAttrs.agentv_suite),
+      target: stringAttr(rootAttrs.agentv_target),
+      score,
+      error: root.status?.code === 2 ? root.status.message : void 0,
+      cost_usd: traceSummary?.cost_usd,
+      duration_ms: traceSummary?.duration_ms,
+      token_usage: traceSummary?.token_usage,
+      trace: traceSummary ? {
+        event_count: traceSummary.event_count,
+        tool_calls: traceSummary.tool_calls,
+        error_count: traceSummary.error_count,
+        tool_durations: traceSummary.tool_durations,
+        llm_call_count: traceSummary.llm_call_count,
+        token_usage: traceSummary.token_usage,
+        cost_usd: traceSummary.cost_usd,
+        duration_ms: traceSummary.duration_ms
+      } : void 0,
+      spans: traceSummary?.spans,
+      output: stringAttr(rootAttrs.agentv_output_text),
+      scores: root.events?.filter(
+        (event) => event.name?.startsWith("agentv.grader.") || event.name?.startsWith("agentv.evaluator.")
+      ).map((event) => {
+        const attrs = parseOtlpAttributes(event.attributes);
+        const name = event.name?.replace(/^agentv\.grader\./, "").replace(/^agentv\.evaluator\./, "") ?? "unknown";
+        return {
+          name,
+          type: stringAttr(attrs.agentv_grader_type) ?? stringAttr(attrs.agentv_evaluator_type) ?? "unknown",
+          score: numberAttr(attrs.agentv_grader_score) ?? numberAttr(attrs.agentv_evaluator_score) ?? 0
+        };
+      })
+    };
+  });
+}
+function isAgentvEvalRoot(span) {
+  const attrs = parseOtlpAttributes(span.attributes);
+  return span.name === "agentv.eval" || numberAttr(attrs.agentv_score) !== void 0 || typeof stringAttr(attrs.agentv_test_id) === "string";
+}
+function collectChildSpans(spanId, childMap) {
+  if (!spanId) return [];
+  const direct = childMap.get(spanId) ?? [];
+  const all = [...direct];
+  for (const child of direct) {
+    all.push(...collectChildSpans(child.spanId, childMap));
+  }
+  return all;
+}
+function parseOtlpAttributes(attributes) {
+  const parsed = {};
+  for (const attribute of attributes ?? []) {
+    parsed[attribute.key.replace(/\./g, "_")] = parseOtlpValue(attribute.value);
+  }
+  return parsed;
+}
+function parseOtlpValue(value) {
+  if (!value) return void 0;
+  if ("stringValue" in value && value.stringValue !== void 0) return value.stringValue;
+  if ("intValue" in value && value.intValue !== void 0) return Number(value.intValue);
+  if ("doubleValue" in value && value.doubleValue !== void 0) return value.doubleValue;
+  if ("boolValue" in value && value.boolValue !== void 0) return value.boolValue;
+  if ("arrayValue" in value)
+    return (value.arrayValue?.values ?? []).map((entry) => parseOtlpValue(entry));
+  return void 0;
+}
+function durationFromSpan(span) {
+  const start = Number(span.startTimeUnixNano);
+  const end = Number(span.endTimeUnixNano);
+  if (!Number.isFinite(start) || !Number.isFinite(end)) return void 0;
+  return Math.round((end - start) / 1e6);
+}
+function stringAttr(value) {
+  return typeof value === "string" ? value : void 0;
+}
+function numberAttr(value) {
+  return typeof value === "number" && Number.isFinite(value) ? value : void 0;
+}
+function buildDerivedTraceSummary(result) {
+  const toolSpans = (result.spans ?? []).filter((span) => span.type === "tool");
+  const llmSpans = (result.spans ?? []).filter((span) => span.type === "llm");
+  const toolCalls = result.trace?.tool_calls ?? countRawSpanNames(toolSpans);
+  const toolDurations = result.trace?.tool_durations ?? groupRawSpanDurations(toolSpans);
+  const hasSpanData = (result.spans?.length ?? 0) > 0;
+  const eventCount = result.trace?.event_count ?? (hasSpanData ? toolSpans.length : void 0);
+  const llmCallCount = result.trace?.llm_call_count ?? (hasSpanData ? llmSpans.length : void 0);
+  if (!result.trace && !result.spans?.length && result.token_usage === void 0 && result.cost_usd === void 0 && result.duration_ms === void 0) {
+    return void 0;
+  }
+  return {
+    event_count: eventCount,
+    tool_calls: toolCalls,
+    error_count: result.trace?.error_count,
+    tool_durations: toolDurations,
+    llm_call_count: llmCallCount,
+    token_usage: result.trace?.token_usage ?? result.token_usage,
+    cost_usd: result.trace?.cost_usd ?? result.cost_usd,
+    duration_ms: result.trace?.duration_ms ?? result.duration_ms,
+    spans: result.spans
+  };
+}
+function countRawSpanNames(spans) {
+  const counts = {};
+  for (const span of spans) {
+    counts[span.name] = (counts[span.name] ?? 0) + 1;
+  }
+  return Object.keys(counts).length > 0 ? counts : void 0;
+}
+function groupRawSpanDurations(spans) {
+  const grouped = {};
+  for (const span of spans) {
+    if (span.duration_ms === void 0) continue;
+    const existing = grouped[span.name] ?? [];
+    existing.push(span.duration_ms);
+    grouped[span.name] = existing;
+  }
+  return Object.keys(grouped).length > 0 ? grouped : void 0;
+}
+function getTraceSummary(result) {
+  const derived = buildDerivedTraceSummary(result);
+  if (!derived) return void 0;
+  const { spans: _spans, ...trace } = derived;
+  return trace;
+}
+function getTraceSpans(result) {
+  return buildDerivedTraceSummary(result)?.spans ?? [];
+}
+function toTraceSummary(result) {
+  const rawTrace = getTraceSummary(result);
+  if (!rawTrace) return void 0;
+  return toCamelCaseDeep(rawTrace);
+}
+function buildRunId(relativeRunPath) {
+  const normalized = relativeRunPath.split(path5.sep).join("/");
+  const segments = normalized.split("/").filter(Boolean);
+  if (segments.length >= 2) {
+    const experiment = segments.slice(0, -1).join("/");
+    const timestamp = segments.at(-1);
+    if (experiment === "default") {
+      return timestamp ?? normalized;
+    }
+    return `${experiment}::${timestamp}`;
+  }
+  return segments[0];
+}
+function collectRunManifestPaths(runsDir, currentDir, files) {
+  const primaryPath = resolveExistingRunPrimaryPath(currentDir);
+  if (primaryPath) {
+    const relativeRunPath = path5.relative(runsDir, currentDir);
+    files.push({
+      filePath: primaryPath,
+      displayName: path5.basename(currentDir),
+      runId: buildRunId(relativeRunPath)
+    });
+    return;
+  }
+  const entries = readdirSync(currentDir, { withFileTypes: true });
+  for (const entry of entries) {
+    if (entry.isDirectory()) {
+      collectRunManifestPaths(runsDir, path5.join(currentDir, entry.name), files);
+    }
+  }
+}
+function listResultFilesFromRunsDir(runsDir, limit) {
+  const files = [];
+  try {
+    const entries = readdirSync(runsDir, { withFileTypes: true });
+    for (const entry of entries) {
+      if (entry.isDirectory()) {
+        collectRunManifestPaths(runsDir, path5.join(runsDir, entry.name), files);
+      }
+    }
+  } catch {
+  }
+  files.sort((a, b) => b.displayName.localeCompare(a.displayName));
+  const limited = limit !== void 0 && limit > 0 ? files.slice(0, limit) : files;
+  const metas = [];
+  for (const { filePath, displayName, runId } of limited) {
+    try {
+      const fileStat = statSync2(filePath);
+      const results = loadResultFile(filePath);
+      const testCount = results.length;
+      const passCount = results.filter((r) => r.score >= DEFAULT_THRESHOLD).length;
+      const passRate = testCount > 0 ? passCount / testCount : 0;
+      const avgScore = testCount > 0 ? results.reduce((sum, r) => sum + r.score, 0) / testCount : 0;
+      const filenameTimestamp = extractTimestampFromFilename(displayName);
+      const timestamp = filenameTimestamp ?? results[0]?.timestamp ?? "unknown";
+      metas.push({
+        path: filePath,
+        filename: runId,
+        displayName,
+        timestamp,
+        testCount,
+        passRate,
+        avgScore,
+        sizeBytes: fileStat.size
+      });
+    } catch {
+    }
+  }
+  return metas;
+}
+function listResultFiles(cwd, limit) {
+  return listResultFilesFromRunsDir(
+    path5.join(cwd, ".agentv", "results", RESULT_RUNS_DIRNAME),
+    limit
+  );
+}
+function extractTimestampFromFilename(filename) {
+  const match = filename.match(
+    /(?:^|eval_)(\d{4}-\d{2}-\d{2}T\d{2}-\d{2}-\d{2}-\d{3}Z)(?:\.jsonl)?$/
+  );
+  if (!match) return void 0;
+  return match[1].replace(/-(\d{2})-(\d{2})-(\d{3})Z$/, ":$1:$2.$3Z");
+}
+function formatNumber(n) {
+  return n.toLocaleString();
+}
+function formatDuration(ms) {
+  if (ms < 1e3) return `${Math.round(ms)}ms`;
+  if (ms < 6e4) return `${(ms / 1e3).toFixed(1)}s`;
+  const minutes = Math.floor(ms / 6e4);
+  const seconds = (ms % 6e4 / 1e3).toFixed(0);
+  return `${minutes}m${seconds}s`;
+}
+function formatCost(usd) {
+  if (usd < 0.01) return `$${usd.toFixed(4)}`;
+  return `$${usd.toFixed(3)}`;
+}
+function formatSize(bytes) {
+  if (bytes < 1024) return `${bytes}B`;
+  if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(1)}KB`;
+  return `${(bytes / (1024 * 1024)).toFixed(1)}MB`;
+}
+function formatScore(score) {
+  return `${(score * 100).toFixed(0)}%`;
+}
+// src/commands/results/remote.ts
+var REMOTE_RUN_PREFIX = "remote::";
+var SIZE_WARNING_BYTES = 10 * 1024 * 1024;
+function getStatusMessage(error) {
+  return error instanceof Error ? error.message : String(error);
+}
+function normalizeResultsExportConfig(config) {
+  return {
+    repo: config.repo,
+    path: config.path,
+    auto_push: config.auto_push === true,
+    branch_prefix: config.branch_prefix?.trim() || "eval-results"
+  };
+}
+function slugify(value) {
+  return value.trim().replace(/[^A-Za-z0-9._/-]+/g, "-").replace(/\/+/g, "/").replace(/^-+|-+$/g, "").slice(0, 120);
+}
+function getRelativeRunPath(cwd, runDir) {
+  const relative = path6.relative(path6.join(cwd, ".agentv", "results", "runs"), runDir);
+  if (!relative.startsWith("..") && !path6.isAbsolute(relative)) {
+    return relative;
+  }
+  const experiment = path6.basename(path6.dirname(runDir));
+  const runName = path6.basename(runDir);
+  return experiment && experiment !== runName ? path6.join(experiment, runName) : runName;
+}
+function buildBranchName(config, payload) {
+  const timestamp = path6.basename(payload.run_dir);
+  const evalStem = payload.test_files.length === 1 ? path6.basename(payload.test_files[0]).replace(/\.eval\.ya?ml$/i, "").replace(/\.[^.]+$/i, "") : `${payload.test_files.length}-evals`;
+  const experiment = slugify(payload.experiment ?? "default");
+  const branchLeaf = slugify(`${experiment}-${evalStem}-${timestamp}`) || timestamp;
+  return `${config.branch_prefix}/${branchLeaf}`;
+}
+function buildCommitTitle(payload) {
+  const passed = payload.results.filter((result) => result.score >= DEFAULT_THRESHOLD).length;
+  const avgScore = payload.results.length > 0 ? payload.results.reduce((sum, result) => sum + result.score, 0) / payload.results.length : 0;
+  const experiment = payload.experiment ?? "default";
+  return `feat(results): ${experiment} - ${passed}/${payload.results.length} PASS (${avgScore.toFixed(3)})`;
+}
+function buildPrBody(payload) {
+  const sections = payload.eval_summaries.map((summary) => {
+    const table = summary.results.map((result) => `| ${result.test_id} | ${result.score.toFixed(3)} | ${result.status} |`).join("\n");
+    return [
+      `### ${summary.eval_file}`,
+      "",
+      `Summary: ${summary.passed}/${summary.total} PASS (${summary.avg_score.toFixed(3)})`,
+      "",
+      "| Test | Score | Status |",
+      "|---|---|---|",
+      table || "| (no results) | 0.000 | ERROR |"
+    ].join("\n");
+  }).join("\n\n");
+  return [
+    "## Results",
+    "",
+    sections,
+    "",
+    `Run: ${path6.basename(payload.run_dir)}`,
+    `Experiment: ${payload.experiment ?? "default"}`,
+    `Eval Files: ${payload.test_files.join(", ")}`
+  ].join("\n");
+}
+async function maybeWarnLargeArtifact(runDir) {
+  const sizeBytes = await directorySizeBytes(runDir);
+  if (sizeBytes > SIZE_WARNING_BYTES) {
+    console.warn(
+      `Warning: run artifacts total ${(sizeBytes / (1024 * 1024)).toFixed(1)}MB. Export will continue.`
+    );
+  }
+}
+async function loadNormalizedResultsConfig(cwd) {
+  const repoRoot = await findRepoRoot(cwd) ?? cwd;
+  const config = await loadConfig(path6.join(cwd, "_"), repoRoot);
+  if (!config?.results?.export) {
+    return void 0;
+  }
+  return normalizeResultsExportConfig(config.results.export);
+}
+function encodeRemoteRunId(filename) {
+  return `${REMOTE_RUN_PREFIX}${filename}`;
+}
+async function getRemoteResultsStatus(cwd) {
+  const config = await loadNormalizedResultsConfig(cwd);
+  const status = getResultsRepoStatus(config);
+  const runCount = config && status.available ? listResultFilesFromRunsDir(resolveResultsRepoRunsDir(config)).length : 0;
+  return {
+    ...status,
+    run_count: runCount
+  };
+}
+async function syncRemoteResults(cwd) {
+  const config = await loadNormalizedResultsConfig(cwd);
+  if (!config) {
+    return {
+      ...getResultsRepoStatus(),
+      run_count: 0
+    };
+  }
+  try {
+    await syncResultsRepo(config);
+  } catch (error) {
+    return {
+      ...getResultsRepoStatus(config),
+      run_count: 0,
+      last_error: getStatusMessage(error)
+    };
+  }
+  return getRemoteResultsStatus(cwd);
+}
+async function listMergedResultFiles(cwd, limit) {
+  const localRuns = listResultFiles(cwd).map(
+    (meta) => ({
+      ...meta,
+      source: "local",
+      raw_filename: meta.filename
+    })
+  );
+  const remoteStatus = await getRemoteResultsStatus(cwd);
+  const config = await loadNormalizedResultsConfig(cwd);
+  if (!config || !remoteStatus.available) {
+    return {
+      runs: limit !== void 0 && limit > 0 ? localRuns.slice(0, limit) : localRuns,
+      remote_status: remoteStatus
+    };
+  }
+  const remoteRuns = listResultFilesFromRunsDir(resolveResultsRepoRunsDir(config)).map(
+    (meta) => ({
+      ...meta,
+      filename: encodeRemoteRunId(meta.filename),
+      raw_filename: meta.filename,
+      source: "remote"
+    })
+  );
+  const merged = [...localRuns, ...remoteRuns].sort(
+    (a, b) => b.timestamp.localeCompare(a.timestamp)
+  );
+  return {
+    runs: limit !== void 0 && limit > 0 ? merged.slice(0, limit) : merged,
+    remote_status: remoteStatus
+  };
+}
+async function findRunById(cwd, runId) {
+  const { runs } = await listMergedResultFiles(cwd);
+  return runs.find((run) => run.filename === runId);
+}
+async function maybeAutoExportRunArtifacts(payload) {
+  const config = await loadNormalizedResultsConfig(payload.cwd);
+  if (!config?.auto_push) {
+    return;
+  }
+  try {
+    await maybeWarnLargeArtifact(payload.run_dir);
+    const branchName = buildBranchName(config, payload);
+    const prepared = await prepareResultsRepoBranch(config, branchName);
+    try {
+      const relativeRunPath = getRelativeRunPath(payload.cwd, payload.run_dir);
+      const destinationDir = path6.join(prepared.repoDir, config.path, relativeRunPath);
+      await stageResultsArtifacts({
+        repoDir: prepared.repoDir,
+        sourceDir: payload.run_dir,
+        destinationDir
+      });
+      const commitTitle = buildCommitTitle(payload);
+      const changed = await commitAndPushResultsBranch({
+        repoDir: prepared.repoDir,
+        branchName,
+        commitMessage: commitTitle
+      });
+      if (!changed) {
+        console.warn("Warning: results export produced no git changes. Skipping PR creation.");
+        return;
+      }
+      const prUrl = await createDraftResultsPr({
+        repo: config.repo,
+        repoDir: prepared.repoDir,
+        baseBranch: prepared.baseBranch,
+        branchName,
+        title: commitTitle,
+        body: buildPrBody(payload)
+      });
+      console.log(`Remote results draft PR created: ${prUrl}`);
+    } finally {
+      await prepared.cleanup();
+    }
+  } catch (error) {
+    console.warn(`Warning: skipping results export: ${getStatusMessage(error)}`);
+    console.warn("Warning: Run 'gh auth login' if GitHub authentication is missing.");
+  }
+}
+// src/commands/eval/artifact-writer.ts
+import { mkdir, readFile, writeFile } from "node:fs/promises";
+import path7 from "node:path";
+// src/utils/case-conversion.ts
+function toSnakeCase(str) {
+  if (/^[A-Z]/.test(str)) {
+    return str;
   }
-  return existing;
+  return str.replace(/[A-Z]/g, (letter) => `_${letter.toLowerCase()}`);
 }
-function resolveRunManifestPath(filePath) {
-  if (isDirectoryPath(filePath)) {
-    return resolveWorkspaceOrFilePath(filePath);
+function toSnakeCaseDeep(obj) {
+  if (obj === null || obj === void 0) {
+    return obj;
   }
-  if (!isRunManifestPath(filePath)) {
-    throw new Error(
-      `Expected a run workspace directory or ${RESULT_INDEX_FILENAME} manifest: ${filePath}`
-    );
+  if (Array.isArray(obj)) {
+    return obj.map((item) => toSnakeCaseDeep(item));
   }
-  return filePath;
+  if (typeof obj === "object") {
+    const result = {};
+    for (const [key, value] of Object.entries(obj)) {
+      const snakeKey = toSnakeCase(key);
+      result[snakeKey] = toSnakeCaseDeep(value);
+    }
+    return result;
+  }
+  return obj;
 }
 // src/commands/eval/artifact-writer.ts
@@ -524,7 +1299,7 @@ function buildTimingArtifact(results) {
     }
   };
 }
-function buildBenchmarkArtifact(results, evalFile = "") {
+function buildBenchmarkArtifact(results, evalFile = "", experiment) {
   const targetSet = /* @__PURE__ */ new Set();
   const testIdSet = /* @__PURE__ */ new Set();
   for (const result of results) {
@@ -549,7 +1324,7 @@ function buildBenchmarkArtifact(results, evalFile = "") {
       tokens: computeStats(tokens)
     };
     const toolCallCounts = targetResults.map((r) => countToolCalls(r).total);
-    if (toolCallCounts.some((c) => c > 0)) {
+    if (toolCallCounts.some((c2) => c2 > 0)) {
       entry.tool_calls = computeStats(toolCallCounts);
     }
     const costs = targetResults.filter((r) => r.costUsd != null).map((r) => r.costUsd);
@@ -595,7 +1370,8 @@ function buildBenchmarkArtifact(results, evalFile = "") {
       eval_file: evalFile,
       timestamp,
       targets,
-      tests_run: testIds
+      tests_run: testIds,
+      experiment
     },
     run_summary: runSummary,
     per_grader_summary: perEvaluatorSummary,
@@ -622,7 +1398,7 @@ function buildArtifactSubdir(result) {
     segments.push(safeArtifactPathSegment(evalSet, "default"));
   }
   segments.push(safeTestId(result.testId));
-  return path4.posix.join(...segments);
+  return path7.posix.join(...segments);
 }
 function formatOutputMarkdown(output) {
   return output.map((msg) => `@[${msg.role}]:
@@ -655,11 +1431,11 @@ function buildResultIndexArtifact(result) {
     failure_stage: result.failureStage,
     failure_reason_code: result.failureReasonCode,
     workspace_path: result.workspacePath,
-    grading_path: path4.posix.join(artifactSubdir, "grading.json"),
-    timing_path: path4.posix.join(artifactSubdir, "timing.json"),
-    input_path: input ? path4.posix.join(artifactSubdir, "input.md") : void 0,
-    output_path: hasResponse ? path4.posix.join(artifactSubdir, "outputs", "response.md") : void 0,
-    response_path: hasResponse ? path4.posix.join(artifactSubdir, "outputs", "response.md") : void 0
+    grading_path: path7.posix.join(artifactSubdir, "grading.json"),
+    timing_path: path7.posix.join(artifactSubdir, "timing.json"),
+    input_path: input ? path7.posix.join(artifactSubdir, "input.md") : void 0,
+    output_path: hasResponse ? path7.posix.join(artifactSubdir, "outputs", "response.md") : void 0,
+    response_path: hasResponse ? path7.posix.join(artifactSubdir, "outputs", "response.md") : void 0
   };
 }
 async function writeJsonlFile(filePath, records) {
@@ -669,18 +1445,18 @@ async function writeJsonlFile(filePath, records) {
 }
 async function writeArtifactsFromResults(results, outputDir, options) {
   const testArtifactDir = outputDir;
-  const timingPath = path4.join(outputDir, "timing.json");
-  const benchmarkPath = path4.join(outputDir, "benchmark.json");
-  const indexPath = path4.join(outputDir, RESULT_INDEX_FILENAME);
+  const timingPath = path7.join(outputDir, "timing.json");
+  const benchmarkPath = path7.join(outputDir, "benchmark.json");
+  const indexPath = path7.join(outputDir, RESULT_INDEX_FILENAME);
   await mkdir(outputDir, { recursive: true });
   const indexRecords = [];
   for (const result of results) {
     const grading = buildGradingArtifact(result);
     const timing2 = buildTimingArtifact([result]);
     const artifactSubdir = buildArtifactSubdir(result);
-    const testDir = path4.join(outputDir, artifactSubdir);
-    const gradingPath = path4.join(testDir, "grading.json");
-    const perTestTimingPath = path4.join(testDir, "timing.json");
+    const testDir = path7.join(outputDir, artifactSubdir);
+    const gradingPath = path7.join(testDir, "grading.json");
+    const perTestTimingPath = path7.join(testDir, "timing.json");
     await mkdir(testDir, { recursive: true });
     await writeFile(gradingPath, `${JSON.stringify(grading, null, 2)}
 `, "utf8");
@@ -688,23 +1464,26 @@ async function writeArtifactsFromResults(results, outputDir, options) {
 `, "utf8");
     const input = extractInput(result);
     if (input) {
-      await writeFile(path4.join(testDir, "input.md"), input, "utf8");
+      await writeFile(path7.join(testDir, "input.md"), input, "utf8");
     }
     if (result.output && result.output.length > 0) {
-      const outputsDir = path4.join(testDir, "outputs");
+      const outputsDir = path7.join(testDir, "outputs");
       await mkdir(outputsDir, { recursive: true });
       await writeFile(
-        path4.join(outputsDir, "response.md"),
+        path7.join(outputsDir, "response.md"),
         formatOutputMarkdown(result.output),
         "utf8"
       );
     }
-    indexRecords.push(buildResultIndexArtifact(result));
+    indexRecords.push({
+      ...buildResultIndexArtifact(result),
+      experiment: options?.experiment
+    });
   }
   const timing = buildTimingArtifact(results);
   await writeFile(timingPath, `${JSON.stringify(timing, null, 2)}
 `, "utf8");
-  const benchmark = buildBenchmarkArtifact(results, options?.evalFile);
+  const benchmark = buildBenchmarkArtifact(results, options?.evalFile, options?.experiment);
   await writeFile(benchmarkPath, `${JSON.stringify(benchmark, null, 2)}
 `, "utf8");
   await writeJsonlFile(indexPath, indexRecords);
@@ -758,13 +1537,13 @@ async function writeBenchmarkJson(outputPath, results) {
 // src/commands/eval/env.ts
 import { constants as constants3 } from "node:fs";
 import { access as access3 } from "node:fs/promises";
-import path5 from "node:path";
+import path8 from "node:path";
 import { config as loadDotenv } from "dotenv";
 function uniqueDirs(directories) {
   const seen = /* @__PURE__ */ new Set();
   const result = [];
   for (const dir of directories) {
-    const absolute = path5.resolve(dir);
+    const absolute = path8.resolve(dir);
     if (seen.has(absolute)) {
       continue;
     }
@@ -783,14 +1562,14 @@ async function fileExists2(filePath) {
 }
 function collectAncestorDirectories(start, boundary) {
   const directories = [];
-  const boundaryDir = path5.resolve(boundary);
-  let current = path5.resolve(start);
+  const boundaryDir = path8.resolve(boundary);
+  let current = path8.resolve(start);
   while (current !== void 0) {
     directories.push(current);
     if (current === boundaryDir) {
       break;
     }
-    const parent = path5.dirname(current);
+    const parent = path8.dirname(current);
     if (parent === current) {
       break;
     }
@@ -800,12 +1579,12 @@ function collectAncestorDirectories(start, boundary) {
 }
 async function loadEnvFromHierarchy(options) {
   const { testFilePath, repoRoot, verbose } = options;
-  const testDir = path5.dirname(path5.resolve(testFilePath));
+  const testDir = path8.dirname(path8.resolve(testFilePath));
   const cwd = process.cwd();
   const searchDirs = uniqueDirs([...collectAncestorDirectories(testDir, repoRoot), repoRoot, cwd]);
   const envFiles = [];
   for (const dir of searchDirs) {
-    const candidate = path5.join(dir, ".env");
+    const candidate = path8.join(dir, ".env");
     if (await fileExists2(candidate)) {
       envFiles.push(candidate);
     }
@@ -827,11 +1606,11 @@ async function loadEnvFromHierarchy(options) {
 }
 // src/commands/eval/output-writer.ts
-import path11 from "node:path";
+import path14 from "node:path";
 // src/commands/eval/html-writer.ts
 import { mkdir as mkdir2, writeFile as writeFile3 } from "node:fs/promises";
-import path6 from "node:path";
+import path9 from "node:path";
 // ../../node_modules/.bun/async-mutex@0.5.0/node_modules/async-mutex/index.mjs
 var E_TIMEOUT = new Error("timeout while waiting for mutex to become available");
@@ -1050,7 +1829,7 @@ var HtmlWriter = class _HtmlWriter {
     this.filePath = filePath;
   }
   static async open(filePath) {
-    await mkdir2(path6.dirname(filePath), { recursive: true });
+    await mkdir2(path9.dirname(filePath), { recursive: true });
     const writer = new _HtmlWriter(filePath);
     await writer.writeHtml();
     return writer;
@@ -1561,7 +2340,7 @@ var SCRIPT = `
 // src/commands/eval/json-writer.ts
 import { mkdir as mkdir3, writeFile as writeFile4 } from "node:fs/promises";
-import path7 from "node:path";
+import path10 from "node:path";
 var JsonWriter = class _JsonWriter {
   filePath;
   results = [];
@@ -1570,7 +2349,7 @@ var JsonWriter = class _JsonWriter {
     this.filePath = filePath;
   }
   static async open(filePath) {
-    await mkdir3(path7.dirname(filePath), { recursive: true });
+    await mkdir3(path10.dirname(filePath), { recursive: true });
     return new _JsonWriter(filePath);
   }
   async append(result) {
@@ -1605,7 +2384,7 @@ var JsonWriter = class _JsonWriter {
 // src/commands/eval/jsonl-writer.ts
 import { createWriteStream } from "node:fs";
 import { mkdir as mkdir4 } from "node:fs/promises";
-import path8 from "node:path";
+import path11 from "node:path";
 import { finished } from "node:stream/promises";
 var JsonlWriter = class _JsonlWriter {
   stream;
@@ -1615,7 +2394,7 @@ var JsonlWriter = class _JsonlWriter {
     this.stream = stream;
   }
   static async open(filePath) {
-    await mkdir4(path8.dirname(filePath), { recursive: true });
+    await mkdir4(path11.dirname(filePath), { recursive: true });
     const stream = createWriteStream(filePath, { flags: "w", encoding: "utf8" });
     return new _JsonlWriter(stream);
   }
@@ -1647,7 +2426,7 @@ var JsonlWriter = class _JsonlWriter {
 // src/commands/eval/junit-writer.ts
 import { mkdir as mkdir5, writeFile as writeFile5 } from "node:fs/promises";
-import path9 from "node:path";
+import path12 from "node:path";
 function escapeXml(str) {
   return str.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;").replace(/'/g, "&apos;");
 }
@@ -1661,7 +2440,7 @@ var JunitWriter = class _JunitWriter {
     this.threshold = options?.threshold ?? 0.5;
   }
   static async open(filePath, options) {
-    await mkdir5(path9.dirname(filePath), { recursive: true });
+    await mkdir5(path12.dirname(filePath), { recursive: true });
     return new _JunitWriter(filePath, options);
   }
   async append(result) {
@@ -1737,7 +2516,7 @@ ${suiteXmls.join("\n")}
 // src/commands/eval/yaml-writer.ts
 import { createWriteStream as createWriteStream2 } from "node:fs";
 import { mkdir as mkdir6 } from "node:fs/promises";
-import path10 from "node:path";
+import path13 from "node:path";
 import { finished as finished2 } from "node:stream/promises";
 import { stringify as stringifyYaml } from "yaml";
 var YamlWriter = class _YamlWriter {
@@ -1749,7 +2528,7 @@ var YamlWriter = class _YamlWriter {
     this.stream = stream;
   }
   static async open(filePath) {
-    await mkdir6(path10.dirname(filePath), { recursive: true });
+    await mkdir6(path13.dirname(filePath), { recursive: true });
     const stream = createWriteStream2(filePath, { flags: "w", encoding: "utf8" });
     return new _YamlWriter(stream);
   }
@@ -1805,7 +2584,7 @@ async function createOutputWriter(filePath, format) {
 }
 var SUPPORTED_EXTENSIONS = /* @__PURE__ */ new Set([".jsonl", ".json", ".xml", ".yaml", ".yml", ".html", ".htm"]);
 function createWriterFromPath(filePath, options) {
-  const ext = path11.extname(filePath).toLowerCase();
+  const ext = path14.extname(filePath).toLowerCase();
   switch (ext) {
     case ".jsonl":
       return JsonlWriter.open(filePath);
@@ -1838,10 +2617,10 @@ function useColors() {
 }
 function formatVerdict(score, verdict) {
   if (verdict === void 0) return "";
-  const colors = useColors();
+  const colors2 = useColors();
   const scoreStr = score !== void 0 ? score.toFixed(3) : "";
   const verdictLabel = verdict === "ERROR" ? "ERROR" : `${scoreStr} ${verdict}`;
-  if (!colors) return ` | ${verdictLabel}`;
+  if (!colors2) return ` | ${verdictLabel}`;
   const color = verdict === "PASS" ? ANSI_GREEN : verdict === "FAIL" ? ANSI_RED2 : ANSI_YELLOW2;
   return ` | ${color}${ANSI_BOLD}${verdictLabel}${ANSI_RESET2}`;
 }
@@ -1901,12 +2680,12 @@ var ProgressDisplay = class {
   }
   addLogPaths(paths, provider) {
     const newPaths = [];
-    for (const path17 of paths) {
-      if (this.logPathSet.has(path17)) {
+    for (const path19 of paths) {
+      if (this.logPathSet.has(path19)) {
         continue;
       }
-      this.logPathSet.add(path17);
-      newPaths.push(path17);
+      this.logPathSet.add(path19);
+      newPaths.push(path19);
     }
     if (newPaths.length === 0) {
       return;
@@ -1919,8 +2698,8 @@ var ProgressDisplay = class {
       this.hasPrintedLogHeader = true;
     }
     const startIndex = this.logPaths.length - newPaths.length;
-    newPaths.forEach((path17, offset) => {
-      console.log(`${startIndex + offset + 1}. ${path17}`);
+    newPaths.forEach((path19, offset) => {
+      console.log(`${startIndex + offset + 1}. ${path19}`);
     });
   }
   finish() {
@@ -1931,149 +2710,34 @@ var ProgressDisplay = class {
   }
 };
-// src/commands/results/manifest.ts
-import { existsSync as existsSync2, readFileSync } from "node:fs";
-import path12 from "node:path";
-function parseJsonlLines(content) {
-  return content.split(/\r?\n/).map((line) => line.trim()).filter((line) => line.length > 0).map((line) => JSON.parse(line));
-}
-function parseMarkdownMessages(content) {
-  const trimmed = content.trim();
-  if (!trimmed.startsWith("@[")) {
-    return [];
-  }
-  const matches = [...trimmed.matchAll(/^@\[(.+?)\]:\n([\s\S]*?)(?=^@\[(.+?)\]:\n|\s*$)/gm)];
-  return matches.map((match) => ({
-    role: match[1],
-    content: match[2].trimEnd()
-  }));
-}
-function readOptionalText(baseDir, relativePath) {
-  if (!relativePath) {
-    return void 0;
-  }
-  const absolutePath = path12.join(baseDir, relativePath);
-  if (!existsSync2(absolutePath)) {
-    return void 0;
-  }
-  return readFileSync(absolutePath, "utf8");
-}
-function readOptionalJson(baseDir, relativePath) {
-  const text = readOptionalText(baseDir, relativePath);
-  if (!text) {
-    return void 0;
-  }
-  try {
-    return JSON.parse(text);
-  } catch {
-    return void 0;
-  }
-}
-function hydrateInput(baseDir, record) {
-  const inputText = readOptionalText(baseDir, record.input_path);
-  if (!inputText) {
-    return void 0;
-  }
-  const messages = parseMarkdownMessages(inputText);
-  return messages.length > 0 ? messages : [{ role: "user", content: inputText.trimEnd() }];
-}
-function hydrateOutput(baseDir, record) {
-  const responseText = readOptionalText(baseDir, record.output_path ?? record.response_path);
-  if (!responseText) {
-    return void 0;
-  }
-  const messages = parseMarkdownMessages(responseText);
-  if (messages.length > 0) {
-    return messages.map((message) => ({
-      role: message.role,
-      content: message.content
-    }));
-  }
-  return [{ role: "assistant", content: responseText.trimEnd() }];
-}
-function hydrateManifestRecord(baseDir, record) {
-  const grading = readOptionalJson(baseDir, record.grading_path);
-  const timing = readOptionalJson(baseDir, record.timing_path);
-  const testId = record.test_id ?? "unknown";
-  return {
-    timestamp: record.timestamp,
-    testId,
-    suite: record.suite,
-    category: record.category,
-    target: record.target,
-    score: record.score,
-    executionStatus: record.execution_status,
-    error: record.error,
-    assertions: grading?.assertions.map((assertion) => ({
-      text: assertion.text,
-      passed: assertion.passed,
-      evidence: assertion.evidence
-    })),
-    scores: grading?.evaluators?.map((evaluator) => ({
-      name: evaluator.name,
-      type: evaluator.type,
-      score: evaluator.score,
-      assertions: Array.isArray(evaluator.assertions) ? evaluator.assertions.map((assertion) => ({
-        text: String(assertion.text ?? ""),
-        passed: Boolean(assertion.passed),
-        evidence: typeof assertion.evidence === "string" ? String(assertion.evidence) : void 0
-      })) : void 0,
-      weight: typeof evaluator.weight === "number" ? evaluator.weight : void 0,
-      verdict: typeof evaluator.verdict === "string" ? evaluator.verdict : void 0,
-      details: evaluator.details
-    })) ?? record.scores,
-    tokenUsage: timing?.token_usage ? {
-      input: timing.token_usage.input,
-      output: timing.token_usage.output,
-      reasoning: timing.token_usage.reasoning
-    } : record.token_usage,
-    durationMs: timing?.duration_ms ?? record.duration_ms,
-    costUsd: record.cost_usd,
-    input: hydrateInput(baseDir, record),
-    output: hydrateOutput(baseDir, record)
-  };
-}
-function parseResultManifest(content) {
-  return parseJsonlLines(content);
-}
-function resolveResultSourcePath(source, cwd) {
-  const resolved = path12.isAbsolute(source) ? source : path12.resolve(cwd ?? process.cwd(), source);
-  if (isDirectoryPath(resolved) || path12.basename(resolved) === RESULT_INDEX_FILENAME) {
-    return resolveRunManifestPath(resolved);
-  }
-  return resolved;
-}
-function loadManifestResults(sourceFile) {
-  const resolvedSourceFile = resolveRunManifestPath(sourceFile);
-  const content = readFileSync(resolvedSourceFile, "utf8");
-  const records = parseResultManifest(content);
-  const baseDir = path12.dirname(resolvedSourceFile);
-  return records.map((record) => hydrateManifestRecord(baseDir, record));
-}
-function loadLightweightResults(sourceFile) {
-  const resolvedSourceFile = resolveRunManifestPath(sourceFile);
-  const content = readFileSync(resolvedSourceFile, "utf8");
-  return parseResultManifest(content).map((record) => ({
-    testId: record.test_id ?? "unknown",
-    suite: record.suite,
-    target: record.target,
-    experiment: record.experiment,
-    score: record.score,
-    scores: record.scores,
-    executionStatus: record.execution_status,
-    error: record.error,
-    timestamp: record.timestamp
-  }));
-}
 // src/commands/eval/retry-errors.ts
 async function loadRetrySourceResults(jsonlPath) {
   return loadManifestResults(resolveResultSourcePath(jsonlPath));
 }
+function escapeGlob(id) {
+  return id.replace(/[*?[\]{}()!@#+|\\]/g, "\\$&");
+}
 async function loadErrorTestIds(jsonlPath) {
   const ids = (await loadRetrySourceResults(jsonlPath)).filter((result) => result.executionStatus === "execution_error").map((result) => result.testId);
   return [...new Set(ids)];
 }
+async function loadFullyCompletedTestIds(jsonlPath) {
+  const results = await loadRetrySourceResults(jsonlPath);
+  const allIds = /* @__PURE__ */ new Set();
+  const errorIds = /* @__PURE__ */ new Set();
+  for (const result of results) {
+    if (!result.testId) continue;
+    allIds.add(result.testId);
+    if (result.executionStatus === "execution_error") {
+      errorIds.add(result.testId);
+    }
+  }
+  return [...allIds].filter((id) => !errorIds.has(id));
+}
+function buildExclusionFilter(completedIds) {
+  const escaped = completedIds.map(escapeGlob);
+  return escaped.length === 1 ? `!${escaped[0]}` : `!{${escaped.join(",")}}`;
+}
 async function loadNonErrorResults(jsonlPath) {
   return (await loadRetrySourceResults(jsonlPath)).filter(
     (result) => result.testId && result.executionStatus !== "execution_error"
@@ -2082,7 +2746,7 @@ async function loadNonErrorResults(jsonlPath) {
 // src/commands/eval/run-cache.ts
 import { mkdir as mkdir7, readFile as readFile2, writeFile as writeFile6 } from "node:fs/promises";
-import path13 from "node:path";
+import path15 from "node:path";
 var CACHE_FILENAME = "cache.json";
 function resolveRunCacheFile(cache) {
   if (cache.lastRunDir) {
@@ -2091,7 +2755,7 @@ function resolveRunCacheFile(cache) {
   return "";
 }
 function cachePath(cwd) {
-  return path13.join(cwd, ".agentv", CACHE_FILENAME);
+  return path15.join(cwd, ".agentv", CACHE_FILENAME);
 }
 async function loadRunCache(cwd) {
   try {
@@ -2102,13 +2766,13 @@ async function loadRunCache(cwd) {
   }
 }
 async function saveRunCache(cwd, resultPath) {
-  if (path13.basename(resultPath) !== RESULT_INDEX_FILENAME) {
+  if (path15.basename(resultPath) !== RESULT_INDEX_FILENAME) {
     return;
   }
-  const dir = path13.join(cwd, ".agentv");
+  const dir = path15.join(cwd, ".agentv");
   await mkdir7(dir, { recursive: true });
   const cache = {
-    lastRunDir: path13.dirname(resultPath),
+    lastRunDir: path15.dirname(resultPath),
     timestamp: (/* @__PURE__ */ new Date()).toISOString()
   };
   await writeFile6(cachePath(cwd), `${JSON.stringify(cache, null, 2)}
@@ -2233,7 +2897,7 @@ function calculateEvaluationSummary(results, options) {
     byFailureReason
   };
 }
-function formatScore(value) {
+function formatScore2(value) {
   return value.toFixed(3);
 }
 function formatEvaluationSummary(summary, options) {
@@ -2261,13 +2925,13 @@ function formatEvaluationSummary(summary, options) {
   let verdictColor;
   let verdictText;
   if (allExecutionErrors) {
-    overallVerdict = "INCONCLUSIVE";
+    overallVerdict = "ERROR";
     verdictColor = "\x1B[33m";
-    verdictText = `RESULT: INCONCLUSIVE  (all ${summary.total} test(s) had execution errors \u2014 no evaluation was performed)`;
+    verdictText = `RESULT: ERROR  (all ${summary.total} test(s) had execution errors \u2014 no evaluation was performed)`;
   } else {
     overallVerdict = overallPassed ? "PASS" : "FAIL";
     verdictColor = overallPassed ? "\x1B[32m" : "\x1B[31m";
-    verdictText = `RESULT: ${overallVerdict}  (${summary.passedCount}/${gradedCount} scored >= ${threshold}, mean: ${formatScore(summary.mean)})`;
+    verdictText = `RESULT: ${overallVerdict}  (${summary.passedCount}/${summary.total} scored >= ${threshold}, mean: ${formatScore2(summary.mean)})`;
   }
   lines.push("\n==================================================");
   if (useColor) {
@@ -2290,16 +2954,16 @@ function formatEvaluationSummary(summary, options) {
   if (summary.executionErrorCount > 0) {
     const qualityCount = summary.total - summary.executionErrorCount;
     lines.push(
-      `Mean score: ${formatScore(summary.mean)} (${qualityCount} quality tests, ${summary.executionErrorCount} execution errors excluded)`
+      `Mean score: ${formatScore2(summary.mean)} (${qualityCount} quality tests, ${summary.executionErrorCount} execution errors excluded)`
     );
   } else {
-    lines.push(`Mean score: ${formatScore(summary.mean)}`);
+    lines.push(`Mean score: ${formatScore2(summary.mean)}`);
   }
-  lines.push(`Median score: ${formatScore(summary.median)}`);
-  lines.push(`Min score: ${formatScore(summary.min)}`);
-  lines.push(`Max score: ${formatScore(summary.max)}`);
+  lines.push(`Median score: ${formatScore2(summary.median)}`);
+  lines.push(`Min score: ${formatScore2(summary.min)}`);
+  lines.push(`Max score: ${formatScore2(summary.max)}`);
   if (typeof summary.standardDeviation === "number") {
-    lines.push(`Std deviation: ${formatScore(summary.standardDeviation)}`);
+    lines.push(`Std deviation: ${formatScore2(summary.standardDeviation)}`);
   }
   lines.push("\nScore distribution:");
   for (const bin of summary.histogram) {
@@ -2308,11 +2972,11 @@ function formatEvaluationSummary(summary, options) {
   }
   lines.push("\nTop performing tests:");
   summary.topResults.forEach((result, index) => {
-    lines.push(`  ${index + 1}. ${result.testId}: ${formatScore(result.score)}`);
+    lines.push(`  ${index + 1}. ${result.testId}: ${formatScore2(result.score)}`);
   });
   lines.push("\nLowest performing tests:");
   summary.bottomResults.forEach((result, index) => {
-    lines.push(`  ${index + 1}. ${result.testId}: ${formatScore(result.score)}`);
+    lines.push(`  ${index + 1}. ${result.testId}: ${formatScore2(result.score)}`);
   });
   const failureStageEntries = Object.entries(summary.byFailureStage);
   if (failureStageEntries.length > 0) {
@@ -2361,7 +3025,7 @@ function formatMatrixSummary(results) {
   for (const testId of testIds) {
     const cells = targets.map((target) => {
       const score = scoreMap.get(testId)?.get(target);
-      return score !== void 0 ? formatScore(score).padEnd(targetColWidth) : "-".padEnd(targetColWidth);
+      return score !== void 0 ? formatScore2(score).padEnd(targetColWidth) : "-".padEnd(targetColWidth);
     });
     lines.push(`${testId.padEnd(testIdColWidth)}  ${cells.join("  ")}`);
   }
@@ -2369,7 +3033,7 @@ function formatMatrixSummary(results) {
   const avgCells = targets.map((target) => {
     const scores = results.filter((r) => r.target === target).map((r) => r.score);
     const avg = scores.length > 0 ? scores.reduce((a, b) => a + b, 0) / scores.length : 0;
-    return formatScore(avg).padEnd(targetColWidth);
+    return formatScore2(avg).padEnd(targetColWidth);
   });
   lines.push(`${"Average".padEnd(testIdColWidth)}  ${avgCells.join("  ")}`);
   return lines.join("\n");
@@ -2377,7 +3041,7 @@ function formatMatrixSummary(results) {
 // ../../packages/core/dist/evaluation/validation/index.js
 import { readFile as readFile3 } from "node:fs/promises";
-import path14 from "node:path";
+import path16 from "node:path";
 import { parse } from "yaml";
 import { readFile as readFile22 } from "node:fs/promises";
 import path22 from "node:path";
@@ -2420,8 +3084,8 @@ async function detectFileType(filePath) {
   }
 }
 function inferFileTypeFromPath(filePath) {
-  const normalized = path14.normalize(filePath).replace(/\\/g, "/");
-  const basename = path14.basename(filePath);
+  const normalized = path16.normalize(filePath).replace(/\\/g, "/");
+  const basename = path16.basename(filePath);
   if (normalized.includes("/.agentv/")) {
     if (basename === "config.yaml" || basename === "config.yml") {
       return "config";
@@ -2747,12 +3411,21 @@ function validateWorkspaceRepoConfig(workspace, filePath, errors) {
   const hooks = workspace.hooks;
   const afterEachHook = isObject(hooks) ? hooks.after_each : void 0;
   const isolation = workspace.isolation;
+  const docker = workspace.docker;
   if (Array.isArray(repos)) {
     for (const repo of repos) {
       if (!isObject(repo)) continue;
       const source = repo.source;
       const checkout = repo.checkout;
       const clone = repo.clone;
+      if (!isObject(source) && !isObject(docker)) {
+        errors.push({
+          severity: "error",
+          filePath,
+          location: `workspace.repos[path=${repo.path ?? "(none)"}]`,
+          message: "repos[].source is required for non-Docker workspaces. Source-less repos are only valid when workspace.docker is configured (repo exists inside the container)."
+        });
+      }
       if (isObject(source) && isObject(checkout)) {
         const sourceType = source.type;
         const resolve = checkout.resolve;
@@ -2760,8 +3433,8 @@ function validateWorkspaceRepoConfig(workspace, filePath, errors) {
           errors.push({
             severity: "warning",
             filePath,
-            location: `workspace.repos[path=${repo.path}]`,
-            message: "checkout.resolve has no effect for a local source. Use source.type to choose where the repo comes from; keep checkout.ref or checkout.ancestor only when pinning a local source."
+            location: `workspace.repos[path=${repo.path ?? "(none)"}]`,
+            message: "checkout.resolve has no effect for a local source. Use source.type to choose where the repo comes from; keep checkout.ref, checkout.base_commit, or checkout.ancestor only when pinning a local source."
           });
         }
       }
@@ -2772,7 +3445,7 @@ function validateWorkspaceRepoConfig(workspace, filePath, errors) {
           errors.push({
             severity: "warning",
             filePath,
-            location: `workspace.repos[path=${repo.path}]`,
+            location: `workspace.repos[path=${repo.path ?? "(none)"}]`,
             message: `clone.depth (${depth}) may be insufficient for checkout.ancestor (${ancestor}). Recommend depth >= ${ancestor + 1}.`
           });
         }
@@ -3522,11 +4195,69 @@ async function validateConfigFile(filePath) {
         });
       }
     }
+    const results = config.results;
+    if (results !== void 0) {
+      if (typeof results !== "object" || results === null || Array.isArray(results)) {
+        errors.push({
+          severity: "error",
+          filePath,
+          location: "results",
+          message: "Field 'results' must be an object"
+        });
+      } else {
+        const exportConfig = results.export;
+        if (exportConfig !== void 0) {
+          if (typeof exportConfig !== "object" || exportConfig === null || Array.isArray(exportConfig)) {
+            errors.push({
+              severity: "error",
+              filePath,
+              location: "results.export",
+              message: "Field 'results.export' must be an object"
+            });
+          } else {
+            const exportRecord = exportConfig;
+            if (typeof exportRecord.repo !== "string" || exportRecord.repo.trim().length === 0) {
+              errors.push({
+                severity: "error",
+                filePath,
+                location: "results.export.repo",
+                message: "Field 'results.export.repo' must be a non-empty string"
+              });
+            }
+            if (typeof exportRecord.path !== "string" || exportRecord.path.trim().length === 0) {
+              errors.push({
+                severity: "error",
+                filePath,
+                location: "results.export.path",
+                message: "Field 'results.export.path' must be a non-empty string"
+              });
+            }
+            if (exportRecord.auto_push !== void 0 && typeof exportRecord.auto_push !== "boolean") {
+              errors.push({
+                severity: "error",
+                filePath,
+                location: "results.export.auto_push",
+                message: "Field 'results.export.auto_push' must be a boolean"
+              });
+            }
+            if (exportRecord.branch_prefix !== void 0 && (typeof exportRecord.branch_prefix !== "string" || exportRecord.branch_prefix.trim().length === 0)) {
+              errors.push({
+                severity: "error",
+                filePath,
+                location: "results.export.branch_prefix",
+                message: "Field 'results.export.branch_prefix' must be a non-empty string"
+              });
+            }
+          }
+        }
+      }
+    }
     const allowedFields = /* @__PURE__ */ new Set([
       "$schema",
       "eval_patterns",
       "required_version",
       "execution",
+      "results",
       "studio"
     ]);
     const unexpectedFields = Object.keys(config).filter((key) => !allowedFields.has(key));
@@ -4086,7 +4817,8 @@ function normalizeOptions(rawOptions, config, yamlExecution) {
     threshold: normalizeOptionalNumber(rawOptions.threshold),
     tags: normalizeStringArray(rawOptions.tag),
     excludeTags: normalizeStringArray(rawOptions.excludeTag),
-    transcript: normalizeString(rawOptions.transcript)
+    transcript: normalizeString(rawOptions.transcript),
+    experiment: normalizeString(rawOptions.experiment)
   };
 }
 async function ensureFileExists(filePath, description) {
@@ -4096,10 +4828,10 @@ async function ensureFileExists(filePath, description) {
     throw new Error(`${description} not found: ${filePath}`);
   }
 }
-function buildDefaultOutputPath(cwd) {
-  const runDir = buildDefaultRunDir(cwd);
+function buildDefaultOutputPathForExperiment(cwd, experiment) {
+  const runDir = buildDefaultRunDir(cwd, experiment);
   mkdirSync(runDir, { recursive: true });
-  return path15.join(runDir, "index.jsonl");
+  return path17.join(runDir, "index.jsonl");
 }
 function createProgressReporter(maxWorkers, options) {
   const display = new ProgressDisplay(maxWorkers, options);
@@ -4113,7 +4845,7 @@ function createProgressReporter(maxWorkers, options) {
   };
 }
 function makeTestCaseKey(testFilePath, testId) {
-  return `${path15.resolve(testFilePath)}::${testId}`;
+  return `${path17.resolve(testFilePath)}::${testId}`;
 }
 function createDisplayIdTracker() {
   const map = /* @__PURE__ */ new Map();
@@ -4169,7 +4901,7 @@ async function prepareFileMetadata(params) {
     repoRoot,
     verbose: options.verbose
   });
-  const relativePath = path15.relative(cwd, testFilePath);
+  const relativePath = path17.relative(cwd, testFilePath);
   const category = deriveCategory(relativePath);
   const suite = await loadTestSuite(testFilePath, repoRoot, {
     verbose: options.verbose,
@@ -4194,7 +4926,7 @@ async function prepareFileMetadata(params) {
     selections = [
       {
         selection: transcriptSelection,
-        inlineTargetLabel: `transcript (${path15.basename(options.transcript)})`
+        inlineTargetLabel: `transcript (${path17.basename(options.transcript)})`
       }
     ];
   } else {
@@ -4430,32 +5162,36 @@ async function runEvalCommand(input) {
     );
   }
   const repoRoot = await findRepoRoot(cwd);
-  const yamlConfig = await loadConfig(path15.join(cwd, "_"), repoRoot);
+  const yamlConfig = await loadConfig(path17.join(cwd, "_"), repoRoot);
   if (yamlConfig?.required_version) {
     await enforceRequiredVersion(yamlConfig.required_version, {
       strict: normalizeBoolean(input.rawOptions.strict)
     });
   }
   let options = normalizeOptions(input.rawOptions, config, yamlConfig?.execution);
+  if (!process.env.AGENTV_EXPERIMENT) {
+    process.env.AGENTV_EXPERIMENT = normalizeExperimentName(options.experiment);
+  }
   if (options.graderTarget === "agentv" && !options.model) {
     throw new Error("--grader-target agentv requires --model (e.g., --model openai:gpt-5-mini)");
   }
   let retryNonErrorResults;
   if (options.retryErrors) {
-    const retryPath = path15.resolve(options.retryErrors);
+    const retryPath = path17.resolve(options.retryErrors);
     await ensureFileExists(retryPath, "Retry-errors JSONL file");
+    const completedIds = await loadFullyCompletedTestIds(retryPath);
     const errorIds = await loadErrorTestIds(retryPath);
-    if (errorIds.length === 0) {
-      console.log("No execution errors found in the previous output. Nothing to retry.");
-      return;
-    }
-    console.log(`Retrying ${errorIds.length} execution-error test(s): ${errorIds.join(", ")}`);
-    const filterPattern = errorIds.length === 1 ? errorIds[0] : `{${errorIds.join(",")}}`;
-    options = { ...options, filter: filterPattern };
     retryNonErrorResults = await loadNonErrorResults(retryPath);
+    if (errorIds.length > 0) {
+      console.log(`Found ${errorIds.length} execution-error test(s): ${errorIds.join(", ")}`);
+    }
+    if (completedIds.length > 0) {
+      options = { ...options, filter: buildExclusionFilter(completedIds) };
+      console.log(`Skipping ${completedIds.length} already-completed test(s).`);
+    }
   }
   if (options.workspacePath) {
-    const resolvedWorkspace = path15.resolve(options.workspacePath);
+    const resolvedWorkspace = path17.resolve(options.workspacePath);
     try {
       const { stat: stat2 } = await import("node:fs/promises");
       const stats = await stat2(resolvedWorkspace);
@@ -4496,25 +5232,25 @@ async function runEvalCommand(input) {
   let outputPath;
   let usesDefaultArtifactWorkspace;
   if (explicitDir) {
-    runDir = path15.resolve(explicitDir);
+    runDir = path17.resolve(explicitDir);
     mkdirSync(runDir, { recursive: true });
-    outputPath = path15.join(runDir, "index.jsonl");
+    outputPath = path17.join(runDir, "index.jsonl");
     usesDefaultArtifactWorkspace = true;
   } else if (options.outPath) {
-    outputPath = path15.resolve(options.outPath);
-    runDir = path15.dirname(outputPath);
+    outputPath = path17.resolve(options.outPath);
+    runDir = path17.dirname(outputPath);
     mkdirSync(runDir, { recursive: true });
     usesDefaultArtifactWorkspace = false;
   } else {
-    outputPath = buildDefaultOutputPath(cwd);
-    runDir = path15.dirname(outputPath);
+    outputPath = buildDefaultOutputPathForExperiment(cwd, options.experiment);
+    runDir = path17.dirname(outputPath);
     usesDefaultArtifactWorkspace = true;
   }
   let otelExporter = null;
   const useFileExport = !!options.otelFile;
   if (options.exportOtel || useFileExport) {
     try {
-      const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-XDNB4WDT.js");
+      const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-HNSXNRVK.js");
       let endpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT;
       let headers = {};
       if (options.otelBackend) {
@@ -4538,7 +5274,7 @@ async function runEvalCommand(input) {
         headers,
         captureContent,
         groupTurns: options.otelGroupTurns,
-        otlpFilePath: options.otelFile ? path15.resolve(options.otelFile) : void 0
+        otlpFilePath: options.otelFile ? path17.resolve(options.otelFile) : void 0
       });
       const initialized = await otelExporter.init();
       if (!initialized) {
@@ -4555,7 +5291,7 @@ async function runEvalCommand(input) {
     }
   }
   const primaryWritePath = outputPath;
-  const resolvedExportPaths = options.exportPaths.map((p) => path15.resolve(p));
+  const resolvedExportPaths = options.exportPaths.map((p) => path17.resolve(p));
   console.log(`Artifact directory: ${runDir}`);
   if (resolvedExportPaths.length > 0) {
     console.log("Export files:");
@@ -4563,12 +5299,13 @@ async function runEvalCommand(input) {
       console.log(`  ${p}`);
     }
   }
-  const resolvedTestFiles = input.testFiles.map((file) => path15.resolve(file));
+  const resolvedTestFiles = input.testFiles.map((file) => path17.resolve(file));
   if (options.otelFile) {
-    console.log(`OTLP JSON file: ${path15.resolve(options.otelFile)}`);
+    console.log(`OTLP JSON file: ${path17.resolve(options.otelFile)}`);
   }
   const evaluationRunner = await resolveEvaluationRunner();
   const allResults = [];
+  const remoteEvalSummaries = [];
   const seenTestCases = /* @__PURE__ */ new Set();
   const displayIdTracker = createDisplayIdTracker();
   const totalWorkers = options.workers ?? DEFAULT_WORKERS;
@@ -4609,7 +5346,7 @@ async function runEvalCommand(input) {
     for (const [testFilePath, meta] of fileMetadata.entries()) {
       if (!matchesTagFilters(meta.tags, options.tags, options.excludeTags)) {
         fileMetadata.delete(testFilePath);
-        skippedFiles.push(path15.relative(cwd, testFilePath));
+        skippedFiles.push(path17.relative(cwd, testFilePath));
       }
     }
     if (skippedFiles.length > 0 && options.verbose) {
@@ -4630,7 +5367,7 @@ async function runEvalCommand(input) {
     cliNoCache: options.noCache,
     yamlCache: yamlCacheEnabled
   });
-  const cache = cacheEnabled ? new ResponseCache(yamlCachePath ? path15.resolve(yamlCachePath) : void 0) : void 0;
+  const cache = cacheEnabled ? new ResponseCache(yamlCachePath ? path17.resolve(yamlCachePath) : void 0) : void 0;
   if (cacheEnabled) {
     console.log(`Response cache: enabled${yamlCachePath ? ` (${yamlCachePath})` : ""}`);
   }
@@ -4651,6 +5388,10 @@ async function runEvalCommand(input) {
     }
   }
   if (totalEvalCount === 0) {
+    if (options.retryErrors && retryNonErrorResults && retryNonErrorResults.length > 0) {
+      console.log("No execution errors or missing cases in the previous run. Nothing to retry.");
+      return;
+    }
     throw new Error("No tests matched the provided filters.");
   }
   const progressReporter = createProgressReporter(totalWorkers, { verbose: options.verbose });
@@ -4708,7 +5449,7 @@ async function runEvalCommand(input) {
   const activeTestFiles = resolvedTestFiles.filter((f) => fileMetadata.has(f));
   let transcriptProviderFactory;
   if (options.transcript) {
-    const { TranscriptProvider } = await import("./dist-XDNB4WDT.js");
+    const { TranscriptProvider } = await import("./dist-HNSXNRVK.js");
     const transcriptProvider = await TranscriptProvider.fromFile(options.transcript);
     const totalTests = [...fileMetadata.values()].reduce(
       (sum, meta) => sum + meta.testCases.length,
@@ -4767,11 +5508,23 @@ async function runEvalCommand(input) {
               threshold: resolvedThreshold,
               providerFactory: transcriptProviderFactory
             });
+            const evalFile = path17.relative(cwd, testFilePath);
+            const existingSummary = remoteEvalSummaries.find(
+              (summary2) => summary2.evalFile === evalFile
+            );
+            if (existingSummary) {
+              existingSummary.results.push(...result.results);
+            } else {
+              remoteEvalSummaries.push({
+                evalFile,
+                results: [...result.results]
+              });
+            }
             return result.results;
           } catch (fileError) {
             const message = fileError instanceof Error ? fileError.message : String(fileError);
             console.error(`
-\u26A0 Eval file failed: ${path15.basename(testFilePath)} \u2014 ${message}
+\u26A0 Eval file failed: ${path17.basename(testFilePath)} \u2014 ${message}
 `);
             const errorResults = applicableTestCases.map((testCase) => ({
               timestamp: (/* @__PURE__ */ new Date()).toISOString(),
@@ -4818,7 +5571,7 @@ async function runEvalCommand(input) {
       console.log(formatMatrixSummary(allResults));
     }
     if (options.benchmarkJson && allResults.length > 0) {
-      const benchmarkPath = path15.resolve(options.benchmarkJson);
+      const benchmarkPath = path17.resolve(options.benchmarkJson);
       await writeBenchmarkJson(benchmarkPath, allResults);
       console.log(`Benchmark written to: ${benchmarkPath}`);
     }
@@ -4830,7 +5583,8 @@ async function runEvalCommand(input) {
         benchmarkPath: workspaceBenchmarkPath,
         indexPath
       } = await writeArtifactsFromResults(allResults, runDir, {
-        evalFile
+        evalFile,
+        experiment: normalizeExperimentName(options.experiment)
       });
       console.log(`Artifact workspace written to: ${runDir}`);
       console.log(`  Index: ${indexPath}`);
@@ -4849,7 +5603,7 @@ async function runEvalCommand(input) {
         await writer.close();
       }
       console.log(
-        `Export file(s) written: ${resolvedExportPaths.map((p) => path15.relative(cwd, p)).join(", ")}`
+        `Export file(s) written: ${resolvedExportPaths.map((p) => path17.relative(cwd, p)).join(", ")}`
       );
     }
     const failedWithWorkspaces = allResults.filter(
@@ -4865,11 +5619,29 @@ async function runEvalCommand(input) {
       console.log(`
 Results written to: ${outputPath}`);
       await saveRunCache(cwd, outputPath).catch(() => void 0);
+      await maybeAutoExportRunArtifacts({
+        cwd,
+        run_dir: runDir,
+        test_files: activeTestFiles,
+        results: allResults,
+        eval_summaries: remoteEvalSummaries.map((summary2) => ({
+          eval_file: summary2.evalFile,
+          total: summary2.results.length,
+          passed: summary2.results.filter((result) => result.score >= DEFAULT_THRESHOLD).length,
+          avg_score: summary2.results.length > 0 ? summary2.results.reduce((sum, result) => sum + result.score, 0) / summary2.results.length : 0,
+          results: summary2.results.map((result) => ({
+            test_id: result.testId,
+            score: result.score,
+            status: result.executionStatus === "execution_error" || result.error ? "ERROR" : result.score >= DEFAULT_THRESHOLD ? "PASS" : "FAIL"
+          }))
+        })),
+        experiment: normalizeExperimentName(options.experiment)
+      });
     }
     if (summary.executionErrorCount > 0 && !options.retryErrors) {
-      const evalFileArgs = activeTestFiles.map((f) => path15.relative(cwd, f)).join(" ");
+      const evalFileArgs = activeTestFiles.map((f) => path17.relative(cwd, f)).join(" ");
       const targetFlag = options.target ? ` --target ${options.target}` : "";
-      const relativeOutputPath = path15.relative(cwd, outputPath);
+      const relativeOutputPath = path17.relative(cwd, outputPath);
       console.log(
         `
 Tip: ${summary.executionErrorCount} execution error(s) detected. Re-run failed tests with:
@@ -4903,7 +5675,7 @@ async function resolveEvaluationRunner() {
   if (!overridePath) {
     return runEvaluation;
   }
-  const resolved = path15.isAbsolute(overridePath) ? overridePath : path15.resolve(process.cwd(), overridePath);
+  const resolved = path17.isAbsolute(overridePath) ? overridePath : path17.resolve(process.cwd(), overridePath);
   const moduleUrl = pathToFileURL(resolved).href;
   const mod = await import(moduleUrl);
   const candidate = mod.runEvaluation;
@@ -4916,11 +5688,11 @@ async function resolveEvaluationRunner() {
 }
 // src/commands/eval/discover.ts
-import path16 from "node:path";
+import path18 from "node:path";
 import fg2 from "fast-glob";
 async function discoverEvalFiles(cwd) {
   const repoRoot = await findRepoRoot(cwd);
-  const config = await loadConfig(path16.join(cwd, "_"), repoRoot);
+  const config = await loadConfig(path18.join(cwd, "_"), repoRoot);
   const patterns = config?.eval_patterns && config.eval_patterns.length > 0 ? config.eval_patterns : DEFAULT_EVAL_PATTERNS;
   const ignore = ["**/node_modules/**", "**/dist/**"];
   const matches = await fg2(patterns, {
@@ -4932,7 +5704,7 @@ async function discoverEvalFiles(cwd) {
     caseSensitiveMatch: false
   });
   const evalFiles = matches.map((absPath) => {
-    const relativePath = path16.relative(cwd, absPath);
+    const relativePath = path18.relative(cwd, absPath);
     const category = deriveCategory(relativePath);
     return { path: absPath, relativePath, category };
   });
@@ -4956,21 +5728,36 @@ export {
   package_default,
   toSnakeCaseDeep,
   RESULT_INDEX_FILENAME,
-  RESULT_RUNS_DIRNAME,
   buildDefaultRunDir,
-  resolveExistingRunPrimaryPath,
-  resolveWorkspaceOrFilePath,
   resolveRunManifestPath,
   parseResultManifest,
   resolveResultSourcePath,
   loadManifestResults,
   loadLightweightResults,
   HtmlWriter,
+  resolveEvalPaths,
+  findRepoRoot,
+  c,
+  padRight,
+  padLeft,
+  loadResultFile,
+  getTraceSummary,
+  getTraceSpans,
+  toTraceSummary,
+  listResultFiles,
+  formatNumber,
+  formatDuration,
+  formatCost,
+  formatSize,
+  formatScore,
+  getRemoteResultsStatus,
+  syncRemoteResults,
+  listMergedResultFiles,
+  findRunById,
+  maybeAutoExportRunArtifacts,
   writeArtifactsFromResults,
   resolveRunCacheFile,
   loadRunCache,
-  resolveEvalPaths,
-  findRepoRoot,
   detectFileType,
   validateEvalFile,
   validateTargetsFile,
@@ -4984,4 +5771,4 @@ export {
   getCategories,
   filterByCategory
 };
-//# sourceMappingURL=chunk-KF6BABQ5.js.map
+//# sourceMappingURL=chunk-FH24D7XW.js.map