npm - @sean.holung/minicode - Versions diffs - 0.4.1 → 0.4.2 - Mend

@sean.holung/minicode 0.4.1 → 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27) hide show

package/dist/src/agent/config.js CHANGED Viewed

@@ -33,6 +33,7 @@ export function formatConfigForDisplay(config) {
         "compactionThreshold: " + (config.compactionThreshold ?? "(disabled)"),
         "compactionModel: " + (config.compactionModel ?? "(disabled — using mechanical compaction)"),
         "reasoningEffort: " + (config.reasoningEffort ?? "(unset — no reasoning parameters sent)"),
+        "reasoningMaxTokens: " + (config.reasoningMaxTokens !== undefined ? String(config.reasoningMaxTokens) : "(unset — uncapped)"),
         "enableDynamicPrompt: " + (config.enableDynamicPrompt ?? false),
     ];
     return lines.join("\n");
@@ -255,7 +256,7 @@ export async function loadAgentConfig(cwd = process.cwd(), options = {}) {
         modelProvider: parseModelProvider(env.MODEL_PROVIDER ?? "openai-compatible"),
         model: env.MODEL ?? "",
         maxSteps: parseNumber(env.MAX_STEPS, 50),
-        maxTokens: parseNumber(env.MAX_TOKENS, 4096),
+        maxTokens: parseNumber(env.MAX_TOKENS, 16000),
         modelTimeoutSeconds: parseNumber(env.MODEL_TIMEOUT_SECONDS, 60),
         maxContextTokens: parseNumber(env.MAX_CONTEXT_TOKENS, 32_000),
         workspaceRoot,
@@ -280,5 +281,14 @@ export async function loadAgentConfig(cwd = process.cwd(), options = {}) {
             const effort = parseReasoningEffort(env.REASONING_EFFORT);
             return effort ? { reasoningEffort: effort } : {};
         })(),
+        ...(() => {
+            const raw = env.REASONING_MAX_TOKENS;
+            if (raw === undefined || raw === "")
+                return {};
+            const value = Number(raw);
+            if (!Number.isFinite(value) || value <= 0)
+                return {};
+            return { reasoningMaxTokens: Math.floor(value) };
+        })(),
     };
 }

package/dist/src/benchmark/config.js CHANGED Viewed

@@ -163,7 +163,7 @@ export async function buildBenchmarkAgentConfig(options = {}) {
         modelProvider: provider,
         model,
         maxSteps: parseNumber(resolvedEnv.values.MAX_STEPS ?? fileConfig.maxSteps, 50),
-        maxTokens: parseNumber(resolvedEnv.values.MAX_TOKENS ?? fileConfig.maxTokens, 4096),
+        maxTokens: parseNumber(resolvedEnv.values.MAX_TOKENS ?? fileConfig.maxTokens, 16000),
         modelTimeoutSeconds: parseNumber(resolvedEnv.values.MODEL_TIMEOUT_SECONDS ?? fileConfig.modelTimeoutSeconds, 60),
         maxContextTokens: parseNumber(resolvedEnv.values.MAX_CONTEXT_TOKENS ?? fileConfig.maxContextTokens, 32_000),
         workspaceRoot,
@@ -192,6 +192,20 @@ export async function buildBenchmarkAgentConfig(options = {}) {
             const effort = parseReasoningEffort(resolvedEnv.values.REASONING_EFFORT ?? fileConfig.reasoningEffort);
             return effort ? { reasoningEffort: effort } : {};
         })(),
+        ...(() => {
+            // Opt-in hard cap on reasoning tokens per turn. Useful for models
+            // (notably Gemini 2.5 Pro) that can otherwise burn the full output
+            // budget on dynamic thinking without producing a visible response.
+            // Unset by default — uncapped reasoning is the right behavior for
+            // most models.
+            const raw = resolvedEnv.values.REASONING_MAX_TOKENS ?? fileConfig.reasoningMaxTokens;
+            if (raw === undefined || raw === null || raw === "")
+                return {};
+            const value = typeof raw === "number" ? raw : Number(raw);
+            if (!Number.isFinite(value) || value <= 0)
+                return {};
+            return { reasoningMaxTokens: Math.floor(value) };
+        })(),
         enableDynamicPrompt: parseBoolean(resolvedEnv.values.ENABLE_DYNAMIC_PROMPT ?? fileConfig.enableDynamicPrompt, false),
     };
 }

package/dist/src/benchmark/index.js CHANGED Viewed

@@ -2,5 +2,5 @@ export { loadBenchmarkTask, loadBenchmarkTasks } from "./task-loader.js";
 export { evaluate } from "./evaluator.js";
 export { runBenchmarkTask, runBenchmarkSuite, } from "./runner.js";
 export { buildBenchmarkAgentConfig, getDefaultBenchmarkConfigPath, resolveBenchmarkEnv, } from "./config.js";
-export { collectWorkspaceChanges, writeWorkspaceDiff, } from "./workspace-changes.js";
+export { collectWorkspaceChanges, getWorkspaceDiff, writeWorkspaceDiff, } from "./workspace-changes.js";
 export { buildReport, buildReportFromEvaluations, formatReport, compareReports, } from "./reporter.js";

package/dist/src/benchmark/workspace-changes.js CHANGED Viewed

@@ -59,6 +59,47 @@ async function getWorkspaceGitPrefix(workspaceRoot) {
     const prefix = await runGit(workspaceRoot, ["rev-parse", "--show-prefix"], true);
     return prefix.trim();
 }
+/**
+ * Snapshot the current HEAD so we can diff against it at the end of a run
+ * even if the model committed in between. Returns null when the workspace
+ * is not a git repo or has no commits yet.
+ */
+export async function captureBaselineRef(workspaceRoot) {
+    if (!(await isGitRepository(workspaceRoot))) {
+        return null;
+    }
+    const sha = (await runGit(workspaceRoot, ["rev-parse", "HEAD"], true)).trim();
+    return sha.length > 0 ? sha : null;
+}
+function parseNameStatusLine(line) {
+    if (line.length === 0) {
+        return undefined;
+    }
+    // git diff --name-status output is tab-separated: STATUS\tPATH or
+    // STATUS\tOLD\tNEW (for renames/copies, status looks like R100 / C75).
+    const parts = line.split("\t");
+    const rawStatus = parts[0];
+    if (!rawStatus) {
+        return undefined;
+    }
+    const code = rawStatus.charAt(0);
+    if (code === "R" || code === "C") {
+        const previousPath = parts[1];
+        const nextPath = parts[2];
+        if (!previousPath || !nextPath) {
+            return undefined;
+        }
+        return { status: `${code} `, path: nextPath, previousPath };
+    }
+    const filePath = parts[1];
+    if (!filePath) {
+        return undefined;
+    }
+    // Map to the two-char porcelain-ish status the downstream code expects.
+    // We don't try to be exact — the only meaningful check downstream is the
+    // "??" untracked case, which is handled separately via git status.
+    return { status: `${code} `, path: filePath };
+}
 function stripWorkspacePrefix(filePath, workspacePrefix) {
     if (!workspacePrefix) {
         return filePath;
@@ -70,7 +111,7 @@ function stripWorkspacePrefix(filePath, workspacePrefix) {
         ? filePath.slice(normalizedPrefix.length)
         : filePath;
 }
-export async function collectWorkspaceChanges(workspaceRoot) {
+export async function collectWorkspaceChanges(workspaceRoot, baselineRef) {
     const isGitRepo = await isGitRepository(workspaceRoot);
     if (!isGitRepo) {
         return {
@@ -80,11 +121,7 @@ export async function collectWorkspaceChanges(workspaceRoot) {
         };
     }
     const workspacePrefix = await getWorkspaceGitPrefix(workspaceRoot);
-    const statusOutput = await runGit(workspaceRoot, ["status", "--porcelain=v1", "--untracked-files=all", "--", "."], true);
-    const entries = statusOutput
-        .split(/\r?\n/)
-        .map((line) => parseStatusLine(line))
-        .map((entry) => entry
+    const remap = (entry) => entry
         ? {
             ...entry,
             path: stripWorkspacePrefix(entry.path, workspacePrefix),
@@ -92,8 +129,39 @@ export async function collectWorkspaceChanges(workspaceRoot) {
                 ? { previousPath: stripWorkspacePrefix(entry.previousPath, workspacePrefix) }
                 : {}),
         }
-        : undefined)
+        : undefined;
+    // Always pull untracked entries from `git status` — they're never part of
+    // a baseline diff because they aren't tracked yet.
+    const statusOutput = await runGit(workspaceRoot, ["status", "--porcelain=v1", "--untracked-files=all", "--", "."], true);
+    const statusEntries = statusOutput
+        .split(/\r?\n/)
+        .map((line) => parseStatusLine(line))
+        .map(remap)
         .filter((entry) => entry !== undefined);
+    let entries;
+    if (baselineRef) {
+        // Tracked changes: anything that differs between the baseline commit and
+        // the current working tree. Captures committed, staged, AND unstaged
+        // edits in one shot.
+        const nameStatusOutput = await runGit(workspaceRoot, ["diff", "--name-status", baselineRef, "--", "."], true);
+        const trackedEntries = nameStatusOutput
+            .split(/\r?\n/)
+            .map((line) => parseNameStatusLine(line))
+            .map(remap)
+            .filter((entry) => entry !== undefined);
+        const untrackedEntries = statusEntries.filter((entry) => entry.status === "??");
+        const seen = new Set();
+        entries = [];
+        for (const entry of [...trackedEntries, ...untrackedEntries]) {
+            if (seen.has(entry.path))
+                continue;
+            seen.add(entry.path);
+            entries.push(entry);
+        }
+    }
+    else {
+        entries = statusEntries;
+    }
     const changedFiles = [...new Set(entries.map((entry) => entry.path))];
     return {
         isGitRepo: true,
@@ -101,12 +169,19 @@ export async function collectWorkspaceChanges(workspaceRoot) {
         changedFiles,
     };
 }
-export async function writeWorkspaceDiff(workspaceRoot, outPath) {
-    const changes = await collectWorkspaceChanges(workspaceRoot);
+export async function getWorkspaceDiff(workspaceRoot, baselineRef) {
+    const changes = await collectWorkspaceChanges(workspaceRoot, baselineRef);
     if (!changes.isGitRepo) {
-        return false;
+        return null;
     }
-    const trackedDiff = await runGit(workspaceRoot, ["diff", "--binary", "--no-ext-diff", "--relative", "--", "."], true);
+    // With a baseline ref we diff working-tree vs baseline directly, which
+    // captures committed + staged + unstaged in one pass. Without one we
+    // fall back to the working-tree-vs-index behavior — useful when the
+    // caller hasn't snapshotted a starting point.
+    const trackedDiffArgs = baselineRef
+        ? ["diff", "--binary", "--no-ext-diff", "--relative", baselineRef, "--", "."]
+        : ["diff", "--binary", "--no-ext-diff", "--relative", "--", "."];
+    const trackedDiff = await runGit(workspaceRoot, trackedDiffArgs, true);
     const untrackedDiffs = [];
     for (const entry of changes.entries) {
         if (entry.status === "??") {
@@ -116,9 +191,15 @@ export async function writeWorkspaceDiff(workspaceRoot, outPath) {
             }
         }
     }
-    const combinedDiff = [trackedDiff, ...untrackedDiffs]
+    return [trackedDiff, ...untrackedDiffs]
         .filter((section) => section.trim().length > 0)
         .join("\n");
+}
+export async function writeWorkspaceDiff(workspaceRoot, outPath, baselineRef) {
+    const combinedDiff = await getWorkspaceDiff(workspaceRoot, baselineRef);
+    if (combinedDiff === null) {
+        return false;
+    }
     await mkdir(path.dirname(outPath), { recursive: true });
     await writeFile(outPath, combinedDiff, "utf8");
     return true;

package/dist/src/cli/benchmark-run.js CHANGED Viewed

@@ -4,24 +4,87 @@ import process from "node:process";
 import { CodingAgent, createModelClient, } from "@sean.holung/minicode-sdk";
 import { getConfigSetupMessage } from "../agent/config.js";
 import { buildBenchmarkAgentConfig, resolveBenchmarkEnv, } from "../benchmark/config.js";
-import { collectWorkspaceChanges, writeWorkspaceDiff, } from "../benchmark/workspace-changes.js";
+import { captureBaselineRef, collectWorkspaceChanges, getWorkspaceDiff, writeWorkspaceDiff, } from "../benchmark/workspace-changes.js";
 import { buildProjectIndex } from "../indexer/project-index.js";
 import { createToolRegistry } from "../tools/registry.js";
 import { CliUsageError } from "./args.js";
+import { buildContextBenchTrajectory } from "./contextbench-trajectory.js";
 const BENCHMARK_SYSTEM_PROMPT_SUFFIX = [
-    "[Benchmark Execution Mode]",
-    "- This task is running in a non-interactive benchmark harness.",
-    "- The task is already approved. Do not ask for confirmation, permission, or whether you should proceed.",
+    "[Execution Mode]",
+    "- This task is running in a non-interactive harness. The task is already approved.",
+    "- Do not ask for confirmation, permission, or whether you should proceed.",
     "- If the task requires code changes, make them immediately using the available tools.",
     "- Do not stop after presenting a plan. Either complete the task or explain a concrete blocker.",
-    "- When validation is part of the task, run the required command once after making changes.",
+    "",
+    "[Long-form Task Discipline]",
+    "- Non-trivial coding tasks routinely require 30+ tool-call iterate-test-fix cycles. Persistent iteration against the test suite is expected; do not bail early because you have read a few files.",
+    "- Iterate against the canonical test runner (the test command shipped with the task) until it passes. Treat the runner's output as the source of truth — not your own assessment of whether the code looks correct, and not ad-hoc verification scripts you write yourself.",
+    "- Before declaring the task complete: run the FULL existing test suite, not just tests targeting the new feature. Many tasks modify code that other features depend on — verify you did not break previously-passing functionality. Regression failures on stages you were not asked to modify still count as failures.",
+    "- \"I have implemented X\" is not the same as \"tests pass.\" Do not declare completion without observing an explicit green signal (exit code 0, all-pass marker) from the canonical test runner over the full suite.",
 ].join("\n");
-const BENCHMARK_RETRY_REMINDER = [
+const BENCHMARK_RETRY_REMINDER_APPROVAL = [
     "Benchmark harness reminder:",
     "- This task is already approved.",
     "- Do not ask for confirmation or present a plan without acting.",
     "- Use tools, make the required edits immediately, and finish the task.",
 ].join("\n");
+// Used when the previous attempt emitted zero tool calls — either pure
+// reasoning that produced no output (Gemini 2.5 Pro's thinking-paralysis
+// mode) or narration claiming work was done without any tool calls.
+// In benchmark mode every task requires code changes; zero tool calls
+// is a definite failure regardless of what the response text claims.
+const BENCHMARK_RETRY_REMINDER_NO_ACTION = [
+    "Benchmark harness reminder:",
+    "- Your previous response made zero tool calls. The task is not complete.",
+    '- Code changes only happen through tool calls (edit_file / write_file). Text alone — including past-tense statements like "I\'ve added X" or future-tense plans like "I\'ll add X" — is not a change.',
+    "- Begin by reading the relevant files with read_file or read_symbol, then make the edits with edit_file / write_file, then verify the result with run_command.",
+].join("\n");
+// Used when the previous attempt made tool calls but never mutated any
+// file (no edit_file/write_file, no `cat > FILE`/`sed -i`/`tee` shell
+// edit). Observed shape: model reads a few symbols, narrates a plan
+// ("Here's how I'll fix it: 1. … 2. …"), and stops without acting.
+// Distinct from `approval_seeking` (no explicit confirmation request)
+// and from `no_action` (tool-call count > 0).
+const BENCHMARK_RETRY_REMINDER_NO_MUTATION = [
+    "Benchmark harness reminder:",
+    "- Your previous response read files but never edited any. The task is not complete.",
+    "- Code changes only happen through file mutations: edit_file / write_file (preferred), or a shell command that writes to a file (e.g. `cat > path <<EOF`, `sed -i ...`).",
+    "- Reading more files is not progress on its own. Identify the file to change, make the edit, then verify with run_command.",
+].join("\n");
+/**
+ * Cap on prior-reasoning content forwarded to the retry prompt. ~2000
+ * chars ≈ 500 tokens — enough to convey the model's high-level plan from
+ * the failed attempt without ballooning the second attempt's input cost.
+ */
+const PRIOR_REASONING_MAX_CHARS = 2000;
+/**
+ * Build a "your previous attempt thought this" block to append to the
+ * retry prompt. Helps the model see its own prior reasoning so the
+ * retry isn't starting from a cold state — particularly useful when the
+ * first attempt collapsed to pure reasoning (no visible content / no
+ * tool calls). Returns an empty string when no reasoning was captured.
+ */
+export function buildPriorReasoningContext(reasoningContent) {
+    if (typeof reasoningContent !== "string") {
+        return "";
+    }
+    const trimmed = reasoningContent.trim();
+    if (trimmed.length === 0) {
+        return "";
+    }
+    const snippet = trimmed.length > PRIOR_REASONING_MAX_CHARS
+        ? trimmed.slice(0, PRIOR_REASONING_MAX_CHARS) +
+            `\n…[${trimmed.length - PRIOR_REASONING_MAX_CHARS} more chars of reasoning truncated]`
+        : trimmed;
+    return [
+        "",
+        "Your previous attempt's internal reasoning (verbatim, for your own context):",
+        "<<<PRIOR_REASONING>>>",
+        snippet,
+        "<<<END_PRIOR_REASONING>>>",
+        "Apply that reasoning concretely — make the file changes your previous turn was planning, then verify.",
+    ].join("\n");
+}
 const CONFIRMATION_REQUEST_PATTERNS = [
     /\bplease confirm\b/i,
     /\bconfirm and i(?:'|’)ll\b/i,
@@ -34,6 +97,20 @@ const CONFIRMATION_REQUEST_PATTERNS = [
     /\bneed your approval\b/i,
     /\bpermission\b/i,
 ];
+/**
+ * Whether the retry path should force a tool call (`toolChoice: "required"`)
+ * AND cap reasoning to 2K tokens on the retried attempt. Default is true —
+ * this is the validated rescue behavior for collapse-prone retries. Set
+ * `BENCHMARK_RETRY_FORCE_TOOLS=0` (or `false`/`no`/`off`) to disable; the
+ * retry will then run with the same model config as the first attempt.
+ *
+ * The escape hatch exists so a future model that misbehaves under
+ * `tool_choice: required` can be unblocked without a code revert.
+ */
+export function isRetryForceToolsEnabled(env) {
+    const value = (env.BENCHMARK_RETRY_FORCE_TOOLS ?? "").trim().toLowerCase();
+    return value !== "0" && value !== "false" && value !== "no" && value !== "off";
+}
 const SPECIALIZED_TOOL_NAMES = new Set([
     "read_symbol",
     "find_references",
@@ -46,6 +123,37 @@ const SEARCH_TOOL_NAMES = new Set(["search"]);
 const MUTATION_TOOL_NAMES = new Set(["edit_file", "write_file"]);
 const COMMAND_TOOL_NAMES = new Set(["run_command"]);
 const REPEATED_TOOL_CALL_STOP_TEXT = "Stopped due to repeated identical tool calls";
+// Heuristics for "this shell command modified a file." Observed during
+// trace analysis: gemini-3-pro routinely uses `cat > FILE <<EOF` /
+// `cat >> FILE` heredocs instead of edit_file/write_file, so the model
+// is doing real work while our toolUsage.mutationTotal reads zero. The
+// retry detector and mutation analysis both need to recognize these as
+// real edits.
+const SHELL_MUTATION_PATTERNS = [
+    // Redirect to a file: `> path`, `>> path`, or with a leading fd like `2> path`.
+    // Excludes `/dev/null`, `/dev/stderr`, and fd redirects (`>&2`).
+    /(?:^|[^&>])>>?\s*(?!\/dev\/null\b|\/dev\/stderr\b|&\d)[^\s|;&<>]+/,
+    // sed in-place edit.
+    /\bsed\b[^|;]*\s-i\b/,
+    // tee writes its stdin to one or more files.
+    /\btee\b(?!\s+--help\b)/,
+    // Python `open(..., "w"|"a"|"r+"|"wb"|"ab").write(...)` invocation (covers
+    // the common `python -c "..."` mutation shape).
+    /\bopen\s*\(\s*['"][^'"]+['"]\s*,\s*['"][rwa]\+?b?\+?['"]\s*\)\s*\.\s*write\b/,
+];
+export function looksLikeShellFileMutation(command) {
+    if (typeof command !== "string" || command.length === 0) {
+        return false;
+    }
+    return SHELL_MUTATION_PATTERNS.some((pattern) => pattern.test(command));
+}
+function toolCallLooksLikeShellMutation(toolCall) {
+    if (!COMMAND_TOOL_NAMES.has(toolCall.name)) {
+        return false;
+    }
+    const command = toolCall.input?.command;
+    return typeof command === "string" && looksLikeShellFileMutation(command);
+}
 export function getBenchmarkSystemPromptSuffix() {
     return BENCHMARK_SYSTEM_PROMPT_SUFFIX;
 }
@@ -55,6 +163,84 @@ export function isBenchmarkApprovalSeekingResponse(text) {
     }
     return CONFIRMATION_REQUEST_PATTERNS.some((pattern) => pattern.test(text));
 }
+function countToolCallsInMessages(messages) {
+    let count = 0;
+    for (const message of messages) {
+        if (message.role === "assistant" && message.toolCalls?.length) {
+            count += message.toolCalls.length;
+        }
+    }
+    return count;
+}
+/**
+ * Count any tool call that produced a real workspace mutation, whether via
+ * the structured tools (edit_file / write_file) or via a shell command
+ * that looks like a file write (heredoc into a file, sed -i, tee, etc.).
+ */
+export function countMutationsInMessages(messages) {
+    let count = 0;
+    for (const message of messages) {
+        if (message.role !== "assistant" || !message.toolCalls?.length) {
+            continue;
+        }
+        for (const toolCall of message.toolCalls) {
+            if (MUTATION_TOOL_NAMES.has(toolCall.name)) {
+                count += 1;
+                continue;
+            }
+            if (toolCallLooksLikeShellMutation({
+                name: toolCall.name,
+                input: (toolCall.input ?? {}),
+            })) {
+                count += 1;
+            }
+        }
+    }
+    return count;
+}
+/**
+ * Decide whether a benchmark attempt should be retried once with an
+ * additional reminder appended to the prompt. Returns the reason (so the
+ * caller can pick a matching reminder), or `null` if the attempt looks
+ * fine as-is.
+ *
+ * Three failure modes warrant retry, checked in this order:
+ *   - `no_action`: the model emitted zero tool calls in the entire turn.
+ *     In benchmark mode every task requires code changes, so a tool-call-
+ *     free response is by definition incomplete. Covers both pure-
+ *     reasoning failures (visible text empty) and hallucinated-completion
+ *     narration ("I've added the helper" with no edit_file call).
+ *   - `approval_seeking`: the model asked for confirmation rather than
+ *     acting, even though it made some tool calls.
+ *   - `no_mutation`: the model made tool calls but never produced a file
+ *     mutation — read-only exploration that stopped at a plan. Observed
+ *     on Gemini 2.5 Pro: 3 read_symbol calls followed by a future-tense
+ *     plan, no edit. mutationCount counts both structured and shell-based
+ *     mutations so this only fires when the model genuinely did nothing
+ *     to the workspace.
+ */
+export function getBenchmarkRetryReason(attempt) {
+    if (attempt.toolCallCount === 0) {
+        return "no_action";
+    }
+    if (isBenchmarkApprovalSeekingResponse(attempt.text)) {
+        return "approval_seeking";
+    }
+    if (attempt.mutationCount === 0) {
+        return "no_mutation";
+    }
+    return null;
+}
+export function getBenchmarkRetryReminder(reason) {
+    switch (reason) {
+        case "approval_seeking":
+            return BENCHMARK_RETRY_REMINDER_APPROVAL;
+        case "no_action":
+            return BENCHMARK_RETRY_REMINDER_NO_ACTION;
+        case "no_mutation":
+            return BENCHMARK_RETRY_REMINDER_NO_MUTATION;
+    }
+}
 function stableSerialize(value) {
     if (Array.isArray(value)) {
         return `[${value.map((item) => stableSerialize(item)).join(",")}]`;
@@ -105,6 +291,7 @@ export function summarizeBenchmarkToolUsage(toolCalls, finalText) {
     let fileReadTotal = 0;
     let searchTotal = 0;
     let mutationTotal = 0;
+    let shellMutationTotal = 0;
     let commandTotal = 0;
     let skippedTotal = 0;
     for (const toolCall of toolCalls) {
@@ -124,6 +311,9 @@ export function summarizeBenchmarkToolUsage(toolCalls, finalText) {
         }
         if (COMMAND_TOOL_NAMES.has(toolCall.name)) {
             commandTotal += 1;
+            if (toolCallLooksLikeShellMutation(toolCall)) {
+                shellMutationTotal += 1;
+            }
         }
         if (toolCall.skipped) {
             skippedTotal += 1;
@@ -149,6 +339,7 @@ export function summarizeBenchmarkToolUsage(toolCalls, finalText) {
         fileReadTotal,
         searchTotal,
         mutationTotal,
+        shellMutationTotal,
         commandTotal,
         skippedTotal,
         repeatedStop: finalText.includes(REPEATED_TOOL_CALL_STOP_TEXT) ||
@@ -176,6 +367,8 @@ export function parseBenchmarkRunArgs(argv) {
     let workspaceRoot;
     let diffOut;
     let outFile;
+    let contextBenchTrajectory;
+    let contextBenchImage;
     let verbose = false;
     for (let i = 0; i < argv.length; i += 1) {
         const arg = argv[i];
@@ -276,6 +469,26 @@ export function parseBenchmarkRunArgs(argv) {
             promptFile = arg.slice("--prompt-file=".length).trim();
             continue;
         }
+        if (arg === "--contextbench-trajectory") {
+            const parsed = readFlagValue(argv, i, "--contextbench-trajectory");
+            contextBenchTrajectory = parsed.value;
+            i = parsed.nextIndex;
+            continue;
+        }
+        if (arg.startsWith("--contextbench-trajectory=")) {
+            contextBenchTrajectory = arg.slice("--contextbench-trajectory=".length).trim();
+            continue;
+        }
+        if (arg === "--contextbench-image") {
+            const parsed = readFlagValue(argv, i, "--contextbench-image");
+            contextBenchImage = parsed.value;
+            i = parsed.nextIndex;
+            continue;
+        }
+        if (arg.startsWith("--contextbench-image=")) {
+            contextBenchImage = arg.slice("--contextbench-image=".length).trim();
+            continue;
+        }
         promptParts.push(arg);
     }
     if (promptFile && promptParts.length > 0) {
@@ -292,6 +505,8 @@ export function parseBenchmarkRunArgs(argv) {
         ...(workspaceRoot ? { workspaceRoot } : {}),
         ...(diffOut ? { diffOut } : {}),
         ...(outFile ? { outFile } : {}),
+        ...(contextBenchTrajectory ? { contextBenchTrajectory } : {}),
+        ...(contextBenchImage ? { contextBenchImage } : {}),
         verbose,
     };
 }
@@ -365,6 +580,11 @@ export async function runBenchmarkCommand(argv) {
             projectIndex = undefined;
         }
         const toolRegistry = createToolRegistry(config, projectIndex);
+        // Snapshot HEAD before the model runs so we can diff against this point
+        // even if it commits its fix mid-run. Without this, `git diff` (which
+        // compares working-tree vs index) is blind to committed changes and the
+        // patch returned to the harness would be empty.
+        const baselineRef = (await captureBaselineRef(config.workspaceRoot)) ?? undefined;
         const startedAt = new Date().toISOString();
         const started = performance.now();
         let attempt = await runBenchmarkAttempt(prompt, {
@@ -374,12 +594,48 @@ export async function runBenchmarkCommand(argv) {
             verbose: args.verbose,
             ...(projectIndex !== undefined ? { projectIndex } : {}),
         });
-        if (isBenchmarkApprovalSeekingResponse(attempt.text)) {
+        let retryRecord = null;
+        const retryReason = getBenchmarkRetryReason({
+            text: attempt.text,
+            toolCallCount: countToolCallsInMessages(attempt.messages),
+            mutationCount: countMutationsInMessages(attempt.messages),
+        });
+        if (retryReason !== null) {
             if (args.verbose) {
-                console.error("[benchmark] Model asked for confirmation; retrying once with a non-interactive reminder.");
+                const description = retryReason === "approval_seeking"
+                    ? "Model asked for confirmation"
+                    : retryReason === "no_action"
+                        ? "Model emitted zero tool calls (pure-reasoning or narration-only response)"
+                        : "Model made tool calls but never mutated any file (plan-only response)";
+                console.error(`[benchmark] ${description}; retrying once with a non-interactive reminder.`);
             }
-            attempt = await runBenchmarkAttempt(`${prompt}\n\n${BENCHMARK_RETRY_REMINDER}`, {
-                config,
+            const reminder = getBenchmarkRetryReminder(retryReason);
+            const priorReasoningBlock = buildPriorReasoningContext(attempt.reasoningContent);
+            // Force a tool call on the retry AND cap reasoning. The first attempt
+            // already failed to act; `tool_choice: required` commits the model to
+            // emit a tool call, and the reasoning cap starves the dynamic-thinking
+            // path that produced the original collapse (Gemini 2.5 Pro routinely
+            // burns 10K+ reasoning tokens before returning nothing). The cap is
+            // above what a typical planning step needs (~1-2K tokens) but well
+            // below the observed collapse zone. Opt-out via env var.
+            const RETRY_REASONING_MAX_TOKENS = 2000;
+            const forceTools = isRetryForceToolsEnabled(process.env);
+            const retryConfig = forceTools
+                ? {
+                    ...config,
+                    toolChoice: "required",
+                    reasoningMaxTokens: RETRY_REASONING_MAX_TOKENS,
+                }
+                : config;
+            retryRecord = forceTools
+                ? {
+                    reason: retryReason,
+                    toolChoice: "required",
+                    reasoningMaxTokens: RETRY_REASONING_MAX_TOKENS,
+                }
+                : { reason: retryReason };
+            attempt = await runBenchmarkAttempt(`${prompt}\n\n${reminder}${priorReasoningBlock}`, {
+                config: retryConfig,
                 modelClient,
                 toolRegistry,
                 verbose: args.verbose,
@@ -388,11 +644,11 @@ export async function runBenchmarkCommand(argv) {
         }
         const durationMs = performance.now() - started;
         const completedAt = new Date().toISOString();
-        const changes = await collectWorkspaceChanges(config.workspaceRoot);
+        const changes = await collectWorkspaceChanges(config.workspaceRoot, baselineRef);
         let diffOutPath;
         if (args.diffOut) {
             diffOutPath = path.resolve(cwd, args.diffOut);
-            await writeWorkspaceDiff(config.workspaceRoot, diffOutPath);
+            await writeWorkspaceDiff(config.workspaceRoot, diffOutPath, baselineRef);
         }
         const toolCalls = buildBenchmarkToolTrace(attempt.messages);
         const result = {
@@ -410,6 +666,7 @@ export async function runBenchmarkCommand(argv) {
             ...(diffOutPath ? { diffOut: diffOutPath } : {}),
             toolCalls,
             toolUsage: summarizeBenchmarkToolUsage(toolCalls, attempt.text),
+            retry: retryRecord,
         };
         const payload = JSON.stringify(result, null, 2);
         if (args.outFile) {
@@ -420,6 +677,22 @@ export async function runBenchmarkCommand(argv) {
         else {
             console.log(payload);
         }
+        if (args.contextBenchTrajectory) {
+            const trajectoryPath = path.resolve(cwd, args.contextBenchTrajectory);
+            const patch = (await getWorkspaceDiff(config.workspaceRoot, baselineRef)) ?? "";
+            const trajectory = buildContextBenchTrajectory({
+                systemPrompt: BENCHMARK_SYSTEM_PROMPT_SUFFIX,
+                userPrompt: prompt,
+                toolCalls,
+                finalAssistantText: attempt.text,
+                workspaceRoot: config.workspaceRoot,
+                patch,
+                ...(projectIndex !== undefined ? { projectIndex } : {}),
+                ...(args.contextBenchImage ? { image: args.contextBenchImage } : {}),
+            });
+            await mkdir(path.dirname(trajectoryPath), { recursive: true });
+            await writeFile(trajectoryPath, JSON.stringify(trajectory, null, 2) + "\n", "utf8");
+        }
     }
     finally {
         if (previousAnthropicApiKey === undefined) {