@sean.holung/minicode 0.4.1 → 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27) hide show
  1. package/dist/src/agent/config.js +11 -1
  2. package/dist/src/benchmark/config.js +15 -1
  3. package/dist/src/benchmark/index.js +1 -1
  4. package/dist/src/benchmark/workspace-changes.js +93 -12
  5. package/dist/src/cli/benchmark-run.js +285 -12
  6. package/dist/src/cli/contextbench-trajectory.js +258 -0
  7. package/dist/tests/agent.test.js +17 -21
  8. package/dist/tests/benchmark-run.test.js +223 -2
  9. package/dist/tests/contextbench-trajectory.test.js +228 -0
  10. package/dist/tests/reasoning-effort.test.js +83 -0
  11. package/dist/tests/workspace-changes.test.js +50 -1
  12. package/node_modules/@sean.holung/minicode-sdk/dist/src/agent/agent.d.ts +13 -1
  13. package/node_modules/@sean.holung/minicode-sdk/dist/src/agent/agent.d.ts.map +1 -1
  14. package/node_modules/@sean.holung/minicode-sdk/dist/src/agent/agent.js +133 -24
  15. package/node_modules/@sean.holung/minicode-sdk/dist/src/agent/agent.js.map +1 -1
  16. package/node_modules/@sean.holung/minicode-sdk/dist/src/agent/types.d.ts +62 -0
  17. package/node_modules/@sean.holung/minicode-sdk/dist/src/agent/types.d.ts.map +1 -1
  18. package/node_modules/@sean.holung/minicode-sdk/dist/src/agent/types.js.map +1 -1
  19. package/node_modules/@sean.holung/minicode-sdk/dist/src/model/client.d.ts +5 -1
  20. package/node_modules/@sean.holung/minicode-sdk/dist/src/model/client.d.ts.map +1 -1
  21. package/node_modules/@sean.holung/minicode-sdk/dist/src/model/client.js +86 -7
  22. package/node_modules/@sean.holung/minicode-sdk/dist/src/model/client.js.map +1 -1
  23. package/node_modules/@sean.holung/minicode-sdk/dist/src/prompt/system-prompt.d.ts.map +1 -1
  24. package/node_modules/@sean.holung/minicode-sdk/dist/src/prompt/system-prompt.js +1 -1
  25. package/node_modules/@sean.holung/minicode-sdk/dist/src/prompt/system-prompt.js.map +1 -1
  26. package/node_modules/minicode-plugin-python/dist/tsconfig.tsbuildinfo +1 -1
  27. package/package.json +1 -1
@@ -33,6 +33,7 @@ export function formatConfigForDisplay(config) {
33
33
  "compactionThreshold: " + (config.compactionThreshold ?? "(disabled)"),
34
34
  "compactionModel: " + (config.compactionModel ?? "(disabled — using mechanical compaction)"),
35
35
  "reasoningEffort: " + (config.reasoningEffort ?? "(unset — no reasoning parameters sent)"),
36
+ "reasoningMaxTokens: " + (config.reasoningMaxTokens !== undefined ? String(config.reasoningMaxTokens) : "(unset — uncapped)"),
36
37
  "enableDynamicPrompt: " + (config.enableDynamicPrompt ?? false),
37
38
  ];
38
39
  return lines.join("\n");
@@ -255,7 +256,7 @@ export async function loadAgentConfig(cwd = process.cwd(), options = {}) {
255
256
  modelProvider: parseModelProvider(env.MODEL_PROVIDER ?? "openai-compatible"),
256
257
  model: env.MODEL ?? "",
257
258
  maxSteps: parseNumber(env.MAX_STEPS, 50),
258
- maxTokens: parseNumber(env.MAX_TOKENS, 4096),
259
+ maxTokens: parseNumber(env.MAX_TOKENS, 16000),
259
260
  modelTimeoutSeconds: parseNumber(env.MODEL_TIMEOUT_SECONDS, 60),
260
261
  maxContextTokens: parseNumber(env.MAX_CONTEXT_TOKENS, 32_000),
261
262
  workspaceRoot,
@@ -280,5 +281,14 @@ export async function loadAgentConfig(cwd = process.cwd(), options = {}) {
280
281
  const effort = parseReasoningEffort(env.REASONING_EFFORT);
281
282
  return effort ? { reasoningEffort: effort } : {};
282
283
  })(),
284
+ ...(() => {
285
+ const raw = env.REASONING_MAX_TOKENS;
286
+ if (raw === undefined || raw === "")
287
+ return {};
288
+ const value = Number(raw);
289
+ if (!Number.isFinite(value) || value <= 0)
290
+ return {};
291
+ return { reasoningMaxTokens: Math.floor(value) };
292
+ })(),
283
293
  };
284
294
  }
@@ -163,7 +163,7 @@ export async function buildBenchmarkAgentConfig(options = {}) {
163
163
  modelProvider: provider,
164
164
  model,
165
165
  maxSteps: parseNumber(resolvedEnv.values.MAX_STEPS ?? fileConfig.maxSteps, 50),
166
- maxTokens: parseNumber(resolvedEnv.values.MAX_TOKENS ?? fileConfig.maxTokens, 4096),
166
+ maxTokens: parseNumber(resolvedEnv.values.MAX_TOKENS ?? fileConfig.maxTokens, 16000),
167
167
  modelTimeoutSeconds: parseNumber(resolvedEnv.values.MODEL_TIMEOUT_SECONDS ?? fileConfig.modelTimeoutSeconds, 60),
168
168
  maxContextTokens: parseNumber(resolvedEnv.values.MAX_CONTEXT_TOKENS ?? fileConfig.maxContextTokens, 32_000),
169
169
  workspaceRoot,
@@ -192,6 +192,20 @@ export async function buildBenchmarkAgentConfig(options = {}) {
192
192
  const effort = parseReasoningEffort(resolvedEnv.values.REASONING_EFFORT ?? fileConfig.reasoningEffort);
193
193
  return effort ? { reasoningEffort: effort } : {};
194
194
  })(),
195
+ ...(() => {
196
+ // Opt-in hard cap on reasoning tokens per turn. Useful for models
197
+ // (notably Gemini 2.5 Pro) that can otherwise burn the full output
198
+ // budget on dynamic thinking without producing a visible response.
199
+ // Unset by default — uncapped reasoning is the right behavior for
200
+ // most models.
201
+ const raw = resolvedEnv.values.REASONING_MAX_TOKENS ?? fileConfig.reasoningMaxTokens;
202
+ if (raw === undefined || raw === null || raw === "")
203
+ return {};
204
+ const value = typeof raw === "number" ? raw : Number(raw);
205
+ if (!Number.isFinite(value) || value <= 0)
206
+ return {};
207
+ return { reasoningMaxTokens: Math.floor(value) };
208
+ })(),
195
209
  enableDynamicPrompt: parseBoolean(resolvedEnv.values.ENABLE_DYNAMIC_PROMPT ?? fileConfig.enableDynamicPrompt, false),
196
210
  };
197
211
  }
@@ -2,5 +2,5 @@ export { loadBenchmarkTask, loadBenchmarkTasks } from "./task-loader.js";
2
2
  export { evaluate } from "./evaluator.js";
3
3
  export { runBenchmarkTask, runBenchmarkSuite, } from "./runner.js";
4
4
  export { buildBenchmarkAgentConfig, getDefaultBenchmarkConfigPath, resolveBenchmarkEnv, } from "./config.js";
5
- export { collectWorkspaceChanges, writeWorkspaceDiff, } from "./workspace-changes.js";
5
+ export { collectWorkspaceChanges, getWorkspaceDiff, writeWorkspaceDiff, } from "./workspace-changes.js";
6
6
  export { buildReport, buildReportFromEvaluations, formatReport, compareReports, } from "./reporter.js";
@@ -59,6 +59,47 @@ async function getWorkspaceGitPrefix(workspaceRoot) {
59
59
  const prefix = await runGit(workspaceRoot, ["rev-parse", "--show-prefix"], true);
60
60
  return prefix.trim();
61
61
  }
62
/**
 * Record the commit HEAD currently points at, so a later diff can be taken
 * against that commit even if new commits are created in between.
 *
 * @param {string} workspaceRoot - Directory to inspect.
 * @returns {Promise<string | null>} The trimmed output of `git rev-parse HEAD`,
 *   or null when `workspaceRoot` is not a git repository or the command
 *   produced no output (presumably a repository without commits).
 */
export async function captureBaselineRef(workspaceRoot) {
    const insideRepo = await isGitRepository(workspaceRoot);
    if (!insideRepo) {
        return null;
    }
    const headOutput = await runGit(workspaceRoot, ["rev-parse", "HEAD"], true);
    const headSha = headOutput.trim();
    if (headSha.length === 0) {
        return null;
    }
    return headSha;
}
74
/**
 * Parse a single line of `git diff --name-status` output.
 *
 * Lines are tab-separated: `STATUS\tPATH`, or `STATUS\tOLD\tNEW` for renames
 * and copies (whose status carries a similarity score, e.g. `R100` / `C75`).
 *
 * @param {string} line - One output line, without its line terminator.
 * @returns {{ status: string, path: string, previousPath?: string } | undefined}
 *   The parsed entry, or undefined for empty or malformed lines. `status` is
 *   the first status letter followed by a space, giving it the two-character
 *   width of a porcelain status; it is not an exact porcelain code.
 */
function parseNameStatusLine(line) {
    if (line.length === 0) {
        return undefined;
    }
    const [rawStatus, firstPath, secondPath] = line.split("\t");
    if (!rawStatus) {
        return undefined;
    }
    const code = rawStatus.charAt(0);
    const status = `${code} `;
    const isRenameOrCopy = code === "R" || code === "C";
    if (isRenameOrCopy) {
        // Renames/copies need both the old and the new path to be usable.
        return firstPath && secondPath
            ? { status, path: secondPath, previousPath: firstPath }
            : undefined;
    }
    return firstPath ? { status, path: firstPath } : undefined;
}
62
103
  function stripWorkspacePrefix(filePath, workspacePrefix) {
63
104
  if (!workspacePrefix) {
64
105
  return filePath;
@@ -70,7 +111,7 @@ function stripWorkspacePrefix(filePath, workspacePrefix) {
70
111
  ? filePath.slice(normalizedPrefix.length)
71
112
  : filePath;
72
113
  }
73
- export async function collectWorkspaceChanges(workspaceRoot) {
114
+ export async function collectWorkspaceChanges(workspaceRoot, baselineRef) {
74
115
  const isGitRepo = await isGitRepository(workspaceRoot);
75
116
  if (!isGitRepo) {
76
117
  return {
@@ -80,11 +121,7 @@ export async function collectWorkspaceChanges(workspaceRoot) {
80
121
  };
81
122
  }
82
123
  const workspacePrefix = await getWorkspaceGitPrefix(workspaceRoot);
83
- const statusOutput = await runGit(workspaceRoot, ["status", "--porcelain=v1", "--untracked-files=all", "--", "."], true);
84
- const entries = statusOutput
85
- .split(/\r?\n/)
86
- .map((line) => parseStatusLine(line))
87
- .map((entry) => entry
124
+ const remap = (entry) => entry
88
125
  ? {
89
126
  ...entry,
90
127
  path: stripWorkspacePrefix(entry.path, workspacePrefix),
@@ -92,8 +129,39 @@ export async function collectWorkspaceChanges(workspaceRoot) {
92
129
  ? { previousPath: stripWorkspacePrefix(entry.previousPath, workspacePrefix) }
93
130
  : {}),
94
131
  }
95
- : undefined)
132
+ : undefined;
133
+ // Always pull untracked entries from `git status` — they're never part of
134
+ // a baseline diff because they aren't tracked yet.
135
+ const statusOutput = await runGit(workspaceRoot, ["status", "--porcelain=v1", "--untracked-files=all", "--", "."], true);
136
+ const statusEntries = statusOutput
137
+ .split(/\r?\n/)
138
+ .map((line) => parseStatusLine(line))
139
+ .map(remap)
96
140
  .filter((entry) => entry !== undefined);
141
+ let entries;
142
+ if (baselineRef) {
143
+ // Tracked changes: anything that differs between the baseline commit and
144
+ // the current working tree. Captures committed, staged, AND unstaged
145
+ // edits in one shot.
146
+ const nameStatusOutput = await runGit(workspaceRoot, ["diff", "--name-status", baselineRef, "--", "."], true);
147
+ const trackedEntries = nameStatusOutput
148
+ .split(/\r?\n/)
149
+ .map((line) => parseNameStatusLine(line))
150
+ .map(remap)
151
+ .filter((entry) => entry !== undefined);
152
+ const untrackedEntries = statusEntries.filter((entry) => entry.status === "??");
153
+ const seen = new Set();
154
+ entries = [];
155
+ for (const entry of [...trackedEntries, ...untrackedEntries]) {
156
+ if (seen.has(entry.path))
157
+ continue;
158
+ seen.add(entry.path);
159
+ entries.push(entry);
160
+ }
161
+ }
162
+ else {
163
+ entries = statusEntries;
164
+ }
97
165
  const changedFiles = [...new Set(entries.map((entry) => entry.path))];
98
166
  return {
99
167
  isGitRepo: true,
@@ -101,12 +169,19 @@ export async function collectWorkspaceChanges(workspaceRoot) {
101
169
  changedFiles,
102
170
  };
103
171
  }
104
- export async function writeWorkspaceDiff(workspaceRoot, outPath) {
105
- const changes = await collectWorkspaceChanges(workspaceRoot);
172
+ export async function getWorkspaceDiff(workspaceRoot, baselineRef) {
173
+ const changes = await collectWorkspaceChanges(workspaceRoot, baselineRef);
106
174
  if (!changes.isGitRepo) {
107
- return false;
175
+ return null;
108
176
  }
109
- const trackedDiff = await runGit(workspaceRoot, ["diff", "--binary", "--no-ext-diff", "--relative", "--", "."], true);
177
+ // With a baseline ref we diff working-tree vs baseline directly, which
178
+ // captures committed + staged + unstaged in one pass. Without one we
179
+ // fall back to the working-tree-vs-index behavior — useful when the
180
+ // caller hasn't snapshotted a starting point.
181
+ const trackedDiffArgs = baselineRef
182
+ ? ["diff", "--binary", "--no-ext-diff", "--relative", baselineRef, "--", "."]
183
+ : ["diff", "--binary", "--no-ext-diff", "--relative", "--", "."];
184
+ const trackedDiff = await runGit(workspaceRoot, trackedDiffArgs, true);
110
185
  const untrackedDiffs = [];
111
186
  for (const entry of changes.entries) {
112
187
  if (entry.status === "??") {
@@ -116,9 +191,15 @@ export async function writeWorkspaceDiff(workspaceRoot, outPath) {
116
191
  }
117
192
  }
118
193
  }
119
- const combinedDiff = [trackedDiff, ...untrackedDiffs]
194
+ return [trackedDiff, ...untrackedDiffs]
120
195
  .filter((section) => section.trim().length > 0)
121
196
  .join("\n");
197
+ }
198
+ export async function writeWorkspaceDiff(workspaceRoot, outPath, baselineRef) {
199
+ const combinedDiff = await getWorkspaceDiff(workspaceRoot, baselineRef);
200
+ if (combinedDiff === null) {
201
+ return false;
202
+ }
122
203
  await mkdir(path.dirname(outPath), { recursive: true });
123
204
  await writeFile(outPath, combinedDiff, "utf8");
124
205
  return true;
@@ -4,24 +4,87 @@ import process from "node:process";
4
4
  import { CodingAgent, createModelClient, } from "@sean.holung/minicode-sdk";
5
5
  import { getConfigSetupMessage } from "../agent/config.js";
6
6
  import { buildBenchmarkAgentConfig, resolveBenchmarkEnv, } from "../benchmark/config.js";
7
- import { collectWorkspaceChanges, writeWorkspaceDiff, } from "../benchmark/workspace-changes.js";
7
+ import { captureBaselineRef, collectWorkspaceChanges, getWorkspaceDiff, writeWorkspaceDiff, } from "../benchmark/workspace-changes.js";
8
8
  import { buildProjectIndex } from "../indexer/project-index.js";
9
9
  import { createToolRegistry } from "../tools/registry.js";
10
10
  import { CliUsageError } from "./args.js";
11
+ import { buildContextBenchTrajectory } from "./contextbench-trajectory.js";
11
12
  const BENCHMARK_SYSTEM_PROMPT_SUFFIX = [
12
- "[Benchmark Execution Mode]",
13
- "- This task is running in a non-interactive benchmark harness.",
14
- "- The task is already approved. Do not ask for confirmation, permission, or whether you should proceed.",
13
+ "[Execution Mode]",
14
+ "- This task is running in a non-interactive harness. The task is already approved.",
15
+ "- Do not ask for confirmation, permission, or whether you should proceed.",
15
16
  "- If the task requires code changes, make them immediately using the available tools.",
16
17
  "- Do not stop after presenting a plan. Either complete the task or explain a concrete blocker.",
17
- "- When validation is part of the task, run the required command once after making changes.",
18
+ "",
19
+ "[Long-form Task Discipline]",
20
+ "- Non-trivial coding tasks routinely require 30+ tool-call iterate-test-fix cycles. Persistent iteration against the test suite is expected; do not bail early because you have read a few files.",
21
+ "- Iterate against the canonical test runner (the test command shipped with the task) until it passes. Treat the runner's output as the source of truth — not your own assessment of whether the code looks correct, and not ad-hoc verification scripts you write yourself.",
22
+ "- Before declaring the task complete: run the FULL existing test suite, not just tests targeting the new feature. Many tasks modify code that other features depend on — verify you did not break previously-passing functionality. Regression failures on stages you were not asked to modify still count as failures.",
23
+ "- \"I have implemented X\" is not the same as \"tests pass.\" Do not declare completion without observing an explicit green signal (exit code 0, all-pass marker) from the canonical test runner over the full suite.",
18
24
  ].join("\n");
19
- const BENCHMARK_RETRY_REMINDER = [
25
+ const BENCHMARK_RETRY_REMINDER_APPROVAL = [
20
26
  "Benchmark harness reminder:",
21
27
  "- This task is already approved.",
22
28
  "- Do not ask for confirmation or present a plan without acting.",
23
29
  "- Use tools, make the required edits immediately, and finish the task.",
24
30
  ].join("\n");
31
+ // Used when the previous attempt emitted zero tool calls — either pure
32
+ // reasoning that produced no output (Gemini 2.5 Pro's thinking-paralysis
33
+ // mode) or narration claiming work was done without any tool calls.
34
+ // In benchmark mode every task requires code changes; zero tool calls
35
+ // is a definite failure regardless of what the response text claims.
36
+ const BENCHMARK_RETRY_REMINDER_NO_ACTION = [
37
+ "Benchmark harness reminder:",
38
+ "- Your previous response made zero tool calls. The task is not complete.",
39
+ '- Code changes only happen through tool calls (edit_file / write_file). Text alone — including past-tense statements like "I\'ve added X" or future-tense plans like "I\'ll add X" — is not a change.',
40
+ "- Begin by reading the relevant files with read_file or read_symbol, then make the edits with edit_file / write_file, then verify the result with run_command.",
41
+ ].join("\n");
42
+ // Used when the previous attempt made tool calls but never mutated any
43
+ // file (no edit_file/write_file, no `cat > FILE`/`sed -i`/`tee` shell
44
+ // edit). Observed shape: model reads a few symbols, narrates a plan
45
+ // ("Here's how I'll fix it: 1. … 2. …"), and stops without acting.
46
+ // Distinct from `approval_seeking` (no explicit confirmation request)
47
+ // and from `no_action` (tool-call count > 0).
48
+ const BENCHMARK_RETRY_REMINDER_NO_MUTATION = [
49
+ "Benchmark harness reminder:",
50
+ "- Your previous response read files but never edited any. The task is not complete.",
51
+ "- Code changes only happen through file mutations: edit_file / write_file (preferred), or a shell command that writes to a file (e.g. `cat > path <<EOF`, `sed -i ...`).",
52
+ "- Reading more files is not progress on its own. Identify the file to change, make the edit, then verify with run_command.",
53
+ ].join("\n");
54
/**
 * Cap on prior-reasoning content forwarded to the retry prompt. ~2000
 * chars ≈ 500 tokens — enough to convey the model's high-level plan from
 * the failed attempt without ballooning the second attempt's input cost.
 * Measured in UTF-16 code units (JavaScript string length).
 */
const PRIOR_REASONING_MAX_CHARS = 2000;
/**
 * Build a "your previous attempt thought this" block to append to the
 * retry prompt, so the retry can see the reasoning of the failed attempt.
 *
 * @param {unknown} reasoningContent - Reasoning text captured from the
 *   previous attempt; anything that is not a string is treated as absent.
 * @returns {string} A block that starts with a newline and wraps the trimmed
 *   (and possibly truncated) reasoning in PRIOR_REASONING markers, or an
 *   empty string when there is no non-whitespace reasoning.
 */
export function buildPriorReasoningContext(reasoningContent) {
    if (typeof reasoningContent !== "string") {
        return "";
    }
    const trimmed = reasoningContent.trim();
    if (trimmed.length === 0) {
        return "";
    }
    let snippet = trimmed;
    if (trimmed.length > PRIOR_REASONING_MAX_CHARS) {
        let cut = PRIOR_REASONING_MAX_CHARS;
        // Never cut between the two halves of a surrogate pair: a dangling
        // high surrogate is not valid text and would be forwarded verbatim
        // into the retry prompt. Back off one code unit instead.
        const lastKept = trimmed.charCodeAt(cut - 1);
        if (lastKept >= 0xd800 && lastKept <= 0xdbff) {
            cut -= 1;
        }
        snippet = `${trimmed.slice(0, cut)}\n…[${trimmed.length - cut} more chars of reasoning truncated]`;
    }
    return [
        "",
        "Your previous attempt's internal reasoning (verbatim, for your own context):",
        "<<<PRIOR_REASONING>>>",
        snippet,
        "<<<END_PRIOR_REASONING>>>",
        "Apply that reasoning concretely — make the file changes your previous turn was planning, then verify.",
    ].join("\n");
}
25
88
  const CONFIRMATION_REQUEST_PATTERNS = [
26
89
  /\bplease confirm\b/i,
27
90
  /\bconfirm and i(?:'|’)ll\b/i,
@@ -34,6 +97,20 @@ const CONFIRMATION_REQUEST_PATTERNS = [
34
97
  /\bneed your approval\b/i,
35
98
  /\bpermission\b/i,
36
99
  ];
100
/**
 * Report whether the forced-tool retry behavior is enabled.
 *
 * The behavior is on by default. It is switched off only when
 * `BENCHMARK_RETRY_FORCE_TOOLS` is set to `0`, `false`, `no` or `off`
 * (surrounding whitespace and letter case are ignored); any other value,
 * including an unset or empty variable, leaves it enabled.
 *
 * The opt-out exists so a model that misbehaves under a forced tool call
 * can be unblocked without a code change.
 *
 * @param {Record<string, string | undefined>} env - Environment variables.
 * @returns {boolean} False only for one of the disabling values above.
 */
export function isRetryForceToolsEnabled(env) {
    const disablingValues = ["0", "false", "no", "off"];
    const normalized = (env.BENCHMARK_RETRY_FORCE_TOOLS ?? "").trim().toLowerCase();
    return !disablingValues.includes(normalized);
}
37
114
  const SPECIALIZED_TOOL_NAMES = new Set([
38
115
  "read_symbol",
39
116
  "find_references",
@@ -46,6 +123,37 @@ const SEARCH_TOOL_NAMES = new Set(["search"]);
46
123
  const MUTATION_TOOL_NAMES = new Set(["edit_file", "write_file"]);
47
124
  const COMMAND_TOOL_NAMES = new Set(["run_command"]);
48
125
  const REPEATED_TOOL_CALL_STOP_TEXT = "Stopped due to repeated identical tool calls";
126
// Heuristics for "this shell command modified a file." Observed during
// trace analysis: gemini-3-pro routinely uses `cat > FILE <<EOF` /
// `cat >> FILE` heredocs instead of edit_file/write_file, so the model
// is doing real work while our toolUsage.mutationTotal reads zero. The
// retry detector and mutation analysis both need to recognize these as
// real edits.
//
// These are regex heuristics over the raw command text, not a shell parser:
// a `>` inside a quoted string (e.g. `awk '$1 > 5'`) can still be mistaken
// for a redirect.
const SHELL_MUTATION_PATTERNS = [
    // Redirect to a file: `> path`, `>> path`, or with a leading fd like `2> path`.
    // Excludes `/dev/null`, `/dev/stderr`, `/dev/stdout`, and fd redirects (`>&2`).
    // A `>` preceded by `-` or `=` (`ptr->field`, `x => y`) or followed by `=`
    // (`>=`) is an operator inside an argument, not a redirect.
    /(?:^|[^&>=-])>>?\s*(?!=|\/dev\/(?:null|stderr|stdout)\b|&\d)[^\s|;&<>]+/,
    // sed in-place edit (`-i`, `-i.bak`, or the long `--in-place` form).
    /\bsed\b[^|;]*\s(?:-i\b|--in-place\b)/,
    // tee writes its stdin to one or more files.
    /\btee\b(?!\s+--help\b)/,
    // Python `open(path, mode).write(...)` where the mode can actually write
    // (contains `w`, `a`, `x` or `+`); covers the common `python -c "..."`
    // mutation shape. Read-only modes such as "r"/"rb" are not mutations.
    /\bopen\s*\(\s*['"][^'"]+['"]\s*,\s*['"](?=[^'"]*[wax+])[rwaxbt+]+['"]\s*\)\s*\.\s*write\b/,
];
/**
 * Heuristically decide whether a shell command writes to a file.
 *
 * @param {unknown} command - Raw shell command text.
 * @returns {boolean} True when the text matches one of the mutation
 *   patterns; false for non-strings, empty strings, and everything else.
 */
export function looksLikeShellFileMutation(command) {
    if (typeof command !== "string" || command.length === 0) {
        return false;
    }
    return SHELL_MUTATION_PATTERNS.some((pattern) => pattern.test(command));
}
150
/**
 * Decide whether a recorded tool call is a command-tool invocation whose
 * command text looks like a file write.
 *
 * @param {{ name: string, input?: { command?: unknown } }} toolCall
 * @returns {boolean} True only when the tool name is in COMMAND_TOOL_NAMES,
 *   `input.command` is a string, and looksLikeShellFileMutation accepts it.
 */
function toolCallLooksLikeShellMutation(toolCall) {
    const isCommandTool = COMMAND_TOOL_NAMES.has(toolCall.name);
    const commandText = isCommandTool ? toolCall.input?.command : undefined;
    return typeof commandText === "string" && looksLikeShellFileMutation(commandText);
}
49
157
  export function getBenchmarkSystemPromptSuffix() {
50
158
  return BENCHMARK_SYSTEM_PROMPT_SUFFIX;
51
159
  }
@@ -55,6 +163,84 @@ export function isBenchmarkApprovalSeekingResponse(text) {
55
163
  }
56
164
  return CONFIRMATION_REQUEST_PATTERNS.some((pattern) => pattern.test(text));
57
165
  }
166
/**
 * Total the tool calls attached to assistant messages.
 *
 * @param {Iterable<{ role: string, toolCalls?: unknown[] }>} messages
 * @returns {number} Sum of `toolCalls.length` over messages whose role is
 *   "assistant"; messages with other roles are ignored.
 */
function countToolCallsInMessages(messages) {
    let total = 0;
    for (const message of messages) {
        if (message.role !== "assistant") {
            continue;
        }
        total += message.toolCalls?.length || 0;
    }
    return total;
}
175
/**
 * Count the assistant tool calls that look like workspace mutations: calls
 * to a tool listed in MUTATION_TOOL_NAMES, plus command-tool calls whose
 * command text looks like a file write (heredoc into a file, sed -i, tee,
 * etc.).
 *
 * @param {Iterable<{ role: string, toolCalls?: Array<{ name: string, input?: object }> }>} messages
 * @returns {number} Number of matching tool calls across assistant messages.
 */
export function countMutationsInMessages(messages) {
    let mutations = 0;
    for (const message of messages) {
        const calls = message.role === "assistant" ? message.toolCalls : undefined;
        if (!calls?.length) {
            continue;
        }
        for (const { name, input } of calls) {
            const isMutation = MUTATION_TOOL_NAMES.has(name) ||
                toolCallLooksLikeShellMutation({ name, input: input ?? {} });
            if (isMutation) {
                mutations += 1;
            }
        }
    }
    return mutations;
}
201
/**
 * Classify a finished attempt into the reason it should be retried, if any.
 *
 * The checks run in a fixed order and the first match wins:
 *   1. `no_action` — the attempt made zero tool calls, whatever its text
 *      says (covers both empty output and narration that claims work was
 *      done without any tool call).
 *   2. `approval_seeking` — the attempt's text asks for confirmation, even
 *      though some tool calls were made.
 *   3. `no_mutation` — tool calls were made but none counted as a file
 *      mutation (read-only exploration that stopped at a plan).
 *
 * @param {{ text: string, toolCallCount: number, mutationCount: number }} attempt
 * @returns {"no_action" | "approval_seeking" | "no_mutation" | null} The
 *   retry reason, or null when none of the checks match.
 */
export function getBenchmarkRetryReason(attempt) {
    const madeNoToolCalls = attempt.toolCallCount === 0;
    if (madeNoToolCalls) {
        return "no_action";
    }
    const askedForApproval = isBenchmarkApprovalSeekingResponse(attempt.text);
    if (askedForApproval) {
        return "approval_seeking";
    }
    return attempt.mutationCount === 0 ? "no_mutation" : null;
}
234
/**
 * Pick the reminder text that matches a retry reason.
 *
 * @param {"approval_seeking" | "no_action" | "no_mutation"} reason
 * @returns {string | undefined} The matching reminder constant; undefined
 *   for any other value.
 */
export function getBenchmarkRetryReminder(reason) {
    if (reason === "approval_seeking") {
        return BENCHMARK_RETRY_REMINDER_APPROVAL;
    }
    if (reason === "no_action") {
        return BENCHMARK_RETRY_REMINDER_NO_ACTION;
    }
    if (reason === "no_mutation") {
        return BENCHMARK_RETRY_REMINDER_NO_MUTATION;
    }
    return undefined;
}
58
244
  function stableSerialize(value) {
59
245
  if (Array.isArray(value)) {
60
246
  return `[${value.map((item) => stableSerialize(item)).join(",")}]`;
@@ -105,6 +291,7 @@ export function summarizeBenchmarkToolUsage(toolCalls, finalText) {
105
291
  let fileReadTotal = 0;
106
292
  let searchTotal = 0;
107
293
  let mutationTotal = 0;
294
+ let shellMutationTotal = 0;
108
295
  let commandTotal = 0;
109
296
  let skippedTotal = 0;
110
297
  for (const toolCall of toolCalls) {
@@ -124,6 +311,9 @@ export function summarizeBenchmarkToolUsage(toolCalls, finalText) {
124
311
  }
125
312
  if (COMMAND_TOOL_NAMES.has(toolCall.name)) {
126
313
  commandTotal += 1;
314
+ if (toolCallLooksLikeShellMutation(toolCall)) {
315
+ shellMutationTotal += 1;
316
+ }
127
317
  }
128
318
  if (toolCall.skipped) {
129
319
  skippedTotal += 1;
@@ -149,6 +339,7 @@ export function summarizeBenchmarkToolUsage(toolCalls, finalText) {
149
339
  fileReadTotal,
150
340
  searchTotal,
151
341
  mutationTotal,
342
+ shellMutationTotal,
152
343
  commandTotal,
153
344
  skippedTotal,
154
345
  repeatedStop: finalText.includes(REPEATED_TOOL_CALL_STOP_TEXT) ||
@@ -176,6 +367,8 @@ export function parseBenchmarkRunArgs(argv) {
176
367
  let workspaceRoot;
177
368
  let diffOut;
178
369
  let outFile;
370
+ let contextBenchTrajectory;
371
+ let contextBenchImage;
179
372
  let verbose = false;
180
373
  for (let i = 0; i < argv.length; i += 1) {
181
374
  const arg = argv[i];
@@ -276,6 +469,26 @@ export function parseBenchmarkRunArgs(argv) {
276
469
  promptFile = arg.slice("--prompt-file=".length).trim();
277
470
  continue;
278
471
  }
472
+ if (arg === "--contextbench-trajectory") {
473
+ const parsed = readFlagValue(argv, i, "--contextbench-trajectory");
474
+ contextBenchTrajectory = parsed.value;
475
+ i = parsed.nextIndex;
476
+ continue;
477
+ }
478
+ if (arg.startsWith("--contextbench-trajectory=")) {
479
+ contextBenchTrajectory = arg.slice("--contextbench-trajectory=".length).trim();
480
+ continue;
481
+ }
482
+ if (arg === "--contextbench-image") {
483
+ const parsed = readFlagValue(argv, i, "--contextbench-image");
484
+ contextBenchImage = parsed.value;
485
+ i = parsed.nextIndex;
486
+ continue;
487
+ }
488
+ if (arg.startsWith("--contextbench-image=")) {
489
+ contextBenchImage = arg.slice("--contextbench-image=".length).trim();
490
+ continue;
491
+ }
279
492
  promptParts.push(arg);
280
493
  }
281
494
  if (promptFile && promptParts.length > 0) {
@@ -292,6 +505,8 @@ export function parseBenchmarkRunArgs(argv) {
292
505
  ...(workspaceRoot ? { workspaceRoot } : {}),
293
506
  ...(diffOut ? { diffOut } : {}),
294
507
  ...(outFile ? { outFile } : {}),
508
+ ...(contextBenchTrajectory ? { contextBenchTrajectory } : {}),
509
+ ...(contextBenchImage ? { contextBenchImage } : {}),
295
510
  verbose,
296
511
  };
297
512
  }
@@ -365,6 +580,11 @@ export async function runBenchmarkCommand(argv) {
365
580
  projectIndex = undefined;
366
581
  }
367
582
  const toolRegistry = createToolRegistry(config, projectIndex);
583
+ // Snapshot HEAD before the model runs so we can diff against this point
584
+ // even if it commits its fix mid-run. Without this, `git diff` (which
585
+ // compares working-tree vs index) is blind to committed changes and the
586
+ // patch returned to the harness would be empty.
587
+ const baselineRef = (await captureBaselineRef(config.workspaceRoot)) ?? undefined;
368
588
  const startedAt = new Date().toISOString();
369
589
  const started = performance.now();
370
590
  let attempt = await runBenchmarkAttempt(prompt, {
@@ -374,12 +594,48 @@ export async function runBenchmarkCommand(argv) {
374
594
  verbose: args.verbose,
375
595
  ...(projectIndex !== undefined ? { projectIndex } : {}),
376
596
  });
377
- if (isBenchmarkApprovalSeekingResponse(attempt.text)) {
597
+ let retryRecord = null;
598
+ const retryReason = getBenchmarkRetryReason({
599
+ text: attempt.text,
600
+ toolCallCount: countToolCallsInMessages(attempt.messages),
601
+ mutationCount: countMutationsInMessages(attempt.messages),
602
+ });
603
+ if (retryReason !== null) {
378
604
  if (args.verbose) {
379
- console.error("[benchmark] Model asked for confirmation; retrying once with a non-interactive reminder.");
605
+ const description = retryReason === "approval_seeking"
606
+ ? "Model asked for confirmation"
607
+ : retryReason === "no_action"
608
+ ? "Model emitted zero tool calls (pure-reasoning or narration-only response)"
609
+ : "Model made tool calls but never mutated any file (plan-only response)";
610
+ console.error(`[benchmark] ${description}; retrying once with a non-interactive reminder.`);
380
611
  }
381
- attempt = await runBenchmarkAttempt(`${prompt}\n\n${BENCHMARK_RETRY_REMINDER}`, {
382
- config,
612
+ const reminder = getBenchmarkRetryReminder(retryReason);
613
+ const priorReasoningBlock = buildPriorReasoningContext(attempt.reasoningContent);
614
+ // Force a tool call on the retry AND cap reasoning. The first attempt
615
+ // already failed to act; `tool_choice: required` commits the model to
616
+ // emit a tool call, and the reasoning cap starves the dynamic-thinking
617
+ // path that produced the original collapse (Gemini 2.5 Pro routinely
618
+ // burns 10K+ reasoning tokens before returning nothing). The cap is
619
+ // above what a typical planning step needs (~1-2K tokens) but well
620
+ // below the observed collapse zone. Opt-out via env var.
621
+ const RETRY_REASONING_MAX_TOKENS = 2000;
622
+ const forceTools = isRetryForceToolsEnabled(process.env);
623
+ const retryConfig = forceTools
624
+ ? {
625
+ ...config,
626
+ toolChoice: "required",
627
+ reasoningMaxTokens: RETRY_REASONING_MAX_TOKENS,
628
+ }
629
+ : config;
630
+ retryRecord = forceTools
631
+ ? {
632
+ reason: retryReason,
633
+ toolChoice: "required",
634
+ reasoningMaxTokens: RETRY_REASONING_MAX_TOKENS,
635
+ }
636
+ : { reason: retryReason };
637
+ attempt = await runBenchmarkAttempt(`${prompt}\n\n${reminder}${priorReasoningBlock}`, {
638
+ config: retryConfig,
383
639
  modelClient,
384
640
  toolRegistry,
385
641
  verbose: args.verbose,
@@ -388,11 +644,11 @@ export async function runBenchmarkCommand(argv) {
388
644
  }
389
645
  const durationMs = performance.now() - started;
390
646
  const completedAt = new Date().toISOString();
391
- const changes = await collectWorkspaceChanges(config.workspaceRoot);
647
+ const changes = await collectWorkspaceChanges(config.workspaceRoot, baselineRef);
392
648
  let diffOutPath;
393
649
  if (args.diffOut) {
394
650
  diffOutPath = path.resolve(cwd, args.diffOut);
395
- await writeWorkspaceDiff(config.workspaceRoot, diffOutPath);
651
+ await writeWorkspaceDiff(config.workspaceRoot, diffOutPath, baselineRef);
396
652
  }
397
653
  const toolCalls = buildBenchmarkToolTrace(attempt.messages);
398
654
  const result = {
@@ -410,6 +666,7 @@ export async function runBenchmarkCommand(argv) {
410
666
  ...(diffOutPath ? { diffOut: diffOutPath } : {}),
411
667
  toolCalls,
412
668
  toolUsage: summarizeBenchmarkToolUsage(toolCalls, attempt.text),
669
+ retry: retryRecord,
413
670
  };
414
671
  const payload = JSON.stringify(result, null, 2);
415
672
  if (args.outFile) {
@@ -420,6 +677,22 @@ export async function runBenchmarkCommand(argv) {
420
677
  else {
421
678
  console.log(payload);
422
679
  }
680
+ if (args.contextBenchTrajectory) {
681
+ const trajectoryPath = path.resolve(cwd, args.contextBenchTrajectory);
682
+ const patch = (await getWorkspaceDiff(config.workspaceRoot, baselineRef)) ?? "";
683
+ const trajectory = buildContextBenchTrajectory({
684
+ systemPrompt: BENCHMARK_SYSTEM_PROMPT_SUFFIX,
685
+ userPrompt: prompt,
686
+ toolCalls,
687
+ finalAssistantText: attempt.text,
688
+ workspaceRoot: config.workspaceRoot,
689
+ patch,
690
+ ...(projectIndex !== undefined ? { projectIndex } : {}),
691
+ ...(args.contextBenchImage ? { image: args.contextBenchImage } : {}),
692
+ });
693
+ await mkdir(path.dirname(trajectoryPath), { recursive: true });
694
+ await writeFile(trajectoryPath, JSON.stringify(trajectory, null, 2) + "\n", "utf8");
695
+ }
423
696
  }
424
697
  finally {
425
698
  if (previousAnthropicApiKey === undefined) {