@sean.holung/minicode 0.4.1 → 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27) hide show
  1. package/dist/src/agent/config.js +11 -1
  2. package/dist/src/benchmark/config.js +15 -1
  3. package/dist/src/benchmark/index.js +1 -1
  4. package/dist/src/benchmark/workspace-changes.js +93 -12
  5. package/dist/src/cli/benchmark-run.js +285 -12
  6. package/dist/src/cli/contextbench-trajectory.js +258 -0
  7. package/dist/tests/agent.test.js +17 -21
  8. package/dist/tests/benchmark-run.test.js +223 -2
  9. package/dist/tests/contextbench-trajectory.test.js +228 -0
  10. package/dist/tests/reasoning-effort.test.js +83 -0
  11. package/dist/tests/workspace-changes.test.js +50 -1
  12. package/node_modules/@sean.holung/minicode-sdk/dist/src/agent/agent.d.ts +13 -1
  13. package/node_modules/@sean.holung/minicode-sdk/dist/src/agent/agent.d.ts.map +1 -1
  14. package/node_modules/@sean.holung/minicode-sdk/dist/src/agent/agent.js +133 -24
  15. package/node_modules/@sean.holung/minicode-sdk/dist/src/agent/agent.js.map +1 -1
  16. package/node_modules/@sean.holung/minicode-sdk/dist/src/agent/types.d.ts +62 -0
  17. package/node_modules/@sean.holung/minicode-sdk/dist/src/agent/types.d.ts.map +1 -1
  18. package/node_modules/@sean.holung/minicode-sdk/dist/src/agent/types.js.map +1 -1
  19. package/node_modules/@sean.holung/minicode-sdk/dist/src/model/client.d.ts +5 -1
  20. package/node_modules/@sean.holung/minicode-sdk/dist/src/model/client.d.ts.map +1 -1
  21. package/node_modules/@sean.holung/minicode-sdk/dist/src/model/client.js +86 -7
  22. package/node_modules/@sean.holung/minicode-sdk/dist/src/model/client.js.map +1 -1
  23. package/node_modules/@sean.holung/minicode-sdk/dist/src/prompt/system-prompt.d.ts.map +1 -1
  24. package/node_modules/@sean.holung/minicode-sdk/dist/src/prompt/system-prompt.js +1 -1
  25. package/node_modules/@sean.holung/minicode-sdk/dist/src/prompt/system-prompt.js.map +1 -1
  26. package/node_modules/minicode-plugin-python/dist/tsconfig.tsbuildinfo +1 -1
  27. package/package.json +1 -1
@@ -0,0 +1,258 @@
1
/**
 * Convert a benchmark run into a MiniSWE-Agent compatible `.traj.json` trajectory
 * that ContextBench's existing extractor (`contextbench/agents/minisweagent/extract.py`)
 * can parse via the preferred `<explore_context>` / `<PATCH_CONTEXT>` path.
 *
 * All tool calls that share a `step` are rendered into one assistant message
 * whose body is a single `<explore_context>` block listing the files and line
 * ranges looked at on that step; steps that yield no spans produce no message.
 * The final assistant message carries a `<PATCH_CONTEXT>` block computed from
 * the unified diff (an empty block when the patch has no hunks), preceded by
 * the trimmed final assistant text when there is any.
 *
 * @param {object} options
 * @param {string} options.systemPrompt - Content of the leading system message.
 * @param {string} options.userPrompt - Content of the user message.
 * @param {Array<object>} [options.toolCalls] - Recorded tool calls; each is
 *   read for `step`, `name`, `input`, `result` and `skipped`. Treated as empty
 *   when absent.
 * @param {object} [options.projectIndex] - Symbol index used to resolve
 *   symbol-based tool calls to file spans.
 * @param {string} [options.patch] - Unified diff; also emitted verbatim as
 *   `info.submission`.
 * @param {string} [options.finalAssistantText] - Closing assistant text.
 *   Treated as empty when absent.
 * @param {string} [options.image] - When truthy, recorded under
 *   `info.config.environment.image`.
 * @returns {{messages: Array<{role: string, content: string}>, info: object}}
 */
export function buildContextBenchTrajectory(options) {
    const messages = [
        { role: "system", content: options.systemPrompt },
        { role: "user", content: options.userPrompt },
    ];
    // Group tool calls by step so that batched calls within one assistant turn
    // produce a single assistant message, mirroring a real agent transcript.
    const stepGroups = new Map();
    for (const call of options.toolCalls ?? []) {
        const group = stepGroups.get(call.step);
        if (group)
            group.push(call);
        else
            stepGroups.set(call.step, [call]);
    }
    const sortedSteps = [...stepGroups.keys()].sort((a, b) => a - b);
    for (const step of sortedSteps) {
        const spans = collectSpansForCalls(stepGroups.get(step), options.projectIndex);
        if (spans.length === 0)
            continue;
        messages.push({
            role: "assistant",
            content: `<explore_context>\n${formatSpans(spans)}\n</explore_context>`,
        });
    }
    const patchSpans = parsePatchSpans(options.patch);
    const patchBlock = patchSpans.length > 0
        ? `<PATCH_CONTEXT>\n${formatSpans(patchSpans)}\n</PATCH_CONTEXT>`
        : "<PATCH_CONTEXT>\n</PATCH_CONTEXT>";
    // A run may end without any closing text; treat a missing value like an
    // empty one instead of throwing on `.trim()`.
    const finalText = (options.finalAssistantText ?? "").trim();
    messages.push({
        role: "assistant",
        content: finalText.length > 0 ? `${finalText}\n\n${patchBlock}` : patchBlock,
    });
    return {
        messages,
        info: {
            submission: options.patch,
            ...(options.image
                ? { config: { environment: { image: options.image } } }
                : {}),
        },
    };
}
57
/**
 * Render spans as alternating `File: …` / `Lines: start-end` lines.
 *
 * Spans are ordered by file name, then start line, then end line. The input
 * array is left untouched; sorting happens on a copy.
 *
 * @param {Array<{file: string, startLine: number, endLine: number}>} spans
 * @returns {string} Newline-joined text, or "" for an empty list.
 */
function formatSpans(spans) {
    const compareSpans = (left, right) => {
        const byFile = left.file.localeCompare(right.file);
        if (byFile)
            return byFile;
        const byStart = left.startLine - right.startLine;
        if (byStart)
            return byStart;
        return left.endLine - right.endLine;
    };
    const outputLines = [];
    for (const span of spans.slice().sort(compareSpans)) {
        outputLines.push(`File: ${span.file}`, `Lines: ${span.startLine}-${span.endLine}`);
    }
    return outputLines.join("\n");
}
66
/**
 * Gather the spans touched by a group of tool calls, dropping exact
 * duplicates (same file, start line and end line) while keeping the order in
 * which each distinct span first appeared. Calls flagged `skipped` are ignored.
 *
 * @param {Array<object>} calls - Tool calls belonging to one step.
 * @param {object} [index] - Symbol index forwarded to `spansForCall`.
 * @returns {Array<{file: string, startLine: number, endLine: number}>}
 */
function collectSpansForCalls(calls, index) {
    // Map preserves insertion order, so the first occurrence of a key wins and
    // the output order matches first-seen order.
    const firstSeen = new Map();
    for (const call of calls) {
        if (call.skipped)
            continue;
        for (const span of spansForCall(call, index)) {
            const key = `${span.file}:${span.startLine}-${span.endLine}`;
            if (!firstSeen.has(key))
                firstSeen.set(key, span);
        }
    }
    return [...firstSeen.values()];
}
82
/**
 * Dispatch a tool call to the span extractor that matches its tool name.
 * Tools that are not listed here contribute no spans.
 *
 * @param {object} call - Tool call record; `call.name` selects the extractor.
 * @param {object} [index] - Symbol index, used by the symbol-based extractors.
 * @returns {Array<{file: string, startLine: number, endLine: number}>}
 */
function spansForCall(call, index) {
    const toolName = call.name;
    if (toolName === "read_file")
        return spansForReadFile(call);
    if (toolName === "read_symbol")
        return spansForSymbolLookup(call, index);
    if (toolName === "find_references")
        return spansForReferences(call, index);
    if (toolName === "get_dependencies")
        return spansForDependencyCone(call, index);
    if (toolName === "edit_file" || toolName === "write_file")
        return spansForMutation(call);
    return [];
}
99
/**
 * Derive the span covered by a `read_file` call.
 *
 * The start line is the positive `offset` input, or 1 when there is none. The
 * end line prefers the last `<line>|` prefix found in the tool result; when
 * the result carries no such prefix it falls back to `start + limit - 1`.
 * With neither source available no span is emitted rather than inventing one.
 *
 * @param {object} call - Tool call with `input.path`, optional
 *   `input.offset` / `input.limit`, and `result`.
 * @returns {Array<{file: string, startLine: number, endLine: number}>}
 *   Zero or one span.
 */
function spansForReadFile(call) {
    const filePath = stringField(call.input, "path");
    if (!filePath)
        return [];
    const offset = numberField(call.input, "offset");
    const limit = numberField(call.input, "limit");
    const firstLine = offset !== undefined && offset > 0 ? offset : 1;
    const observedLastLine = lastLineNumberInResult(call.result);
    const hasUsableLimit = limit !== undefined && limit > 0;
    if (observedLastLine === undefined && !hasUsableLimit)
        return [];
    const candidateLastLine = observedLastLine !== undefined
        ? observedLastLine
        : firstLine + limit - 1;
    // Never report a range that ends before it starts.
    const lastLine = Math.max(candidateLastLine, firstLine);
    return [{ file: filePath, startLine: firstLine, endLine: lastLine }];
}
126
/**
 * Resolve a `read_symbol` call to the file spans of the matching symbols.
 *
 * Some names resolve to several candidates (e.g. a method `foo` on multiple
 * classes), so every match from `index.getSymbolMatches` yields a span. When
 * that method is missing or returns nothing, the single result of
 * `index.getSymbol` is used instead.
 *
 * @param {object} call - Tool call with `input.name`.
 * @param {object} [index] - Symbol index; without it no spans are produced.
 * @returns {Array<{file: string, startLine: number, endLine: number}>}
 */
function spansForSymbolLookup(call, index) {
    if (!index)
        return [];
    const name = stringField(call.input, "name");
    if (!name)
        return [];
    let candidates = index.getSymbolMatches?.(name) ?? [];
    if (candidates.length === 0) {
        // Look the symbol up once and reuse the result rather than querying
        // the index a second time to build the fallback list.
        const single = index.getSymbol?.(name);
        candidates = single ? [single] : [];
    }
    return candidates.map(indexedSymbolToFileSpan).filter(isDefined);
}
139
/**
 * Resolve a `find_references` call to the spans of the symbols that reference
 * the target: every dependency edge whose `to` equals the target's
 * `qualifiedName` contributes the span of the symbol named by its `from`.
 * Edges whose source symbol cannot be resolved are skipped.
 *
 * @param {object} call - Tool call with `input.name`.
 * @param {object} [index] - Symbol index exposing `dependencyEdges` and
 *   `getSymbol`; without it no spans are produced.
 * @returns {Array<{file: string, startLine: number, endLine: number}>}
 */
function spansForReferences(call, index) {
    if (!index)
        return [];
    const name = stringField(call.input, "name");
    if (!name)
        return [];
    const edges = index.dependencyEdges ?? [];
    const target = index.getSymbol?.(name);
    if (!target)
        return [];
    return edges
        .filter((edge) => edge.to === target.qualifiedName)
        .flatMap((edge) => {
            const referrer = index.getSymbol?.(edge.from);
            const span = referrer ? indexedSymbolToFileSpan(referrer) : undefined;
            return span ? [span] : [];
        });
}
159
/**
 * Resolve a `get_dependencies` call to the spans of every symbol returned by
 * `index.getDependencyCone`. The symbol name is read from `input.name`, or
 * `input.symbol` when `name` is absent; the depth defaults to 2.
 *
 * @param {object} call - Tool call with `input.name` / `input.symbol` and
 *   optional `input.depth`.
 * @param {object} [index] - Symbol index; must expose `getDependencyCone`.
 * @returns {Array<{file: string, startLine: number, endLine: number}>}
 */
function spansForDependencyCone(call, index) {
    if (!index?.getDependencyCone)
        return [];
    const name = stringField(call.input, "name") ?? stringField(call.input, "symbol");
    if (!name)
        return [];
    const requestedDepth = numberField(call.input, "depth");
    const depth = requestedDepth === undefined ? 2 : requestedDepth;
    const coneSymbols = index.getDependencyCone(name, depth);
    const coneSpans = coneSymbols.map((symbol) => indexedSymbolToFileSpan(symbol));
    return coneSpans.filter((span) => isDefined(span));
}
169
/**
 * Derive a span for an `edit_file` / `write_file` call.
 *
 * Mutations don't broaden the *exploration* set on their own — the final
 * PATCH_CONTEXT covers what was changed. A span for the touched location is
 * still emitted so edits show up in the explored set too. Exact line ranges
 * are not known here without re-reading the file, so this emits a single-line
 * span at the first positive explicit `offset` or `line` input, and nothing
 * when neither is present.
 *
 * @param {object} call - Tool call with `input.path` and optionally
 *   `input.offset` or `input.line`.
 * @returns {Array<{file: string, startLine: number, endLine: number}>}
 *   Zero or one span.
 */
function spansForMutation(call) {
    const filePath = stringField(call.input, "path");
    if (!filePath)
        return [];
    // `offset` keeps priority; `line` is the documented alternative spelling.
    for (const field of ["offset", "line"]) {
        const lineNumber = numberField(call.input, field);
        if (lineNumber !== undefined && lineNumber > 0) {
            return [{ file: filePath, startLine: lineNumber, endLine: lineNumber }];
        }
    }
    return [];
}
185
/**
 * Convert an indexed symbol into a `{ file, startLine, endLine }` span.
 *
 * Returns `undefined` when the symbol is missing or lacks a usable location
 * (non-empty `filePath`, finite `startLine` and `endLine`). Callers already
 * filter out undefined results, so an unusable symbol is dropped instead of
 * being rendered as `File: undefined` / `Lines: undefined-undefined`.
 *
 * @param {{filePath: string, startLine: number, endLine: number}} symbol
 * @returns {{file: string, startLine: number, endLine: number} | undefined}
 */
function indexedSymbolToFileSpan(symbol) {
    if (symbol === undefined || symbol === null)
        return undefined;
    const { filePath, startLine, endLine } = symbol;
    if (typeof filePath !== "string" || filePath.length === 0)
        return undefined;
    if (!Number.isFinite(startLine) || !Number.isFinite(endLine))
        return undefined;
    return { file: filePath, startLine, endLine };
}
192
/**
 * Filter predicate: true for every value except `undefined` and `null`.
 *
 * @param {*} value
 * @returns {boolean}
 */
function isDefined(value) {
    return !(value === undefined || value === null);
}
195
/**
 * Read a non-empty string field from a tool-call input object.
 *
 * @param {object | null | undefined} input - Tool-call input; may be absent.
 * @param {string} name - Field to read.
 * @returns {string | undefined} The value when it is a non-empty string,
 *   otherwise `undefined` (including when `input` itself is missing).
 */
function stringField(input, name) {
    // Optional chaining keeps a call recorded without an input object from
    // aborting the whole conversion with a TypeError.
    const value = input?.[name];
    return typeof value === "string" && value.length > 0 ? value : undefined;
}
199
/**
 * Read a finite numeric field from a tool-call input object.
 *
 * Accepts real numbers and numeric strings. Blank or whitespace-only strings
 * are treated as absent: `Number("")` is 0, which would otherwise be reported
 * as a genuine value and defeat callers' defaults.
 *
 * @param {object | null | undefined} input - Tool-call input; may be absent.
 * @param {string} name - Field to read.
 * @returns {number | undefined} The finite number, or `undefined` when the
 *   field is missing, blank, non-numeric, NaN or infinite.
 */
function numberField(input, name) {
    const value = input?.[name];
    if (typeof value === "number")
        return Number.isFinite(value) ? value : undefined;
    if (typeof value !== "string" || value.trim().length === 0)
        return undefined;
    const parsed = Number(value);
    return Number.isFinite(parsed) ? parsed : undefined;
}
210
/**
 * Extract the line number that prefixes the last non-blank line of a tool
 * result formatted as `<line>|<content>` per line.
 *
 * @param {*} result - Tool result; anything other than a string yields
 *   `undefined` instead of throwing.
 * @returns {number | undefined} The parsed line number, or `undefined` when
 *   the result is empty or its last line has no `<digits>|` prefix.
 */
function lastLineNumberInResult(result) {
    if (typeof result !== "string")
        return undefined;
    const trimmed = result.trimEnd();
    if (trimmed.length === 0)
        return undefined;
    // lastIndexOf returns -1 when there is no newline, so slice(0) keeps the
    // whole string as the last line.
    const lastLine = trimmed.slice(trimmed.lastIndexOf("\n") + 1);
    const match = /^\s*(\d+)\|/.exec(lastLine);
    if (!match)
        return undefined;
    const parsed = Number(match[1]);
    return Number.isFinite(parsed) ? parsed : undefined;
}
224
// Unified-diff hunk header: `@@ -oldStart[,oldCount] +newStart[,newCount] @@`.
// Captures: 1 = old count, 2 = new start, 3 = new count (counts default to 1).
const HUNK_HEADER_PATTERN = /^@@ -\d+(?:,(\d+))? \+(\d+)(?:,(\d+))? @@/;
/**
 * Parse a unified diff into a list of (file, new-file-line-range) spans.
 * Uses the NEW file side (`+` ranges) since that's where the agent's edits
 * landed.
 *
 * Hunk bodies are consumed according to the line counts in the hunk header,
 * so an added line whose text happens to begin with `++ ` (rendered as
 * `+++ …`) is not mistaken for a file header. Hunks of deleted files
 * (`+++ /dev/null`) are skipped. A hunk with a new-side count of 0 is reported
 * as a single anchor line, clamped to line 1 so a `+0,0` header never yields
 * the invalid line 0.
 *
 * @param {string} patch - Unified diff text; empty or non-string input yields [].
 * @returns {Array<{file: string, startLine: number, endLine: number}>}
 *   One span per hunk, in patch order.
 */
export function parsePatchSpans(patch) {
    if (typeof patch !== "string" || patch.trim().length === 0)
        return [];
    const spans = [];
    let currentFile = "";
    // Lines still expected in the current hunk body, per side.
    let oldLinesLeft = 0;
    let newLinesLeft = 0;
    for (const rawLine of patch.split(/\r?\n/)) {
        if (oldLinesLeft > 0 || newLinesLeft > 0) {
            const marker = rawLine.charAt(0);
            if (marker === "+") {
                newLinesLeft -= 1;
                continue;
            }
            if (marker === "-") {
                oldLinesLeft -= 1;
                continue;
            }
            // Some tools strip the single space from blank context lines.
            if (marker === " " || marker === "") {
                oldLinesLeft -= 1;
                newLinesLeft -= 1;
                continue;
            }
            // "\ No newline at end of file" belongs to neither side.
            if (marker === "\\")
                continue;
            // Anything else means the hunk ended early; resynchronise and
            // treat this line as a header candidate.
            oldLinesLeft = 0;
            newLinesLeft = 0;
        }
        if (rawLine.startsWith("+++ ")) {
            currentFile = stripDiffPathPrefix(rawLine.slice(4));
            continue;
        }
        const hunk = HUNK_HEADER_PATTERN.exec(rawLine);
        if (!hunk)
            continue;
        oldLinesLeft = hunk[1] !== undefined ? Number(hunk[1]) : 1;
        const count = hunk[3] !== undefined ? Number(hunk[3]) : 1;
        newLinesLeft = count;
        if (!currentFile)
            continue;
        const start = Math.max(1, Number(hunk[2]));
        const end = count === 0 ? start : start + count - 1;
        spans.push({ file: currentFile, startLine: start, endLine: end });
    }
    return spans;
}
/**
 * Normalise the path of a `+++` / `---` diff header.
 *
 * Drops anything after the first tab (`diff -u` appends a timestamp there, and
 * git appends a bare tab after paths containing spaces), trims whitespace, and
 * removes a leading `a/` or `b/` prefix.
 *
 * @param {string} target - Header text following the `+++ ` marker.
 * @returns {string} The file path, or "" for `/dev/null`.
 */
function stripDiffPathPrefix(target) {
    const path = target.split("\t", 1)[0].trim();
    if (path === "/dev/null")
        return "";
    if (path.startsWith("b/") || path.startsWith("a/"))
        return path.slice(2);
    return path;
}
@@ -194,29 +194,25 @@ test("agent does not treat repeated validation commands after edits as a loop",
194
194
  assertToolCallTranscriptIsComplete(agent.getSession().getMessages());
195
195
  });
196
196
  test("agent still stops on repeated identical mutations", async () => {
197
- const responses = [
198
- {
199
- text: "first edit",
200
- toolCalls: [{ id: "edit-1", name: "edit_file", input: { path: "app.ts", content: "same" } }],
201
- stopReason: "tool_use",
202
- usage: { inputTokens: 1, outputTokens: 1 },
203
- },
204
- {
205
- text: "second edit",
206
- toolCalls: [{ id: "edit-2", name: "edit_file", input: { path: "app.ts", content: "same" } }],
207
- stopReason: "tool_use",
208
- usage: { inputTokens: 1, outputTokens: 1 },
209
- },
210
- {
211
- text: "third edit",
212
- toolCalls: [{ id: "edit-3", name: "edit_file", input: { path: "app.ts", content: "same" } }],
213
- stopReason: "tool_use",
214
- usage: { inputTokens: 1, outputTokens: 1 },
215
- },
216
- ];
197
+ // A model that keeps emitting the same edit_file forever should be
198
+ // hard-stopped after the soft guard has fired 3 times in the turn.
199
+ class RepeatingEditClient {
200
+ idx = 0;
201
+ async chat() {
202
+ this.idx += 1;
203
+ return {
204
+ text: "edit",
205
+ toolCalls: [
206
+ { id: `edit-${this.idx}`, name: "edit_file", input: { path: "app.ts", content: "same" } },
207
+ ],
208
+ stopReason: "tool_use",
209
+ usage: { inputTokens: 1, outputTokens: 1 },
210
+ };
211
+ }
212
+ }
217
213
  const agent = new CodingAgent({
218
214
  config: createTestAgentConfig("/tmp"),
219
- modelClient: new SequenceModelClient(responses),
215
+ modelClient: new RepeatingEditClient(),
220
216
  toolRegistry: new ToolRegistry([createEditTool()]),
221
217
  });
222
218
  const { text } = await agent.runTurn("Edit repeatedly");
@@ -1,12 +1,24 @@
1
1
  import assert from "node:assert/strict";
2
2
  import { test } from "node:test";
3
- import { buildBenchmarkToolTrace, getBenchmarkSystemPromptSuffix, isBenchmarkApprovalSeekingResponse, parseBenchmarkRunArgs, summarizeBenchmarkToolUsage, } from "../src/cli/benchmark-run.js";
3
+ import { buildBenchmarkToolTrace, buildPriorReasoningContext, countMutationsInMessages, getBenchmarkRetryReason, getBenchmarkRetryReminder, getBenchmarkSystemPromptSuffix, isBenchmarkApprovalSeekingResponse, isRetryForceToolsEnabled, looksLikeShellFileMutation, parseBenchmarkRunArgs, summarizeBenchmarkToolUsage, } from "../src/cli/benchmark-run.js";
4
4
  test("benchmark system prompt suffix clearly disables approval-seeking behavior", () => {
5
5
  const suffix = getBenchmarkSystemPromptSuffix();
6
- assert.match(suffix, /non-interactive benchmark harness/i);
6
+ assert.match(suffix, /non-interactive harness/i);
7
7
  assert.match(suffix, /already approved/i);
8
8
  assert.match(suffix, /do not ask for confirmation/i);
9
9
  });
10
+ test("benchmark system prompt suffix overrides iteration discipline for long-form tasks", () => {
11
+ // Added after observing on CCBench that the base [Iteration Discipline]
12
+ // "3-5 calls then commit" guidance was driving premature completion on
13
+ // benchmark tasks that genuinely require 30+ iterate-test-fix cycles.
14
+ // Both gemini-3-flash and haiku-4.5 declared "I have implemented" before
15
+ // verifying their changes against the canonical test suite.
16
+ const suffix = getBenchmarkSystemPromptSuffix();
17
+ assert.match(suffix, /persistent iteration|30\+ tool-call/i, "should set the expectation that persistent iteration is normal");
18
+ assert.match(suffix, /canonical test runner/i, "should direct the model to the canonical test runner, not ad-hoc tests");
19
+ assert.match(suffix, /full existing test suite/i, "should require running the full suite to catch regressions");
20
+ assert.match(suffix, /explicit green signal/i, "should require an observed pass signal, not self-assessed completion");
21
+ });
10
22
  test("benchmark system prompt suffix omits runtime budget knobs", () => {
11
23
  const suffix = getBenchmarkSystemPromptSuffix();
12
24
  assert.doesNotMatch(suffix, /maxSteps/i);
@@ -23,6 +35,161 @@ test("normal benchmark summaries are not treated as approval-seeking", () => {
23
35
  assert.equal(isBenchmarkApprovalSeekingResponse("Updated src/app.ts, ran npm test once, and all tests passed."), false);
24
36
  assert.equal(isBenchmarkApprovalSeekingResponse("The task is blocked because the repository does not contain the referenced file."), false);
25
37
  });
38
+ test("getBenchmarkRetryReason flags zero-tool-call attempts as no_action", () => {
39
+ // Pure-reasoning failure (Gemini 2.5 Pro's "thought a lot, emitted
40
+ // nothing" mode). Despite empty text, the attempt is still a definite
41
+ // failure in benchmark mode since the task needs code changes.
42
+ assert.equal(getBenchmarkRetryReason({ text: "", toolCallCount: 0, mutationCount: 0 }), "no_action");
43
+ // Hallucinated-completion failure: the model narrates work without
44
+ // making any tool calls. Caught the same way — zero tool calls is
45
+ // the load-bearing signal, not the text.
46
+ assert.equal(getBenchmarkRetryReason({
47
+ text: "I've added the new transformation to astropy/coordinates/itrs.py and registered it with the frame_transform_graph.",
48
+ toolCallCount: 0,
49
+ mutationCount: 0,
50
+ }), "no_action");
51
+ // Future-tense planning without action — also covered.
52
+ assert.equal(getBenchmarkRetryReason({
53
+ text: "I will add the helper function to utils.py and update the imports.",
54
+ toolCallCount: 0,
55
+ mutationCount: 0,
56
+ }), "no_action");
57
+ });
58
+ test("getBenchmarkRetryReason flags approval-seeking when tool calls exist", () => {
59
+ assert.equal(getBenchmarkRetryReason({
60
+ text: "I found the changes needed. Please confirm and I'll apply them.",
61
+ toolCallCount: 5,
62
+ mutationCount: 0,
63
+ }), "approval_seeking");
64
+ });
65
+ test("getBenchmarkRetryReason flags plan-only attempts as no_mutation", () => {
66
+ // Observed on Gemini 2.5 Pro 71f348da: 3 read-only tool calls
67
+ // (search_code_map + 2 read_symbol) followed by a future-tense plan
68
+ // ("Here's how I'll fix it: 1. Read X. 2. Modify Y. 3. I'll replace Z.")
69
+ // and no edit. Approval-seeking detector doesn't fire (no "please confirm")
70
+ // and no_action doesn't fire (toolCallCount > 0) — needs a third signal.
71
+ assert.equal(getBenchmarkRetryReason({
72
+ text: "Here's how I'll fix it: 1. Read sliced_wcs.py. 2. Modify world_to_pixel_values. 3. I'll replace the 1. fallback.",
73
+ toolCallCount: 3,
74
+ mutationCount: 0,
75
+ }), "no_mutation");
76
+ });
77
+ test("getBenchmarkRetryReason returns null when mutations occurred", () => {
78
+ assert.equal(getBenchmarkRetryReason({
79
+ text: "Updated src/app.ts, ran npm test, all tests passed.",
80
+ toolCallCount: 12,
81
+ mutationCount: 2,
82
+ }), null);
83
+ });
84
+ test("getBenchmarkRetryReason prioritizes no_action over no_mutation", () => {
85
+ // Defensive: a zero-tool-call attempt also has zero mutations, but the
86
+ // reminder for no_action is more specific. Make sure that path wins.
87
+ assert.equal(getBenchmarkRetryReason({
88
+ text: "I will edit utils.py.",
89
+ toolCallCount: 0,
90
+ mutationCount: 0,
91
+ }), "no_action");
92
+ });
93
+ test("getBenchmarkRetryReminder returns distinct reminders for each reason", () => {
94
+ const approval = getBenchmarkRetryReminder("approval_seeking");
95
+ const noAction = getBenchmarkRetryReminder("no_action");
96
+ const noMutation = getBenchmarkRetryReminder("no_mutation");
97
+ // Approval reminder leans on "already approved" — the model was acting
98
+ // but asked for permission.
99
+ assert.match(approval, /already approved/i);
100
+ assert.doesNotMatch(approval, /zero tool calls/i);
101
+ // No-action reminder names the failure mode explicitly so the model
102
+ // understands what changed.
103
+ assert.match(noAction, /zero tool calls/i);
104
+ assert.match(noAction, /edit_file/);
105
+ // Calls out both the past-tense and future-tense narration traps that
106
+ // were observed in the Gemini 2.5 Pro empty-trajectory investigation.
107
+ assert.match(noAction, /past-tense/i);
108
+ assert.match(noAction, /future-tense/i);
109
+ // No-mutation reminder is distinct — the model DID call tools, just
110
+ // never edited anything. It should mention reading-without-editing,
111
+ // and acknowledge shell-based edits as legitimate (since some models
112
+ // prefer `cat > file` over edit_file).
113
+ assert.match(noMutation, /read files but never edited/i);
114
+ assert.match(noMutation, /cat > path|sed -i/);
115
+ assert.notEqual(noMutation, noAction);
116
+ assert.notEqual(noMutation, approval);
117
+ });
118
+ test("looksLikeShellFileMutation detects common file-writing shells", () => {
119
+ // Heredoc into file — the gemini-3-pro 71f348da pattern.
120
+ assert.equal(looksLikeShellFileMutation("cat > path/to/file.py <<'EOF'\nbody\nEOF"), true);
121
+ // Append redirect with heredoc.
122
+ assert.equal(looksLikeShellFileMutation("cat << 'EOF' >> tests/foo.py\nbody\nEOF"), true);
123
+ // In-place sed.
124
+ assert.equal(looksLikeShellFileMutation("sed -i 's/old/new/g' foo.py"), true);
125
+ // tee.
126
+ assert.equal(looksLikeShellFileMutation("echo x | tee path/to/file"), true);
127
+ // Python open().write().
128
+ assert.equal(looksLikeShellFileMutation('python -c "open(\'foo.py\', \'w\').write(\'body\')"'), true);
129
+ });
130
+ test("looksLikeShellFileMutation rejects read-only and benign redirects", () => {
131
+ // Pure read.
132
+ assert.equal(looksLikeShellFileMutation("cat path/to/file.py"), false);
133
+ // Pipe + cat with output to /dev/null.
134
+ assert.equal(looksLikeShellFileMutation("python script.py > /dev/null 2>&1"), false);
135
+ // File descriptor redirect (no file write).
136
+ assert.equal(looksLikeShellFileMutation("python script.py 2>&1"), false);
137
+ // pytest invocation — no redirect, no in-place edit.
138
+ assert.equal(looksLikeShellFileMutation("python -m pytest tests/foo.py"), false);
139
+ // git commands operate on the index, not arbitrary file writes — we
140
+ // don't count them as code mutations.
141
+ assert.equal(looksLikeShellFileMutation("git checkout tests/foo.py"), false);
142
+ assert.equal(looksLikeShellFileMutation("git add ."), false);
143
+ });
144
+ test("buildPriorReasoningContext returns empty when no reasoning was captured", () => {
145
+ assert.equal(buildPriorReasoningContext(undefined), "");
146
+ assert.equal(buildPriorReasoningContext(""), "");
147
+ assert.equal(buildPriorReasoningContext(" \n\n "), "");
148
+ });
149
+ test("buildPriorReasoningContext wraps the prior reasoning with framing", () => {
150
+ // The wrapper tells the model this is its own prior thinking, and
151
+ // nudges it to act on the reasoning rather than re-deliberate.
152
+ const block = buildPriorReasoningContext("The bug is the hardcoded 1.0 fallback in sliced_wcs.py line 254.");
153
+ assert.match(block, /your previous attempt/i);
154
+ assert.match(block, /<<<PRIOR_REASONING>>>/);
155
+ assert.match(block, /<<<END_PRIOR_REASONING>>>/);
156
+ assert.match(block, /hardcoded 1\.0 fallback/);
157
+ assert.match(block, /apply that reasoning/i);
158
+ });
159
+ test("buildPriorReasoningContext truncates very long reasoning", () => {
160
+ // 12k reasoning tokens from a pure-thinking collapse would explode
161
+ // the next attempt's input cost. Cap at 2000 chars so the retry stays
162
+ // affordable while still preserving the high-level plan.
163
+ const long = "x".repeat(5000);
164
+ const block = buildPriorReasoningContext(long);
165
+ assert.match(block, /more chars of reasoning truncated/);
166
+ // The wrapped block should NOT contain the full 5000 chars worth of x's.
167
+ const xCount = (block.match(/x/g) ?? []).length;
168
+ assert.ok(xCount < 5000, `expected truncation, got ${xCount} x chars`);
169
+ assert.ok(xCount >= 2000, `expected at least 2000 x chars preserved, got ${xCount}`);
170
+ });
171
+ test("countMutationsInMessages counts structured + shell mutations together", () => {
172
+ const messages = [
173
+ { role: "user", content: "task" },
174
+ {
175
+ role: "assistant",
176
+ content: "",
177
+ toolCalls: [
178
+ { id: "1", name: "read_file", input: { path: "a.py" } },
179
+ { id: "2", name: "edit_file", input: { path: "a.py", old_string: "x", new_string: "y" } },
180
+ {
181
+ id: "3",
182
+ name: "run_command",
183
+ input: { command: "cat > b.py <<'EOF'\nprint('hi')\nEOF" },
184
+ },
185
+ { id: "4", name: "run_command", input: { command: "python -m pytest" } },
186
+ ],
187
+ },
188
+ { role: "assistant", content: "done" },
189
+ ];
190
+ // edit_file + the heredoc command count; read_file and pytest don't.
191
+ assert.equal(countMutationsInMessages(messages), 2);
192
+ });
26
193
  test("parseBenchmarkRunArgs preserves prompt text and benchmark flags", () => {
27
194
  const args = parseBenchmarkRunArgs([
28
195
  "--verbose",
@@ -141,9 +308,42 @@ test("benchmark tool usage summary separates structured tools from file reads",
141
308
  assert.equal(summary.specializedByName.get_dependencies, 1);
142
309
  assert.equal(summary.fileReadTotal, 1);
143
310
  assert.equal(summary.mutationTotal, 1);
311
+ assert.equal(summary.shellMutationTotal, 0);
144
312
  assert.equal(summary.commandTotal, 1);
145
313
  assert.deepEqual(summary.repeatedToolCalls, []);
146
314
  });
315
+ test("benchmark tool usage summary counts shell-based file edits as shellMutation", () => {
316
+ // Gemini-3-Pro pattern: `cat > FILE <<EOF` heredoc instead of edit_file.
317
+ // mutationTotal stays at the structured-tool count; shellMutationTotal
318
+ // exposes the shell-based edits without inflating mutationTotal.
319
+ const summary = summarizeBenchmarkToolUsage([
320
+ {
321
+ step: 1,
322
+ name: "run_command",
323
+ input: { command: "cat > foo.py <<'EOF'\nprint('hi')\nEOF" },
324
+ result: "ok",
325
+ skipped: false,
326
+ },
327
+ {
328
+ step: 2,
329
+ name: "run_command",
330
+ input: { command: "sed -i 's/old/new/g' bar.py" },
331
+ result: "ok",
332
+ skipped: false,
333
+ },
334
+ {
335
+ step: 3,
336
+ name: "run_command",
337
+ input: { command: "python -m pytest" },
338
+ result: "ok",
339
+ skipped: false,
340
+ },
341
+ ], "Done");
342
+ assert.equal(summary.total, 3);
343
+ assert.equal(summary.commandTotal, 3);
344
+ assert.equal(summary.mutationTotal, 0);
345
+ assert.equal(summary.shellMutationTotal, 2);
346
+ });
147
347
  test("benchmark tool usage summary reports repeated-call stops", () => {
148
348
  const summary = summarizeBenchmarkToolUsage([
149
349
  {
@@ -178,3 +378,24 @@ test("benchmark tool usage summary reports repeated-call stops", () => {
178
378
  },
179
379
  ]);
180
380
  });
381
+ // ---------------------------------------------------------------------------
382
+ // isRetryForceToolsEnabled
383
+ // ---------------------------------------------------------------------------
384
+ test("isRetryForceToolsEnabled defaults to true when env var unset", () => {
385
+ assert.equal(isRetryForceToolsEnabled({}), true);
386
+ });
387
+ test("isRetryForceToolsEnabled defaults to true when env var is empty/whitespace", () => {
388
+ assert.equal(isRetryForceToolsEnabled({ BENCHMARK_RETRY_FORCE_TOOLS: "" }), true);
389
+ assert.equal(isRetryForceToolsEnabled({ BENCHMARK_RETRY_FORCE_TOOLS: " " }), true);
390
+ });
391
+ test("isRetryForceToolsEnabled returns false for explicit disable values", () => {
392
+ for (const value of ["0", "false", "no", "off", "FALSE", "NO", "Off"]) {
393
+ assert.equal(isRetryForceToolsEnabled({ BENCHMARK_RETRY_FORCE_TOOLS: value }), false, `expected false for input ${JSON.stringify(value)}`);
394
+ }
395
+ });
396
+ test("isRetryForceToolsEnabled returns true for affirmative or unrecognized values", () => {
397
+ // Unrecognized values fall back to "on" — keeping the safe default for typos.
398
+ for (const value of ["1", "true", "yes", "on", "TRUE", "anything"]) {
399
+ assert.equal(isRetryForceToolsEnabled({ BENCHMARK_RETRY_FORCE_TOOLS: value }), true, `expected true for input ${JSON.stringify(value)}`);
400
+ }
401
+ });