@sean.holung/minicode 0.4.1 → 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/src/agent/config.js +11 -1
- package/dist/src/benchmark/config.js +15 -1
- package/dist/src/benchmark/index.js +1 -1
- package/dist/src/benchmark/workspace-changes.js +93 -12
- package/dist/src/cli/benchmark-run.js +285 -12
- package/dist/src/cli/contextbench-trajectory.js +258 -0
- package/dist/tests/agent.test.js +17 -21
- package/dist/tests/benchmark-run.test.js +223 -2
- package/dist/tests/contextbench-trajectory.test.js +228 -0
- package/dist/tests/reasoning-effort.test.js +83 -0
- package/dist/tests/workspace-changes.test.js +50 -1
- package/node_modules/@sean.holung/minicode-sdk/dist/src/agent/agent.d.ts +13 -1
- package/node_modules/@sean.holung/minicode-sdk/dist/src/agent/agent.d.ts.map +1 -1
- package/node_modules/@sean.holung/minicode-sdk/dist/src/agent/agent.js +133 -24
- package/node_modules/@sean.holung/minicode-sdk/dist/src/agent/agent.js.map +1 -1
- package/node_modules/@sean.holung/minicode-sdk/dist/src/agent/types.d.ts +62 -0
- package/node_modules/@sean.holung/minicode-sdk/dist/src/agent/types.d.ts.map +1 -1
- package/node_modules/@sean.holung/minicode-sdk/dist/src/agent/types.js.map +1 -1
- package/node_modules/@sean.holung/minicode-sdk/dist/src/model/client.d.ts +5 -1
- package/node_modules/@sean.holung/minicode-sdk/dist/src/model/client.d.ts.map +1 -1
- package/node_modules/@sean.holung/minicode-sdk/dist/src/model/client.js +86 -7
- package/node_modules/@sean.holung/minicode-sdk/dist/src/model/client.js.map +1 -1
- package/node_modules/@sean.holung/minicode-sdk/dist/src/prompt/system-prompt.d.ts.map +1 -1
- package/node_modules/@sean.holung/minicode-sdk/dist/src/prompt/system-prompt.js +1 -1
- package/node_modules/@sean.holung/minicode-sdk/dist/src/prompt/system-prompt.js.map +1 -1
- package/node_modules/minicode-plugin-python/dist/tsconfig.tsbuildinfo +1 -1
- package/package.json +1 -1
package/dist/src/agent/config.js
CHANGED
|
@@ -33,6 +33,7 @@ export function formatConfigForDisplay(config) {
|
|
|
33
33
|
"compactionThreshold: " + (config.compactionThreshold ?? "(disabled)"),
|
|
34
34
|
"compactionModel: " + (config.compactionModel ?? "(disabled — using mechanical compaction)"),
|
|
35
35
|
"reasoningEffort: " + (config.reasoningEffort ?? "(unset — no reasoning parameters sent)"),
|
|
36
|
+
"reasoningMaxTokens: " + (config.reasoningMaxTokens !== undefined ? String(config.reasoningMaxTokens) : "(unset — uncapped)"),
|
|
36
37
|
"enableDynamicPrompt: " + (config.enableDynamicPrompt ?? false),
|
|
37
38
|
];
|
|
38
39
|
return lines.join("\n");
|
|
@@ -255,7 +256,7 @@ export async function loadAgentConfig(cwd = process.cwd(), options = {}) {
|
|
|
255
256
|
modelProvider: parseModelProvider(env.MODEL_PROVIDER ?? "openai-compatible"),
|
|
256
257
|
model: env.MODEL ?? "",
|
|
257
258
|
maxSteps: parseNumber(env.MAX_STEPS, 50),
|
|
258
|
-
maxTokens: parseNumber(env.MAX_TOKENS,
|
|
259
|
+
maxTokens: parseNumber(env.MAX_TOKENS, 16000),
|
|
259
260
|
modelTimeoutSeconds: parseNumber(env.MODEL_TIMEOUT_SECONDS, 60),
|
|
260
261
|
maxContextTokens: parseNumber(env.MAX_CONTEXT_TOKENS, 32_000),
|
|
261
262
|
workspaceRoot,
|
|
@@ -280,5 +281,14 @@ export async function loadAgentConfig(cwd = process.cwd(), options = {}) {
|
|
|
280
281
|
const effort = parseReasoningEffort(env.REASONING_EFFORT);
|
|
281
282
|
return effort ? { reasoningEffort: effort } : {};
|
|
282
283
|
})(),
|
|
284
|
+
...(() => {
|
|
285
|
+
const raw = env.REASONING_MAX_TOKENS;
|
|
286
|
+
if (raw === undefined || raw === "")
|
|
287
|
+
return {};
|
|
288
|
+
const value = Number(raw);
|
|
289
|
+
if (!Number.isFinite(value) || value <= 0)
|
|
290
|
+
return {};
|
|
291
|
+
return { reasoningMaxTokens: Math.floor(value) };
|
|
292
|
+
})(),
|
|
283
293
|
};
|
|
284
294
|
}
|
|
@@ -163,7 +163,7 @@ export async function buildBenchmarkAgentConfig(options = {}) {
|
|
|
163
163
|
modelProvider: provider,
|
|
164
164
|
model,
|
|
165
165
|
maxSteps: parseNumber(resolvedEnv.values.MAX_STEPS ?? fileConfig.maxSteps, 50),
|
|
166
|
-
maxTokens: parseNumber(resolvedEnv.values.MAX_TOKENS ?? fileConfig.maxTokens,
|
|
166
|
+
maxTokens: parseNumber(resolvedEnv.values.MAX_TOKENS ?? fileConfig.maxTokens, 16000),
|
|
167
167
|
modelTimeoutSeconds: parseNumber(resolvedEnv.values.MODEL_TIMEOUT_SECONDS ?? fileConfig.modelTimeoutSeconds, 60),
|
|
168
168
|
maxContextTokens: parseNumber(resolvedEnv.values.MAX_CONTEXT_TOKENS ?? fileConfig.maxContextTokens, 32_000),
|
|
169
169
|
workspaceRoot,
|
|
@@ -192,6 +192,20 @@ export async function buildBenchmarkAgentConfig(options = {}) {
|
|
|
192
192
|
const effort = parseReasoningEffort(resolvedEnv.values.REASONING_EFFORT ?? fileConfig.reasoningEffort);
|
|
193
193
|
return effort ? { reasoningEffort: effort } : {};
|
|
194
194
|
})(),
|
|
195
|
+
...(() => {
|
|
196
|
+
// Opt-in hard cap on reasoning tokens per turn. Useful for models
|
|
197
|
+
// (notably Gemini 2.5 Pro) that can otherwise burn the full output
|
|
198
|
+
// budget on dynamic thinking without producing a visible response.
|
|
199
|
+
// Unset by default — uncapped reasoning is the right behavior for
|
|
200
|
+
// most models.
|
|
201
|
+
const raw = resolvedEnv.values.REASONING_MAX_TOKENS ?? fileConfig.reasoningMaxTokens;
|
|
202
|
+
if (raw === undefined || raw === null || raw === "")
|
|
203
|
+
return {};
|
|
204
|
+
const value = typeof raw === "number" ? raw : Number(raw);
|
|
205
|
+
if (!Number.isFinite(value) || value <= 0)
|
|
206
|
+
return {};
|
|
207
|
+
return { reasoningMaxTokens: Math.floor(value) };
|
|
208
|
+
})(),
|
|
195
209
|
enableDynamicPrompt: parseBoolean(resolvedEnv.values.ENABLE_DYNAMIC_PROMPT ?? fileConfig.enableDynamicPrompt, false),
|
|
196
210
|
};
|
|
197
211
|
}
|
|
@@ -2,5 +2,5 @@ export { loadBenchmarkTask, loadBenchmarkTasks } from "./task-loader.js";
|
|
|
2
2
|
export { evaluate } from "./evaluator.js";
|
|
3
3
|
export { runBenchmarkTask, runBenchmarkSuite, } from "./runner.js";
|
|
4
4
|
export { buildBenchmarkAgentConfig, getDefaultBenchmarkConfigPath, resolveBenchmarkEnv, } from "./config.js";
|
|
5
|
-
export { collectWorkspaceChanges, writeWorkspaceDiff, } from "./workspace-changes.js";
|
|
5
|
+
export { collectWorkspaceChanges, getWorkspaceDiff, writeWorkspaceDiff, } from "./workspace-changes.js";
|
|
6
6
|
export { buildReport, buildReportFromEvaluations, formatReport, compareReports, } from "./reporter.js";
|
|
@@ -59,6 +59,47 @@ async function getWorkspaceGitPrefix(workspaceRoot) {
|
|
|
59
59
|
const prefix = await runGit(workspaceRoot, ["rev-parse", "--show-prefix"], true);
|
|
60
60
|
return prefix.trim();
|
|
61
61
|
}
|
|
62
|
+
/**
 * Snapshot the current HEAD so we can diff against it at the end of a run
 * even if the model committed in between.
 *
 * @param {string} workspaceRoot - Directory handed to the git helpers.
 * @returns {Promise<string | null>} The object id printed by
 *   `git rev-parse HEAD`, or null when the workspace is not a git repo or
 *   HEAD does not resolve to a commit (for example, no commits yet).
 */
export async function captureBaselineRef(workspaceRoot) {
    if (!(await isGitRepository(workspaceRoot))) {
        return null;
    }
    // NOTE(review): the trailing `true` presumably tells runGit to tolerate a
    // non-zero exit — confirm against runGit's definition.
    const output = await runGit(workspaceRoot, ["rev-parse", "HEAD"], true);
    const sha = output.trim();
    // On an unborn branch `git rev-parse HEAD` echoes the literal "HEAD" on
    // stdout before failing, so "non-empty" is not enough. Only accept a full
    // object id (40 hex chars for SHA-1, 64 for SHA-256) so a bogus ref is
    // never handed to `git diff` as the baseline.
    return /^(?:[0-9a-f]{40}|[0-9a-f]{64})$/i.test(sha) ? sha : null;
}
|
|
74
|
+
/**
 * Parse one line of `git diff --name-status` output.
 *
 * Lines are tab-separated: `STATUS\tPATH`, or `STATUS\tOLD\tNEW` for renames
 * and copies (where STATUS looks like `R100` / `C75`).
 *
 * @param {string} line - A single output line without its line terminator.
 * @returns {{status: string, path: string, previousPath?: string} | undefined}
 *   The parsed entry, or undefined when the line is empty or lacks a field.
 */
function parseNameStatusLine(line) {
    const [rawStatus, firstPath, secondPath] = line.split("\t");
    if (!rawStatus) {
        // Covers both the empty line and a line that starts with a tab.
        return undefined;
    }
    const code = rawStatus.charAt(0);
    // Two-character, porcelain-like status (letter + space). It is not meant
    // to be exact: the only status inspected downstream is "??" (untracked),
    // and those entries come from `git status`, not from this parser.
    const status = `${code} `;
    const isRenameOrCopy = code === "R" || code === "C";
    if (!isRenameOrCopy) {
        return firstPath ? { status, path: firstPath } : undefined;
    }
    if (!firstPath || !secondPath) {
        return undefined;
    }
    // For renames/copies the first path is the old name, the second the new.
    return { status, path: secondPath, previousPath: firstPath };
}
|
|
62
103
|
function stripWorkspacePrefix(filePath, workspacePrefix) {
|
|
63
104
|
if (!workspacePrefix) {
|
|
64
105
|
return filePath;
|
|
@@ -70,7 +111,7 @@ function stripWorkspacePrefix(filePath, workspacePrefix) {
|
|
|
70
111
|
? filePath.slice(normalizedPrefix.length)
|
|
71
112
|
: filePath;
|
|
72
113
|
}
|
|
73
|
-
export async function collectWorkspaceChanges(workspaceRoot) {
|
|
114
|
+
export async function collectWorkspaceChanges(workspaceRoot, baselineRef) {
|
|
74
115
|
const isGitRepo = await isGitRepository(workspaceRoot);
|
|
75
116
|
if (!isGitRepo) {
|
|
76
117
|
return {
|
|
@@ -80,11 +121,7 @@ export async function collectWorkspaceChanges(workspaceRoot) {
|
|
|
80
121
|
};
|
|
81
122
|
}
|
|
82
123
|
const workspacePrefix = await getWorkspaceGitPrefix(workspaceRoot);
|
|
83
|
-
const
|
|
84
|
-
const entries = statusOutput
|
|
85
|
-
.split(/\r?\n/)
|
|
86
|
-
.map((line) => parseStatusLine(line))
|
|
87
|
-
.map((entry) => entry
|
|
124
|
+
const remap = (entry) => entry
|
|
88
125
|
? {
|
|
89
126
|
...entry,
|
|
90
127
|
path: stripWorkspacePrefix(entry.path, workspacePrefix),
|
|
@@ -92,8 +129,39 @@ export async function collectWorkspaceChanges(workspaceRoot) {
|
|
|
92
129
|
? { previousPath: stripWorkspacePrefix(entry.previousPath, workspacePrefix) }
|
|
93
130
|
: {}),
|
|
94
131
|
}
|
|
95
|
-
: undefined
|
|
132
|
+
: undefined;
|
|
133
|
+
// Always pull untracked entries from `git status` — they're never part of
|
|
134
|
+
// a baseline diff because they aren't tracked yet.
|
|
135
|
+
const statusOutput = await runGit(workspaceRoot, ["status", "--porcelain=v1", "--untracked-files=all", "--", "."], true);
|
|
136
|
+
const statusEntries = statusOutput
|
|
137
|
+
.split(/\r?\n/)
|
|
138
|
+
.map((line) => parseStatusLine(line))
|
|
139
|
+
.map(remap)
|
|
96
140
|
.filter((entry) => entry !== undefined);
|
|
141
|
+
let entries;
|
|
142
|
+
if (baselineRef) {
|
|
143
|
+
// Tracked changes: anything that differs between the baseline commit and
|
|
144
|
+
// the current working tree. Captures committed, staged, AND unstaged
|
|
145
|
+
// edits in one shot.
|
|
146
|
+
const nameStatusOutput = await runGit(workspaceRoot, ["diff", "--name-status", baselineRef, "--", "."], true);
|
|
147
|
+
const trackedEntries = nameStatusOutput
|
|
148
|
+
.split(/\r?\n/)
|
|
149
|
+
.map((line) => parseNameStatusLine(line))
|
|
150
|
+
.map(remap)
|
|
151
|
+
.filter((entry) => entry !== undefined);
|
|
152
|
+
const untrackedEntries = statusEntries.filter((entry) => entry.status === "??");
|
|
153
|
+
const seen = new Set();
|
|
154
|
+
entries = [];
|
|
155
|
+
for (const entry of [...trackedEntries, ...untrackedEntries]) {
|
|
156
|
+
if (seen.has(entry.path))
|
|
157
|
+
continue;
|
|
158
|
+
seen.add(entry.path);
|
|
159
|
+
entries.push(entry);
|
|
160
|
+
}
|
|
161
|
+
}
|
|
162
|
+
else {
|
|
163
|
+
entries = statusEntries;
|
|
164
|
+
}
|
|
97
165
|
const changedFiles = [...new Set(entries.map((entry) => entry.path))];
|
|
98
166
|
return {
|
|
99
167
|
isGitRepo: true,
|
|
@@ -101,12 +169,19 @@ export async function collectWorkspaceChanges(workspaceRoot) {
|
|
|
101
169
|
changedFiles,
|
|
102
170
|
};
|
|
103
171
|
}
|
|
104
|
-
export async function
|
|
105
|
-
const changes = await collectWorkspaceChanges(workspaceRoot);
|
|
172
|
+
export async function getWorkspaceDiff(workspaceRoot, baselineRef) {
|
|
173
|
+
const changes = await collectWorkspaceChanges(workspaceRoot, baselineRef);
|
|
106
174
|
if (!changes.isGitRepo) {
|
|
107
|
-
return
|
|
175
|
+
return null;
|
|
108
176
|
}
|
|
109
|
-
|
|
177
|
+
// With a baseline ref we diff working-tree vs baseline directly, which
|
|
178
|
+
// captures committed + staged + unstaged in one pass. Without one we
|
|
179
|
+
// fall back to the working-tree-vs-index behavior — useful when the
|
|
180
|
+
// caller hasn't snapshotted a starting point.
|
|
181
|
+
const trackedDiffArgs = baselineRef
|
|
182
|
+
? ["diff", "--binary", "--no-ext-diff", "--relative", baselineRef, "--", "."]
|
|
183
|
+
: ["diff", "--binary", "--no-ext-diff", "--relative", "--", "."];
|
|
184
|
+
const trackedDiff = await runGit(workspaceRoot, trackedDiffArgs, true);
|
|
110
185
|
const untrackedDiffs = [];
|
|
111
186
|
for (const entry of changes.entries) {
|
|
112
187
|
if (entry.status === "??") {
|
|
@@ -116,9 +191,15 @@ export async function writeWorkspaceDiff(workspaceRoot, outPath) {
|
|
|
116
191
|
}
|
|
117
192
|
}
|
|
118
193
|
}
|
|
119
|
-
|
|
194
|
+
return [trackedDiff, ...untrackedDiffs]
|
|
120
195
|
.filter((section) => section.trim().length > 0)
|
|
121
196
|
.join("\n");
|
|
197
|
+
}
|
|
198
|
+
export async function writeWorkspaceDiff(workspaceRoot, outPath, baselineRef) {
|
|
199
|
+
const combinedDiff = await getWorkspaceDiff(workspaceRoot, baselineRef);
|
|
200
|
+
if (combinedDiff === null) {
|
|
201
|
+
return false;
|
|
202
|
+
}
|
|
122
203
|
await mkdir(path.dirname(outPath), { recursive: true });
|
|
123
204
|
await writeFile(outPath, combinedDiff, "utf8");
|
|
124
205
|
return true;
|
|
@@ -4,24 +4,87 @@ import process from "node:process";
|
|
|
4
4
|
import { CodingAgent, createModelClient, } from "@sean.holung/minicode-sdk";
|
|
5
5
|
import { getConfigSetupMessage } from "../agent/config.js";
|
|
6
6
|
import { buildBenchmarkAgentConfig, resolveBenchmarkEnv, } from "../benchmark/config.js";
|
|
7
|
-
import { collectWorkspaceChanges, writeWorkspaceDiff, } from "../benchmark/workspace-changes.js";
|
|
7
|
+
import { captureBaselineRef, collectWorkspaceChanges, getWorkspaceDiff, writeWorkspaceDiff, } from "../benchmark/workspace-changes.js";
|
|
8
8
|
import { buildProjectIndex } from "../indexer/project-index.js";
|
|
9
9
|
import { createToolRegistry } from "../tools/registry.js";
|
|
10
10
|
import { CliUsageError } from "./args.js";
|
|
11
|
+
import { buildContextBenchTrajectory } from "./contextbench-trajectory.js";
|
|
11
12
|
const BENCHMARK_SYSTEM_PROMPT_SUFFIX = [
|
|
12
|
-
"[
|
|
13
|
-
"- This task is running in a non-interactive
|
|
14
|
-
"-
|
|
13
|
+
"[Execution Mode]",
|
|
14
|
+
"- This task is running in a non-interactive harness. The task is already approved.",
|
|
15
|
+
"- Do not ask for confirmation, permission, or whether you should proceed.",
|
|
15
16
|
"- If the task requires code changes, make them immediately using the available tools.",
|
|
16
17
|
"- Do not stop after presenting a plan. Either complete the task or explain a concrete blocker.",
|
|
17
|
-
"
|
|
18
|
+
"",
|
|
19
|
+
"[Long-form Task Discipline]",
|
|
20
|
+
"- Non-trivial coding tasks routinely require 30+ tool-call iterate-test-fix cycles. Persistent iteration against the test suite is expected; do not bail early because you have read a few files.",
|
|
21
|
+
"- Iterate against the canonical test runner (the test command shipped with the task) until it passes. Treat the runner's output as the source of truth — not your own assessment of whether the code looks correct, and not ad-hoc verification scripts you write yourself.",
|
|
22
|
+
"- Before declaring the task complete: run the FULL existing test suite, not just tests targeting the new feature. Many tasks modify code that other features depend on — verify you did not break previously-passing functionality. Regression failures on stages you were not asked to modify still count as failures.",
|
|
23
|
+
"- \"I have implemented X\" is not the same as \"tests pass.\" Do not declare completion without observing an explicit green signal (exit code 0, all-pass marker) from the canonical test runner over the full suite.",
|
|
18
24
|
].join("\n");
|
|
19
|
-
const
|
|
25
|
+
const BENCHMARK_RETRY_REMINDER_APPROVAL = [
|
|
20
26
|
"Benchmark harness reminder:",
|
|
21
27
|
"- This task is already approved.",
|
|
22
28
|
"- Do not ask for confirmation or present a plan without acting.",
|
|
23
29
|
"- Use tools, make the required edits immediately, and finish the task.",
|
|
24
30
|
].join("\n");
|
|
31
|
+
// Used when the previous attempt emitted zero tool calls — either pure
|
|
32
|
+
// reasoning that produced no output (Gemini 2.5 Pro's thinking-paralysis
|
|
33
|
+
// mode) or narration claiming work was done without any tool calls.
|
|
34
|
+
// In benchmark mode every task requires code changes; zero tool calls
|
|
35
|
+
// is a definite failure regardless of what the response text claims.
|
|
36
|
+
const BENCHMARK_RETRY_REMINDER_NO_ACTION = [
|
|
37
|
+
"Benchmark harness reminder:",
|
|
38
|
+
"- Your previous response made zero tool calls. The task is not complete.",
|
|
39
|
+
'- Code changes only happen through tool calls (edit_file / write_file). Text alone — including past-tense statements like "I\'ve added X" or future-tense plans like "I\'ll add X" — is not a change.',
|
|
40
|
+
"- Begin by reading the relevant files with read_file or read_symbol, then make the edits with edit_file / write_file, then verify the result with run_command.",
|
|
41
|
+
].join("\n");
|
|
42
|
+
// Used when the previous attempt made tool calls but never mutated any
|
|
43
|
+
// file (no edit_file/write_file, no `cat > FILE`/`sed -i`/`tee` shell
|
|
44
|
+
// edit). Observed shape: model reads a few symbols, narrates a plan
|
|
45
|
+
// ("Here's how I'll fix it: 1. … 2. …"), and stops without acting.
|
|
46
|
+
// Distinct from `approval_seeking` (no explicit confirmation request)
|
|
47
|
+
// and from `no_action` (tool-call count > 0).
|
|
48
|
+
const BENCHMARK_RETRY_REMINDER_NO_MUTATION = [
|
|
49
|
+
"Benchmark harness reminder:",
|
|
50
|
+
"- Your previous response read files but never edited any. The task is not complete.",
|
|
51
|
+
"- Code changes only happen through file mutations: edit_file / write_file (preferred), or a shell command that writes to a file (e.g. `cat > path <<EOF`, `sed -i ...`).",
|
|
52
|
+
"- Reading more files is not progress on its own. Identify the file to change, make the edit, then verify with run_command.",
|
|
53
|
+
].join("\n");
|
|
54
|
+
/**
 * Cap on prior-reasoning content forwarded to the retry prompt. ~2000
 * chars ≈ 500 tokens — enough to convey the model's high-level plan from
 * the failed attempt without ballooning the second attempt's input cost.
 */
const PRIOR_REASONING_MAX_CHARS = 2000;
/**
 * Build a "your previous attempt thought this" block to append to the
 * retry prompt, so the retry does not start from a cold state.
 *
 * @param {unknown} reasoningContent - Reasoning text captured from the
 *   previous attempt; anything that is not a string is treated as absent.
 * @returns {string} A newline-joined block (starting with an empty line)
 *   wrapping the reasoning between PRIOR_REASONING markers, or an empty
 *   string when there is no non-whitespace reasoning. Reasoning longer than
 *   PRIOR_REASONING_MAX_CHARS is cut and followed by a note stating how many
 *   characters were dropped.
 */
export function buildPriorReasoningContext(reasoningContent) {
    if (typeof reasoningContent !== "string") {
        return "";
    }
    const trimmed = reasoningContent.trim();
    if (trimmed.length === 0) {
        return "";
    }
    let snippet = trimmed;
    if (trimmed.length > PRIOR_REASONING_MAX_CHARS) {
        let cut = PRIOR_REASONING_MAX_CHARS;
        // String lengths count UTF-16 code units. If the last unit kept is a
        // high surrogate, the cut would split an astral character (emoji, some
        // CJK) and leave a lone surrogate in the prompt — back off one unit.
        const lastKept = trimmed.charCodeAt(cut - 1);
        if (lastKept >= 0xd800 && lastKept <= 0xdbff) {
            cut -= 1;
        }
        snippet = `${trimmed.slice(0, cut)}\n…[${trimmed.length - cut} more chars of reasoning truncated]`;
    }
    return [
        "",
        "Your previous attempt's internal reasoning (verbatim, for your own context):",
        "<<<PRIOR_REASONING>>>",
        snippet,
        "<<<END_PRIOR_REASONING>>>",
        "Apply that reasoning concretely — make the file changes your previous turn was planning, then verify.",
    ].join("\n");
}
|
|
25
88
|
const CONFIRMATION_REQUEST_PATTERNS = [
|
|
26
89
|
/\bplease confirm\b/i,
|
|
27
90
|
/\bconfirm and i(?:'|’)ll\b/i,
|
|
@@ -34,6 +97,20 @@ const CONFIRMATION_REQUEST_PATTERNS = [
|
|
|
34
97
|
/\bneed your approval\b/i,
|
|
35
98
|
/\bpermission\b/i,
|
|
36
99
|
];
|
|
100
|
+
/**
 * Report whether the retry path should force a tool call
 * (`toolChoice: "required"`) and cap reasoning on the retried attempt.
 *
 * Enabled by default. Setting `BENCHMARK_RETRY_FORCE_TOOLS` to `0`, `false`,
 * `no` or `off` (surrounding whitespace and letter case are ignored) turns it
 * off, so a model that misbehaves under `tool_choice: required` can be
 * unblocked without a code change.
 *
 * @param {Record<string, string | undefined>} env - Environment-style map.
 * @returns {boolean} false only for one of the disabling values above.
 */
export function isRetryForceToolsEnabled(env) {
    const raw = env.BENCHMARK_RETRY_FORCE_TOOLS ?? "";
    const normalized = raw.trim().toLowerCase();
    const disablingValues = ["0", "false", "no", "off"];
    return !disablingValues.includes(normalized);
}
|
|
37
114
|
const SPECIALIZED_TOOL_NAMES = new Set([
|
|
38
115
|
"read_symbol",
|
|
39
116
|
"find_references",
|
|
@@ -46,6 +123,37 @@ const SEARCH_TOOL_NAMES = new Set(["search"]);
|
|
|
46
123
|
const MUTATION_TOOL_NAMES = new Set(["edit_file", "write_file"]);
|
|
47
124
|
const COMMAND_TOOL_NAMES = new Set(["run_command"]);
|
|
48
125
|
const REPEATED_TOOL_CALL_STOP_TEXT = "Stopped due to repeated identical tool calls";
|
|
126
|
+
// Heuristics for "this shell command modified a file." Some models write
// files through the shell (`cat > FILE <<EOF`, `cat >> FILE`, `sed -i`, ...)
// instead of edit_file/write_file, so counting only the structured tools
// under-reports real edits. These patterns are deliberately loose: they flag
// commands that LOOK like file writes and are not a shell parser.
const SHELL_MUTATION_PATTERNS = [
    // Redirect to a file: `> path`, `>> path`, or with a leading fd like `2> path`.
    // Excludes `/dev/null`, `/dev/stderr`, and fd redirects (`>&2`).
    /(?:^|[^&>])>>?\s*(?!\/dev\/null\b|\/dev\/stderr\b|&\d)[^\s|;&<>]+/,
    // sed in-place edit: `-i` on its own, `-i` bundled after other short
    // flags (`-Ei`, `-ni`), with a backup suffix (`-i.bak`), or the GNU long
    // form `--in-place`.
    /\bsed\b[^|;]*\s(?:-[A-Za-z]*i\b|--in-place\b)/,
    // tee writes its stdin to one or more files.
    /\btee\b(?!\s+--help\b)/,
    // Python `open(PATH, MODE).write(...)` invocation (covers the common
    // `python -c "..."` mutation shape).
    /\bopen\s*\(\s*['"][^'"]+['"]\s*,\s*['"][rwa]\+?b?\+?['"]\s*\)\s*\.\s*write\b/,
];
/**
 * Heuristically decide whether a shell command line writes to a file.
 *
 * @param {unknown} command - The command text; non-strings and the empty
 *   string are never treated as mutations.
 * @returns {boolean} true when any SHELL_MUTATION_PATTERNS entry matches.
 */
export function looksLikeShellFileMutation(command) {
    if (typeof command !== "string" || command.length === 0) {
        return false;
    }
    // None of the patterns carry the `g`/`y` flag, so `.test` is stateless.
    return SHELL_MUTATION_PATTERNS.some((pattern) => pattern.test(command));
}
|
|
150
|
+
/**
 * Decide whether a recorded tool call is a command-tool invocation whose
 * command text looks like a file write.
 *
 * @param {{name: string, input?: {command?: unknown}}} toolCall
 * @returns {boolean} false for tools outside COMMAND_TOOL_NAMES or when
 *   `input.command` is not a string; otherwise the verdict of
 *   looksLikeShellFileMutation.
 */
function toolCallLooksLikeShellMutation(toolCall) {
    const isCommandTool = COMMAND_TOOL_NAMES.has(toolCall.name);
    if (!isCommandTool) {
        return false;
    }
    const command = toolCall.input?.command;
    if (typeof command !== "string") {
        return false;
    }
    return looksLikeShellFileMutation(command);
}
|
|
49
157
|
export function getBenchmarkSystemPromptSuffix() {
|
|
50
158
|
return BENCHMARK_SYSTEM_PROMPT_SUFFIX;
|
|
51
159
|
}
|
|
@@ -55,6 +163,84 @@ export function isBenchmarkApprovalSeekingResponse(text) {
|
|
|
55
163
|
}
|
|
56
164
|
return CONFIRMATION_REQUEST_PATTERNS.some((pattern) => pattern.test(text));
|
|
57
165
|
}
|
|
166
|
+
/**
 * Total the tool calls carried by assistant messages.
 *
 * @param {Iterable<{role: string, toolCalls?: unknown[]}>} messages
 * @returns {number} Sum of `toolCalls.length` over messages whose role is
 *   "assistant"; other roles and messages without tool calls add nothing.
 */
function countToolCallsInMessages(messages) {
    let total = 0;
    for (const message of messages) {
        if (message.role !== "assistant") {
            continue;
        }
        total += message.toolCalls?.length || 0;
    }
    return total;
}
|
|
175
|
+
/**
 * Count assistant tool calls that look like workspace mutations: either a
 * structured mutation tool (a name in MUTATION_TOOL_NAMES) or a command-tool
 * call whose shell text looks like a file write (heredoc into a file,
 * `sed -i`, `tee`, ...).
 *
 * @param {Iterable<{role: string, toolCalls?: Array<{name: string, input?: object}>}>} messages
 * @returns {number} Number of matching tool calls; each call counts at most once.
 */
export function countMutationsInMessages(messages) {
    // A missing `input` is normalized to {} before the shell heuristic runs.
    const isMutation = (toolCall) => MUTATION_TOOL_NAMES.has(toolCall.name) ||
        toolCallLooksLikeShellMutation({
            name: toolCall.name,
            input: (toolCall.input ?? {}),
        });
    let total = 0;
    for (const message of messages) {
        if (message.role === "assistant" && message.toolCalls?.length) {
            for (const toolCall of message.toolCalls) {
                if (isMutation(toolCall)) {
                    total += 1;
                }
            }
        }
    }
    return total;
}
|
|
201
|
+
/**
 * Decide whether a benchmark attempt should be retried once with an extra
 * reminder, and why. The returned reason lets the caller pick the matching
 * reminder text.
 *
 * Checks run in this order and the first hit wins:
 *   1. `no_action` — the attempt made zero tool calls. In benchmark mode
 *      every task requires code changes, so a response without tool calls is
 *      incomplete whatever its text claims.
 *   2. `approval_seeking` — the response text asks for confirmation instead
 *      of acting, even though some tool calls were made.
 *   3. `no_mutation` — tool calls were made but none counted as a file
 *      mutation (read-only exploration that stopped at a plan).
 *
 * @param {{text: string, toolCallCount: number, mutationCount: number}} attempt
 * @returns {"no_action" | "approval_seeking" | "no_mutation" | null} The
 *   retry reason, or null when the attempt looks fine as-is.
 */
export function getBenchmarkRetryReason(attempt) {
    const { text, toolCallCount, mutationCount } = attempt;
    if (toolCallCount === 0) {
        return "no_action";
    }
    if (isBenchmarkApprovalSeekingResponse(text)) {
        return "approval_seeking";
    }
    return mutationCount === 0 ? "no_mutation" : null;
}
|
|
234
|
+
/**
 * Map a retry reason to the reminder text appended to the retried prompt.
 *
 * @param {"approval_seeking" | "no_action" | "no_mutation"} reason
 * @returns {string | undefined} The matching BENCHMARK_RETRY_REMINDER_*
 *   constant; undefined for any other value.
 */
export function getBenchmarkRetryReminder(reason) {
    if (reason === "approval_seeking") {
        return BENCHMARK_RETRY_REMINDER_APPROVAL;
    }
    if (reason === "no_action") {
        return BENCHMARK_RETRY_REMINDER_NO_ACTION;
    }
    if (reason === "no_mutation") {
        return BENCHMARK_RETRY_REMINDER_NO_MUTATION;
    }
    return undefined;
}
|
|
58
244
|
function stableSerialize(value) {
|
|
59
245
|
if (Array.isArray(value)) {
|
|
60
246
|
return `[${value.map((item) => stableSerialize(item)).join(",")}]`;
|
|
@@ -105,6 +291,7 @@ export function summarizeBenchmarkToolUsage(toolCalls, finalText) {
|
|
|
105
291
|
let fileReadTotal = 0;
|
|
106
292
|
let searchTotal = 0;
|
|
107
293
|
let mutationTotal = 0;
|
|
294
|
+
let shellMutationTotal = 0;
|
|
108
295
|
let commandTotal = 0;
|
|
109
296
|
let skippedTotal = 0;
|
|
110
297
|
for (const toolCall of toolCalls) {
|
|
@@ -124,6 +311,9 @@ export function summarizeBenchmarkToolUsage(toolCalls, finalText) {
|
|
|
124
311
|
}
|
|
125
312
|
if (COMMAND_TOOL_NAMES.has(toolCall.name)) {
|
|
126
313
|
commandTotal += 1;
|
|
314
|
+
if (toolCallLooksLikeShellMutation(toolCall)) {
|
|
315
|
+
shellMutationTotal += 1;
|
|
316
|
+
}
|
|
127
317
|
}
|
|
128
318
|
if (toolCall.skipped) {
|
|
129
319
|
skippedTotal += 1;
|
|
@@ -149,6 +339,7 @@ export function summarizeBenchmarkToolUsage(toolCalls, finalText) {
|
|
|
149
339
|
fileReadTotal,
|
|
150
340
|
searchTotal,
|
|
151
341
|
mutationTotal,
|
|
342
|
+
shellMutationTotal,
|
|
152
343
|
commandTotal,
|
|
153
344
|
skippedTotal,
|
|
154
345
|
repeatedStop: finalText.includes(REPEATED_TOOL_CALL_STOP_TEXT) ||
|
|
@@ -176,6 +367,8 @@ export function parseBenchmarkRunArgs(argv) {
|
|
|
176
367
|
let workspaceRoot;
|
|
177
368
|
let diffOut;
|
|
178
369
|
let outFile;
|
|
370
|
+
let contextBenchTrajectory;
|
|
371
|
+
let contextBenchImage;
|
|
179
372
|
let verbose = false;
|
|
180
373
|
for (let i = 0; i < argv.length; i += 1) {
|
|
181
374
|
const arg = argv[i];
|
|
@@ -276,6 +469,26 @@ export function parseBenchmarkRunArgs(argv) {
|
|
|
276
469
|
promptFile = arg.slice("--prompt-file=".length).trim();
|
|
277
470
|
continue;
|
|
278
471
|
}
|
|
472
|
+
if (arg === "--contextbench-trajectory") {
|
|
473
|
+
const parsed = readFlagValue(argv, i, "--contextbench-trajectory");
|
|
474
|
+
contextBenchTrajectory = parsed.value;
|
|
475
|
+
i = parsed.nextIndex;
|
|
476
|
+
continue;
|
|
477
|
+
}
|
|
478
|
+
if (arg.startsWith("--contextbench-trajectory=")) {
|
|
479
|
+
contextBenchTrajectory = arg.slice("--contextbench-trajectory=".length).trim();
|
|
480
|
+
continue;
|
|
481
|
+
}
|
|
482
|
+
if (arg === "--contextbench-image") {
|
|
483
|
+
const parsed = readFlagValue(argv, i, "--contextbench-image");
|
|
484
|
+
contextBenchImage = parsed.value;
|
|
485
|
+
i = parsed.nextIndex;
|
|
486
|
+
continue;
|
|
487
|
+
}
|
|
488
|
+
if (arg.startsWith("--contextbench-image=")) {
|
|
489
|
+
contextBenchImage = arg.slice("--contextbench-image=".length).trim();
|
|
490
|
+
continue;
|
|
491
|
+
}
|
|
279
492
|
promptParts.push(arg);
|
|
280
493
|
}
|
|
281
494
|
if (promptFile && promptParts.length > 0) {
|
|
@@ -292,6 +505,8 @@ export function parseBenchmarkRunArgs(argv) {
|
|
|
292
505
|
...(workspaceRoot ? { workspaceRoot } : {}),
|
|
293
506
|
...(diffOut ? { diffOut } : {}),
|
|
294
507
|
...(outFile ? { outFile } : {}),
|
|
508
|
+
...(contextBenchTrajectory ? { contextBenchTrajectory } : {}),
|
|
509
|
+
...(contextBenchImage ? { contextBenchImage } : {}),
|
|
295
510
|
verbose,
|
|
296
511
|
};
|
|
297
512
|
}
|
|
@@ -365,6 +580,11 @@ export async function runBenchmarkCommand(argv) {
|
|
|
365
580
|
projectIndex = undefined;
|
|
366
581
|
}
|
|
367
582
|
const toolRegistry = createToolRegistry(config, projectIndex);
|
|
583
|
+
// Snapshot HEAD before the model runs so we can diff against this point
|
|
584
|
+
// even if it commits its fix mid-run. Without this, `git diff` (which
|
|
585
|
+
// compares working-tree vs index) is blind to committed changes and the
|
|
586
|
+
// patch returned to the harness would be empty.
|
|
587
|
+
const baselineRef = (await captureBaselineRef(config.workspaceRoot)) ?? undefined;
|
|
368
588
|
const startedAt = new Date().toISOString();
|
|
369
589
|
const started = performance.now();
|
|
370
590
|
let attempt = await runBenchmarkAttempt(prompt, {
|
|
@@ -374,12 +594,48 @@ export async function runBenchmarkCommand(argv) {
|
|
|
374
594
|
verbose: args.verbose,
|
|
375
595
|
...(projectIndex !== undefined ? { projectIndex } : {}),
|
|
376
596
|
});
|
|
377
|
-
|
|
597
|
+
let retryRecord = null;
|
|
598
|
+
const retryReason = getBenchmarkRetryReason({
|
|
599
|
+
text: attempt.text,
|
|
600
|
+
toolCallCount: countToolCallsInMessages(attempt.messages),
|
|
601
|
+
mutationCount: countMutationsInMessages(attempt.messages),
|
|
602
|
+
});
|
|
603
|
+
if (retryReason !== null) {
|
|
378
604
|
if (args.verbose) {
|
|
379
|
-
|
|
605
|
+
const description = retryReason === "approval_seeking"
|
|
606
|
+
? "Model asked for confirmation"
|
|
607
|
+
: retryReason === "no_action"
|
|
608
|
+
? "Model emitted zero tool calls (pure-reasoning or narration-only response)"
|
|
609
|
+
: "Model made tool calls but never mutated any file (plan-only response)";
|
|
610
|
+
console.error(`[benchmark] ${description}; retrying once with a non-interactive reminder.`);
|
|
380
611
|
}
|
|
381
|
-
|
|
382
|
-
|
|
612
|
+
const reminder = getBenchmarkRetryReminder(retryReason);
|
|
613
|
+
const priorReasoningBlock = buildPriorReasoningContext(attempt.reasoningContent);
|
|
614
|
+
// Force a tool call on the retry AND cap reasoning. The first attempt
|
|
615
|
+
// already failed to act; `tool_choice: required` commits the model to
|
|
616
|
+
// emit a tool call, and the reasoning cap starves the dynamic-thinking
|
|
617
|
+
// path that produced the original collapse (Gemini 2.5 Pro routinely
|
|
618
|
+
// burns 10K+ reasoning tokens before returning nothing). The cap is
|
|
619
|
+
// above what a typical planning step needs (~1-2K tokens) but well
|
|
620
|
+
// below the observed collapse zone. Opt-out via env var.
|
|
621
|
+
const RETRY_REASONING_MAX_TOKENS = 2000;
|
|
622
|
+
const forceTools = isRetryForceToolsEnabled(process.env);
|
|
623
|
+
const retryConfig = forceTools
|
|
624
|
+
? {
|
|
625
|
+
...config,
|
|
626
|
+
toolChoice: "required",
|
|
627
|
+
reasoningMaxTokens: RETRY_REASONING_MAX_TOKENS,
|
|
628
|
+
}
|
|
629
|
+
: config;
|
|
630
|
+
retryRecord = forceTools
|
|
631
|
+
? {
|
|
632
|
+
reason: retryReason,
|
|
633
|
+
toolChoice: "required",
|
|
634
|
+
reasoningMaxTokens: RETRY_REASONING_MAX_TOKENS,
|
|
635
|
+
}
|
|
636
|
+
: { reason: retryReason };
|
|
637
|
+
attempt = await runBenchmarkAttempt(`${prompt}\n\n${reminder}${priorReasoningBlock}`, {
|
|
638
|
+
config: retryConfig,
|
|
383
639
|
modelClient,
|
|
384
640
|
toolRegistry,
|
|
385
641
|
verbose: args.verbose,
|
|
@@ -388,11 +644,11 @@ export async function runBenchmarkCommand(argv) {
|
|
|
388
644
|
}
|
|
389
645
|
const durationMs = performance.now() - started;
|
|
390
646
|
const completedAt = new Date().toISOString();
|
|
391
|
-
const changes = await collectWorkspaceChanges(config.workspaceRoot);
|
|
647
|
+
const changes = await collectWorkspaceChanges(config.workspaceRoot, baselineRef);
|
|
392
648
|
let diffOutPath;
|
|
393
649
|
if (args.diffOut) {
|
|
394
650
|
diffOutPath = path.resolve(cwd, args.diffOut);
|
|
395
|
-
await writeWorkspaceDiff(config.workspaceRoot, diffOutPath);
|
|
651
|
+
await writeWorkspaceDiff(config.workspaceRoot, diffOutPath, baselineRef);
|
|
396
652
|
}
|
|
397
653
|
const toolCalls = buildBenchmarkToolTrace(attempt.messages);
|
|
398
654
|
const result = {
|
|
@@ -410,6 +666,7 @@ export async function runBenchmarkCommand(argv) {
|
|
|
410
666
|
...(diffOutPath ? { diffOut: diffOutPath } : {}),
|
|
411
667
|
toolCalls,
|
|
412
668
|
toolUsage: summarizeBenchmarkToolUsage(toolCalls, attempt.text),
|
|
669
|
+
retry: retryRecord,
|
|
413
670
|
};
|
|
414
671
|
const payload = JSON.stringify(result, null, 2);
|
|
415
672
|
if (args.outFile) {
|
|
@@ -420,6 +677,22 @@ export async function runBenchmarkCommand(argv) {
|
|
|
420
677
|
else {
|
|
421
678
|
console.log(payload);
|
|
422
679
|
}
|
|
680
|
+
if (args.contextBenchTrajectory) {
|
|
681
|
+
const trajectoryPath = path.resolve(cwd, args.contextBenchTrajectory);
|
|
682
|
+
const patch = (await getWorkspaceDiff(config.workspaceRoot, baselineRef)) ?? "";
|
|
683
|
+
const trajectory = buildContextBenchTrajectory({
|
|
684
|
+
systemPrompt: BENCHMARK_SYSTEM_PROMPT_SUFFIX,
|
|
685
|
+
userPrompt: prompt,
|
|
686
|
+
toolCalls,
|
|
687
|
+
finalAssistantText: attempt.text,
|
|
688
|
+
workspaceRoot: config.workspaceRoot,
|
|
689
|
+
patch,
|
|
690
|
+
...(projectIndex !== undefined ? { projectIndex } : {}),
|
|
691
|
+
...(args.contextBenchImage ? { image: args.contextBenchImage } : {}),
|
|
692
|
+
});
|
|
693
|
+
await mkdir(path.dirname(trajectoryPath), { recursive: true });
|
|
694
|
+
await writeFile(trajectoryPath, JSON.stringify(trajectory, null, 2) + "\n", "utf8");
|
|
695
|
+
}
|
|
423
696
|
}
|
|
424
697
|
finally {
|
|
425
698
|
if (previousAnthropicApiKey === undefined) {
|