npm - @sean.holung/minicode - Versions diffs - 0.4.1 → 0.4.2 - Mend

@sean.holung/minicode 0.4.1 → 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27) hide show

package/dist/src/cli/contextbench-trajectory.js ADDED Viewed

@@ -0,0 +1,258 @@
+/**
+ * Convert a benchmark run into a MiniSWE-Agent compatible `.traj.json` trajectory
+ * that ContextBench's existing extractor (`contextbench/agents/minisweagent/extract.py`)
+ * can parse via the preferred `<explore_context>` / `<PATCH_CONTEXT>` path.
+ *
+ * Tool calls are grouped by step; every step that yields at least one span is
+ * rendered into one assistant message whose body is a single
+ * `<explore_context>` block listing the files and line ranges touched on that
+ * step. The final assistant message carries a `<PATCH_CONTEXT>` block computed
+ * from the unified diff, listing the files and hunk ranges of the patch.
+ *
+ * @param {object} options
+ * @param {string} options.systemPrompt - Content of the leading system message.
+ * @param {string} options.userPrompt - Content of the user message.
+ * @param {Array<object>} [options.toolCalls] - Recorded tool calls; each is read
+ *   for `step`, `name`, `input`, `result` and `skipped`. A missing list is
+ *   treated as empty.
+ * @param {object} [options.projectIndex] - Symbol index forwarded to the span
+ *   collectors for symbol-based tools.
+ * @param {string} [options.patch] - Unified diff; stored as `info.submission`.
+ * @param {string} [options.finalAssistantText] - Closing assistant text; a
+ *   missing value is treated as empty.
+ * @param {string} [options.image] - When truthy, recorded under
+ *   `info.config.environment.image`.
+ * @returns {{messages: Array<{role: string, content: string}>, info: object}}
+ */
+export function buildContextBenchTrajectory(options) {
+    const messages = [
+        { role: "system", content: options.systemPrompt },
+        { role: "user", content: options.userPrompt },
+    ];
+    // Group tool calls by step so that batched calls within one assistant turn
+    // produce a single assistant message, mirroring a real agent transcript.
+    // A run that aborted before any tool call may carry no list at all.
+    const stepGroups = new Map();
+    for (const call of options.toolCalls ?? []) {
+        const list = stepGroups.get(call.step);
+        if (list)
+            list.push(call);
+        else
+            stepGroups.set(call.step, [call]);
+    }
+    const sortedSteps = [...stepGroups.keys()].sort((a, b) => a - b);
+    for (const step of sortedSteps) {
+        const spans = collectSpansForCalls(stepGroups.get(step), options.projectIndex);
+        // Steps that explored nothing attributable are left out entirely.
+        if (spans.length === 0)
+            continue;
+        messages.push({
+            role: "assistant",
+            content: `<explore_context>\n${formatSpans(spans)}\n</explore_context>`,
+        });
+    }
+    const patchSpans = parsePatchSpans(options.patch);
+    const patchBlock = patchSpans.length > 0
+        ? `<PATCH_CONTEXT>\n${formatSpans(patchSpans)}\n</PATCH_CONTEXT>`
+        : "<PATCH_CONTEXT>\n</PATCH_CONTEXT>";
+    // The final text can be absent when the run ended without a closing
+    // assistant message; the PATCH_CONTEXT block is still emitted on its own.
+    const finalText = (options.finalAssistantText ?? "").trim();
+    messages.push({
+        role: "assistant",
+        content: finalText.length > 0 ? `${finalText}\n\n${patchBlock}` : patchBlock,
+    });
+    return {
+        messages,
+        info: {
+            submission: options.patch,
+            ...(options.image
+                ? { config: { environment: { image: options.image } } }
+                : {}),
+        },
+    };
+}
+/**
+ * Render spans as alternating `File:` / `Lines:` lines joined by newlines.
+ *
+ * @param {Array<{file: string, startLine: number, endLine: number}>} spans
+ * @returns {string} Empty string when there are no spans.
+ */
+function formatSpans(spans) {
+    // Order by file, then start line, then end line. Sorting happens on a
+    // copy so the caller's array keeps its order.
+    const bySpanPosition = (left, right) => {
+        const fileOrder = left.file.localeCompare(right.file);
+        if (fileOrder)
+            return fileOrder;
+        const startOrder = left.startLine - right.startLine;
+        if (startOrder)
+            return startOrder;
+        return left.endLine - right.endLine;
+    };
+    const ordered = Array.from(spans);
+    ordered.sort(bySpanPosition);
+    const rendered = ordered.map((span) => {
+        return `File: ${span.file}\nLines: ${span.startLine}-${span.endLine}`;
+    });
+    return rendered.join("\n");
+}
+/**
+ * Gather the spans produced by a group of tool calls, dropping exact
+ * duplicates (same file, start line and end line) and ignoring skipped calls.
+ * Spans are returned in first-seen order.
+ *
+ * @param {Array<object>} calls - Tool calls belonging to one step.
+ * @param {object} [index] - Symbol index forwarded to the per-tool collectors.
+ * @returns {Array<{file: string, startLine: number, endLine: number}>}
+ */
+function collectSpansForCalls(calls, index) {
+    // A Map keeps insertion order, so the first occurrence of each span wins.
+    const uniqueSpans = new Map();
+    for (const call of calls) {
+        if (call.skipped)
+            continue;
+        for (const span of spansForCall(call, index)) {
+            const identity = `${span.file}:${span.startLine}-${span.endLine}`;
+            if (!uniqueSpans.has(identity))
+                uniqueSpans.set(identity, span);
+        }
+    }
+    return [...uniqueSpans.values()];
+}
+/**
+ * Dispatch a tool call to the span collector for its tool name.
+ * Tools without a collector contribute no spans.
+ *
+ * @param {object} call - Tool call; `call.name` selects the collector.
+ * @param {object} [index] - Symbol index, used by the symbol-based tools.
+ * @returns {Array<{file: string, startLine: number, endLine: number}>}
+ */
+function spansForCall(call, index) {
+    const toolName = call.name;
+    if (toolName === "read_file")
+        return spansForReadFile(call);
+    if (toolName === "read_symbol")
+        return spansForSymbolLookup(call, index);
+    if (toolName === "find_references")
+        return spansForReferences(call, index);
+    if (toolName === "get_dependencies")
+        return spansForDependencyCone(call, index);
+    // Both mutating tools share one collector.
+    if (toolName === "edit_file" || toolName === "write_file")
+        return spansForMutation(call);
+    return [];
+}
+/**
+ * Derive the span covered by a `read_file` call.
+ *
+ * The start line is the positive `offset` input, or 1 otherwise. The end line
+ * comes from the last `<line>|<content>` prefix in the tool result when one
+ * can be parsed, else from `start + limit - 1` when a positive `limit` was
+ * given. With neither available no span is emitted.
+ *
+ * @param {object} call - Tool call; reads `call.input.path`, `offset`,
+ *   `limit` and `call.result`.
+ * @returns {Array<{file: string, startLine: number, endLine: number}>}
+ *   Zero or one span.
+ */
+function spansForReadFile(call) {
+    const filePath = stringField(call.input, "path");
+    if (!filePath)
+        return [];
+    const offset = numberField(call.input, "offset");
+    const limit = numberField(call.input, "limit");
+    const startLine = offset !== undefined && offset > 0 ? offset : 1;
+    // The result's own last line number is the tightest upper bound, so it
+    // takes precedence over the requested limit.
+    const resultEndLine = lastLineNumberInResult(call.result);
+    let endLine;
+    if (resultEndLine !== undefined)
+        endLine = resultEndLine;
+    else if (limit !== undefined && limit > 0)
+        endLine = startLine + limit - 1;
+    else
+        // No parseable result and no limit: skip rather than fabricate a range.
+        return [];
+    // Never report a range that ends before it starts.
+    return [{ file: filePath, startLine, endLine: Math.max(startLine, endLine) }];
+}
+/**
+ * Derive spans for a `read_symbol` call by resolving the requested name
+ * against the symbol index.
+ *
+ * @param {object} call - Tool call; reads `call.input.name`.
+ * @param {object} [index] - Symbol index; `getSymbolMatches` and `getSymbol`
+ *   are both optional and only used when present.
+ * @returns {Array<{file: string, startLine: number, endLine: number}>}
+ *   Empty when there is no index, no name, or no match.
+ */
+function spansForSymbolLookup(call, index) {
+    if (!index)
+        return [];
+    const name = stringField(call.input, "name");
+    if (!name)
+        return [];
+    // Some symbols resolve to multiple candidates (e.g. method `foo` on
+    // multiple classes). Emit a span per match so coverage credit reflects
+    // what the agent actually paid attention to.
+    const matches = index.getSymbolMatches?.(name) ?? [];
+    if (matches.length > 0)
+        return matches.map(indexedSymbolToFileSpan).filter(isDefined);
+    // Fall back to the single-symbol lookup, resolved once and reused.
+    const single = index.getSymbol?.(name);
+    return single ? [single].map(indexedSymbolToFileSpan).filter(isDefined) : [];
+}
+/**
+ * Derive spans for a `find_references` call: one span per symbol that has a
+ * dependency edge pointing at the requested symbol.
+ *
+ * @param {object} call - Tool call; reads `call.input.name`.
+ * @param {object} [index] - Symbol index; reads `dependencyEdges` (entries
+ *   with `from` / `to`) and the optional `getSymbol` lookup.
+ * @returns {Array<{file: string, startLine: number, endLine: number}>}
+ *   Empty when the index, the name, or the target symbol is missing.
+ */
+function spansForReferences(call, index) {
+    if (!index)
+        return [];
+    const name = stringField(call.input, "name");
+    if (!name)
+        return [];
+    const edges = index.dependencyEdges ?? [];
+    const target = index.getSymbol?.(name);
+    if (!target)
+        return [];
+    return edges
+        .filter((edge) => edge.to === target.qualifiedName)
+        .flatMap((edge) => {
+            // Edges whose source symbol cannot be resolved contribute nothing.
+            const referrer = index.getSymbol?.(edge.from);
+            if (!referrer)
+                return [];
+            const span = indexedSymbolToFileSpan(referrer);
+            return span ? [span] : [];
+        });
+}
+/**
+ * Derive spans for a `get_dependencies` call from the index's dependency cone.
+ *
+ * The symbol name is taken from the `name` input, falling back to `symbol`;
+ * the traversal depth defaults to 2 when no numeric `depth` is supplied.
+ *
+ * @param {object} call - Tool call; reads `call.input.name`, `symbol`, `depth`.
+ * @param {object} [index] - Symbol index; must expose `getDependencyCone`.
+ * @returns {Array<{file: string, startLine: number, endLine: number}>}
+ */
+function spansForDependencyCone(call, index) {
+    if (!index?.getDependencyCone)
+        return [];
+    let symbolName = stringField(call.input, "name");
+    if (symbolName == null)
+        symbolName = stringField(call.input, "symbol");
+    if (!symbolName)
+        return [];
+    const requestedDepth = numberField(call.input, "depth");
+    const depth = requestedDepth == null ? 2 : requestedDepth;
+    const coneSymbols = index.getDependencyCone(symbolName, depth);
+    const coneSpans = coneSymbols.map(indexedSymbolToFileSpan);
+    return coneSpans.filter(isDefined);
+}
+/**
+ * Derive a span for an `edit_file` / `write_file` call.
+ *
+ * @param {object} call - Tool call; reads `call.input.path` and `offset`.
+ * @returns {Array<{file: string, startLine: number, endLine: number}>}
+ *   A single one-line span at `offset` when it is a positive number,
+ *   otherwise an empty list.
+ */
+function spansForMutation(call) {
+    // Mutations don't broaden the *exploration* set on their own: the final
+    // PATCH_CONTEXT covers what was changed. ContextBench's gold-context
+    // includes edit-location credit though, so a span for the touched file
+    // helps edits show up in the explored set too. Exact line ranges are not
+    // available here without re-reading the file, so the only signal used is
+    // an explicit positive `offset` input; without one the call is skipped.
+    const filePath = stringField(call.input, "path");
+    const offset = filePath ? numberField(call.input, "offset") : undefined;
+    const hasUsableOffset = offset !== undefined && offset > 0;
+    return hasUsableOffset
+        ? [{ file: filePath, startLine: offset, endLine: offset }]
+        : [];
+}
+/**
+ * Convert an indexed symbol into a file span.
+ *
+ * @param {object} [symbol] - Index entry; reads `filePath`, `startLine`,
+ *   `endLine`.
+ * @returns {{file: string, startLine: number, endLine: number} | undefined}
+ *   `undefined` when the symbol is missing or lacks a usable path or finite
+ *   line numbers, so callers can filter such entries out instead of emitting
+ *   `File: undefined` / `Lines: undefined-undefined` lines.
+ */
+function indexedSymbolToFileSpan(symbol) {
+    if (symbol === undefined || symbol === null)
+        return undefined;
+    const { filePath, startLine, endLine } = symbol;
+    const hasPath = typeof filePath === "string" && filePath.length > 0;
+    // Number.isFinite is false for non-numbers, so this also rejects
+    // missing or string-typed line fields.
+    if (!hasPath || !Number.isFinite(startLine) || !Number.isFinite(endLine))
+        return undefined;
+    return { file: filePath, startLine, endLine };
+}
+/**
+ * Filter predicate: true for every value except `null` and `undefined`.
+ *
+ * @param {*} value
+ * @returns {boolean}
+ */
+function isDefined(value) {
+    const isMissing = value === null || value === undefined;
+    return !isMissing;
+}
+/**
+ * Read a non-empty string field from a tool-call input object.
+ *
+ * @param {object} [input] - Tool-call input; may be null/undefined.
+ * @param {string} name - Field to read.
+ * @returns {string | undefined} The value when it is a non-empty string,
+ *   otherwise `undefined` (including when `input` itself is missing).
+ */
+function stringField(input, name) {
+    // Optional chaining keeps a call recorded without an input object from
+    // throwing; it simply has no fields.
+    const value = input?.[name];
+    return typeof value === "string" && value.length > 0 ? value : undefined;
+}
+/**
+ * Read a finite numeric field from a tool-call input object, accepting either
+ * a number or a numeric string.
+ *
+ * @param {object} [input] - Tool-call input; may be null/undefined.
+ * @param {string} name - Field to read.
+ * @returns {number | undefined} The finite number, or `undefined` when the
+ *   field is absent, blank, non-numeric, or not finite.
+ */
+function numberField(input, name) {
+    const value = input?.[name];
+    if (typeof value === "number")
+        return Number.isFinite(value) ? value : undefined;
+    // Number("") and Number("   ") are 0, so blank strings must be rejected
+    // up front; otherwise an empty field reads as an explicit zero and
+    // defeats callers' `?? default` fallbacks.
+    if (typeof value === "string" && value.trim().length > 0) {
+        const parsed = Number(value);
+        if (Number.isFinite(parsed))
+            return parsed;
+    }
+    return undefined;
+}
+/**
+ * Extract the line number from the last line of a tool result formatted as
+ * `<line>|<content>` per line.
+ *
+ * @param {*} result - Tool result; anything other than a string yields
+ *   `undefined`.
+ * @returns {number | undefined} The number prefixing the final non-blank
+ *   line, or `undefined` when that line carries no `<digits>|` prefix.
+ */
+function lastLineNumberInResult(result) {
+    // Results are not guaranteed to be text; a non-string has no line prefixes.
+    if (typeof result !== "string")
+        return undefined;
+    const trimmed = result.trimEnd();
+    if (trimmed.length === 0)
+        return undefined;
+    // lastIndexOf returns -1 for single-line text, so +1 slices from 0.
+    const lastLine = trimmed.slice(trimmed.lastIndexOf("\n") + 1);
+    const match = /^\s*(\d+)\|/.exec(lastLine);
+    if (!match)
+        return undefined;
+    const parsed = Number(match[1]);
+    return Number.isFinite(parsed) ? parsed : undefined;
+}
+/**
+ * Parse a unified diff into a list of (file, new-file-line-range) spans.
+ * Uses the NEW file side (`+` ranges) since that's where the agent's edits
+ * landed.
+ *
+ * A `+++ ` line is only treated as a file header when it directly follows a
+ * `--- ` line, so an added content line that itself starts with `++ `
+ * (rendered as `+++ ...` in the diff) does not hijack the current file.
+ * Hunks under a `/dev/null` target are ignored. A hunk with a new-side count
+ * of 0 is reported as a single-line span at its start line.
+ *
+ * @param {string} [patch] - Unified diff text; non-strings and blank text
+ *   yield an empty list.
+ * @returns {Array<{file: string, startLine: number, endLine: number}>}
+ *   One span per hunk, in patch order.
+ */
+export function parsePatchSpans(patch) {
+    if (typeof patch !== "string" || patch.trim().length === 0)
+        return [];
+    const spans = [];
+    let currentFile = "";
+    let previousLine = "";
+    for (const rawLine of patch.split(/\r?\n/)) {
+        const followsOldFileHeader = previousLine.startsWith("--- ");
+        previousLine = rawLine;
+        if (followsOldFileHeader && rawLine.startsWith("+++ ")) {
+            currentFile = stripDiffPathPrefix(rawLine.slice(4));
+            continue;
+        }
+        const hunk = rawLine.match(/^@@ -\d+(?:,\d+)? \+(\d+)(?:,(\d+))? @@/);
+        if (!hunk || !currentFile)
+            continue;
+        const start = Number(hunk[1]);
+        // An omitted count in a hunk header means exactly one line.
+        const count = hunk[2] !== undefined ? Number(hunk[2]) : 1;
+        const end = count === 0 ? start : start + count - 1;
+        spans.push({ file: currentFile, startLine: start, endLine: end });
+    }
+    return spans;
+}
+/**
+ * Normalize the path part of a `+++` header: drop any tab-separated
+ * timestamp, map `/dev/null` to the empty string, and strip a leading
+ * `b/` or `a/` prefix.
+ *
+ * @param {string} target - Header text after the `+++ ` marker.
+ * @returns {string} The cleaned path, or "" for `/dev/null`.
+ */
+function stripDiffPathPrefix(target) {
+    // Some diff producers append "\t<timestamp>" after the file name.
+    const path = target.split("\t", 1)[0].trim();
+    if (path === "/dev/null")
+        return "";
+    if (path.startsWith("b/") || path.startsWith("a/"))
+        return path.slice(2);
+    return path;
+}

package/dist/tests/agent.test.js CHANGED Viewed

@@ -194,29 +194,25 @@ test("agent does not treat repeated validation commands after edits as a loop",
     assertToolCallTranscriptIsComplete(agent.getSession().getMessages());
 });
 test("agent still stops on repeated identical mutations", async () => {
-    const responses = [
-        {
-            text: "first edit",
-            toolCalls: [{ id: "edit-1", name: "edit_file", input: { path: "app.ts", content: "same" } }],
-            stopReason: "tool_use",
-            usage: { inputTokens: 1, outputTokens: 1 },
-        },
-        {
-            text: "second edit",
-            toolCalls: [{ id: "edit-2", name: "edit_file", input: { path: "app.ts", content: "same" } }],
-            stopReason: "tool_use",
-            usage: { inputTokens: 1, outputTokens: 1 },
-        },
-        {
-            text: "third edit",
-            toolCalls: [{ id: "edit-3", name: "edit_file", input: { path: "app.ts", content: "same" } }],
-            stopReason: "tool_use",
-            usage: { inputTokens: 1, outputTokens: 1 },
-        },
-    ];
+    // A model that keeps emitting the same edit_file forever should be
+    // hard-stopped after the soft guard has fired 3 times in the turn.
+    class RepeatingEditClient {
+        idx = 0;
+        async chat() {
+            this.idx += 1;
+            return {
+                text: "edit",
+                toolCalls: [
+                    { id: `edit-${this.idx}`, name: "edit_file", input: { path: "app.ts", content: "same" } },
+                ],
+                stopReason: "tool_use",
+                usage: { inputTokens: 1, outputTokens: 1 },
+            };
+        }
+    }
     const agent = new CodingAgent({
         config: createTestAgentConfig("/tmp"),
-        modelClient: new SequenceModelClient(responses),
+        modelClient: new RepeatingEditClient(),
         toolRegistry: new ToolRegistry([createEditTool()]),
     });
     const { text } = await agent.runTurn("Edit repeatedly");

package/dist/tests/benchmark-run.test.js CHANGED Viewed

@@ -1,12 +1,24 @@
 import assert from "node:assert/strict";
 import { test } from "node:test";
-import { buildBenchmarkToolTrace, getBenchmarkSystemPromptSuffix, isBenchmarkApprovalSeekingResponse, parseBenchmarkRunArgs, summarizeBenchmarkToolUsage, } from "../src/cli/benchmark-run.js";
+import { buildBenchmarkToolTrace, buildPriorReasoningContext, countMutationsInMessages, getBenchmarkRetryReason, getBenchmarkRetryReminder, getBenchmarkSystemPromptSuffix, isBenchmarkApprovalSeekingResponse, isRetryForceToolsEnabled, looksLikeShellFileMutation, parseBenchmarkRunArgs, summarizeBenchmarkToolUsage, } from "../src/cli/benchmark-run.js";
 test("benchmark system prompt suffix clearly disables approval-seeking behavior", () => {
     const suffix = getBenchmarkSystemPromptSuffix();
-    assert.match(suffix, /non-interactive benchmark harness/i);
+    assert.match(suffix, /non-interactive harness/i);
     assert.match(suffix, /already approved/i);
     assert.match(suffix, /do not ask for confirmation/i);
 });
+test("benchmark system prompt suffix overrides iteration discipline for long-form tasks", () => {
+    // Added after observing on CCBench that the base [Iteration Discipline]
+    // "3-5 calls then commit" guidance was driving premature completion on
+    // benchmark tasks that genuinely require 30+ iterate-test-fix cycles.
+    // Both gemini-3-flash and haiku-4.5 declared "I have implemented" before
+    // verifying their changes against the canonical test suite.
+    const suffix = getBenchmarkSystemPromptSuffix();
+    assert.match(suffix, /persistent iteration|30\+ tool-call/i, "should set the expectation that persistent iteration is normal");
+    assert.match(suffix, /canonical test runner/i, "should direct the model to the canonical test runner, not ad-hoc tests");
+    assert.match(suffix, /full existing test suite/i, "should require running the full suite to catch regressions");
+    assert.match(suffix, /explicit green signal/i, "should require an observed pass signal, not self-assessed completion");
+});
 test("benchmark system prompt suffix omits runtime budget knobs", () => {
     const suffix = getBenchmarkSystemPromptSuffix();
     assert.doesNotMatch(suffix, /maxSteps/i);
@@ -23,6 +35,161 @@ test("normal benchmark summaries are not treated as approval-seeking", () => {
     assert.equal(isBenchmarkApprovalSeekingResponse("Updated src/app.ts, ran npm test once, and all tests passed."), false);
     assert.equal(isBenchmarkApprovalSeekingResponse("The task is blocked because the repository does not contain the referenced file."), false);
 });
+test("getBenchmarkRetryReason flags zero-tool-call attempts as no_action", () => {
+    // Pure-reasoning failure (Gemini 2.5 Pro's "thought a lot, emitted
+    // nothing" mode). Despite empty text, the attempt is still a definite
+    // failure in benchmark mode since the task needs code changes.
+    assert.equal(getBenchmarkRetryReason({ text: "", toolCallCount: 0, mutationCount: 0 }), "no_action");
+    // Hallucinated-completion failure: the model narrates work without
+    // making any tool calls. Caught the same way — zero tool calls is
+    // the load-bearing signal, not the text.
+    assert.equal(getBenchmarkRetryReason({
+        text: "I've added the new transformation to astropy/coordinates/itrs.py and registered it with the frame_transform_graph.",
+        toolCallCount: 0,
+        mutationCount: 0,
+    }), "no_action");
+    // Future-tense planning without action — also covered.
+    assert.equal(getBenchmarkRetryReason({
+        text: "I will add the helper function to utils.py and update the imports.",
+        toolCallCount: 0,
+        mutationCount: 0,
+    }), "no_action");
+});
+test("getBenchmarkRetryReason flags approval-seeking when tool calls exist", () => {
+    assert.equal(getBenchmarkRetryReason({
+        text: "I found the changes needed. Please confirm and I'll apply them.",
+        toolCallCount: 5,
+        mutationCount: 0,
+    }), "approval_seeking");
+});
+test("getBenchmarkRetryReason flags plan-only attempts as no_mutation", () => {
+    // Observed on Gemini 2.5 Pro 71f348da: 3 read-only tool calls
+    // (search_code_map + 2 read_symbol) followed by a future-tense plan
+    // ("Here's how I'll fix it: 1. Read X. 2. Modify Y. 3. I'll replace Z.")
+    // and no edit. Approval-seeking detector doesn't fire (no "please confirm")
+    // and no_action doesn't fire (toolCallCount > 0) — needs a third signal.
+    assert.equal(getBenchmarkRetryReason({
+        text: "Here's how I'll fix it: 1. Read sliced_wcs.py. 2. Modify world_to_pixel_values. 3. I'll replace the 1. fallback.",
+        toolCallCount: 3,
+        mutationCount: 0,
+    }), "no_mutation");
+});
+test("getBenchmarkRetryReason returns null when mutations occurred", () => {
+    assert.equal(getBenchmarkRetryReason({
+        text: "Updated src/app.ts, ran npm test, all tests passed.",
+        toolCallCount: 12,
+        mutationCount: 2,
+    }), null);
+});
+test("getBenchmarkRetryReason prioritizes no_action over no_mutation", () => {
+    // Defensive: a zero-tool-call attempt also has zero mutations, but the
+    // reminder for no_action is more specific. Make sure that path wins.
+    assert.equal(getBenchmarkRetryReason({
+        text: "I will edit utils.py.",
+        toolCallCount: 0,
+        mutationCount: 0,
+    }), "no_action");
+});
+test("getBenchmarkRetryReminder returns distinct reminders for each reason", () => {
+    const approval = getBenchmarkRetryReminder("approval_seeking");
+    const noAction = getBenchmarkRetryReminder("no_action");
+    const noMutation = getBenchmarkRetryReminder("no_mutation");
+    // Approval reminder leans on "already approved" — the model was acting
+    // but asked for permission.
+    assert.match(approval, /already approved/i);
+    assert.doesNotMatch(approval, /zero tool calls/i);
+    // No-action reminder names the failure mode explicitly so the model
+    // understands what changed.
+    assert.match(noAction, /zero tool calls/i);
+    assert.match(noAction, /edit_file/);
+    // Calls out both the past-tense and future-tense narration traps that
+    // were observed in the Gemini 2.5 Pro empty-trajectory investigation.
+    assert.match(noAction, /past-tense/i);
+    assert.match(noAction, /future-tense/i);
+    // No-mutation reminder is distinct — the model DID call tools, just
+    // never edited anything. It should mention reading-without-editing,
+    // and acknowledge shell-based edits as legitimate (since some models
+    // prefer `cat > file` over edit_file).
+    assert.match(noMutation, /read files but never edited/i);
+    assert.match(noMutation, /cat > path|sed -i/);
+    assert.notEqual(noMutation, noAction);
+    assert.notEqual(noMutation, approval);
+});
+test("looksLikeShellFileMutation detects common file-writing shells", () => {
+    // Heredoc into file — the gemini-3-pro 71f348da pattern.
+    assert.equal(looksLikeShellFileMutation("cat > path/to/file.py <<'EOF'\nbody\nEOF"), true);
+    // Append redirect with heredoc.
+    assert.equal(looksLikeShellFileMutation("cat << 'EOF' >> tests/foo.py\nbody\nEOF"), true);
+    // In-place sed.
+    assert.equal(looksLikeShellFileMutation("sed -i 's/old/new/g' foo.py"), true);
+    // tee.
+    assert.equal(looksLikeShellFileMutation("echo x | tee path/to/file"), true);
+    // Python open().write().
+    assert.equal(looksLikeShellFileMutation('python -c "open(\'foo.py\', \'w\').write(\'body\')"'), true);
+});
+test("looksLikeShellFileMutation rejects read-only and benign redirects", () => {
+    // Pure read.
+    assert.equal(looksLikeShellFileMutation("cat path/to/file.py"), false);
+    // Pipe + cat with output to /dev/null.
+    assert.equal(looksLikeShellFileMutation("python script.py > /dev/null 2>&1"), false);
+    // File descriptor redirect (no file write).
+    assert.equal(looksLikeShellFileMutation("python script.py 2>&1"), false);
+    // pytest invocation — no redirect, no in-place edit.
+    assert.equal(looksLikeShellFileMutation("python -m pytest tests/foo.py"), false);
+    // git commands operate on the index, not arbitrary file writes — we
+    // don't count them as code mutations.
+    assert.equal(looksLikeShellFileMutation("git checkout tests/foo.py"), false);
+    assert.equal(looksLikeShellFileMutation("git add ."), false);
+});
+test("buildPriorReasoningContext returns empty when no reasoning was captured", () => {
+    assert.equal(buildPriorReasoningContext(undefined), "");
+    assert.equal(buildPriorReasoningContext(""), "");
+    assert.equal(buildPriorReasoningContext("   \n\n   "), "");
+});
+test("buildPriorReasoningContext wraps the prior reasoning with framing", () => {
+    // The wrapper tells the model this is its own prior thinking, and
+    // nudges it to act on the reasoning rather than re-deliberate.
+    const block = buildPriorReasoningContext("The bug is the hardcoded 1.0 fallback in sliced_wcs.py line 254.");
+    assert.match(block, /your previous attempt/i);
+    assert.match(block, /<<<PRIOR_REASONING>>>/);
+    assert.match(block, /<<<END_PRIOR_REASONING>>>/);
+    assert.match(block, /hardcoded 1\.0 fallback/);
+    assert.match(block, /apply that reasoning/i);
+});
+test("buildPriorReasoningContext truncates very long reasoning", () => {
+    // 12k reasoning tokens from a pure-thinking collapse would explode
+    // the next attempt's input cost. Cap at 2000 chars so the retry stays
+    // affordable while still preserving the high-level plan.
+    const long = "x".repeat(5000);
+    const block = buildPriorReasoningContext(long);
+    assert.match(block, /more chars of reasoning truncated/);
+    // The wrapped block should NOT contain the full 5000 chars worth of x's.
+    const xCount = (block.match(/x/g) ?? []).length;
+    assert.ok(xCount < 5000, `expected truncation, got ${xCount} x chars`);
+    assert.ok(xCount >= 2000, `expected at least 2000 x chars preserved, got ${xCount}`);
+});
+test("countMutationsInMessages counts structured + shell mutations together", () => {
+    const messages = [
+        { role: "user", content: "task" },
+        {
+            role: "assistant",
+            content: "",
+            toolCalls: [
+                { id: "1", name: "read_file", input: { path: "a.py" } },
+                { id: "2", name: "edit_file", input: { path: "a.py", old_string: "x", new_string: "y" } },
+                {
+                    id: "3",
+                    name: "run_command",
+                    input: { command: "cat > b.py <<'EOF'\nprint('hi')\nEOF" },
+                },
+                { id: "4", name: "run_command", input: { command: "python -m pytest" } },
+            ],
+        },
+        { role: "assistant", content: "done" },
+    ];
+    // edit_file + the heredoc command count; read_file and pytest don't.
+    assert.equal(countMutationsInMessages(messages), 2);
+});
 test("parseBenchmarkRunArgs preserves prompt text and benchmark flags", () => {
     const args = parseBenchmarkRunArgs([
         "--verbose",
@@ -141,9 +308,42 @@ test("benchmark tool usage summary separates structured tools from file reads",
     assert.equal(summary.specializedByName.get_dependencies, 1);
     assert.equal(summary.fileReadTotal, 1);
     assert.equal(summary.mutationTotal, 1);
+    assert.equal(summary.shellMutationTotal, 0);
     assert.equal(summary.commandTotal, 1);
     assert.deepEqual(summary.repeatedToolCalls, []);
 });
+test("benchmark tool usage summary counts shell-based file edits as shellMutation", () => {
+    // Gemini-3-Pro pattern: `cat > FILE <<EOF` heredoc instead of edit_file.
+    // mutationTotal stays at the structured-tool count; shellMutationTotal
+    // exposes the shell-based edits without inflating mutationTotal.
+    const summary = summarizeBenchmarkToolUsage([
+        {
+            step: 1,
+            name: "run_command",
+            input: { command: "cat > foo.py <<'EOF'\nprint('hi')\nEOF" },
+            result: "ok",
+            skipped: false,
+        },
+        {
+            step: 2,
+            name: "run_command",
+            input: { command: "sed -i 's/old/new/g' bar.py" },
+            result: "ok",
+            skipped: false,
+        },
+        {
+            step: 3,
+            name: "run_command",
+            input: { command: "python -m pytest" },
+            result: "ok",
+            skipped: false,
+        },
+    ], "Done");
+    assert.equal(summary.total, 3);
+    assert.equal(summary.commandTotal, 3);
+    assert.equal(summary.mutationTotal, 0);
+    assert.equal(summary.shellMutationTotal, 2);
+});
 test("benchmark tool usage summary reports repeated-call stops", () => {
     const summary = summarizeBenchmarkToolUsage([
         {
@@ -178,3 +378,24 @@ test("benchmark tool usage summary reports repeated-call stops", () => {
         },
     ]);
 });
+// ---------------------------------------------------------------------------
+// isRetryForceToolsEnabled
+// ---------------------------------------------------------------------------
+test("isRetryForceToolsEnabled defaults to true when env var unset", () => {
+    assert.equal(isRetryForceToolsEnabled({}), true);
+});
+test("isRetryForceToolsEnabled defaults to true when env var is empty/whitespace", () => {
+    assert.equal(isRetryForceToolsEnabled({ BENCHMARK_RETRY_FORCE_TOOLS: "" }), true);
+    assert.equal(isRetryForceToolsEnabled({ BENCHMARK_RETRY_FORCE_TOOLS: "   " }), true);
+});
+test("isRetryForceToolsEnabled returns false for explicit disable values", () => {
+    for (const value of ["0", "false", "no", "off", "FALSE", "NO", "Off"]) {
+        assert.equal(isRetryForceToolsEnabled({ BENCHMARK_RETRY_FORCE_TOOLS: value }), false, `expected false for input ${JSON.stringify(value)}`);
+    }
+});
+test("isRetryForceToolsEnabled returns true for affirmative or unrecognized values", () => {
+    // Unrecognized values fall back to "on" — keeping the safe default for typos.
+    for (const value of ["1", "true", "yes", "on", "TRUE", "anything"]) {
+        assert.equal(isRetryForceToolsEnabled({ BENCHMARK_RETRY_FORCE_TOOLS: value }), true, `expected true for input ${JSON.stringify(value)}`);
+    }
+});