npm - @pentoshi/clai - Versions diffs - 1.1.4 → 1.2.1 - Mend

@pentoshi/clai 1.1.4 → 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

package/dist/agent/runner.d.ts +15 -0
package/dist/agent/runner.js +387 -296
package/dist/agent/runner.js.map +1 -1
package/dist/commands/update.js +1 -1
package/dist/prompts/index.d.ts +2 -2
package/dist/prompts/index.js +107 -383
package/dist/prompts/index.js.map +1 -1
package/dist/repl.js +109 -26
package/dist/repl.js.map +1 -1
package/dist/safety/classifier.d.ts +7 -0
package/dist/safety/classifier.js +89 -79
package/dist/safety/classifier.js.map +1 -1
package/dist/safety/patterns.d.ts +15 -0
package/dist/safety/patterns.js +47 -20
package/dist/safety/patterns.js.map +1 -1
package/dist/tools/command-intent.js +22 -4
package/dist/tools/command-intent.js.map +1 -1
package/dist/tools/registry.d.ts +7 -0
package/dist/tools/registry.js +89 -6
package/dist/tools/registry.js.map +1 -1
package/dist/ui/thinking.d.ts +10 -1
package/dist/ui/thinking.js +81 -17
package/dist/ui/thinking.js.map +1 -1
package/package.json +1 -1

package/dist/agent/runner.js CHANGED Viewed

@@ -380,6 +380,53 @@ export function countToolFences(text) {
     const matches = text.match(/```tool\s*\n[\s\S]*?```/gi);
     return matches ? matches.length : 0;
 }
+/**
+ * Parse EVERY explicitly-delimited tool call in a message, in document
+ * order. Unlike parseToolCall (which returns only the first), this lets the
+ * runner execute a batch the model emitted in one turn — e.g. the natural
+ * "task.update in_progress → do the work → task.update done" sequence, or
+ * several fs.write calls. Only the unambiguous, delimited formats are
+ * collected (```tool fences, <tool_call> XML, and Kimi sentinel blocks) so a
+ * worked example in prose is far less likely to be mistaken for a call.
+ * The runner executes them sequentially and STOPS the batch on the first
+ * failure so the model can react, mirroring how Claude Code batches reads
+ * and edits but pauses when something breaks.
+ *
+ * How the scan works:
+ *  - Three independent passes run over the same `text`, one per format:
+ *    ```tool fences, <tool_call>…</tool_call> elements, and a fresh
+ *    global + case-insensitive RegExp built from KIMI_TOOL_CALL_RE.source
+ *    (a new object, so the shared constant's own lastIndex is never touched).
+ *  - The captured body of each fence / XML match is handed to tryParseCall;
+ *    a match for which tryParseCall returns a falsy value is skipped.
+ *  - For Kimi matches, capture group 1 is used as the tool name and group 2
+ *    as the raw arguments text. The arguments go through tryJson and fall
+ *    back to an empty object when that yields null/undefined; the pair is
+ *    then re-serialised as {name, args} JSON so it flows through the same
+ *    tryParseCall path as the other formats.
+ *  - Every accepted call is recorded with the match's start offset, and the
+ *    combined list is sorted by that offset so calls come back in the order
+ *    they appear in the message regardless of which pass found them.
+ *
+ * NOTE(review): the passes do not de-duplicate against each other, so a
+ * delimited block nested inside another delimited block (e.g. a ```tool
+ * fence inside <tool_call> tags) would be collected once per pass — confirm
+ * whether callers need to guard against that.
+ *
+ * @param {string} text - Message text to scan (passed directly to
+ *   RegExp.prototype.exec).
+ * @returns {Array} The parsed call objects produced by tryParseCall, ordered
+ *   by ascending match position; an empty array when nothing matched or
+ *   nothing parsed.
+ */
+export function parseAllToolCalls(text) {
+    const found = [];
+    let m;
+    const fenceRe = /```tool\s*\n?([\s\S]*?)```/gi;
+    while ((m = fenceRe.exec(text)) !== null) {
+        const call = tryParseCall(m[1] ?? "");
+        if (call)
+            found.push({ index: m.index, call });
+    }
+    const xmlRe = /<tool_call>([\s\S]*?)<\/tool_call>/gi;
+    while ((m = xmlRe.exec(text)) !== null) {
+        const call = tryParseCall(m[1] ?? "");
+        if (call)
+            found.push({ index: m.index, call });
+    }
+    const kimiRe = new RegExp(KIMI_TOOL_CALL_RE.source, "gi");
+    while ((m = kimiRe.exec(text)) !== null) {
+        const call = tryParseCall(JSON.stringify({ name: m[1], args: tryJson(m[2] ?? "{}") ?? {} }));
+        if (call)
+            found.push({ index: m.index, call });
+    }
+    found.sort((a, b) => a.index - b.index);
+    return found.map((f) => f.call);
+}
+/**
+ * Equality check for two tool calls: same `name` and identical
+ * JSON.stringify output for `args`.
+ *
+ * The names are compared with strict inequality first; only when they match
+ * are the two `args` values serialised and compared as strings.
+ *
+ * Caveats visible from the implementation:
+ *  - The comparison is NOT key-order independent. JSON.stringify emits
+ *    properties in the object's own enumeration order, so two args objects
+ *    holding the same keys and values in a different order compare as
+ *    different calls.
+ *  - If serialising either `args` throws (JSON.stringify does so for
+ *    circular structures and BigInt values), the calls are reported as
+ *    different rather than propagating the error.
+ *  - Only the serialisation is guarded: reading `a.name` / `b.name` happens
+ *    outside the try block, so a null or undefined argument still throws.
+ *
+ * @param {{name: *, args: *}} a - First tool call.
+ * @param {{name: *, args: *}} b - Second tool call.
+ * @returns {boolean} true when both the names and the serialised args match.
+ */
+export function sameToolCall(a, b) {
+    if (a.name !== b.name)
+        return false;
+    try {
+        return JSON.stringify(a.args) === JSON.stringify(b.args);
+    }
+    catch {
+        return false;
+    }
+}
 /** Extract the text before the tool call block for display purposes */
 function textBeforeToolCall(text) {
     const patterns = [
@@ -529,7 +576,8 @@ function freshnessGuardMessage(now = new Date()) {
     return (`Freshness guard for this turn: the latest user prompt appears to ask for current, volatile, or externally verifiable information. The present moment is ${currentDateTimeContext(now)}. ` +
         "Before answering, call web.search FIRST with a concise query derived from the user prompt. " +
         "Shape the search query for the newest timeline by including current/latest or the current year/month when useful. " +
-        "Use the search results to answer. If web.search fails or has no results, say that current information is unavailable instead of guessing from memory.");
+        "Do not answer from the snippets alone when detail matters — set fetchTop (e.g. fetchTop:2) to read the top result pages, or follow up with web.fetch on the most relevant URL, then answer from what the pages actually say and cite them. " +
+        "If web.search fails or has no results, say that current information is unavailable instead of guessing from memory.");
 }
 /**
  * Directive injected for build/scaffold turns. Forces the careful
@@ -542,7 +590,7 @@ function buildWorkflowDirective() {
         "1. EXPLORE: fs.list the working directory (and key subdirs) to see what already exists. Use tool.batch to parallelize reads.",
         "2. UNDERSTAND: fs.read the files that matter (like package.json for js related and same for other languages too, config, entry points, existing components). Detect the existing stack/tooling and MATCH it. If the dir is empty or only has a stub, start fresh with a sensible modern default and say so.",
         "3. PLAN: call plan.create with a COMPREHENSIVE plan — a detailed `detail` (stack chosen and WHY, architecture, how you'll verify) and 4-8 SEPARATE, ordered, high-quality tasks. The FIRST task initializes the project (scaffolder); the MIDDLE tasks MUST implement the ACTUAL FEATURE the user asked for by REPLACING the scaffolder's boilerplate (e.g. rewrite src/App.jsx into the real todo/blog/etc. UI, add components, state, styles); the LAST task verifies with a build. Scaffolding + install + run ALONE is NOT acceptable — that just leaves the Vite starter page. Each task is one distinct, verifiable action. Then STOP and wait for the user to /implement.",
-        "4. IMPLEMENT: once approved, work task by task in STRICT ORDER across MULTIPLE steps, ONE tool call per turn. For each task: call task.update {taskId, state:'in_progress'} → do the real work → VERIFY it actually succeeded (read a file you wrote, check the command's exit/output) → call task.update {taskId, state:'done'}, then move to the NEXT task. Keep going until EVERY task is done. Do NOT stop after one step, and do NOT claim work you didn't actually run.",
+        "4. IMPLEMENT: once approved, work task by task in STRICT ORDER. For each task: call task.update {taskId, state:'in_progress'} → do the real work → VERIFY it actually succeeded (read a file you wrote, check the command's exit/output) → call task.update {taskId, state:'done'}, then move to the NEXT task. You MAY emit several tool calls in one message and they run in order, top to bottom (the batch STOPS if one fails). A clean rhythm is: task.update in_progress + the work + task.update done together. Keep going until EVERY task is done. Do NOT claim work you didn't actually run.",
         "",
         "INITIALIZE WITH THE OFFICIAL SCAFFOLDER FIRST (do NOT hand-write build configs):",
         "- React/Vue/Svelte/vanilla → `npm create vite@latest <appname> -- --template react` (templates: react, react-ts, vue, vue-ts, svelte, vanilla). Next.js → `npx --yes create-next-app@latest <appname> --yes --eslint --no-tailwind --app --src-dir --import-alias \"@/*\"`. Node API → `npm init -y`.",
@@ -552,7 +600,7 @@ function buildWorkflowDirective() {
         "- VERIFY the init actually worked before marking the task done: fs.read package.json (it must now exist AND list react + react-dom) and fs.read index.html (it must reference your jsx entry). 'Operation cancelled' / non-zero exit means the task FAILED — do not proceed as if it succeeded.",
         "",
         "CRITICAL RULES during IMPLEMENTATION:",
-        "- EXACTLY ONE ```tool block per message. NEVER put several tool calls (e.g. fs.writeMany + npm install + npm run dev) in one response — only the first runs and the rest are silently discarded, which is how false 'all done' claims happen.",
+        "- You may batch tool calls: emit one or several ```tool blocks in a message and they run in order, top to bottom. If any call fails, the rest of that batch is cancelled so you can react — so order dependent steps correctly and keep batches focused. A good batch is task.update(in_progress) + the work + task.update(done) for ONE task.",
         "- Do NOT re-explore. Step 1 (EXPLORE) was already completed during planning. Start executing the first pending task immediately.",
         "- ONE task at a time, in ORDER. Do NOT skip ahead to task 3 before task 2 is done.",
         "- KEEP EACH FILE SMALL ENOUGH TO WRITE IN ONE CALL. If a fs.write is reported as 'cut off (output too long)', the file was NOT fully written and is likely broken/invalid — re-write it, splitting a large component into smaller files if needed. NEVER leave a half-written file and move on.",
@@ -975,6 +1023,17 @@ export async function runAgentLoop(prompt, options = {}) {
     // tasks are still pending and it never ran the work. We nudge it back to
     // executing the next task a bounded number of times before giving up.
     let prematureCompletionRetries = 0;
+    // ── Multi-tool execution queue ─────────────────────────────────────
+    // Models naturally emit several tool calls in one message — e.g. the
+    // plan-execution rhythm "task.update in_progress → do the work →
+    // task.update done", or a batch of fs.write calls. Rather than running
+    // only the first and discarding the rest (which made models believe work
+    // ran when it didn't, and broke plan execution), we parse ALL calls in a
+    // message, run the first this iteration, and queue the rest here to run on
+    // subsequent iterations WITHOUT another model round-trip. The queue is
+    // cleared whenever a call fails, is blocked, or needs the model to react,
+    // so the model always sees errors and stays in control.
+    let pendingCalls = [];
     // ── Step budget ───────────────────────────────────────────────────
     // The budget governs how many *productive* steps (a tool execution or a
     // final answer) the agent may take. Recovery iterations — nudging a model
@@ -1018,301 +1077,343 @@ export async function runAgentLoop(prompt, options = {}) {
         if (productiveSteps >= stepBudget)
             break;
         options.signal?.throwIfAborted();
-        // Buffer LLM output so tool JSON and hidden thinking are not printed raw.
-        // Status messages (rate-limit retries, fallback hints) still surface live.
-        // A spinner gives the user feedback during long thinking phases on
-        // models like glm-5.1 / deepseek-v4-flash that stream reasoning first.
-        const spinner = startThinkingSpinner(step === 0 ? "waiting for model" : `step ${step + 1}`, options.signal);
-        let sawReasoning = false;
-        let inThinking = false;
-        let completion;
-        try {
-            completion = await streamWithProvider({
-                provider,
-                model,
-                messages,
-                temperature: 0.2,
-                // Reasoning models can spend a lot on hidden thinking; give
-                // them headroom so the visible answer / tool call isn't
-                // truncated to silence. The non-thinking budget must be large
-                // enough for a single-file fs.write / multi-file fs.writeMany
-                // payload — a truncated tool-call JSON fails to parse and leaks a
-                // broken (and syntactically invalid) file. 8k was too small for a
-                // full component, so allow more room for the visible tool call.
-                maxTokens: config.thinking?.enabled ? 16_384 : 12_288,
-                signal: options.signal,
-                thinking: config.thinking,
-            }, (token) => {
-                // Heuristic: <think>… markers and reasoning_content tokens flow
-                // through onToken. Surface activity in the spinner so the screen
-                // is never empty for minutes.
-                if (!sawReasoning && /<think/i.test(token)) {
-                    sawReasoning = true;
-                    inThinking = true;
-                    spinner.setLabel("thinking");
-                }
-                if (/<\/think>/i.test(token)) {
-                    inThinking = false;
-                }
-                // Only push reasoning tokens to the spinner preview. Visible
-                // answer / tool-call tokens should NOT go through the dim
-                // spinner preview — doing so makes the final answer appear
-                // "diluted" in light font when the spinner's last render
-                // briefly shows the answer text before being erased.
-                if (inThinking) {
-                    const cleaned = token.replace(/<\/?think[^>]*>/gi, "");
-                    if (cleaned) {
-                        spinner.pushPreview(cleaned);
-                        const approx = cleaned.split(/\s+/).filter(Boolean).length;
-                        if (approx > 0)
-                            spinner.bumpReasoning(approx);
-                    }
-                }
-            }, (status) => {
-                spinner.stop();
-                process.stdout.write(chalk.dim(status));
-            });
-        }
-        finally {
-            // Always clear the spinner — abort, network error, or success.
-            spinner.stop();
-        }
-        provider = completion.provider;
-        model = completion.model;
-        const assistantText = rememberThinkingFromText(completion.text);
-        // Try visible text first, then thinking content — some models (e.g. glm-5.1)
-        // wrap tool calls inside  considering tags, so stripThinking removes them
-        // into thinkContent and visible becomes empty. Recovering from thinkContent
-        // prevents an endless nudge loop where the model keeps hiding the call.
-        let call = parseToolCall(assistantText.visible, {
-            strict: getConfig().parserStrict,
-        });
-        if (!call && assistantText.hasThinking) {
-            call = parseToolCall(assistantText.thinkContent, {
-                strict: getConfig().parserStrict,
-            });
-            if (call) {
-                process.stdout.write(chalk.dim("  ℹ recovered tool call from thinking content\n"));
-            }
-        }
-        // ── Thinking-only recovery ────────────────────────────────────────
-        // Some models (eg gpt-oss-20b on NVIDIA NIM) occasionally spend their
-        // entire budget on hidden <think> reasoning and emit no visible text
-        // or tool call. Without this guard the agent silently returns an empty
-        // answer and the user has to re-submit the same prompt.
-        if (!assistantText.visible.trim() && !call && assistantText.hasThinking) {
-            emptyVisibleRetries += 1;
-            if (emptyVisibleRetries <= 2) {
-                process.stdout.write(`${renderThinkingSummary(assistantText.thinkContent)}\n`);
-                process.stdout.write(chalk.yellow("  ⚠ model produced only thinking — nudging it to take action\n"));
-                messages.push({ role: "assistant", content: completion.text });
-                const buildNudge = buildLikeTurn && !activePlan
-                    ? "You only produced internal reasoning with no visible answer or tool call. " +
-                        "This is a BUILD/SCAFFOLD task with NO plan yet. " +
-                        "You MUST call plan.create using the ```tool format to create a comprehensive plan BEFORE writing any files or running any commands. " +
-                        "Do NOT use fs.write, fs.writeMany, fs.edit, shell.exec, shell.start, or pkg.install yet. " +
-                        "Your ONLY allowed action right now is plan.create (or read/list for exploration)."
-                    : "You only produced internal reasoning with no visible answer or tool call. " +
-                        "You MUST either call a tool using the ```tool format or provide your final answer. " +
-                        "Do NOT wrap your tool call inside  considering or reasoning tags — put it in the VISIBLE response, not hidden. " +
-                        "If images are attached, inspect them directly for visual details (text, colors, layout, spacing, style) instead of using OCR unless explicitly needed. " +
-                        "Do NOT just think — take action NOW.";
-                messages.push(recoveryUserMessage(buildNudge));
-                continue;
-            }
-            // Exhausted retries — fall through to the normal empty-answer path
-            // which will print a warning and return.
+        // `call` and `assistantText` are shared by both paths below: a fresh
+        // model round-trip, or draining a previously-queued tool call.
+        let call;
+        let assistantText;
+        let recoveredFromBareJson = false;
+        if (pendingCalls.length > 0) {
+            // Drain the next queued call from the previous model message — no new
+            // round-trip. The assistant message and any prose were already shown
+            // when the batch was parsed.
+            call = pendingCalls.shift();
+            assistantText = { visible: "", thinkContent: "", hasThinking: false };
+            process.stdout.write(chalk.dim(`  ↳ continuing batch (${pendingCalls.length} more queued)\n`));
         }
         else {
-            // Reset the counter on any successful visible output or recovered call.
-            emptyVisibleRetries = 0;
-        }
-        // `call` was already extracted above (from visible text or thinking content).
-        // Recovery: the model meant to call a tool but emitted a bare JSON object
-        // with no ```tool fence — either a complete {name,args} the strict
-        // matchers missed (recover it directly), or just an args object like
-        // {"path":"file.pdf"} with the wrapper dropped (nudge a retry below so
-        // the requested action runs instead of the JSON leaking as the answer).
-        let bareArgsOnly = false;
-        let recoveredFromBareJson = false;
-        if (!call) {
-            const bare = recognizeBareToolJson(assistantText.visible);
-            if (bare?.call) {
-                call = bare.call;
-                recoveredFromBareJson = true;
-                process.stdout.write(chalk.dim("  ℹ recovered an unfenced tool call from bare JSON\n"));
-            }
-            else if (bare?.argsOnly) {
-                bareArgsOnly = true;
+            // Buffer LLM output so tool JSON and hidden thinking are not printed raw.
+            // Status messages (rate-limit retries, fallback hints) still surface live.
+            // A spinner gives the user feedback during long thinking phases on
+            // models like glm-5.1 / deepseek-v4-flash that stream reasoning first.
+            const spinner = startThinkingSpinner(step === 0 ? "waiting for model" : `step ${step + 1}`, options.signal);
+            let sawReasoning = false;
+            let inThinking = false;
+            let completion;
+            try {
+                completion = await streamWithProvider({
+                    provider,
+                    model,
+                    messages,
+                    temperature: 0.2,
+                    // Reasoning models can spend a lot on hidden thinking; give
+                    // them headroom so the visible answer / tool call isn't
+                    // truncated to silence. The non-thinking budget must be large
+                    // enough for a single-file fs.write / multi-file fs.writeMany
+                    // payload — a truncated tool-call JSON fails to parse and leaks a
+                    // broken (and syntactically invalid) file. 8k was too small for a
+                    // full component, so allow more room for the visible tool call.
+                    maxTokens: config.thinking?.enabled ? 16_384 : 12_288,
+                    signal: options.signal,
+                    thinking: config.thinking,
+                }, (token) => {
+                    // Heuristic: <think>… markers and reasoning_content tokens flow
+                    // through onToken. Surface activity in the spinner so the screen
+                    // is never empty for minutes.
+                    if (!sawReasoning && /<think/i.test(token)) {
+                        sawReasoning = true;
+                        inThinking = true;
+                        spinner.setLabel("thinking");
+                    }
+                    if (/<\/think>/i.test(token)) {
+                        inThinking = false;
+                    }
+                    // Only push reasoning tokens to the spinner preview. Visible
+                    // answer / tool-call tokens should NOT go through the dim
+                    // spinner preview — doing so makes the final answer appear
+                    // "diluted" in light font when the spinner's last render
+                    // briefly shows the answer text before being erased.
+                    if (inThinking) {
+                        const cleaned = token.replace(/<\/?think[^>]*>/gi, "");
+                        if (cleaned) {
+                            spinner.pushPreview(cleaned);
+                            const approx = cleaned.split(/\s+/).filter(Boolean).length;
+                            if (approx > 0)
+                                spinner.bumpReasoning(approx);
+                        }
+                    }
+                }, (status) => {
+                    spinner.stop();
+                    process.stdout.write(chalk.dim(status));
+                });
             }
-        }
-        // Also check thinking content for bare JSON calls.
-        if (!call && assistantText.hasThinking) {
-            const bareThink = recognizeBareToolJson(assistantText.thinkContent);
-            if (bareThink?.call) {
-                call = bareThink.call;
-                recoveredFromBareJson = true;
-                process.stdout.write(chalk.dim("  ℹ recovered an unfenced tool call from thinking content\n"));
+            finally {
+                // Always clear the spinner — abort, network error, or success.
+                spinner.stop();
             }
-            else if (bareThink?.argsOnly) {
-                bareArgsOnly = true;
+            provider = completion.provider;
+            model = completion.model;
+            const assistantTextResult = rememberThinkingFromText(completion.text);
+            assistantText = assistantTextResult;
+            // Try visible text first, then thinking content — some models (e.g. glm-5.1)
+            // wrap tool calls inside  considering tags, so stripThinking removes them
+            // into thinkContent and visible becomes empty. Recovering from thinkContent
+            // prevents an endless nudge loop where the model keeps hiding the call.
+            call = parseToolCall(assistantText.visible, {
+                strict: getConfig().parserStrict,
+            });
+            if (!call && assistantText.hasThinking) {
+                call = parseToolCall(assistantText.thinkContent, {
+                    strict: getConfig().parserStrict,
+                });
+                if (call) {
+                    process.stdout.write(chalk.dim("  ℹ recovered tool call from thinking content\n"));
+                }
             }
-        }
-        if (!call) {
-            if (bareArgsOnly) {
-                bareToolJsonRetries += 1;
-                if (bareToolJsonRetries <= 3) {
-                    process.stdout.write(chalk.yellow("  ⚠ tool call missing its name/fence — asking the model to re-emit a proper ```tool block\n"));
-                    messages.push({ role: "assistant", content: assistantText.visible });
-                    messages.push(recoveryUserMessage(buildLikeTurn && !activePlan
-                        ? "Your previous message was a bare JSON args object with no tool name and no ```tool fence, so NOTHING ran. " +
+            // ── Thinking-only recovery ────────────────────────────────────────
+            // Some models (eg gpt-oss-20b on NVIDIA NIM) occasionally spend their
+            // entire budget on hidden <think> reasoning and emit no visible text
+            // or tool call. Without this guard the agent silently returns an empty
+            // answer and the user has to re-submit the same prompt.
+            if (!assistantText.visible.trim() && !call && assistantText.hasThinking) {
+                emptyVisibleRetries += 1;
+                if (emptyVisibleRetries <= 2) {
+                    process.stdout.write(`${renderThinkingSummary(assistantText.thinkContent)}\n`);
+                    process.stdout.write(chalk.yellow("  ⚠ model produced only thinking — nudging it to take action\n"));
+                    messages.push({ role: "assistant", content: completion.text });
+                    const buildNudge = buildLikeTurn && !activePlan
+                        ? "You only produced internal reasoning with no visible answer or tool call. " +
                             "This is a BUILD/SCAFFOLD task with NO plan yet. " +
-                            "You MUST call plan.create using a proper ```tool block. For example:\n" +
-                            '```tool\n{"name":"plan.create","args":{"goal":"scaffold todo app","detail":"...","tasks":["...","..."],"kind":"coding"}}\n```\n' +
-                            "Do NOT use fs.write, fs.writeMany, shell.exec, or pkg.install yet."
-                        : "Your previous message was a bare JSON args object with no tool name and no ```tool fence, so NOTHING ran. " +
-                            "Reply with ONLY a fenced ```tool block of the form " +
-                            '`{"name": "<tool>", "args": { ... }}`. For example, to read a PDF:\n' +
-                            '```tool\n{"name":"pdf.read","args":{"path":"/abs/file.pdf"}}\n```\n' +
-                            "Choose the correct tool name for the task and include those args."));
+                            "You MUST call plan.create using the ```tool format to create a comprehensive plan BEFORE writing any files or running any commands. " +
+                            "Do NOT use fs.write, fs.writeMany, fs.edit, shell.exec, shell.start, or pkg.install yet. " +
+                            "Your ONLY allowed action right now is plan.create (or read/list for exploration)."
+                        : "You only produced internal reasoning with no visible answer or tool call. " +
+                            "You MUST either call a tool using the ```tool format or provide your final answer. " +
+                            "Do NOT wrap your tool call inside  considering or reasoning tags — put it in the VISIBLE response, not hidden. " +
+                            "If images are attached, inspect them directly for visual details (text, colors, layout, spacing, style) instead of using OCR unless explicitly needed. " +
+                            "Do NOT just think — take action NOW.";
+                    messages.push(recoveryUserMessage(buildNudge));
                     continue;
                 }
-                // Exhausted retries — fall through to the normal answer path.
+                // Exhausted retries — fall through to the normal empty-answer path
+                // which will print a warning and return.
             }
-            // Detect the case where the model emitted sentinel-style tool-call
-            // markers but the body was malformed or truncated. Printing those
-            // raw tokens looks like a crash to the user — instead, ask the
-            // model to retry the tool call in a clean JSON format.
-            if (/<\|tool_call(?:s_section)?_begin\|>|<\|tool_call_argument_begin\|>/i.test(assistantText.visible)) {
-                process.stdout.write(chalk.yellow("  ⚠ tool call was malformed or cut off — asking the model to retry in JSON form\n"));
-                messages.push({ role: "assistant", content: assistantText.visible });
-                messages.push(recoveryUserMessage("Your previous tool call was malformed or truncated. " +
-                    "Reply with ONLY a fenced ```tool block containing valid JSON " +
-                    'of the form `{"name": "<tool>", "args": { ... }}`. ' +
-                    "Do not use <|tool_call_begin|> markers."));
-                continue;
+            else {
+                // Reset the counter on any successful visible output or recovered call.
+                emptyVisibleRetries = 0;
             }
-            // Detect a tool call that opened but was cut off by the token limit
-            // (most common with a large multi-file fs.writeMany). Retrying with a
-            // nudge to split the work is far better than rendering broken JSON as
-            // a final answer and leaving the project half-created.
-            if (looksLikeTruncatedToolCall(assistantText.visible)) {
-                truncatedToolRetries += 1;
-                if (truncatedToolRetries <= 3) {
-                    process.stdout.write(chalk.yellow("  ⚠ tool call was cut off (output too long) — asking the model to retry in smaller pieces\n"));
-                    messages.push({ role: "assistant", content: assistantText.visible });
-                    messages.push({
-                        role: "user",
-                        content: "Your previous tool call was cut off before it finished — the JSON was incomplete, so NOTHING ran. " +
-                            "Retry now with a COMPLETE, valid ```tool block. " +
-                            "If it was a large fs.writeMany, split it into SMALLER batches (3-5 files per call, and keep each file's content concise) " +
-                            "so the whole JSON fits in one response. Do NOT claim any file was written until a tool call actually succeeds.",
-                    });
-                    continue;
+            // `call` was already extracted above (from visible text or thinking content).
+            // Recovery: the model meant to call a tool but emitted a bare JSON object
+            // with no ```tool fence — either a complete {name,args} the strict
+            // matchers missed (recover it directly), or just an args object like
+            // {"path":"file.pdf"} with the wrapper dropped (nudge a retry below so
+            // the requested action runs instead of the JSON leaking as the answer).
+            let bareArgsOnly = false;
+            recoveredFromBareJson = false;
+            if (!call) {
+                const bare = recognizeBareToolJson(assistantText.visible);
+                if (bare?.call) {
+                    call = bare.call;
+                    recoveredFromBareJson = true;
+                    process.stdout.write(chalk.dim("  ℹ recovered an unfenced tool call from bare JSON\n"));
+                }
+                else if (bare?.argsOnly) {
+                    bareArgsOnly = true;
+                }
+            }
+            // Also check thinking content for bare JSON calls.
+            if (!call && assistantText.hasThinking) {
+                const bareThink = recognizeBareToolJson(assistantText.thinkContent);
+                if (bareThink?.call) {
+                    call = bareThink.call;
+                    recoveredFromBareJson = true;
+                    process.stdout.write(chalk.dim("  ℹ recovered an unfenced tool call from thinking content\n"));
+                }
+                else if (bareThink?.argsOnly) {
+                    bareArgsOnly = true;
                 }
-                // Exhausted retries — fall through so we don't loop forever, but the
-                // user at least sees the (broken) output and the stop notice.
             }
-            // Detect a ```tool fence whose JSON could NOT be parsed for any other
-            // reason (malformed braces, trailing junk, a stray `}` — NOT plain
-            // truncation, which is handled above). Without this, the raw block
-            // leaks to the screen as a code fence and the requested action (often
-            // a whole fs.writeMany scaffold) silently never runs — exactly the
-            // "fs.writeMany printed but nothing created" failure. Require the fence
-            // to actually look like an intended call (mentions name/args) so a
-            // genuine ```tool code example in prose isn't mistaken for one.
-            const hasFencedCallShape = countToolFences(assistantText.visible) > 0 &&
-                /```tool\s*\n[\s\S]*?"(?:name|args)"\s*:/i.test(assistantText.visible);
-            if (hasFencedCallShape) {
-                malformedFenceRetries += 1;
-                if (malformedFenceRetries <= 3) {
-                    process.stdout.write(chalk.yellow("  ⚠ tool block present but its JSON didn't parse — asking the model to re-emit valid JSON\n"));
+            if (!call) {
+                if (bareArgsOnly) {
+                    bareToolJsonRetries += 1;
+                    if (bareToolJsonRetries <= 3) {
+                        process.stdout.write(chalk.yellow("  ⚠ tool call missing its name/fence — asking the model to re-emit a proper ```tool block\n"));
+                        messages.push({ role: "assistant", content: assistantText.visible });
+                        messages.push(recoveryUserMessage(buildLikeTurn && !activePlan
+                            ? "Your previous message was a bare JSON args object with no tool name and no ```tool fence, so NOTHING ran. " +
+                                "This is a BUILD/SCAFFOLD task with NO plan yet. " +
+                                "You MUST call plan.create using a proper ```tool block. For example:\n" +
+                                '```tool\n{"name":"plan.create","args":{"goal":"scaffold todo app","detail":"...","tasks":["...","..."],"kind":"coding"}}\n```\n' +
+                                "Do NOT use fs.write, fs.writeMany, shell.exec, or pkg.install yet."
+                            : "Your previous message was a bare JSON args object with no tool name and no ```tool fence, so NOTHING ran. " +
+                                "Reply with ONLY a fenced ```tool block of the form " +
+                                '`{"name": "<tool>", "args": { ... }}`. For example, to read a PDF:\n' +
+                                '```tool\n{"name":"pdf.read","args":{"path":"/abs/file.pdf"}}\n```\n' +
+                                "Choose the correct tool name for the task and include those args."));
+                        continue;
+                    }
+                    // Exhausted retries — fall through to the normal answer path.
+                }
+                // Detect the case where the model emitted sentinel-style tool-call
+                // markers but the body was malformed or truncated. Printing those
+                // raw tokens looks like a crash to the user — instead, ask the
+                // model to retry the tool call in a clean JSON format.
+                if (/<\|tool_call(?:s_section)?_begin\|>|<\|tool_call_argument_begin\|>/i.test(assistantText.visible)) {
+                    process.stdout.write(chalk.yellow("  ⚠ tool call was malformed or cut off — asking the model to retry in JSON form\n"));
                     messages.push({ role: "assistant", content: assistantText.visible });
-                    messages.push({
-                        role: "user",
-                        content: "Your previous message contained a ```tool block, but its JSON was INVALID, so NOTHING ran. " +
-                            "Common causes: an extra or missing `}` / `]`, a trailing brace after the closing `}`, or unescaped quotes/newlines inside a string value. " +
-                            'Re-emit ONE valid ```tool block of the exact form {"name":"<tool>","args":{...}} with balanced braces. ' +
-                            "If it was a large fs.writeMany, split it into SMALLER batches (3-5 files) so the JSON is easy to keep valid. " +
-                            "Do NOT claim any file was written until a tool call actually succeeds.",
-                    });
+                    messages.push(recoveryUserMessage("Your previous tool call was malformed or truncated. " +
+                        "Reply with ONLY a fenced ```tool block containing valid JSON " +
+                        'of the form `{"name": "<tool>", "args": { ... }}`. ' +
+                        "Do not use <|tool_call_begin|> markers."));
                     continue;
                 }
-                // Exhausted retries — fall through to the normal path.
-            }
-            // Normal final-answer path: strip any stray sentinel tokens that
-            // somehow leaked into prose so the answer renders cleanly.
-            const cleaned = stripSentinelTokens(assistantText.visible);
-            if (freshWebSearchRequired && !sawFreshWebSearch && !freshnessRetryUsed) {
-                freshnessRetryUsed = true;
-                process.stdout.write(chalk.dim("  ℹ current-info question detected — searching the web before answering\n"));
-                messages.push({ role: "assistant", content: assistantText.visible });
-                messages.push({
-                    role: "user",
-                    content: freshnessGuardMessage() +
-                        " Reply with ONLY a fenced ```tool block for web.search now.",
-                });
-                continue;
-            }
-            // ── Premature-completion guard (approved plan still has work) ──────
-            // If the user approved a plan and the model now gives a final answer
-            // while tasks are still pending/in_progress — without having run the
-            // work — it is fabricating completion (the exact "all tasks completed,
-            // running at localhost:5173" failure). Force it back to executing the
-            // next real task instead of accepting the false claim.
-            if (session.planApproved.value && prematureCompletionRetries < 3) {
-                const livePlan = await loadPlan(session.sessionId).catch(() => undefined);
-                const unfinished = livePlan?.tasks.filter((t) => t.state === "pending" || t.state === "in_progress");
-                if (livePlan && unfinished && unfinished.length > 0) {
-                    prematureCompletionRetries += 1;
-                    const next = unfinished[0];
-                    process.stdout.write(chalk.yellow(`  ⚠ ${unfinished.length} plan task(s) still unfinished — not accepting a "done" claim; resuming execution\n`));
+                // Detect a tool call that opened but was cut off by the token limit
+                // (most common with a large multi-file fs.writeMany). Retrying with a
+                // nudge to split the work is far better than rendering broken JSON as
+                // a final answer and leaving the project half-created.
+                if (looksLikeTruncatedToolCall(assistantText.visible)) {
+                    truncatedToolRetries += 1;
+                    if (truncatedToolRetries <= 3) {
+                        process.stdout.write(chalk.yellow("  ⚠ tool call was cut off (output too long) — asking the model to retry in smaller pieces\n"));
+                        messages.push({ role: "assistant", content: assistantText.visible });
+                        messages.push({
+                            role: "user",
+                            content: "Your previous tool call was cut off before it finished — the JSON was incomplete, so NOTHING ran. " +
+                                "Retry now with a COMPLETE, valid ```tool block. " +
+                                "If it was a large fs.writeMany, split it into SMALLER batches (3-5 files per call, and keep each file's content concise) " +
+                                "so the whole JSON fits in one response. Do NOT claim any file was written until a tool call actually succeeds.",
+                        });
+                        continue;
+                    }
+                    // Exhausted retries — fall through so we don't loop forever, but the
+                    // user at least sees the (broken) output and the stop notice.
+                }
+                // Detect a ```tool fence whose JSON could NOT be parsed for any other
+                // reason (malformed braces, trailing junk, a stray `}` — NOT plain
+                // truncation, which is handled above). Without this, the raw block
+                // leaks to the screen as a code fence and the requested action (often
+                // a whole fs.writeMany scaffold) silently never runs — exactly the
+                // "fs.writeMany printed but nothing created" failure. Require the fence
+                // to actually look like an intended call (mentions name/args) so a
+                // genuine ```tool code example in prose isn't mistaken for one.
+                const hasFencedCallShape = countToolFences(assistantText.visible) > 0 &&
+                    /```tool\s*\n[\s\S]*?"(?:name|args)"\s*:/i.test(assistantText.visible);
+                if (hasFencedCallShape) {
+                    malformedFenceRetries += 1;
+                    if (malformedFenceRetries <= 3) {
+                        process.stdout.write(chalk.yellow("  ⚠ tool block present but its JSON didn't parse — asking the model to re-emit valid JSON\n"));
+                        messages.push({ role: "assistant", content: assistantText.visible });
+                        messages.push({
+                            role: "user",
+                            content: "Your previous message contained a ```tool block, but its JSON was INVALID, so NOTHING ran. " +
+                                "Common causes: an extra or missing `}` / `]`, a trailing brace after the closing `}`, or unescaped quotes/newlines inside a string value. " +
+                                'Re-emit ONE valid ```tool block of the exact form {"name":"<tool>","args":{...}} with balanced braces. ' +
+                                "If it was a large fs.writeMany, split it into SMALLER batches (3-5 files) so the JSON is easy to keep valid. " +
+                                "Do NOT claim any file was written until a tool call actually succeeds.",
+                        });
+                        continue;
+                    }
+                    // Exhausted retries — fall through to the normal path.
+                }
+                // Normal final-answer path: strip any stray sentinel tokens that
+                // somehow leaked into prose so the answer renders cleanly.
+                const cleaned = stripSentinelTokens(assistantText.visible);
+                if (freshWebSearchRequired && !sawFreshWebSearch && !freshnessRetryUsed) {
+                    freshnessRetryUsed = true;
+                    process.stdout.write(chalk.dim("  ℹ current-info question detected — searching the web before answering\n"));
                     messages.push({ role: "assistant", content: assistantText.visible });
                     messages.push({
                         role: "user",
-                        content: `You have NOT finished the approved plan: ${unfinished.length} task(s) remain ` +
-                            `(${unfinished.map((t) => `[${t.id}] ${t.title}`).join("; ")}). ` +
-                            `Do NOT claim the work is complete, that files were created, or that a server is running ` +
-                            `unless a tool call actually succeeded and you saw the output. ` +
-                            `Resume now with the NEXT task ${next.id} ("${next.title}"): call task.update {taskId:"${next.id}", state:"in_progress"}, ` +
-                            `then do the real work with a tool call (fs.writeMany / shell.exec / shell.start), VERIFY it, and mark it done. ` +
-                            `Continue task by task until EVERY task is actually finished.`,
+                        content: freshnessGuardMessage() +
+                            " Reply with ONLY a fenced ```tool block for web.search now.",
                     });
                     continue;
                 }
-            }
-            // If we still print a final answer while an approved plan has unfinished
-            // tasks (retries exhausted), do NOT let a fabricated "it's done" stand
-            // unchallenged — append an explicit, honest status so the user knows the
-            // build did not actually complete.
-            let completionWarning = "";
-            if (session.planApproved.value) {
-                const livePlan = await loadPlan(session.sessionId).catch(() => undefined);
-                const unfinished = livePlan?.tasks.filter((t) => t.state === "pending" || t.state === "in_progress");
-                if (livePlan && unfinished && unfinished.length > 0) {
-                    completionWarning =
-                        chalk.yellow(`\n  ⚠ ${unfinished.length} of ${livePlan.tasks.length} plan task(s) are NOT actually complete:\n`) +
-                            unfinished
-                                .map((t) => chalk.yellow(`    • [${t.id}] ${t.title}`))
-                                .join("\n") +
-                            chalk.dim("\n  The summary above may overstate progress. Re-run with /implement, or ask clai to finish the remaining tasks.\n");
+                // ── Premature-completion guard (approved plan still has work) ──────
+                // If the user approved a plan and the model now gives a final answer
+                // while tasks are still pending/in_progress — without having run the
+                // work — it is fabricating completion (the exact "all tasks completed,
+                // running at localhost:5173" failure). Force it back to executing the
+                // next real task instead of accepting the false claim.
+                if (session.planApproved.value && prematureCompletionRetries < 3) {
+                    const livePlan = await loadPlan(session.sessionId).catch(() => undefined);
+                    const unfinished = livePlan?.tasks.filter((t) => t.state === "pending" || t.state === "in_progress");
+                    if (livePlan && unfinished && unfinished.length > 0) {
+                        prematureCompletionRetries += 1;
+                        const next = unfinished[0];
+                        process.stdout.write(chalk.yellow(`  ⚠ ${unfinished.length} plan task(s) still unfinished — not accepting a "done" claim; resuming execution\n`));
+                        messages.push({ role: "assistant", content: assistantText.visible });
+                        messages.push({
+                            role: "user",
+                            content: `You have NOT finished the approved plan: ${unfinished.length} task(s) remain ` +
+                                `(${unfinished.map((t) => `[${t.id}] ${t.title}`).join("; ")}). ` +
+                                `Do NOT claim the work is complete, that files were created, or that a server is running ` +
+                                `unless a tool call actually succeeded and you saw the output. ` +
+                                `Resume now with the NEXT task ${next.id} ("${next.title}"): call task.update {taskId:"${next.id}", state:"in_progress"}, ` +
+                                `then do the real work with a tool call (fs.writeMany / shell.exec / shell.start), VERIFY it, and mark it done. ` +
+                                `Continue task by task until EVERY task is actually finished.`,
+                        });
+                        continue;
+                    }
                 }
+                // If we still print a final answer while an approved plan has unfinished
+                // tasks (retries exhausted), do NOT let a fabricated "it's done" stand
+                // unchallenged — append an explicit, honest status so the user knows the
+                // build did not actually complete.
+                let completionWarning = "";
+                if (session.planApproved.value) {
+                    const livePlan = await loadPlan(session.sessionId).catch(() => undefined);
+                    const unfinished = livePlan?.tasks.filter((t) => t.state === "pending" || t.state === "in_progress");
+                    if (livePlan && unfinished && unfinished.length > 0) {
+                        completionWarning =
+                            chalk.yellow(`\n  ⚠ ${unfinished.length} of ${livePlan.tasks.length} plan task(s) are NOT actually complete:\n`) +
+                                unfinished
+                                    .map((t) => chalk.yellow(`    • [${t.id}] ${t.title}`))
+                                    .join("\n") +
+                                chalk.dim("\n  The summary above may overstate progress. Re-run with /implement, or ask clai to finish the remaining tasks.\n");
+                    }
+                }
+                if (cleaned) {
+                    process.stdout.write(renderMarkdown(cleaned));
+                    if (!cleaned.endsWith("\n"))
+                        process.stdout.write("\n");
+                }
+                if (completionWarning) {
+                    process.stdout.write(completionWarning);
+                }
+                if (assistantText.hasThinking) {
+                    process.stdout.write(`${renderThinkingSummary(assistantText.thinkContent)}\n`);
+                }
+                await auditLog("agent.final", { provider, model, steps: step + 1 });
+                lastAnswer = cleaned;
+                return lastAnswer;
             }
-            if (cleaned) {
-                process.stdout.write(renderMarkdown(cleaned));
-                if (!cleaned.endsWith("\n"))
-                    process.stdout.write("\n");
-            }
-            if (completionWarning) {
-                process.stdout.write(completionWarning);
+            // A valid primary tool call exists for this fresh model turn. Show any
+            // prose / thinking that preceded it, record the assistant message ONCE,
+            // then queue any additional tool calls from the same message so they
+            // run in order on the next iterations (no extra round-trip).
+            const beforeTool = recoveredFromBareJson
+                ? ""
+                : textBeforeToolCall(assistantText.visible);
+            if (beforeTool) {
+                process.stdout.write(renderMarkdown(beforeTool) + "\n");
             }
             if (assistantText.hasThinking) {
                 process.stdout.write(`${renderThinkingSummary(assistantText.thinkContent)}\n`);
             }
-            await auditLog("agent.final", { provider, model, steps: step + 1 });
-            lastAnswer = cleaned;
-            return lastAnswer;
+            messages.push({ role: "assistant", content: assistantText.visible });
+            if (!recoveredFromBareJson && call) {
+                const allCalls = parseAllToolCalls(assistantText.visible);
+                if (allCalls.length > 1 &&
+                    allCalls[0] &&
+                    sameToolCall(allCalls[0], call)) {
+                    pendingCalls = allCalls.slice(1);
+                    process.stdout.write(chalk.dim(`  ℹ ${allCalls.length} tool calls in this message — running them in order\n`));
+                }
+            }
         }
+        // Type guard: every path above either set `call` or returned/continued.
+        if (!call)
+            continue;
         // ── Duplicate-call detection ──────────────────────────────────────────
         // If the model calls the exact same tool with the exact same args
         // repeatedly, it's stuck in a loop. Inject a corrective message
@@ -1323,7 +1424,9 @@ export async function runAgentLoop(prompt, options = {}) {
                 call.name === "fs.writeMany" ||
                 call.name === "fs.edit";
             process.stdout.write(chalk.yellow(`  ⚠ ${call.name} was already called with the same arguments — ${isWrite ? "moving on" : "forcing summary"}\n`));
-            messages.push({ role: "assistant", content: assistantText.visible });
+            // A repeat means this batch went off the rails — drop any queued calls
+            // and let the model react. The assistant message was already recorded.
+            pendingCalls = [];
             messages.push({
                 role: "user",
                 content: isWrite
@@ -1338,27 +1441,6 @@ export async function runAgentLoop(prompt, options = {}) {
         if (loopCheck.reason) {
             process.stdout.write(chalk.dim(`  ℹ ${loopCheck.reason}\n`));
         }
-        // Print only non-thinking text before the tool call. When the call was
-        // recovered from a bare JSON object (the whole message WAS the call),
-        // there is no prose to show — skip it so we don't echo the raw JSON.
-        const beforeTool = recoveredFromBareJson
-            ? ""
-            : textBeforeToolCall(assistantText.visible);
-        if (beforeTool) {
-            process.stdout.write(renderMarkdown(beforeTool) + "\n");
-        }
-        if (assistantText.hasThinking) {
-            process.stdout.write(`${renderThinkingSummary(assistantText.thinkContent)}\n`);
-        }
-        messages.push({ role: "assistant", content: assistantText.visible });
-        // Detect a model that crammed MULTIPLE tool calls into one response.
-        // Only `call` (the first block) will run this turn; the rest are dropped.
-        // We flag it so that after the first tool executes we explicitly tell the
-        // model the others did NOT run — preventing the "I ran everything" lie.
-        const extraToolBlocks = Math.max(0, countToolFences(assistantText.visible) - 1);
-        if (extraToolBlocks > 0) {
-            process.stdout.write(chalk.yellow(`  ⚠ ${extraToolBlocks} extra tool block(s) in one message were ignored — only the first ran. One tool per turn.\n`));
-        }
         // ── Plan / task tools (session-scoped, handled inline) ─────────────
         // These don't go through the generic registry because they need the
         // session id and mutate the live plan that the user can view (Ctrl+P).
@@ -1371,6 +1453,10 @@ export async function runAgentLoop(prompt, options = {}) {
                 productiveSteps += 1;
                 loopGuard.recordAttempt(step, call.name, call.args, planResult.ok, 0);
                 process.stdout.write(planResult.display);
+                // plan.create means "STOP and wait for /implement" — abandon any
+                // other calls the model batched alongside it.
+                if (call.name === "plan.create")
+                    pendingCalls = [];
                 messages.push({
                     role: "tool",
                     content: `Tool ${call.name} result (ok=${planResult.ok}):\n${planResult.modelNote}`,
@@ -1398,6 +1484,7 @@ export async function runAgentLoop(prompt, options = {}) {
             !session.planApproved.value &&
             !isPreApprovalAllowedTool(call.name)) {
             process.stdout.write(chalk.yellow(`  ⚠ plan awaiting approval — ${call.name} is blocked until you /implement (or /discard)\n`));
+            pendingCalls = [];
             messages.push({
                 role: "user",
                 content: `There is an ACTIVE PLAN that has NOT been approved yet, so you must NOT execute it — ` +
@@ -1538,6 +1625,13 @@ export async function runAgentLoop(prompt, options = {}) {
             const errMsg = toolError instanceof Error ? toolError.message : String(toolError);
             result = { ok: false, output: `Tool error: ${errMsg}`, exitCode: 1 };
         }
+        // Stop-on-error: if this call failed, abandon any remaining queued calls
+        // from the same message so the model sees the failure and decides what to
+        // do next instead of blindly running steps that depended on it.
+        if (!result.ok && pendingCalls.length > 0) {
+            process.stdout.write(chalk.dim(`  ↳ ${pendingCalls.length} queued call(s) cancelled because this step failed\n`));
+            pendingCalls = [];
+        }
         const output = result.output.trim();
         const displayMax = 6_000;
         // If the tool already produced an artifact (shell.exec now streams to one
@@ -1651,10 +1745,7 @@ export async function runAgentLoop(prompt, options = {}) {
         }
         messages.push({
             role: "tool",
-            content: `Tool ${call.name} result (exit=${result.exitCode ?? 0}, ok=${result.ok}):\n${contextOutput}` +
-                (extraToolBlocks > 0
-                    ? `\n\nIMPORTANT: your previous message contained ${extraToolBlocks + 1} tool blocks, but ONLY this first one (${call.name}) actually ran. The other ${extraToolBlocks} did NOT execute and were discarded. Emit EXACTLY ONE tool block per message. Send the next tool call now — and do NOT assume any of the dropped calls happened.`
-                    : ""),
+            content: `Tool ${call.name} result (exit=${result.exitCode ?? 0}, ok=${result.ok}):\n${contextOutput}`,
         });
         // Compact older messages when the running estimate exceeds budget so
         // free-tier context windows are not blown by long pentest sessions.