@pentoshi/clai 1.1.4 → 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -380,6 +380,53 @@ export function countToolFences(text) {
380
380
  const matches = text.match(/```tool\s*\n[\s\S]*?```/gi);
381
381
  return matches ? matches.length : 0;
382
382
  }
383
/**
 * Collect every explicitly delimited tool call found in `text` and return
 * them in the order they appear in the document.
 *
 * Three delimited formats are scanned, each with its own global regex:
 *   1. ```tool fenced blocks,
 *   2. <tool_call>…</tool_call> XML blocks,
 *   3. Kimi sentinel blocks (pattern taken from KIMI_TOOL_CALL_RE).
 *
 * A match only counts when tryParseCall() returns a truthy call for its
 * payload; anything else is silently dropped. Because each format is scanned
 * in a separate pass, the hits are re-ordered by match offset at the end so
 * the caller sees them top to bottom regardless of format.
 *
 * Unlike parseToolCall (which, per the original note, yields only the first
 * call), this is meant to let the runner execute a whole batch emitted in one
 * model turn — e.g. "task.update in_progress → work → task.update done".
 * NOTE(review): the stop-on-first-failure policy lives in the runner, not here.
 *
 * @param {string} text - Raw assistant message to scan.
 * @returns {Array} Parsed tool calls in document order (empty when none parse).
 */
export function parseAllToolCalls(text) {
    const hits = [];
    // Walk one global regex across the text, keeping each match whose payload
    // parses into a call, together with the offset it was found at.
    const scan = (pattern, toPayload) => {
        for (let match = pattern.exec(text); match !== null; match = pattern.exec(text)) {
            const parsed = tryParseCall(toPayload(match));
            if (parsed) {
                hits.push({ index: match.index, call: parsed });
            }
        }
    };
    // Fenced and XML formats carry the call JSON directly in capture group 1.
    const firstGroup = (match) => match[1] ?? "";
    scan(/```tool\s*\n?([\s\S]*?)```/gi, firstGroup);
    scan(/<tool_call>([\s\S]*?)<\/tool_call>/gi, firstGroup);
    // Kimi blocks capture name and args separately, so rebuild a {name,args}
    // JSON payload. A fresh RegExp is built from the shared pattern's source so
    // the shared object's lastIndex is never touched.
    scan(new RegExp(KIMI_TOOL_CALL_RE.source, "gi"), (match) => JSON.stringify({ name: match[1], args: tryJson(match[2] ?? "{}") ?? {} }));
    hits.sort((left, right) => left.index - right.index);
    return hits.map((hit) => hit.call);
}
419
/**
 * Serialize tool-call args to JSON with object keys sorted at every depth, so
 * two args objects that differ only in key insertion order produce the same
 * string.
 *
 * The first JSON.stringify pass is deliberate: it throws on values JSON cannot
 * represent (cycles, BigInt) and normalizes toJSON()/undefined members, leaving
 * plain acyclic data that is safe to re-serialize with a key-sorting replacer.
 *
 * @param {*} args - Arguments payload of a tool call.
 * @returns {string|undefined} Canonical JSON, or undefined when `args` has no
 *   JSON representation (e.g. it is undefined).
 * @throws {TypeError} When `args` cannot be serialized (cycle, BigInt).
 */
function canonicalArgsJson(args) {
    const raw = JSON.stringify(args);
    if (raw === undefined) {
        return undefined;
    }
    return JSON.stringify(JSON.parse(raw), (_key, value) => {
        if (value === null || typeof value !== "object" || Array.isArray(value)) {
            return value;
        }
        // Object.fromEntries defines own properties, so an own "__proto__" key
        // survives instead of rewriting the prototype.
        return Object.fromEntries(Object.keys(value).sort().map((key) => [key, value[key]]));
    });
}
/**
 * Structural equality for two tool calls: same name and the same canonical
 * args JSON. Key order inside args objects is ignored; array order is not.
 *
 * @param {{name: string, args: *}} a - First tool call.
 * @param {{name: string, args: *}} b - Second tool call.
 * @returns {boolean} True when both calls name the same tool with equivalent
 *   args; false when either call is missing or args cannot be serialized.
 */
export function sameToolCall(a, b) {
    // A missing call can never be "the same call" as anything.
    if (a == null || b == null)
        return false;
    if (a.name !== b.name)
        return false;
    try {
        return canonicalArgsJson(a.args) === canonicalArgsJson(b.args);
    }
    catch {
        // Unserializable args (cycles, BigInt) are treated as not comparable.
        return false;
    }
}
383
430
  /** Extract the text before the tool call block for display purposes */
384
431
  function textBeforeToolCall(text) {
385
432
  const patterns = [
@@ -529,7 +576,8 @@ function freshnessGuardMessage(now = new Date()) {
529
576
  return (`Freshness guard for this turn: the latest user prompt appears to ask for current, volatile, or externally verifiable information. The present moment is ${currentDateTimeContext(now)}. ` +
530
577
  "Before answering, call web.search FIRST with a concise query derived from the user prompt. " +
531
578
  "Shape the search query for the newest timeline by including current/latest or the current year/month when useful. " +
532
- "Use the search results to answer. If web.search fails or has no results, say that current information is unavailable instead of guessing from memory.");
579
+ "Do not answer from the snippets alone when detail matters — set fetchTop (e.g. fetchTop:2) to read the top result pages, or follow up with web.fetch on the most relevant URL, then answer from what the pages actually say and cite them. " +
580
+ "If web.search fails or has no results, say that current information is unavailable instead of guessing from memory.");
533
581
  }
534
582
  /**
535
583
  * Directive injected for build/scaffold turns. Forces the careful
@@ -542,7 +590,7 @@ function buildWorkflowDirective() {
542
590
  "1. EXPLORE: fs.list the working directory (and key subdirs) to see what already exists. Use tool.batch to parallelize reads.",
543
591
  "2. UNDERSTAND: fs.read the files that matter (like package.json for js related and same for other languages too, config, entry points, existing components). Detect the existing stack/tooling and MATCH it. If the dir is empty or only has a stub, start fresh with a sensible modern default and say so.",
544
592
  "3. PLAN: call plan.create with a COMPREHENSIVE plan — a detailed `detail` (stack chosen and WHY, architecture, how you'll verify) and 4-8 SEPARATE, ordered, high-quality tasks. The FIRST task initializes the project (scaffolder); the MIDDLE tasks MUST implement the ACTUAL FEATURE the user asked for by REPLACING the scaffolder's boilerplate (e.g. rewrite src/App.jsx into the real todo/blog/etc. UI, add components, state, styles); the LAST task verifies with a build. Scaffolding + install + run ALONE is NOT acceptable — that just leaves the Vite starter page. Each task is one distinct, verifiable action. Then STOP and wait for the user to /implement.",
545
- "4. IMPLEMENT: once approved, work task by task in STRICT ORDER across MULTIPLE steps, ONE tool call per turn. For each task: call task.update {taskId, state:'in_progress'} → do the real work → VERIFY it actually succeeded (read a file you wrote, check the command's exit/output) → call task.update {taskId, state:'done'}, then move to the NEXT task. Keep going until EVERY task is done. Do NOT stop after one step, and do NOT claim work you didn't actually run.",
593
+ "4. IMPLEMENT: once approved, work task by task in STRICT ORDER. For each task: call task.update {taskId, state:'in_progress'} → do the real work → VERIFY it actually succeeded (read a file you wrote, check the command's exit/output) → call task.update {taskId, state:'done'}, then move to the NEXT task. You MAY emit several tool calls in one message and they run in order, top to bottom (the batch STOPS if one fails). A clean rhythm is: task.update in_progress + the work + task.update done together. Keep going until EVERY task is done. Do NOT claim work you didn't actually run.",
546
594
  "",
547
595
  "INITIALIZE WITH THE OFFICIAL SCAFFOLDER FIRST (do NOT hand-write build configs):",
548
596
  "- React/Vue/Svelte/vanilla → `npm create vite@latest <appname> -- --template react` (templates: react, react-ts, vue, vue-ts, svelte, vanilla). Next.js → `npx --yes create-next-app@latest <appname> --yes --eslint --no-tailwind --app --src-dir --import-alias \"@/*\"`. Node API → `npm init -y`.",
@@ -552,7 +600,7 @@ function buildWorkflowDirective() {
552
600
  "- VERIFY the init actually worked before marking the task done: fs.read package.json (it must now exist AND list react + react-dom) and fs.read index.html (it must reference your jsx entry). 'Operation cancelled' / non-zero exit means the task FAILED — do not proceed as if it succeeded.",
553
601
  "",
554
602
  "CRITICAL RULES during IMPLEMENTATION:",
555
- "- EXACTLY ONE ```tool block per message. NEVER put several tool calls (e.g. fs.writeMany + npm install + npm run dev) in one responseonly the first runs and the rest are silently discarded, which is how false 'all done' claims happen.",
603
+ "- You may batch tool calls: emit one or several ```tool blocks in a message and they run in order, top to bottom. If any call fails, the rest of that batch is cancelled so you can react so order dependent steps correctly and keep batches focused. A good batch is task.update(in_progress) + the work + task.update(done) for ONE task.",
556
604
  "- Do NOT re-explore. Step 1 (EXPLORE) was already completed during planning. Start executing the first pending task immediately.",
557
605
  "- ONE task at a time, in ORDER. Do NOT skip ahead to task 3 before task 2 is done.",
558
606
  "- KEEP EACH FILE SMALL ENOUGH TO WRITE IN ONE CALL. If a fs.write is reported as 'cut off (output too long)', the file was NOT fully written and is likely broken/invalid — re-write it, splitting a large component into smaller files if needed. NEVER leave a half-written file and move on.",
@@ -975,6 +1023,17 @@ export async function runAgentLoop(prompt, options = {}) {
975
1023
  // tasks are still pending and it never ran the work. We nudge it back to
976
1024
  // executing the next task a bounded number of times before giving up.
977
1025
  let prematureCompletionRetries = 0;
1026
+ // ── Multi-tool execution queue ─────────────────────────────────────
1027
+ // Models naturally emit several tool calls in one message — e.g. the
1028
+ // plan-execution rhythm "task.update in_progress → do the work →
1029
+ // task.update done", or a batch of fs.write calls. Rather than running
1030
+ // only the first and discarding the rest (which made models believe work
1031
+ // ran when it didn't, and broke plan execution), we parse ALL calls in a
1032
+ // message, run the first this iteration, and queue the rest here to run on
1033
+ // subsequent iterations WITHOUT another model round-trip. The queue is
1034
+ // cleared whenever a call fails, is blocked, or needs the model to react,
1035
+ // so the model always sees errors and stays in control.
1036
+ let pendingCalls = [];
978
1037
  // ── Step budget ───────────────────────────────────────────────────
979
1038
  // The budget governs how many *productive* steps (a tool execution or a
980
1039
  // final answer) the agent may take. Recovery iterations — nudging a model
@@ -1018,301 +1077,343 @@ export async function runAgentLoop(prompt, options = {}) {
1018
1077
  if (productiveSteps >= stepBudget)
1019
1078
  break;
1020
1079
  options.signal?.throwIfAborted();
1021
- // Buffer LLM output so tool JSON and hidden thinking are not printed raw.
1022
- // Status messages (rate-limit retries, fallback hints) still surface live.
1023
- // A spinner gives the user feedback during long thinking phases on
1024
- // models like glm-5.1 / deepseek-v4-flash that stream reasoning first.
1025
- const spinner = startThinkingSpinner(step === 0 ? "waiting for model" : `step ${step + 1}`, options.signal);
1026
- let sawReasoning = false;
1027
- let inThinking = false;
1028
- let completion;
1029
- try {
1030
- completion = await streamWithProvider({
1031
- provider,
1032
- model,
1033
- messages,
1034
- temperature: 0.2,
1035
- // Reasoning models can spend a lot on hidden thinking; give
1036
- // them headroom so the visible answer / tool call isn't
1037
- // truncated to silence. The non-thinking budget must be large
1038
- // enough for a single-file fs.write / multi-file fs.writeMany
1039
- // payload — a truncated tool-call JSON fails to parse and leaks a
1040
- // broken (and syntactically invalid) file. 8k was too small for a
1041
- // full component, so allow more room for the visible tool call.
1042
- maxTokens: config.thinking?.enabled ? 16_384 : 12_288,
1043
- signal: options.signal,
1044
- thinking: config.thinking,
1045
- }, (token) => {
1046
- // Heuristic: <think>… markers and reasoning_content tokens flow
1047
- // through onToken. Surface activity in the spinner so the screen
1048
- // is never empty for minutes.
1049
- if (!sawReasoning && /<think/i.test(token)) {
1050
- sawReasoning = true;
1051
- inThinking = true;
1052
- spinner.setLabel("thinking");
1053
- }
1054
- if (/<\/think>/i.test(token)) {
1055
- inThinking = false;
1056
- }
1057
- // Only push reasoning tokens to the spinner preview. Visible
1058
- // answer / tool-call tokens should NOT go through the dim
1059
- // spinner preview — doing so makes the final answer appear
1060
- // "diluted" in light font when the spinner's last render
1061
- // briefly shows the answer text before being erased.
1062
- if (inThinking) {
1063
- const cleaned = token.replace(/<\/?think[^>]*>/gi, "");
1064
- if (cleaned) {
1065
- spinner.pushPreview(cleaned);
1066
- const approx = cleaned.split(/\s+/).filter(Boolean).length;
1067
- if (approx > 0)
1068
- spinner.bumpReasoning(approx);
1069
- }
1070
- }
1071
- }, (status) => {
1072
- spinner.stop();
1073
- process.stdout.write(chalk.dim(status));
1074
- });
1075
- }
1076
- finally {
1077
- // Always clear the spinner — abort, network error, or success.
1078
- spinner.stop();
1079
- }
1080
- provider = completion.provider;
1081
- model = completion.model;
1082
- const assistantText = rememberThinkingFromText(completion.text);
1083
- // Try visible text first, then thinking content — some models (e.g. glm-5.1)
1084
- // wrap tool calls inside considering tags, so stripThinking removes them
1085
- // into thinkContent and visible becomes empty. Recovering from thinkContent
1086
- // prevents an endless nudge loop where the model keeps hiding the call.
1087
- let call = parseToolCall(assistantText.visible, {
1088
- strict: getConfig().parserStrict,
1089
- });
1090
- if (!call && assistantText.hasThinking) {
1091
- call = parseToolCall(assistantText.thinkContent, {
1092
- strict: getConfig().parserStrict,
1093
- });
1094
- if (call) {
1095
- process.stdout.write(chalk.dim(" ℹ recovered tool call from thinking content\n"));
1096
- }
1097
- }
1098
- // ── Thinking-only recovery ────────────────────────────────────────
1099
- // Some models (eg gpt-oss-20b on NVIDIA NIM) occasionally spend their
1100
- // entire budget on hidden <think> reasoning and emit no visible text
1101
- // or tool call. Without this guard the agent silently returns an empty
1102
- // answer and the user has to re-submit the same prompt.
1103
- if (!assistantText.visible.trim() && !call && assistantText.hasThinking) {
1104
- emptyVisibleRetries += 1;
1105
- if (emptyVisibleRetries <= 2) {
1106
- process.stdout.write(`${renderThinkingSummary(assistantText.thinkContent)}\n`);
1107
- process.stdout.write(chalk.yellow(" ⚠ model produced only thinking — nudging it to take action\n"));
1108
- messages.push({ role: "assistant", content: completion.text });
1109
- const buildNudge = buildLikeTurn && !activePlan
1110
- ? "You only produced internal reasoning with no visible answer or tool call. " +
1111
- "This is a BUILD/SCAFFOLD task with NO plan yet. " +
1112
- "You MUST call plan.create using the ```tool format to create a comprehensive plan BEFORE writing any files or running any commands. " +
1113
- "Do NOT use fs.write, fs.writeMany, fs.edit, shell.exec, shell.start, or pkg.install yet. " +
1114
- "Your ONLY allowed action right now is plan.create (or read/list for exploration)."
1115
- : "You only produced internal reasoning with no visible answer or tool call. " +
1116
- "You MUST either call a tool using the ```tool format or provide your final answer. " +
1117
- "Do NOT wrap your tool call inside considering or reasoning tags — put it in the VISIBLE response, not hidden. " +
1118
- "If images are attached, inspect them directly for visual details (text, colors, layout, spacing, style) instead of using OCR unless explicitly needed. " +
1119
- "Do NOT just think — take action NOW.";
1120
- messages.push(recoveryUserMessage(buildNudge));
1121
- continue;
1122
- }
1123
- // Exhausted retries — fall through to the normal empty-answer path
1124
- // which will print a warning and return.
1080
+ // `call` and `assistantText` are shared by both paths below: a fresh
1081
+ // model round-trip, or draining a previously-queued tool call.
1082
+ let call;
1083
+ let assistantText;
1084
+ let recoveredFromBareJson = false;
1085
+ if (pendingCalls.length > 0) {
1086
+ // Drain the next queued call from the previous model message — no new
1087
+ // round-trip. The assistant message and any prose were already shown
1088
+ // when the batch was parsed.
1089
+ call = pendingCalls.shift();
1090
+ assistantText = { visible: "", thinkContent: "", hasThinking: false };
1091
+ process.stdout.write(chalk.dim(` ↳ continuing batch (${pendingCalls.length} more queued)\n`));
1125
1092
  }
1126
1093
  else {
1127
- // Reset the counter on any successful visible output or recovered call.
1128
- emptyVisibleRetries = 0;
1129
- }
1130
- // `call` was already extracted above (from visible text or thinking content).
1131
- // Recovery: the model meant to call a tool but emitted a bare JSON object
1132
- // with no ```tool fence — either a complete {name,args} the strict
1133
- // matchers missed (recover it directly), or just an args object like
1134
- // {"path":"file.pdf"} with the wrapper dropped (nudge a retry below so
1135
- // the requested action runs instead of the JSON leaking as the answer).
1136
- let bareArgsOnly = false;
1137
- let recoveredFromBareJson = false;
1138
- if (!call) {
1139
- const bare = recognizeBareToolJson(assistantText.visible);
1140
- if (bare?.call) {
1141
- call = bare.call;
1142
- recoveredFromBareJson = true;
1143
- process.stdout.write(chalk.dim(" ℹ recovered an unfenced tool call from bare JSON\n"));
1144
- }
1145
- else if (bare?.argsOnly) {
1146
- bareArgsOnly = true;
1094
+ // Buffer LLM output so tool JSON and hidden thinking are not printed raw.
1095
+ // Status messages (rate-limit retries, fallback hints) still surface live.
1096
+ // A spinner gives the user feedback during long thinking phases on
1097
+ // models like glm-5.1 / deepseek-v4-flash that stream reasoning first.
1098
+ const spinner = startThinkingSpinner(step === 0 ? "waiting for model" : `step ${step + 1}`, options.signal);
1099
+ let sawReasoning = false;
1100
+ let inThinking = false;
1101
+ let completion;
1102
+ try {
1103
+ completion = await streamWithProvider({
1104
+ provider,
1105
+ model,
1106
+ messages,
1107
+ temperature: 0.2,
1108
+ // Reasoning models can spend a lot on hidden thinking; give
1109
+ // them headroom so the visible answer / tool call isn't
1110
+ // truncated to silence. The non-thinking budget must be large
1111
+ // enough for a single-file fs.write / multi-file fs.writeMany
1112
+ // payload — a truncated tool-call JSON fails to parse and leaks a
1113
+ // broken (and syntactically invalid) file. 8k was too small for a
1114
+ // full component, so allow more room for the visible tool call.
1115
+ maxTokens: config.thinking?.enabled ? 16_384 : 12_288,
1116
+ signal: options.signal,
1117
+ thinking: config.thinking,
1118
+ }, (token) => {
1119
+ // Heuristic: <think>… markers and reasoning_content tokens flow
1120
+ // through onToken. Surface activity in the spinner so the screen
1121
+ // is never empty for minutes.
1122
+ if (!sawReasoning && /<think/i.test(token)) {
1123
+ sawReasoning = true;
1124
+ inThinking = true;
1125
+ spinner.setLabel("thinking");
1126
+ }
1127
+ if (/<\/think>/i.test(token)) {
1128
+ inThinking = false;
1129
+ }
1130
+ // Only push reasoning tokens to the spinner preview. Visible
1131
+ // answer / tool-call tokens should NOT go through the dim
1132
+ // spinner preview — doing so makes the final answer appear
1133
+ // "diluted" in light font when the spinner's last render
1134
+ // briefly shows the answer text before being erased.
1135
+ if (inThinking) {
1136
+ const cleaned = token.replace(/<\/?think[^>]*>/gi, "");
1137
+ if (cleaned) {
1138
+ spinner.pushPreview(cleaned);
1139
+ const approx = cleaned.split(/\s+/).filter(Boolean).length;
1140
+ if (approx > 0)
1141
+ spinner.bumpReasoning(approx);
1142
+ }
1143
+ }
1144
+ }, (status) => {
1145
+ spinner.stop();
1146
+ process.stdout.write(chalk.dim(status));
1147
+ });
1147
1148
  }
1148
- }
1149
- // Also check thinking content for bare JSON calls.
1150
- if (!call && assistantText.hasThinking) {
1151
- const bareThink = recognizeBareToolJson(assistantText.thinkContent);
1152
- if (bareThink?.call) {
1153
- call = bareThink.call;
1154
- recoveredFromBareJson = true;
1155
- process.stdout.write(chalk.dim(" ℹ recovered an unfenced tool call from thinking content\n"));
1149
+ finally {
1150
+ // Always clear the spinner — abort, network error, or success.
1151
+ spinner.stop();
1156
1152
  }
1157
- else if (bareThink?.argsOnly) {
1158
- bareArgsOnly = true;
1153
+ provider = completion.provider;
1154
+ model = completion.model;
1155
+ const assistantTextResult = rememberThinkingFromText(completion.text);
1156
+ assistantText = assistantTextResult;
1157
+ // Try visible text first, then thinking content — some models (e.g. glm-5.1)
1158
+ // wrap tool calls inside considering tags, so stripThinking removes them
1159
+ // into thinkContent and visible becomes empty. Recovering from thinkContent
1160
+ // prevents an endless nudge loop where the model keeps hiding the call.
1161
+ call = parseToolCall(assistantText.visible, {
1162
+ strict: getConfig().parserStrict,
1163
+ });
1164
+ if (!call && assistantText.hasThinking) {
1165
+ call = parseToolCall(assistantText.thinkContent, {
1166
+ strict: getConfig().parserStrict,
1167
+ });
1168
+ if (call) {
1169
+ process.stdout.write(chalk.dim(" ℹ recovered tool call from thinking content\n"));
1170
+ }
1159
1171
  }
1160
- }
1161
- if (!call) {
1162
- if (bareArgsOnly) {
1163
- bareToolJsonRetries += 1;
1164
- if (bareToolJsonRetries <= 3) {
1165
- process.stdout.write(chalk.yellow(" ⚠ tool call missing its name/fence — asking the model to re-emit a proper ```tool block\n"));
1166
- messages.push({ role: "assistant", content: assistantText.visible });
1167
- messages.push(recoveryUserMessage(buildLikeTurn && !activePlan
1168
- ? "Your previous message was a bare JSON args object with no tool name and no ```tool fence, so NOTHING ran. " +
1172
+ // ── Thinking-only recovery ────────────────────────────────────────
1173
+ // Some models (eg gpt-oss-20b on NVIDIA NIM) occasionally spend their
1174
+ // entire budget on hidden <think> reasoning and emit no visible text
1175
+ // or tool call. Without this guard the agent silently returns an empty
1176
+ // answer and the user has to re-submit the same prompt.
1177
+ if (!assistantText.visible.trim() && !call && assistantText.hasThinking) {
1178
+ emptyVisibleRetries += 1;
1179
+ if (emptyVisibleRetries <= 2) {
1180
+ process.stdout.write(`${renderThinkingSummary(assistantText.thinkContent)}\n`);
1181
+ process.stdout.write(chalk.yellow(" ⚠ model produced only thinking — nudging it to take action\n"));
1182
+ messages.push({ role: "assistant", content: completion.text });
1183
+ const buildNudge = buildLikeTurn && !activePlan
1184
+ ? "You only produced internal reasoning with no visible answer or tool call. " +
1169
1185
  "This is a BUILD/SCAFFOLD task with NO plan yet. " +
1170
- "You MUST call plan.create using a proper ```tool block. For example:\n" +
1171
- '```tool\n{"name":"plan.create","args":{"goal":"scaffold todo app","detail":"...","tasks":["...","..."],"kind":"coding"}}\n```\n' +
1172
- "Do NOT use fs.write, fs.writeMany, shell.exec, or pkg.install yet."
1173
- : "Your previous message was a bare JSON args object with no tool name and no ```tool fence, so NOTHING ran. " +
1174
- "Reply with ONLY a fenced ```tool block of the form " +
1175
- '`{"name": "<tool>", "args": { ... }}`. For example, to read a PDF:\n' +
1176
- '```tool\n{"name":"pdf.read","args":{"path":"/abs/file.pdf"}}\n```\n' +
1177
- "Choose the correct tool name for the task and include those args."));
1186
+ "You MUST call plan.create using the ```tool format to create a comprehensive plan BEFORE writing any files or running any commands. " +
1187
+ "Do NOT use fs.write, fs.writeMany, fs.edit, shell.exec, shell.start, or pkg.install yet. " +
1188
+ "Your ONLY allowed action right now is plan.create (or read/list for exploration)."
1189
+ : "You only produced internal reasoning with no visible answer or tool call. " +
1190
+ "You MUST either call a tool using the ```tool format or provide your final answer. " +
1191
+ "Do NOT wrap your tool call inside considering or reasoning tags — put it in the VISIBLE response, not hidden. " +
1192
+ "If images are attached, inspect them directly for visual details (text, colors, layout, spacing, style) instead of using OCR unless explicitly needed. " +
1193
+ "Do NOT just think take action NOW.";
1194
+ messages.push(recoveryUserMessage(buildNudge));
1178
1195
  continue;
1179
1196
  }
1180
- // Exhausted retries — fall through to the normal answer path.
1197
+ // Exhausted retries — fall through to the normal empty-answer path
1198
+ // which will print a warning and return.
1181
1199
  }
1182
- // Detect the case where the model emitted sentinel-style tool-call
1183
- // markers but the body was malformed or truncated. Printing those
1184
- // raw tokens looks like a crash to the user — instead, ask the
1185
- // model to retry the tool call in a clean JSON format.
1186
- if (/<\|tool_call(?:s_section)?_begin\|>|<\|tool_call_argument_begin\|>/i.test(assistantText.visible)) {
1187
- process.stdout.write(chalk.yellow(" ⚠ tool call was malformed or cut off — asking the model to retry in JSON form\n"));
1188
- messages.push({ role: "assistant", content: assistantText.visible });
1189
- messages.push(recoveryUserMessage("Your previous tool call was malformed or truncated. " +
1190
- "Reply with ONLY a fenced ```tool block containing valid JSON " +
1191
- 'of the form `{"name": "<tool>", "args": { ... }}`. ' +
1192
- "Do not use <|tool_call_begin|> markers."));
1193
- continue;
1200
+ else {
1201
+ // Reset the counter on any successful visible output or recovered call.
1202
+ emptyVisibleRetries = 0;
1194
1203
  }
1195
- // Detect a tool call that opened but was cut off by the token limit
1196
- // (most common with a large multi-file fs.writeMany). Retrying with a
1197
- // nudge to split the work is far better than rendering broken JSON as
1198
- // a final answer and leaving the project half-created.
1199
- if (looksLikeTruncatedToolCall(assistantText.visible)) {
1200
- truncatedToolRetries += 1;
1201
- if (truncatedToolRetries <= 3) {
1202
- process.stdout.write(chalk.yellow(" ⚠ tool call was cut off (output too long) — asking the model to retry in smaller pieces\n"));
1203
- messages.push({ role: "assistant", content: assistantText.visible });
1204
- messages.push({
1205
- role: "user",
1206
- content: "Your previous tool call was cut off before it finished — the JSON was incomplete, so NOTHING ran. " +
1207
- "Retry now with a COMPLETE, valid ```tool block. " +
1208
- "If it was a large fs.writeMany, split it into SMALLER batches (3-5 files per call, and keep each file's content concise) " +
1209
- "so the whole JSON fits in one response. Do NOT claim any file was written until a tool call actually succeeds.",
1210
- });
1211
- continue;
1204
+ // `call` was already extracted above (from visible text or thinking content).
1205
+ // Recovery: the model meant to call a tool but emitted a bare JSON object
1206
+ // with no ```tool fence — either a complete {name,args} the strict
1207
+ // matchers missed (recover it directly), or just an args object like
1208
+ // {"path":"file.pdf"} with the wrapper dropped (nudge a retry below so
1209
+ // the requested action runs instead of the JSON leaking as the answer).
1210
+ let bareArgsOnly = false;
1211
+ recoveredFromBareJson = false;
1212
+ if (!call) {
1213
+ const bare = recognizeBareToolJson(assistantText.visible);
1214
+ if (bare?.call) {
1215
+ call = bare.call;
1216
+ recoveredFromBareJson = true;
1217
+ process.stdout.write(chalk.dim(" recovered an unfenced tool call from bare JSON\n"));
1218
+ }
1219
+ else if (bare?.argsOnly) {
1220
+ bareArgsOnly = true;
1221
+ }
1222
+ }
1223
+ // Also check thinking content for bare JSON calls.
1224
+ if (!call && assistantText.hasThinking) {
1225
+ const bareThink = recognizeBareToolJson(assistantText.thinkContent);
1226
+ if (bareThink?.call) {
1227
+ call = bareThink.call;
1228
+ recoveredFromBareJson = true;
1229
+ process.stdout.write(chalk.dim(" ℹ recovered an unfenced tool call from thinking content\n"));
1230
+ }
1231
+ else if (bareThink?.argsOnly) {
1232
+ bareArgsOnly = true;
1212
1233
  }
1213
- // Exhausted retries — fall through so we don't loop forever, but the
1214
- // user at least sees the (broken) output and the stop notice.
1215
1234
  }
1216
- // Detect a ```tool fence whose JSON could NOT be parsed for any other
1217
- // reason (malformed braces, trailing junk, a stray `}` — NOT plain
1218
- // truncation, which is handled above). Without this, the raw block
1219
- // leaks to the screen as a code fence and the requested action (often
1220
- // a whole fs.writeMany scaffold) silently never runs — exactly the
1221
- // "fs.writeMany printed but nothing created" failure. Require the fence
1222
- // to actually look like an intended call (mentions name/args) so a
1223
- // genuine ```tool code example in prose isn't mistaken for one.
1224
- const hasFencedCallShape = countToolFences(assistantText.visible) > 0 &&
1225
- /```tool\s*\n[\s\S]*?"(?:name|args)"\s*:/i.test(assistantText.visible);
1226
- if (hasFencedCallShape) {
1227
- malformedFenceRetries += 1;
1228
- if (malformedFenceRetries <= 3) {
1229
- process.stdout.write(chalk.yellow(" tool block present but its JSON didn't parse — asking the model to re-emit valid JSON\n"));
1235
+ if (!call) {
1236
+ if (bareArgsOnly) {
1237
+ bareToolJsonRetries += 1;
1238
+ if (bareToolJsonRetries <= 3) {
1239
+ process.stdout.write(chalk.yellow(" ⚠ tool call missing its name/fence asking the model to re-emit a proper ```tool block\n"));
1240
+ messages.push({ role: "assistant", content: assistantText.visible });
1241
+ messages.push(recoveryUserMessage(buildLikeTurn && !activePlan
1242
+ ? "Your previous message was a bare JSON args object with no tool name and no ```tool fence, so NOTHING ran. " +
1243
+ "This is a BUILD/SCAFFOLD task with NO plan yet. " +
1244
+ "You MUST call plan.create using a proper ```tool block. For example:\n" +
1245
+ '```tool\n{"name":"plan.create","args":{"goal":"scaffold todo app","detail":"...","tasks":["...","..."],"kind":"coding"}}\n```\n' +
1246
+ "Do NOT use fs.write, fs.writeMany, shell.exec, or pkg.install yet."
1247
+ : "Your previous message was a bare JSON args object with no tool name and no ```tool fence, so NOTHING ran. " +
1248
+ "Reply with ONLY a fenced ```tool block of the form " +
1249
+ '`{"name": "<tool>", "args": { ... }}`. For example, to read a PDF:\n' +
1250
+ '```tool\n{"name":"pdf.read","args":{"path":"/abs/file.pdf"}}\n```\n' +
1251
+ "Choose the correct tool name for the task and include those args."));
1252
+ continue;
1253
+ }
1254
+ // Exhausted retries — fall through to the normal answer path.
1255
+ }
1256
+ // Detect the case where the model emitted sentinel-style tool-call
1257
+ // markers but the body was malformed or truncated. Printing those
1258
+ // raw tokens looks like a crash to the user — instead, ask the
1259
+ // model to retry the tool call in a clean JSON format.
1260
+ if (/<\|tool_call(?:s_section)?_begin\|>|<\|tool_call_argument_begin\|>/i.test(assistantText.visible)) {
1261
+ process.stdout.write(chalk.yellow(" ⚠ tool call was malformed or cut off — asking the model to retry in JSON form\n"));
1230
1262
  messages.push({ role: "assistant", content: assistantText.visible });
1231
- messages.push({
1232
- role: "user",
1233
- content: "Your previous message contained a ```tool block, but its JSON was INVALID, so NOTHING ran. " +
1234
- "Common causes: an extra or missing `}` / `]`, a trailing brace after the closing `}`, or unescaped quotes/newlines inside a string value. " +
1235
- 'Re-emit ONE valid ```tool block of the exact form {"name":"<tool>","args":{...}} with balanced braces. ' +
1236
- "If it was a large fs.writeMany, split it into SMALLER batches (3-5 files) so the JSON is easy to keep valid. " +
1237
- "Do NOT claim any file was written until a tool call actually succeeds.",
1238
- });
1263
+ messages.push(recoveryUserMessage("Your previous tool call was malformed or truncated. " +
1264
+ "Reply with ONLY a fenced ```tool block containing valid JSON " +
1265
+ 'of the form `{"name": "<tool>", "args": { ... }}`. ' +
1266
+ "Do not use <|tool_call_begin|> markers."));
1239
1267
  continue;
1240
1268
  }
1241
- // Exhausted retries — fall through to the normal path.
1242
- }
1243
- // Normal final-answer path: strip any stray sentinel tokens that
1244
- // somehow leaked into prose so the answer renders cleanly.
1245
- const cleaned = stripSentinelTokens(assistantText.visible);
1246
- if (freshWebSearchRequired && !sawFreshWebSearch && !freshnessRetryUsed) {
1247
- freshnessRetryUsed = true;
1248
- process.stdout.write(chalk.dim(" current-info question detectedsearching the web before answering\n"));
1249
- messages.push({ role: "assistant", content: assistantText.visible });
1250
- messages.push({
1251
- role: "user",
1252
- content: freshnessGuardMessage() +
1253
- " Reply with ONLY a fenced ```tool block for web.search now.",
1254
- });
1255
- continue;
1256
- }
1257
- // ── Premature-completion guard (approved plan still has work) ──────
1258
- // If the user approved a plan and the model now gives a final answer
1259
- // while tasks are still pending/in_progress without having run the
1260
- // work — it is fabricating completion (the exact "all tasks completed,
1261
- // running at localhost:5173" failure). Force it back to executing the
1262
- // next real task instead of accepting the false claim.
1263
- if (session.planApproved.value && prematureCompletionRetries < 3) {
1264
- const livePlan = await loadPlan(session.sessionId).catch(() => undefined);
1265
- const unfinished = livePlan?.tasks.filter((t) => t.state === "pending" || t.state === "in_progress");
1266
- if (livePlan && unfinished && unfinished.length > 0) {
1267
- prematureCompletionRetries += 1;
1268
- const next = unfinished[0];
1269
- process.stdout.write(chalk.yellow(` ⚠ ${unfinished.length} plan task(s) still unfinished not accepting a "done" claim; resuming execution\n`));
1269
+ // Detect a tool call that opened but was cut off by the token limit
1270
+ // (most common with a large multi-file fs.writeMany). Retrying with a
1271
+ // nudge to split the work is far better than rendering broken JSON as
1272
+ // a final answer and leaving the project half-created.
1273
+ if (looksLikeTruncatedToolCall(assistantText.visible)) {
1274
+ truncatedToolRetries += 1;
1275
+ if (truncatedToolRetries <= 3) {
1276
+ process.stdout.write(chalk.yellow(" tool call was cut off (output too long) asking the model to retry in smaller pieces\n"));
1277
+ messages.push({ role: "assistant", content: assistantText.visible });
1278
+ messages.push({
1279
+ role: "user",
1280
+ content: "Your previous tool call was cut off before it finished — the JSON was incomplete, so NOTHING ran. " +
1281
+ "Retry now with a COMPLETE, valid ```tool block. " +
1282
+ "If it was a large fs.writeMany, split it into SMALLER batches (3-5 files per call, and keep each file's content concise) " +
1283
+ "so the whole JSON fits in one response. Do NOT claim any file was written until a tool call actually succeeds.",
1284
+ });
1285
+ continue;
1286
+ }
1287
+ // Exhausted retries — fall through so we don't loop forever, but the
1288
+ // user at least sees the (broken) output and the stop notice.
1289
+ }
1290
+ // Detect a ```tool fence whose JSON could NOT be parsed for any other
1291
+ // reason (malformed braces, trailing junk, a stray `}` — NOT plain
1292
+ // truncation, which is handled above). Without this, the raw block
1293
+ // leaks to the screen as a code fence and the requested action (often
1294
+ // a whole fs.writeMany scaffold) silently never runs — exactly the
1295
+ // "fs.writeMany printed but nothing created" failure. Require the fence
1296
+ // to actually look like an intended call (mentions name/args) so a
1297
+ // genuine ```tool code example in prose isn't mistaken for one.
1298
+ const hasFencedCallShape = countToolFences(assistantText.visible) > 0 &&
1299
+ /```tool\s*\n[\s\S]*?"(?:name|args)"\s*:/i.test(assistantText.visible);
1300
+ if (hasFencedCallShape) {
1301
+ malformedFenceRetries += 1;
1302
+ if (malformedFenceRetries <= 3) {
1303
+ process.stdout.write(chalk.yellow(" ⚠ tool block present but its JSON didn't parse — asking the model to re-emit valid JSON\n"));
1304
+ messages.push({ role: "assistant", content: assistantText.visible });
1305
+ messages.push({
1306
+ role: "user",
1307
+ content: "Your previous message contained a ```tool block, but its JSON was INVALID, so NOTHING ran. " +
1308
+ "Common causes: an extra or missing `}` / `]`, a trailing brace after the closing `}`, or unescaped quotes/newlines inside a string value. " +
1309
+ 'Re-emit ONE valid ```tool block of the exact form {"name":"<tool>","args":{...}} with balanced braces. ' +
1310
+ "If it was a large fs.writeMany, split it into SMALLER batches (3-5 files) so the JSON is easy to keep valid. " +
1311
+ "Do NOT claim any file was written until a tool call actually succeeds.",
1312
+ });
1313
+ continue;
1314
+ }
1315
+ // Exhausted retries — fall through to the normal path.
1316
+ }
1317
+ // Normal final-answer path: strip any stray sentinel tokens that
1318
+ // somehow leaked into prose so the answer renders cleanly.
1319
+ const cleaned = stripSentinelTokens(assistantText.visible);
1320
+ if (freshWebSearchRequired && !sawFreshWebSearch && !freshnessRetryUsed) {
1321
+ freshnessRetryUsed = true;
1322
+ process.stdout.write(chalk.dim(" ℹ current-info question detected — searching the web before answering\n"));
1270
1323
  messages.push({ role: "assistant", content: assistantText.visible });
1271
1324
  messages.push({
1272
1325
  role: "user",
1273
- content: `You have NOT finished the approved plan: ${unfinished.length} task(s) remain ` +
1274
- `(${unfinished.map((t) => `[${t.id}] ${t.title}`).join("; ")}). ` +
1275
- `Do NOT claim the work is complete, that files were created, or that a server is running ` +
1276
- `unless a tool call actually succeeded and you saw the output. ` +
1277
- `Resume now with the NEXT task ${next.id} ("${next.title}"): call task.update {taskId:"${next.id}", state:"in_progress"}, ` +
1278
- `then do the real work with a tool call (fs.writeMany / shell.exec / shell.start), VERIFY it, and mark it done. ` +
1279
- `Continue task by task until EVERY task is actually finished.`,
1326
+ content: freshnessGuardMessage() +
1327
+ " Reply with ONLY a fenced ```tool block for web.search now.",
1280
1328
  });
1281
1329
  continue;
1282
1330
  }
1283
- }
1284
- // If we still print a final answer while an approved plan has unfinished
1285
- // tasks (retries exhausted), do NOT let a fabricated "it's done" stand
1286
- // unchallenged — append an explicit, honest status so the user knows the
1287
- // build did not actually complete.
1288
- let completionWarning = "";
1289
- if (session.planApproved.value) {
1290
- const livePlan = await loadPlan(session.sessionId).catch(() => undefined);
1291
- const unfinished = livePlan?.tasks.filter((t) => t.state === "pending" || t.state === "in_progress");
1292
- if (livePlan && unfinished && unfinished.length > 0) {
1293
- completionWarning =
1294
- chalk.yellow(`\n ⚠ ${unfinished.length} of ${livePlan.tasks.length} plan task(s) are NOT actually complete:\n`) +
1295
- unfinished
1296
- .map((t) => chalk.yellow(` • [${t.id}] ${t.title}`))
1297
- .join("\n") +
1298
- chalk.dim("\n The summary above may overstate progress. Re-run with /implement, or ask clai to finish the remaining tasks.\n");
1331
+ // ── Premature-completion guard (approved plan still has work) ──────
1332
+ // If the user approved a plan and the model now gives a final answer
1333
+ // while tasks are still pending/in_progress without having run the
1334
+ // work — it is fabricating completion (the exact "all tasks completed,
1335
+ // running at localhost:5173" failure). Force it back to executing the
1336
+ // next real task instead of accepting the false claim.
1337
+ if (session.planApproved.value && prematureCompletionRetries < 3) {
1338
+ const livePlan = await loadPlan(session.sessionId).catch(() => undefined);
1339
+ const unfinished = livePlan?.tasks.filter((t) => t.state === "pending" || t.state === "in_progress");
1340
+ if (livePlan && unfinished && unfinished.length > 0) {
1341
+ prematureCompletionRetries += 1;
1342
+ const next = unfinished[0];
1343
+ process.stdout.write(chalk.yellow(` ⚠ ${unfinished.length} plan task(s) still unfinished — not accepting a "done" claim; resuming execution\n`));
1344
+ messages.push({ role: "assistant", content: assistantText.visible });
1345
+ messages.push({
1346
+ role: "user",
1347
+ content: `You have NOT finished the approved plan: ${unfinished.length} task(s) remain ` +
1348
+ `(${unfinished.map((t) => `[${t.id}] ${t.title}`).join("; ")}). ` +
1349
+ `Do NOT claim the work is complete, that files were created, or that a server is running ` +
1350
+ `unless a tool call actually succeeded and you saw the output. ` +
1351
+ `Resume now with the NEXT task ${next.id} ("${next.title}"): call task.update {taskId:"${next.id}", state:"in_progress"}, ` +
1352
+ `then do the real work with a tool call (fs.writeMany / shell.exec / shell.start), VERIFY it, and mark it done. ` +
1353
+ `Continue task by task until EVERY task is actually finished.`,
1354
+ });
1355
+ continue;
1356
+ }
1299
1357
  }
1358
+ // If we still print a final answer while an approved plan has unfinished
1359
+ // tasks (retries exhausted), do NOT let a fabricated "it's done" stand
1360
+ // unchallenged — append an explicit, honest status so the user knows the
1361
+ // build did not actually complete.
1362
+ let completionWarning = "";
1363
+ if (session.planApproved.value) {
1364
+ const livePlan = await loadPlan(session.sessionId).catch(() => undefined);
1365
+ const unfinished = livePlan?.tasks.filter((t) => t.state === "pending" || t.state === "in_progress");
1366
+ if (livePlan && unfinished && unfinished.length > 0) {
1367
+ completionWarning =
1368
+ chalk.yellow(`\n ⚠ ${unfinished.length} of ${livePlan.tasks.length} plan task(s) are NOT actually complete:\n`) +
1369
+ unfinished
1370
+ .map((t) => chalk.yellow(` • [${t.id}] ${t.title}`))
1371
+ .join("\n") +
1372
+ chalk.dim("\n The summary above may overstate progress. Re-run with /implement, or ask clai to finish the remaining tasks.\n");
1373
+ }
1374
+ }
1375
+ if (cleaned) {
1376
+ process.stdout.write(renderMarkdown(cleaned));
1377
+ if (!cleaned.endsWith("\n"))
1378
+ process.stdout.write("\n");
1379
+ }
1380
+ if (completionWarning) {
1381
+ process.stdout.write(completionWarning);
1382
+ }
1383
+ if (assistantText.hasThinking) {
1384
+ process.stdout.write(`${renderThinkingSummary(assistantText.thinkContent)}\n`);
1385
+ }
1386
+ await auditLog("agent.final", { provider, model, steps: step + 1 });
1387
+ lastAnswer = cleaned;
1388
+ return lastAnswer;
1300
1389
  }
1301
- if (cleaned) {
1302
- process.stdout.write(renderMarkdown(cleaned));
1303
- if (!cleaned.endsWith("\n"))
1304
- process.stdout.write("\n");
1305
- }
1306
- if (completionWarning) {
1307
- process.stdout.write(completionWarning);
1390
+ // A valid primary tool call exists for this fresh model turn. Show any
1391
+ // prose / thinking that preceded it, record the assistant message ONCE,
1392
+ // then queue any additional tool calls from the same message so they
1393
+ // run in order on the next iterations (no extra round-trip).
1394
+ const beforeTool = recoveredFromBareJson
1395
+ ? ""
1396
+ : textBeforeToolCall(assistantText.visible);
1397
+ if (beforeTool) {
1398
+ process.stdout.write(renderMarkdown(beforeTool) + "\n");
1308
1399
  }
1309
1400
  if (assistantText.hasThinking) {
1310
1401
  process.stdout.write(`${renderThinkingSummary(assistantText.thinkContent)}\n`);
1311
1402
  }
1312
- await auditLog("agent.final", { provider, model, steps: step + 1 });
1313
- lastAnswer = cleaned;
1314
- return lastAnswer;
1403
+ messages.push({ role: "assistant", content: assistantText.visible });
1404
+ if (!recoveredFromBareJson && call) {
1405
+ const allCalls = parseAllToolCalls(assistantText.visible);
1406
+ if (allCalls.length > 1 &&
1407
+ allCalls[0] &&
1408
+ sameToolCall(allCalls[0], call)) {
1409
+ pendingCalls = allCalls.slice(1);
1410
+ process.stdout.write(chalk.dim(` ℹ ${allCalls.length} tool calls in this message — running them in order\n`));
1411
+ }
1412
+ }
1315
1413
  }
1414
+ // Type guard: every path above either set `call` or returned/continued.
1415
+ if (!call)
1416
+ continue;
1316
1417
  // ── Duplicate-call detection ──────────────────────────────────────────
1317
1418
  // If the model calls the exact same tool with the exact same args
1318
1419
  // repeatedly, it's stuck in a loop. Inject a corrective message
@@ -1323,7 +1424,9 @@ export async function runAgentLoop(prompt, options = {}) {
1323
1424
  call.name === "fs.writeMany" ||
1324
1425
  call.name === "fs.edit";
1325
1426
  process.stdout.write(chalk.yellow(` ⚠ ${call.name} was already called with the same arguments — ${isWrite ? "moving on" : "forcing summary"}\n`));
1326
- messages.push({ role: "assistant", content: assistantText.visible });
1427
+ // A repeat means this batch went off the rails — drop any queued calls
1428
+ // and let the model react. The assistant message was already recorded.
1429
+ pendingCalls = [];
1327
1430
  messages.push({
1328
1431
  role: "user",
1329
1432
  content: isWrite
@@ -1338,27 +1441,6 @@ export async function runAgentLoop(prompt, options = {}) {
1338
1441
  if (loopCheck.reason) {
1339
1442
  process.stdout.write(chalk.dim(` ℹ ${loopCheck.reason}\n`));
1340
1443
  }
1341
- // Print only non-thinking text before the tool call. When the call was
1342
- // recovered from a bare JSON object (the whole message WAS the call),
1343
- // there is no prose to show — skip it so we don't echo the raw JSON.
1344
- const beforeTool = recoveredFromBareJson
1345
- ? ""
1346
- : textBeforeToolCall(assistantText.visible);
1347
- if (beforeTool) {
1348
- process.stdout.write(renderMarkdown(beforeTool) + "\n");
1349
- }
1350
- if (assistantText.hasThinking) {
1351
- process.stdout.write(`${renderThinkingSummary(assistantText.thinkContent)}\n`);
1352
- }
1353
- messages.push({ role: "assistant", content: assistantText.visible });
1354
- // Detect a model that crammed MULTIPLE tool calls into one response.
1355
- // Only `call` (the first block) will run this turn; the rest are dropped.
1356
- // We flag it so that after the first tool executes we explicitly tell the
1357
- // model the others did NOT run — preventing the "I ran everything" lie.
1358
- const extraToolBlocks = Math.max(0, countToolFences(assistantText.visible) - 1);
1359
- if (extraToolBlocks > 0) {
1360
- process.stdout.write(chalk.yellow(` ⚠ ${extraToolBlocks} extra tool block(s) in one message were ignored — only the first ran. One tool per turn.\n`));
1361
- }
1362
1444
  // ── Plan / task tools (session-scoped, handled inline) ─────────────
1363
1445
  // These don't go through the generic registry because they need the
1364
1446
  // session id and mutate the live plan that the user can view (Ctrl+P).
@@ -1371,6 +1453,10 @@ export async function runAgentLoop(prompt, options = {}) {
1371
1453
  productiveSteps += 1;
1372
1454
  loopGuard.recordAttempt(step, call.name, call.args, planResult.ok, 0);
1373
1455
  process.stdout.write(planResult.display);
1456
+ // plan.create means "STOP and wait for /implement" — abandon any
1457
+ // other calls the model batched alongside it.
1458
+ if (call.name === "plan.create")
1459
+ pendingCalls = [];
1374
1460
  messages.push({
1375
1461
  role: "tool",
1376
1462
  content: `Tool ${call.name} result (ok=${planResult.ok}):\n${planResult.modelNote}`,
@@ -1398,6 +1484,7 @@ export async function runAgentLoop(prompt, options = {}) {
1398
1484
  !session.planApproved.value &&
1399
1485
  !isPreApprovalAllowedTool(call.name)) {
1400
1486
  process.stdout.write(chalk.yellow(` ⚠ plan awaiting approval — ${call.name} is blocked until you /implement (or /discard)\n`));
1487
+ pendingCalls = [];
1401
1488
  messages.push({
1402
1489
  role: "user",
1403
1490
  content: `There is an ACTIVE PLAN that has NOT been approved yet, so you must NOT execute it — ` +
@@ -1538,6 +1625,13 @@ export async function runAgentLoop(prompt, options = {}) {
1538
1625
  const errMsg = toolError instanceof Error ? toolError.message : String(toolError);
1539
1626
  result = { ok: false, output: `Tool error: ${errMsg}`, exitCode: 1 };
1540
1627
  }
1628
+ // Stop-on-error: if this call failed, abandon any remaining queued calls
1629
+ // from the same message so the model sees the failure and decides what to
1630
+ // do next instead of blindly running steps that depended on it.
1631
+ if (!result.ok && pendingCalls.length > 0) {
1632
+ process.stdout.write(chalk.dim(` ↳ ${pendingCalls.length} queued call(s) cancelled because this step failed\n`));
1633
+ pendingCalls = [];
1634
+ }
1541
1635
  const output = result.output.trim();
1542
1636
  const displayMax = 6_000;
1543
1637
  // If the tool already produced an artifact (shell.exec now streams to one
@@ -1651,10 +1745,7 @@ export async function runAgentLoop(prompt, options = {}) {
1651
1745
  }
1652
1746
  messages.push({
1653
1747
  role: "tool",
1654
- content: `Tool ${call.name} result (exit=${result.exitCode ?? 0}, ok=${result.ok}):\n${contextOutput}` +
1655
- (extraToolBlocks > 0
1656
- ? `\n\nIMPORTANT: your previous message contained ${extraToolBlocks + 1} tool blocks, but ONLY this first one (${call.name}) actually ran. The other ${extraToolBlocks} did NOT execute and were discarded. Emit EXACTLY ONE tool block per message. Send the next tool call now — and do NOT assume any of the dropped calls happened.`
1657
- : ""),
1748
+ content: `Tool ${call.name} result (exit=${result.exitCode ?? 0}, ok=${result.ok}):\n${contextOutput}`,
1658
1749
  });
1659
1750
  // Compact older messages when the running estimate exceeds budget so
1660
1751
  // free-tier context windows are not blown by long pentest sessions.