@pentoshi/clai 1.1.4 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agent/runner.d.ts +15 -0
- package/dist/agent/runner.js +387 -296
- package/dist/agent/runner.js.map +1 -1
- package/dist/commands/update.js +1 -1
- package/dist/prompts/index.d.ts +2 -2
- package/dist/prompts/index.js +107 -383
- package/dist/prompts/index.js.map +1 -1
- package/dist/repl.js +109 -26
- package/dist/repl.js.map +1 -1
- package/dist/safety/classifier.d.ts +7 -0
- package/dist/safety/classifier.js +89 -79
- package/dist/safety/classifier.js.map +1 -1
- package/dist/safety/patterns.d.ts +15 -0
- package/dist/safety/patterns.js +47 -20
- package/dist/safety/patterns.js.map +1 -1
- package/dist/tools/command-intent.js +22 -4
- package/dist/tools/command-intent.js.map +1 -1
- package/dist/tools/registry.d.ts +7 -0
- package/dist/tools/registry.js +89 -6
- package/dist/tools/registry.js.map +1 -1
- package/dist/ui/thinking.d.ts +10 -1
- package/dist/ui/thinking.js +81 -17
- package/dist/ui/thinking.js.map +1 -1
- package/package.json +1 -1
package/dist/agent/runner.js
CHANGED
|
@@ -380,6 +380,53 @@ export function countToolFences(text) {
|
|
|
380
380
|
const matches = text.match(/```tool\s*\n[\s\S]*?```/gi);
|
|
381
381
|
return matches ? matches.length : 0;
|
|
382
382
|
}
|
|
383
|
+
/**
 * Collect every explicitly-delimited tool call found in `text` and return
 * them in the order they appear in the message.
 *
 * parseToolCall only yields the first call; this variant exists so the runner
 * can execute a whole batch emitted in a single turn (for example
 * "task.update in_progress → work → task.update done", or several fs.write
 * calls). Only the delimited formats are scanned — ```tool fences,
 * <tool_call> XML blocks and Kimi sentinel blocks — which keeps worked
 * examples in prose from being mistaken for real calls.
 *
 * Candidates that tryParseCall rejects are silently skipped. The runner is
 * expected to run the result sequentially and stop on the first failure.
 *
 * @param text Raw assistant message to scan.
 * @returns Parsed tool calls, sorted by their offset in `text`.
 */
export function parseAllToolCalls(text) {
    const located = [];
    // Walk every match of `pattern`, turn it into a payload string, and keep
    // the ones that parse into a call together with their source offset.
    const collect = (pattern, toPayload) => {
        for (let hit = pattern.exec(text); hit !== null; hit = pattern.exec(text)) {
            const parsed = tryParseCall(toPayload(hit));
            if (parsed) {
                located.push({ index: hit.index, call: parsed });
            }
        }
    };
    // Fences and XML blocks carry the call JSON directly in capture group 1.
    const capturedBody = (hit) => hit[1] ?? "";
    collect(/```tool\s*\n?([\s\S]*?)```/gi, capturedBody);
    collect(/<tool_call>([\s\S]*?)<\/tool_call>/gi, capturedBody);
    // Kimi blocks capture name and raw args separately; rebuild the
    // {name,args} envelope so they go through the same parser.
    collect(new RegExp(KIMI_TOOL_CALL_RE.source, "gi"), (hit) => JSON.stringify({ name: hit[1], args: tryJson(hit[2] ?? "{}") ?? {} }));
    // The three scans ran independently, so restore document order.
    located.sort((left, right) => left.index - right.index);
    return located.map((entry) => entry.call);
}
|
|
419
|
+
/**
 * Serialise tool-call args to a canonical JSON string in which object keys
 * are sorted at every nesting level, so two args objects that differ only in
 * key order produce the same text.
 *
 * Throws whatever JSON.stringify throws (e.g. on cyclic structures or BigInt).
 * Returns undefined when the args serialise to nothing (undefined, function).
 */
function canonicalArgsJson(args) {
    // First pass: plain stringify. This rejects cycles/BigInt up front and
    // applies any toJSON hooks, leaving pure JSON data to normalise.
    const raw = JSON.stringify(args);
    if (raw === undefined) {
        return undefined;
    }
    return JSON.stringify(JSON.parse(raw), (_key, value) => {
        if (value === null || typeof value !== "object" || Array.isArray(value)) {
            return value;
        }
        // Object.fromEntries defines own properties, so even a literal
        // "__proto__" key survives the re-ordering.
        return Object.fromEntries(Object.entries(value).sort(([left], [right]) => (left < right ? -1 : left > right ? 1 : 0)));
    });
}
/**
 * Structural equality for two tool calls (name + canonical args JSON).
 *
 * Two calls are the same when their names match and their args are deeply
 * equal as JSON data. Object key order is ignored (array order is not), so
 * `{path, encoding}` and `{encoding, path}` compare equal.
 *
 * @param a First tool call ({ name, args }).
 * @param b Second tool call ({ name, args }).
 * @returns true when both calls are structurally identical; false otherwise,
 *          including when either args value cannot be serialised.
 */
export function sameToolCall(a, b) {
    if (a.name !== b.name)
        return false;
    try {
        return canonicalArgsJson(a.args) === canonicalArgsJson(b.args);
    }
    catch {
        // Unserialisable args (cycles, BigInt) can never be proven equal.
        return false;
    }
}
|
|
383
430
|
/** Extract the text before the tool call block for display purposes */
|
|
384
431
|
function textBeforeToolCall(text) {
|
|
385
432
|
const patterns = [
|
|
@@ -529,7 +576,8 @@ function freshnessGuardMessage(now = new Date()) {
|
|
|
529
576
|
return (`Freshness guard for this turn: the latest user prompt appears to ask for current, volatile, or externally verifiable information. The present moment is ${currentDateTimeContext(now)}. ` +
|
|
530
577
|
"Before answering, call web.search FIRST with a concise query derived from the user prompt. " +
|
|
531
578
|
"Shape the search query for the newest timeline by including current/latest or the current year/month when useful. " +
|
|
532
|
-
"
|
|
579
|
+
"Do not answer from the snippets alone when detail matters — set fetchTop (e.g. fetchTop:2) to read the top result pages, or follow up with web.fetch on the most relevant URL, then answer from what the pages actually say and cite them. " +
|
|
580
|
+
"If web.search fails or has no results, say that current information is unavailable instead of guessing from memory.");
|
|
533
581
|
}
|
|
534
582
|
/**
|
|
535
583
|
* Directive injected for build/scaffold turns. Forces the careful
|
|
@@ -542,7 +590,7 @@ function buildWorkflowDirective() {
|
|
|
542
590
|
"1. EXPLORE: fs.list the working directory (and key subdirs) to see what already exists. Use tool.batch to parallelize reads.",
|
|
543
591
|
"2. UNDERSTAND: fs.read the files that matter (like package.json for js related and same for other languages too, config, entry points, existing components). Detect the existing stack/tooling and MATCH it. If the dir is empty or only has a stub, start fresh with a sensible modern default and say so.",
|
|
544
592
|
"3. PLAN: call plan.create with a COMPREHENSIVE plan — a detailed `detail` (stack chosen and WHY, architecture, how you'll verify) and 4-8 SEPARATE, ordered, high-quality tasks. The FIRST task initializes the project (scaffolder); the MIDDLE tasks MUST implement the ACTUAL FEATURE the user asked for by REPLACING the scaffolder's boilerplate (e.g. rewrite src/App.jsx into the real todo/blog/etc. UI, add components, state, styles); the LAST task verifies with a build. Scaffolding + install + run ALONE is NOT acceptable — that just leaves the Vite starter page. Each task is one distinct, verifiable action. Then STOP and wait for the user to /implement.",
|
|
545
|
-
"4. IMPLEMENT: once approved, work task by task in STRICT ORDER
|
|
593
|
+
"4. IMPLEMENT: once approved, work task by task in STRICT ORDER. For each task: call task.update {taskId, state:'in_progress'} → do the real work → VERIFY it actually succeeded (read a file you wrote, check the command's exit/output) → call task.update {taskId, state:'done'}, then move to the NEXT task. You MAY emit several tool calls in one message and they run in order, top to bottom (the batch STOPS if one fails). A clean rhythm is: task.update in_progress + the work + task.update done together. Keep going until EVERY task is done. Do NOT claim work you didn't actually run.",
|
|
546
594
|
"",
|
|
547
595
|
"INITIALIZE WITH THE OFFICIAL SCAFFOLDER FIRST (do NOT hand-write build configs):",
|
|
548
596
|
"- React/Vue/Svelte/vanilla → `npm create vite@latest <appname> -- --template react` (templates: react, react-ts, vue, vue-ts, svelte, vanilla). Next.js → `npx --yes create-next-app@latest <appname> --yes --eslint --no-tailwind --app --src-dir --import-alias \"@/*\"`. Node API → `npm init -y`.",
|
|
@@ -552,7 +600,7 @@ function buildWorkflowDirective() {
|
|
|
552
600
|
"- VERIFY the init actually worked before marking the task done: fs.read package.json (it must now exist AND list react + react-dom) and fs.read index.html (it must reference your jsx entry). 'Operation cancelled' / non-zero exit means the task FAILED — do not proceed as if it succeeded.",
|
|
553
601
|
"",
|
|
554
602
|
"CRITICAL RULES during IMPLEMENTATION:",
|
|
555
|
-
"-
|
|
603
|
+
"- You may batch tool calls: emit one or several ```tool blocks in a message and they run in order, top to bottom. If any call fails, the rest of that batch is cancelled so you can react — so order dependent steps correctly and keep batches focused. A good batch is task.update(in_progress) + the work + task.update(done) for ONE task.",
|
|
556
604
|
"- Do NOT re-explore. Step 1 (EXPLORE) was already completed during planning. Start executing the first pending task immediately.",
|
|
557
605
|
"- ONE task at a time, in ORDER. Do NOT skip ahead to task 3 before task 2 is done.",
|
|
558
606
|
"- KEEP EACH FILE SMALL ENOUGH TO WRITE IN ONE CALL. If a fs.write is reported as 'cut off (output too long)', the file was NOT fully written and is likely broken/invalid — re-write it, splitting a large component into smaller files if needed. NEVER leave a half-written file and move on.",
|
|
@@ -975,6 +1023,17 @@ export async function runAgentLoop(prompt, options = {}) {
|
|
|
975
1023
|
// tasks are still pending and it never ran the work. We nudge it back to
|
|
976
1024
|
// executing the next task a bounded number of times before giving up.
|
|
977
1025
|
let prematureCompletionRetries = 0;
|
|
1026
|
+
// ── Multi-tool execution queue ─────────────────────────────────────
|
|
1027
|
+
// Models naturally emit several tool calls in one message — e.g. the
|
|
1028
|
+
// plan-execution rhythm "task.update in_progress → do the work →
|
|
1029
|
+
// task.update done", or a batch of fs.write calls. Rather than running
|
|
1030
|
+
// only the first and discarding the rest (which made models believe work
|
|
1031
|
+
// ran when it didn't, and broke plan execution), we parse ALL calls in a
|
|
1032
|
+
// message, run the first this iteration, and queue the rest here to run on
|
|
1033
|
+
// subsequent iterations WITHOUT another model round-trip. The queue is
|
|
1034
|
+
// cleared whenever a call fails, is blocked, or needs the model to react,
|
|
1035
|
+
// so the model always sees errors and stays in control.
|
|
1036
|
+
let pendingCalls = [];
|
|
978
1037
|
// ── Step budget ───────────────────────────────────────────────────
|
|
979
1038
|
// The budget governs how many *productive* steps (a tool execution or a
|
|
980
1039
|
// final answer) the agent may take. Recovery iterations — nudging a model
|
|
@@ -1018,301 +1077,343 @@ export async function runAgentLoop(prompt, options = {}) {
|
|
|
1018
1077
|
if (productiveSteps >= stepBudget)
|
|
1019
1078
|
break;
|
|
1020
1079
|
options.signal?.throwIfAborted();
|
|
1021
|
-
//
|
|
1022
|
-
//
|
|
1023
|
-
|
|
1024
|
-
|
|
1025
|
-
|
|
1026
|
-
|
|
1027
|
-
|
|
1028
|
-
|
|
1029
|
-
|
|
1030
|
-
|
|
1031
|
-
|
|
1032
|
-
|
|
1033
|
-
messages,
|
|
1034
|
-
temperature: 0.2,
|
|
1035
|
-
// Reasoning models can spend a lot on hidden thinking; give
|
|
1036
|
-
// them headroom so the visible answer / tool call isn't
|
|
1037
|
-
// truncated to silence. The non-thinking budget must be large
|
|
1038
|
-
// enough for a single-file fs.write / multi-file fs.writeMany
|
|
1039
|
-
// payload — a truncated tool-call JSON fails to parse and leaks a
|
|
1040
|
-
// broken (and syntactically invalid) file. 8k was too small for a
|
|
1041
|
-
// full component, so allow more room for the visible tool call.
|
|
1042
|
-
maxTokens: config.thinking?.enabled ? 16_384 : 12_288,
|
|
1043
|
-
signal: options.signal,
|
|
1044
|
-
thinking: config.thinking,
|
|
1045
|
-
}, (token) => {
|
|
1046
|
-
// Heuristic: <think>… markers and reasoning_content tokens flow
|
|
1047
|
-
// through onToken. Surface activity in the spinner so the screen
|
|
1048
|
-
// is never empty for minutes.
|
|
1049
|
-
if (!sawReasoning && /<think/i.test(token)) {
|
|
1050
|
-
sawReasoning = true;
|
|
1051
|
-
inThinking = true;
|
|
1052
|
-
spinner.setLabel("thinking");
|
|
1053
|
-
}
|
|
1054
|
-
if (/<\/think>/i.test(token)) {
|
|
1055
|
-
inThinking = false;
|
|
1056
|
-
}
|
|
1057
|
-
// Only push reasoning tokens to the spinner preview. Visible
|
|
1058
|
-
// answer / tool-call tokens should NOT go through the dim
|
|
1059
|
-
// spinner preview — doing so makes the final answer appear
|
|
1060
|
-
// "diluted" in light font when the spinner's last render
|
|
1061
|
-
// briefly shows the answer text before being erased.
|
|
1062
|
-
if (inThinking) {
|
|
1063
|
-
const cleaned = token.replace(/<\/?think[^>]*>/gi, "");
|
|
1064
|
-
if (cleaned) {
|
|
1065
|
-
spinner.pushPreview(cleaned);
|
|
1066
|
-
const approx = cleaned.split(/\s+/).filter(Boolean).length;
|
|
1067
|
-
if (approx > 0)
|
|
1068
|
-
spinner.bumpReasoning(approx);
|
|
1069
|
-
}
|
|
1070
|
-
}
|
|
1071
|
-
}, (status) => {
|
|
1072
|
-
spinner.stop();
|
|
1073
|
-
process.stdout.write(chalk.dim(status));
|
|
1074
|
-
});
|
|
1075
|
-
}
|
|
1076
|
-
finally {
|
|
1077
|
-
// Always clear the spinner — abort, network error, or success.
|
|
1078
|
-
spinner.stop();
|
|
1079
|
-
}
|
|
1080
|
-
provider = completion.provider;
|
|
1081
|
-
model = completion.model;
|
|
1082
|
-
const assistantText = rememberThinkingFromText(completion.text);
|
|
1083
|
-
// Try visible text first, then thinking content — some models (e.g. glm-5.1)
|
|
1084
|
-
// wrap tool calls inside considering tags, so stripThinking removes them
|
|
1085
|
-
// into thinkContent and visible becomes empty. Recovering from thinkContent
|
|
1086
|
-
// prevents an endless nudge loop where the model keeps hiding the call.
|
|
1087
|
-
let call = parseToolCall(assistantText.visible, {
|
|
1088
|
-
strict: getConfig().parserStrict,
|
|
1089
|
-
});
|
|
1090
|
-
if (!call && assistantText.hasThinking) {
|
|
1091
|
-
call = parseToolCall(assistantText.thinkContent, {
|
|
1092
|
-
strict: getConfig().parserStrict,
|
|
1093
|
-
});
|
|
1094
|
-
if (call) {
|
|
1095
|
-
process.stdout.write(chalk.dim(" ℹ recovered tool call from thinking content\n"));
|
|
1096
|
-
}
|
|
1097
|
-
}
|
|
1098
|
-
// ── Thinking-only recovery ────────────────────────────────────────
|
|
1099
|
-
// Some models (eg gpt-oss-20b on NVIDIA NIM) occasionally spend their
|
|
1100
|
-
// entire budget on hidden <think> reasoning and emit no visible text
|
|
1101
|
-
// or tool call. Without this guard the agent silently returns an empty
|
|
1102
|
-
// answer and the user has to re-submit the same prompt.
|
|
1103
|
-
if (!assistantText.visible.trim() && !call && assistantText.hasThinking) {
|
|
1104
|
-
emptyVisibleRetries += 1;
|
|
1105
|
-
if (emptyVisibleRetries <= 2) {
|
|
1106
|
-
process.stdout.write(`${renderThinkingSummary(assistantText.thinkContent)}\n`);
|
|
1107
|
-
process.stdout.write(chalk.yellow(" ⚠ model produced only thinking — nudging it to take action\n"));
|
|
1108
|
-
messages.push({ role: "assistant", content: completion.text });
|
|
1109
|
-
const buildNudge = buildLikeTurn && !activePlan
|
|
1110
|
-
? "You only produced internal reasoning with no visible answer or tool call. " +
|
|
1111
|
-
"This is a BUILD/SCAFFOLD task with NO plan yet. " +
|
|
1112
|
-
"You MUST call plan.create using the ```tool format to create a comprehensive plan BEFORE writing any files or running any commands. " +
|
|
1113
|
-
"Do NOT use fs.write, fs.writeMany, fs.edit, shell.exec, shell.start, or pkg.install yet. " +
|
|
1114
|
-
"Your ONLY allowed action right now is plan.create (or read/list for exploration)."
|
|
1115
|
-
: "You only produced internal reasoning with no visible answer or tool call. " +
|
|
1116
|
-
"You MUST either call a tool using the ```tool format or provide your final answer. " +
|
|
1117
|
-
"Do NOT wrap your tool call inside considering or reasoning tags — put it in the VISIBLE response, not hidden. " +
|
|
1118
|
-
"If images are attached, inspect them directly for visual details (text, colors, layout, spacing, style) instead of using OCR unless explicitly needed. " +
|
|
1119
|
-
"Do NOT just think — take action NOW.";
|
|
1120
|
-
messages.push(recoveryUserMessage(buildNudge));
|
|
1121
|
-
continue;
|
|
1122
|
-
}
|
|
1123
|
-
// Exhausted retries — fall through to the normal empty-answer path
|
|
1124
|
-
// which will print a warning and return.
|
|
1080
|
+
// `call` and `assistantText` are shared by both paths below: a fresh
|
|
1081
|
+
// model round-trip, or draining a previously-queued tool call.
|
|
1082
|
+
let call;
|
|
1083
|
+
let assistantText;
|
|
1084
|
+
let recoveredFromBareJson = false;
|
|
1085
|
+
if (pendingCalls.length > 0) {
|
|
1086
|
+
// Drain the next queued call from the previous model message — no new
|
|
1087
|
+
// round-trip. The assistant message and any prose were already shown
|
|
1088
|
+
// when the batch was parsed.
|
|
1089
|
+
call = pendingCalls.shift();
|
|
1090
|
+
assistantText = { visible: "", thinkContent: "", hasThinking: false };
|
|
1091
|
+
process.stdout.write(chalk.dim(` ↳ continuing batch (${pendingCalls.length} more queued)\n`));
|
|
1125
1092
|
}
|
|
1126
1093
|
else {
|
|
1127
|
-
//
|
|
1128
|
-
|
|
1129
|
-
|
|
1130
|
-
|
|
1131
|
-
|
|
1132
|
-
|
|
1133
|
-
|
|
1134
|
-
|
|
1135
|
-
|
|
1136
|
-
|
|
1137
|
-
|
|
1138
|
-
|
|
1139
|
-
|
|
1140
|
-
|
|
1141
|
-
|
|
1142
|
-
|
|
1143
|
-
|
|
1144
|
-
|
|
1145
|
-
|
|
1146
|
-
|
|
1094
|
+
// Buffer LLM output so tool JSON and hidden thinking are not printed raw.
|
|
1095
|
+
// Status messages (rate-limit retries, fallback hints) still surface live.
|
|
1096
|
+
// A spinner gives the user feedback during long thinking phases on
|
|
1097
|
+
// models like glm-5.1 / deepseek-v4-flash that stream reasoning first.
|
|
1098
|
+
const spinner = startThinkingSpinner(step === 0 ? "waiting for model" : `step ${step + 1}`, options.signal);
|
|
1099
|
+
let sawReasoning = false;
|
|
1100
|
+
let inThinking = false;
|
|
1101
|
+
let completion;
|
|
1102
|
+
try {
|
|
1103
|
+
completion = await streamWithProvider({
|
|
1104
|
+
provider,
|
|
1105
|
+
model,
|
|
1106
|
+
messages,
|
|
1107
|
+
temperature: 0.2,
|
|
1108
|
+
// Reasoning models can spend a lot on hidden thinking; give
|
|
1109
|
+
// them headroom so the visible answer / tool call isn't
|
|
1110
|
+
// truncated to silence. The non-thinking budget must be large
|
|
1111
|
+
// enough for a single-file fs.write / multi-file fs.writeMany
|
|
1112
|
+
// payload — a truncated tool-call JSON fails to parse and leaks a
|
|
1113
|
+
// broken (and syntactically invalid) file. 8k was too small for a
|
|
1114
|
+
// full component, so allow more room for the visible tool call.
|
|
1115
|
+
maxTokens: config.thinking?.enabled ? 16_384 : 12_288,
|
|
1116
|
+
signal: options.signal,
|
|
1117
|
+
thinking: config.thinking,
|
|
1118
|
+
}, (token) => {
|
|
1119
|
+
// Heuristic: <think>… markers and reasoning_content tokens flow
|
|
1120
|
+
// through onToken. Surface activity in the spinner so the screen
|
|
1121
|
+
// is never empty for minutes.
|
|
1122
|
+
if (!sawReasoning && /<think/i.test(token)) {
|
|
1123
|
+
sawReasoning = true;
|
|
1124
|
+
inThinking = true;
|
|
1125
|
+
spinner.setLabel("thinking");
|
|
1126
|
+
}
|
|
1127
|
+
if (/<\/think>/i.test(token)) {
|
|
1128
|
+
inThinking = false;
|
|
1129
|
+
}
|
|
1130
|
+
// Only push reasoning tokens to the spinner preview. Visible
|
|
1131
|
+
// answer / tool-call tokens should NOT go through the dim
|
|
1132
|
+
// spinner preview — doing so makes the final answer appear
|
|
1133
|
+
// "diluted" in light font when the spinner's last render
|
|
1134
|
+
// briefly shows the answer text before being erased.
|
|
1135
|
+
if (inThinking) {
|
|
1136
|
+
const cleaned = token.replace(/<\/?think[^>]*>/gi, "");
|
|
1137
|
+
if (cleaned) {
|
|
1138
|
+
spinner.pushPreview(cleaned);
|
|
1139
|
+
const approx = cleaned.split(/\s+/).filter(Boolean).length;
|
|
1140
|
+
if (approx > 0)
|
|
1141
|
+
spinner.bumpReasoning(approx);
|
|
1142
|
+
}
|
|
1143
|
+
}
|
|
1144
|
+
}, (status) => {
|
|
1145
|
+
spinner.stop();
|
|
1146
|
+
process.stdout.write(chalk.dim(status));
|
|
1147
|
+
});
|
|
1147
1148
|
}
|
|
1148
|
-
|
|
1149
|
-
|
|
1150
|
-
|
|
1151
|
-
const bareThink = recognizeBareToolJson(assistantText.thinkContent);
|
|
1152
|
-
if (bareThink?.call) {
|
|
1153
|
-
call = bareThink.call;
|
|
1154
|
-
recoveredFromBareJson = true;
|
|
1155
|
-
process.stdout.write(chalk.dim(" ℹ recovered an unfenced tool call from thinking content\n"));
|
|
1149
|
+
finally {
|
|
1150
|
+
// Always clear the spinner — abort, network error, or success.
|
|
1151
|
+
spinner.stop();
|
|
1156
1152
|
}
|
|
1157
|
-
|
|
1158
|
-
|
|
1153
|
+
provider = completion.provider;
|
|
1154
|
+
model = completion.model;
|
|
1155
|
+
const assistantTextResult = rememberThinkingFromText(completion.text);
|
|
1156
|
+
assistantText = assistantTextResult;
|
|
1157
|
+
// Try visible text first, then thinking content — some models (e.g. glm-5.1)
|
|
1158
|
+
// wrap tool calls inside considering tags, so stripThinking removes them
|
|
1159
|
+
// into thinkContent and visible becomes empty. Recovering from thinkContent
|
|
1160
|
+
// prevents an endless nudge loop where the model keeps hiding the call.
|
|
1161
|
+
call = parseToolCall(assistantText.visible, {
|
|
1162
|
+
strict: getConfig().parserStrict,
|
|
1163
|
+
});
|
|
1164
|
+
if (!call && assistantText.hasThinking) {
|
|
1165
|
+
call = parseToolCall(assistantText.thinkContent, {
|
|
1166
|
+
strict: getConfig().parserStrict,
|
|
1167
|
+
});
|
|
1168
|
+
if (call) {
|
|
1169
|
+
process.stdout.write(chalk.dim(" ℹ recovered tool call from thinking content\n"));
|
|
1170
|
+
}
|
|
1159
1171
|
}
|
|
1160
|
-
|
|
1161
|
-
|
|
1162
|
-
|
|
1163
|
-
|
|
1164
|
-
|
|
1165
|
-
|
|
1166
|
-
|
|
1167
|
-
|
|
1168
|
-
|
|
1172
|
+
// ── Thinking-only recovery ────────────────────────────────────────
|
|
1173
|
+
// Some models (eg gpt-oss-20b on NVIDIA NIM) occasionally spend their
|
|
1174
|
+
// entire budget on hidden <think> reasoning and emit no visible text
|
|
1175
|
+
// or tool call. Without this guard the agent silently returns an empty
|
|
1176
|
+
// answer and the user has to re-submit the same prompt.
|
|
1177
|
+
if (!assistantText.visible.trim() && !call && assistantText.hasThinking) {
|
|
1178
|
+
emptyVisibleRetries += 1;
|
|
1179
|
+
if (emptyVisibleRetries <= 2) {
|
|
1180
|
+
process.stdout.write(`${renderThinkingSummary(assistantText.thinkContent)}\n`);
|
|
1181
|
+
process.stdout.write(chalk.yellow(" ⚠ model produced only thinking — nudging it to take action\n"));
|
|
1182
|
+
messages.push({ role: "assistant", content: completion.text });
|
|
1183
|
+
const buildNudge = buildLikeTurn && !activePlan
|
|
1184
|
+
? "You only produced internal reasoning with no visible answer or tool call. " +
|
|
1169
1185
|
"This is a BUILD/SCAFFOLD task with NO plan yet. " +
|
|
1170
|
-
"You MUST call plan.create using
|
|
1171
|
-
|
|
1172
|
-
"
|
|
1173
|
-
: "
|
|
1174
|
-
"
|
|
1175
|
-
|
|
1176
|
-
|
|
1177
|
-
"
|
|
1186
|
+
"You MUST call plan.create using the ```tool format to create a comprehensive plan BEFORE writing any files or running any commands. " +
|
|
1187
|
+
"Do NOT use fs.write, fs.writeMany, fs.edit, shell.exec, shell.start, or pkg.install yet. " +
|
|
1188
|
+
"Your ONLY allowed action right now is plan.create (or read/list for exploration)."
|
|
1189
|
+
: "You only produced internal reasoning with no visible answer or tool call. " +
|
|
1190
|
+
"You MUST either call a tool using the ```tool format or provide your final answer. " +
|
|
1191
|
+
"Do NOT wrap your tool call inside considering or reasoning tags — put it in the VISIBLE response, not hidden. " +
|
|
1192
|
+
"If images are attached, inspect them directly for visual details (text, colors, layout, spacing, style) instead of using OCR unless explicitly needed. " +
|
|
1193
|
+
"Do NOT just think — take action NOW.";
|
|
1194
|
+
messages.push(recoveryUserMessage(buildNudge));
|
|
1178
1195
|
continue;
|
|
1179
1196
|
}
|
|
1180
|
-
// Exhausted retries — fall through to the normal answer path
|
|
1197
|
+
// Exhausted retries — fall through to the normal empty-answer path
|
|
1198
|
+
// which will print a warning and return.
|
|
1181
1199
|
}
|
|
1182
|
-
|
|
1183
|
-
|
|
1184
|
-
|
|
1185
|
-
// model to retry the tool call in a clean JSON format.
|
|
1186
|
-
if (/<\|tool_call(?:s_section)?_begin\|>|<\|tool_call_argument_begin\|>/i.test(assistantText.visible)) {
|
|
1187
|
-
process.stdout.write(chalk.yellow(" ⚠ tool call was malformed or cut off — asking the model to retry in JSON form\n"));
|
|
1188
|
-
messages.push({ role: "assistant", content: assistantText.visible });
|
|
1189
|
-
messages.push(recoveryUserMessage("Your previous tool call was malformed or truncated. " +
|
|
1190
|
-
"Reply with ONLY a fenced ```tool block containing valid JSON " +
|
|
1191
|
-
'of the form `{"name": "<tool>", "args": { ... }}`. ' +
|
|
1192
|
-
"Do not use <|tool_call_begin|> markers."));
|
|
1193
|
-
continue;
|
|
1200
|
+
else {
|
|
1201
|
+
// Reset the counter on any successful visible output or recovered call.
|
|
1202
|
+
emptyVisibleRetries = 0;
|
|
1194
1203
|
}
|
|
1195
|
-
//
|
|
1196
|
-
//
|
|
1197
|
-
//
|
|
1198
|
-
//
|
|
1199
|
-
|
|
1200
|
-
|
|
1201
|
-
|
|
1202
|
-
|
|
1203
|
-
|
|
1204
|
-
|
|
1205
|
-
|
|
1206
|
-
|
|
1207
|
-
|
|
1208
|
-
|
|
1209
|
-
|
|
1210
|
-
|
|
1211
|
-
|
|
1204
|
+
// `call` was already extracted above (from visible text or thinking content).
|
|
1205
|
+
// Recovery: the model meant to call a tool but emitted a bare JSON object
|
|
1206
|
+
// with no ```tool fence — either a complete {name,args} the strict
|
|
1207
|
+
// matchers missed (recover it directly), or just an args object like
|
|
1208
|
+
// {"path":"file.pdf"} with the wrapper dropped (nudge a retry below so
|
|
1209
|
+
// the requested action runs instead of the JSON leaking as the answer).
|
|
1210
|
+
let bareArgsOnly = false;
|
|
1211
|
+
recoveredFromBareJson = false;
|
|
1212
|
+
if (!call) {
|
|
1213
|
+
const bare = recognizeBareToolJson(assistantText.visible);
|
|
1214
|
+
if (bare?.call) {
|
|
1215
|
+
call = bare.call;
|
|
1216
|
+
recoveredFromBareJson = true;
|
|
1217
|
+
process.stdout.write(chalk.dim(" ℹ recovered an unfenced tool call from bare JSON\n"));
|
|
1218
|
+
}
|
|
1219
|
+
else if (bare?.argsOnly) {
|
|
1220
|
+
bareArgsOnly = true;
|
|
1221
|
+
}
|
|
1222
|
+
}
|
|
1223
|
+
// Also check thinking content for bare JSON calls.
|
|
1224
|
+
if (!call && assistantText.hasThinking) {
|
|
1225
|
+
const bareThink = recognizeBareToolJson(assistantText.thinkContent);
|
|
1226
|
+
if (bareThink?.call) {
|
|
1227
|
+
call = bareThink.call;
|
|
1228
|
+
recoveredFromBareJson = true;
|
|
1229
|
+
process.stdout.write(chalk.dim(" ℹ recovered an unfenced tool call from thinking content\n"));
|
|
1230
|
+
}
|
|
1231
|
+
else if (bareThink?.argsOnly) {
|
|
1232
|
+
bareArgsOnly = true;
|
|
1212
1233
|
}
|
|
1213
|
-
// Exhausted retries — fall through so we don't loop forever, but the
|
|
1214
|
-
// user at least sees the (broken) output and the stop notice.
|
|
1215
1234
|
}
|
|
1216
|
-
|
|
1217
|
-
|
|
1218
|
-
|
|
1219
|
-
|
|
1220
|
-
|
|
1221
|
-
|
|
1222
|
-
|
|
1223
|
-
|
|
1224
|
-
|
|
1225
|
-
|
|
1226
|
-
|
|
1227
|
-
|
|
1228
|
-
|
|
1229
|
-
|
|
1235
|
+
if (!call) {
|
|
1236
|
+
if (bareArgsOnly) {
|
|
1237
|
+
bareToolJsonRetries += 1;
|
|
1238
|
+
if (bareToolJsonRetries <= 3) {
|
|
1239
|
+
process.stdout.write(chalk.yellow(" ⚠ tool call missing its name/fence — asking the model to re-emit a proper ```tool block\n"));
|
|
1240
|
+
messages.push({ role: "assistant", content: assistantText.visible });
|
|
1241
|
+
messages.push(recoveryUserMessage(buildLikeTurn && !activePlan
|
|
1242
|
+
? "Your previous message was a bare JSON args object with no tool name and no ```tool fence, so NOTHING ran. " +
|
|
1243
|
+
"This is a BUILD/SCAFFOLD task with NO plan yet. " +
|
|
1244
|
+
"You MUST call plan.create using a proper ```tool block. For example:\n" +
|
|
1245
|
+
'```tool\n{"name":"plan.create","args":{"goal":"scaffold todo app","detail":"...","tasks":["...","..."],"kind":"coding"}}\n```\n' +
|
|
1246
|
+
"Do NOT use fs.write, fs.writeMany, shell.exec, or pkg.install yet."
|
|
1247
|
+
: "Your previous message was a bare JSON args object with no tool name and no ```tool fence, so NOTHING ran. " +
|
|
1248
|
+
"Reply with ONLY a fenced ```tool block of the form " +
|
|
1249
|
+
'`{"name": "<tool>", "args": { ... }}`. For example, to read a PDF:\n' +
|
|
1250
|
+
'```tool\n{"name":"pdf.read","args":{"path":"/abs/file.pdf"}}\n```\n' +
|
|
1251
|
+
"Choose the correct tool name for the task and include those args."));
|
|
1252
|
+
continue;
|
|
1253
|
+
}
|
|
1254
|
+
// Exhausted retries — fall through to the normal answer path.
|
|
1255
|
+
}
|
|
1256
|
+
// Detect the case where the model emitted sentinel-style tool-call
|
|
1257
|
+
// markers but the body was malformed or truncated. Printing those
|
|
1258
|
+
// raw tokens looks like a crash to the user — instead, ask the
|
|
1259
|
+
// model to retry the tool call in a clean JSON format.
|
|
1260
|
+
if (/<\|tool_call(?:s_section)?_begin\|>|<\|tool_call_argument_begin\|>/i.test(assistantText.visible)) {
|
|
1261
|
+
process.stdout.write(chalk.yellow(" ⚠ tool call was malformed or cut off — asking the model to retry in JSON form\n"));
|
|
1230
1262
|
messages.push({ role: "assistant", content: assistantText.visible });
|
|
1231
|
-
messages.push(
|
|
1232
|
-
|
|
1233
|
-
|
|
1234
|
-
|
|
1235
|
-
'Re-emit ONE valid ```tool block of the exact form {"name":"<tool>","args":{...}} with balanced braces. ' +
|
|
1236
|
-
"If it was a large fs.writeMany, split it into SMALLER batches (3-5 files) so the JSON is easy to keep valid. " +
|
|
1237
|
-
"Do NOT claim any file was written until a tool call actually succeeds.",
|
|
1238
|
-
});
|
|
1263
|
+
messages.push(recoveryUserMessage("Your previous tool call was malformed or truncated. " +
|
|
1264
|
+
"Reply with ONLY a fenced ```tool block containing valid JSON " +
|
|
1265
|
+
'of the form `{"name": "<tool>", "args": { ... }}`. ' +
|
|
1266
|
+
"Do not use <|tool_call_begin|> markers."));
|
|
1239
1267
|
continue;
|
|
1240
1268
|
}
|
|
1241
|
-
//
|
|
1242
|
-
|
|
1243
|
-
|
|
1244
|
-
|
|
1245
|
-
|
|
1246
|
-
|
|
1247
|
-
|
|
1248
|
-
|
|
1249
|
-
|
|
1250
|
-
|
|
1251
|
-
|
|
1252
|
-
|
|
1253
|
-
|
|
1254
|
-
|
|
1255
|
-
|
|
1256
|
-
|
|
1257
|
-
|
|
1258
|
-
|
|
1259
|
-
|
|
1260
|
-
|
|
1261
|
-
|
|
1262
|
-
|
|
1263
|
-
|
|
1264
|
-
|
|
1265
|
-
|
|
1266
|
-
|
|
1267
|
-
|
|
1268
|
-
|
|
1269
|
-
|
|
1269
|
+
// Detect a tool call that opened but was cut off by the token limit
|
|
1270
|
+
// (most common with a large multi-file fs.writeMany). Retrying with a
|
|
1271
|
+
// nudge to split the work is far better than rendering broken JSON as
|
|
1272
|
+
// a final answer and leaving the project half-created.
|
|
1273
|
+
if (looksLikeTruncatedToolCall(assistantText.visible)) {
|
|
1274
|
+
truncatedToolRetries += 1;
|
|
1275
|
+
if (truncatedToolRetries <= 3) {
|
|
1276
|
+
process.stdout.write(chalk.yellow(" ⚠ tool call was cut off (output too long) — asking the model to retry in smaller pieces\n"));
|
|
1277
|
+
messages.push({ role: "assistant", content: assistantText.visible });
|
|
1278
|
+
messages.push({
|
|
1279
|
+
role: "user",
|
|
1280
|
+
content: "Your previous tool call was cut off before it finished — the JSON was incomplete, so NOTHING ran. " +
|
|
1281
|
+
"Retry now with a COMPLETE, valid ```tool block. " +
|
|
1282
|
+
"If it was a large fs.writeMany, split it into SMALLER batches (3-5 files per call, and keep each file's content concise) " +
|
|
1283
|
+
"so the whole JSON fits in one response. Do NOT claim any file was written until a tool call actually succeeds.",
|
|
1284
|
+
});
|
|
1285
|
+
continue;
|
|
1286
|
+
}
|
|
1287
|
+
// Exhausted retries — fall through so we don't loop forever, but the
|
|
1288
|
+
// user at least sees the (broken) output and the stop notice.
|
|
1289
|
+
}
|
|
1290
|
+
// Detect a ```tool fence whose JSON could NOT be parsed for any other
|
|
1291
|
+
// reason (malformed braces, trailing junk, a stray `}` — NOT plain
|
|
1292
|
+
// truncation, which is handled above). Without this, the raw block
|
|
1293
|
+
// leaks to the screen as a code fence and the requested action (often
|
|
1294
|
+
// a whole fs.writeMany scaffold) silently never runs — exactly the
|
|
1295
|
+
// "fs.writeMany printed but nothing created" failure. Require the fence
|
|
1296
|
+
// to actually look like an intended call (mentions name/args) so a
|
|
1297
|
+
// genuine ```tool code example in prose isn't mistaken for one.
|
|
1298
|
+
const hasFencedCallShape = countToolFences(assistantText.visible) > 0 &&
|
|
1299
|
+
/```tool\s*\n[\s\S]*?"(?:name|args)"\s*:/i.test(assistantText.visible);
|
|
1300
|
+
if (hasFencedCallShape) {
|
|
1301
|
+
malformedFenceRetries += 1;
|
|
1302
|
+
if (malformedFenceRetries <= 3) {
|
|
1303
|
+
process.stdout.write(chalk.yellow(" ⚠ tool block present but its JSON didn't parse — asking the model to re-emit valid JSON\n"));
|
|
1304
|
+
messages.push({ role: "assistant", content: assistantText.visible });
|
|
1305
|
+
messages.push({
|
|
1306
|
+
role: "user",
|
|
1307
|
+
content: "Your previous message contained a ```tool block, but its JSON was INVALID, so NOTHING ran. " +
|
|
1308
|
+
"Common causes: an extra or missing `}` / `]`, a trailing brace after the closing `}`, or unescaped quotes/newlines inside a string value. " +
|
|
1309
|
+
'Re-emit ONE valid ```tool block of the exact form {"name":"<tool>","args":{...}} with balanced braces. ' +
|
|
1310
|
+
"If it was a large fs.writeMany, split it into SMALLER batches (3-5 files) so the JSON is easy to keep valid. " +
|
|
1311
|
+
"Do NOT claim any file was written until a tool call actually succeeds.",
|
|
1312
|
+
});
|
|
1313
|
+
continue;
|
|
1314
|
+
}
|
|
1315
|
+
// Exhausted retries — fall through to the normal path.
|
|
1316
|
+
}
|
|
1317
|
+
// Normal final-answer path: strip any stray sentinel tokens that
|
|
1318
|
+
// somehow leaked into prose so the answer renders cleanly.
|
|
1319
|
+
const cleaned = stripSentinelTokens(assistantText.visible);
|
|
1320
|
+
if (freshWebSearchRequired && !sawFreshWebSearch && !freshnessRetryUsed) {
|
|
1321
|
+
freshnessRetryUsed = true;
|
|
1322
|
+
process.stdout.write(chalk.dim(" ℹ current-info question detected — searching the web before answering\n"));
|
|
1270
1323
|
messages.push({ role: "assistant", content: assistantText.visible });
|
|
1271
1324
|
messages.push({
|
|
1272
1325
|
role: "user",
|
|
1273
|
-
content:
|
|
1274
|
-
|
|
1275
|
-
`Do NOT claim the work is complete, that files were created, or that a server is running ` +
|
|
1276
|
-
`unless a tool call actually succeeded and you saw the output. ` +
|
|
1277
|
-
`Resume now with the NEXT task ${next.id} ("${next.title}"): call task.update {taskId:"${next.id}", state:"in_progress"}, ` +
|
|
1278
|
-
`then do the real work with a tool call (fs.writeMany / shell.exec / shell.start), VERIFY it, and mark it done. ` +
|
|
1279
|
-
`Continue task by task until EVERY task is actually finished.`,
|
|
1326
|
+
content: freshnessGuardMessage() +
|
|
1327
|
+
" Reply with ONLY a fenced ```tool block for web.search now.",
|
|
1280
1328
|
});
|
|
1281
1329
|
continue;
|
|
1282
1330
|
}
|
|
1283
|
-
|
|
1284
|
-
|
|
1285
|
-
|
|
1286
|
-
|
|
1287
|
-
|
|
1288
|
-
|
|
1289
|
-
|
|
1290
|
-
|
|
1291
|
-
|
|
1292
|
-
|
|
1293
|
-
|
|
1294
|
-
|
|
1295
|
-
|
|
1296
|
-
|
|
1297
|
-
|
|
1298
|
-
|
|
1331
|
+
// ── Premature-completion guard (approved plan still has work) ──────
|
|
1332
|
+
// If the user approved a plan and the model now gives a final answer
|
|
1333
|
+
// while tasks are still pending/in_progress — without having run the
|
|
1334
|
+
// work — it is fabricating completion (the exact "all tasks completed,
|
|
1335
|
+
// running at localhost:5173" failure). Force it back to executing the
|
|
1336
|
+
// next real task instead of accepting the false claim.
|
|
1337
|
+
if (session.planApproved.value && prematureCompletionRetries < 3) {
|
|
1338
|
+
const livePlan = await loadPlan(session.sessionId).catch(() => undefined);
|
|
1339
|
+
const unfinished = livePlan?.tasks.filter((t) => t.state === "pending" || t.state === "in_progress");
|
|
1340
|
+
if (livePlan && unfinished && unfinished.length > 0) {
|
|
1341
|
+
prematureCompletionRetries += 1;
|
|
1342
|
+
const next = unfinished[0];
|
|
1343
|
+
process.stdout.write(chalk.yellow(` ⚠ ${unfinished.length} plan task(s) still unfinished — not accepting a "done" claim; resuming execution\n`));
|
|
1344
|
+
messages.push({ role: "assistant", content: assistantText.visible });
|
|
1345
|
+
messages.push({
|
|
1346
|
+
role: "user",
|
|
1347
|
+
content: `You have NOT finished the approved plan: ${unfinished.length} task(s) remain ` +
|
|
1348
|
+
`(${unfinished.map((t) => `[${t.id}] ${t.title}`).join("; ")}). ` +
|
|
1349
|
+
`Do NOT claim the work is complete, that files were created, or that a server is running ` +
|
|
1350
|
+
`unless a tool call actually succeeded and you saw the output. ` +
|
|
1351
|
+
`Resume now with the NEXT task ${next.id} ("${next.title}"): call task.update {taskId:"${next.id}", state:"in_progress"}, ` +
|
|
1352
|
+
`then do the real work with a tool call (fs.writeMany / shell.exec / shell.start), VERIFY it, and mark it done. ` +
|
|
1353
|
+
`Continue task by task until EVERY task is actually finished.`,
|
|
1354
|
+
});
|
|
1355
|
+
continue;
|
|
1356
|
+
}
|
|
1299
1357
|
}
|
|
1358
|
+
// If we still print a final answer while an approved plan has unfinished
|
|
1359
|
+
// tasks (retries exhausted), do NOT let a fabricated "it's done" stand
|
|
1360
|
+
// unchallenged — append an explicit, honest status so the user knows the
|
|
1361
|
+
// build did not actually complete.
|
|
1362
|
+
let completionWarning = "";
|
|
1363
|
+
if (session.planApproved.value) {
|
|
1364
|
+
const livePlan = await loadPlan(session.sessionId).catch(() => undefined);
|
|
1365
|
+
const unfinished = livePlan?.tasks.filter((t) => t.state === "pending" || t.state === "in_progress");
|
|
1366
|
+
if (livePlan && unfinished && unfinished.length > 0) {
|
|
1367
|
+
completionWarning =
|
|
1368
|
+
chalk.yellow(`\n ⚠ ${unfinished.length} of ${livePlan.tasks.length} plan task(s) are NOT actually complete:\n`) +
|
|
1369
|
+
unfinished
|
|
1370
|
+
.map((t) => chalk.yellow(` • [${t.id}] ${t.title}`))
|
|
1371
|
+
.join("\n") +
|
|
1372
|
+
chalk.dim("\n The summary above may overstate progress. Re-run with /implement, or ask clai to finish the remaining tasks.\n");
|
|
1373
|
+
}
|
|
1374
|
+
}
|
|
1375
|
+
if (cleaned) {
|
|
1376
|
+
process.stdout.write(renderMarkdown(cleaned));
|
|
1377
|
+
if (!cleaned.endsWith("\n"))
|
|
1378
|
+
process.stdout.write("\n");
|
|
1379
|
+
}
|
|
1380
|
+
if (completionWarning) {
|
|
1381
|
+
process.stdout.write(completionWarning);
|
|
1382
|
+
}
|
|
1383
|
+
if (assistantText.hasThinking) {
|
|
1384
|
+
process.stdout.write(`${renderThinkingSummary(assistantText.thinkContent)}\n`);
|
|
1385
|
+
}
|
|
1386
|
+
await auditLog("agent.final", { provider, model, steps: step + 1 });
|
|
1387
|
+
lastAnswer = cleaned;
|
|
1388
|
+
return lastAnswer;
|
|
1300
1389
|
}
|
|
1301
|
-
|
|
1302
|
-
|
|
1303
|
-
|
|
1304
|
-
|
|
1305
|
-
|
|
1306
|
-
|
|
1307
|
-
|
|
1390
|
+
// A valid primary tool call exists for this fresh model turn. Show any
|
|
1391
|
+
// prose / thinking that preceded it, record the assistant message ONCE,
|
|
1392
|
+
// then queue any additional tool calls from the same message so they
|
|
1393
|
+
// run in order on the next iterations (no extra round-trip).
|
|
1394
|
+
const beforeTool = recoveredFromBareJson
|
|
1395
|
+
? ""
|
|
1396
|
+
: textBeforeToolCall(assistantText.visible);
|
|
1397
|
+
if (beforeTool) {
|
|
1398
|
+
process.stdout.write(renderMarkdown(beforeTool) + "\n");
|
|
1308
1399
|
}
|
|
1309
1400
|
if (assistantText.hasThinking) {
|
|
1310
1401
|
process.stdout.write(`${renderThinkingSummary(assistantText.thinkContent)}\n`);
|
|
1311
1402
|
}
|
|
1312
|
-
|
|
1313
|
-
|
|
1314
|
-
|
|
1403
|
+
messages.push({ role: "assistant", content: assistantText.visible });
|
|
1404
|
+
if (!recoveredFromBareJson && call) {
|
|
1405
|
+
const allCalls = parseAllToolCalls(assistantText.visible);
|
|
1406
|
+
if (allCalls.length > 1 &&
|
|
1407
|
+
allCalls[0] &&
|
|
1408
|
+
sameToolCall(allCalls[0], call)) {
|
|
1409
|
+
pendingCalls = allCalls.slice(1);
|
|
1410
|
+
process.stdout.write(chalk.dim(` ℹ ${allCalls.length} tool calls in this message — running them in order\n`));
|
|
1411
|
+
}
|
|
1412
|
+
}
|
|
1315
1413
|
}
|
|
1414
|
+
// Type guard: every path above either set `call` or returned/continued.
|
|
1415
|
+
if (!call)
|
|
1416
|
+
continue;
|
|
1316
1417
|
// ── Duplicate-call detection ──────────────────────────────────────────
|
|
1317
1418
|
// If the model calls the exact same tool with the exact same args
|
|
1318
1419
|
// repeatedly, it's stuck in a loop. Inject a corrective message
|
|
@@ -1323,7 +1424,9 @@ export async function runAgentLoop(prompt, options = {}) {
|
|
|
1323
1424
|
call.name === "fs.writeMany" ||
|
|
1324
1425
|
call.name === "fs.edit";
|
|
1325
1426
|
process.stdout.write(chalk.yellow(` ⚠ ${call.name} was already called with the same arguments — ${isWrite ? "moving on" : "forcing summary"}\n`));
|
|
1326
|
-
|
|
1427
|
+
// A repeat means this batch went off the rails — drop any queued calls
|
|
1428
|
+
// and let the model react. The assistant message was already recorded.
|
|
1429
|
+
pendingCalls = [];
|
|
1327
1430
|
messages.push({
|
|
1328
1431
|
role: "user",
|
|
1329
1432
|
content: isWrite
|
|
@@ -1338,27 +1441,6 @@ export async function runAgentLoop(prompt, options = {}) {
|
|
|
1338
1441
|
if (loopCheck.reason) {
|
|
1339
1442
|
process.stdout.write(chalk.dim(` ℹ ${loopCheck.reason}\n`));
|
|
1340
1443
|
}
|
|
1341
|
-
// Print only non-thinking text before the tool call. When the call was
|
|
1342
|
-
// recovered from a bare JSON object (the whole message WAS the call),
|
|
1343
|
-
// there is no prose to show — skip it so we don't echo the raw JSON.
|
|
1344
|
-
const beforeTool = recoveredFromBareJson
|
|
1345
|
-
? ""
|
|
1346
|
-
: textBeforeToolCall(assistantText.visible);
|
|
1347
|
-
if (beforeTool) {
|
|
1348
|
-
process.stdout.write(renderMarkdown(beforeTool) + "\n");
|
|
1349
|
-
}
|
|
1350
|
-
if (assistantText.hasThinking) {
|
|
1351
|
-
process.stdout.write(`${renderThinkingSummary(assistantText.thinkContent)}\n`);
|
|
1352
|
-
}
|
|
1353
|
-
messages.push({ role: "assistant", content: assistantText.visible });
|
|
1354
|
-
// Detect a model that crammed MULTIPLE tool calls into one response.
|
|
1355
|
-
// Only `call` (the first block) will run this turn; the rest are dropped.
|
|
1356
|
-
// We flag it so that after the first tool executes we explicitly tell the
|
|
1357
|
-
// model the others did NOT run — preventing the "I ran everything" lie.
|
|
1358
|
-
const extraToolBlocks = Math.max(0, countToolFences(assistantText.visible) - 1);
|
|
1359
|
-
if (extraToolBlocks > 0) {
|
|
1360
|
-
process.stdout.write(chalk.yellow(` ⚠ ${extraToolBlocks} extra tool block(s) in one message were ignored — only the first ran. One tool per turn.\n`));
|
|
1361
|
-
}
|
|
1362
1444
|
// ── Plan / task tools (session-scoped, handled inline) ─────────────
|
|
1363
1445
|
// These don't go through the generic registry because they need the
|
|
1364
1446
|
// session id and mutate the live plan that the user can view (Ctrl+P).
|
|
@@ -1371,6 +1453,10 @@ export async function runAgentLoop(prompt, options = {}) {
|
|
|
1371
1453
|
productiveSteps += 1;
|
|
1372
1454
|
loopGuard.recordAttempt(step, call.name, call.args, planResult.ok, 0);
|
|
1373
1455
|
process.stdout.write(planResult.display);
|
|
1456
|
+
// plan.create means "STOP and wait for /implement" — abandon any
|
|
1457
|
+
// other calls the model batched alongside it.
|
|
1458
|
+
if (call.name === "plan.create")
|
|
1459
|
+
pendingCalls = [];
|
|
1374
1460
|
messages.push({
|
|
1375
1461
|
role: "tool",
|
|
1376
1462
|
content: `Tool ${call.name} result (ok=${planResult.ok}):\n${planResult.modelNote}`,
|
|
@@ -1398,6 +1484,7 @@ export async function runAgentLoop(prompt, options = {}) {
|
|
|
1398
1484
|
!session.planApproved.value &&
|
|
1399
1485
|
!isPreApprovalAllowedTool(call.name)) {
|
|
1400
1486
|
process.stdout.write(chalk.yellow(` ⚠ plan awaiting approval — ${call.name} is blocked until you /implement (or /discard)\n`));
|
|
1487
|
+
pendingCalls = [];
|
|
1401
1488
|
messages.push({
|
|
1402
1489
|
role: "user",
|
|
1403
1490
|
content: `There is an ACTIVE PLAN that has NOT been approved yet, so you must NOT execute it — ` +
|
|
@@ -1538,6 +1625,13 @@ export async function runAgentLoop(prompt, options = {}) {
|
|
|
1538
1625
|
const errMsg = toolError instanceof Error ? toolError.message : String(toolError);
|
|
1539
1626
|
result = { ok: false, output: `Tool error: ${errMsg}`, exitCode: 1 };
|
|
1540
1627
|
}
|
|
1628
|
+
// Stop-on-error: if this call failed, abandon any remaining queued calls
|
|
1629
|
+
// from the same message so the model sees the failure and decides what to
|
|
1630
|
+
// do next instead of blindly running steps that depended on it.
|
|
1631
|
+
if (!result.ok && pendingCalls.length > 0) {
|
|
1632
|
+
process.stdout.write(chalk.dim(` ↳ ${pendingCalls.length} queued call(s) cancelled because this step failed\n`));
|
|
1633
|
+
pendingCalls = [];
|
|
1634
|
+
}
|
|
1541
1635
|
const output = result.output.trim();
|
|
1542
1636
|
const displayMax = 6_000;
|
|
1543
1637
|
// If the tool already produced an artifact (shell.exec now streams to one
|
|
@@ -1651,10 +1745,7 @@ export async function runAgentLoop(prompt, options = {}) {
|
|
|
1651
1745
|
}
|
|
1652
1746
|
messages.push({
|
|
1653
1747
|
role: "tool",
|
|
1654
|
-
content: `Tool ${call.name} result (exit=${result.exitCode ?? 0}, ok=${result.ok}):\n${contextOutput}
|
|
1655
|
-
(extraToolBlocks > 0
|
|
1656
|
-
? `\n\nIMPORTANT: your previous message contained ${extraToolBlocks + 1} tool blocks, but ONLY this first one (${call.name}) actually ran. The other ${extraToolBlocks} did NOT execute and were discarded. Emit EXACTLY ONE tool block per message. Send the next tool call now — and do NOT assume any of the dropped calls happened.`
|
|
1657
|
-
: ""),
|
|
1748
|
+
content: `Tool ${call.name} result (exit=${result.exitCode ?? 0}, ok=${result.ok}):\n${contextOutput}`,
|
|
1658
1749
|
});
|
|
1659
1750
|
// Compact older messages when the running estimate exceeds budget so
|
|
1660
1751
|
// free-tier context windows are not blown by long pentest sessions.
|