@burtson-labs/agent-core 1.6.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +88 -0
- package/dist/index.d.ts +16 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +52 -0
- package/dist/index.js.map +1 -0
- package/dist/mcp/activation.d.ts +60 -0
- package/dist/mcp/activation.d.ts.map +1 -0
- package/dist/mcp/activation.js +139 -0
- package/dist/mcp/activation.js.map +1 -0
- package/dist/mcp/clientPool.d.ts +202 -0
- package/dist/mcp/clientPool.d.ts.map +1 -0
- package/dist/mcp/clientPool.js +469 -0
- package/dist/mcp/clientPool.js.map +1 -0
- package/dist/mcp/index.d.ts +18 -0
- package/dist/mcp/index.d.ts.map +1 -0
- package/dist/mcp/index.js +28 -0
- package/dist/mcp/index.js.map +1 -0
- package/dist/mcp/server.d.ts +43 -0
- package/dist/mcp/server.d.ts.map +1 -0
- package/dist/mcp/server.js +130 -0
- package/dist/mcp/server.js.map +1 -0
- package/dist/mcp/toolAdapter.d.ts +57 -0
- package/dist/mcp/toolAdapter.d.ts.map +1 -0
- package/dist/mcp/toolAdapter.js +223 -0
- package/dist/mcp/toolAdapter.js.map +1 -0
- package/dist/mcp/types.d.ts +122 -0
- package/dist/mcp/types.d.ts.map +1 -0
- package/dist/mcp/types.js +15 -0
- package/dist/mcp/types.js.map +1 -0
- package/dist/providers/deterministic-provider.d.ts +21 -0
- package/dist/providers/deterministic-provider.d.ts.map +1 -0
- package/dist/providers/deterministic-provider.js +80 -0
- package/dist/providers/deterministic-provider.js.map +1 -0
- package/dist/providers/provider-client.d.ts +12 -0
- package/dist/providers/provider-client.d.ts.map +1 -0
- package/dist/providers/provider-client.js +11 -0
- package/dist/providers/provider-client.js.map +1 -0
- package/dist/runtime/AgentRuntime.d.ts +67 -0
- package/dist/runtime/AgentRuntime.d.ts.map +1 -0
- package/dist/runtime/AgentRuntime.js +382 -0
- package/dist/runtime/AgentRuntime.js.map +1 -0
- package/dist/security/secretPatterns.d.ts +76 -0
- package/dist/security/secretPatterns.d.ts.map +1 -0
- package/dist/security/secretPatterns.js +290 -0
- package/dist/security/secretPatterns.js.map +1 -0
- package/dist/tools/ask-user-tool.d.ts +19 -0
- package/dist/tools/ask-user-tool.d.ts.map +1 -0
- package/dist/tools/ask-user-tool.js +148 -0
- package/dist/tools/ask-user-tool.js.map +1 -0
- package/dist/tools/compactMessages.d.ts +52 -0
- package/dist/tools/compactMessages.d.ts.map +1 -0
- package/dist/tools/compactMessages.js +158 -0
- package/dist/tools/compactMessages.js.map +1 -0
- package/dist/tools/core-tools.d.ts +29 -0
- package/dist/tools/core-tools.d.ts.map +1 -0
- package/dist/tools/core-tools.js +2214 -0
- package/dist/tools/core-tools.js.map +1 -0
- package/dist/tools/git-tools.d.ts +32 -0
- package/dist/tools/git-tools.d.ts.map +1 -0
- package/dist/tools/git-tools.js +330 -0
- package/dist/tools/git-tools.js.map +1 -0
- package/dist/tools/index.d.ts +15 -0
- package/dist/tools/index.d.ts.map +1 -0
- package/dist/tools/index.js +31 -0
- package/dist/tools/index.js.map +1 -0
- package/dist/tools/language-adapters.d.ts +48 -0
- package/dist/tools/language-adapters.d.ts.map +1 -0
- package/dist/tools/language-adapters.js +299 -0
- package/dist/tools/language-adapters.js.map +1 -0
- package/dist/tools/loop/compactionTrigger.d.ts +47 -0
- package/dist/tools/loop/compactionTrigger.d.ts.map +1 -0
- package/dist/tools/loop/compactionTrigger.js +32 -0
- package/dist/tools/loop/compactionTrigger.js.map +1 -0
- package/dist/tools/loop/finalAnswerNudges.d.ts +68 -0
- package/dist/tools/loop/finalAnswerNudges.d.ts.map +1 -0
- package/dist/tools/loop/finalAnswerNudges.js +87 -0
- package/dist/tools/loop/finalAnswerNudges.js.map +1 -0
- package/dist/tools/loop/goalAnchor.d.ts +72 -0
- package/dist/tools/loop/goalAnchor.d.ts.map +1 -0
- package/dist/tools/loop/goalAnchor.js +76 -0
- package/dist/tools/loop/goalAnchor.js.map +1 -0
- package/dist/tools/loop/llmStream.d.ts +70 -0
- package/dist/tools/loop/llmStream.d.ts.map +1 -0
- package/dist/tools/loop/llmStream.js +181 -0
- package/dist/tools/loop/llmStream.js.map +1 -0
- package/dist/tools/loop/parallelExecute.d.ts +57 -0
- package/dist/tools/loop/parallelExecute.d.ts.map +1 -0
- package/dist/tools/loop/parallelExecute.js +54 -0
- package/dist/tools/loop/parallelExecute.js.map +1 -0
- package/dist/tools/loop/singleToolExecute.d.ts +71 -0
- package/dist/tools/loop/singleToolExecute.d.ts.map +1 -0
- package/dist/tools/loop/singleToolExecute.js +139 -0
- package/dist/tools/loop/singleToolExecute.js.map +1 -0
- package/dist/tools/loop/toolCallNormalize.d.ts +57 -0
- package/dist/tools/loop/toolCallNormalize.d.ts.map +1 -0
- package/dist/tools/loop/toolCallNormalize.js +99 -0
- package/dist/tools/loop/toolCallNormalize.js.map +1 -0
- package/dist/tools/loop/turnSetup.d.ts +43 -0
- package/dist/tools/loop/turnSetup.d.ts.map +1 -0
- package/dist/tools/loop/turnSetup.js +48 -0
- package/dist/tools/loop/turnSetup.js.map +1 -0
- package/dist/tools/ocr.d.ts +52 -0
- package/dist/tools/ocr.d.ts.map +1 -0
- package/dist/tools/ocr.js +238 -0
- package/dist/tools/ocr.js.map +1 -0
- package/dist/tools/post-edit-checks.d.ts +46 -0
- package/dist/tools/post-edit-checks.d.ts.map +1 -0
- package/dist/tools/post-edit-checks.js +236 -0
- package/dist/tools/post-edit-checks.js.map +1 -0
- package/dist/tools/skill-loader.d.ts +94 -0
- package/dist/tools/skill-loader.d.ts.map +1 -0
- package/dist/tools/skill-loader.js +422 -0
- package/dist/tools/skill-loader.js.map +1 -0
- package/dist/tools/skill-registry.d.ts +44 -0
- package/dist/tools/skill-registry.d.ts.map +1 -0
- package/dist/tools/skill-registry.js +118 -0
- package/dist/tools/skill-registry.js.map +1 -0
- package/dist/tools/skill-types.d.ts +38 -0
- package/dist/tools/skill-types.d.ts.map +1 -0
- package/dist/tools/skill-types.js +10 -0
- package/dist/tools/skill-types.js.map +1 -0
- package/dist/tools/skills/code-review-skill.d.ts +9 -0
- package/dist/tools/skills/code-review-skill.d.ts.map +1 -0
- package/dist/tools/skills/code-review-skill.js +66 -0
- package/dist/tools/skills/code-review-skill.js.map +1 -0
- package/dist/tools/skills/core-skill.d.ts +13 -0
- package/dist/tools/skills/core-skill.d.ts.map +1 -0
- package/dist/tools/skills/core-skill.js +23 -0
- package/dist/tools/skills/core-skill.js.map +1 -0
- package/dist/tools/skills/git-skill.d.ts +10 -0
- package/dist/tools/skills/git-skill.d.ts.map +1 -0
- package/dist/tools/skills/git-skill.js +30 -0
- package/dist/tools/skills/git-skill.js.map +1 -0
- package/dist/tools/skills/index.d.ts +17 -0
- package/dist/tools/skills/index.d.ts.map +1 -0
- package/dist/tools/skills/index.js +49 -0
- package/dist/tools/skills/index.js.map +1 -0
- package/dist/tools/skills/interaction-skill.d.ts +14 -0
- package/dist/tools/skills/interaction-skill.d.ts.map +1 -0
- package/dist/tools/skills/interaction-skill.js +24 -0
- package/dist/tools/skills/interaction-skill.js.map +1 -0
- package/dist/tools/skills/mail-search-skill.d.ts +25 -0
- package/dist/tools/skills/mail-search-skill.d.ts.map +1 -0
- package/dist/tools/skills/mail-search-skill.js +343 -0
- package/dist/tools/skills/mail-search-skill.js.map +1 -0
- package/dist/tools/skills/plan-skill.d.ts +10 -0
- package/dist/tools/skills/plan-skill.d.ts.map +1 -0
- package/dist/tools/skills/plan-skill.js +126 -0
- package/dist/tools/skills/plan-skill.js.map +1 -0
- package/dist/tools/skills/semantic-search-skill.d.ts +22 -0
- package/dist/tools/skills/semantic-search-skill.d.ts.map +1 -0
- package/dist/tools/skills/semantic-search-skill.js +244 -0
- package/dist/tools/skills/semantic-search-skill.js.map +1 -0
- package/dist/tools/skills/test-gen-skill.d.ts +9 -0
- package/dist/tools/skills/test-gen-skill.d.ts.map +1 -0
- package/dist/tools/skills/test-gen-skill.js +123 -0
- package/dist/tools/skills/test-gen-skill.js.map +1 -0
- package/dist/tools/tool-registry.d.ts +60 -0
- package/dist/tools/tool-registry.d.ts.map +1 -0
- package/dist/tools/tool-registry.js +200 -0
- package/dist/tools/tool-registry.js.map +1 -0
- package/dist/tools/tool-types.d.ts +281 -0
- package/dist/tools/tool-types.d.ts.map +1 -0
- package/dist/tools/tool-types.js +10 -0
- package/dist/tools/tool-types.js.map +1 -0
- package/dist/tools/tool-use-loop.d.ts +231 -0
- package/dist/tools/tool-use-loop.d.ts.map +1 -0
- package/dist/tools/tool-use-loop.js +2057 -0
- package/dist/tools/tool-use-loop.js.map +1 -0
- package/dist/tools/tool-use-parser.d.ts +78 -0
- package/dist/tools/tool-use-parser.d.ts.map +1 -0
- package/dist/tools/tool-use-parser.js +427 -0
- package/dist/tools/tool-use-parser.js.map +1 -0
- package/dist/tools/toolAvailabilityDetector.d.ts +48 -0
- package/dist/tools/toolAvailabilityDetector.d.ts.map +1 -0
- package/dist/tools/toolAvailabilityDetector.js +156 -0
- package/dist/tools/toolAvailabilityDetector.js.map +1 -0
- package/dist/tools/unified-patch.d.ts +87 -0
- package/dist/tools/unified-patch.d.ts.map +1 -0
- package/dist/tools/unified-patch.js +217 -0
- package/dist/tools/unified-patch.js.map +1 -0
- package/dist/types/agent.d.ts +69 -0
- package/dist/types/agent.d.ts.map +1 -0
- package/dist/types/agent.js +54 -0
- package/dist/types/agent.js.map +1 -0
- package/dist/types/tasks.d.ts +22 -0
- package/dist/types/tasks.d.ts.map +1 -0
- package/dist/types/tasks.js +3 -0
- package/dist/types/tasks.js.map +1 -0
- package/dist/utils/event-emitter.d.ts +13 -0
- package/dist/utils/event-emitter.d.ts.map +1 -0
- package/dist/utils/event-emitter.js +54 -0
- package/dist/utils/event-emitter.js.map +1 -0
- package/package.json +33 -0
|
@@ -0,0 +1,2057 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Text-based tool use execution loop.
|
|
4
|
+
*
|
|
5
|
+
* Implements the observe → act → replan cycle for models that don't support
|
|
6
|
+
* native function calling (gemma3, bandit-core, qwen2.5-coder, etc.).
|
|
7
|
+
*
|
|
8
|
+
* Flow:
|
|
9
|
+
* 1. Build messages with tool definitions in system prompt
|
|
10
|
+
* 2. Stream response from LLM, aggregate full text
|
|
11
|
+
* 3. Parse <tool_call> blocks
|
|
12
|
+
* 4. Execute tools via ToolExecutionContext
|
|
13
|
+
* 5. Inject <tool_result> blocks as next user message
|
|
14
|
+
* 6. Repeat from step 2 until no tool calls, or max iterations reached
|
|
15
|
+
* 7. Return final model response (the one with no tool calls)
|
|
16
|
+
*
|
|
17
|
+
* For models WITH native tool calling (qwen2.5-coder:32b, llama3.1),
|
|
18
|
+
* the host should use the Ollama `tools: [...]` field instead.
|
|
19
|
+
*/
|
|
20
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
21
|
+
exports.ToolUseLoop = void 0;
|
|
22
|
+
exports.sleep = sleep;
|
|
23
|
+
exports.isRetryableLlmError = isRetryableLlmError;
|
|
24
|
+
exports.tagRetryableLlmError = tagRetryableLlmError;
|
|
25
|
+
exports.summarizeLlmError = summarizeLlmError;
|
|
26
|
+
exports.isContinuationPrompt = isContinuationPrompt;
|
|
27
|
+
exports.isNoticingPrompt = isNoticingPrompt;
|
|
28
|
+
exports.createToolUseLoop = createToolUseLoop;
|
|
29
|
+
const tool_use_parser_1 = require("./tool-use-parser");
|
|
30
|
+
const toolCallNormalize_1 = require("./loop/toolCallNormalize");
|
|
31
|
+
const singleToolExecute_1 = require("./loop/singleToolExecute");
|
|
32
|
+
const turnSetup_1 = require("./loop/turnSetup");
|
|
33
|
+
const llmStream_1 = require("./loop/llmStream");
|
|
34
|
+
const compactionTrigger_1 = require("./loop/compactionTrigger");
|
|
35
|
+
const parallelExecute_1 = require("./loop/parallelExecute");
|
|
36
|
+
const goalAnchor_1 = require("./loop/goalAnchor");
|
|
37
|
+
const finalAnswerNudges_1 = require("./loop/finalAnswerNudges");
|
|
38
|
+
const toolAvailabilityDetector_1 = require("./toolAvailabilityDetector");
|
|
39
|
+
/** Tool names that this module classifies as file-edit tools. */
const FILE_EDIT_TOOL_NAMES = new Set([
    'write_file',
    'apply_edit',
    'replace_range',
    'apply_patch',
]);
/**
 * Reports whether a tool name belongs to the file-edit set.
 *
 * @param {string} name Tool name to look up.
 * @returns {boolean} `true` when `name` is a member of FILE_EDIT_TOOL_NAMES.
 */
function isFileEditTool(name) {
    const isKnownEditTool = FILE_EDIT_TOOL_NAMES.has(name);
    return isKnownEditTool;
}
|
|
43
|
+
/**
 * Returns a promise that resolves (with no value) once a timer of `ms`
 * milliseconds fires.
 *
 * @param {number} ms Delay handed to `setTimeout`.
 * @returns {Promise<void>}
 */
function sleep(ms) {
    return new Promise((done) => {
        setTimeout(done, ms);
    });
}
|
|
46
|
+
/**
 * Extracts a string error code from an arbitrary thrown value.
 *
 * @param {unknown} error Any thrown value.
 * @returns {string|undefined} `undefined` when `error` is not a non-null
 *   object or has no `code` property; otherwise the code coerced to a
 *   string (a nullish code becomes the empty string).
 */
function getErrorCode(error) {
    if (typeof error !== 'object' || error === null) {
        return undefined;
    }
    if (!('code' in error)) {
        return undefined;
    }
    return String(error.code ?? '');
}
|
|
51
|
+
/**
 * Produces a printable message for an arbitrary thrown value.
 *
 * @param {unknown} error Any thrown value.
 * @returns {string} `error.message` for `Error` instances, otherwise the
 *   value coerced with `String()`.
 */
function getErrorMessage(error) {
    if (error instanceof Error) {
        return error.message;
    }
    return String(error);
}
|
|
54
|
+
/**
 * Classifies an LLM call failure as retryable or not.
 *
 * Never retryable: an error whose code is `USER_ABORT`, or whose message
 * mentions `429` / "rate limit".
 * Retryable: an error whose code is `WATCHDOG`, or whose message contains a
 * standalone three-digit number starting with 5, the text "Upstream model
 * request failed", or one of the network-failure markers listed below.
 *
 * @param {unknown} error Any thrown value.
 * @returns {boolean} `true` when the failure matches a retryable signature.
 */
function isRetryableLlmError(error) {
    const errorCode = getErrorCode(error);
    if (errorCode === 'USER_ABORT') {
        return false;
    }
    const text = getErrorMessage(error);
    const isRateLimited = /\b429\b|rate limit/i.test(text);
    if (isRateLimited) {
        return false;
    }
    if (errorCode === 'WATCHDOG') {
        return true;
    }
    // Message signatures checked in the same order as a short-circuit `||`.
    const retryableSignatures = [
        /\b5\d\d\b/,
        /Upstream model request failed/i,
        /ECONNREFUSED|ECONNRESET|ETIMEDOUT|EAI_AGAIN|socket hang up|fetch failed|network error|terminated|UND_ERR/i,
    ];
    return retryableSignatures.some((signature) => signature.test(text));
}
|
|
66
|
+
/**
 * Stamps `code = 'UPSTREAM_MODEL'` onto an `Error` that has no truthy
 * `code` yet. Non-`Error` values and errors that already carry a code are
 * left untouched. Mutates the argument in place; returns nothing.
 *
 * @param {unknown} error Any thrown value.
 * @returns {void}
 */
function tagRetryableLlmError(error) {
    if (!(error instanceof Error)) {
        return;
    }
    if (error.code) {
        return;
    }
    error.code = 'UPSTREAM_MODEL';
}
|
|
73
|
+
/**
 * Builds a one-line summary of an LLM error: whitespace runs are collapsed
 * to single spaces, the result is trimmed, and anything longer than 180
 * characters is cut to 177 characters plus a literal `...`.
 *
 * @param {unknown} error Any thrown value.
 * @returns {string} Summary of at most 180 characters.
 */
function summarizeLlmError(error) {
    const MAX_SUMMARY_LENGTH = 180;
    const ELLIPSIS = '...';
    const flattened = getErrorMessage(error).replace(/\s+/g, ' ').trim();
    if (flattened.length <= MAX_SUMMARY_LENGTH) {
        return flattened;
    }
    const head = flattened.slice(0, MAX_SUMMARY_LENGTH - ELLIPSIS.length);
    return `${head}${ELLIPSIS}`;
}
|
|
77
|
+
/**
 * Detects "keep going" / "continue" / "yes" style prompts that carry no
 * real goal content.
 *
 * The goal-anchor block uses the most recent user message as the recall
 * text; when that text is "good lets keep going" the anchor degenerates
 * into "remind yourself to keep going", which gives the model nothing to
 * anchor on after 20 iterations of drift. Observed on a 60-iteration
 * linter-fix turn: every anchor injection cited "good lets keep going" as
 * the goal. This detector lets callers walk back to a prior substantive
 * prompt instead.
 *
 * Two filters keep false positives down:
 *  - a length cap: anything longer than 60 characters (after trimming) is
 *    never treated as a continuation prompt;
 *  - an exact match of the normalized text against a fixed phrase list,
 *    optionally wrapped as "please <phrase>" or "<phrase> please". A
 *    sentence such as "keep going on the auth refactor for the
 *    user-service" is under the length cap but matches no phrase exactly,
 *    so it stays a goal.
 */
const CONTINUATION_PROMPT_PHRASES = new Set([
    'continue', 'keep going', 'go on', 'proceed', 'next', 'more',
    'please continue', 'carry on', 'finish', 'finish it', 'finish up', 'wrap up', 'wrap it up',
    'good', 'great', 'nice', 'cool', 'sweet', 'perfect', 'ok', 'okay', 'k', 'yes', 'y', 'yep', 'yeah', 'ack', 'done',
    "let's continue", 'lets continue', "let's keep going", 'lets keep going',
    'good keep going', 'good lets keep going', "good let's keep going",
    'good continue', 'ok continue', 'okay continue'
]);
/**
 * @param {string} text Raw user message.
 * @returns {boolean} `true` when the message is only a continuation token.
 *   Non-string input (including `null` / `undefined`) returns `false`
 *   instead of throwing.
 */
function isContinuationPrompt(text) {
    if (typeof text !== 'string')
        return false;
    const trimmed = text.trim();
    if (trimmed.length === 0 || trimmed.length > 60)
        return false;
    // Normalize: lowercase, fold typographic apostrophes to ASCII so
    // "let’s keep going" matches the "let's …" phrases, replace every
    // character that is not a word char, whitespace or apostrophe with a
    // space, then collapse whitespace.
    const norm = trimmed
        .toLowerCase()
        .replace(/[\u2018\u2019]/g, "'")
        .replace(/[^\w\s']/g, ' ')
        .replace(/\s+/g, ' ')
        .trim();
    if (CONTINUATION_PROMPT_PHRASES.has(norm))
        return true;
    // Permit "please <phrase>" and "<phrase> please" wrappings. Stripping
    // the wrapper and doing a Set lookup is equivalent to comparing against
    // every phrase, without the linear scan.
    const PLEASE_PREFIX = 'please ';
    const PLEASE_SUFFIX = ' please';
    if (norm.startsWith(PLEASE_PREFIX) &&
        CONTINUATION_PROMPT_PHRASES.has(norm.slice(PLEASE_PREFIX.length)))
        return true;
    if (norm.endsWith(PLEASE_SUFFIX) &&
        CONTINUATION_PROMPT_PHRASES.has(norm.slice(0, -PLEASE_SUFFIX.length)))
        return true;
    return false;
}
|
|
120
|
+
/**
 * "Noticing prompt" detector. Catches user messages that are asking
 * about state ("are we using these?", "did you update X?", "where's
 * the…?", "isn't this supposed to be…?") rather than requesting new
 * work. These signal that the user spotted a gap in the prior turn
 * and wants the agent to address it — NOT continue the prior plan.
 *
 * Real failure mode captured 2026-05-25 on a Portfolio React refactor:
 * user asked "I dont think we actually are using these new files are
 * we?" after the agent wrote data files but never wired them into
 * App.jsx. Bandit read the question as a generic "keep going" prompt,
 * wrote 5 MORE new component files, still didn't touch App.jsx. The
 * pivot signal was right there in the prompt shape and got missed.
 *
 * The check is conservative:
 *  - messages longer than 220 characters (after trimming) never match,
 *    since longer messages usually contain a real request rather than a
 *    pure noticing question;
 *  - the normalized message must START with one of the recognized
 *    question/concern stems below (so "are we using these?" matches but
 *    "is this the right approach to X" does not — there is no bare "is"
 *    stem);
 *  - the message must also contain a question mark or a concern word
 *    (should, need to, supposed to, expected, missing, wrong, broken,
 *    stuck).
 *
 * @param {string} text Raw user message.
 * @returns {boolean} `true` when the message looks like a noticing /
 *   clarifying question. Non-string input returns `false`.
 */
function isNoticingPrompt(text) {
    if (typeof text !== 'string')
        return false;
    const trimmed = text.trim();
    if (trimmed.length === 0 || trimmed.length > 220)
        return false;
    // Fold typographic apostrophes to ASCII first; otherwise "isn’t" is
    // split into "isn t" by the punctuation strip and no contraction stem
    // can match.
    const norm = trimmed
        .toLowerCase()
        .replace(/[\u2018\u2019]/g, "'")
        .replace(/[^\w\s'?-]/g, ' ')
        .replace(/\s+/g, ' ')
        .trim();
    // Stems that introduce a noticing/clarifying question. Anchored to
    // the start of the message so a paragraph mentioning "are we"
    // mid-text doesn't false-positive.
    const STEMS = [
        /^(?:i\s+)?(?:dont|don't|do\s+not)\s+(?:think|see)\s/, // "I dont think…", "I don't see…"
        /^are\s+we\s/, // "are we using…"
        // Also covers "did you miss/forget/skip/overlook …".
        /^did\s+(?:you|we)\s/, // "did you remember to…"
        /^didn'?t\s+(?:you|we)\s/, // "didn't you say…", "didnt you…"
        /^isn'?t\s+(?:this|that|it|there)\s/, // "isn't this missing…"
        /^shouldn'?t\s+(?:this|that|it|there|we)\s/, // "shouldn't we…"
        /^why\s+(?:didn'?t|isn'?t|aren'?t|doesn'?t|don'?t)\s/, // "why isn't X happening"
        /^where(?:'s|\s+is|\s+are|\s+did)\s/, // "where is the import", "where's the …"
        /^what\s+(?:about|happened\s+to)\s/, // "what about App.jsx"
        /^(?:i\s+thought\s+)?you\s+(?:said|were|are)\s+(?:supposed|going|gonna)/,
        /^this\s+doesn'?t\s/, // "this doesn't look right"
        /^that\s+doesn'?t\s/,
        /^hmm\b|^huh\b/,
        /^wait\b/, // "wait — what about Y?"
        /^(?:i'?m|am\s+i)\s+(?:missing|seeing|reading)\b/,
    ];
    if (!STEMS.some((re) => re.test(norm)))
        return false;
    // Has to contain a question mark OR a concern modal. Lots of false
    // matches without — e.g. an opening "are we" that leads into a plain
    // feature request.
    const hasQuestion = trimmed.includes('?');
    const hasConcernModal = /\b(?:should|need\s+to|supposed\s+to|expected|missing|wrong|broken|stuck)\b/i.test(trimmed);
    return hasQuestion || hasConcernModal;
}
|
|
174
|
+
class ToolUseLoop {
|
|
175
|
+
constructor(registry, ctx, options = {}) {
|
|
176
|
+
this.registry = registry;
|
|
177
|
+
this.ctx = ctx;
|
|
178
|
+
this.defaultOptions = options;
|
|
179
|
+
this.maxIterations = options.maxIterations ?? 10;
|
|
180
|
+
this.defaultEmit = options.emitEvent ?? (() => undefined);
|
|
181
|
+
this.defaultBeforeToolExecute = options.beforeToolExecute ?? (() => ({ allow: true }));
|
|
182
|
+
}
|
|
183
|
+
/**
|
|
184
|
+
* Run the tool use loop.
|
|
185
|
+
*
|
|
186
|
+
* @param userGoal The original user request (becomes the first user message).
|
|
187
|
+
* @param chat A streaming chat function — returns an async iterable of text chunks.
|
|
188
|
+
* @param systemPrompt Optional base system prompt. Tool definitions are appended to it.
|
|
189
|
+
* @param options Per-call options (emitEvent override, etc.)
|
|
190
|
+
*/
|
|
191
|
+
async run(userGoal, chat, systemPrompt, options) {
|
|
192
|
+
return this.runWithMessages([{ role: 'user', content: userGoal }], chat, systemPrompt, options);
|
|
193
|
+
}
|
|
194
|
+
/**
|
|
195
|
+
* Run the tool use loop seeded with prior conversation messages.
|
|
196
|
+
* Use this for REPL-style hosts that want to preserve multi-turn context;
|
|
197
|
+
* the caller supplies the full user/assistant history (no system message —
|
|
198
|
+
* the loop prepends its own system prompt with tool definitions).
|
|
199
|
+
*/
|
|
200
|
+
async runWithMessages(seedMessages, chat, systemPrompt, options) {
|
|
201
|
+
const effectiveOptions = { ...this.defaultOptions, ...options };
|
|
202
|
+
const emit = effectiveOptions.emitEvent ?? this.defaultEmit;
|
|
203
|
+
// soft/hard cap split. `max` is now mutable so the loop
|
|
204
|
+
// can extend it when the model is making clear progress. The hard
|
|
205
|
+
// ceiling is `2 * initialMax` (40 by default) — beyond that we
|
|
206
|
+
// always wrap up regardless of how healthy the iteration looked.
|
|
207
|
+
// a real turn was patching 17 implicit-any
|
|
208
|
+
// errors one apply_edit per iteration, exhausted the 20-cap with
|
|
209
|
+
// 5 errors outstanding even though every iteration was succeeding
|
|
210
|
+
// and no loop-detection nudges had fired. Letting the model
|
|
211
|
+
// continue when it's clearly making progress is the right move.
|
|
212
|
+
let max = effectiveOptions.maxIterations ?? this.maxIterations;
|
|
213
|
+
const initialMax = max;
|
|
214
|
+
const hardCap = Math.max(initialMax * 2, initialMax + 20);
|
|
215
|
+
const CAP_EXTENSION_SIZE = 10;
|
|
216
|
+
const MAX_CAP_EXTENSIONS = 2;
|
|
217
|
+
let iterationCapExtensions = 0;
|
|
218
|
+
// Healthy-progress signal: track whether each of the last N iterations
|
|
219
|
+
// produced any tool calls. Rolling window of 5. Empty iterations
|
|
220
|
+
// (parse failures, prose-only responses) push `false`; productive
|
|
221
|
+
// iterations push `true`. Extension only fires when all 5 are true.
|
|
222
|
+
const recentIterationsHadTools = [];
|
|
223
|
+
const RECENT_HEALTH_WINDOW = 5;
|
|
224
|
+
const beforeToolExecute = effectiveOptions.beforeToolExecute ?? this.defaultBeforeToolExecute;
|
|
225
|
+
const signal = effectiveOptions.signal;
|
|
226
|
+
const maxParallelTools = Math.max(1, effectiveOptions.maxParallelTools ?? 8);
|
|
227
|
+
const maxTotalTools = Math.max(1, effectiveOptions.maxTotalTools ?? 60);
|
|
228
|
+
const outputBudgetTokens = effectiveOptions.outputBudgetTokens ?? Infinity;
|
|
229
|
+
const outputBudgetRatio = effectiveOptions.outputBudgetRatio ?? 0.6;
|
|
230
|
+
let totalToolsExecuted = 0;
|
|
231
|
+
const buildCancelledResult = (msgs, iter, finalText = '') => ({
|
|
232
|
+
finalResponse: finalText || '[cancelled]',
|
|
233
|
+
iterations: iter,
|
|
234
|
+
messages: msgs,
|
|
235
|
+
hitLimit: false,
|
|
236
|
+
cancelled: true
|
|
237
|
+
});
|
|
238
|
+
let nativeTools = effectiveOptions.nativeTools ?? false;
|
|
239
|
+
const nativeToolFailureFallback = effectiveOptions.nativeToolFailureFallback ?? true;
|
|
240
|
+
let nativeFallbackUsed = false;
|
|
241
|
+
// One-shot outer-layer retry on the text channel after the native
|
|
242
|
+
// channel switched. The inner same-channel retry layer covers the
|
|
243
|
+
// common transient blip case, but a sustained native failure forces
|
|
244
|
+
// the channel switch; if the first text call ALSO hits a transient
|
|
245
|
+
// blip (gateway flapping, ollama still recovering from load), the
|
|
246
|
+
// previous code path threw `Upstream model request failed` straight
|
|
247
|
+
// to the user with no recovery. This flag lets the outer catch
|
|
248
|
+
// re-enter `streamAndAggregate` exactly once more on the text channel
|
|
249
|
+
// before declaring the turn dead. Addresses the "double-failure path
|
|
250
|
+
// is still terminal" gap.
|
|
251
|
+
let textFallbackRetryUsed = false;
|
|
252
|
+
// One-shot final attempt: after every prior retry slot is spent,
|
|
253
|
+
// push a clean re-anchor message that re-states the original user
|
|
254
|
+
// goal and retry once more. Sometimes a mid-stream replay can't
|
|
255
|
+
// recover (the model is anchored on a half-emitted tool_call
|
|
256
|
+
// payload or a partial reasoning block) but a fresh anchor with
|
|
257
|
+
// explicit "this is a recovery attempt — answer the original goal"
|
|
258
|
+
// framing succeeds. Last resort before terminal throw.
|
|
259
|
+
let finalAnchorRetryUsed = false;
|
|
260
|
+
const textToolBlock = this.registry.buildSystemPromptBlock();
|
|
261
|
+
const buildFullSystemPrompt = (useNativeTools) => {
|
|
262
|
+
if (useNativeTools)
|
|
263
|
+
return systemPrompt ?? '';
|
|
264
|
+
return systemPrompt
|
|
265
|
+
? `${systemPrompt}\n\n${textToolBlock}`
|
|
266
|
+
: textToolBlock;
|
|
267
|
+
};
|
|
268
|
+
let nativeSchemas = nativeTools ? this.registry.buildNativeToolsSchema() : undefined;
|
|
269
|
+
const messages = [];
|
|
270
|
+
const initialSystemPrompt = buildFullSystemPrompt(nativeTools);
|
|
271
|
+
if (initialSystemPrompt) {
|
|
272
|
+
messages.push({ role: 'system', content: initialSystemPrompt });
|
|
273
|
+
}
|
|
274
|
+
// Capture the most recent user message (the actual goal of THIS turn,
|
|
275
|
+
// not earlier conversation turns). Used by the goal-anchor reminder
|
|
276
|
+
// below when the model is about to generate its final answer — long
|
|
277
|
+
// tool-result chains push the original question down the attention
|
|
278
|
+
// window and the model can drift to a related-but-different topic.
|
|
279
|
+
// Walks back through continuation tokens ("keep going", "yes") to
|
|
280
|
+
// the most recent SUBSTANTIVE prompt. See loop/turnSetup.ts.
|
|
281
|
+
let { originalGoal, priorUserPromptCount } = (0, turnSetup_1.resolveTurnGoal)({ seedMessages });
|
|
282
|
+
// Track the iteration we last anchored on rather than a boolean
|
|
283
|
+
// so we can re-fire when the model pivots AGAIN later in a long
|
|
284
|
+
// turn. -1 means "never anchored." Re-fire is gated by the
|
|
285
|
+
// GOAL_ANCHOR_REFIRE_GAP below to avoid hammering on a model
|
|
286
|
+
// that's working steadily — only fires again when the loop has
|
|
287
|
+
// continued without resolution for several more iterations.
|
|
288
|
+
let lastGoalAnchorIteration = -1;
|
|
289
|
+
for (const msg of seedMessages) {
|
|
290
|
+
if (msg.role === 'system')
|
|
291
|
+
continue;
|
|
292
|
+
messages.push(msg);
|
|
293
|
+
}
|
|
294
|
+
// Noticing-prompt pivot hint. When the most-recent user message
|
|
295
|
+
// looks like a noticing/clarifying question ("are we using these?",
|
|
296
|
+
// "did you remember X?", "where's the…?"), inject a one-time
|
|
297
|
+
// synthetic user-role hint instructing the model to address the
|
|
298
|
+
// implicit gap BEFORE continuing any prior plan. Without this the
|
|
299
|
+
// model often reads such prompts as generic "keep going" signals
|
|
300
|
+
// and continues scaffolding work the user just paused them on.
|
|
301
|
+
// One-shot per turn — only fires on this first pass.
|
|
302
|
+
if (originalGoal && isNoticingPrompt(originalGoal)) {
|
|
303
|
+
emit('tool_loop:noticing_prompt_hint', {
|
|
304
|
+
promptPreview: originalGoal.slice(0, 200)
|
|
305
|
+
});
|
|
306
|
+
messages.push({
|
|
307
|
+
role: 'user',
|
|
308
|
+
content: '[Reading-comprehension note for the assistant: the user\'s last message above is a noticing / clarifying question — they spotted a possible gap from prior turns and are asking you to confirm or correct, NOT to continue any prior plan. Before you take any new action, identify what gap the question points at and address it directly. If the question is "are we using X?" the correct first move is to verify whether X is actually being used (read the consumer file, grep for the import, check the call site) and answer honestly — yes/no with evidence. Do NOT create more new artifacts unless the user explicitly says to.]'
|
|
309
|
+
});
|
|
310
|
+
}
|
|
311
|
+
let iterations = 0;
|
|
312
|
+
let hitLimit = false;
|
|
313
|
+
let consecutiveEmptyRetries = 0;
|
|
314
|
+
// Per-retry-path budgets. Keeping these separate from
|
|
315
|
+
// consecutiveEmptyRetries (which resets on any non-empty response)
|
|
316
|
+
// prevents an infinite retry when a model repeatedly emits the
|
|
317
|
+
// SAME malformed tool_call — the S3Api pburg workspace (Apr 22)
|
|
318
|
+
// ran 10+ iterations at iteration=2 because each 30s malformed
|
|
319
|
+
// apply_edit response reset consecutiveEmptyRetries to 0 and the
|
|
320
|
+
// parse-retry counter got to fire again. Caps are per-turn (not
|
|
321
|
+
// per-iteration) so the model genuinely exhausts its attempts
|
|
322
|
+
// before we give up.
|
|
323
|
+
let parseRetries = 0;
|
|
324
|
+
let fakeToolResultRetries = 0;
|
|
325
|
+
let toolAbsenceCorrectionsFired = 0;
|
|
326
|
+
let toolErrorRecoveryFired = 0;
|
|
327
|
+
let lastIterationHadToolError = false;
|
|
328
|
+
const PARSE_RETRY_CAP = 2;
|
|
329
|
+
const FAKE_TOOL_RESULT_CAP = 2;
|
|
330
|
+
const TOOL_ABSENCE_CORRECTION_CAP = 1;
|
|
331
|
+
const TOOL_ERROR_RECOVERY_CAP = 1;
|
|
332
|
+
// Hard turn-level cap on responses that produced no tool_call. The
|
|
333
|
+
// individual detectors (empty_retry, narrate-no-action, tool_error
|
|
334
|
+
// recovery, etc.) each have their own caps, but they can chain — a
|
|
335
|
+
// model can spin through 6+ no-tool-call responses because
|
|
336
|
+
// thinking-off recovery resets consecutiveEmptyRetries=0. Captured
|
|
337
|
+
// 2026-05-26 in Mark's Portfolio session (turn-2026-05-26T02-30-37):
|
|
338
|
+
// model emitted 6 sequential reasoning-only responses inside
|
|
339
|
+
// iteration 4 before the loop finally terminated with a useless
|
|
340
|
+
// final answer ("I need to stop wrapping tool calls in reasoning
|
|
341
|
+
// blocks"). This counter doesn't reset on detector firings — when
|
|
342
|
+
// it hits the cap, the loop terminates with a final answer that
|
|
343
|
+
// names the stuck state so the user knows what to retry with.
|
|
344
|
+
let noToolCallAttemptsThisTurn = 0;
|
|
345
|
+
// 4 → 5 (Jun 2026): make room for prefill_recovery after the existing
|
|
346
|
+
// empty_retry ×2 + thinking_off_recovery sequence. The new ordering is
|
|
347
|
+
// 1. empty_retry (consec=1)
|
|
348
|
+
// 2. empty_retry (consec=2)
|
|
349
|
+
// 3. thinking_off_recovery (force think:false)
|
|
350
|
+
// 4. prefill_recovery (push `<tool_call>{"name":"` as assistant prefill)
|
|
351
|
+
// 5. hard cap → stuck answer
|
|
352
|
+
// Prefill is qualitatively different from the prior steps — it forces
|
|
353
|
+
// the model into an envelope-opened state so it can't terminate at the
|
|
354
|
+
// reasoning fence — and is the highest-leverage recovery slot for the
|
|
355
|
+
// qwen3.6 "stops after fence close" failure mode.
|
|
356
|
+
const NO_TOOL_CALL_HARD_CAP = 5;
|
|
357
|
+
// One-shot recovery: when consecutive reasoning-only retries exhaust
|
|
358
|
+
// (the model is stuck thinking and never emits content or tool_calls),
|
|
359
|
+
// make ONE final attempt with thinking forced OFF. Observed
|
|
360
|
+
// 2026-04-26 with qwen3.6:27b on remote Ollama — thinking-on stalled
|
|
361
|
+
// intermittently while bandit-logic on the home cluster (same model,
|
|
362
|
+
// different serving stack) worked fine. Forcing thinking off
|
|
363
|
+
// collapses the model into the regular content channel where its
|
|
364
|
+
// tool-call sampling is far more deterministic.
|
|
365
|
+
let thinkingOffRecoveryAttempted = false;
|
|
366
|
+
let nextCallThinkOverride = undefined;
|
|
367
|
+
// Final-shot prefill recovery for qwen3.6-style "closes the reasoning
|
|
368
|
+
// fence and stops" stalls. Observed Jun 2026 on a long CSS-refactor
|
|
369
|
+
// turn: the model emitted 4 reasoning-only responses in a row even
|
|
370
|
+
// after the nudge + thinking-off recovery had fired. Reasoning content
|
|
371
|
+
// said "I need to actually emit tool calls" but generation terminated
|
|
372
|
+
// right after the fence close. Prefill removes the choice — we push an
|
|
373
|
+
// assistant message containing `<tool_call>{"name":"` so the next
|
|
374
|
+
// generation MUST continue from inside an envelope. The provider
|
|
375
|
+
// returns only the new tokens, so `pendingPrefillPrefix` is prepended
|
|
376
|
+
// to the response before parsing.
|
|
377
|
+
let prefillRecoveryAttempted = false;
|
|
378
|
+
let pendingPrefillPrefix = null;
|
|
379
|
+
// Track the last N non-tool-calling assistant responses so we can
|
|
380
|
+
// detect a "deliberation loop" — the model emits multiple iterations
|
|
381
|
+
// of highly-similar prose ("Wait, I see X isn't listed. Let me check
|
|
382
|
+
// X. Actually, I'll try to read X.") without ever calling a tool.
|
|
383
|
+
// Observed Apr 2026 on pburg-bowl with bandit-core-1: the model
|
|
384
|
+
// streamed 24k chars of self-contradicting prose in a SINGLE
|
|
385
|
+
// response, and if the content had been split across iterations the
|
|
386
|
+
// existing detectors (hitLimit, false-completion patterns) would
|
|
387
|
+
// also have missed it because each individual response looked
|
|
388
|
+
// plausible in isolation. The cross-iteration guard below kicks in
|
|
389
|
+
// if we see K non-tool iterations whose normalized prose overlaps
|
|
390
|
+
// heavily with the previous one.
|
|
391
|
+
const recentNonToolResponses = [];
|
|
392
|
+
const PROSE_LOOP_WINDOW = 2; // look back this many iterations
|
|
393
|
+
let proseLoopNudged = false;
|
|
394
|
+
// Track recent tool calls to detect a stuck model. The classic failure:
|
|
395
|
+
// the model writes a long JSON/TS file, its output gets truncated by an
|
|
396
|
+
// unescaped quote in the content, the write "succeeds" but lands corrupt,
|
|
397
|
+
// and the model immediately retries the same write hoping the problem
|
|
398
|
+
// was transient. Without a circuit breaker it will loop until maxIterations.
|
|
399
|
+
const recentCallKeys = [];
|
|
400
|
+
const REPEAT_LIMIT = 3;
|
|
401
|
+
// Track whether the model keeps emitting `todo_write` as its only tool
|
|
402
|
+
// in consecutive iterations. The v1.5.40 "todo_store summary" nudge was
|
|
403
|
+
// supposed to end this, but observed pburg-bowl traces (Apr 2026) show
|
|
404
|
+
// the model still burns 3 iterations in a row revising its todo list
|
|
405
|
+
// before doing any actual work. When N consecutive iterations fire
|
|
406
|
+
// `todo_write` as the ONLY tool (no search/read/write alongside), we
|
|
407
|
+
// inject a corrective nudge once.
|
|
408
|
+
let consecutiveTodoOnlyIterations = 0;
|
|
409
|
+
// 3 consecutive todo-only iterations before we intervene. A lower limit was found
|
|
410
|
+
// to block bandit-logic from ever ticking plan
|
|
411
|
+
// items to "completed" — the model called todo_write twice to set up
|
|
412
|
+
// the plan, churn nudge fired at iteration 1, and the "do NOT call
|
|
413
|
+
// todo_write again this turn" message killed status updates for the
|
|
414
|
+
// rest of the run. 3 gives the model one more iteration of grace.
|
|
415
|
+
const TODO_ONLY_LIMIT = 3;
|
|
416
|
+
let todoChurnNudged = false;
|
|
417
|
+
// apply_edit-loop nudge. Motivated by a real
|
|
418
|
+
// bandit-cli run that hit the 20-iteration cap while patching 17
|
|
419
|
+
// implicit-any TypeScript errors one apply_edit at a time. Each
|
|
420
|
+
// call landed (the work was real, unlike todo-churn), but the
|
|
421
|
+
// sequential one-error-per-iteration cadence ate the whole budget.
|
|
422
|
+
// When the model spends N consecutive iterations doing only
|
|
423
|
+
// apply_edit (no read/run/search interleaved), we inject a one-shot
|
|
424
|
+
// nudge pointing at apply_patch (multi-file, multi-hunk) or a
|
|
425
|
+
// broader-context apply_edit that consolidates several adjacent
|
|
426
|
+
// fixes — both expand throughput without changing the iteration
|
|
427
|
+
// cap. Limit is 4 (one higher than todo-only): apply_edits are
|
|
428
|
+
// real progress, so we tolerate one more before nudging.
|
|
429
|
+
let consecutiveApplyEditOnlyIterations = 0;
|
|
430
|
+
const APPLY_EDIT_ONLY_LIMIT = 4;
|
|
431
|
+
let applyEditBatchNudged = false;
|
|
432
|
+
// Companion to the churn breaker: detect when the model set up a plan
|
|
433
|
+
// via `todo_write` early, then did multiple edit iterations WITHOUT
|
|
434
|
+
// calling `todo_write` again. The Plan block in the UI stays frozen
|
|
435
|
+
// on the original pending state — user watches the feed do real work
|
|
436
|
+
// but sees nothing flip to ✓. Observed on Gemma 4 12B:
|
|
437
|
+
// iteration 1 set up 4-item plan, iterations 2-7 did reads + edits,
|
|
438
|
+
// turn ended at iteration 8 with the Plan still all-pending. Nudge
|
|
439
|
+
// fires at most once per turn, and ONLY on models without native
|
|
440
|
+
// tool calling (capable models generally update plans unprompted).
|
|
441
|
+
let lastTodoWriteIter = -1;
|
|
442
|
+
let editsSinceLastTodo = 0;
|
|
443
|
+
let todoProgressNudged = false;
|
|
444
|
+
const TODO_PROGRESS_STALE_DELTA = 3;
|
|
445
|
+
const TODO_PROGRESS_EDIT_THRESHOLD = 2;
|
|
446
|
+
// Track file paths the user referenced in the prompt or any prior tool
|
|
447
|
+
// call. If the model ends the turn with a large fenced code block and
|
|
448
|
+
// has NOT emitted any file-edit tool call, AND one of these
|
|
449
|
+
// referenced paths exists, we treat that as "code in markdown instead
|
|
450
|
+
// of a tool call" and nudge. Populated from the user goal up-front;
|
|
451
|
+
// the detector only fires when the signal is real.
|
|
452
|
+
let promptImpliesFileEdit = false;
|
|
453
|
+
// Companion to `promptImpliesFileEdit`: detect goals that ask for an
|
|
454
|
+
// ANALYSIS — "evaluate", "review", "audit", "what is", "how does",
|
|
455
|
+
// etc. Used by the limit-hit wrap-up logic to pick between the
|
|
456
|
+
// edit-shaped Shipped/Partway/Blocked template and the analysis-shaped
|
|
457
|
+
// Findings/Evidence/Gaps template. Without this, a "deep self
|
|
458
|
+
// evaluation" turn that hit the 60-call cap got the edit template
|
|
459
|
+
// and produced "Shipped: nothing" — useless framing for what was
|
|
460
|
+
// actually asked.
|
|
461
|
+
let promptWantsAnalysis = false;
|
|
462
|
+
{
|
|
463
|
+
// Accept simple path tokens (contains `/` and a file extension) OR
|
|
464
|
+
// the keywords "update", "edit", "change", "fix", "modify", "refactor",
|
|
465
|
+
// "rewrite" — any of which imply the user expects a write. Heuristic,
|
|
466
|
+
// not a parser. False positives here cost us one wasted nudge;
|
|
467
|
+
// false negatives let code-fence hallucinations ship.
|
|
468
|
+
const goalText = seedMessages
|
|
469
|
+
.filter(m => m.role === 'user')
|
|
470
|
+
.map(m => m.content)
|
|
471
|
+
.join('\n')
|
|
472
|
+
.toLowerCase();
|
|
473
|
+
promptImpliesFileEdit =
|
|
474
|
+
/\b(update|edit|change|fix|modify|refactor|rewrite|replace|add)\b/.test(goalText) ||
|
|
475
|
+
/[\w\-./]+\.(?:ts|tsx|js|jsx|py|rb|go|rs|java|kt|cs|swift|php|cpp|c|h|md|json|ya?ml|html|css)\b/.test(goalText);
|
|
476
|
+
// Analysis verbs/phrasings. Includes both verb forms ("evaluate",
|
|
477
|
+
// "review") and question forms ("what is", "how does", "why
|
|
478
|
+
// does") so "evaluate this codebase" and "what's keeping this
|
|
479
|
+
// agent from being better" both light up. Compatible with
|
|
480
|
+
// `promptImpliesFileEdit` — a goal can match both ("look at
|
|
481
|
+
// file.ts and tell me what you see"); the wrap-up picker
|
|
482
|
+
// resolves precedence using `editToolsInvoked` as the tiebreaker.
|
|
483
|
+
promptWantsAnalysis =
|
|
484
|
+
/\b(evaluate|review|analy[sz]e|audit|inspect|investigate|explain|summari[sz]e|describe|tell\s+me|find\s+out|self[-\s]?eval(?:uat(?:e|ion))?)\b/i.test(goalText)
|
|
485
|
+
|| /\b(what(?:'s|\s+is|\s+are)|how\s+does|why\s+does|where\s+does)\b/i.test(goalText)
|
|
486
|
+
|| /\blook(?:ing)?\s+at\b/i.test(goalText);
|
|
487
|
+
}
|
|
488
|
+
// Track whether any file-producing tool call has actually been invoked
|
|
489
|
+
// this turn. Used by the "false completion" detector below: if the model
|
|
490
|
+
// emits a final response claiming it wrote code but never called
|
|
491
|
+
// write_file / apply_edit / replace_range / apply_patch, we inject a corrective nudge and force one
|
|
492
|
+
// more iteration so the model has a chance to actually do the work.
|
|
493
|
+
let editToolsInvoked = 0;
|
|
494
|
+
// Per-file tracking so the "subject not modified" detector (further
|
|
495
|
+
// below) can catch the refactor failure mode where the model reads
|
|
496
|
+
// a file for context, writes NEW files based on it, but never
|
|
497
|
+
// updates the original. The set is normalized (lowercase, basename)
|
|
498
|
+
// so different references to the same file collapse.
|
|
499
|
+
const filesReadThisTurn = new Set();
|
|
500
|
+
const filesWrittenThisTurn = new Set();
|
|
501
|
+
let subjectNotModifiedNudged = false;
|
|
502
|
+
// One-shot guard for the code-fence-as-final-answer detector (see below).
|
|
503
|
+
let codeFenceHallucinationNudged = false;
|
|
504
|
+
// One-shot guard for the JSON-todo auto-promotion detector (see
|
|
505
|
+
// below). Small models (12B Gemma observed) sometimes paste their
|
|
506
|
+
// todo list as a ```json code fence instead of calling todo_write,
|
|
507
|
+
// which means the plan never advances and they re-iterate on the
|
|
508
|
+
// same task. We detect the shape, synthesize a todo_write call,
|
|
509
|
+
// execute it as if the model had emitted it, and continue. Capped
|
|
510
|
+
// once per turn so a model that genuinely wants to show JSON data
|
|
511
|
+
// isn't caught in a loop.
|
|
512
|
+
let jsonTodoAutoPromoted = false;
|
|
513
|
+
// One-shot guard so we don't infinite-loop a truly confused model.
|
|
514
|
+
// The detector fires at most once per turn; if the model STILL claims
|
|
515
|
+
// completion without writing after the nudge, we let the turn terminate
|
|
516
|
+
// so the user can intervene.
|
|
517
|
+
let falseCompletionNudged = false;
|
|
518
|
+
// One-shot guard for the announce-then-stall detector. The model emits
|
|
519
|
+
// a forward-looking commitment ("Let me dig deeper into X", "Next I'll
|
|
520
|
+
// explore Y") with NO tool call, and the loop exits because no-tool =
|
|
521
|
+
// final answer. Observed with bandit-logic self-evaluating
|
|
522
|
+
// this repo: 3 iterations of reads, then iteration 4 returned only
|
|
523
|
+
// "Let me dig deeper into the core architecture..." and the runtime
|
|
524
|
+
// exited with iterations:3, hitLimit:false. None of the existing
|
|
525
|
+
// detectors caught it — no completion claim, no code fence, no prose-
|
|
526
|
+
// loop similarity (first stall after real work).
|
|
527
|
+
let announceIntentNudged = false;
|
|
528
|
+
let askUserNudged = false;
|
|
529
|
+
// One-shot guard for the fired-and-forgotten background-task detector.
|
|
530
|
+
// The model spawns multiple `task(run_in_background="true")` calls in
|
|
531
|
+
// one iteration and then either polls `check_task` immediately
|
|
532
|
+
// (returns "still running" — wasted iteration) or, more often, does
|
|
533
|
+
// the same exploration in parallel itself in the next iteration —
|
|
534
|
+
// burning the parent's context budget on work the subagents will
|
|
535
|
+
// report back. Observed trace: 6 backgrounded tasks spawned at
|
|
536
|
+
// iter 4, polled at iter 5 (none ready), parent then duplicated all
|
|
537
|
+
// their reads at iter 6. The nudge fires once per turn telling the
|
|
538
|
+
// model to either work on something independent or terminate the
|
|
539
|
+
// turn so the auto-inject can deliver synopses on the next turn.
|
|
540
|
+
let firedAndForgottenNudged = false;
|
|
541
|
+
// One-shot guard for the subagent-first-iteration-must-act detector.
|
|
542
|
+
// Subagents (`options.isSubagent === true`) are spawned to gather
|
|
543
|
+
// information for a specific goal; producing prose-only output on
|
|
544
|
+
// iteration 0 is always a stall, never a legitimate final answer.
|
|
545
|
+
// The existing announce-intent / narrate detectors miss when the
|
|
546
|
+
// model emits neutral reasoning + non-forward-looking prose
|
|
547
|
+
// ("This is a complex task...") that doesn't match their patterns.
|
|
548
|
+
// bandit-logic stalled 5/6 subagents on a
|
|
549
|
+
// self-eval turn with exactly that shape. Fires once per turn.
|
|
550
|
+
let subagentFirstIterNudged = false;
|
|
551
|
+
// Phrases a model uses when it thinks it has delivered code but hasn't
|
|
552
|
+
// actually emitted a write/edit tool call. Based on observed failure
|
|
553
|
+
// traces from bandit-core-1 and similar small models. Matched case-
|
|
554
|
+
// insensitively; any match + no write tool this turn trips the nudge.
|
|
555
|
+
const FALSE_COMPLETION_PATTERNS = [
|
|
556
|
+
/in (?:my|a|the) previous response/i,
|
|
557
|
+
/already provided (?:the|an?) (?:implementation|refactored|improved|updated)/i,
|
|
558
|
+
/you can find (?:the |this )?(?:refactored|improved|updated) (?:code|implementation)/i,
|
|
559
|
+
/here (?:is|'s) the (?:refactored|improved|updated|revised) (?:code|implementation|file)/i,
|
|
560
|
+
/(?:i have|i've) (?:refactored|rewritten|updated|improved)/i,
|
|
561
|
+
/(?:refactored|updated) (?:the )?(?:code|implementation) above/i,
|
|
562
|
+
/i'll finalize the task here/i,
|
|
563
|
+
/i've also marked (?:the tasks|these steps) as complet/i,
|
|
564
|
+
// Deferral patterns: the model emitted a malformed tool call (usually
|
|
565
|
+
// unescaped quotes/newlines in a large content payload), took the
|
|
566
|
+
// parse-retry nudge as a cue to apologize, and asked the user which
|
|
567
|
+
// task to resume instead of actually retrying. The user never sees
|
|
568
|
+
// the change land on disk. Observed in pburg-bowl scoring rewrite
|
|
569
|
+
// (Apr 2026): iteration 4 emitted write_file with unescaped content,
|
|
570
|
+
// parse-retry nudge fired, model responded with apology + "let me
|
|
571
|
+
// know which task I should resume" and termination.
|
|
572
|
+
/i apologi[sz]e for the (?:malformed|invalid)/i,
|
|
573
|
+
/(?:ensure|escape) (?:all )?(?:quotes|newlines|characters).*(?:properly )?escap/i,
|
|
574
|
+
/in my next tool call/i,
|
|
575
|
+
/let me know (?:which|what) (?:task|action) (?:i should |to )?resume/i,
|
|
576
|
+
/please (?:let me know|tell me).*(?:specific action|which task|what.*like me to)/i,
|
|
577
|
+
// Patterns surfaced 2026-04-23 on S3Api with bandit-logic (Qwen
|
|
578
|
+
// 2.5 Coder 32B). Model never called apply_edit, then ended the
|
|
579
|
+
// turn with "Based on the steps we've taken, here is the final
|
|
580
|
+
// state of the files..." followed by a prose dump of the
|
|
581
|
+
// "edited" files (which were never actually written to disk).
|
|
582
|
+
// The prior patterns covered "here is the refactored code" but
|
|
583
|
+
// not "here is the final state." Same failure mode, new words.
|
|
584
|
+
/here (?:is|'s) the (?:final|resulting|updated|modified) (?:state|version|content|output) of/i,
|
|
585
|
+
/(?:comments?|changes?|edits?|annotations?|updates?) (?:have )?been (?:added|made|applied|written|included)/i,
|
|
586
|
+
/you can verify (?:these|the|your) (?:changes?|edits?|updates?)/i,
|
|
587
|
+
/check(?:ing)? the files? (?:directly )?in your editor/i,
|
|
588
|
+
/running (?:a )?build to (?:see|verify|check)/i,
|
|
589
|
+
// Gemma 4 / bandit-core-1 escape patterns observed
|
|
590
|
+
// 2026-05-12 turn 1bec. After the bandit-tl hallucination detector
|
|
591
|
+
// blocked the fake-card shape, the model fell back to
|
|
592
|
+
// pure-prose lying with phrases like:
|
|
593
|
+
// "I have successfully eliminated all critical errors"
|
|
594
|
+
// "I have successfully fixed/resolved/removed/cleaned up X"
|
|
595
|
+
// "The project is now in a healthy state"
|
|
596
|
+
// "Verified via [tool] — confirmed [N→0]"
|
|
597
|
+
// "Removed forbidden require() calls: Converted them to ESM"
|
|
598
|
+
// Existing patterns covered "refactored / rewritten / updated /
|
|
599
|
+
// improved" but missed eliminated / resolved / cleaned / verified.
|
|
600
|
+
// Each new pattern is anchored to a completion-claim verb so this
|
|
601
|
+
// doesn't fire on legitimate "I will fix" intent phrases.
|
|
602
|
+
/(?:i have|i've)\s+(?:successfully\s+)?(?:eliminated|resolved|removed|cleaned|cleared|deleted|wiped|converted|wrapped|implemented|completed|finished)/i,
|
|
603
|
+
/(?:the project|the codebase|the file|the code) is now (?:in a (?:healthy|clean|working|fixed) state|fixed|complete|done|ready)/i,
|
|
604
|
+
/(?:verified|confirmed) (?:via|with|by running)\s+(?:the\s+)?(?:linter|tests?|build|tsc|eslint)/i,
|
|
605
|
+
/(?:critical errors?|lint(?:ing)? errors?|warnings?|issues?) (?:dropped|went|reduced) (?:from\s+)?\d+\+?\s*(?:to|→)\s*\d+/i,
|
|
606
|
+
// "Successfully" + past-tense action is the most common new shape.
|
|
607
|
+
/successfully\s+(?:fixed|resolved|removed|eliminated|cleaned|converted|implemented|verified|completed|applied|updated|patched)/i
|
|
608
|
+
];
|
|
609
|
+
for (;;) {
|
|
610
|
+
if (signal?.aborted) {
|
|
611
|
+
emit('tool_loop:cancelled', { iteration: iterations, stage: 'pre_iteration' });
|
|
612
|
+
return buildCancelledResult(messages, iterations);
|
|
613
|
+
}
|
|
614
|
+
// Both limit-hit messages now LEAD with the original user goal.
|
|
615
|
+
// Motivation: a self-evaluation turn hit the 60-tool cap,
|
|
616
|
+
// got the wrap-up nudge, and the model wrote a wrap-up about a
|
|
617
|
+
// wholly different project (Helm chart / Next.js) it had touched
|
|
618
|
+
// in compacted-away context — explicitly admitting "Without
|
|
619
|
+
// knowing the exact original prompt." After 60 calls + multiple
|
|
620
|
+
// compactions, the model genuinely cannot recall what was asked
|
|
621
|
+
// unless we put it back in front of them at wrap-up time. The
|
|
622
|
+
// anchor IS in the conversation but it's deep history; the
|
|
623
|
+
// wrap-up message is the LAST thing the model sees, so the goal
|
|
624
|
+
// belongs here too.
|
|
625
|
+
const goalRecallBlock = originalGoal
|
|
626
|
+
? `## ORIGINAL USER GOAL — answer THIS, not whatever feels salient in recent reads:\n\n "${originalGoal.trim()}"\n\n`
|
|
627
|
+
: '';
|
|
628
|
+
// Template picker — analysis-shaped goals (evaluate, review,
|
|
629
|
+
// explain, "what is X") get a Findings/Evidence/Gaps shape;
|
|
630
|
+
// edit-shaped goals (or any turn where edits actually fired)
|
|
631
|
+
// get the Shipped/Partway/Blocked shape. `editToolsInvoked > 0`
|
|
632
|
+
// takes precedence: if real edits landed, the user needs that
|
|
633
|
+
// accounting regardless of the prompt phrasing. Default for
|
|
634
|
+
// ambiguous goals (no edit signal, no analysis verb) is the
|
|
635
|
+
// edit shape — that's what was here before, kept as the
|
|
636
|
+
// conservative fallback.
|
|
637
|
+
const useAnalysisTemplate = editToolsInvoked === 0
|
|
638
|
+
&& (promptWantsAnalysis || !promptImpliesFileEdit);
|
|
639
|
+
// Analysis-shaped wrap-up. Three sections that match what an
|
|
640
|
+
// evaluator-style turn produces: a substantive synthesis, the
|
|
641
|
+
// material that supports it, and an honest list of gaps. Without
|
|
642
|
+
// this template, "Shipped: nothing landed" was the model's
|
|
643
|
+
// mandatory opener for analysis turns — useless framing for the
|
|
644
|
+
// self-evaluation request that surfaced this fix.
|
|
645
|
+
const analysisWrapUp = '**Findings** — your conclusions, the actual analysis the user asked for. Be specific: name files, patterns, gaps you saw. This is the deliverable; do NOT bury it under "I read X then Y then Z" — synthesise.\n' +
|
|
646
|
+
'\n' +
|
|
647
|
+
'**Evidence** — what you actually read or ran that supports each finding. File paths + brief description ("`tool-use-loop.ts:540` — goal-anchor only fires every 4 iterations"). Without this the user can\'t verify your claims.\n' +
|
|
648
|
+
'\n' +
|
|
649
|
+
'**What you didn\'t get to** — parts of the question you couldn\'t answer with what you saw. Be honest about gaps; do NOT invent confident claims about code you didn\'t actually read.\n';
|
|
650
|
+
const editWrapUp = '**Shipped** — concrete changes that ACTUALLY landed this turn. Only list edits where a write_file, apply_edit, replace_range, or apply_patch tool call returned successfully (no errors). Be specific about file + what changed.\n' +
|
|
651
|
+
'\n' +
|
|
652
|
+
'**Build state** — if you edited code this turn you MUST state the build state explicitly. Either (a) cite a verified-clean run from THIS turn — quote the command + "exit code 0" / "no errors" output, OR (b) say "I did not run the build / typecheck this turn — caller should verify". DO NOT claim items are Done if the build is failing; downgrade those items to Partway and name the remaining errors. Real on a linter-fix turn: model wrote "Shipped" with 7 bullets while `tsc --noEmit` still reported 5 errors it had run out of iterations to fix.\n' +
|
|
653
|
+
'\n' +
|
|
654
|
+
'**Partway** — investigation done but not yet committed (files read, searches run, plan formed). State what was learned and what the next step would be.\n' +
|
|
655
|
+
'\n' +
|
|
656
|
+
'**Blocked / not attempted** — anything in the user\'s request you did not get to, or attempted-but-failed (e.g. apply_edit returned find-not-found). Own the failure honestly — do NOT claim success on these. If a fix is one paragraph the user can apply manually, say so.\n';
|
|
657
|
+
const wrapUpBody = useAnalysisTemplate ? analysisWrapUp : editWrapUp;
|
|
658
|
+
if (iterations >= max) {
|
|
659
|
+
// Soft cap extension. Before forcing the wrap-up,
|
|
660
|
+
// check whether the model is making clear progress. Extension
|
|
661
|
+
// criteria: last RECENT_HEALTH_WINDOW iterations all produced
|
|
662
|
+
// tool calls (not empty, not todo-only), no loop-detection
|
|
663
|
+
// nudges have fired this turn, and we're under the hard
|
|
664
|
+
// ceiling. When all true, raise `max` by CAP_EXTENSION_SIZE
|
|
665
|
+
// and let the loop continue. Up to MAX_CAP_EXTENSIONS, then
|
|
666
|
+
// the wrap-up always fires no matter how healthy things look.
|
|
667
|
+
const fullWindow = recentIterationsHadTools.length === RECENT_HEALTH_WINDOW;
|
|
668
|
+
const allHealthy = fullWindow && recentIterationsHadTools.every(Boolean);
|
|
669
|
+
const noNudges = !todoChurnNudged && !applyEditBatchNudged && !proseLoopNudged
|
|
670
|
+
&& fakeToolResultRetries === 0 && parseRetries === 0;
|
|
671
|
+
const underCeiling = max + CAP_EXTENSION_SIZE <= hardCap;
|
|
672
|
+
const canExtend = allHealthy && noNudges && underCeiling
|
|
673
|
+
&& iterationCapExtensions < MAX_CAP_EXTENSIONS;
|
|
674
|
+
if (canExtend) {
|
|
675
|
+
const prevMax = max;
|
|
676
|
+
max += CAP_EXTENSION_SIZE;
|
|
677
|
+
iterationCapExtensions++;
|
|
678
|
+
emit('tool_loop:iteration_cap_extended', {
|
|
679
|
+
iteration: iterations,
|
|
680
|
+
previousMax: prevMax,
|
|
681
|
+
newMax: max,
|
|
682
|
+
extension: iterationCapExtensions,
|
|
683
|
+
hardCap
|
|
684
|
+
});
|
|
685
|
+
// Drop a single-sentence nudge so the model knows the budget
|
|
686
|
+
// grew and tightens up. Without this it might keep its
|
|
687
|
+
// current pace and burn the extension too.
|
|
688
|
+
messages.push({
|
|
689
|
+
role: 'user',
|
|
690
|
+
content: `You've been making good progress and the iteration budget has been extended by ${CAP_EXTENSION_SIZE} (new limit: ${max}). Keep going, but tighten up: prefer batched edits over single-line ones, and start wrapping up when you have a complete answer rather than running to the new cap. This is the ${iterationCapExtensions === 1 ? 'first' : 'second'} of at most ${MAX_CAP_EXTENSIONS} extensions for this turn.`
|
|
691
|
+
});
|
|
692
|
+
}
|
|
693
|
+
else {
|
|
694
|
+
hitLimit = true;
|
|
695
|
+
// Step-budget exhaustion prompt. Three-section structure forces
|
|
696
|
+
// honest accounting; the goal recall block above stops models
|
|
697
|
+
// from inventing what the goal was. Template choice (analysis
|
|
698
|
+
// vs edit) reflects what the user actually asked for.
|
|
699
|
+
messages.push({
|
|
700
|
+
role: 'user',
|
|
701
|
+
content: `${goalRecallBlock}` +
|
|
702
|
+
`You have reached the tool-use iteration limit (${max}). Stop calling tools. Produce a final answer with three short sections, in this exact shape:\n` +
|
|
703
|
+
'\n' +
|
|
704
|
+
wrapUpBody +
|
|
705
|
+
'\n' +
|
|
706
|
+
'No tool calls. No "I will continue" promises. Close the turn.'
|
|
707
|
+
});
|
|
708
|
+
}
|
|
709
|
+
}
|
|
710
|
+
if (totalToolsExecuted >= maxTotalTools && !hitLimit) {
|
|
711
|
+
hitLimit = true;
|
|
712
|
+
emit('tool_loop:total_tool_cap', { iteration: iterations, totalToolsExecuted });
|
|
713
|
+
messages.push({
|
|
714
|
+
role: 'user',
|
|
715
|
+
content: `${goalRecallBlock}` +
|
|
716
|
+
`You have executed ${totalToolsExecuted} tool calls this turn — the per-turn cap (${maxTotalTools}) has been reached. Stop calling tools. Produce a final answer with three short sections:\n` +
|
|
717
|
+
'\n' +
|
|
718
|
+
wrapUpBody +
|
|
719
|
+
'\n' +
|
|
720
|
+
'No more tool calls. Close the turn.'
|
|
721
|
+
});
|
|
722
|
+
}
|
|
723
|
+
// Compact accumulated tool-result history before sending to the
|
|
724
|
+
// provider. On small/medium models this is what keeps long agent
|
|
725
|
+
// turns (6+ iterations on a real codebase) from overflowing
|
|
726
|
+
// num_ctx — when older tool results have grown past the budget
|
|
727
|
+
// they get collapsed to one-line "[earlier run, N lines elided]"
|
|
728
|
+
// placeholders. The model still sees enough to avoid re-reading
|
|
729
|
+
// files it already read. Aggressive-threshold rationale + the
|
|
730
|
+
// why-trace live in loop/compactionTrigger.ts.
|
|
731
|
+
const { aggressive: aggressiveCompactionThisIteration } = (0, compactionTrigger_1.applyCompactionIfNeeded)({
|
|
732
|
+
messages,
|
|
733
|
+
tokenBudget: effectiveOptions.messageTokenBudget,
|
|
734
|
+
emit,
|
|
735
|
+
iteration: iterations
|
|
736
|
+
});
|
|
737
|
+
// Goal anchor — re-inject the original user goal when the loop is
|
|
738
|
+
// at risk of drifting (recency bias on long tool-result chains;
|
|
739
|
+
// multi-turn pivot after compaction). Eligibility, refire gap,
|
|
740
|
+
// and the aggressive-compaction override are pinned in
|
|
741
|
+
// loop/goalAnchor.ts.
|
|
742
|
+
({ lastGoalAnchorIteration } = (0, goalAnchor_1.applyGoalAnchorIfNeeded)({
|
|
743
|
+
originalGoal,
|
|
744
|
+
priorUserPromptCount,
|
|
745
|
+
hitLimit,
|
|
746
|
+
iteration: iterations,
|
|
747
|
+
lastGoalAnchorIteration,
|
|
748
|
+
aggressiveCompactionThisIteration,
|
|
749
|
+
messages,
|
|
750
|
+
registry: this.registry,
|
|
751
|
+
emit
|
|
752
|
+
}));
|
|
753
|
+
// Stream and aggregate the model response.
|
|
754
|
+
// Telemetry: capture total prompt size sent to the
|
|
755
|
+
// model. Subagent stalls were hard to diagnose because we
|
|
756
|
+
// couldn't tell if the prompt was 5KB (normal) or 50KB+ (would
|
|
757
|
+
// explain prompt-processing latency). Now both are visible.
|
|
758
|
+
const callOptions = nextCallThinkOverride !== undefined ? { think: nextCallThinkOverride } : undefined;
|
|
759
|
+
// Per-call think override is single-shot — clear immediately after
|
|
760
|
+
// building the options bag so subsequent iterations revert to the
|
|
761
|
+
// chat function's closure-captured default.
|
|
762
|
+
nextCallThinkOverride = undefined;
|
|
763
|
+
let llmStartedAt = Date.now();
|
|
764
|
+
let response = '';
|
|
765
|
+
// Drain externally-pushed messages BEFORE each LLM call. Host
|
|
766
|
+
// subscribes its backgroundStore (or other async event source)
|
|
767
|
+
// and pushes into a local queue; this callback returns the
|
|
768
|
+
// pending entries which the loop appends to the conversation.
|
|
769
|
+
// Net effect: parent loop sees subagent completions the moment
|
|
770
|
+
// they arrive instead of poll-spinning on check_task. See the
|
|
771
|
+
// ToolUseLoopOptions doc for the motivating use case.
|
|
772
|
+
const externals = effectiveOptions.drainExternalMessages?.() ?? [];
|
|
773
|
+
for (const ext of externals) {
|
|
774
|
+
if (ext && typeof ext.content === 'string' && ext.content.length > 0) {
|
|
775
|
+
messages.push(ext);
|
|
776
|
+
emit('tool_loop:external_inject', {
|
|
777
|
+
iteration: iterations,
|
|
778
|
+
role: ext.role,
|
|
779
|
+
chars: ext.content.length
|
|
780
|
+
});
|
|
781
|
+
}
|
|
782
|
+
}
|
|
783
|
+
while (true) {
|
|
784
|
+
emit('tool_loop:llm_start', {
|
|
785
|
+
iteration: iterations,
|
|
786
|
+
messageCount: messages.length,
|
|
787
|
+
promptCharsTotal: messages.reduce((sum, m) => sum + (m.content?.length ?? 0), 0),
|
|
788
|
+
systemPromptChars: messages
|
|
789
|
+
.filter((m) => m.role === 'system')
|
|
790
|
+
.reduce((sum, m) => sum + (m.content?.length ?? 0), 0),
|
|
791
|
+
thinkOverride: callOptions?.think
|
|
792
|
+
});
|
|
793
|
+
llmStartedAt = Date.now();
|
|
794
|
+
try {
|
|
795
|
+
response = await (0, llmStream_1.streamAndAggregate)({
|
|
796
|
+
chat,
|
|
797
|
+
messages,
|
|
798
|
+
emit,
|
|
799
|
+
iteration: iterations,
|
|
800
|
+
tools: nativeSchemas,
|
|
801
|
+
signal,
|
|
802
|
+
callOptions
|
|
803
|
+
});
|
|
804
|
+
if (pendingPrefillPrefix) {
|
|
805
|
+
// Ollama's chat API treats a trailing assistant message as a
|
|
806
|
+
// prefill — the model continues from where its content ends.
|
|
807
|
+
// The streamed response contains only the new tokens, so glue
|
|
808
|
+
// the prefix back on so downstream parsing sees a complete
|
|
809
|
+
// <tool_call> envelope.
|
|
810
|
+
response = pendingPrefillPrefix + response;
|
|
811
|
+
pendingPrefillPrefix = null;
|
|
812
|
+
}
|
|
813
|
+
break;
|
|
814
|
+
}
|
|
815
|
+
catch (error) {
|
|
816
|
+
if (nativeTools && nativeToolFailureFallback && !nativeFallbackUsed && isRetryableLlmError(error) && !signal?.aborted) {
|
|
817
|
+
nativeFallbackUsed = true;
|
|
818
|
+
nativeTools = false;
|
|
819
|
+
nativeSchemas = undefined;
|
|
820
|
+
const fallbackPrompt = buildFullSystemPrompt(false);
|
|
821
|
+
if (fallbackPrompt) {
|
|
822
|
+
if (messages[0]?.role === 'system') {
|
|
823
|
+
messages[0] = { role: 'system', content: fallbackPrompt };
|
|
824
|
+
}
|
|
825
|
+
else {
|
|
826
|
+
messages.unshift({ role: 'system', content: fallbackPrompt });
|
|
827
|
+
}
|
|
828
|
+
}
|
|
829
|
+
// v1.7.299 right-way fix: push a synthetic user message so
|
|
830
|
+
// the NEXT LLM call sees explicit guidance that the tool
|
|
831
|
+
// channel changed. The system-prompt swap alone is not
|
|
832
|
+
// enough — long-context models often anchor on the latest
|
|
833
|
+
// user turn for "what tool envelope should I use," and
|
|
834
|
+
// without this signal they keep emitting the prior
|
|
835
|
+
// native-tools shape into the void. Mark trace 2026-05-26:
|
|
836
|
+
// after a bandit-cloud 500 triggered native→text fallback
|
|
837
|
+
// mid-turn, the model continued emitting native-style
|
|
838
|
+
// payloads for 3+ iterations before finally producing
|
|
839
|
+
// visible markup.
|
|
840
|
+
messages.push({
|
|
841
|
+
role: 'user',
|
|
842
|
+
content: `[Provider error mid-turn — tool channel switched.] The previous attempt failed with: ${summarizeLlmError(error)}. ` +
|
|
843
|
+
`I retried with the text-based tool-call channel. ` +
|
|
844
|
+
`Re-emit your pending action using the text envelope: ` +
|
|
845
|
+
`<tool_call>{"name":"...","params":{...}}</tool_call> outside of any reasoning block. ` +
|
|
846
|
+
`Native-function-call payloads from your previous attempt were discarded — they're not visible to me. ` +
|
|
847
|
+
`If your last intended action is unclear, briefly state what you were trying to do and then emit the tool_call.`
|
|
848
|
+
});
|
|
849
|
+
emit('tool_loop:native_tool_fallback', {
|
|
850
|
+
iteration: iterations,
|
|
851
|
+
reason: summarizeLlmError(error)
|
|
852
|
+
});
|
|
853
|
+
continue;
|
|
854
|
+
}
|
|
855
|
+
// One-shot outer-layer retry on the text channel. Only fires
|
|
856
|
+
// when the channel switch has already happened (we're on text
|
|
857
|
+
// now) AND the failure is retryable AND we haven't already used
|
|
858
|
+
// this slot this turn. Larger backoff than the inner layer
|
|
859
|
+
// because by this point we've spent ~5-10s on the native
|
|
860
|
+
// attempts; the server probably needs longer to recover. After
|
|
861
|
+
// this attempt, any further failure on text is genuinely
|
|
862
|
+
// terminal — the user has been waiting > 30 s and a clean
|
|
863
|
+
// error is more helpful than another silent retry.
|
|
864
|
+
if (nativeFallbackUsed && !textFallbackRetryUsed && isRetryableLlmError(error) && !signal?.aborted) {
|
|
865
|
+
textFallbackRetryUsed = true;
|
|
866
|
+
emit('tool_loop:text_fallback_retry', {
|
|
867
|
+
iteration: iterations,
|
|
868
|
+
reason: summarizeLlmError(error)
|
|
869
|
+
});
|
|
870
|
+
await sleep(2400);
|
|
871
|
+
continue;
|
|
872
|
+
}
|
|
873
|
+
// Last-resort final-anchor retry. By this point we've spent
|
|
874
|
+
// every same-channel and cross-channel retry slot, and the
|
|
875
|
+
// conversation may contain partial tool_call deltas or
|
|
876
|
+
// half-emitted reasoning blocks that the model keeps anchoring
|
|
877
|
+
// on. Push a clean recovery message that restates the original
|
|
878
|
+
// goal and gives the model an explicit fresh-start framing,
|
|
879
|
+
// then retry once more. Only fires when an originalGoal is
|
|
880
|
+
// present (no point re-anchoring an empty turn) and the user
|
|
881
|
+
// hasn't aborted. After this attempt the failure is genuinely
|
|
882
|
+
// terminal — we've tried 12+ chat invocations across two
|
|
883
|
+
// channels with three distinct framings.
|
|
884
|
+
if (!finalAnchorRetryUsed
|
|
885
|
+
&& textFallbackRetryUsed
|
|
886
|
+
&& originalGoal.trim().length > 0
|
|
887
|
+
&& isRetryableLlmError(error)
|
|
888
|
+
&& !signal?.aborted) {
|
|
889
|
+
finalAnchorRetryUsed = true;
|
|
890
|
+
messages.push({
|
|
891
|
+
role: 'user',
|
|
892
|
+
content: `[Recovery attempt — previous channel attempts hit ${summarizeLlmError(error)}. ` +
|
|
893
|
+
`Discarding any partial tool_call or reasoning state from those attempts. ` +
|
|
894
|
+
`Original user goal restated as a fresh anchor:]\n\n${originalGoal.trim()}`
|
|
895
|
+
});
|
|
896
|
+
emit('tool_loop:final_anchor_retry', {
|
|
897
|
+
iteration: iterations,
|
|
898
|
+
reason: summarizeLlmError(error),
|
|
899
|
+
goalPreview: originalGoal.slice(0, 120)
|
|
900
|
+
});
|
|
901
|
+
await sleep(3600);
|
|
902
|
+
continue;
|
|
903
|
+
}
|
|
904
|
+
throw error;
|
|
905
|
+
}
|
|
906
|
+
}
|
|
907
|
+
// Diagnostic preview: 2000 chars + flags so we can tell apart "model
|
|
908
|
+
// emitted tool markup that the parser missed" from "model genuinely
|
|
909
|
+
// never emitted markup." 200 chars was too short to see past a
|
|
910
|
+
// typical reasoning fence (subagent traces 2026-05-08 captured only
|
|
911
|
+
// the fence opener and we couldn't tell if a tool call followed).
|
|
912
|
+
emit('tool_loop:llm_response', {
|
|
913
|
+
iteration: iterations,
|
|
914
|
+
response: response.slice(0, 2000),
|
|
915
|
+
responseLength: response.length,
|
|
916
|
+
hasToolCallMarkup: response.includes('<tool_call>') || /```\s*tool_call\b/.test(response),
|
|
917
|
+
endsWithFenceClose: /```\s*$/.test(response.trimEnd()),
|
|
918
|
+
llmDurationMs: Date.now() - llmStartedAt
|
|
919
|
+
});
|
|
920
|
+
if (signal?.aborted) {
|
|
921
|
+
emit('tool_loop:cancelled', { iteration: iterations, stage: 'post_stream' });
|
|
922
|
+
return buildCancelledResult(messages, iterations, response);
|
|
923
|
+
}
|
|
924
|
+
// Turn-level hard cap on no-tool-call responses. The individual
|
|
925
|
+
// detectors below (fake-tool-result, false-tool-absence,
|
|
926
|
+
// tool-error recovery, empty-retry, narrate-no-action,
|
|
927
|
+
// thinking-off recovery, parse-retry, prose-loop, etc.) each
|
|
928
|
+
// have their own caps, but they chain — thinking-off recovery
|
|
929
|
+
// resets consecutiveEmptyRetries=0, parse-retry has its own
|
|
930
|
+
// counter, and the model can move between failure modes faster
|
|
931
|
+
// than any one detector can give up. Mark Portfolio session
|
|
932
|
+
// 2026-05-26 turn-02-30-37: 6 sequential reasoning-only
|
|
933
|
+
// responses inside one iteration before the loop terminated
|
|
934
|
+
// silently. This counter increments on EVERY response without
|
|
935
|
+
// a tool_call and resets only when one lands; once it reaches the cap we
|
|
936
|
+
// force-terminate with a final answer that names the stuck
|
|
937
|
+
// state instead of letting the model spin.
|
|
938
|
+
//
|
|
939
|
+
// Placed BEFORE the per-detector branches so the cap takes
|
|
940
|
+
// precedence — detectors can still nudge once each below this
|
|
941
|
+
// line, but once we've hit the cap they don't run.
|
|
942
|
+
if (!hitLimit && !(0, tool_use_parser_1.hasToolCalls)(response)) {
|
|
943
|
+
noToolCallAttemptsThisTurn++;
|
|
944
|
+
if (noToolCallAttemptsThisTurn >= NO_TOOL_CALL_HARD_CAP) {
|
|
945
|
+
emit('tool_loop:no_tool_call_hard_cap', {
|
|
946
|
+
iteration: iterations,
|
|
947
|
+
attempts: noToolCallAttemptsThisTurn,
|
|
948
|
+
responsePreview: response.slice(0, 200)
|
|
949
|
+
});
|
|
950
|
+
const finalStripped = (0, tool_use_parser_1.stripToolCallMarkup)(response).trim();
|
|
951
|
+
const goalHint = originalGoal
|
|
952
|
+
? `\n\nGoal you asked me to handle: "${originalGoal.trim().slice(0, 200)}"`
|
|
953
|
+
: '';
|
|
954
|
+
const stuckAnswer = `I got stuck — emitted ${noToolCallAttemptsThisTurn} responses in a row without successfully invoking a tool, ` +
|
|
955
|
+
`so I'm stopping the turn before it wastes more time. ` +
|
|
956
|
+
`Most recent reasoning was:\n\n${finalStripped.slice(0, 600) || '(empty)'}` +
|
|
957
|
+
`${goalHint}\n\n` +
|
|
958
|
+
`Suggested next steps:\n` +
|
|
959
|
+
` - Re-ask with a narrower scope (one file or one concrete change)\n` +
|
|
960
|
+
` - Try \`/new\` to start fresh if the context is muddled\n` +
|
|
961
|
+
` - If you saw a tool error earlier in this turn, paste it back and I'll pick a different tool`;
|
|
962
|
+
return { messages, iterations, hitLimit, finalResponse: stuckAnswer };
|
|
963
|
+
}
|
|
964
|
+
}
|
|
965
|
+
else if ((0, tool_use_parser_1.hasToolCalls)(response)) {
|
|
966
|
+
// A real tool_call landed — reset the cap counter so a later
|
|
967
|
+
// unrelated stall in the same turn gets its own full budget.
|
|
968
|
+
noToolCallAttemptsThisTurn = 0;
|
|
969
|
+
// Also reset the prefill-recovery one-shot. The recovery budget
|
|
970
|
+
// is "per stretch of failures," not "once per turn" — without
|
|
971
|
+
// this reset, a long refactor that recovers from one prefill
|
|
972
|
+
// stall and then hits another (Mark, gregoryhite-site
|
|
973
|
+
// 2026-06-02T23-56-38: 26 iterations, prefill burned at iter 25,
|
|
974
|
+
// iter 26 stalled again with no recovery left) falls straight
|
|
975
|
+
// through to the terminal "Bandit stalled" fallback even though
|
|
976
|
+
// every other detector still has budget. The hard cap on
|
|
977
|
+
// noToolCallAttemptsThisTurn (5) bounds the total stuck
|
|
978
|
+
// responses per stretch, so this can't infinite-loop.
|
|
979
|
+
prefillRecoveryAttempted = false;
|
|
980
|
+
}
|
|
981
|
+
// Protocol guard: Gemma-family models (all sizes, including
|
|
982
|
+
// bandit-core-1 31B) sometimes helpfully "complete" the
|
|
983
|
+
// tool-call / tool-result pattern by emitting a fake
|
|
984
|
+
// `<tool_result>` envelope in their OWN response — template
|
|
985
|
+
// completion from training rather than real tool invocation.
|
|
986
|
+
// The downstream effect is the model reports "edits applied"
|
|
987
|
+
// when nothing was actually written. Detect the fake envelope,
|
|
988
|
+
// strip it, and re-inject a corrective user message so the
|
|
989
|
+
// model retries with a proper `<tool_call>` or produces a
|
|
990
|
+
// plain-prose final answer. One iteration budget — avoids loops
|
|
991
|
+
// if the model ignores the correction.
|
|
992
|
+
const FAKE_TOOL_RESULT_RE = /<tool_result\b[\s\S]*?<\/tool_result\s*>|<tool_result\b[^<]*$/i;
|
|
993
|
+
if (!hitLimit && FAKE_TOOL_RESULT_RE.test(response) && fakeToolResultRetries < FAKE_TOOL_RESULT_CAP) {
|
|
994
|
+
fakeToolResultRetries++;
|
|
995
|
+
emit('tool_loop:fake_tool_result_detected', {
|
|
996
|
+
iteration: iterations,
|
|
997
|
+
preview: response.slice(0, 200)
|
|
998
|
+
});
|
|
999
|
+
const scrubbed = response.replace(/<tool_result\b[\s\S]*?<\/tool_result\s*>/gi, '').replace(/<tool_result\b[^<]*$/i, '').trim();
|
|
1000
|
+
// Record the assistant turn using only the scrubbed
|
|
1001
|
+
// version so the model doesn't see its own hallucination in
|
|
1002
|
+
// the next turn's context (which would reinforce the pattern).
|
|
1003
|
+
messages.push({ role: 'assistant', content: scrubbed });
|
|
1004
|
+
messages.push({
|
|
1005
|
+
role: 'user',
|
|
1006
|
+
content: 'You emitted a `<tool_result>` envelope in your response. Those envelopes are SYSTEM output — they appear BETWEEN your turns, never inside your own message. If you meant to invoke a tool, emit a single `<tool_call>{"name":"...","params":{...}}</tool_call>` and wait for the real result. If the task is complete, give a plain-prose final answer with no XML envelopes. Retry now.'
|
|
1007
|
+
});
|
|
1008
|
+
continue;
|
|
1009
|
+
}
|
|
1010
|
+
// Fake tool-log fence detector. Some small/mid models hallucinate
|
|
1011
|
+
// ```bandit-tl / bandit-run / bandit-subagent fenced JSON cards
|
|
1012
|
+
// in prose to PRETEND they ran tools — the host's real tool-log
|
|
1013
|
+
// shape they've seen in conversation history. We strip the fake
|
|
1014
|
+
// fences and nudge with a hard-line "no claims of completion
|
|
1015
|
+
// without a real tool_call" message. Detector fires only when
|
|
1016
|
+
// the response has NO real `<tool_call>` markup, so models
|
|
1017
|
+
// legitimately quoting a tool-log card in explanatory prose
|
|
1018
|
+
// don't false-positive.
|
|
1019
|
+
const FAKE_BANDIT_TL_RE = /```bandit-(?:tl|run|subagent)\b[\s\S]*?```/gi;
|
|
1020
|
+
const FAKE_BANDIT_TL_LOOSE_RE = /```bandit-(?:tl|run|subagent)\b[\s\S]*$/i;
|
|
1021
|
+
const hasFakeBanditCard = FAKE_BANDIT_TL_RE.test(response) || FAKE_BANDIT_TL_LOOSE_RE.test(response);
|
|
1022
|
+
const hasRealToolCall = /<tool_call\b/i.test(response);
|
|
1023
|
+
if (!hitLimit && hasFakeBanditCard && !hasRealToolCall && fakeToolResultRetries < FAKE_TOOL_RESULT_CAP) {
|
|
1024
|
+
fakeToolResultRetries++;
|
|
1025
|
+
emit('tool_loop:fake_tool_result_detected', {
|
|
1026
|
+
iteration: iterations,
|
|
1027
|
+
preview: response.slice(0, 200),
|
|
1028
|
+
shape: 'bandit-tl'
|
|
1029
|
+
});
|
|
1030
|
+
const scrubbed = response
|
|
1031
|
+
.replace(/```bandit-(?:tl|run|subagent)\b[\s\S]*?```/gi, '')
|
|
1032
|
+
.replace(/```bandit-(?:tl|run|subagent)\b[\s\S]*$/i, '')
|
|
1033
|
+
.trim();
|
|
1034
|
+
messages.push({ role: 'assistant', content: scrubbed });
|
|
1035
|
+
messages.push({
|
|
1036
|
+
role: 'user',
|
|
1037
|
+
content: 'You emitted ` ```bandit-tl` (or `bandit-run` / `bandit-subagent`) fenced JSON in your response. Those fences are emitted by the EXTENSION HOST to log real tool execution — you CANNOT produce them. They show up in your context because the host logged actual tool calls, not because you can fabricate them. To actually run a tool, emit `<tool_call>{"name":"...","params":{...}}</tool_call>` and wait for the real result. Your fake fences mean NO work has happened this turn. You have TWO options for your retry, and ONLY two: (a) Emit a real `<tool_call>{"name":"...","params":{...}}</tool_call>` envelope NOW to actually do the work, then wait for the real result. (b) Honestly state "I have not [action] yet" and STOP. Do NOT claim completion. You MUST NOT claim you have fixed / eliminated / resolved / removed / cleaned / verified anything. No "successfully [verb]" phrasing. No numbered lists of "Step 1: I did X" actions. No "the project is now in a healthy state." Until a real `<tool_call>` lands on disk and returns a real tool-result, nothing has changed. Lying about completion is the worst failure mode. Retry now.'
|
|
1038
|
+
});
|
|
1039
|
+
continue;
|
|
1040
|
+
}
|
|
1041
|
+
// False-tool-absence detector. Model sometimes claims a tool
|
|
1042
|
+
// "is not available" / "I don't have access to X" — even when the
|
|
1043
|
+
// tool IS in the registry and was sent in this very turn's
|
|
1044
|
+
// native-tools schema. Usually triggered by an earlier error
|
|
1045
|
+
// ("Expected object, received string", "tool 'X' not registered")
|
|
1046
|
+
// surviving into compacted history while the success path didn't,
|
|
1047
|
+
// or by raw hallucination on small/mid models. Reset is a
|
|
1048
|
+
// band-aid; correct the claim inline so the user can keep going.
|
|
1049
|
+
//
|
|
1050
|
+
// Detector fires only when (a) the response has no tool_call,
|
|
1051
|
+
// (b) the absence phrase appears, (c) the named tool IS registered.
|
|
1052
|
+
// The registry-membership check is what gates the nudge — without
|
|
1053
|
+
// it we'd false-positive on legitimate "I can't do that" responses
|
|
1054
|
+
// about capabilities the agent genuinely doesn't have.
|
|
1055
|
+
if (!hitLimit
|
|
1056
|
+
&& !(0, tool_use_parser_1.hasToolCalls)(response)
|
|
1057
|
+
&& toolAbsenceCorrectionsFired < TOOL_ABSENCE_CORRECTION_CAP) {
|
|
1058
|
+
const registeredNames = this.registry.getAll().map((t) => t.name);
|
|
1059
|
+
const absence = (0, toolAvailabilityDetector_1.detectFalseToolAbsence)(response, registeredNames);
|
|
1060
|
+
if (absence.detected) {
|
|
1061
|
+
toolAbsenceCorrectionsFired++;
|
|
1062
|
+
emit('tool_loop:false_tool_absence', {
|
|
1063
|
+
iteration: iterations,
|
|
1064
|
+
matched: absence.matchedToolNames,
|
|
1065
|
+
suggested: absence.suggestedTools,
|
|
1066
|
+
responsePreview: response.slice(0, 200)
|
|
1067
|
+
});
|
|
1068
|
+
messages.push({ role: 'assistant', content: response });
|
|
1069
|
+
messages.push({ role: 'user', content: (0, toolAvailabilityDetector_1.buildToolAvailabilityNudge)(absence) });
|
|
1070
|
+
continue;
|
|
1071
|
+
}
|
|
1072
|
+
}
|
|
1073
|
+
// Tool-error recovery. When the previous iteration's tool call
|
|
1074
|
+
// returned isError:true and THIS iteration produced no tool_call,
|
|
1075
|
+
// the model is silently abandoning the request. Push a one-shot
|
|
1076
|
+
// nudge: retry with corrected params OR explicitly state which
|
|
1077
|
+
// precondition failed. Without this the agent drops the task and
|
|
1078
|
+
// the user has to manually say "continue."
|
|
1079
|
+
if (!hitLimit
|
|
1080
|
+
&& !(0, tool_use_parser_1.hasToolCalls)(response)
|
|
1081
|
+
&& lastIterationHadToolError
|
|
1082
|
+
&& toolErrorRecoveryFired < TOOL_ERROR_RECOVERY_CAP) {
|
|
1083
|
+
toolErrorRecoveryFired++;
|
|
1084
|
+
emit('tool_loop:tool_error_recovery', {
|
|
1085
|
+
iteration: iterations,
|
|
1086
|
+
responsePreview: response.slice(0, 200)
|
|
1087
|
+
});
|
|
1088
|
+
messages.push({ role: 'assistant', content: response });
|
|
1089
|
+
messages.push({
|
|
1090
|
+
role: 'user',
|
|
1091
|
+
content: 'The previous tool call returned an error and you produced no follow-up tool_call. ' +
|
|
1092
|
+
'Do NOT silently abandon the request — the user expects you to either retry with corrected parameters OR state explicitly which precondition failed and why you cannot proceed. ' +
|
|
1093
|
+
'Choose one: (a) emit a corrected `<tool_call>{"name":"...","params":{...}}</tool_call>` now, fixing the param shape or value the error pointed at; ' +
|
|
1094
|
+
'(b) give a one-line final answer naming the exact precondition you lack (e.g. "I cannot trash message X because the message id is unknown — please provide it"). ' +
|
|
1095
|
+
'Do not pretend the error did not happen and do not continue with unrelated work.'
|
|
1096
|
+
});
|
|
1097
|
+
continue;
|
|
1098
|
+
}
|
|
1099
|
+
messages.push({ role: 'assistant', content: response });
|
|
1100
|
+
// Small models sometimes stall with an empty response after a tool
|
|
1101
|
+
// result. Give them one polite nudge before giving up — almost always
|
|
1102
|
+
// enough for gemma4:e4b / qwen 7B to produce a real answer.
|
|
1103
|
+
//
|
|
1104
|
+
// Reasoning-only responses count as empty here. bandit-logic / Qwen
|
|
1105
|
+
// 3.6 in thinking mode sometimes emits a full <think>…</think> or
|
|
1106
|
+
// ```bandit-reasoning``` block planning out the work and then stops
|
|
1107
|
+
// without emitting an actual tool_call. Visually the user sees a
|
|
1108
|
+
// wall of reasoning text and nothing happens. Strip the reasoning
|
|
1109
|
+
// fences before checking emptiness so the same nudge fires.
|
|
1110
|
+
const stripped = response
|
|
1111
|
+
.replace(/<think\b[\s\S]*?<\/think\s*>/gi, '')
|
|
1112
|
+
.replace(/<think\b[\s\S]*$/i, '')
|
|
1113
|
+
.replace(/```bandit-reasoning\b[\s\S]*?```/gi, '')
|
|
1114
|
+
.replace(/```bandit-reasoning\b[\s\S]*$/i, '')
|
|
1115
|
+
.trim();
|
|
1116
|
+
const reasoningOnly = !stripped && response.trim().length > 0;
|
|
1117
|
+
// "Narrated but didn't act" detector. Some models (notably ones
|
|
1118
|
+
// post-trained for a different tool-call envelope, e.g. OpenAI
|
|
1119
|
+
// harmony) emit reasoning + a prose intent ("I'll search for X.")
|
|
1120
|
+
// without emitting the actual tool_call envelope. We treat that
|
|
1121
|
+
// as a stall and nudge once per turn.
|
|
1122
|
+
//
|
|
1123
|
+
// Verbs are enumerated explicitly (inflections too) — stem-with-
|
|
1124
|
+
// suffix patterns over- or under-match on English irregulars
|
|
1125
|
+
// (doubled-letter "running", silent-e "using", false positives
|
|
1126
|
+
// on "useful"/"reader"). The check is anchored to the TAIL of
|
|
1127
|
+
// the stripped response (last sentence) so the verb has to be
|
|
1128
|
+
// in the model's final clause, not an earlier "I have already
|
|
1129
|
+
// searched the file" preamble before a real answer.
|
|
1130
|
+
//
|
|
1131
|
+
// Captured 2026-05-25 (Mark, Portfolio IDE session): model emitted
|
|
1132
|
+
// "I'll redesign the portfolio... Let me rewrite both files." with
|
|
1133
|
+
// NO tool_call and the turn closed as a final answer because
|
|
1134
|
+
// neither `redesign` nor `rewrite` was on the list. A long
|
|
1135
|
+
// session ended with zero work shipped. Missing a verb here =
|
|
1136
|
+
// silent stall = user has to re-prompt manually. Cheap to add.
|
|
1137
|
+
const NARRATE_VERB_RE = /\b(use|uses|used|using|call|calls|called|calling|invoke|invokes|invoked|invoking|execute|executes|executed|executing|run|runs|running|ran|search|searches|searched|searching|look|looks|looked|looking|read|reads|reading|check|checks|checked|checking|find|finds|finding|found|list|lists|listed|listing|fetch|fetches|fetched|fetching|grep|greps|grepped|grepping|explore|explores|explored|exploring|locate|locates|located|locating|plan|plans|planned|planning|start|starts|started|starting|begin|begins|began|beginning|create|creates|created|creating|write|writes|wrote|writing|rewrite|rewrites|rewrote|rewriting|rewritten|build|builds|built|building|rebuild|rebuilds|rebuilt|rebuilding|update|updates|updated|updating|implement|implements|implemented|implementing|refactor|refactors|refactored|refactoring|redesign|redesigns|redesigned|redesigning|design|designs|designed|designing|generate|generates|generated|generating|scaffold|scaffolds|scaffolded|scaffolding|set\s+up|setting\s+up|tackle|tackles|tackled|tackling|do|does|did|doing|make|makes|made|making|batch|batches|batched|batching|execute|prepare|prepares|prepared|preparing|draft|drafts|drafted|drafting|outline|outlines|outlined|outlining|organize|organizes|organized|organizing|structure|structures|structured|structuring|kick\s+off|kicking\s+off|fix|fixes|fixed|fixing|edit|edits|edited|editing|modify|modifies|modified|modifying|patch|patches|patched|patching|adjust|adjusts|adjusted|adjusting|replace|replaces|replaced|replacing|swap|swaps|swapped|swapping|polish|polishes|polished|polishing|clean\s+up|cleaning\s+up|tidy|tidies|tidied|tidying|finalize|finalizes|finalized|finalizing|finish|finishes|finished|finishing|complete|completes|completed|completing|wire|wires|wired|wiring|hook|hooks|hooked|hooking|render|renders|rendered|rendering|style|styles|styled|styling|theme|themes|themed|theming|redo|redoes|redid|redoing|port|ports|ported|porting|migrate|migrates|migrated|migrating|configure|configures|configu
red|configuring|install|installs|installed|installing|remove|removes|removed|removing|delete|deletes|deleted|deleting|rename|renames|renamed|renaming)\b/i;
|
|
1138
|
+
const NARRATE_INTENT_RE = /\b(we (?:will|need to|should)|we'?ll|we'?re going to|i'?ll|i will|let me|let'?s|going to|i'?m going to|i need to)\b/i;
|
|
1139
|
+
// Real code fences pass through; narrate only fires when the
|
|
1140
|
+
// model emitted no structured payload at all. Check the STRIPPED
|
|
1141
|
+
// response, not the raw one — `bandit-reasoning` fences are
|
|
1142
|
+
// reasoning, not structured output.
|
|
1143
|
+
const hasCodeFence = /```[a-zA-Z0-9_-]*\s*\n/.test(stripped);
|
|
1144
|
+
const tailMatch = stripped.match(/(?:[.!?]\s+)([^.!?]*)$/);
|
|
1145
|
+
const tail = (tailMatch ? tailMatch[1] : stripped).slice(-200);
|
|
1146
|
+
const narratedButNoAction = !(0, tool_use_parser_1.hasToolCalls)(response) &&
|
|
1147
|
+
!hasCodeFence &&
|
|
1148
|
+
stripped.length > 0 &&
|
|
1149
|
+
stripped.length < 240 &&
|
|
1150
|
+
NARRATE_INTENT_RE.test(tail) &&
|
|
1151
|
+
NARRATE_VERB_RE.test(tail);
|
|
1152
|
+
// Empty-response retry: was previously gated to `iterations > 0`
|
|
1153
|
+
// under the assumption "empty first response = provider outage."
|
|
1154
|
+
// That assumption was wrong — with bandit-logic
|
|
1155
|
+
// (cloud) on multi-message email-fetch turns: iteration 0 streams
|
|
1156
|
+
// completely empty (no reasoning text, no narrate prose, just zero
|
|
1157
|
+
// tokens), the loop falls straight through, and the user gets the
|
|
1158
|
+
// stall fallback instantly. Same model later in the same session
|
|
1159
|
+
// worked fine. Empty on iteration 0 is now allowed to nudge so
|
|
1160
|
+
// the model gets a second chance (and the thinking-off recovery
|
|
1161
|
+
// below can flip it to non-thinking mode if the second pass also
|
|
1162
|
+
// empties).
|
|
1163
|
+
const shouldNudge = (!response.trim() || reasoningOnly || narratedButNoAction) &&
|
|
1164
|
+
!hitLimit &&
|
|
1165
|
+
consecutiveEmptyRetries < 2 &&
|
|
1166
|
+
!thinkingOffRecoveryAttempted;
|
|
1167
|
+
if (shouldNudge) {
|
|
1168
|
+
consecutiveEmptyRetries++;
|
|
1169
|
+
emit('tool_loop:empty_retry', {
|
|
1170
|
+
iteration: iterations,
|
|
1171
|
+
attempt: consecutiveEmptyRetries,
|
|
1172
|
+
reasoningOnly,
|
|
1173
|
+
narratedButNoAction
|
|
1174
|
+
});
|
|
1175
|
+
const nudgeMessage = narratedButNoAction
|
|
1176
|
+
? 'You announced your next step in prose ("we will search…" / "let me check…" / "use X to find Y") but did NOT emit a `<tool_call>` envelope. Announcing intent is not enough — you must actually invoke the tool. Emit the call now in this exact format, OUTSIDE of any reasoning block, with NO commentary and NO markdown fence:\n\n<tool_call>{"name":"<tool>","params":{"<key>":"<value>"}}</tool_call>\n\nReplace name/params with the right values for your task. Or, if the task is already answerable from what you know, give a final answer instead.'
|
|
1177
|
+
: reasoningOnly
|
|
1178
|
+
? 'You completed reasoning but emitted no tool_call AND no final answer. The reasoning text alone does not run a tool — you must emit a `<tool_call>` envelope OUTSIDE the reasoning block. Format example (replace name/params for your task):\n\n<tool_call>{"name":"<tool>","params":{"<key>":"<value>"}}</tool_call>\n\nNo prose around it, no markdown fence, just the bare tag. If the task is answerable without a tool, write a complete final answer instead. Do not stop after only thinking.'
|
|
1179
|
+
: 'Your previous response was empty. Either emit a `<tool_call>{"name":"<tool>","params":{...}}</tool_call>` to invoke a tool, OR produce a complete final answer using what you have. Do not respond with an empty message.';
|
|
1180
|
+
messages.push({
|
|
1181
|
+
role: 'user',
|
|
1182
|
+
content: nudgeMessage
|
|
1183
|
+
});
|
|
1184
|
+
continue;
|
|
1185
|
+
}
|
|
1186
|
+
// Cap reached on a reasoning-only OR completely-empty stall: try
|
|
1187
|
+
// ONE more round with thinking forced off. This is the single-shot
|
|
1188
|
+
// "thinking-off recovery" — see comment on
|
|
1189
|
+
// `thinkingOffRecoveryAttempted` above. If the model produces a
|
|
1190
|
+
// tool_call this time, great. If it still stalls, we fall through
|
|
1191
|
+
// and the loop terminates normally with the final response shown
|
|
1192
|
+
// to the user.
|
|
1193
|
+
//
|
|
1194
|
+
// Threshold lowered from 2 to 1 AND extended to cover empty
|
|
1195
|
+
// responses (2026-05-03): bandit-logic via the cloud gateway
|
|
1196
|
+
// sometimes streams an entirely empty response on iteration 0
|
|
1197
|
+
// (not reasoning-only — zero tokens). Same prompt later in the
|
|
1198
|
+
// same session works fine. Force thinking-off after a single
|
|
1199
|
+
// empty/reasoning-only retry so the second attempt skips the
|
|
1200
|
+
// thinking channel entirely.
|
|
1201
|
+
const stallShape = reasoningOnly || !response.trim();
|
|
1202
|
+
if (!hitLimit
|
|
1203
|
+
&& stallShape
|
|
1204
|
+
&& consecutiveEmptyRetries >= 1
|
|
1205
|
+
&& !thinkingOffRecoveryAttempted) {
|
|
1206
|
+
thinkingOffRecoveryAttempted = true;
|
|
1207
|
+
consecutiveEmptyRetries = 0;
|
|
1208
|
+
nextCallThinkOverride = false;
|
|
1209
|
+
emit('tool_loop:thinking_off_recovery', {
|
|
1210
|
+
iteration: iterations,
|
|
1211
|
+
reason: 'reasoning_only_cap_exhausted'
|
|
1212
|
+
});
|
|
1213
|
+
messages.push({
|
|
1214
|
+
role: 'user',
|
|
1215
|
+
content: 'Switching to non-thinking mode for this attempt because reasoning-only retries exhausted. Emit either a tool_call or a complete final answer. No more reasoning preamble.'
|
|
1216
|
+
});
|
|
1217
|
+
continue;
|
|
1218
|
+
}
|
|
1219
|
+
// Final-shot prefill recovery for the qwen3.6 "closes reasoning fence
|
|
1220
|
+
// and stops" pattern. Reached when thinking-off recovery also
|
|
1221
|
+
// produced a reasoning-only / empty response. Push an assistant
|
|
1222
|
+
// message containing only the start of a tool_call envelope so
|
|
1223
|
+
// Ollama treats it as a prefill — the model has to continue from
|
|
1224
|
+
// inside the envelope, removing its option to end the response at
|
|
1225
|
+
// the reasoning fence close. The completion is glued back to the
|
|
1226
|
+
// prefix when streamAndAggregate returns (see the prepend above).
|
|
1227
|
+
if (!hitLimit
|
|
1228
|
+
&& stallShape
|
|
1229
|
+
&& thinkingOffRecoveryAttempted
|
|
1230
|
+
&& !prefillRecoveryAttempted) {
|
|
1231
|
+
prefillRecoveryAttempted = true;
|
|
1232
|
+
consecutiveEmptyRetries = 0;
|
|
1233
|
+
nextCallThinkOverride = false;
|
|
1234
|
+
pendingPrefillPrefix = '<tool_call>{"name":"';
|
|
1235
|
+
emit('tool_loop:prefill_recovery', {
|
|
1236
|
+
iteration: iterations,
|
|
1237
|
+
prefix: pendingPrefillPrefix
|
|
1238
|
+
});
|
|
1239
|
+
messages.push({
|
|
1240
|
+
role: 'assistant',
|
|
1241
|
+
content: pendingPrefillPrefix
|
|
1242
|
+
});
|
|
1243
|
+
continue;
|
|
1244
|
+
}
|
|
1245
|
+
consecutiveEmptyRetries = 0;
|
|
1246
|
+
// Model emitted tool_call markup but none parsed — almost always means
|
|
1247
|
+
// invalid JSON inside a content string (unescaped quotes is the classic
|
|
1248
|
+
// offender on writes of TS/JSON/HTML files). Give the model one more
|
|
1249
|
+
// shot with explicit guidance; otherwise treat the raw text as final.
|
|
1250
|
+
if (!hitLimit && (0, tool_use_parser_1.looksLikeAttemptedToolCall)(response) && !(0, tool_use_parser_1.hasToolCalls)(response) && parseRetries < PARSE_RETRY_CAP) {
|
|
1251
|
+
parseRetries++;
|
|
1252
|
+
emit('tool_loop:parse_retry', { iteration: iterations, attempt: parseRetries });
|
|
1253
|
+
// First retry: gentle guidance on escaping. Second retry: an
|
|
1254
|
+
// explicit escape-hatch — tell the model to write the file with
|
|
1255
|
+
// write_file (which takes a single `content` param and avoids
|
|
1256
|
+
// the find/replace escaping gauntlet) OR produce a prose-only
|
|
1257
|
+
// final answer. Without this the loop just terminates silently
|
|
1258
|
+
// and the user sees no actual edit.
|
|
1259
|
+
const firstRetry = parseRetries === 1;
|
|
1260
|
+
messages.push({
|
|
1261
|
+
role: 'user',
|
|
1262
|
+
content: firstRetry
|
|
1263
|
+
? 'Your previous tool_call was not valid JSON — I could not parse it. Common cause: unescaped `"` characters inside a string value (for example `["", "", ""]` inside a `content` string). Retry the tool call with properly escaped JSON: every `"` inside a string value must be written as `\\"`, and every newline as `\\n`. If the content is very long, consider `replace_range` for a line-numbered block or breaking the change into smaller edits.'
|
|
1264
|
+
: 'Your tool_call still did not parse. Do NOT retry with the same shape or the same escaping failure. Switch tactics: (a) call `replace_range` for a large block whose line numbers you just read, (b) call `write_file` for a new file, or (c) split the change into multiple small `apply_edit` calls that each target just one method or block (e.g. 3-5 lines of `find`, 5-10 lines of `replace`) instead of rewriting the whole class. Pick the smallest scope that accomplishes the next step. If you cannot produce a valid tool call, respond with a plain-prose final answer acknowledging you could not complete the edit.'
|
|
1265
|
+
});
|
|
1266
|
+
continue;
|
|
1267
|
+
}
|
|
1268
|
+
// Prose-loop detector (cross-iteration). If the assistant has gone
|
|
1269
|
+
// N iterations in a row without emitting a tool call AND the
|
|
1270
|
+
// current response is substring-similar to the previous one, the
|
|
1271
|
+
// model is almost certainly stuck in a deliberation loop. Fire
|
|
1272
|
+
// one corrective nudge; if that doesn't break the pattern, let
|
|
1273
|
+
// the turn terminate on the next iteration so the user sees a
|
|
1274
|
+
// coherent final answer instead of a second wall of repetition.
|
|
1275
|
+
if (!hitLimit && !(0, tool_use_parser_1.hasToolCalls)(response)) {
|
|
1276
|
+
const normalized = response.toLowerCase().replace(/\s+/g, ' ').trim();
|
|
1277
|
+
const prior = recentNonToolResponses[recentNonToolResponses.length - 1];
|
|
1278
|
+
const looksLikeLoop = !!prior && (() => {
|
|
1279
|
+
// Cheap similarity: longest common prefix / max length. If two
|
|
1280
|
+
// consecutive no-tool responses share >60% of their text by
|
|
1281
|
+
// prefix the model is repeating itself. More sophisticated
|
|
1282
|
+
// diff would be overkill — the real failure mode is near-
|
|
1283
|
+
// identical responses, not subtle rephrasings.
|
|
1284
|
+
const short = prior.length < normalized.length ? prior : normalized;
|
|
1285
|
+
const long = prior.length < normalized.length ? normalized : prior;
|
|
1286
|
+
let matched = 0;
|
|
1287
|
+
while (matched < short.length && short[matched] === long[matched])
|
|
1288
|
+
matched++;
|
|
1289
|
+
return matched / short.length > 0.6;
|
|
1290
|
+
})();
|
|
1291
|
+
// Also flag the self-contradiction signature from the real
|
|
1292
|
+
// trace: alternating "Wait, I see …" and "Actually, I'll try
|
|
1293
|
+
// …" phrases appearing multiple times inside ONE response.
|
|
1294
|
+
const waitCount = (normalized.match(/wait,? i see/g) ?? []).length;
|
|
1295
|
+
const actuallyCount = (normalized.match(/actually,? i'?ll/g) ?? []).length;
|
|
1296
|
+
const selfContradicting = waitCount >= 3 && actuallyCount >= 3;
|
|
1297
|
+
// Intra-response stream abort already tagged the text — also a
|
|
1298
|
+
// loop.
|
|
1299
|
+
const streamAborted = response.includes('[stream aborted: self-contradicting prose loop detected]');
|
|
1300
|
+
if (!proseLoopNudged && (looksLikeLoop || selfContradicting || streamAborted)) {
|
|
1301
|
+
proseLoopNudged = true;
|
|
1302
|
+
emit('tool_loop:prose_loop_nudge', {
|
|
1303
|
+
iteration: iterations,
|
|
1304
|
+
responsePreview: response.slice(0, 200),
|
|
1305
|
+
reason: streamAborted ? 'stream_abort' : selfContradicting ? 'self_contradict' : 'cross_iteration_similarity'
|
|
1306
|
+
});
|
|
1307
|
+
messages.push({
|
|
1308
|
+
role: 'user',
|
|
1309
|
+
content: 'STOP deliberating. Your last response either repeated itself, contradicted itself (e.g. "Wait, I see X / Actually I\'ll try X"), or was aborted mid-stream as a loop. Do NOT continue speculating about what files might exist. Take exactly one of these actions now: (a) invoke a tool (`list_files`, `read_file`, `search_code`, etc.) to answer the question with real data, OR (b) give up and tell the user plainly that you could not complete the task and why. Do not write more than two sentences of prose before either calling a tool or terminating.'
|
|
1310
|
+
});
|
|
1311
|
+
recentNonToolResponses.length = 0;
|
|
1312
|
+
continue;
|
|
1313
|
+
}
|
|
1314
|
+
recentNonToolResponses.push(normalized);
|
|
1315
|
+
if (recentNonToolResponses.length > PROSE_LOOP_WINDOW) {
|
|
1316
|
+
recentNonToolResponses.shift();
|
|
1317
|
+
}
|
|
1318
|
+
}
|
|
1319
|
+
else {
|
|
1320
|
+
// Reset the window whenever a tool call fires — legitimate
|
|
1321
|
+
// progress breaks any suspected loop.
|
|
1322
|
+
recentNonToolResponses.length = 0;
|
|
1323
|
+
}
|
|
1324
|
+
// JSON-todo auto-promote: small models (observed on gemma3:12b-it-qat,
|
|
1325
|
+
// Apr 22 S3Api turn) often paste their todo list as a ```json fenced
|
|
1326
|
+
// code block instead of calling the todo_write tool. The plan never
|
|
1327
|
+
// advances and the model re-iterates on the same task because its
|
|
1328
|
+
// own view of "what's done" stays frozen. Detect the shape, execute
|
|
1329
|
+
// a synthesized todo_write call on the model's behalf, continue.
|
|
1330
|
+
if (!hitLimit && !(0, tool_use_parser_1.hasToolCalls)(response) && !jsonTodoAutoPromoted) {
|
|
1331
|
+
const JSON_TODO_FENCE_RE = /```json\s*\n([\s\S]*?)```/i;
|
|
1332
|
+
const match = response.match(JSON_TODO_FENCE_RE);
|
|
1333
|
+
if (match) {
|
|
1334
|
+
try {
|
|
1335
|
+
const parsed = JSON.parse(match[1].trim());
|
|
1336
|
+
// Must be a non-empty array where every item looks like a todo
|
|
1337
|
+
// ({content: string} at minimum). Tight check avoids false-
|
|
1338
|
+
// positives on generic data-shaped JSON the model might emit.
|
|
1339
|
+
if (Array.isArray(parsed) &&
|
|
1340
|
+
parsed.length > 0 &&
|
|
1341
|
+
parsed.every((item) => item &&
|
|
1342
|
+
typeof item === 'object' &&
|
|
1343
|
+
typeof item.content === 'string')) {
|
|
1344
|
+
jsonTodoAutoPromoted = true;
|
|
1345
|
+
emit('tool_loop:json_todo_auto_promoted', {
|
|
1346
|
+
iteration: iterations,
|
|
1347
|
+
itemCount: parsed.length
|
|
1348
|
+
});
|
|
1349
|
+
const todoTool = this.registry.get('todo_write');
|
|
1350
|
+
if (todoTool) {
|
|
1351
|
+
const syntheticCall = {
|
|
1352
|
+
name: 'todo_write',
|
|
1353
|
+
params: { items: JSON.stringify(parsed) },
|
|
1354
|
+
raw: `<tool_call>{"name":"todo_write","params":{"items":${JSON.stringify(JSON.stringify(parsed))}}}</tool_call>`
|
|
1355
|
+
};
|
|
1356
|
+
emit('tool_loop:tool_execute', {
|
|
1357
|
+
name: 'todo_write',
|
|
1358
|
+
params: syntheticCall.params,
|
|
1359
|
+
rawSnippet: syntheticCall.raw.slice(0, 400)
|
|
1360
|
+
});
|
|
1361
|
+
try {
|
|
1362
|
+
const result = await todoTool.execute(syntheticCall.params, this.ctx);
|
|
1363
|
+
// Redact outputSnippet and outputFull
|
|
1364
|
+
// before emitting; the model-facing message below
|
|
1365
|
+
// is also redacted via buildToolResultsMessage →
|
|
1366
|
+
// formatToolResult. todo_write output rarely carries
|
|
1367
|
+
// secrets but consistency matters here — tool cards
|
|
1368
|
+
// in the extension UI will render outputFull and we
|
|
1369
|
+
// don't want any path to leak.
|
|
1370
|
+
emit('tool_loop:tool_result', {
|
|
1371
|
+
name: 'todo_write',
|
|
1372
|
+
isError: result.isError,
|
|
1373
|
+
outputLength: result.output.length,
|
|
1374
|
+
outputSnippet: (0, tool_use_parser_1.applySecretRedactionIfEnabled)(result.output.slice(0, 280)),
|
|
1375
|
+
outputFull: (0, tool_use_parser_1.applySecretRedactionIfEnabled)(result.output.slice(0, 65536))
|
|
1376
|
+
});
|
|
1377
|
+
messages.push({
|
|
1378
|
+
role: 'user',
|
|
1379
|
+
content: (0, tool_use_parser_1.buildToolResultsMessage)([
|
|
1380
|
+
{ name: 'todo_write', output: result.output, isError: result.isError }
|
|
1381
|
+
])
|
|
1382
|
+
});
|
|
1383
|
+
}
|
|
1384
|
+
catch (err) {
|
|
1385
|
+
const msg = err instanceof Error ? err.message : String(err);
|
|
1386
|
+
emit('tool_loop:tool_error', { name: 'todo_write', error: msg });
|
|
1387
|
+
messages.push({
|
|
1388
|
+
role: 'user',
|
|
1389
|
+
content: (0, tool_use_parser_1.buildToolResultsMessage)([
|
|
1390
|
+
{ name: 'todo_write', output: `Error: ${msg}`, isError: true }
|
|
1391
|
+
])
|
|
1392
|
+
});
|
|
1393
|
+
}
|
|
1394
|
+
// Nudge the model to stop pasting JSON and use the tool
|
|
1395
|
+
// directly next time. Reinforces the system-prompt anchor
|
|
1396
|
+
// without being so loud that it derails prose responses.
|
|
1397
|
+
messages.push({
|
|
1398
|
+
role: 'user',
|
|
1399
|
+
content: 'Note: I detected a JSON todo list in your response and auto-promoted it to a todo_write call. Next time, emit `<tool_call>{"name":"todo_write","params":{"items":"..."}}</tool_call>` directly instead of pasting JSON as a code block — pasted JSON does not update your plan, only the tool call does.'
|
|
1400
|
+
});
|
|
1401
|
+
iterations++;
|
|
1402
|
+
continue;
|
|
1403
|
+
}
|
|
1404
|
+
}
|
|
1405
|
+
}
|
|
1406
|
+
catch {
|
|
1407
|
+
// Not valid JSON — fall through to normal handling.
|
|
1408
|
+
}
|
|
1409
|
+
}
|
|
1410
|
+
}
|
|
1411
|
+
// If no tool calls (or hit limit), return the final answer.
|
|
1412
|
+
// Strip any lingering tool_call markup so malformed blocks never
|
|
1413
|
+
// reach the user-visible output.
|
|
1414
|
+
if (hitLimit || !(0, tool_use_parser_1.hasToolCalls)(response)) {
|
|
1415
|
+
// Detect hallucinated `<tool_result>` envelopes BEFORE stripping
|
|
1416
|
+
// so we can emit a telemetry event. The strip is mandatory (the
|
|
1417
|
+
// user can't see fabricated tool output as if it were real); the
|
|
1418
|
+
// event lets us track frequency and confirm the cause is what we
|
|
1419
|
+
// think it is — typically aggressive compaction stripping the
|
|
1420
|
+
// model's memory and it falling back to imitating the format.
|
|
1421
|
+
if ((0, tool_use_parser_1.hasFabricatedToolResult)(response)) {
|
|
1422
|
+
emit('tool_loop:hallucinated_tool_result', {
|
|
1423
|
+
iteration: iterations,
|
|
1424
|
+
responsePreview: response.slice(0, 300)
|
|
1425
|
+
});
|
|
1426
|
+
}
|
|
1427
|
+
const finalResponse = (0, tool_use_parser_1.stripToolCallMarkup)(response).trim();
|
|
1428
|
+
// False-completion detector. Small models regularly end a turn
|
|
1429
|
+
// with "I refactored the file" / "here is the updated code" text
|
|
1430
|
+
// without ever emitting a file-edit tool call.
|
|
1431
|
+
// When that happens the user sees a confident final response
|
|
1432
|
+
// backed by zero actual change on disk. If we detect this
|
|
1433
|
+
// pattern AND haven't nudged yet AND no edit tool was called
|
|
1434
|
+
// this turn, push one corrective user message into the loop
|
|
1435
|
+
// and continue for one more iteration. The nudge is capped at
|
|
1436
|
+
// one per turn so a truly confused model can still terminate.
|
|
1437
|
+
if (!hitLimit && !falseCompletionNudged && editToolsInvoked === 0) {
|
|
1438
|
+
const claimsCompletion = FALSE_COMPLETION_PATTERNS.some(re => re.test(finalResponse));
|
|
1439
|
+
if (claimsCompletion) {
|
|
1440
|
+
falseCompletionNudged = true;
|
|
1441
|
+
emit('tool_loop:false_completion_nudge', { iteration: iterations, responsePreview: finalResponse.slice(0, 200) });
|
|
1442
|
+
messages.push({
|
|
1443
|
+
role: 'user',
|
|
1444
|
+
content: 'Your response either claims work is done OR apologizes and asks what to do next — but I see NO successful `write_file`, `apply_edit`, `replace_range`, or `apply_patch` tool call in this turn, so nothing on disk has changed. ' +
|
|
1445
|
+
'Do NOT ask the user which task to resume, do NOT promise to escape JSON "in your next tool call", and do NOT defer. Either (a) emit a real edit tool call NOW with the actual change — use `replace_range` for a large block whose line numbers you just read, `apply_edit` for a small exact replacement, or `write_file` for a new file — or (b) respond honestly that you could not complete the task and briefly explain why. Retry the tool call yourself; the user cannot help you escape JSON.'
|
|
1446
|
+
});
|
|
1447
|
+
continue;
|
|
1448
|
+
}
|
|
1449
|
+
}
|
|
1450
|
+
// Partial-completion detector. The check above catches "claimed
|
|
1451
|
+
// work, did NOTHING." This catches "claimed work on N files, only
|
|
1452
|
+
// edited M of them." Observed with gpt-oss:120b on
|
|
1453
|
+
// S3Api: 1 successful apply_edit on HealthController.cs, then
|
|
1454
|
+
// the final answer claimed edits to FileController.cs (class +
|
|
1455
|
+
// 3 methods) AND HealthController.cs (class + method). The user
|
|
1456
|
+
// saw a confident summary of 5 edits but only 1 landed on disk.
|
|
1457
|
+
// Heuristic: extract distinct file references (paths with
|
|
1458
|
+
// recognized source extensions or backticked file-like tokens)
|
|
1459
|
+
// from the response. If the count exceeds the actual successful
|
|
1460
|
+
// edit count, the model is overclaiming. One nudge per turn.
|
|
1461
|
+
if (!hitLimit && !falseCompletionNudged && editToolsInvoked > 0) {
|
|
1462
|
+
const filePathRe = /[`"']?([\w./\\-]+\.(?:cs|ts|tsx|js|jsx|mjs|cjs|py|rb|go|rs|java|kt|swift|cpp|cc|c|h|hpp|md|json|ya?ml|html|css|scss|sql|toml|sh|bash))[`"']?/gi;
|
|
1463
|
+
const fileSet = new Set();
|
|
1464
|
+
let m;
|
|
1465
|
+
while ((m = filePathRe.exec(finalResponse)) !== null) {
|
|
1466
|
+
// Normalize so `S3Api/Controllers/Foo.cs` and `Foo.cs` count
|
|
1467
|
+
// separately only when they really are different files. Last
|
|
1468
|
+
// segment is the cheapest disambiguator.
|
|
1469
|
+
const segments = m[1].split(/[/\\]/);
|
|
1470
|
+
const leaf = segments[segments.length - 1].toLowerCase();
|
|
1471
|
+
fileSet.add(leaf);
|
|
1472
|
+
}
|
|
1473
|
+
if (fileSet.size > editToolsInvoked) {
|
|
1474
|
+
falseCompletionNudged = true;
|
|
1475
|
+
emit('tool_loop:partial_completion_nudge', {
|
|
1476
|
+
iteration: iterations,
|
|
1477
|
+
editToolsInvoked,
|
|
1478
|
+
claimedFiles: fileSet.size,
|
|
1479
|
+
responsePreview: finalResponse.slice(0, 200)
|
|
1480
|
+
});
|
|
1481
|
+
messages.push({
|
|
1482
|
+
role: 'user',
|
|
1483
|
+
content: `Your response describes edits to ${fileSet.size} files (${[...fileSet].slice(0, 8).join(', ')}${fileSet.size > 8 ? ', …' : ''}), but only ${editToolsInvoked} successful edit${editToolsInvoked === 1 ? '' : 's'} actually fired this turn. ` +
|
|
1484
|
+
`The remaining ${fileSet.size - editToolsInvoked} file(s) were NOT modified — nothing landed on disk for them. ` +
|
|
1485
|
+
'Either (a) emit the missing `apply_edit` / `replace_range` / `write_file` tool calls now to actually do the work, OR (b) revise your response to honestly describe ONLY the edits that successfully applied. Do not summarize work that did not happen.'
|
|
1486
|
+
});
|
|
1487
|
+
continue;
|
|
1488
|
+
}
|
|
1489
|
+
}
|
|
1490
|
+
// Subject-not-modified detector. Refactor goals
|
|
1491
|
+
// ("break out", "split", "refactor", "extract", "move") imply
|
|
1492
|
+
// mutation of the SOURCE file the user wants restructured, not
|
|
1493
|
+
// just creation of new sibling files. Failure mode observed
|
|
1494
|
+
// 2026-05-25 on a Portfolio React refactor: model read App.jsx,
|
|
1495
|
+
// wrote 5 new component files, never touched App.jsx, declared
|
|
1496
|
+
// completion. User had to follow up "are we using these?" to
|
|
1497
|
+
// force the integration step — and even that follow-up turn
|
|
1498
|
+
// wrote MORE components without modifying App.jsx.
|
|
1499
|
+
//
|
|
1500
|
+
// Heuristic: original goal contains a refactor verb AND the
|
|
1501
|
+
// turn read files AND wrote DIFFERENT files. If none of the
|
|
1502
|
+
// read files were also written, the model produced consumers
|
|
1503
|
+
// but never updated the source. One-shot nudge.
|
|
1504
|
+
const REFACTOR_GOAL_RE = /\b(refactor|refactoring|break\s+(?:out|up|apart|into)|split\s+(?:out|up|into|apart)|extract|extracting|migrate|migrating|move\s+(?:out\s+of|from|into)|reorganize|reorganizing|restructure|restructuring|consolidate|consolidating)\b/i;
|
|
1505
|
+
if (!hitLimit &&
|
|
1506
|
+
!subjectNotModifiedNudged &&
|
|
1507
|
+
editToolsInvoked > 0 &&
|
|
1508
|
+
filesReadThisTurn.size > 0 &&
|
|
1509
|
+
originalGoal &&
|
|
1510
|
+
REFACTOR_GOAL_RE.test(originalGoal)) {
|
|
1511
|
+
const readNotWritten = [...filesReadThisTurn].filter((p) => !filesWrittenThisTurn.has(p));
|
|
1512
|
+
// Fire only when the read-set is disjoint from the write-set.
|
|
1513
|
+
// If even ONE read file was written, the model is integrating;
|
|
1514
|
+
// we don't want to nag a partial-but-progressing refactor.
|
|
1515
|
+
if (readNotWritten.length === filesReadThisTurn.size) {
|
|
1516
|
+
subjectNotModifiedNudged = true;
|
|
1517
|
+
emit('tool_loop:subject_not_modified_nudge', {
|
|
1518
|
+
iteration: iterations,
|
|
1519
|
+
readNotWritten: readNotWritten.slice(0, 4),
|
|
1520
|
+
writtenCount: filesWrittenThisTurn.size
|
|
1521
|
+
});
|
|
1522
|
+
const readPreview = readNotWritten.slice(0, 3).join(', ');
|
|
1523
|
+
const writeCount = filesWrittenThisTurn.size;
|
|
1524
|
+
messages.push({
|
|
1525
|
+
role: 'user',
|
|
1526
|
+
content: `The user's goal contains a refactor verb (refactor/break out/split/extract/move) which implies the SOURCE file(s) should be modified, not just supplemented with new siblings. You read ${readPreview}${readNotWritten.length > 3 ? ' and others' : ''} for context, then wrote ${writeCount} NEW file(s), but you NEVER modified the file(s) you read. The refactor is incomplete: the source file still contains the old monolithic code. ` +
|
|
1527
|
+
`Emit the missing apply_edit/replace_range/write_file call on the source file now — it should import from the new files and drop the inlined code that's been extracted. If the refactor is genuinely a "scaffold only, leave source untouched" task, say so explicitly and explain why the source doesn't need to change.`
|
|
1528
|
+
});
|
|
1529
|
+
continue;
|
|
1530
|
+
}
|
|
1531
|
+
}
|
|
1532
|
+
// Code-fence-as-final-answer detector. pburg-bowl trace (Apr 21):
|
|
1533
|
+
// the model read ScoreBoard.tsx, then ended the turn with a ```
|
|
1534
|
+
// fenced helper function and "Replace your current total calculation
|
|
1535
|
+
// logic with this" — never calling a file-edit tool. The
|
|
1536
|
+
// existing FALSE_COMPLETION_PATTERNS don't catch this flavor because
|
|
1537
|
+
// the model doesn't SAY "I have refactored" — it just hands back
|
|
1538
|
+
// code. Heuristic: final response contains a fenced block with at
|
|
1539
|
+
// least ~8 lines of code, no edit tool was invoked this turn, and
|
|
1540
|
+
// the original prompt implied a file change. One-shot nudge.
|
|
1541
|
+
if (!hitLimit &&
|
|
1542
|
+
!codeFenceHallucinationNudged &&
|
|
1543
|
+
editToolsInvoked === 0 &&
|
|
1544
|
+
promptImpliesFileEdit) {
|
|
1545
|
+
// Look for ```lang\n...\n``` blocks. We want *substantial* code,
|
|
1546
|
+
// not a one-liner — so require at least 8 non-empty lines inside
|
|
1547
|
+
// the fence. This avoids false positives on small snippets
|
|
1548
|
+
// (shell commands, regex, env values).
|
|
1549
|
+
const fenceRe = /```[a-zA-Z0-9_-]*\n([\s\S]*?)```/g;
|
|
1550
|
+
const MIN_LINES = 8;
|
|
1551
|
+
let biggestFenceLines = 0;
|
|
1552
|
+
let match;
|
|
1553
|
+
while ((match = fenceRe.exec(finalResponse)) !== null) {
|
|
1554
|
+
const nonEmpty = match[1].split('\n').filter(l => l.trim().length > 0).length;
|
|
1555
|
+
if (nonEmpty > biggestFenceLines)
|
|
1556
|
+
biggestFenceLines = nonEmpty;
|
|
1557
|
+
}
|
|
1558
|
+
if (biggestFenceLines >= MIN_LINES) {
|
|
1559
|
+
codeFenceHallucinationNudged = true;
|
|
1560
|
+
emit('tool_loop:code_fence_nudge', {
|
|
1561
|
+
iteration: iterations,
|
|
1562
|
+
fenceLines: biggestFenceLines,
|
|
1563
|
+
responsePreview: finalResponse.slice(0, 200)
|
|
1564
|
+
});
|
|
1565
|
+
messages.push({
|
|
1566
|
+
role: 'user',
|
|
1567
|
+
content: 'You produced a substantial code block in your reply but never emitted a `write_file`, `apply_edit`, `replace_range`, or `apply_patch` tool call — so the change is NOT on disk. ' +
|
|
1568
|
+
'Do not ask the user to paste your code into a file themselves. Take exactly one of these actions now: (a) call `replace_range`, `apply_edit`, or `write_file` with the real change to the correct file, OR (b) say plainly that you could not locate the target file and explain what you searched for. Do not wrap up with another prose + code-fence response.'
|
|
1569
|
+
});
|
|
1570
|
+
continue;
|
|
1571
|
+
}
|
|
1572
|
+
}
|
|
1573
|
+
// Announce-then-stall detector. The model wraps an iteration with
|
|
1574
|
+
// a forward-looking commitment ("Let me dig deeper into X", "Next
|
|
1575
|
+
// I'll explore Y") but emits NO tool call, so the loop interprets
|
|
1576
|
+
// the prose as the final answer and exits. Observed with bandit-logic self-evaluating this repo. None of the
|
|
1577
|
+
// upstream detectors fire: no completion claim (false-completion
|
|
1578
|
+
// patterns miss), no code fence, no prose-loop similarity (it's
|
|
1579
|
+
// the first stall after real work), no parse retry (the prose
|
|
1580
|
+
// doesn't look like an attempted tool call). One nudge per turn;
|
|
1581
|
+
// if the model still won't act, we fall through to terminate so
|
|
1582
|
+
// the user can intervene.
|
|
1583
|
+
// Announce-then-stall + ask-user-in-prose detectors. The model
|
|
1584
|
+
// wrapped a turn with "Let me X" / "I'll Y" / "I'm porting Z"
|
|
1585
|
+
// (announce-intent) or with a prose decision question (ask-user)
|
|
1586
|
+
// while we could have rendered an interactive prompt. Either one
|
|
1587
|
+
// means the loop is about to exit on a non-final-answer shape.
|
|
1588
|
+
// Detector bodies + the regex why-traces live in
|
|
1589
|
+
// loop/finalAnswerNudges.ts. The orchestrator owns the
|
|
1590
|
+
// once-per-turn flags and the false-completion-nudge precedence.
|
|
1591
|
+
if (!hitLimit && !announceIntentNudged && !falseCompletionNudged) {
|
|
1592
|
+
const r = (0, finalAnswerNudges_1.tryAnnounceIntentNudge)({ finalResponse, iteration: iterations, emit });
|
|
1593
|
+
if (r.fired) {
|
|
1594
|
+
announceIntentNudged = true;
|
|
1595
|
+
messages.push(r.message);
|
|
1596
|
+
continue;
|
|
1597
|
+
}
|
|
1598
|
+
}
|
|
1599
|
+
if (!hitLimit && !askUserNudged && !falseCompletionNudged) {
|
|
1600
|
+
const r = (0, finalAnswerNudges_1.tryAskUserNudge)({
|
|
1601
|
+
finalResponse,
|
|
1602
|
+
iteration: iterations,
|
|
1603
|
+
emit,
|
|
1604
|
+
askUserAvailable: this.registry.get('ask_user') !== undefined
|
|
1605
|
+
});
|
|
1606
|
+
if (r.fired) {
|
|
1607
|
+
askUserNudged = true;
|
|
1608
|
+
messages.push(r.message);
|
|
1609
|
+
continue;
|
|
1610
|
+
}
|
|
1611
|
+
}
|
|
1612
|
+
// Subagent-first-iteration-must-act detector. Subagents are
|
|
1613
|
+
// spawned to gather information for a specific goal — producing
|
|
1614
|
+
// prose-only output on iter 0 is always a stall, not a real
|
|
1615
|
+
// final answer. The earlier announce-intent + narrate detectors
|
|
1616
|
+
// miss when bandit-logic emits neutral reasoning + non-forward-
|
|
1617
|
+
// looking prose ("This is a complex task...") that doesn't
|
|
1618
|
+
// match either's patterns. 5/6 subagents
|
|
1619
|
+
// on a self-eval turn died at 0 iterations with exactly that
|
|
1620
|
+
// shape. One nudge per turn; if the model still won't emit a
|
|
1621
|
+
// tool the loop exits and the parent gets the existing
|
|
1622
|
+
// "subagent stalled in reasoning" error.
|
|
1623
|
+
if (effectiveOptions.isSubagent
|
|
1624
|
+
&& iterations === 0
|
|
1625
|
+
&& !subagentFirstIterNudged
|
|
1626
|
+
&& !announceIntentNudged
|
|
1627
|
+
&& !falseCompletionNudged
|
|
1628
|
+
&& !hitLimit) {
|
|
1629
|
+
subagentFirstIterNudged = true;
|
|
1630
|
+
// DO NOT force think:false here. The earlier fix
|
|
1631
|
+
// hard-set nextCallThinkOverride = false on this
|
|
1632
|
+
// retry, which is correct for non-reasoning models but
|
|
1633
|
+
// catastrophic for bandit-logic (qwen3.6:27b): per the
|
|
1634
|
+
// model's training, the tool channel runs THROUGH the
|
|
1635
|
+
// reasoning channel — disabling thinking disables tool
|
|
1636
|
+
// calling entirely. Self-eval traces 2026-05-08 confirmed
|
|
1637
|
+
// 6+ consecutive retries with think:false producing only
|
|
1638
|
+
// reasoning prose, never a tool call. Now we keep the
|
|
1639
|
+
// model's natural think setting and only escalate the
|
|
1640
|
+
// prompt — give the model a concrete <tool_call> envelope
|
|
1641
|
+
// it can copy verbatim, with the most generic exploration
|
|
1642
|
+
// tool baked in. The thinking-off-recovery path at line 876
|
|
1643
|
+
// still fires earlier for genuinely empty/stuck responses;
|
|
1644
|
+
// we don't double-down here.
|
|
1645
|
+
emit('tool_loop:subagent_first_iter_no_tool_call', {
|
|
1646
|
+
iteration: iterations,
|
|
1647
|
+
responsePreview: finalResponse.slice(0, 240)
|
|
1648
|
+
});
|
|
1649
|
+
messages.push({
|
|
1650
|
+
role: 'user',
|
|
1651
|
+
content: 'Your first response had reasoning but emitted NO tool call — that is a hard stall for a subagent (you exist to gather information; reasoning alone produces zero output). ' +
|
|
1652
|
+
'For your next response, emit a tool call. The minimum viable starting move for ANY exploration goal is:\n\n' +
|
|
1653
|
+
'<tool_call>{"name":"list_files","params":{"path":"."}}</tool_call>\n\n' +
|
|
1654
|
+
'Copy that exact envelope as the very first thing you emit (you may keep the reasoning block before it if your model needs to think first, but the tool_call envelope MUST appear in this turn). ' +
|
|
1655
|
+
'Substitute a different tool only if it\'s obviously better for the goal — `read_file` for "what does file X look like", `search_code` for "where is symbol Y", `run_command` for shell output. ' +
|
|
1656
|
+
'Do NOT respond with reasoning only again. The next message you send must contain a real <tool_call> envelope.'
|
|
1657
|
+
});
|
|
1658
|
+
continue;
|
|
1659
|
+
}
|
|
1660
|
+
// Reasoning-only terminal fallback. If we got here because the
|
|
1661
|
+
// empty-retry / thinking-off-recovery cap was reached and the
|
|
1662
|
+
// model still produced only reasoning + zero actionable output,
|
|
1663
|
+
// the user otherwise sees nothing — just a return to the prompt.
|
|
1664
|
+
// Surface a clear message that names what the model intended (so
|
|
1665
|
+
// the user can act on it themselves) instead of leaving them
|
|
1666
|
+
// staring at a blank reply. Observed with bandit-logic
|
|
1667
|
+
// on the email-fetch task: model reasoned "I should use
|
|
1668
|
+
// run_command with osascript to fetch …" and emitted no tool
|
|
1669
|
+
// call — final response was empty after fence-strip and the
|
|
1670
|
+
// user saw nothing.
|
|
1671
|
+
//
|
|
1672
|
+
// The gate also covers the "regurgitated reasoning after
|
|
1673
|
+
// native→text channel fallback" case. Mark Portfolio
|
|
1674
|
+
// 2026-05-31T17-39-53 cleanup turn: native-tool path 500'd,
|
|
1675
|
+
// text-channel recovery prompted the model to re-emit its
|
|
1676
|
+
// pending action, but the model just echoed its prior
|
|
1677
|
+
// `bandit-reasoning` block — no tool_call, no prose, no
|
|
1678
|
+
// visible action for the user. The previous gate (`!finalResponse`,
|
|
1679
|
+
// where finalResponse = response stripped of tool_call markup
|
|
1680
|
+
// only) didn't trigger because the reasoning fence is not
|
|
1681
|
+
// tool_call markup. Widened below to also strip reasoning
|
|
1682
|
+
// before testing emptiness — if the response would render to
|
|
1683
|
+
// the user as nothing-actionable, the fallback fires and the
|
|
1684
|
+
// user sees what the model was thinking instead of silence.
|
|
1685
|
+
const reasoningStripped = response
|
|
1686
|
+
.replace(/<think\b[\s\S]*?<\/think\s*>/gi, '')
|
|
1687
|
+
.replace(/<think\b[\s\S]*$/i, '')
|
|
1688
|
+
.replace(/```bandit-reasoning\b[\s\S]*?```/gi, '')
|
|
1689
|
+
.replace(/```bandit-reasoning\b[\s\S]*$/i, '')
|
|
1690
|
+
.trim();
|
|
1691
|
+
const visibleAfterStrip = (0, tool_use_parser_1.stripToolCallMarkup)(reasoningStripped).trim();
|
|
1692
|
+
if (!visibleAfterStrip) {
|
|
1693
|
+
// Pull the last 1-2 sentences of reasoning so the user sees
|
|
1694
|
+
// what the model planned to do. Cap at 280 chars so the
|
|
1695
|
+
// fallback stays readable.
|
|
1696
|
+
const reasoningMatch = response.match(/<think\b[\s\S]*?<\/think\s*>/gi)?.pop() ??
|
|
1697
|
+
response.match(/```bandit-reasoning\b[\s\S]*?```/gi)?.pop() ??
|
|
1698
|
+
response;
|
|
1699
|
+
const reasoningText = reasoningMatch
|
|
1700
|
+
.replace(/<\/?think[^>]*>/gi, '')
|
|
1701
|
+
.replace(/```bandit-reasoning\s*\n?|```/g, '')
|
|
1702
|
+
.trim();
|
|
1703
|
+
const sentences = reasoningText.match(/[^.!?]+[.!?]/g) ?? [reasoningText];
|
|
1704
|
+
const tail = sentences.slice(-2).join(' ').trim().slice(-280);
|
|
1705
|
+
const fallback = `[Bandit stalled after reasoning without emitting a tool call — the model thought through the next step but never committed to an action. ` +
|
|
1706
|
+
`Last reasoning: "${tail}${tail.length === 280 ? '…' : ''}"\n\n` +
|
|
1707
|
+
`Try: re-prompt with the same request (often resolves on the next turn), or run the planned command yourself.]`;
|
|
1708
|
+
return { finalResponse: fallback, iterations, messages, hitLimit };
|
|
1709
|
+
}
|
|
1710
|
+
// Narrate-but-no-action terminal annotator. If the model ends a
|
|
1711
|
+
// turn with "Let me revert it:" — i.e. a forward-looking intent
|
|
1712
|
+
// verb followed by a DANGLING COLON and NO tool_call envelope —
|
|
1713
|
+
// and the inline empty-retry / narrate-no-action detector
|
|
1714
|
+
// already used its retry budget (consecutiveEmptyRetries >= 2)
|
|
1715
|
+
// so it couldn't nudge again, the user is left reading a
|
|
1716
|
+
// promise the model never kept. Mark Portfolio
|
|
1717
|
+
// 2026-05-31T17-39-53 cleanup turn: after a native→text channel
|
|
1718
|
+
// recovery, the model emitted "Let me revert it:" with a
|
|
1719
|
+
// dangling colon and no tool call; the user saw the prose end
|
|
1720
|
+
// and waited for an action that never came. Append a clear
|
|
1721
|
+
// suffix so the unfulfilled intent reads as a stall, not as
|
|
1722
|
+
// the assistant's last word.
|
|
1723
|
+
//
|
|
1724
|
+
// The trailing colon is the smoking gun — it's the
|
|
1725
|
+
// grammatical signal "what comes next is the thing I'm about
|
|
1726
|
+
// to do". Without it ("Done. Let me know if you'd like me to
|
|
1727
|
+
// push the changes.") the response is a normal final answer
|
|
1728
|
+
// that happens to contain narrate verbs, and the annotator
|
|
1729
|
+
// would be a false positive.
|
|
1730
|
+
// The trailing colon + intent phrase combination is the
|
|
1731
|
+
// smoking gun. We DON'T also require NARRATE_VERB_RE here:
|
|
1732
|
+
// the existing inline detector's verb list misses "revert"
|
|
1733
|
+
// (Portfolio 2026-05-31) and would miss any other one-off
|
|
1734
|
+
// action verb a model might use. The colon alone is rare
|
|
1735
|
+
// enough in a legit final answer that pairing it with
|
|
1736
|
+
// "let me" / "I'll" / "we'll" / etc. is specific enough.
|
|
1737
|
+
//
|
|
1738
|
+
// Period-terminated variant (added 2026-06-03 after Mark's
|
|
1739
|
+
// gregoryhite-site run): the model ended with "Let me fix
|
|
1740
|
+
// all three project cards at once." — full sentence, full
|
|
1741
|
+
// stop, no colon. Both prefill and thinking-off recovery
|
|
1742
|
+
// had been spent earlier in the turn so the user saw the
|
|
1743
|
+
// narrate prose as the final answer with no annotation that
|
|
1744
|
+
// it represented a stall. Periods are MUCH more common than
|
|
1745
|
+
// colons in legit answers ("Done.", "Let me know if you'd
|
|
1746
|
+
// like me to push the changes."), so the period path
|
|
1747
|
+
// requires the STRICTER pair: NARRATE_INTENT_RE AND
|
|
1748
|
+
// NARRATE_VERB_RE both matching the tail clause. "Let me
|
|
1749
|
+
// know if you'd like…" hits intent but no action verb;
|
|
1750
|
+
// "Let me fix the cards" hits both.
|
|
1751
|
+
const terminalStripped = reasoningStripped;
|
|
1752
|
+
const endsWithColon = terminalStripped.endsWith(':');
|
|
1753
|
+
const endsWithPeriod = /\.["']?$/.test(terminalStripped);
|
|
1754
|
+
if ((endsWithColon || endsWithPeriod) && terminalStripped.length < 600) {
|
|
1755
|
+
// Extract the LAST sentence (text after the final non-trailing
|
|
1756
|
+
// sentence terminator). For period-ending responses we must
|
|
1757
|
+
// isolate just the closing clause — testing the whole response
|
|
1758
|
+
// would leak action verbs from earlier "Done. I updated the
|
|
1759
|
+
// file." prose into the gate and trigger false positives on
|
|
1760
|
+
// legit sign-offs like "Let me know if you'd like X."
|
|
1761
|
+
const sentenceSplit = terminalStripped
|
|
1762
|
+
.split(/[.!?]+\s+/)
|
|
1763
|
+
.map((s) => s.trim())
|
|
1764
|
+
.filter((s) => s.length > 0);
|
|
1765
|
+
const terminalTail = (sentenceSplit[sentenceSplit.length - 1] ?? terminalStripped).slice(-200);
|
|
1766
|
+
const intentHit = NARRATE_INTENT_RE.test(terminalTail);
|
|
1767
|
+
// Period path needs both intent + action verb. Colon path keeps the
|
|
1768
|
+
// original looser gate (colon alone is rare enough).
|
|
1769
|
+
const verbGateMet = endsWithColon ? true : NARRATE_VERB_RE.test(terminalTail);
|
|
1770
|
+
if (intentHit && verbGateMet) {
|
|
1771
|
+
const annotated = `${finalResponse}\n\n` +
|
|
1772
|
+
`[Bandit announced this action but did not emit the tool call — the turn ended without the planned change. ` +
|
|
1773
|
+
`If this came after retries (look for "Upstream hiccup" or "Native tool call failed" status messages), the upstream model errored mid-turn and the recovery prompt didn't land the action. ` +
|
|
1774
|
+
`Re-prompt with the same request to retry, or perform the action yourself.]`;
|
|
1775
|
+
return { finalResponse: annotated, iterations, messages, hitLimit };
|
|
1776
|
+
}
|
|
1777
|
+
}
|
|
1778
|
+
return { finalResponse, iterations, messages, hitLimit };
|
|
1779
|
+
}
|
|
1780
|
+
// Parse and execute all tool calls in this response.
|
|
1781
|
+
let toolCalls = (0, tool_use_parser_1.parseToolCalls)(response);
|
|
1782
|
+
emit('tool_loop:tool_calls', { iteration: iterations, tools: toolCalls.map(t => t.name) });
|
|
1783
|
+
// Repeated-todo-write circuit breaker. pburg-bowl (Apr 21) burned 3
|
|
1784
|
+
// consecutive iterations on `todo_write` revisions before doing any
|
|
1785
|
+
// real work. If this iteration's tools are ONLY todo_write (or
|
|
1786
|
+
// todo_write + another todo_write) AND the previous N-1 iterations
|
|
1787
|
+
// were also todo-only, drop the redundant todo_write calls and
|
|
1788
|
+
// inject a nudge telling the model to execute. We keep non-todo
|
|
1789
|
+
// calls in the same iteration — the breaker only strips redundant
|
|
1790
|
+
// planning, never real work.
|
|
1791
|
+
const todoOnly = toolCalls.length > 0 && toolCalls.every(t => t.name === 'todo_write');
|
|
1792
|
+
// apply_edit-only iteration detector. Mirrors todoOnly
|
|
1793
|
+
// shape; tracks how many consecutive iterations spent every tool
|
|
1794
|
+
// slot on apply_edit (no read, search, run_command, etc.) so we
|
|
1795
|
+
// can nudge toward batching after the model burns through 4 in a
|
|
1796
|
+
// row. Doesn't fire on mixed iterations (a read + 2 apply_edits
|
|
1797
|
+
// is normal investigative work).
|
|
1798
|
+
const applyEditOnly = toolCalls.length > 0 && toolCalls.every(t => t.name === 'apply_edit');
|
|
1799
|
+
// feed the rolling health window so the iteration-cap
|
|
1800
|
+
// extension below knows whether the model is making clear
|
|
1801
|
+
// progress. We push true ONLY when this iteration produced
|
|
1802
|
+
// tool calls AND wasn't purely a planning churn (todo-only).
|
|
1803
|
+
// Empty iterations (parse failures, prose-only) push false.
|
|
1804
|
+
recentIterationsHadTools.push(toolCalls.length > 0 && !todoOnly);
|
|
1805
|
+
while (recentIterationsHadTools.length > RECENT_HEALTH_WINDOW) {
|
|
1806
|
+
recentIterationsHadTools.shift();
|
|
1807
|
+
}
|
|
1808
|
+
// Iterations that emitted NO tool calls (parse failure — model tried
|
|
1809
|
+
// to generate tool-call JSON that didn't round-trip) are neither
|
|
1810
|
+
// "todo-only" nor "real work." Don't let them reset the consecutive
|
|
1811
|
+
// counter — otherwise a Qwen turn like
|
|
1812
|
+
// iter 3: todo_write
|
|
1813
|
+
// iter 4: (empty — bad JSON)
|
|
1814
|
+
// iter 5: todo_write
|
|
1815
|
+
// iter 6: (empty — bad JSON)
|
|
1816
|
+
// iter 7: todo_write ...
|
|
1817
|
+
// never accumulates to the threshold and the churn nudge never
|
|
1818
|
+
// fires. Observed on S3Api with bandit-logic
|
|
1819
|
+
// (Qwen 2.5 Coder 32B via native tool calling).
|
|
1820
|
+
const iterationHadRealWork = toolCalls.length > 0 && !todoOnly;
|
|
1821
|
+
if (todoOnly) {
|
|
1822
|
+
consecutiveTodoOnlyIterations++;
|
|
1823
|
+
}
|
|
1824
|
+
else if (iterationHadRealWork) {
|
|
1825
|
+
consecutiveTodoOnlyIterations = 0;
|
|
1826
|
+
// Re-arm the nudge once the model has executed real work. Without
|
|
1827
|
+
// this, a single churn early in the turn bans further todo_write
|
|
1828
|
+
// calls even when the model has legitimately finished a step and
|
|
1829
|
+
// wants to mark it completed — leaving the Plan stuck with every
|
|
1830
|
+
// item in the pending state (observed on S3Api).
|
|
1831
|
+
todoChurnNudged = false;
|
|
1832
|
+
}
|
|
1833
|
+
// apply_edit-only streak tracking. Increments only when
|
|
1834
|
+
// the whole iteration was apply_edit; resets on any mixed iter
|
|
1835
|
+
// (read + edit, run + edit, etc.) since those are normal
|
|
1836
|
+
// investigative work, not a serial-error-fix loop.
|
|
1837
|
+
if (applyEditOnly) {
|
|
1838
|
+
consecutiveApplyEditOnlyIterations++;
|
|
1839
|
+
}
|
|
1840
|
+
else if (toolCalls.length > 0) {
|
|
1841
|
+
consecutiveApplyEditOnlyIterations = 0;
|
|
1842
|
+
applyEditBatchNudged = false;
|
|
1843
|
+
}
|
|
1844
|
+
// Else: empty toolCalls iteration — preserve counter state. The
|
|
1845
|
+
// parse-failure case is handled separately below (repeat-detector).
|
|
1846
|
+
if (todoOnly && consecutiveTodoOnlyIterations >= TODO_ONLY_LIMIT && !todoChurnNudged) {
|
|
1847
|
+
todoChurnNudged = true;
|
|
1848
|
+
emit('tool_loop:todo_churn_nudge', {
|
|
1849
|
+
iteration: iterations,
|
|
1850
|
+
consecutive: consecutiveTodoOnlyIterations
|
|
1851
|
+
});
|
|
1852
|
+
// Drop the redundant todo_write calls for this iteration so the
|
|
1853
|
+
// breaker doesn't just get absorbed into another no-op. The model
|
|
1854
|
+
// still "saw" its own todo_write in the assistant response, but
|
|
1855
|
+
// we skip execution and inject a nudge as the next user message.
|
|
1856
|
+
toolCalls = [];
|
|
1857
|
+
messages.push({
|
|
1858
|
+
role: 'user',
|
|
1859
|
+
content: `You have revised the plan in ${consecutiveTodoOnlyIterations + 1} consecutive iterations without executing any step. ` +
|
|
1860
|
+
'Execute the first pending task now using a concrete tool — `search_code`, `read_file`, `apply_edit`, `replace_range`, `write_file`, or `run_command`. ' +
|
|
1861
|
+
'Once a task is actually DONE (tool call succeeded), you may call `todo_write` again to mark it completed — but not to re-plan. ' +
|
|
1862
|
+
'If you cannot identify a next step, respond to the user with a short honest explanation and stop.'
|
|
1863
|
+
});
|
|
1864
|
+
iterations++;
|
|
1865
|
+
continue;
|
|
1866
|
+
}
|
|
1867
|
+
// apply_edit-batch nudge. Fires once per turn when the
|
|
1868
|
+
// model has spent APPLY_EDIT_ONLY_LIMIT (4) consecutive iterations
|
|
1869
|
+
// doing nothing but apply_edit calls. Unlike the todo-churn nudge
|
|
1870
|
+
// we DO NOT drop the current iteration's calls — those edits are
|
|
1871
|
+
// real work, just slow work. We only inject the nudge as an
|
|
1872
|
+
// additional user message so the NEXT iteration considers
|
|
1873
|
+
// batching. Observed on a real 17-error
|
|
1874
|
+
// linter-fix turn that hit the iteration cap with 5 errors still
|
|
1875
|
+
// outstanding.
|
|
1876
|
+
if (applyEditOnly && consecutiveApplyEditOnlyIterations >= APPLY_EDIT_ONLY_LIMIT && !applyEditBatchNudged) {
|
|
1877
|
+
applyEditBatchNudged = true;
|
|
1878
|
+
emit('tool_loop:apply_edit_batch_nudge', {
|
|
1879
|
+
iteration: iterations,
|
|
1880
|
+
consecutive: consecutiveApplyEditOnlyIterations
|
|
1881
|
+
});
|
|
1882
|
+
messages.push({
|
|
1883
|
+
role: 'user',
|
|
1884
|
+
content: `You have spent ${consecutiveApplyEditOnlyIterations} consecutive iterations on apply_edit alone. ` +
|
|
1885
|
+
'If these are mechanical fixes of the same shape (one type annotation, one rename, one import path, one missing semicolon per call), STOP doing them one at a time — you will exhaust the iteration budget before the file is clean.\n' +
|
|
1886
|
+
'\n' +
|
|
1887
|
+
'Better tactics, in order of preference:\n' +
|
|
1888
|
+
'1. **`apply_patch` with multiple hunks** — one tool call lands every fix at once. You\'ve already read the files; the find context is in your buffer.\n' +
|
|
1889
|
+
'2. **`replace_range` for one large same-file region** — use the line numbers from `read_file` and replace the whole method/component block at once.\n' +
|
|
1890
|
+
'3. **A single broader-context `apply_edit`** — pick a `find` string that spans several adjacent edits and supply the corrected block as `replace`. Three small fixes in the same 10-line region collapse to one call.\n' +
|
|
1891
|
+
'4. **For 5+ fixes in one file**: re-read the file once, then `write_file` the corrected version. Faster than incrementally patching.\n' +
|
|
1892
|
+
'\n' +
|
|
1893
|
+
'Pick a tactic and reach for it next iteration. Do not just emit another single-line apply_edit.'
|
|
1894
|
+
});
|
|
1895
|
+
iterations++;
|
|
1896
|
+
continue;
|
|
1897
|
+
}
|
|
1898
|
+
// Intra-iteration normalization: byte-identical dedup, foreground-
|
|
1899
|
+
// task fanout cap, per-iteration parallel cap, per-turn total cap.
|
|
1900
|
+
// Each step emits its own telemetry event so hosts can surface
|
|
1901
|
+
// drops in the UI. See loop/toolCallNormalize.ts.
|
|
1902
|
+
const normalized = (0, toolCallNormalize_1.normalizeToolCallBatch)({
|
|
1903
|
+
toolCalls,
|
|
1904
|
+
iteration: iterations,
|
|
1905
|
+
maxParallelTools,
|
|
1906
|
+
maxTotalTools,
|
|
1907
|
+
totalToolsExecuted,
|
|
1908
|
+
emit
|
|
1909
|
+
});
|
|
1910
|
+
toolCalls = normalized.accepted;
|
|
1911
|
+
const droppedForegroundTaskCalls = normalized.droppedForegroundTaskCalls;
|
|
1912
|
+
const droppedToolCalls = normalized.droppedParallelCap;
|
|
1913
|
+
totalToolsExecuted += toolCalls.length;
|
|
1914
|
+
// Per-tool execution — repeat-breaker, registry lookup,
|
|
1915
|
+
// beforeToolExecute gate, run, file-tracking + edit counting,
|
|
1916
|
+
// event emission. See loop/singleToolExecute.ts.
|
|
1917
|
+
const dispatchOne = (0, singleToolExecute_1.createToolDispatcher)({
|
|
1918
|
+
registry: this.registry,
|
|
1919
|
+
ctx: this.ctx,
|
|
1920
|
+
beforeToolExecute,
|
|
1921
|
+
emit,
|
|
1922
|
+
recentCallKeys,
|
|
1923
|
+
repeatLimit: REPEAT_LIMIT,
|
|
1924
|
+
filesReadThisTurn,
|
|
1925
|
+
filesWrittenThisTurn,
|
|
1926
|
+
isFileEditTool,
|
|
1927
|
+
onEditToolSucceeded: () => { editToolsInvoked++; }
|
|
1928
|
+
});
|
|
1929
|
+
// Output-budget gate + parallel/serial dispatch. Strong models
|
|
1930
|
+
// pass `outputBudgetTokens: Infinity` and never serialise;
|
|
1931
|
+
// small/medium local models trip the gate exactly when their
|
|
1932
|
+
// assistant turn is at risk of tail malformation. See
|
|
1933
|
+
// loop/parallelExecute.ts.
|
|
1934
|
+
const toolResults = await (0, parallelExecute_1.executeParallelBatch)({
|
|
1935
|
+
toolCalls,
|
|
1936
|
+
dispatchOne,
|
|
1937
|
+
outputBudgetTokens,
|
|
1938
|
+
outputBudgetRatio,
|
|
1939
|
+
emit,
|
|
1940
|
+
iteration: iterations,
|
|
1941
|
+
signal
|
|
1942
|
+
});
|
|
1943
|
+
// Track whether ANY tool errored this iteration so the next
|
|
1944
|
+
// iteration's no-tool-call branch can fire the recovery nudge if
|
|
1945
|
+
// the model abandons the request rather than retrying.
|
|
1946
|
+
lastIterationHadToolError = toolResults.some((r) => r.isError === true);
|
|
1947
|
+
// Inject tool results as the next user message.
|
|
1948
|
+
let resultsMessage = (0, tool_use_parser_1.buildToolResultsMessage)(toolResults);
|
|
1949
|
+
if (droppedToolCalls > 0) {
|
|
1950
|
+
// Synthetic system-style note appended to the tool-result payload.
|
|
1951
|
+
// Keeps the model from re-emitting the dropped calls verbatim on
|
|
1952
|
+
// the next iteration: it sees "X were dropped, narrow your query"
|
|
1953
|
+
// alongside the results from the kept calls.
|
|
1954
|
+
resultsMessage +=
|
|
1955
|
+
`\n\n[Note: you emitted ${droppedToolCalls + toolCalls.length} tool calls in one iteration; ` +
|
|
1956
|
+
`only the first ${toolCalls.length} were executed. Do not re-issue duplicates — ` +
|
|
1957
|
+
`instead, read the results above and pick a single most-promising next action.]`;
|
|
1958
|
+
}
|
|
1959
|
+
if (droppedForegroundTaskCalls > 0) {
|
|
1960
|
+
resultsMessage +=
|
|
1961
|
+
`\n\n[Note: you emitted ${droppedForegroundTaskCalls + 1} foreground task subagents in one iteration; ` +
|
|
1962
|
+
`only the first one was executed. Foreground subagents block the parent agent and make the UI look stuck. ` +
|
|
1963
|
+
`For repo overviews, synthesize from direct reads/searches first. For truly parallel audits, re-issue extra ` +
|
|
1964
|
+
`subagents with run_in_background="true" so the parent can keep responding.]`;
|
|
1965
|
+
}
|
|
1966
|
+
messages.push({ role: 'user', content: resultsMessage });
|
|
1967
|
+
// Fired-and-forgotten guard. The model just spawned ≥2 background
|
|
1968
|
+
// subagents in this iteration. Without a nudge, the next iteration
|
|
1969
|
+
// typically polls `check_task` on tasks that haven't started (a
|
|
1970
|
+
// wasted iteration) or replays the same exploration in parallel —
|
|
1971
|
+
// either way burning the parent's context budget on work the
|
|
1972
|
+
// subagents will report back via the auto-inject path. See the
|
|
1973
|
+
// `firedAndForgottenNudged` declaration for the trace this is
|
|
1974
|
+
// patterned on. One nudge per turn.
|
|
1975
|
+
if (!firedAndForgottenNudged) {
|
|
1976
|
+
const bgSpawns = toolCalls.filter((tc, idx) => tc.name === 'task' &&
|
|
1977
|
+
String(tc.params.run_in_background ?? '').toLowerCase() === 'true' &&
|
|
1978
|
+
// Only count successful spawns — a failed task tool result is
|
|
1979
|
+
// its own signal and the parent's already going to retry or
|
|
1980
|
+
// pivot.
|
|
1981
|
+
!toolResults[idx]?.isError);
|
|
1982
|
+
if (bgSpawns.length >= 2) {
|
|
1983
|
+
firedAndForgottenNudged = true;
|
|
1984
|
+
const goalLines = bgSpawns
|
|
1985
|
+
.map((tc) => {
|
|
1986
|
+
const g = typeof tc.params.goal === 'string' ? tc.params.goal : '';
|
|
1987
|
+
const trimmed = g.length > 90 ? g.slice(0, 90).trimEnd() + '…' : g;
|
|
1988
|
+
return trimmed ? `- ${trimmed}` : '';
|
|
1989
|
+
})
|
|
1990
|
+
.filter(Boolean)
|
|
1991
|
+
.join('\n');
|
|
1992
|
+
emit('tool_loop:fired_and_forgotten_nudge', {
|
|
1993
|
+
iteration: iterations,
|
|
1994
|
+
backgroundSpawns: bgSpawns.length
|
|
1995
|
+
});
|
|
1996
|
+
messages.push({
|
|
1997
|
+
role: 'user',
|
|
1998
|
+
content: `You just spawned ${bgSpawns.length} background subagents:\n${goalLines}\n\n` +
|
|
1999
|
+
'Do NOT do those same explorations yourself in the next iteration — the subagents will deliver their synopses via the auto-inject path on a later turn. ' +
|
|
2000
|
+
'Choose ONE of: ' +
|
|
2001
|
+
'(a) work on a different, independent piece of the task that those subagents are NOT covering, ' +
|
|
2002
|
+
'(b) terminate this turn now and wait for the synopses to land on the next turn — preferred when the user is waiting on a synthesis built from those subagent results, ' +
|
|
2003
|
+
'(c) call `check_task` once on a specific id only when its result is the literal next blocking input you need. ' +
|
|
2004
|
+
'Do not poll all tasks at once immediately after spawning — they have not started yet and the call returns "still running" for every one of them.'
|
|
2005
|
+
});
|
|
2006
|
+
}
|
|
2007
|
+
}
|
|
2008
|
+
// Todo-progress tracking for the stale-plan nudge. Reset the edit
|
|
2009
|
+
// counter on any todo_write call (model updated its plan); increment
|
|
2010
|
+
// on successful edit calls. Native-tools-capable models generally
|
|
2011
|
+
// maintain plans without prompting so we skip the tracking there.
|
|
2012
|
+
if (!nativeTools) {
|
|
2013
|
+
for (let t = 0; t < toolCalls.length; t++) {
|
|
2014
|
+
const tc = toolCalls[t];
|
|
2015
|
+
const res = toolResults[t];
|
|
2016
|
+
if (tc.name === 'todo_write') {
|
|
2017
|
+
lastTodoWriteIter = iterations;
|
|
2018
|
+
editsSinceLastTodo = 0;
|
|
2019
|
+
}
|
|
2020
|
+
else if (isFileEditTool(tc.name) && res && !res.isError) {
|
|
2021
|
+
editsSinceLastTodo++;
|
|
2022
|
+
}
|
|
2023
|
+
}
|
|
2024
|
+
// One-shot stale-plan nudge: the model set up a plan earlier but
|
|
2025
|
+
// has since completed multiple edits without updating it. Fires
|
|
2026
|
+
// at most once per turn — if the model ignores it, we don't hound.
|
|
2027
|
+
if (!todoProgressNudged
|
|
2028
|
+
&& lastTodoWriteIter >= 0
|
|
2029
|
+
&& iterations - lastTodoWriteIter >= TODO_PROGRESS_STALE_DELTA
|
|
2030
|
+
&& editsSinceLastTodo >= TODO_PROGRESS_EDIT_THRESHOLD) {
|
|
2031
|
+
todoProgressNudged = true;
|
|
2032
|
+
emit('tool_loop:todo_progress_nudge', {
|
|
2033
|
+
iteration: iterations,
|
|
2034
|
+
editsSinceLastTodo,
|
|
2035
|
+
iterationsSinceLastTodo: iterations - lastTodoWriteIter
|
|
2036
|
+
});
|
|
2037
|
+
messages.push({
|
|
2038
|
+
role: 'user',
|
|
2039
|
+
content: 'You set up a plan with `todo_write` earlier but have since completed ' +
|
|
2040
|
+
`${editsSinceLastTodo} edit${editsSinceLastTodo === 1 ? '' : 's'} without updating it. ` +
|
|
2041
|
+
'Call `todo_write` now with the current status — mark finished items as `completed` and leave remaining items as `pending`. ' +
|
|
2042
|
+
"The Plan block in the user's UI mirrors your last `todo_write`, so skipping this leaves them looking at a stale checklist while real work has landed."
|
|
2043
|
+
});
|
|
2044
|
+
}
|
|
2045
|
+
}
|
|
2046
|
+
iterations++;
|
|
2047
|
+
}
|
|
2048
|
+
}
|
|
2049
|
+
}
|
|
2050
|
+
// Expose the ToolUseLoop constructor on this module's `exports` object.
exports.ToolUseLoop = ToolUseLoop;
|
|
2051
|
+
/**
 * Factory shorthand for building a tool-use loop.
 *
 * Forwards all three arguments, unchanged and in order, to the
 * `ToolUseLoop` constructor and hands back the new instance.
 *
 * @param registry - passed through as the first constructor argument.
 * @param ctx - passed through as the second constructor argument.
 * @param options - passed through as the third constructor argument.
 * @returns the newly constructed `ToolUseLoop` instance.
 */
function createToolUseLoop(registry, ctx, options) {
    const loop = new ToolUseLoop(registry, ctx, options);
    return loop;
}
|
|
2057
|
+
//# sourceMappingURL=tool-use-loop.js.map
|