@burtson-labs/agent-core 1.6.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (195) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +88 -0
  3. package/dist/index.d.ts +16 -0
  4. package/dist/index.d.ts.map +1 -0
  5. package/dist/index.js +52 -0
  6. package/dist/index.js.map +1 -0
  7. package/dist/mcp/activation.d.ts +60 -0
  8. package/dist/mcp/activation.d.ts.map +1 -0
  9. package/dist/mcp/activation.js +139 -0
  10. package/dist/mcp/activation.js.map +1 -0
  11. package/dist/mcp/clientPool.d.ts +202 -0
  12. package/dist/mcp/clientPool.d.ts.map +1 -0
  13. package/dist/mcp/clientPool.js +469 -0
  14. package/dist/mcp/clientPool.js.map +1 -0
  15. package/dist/mcp/index.d.ts +18 -0
  16. package/dist/mcp/index.d.ts.map +1 -0
  17. package/dist/mcp/index.js +28 -0
  18. package/dist/mcp/index.js.map +1 -0
  19. package/dist/mcp/server.d.ts +43 -0
  20. package/dist/mcp/server.d.ts.map +1 -0
  21. package/dist/mcp/server.js +130 -0
  22. package/dist/mcp/server.js.map +1 -0
  23. package/dist/mcp/toolAdapter.d.ts +57 -0
  24. package/dist/mcp/toolAdapter.d.ts.map +1 -0
  25. package/dist/mcp/toolAdapter.js +223 -0
  26. package/dist/mcp/toolAdapter.js.map +1 -0
  27. package/dist/mcp/types.d.ts +122 -0
  28. package/dist/mcp/types.d.ts.map +1 -0
  29. package/dist/mcp/types.js +15 -0
  30. package/dist/mcp/types.js.map +1 -0
  31. package/dist/providers/deterministic-provider.d.ts +21 -0
  32. package/dist/providers/deterministic-provider.d.ts.map +1 -0
  33. package/dist/providers/deterministic-provider.js +80 -0
  34. package/dist/providers/deterministic-provider.js.map +1 -0
  35. package/dist/providers/provider-client.d.ts +12 -0
  36. package/dist/providers/provider-client.d.ts.map +1 -0
  37. package/dist/providers/provider-client.js +11 -0
  38. package/dist/providers/provider-client.js.map +1 -0
  39. package/dist/runtime/AgentRuntime.d.ts +67 -0
  40. package/dist/runtime/AgentRuntime.d.ts.map +1 -0
  41. package/dist/runtime/AgentRuntime.js +382 -0
  42. package/dist/runtime/AgentRuntime.js.map +1 -0
  43. package/dist/security/secretPatterns.d.ts +76 -0
  44. package/dist/security/secretPatterns.d.ts.map +1 -0
  45. package/dist/security/secretPatterns.js +290 -0
  46. package/dist/security/secretPatterns.js.map +1 -0
  47. package/dist/tools/ask-user-tool.d.ts +19 -0
  48. package/dist/tools/ask-user-tool.d.ts.map +1 -0
  49. package/dist/tools/ask-user-tool.js +148 -0
  50. package/dist/tools/ask-user-tool.js.map +1 -0
  51. package/dist/tools/compactMessages.d.ts +52 -0
  52. package/dist/tools/compactMessages.d.ts.map +1 -0
  53. package/dist/tools/compactMessages.js +158 -0
  54. package/dist/tools/compactMessages.js.map +1 -0
  55. package/dist/tools/core-tools.d.ts +29 -0
  56. package/dist/tools/core-tools.d.ts.map +1 -0
  57. package/dist/tools/core-tools.js +2214 -0
  58. package/dist/tools/core-tools.js.map +1 -0
  59. package/dist/tools/git-tools.d.ts +32 -0
  60. package/dist/tools/git-tools.d.ts.map +1 -0
  61. package/dist/tools/git-tools.js +330 -0
  62. package/dist/tools/git-tools.js.map +1 -0
  63. package/dist/tools/index.d.ts +15 -0
  64. package/dist/tools/index.d.ts.map +1 -0
  65. package/dist/tools/index.js +31 -0
  66. package/dist/tools/index.js.map +1 -0
  67. package/dist/tools/language-adapters.d.ts +48 -0
  68. package/dist/tools/language-adapters.d.ts.map +1 -0
  69. package/dist/tools/language-adapters.js +299 -0
  70. package/dist/tools/language-adapters.js.map +1 -0
  71. package/dist/tools/loop/compactionTrigger.d.ts +47 -0
  72. package/dist/tools/loop/compactionTrigger.d.ts.map +1 -0
  73. package/dist/tools/loop/compactionTrigger.js +32 -0
  74. package/dist/tools/loop/compactionTrigger.js.map +1 -0
  75. package/dist/tools/loop/finalAnswerNudges.d.ts +68 -0
  76. package/dist/tools/loop/finalAnswerNudges.d.ts.map +1 -0
  77. package/dist/tools/loop/finalAnswerNudges.js +87 -0
  78. package/dist/tools/loop/finalAnswerNudges.js.map +1 -0
  79. package/dist/tools/loop/goalAnchor.d.ts +72 -0
  80. package/dist/tools/loop/goalAnchor.d.ts.map +1 -0
  81. package/dist/tools/loop/goalAnchor.js +76 -0
  82. package/dist/tools/loop/goalAnchor.js.map +1 -0
  83. package/dist/tools/loop/llmStream.d.ts +70 -0
  84. package/dist/tools/loop/llmStream.d.ts.map +1 -0
  85. package/dist/tools/loop/llmStream.js +181 -0
  86. package/dist/tools/loop/llmStream.js.map +1 -0
  87. package/dist/tools/loop/parallelExecute.d.ts +57 -0
  88. package/dist/tools/loop/parallelExecute.d.ts.map +1 -0
  89. package/dist/tools/loop/parallelExecute.js +54 -0
  90. package/dist/tools/loop/parallelExecute.js.map +1 -0
  91. package/dist/tools/loop/singleToolExecute.d.ts +71 -0
  92. package/dist/tools/loop/singleToolExecute.d.ts.map +1 -0
  93. package/dist/tools/loop/singleToolExecute.js +139 -0
  94. package/dist/tools/loop/singleToolExecute.js.map +1 -0
  95. package/dist/tools/loop/toolCallNormalize.d.ts +57 -0
  96. package/dist/tools/loop/toolCallNormalize.d.ts.map +1 -0
  97. package/dist/tools/loop/toolCallNormalize.js +99 -0
  98. package/dist/tools/loop/toolCallNormalize.js.map +1 -0
  99. package/dist/tools/loop/turnSetup.d.ts +43 -0
  100. package/dist/tools/loop/turnSetup.d.ts.map +1 -0
  101. package/dist/tools/loop/turnSetup.js +48 -0
  102. package/dist/tools/loop/turnSetup.js.map +1 -0
  103. package/dist/tools/ocr.d.ts +52 -0
  104. package/dist/tools/ocr.d.ts.map +1 -0
  105. package/dist/tools/ocr.js +238 -0
  106. package/dist/tools/ocr.js.map +1 -0
  107. package/dist/tools/post-edit-checks.d.ts +46 -0
  108. package/dist/tools/post-edit-checks.d.ts.map +1 -0
  109. package/dist/tools/post-edit-checks.js +236 -0
  110. package/dist/tools/post-edit-checks.js.map +1 -0
  111. package/dist/tools/skill-loader.d.ts +94 -0
  112. package/dist/tools/skill-loader.d.ts.map +1 -0
  113. package/dist/tools/skill-loader.js +422 -0
  114. package/dist/tools/skill-loader.js.map +1 -0
  115. package/dist/tools/skill-registry.d.ts +44 -0
  116. package/dist/tools/skill-registry.d.ts.map +1 -0
  117. package/dist/tools/skill-registry.js +118 -0
  118. package/dist/tools/skill-registry.js.map +1 -0
  119. package/dist/tools/skill-types.d.ts +38 -0
  120. package/dist/tools/skill-types.d.ts.map +1 -0
  121. package/dist/tools/skill-types.js +10 -0
  122. package/dist/tools/skill-types.js.map +1 -0
  123. package/dist/tools/skills/code-review-skill.d.ts +9 -0
  124. package/dist/tools/skills/code-review-skill.d.ts.map +1 -0
  125. package/dist/tools/skills/code-review-skill.js +66 -0
  126. package/dist/tools/skills/code-review-skill.js.map +1 -0
  127. package/dist/tools/skills/core-skill.d.ts +13 -0
  128. package/dist/tools/skills/core-skill.d.ts.map +1 -0
  129. package/dist/tools/skills/core-skill.js +23 -0
  130. package/dist/tools/skills/core-skill.js.map +1 -0
  131. package/dist/tools/skills/git-skill.d.ts +10 -0
  132. package/dist/tools/skills/git-skill.d.ts.map +1 -0
  133. package/dist/tools/skills/git-skill.js +30 -0
  134. package/dist/tools/skills/git-skill.js.map +1 -0
  135. package/dist/tools/skills/index.d.ts +17 -0
  136. package/dist/tools/skills/index.d.ts.map +1 -0
  137. package/dist/tools/skills/index.js +49 -0
  138. package/dist/tools/skills/index.js.map +1 -0
  139. package/dist/tools/skills/interaction-skill.d.ts +14 -0
  140. package/dist/tools/skills/interaction-skill.d.ts.map +1 -0
  141. package/dist/tools/skills/interaction-skill.js +24 -0
  142. package/dist/tools/skills/interaction-skill.js.map +1 -0
  143. package/dist/tools/skills/mail-search-skill.d.ts +25 -0
  144. package/dist/tools/skills/mail-search-skill.d.ts.map +1 -0
  145. package/dist/tools/skills/mail-search-skill.js +343 -0
  146. package/dist/tools/skills/mail-search-skill.js.map +1 -0
  147. package/dist/tools/skills/plan-skill.d.ts +10 -0
  148. package/dist/tools/skills/plan-skill.d.ts.map +1 -0
  149. package/dist/tools/skills/plan-skill.js +126 -0
  150. package/dist/tools/skills/plan-skill.js.map +1 -0
  151. package/dist/tools/skills/semantic-search-skill.d.ts +22 -0
  152. package/dist/tools/skills/semantic-search-skill.d.ts.map +1 -0
  153. package/dist/tools/skills/semantic-search-skill.js +244 -0
  154. package/dist/tools/skills/semantic-search-skill.js.map +1 -0
  155. package/dist/tools/skills/test-gen-skill.d.ts +9 -0
  156. package/dist/tools/skills/test-gen-skill.d.ts.map +1 -0
  157. package/dist/tools/skills/test-gen-skill.js +123 -0
  158. package/dist/tools/skills/test-gen-skill.js.map +1 -0
  159. package/dist/tools/tool-registry.d.ts +60 -0
  160. package/dist/tools/tool-registry.d.ts.map +1 -0
  161. package/dist/tools/tool-registry.js +200 -0
  162. package/dist/tools/tool-registry.js.map +1 -0
  163. package/dist/tools/tool-types.d.ts +281 -0
  164. package/dist/tools/tool-types.d.ts.map +1 -0
  165. package/dist/tools/tool-types.js +10 -0
  166. package/dist/tools/tool-types.js.map +1 -0
  167. package/dist/tools/tool-use-loop.d.ts +231 -0
  168. package/dist/tools/tool-use-loop.d.ts.map +1 -0
  169. package/dist/tools/tool-use-loop.js +2057 -0
  170. package/dist/tools/tool-use-loop.js.map +1 -0
  171. package/dist/tools/tool-use-parser.d.ts +78 -0
  172. package/dist/tools/tool-use-parser.d.ts.map +1 -0
  173. package/dist/tools/tool-use-parser.js +427 -0
  174. package/dist/tools/tool-use-parser.js.map +1 -0
  175. package/dist/tools/toolAvailabilityDetector.d.ts +48 -0
  176. package/dist/tools/toolAvailabilityDetector.d.ts.map +1 -0
  177. package/dist/tools/toolAvailabilityDetector.js +156 -0
  178. package/dist/tools/toolAvailabilityDetector.js.map +1 -0
  179. package/dist/tools/unified-patch.d.ts +87 -0
  180. package/dist/tools/unified-patch.d.ts.map +1 -0
  181. package/dist/tools/unified-patch.js +217 -0
  182. package/dist/tools/unified-patch.js.map +1 -0
  183. package/dist/types/agent.d.ts +69 -0
  184. package/dist/types/agent.d.ts.map +1 -0
  185. package/dist/types/agent.js +54 -0
  186. package/dist/types/agent.js.map +1 -0
  187. package/dist/types/tasks.d.ts +22 -0
  188. package/dist/types/tasks.d.ts.map +1 -0
  189. package/dist/types/tasks.js +3 -0
  190. package/dist/types/tasks.js.map +1 -0
  191. package/dist/utils/event-emitter.d.ts +13 -0
  192. package/dist/utils/event-emitter.d.ts.map +1 -0
  193. package/dist/utils/event-emitter.js +54 -0
  194. package/dist/utils/event-emitter.js.map +1 -0
  195. package/package.json +33 -0
@@ -0,0 +1,2057 @@
1
+ "use strict";
2
+ /**
3
+ * Text-based tool use execution loop.
4
+ *
5
+ * Implements the observe → act → replan cycle for models that don't support
6
+ * native function calling (gemma3, bandit-core, qwen2.5-coder, etc.).
7
+ *
8
+ * Flow:
9
+ * 1. Build messages with tool definitions in system prompt
10
+ * 2. Stream response from LLM, aggregate full text
11
+ * 3. Parse <tool_call> blocks
12
+ * 4. Execute tools via ToolExecutionContext
13
+ * 5. Inject <tool_result> blocks as next user message
14
+ * 6. Repeat from step 2 until no tool calls, or max iterations reached
15
+ * 7. Return final model response (the one with no tool calls)
16
+ *
17
+ * For models WITH native tool calling (qwen2.5-coder:32b, llama3.1),
18
+ * the host should use the Ollama `tools: [...]` field instead.
19
+ */
20
+ Object.defineProperty(exports, "__esModule", { value: true });
21
+ exports.ToolUseLoop = void 0;
22
+ exports.sleep = sleep;
23
+ exports.isRetryableLlmError = isRetryableLlmError;
24
+ exports.tagRetryableLlmError = tagRetryableLlmError;
25
+ exports.summarizeLlmError = summarizeLlmError;
26
+ exports.isContinuationPrompt = isContinuationPrompt;
27
+ exports.isNoticingPrompt = isNoticingPrompt;
28
+ exports.createToolUseLoop = createToolUseLoop;
29
+ const tool_use_parser_1 = require("./tool-use-parser");
30
+ const toolCallNormalize_1 = require("./loop/toolCallNormalize");
31
+ const singleToolExecute_1 = require("./loop/singleToolExecute");
32
+ const turnSetup_1 = require("./loop/turnSetup");
33
+ const llmStream_1 = require("./loop/llmStream");
34
+ const compactionTrigger_1 = require("./loop/compactionTrigger");
35
+ const parallelExecute_1 = require("./loop/parallelExecute");
36
+ const goalAnchor_1 = require("./loop/goalAnchor");
37
+ const finalAnswerNudges_1 = require("./loop/finalAnswerNudges");
38
+ const toolAvailabilityDetector_1 = require("./toolAvailabilityDetector");
39
/** Tool names that this module classifies as file-edit tools. */
const FILE_EDIT_TOOL_NAMES = new Set([
    'write_file',
    'apply_edit',
    'replace_range',
    'apply_patch',
]);

/**
 * Reports whether a tool name belongs to the file-edit tool set.
 *
 * @param {string} name - Tool name to look up.
 * @returns {boolean} `true` when `name` is a member of FILE_EDIT_TOOL_NAMES.
 */
function isFileEditTool(name) {
    const isEditTool = FILE_EDIT_TOOL_NAMES.has(name);
    return isEditTool;
}
43
/**
 * Creates a promise that settles after a timer of `ms` milliseconds fires.
 *
 * @param {number} ms - Delay handed to `setTimeout`.
 * @returns {Promise<void>} Resolves with no value once the timer fires.
 */
function sleep(ms) {
    return new Promise((wake) => {
        setTimeout(wake, ms);
    });
}
46
/**
 * Reads the `code` property of an error-like value as a string.
 *
 * @param {unknown} error - Any thrown value.
 * @returns {string | undefined} The stringified `code` (an empty string when
 *   the property exists but is null/undefined), or `undefined` when `error`
 *   is not a non-null object carrying a `code` property.
 */
function getErrorCode(error) {
    if (error === null || typeof error !== 'object') {
        return undefined;
    }
    if (!('code' in error)) {
        return undefined;
    }
    return String(error.code ?? '');
}
50
+ }
51
/**
 * Extracts a human-readable message from any thrown value.
 *
 * @param {unknown} error - Any thrown value.
 * @returns {string} `error.message` for Error instances, otherwise the
 *   result of `String(error)`.
 */
function getErrorMessage(error) {
    if (error instanceof Error) {
        return error.message;
    }
    return String(error);
}
54
/**
 * Decides whether a failed LLM call should be retried.
 *
 * Never retryable: errors whose code is `USER_ABORT`, and errors whose
 * message mentions `429` or "rate limit". Retryable: the `WATCHDOG` code,
 * or a message containing a standalone 5xx number, the upstream-failure
 * phrase, or one of the listed network/transport failure markers.
 *
 * @param {unknown} error - Any thrown value.
 * @returns {boolean} `true` when the error matches a retryable signal.
 */
function isRetryableLlmError(error) {
    const errorCode = getErrorCode(error);
    if (errorCode === 'USER_ABORT') {
        return false;
    }
    const text = getErrorMessage(error);
    const isRateLimited = /\b429\b|rate limit/i.test(text);
    if (isRateLimited) {
        return false;
    }
    if (errorCode === 'WATCHDOG') {
        return true;
    }
    const retryableMessagePatterns = [
        /\b5\d\d\b/,
        /Upstream model request failed/i,
        /ECONNREFUSED|ECONNRESET|ETIMEDOUT|EAI_AGAIN|socket hang up|fetch failed|network error|terminated|UND_ERR/i,
    ];
    return retryableMessagePatterns.some((pattern) => pattern.test(text));
}
66
/**
 * Stamps an Error with the `UPSTREAM_MODEL` code when it has no truthy
 * `code` yet. Non-Error values and errors that already carry a truthy code
 * are left untouched.
 *
 * @param {unknown} error - Any thrown value; mutated in place when tagged.
 * @returns {void}
 */
function tagRetryableLlmError(error) {
    if (!(error instanceof Error)) {
        return;
    }
    if (error.code) {
        return;
    }
    error.code = 'UPSTREAM_MODEL';
}
73
/**
 * Produces a single-line summary of an error message, at most 180
 * characters long.
 *
 * Whitespace runs are collapsed to one space and the ends are trimmed.
 * Longer messages are cut to 177 characters and suffixed with `...`.
 *
 * @param {unknown} error - Any thrown value.
 * @returns {string} The flattened, possibly truncated message.
 */
function summarizeLlmError(error) {
    const MAX_LENGTH = 180;
    const flattened = getErrorMessage(error).replace(/\s+/g, ' ').trim();
    if (flattened.length <= MAX_LENGTH) {
        return flattened;
    }
    return `${flattened.slice(0, MAX_LENGTH - 3)}...`;
}
77
+ /**
78
+ * Detects "keep going" / "continue" / "yes" style prompts that
79
+ * carry no real goal content. The goal-anchor block uses the most recent
80
+ * user message as the recall text; when that text is "good lets keep
81
+ * going" the anchor degenerates into "remind yourself to keep going",
82
+ * which gives the model nothing to anchor on after 20 iterations of
83
+ * drift. Observed on a 60-iteration linter-fix
84
+ * turn: every anchor injection cited "good lets keep going" as the
85
+ * goal. Detector lets callers walk back to a prior substantive prompt
86
+ * instead.
87
+ *
88
+ * Length cap (60 chars) + normalized-phrase match keeps false positives
89
+ * down — a sentence like "keep going on the auth refactor for the
90
+ * user-service" is under the cap at 52 chars but is not a whole-message
91
+ * match for any listed phrase, so it stays a goal.
92
+ */
93
const CONTINUATION_PROMPT_PHRASES = new Set([
    'continue', 'keep going', 'go on', 'proceed', 'next', 'more',
    'please continue', 'carry on', 'finish', 'finish it', 'finish up', 'wrap up', 'wrap it up',
    'good', 'great', 'nice', 'cool', 'sweet', 'perfect', 'ok', 'okay', 'k', 'yes', 'y', 'yep', 'yeah', 'ack', 'done',
    "let's continue", 'lets continue', "let's keep going", 'lets keep going',
    'good keep going', 'good lets keep going', "good let's keep going",
    'good continue', 'ok continue', 'okay continue'
]);

/**
 * Reports whether `text` is a content-free continuation prompt
 * ("keep going", "yes", "ok continue", …).
 *
 * The whole normalized message must equal a phrase in
 * CONTINUATION_PROMPT_PHRASES, optionally wrapped as "please <phrase>" or
 * "<phrase> please". Messages that are empty or longer than 60 characters
 * after trimming never match.
 *
 * @param {unknown} text - Candidate user message. Non-string values are
 *   treated as "not a continuation prompt" rather than throwing.
 * @returns {boolean} `true` when the message is a continuation prompt.
 */
function isContinuationPrompt(text) {
    if (typeof text !== 'string') {
        return false;
    }
    const trimmed = text.trim();
    if (trimmed.length === 0 || trimmed.length > 60) {
        return false;
    }
    // Normalize: lowercase, fold typographic apostrophes to ASCII so
    // "let’s" matches the listed "let's", replace every character that is
    // not a word character, whitespace, or apostrophe with a space, then
    // collapse whitespace.
    const norm = trimmed
        .toLowerCase()
        .replace(/[\u2018\u2019]/g, "'")
        .replace(/[^\w\s']/g, ' ')
        .replace(/\s+/g, ' ')
        .trim();
    if (CONTINUATION_PROMPT_PHRASES.has(norm)) {
        return true;
    }
    // Permit "please <phrase>" and "<phrase> please" wrappings by stripping
    // the wrapper and looking the remainder up directly.
    const PLEASE_PREFIX = 'please ';
    const PLEASE_SUFFIX = ' please';
    if (norm.startsWith(PLEASE_PREFIX)
        && CONTINUATION_PROMPT_PHRASES.has(norm.slice(PLEASE_PREFIX.length))) {
        return true;
    }
    return norm.endsWith(PLEASE_SUFFIX)
        && CONTINUATION_PROMPT_PHRASES.has(norm.slice(0, -PLEASE_SUFFIX.length));
}
120
+ /**
121
+ * "Noticing prompt" detector. Catches user messages that are asking
122
+ * about state ("are we using these?", "did you update X?", "where's
123
+ * the…?", "isn't Y supposed to be…?") rather than requesting new
124
+ * work. These signal that the user spotted a gap in the prior turn
125
+ * and wants the agent to address it — NOT continue the prior plan.
126
+ *
127
+ * Real failure mode captured 2026-05-25 on a Portfolio React refactor:
128
+ * user asked "I dont think we actually are using these new files are
129
+ * we?" after the agent wrote data files but never wired them into
130
+ * App.jsx. Bandit read the question as a generic "keep going" prompt,
131
+ * wrote 5 MORE new component files, still didn't touch App.jsx. The
132
+ * pivot signal was right there in the prompt shape and got missed.
133
+ *
134
+ * The check is conservative: short prompts only, must START with a
135
+ * recognizable question/concern stem (so "are we using X?" matches but
136
+ * "is this the right approach to X" does not), and capped at
137
+ * 220 chars since longer messages usually contain a real request
138
+ * rather than a pure noticing question.
139
+ */
140
function isNoticingPrompt(text) {
    // Contract: returns true only when `text` is a string of 1-220 trimmed
    // characters that (a) starts with one of the noticing stems below and
    // (b) contains a question mark or a concern word. Any non-string input
    // (null, undefined, numbers, arrays of content parts) yields false
    // instead of throwing.
    if (typeof text !== 'string') {
        return false;
    }
    const trimmed = text.trim();
    if (trimmed.length === 0 || trimmed.length > 220) {
        return false;
    }
    // Fold typographic apostrophes to ASCII before matching so "isn’t" and
    // "don’t" hit the same stems as "isn't" and "don't"; then keep only
    // word characters, whitespace, apostrophes, '?' and '-'.
    const norm = trimmed
        .toLowerCase()
        .replace(/[\u2018\u2019]/g, "'")
        .replace(/[^\w\s'?-]/g, ' ')
        .replace(/\s+/g, ' ')
        .trim();
    // Stems that introduce a noticing/clarifying question. Anchored to
    // the start of the message so a paragraph mentioning "are we"
    // mid-text doesn't false-positive.
    const STEMS = [
        /^(?:i\s+)?(?:dont|don't|do\s+not)\s+(?:think|see)\s/, // "I dont think…", "I don't see…"
        /^are\s+we\s/, // "are we using…"
        /^did\s+(?:you|we)\s/, // "did you remember to…", "did you miss/forget/skip…"
        /^didn't\s+(?:you|we)\s/, // "didn't you say…"
        /^isn'?t\s+(?:this|that|it|there)\s/, // "isn't this missing…"
        /^shouldn'?t\s+(?:this|that|it|there|we)\s/, // "shouldn't we…"
        /^why\s+(?:didn'?t|isn'?t|aren'?t|doesn'?t|don'?t)\s/, // "why isn't X happening"
        /^where(?:'s|\s+is|\s+are|\s+did)\s/, // "where is the import", "where's the …"
        /^what\s+(?:about|happened\s+to)\s/, // "what about App.jsx"
        /^(?:i\s+thought\s+)?you\s+(?:said|were|are)\s+(?:supposed|going|gonna)/,
        /^this\s+doesn'?t\s/, // "this doesn't look right"
        /^that\s+doesn'?t\s/,
        /^hmm\b|^huh\b/,
        /^wait\b/, // "wait — what about Y?"
        /^(?:i'?m|am\s+i)\s+(?:missing|seeing|reading)\b/,
    ];
    if (!STEMS.some((re) => re.test(norm))) {
        return false;
    }
    // A stem alone is not enough: the message must also contain a question
    // mark OR a concern word, which filters out feature requests that
    // merely open with one of the stems.
    const hasQuestion = trimmed.includes('?');
    const hasConcernModal = /\b(?:should|need\s+to|supposed\s+to|expected|missing|wrong|broken|stuck)\b/i.test(trimmed);
    return hasQuestion || hasConcernModal;
}
174
+ class ToolUseLoop {
175
+ constructor(registry, ctx, options = {}) {
176
+ this.registry = registry;
177
+ this.ctx = ctx;
178
+ this.defaultOptions = options;
179
+ this.maxIterations = options.maxIterations ?? 10;
180
+ this.defaultEmit = options.emitEvent ?? (() => undefined);
181
+ this.defaultBeforeToolExecute = options.beforeToolExecute ?? (() => ({ allow: true }));
182
+ }
183
+ /**
184
+ * Run the tool use loop.
185
+ *
186
+ * @param userGoal The original user request (becomes the first user message).
187
+ * @param chat A streaming chat function — returns an async iterable of text chunks.
188
+ * @param systemPrompt Optional base system prompt. Tool definitions are appended to it.
189
+ * @param options Per-call options (emitEvent override, etc.)
190
+ */
191
+ async run(userGoal, chat, systemPrompt, options) {
192
+ return this.runWithMessages([{ role: 'user', content: userGoal }], chat, systemPrompt, options);
193
+ }
194
+ /**
195
+ * Run the tool use loop seeded with prior conversation messages.
196
+ * Use this for REPL-style hosts that want to preserve multi-turn context;
197
+ * the caller supplies the full user/assistant history (no system message —
198
+ * the loop prepends its own system prompt with tool definitions).
199
+ */
200
+ async runWithMessages(seedMessages, chat, systemPrompt, options) {
201
+ const effectiveOptions = { ...this.defaultOptions, ...options };
202
+ const emit = effectiveOptions.emitEvent ?? this.defaultEmit;
203
+ // soft/hard cap split. `max` is now mutable so the loop
204
+ // can extend it when the model is making clear progress. The hard
205
+ // ceiling is `max(2 * initialMax, initialMax + 20)` (40 for a cap of 20) — beyond that we
206
+ // always wrap up regardless of how healthy the iteration looked.
207
+ // a real turn was patching 17 implicit-any
208
+ // errors one apply_edit per iteration, exhausted the 20-cap with
209
+ // 5 errors outstanding even though every iteration was succeeding
210
+ // and no loop-detection nudges had fired. Letting the model
211
+ // continue when it's clearly making progress is the right move.
212
+ let max = effectiveOptions.maxIterations ?? this.maxIterations;
213
+ const initialMax = max;
214
+ const hardCap = Math.max(initialMax * 2, initialMax + 20);
215
+ const CAP_EXTENSION_SIZE = 10;
216
+ const MAX_CAP_EXTENSIONS = 2;
217
+ let iterationCapExtensions = 0;
218
+ // Healthy-progress signal: track whether each of the last N iterations
219
+ // produced any tool calls. Rolling window of 5. Empty iterations
220
+ // (parse failures, prose-only responses) push `false`; productive
221
+ // iterations push `true`. Extension only fires when all 5 are true.
222
+ const recentIterationsHadTools = [];
223
+ const RECENT_HEALTH_WINDOW = 5;
224
+ const beforeToolExecute = effectiveOptions.beforeToolExecute ?? this.defaultBeforeToolExecute;
225
+ const signal = effectiveOptions.signal;
226
+ const maxParallelTools = Math.max(1, effectiveOptions.maxParallelTools ?? 8);
227
+ const maxTotalTools = Math.max(1, effectiveOptions.maxTotalTools ?? 60);
228
+ const outputBudgetTokens = effectiveOptions.outputBudgetTokens ?? Infinity;
229
+ const outputBudgetRatio = effectiveOptions.outputBudgetRatio ?? 0.6;
230
+ let totalToolsExecuted = 0;
231
+ const buildCancelledResult = (msgs, iter, finalText = '') => ({
232
+ finalResponse: finalText || '[cancelled]',
233
+ iterations: iter,
234
+ messages: msgs,
235
+ hitLimit: false,
236
+ cancelled: true
237
+ });
238
+ let nativeTools = effectiveOptions.nativeTools ?? false;
239
+ const nativeToolFailureFallback = effectiveOptions.nativeToolFailureFallback ?? true;
240
+ let nativeFallbackUsed = false;
241
+ // One-shot outer-layer retry on the text channel after the native
242
+ // channel switched. The inner same-channel retry layer covers the
243
+ // common transient blip case, but a sustained native failure forces
244
+ // the channel switch; if the first text call ALSO hits a transient
245
+ // blip (gateway flapping, ollama still recovering from load), the
246
+ // previous code path threw `Upstream model request failed` straight
247
+ // to the user with no recovery. This flag lets the outer catch
248
+ // re-enter `streamAndAggregate` exactly once more on the text channel
249
+ // before declaring the turn dead. Addresses the "double-failure path
250
+ // is still terminal" gap.
251
+ let textFallbackRetryUsed = false;
252
+ // One-shot final attempt: after every prior retry slot is spent,
253
+ // push a clean re-anchor message that re-states the original user
254
+ // goal and retry once more. Sometimes a mid-stream replay can't
255
+ // recover (the model is anchored on a half-emitted tool_call
256
+ // payload or a partial reasoning block) but a fresh anchor with
257
+ // explicit "this is a recovery attempt — answer the original goal"
258
+ // framing succeeds. Last resort before terminal throw.
259
+ let finalAnchorRetryUsed = false;
260
+ const textToolBlock = this.registry.buildSystemPromptBlock();
261
+ const buildFullSystemPrompt = (useNativeTools) => {
262
+ if (useNativeTools)
263
+ return systemPrompt ?? '';
264
+ return systemPrompt
265
+ ? `${systemPrompt}\n\n${textToolBlock}`
266
+ : textToolBlock;
267
+ };
268
+ let nativeSchemas = nativeTools ? this.registry.buildNativeToolsSchema() : undefined;
269
+ const messages = [];
270
+ const initialSystemPrompt = buildFullSystemPrompt(nativeTools);
271
+ if (initialSystemPrompt) {
272
+ messages.push({ role: 'system', content: initialSystemPrompt });
273
+ }
274
+ // Capture the most recent user message (the actual goal of THIS turn,
275
+ // not earlier conversation turns). Used by the goal-anchor reminder
276
+ // below when the model is about to generate its final answer — long
277
+ // tool-result chains push the original question down the attention
278
+ // window and the model can drift to a related-but-different topic.
279
+ // Walks back through continuation tokens ("keep going", "yes") to
280
+ // the most recent SUBSTANTIVE prompt. See loop/turnSetup.ts.
281
+ let { originalGoal, priorUserPromptCount } = (0, turnSetup_1.resolveTurnGoal)({ seedMessages });
282
+ // Track the iteration we last anchored on rather than a boolean
283
+ // so we can re-fire when the model pivots AGAIN later in a long
284
+ // turn. -1 means "never anchored." Re-fire is gated by the
285
+ // GOAL_ANCHOR_REFIRE_GAP below to avoid hammering on a model
286
+ // that's working steadily — only fires again when the loop has
287
+ // continued without resolution for several more iterations.
288
+ let lastGoalAnchorIteration = -1;
289
+ for (const msg of seedMessages) {
290
+ if (msg.role === 'system')
291
+ continue;
292
+ messages.push(msg);
293
+ }
294
+ // Noticing-prompt pivot hint. When the most-recent user message
295
+ // looks like a noticing/clarifying question ("are we using these?",
296
+ // "did you remember X?", "where's the…?"), inject a one-time
297
+ // synthetic user-role hint instructing the model to address the
298
+ // implicit gap BEFORE continuing any prior plan. Without this the
299
+ // model often reads such prompts as generic "keep going" signals
300
+ // and continues scaffolding work the user just paused them on.
301
+ // One-shot per turn — only fires on this first pass.
302
+ if (originalGoal && isNoticingPrompt(originalGoal)) {
303
+ emit('tool_loop:noticing_prompt_hint', {
304
+ promptPreview: originalGoal.slice(0, 200)
305
+ });
306
+ messages.push({
307
+ role: 'user',
308
+ content: '[Reading-comprehension note for the assistant: the user\'s last message above is a noticing / clarifying question — they spotted a possible gap from prior turns and are asking you to confirm or correct, NOT to continue any prior plan. Before you take any new action, identify what gap the question points at and address it directly. If the question is "are we using X?" the correct first move is to verify whether X is actually being used (read the consumer file, grep for the import, check the call site) and answer honestly — yes/no with evidence. Do NOT create more new artifacts unless the user explicitly says to.]'
309
+ });
310
+ }
311
+ let iterations = 0;
312
+ let hitLimit = false;
313
+ let consecutiveEmptyRetries = 0;
314
+ // Per-retry-path budgets. Keeping these separate from
315
+ // consecutiveEmptyRetries (which resets on any non-empty response)
316
+ // prevents an infinite retry when a model repeatedly emits the
317
+ // SAME malformed tool_call — the S3Api pburg workspace (Apr 22)
318
+ // ran 10+ iterations at iteration=2 because each 30s malformed
319
+ // apply_edit response reset consecutiveEmptyRetries to 0 and the
320
+ // parse-retry counter got to fire again. Caps are per-turn (not
321
+ // per-iteration) so the model genuinely exhausts its attempts
322
+ // before we give up.
323
+ let parseRetries = 0;
324
+ let fakeToolResultRetries = 0;
325
+ let toolAbsenceCorrectionsFired = 0;
326
+ let toolErrorRecoveryFired = 0;
327
+ let lastIterationHadToolError = false;
328
+ const PARSE_RETRY_CAP = 2;
329
+ const FAKE_TOOL_RESULT_CAP = 2;
330
+ const TOOL_ABSENCE_CORRECTION_CAP = 1;
331
+ const TOOL_ERROR_RECOVERY_CAP = 1;
332
+ // Hard turn-level cap on responses that produced no tool_call. The
333
+ // individual detectors (empty_retry, narrate-no-action, tool_error
334
+ // recovery, etc.) each have their own caps, but they can chain — a
335
+ // model can spin through 6+ no-tool-call responses because
336
+ // thinking-off recovery resets consecutiveEmptyRetries=0. Captured
337
+ // 2026-05-26 in Mark's Portfolio session (turn-2026-05-26T02-30-37):
338
+ // model emitted 6 sequential reasoning-only responses inside
339
+ // iteration 4 before the loop finally terminated with a useless
340
+ // final answer ("I need to stop wrapping tool calls in reasoning
341
+ // blocks"). This counter doesn't reset on detector firings — when
342
+ // it hits the cap, the loop terminates with a final answer that
343
+ // names the stuck state so the user knows what to retry with.
344
+ let noToolCallAttemptsThisTurn = 0;
345
+ // 4 → 5 (Jun 2026): make room for prefill_recovery after the existing
346
+ // empty_retry ×2 + thinking_off_recovery sequence. The new ordering is
347
+ // 1. empty_retry (consec=1)
348
+ // 2. empty_retry (consec=2)
349
+ // 3. thinking_off_recovery (force think:false)
350
+ // 4. prefill_recovery (push `<tool_call>{"name":"` as assistant prefill)
351
+ // 5. hard cap → stuck answer
352
+ // Prefill is qualitatively different from the prior steps — it forces
353
+ // the model into an envelope-opened state so it can't terminate at the
354
+ // reasoning fence — and is the highest-leverage recovery slot for the
355
+ // qwen3.6 "stops after fence close" failure mode.
356
+ const NO_TOOL_CALL_HARD_CAP = 5;
357
+ // One-shot recovery: when consecutive reasoning-only retries exhaust
358
+ // (the model is stuck thinking and never emits content or tool_calls),
359
+ // make ONE final attempt with thinking forced OFF. Observed
360
+ // 2026-04-26 with qwen3.6:27b on remote Ollama — thinking-on stalled
361
+ // intermittently while bandit-logic on the home cluster (same model,
362
+ // different serving stack) worked fine. Forcing thinking off
363
+ // collapses the model into the regular content channel where its
364
+ // tool-call sampling is far more deterministic.
365
+ let thinkingOffRecoveryAttempted = false;
366
+ let nextCallThinkOverride = undefined;
367
+ // Final-shot prefill recovery for qwen3.6-style "closes the reasoning
368
+ // fence and stops" stalls. Observed Jun 2026 on a long CSS-refactor
369
+ // turn: the model emitted 4 reasoning-only responses in a row even
370
+ // after the nudge + thinking-off recovery had fired. Reasoning content
371
+ // said "I need to actually emit tool calls" but generation terminated
372
+ // right after the fence close. Prefill removes the choice — we push an
373
+ // assistant message containing `<tool_call>{"name":"` so the next
374
+ // generation MUST continue from inside an envelope. The provider
375
+ // returns only the new tokens, so `pendingPrefillPrefix` is prepended
376
+ // to the response before parsing.
377
+ let prefillRecoveryAttempted = false;
378
+ let pendingPrefillPrefix = null;
379
+ // Track the last N non-tool-calling assistant responses so we can
380
+ // detect a "deliberation loop" — the model emits multiple iterations
381
+ // of highly-similar prose ("Wait, I see X isn't listed. Let me check
382
+ // X. Actually, I'll try to read X.") without ever calling a tool.
383
+ // Observed Apr 2026 on pburg-bowl with bandit-core-1: the model
384
+ // streamed 24k chars of self-contradicting prose in a SINGLE
385
+ // response, and if the content had been split across iterations the
386
+ // existing detectors (hitLimit, false-completion patterns) would
387
+ // also have missed it because each individual response looked
388
+ // plausible in isolation. The cross-iteration guard below kicks in
389
+ // if we see K non-tool iterations whose normalized prose overlaps
390
+ // heavily with the previous one.
391
+ const recentNonToolResponses = [];
392
+ const PROSE_LOOP_WINDOW = 2; // look back this many iterations
393
+ let proseLoopNudged = false;
394
+ // Track recent tool calls to detect a stuck model. The classic failure:
395
+ // the model writes a long JSON/TS file, its output gets truncated by an
396
+ // unescaped quote in the content, the write "succeeds" but lands corrupt,
397
+ // and the model immediately retries the same write hoping the problem
398
+ // was transient. Without a circuit breaker it will loop until maxIterations.
399
+ const recentCallKeys = [];
400
+ const REPEAT_LIMIT = 3;
401
+ // Track whether the model keeps emitting `todo_write` as its only tool
402
+ // in consecutive iterations. The v1.5.40 "todo_store summary" nudge was
403
+ // supposed to end this, but observed pburg-bowl traces (Apr 2026) show
404
+ // the model still burns 3 iterations in a row revising its todo list
405
+ // before doing any actual work. When N consecutive iterations fire
406
+ // `todo_write` as the ONLY tool (no search/read/write alongside), we
407
+ // inject a corrective nudge once.
408
+ let consecutiveTodoOnlyIterations = 0;
409
+ // 3 consecutive todo-only iterations before we intervene. Lower was
410
+ // observed to block bandit-logic from ever ticking plan
411
+ // items to "completed" — the model called todo_write twice to set up
412
+ // the plan, churn nudge fired at iteration 1, and the "do NOT call
413
+ // todo_write again this turn" message killed status updates for the
414
+ // rest of the run. 3 gives the model one more iteration of grace.
415
+ const TODO_ONLY_LIMIT = 3;
416
+ let todoChurnNudged = false;
417
+ // apply_edit-loop nudge. Derived from a real
418
+ // bandit-cli run that hit the 20-iteration cap while patching 17
419
+ // implicit-any TypeScript errors one apply_edit at a time. Each
420
+ // call landed (the work was real, unlike todo-churn), but the
421
+ // sequential one-error-per-iteration cadence ate the whole budget.
422
+ // When the model spends N consecutive iterations doing only
423
+ // apply_edit (no read/run/search interleaved), we inject a one-shot
424
+ // nudge pointing at apply_patch (multi-file, multi-hunk) or a
425
+ // broader-context apply_edit that consolidates several adjacent
426
+ // fixes — both expand throughput without changing the iteration
427
+ // cap. Limit is 4 (one higher than todo-only): apply_edits are
428
+ // real progress, so we tolerate one more before nudging.
429
+ let consecutiveApplyEditOnlyIterations = 0;
430
+ const APPLY_EDIT_ONLY_LIMIT = 4;
431
+ let applyEditBatchNudged = false;
432
+ // Companion to the churn breaker: detect when the model set up a plan
433
+ // via `todo_write` early, then did multiple edit iterations WITHOUT
434
+ // calling `todo_write` again. The Plan block in the UI stays frozen
435
+ // on the original pending state — user watches the feed do real work
436
+ // but sees nothing flip to ✓. Observed on Gemma 4 12B:
437
+ // iteration 1 set up 4-item plan, iterations 2-7 did reads + edits,
438
+ // turn ended at iteration 8 with the Plan still all-pending. Nudge
439
+ // fires at most once per turn, and ONLY on models without native
440
+ // tool calling (capable models generally update plans unprompted).
441
+ let lastTodoWriteIter = -1;
442
+ let editsSinceLastTodo = 0;
443
+ let todoProgressNudged = false;
444
+ const TODO_PROGRESS_STALE_DELTA = 3;
445
+ const TODO_PROGRESS_EDIT_THRESHOLD = 2;
446
+ // Track file paths the user referenced in the prompt or any prior tool
447
+ // call. If the model ends the turn with a large fenced code block and
448
+ // has NOT emitted any file-edit tool call, AND one of these
449
+ // referenced paths exists, we treat that as "code in markdown instead
450
+ // of a tool call" and nudge. Populated from the user goal up-front;
451
+ // the detector only fires when the signal is real.
452
+ let promptImpliesFileEdit = false;
453
+ // Companion to `promptImpliesFileEdit`: detect goals that ask for an
454
+ // ANALYSIS — "evaluate", "review", "audit", "what is", "how does",
455
+ // etc. Used by the limit-hit wrap-up logic to pick between the
456
+ // edit-shaped Shipped/Partway/Blocked template and the analysis-shaped
457
+ // Findings/Evidence/Gaps template. Without this, a "deep self
458
+ // evaluation" turn that hit the 60-call cap got the edit template
459
+ // and produced "Shipped: nothing" — useless framing for what was
460
+ // actually asked.
461
+ let promptWantsAnalysis = false;
462
+ {
463
+ // Accept simple path tokens (contains `/` and a file extension) OR
464
+ // the keywords "update", "edit", "change", "fix", "modify", "refactor",
465
+ // "rewrite" — any of which imply the user expects a write. Heuristic,
466
+ // not a parser. False positives here cost us one wasted nudge;
467
+ // false negatives let code-fence hallucinations ship.
468
+ const goalText = seedMessages
469
+ .filter(m => m.role === 'user')
470
+ .map(m => m.content)
471
+ .join('\n')
472
+ .toLowerCase();
473
+ promptImpliesFileEdit =
474
+ /\b(update|edit|change|fix|modify|refactor|rewrite|replace|add)\b/.test(goalText) ||
475
+ /[\w\-./]+\.(?:ts|tsx|js|jsx|py|rb|go|rs|java|kt|cs|swift|php|cpp|c|h|md|json|ya?ml|html|css)\b/.test(goalText);
476
+ // Analysis verbs/phrasings. Includes both verb forms ("evaluate",
477
+ // "review") and question forms ("what is", "how does", "why
478
+ // does") so "evaluate this codebase" and "what's keeping this
479
+ // agent from being better" both light up. Compatible with
480
+ // `promptImpliesFileEdit` — a goal can match both ("look at
481
+ // file.ts and tell me what you see"); the wrap-up picker
482
+ // resolves precedence using `editToolsInvoked` as the tiebreaker.
483
+ promptWantsAnalysis =
484
+ /\b(evaluate|review|analy[sz]e|audit|inspect|investigate|explain|summari[sz]e|describe|tell\s+me|find\s+out|self[-\s]?eval(?:uat(?:e|ion))?)\b/i.test(goalText)
485
+ || /\b(what(?:'s|\s+is|\s+are)|how\s+does|why\s+does|where\s+does)\b/i.test(goalText)
486
+ || /\blook(?:ing)?\s+at\b/i.test(goalText);
487
+ }
488
+ // Track whether any file-producing tool call has actually been invoked
489
+ // this turn. Used by the "false completion" detector below: if the model
490
+ // emits a final response claiming it wrote code but never called
491
+ // write_file / apply_edit / replace_range / apply_patch, we inject a corrective nudge and force one
492
+ // more iteration so the model has a chance to actually do the work.
493
+ let editToolsInvoked = 0;
494
+ // Per-file tracking so the "subject not modified" detector (further
495
+ // below) can catch the refactor failure mode where the model reads
496
+ // a file for context, writes NEW files based on it, but never
497
+ // updates the original. The set is normalized (lowercase, basename)
498
+ // so different references to the same file collapse.
499
+ const filesReadThisTurn = new Set();
500
+ const filesWrittenThisTurn = new Set();
501
+ let subjectNotModifiedNudged = false;
502
+ // One-shot guard for the code-fence-as-final-answer detector (see below).
503
+ let codeFenceHallucinationNudged = false;
504
+ // One-shot guard for the JSON-todo auto-promotion detector (see
505
+ // below). Small models (12B Gemma observed) sometimes paste their
506
+ // todo list as a ```json code fence instead of calling todo_write,
507
+ // which means the plan never advances and they re-iterate on the
508
+ // same task. We detect the shape, synthesize a todo_write call,
509
+ // execute it as if the model had emitted it, and continue. Capped
510
+ // once per turn so a model that genuinely wants to show JSON data
511
+ // isn't caught in a loop.
512
+ let jsonTodoAutoPromoted = false;
513
+ // One-shot guard so we don't infinite-loop a truly confused model.
514
+ // The detector fires at most once per turn; if the model STILL claims
515
+ // completion without writing after the nudge, we let the turn terminate
516
+ // so the user can intervene.
517
+ let falseCompletionNudged = false;
518
+ // One-shot guard for the announce-then-stall detector. The model emits
519
+ // a forward-looking commitment ("Let me dig deeper into X", "Next I'll
520
+ // explore Y") with NO tool call, and the loop exits because no-tool =
521
+ // final answer. Observed with bandit-logic self-evaluating
522
+ // this repo: 3 iterations of reads, then iteration 4 returned only
523
+ // "Let me dig deeper into the core architecture..." and the runtime
524
+ // exited with iterations:3, hitLimit:false. None of the existing
525
+ // detectors caught it — no completion claim, no code fence, no prose-
526
+ // loop similarity (first stall after real work).
527
+ let announceIntentNudged = false;
528
+ let askUserNudged = false;
529
+ // One-shot guard for the fired-and-forgotten background-task detector.
530
+ // The model spawns multiple `task(run_in_background="true")` calls in
531
+ // one iteration and then either polls `check_task` immediately
532
+ // (returns "still running" — wasted iteration) or, more often, does
533
+ // the same exploration in parallel itself in the next iteration —
534
+ // burning the parent's context budget on work the subagents will
535
+ // report back. Observed trace: 6 backgrounded tasks spawned at
536
+ // iter 4, polled at iter 5 (none ready), parent then duplicated all
537
+ // their reads at iter 6. The nudge fires once per turn telling the
538
+ // model to either work on something independent or terminate the
539
+ // turn so the auto-inject can deliver synopses on the next turn.
540
+ let firedAndForgottenNudged = false;
541
+ // One-shot guard for the subagent-first-iteration-must-act detector.
542
+ // Subagents (`options.isSubagent === true`) are spawned to gather
543
+ // information for a specific goal; producing prose-only output on
544
+ // iteration 0 is always a stall, never a legitimate final answer.
545
+ // The existing announce-intent / narrate detectors miss when the
546
+ // model emits neutral reasoning + non-forward-looking prose
547
+ // ("This is a complex task...") that doesn't match their patterns.
548
+ // Observed: bandit-logic stalled 5/6 subagents on a
549
+ // self-eval turn with exactly that shape. Fires once per turn.
550
+ let subagentFirstIterNudged = false;
551
+ // Phrases a model uses when it thinks it has delivered code but hasn't
552
+ // actually emitted a write/edit tool call. Based on observed failure
553
+ // traces from bandit-core-1 and similar small models. Matched case-
554
+ // insensitively; any match + no write tool this turn trips the nudge.
555
+ const FALSE_COMPLETION_PATTERNS = [
556
+ /in (?:my|a|the) previous response/i,
557
+ /already provided (?:the|an?) (?:implementation|refactored|improved|updated)/i,
558
+ /you can find (?:the |this )?(?:refactored|improved|updated) (?:code|implementation)/i,
559
+ /here (?:is|'s) the (?:refactored|improved|updated|revised) (?:code|implementation|file)/i,
560
+ /(?:i have|i've) (?:refactored|rewritten|updated|improved)/i,
561
+ /(?:refactored|updated) (?:the )?(?:code|implementation) above/i,
562
+ /i'll finalize the task here/i,
563
+ /i've also marked (?:the tasks|these steps) as complet/i,
564
+ // Deferral patterns: the model emitted a malformed tool call (usually
565
+ // unescaped quotes/newlines in a large content payload), took the
566
+ // parse-retry nudge as a cue to apologize, and asked the user which
567
+ // task to resume instead of actually retrying. The user never sees
568
+ // the change land on disk. Observed in pburg-bowl scoring rewrite
569
+ // (Apr 2026): iteration 4 emitted write_file with unescaped content,
570
+ // parse-retry nudge fired, model responded with apology + "let me
571
+ // know which task I should resume" and termination.
572
+ /i apologi[sz]e for the (?:malformed|invalid)/i,
573
+ /(?:ensure|escape) (?:all )?(?:quotes|newlines|characters).*(?:properly )?escap/i,
574
+ /in my next tool call/i,
575
+ /let me know (?:which|what) (?:task|action) (?:i should |to )?resume/i,
576
+ /please (?:let me know|tell me).*(?:specific action|which task|what.*like me to)/i,
577
+ // Patterns surfaced 2026-04-23 on S3Api with bandit-logic (Qwen
578
+ // 2.5 Coder 32B). Model never called apply_edit, then ended the
579
+ // turn with "Based on the steps we've taken, here is the final
580
+ // state of the files..." followed by a prose dump of the
581
+ // "edited" files (which were never actually written to disk).
582
+ // The prior patterns covered "here is the refactored code" but
583
+ // not "here is the final state." Same failure mode, new words.
584
+ /here (?:is|'s) the (?:final|resulting|updated|modified) (?:state|version|content|output) of/i,
585
+ /(?:comments?|changes?|edits?|annotations?|updates?) (?:have )?been (?:added|made|applied|written|included)/i,
586
+ /you can verify (?:these|the|your) (?:changes?|edits?|updates?)/i,
587
+ /check(?:ing)? the files? (?:directly )?in your editor/i,
588
+ /running (?:a )?build to (?:see|verify|check)/i,
589
+ // Gemma 4 / bandit-core-1 escape patterns observed
590
+ // 2026-05-12 turn 1bec. After the bandit-tl hallucination detector
591
+ // blocked the fake-card shape, the model fell back to
592
+ // pure-prose lying with phrases like:
593
+ // "I have successfully eliminated all critical errors"
594
+ // "I have successfully fixed/resolved/removed/cleaned up X"
595
+ // "The project is now in a healthy state"
596
+ // "Verified via [tool] — confirmed [N→0]"
597
+ // "Removed forbidden require() calls: Converted them to ESM"
598
+ // Existing patterns covered "refactored / rewritten / updated /
599
+ // improved" but missed eliminated / resolved / cleaned / verified.
600
+ // Each new pattern is anchored to a completion-claim verb so this
601
+ // doesn't fire on legitimate "I will fix" intent phrases.
602
+ /(?:i have|i've)\s+(?:successfully\s+)?(?:eliminated|resolved|removed|cleaned|cleared|deleted|wiped|converted|wrapped|implemented|completed|finished)/i,
603
+ /(?:the project|the codebase|the file|the code) is now (?:in a (?:healthy|clean|working|fixed) state|fixed|complete|done|ready)/i,
604
+ /(?:verified|confirmed) (?:via|with|by running)\s+(?:the\s+)?(?:linter|tests?|build|tsc|eslint)/i,
605
+ /(?:critical errors?|lint(?:ing)? errors?|warnings?|issues?) (?:dropped|went|reduced) (?:from\s+)?\d+\+?\s*(?:to|→)\s*\d+/i,
606
+ // "Successfully" + past-tense action is the most common new shape.
607
+ /successfully\s+(?:fixed|resolved|removed|eliminated|cleaned|converted|implemented|verified|completed|applied|updated|patched)/i
608
+ ];
609
+ for (;;) {
610
+ if (signal?.aborted) {
611
+ emit('tool_loop:cancelled', { iteration: iterations, stage: 'pre_iteration' });
612
+ return buildCancelledResult(messages, iterations);
613
+ }
614
+ // Both limit-hit messages now LEAD with the original user goal.
615
+ // Motivating trace: a self-evaluation turn hit the 60-tool cap,
616
+ // got the wrap-up nudge, and the model wrote a wrap-up about a
617
+ // wholly different project (Helm chart / Next.js) it had touched
618
+ // in compacted-away context — explicitly admitting "Without
619
+ // knowing the exact original prompt." After 60 calls + multiple
620
+ // compactions, the model genuinely cannot recall what was asked
621
+ // unless we put it back in front of them at wrap-up time. The
622
+ // anchor IS in the conversation but it's deep history; the
623
+ // wrap-up message is the LAST thing the model sees, so the goal
624
+ // belongs here too.
625
+ const goalRecallBlock = originalGoal
626
+ ? `## ORIGINAL USER GOAL — answer THIS, not whatever feels salient in recent reads:\n\n "${originalGoal.trim()}"\n\n`
627
+ : '';
628
+ // Template picker — analysis-shaped goals (evaluate, review,
629
+ // explain, "what is X") get a Findings/Evidence/Gaps shape;
630
+ // edit-shaped goals (or any turn where edits actually fired)
631
+ // get the Shipped/Partway/Blocked shape. `editToolsInvoked > 0`
632
+ // takes precedence: if real edits landed, the user needs that
633
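+ // accounting regardless of the prompt phrasing. NOTE(review): as
633
+ // written, an ambiguous goal (no edit signal, no analysis verb) with
634
+ // no edits this turn selects the ANALYSIS shape, because
635
+ // `!promptImpliesFileEdit` is true. The edit shape was previously described as the default fallback — confirm which is intended.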
637
+ const useAnalysisTemplate = editToolsInvoked === 0
638
+ && (promptWantsAnalysis || !promptImpliesFileEdit);
639
+ // Analysis-shaped wrap-up. Three sections that match what an
640
+ // evaluator-style turn produces: a substantive synthesis, the
641
+ // material that supports it, and an honest list of gaps. Without
642
+ // this template, "Shipped: nothing landed" was the model's
643
+ // mandatory opener for analysis turns — useless framing for the
644
+ // self-evaluation request that surfaced this fix.
645
+ const analysisWrapUp = '**Findings** — your conclusions, the actual analysis the user asked for. Be specific: name files, patterns, gaps you saw. This is the deliverable; do NOT bury it under "I read X then Y then Z" — synthesise.\n' +
646
+ '\n' +
647
+ '**Evidence** — what you actually read or ran that supports each finding. File paths + brief description ("`tool-use-loop.ts:540` — goal-anchor only fires every 4 iterations"). Without this the user can\'t verify your claims.\n' +
648
+ '\n' +
649
+ '**What you didn\'t get to** — parts of the question you couldn\'t answer with what you saw. Be honest about gaps; do NOT invent confident claims about code you didn\'t actually read.\n';
650
+ const editWrapUp = '**Shipped** — concrete changes that ACTUALLY landed this turn. Only list edits where a write_file, apply_edit, replace_range, or apply_patch tool call returned successfully (no errors). Be specific about file + what changed.\n' +
651
+ '\n' +
652
+ '**Build state** — if you edited code this turn you MUST state the build state explicitly. Either (a) cite a verified-clean run from THIS turn — quote the command + "exit code 0" / "no errors" output, OR (b) say "I did not run the build / typecheck this turn — caller should verify". DO NOT claim items are Done if the build is failing; downgrade those items to Partway and name the remaining errors. Real on a linter-fix turn: model wrote "Shipped" with 7 bullets while `tsc --noEmit` still reported 5 errors it had run out of iterations to fix.\n' +
653
+ '\n' +
654
+ '**Partway** — investigation done but not yet committed (files read, searches run, plan formed). State what was learned and what the next step would be.\n' +
655
+ '\n' +
656
+ '**Blocked / not attempted** — anything in the user\'s request you did not get to, or attempted-but-failed (e.g. apply_edit returned find-not-found). Own the failure honestly — do NOT claim success on these. If a fix is one paragraph the user can apply manually, say so.\n';
657
+ const wrapUpBody = useAnalysisTemplate ? analysisWrapUp : editWrapUp;
658
+ if (iterations >= max) {
659
+ // Soft cap extension. Before forcing the wrap-up,
660
+ // check whether the model is making clear progress. Extension
661
+ // criteria: last RECENT_HEALTH_WINDOW iterations all produced
662
+ // tool calls (not empty, not todo-only), no loop-detection
663
+ // nudges have fired this turn, and we're under the hard
664
+ // ceiling. When all true, raise `max` by CAP_EXTENSION_SIZE
665
+ // and let the loop continue. Up to MAX_CAP_EXTENSIONS, then
666
+ // the wrap-up always fires no matter how healthy things look.
667
+ const fullWindow = recentIterationsHadTools.length === RECENT_HEALTH_WINDOW;
668
+ const allHealthy = fullWindow && recentIterationsHadTools.every(Boolean);
669
+ const noNudges = !todoChurnNudged && !applyEditBatchNudged && !proseLoopNudged
670
+ && fakeToolResultRetries === 0 && parseRetries === 0;
671
+ const underCeiling = max + CAP_EXTENSION_SIZE <= hardCap;
672
+ const canExtend = allHealthy && noNudges && underCeiling
673
+ && iterationCapExtensions < MAX_CAP_EXTENSIONS;
674
+ if (canExtend) {
675
+ const prevMax = max;
676
+ max += CAP_EXTENSION_SIZE;
677
+ iterationCapExtensions++;
678
+ emit('tool_loop:iteration_cap_extended', {
679
+ iteration: iterations,
680
+ previousMax: prevMax,
681
+ newMax: max,
682
+ extension: iterationCapExtensions,
683
+ hardCap
684
+ });
685
+ // Drop a single-sentence nudge so the model knows the budget
686
+ // grew and tightens up. Without this it might keep its
687
+ // current pace and burn the extension too.
688
+ messages.push({
689
+ role: 'user',
690
+ content: `You've been making good progress and the iteration budget has been extended by ${CAP_EXTENSION_SIZE} (new limit: ${max}). Keep going, but tighten up: prefer batched edits over single-line ones, and start wrapping up when you have a complete answer rather than running to the new cap. This is the ${iterationCapExtensions === 1 ? 'first' : 'second'} of at most ${MAX_CAP_EXTENSIONS} extensions for this turn.`
691
+ });
692
+ }
693
+ else {
694
+ hitLimit = true;
695
+ // Step-budget exhaustion prompt. Three-section structure forces
696
+ // honest accounting; the goal recall block above stops models
697
+ // from inventing what the goal was. Template choice (analysis
698
+ // vs edit) reflects what the user actually asked for.
699
+ messages.push({
700
+ role: 'user',
701
+ content: `${goalRecallBlock}` +
702
+ `You have reached the tool-use iteration limit (${max}). Stop calling tools. Produce a final answer with three short sections, in this exact shape:\n` +
703
+ '\n' +
704
+ wrapUpBody +
705
+ '\n' +
706
+ 'No tool calls. No "I will continue" promises. Close the turn.'
707
+ });
708
+ }
709
+ }
710
+ if (totalToolsExecuted >= maxTotalTools && !hitLimit) {
711
+ hitLimit = true;
712
+ emit('tool_loop:total_tool_cap', { iteration: iterations, totalToolsExecuted });
713
+ messages.push({
714
+ role: 'user',
715
+ content: `${goalRecallBlock}` +
716
+ `You have executed ${totalToolsExecuted} tool calls this turn — the per-turn cap (${maxTotalTools}) has been reached. Stop calling tools. Produce a final answer with three short sections:\n` +
717
+ '\n' +
718
+ wrapUpBody +
719
+ '\n' +
720
+ 'No more tool calls. Close the turn.'
721
+ });
722
+ }
723
+ // Compact accumulated tool-result history before sending to the
724
+ // provider. On small/medium models this is what keeps long agent
725
+ // turns (6+ iterations on a real codebase) from overflowing
726
+ // num_ctx — when older tool results have grown past the budget
727
+ // they get collapsed to one-line "[earlier run, N lines elided]"
728
+ // placeholders. The model still sees enough to avoid re-reading
729
+ // files it already read. Aggressive-threshold rationale + the
730
+ // why-trace live in loop/compactionTrigger.ts.
731
+ const { aggressive: aggressiveCompactionThisIteration } = (0, compactionTrigger_1.applyCompactionIfNeeded)({
732
+ messages,
733
+ tokenBudget: effectiveOptions.messageTokenBudget,
734
+ emit,
735
+ iteration: iterations
736
+ });
737
+ // Goal anchor — re-inject the original user goal when the loop is
738
+ // at risk of drifting (recency bias on long tool-result chains;
739
+ // multi-turn pivot after compaction). Eligibility, refire gap,
740
+ // and the aggressive-compaction override are pinned in
741
+ // loop/goalAnchor.ts.
742
+ ({ lastGoalAnchorIteration } = (0, goalAnchor_1.applyGoalAnchorIfNeeded)({
743
+ originalGoal,
744
+ priorUserPromptCount,
745
+ hitLimit,
746
+ iteration: iterations,
747
+ lastGoalAnchorIteration,
748
+ aggressiveCompactionThisIteration,
749
+ messages,
750
+ registry: this.registry,
751
+ emit
752
+ }));
753
+ // Stream and aggregate the model response.
754
+ // Telemetry: capture total prompt size sent to the
755
+ // model. Subagent stalls were hard to diagnose because we
756
+ // couldn't tell if the prompt was 5KB (normal) or 50KB+ (would
757
+ // explain prompt-processing latency). Now both are visible.
758
+ const callOptions = nextCallThinkOverride !== undefined ? { think: nextCallThinkOverride } : undefined;
759
+ // Per-call think override is single-shot — clear immediately after
760
+ // building the options bag so subsequent iterations revert to the
761
+ // chat function's closure-captured default.
762
+ nextCallThinkOverride = undefined;
763
+ let llmStartedAt = Date.now();
764
+ let response = '';
765
+ // Drain externally-pushed messages BEFORE each LLM call. Host
766
+ // subscribes its backgroundStore (or other async event source)
767
+ // and pushes into a local queue; this callback returns the
768
+ // pending entries which the loop appends to the conversation.
769
+ // Net effect: parent loop sees subagent completions the moment
770
+ // they arrive instead of poll-spinning on check_task. See the
771
+ // ToolUseLoopOptions doc for the motivating use case.
772
+ const externals = effectiveOptions.drainExternalMessages?.() ?? [];
773
+ for (const ext of externals) {
774
+ if (ext && typeof ext.content === 'string' && ext.content.length > 0) {
775
+ messages.push(ext);
776
+ emit('tool_loop:external_inject', {
777
+ iteration: iterations,
778
+ role: ext.role,
779
+ chars: ext.content.length
780
+ });
781
+ }
782
+ }
783
+ while (true) {
784
+ emit('tool_loop:llm_start', {
785
+ iteration: iterations,
786
+ messageCount: messages.length,
787
+ promptCharsTotal: messages.reduce((sum, m) => sum + (m.content?.length ?? 0), 0),
788
+ systemPromptChars: messages
789
+ .filter((m) => m.role === 'system')
790
+ .reduce((sum, m) => sum + (m.content?.length ?? 0), 0),
791
+ thinkOverride: callOptions?.think
792
+ });
793
+ llmStartedAt = Date.now();
794
+ try {
795
+ response = await (0, llmStream_1.streamAndAggregate)({
796
+ chat,
797
+ messages,
798
+ emit,
799
+ iteration: iterations,
800
+ tools: nativeSchemas,
801
+ signal,
802
+ callOptions
803
+ });
804
+ if (pendingPrefillPrefix) {
805
+ // Ollama's chat API treats a trailing assistant message as a
806
+ // prefill — the model continues from where its content ends.
807
+ // The streamed response contains only the new tokens, so glue
808
+ // the prefix back on so downstream parsing sees a complete
809
+ // <tool_call> envelope.
810
+ response = pendingPrefillPrefix + response;
811
+ pendingPrefillPrefix = null;
812
+ }
813
+ break;
814
+ }
815
+ catch (error) {
816
+ if (nativeTools && nativeToolFailureFallback && !nativeFallbackUsed && isRetryableLlmError(error) && !signal?.aborted) {
817
+ nativeFallbackUsed = true;
818
+ nativeTools = false;
819
+ nativeSchemas = undefined;
820
+ const fallbackPrompt = buildFullSystemPrompt(false);
821
+ if (fallbackPrompt) {
822
+ if (messages[0]?.role === 'system') {
823
+ messages[0] = { role: 'system', content: fallbackPrompt };
824
+ }
825
+ else {
826
+ messages.unshift({ role: 'system', content: fallbackPrompt });
827
+ }
828
+ }
829
+ // v1.7.299 right-way fix: push a synthetic user message so
830
+ // the NEXT LLM call sees explicit guidance that the tool
831
+ // channel changed. The system-prompt swap alone is not
832
+ // enough — long-context models often anchor on the latest
833
+ // user turn for "what tool envelope should I use," and
834
+ // without this signal they keep emitting the prior
835
+ // native-tools shape into the void. Mark trace 2026-05-26:
836
+ // after a bandit-cloud 500 triggered native→text fallback
837
+ // mid-turn, the model continued emitting native-style
838
+ // payloads for 3+ iterations before finally producing
839
+ // visible markup.
840
+ messages.push({
841
+ role: 'user',
842
+ content: `[Provider error mid-turn — tool channel switched.] The previous attempt failed with: ${summarizeLlmError(error)}. ` +
843
+ `I retried with the text-based tool-call channel. ` +
844
+ `Re-emit your pending action using the text envelope: ` +
845
+ `<tool_call>{"name":"...","params":{...}}</tool_call> outside of any reasoning block. ` +
846
+ `Native-function-call payloads from your previous attempt were discarded — they're not visible to me. ` +
847
+ `If your last intended action is unclear, briefly state what you were trying to do and then emit the tool_call.`
848
+ });
849
+ emit('tool_loop:native_tool_fallback', {
850
+ iteration: iterations,
851
+ reason: summarizeLlmError(error)
852
+ });
853
+ continue;
854
+ }
855
+ // One-shot outer-layer retry on the text channel. Only fires
856
+ // when the channel switch has already happened (we're on text
857
+ // now) AND the failure is retryable AND we haven't already used
858
+ // this slot this turn. Larger backoff than the inner layer
859
+ // because by this point we've spent ~5-10s on the native
860
+ // attempts; the server probably needs longer to recover. After
861
+ // this attempt, any further failure on text is genuinely
862
+ // terminal — the user has been waiting > 30 s and a clean
863
+ // error is more helpful than another silent retry.
864
+ if (nativeFallbackUsed && !textFallbackRetryUsed && isRetryableLlmError(error) && !signal?.aborted) {
865
+ textFallbackRetryUsed = true;
866
+ emit('tool_loop:text_fallback_retry', {
867
+ iteration: iterations,
868
+ reason: summarizeLlmError(error)
869
+ });
870
+ await sleep(2400);
871
+ continue;
872
+ }
873
+ // Last-resort final-anchor retry. By this point we've spent
874
+ // every same-channel and cross-channel retry slot, and the
875
+ // conversation may contain partial tool_call deltas or
876
+ // half-emitted reasoning blocks that the model keeps anchoring
877
+ // on. Push a clean recovery message that restates the original
878
+ // goal and gives the model an explicit fresh-start framing,
879
+ // then retry once more. Only fires when an originalGoal is
880
+ // present (no point re-anchoring an empty turn) and the user
881
+ // hasn't aborted. After this attempt the failure is genuinely
882
+ // terminal — we've tried 12+ chat invocations across two
883
+ // channels with three distinct framings.
884
+ if (!finalAnchorRetryUsed
885
+ && textFallbackRetryUsed
886
+ && originalGoal.trim().length > 0
887
+ && isRetryableLlmError(error)
888
+ && !signal?.aborted) {
889
+ finalAnchorRetryUsed = true;
890
+ messages.push({
891
+ role: 'user',
892
+ content: `[Recovery attempt — previous channel attempts hit ${summarizeLlmError(error)}. ` +
893
+ `Discarding any partial tool_call or reasoning state from those attempts. ` +
894
+ `Original user goal restated as a fresh anchor:]\n\n${originalGoal.trim()}`
895
+ });
896
+ emit('tool_loop:final_anchor_retry', {
897
+ iteration: iterations,
898
+ reason: summarizeLlmError(error),
899
+ goalPreview: originalGoal.slice(0, 120)
900
+ });
901
+ await sleep(3600);
902
+ continue;
903
+ }
904
+ throw error;
905
+ }
906
+ }
907
+ // Diagnostic preview: 2000 chars + flags so we can tell apart "model
908
+ // emitted tool markup that the parser missed" from "model genuinely
909
+ // never emitted markup." 200 chars was too short to see past a
910
+ // typical reasoning fence (subagent traces 2026-05-08 captured only
911
+ // the fence opener and we couldn't tell if a tool call followed).
912
+ emit('tool_loop:llm_response', {
913
+ iteration: iterations,
914
+ response: response.slice(0, 2000),
915
+ responseLength: response.length,
916
+ hasToolCallMarkup: response.includes('<tool_call>') || /```\s*tool_call\b/.test(response),
917
+ endsWithFenceClose: /```\s*$/.test(response.trimEnd()),
918
+ llmDurationMs: Date.now() - llmStartedAt
919
+ });
920
+ if (signal?.aborted) {
921
+ emit('tool_loop:cancelled', { iteration: iterations, stage: 'post_stream' });
922
+ return buildCancelledResult(messages, iterations, response);
923
+ }
924
+ // Turn-level hard cap on no-tool-call responses. The individual
925
+ // detectors below (fake-tool-result, false-tool-absence,
926
+ // tool-error recovery, empty-retry, narrate-no-action,
927
+ // thinking-off recovery, parse-retry, prose-loop, etc.) each
928
+ // have their own caps, but they chain — thinking-off recovery
929
+ // resets consecutiveEmptyRetries=0, parse-retry has its own
930
+ // counter, and the model can move between failure modes faster
931
+ // than any one detector can give up. Mark Portfolio session
932
+ // 2026-05-26 turn-02-30-37: 6 sequential reasoning-only
933
+ // responses inside one iteration before the loop terminated
934
+ // silently. This counter increments on EVERY response without
935
+ // a tool_call and resets only when a real tool_call lands; at the cap we
936
+ // force-terminate with a final answer that names the stuck
937
+ // state instead of letting the model spin.
938
+ //
939
+ // Placed BEFORE the per-detector branches so the cap takes
940
+ // precedence — detectors can still nudge once each below this
941
+ // line, but once we've hit the cap they don't run.
942
+ if (!hitLimit && !(0, tool_use_parser_1.hasToolCalls)(response)) {
943
+ noToolCallAttemptsThisTurn++;
944
+ if (noToolCallAttemptsThisTurn >= NO_TOOL_CALL_HARD_CAP) {
945
+ emit('tool_loop:no_tool_call_hard_cap', {
946
+ iteration: iterations,
947
+ attempts: noToolCallAttemptsThisTurn,
948
+ responsePreview: response.slice(0, 200)
949
+ });
950
+ const finalStripped = (0, tool_use_parser_1.stripToolCallMarkup)(response).trim();
951
+ const goalHint = originalGoal
952
+ ? `\n\nGoal you asked me to handle: "${originalGoal.trim().slice(0, 200)}"`
953
+ : '';
954
+ const stuckAnswer = `I got stuck — emitted ${noToolCallAttemptsThisTurn} responses in a row without successfully invoking a tool, ` +
955
+ `so I'm stopping the turn before it wastes more time. ` +
956
+ `Most recent reasoning was:\n\n${finalStripped.slice(0, 600) || '(empty)'}` +
957
+ `${goalHint}\n\n` +
958
+ `Suggested next steps:\n` +
959
+ ` - Re-ask with a narrower scope (one file or one concrete change)\n` +
960
+ ` - Try \`/new\` to start fresh if the context is muddled\n` +
961
+ ` - If you saw a tool error earlier in this turn, paste it back and I'll pick a different tool`;
962
+ return { messages, iterations, hitLimit, finalResponse: stuckAnswer };
963
+ }
964
+ }
965
+ else if ((0, tool_use_parser_1.hasToolCalls)(response)) {
966
+ // A real tool_call landed — reset the cap counter so a later
967
+ // unrelated stall in the same turn gets its own full budget.
968
+ noToolCallAttemptsThisTurn = 0;
969
+ // Also reset the prefill-recovery one-shot. The recovery budget
970
+ // is "per stretch of failures," not "once per turn" — without
971
+ // this reset, a long refactor that recovers from one prefill
972
+ // stall and then hits another (Mark, gregoryhite-site
973
+ // 2026-06-02T23-56-38: 26 iterations, prefill burned at iter 25,
974
+ // iter 26 stalled again with no recovery left) falls straight
975
+ // through to the terminal "Bandit stalled" fallback even though
976
+ // every other detector still has budget. The hard cap on
977
+ // noToolCallAttemptsThisTurn (5) bounds the total stuck
978
+ // responses per stretch, so this can't infinite-loop.
979
+ prefillRecoveryAttempted = false;
980
+ }
981
+ // Protocol guard: Gemma-family models (all sizes, including
982
+ // bandit-core-1 31B) sometimes helpfully "complete" the
983
+ // tool-call / tool-result pattern by emitting a fake
984
+ // `<tool_result>` envelope in their OWN response — template
985
+ // completion from training rather than real tool invocation.
986
+ // The downstream effect is the model reports "edits applied"
987
+ // when nothing was actually written. Detect the fake envelope,
988
+ // strip it, and re-inject a corrective user message so the
989
+ // model retries with a proper `<tool_call>` or produces a
990
+ // plain-prose final answer. One iteration budget — avoids loops
991
+ // if the model ignores the correction.
992
+ const FAKE_TOOL_RESULT_RE = /<tool_result\b[\s\S]*?<\/tool_result\s*>|<tool_result\b[^<]*$/i;
993
+ if (!hitLimit && FAKE_TOOL_RESULT_RE.test(response) && fakeToolResultRetries < FAKE_TOOL_RESULT_CAP) {
994
+ fakeToolResultRetries++;
995
+ emit('tool_loop:fake_tool_result_detected', {
996
+ iteration: iterations,
997
+ preview: response.slice(0, 200)
998
+ });
999
+ const scrubbed = response.replace(/<tool_result\b[\s\S]*?<\/tool_result\s*>/gi, '').replace(/<tool_result\b[^<]*$/i, '').trim();
1000
+ // Push the scrubbed text as the assistant message instead of the raw
1001
+ // version so the model doesn't see its own hallucination in
1002
+ // the next turn's context (which would reinforce the pattern).
1003
+ messages.push({ role: 'assistant', content: scrubbed });
1004
+ messages.push({
1005
+ role: 'user',
1006
+ content: 'You emitted a `<tool_result>` envelope in your response. Those envelopes are SYSTEM output — they appear BETWEEN your turns, never inside your own message. If you meant to invoke a tool, emit a single `<tool_call>{"name":"...","params":{...}}</tool_call>` and wait for the real result. If the task is complete, give a plain-prose final answer with no XML envelopes. Retry now.'
1007
+ });
1008
+ continue;
1009
+ }
1010
+ // Fake tool-log fence detector. Some small/mid models hallucinate
1011
+ // ```bandit-tl / bandit-run / bandit-subagent fenced JSON cards
1012
+ // in prose to PRETEND they ran tools — the host's real tool-log
1013
+ // shape they've seen in conversation history. We strip the fake
1014
+ // fences and nudge with a hard-line "no claims of completion
1015
+ // without a real tool_call" message. Detector fires only when
1016
+ // the response has NO real `<tool_call>` markup, so models
1017
+ // legitimately quoting a tool-log card in explanatory prose
1018
+ // don't false-positive.
1019
+ const FAKE_BANDIT_TL_RE = /```bandit-(?:tl|run|subagent)\b[\s\S]*?```/gi;
1020
+ const FAKE_BANDIT_TL_LOOSE_RE = /```bandit-(?:tl|run|subagent)\b[\s\S]*$/i;
1021
+ const hasFakeBanditCard = FAKE_BANDIT_TL_RE.test(response) || FAKE_BANDIT_TL_LOOSE_RE.test(response);
1022
+ const hasRealToolCall = /<tool_call\b/i.test(response);
1023
+ if (!hitLimit && hasFakeBanditCard && !hasRealToolCall && fakeToolResultRetries < FAKE_TOOL_RESULT_CAP) {
1024
+ fakeToolResultRetries++;
1025
+ emit('tool_loop:fake_tool_result_detected', {
1026
+ iteration: iterations,
1027
+ preview: response.slice(0, 200),
1028
+ shape: 'bandit-tl'
1029
+ });
1030
+ const scrubbed = response
1031
+ .replace(/```bandit-(?:tl|run|subagent)\b[\s\S]*?```/gi, '')
1032
+ .replace(/```bandit-(?:tl|run|subagent)\b[\s\S]*$/i, '')
1033
+ .trim();
1034
+ messages.push({ role: 'assistant', content: scrubbed });
1035
+ messages.push({
1036
+ role: 'user',
1037
+ content: 'You emitted ` ```bandit-tl` (or `bandit-run` / `bandit-subagent`) fenced JSON in your response. Those fences are emitted by the EXTENSION HOST to log real tool execution — you CANNOT produce them. They show up in your context because the host logged actual tool calls, not because you can fabricate them. To actually run a tool, emit `<tool_call>{"name":"...","params":{...}}</tool_call>` and wait for the real result. Your fake fences mean NO work has happened this turn. You have TWO options for your retry, and ONLY two: (a) Emit a real `<tool_call>{"name":"...","params":{...}}</tool_call>` envelope NOW to actually do the work, then wait for the real result. (b) Honestly state "I have not [action] yet" and STOP. Do NOT claim completion. You MUST NOT claim you have fixed / eliminated / resolved / removed / cleaned / verified anything. No "successfully [verb]" phrasing. No numbered lists of "Step 1: I did X" actions. No "the project is now in a healthy state." Until a real `<tool_call>` lands on disk and returns a real tool-result, nothing has changed. Lying about completion is the worst failure mode. Retry now.'
1038
+ });
1039
+ continue;
1040
+ }
1041
+ // False-tool-absence detector. Model sometimes claims a tool
1042
+ // "is not available" / "I don't have access to X" — even when the
1043
+ // tool IS in the registry and was sent in this very turn's
1044
+ // native-tools schema. Usually triggered by an earlier error
1045
+ // ("Expected object, received string", "tool 'X' not registered")
1046
+ // surviving into compacted history while the success path didn't,
1047
+ // or by raw hallucination on small/mid models. Reset is a
1048
+ // band-aid; correct the claim inline so the user can keep going.
1049
+ //
1050
+ // Detector fires only when (a) the response has no tool_call,
1051
+ // (b) the absence phrase appears, (c) the named tool IS registered.
1052
+ // The registry-membership check is what gates the nudge — without
1053
+ // it we'd false-positive on legitimate "I can't do that" responses
1054
+ // about capabilities the agent genuinely doesn't have.
1055
+ if (!hitLimit
1056
+ && !(0, tool_use_parser_1.hasToolCalls)(response)
1057
+ && toolAbsenceCorrectionsFired < TOOL_ABSENCE_CORRECTION_CAP) {
1058
+ const registeredNames = this.registry.getAll().map((t) => t.name);
1059
+ const absence = (0, toolAvailabilityDetector_1.detectFalseToolAbsence)(response, registeredNames);
1060
+ if (absence.detected) {
1061
+ toolAbsenceCorrectionsFired++;
1062
+ emit('tool_loop:false_tool_absence', {
1063
+ iteration: iterations,
1064
+ matched: absence.matchedToolNames,
1065
+ suggested: absence.suggestedTools,
1066
+ responsePreview: response.slice(0, 200)
1067
+ });
1068
+ messages.push({ role: 'assistant', content: response });
1069
+ messages.push({ role: 'user', content: (0, toolAvailabilityDetector_1.buildToolAvailabilityNudge)(absence) });
1070
+ continue;
1071
+ }
1072
+ }
1073
+ // Tool-error recovery. When the previous iteration's tool call
1074
+ // returned isError:true and THIS iteration produced no tool_call,
1075
+ // the model is silently abandoning the request. Push a one-shot
1076
+ // nudge: retry with corrected params OR explicitly state which
1077
+ // precondition failed. Without this the agent drops the task and
1078
+ // the user has to manually say "continue."
1079
+ if (!hitLimit
1080
+ && !(0, tool_use_parser_1.hasToolCalls)(response)
1081
+ && lastIterationHadToolError
1082
+ && toolErrorRecoveryFired < TOOL_ERROR_RECOVERY_CAP) {
1083
+ toolErrorRecoveryFired++;
1084
+ emit('tool_loop:tool_error_recovery', {
1085
+ iteration: iterations,
1086
+ responsePreview: response.slice(0, 200)
1087
+ });
1088
+ messages.push({ role: 'assistant', content: response });
1089
+ messages.push({
1090
+ role: 'user',
1091
+ content: 'The previous tool call returned an error and you produced no follow-up tool_call. ' +
1092
+ 'Do NOT silently abandon the request — the user expects you to either retry with corrected parameters OR state explicitly which precondition failed and why you cannot proceed. ' +
1093
+ 'Choose one: (a) emit a corrected `<tool_call>{"name":"...","params":{...}}</tool_call>` now, fixing the param shape or value the error pointed at; ' +
1094
+ '(b) give a one-line final answer naming the exact precondition you lack (e.g. "I cannot trash message X because the message id is unknown — please provide it"). ' +
1095
+ 'Do not pretend the error did not happen and do not continue with unrelated work.'
1096
+ });
1097
+ continue;
1098
+ }
1099
+ messages.push({ role: 'assistant', content: response });
1100
+ // Small models sometimes stall with an empty response after a tool
1101
+ // result. Give them one polite nudge before giving up — almost always
1102
+ // enough for gemma4:e4b / qwen 7B to produce a real answer.
1103
+ //
1104
+ // Reasoning-only responses count as empty here. bandit-logic / Qwen
1105
+ // 3.6 in thinking mode sometimes emits a full <think>…</think> or
1106
+ // ```bandit-reasoning``` block planning out the work and then stops
1107
+ // without emitting an actual tool_call. Visually the user sees a
1108
+ // wall of reasoning text and nothing happens. Strip the reasoning
1109
+ // fences before checking emptiness so the same nudge fires.
1110
+ const stripped = response
1111
+ .replace(/<think\b[\s\S]*?<\/think\s*>/gi, '')
1112
+ .replace(/<think\b[\s\S]*$/i, '')
1113
+ .replace(/```bandit-reasoning\b[\s\S]*?```/gi, '')
1114
+ .replace(/```bandit-reasoning\b[\s\S]*$/i, '')
1115
+ .trim();
1116
+ const reasoningOnly = !stripped && response.trim().length > 0;
1117
+ // "Narrated but didn't act" detector. Some models (notably ones
1118
+ // post-trained for a different tool-call envelope, e.g. OpenAI
1119
+ // harmony) emit reasoning + a prose intent ("I'll search for X.")
1120
+ // without emitting the actual tool_call envelope. We treat that
1121
+ // as a stall and nudge via the shared empty-retry budget below.
1122
+ //
1123
+ // Verbs are enumerated explicitly (inflections too) — stem-with-
1124
+ // suffix patterns over- or under-match on English irregulars
1125
+ // (doubled-letter "running", silent-e "using", false positives
1126
+ // on "useful"/"reader"). The check is anchored to the TAIL of
1127
+ // the stripped response (last sentence) so the verb has to be
1128
+ // in the model's final clause, not an earlier "I have already
1129
+ // searched the file" preamble before a real answer.
1130
+ //
1131
+ // Captured 2026-05-25 (Mark, Portfolio IDE session): model emitted
1132
+ // "I'll redesign the portfolio... Let me rewrite both files." with
1133
+ // NO tool_call and the turn closed as a final answer because
1134
+ // neither `redesign` nor `rewrite` was on the list. A long
1135
+ // session ended with zero work shipped. Missing a verb here =
1136
+ // silent stall = user has to re-prompt manually. Cheap to add.
1137
+ const NARRATE_VERB_RE = /\b(use|uses|used|using|call|calls|called|calling|invoke|invokes|invoked|invoking|execute|executes|executed|executing|run|runs|running|ran|search|searches|searched|searching|look|looks|looked|looking|read|reads|reading|check|checks|checked|checking|find|finds|finding|found|list|lists|listed|listing|fetch|fetches|fetched|fetching|grep|greps|grepped|grepping|explore|explores|explored|exploring|locate|locates|located|locating|plan|plans|planned|planning|start|starts|started|starting|begin|begins|began|beginning|create|creates|created|creating|write|writes|wrote|writing|rewrite|rewrites|rewrote|rewriting|rewritten|build|builds|built|building|rebuild|rebuilds|rebuilt|rebuilding|update|updates|updated|updating|implement|implements|implemented|implementing|refactor|refactors|refactored|refactoring|redesign|redesigns|redesigned|redesigning|design|designs|designed|designing|generate|generates|generated|generating|scaffold|scaffolds|scaffolded|scaffolding|set\s+up|setting\s+up|tackle|tackles|tackled|tackling|do|does|did|doing|make|makes|made|making|batch|batches|batched|batching|execute|prepare|prepares|prepared|preparing|draft|drafts|drafted|drafting|outline|outlines|outlined|outlining|organize|organizes|organized|organizing|structure|structures|structured|structuring|kick\s+off|kicking\s+off|fix|fixes|fixed|fixing|edit|edits|edited|editing|modify|modifies|modified|modifying|patch|patches|patched|patching|adjust|adjusts|adjusted|adjusting|replace|replaces|replaced|replacing|swap|swaps|swapped|swapping|polish|polishes|polished|polishing|clean\s+up|cleaning\s+up|tidy|tidies|tidied|tidying|finalize|finalizes|finalized|finalizing|finish|finishes|finished|finishing|complete|completes|completed|completing|wire|wires|wired|wiring|hook|hooks|hooked|hooking|render|renders|rendered|rendering|style|styles|styled|styling|theme|themes|themed|theming|redo|redoes|redid|redoing|port|ports|ported|porting|migrate|migrates|migrated|migrating|configure|configures|confi
gured|configuring|install|installs|installed|installing|remove|removes|removed|removing|delete|deletes|deleted|deleting|rename|renames|renamed|renaming)\b/i;
1138
+ const NARRATE_INTENT_RE = /\b(we (?:will|need to|should)|we'?ll|we'?re going to|i'?ll|i will|let me|let'?s|going to|i'?m going to|i need to)\b/i;
1139
+ // Real code fences pass through; narrate only fires when the
1140
+ // model emitted no structured payload at all. Check the STRIPPED
1141
+ // response, not the raw one — `bandit-reasoning` fences are
1142
+ // reasoning, not structured output.
1143
+ const hasCodeFence = /```[a-zA-Z0-9_-]*\s*\n/.test(stripped);
1144
+ const tailMatch = stripped.match(/(?:[.!?]\s+)([^.!?]*)$/);
1145
+ const tail = (tailMatch ? tailMatch[1] : stripped).slice(-200);
1146
+ const narratedButNoAction = !(0, tool_use_parser_1.hasToolCalls)(response) &&
1147
+ !hasCodeFence &&
1148
+ stripped.length > 0 &&
1149
+ stripped.length < 240 &&
1150
+ NARRATE_INTENT_RE.test(tail) &&
1151
+ NARRATE_VERB_RE.test(tail);
1152
+ // Empty-response retry: was previously gated to `iterations > 0`
1153
+ // under the assumption "empty first response = provider outage."
1154
+ // That assumption was wrong — with bandit-logic
1155
+ // (cloud) on multi-message email-fetch turns: iteration 0 streams
1156
+ // completely empty (no reasoning text, no narrate prose, just zero
1157
+ // tokens), the loop falls straight through, and the user gets the
1158
+ // stall fallback instantly. Same model later in the same session
1159
+ // worked fine. Empty on iteration 0 is now allowed to nudge so
1160
+ // the model gets a second chance (and the thinking-off recovery
1161
+ // below can flip it to non-thinking mode if the second pass also
1162
+ // empties).
1163
+ const shouldNudge = (!response.trim() || reasoningOnly || narratedButNoAction) &&
1164
+ !hitLimit &&
1165
+ consecutiveEmptyRetries < 2 &&
1166
+ !thinkingOffRecoveryAttempted;
1167
+ if (shouldNudge) {
1168
+ consecutiveEmptyRetries++;
1169
+ emit('tool_loop:empty_retry', {
1170
+ iteration: iterations,
1171
+ attempt: consecutiveEmptyRetries,
1172
+ reasoningOnly,
1173
+ narratedButNoAction
1174
+ });
1175
+ const nudgeMessage = narratedButNoAction
1176
+ ? 'You announced your next step in prose ("we will search…" / "let me check…" / "use X to find Y") but did NOT emit a `<tool_call>` envelope. Announcing intent is not enough — you must actually invoke the tool. Emit the call now in this exact format, OUTSIDE of any reasoning block, with NO commentary and NO markdown fence:\n\n<tool_call>{"name":"<tool>","params":{"<key>":"<value>"}}</tool_call>\n\nReplace name/params with the right values for your task. Or, if the task is already answerable from what you know, give a final answer instead.'
1177
+ : reasoningOnly
1178
+ ? 'You completed reasoning but emitted no tool_call AND no final answer. The reasoning text alone does not run a tool — you must emit a `<tool_call>` envelope OUTSIDE the reasoning block. Format example (replace name/params for your task):\n\n<tool_call>{"name":"<tool>","params":{"<key>":"<value>"}}</tool_call>\n\nNo prose around it, no markdown fence, just the bare tag. If the task is answerable without a tool, write a complete final answer instead. Do not stop after only thinking.'
1179
+ : 'Your previous response was empty. Either emit a `<tool_call>{"name":"<tool>","params":{...}}</tool_call>` to invoke a tool, OR produce a complete final answer using what you have. Do not respond with an empty message.';
1180
+ messages.push({
1181
+ role: 'user',
1182
+ content: nudgeMessage
1183
+ });
1184
+ continue;
1185
+ }
1186
+ // Cap reached on a reasoning-only OR completely-empty stall: try
1187
+ // ONE more round with thinking forced off. This is the single-shot
1188
+ // "thinking-off recovery" — see comment on
1189
+ // `thinkingOffRecoveryAttempted` above. If the model produces a
1190
+ // tool_call this time, great. If it still stalls, we fall through
1191
+ // and the loop terminates normally with the final response shown
1192
+ // to the user.
1193
+ //
1194
+ // Threshold lowered from 2 to 1 AND extended to cover empty
1195
+ // responses (2026-05-03): bandit-logic via the cloud gateway
1196
+ // sometimes streams an entirely empty response on iteration 0
1197
+ // (not reasoning-only — zero tokens). Same prompt later in the
1198
+ // same session works fine. Force thinking-off after a single
1199
+ // empty/reasoning-only retry so the second attempt skips the
1200
+ // thinking channel entirely.
1201
+ const stallShape = reasoningOnly || !response.trim();
1202
+ if (!hitLimit
1203
+ && stallShape
1204
+ && consecutiveEmptyRetries >= 1
1205
+ && !thinkingOffRecoveryAttempted) {
1206
+ thinkingOffRecoveryAttempted = true;
1207
+ consecutiveEmptyRetries = 0;
1208
+ nextCallThinkOverride = false;
1209
+ emit('tool_loop:thinking_off_recovery', {
1210
+ iteration: iterations,
1211
+ reason: 'reasoning_only_cap_exhausted'
1212
+ });
1213
+ messages.push({
1214
+ role: 'user',
1215
+ content: 'Switching to non-thinking mode for this attempt because reasoning-only retries exhausted. Emit either a tool_call or a complete final answer. No more reasoning preamble.'
1216
+ });
1217
+ continue;
1218
+ }
1219
+ // Final-shot prefill recovery for the qwen3.6 "closes reasoning fence
1220
+ // and stops" pattern. Reached when thinking-off recovery also
1221
+ // produced a reasoning-only / empty response. Push an assistant
1222
+ // message containing only the start of a tool_call envelope so
1223
+ // Ollama treats it as a prefill — the model has to continue from
1224
+ // inside the envelope, removing its option to end the response at
1225
+ // the reasoning fence close. The completion is glued back to the
1226
+ // prefix when streamAndAggregate returns (see the prepend above).
1227
+ if (!hitLimit
1228
+ && stallShape
1229
+ && thinkingOffRecoveryAttempted
1230
+ && !prefillRecoveryAttempted) {
1231
+ prefillRecoveryAttempted = true;
1232
+ consecutiveEmptyRetries = 0;
1233
+ nextCallThinkOverride = false;
1234
+ pendingPrefillPrefix = '<tool_call>{"name":"';
1235
+ emit('tool_loop:prefill_recovery', {
1236
+ iteration: iterations,
1237
+ prefix: pendingPrefillPrefix
1238
+ });
1239
+ messages.push({
1240
+ role: 'assistant',
1241
+ content: pendingPrefillPrefix
1242
+ });
1243
+ continue;
1244
+ }
1245
+ consecutiveEmptyRetries = 0;
1246
+ // Model emitted tool_call markup but none parsed — almost always means
1247
+ // invalid JSON inside a content string (unescaped quotes is the classic
1248
+ // offender on writes of TS/JSON/HTML files). Give the model one more
1249
+ // shot with explicit guidance; otherwise treat the raw text as final.
1250
+ if (!hitLimit && (0, tool_use_parser_1.looksLikeAttemptedToolCall)(response) && !(0, tool_use_parser_1.hasToolCalls)(response) && parseRetries < PARSE_RETRY_CAP) {
1251
+ parseRetries++;
1252
+ emit('tool_loop:parse_retry', { iteration: iterations, attempt: parseRetries });
1253
+ // First retry: gentle guidance on escaping. Second retry: an
1254
+ // explicit escape-hatch — tell the model to write the file with
1255
+ // write_file (which takes a single `content` param and avoids
1256
+ // the find/replace escaping gauntlet) OR produce a prose-only
1257
+ // final answer. Without this the loop just terminates silently
1258
+ // and the user sees no actual edit.
1259
+ const firstRetry = parseRetries === 1;
1260
+ messages.push({
1261
+ role: 'user',
1262
+ content: firstRetry
1263
+ ? 'Your previous tool_call was not valid JSON — I could not parse it. Common cause: unescaped `"` characters inside a string value (for example `["", "", ""]` inside a `content` string). Retry the tool call with properly escaped JSON: every `"` inside a string value must be written as `\\"`, and every newline as `\\n`. If the content is very long, consider `replace_range` for a line-numbered block or breaking the change into smaller edits.'
1264
+ : 'Your tool_call still did not parse. Do NOT retry with the same shape or the same escaping failure. Switch tactics: (a) call `replace_range` for a large block whose line numbers you just read, (b) call `write_file` for a new file, or (c) split the change into multiple small `apply_edit` calls that each target just one method or block (e.g. 3-5 lines of `find`, 5-10 lines of `replace`) instead of rewriting the whole class. Pick the smallest scope that accomplishes the next step. If you cannot produce a valid tool call, respond with a plain-prose final answer acknowledging you could not complete the edit.'
1265
+ });
1266
+ continue;
1267
+ }
1268
+ // Prose-loop detector (cross-iteration). If the assistant has gone
1269
+ // N iterations in a row without emitting a tool call AND the
1270
+ // current response is substring-similar to the previous one, the
1271
+ // model is almost certainly stuck in a deliberation loop. Fire
1272
+ // one corrective nudge; if that doesn't break the pattern, let
1273
+ // the turn terminate on the next iteration so the user sees a
1274
+ // coherent final answer instead of a second wall of repetition.
1275
+ if (!hitLimit && !(0, tool_use_parser_1.hasToolCalls)(response)) {
1276
+ const normalized = response.toLowerCase().replace(/\s+/g, ' ').trim();
1277
+ const prior = recentNonToolResponses[recentNonToolResponses.length - 1];
1278
+ const looksLikeLoop = !!prior && (() => {
1279
+ // Cheap similarity: longest common prefix / shorter length. If two
1280
+ // consecutive no-tool responses share >60% of their text by
1281
+ // prefix the model is repeating itself. More sophisticated
1282
+ // diff would be overkill — the real failure mode is near-
1283
+ // identical responses, not subtle rephrasings.
1284
+ const short = prior.length < normalized.length ? prior : normalized;
1285
+ const long = prior.length < normalized.length ? normalized : prior;
1286
+ let matched = 0;
1287
+ while (matched < short.length && short[matched] === long[matched])
1288
+ matched++;
1289
+ return matched / short.length > 0.6;
1290
+ })();
1291
+ // Also flag the self-contradiction signature from the real
1292
+ // trace: alternating "Wait, I see …" and "Actually, I'll try
1293
+ // …" phrases appearing multiple times inside ONE response.
1294
+ const waitCount = (normalized.match(/wait,? i see/g) ?? []).length;
1295
+ const actuallyCount = (normalized.match(/actually,? i'?ll/g) ?? []).length;
1296
+ const selfContradicting = waitCount >= 3 && actuallyCount >= 3;
1297
+ // Intra-response stream abort already tagged the text — also a
1298
+ // loop.
1299
+ const streamAborted = response.includes('[stream aborted: self-contradicting prose loop detected]');
1300
+ if (!proseLoopNudged && (looksLikeLoop || selfContradicting || streamAborted)) {
1301
+ proseLoopNudged = true;
1302
+ emit('tool_loop:prose_loop_nudge', {
1303
+ iteration: iterations,
1304
+ responsePreview: response.slice(0, 200),
1305
+ reason: streamAborted ? 'stream_abort' : selfContradicting ? 'self_contradict' : 'cross_iteration_similarity'
1306
+ });
1307
+ messages.push({
1308
+ role: 'user',
1309
+ content: 'STOP deliberating. Your last response either repeated itself, contradicted itself (e.g. "Wait, I see X / Actually I\'ll try X"), or was aborted mid-stream as a loop. Do NOT continue speculating about what files might exist. Take exactly one of these actions now: (a) invoke a tool (`list_files`, `read_file`, `search_code`, etc.) to answer the question with real data, OR (b) give up and tell the user plainly that you could not complete the task and why. Do not write more than two sentences of prose before either calling a tool or terminating.'
1310
+ });
1311
+ recentNonToolResponses.length = 0;
1312
+ continue;
1313
+ }
1314
+ recentNonToolResponses.push(normalized);
1315
+ if (recentNonToolResponses.length > PROSE_LOOP_WINDOW) {
1316
+ recentNonToolResponses.shift();
1317
+ }
1318
+ }
1319
+ else {
1320
+ // Reset the window whenever a tool call fires — legitimate
1321
+ // progress breaks any suspected loop.
1322
+ recentNonToolResponses.length = 0;
1323
+ }
1324
+ // JSON-todo auto-promote: small models (observed on gemma3:12b-it-qat,
1325
+ // Apr 22 S3Api turn) often paste their todo list as a ```json fenced
1326
+ // code block instead of calling the todo_write tool. The plan never
1327
+ // advances and the model re-iterates on the same task because its
1328
+ // own view of "what's done" stays frozen. Detect the shape, execute
1329
+ // a synthesized todo_write call on the model's behalf, continue.
1330
+ if (!hitLimit && !(0, tool_use_parser_1.hasToolCalls)(response) && !jsonTodoAutoPromoted) {
1331
+ const JSON_TODO_FENCE_RE = /```json\s*\n([\s\S]*?)```/i;
1332
+ const match = response.match(JSON_TODO_FENCE_RE);
1333
+ if (match) {
1334
+ try {
1335
+ const parsed = JSON.parse(match[1].trim());
1336
+ // Must be a non-empty array where every item looks like a todo
1337
+ // ({content: string} at minimum). Tight check avoids false-
1338
+ // positives on generic data-shaped JSON the model might emit.
1339
+ if (Array.isArray(parsed) &&
1340
+ parsed.length > 0 &&
1341
+ parsed.every((item) => item &&
1342
+ typeof item === 'object' &&
1343
+ typeof item.content === 'string')) {
1344
+ jsonTodoAutoPromoted = true;
1345
+ emit('tool_loop:json_todo_auto_promoted', {
1346
+ iteration: iterations,
1347
+ itemCount: parsed.length
1348
+ });
1349
+ const todoTool = this.registry.get('todo_write');
1350
+ if (todoTool) {
1351
+ const syntheticCall = {
1352
+ name: 'todo_write',
1353
+ params: { items: JSON.stringify(parsed) },
1354
+ raw: `<tool_call>{"name":"todo_write","params":{"items":${JSON.stringify(JSON.stringify(parsed))}}}</tool_call>`
1355
+ };
1356
+ emit('tool_loop:tool_execute', {
1357
+ name: 'todo_write',
1358
+ params: syntheticCall.params,
1359
+ rawSnippet: syntheticCall.raw.slice(0, 400)
1360
+ });
1361
+ try {
1362
+ const result = await todoTool.execute(syntheticCall.params, this.ctx);
1363
+ // redact outputSnippet and outputFull
1364
+ // before emitting; the model-facing message below
1365
+ // is also redacted via buildToolResultsMessage →
1366
+ // formatToolResult. todo_write output rarely carries
1367
+ // secrets but consistency matters here — tool cards
1368
+ // in the extension UI will render outputFull and we
1369
+ // don't want any path to leak.
1370
+ emit('tool_loop:tool_result', {
1371
+ name: 'todo_write',
1372
+ isError: result.isError,
1373
+ outputLength: result.output.length,
1374
+ outputSnippet: (0, tool_use_parser_1.applySecretRedactionIfEnabled)(result.output.slice(0, 280)),
1375
+ outputFull: (0, tool_use_parser_1.applySecretRedactionIfEnabled)(result.output.slice(0, 65536))
1376
+ });
1377
+ messages.push({
1378
+ role: 'user',
1379
+ content: (0, tool_use_parser_1.buildToolResultsMessage)([
1380
+ { name: 'todo_write', output: result.output, isError: result.isError }
1381
+ ])
1382
+ });
1383
+ }
1384
+ catch (err) {
1385
+ const msg = err instanceof Error ? err.message : String(err);
1386
+ emit('tool_loop:tool_error', { name: 'todo_write', error: msg });
1387
+ messages.push({
1388
+ role: 'user',
1389
+ content: (0, tool_use_parser_1.buildToolResultsMessage)([
1390
+ { name: 'todo_write', output: `Error: ${msg}`, isError: true }
1391
+ ])
1392
+ });
1393
+ }
1394
+ // Nudge the model to stop pasting JSON and use the tool
1395
+ // directly next time. Reinforces the system-prompt anchor
1396
+ // without being so loud that it derails prose responses.
1397
+ messages.push({
1398
+ role: 'user',
1399
+ content: 'Note: I detected a JSON todo list in your response and auto-promoted it to a todo_write call. Next time, emit `<tool_call>{"name":"todo_write","params":{"items":"..."}}</tool_call>` directly instead of pasting JSON as a code block — pasted JSON does not update your plan, only the tool call does.'
1400
+ });
1401
+ iterations++;
1402
+ continue;
1403
+ }
1404
+ }
1405
+ }
1406
+ catch {
1407
+ // Not valid JSON — fall through to normal handling.
1408
+ }
1409
+ }
1410
+ }
1411
+ // If no tool calls (or hit limit), return the final answer.
1412
+ // Strip any lingering tool_call markup so malformed blocks never
1413
+ // reach the user-visible output.
1414
+ if (hitLimit || !(0, tool_use_parser_1.hasToolCalls)(response)) {
1415
+ // Detect hallucinated `<tool_result>` envelopes BEFORE stripping
1416
+ // so we can emit a telemetry event. The strip is mandatory (the
1417
+ // user can't see fabricated tool output as if it were real); the
1418
+ // event lets us track frequency and confirm the cause is what we
1419
+ // think it is — typically aggressive compaction stripping the
1420
+ // model's memory and it falling back to imitating the format.
1421
+ if ((0, tool_use_parser_1.hasFabricatedToolResult)(response)) {
1422
+ emit('tool_loop:hallucinated_tool_result', {
1423
+ iteration: iterations,
1424
+ responsePreview: response.slice(0, 300)
1425
+ });
1426
+ }
1427
+ const finalResponse = (0, tool_use_parser_1.stripToolCallMarkup)(response).trim();
1428
+ // False-completion detector. Small models regularly end a turn
1429
+ // with "I refactored the file" / "here is the updated code" text
1430
+ // without ever emitting a file-edit tool call.
1431
+ // When that happens the user sees a confident final response
1432
+ // backed by zero actual change on disk. If we detect this
1433
+ // pattern AND haven't nudged yet AND no edit tool was called
1434
+ // this turn, push one corrective user message into the loop
1435
+ // and continue for one more iteration. The nudge is capped at
1436
+ // one per turn so a truly confused model can still terminate.
1437
+ if (!hitLimit && !falseCompletionNudged && editToolsInvoked === 0) {
1438
+ const claimsCompletion = FALSE_COMPLETION_PATTERNS.some(re => re.test(finalResponse));
1439
+ if (claimsCompletion) {
1440
+ falseCompletionNudged = true;
1441
+ emit('tool_loop:false_completion_nudge', { iteration: iterations, responsePreview: finalResponse.slice(0, 200) });
1442
+ messages.push({
1443
+ role: 'user',
1444
+ content: 'Your response either claims work is done OR apologizes and asks what to do next — but I see NO successful `write_file`, `apply_edit`, `replace_range`, or `apply_patch` tool call in this turn, so nothing on disk has changed. ' +
1445
+ 'Do NOT ask the user which task to resume, do NOT promise to escape JSON "in your next tool call", and do NOT defer. Either (a) emit a real edit tool call NOW with the actual change — use `replace_range` for a large block whose line numbers you just read, `apply_edit` for a small exact replacement, or `write_file` for a new file — or (b) respond honestly that you could not complete the task and briefly explain why. Retry the tool call yourself; the user cannot help you escape JSON.'
1446
+ });
1447
+ continue;
1448
+ }
1449
+ }
1450
+ // Partial-completion detector. The check above catches "claimed
1451
+ // work, did NOTHING." This catches "claimed work on N files, only
1452
+ // edited M of them." Observed with gpt-oss:120b on
1453
+ // S3Api: 1 successful apply_edit on HealthController.cs, then
1454
+ // the final answer claimed edits to FileController.cs (class +
1455
+ // 3 methods) AND HealthController.cs (class + method). The user
1456
+ // saw a confident summary of 5 edits but only 1 landed on disk.
1457
+ // Heuristic: extract distinct file references (paths with
1458
+ // recognized source extensions or backticked file-like tokens)
1459
+ // from the response. If the count exceeds the actual successful
1460
+ // edit count, the model is overclaiming. One nudge per turn.
1461
+ if (!hitLimit && !falseCompletionNudged && editToolsInvoked > 0) {
1462
+ const filePathRe = /[`"']?([\w./\\-]+\.(?:cs|ts|tsx|js|jsx|mjs|cjs|py|rb|go|rs|java|kt|swift|cpp|cc|c|h|hpp|md|json|ya?ml|html|css|scss|sql|toml|sh|bash))[`"']?/gi;
1463
+ const fileSet = new Set();
1464
+ let m;
1465
+ while ((m = filePathRe.exec(finalResponse)) !== null) {
1466
+ // Normalize so `S3Api/Controllers/Foo.cs` and `Foo.cs` count
1467
+ // separately only when they really are different files. Last
1468
+ // segment is the cheapest disambiguator.
1469
+ const segments = m[1].split(/[/\\]/);
1470
+ const leaf = segments[segments.length - 1].toLowerCase();
1471
+ fileSet.add(leaf);
1472
+ }
1473
+ if (fileSet.size > editToolsInvoked) {
1474
+ falseCompletionNudged = true;
1475
+ emit('tool_loop:partial_completion_nudge', {
1476
+ iteration: iterations,
1477
+ editToolsInvoked,
1478
+ claimedFiles: fileSet.size,
1479
+ responsePreview: finalResponse.slice(0, 200)
1480
+ });
1481
+ messages.push({
1482
+ role: 'user',
1483
+ content: `Your response describes edits to ${fileSet.size} files (${[...fileSet].slice(0, 8).join(', ')}${fileSet.size > 8 ? ', …' : ''}), but only ${editToolsInvoked} successful edit${editToolsInvoked === 1 ? '' : 's'} actually fired this turn. ` +
1484
+ `The remaining ${fileSet.size - editToolsInvoked} file(s) were NOT modified — nothing landed on disk for them. ` +
1485
+ 'Either (a) emit the missing `apply_edit` / `replace_range` / `write_file` tool calls now to actually do the work, OR (b) revise your response to honestly describe ONLY the edits that successfully applied. Do not summarize work that did not happen.'
1486
+ });
1487
+ continue;
1488
+ }
1489
+ }
1490
+ // Subject-not-modified detector. Refactor goals
1491
+ // ("break out", "split", "refactor", "extract", "move") imply
1492
+ // mutation of the SOURCE file the user wants restructured, not
1493
+ // just creation of new sibling files. Failure mode observed
1494
+ // 2026-05-25 on a Portfolio React refactor: model read App.jsx,
1495
+ // wrote 5 new component files, never touched App.jsx, declared
1496
+ // completion. User had to follow up "are we using these?" to
1497
+ // force the integration step — and even that follow-up turn
1498
+ // wrote MORE components without modifying App.jsx.
1499
+ //
1500
+ // Heuristic: original goal contains a refactor verb AND the
1501
+ // turn read files AND wrote DIFFERENT files. If none of the
1502
+ // read files were also written, the model produced consumers
1503
+ // but never updated the source. One-shot nudge.
1504
+ const REFACTOR_GOAL_RE = /\b(refactor|refactoring|break\s+(?:out|up|apart|into)|split\s+(?:out|up|into|apart)|extract|extracting|migrate|migrating|move\s+(?:out\s+of|from|into)|reorganize|reorganizing|restructure|restructuring|consolidate|consolidating)\b/i;
1505
+ if (!hitLimit &&
1506
+ !subjectNotModifiedNudged &&
1507
+ editToolsInvoked > 0 &&
1508
+ filesReadThisTurn.size > 0 &&
1509
+ originalGoal &&
1510
+ REFACTOR_GOAL_RE.test(originalGoal)) {
1511
+ const readNotWritten = [...filesReadThisTurn].filter((p) => !filesWrittenThisTurn.has(p));
1512
+ // Fire only when the read-set is disjoint from the write-set.
1513
+ // If even ONE read file was written, the model is integrating;
1514
+ // we don't want to nag a partial-but-progressing refactor.
1515
+ if (readNotWritten.length === filesReadThisTurn.size) {
1516
+ subjectNotModifiedNudged = true;
1517
+ emit('tool_loop:subject_not_modified_nudge', {
1518
+ iteration: iterations,
1519
+ readNotWritten: readNotWritten.slice(0, 4),
1520
+ writtenCount: filesWrittenThisTurn.size
1521
+ });
1522
+ const readPreview = readNotWritten.slice(0, 3).join(', ');
1523
+ const writeCount = filesWrittenThisTurn.size;
1524
+ messages.push({
1525
+ role: 'user',
1526
+ content: `The user's goal contains a refactor verb (refactor/break out/split/extract/move) which implies the SOURCE file(s) should be modified, not just supplemented with new siblings. You read ${readPreview}${readNotWritten.length > 3 ? ' and others' : ''} for context, then wrote ${writeCount} NEW file(s), but you NEVER modified the file(s) you read. The refactor is incomplete: the source file still contains the old monolithic code. ` +
1527
+ `Emit the missing apply_edit/replace_range/write_file call on the source file now — it should import from the new files and drop the inlined code that's been extracted. If the refactor is genuinely a "scaffold only, leave source untouched" task, say so explicitly and explain why the source doesn't need to change.`
1528
+ });
1529
+ continue;
1530
+ }
1531
+ }
1532
+ // Code-fence-as-final-answer detector. pburg-bowl trace (Apr 21):
1533
+ // the model read ScoreBoard.tsx, then ended the turn with a ```
1534
+ // fenced helper function and "Replace your current total calculation
1535
+ // logic with this" — never calling a file-edit tool. The
1536
+ // existing FALSE_COMPLETION_PATTERNS don't catch this flavor because
1537
+ // the model doesn't SAY "I have refactored" — it just hands back
1538
+ // code. Heuristic: final response contains a fenced block with at
1539
+ // least ~8 lines of code, no edit tool was invoked this turn, and
1540
+ // the original prompt implied a file change. One-shot nudge.
1541
+ if (!hitLimit &&
1542
+ !codeFenceHallucinationNudged &&
1543
+ editToolsInvoked === 0 &&
1544
+ promptImpliesFileEdit) {
1545
+ // Look for ```lang\n...\n``` blocks. We want *substantial* code,
1546
+ // not a one-liner — so require at least 8 non-empty lines inside
1547
+ // the fence. This avoids false positives on small snippets
1548
+ // (shell commands, regex, env values).
1549
+ const fenceRe = /```[a-zA-Z0-9_-]*\n([\s\S]*?)```/g;
1550
+ const MIN_LINES = 8;
1551
+ let biggestFenceLines = 0;
1552
+ let match;
1553
+ while ((match = fenceRe.exec(finalResponse)) !== null) {
1554
+ const nonEmpty = match[1].split('\n').filter(l => l.trim().length > 0).length;
1555
+ if (nonEmpty > biggestFenceLines)
1556
+ biggestFenceLines = nonEmpty;
1557
+ }
1558
+ if (biggestFenceLines >= MIN_LINES) {
1559
+ codeFenceHallucinationNudged = true;
1560
+ emit('tool_loop:code_fence_nudge', {
1561
+ iteration: iterations,
1562
+ fenceLines: biggestFenceLines,
1563
+ responsePreview: finalResponse.slice(0, 200)
1564
+ });
1565
+ messages.push({
1566
+ role: 'user',
1567
+ content: 'You produced a substantial code block in your reply but never emitted a `write_file`, `apply_edit`, `replace_range`, or `apply_patch` tool call — so the change is NOT on disk. ' +
1568
+ 'Do not ask the user to paste your code into a file themselves. Take exactly one of these actions now: (a) call `replace_range`, `apply_edit`, or `write_file` with the real change to the correct file, OR (b) say plainly that you could not locate the target file and explain what you searched for. Do not wrap up with another prose + code-fence response.'
1569
+ });
1570
+ continue;
1571
+ }
1572
+ }
1573
+ // Announce-then-stall detector. The model wraps an iteration with
1574
+ // a forward-looking commitment ("Let me dig deeper into X", "Next
1575
+ // I'll explore Y") but emits NO tool call, so the loop interprets
1576
+ // the prose as the final answer and exits. Observed with bandit-logic self-evaluating this repo. None of the
1577
+ // upstream detectors fire: no completion claim (false-completion
1578
+ // patterns miss), no code fence, no prose-loop similarity (it's
1579
+ // the first stall after real work), no parse retry (the prose
1580
+ // doesn't look like an attempted tool call). One nudge per turn;
1581
+ // if the model still won't act, we fall through to terminate so
1582
+ // the user can intervene.
1583
+ // Announce-then-stall + ask-user-in-prose detectors. The model
1584
+ // wrapped a turn with "Let me X" / "I'll Y" / "I'm porting Z"
1585
+ // (announce-intent) or with a prose decision question (ask-user)
1586
+ // while we could have rendered an interactive prompt. Either one
1587
+ // means the loop is about to exit on a non-final-answer shape.
1588
+ // Detector bodies + the regex why-traces live in
1589
+ // loop/finalAnswerNudges.ts. The orchestrator owns the
1590
+ // once-per-turn flags and the false-completion-nudge precedence.
1591
+ if (!hitLimit && !announceIntentNudged && !falseCompletionNudged) {
1592
+ const r = (0, finalAnswerNudges_1.tryAnnounceIntentNudge)({ finalResponse, iteration: iterations, emit });
1593
+ if (r.fired) {
1594
+ announceIntentNudged = true;
1595
+ messages.push(r.message);
1596
+ continue;
1597
+ }
1598
+ }
1599
+ if (!hitLimit && !askUserNudged && !falseCompletionNudged) {
1600
+ const r = (0, finalAnswerNudges_1.tryAskUserNudge)({
1601
+ finalResponse,
1602
+ iteration: iterations,
1603
+ emit,
1604
+ askUserAvailable: this.registry.get('ask_user') !== undefined
1605
+ });
1606
+ if (r.fired) {
1607
+ askUserNudged = true;
1608
+ messages.push(r.message);
1609
+ continue;
1610
+ }
1611
+ }
1612
+ // Subagent-first-iteration-must-act detector. Subagents are
1613
+ // spawned to gather information for a specific goal — producing
1614
+ // prose-only output on iter 0 is always a stall, not a real
1615
+ // final answer. The earlier announce-intent + narrate detectors
1616
+ // miss when bandit-logic emits neutral reasoning + non-forward-
1617
+ // looking prose ("This is a complex task...") that doesn't
1618
+ // match either's patterns. 5/6 subagents
1619
+ // on a self-eval turn died at 0 iterations with exactly that
1620
+ // shape. One nudge per turn; if the model still won't emit a
1621
+ // tool the loop exits and the parent gets the existing
1622
+ // "subagent stalled in reasoning" error.
1623
+ if (effectiveOptions.isSubagent
1624
+ && iterations === 0
1625
+ && !subagentFirstIterNudged
1626
+ && !announceIntentNudged
1627
+ && !falseCompletionNudged
1628
+ && !hitLimit) {
1629
+ subagentFirstIterNudged = true;
1630
+ // DO NOT force think:false here. The earlier fix
1631
+ // hard-set nextCallThinkOverride = false on this
1632
+ // retry, which is correct for non-reasoning models but
1633
+ // catastrophic for bandit-logic (qwen3.6:27b): per the
1634
+ // model's training, the tool channel runs THROUGH the
1635
+ // reasoning channel — disabling thinking disables tool
1636
+ // calling entirely. Self-eval traces 2026-05-08 confirmed
1637
+ // 6+ consecutive retries with think:false producing only
1638
+ // reasoning prose, never a tool call. Now we keep the
1639
+ // model's natural think setting and only escalate the
1640
+ // prompt — give the model a concrete <tool_call> envelope
1641
+ // it can copy verbatim, with the most generic exploration
1642
+ // tool baked in. The thinking-off-recovery path at line 876
1643
+ // still fires earlier for genuinely empty/stuck responses;
1644
+ // we don't double-down here.
1645
+ emit('tool_loop:subagent_first_iter_no_tool_call', {
1646
+ iteration: iterations,
1647
+ responsePreview: finalResponse.slice(0, 240)
1648
+ });
1649
+ messages.push({
1650
+ role: 'user',
1651
+ content: 'Your first response had reasoning but emitted NO tool call — that is a hard stall for a subagent (you exist to gather information; reasoning alone produces zero output). ' +
1652
+ 'For your next response, emit a tool call. The minimum viable starting move for ANY exploration goal is:\n\n' +
1653
+ '<tool_call>{"name":"list_files","params":{"path":"."}}</tool_call>\n\n' +
1654
+ 'Copy that exact envelope as the very first thing you emit (you may keep the reasoning block before it if your model needs to think first, but the tool_call envelope MUST appear in this turn). ' +
1655
+ 'Substitute a different tool only if it\'s obviously better for the goal — `read_file` for "what does file X look like", `search_code` for "where is symbol Y", `run_command` for shell output. ' +
1656
+ 'Do NOT respond with reasoning only again. The next message you send must contain a real <tool_call> envelope.'
1657
+ });
1658
+ continue;
1659
+ }
1660
+ // Reasoning-only terminal fallback. If we got here because the
1661
+ // empty-retry / thinking-off-recovery cap was reached and the
1662
+ // model still produced only reasoning + zero actionable output,
1663
+ // the user otherwise sees nothing — just a return to the prompt.
1664
+ // Surface a clear message that names what the model intended (so
1665
+ // the user can act on it themselves) instead of leaving them
1666
+ // staring at a blank reply. Observed with bandit-logic
1667
+ // on the email-fetch task: model reasoned "I should use
1668
+ // run_command with osascript to fetch …" and emitted no tool
1669
+ // call — final response was empty after fence-strip and the
1670
+ // user saw nothing.
1671
+ //
1672
+ // The gate also covers the "regurgitated reasoning after
1673
+ // native→text channel fallback" case. Mark Portfolio
1674
+ // 2026-05-31T17-39-53 cleanup turn: native-tool path 500'd,
1675
+ // text-channel recovery prompted the model to re-emit its
1676
+ // pending action, but the model just echoed its prior
1677
+ // `bandit-reasoning` block — no tool_call, no prose, no
1678
+ // visible action for the user. The previous gate (`!finalResponse`,
1679
+ // where finalResponse = response stripped of tool_call markup
1680
+ // only) didn't trigger because the reasoning fence is not
1681
+ // tool_call markup. Widened below to also strip reasoning
1682
+ // before testing emptiness — if the response would render to
1683
+ // the user as nothing-actionable, the fallback fires and the
1684
+ // user sees what the model was thinking instead of silence.
1685
+ const reasoningStripped = response
1686
+ .replace(/<think\b[\s\S]*?<\/think\s*>/gi, '')
1687
+ .replace(/<think\b[\s\S]*$/i, '')
1688
+ .replace(/```bandit-reasoning\b[\s\S]*?```/gi, '')
1689
+ .replace(/```bandit-reasoning\b[\s\S]*$/i, '')
1690
+ .trim();
1691
+ const visibleAfterStrip = (0, tool_use_parser_1.stripToolCallMarkup)(reasoningStripped).trim();
1692
+ if (!visibleAfterStrip) {
1693
+ // Pull the last 1-2 sentences of reasoning so the user sees
1694
+ // what the model planned to do. Cap at 280 chars so the
1695
+ // fallback stays readable.
1696
+ const reasoningMatch = response.match(/<think\b[\s\S]*?<\/think\s*>/gi)?.pop() ??
1697
+ response.match(/```bandit-reasoning\b[\s\S]*?```/gi)?.pop() ??
1698
+ response;
1699
+ const reasoningText = reasoningMatch
1700
+ .replace(/<\/?think[^>]*>/gi, '')
1701
+ .replace(/```bandit-reasoning\s*\n?|```/g, '')
1702
+ .trim();
1703
+ const sentences = reasoningText.match(/[^.!?]+[.!?]/g) ?? [reasoningText];
1704
+ const tail = sentences.slice(-2).join(' ').trim().slice(-280);
1705
+ const fallback = `[Bandit stalled after reasoning without emitting a tool call — the model thought through the next step but never committed to an action. ` +
1706
+ `Last reasoning: "${tail}${tail.length === 280 ? '…' : ''}"\n\n` +
1707
+ `Try: re-prompt with the same request (often resolves on the next turn), or run the planned command yourself.]`;
1708
+ return { finalResponse: fallback, iterations, messages, hitLimit };
1709
+ }
1710
+ // Narrate-but-no-action terminal annotator. If the model ends a
1711
+ // turn with "Let me revert it:" — i.e. a forward-looking intent
1712
+ // verb followed by a DANGLING COLON and NO tool_call envelope —
1713
+ // and the inline empty-retry / narrate-no-action detector
1714
+ // already used its retry budget (consecutiveEmptyRetries >= 2)
1715
+ // so it couldn't nudge again, the user is left reading a
1716
+ // promise the model never kept. Mark Portfolio
1717
+ // 2026-05-31T17-39-53 cleanup turn: after a native→text channel
1718
+ // recovery, the model emitted "Let me revert it:" with a
1719
+ // dangling colon and no tool call; the user saw the prose end
1720
+ // and waited for an action that never came. Append a clear
1721
+ // suffix so the unfulfilled intent reads as a stall, not as
1722
+ // the assistant's last word.
1723
+ //
1724
+ // The trailing colon is the smoking gun — it's the
1725
+ // grammatical signal "what comes next is the thing I'm about
1726
+ // to do". Without it ("Done. Let me know if you'd like me to
1727
+ // push the changes.") the response is a normal final answer
1728
+ // that happens to contain narrate verbs, and the annotator
1729
+ // would be a false positive.
1730
+ // The trailing colon + intent phrase combination is the
1731
+ // smoking gun. We DON'T also require NARRATE_VERB_RE here:
1732
+ // the existing inline detector's verb list misses "revert"
1733
+ // (Portfolio 2026-05-31) and would miss any other one-off
1734
+ // action verb a model might use. The colon alone is rare
1735
+ // enough in a legit final answer that pairing it with
1736
+ // "let me" / "I'll" / "we'll" / etc. is specific enough.
1737
+ //
1738
+ // Period-terminated variant (added 2026-06-03 after Mark's
1739
+ // gregoryhite-site run): the model ended with "Let me fix
1740
+ // all three project cards at once." — full sentence, full
1741
+ // stop, no colon. Both prefill and thinking-off recovery
1742
+ // had been spent earlier in the turn so the user saw the
1743
+ // narrate prose as the final answer with no annotation that
1744
+ // it represented a stall. Periods are MUCH more common than
1745
+ // colons in legit answers ("Done.", "Let me know if you'd
1746
+ // like me to push the changes."), so the period path
1747
+ // requires the STRICTER pair: NARRATE_INTENT_RE AND
1748
+ // NARRATE_VERB_RE both matching the tail clause. "Let me
1749
+ // know if you'd like…" hits intent but no action verb;
1750
+ // "Let me fix the cards" hits both.
1751
+ const terminalStripped = reasoningStripped;
1752
+ const endsWithColon = terminalStripped.endsWith(':');
1753
+ const endsWithPeriod = /\.["']?$/.test(terminalStripped);
1754
+ if ((endsWithColon || endsWithPeriod) && terminalStripped.length < 600) {
1755
+ // Extract the LAST sentence (text after the final non-trailing
1756
+ // sentence terminator). For period-ending responses we must
1757
+ // isolate just the closing clause — testing the whole response
1758
+ // would leak action verbs from earlier "Done. I updated the
1759
+ // file." prose into the gate and trigger false positives on
1760
+ // legit sign-offs like "Let me know if you'd like X."
1761
+ const sentenceSplit = terminalStripped
1762
+ .split(/[.!?]+\s+/)
1763
+ .map((s) => s.trim())
1764
+ .filter((s) => s.length > 0);
1765
+ const terminalTail = (sentenceSplit[sentenceSplit.length - 1] ?? terminalStripped).slice(-200);
1766
+ const intentHit = NARRATE_INTENT_RE.test(terminalTail);
1767
+ // Period path needs both intent + action verb. Colon path keeps the
1768
+ // original looser gate (colon alone is rare enough).
1769
+ const verbGateMet = endsWithColon ? true : NARRATE_VERB_RE.test(terminalTail);
1770
+ if (intentHit && verbGateMet) {
1771
+ const annotated = `${finalResponse}\n\n` +
1772
+ `[Bandit announced this action but did not emit the tool call — the turn ended without the planned change. ` +
1773
+ `If this came after retries (look for "Upstream hiccup" or "Native tool call failed" status messages), the upstream model errored mid-turn and the recovery prompt didn't land the action. ` +
1774
+ `Re-prompt with the same request to retry, or perform the action yourself.]`;
1775
+ return { finalResponse: annotated, iterations, messages, hitLimit };
1776
+ }
1777
+ }
1778
+ return { finalResponse, iterations, messages, hitLimit };
1779
+ }
1780
+ // Parse and execute all tool calls in this response.
1781
+ let toolCalls = (0, tool_use_parser_1.parseToolCalls)(response);
1782
+ emit('tool_loop:tool_calls', { iteration: iterations, tools: toolCalls.map(t => t.name) });
1783
+ // Repeated-todo-write circuit breaker. pburg-bowl (Apr 21) burned 3
1784
+ // consecutive iterations on `todo_write` revisions before doing any
1785
+ // real work. If this iteration's tools are ONLY todo_write (or
1786
+ // todo_write + another todo_write) AND the previous N-1 iterations
1787
+ // were also todo-only, drop the redundant todo_write calls and
1788
+ // inject a nudge telling the model to execute. We keep non-todo
1789
+ // calls in the same iteration — the breaker only strips redundant
1790
+ // planning, never real work.
1791
+ const todoOnly = toolCalls.length > 0 && toolCalls.every(t => t.name === 'todo_write');
1792
+ // apply_edit-only iteration detector. Mirrors todoOnly
1793
+ // shape; tracks how many consecutive iterations spent every tool
1794
+ // slot on apply_edit (no read, search, run_command, etc.) so we
1795
+ // can nudge toward batching after the model burns through 4 in a
1796
+ // row. Doesn't fire on mixed iterations (a read + 2 apply_edits
1797
+ // is normal investigative work).
1798
+ const applyEditOnly = toolCalls.length > 0 && toolCalls.every(t => t.name === 'apply_edit');
1799
+ // Feed the rolling health window so the iteration-cap
1800
+ // extension below knows whether the model is making clear
1801
+ // progress. We push true ONLY when this iteration produced
1802
+ // tool calls AND wasn't purely a planning churn (todo-only).
1803
+ // Empty iterations (parse failures, prose-only) push false.
1804
+ recentIterationsHadTools.push(toolCalls.length > 0 && !todoOnly);
1805
+ while (recentIterationsHadTools.length > RECENT_HEALTH_WINDOW) {
1806
+ recentIterationsHadTools.shift();
1807
+ }
1808
+ // Iterations that emitted NO tool calls (parse failure — model tried
1809
+ // to generate tool-call JSON that didn't round-trip) are neither
1810
+ // "todo-only" nor "real work." Don't let them reset the consecutive
1811
+ // counter — otherwise a Qwen turn like
1812
+ // iter 3: todo_write
1813
+ // iter 4: (empty — bad JSON)
1814
+ // iter 5: todo_write
1815
+ // iter 6: (empty — bad JSON)
1816
+ // iter 7: todo_write ...
1817
+ // never accumulates to the threshold and the churn nudge never
1818
+ // fires. Observed on S3Api with bandit-logic
1819
+ // (Qwen 2.5 Coder 32B via native tool calling).
1820
+ const iterationHadRealWork = toolCalls.length > 0 && !todoOnly;
1821
+ if (todoOnly) {
1822
+ consecutiveTodoOnlyIterations++;
1823
+ }
1824
+ else if (iterationHadRealWork) {
1825
+ consecutiveTodoOnlyIterations = 0;
1826
+ // Re-arm the nudge once the model has executed real work. Without
1827
+ // this, a single churn early in the turn bans further todo_write
1828
+ // calls even when the model has legitimately finished a step and
1829
+ // wants to mark it completed — leaving the Plan stuck with every
1830
+ // item in the pending state (observed on S3Api).
1831
+ todoChurnNudged = false;
1832
+ }
1833
+ // apply_edit-only streak tracking. Increments only when
1834
+ // the whole iteration was apply_edit; resets on any mixed iter
1835
+ // (read + edit, run + edit, etc.) since those are normal
1836
+ // investigative work, not a serial-error-fix loop.
1837
+ if (applyEditOnly) {
1838
+ consecutiveApplyEditOnlyIterations++;
1839
+ }
1840
+ else if (toolCalls.length > 0) {
1841
+ consecutiveApplyEditOnlyIterations = 0;
1842
+ applyEditBatchNudged = false;
1843
+ }
1844
+ // Else: empty toolCalls iteration — preserve counter state. The
1845
+ // parse-failure case is handled separately below (repeat-detector).
1846
+ if (todoOnly && consecutiveTodoOnlyIterations >= TODO_ONLY_LIMIT && !todoChurnNudged) {
1847
+ todoChurnNudged = true;
1848
+ emit('tool_loop:todo_churn_nudge', {
1849
+ iteration: iterations,
1850
+ consecutive: consecutiveTodoOnlyIterations
1851
+ });
1852
+ // Drop the redundant todo_write calls for this iteration so the
1853
+ // breaker doesn't just get absorbed into another no-op. The model
1854
+ // still "saw" its own todo_write in the assistant response, but
1855
+ // we skip execution and inject a nudge as the next user message.
1856
+ toolCalls = [];
1857
+ messages.push({
1858
+ role: 'user',
1859
+ content: `You have revised the plan in ${consecutiveTodoOnlyIterations + 1} consecutive iterations without executing any step. ` +
1860
+ 'Execute the first pending task now using a concrete tool — `search_code`, `read_file`, `apply_edit`, `replace_range`, `write_file`, or `run_command`. ' +
1861
+ 'Once a task is actually DONE (tool call succeeded), you may call `todo_write` again to mark it completed — but not to re-plan. ' +
1862
+ 'If you cannot identify a next step, respond to the user with a short honest explanation and stop.'
1863
+ });
1864
+ iterations++;
1865
+ continue;
1866
+ }
1867
+ // apply_edit-batch nudge. Fires once per turn when the
1868
+ // model has spent APPLY_EDIT_ONLY_LIMIT (4) consecutive iterations
1869
+ // doing nothing but apply_edit calls. Unlike the todo-churn nudge
1870
+ // we DO NOT drop the current iteration's calls — those edits are
1871
+ // real work, just slow work. We only inject the nudge as an
1872
+ // additional user message so the NEXT iteration considers
1873
+ // batching. Observed on a real 17-error
1874
+ // linter-fix turn that hit the iteration cap with 5 errors still
1875
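+ // outstanding. NOTE(review): the `iterations++; continue;` at the end of this branch skips normalization and dispatch, so this iteration's apply_edit calls are NOT executed despite the claim above — confirm intent.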
1876
+ if (applyEditOnly && consecutiveApplyEditOnlyIterations >= APPLY_EDIT_ONLY_LIMIT && !applyEditBatchNudged) {
1877
+ applyEditBatchNudged = true;
1878
+ emit('tool_loop:apply_edit_batch_nudge', {
1879
+ iteration: iterations,
1880
+ consecutive: consecutiveApplyEditOnlyIterations
1881
+ });
1882
+ messages.push({
1883
+ role: 'user',
1884
+ content: `You have spent ${consecutiveApplyEditOnlyIterations} consecutive iterations on apply_edit alone. ` +
1885
+ 'If these are mechanical fixes of the same shape (one type annotation, one rename, one import path, one missing semicolon per call), STOP doing them one at a time — you will exhaust the iteration budget before the file is clean.\n' +
1886
+ '\n' +
1887
+ 'Better tactics, in order of preference:\n' +
1888
+ '1. **`apply_patch` with multiple hunks** — one tool call lands every fix at once. You\'ve already read the files; the find context is in your buffer.\n' +
1889
+ '2. **`replace_range` for one large same-file region** — use the line numbers from `read_file` and replace the whole method/component block at once.\n' +
1890
+ '3. **A single broader-context `apply_edit`** — pick a `find` string that spans several adjacent edits and supply the corrected block as `replace`. Three small fixes in the same 10-line region collapse to one call.\n' +
1891
+ '4. **For 5+ fixes in one file**: re-read the file once, then `write_file` the corrected version. Faster than incrementally patching.\n' +
1892
+ '\n' +
1893
+ 'Pick a tactic and reach for it next iteration. Do not just emit another single-line apply_edit.'
1894
+ });
1895
+ iterations++;
1896
+ continue;
1897
+ }
1898
+ // Intra-iteration normalization: byte-identical dedup, foreground-
1899
+ // task fanout cap, per-iteration parallel cap, per-turn total cap.
1900
+ // Each step emits its own telemetry event so hosts can surface
1901
+ // drops in the UI. See loop/toolCallNormalize.ts.
1902
+ const normalized = (0, toolCallNormalize_1.normalizeToolCallBatch)({
1903
+ toolCalls,
1904
+ iteration: iterations,
1905
+ maxParallelTools,
1906
+ maxTotalTools,
1907
+ totalToolsExecuted,
1908
+ emit
1909
+ });
1910
+ toolCalls = normalized.accepted;
1911
+ const droppedForegroundTaskCalls = normalized.droppedForegroundTaskCalls;
1912
+ const droppedToolCalls = normalized.droppedParallelCap;
1913
+ totalToolsExecuted += toolCalls.length;
1914
+ // Per-tool execution — repeat-breaker, registry lookup,
1915
+ // beforeToolExecute gate, run, file-tracking + edit counting,
1916
+ // event emission. See loop/singleToolExecute.ts.
1917
+ const dispatchOne = (0, singleToolExecute_1.createToolDispatcher)({
1918
+ registry: this.registry,
1919
+ ctx: this.ctx,
1920
+ beforeToolExecute,
1921
+ emit,
1922
+ recentCallKeys,
1923
+ repeatLimit: REPEAT_LIMIT,
1924
+ filesReadThisTurn,
1925
+ filesWrittenThisTurn,
1926
+ isFileEditTool,
1927
+ onEditToolSucceeded: () => { editToolsInvoked++; }
1928
+ });
1929
+ // Output-budget gate + parallel/serial dispatch. Strong models
1930
+ // pass `outputBudgetTokens: Infinity` and never serialise;
1931
+ // small/medium local models trip the gate exactly when their
1932
+ // assistant turn is at risk of tail malformation. See
1933
+ // loop/parallelExecute.ts.
1934
+ const toolResults = await (0, parallelExecute_1.executeParallelBatch)({
1935
+ toolCalls,
1936
+ dispatchOne,
1937
+ outputBudgetTokens,
1938
+ outputBudgetRatio,
1939
+ emit,
1940
+ iteration: iterations,
1941
+ signal
1942
+ });
1943
+ // Track whether ANY tool errored this iteration so the next
1944
+ // iteration's no-tool-call branch can fire the recovery nudge if
1945
+ // the model abandons the request rather than retrying.
1946
+ lastIterationHadToolError = toolResults.some((r) => r.isError === true);
1947
+ // Inject tool results as the next user message.
1948
+ let resultsMessage = (0, tool_use_parser_1.buildToolResultsMessage)(toolResults);
1949
+ if (droppedToolCalls > 0) {
1950
+ // Synthetic system-style note appended to the tool-result payload.
1951
+ // Keeps the model from re-emitting the dropped calls verbatim on
1952
+ // the next iteration: it sees "X were dropped, narrow your query"
1953
+ // alongside the results from the kept calls.
1954
+ resultsMessage +=
1955
+ `\n\n[Note: you emitted ${droppedToolCalls + toolCalls.length} tool calls in one iteration; ` +
1956
+ `only the first ${toolCalls.length} were executed. Do not re-issue duplicates — ` +
1957
+ `instead, read the results above and pick a single most-promising next action.]`;
1958
+ }
1959
+ if (droppedForegroundTaskCalls > 0) {
1960
+ resultsMessage +=
1961
+ `\n\n[Note: you emitted ${droppedForegroundTaskCalls + 1} foreground task subagents in one iteration; ` +
1962
+ `only the first one was executed. Foreground subagents block the parent agent and make the UI look stuck. ` +
1963
+ `For repo overviews, synthesize from direct reads/searches first. For truly parallel audits, re-issue extra ` +
1964
+ `subagents with run_in_background="true" so the parent can keep responding.]`;
1965
+ }
1966
+ messages.push({ role: 'user', content: resultsMessage });
1967
+ // Fired-and-forgotten guard. The model just spawned ≥2 background
1968
+ // subagents in this iteration. Without a nudge, the next iteration
1969
+ // typically polls `check_task` on tasks that haven't started (a
1970
+ // wasted iteration) or replays the same exploration in parallel —
1971
+ // either way burning the parent's context budget on work the
1972
+ // subagents will report back via the auto-inject path. See the
1973
+ // `firedAndForgottenNudged` declaration for the trace this is
1974
+ // patterned on. One nudge per turn.
1975
+ if (!firedAndForgottenNudged) {
1976
+ const bgSpawns = toolCalls.filter((tc, idx) => tc.name === 'task' &&
1977
+ String(tc.params.run_in_background ?? '').toLowerCase() === 'true' &&
1978
+ // Only count successful spawns — a failed task tool result is
1979
+ // its own signal and the parent's already going to retry or
1980
+ // pivot.
1981
+ !toolResults[idx]?.isError);
1982
+ if (bgSpawns.length >= 2) {
1983
+ firedAndForgottenNudged = true;
1984
+ const goalLines = bgSpawns
1985
+ .map((tc) => {
1986
+ const g = typeof tc.params.goal === 'string' ? tc.params.goal : '';
1987
+ const trimmed = g.length > 90 ? g.slice(0, 90).trimEnd() + '…' : g;
1988
+ return trimmed ? `- ${trimmed}` : '';
1989
+ })
1990
+ .filter(Boolean)
1991
+ .join('\n');
1992
+ emit('tool_loop:fired_and_forgotten_nudge', {
1993
+ iteration: iterations,
1994
+ backgroundSpawns: bgSpawns.length
1995
+ });
1996
+ messages.push({
1997
+ role: 'user',
1998
+ content: `You just spawned ${bgSpawns.length} background subagents:\n${goalLines}\n\n` +
1999
+ 'Do NOT do those same explorations yourself in the next iteration — the subagents will deliver their synopses via the auto-inject path on a later turn. ' +
2000
+ 'Choose ONE of: ' +
2001
+ '(a) work on a different, independent piece of the task that those subagents are NOT covering, ' +
2002
+ '(b) terminate this turn now and wait for the synopses to land on the next turn — preferred when the user is waiting on a synthesis built from those subagent results, ' +
2003
+ '(c) call `check_task` once on a specific id only when its result is the literal next blocking input you need. ' +
2004
+ 'Do not poll all tasks at once immediately after spawning — they have not started yet and the call returns "still running" for every one of them.'
2005
+ });
2006
+ }
2007
+ }
2008
+ // Todo-progress tracking for the stale-plan nudge. Reset the edit
2009
+ // counter on any todo_write call (model updated its plan); increment
2010
+ // on successful edit calls. Native-tools-capable models generally
2011
+ // maintain plans without prompting so we skip the tracking there.
2012
+ if (!nativeTools) {
2013
+ for (let t = 0; t < toolCalls.length; t++) {
2014
+ const tc = toolCalls[t];
2015
+ const res = toolResults[t];
2016
+ if (tc.name === 'todo_write') {
2017
+ lastTodoWriteIter = iterations;
2018
+ editsSinceLastTodo = 0;
2019
+ }
2020
+ else if (isFileEditTool(tc.name) && res && !res.isError) {
2021
+ editsSinceLastTodo++;
2022
+ }
2023
+ }
2024
+ // One-shot stale-plan nudge: the model set up a plan earlier but
2025
+ // has since completed multiple edits without updating it. Fires
2026
+ // at most once per turn — if the model ignores it, we don't hound.
2027
+ if (!todoProgressNudged
2028
+ && lastTodoWriteIter >= 0
2029
+ && iterations - lastTodoWriteIter >= TODO_PROGRESS_STALE_DELTA
2030
+ && editsSinceLastTodo >= TODO_PROGRESS_EDIT_THRESHOLD) {
2031
+ todoProgressNudged = true;
2032
+ emit('tool_loop:todo_progress_nudge', {
2033
+ iteration: iterations,
2034
+ editsSinceLastTodo,
2035
+ iterationsSinceLastTodo: iterations - lastTodoWriteIter
2036
+ });
2037
+ messages.push({
2038
+ role: 'user',
2039
+ content: 'You set up a plan with `todo_write` earlier but have since completed ' +
2040
+ `${editsSinceLastTodo} edit${editsSinceLastTodo === 1 ? '' : 's'} without updating it. ` +
2041
+ 'Call `todo_write` now with the current status — mark finished items as `completed` and leave remaining items as `pending`. ' +
2042
+ "The Plan block in the user's UI mirrors your last `todo_write`, so skipping this leaves them looking at a stale checklist while real work has landed."
2043
+ });
2044
+ }
2045
+ }
2046
+ iterations++;
2047
+ }
2048
+ }
2049
+ }
2050
+ exports.ToolUseLoop = ToolUseLoop;
2051
+ /**
2052
+ * Convenience factory. Creates a loop with the given registry and context. Equivalent to `new ToolUseLoop(registry, ctx, options)`: all three arguments are forwarded to the constructor unchanged and the newly constructed instance is returned.
2053
+ */
2054
+ function createToolUseLoop(registry, ctx, options) {
2055
+ return new ToolUseLoop(registry, ctx, options);
2056
+ }
2057
+ //# sourceMappingURL=tool-use-loop.js.map