@burtson-labs/agent-core 1.6.16 → 1.6.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -0
- package/dist/index.d.ts +3 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +8 -1
- package/dist/index.js.map +1 -1
- package/dist/mcp/activation.js +16 -8
- package/dist/mcp/activation.js.map +1 -1
- package/dist/mcp/clientPool.js +40 -22
- package/dist/mcp/clientPool.js.map +1 -1
- package/dist/mcp/server.js +16 -10
- package/dist/mcp/server.js.map +1 -1
- package/dist/mcp/toolAdapter.js +21 -11
- package/dist/mcp/toolAdapter.js.map +1 -1
- package/dist/providers/deterministic-provider.d.ts +1 -1
- package/dist/providers/deterministic-provider.d.ts.map +1 -1
- package/dist/runtime/AgentRuntime.d.ts +2 -2
- package/dist/runtime/AgentRuntime.d.ts.map +1 -1
- package/dist/security/secretPatterns.js +4 -2
- package/dist/security/secretPatterns.js.map +1 -1
- package/dist/telemetry/otlpExporter.d.ts +69 -0
- package/dist/telemetry/otlpExporter.d.ts.map +1 -0
- package/dist/telemetry/otlpExporter.js +321 -0
- package/dist/telemetry/otlpExporter.js.map +1 -0
- package/dist/tools/ask-user-tool.js +8 -4
- package/dist/tools/ask-user-tool.js.map +1 -1
- package/dist/tools/compactMessages.js +6 -3
- package/dist/tools/compactMessages.js.map +1 -1
- package/dist/tools/core-tools.js +151 -81
- package/dist/tools/core-tools.js.map +1 -1
- package/dist/tools/git-tools.js +22 -11
- package/dist/tools/git-tools.js.map +1 -1
- package/dist/tools/language-adapters.d.ts +1 -1
- package/dist/tools/language-adapters.d.ts.map +1 -1
- package/dist/tools/language-adapters.js +36 -18
- package/dist/tools/language-adapters.js.map +1 -1
- package/dist/tools/loop/finalAnswerNudges.js +12 -6
- package/dist/tools/loop/finalAnswerNudges.js.map +1 -1
- package/dist/tools/loop/goalAnchor.d.ts.map +1 -1
- package/dist/tools/loop/goalAnchor.js +2 -1
- package/dist/tools/loop/goalAnchor.js.map +1 -1
- package/dist/tools/loop/llmStream.js +11 -8
- package/dist/tools/loop/llmStream.js.map +1 -1
- package/dist/tools/loop/loopShared.d.ts +20 -0
- package/dist/tools/loop/loopShared.d.ts.map +1 -0
- package/dist/tools/loop/loopShared.js +105 -0
- package/dist/tools/loop/loopShared.js.map +1 -0
- package/dist/tools/loop/parallelExecute.d.ts +1 -1
- package/dist/tools/loop/parallelExecute.js +2 -1
- package/dist/tools/loop/parallelExecute.js.map +1 -1
- package/dist/tools/loop/singleToolExecute.js +8 -4
- package/dist/tools/loop/singleToolExecute.js.map +1 -1
- package/dist/tools/loop/turnSetup.js +9 -6
- package/dist/tools/loop/turnSetup.js.map +1 -1
- package/dist/tools/ocr.d.ts.map +1 -1
- package/dist/tools/ocr.js +7 -5
- package/dist/tools/ocr.js.map +1 -1
- package/dist/tools/post-edit-checks.js +25 -13
- package/dist/tools/post-edit-checks.js.map +1 -1
- package/dist/tools/skill-loader.d.ts +1 -1
- package/dist/tools/skill-loader.d.ts.map +1 -1
- package/dist/tools/skill-loader.js +14 -7
- package/dist/tools/skill-loader.js.map +1 -1
- package/dist/tools/skill-registry.js +2 -1
- package/dist/tools/skill-registry.js.map +1 -1
- package/dist/tools/skills/mail-search-skill.js +16 -9
- package/dist/tools/skills/mail-search-skill.js.map +1 -1
- package/dist/tools/skills/plan-skill.js +4 -2
- package/dist/tools/skills/plan-skill.js.map +1 -1
- package/dist/tools/skills/semantic-search-skill.js +12 -6
- package/dist/tools/skills/semantic-search-skill.js.map +1 -1
- package/dist/tools/skills/test-gen-skill.js +8 -4
- package/dist/tools/skills/test-gen-skill.js.map +1 -1
- package/dist/tools/tool-registry.d.ts +17 -0
- package/dist/tools/tool-registry.d.ts.map +1 -1
- package/dist/tools/tool-registry.js +110 -30
- package/dist/tools/tool-registry.js.map +1 -1
- package/dist/tools/tool-use-loop.d.ts +16 -8
- package/dist/tools/tool-use-loop.d.ts.map +1 -1
- package/dist/tools/tool-use-loop.js +144 -160
- package/dist/tools/tool-use-loop.js.map +1 -1
- package/dist/tools/tool-use-parser.d.ts +33 -0
- package/dist/tools/tool-use-parser.d.ts.map +1 -1
- package/dist/tools/tool-use-parser.js +105 -28
- package/dist/tools/tool-use-parser.js.map +1 -1
- package/dist/tools/toolAvailabilityDetector.d.ts +0 -24
- package/dist/tools/toolAvailabilityDetector.d.ts.map +1 -1
- package/dist/tools/toolAvailabilityDetector.js +26 -12
- package/dist/tools/toolAvailabilityDetector.js.map +1 -1
- package/dist/tools/unified-patch.js +16 -8
- package/dist/tools/unified-patch.js.map +1 -1
- package/dist/utils/event-emitter.d.ts +1 -1
- package/dist/utils/event-emitter.d.ts.map +1 -1
- package/package.json +20 -1
|
@@ -18,12 +18,7 @@
|
|
|
18
18
|
* the host should use the Ollama `tools: [...]` field instead.
|
|
19
19
|
*/
|
|
20
20
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
21
|
-
exports.ToolUseLoop = void 0;
|
|
22
|
-
exports.sleep = sleep;
|
|
23
|
-
exports.isRetryableLlmError = isRetryableLlmError;
|
|
24
|
-
exports.tagRetryableLlmError = tagRetryableLlmError;
|
|
25
|
-
exports.summarizeLlmError = summarizeLlmError;
|
|
26
|
-
exports.isContinuationPrompt = isContinuationPrompt;
|
|
21
|
+
exports.ToolUseLoop = exports.isContinuationPrompt = exports.summarizeLlmError = exports.tagRetryableLlmError = exports.isRetryableLlmError = exports.sleep = void 0;
|
|
27
22
|
exports.isNoticingPrompt = isNoticingPrompt;
|
|
28
23
|
exports.createToolUseLoop = createToolUseLoop;
|
|
29
24
|
const tool_use_parser_1 = require("./tool-use-parser");
|
|
@@ -36,87 +31,16 @@ const parallelExecute_1 = require("./loop/parallelExecute");
|
|
|
36
31
|
const goalAnchor_1 = require("./loop/goalAnchor");
|
|
37
32
|
const finalAnswerNudges_1 = require("./loop/finalAnswerNudges");
|
|
38
33
|
const toolAvailabilityDetector_1 = require("./toolAvailabilityDetector");
|
|
34
|
+
const loopShared_1 = require("./loop/loopShared");
|
|
35
|
+
Object.defineProperty(exports, "sleep", { enumerable: true, get: function () { return loopShared_1.sleep; } });
|
|
36
|
+
Object.defineProperty(exports, "isRetryableLlmError", { enumerable: true, get: function () { return loopShared_1.isRetryableLlmError; } });
|
|
37
|
+
Object.defineProperty(exports, "tagRetryableLlmError", { enumerable: true, get: function () { return loopShared_1.tagRetryableLlmError; } });
|
|
38
|
+
Object.defineProperty(exports, "summarizeLlmError", { enumerable: true, get: function () { return loopShared_1.summarizeLlmError; } });
|
|
39
|
+
Object.defineProperty(exports, "isContinuationPrompt", { enumerable: true, get: function () { return loopShared_1.isContinuationPrompt; } });
|
|
39
40
|
const FILE_EDIT_TOOL_NAMES = new Set(['write_file', 'apply_edit', 'replace_range', 'apply_patch']);
|
|
40
41
|
function isFileEditTool(name) {
|
|
41
42
|
return FILE_EDIT_TOOL_NAMES.has(name);
|
|
42
43
|
}
|
|
43
|
-
function sleep(ms) {
|
|
44
|
-
return new Promise((resolve) => setTimeout(resolve, ms));
|
|
45
|
-
}
|
|
46
|
-
function getErrorCode(error) {
|
|
47
|
-
return typeof error === 'object' && error !== null && 'code' in error
|
|
48
|
-
? String(error.code ?? '')
|
|
49
|
-
: undefined;
|
|
50
|
-
}
|
|
51
|
-
function getErrorMessage(error) {
|
|
52
|
-
return error instanceof Error ? error.message : String(error);
|
|
53
|
-
}
|
|
54
|
-
function isRetryableLlmError(error) {
|
|
55
|
-
const code = getErrorCode(error);
|
|
56
|
-
if (code === 'USER_ABORT')
|
|
57
|
-
return false;
|
|
58
|
-
const message = getErrorMessage(error);
|
|
59
|
-
if (/\b429\b|rate limit/i.test(message))
|
|
60
|
-
return false;
|
|
61
|
-
return (code === 'WATCHDOG' ||
|
|
62
|
-
/\b5\d\d\b/.test(message) ||
|
|
63
|
-
/Upstream model request failed/i.test(message) ||
|
|
64
|
-
/ECONNREFUSED|ECONNRESET|ETIMEDOUT|EAI_AGAIN|socket hang up|fetch failed|network error|terminated|UND_ERR/i.test(message));
|
|
65
|
-
}
|
|
66
|
-
function tagRetryableLlmError(error) {
|
|
67
|
-
if (error instanceof Error) {
|
|
68
|
-
const tagged = error;
|
|
69
|
-
if (!tagged.code)
|
|
70
|
-
tagged.code = 'UPSTREAM_MODEL';
|
|
71
|
-
}
|
|
72
|
-
}
|
|
73
|
-
function summarizeLlmError(error) {
|
|
74
|
-
const message = getErrorMessage(error).replace(/\s+/g, ' ').trim();
|
|
75
|
-
return message.length > 180 ? `${message.slice(0, 177)}...` : message;
|
|
76
|
-
}
|
|
77
|
-
/**
|
|
78
|
-
* Detects "keep going" / "continue" / "yes" style prompts that
|
|
79
|
-
* carry no real goal content. The goal-anchor block uses the most recent
|
|
80
|
-
* user message as the recall text; when that text is "good lets keep
|
|
81
|
-
* going" the anchor degenerates into "remind yourself to keep going",
|
|
82
|
-
* which gives the model nothing to anchor on after 20 iterations of
|
|
83
|
-
* drift. Real on a 60-iteration linter-fix
|
|
84
|
-
* turn: every anchor injection cited "good lets keep going" as the
|
|
85
|
-
* goal. Detector lets callers walk back to a prior substantive prompt
|
|
86
|
-
* instead.
|
|
87
|
-
*
|
|
88
|
-
* Length cap (60 chars) + normalized-phrase match keeps false positives
|
|
89
|
-
* down — a sentence like "keep going on the auth refactor for the
|
|
90
|
-
* user-service" is longer than 60 chars and reads as a real goal, so it
|
|
91
|
-
* stays a goal.
|
|
92
|
-
*/
|
|
93
|
-
const CONTINUATION_PROMPT_PHRASES = new Set([
|
|
94
|
-
'continue', 'keep going', 'go on', 'proceed', 'next', 'more',
|
|
95
|
-
'please continue', 'carry on', 'finish', 'finish it', 'finish up', 'wrap up', 'wrap it up',
|
|
96
|
-
'good', 'great', 'nice', 'cool', 'sweet', 'perfect', 'ok', 'okay', 'k', 'yes', 'y', 'yep', 'yeah', 'ack', 'done',
|
|
97
|
-
"let's continue", 'lets continue', "let's keep going", 'lets keep going',
|
|
98
|
-
'good keep going', 'good lets keep going', "good let's keep going",
|
|
99
|
-
'good continue', 'ok continue', 'okay continue'
|
|
100
|
-
]);
|
|
101
|
-
function isContinuationPrompt(text) {
|
|
102
|
-
const trimmed = text.trim();
|
|
103
|
-
if (trimmed.length === 0 || trimmed.length > 60)
|
|
104
|
-
return false;
|
|
105
|
-
// Normalize: lowercase, drop non-word/space punctuation, collapse whitespace.
|
|
106
|
-
const norm = trimmed
|
|
107
|
-
.toLowerCase()
|
|
108
|
-
.replace(/[^\w\s']/g, ' ')
|
|
109
|
-
.replace(/\s+/g, ' ')
|
|
110
|
-
.trim();
|
|
111
|
-
if (CONTINUATION_PROMPT_PHRASES.has(norm))
|
|
112
|
-
return true;
|
|
113
|
-
// Permit "please <phrase>" and "<phrase> please" wrappings.
|
|
114
|
-
for (const phrase of CONTINUATION_PROMPT_PHRASES) {
|
|
115
|
-
if (norm === `please ${phrase}` || norm === `${phrase} please`)
|
|
116
|
-
return true;
|
|
117
|
-
}
|
|
118
|
-
return false;
|
|
119
|
-
}
|
|
120
44
|
/**
|
|
121
45
|
* "Noticing prompt" detector. Catches user messages that are asking
|
|
122
46
|
* about state ("are we using these?", "did you update X?", "where's
|
|
@@ -124,7 +48,7 @@ function isContinuationPrompt(text) {
|
|
|
124
48
|
* work. These signal that the user spotted a gap in the prior turn
|
|
125
49
|
* and wants the agent to address it — NOT continue the prior plan.
|
|
126
50
|
*
|
|
127
|
-
* Real failure mode captured 2026-05-25 on a
|
|
51
|
+
* Real failure mode captured 2026-05-25 on a local React refactor:
|
|
128
52
|
* user asked "I dont think we actually are using these new files are
|
|
129
53
|
* we?" after the agent wrote data files but never wired them into
|
|
130
54
|
* App.jsx. Bandit read the question as a generic "keep going" prompt,
|
|
@@ -139,8 +63,9 @@ function isContinuationPrompt(text) {
|
|
|
139
63
|
*/
|
|
140
64
|
function isNoticingPrompt(text) {
|
|
141
65
|
const trimmed = (text || '').trim();
|
|
142
|
-
if (trimmed.length === 0 || trimmed.length > 220)
|
|
66
|
+
if (trimmed.length === 0 || trimmed.length > 220) {
|
|
143
67
|
return false;
|
|
68
|
+
}
|
|
144
69
|
const norm = trimmed.toLowerCase().replace(/[^\w\s'?-]/g, ' ').replace(/\s+/g, ' ').trim();
|
|
145
70
|
// Stems that introduce a noticing/clarifying question. Anchored to
|
|
146
71
|
// the start of the message so a paragraph mentioning "are we"
|
|
@@ -163,8 +88,9 @@ function isNoticingPrompt(text) {
|
|
|
163
88
|
/^wait\b/, // "wait — what about Y?"
|
|
164
89
|
/^(?:i'?m|am\s+i)\s+(?:missing|seeing|reading)\b/,
|
|
165
90
|
];
|
|
166
|
-
if (!STEMS.some((re) => re.test(norm)))
|
|
91
|
+
if (!STEMS.some((re) => re.test(norm))) {
|
|
167
92
|
return false;
|
|
93
|
+
}
|
|
168
94
|
// Has to contain a question mark OR a concern modal. Lots of false
|
|
169
95
|
// matches without — e.g. "are we" mid-sentence in a feature request.
|
|
170
96
|
const hasQuestion = trimmed.includes('?');
|
|
@@ -257,10 +183,17 @@ class ToolUseLoop {
|
|
|
257
183
|
// explicit "this is a recovery attempt — answer the original goal"
|
|
258
184
|
// framing succeeds. Last resort before terminal throw.
|
|
259
185
|
let finalAnchorRetryUsed = false;
|
|
260
|
-
const textToolBlock =
|
|
186
|
+
const textToolBlock = effectiveOptions.compactToolBlock
|
|
187
|
+
? this.registry.buildCompactSystemPromptBlock()
|
|
188
|
+
: this.registry.buildSystemPromptBlock();
|
|
189
|
+
// Lowercased registered tool names — used by the narrated-call
|
|
190
|
+
// detector to anchor on "I call <real tool>" with near-zero false
|
|
191
|
+
// positives.
|
|
192
|
+
const registeredToolNames = new Set(this.registry.getAll().map(t => t.name.toLowerCase()));
|
|
261
193
|
const buildFullSystemPrompt = (useNativeTools) => {
|
|
262
|
-
if (useNativeTools)
|
|
194
|
+
if (useNativeTools) {
|
|
263
195
|
return systemPrompt ?? '';
|
|
196
|
+
}
|
|
264
197
|
return systemPrompt
|
|
265
198
|
? `${systemPrompt}\n\n${textToolBlock}`
|
|
266
199
|
: textToolBlock;
|
|
@@ -278,7 +211,7 @@ class ToolUseLoop {
|
|
|
278
211
|
// window and the model can drift to a related-but-different topic.
|
|
279
212
|
// Walks back through continuation tokens ("keep going", "yes") to
|
|
280
213
|
// the most recent SUBSTANTIVE prompt. See loop/turnSetup.ts.
|
|
281
|
-
|
|
214
|
+
const { originalGoal, priorUserPromptCount } = (0, turnSetup_1.resolveTurnGoal)({ seedMessages });
|
|
282
215
|
// Track the iteration we last anchored on rather than a boolean
|
|
283
216
|
// so we can re-fire when the model pivots AGAIN later in a long
|
|
284
217
|
// turn. -1 means "never anchored." Re-fire is gated by the
|
|
@@ -287,8 +220,9 @@ class ToolUseLoop {
|
|
|
287
220
|
// continued without resolution for several more iterations.
|
|
288
221
|
let lastGoalAnchorIteration = -1;
|
|
289
222
|
for (const msg of seedMessages) {
|
|
290
|
-
if (msg.role === 'system')
|
|
223
|
+
if (msg.role === 'system') {
|
|
291
224
|
continue;
|
|
225
|
+
}
|
|
292
226
|
messages.push(msg);
|
|
293
227
|
}
|
|
294
228
|
// Noticing-prompt pivot hint. When the most-recent user message
|
|
@@ -305,7 +239,7 @@ class ToolUseLoop {
|
|
|
305
239
|
});
|
|
306
240
|
messages.push({
|
|
307
241
|
role: 'user',
|
|
308
|
-
content: '[Reading-comprehension note for the assistant: the user\'s last message above is a noticing / clarifying question — they spotted a possible gap from prior turns and are asking you to confirm or correct, NOT to continue any prior plan. Before you take any new action, identify what gap the question points at and address it directly. If the question is "are we using X?" the correct first move is to verify whether X is actually being used (read the consumer file, grep for the import, check the call site) and answer honestly — yes/no with evidence. Do NOT create more new artifacts unless the user explicitly says to.]'
|
|
242
|
+
content: tool_use_parser_1.AUTOMATED_NUDGE_PREFIX + '[Reading-comprehension note for the assistant: the user\'s last message above is a noticing / clarifying question — they spotted a possible gap from prior turns and are asking you to confirm or correct, NOT to continue any prior plan. Before you take any new action, identify what gap the question points at and address it directly. If the question is "are we using X?" the correct first move is to verify whether X is actually being used (read the consumer file, grep for the import, check the call site) and answer honestly — yes/no with evidence. Do NOT create more new artifacts unless the user explicitly says to.]'
|
|
309
243
|
});
|
|
310
244
|
}
|
|
311
245
|
let iterations = 0;
|
|
@@ -334,7 +268,7 @@ class ToolUseLoop {
|
|
|
334
268
|
// recovery, etc.) each have their own caps, but they can chain — a
|
|
335
269
|
// model can spin through 6+ no-tool-call responses because
|
|
336
270
|
// thinking-off recovery resets consecutiveEmptyRetries=0. Captured
|
|
337
|
-
// 2026-05-26 in
|
|
271
|
+
// 2026-05-26 in a real CLI session (turn-2026-05-26T02-30-37):
|
|
338
272
|
// model emitted 6 sequential reasoning-only responses inside
|
|
339
273
|
// iteration 4 before the loop finally terminated with a useless
|
|
340
274
|
// final answer ("I need to stop wrapping tool calls in reasoning
|
|
@@ -687,7 +621,7 @@ class ToolUseLoop {
|
|
|
687
621
|
// current pace and burn the extension too.
|
|
688
622
|
messages.push({
|
|
689
623
|
role: 'user',
|
|
690
|
-
content: `You've been making good progress and the iteration budget has been extended by ${CAP_EXTENSION_SIZE} (new limit: ${max}). Keep going, but tighten up: prefer batched edits over single-line ones, and start wrapping up when you have a complete answer rather than running to the new cap. This is the ${iterationCapExtensions === 1 ? 'first' : 'second'} of at most ${MAX_CAP_EXTENSIONS} extensions for this turn.`
|
|
624
|
+
content: tool_use_parser_1.AUTOMATED_NUDGE_PREFIX + `You've been making good progress and the iteration budget has been extended by ${CAP_EXTENSION_SIZE} (new limit: ${max}). Keep going, but tighten up: prefer batched edits over single-line ones, and start wrapping up when you have a complete answer rather than running to the new cap. This is the ${iterationCapExtensions === 1 ? 'first' : 'second'} of at most ${MAX_CAP_EXTENSIONS} extensions for this turn.`
|
|
691
625
|
});
|
|
692
626
|
}
|
|
693
627
|
else {
|
|
@@ -698,7 +632,7 @@ class ToolUseLoop {
|
|
|
698
632
|
// vs edit) reflects what the user actually asked for.
|
|
699
633
|
messages.push({
|
|
700
634
|
role: 'user',
|
|
701
|
-
content: `${goalRecallBlock}` +
|
|
635
|
+
content: tool_use_parser_1.AUTOMATED_NUDGE_PREFIX + `${goalRecallBlock}` +
|
|
702
636
|
`You have reached the tool-use iteration limit (${max}). Stop calling tools. Produce a final answer with three short sections, in this exact shape:\n` +
|
|
703
637
|
'\n' +
|
|
704
638
|
wrapUpBody +
|
|
@@ -712,7 +646,7 @@ class ToolUseLoop {
|
|
|
712
646
|
emit('tool_loop:total_tool_cap', { iteration: iterations, totalToolsExecuted });
|
|
713
647
|
messages.push({
|
|
714
648
|
role: 'user',
|
|
715
|
-
content: `${goalRecallBlock}` +
|
|
649
|
+
content: tool_use_parser_1.AUTOMATED_NUDGE_PREFIX + `${goalRecallBlock}` +
|
|
716
650
|
`You have executed ${totalToolsExecuted} tool calls this turn — the per-turn cap (${maxTotalTools}) has been reached. Stop calling tools. Produce a final answer with three short sections:\n` +
|
|
717
651
|
'\n' +
|
|
718
652
|
wrapUpBody +
|
|
@@ -813,7 +747,7 @@ class ToolUseLoop {
|
|
|
813
747
|
break;
|
|
814
748
|
}
|
|
815
749
|
catch (error) {
|
|
816
|
-
if (nativeTools && nativeToolFailureFallback && !nativeFallbackUsed && isRetryableLlmError(error) && !signal?.aborted) {
|
|
750
|
+
if (nativeTools && nativeToolFailureFallback && !nativeFallbackUsed && (0, loopShared_1.isRetryableLlmError)(error) && !signal?.aborted) {
|
|
817
751
|
nativeFallbackUsed = true;
|
|
818
752
|
nativeTools = false;
|
|
819
753
|
nativeSchemas = undefined;
|
|
@@ -839,7 +773,7 @@ class ToolUseLoop {
|
|
|
839
773
|
// visible markup.
|
|
840
774
|
messages.push({
|
|
841
775
|
role: 'user',
|
|
842
|
-
content: `[Provider error mid-turn — tool channel switched.] The previous attempt failed with: ${summarizeLlmError(error)}. ` +
|
|
776
|
+
content: tool_use_parser_1.AUTOMATED_NUDGE_PREFIX + `[Provider error mid-turn — tool channel switched.] The previous attempt failed with: ${(0, loopShared_1.summarizeLlmError)(error)}. ` +
|
|
843
777
|
`I retried with the text-based tool-call channel. ` +
|
|
844
778
|
`Re-emit your pending action using the text envelope: ` +
|
|
845
779
|
`<tool_call>{"name":"...","params":{...}}</tool_call> outside of any reasoning block. ` +
|
|
@@ -848,7 +782,7 @@ class ToolUseLoop {
|
|
|
848
782
|
});
|
|
849
783
|
emit('tool_loop:native_tool_fallback', {
|
|
850
784
|
iteration: iterations,
|
|
851
|
-
reason: summarizeLlmError(error)
|
|
785
|
+
reason: (0, loopShared_1.summarizeLlmError)(error)
|
|
852
786
|
});
|
|
853
787
|
continue;
|
|
854
788
|
}
|
|
@@ -861,13 +795,13 @@ class ToolUseLoop {
|
|
|
861
795
|
// this attempt, any further failure on text is genuinely
|
|
862
796
|
// terminal — the user has been waiting > 30 s and a clean
|
|
863
797
|
// error is more helpful than another silent retry.
|
|
864
|
-
if (nativeFallbackUsed && !textFallbackRetryUsed && isRetryableLlmError(error) && !signal?.aborted) {
|
|
798
|
+
if (nativeFallbackUsed && !textFallbackRetryUsed && (0, loopShared_1.isRetryableLlmError)(error) && !signal?.aborted) {
|
|
865
799
|
textFallbackRetryUsed = true;
|
|
866
800
|
emit('tool_loop:text_fallback_retry', {
|
|
867
801
|
iteration: iterations,
|
|
868
|
-
reason: summarizeLlmError(error)
|
|
802
|
+
reason: (0, loopShared_1.summarizeLlmError)(error)
|
|
869
803
|
});
|
|
870
|
-
await sleep(2400);
|
|
804
|
+
await (0, loopShared_1.sleep)(2400);
|
|
871
805
|
continue;
|
|
872
806
|
}
|
|
873
807
|
// Last-resort final-anchor retry. By this point we've spent
|
|
@@ -884,21 +818,21 @@ class ToolUseLoop {
|
|
|
884
818
|
if (!finalAnchorRetryUsed
|
|
885
819
|
&& textFallbackRetryUsed
|
|
886
820
|
&& originalGoal.trim().length > 0
|
|
887
|
-
&& isRetryableLlmError(error)
|
|
821
|
+
&& (0, loopShared_1.isRetryableLlmError)(error)
|
|
888
822
|
&& !signal?.aborted) {
|
|
889
823
|
finalAnchorRetryUsed = true;
|
|
890
824
|
messages.push({
|
|
891
825
|
role: 'user',
|
|
892
|
-
content: `[Recovery attempt — previous channel attempts hit ${summarizeLlmError(error)}. ` +
|
|
826
|
+
content: tool_use_parser_1.AUTOMATED_NUDGE_PREFIX + `[Recovery attempt — previous channel attempts hit ${(0, loopShared_1.summarizeLlmError)(error)}. ` +
|
|
893
827
|
`Discarding any partial tool_call or reasoning state from those attempts. ` +
|
|
894
828
|
`Original user goal restated as a fresh anchor:]\n\n${originalGoal.trim()}`
|
|
895
829
|
});
|
|
896
830
|
emit('tool_loop:final_anchor_retry', {
|
|
897
831
|
iteration: iterations,
|
|
898
|
-
reason: summarizeLlmError(error),
|
|
832
|
+
reason: (0, loopShared_1.summarizeLlmError)(error),
|
|
899
833
|
goalPreview: originalGoal.slice(0, 120)
|
|
900
834
|
});
|
|
901
|
-
await sleep(3600);
|
|
835
|
+
await (0, loopShared_1.sleep)(3600);
|
|
902
836
|
continue;
|
|
903
837
|
}
|
|
904
838
|
throw error;
|
|
@@ -928,7 +862,7 @@ class ToolUseLoop {
|
|
|
928
862
|
// have their own caps, but they chain — thinking-off recovery
|
|
929
863
|
// resets consecutiveEmptyRetries=0, parse-retry has its own
|
|
930
864
|
// counter, and the model can move between failure modes faster
|
|
931
|
-
// than any one detector can give up.
|
|
865
|
+
// than any one detector can give up. Real CLI session
|
|
932
866
|
// 2026-05-26 turn-02-30-37: 6 sequential reasoning-only
|
|
933
867
|
// responses inside one iteration before the loop terminated
|
|
934
868
|
// silently. This counter increments on EVERY response without
|
|
@@ -969,9 +903,9 @@ class ToolUseLoop {
|
|
|
969
903
|
// Also reset the prefill-recovery one-shot. The recovery budget
|
|
970
904
|
// is "per stretch of failures," not "once per turn" — without
|
|
971
905
|
// this reset, a long refactor that recovers from one prefill
|
|
972
|
-
// stall and then hits another (
|
|
973
|
-
//
|
|
974
|
-
//
|
|
906
|
+
// stall and then hits another (observed in a real run: 26
|
|
907
|
+
// iterations, prefill burned at iter 25, iter 26 stalled again
|
|
908
|
+
// with no recovery left) falls straight
|
|
975
909
|
// through to the terminal "Bandit stalled" fallback even though
|
|
976
910
|
// every other detector still has budget. The hard cap on
|
|
977
911
|
// noToolCallAttemptsThisTurn (5) bounds the total stuck
|
|
@@ -1003,7 +937,7 @@ class ToolUseLoop {
|
|
|
1003
937
|
messages.push({ role: 'assistant', content: scrubbed });
|
|
1004
938
|
messages.push({
|
|
1005
939
|
role: 'user',
|
|
1006
|
-
content: 'You emitted a `<tool_result>` envelope in your response. Those envelopes are SYSTEM output — they appear BETWEEN your turns, never inside your own message. If you meant to invoke a tool, emit a single `<tool_call>{"name":"...","params":{...}}</tool_call>` and wait for the real result. If the task is complete, give a plain-prose final answer with no XML envelopes. Retry now.'
|
|
940
|
+
content: tool_use_parser_1.AUTOMATED_NUDGE_PREFIX + 'You emitted a `<tool_result>` envelope in your response. Those envelopes are SYSTEM output — they appear BETWEEN your turns, never inside your own message. If you meant to invoke a tool, emit a single `<tool_call>{"name":"...","params":{...}}</tool_call>` and wait for the real result. If the task is complete, give a plain-prose final answer with no XML envelopes. Retry now.'
|
|
1007
941
|
});
|
|
1008
942
|
continue;
|
|
1009
943
|
}
|
|
@@ -1034,7 +968,7 @@ class ToolUseLoop {
|
|
|
1034
968
|
messages.push({ role: 'assistant', content: scrubbed });
|
|
1035
969
|
messages.push({
|
|
1036
970
|
role: 'user',
|
|
1037
|
-
content: 'You emitted ` ```bandit-tl` (or `bandit-run` / `bandit-subagent`) fenced JSON in your response. Those fences are emitted by the EXTENSION HOST to log real tool execution — you CANNOT produce them. They show up in your context because the host logged actual tool calls, not because you can fabricate them. To actually run a tool, emit `<tool_call>{"name":"...","params":{...}}</tool_call>` and wait for the real result. Your fake fences mean NO work has happened this turn. You have TWO options for your retry, and ONLY two: (a) Emit a real `<tool_call>{"name":"...","params":{...}}</tool_call>` envelope NOW to actually do the work, then wait for the real result. (b) Honestly state "I have not [action] yet" and STOP. Do NOT claim completion. You MUST NOT claim you have fixed / eliminated / resolved / removed / cleaned / verified anything. No "successfully [verb]" phrasing. No numbered lists of "Step 1: I did X" actions. No "the project is now in a healthy state." Until a real `<tool_call>` lands on disk and returns a real tool-result, nothing has changed. Lying about completion is the worst failure mode. Retry now.'
|
|
971
|
+
content: tool_use_parser_1.AUTOMATED_NUDGE_PREFIX + 'You emitted ` ```bandit-tl` (or `bandit-run` / `bandit-subagent`) fenced JSON in your response. Those fences are emitted by the EXTENSION HOST to log real tool execution — you CANNOT produce them. They show up in your context because the host logged actual tool calls, not because you can fabricate them. To actually run a tool, emit `<tool_call>{"name":"...","params":{...}}</tool_call>` and wait for the real result. Your fake fences mean NO work has happened this turn. You have TWO options for your retry, and ONLY two: (a) Emit a real `<tool_call>{"name":"...","params":{...}}</tool_call>` envelope NOW to actually do the work, then wait for the real result. (b) Honestly state "I have not [action] yet" and STOP. Do NOT claim completion. You MUST NOT claim you have fixed / eliminated / resolved / removed / cleaned / verified anything. No "successfully [verb]" phrasing. No numbered lists of "Step 1: I did X" actions. No "the project is now in a healthy state." Until a real `<tool_call>` lands on disk and returns a real tool-result, nothing has changed. Lying about completion is the worst failure mode. Retry now.'
|
|
1038
972
|
});
|
|
1039
973
|
continue;
|
|
1040
974
|
}
|
|
@@ -1056,7 +990,10 @@ class ToolUseLoop {
|
|
|
1056
990
|
&& !(0, tool_use_parser_1.hasToolCalls)(response)
|
|
1057
991
|
&& toolAbsenceCorrectionsFired < TOOL_ABSENCE_CORRECTION_CAP) {
|
|
1058
992
|
const registeredNames = this.registry.getAll().map((t) => t.name);
|
|
1059
|
-
|
|
993
|
+
// Reasoning channels MUST be stripped before prose-matching:
|
|
994
|
+
// reasoning narrates tool usage by name and false-positives the
|
|
995
|
+
// absence phrases (see toolAvailabilityDetector.ts header).
|
|
996
|
+
const absence = (0, toolAvailabilityDetector_1.detectFalseToolAbsence)((0, tool_use_parser_1.stripReasoningChannels)(response), registeredNames);
|
|
1060
997
|
if (absence.detected) {
|
|
1061
998
|
toolAbsenceCorrectionsFired++;
|
|
1062
999
|
emit('tool_loop:false_tool_absence', {
|
|
@@ -1088,7 +1025,7 @@ class ToolUseLoop {
|
|
|
1088
1025
|
messages.push({ role: 'assistant', content: response });
|
|
1089
1026
|
messages.push({
|
|
1090
1027
|
role: 'user',
|
|
1091
|
-
content: 'The previous tool call returned an error and you produced no follow-up tool_call. ' +
|
|
1028
|
+
content: tool_use_parser_1.AUTOMATED_NUDGE_PREFIX + 'The previous tool call returned an error and you produced no follow-up tool_call. ' +
|
|
1092
1029
|
'Do NOT silently abandon the request — the user expects you to either retry with corrected parameters OR state explicitly which precondition failed and why you cannot proceed. ' +
|
|
1093
1030
|
'Choose one: (a) emit a corrected `<tool_call>{"name":"...","params":{...}}</tool_call>` now, fixing the param shape or value the error pointed at; ' +
|
|
1094
1031
|
'(b) give a one-line final answer naming the exact precondition you lack (e.g. "I cannot trash message X because the message id is unknown — please provide it"). ' +
|
|
@@ -1107,12 +1044,12 @@ class ToolUseLoop {
|
|
|
1107
1044
|
// without emitting an actual tool_call. Visually the user sees a
|
|
1108
1045
|
// wall of reasoning text and nothing happens. Strip the reasoning
|
|
1109
1046
|
// fences before checking emptiness so the same nudge fires.
|
|
1110
|
-
|
|
1111
|
-
|
|
1112
|
-
|
|
1113
|
-
|
|
1114
|
-
|
|
1115
|
-
|
|
1047
|
+
// Strip reasoning channels AND stray fence scaffolding (a bare
|
|
1048
|
+
// leading ``` opener that wraps the reasoning) so the
|
|
1049
|
+
// reasoning-only check isn't fooled into seeing the orphan ``` as
|
|
1050
|
+
// a real answer — which let a "reasoning + no tool call" turn end
|
|
1051
|
+
// with no answer (real CLI run, 2026-06-15).
|
|
1052
|
+
const stripped = (0, tool_use_parser_1.stripToAnswerContent)(response);
|
|
1116
1053
|
const reasoningOnly = !stripped && response.trim().length > 0;
|
|
1117
1054
|
// "Narrated but didn't act" detector. Some models (notably ones
|
|
1118
1055
|
// post-trained for a different tool-call envelope, e.g. OpenAI
|
|
@@ -1128,8 +1065,8 @@ class ToolUseLoop {
|
|
|
1128
1065
|
// in the model's final clause, not an earlier "I have already
|
|
1129
1066
|
// searched the file" preamble before a real answer.
|
|
1130
1067
|
//
|
|
1131
|
-
// Captured 2026-05-25 (
|
|
1132
|
-
// "I'll redesign the
|
|
1068
|
+
// Captured 2026-05-25 (real IDE session): model emitted
|
|
1069
|
+
// "I'll redesign the page... Let me rewrite both files." with
|
|
1133
1070
|
// NO tool_call and the turn closed as a final answer because
|
|
1134
1071
|
// neither `redesign` nor `rewrite` was on the list. A long
|
|
1135
1072
|
// session ended with zero work shipped. Missing a verb here =
|
|
@@ -1137,10 +1074,13 @@ class ToolUseLoop {
|
|
|
1137
1074
|
// Whole-word, case-insensitive list of action verbs, tested against the response's final clause (`tail`). Keep the literal on ONE line: a regex literal cannot contain a line break.
const NARRATE_VERB_RE = /\b(use|uses|used|using|call|calls|called|calling|invoke|invokes|invoked|invoking|execute|executes|executed|executing|run|runs|running|ran|search|searches|searched|searching|look|looks|looked|looking|read|reads|reading|check|checks|checked|checking|find|finds|finding|found|list|lists|listed|listing|fetch|fetches|fetched|fetching|grep|greps|grepped|grepping|explore|explores|explored|exploring|locate|locates|located|locating|plan|plans|planned|planning|start|starts|started|starting|begin|begins|began|beginning|create|creates|created|creating|write|writes|wrote|writing|rewrite|rewrites|rewrote|rewriting|rewritten|build|builds|built|building|rebuild|rebuilds|rebuilt|rebuilding|update|updates|updated|updating|implement|implements|implemented|implementing|refactor|refactors|refactored|refactoring|redesign|redesigns|redesigned|redesigning|design|designs|designed|designing|generate|generates|generated|generating|scaffold|scaffolds|scaffolded|scaffolding|set\s+up|setting\s+up|tackle|tackles|tackled|tackling|do|does|did|doing|make|makes|made|making|batch|batches|batched|batching|execute|prepare|prepares|prepared|preparing|draft|drafts|drafted|drafting|outline|outlines|outlined|outlining|organize|organizes|organized|organizing|structure|structures|structured|structuring|kick\s+off|kicking\s+off|fix|fixes|fixed|fixing|edit|edits|edited|editing|modify|modifies|modified|modifying|patch|patches|patched|patching|adjust|adjusts|adjusted|adjusting|replace|replaces|replaced|replacing|swap|swaps|swapped|swapping|polish|polishes|polished|polishing|clean\s+up|cleaning\s+up|tidy|tidies|tidied|tidying|finalize|finalizes|finalized|finalizing|finish|finishes|finished|finishing|complete|completes|completed|completing|wire|wires|wired|wiring|hook|hooks|hooked|hooking|render|renders|rendered|rendering|style|styles|styled|styling|theme|themes|themed|theming|redo|redoes|redid|redoing|port|ports|ported|porting|migrate|migrates|migrated|migrating|configure|configures|configured|configuring|install|installs|installed|installing|remove|removes|removed|removing|delete|deletes|deleted|deleting|rename|renames|renamed|renaming)\b/i;
|
|
1138
1075
|
const NARRATE_INTENT_RE = /\b(we (?:will|need to|should)|we'?ll|we'?re going to|i'?ll|i will|let me|let'?s|going to|i'?m going to|i need to)\b/i;
|
|
1139
1076
|
// Real code fences pass through; narrate only fires when the
|
|
1140
|
-
// model emitted no structured payload at all.
|
|
1141
|
-
// response
|
|
1142
|
-
//
|
|
1143
|
-
|
|
1077
|
+
// model emitted no structured payload at all. Use the
|
|
1078
|
+
// reasoning-stripped response (NOT `stripped`, which also removes
|
|
1079
|
+
// bare fence-marker lines) so a genuine ```json / ```diff payload
|
|
1080
|
+
// still suppresses the narrate nudge and reaches its own
|
|
1081
|
+
// auto-promote detector. `bandit-reasoning` fences are reasoning,
|
|
1082
|
+
// not structured output, so they're excluded either way.
|
|
1083
|
+
const hasCodeFence = /```[a-zA-Z0-9_-]*\s*\n/.test((0, tool_use_parser_1.stripReasoningChannels)(response));
|
|
1144
1084
|
const tailMatch = stripped.match(/(?:[.!?]\s+)([^.!?]*)$/);
|
|
1145
1085
|
const tail = (tailMatch ? tailMatch[1] : stripped).slice(-200);
|
|
1146
1086
|
const narratedButNoAction = !(0, tool_use_parser_1.hasToolCalls)(response) &&
|
|
@@ -1149,6 +1089,22 @@ class ToolUseLoop {
|
|
|
1149
1089
|
stripped.length < 240 &&
|
|
1150
1090
|
NARRATE_INTENT_RE.test(tail) &&
|
|
1151
1091
|
NARRATE_VERB_RE.test(tail);
|
|
1092
|
+
// Performative narrated call: "I call read_file with path=README.md".
|
|
1093
|
+
// The generic gate above caps stripped.length at 240 to avoid false
|
|
1094
|
+
// positives on real answers that merely contain narrate verbs — but
|
|
1095
|
+
// when the final clause NAMES A REGISTERED TOOL in a performative
|
|
1096
|
+
// phrase, the length cap is wrong: a long planning recap that ends
|
|
1097
|
+
// "I call read_file with path=…" is a stall no matter how long the
|
|
1098
|
+
// recap is, and tool-name anchoring keeps the false-positive rate
|
|
1099
|
+
// near zero. Captured 2026-06-12 (real CLI session,
|
|
1100
|
+
// gemma4:e4b): iteration 1 emitted a reasoning recap ending with
|
|
1101
|
+
// exactly that sentence and no tool_call — the generic gate missed
|
|
1102
|
+
// it (over the length cap; intent list lacks present-tense "I
|
|
1103
|
+
// call") and the turn closed as a final answer.
|
|
1104
|
+
const narratedCallMatch = stripped.slice(-300).match(/\b(?:i\s+(?:will\s+|now\s+|then\s+)?(?:call|invoke|run|use)|calling|invoking|let'?s\s+(?:call|run|use))\s+(?:the\s+)?`?([a-z][a-z0-9_]*)`?/i);
|
|
1105
|
+
const narratedToolCallNoAction = !(0, tool_use_parser_1.hasToolCalls)(response) &&
|
|
1106
|
+
!!narratedCallMatch &&
|
|
1107
|
+
registeredToolNames.has(narratedCallMatch[1].toLowerCase());
|
|
1152
1108
|
// Empty-response retry: was previously gated to `iterations > 0`
|
|
1153
1109
|
// under the assumption "empty first response = provider outage."
|
|
1154
1110
|
// That assumption was wrong — with bandit-logic
|
|
@@ -1160,7 +1116,7 @@ class ToolUseLoop {
|
|
|
1160
1116
|
// the model gets a second chance (and the thinking-off recovery
|
|
1161
1117
|
// below can flip it to non-thinking mode if the second pass also
|
|
1162
1118
|
// empties).
|
|
1163
|
-
const shouldNudge = (!response.trim() || reasoningOnly || narratedButNoAction) &&
|
|
1119
|
+
const shouldNudge = (!response.trim() || reasoningOnly || narratedButNoAction || narratedToolCallNoAction) &&
|
|
1164
1120
|
!hitLimit &&
|
|
1165
1121
|
consecutiveEmptyRetries < 2 &&
|
|
1166
1122
|
!thinkingOffRecoveryAttempted;
|
|
@@ -1170,16 +1126,17 @@ class ToolUseLoop {
|
|
|
1170
1126
|
iteration: iterations,
|
|
1171
1127
|
attempt: consecutiveEmptyRetries,
|
|
1172
1128
|
reasoningOnly,
|
|
1173
|
-
narratedButNoAction
|
|
1129
|
+
narratedButNoAction,
|
|
1130
|
+
narratedToolCallNoAction
|
|
1174
1131
|
});
|
|
1175
|
-
const nudgeMessage = narratedButNoAction
|
|
1132
|
+
const nudgeMessage = (narratedButNoAction || narratedToolCallNoAction)
|
|
1176
1133
|
? 'You announced your next step in prose ("we will search…" / "let me check…" / "use X to find Y") but did NOT emit a `<tool_call>` envelope. Announcing intent is not enough — you must actually invoke the tool. Emit the call now in this exact format, OUTSIDE of any reasoning block, with NO commentary and NO markdown fence:\n\n<tool_call>{"name":"<tool>","params":{"<key>":"<value>"}}</tool_call>\n\nReplace name/params with the right values for your task. Or, if the task is already answerable from what you know, give a final answer instead.'
|
|
1177
1134
|
: reasoningOnly
|
|
1178
1135
|
? 'You completed reasoning but emitted no tool_call AND no final answer. The reasoning text alone does not run a tool — you must emit a `<tool_call>` envelope OUTSIDE the reasoning block. Format example (replace name/params for your task):\n\n<tool_call>{"name":"<tool>","params":{"<key>":"<value>"}}</tool_call>\n\nNo prose around it, no markdown fence, just the bare tag. If the task is answerable without a tool, write a complete final answer instead. Do not stop after only thinking.'
|
|
1179
1136
|
: 'Your previous response was empty. Either emit a `<tool_call>{"name":"<tool>","params":{...}}</tool_call>` to invoke a tool, OR produce a complete final answer using what you have. Do not respond with an empty message.';
|
|
1180
1137
|
messages.push({
|
|
1181
1138
|
role: 'user',
|
|
1182
|
-
content: nudgeMessage
|
|
1139
|
+
content: tool_use_parser_1.AUTOMATED_NUDGE_PREFIX + nudgeMessage
|
|
1183
1140
|
});
|
|
1184
1141
|
continue;
|
|
1185
1142
|
}
|
|
@@ -1212,7 +1169,7 @@ class ToolUseLoop {
|
|
|
1212
1169
|
});
|
|
1213
1170
|
messages.push({
|
|
1214
1171
|
role: 'user',
|
|
1215
|
-
content: 'Switching to non-thinking mode for this attempt because reasoning-only retries exhausted. Emit either a tool_call or a complete final answer. No more reasoning preamble.'
|
|
1172
|
+
content: tool_use_parser_1.AUTOMATED_NUDGE_PREFIX + 'Switching to non-thinking mode for this attempt because reasoning-only retries exhausted. Emit either a tool_call or a complete final answer. No more reasoning preamble.'
|
|
1216
1173
|
});
|
|
1217
1174
|
continue;
|
|
1218
1175
|
}
|
|
@@ -1260,8 +1217,8 @@ class ToolUseLoop {
|
|
|
1260
1217
|
messages.push({
|
|
1261
1218
|
role: 'user',
|
|
1262
1219
|
content: firstRetry
|
|
1263
|
-
? 'Your previous tool_call was not valid JSON — I could not parse it. Common cause: unescaped `"` characters inside a string value (for example `["", "", ""]` inside a `content` string). Retry the tool call with properly escaped JSON: every `"` inside a string value must be written as `\\"`, and every newline as `\\n`. If the content is very long, consider `replace_range` for a line-numbered block or breaking the change into smaller edits.'
|
|
1264
|
-
: 'Your tool_call still did not parse. Do NOT retry with the same shape or the same escaping failure. Switch tactics: (a) call `replace_range` for a large block whose line numbers you just read, (b) call `write_file` for a new file, or (c) split the change into multiple small `apply_edit` calls that each target just one method or block (e.g. 3-5 lines of `find`, 5-10 lines of `replace`) instead of rewriting the whole class. Pick the smallest scope that accomplishes the next step. If you cannot produce a valid tool call, respond with a plain-prose final answer acknowledging you could not complete the edit.'
|
|
1220
|
+
? tool_use_parser_1.AUTOMATED_NUDGE_PREFIX + 'Your previous tool_call was not valid JSON — I could not parse it. Common cause: unescaped `"` characters inside a string value (for example `["", "", ""]` inside a `content` string). Retry the tool call with properly escaped JSON: every `"` inside a string value must be written as `\\"`, and every newline as `\\n`. If the content is very long, consider `replace_range` for a line-numbered block or breaking the change into smaller edits.'
|
|
1221
|
+
: tool_use_parser_1.AUTOMATED_NUDGE_PREFIX + 'Your tool_call still did not parse. Do NOT retry with the same shape or the same escaping failure. Switch tactics: (a) call `replace_range` for a large block whose line numbers you just read, (b) call `write_file` for a new file, or (c) split the change into multiple small `apply_edit` calls that each target just one method or block (e.g. 3-5 lines of `find`, 5-10 lines of `replace`) instead of rewriting the whole class. Pick the smallest scope that accomplishes the next step. If you cannot produce a valid tool call, respond with a plain-prose final answer acknowledging you could not complete the edit.'
|
|
1265
1222
|
});
|
|
1266
1223
|
continue;
|
|
1267
1224
|
}
|
|
@@ -1275,7 +1232,7 @@ class ToolUseLoop {
|
|
|
1275
1232
|
if (!hitLimit && !(0, tool_use_parser_1.hasToolCalls)(response)) {
|
|
1276
1233
|
const normalized = response.toLowerCase().replace(/\s+/g, ' ').trim();
|
|
1277
1234
|
const prior = recentNonToolResponses[recentNonToolResponses.length - 1];
|
|
1278
|
-
const looksLikeLoop =
|
|
1235
|
+
const looksLikeLoop = Boolean(prior) && (() => {
|
|
1279
1236
|
// Cheap similarity: longest common prefix / max length. If two
|
|
1280
1237
|
// consecutive no-tool responses share >60% of their text by
|
|
1281
1238
|
// prefix the model is repeating itself. More sophisticated
|
|
@@ -1284,8 +1241,9 @@ class ToolUseLoop {
|
|
|
1284
1241
|
const short = prior.length < normalized.length ? prior : normalized;
|
|
1285
1242
|
const long = prior.length < normalized.length ? normalized : prior;
|
|
1286
1243
|
let matched = 0;
|
|
1287
|
-
while (matched < short.length && short[matched] === long[matched])
|
|
1244
|
+
while (matched < short.length && short[matched] === long[matched]) {
|
|
1288
1245
|
matched++;
|
|
1246
|
+
}
|
|
1289
1247
|
return matched / short.length > 0.6;
|
|
1290
1248
|
})();
|
|
1291
1249
|
// Also flag the self-contradiction signature from the real
|
|
@@ -1306,7 +1264,7 @@ class ToolUseLoop {
|
|
|
1306
1264
|
});
|
|
1307
1265
|
messages.push({
|
|
1308
1266
|
role: 'user',
|
|
1309
|
-
content: 'STOP deliberating. Your last response either repeated itself, contradicted itself (e.g. "Wait, I see X / Actually I\'ll try X"), or was aborted mid-stream as a loop. Do NOT continue speculating about what files might exist. Take exactly one of these actions now: (a) invoke a tool (`list_files`, `read_file`, `search_code`, etc.) to answer the question with real data, OR (b) give up and tell the user plainly that you could not complete the task and why. Do not write more than two sentences of prose before either calling a tool or terminating.'
|
|
1267
|
+
content: tool_use_parser_1.AUTOMATED_NUDGE_PREFIX + 'STOP deliberating. Your last response either repeated itself, contradicted itself (e.g. "Wait, I see X / Actually I\'ll try X"), or was aborted mid-stream as a loop. Do NOT continue speculating about what files might exist. Take exactly one of these actions now: (a) invoke a tool (`list_files`, `read_file`, `search_code`, etc.) to answer the question with real data, OR (b) give up and tell the user plainly that you could not complete the task and why. Do not write more than two sentences of prose before either calling a tool or terminating.'
|
|
1310
1268
|
});
|
|
1311
1269
|
recentNonToolResponses.length = 0;
|
|
1312
1270
|
continue;
|
|
@@ -1396,7 +1354,7 @@ class ToolUseLoop {
|
|
|
1396
1354
|
// without being so loud that it derails prose responses.
|
|
1397
1355
|
messages.push({
|
|
1398
1356
|
role: 'user',
|
|
1399
|
-
content: 'Note: I detected a JSON todo list in your response and auto-promoted it to a todo_write call. Next time, emit `<tool_call>{"name":"todo_write","params":{"items":"..."}}</tool_call>` directly instead of pasting JSON as a code block — pasted JSON does not update your plan, only the tool call does.'
|
|
1357
|
+
content: tool_use_parser_1.AUTOMATED_NUDGE_PREFIX + 'Note: I detected a JSON todo list in your response and auto-promoted it to a todo_write call. Next time, emit `<tool_call>{"name":"todo_write","params":{"items":"..."}}</tool_call>` directly instead of pasting JSON as a code block — pasted JSON does not update your plan, only the tool call does.'
|
|
1400
1358
|
});
|
|
1401
1359
|
iterations++;
|
|
1402
1360
|
continue;
|
|
@@ -1424,7 +1382,26 @@ class ToolUseLoop {
|
|
|
1424
1382
|
responsePreview: response.slice(0, 300)
|
|
1425
1383
|
});
|
|
1426
1384
|
}
|
|
1427
|
-
|
|
1385
|
+
// Reasoning channels are streamed live by the host for display —
|
|
1386
|
+
// leaving them in the terminal answer double-renders them, and on
|
|
1387
|
+
// fabrication-retry exhaustion it prints the model's confusion
|
|
1388
|
+
// narrative as if it were the answer (real CLI run,
|
|
1389
|
+
// 2026-06-12T20-19 turn: three near-identical "the user is
|
|
1390
|
+
// correcting my formatting error" reasoning blocks rendered above
|
|
1391
|
+
// the real answer). The stall fallback below still inspects the
|
|
1392
|
+
// raw `response`, so reasoning-only turns keep their fallback.
|
|
1393
|
+
// ORDER MATTERS: reasoning channels strip FIRST. Reasoning text
|
|
1394
|
+
// routinely MENTIONS envelopes in backticks ("I included a
|
|
1395
|
+
// `<tool_result>` envelope…"); if markup stripping ran first, its
|
|
1396
|
+
// envelope regex would match from that in-fence mention through
|
|
1397
|
+
// to the real closing tag, eat the fence's closing ``` along the
|
|
1398
|
+
// way, and the unclosed-fence cleanup would then wipe the entire
|
|
1399
|
+
// rest of the answer.
|
|
1400
|
+
const finalResponse = (0, tool_use_parser_1.stripToolCallMarkup)(response
|
|
1401
|
+
.replace(/<think\b[\s\S]*?<\/think\s*>/gi, '')
|
|
1402
|
+
.replace(/<think\b[\s\S]*$/i, '')
|
|
1403
|
+
.replace(/```bandit-reasoning\b[\s\S]*?```/gi, '')
|
|
1404
|
+
.replace(/```bandit-reasoning\b[\s\S]*$/i, '')).trim();
|
|
1428
1405
|
// False-completion detector. Small models regularly end a turn
|
|
1429
1406
|
// with "I refactored the file" / "here is the updated code" text
|
|
1430
1407
|
// without ever emitting a file-edit tool call.
|
|
@@ -1434,14 +1411,25 @@ class ToolUseLoop {
|
|
|
1434
1411
|
// this turn, push one corrective user message into the loop
|
|
1435
1412
|
// and continue for one more iteration. The nudge is capped at
|
|
1436
1413
|
// one per turn so a truly confused model can still terminate.
|
|
1437
|
-
|
|
1414
|
+
//
|
|
1415
|
+
// ONLY fires when the goal actually implies an edit. Without this
|
|
1416
|
+
// gate the detector demanded an edit on a purely informational
|
|
1417
|
+
// "tell me about this repo" turn: the model correctly said "I have
|
|
1418
|
+
// completed the overview" (a completion phrase), no edit ran
|
|
1419
|
+
// (none was asked for), so the nudge fired and replaced the good
|
|
1420
|
+
// markdown overview with a defensive "no edits are required"
|
|
1421
|
+
// answer — plus a wall of "automated harness check" reasoning.
|
|
1422
|
+
// An analysis goal that does NOT also imply an edit can never
|
|
1423
|
+
// false-complete, so skip it. (real CLI run, 2026-06-12.)
|
|
1424
|
+
const goalCouldExpectEdit = promptImpliesFileEdit || !promptWantsAnalysis;
|
|
1425
|
+
if (!hitLimit && !falseCompletionNudged && editToolsInvoked === 0 && goalCouldExpectEdit) {
|
|
1438
1426
|
const claimsCompletion = FALSE_COMPLETION_PATTERNS.some(re => re.test(finalResponse));
|
|
1439
1427
|
if (claimsCompletion) {
|
|
1440
1428
|
falseCompletionNudged = true;
|
|
1441
1429
|
emit('tool_loop:false_completion_nudge', { iteration: iterations, responsePreview: finalResponse.slice(0, 200) });
|
|
1442
1430
|
messages.push({
|
|
1443
1431
|
role: 'user',
|
|
1444
|
-
content: 'Your response either claims work is done OR apologizes and asks what to do next — but I see NO successful `write_file`, `apply_edit`, `replace_range`, or `apply_patch` tool call in this turn, so nothing on disk has changed. ' +
|
|
1432
|
+
content: tool_use_parser_1.AUTOMATED_NUDGE_PREFIX + 'Your response either claims work is done OR apologizes and asks what to do next — but I see NO successful `write_file`, `apply_edit`, `replace_range`, or `apply_patch` tool call in this turn, so nothing on disk has changed. ' +
|
|
1445
1433
|
'Do NOT ask the user which task to resume, do NOT promise to escape JSON "in your next tool call", and do NOT defer. Either (a) emit a real edit tool call NOW with the actual change — use `replace_range` for a large block whose line numbers you just read, `apply_edit` for a small exact replacement, or `write_file` for a new file — or (b) respond honestly that you could not complete the task and briefly explain why. Retry the tool call yourself; the user cannot help you escape JSON.'
|
|
1446
1434
|
});
|
|
1447
1435
|
continue;
|
|
@@ -1480,7 +1468,7 @@ class ToolUseLoop {
|
|
|
1480
1468
|
});
|
|
1481
1469
|
messages.push({
|
|
1482
1470
|
role: 'user',
|
|
1483
|
-
content: `Your response describes edits to ${fileSet.size} files (${[...fileSet].slice(0, 8).join(', ')}${fileSet.size > 8 ? ', …' : ''}), but only ${editToolsInvoked} successful edit${editToolsInvoked === 1 ? '' : 's'} actually fired this turn. ` +
|
|
1471
|
+
content: tool_use_parser_1.AUTOMATED_NUDGE_PREFIX + `Your response describes edits to ${fileSet.size} files (${[...fileSet].slice(0, 8).join(', ')}${fileSet.size > 8 ? ', …' : ''}), but only ${editToolsInvoked} successful edit${editToolsInvoked === 1 ? '' : 's'} actually fired this turn. ` +
|
|
1484
1472
|
`The remaining ${fileSet.size - editToolsInvoked} file(s) were NOT modified — nothing landed on disk for them. ` +
|
|
1485
1473
|
'Either (a) emit the missing `apply_edit` / `replace_range` / `write_file` tool calls now to actually do the work, OR (b) revise your response to honestly describe ONLY the edits that successfully applied. Do not summarize work that did not happen.'
|
|
1486
1474
|
});
|
|
@@ -1491,7 +1479,7 @@ class ToolUseLoop {
|
|
|
1491
1479
|
// ("break out", "split", "refactor", "extract", "move") imply
|
|
1492
1480
|
// mutation of the SOURCE file the user wants restructured, not
|
|
1493
1481
|
// just creation of new sibling files. Failure mode observed
|
|
1494
|
-
// 2026-05-25 on a
|
|
1482
|
+
// 2026-05-25 on a local React refactor: model read App.jsx,
|
|
1495
1483
|
// wrote 5 new component files, never touched App.jsx, declared
|
|
1496
1484
|
// completion. User had to follow up "are we using these?" to
|
|
1497
1485
|
// force the integration step — and even that follow-up turn
|
|
@@ -1523,7 +1511,7 @@ class ToolUseLoop {
|
|
|
1523
1511
|
const writeCount = filesWrittenThisTurn.size;
|
|
1524
1512
|
messages.push({
|
|
1525
1513
|
role: 'user',
|
|
1526
|
-
content: `The user's goal contains a refactor verb (refactor/break out/split/extract/move) which implies the SOURCE file(s) should be modified, not just supplemented with new siblings. You read ${readPreview}${readNotWritten.length > 3 ? ' and others' : ''} for context, then wrote ${writeCount} NEW file(s), but you NEVER modified the file(s) you read. The refactor is incomplete: the source file still contains the old monolithic code. ` +
|
|
1514
|
+
content: tool_use_parser_1.AUTOMATED_NUDGE_PREFIX + `The user's goal contains a refactor verb (refactor/break out/split/extract/move) which implies the SOURCE file(s) should be modified, not just supplemented with new siblings. You read ${readPreview}${readNotWritten.length > 3 ? ' and others' : ''} for context, then wrote ${writeCount} NEW file(s), but you NEVER modified the file(s) you read. The refactor is incomplete: the source file still contains the old monolithic code. ` +
|
|
1527
1515
|
`Emit the missing apply_edit/replace_range/write_file call on the source file now — it should import from the new files and drop the inlined code that's been extracted. If the refactor is genuinely a "scaffold only, leave source untouched" task, say so explicitly and explain why the source doesn't need to change.`
|
|
1528
1516
|
});
|
|
1529
1517
|
continue;
|
|
@@ -1552,8 +1540,9 @@ class ToolUseLoop {
|
|
|
1552
1540
|
let match;
|
|
1553
1541
|
while ((match = fenceRe.exec(finalResponse)) !== null) {
|
|
1554
1542
|
const nonEmpty = match[1].split('\n').filter(l => l.trim().length > 0).length;
|
|
1555
|
-
if (nonEmpty > biggestFenceLines)
|
|
1543
|
+
if (nonEmpty > biggestFenceLines) {
|
|
1556
1544
|
biggestFenceLines = nonEmpty;
|
|
1545
|
+
}
|
|
1557
1546
|
}
|
|
1558
1547
|
if (biggestFenceLines >= MIN_LINES) {
|
|
1559
1548
|
codeFenceHallucinationNudged = true;
|
|
@@ -1564,7 +1553,7 @@ class ToolUseLoop {
|
|
|
1564
1553
|
});
|
|
1565
1554
|
messages.push({
|
|
1566
1555
|
role: 'user',
|
|
1567
|
-
content: 'You produced a substantial code block in your reply but never emitted a `write_file`, `apply_edit`, `replace_range`, or `apply_patch` tool call — so the change is NOT on disk. ' +
|
|
1556
|
+
content: tool_use_parser_1.AUTOMATED_NUDGE_PREFIX + 'You produced a substantial code block in your reply but never emitted a `write_file`, `apply_edit`, `replace_range`, or `apply_patch` tool call — so the change is NOT on disk. ' +
|
|
1568
1557
|
'Do not ask the user to paste your code into a file themselves. Take exactly one of these actions now: (a) call `replace_range`, `apply_edit`, or `write_file` with the real change to the correct file, OR (b) say plainly that you could not locate the target file and explain what you searched for. Do not wrap up with another prose + code-fence response.'
|
|
1569
1558
|
});
|
|
1570
1559
|
continue;
|
|
@@ -1648,7 +1637,7 @@ class ToolUseLoop {
|
|
|
1648
1637
|
});
|
|
1649
1638
|
messages.push({
|
|
1650
1639
|
role: 'user',
|
|
1651
|
-
content: 'Your first response had reasoning but emitted NO tool call — that is a hard stall for a subagent (you exist to gather information; reasoning alone produces zero output). ' +
|
|
1640
|
+
content: tool_use_parser_1.AUTOMATED_NUDGE_PREFIX + 'Your first response had reasoning but emitted NO tool call — that is a hard stall for a subagent (you exist to gather information; reasoning alone produces zero output). ' +
|
|
1652
1641
|
'For your next response, emit a tool call. The minimum viable starting move for ANY exploration goal is:\n\n' +
|
|
1653
1642
|
'<tool_call>{"name":"list_files","params":{"path":"."}}</tool_call>\n\n' +
|
|
1654
1643
|
'Copy that exact envelope as the very first thing you emit (you may keep the reasoning block before it if your model needs to think first, but the tool_call envelope MUST appear in this turn). ' +
|
|
@@ -1670,7 +1659,7 @@ class ToolUseLoop {
|
|
|
1670
1659
|
// user saw nothing.
|
|
1671
1660
|
//
|
|
1672
1661
|
// The gate also covers the "regurgitated reasoning after
|
|
1673
|
-
// native→text channel fallback" case.
|
|
1662
|
+
// native→text channel fallback" case. Real CLI
|
|
1674
1663
|
// 2026-05-31T17-39-53 cleanup turn: native-tool path 500'd,
|
|
1675
1664
|
// text-channel recovery prompted the model to re-emit its
|
|
1676
1665
|
// pending action, but the model just echoed its prior
|
|
@@ -1682,12 +1671,7 @@ class ToolUseLoop {
|
|
|
1682
1671
|
// before testing emptiness — if the response would render to
|
|
1683
1672
|
// the user as nothing-actionable, the fallback fires and the
|
|
1684
1673
|
// user sees what the model was thinking instead of silence.
|
|
1685
|
-
const reasoningStripped = response
|
|
1686
|
-
.replace(/<think\b[\s\S]*?<\/think\s*>/gi, '')
|
|
1687
|
-
.replace(/<think\b[\s\S]*$/i, '')
|
|
1688
|
-
.replace(/```bandit-reasoning\b[\s\S]*?```/gi, '')
|
|
1689
|
-
.replace(/```bandit-reasoning\b[\s\S]*$/i, '')
|
|
1690
|
-
.trim();
|
|
1674
|
+
const reasoningStripped = (0, tool_use_parser_1.stripToAnswerContent)(response);
|
|
1691
1675
|
const visibleAfterStrip = (0, tool_use_parser_1.stripToolCallMarkup)(reasoningStripped).trim();
|
|
1692
1676
|
if (!visibleAfterStrip) {
|
|
1693
1677
|
// Pull the last 1-2 sentences of reasoning so the user sees
|
|
@@ -1713,7 +1697,7 @@ class ToolUseLoop {
|
|
|
1713
1697
|
// and the inline empty-retry / narrate-no-action detector
|
|
1714
1698
|
// already used its retry budget (consecutiveEmptyRetries >= 2)
|
|
1715
1699
|
// so it couldn't nudge again, the user is left reading a
|
|
1716
|
-
// promise the model never kept.
|
|
1700
|
+
// promise the model never kept. Real CLI
|
|
1717
1701
|
// 2026-05-31T17-39-53 cleanup turn: after a native→text channel
|
|
1718
1702
|
// recovery, the model emitted "Let me revert it:" with a
|
|
1719
1703
|
// dangling colon and no tool call; the user saw the prose end
|
|
@@ -1730,13 +1714,13 @@ class ToolUseLoop {
|
|
|
1730
1714
|
// The trailing colon + intent phrase combination is the
|
|
1731
1715
|
// smoking gun. We DON'T also require NARRATE_VERB_RE here:
|
|
1732
1716
|
// the existing inline detector's verb list misses "revert"
|
|
1733
|
-
// (
|
|
1717
|
+
// (real run 2026-05-31) and would miss any other one-off
|
|
1734
1718
|
// action verb a model might use. The colon alone is rare
|
|
1735
1719
|
// enough in a legit final answer that pairing it with
|
|
1736
1720
|
// "let me" / "I'll" / "we'll" / etc. is specific enough.
|
|
1737
1721
|
//
|
|
1738
|
-
// Period-terminated variant (added 2026-06-03 after
|
|
1739
|
-
//
|
|
1722
|
+
// Period-terminated variant (added 2026-06-03 after a real
|
|
1723
|
+
// run): the model ended with "Let me fix
|
|
1740
1724
|
// all three project cards at once." — full sentence, full
|
|
1741
1725
|
// stop, no colon. Both prefill and thinking-off recovery
|
|
1742
1726
|
// had been spent earlier in the turn so the user saw the
|
|
@@ -1856,7 +1840,7 @@ class ToolUseLoop {
|
|
|
1856
1840
|
toolCalls = [];
|
|
1857
1841
|
messages.push({
|
|
1858
1842
|
role: 'user',
|
|
1859
|
-
content: `You have revised the plan in ${consecutiveTodoOnlyIterations + 1} consecutive iterations without executing any step. ` +
|
|
1843
|
+
content: tool_use_parser_1.AUTOMATED_NUDGE_PREFIX + `You have revised the plan in ${consecutiveTodoOnlyIterations + 1} consecutive iterations without executing any step. ` +
|
|
1860
1844
|
'Execute the first pending task now using a concrete tool — `search_code`, `read_file`, `apply_edit`, `replace_range`, `write_file`, or `run_command`. ' +
|
|
1861
1845
|
'Once a task is actually DONE (tool call succeeded), you may call `todo_write` again to mark it completed — but not to re-plan. ' +
|
|
1862
1846
|
'If you cannot identify a next step, respond to the user with a short honest explanation and stop.'
|
|
@@ -1881,7 +1865,7 @@ class ToolUseLoop {
|
|
|
1881
1865
|
});
|
|
1882
1866
|
messages.push({
|
|
1883
1867
|
role: 'user',
|
|
1884
|
-
content: `You have spent ${consecutiveApplyEditOnlyIterations} consecutive iterations on apply_edit alone. ` +
|
|
1868
|
+
content: tool_use_parser_1.AUTOMATED_NUDGE_PREFIX + `You have spent ${consecutiveApplyEditOnlyIterations} consecutive iterations on apply_edit alone. ` +
|
|
1885
1869
|
'If these are mechanical fixes of the same shape (one type annotation, one rename, one import path, one missing semicolon per call), STOP doing them one at a time — you will exhaust the iteration budget before the file is clean.\n' +
|
|
1886
1870
|
'\n' +
|
|
1887
1871
|
'Better tactics, in order of preference:\n' +
|
|
@@ -1995,7 +1979,7 @@ class ToolUseLoop {
|
|
|
1995
1979
|
});
|
|
1996
1980
|
messages.push({
|
|
1997
1981
|
role: 'user',
|
|
1998
|
-
content: `You just spawned ${bgSpawns.length} background subagents:\n${goalLines}\n\n` +
|
|
1982
|
+
content: tool_use_parser_1.AUTOMATED_NUDGE_PREFIX + `You just spawned ${bgSpawns.length} background subagents:\n${goalLines}\n\n` +
|
|
1999
1983
|
'Do NOT do those same explorations yourself in the next iteration — the subagents will deliver their synopses via the auto-inject path on a later turn. ' +
|
|
2000
1984
|
'Choose ONE of: ' +
|
|
2001
1985
|
'(a) work on a different, independent piece of the task that those subagents are NOT covering, ' +
|
|
@@ -2036,7 +2020,7 @@ class ToolUseLoop {
|
|
|
2036
2020
|
});
|
|
2037
2021
|
messages.push({
|
|
2038
2022
|
role: 'user',
|
|
2039
|
-
content: 'You set up a plan with `todo_write` earlier but have since completed ' +
|
|
2023
|
+
content: tool_use_parser_1.AUTOMATED_NUDGE_PREFIX + 'You set up a plan with `todo_write` earlier but have since completed ' +
|
|
2040
2024
|
`${editsSinceLastTodo} edit${editsSinceLastTodo === 1 ? '' : 's'} without updating it. ` +
|
|
2041
2025
|
'Call `todo_write` now with the current status — mark finished items as `completed` and leave remaining items as `pending`. ' +
|
|
2042
2026
|
"The Plan block in the user's UI mirrors your last `todo_write`, so skipping this leaves them looking at a stale checklist while real work has landed."
|