@burtson-labs/agent-core 1.6.17 → 1.6.19
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/tools/core-tools.js +6 -6
- package/dist/tools/core-tools.js.map +1 -1
- package/dist/tools/language-adapters.d.ts +1 -1
- package/dist/tools/language-adapters.d.ts.map +1 -1
- package/dist/tools/language-adapters.js +12 -6
- package/dist/tools/language-adapters.js.map +1 -1
- package/dist/tools/loop/goalAnchor.d.ts.map +1 -1
- package/dist/tools/loop/goalAnchor.js +2 -1
- package/dist/tools/loop/goalAnchor.js.map +1 -1
- package/dist/tools/loop/llmStream.js +5 -5
- package/dist/tools/loop/llmStream.js.map +1 -1
- package/dist/tools/loop/loopShared.d.ts +20 -0
- package/dist/tools/loop/loopShared.d.ts.map +1 -0
- package/dist/tools/loop/loopShared.js +105 -0
- package/dist/tools/loop/loopShared.js.map +1 -0
- package/dist/tools/loop/parallelExecute.d.ts +1 -1
- package/dist/tools/loop/turnSetup.js +3 -3
- package/dist/tools/loop/turnSetup.js.map +1 -1
- package/dist/tools/skills/mail-search-skill.js +2 -2
- package/dist/tools/skills/mail-search-skill.js.map +1 -1
- package/dist/tools/tool-registry.d.ts +17 -0
- package/dist/tools/tool-registry.d.ts.map +1 -1
- package/dist/tools/tool-registry.js +100 -25
- package/dist/tools/tool-registry.js.map +1 -1
- package/dist/tools/tool-use-loop.d.ts +15 -7
- package/dist/tools/tool-use-loop.d.ts.map +1 -1
- package/dist/tools/tool-use-loop.js +130 -158
- package/dist/tools/tool-use-loop.js.map +1 -1
- package/dist/tools/tool-use-parser.d.ts +33 -0
- package/dist/tools/tool-use-parser.d.ts.map +1 -1
- package/dist/tools/tool-use-parser.js +49 -0
- package/dist/tools/tool-use-parser.js.map +1 -1
- package/dist/tools/toolAvailabilityDetector.d.ts +0 -24
- package/dist/tools/toolAvailabilityDetector.d.ts.map +1 -1
- package/dist/tools/toolAvailabilityDetector.js +24 -11
- package/dist/tools/toolAvailabilityDetector.js.map +1 -1
- package/package.json +20 -1
|
@@ -18,12 +18,7 @@
|
|
|
18
18
|
* the host should use the Ollama `tools: [...]` field instead.
|
|
19
19
|
*/
|
|
20
20
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
21
|
-
exports.ToolUseLoop = void 0;
|
|
22
|
-
exports.sleep = sleep;
|
|
23
|
-
exports.isRetryableLlmError = isRetryableLlmError;
|
|
24
|
-
exports.tagRetryableLlmError = tagRetryableLlmError;
|
|
25
|
-
exports.summarizeLlmError = summarizeLlmError;
|
|
26
|
-
exports.isContinuationPrompt = isContinuationPrompt;
|
|
21
|
+
exports.ToolUseLoop = exports.isContinuationPrompt = exports.summarizeLlmError = exports.tagRetryableLlmError = exports.isRetryableLlmError = exports.sleep = void 0;
|
|
27
22
|
exports.isNoticingPrompt = isNoticingPrompt;
|
|
28
23
|
exports.createToolUseLoop = createToolUseLoop;
|
|
29
24
|
const tool_use_parser_1 = require("./tool-use-parser");
|
|
@@ -36,93 +31,16 @@ const parallelExecute_1 = require("./loop/parallelExecute");
|
|
|
36
31
|
const goalAnchor_1 = require("./loop/goalAnchor");
|
|
37
32
|
const finalAnswerNudges_1 = require("./loop/finalAnswerNudges");
|
|
38
33
|
const toolAvailabilityDetector_1 = require("./toolAvailabilityDetector");
|
|
34
|
+
const loopShared_1 = require("./loop/loopShared");
|
|
35
|
+
Object.defineProperty(exports, "sleep", { enumerable: true, get: function () { return loopShared_1.sleep; } });
|
|
36
|
+
Object.defineProperty(exports, "isRetryableLlmError", { enumerable: true, get: function () { return loopShared_1.isRetryableLlmError; } });
|
|
37
|
+
Object.defineProperty(exports, "tagRetryableLlmError", { enumerable: true, get: function () { return loopShared_1.tagRetryableLlmError; } });
|
|
38
|
+
Object.defineProperty(exports, "summarizeLlmError", { enumerable: true, get: function () { return loopShared_1.summarizeLlmError; } });
|
|
39
|
+
Object.defineProperty(exports, "isContinuationPrompt", { enumerable: true, get: function () { return loopShared_1.isContinuationPrompt; } });
|
|
39
40
|
const FILE_EDIT_TOOL_NAMES = new Set(['write_file', 'apply_edit', 'replace_range', 'apply_patch']);
|
|
40
41
|
function isFileEditTool(name) {
|
|
41
42
|
return FILE_EDIT_TOOL_NAMES.has(name);
|
|
42
43
|
}
|
|
43
|
-
function sleep(ms) {
|
|
44
|
-
return new Promise((resolve) => setTimeout(resolve, ms));
|
|
45
|
-
}
|
|
46
|
-
function getErrorCode(error) {
|
|
47
|
-
return typeof error === 'object' && error !== null && 'code' in error
|
|
48
|
-
? String(error.code ?? '')
|
|
49
|
-
: undefined;
|
|
50
|
-
}
|
|
51
|
-
function getErrorMessage(error) {
|
|
52
|
-
return error instanceof Error ? error.message : String(error);
|
|
53
|
-
}
|
|
54
|
-
function isRetryableLlmError(error) {
|
|
55
|
-
const code = getErrorCode(error);
|
|
56
|
-
if (code === 'USER_ABORT') {
|
|
57
|
-
return false;
|
|
58
|
-
}
|
|
59
|
-
const message = getErrorMessage(error);
|
|
60
|
-
if (/\b429\b|rate limit/i.test(message)) {
|
|
61
|
-
return false;
|
|
62
|
-
}
|
|
63
|
-
return (code === 'WATCHDOG' ||
|
|
64
|
-
/\b5\d\d\b/.test(message) ||
|
|
65
|
-
/Upstream model request failed/i.test(message) ||
|
|
66
|
-
/ECONNREFUSED|ECONNRESET|ETIMEDOUT|EAI_AGAIN|socket hang up|fetch failed|network error|terminated|UND_ERR/i.test(message));
|
|
67
|
-
}
|
|
68
|
-
function tagRetryableLlmError(error) {
|
|
69
|
-
if (error instanceof Error) {
|
|
70
|
-
const tagged = error;
|
|
71
|
-
if (!tagged.code) {
|
|
72
|
-
tagged.code = 'UPSTREAM_MODEL';
|
|
73
|
-
}
|
|
74
|
-
}
|
|
75
|
-
}
|
|
76
|
-
function summarizeLlmError(error) {
|
|
77
|
-
const message = getErrorMessage(error).replace(/\s+/g, ' ').trim();
|
|
78
|
-
return message.length > 180 ? `${message.slice(0, 177)}...` : message;
|
|
79
|
-
}
|
|
80
|
-
/**
|
|
81
|
-
* Detects "keep going" / "continue" / "yes" style prompts that
|
|
82
|
-
* carry no real goal content. The goal-anchor block uses the most recent
|
|
83
|
-
* user message as the recall text; when that text is "good lets keep
|
|
84
|
-
* going" the anchor degenerates into "remind yourself to keep going",
|
|
85
|
-
* which gives the model nothing to anchor on after 20 iterations of
|
|
86
|
-
* drift. Real on a 60-iteration linter-fix
|
|
87
|
-
* turn: every anchor injection cited "good lets keep going" as the
|
|
88
|
-
* goal. Detector lets callers walk back to a prior substantive prompt
|
|
89
|
-
* instead.
|
|
90
|
-
*
|
|
91
|
-
* Length cap (60 chars) + normalized-phrase match keeps false positives
|
|
92
|
-
* down — a sentence like "keep going on the auth refactor for the
|
|
93
|
-
* user-service" is longer than 60 chars and reads as a real goal, so it
|
|
94
|
-
* stays a goal.
|
|
95
|
-
*/
|
|
96
|
-
const CONTINUATION_PROMPT_PHRASES = new Set([
|
|
97
|
-
'continue', 'keep going', 'go on', 'proceed', 'next', 'more',
|
|
98
|
-
'please continue', 'carry on', 'finish', 'finish it', 'finish up', 'wrap up', 'wrap it up',
|
|
99
|
-
'good', 'great', 'nice', 'cool', 'sweet', 'perfect', 'ok', 'okay', 'k', 'yes', 'y', 'yep', 'yeah', 'ack', 'done',
|
|
100
|
-
"let's continue", 'lets continue', "let's keep going", 'lets keep going',
|
|
101
|
-
'good keep going', 'good lets keep going', "good let's keep going",
|
|
102
|
-
'good continue', 'ok continue', 'okay continue'
|
|
103
|
-
]);
|
|
104
|
-
function isContinuationPrompt(text) {
|
|
105
|
-
const trimmed = text.trim();
|
|
106
|
-
if (trimmed.length === 0 || trimmed.length > 60) {
|
|
107
|
-
return false;
|
|
108
|
-
}
|
|
109
|
-
// Normalize: lowercase, drop non-word/space punctuation, collapse whitespace.
|
|
110
|
-
const norm = trimmed
|
|
111
|
-
.toLowerCase()
|
|
112
|
-
.replace(/[^\w\s']/g, ' ')
|
|
113
|
-
.replace(/\s+/g, ' ')
|
|
114
|
-
.trim();
|
|
115
|
-
if (CONTINUATION_PROMPT_PHRASES.has(norm)) {
|
|
116
|
-
return true;
|
|
117
|
-
}
|
|
118
|
-
// Permit "please <phrase>" and "<phrase> please" wrappings.
|
|
119
|
-
for (const phrase of CONTINUATION_PROMPT_PHRASES) {
|
|
120
|
-
if (norm === `please ${phrase}` || norm === `${phrase} please`) {
|
|
121
|
-
return true;
|
|
122
|
-
}
|
|
123
|
-
}
|
|
124
|
-
return false;
|
|
125
|
-
}
|
|
126
44
|
/**
|
|
127
45
|
* "Noticing prompt" detector. Catches user messages that are asking
|
|
128
46
|
* about state ("are we using these?", "did you update X?", "where's
|
|
@@ -130,7 +48,7 @@ function isContinuationPrompt(text) {
|
|
|
130
48
|
* work. These signal that the user spotted a gap in the prior turn
|
|
131
49
|
* and wants the agent to address it — NOT continue the prior plan.
|
|
132
50
|
*
|
|
133
|
-
* Real failure mode captured 2026-05-25 on a
|
|
51
|
+
* Real failure mode captured 2026-05-25 on a local React refactor:
|
|
134
52
|
* user asked "I dont think we actually are using these new files are
|
|
135
53
|
* we?" after the agent wrote data files but never wired them into
|
|
136
54
|
* App.jsx. Bandit read the question as a generic "keep going" prompt,
|
|
@@ -265,7 +183,13 @@ class ToolUseLoop {
|
|
|
265
183
|
// explicit "this is a recovery attempt — answer the original goal"
|
|
266
184
|
// framing succeeds. Last resort before terminal throw.
|
|
267
185
|
let finalAnchorRetryUsed = false;
|
|
268
|
-
const textToolBlock =
|
|
186
|
+
const textToolBlock = effectiveOptions.compactToolBlock
|
|
187
|
+
? this.registry.buildCompactSystemPromptBlock()
|
|
188
|
+
: this.registry.buildSystemPromptBlock();
|
|
189
|
+
// Lowercased registered tool names — used by the narrated-call
|
|
190
|
+
// detector to anchor on "I call <real tool>" with near-zero false
|
|
191
|
+
// positives.
|
|
192
|
+
const registeredToolNames = new Set(this.registry.getAll().map(t => t.name.toLowerCase()));
|
|
269
193
|
const buildFullSystemPrompt = (useNativeTools) => {
|
|
270
194
|
if (useNativeTools) {
|
|
271
195
|
return systemPrompt ?? '';
|
|
@@ -315,7 +239,7 @@ class ToolUseLoop {
|
|
|
315
239
|
});
|
|
316
240
|
messages.push({
|
|
317
241
|
role: 'user',
|
|
318
|
-
content: '[Reading-comprehension note for the assistant: the user\'s last message above is a noticing / clarifying question — they spotted a possible gap from prior turns and are asking you to confirm or correct, NOT to continue any prior plan. Before you take any new action, identify what gap the question points at and address it directly. If the question is "are we using X?" the correct first move is to verify whether X is actually being used (read the consumer file, grep for the import, check the call site) and answer honestly — yes/no with evidence. Do NOT create more new artifacts unless the user explicitly says to.]'
|
|
242
|
+
content: tool_use_parser_1.AUTOMATED_NUDGE_PREFIX + '[Reading-comprehension note for the assistant: the user\'s last message above is a noticing / clarifying question — they spotted a possible gap from prior turns and are asking you to confirm or correct, NOT to continue any prior plan. Before you take any new action, identify what gap the question points at and address it directly. If the question is "are we using X?" the correct first move is to verify whether X is actually being used (read the consumer file, grep for the import, check the call site) and answer honestly — yes/no with evidence. Do NOT create more new artifacts unless the user explicitly says to.]'
|
|
319
243
|
});
|
|
320
244
|
}
|
|
321
245
|
let iterations = 0;
|
|
@@ -344,7 +268,7 @@ class ToolUseLoop {
|
|
|
344
268
|
// recovery, etc.) each have their own caps, but they can chain — a
|
|
345
269
|
// model can spin through 6+ no-tool-call responses because
|
|
346
270
|
// thinking-off recovery resets consecutiveEmptyRetries=0. Captured
|
|
347
|
-
// 2026-05-26 in
|
|
271
|
+
// 2026-05-26 in a real CLI session (turn-2026-05-26T02-30-37):
|
|
348
272
|
// model emitted 6 sequential reasoning-only responses inside
|
|
349
273
|
// iteration 4 before the loop finally terminated with a useless
|
|
350
274
|
// final answer ("I need to stop wrapping tool calls in reasoning
|
|
@@ -697,7 +621,7 @@ class ToolUseLoop {
|
|
|
697
621
|
// current pace and burn the extension too.
|
|
698
622
|
messages.push({
|
|
699
623
|
role: 'user',
|
|
700
|
-
content: `You've been making good progress and the iteration budget has been extended by ${CAP_EXTENSION_SIZE} (new limit: ${max}). Keep going, but tighten up: prefer batched edits over single-line ones, and start wrapping up when you have a complete answer rather than running to the new cap. This is the ${iterationCapExtensions === 1 ? 'first' : 'second'} of at most ${MAX_CAP_EXTENSIONS} extensions for this turn.`
|
|
624
|
+
content: tool_use_parser_1.AUTOMATED_NUDGE_PREFIX + `You've been making good progress and the iteration budget has been extended by ${CAP_EXTENSION_SIZE} (new limit: ${max}). Keep going, but tighten up: prefer batched edits over single-line ones, and start wrapping up when you have a complete answer rather than running to the new cap. This is the ${iterationCapExtensions === 1 ? 'first' : 'second'} of at most ${MAX_CAP_EXTENSIONS} extensions for this turn.`
|
|
701
625
|
});
|
|
702
626
|
}
|
|
703
627
|
else {
|
|
@@ -708,7 +632,7 @@ class ToolUseLoop {
|
|
|
708
632
|
// vs edit) reflects what the user actually asked for.
|
|
709
633
|
messages.push({
|
|
710
634
|
role: 'user',
|
|
711
|
-
content: `${goalRecallBlock}` +
|
|
635
|
+
content: tool_use_parser_1.AUTOMATED_NUDGE_PREFIX + `${goalRecallBlock}` +
|
|
712
636
|
`You have reached the tool-use iteration limit (${max}). Stop calling tools. Produce a final answer with three short sections, in this exact shape:\n` +
|
|
713
637
|
'\n' +
|
|
714
638
|
wrapUpBody +
|
|
@@ -722,7 +646,7 @@ class ToolUseLoop {
|
|
|
722
646
|
emit('tool_loop:total_tool_cap', { iteration: iterations, totalToolsExecuted });
|
|
723
647
|
messages.push({
|
|
724
648
|
role: 'user',
|
|
725
|
-
content: `${goalRecallBlock}` +
|
|
649
|
+
content: tool_use_parser_1.AUTOMATED_NUDGE_PREFIX + `${goalRecallBlock}` +
|
|
726
650
|
`You have executed ${totalToolsExecuted} tool calls this turn — the per-turn cap (${maxTotalTools}) has been reached. Stop calling tools. Produce a final answer with three short sections:\n` +
|
|
727
651
|
'\n' +
|
|
728
652
|
wrapUpBody +
|
|
@@ -823,7 +747,7 @@ class ToolUseLoop {
|
|
|
823
747
|
break;
|
|
824
748
|
}
|
|
825
749
|
catch (error) {
|
|
826
|
-
if (nativeTools && nativeToolFailureFallback && !nativeFallbackUsed && isRetryableLlmError(error) && !signal?.aborted) {
|
|
750
|
+
if (nativeTools && nativeToolFailureFallback && !nativeFallbackUsed && (0, loopShared_1.isRetryableLlmError)(error) && !signal?.aborted) {
|
|
827
751
|
nativeFallbackUsed = true;
|
|
828
752
|
nativeTools = false;
|
|
829
753
|
nativeSchemas = undefined;
|
|
@@ -849,7 +773,7 @@ class ToolUseLoop {
|
|
|
849
773
|
// visible markup.
|
|
850
774
|
messages.push({
|
|
851
775
|
role: 'user',
|
|
852
|
-
content: `[Provider error mid-turn — tool channel switched.] The previous attempt failed with: ${summarizeLlmError(error)}. ` +
|
|
776
|
+
content: tool_use_parser_1.AUTOMATED_NUDGE_PREFIX + `[Provider error mid-turn — tool channel switched.] The previous attempt failed with: ${(0, loopShared_1.summarizeLlmError)(error)}. ` +
|
|
853
777
|
`I retried with the text-based tool-call channel. ` +
|
|
854
778
|
`Re-emit your pending action using the text envelope: ` +
|
|
855
779
|
`<tool_call>{"name":"...","params":{...}}</tool_call> outside of any reasoning block. ` +
|
|
@@ -858,7 +782,7 @@ class ToolUseLoop {
|
|
|
858
782
|
});
|
|
859
783
|
emit('tool_loop:native_tool_fallback', {
|
|
860
784
|
iteration: iterations,
|
|
861
|
-
reason: summarizeLlmError(error)
|
|
785
|
+
reason: (0, loopShared_1.summarizeLlmError)(error)
|
|
862
786
|
});
|
|
863
787
|
continue;
|
|
864
788
|
}
|
|
@@ -871,13 +795,13 @@ class ToolUseLoop {
|
|
|
871
795
|
// this attempt, any further failure on text is genuinely
|
|
872
796
|
// terminal — the user has been waiting > 30 s and a clean
|
|
873
797
|
// error is more helpful than another silent retry.
|
|
874
|
-
if (nativeFallbackUsed && !textFallbackRetryUsed && isRetryableLlmError(error) && !signal?.aborted) {
|
|
798
|
+
if (nativeFallbackUsed && !textFallbackRetryUsed && (0, loopShared_1.isRetryableLlmError)(error) && !signal?.aborted) {
|
|
875
799
|
textFallbackRetryUsed = true;
|
|
876
800
|
emit('tool_loop:text_fallback_retry', {
|
|
877
801
|
iteration: iterations,
|
|
878
|
-
reason: summarizeLlmError(error)
|
|
802
|
+
reason: (0, loopShared_1.summarizeLlmError)(error)
|
|
879
803
|
});
|
|
880
|
-
await sleep(2400);
|
|
804
|
+
await (0, loopShared_1.sleep)(2400);
|
|
881
805
|
continue;
|
|
882
806
|
}
|
|
883
807
|
// Last-resort final-anchor retry. By this point we've spent
|
|
@@ -894,21 +818,21 @@ class ToolUseLoop {
|
|
|
894
818
|
if (!finalAnchorRetryUsed
|
|
895
819
|
&& textFallbackRetryUsed
|
|
896
820
|
&& originalGoal.trim().length > 0
|
|
897
|
-
&& isRetryableLlmError(error)
|
|
821
|
+
&& (0, loopShared_1.isRetryableLlmError)(error)
|
|
898
822
|
&& !signal?.aborted) {
|
|
899
823
|
finalAnchorRetryUsed = true;
|
|
900
824
|
messages.push({
|
|
901
825
|
role: 'user',
|
|
902
|
-
content: `[Recovery attempt — previous channel attempts hit ${summarizeLlmError(error)}. ` +
|
|
826
|
+
content: tool_use_parser_1.AUTOMATED_NUDGE_PREFIX + `[Recovery attempt — previous channel attempts hit ${(0, loopShared_1.summarizeLlmError)(error)}. ` +
|
|
903
827
|
`Discarding any partial tool_call or reasoning state from those attempts. ` +
|
|
904
828
|
`Original user goal restated as a fresh anchor:]\n\n${originalGoal.trim()}`
|
|
905
829
|
});
|
|
906
830
|
emit('tool_loop:final_anchor_retry', {
|
|
907
831
|
iteration: iterations,
|
|
908
|
-
reason: summarizeLlmError(error),
|
|
832
|
+
reason: (0, loopShared_1.summarizeLlmError)(error),
|
|
909
833
|
goalPreview: originalGoal.slice(0, 120)
|
|
910
834
|
});
|
|
911
|
-
await sleep(3600);
|
|
835
|
+
await (0, loopShared_1.sleep)(3600);
|
|
912
836
|
continue;
|
|
913
837
|
}
|
|
914
838
|
throw error;
|
|
@@ -938,7 +862,7 @@ class ToolUseLoop {
|
|
|
938
862
|
// have their own caps, but they chain — thinking-off recovery
|
|
939
863
|
// resets consecutiveEmptyRetries=0, parse-retry has its own
|
|
940
864
|
// counter, and the model can move between failure modes faster
|
|
941
|
-
// than any one detector can give up.
|
|
865
|
+
// than any one detector can give up. Real CLI session
|
|
942
866
|
// 2026-05-26 turn-02-30-37: 6 sequential reasoning-only
|
|
943
867
|
// responses inside one iteration before the loop terminated
|
|
944
868
|
// silently. This counter increments on EVERY response without
|
|
@@ -979,9 +903,9 @@ class ToolUseLoop {
|
|
|
979
903
|
// Also reset the prefill-recovery one-shot. The recovery budget
|
|
980
904
|
// is "per stretch of failures," not "once per turn" — without
|
|
981
905
|
// this reset, a long refactor that recovers from one prefill
|
|
982
|
-
// stall and then hits another (
|
|
983
|
-
//
|
|
984
|
-
//
|
|
906
|
+
// stall and then hits another (observed in a real run: 26
|
|
907
|
+
// iterations, prefill burned at iter 25, iter 26 stalled again
|
|
908
|
+
// with no recovery left) falls straight
|
|
985
909
|
// through to the terminal "Bandit stalled" fallback even though
|
|
986
910
|
// every other detector still has budget. The hard cap on
|
|
987
911
|
// noToolCallAttemptsThisTurn (5) bounds the total stuck
|
|
@@ -1013,7 +937,7 @@ class ToolUseLoop {
|
|
|
1013
937
|
messages.push({ role: 'assistant', content: scrubbed });
|
|
1014
938
|
messages.push({
|
|
1015
939
|
role: 'user',
|
|
1016
|
-
content: 'You emitted a `<tool_result>` envelope in your response. Those envelopes are SYSTEM output — they appear BETWEEN your turns, never inside your own message. If you meant to invoke a tool, emit a single `<tool_call>{"name":"...","params":{...}}</tool_call>` and wait for the real result. If the task is complete, give a plain-prose final answer with no XML envelopes. Retry now.'
|
|
940
|
+
content: tool_use_parser_1.AUTOMATED_NUDGE_PREFIX + 'You emitted a `<tool_result>` envelope in your response. Those envelopes are SYSTEM output — they appear BETWEEN your turns, never inside your own message. If you meant to invoke a tool, emit a single `<tool_call>{"name":"...","params":{...}}</tool_call>` and wait for the real result. If the task is complete, give a plain-prose final answer with no XML envelopes. Retry now.'
|
|
1017
941
|
});
|
|
1018
942
|
continue;
|
|
1019
943
|
}
|
|
@@ -1044,7 +968,7 @@ class ToolUseLoop {
|
|
|
1044
968
|
messages.push({ role: 'assistant', content: scrubbed });
|
|
1045
969
|
messages.push({
|
|
1046
970
|
role: 'user',
|
|
1047
|
-
content: 'You emitted ` ```bandit-tl` (or `bandit-run` / `bandit-subagent`) fenced JSON in your response. Those fences are emitted by the EXTENSION HOST to log real tool execution — you CANNOT produce them. They show up in your context because the host logged actual tool calls, not because you can fabricate them. To actually run a tool, emit `<tool_call>{"name":"...","params":{...}}</tool_call>` and wait for the real result. Your fake fences mean NO work has happened this turn. You have TWO options for your retry, and ONLY two: (a) Emit a real `<tool_call>{"name":"...","params":{...}}</tool_call>` envelope NOW to actually do the work, then wait for the real result. (b) Honestly state "I have not [action] yet" and STOP. Do NOT claim completion. You MUST NOT claim you have fixed / eliminated / resolved / removed / cleaned / verified anything. No "successfully [verb]" phrasing. No numbered lists of "Step 1: I did X" actions. No "the project is now in a healthy state." Until a real `<tool_call>` lands on disk and returns a real tool-result, nothing has changed. Lying about completion is the worst failure mode. Retry now.'
|
|
971
|
+
content: tool_use_parser_1.AUTOMATED_NUDGE_PREFIX + 'You emitted ` ```bandit-tl` (or `bandit-run` / `bandit-subagent`) fenced JSON in your response. Those fences are emitted by the EXTENSION HOST to log real tool execution — you CANNOT produce them. They show up in your context because the host logged actual tool calls, not because you can fabricate them. To actually run a tool, emit `<tool_call>{"name":"...","params":{...}}</tool_call>` and wait for the real result. Your fake fences mean NO work has happened this turn. You have TWO options for your retry, and ONLY two: (a) Emit a real `<tool_call>{"name":"...","params":{...}}</tool_call>` envelope NOW to actually do the work, then wait for the real result. (b) Honestly state "I have not [action] yet" and STOP. Do NOT claim completion. You MUST NOT claim you have fixed / eliminated / resolved / removed / cleaned / verified anything. No "successfully [verb]" phrasing. No numbered lists of "Step 1: I did X" actions. No "the project is now in a healthy state." Until a real `<tool_call>` lands on disk and returns a real tool-result, nothing has changed. Lying about completion is the worst failure mode. Retry now.'
|
|
1048
972
|
});
|
|
1049
973
|
continue;
|
|
1050
974
|
}
|
|
@@ -1066,7 +990,10 @@ class ToolUseLoop {
|
|
|
1066
990
|
&& !(0, tool_use_parser_1.hasToolCalls)(response)
|
|
1067
991
|
&& toolAbsenceCorrectionsFired < TOOL_ABSENCE_CORRECTION_CAP) {
|
|
1068
992
|
const registeredNames = this.registry.getAll().map((t) => t.name);
|
|
1069
|
-
|
|
993
|
+
// Reasoning channels MUST be stripped before prose-matching:
|
|
994
|
+
// reasoning narrates tool usage by name and false-positives the
|
|
995
|
+
// absence phrases (see toolAvailabilityDetector.ts header).
|
|
996
|
+
const absence = (0, toolAvailabilityDetector_1.detectFalseToolAbsence)((0, tool_use_parser_1.stripReasoningChannels)(response), registeredNames);
|
|
1070
997
|
if (absence.detected) {
|
|
1071
998
|
toolAbsenceCorrectionsFired++;
|
|
1072
999
|
emit('tool_loop:false_tool_absence', {
|
|
@@ -1098,7 +1025,7 @@ class ToolUseLoop {
|
|
|
1098
1025
|
messages.push({ role: 'assistant', content: response });
|
|
1099
1026
|
messages.push({
|
|
1100
1027
|
role: 'user',
|
|
1101
|
-
content: 'The previous tool call returned an error and you produced no follow-up tool_call. ' +
|
|
1028
|
+
content: tool_use_parser_1.AUTOMATED_NUDGE_PREFIX + 'The previous tool call returned an error and you produced no follow-up tool_call. ' +
|
|
1102
1029
|
'Do NOT silently abandon the request — the user expects you to either retry with corrected parameters OR state explicitly which precondition failed and why you cannot proceed. ' +
|
|
1103
1030
|
'Choose one: (a) emit a corrected `<tool_call>{"name":"...","params":{...}}</tool_call>` now, fixing the param shape or value the error pointed at; ' +
|
|
1104
1031
|
'(b) give a one-line final answer naming the exact precondition you lack (e.g. "I cannot trash message X because the message id is unknown — please provide it"). ' +
|
|
@@ -1117,12 +1044,12 @@ class ToolUseLoop {
|
|
|
1117
1044
|
// without emitting an actual tool_call. Visually the user sees a
|
|
1118
1045
|
// wall of reasoning text and nothing happens. Strip the reasoning
|
|
1119
1046
|
// fences before checking emptiness so the same nudge fires.
|
|
1120
|
-
|
|
1121
|
-
|
|
1122
|
-
|
|
1123
|
-
|
|
1124
|
-
|
|
1125
|
-
|
|
1047
|
+
// Strip reasoning channels AND stray fence scaffolding (a bare
|
|
1048
|
+
// leading ``` opener that wraps the reasoning) so the
|
|
1049
|
+
// reasoning-only check isn't fooled into seeing the orphan ``` as
|
|
1050
|
+
// a real answer — which let a "reasoning + no tool call" turn end
|
|
1051
|
+
// with no answer (real CLI run, 2026-06-15).
|
|
1052
|
+
const stripped = (0, tool_use_parser_1.stripToAnswerContent)(response);
|
|
1126
1053
|
const reasoningOnly = !stripped && response.trim().length > 0;
|
|
1127
1054
|
// "Narrated but didn't act" detector. Some models (notably ones
|
|
1128
1055
|
// post-trained for a different tool-call envelope, e.g. OpenAI
|
|
@@ -1138,8 +1065,8 @@ class ToolUseLoop {
|
|
|
1138
1065
|
// in the model's final clause, not an earlier "I have already
|
|
1139
1066
|
// searched the file" preamble before a real answer.
|
|
1140
1067
|
//
|
|
1141
|
-
// Captured 2026-05-25 (
|
|
1142
|
-
// "I'll redesign the
|
|
1068
|
+
// Captured 2026-05-25 (real IDE session): model emitted
|
|
1069
|
+
// "I'll redesign the page... Let me rewrite both files." with
|
|
1143
1070
|
// NO tool_call and the turn closed as a final answer because
|
|
1144
1071
|
// neither `redesign` nor `rewrite` was on the list. A long
|
|
1145
1072
|
// session ended with zero work shipped. Missing a verb here =
|
|
@@ -1147,10 +1074,13 @@ class ToolUseLoop {
|
|
|
1147
1074
|
const NARRATE_VERB_RE = /\b(use|uses|used|using|call|calls|called|calling|invoke|invokes|invoked|invoking|execute|executes|executed|executing|run|runs|running|ran|search|searches|searched|searching|look|looks|looked|looking|read|reads|reading|check|checks|checked|checking|find|finds|finding|found|list|lists|listed|listing|fetch|fetches|fetched|fetching|grep|greps|grepped|grepping|explore|explores|explored|exploring|locate|locates|located|locating|plan|plans|planned|planning|start|starts|started|starting|begin|begins|began|beginning|create|creates|created|creating|write|writes|wrote|writing|rewrite|rewrites|rewrote|rewriting|rewritten|build|builds|built|building|rebuild|rebuilds|rebuilt|rebuilding|update|updates|updated|updating|implement|implements|implemented|implementing|refactor|refactors|refactored|refactoring|redesign|redesigns|redesigned|redesigning|design|designs|designed|designing|generate|generates|generated|generating|scaffold|scaffolds|scaffolded|scaffolding|set\s+up|setting\s+up|tackle|tackles|tackled|tackling|do|does|did|doing|make|makes|made|making|batch|batches|batched|batching|execute|prepare|prepares|prepared|preparing|draft|drafts|drafted|drafting|outline|outlines|outlined|outlining|organize|organizes|organized|organizing|structure|structures|structured|structuring|kick\s+off|kicking\s+off|fix|fixes|fixed|fixing|edit|edits|edited|editing|modify|modifies|modified|modifying|patch|patches|patched|patching|adjust|adjusts|adjusted|adjusting|replace|replaces|replaced|replacing|swap|swaps|swapped|swapping|polish|polishes|polished|polishing|clean\s+up|cleaning\s+up|tidy|tidies|tidied|tidying|finalize|finalizes|finalized|finalizing|finish|finishes|finished|finishing|complete|completes|completed|completing|wire|wires|wired|wiring|hook|hooks|hooked|hooking|render|renders|rendered|rendering|style|styles|styled|styling|theme|themes|themed|theming|redo|redoes|redid|redoing|port|ports|ported|porting|migrate|migrates|migrated|migrating|configure|configures|configured|configuring|install|installs|installed|installing|remove|removes|removed|removing|delete|deletes|deleted|deleting|rename|renames|renamed|renaming)\b/i;
|
|
1148
1075
|
const NARRATE_INTENT_RE = /\b(we (?:will|need to|should)|we'?ll|we'?re going to|i'?ll|i will|let me|let'?s|going to|i'?m going to|i need to)\b/i;
|
|
1149
1076
|
// Real code fences pass through; narrate only fires when the
|
|
1150
|
-
// model emitted no structured payload at all.
|
|
1151
|
-
// response
|
|
1152
|
-
//
|
|
1153
|
-
|
|
1077
|
+
// model emitted no structured payload at all. Use the
|
|
1078
|
+
// reasoning-stripped response (NOT `stripped`, which also removes
|
|
1079
|
+
// bare fence-marker lines) so a genuine ```json / ```diff payload
|
|
1080
|
+
// still suppresses the narrate nudge and reaches its own
|
|
1081
|
+
// auto-promote detector. `bandit-reasoning` fences are reasoning,
|
|
1082
|
+
// not structured output, so they're excluded either way.
|
|
1083
|
+
const hasCodeFence = /```[a-zA-Z0-9_-]*\s*\n/.test((0, tool_use_parser_1.stripReasoningChannels)(response));
|
|
1154
1084
|
const tailMatch = stripped.match(/(?:[.!?]\s+)([^.!?]*)$/);
|
|
1155
1085
|
const tail = (tailMatch ? tailMatch[1] : stripped).slice(-200);
|
|
1156
1086
|
const narratedButNoAction = !(0, tool_use_parser_1.hasToolCalls)(response) &&
|
|
@@ -1159,6 +1089,22 @@ class ToolUseLoop {
|
|
|
1159
1089
|
stripped.length < 240 &&
|
|
1160
1090
|
NARRATE_INTENT_RE.test(tail) &&
|
|
1161
1091
|
NARRATE_VERB_RE.test(tail);
|
|
1092
|
+
// Performative narrated call: "I call read_file with path=README.md".
|
|
1093
|
+
// The generic gate above caps stripped.length at 240 to avoid false
|
|
1094
|
+
// positives on real answers that merely contain narrate verbs — but
|
|
1095
|
+
// when the final clause NAMES A REGISTERED TOOL in a performative
|
|
1096
|
+
// phrase, the length cap is wrong: a long planning recap that ends
|
|
1097
|
+
// "I call read_file with path=…" is a stall no matter how long the
|
|
1098
|
+
// recap is, and tool-name anchoring keeps the false-positive rate
|
|
1099
|
+
// near zero. Captured 2026-06-12 (real CLI session,
|
|
1100
|
+
// gemma4:e4b): iteration 1 emitted a reasoning recap ending with
|
|
1101
|
+
// exactly that sentence and no tool_call — the generic gate missed
|
|
1102
|
+
// it (over the length cap; intent list lacks present-tense "I
|
|
1103
|
+
// call") and the turn closed as a final answer.
|
|
1104
|
+
const narratedCallMatch = stripped.slice(-300).match(/\b(?:i\s+(?:will\s+|now\s+|then\s+)?(?:call|invoke|run|use)|calling|invoking|let'?s\s+(?:call|run|use))\s+(?:the\s+)?`?([a-z][a-z0-9_]*)`?/i);
|
|
1105
|
+
const narratedToolCallNoAction = !(0, tool_use_parser_1.hasToolCalls)(response) &&
|
|
1106
|
+
!!narratedCallMatch &&
|
|
1107
|
+
registeredToolNames.has(narratedCallMatch[1].toLowerCase());
|
|
1162
1108
|
// Empty-response retry: was previously gated to `iterations > 0`
|
|
1163
1109
|
// under the assumption "empty first response = provider outage."
|
|
1164
1110
|
// That assumption was wrong — with bandit-logic
|
|
@@ -1170,7 +1116,7 @@ class ToolUseLoop {
|
|
|
1170
1116
|
// the model gets a second chance (and the thinking-off recovery
|
|
1171
1117
|
// below can flip it to non-thinking mode if the second pass also
|
|
1172
1118
|
// empties).
|
|
1173
|
-
const shouldNudge = (!response.trim() || reasoningOnly || narratedButNoAction) &&
|
|
1119
|
+
const shouldNudge = (!response.trim() || reasoningOnly || narratedButNoAction || narratedToolCallNoAction) &&
|
|
1174
1120
|
!hitLimit &&
|
|
1175
1121
|
consecutiveEmptyRetries < 2 &&
|
|
1176
1122
|
!thinkingOffRecoveryAttempted;
|
|
@@ -1180,16 +1126,17 @@ class ToolUseLoop {
|
|
|
1180
1126
|
iteration: iterations,
|
|
1181
1127
|
attempt: consecutiveEmptyRetries,
|
|
1182
1128
|
reasoningOnly,
|
|
1183
|
-
narratedButNoAction
|
|
1129
|
+
narratedButNoAction,
|
|
1130
|
+
narratedToolCallNoAction
|
|
1184
1131
|
});
|
|
1185
|
-
const nudgeMessage = narratedButNoAction
|
|
1132
|
+
const nudgeMessage = (narratedButNoAction || narratedToolCallNoAction)
|
|
1186
1133
|
? 'You announced your next step in prose ("we will search…" / "let me check…" / "use X to find Y") but did NOT emit a `<tool_call>` envelope. Announcing intent is not enough — you must actually invoke the tool. Emit the call now in this exact format, OUTSIDE of any reasoning block, with NO commentary and NO markdown fence:\n\n<tool_call>{"name":"<tool>","params":{"<key>":"<value>"}}</tool_call>\n\nReplace name/params with the right values for your task. Or, if the task is already answerable from what you know, give a final answer instead.'
|
|
1187
1134
|
: reasoningOnly
|
|
1188
1135
|
? 'You completed reasoning but emitted no tool_call AND no final answer. The reasoning text alone does not run a tool — you must emit a `<tool_call>` envelope OUTSIDE the reasoning block. Format example (replace name/params for your task):\n\n<tool_call>{"name":"<tool>","params":{"<key>":"<value>"}}</tool_call>\n\nNo prose around it, no markdown fence, just the bare tag. If the task is answerable without a tool, write a complete final answer instead. Do not stop after only thinking.'
|
|
1189
1136
|
: 'Your previous response was empty. Either emit a `<tool_call>{"name":"<tool>","params":{...}}</tool_call>` to invoke a tool, OR produce a complete final answer using what you have. Do not respond with an empty message.';
|
|
1190
1137
|
messages.push({
|
|
1191
1138
|
role: 'user',
|
|
1192
|
-
content: nudgeMessage
|
|
1139
|
+
content: tool_use_parser_1.AUTOMATED_NUDGE_PREFIX + nudgeMessage
|
|
1193
1140
|
});
|
|
1194
1141
|
continue;
|
|
1195
1142
|
}
|
|
@@ -1222,7 +1169,7 @@ class ToolUseLoop {
|
|
|
1222
1169
|
});
|
|
1223
1170
|
messages.push({
|
|
1224
1171
|
role: 'user',
|
|
1225
|
-
content: 'Switching to non-thinking mode for this attempt because reasoning-only retries exhausted. Emit either a tool_call or a complete final answer. No more reasoning preamble.'
|
|
1172
|
+
content: tool_use_parser_1.AUTOMATED_NUDGE_PREFIX + 'Switching to non-thinking mode for this attempt because reasoning-only retries exhausted. Emit either a tool_call or a complete final answer. No more reasoning preamble.'
|
|
1226
1173
|
});
|
|
1227
1174
|
continue;
|
|
1228
1175
|
}
|
|
@@ -1270,8 +1217,8 @@ class ToolUseLoop {
|
|
|
1270
1217
|
messages.push({
|
|
1271
1218
|
role: 'user',
|
|
1272
1219
|
content: firstRetry
|
|
1273
|
-
? 'Your previous tool_call was not valid JSON — I could not parse it. Common cause: unescaped `"` characters inside a string value (for example `["", "", ""]` inside a `content` string). Retry the tool call with properly escaped JSON: every `"` inside a string value must be written as `\\"`, and every newline as `\\n`. If the content is very long, consider `replace_range` for a line-numbered block or breaking the change into smaller edits.'
|
|
1274
|
-
: 'Your tool_call still did not parse. Do NOT retry with the same shape or the same escaping failure. Switch tactics: (a) call `replace_range` for a large block whose line numbers you just read, (b) call `write_file` for a new file, or (c) split the change into multiple small `apply_edit` calls that each target just one method or block (e.g. 3-5 lines of `find`, 5-10 lines of `replace`) instead of rewriting the whole class. Pick the smallest scope that accomplishes the next step. If you cannot produce a valid tool call, respond with a plain-prose final answer acknowledging you could not complete the edit.'
|
|
1220
|
+
? tool_use_parser_1.AUTOMATED_NUDGE_PREFIX + 'Your previous tool_call was not valid JSON — I could not parse it. Common cause: unescaped `"` characters inside a string value (for example `["", "", ""]` inside a `content` string). Retry the tool call with properly escaped JSON: every `"` inside a string value must be written as `\\"`, and every newline as `\\n`. If the content is very long, consider `replace_range` for a line-numbered block or breaking the change into smaller edits.'
|
|
1221
|
+
: tool_use_parser_1.AUTOMATED_NUDGE_PREFIX + 'Your tool_call still did not parse. Do NOT retry with the same shape or the same escaping failure. Switch tactics: (a) call `replace_range` for a large block whose line numbers you just read, (b) call `write_file` for a new file, or (c) split the change into multiple small `apply_edit` calls that each target just one method or block (e.g. 3-5 lines of `find`, 5-10 lines of `replace`) instead of rewriting the whole class. Pick the smallest scope that accomplishes the next step. If you cannot produce a valid tool call, respond with a plain-prose final answer acknowledging you could not complete the edit.'
|
|
1275
1222
|
});
|
|
1276
1223
|
continue;
|
|
1277
1224
|
}
|
|
@@ -1317,7 +1264,7 @@ class ToolUseLoop {
|
|
|
1317
1264
|
});
|
|
1318
1265
|
messages.push({
|
|
1319
1266
|
role: 'user',
|
|
1320
|
-
content: 'STOP deliberating. Your last response either repeated itself, contradicted itself (e.g. "Wait, I see X / Actually I\'ll try X"), or was aborted mid-stream as a loop. Do NOT continue speculating about what files might exist. Take exactly one of these actions now: (a) invoke a tool (`list_files`, `read_file`, `search_code`, etc.) to answer the question with real data, OR (b) give up and tell the user plainly that you could not complete the task and why. Do not write more than two sentences of prose before either calling a tool or terminating.'
|
|
1267
|
+
content: tool_use_parser_1.AUTOMATED_NUDGE_PREFIX + 'STOP deliberating. Your last response either repeated itself, contradicted itself (e.g. "Wait, I see X / Actually I\'ll try X"), or was aborted mid-stream as a loop. Do NOT continue speculating about what files might exist. Take exactly one of these actions now: (a) invoke a tool (`list_files`, `read_file`, `search_code`, etc.) to answer the question with real data, OR (b) give up and tell the user plainly that you could not complete the task and why. Do not write more than two sentences of prose before either calling a tool or terminating.'
|
|
1321
1268
|
});
|
|
1322
1269
|
recentNonToolResponses.length = 0;
|
|
1323
1270
|
continue;
|
|
@@ -1407,7 +1354,7 @@ class ToolUseLoop {
|
|
|
1407
1354
|
// without being so loud that it derails prose responses.
|
|
1408
1355
|
messages.push({
|
|
1409
1356
|
role: 'user',
|
|
1410
|
-
content: 'Note: I detected a JSON todo list in your response and auto-promoted it to a todo_write call. Next time, emit `<tool_call>{"name":"todo_write","params":{"items":"..."}}</tool_call>` directly instead of pasting JSON as a code block — pasted JSON does not update your plan, only the tool call does.'
|
|
1357
|
+
content: tool_use_parser_1.AUTOMATED_NUDGE_PREFIX + 'Note: I detected a JSON todo list in your response and auto-promoted it to a todo_write call. Next time, emit `<tool_call>{"name":"todo_write","params":{"items":"..."}}</tool_call>` directly instead of pasting JSON as a code block — pasted JSON does not update your plan, only the tool call does.'
|
|
1411
1358
|
});
|
|
1412
1359
|
iterations++;
|
|
1413
1360
|
continue;
|
|
@@ -1435,7 +1382,26 @@ class ToolUseLoop {
|
|
|
1435
1382
|
responsePreview: response.slice(0, 300)
|
|
1436
1383
|
});
|
|
1437
1384
|
}
|
|
1438
|
-
|
|
1385
|
+
// Reasoning channels are streamed live by the host for display —
|
|
1386
|
+
// leaving them in the terminal answer double-renders them, and on
|
|
1387
|
+
// fabrication-retry exhaustion it prints the model's confusion
|
|
1388
|
+
// narrative as if it were the answer (real CLI run,
|
|
1389
|
+
// 2026-06-12T20-19 turn: three near-identical "the user is
|
|
1390
|
+
// correcting my formatting error" reasoning blocks rendered above
|
|
1391
|
+
// the real answer). The stall fallback below still inspects the
|
|
1392
|
+
// raw `response`, so reasoning-only turns keep their fallback.
|
|
1393
|
+
// ORDER MATTERS: reasoning channels strip FIRST. Reasoning text
|
|
1394
|
+
// routinely MENTIONS envelopes in backticks ("I included a
|
|
1395
|
+
// `<tool_result>` envelope…"); if markup stripping ran first, its
|
|
1396
|
+
// envelope regex would match from that in-fence mention through
|
|
1397
|
+
// to the real closing tag, eat the fence's closing ``` along the
|
|
1398
|
+
// way, and the unclosed-fence cleanup would then wipe the entire
|
|
1399
|
+
// rest of the answer.
|
|
1400
|
+
const finalResponse = (0, tool_use_parser_1.stripToolCallMarkup)(response
|
|
1401
|
+
.replace(/<think\b[\s\S]*?<\/think\s*>/gi, '')
|
|
1402
|
+
.replace(/<think\b[\s\S]*$/i, '')
|
|
1403
|
+
.replace(/```bandit-reasoning\b[\s\S]*?```/gi, '')
|
|
1404
|
+
.replace(/```bandit-reasoning\b[\s\S]*$/i, '')).trim();
|
|
1439
1405
|
// False-completion detector. Small models regularly end a turn
|
|
1440
1406
|
// with "I refactored the file" / "here is the updated code" text
|
|
1441
1407
|
// without ever emitting a file-edit tool call.
|
|
@@ -1445,14 +1411,25 @@ class ToolUseLoop {
|
|
|
1445
1411
|
// this turn, push one corrective user message into the loop
|
|
1446
1412
|
// and continue for one more iteration. The nudge is capped at
|
|
1447
1413
|
// one per turn so a truly confused model can still terminate.
|
|
1448
|
-
|
|
1414
|
+
//
|
|
1415
|
+
// Skipped ONLY for analysis goals that imply no edit. Without this
|
|
1416
|
+
// gate the detector demanded an edit on a purely informational
|
|
1417
|
+
// "tell me about this repo" turn: the model correctly said "I have
|
|
1418
|
+
// completed the overview" (a completion phrase), no edit ran
|
|
1419
|
+
// (none was asked for), so the nudge fired and replaced the good
|
|
1420
|
+
// markdown overview with a defensive "no edits are required"
|
|
1421
|
+
// answer — plus a wall of "automated harness check" reasoning.
|
|
1422
|
+
// An analysis goal that does NOT also imply an edit can never
|
|
1423
|
+
// false-complete, so skip it. (real CLI run, 2026-06-12.)
|
|
1424
|
+
const goalCouldExpectEdit = promptImpliesFileEdit || !promptWantsAnalysis;
|
|
1425
|
+
if (!hitLimit && !falseCompletionNudged && editToolsInvoked === 0 && goalCouldExpectEdit) {
|
|
1449
1426
|
const claimsCompletion = FALSE_COMPLETION_PATTERNS.some(re => re.test(finalResponse));
|
|
1450
1427
|
if (claimsCompletion) {
|
|
1451
1428
|
falseCompletionNudged = true;
|
|
1452
1429
|
emit('tool_loop:false_completion_nudge', { iteration: iterations, responsePreview: finalResponse.slice(0, 200) });
|
|
1453
1430
|
messages.push({
|
|
1454
1431
|
role: 'user',
|
|
1455
|
-
content: 'Your response either claims work is done OR apologizes and asks what to do next — but I see NO successful `write_file`, `apply_edit`, `replace_range`, or `apply_patch` tool call in this turn, so nothing on disk has changed. ' +
|
|
1432
|
+
content: tool_use_parser_1.AUTOMATED_NUDGE_PREFIX + 'Your response either claims work is done OR apologizes and asks what to do next — but I see NO successful `write_file`, `apply_edit`, `replace_range`, or `apply_patch` tool call in this turn, so nothing on disk has changed. ' +
|
|
1456
1433
|
'Do NOT ask the user which task to resume, do NOT promise to escape JSON "in your next tool call", and do NOT defer. Either (a) emit a real edit tool call NOW with the actual change — use `replace_range` for a large block whose line numbers you just read, `apply_edit` for a small exact replacement, or `write_file` for a new file — or (b) respond honestly that you could not complete the task and briefly explain why. Retry the tool call yourself; the user cannot help you escape JSON.'
|
|
1457
1434
|
});
|
|
1458
1435
|
continue;
|
|
@@ -1491,7 +1468,7 @@ class ToolUseLoop {
|
|
|
1491
1468
|
});
|
|
1492
1469
|
messages.push({
|
|
1493
1470
|
role: 'user',
|
|
1494
|
-
content: `Your response describes edits to ${fileSet.size} files (${[...fileSet].slice(0, 8).join(', ')}${fileSet.size > 8 ? ', …' : ''}), but only ${editToolsInvoked} successful edit${editToolsInvoked === 1 ? '' : 's'} actually fired this turn. ` +
|
|
1471
|
+
content: tool_use_parser_1.AUTOMATED_NUDGE_PREFIX + `Your response describes edits to ${fileSet.size} files (${[...fileSet].slice(0, 8).join(', ')}${fileSet.size > 8 ? ', …' : ''}), but only ${editToolsInvoked} successful edit${editToolsInvoked === 1 ? '' : 's'} actually fired this turn. ` +
|
|
1495
1472
|
`The remaining ${fileSet.size - editToolsInvoked} file(s) were NOT modified — nothing landed on disk for them. ` +
|
|
1496
1473
|
'Either (a) emit the missing `apply_edit` / `replace_range` / `write_file` tool calls now to actually do the work, OR (b) revise your response to honestly describe ONLY the edits that successfully applied. Do not summarize work that did not happen.'
|
|
1497
1474
|
});
|
|
@@ -1502,7 +1479,7 @@ class ToolUseLoop {
|
|
|
1502
1479
|
// ("break out", "split", "refactor", "extract", "move") imply
|
|
1503
1480
|
// mutation of the SOURCE file the user wants restructured, not
|
|
1504
1481
|
// just creation of new sibling files. Failure mode observed
|
|
1505
|
-
// 2026-05-25 on a
|
|
1482
|
+
// 2026-05-25 on a local React refactor: model read App.jsx,
|
|
1506
1483
|
// wrote 5 new component files, never touched App.jsx, declared
|
|
1507
1484
|
// completion. User had to follow up "are we using these?" to
|
|
1508
1485
|
// force the integration step — and even that follow-up turn
|
|
@@ -1534,7 +1511,7 @@ class ToolUseLoop {
|
|
|
1534
1511
|
const writeCount = filesWrittenThisTurn.size;
|
|
1535
1512
|
messages.push({
|
|
1536
1513
|
role: 'user',
|
|
1537
|
-
content: `The user's goal contains a refactor verb (refactor/break out/split/extract/move) which implies the SOURCE file(s) should be modified, not just supplemented with new siblings. You read ${readPreview}${readNotWritten.length > 3 ? ' and others' : ''} for context, then wrote ${writeCount} NEW file(s), but you NEVER modified the file(s) you read. The refactor is incomplete: the source file still contains the old monolithic code. ` +
|
|
1514
|
+
content: tool_use_parser_1.AUTOMATED_NUDGE_PREFIX + `The user's goal contains a refactor verb (refactor/break out/split/extract/move) which implies the SOURCE file(s) should be modified, not just supplemented with new siblings. You read ${readPreview}${readNotWritten.length > 3 ? ' and others' : ''} for context, then wrote ${writeCount} NEW file(s), but you NEVER modified the file(s) you read. The refactor is incomplete: the source file still contains the old monolithic code. ` +
|
|
1538
1515
|
`Emit the missing apply_edit/replace_range/write_file call on the source file now — it should import from the new files and drop the inlined code that's been extracted. If the refactor is genuinely a "scaffold only, leave source untouched" task, say so explicitly and explain why the source doesn't need to change.`
|
|
1539
1516
|
});
|
|
1540
1517
|
continue;
|
|
@@ -1576,7 +1553,7 @@ class ToolUseLoop {
|
|
|
1576
1553
|
});
|
|
1577
1554
|
messages.push({
|
|
1578
1555
|
role: 'user',
|
|
1579
|
-
content: 'You produced a substantial code block in your reply but never emitted a `write_file`, `apply_edit`, `replace_range`, or `apply_patch` tool call — so the change is NOT on disk. ' +
|
|
1556
|
+
content: tool_use_parser_1.AUTOMATED_NUDGE_PREFIX + 'You produced a substantial code block in your reply but never emitted a `write_file`, `apply_edit`, `replace_range`, or `apply_patch` tool call — so the change is NOT on disk. ' +
|
|
1580
1557
|
'Do not ask the user to paste your code into a file themselves. Take exactly one of these actions now: (a) call `replace_range`, `apply_edit`, or `write_file` with the real change to the correct file, OR (b) say plainly that you could not locate the target file and explain what you searched for. Do not wrap up with another prose + code-fence response.'
|
|
1581
1558
|
});
|
|
1582
1559
|
continue;
|
|
@@ -1660,7 +1637,7 @@ class ToolUseLoop {
|
|
|
1660
1637
|
});
|
|
1661
1638
|
messages.push({
|
|
1662
1639
|
role: 'user',
|
|
1663
|
-
content: 'Your first response had reasoning but emitted NO tool call — that is a hard stall for a subagent (you exist to gather information; reasoning alone produces zero output). ' +
|
|
1640
|
+
content: tool_use_parser_1.AUTOMATED_NUDGE_PREFIX + 'Your first response had reasoning but emitted NO tool call — that is a hard stall for a subagent (you exist to gather information; reasoning alone produces zero output). ' +
|
|
1664
1641
|
'For your next response, emit a tool call. The minimum viable starting move for ANY exploration goal is:\n\n' +
|
|
1665
1642
|
'<tool_call>{"name":"list_files","params":{"path":"."}}</tool_call>\n\n' +
|
|
1666
1643
|
'Copy that exact envelope as the very first thing you emit (you may keep the reasoning block before it if your model needs to think first, but the tool_call envelope MUST appear in this turn). ' +
|
|
@@ -1682,7 +1659,7 @@ class ToolUseLoop {
|
|
|
1682
1659
|
// user saw nothing.
|
|
1683
1660
|
//
|
|
1684
1661
|
// The gate also covers the "regurgitated reasoning after
|
|
1685
|
-
// native→text channel fallback" case.
|
|
1662
|
+
// native→text channel fallback" case. Real CLI
|
|
1686
1663
|
// 2026-05-31T17-39-53 cleanup turn: native-tool path 500'd,
|
|
1687
1664
|
// text-channel recovery prompted the model to re-emit its
|
|
1688
1665
|
// pending action, but the model just echoed its prior
|
|
@@ -1694,12 +1671,7 @@ class ToolUseLoop {
|
|
|
1694
1671
|
// before testing emptiness — if the response would render to
|
|
1695
1672
|
// the user as nothing-actionable, the fallback fires and the
|
|
1696
1673
|
// user sees what the model was thinking instead of silence.
|
|
1697
|
-
const reasoningStripped = response
|
|
1698
|
-
.replace(/<think\b[\s\S]*?<\/think\s*>/gi, '')
|
|
1699
|
-
.replace(/<think\b[\s\S]*$/i, '')
|
|
1700
|
-
.replace(/```bandit-reasoning\b[\s\S]*?```/gi, '')
|
|
1701
|
-
.replace(/```bandit-reasoning\b[\s\S]*$/i, '')
|
|
1702
|
-
.trim();
|
|
1674
|
+
const reasoningStripped = (0, tool_use_parser_1.stripToAnswerContent)(response);
|
|
1703
1675
|
const visibleAfterStrip = (0, tool_use_parser_1.stripToolCallMarkup)(reasoningStripped).trim();
|
|
1704
1676
|
if (!visibleAfterStrip) {
|
|
1705
1677
|
// Pull the last 1-2 sentences of reasoning so the user sees
|
|
@@ -1725,7 +1697,7 @@ class ToolUseLoop {
|
|
|
1725
1697
|
// and the inline empty-retry / narrate-no-action detector
|
|
1726
1698
|
// already used its retry budget (consecutiveEmptyRetries >= 2)
|
|
1727
1699
|
// so it couldn't nudge again, the user is left reading a
|
|
1728
|
-
// promise the model never kept.
|
|
1700
|
+
// promise the model never kept. Real CLI
|
|
1729
1701
|
// 2026-05-31T17-39-53 cleanup turn: after a native→text channel
|
|
1730
1702
|
// recovery, the model emitted "Let me revert it:" with a
|
|
1731
1703
|
// dangling colon and no tool call; the user saw the prose end
|
|
@@ -1742,13 +1714,13 @@ class ToolUseLoop {
|
|
|
1742
1714
|
// The trailing colon + intent phrase combination is the
|
|
1743
1715
|
// smoking gun. We DON'T also require NARRATE_VERB_RE here:
|
|
1744
1716
|
// the existing inline detector's verb list misses "revert"
|
|
1745
|
-
// (
|
|
1717
|
+
// (real run 2026-05-31) and would miss any other one-off
|
|
1746
1718
|
// action verb a model might use. The colon alone is rare
|
|
1747
1719
|
// enough in a legit final answer that pairing it with
|
|
1748
1720
|
// "let me" / "I'll" / "we'll" / etc. is specific enough.
|
|
1749
1721
|
//
|
|
1750
|
-
// Period-terminated variant (added 2026-06-03 after
|
|
1751
|
-
//
|
|
1722
|
+
// Period-terminated variant (added 2026-06-03 after a real
|
|
1723
|
+
// run): the model ended with "Let me fix
|
|
1752
1724
|
// all three project cards at once." — full sentence, full
|
|
1753
1725
|
// stop, no colon. Both prefill and thinking-off recovery
|
|
1754
1726
|
// had been spent earlier in the turn so the user saw the
|
|
@@ -1868,7 +1840,7 @@ class ToolUseLoop {
|
|
|
1868
1840
|
toolCalls = [];
|
|
1869
1841
|
messages.push({
|
|
1870
1842
|
role: 'user',
|
|
1871
|
-
content: `You have revised the plan in ${consecutiveTodoOnlyIterations + 1} consecutive iterations without executing any step. ` +
|
|
1843
|
+
content: tool_use_parser_1.AUTOMATED_NUDGE_PREFIX + `You have revised the plan in ${consecutiveTodoOnlyIterations + 1} consecutive iterations without executing any step. ` +
|
|
1872
1844
|
'Execute the first pending task now using a concrete tool — `search_code`, `read_file`, `apply_edit`, `replace_range`, `write_file`, or `run_command`. ' +
|
|
1873
1845
|
'Once a task is actually DONE (tool call succeeded), you may call `todo_write` again to mark it completed — but not to re-plan. ' +
|
|
1874
1846
|
'If you cannot identify a next step, respond to the user with a short honest explanation and stop.'
|
|
@@ -1893,7 +1865,7 @@ class ToolUseLoop {
|
|
|
1893
1865
|
});
|
|
1894
1866
|
messages.push({
|
|
1895
1867
|
role: 'user',
|
|
1896
|
-
content: `You have spent ${consecutiveApplyEditOnlyIterations} consecutive iterations on apply_edit alone. ` +
|
|
1868
|
+
content: tool_use_parser_1.AUTOMATED_NUDGE_PREFIX + `You have spent ${consecutiveApplyEditOnlyIterations} consecutive iterations on apply_edit alone. ` +
|
|
1897
1869
|
'If these are mechanical fixes of the same shape (one type annotation, one rename, one import path, one missing semicolon per call), STOP doing them one at a time — you will exhaust the iteration budget before the file is clean.\n' +
|
|
1898
1870
|
'\n' +
|
|
1899
1871
|
'Better tactics, in order of preference:\n' +
|
|
@@ -2007,7 +1979,7 @@ class ToolUseLoop {
|
|
|
2007
1979
|
});
|
|
2008
1980
|
messages.push({
|
|
2009
1981
|
role: 'user',
|
|
2010
|
-
content: `You just spawned ${bgSpawns.length} background subagents:\n${goalLines}\n\n` +
|
|
1982
|
+
content: tool_use_parser_1.AUTOMATED_NUDGE_PREFIX + `You just spawned ${bgSpawns.length} background subagents:\n${goalLines}\n\n` +
|
|
2011
1983
|
'Do NOT do those same explorations yourself in the next iteration — the subagents will deliver their synopses via the auto-inject path on a later turn. ' +
|
|
2012
1984
|
'Choose ONE of: ' +
|
|
2013
1985
|
'(a) work on a different, independent piece of the task that those subagents are NOT covering, ' +
|
|
@@ -2048,7 +2020,7 @@ class ToolUseLoop {
|
|
|
2048
2020
|
});
|
|
2049
2021
|
messages.push({
|
|
2050
2022
|
role: 'user',
|
|
2051
|
-
content: 'You set up a plan with `todo_write` earlier but have since completed ' +
|
|
2023
|
+
content: tool_use_parser_1.AUTOMATED_NUDGE_PREFIX + 'You set up a plan with `todo_write` earlier but have since completed ' +
|
|
2052
2024
|
`${editsSinceLastTodo} edit${editsSinceLastTodo === 1 ? '' : 's'} without updating it. ` +
|
|
2053
2025
|
'Call `todo_write` now with the current status — mark finished items as `completed` and leave remaining items as `pending`. ' +
|
|
2054
2026
|
"The Plan block in the user's UI mirrors your last `todo_write`, so skipping this leaves them looking at a stale checklist while real work has landed."
|