bonecode 1.4.1 → 1.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/src/engine/session/build_mode.d.ts +6 -0
- package/dist/src/engine/session/build_mode.js +141 -13
- package/dist/src/engine/session/build_mode.js.map +1 -1
- package/dist/src/engine/session/leaked_tool_call.d.ts +49 -0
- package/dist/src/engine/session/leaked_tool_call.js +174 -0
- package/dist/src/engine/session/leaked_tool_call.js.map +1 -0
- package/dist/src/engine/session/prompt.js +167 -0
- package/dist/src/engine/session/prompt.js.map +1 -1
- package/package.json +1 -1
- package/scripts/debug_extract.js +40 -0
- package/scripts/test_build_mode.js +132 -0
- package/scripts/test_identical_response.js +129 -0
- package/scripts/test_leaked_tool_call.js +269 -0
- package/src/engine/session/build_mode.ts +157 -13
- package/src/engine/session/leaked_tool_call.ts +166 -0
- package/src/engine/session/prompt.ts +203 -0
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* Unit tests for the identical-response loop detector in prompt.ts.
|
|
4
|
+
*
|
|
5
|
+
* The detector exists because small/local models sometimes emit the same
|
|
6
|
+
* assistant message turn after turn when they're confused about whether to
|
|
7
|
+
* call a tool. Without bailout, the loop keeps re-issuing the same prompt
|
|
8
|
+
* and getting the same prose back forever.
|
|
9
|
+
*
|
|
10
|
+
* We can't easily run runAgentLoop (needs DB + provider), but we can verify
|
|
11
|
+
* the helper functions exist and the loop reads/updates the recentResponses
|
|
12
|
+
* tracking array.
|
|
13
|
+
*/
|
|
14
|
+
|
|
15
|
+
"use strict";
|
|
16
|
+
const fs = require("fs");
|
|
17
|
+
const path = require("path");
|
|
18
|
+
const crypto = require("crypto");
|
|
19
|
+
|
|
20
|
+
const G = "\x1b[32m"; const R = "\x1b[31m"; const C = "\x1b[36m";
|
|
21
|
+
const B = "\x1b[1m"; const D = "\x1b[2m"; const N = "\x1b[0m";
|
|
22
|
+
|
|
23
|
+
let passed = 0;
|
|
24
|
+
let failed = 0;
|
|
25
|
+
const failures = [];
|
|
26
|
+
|
|
27
|
+
function ok(name, info = "") { passed++; console.log(` ${G}✓${N} ${name}${info ? ` ${D}${info}${N}` : ""}`); }
|
|
28
|
+
function fail(name, msg) { failed++; failures.push(`${name}: ${msg}`); console.log(` ${R}✗${N} ${name} ${R}${msg}${N}`); }
|
|
29
|
+
function header(s) { console.log(`\n${C}${B}${s}${N}`); }
|
|
30
|
+
|
|
31
|
+
const ROOT = path.resolve(__dirname, "..");
|
|
32
|
+
const promptSrc = fs.readFileSync(path.join(ROOT, "src", "engine", "session", "prompt.ts"), "utf-8");
|
|
33
|
+
|
|
34
|
+
// ─── [1] Helpers exist ────────────────────────────────────────────────────────
|
|
35
|
+
|
|
36
|
+
header("[1] Identical-response detector — helper functions exist");
|
|
37
|
+
|
|
38
|
+
(() => {
|
|
39
|
+
if (/async function assistantTextFingerprint/.test(promptSrc)) ok("assistantTextFingerprint function defined");
|
|
40
|
+
else fail("assistantTextFingerprint", "missing from prompt.ts");
|
|
41
|
+
})();
|
|
42
|
+
|
|
43
|
+
(() => {
  // Source-text checks on the body of assistantTextFingerprint: it should
  // hash with SHA1, normalize case and whitespace, and skip short strings.
  // These inspect the TypeScript source; they do not execute the function.
  const fnMatch = promptSrc.match(/async function assistantTextFingerprint[\s\S]*?\n\}/);
  if (!fnMatch) { fail("assistantTextFingerprint body", "function not extracted"); return; }
  const body = fnMatch[0];

  if (/createHash\(['"]sha1['"]\)/.test(body)) ok("uses SHA1 hashing");
  else fail("hashing", "fingerprint should use SHA1");

  // Both halves of the normalization are required. The whitespace pattern
  // matches the literal source text `replace(/\s+/g` (an escaped backslash
  // followed by the letter "s"); the previous `\\\s` form matched a backslash
  // followed by a whitespace character and therefore could never succeed.
  const lowersCase = /toLowerCase\(\)/.test(body);
  const collapsesWhitespace = /replace\(\/\\s\+\/g/.test(body);
  if (lowersCase && collapsesWhitespace) ok("normalizes whitespace + case");
  else fail("normalization", "fingerprint should normalize before hashing");

  if (/\.length\s*<\s*\d+/.test(body)) ok("rejects short strings (low entropy)");
  else fail("min length", "fingerprint should skip short strings");
})();
|
|
55
|
+
|
|
56
|
+
// ─── [2] Loop has recentResponses tracking ───────────────────────────────────
|
|
57
|
+
|
|
58
|
+
header("[2] Agent loop — recentResponses tracking");
|
|
59
|
+
|
|
60
|
+
(() => {
|
|
61
|
+
if (/const recentResponses\s*:\s*string\[\]\s*=\s*\[\]/.test(promptSrc)) ok("recentResponses array declared");
|
|
62
|
+
else fail("recentResponses", "missing in prompt.ts");
|
|
63
|
+
})();
|
|
64
|
+
|
|
65
|
+
(() => {
|
|
66
|
+
// Should push fingerprint and shift after threshold
|
|
67
|
+
if (/recentResponses\.push\(fingerprint\)/.test(promptSrc)) ok("pushes fingerprint each turn");
|
|
68
|
+
else fail("push", "fingerprint not pushed");
|
|
69
|
+
|
|
70
|
+
if (/recentResponses\.shift\(\)/.test(promptSrc)) ok("shifts old fingerprints");
|
|
71
|
+
else fail("shift", "no rolling window");
|
|
72
|
+
})();
|
|
73
|
+
|
|
74
|
+
(() => {
|
|
75
|
+
// Should break loop on duplicate
|
|
76
|
+
if (/recentResponses\.includes\(fingerprint\)/.test(promptSrc)) ok("checks for duplicate fingerprint");
|
|
77
|
+
else fail("duplicate check", "missing");
|
|
78
|
+
})();
|
|
79
|
+
|
|
80
|
+
(() => {
|
|
81
|
+
if (/identical_response_detected/.test(promptSrc)) ok("logs identical_response_detected");
|
|
82
|
+
else fail("log event", "no diagnostic log");
|
|
83
|
+
|
|
84
|
+
if (/Model produced an identical response/.test(promptSrc)) ok("user-facing warning emitted");
|
|
85
|
+
else fail("warning", "no session.warning broadcast");
|
|
86
|
+
})();
|
|
87
|
+
|
|
88
|
+
// ─── [3] Fingerprint behavior — manual reproduction ──────────────────────────
|
|
89
|
+
|
|
90
|
+
header("[3] Fingerprint algorithm — reproduces same hash for normalized input");
|
|
91
|
+
|
|
92
|
+
(() => {
  // Re-implement the fingerprint logic in pure JS and verify it produces
  // matching hashes for whitespace- and case-equivalent inputs.
  function fp(text) {
    // Inputs shorter than 80 characters are not fingerprinted at all.
    if (!text || text.length < 80) return null;
    const normalized = text.toLowerCase().replace(/\s+/g, " ").trim();
    const sample = normalized.slice(0, 1000);
    return crypto.createHash("sha1").update(sample).digest("hex");
  }

  const a = "I'll start by creating a structure for the medieval market in BoneScript and then we will iterate.";
  // Same words as `a`, but with tabs, newlines and doubled spaces written as
  // explicit escapes so the whitespace difference cannot be lost in transit.
  const b = "I'll  start by\tcreating a structure\nfor the medieval   market in BoneScript  and then we will iterate.";
  const c = "i'll start by creating a structure for the medieval market in bonescript and then we will iterate.";
  // Must be at least 80 characters, otherwise fp() returns null and the
  // "different fingerprint" comparison below would pass without hashing anything.
  const d = "totally different text that is long enough to fingerprint without colliding with the first sample";
  const short = "too short";

  const fpA = fp(a);
  const fpD = fp(d);

  if (fpA !== null && fpA === fp(b)) ok("whitespace differences ignored");
  else fail("whitespace", `${fpA} ≠ ${fp(b)}`);
  if (fpA !== null && fpA === fp(c)) ok("case differences ignored");
  else fail("case", `${fpA} ≠ ${fp(c)}`);

  if (fpA === null || fpD === null) fail("collision", "sample text is below the fingerprint length threshold");
  else if (fpA !== fpD) ok("different text → different fingerprint");
  else fail("collision", "different inputs have same fingerprint");

  if (fp(short) === null) ok("short input returns null");
  else fail("short", "should not fingerprint short text");
})();
|
|
118
|
+
|
|
119
|
+
// ─── Summary ─────────────────────────────────────────────────────────────────
|
|
120
|
+
|
|
121
|
+
console.log();
|
|
122
|
+
if (failed === 0) {
|
|
123
|
+
console.log(`${G}${B}✓ All ${passed} tests passed${N}`);
|
|
124
|
+
process.exit(0);
|
|
125
|
+
} else {
|
|
126
|
+
console.log(`${R}${B}✗ ${failed} failed, ${passed} passed${N}`);
|
|
127
|
+
for (const f of failures) console.log(` ${R}- ${f}${N}`);
|
|
128
|
+
process.exit(1);
|
|
129
|
+
}
|
|
@@ -0,0 +1,269 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* Tests for the leaked tool-call parser. Loads the compiled module directly
|
|
4
|
+
* so tests run against the same code that ships.
|
|
5
|
+
*
|
|
6
|
+
* Patterns tested are taken from real model outputs:
|
|
7
|
+
* - gemma: <|tool_call>call:edit{file_path:<|"|>foo.bone<|"|>}<tool_call|>
|
|
8
|
+
* - qwen: <tool_call>{"name":"write","arguments":{"path":"x"}}</tool_call>
|
|
9
|
+
* - llama3: <|python_tag|>write({"path":"x"})<|/python_tag|>
|
|
10
|
+
* - openai-style fenced: ```tool_code\nname(arg=val)\n```
|
|
11
|
+
*
|
|
12
|
+
* Also re-tests isBuildPrompt against the prompts that previously slipped
|
|
13
|
+
* through (e.g. "using BoneScript as the backend, write a python ...").
|
|
14
|
+
*/
|
|
15
|
+
|
|
16
|
+
"use strict";
|
|
17
|
+
const fs = require("fs");
|
|
18
|
+
const path = require("path");
|
|
19
|
+
|
|
20
|
+
const G = "\x1b[32m"; const R = "\x1b[31m"; const C = "\x1b[36m";
|
|
21
|
+
const B = "\x1b[1m"; const D = "\x1b[2m"; const N = "\x1b[0m";
|
|
22
|
+
|
|
23
|
+
let passed = 0;
|
|
24
|
+
let failed = 0;
|
|
25
|
+
const failures = [];
|
|
26
|
+
|
|
27
|
+
/** Count one passing check and print its green tick line; `info`, when given, is appended dimmed. */
function ok(name, info = "") {
  const suffix = info ? ` ${D}${info}${N}` : "";
  passed += 1;
  console.log(` ${G}✓${N} ${name}${suffix}`);
}
|
|
31
|
+
/** Count one failing check, keep "name: msg" for the final summary, and print its red cross line. */
function fail(name, msg) {
  const summaryEntry = `${name}: ${msg}`;
  failed += 1;
  failures.push(summaryEntry);
  console.log(` ${R}✗${N} ${name} ${R}${msg}${N}`);
}
|
|
36
|
+
function header(s) { console.log(`\n${C}${B}${s}${N}`); }
|
|
37
|
+
|
|
38
|
+
const ROOT = path.resolve(__dirname, "..");
|
|
39
|
+
const modulePath = path.join(ROOT, "dist", "src", "engine", "session", "leaked_tool_call.js");
|
|
40
|
+
|
|
41
|
+
if (!fs.existsSync(modulePath)) {
|
|
42
|
+
console.error(`${R}Compiled module not found at ${modulePath}.${N}`);
|
|
43
|
+
console.error(`Run \`npm run build\` first.`);
|
|
44
|
+
process.exit(1);
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
const {
|
|
48
|
+
extractLeakedToolCall,
|
|
49
|
+
parseLeakedBody,
|
|
50
|
+
parseKwargs,
|
|
51
|
+
parseLooseObject,
|
|
52
|
+
} = require(modulePath);
|
|
53
|
+
|
|
54
|
+
// ─── Tests: gemma-style markers ───────────────────────────────────────────────
|
|
55
|
+
|
|
56
|
+
header("[1] Gemma-style leaked calls (the user's exact bug)");
|
|
57
|
+
|
|
58
|
+
(() => {
|
|
59
|
+
const text = `I'll create the file.\n<|tool_call>call:edit{file_path:<|"|>medieval_market.bone<|"|>}<tool_call|>\nDone.`;
|
|
60
|
+
const r = extractLeakedToolCall(text);
|
|
61
|
+
if (r && r.toolName === "edit" && r.toolInput.file_path === "medieval_market.bone") {
|
|
62
|
+
ok("gemma <|tool_call>call:name{...}<tool_call|>", `→ edit(file_path="${r.toolInput.file_path}")`);
|
|
63
|
+
} else {
|
|
64
|
+
fail("gemma exact bug", JSON.stringify(r));
|
|
65
|
+
}
|
|
66
|
+
})();
|
|
67
|
+
|
|
68
|
+
(() => {
|
|
69
|
+
const text = `<|tool_call|>{"name":"write","arguments":{"path":"foo.ts","content":"hello"}}<|/tool_call|>`;
|
|
70
|
+
const r = extractLeakedToolCall(text);
|
|
71
|
+
if (r && r.toolName === "write" && r.toolInput.path === "foo.ts" && r.toolInput.content === "hello") {
|
|
72
|
+
ok("gemma <|tool_call|>{json}<|/tool_call|>");
|
|
73
|
+
} else {
|
|
74
|
+
fail("gemma JSON form", JSON.stringify(r));
|
|
75
|
+
}
|
|
76
|
+
})();
|
|
77
|
+
|
|
78
|
+
// ─── Tests: qwen-style markers ────────────────────────────────────────────────
|
|
79
|
+
|
|
80
|
+
header("[2] Qwen-style leaked calls");
|
|
81
|
+
|
|
82
|
+
(() => {
|
|
83
|
+
const text = `<tool_call>{"name":"bash","arguments":{"command":"ls -la"}}</tool_call>`;
|
|
84
|
+
const r = extractLeakedToolCall(text);
|
|
85
|
+
if (r && r.toolName === "bash" && r.toolInput.command === "ls -la") {
|
|
86
|
+
ok("<tool_call>{json}</tool_call>");
|
|
87
|
+
} else {
|
|
88
|
+
fail("qwen", JSON.stringify(r));
|
|
89
|
+
}
|
|
90
|
+
})();
|
|
91
|
+
|
|
92
|
+
(() => {
|
|
93
|
+
const text = `<tool_call>{"tool":"read","args":{"path":"src/main.ts"}}</tool_call>`;
|
|
94
|
+
const r = extractLeakedToolCall(text);
|
|
95
|
+
if (r && r.toolName === "read" && r.toolInput.path === "src/main.ts") {
|
|
96
|
+
ok("<tool_call>{tool: ..., args: ...}</tool_call>");
|
|
97
|
+
} else {
|
|
98
|
+
fail("qwen alt keys", JSON.stringify(r));
|
|
99
|
+
}
|
|
100
|
+
})();
|
|
101
|
+
|
|
102
|
+
// ─── Tests: llama3-style python_tag ───────────────────────────────────────────
|
|
103
|
+
|
|
104
|
+
header("[3] llama3-style <|python_tag|>");
|
|
105
|
+
|
|
106
|
+
(() => {
|
|
107
|
+
const text = `<|python_tag|>write({"path":"x.txt","content":"y"})<|/python_tag|>`;
|
|
108
|
+
const r = extractLeakedToolCall(text);
|
|
109
|
+
if (r && r.toolName === "write" && r.toolInput.path === "x.txt" && r.toolInput.content === "y") {
|
|
110
|
+
ok("llama3 python_tag with JSON arg");
|
|
111
|
+
} else {
|
|
112
|
+
fail("llama3", JSON.stringify(r));
|
|
113
|
+
}
|
|
114
|
+
})();
|
|
115
|
+
|
|
116
|
+
// ─── Tests: function-call kwargs ──────────────────────────────────────────────
|
|
117
|
+
|
|
118
|
+
header("[4] Function-call kwargs syntax");
|
|
119
|
+
|
|
120
|
+
(() => {
|
|
121
|
+
const args = parseKwargs(`path="foo.ts", content="hello world"`);
|
|
122
|
+
if (args && args.path === "foo.ts" && args.content === "hello world") ok("string kwargs");
|
|
123
|
+
else fail("string kwargs", JSON.stringify(args));
|
|
124
|
+
})();
|
|
125
|
+
|
|
126
|
+
(() => {
|
|
127
|
+
const args = parseKwargs(`count=42, ratio=3.14, enabled=true, missing=null`);
|
|
128
|
+
if (args && args.count === 42 && args.ratio === 3.14 && args.enabled === true && args.missing === null) {
|
|
129
|
+
ok("typed kwargs (number, float, bool, null)");
|
|
130
|
+
} else {
|
|
131
|
+
fail("typed kwargs", JSON.stringify(args));
|
|
132
|
+
}
|
|
133
|
+
})();
|
|
134
|
+
|
|
135
|
+
(() => {
|
|
136
|
+
const args = parseKwargs(`file_path=<|"|>medieval_market.bone<|"|>`);
|
|
137
|
+
if (args && args.file_path === "medieval_market.bone") ok(`<|"|> escapes are stripped`);
|
|
138
|
+
else fail("escape markers", JSON.stringify(args));
|
|
139
|
+
})();
|
|
140
|
+
|
|
141
|
+
// ─── Tests: loose-object form ─────────────────────────────────────────────────
|
|
142
|
+
|
|
143
|
+
header("[5] Loose-object form (pseudo-JSON)");
|
|
144
|
+
|
|
145
|
+
(() => {
|
|
146
|
+
const o = parseLooseObject(`file_path:"foo.bone", count:3`);
|
|
147
|
+
if (o && o.file_path === "foo.bone" && o.count === 3) ok("colon-separated loose object");
|
|
148
|
+
else fail("loose object", JSON.stringify(o));
|
|
149
|
+
})();
|
|
150
|
+
|
|
151
|
+
// ─── Tests: fenced tool_code ──────────────────────────────────────────────────
|
|
152
|
+
|
|
153
|
+
header("[6] Fenced tool_code blocks");
|
|
154
|
+
|
|
155
|
+
(() => {
|
|
156
|
+
const text = "Some prose\n```tool_code\nwrite(path=\"x\", content=\"y\")\n```\nMore prose.";
|
|
157
|
+
const r = extractLeakedToolCall(text);
|
|
158
|
+
if (r && r.toolName === "write" && r.toolInput.path === "x" && r.toolInput.content === "y") {
|
|
159
|
+
ok("```tool_code\\nname(args)\\n```");
|
|
160
|
+
} else {
|
|
161
|
+
fail("fenced tool_code", JSON.stringify(r));
|
|
162
|
+
}
|
|
163
|
+
})();
|
|
164
|
+
|
|
165
|
+
// ─── Tests: false-positives ───────────────────────────────────────────────────
|
|
166
|
+
|
|
167
|
+
header("[7] No false-positives on plain text");
|
|
168
|
+
|
|
169
|
+
const cleanCases = [
|
|
170
|
+
"I'll create a file called foo.bone now.",
|
|
171
|
+
"Use the `write` tool to save the file.",
|
|
172
|
+
"Here's how you'd do it: write(path, content) — but that's pseudocode.",
|
|
173
|
+
"<not_a_tool_call>just text</not_a_tool_call>",
|
|
174
|
+
"",
|
|
175
|
+
"<tool_call></tool_call>",
|
|
176
|
+
];
|
|
177
|
+
for (const c of cleanCases) {
|
|
178
|
+
const r = extractLeakedToolCall(c);
|
|
179
|
+
if (r === null) ok(`clean: "${c.slice(0, 50)}..."`);
|
|
180
|
+
else fail(`false positive`, `"${c}" → ${JSON.stringify(r)}`);
|
|
181
|
+
}
|
|
182
|
+
|
|
183
|
+
// ─── Tests: stripping positions ───────────────────────────────────────────────
|
|
184
|
+
|
|
185
|
+
header("[8] startIndex/endIndex enable text stripping");
|
|
186
|
+
|
|
187
|
+
(() => {
|
|
188
|
+
const text = `Before <|tool_call|>{"name":"write","arguments":{}}<|/tool_call|> after`;
|
|
189
|
+
const r = extractLeakedToolCall(text);
|
|
190
|
+
if (!r) {
|
|
191
|
+
fail("strip positions", "no match");
|
|
192
|
+
} else {
|
|
193
|
+
const stripped = text.slice(0, r.startIndex) + text.slice(r.endIndex);
|
|
194
|
+
if (stripped === "Before after") ok("text stripped cleanly", `"${stripped}"`);
|
|
195
|
+
else fail("strip", `got "${stripped}"`);
|
|
196
|
+
}
|
|
197
|
+
})();
|
|
198
|
+
|
|
199
|
+
// ─── Tests: build mode trigger detection ──────────────────────────────────────
|
|
200
|
+
|
|
201
|
+
header("[9] isBuildPrompt covers the previously-missed prompts");
|
|
202
|
+
|
|
203
|
+
const bmModulePath = path.join(ROOT, "dist", "src", "engine", "session", "build_mode.js");
|
|
204
|
+
if (!fs.existsSync(bmModulePath)) {
|
|
205
|
+
fail("build_mode module", "compiled file missing");
|
|
206
|
+
} else {
|
|
207
|
+
const { isBuildPrompt } = require(bmModulePath);
|
|
208
|
+
|
|
209
|
+
const newCases = [
|
|
210
|
+
// The exact prompt that previously failed in the user's session
|
|
211
|
+
"using BoneScript as the backend, write a python 2d mideveal copper silver gold platinum transaction market simulation",
|
|
212
|
+
"with bonescript, build a chat app",
|
|
213
|
+
"in BoneScript, design a multi-tenant CRM",
|
|
214
|
+
"BoneScript backend for a music streaming service",
|
|
215
|
+
"write me a REST API for a todo list",
|
|
216
|
+
"develop a graphql api for users",
|
|
217
|
+
"scaffold a web application with auth",
|
|
218
|
+
];
|
|
219
|
+
for (const p of newCases) {
|
|
220
|
+
if (isBuildPrompt(p)) ok(`triggers: "${p.slice(0, 60)}..."`);
|
|
221
|
+
else fail(`missed`, p);
|
|
222
|
+
}
|
|
223
|
+
|
|
224
|
+
const negative = [
|
|
225
|
+
"what does this function do",
|
|
226
|
+
"explain the difference between let and const",
|
|
227
|
+
"fix the typo on line 5",
|
|
228
|
+
];
|
|
229
|
+
for (const p of negative) {
|
|
230
|
+
if (!isBuildPrompt(p)) ok(`not triggered: "${p}"`);
|
|
231
|
+
else fail(`over-matched`, p);
|
|
232
|
+
}
|
|
233
|
+
}
|
|
234
|
+
|
|
235
|
+
// ─── Tests: parseLeakedBody handles edge cases ────────────────────────────────
|
|
236
|
+
|
|
237
|
+
header("[10] parseLeakedBody edge cases");
|
|
238
|
+
|
|
239
|
+
(() => {
|
|
240
|
+
const r = parseLeakedBody("");
|
|
241
|
+
if (r === null) ok("empty body returns null");
|
|
242
|
+
else fail("empty", JSON.stringify(r));
|
|
243
|
+
})();
|
|
244
|
+
|
|
245
|
+
(() => {
|
|
246
|
+
const r = parseLeakedBody("not a tool call at all");
|
|
247
|
+
if (r === null) ok("garbage body returns null");
|
|
248
|
+
else fail("garbage", JSON.stringify(r));
|
|
249
|
+
})();
|
|
250
|
+
|
|
251
|
+
(() => {
|
|
252
|
+
// Function call with JSON arg
|
|
253
|
+
const r = parseLeakedBody('write({"path": "a.txt", "content": "b"})');
|
|
254
|
+
if (r && r.toolName === "write" && r.toolInput.path === "a.txt" && r.toolInput.content === "b") {
|
|
255
|
+
ok("function with JSON arg");
|
|
256
|
+
} else {
|
|
257
|
+
fail("function JSON arg", JSON.stringify(r));
|
|
258
|
+
}
|
|
259
|
+
})();
|
|
260
|
+
|
|
261
|
+
console.log();
|
|
262
|
+
if (failed === 0) {
|
|
263
|
+
console.log(`${G}${B}✓ All ${passed} tests passed${N}`);
|
|
264
|
+
process.exit(0);
|
|
265
|
+
} else {
|
|
266
|
+
console.log(`${R}${B}✗ ${failed} failed, ${passed} passed${N}`);
|
|
267
|
+
for (const f of failures) console.log(` ${R}- ${f}${N}`);
|
|
268
|
+
process.exit(1);
|
|
269
|
+
}
|
|
@@ -69,6 +69,8 @@ export interface BuildState {
|
|
|
69
69
|
error?: string;
|
|
70
70
|
/** Set after probe: whether the model can emit OpenAI-format tool calls. */
|
|
71
71
|
tool_capable?: boolean;
|
|
72
|
+
/** Files written so far in this build, with todo that produced each. */
|
|
73
|
+
written_files?: Record<string, { todo_id: string; size: number; written_at: number }>;
|
|
72
74
|
}
|
|
73
75
|
|
|
74
76
|
export interface VerificationResult {
|
|
@@ -402,13 +404,24 @@ async function stageExecute(state: BuildState, input: BuildModeInput): Promise<B
|
|
|
402
404
|
|
|
403
405
|
if (state.tool_capable) {
|
|
404
406
|
// ── Tool-calling path ────────────────────────────────────────────────
|
|
407
|
+
const knownFilesNote =
|
|
408
|
+
Object.keys(state.written_files || {}).length > 0
|
|
409
|
+
? `\nFILES ALREADY WRITTEN (do NOT recreate or rewrite these — they're done):\n` +
|
|
410
|
+
Object.keys(state.written_files || {}).map((p) => ` - ${p}`).join("\n")
|
|
411
|
+
: "";
|
|
412
|
+
|
|
405
413
|
const focusedPrompt = [
|
|
406
414
|
`<build-task>`,
|
|
415
|
+
`You are doing ONE task in a larger build that's already in progress.`,
|
|
416
|
+
`IGNORE earlier conversation context — those questions are already answered.`,
|
|
417
|
+
`DO NOT ask clarifying questions. The plan is locked. Just do this task.`,
|
|
418
|
+
``,
|
|
407
419
|
`Title: ${next.title}`,
|
|
408
420
|
`Description: ${next.description}`,
|
|
421
|
+
knownFilesNote,
|
|
409
422
|
``,
|
|
410
|
-
`
|
|
411
|
-
`Do
|
|
423
|
+
`Output ONE OR MORE concrete tool calls (write/edit/bash) for THIS task.`,
|
|
424
|
+
`Do NOT respond with prose. Do NOT explain. Call the tools NOW.`,
|
|
412
425
|
`</build-task>`,
|
|
413
426
|
].join("\n");
|
|
414
427
|
|
|
@@ -503,6 +516,32 @@ async function countToolCallsSince(session_id: string, since_message_id: string)
|
|
|
503
516
|
}
|
|
504
517
|
}
|
|
505
518
|
|
|
519
|
+
/**
 * Like countToolCallsSince, but excludes calls synthesized from leaked tool-call
 * markers. Used by the probe so we know whether the model can actually emit
 * native tool calls — not just leak text we recovered. Models that only ever
 * leak should be routed through the JSON-manifest fallback.
 *
 * Uses LIKE on the serialized tool_input JSON instead of jsonb operators so
 * the same query works in both Postgres (real JSONB) and the SQLite fallback
 * (TEXT column with stripped JSON operators).
 *
 * Two details of the LIKE filter matter:
 *  - `_` is a single-character wildcard in LIKE, so the underscores of the
 *    `__synthesized` marker are escaped (`!_`, with `ESCAPE '!'`). Unescaped,
 *    any input merely containing the word "synthesized" would be excluded.
 *  - `NOT LIKE` on NULL yields NULL, which drops the row; COALESCE keeps calls
 *    with no recorded input counted as native.
 *
 * @param session_id Session whose tool calls are counted.
 * @param since_message_id Only calls created at or after this message's
 *   `created_at` are counted.
 * @returns The number of matching rows, or 0 when the query fails or returns
 *   no usable count.
 */
async function countNativeToolCallsSince(session_id: string, since_message_id: string): Promise<number> {
  try {
    const r = await pool.query(
      `SELECT COUNT(*) AS n FROM tool_calls
       WHERE session_id = $1
         AND created_at >= (SELECT created_at FROM messages WHERE id = $2)
         AND COALESCE(CAST(tool_input AS TEXT), '') NOT LIKE '%!_!_synthesized%' ESCAPE '!'`,
      [session_id, since_message_id]
    );
    // COUNT(*) may come back as a string or as a number; accept both.
    const raw = r.rows[0]?.n;
    return typeof raw === "string" ? parseInt(raw, 10) || 0 : raw || 0;
  } catch {
    // Best-effort: a failed query is reported as "no native tool calls".
    return 0;
  }
}
|
|
544
|
+
|
|
506
545
|
// ─── Tool-capability probe & JSON-manifest fallback ───────────────────────────
|
|
507
546
|
|
|
508
547
|
/**
|
|
@@ -553,7 +592,11 @@ async function probeToolCapability(input: BuildModeInput): Promise<boolean> {
|
|
|
553
592
|
agent_name: "build",
|
|
554
593
|
});
|
|
555
594
|
|
|
556
|
-
|
|
595
|
+
// Only count NATIVE tool calls. Calls synthesized from leaked tool-marker
|
|
596
|
+
// text don't count — those are unreliable in real builds (the model needs
|
|
597
|
+
// to format the leak perfectly every time, which it usually doesn't).
|
|
598
|
+
// If the model only ever leaks, we want JSON-manifest fallback instead.
|
|
599
|
+
const calls = await countNativeToolCallsSince(input.session_id, probeMsgId);
|
|
557
600
|
return calls > 0;
|
|
558
601
|
} catch {
|
|
559
602
|
return false;
|
|
@@ -610,6 +653,22 @@ async function executeFallback(
|
|
|
610
653
|
.map((t) => `- ${t.title}`)
|
|
611
654
|
.join("\n");
|
|
612
655
|
|
|
656
|
+
// Build a manifest of files already written so the model doesn't blindly
|
|
657
|
+
// overwrite them. We include the first ~30 lines of each file so the model
|
|
658
|
+
// can see what's there and decide whether to extend or leave alone.
|
|
659
|
+
state.written_files = state.written_files || {};
|
|
660
|
+
const existingPaths = Object.keys(state.written_files);
|
|
661
|
+
const existingFilesSnippets: string[] = [];
|
|
662
|
+
for (const p of existingPaths) {
|
|
663
|
+
try {
|
|
664
|
+
const target = path.resolve(worktree, p);
|
|
665
|
+
const content = await fs.readFile(target, "utf-8").catch(() => "");
|
|
666
|
+
const snippet = content.split("\n").slice(0, 30).join("\n");
|
|
667
|
+
const truncated = content.split("\n").length > 30 ? "\n... (truncated)" : "";
|
|
668
|
+
existingFilesSnippets.push(`${p} (${state.written_files[p].size} bytes):\n${snippet}${truncated}`);
|
|
669
|
+
} catch {}
|
|
670
|
+
}
|
|
671
|
+
|
|
613
672
|
const result = await askJson<{
|
|
614
673
|
files?: Array<{ path: string; content: string }>;
|
|
615
674
|
commands?: string[];
|
|
@@ -617,20 +676,36 @@ async function executeFallback(
|
|
|
617
676
|
model_id: input.model_id,
|
|
618
677
|
provider_id: input.provider_id,
|
|
619
678
|
system: [
|
|
620
|
-
"You are completing one task in
|
|
679
|
+
"You are completing one task in an incremental project build. The project already has files from earlier tasks; THIS task adds the next layer.",
|
|
621
680
|
"",
|
|
622
|
-
"
|
|
623
|
-
"
|
|
681
|
+
"PRODUCE A JSON MANIFEST of files to create OR commands to run for THIS task only.",
|
|
682
|
+
"",
|
|
683
|
+
"CRITICAL RULES:",
|
|
684
|
+
`- Output a single JSON object: { "files": [...], "commands": [...] }`,
|
|
624
685
|
"- Each file must have a relative `path` and full `content`. Do not abbreviate file content.",
|
|
625
686
|
"- File paths must be relative to the project root (no leading slash, no '..').",
|
|
626
|
-
"
|
|
687
|
+
"",
|
|
688
|
+
"FILE OVERWRITE POLICY:",
|
|
689
|
+
"- If a file already exists (listed under <existing-files>), DO NOT include it in your manifest unless this task is specifically about modifying it.",
|
|
690
|
+
"- If you must update an existing file, you MUST include the COMPLETE new content (including everything from the existing version that you want to keep). Partial content will overwrite and destroy whatever is there.",
|
|
691
|
+
"- For NEW files, just provide the new content.",
|
|
692
|
+
"- Prefer adding NEW files over modifying existing ones whenever possible.",
|
|
693
|
+
"",
|
|
694
|
+
"OUTPUT FORMAT:",
|
|
627
695
|
"- Do not include explanatory prose. The JSON IS the entire response.",
|
|
696
|
+
"- Do not wrap in markdown code fences.",
|
|
628
697
|
].join("\n"),
|
|
629
698
|
user: [
|
|
630
699
|
`<design>`,
|
|
631
700
|
designContext,
|
|
632
701
|
`</design>`,
|
|
633
702
|
``,
|
|
703
|
+
`<existing-files>`,
|
|
704
|
+
existingFilesSnippets.length
|
|
705
|
+
? existingFilesSnippets.join("\n\n---\n\n")
|
|
706
|
+
: "(no files written yet)",
|
|
707
|
+
`</existing-files>`,
|
|
708
|
+
``,
|
|
634
709
|
`<completed-tasks>`,
|
|
635
710
|
completedFiles || "(none yet)",
|
|
636
711
|
`</completed-tasks>`,
|
|
@@ -639,6 +714,8 @@ async function executeFallback(
|
|
|
639
714
|
`Title: ${todo.title}`,
|
|
640
715
|
`Description: ${todo.description}`,
|
|
641
716
|
`</current-task>`,
|
|
717
|
+
``,
|
|
718
|
+
`Produce the JSON manifest for the current task. Do NOT re-emit existing files unless this task explicitly modifies them.`,
|
|
642
719
|
].join("\n"),
|
|
643
720
|
schema_hint: `{ "files": [{ "path": string, "content": string }], "commands": string[] }`,
|
|
644
721
|
});
|
|
@@ -663,10 +740,38 @@ async function executeFallback(
|
|
|
663
740
|
errors.push(`refused path outside worktree: ${f.path}`);
|
|
664
741
|
continue;
|
|
665
742
|
}
|
|
743
|
+
|
|
744
|
+
// OVERWRITE GUARD: if the file was written by an earlier todo and this
|
|
745
|
+
// todo's title doesn't suggest it's specifically about this file, refuse
|
|
746
|
+
// and force the model to think harder. Always allow overwrite if the new
|
|
747
|
+
// content is at least half the size of the old content or the path appears in the todo.
|
|
748
|
+
const existing = state.written_files[f.path];
|
|
749
|
+
if (existing && existing.todo_id !== todo.id) {
|
|
750
|
+
const oldContent = await fs.readFile(target, "utf-8").catch(() => "");
|
|
751
|
+
const newSize = Buffer.byteLength(f.content, "utf-8");
|
|
752
|
+
const titleMentionsPath = todo.title.toLowerCase().includes(f.path.toLowerCase()) ||
|
|
753
|
+
todo.description.toLowerCase().includes(f.path.toLowerCase());
|
|
754
|
+
const isStrictShrink = newSize < oldContent.length * 0.5; // shrunk by >50% = clobber
|
|
755
|
+
|
|
756
|
+
if (isStrictShrink && !titleMentionsPath) {
|
|
757
|
+
errors.push(
|
|
758
|
+
`refused overwrite: ${f.path} would shrink from ${oldContent.length} to ${newSize} bytes ` +
|
|
759
|
+
`(model likely truncating). Skipping.`
|
|
760
|
+
);
|
|
761
|
+
continue;
|
|
762
|
+
}
|
|
763
|
+
}
|
|
764
|
+
|
|
666
765
|
try {
|
|
667
766
|
await fs.mkdir(path.dirname(target), { recursive: true });
|
|
668
767
|
await fs.writeFile(target, f.content, "utf-8");
|
|
669
768
|
filesWritten++;
|
|
769
|
+
// Track in state
|
|
770
|
+
state.written_files[f.path] = {
|
|
771
|
+
todo_id: todo.id,
|
|
772
|
+
size: Buffer.byteLength(f.content, "utf-8"),
|
|
773
|
+
written_at: Date.now(),
|
|
774
|
+
};
|
|
670
775
|
// Surface a tool.completed event so the TUI shows an Edit/Write line
|
|
671
776
|
const callId = `fallback-${uuid()}`;
|
|
672
777
|
const broadcastModule = await import("../../../bone/output/session/src/websocket");
|
|
@@ -674,14 +779,14 @@ async function executeFallback(
|
|
|
674
779
|
type: "tool.requested",
|
|
675
780
|
session_id: input.session_id,
|
|
676
781
|
tool_call_id: callId,
|
|
677
|
-
tool_name: "write",
|
|
782
|
+
tool_name: existing ? "edit" : "write",
|
|
678
783
|
tool_input: { path: f.path, content: f.content.slice(0, 200) },
|
|
679
784
|
});
|
|
680
785
|
broadcastModule.broadcastToChannel("part_stream", {
|
|
681
786
|
type: "tool.completed",
|
|
682
787
|
session_id: input.session_id,
|
|
683
788
|
tool_call_id: callId,
|
|
684
|
-
tool_name: "write",
|
|
789
|
+
tool_name: existing ? "edit" : "write",
|
|
685
790
|
tool_input: { path: f.path },
|
|
686
791
|
duration_ms: 0,
|
|
687
792
|
});
|
|
@@ -821,15 +926,43 @@ async function stageVerify(state: BuildState, input: BuildModeInput): Promise<Bu
|
|
|
821
926
|
|
|
822
927
|
// ─── Driver ───────────────────────────────────────────────────────────────────
|
|
823
928
|
|
|
824
|
-
|
|
825
|
-
|
|
929
|
+
/**
 * Build the initial BuildState for a brand-new build of `prompt`.
 *
 * Kept in one place so the lazy-init path in `runBuildMode` and the restart
 * path (taken when a previous build is already `done` or `failed`) always
 * begin from the same shape.
 *
 * @param prompt The user's prompt, stored as `original_prompt`.
 * @returns A new state in the "clarify" stage with no design, no todos,
 *   iteration 0 of at most 30, and an empty written-files record.
 */
function freshState(prompt: string): BuildState {
  const initial: BuildState = {
    stage: "clarify",
    original_prompt: prompt,
    design: null,
    todos: [],
    iteration: 0,
    max_iterations: 30,
    written_files: {},
  };
  return initial;
}
|
|
945
|
+
|
|
946
|
+
export async function runBuildMode(input: BuildModeInput): Promise<BuildState> {
|
|
947
|
+
let state: BuildState = (await loadState(input.session_id)) ?? freshState(input.prompt);
|
|
948
|
+
|
|
949
|
+
// If the previous build is already done/failed, and the user is asking for
|
|
950
|
+
// something new (different prompt), start a fresh build rather than no-op.
|
|
951
|
+
// Without this, follow-up "build me X" prompts after a completed build
|
|
952
|
+
// silently do nothing because the loop sees state.stage === "done" and
|
|
953
|
+
// exits immediately.
|
|
954
|
+
if (
|
|
955
|
+
(state.stage === "done" || state.stage === "failed") &&
|
|
956
|
+
input.prompt &&
|
|
957
|
+
input.prompt !== state.original_prompt
|
|
958
|
+
) {
|
|
959
|
+
emit(input.session_id, "build.restart", {
|
|
960
|
+
previous_stage: state.stage,
|
|
961
|
+
previous_prompt: state.original_prompt,
|
|
962
|
+
});
|
|
963
|
+
state = freshState(input.prompt);
|
|
964
|
+
await saveState(input.session_id, state);
|
|
965
|
+
}
|
|
833
966
|
|
|
834
967
|
// Resume from saved state if applicable. If the user is sending a new prompt
|
|
835
968
|
// and we're already in clarify with pending questions, treat the prompt as
|
|
@@ -887,9 +1020,20 @@ export function isBuildPrompt(prompt: string): boolean {
|
|
|
887
1020
|
/\bmake\s+(?:a|an|the)\s+(?:full|complete|whole|new)\b/,
|
|
888
1021
|
/\bproject\s+(?:from\s+scratch|to)\b/,
|
|
889
1022
|
/\bsimulation\s+(?:with|using|of)\b/,
|
|
890
|
-
/\bbackend\s+(?:for|with|using)\b/,
|
|
1023
|
+
/\bbackend\s+(?:for|with|using|service)\b/,
|
|
891
1024
|
/\bspec(?:ification)?\s+(?:for|of)\b/,
|
|
892
1025
|
/\bend[- ]to[- ]end\b/,
|
|
1026
|
+
// Verb-led "write/code/develop/generate/scaffold" requests followed by an article and a noun
|
|
1027
|
+
/\b(?:write|code|develop|generate|scaffold)\s+(?:me\s+)?(?:a|an|the)\s+\w+/,
|
|
1028
|
+
// BoneScript-specific phrases — "using/with/in bonescript" or "bonescript as/for/backend" are treated as build
|
|
1029
|
+
/\busing\s+bonescript\b/,
|
|
1030
|
+
/\bwith\s+bonescript\b/,
|
|
1031
|
+
/\bin\s+bonescript\b/,
|
|
1032
|
+
/\bbonescript\s+(?:as|for|backend)\b/,
|
|
1033
|
+
// Generic "<adjective> <noun-app>" patterns indicating a system request
|
|
1034
|
+
/\b(?:rest|graphql)\s+api\b/,
|
|
1035
|
+
/\bweb\s+app(?:lication)?\b/,
|
|
1036
|
+
/\bgame\s+(?:simulation|engine|server)\b/,
|
|
893
1037
|
];
|
|
894
1038
|
return triggers.some((re) => re.test(p));
|
|
895
1039
|
}
|