bonecode 1.4.1 → 1.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,129 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * Unit tests for the identical-response loop detector in prompt.ts.
4
+ *
5
+ * The detector exists because small/local models sometimes emit the same
6
+ * assistant message turn after turn when they're confused about whether to
7
+ * call a tool. Without bailout, the loop keeps re-issuing the same prompt
8
+ * and getting the same prose back forever.
9
+ *
10
+ * We can't easily run runAgentLoop (needs DB + provider), but we can verify
11
+ * the helper functions exist and the loop reads/updates the recentResponses
12
+ * tracking array.
13
+ */
14
+
15
+ "use strict";
16
+ const fs = require("fs");
17
+ const path = require("path");
18
+ const crypto = require("crypto");
19
+
20
// ANSI escape sequences used to colour the report (green, red, cyan, bold, dim, reset).
const G = "\x1b[32m";
const R = "\x1b[31m";
const C = "\x1b[36m";
const B = "\x1b[1m";
const D = "\x1b[2m";
const N = "\x1b[0m";

// Running tallies; `failures` keeps one "name: message" entry per failed check
// so the summary at the end of the script can list them again.
let passed = 0;
let failed = 0;
const failures = [];

/** Count a passing check and print it, followed by optional dimmed detail text. */
function ok(name, info = "") {
  passed += 1;
  const detail = info ? ` ${D}${info}${N}` : "";
  console.log(` ${G}✓${N} ${name}${detail}`);
}

/** Count a failing check, remember it for the summary, and print it in red. */
function fail(name, msg) {
  failed += 1;
  failures.push(`${name}: ${msg}`);
  console.log(` ${R}✗${N} ${name} ${R}${msg}${N}`);
}

/** Print a bold cyan section heading preceded by a blank line. */
function header(s) {
  console.log(`\n${C}${B}${s}${N}`);
}
30
+
31
// Package root is the parent of the directory holding this script; the checks
// below are plain-text inspections of the prompt.ts source, read once up front.
const ROOT = path.resolve(__dirname, "..");
const promptSrc = fs.readFileSync(
  path.join(ROOT, "src", "engine", "session", "prompt.ts"),
  "utf-8"
);
33
+
34
+ // ─── [1] Helpers exist ────────────────────────────────────────────────────────
35
+
36
+ header("[1] Identical-response detector — helper functions exist");
37
+
38
(() => {
  // The helper must be declared as an async function somewhere in prompt.ts.
  const declared = promptSrc.includes("async function assistantTextFingerprint");
  if (!declared) {
    fail("assistantTextFingerprint", "missing from prompt.ts");
    return;
  }
  ok("assistantTextFingerprint function defined");
})();
42
+
43
(() => {
  // Inspect the text of assistantTextFingerprint: it should hash with SHA1,
  // normalize its input first, and skip inputs below a minimum length.
  // The body is taken from the declaration up to the first line that is a lone "}".
  const fnMatch = promptSrc.match(/async function assistantTextFingerprint[\s\S]*?\n\}/);
  if (!fnMatch) { fail("assistantTextFingerprint body", "function not extracted"); return; }
  const body = fnMatch[0];

  if (/createHash\(['"]sha1['"]\)/.test(body)) ok("uses SHA1 hashing");
  else fail("hashing", "fingerprint should use SHA1");

  // Accept either a toLowerCase() call or a literal `replace(/\s+/g` in the source.
  // In the pattern, `\\s` is an escaped backslash followed by the letter "s"; the
  // previous `\\\s` form required a backslash followed by a whitespace character,
  // so the whitespace alternative could never match real source text.
  if (/toLowerCase\(\)|replace\(\/\\s\+\/g/.test(body)) ok("normalizes whitespace + case");
  else fail("normalization", "fingerprint should normalize before hashing");

  // Any `.length < <number>` comparison is taken as the minimum-length guard.
  if (/\.length\s*<\s*\d+/.test(body)) ok("rejects short strings (low entropy)");
  else fail("min length", "fingerprint should skip short strings");
})();
55
+
56
+ // ─── [2] Loop has recentResponses tracking ───────────────────────────────────
57
+
58
+ header("[2] Agent loop — recentResponses tracking");
59
+
60
(() => {
  // Each row is one textual probe of prompt.ts: the pattern that must appear,
  // the label reported on success, and the name/message reported on failure.
  // Rows run in order, so the report lists declaration, push, shift, duplicate
  // check, diagnostic log and user-facing warning in that sequence.
  const probes = [
    {
      pattern: /const recentResponses\s*:\s*string\[\]\s*=\s*\[\]/,
      success: "recentResponses array declared",
      failName: "recentResponses",
      failMsg: "missing in prompt.ts",
    },
    {
      pattern: /recentResponses\.push\(fingerprint\)/,
      success: "pushes fingerprint each turn",
      failName: "push",
      failMsg: "fingerprint not pushed",
    },
    {
      pattern: /recentResponses\.shift\(\)/,
      success: "shifts old fingerprints",
      failName: "shift",
      failMsg: "no rolling window",
    },
    {
      pattern: /recentResponses\.includes\(fingerprint\)/,
      success: "checks for duplicate fingerprint",
      failName: "duplicate check",
      failMsg: "missing",
    },
    {
      pattern: /identical_response_detected/,
      success: "logs identical_response_detected",
      failName: "log event",
      failMsg: "no diagnostic log",
    },
    {
      pattern: /Model produced an identical response/,
      success: "user-facing warning emitted",
      failName: "warning",
      failMsg: "no session.warning broadcast",
    },
  ];

  for (const probe of probes) {
    if (probe.pattern.test(promptSrc)) {
      ok(probe.success);
    } else {
      fail(probe.failName, probe.failMsg);
    }
  }
})();
87
+
88
+ // ─── [3] Fingerprint behavior — manual reproduction ──────────────────────────
89
+
90
+ header("[3] Fingerprint algorithm — reproduces same hash for normalized input");
91
+
92
(() => {
  // Local re-implementation of the fingerprint logic in pure JS, used to verify
  // that whitespace- and case-equivalent inputs hash identically.
  // NOTE(review): presumed to mirror assistantTextFingerprint in prompt.ts — confirm
  // the 80-character minimum and 1000-character sample against that file.
  function fp(text) {
    if (!text || text.length < 80) return null;
    const normalized = text.toLowerCase().replace(/\s+/g, " ").trim();
    const sample = normalized.slice(0, 1000);
    return crypto.createHash("sha1").update(sample).digest("hex");
  }

  // `b` carries the same words as `a` but with doubled spaces, a tab and a
  // newline, so the comparison actually exercises whitespace normalization.
  // `c` is `a` lower-cased. All three are at least 80 characters long.
  const a = "I'll start by creating a structure for the medieval market in BoneScript and then we will iterate.";
  const b = "I'll  start by\tcreating a structure\n for the medieval   market in BoneScript and then we will iterate.";
  const c = "i'll start by creating a structure for the medieval market in bonescript and then we will iterate.";

  const hashA = fp(a);

  // Requiring a non-null hash keeps these checks from passing as `null === null`.
  if (hashA !== null && hashA === fp(b)) ok("whitespace differences ignored");
  else fail("whitespace", `${fp(a)} ≠ ${fp(b)}`);
  if (hashA !== null && hashA === fp(c)) ok("case differences ignored");
  else fail("case", `${fp(a)} ≠ ${fp(c)}`);

  // `d` must itself be at least 80 characters; otherwise fp(d) is null and the
  // inequality below would hold without two real hashes ever being compared.
  const d = "totally different text that is long enough to be fingerprinted on its own without colliding with the first sample";
  const hashD = fp(d);
  if (hashD !== null && hashA !== hashD) ok("different text → different fingerprint");
  else fail("collision", "different inputs have same fingerprint");

  const short = "too short";
  if (fp(short) === null) ok("short input returns null");
  else fail("short", "should not fingerprint short text");
})();
118
+
119
+ // ─── Summary ─────────────────────────────────────────────────────────────────
120
+
121
// Final report: blank separator line, then either the all-green banner (exit 0)
// or the failure count followed by every recorded failure (exit 1).
console.log();
if (failed !== 0) {
  console.log(`${R}${B}✗ ${failed} failed, ${passed} passed${N}`);
  failures.forEach((entry) => {
    console.log(` ${R}- ${entry}${N}`);
  });
  process.exit(1);
} else {
  console.log(`${G}${B}✓ All ${passed} tests passed${N}`);
  process.exit(0);
}
@@ -0,0 +1,269 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * Tests for the leaked tool-call parser. Loads the compiled module directly
4
+ * so tests run against the same code that ships.
5
+ *
6
+ * Patterns tested are taken from real model outputs:
7
+ * - gemma: <|tool_call>call:edit{file_path:<|"|>foo.bone<|"|>}<tool_call|>
8
+ * - qwen: <tool_call>{"name":"write","arguments":{"path":"x"}}</tool_call>
9
+ * - llama3: <|python_tag|>write({"path":"x"})<|/python_tag|>
10
+ * - openai-style fenced: ```tool_code\nname(arg=val)\n```
11
+ *
12
+ * Also re-tests isBuildPrompt against the prompts that previously slipped
13
+ * through (e.g. "using BoneScript as the backend, write a python ...").
14
+ */
15
+
16
+ "use strict";
17
+ const fs = require("fs");
18
+ const path = require("path");
19
+
20
// Terminal colour codes: green, red, cyan, bold, dim, reset.
const G = "\x1b[32m";
const R = "\x1b[31m";
const C = "\x1b[36m";
const B = "\x1b[1m";
const D = "\x1b[2m";
const N = "\x1b[0m";

// Pass/fail counters plus the list of failure descriptions replayed in the summary.
let passed = 0;
let failed = 0;
const failures = [];

/** Register a passing check; `info`, when non-empty, is appended in dim text. */
function ok(name, info = "") {
  passed += 1;
  let line = ` ${G}✓${N} ${name}`;
  if (info) {
    line += ` ${D}${info}${N}`;
  }
  console.log(line);
}

/** Register a failing check and keep "name: msg" for the end-of-run summary. */
function fail(name, msg) {
  failed += 1;
  const summaryEntry = `${name}: ${msg}`;
  failures.push(summaryEntry);
  console.log(` ${R}✗${N} ${name} ${R}${msg}${N}`);
}

/** Print a section title in bold cyan after an empty line. */
function header(s) {
  console.log(`\n${C}${B}${s}${N}`);
}
37
+
38
// The tests run against the compiled output under dist/, so a build must exist.
const ROOT = path.resolve(__dirname, "..");
const modulePath = path.join(ROOT, "dist", "src", "engine", "session", "leaked_tool_call.js");

const compiledModuleExists = fs.existsSync(modulePath);
if (!compiledModuleExists) {
  // Nothing can be tested without the compiled module: explain and abort.
  console.error(`${R}Compiled module not found at ${modulePath}.${N}`);
  console.error("Run `npm run build` first.");
  process.exit(1);
}

// Pull the four parser entry points exercised by the sections below.
const leakedToolCallModule = require(modulePath);
const extractLeakedToolCall = leakedToolCallModule.extractLeakedToolCall;
const parseLeakedBody = leakedToolCallModule.parseLeakedBody;
const parseKwargs = leakedToolCallModule.parseKwargs;
const parseLooseObject = leakedToolCallModule.parseLooseObject;
53
+
54
+ // ─── Tests: gemma-style markers ───────────────────────────────────────────────
55
+
56
+ header("[1] Gemma-style leaked calls (the user's exact bug)");
57
+
58
(() => {
  // Marker form `<|tool_call>call:name{...}<tool_call|>` with `<|"|>` used as
  // quote markers, embedded between two lines of prose.
  const leaked = `I'll create the file.\n<|tool_call>call:edit{file_path:<|"|>medieval_market.bone<|"|>}<tool_call|>\nDone.`;
  const call = extractLeakedToolCall(leaked);
  const recovered = call && call.toolName === "edit" && call.toolInput.file_path === "medieval_market.bone";
  if (!recovered) {
    fail("gemma exact bug", JSON.stringify(call));
    return;
  }
  ok("gemma <|tool_call>call:name{...}<tool_call|>", `→ edit(file_path="${call.toolInput.file_path}")`);
})();

(() => {
  // Marker form `<|tool_call|>…<|/tool_call|>` wrapping a JSON object with
  // `name` and `arguments` keys.
  const leaked = `<|tool_call|>{"name":"write","arguments":{"path":"foo.ts","content":"hello"}}<|/tool_call|>`;
  const call = extractLeakedToolCall(leaked);
  const recovered =
    call &&
    call.toolName === "write" &&
    call.toolInput.path === "foo.ts" &&
    call.toolInput.content === "hello";
  if (!recovered) {
    fail("gemma JSON form", JSON.stringify(call));
    return;
  }
  ok("gemma <|tool_call|>{json}<|/tool_call|>");
})();
77
+
78
+ // ─── Tests: qwen-style markers ────────────────────────────────────────────────
79
+
80
+ header("[2] Qwen-style leaked calls");
81
+
82
(() => {
  // `<tool_call>` element holding JSON with `name` / `arguments` keys.
  const leaked = `<tool_call>{"name":"bash","arguments":{"command":"ls -la"}}</tool_call>`;
  const call = extractLeakedToolCall(leaked);
  const recovered = call && call.toolName === "bash" && call.toolInput.command === "ls -la";
  if (!recovered) {
    fail("qwen", JSON.stringify(call));
    return;
  }
  ok("<tool_call>{json}</tool_call>");
})();

(() => {
  // Same wrapper, but the JSON uses the alternative `tool` / `args` key names.
  const leaked = `<tool_call>{"tool":"read","args":{"path":"src/main.ts"}}</tool_call>`;
  const call = extractLeakedToolCall(leaked);
  const recovered = call && call.toolName === "read" && call.toolInput.path === "src/main.ts";
  if (!recovered) {
    fail("qwen alt keys", JSON.stringify(call));
    return;
  }
  ok("<tool_call>{tool: ..., args: ...}</tool_call>");
})();
101
+
102
+ // ─── Tests: llama3-style python_tag ───────────────────────────────────────────
103
+
104
+ header("[3] llama3-style <|python_tag|>");
105
+
106
(() => {
  // `<|python_tag|>` wrapper around a function-call expression whose single
  // argument is a JSON object.
  const leaked = `<|python_tag|>write({"path":"x.txt","content":"y"})<|/python_tag|>`;
  const call = extractLeakedToolCall(leaked);
  const recovered =
    call &&
    call.toolName === "write" &&
    call.toolInput.path === "x.txt" &&
    call.toolInput.content === "y";
  if (!recovered) {
    fail("llama3", JSON.stringify(call));
    return;
  }
  ok("llama3 python_tag with JSON arg");
})();
115
+
116
+ // ─── Tests: function-call kwargs ──────────────────────────────────────────────
117
+
118
+ header("[4] Function-call kwargs syntax");
119
+
120
(() => {
  // Quoted string values, comma-separated.
  const parsed = parseKwargs(`path="foo.ts", content="hello world"`);
  const matches = parsed && parsed.path === "foo.ts" && parsed.content === "hello world";
  if (!matches) {
    fail("string kwargs", JSON.stringify(parsed));
    return;
  }
  ok("string kwargs");
})();

(() => {
  // Unquoted values are expected to come back typed: integer, float, boolean, null.
  const parsed = parseKwargs(`count=42, ratio=3.14, enabled=true, missing=null`);
  const matches =
    parsed &&
    parsed.count === 42 &&
    parsed.ratio === 3.14 &&
    parsed.enabled === true &&
    parsed.missing === null;
  if (!matches) {
    fail("typed kwargs", JSON.stringify(parsed));
    return;
  }
  ok("typed kwargs (number, float, bool, null)");
})();

(() => {
  // `<|"|>` quote markers around a value must not end up in the parsed string.
  const parsed = parseKwargs(`file_path=<|"|>medieval_market.bone<|"|>`);
  const matches = parsed && parsed.file_path === "medieval_market.bone";
  if (!matches) {
    fail("escape markers", JSON.stringify(parsed));
    return;
  }
  ok(`<|"|> escapes are stripped`);
})();
140
+
141
+ // ─── Tests: loose-object form ─────────────────────────────────────────────────
142
+
143
+ header("[5] Loose-object form (pseudo-JSON)");
144
+
145
(() => {
  // Pseudo-JSON: unquoted keys, colon separators, no surrounding braces.
  const parsed = parseLooseObject(`file_path:"foo.bone", count:3`);
  const matches = parsed && parsed.file_path === "foo.bone" && parsed.count === 3;
  if (!matches) {
    fail("loose object", JSON.stringify(parsed));
    return;
  }
  ok("colon-separated loose object");
})();
150
+
151
+ // ─── Tests: fenced tool_code ──────────────────────────────────────────────────
152
+
153
+ header("[6] Fenced tool_code blocks");
154
+
155
(() => {
  // A ```tool_code fence containing a kwargs-style call, surrounded by prose.
  const reply = "Some prose\n```tool_code\nwrite(path=\"x\", content=\"y\")\n```\nMore prose.";
  const call = extractLeakedToolCall(reply);
  const recovered =
    call &&
    call.toolName === "write" &&
    call.toolInput.path === "x" &&
    call.toolInput.content === "y";
  if (!recovered) {
    fail("fenced tool_code", JSON.stringify(call));
    return;
  }
  ok("```tool_code\\nname(args)\\n```");
})();
164
+
165
+ // ─── Tests: false-positives ───────────────────────────────────────────────────
166
+
167
+ header("[7] No false-positives on plain text");
168
+
169
// Inputs that must NOT be recognised as tool calls: ordinary prose, a mention
// of a tool name, pseudocode, an unrelated tag, the empty string, and an empty
// <tool_call> element.
const cleanCases = [
  "I'll create a file called foo.bone now.",
  "Use the `write` tool to save the file.",
  "Here's how you'd do it: write(path, content) — but that's pseudocode.",
  "<not_a_tool_call>just text</not_a_tool_call>",
  "",
  "<tool_call></tool_call>",
];
cleanCases.forEach((sample) => {
  const extracted = extractLeakedToolCall(sample);
  if (extracted !== null) {
    fail(`false positive`, `"${sample}" → ${JSON.stringify(extracted)}`);
    return;
  }
  // The label shows at most the first 50 characters of the input.
  ok(`clean: "${sample.slice(0, 50)}..."`);
});
182
+
183
+ // ─── Tests: stripping positions ───────────────────────────────────────────────
184
+
185
+ header("[8] startIndex/endIndex enable text stripping");
186
+
187
(() => {
  // startIndex/endIndex should let a caller cut the leaked call out of the text.
  const text = `Before <|tool_call|>{"name":"write","arguments":{}}<|/tool_call|> after`;
  const call = extractLeakedToolCall(text);
  if (!call) {
    fail("strip positions", "no match");
    return;
  }

  const head = text.slice(0, call.startIndex);
  const tail = text.slice(call.endIndex);
  const stripped = head + tail;
  // NOTE(review): the expected value has a single space between the words, which
  // presumes the reported span absorbs one of the two spaces around the marker —
  // confirm against extractLeakedToolCall.
  if (stripped === "Before after") {
    ok("text stripped cleanly", `"${stripped}"`);
  } else {
    fail("strip", `got "${stripped}"`);
  }
})();
198
+
199
+ // ─── Tests: build mode trigger detection ──────────────────────────────────────
200
+
201
+ header("[9] isBuildPrompt covers the previously-missed prompts");
202
+
203
// isBuildPrompt lives in a separate compiled module; when that file is absent
// the whole section is reported as a single failure instead of aborting the run.
const bmModulePath = path.join(ROOT, "dist", "src", "engine", "session", "build_mode.js");
if (fs.existsSync(bmModulePath)) {
  const { isBuildPrompt } = require(bmModulePath);

  // Prompts that must be classified as build requests.
  const newCases = [
    // The exact prompt that previously failed in the user's session
    "using BoneScript as the backend, write a python 2d mideveal copper silver gold platinum transaction market simulation",
    "with bonescript, build a chat app",
    "in BoneScript, design a multi-tenant CRM",
    "BoneScript backend for a music streaming service",
    "write me a REST API for a todo list",
    "develop a graphql api for users",
    "scaffold a web application with auth",
  ];
  newCases.forEach((prompt) => {
    if (!isBuildPrompt(prompt)) {
      fail(`missed`, prompt);
      return;
    }
    ok(`triggers: "${prompt.slice(0, 60)}..."`);
  });

  // Ordinary questions and small edits that must not trigger build mode.
  const negative = [
    "what does this function do",
    "explain the difference between let and const",
    "fix the typo on line 5",
  ];
  negative.forEach((prompt) => {
    if (isBuildPrompt(prompt)) {
      fail(`over-matched`, prompt);
      return;
    }
    ok(`not triggered: "${prompt}"`);
  });
} else {
  fail("build_mode module", "compiled file missing");
}
234
+
235
+ // ─── Tests: parseLeakedBody handles edge cases ────────────────────────────────
236
+
237
+ header("[10] parseLeakedBody edge cases");
238
+
239
(() => {
  // An empty body yields no call.
  const parsed = parseLeakedBody("");
  if (parsed !== null) {
    fail("empty", JSON.stringify(parsed));
    return;
  }
  ok("empty body returns null");
})();

(() => {
  // Free text with no call syntax yields no call.
  const parsed = parseLeakedBody("not a tool call at all");
  if (parsed !== null) {
    fail("garbage", JSON.stringify(parsed));
    return;
  }
  ok("garbage body returns null");
})();

(() => {
  // Function-call syntax whose single argument is a JSON object.
  const parsed = parseLeakedBody('write({"path": "a.txt", "content": "b"})');
  const matches =
    parsed &&
    parsed.toolName === "write" &&
    parsed.toolInput.path === "a.txt" &&
    parsed.toolInput.content === "b";
  if (!matches) {
    fail("function JSON arg", JSON.stringify(parsed));
    return;
  }
  ok("function with JSON arg");
})();
260
+
261
// End-of-run summary. Exit status is 0 only when no check failed; otherwise the
// recorded failures are listed again and the process exits with status 1.
console.log();
if (failed > 0 || failed < 0 || failed !== 0) {
  console.log(`${R}${B}✗ ${failed} failed, ${passed} passed${N}`);
  for (let i = 0; i < failures.length; i += 1) {
    console.log(` ${R}- ${failures[i]}${N}`);
  }
  process.exit(1);
} else {
  console.log(`${G}${B}✓ All ${passed} tests passed${N}`);
  process.exit(0);
}
@@ -69,6 +69,8 @@ export interface BuildState {
69
69
  error?: string;
70
70
  /** Set after probe: whether the model can emit OpenAI-format tool calls. */
71
71
  tool_capable?: boolean;
72
+ /** Files written so far in this build, with todo that produced each. */
73
+ written_files?: Record<string, { todo_id: string; size: number; written_at: number }>;
72
74
  }
73
75
 
74
76
  export interface VerificationResult {
@@ -402,13 +404,24 @@ async function stageExecute(state: BuildState, input: BuildModeInput): Promise<B
402
404
 
403
405
  if (state.tool_capable) {
404
406
  // ── Tool-calling path ────────────────────────────────────────────────
407
+ const knownFilesNote =
408
+ Object.keys(state.written_files || {}).length > 0
409
+ ? `\nFILES ALREADY WRITTEN (do NOT recreate or rewrite these — they're done):\n` +
410
+ Object.keys(state.written_files || {}).map((p) => ` - ${p}`).join("\n")
411
+ : "";
412
+
405
413
  const focusedPrompt = [
406
414
  `<build-task>`,
415
+ `You are doing ONE task in a larger build that's already in progress.`,
416
+ `IGNORE earlier conversation context — those questions are already answered.`,
417
+ `DO NOT ask clarifying questions. The plan is locked. Just do this task.`,
418
+ ``,
407
419
  `Title: ${next.title}`,
408
420
  `Description: ${next.description}`,
421
+ knownFilesNote,
409
422
  ``,
410
- `This is one task in a larger build. Complete this task NOW by calling the appropriate tools.`,
411
- `Do not describe what you would do call the tools.`,
423
+ `Output ONE OR MORE concrete tool calls (write/edit/bash) for THIS task.`,
424
+ `Do NOT respond with prose. Do NOT explain. Call the tools NOW.`,
412
425
  `</build-task>`,
413
426
  ].join("\n");
414
427
 
@@ -503,6 +516,32 @@ async function countToolCallsSince(session_id: string, since_message_id: string)
503
516
  }
504
517
  }
505
518
 
519
/**
 * Like countToolCallsSince, but excludes calls synthesized from leaked tool-call
 * markers. Used by the probe so we know whether the model can actually emit
 * native tool calls — not just leak text we recovered. Models that only ever
 * leak should be routed through the JSON-manifest fallback.
 *
 * Uses LIKE on the serialized tool_input JSON instead of jsonb operators so
 * the same query works in both Postgres (real JSONB) and the SQLite fallback
 * (TEXT column with stripped JSON operators).
 *
 * `_` is a single-character wildcard in LIKE, so the two leading underscores of
 * the `__synthesized` marker are escaped (with `!` as the escape character,
 * which needs no special string-literal handling). Without the escape, any
 * input that merely contains the word "synthesized" would be excluded.
 *
 * A NULL tool_input is coalesced to '' so the row is still counted; otherwise
 * `NULL NOT LIKE …` evaluates to NULL and the row would be filtered out.
 *
 * @param session_id Session whose tool calls are counted.
 * @param since_message_id Only calls created at or after this message's
 *   `created_at` are counted.
 * @returns Number of matching tool calls. Returns 0 when the query throws or
 *   the count cannot be read as a finite number — a deliberate best-effort, so
 *   a failed lookup reads as "no native calls seen".
 */
async function countNativeToolCallsSince(session_id: string, since_message_id: string): Promise<number> {
  try {
    const r = await pool.query(
      `SELECT COUNT(*) AS n FROM tool_calls
       WHERE session_id = $1
       AND created_at >= (SELECT created_at FROM messages WHERE id = $2)
       AND COALESCE(CAST(tool_input AS TEXT), '') NOT LIKE '%!_!_synthesized%' ESCAPE '!'`,
      [session_id, since_message_id]
    );
    // COUNT(*) may arrive as a string, a number or a bigint depending on the
    // driver; normalize every case to a plain finite number.
    const raw = r.rows[0]?.n;
    const count = Number(raw ?? 0);
    return Number.isFinite(count) ? count : 0;
  } catch {
    return 0;
  }
}
544
+
506
545
  // ─── Tool-capability probe & JSON-manifest fallback ───────────────────────────
507
546
 
508
547
  /**
@@ -553,7 +592,11 @@ async function probeToolCapability(input: BuildModeInput): Promise<boolean> {
553
592
  agent_name: "build",
554
593
  });
555
594
 
556
- const calls = await countToolCallsSince(input.session_id, probeMsgId);
595
+ // Only count NATIVE tool calls. Calls synthesized from leaked tool-marker
596
+ // text don't count — those are unreliable in real builds (the model needs
597
+ // to format the leak perfectly every time, which it usually doesn't).
598
+ // If the model only ever leaks, we want JSON-manifest fallback instead.
599
+ const calls = await countNativeToolCallsSince(input.session_id, probeMsgId);
557
600
  return calls > 0;
558
601
  } catch {
559
602
  return false;
@@ -610,6 +653,22 @@ async function executeFallback(
610
653
  .map((t) => `- ${t.title}`)
611
654
  .join("\n");
612
655
 
656
+ // Build a manifest of files already written so the model doesn't blindly
657
+ // overwrite them. We include the first ~30 lines of each file so the model
658
+ // can see what's there and decide whether to extend or leave alone.
659
+ state.written_files = state.written_files || {};
660
+ const existingPaths = Object.keys(state.written_files);
661
+ const existingFilesSnippets: string[] = [];
662
+ for (const p of existingPaths) {
663
+ try {
664
+ const target = path.resolve(worktree, p);
665
+ const content = await fs.readFile(target, "utf-8").catch(() => "");
666
+ const snippet = content.split("\n").slice(0, 30).join("\n");
667
+ const truncated = content.split("\n").length > 30 ? "\n... (truncated)" : "";
668
+ existingFilesSnippets.push(`${p} (${state.written_files[p].size} bytes):\n${snippet}${truncated}`);
669
+ } catch {}
670
+ }
671
+
613
672
  const result = await askJson<{
614
673
  files?: Array<{ path: string; content: string }>;
615
674
  commands?: string[];
@@ -617,20 +676,36 @@ async function executeFallback(
617
676
  model_id: input.model_id,
618
677
  provider_id: input.provider_id,
619
678
  system: [
620
- "You are completing one task in a project build. Produce a JSON manifest of the files to create or update and shell commands to run for THIS task only.",
679
+ "You are completing one task in an incremental project build. The project already has files from earlier tasks; THIS task adds the next layer.",
621
680
  "",
622
- "RULES:",
623
- "- Output a single JSON object: { \"files\": [...], \"commands\": [...] }",
681
+ "PRODUCE A JSON MANIFEST of files to create OR commands to run for THIS task only.",
682
+ "",
683
+ "CRITICAL RULES:",
684
+ `- Output a single JSON object: { "files": [...], "commands": [...] }`,
624
685
  "- Each file must have a relative `path` and full `content`. Do not abbreviate file content.",
625
686
  "- File paths must be relative to the project root (no leading slash, no '..').",
626
- "- Commands run in the project root. Use them only for compilation, package install, or migrations.",
687
+ "",
688
+ "FILE OVERWRITE POLICY:",
689
+ "- If a file already exists (listed under <existing-files>), DO NOT include it in your manifest unless this task is specifically about modifying it.",
690
+ "- If you must update an existing file, you MUST include the COMPLETE new content (including everything from the existing version that you want to keep). Partial content will overwrite and destroy whatever is there.",
691
+ "- For NEW files, just provide the new content.",
692
+ "- Prefer adding NEW files over modifying existing ones whenever possible.",
693
+ "",
694
+ "OUTPUT FORMAT:",
627
695
  "- Do not include explanatory prose. The JSON IS the entire response.",
696
+ "- Do not wrap in markdown code fences.",
628
697
  ].join("\n"),
629
698
  user: [
630
699
  `<design>`,
631
700
  designContext,
632
701
  `</design>`,
633
702
  ``,
703
+ `<existing-files>`,
704
+ existingFilesSnippets.length
705
+ ? existingFilesSnippets.join("\n\n---\n\n")
706
+ : "(no files written yet)",
707
+ `</existing-files>`,
708
+ ``,
634
709
  `<completed-tasks>`,
635
710
  completedFiles || "(none yet)",
636
711
  `</completed-tasks>`,
@@ -639,6 +714,8 @@ async function executeFallback(
639
714
  `Title: ${todo.title}`,
640
715
  `Description: ${todo.description}`,
641
716
  `</current-task>`,
717
+ ``,
718
+ `Produce the JSON manifest for the current task. Do NOT re-emit existing files unless this task explicitly modifies them.`,
642
719
  ].join("\n"),
643
720
  schema_hint: `{ "files": [{ "path": string, "content": string }], "commands": string[] }`,
644
721
  });
@@ -663,10 +740,38 @@ async function executeFallback(
663
740
  errors.push(`refused path outside worktree: ${f.path}`);
664
741
  continue;
665
742
  }
743
+
744
+ // OVERWRITE GUARD: if the file was written by an earlier todo and this
745
+ // todo's title doesn't suggest it's specifically about this file, refuse
746
+ // and force the model to think harder. Always allow overwrite if the new
747
+ // content is larger (model is appending) or the path appears in the todo.
748
+ const existing = state.written_files[f.path];
749
+ if (existing && existing.todo_id !== todo.id) {
750
+ const oldContent = await fs.readFile(target, "utf-8").catch(() => "");
751
+ const newSize = Buffer.byteLength(f.content, "utf-8");
752
+ const titleMentionsPath = todo.title.toLowerCase().includes(f.path.toLowerCase()) ||
753
+ todo.description.toLowerCase().includes(f.path.toLowerCase());
754
+ const isStrictShrink = newSize < oldContent.length * 0.5; // shrunk by >50% = clobber
755
+
756
+ if (isStrictShrink && !titleMentionsPath) {
757
+ errors.push(
758
+ `refused overwrite: ${f.path} would shrink from ${oldContent.length} to ${newSize} bytes ` +
759
+ `(model likely truncating). Skipping.`
760
+ );
761
+ continue;
762
+ }
763
+ }
764
+
666
765
  try {
667
766
  await fs.mkdir(path.dirname(target), { recursive: true });
668
767
  await fs.writeFile(target, f.content, "utf-8");
669
768
  filesWritten++;
769
+ // Track in state
770
+ state.written_files[f.path] = {
771
+ todo_id: todo.id,
772
+ size: Buffer.byteLength(f.content, "utf-8"),
773
+ written_at: Date.now(),
774
+ };
670
775
  // Surface a tool.completed event so the TUI shows an Edit/Write line
671
776
  const callId = `fallback-${uuid()}`;
672
777
  const broadcastModule = await import("../../../bone/output/session/src/websocket");
@@ -674,14 +779,14 @@ async function executeFallback(
674
779
  type: "tool.requested",
675
780
  session_id: input.session_id,
676
781
  tool_call_id: callId,
677
- tool_name: "write",
782
+ tool_name: existing ? "edit" : "write",
678
783
  tool_input: { path: f.path, content: f.content.slice(0, 200) },
679
784
  });
680
785
  broadcastModule.broadcastToChannel("part_stream", {
681
786
  type: "tool.completed",
682
787
  session_id: input.session_id,
683
788
  tool_call_id: callId,
684
- tool_name: "write",
789
+ tool_name: existing ? "edit" : "write",
685
790
  tool_input: { path: f.path },
686
791
  duration_ms: 0,
687
792
  });
@@ -821,15 +926,43 @@ async function stageVerify(state: BuildState, input: BuildModeInput): Promise<Bu
821
926
 
822
927
  // ─── Driver ───────────────────────────────────────────────────────────────────
823
928
 
824
- export async function runBuildMode(input: BuildModeInput): Promise<BuildState> {
825
- let state: BuildState = (await loadState(input.session_id)) ?? {
929
/**
 * Build the initial BuildState for a new build of `prompt`.
 *
 * Kept in one place so that the first-run initialization in `runBuildMode`
 * and the restart performed after a previous build ended as `done` or
 * `failed` always begin from the same shape.
 *
 * @param prompt The user's request; stored as `original_prompt`.
 * @returns A state at the "clarify" stage with no design, no todos, iteration
 *   0, a 30-iteration cap and an empty written-files map.
 */
function freshState(prompt: string): BuildState {
  const initial: BuildState = {
    stage: "clarify",
    original_prompt: prompt,
    design: null,
    todos: [],
    iteration: 0,
    max_iterations: 30,
    written_files: {},
  };
  return initial;
}
945
+
946
+ export async function runBuildMode(input: BuildModeInput): Promise<BuildState> {
947
+ let state: BuildState = (await loadState(input.session_id)) ?? freshState(input.prompt);
948
+
949
+ // If the previous build is already done/failed, and the user is asking for
950
+ // something new (different prompt), start a fresh build rather than no-op.
951
+ // Without this, follow-up "build me X" prompts after a completed build
952
+ // silently do nothing because the loop sees state.stage === "done" and
953
+ // exits immediately.
954
+ if (
955
+ (state.stage === "done" || state.stage === "failed") &&
956
+ input.prompt &&
957
+ input.prompt !== state.original_prompt
958
+ ) {
959
+ emit(input.session_id, "build.restart", {
960
+ previous_stage: state.stage,
961
+ previous_prompt: state.original_prompt,
962
+ });
963
+ state = freshState(input.prompt);
964
+ await saveState(input.session_id, state);
965
+ }
833
966
 
834
967
  // Resume from saved state if applicable. If the user is sending a new prompt
835
968
  // and we're already in clarify with pending questions, treat the prompt as
@@ -887,9 +1020,20 @@ export function isBuildPrompt(prompt: string): boolean {
887
1020
  /\bmake\s+(?:a|an|the)\s+(?:full|complete|whole|new)\b/,
888
1021
  /\bproject\s+(?:from\s+scratch|to)\b/,
889
1022
  /\bsimulation\s+(?:with|using|of)\b/,
890
- /\bbackend\s+(?:for|with|using)\b/,
1023
+ /\bbackend\s+(?:for|with|using|service)\b/,
891
1024
  /\bspec(?:ification)?\s+(?:for|of)\b/,
892
1025
  /\bend[- ]to[- ]end\b/,
1026
+ // Verb-led "write/code/develop/generate/scaffold" requests followed by an article and a noun
1027
+ /\b(?:write|code|develop|generate|scaffold)\s+(?:me\s+)?(?:a|an|the)\s+\w+/,
1028
+ // BoneScript-specific phrases — if they say bonescript at all, treat as build
1029
+ /\busing\s+bonescript\b/,
1030
+ /\bwith\s+bonescript\b/,
1031
+ /\bin\s+bonescript\b/,
1032
+ /\bbonescript\s+(?:as|for|backend)\b/,
1033
+ // Generic "<adjective> <noun-app>" patterns indicating a system request
1034
+ /\b(?:rest|graphql)\s+api\b/,
1035
+ /\bweb\s+app(?:lication)?\b/,
1036
+ /\bgame\s+(?:simulation|engine|server)\b/,
893
1037
  ];
894
1038
  return triggers.some((re) => re.test(p));
895
1039
  }