bloby-bot 0.53.2 → 0.53.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "bloby-bot",
3
- "version": "0.53.2",
3
+ "version": "0.53.5",
4
4
  "releaseNotes": [
5
5
  "1. New Morphy animation system: config-driven sprites loaded from /morphy/*.json",
6
6
  "2. Swapped teleporting (splash) and headphones (bubble + chat) to the new format",
@@ -8,6 +8,10 @@ let child: ChildProcess | null = null;
8
8
  let restarts = 0;
9
9
  let lastSpawnTime = 0;
10
10
  let intentionallyStopped = false;
11
+ // True once the backend has crash-looped past MAX_RESTARTS and given up — i.e. it's down and
12
+ // will NOT come back without the user fixing the code. The supervisor shows the "backend down"
13
+ // interstitial in this state. Cleared on every spawn attempt (a deliberate restart is "trying again").
14
+ let gaveUp = false;
11
15
  const MAX_RESTARTS = 3;
12
16
  const STABLE_THRESHOLD = 30_000; // 30s — if backend ran this long, it wasn't a crash loop
13
17
 
@@ -39,6 +43,7 @@ export function spawnBackend(port: number): ChildProcess {
39
43
  const backendPath = path.join(WORKSPACE_DIR, 'backend', 'index.ts');
40
44
  lastSpawnTime = Date.now();
41
45
  intentionallyStopped = false;
46
+ gaveUp = false;
42
47
 
43
48
  // Clear log file on each restart — only keeps current run
44
49
  try { fs.writeFileSync(LOG_FILE, ''); } catch {}
@@ -106,6 +111,7 @@ export function spawnBackend(port: number): ChildProcess {
106
111
  log.info(`Restarting backend (${restarts}/${MAX_RESTARTS}, delay ${delay}ms)...`);
107
112
  setTimeout(() => spawnBackend(port), delay);
108
113
  } else {
114
+ gaveUp = true;
109
115
  log.error('Backend failed too many times. Use Bloby chat to debug.');
110
116
  }
111
117
  });
@@ -131,27 +137,75 @@ export function stopBackend(): Promise<void> {
131
137
  const dying = child;
132
138
  child = null;
133
139
 
134
- stopPromise = new Promise<void>((resolve) => {
135
- dying.once('exit', () => {
136
- stopPromise = null;
140
+ const promise = new Promise<void>((resolve) => {
141
+ let killTimer: ReturnType<typeof setTimeout> | null = null;
142
+ let finished = false;
143
+ const done = () => {
144
+ if (finished) return; // exit + SIGKILL paths can both fire; run once
145
+ finished = true;
146
+ if (killTimer) clearTimeout(killTimer);
147
+ // Only release the shared guard if it still points at THIS stop. A later stopBackend()
148
+ // may already have installed its own promise; the 3s safety timer (or a late exit) must
149
+ // never null a *different* stop's guard — that would make isBackendStopping() lie and let
150
+ // a concurrent spawn race the in-flight kill for the port.
151
+ if (stopPromise === promise) stopPromise = null;
137
152
  resolve();
138
- });
153
+ };
154
+ dying.once('exit', done);
139
155
  dying.kill();
140
- // Safety: force kill after 3s if SIGTERM doesn't work
141
- setTimeout(() => {
142
- try { dying.kill('SIGKILL'); } catch {}
143
- stopPromise = null;
144
- resolve();
145
- }, 3000);
156
+ // Safety: force kill after 3s if SIGTERM doesn't land.
157
+ killTimer = setTimeout(() => { try { dying.kill('SIGKILL'); } catch {} done(); }, 3000);
146
158
  });
159
+ stopPromise = promise;
147
160
 
148
- return stopPromise;
161
+ return promise;
162
+ }
163
+
164
+ let restartInFlight: Promise<void> | null = null;
165
+ let rerunRequested = false;
166
+
167
+ /** Serialized + coalescing backend restart — the single funnel for every deliberate restart
168
+ * (file watcher, turn-complete, scheduler pulse, channel manager). Concurrent callers share
169
+ * one in-flight restart; a request that arrives mid-restart triggers exactly one more
170
+ * stop→spawn cycle afterward, so the final backend was spawned after the latest request. This
171
+ * removes the double-spawn-onto-contended-port race of independent stopBackend().then(spawn) chains. */
172
+ export function restartBackend(port: number): Promise<void> {
173
+ if (restartInFlight) {
174
+ rerunRequested = true;
175
+ return restartInFlight;
176
+ }
177
+ restartInFlight = (async () => {
178
+ do {
179
+ rerunRequested = false;
180
+ resetBackendRestarts();
181
+ await stopBackend();
182
+ spawnBackend(port);
183
+ } while (rerunRequested);
184
+ })().finally(() => { restartInFlight = null; });
185
+ return restartInFlight;
149
186
  }
150
187
 
151
188
  export function isBackendAlive(): boolean {
152
189
  return child !== null && child.exitCode === null;
153
190
  }
154
191
 
192
+ /** True when the backend has crash-looped past MAX_RESTARTS and given up — down and not
193
+ * coming back without a code fix. Drives the supervisor's "backend down" interstitial. */
194
+ export function isBackendDead(): boolean {
195
+ return gaveUp;
196
+ }
197
+
198
+ /** Read the tail of the backend log (default 100 lines) for the "copy logs" debug helper. */
199
+ export function readBackendLogTail(maxLines = 100): string {
200
+ try {
201
+ const text = fs.readFileSync(LOG_FILE, 'utf-8');
202
+ const lines = text.split('\n');
203
+ return lines.slice(-maxLines).join('\n').trim();
204
+ } catch {
205
+ return '';
206
+ }
207
+ }
208
+
155
209
  export function isBackendStopping(): boolean {
156
210
  return stopPromise !== null;
157
211
  }
@@ -86,6 +86,12 @@ export function isConversationBusy(conversationId: string): boolean {
86
86
  return Object.values(HARNESSES).some((h) => h.isConversationBusy(conversationId));
87
87
  }
88
88
 
89
+ /** True if ANY conversation in ANY harness is mid-turn. Lets the supervisor defer backend
90
+ * restarts during channel/Alexa turns, which don't set the dashboard's agentQueryActive flag. */
91
+ export function anyConversationBusy(): boolean {
92
+ return Object.values(HARNESSES).some((h) => h.anyConversationBusy());
93
+ }
94
+
89
95
  export async function stopSubAgentTask(conversationId: string, taskId: string): Promise<void> {
90
96
  for (const h of Object.values(HARNESSES)) {
91
97
  if (h.hasConversation(conversationId)) {
@@ -538,6 +538,13 @@ export function isConversationBusy(conversationId: string): boolean {
538
538
  return liveConversations.get(conversationId)?.busy || false;
539
539
  }
540
540
 
541
+ /** True if ANY live conversation in this harness is mid-turn. Used by the supervisor to defer
542
+ * backend restarts during channel/Alexa turns (which don't set the dashboard's agentQueryActive). */
543
+ export function anyConversationBusy(): boolean {
544
+ for (const c of liveConversations.values()) if (c.busy) return true;
545
+ return false;
546
+ }
547
+
541
548
  /** Stop a specific background sub-agent task */
542
549
  export async function stopSubAgentTask(conversationId: string, taskId: string): Promise<void> {
543
550
  const conv = liveConversations.get(conversationId);
@@ -365,7 +365,14 @@ async function startTurn(conv: CodexConversation, content: string, savedFiles?:
365
365
  await conv.rpc.request('turn/start', params);
366
366
  } catch (err: any) {
367
367
  conv.busy = false;
368
+ conv.currentTurnId = null;
368
369
  conv.onMessage('bot:error', { conversationId: conv.id, error: `turn/start failed: ${err.message}` });
370
+ // turn/start produced no turn, so no turn/completed will arrive to clear the supervisor's
371
+ // agentQueryActive (set on bot:typing above). Left as-is, that wedges true forever:
372
+ // backend auto-heal is deferred indefinitely and chat is stuck showing "typing". Tear the
373
+ // conversation down so bot:conversation-ended fires (which, unlike bot:turn-complete, does
374
+ // NOT trigger a backend restart) — the next user message cold-starts a fresh thread.
375
+ teardownConversation(conv.id);
369
376
  }
370
377
  }
371
378
 
@@ -633,6 +640,13 @@ export function isConversationBusy(conversationId: string): boolean {
633
640
  return conversations.get(conversationId)?.busy ?? false;
634
641
  }
635
642
 
643
+ /** True if ANY live conversation in this harness is mid-turn. Used by the supervisor to defer
644
+ * backend restarts during channel/Alexa turns (which don't set the dashboard's agentQueryActive). */
645
+ export function anyConversationBusy(): boolean {
646
+ for (const c of conversations.values()) if (c.busy) return true;
647
+ return false;
648
+ }
649
+
636
650
  export async function startConversation(
637
651
  conversationId: string,
638
652
  model: string,
@@ -320,6 +320,13 @@ export function isConversationBusy(conversationId: string): boolean {
320
320
  return liveConversations.get(conversationId)?.busy || false;
321
321
  }
322
322
 
323
+ /** True if ANY live conversation in this harness is mid-turn. Used by the supervisor to defer
324
+ * backend restarts during channel/Alexa turns (which don't set the dashboard's agentQueryActive). */
325
+ export function anyConversationBusy(): boolean {
326
+ for (const c of liveConversations.values()) if (c.busy) return true;
327
+ return false;
328
+ }
329
+
323
330
  /** Pi has no sub-agents yet; provided for interface compatibility. */
324
331
  export async function stopSubAgentTask(_conversationId: string, _taskId: string): Promise<void> {
325
332
  // no-op for Phase 1
@@ -220,8 +220,14 @@ export function createPiSession(init: PiSessionInit): PiSession {
220
220
  if (toolUses.length === 0 && !pendingInterleave) break;
221
221
  }
222
222
 
223
- if (!turnErrored) {
224
- if (accumulatedText) {
223
+ // Emit text_end only on a clean turn (don't persist a half-baked answer from an errored
224
+ // turn). But ALWAYS emit turn_complete on a non-aborted turn — including the errored path
225
+ // — so the supervisor clears agentQueryActive (set on turn_started). Skipping it on error
226
+ // wedged the flag true: backend auto-heal stayed deferred and chat stuck in "typing" until
227
+ // the next successful turn. The 'error' event was already emitted by runOneRound, so the
228
+ // user still sees the failure. Aborted turns are torn down via bot:conversation-ended.
229
+ if (!init.abortController.signal.aborted) {
230
+ if (!turnErrored && accumulatedText) {
225
231
  init.onEvent({ type: 'text_end', text: accumulatedText });
226
232
  }
227
233
  const usedFileTools = Array.from(usedTools).some((t) => FILE_TOOL_NAMES.has(t));
@@ -238,6 +244,12 @@ export function createPiSession(init: PiSessionInit): PiSession {
238
244
  } catch (err: any) {
239
245
  log.warn(`[pi/session] Turn failed: ${err?.message || err}`);
240
246
  init.onEvent({ type: 'error', error: err?.message || String(err) });
247
+ // A thrown turn emitted no turn_complete either — clear agentQueryActive so auto-heal
248
+ // and chat aren't wedged. Skip when aborting (teardown emits conversation-ended).
249
+ // usedFileTools=false is the safe default (it only governs whether to auto-restart now).
250
+ if (!init.abortController.signal.aborted) {
251
+ init.onEvent({ type: 'turn_complete', usedFileTools: false });
252
+ }
241
253
  }
242
254
  }
243
255
  },
@@ -57,6 +57,8 @@ export interface Harness {
57
57
  endConversation(conversationId: string): void;
58
58
  endAllConversations(): void;
59
59
  isConversationBusy(conversationId: string): boolean;
60
+ /** True if ANY conversation in this harness is mid-turn (no id — used to defer backend restarts). */
61
+ anyConversationBusy(): boolean;
60
62
  stopSubAgentTask(conversationId: string, taskId: string): Promise<void>;
61
63
  warmUpForLiveConversation(
62
64
  model: string,
@@ -11,12 +11,12 @@ import { log } from '../shared/logger.js';
11
11
  import { startTunnel, stopTunnel, isTunnelAlive, restartTunnel, startNamedTunnel, restartNamedTunnel } from './tunnel.js';
12
12
  import { createWorkerApp } from '../worker/index.js';
13
13
  import { closeDb, getSession, getSetting } from '../worker/db.js';
14
- import { spawnBackend, stopBackend, getBackendPort, isBackendAlive, isBackendStopping, resetBackendRestarts, setBackendEnv } from './backend.js';
14
+ import { spawnBackend, stopBackend, restartBackend, getBackendPort, isBackendAlive, isBackendStopping, isBackendDead, readBackendLogTail, setBackendEnv } from './backend.js';
15
15
  import { handleAgentQuery, type AgentQueryRequest } from './agent-api.js';
16
16
  import { updateTunnelUrl, startHeartbeat, stopHeartbeat, disconnect } from '../shared/relay.js';
17
17
  import {
18
18
  startConversation, hasConversation, endConversation, endAllConversations,
19
- isConversationBusy, stopSubAgentTask,
19
+ isConversationBusy, anyConversationBusy, stopSubAgentTask,
20
20
  startBlobyAgentQuery, stopBlobyAgentQuery,
21
21
  warmUpForLiveConversation,
22
22
  type RecentMessage,
@@ -69,6 +69,8 @@ const PLATFORM_ASSETS = new Set([
69
69
  '/pi-logo.svg',
70
70
  '/codex.svg',
71
71
  '/manifest.json',
72
+ '/what-happened.webm',
73
+ '/what-happened.mp4',
72
74
  ]);
73
75
 
74
76
  // Directory-prefix platform assets — anything under these is served from supervisor/public/.
@@ -250,6 +252,77 @@ const RECOVERING_HTML = `<!DOCTYPE html><html style="background:#222122"><head><
250
252
  </div><script>setTimeout(function(){location.reload()},3000)</script>
251
253
  <script src="/bloby/widget.js"></script></body></html>`;
252
254
 
255
+ /** Interstitial shown (by the supervisor, not the workspace) when the workspace backend has
256
+ * crash-looped and given up. Replaces proxying the dashboard SPA to Vite — which would 503 on
257
+ * every /app/api call and, for the common workspace-lock template, misread "no backend" as
258
+ * "no password set" and show the lock-setup screen. Embeds the Bloby chat widget so the user
259
+ * can ask the agent to fix it inline, a "copy logs" button (last 100 backend log lines baked in
260
+ * at render time), and a poll that reloads into the real dashboard once the backend is back. */
261
+ function backendDownPage(logTail: string): string {
262
+ // Embed logs as a JS string literal; escape `<` so a stray `</script>` in the logs can't break out.
263
+ const logs = JSON.stringify(logTail && logTail.length ? logTail : '(no backend logs were captured)').replace(/</g, '\\u003c');
264
+ return `<!DOCTYPE html>
265
+ <html lang="en"><head>
266
+ <meta charset="UTF-8"><meta name="viewport" content="width=device-width,initial-scale=1">
267
+ <title>Backend down · Bloby</title>
268
+ <style>
269
+ *{margin:0;padding:0;box-sizing:border-box}
270
+ body{font-family:system-ui,-apple-system,'Segoe UI',sans-serif;background:#0a0a0b;color:#e4e4e7;display:flex;align-items:center;justify-content:center;min-height:100dvh;padding:1.5rem;overflow:hidden}
271
+ .c{text-align:center;max-width:480px;width:100%;animation:fade-up .6s ease-out both}
272
+ .video-wrap{position:relative;width:200px;height:200px;margin:0 auto 1.4rem;display:flex;align-items:center;justify-content:center}
273
+ .video-wrap::before{content:'';position:absolute;inset:-20px;background:radial-gradient(circle,rgba(1,102,255,0.18) 0%,transparent 60%);filter:blur(20px);animation:glow 3s ease-in-out infinite}
274
+ .video-wrap video{position:relative;width:100%;height:100%;object-fit:contain;pointer-events:none;border-radius:50%}
275
+ h1{font-size:1.55rem;font-weight:700;margin-bottom:.6rem;background:linear-gradient(135deg,#0166FF,#009AFE,#4AEEFF);-webkit-background-clip:text;-webkit-text-fill-color:transparent;background-clip:text}
276
+ p{color:#a1a1aa;line-height:1.6;margin-bottom:.5rem;font-size:.95rem}
277
+ .lead{color:#e4e4e7;font-size:1rem}
278
+ .actions{margin-top:1.3rem}
279
+ button{font:inherit;cursor:pointer;border-radius:10px;padding:.65rem 1.2rem;font-size:.9rem;font-weight:600;border:none;background:linear-gradient(135deg,#0166FF,#0069FE);color:#fff;transition:filter .15s}
280
+ button:hover{filter:brightness(1.12)}
281
+ .sub{font-size:.82rem;color:#71717a;display:inline-flex;align-items:center;gap:.5rem;background:#18181b;border:1px solid #27272a;border-radius:9999px;padding:.35rem .9rem;margin-top:1.1rem}
282
+ .sub .dot{width:8px;height:8px;border-radius:50%;background:linear-gradient(135deg,#0166FF,#009AFE);box-shadow:0 0 8px rgba(1,102,255,.6);animation:pulse 1.6s ease-in-out infinite}
283
+ .badge{display:block;font-size:.7rem;color:#52525b;margin-top:1.3rem}
284
+ @keyframes pulse{0%,100%{opacity:1;transform:scale(1)}50%{opacity:.45;transform:scale(.85)}}
285
+ @keyframes glow{0%,100%{opacity:.55;transform:scale(1)}50%{opacity:1;transform:scale(1.08)}}
286
+ @keyframes fade-up{0%{opacity:0;transform:translateY(12px)}100%{opacity:1;transform:translateY(0)}}
287
+ </style></head>
288
+ <body><div class="c">
289
+ <div class="video-wrap"><video autoplay loop muted playsinline>
290
+ <source src="/what-happened.webm" type="video/webm">
291
+ <source src="/what-happened.mp4" type="video/mp4">
292
+ </video></div>
293
+ <h1>Your app's backend is down</h1>
294
+ <p class="lead">The workspace server crashed and couldn't restart on its own.</p>
295
+ <p>Ask your agent to fix it — the chat is right here in the corner. Tap below to copy the logs so it can debug faster.</p>
296
+ <div class="actions"><button id="copyBtn">Copy logs for your agent</button></div>
297
+ <div class="sub"><span class="dot"></span><span id="statusText">Watching for recovery…</span></div>
298
+ <span class="badge">Powered by Bloby</span>
299
+ </div>
300
+ <script>
301
+ (function(){
302
+ var LOGS = ${logs};
303
+ var btn = document.getElementById('copyBtn'), statusEl = document.getElementById('statusText');
304
+ btn.addEventListener('click', function(){
305
+ var text = 'My workspace backend crashed and will not start. Find and fix the root cause. Last backend logs:\\n\\n' + LOGS;
306
+ function ok(){ btn.textContent = '✓ Copied — paste it to your agent'; setTimeout(function(){ btn.textContent = 'Copy logs for your agent'; }, 2600); }
307
+ function fallback(){ var ta=document.createElement('textarea'); ta.value=text; ta.style.position='fixed'; ta.style.opacity='0'; document.body.appendChild(ta); ta.select(); try{ document.execCommand('copy'); ok(); }catch(e){ btn.textContent='Copy failed — open the logs manually'; } document.body.removeChild(ta); }
308
+ if (navigator.clipboard && navigator.clipboard.writeText) { navigator.clipboard.writeText(text).then(ok).catch(fallback); } else { fallback(); }
309
+ });
310
+ var attempt = 0;
311
+ function retry(){
312
+ attempt++;
313
+ fetch('/__bloby/backend-status', { cache:'no-store' })
314
+ .then(function(r){ return r.json(); })
315
+ .then(function(s){ if (s && s.alive) { location.reload(); } else { schedule(); } })
316
+ .catch(schedule);
317
+ }
318
+ function schedule(){ statusEl.textContent = 'Watching for recovery… (checked ' + attempt + 'x)'; setTimeout(retry, Math.min(4000, 1500 + attempt*250)); }
319
+ setTimeout(retry, 2500);
320
+ })();
321
+ </script>
322
+ <script src="/bloby/widget.js"></script>
323
+ </body></html>`;
324
+ }
325
+
253
326
  /** Kill any stale process holding a port. Ensures clean startup after crashes/updates. */
254
327
  function killPort(port: number): void {
255
328
  try {
@@ -464,6 +537,15 @@ export async function startSupervisor() {
464
537
  return;
465
538
  }
466
539
 
540
+ // Backend liveness for the "backend down" interstitial's recovery poll. Supervisor-served
541
+ // (not proxied) so it answers even when the workspace backend is dead, and independent of
542
+ // whatever routes the user's backend happens to define.
543
+ if (req.url === '/__bloby/backend-status') {
544
+ res.writeHead(200, { 'Content-Type': 'application/json', 'Cache-Control': 'no-store' });
545
+ res.end(JSON.stringify({ alive: isBackendAlive(), dead: isBackendDead() }));
546
+ return;
547
+ }
548
+
467
549
  // App API routes → proxy to user's backend server
468
550
  if (req.url?.startsWith('/app/api')) {
469
551
  const backendPath = req.url.replace(/^\/app/, '');
@@ -1271,8 +1353,7 @@ mint();
1271
1353
  const result = await handleAgentQuery(parsed);
1272
1354
 
1273
1355
  if (result.usedFileTools) {
1274
- resetBackendRestarts();
1275
- stopBackend().then(() => spawnBackend(backendPort));
1356
+ void doRestart();
1276
1357
  broadcastBloby('app:hmr-update', {});
1277
1358
  }
1278
1359
 
@@ -1697,6 +1778,24 @@ mint();
1697
1778
  } catch { /* fall through to Vite */ }
1698
1779
  }
1699
1780
 
1781
+ // Workspace backend has crash-looped and given up → serve the "backend down" interstitial
1782
+ // for dashboard DOCUMENT navigations, instead of proxying to Vite (which serves the user's
1783
+ // SPA that then 503s on every /app/api call and, for the common workspace-lock template,
1784
+ // misreads the dead backend as "no password set" and shows the lock-setup screen). Scoped to
1785
+ // top-level navigations only (not assets/HMR/XHR) and only when the backend has truly given
1786
+ // up — never during a normal 1–2s restart. The chat PWA (/bloby/*) is served earlier and is
1787
+ // unaffected.
1788
+ const wantsHtml = req.method === 'GET' && (
1789
+ req.headers['sec-fetch-dest'] === 'document' ||
1790
+ req.headers['sec-fetch-mode'] === 'navigate' ||
1791
+ (!req.headers['sec-fetch-dest'] && String(req.headers['accept'] || '').includes('text/html'))
1792
+ );
1793
+ if (wantsHtml && isBackendDead()) {
1794
+ res.writeHead(503, { 'Content-Type': 'text/html', 'Cache-Control': 'no-store, no-cache, must-revalidate' });
1795
+ res.end(backendDownPage(readBackendLogTail(100)));
1796
+ return;
1797
+ }
1798
+
1700
1799
  // Everything else → proxy to dashboard Vite dev server
1701
1800
  console.log(`[supervisor] → dashboard Vite :${vitePorts.dashboard} | ${req.method} ${(req.url || '').split('?')[0]}`);
1702
1801
  const proxy = http.request(
@@ -1931,11 +2030,8 @@ mint();
1931
2030
  currentStreamBuffer = '';
1932
2031
 
1933
2032
  if (eventData.usedFileTools || pendingBackendRestart) {
1934
- log.info('[orchestrator] Restarting backend (file tools used)');
1935
- pendingBackendRestart = false;
1936
- if (backendRestartTimer) { clearTimeout(backendRestartTimer); backendRestartTimer = null; }
1937
- resetBackendRestarts();
1938
- stopBackend().then(() => spawnBackend(backendPort));
2033
+ log.info('[orchestrator] Restarting backend (file tools used / pending watcher change)');
2034
+ void doRestart();
1939
2035
  }
1940
2036
  if (pendingUpdate) {
1941
2037
  pendingUpdate = false;
@@ -2536,11 +2632,7 @@ mint();
2536
2632
  startScheduler({
2537
2633
  broadcastBloby,
2538
2634
  workerApi,
2539
- restartBackend: async () => {
2540
- resetBackendRestarts();
2541
- await stopBackend();
2542
- spawnBackend(backendPort);
2543
- },
2635
+ restartBackend: () => doRestart(),
2544
2636
  getModel: () => loadConfig().ai.model,
2545
2637
  });
2546
2638
 
@@ -2548,11 +2640,7 @@ mint();
2548
2640
  const channelManager = new ChannelManager({
2549
2641
  broadcastBloby,
2550
2642
  workerApi,
2551
- restartBackend: async () => {
2552
- resetBackendRestarts();
2553
- await stopBackend();
2554
- spawnBackend(backendPort);
2555
- },
2643
+ restartBackend: () => doRestart(),
2556
2644
  getModel: () => loadConfig().ai.model,
2557
2645
  });
2558
2646
 
@@ -2586,21 +2674,39 @@ mint();
2586
2674
  const backendDir = path.join(workspaceDir, 'backend');
2587
2675
  let backendRestartTimer: ReturnType<typeof setTimeout> | null = null;
2588
2676
 
2677
+ /** Single funnel for every DELIBERATE backend restart (file watcher, turn-complete, agent-api
2678
+ * one-shot, scheduler pulse, channel manager). Clears the deferred-restart flag and the
2679
+ * debounce timer, then delegates to backend.ts's serialized + coalescing restartBackend so
2680
+ * concurrent triggers can never double-spawn onto the contended port. */
2681
+ function doRestart(): Promise<void> {
2682
+ pendingBackendRestart = false;
2683
+ if (backendRestartTimer) { clearTimeout(backendRestartTimer); backendRestartTimer = null; }
2684
+ return restartBackend(backendPort);
2685
+ }
2686
+
2687
+ /** True while any surface is mid-turn. Dashboard chat sets agentQueryActive; WhatsApp/Alexa
2688
+ * turns instead set the harness conv.busy (they don't touch agentQueryActive), so we must
2689
+ * check both — otherwise an agent editing the backend over a channel would get the backend
2690
+ * restarted out from under it mid-turn. */
2691
+ const aTurnIsActive = () => agentQueryActive || anyConversationBusy();
2692
+
2589
2693
  function scheduleBackendRestart(reason: string) {
2590
- if (agentQueryActive) {
2591
- // Agent is working — don't restart now, flag it for bot:done
2694
+ if (aTurnIsActive()) {
2695
+ // A turn is working — don't restart now; flush at turn-complete (createSharedChatOnMessage)
2696
+ // or via the channel manager's own post-turn restart.
2592
2697
  pendingBackendRestart = true;
2593
2698
  return;
2594
2699
  }
2595
- // Skip if a stop/restart is already in progress (bot:done handler owns the restart)
2700
+ // Skip if a stop/restart is already in progress (that restart owns the spawn).
2596
2701
  if (isBackendStopping()) return;
2597
2702
  if (backendRestartTimer) clearTimeout(backendRestartTimer);
2598
- backendRestartTimer = setTimeout(async () => {
2599
- if (isBackendStopping()) return; // re-check after delay
2703
+ backendRestartTimer = setTimeout(() => {
2704
+ backendRestartTimer = null;
2705
+ // Re-check at fire time: a turn may have started during the 1s debounce window.
2706
+ if (aTurnIsActive()) { pendingBackendRestart = true; return; }
2707
+ if (isBackendStopping()) return;
2600
2708
  log.info(`[watcher] ${reason} — restarting backend...`);
2601
- resetBackendRestarts();
2602
- await stopBackend();
2603
- spawnBackend(backendPort);
2709
+ void doRestart();
2604
2710
  }, 1000);
2605
2711
  }
2606
2712
 
@@ -2610,12 +2716,22 @@ mint();
2610
2716
  scheduleBackendRestart(`Backend file changed: ${filename}`);
2611
2717
  });
2612
2718
 
2613
- // Watch workspace root for .env changes and .restart trigger
2719
+ // Watch workspace root for .env, dependency, and .restart/.update changes
2614
2720
  const workspaceWatcher = fs.watch(workspaceDir, (_event, filename) => {
2615
2721
  if (!filename) return;
2616
2722
  if (filename === '.env') {
2617
2723
  scheduleBackendRestart('.env changed');
2618
2724
  }
2725
+ if (filename === 'package.json' || filename === 'package-lock.json') {
2726
+ // The agent ran `npm install` to add/fix a backend dependency. Neither watcher otherwise
2727
+ // covers workspace-root deps (backendWatcher only watches backend/; node_modules is huge
2728
+ // and intentionally unwatched). Without this, an install done to fix an ENOENT crash — where
2729
+ // the import already exists so no Write tool fires and usedFileTools stays false — never
2730
+ // restarts the backend, leaving it broken until some unrelated edit. npm install runs inside
2731
+ // the agent's turn, so this defers (like every trigger) and lands at turn-complete, after
2732
+ // the install has fully written package.json + node_modules.
2733
+ scheduleBackendRestart(`workspace dependencies changed (${filename})`);
2734
+ }
2619
2735
  if (filename === '.restart') {
2620
2736
  // Consume the trigger file
2621
2737
  try { fs.unlinkSync(path.join(workspaceDir, '.restart')); } catch {}