bloby-bot 0.65.4 → 0.66.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "bloby-bot",
3
- "version": "0.65.4",
3
+ "version": "0.66.1",
4
4
  "releaseNotes": [
5
5
  "1. Fix: image (and audio) attachments now render in chat again — /api/files is fetched with the auth token instead of a raw <img> src that 401'd after the endpoint hardening",
6
6
  "2. Affects chat thumbnails, the image lightbox, voice-note playback, and agent image cards",
@@ -105,7 +105,9 @@ The supervisor manages the backend process:
105
105
  - Editing `.ts`, `.js`, or `.json` files in `backend/` → auto-restart
106
106
  - Editing `.env` → auto-restart
107
107
  - After your turn ends, if you used Write or Edit tools → auto-restart
108
- - The backend does NOT restart mid-turn — edits are batched
108
+ - The backend does NOT auto-restart mid-turn — edits are batched (multi-file changes apply atomically)
109
+ - To restart and verify a fix WITHIN your turn (after edits are saved): `curl -s -X POST http://127.0.0.1:${SUPERVISOR_PORT:-7400}/__bloby/control/restart-backend -d '{"wait":true}'` — it waits for the backend to be healthy and returns `{"healthy":...,"logs":"..."}`, so you can then curl the backend to confirm the fix
110
+ - Read backend logs: `curl -s http://127.0.0.1:${SUPERVISOR_PORT:-7400}/__bloby/control/logs/backend?lines=200`
109
111
 
110
112
  **NEVER** kill processes, run `bloby start`, or run `npm start` directly.
111
113
 
@@ -1,4 +1,5 @@
1
1
  import { spawn, type ChildProcess } from 'child_process';
2
+ import http from 'http';
2
3
  import fs from 'fs';
3
4
  import path from 'path';
4
5
  import { PKG_DIR, WORKSPACE_DIR } from '../shared/paths.js';
@@ -39,6 +40,10 @@ export function setBackendGiveUpHandler(fn: () => void): void {
39
40
  }
40
41
 
41
42
  const LOG_FILE = path.join(WORKSPACE_DIR, '.backend.log');
43
+ // Holds the LAST crashed run's output. spawnBackend truncates LOG_FILE on every (re)spawn, so an
44
+ // agent reading .backend.log right after an auto-restart would otherwise see only the fresh (often
45
+ // empty) run and lose the originating error. The crash exit handler copies LOG_FILE here first.
46
+ const LOG_FILE_PREV = LOG_FILE + '.prev';
42
47
 
43
48
  export function getBackendPort(basePort: number): number {
44
49
  return basePort + 4;
@@ -114,6 +119,11 @@ export function spawnBackend(port: number): ChildProcess {
114
119
  // Supervisor called stopBackend() — don't auto-restart
115
120
  if (intentionallyStopped) return;
116
121
 
122
+ // Preserve the just-crashed run's output before the next spawnBackend truncates LOG_FILE, so a
123
+ // post-bounce read (agent or interstitial) can still fetch the originating error via ?prev=1.
124
+ // Only crashes reach here (intentional stops returned above), so .prev always holds the last crash.
125
+ try { fs.copyFileSync(LOG_FILE, LOG_FILE_PREV); } catch {}
126
+
117
127
  // Any unexpected exit (crash, SIGTERM, OOM, null code) — restart
118
128
  log.warn(`Backend exited unexpectedly (code ${code})`);
119
129
  // Track crashes in a rolling window (backstop for the 30s-reset crash-loop hole).
@@ -216,10 +226,12 @@ export function isBackendDead(): boolean {
216
226
  return gaveUp;
217
227
  }
218
228
 
219
- /** Read the tail of the backend log (default 100 lines) for the "copy logs" debug helper. */
220
- export function readBackendLogTail(maxLines = 100): string {
229
+ /** Read the tail of the backend log (default 100 lines) for the "copy logs" debug helper and the
230
+ * agent's GET /__bloby/control/logs/backend endpoint. Pass prev=true to read the last CRASHED run
231
+ * (.backend.log.prev) — useful right after an auto-restart, when the live log is a fresh run. */
232
+ export function readBackendLogTail(maxLines = 100, prev = false): string {
221
233
  try {
222
- const text = fs.readFileSync(LOG_FILE, 'utf-8');
234
+ const text = fs.readFileSync(prev ? LOG_FILE_PREV : LOG_FILE, 'utf-8');
223
235
  const lines = text.split('\n');
224
236
  return lines.slice(-maxLines).join('\n').trim();
225
237
  } catch {
@@ -227,6 +239,37 @@ export function readBackendLogTail(maxLines = 100): string {
227
239
  }
228
240
  }
229
241
 
242
/**
 * Reports whether the backend was (re)spawned less than two seconds ago.
 *
 * Callers use this to tell the agent that a near-empty log tail is an artifact of the fresh
 * spawn, not proof that no error occurred.
 */
export function backendJustSpawned(): boolean {
  const msSinceSpawn = Date.now() - lastSpawnTime;
  return msSinceSpawn < 2000;
}
247
+
248
/**
 * Poll the backend's HTTP port until it answers, and report whether it came up.
 *
 * Any HTTP response — even a 404 — counts as ready, because it proves the port is bound and
 * serving. This is the readiness signal the restart-and-verify endpoint returns to the agent:
 * isBackendAlive() only means the child process was spawned, not that it has bound its port.
 *
 * The wait is bounded by `timeoutMs`: each attempt's socket timeout and each retry delay are
 * clamped to the time remaining, so the promise settles at most ~250ms after the deadline
 * (previously a fixed 2s attempt timeout plus a 250ms retry could overshoot by more than 2s).
 *
 * @param port      Loopback port the backend is expected to listen on.
 * @param timeoutMs Overall time budget in milliseconds (default 15000).
 * @returns Promise resolving to true once the port answers; false if the deadline passes or the
 *          module-level `gaveUp` flag is set first. The promise never rejects.
 */
export function probeBackendReady(port: number, timeoutMs = 15000): Promise<boolean> {
  const deadline = Date.now() + timeoutMs;
  return new Promise((resolve) => {
    // Guard so a late 'error' on an already-answered request cannot restart the retry loop.
    let settled = false;
    const finish = (ready: boolean): void => {
      if (settled) return;
      settled = true;
      resolve(ready);
    };

    const attempt = (): void => {
      if (settled) return;
      if (gaveUp) return finish(false); // crash-looped past the limit — it's not coming up
      // Give every attempt at least 250ms so the final one is still a fair try, but never the
      // full 2s when less than that remains before the deadline.
      const remaining = deadline - Date.now();
      const attemptTimeout = Math.max(250, Math.min(2000, remaining));
      const req = http.request(
        { host: '127.0.0.1', port, path: '/', method: 'GET', timeout: attemptTimeout },
        (res) => { res.resume(); finish(true); }, // any HTTP response = port is listening
      );
      req.on('error', () => {
        if (settled) return;
        const left = deadline - Date.now();
        if (left <= 0) return finish(false);
        setTimeout(attempt, Math.min(250, left));
      });
      // Destroying the request surfaces as 'error', which drives the retry/deadline logic above.
      req.on('timeout', () => { try { req.destroy(); } catch {} });
      req.end();
    };
    attempt();
  });
}
272
+
230
273
  export function isBackendStopping(): boolean {
231
274
  return stopPromise !== null;
232
275
  }
@@ -88,6 +88,10 @@ export function isConversationBusy(conversationId: string): boolean {
88
88
 
89
89
  /** True if ANY conversation in ANY harness is mid-turn. Lets the supervisor defer backend
90
90
  * restarts during channel/Alexa turns, which don't set the dashboard's agentQueryActive flag. */
91
+ export function anyOneShotActive(): boolean {
92
+ return Object.values(HARNESSES).some((h) => h.anyOneShotActive());
93
+ }
94
+
91
95
  export function anyConversationBusy(): boolean {
92
96
  return Object.values(HARNESSES).some((h) => h.anyConversationBusy());
93
97
  }
@@ -54,6 +54,8 @@ interface ChannelManagerOpts {
54
54
  workerApi: (path: string, method?: string, body?: any) => Promise<any>;
55
55
  restartBackend: () => void;
56
56
  getModel: () => string;
57
+ /** Fired after a channel turn ends — the supervisor uses it to flush a queued self-update. */
58
+ onTurnComplete?: () => void;
57
59
  }
58
60
 
59
61
  interface ActiveAgentQuery {
@@ -1063,6 +1065,7 @@ export class ChannelManager {
1063
1065
  // the dashboard's typing indicator would stay on forever.
1064
1066
  if (type === 'bot:turn-complete') {
1065
1067
  if (eventData.usedFileTools) this.opts.restartBackend();
1068
+ this.opts.onTurnComplete?.(); // flush a queued self-update after a channel turn
1066
1069
  broadcastBloby('bot:idle', { conversationId: convId });
1067
1070
  return;
1068
1071
  }
@@ -1071,6 +1074,7 @@ export class ChannelManager {
1071
1074
  // conversation under the same convId starts clean.
1072
1075
  if (type === 'bot:conversation-ended') {
1073
1076
  this.clearRoutes(convId);
1077
+ this.opts.onTurnComplete?.(); // flush a queued self-update if this turn ended by exception
1074
1078
  return;
1075
1079
  }
1076
1080
 
@@ -1216,12 +1220,14 @@ export class ChannelManager {
1216
1220
 
1217
1221
  if (type === 'bot:turn-complete') {
1218
1222
  if (eventData.usedFileTools) this.opts.restartBackend();
1223
+ this.opts.onTurnComplete?.(); // flush a queued self-update after a channel turn
1219
1224
  broadcastBloby('bot:idle', { conversationId: convId });
1220
1225
  return;
1221
1226
  }
1222
1227
 
1223
1228
  if (type === 'bot:conversation-ended') {
1224
1229
  this.clearRoutes(convId);
1230
+ this.opts.onTurnComplete?.(); // flush a queued self-update if this turn ended by exception
1225
1231
  return;
1226
1232
  }
1227
1233
 
@@ -1375,6 +1381,7 @@ export class ChannelManager {
1375
1381
  if (type === 'bot:done') {
1376
1382
  this.activeAgents.delete(agentKey);
1377
1383
  if (eventData.usedFileTools) this.opts.restartBackend();
1384
+ this.opts.onTurnComplete?.(); // flush a queued self-update after a channel turn
1378
1385
  this.processQueue();
1379
1386
  }
1380
1387
  },
@@ -0,0 +1,80 @@
1
/**
 * Server-side frontend log ring — the data source behind GET /__bloby/control/logs/frontend
 * (the agent's "tail frontend / devtools logs") and the friendly "Copy error" flow.
 *
 * Two independent producers feed ONE in-memory ring, so the tail is never empty regardless of
 * how the frontend broke:
 *   1. The Vite dev server's customLogger (supervisor/vite-dev.ts) — COMPILE/transform errors,
 *      captured even when the browser never ran a line of JS (hard compile failure / blank page).
 *   2. The browser (supervisor/workspace-guard.js) POSTing window.onerror / unhandledrejection /
 *      console.error / console.warn / Vite-overlay text to POST /__bloby/control/fe-log — RUNTIME
 *      errors, which Vite never sees.
 *
 * Memory-only by design: the agent reads it over the loopback endpoint (no workspace file to grow
 * unbounded, pollute the dir, or self-trigger Vite's watcher). It is the current session's frontend
 * error trail; a supervisor restart clears it (frontend errors are transient by nature).
 */

export type FrontendLogKind =
  | 'error'
  | 'unhandledrejection'
  | 'console.error'
  | 'console.warn'
  | 'vite-error'
  | 'vite-warn'
  | 'vite-overlay';

export interface FrontendLogEntry {
  /** Date.now() timestamp taken when the entry was appended. */
  t: number;
  kind: FrontendLogKind;
  /** Single-line message: line breaks are collapsed to a marker, length clamped to TEXT_CAP. */
  text: string;
  /** Optional stack, clamped to TEXT_CAP characters; may span several lines. */
  stack?: string;
}

const RING_MAX = 500;
const TEXT_CAP = 4000; // per-field clamp so one giant stack can't blow the ring's memory
const DEDUPE_WINDOW_MS = 4000;

const ring: FrontendLogEntry[] = [];

// Collapse the same message arriving repeatedly in a short window. The guard re-evaluates the Vite
// overlay on a 1.5s tick, and a crash loop can spam identical errors — without this the ring fills
// with one repeated line and pushes out the useful history.
let lastKey = '';
let lastAt = 0;

/**
 * Append one frontend log entry to the ring. Best-effort, never throws, drops empty text.
 *
 * `text` is newline-stripped: the browser-facing POST /__bloby/control/fe-log endpoint is
 * unauthenticated, and tailFrontendLog renders one header per line — an embedded newline would let
 * a remote caller forge a fake `<ts> [kind] ...` line that the agent reads as genuine. Collapsing
 * line breaks to a marker keeps each header to exactly one line. `stack` keeps its line breaks:
 * the renderer indents every stack line, so it cannot masquerade as an un-indented header.
 *
 * @param kind  Producer/category of the entry.
 * @param text  Message; coerced to string, clamped to TEXT_CAP characters.
 * @param stack Optional stack trace; clamped to TEXT_CAP characters.
 */
export function appendFrontendLog(kind: FrontendLogKind, text: string, stack?: string): void {
  const clean = (text == null ? '' : String(text))
    .slice(0, TEXT_CAP) // bound the regex work on hostile input
    .replace(/[\r\n]+/g, ' ⏎ ')
    .slice(0, TEXT_CAP) // the marker is wider than a newline, so re-apply the cap after expansion
    .trim();
  if (!clean) return;
  const stk = stack ? String(stack).slice(0, TEXT_CAP) : undefined;

  const key = kind + '|' + clean;
  const now = Date.now();
  // A repeat inside the window only refreshes the timestamp, so a steady stream stays collapsed.
  if (key === lastKey && now - lastAt < DEDUPE_WINDOW_MS) { lastAt = now; return; }
  lastKey = key;
  lastAt = now;

  ring.push({ t: now, kind, text: clean, stack: stk });
  while (ring.length > RING_MAX) ring.shift();
}

/**
 * Render the last `maxLines` lines of the ring as text (newest last). Each entry is one header
 * line (`<ISO ts> [kind] text`) followed by its stack, one indented line per stack line.
 *
 * Stack lines are counted individually, so the result never exceeds `maxLines` lines; a tail may
 * therefore begin with the trailing stack lines of an entry whose header was cut off.
 *
 * @param maxLines Maximum number of lines to return (default 100). Zero, negative, or
 *                 non-numeric values yield an empty string.
 */
export function tailFrontendLog(maxLines = 100): string {
  const limit = Math.floor(Number(maxLines));
  // Must be handled explicitly: slice(-0) is slice(0), which would return the WHOLE ring.
  if (!(limit > 0)) return '';

  const lines: string[] = [];
  for (const e of ring) {
    lines.push(`${new Date(e.t).toISOString()} [${e.kind}] ${e.text}`);
    if (e.stack) {
      // Split on every line-break flavour (incl. bare \r) so each stack line gets its indent.
      for (const stackLine of e.stack.split(/\r\n|\r|\n/)) lines.push(' ' + stackLine);
    }
  }
  return lines.slice(-limit).join('\n');
}

/** Number of entries currently buffered in the ring (entries, not rendered lines). */
export function getFrontendLogCount(): number {
  return ring.length;
}
@@ -550,6 +550,13 @@ export function anyConversationBusy(): boolean {
550
550
  return false;
551
551
  }
552
552
 
553
/**
 * Reports whether at least one one-shot startBlobyAgentQuery (pulse/cron, customer WhatsApp) is
 * still in flight. Such queries are tracked only in `activeQueries` (removed in a finally), never
 * in liveConversations, which is why anyConversationBusy() cannot observe them.
 */
export function anyOneShotActive(): boolean {
  return activeQueries.size !== 0;
}
559
+
553
560
  /** Stop a specific background sub-agent task */
554
561
  export async function stopSubAgentTask(conversationId: string, taskId: string): Promise<void> {
555
562
  const conv = liveConversations.get(conversationId);
@@ -887,6 +887,12 @@ export function isConversationBusy(conversationId: string): boolean {
887
887
 
888
888
  /** True if ANY live conversation in this harness is mid-turn. Used by the supervisor to defer
889
889
  * backend restarts during channel/Alexa turns (which don't set the dashboard's agentQueryActive). */
890
+ /** Codex one-shot queries (startBlobyAgentQuery) reuse the conversations map and set conv.busy via
891
+ * startTurn, so anyConversationBusy() already covers them — nothing extra to track here. */
892
+ export function anyOneShotActive(): boolean {
893
+ return false;
894
+ }
895
+
890
896
  export function anyConversationBusy(): boolean {
891
897
  for (const c of conversations.values()) if (c.busy) return true;
892
898
  return false;
@@ -344,6 +344,12 @@ export async function warmUpForLiveConversation(
344
344
 
345
345
  const activeQueries = new Map<string, AbortController>();
346
346
 
347
+ /** True while any one-shot startBlobyAgentQuery is in flight (cleared in a finally). These don't
348
+ * register as live conversations, so anyConversationBusy() can't see them. */
349
+ export function anyOneShotActive(): boolean {
350
+ return activeQueries.size > 0;
351
+ }
352
+
347
353
  /**
348
354
  * One-shot text query — used by customer WhatsApp + scheduler. Uses the
349
355
  * provider stream directly (no async queue), drains it, emits the same
@@ -59,6 +59,10 @@ export interface Harness {
59
59
  isConversationBusy(conversationId: string): boolean;
60
60
  /** True if ANY conversation in this harness is mid-turn (no id — used to defer backend restarts). */
61
61
  anyConversationBusy(): boolean;
62
+ /** True if ANY one-shot query (startBlobyAgentQuery: pulse/cron, customer WhatsApp) is in flight.
63
+ * These do NOT register as live conversations, so anyConversationBusy() can't see them — the
64
+ * supervisor ORs this in so a queued self-update / backend restart defers past one-shot turns too. */
65
+ anyOneShotActive(): boolean;
62
66
  stopSubAgentTask(conversationId: string, taskId: string): Promise<void>;
63
67
  warmUpForLiveConversation(
64
68
  model: string,
@@ -11,12 +11,13 @@ import { log } from '../shared/logger.js';
11
11
  import { startTunnel, stopTunnel, isTunnelAlive, restartTunnel, startNamedTunnel, restartNamedTunnel } from './tunnel.js';
12
12
  import { createWorkerApp } from '../worker/index.js';
13
13
  import { closeDb, getSession, getSetting } from '../worker/db.js';
14
- import { spawnBackend, stopBackend, restartBackend, getBackendPort, isBackendAlive, isBackendStopping, isBackendDead, readBackendLogTail, setBackendEnv, setBackendGiveUpHandler } from './backend.js';
14
+ import { spawnBackend, stopBackend, restartBackend, getBackendPort, isBackendAlive, isBackendStopping, isBackendDead, readBackendLogTail, setBackendEnv, setBackendGiveUpHandler, probeBackendReady, backendJustSpawned } from './backend.js';
15
+ import { appendFrontendLog, tailFrontendLog, getFrontendLogCount, type FrontendLogKind } from './frontend-log.js';
15
16
  import { handleAgentQuery, type AgentQueryRequest } from './agent-api.js';
16
17
  import { updateTunnelUrl, startHeartbeat, stopHeartbeat, disconnect } from '../shared/relay.js';
17
18
  import {
18
19
  startConversation, hasConversation, endConversation, endAllConversations,
19
- isConversationBusy, anyConversationBusy, stopSubAgentTask,
20
+ isConversationBusy, anyConversationBusy, anyOneShotActive, stopSubAgentTask,
20
21
  startBlobyAgentQuery, stopBlobyAgentQuery,
21
22
  warmUpForLiveConversation,
22
23
  type RecentMessage,
@@ -44,6 +45,24 @@ process.on('unhandledRejection', (reason) => {
44
45
  const DIST_BLOBY = path.join(PKG_DIR, 'dist-bloby');
45
46
  const SUPERVISOR_PUBLIC = path.join(PKG_DIR, 'supervisor', 'public');
46
47
 
48
+ // Self-update coordination. The marker persists a queued update across a supervisor restart that
49
+ // happens between the agent's request and the turn-complete flush (in-memory pendingUpdate alone
50
+ // would be lost). attempts + a TTL bound the boot-resume retry so a persistently-failing update
51
+ // can't loop on every boot. See queueUpdate/flushPendingUpdate/runDeferredUpdate.
52
+ const UPDATE_MARKER = path.join(DATA_DIR, '.update-pending');
53
+ const UPDATE_MAX_ATTEMPTS = 2;
54
+ const UPDATE_MARKER_TTL_MS = 30 * 60_000; // 30 min — a marker older than this is cleared, not retried
55
+
56
/**
 * Decides whether a request is one of the local agent's loopback calls to the
 * /__bloby/control/* and channel-mutation endpoints.
 *
 * Same trust model as the channel mutation guard: cloudflared forwards over loopback, so the
 * socket address alone proves nothing behind the relay. A request is therefore accepted only when
 * it arrives from a loopback address AND carries neither of cloudflared's tunnel-origin headers
 * (cf-connecting-ip, cf-ray).
 */
function isLoopbackAgentReq(req: http.IncomingMessage): boolean {
  const remote = req.socket.remoteAddress || '';
  const fromLoopback = ['127.0.0.1', '::1', '::ffff:127.0.0.1'].includes(remote);
  if (!fromLoopback) return false;

  const viaTunnel = Boolean(req.headers['cf-connecting-ip'] || req.headers['cf-ray']);
  return !viaTunnel;
}
65
+
47
66
  // Proactive context recycling. The chat runs as one long-lived agent session per
48
67
  // conversation (so the user can keep talking while the agent works). That session's
49
68
  // context grows every turn and would eventually hit the wall. But continuity does NOT
@@ -390,6 +409,12 @@ export async function startSupervisor() {
390
409
  const internalSecret = crypto.randomBytes(16).toString('hex');
391
410
  const agentSecret = crypto.randomBytes(32).toString('hex');
392
411
 
412
+ // Expose the supervisor's own HTTP port to EVERY child subprocess via our own process.env —
413
+ // notably the agent harness (claude/codex/pi all spread ...process.env), whose Bash tool curls
414
+ // the /__bloby/control/* surface as http://127.0.0.1:$SUPERVISOR_PORT/... . Previously this was
415
+ // injected ONLY into the backend (setBackendEnv below), so the agent had no reliable port var.
416
+ process.env.SUPERVISOR_PORT = String(config.port);
417
+
393
418
  // Inject agent secret + supervisor port into workspace backend env
394
419
  setBackendEnv({
395
420
  BLOBY_AGENT_SECRET: agentSecret,
@@ -620,6 +645,161 @@ export async function startSupervisor() {
620
645
  return;
621
646
  }
622
647
 
648
+ // ── Agent control surface (/__bloby/control/*) ──────────────────────────────────────────────
649
+ // The Bloby agent drives backend restarts, self-update, and log tails through these endpoints
650
+ // instead of the old lossy fs.watch trigger files (.restart/.update). Every call returns a
651
+ // SYNCHRONOUS JSON ack — that explicit acknowledgment is the reliability fix (no silent drops).
652
+ // The agent curls http://127.0.0.1:$SUPERVISOR_PORT/__bloby/control/... (SUPERVISOR_PORT is
653
+ // injected into its env). All routes are loopback-only (same cf-reject guard as the channel
654
+ // mutations) so they are NEVER reachable over the public tunnel — EXCEPT fe-log, which the
655
+ // user's browser posts to (write-only, capped). Served here, before auth and the Vite catch-all,
656
+ // so they answer even when the backend/Vite are down.
657
+ if (req.url?.startsWith('/__bloby/control/')) {
658
+ const ctlPath = req.url.split('?')[0];
659
+ const ctlQuery = new URLSearchParams(req.url.split('?')[1] || '');
660
+
661
+ // POST /__bloby/control/fe-log — browser → supervisor frontend-error ingest. NOT loopback-
662
+ // gated (the workspace-guard posts from the user's browser, possibly over the tunnel).
663
+ // Write-only, size-capped, no read-back, no side effects → worst case is capped log spam.
664
+ if (ctlPath === '/__bloby/control/fe-log' && req.method === 'POST') {
665
+ let feBody = '';
666
+ let feTooBig = false;
667
+ // Cap by DROPPING the payload once oversize (not req.destroy(), which fires neither 'end' nor
668
+ // 'error' → the response would never be sent). Keep reading to a clean 'end' and 204 always.
669
+ req.on('data', (chunk: Buffer) => {
670
+ if (feTooBig) return;
671
+ feBody += chunk.toString();
672
+ if (feBody.length > 16_384) { feTooBig = true; feBody = ''; }
673
+ });
674
+ req.on('end', () => {
675
+ if (!feTooBig) {
676
+ try {
677
+ const parsed = JSON.parse(feBody);
678
+ const entries = Array.isArray(parsed?.entries) ? parsed.entries.slice(-40) : [];
679
+ const allowed = ['error', 'unhandledrejection', 'console.error', 'console.warn', 'vite-overlay'];
680
+ for (const e of entries) {
681
+ if (e && typeof e.text === 'string') {
682
+ const kind = (allowed.includes(e.kind) ? e.kind : 'error') as FrontendLogKind;
683
+ appendFrontendLog(kind, e.text, typeof e.stack === 'string' ? e.stack : undefined);
684
+ }
685
+ }
686
+ } catch {}
687
+ }
688
+ try { res.writeHead(204); res.end(); } catch {}
689
+ });
690
+ req.on('error', () => { try { res.writeHead(204); res.end(); } catch {} });
691
+ return;
692
+ }
693
+
694
+ // Every other control route is loopback-only (agent-driven, can restart/update the instance).
695
+ if (!isLoopbackAgentReq(req)) {
696
+ res.writeHead(403, { 'Content-Type': 'application/json' });
697
+ res.end(JSON.stringify({ ok: false, error: 'This control endpoint is localhost-only.' }));
698
+ return;
699
+ }
700
+ res.setHeader('Cache-Control', 'no-store');
701
+
702
+ // GET /__bloby/control/logs/backend?lines=N[&prev=1] — tail the current (or last-crashed) run.
703
+ if (ctlPath === '/__bloby/control/logs/backend' && req.method === 'GET') {
704
+ const lines = Math.max(1, Math.min(1000, parseInt(ctlQuery.get('lines') || '100', 10) || 100));
705
+ const prev = ctlQuery.get('prev') === '1';
706
+ res.writeHead(200, { 'Content-Type': 'application/json' });
707
+ res.end(JSON.stringify({ ok: true, lines, prev, justRestarted: backendJustSpawned(), log: readBackendLogTail(lines, prev) }));
708
+ return;
709
+ }
710
+
711
+ // GET /__bloby/control/logs/frontend?lines=N — runtime + console + Vite-compile frontend errors.
712
+ if (ctlPath === '/__bloby/control/logs/frontend' && req.method === 'GET') {
713
+ const lines = Math.max(1, Math.min(1000, parseInt(ctlQuery.get('lines') || '100', 10) || 100));
714
+ res.writeHead(200, { 'Content-Type': 'application/json' });
715
+ res.end(JSON.stringify({ ok: true, lines, entries: getFrontendLogCount(), log: tailFrontendLog(lines) }));
716
+ return;
717
+ }
718
+
719
+ // GET /__bloby/control/update-status — is a queued update running / did it fail?
720
+ if (ctlPath === '/__bloby/control/update-status' && req.method === 'GET') {
721
+ res.writeHead(200, { 'Content-Type': 'application/json' });
722
+ res.end(JSON.stringify({ ok: true, ...getUpdateStatus() }));
723
+ return;
724
+ }
725
+
726
+ // POST /__bloby/control/update — queue a self-update (acknowledged, idempotent, deferred).
727
+ if (ctlPath === '/__bloby/control/update' && req.method === 'POST') {
728
+ const r = queueUpdate();
729
+ res.writeHead(200, { 'Content-Type': 'application/json' });
730
+ res.end(JSON.stringify({
731
+ ok: true,
732
+ queued: r.queued || r.alreadyQueued,
733
+ alreadyQueued: r.alreadyQueued,
734
+ retrying: r.retrying,
735
+ deferred: r.deferred,
736
+ message: r.alreadyQueued
737
+ ? 'An update is already queued or running.'
738
+ : r.retrying
739
+ ? 'A previous update attempt failed — re-queued; it retries after your turn ends. Check update-status (state:failed exposes the prior error in logTail).'
740
+ : 'Update queued — it runs after your turn ends. You will NOT die mid-turn; finish your turn normally. The page is unresponsive ~1–2 min while Bloby restarts on the new version. Check update-status after.',
741
+ }));
742
+ return;
743
+ }
744
+
745
// POST /__bloby/control/restart-backend { wait?:bool=true, timeoutMs?:num=15000, logLines?:num=60 }
// Restarts the backend through the existing serialized doRestart() funnel and (when wait) blocks
// until the backend's PORT is listening — so the agent can restart-and-verify WITHIN its turn.
// Contract: every request gets exactly one JSON ack, whatever the body contains.
//   - timeoutMs is clamped to 1000..30000 ms.
//   - logLines is clamped to 0..400; an explicit 0 omits `logs` from the reply.
if (ctlPath === '/__bloby/control/restart-backend' && req.method === 'POST') {
  let rbBody = '';
  let rbTooBig = false;

  // Single place that writes the ack; never throws and never writes twice.
  const sendJson = (status: number, payload: Record<string, unknown>): void => {
    try {
      if (res.headersSent) return;
      res.writeHead(status, { 'Content-Type': 'application/json' });
      res.end(JSON.stringify(payload));
    } catch {}
  };

  // Cap by dropping the payload (not req.destroy(), which would fire neither 'end' nor 'error'
  // → no JSON ack ever sent, violating the endpoint contract). Always answer from 'end'.
  req.on('data', (chunk: Buffer) => {
    if (rbTooBig) return;
    rbBody += chunk.toString();
    if (rbBody.length > 4096) { rbTooBig = true; rbBody = ''; }
  });

  req.on('end', async () => {
    if (rbTooBig) {
      sendJson(413, { ok: false, error: 'Request body too large.' });
      return;
    }

    // Accept only a JSON object. A body of `null` used to make `rbOpts.wait` throw inside this
    // async listener, which rejected silently and left the caller without any response.
    let rbOpts: Record<string, unknown> = {};
    try {
      const parsed: unknown = rbBody ? JSON.parse(rbBody) : {};
      if (parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed)) {
        rbOpts = parsed as Record<string, unknown>;
      }
    } catch {}

    const wait = rbOpts.wait !== false; // default true
    const timeoutMs = Math.max(1000, Math.min(30_000, Number(rbOpts.timeoutMs) || 15_000));

    // Honour an explicit 0 ("no logs"): `Number(x) || 60` silently turned 0 back into 60. Flooring
    // also stops a fractional value < 1 from reaching slice(-0), which returns the entire log.
    const rawLogLines = rbOpts.logLines;
    const hasLogLines = typeof rawLogLines === 'number' || (typeof rawLogLines === 'string' && rawLogLines.trim() !== '');
    const wantedLogLines = hasLogLines ? Number(rawLogLines) : NaN;
    const logLines = Number.isFinite(wantedLogLines) ? Math.max(0, Math.min(400, Math.floor(wantedLogLines))) : 60;

    const wasDead = isBackendDead(); // had crash-looped & given up BEFORE this explicit restart
    const started = Date.now();
    let restarted = false;
    try {
      await doRestart(); // resetBackendRestarts + serialized stop→spawn; preserves all invariants
      restarted = true;
      const listening = wait ? await probeBackendReady(backendPort, timeoutMs) : isBackendAlive();
      const gaveUp = isBackendDead();
      sendJson(200, {
        ok: true,
        restarted: true,
        healthy: listening && !gaveUp,
        listening,
        gaveUp,
        wasDead,
        // If it gave up AGAIN, restarting won't help — tell the agent to fix the code, not re-restart.
        hint: gaveUp ? 'Backend crash-looped and gave up again — restarting will not fix it. Read the logs and fix the code.' : undefined,
        waitedMs: Date.now() - started,
        logs: logLines ? readBackendLogTail(logLines) : undefined,
      });
    } catch (err: unknown) {
      // Failures are still acknowledged with a 200 + ok:false body, as before.
      const message = String((err instanceof Error && err.message) || err);
      sendJson(200, { ok: false, restarted, error: message });
    }
  });

  req.on('error', () => { try { if (!res.headersSent) { res.writeHead(500); res.end(); } } catch {} });
  return;
}
797
+
798
+ res.writeHead(404, { 'Content-Type': 'application/json' });
799
+ res.end(JSON.stringify({ ok: false, error: 'Unknown control endpoint.' }));
800
+ return;
801
+ }
802
+
623
803
  // App API routes → proxy to user's backend server
624
804
  if (req.url?.startsWith('/app/api')) {
625
805
  const backendPath = req.url.replace(/^\/app/, '');
@@ -2851,12 +3031,7 @@ ${alreadyLinked ? '' : `
2851
3031
  log.info('[orchestrator] Restarting backend (file tools used / pending watcher change)');
2852
3032
  void doRestart();
2853
3033
  }
2854
- if (pendingUpdate) {
2855
- pendingUpdate = false;
2856
- log.info('[orchestrator] Ending conversation before update...');
2857
- endConversation(convId);
2858
- runDeferredUpdate();
2859
- }
3034
+ flushPendingUpdate(); // run a queued self-update now that this dashboard turn has ended
2860
3035
 
2861
3036
  // Proactive session recycling (see CONTEXT_RECYCLE_TOKENS). Only when the
2862
3037
  // harness reports the session idle (no queued message) — and this handler runs
@@ -2883,6 +3058,9 @@ ${alreadyLinked ? '' : `
2883
3058
  currentStreamConvId = null;
2884
3059
  currentStreamBuffer = '';
2885
3060
  channelManager.clearRoutes(convId);
3061
+ // A turn that ended by exception/recycle (not a clean bot:turn-complete) must still flush a
3062
+ // queued self-update — otherwise it'd wait for the next turn/reboot. Self-defers + idempotent.
3063
+ flushPendingUpdate();
2886
3064
  return;
2887
3065
  }
2888
3066
 
@@ -3408,16 +3586,90 @@ ${alreadyLinked ? '' : `
3408
3586
  // Track whether an agent is actively processing — file watcher defers restarts during active turns
3409
3587
  let agentQueryActive = false;
3410
3588
  let pendingBackendRestart = false; // Set when file watcher fires during agent turn
3411
- let pendingUpdate = false; // Set when .update file is created during agent turn
3589
+ let pendingUpdate = false; // An update is queued; runs at the next turn-complete (flushPendingUpdate)
3590
+ let updateInProgress = false; // The update child has actually been spawned — idempotency guard
3412
3591
 
3413
3592
  // Note: with live conversations, agentQueryActive is true while the agent processes a message
3414
3593
  // and false when it's idle (waiting for next message). The live conversation stays alive between messages.
3415
3594
 
3416
- // Run bloby update as a child process.
3417
- // BLOBY_SELF_UPDATE=1 tells bin/cli.js to skip daemon stop/restart —
3418
- // the supervisor exits after the update finishes, and systemd (Restart=on-failure)
3419
- // or launchd (KeepAlive.SuccessfulExit=false) restarts us with the new code.
3595
+ // ── Self-update marker (persists a queued update across a supervisor restart in the request→flush
3596
+ // window) ──────────────────────────────────────────────────────────────────────────────────
3597
+ function readUpdateMarker(): { queuedAt: number; attempts: number } | null {
3598
+ try {
3599
+ const m = JSON.parse(fs.readFileSync(UPDATE_MARKER, 'utf-8'));
3600
+ if (m && typeof m.queuedAt === 'number') return { queuedAt: m.queuedAt, attempts: Number(m.attempts) || 0 };
3601
+ } catch {}
3602
+ return null;
3603
+ }
3604
+ function writeUpdateMarker(m: { queuedAt: number; attempts: number }): void {
3605
+ try { fs.writeFileSync(UPDATE_MARKER, JSON.stringify(m)); } catch {}
3606
+ }
3607
+ function clearUpdateMarker(): void {
3608
+ try { fs.unlinkSync(UPDATE_MARKER); } catch {}
3609
+ }
3610
+
3611
+ /** Queue a self-update. Acknowledged + idempotent (the core fix vs the old fire-and-forget
3612
+ * `touch .update`). The update RUNS at the next turn-complete so the agent's current turn finishes
3613
+ * first (it does NOT die mid-turn). When truly idle it flushes on the next tick. */
3614
+ function queueUpdate(): { queued: boolean; alreadyQueued: boolean; deferred: boolean; retrying: boolean } {
3615
+ if (updateInProgress) return { queued: false, alreadyQueued: true, deferred: false, retrying: false };
3616
+ const marker = readUpdateMarker();
3617
+ const alreadyQueued = pendingUpdate; // genuinely already waiting to run
3618
+ const retrying = !pendingUpdate && !!marker && marker.attempts > 0; // a prior attempt failed; re-queue it
3619
+ pendingUpdate = true;
3620
+ if (!marker) writeUpdateMarker({ queuedAt: Date.now(), attempts: 0 });
3621
+ flushPendingUpdate(); // self-defers — runs now only if nothing is mid-turn
3622
+ return { queued: true, alreadyQueued, deferred: aTurnIsActive(), retrying };
3623
+ }
3624
+
3625
+ /** Run the queued update once NO turn is active on any surface. Deferred one tick so the
3626
+ * just-completed turn's in-flight flags (agentQueryActive / conv.busy / activeQueries) have
3627
+ * cleared first: that lets the completing turn's OWN queued update fire, while still never tearing
3628
+ * down a concurrent dashboard / channel / one-shot turn. If something else is still active it
3629
+ * stays pending and re-fires at the next turn-complete or boot-resume (idempotent, marker-backed). */
3630
+ function flushPendingUpdate(): void {
3631
+ if (!pendingUpdate || updateInProgress) return;
3632
+ setImmediate(() => {
3633
+ if (!pendingUpdate || updateInProgress || aTurnIsActive()) return;
3634
+ pendingUpdate = false;
3635
+ try { for (const cid of Array.from(clientConvs.values())) if (hasConversation(cid)) endConversation(cid); } catch {}
3636
+ runDeferredUpdate();
3637
+ });
3638
+ }
3639
+
3640
+ /** Status for GET /__bloby/control/update-status — lets the agent confirm a queued update actually
3641
+ * ran / read update.log on failure (a successful update ends in process.exit + daemon restart, so
3642
+ * the agent sees a connection drop then a new version on reconnect). */
3643
+ function getUpdateStatus(): { state: 'idle' | 'queued' | 'running' | 'failed'; attempts: number; logTail: string } {
3644
+ const marker = readUpdateMarker();
3645
+ let state: 'idle' | 'queued' | 'running' | 'failed';
3646
+ if (updateInProgress) state = 'running';
3647
+ else if (pendingUpdate) state = 'queued';
3648
+ else if (marker && marker.attempts > 0) state = 'failed'; // a prior attempt failed; retries at next turn/boot
3649
+ else if (marker) state = 'queued';
3650
+ else state = 'idle';
3651
+ let logTail = '';
3652
+ try { logTail = fs.readFileSync(path.join(DATA_DIR, 'update.log'), 'utf-8').split('\n').slice(-60).join('\n').trim(); } catch {}
3653
+ return { state, attempts: marker?.attempts ?? 0, logTail };
3654
+ }
3655
+
3656
+ // Run bloby update as a child process. BLOBY_SELF_UPDATE=1 tells bin/cli.js to skip daemon
3657
+ // stop/restart — the supervisor exits after the update finishes, and systemd (Restart=on-failure)
3658
+ // or launchd (KeepAlive.SuccessfulExit=false) restarts us with the new code. The marker's attempts
3659
+ // counter bounds retries (the TTL is enforced only on the boot-resume path so it can't strand a
3660
+ // legit update queued early in a >TTL-long turn).
3420
3661
  function runDeferredUpdate() {
3662
+ if (updateInProgress) { log.info('Update already in progress — skipping duplicate trigger'); return; }
3663
+ const marker = readUpdateMarker() || { queuedAt: Date.now(), attempts: 0 };
3664
+ if (marker.attempts >= UPDATE_MAX_ATTEMPTS) {
3665
+ log.error(`Self-update failed ${marker.attempts}× — giving up. Run \`bloby update\` manually or check ${path.join(DATA_DIR, 'update.log')}`);
3666
+ clearUpdateMarker();
3667
+ try { broadcastBloby('backend:failed', { message: 'Self-update failed repeatedly. Ask your human to run `bloby update`.' }); } catch {}
3668
+ return;
3669
+ }
3670
+ updateInProgress = true;
3671
+ writeUpdateMarker({ queuedAt: marker.queuedAt, attempts: marker.attempts + 1 });
3672
+
3421
3673
  const cliPath = path.join(PKG_DIR, 'bin', 'cli.js');
3422
3674
  const updateLog = path.join(DATA_DIR, 'update.log');
3423
3675
  log.info('Deferred update triggered — running bloby update...');
@@ -3430,17 +3682,38 @@ ${alreadyLinked ? '' : `
3430
3682
  child.on('exit', (code) => {
3431
3683
  try { fs.closeSync(logFd); } catch {}
3432
3684
  if (code === 0) {
3685
+ clearUpdateMarker(); // success (updated or already-latest) — don't re-run on the next boot
3433
3686
  log.ok('Update completed — restarting with new version...');
3434
- process.exit(1); // non-zero triggers daemon manager to restart us
3687
+ process.exit(1); // non-zero triggers daemon manager to restart us onto the new code
3435
3688
  } else {
3436
- log.error(`Update process exited with code ${code} see ${updateLog}`);
3689
+ // Leave the marker so the next boot retries (bounded by attempts); allow another flush now.
3690
+ updateInProgress = false;
3691
+ log.error(`Update process exited with code ${code} — see ${updateLog}. Will retry on next restart (attempt ${marker.attempts + 1}/${UPDATE_MAX_ATTEMPTS}).`);
3437
3692
  }
3438
3693
  });
3694
+ child.on('error', (err) => {
3695
+ try { fs.closeSync(logFd); } catch {}
3696
+ updateInProgress = false;
3697
+ log.error(`Update process failed to start: ${err.message}`);
3698
+ });
3439
3699
  } catch (err) {
3700
+ updateInProgress = false;
3440
3701
  log.error(`Deferred update failed: ${err instanceof Error ? err.message : err}`);
3441
3702
  }
3442
3703
  }
3443
3704
 
3705
+ /** On boot, resume an update that was queued but never ran (supervisor died in the request→flush
3706
+ * window). Safe to auto-run: bin/cli.js update version-checks and no-ops if already latest, and
3707
+ * the marker's TTL + attempts cap prevent a restart loop. */
3708
+ function resumePendingUpdateOnBoot(): void {
3709
+ const marker = readUpdateMarker();
3710
+ if (!marker) return;
3711
+ if (Date.now() - marker.queuedAt > UPDATE_MARKER_TTL_MS) { clearUpdateMarker(); return; }
3712
+ log.info('Found a pending update from before restart — resuming...');
3713
+ pendingUpdate = true;
3714
+ flushPendingUpdate(); // no active turn at boot
3715
+ }
3716
+
3444
3717
  // Tell the live chat when the backend gives up — the dashboard interstitial covers page loads,
3445
3718
  // but an already-open chat client gets an explicit event it can surface ("ask me to fix the backend").
3446
3719
  setBackendGiveUpHandler(() => {
@@ -3456,6 +3729,7 @@ ${alreadyLinked ? '' : `
3456
3729
  workerApi,
3457
3730
  restartBackend: () => doRestart(),
3458
3731
  getModel: () => loadConfig().ai.model,
3732
+ onTurnComplete: () => { if (pendingBackendRestart) void doRestart(); flushPendingUpdate(); }, // flush a deferred backend restart + queued self-update after a pulse/cron turn
3459
3733
  });
3460
3734
 
3461
3735
  // Initialize channel manager (WhatsApp, Telegram, etc.)
@@ -3464,6 +3738,7 @@ ${alreadyLinked ? '' : `
3464
3738
  workerApi,
3465
3739
  restartBackend: () => doRestart(),
3466
3740
  getModel: () => loadConfig().ai.model,
3741
+ onTurnComplete: () => { if (pendingBackendRestart) void doRestart(); flushPendingUpdate(); }, // flush a deferred backend restart + queued self-update after a channel turn
3467
3742
  });
3468
3743
 
3469
3744
  // Broadcast channel status changes to all connected chat clients
@@ -3506,11 +3781,12 @@ ${alreadyLinked ? '' : `
3506
3781
  return restartBackend(backendPort);
3507
3782
  }
3508
3783
 
3509
- /** True while any surface is mid-turn. Dashboard chat sets agentQueryActive; WhatsApp/Alexa
3510
- * turns instead set the harness conv.busy (they don't touch agentQueryActive), so we must
3511
- * check both — otherwise an agent editing the backend over a channel would get the backend
3512
- * restarted out from under it mid-turn. */
3513
- const aTurnIsActive = () => agentQueryActive || anyConversationBusy();
3784
+ /** True while any surface is mid-turn. Dashboard chat sets agentQueryActive; WhatsApp/Alexa live
3785
+ * turns set the harness conv.busy; pulse/cron + customer-WhatsApp ONE-SHOT turns set neither
3786
+ * (they live in the harness activeQueries map) so we check all three. Otherwise an agent
3787
+ * editing the backend over any of these surfaces, or queuing a self-update from one, would get
3788
+ * the backend restarted / the supervisor exited out from under it mid-turn. */
3789
+ const aTurnIsActive = () => agentQueryActive || anyConversationBusy() || anyOneShotActive();
3514
3790
 
3515
3791
  function scheduleBackendRestart(reason: string) {
3516
3792
  if (aTurnIsActive()) {
@@ -3577,26 +3853,19 @@ ${alreadyLinked ? '' : `
3577
3853
  scheduleBackendRestart(`workspace dependencies changed (${filename})`);
3578
3854
  }
3579
3855
  if (filename === '.restart') {
3580
- // Consume the trigger file
3856
+ // DEPRECATED fallback — agents now use POST /__bloby/control/restart-backend (synchronous ack,
3857
+ // no lossy fs.watch). Kept so a human/external script touching .restart still works.
3581
3858
  try { fs.unlinkSync(path.join(workspaceDir, '.restart')); } catch {}
3582
- scheduleBackendRestart('.restart trigger');
3859
+ scheduleBackendRestart('.restart trigger (deprecated)');
3583
3860
  }
3584
3861
  if (filename === '.update') {
3585
- // Consume the trigger file
3862
+ // DEPRECATED fallback — agents now use POST /__bloby/control/update (acknowledged + idempotent).
3863
+ // Route through queueUpdate(), which carries every fix the old inline path lacked: the
3864
+ // idempotency guard (the watcher's own unlink re-fires this event → double-spawn), the
3865
+ // aTurnIsActive() gate (was agentQueryActive-only → fired mid-turn on pulse/channel turns), the
3866
+ // persisted marker, and the all-surface turn-complete flush.
3586
3867
  try { fs.unlinkSync(path.join(workspaceDir, '.update')); } catch {}
3587
- if (agentQueryActive) {
3588
- pendingUpdate = true;
3589
- log.info('Update requested — deferring until agent turn ends');
3590
- } else {
3591
- // End any live conversations before updating
3592
- for (const cid of Array.from(clientConvs.values())) {
3593
- if (hasConversation(cid)) {
3594
- log.info(`[update] Ending conversation ${cid} before update`);
3595
- endConversation(cid);
3596
- }
3597
- }
3598
- runDeferredUpdate();
3599
- }
3868
+ queueUpdate();
3600
3869
  }
3601
3870
  }
3602
3871
 
@@ -3619,6 +3888,9 @@ ${alreadyLinked ? '' : `
3619
3888
  armBackendWatcher();
3620
3889
  armWorkspaceWatcher();
3621
3890
 
3891
+ // Resume a self-update that was queued but never ran (supervisor died in the request→flush window).
3892
+ resumePendingUpdateOnBoot();
3893
+
3622
3894
  // WebSocket liveness heartbeat — ping the app + chat WS clients every 30s and terminate any
3623
3895
  // that missed the previous pong (half-open sockets that never fired 'close'). Terminating fires
3624
3896
  // 'close', which runs the existing map/subscription cleanup. Scoped to our two WSS only (Vite's
@@ -34,6 +34,8 @@ interface SchedulerOpts {
34
34
  workerApi: (path: string, method?: string, body?: any) => Promise<any>;
35
35
  restartBackend: () => void;
36
36
  getModel: () => string;
37
+ /** Fired after a pulse/cron turn ends — the supervisor uses it to flush a queued self-update. */
38
+ onTurnComplete?: () => void;
37
39
  }
38
40
 
39
41
  // State
@@ -120,7 +122,7 @@ function cronMatchesNow(schedule: string): boolean {
120
122
 
121
123
  function triggerAgent(prompt: string, label: string, onComplete?: () => void) {
122
124
  if (!schedulerOpts) return;
123
- const { broadcastBloby, workerApi, restartBackend, getModel } = schedulerOpts;
125
+ const { broadcastBloby, workerApi, restartBackend, getModel, onTurnComplete } = schedulerOpts;
124
126
  const timestamp = Date.now();
125
127
  const convId = label.startsWith('pulse') ? `pulse-${timestamp}` : `cron-${label}-${timestamp}`;
126
128
  const model = getModel();
@@ -230,6 +232,7 @@ function triggerAgent(prompt: string, label: string, onComplete?: () => void) {
230
232
  log.info(`[scheduler] File tools used — restarting backend`);
231
233
  restartBackend();
232
234
  }
235
+ onTurnComplete?.(); // flush a queued self-update now this pulse/cron turn has ended
233
236
  onComplete?.();
234
237
  }
235
238
 
@@ -1,11 +1,37 @@
1
- import { createServer as createViteServer, type ViteDevServer } from 'vite';
1
+ import { createServer as createViteServer, createLogger, type ViteDevServer } from 'vite';
2
2
  import type http from 'http';
3
3
  import path from 'path';
4
4
  import { PKG_DIR } from '../shared/paths.js';
5
5
  import { log } from '../shared/logger.js';
6
+ import { appendFrontendLog } from './frontend-log.js';
6
7
 
7
8
  let dashboardVite: ViteDevServer | null = null;
8
9
 
10
+ const stripAnsi = (s: string) => String(s).replace(/\x1b\[[0-9;]*m/g, '');
11
+
12
+ /** A Vite logger that mirrors error/warn to stdout (preserving the human-facing boot/HMR logs) AND
13
+ * captures them into the server-side frontend ring, so GET /__bloby/control/logs/frontend surfaces
14
+ * COMPILE/transform errors even when the browser never ran a line of JS (hard compile failure). */
15
+ function makeCaptureLogger() {
16
+ const logger = createLogger('info');
17
+ const origError = logger.error.bind(logger);
18
+ const origWarn = logger.warn.bind(logger);
19
+ const origWarnOnce = logger.warnOnce.bind(logger);
20
+ logger.error = (msg, opts) => {
21
+ try { appendFrontendLog('vite-error', stripAnsi(msg), opts?.error?.stack ? stripAnsi(opts.error.stack) : undefined); } catch {}
22
+ origError(msg, opts);
23
+ };
24
+ logger.warn = (msg, opts) => {
25
+ try { appendFrontendLog('vite-warn', stripAnsi(msg)); } catch {}
26
+ origWarn(msg, opts);
27
+ };
28
+ logger.warnOnce = (msg, opts) => {
29
+ try { appendFrontendLog('vite-warn', stripAnsi(msg)); } catch {}
30
+ origWarnOnce(msg, opts);
31
+ };
32
+ return logger;
33
+ }
34
+
9
35
  export async function startViteDevServers(supervisorPort: number, hmrServer: http.Server): Promise<{ dashboard: number }> {
10
36
  const ports = {
11
37
  dashboard: supervisorPort + 2,
@@ -25,7 +51,9 @@ export async function startViteDevServers(supervisorPort: number, hmrServer: htt
25
51
  // so it works both locally (localhost:3000) and through the relay (riven.bloby.bot:443).
26
52
  hmr: { server: hmrServer },
27
53
  },
28
- logLevel: 'info',
54
+ // customLogger captures compile/transform errors into the frontend ring (and still prints
55
+ // them); it supersedes logLevel, which Vite ignores when a customLogger is provided.
56
+ customLogger: makeCaptureLogger(),
29
57
  });
30
58
  await dashboardVite.listen();
31
59
  } catch (err) {
@@ -321,6 +321,71 @@
321
321
  hideStyle.textContent = 'vite-error-overlay{display:none!important}';
322
322
  (document.head || document.documentElement).appendChild(hideStyle);
323
323
 
324
+ /* ── 2a. Frontend error capture → supervisor ──────────────────────────────
325
+ Captures EVERY break class — runtime window.onerror, unhandledrejection, console.error/warn,
326
+ and the Vite compile overlay — into a ring. The ring backs (1) the "Copy error" button, so it
327
+ is never empty even for runtime/black-screen breaks (no Vite overlay), and (2) a debounced POST
328
+ to /__bloby/control/fe-log, so the agent's `…/control/logs/frontend` tail is never empty either.
329
+ Before this, only Vite's compile overlay was readable and the listeners captured nothing. */
330
+ var feRing = []; // last ~120 entries, for the Copy button
331
+ var feUnsent = []; // not-yet-POSTed entries
332
+ var feFlushTimer = null;
333
+ function safeStringify(o) { try { return JSON.stringify(o); } catch (e) { return String(o); } }
334
+ function pushFe(kind, text, stack) {
335
+ if (text == null) return;
336
+ text = String(text).slice(0, 4000).trim();
337
+ if (!text) return;
338
+ if (text.indexOf(VITE_SUPPRESS_MARK) !== -1) return; // benign HMR-reconnect marker, not an app error
339
+ var last = feRing[feRing.length - 1];
340
+ if (last && last.kind === kind && last.text === text) return; // collapse repeats (1.5s overlay tick)
341
+ var entry = { kind: kind, text: text, stack: stack ? String(stack).slice(0, 4000) : undefined, t: Date.now() };
342
+ feRing.push(entry); if (feRing.length > 120) feRing.shift();
343
+ feUnsent.push(entry);
344
+ if (!feFlushTimer) feFlushTimer = setTimeout(flushFe, 1000);
345
+ }
346
+ function flushFe() {
347
+ feFlushTimer = null;
348
+ if (!feUnsent.length) return;
349
+ var batch = feUnsent.splice(0, feUnsent.length);
350
+ try {
351
+ fetch('/__bloby/control/fe-log', {
352
+ method: 'POST', headers: { 'Content-Type': 'application/json' },
353
+ body: JSON.stringify({ entries: batch }), keepalive: true, cache: 'no-store',
354
+ }).catch(function () {});
355
+ } catch (e) {}
356
+ }
357
+ function feTail(n) {
358
+ return feRing.slice(-(n || 6)).map(function (e) {
359
+ return '[' + e.kind + '] ' + e.text + (e.stack ? '\n' + e.stack : '');
360
+ }).join('\n\n');
361
+ }
362
+ // Patch console.error/console.warn once — the ONLY source for the "frontend devtools" tail.
363
+ ['error', 'warn'].forEach(function (level) {
364
+ var orig = console[level];
365
+ if (typeof orig !== 'function' || orig.__blobyPatched) return;
366
+ var patched = function () {
367
+ try {
368
+ var parts = [];
369
+ for (var i = 0; i < arguments.length; i++) {
370
+ var a = arguments[i];
371
+ parts.push(a instanceof Error ? (a.message + (a.stack ? '\n' + a.stack : ''))
372
+ : (a && typeof a === 'object' ? safeStringify(a) : String(a)));
373
+ }
374
+ pushFe('console.' + level, parts.join(' '));
375
+ } catch (e) {}
376
+ return orig.apply(console, arguments);
377
+ };
378
+ patched.__blobyPatched = true;
379
+ console[level] = patched;
380
+ });
381
+
382
+ // Flush buffered errors before the page goes away — a runtime error very often immediately precedes
383
+ // a reload, and this guard itself reloads aggressively (backend-down poll, vite-stale recovery).
384
+ // keepalive (set in flushFe) lets the POST complete during unload, so the agent's frontend-log tail
385
+ // doesn't miss the error that broke the page.
386
+ window.addEventListener('pagehide', flushFe);
387
+ document.addEventListener('visibilitychange', function () { if (document.visibilityState === 'hidden') flushFe(); });
388
+
324
389
  var overlay = null;
325
390
  var dismissed = false;
326
391
  var lastErr = '';
@@ -359,7 +424,11 @@
359
424
 
360
425
  var copyBtn = d.querySelector('#__bloby_fe_copy');
361
426
  copyBtn.addEventListener('click', function () {
362
- var text = 'A screen in my app has a frontend build error. Find and fix the root cause. Error:\n\n' + (lastErr || '(no details captured)');
427
+ // Re-scrape Vite's overlay at click time (it may have populated since the overlay was built),
428
+ // then fall back to the captured ring — so runtime errors / black screens (no Vite overlay)
429
+ // still copy real text instead of "(no details captured)".
430
+ var detail = readViteError() || lastErr || feTail(6);
431
+ var text = 'A screen in my app has a frontend error. Find and fix the root cause. Error:\n\n' + (detail || '(no error text was captured — read the frontend logs to investigate)');
363
432
  function ok() { copyBtn.textContent = '✓ Copied — paste it to your agent'; setTimeout(function () { copyBtn.textContent = 'Copy error for your agent'; }, 2600); }
364
433
  function fb() { var ta = document.createElement('textarea'); ta.value = text; ta.style.position = 'fixed'; ta.style.opacity = '0'; document.body.appendChild(ta); ta.select(); try { document.execCommand('copy'); ok(); } catch (e) {} document.body.removeChild(ta); }
365
434
  if (navigator.clipboard && navigator.clipboard.writeText) navigator.clipboard.writeText(text).then(ok).catch(fb); else fb();
@@ -392,7 +461,7 @@
392
461
  function evaluate() {
393
462
  if (appLooksBroken()) {
394
463
  var err = readViteError();
395
- if (err) lastErr = err;
464
+ if (err) { lastErr = err; pushFe('vite-overlay', err); }
396
465
  if (!overlay && !dismissed) overlay = buildOverlay();
397
466
  } else {
398
467
  // App healthy (or recovered) — drop our overlay and re-arm for the next episode.
@@ -406,7 +475,19 @@
406
475
  // recovery; window errors flip sawError so real load failures surface fast.
407
476
  new MutationObserver(evaluate).observe(document.body, { childList: true });
408
477
  setInterval(evaluate, 1500);
409
- window.addEventListener('error', function () { sawError = true; evaluate(); });
478
+ window.addEventListener('error', function (e) {
479
+ // Capture the actual error text + stack. Runtime errors render NO Vite overlay, so before this
480
+ // they were completely invisible to the copy button / logs. Skip resource-load errors (e.target
481
+ // is an element, with no e.error/e.message) — they carry no useful app-error text.
482
+ try {
483
+ if (e && (e.error || e.message)) {
484
+ var emsg = e.message || (e.error && e.error.message) || 'Error';
485
+ if (e.filename) emsg += ' @ ' + e.filename + ':' + (e.lineno || 0) + ':' + (e.colno || 0);
486
+ pushFe('error', emsg, e.error && e.error.stack);
487
+ }
488
+ } catch (err) {}
489
+ sawError = true; evaluate();
490
+ });
410
491
  window.addEventListener('unhandledrejection', function (e) {
411
492
  // The reconnect-reload suppressor (job 5) throws a marker error inside Vite's HMR
412
493
  // dispatch; it surfaces here on every tunnel blip. Not an app error — swallow it.
@@ -414,6 +495,11 @@
414
495
  var msg = e && e.reason && e.reason.message;
415
496
  if (msg && String(msg).indexOf(VITE_SUPPRESS_MARK) !== -1) { e.preventDefault(); return; }
416
497
  } catch (err) {}
498
+ try {
499
+ var reason = e && e.reason;
500
+ var rtext = reason && (reason.message || reason);
501
+ pushFe('unhandledrejection', rtext != null ? String(rtext) : 'Unhandled promise rejection', reason && reason.stack);
502
+ } catch (err2) {}
417
503
  sawError = true; evaluate();
418
504
  });
419
505
  evaluate();
@@ -167,17 +167,16 @@ Notify your human only if importance is 7+ — otherwise log results silently.
167
167
 
168
168
  **Check version:** `cat ~/.bloby/VERSION` (current) vs `npm view bloby-bot version` (latest).
169
169
 
170
- **To update:** Create the trigger file `touch .update` — the supervisor runs the update after your turn ends. You will
171
- NOT die. Finish your turn normally.
170
+ **To update:** Run `curl -s -X POST http://127.0.0.1:${SUPERVISOR_PORT:-7400}/__bloby/control/update`. It returns `{"ok":true,"queued":true}` — that ack confirms the update is queued and WILL run after your turn ends. You will NOT die mid-turn. Finish your turn normally. Verify afterwards with `curl -s http://127.0.0.1:${SUPERVISOR_PORT:-7400}/__bloby/control/update-status`.
172
171
 
173
172
  **On PULSE:** Occasionally check for updates (not every pulse — once every few hours). If a new version exists:
174
173
  1. Read release notes: `npm view bloby-bot releaseNotes --json`
175
- 2. `touch .update`
174
+ 2. `curl -s -X POST http://127.0.0.1:${SUPERVISOR_PORT:-7400}/__bloby/control/update`
176
175
  3. Save to daily notes: "Updated from vX to vY" + notable changes to talk with your human later "Btw I updated myself this night"
177
176
 
178
177
  **When user asks to update:**
179
178
  1. Check version, if already latest say so
180
- 2. Otherwise read release notes, `touch .update`
179
+ 2. Otherwise read release notes, then `curl -s -X POST http://127.0.0.1:${SUPERVISOR_PORT:-7400}/__bloby/control/update`
181
180
  3. Tell your human what is new and that the page will be unresponsive for up to 2 minutes while this happens.
182
181
 
183
182
  ## Task Files — `tasks/`
@@ -584,7 +583,8 @@ When the dashboard shows a black screen or Vite logs an error, READ the error be
584
583
  - **`Error: ENOENT: no such file or directory, open '.../node_modules/<pkg>/...'`** → The package is **physically missing on disk**. The fix is `npm install <pkg>` — full stop. Clearing `.vite/deps/`, touching files, adding dummy deps, or restarting will NOT recreate the missing file.
585
584
  - **`Failed to resolve import "<pkg>"`** with no path in the error → Same diagnosis: package isn't installed. Run `npm install <pkg>`.
586
585
  - **Pre-bundling / optimizer errors that don't reference a missing path, OR errors that persist after dependency changes** → Vite's dep cache is stale. Clear it: `rm -rf workspace/node_modules/.vite` and reload. (This is the ONLY case where clearing the cache helps.)
587
- - **Backend crash loop** → Read `.backend.log`. Don't guess.
586
+ - **Backend crash loop** → `curl -s http://127.0.0.1:${SUPERVISOR_PORT:-7400}/__bloby/control/logs/backend?lines=200` (race-free; add `&prev=1` for the last crashed run). Don't guess.
587
+ - **Black screen / frontend runtime error with no obvious Vite message** → `curl -s http://127.0.0.1:${SUPERVISOR_PORT:-7400}/__bloby/control/logs/frontend?lines=100` — captured window errors, unhandled rejections, console.error/warn, and Vite overlay text (the same data behind the "Copy error for your agent" button).
588
588
 
589
589
  If you've tried a fix and the same error recurs, do NOT try a variation of the same fix. Re-diagnose from the error message, or stop and ask your human to restart bloby. See "Stop looping" below.
590
590
 
@@ -595,12 +595,17 @@ The supervisor manages the backend process. You don't need to manage it yourself
595
595
  **Auto-restart triggers (you don't need to do anything):**
596
596
  - Editing `.ts`, `.js`, or `.json` files in `backend/` → auto-restart
597
597
  - Editing `.env` → auto-restart with the new values
598
- - Creating a `.restart` file → force restart: `touch .restart` (file is auto-deleted)
599
598
  - After your turn ends, if you used Write or Edit tools → auto-restart
600
599
 
601
- **During your turn:** The backend does NOT restart mid-turn. All your edits are batched — the backend restarts once when you're done. This means if you're writing multi-file changes, everything applies atomically.
600
+ **During your turn (batched, atomic):** By default the backend does NOT auto-restart mid-turn — your edits are batched and applied together when the turn ends, so a multi-file change is never served half-written.
602
601
 
603
- **If the backend crashes:** It auto-restarts up to 3 times. If it keeps crashing, read `.backend.log` to see the error output, then fix the code. The log file is cleared on each restart so it only contains the current/last run — no need to truncate it yourself.
602
+ **Restart and verify WITHIN your turn (preferred when you need to test a fix):** After your backend edits are fully saved, run:
603
+ ```
604
+ curl -s -X POST http://127.0.0.1:${SUPERVISOR_PORT:-7400}/__bloby/control/restart-backend -d '{"wait":true}'
605
+ ```
606
+ It restarts the backend and BLOCKS until the port is healthy, then returns `{"ok":true,"healthy":true,"listening":true,"gaveUp":false,"logs":"..."}`. Now curl your own backend (e.g. `curl -s http://127.0.0.1:${SUPERVISOR_PORT:-7400}/app/api/...`) to confirm the fix — all in this turn. If `healthy:false` or `gaveUp:true`, read the returned `logs` and fix the code. Only restart AFTER your edits are saved.
607
+
608
+ **If the backend crashes:** It auto-restarts up to 3 times, then gives up. To see the error, run `curl -s http://127.0.0.1:${SUPERVISOR_PORT:-7400}/__bloby/control/logs/backend?lines=200` (race-free — even right after a bounce; add `&prev=1` to read the last *crashed* run if the current tail looks empty). Then fix the code.
604
609
 
605
610
  **NEVER do these:**
606
611
  - Never `kill` processes or run `pkill`/`killall` — you don't manage the supervisor or its children
@@ -167,17 +167,16 @@ Notify your human only if importance is 7+ — otherwise log results silently.
167
167
 
168
168
  **Check version:** `cat ~/.bloby/VERSION` (current) vs `npm view bloby-bot version` (latest).
169
169
 
170
- **To update:** Create the trigger file `touch .update` — the supervisor runs the update after your turn ends. You will
171
- NOT die. Finish your turn normally.
170
+ **To update:** Run `curl -s -X POST http://127.0.0.1:${SUPERVISOR_PORT:-7400}/__bloby/control/update`. It returns `{"ok":true,"queued":true}` — that ack confirms the update is queued and WILL run after your turn ends. You will NOT die mid-turn. Finish your turn normally. Verify afterwards with `curl -s http://127.0.0.1:${SUPERVISOR_PORT:-7400}/__bloby/control/update-status`.
172
171
 
173
172
  **On PULSE:** Occasionally check for updates (not every pulse — once every few hours). If a new version exists:
174
173
  1. Read release notes: `npm view bloby-bot releaseNotes --json`
175
- 2. `touch .update`
174
+ 2. `curl -s -X POST http://127.0.0.1:${SUPERVISOR_PORT:-7400}/__bloby/control/update`
176
175
  3. Save to daily notes: "Updated from vX to vY" + notable changes to talk with your human later "Btw I updated myself this night"
177
176
 
178
177
  **When user asks to update:**
179
178
  1. Check version, if already latest say so
180
- 2. Otherwise read release notes, `touch .update`
179
+ 2. Otherwise read release notes, then `curl -s -X POST http://127.0.0.1:${SUPERVISOR_PORT:-7400}/__bloby/control/update`
181
180
  3. Tell your human what is new and that the page will be unresponsive for up to 2 minutes while this happens.
182
181
 
183
182
  ## Task Files — `tasks/`
@@ -584,7 +583,8 @@ When the dashboard shows a black screen or Vite logs an error, READ the error be
584
583
  - **`Error: ENOENT: no such file or directory, open '.../node_modules/<pkg>/...'`** → The package is **physically missing on disk**. The fix is `npm install <pkg>` — full stop. Clearing `.vite/deps/`, touching files, adding dummy deps, or restarting will NOT recreate the missing file.
585
584
  - **`Failed to resolve import "<pkg>"`** with no path in the error → Same diagnosis: package isn't installed. Run `npm install <pkg>`.
586
585
  - **Pre-bundling / optimizer errors that don't reference a missing path, OR errors that persist after dependency changes** → Vite's dep cache is stale. Clear it: `rm -rf workspace/node_modules/.vite` and reload. (This is the ONLY case where clearing the cache helps.)
587
- - **Backend crash loop** → Read `.backend.log`. Don't guess.
586
+ - **Backend crash loop** → `curl -s "http://127.0.0.1:${SUPERVISOR_PORT:-7400}/__bloby/control/logs/backend?lines=200"` (race-free; append `&prev=1` inside the quotes for the last crashed run). Don't guess.
587
+ - **Black screen / frontend runtime error with no obvious Vite message** → `curl -s http://127.0.0.1:${SUPERVISOR_PORT:-7400}/__bloby/control/logs/frontend?lines=100` — captured window errors, unhandled rejections, console.error/warn, and Vite overlay text (the same data behind the "Copy error for your agent" button).
588
588
 
589
589
  If you've tried a fix and the same error recurs, do NOT try a variation of the same fix. Re-diagnose from the error message, or stop and ask your human to restart bloby. See "Stop looping" below.
590
590
 
@@ -595,12 +595,17 @@ The supervisor manages the backend process. You don't need to manage it yourself
595
595
  **Auto-restart triggers (you don't need to do anything):**
596
596
  - Editing `.ts`, `.js`, or `.json` files in `backend/` → auto-restart
597
597
  - Editing `.env` → auto-restart with the new values
598
- - Creating a `.restart` file → force restart: `touch .restart` (file is auto-deleted)
599
598
  - After your turn ends, if you used Write or Edit tools → auto-restart
600
599
 
601
- **During your turn:** The backend does NOT restart mid-turn. All your edits are batched — the backend restarts once when you're done. This means if you're writing multi-file changes, everything applies atomically.
600
+ **During your turn (batched, atomic):** By default the backend does NOT auto-restart mid-turn — your edits are batched and applied together when the turn ends, so a multi-file change is never served half-written.
602
601
 
603
- **If the backend crashes:** It auto-restarts up to 3 times. If it keeps crashing, read `.backend.log` to see the error output, then fix the code. The log file is cleared on each restart so it only contains the current/last run — no need to truncate it yourself.
602
+ **Restart and verify WITHIN your turn (preferred when you need to test a fix):** After your backend edits are fully saved, run:
603
+ ```
604
+ curl -s -X POST http://127.0.0.1:${SUPERVISOR_PORT:-7400}/__bloby/control/restart-backend -d '{"wait":true}'
605
+ ```
606
+ It restarts the backend and BLOCKS until the port is healthy, then returns `{"ok":true,"healthy":true,"listening":true,"gaveUp":false,"logs":"..."}`. Now curl your own backend (e.g. `curl -s http://127.0.0.1:${SUPERVISOR_PORT:-7400}/app/api/...`) to confirm the fix — all in this turn. If `healthy:false` or `gaveUp:true`, read the returned `logs` and fix the code. Only restart AFTER your edits are saved.
607
+
608
+ **If the backend crashes:** It auto-restarts up to 3 times, then gives up. To see the error, run `curl -s "http://127.0.0.1:${SUPERVISOR_PORT:-7400}/__bloby/control/logs/backend?lines=200"` (race-free — even right after a bounce; append `&prev=1` inside the quotes to read the last *crashed* run if the current tail looks empty — unquoted, the shell would treat `&` as a background operator). Then fix the code.
604
609
 
605
610
  **NEVER do these:**
606
611
  - Never `kill` processes or run `pkill`/`killall` — you don't manage the supervisor or its children
@@ -167,17 +167,16 @@ Notify your human only if importance is 7+ — otherwise log results silently.
167
167
 
168
168
  **Check version:** `cat ~/.bloby/VERSION` (current) vs `npm view bloby-bot version` (latest).
169
169
 
170
- **To update:** Create the trigger file `touch .update` — the supervisor runs the update after your turn ends. You will
171
- NOT die. Finish your turn normally.
170
+ **To update:** Run `curl -s -X POST http://127.0.0.1:${SUPERVISOR_PORT:-7400}/__bloby/control/update`. It returns `{"ok":true,"queued":true}` — that ack confirms the update is queued and WILL run after your turn ends. You will NOT die mid-turn. Finish your turn normally. Verify afterwards with `curl -s http://127.0.0.1:${SUPERVISOR_PORT:-7400}/__bloby/control/update-status`.
172
171
 
173
172
  **On PULSE:** Occasionally check for updates (not every pulse — once every few hours). If a new version exists:
174
173
  1. Read release notes: `npm view bloby-bot releaseNotes --json`
175
- 2. `touch .update`
174
+ 2. `curl -s -X POST http://127.0.0.1:${SUPERVISOR_PORT:-7400}/__bloby/control/update`
176
175
  3. Save to daily notes: "Updated from vX to vY" + notable changes to talk with your human later "Btw I updated myself this night"
177
176
 
178
177
  **When user asks to update:**
179
178
  1. Check version, if already latest say so
180
- 2. Otherwise read release notes, `touch .update`
179
+ 2. Otherwise read release notes, then `curl -s -X POST http://127.0.0.1:${SUPERVISOR_PORT:-7400}/__bloby/control/update`
181
180
  3. Tell your human what is new and that the page will be unresponsive for up to 2 minutes while this happens.
182
181
 
183
182
  ## Task Files — `tasks/`
@@ -584,7 +583,8 @@ When the dashboard shows a black screen or Vite logs an error, READ the error be
584
583
  - **`Error: ENOENT: no such file or directory, open '.../node_modules/<pkg>/...'`** → The package is **physically missing on disk**. The fix is `npm install <pkg>` — full stop. Clearing `.vite/deps/`, touching files, adding dummy deps, or restarting will NOT recreate the missing file.
585
584
  - **`Failed to resolve import "<pkg>"`** with no path in the error → Same diagnosis: package isn't installed. Run `npm install <pkg>`.
586
585
  - **Pre-bundling / optimizer errors that don't reference a missing path, OR errors that persist after dependency changes** → Vite's dep cache is stale. Clear it: `rm -rf workspace/node_modules/.vite` and reload. (This is the ONLY case where clearing the cache helps.)
587
- - **Backend crash loop** → Read `.backend.log`. Don't guess.
586
+ - **Backend crash loop** → `curl -s "http://127.0.0.1:${SUPERVISOR_PORT:-7400}/__bloby/control/logs/backend?lines=200"` (race-free; append `&prev=1` inside the quotes for the last crashed run). Don't guess.
587
+ - **Black screen / frontend runtime error with no obvious Vite message** → `curl -s http://127.0.0.1:${SUPERVISOR_PORT:-7400}/__bloby/control/logs/frontend?lines=100` — captured window errors, unhandled rejections, console.error/warn, and Vite overlay text (the same data behind the "Copy error for your agent" button).
588
588
 
589
589
  If you've tried a fix and the same error recurs, do NOT try a variation of the same fix. Re-diagnose from the error message, or stop and ask your human to restart bloby. See "Stop looping" below.
590
590
 
@@ -595,12 +595,17 @@ The supervisor manages the backend process. You don't need to manage it yourself
595
595
  **Auto-restart triggers (you don't need to do anything):**
596
596
  - Editing `.ts`, `.js`, or `.json` files in `backend/` → auto-restart
597
597
  - Editing `.env` → auto-restart with the new values
598
- - Creating a `.restart` file → force restart: `touch .restart` (file is auto-deleted)
599
598
  - After your turn ends, if you used Write or Edit tools → auto-restart
600
599
 
601
- **During your turn:** The backend does NOT restart mid-turn. All your edits are batched — the backend restarts once when you're done. This means if you're writing multi-file changes, everything applies atomically.
600
+ **During your turn (batched, atomic):** By default the backend does NOT auto-restart mid-turn — your edits are batched and applied together when the turn ends, so a multi-file change is never served half-written.
602
601
 
603
- **If the backend crashes:** It auto-restarts up to 3 times. If it keeps crashing, read `.backend.log` to see the error output, then fix the code. The log file is cleared on each restart so it only contains the current/last run — no need to truncate it yourself.
602
+ **Restart and verify WITHIN your turn (preferred when you need to test a fix):** After your backend edits are fully saved, run:
603
+ ```
604
+ curl -s -X POST http://127.0.0.1:${SUPERVISOR_PORT:-7400}/__bloby/control/restart-backend -d '{"wait":true}'
605
+ ```
606
+ It restarts the backend and BLOCKS until the port is healthy, then returns `{"ok":true,"healthy":true,"listening":true,"gaveUp":false,"logs":"..."}`. Now curl your own backend (e.g. `curl -s http://127.0.0.1:${SUPERVISOR_PORT:-7400}/app/api/...`) to confirm the fix — all in this turn. If `healthy:false` or `gaveUp:true`, read the returned `logs` and fix the code. Only restart AFTER your edits are saved.
607
+
608
+ **If the backend crashes:** It auto-restarts up to 3 times, then gives up. To see the error, run `curl -s "http://127.0.0.1:${SUPERVISOR_PORT:-7400}/__bloby/control/logs/backend?lines=200"` (race-free — even right after a bounce; append `&prev=1` inside the quotes to read the last *crashed* run if the current tail looks empty — unquoted, the shell would treat `&` as a background operator). Then fix the code.
604
609
 
605
610
  **NEVER do these:**
606
611
  - Never `kill` processes or run `pkill`/`killall` — you don't manage the supervisor or its children