npm - bloby-bot - Versions diffs - 0.65.3 → 0.66.0 - Mend

bloby-bot 0.65.3 → 0.66.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

package/package.json +1 -1
package/supervisor/agents/prompts/coder.txt +3 -1
package/supervisor/backend.ts +46 -3
package/supervisor/bloby-agent.ts +4 -0
package/supervisor/channels/manager.ts +7 -0
package/supervisor/frontend-log.ts +80 -0
package/supervisor/harnesses/claude.ts +7 -0
package/supervisor/harnesses/codex.ts +6 -0
package/supervisor/harnesses/pi/index.ts +6 -0
package/supervisor/harnesses/types.ts +4 -0
package/supervisor/index.ts +308 -36
package/supervisor/scheduler.ts +4 -1
package/supervisor/vite-dev.ts +30 -2
package/supervisor/workspace-guard.js +89 -3
package/worker/prompts/bloby-system-prompt-codex.txt +13 -8
package/worker/prompts/bloby-system-prompt-pi.txt +13 -8
package/worker/prompts/bloby-system-prompt.txt +13 -8

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "bloby-bot",
-  "version": "0.65.3",
+  "version": "0.66.0",
   "releaseNotes": [
     "1. Fix: image (and audio) attachments now render in chat again — /api/files is fetched with the auth token instead of a raw <img> src that 401'd after the endpoint hardening",
     "2. Affects chat thumbnails, the image lightbox, voice-note playback, and agent image cards",

package/supervisor/agents/prompts/coder.txt CHANGED Viewed

@@ -105,7 +105,9 @@ The supervisor manages the backend process:
 - Editing `.ts`, `.js`, or `.json` files in `backend/` → auto-restart
 - Editing `.env` → auto-restart
 - After your turn ends, if you used Write or Edit tools → auto-restart
-- The backend does NOT restart mid-turn — edits are batched
+- The backend does NOT auto-restart mid-turn — edits are batched (multi-file changes apply atomically)
+- To restart and verify a fix WITHIN your turn (after edits are saved): `curl -s -X POST http://127.0.0.1:${SUPERVISOR_PORT:-7400}/__bloby/control/restart-backend -d '{"wait":true}'` — it waits for the backend to be healthy and returns `{"healthy":...,"logs":"..."}`, so you can then curl the backend to confirm the fix
+- Read backend logs: `curl -s http://127.0.0.1:${SUPERVISOR_PORT:-7400}/__bloby/control/logs/backend?lines=200`
 **NEVER** kill processes, run `bloby start`, or run `npm start` directly.

package/supervisor/backend.ts CHANGED Viewed

@@ -1,4 +1,5 @@
 import { spawn, type ChildProcess } from 'child_process';
+import http from 'http';
 import fs from 'fs';
 import path from 'path';
 import { PKG_DIR, WORKSPACE_DIR } from '../shared/paths.js';
@@ -39,6 +40,10 @@ export function setBackendGiveUpHandler(fn: () => void): void {
 }
 const LOG_FILE = path.join(WORKSPACE_DIR, '.backend.log');
+// Holds the LAST crashed run's output. spawnBackend truncates LOG_FILE on every (re)spawn, so an
+// agent reading .backend.log right after an auto-restart would otherwise see only the fresh (often
+// empty) run and lose the originating error. The crash exit handler copies LOG_FILE here first.
+const LOG_FILE_PREV = LOG_FILE + '.prev';
 export function getBackendPort(basePort: number): number {
   return basePort + 4;
@@ -114,6 +119,11 @@ export function spawnBackend(port: number): ChildProcess {
     // Supervisor called stopBackend() — don't auto-restart
     if (intentionallyStopped) return;
+    // Preserve the just-crashed run's output before the next spawnBackend truncates LOG_FILE, so a
+    // post-bounce read (agent or interstitial) can still fetch the originating error via ?prev=1.
+    // Only crashes reach here (intentional stops returned above), so .prev always holds the last crash.
+    try { fs.copyFileSync(LOG_FILE, LOG_FILE_PREV); } catch {}
     // Any unexpected exit (crash, SIGTERM, OOM, null code) — restart
     log.warn(`Backend exited unexpectedly (code ${code})`);
     // Track crashes in a rolling window (backstop for the 30s-reset crash-loop hole).
@@ -216,10 +226,12 @@ export function isBackendDead(): boolean {
   return gaveUp;
 }
-/** Read the tail of the backend log (default 100 lines) for the "copy logs" debug helper. */
-export function readBackendLogTail(maxLines = 100): string {
+/** Read the tail of the backend log (default 100 lines) for the "copy logs" debug helper and the
+ *  agent's GET /__bloby/control/logs/backend endpoint. Pass prev=true to read the last CRASHED run
+ *  (.backend.log.prev) — useful right after an auto-restart, when the live log is a fresh run. */
+export function readBackendLogTail(maxLines = 100, prev = false): string {
   try {
-    const text = fs.readFileSync(LOG_FILE, 'utf-8');
+    const text = fs.readFileSync(prev ? LOG_FILE_PREV : LOG_FILE, 'utf-8');
     const lines = text.split('\n');
     return lines.slice(-maxLines).join('\n').trim();
   } catch {
@@ -227,6 +239,37 @@ export function readBackendLogTail(maxLines = 100): string {
   }
 }
+/** True if the backend was (re)spawned within the last ~2s — so callers can tell the agent that a
+ *  near-empty log tail is a fresh-spawn artifact, not the absence of an error. */
+export function backendJustSpawned(): boolean {
+  return Date.now() - lastSpawnTime < 2000;
+}
+/** Resolve true as soon as the backend's HTTP port is ACCEPTING connections (any response — even a
+ *  404 — means the port is bound and serving), false if it never comes up within timeoutMs or the
+ *  backend gives up first. This is the REAL readiness signal that the restart-and-verify endpoint
+ *  returns to the agent: isBackendAlive() only means the child process was spawned, not that it has
+ *  bound its port, so it reports "alive" during the startup window when requests still 503.
+ *
+ *  @param port      Loopback port the backend is expected to listen on.
+ *  @param timeoutMs Overall budget for the probe (default 15000 ms).
+ *  @returns         Promise that resolves (never rejects) with the readiness verdict. */
+export function probeBackendReady(port: number, timeoutMs = 15000): Promise<boolean> {
+  const deadline = Date.now() + timeoutMs;
+  return new Promise((resolve) => {
+    // Settle exactly once: a late socket 'error' after a successful response must not re-arm the
+    // retry timer and keep probing in the background.
+    let settled = false;
+    const finish = (ready: boolean): void => {
+      if (settled) return;
+      settled = true;
+      resolve(ready);
+    };
+    const attempt = (): void => {
+      if (settled) return;
+      if (gaveUp) return finish(false); // crash-looped past the limit — it's not coming up
+      // Cap the per-attempt socket timeout at the remaining budget so a port that accepts but never
+      // answers cannot push the total wait up to 2s past timeoutMs.
+      const remaining = deadline - Date.now();
+      const req = http.request(
+        { host: '127.0.0.1', port, path: '/', method: 'GET', timeout: Math.max(1, Math.min(2000, remaining)) },
+        (res) => { res.resume(); finish(true); }, // any HTTP response = port is listening
+      );
+      req.on('error', () => {
+        if (settled) return;
+        if (Date.now() >= deadline) return finish(false);
+        setTimeout(attempt, 250);
+      });
+      req.on('timeout', () => { try { req.destroy(); } catch {} }); // → 'error' → retry/deadline
+      req.end();
+    };
+    attempt();
+  });
+}
 export function isBackendStopping(): boolean {
   return stopPromise !== null;
 }

package/supervisor/bloby-agent.ts CHANGED Viewed

@@ -88,6 +88,10 @@ export function isConversationBusy(conversationId: string): boolean {
 /** True if ANY conversation in ANY harness is mid-turn. Lets the supervisor defer backend
  *  restarts during channel/Alexa turns, which don't set the dashboard's agentQueryActive flag. */
 export function anyConversationBusy(): boolean {
   return Object.values(HARNESSES).some((h) => h.anyConversationBusy());
 }
+/** True if ANY harness reports a one-shot query (startBlobyAgentQuery) in flight. One-shot queries
+ *  do not necessarily register as live conversations, so anyConversationBusy() alone can miss them. */
+export function anyOneShotActive(): boolean {
+  return Object.values(HARNESSES).some((h) => h.anyOneShotActive());
+}

package/supervisor/channels/manager.ts CHANGED Viewed

@@ -54,6 +54,8 @@ interface ChannelManagerOpts {
   workerApi: (path: string, method?: string, body?: any) => Promise<any>;
   restartBackend: () => void;
   getModel: () => string;
+  /** Fired after a channel turn ends — the supervisor uses it to flush a queued self-update. */
+  onTurnComplete?: () => void;
 }
 interface ActiveAgentQuery {
@@ -1063,6 +1065,7 @@ export class ChannelManager {
         // the dashboard's typing indicator would stay on forever.
         if (type === 'bot:turn-complete') {
           if (eventData.usedFileTools) this.opts.restartBackend();
+          this.opts.onTurnComplete?.(); // flush a queued self-update after a channel turn
           broadcastBloby('bot:idle', { conversationId: convId });
           return;
         }
@@ -1071,6 +1074,7 @@ export class ChannelManager {
         // conversation under the same convId starts clean.
         if (type === 'bot:conversation-ended') {
           this.clearRoutes(convId);
+          this.opts.onTurnComplete?.(); // flush a queued self-update if this turn ended by exception
           return;
         }
@@ -1216,12 +1220,14 @@ export class ChannelManager {
         if (type === 'bot:turn-complete') {
           if (eventData.usedFileTools) this.opts.restartBackend();
+          this.opts.onTurnComplete?.(); // flush a queued self-update after a channel turn
           broadcastBloby('bot:idle', { conversationId: convId });
           return;
         }
         if (type === 'bot:conversation-ended') {
           this.clearRoutes(convId);
+          this.opts.onTurnComplete?.(); // flush a queued self-update if this turn ended by exception
           return;
         }
@@ -1375,6 +1381,7 @@ export class ChannelManager {
         if (type === 'bot:done') {
           this.activeAgents.delete(agentKey);
           if (eventData.usedFileTools) this.opts.restartBackend();
+          this.opts.onTurnComplete?.(); // flush a queued self-update after a channel turn
           this.processQueue();
         }
       },

package/supervisor/frontend-log.ts ADDED Viewed

@@ -0,0 +1,80 @@
+/**
+ * Server-side frontend log ring — the data source behind GET /__bloby/control/logs/frontend
+ * (the agent's "tail frontend / devtools logs") and the friendly "Copy error" flow.
+ *
+ * Two independent producers feed ONE in-memory ring, so the tail is never empty regardless of
+ * how the frontend broke:
+ *   1. The Vite dev server's customLogger (supervisor/vite-dev.ts) — COMPILE/transform errors,
+ *      captured even when the browser never ran a line of JS (hard compile failure / blank page).
+ *   2. The browser (supervisor/workspace-guard.js) POSTing window.onerror / unhandledrejection /
+ *      console.error / console.warn / Vite-overlay text to POST /__bloby/control/fe-log — RUNTIME
+ *      errors, which Vite never sees.
+ *
+ * Memory-only by design: the agent reads it over the loopback endpoint (no workspace file to grow
+ * unbounded, pollute the dir, or self-trigger Vite's watcher). It is the current session's frontend
+ * error trail; a supervisor restart clears it (frontend errors are transient by nature).
+ */
+export type FrontendLogKind =
+  | 'error'
+  | 'unhandledrejection'
+  | 'console.error'
+  | 'console.warn'
+  | 'vite-error'
+  | 'vite-warn'
+  | 'vite-overlay';
+export interface FrontendLogEntry {
+  t: number;
+  kind: FrontendLogKind;
+  text: string;
+  stack?: string;
+}
+const RING_MAX = 500;
+const TEXT_CAP = 4000; // per-field clamp so one giant stack can't blow the ring's memory
+const ring: FrontendLogEntry[] = [];
+// Collapse the same message arriving repeatedly in a short window. The guard re-evaluates the Vite
+// overlay on a 1.5s tick, and a crash loop can spam identical errors — without this the ring fills
+// with one repeated line and pushes out the useful history.
+let lastKey = '';
+let lastAt = 0;
+/** Append one frontend log entry to the ring. Best-effort, never throws, drops empty text.
+ *  text is newline-stripped: the browser-facing POST /__bloby/control/fe-log endpoint is
+ *  unauthenticated, and tailFrontendLog renders one entry per line — an embedded newline would let a
+ *  remote caller forge a fake `<ts> [kind] ...` line that the (Bash-capable) agent reads as genuine.
+ *  Collapsing newlines to a marker keeps each entry to exactly one line. (stack keeps its newlines:
+ *  the renderer indents every stack line, so it can't masquerade as an un-indented log header.)
+ *
+ *  @param kind  Producer/category tag stored with the entry.
+ *  @param text  Message text; coerced to string, newline-collapsed, then clamped to TEXT_CAP chars.
+ *  @param stack Optional stack trace; clamped to TEXT_CAP chars, newlines preserved. */
+export function appendFrontendLog(kind: FrontendLogKind, text: string, stack?: string): void {
+  // Collapse newlines FIRST, then clamp: each newline run expands to the 3-char marker, so clamping
+  // before the replacement would let the stored text grow past TEXT_CAP.
+  const clean = (text == null ? '' : String(text))
+    .replace(/[\r\n]+/g, ' ⏎ ')
+    .trim()
+    .slice(0, TEXT_CAP)
+    .trimEnd();
+  if (!clean) return;
+  const stk = stack ? String(stack).slice(0, TEXT_CAP) : undefined;
+  const key = kind + '|' + clean;
+  const now = Date.now();
+  // Same kind+text again within 4s: refresh the window and drop the duplicate.
+  if (key === lastKey && now - lastAt < 4000) { lastAt = now; return; }
+  lastKey = key;
+  lastAt = now;
+  ring.push({ t: now, kind, text: clean, stack: stk });
+  while (ring.length > RING_MAX) ring.shift();
+}
+/** Render the last `maxLines` lines of the ring as text (newest last). Each entry is one header line
+ *  (`<ISO ts> [kind] text`) followed by its stack, one indented line per stack line.
+ *
+ *  @param maxLines Maximum number of rendered lines to return (default 100). Zero, negative or NaN
+ *                  yields an empty string.
+ *  @returns        The selected lines joined with '\n'. */
+export function tailFrontendLog(maxLines = 100): string {
+  // Array.prototype.slice(-0) is slice(0) — the WHOLE array — so a non-positive budget must
+  // short-circuit instead of returning everything.
+  const budget = Math.floor(maxLines);
+  if (!(budget > 0)) return '';
+  const lines: string[] = [];
+  for (const e of ring) {
+    lines.push(`${new Date(e.t).toISOString()} [${e.kind}] ${e.text}`);
+    if (e.stack) {
+      // One array element per stack line so `maxLines` bounds real lines, and every line terminator
+      // (\n, \r\n, bare \r) is followed by the indent — no stack line can start at column 0.
+      for (const stackLine of e.stack.split(/\r\n|\r|\n/)) lines.push('    ' + stackLine);
+    }
+  }
+  return lines.slice(-budget).join('\n');
+}
+/** Number of entries currently buffered in the ring (reported as `entries` by GET /__bloby/control/logs/frontend). */
+export function getFrontendLogCount(): number {
+  return ring.length;
+}

package/supervisor/harnesses/claude.ts CHANGED Viewed

@@ -550,6 +550,13 @@ export function anyConversationBusy(): boolean {
   return false;
 }
+/** True while any one-shot startBlobyAgentQuery (pulse/cron, customer WhatsApp) is in flight.
+ *  These register only in activeQueries (cleared in a finally), not liveConversations, so
+ *  anyConversationBusy() can't see them. */
+export function anyOneShotActive(): boolean {
+  return activeQueries.size > 0;
+}
 /** Stop a specific background sub-agent task */
 export async function stopSubAgentTask(conversationId: string, taskId: string): Promise<void> {
   const conv = liveConversations.get(conversationId);

package/supervisor/harnesses/codex.ts CHANGED Viewed

@@ -887,6 +887,12 @@ export function isConversationBusy(conversationId: string): boolean {
 /** True if ANY live conversation in this harness is mid-turn. Used by the supervisor to defer
  *  backend restarts during channel/Alexa turns (which don't set the dashboard's agentQueryActive). */
 export function anyConversationBusy(): boolean {
   for (const c of conversations.values()) if (c.busy) return true;
   return false;
 }
+/** Codex one-shot queries (startBlobyAgentQuery) reuse the conversations map and set conv.busy via
+ *  startTurn, so anyConversationBusy() already covers them — nothing extra to track here. */
+export function anyOneShotActive(): boolean {
+  return false;
+}

package/supervisor/harnesses/pi/index.ts CHANGED Viewed

@@ -344,6 +344,12 @@ export async function warmUpForLiveConversation(
 const activeQueries = new Map<string, AbortController>();
+/** True while any one-shot startBlobyAgentQuery is in flight (cleared in a finally). These don't
+ *  register as live conversations, so anyConversationBusy() can't see them. */
+export function anyOneShotActive(): boolean {
+  return activeQueries.size > 0;
+}
 /**
  * One-shot text query — used by customer WhatsApp + scheduler. Uses the
  * provider stream directly (no async queue), drains it, emits the same

package/supervisor/harnesses/types.ts CHANGED Viewed

@@ -59,6 +59,10 @@ export interface Harness {
   isConversationBusy(conversationId: string): boolean;
   /** True if ANY conversation in this harness is mid-turn (no id — used to defer backend restarts). */
   anyConversationBusy(): boolean;
+  /** True if ANY one-shot query (startBlobyAgentQuery: pulse/cron, customer WhatsApp) is in flight.
+   *  These do NOT register as live conversations, so anyConversationBusy() can't see them — the
+   *  supervisor ORs this in so a queued self-update / backend restart defers past one-shot turns too. */
+  anyOneShotActive(): boolean;
   stopSubAgentTask(conversationId: string, taskId: string): Promise<void>;
   warmUpForLiveConversation(
     model: string,

package/supervisor/index.ts CHANGED Viewed

@@ -11,12 +11,13 @@ import { log } from '../shared/logger.js';
 import { startTunnel, stopTunnel, isTunnelAlive, restartTunnel, startNamedTunnel, restartNamedTunnel } from './tunnel.js';
 import { createWorkerApp } from '../worker/index.js';
 import { closeDb, getSession, getSetting } from '../worker/db.js';
-import { spawnBackend, stopBackend, restartBackend, getBackendPort, isBackendAlive, isBackendStopping, isBackendDead, readBackendLogTail, setBackendEnv, setBackendGiveUpHandler } from './backend.js';
+import { spawnBackend, stopBackend, restartBackend, getBackendPort, isBackendAlive, isBackendStopping, isBackendDead, readBackendLogTail, setBackendEnv, setBackendGiveUpHandler, probeBackendReady, backendJustSpawned } from './backend.js';
+import { appendFrontendLog, tailFrontendLog, getFrontendLogCount, type FrontendLogKind } from './frontend-log.js';
 import { handleAgentQuery, type AgentQueryRequest } from './agent-api.js';
 import { updateTunnelUrl, startHeartbeat, stopHeartbeat, disconnect } from '../shared/relay.js';
 import {
   startConversation, hasConversation, endConversation, endAllConversations,
-  isConversationBusy, anyConversationBusy, stopSubAgentTask,
+  isConversationBusy, anyConversationBusy, anyOneShotActive, stopSubAgentTask,
   startBlobyAgentQuery, stopBlobyAgentQuery,
   warmUpForLiveConversation,
   type RecentMessage,
@@ -44,6 +45,24 @@ process.on('unhandledRejection', (reason) => {
 const DIST_BLOBY = path.join(PKG_DIR, 'dist-bloby');
 const SUPERVISOR_PUBLIC = path.join(PKG_DIR, 'supervisor', 'public');
+// Self-update coordination. The marker persists a queued update across a supervisor restart that
+// happens between the agent's request and the turn-complete flush (in-memory pendingUpdate alone
+// would be lost). attempts + a TTL bound the boot-resume retry so a persistently-failing update
+// can't loop on every boot. See queueUpdate/flushPendingUpdate/runDeferredUpdate.
+const UPDATE_MARKER = path.join(DATA_DIR, '.update-pending');
+const UPDATE_MAX_ATTEMPTS = 2;
+const UPDATE_MARKER_TTL_MS = 30 * 60_000; // 30 min — a marker older than this is cleared, not retried
+/** True for the loopback, non-tunnel requests the local agent makes to the /__bloby/control/* and
+ *  channel-mutation endpoints. Identical trust model to the channel mutation guard: cloudflared
+ *  forwards over loopback so the IP check alone is a no-op behind the relay — we also reject any
+ *  request carrying cloudflared's cf-connecting-ip/cf-ray (tunnel-origin) headers. */
+function isLoopbackAgentReq(req: http.IncomingMessage): boolean {
+  const ip = req.socket.remoteAddress || '';
+  const isLoopback = ip === '127.0.0.1' || ip === '::1' || ip === '::ffff:127.0.0.1';
+  return isLoopback && !req.headers['cf-connecting-ip'] && !req.headers['cf-ray'];
+}
 // Proactive context recycling. The chat runs as one long-lived agent session per
 // conversation (so the user can keep talking while the agent works). That session's
 // context grows every turn and would eventually hit the wall. But continuity does NOT
@@ -390,6 +409,12 @@ export async function startSupervisor() {
   const internalSecret = crypto.randomBytes(16).toString('hex');
   const agentSecret = crypto.randomBytes(32).toString('hex');
+  // Expose the supervisor's own HTTP port to EVERY child subprocess via our own process.env —
+  // notably the agent harness (claude/codex/pi all spread ...process.env), whose Bash tool curls
+  // the /__bloby/control/* surface as http://127.0.0.1:$SUPERVISOR_PORT/... . Previously this was
+  // injected ONLY into the backend (setBackendEnv below), so the agent had no reliable port var.
+  process.env.SUPERVISOR_PORT = String(config.port);
   // Inject agent secret + supervisor port into workspace backend env
   setBackendEnv({
     BLOBY_AGENT_SECRET: agentSecret,
@@ -620,6 +645,161 @@ export async function startSupervisor() {
       return;
     }
+    // ── Agent control surface (/__bloby/control/*) ──────────────────────────────────────────────
+    // The Bloby agent drives backend restarts, self-update, and log tails through these endpoints
+    // instead of the old lossy fs.watch trigger files (.restart/.update). Every call returns a
+    // SYNCHRONOUS JSON ack — that explicit acknowledgment is the reliability fix (no silent drops).
+    // The agent curls http://127.0.0.1:$SUPERVISOR_PORT/__bloby/control/... (SUPERVISOR_PORT is
+    // injected into its env). All routes are loopback-only (same cf-reject guard as the channel
+    // mutations) so they are NEVER reachable over the public tunnel — EXCEPT fe-log, which the
+    // user's browser posts to (write-only, capped). Served here, before auth and the Vite catch-all,
+    // so they answer even when the backend/Vite are down.
+    if (req.url?.startsWith('/__bloby/control/')) {
+      const ctlPath = req.url.split('?')[0];
+      const ctlQuery = new URLSearchParams(req.url.split('?')[1] || '');
+      // POST /__bloby/control/fe-log — browser → supervisor frontend-error ingest. NOT loopback-
+      // gated (the workspace-guard posts from the user's browser, possibly over the tunnel).
+      // Write-only, size-capped, no read-back, no side effects → worst case is capped log spam.
+      if (ctlPath === '/__bloby/control/fe-log' && req.method === 'POST') {
+        let feBody = '';
+        let feTooBig = false;
+        // Cap by DROPPING the payload once oversize (not req.destroy(), which fires neither 'end' nor
+        // 'error' → the response would never be sent). Keep reading to a clean 'end' and 204 always.
+        req.on('data', (chunk: Buffer) => {
+          if (feTooBig) return;
+          feBody += chunk.toString();
+          if (feBody.length > 16_384) { feTooBig = true; feBody = ''; }
+        });
+        req.on('end', () => {
+          if (!feTooBig) {
+            try {
+              const parsed = JSON.parse(feBody);
+              const entries = Array.isArray(parsed?.entries) ? parsed.entries.slice(-40) : [];
+              const allowed = ['error', 'unhandledrejection', 'console.error', 'console.warn', 'vite-overlay'];
+              for (const e of entries) {
+                if (e && typeof e.text === 'string') {
+                  const kind = (allowed.includes(e.kind) ? e.kind : 'error') as FrontendLogKind;
+                  appendFrontendLog(kind, e.text, typeof e.stack === 'string' ? e.stack : undefined);
+                }
+              }
+            } catch {}
+          }
+          try { res.writeHead(204); res.end(); } catch {}
+        });
+        req.on('error', () => { try { res.writeHead(204); res.end(); } catch {} });
+        return;
+      }
+      // Every other control route is loopback-only (agent-driven, can restart/update the instance).
+      if (!isLoopbackAgentReq(req)) {
+        res.writeHead(403, { 'Content-Type': 'application/json' });
+        res.end(JSON.stringify({ ok: false, error: 'This control endpoint is localhost-only.' }));
+        return;
+      }
+      res.setHeader('Cache-Control', 'no-store');
+      // GET /__bloby/control/logs/backend?lines=N[&prev=1] — tail the current (or last-crashed) run.
+      if (ctlPath === '/__bloby/control/logs/backend' && req.method === 'GET') {
+        const lines = Math.max(1, Math.min(1000, parseInt(ctlQuery.get('lines') || '100', 10) || 100));
+        const prev = ctlQuery.get('prev') === '1';
+        res.writeHead(200, { 'Content-Type': 'application/json' });
+        res.end(JSON.stringify({ ok: true, lines, prev, justRestarted: backendJustSpawned(), log: readBackendLogTail(lines, prev) }));
+        return;
+      }
+      // GET /__bloby/control/logs/frontend?lines=N — runtime + console + Vite-compile frontend errors.
+      if (ctlPath === '/__bloby/control/logs/frontend' && req.method === 'GET') {
+        const lines = Math.max(1, Math.min(1000, parseInt(ctlQuery.get('lines') || '100', 10) || 100));
+        res.writeHead(200, { 'Content-Type': 'application/json' });
+        res.end(JSON.stringify({ ok: true, lines, entries: getFrontendLogCount(), log: tailFrontendLog(lines) }));
+        return;
+      }
+      // GET /__bloby/control/update-status — is a queued update running / did it fail?
+      if (ctlPath === '/__bloby/control/update-status' && req.method === 'GET') {
+        res.writeHead(200, { 'Content-Type': 'application/json' });
+        res.end(JSON.stringify({ ok: true, ...getUpdateStatus() }));
+        return;
+      }
+      // POST /__bloby/control/update — queue a self-update (acknowledged, idempotent, deferred).
+      if (ctlPath === '/__bloby/control/update' && req.method === 'POST') {
+        const r = queueUpdate();
+        res.writeHead(200, { 'Content-Type': 'application/json' });
+        res.end(JSON.stringify({
+          ok: true,
+          queued: r.queued || r.alreadyQueued,
+          alreadyQueued: r.alreadyQueued,
+          retrying: r.retrying,
+          deferred: r.deferred,
+          message: r.alreadyQueued
+            ? 'An update is already queued or running.'
+            : r.retrying
+              ? 'A previous update attempt failed — re-queued; it retries after your turn ends. Check update-status (state:failed exposes the prior error in logTail).'
+              : 'Update queued — it runs after your turn ends. You will NOT die mid-turn; finish your turn normally. The page is unresponsive ~1–2 min while Bloby restarts on the new version. Check update-status after.',
+        }));
+        return;
+      }
+      // POST /__bloby/control/restart-backend { wait?:bool=true, timeoutMs?:num=15000, logLines?:num=60 }
+      // Restarts the backend through the existing serialized doRestart() funnel and (when wait) blocks
+      // until the backend's PORT is listening — so the agent can restart-and-verify WITHIN its turn.
+      // Pass logLines:0 to omit the log tail from the response.
+      if (ctlPath === '/__bloby/control/restart-backend' && req.method === 'POST') {
+        let rbBody = '';
+        let rbTooBig = false;
+        // Cap by dropping the payload (not req.destroy(), which would fire neither 'end' nor 'error'
+        // → no JSON ack ever sent, violating the endpoint contract). Always answer from 'end'.
+        req.on('data', (chunk: Buffer) => {
+          if (rbTooBig) return;
+          rbBody += chunk.toString();
+          if (rbBody.length > 4096) { rbTooBig = true; rbBody = ''; }
+        });
+        req.on('end', async () => {
+          if (rbTooBig) {
+            try { res.writeHead(413, { 'Content-Type': 'application/json' }); res.end(JSON.stringify({ ok: false, error: 'Request body too large.' })); } catch {}
+            return;
+          }
+          // Accept only a plain JSON object as options. A body of `null` parses successfully but
+          // would throw on property access below — inside this async handler that means an
+          // unhandled rejection and NO ack. Anything else falls back to the defaults.
+          let rbOpts: Record<string, unknown> = {};
+          try {
+            const parsedOpts: unknown = rbBody ? JSON.parse(rbBody) : {};
+            if (parsedOpts !== null && typeof parsedOpts === 'object' && !Array.isArray(parsedOpts)) {
+              rbOpts = parsedOpts as Record<string, unknown>;
+            }
+          } catch {}
+          const wait = rbOpts.wait !== false; // default true
+          const timeoutMs = Math.max(1000, Math.min(30_000, Number(rbOpts.timeoutMs) || 15_000));
+          // An explicit numeric 0 means "no logs"; `Number(x) || 60` alone would turn it into 60.
+          const logLines = rbOpts.logLines === 0 ? 0 : Math.max(0, Math.min(400, Number(rbOpts.logLines) || 60));
+          const wasDead = isBackendDead(); // had crash-looped & given up BEFORE this explicit restart
+          const started = Date.now();
+          try {
+            await doRestart(); // resetBackendRestarts + serialized stop→spawn; preserves all invariants
+          } catch (err: any) {
+            try { res.writeHead(200, { 'Content-Type': 'application/json' }); res.end(JSON.stringify({ ok: false, restarted: false, error: String(err?.message || err) })); } catch {}
+            return;
+          }
+          const listening = wait ? await probeBackendReady(backendPort, timeoutMs) : isBackendAlive();
+          const gaveUp = isBackendDead();
+          try {
+            res.writeHead(200, { 'Content-Type': 'application/json' });
+            res.end(JSON.stringify({
+              ok: true,
+              restarted: true,
+              healthy: listening && !gaveUp,
+              listening,
+              gaveUp,
+              wasDead,
+              // If it gave up AGAIN, restarting won't help — tell the agent to fix the code, not re-restart.
+              hint: gaveUp ? 'Backend crash-looped and gave up again — restarting will not fix it. Read the logs and fix the code.' : undefined,
+              waitedMs: Date.now() - started,
+              logs: logLines ? readBackendLogTail(logLines) : undefined,
+            }));
+          } catch {}
+        });
+        req.on('error', () => { try { if (!res.headersSent) { res.writeHead(500); res.end(); } } catch {} });
+        return;
+      }
+      res.writeHead(404, { 'Content-Type': 'application/json' });
+      res.end(JSON.stringify({ ok: false, error: 'Unknown control endpoint.' }));
+      return;
+    }
     // App API routes → proxy to user's backend server
     if (req.url?.startsWith('/app/api')) {
       const backendPath = req.url.replace(/^\/app/, '');
@@ -2851,12 +3031,7 @@ ${alreadyLinked ? '' : `
           log.info('[orchestrator] Restarting backend (file tools used / pending watcher change)');
           void doRestart();
         }
-        if (pendingUpdate) {
-          pendingUpdate = false;
-          log.info('[orchestrator] Ending conversation before update...');
-          endConversation(convId);
-          runDeferredUpdate();
-        }
+        flushPendingUpdate(); // run a queued self-update now that this dashboard turn has ended
         // Proactive session recycling (see CONTEXT_RECYCLE_TOKENS). Only when the
         // harness reports the session idle (no queued message) — and this handler runs
@@ -2883,6 +3058,9 @@ ${alreadyLinked ? '' : `
         currentStreamConvId = null;
         currentStreamBuffer = '';
         channelManager.clearRoutes(convId);
+        // A turn that ended by exception/recycle (not a clean bot:turn-complete) must still flush a
+        // queued self-update — otherwise it'd wait for the next turn/reboot. Self-defers + idempotent.
+        flushPendingUpdate();
         return;
       }
@@ -3408,16 +3586,90 @@ ${alreadyLinked ? '' : `
   // Track whether an agent is actively processing — file watcher defers restarts during active turns
   let agentQueryActive = false;
   let pendingBackendRestart = false; // Set when file watcher fires during agent turn
-  let pendingUpdate = false; // Set when .update file is created during agent turn
+  let pendingUpdate = false; // An update is queued; runs at the next turn-complete (flushPendingUpdate)
+  let updateInProgress = false; // The update child has actually been spawned — idempotency guard
   // Note: with live conversations, agentQueryActive is true while the agent processes a message
   // and false when it's idle (waiting for next message). The live conversation stays alive between messages.
-  // Run bloby update as a child process.
-  // BLOBY_SELF_UPDATE=1 tells bin/cli.js to skip daemon stop/restart —
-  // the supervisor exits after the update finishes, and systemd (Restart=on-failure)
-  // or launchd (KeepAlive.SuccessfulExit=false) restarts us with the new code.
+  // ── Self-update marker (persists a queued update across a supervisor restart in the request→flush
+  //    window) ──────────────────────────────────────────────────────────────────────────────────
+  function readUpdateMarker(): { queuedAt: number; attempts: number } | null { // load the persisted marker from UPDATE_MARKER; null when it cannot be read or validated
+    try {
+      const m = JSON.parse(fs.readFileSync(UPDATE_MARKER, 'utf-8'));
+      if (m && typeof m.queuedAt === 'number') return { queuedAt: m.queuedAt, attempts: Number(m.attempts) || 0 }; // a missing or non-numeric attempts value becomes 0
+    } catch {} // read/parse failure falls through to the null return below
+    return null;
+  }
+  function writeUpdateMarker(m: { queuedAt: number; attempts: number }): void { // persist the marker to UPDATE_MARKER as JSON (best-effort)
+    try { fs.writeFileSync(UPDATE_MARKER, JSON.stringify(m)); } catch {} // write errors are swallowed; the caller is not told
+  }
+  function clearUpdateMarker(): void { // delete the UPDATE_MARKER file (best-effort)
+    try { fs.unlinkSync(UPDATE_MARKER); } catch {} // unlink errors (including a missing file) are swallowed
+  }
+  /** Queue a self-update. Acknowledged + idempotent (the core fix vs the old fire-and-forget
+   *  `touch .update`). The update RUNS at the next turn-complete so the agent's current turn finishes
+   *  first (it does NOT die mid-turn). When truly idle it flushes on the next tick.
+   *
+   *  Returned flags:
+   *  - `queued`: false only when an update child is already running (nothing new was queued).
+   *  - `alreadyQueued`: an update was already running, or was already pending before this call.
+   *  - `deferred`: a turn is active at return time, so the flush waits for a later turn-complete.
+   *  - `retrying`: nothing was pending but the marker records at least one earlier attempt. */
+  function queueUpdate(): { queued: boolean; alreadyQueued: boolean; deferred: boolean; retrying: boolean } {
+    if (updateInProgress) return { queued: false, alreadyQueued: true, deferred: false, retrying: false };
+    const marker = readUpdateMarker();
+    const alreadyQueued = pendingUpdate;                                  // genuinely already waiting to run
+    const retrying = !pendingUpdate && !!marker && marker.attempts > 0;   // a prior attempt failed; re-queue it
+    pendingUpdate = true;
+    if (!marker) writeUpdateMarker({ queuedAt: Date.now(), attempts: 0 }); // keep an existing marker so its queuedAt/attempts survive
+    flushPendingUpdate(); // self-defers — runs now only if nothing is mid-turn
+    return { queued: true, alreadyQueued, deferred: aTurnIsActive(), retrying };
+  }
+  /** Run the queued update once NO turn is active on any surface. Deferred one tick so the
+   *  just-completed turn's in-flight flags (agentQueryActive / conv.busy / activeQueries) have
+   *  cleared first: that lets the completing turn's OWN queued update fire, while still never tearing
+   *  down a concurrent dashboard / channel / one-shot turn. If something else is still active it
+   *  stays pending and re-fires at the next turn-complete or boot-resume (idempotent, marker-backed). */
+  function flushPendingUpdate(): void {
+    if (!pendingUpdate || updateInProgress) return; // nothing queued, or the update child is already running
+    setImmediate(() => {
+      if (!pendingUpdate || updateInProgress || aTurnIsActive()) return; // re-check after the tick; leave it pending while any turn is active
+      pendingUpdate = false; // consume the request before spawning the update
+      try { for (const cid of Array.from(clientConvs.values())) if (hasConversation(cid)) endConversation(cid); } catch {} // end live client conversations first; an error here does not block the update
+      runDeferredUpdate();
+    });
+  }
+  /** Status for GET /__bloby/control/update-status — lets the agent confirm a queued update actually
+   *  ran / read update.log on failure (a successful update ends in process.exit + daemon restart, so
+   *  the agent sees a connection drop then a new version on reconnect).
+   *
+   *  `state` is resolved in priority order: in-memory flags first (`running`, then `queued`), then
+   *  the on-disk marker (`failed` when it records attempts, otherwise `queued`), else `idle`.
+   *  `attempts` comes from the marker (0 when there is none); `logTail` is the last 60 lines of
+   *  update.log in DATA_DIR, or '' when the file cannot be read. */
+  function getUpdateStatus(): { state: 'idle' | 'queued' | 'running' | 'failed'; attempts: number; logTail: string } {
+    const marker = readUpdateMarker();
+    let state: 'idle' | 'queued' | 'running' | 'failed';
+    if (updateInProgress) state = 'running';
+    else if (pendingUpdate) state = 'queued';
+    else if (marker && marker.attempts > 0) state = 'failed'; // a prior attempt failed; retries at next turn/boot
+    else if (marker) state = 'queued';
+    else state = 'idle';
+    let logTail = '';
+    try { logTail = fs.readFileSync(path.join(DATA_DIR, 'update.log'), 'utf-8').split('\n').slice(-60).join('\n').trim(); } catch {} // unreadable log leaves logTail as ''
+    return { state, attempts: marker?.attempts ?? 0, logTail };
+  }
+  // Run bloby update as a child process. BLOBY_SELF_UPDATE=1 tells bin/cli.js to skip daemon
+  // stop/restart — the supervisor exits after the update finishes, and systemd (Restart=on-failure)
+  // or launchd (KeepAlive.SuccessfulExit=false) restarts us with the new code. The marker's attempts
+  // counter bounds retries (the TTL is enforced only on the boot-resume path so it can't strand a
+  // legit update queued early in a >TTL-long turn).
   function runDeferredUpdate() {
+    if (updateInProgress) { log.info('Update already in progress — skipping duplicate trigger'); return; }
+    const marker = readUpdateMarker() || { queuedAt: Date.now(), attempts: 0 };
+    if (marker.attempts >= UPDATE_MAX_ATTEMPTS) {
+      log.error(`Self-update failed ${marker.attempts}× — giving up. Run \`bloby update\` manually or check ${path.join(DATA_DIR, 'update.log')}`);
+      clearUpdateMarker();
+      try { broadcastBloby('backend:failed', { message: 'Self-update failed repeatedly. Ask your human to run `bloby update`.' }); } catch {}
+      return;
+    }
+    updateInProgress = true;
+    writeUpdateMarker({ queuedAt: marker.queuedAt, attempts: marker.attempts + 1 });
     const cliPath = path.join(PKG_DIR, 'bin', 'cli.js');
     const updateLog = path.join(DATA_DIR, 'update.log');
     log.info('Deferred update triggered — running bloby update...');
@@ -3430,17 +3682,38 @@ ${alreadyLinked ? '' : `
       child.on('exit', (code) => {
         try { fs.closeSync(logFd); } catch {}
         if (code === 0) {
+          clearUpdateMarker(); // success (updated or already-latest) — don't re-run on the next boot
           log.ok('Update completed — restarting with new version...');
-          process.exit(1); // non-zero triggers daemon manager to restart us
+          process.exit(1); // non-zero triggers daemon manager to restart us onto the new code
         } else {
-          log.error(`Update process exited with code ${code} — see ${updateLog}`);
+          // Leave the marker so the next boot retries (bounded by attempts); allow another flush now.
+          updateInProgress = false;
+          log.error(`Update process exited with code ${code} — see ${updateLog}. Will retry on next restart (attempt ${marker.attempts + 1}/${UPDATE_MAX_ATTEMPTS}).`);
         }
       });
+      child.on('error', (err) => {
+        try { fs.closeSync(logFd); } catch {}
+        updateInProgress = false;
+        log.error(`Update process failed to start: ${err.message}`);
+      });
     } catch (err) {
+      updateInProgress = false;
       log.error(`Deferred update failed: ${err instanceof Error ? err.message : err}`);
     }
   }
+  /** On boot, resume an update that was queued but never ran (supervisor died in the request→flush
+   *  window). Safe to auto-run: bin/cli.js update version-checks and no-ops if already latest, and
+   *  the marker's TTL + attempts cap prevent a restart loop. */
+  function resumePendingUpdateOnBoot(): void {
+    const marker = readUpdateMarker();
+    if (!marker) return; // no marker on disk — nothing was queued
+    if (Date.now() - marker.queuedAt > UPDATE_MARKER_TTL_MS) { clearUpdateMarker(); return; } // stale marker: discard it without running the update
+    log.info('Found a pending update from before restart — resuming...');
+    pendingUpdate = true;
+    flushPendingUpdate(); // no active turn at boot
+  }
   // Tell the live chat when the backend gives up — the dashboard interstitial covers page loads,
   // but an already-open chat client gets an explicit event it can surface ("ask me to fix the backend").
   setBackendGiveUpHandler(() => {
@@ -3456,6 +3729,7 @@ ${alreadyLinked ? '' : `
     workerApi,
     restartBackend: () => doRestart(),
     getModel: () => loadConfig().ai.model,
+    onTurnComplete: () => { if (pendingBackendRestart) void doRestart(); flushPendingUpdate(); }, // flush a deferred backend restart + queued self-update after a pulse/cron turn
   });
   // Initialize channel manager (WhatsApp, Telegram, etc.)
@@ -3464,6 +3738,7 @@ ${alreadyLinked ? '' : `
     workerApi,
     restartBackend: () => doRestart(),
     getModel: () => loadConfig().ai.model,
+    onTurnComplete: () => { if (pendingBackendRestart) void doRestart(); flushPendingUpdate(); }, // flush a deferred backend restart + queued self-update after a channel turn
   });
   // Broadcast channel status changes to all connected chat clients
@@ -3506,11 +3781,12 @@ ${alreadyLinked ? '' : `
     return restartBackend(backendPort);
   }
-  /** True while any surface is mid-turn. Dashboard chat sets agentQueryActive; WhatsApp/Alexa
-   *  turns instead set the harness conv.busy (they don't touch agentQueryActive), so we must
-   *  check both — otherwise an agent editing the backend over a channel would get the backend
-   *  restarted out from under it mid-turn. */
-  const aTurnIsActive = () => agentQueryActive || anyConversationBusy();
+  /** True while any surface is mid-turn. Dashboard chat sets agentQueryActive; WhatsApp/Alexa live
+   *  turns set the harness conv.busy; pulse/cron + customer-WhatsApp ONE-SHOT turns set neither
+   *  (they live in the harness activeQueries map) — so we check all three. Otherwise an agent
+   *  editing the backend over any of these surfaces, or queuing a self-update from one, would get
+   *  the backend restarted / the supervisor exited out from under it mid-turn. */
+  const aTurnIsActive = () => agentQueryActive || anyConversationBusy() || anyOneShotActive();
   function scheduleBackendRestart(reason: string) {
     if (aTurnIsActive()) {
@@ -3577,26 +3853,19 @@ ${alreadyLinked ? '' : `
       scheduleBackendRestart(`workspace dependencies changed (${filename})`);
     }
     if (filename === '.restart') {
-      // Consume the trigger file
+      // DEPRECATED fallback — agents now use POST /__bloby/control/restart-backend (synchronous ack,
+      // no lossy fs.watch). Kept so a human/external script touching .restart still works.
       try { fs.unlinkSync(path.join(workspaceDir, '.restart')); } catch {}
-      scheduleBackendRestart('.restart trigger');
+      scheduleBackendRestart('.restart trigger (deprecated)');
     }
     if (filename === '.update') {
-      // Consume the trigger file
+      // DEPRECATED fallback — agents now use POST /__bloby/control/update (acknowledged + idempotent).
+      // Route through queueUpdate(), which carries every fix the old inline path lacked: the
+      // idempotency guard (the watcher's own unlink re-fires this event → double-spawn), the
+      // aTurnIsActive() gate (was agentQueryActive-only → fired mid-turn on pulse/channel turns), the
+      // persisted marker, and the all-surface turn-complete flush.
       try { fs.unlinkSync(path.join(workspaceDir, '.update')); } catch {}
-      if (agentQueryActive) {
-        pendingUpdate = true;
-        log.info('Update requested — deferring until agent turn ends');
-      } else {
-        // End any live conversations before updating
-        for (const cid of Array.from(clientConvs.values())) {
-          if (hasConversation(cid)) {
-            log.info(`[update] Ending conversation ${cid} before update`);
-            endConversation(cid);
-          }
-        }
-        runDeferredUpdate();
-      }
+      queueUpdate();
     }
   }
@@ -3619,6 +3888,9 @@ ${alreadyLinked ? '' : `
   armBackendWatcher();
   armWorkspaceWatcher();
+  // Resume a self-update that was queued but never ran (supervisor died in the request→flush window).
+  resumePendingUpdateOnBoot();
   // WebSocket liveness heartbeat — ping the app + chat WS clients every 30s and terminate any
   // that missed the previous pong (half-open sockets that never fired 'close'). Terminating fires
   // 'close', which runs the existing map/subscription cleanup. Scoped to our two WSS only (Vite's

package/supervisor/scheduler.ts CHANGED Viewed

@@ -34,6 +34,8 @@ interface SchedulerOpts {
   workerApi: (path: string, method?: string, body?: any) => Promise<any>;
   restartBackend: () => void;
   getModel: () => string;
+  /** Fired after a pulse/cron turn ends — the supervisor uses it to flush a queued self-update. */
+  onTurnComplete?: () => void;
 }
 // State
@@ -120,7 +122,7 @@ function cronMatchesNow(schedule: string): boolean {
 function triggerAgent(prompt: string, label: string, onComplete?: () => void) {
   if (!schedulerOpts) return;
-  const { broadcastBloby, workerApi, restartBackend, getModel } = schedulerOpts;
+  const { broadcastBloby, workerApi, restartBackend, getModel, onTurnComplete } = schedulerOpts;
   const timestamp = Date.now();
   const convId = label.startsWith('pulse') ? `pulse-${timestamp}` : `cron-${label}-${timestamp}`;
   const model = getModel();
@@ -230,6 +232,7 @@ function triggerAgent(prompt: string, label: string, onComplete?: () => void) {
           log.info(`[scheduler] File tools used — restarting backend`);
           restartBackend();
         }
+        onTurnComplete?.(); // flush a queued self-update now this pulse/cron turn has ended
         onComplete?.();
       }

package/supervisor/vite-dev.ts CHANGED Viewed

@@ -1,11 +1,37 @@
-import { createServer as createViteServer, type ViteDevServer } from 'vite';
+import { createServer as createViteServer, createLogger, type ViteDevServer } from 'vite';
 import type http from 'http';
 import path from 'path';
 import { PKG_DIR } from '../shared/paths.js';
 import { log } from '../shared/logger.js';
+import { appendFrontendLog } from './frontend-log.js';
 let dashboardVite: ViteDevServer | null = null;
+const stripAnsi = (s: string) => String(s).replace(/\x1b\[[0-9;]*m/g, ''); // remove ANSI SGR escape sequences (ESC [ … m), e.g. color codes
+/** A Vite logger that mirrors error/warn to stdout (preserving the human-facing boot/HMR logs) AND
+ *  captures them into the server-side frontend ring, so GET /__bloby/control/logs/frontend surfaces
+ *  COMPILE/transform errors even when the browser never ran a line of JS (hard compile failure).
+ *
+ *  Each wrapped method captures first, then delegates to the original bound method with the
+ *  untouched arguments; ANSI codes are stripped from the captured copy only. The capture call is
+ *  wrapped in try/catch so a capture failure never suppresses the original log output. */
+function makeCaptureLogger() {
+  const logger = createLogger('info');
+  const origError = logger.error.bind(logger);
+  const origWarn = logger.warn.bind(logger);
+  const origWarnOnce = logger.warnOnce.bind(logger);
+  logger.error = (msg, opts) => {
+    try { appendFrontendLog('vite-error', stripAnsi(msg), opts?.error?.stack ? stripAnsi(opts.error.stack) : undefined); } catch {} // include the stack only when the error carries one
+    origError(msg, opts);
+  };
+  logger.warn = (msg, opts) => {
+    try { appendFrontendLog('vite-warn', stripAnsi(msg)); } catch {}
+    origWarn(msg, opts);
+  };
+  logger.warnOnce = (msg, opts) => {
+    try { appendFrontendLog('vite-warn', stripAnsi(msg)); } catch {} // captured on every call, recorded under the same 'vite-warn' kind as warn
+    origWarnOnce(msg, opts);
+  };
+  return logger;
+}
 export async function startViteDevServers(supervisorPort: number, hmrServer: http.Server): Promise<{ dashboard: number }> {
   const ports = {
     dashboard: supervisorPort + 2,
@@ -25,7 +51,9 @@ export async function startViteDevServers(supervisorPort: number, hmrServer: htt
         // so it works both locally (localhost:3000) and through the relay (riven.bloby.bot:443).
         hmr: { server: hmrServer },
       },
-      logLevel: 'info',
+      // customLogger captures compile/transform errors into the frontend ring (and still prints
+      // them); it supersedes logLevel, which Vite ignores when a customLogger is provided.
+      customLogger: makeCaptureLogger(),
     });
     await dashboardVite.listen();
   } catch (err) {

package/supervisor/workspace-guard.js CHANGED Viewed

@@ -321,6 +321,71 @@
   hideStyle.textContent = 'vite-error-overlay{display:none!important}';
   (document.head || document.documentElement).appendChild(hideStyle);
+  /* ── 2a. Frontend error capture → supervisor ──────────────────────────────
+     Captures EVERY break class — runtime window.onerror, unhandledrejection, console.error/warn,
+     and the Vite compile overlay — into a ring. The ring backs (1) the "Copy error" button, so it
+     is never empty even for runtime/black-screen breaks (no Vite overlay), and (2) a debounced POST
+     to /__bloby/control/fe-log, so the agent's `…/control/logs/frontend` tail is never empty either.
+     Before this, only Vite's compile overlay was readable and the listeners captured nothing. */
+  var feRing = [];      // last ~120 entries, for the Copy button
+  var feUnsent = [];    // not-yet-POSTed entries
+  var feFlushTimer = null;
+  function safeStringify(o) { try { return JSON.stringify(o); } catch (e) { return String(o); } } // JSON.stringify, falling back to String(o) when it throws
+  function pushFe(kind, text, stack) { // record one frontend error entry and schedule a batched upload
+    if (text == null) return;
+    text = String(text).slice(0, 4000).trim(); // cap the stored text at 4000 chars before trimming
+    if (!text) return; // nothing left after trimming
+    if (text.indexOf(VITE_SUPPRESS_MARK) !== -1) return; // benign HMR-reconnect marker, not an app error
+    var last = feRing[feRing.length - 1];
+    if (last && last.kind === kind && last.text === text) return; // collapse repeats (1.5s overlay tick)
+    var entry = { kind: kind, text: text, stack: stack ? String(stack).slice(0, 4000) : undefined, t: Date.now() };
+    feRing.push(entry); if (feRing.length > 120) feRing.shift(); // keep only the newest 120 entries
+    feUnsent.push(entry);
+    if (!feFlushTimer) feFlushTimer = setTimeout(flushFe, 1000); // arm at most one pending flush, 1s out
+  }
+  function flushFe() { // POST every not-yet-sent entry to the supervisor in one batch
+    feFlushTimer = null; // lets pushFe arm a new timer for later entries
+    if (!feUnsent.length) return;
+    var batch = feUnsent.splice(0, feUnsent.length); // take everything and empty the queue
+    try {
+      fetch('/__bloby/control/fe-log', {
+        method: 'POST', headers: { 'Content-Type': 'application/json' },
+        body: JSON.stringify({ entries: batch }), keepalive: true, cache: 'no-store',
+      }).catch(function () {}); // fire-and-forget: a failed POST drops this batch (it is not re-queued)
+    } catch (e) {}
+  }
+  function feTail(n) { // format the last n ring entries (default 6) as plain text
+    return feRing.slice(-(n || 6)).map(function (e) {
+      return '[' + e.kind + '] ' + e.text + (e.stack ? '\n' + e.stack : ''); // "[kind] text", with the stack on following lines when present
+    }).join('\n\n'); // entries are separated by a blank line
+  }
+  // Patch console.error/console.warn once — the ONLY source for the "frontend devtools" tail.
+  ['error', 'warn'].forEach(function (level) {
+    var orig = console[level];
+    if (typeof orig !== 'function' || orig.__blobyPatched) return; // skip when missing or already wrapped by this guard
+    var patched = function () {
+      try {
+        var parts = [];
+        for (var i = 0; i < arguments.length; i++) {
+          var a = arguments[i];
+          parts.push(a instanceof Error ? (a.message + (a.stack ? '\n' + a.stack : '')) // Errors: message plus stack
+            : (a && typeof a === 'object' ? safeStringify(a) : String(a))); // other objects as JSON, everything else via String()
+        }
+        pushFe('console.' + level, parts.join(' '));
+      } catch (e) {} // capture must never break the page's own logging
+      return orig.apply(console, arguments); // always forward to the real console method
+    };
+    patched.__blobyPatched = true; // marker checked above so a second run does not double-wrap
+    console[level] = patched;
+  });
+  // Flush buffered errors before the page goes away — a runtime error very often immediately precedes
+  // a reload, and this guard itself reloads aggressively (backend-down poll, vite-stale recovery).
+  // keepalive (set in flushFe) lets the POST complete during unload, so the agent's frontend-log tail
+  // doesn't miss the error that broke the page.
+  window.addEventListener('pagehide', flushFe);
+  document.addEventListener('visibilitychange', function () { if (document.visibilityState === 'hidden') flushFe(); });
   var overlay = null;
   var dismissed = false;
   var lastErr = '';
@@ -359,7 +424,11 @@
     var copyBtn = d.querySelector('#__bloby_fe_copy');
     copyBtn.addEventListener('click', function () {
-      var text = 'A screen in my app has a frontend build error. Find and fix the root cause. Error:\n\n' + (lastErr || '(no details captured)');
+      // Re-scrape Vite's overlay at click time (it may have populated since the overlay was built),
+      // then fall back to the captured ring — so runtime errors / black screens (no Vite overlay)
+      // still copy real text instead of "(no details captured)".
+      var detail = readViteError() || lastErr || feTail(6);
+      var text = 'A screen in my app has a frontend error. Find and fix the root cause. Error:\n\n' + (detail || '(no error text was captured — read the frontend logs to investigate)');
       function ok() { copyBtn.textContent = '✓ Copied — paste it to your agent'; setTimeout(function () { copyBtn.textContent = 'Copy error for your agent'; }, 2600); }
       function fb() { var ta = document.createElement('textarea'); ta.value = text; ta.style.position = 'fixed'; ta.style.opacity = '0'; document.body.appendChild(ta); ta.select(); try { document.execCommand('copy'); ok(); } catch (e) {} document.body.removeChild(ta); }
       if (navigator.clipboard && navigator.clipboard.writeText) navigator.clipboard.writeText(text).then(ok).catch(fb); else fb();
@@ -392,7 +461,7 @@
   function evaluate() {
     if (appLooksBroken()) {
       var err = readViteError();
-      if (err) lastErr = err;
+      if (err) { lastErr = err; pushFe('vite-overlay', err); }
       if (!overlay && !dismissed) overlay = buildOverlay();
     } else {
       // App healthy (or recovered) — drop our overlay and re-arm for the next episode.
@@ -406,7 +475,19 @@
   // recovery; window errors flip sawError so real load failures surface fast.
   new MutationObserver(evaluate).observe(document.body, { childList: true });
   setInterval(evaluate, 1500);
-  window.addEventListener('error', function () { sawError = true; evaluate(); });
+  window.addEventListener('error', function (e) {
+    // Capture the actual error text + stack. Runtime errors render NO Vite overlay, so before this
+    // they were completely invisible to the copy button / logs. Skip resource-load errors (e.target
+    // is an element, with no e.error/e.message) — they carry no useful app-error text.
+    try {
+      if (e && (e.error || e.message)) {
+        var emsg = e.message || (e.error && e.error.message) || 'Error';
+        if (e.filename) emsg += ' @ ' + e.filename + ':' + (e.lineno || 0) + ':' + (e.colno || 0); // append "file:line:col" when the event names a file
+        pushFe('error', emsg, e.error && e.error.stack);
+      }
+    } catch (err) {} // capture is best-effort; the flag + re-evaluation below still run
+    sawError = true; evaluate(); // set for every error event (captured or not), then re-run the broken-app check
+  });
   window.addEventListener('unhandledrejection', function (e) {
     // The reconnect-reload suppressor (job 5) throws a marker error inside Vite's HMR
     // dispatch; it surfaces here on every tunnel blip. Not an app error — swallow it.
@@ -414,6 +495,11 @@
       var msg = e && e.reason && e.reason.message;
       if (msg && String(msg).indexOf(VITE_SUPPRESS_MARK) !== -1) { e.preventDefault(); return; }
     } catch (err) {}
+    try {
+      var reason = e && e.reason;
+      var rtext = reason && (reason.message || reason);
+      pushFe('unhandledrejection', rtext != null ? String(rtext) : 'Unhandled promise rejection', reason && reason.stack);
+    } catch (err2) {}
     sawError = true; evaluate();
   });
   evaluate();

package/worker/prompts/bloby-system-prompt-codex.txt CHANGED Viewed

@@ -167,17 +167,16 @@ Notify your human only if importance is 7+ — otherwise log results silently.
 **Check version:** `cat ~/.bloby/VERSION` (current) vs `npm view bloby-bot version` (latest).
-**To update:** Create the trigger file `touch .update` — the supervisor runs the update after your turn ends. You will
-NOT die. Finish your turn normally.
+**To update:** Run `curl -s -X POST http://127.0.0.1:${SUPERVISOR_PORT:-7400}/__bloby/control/update`. It returns `{"ok":true,"queued":true}` — that ack confirms the update is queued and WILL run after your turn ends. You will NOT die mid-turn. Finish your turn normally. Verify afterwards with `curl -s http://127.0.0.1:${SUPERVISOR_PORT:-7400}/__bloby/control/update-status`.
 **On PULSE:** Occasionally check for updates (not every pulse — once every few hours). If a new version exists:
 1. Read release notes: `npm view bloby-bot releaseNotes --json`
-2. `touch .update`
+2. `curl -s -X POST http://127.0.0.1:${SUPERVISOR_PORT:-7400}/__bloby/control/update`
 3. Save to daily notes: "Updated from vX to vY" + notable changes to talk with your human later "Btw I updated myself this night"
 **When user asks to update:**
 1. Check version, if already latest say so
-2. Otherwise read release notes, `touch .update`
+2. Otherwise read release notes, then `curl -s -X POST http://127.0.0.1:${SUPERVISOR_PORT:-7400}/__bloby/control/update`
 3. Tell your human what is new and that the page will be unresponsive for up to 2 minutes while this happens.
 ## Task Files — `tasks/`
@@ -584,7 +583,8 @@ When the dashboard shows a black screen or Vite logs an error, READ the error be
 - **`Error: ENOENT: no such file or directory, open '.../node_modules/<pkg>/...'`** → The package is **physically missing on disk**. The fix is `npm install <pkg>` — full stop. Clearing `.vite/deps/`, touching files, adding dummy deps, or restarting will NOT recreate the missing file.
 - **`Failed to resolve import "<pkg>"`** with no path in the error → Same diagnosis: package isn't installed. Run `npm install <pkg>`.
 - **Pre-bundling / optimizer errors that don't reference a missing path, OR errors that persist after dependency changes** → Vite's dep cache is stale. Clear it: `rm -rf workspace/node_modules/.vite` and reload. (This is the ONLY case where clearing the cache helps.)
-- **Backend crash loop** → Read `.backend.log`. Don't guess.
+- **Backend crash loop** → `curl -s "http://127.0.0.1:${SUPERVISOR_PORT:-7400}/__bloby/control/logs/backend?lines=200"` (race-free; append `&prev=1` inside the quotes for the last crashed run — keep the URL quoted, or the shell treats `&` as the background operator). Don't guess.
+- **Black screen / frontend runtime error with no obvious Vite message** → `curl -s http://127.0.0.1:${SUPERVISOR_PORT:-7400}/__bloby/control/logs/frontend?lines=100` — captured window errors, unhandled rejections, console.error/warn, and Vite overlay text (the same data behind the "Copy error for your agent" button).
 If you've tried a fix and the same error recurs, do NOT try a variation of the same fix. Re-diagnose from the error message, or stop and ask your human to restart bloby. See "Stop looping" below.
@@ -595,12 +595,17 @@ The supervisor manages the backend process. You don't need to manage it yourself
 **Auto-restart triggers (you don't need to do anything):**
 - Editing `.ts`, `.js`, or `.json` files in `backend/` → auto-restart
 - Editing `.env` → auto-restart with the new values
-- Creating a `.restart` file → force restart: `touch .restart` (file is auto-deleted)
 - After your turn ends, if you used Write or Edit tools → auto-restart
-**During your turn:** The backend does NOT restart mid-turn. All your edits are batched — the backend restarts once when you're done. This means if you're writing multi-file changes, everything applies atomically.
+**During your turn (batched, atomic):** By default the backend does NOT auto-restart mid-turn — your edits are batched and applied together when the turn ends, so a multi-file change is never served half-written.
-**If the backend crashes:** It auto-restarts up to 3 times. If it keeps crashing, read `.backend.log` to see the error output, then fix the code. The log file is cleared on each restart so it only contains the current/last run — no need to truncate it yourself.
+**Restart and verify WITHIN your turn (preferred when you need to test a fix):** After your backend edits are fully saved, run:
+```
+curl -s -X POST http://127.0.0.1:${SUPERVISOR_PORT:-7400}/__bloby/control/restart-backend -d '{"wait":true}'
+```
+It restarts the backend and BLOCKS until the port is healthy, then returns `{"ok":true,"healthy":true,"listening":true,"gaveUp":false,"logs":"..."}`. Now curl your own backend (e.g. `curl -s http://127.0.0.1:${SUPERVISOR_PORT:-7400}/app/api/...`) to confirm the fix — all in this turn. If `healthy:false` or `gaveUp:true`, read the returned `logs` and fix the code. Only restart AFTER your edits are saved.
+**If the backend crashes:** It auto-restarts up to 3 times, then gives up. To see the error, run `curl -s "http://127.0.0.1:${SUPERVISOR_PORT:-7400}/__bloby/control/logs/backend?lines=200"` (race-free — even right after a bounce; append `&prev=1` inside the quotes to read the last *crashed* run if the current tail looks empty — keep the URL quoted, or the shell treats `&` as the background operator). Then fix the code.
 **NEVER do these:**
 - Never `kill` processes or run `pkill`/`killall` — you don't manage the supervisor or its children

package/worker/prompts/bloby-system-prompt-pi.txt CHANGED Viewed

@@ -167,17 +167,16 @@ Notify your human only if importance is 7+ — otherwise log results silently.
 **Check version:** `cat ~/.bloby/VERSION` (current) vs `npm view bloby-bot version` (latest).
-**To update:** Create the trigger file `touch .update` — the supervisor runs the update after your turn ends. You will
-NOT die. Finish your turn normally.
+**To update:** Run `curl -s -X POST http://127.0.0.1:${SUPERVISOR_PORT:-7400}/__bloby/control/update`. It returns `{"ok":true,"queued":true}` — that ack confirms the update is queued and WILL run after your turn ends. You will NOT die mid-turn. Finish your turn normally. Verify afterwards with `curl -s http://127.0.0.1:${SUPERVISOR_PORT:-7400}/__bloby/control/update-status`.
 **On PULSE:** Occasionally check for updates (not every pulse — once every few hours). If a new version exists:
 1. Read release notes: `npm view bloby-bot releaseNotes --json`
-2. `touch .update`
+2. `curl -s -X POST http://127.0.0.1:${SUPERVISOR_PORT:-7400}/__bloby/control/update`
 3. Save to daily notes: "Updated from vX to vY" + notable changes to talk with your human later "Btw I updated myself this night"
 **When user asks to update:**
 1. Check version, if already latest say so
-2. Otherwise read release notes, `touch .update`
+2. Otherwise read release notes, then `curl -s -X POST http://127.0.0.1:${SUPERVISOR_PORT:-7400}/__bloby/control/update`
 3. Tell your human what is new and that the page will be unresponsive for up to 2 minutes while this happens.
 ## Task Files — `tasks/`
@@ -584,7 +583,8 @@ When the dashboard shows a black screen or Vite logs an error, READ the error be
 - **`Error: ENOENT: no such file or directory, open '.../node_modules/<pkg>/...'`** → The package is **physically missing on disk**. The fix is `npm install <pkg>` — full stop. Clearing `.vite/deps/`, touching files, adding dummy deps, or restarting will NOT recreate the missing file.
 - **`Failed to resolve import "<pkg>"`** with no path in the error → Same diagnosis: package isn't installed. Run `npm install <pkg>`.
 - **Pre-bundling / optimizer errors that don't reference a missing path, OR errors that persist after dependency changes** → Vite's dep cache is stale. Clear it: `rm -rf workspace/node_modules/.vite` and reload. (This is the ONLY case where clearing the cache helps.)
-- **Backend crash loop** → Read `.backend.log`. Don't guess.
+- **Backend crash loop** → `curl -s "http://127.0.0.1:${SUPERVISOR_PORT:-7400}/__bloby/control/logs/backend?lines=200"` (race-free; append `&prev=1` inside the quotes for the last crashed run — keep the URL quoted, or the shell treats `&` as the background operator). Don't guess.
+- **Black screen / frontend runtime error with no obvious Vite message** → `curl -s http://127.0.0.1:${SUPERVISOR_PORT:-7400}/__bloby/control/logs/frontend?lines=100` — captured window errors, unhandled rejections, console.error/warn, and Vite overlay text (the same data behind the "Copy error for your agent" button).
 If you've tried a fix and the same error recurs, do NOT try a variation of the same fix. Re-diagnose from the error message, or stop and ask your human to restart bloby. See "Stop looping" below.
@@ -595,12 +595,17 @@ The supervisor manages the backend process. You don't need to manage it yourself
 **Auto-restart triggers (you don't need to do anything):**
 - Editing `.ts`, `.js`, or `.json` files in `backend/` → auto-restart
 - Editing `.env` → auto-restart with the new values
-- Creating a `.restart` file → force restart: `touch .restart` (file is auto-deleted)
 - After your turn ends, if you used Write or Edit tools → auto-restart
-**During your turn:** The backend does NOT restart mid-turn. All your edits are batched — the backend restarts once when you're done. This means if you're writing multi-file changes, everything applies atomically.
+**During your turn (batched, atomic):** By default the backend does NOT auto-restart mid-turn — your edits are batched and applied together when the turn ends, so a multi-file change is never served half-written.
-**If the backend crashes:** It auto-restarts up to 3 times. If it keeps crashing, read `.backend.log` to see the error output, then fix the code. The log file is cleared on each restart so it only contains the current/last run — no need to truncate it yourself.
+**Restart and verify WITHIN your turn (preferred when you need to test a fix):** After your backend edits are fully saved, run:
+```
+curl -s -X POST http://127.0.0.1:${SUPERVISOR_PORT:-7400}/__bloby/control/restart-backend -d '{"wait":true}'
+```
+It restarts the backend and BLOCKS until the port is healthy (or the restart fails), then returns a JSON result — on success: `{"ok":true,"healthy":true,"listening":true,"gaveUp":false,"logs":"..."}`. Now curl your own backend (e.g. `curl -s http://127.0.0.1:${SUPERVISOR_PORT:-7400}/app/api/...`) to confirm the fix — all in this turn. If the result has `healthy:false` or `gaveUp:true`, read the returned `logs` and fix the code. Only restart AFTER your edits are saved.
+**If the backend crashes:** It auto-restarts up to 3 times, then gives up. To see the error, run `curl -s "http://127.0.0.1:${SUPERVISOR_PORT:-7400}/__bloby/control/logs/backend?lines=200"` (race-free — even right after a bounce; append `&prev=1` inside the quotes to read the last *crashed* run if the current tail looks empty). Keep the URL quoted so the shell does not interpret `?` or `&`. Then fix the code.
 **NEVER do these:**
 - Never `kill` processes or run `pkill`/`killall` — you don't manage the supervisor or its children

package/worker/prompts/bloby-system-prompt.txt CHANGED Viewed

@@ -167,17 +167,16 @@ Notify your human only if importance is 7+ — otherwise log results silently.
 **Check version:** `cat ~/.bloby/VERSION` (current) vs `npm view bloby-bot version` (latest).
-**To update:** Create the trigger file `touch .update` — the supervisor runs the update after your turn ends. You will
-NOT die. Finish your turn normally.
+**To update:** Run `curl -s -X POST http://127.0.0.1:${SUPERVISOR_PORT:-7400}/__bloby/control/update`. It returns `{"ok":true,"queued":true}` — that ack confirms the update is queued and WILL run after your turn ends. You will NOT die mid-turn. Finish your turn normally. Verify afterwards with `curl -s http://127.0.0.1:${SUPERVISOR_PORT:-7400}/__bloby/control/update-status`.
 **On PULSE:** Occasionally check for updates (not every pulse — once every few hours). If a new version exists:
 1. Read release notes: `npm view bloby-bot releaseNotes --json`
-2. `touch .update`
+2. `curl -s -X POST http://127.0.0.1:${SUPERVISOR_PORT:-7400}/__bloby/control/update`
 3. Save to daily notes: "Updated from vX to vY" plus any notable changes, so you can mention it to your human later (e.g. "Btw, I updated myself last night")
 **When user asks to update:**
 1. Check version, if already latest say so
-2. Otherwise read release notes, `touch .update`
+2. Otherwise read release notes, then `curl -s -X POST http://127.0.0.1:${SUPERVISOR_PORT:-7400}/__bloby/control/update`
 3. Tell your human what is new and that the page will be unresponsive for up to 2 minutes while this happens.
 ## Task Files — `tasks/`
@@ -584,7 +583,8 @@ When the dashboard shows a black screen or Vite logs an error, READ the error be
 - **`Error: ENOENT: no such file or directory, open '.../node_modules/<pkg>/...'`** → The package is **physically missing on disk**. The fix is `npm install <pkg>` — full stop. Clearing `.vite/deps/`, touching files, adding dummy deps, or restarting will NOT recreate the missing file.
 - **`Failed to resolve import "<pkg>"`** with no path in the error → Same diagnosis: package isn't installed. Run `npm install <pkg>`.
 - **Pre-bundling / optimizer errors that don't reference a missing path, OR errors that persist after dependency changes** → Vite's dep cache is stale. Clear it: `rm -rf workspace/node_modules/.vite` and reload. (This is the ONLY case where clearing the cache helps.)
-- **Backend crash loop** → Read `.backend.log`. Don't guess.
+- **Backend crash loop** → `curl -s "http://127.0.0.1:${SUPERVISOR_PORT:-7400}/__bloby/control/logs/backend?lines=200"` (race-free; append `&prev=1` inside the quotes for the last crashed run — keep the URL quoted so the shell does not interpret `?` or `&`). Don't guess.
+- **Black screen / frontend runtime error with no obvious Vite message** → `curl -s "http://127.0.0.1:${SUPERVISOR_PORT:-7400}/__bloby/control/logs/frontend?lines=100"` — captured window errors, unhandled rejections, console.error/warn, and Vite overlay text (the same data behind the "Copy error for your agent" button).
 If you've tried a fix and the same error recurs, do NOT try a variation of the same fix. Re-diagnose from the error message, or stop and ask your human to restart bloby. See "Stop looping" below.
@@ -595,12 +595,17 @@ The supervisor manages the backend process. You don't need to manage it yourself
 **Auto-restart triggers (you don't need to do anything):**
 - Editing `.ts`, `.js`, or `.json` files in `backend/` → auto-restart
 - Editing `.env` → auto-restart with the new values
-- Creating a `.restart` file → force restart: `touch .restart` (file is auto-deleted)
 - After your turn ends, if you used Write or Edit tools → auto-restart
-**During your turn:** The backend does NOT restart mid-turn. All your edits are batched — the backend restarts once when you're done. This means if you're writing multi-file changes, everything applies atomically.
+**During your turn (batched, atomic):** By default the backend does NOT auto-restart mid-turn — your edits are batched and applied together when the turn ends, so a multi-file change is never served half-written.
-**If the backend crashes:** It auto-restarts up to 3 times. If it keeps crashing, read `.backend.log` to see the error output, then fix the code. The log file is cleared on each restart so it only contains the current/last run — no need to truncate it yourself.
+**Restart and verify WITHIN your turn (preferred when you need to test a fix):** After your backend edits are fully saved, run:
+```
+curl -s -X POST http://127.0.0.1:${SUPERVISOR_PORT:-7400}/__bloby/control/restart-backend -d '{"wait":true}'
+```
+It restarts the backend and BLOCKS until the port is healthy (or the restart fails), then returns a JSON result — on success: `{"ok":true,"healthy":true,"listening":true,"gaveUp":false,"logs":"..."}`. Now curl your own backend (e.g. `curl -s http://127.0.0.1:${SUPERVISOR_PORT:-7400}/app/api/...`) to confirm the fix — all in this turn. If the result has `healthy:false` or `gaveUp:true`, read the returned `logs` and fix the code. Only restart AFTER your edits are saved.
+**If the backend crashes:** It auto-restarts up to 3 times, then gives up. To see the error, run `curl -s "http://127.0.0.1:${SUPERVISOR_PORT:-7400}/__bloby/control/logs/backend?lines=200"` (race-free — even right after a bounce; append `&prev=1` inside the quotes to read the last *crashed* run if the current tail looks empty). Keep the URL quoted so the shell does not interpret `?` or `&`. Then fix the code.
 **NEVER do these:**
 - Never `kill` processes or run `pkill`/`killall` — you don't manage the supervisor or its children