bloby-bot 0.53.9 → 0.54.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -2
- package/shared/config.ts +5 -0
- package/supervisor/backend.ts +29 -4
- package/supervisor/channels/manager.ts +81 -19
- package/supervisor/channels/types.ts +5 -0
- package/supervisor/chat/bloby-main.tsx +1 -1
- package/supervisor/chat/src/components/Chat/EnvForm.tsx +2 -1
- package/supervisor/chat/src/hooks/useChat.ts +6 -5
- package/supervisor/harnesses/claude.ts +12 -2
- package/supervisor/harnesses/codex.ts +117 -22
- package/supervisor/harnesses/pi/index.ts +8 -1
- package/supervisor/index.ts +218 -53
- package/worker/prompts/bloby-system-prompt-codex.txt +778 -0
- package/worker/prompts/bloby-system-prompt-pi.txt +778 -0
- package/worker/prompts/prompt-assembler.ts +49 -14
- package/workspace/skills/whatsapp/SKILL.md +25 -2
package/supervisor/index.ts
CHANGED
|
@@ -11,7 +11,7 @@ import { log } from '../shared/logger.js';
|
|
|
11
11
|
import { startTunnel, stopTunnel, isTunnelAlive, restartTunnel, startNamedTunnel, restartNamedTunnel } from './tunnel.js';
|
|
12
12
|
import { createWorkerApp } from '../worker/index.js';
|
|
13
13
|
import { closeDb, getSession, getSetting } from '../worker/db.js';
|
|
14
|
-
import { spawnBackend, stopBackend, restartBackend, getBackendPort, isBackendAlive, isBackendStopping, isBackendDead, readBackendLogTail, setBackendEnv } from './backend.js';
|
|
14
|
+
import { spawnBackend, stopBackend, restartBackend, getBackendPort, isBackendAlive, isBackendStopping, isBackendDead, readBackendLogTail, setBackendEnv, setBackendGiveUpHandler } from './backend.js';
|
|
15
15
|
import { handleAgentQuery, type AgentQueryRequest } from './agent-api.js';
|
|
16
16
|
import { updateTunnelUrl, startHeartbeat, stopHeartbeat, disconnect } from '../shared/relay.js';
|
|
17
17
|
import {
|
|
@@ -406,10 +406,21 @@ export async function startSupervisor() {
|
|
|
406
406
|
// The request handler is set up later via server.on('request')
|
|
407
407
|
const server = http.createServer();
|
|
408
408
|
|
|
409
|
-
// Start Vite dev server — pass supervisor server so Vite attaches HMR WebSocket directly
|
|
409
|
+
// Start Vite dev server — pass supervisor server so Vite attaches HMR WebSocket directly.
|
|
410
|
+
// A Vite boot failure must NOT take down the supervisor (G1): chat is served independently and
|
|
411
|
+
// is the lifeline the user needs to ask the agent to fix things. On failure we fall back to a
|
|
412
|
+
// sentinel port — the dashboard proxy then serves the branded "Reconnecting" page (which polls
|
|
413
|
+
// and carries the chat widget) while chat, the worker API, channels, and the tunnel all still
|
|
414
|
+
// come up. Previously this throw reached the top-level catch → process.exit before chat listened.
|
|
410
415
|
console.log('[supervisor] Starting Vite dev server...');
|
|
411
|
-
|
|
412
|
-
|
|
416
|
+
let vitePorts: { dashboard: number };
|
|
417
|
+
try {
|
|
418
|
+
vitePorts = await startViteDevServers(config.port, server);
|
|
419
|
+
console.log(`[supervisor] Vite ready — dashboard :${vitePorts.dashboard}`);
|
|
420
|
+
} catch (err) {
|
|
421
|
+
log.error(`Vite dev server failed to start — dashboard degraded, chat still available: ${err instanceof Error ? err.message : err}`);
|
|
422
|
+
vitePorts = { dashboard: -1 }; // sentinel → dashboard proxy serves RECOVERING_HTML
|
|
423
|
+
}
|
|
413
424
|
console.log(`[supervisor] Upgrade listeners on server: ${server.listenerCount('upgrade')}`);
|
|
414
425
|
|
|
415
426
|
// Ensure file storage dirs exist
|
|
@@ -504,36 +515,31 @@ export async function startSupervisor() {
|
|
|
504
515
|
}
|
|
505
516
|
}
|
|
506
517
|
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
518
|
+
// SECURITY MODEL — public allowlist (secure by default). When a portal password is set, EVERY
|
|
519
|
+
// /api/* route requires a valid Bearer token EXCEPT the ones listed here (the pre-login surface:
|
|
520
|
+
// login/onboarding/health/non-secret config). A new /api route is therefore GATED by default —
|
|
521
|
+
// it can only leak if someone explicitly adds it here. (Previously the gate skipped ALL GET/HEAD,
|
|
522
|
+
// so every data read — conversations, context, wallet, devices — was readable with no token.)
|
|
523
|
+
// Note: /api/agent/* (agent secret) and /api/channels/* are intercepted + returned BEFORE this
|
|
524
|
+
// gate, so they're unaffected; the channel entries below are belt-and-suspenders.
|
|
525
|
+
const PUBLIC_PRELOGIN_ROUTES = [
|
|
513
526
|
'GET /api/health',
|
|
514
|
-
|
|
515
|
-
//
|
|
516
|
-
// afterward. Re-onboard from the dashboard uses the internal x-internal WS path.
|
|
527
|
+
'GET /api/onboard/status',
|
|
528
|
+
'GET /api/settings', // secrets already stripped (worker denylist); widget + onboard read flags pre-login
|
|
517
529
|
'GET /api/push/vapid-public-key',
|
|
518
|
-
'GET /api/
|
|
519
|
-
'POST /api/
|
|
520
|
-
'
|
|
521
|
-
'
|
|
522
|
-
'POST /api/auth/codex/start',
|
|
523
|
-
'POST /api/auth/codex/cancel',
|
|
524
|
-
'GET /api/auth/codex/status',
|
|
525
|
-
'GET /api/auth/pi/providers',
|
|
526
|
-
'GET /api/auth/pi/status',
|
|
527
|
-
'POST /api/auth/pi/test',
|
|
528
|
-
'POST /api/auth/pi/save',
|
|
529
|
-
'DELETE /api/auth/pi',
|
|
530
|
-
'POST /api/auth/pi/completion',
|
|
531
|
-
'POST /api/portal/totp/setup',
|
|
532
|
-
'POST /api/portal/totp/verify-setup',
|
|
533
|
-
'POST /api/portal/totp/disable',
|
|
534
|
-
'GET /api/portal/totp/status',
|
|
530
|
+
'GET /api/portal/login',
|
|
531
|
+
'POST /api/portal/login',
|
|
532
|
+
'GET /api/portal/validate-token',
|
|
533
|
+
'POST /api/portal/validate-token',
|
|
535
534
|
'GET /api/portal/login/totp',
|
|
536
|
-
'
|
|
535
|
+
'GET /api/portal/totp/status',
|
|
536
|
+
'POST /api/portal/totp/setup', // self-protected in-handler (Bearer OR password OR first-run)
|
|
537
|
+
'POST /api/portal/totp/verify-setup', // self-protected in-handler
|
|
538
|
+
'POST /api/portal/totp/disable', // self-protected (requires password + valid code)
|
|
539
|
+
'POST /api/portal/verify-password', // verifies the password itself — cannot require a token
|
|
540
|
+
// NOTE: 'POST /api/onboard' is NOT public — gated below: open only on genuine first run
|
|
541
|
+
// (no portal_pass yet), token-required afterward. Dashboard re-onboard uses the x-internal WS path.
|
|
542
|
+
// Channel onboarding (also intercepted earlier; kept for completeness/safety):
|
|
537
543
|
'GET /api/channels/status',
|
|
538
544
|
'GET /api/channels/whatsapp/qr',
|
|
539
545
|
'GET /api/channels/whatsapp/qr-page',
|
|
@@ -546,12 +552,23 @@ export async function startSupervisor() {
|
|
|
546
552
|
'POST /api/channels/send',
|
|
547
553
|
'POST /api/channels/alexa/handle',
|
|
548
554
|
];
|
|
555
|
+
// Method-specific public PREFIXES — onboarding namespaces with sub-paths / params that carry no
|
|
556
|
+
// private chat data: provider OAuth setup/status (all of /api/auth/*), and handle availability
|
|
557
|
+
// (GET /api/handle/* only — handle register/change are POSTs and stay gated).
|
|
558
|
+
const PUBLIC_PRELOGIN_PREFIXES = [
|
|
559
|
+
'POST /api/auth/',
|
|
560
|
+
'GET /api/auth/',
|
|
561
|
+
'DELETE /api/auth/',
|
|
562
|
+
'GET /api/handle/',
|
|
563
|
+
];
|
|
549
564
|
|
|
550
|
-
function
|
|
565
|
+
function isPublicRoute(method: string, url: string): boolean {
|
|
566
|
+
const m = method === 'HEAD' ? 'GET' : method; // a HEAD to a public GET route is public
|
|
551
567
|
const path = url.split('?')[0];
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
568
|
+
if (PUBLIC_PRELOGIN_ROUTES.includes(`${m} ${path}`)) return true;
|
|
569
|
+
return PUBLIC_PRELOGIN_PREFIXES.some((r) => {
|
|
570
|
+
const sp = r.indexOf(' ');
|
|
571
|
+
return m === r.slice(0, sp) && path.startsWith(r.slice(sp + 1));
|
|
555
572
|
});
|
|
556
573
|
}
|
|
557
574
|
|
|
@@ -645,6 +662,32 @@ export async function startSupervisor() {
|
|
|
645
662
|
res.setHeader('Content-Type', 'application/json');
|
|
646
663
|
res.setHeader('Cache-Control', 'no-store, no-cache, must-revalidate');
|
|
647
664
|
|
|
665
|
+
// ── Loopback-only guard for channel MUTATION endpoints ──
|
|
666
|
+
// These are only ever called by the local agent over loopback (curl localhost:7400),
|
|
667
|
+
// never legitimately from the public Cloudflare tunnel. Without this, an unauthenticated
|
|
668
|
+
// remote request could set mode/admins or flip allowOthersToTrigger and seize the agent
|
|
669
|
+
// (it can run Bash/edit files). Same guard the Agent API uses: cloudflared forwards over
|
|
670
|
+
// loopback so the IP check alone is a no-op behind the relay — we also reject any request
|
|
671
|
+
// carrying cloudflared's cf-connecting-ip/cf-ray (tunnel-origin) headers. Reads (status,
|
|
672
|
+
// qr, qr-page) and alexa/handle (relay-origin, secret-gated) deliberately stay public.
|
|
673
|
+
const WA_MUTATION_ROUTES = new Set([
|
|
674
|
+
'POST /api/channels/whatsapp/configure',
|
|
675
|
+
'POST /api/channels/whatsapp/connect',
|
|
676
|
+
'POST /api/channels/whatsapp/disconnect',
|
|
677
|
+
'POST /api/channels/whatsapp/logout',
|
|
678
|
+
'POST /api/channels/whatsapp/pairing-code',
|
|
679
|
+
'POST /api/channels/send',
|
|
680
|
+
]);
|
|
681
|
+
if (WA_MUTATION_ROUTES.has(`${req.method} ${channelPath}`)) {
|
|
682
|
+
const remoteIp = req.socket.remoteAddress || '';
|
|
683
|
+
const isLoopback = remoteIp === '127.0.0.1' || remoteIp === '::1' || remoteIp === '::ffff:127.0.0.1';
|
|
684
|
+
if (!isLoopback || req.headers['cf-connecting-ip'] || req.headers['cf-ray']) {
|
|
685
|
+
res.writeHead(403);
|
|
686
|
+
res.end(JSON.stringify({ ok: false, error: 'This channel endpoint is localhost-only.' }));
|
|
687
|
+
return;
|
|
688
|
+
}
|
|
689
|
+
}
|
|
690
|
+
|
|
648
691
|
// GET /api/channels/status — all channel statuses
|
|
649
692
|
if (req.method === 'GET' && channelPath === '/api/channels/status') {
|
|
650
693
|
res.writeHead(200);
|
|
@@ -935,6 +978,7 @@ ${!connected ? `<script>
|
|
|
935
978
|
if (data.admins !== undefined) cfg.channels.whatsapp.admins = data.admins;
|
|
936
979
|
if (data.skill !== undefined) cfg.channels.whatsapp.skill = data.skill;
|
|
937
980
|
if (data.allowGroups !== undefined) cfg.channels.whatsapp.allowGroups = !!data.allowGroups;
|
|
981
|
+
if (data.allowOthersToTrigger !== undefined) cfg.channels.whatsapp.allowOthersToTrigger = !!data.allowOthersToTrigger;
|
|
938
982
|
saveConfig(cfg);
|
|
939
983
|
res.writeHead(200);
|
|
940
984
|
res.end(JSON.stringify({ ok: true, config: cfg.channels.whatsapp }));
|
|
@@ -1310,11 +1354,31 @@ mint();
|
|
|
1310
1354
|
return;
|
|
1311
1355
|
}
|
|
1312
1356
|
|
|
1313
|
-
// POST /api/env — write env vars to workspace .env (used by chat EnvForm)
|
|
1357
|
+
// POST /api/env — write env vars to workspace .env (used by chat EnvForm).
|
|
1314
1358
|
if (req.method === 'POST' && req.url === '/api/env') {
|
|
1359
|
+
// This route is intercepted before the /api worker gate, so gate it here: writing the
|
|
1360
|
+
// workspace .env (which the backend loads) is a privileged action — require the portal token
|
|
1361
|
+
// when a password is set. Internal supervisor calls (x-internal) bypass. The chat EnvForm
|
|
1362
|
+
// sends the token via authFetch.
|
|
1363
|
+
if (req.headers['x-internal'] !== internalSecret && await isAuthRequired()) {
|
|
1364
|
+
const authHeader = req.headers['authorization'];
|
|
1365
|
+
const token = authHeader?.startsWith('Bearer ') ? authHeader.slice(7) : null;
|
|
1366
|
+
if (!token || !(await validateToken(token))) {
|
|
1367
|
+
res.writeHead(401, { 'Content-Type': 'application/json' });
|
|
1368
|
+
res.end(JSON.stringify({ error: 'Unauthorized' }));
|
|
1369
|
+
return;
|
|
1370
|
+
}
|
|
1371
|
+
}
|
|
1315
1372
|
let body = '';
|
|
1316
|
-
|
|
1373
|
+
let bodyBytes = 0;
|
|
1374
|
+
let tooLarge = false;
|
|
1375
|
+
req.on('data', (chunk: Buffer) => {
|
|
1376
|
+
bodyBytes += chunk.length;
|
|
1377
|
+
if (bodyBytes > 1_000_000) { tooLarge = true; req.destroy(); return; } // .env is tiny; 1MB is generous
|
|
1378
|
+
body += chunk.toString();
|
|
1379
|
+
});
|
|
1317
1380
|
req.on('end', () => {
|
|
1381
|
+
if (tooLarge) return;
|
|
1318
1382
|
try {
|
|
1319
1383
|
const { vars } = JSON.parse(body) as { vars: Record<string, string> };
|
|
1320
1384
|
if (!vars || typeof vars !== 'object') {
|
|
@@ -1331,7 +1395,20 @@ mint();
|
|
|
1331
1395
|
|
|
1332
1396
|
for (const [rawKey, rawValue] of Object.entries(vars)) {
|
|
1333
1397
|
const key = rawKey.trim();
|
|
1398
|
+
// Validate the key as a real env var name; reject anything that could inject extra
|
|
1399
|
+
// lines or break .env parsing.
|
|
1400
|
+
if (!/^[A-Za-z_][A-Za-z0-9_]*$/.test(key)) {
|
|
1401
|
+
res.writeHead(400, { 'Content-Type': 'application/json' });
|
|
1402
|
+
res.end(JSON.stringify({ error: `Invalid env var name: ${rawKey}` }));
|
|
1403
|
+
return;
|
|
1404
|
+
}
|
|
1334
1405
|
const value = typeof rawValue === 'string' ? rawValue.trim() : rawValue;
|
|
1406
|
+
// Reject embedded newlines/CR in the value — they'd inject arbitrary extra .env lines.
|
|
1407
|
+
if (typeof value === 'string' && /[\r\n]/.test(value)) {
|
|
1408
|
+
res.writeHead(400, { 'Content-Type': 'application/json' });
|
|
1409
|
+
res.end(JSON.stringify({ error: `Invalid value for ${key}: must not contain newlines` }));
|
|
1410
|
+
return;
|
|
1411
|
+
}
|
|
1335
1412
|
// Find existing line for this key (supports KEY=val, KEY="val", KEY='val')
|
|
1336
1413
|
const idx = lines.findIndex((l) => {
|
|
1337
1414
|
const trimmed = l.trim();
|
|
@@ -1363,20 +1440,29 @@ mint();
|
|
|
1363
1440
|
if (req.url?.startsWith('/api/agent/')) {
|
|
1364
1441
|
const agentPath = req.url.split('?')[0];
|
|
1365
1442
|
|
|
1366
|
-
// Localhost-only guard
|
|
1443
|
+
// Localhost-only guard. NOTE: cloudflared connects over loopback, so a tunneled request also
|
|
1444
|
+
// arrives as 127.0.0.1 — this IP check alone is a no-op behind the relay. We additionally
|
|
1445
|
+
// reject any request carrying cloudflared's cf-connecting-ip or cf-ray header (tunnel-origin traffic);
|
|
1446
|
+
// the real defense is the 256-bit agent secret below. The Agent API is only ever called by
|
|
1447
|
+
// the local workspace backend (loopback), never legitimately from the public tunnel.
|
|
1367
1448
|
const remoteIp = req.socket.remoteAddress || '';
|
|
1368
|
-
|
|
1449
|
+
const isLoopback = remoteIp === '127.0.0.1' || remoteIp === '::1' || remoteIp === '::ffff:127.0.0.1';
|
|
1450
|
+
if (!isLoopback || req.headers['cf-connecting-ip'] || req.headers['cf-ray']) {
|
|
1369
1451
|
res.setHeader('Content-Type', 'application/json');
|
|
1370
1452
|
res.writeHead(403);
|
|
1371
1453
|
res.end(JSON.stringify({ ok: false, error: 'Agent API is localhost-only.' }));
|
|
1372
1454
|
return;
|
|
1373
1455
|
}
|
|
1374
1456
|
|
|
1375
|
-
// Auth: x-agent-secret header (query param fallback for EventSource which cannot set headers)
|
|
1457
|
+
// Auth: x-agent-secret header (query param fallback for EventSource which cannot set headers).
|
|
1458
|
+
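// Constant-time compare so the secret can't be recovered via timing. NOTE(review): the guard compares string lengths, but timingSafeEqual throws when the BYTE lengths differ — a non-ASCII value of equal string length would throw here; confirm this path is wrapped in a try/catch.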
|
|
1376
1459
|
const urlObj = new URL(req.url, `http://${req.headers.host}`);
|
|
1377
1460
|
const headerSecret = req.headers['x-agent-secret'];
|
|
1378
1461
|
const querySecret = urlObj.searchParams.get('secret');
|
|
1379
|
-
|
|
1462
|
+
const presented = typeof headerSecret === 'string' ? headerSecret : (querySecret || '');
|
|
1463
|
+
const secretOk = presented.length === agentSecret.length &&
|
|
1464
|
+
crypto.timingSafeEqual(Buffer.from(presented), Buffer.from(agentSecret));
|
|
1465
|
+
if (!secretOk) {
|
|
1380
1466
|
res.setHeader('Content-Type', 'application/json');
|
|
1381
1467
|
res.writeHead(401);
|
|
1382
1468
|
res.end(JSON.stringify({ ok: false, error: 'Invalid or missing x-agent-secret.' }));
|
|
@@ -1728,9 +1814,11 @@ mint();
|
|
|
1728
1814
|
const isInternal = req.headers['x-internal'] === internalSecret;
|
|
1729
1815
|
|
|
1730
1816
|
if (!isInternal) {
|
|
1731
|
-
//
|
|
1817
|
+
// Require a token for EVERY /api route except the public pre-login allowlist. This now
|
|
1818
|
+
// covers GET data reads (conversations, context, wallet, devices, push status) that
|
|
1819
|
+
// previously skipped auth and leaked over the public relay.
|
|
1732
1820
|
const method = req.method || 'GET';
|
|
1733
|
-
if (
|
|
1821
|
+
if (!isPublicRoute(method, req.url || '')) {
|
|
1734
1822
|
// POST /api/onboard is open only on genuine first run (no portal_pass yet). Read the
|
|
1735
1823
|
// setting DIRECTLY rather than the 30s-cached isAuthRequired() so the gate closes the
|
|
1736
1824
|
// instant onboarding sets a password — no stale-cache window for a takeover. The
|
|
@@ -1848,6 +1936,14 @@ mint();
|
|
|
1848
1936
|
return;
|
|
1849
1937
|
}
|
|
1850
1938
|
|
|
1939
|
+
// Vite failed to boot (sentinel port) → serve the recovering page directly instead of
|
|
1940
|
+
// proxying to a dead port. Chat (/bloby/*) is served earlier, so the lifeline stays up.
|
|
1941
|
+
if (vitePorts.dashboard < 0) {
|
|
1942
|
+
res.writeHead(503, { 'Content-Type': 'text/html', 'Cache-Control': 'no-store, no-cache, must-revalidate' });
|
|
1943
|
+
res.end(RECOVERING_HTML);
|
|
1944
|
+
return;
|
|
1945
|
+
}
|
|
1946
|
+
|
|
1851
1947
|
// Everything else → proxy to dashboard Vite dev server
|
|
1852
1948
|
console.log(`[supervisor] → dashboard Vite :${vitePorts.dashboard} | ${req.method} ${(req.url || '').split('?')[0]}`);
|
|
1853
1949
|
const GUARD_TAG = '<script defer src="/bloby/workspace-guard.js"></script>';
|
|
@@ -1902,6 +1998,11 @@ mint();
|
|
|
1902
1998
|
// An 'error' event with no listener is rethrown by Node as an uncaught exception,
|
|
1903
1999
|
// which would crash the whole supervisor. ws still tears down + fires 'close'.
|
|
1904
2000
|
ws.on('error', (err: any) => console.warn(`[app-ws] socket error: ${err?.message || err}`));
|
|
2001
|
+
// Liveness: a half-open socket (mobile/Wi-Fi drop behind the tunnel) never fires 'close', so
|
|
2002
|
+
// its chat subscription + maps would leak and broadcastBloby would keep writing to it. The
|
|
2003
|
+
// heartbeat below pings; a peer that misses a pong is terminated (which fires 'close' → cleanup).
|
|
2004
|
+
(ws as any).isAlive = true;
|
|
2005
|
+
ws.on('pong', () => { (ws as any).isAlive = true; });
|
|
1905
2006
|
|
|
1906
2007
|
// Per-WS chat subscription: when the client opts in, this WS joins chatSubscribers
|
|
1907
2008
|
// and receives every bot:* / chat:* event the dashboard widget does. SSE through the
|
|
@@ -2176,6 +2277,8 @@ mint();
|
|
|
2176
2277
|
// See appWss above: a listener-less 'error' event would crash the supervisor and kill
|
|
2177
2278
|
// chat for everyone (G1). ws still fires 'close' afterward, so map cleanup still runs.
|
|
2178
2279
|
ws.on('error', (err: any) => log.warn(`[bloby-ws] socket error: ${err?.message || err}`));
|
|
2280
|
+
(ws as any).isAlive = true;
|
|
2281
|
+
ws.on('pong', () => { (ws as any).isAlive = true; });
|
|
2179
2282
|
let convId = Math.random().toString(36).slice(2) + Date.now().toString(36);
|
|
2180
2283
|
conversations.set(ws, []);
|
|
2181
2284
|
|
|
@@ -2703,6 +2806,12 @@ mint();
|
|
|
2703
2806
|
}
|
|
2704
2807
|
}
|
|
2705
2808
|
|
|
2809
|
+
// Tell the live chat when the backend gives up — the dashboard interstitial covers page loads,
|
|
2810
|
+
// but an already-open chat client gets an explicit event it can surface ("ask me to fix the backend").
|
|
2811
|
+
setBackendGiveUpHandler(() => {
|
|
2812
|
+
broadcastBloby('backend:failed', { message: 'The workspace backend crashed and could not restart. Ask Bloby to fix it.' });
|
|
2813
|
+
});
|
|
2814
|
+
|
|
2706
2815
|
// Spawn backend (worker runs in-process)
|
|
2707
2816
|
spawnBackend(backendPort);
|
|
2708
2817
|
|
|
@@ -2788,14 +2897,36 @@ mint();
|
|
|
2788
2897
|
}, 1000);
|
|
2789
2898
|
}
|
|
2790
2899
|
|
|
2791
|
-
//
|
|
2792
|
-
|
|
2793
|
-
|
|
2794
|
-
|
|
2795
|
-
|
|
2900
|
+
// Self-healing file watchers. Two failure modes the audit flagged, both of which would hurt G3
|
|
2901
|
+
// (auto-heal) or G1 (chat): (a) fs.watch throws synchronously if its target is missing — at boot
|
|
2902
|
+
// that reached the top-level catch → process.exit before chat listened; (b) a watcher 'error'
|
|
2903
|
+
// event (EMFILE under load, the watched inode removed during a workspace swap) has no listener,
|
|
2904
|
+
// so it crashes the supervisor AND leaves a silently-dead watcher (auto-heal stops with no
|
|
2905
|
+
// signal). Fix: ensure the backend dir exists, attach an 'error' listener, and re-arm after a fixed delay (2s after a watcher error, 5s after a failed arm).
|
|
2906
|
+
let backendWatcher: fs.FSWatcher | null = null;
|
|
2907
|
+
let workspaceWatcher: fs.FSWatcher | null = null;
|
|
2796
2908
|
|
|
2797
|
-
|
|
2798
|
-
|
|
2909
|
+
function armBackendWatcher() {
|
|
2910
|
+
try {
|
|
2911
|
+
fs.mkdirSync(backendDir, { recursive: true }); // fs.watch throws if the target is missing
|
|
2912
|
+
const w = fs.watch(backendDir, { recursive: true }, (_event, filename) => {
|
|
2913
|
+
if (!filename || !filename.toString().match(/\.(ts|js|json)$/)) return;
|
|
2914
|
+
scheduleBackendRestart(`Backend file changed: ${filename}`);
|
|
2915
|
+
});
|
|
2916
|
+
w.on('error', (err: any) => {
|
|
2917
|
+
log.warn(`[watcher] backend watcher error: ${err?.message || err} — re-arming in 2s`);
|
|
2918
|
+
try { w.close(); } catch {}
|
|
2919
|
+
backendWatcher = null;
|
|
2920
|
+
setTimeout(armBackendWatcher, 2000);
|
|
2921
|
+
});
|
|
2922
|
+
backendWatcher = w;
|
|
2923
|
+
} catch (err: any) {
|
|
2924
|
+
log.warn(`[watcher] backend watcher failed to arm: ${err?.message || err} — retry in 5s`);
|
|
2925
|
+
setTimeout(armBackendWatcher, 5000);
|
|
2926
|
+
}
|
|
2927
|
+
}
|
|
2928
|
+
|
|
2929
|
+
function onWorkspaceChange(_event: fs.WatchEventType, filename: string | Buffer | null) {
|
|
2799
2930
|
if (!filename) return;
|
|
2800
2931
|
if (filename === '.env') {
|
|
2801
2932
|
scheduleBackendRestart('.env changed');
|
|
@@ -2832,7 +2963,40 @@ mint();
|
|
|
2832
2963
|
runDeferredUpdate();
|
|
2833
2964
|
}
|
|
2834
2965
|
}
|
|
2835
|
-
}
|
|
2966
|
+
}
|
|
2967
|
+
|
|
2968
|
+
function armWorkspaceWatcher() {
|
|
2969
|
+
try {
|
|
2970
|
+
const w = fs.watch(workspaceDir, onWorkspaceChange);
|
|
2971
|
+
w.on('error', (err: any) => {
|
|
2972
|
+
log.warn(`[watcher] workspace watcher error: ${err?.message || err} — re-arming in 2s`);
|
|
2973
|
+
try { w.close(); } catch {}
|
|
2974
|
+
workspaceWatcher = null;
|
|
2975
|
+
setTimeout(armWorkspaceWatcher, 2000);
|
|
2976
|
+
});
|
|
2977
|
+
workspaceWatcher = w;
|
|
2978
|
+
} catch (err: any) {
|
|
2979
|
+
log.warn(`[watcher] workspace watcher failed to arm: ${err?.message || err} — retry in 5s`);
|
|
2980
|
+
setTimeout(armWorkspaceWatcher, 5000);
|
|
2981
|
+
}
|
|
2982
|
+
}
|
|
2983
|
+
|
|
2984
|
+
armBackendWatcher();
|
|
2985
|
+
armWorkspaceWatcher();
|
|
2986
|
+
|
|
2987
|
+
// WebSocket liveness heartbeat — ping the app + chat WS clients every 30s and terminate any
|
|
2988
|
+
// that missed the previous pong (half-open sockets that never fired 'close'). Terminating fires
|
|
2989
|
+
// 'close', which runs the existing map/subscription cleanup. Scoped to our two WSS only (Vite's
|
|
2990
|
+
// HMR socket is separate and managed by Vite). Cleared in shutdown().
|
|
2991
|
+
const wsHeartbeat = setInterval(() => {
|
|
2992
|
+
for (const wss of [blobyWss, appWss]) {
|
|
2993
|
+
for (const ws of wss.clients) {
|
|
2994
|
+
if ((ws as any).isAlive === false) { try { ws.terminate(); } catch {} continue; }
|
|
2995
|
+
(ws as any).isAlive = false;
|
|
2996
|
+
try { ws.ping(); } catch {}
|
|
2997
|
+
}
|
|
2998
|
+
}
|
|
2999
|
+
}, 30_000);
|
|
2836
3000
|
|
|
2837
3001
|
// Tunnel
|
|
2838
3002
|
let tunnelUrl: string | null = null;
|
|
@@ -2994,8 +3158,9 @@ mint();
|
|
|
2994
3158
|
log.info('Shutting down...');
|
|
2995
3159
|
await channelManager.disconnectAll();
|
|
2996
3160
|
stopScheduler();
|
|
2997
|
-
backendWatcher
|
|
2998
|
-
workspaceWatcher
|
|
3161
|
+
backendWatcher?.close();
|
|
3162
|
+
workspaceWatcher?.close();
|
|
3163
|
+
clearInterval(wsHeartbeat);
|
|
2999
3164
|
if (backendRestartTimer) clearTimeout(backendRestartTimer);
|
|
3000
3165
|
if (watchdogInterval) clearInterval(watchdogInterval);
|
|
3001
3166
|
stopHeartbeat();
|