@hienlh/ppm 0.8.71 → 0.8.72

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,21 @@
1
1
  # Changelog
2
2
 
3
+ ## [0.8.72] - 2026-03-31
4
+
5
+ ### Added
6
+ - **Supervisor state machine**: States `running → paused → upgrading` with promise-based wait/resume. Supervisor pauses after 10 consecutive crashes, resumes via `ppm restart --force` or SIGUSR2
7
+ - **Cloud WebSocket client**: Persistent WS connection from supervisor to PPM Cloud replacing HTTP heartbeat — auto-reconnect with exponential backoff + jitter, 60s heartbeat, 50-message offline queue
8
+ - **Remote commands via Cloud**: Supervisor handles restart/stop/upgrade/resume/status commands received from Cloud WS
9
+ - **`ppm restart --force`**: Resume a paused supervisor (crashed too many times)
10
+ - **Status CLI state display**: `ppm status` shows paused/upgrading state with reason, timestamp, and last crash error
11
+
12
+ ### Changed
13
+ - **Foreground mode removed**: `ppm start` no longer accepts `-f`/`--foreground` — always runs as supervised daemon
14
+ - **Heartbeat via WS**: Cloud heartbeat migrated from HTTP polling (5min) to WebSocket (60s), includes `appVersion`, `serverPid`, `uptime`
15
+
16
+ ### Fixed
17
+ - **Upgrade failure recovery**: `selfReplace` failure now correctly resets state from "upgrading" back to "running" and notifies Cloud
18
+
3
19
  ## [0.8.71] - 2026-03-31
4
20
 
5
21
  ### Added
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@hienlh/ppm",
3
- "version": "0.8.71",
3
+ "version": "0.8.72",
4
4
  "description": "Personal Project Manager — mobile-first web IDE with AI assistance",
5
5
  "author": "hienlh",
6
6
  "license": "MIT",
@@ -9,7 +9,7 @@ const RESTARTING_FLAG = resolve(PPM_DIR, ".restarting");
9
9
  const RESTART_RESULT = resolve(PPM_DIR, ".restart-result");
10
10
 
11
11
  /** Restart only the server process, keeping the tunnel alive */
12
- export async function restartServer(options: { config?: string }) {
12
+ export async function restartServer(options: { config?: string; force?: boolean }) {
13
13
  // Ignore SIGHUP so this process survives when PPM terminal dies
14
14
  process.on("SIGHUP", () => {});
15
15
 
@@ -34,6 +34,14 @@ export async function restartServer(options: { config?: string }) {
34
34
  process.exit(1);
35
35
  }
36
36
 
37
+ // Check if supervisor is paused — require --force to resume
38
+ const state = status.state as string | undefined;
39
+ if (state === "paused" && !options.force) {
40
+ console.log("\n Server is paused (crashed too many times).");
41
+ console.log(" Use 'ppm restart --force' to resume.\n");
42
+ process.exit(1);
43
+ }
44
+
37
45
  const oldServerPid = status.pid as number | undefined;
38
46
  console.log("\n Restarting PPM server via supervisor...");
39
47
  console.log(" If you're using PPM terminal, wait a few seconds for auto-reconnect.\n");
@@ -15,6 +15,10 @@ interface DaemonStatus {
15
15
  tunnelAlive: boolean;
16
16
  supervisorPid: number | null;
17
17
  supervisorAlive: boolean;
18
+ state: string | null;
19
+ pausedAt: string | null;
20
+ pauseReason: string | null;
21
+ lastCrashError: string | null;
18
22
  }
19
23
 
20
24
  function isAlive(pid: number): boolean {
@@ -26,6 +30,7 @@ function getDaemonStatus(): DaemonStatus {
26
30
  running: false, pid: null, port: null, host: null,
27
31
  shareUrl: null, tunnelPid: null, tunnelAlive: false,
28
32
  supervisorPid: null, supervisorAlive: false,
33
+ state: null, pausedAt: null, pauseReason: null, lastCrashError: null,
29
34
  };
30
35
 
31
36
  if (existsSync(STATUS_FILE)) {
@@ -46,6 +51,10 @@ function getDaemonStatus(): DaemonStatus {
46
51
  tunnelAlive,
47
52
  supervisorPid,
48
53
  supervisorAlive,
54
+ state: (data.state as string) ?? null,
55
+ pausedAt: (data.pausedAt as string) ?? null,
56
+ pauseReason: (data.pauseReason as string) ?? null,
57
+ lastCrashError: (data.lastCrashError as string) ?? null,
49
58
  };
50
59
  } catch { return dead; }
51
60
  }
@@ -161,6 +170,16 @@ export async function showStatus(options: { json?: boolean; all?: boolean }) {
161
170
  if (status.supervisorPid) {
162
171
  console.log(` Supervisor: ${status.supervisorAlive ? "running" : "stopped"} (PID: ${status.supervisorPid})`);
163
172
  }
173
+ // Show state info
174
+ const state = status.state ?? (status.running ? "running" : "stopped");
175
+ if (state === "paused") {
176
+ console.log(` State: PAUSED — ${status.pauseReason ?? "unknown reason"}`);
177
+ if (status.pausedAt) console.log(` Paused: ${status.pausedAt}`);
178
+ if (status.lastCrashError) console.log(` Error: ${status.lastCrashError}`);
179
+ console.log(`\n Resume: ppm restart --force`);
180
+ } else if (state === "upgrading") {
181
+ console.log(` State: UPGRADING`);
182
+ }
164
183
  console.log(` Server: ${status.running ? "running" : "stopped"} (PID: ${status.pid})`);
165
184
  if (status.port) console.log(` Local: http://localhost:${status.port}/`);
166
185
  if (status.tunnelPid) {
package/src/index.ts CHANGED
@@ -16,8 +16,6 @@ program
16
16
  .command("start")
17
17
  .description("Start the PPM server (background by default)")
18
18
  .option("-p, --port <port>", "Port to listen on")
19
- .option("-f, --foreground", "Run in foreground (default: background daemon)")
20
- .option("-d, --daemon", "Run as background daemon (default, kept for compat)")
21
19
  .option("-s, --share", "Share via public URL (Cloudflare tunnel)")
22
20
  .option("-c, --config <path>", "Path to config file (YAML import into DB)")
23
21
  .option("--profile <name>", "DB profile name (e.g. 'dev' → ppm.dev.db)")
@@ -51,6 +49,7 @@ program
51
49
  .command("restart")
52
50
  .description("Restart the server (keeps tunnel alive)")
53
51
  .option("-c, --config <path>", "Path to config file")
52
+ .option("--force", "Force resume from paused state")
54
53
  .action(async (options) => {
55
54
  const { restartServer } = await import("./cli/commands/restart.ts");
56
55
  await restartServer(options);
@@ -160,8 +160,6 @@ app.route("/", staticRoutes);
160
160
 
161
161
  export async function startServer(options: {
162
162
  port?: string;
163
- foreground?: boolean;
164
- daemon?: boolean; // compat, ignored (daemon is now default)
165
163
  share?: boolean;
166
164
  config?: string;
167
165
  profile?: string;
@@ -171,36 +169,27 @@ export async function startServer(options: {
171
169
  const port = parseInt(options.port ?? String(configService.get("port")), 10);
172
170
  const host = configService.get("host");
173
171
 
174
- // Setup log file (both foreground and daemon modes)
175
172
  await setupLogFile();
176
173
 
177
- // Check if port is already in use before starting.
178
- // Skip in hot-reload mode — Bun.serve() replaces the previous server on the same port,
179
- // but a net.createServer() probe would see it as "in use" and exit prematurely.
180
- // globalThis persists across bun --hot reloads, so we use a flag set after first start.
181
- const isHotReload = !!(globalThis as any).__PPM_SERVER_STARTED__;
182
- if (!isHotReload) {
183
- const portInUse = await new Promise<boolean>((resolve) => {
184
- const net = require("node:net") as typeof import("node:net");
185
- const tester = net.createServer()
186
- .once("error", (err: NodeJS.ErrnoException) => {
187
- resolve(err.code === "EADDRINUSE");
188
- })
189
- .once("listening", () => {
190
- tester.close(() => resolve(false));
191
- })
192
- .listen(port, host);
193
- });
194
- if (portInUse) {
195
- console.error(`\n ✗ Port ${port} is already in use.`);
196
- console.error(` Run 'ppm stop' first or use a different port with --port.\n`);
197
- process.exit(1);
198
- }
174
+ // Check if port is already in use before spawning supervisor
175
+ const portInUse = await new Promise<boolean>((resolve) => {
176
+ const net = require("node:net") as typeof import("node:net");
177
+ const tester = net.createServer()
178
+ .once("error", (err: NodeJS.ErrnoException) => {
179
+ resolve(err.code === "EADDRINUSE");
180
+ })
181
+ .once("listening", () => {
182
+ tester.close(() => resolve(false));
183
+ })
184
+ .listen(port, host);
185
+ });
186
+ if (portInUse) {
187
+ console.error(`\n ✗ Port ${port} is already in use.`);
188
+ console.error(` Run 'ppm stop' first or use a different port with --port.\n`);
189
+ process.exit(1);
199
190
  }
200
191
 
201
- const isDaemon = !options.foreground;
202
-
203
- if (isDaemon) {
192
+ {
204
193
  const { resolve } = await import("node:path");
205
194
  const { homedir } = await import("node:os");
206
195
  const { writeFileSync, readFileSync, mkdirSync, existsSync, openSync } = await import("node:fs");
@@ -266,7 +255,6 @@ export async function startServer(options: {
266
255
  if (isNaN(supervisorPid)) {
267
256
  console.error(" ✗ Failed to start supervisor on Windows.");
268
257
  console.error(` ${result.stderr.toString().trim()}`);
269
- console.error(" Try: ppm start -f (foreground mode)");
270
258
  process.exit(1);
271
259
  }
272
260
  } else {
@@ -291,7 +279,6 @@ export async function startServer(options: {
291
279
  try { process.kill(supervisorPid, 0); } catch {
292
280
  console.error(" ✗ Supervisor exited immediately after start.");
293
281
  console.error(" Check logs: ppm logs");
294
- console.error(" Or try: ppm start -f (foreground mode)");
295
282
  process.exit(1);
296
283
  }
297
284
  // Check if server PID appeared in status.json
@@ -347,130 +334,6 @@ export async function startServer(options: {
347
334
 
348
335
  process.exit(0);
349
336
  }
350
-
351
- // Foreground mode — with WebSocket support
352
- const server = Bun.serve({
353
- port,
354
- hostname: host,
355
- fetch(req, server) {
356
- const url = new URL(req.url);
357
-
358
- // WebSocket upgrade: /ws/project/:projectName/terminal/:id
359
- if (url.pathname.startsWith("/ws/project/")) {
360
- const parts = url.pathname.split("/");
361
- const projectName = parts[3] ?? "";
362
- const wsType = parts[4] ?? "";
363
- const id = parts[5] ?? "";
364
-
365
- if (wsType === "terminal") {
366
- const upgraded = server.upgrade(req, {
367
- data: { type: "terminal", id, projectName },
368
- });
369
- if (upgraded) return undefined;
370
- return new Response("WebSocket upgrade failed", { status: 400 });
371
- }
372
-
373
- if (wsType === "chat") {
374
- const sessionId = id;
375
- const upgraded = server.upgrade(req, {
376
- data: { type: "chat", sessionId, projectName },
377
- });
378
- if (upgraded) return undefined;
379
- return new Response("WebSocket upgrade failed", { status: 400 });
380
- }
381
- }
382
-
383
- return app.fetch(req, server);
384
- },
385
- websocket: {
386
- idleTimeout: 960,
387
- sendPong: true,
388
- perMessageDeflate: false, // Disable compression — Cloudflare tunnels can mangle compressed frames
389
- open(ws: any) {
390
- if (ws.data?.type === "health") {
391
- ws.send(JSON.stringify({ type: "health", status: "ok" }));
392
- } else if (ws.data?.type === "chat") chatWebSocket.open(ws);
393
- else terminalWebSocket.open(ws);
394
- },
395
- message(ws: any, msg: any) {
396
- if (ws.data?.type === "health") {
397
- // Respond to ping with pong
398
- ws.send(JSON.stringify({ type: "health", status: "ok" }));
399
- } else if (ws.data?.type === "chat") chatWebSocket.message(ws, msg);
400
- else terminalWebSocket.message(ws, msg);
401
- },
402
- close(ws: any) {
403
- if (ws.data?.type === "health") return;
404
- if (ws.data?.type === "chat") chatWebSocket.close(ws);
405
- else terminalWebSocket.close(ws);
406
- },
407
- } as Parameters<typeof Bun.serve>[0] extends { websocket?: infer W } ? W : never,
408
- });
409
-
410
- // Mark server as started — survives bun --hot reloads (globalThis persists)
411
- (globalThis as any).__PPM_SERVER_STARTED__ = true;
412
-
413
- // Start background usage polling
414
- import("../services/claude-usage.service.ts").then(({ startUsagePolling }) => startUsagePolling()).catch(() => {});
415
-
416
- // Start background account token refresh
417
- import("../services/account.service.ts").then(({ accountService }) => accountService.startAutoRefresh()).catch(() => {});
418
-
419
- console.log(`\n PPM ready\n`);
420
- console.log(` ➜ Local: http://localhost:${server.port}/`);
421
-
422
- const { networkInterfaces } = await import("node:os");
423
- const nets = networkInterfaces();
424
- for (const name of Object.keys(nets)) {
425
- for (const net of nets[name] ?? []) {
426
- if (net.family === "IPv4" && !net.internal) {
427
- console.log(` ➜ Network: http://${net.address}:${server.port}/`);
428
- }
429
- }
430
- }
431
-
432
- // Share tunnel in foreground mode
433
- if (options.share) {
434
- try {
435
- const { tunnelService } = await import("../services/tunnel.service.ts");
436
- console.log("\n Starting share tunnel...");
437
- const shareUrl = await tunnelService.startTunnel(server.port!);
438
- console.log(` ➜ Share: ${shareUrl}`);
439
- if (!configService.get("auth").enabled) {
440
- console.log(`\n ⚠ Warning: auth is disabled — your IDE is publicly accessible!`);
441
- console.log(` Enable auth: run 'ppm config set auth.enabled true' or restart without --share.`);
442
- }
443
- const qr = await import("qrcode-terminal");
444
- console.log();
445
- qr.generate(shareUrl, { small: true });
446
- } catch (err: unknown) {
447
- const msg = err instanceof Error ? err.message : String(err);
448
- console.error(` ✗ Share failed: ${msg}`);
449
- }
450
- }
451
-
452
- console.log(`\n Auth: ${configService.get("auth").enabled ? "enabled" : "disabled"}`);
453
- if (configService.get("auth").enabled) {
454
- console.log(` Token: ${configService.get("auth").token}`);
455
- }
456
- console.log();
457
-
458
- // Graceful shutdown — stop server + tunnel + preview tunnels + DB on exit
459
- const shutdown = () => {
460
- try { server.stop(true); } catch {}
461
- try {
462
- import("../services/tunnel.service.ts").then(({ tunnelService }) => tunnelService.stopTunnel()).catch(() => {});
463
- } catch {}
464
- try {
465
- import("./routes/browser-preview.ts").then(({ stopAllPreviewTunnels }) => stopAllPreviewTunnels()).catch(() => {});
466
- } catch {}
467
- try {
468
- import("../services/db.service.ts").then(({ closeDb }) => closeDb()).catch(() => {});
469
- } catch {}
470
- };
471
- process.on("SIGINT", () => { shutdown(); process.exit(0); });
472
- process.on("SIGTERM", () => { shutdown(); process.exit(0); });
473
- process.on("exit", shutdown);
474
337
  }
475
338
 
476
339
  // Internal entry point for daemon child process
@@ -0,0 +1,208 @@
1
+ /**
2
+ * Cloud WebSocket client — persistent connection from supervisor to PPM Cloud.
3
+ * Auto-reconnects with exponential backoff + jitter. Queues messages when disconnected.
4
+ */
5
+ import { appendFileSync } from "node:fs";
6
+ import { resolve } from "node:path";
7
+ import { homedir } from "node:os";
8
+
9
+ // ─── Types (must match Cloud's ws-types.ts) ─────────
10
/** Envelope fields shared by every frame exchanged with PPM Cloud. */
interface WsMessage {
  /** Discriminator naming the concrete message shape. */
  type: string;
  /** Correlation id; present on commands and their results. */
  id?: string;
  /** Creation time of the frame (callers in this package pass an ISO-8601 string). */
  timestamp: string;
}

/** Periodic liveness report pushed by the supervisor. */
interface HeartbeatMsg extends WsMessage {
  type: "heartbeat";
  tunnelUrl: string | null;
  state: string;
  appVersion: string;
  serverPid: number | null;
  uptime: number;
}

/** Announces a supervisor state transition together with its cause. */
interface StateChangeMsg extends WsMessage {
  type: "state_change";
  from: string;
  to: string;
  reason: string;
}

/** Outcome of a command previously received from Cloud; `id` echoes the command's id. */
interface CommandResultMsg extends WsMessage {
  type: "command_result";
  id: string;
  success: boolean;
  error?: string;
  data?: Record<string, unknown>;
}

/** Every message shape this client is allowed to send. */
type OutboundMsg = HeartbeatMsg | StateChangeMsg | CommandResultMsg;

/** Instruction sent by Cloud for the supervisor to execute. */
interface CommandMsg extends WsMessage {
  type: "command";
  id: string;
  action: string;
  params?: Record<string, unknown>;
}

/** Callback invoked for each inbound command frame. */
type CommandHandler = (cmd: CommandMsg) => void;
50
+
51
+ // ─── Constants ──────────────────────────────────────
52
/** Base reconnect delays in milliseconds, one entry per consecutive failed attempt. */
const BACKOFF_STEPS = [1_000, 2_000, 4_000, 8_000, 15_000, 30_000, 60_000];
/** Maximum number of outbound messages buffered while the socket is down. */
const MAX_QUEUE_SIZE = 50;
/** Period between heartbeats sent over the socket (60s). */
const HEARTBEAT_INTERVAL_MS = 60_000;

// ─── State ──────────────────────────────────────────
/** Current socket, or null when none is open or opening. */
let ws: WebSocket | null = null;
/** True once the socket has opened and the auth frame was sent. */
let connected = false;
/** True while a connection attempt is in flight. */
let reconnecting = false;
/** Consecutive reconnect attempts; selects the backoff step. */
let reconnectAttempt = 0;
/** Handle of the pending reconnect timeout, if any. */
let reconnectTimer: ReturnType<typeof setTimeout> | null = null;
/** Handle of the periodic heartbeat interval, if any. */
let heartbeatTimer: ReturnType<typeof setInterval> | null = null;
/** Receiver for inbound command frames. */
let commandHandler: CommandHandler | null = null;
/** Messages waiting for the next successful connection. */
let outboundQueue: OutboundMsg[] = [];
/** Endpoint derived from the Cloud base URL. */
let wsUrl = "";
/** False after disconnect(); suppresses any further reconnects. */
let shouldConnect = false;

// Credentials for first-message auth
let deviceId = "";
let secretKey = "";

// For heartbeat payload
let getHeartbeatData: (() => HeartbeatMsg) | null = null;
74
+
75
+ // ─── Public API ─────────────────────────────────────
76
+
77
/**
 * Start the persistent Cloud connection.
 *
 * Stores the credentials and heartbeat provider, resets the backoff counter
 * and immediately attempts to open the socket. The secret key is never put
 * in the URL; it is sent in the first frame after the socket opens.
 *
 * @param opts.cloudUrl    Cloud base URL (`http(s)://…`); a trailing slash is tolerated.
 * @param opts.deviceId    Device identifier sent in the auth frame.
 * @param opts.secretKey   Device secret sent in the auth frame.
 * @param opts.heartbeatFn Produces the heartbeat payload on demand.
 */
export function connect(opts: {
  cloudUrl: string;
  deviceId: string;
  secretKey: string;
  heartbeatFn: () => HeartbeatMsg;
}): void {
  // Strip trailing slashes so "https://host/" does not become "wss://host//ws/device".
  const baseUrl = opts.cloudUrl.replace(/\/+$/, "");
  // Rewriting only the leading "http" maps http → ws and https → wss.
  wsUrl = `${baseUrl.replace(/^http/, "ws")}/ws/device`;
  deviceId = opts.deviceId;
  secretKey = opts.secretKey;
  getHeartbeatData = opts.heartbeatFn;

  // A reconnect still pending from an earlier session would otherwise fire
  // later and open a second socket next to the one created below.
  if (reconnectTimer) {
    clearTimeout(reconnectTimer);
    reconnectTimer = null;
  }

  shouldConnect = true;
  reconnectAttempt = 0;
  doConnect();
}
92
+
93
/**
 * Tear the Cloud connection down for good.
 *
 * Disables reconnects, cancels the reconnect and heartbeat timers, closes the
 * socket with code 1000 ("shutdown") and discards any queued messages.
 */
export function disconnect(): void {
  shouldConnect = false;

  if (reconnectTimer) {
    clearTimeout(reconnectTimer);
    reconnectTimer = null;
  }

  if (heartbeatTimer) {
    clearInterval(heartbeatTimer);
    heartbeatTimer = null;
  }

  if (ws) {
    try {
      ws.close(1000, "shutdown");
    } catch {
      // Closing is best-effort; the reference is dropped either way.
    }
    ws = null;
  }

  connected = false;
  outboundQueue = [];
}
104
+
105
/**
 * Deliver a message to Cloud, or buffer it while the socket is unavailable.
 *
 * When buffering would exceed `MAX_QUEUE_SIZE`, the oldest queued message is
 * discarded so the newest one is kept.
 */
export function send(msg: OutboundMsg): void {
  const socket = ws;

  if (!connected || socket?.readyState !== WebSocket.OPEN) {
    outboundQueue.push(msg);
    if (outboundQueue.length > MAX_QUEUE_SIZE) {
      outboundQueue.shift();
    }
    return;
  }

  socket.send(JSON.stringify(msg));
}
113
+
114
/**
 * Register the callback that receives command frames from Cloud.
 * Only one handler is kept; a later registration replaces the earlier one.
 */
export function onCommand(handler: CommandHandler): void {
  commandHandler = handler;
}

/** Report whether the socket has opened and the auth frame has been sent. */
export function isConnected(): boolean {
  return connected;
}
121
+
122
+ // ─── Internal ───────────────────────────────────────
123
+
124
/**
 * Open one WebSocket to `wsUrl` and wire its lifecycle handlers.
 *
 * Does nothing when reconnects are disabled or an attempt is already in
 * flight. Every handler is bound to the socket created by this call, so a
 * late event from a socket that has since been replaced cannot corrupt the
 * state of its successor.
 *
 * On open: sends the auth frame, marks the client connected, flushes the
 * offline queue, sends an immediate heartbeat and starts the periodic one.
 * On close: resets state and schedules a reconnect if still wanted.
 */
function doConnect(): void {
  if (!shouldConnect || reconnecting) return;
  reconnecting = true;

  let socket: WebSocket;
  try {
    socket = new WebSocket(wsUrl);
  } catch {
    // The constructor can throw synchronously (e.g. malformed URL); retry with backoff.
    reconnecting = false;
    scheduleReconnect();
    return;
  }
  ws = socket;

  socket.onopen = () => {
    // Superseded or disconnected while connecting: this socket is no longer wanted.
    if (ws !== socket) {
      try { socket.close(1000, "superseded"); } catch {}
      return;
    }

    reconnecting = false;
    log("INFO", "Cloud WS connected, sending auth");

    // Send auth as first message (not in URL)
    socket.send(JSON.stringify({
      type: "auth",
      deviceId,
      secretKey,
      timestamp: new Date().toISOString(),
      version: 1,
    }));

    connected = true;
    reconnectAttempt = 0;

    // Flush queued messages. A message is removed only after it was handed to
    // the socket, so a failing send leaves it queued for the next connection.
    while (outboundQueue.length > 0) {
      try {
        socket.send(JSON.stringify(outboundQueue[0]));
      } catch (e) {
        log("WARN", `Cloud WS queue flush interrupted: ${e}`);
        break;
      }
      outboundQueue.shift();
    }

    // Send immediate heartbeat
    if (getHeartbeatData) send(getHeartbeatData());

    // Start periodic heartbeat
    if (heartbeatTimer) clearInterval(heartbeatTimer);
    heartbeatTimer = setInterval(() => {
      if (getHeartbeatData && connected) send(getHeartbeatData());
    }, HEARTBEAT_INTERVAL_MS);
  };

  socket.onmessage = (event) => {
    let parsed: unknown;
    try {
      parsed = JSON.parse(String(event.data));
    } catch {
      return; // ignore malformed
    }
    if (typeof parsed !== "object" || parsed === null) return;

    const msg = parsed as CommandMsg;
    if (msg.type !== "command" || !commandHandler) return;

    // The handler may be async; surface both synchronous throws and
    // rejections in the log instead of leaving an unhandled rejection.
    try {
      Promise.resolve(commandHandler(msg)).catch((e: unknown) => {
        log("WARN", `Cloud command handler failed: ${e}`);
      });
    } catch (e) {
      log("WARN", `Cloud command handler failed: ${e}`);
    }
  };

  socket.onclose = () => {
    // A newer socket owns the shared state; leave it untouched.
    if (ws !== null && ws !== socket) return;

    connected = false;
    reconnecting = false;
    ws = null;
    if (heartbeatTimer) { clearInterval(heartbeatTimer); heartbeatTimer = null; }
    if (shouldConnect) scheduleReconnect();
  };

  socket.onerror = () => {
    // onclose will fire after onerror — reconnect handled there
  };
}
189
+
190
/**
 * Arm a single timer for the next connection attempt.
 *
 * The base delay comes from `BACKOFF_STEPS` (the last step is reused once the
 * attempt count runs past the table) and is scaled by a random factor in
 * [0.7, 1.3) so many devices do not reconnect in lockstep. No-op when
 * reconnects are disabled or a timer is already pending.
 */
function scheduleReconnect(): void {
  if (!shouldConnect || reconnectTimer) return;

  const stepIndex = Math.min(reconnectAttempt, BACKOFF_STEPS.length - 1);
  const base = BACKOFF_STEPS[stepIndex]!;
  // Add ±30% jitter to prevent thundering herd after Cloud deploy
  const delay = Math.round(base * (0.7 + Math.random() * 0.6));

  reconnectAttempt += 1;
  log("WARN", `Cloud WS reconnect in ${delay}ms (attempt #${reconnectAttempt})`);

  reconnectTimer = setTimeout(() => {
    reconnectTimer = null;
    doConnect();
  }, delay);
}
203
+
204
/**
 * Append one line to `ppm.log` under `PPM_HOME` (falling back to `~/.ppm`).
 * Write failures are ignored so logging can never break the connection logic.
 */
function log(level: string, msg: string): void {
  const line = `[${new Date().toISOString()}] [${level}] [cloud-ws] ${msg}\n`;
  const baseDir = process.env.PPM_HOME || resolve(homedir(), ".ppm");
  const target = resolve(baseDir, "ppm.log");
  try {
    appendFileSync(target, line);
  } catch {
    // best-effort logging
  }
}
@@ -37,6 +37,24 @@ let tunnelChild: Subprocess | null = null;
37
37
  let tunnelUrl: string | null = null;
38
38
  let shuttingDown = false;
39
39
 
40
/** Lifecycle states the supervisor can be in. */
type SupervisorState = "running" | "paused" | "upgrading";
let supervisorState: SupervisorState = "running";

/** Resolver of the promise handed out by waitForResume(); null when nobody is waiting. */
let resumeResolve: (() => void) | null = null;

/**
 * Create a promise that settles the next time triggerResume() is called.
 * Only the most recent waiter is remembered.
 */
function waitForResume(): Promise<void> {
  return new Promise<void>((release) => {
    resumeResolve = release;
  });
}

/** Release the pending waiter, if any; otherwise do nothing. */
function triggerResume(): void {
  const release = resumeResolve;
  if (!release) return;
  release();
  resumeResolve = null;
}
57
+
40
58
  let serverRestarts = 0;
41
59
  let lastServerCrash = 0;
42
60
  let tunnelRestarts = 0;
@@ -129,8 +147,25 @@ export async function spawnServer(
129
147
  serverRestarts++;
130
148
 
131
149
  if (serverRestarts > MAX_RESTARTS) {
132
- log("FATAL", `Server exceeded ${MAX_RESTARTS} restarts, giving up`);
133
- shutdown();
150
+ log("WARN", `Server exceeded ${MAX_RESTARTS} restarts, pausing`);
151
+ notifyStateChange("running", "paused", "max_restarts_exceeded");
152
+ supervisorState = "paused";
153
+ updateStatus({
154
+ state: "paused",
155
+ pid: null,
156
+ pausedAt: new Date().toISOString(),
157
+ pauseReason: "max_restarts",
158
+ lastCrashError: `exit ${exitCode}`,
159
+ });
160
+ // Wait for resume signal — supervisor stays alive
161
+ await waitForResume();
162
+ // Resumed — reset and respawn
163
+ notifyStateChange("paused", "running", "user_resume");
164
+ supervisorState = "running";
165
+ serverRestarts = 0;
166
+ updateStatus({ state: "running", pausedAt: null, pauseReason: null });
167
+ log("INFO", "Resuming server after pause");
168
+ if (!shuttingDown) return spawnServer(serverArgs, logFd);
134
169
  return;
135
170
  }
136
171
 
@@ -189,12 +224,7 @@ async function syncUrlToCloud(url: string) {
189
224
  } catch {}
190
225
  }
191
226
 
192
- function startCloudHeartbeat(url: string) {
193
- if (heartbeatTimer) clearInterval(heartbeatTimer);
194
- heartbeatTimer = setInterval(() => {
195
- if (tunnelUrl) syncUrlToCloud(tunnelUrl);
196
- }, 5 * 60 * 1000);
197
- }
227
+ // HTTP heartbeat removed — WS is the sole heartbeat mechanism (Phase 4)
198
228
 
199
229
  export async function spawnTunnel(port: number): Promise<void> {
200
230
  let bin: string;
@@ -230,9 +260,8 @@ export async function spawnTunnel(port: number): Promise<void> {
230
260
  updateStatus({ shareUrl: tunnelUrl, tunnelPid: tunnelChild.pid });
231
261
  log("INFO", `Tunnel ready: ${tunnelUrl} (PID: ${tunnelChild.pid})`);
232
262
 
233
- // Sync new URL to cloud immediately + start periodic heartbeat
263
+ // One-time sync of tunnel URL to cloud (WS handles periodic heartbeat)
234
264
  await syncUrlToCloud(tunnelUrl);
235
- startCloudHeartbeat(tunnelUrl);
236
265
 
237
266
  const exitCode = await tunnelChild.exited;
238
267
  tunnelChild = null;
@@ -330,6 +359,9 @@ async function selfReplace(): Promise<{ success: boolean; error?: string }> {
330
359
  try {
331
360
  // Prevent spawnServer crash-restart loop from respawning killed children
332
361
  shuttingDown = true;
362
+ notifyStateChange(supervisorState, "upgrading", "self_replace");
363
+ supervisorState = "upgrading";
364
+ updateStatus({ state: "upgrading" });
333
365
 
334
366
  // Kill server + tunnel children FIRST to free the port for the new supervisor
335
367
  log("INFO", "Stopping server and tunnel before spawning new supervisor");
@@ -372,20 +404,158 @@ async function selfReplace(): Promise<{ success: boolean; error?: string }> {
372
404
  log("ERROR", "Self-replace timeout: new supervisor did not start");
373
405
  try { child.kill(); } catch {}
374
406
  shuttingDown = false;
407
+ notifyStateChange("upgrading", "running", "upgrade_failed");
408
+ supervisorState = "running";
409
+ updateStatus({ state: "running" });
375
410
  return { success: false, error: "New supervisor failed to start within 30s" };
376
411
  } catch (e) {
377
412
  log("ERROR", `Self-replace error: ${e}`);
378
413
  shuttingDown = false;
414
+ notifyStateChange("upgrading", "running", "upgrade_failed");
415
+ supervisorState = "running";
416
+ updateStatus({ state: "running" });
379
417
  return { success: false, error: (e as Error).message };
380
418
  }
381
419
  }
382
420
 
421
+ // ─── Cloud WS integration ─────────────────────────────────────────────
422
+
423
/**
 * Report a supervisor state transition to PPM Cloud over the WebSocket.
 *
 * The message is always handed to the WS client's `send`, which transmits it
 * when the socket is open and otherwise buffers it in its bounded offline
 * queue, so transitions that happen during an outage are delivered after the
 * next reconnect instead of being dropped. Best-effort: failures are logged
 * and never propagate to the caller.
 *
 * @param from   State being left.
 * @param to     State being entered.
 * @param reason Short machine-readable cause of the transition.
 */
async function notifyStateChange(from: string, to: string, reason: string) {
  // Stamp the transition before the (async) module load so the time reflects the event itself.
  const timestamp = new Date().toISOString();
  try {
    const { send } = await import("./cloud-ws.service.ts");
    send({
      type: "state_change",
      from,
      to,
      reason,
      timestamp,
    });
  } catch (e) {
    log("WARN", `Cloud state change notification failed: ${e}`);
  }
}
438
+
439
/**
 * Connect supervisor to Cloud via WebSocket (if device is linked) and
 * register the handler for remote commands.
 *
 * Supported actions: `restart`, `resume`, `stop`, `upgrade`, `status`; any
 * other action is answered with a failed `command_result`. Setup failures are
 * logged and swallowed so a Cloud problem never prevents the supervisor from
 * starting.
 *
 * @param opts       Supervisor options (currently not read by this function).
 * @param serverArgs Arguments used to respawn the server after a failed upgrade.
 * @param logFd      Log file descriptor passed through to `spawnServer`.
 */
async function connectCloud(opts: { port: number }, serverArgs: string[], logFd: number) {
  try {
    const { getCloudDevice } = await import("./cloud.service.ts");
    const device = getCloudDevice();
    if (!device) return; // not linked to cloud

    const { connect, onCommand, send } = await import("./cloud-ws.service.ts");
    const { VERSION } = await import("../version.ts");
    const startTime = Date.now();

    connect({
      cloudUrl: device.cloud_url,
      deviceId: device.device_id,
      secretKey: device.secret_key,
      heartbeatFn: () => ({
        type: "heartbeat" as const,
        tunnelUrl,
        state: supervisorState,
        appVersion: VERSION,
        serverPid: serverChild?.pid ?? null,
        uptime: Math.floor((Date.now() - startTime) / 1000), // whole seconds since connectCloud ran
        timestamp: new Date().toISOString(),
      }),
    });

    // Handle commands from Cloud
    onCommand(async (cmd) => {
      const sendResult = (success: boolean, error?: string, data?: Record<string, unknown>) => {
        send({
          type: "command_result",
          id: cmd.id,
          success,
          error,
          data,
          timestamp: new Date().toISOString(),
        });
      };

      log("INFO", `Cloud command received: ${cmd.action}`);

      // Error boundary: an exception in any branch is reported back to Cloud
      // instead of surfacing as an unhandled rejection in the supervisor.
      try {
        switch (cmd.action) {
          case "restart":
            if (serverChild) {
              serverRestartRequested = true;
              try { serverChild.kill(); } catch {}
              sendResult(true);
            } else if (supervisorState === "paused") {
              triggerResume();
              sendResult(true);
            } else {
              sendResult(false, "No server child to restart");
            }
            break;

          case "resume":
            if (supervisorState === "paused") {
              triggerResume();
              sendResult(true);
            } else {
              sendResult(false, "Not in paused state");
            }
            break;

          case "stop":
            sendResult(true);
            // Delay exit to allow WS buffer to flush
            setTimeout(() => {
              shutdown();
              process.exit(0);
            }, 500);
            break;

          case "upgrade": {
            // Send result BEFORE selfReplace (which exits on success)
            sendResult(true, undefined, { status: "upgrading" });
            await new Promise((r) => setTimeout(r, 300));
            const result = await selfReplace();
            // Only reaches here on failure — selfReplace exits on success
            if (!result.success) {
              sendResult(false, result.error);
              if (!serverChild && !shuttingDown) {
                // Fire-and-forget respawn; a rejection must not go unhandled.
                spawnServer(serverArgs, logFd).catch((e: unknown) => {
                  log("ERROR", `Server respawn after failed upgrade crashed: ${e}`);
                });
              }
            }
            break;
          }

          case "status":
            sendResult(true, undefined, {
              state: supervisorState,
              serverPid: serverChild?.pid ?? null,
              tunnelUrl,
              serverRestarts,
            });
            break;

          default:
            sendResult(false, `Unknown action: ${cmd.action}`);
        }
      } catch (e) {
        log("ERROR", `Cloud command '${cmd.action}' failed: ${e}`);
        sendResult(false, e instanceof Error ? e.message : String(e));
      }
    });
  } catch (e) {
    log("WARN", `Cloud WS setup failed: ${e}`);
  }
}
544
+
383
545
  // ─── Shutdown ──────────────────────────────────────────────────────────
384
546
  export function shutdown() {
385
547
  if (shuttingDown) return;
386
548
  shuttingDown = true;
387
549
  log("INFO", "Supervisor shutting down");
388
550
 
551
+ // Unblock if paused
552
+ triggerResume();
553
+
554
+ // Disconnect Cloud WS
555
+ import("./cloud-ws.service.ts")
556
+ .then(({ disconnect }) => disconnect())
557
+ .catch(() => {});
558
+
389
559
  if (healthTimer) clearInterval(healthTimer);
390
560
  if (tunnelProbeTimer) clearInterval(tunnelProbeTimer);
391
561
  if (heartbeatTimer) clearInterval(heartbeatTimer);
@@ -414,7 +584,10 @@ export async function runSupervisor(opts: {
414
584
 
415
585
  // Write supervisor PID + clear stale availableVersion from previous run
416
586
  writeFileSync(PID_FILE, String(process.pid));
417
- updateStatus({ supervisorPid: process.pid, port: opts.port, host: opts.host, availableVersion: null });
587
+ updateStatus({
588
+ supervisorPid: process.pid, port: opts.port, host: opts.host, availableVersion: null,
589
+ state: "running", pausedAt: null, pauseReason: null, lastCrashError: null,
590
+ });
418
591
 
419
592
  // Build __serve__ args
420
593
  const serverArgs = [
@@ -428,8 +601,13 @@ export async function runSupervisor(opts: {
428
601
  process.on("SIGTERM", () => { shutdown(); process.exit(0); });
429
602
  process.on("SIGINT", () => { shutdown(); process.exit(0); });
430
603
 
431
- // SIGUSR2 = graceful server restart (tunnel stays alive)
604
+ // SIGUSR2 = graceful server restart (tunnel stays alive) or resume from paused
432
605
  process.on("SIGUSR2", () => {
606
+ if (supervisorState === "paused") {
607
+ log("INFO", "SIGUSR2 received while paused, resuming server");
608
+ triggerResume();
609
+ return;
610
+ }
433
611
  log("INFO", "SIGUSR2 received, restarting server only");
434
612
  if (serverChild) {
435
613
  serverRestartRequested = true; // flag so spawnServer skips backoff
@@ -458,6 +636,9 @@ export async function runSupervisor(opts: {
458
636
  upgradeCheckTimer = setInterval(checkAvailableVersion, UPGRADE_CHECK_INTERVAL_MS);
459
637
  }, UPGRADE_SKIP_INITIAL_MS);
460
638
 
639
+ // Connect to Cloud via WebSocket (if device is linked)
640
+ connectCloud(opts, serverArgs, logFd);
641
+
461
642
  // Spawn server + tunnel in parallel
462
643
  const promises: Promise<void>[] = [spawnServer(serverArgs, logFd)];
463
644