@hienlh/ppm 0.8.52 → 0.8.53

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,24 @@
1
1
  # Changelog
2
2
 
3
+ ## [0.8.53] - 2026-03-25
4
+
5
+ ### Added
6
+ - **Process supervisor**: Long-lived parent process manages server + tunnel children with auto-restart on crash (exponential backoff 1s→60s, resets after 5min stable)
7
+ - **Tunnel resilience**: Auto-respawn cloudflared on death, extract new URL, sync to cloud immediately
8
+ - **Server health watchdog**: GET /api/health every 30s, kills hung server after 3 consecutive failures
9
+ - **Tunnel URL probe**: GET tunnelUrl/api/health every 2min, regenerates tunnel after 2 failures
10
+ - **SIGUSR2 graceful restart**: `ppm restart` signals supervisor to restart server only (tunnel stays alive, no backoff)
11
+ - **Count-based exception exit**: 3+ uncaught exceptions in 1 minute triggers exit for clean supervisor restart
12
+ - **Integration tests**: 8 tests covering supervisor spawn, crash recovery, SIGUSR2 restart, backoff behavior
13
+
14
+ ### Changed
15
+ - **Daemon mode**: `ppm start` now spawns supervisor process instead of server directly
16
+ - **macOS autostart**: KeepAlive changed from conditional (SuccessfulExit=false) to unconditional
17
+ - **Linux autostart**: Restart policy changed from `on-failure` to `always`
18
+ - **`ppm stop`**: Kills supervisor PID first (cascades to children), 2s grace period
19
+ - **`ppm status`**: Shows supervisor PID and alive status
20
+ - **`ppm restart`**: Uses SIGUSR2 to supervisor for server-only restart
21
+
3
22
  ## [0.8.52] - 2026-03-25
4
23
 
5
24
  ### Fixed
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@hienlh/ppm",
3
- "version": "0.8.52",
3
+ "version": "0.8.53",
4
4
  "description": "Personal Project Manager — mobile-first web IDE with AI assistance",
5
5
  "author": "hienlh",
6
6
  "license": "MIT",
@@ -26,6 +26,45 @@ export async function restartServer(options: { config?: string }) {
26
26
  process.exit(1);
27
27
  }
28
28
 
29
+ // Supervisor-aware restart: send SIGUSR2 → supervisor restarts server child
30
+ const supervisorPid = status.supervisorPid as number | undefined;
31
+ if (supervisorPid) {
32
+ try { process.kill(supervisorPid, 0); } catch {
33
+ console.log("Supervisor not running. Use 'ppm stop && ppm start' instead.");
34
+ process.exit(1);
35
+ }
36
+
37
+ const oldServerPid = status.pid as number | undefined;
38
+ console.log("\n Restarting PPM server via supervisor...");
39
+ console.log(" If you're using PPM terminal, wait a few seconds for auto-reconnect.\n");
40
+
41
+ try { process.kill(supervisorPid, "SIGUSR2"); } catch (e) {
42
+ console.error(` ✗ Failed to signal supervisor: ${e}`);
43
+ process.exit(1);
44
+ }
45
+
46
+ // Wait for new server PID to appear in status.json (up to 15s)
47
+ const start = Date.now();
48
+ while (Date.now() - start < 15_000) {
49
+ await Bun.sleep(500);
50
+ try {
51
+ const newStatus = JSON.parse(readFileSync(STATUS_FILE, "utf-8"));
52
+ const newPid = newStatus.pid as number | undefined;
53
+ if (newPid && newPid !== oldServerPid) {
54
+ // Verify it's alive
55
+ try { process.kill(newPid, 0); } catch { continue; }
56
+ console.log(` ✓ Restart complete (new PID: ${newPid})`);
57
+ if (newStatus.shareUrl) console.log(` ➜ Share: ${newStatus.shareUrl}`);
58
+ process.exit(0);
59
+ }
60
+ } catch {}
61
+ }
62
+
63
+ console.error(" ⚠ Restart timed out. Check: ppm logs");
64
+ process.exit(1);
65
+ }
66
+
67
+ // Legacy path: no supervisor (pre-supervisor daemon)
29
68
  const serverPid = status.pid as number | undefined;
30
69
  if (!serverPid) {
31
70
  console.log("No server PID found. Use 'ppm stop && ppm start' instead.");
@@ -13,6 +13,8 @@ interface DaemonStatus {
13
13
  shareUrl: string | null;
14
14
  tunnelPid: number | null;
15
15
  tunnelAlive: boolean;
16
+ supervisorPid: number | null;
17
+ supervisorAlive: boolean;
16
18
  }
17
19
 
18
20
  function isAlive(pid: number): boolean {
@@ -23,6 +25,7 @@ function getDaemonStatus(): DaemonStatus {
23
25
  const dead: DaemonStatus = {
24
26
  running: false, pid: null, port: null, host: null,
25
27
  shareUrl: null, tunnelPid: null, tunnelAlive: false,
28
+ supervisorPid: null, supervisorAlive: false,
26
29
  };
27
30
 
28
31
  if (existsSync(STATUS_FILE)) {
@@ -31,6 +34,8 @@ function getDaemonStatus(): DaemonStatus {
31
34
  const pid = data.pid as number;
32
35
  const tunnelPid = (data.tunnelPid as number) ?? null;
33
36
  const tunnelAlive = tunnelPid ? isAlive(tunnelPid) : false;
37
+ const supervisorPid = (data.supervisorPid as number) ?? null;
38
+ const supervisorAlive = supervisorPid ? isAlive(supervisorPid) : false;
34
39
  return {
35
40
  running: isAlive(pid),
36
41
  pid,
@@ -39,6 +44,8 @@ function getDaemonStatus(): DaemonStatus {
39
44
  shareUrl: data.shareUrl ?? null,
40
45
  tunnelPid,
41
46
  tunnelAlive,
47
+ supervisorPid,
48
+ supervisorAlive,
42
49
  };
43
50
  } catch { return dead; }
44
51
  }
@@ -151,6 +158,9 @@ export async function showStatus(options: { json?: boolean; all?: boolean }) {
151
158
  }
152
159
 
153
160
  console.log(`\n PPM daemon status\n`);
161
+ if (status.supervisorPid) {
162
+ console.log(` Supervisor: ${status.supervisorAlive ? "running" : "stopped"} (PID: ${status.supervisorPid})`);
163
+ }
154
164
  console.log(` Server: ${status.running ? "running" : "stopped"} (PID: ${status.pid})`);
155
165
  if (status.port) console.log(` Local: http://localhost:${status.port}/`);
156
166
  if (status.tunnelPid) {
@@ -54,53 +54,70 @@ export async function stopServer(options?: { all?: boolean }) {
54
54
  if (options?.all) {
55
55
  console.log(" Stopping all PPM and cloudflared processes...\n");
56
56
  const cfKilled = killAllByName("cloudflared");
57
- // Kill bun processes listening on PPM ports (from status.json or common ports)
58
- let serverKilled = 0;
57
+ let killed = 0;
59
58
  if (existsSync(STATUS_FILE)) {
60
59
  try {
61
60
  const data = JSON.parse(readFileSync(STATUS_FILE, "utf-8"));
62
- if (data.pid) { killPid(data.pid, "server"); serverKilled++; }
61
+ // Kill supervisor first (cascades to server + tunnel children)
62
+ if (data.supervisorPid) { killPid(data.supervisorPid, "supervisor"); killed++; }
63
+ if (data.pid) { killPid(data.pid, "server"); killed++; }
64
+ if (data.tunnelPid) { killPid(data.tunnelPid, "tunnel"); killed++; }
63
65
  } catch {}
64
66
  }
65
67
  if (existsSync(PID_FILE)) {
66
68
  try {
67
69
  const pid = parseInt(readFileSync(PID_FILE, "utf-8").trim(), 10);
68
- if (!isNaN(pid)) { killPid(pid, "server (pidfile)"); serverKilled++; }
70
+ if (!isNaN(pid)) { killPid(pid, "supervisor/server (pidfile)"); killed++; }
69
71
  } catch {}
70
72
  }
71
73
  cleanup();
72
- console.log(`\n Done. Killed ${cfKilled} cloudflared + ${serverKilled} server process(es).`);
74
+ console.log(`\n Done. Killed ${cfKilled} cloudflared + ${killed} PPM process(es).`);
73
75
  return;
74
76
  }
75
77
 
76
- let status: { pid?: number; tunnelPid?: number } | null = null;
78
+ let status: { pid?: number; tunnelPid?: number; supervisorPid?: number } | null = null;
77
79
 
78
80
  // Read status.json
79
81
  if (existsSync(STATUS_FILE)) {
80
82
  try { status = JSON.parse(readFileSync(STATUS_FILE, "utf-8")); } catch {}
81
83
  }
82
84
 
83
- // Fallback to ppm.pid
85
+ // Fallback to ppm.pid (now stores supervisor PID)
84
86
  const pidFromFile = existsSync(PID_FILE)
85
87
  ? parseInt(readFileSync(PID_FILE, "utf-8").trim(), 10)
86
88
  : NaN;
87
89
 
88
- const serverPid = status?.pid ?? (isNaN(pidFromFile) ? null : pidFromFile);
90
+ const supervisorPid = status?.supervisorPid ?? null;
91
+ const serverPid = status?.pid ?? null;
89
92
  const tunnelPid = status?.tunnelPid ?? null;
93
+ const fallbackPid = isNaN(pidFromFile) ? null : pidFromFile;
90
94
 
91
- if (!serverPid && !tunnelPid) {
95
+ if (!supervisorPid && !serverPid && !tunnelPid && !fallbackPid) {
92
96
  console.log("No PPM daemon running.");
93
97
  cleanup();
94
98
  return;
95
99
  }
96
100
 
97
- // Kill server process
98
- if (serverPid) killPid(serverPid, "server");
101
+ // Kill supervisor first — its SIGTERM handler kills server + tunnel children
102
+ if (supervisorPid) {
103
+ killPid(supervisorPid, "supervisor");
104
+ // Give supervisor 2s to gracefully kill children
105
+ await Bun.sleep(2000);
106
+ } else if (fallbackPid) {
107
+ // Legacy: ppm.pid might be server PID (pre-supervisor) or supervisor PID
108
+ killPid(fallbackPid, "supervisor/server (pidfile)");
109
+ await Bun.sleep(1000);
110
+ }
99
111
 
100
- // Kill tunnel process (independent from server)
101
- if (tunnelPid) killPid(tunnelPid, "tunnel");
112
+ // Kill remaining children if supervisor didn't clean them up
113
+ if (serverPid) {
114
+ try { process.kill(serverPid, 0); killPid(serverPid, "server"); } catch {}
115
+ }
116
+ if (tunnelPid) {
117
+ try { process.kill(tunnelPid, 0); killPid(tunnelPid, "tunnel"); } catch {}
118
+ }
102
119
 
103
- // Windows fallback: kill orphan cloudflared processes spawned by PPM
120
+ // Windows fallback: kill orphan cloudflared processes
104
121
  if (process.platform === "win32") {
105
122
  try {
106
123
  Bun.spawnSync(["taskkill", "/F", "/IM", "cloudflared.exe"], { stdout: "ignore", stderr: "ignore" });
@@ -53,12 +53,29 @@ async function setupLogFile() {
53
53
  console.error = (...args: unknown[]) => { origError(...args); writeLog("ERROR", args); };
54
54
  console.warn = (...args: unknown[]) => { origWarn(...args); writeLog("WARN", args); };
55
55
 
56
- // Capture uncaught errors
56
+ // Capture uncaught errors — count-based exit for supervisor restart
57
+ let exceptionCount = 0;
58
+ let lastExceptionTime = 0;
59
+
60
+ const handleFatalError = (label: string, detail: string) => {
61
+ writeLog("FATAL", [`${label}: ${detail}`]);
62
+ const now = Date.now();
63
+ if (now - lastExceptionTime < 60_000) exceptionCount++;
64
+ else exceptionCount = 1;
65
+ lastExceptionTime = now;
66
+
67
+ // 3+ fatal errors in 1 minute → exit and let supervisor restart fresh
68
+ if (exceptionCount >= 3) {
69
+ writeLog("FATAL", ["Too many errors in 1 min, exiting for supervisor restart"]);
70
+ process.exit(1);
71
+ }
72
+ };
73
+
57
74
  process.on("uncaughtException", (err) => {
58
- writeLog("FATAL", [`Uncaught exception: ${err.stack ?? err.message}`]);
75
+ handleFatalError("Uncaught exception", err.stack ?? err.message);
59
76
  });
60
77
  process.on("unhandledRejection", (reason) => {
61
- writeLog("FATAL", [`Unhandled rejection: ${reason}`]);
78
+ handleFatalError("Unhandled rejection", String(reason));
62
79
  });
63
80
  }
64
81
 
@@ -168,110 +185,52 @@ export async function startServer(options: {
168
185
  if (isDaemon) {
169
186
  const { resolve } = await import("node:path");
170
187
  const { homedir } = await import("node:os");
171
- const { writeFileSync, readFileSync, mkdirSync, existsSync } = await import("node:fs");
188
+ const { writeFileSync, readFileSync, mkdirSync, existsSync, openSync } = await import("node:fs");
189
+ const { isCompiledBinary } = await import("../services/autostart-generator.ts");
172
190
 
173
191
  const ppmDir = resolve(homedir(), ".ppm");
174
192
  if (!existsSync(ppmDir)) mkdirSync(ppmDir, { recursive: true });
175
193
  const pidFile = resolve(ppmDir, "ppm.pid");
176
194
  const statusFile = resolve(ppmDir, "status.json");
177
195
 
178
- // If --share, download cloudflared and start tunnel as independent process
179
- let shareUrl: string | undefined;
180
- let tunnelPid: number | undefined;
196
+ // Kill any leftover processes from previous run
197
+ if (existsSync(statusFile)) {
198
+ try {
199
+ const prev = JSON.parse(readFileSync(statusFile, "utf-8"));
200
+ if (prev.supervisorPid) { try { process.kill(prev.supervisorPid); } catch {} }
201
+ else if (prev.pid) { try { process.kill(prev.pid); } catch {} }
202
+ if (prev.tunnelPid) { try { process.kill(prev.tunnelPid); } catch {} }
203
+ } catch {}
204
+ }
205
+
206
+ // Pre-download cloudflared if --share (so supervisor doesn't need to)
181
207
  if (options.share) {
208
+ console.log(" Ensuring cloudflared is available...");
182
209
  const { ensureCloudflared } = await import("../services/cloudflared.service.ts");
183
- const bin = await ensureCloudflared();
184
-
185
- // Kill any leftover tunnel from previous run
186
- if (existsSync(statusFile)) {
187
- try {
188
- const prev = JSON.parse(readFileSync(statusFile, "utf-8"));
189
- if (prev.tunnelPid) {
190
- try { process.kill(prev.tunnelPid); } catch { /* already dead */ }
191
- }
192
- } catch {}
193
- }
194
-
195
- // Spawn new tunnel if no existing one
196
- if (!shareUrl) {
197
- console.log(" Starting share tunnel...");
198
- const { openSync: openFd, writeFileSync: writeFs } = await import("node:fs");
199
- const tunnelLog = resolve(ppmDir, "tunnel.log");
200
- // Truncate old log so we only match the new tunnel URL
201
- writeFs(tunnelLog, "");
202
-
203
- if (process.platform === "win32") {
204
- // Windows: use PowerShell for detached tunnel process
205
- const psCmd = [
206
- `$p = Start-Process -PassThru -WindowStyle Hidden`,
207
- `-FilePath '${bin.replace(/\\/g, "\\\\")}'`,
208
- `-ArgumentList 'tunnel','--url','http://localhost:${port}'`,
209
- `-RedirectStandardError '${tunnelLog.replace(/\\/g, "\\\\")}'`,
210
- `; Write-Output $p.Id`,
211
- ].join(" ");
212
- const result = Bun.spawnSync({
213
- cmd: ["powershell", "-NoProfile", "-Command", psCmd],
214
- stdout: "pipe", stderr: "pipe",
215
- });
216
- tunnelPid = parseInt(result.stdout.toString().trim(), 10);
217
- if (isNaN(tunnelPid)) tunnelPid = undefined;
218
- } else {
219
- const tfd = openFd(tunnelLog, "a");
220
- const tunnelProc = Bun.spawn({
221
- cmd: [bin, "tunnel", "--url", `http://localhost:${port}`],
222
- stdio: ["ignore", "ignore", tfd],
223
- env: process.env,
224
- });
225
- tunnelProc.unref();
226
- tunnelPid = tunnelProc.pid;
227
- }
228
-
229
- // Parse URL from tunnel.log (poll stderr output)
230
- const urlRegex = /https:\/\/[a-z0-9-]+\.trycloudflare\.com/;
231
- const pollStart = Date.now();
232
- while (Date.now() - pollStart < 30_000) {
233
- await Bun.sleep(500);
234
- try {
235
- const logContent = readFileSync(tunnelLog, "utf-8");
236
- const match = logContent.match(urlRegex);
237
- if (match) { shareUrl = match[0]; break; }
238
- } catch {}
239
- }
240
- if (!shareUrl) console.warn(" ⚠ Tunnel started but URL not detected.");
241
- }
210
+ await ensureCloudflared();
242
211
  }
243
212
 
244
- // Write preliminary status.json so child process can read shareUrl on startup
245
- // (child reads this before parent has a chance to write PID — fixes race condition)
246
- writeFileSync(statusFile, JSON.stringify({
247
- port, host,
248
- shareUrl: shareUrl ?? null,
249
- tunnelPid: tunnelPid ?? null,
250
- }));
251
-
252
- // Spawn server child process with log file
253
- const { openSync } = await import("node:fs");
254
- const { isCompiledBinary } = await import("../services/autostart-generator.ts");
213
+ // Spawn supervisor process (manages server + tunnel children)
255
214
  const isCompiledBin = isCompiledBinary();
256
215
  const logFile = resolve(ppmDir, "ppm.log");
257
216
  const logFd = openSync(logFile, "a");
258
- const { resolve: resolvePath } = await import("node:path");
259
- const script = resolvePath(import.meta.dir, "index.ts");
260
- // Keep positional order: port, host, config, profile (empty strings kept as placeholders)
261
- const args = ["__serve__", String(port), host, options.config ?? "", options.profile ?? ""];
262
- // Windows PowerShell: strip trailing empty args to avoid ArgumentList validation error
263
- while (args.length > 0 && args[args.length - 1] === "") args.pop();
217
+ const supervisorScript = resolve(import.meta.dir, "..", "services", "supervisor.ts");
264
218
 
265
- let childPid: number;
219
+ const superviseArgs = [
220
+ "__supervise__", String(port), host,
221
+ options.config ?? "", options.profile ?? "",
222
+ ];
223
+ if (options.share) superviseArgs.push("--share");
224
+ // Strip trailing empty args (before --share flag)
225
+ while (superviseArgs.length > 1 && superviseArgs[superviseArgs.length - 1] === "") superviseArgs.pop();
226
+
227
+ let supervisorPid: number;
266
228
 
267
229
  if (process.platform === "win32") {
268
- // Windows: Bun.spawn child may die when parent exits (same job object).
269
- // Use PowerShell Start-Process to create a truly detached process.
270
230
  const bunExe = process.execPath.replace(/\\/g, "\\\\");
271
231
  const logEscaped = logFile.replace(/\\/g, "\\\\");
272
232
  const errLog = logFile.replace(/\.log$/, ".err.log").replace(/\\/g, "\\\\");
273
- // Use "_" placeholder for empty args PowerShell rejects empty strings in ArgumentList
274
- const winArgs = isCompiledBin ? args : ["run", script, ...args];
233
+ const winArgs = isCompiledBin ? superviseArgs : ["run", supervisorScript, ...superviseArgs];
275
234
  const argStr = winArgs.map((a) => `'${a || "_"}'`).join(",");
276
235
  const psCmd = [
277
236
  `$p = Start-Process -PassThru -WindowStyle Hidden`,
@@ -283,48 +242,73 @@ export async function startServer(options: {
283
242
  ].join(" ");
284
243
  const result = Bun.spawnSync({
285
244
  cmd: ["powershell", "-NoProfile", "-Command", psCmd],
286
- stdout: "pipe",
287
- stderr: "pipe",
245
+ stdout: "pipe", stderr: "pipe",
288
246
  });
289
- childPid = parseInt(result.stdout.toString().trim(), 10);
290
- if (isNaN(childPid)) {
291
- console.error(" ✗ Failed to start daemon on Windows.");
247
+ supervisorPid = parseInt(result.stdout.toString().trim(), 10);
248
+ if (isNaN(supervisorPid)) {
249
+ console.error(" ✗ Failed to start supervisor on Windows.");
292
250
  console.error(` ${result.stderr.toString().trim()}`);
293
251
  console.error(" Try: ppm start -f (foreground mode)");
294
252
  process.exit(1);
295
253
  }
296
254
  } else {
297
- // macOS/Linux: Bun.spawn + unref works fine
298
- // Compiled binary: execPath IS the server, no "run script" needed
299
255
  const cmd = isCompiledBin
300
- ? [process.execPath, ...args]
301
- : [process.execPath, "run", script, ...args];
256
+ ? [process.execPath, ...superviseArgs]
257
+ : [process.execPath, "run", supervisorScript, ...superviseArgs];
302
258
  const child = Bun.spawn({
303
259
  cmd,
304
260
  stdio: ["ignore", logFd, logFd],
305
261
  env: process.env,
306
262
  });
307
263
  child.unref();
308
- childPid = child.pid;
264
+ supervisorPid = child.pid;
309
265
  }
310
266
 
311
- // Verify daemon is alive after brief startup
312
- await Bun.sleep(500);
313
- let alive = false;
314
- try { process.kill(childPid, 0); alive = true; } catch {}
315
- if (!alive) {
316
- console.error(" ✗ Daemon exited immediately after start.");
267
+ // Wait for supervisor to start server child (poll status.json for pid)
268
+ const startWait = Date.now();
269
+ let serverPid: number | null = null;
270
+ while (Date.now() - startWait < 10_000) {
271
+ await Bun.sleep(500);
272
+ // Check supervisor is still alive
273
+ try { process.kill(supervisorPid, 0); } catch {
274
+ console.error(" ✗ Supervisor exited immediately after start.");
275
+ console.error(" Check logs: ppm logs");
276
+ console.error(" Or try: ppm start -f (foreground mode)");
277
+ process.exit(1);
278
+ }
279
+ // Check if server PID appeared in status.json
280
+ try {
281
+ const data = JSON.parse(readFileSync(statusFile, "utf-8"));
282
+ if (data.pid && data.supervisorPid) {
283
+ serverPid = data.pid;
284
+ break;
285
+ }
286
+ } catch {}
287
+ }
288
+
289
+ if (!serverPid) {
290
+ console.error(" ✗ Server did not start within 10 seconds.");
317
291
  console.error(" Check logs: ppm logs");
318
- console.error(" Or try: ppm start -f (foreground mode)");
292
+ try { process.kill(supervisorPid); } catch {}
319
293
  process.exit(1);
320
294
  }
321
295
 
322
- // Update status file with child PID + server script path for restart
323
- const status = { pid: childPid, port, host, shareUrl: shareUrl ?? null, tunnelPid: tunnelPid ?? null, serverScript: script };
324
- writeFileSync(statusFile, JSON.stringify(status));
325
- writeFileSync(pidFile, String(childPid));
296
+ // Read final status for share URL
297
+ let shareUrl: string | null = null;
298
+ if (options.share) {
299
+ // Give tunnel a bit more time to establish
300
+ const tunnelWait = Date.now();
301
+ while (Date.now() - tunnelWait < 35_000) {
302
+ await Bun.sleep(500);
303
+ try {
304
+ const data = JSON.parse(readFileSync(statusFile, "utf-8"));
305
+ if (data.shareUrl) { shareUrl = data.shareUrl; break; }
306
+ } catch {}
307
+ }
308
+ if (!shareUrl) console.warn(" ⚠ Tunnel started but URL not detected yet. Check: ppm status");
309
+ }
326
310
 
327
- console.log(` Daemon started (PID: ${childPid})\n`);
311
+ console.log(` Supervisor started (PID: ${supervisorPid}, server PID: ${serverPid})\n`);
328
312
  console.log(` ➜ Local: http://localhost:${port}/`);
329
313
  if (shareUrl) {
330
314
  console.log(` ➜ Share: ${shareUrl}`);
@@ -90,10 +90,7 @@ ${programArgs}
90
90
  <key>RunAtLoad</key>
91
91
  <true/>
92
92
  <key>KeepAlive</key>
93
- <dict>
94
- <key>SuccessfulExit</key>
95
- <false/>
96
- </dict>
93
+ <true/>
97
94
  <key>StandardOutPath</key>
98
95
  <string>${escapeXml(logPath)}</string>
99
96
  <key>StandardErrorPath</key>
@@ -133,7 +130,7 @@ Wants=network-online.target
133
130
  [Service]
134
131
  Type=simple
135
132
  ExecStart=${execStart}
136
- Restart=on-failure
133
+ Restart=always
137
134
  RestartSec=5
138
135
  ${envPath}
139
136
  WorkingDirectory=${homedir()}/.ppm
@@ -0,0 +1,387 @@
1
+ /**
2
+ * Supervisor process — long-lived parent that manages server child + tunnel child.
3
+ * Respawns children on crash with exponential backoff.
4
+ * Health-checks server (/api/health) and tunnel URL (public probe).
5
+ * Entry: __supervise__ <port> <host> [config] [profile] [--share]
6
+ */
7
+ import type { Subprocess } from "bun";
8
+ import { resolve } from "node:path";
9
+ import { homedir } from "node:os";
10
+ import {
11
+ readFileSync, writeFileSync, existsSync, mkdirSync, openSync, appendFileSync,
12
+ } from "node:fs";
13
+ import { isCompiledBinary } from "./autostart-generator.ts";
14
+
15
+ // ─── Constants ─────────────────────────────────────────────────────────
16
// Restart policy: a child that restarts more than this many times in one
// unstable streak is given up on.
const MAX_RESTARTS = 10;
// Exponential backoff bounds (milliseconds) used between respawns.
const BACKOFF_BASE_MS = 1_000;
const BACKOFF_MAX_MS = 60 * 1_000;
// If the previous crash was longer ago than this (5 minutes), the restart
// counter starts over.
const STABLE_WINDOW_MS = 5 * 60 * 1_000;
// Local server watchdog: polling period and consecutive failures tolerated.
const SERVER_HEALTH_INTERVAL_MS = 30 * 1_000;
const SERVER_HEALTH_FAIL_THRESHOLD = 3;
// Public tunnel probe: polling period and consecutive failures tolerated.
const TUNNEL_PROBE_INTERVAL_MS = 2 * 60 * 1_000;
const TUNNEL_PROBE_FAIL_THRESHOLD = 2;
// Pattern used to find the tunnel URL in cloudflared's stderr output.
const TUNNEL_URL_REGEX = /https:\/\/[a-z0-9-]+\.trycloudflare\.com/;

// Files shared with the CLI commands, all under ~/.ppm.
const PPM_DIR = resolve(homedir(), ".ppm");
const STATUS_FILE = resolve(PPM_DIR, "status.json");
const PID_FILE = resolve(PPM_DIR, "ppm.pid");
const LOG_FILE = resolve(PPM_DIR, "ppm.log");
30
+
31
+ // ─── State ─────────────────────────────────────────────────────────────
32
/** Handle type returned by `setInterval`, shared by the three periodic timers. */
type IntervalHandle = ReturnType<typeof setInterval>;

// Currently running children (null while absent or between respawns).
let serverChild: Subprocess | null = null;
let tunnelChild: Subprocess | null = null;
// Public URL of the live tunnel, or null when no tunnel is up.
let tunnelUrl: string | null = null;
// Set once by shutdown(); stops every respawn loop.
let shuttingDown = false;

// Restart counters and last-crash timestamps (ms since epoch) per child.
let serverRestarts = 0;
let lastServerCrash = 0;
let tunnelRestarts = 0;
let lastTunnelCrash = 0;

// Consecutive probe failures for the server watchdog and the tunnel probe.
let healthFailCount = 0;
let tunnelFailCount = 0;
// Raised by the SIGUSR2 handler so the next server exit respawns without backoff.
let serverRestartRequested = false;

// Interval handles, kept so shutdown() can cancel them.
let healthTimer: IntervalHandle | null = null;
let tunnelProbeTimer: IntervalHandle | null = null;
let heartbeatTimer: IntervalHandle | null = null;
50
+
51
+ // ─── Logging ───────────────────────────────────────────────────────────
52
/**
 * Append one timestamped supervisor line to LOG_FILE.
 * Lines at level "ERROR" or "FATAL" are additionally written to stderr.
 *
 * @param level Severity label placed in the line (e.g. "INFO", "WARN").
 * @param msg   Message text.
 */
function log(level: string, msg: string) {
  const stamped = `[${new Date().toISOString()}] [${level}] [supervisor] ${msg}\n`;
  try {
    appendFileSync(LOG_FILE, stamped);
  } catch {
    // Best effort: a failing log file must not take the supervisor down.
  }
  const mirrorToStderr = level === "ERROR" || level === "FATAL";
  if (mirrorToStderr) process.stderr.write(stamped);
}
60
+
61
+ // ─── Status management ─────────────────────────────────────────────────
62
/**
 * Load the parsed contents of STATUS_FILE.
 *
 * @returns The parsed JSON, or an empty object when the file is missing,
 *          unreadable, or not valid JSON.
 */
function readStatus(): Record<string, unknown> {
  try {
    if (!existsSync(STATUS_FILE)) return {};
    const raw = readFileSync(STATUS_FILE, "utf-8");
    return JSON.parse(raw);
  } catch {
    return {};
  }
}
68
+
69
/**
 * Merge `patch` into the current contents of STATUS_FILE and write it back.
 * Keys already in the file that are not in `patch` are preserved.
 *
 * A failed write does not throw, but it is logged: the CLI commands poll this
 * file, so a silent failure would show up only as an unexplained timeout.
 *
 * @param patch Fields to add or overwrite.
 */
function updateStatus(patch: Record<string, unknown>) {
  try {
    const data = { ...readStatus(), ...patch };
    writeFileSync(STATUS_FILE, JSON.stringify(data));
  } catch (err) {
    log("WARN", `Failed to update status file: ${err}`);
  }
}
75
+
76
+ // ─── Backoff calc ──────────────────────────────────────────────────────
77
/**
 * Exponential backoff: `baseMs` for the first restart, doubling with each
 * further restart, capped at `maxMs`.
 *
 * @param restartCount 1-based restart attempt number. Values below 1 (and NaN)
 *                     are treated as the first attempt so the delay never
 *                     drops under `baseMs`.
 * @param baseMs       Delay for the first attempt; defaults to BACKOFF_BASE_MS.
 * @param maxMs        Upper bound for the delay; defaults to BACKOFF_MAX_MS.
 * @returns Delay in milliseconds.
 */
function backoffDelay(
  restartCount: number,
  baseMs: number = BACKOFF_BASE_MS,
  maxMs: number = BACKOFF_MAX_MS,
): number {
  const exponent = Number.isNaN(restartCount) ? 0 : Math.max(0, restartCount - 1);
  // 2 ** exponent overflows to Infinity for huge counts; Math.min still yields maxMs.
  return Math.min(baseMs * 2 ** exponent, maxMs);
}
80
+
81
+ // ─── Server management ─────────────────────────────────────────────────
82
/**
 * Spawn the server child and keep it running.
 *
 * After each exit the function decides, in this order:
 *  1. supervisor is shutting down → stop;
 *  2. a SIGUSR2 restart was requested → respawn immediately, no backoff;
 *  3. exit code 0 → treat as a deliberate stop and do not respawn;
 *  4. otherwise it is a crash → respawn after exponential backoff, giving up
 *     (and shutting the supervisor down) once MAX_RESTARTS is exceeded.
 *
 * The restart request is checked before the exit code so that a server which
 * exits with code 0 when killed is still respawned, and the request flag is
 * always consumed by the exit it caused.
 *
 * @param serverArgs Arguments passed to the server process.
 * @param logFd      File descriptor receiving the child's stdout and stderr.
 * @returns Resolves when the respawn loop ends (cases 1, 3, or restart limit).
 */
export async function spawnServer(
  serverArgs: string[],
  logFd: number,
): Promise<void> {
  const cmd = isCompiledBinary()
    ? [process.execPath, ...serverArgs]
    : [process.execPath, "run", resolve(import.meta.dir, "..", "server", "index.ts"), ...serverArgs];

  const child = Bun.spawn({
    cmd,
    stdio: ["ignore", logFd, logFd],
    env: process.env,
  });
  serverChild = child;

  const childPid = child.pid;
  updateStatus({ pid: childPid });
  writeFileSync(PID_FILE, String(process.pid)); // supervisor PID for stop
  log("INFO", `Server started (PID: ${childPid})`);

  const exitCode = await child.exited;
  serverChild = null;

  // Consume the SIGUSR2 flag with the exit it triggered, whatever happens next.
  const restartRequested = serverRestartRequested;
  serverRestartRequested = false;

  if (shuttingDown) {
    log("INFO", `Server exited cleanly (code ${exitCode})`);
    return;
  }

  // SIGUSR2 restart — skip backoff, respawn immediately (even on exit code 0)
  if (restartRequested) {
    log("INFO", `Server restarting (SIGUSR2), no backoff`);
    return spawnServer(serverArgs, logFd);
  }

  if (exitCode === 0) {
    log("INFO", `Server exited cleanly (code ${exitCode})`);
    return;
  }

  // Crash — apply backoff; a long quiet period since the last crash resets the counter
  const now = Date.now();
  if (now - lastServerCrash > STABLE_WINDOW_MS) serverRestarts = 0;
  lastServerCrash = now;
  serverRestarts++;

  if (serverRestarts > MAX_RESTARTS) {
    log("FATAL", `Server exceeded ${MAX_RESTARTS} restarts, giving up`);
    shutdown();
    return;
  }

  const delay = backoffDelay(serverRestarts);
  log("WARN", `Server crashed (exit ${exitCode}), restarting in ${delay}ms (#${serverRestarts})`);
  await Bun.sleep(delay);

  // Shutdown may have started while we were sleeping.
  if (!shuttingDown) return spawnServer(serverArgs, logFd);
}
135
+
136
+ // ─── Tunnel management ─────────────────────────────────────────────────
137
/**
 * Read a cloudflared stderr stream until a tunnel URL appears.
 *
 * @param stderr Stream to scan; it is locked by this function's reader.
 * @returns The first text matching TUNNEL_URL_REGEX.
 * @throws Rejects with "Tunnel URL timeout (30s)" if no URL is seen within
 *         30 seconds, with "cloudflared exited without providing URL" if the
 *         stream ends first, or with the underlying read error.
 */
async function extractUrlFromStderr(stderr: ReadableStream<Uint8Array>): Promise<string> {
  const reader = stderr.getReader();
  const decoder = new TextDecoder();

  // Consume and discard whatever is left on the stream.
  const drainRest = async () => {
    try { while (!(await reader.read()).done) {} } catch {}
  };

  return new Promise((resolve, reject) => {
    const deadline = setTimeout(() => reject(new Error("Tunnel URL timeout (30s)")), 30_000);

    const pump = async () => {
      // Output is accumulated so a URL split across two chunks is still found.
      let seen = "";
      try {
        for (;;) {
          const chunk = await reader.read();
          if (chunk.done) {
            clearTimeout(deadline);
            reject(new Error("cloudflared exited without providing URL"));
            return;
          }
          seen += decoder.decode(chunk.value, { stream: true });
          const found = TUNNEL_URL_REGEX.exec(seen);
          if (found) {
            clearTimeout(deadline);
            // Keep draining in background to avoid SIGPIPE
            drainRest();
            resolve(found[0]);
            return;
          }
        }
      } catch (err) {
        clearTimeout(deadline);
        reject(err);
      }
    };
    pump();
  });
}
172
+
173
/**
 * Report `url` to the cloud service via a heartbeat, if a cloud device exists.
 * Never throws: any failure while importing or sending is swallowed, and an
 * unsuccessful heartbeat is only logged.
 *
 * @param url Tunnel URL to report.
 */
async function syncUrlToCloud(url: string) {
  try {
    const { sendHeartbeat, getCloudDevice } = await import("./cloud.service.ts");
    if (!getCloudDevice()) return;

    const synced = await sendHeartbeat(url);
    if (synced) {
      log("INFO", `Cloud synced: ${url}`);
    } else {
      log("WARN", "Cloud sync failed (non-blocking)");
    }
  } catch {}
}
183
+
184
/**
 * (Re)start the periodic cloud heartbeat, replacing any previous timer.
 * Every five minutes the current module-level `tunnelUrl` is synced, if set.
 *
 * @param url Not read by the body; each tick uses the live `tunnelUrl` instead.
 */
function startCloudHeartbeat(url: string) {
  if (heartbeatTimer) clearInterval(heartbeatTimer);

  const beat = () => {
    if (tunnelUrl) syncUrlToCloud(tunnelUrl);
  };
  heartbeatTimer = setInterval(beat, 5 * 60 * 1000);
}
190
+
191
/**
 * Spawn cloudflared for `port` and keep a tunnel alive.
 *
 * Steps per attempt: obtain the cloudflared binary, spawn it, read the public
 * URL from its stderr, publish URL + PID to the status file, sync the URL to
 * the cloud, then wait for the process to exit. An exit (or a failure to get a
 * URL) is followed by an exponential-backoff retry.
 *
 * `shuttingDown` is re-checked after every await that precedes a spawn, so a
 * shutdown that begins while this function is waiting never starts a new
 * cloudflared process.
 *
 * @param port Local port the tunnel forwards to.
 * @returns Resolves when the loop ends: shutdown, cloudflared unavailable, or
 *          more than MAX_RESTARTS tunnel crashes.
 */
export async function spawnTunnel(port: number): Promise<void> {
  let bin: string;
  try {
    const { ensureCloudflared } = await import("./cloudflared.service.ts");
    bin = await ensureCloudflared();
  } catch (err) {
    log("ERROR", `Failed to get cloudflared: ${err}`);
    return;
  }

  // Shutdown may have started while the binary was being resolved.
  if (shuttingDown) return;

  const child = Bun.spawn(
    [bin, "tunnel", "--url", `http://127.0.0.1:${port}`],
    { stderr: "pipe", stdout: "ignore", stdin: "ignore" },
  );
  tunnelChild = child;

  let url: string;
  try {
    url = await extractUrlFromStderr(child.stderr as ReadableStream<Uint8Array>);
  } catch (err) {
    log("ERROR", `Tunnel URL extraction failed: ${err}`);
    tunnelUrl = null;
    try { child.kill(); } catch {}
    tunnelChild = null;

    if (shuttingDown) return;
    tunnelRestarts++;
    const delay = backoffDelay(tunnelRestarts);
    log("WARN", `Tunnel failed, retry in ${delay}ms (#${tunnelRestarts})`);
    await Bun.sleep(delay);
    // Re-check after sleeping, mirroring the crash path below.
    if (shuttingDown) return;
    return spawnTunnel(port);
  }

  tunnelUrl = url;
  updateStatus({ shareUrl: url, tunnelPid: child.pid });
  log("INFO", `Tunnel ready: ${url} (PID: ${child.pid})`);

  // Sync new URL to cloud immediately + start periodic heartbeat
  await syncUrlToCloud(url);
  startCloudHeartbeat(url);

  const exitCode = await child.exited;
  tunnelChild = null;
  const deadUrl = tunnelUrl;
  tunnelUrl = null;

  if (shuttingDown) return;

  // Crash — apply backoff; a long quiet period since the last crash resets the counter
  const now = Date.now();
  if (now - lastTunnelCrash > STABLE_WINDOW_MS) tunnelRestarts = 0;
  lastTunnelCrash = now;
  tunnelRestarts++;

  if (tunnelRestarts > MAX_RESTARTS) {
    log("ERROR", `Tunnel exceeded ${MAX_RESTARTS} restarts, disabling tunnel`);
    updateStatus({ shareUrl: null, tunnelPid: null });
    return;
  }

  const delay = backoffDelay(tunnelRestarts);
  log("WARN", `Tunnel died (exit ${exitCode}, was ${deadUrl}), restart in ${delay}ms (#${tunnelRestarts})`);
  await Bun.sleep(delay);

  if (!shuttingDown) return spawnTunnel(port);
}
254
+
255
+ // ─── Health checks ─────────────────────────────────────────────────────
256
/**
 * Start the server watchdog: every SERVER_HEALTH_INTERVAL_MS, GET
 * /api/health on 127.0.0.1 with a 5s timeout. After
 * SERVER_HEALTH_FAIL_THRESHOLD consecutive failures the server child is
 * killed; the spawnServer loop then respawns it.
 *
 * Failures are counted per server instance: the counter is cleared whenever
 * no server is running or the server was replaced during a probe, so failures
 * of an old instance are never charged to a freshly started one.
 *
 * @param port Local port the server listens on.
 */
function startServerHealthCheck(port: number) {
  healthTimer = setInterval(async () => {
    // Remember which instance is being probed; it may be replaced during the fetch.
    const probed = serverChild;
    if (shuttingDown || !probed) {
      healthFailCount = 0;
      return;
    }
    try {
      const res = await fetch(`http://127.0.0.1:${port}/api/health`, {
        signal: AbortSignal.timeout(5000),
      });
      if (res.ok) { healthFailCount = 0; return; }
    } catch {}

    // The probed instance already exited or was restarted: result is stale.
    if (serverChild !== probed) {
      healthFailCount = 0;
      return;
    }

    healthFailCount++;
    if (healthFailCount >= SERVER_HEALTH_FAIL_THRESHOLD) {
      log("WARN", `Server unresponsive (${healthFailCount} failures), killing`);
      try { probed.kill(); } catch {}
      healthFailCount = 0;
      // spawnServer loop handles respawn via exited promise
    }
  }, SERVER_HEALTH_INTERVAL_MS);
}
274
+
275
/**
 * Start the public tunnel probe: every TUNNEL_PROBE_INTERVAL_MS, GET
 * `<tunnelUrl>/api/health` with a 10s timeout. A success clears both the
 * failure count and the tunnel restart counter. After
 * TUNNEL_PROBE_FAIL_THRESHOLD consecutive failures the tunnel child is killed;
 * the spawnTunnel loop then creates a new tunnel.
 *
 * @param port Not read by the body; the probe goes through the public URL.
 */
function startTunnelProbe(port: number) {
  const probeOnce = async () => {
    const nothingToProbe = shuttingDown || !tunnelUrl || !tunnelChild;
    if (nothingToProbe) {
      tunnelFailCount = 0;
      return;
    }

    let reachable = false;
    try {
      const res = await fetch(`${tunnelUrl}/api/health`, {
        signal: AbortSignal.timeout(10_000),
      });
      reachable = res.ok;
    } catch {}

    if (reachable) {
      tunnelFailCount = 0;
      tunnelRestarts = 0; // reset on success
      return;
    }

    tunnelFailCount += 1;
    if (tunnelFailCount >= TUNNEL_PROBE_FAIL_THRESHOLD && tunnelChild) {
      log("WARN", `Tunnel URL dead (${tunnelFailCount} failures), regenerating`);
      try { tunnelChild.kill(); } catch {}
      tunnelFailCount = 0;
      // spawnTunnel loop handles respawn via exited promise
    }
  };

  tunnelProbeTimer = setInterval(probeOnce, TUNNEL_PROBE_INTERVAL_MS);
}
300
+
301
+ // ─── Shutdown ──────────────────────────────────────────────────────────
302
/**
 * Begin supervisor shutdown: mark `shuttingDown`, cancel the periodic timers,
 * and signal both children to terminate. Calling it again is a no-op.
 * It does not exit the process itself.
 */
export function shutdown() {
  if (shuttingDown) return;
  shuttingDown = true;
  log("INFO", "Supervisor shutting down");

  for (const timer of [healthTimer, tunnelProbeTimer, heartbeatTimer]) {
    if (timer) clearInterval(timer);
  }

  for (const child of [serverChild, tunnelChild]) {
    if (!child) continue;
    try { child.kill(); } catch {}
  }
}
314
+
315
+ // ─── Main entry ────────────────────────────────────────────────────────
316
/**
 * Supervisor main loop: record the supervisor PID, install signal handlers,
 * start the health checks, then run the server (and, with `share`, the tunnel)
 * respawn loops until both end, and exit the process.
 *
 * Signals:
 *  - SIGTERM / SIGINT: shut down children and exit 0.
 *  - SIGUSR2: restart the server child only; the tunnel is left running.
 *
 * @param opts.port    Port passed to the server and used for health checks.
 * @param opts.host    Host passed to the server.
 * @param opts.config  Optional config argument forwarded to the server.
 * @param opts.profile Optional profile argument forwarded to the server.
 * @param opts.share   Whether to run and probe a tunnel.
 */
export async function runSupervisor(opts: {
  port: number;
  host: string;
  config?: string;
  profile?: string;
  share: boolean;
}) {
  if (!existsSync(PPM_DIR)) mkdirSync(PPM_DIR, { recursive: true });

  const logFd = openSync(LOG_FILE, "a");
  log("INFO", `Supervisor started (PID: ${process.pid}, port: ${opts.port}, share: ${opts.share})`);

  // Write supervisor PID
  writeFileSync(PID_FILE, String(process.pid));
  // updateStatus merges into the existing file, so tunnel fields left by a
  // previous run must be cleared explicitly; otherwise anything polling
  // status.json for shareUrl would pick up the old, dead URL.
  updateStatus({
    supervisorPid: process.pid,
    port: opts.port,
    host: opts.host,
    shareUrl: null,
    tunnelPid: null,
  });

  // Build __serve__ args
  const serverArgs = [
    "__serve__", String(opts.port), opts.host,
    opts.config ?? "", opts.profile ?? "",
  ];
  // Strip trailing empty args
  while (serverArgs.length > 0 && serverArgs[serverArgs.length - 1] === "") serverArgs.pop();

  // Signal handlers
  process.on("SIGTERM", () => { shutdown(); process.exit(0); });
  process.on("SIGINT", () => { shutdown(); process.exit(0); });

  // SIGUSR2 = graceful server restart (tunnel stays alive)
  process.on("SIGUSR2", () => {
    log("INFO", "SIGUSR2 received, restarting server only");
    if (serverChild) {
      serverRestartRequested = true; // flag so spawnServer skips backoff
      try { serverChild.kill(); } catch {}
    }
  });

  // Start health checks
  startServerHealthCheck(opts.port);

  // Spawn server + tunnel in parallel
  const promises: Promise<void>[] = [spawnServer(serverArgs, logFd)];

  if (opts.share) {
    startTunnelProbe(opts.port);
    promises.push(spawnTunnel(opts.port));
  }

  await Promise.all(promises);

  // If we get here, both loops exited (shutdown or max restarts)
  log("INFO", "Supervisor exiting");
  process.exit(shuttingDown ? 0 : 1);
}
370
+
371
+ // ─── CLI entry point ───────────────────────────────────────────────────
372
// CLI entry: `__supervise__ <port> <host> [config] [profile] [--share]`.
if (process.argv.includes("__supervise__")) {
  const idx = process.argv.indexOf("__supervise__");

  // Positional value at `offset` after the marker. Missing values, empty
  // strings, the "_" placeholder and "--flag" tokens all count as "not given",
  // so a flag that follows stripped optional arguments is never read as one.
  const positional = (offset: number): string | undefined => {
    const value = process.argv[idx + offset];
    if (!value || value === "_" || value.startsWith("--")) return undefined;
    return value;
  };

  const port = parseInt(positional(1) ?? "8080", 10);
  if (Number.isNaN(port)) {
    // Fail fast: the server and health checks cannot work without a valid port.
    log("FATAL", `Invalid port argument: ${process.argv[idx + 1]}`);
    process.exit(1);
  }
  const host = positional(2) ?? "0.0.0.0";
  const config = positional(3);
  const profile = positional(4);
  const share = process.argv.includes("--share");

  // Set DB profile for supervisor (needed to read config)
  if (profile) {
    const { setDbProfile } = await import("./db.service.ts");
    setDbProfile(profile);
  }

  // Log and exit non-zero if the supervisor loop itself fails, instead of
  // leaving an unhandled rejection.
  runSupervisor({ port, host, config, profile, share }).catch((err) => {
    log("FATAL", `Supervisor failed: ${err}`);
    process.exit(1);
  });
}