@hienlh/ppm 0.9.52 → 0.9.53

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,23 @@
1
1
  # Changelog
2
2
 
3
+ ## [0.9.53] - 2026-04-07
4
+
5
+ ### Added
6
+ - **Supervisor Always Alive**: `ppm stop` now does a soft stop — kills server only, supervisor stays alive with Cloud WS + tunnel. Use `ppm stop --kill` or `ppm down` for full shutdown.
7
+ - **`ppm down` command**: Alias for `ppm stop --kill` (full shutdown).
8
+ - **`ppm stop --kill` flag**: Full shutdown that kills supervisor + server + tunnel.
9
+ - **Stopped page**: When server is stopped, tunnel URL serves a minimal HTML status page + 503 on `/api/health`.
10
+ - **Supervisor detection**: `ppm start` detects existing supervisor and resumes/upgrades instead of spawning a new one.
11
+ - **Cloud WS commands**: `start` (resume from stopped), `shutdown` (full kill), `stop` (now soft stop).
12
+ - **Exception handlers**: Supervisor catches `uncaughtException`/`unhandledRejection` — never crashes.
13
+ - **Lockfile**: Prevents concurrent `ppm start` races (`~/.ppm/.start-lock`).
14
+ - **Windows command file polling**: Supervisor polls command file every 1s on Windows (no SIGUSR2).
15
+
16
+ ### Changed
17
+ - **BREAKING**: `ppm stop` default behavior changed from full shutdown to soft stop.
18
+ - **Autostart**: Generates `__supervise__` instead of `__serve__`. Existing users must run `ppm autostart disable && ppm autostart enable` to regenerate.
19
+ - **Supervisor modularized**: Split into `supervisor.ts` (orchestrator), `supervisor-state.ts` (state machine + IPC), `supervisor-stopped-page.ts` (stopped HTML server).
20
+
3
21
  ## [0.9.52] - 2026-04-07
4
22
 
5
23
  ### Added
@@ -2,7 +2,37 @@
2
2
 
3
3
  All notable changes to PPM are documented here. Format follows [Keep a Changelog](https://keepachangelog.com/).
4
4
 
5
- **Current Version:** v0.9.9
5
+ **Current Version:** v0.9.11
6
+
7
+ ---
8
+
9
+ ## [0.9.11] — 2026-04-07
10
+
11
+ ### Added
12
+ - **Supervisor Always Alive Feature** — Distinguish between soft stop (server shutdown) and full shutdown (supervisor shutdown)
13
+ - `ppm stop` now performs SOFT STOP: kills server only, supervisor remains alive with Cloud WS + tunnel connectivity
14
+ - `ppm stop --kill` or `ppm down` performs FULL SHUTDOWN: kills everything (old `ppm stop` behavior)
15
+ - Supervisor now has new `stopped` state (in addition to running, paused, upgrading)
16
+ - When stopped, minimal HTML page served on the port (503 status on /api/health)
17
+ - `ppm start` detects existing supervisor and handles resume/upgrade scenarios
18
+ - Autostart now uses `__supervise__` instead of `__serve__` for consistency
19
+ - Cloud WS has new commands: `start`, `shutdown` (stop is now soft stop, separate from shutdown)
20
+ - Supervisor has uncaughtException/unhandledRejection handlers (never crashes)
21
+ - Supervisor logic modularized into 3 files: supervisor.ts (orchestrator), supervisor-state.ts (state machine), supervisor-stopped-page.ts (503 page)
22
+
23
+ ### Technical Details
24
+ - **Files Created:**
25
+ - `src/services/supervisor-state.ts` — State machine, IPC command file handling
26
+ - `src/services/supervisor-stopped-page.ts` — Minimal 503 HTML response
27
+ - Enhanced `src/services/supervisor.ts` — Orchestrator with stopped state support
28
+ - **Files Modified:**
29
+ - `src/cli/commands/stop.ts` — Added --kill flag, soft stop default, ppm down alias
30
+ - `src/cli/commands/start.ts` — Resume detection for existing supervisor
31
+ - `src/cli/autostart-generator.ts` — Uses __supervise__ entry point
32
+ - Cloud WS endpoints updated with new commands
33
+ - **Type Changes:** SupervisorState = "running" | "paused" | "stopped" | "upgrading"
34
+ - **API Changes:** GET /api/health returns 503 when server stopped (supervisor still running)
35
+ - **Breaking Changes:** `ppm stop` default behavior changed from full shutdown to soft stop (use `ppm stop --kill` or `ppm down` for the old behavior)
6
36
 
7
37
  ---
8
38
 
@@ -38,12 +38,13 @@ PPM is the **lightest path from phone to code** — a self-hosted, BYOK, multi-d
38
38
 
39
39
  **Theme:** Multi-device access + AI chat improvements. Solve the "I can't reach my PPM from my phone" problem.
40
40
 
41
- | Feature | Priority | Description |
42
- |---------|----------|-------------|
43
- | **PPM Cloud** | Critical | Separate cloud service for device registry + tunnel URL sync. Google OAuth login. CLI `ppm cloud link` syncs tunnel URL. Open cloud dashboard on any device → see machines → tap to connect. NO code/data through cloud — only URLs + metadata. |
44
- | **Auto-start** | High | PPM starts on boot. macOS launchd, Linux systemd, Windows Task Scheduler. CLI: `ppm autostart enable/disable`. Required for "always accessible" story. |
45
- | **Auto-upgrade** | High | Supervisor checks npm registry every 15min. UI banner shows when update available. One-click upgrade via API or CLI. Supervisor self-replaces after install (no OS autostart dependency). **Completed in v0.8.54** |
46
- | **AI Chat enhancements** | High | Tool allow/deny config per session. Chat modes (plan/code/ask). Model selector (opus/sonnet/haiku). Effort level. Max turns. System prompt customization. Better streaming UX (collapsible tool calls). |
41
+ | Feature | Priority | Status | Description |
42
+ |---------|----------|--------|-------------|
43
+ | **PPM Cloud** | Critical | — | Separate cloud service for device registry + tunnel URL sync. Google OAuth login. CLI `ppm cloud link` syncs tunnel URL. Open cloud dashboard on any device → see machines → tap to connect. NO code/data through cloud — only URLs + metadata. |
44
+ | **Auto-start** | High | — | PPM starts on boot. macOS launchd, Linux systemd, Windows Task Scheduler. CLI: `ppm autostart enable/disable`. Required for "always accessible" story. |
45
+ | **Auto-upgrade** | High | ✅ Done | Supervisor checks npm registry every 15min. UI banner shows when update available. One-click upgrade via API or CLI. Supervisor self-replaces after install (no OS autostart dependency). **Completed in v0.8.54** |
46
+ | **Supervisor Always Alive** | High | ✅ Done | Soft stop (server shutdown, supervisor stays) vs full shutdown. New `stopped` state. Cloud WS + tunnel stay active when stopped. `ppm start` resumes without supervisor restart. Modularized: supervisor.ts, supervisor-state.ts, supervisor-stopped-page.ts. **Completed in v0.9.11** |
47
+ | **AI Chat enhancements** | High | — | Tool allow/deny config per session. Chat modes (plan/code/ask). Model selector (opus/sonnet/haiku). Effort level. Max turns. System prompt customization. Better streaming UX (collapsible tool calls). |
47
48
 
48
49
  **PPM Cloud — scope guard:**
49
50
  - Cloud is OPTIONAL convenience, never a dependency. PPM works 100% without it.
@@ -1628,13 +1628,81 @@ $ ppm upgrade
1628
1628
  → Works in headless environments (no OS autostart dependency)
1629
1629
 
1630
1630
  $ ppm stop
1631
- Reads ~/.ppm/status.json first (new format)
1632
- Falls back to ppm.pid (compat)
1633
- Sends SIGTERM to daemon
1631
+ SOFT STOP: kills server only, supervisor stays alive with Cloud WS + tunnel
1632
+ Supervisor transitions to "stopped" state
1633
+ Minimal HTML page served on port (503 status on /api/health)
1634
+ → Tunnel and Cloud connectivity remain active
1635
+ → `ppm start` resumes without restarting supervisor process
1636
+
1637
+ $ ppm stop --kill OR ppm down
1638
+ → FULL SHUTDOWN: kills everything (supervisor + server + tunnel)
1639
+ → Supervisor shuts down gracefully and terminates (kills server + tunnel children first)
1634
1640
  → Cleans up status.json and ppm.pid
1635
- → Graceful shutdown (close WS, cleanup PTY, stop tunnel)
1641
+ → Graceful cleanup (close WS, cleanup PTY, stop tunnel)
1636
1642
  ```
1637
1643
 
1644
+ ### Supervisor Architecture (v0.9.11+)
1645
+
1646
+ The supervisor is a long-lived parent process that manages server + tunnel children with resilience and state management.
1647
+
1648
+ **Architecture:**
1649
+ ```
1650
+ Supervisor Process (parent)
1651
+ ├── Server Child (Hono HTTP server)
1652
+ │ ├── Health checks every 30s (/api/health)
1653
+ │ ├── Auto-restart on crash (exponential backoff, max 10 restarts)
1654
+ │ └── If in "stopped" state, serves minimal 503 page instead of restarting
1655
+
1656
+ ├── Tunnel Child (Cloudflare Quick Tunnel, if --share)
1657
+ │ ├── URL probe every 2min
1658
+ │ ├── Auto-reconnect on failure
1659
+ │ └── URL persisted to status.json
1660
+
1661
+ ├── State Machine: "running" | "paused" | "stopped" | "upgrading"
1662
+ │ ├── running — Server spawned, tunnel optional, serving requests
1663
+ │ ├── paused — Supervisor paused (resume via signal)
1664
+ │ ├── stopped — Server stopped (soft stop), tunnel alive, Cloud WS active
1665
+ │ └── upgrading — Self-replace in progress
1666
+
1667
+ ├── Upgrade Check (every 15min)
1668
+ │ └── npm registry poll → availableVersion written to status.json
1669
+
1670
+ ├── Stopped Page Server
1671
+ │ ├── Lightweight HTTP handler on same port as server
1672
+ │ ├── Returns 503 on /api/health
1673
+ │ └── Proxies Cloud WS calls through to PPM Cloud
1674
+
1675
+ └── Error Resilience
1676
+ ├── uncaughtException → log + exit gracefully
1677
+ ├── unhandledRejection → log + continue
1678
+ └── Signal handlers: SIGTERM (full shutdown), SIGUSR1 (self-replace), SIGUSR2 (restart skip backoff)
1679
+ ```
1680
+
1681
+ **Soft Stop vs Full Shutdown:**
1682
+ | Command | Server | Supervisor | Tunnel | Use Case |
1683
+ |---------|--------|------------|--------|----------|
1684
+ | `ppm stop` | Killed | Stays alive | Stays alive | Restart later with `ppm start` |
1685
+ | `ppm stop --kill` | Killed | Killed | Killed | Full cleanup, exit |
1686
+ | `ppm down` | Killed | Killed | Killed | Full cleanup, exit |
1687
+
1688
+ **State Persistence:**
1689
+ - Status file: `~/.ppm/status.json` — PID, port, host, shareUrl, supervisorPid, availableVersion, state
1690
+ - Lock file: `~/.ppm/.start-lock` — Prevent concurrent starts
1691
+ - Command file: `~/.ppm/.supervisor-cmd` — IPC for soft_stop, resume, self_replace
1692
+
1693
+ **Stopped Page Implementation:**
1694
+ - Minimal HTTP server on same port as main server
1695
+ - Serves `503 Service Unavailable` on /api/health
1696
+ - Proxies Cloud WS calls to PPM Cloud (if tunnel configured)
1697
+ - Allows `ppm start` to resume without supervisor restart
1698
+
1699
+ **Files (Modular Design):**
1700
+ - `src/services/supervisor.ts` — Main orchestrator (spawn, health checks, upgrade checks)
1701
+ - `src/services/supervisor-state.ts` — State machine, IPC command handling, signal routing
1702
+ - `src/services/supervisor-stopped-page.ts` — Minimal 503 page + Cloud WS proxy
1703
+
1704
+ ---
1705
+
1638
1706
  ### Future: Multi-Machine (Not in v2)
1639
1707
  Would require:
1640
1708
  - Central state server (Redis/Postgres)
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@hienlh/ppm",
3
- "version": "0.9.52",
3
+ "version": "0.9.53",
4
4
  "description": "Personal Project Manager — mobile-first web IDE with AI assistance",
5
5
  "author": "hienlh",
6
6
  "license": "MIT",
@@ -42,6 +42,35 @@ export async function restartServer(options: { config?: string; force?: boolean
42
42
  process.exit(1);
43
43
  }
44
44
 
45
+ // Stopped state: treat restart as resume (send resume command)
46
+ if (state === "stopped") {
47
+ console.log("\n Server is stopped. Resuming via supervisor...\n");
48
+ const cmdFile = resolve(PPM_DIR, ".supervisor-cmd");
49
+ writeFileSync(cmdFile, JSON.stringify({ action: "resume" }));
50
+ // Signal supervisor (Windows: polling picks up command file)
51
+ if (process.platform !== "win32") {
52
+ try { process.kill(supervisorPid, "SIGUSR2"); } catch (e) {
53
+ console.error(` ✗ Failed to signal supervisor: ${e}`);
54
+ process.exit(1);
55
+ }
56
+ }
57
+ // Wait for state to change back to running
58
+ const rStart = Date.now();
59
+ while (Date.now() - rStart < 15_000) {
60
+ await Bun.sleep(500);
61
+ try {
62
+ const newStatus = JSON.parse(readFileSync(STATUS_FILE, "utf-8"));
63
+ if (newStatus.state === "running" && newStatus.pid) {
64
+ console.log(` ✓ Server resumed (PID: ${newStatus.pid})`);
65
+ if (newStatus.shareUrl) console.log(` ➜ Share: ${newStatus.shareUrl}`);
66
+ process.exit(0);
67
+ }
68
+ } catch {}
69
+ }
70
+ console.error(" ⚠ Resume timed out. Check: ppm logs");
71
+ process.exit(1);
72
+ }
73
+
45
74
  const oldServerPid = status.pid as number | undefined;
46
75
  console.log("\n Restarting PPM server via supervisor...");
47
76
  console.log(" If you're using PPM terminal, wait a few seconds for auto-reconnect.\n");
@@ -1,10 +1,11 @@
1
1
  import { resolve } from "node:path";
2
2
  import { homedir } from "node:os";
3
- import { readFileSync, unlinkSync, existsSync } from "node:fs";
3
+ import { readFileSync, writeFileSync, unlinkSync, existsSync } from "node:fs";
4
4
 
5
5
  const PPM_DIR = process.env.PPM_HOME || resolve(homedir(), ".ppm");
6
6
  const PID_FILE = resolve(PPM_DIR, "ppm.pid");
7
7
  const STATUS_FILE = resolve(PPM_DIR, "status.json");
8
+ const CMD_FILE = resolve(PPM_DIR, ".supervisor-cmd");
8
9
 
9
10
  function killPid(pid: number, label: string): boolean {
10
11
  try {
@@ -51,7 +52,7 @@ function killAllByName(name: string): number {
51
52
  return killed;
52
53
  }
53
54
 
54
- export async function stopServer(options?: { all?: boolean }) {
55
+ export async function stopServer(options?: { all?: boolean; kill?: boolean }) {
55
56
  if (options?.all) {
56
57
  console.log(" Stopping all PPM and cloudflared processes...\n");
57
58
  const cfKilled = killAllByName("cloudflared");
@@ -76,14 +77,76 @@ export async function stopServer(options?: { all?: boolean }) {
76
77
  return;
77
78
  }
78
79
 
80
+ // Full shutdown: --kill flag or `ppm down`
81
+ if (options?.kill) {
82
+ return hardStop();
83
+ }
84
+
85
+ // Default: soft stop — kill server only, supervisor stays alive
86
+ return softStopCmd();
87
+ }
88
+
89
+ /** Soft stop: write command file + signal supervisor → kills server only */
90
+ async function softStopCmd() {
91
+ let status: Record<string, unknown> | null = null;
92
+ if (existsSync(STATUS_FILE)) {
93
+ try { status = JSON.parse(readFileSync(STATUS_FILE, "utf-8")); } catch {}
94
+ }
95
+
96
+ const supervisorPid = (status?.supervisorPid as number) ?? null;
97
+
98
+ if (!supervisorPid) {
99
+ // No supervisor — fall back to hard stop (legacy)
100
+ return hardStop();
101
+ }
102
+
103
+ // Check if supervisor is alive
104
+ try { process.kill(supervisorPid, 0); } catch {
105
+ console.log("Supervisor not running. Cleaning up.");
106
+ cleanup();
107
+ return;
108
+ }
109
+
110
+ // Already stopped?
111
+ if ((status?.state as string) === "stopped") {
112
+ console.log("PPM server is already stopped. Supervisor still alive.");
113
+ console.log("Use 'ppm stop --kill' or 'ppm down' to fully shut down.");
114
+ return;
115
+ }
116
+
117
+ // Write soft stop command file + signal supervisor (Windows: polling picks it up)
118
+ writeFileSync(CMD_FILE, JSON.stringify({ action: "soft_stop" }));
119
+ if (process.platform !== "win32") {
120
+ try { process.kill(supervisorPid, "SIGUSR2"); } catch (e) {
121
+ console.error(` Failed to signal supervisor: ${e}`);
122
+ return;
123
+ }
124
+ }
125
+
126
+ // Wait for state to change to "stopped" in status.json
127
+ const start = Date.now();
128
+ while (Date.now() - start < 5000) {
129
+ await Bun.sleep(500);
130
+ try {
131
+ const data = JSON.parse(readFileSync(STATUS_FILE, "utf-8"));
132
+ if (data.state === "stopped") {
133
+ console.log("PPM server stopped. Supervisor still alive (Cloud WS + tunnel).");
134
+ console.log("Use 'ppm start' to restart or 'ppm stop --kill' to fully shut down.");
135
+ return;
136
+ }
137
+ } catch {}
138
+ }
139
+ console.log("PPM server stop requested.");
140
+ }
141
+
142
+ /** Hard stop: SIGTERM supervisor → everything dies (current behavior) */
143
+ async function hardStop() {
79
144
  let status: { pid?: number; tunnelPid?: number; supervisorPid?: number } | null = null;
80
145
 
81
- // Read status.json
82
146
  if (existsSync(STATUS_FILE)) {
83
147
  try { status = JSON.parse(readFileSync(STATUS_FILE, "utf-8")); } catch {}
84
148
  }
85
149
 
86
- // Fallback to ppm.pid (now stores supervisor PID)
87
150
  const pidFromFile = existsSync(PID_FILE)
88
151
  ? parseInt(readFileSync(PID_FILE, "utf-8").trim(), 10)
89
152
  : NaN;
@@ -102,10 +165,8 @@ export async function stopServer(options?: { all?: boolean }) {
102
165
  // Kill supervisor first — its SIGTERM handler kills server + tunnel children
103
166
  if (supervisorPid) {
104
167
  killPid(supervisorPid, "supervisor");
105
- // Give supervisor 2s to gracefully kill children
106
168
  await Bun.sleep(2000);
107
169
  } else if (fallbackPid) {
108
- // Legacy: ppm.pid might be server PID (pre-supervisor) or supervisor PID
109
170
  killPid(fallbackPid, "supervisor/server (pidfile)");
110
171
  await Bun.sleep(1000);
111
172
  }
package/src/index.ts CHANGED
@@ -39,13 +39,22 @@ program
39
39
 
40
40
  program
41
41
  .command("stop")
42
- .description("Stop the PPM daemon")
42
+ .description("Stop the PPM server (supervisor stays alive)")
43
43
  .option("-a, --all", "Kill all PPM and cloudflared processes (including untracked)")
44
+ .option("--kill", "Full shutdown (kills supervisor too)")
44
45
  .action(async (options) => {
45
46
  const { stopServer } = await import("./cli/commands/stop.ts");
46
47
  await stopServer(options);
47
48
  });
48
49
 
50
+ program
51
+ .command("down")
52
+ .description("Fully shut down PPM (supervisor + server + tunnel)")
53
+ .action(async () => {
54
+ const { stopServer } = await import("./cli/commands/stop.ts");
55
+ await stopServer({ kill: true });
56
+ });
57
+
49
58
  program
50
59
  .command("restart")
51
60
  .description("Restart the server (keeps tunnel alive)")
@@ -169,6 +169,49 @@ app.route("/api/cloud", cloudRoutes);
169
169
  // Static files / SPA fallback (non-API routes)
170
170
  app.route("/", staticRoutes);
171
171
 
172
+ // ─── Helpers for supervisor detection ───────────────────────────────────
173
+ async function waitForNewSupervisor(statusFile: string, oldPid: number) {
174
+ const { readFileSync } = await import("node:fs");
175
+ const start = Date.now();
176
+ while (Date.now() - start < 30_000) {
177
+ await Bun.sleep(1000);
178
+ try {
179
+ const data = JSON.parse(readFileSync(statusFile, "utf-8"));
180
+ if (data.supervisorPid && data.supervisorPid !== oldPid && data.state === "running") {
181
+ console.log(` Upgrade complete (new PID: ${data.supervisorPid})`);
182
+ process.exit(0);
183
+ }
184
+ } catch {}
185
+ }
186
+ console.error(" Upgrade timed out (30s). Check: ppm logs");
187
+ process.exit(1);
188
+ }
189
+
190
+ async function waitForServerReady(statusFile: string, port: number) {
191
+ const { readFileSync } = await import("node:fs");
192
+ const start = Date.now();
193
+ while (Date.now() - start < 10_000) {
194
+ await Bun.sleep(500);
195
+ try {
196
+ const data = JSON.parse(readFileSync(statusFile, "utf-8"));
197
+ if (data.state === "running" && data.pid) {
198
+ // Verify server is responding
199
+ try {
200
+ const res = await fetch(`http://127.0.0.1:${port}/api/health`, {
201
+ signal: AbortSignal.timeout(2000),
202
+ });
203
+ if (res.ok) {
204
+ console.log(` Server is ready (PID: ${data.pid}).`);
205
+ process.exit(0);
206
+ }
207
+ } catch {}
208
+ }
209
+ } catch {}
210
+ }
211
+ console.log(" Resume signal sent. Check: ppm status");
212
+ process.exit(0);
213
+ }
214
+
172
215
  export async function startServer(options: {
173
216
  port?: string;
174
217
  share?: boolean;
@@ -189,36 +232,105 @@ export async function startServer(options: {
189
232
  const { bootstrapProviders } = await import("../providers/registry.ts");
190
233
  await bootstrapProviders();
191
234
 
192
- // Check if port is already in use before spawning supervisor
193
- const portInUse = await new Promise<boolean>((resolve) => {
194
- const net = require("node:net") as typeof import("node:net");
195
- const tester = net.createServer()
196
- .once("error", (err: NodeJS.ErrnoException) => {
197
- resolve(err.code === "EADDRINUSE");
198
- })
199
- .once("listening", () => {
200
- tester.close(() => resolve(false));
201
- })
202
- .listen(port, host);
203
- });
204
- if (portInUse) {
205
- console.error(`\n ✗ Port ${port} is already in use.`);
206
- console.error(` Run 'ppm stop' first or use a different port with --port.\n`);
207
- process.exit(1);
208
- }
209
-
210
235
  {
211
236
  const { resolve } = await import("node:path");
212
237
  const { homedir } = await import("node:os");
213
238
  const { writeFileSync, readFileSync, mkdirSync, existsSync, openSync } = await import("node:fs");
214
239
  const { isCompiledBinary } = await import("../services/autostart-generator.ts");
240
+ const { writeCmd, acquireLock, releaseLock } = await import("../services/supervisor-state.ts");
215
241
 
216
242
  const ppmDir = process.env.PPM_HOME || resolve(homedir(), ".ppm");
217
243
  if (!existsSync(ppmDir)) mkdirSync(ppmDir, { recursive: true });
218
244
  const pidFile = resolve(ppmDir, "ppm.pid");
219
245
  const statusFile = resolve(ppmDir, "status.json");
220
246
 
221
- // Kill any leftover processes from previous run
247
+ // Prevent concurrent ppm start races
248
+ if (!acquireLock()) {
249
+ console.log("\n Another 'ppm start' is already in progress. Exiting.\n");
250
+ process.exit(1);
251
+ }
252
+ // Release lock on exit (normal or error)
253
+ process.on("exit", releaseLock);
254
+
255
+ // ── Check for existing supervisor ──────────────────────────────────
256
+ if (existsSync(statusFile)) {
257
+ try {
258
+ const status = JSON.parse(readFileSync(statusFile, "utf-8"));
259
+ const supervisorPid = status.supervisorPid as number;
260
+
261
+ if (supervisorPid) {
262
+ try {
263
+ process.kill(supervisorPid, 0); // throws if dead
264
+
265
+ // Supervisor is alive — handle based on state
266
+ const state = status.state as string;
267
+ const runningVersion = status.serverVersion as string;
268
+
269
+ if (state === "stopped") {
270
+ console.log(" Supervisor is alive (stopped state). Resuming server...");
271
+ if (runningVersion !== VERSION) {
272
+ console.log(` Upgrading: ${runningVersion} -> ${VERSION}`);
273
+ process.kill(supervisorPid, "SIGUSR1");
274
+ await waitForNewSupervisor(statusFile, supervisorPid);
275
+ } else {
276
+ writeCmd("resume");
277
+ process.kill(supervisorPid, "SIGUSR2");
278
+ await waitForServerReady(statusFile, port);
279
+ }
280
+ return;
281
+ }
282
+
283
+ if (state === "running") {
284
+ if (runningVersion !== VERSION) {
285
+ console.log(` Supervisor running (v${runningVersion}). Upgrading to v${VERSION}...`);
286
+ process.kill(supervisorPid, "SIGUSR1");
287
+ await waitForNewSupervisor(statusFile, supervisorPid);
288
+ } else {
289
+ console.log(`\n PPM is already running (PID: ${supervisorPid}).`);
290
+ console.log(` Use 'ppm restart' to reload or 'ppm stop' first.\n`);
291
+ process.exit(0);
292
+ }
293
+ return;
294
+ }
295
+
296
+ if (state === "paused") {
297
+ console.log(" Supervisor is paused (max restarts). Sending resume...");
298
+ writeCmd("resume");
299
+ process.kill(supervisorPid, "SIGUSR2");
300
+ await waitForServerReady(statusFile, port);
301
+ return;
302
+ }
303
+
304
+ if (state === "upgrading") {
305
+ console.log(" Supervisor is currently upgrading. Please wait...");
306
+ process.exit(0);
307
+ }
308
+ } catch {
309
+ // Supervisor PID is dead, continue with fresh start
310
+ }
311
+ }
312
+ } catch {}
313
+ }
314
+
315
+ // ── Check port availability ────────────────────────────────────────
316
+ const portInUse = await new Promise<boolean>((resolve) => {
317
+ const net = require("node:net") as typeof import("node:net");
318
+ const tester = net.createServer()
319
+ .once("error", (err: NodeJS.ErrnoException) => {
320
+ resolve(err.code === "EADDRINUSE");
321
+ })
322
+ .once("listening", () => {
323
+ tester.close(() => resolve(false));
324
+ })
325
+ .listen(port, host);
326
+ });
327
+ if (portInUse) {
328
+ console.error(`\n ✗ Port ${port} is already in use.`);
329
+ console.error(` Run 'ppm stop' first or use a different port with --port.\n`);
330
+ process.exit(1);
331
+ }
332
+
333
+ // Kill any leftover processes from previous run (stale status.json)
222
334
  if (existsSync(statusFile)) {
223
335
  try {
224
336
  const prev = JSON.parse(readFileSync(statusFile, "utf-8"));
@@ -39,24 +39,26 @@ export function resolveBunPath(): string {
39
39
  throw new Error("Could not resolve bun binary. Install Bun or add it to PATH.");
40
40
  }
41
41
 
42
- /** Build the command array for the PPM server process */
42
+ /** Build the command array for the PPM supervisor process */
43
43
  export function buildExecCommand(config: AutoStartConfig): string[] {
44
44
  if (isCompiledBinary()) {
45
- // Compiled binary: just run self with __serve__ args
46
- const args = [process.execPath, "__serve__", String(config.port), config.host];
45
+ // Compiled binary: just run self with __supervise__ args
46
+ const args = [process.execPath, "__supervise__", String(config.port), config.host];
47
47
  if (config.configPath) args.push(config.configPath);
48
48
  if (config.profile) args.push(config.profile);
49
+ if (config.share) args.push("--share");
49
50
  return args;
50
51
  }
51
52
 
52
- // Bun runtime: bun run <script> __serve__ <port> <host> [config] [profile]
53
+ // Bun runtime: bun run <script> __supervise__ <port> <host> [config] [profile]
53
54
  const bunPath = resolveBunPath();
54
- const scriptPath = resolve(import.meta.dir, "..", "server", "index.ts");
55
- const args = [bunPath, "run", scriptPath, "__serve__", String(config.port), config.host];
55
+ const scriptPath = resolve(import.meta.dir, "supervisor.ts");
56
+ const args = [bunPath, "run", scriptPath, "__supervise__", String(config.port), config.host];
56
57
  if (config.configPath) args.push(config.configPath);
57
58
  else args.push(""); // placeholder
58
59
  if (config.profile) args.push(config.profile);
59
60
  else args.push(""); // placeholder
61
+ if (config.share) args.push("--share");
60
62
  return args;
61
63
  }
62
64
 
@@ -0,0 +1,100 @@
1
+ /**
2
+ * Supervisor state machine — state transitions, IPC command file, signal handling.
3
+ * Extracted from supervisor.ts to keep the orchestrator lean.
4
+ */
5
+ import { resolve } from "node:path";
6
+ import { homedir } from "node:os";
7
+ import {
8
+ readFileSync, writeFileSync, existsSync, unlinkSync, renameSync, openSync, closeSync,
9
+ } from "node:fs";
10
+ import { constants } from "node:fs";
11
+
12
+ const PPM_DIR = resolve(process.env.PPM_HOME || resolve(homedir(), ".ppm"));
13
+ export const CMD_FILE = resolve(PPM_DIR, ".supervisor-cmd");
14
+ export const STATUS_FILE = resolve(PPM_DIR, "status.json");
15
+ export const PID_FILE = resolve(PPM_DIR, "ppm.pid");
16
+ export const LOCK_FILE = resolve(PPM_DIR, ".start-lock");
17
+
18
+ // ─── State ─────────────────────────────────────────────────────────────
19
+ export type SupervisorState = "running" | "paused" | "stopped" | "upgrading";
20
+
21
+ let _state: SupervisorState = "running";
22
+ let _resumeResolve: (() => void) | null = null;
23
+
24
+ export function getState(): SupervisorState { return _state; }
25
+
26
+ export function setState(s: SupervisorState) { _state = s; }
27
+
28
+ export function waitForResume(): Promise<void> {
29
+ return new Promise((res) => { _resumeResolve = res; });
30
+ }
31
+
32
+ export function triggerResume(): void {
33
+ if (_resumeResolve) {
34
+ _resumeResolve();
35
+ _resumeResolve = null;
36
+ }
37
+ }
38
+
39
+ // ─── Status file helpers ───────────────────────────────────────────────
40
+ export function readStatus(): Record<string, unknown> {
41
+ try {
42
+ if (existsSync(STATUS_FILE)) return JSON.parse(readFileSync(STATUS_FILE, "utf-8"));
43
+ } catch {}
44
+ return {};
45
+ }
46
+
47
+ export function updateStatus(patch: Record<string, unknown>) {
48
+ try {
49
+ const data = { ...readStatus(), ...patch };
50
+ writeFileSync(STATUS_FILE, JSON.stringify(data));
51
+ } catch {}
52
+ }
53
+
54
+ // ─── Command file protocol ─────────────────────────────────────────────
55
+ export type CmdAction = "soft_stop" | "resume";
56
+
57
+ /** Atomically claim + read command file (rename to .claimed, read, delete) */
58
+ export function readAndDeleteCmd(): { action: CmdAction } | null {
59
+ const claimed = CMD_FILE + ".claimed";
60
+ try {
61
+ renameSync(CMD_FILE, claimed); // atomic claim — second caller gets ENOENT
62
+ const cmd = JSON.parse(readFileSync(claimed, "utf-8"));
63
+ unlinkSync(claimed);
64
+ return cmd;
65
+ } catch {
66
+ // No command file or already claimed by another handler
67
+ try { unlinkSync(claimed); } catch {}
68
+ return null;
69
+ }
70
+ }
71
+
72
+ export function writeCmd(action: CmdAction) {
73
+ writeFileSync(CMD_FILE, JSON.stringify({ action }));
74
+ }
75
+
76
+ // ─── Lockfile ──────────────────────────────────────────────────────────
77
+ export function acquireLock(): boolean {
78
+ try {
79
+ // Try exclusive create — fails if file already exists (atomic)
80
+ const fd = openSync(LOCK_FILE, "wx");
81
+ writeFileSync(fd, String(process.pid));
82
+ closeSync(fd);
83
+ return true;
84
+ } catch {
85
+ // File exists — check if holding process is alive
86
+ try {
87
+ const pid = parseInt(readFileSync(LOCK_FILE, "utf-8").trim(), 10);
88
+ if (!isNaN(pid)) {
89
+ try { process.kill(pid, 0); return false; } catch {} // stale lock
90
+ }
91
+ // Stale lock — overwrite
92
+ writeFileSync(LOCK_FILE, String(process.pid));
93
+ return true;
94
+ } catch { return false; }
95
+ }
96
+ }
97
+
98
+ export function releaseLock() {
99
+ try { unlinkSync(LOCK_FILE); } catch {}
100
+ }
@@ -0,0 +1,73 @@
1
+ /**
2
+ * Minimal HTTP server that serves a "stopped" page when the PPM server child is down.
3
+ * Binds to the same port so the tunnel URL still works.
4
+ */
5
+ import { appendFileSync } from "node:fs";
6
+ import { resolve } from "node:path";
7
+ import { homedir } from "node:os";
8
+
9
+ const LOG_FILE = resolve(process.env.PPM_HOME || resolve(homedir(), ".ppm"), "ppm.log");
10
+
11
+ function log(level: string, msg: string) {
12
+ const ts = new Date().toISOString();
13
+ try { appendFileSync(LOG_FILE, `[${ts}] [${level}] [stopped-page] ${msg}\n`); } catch {}
14
+ }
15
+
16
+ const STOPPED_HTML = `<!DOCTYPE html>
17
+ <html><head>
18
+ <meta charset="utf-8">
19
+ <meta name="viewport" content="width=device-width,initial-scale=1">
20
+ <title>PPM - Stopped</title>
21
+ <style>
22
+ body { font-family: system-ui; display: flex; justify-content: center;
23
+ align-items: center; min-height: 100vh; margin: 0;
24
+ background: #1a1a2e; color: #e0e0e0; }
25
+ .card { text-align: center; padding: 2rem; }
26
+ h1 { font-size: 1.5rem; margin-bottom: 0.5rem; }
27
+ p { color: #888; font-size: 0.9rem; }
28
+ .dot { display: inline-block; width: 10px; height: 10px;
29
+ border-radius: 50%; background: #f59e0b; margin-right: 8px; }
30
+ </style>
31
+ </head><body>
32
+ <div class="card">
33
+ <h1><span class="dot"></span>PPM Server Stopped</h1>
34
+ <p>The server is stopped but the supervisor is still running.</p>
35
+ <p>Use <code>ppm start</code> or Cloud dashboard to restart.</p>
36
+ </div>
37
+ </body></html>`;
38
+
39
+ let stoppedServer: ReturnType<typeof Bun.serve> | null = null;
40
+
41
+ export function startStoppedPage(port: number, host: string) {
42
+ if (stoppedServer) return;
43
+
44
+ try {
45
+ stoppedServer = Bun.serve({
46
+ port,
47
+ hostname: host,
48
+ fetch(req) {
49
+ const url = new URL(req.url);
50
+ if (url.pathname === "/api/health") {
51
+ return new Response(JSON.stringify({ status: "stopped" }), {
52
+ status: 503,
53
+ headers: { "Content-Type": "application/json" },
54
+ });
55
+ }
56
+ return new Response(STOPPED_HTML, {
57
+ headers: { "Content-Type": "text/html" },
58
+ });
59
+ },
60
+ });
61
+ log("INFO", `Stopped page serving on port ${port}`);
62
+ } catch (e) {
63
+ log("WARN", `Failed to start stopped page: ${e}`);
64
+ }
65
+ }
66
+
67
+ export function stopStoppedPage() {
68
+ if (stoppedServer) {
69
+ stoppedServer.stop();
70
+ stoppedServer = null;
71
+ log("INFO", "Stopped page server shut down");
72
+ }
73
+ }
@@ -12,6 +12,13 @@ import {
12
12
  unlinkSync,
13
13
  } from "node:fs";
14
14
  import { isCompiledBinary } from "./autostart-generator.ts";
15
+ import {
16
+ type SupervisorState,
17
+ getState, setState, waitForResume, triggerResume,
18
+ readAndDeleteCmd, readStatus, updateStatus,
19
+ STATUS_FILE, PID_FILE,
20
+ } from "./supervisor-state.ts";
21
+ import { startStoppedPage, stopStoppedPage } from "./supervisor-stopped-page.ts";
15
22
 
16
23
  // ─── Constants ─────────────────────────────────────────────────────────
17
24
  const MAX_RESTARTS = 10;
@@ -28,8 +35,6 @@ const UPGRADE_SKIP_INITIAL_MS = 300_000; // 5min delay before first check
28
35
  const SELF_REPLACE_TIMEOUT_MS = 30_000; // 30s to wait for new supervisor
29
36
 
30
37
  const PPM_DIR = resolve(process.env.PPM_HOME || resolve(homedir(), ".ppm"));
31
- const STATUS_FILE = resolve(PPM_DIR, "status.json");
32
- const PID_FILE = resolve(PPM_DIR, "ppm.pid");
33
38
  const LOG_FILE = resolve(PPM_DIR, "ppm.log");
34
39
  const RESTARTING_FLAG = resolve(PPM_DIR, ".restarting");
35
40
 
@@ -40,23 +45,10 @@ let tunnelUrl: string | null = null;
40
45
  let adoptedTunnelPid: number | null = null; // PID of tunnel kept alive across upgrade
41
46
  let shuttingDown = false;
42
47
 
43
- type SupervisorState = "running" | "paused" | "upgrading";
44
- let supervisorState: SupervisorState = "running";
45
-
46
- let resumeResolve: (() => void) | null = null;
47
-
48
- function waitForResume(): Promise<void> {
49
- return new Promise((resolve) => {
50
- resumeResolve = resolve;
51
- });
52
- }
53
-
54
- function triggerResume(): void {
55
- if (resumeResolve) {
56
- resumeResolve();
57
- resumeResolve = null;
58
- }
59
- }
48
+ // Module-level refs for softStop (needs access to respawn args)
49
+ let _serverArgs: string[] = [];
50
+ let _logFd: number = -1;
51
+ let _opts: { port: number; host: string; share: boolean } = { port: 8080, host: "0.0.0.0", share: false };
60
52
 
61
53
  let serverRestarts = 0;
62
54
  let lastServerCrash = 0;
@@ -87,21 +79,6 @@ function log(level: string, msg: string) {
87
79
  }
88
80
  }
89
81
 
90
- // ─── Status management ─────────────────────────────────────────────────
91
- function readStatus(): Record<string, unknown> {
92
- try {
93
- if (existsSync(STATUS_FILE)) return JSON.parse(readFileSync(STATUS_FILE, "utf-8"));
94
- } catch {}
95
- return {};
96
- }
97
-
98
- function updateStatus(patch: Record<string, unknown>) {
99
- try {
100
- const data = { ...readStatus(), ...patch };
101
- writeFileSync(STATUS_FILE, JSON.stringify(data));
102
- } catch {}
103
- }
104
-
105
82
  // ─── Backoff calc ──────────────────────────────────────────────────────
106
83
  function backoffDelay(restartCount: number): number {
107
84
  return Math.min(BACKOFF_BASE_MS * 2 ** (restartCount - 1), BACKOFF_MAX_MS);
@@ -130,6 +107,12 @@ export async function spawnServer(
130
107
  const exitCode = await serverChild.exited;
131
108
  serverChild = null;
132
109
 
110
+ // Don't respawn if in stopped state (soft stop)
111
+ if (getState() === "stopped") {
112
+ log("INFO", "Server exited, supervisor in stopped state — not respawning");
113
+ return;
114
+ }
115
+
133
116
  if (exitCode === 0 && shuttingDown) {
134
117
  log("INFO", `Server exited cleanly (code ${exitCode})`);
135
118
  return;
@@ -158,7 +141,7 @@ export async function spawnServer(
158
141
  if (serverRestarts > MAX_RESTARTS) {
159
142
  log("WARN", `Server exceeded ${MAX_RESTARTS} restarts, pausing`);
160
143
  notifyStateChange("running", "paused", "max_restarts_exceeded");
161
- supervisorState = "paused";
144
+ setState("paused");
162
145
  updateStatus({
163
146
  state: "paused",
164
147
  pid: null,
@@ -170,7 +153,7 @@ export async function spawnServer(
170
153
  await waitForResume();
171
154
  // Resumed — reset and respawn
172
155
  notifyStateChange("paused", "running", "user_resume");
173
- supervisorState = "running";
156
+ setState("running");
174
157
  serverRestarts = 0;
175
158
  updateStatus({ state: "running", pausedAt: null, pauseReason: null });
176
159
  log("INFO", "Resuming server after pause");
@@ -301,7 +284,7 @@ export async function spawnTunnel(port: number): Promise<void> {
301
284
  // ─── Health checks ─────────────────────────────────────────────────────
302
285
  function startServerHealthCheck(port: number) {
303
286
  healthTimer = setInterval(async () => {
304
- if (shuttingDown || !serverChild) return;
287
+ if (shuttingDown || !serverChild || getState() === "stopped") return;
305
288
  try {
306
289
  const res = await fetch(`http://127.0.0.1:${port}/api/health`, {
307
290
  signal: AbortSignal.timeout(5000),
@@ -322,6 +305,8 @@ function startTunnelProbe(port: number) {
322
305
  tunnelProbeTimer = setInterval(async () => {
323
306
  if (shuttingDown || !tunnelUrl) { tunnelFailCount = 0; return; }
324
307
  if (!tunnelChild && !adoptedTunnelPid) { tunnelFailCount = 0; return; }
308
+ // Don't probe when server is intentionally stopped (stopped page serves 503)
309
+ if (getState() === "stopped") { tunnelFailCount = 0; return; }
325
310
 
326
311
  // Check if adopted tunnel process is still alive
327
312
  if (adoptedTunnelPid && !tunnelChild) {
@@ -421,8 +406,8 @@ async function selfReplace(): Promise<{ success: boolean; error?: string }> {
421
406
  try {
422
407
  // Prevent spawnServer crash-restart loop from respawning killed children
423
408
  shuttingDown = true;
424
- notifyStateChange(supervisorState, "upgrading", "self_replace");
425
- supervisorState = "upgrading";
409
+ notifyStateChange(getState(), "upgrading", "self_replace");
410
+ setState("upgrading");
426
411
  updateStatus({ state: "upgrading" });
427
412
 
428
413
  // Set restarting flag so server child's stopTunnel() skips killing the tunnel
@@ -470,7 +455,7 @@ async function selfReplace(): Promise<{ success: boolean; error?: string }> {
470
455
  try { unlinkSync(RESTARTING_FLAG); } catch {}
471
456
  shuttingDown = false;
472
457
  notifyStateChange("upgrading", "running", "upgrade_failed");
473
- supervisorState = "running";
458
+ setState("running");
474
459
  updateStatus({ state: "running" });
475
460
  return { success: false, error: "New supervisor failed to start within 30s" };
476
461
  } catch (e) {
@@ -478,7 +463,7 @@ async function selfReplace(): Promise<{ success: boolean; error?: string }> {
478
463
  try { unlinkSync(RESTARTING_FLAG); } catch {}
479
464
  shuttingDown = false;
480
465
  notifyStateChange("upgrading", "running", "upgrade_failed");
481
- supervisorState = "running";
466
+ setState("running");
482
467
  updateStatus({ state: "running" });
483
468
  return { success: false, error: (e as Error).message };
484
469
  }
@@ -524,7 +509,7 @@ async function connectCloud(opts: { port: number }, serverArgs: string[], logFd:
524
509
  return {
525
510
  type: "heartbeat" as const,
526
511
  tunnelUrl,
527
- state: supervisorState,
512
+ state: getState(),
528
513
  // Use server-reported version (source of truth) with supervisor fallback
529
514
  appVersion: (status.serverVersion as string) || VERSION,
530
515
  availableVersion: (status.availableVersion as string) || null,
@@ -560,12 +545,21 @@ async function connectCloud(opts: { port: number }, serverArgs: string[], logFd:
560
545
  });
561
546
 
562
547
  switch (cmd.action) {
548
+ case "start":
549
+ if (getState() === "stopped") {
550
+ triggerResume();
551
+ sendResult(true, undefined, { state: "running" });
552
+ } else {
553
+ sendResult(false, `Server already in ${getState()} state`);
554
+ }
555
+ break;
556
+
563
557
  case "restart":
564
558
  if (serverChild) {
565
559
  serverRestartRequested = true;
566
560
  try { serverChild.kill(); } catch {}
567
561
  sendResult(true);
568
- } else if (supervisorState === "paused") {
562
+ } else if (getState() === "paused" || getState() === "stopped") {
569
563
  triggerResume();
570
564
  sendResult(true);
571
565
  } else {
@@ -574,17 +568,25 @@ async function connectCloud(opts: { port: number }, serverArgs: string[], logFd:
574
568
  break;
575
569
 
576
570
  case "resume":
577
- if (supervisorState === "paused") {
571
+ if (getState() === "paused" || getState() === "stopped") {
578
572
  triggerResume();
579
573
  sendResult(true);
580
574
  } else {
581
- sendResult(false, "Not in paused state");
575
+ sendResult(false, `Not in paused/stopped state (current: ${getState()})`);
582
576
  }
583
577
  break;
584
578
 
585
579
  case "stop":
580
+ if (getState() === "stopped") {
581
+ sendResult(false, "Already stopped");
582
+ } else {
583
+ sendResult(true);
584
+ softStop();
585
+ }
586
+ break;
587
+
588
+ case "shutdown":
586
589
  sendResult(true);
587
- // Delay exit to allow WS buffer to flush
588
590
  setTimeout(() => {
589
591
  shutdown();
590
592
  process.exit(0);
@@ -593,10 +595,13 @@ async function connectCloud(opts: { port: number }, serverArgs: string[], logFd:
593
595
 
594
596
  case "status":
595
597
  sendResult(true, undefined, {
596
- state: supervisorState,
598
+ state: getState(),
597
599
  serverPid: serverChild?.pid ?? null,
598
600
  tunnelUrl,
599
601
  serverRestarts,
602
+ stoppedAt: getState() === "stopped"
603
+ ? readStatus().stoppedAt
604
+ : null,
600
605
  });
601
606
  break;
602
607
 
@@ -609,6 +614,47 @@ async function connectCloud(opts: { port: number }, serverArgs: string[], logFd:
609
614
  }
610
615
  }
611
616
 
617
+ // ─── Soft stop (server only, supervisor stays alive) ──────────────────
618
+ let _softStopRunning = false;
619
+ export async function softStop() {
620
+ if (getState() === "stopped" || _softStopRunning) return;
621
+ _softStopRunning = true;
622
+
623
+ log("INFO", "Soft stop: killing server, supervisor stays alive");
624
+ notifyStateChange(getState(), "stopped", "user_stop");
625
+ setState("stopped");
626
+
627
+ // Kill server child
628
+ if (serverChild) {
629
+ try { serverChild.kill(); } catch {}
630
+ serverChild = null;
631
+ }
632
+
633
+ // Stop health checks (no server to check)
634
+ if (healthTimer) { clearInterval(healthTimer); healthTimer = null; }
635
+
636
+ // Keep: tunnel, Cloud WS, upgrade checks, tunnel probe
637
+ updateStatus({ state: "stopped", pid: null, stoppedAt: new Date().toISOString() });
638
+
639
+ // Start stopped page on the server port so tunnel URL still works
640
+ await Bun.sleep(500); // brief wait for port release
641
+ startStoppedPage(_opts.port, _opts.host);
642
+
643
+ // Wait for resume signal
644
+ await waitForResume();
645
+
646
+ // Resumed — restart server
647
+ stopStoppedPage();
648
+ await Bun.sleep(200); // brief wait for port release
649
+ notifyStateChange("stopped", "running", "user_start");
650
+ setState("running");
651
+ updateStatus({ state: "running", stoppedAt: null });
652
+ startServerHealthCheck(_opts.port);
653
+ log("INFO", "Resuming server from stopped state");
654
+ _softStopRunning = false;
655
+ spawnServer(_serverArgs, _logFd);
656
+ }
657
+
612
658
  // ─── Shutdown ──────────────────────────────────────────────────────────
613
659
  export function shutdown() {
614
660
  if (shuttingDown) return;
@@ -653,6 +699,14 @@ export async function runSupervisor(opts: {
653
699
  const logFd = openSync(LOG_FILE, "a");
654
700
  log("INFO", `Supervisor started (PID: ${process.pid}, port: ${opts.port}, share: ${opts.share})`);
655
701
 
702
+ // Global exception handlers — supervisor must never crash
703
+ process.on("uncaughtException", (err) => {
704
+ log("ERROR", `Uncaught exception: ${err.stack || err.message}`);
705
+ });
706
+ process.on("unhandledRejection", (reason) => {
707
+ log("ERROR", `Unhandled rejection: ${reason}`);
708
+ });
709
+
656
710
  // Write supervisor PID + clear stale availableVersion from previous run
657
711
  writeFileSync(PID_FILE, String(process.pid));
658
712
  updateStatus({
@@ -668,17 +722,45 @@ export async function runSupervisor(opts: {
668
722
  // Strip trailing empty args
669
723
  while (serverArgs.length > 0 && serverArgs[serverArgs.length - 1] === "") serverArgs.pop();
670
724
 
725
+ // Save module-level refs for softStop()
726
+ _serverArgs = serverArgs;
727
+ _logFd = logFd;
728
+ _opts = { port: opts.port, host: opts.host, share: opts.share };
729
+
671
730
  // Signal handlers
672
731
  process.on("SIGTERM", () => { shutdown(); process.exit(0); });
673
732
  process.on("SIGINT", () => { shutdown(); process.exit(0); });
674
733
 
675
- // SIGUSR2 = graceful server restart (tunnel stays alive) or resume from paused
734
+ // SIGUSR2 = command file dispatch OR graceful server restart
676
735
  process.on("SIGUSR2", () => {
677
- if (supervisorState === "paused") {
736
+ // Check for command file first (soft_stop, resume)
737
+ const cmd = readAndDeleteCmd();
738
+ if (cmd) {
739
+ if (cmd.action === "soft_stop") {
740
+ log("INFO", "SIGUSR2: soft_stop command received");
741
+ softStop();
742
+ return;
743
+ }
744
+ if (cmd.action === "resume") {
745
+ log("INFO", "SIGUSR2: resume command received");
746
+ if (getState() === "stopped" || getState() === "paused") {
747
+ triggerResume();
748
+ }
749
+ return;
750
+ }
751
+ }
752
+
753
+ // Default: restart server (existing behavior)
754
+ if (getState() === "paused") {
678
755
  log("INFO", "SIGUSR2 received while paused, resuming server");
679
756
  triggerResume();
680
757
  return;
681
758
  }
759
+ if (getState() === "stopped") {
760
+ log("INFO", "SIGUSR2 received while stopped, resuming server");
761
+ triggerResume();
762
+ return;
763
+ }
682
764
  log("INFO", "SIGUSR2 received, restarting server only");
683
765
  if (serverChild) {
684
766
  serverRestartRequested = true; // flag so spawnServer skips backoff
@@ -707,6 +789,18 @@ export async function runSupervisor(opts: {
707
789
  upgradeCheckTimer = setInterval(checkAvailableVersion, UPGRADE_CHECK_INTERVAL_MS);
708
790
  }, UPGRADE_SKIP_INITIAL_MS);
709
791
 
792
+ // Windows: poll command file since SIGUSR2 is not available
793
+ if (process.platform === "win32") {
794
+ setInterval(() => {
795
+ const cmd = readAndDeleteCmd();
796
+ if (!cmd) return;
797
+ if (cmd.action === "soft_stop") { softStop(); }
798
+ else if (cmd.action === "resume") {
799
+ if (getState() === "stopped" || getState() === "paused") triggerResume();
800
+ }
801
+ }, 1000);
802
+ }
803
+
710
804
  // Connect to Cloud via WebSocket (if device is linked)
711
805
  connectCloud(opts, serverArgs, logFd);
712
806
 
@@ -725,7 +819,7 @@ export async function runSupervisor(opts: {
725
819
  await Promise.all(promises);
726
820
 
727
821
  // If upgrading, selfReplace handles process.exit — wait for it
728
- if (supervisorState === "upgrading") {
822
+ if (getState() === "upgrading") {
729
823
  log("INFO", "Server loop exited during upgrade, waiting for selfReplace to finish");
730
824
  await new Promise(() => {}); // selfReplace will call process.exit()
731
825
  }