@gmickel/gno 1.1.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,986 @@
1
+ /**
2
+ * Shared self-backgrounding helper for `gno serve` and `gno daemon`.
3
+ *
4
+ * Wires `--detach`/`--pid-file`/`--log-file`/`--status`/`--stop` behind a
5
+ * single entry-point so both commands stay in lockstep. The Bun spawn
6
+ * invariants documented here (numeric-fd stdio + `.unref()`) are load-bearing:
7
+ * see `.flow/tasks/fn-72-backgrounding-flags-for-serve-and-daemon.9.md` for
8
+ * the spike that validated them in this repo.
9
+ *
10
+ * @module src/cli/detach
11
+ */
12
+
13
+ // node:fs.openSync/closeSync — Bun has no equivalent and we specifically need
14
+ // a numeric fd (not a `Bun.file()` object, which Bun closes on parent exit).
15
+ // We also use openSync with "wx" for atomic lock-file creation in
16
+ // acquireStartLock.
17
+ import { closeSync, openSync, statSync, unlinkSync } from "node:fs";
18
+ // node:fs/promises — stat/unlink structural ops not covered by Bun APIs.
19
+ import { mkdir, stat, unlink } from "node:fs/promises";
20
+ // node:path — no Bun path utils.
21
+ import { dirname, join } from "node:path";
22
+
23
+ import { VERSION, resolveDirs } from "../app/constants";
24
+ import { toAbsolutePath } from "../config/paths";
25
+ import { atomicWrite } from "../core/file-ops";
26
+ import { CliError } from "./errors";
27
+
28
+ // ─────────────────────────────────────────────────────────────────────────────
29
+ // Types
30
+ // ─────────────────────────────────────────────────────────────────────────────
31
+
32
/** The two long-running commands this helper can background and manage. */
export type DetachKind = "serve" | "daemon";
34
+
35
/** File locations the helper reads and writes for a single command kind. */
export interface ProcessPaths {
  /** Absolute path of the JSON pid-file. */
  pidFile: string;
  /** Absolute path of the log file. */
  logFile: string;
}
40
+
41
/** Path overrides gathered from CLI flags; every field is optional. */
export interface ProcessPathOverrides {
  /** Replacement for the default pid-file location. */
  pidFile?: string;
  /** Replacement for the default log-file location. */
  logFile?: string;
  /** Forwarded to `toAbsolutePath` when an override is resolved. */
  cwd?: string;
}
47
+
48
/** JSON document stored in the pid-file. */
export interface PidFilePayload {
  /** Process id of the detached child. */
  pid: number;
  /** Command the child was started as. */
  cmd: DetachKind;
  /** gno version that wrote the pid-file. */
  version: string;
  /** Start timestamp as a parseable datetime string. */
  started_at: string;
  /** Listening port, when one was recorded. */
  port?: number | null;
}
56
+
57
/** Shape returned by a `--status` call (matches process-status@1.0). */
export interface ProcessStatus {
  /** Whether the recorded process is considered running. */
  running: boolean;
  /** Pid from the pid-file, or null without a pid-file. */
  pid: number | null;
  /** Recorded port, or null when none applies. */
  port: number | null;
  /** Command kind the status was requested for. */
  cmd: DetachKind;
  /** Version recorded in the pid-file, or null without a pid-file. */
  version: string | null;
  /** Recorded start timestamp, or null without a pid-file. */
  started_at: string | null;
  /** Whole seconds since `started_at`; null unless running. */
  uptime_seconds: number | null;
  /** Pid-file path that was inspected. */
  pid_file: string;
  /** Log-file path that was inspected. */
  log_file: string;
  /** Size of the log file in bytes, or null when it does not exist. */
  log_size_bytes: number | null;
}
70
+
71
/**
 * Sidecar to `statusProcess()` for a case the `process-status@1.0` schema
 * cannot express.
 *
 * `inspectForeignLive()` produces it when the pid recorded in the pid-file is
 * alive and names the same command, yet the recorded gno version differs from
 * this binary's — so the process cannot safely be claimed as "ours".
 */
export interface ForeignLiveSignal {
  /** The live pid found in the pid-file. */
  pid: number;
  /** Version stored in the pid-file. */
  recordedVersion: string;
  /** Version of the binary performing the inspection. */
  currentVersion: string;
}
83
+
84
/** How a `stopProcess` call ended, discriminated on `kind`. */
export type StopOutcome =
  | { kind: "not-running"; pidFile: string }
  | { kind: "stopped"; pid: number; signal: "SIGTERM" | "SIGKILL" }
  | { kind: "timeout"; pid: number }
  | {
      /**
       * The pid is alive but its recorded version differs from this binary.
       * No signal was sent because identity could not be proven; the caller
       * is expected to surface the ambiguity to the operator (typically as a
       * VALIDATION error).
       */
      kind: "foreign-live";
      pid: number;
      payload: PidFilePayload;
    };
99
+
100
+ // ─────────────────────────────────────────────────────────────────────────────
101
+ // Paths
102
+ // ─────────────────────────────────────────────────────────────────────────────
103
+
104
/**
 * Work out where the pid-file and log-file for `kind` live.
 *
 * Without overrides both files sit in `resolveDirs().data` as `<kind>.pid`
 * and `<kind>.log`. A non-empty override is run through `toAbsolutePath`
 * together with `overrides.cwd`.
 *
 * @param kind - Command whose files are being located.
 * @param overrides - Optional user-supplied paths.
 * @returns The resolved pid-file and log-file paths.
 */
export function resolveProcessPaths(
  kind: DetachKind,
  overrides: ProcessPathOverrides = {}
): ProcessPaths {
  const { pidFile, logFile, cwd } = overrides;
  const dataDir = resolveDirs().data;

  // An empty-string override is falsy and therefore falls back to the default.
  const pick = (override: string | undefined, extension: "pid" | "log") =>
    override
      ? toAbsolutePath(override, cwd)
      : join(dataDir, `${kind}.${extension}`);

  return {
    pidFile: pick(pidFile, "pid"),
    logFile: pick(logFile, "log"),
  };
}
129
+
130
+ // ─────────────────────────────────────────────────────────────────────────────
131
+ // Pid-file IO
132
+ // ─────────────────────────────────────────────────────────────────────────────
133
+
134
/** Type guard: is `value` an `Error` that carries a string `code`? */
function isErrnoException(value: unknown): value is NodeJS.ErrnoException {
  if (!(value instanceof Error) || !("code" in value)) {
    return false;
  }
  const { code } = value as NodeJS.ErrnoException;
  return typeof code === "string";
}
141
+
142
/** Type guard: is `value` one of the two known command kinds? */
function isDetachKind(value: unknown): value is DetachKind {
  switch (value) {
    case "serve":
    case "daemon":
      return true;
    default:
      return false;
  }
}
145
+
146
/**
 * Load a pid-file and validate every field.
 *
 * @param path - Location of the pid-file.
 * @returns The validated payload, or `null` when the file does not exist
 *   (including when it disappears between the existence check and the read).
 * @throws The underlying read error for anything other than ENOENT.
 * @throws CliError("RUNTIME") when the content is not JSON or a field fails
 *   validation.
 */
export async function readPidFile(
  path: string
): Promise<PidFilePayload | null> {
  const invalid = (detail: string): CliError => new CliError("RUNTIME", detail);

  const handle = Bun.file(path);
  if (!(await handle.exists())) {
    return null;
  }

  let text: string;
  try {
    text = await handle.text();
  } catch (error) {
    // The file can vanish after the exists() check; that still means "missing".
    if (isErrnoException(error) && error.code === "ENOENT") {
      return null;
    }
    throw error;
  }

  let decoded: unknown;
  try {
    decoded = JSON.parse(text);
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    throw invalid(`Pid-file is not valid JSON: ${path}: ${reason}`);
  }

  const hasRequiredKeys =
    typeof decoded === "object" &&
    decoded !== null &&
    "pid" in decoded &&
    "cmd" in decoded;
  if (!hasRequiredKeys) {
    throw invalid(`Pid-file has unexpected shape: ${path}`);
  }

  const fields = decoded as Record<string, unknown>;
  const { pid, cmd, version, started_at: startedAt, port: rawPort } = fields;

  if (typeof pid !== "number" || !Number.isInteger(pid) || pid < 1) {
    throw invalid(
      `Pid-file has invalid pid (must be a positive integer): ${path}`
    );
  }
  if (!isDetachKind(cmd)) {
    throw invalid(
      `Pid-file has invalid cmd (expected "serve" or "daemon"): ${path}`
    );
  }
  if (typeof version !== "string" || version.length === 0) {
    throw invalid(`Pid-file is missing version: ${path}`);
  }
  if (typeof startedAt !== "string") {
    throw invalid(`Pid-file is missing started_at: ${path}`);
  }
  // An unparseable timestamp would surface later as a NaN uptime, breaking
  // the process-status@1.0 rule that live processes report an integer
  // uptime_seconds — so reject it at the door.
  if (!Number.isFinite(Date.parse(startedAt))) {
    throw invalid(
      `Pid-file has invalid started_at (not a parseable ISO datetime): ${path}`
    );
  }

  // Absent and explicit-null ports both normalise to null.
  let port: number | null = null;
  if (rawPort !== null && rawPort !== undefined) {
    if (
      typeof rawPort !== "number" ||
      !Number.isInteger(rawPort) ||
      rawPort < 1 ||
      rawPort > 65_535
    ) {
      throw invalid(
        `Pid-file has invalid port (must be null or an integer in 1..65535): ${path}`
      );
    }
    port = rawPort;
  }

  return { pid, cmd, version, started_at: startedAt, port };
}
246
+
247
/**
 * Persist `payload` as a single line of JSON at `path`.
 *
 * The parent directory is created first (recursively), then the content is
 * handed to `atomicWrite` from `src/core/file-ops.ts`.
 *
 * @param path - Destination of the pid-file.
 * @param payload - Record to serialise.
 */
export async function writePidFile(
  path: string,
  payload: PidFilePayload
): Promise<void> {
  const parentDir = dirname(path);
  await mkdir(parentDir, { recursive: true });

  const line = `${JSON.stringify(payload)}\n`;
  await atomicWrite(path, line);
}
258
+
259
+ // ─────────────────────────────────────────────────────────────────────────────
260
+ // Liveness
261
+ // ─────────────────────────────────────────────────────────────────────────────
262
+
263
+ /**
264
+ * Contract for `isProcessAlive` (defined later in this section): probe whether a process is alive via `process.kill(pid, 0)`.
265
+ *
266
+ * - ESRCH → dead.
267
+ * - EPERM → alive under a different user; we treat it as alive so callers
268
+ * don't incorrectly clean up someone else's pid.
269
+ * - Any other errno → rethrown for the caller to surface.
270
+ */
271
/**
 * Tear down a child that was spawned but could not be registered in a
 * pid-file: SIGTERM, wait up to 1s, then SIGKILL, wait up to 500ms.
 *
 * Exit is observed through the Bun subprocess handle (`child.exited`) rather
 * than `kill -0`, because a zombie still answers `kill -0` as alive. Only
 * the pid-file-write-failure path calls this.
 *
 * @param child - Subprocess handle returned by `Bun.spawn`.
 */
async function reapOrphanedChild(
  child: ReturnType<typeof Bun.spawn>
): Promise<void> {
  const targetPid = child.pid;

  // Returns false when the signal could not be delivered (ESRCH, EPERM, …);
  // in every such case there is nothing further we can do.
  const trySignal = (signal: "SIGTERM" | "SIGKILL"): boolean => {
    try {
      process.kill(targetPid, signal);
      return true;
    } catch {
      return false;
    }
  };

  // Resolves with the exit result, or "timeout" once the budget elapses. The
  // timer is unref'd so it never keeps the process alive by itself.
  const exitOrTimeout = (budgetMs: number) =>
    Promise.race([
      child.exited,
      new Promise<"timeout">((resolve) => {
        setTimeout(() => resolve("timeout"), budgetMs).unref();
      }),
    ]);

  if (!trySignal("SIGTERM")) {
    return;
  }
  if ((await exitOrTimeout(1000)) !== "timeout") {
    return;
  }

  if (!trySignal("SIGKILL")) {
    return;
  }
  // Bounded wait so the caller cannot hang on an unkillable child.
  await exitOrTimeout(500);
}
317
+
318
/**
 * Child-side sanity check: wait (bounded) for the pid-file to show up and
 * confirm it names this process.
 *
 * The parent's spawn → pid-file-write sequence is not atomic, so the child
 * can be running before it has been registered. We therefore poll for a
 * limited time. As soon as a pid-file is readable the answer is final: `true`
 * if it records our pid, `false` if it records another (we lost a
 * concurrent-start race). If the file never appears — e.g. the parent died
 * between `Bun.spawn` and `writePidFile` — we return `false` so the child
 * can exit instead of running unmanaged.
 *
 * Intended to be called by the child-side wiring (fn-72.3/.4) right after the
 * detach branch is skipped because `--__detached-child` is present.
 *
 * @returns Whether the pid-file exists and points at this process.
 */
export async function verifyPidFileMatchesSelf(options: {
  pidFile: string;
  selfPid?: number;
  /** Total wait budget for the pid-file to appear. Default 3s. */
  timeoutMs?: number;
  /** Poll interval while waiting. Default 50ms. */
  pollIntervalMs?: number;
  /** Sleep override for deterministic tests. */
  sleep?: (ms: number) => Promise<void>;
}): Promise<boolean> {
  const {
    pidFile,
    selfPid = process.pid,
    timeoutMs = 3_000,
    pollIntervalMs = 50,
    sleep = defaultSleep,
  } = options;

  const deadline = Date.now() + timeoutMs;
  for (;;) {
    // Sample the clock before reading so one last read always happens after
    // the deadline has passed; a write landing in the final interval is
    // still seen.
    const budgetSpent = Date.now() >= deadline;

    const payload = await readPidFile(pidFile);
    if (payload) {
      return payload.pid === selfPid;
    }
    if (budgetSpent) {
      // Never registered — bail out rather than leave an unmanaged orphan.
      return false;
    }
    await sleep(pollIntervalMs);
  }
}
371
+
372
/**
 * Liveness probe built on `process.kill(pid, 0)`.
 *
 * @param pid - Process id to probe.
 * @returns `false` on ESRCH (no such process); `true` when the probe
 *   succeeds or fails with EPERM (the process exists but we may not signal
 *   it — reporting it alive keeps callers from cleaning up someone else's
 *   pid).
 * @throws Any other error from `process.kill`, unchanged.
 */
export function isProcessAlive(pid: number): boolean {
  try {
    process.kill(pid, 0);
  } catch (error) {
    const code = isErrnoException(error) ? error.code : undefined;
    if (code === "ESRCH") {
      return false;
    }
    if (code === "EPERM") {
      return true;
    }
    throw error;
  }
  return true;
}
388
+
389
+ // ─────────────────────────────────────────────────────────────────────────────
390
+ // Spawn
391
+ // ─────────────────────────────────────────────────────────────────────────────
392
+
393
/**
 * Marker appended as the last argv entry of the re-invoked child so that its
 * command body knows to skip the detach step.
 */
export const DETACHED_CHILD_FLAG = "--__detached-child";
395
+
396
/** Inputs for `spawnDetached`. */
export interface SpawnDetachedOptions {
  /** Command being backgrounded; recorded in the pid-file. */
  kind: DetachKind;
  /**
   * Argv to re-invoke (typically `process.argv.slice(2)` minus `--detach`).
   *
   * By default the full child command is built as
   * `[execPath, entryScript, ...argv, DETACHED_CHILD_FLAG]`, where
   * `entryScript` comes from `process.argv[1]`. This matches how Bun and
   * Node launch a script (`bun src/index.ts serve` / `node dist/index.js
   * serve`). Tests or callers that want to run a standalone file can
   * override the prefix via `cmdPrefix` (or drop the script with
   * `entryScript: null`) below.
   */
  argv: string[];
  /** Pid-file to guard against and to write after the spawn. */
  pidFile: string;
  /** File that receives the child's stdout and stderr (opened for append). */
  logFile: string;
  /** Extra env merged on top of `process.env`. */
  env?: Record<string, string | undefined>;
  /** Override for the parent executable path. Defaults to `process.execPath`. */
  execPath?: string;
  /**
   * Full child command prefix (everything before user argv). Defaults to
   * `[execPath, process.argv[1]]` so the re-exec matches the way Bun/Node
   * originally launched the parent. When set, `execPath` and `entryScript`
   * are not consulted, so omit them.
   */
  cmdPrefix?: string[];
  /**
   * Override for the script path passed to the child runtime. Defaults to
   * `process.argv[1]`. Set to `null` to omit (for callers passing a
   * self-contained executable like a single `.mjs` file).
   */
  entryScript?: string | null;
  /** Optional port to embed in the pid-file payload (serve only). */
  port?: number | null;
  /** Working directory for the child. Defaults to `process.cwd()`. */
  cwd?: string;
}
432
+
433
/** What `spawnDetached` reports back after a successful start. */
export interface SpawnDetachedResult {
  /** Pid of the spawned child. */
  pid: number;
  /** Pid-file that was written. */
  pidFile: string;
  /** Log file the child's output is redirected to. */
  logFile: string;
  /** Exact payload stored in the pid-file. */
  payload: PidFilePayload;
}
439
+
440
/**
 * Open `logFile` in append mode and hand back its numeric file descriptor
 * for use as the detached child's stdout/stderr.
 *
 * A raw fd from `openSync` is required here: a `Bun.file()` object is closed
 * by Bun when the parent exits, which would cut off the child's output
 * immediately.
 */
function openLogFd(logFile: string): number {
  const appendFd = openSync(logFile, "a");
  return appendFd;
}
449
+
450
/**
 * Name of the sidecar start-lock that belongs to `pidFile`.
 *
 * The lock only exists between "decide to start" and "pid-file durably
 * written", and serialises concurrent `--detach` invocations that share a
 * `GNO_DATA_DIR`.
 */
function startLockPath(pidFile: string): string {
  return pidFile + ".startlock";
}
459
+
460
/**
 * Acquire an exclusive start-lock via `open(O_CREAT | O_EXCL)`.
 *
 * Recovery rules when the lock-file already exists:
 * - Older than 30s → assumed to be left over from a crashed start; it is
 *   unlinked and creation is retried once.
 * - Gone by the time we stat it (the holder released it in between) → the
 *   lock is free, so creation is retried once instead of reporting a
 *   lock-file that no longer exists.
 * - Anything else → treated as a live lock.
 *
 * NOTE(review): stale-lock takeover is best-effort. Two starters that both
 * judge the same lock stale can still race on unlink + create.
 *
 * @param pidFile - Pid-file the lock is derived from.
 * @param kind - Command kind, used only in the error message.
 * @returns The lock path; pass it to `releaseStartLock` when done.
 * @throws CliError("VALIDATION") if another starter holds the lock.
 * @throws The underlying error if the first create fails with anything other
 *   than EEXIST.
 */
function acquireStartLock(pidFile: string, kind: DetachKind): string {
  const STALE_LOCK_MS = 30_000;
  const lockPath = startLockPath(pidFile);

  // "wx" = O_CREAT | O_EXCL | O_WRONLY — atomic create-or-fail.
  // Returns false only for EEXIST; every other failure propagates.
  const tryCreate = (): boolean => {
    try {
      closeSync(openSync(lockPath, "wx"));
      return true;
    } catch (error) {
      if (isErrnoException(error) && error.code === "EEXIST") {
        return false;
      }
      throw error;
    }
  };

  if (tryCreate()) {
    return lockPath;
  }

  // The lock exists (or did a moment ago). Decide whether we may take over.
  let mayRetry: boolean;
  try {
    mayRetry = Date.now() - statSync(lockPath).mtimeMs > STALE_LOCK_MS;
  } catch (error) {
    // ENOENT: released between our open and stat — it is free now. Any other
    // stat failure is treated as a live lock.
    mayRetry = isErrnoException(error) && error.code === "ENOENT";
  }

  if (mayRetry) {
    try {
      unlinkSync(lockPath);
    } catch {
      /* already removed, or a race with another cleanup — fall through */
    }
    try {
      if (tryCreate()) {
        return lockPath;
      }
    } catch {
      /* retry failed for another reason — report as contended below */
    }
  }

  throw new CliError(
    "VALIDATION",
    `another ${kind} start is in progress (lock-file ${lockPath}). If no other ${kind} start is running, delete the lock-file manually and retry.`
  );
}
507
+
508
/**
 * Remove the start-lock at `lockPath`. Best-effort: a failure to unlink
 * (for example because the file is already gone) is ignored.
 */
function releaseStartLock(lockPath: string): void {
  try {
    unlinkSync(lockPath);
  } catch {
    // Nothing to release — that is fine.
  }
}
515
+
516
/**
 * Start a detached background child, record it in the pid-file, and report
 * its pid.
 *
 * Sequence: create the log/pid directories → take the start-lock → re-run
 * `guardDoubleStart` under the lock → spawn with the log fd as stdout/stderr
 * → write the pid-file → release the lock. If the pid-file cannot be written
 * the child is torn down again so no unmanaged process is left behind.
 *
 * Windows is explicitly unsupported and rejected up front with guidance
 * toward WSL. Status/stop remain callable there but find nothing to manage,
 * since no pid-file is ever written.
 *
 * @throws CliError("VALIDATION") on Windows, when another start holds the
 *   lock, or when `guardDoubleStart` finds a live process.
 * @throws CliError("RUNTIME") when the pid-file write fails after the spawn.
 */
export async function spawnDetached(
  options: SpawnDetachedOptions
): Promise<SpawnDetachedResult> {
  if (process.platform === "win32") {
    throw new CliError(
      "VALIDATION",
      "`--detach` is not supported on Windows. Use WSL, or a Windows launcher like NSSM. See docs/WINDOWS.md."
    );
  }

  const { kind, pidFile, logFile } = options;
  const execPath = options.execPath ?? process.execPath;
  const cwd = options.cwd ?? process.cwd();

  // Child command prefix. Unless told otherwise we re-run the very script the
  // parent was launched with (process.argv[1]), so `bun src/index.ts serve
  // --detach` becomes `bun src/index.ts serve ...` rather than `bun serve
  // ...`. `entryScript: null` drops the script (single-file executables);
  // `cmdPrefix` replaces the prefix wholesale.
  let cmdPrefix = options.cmdPrefix;
  if (!cmdPrefix) {
    const entryScript =
      options.entryScript === undefined
        ? (process.argv[1] ?? null)
        : options.entryScript;
    cmdPrefix = entryScript === null ? [execPath] : [execPath, entryScript];
  }

  await mkdir(dirname(logFile), { recursive: true });
  await mkdir(dirname(pidFile), { recursive: true });

  // The atomic lock-file (O_CREAT | O_EXCL) serialises guard + spawn +
  // pid-file write across concurrent `--detach` invocations, so two parents
  // cannot both pass the guard and spawn into the same data dir.
  const lockPath = acquireStartLock(pidFile, kind);
  try {
    // Check again now that we hold the lock: a concurrent starter may have
    // finished in the meantime.
    await guardDoubleStart(pidFile, kind);

    const logFd = openLogFd(logFile);
    let child: ReturnType<typeof Bun.spawn>;
    try {
      child = Bun.spawn({
        cmd: [...cmdPrefix, ...options.argv, DETACHED_CHILD_FLAG],
        stdio: ["ignore", logFd, logFd],
        detached: true,
        cwd,
        env: { ...process.env, ...options.env },
      });
      // Mandatory: with `detached: true` alone the parent's event loop stays
      // tied to the child handle. See the spike findings in
      // `.flow/tasks/fn-72-backgrounding-flags-for-serve-and-daemon.9.md`.
      child.unref();
    } finally {
      // The child holds its own dup of the descriptor, so dropping the
      // parent's copy is safe and avoids leaking an fd.
      closeSync(logFd);
    }

    const payload: PidFilePayload = {
      pid: child.pid,
      cmd: kind,
      version: VERSION,
      started_at: new Date().toISOString(),
      port: options.port ?? null,
    };

    // A failed pid-file write (disk full, permission race, bad --pid-file
    // path) must not leave the child running: without a pid-file neither
    // `--status` nor `--stop` could ever find it. Signal it (SIGTERM, then
    // SIGKILL) and await its exit within a bounded budget before reporting.
    try {
      await writePidFile(pidFile, payload);
    } catch (error) {
      await reapOrphanedChild(child);
      throw new CliError(
        "RUNTIME",
        `spawned ${kind} (pid ${child.pid}) but failed to write pid-file ${pidFile}; child was signaled and reaped. Original error: ${error instanceof Error ? error.message : String(error)}`
      );
    }

    return { pid: child.pid, pidFile, logFile, payload };
  } finally {
    releaseStartLock(lockPath);
  }
}
625
+
626
+ // ─────────────────────────────────────────────────────────────────────────────
627
+ // Guard
628
+ // ─────────────────────────────────────────────────────────────────────────────
629
+
630
/**
 * Was the pid-file written by the same gno version as this binary?
 *
 * This is the PID-reuse mitigation from the epic spec: once a pid is known
 * to be alive, callers cross-check the stored `cmd` and `version`. A version
 * mismatch marks the pid as "live-foreign" — alive and claiming to be ours,
 * but with no proof of identity. Two realistic causes:
 *
 *   (a) gno was upgraded while the old detached process kept running.
 *   (b) the original process crashed and an unrelated process got its pid.
 *
 * Either way it is unsafe to signal the pid, to start a second instance in
 * the same data dir, or to report "not running" — the last would lose track
 * of (a) and let two detached processes fight over one port or watcher. The
 * ambiguity is handed to the operator to resolve.
 */
function versionMatchesPidFile(payload: PidFilePayload): boolean {
  const { version: recorded } = payload;
  return recorded === VERSION;
}
650
+
651
/**
 * Build the VALIDATION error shown when a live pid was recorded by a
 * different gno version. The closing hint depends on what the caller was
 * attempting (`start`, `stop`, or `status`).
 */
function formatLiveForeignError(
  kind: DetachKind,
  pidFile: string,
  existing: PidFilePayload,
  action: "start" | "stop" | "status"
): CliError {
  const { pid, version } = existing;

  let hint: string;
  switch (action) {
    case "start":
      hint = `refusing to start a second ${kind}. If the old process is defunct, terminate it manually (\`kill ${pid}\` or \`kill -9 ${pid}\`) and delete ${pidFile}.`;
      break;
    case "stop":
      hint = `refusing to signal pid ${pid} without stronger identity proof. Terminate the old process manually and delete ${pidFile}.`;
      break;
    default:
      hint = `cannot verify liveness safely.`;
      break;
  }

  return new CliError(
    "VALIDATION",
    `pid-file ${pidFile} records a running ${kind} (pid ${pid}) from gno ${version}, but this binary is ${VERSION}: ${hint}`
  );
}
668
+
669
/**
 * Refuse to start when the pid-file already points at a live process.
 *
 * Outcomes:
 * - No pid-file → returns.
 * - Recorded pid is dead → the stale pid-file is unlinked (best-effort) and
 *   the call returns.
 * - Live, but recorded for the other `cmd` → throws: the pid-file belongs to
 *   someone else.
 * - Live, same `cmd`, different `version` → throws with operator guidance.
 *   This is the live-foreign case: identity is unproven, so we neither start
 *   a second instance nor silently remove an active pid-file.
 * - Live, same `cmd`, same `version` → throws "already running", including
 *   the port when one was recorded.
 *
 * @throws CliError("VALIDATION") for every "live" outcome above.
 */
export async function guardDoubleStart(
  pidFile: string,
  kind: DetachKind
): Promise<void> {
  const existing = await readPidFile(pidFile);
  if (existing === null) {
    return;
  }

  const { pid, cmd } = existing;

  if (!isProcessAlive(pid)) {
    try {
      await unlink(pidFile);
    } catch {
      // Someone else already removed the stale file — fine.
    }
    return;
  }

  if (cmd !== kind) {
    throw new CliError(
      "VALIDATION",
      `pid-file ${pidFile} is owned by a running ${cmd} (pid ${pid}), not ${kind}`
    );
  }

  if (!versionMatchesPidFile(existing)) {
    throw formatLiveForeignError(kind, pidFile, existing, "start");
  }

  const port = existing.port ?? null;
  const portSuffix = port === null ? "" : ` on port ${port}`;
  throw new CliError(
    "VALIDATION",
    `${kind} is already running${portSuffix} (pid ${pid}, pid-file ${pidFile})`
  );
}
716
+
717
+ // ─────────────────────────────────────────────────────────────────────────────
718
+ // Status
719
+ // ─────────────────────────────────────────────────────────────────────────────
720
+
721
/**
 * Size of the file at `path` in bytes, or `null` when it does not exist.
 *
 * @throws Any `stat` failure other than ENOENT.
 */
async function fileSizeOrNull(path: string): Promise<number | null> {
  try {
    const { size } = await stat(path);
    return size;
  } catch (error) {
    const missing = isErrnoException(error) && error.code === "ENOENT";
    if (!missing) {
      throw error;
    }
    return null;
  }
}
732
+
733
/** Inputs for `statusProcess`. */
export interface StatusOptions {
  /** Command kind the status is requested for. */
  kind: DetachKind;
  /** Pid-file to inspect. */
  pidFile: string;
  /** Log file whose size is reported. */
  logFile: string;
  /** Clock override for deterministic tests. Defaults to `Date.now`. */
  now?: () => number;
}
740
+
741
/**
 * Build the `process-status@1.0` payload for one command kind.
 *
 * The result is the bare schema shape and can be JSON-serialised as-is for
 * `--status --json`. The operator-facing live-foreign warning (a live pid
 * recorded by a different gno version) is not part of the schema; obtain it
 * separately from `inspectForeignLive()`.
 *
 * `running` is true only when the recorded pid is alive, the recorded `cmd`
 * equals `options.kind`, the recorded version equals this binary's, and —
 * for `serve` — a port was recorded. `cmd` in the result is always
 * `options.kind`.
 *
 * Safe on Windows: with no pid-file the payload is `running: false` with the
 * process fields null.
 */
export async function statusProcess(
  options: StatusOptions
): Promise<ProcessStatus> {
  const { kind, pidFile, logFile } = options;
  const clock = options.now ?? Date.now;

  const logSize = await fileSizeOrNull(logFile);
  const payload = await readPidFile(pidFile);

  if (!payload) {
    return {
      running: false,
      pid: null,
      port: null,
      cmd: kind,
      version: null,
      started_at: null,
      uptime_seconds: null,
      pid_file: pidFile,
      log_file: logFile,
      log_size_bytes: logSize,
    };
  }

  // Liveness alone is not enough: cross-check cmd AND version to mitigate
  // PID reuse after a crash. A pid-file written for the other kind, or by a
  // different gno binary, cannot be trusted to describe "our" process.
  const alive = isProcessAlive(payload.pid);
  const identityConfirmed =
    alive && payload.cmd === kind && versionMatchesPidFile(payload);

  const elapsedSeconds = identityConfirmed
    ? Math.max(0, Math.floor((clock() - Date.parse(payload.started_at)) / 1000))
    : null;

  // Schema invariant: a live serve must report a numeric port. Without one we
  // report not-running rather than lying. A daemon never reports a port.
  const isServe = kind === "serve";
  const recordedPort = payload.port ?? null;
  const running = identityConfirmed && !(isServe && recordedPort === null);

  return {
    running,
    pid: payload.pid,
    port: isServe ? recordedPort : null,
    cmd: kind,
    version: payload.version,
    started_at: payload.started_at,
    uptime_seconds: running ? elapsedSeconds : null,
    pid_file: pidFile,
    log_file: logFile,
    log_size_bytes: logSize,
  };
}
824
+
825
/**
 * Look for the "live-foreign" condition: the pid-file records the requested
 * `kind`, its pid is alive, but the recorded gno version is not this
 * binary's.
 *
 * @returns The signal describing the mismatch, or `null` when there is no
 *   pid-file, the recorded `cmd` differs, the pid is dead, or the versions
 *   agree. Pair it with `statusProcess()` to surface ambiguity the schema
 *   does not encode.
 */
export async function inspectForeignLive(options: {
  kind: DetachKind;
  pidFile: string;
}): Promise<ForeignLiveSignal | null> {
  const payload = await readPidFile(options.pidFile);

  // Checks run left to right and stop at the first that rules the case out.
  if (
    !payload ||
    payload.cmd !== options.kind ||
    !isProcessAlive(payload.pid) ||
    versionMatchesPidFile(payload)
  ) {
    return null;
  }

  const { pid, version: recordedVersion } = payload;
  return { pid, recordedVersion, currentVersion: VERSION };
}
855
+
856
+ // ─────────────────────────────────────────────────────────────────────────────
857
+ // Stop
858
+ // ─────────────────────────────────────────────────────────────────────────────
859
+
860
/** Inputs for `stopProcess`. */
export interface StopOptions {
  /** Command kind expected in the pid-file. */
  kind: DetachKind;
  /** Pid-file naming the process to stop. */
  pidFile: string;
  /** Grace period for SIGTERM before we escalate to SIGKILL. Default 10s. */
  timeoutMs?: number;
  /** Poll interval while waiting for the process to exit. Default 100ms. */
  pollIntervalMs?: number;
  /** Post-SIGKILL budget before giving up. Default 2s. */
  killTimeoutMs?: number;
  /** Sleep override for deterministic tests. */
  sleep?: (ms: number) => Promise<void>;
  /** kill override for deterministic tests. */
  kill?: (pid: number, signal: NodeJS.Signals | number) => void;
  /** isAlive override for deterministic tests. */
  isAlive?: (pid: number) => boolean;
}
876
+
877
/** Promise-based delay: resolves after roughly `ms` milliseconds. */
function defaultSleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
880
+
881
/**
 * Poll until `pid` is no longer alive or the deadline passes.
 *
 * @param pid - Process to watch.
 * @param deadlineMs - Absolute deadline on the `Date.now()` clock.
 * @param pollIntervalMs - Delay between probes.
 * @param isAlive - Liveness probe.
 * @param sleep - Delay function.
 * @returns `true` if the process was seen dead, including on one final probe
 *   made after the deadline; `false` if it was still alive then.
 */
async function waitForExit(
  pid: number,
  deadlineMs: number,
  pollIntervalMs: number,
  isAlive: (pid: number) => boolean,
  sleep: (ms: number) => Promise<void>
): Promise<boolean> {
  for (;;) {
    if (Date.now() >= deadlineMs) {
      // Out of time: one last probe decides the result.
      return !isAlive(pid);
    }
    if (!isAlive(pid)) {
      return true;
    }
    await sleep(pollIntervalMs);
  }
}
896
+
897
/**
 * Stop a detached process: SIGTERM → poll → SIGKILL → poll → give up.
 *
 * On success this helper does **not** unlink the pid-file — the target's own
 * signal handler is expected to do that (see `createSignalPromise` in
 * `src/cli/commands/daemon.ts`). Callers may unlink it as a fallback once the
 * process is confirmed dead. The only pid-file removed here is a stale one
 * found on entry.
 *
 * @returns
 * - `not-running` — no pid-file, a stale pid-file, or the process vanished
 *   before SIGTERM could be delivered.
 * - `stopped` — the process exited; `signal` names what ended it.
 * - `timeout` — still alive after SIGKILL and its wait budget.
 * - `foreign-live` — alive but recorded by another gno version; no signal
 *   was sent.
 * @throws CliError("VALIDATION") when the pid-file belongs to the other
 *   command kind and that process is alive.
 * @throws Any signalling error other than ESRCH.
 */
export async function stopProcess(options: StopOptions): Promise<StopOutcome> {
  const {
    kind,
    pidFile,
    timeoutMs = 10_000,
    pollIntervalMs = 100,
    killTimeoutMs = 2_000,
    sleep = defaultSleep,
    isAlive = isProcessAlive,
  } = options;
  const kill = options.kill ?? ((pid, signal) => process.kill(pid, signal));

  const payload = await readPidFile(pidFile);
  if (!payload) {
    return { kind: "not-running", pidFile };
  }

  const targetPid = payload.pid;

  if (!isAlive(targetPid)) {
    // Stale pid-file: remove it as a best-effort courtesy.
    try {
      await unlink(pidFile);
    } catch {
      /* ignore */
    }
    return { kind: "not-running", pidFile };
  }

  if (payload.cmd !== kind) {
    throw new CliError(
      "VALIDATION",
      `pid-file ${pidFile} is owned by a running ${payload.cmd} (pid ${targetPid}), not ${kind}`
    );
  }

  if (!versionMatchesPidFile(payload)) {
    // Alive and the right kind, but written by a different gno version. It
    // may be a pre-upgrade process or plain PID reuse; since identity cannot
    // be proven we MUST NOT signal it. The pid-file stays for the operator.
    return { kind: "foreign-live", pid: targetPid, payload };
  }

  // Delivers `signal`; false means the process was already gone (ESRCH).
  const deliver = (signal: "SIGTERM" | "SIGKILL"): boolean => {
    try {
      kill(targetPid, signal);
      return true;
    } catch (error) {
      if (isErrnoException(error) && error.code === "ESRCH") {
        return false;
      }
      throw error;
    }
  };

  // Polls for exit with a deadline `budgetMs` from now.
  const goneWithin = (budgetMs: number): Promise<boolean> =>
    waitForExit(
      targetPid,
      Date.now() + budgetMs,
      pollIntervalMs,
      isAlive,
      sleep
    );

  if (!deliver("SIGTERM")) {
    return { kind: "not-running", pidFile };
  }
  if (await goneWithin(timeoutMs)) {
    return { kind: "stopped", pid: targetPid, signal: "SIGTERM" };
  }

  if (!deliver("SIGKILL")) {
    // It exited after the SIGTERM wait ended but before SIGKILL landed, so
    // SIGTERM still gets the credit.
    return { kind: "stopped", pid: targetPid, signal: "SIGTERM" };
  }
  if (await goneWithin(killTimeoutMs)) {
    return { kind: "stopped", pid: targetPid, signal: "SIGKILL" };
  }

  return { kind: "timeout", pid: targetPid };
}