@vellumai/cli 0.8.4 → 0.8.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. package/AGENTS.md +17 -1
  2. package/knip.json +2 -1
  3. package/package.json +1 -1
  4. package/src/__tests__/api-key-check.test.ts +78 -0
  5. package/src/__tests__/backup.test.ts +38 -0
  6. package/src/__tests__/recover.test.ts +307 -0
  7. package/src/__tests__/retire.test.ts +241 -0
  8. package/src/__tests__/wake.test.ts +215 -0
  9. package/src/commands/backup.ts +2 -0
  10. package/src/commands/client.ts +62 -32
  11. package/src/commands/flags.ts +197 -0
  12. package/src/commands/gateway/token.ts +73 -0
  13. package/src/commands/gateway.ts +29 -0
  14. package/src/commands/logs.ts +6 -18
  15. package/src/commands/ps.ts +41 -41
  16. package/src/commands/recover.ts +47 -9
  17. package/src/commands/restore.ts +8 -1
  18. package/src/commands/retire.ts +145 -55
  19. package/src/commands/roadmap.ts +449 -0
  20. package/src/commands/rollback.ts +2 -14
  21. package/src/commands/ssh.ts +5 -24
  22. package/src/commands/teleport.ts +34 -26
  23. package/src/commands/upgrade.ts +8 -16
  24. package/src/commands/wake.ts +68 -45
  25. package/src/index.ts +9 -0
  26. package/src/lib/__tests__/port-allocator.test.ts +117 -0
  27. package/src/lib/__tests__/step-runner.test.ts +133 -0
  28. package/src/lib/api-key-check.ts +40 -0
  29. package/src/lib/assistant-config.ts +13 -0
  30. package/src/lib/config-utils.ts +24 -3
  31. package/src/lib/docker.ts +72 -8
  32. package/src/lib/hatch-local.ts +15 -2
  33. package/src/lib/http-client.ts +1 -3
  34. package/src/lib/local.ts +173 -292
  35. package/src/lib/orphan-detection.ts +9 -5
  36. package/src/lib/pgrep.ts +5 -1
  37. package/src/lib/platform-client.ts +97 -49
  38. package/src/lib/port-allocator.ts +93 -0
  39. package/src/lib/process.ts +109 -39
  40. package/src/lib/statefulset.ts +0 -10
  41. package/src/lib/step-runner.ts +102 -9
  42. package/src/lib/sync-cloud-assistants.ts +17 -0
  43. package/src/shared/provider-env-vars.ts +1 -0
package/src/lib/local.ts CHANGED
@@ -17,7 +17,11 @@ import {
17
17
  } from "./assistant-config.js";
18
18
  import { GATEWAY_PORT } from "./constants.js";
19
19
  import { httpHealthCheck, waitForDaemonReady } from "./http-client.js";
20
- import { stopProcessByPidFile } from "./process.js";
20
+ import {
21
+ resolveProcessState,
22
+ stopProcess,
23
+ stopProcessByPidFile,
24
+ } from "./process.js";
21
25
  import { openLogFile, pipeToLogFile } from "./xdg-log.js";
22
26
 
23
27
  const _require = createRequire(import.meta.url);
@@ -319,80 +323,16 @@ type DaemonStartOptions = {
319
323
  signingKey?: string;
320
324
  };
321
325
 
322
- async function startDaemonFromSource(
323
- assistantIndex: string,
324
- resources: LocalInstanceResources,
326
+ /**
327
+ * Apply per-instance resource overrides and shared daemon options to an
328
+ * environment object. Called from all daemon spawn paths (source, watch,
329
+ * bundled binary) to eliminate drift between the three.
330
+ */
331
+ function applyDaemonEnvOverrides(
332
+ env: Record<string, string | undefined>,
333
+ resources: LocalInstanceResources | undefined,
325
334
  options?: DaemonStartOptions,
326
- ): Promise<void> {
327
- const foreground = options?.foreground ?? false;
328
- const daemonMainPath = resolveDaemonMainPath(assistantIndex);
329
-
330
- // Ensure the directory containing PID/socket files exists. For named
331
- // instances this is instanceDir/.vellum/workspace/ (matching daemon's getWorkspaceDir()).
332
- const pidFile = getDaemonPidPath(resources);
333
- mkdirSync(dirname(pidFile), { recursive: true });
334
-
335
- // --- Lifecycle guard: prevent split-brain daemon state ---
336
- if (existsSync(pidFile)) {
337
- try {
338
- const content = readFileSync(pidFile, "utf-8").trim();
339
-
340
- // Another caller is already spawning the daemon — wait for it
341
- // instead of racing to spawn a duplicate.
342
- if (content === "starting") {
343
- console.log(
344
- " Assistant is starting — waiting for it to become ready...",
345
- );
346
- if (await waitForDaemonReady(resources.daemonPort, 60000)) {
347
- console.log(" Assistant is ready\n");
348
- return;
349
- }
350
- // The other spawn may have failed; clean up and proceed to spawn.
351
- try {
352
- unlinkSync(pidFile);
353
- } catch {}
354
- }
355
-
356
- const pid = parseInt(content, 10);
357
- if (!isNaN(pid)) {
358
- try {
359
- process.kill(pid, 0);
360
- console.log(` Assistant already running (pid ${pid})\n`);
361
- return;
362
- } catch {
363
- try {
364
- unlinkSync(pidFile);
365
- } catch {}
366
- }
367
- }
368
- } catch {}
369
- }
370
-
371
- // PID file was stale or missing — check if daemon is responding via HTTP
372
- if (await isDaemonResponsive(resources.daemonPort)) {
373
- // Recover PID tracking so lifecycle commands (sleep, retire,
374
- // stopLocalProcesses) can manage this daemon process.
375
- const recoveredPid = recoverPidFile(pidFile, resources.daemonPort);
376
- if (recoveredPid) {
377
- console.log(
378
- ` Assistant is responsive (pid ${recoveredPid}) — skipping restart\n`,
379
- );
380
- } else {
381
- console.log(" Assistant is responsive — skipping restart\n");
382
- }
383
- return;
384
- }
385
-
386
- const env: Record<string, string | undefined> = {
387
- ...process.env,
388
- RUNTIME_HTTP_PORT: process.env.RUNTIME_HTTP_PORT || "7821",
389
- VELLUM_CLOUD: "local",
390
- VELLUM_DEV: "1",
391
- VELLUM_ENVIRONMENT: process.env.VELLUM_ENVIRONMENT || "local",
392
- ...(options?.signingKey
393
- ? { ACTOR_TOKEN_SIGNING_KEY: options.signingKey }
394
- : {}),
395
- };
335
+ ): void {
396
336
  if (resources) {
397
337
  env.VELLUM_WORKSPACE_DIR = join(
398
338
  resources.instanceDir,
@@ -414,12 +354,62 @@ async function startDaemonFromSource(
414
354
  env.QDRANT_HTTP_PORT = String(resources.qdrantPort);
415
355
  delete env.QDRANT_URL;
416
356
  }
357
+ if (options?.signingKey) {
358
+ env.ACTOR_TOKEN_SIGNING_KEY = options.signingKey;
359
+ }
417
360
  if (options?.defaultWorkspaceConfigPath) {
418
361
  env.VELLUM_DEFAULT_WORKSPACE_CONFIG_PATH =
419
362
  options.defaultWorkspaceConfigPath;
420
363
  }
421
-
422
364
  applyIpcSocketDirOverride(env);
365
+ }
366
+
367
+ function logDaemonReadiness(ready: boolean): void {
368
+ if (ready) {
369
+ console.log(" Assistant ready\n");
370
+ } else {
371
+ console.log(
372
+ " ⚠️ Assistant did not become ready within 60s — continuing anyway\n",
373
+ );
374
+ }
375
+ }
376
+
377
+ async function startDaemonFromSource(
378
+ assistantIndex: string,
379
+ resources: LocalInstanceResources,
380
+ options?: DaemonStartOptions,
381
+ ): Promise<void> {
382
+ const foreground = options?.foreground ?? false;
383
+ const daemonMainPath = resolveDaemonMainPath(assistantIndex);
384
+
385
+ // Ensure the directory containing PID/socket files exists. For named
386
+ // instances this is instanceDir/.vellum/workspace/ (matching daemon's getWorkspaceDir()).
387
+ const pidFile = getDaemonPidPath(resources);
388
+ mkdirSync(dirname(pidFile), { recursive: true });
389
+
390
+ // --- Lifecycle guard: prevent split-brain daemon state ---
391
+ if (await awaitStartingSentinel(pidFile, resources.daemonPort)) return;
392
+
393
+ const daemonState = await resolveProcessState(
394
+ pidFile,
395
+ resources.daemonPort,
396
+ "Assistant",
397
+ );
398
+ if (daemonState.status === "healthy") {
399
+ console.log(` Assistant already running (pid ${daemonState.pid})\n`);
400
+ return;
401
+ }
402
+
403
+ if (await checkOrphanedDaemon(pidFile, resources.daemonPort)) return;
404
+
405
+ const env: Record<string, string | undefined> = {
406
+ ...process.env,
407
+ RUNTIME_HTTP_PORT: process.env.RUNTIME_HTTP_PORT || "7821",
408
+ VELLUM_CLOUD: "local",
409
+ VELLUM_DEV: "1",
410
+ VELLUM_ENVIRONMENT: process.env.VELLUM_ENVIRONMENT || "local",
411
+ };
412
+ applyDaemonEnvOverrides(env, resources, options);
423
413
 
424
414
  // Write a sentinel PID file before spawning so concurrent hatch() calls
425
415
  // detect the in-progress spawn and wait instead of racing.
@@ -469,94 +459,27 @@ async function startDaemonWatchFromSource(
469
459
  mkdirSync(dirname(pidFile), { recursive: true });
470
460
 
471
461
  // --- Lifecycle guard: prevent split-brain daemon state ---
472
- // If a daemon is already running, skip spawning a new one.
473
- if (existsSync(pidFile)) {
474
- try {
475
- const content = readFileSync(pidFile, "utf-8").trim();
476
-
477
- // Another caller is already spawning the daemon — wait for it
478
- // instead of racing to spawn a duplicate.
479
- if (content === "starting") {
480
- console.log(
481
- " Assistant is starting — waiting for it to become ready...",
482
- );
483
- if (await waitForDaemonReady(resources.daemonPort, 60000)) {
484
- console.log(" Assistant is ready\n");
485
- return;
486
- }
487
- // The other spawn may have failed; clean up and proceed to spawn.
488
- try {
489
- unlinkSync(pidFile);
490
- } catch {}
491
- }
462
+ if (await awaitStartingSentinel(pidFile, resources.daemonPort)) return;
492
463
 
493
- const pid = parseInt(content, 10);
494
- if (!isNaN(pid)) {
495
- try {
496
- process.kill(pid, 0); // Check if alive
497
- console.log(` Assistant already running (pid ${pid})\n`);
498
- return;
499
- } catch {
500
- // Process doesn't exist, clean up stale PID file
501
- try {
502
- unlinkSync(pidFile);
503
- } catch {}
504
- }
505
- }
506
- } catch {}
507
- }
508
-
509
- // PID file was stale or missing — check if daemon is responding via HTTP
510
- if (await isDaemonResponsive(resources.daemonPort)) {
511
- // Recover PID tracking so lifecycle commands (sleep, retire,
512
- // stopLocalProcesses) can manage this daemon process.
513
- const recoveredPid = recoverPidFile(pidFile, resources.daemonPort);
514
- if (recoveredPid) {
515
- console.log(
516
- ` Assistant is responsive (pid ${recoveredPid}) — skipping restart\n`,
517
- );
518
- } else {
519
- console.log(" Assistant is responsive — skipping restart\n");
520
- }
464
+ const daemonState = await resolveProcessState(
465
+ pidFile,
466
+ resources.daemonPort,
467
+ "Assistant",
468
+ );
469
+ if (daemonState.status === "healthy") {
470
+ console.log(` Assistant already running (pid ${daemonState.pid})\n`);
521
471
  return;
522
472
  }
523
473
 
474
+ if (await checkOrphanedDaemon(pidFile, resources.daemonPort)) return;
475
+
524
476
  const env: Record<string, string | undefined> = {
525
477
  ...process.env,
526
478
  RUNTIME_HTTP_PORT: process.env.RUNTIME_HTTP_PORT || "7821",
527
479
  VELLUM_DEV: "1",
528
480
  VELLUM_ENVIRONMENT: process.env.VELLUM_ENVIRONMENT || "local",
529
- ...(options?.signingKey
530
- ? { ACTOR_TOKEN_SIGNING_KEY: options.signingKey }
531
- : {}),
532
481
  };
533
- if (resources) {
534
- env.VELLUM_WORKSPACE_DIR = join(
535
- resources.instanceDir,
536
- ".vellum",
537
- "workspace",
538
- );
539
- env.GATEWAY_SECURITY_DIR = join(
540
- resources.instanceDir,
541
- ".vellum",
542
- "protected",
543
- );
544
- env.CREDENTIAL_SECURITY_DIR = join(
545
- resources.instanceDir,
546
- ".vellum",
547
- "protected",
548
- );
549
- env.RUNTIME_HTTP_PORT = String(resources.daemonPort);
550
- env.GATEWAY_PORT = String(resources.gatewayPort);
551
- env.QDRANT_HTTP_PORT = String(resources.qdrantPort);
552
- delete env.QDRANT_URL;
553
- }
554
- if (options?.defaultWorkspaceConfigPath) {
555
- env.VELLUM_DEFAULT_WORKSPACE_CONFIG_PATH =
556
- options.defaultWorkspaceConfigPath;
557
- }
558
-
559
- applyIpcSocketDirOverride(env);
482
+ applyDaemonEnvOverrides(env, resources, options);
560
483
 
561
484
  // Write a sentinel PID file before spawning so concurrent hatch() calls
562
485
  // detect the in-progress spawn and wait instead of racing.
@@ -660,6 +583,63 @@ function recoverPidFile(
660
583
  return pid;
661
584
  }
662
585
 
586
+ /**
587
+ * Handle the "starting" sentinel in a PID file. When another caller is
588
+ * already spawning the daemon, wait for it to become ready instead of
589
+ * racing to spawn a duplicate.
590
+ *
591
+ * Returns `true` if the daemon became ready (caller should return early),
592
+ * `false` if the spawn failed or the sentinel wasn't present (caller
593
+ * should proceed). Cleans up the PID file on failure.
594
+ */
595
+ async function awaitStartingSentinel(
596
+ pidFile: string,
597
+ daemonPort: number,
598
+ ): Promise<boolean> {
599
+ if (!existsSync(pidFile)) return false;
600
+ try {
601
+ const content = readFileSync(pidFile, "utf-8").trim();
602
+ if (content !== "starting") return false;
603
+ } catch {
604
+ return false;
605
+ }
606
+
607
+ console.log(" Assistant is starting — waiting for it to become ready...");
608
+ if (await waitForDaemonReady(daemonPort, 60000)) {
609
+ console.log(" Assistant is ready\n");
610
+ return true;
611
+ }
612
+ try {
613
+ unlinkSync(pidFile);
614
+ } catch {}
615
+ return false;
616
+ }
617
+
618
+ /**
619
+ * Check if a daemon without a valid PID file is still reachable on its
620
+ * HTTP port (orphaned process). If so, recover its PID file so lifecycle
621
+ * commands can manage it.
622
+ *
623
+ * Returns `true` if an orphaned daemon was found (caller should skip
624
+ * starting a new one), `false` otherwise.
625
+ */
626
+ async function checkOrphanedDaemon(
627
+ pidFile: string,
628
+ daemonPort: number,
629
+ ): Promise<boolean> {
630
+ if (!(await isDaemonResponsive(daemonPort))) return false;
631
+
632
+ const recoveredPid = recoverPidFile(pidFile, daemonPort);
633
+ if (recoveredPid) {
634
+ console.log(
635
+ ` Assistant is responsive (pid ${recoveredPid}) — skipping restart\n`,
636
+ );
637
+ } else {
638
+ console.log(" Assistant is responsive — skipping restart\n");
639
+ }
640
+ return true;
641
+ }
642
+
663
643
  export async function discoverPublicUrl(
664
644
  port?: number,
665
645
  ): Promise<string | undefined> {
@@ -900,64 +880,24 @@ export async function startLocalDaemon(
900
880
 
901
881
  const pidFile = getDaemonPidPath(resources);
902
882
 
903
- // If a daemon is already running, skip spawning a new one.
904
- // This prevents cascading kill→restart cycles when multiple callers
905
- // invoke hatch() concurrently (setupDaemonClient + ensureDaemonConnected).
906
- let daemonAlive = false;
907
- if (existsSync(pidFile)) {
908
- try {
909
- const content = readFileSync(pidFile, "utf-8").trim();
910
-
911
- // Another caller is already spawning the daemon — wait for it
912
- // instead of racing to spawn a duplicate.
913
- if (content === "starting") {
914
- console.log(
915
- " Assistant is starting — waiting for it to become ready...",
916
- );
917
- if (await waitForDaemonReady(resources.daemonPort, 60000)) {
918
- console.log(" Assistant is ready\n");
919
- ensureBunInstalled();
920
- return;
921
- }
922
- // The other spawn may have failed; clean up and proceed to spawn.
923
- try {
924
- unlinkSync(pidFile);
925
- } catch {}
926
- }
883
+ // --- Lifecycle guard: prevent split-brain daemon state ---
884
+ if (await awaitStartingSentinel(pidFile, resources.daemonPort)) {
885
+ ensureBunInstalled();
886
+ return;
887
+ }
927
888
 
928
- const pid = parseInt(content, 10);
929
- if (!isNaN(pid)) {
930
- try {
931
- process.kill(pid, 0); // Check if alive
932
- daemonAlive = true;
933
- console.log(` Assistant already running (pid ${pid})\n`);
934
- } catch {
935
- // Process doesn't exist, clean up stale PID file
936
- try {
937
- unlinkSync(pidFile);
938
- } catch {}
939
- }
940
- }
941
- } catch {}
889
+ const daemonState = await resolveProcessState(
890
+ pidFile,
891
+ resources.daemonPort,
892
+ "Assistant",
893
+ );
894
+ const daemonAlive = daemonState.status === "healthy";
895
+ if (daemonAlive) {
896
+ console.log(` Assistant already running (pid ${daemonState.pid})\n`);
942
897
  }
943
898
 
944
899
  if (!daemonAlive) {
945
- // The PID file was stale or missing, but a daemon with a different PID
946
- // may still be listening on the HTTP port (e.g. if the PID file was
947
- // overwritten by a crashed restart attempt). Check before starting a new one.
948
- if (await isDaemonResponsive(resources.daemonPort)) {
949
- // Restore PID tracking so lifecycle commands (sleep, retire,
950
- // stopLocalProcesses) can manage this daemon process.
951
- const recoveredPid = recoverPidFile(pidFile, resources.daemonPort);
952
- if (recoveredPid) {
953
- console.log(
954
- ` Assistant is responsive (pid ${recoveredPid}) — skipping restart\n`,
955
- );
956
- } else {
957
- console.log(" Assistant is responsive — skipping restart\n");
958
- }
959
- // Ensure bun is available for runtime features (browser, skills install)
960
- // even when reusing an existing daemon.
900
+ if (await checkOrphanedDaemon(pidFile, resources.daemonPort)) {
961
901
  ensureBunInstalled();
962
902
  return;
963
903
  }
@@ -1013,39 +953,7 @@ export async function startLocalDaemon(
1013
953
  daemonEnv[key] = process.env[key]!;
1014
954
  }
1015
955
  }
1016
- if (options?.defaultWorkspaceConfigPath) {
1017
- daemonEnv.VELLUM_DEFAULT_WORKSPACE_CONFIG_PATH =
1018
- options.defaultWorkspaceConfigPath;
1019
- }
1020
- // When running a named instance, override env so the daemon resolves
1021
- // all paths under the instance directory and listens on its own port.
1022
- if (resources) {
1023
- daemonEnv.VELLUM_WORKSPACE_DIR = join(
1024
- resources.instanceDir,
1025
- ".vellum",
1026
- "workspace",
1027
- );
1028
- daemonEnv.GATEWAY_SECURITY_DIR = join(
1029
- resources.instanceDir,
1030
- ".vellum",
1031
- "protected",
1032
- );
1033
- daemonEnv.CREDENTIAL_SECURITY_DIR = join(
1034
- resources.instanceDir,
1035
- ".vellum",
1036
- "protected",
1037
- );
1038
- daemonEnv.RUNTIME_HTTP_PORT = String(resources.daemonPort);
1039
- daemonEnv.GATEWAY_PORT = String(resources.gatewayPort);
1040
- daemonEnv.QDRANT_HTTP_PORT = String(resources.qdrantPort);
1041
- delete daemonEnv.QDRANT_URL;
1042
- }
1043
-
1044
- if (options?.signingKey) {
1045
- daemonEnv.ACTOR_TOKEN_SIGNING_KEY = options.signingKey;
1046
- }
1047
-
1048
- applyIpcSocketDirOverride(daemonEnv);
956
+ applyDaemonEnvOverrides(daemonEnv, resources, options);
1049
957
 
1050
958
  // Write a sentinel PID file before spawning so concurrent hatch() calls
1051
959
  // see the file and fall through to the isDaemonResponsive() port check
@@ -1112,13 +1020,7 @@ export async function startLocalDaemon(
1112
1020
  }
1113
1021
  }
1114
1022
 
1115
- if (daemonReady) {
1116
- console.log(" Assistant ready\n");
1117
- } else {
1118
- console.log(
1119
- " ⚠️ Assistant did not become ready within 60s — continuing anyway\n",
1120
- );
1121
- }
1023
+ logDaemonReadiness(daemonReady);
1122
1024
  } else {
1123
1025
  console.log("🔨 Starting local assistant...");
1124
1026
 
@@ -1131,34 +1033,17 @@ export async function startLocalDaemon(
1131
1033
  }
1132
1034
  if (watch) {
1133
1035
  await startDaemonWatchFromSource(assistantIndex, resources, options);
1134
-
1135
- const daemonReady = await waitForDaemonReady(resources.daemonPort, 60000);
1136
- if (daemonReady) {
1137
- console.log(" Assistant ready\n");
1138
- } else {
1139
- console.log(
1140
- " ⚠️ Assistant did not become ready within 60s — continuing anyway\n",
1141
- );
1142
- }
1143
1036
  } else {
1144
1037
  await startDaemonFromSource(assistantIndex, resources, options);
1145
-
1146
- const daemonReady = await waitForDaemonReady(resources.daemonPort, 60000);
1147
- if (daemonReady) {
1148
- console.log(" Assistant ready\n");
1149
- } else {
1150
- console.log(
1151
- " ⚠️ Assistant did not become ready within 60s — continuing anyway\n",
1152
- );
1153
- }
1154
1038
  }
1039
+ logDaemonReadiness(await waitForDaemonReady(resources.daemonPort, 60000));
1155
1040
  }
1156
1041
  }
1157
1042
 
1158
1043
  export async function startGateway(
1159
1044
  watch: boolean = false,
1160
1045
  resources?: LocalInstanceResources,
1161
- options?: { signingKey?: string },
1046
+ options?: { signingKey?: string; bootstrapSecret?: string },
1162
1047
  ): Promise<string> {
1163
1048
  const effectiveGatewayPort = resources?.gatewayPort ?? GATEWAY_PORT;
1164
1049
 
@@ -1194,6 +1079,9 @@ export async function startGateway(
1194
1079
  ...(options?.signingKey
1195
1080
  ? { ACTOR_TOKEN_SIGNING_KEY: options.signingKey }
1196
1081
  : {}),
1082
+ ...(options?.bootstrapSecret
1083
+ ? { GUARDIAN_BOOTSTRAP_SECRET: options.bootstrapSecret }
1084
+ : {}),
1197
1085
  ...(watch
1198
1086
  ? {
1199
1087
  VELLUM_DEV: "1",
@@ -1273,27 +1161,7 @@ export async function startGateway(
1273
1161
  // Wait for the gateway to be responsive before returning. Without this,
1274
1162
  // callers may try to connect before the HTTP server is listening and get
1275
1163
  // connection-refused errors.
1276
- const start = Date.now();
1277
- const timeoutMs = 30000;
1278
- let ready = false;
1279
- while (Date.now() - start < timeoutMs) {
1280
- try {
1281
- const res = await fetch(
1282
- `http://localhost:${effectiveGatewayPort}/healthz`,
1283
- {
1284
- signal: AbortSignal.timeout(2000),
1285
- },
1286
- );
1287
- if (res.ok) {
1288
- ready = true;
1289
- break;
1290
- }
1291
- } catch {
1292
- // Gateway not ready yet
1293
- }
1294
- await new Promise((r) => setTimeout(r, 250));
1295
- }
1296
-
1164
+ const ready = await waitForDaemonReady(effectiveGatewayPort, 30000);
1297
1165
  if (!ready) {
1298
1166
  console.warn(
1299
1167
  "⚠ Gateway started but health check did not respond within 30s",
@@ -1304,6 +1172,20 @@ export async function startGateway(
1304
1172
  return gatewayUrl;
1305
1173
  }
1306
1174
 
1175
+ /** Check whether a PID belongs to an ngrok process via its command line. */
1176
+ function isNgrokProcess(pid: number): boolean {
1177
+ try {
1178
+ const output = execFileSync("ps", ["-p", String(pid), "-o", "command="], {
1179
+ encoding: "utf-8",
1180
+ timeout: 3000,
1181
+ stdio: ["ignore", "pipe", "ignore"],
1182
+ }).trim();
1183
+ return /ngrok/.test(output);
1184
+ } catch {
1185
+ return false;
1186
+ }
1187
+ }
1188
+
1307
1189
  /**
1308
1190
  * Stop any locally-running daemon and gateway processes
1309
1191
  * and clean up PID files. Called when hatch fails partway through
@@ -1326,15 +1208,14 @@ export async function stopLocalProcesses(
1326
1208
 
1327
1209
  // Kill ngrok directly by PID rather than using stopProcessByPidFile, because
1328
1210
  // isVellumProcess() won't match the ngrok binary — resulting in a no-op that
1329
- // leaves ngrok running.
1211
+ // leaves ngrok running. Verify the PID still belongs to ngrok before killing
1212
+ // to avoid hitting an unrelated process if the OS has reused the PID.
1330
1213
  const ngrokPidFile = join(vellumDir, "ngrok.pid");
1331
1214
  if (existsSync(ngrokPidFile)) {
1332
1215
  try {
1333
1216
  const pid = parseInt(readFileSync(ngrokPidFile, "utf-8").trim(), 10);
1334
- if (!isNaN(pid)) {
1335
- try {
1336
- process.kill(pid, "SIGTERM");
1337
- } catch {}
1217
+ if (!isNaN(pid) && isNgrokProcess(pid)) {
1218
+ await stopProcess(pid, "ngrok");
1338
1219
  }
1339
1220
  unlinkSync(ngrokPidFile);
1340
1221
  } catch {}
package/src/lib/orphan-detection.ts CHANGED
@@ -58,7 +58,7 @@ export function readPidFile(pidFile: string): string | null {
58
58
  return pid || null;
59
59
  }
60
60
 
61
- export function isProcessAlive(pid: string): boolean {
61
+ export function isPidAlive(pid: string): boolean {
62
62
  try {
63
63
  process.kill(parseInt(pid, 10), 0);
64
64
  return true;
@@ -138,10 +138,14 @@ export async function detectOrphanedProcesses(
138
138
  // Process table scan — discover orphaned processes by scanning the OS
139
139
  // process table rather than reading PID files from the workspace.
140
140
  try {
141
- const output = await execOutput("sh", [
142
- "-c",
143
- "ps ax -o pid=,ppid=,args= | grep -E 'vellum|qdrant|openclaw' | grep -v grep",
144
- ]);
141
+ const output = await execOutput(
142
+ "sh",
143
+ [
144
+ "-c",
145
+ "ps ax -o pid=,ppid=,args= | grep -E 'vellum|qdrant|openclaw' | grep -v grep",
146
+ ],
147
+ { timeoutMs: 5_000 },
148
+ );
145
149
  const procs = parseRemotePs(output);
146
150
  const ownPid = String(process.pid);
147
151
 
package/src/lib/pgrep.ts CHANGED
@@ -1,8 +1,12 @@
1
1
  import { execOutput } from "./step-runner";
2
2
 
3
+ const PGREP_TIMEOUT_MS = 5_000;
4
+
3
5
  export async function pgrepExact(name: string): Promise<string[]> {
4
6
  try {
5
- const output = await execOutput("pgrep", ["-x", name]);
7
+ const output = await execOutput("pgrep", ["-x", name], {
8
+ timeoutMs: PGREP_TIMEOUT_MS,
9
+ });
6
10
  return output.trim().split("\n").filter(Boolean);
7
11
  } catch {
8
12
  return [];