@os-eco/overstory-cli 0.9.4 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124)
  1. package/README.md +50 -19
  2. package/agents/builder.md +19 -9
  3. package/agents/coordinator.md +6 -6
  4. package/agents/lead.md +204 -87
  5. package/agents/merger.md +25 -14
  6. package/agents/reviewer.md +22 -16
  7. package/agents/scout.md +17 -12
  8. package/package.json +6 -3
  9. package/src/agents/capabilities.test.ts +85 -0
  10. package/src/agents/capabilities.ts +125 -0
  11. package/src/agents/headless-mail-injector.test.ts +448 -0
  12. package/src/agents/headless-mail-injector.ts +219 -0
  13. package/src/agents/headless-prompt.test.ts +102 -0
  14. package/src/agents/headless-prompt.ts +68 -0
  15. package/src/agents/hooks-deployer.test.ts +514 -14
  16. package/src/agents/hooks-deployer.ts +141 -0
  17. package/src/agents/mail-poll-detect.test.ts +153 -0
  18. package/src/agents/mail-poll-detect.ts +73 -0
  19. package/src/agents/overlay.test.ts +60 -4
  20. package/src/agents/overlay.ts +63 -8
  21. package/src/agents/scope-detect.test.ts +190 -0
  22. package/src/agents/scope-detect.ts +146 -0
  23. package/src/agents/turn-lock.test.ts +181 -0
  24. package/src/agents/turn-lock.ts +235 -0
  25. package/src/agents/turn-runner-dispatch.test.ts +182 -0
  26. package/src/agents/turn-runner-dispatch.ts +105 -0
  27. package/src/agents/turn-runner.test.ts +2312 -0
  28. package/src/agents/turn-runner.ts +1383 -0
  29. package/src/commands/agents.ts +9 -0
  30. package/src/commands/clean.ts +54 -0
  31. package/src/commands/coordinator.test.ts +254 -0
  32. package/src/commands/coordinator.ts +273 -8
  33. package/src/commands/dashboard.test.ts +188 -0
  34. package/src/commands/dashboard.ts +14 -4
  35. package/src/commands/doctor.ts +3 -1
  36. package/src/commands/group.test.ts +94 -0
  37. package/src/commands/group.ts +49 -20
  38. package/src/commands/init.test.ts +8 -0
  39. package/src/commands/init.ts +8 -1
  40. package/src/commands/log.test.ts +187 -11
  41. package/src/commands/log.ts +171 -71
  42. package/src/commands/mail.test.ts +162 -0
  43. package/src/commands/mail.ts +64 -9
  44. package/src/commands/merge.test.ts +230 -1
  45. package/src/commands/merge.ts +68 -12
  46. package/src/commands/nudge.test.ts +351 -4
  47. package/src/commands/nudge.ts +356 -34
  48. package/src/commands/run.test.ts +43 -7
  49. package/src/commands/serve/build.test.ts +202 -0
  50. package/src/commands/serve/build.ts +206 -0
  51. package/src/commands/serve/coordinator-actions.test.ts +339 -0
  52. package/src/commands/serve/coordinator-actions.ts +408 -0
  53. package/src/commands/serve/dev.test.ts +168 -0
  54. package/src/commands/serve/dev.ts +117 -0
  55. package/src/commands/serve/mail-actions.test.ts +312 -0
  56. package/src/commands/serve/mail-actions.ts +167 -0
  57. package/src/commands/serve/rest.test.ts +1323 -0
  58. package/src/commands/serve/rest.ts +708 -0
  59. package/src/commands/serve/static.ts +51 -0
  60. package/src/commands/serve/ws.test.ts +361 -0
  61. package/src/commands/serve/ws.ts +332 -0
  62. package/src/commands/serve.test.ts +459 -0
  63. package/src/commands/serve.ts +565 -0
  64. package/src/commands/sling.test.ts +177 -1
  65. package/src/commands/sling.ts +243 -71
  66. package/src/commands/status.test.ts +9 -0
  67. package/src/commands/status.ts +12 -4
  68. package/src/commands/stop.test.ts +255 -1
  69. package/src/commands/stop.ts +107 -8
  70. package/src/commands/watch.test.ts +43 -0
  71. package/src/commands/watch.ts +153 -28
  72. package/src/config.ts +23 -0
  73. package/src/doctor/consistency.test.ts +106 -0
  74. package/src/doctor/consistency.ts +48 -1
  75. package/src/doctor/serve.test.ts +95 -0
  76. package/src/doctor/serve.ts +86 -0
  77. package/src/doctor/types.ts +2 -1
  78. package/src/doctor/watchdog.ts +57 -1
  79. package/src/events/tailer.test.ts +234 -1
  80. package/src/events/tailer.ts +90 -0
  81. package/src/index.ts +57 -6
  82. package/src/insights/quality-gates.test.ts +141 -0
  83. package/src/insights/quality-gates.ts +156 -0
  84. package/src/json.ts +29 -0
  85. package/src/logging/theme.ts +4 -0
  86. package/src/mail/client.ts +15 -2
  87. package/src/mail/store.test.ts +82 -0
  88. package/src/mail/store.ts +41 -4
  89. package/src/merge/lock.test.ts +149 -0
  90. package/src/merge/lock.ts +140 -0
  91. package/src/merge/predict.test.ts +387 -0
  92. package/src/merge/predict.ts +249 -0
  93. package/src/merge/resolver.ts +1 -1
  94. package/src/mulch/client.ts +3 -3
  95. package/src/runtimes/__fixtures__/claude-stream-fixture.ts +22 -0
  96. package/src/runtimes/claude.test.ts +791 -1
  97. package/src/runtimes/claude.ts +323 -1
  98. package/src/runtimes/connections.test.ts +141 -1
  99. package/src/runtimes/connections.ts +73 -4
  100. package/src/runtimes/headless-connection.test.ts +264 -0
  101. package/src/runtimes/headless-connection.ts +158 -0
  102. package/src/runtimes/types.ts +10 -0
  103. package/src/schema-consistency.test.ts +1 -0
  104. package/src/sessions/store.test.ts +657 -29
  105. package/src/sessions/store.ts +286 -23
  106. package/src/test-setup.test.ts +31 -0
  107. package/src/test-setup.ts +28 -0
  108. package/src/types.ts +107 -2
  109. package/src/utils/pid.test.ts +85 -1
  110. package/src/utils/pid.ts +86 -1
  111. package/src/utils/process-scan.test.ts +53 -0
  112. package/src/utils/process-scan.ts +76 -0
  113. package/src/watchdog/daemon.test.ts +1607 -376
  114. package/src/watchdog/daemon.ts +462 -88
  115. package/src/watchdog/health.test.ts +282 -0
  116. package/src/watchdog/health.ts +126 -27
  117. package/src/worktree/manager.test.ts +218 -1
  118. package/src/worktree/manager.ts +55 -0
  119. package/src/worktree/process.test.ts +71 -0
  120. package/src/worktree/process.ts +25 -5
  121. package/src/worktree/tmux.test.ts +28 -0
  122. package/src/worktree/tmux.ts +27 -3
  123. package/templates/CLAUDE.md.tmpl +19 -8
  124. package/templates/overlay.md.tmpl +5 -2
@@ -13,7 +13,13 @@ import { jsonOutput } from "../json.ts";
13
13
  import { printError, printHint, printSuccess } from "../logging/color.ts";
14
14
  import type { HealthCheck } from "../types.ts";
15
15
  import { resolveOverstoryBin } from "../utils/bin.ts";
16
- import { readPidFile, removePidFile, writePidFile } from "../utils/pid.ts";
16
+ import {
17
+ type AcquirePidLockResult,
18
+ acquirePidLock,
19
+ readPidFile,
20
+ removePidFile,
21
+ } from "../utils/pid.ts";
22
+ import { findRunningWatchdogProcesses, type WatchdogProcess } from "../utils/process-scan.ts";
17
23
  import { startDaemon } from "../watchdog/daemon.ts";
18
24
  import { isProcessRunning } from "../watchdog/health.ts";
19
25
 
@@ -38,6 +44,39 @@ export function formatCheck(check: HealthCheck): string {
38
44
  return line;
39
45
  }
40
46
 
47
+ /**
48
+ * Format a "lock contested" error consistently across foreground/background.
49
+ */
50
+ function formatLockContestedError(existingPid: number, pidFilePath: string): string {
51
+ if (existingPid <= 0) {
52
+ return `Watchdog PID file at ${pidFilePath} is owned by another process (could not read PID). Run 'ov watch --kill-others' or remove the file.`;
53
+ }
54
+ return `Watchdog already running (PID: ${existingPid}). Kill it first, run 'ov watch --kill-others', or remove ${pidFilePath}`;
55
+ }
56
+
57
+ /**
58
+ * Kill running `ov watch` daemons that are NOT the given excludedPid.
59
+ * Returns the list of PIDs killed (after a SIGTERM was issued — not waited).
60
+ */
61
+ async function killForeignWatchdogs(
62
+ excludedPid: number | null,
63
+ ): Promise<{ killed: number[]; surveyed: WatchdogProcess[] }> {
64
+ const surveyed = await findRunningWatchdogProcesses();
65
+ const killed: number[] = [];
66
+ for (const proc of surveyed) {
67
+ if (excludedPid !== null && proc.pid === excludedPid) {
68
+ continue;
69
+ }
70
+ try {
71
+ process.kill(proc.pid, "SIGTERM");
72
+ killed.push(proc.pid);
73
+ } catch {
74
+ // Process already gone — not an error.
75
+ }
76
+ }
77
+ return { killed, surveyed };
78
+ }
79
+
41
80
  /**
42
81
  * Core implementation for the watch command.
43
82
  */
@@ -45,6 +84,7 @@ async function runWatch(opts: {
45
84
  interval?: string;
46
85
  background?: boolean;
47
86
  json?: boolean;
87
+ killOthers?: boolean;
48
88
  }): Promise<void> {
49
89
  const cwd = process.cwd();
50
90
  const config = await loadConfig(cwd);
@@ -59,26 +99,46 @@ async function runWatch(opts: {
59
99
 
60
100
  const useJson = opts.json ?? false;
61
101
 
62
- if (opts.background) {
63
- // Check if a watchdog is already running
64
- const existingPid = await readPidFile(pidFilePath);
65
- if (existingPid !== null && isProcessRunning(existingPid)) {
66
- if (useJson) {
67
- jsonOutput("watch", { running: true, pid: existingPid, error: "Watchdog already running" });
68
- } else {
69
- printError(
70
- `Watchdog already running (PID: ${existingPid}). Kill it first or remove ${pidFilePath}`,
71
- );
102
+ // --kill-others: kill any pre-existing `ov watch` daemons before claiming
103
+ // the lock. Useful when an earlier release allowed multi-daemon state.
104
+ if (opts.killOthers) {
105
+ const { killed } = await killForeignWatchdogs(null);
106
+
107
+ // Wait for the just-killed processes to actually exit before reclaiming
108
+ // the PID file. Without this, the next acquirePidLock call sees a still-
109
+ // alive PID in the file and refuses, even though we issued SIGTERM
110
+ // nanoseconds earlier. Poll for up to ~2s.
111
+ const killedSet = new Set(killed);
112
+ if (killedSet.size > 0) {
113
+ const deadline = Date.now() + 2000;
114
+ while (Date.now() < deadline) {
115
+ const stillAlive = killed.filter((p) => isProcessRunning(p));
116
+ if (stillAlive.length === 0) break;
117
+ await Bun.sleep(50);
72
118
  }
73
- process.exitCode = 1;
74
- return;
75
119
  }
76
120
 
77
- // Clean up stale PID file if process is no longer running
121
+ // Reclaim the PID file if it pointed at a process we just killed (it is
122
+ // either already dead or in flight to dead) or at any other dead PID.
123
+ const existingPid = await readPidFile(pidFilePath);
78
124
  if (existingPid !== null) {
79
- await removePidFile(pidFilePath);
125
+ if (killedSet.has(existingPid) || !isProcessRunning(existingPid)) {
126
+ await removePidFile(pidFilePath);
127
+ }
80
128
  }
81
129
 
130
+ if (killed.length > 0) {
131
+ if (useJson) {
132
+ jsonOutput("watch", { killed });
133
+ } else {
134
+ printSuccess(`Killed ${killed.length} foreign watchdog process(es): ${killed.join(", ")}`);
135
+ }
136
+ } else if (!useJson) {
137
+ printHint("No foreign watchdog processes found.");
138
+ }
139
+ }
140
+
141
+ if (opts.background) {
82
142
  // Build the args for the child process, forwarding --interval but not --background
83
143
  const childArgs: string[] = ["watch"];
84
144
  if (opts.interval) {
@@ -88,7 +148,21 @@ async function runWatch(opts: {
88
148
  // Resolve the overstory binary path
89
149
  const overstoryBin = await resolveOverstoryBin();
90
150
 
91
- // Spawn a detached background process running `overstory watch` (without --background)
151
+ // Pre-check: surface "already running" before paying the cost of a spawn.
152
+ // This is only for friendly errors — the authoritative exclusion happens
153
+ // in the atomic acquirePidLock call below.
154
+ const existingPid = await readPidFile(pidFilePath);
155
+ if (existingPid !== null && isProcessRunning(existingPid)) {
156
+ if (useJson) {
157
+ jsonOutput("watch", { running: true, pid: existingPid, error: "Watchdog already running" });
158
+ } else {
159
+ printError(formatLockContestedError(existingPid, pidFilePath));
160
+ }
161
+ process.exitCode = 1;
162
+ return;
163
+ }
164
+
165
+ // Spawn the detached background daemon (foreground mode in the child).
92
166
  const child = Bun.spawn(["bun", "run", overstoryBin, ...childArgs], {
93
167
  cwd,
94
168
  stdout: "ignore",
@@ -96,13 +170,33 @@ async function runWatch(opts: {
96
170
  stdin: "ignore",
97
171
  });
98
172
 
99
- // Unref the child so the parent can exit without waiting for it
100
- child.unref();
101
-
102
173
  const childPid = child.pid;
103
174
 
104
- // Write PID file for later cleanup
105
- await writePidFile(pidFilePath, childPid);
175
+ // Atomically acquire the lock with the child's PID. If another writer
176
+ // raced in between our pre-check and the spawn, we have to kill our
177
+ // child and report contention.
178
+ const lockResult = await acquirePidLock(pidFilePath, childPid, isProcessRunning);
179
+ if (!lockResult.acquired) {
180
+ try {
181
+ child.kill("SIGTERM");
182
+ } catch {
183
+ // Already exited — not an error.
184
+ }
185
+ if (useJson) {
186
+ jsonOutput("watch", {
187
+ running: true,
188
+ pid: lockResult.existingPid,
189
+ error: "Watchdog already running",
190
+ });
191
+ } else {
192
+ printError(formatLockContestedError(lockResult.existingPid, pidFilePath));
193
+ }
194
+ process.exitCode = 1;
195
+ return;
196
+ }
197
+
198
+ // Lock is ours. Detach so this parent invocation can exit independently.
199
+ child.unref();
106
200
 
107
201
  if (useJson) {
108
202
  jsonOutput("watch", { pid: childPid, intervalMs, pidFile: pidFilePath });
@@ -113,7 +207,29 @@ async function runWatch(opts: {
113
207
  return;
114
208
  }
115
209
 
116
- // Foreground mode: show real-time health checks
210
+ // Foreground mode: acquire the lock atomically before announcing anything.
211
+ // In the background-spawn case the parent has already written this PID into
212
+ // the lock file; acquirePidLock detects own-PID and returns acquired=true
213
+ // idempotently.
214
+ const lockResult: AcquirePidLockResult = await acquirePidLock(
215
+ pidFilePath,
216
+ process.pid,
217
+ isProcessRunning,
218
+ );
219
+ if (!lockResult.acquired) {
220
+ if (useJson) {
221
+ jsonOutput("watch", {
222
+ running: true,
223
+ pid: lockResult.existingPid,
224
+ error: "Watchdog already running",
225
+ });
226
+ } else {
227
+ printError(formatLockContestedError(lockResult.existingPid, pidFilePath));
228
+ }
229
+ process.exitCode = 1;
230
+ return;
231
+ }
232
+
117
233
  if (useJson) {
118
234
  jsonOutput("watch", { pid: process.pid, intervalMs, mode: "foreground" });
119
235
  } else {
@@ -121,9 +237,6 @@ async function runWatch(opts: {
121
237
  printHint("Press Ctrl+C to stop.");
122
238
  }
123
239
 
124
- // Write PID file so `--background` check and external tools can find us
125
- await writePidFile(pidFilePath, process.pid);
126
-
127
240
  const { stop } = startDaemon({
128
241
  root: config.project.root,
129
242
  intervalMs,
@@ -131,6 +244,7 @@ async function runWatch(opts: {
131
244
  zombieThresholdMs,
132
245
  nudgeIntervalMs: config.watchdog.nudgeIntervalMs,
133
246
  tier1Enabled: config.watchdog.tier1Enabled,
247
+ notifyParentOnDeath: config.watchdog.notifyParentOnDeath ?? true,
134
248
  onHealthCheck(check) {
135
249
  const timestamp = new Date().toISOString().slice(11, 19);
136
250
  process.stdout.write(`[${timestamp}] ${formatCheck(check)}\n`);
@@ -156,10 +270,21 @@ export function createWatchCommand(): Command {
156
270
  .description("Start Tier 0 mechanical watchdog daemon")
157
271
  .option("--interval <ms>", "Health check interval in milliseconds")
158
272
  .option("--background", "Daemonize (run in background)")
273
+ .option(
274
+ "--kill-others",
275
+ "Kill any pre-existing 'ov watch' processes before starting (for cleanup of multi-daemon state)",
276
+ )
159
277
  .option("--json", "Output as JSON")
160
- .action(async (opts: { interval?: string; background?: boolean; json?: boolean }) => {
161
- await runWatch(opts);
162
- });
278
+ .action(
279
+ async (opts: {
280
+ interval?: string;
281
+ background?: boolean;
282
+ killOthers?: boolean;
283
+ json?: boolean;
284
+ }) => {
285
+ await runWatch(opts);
286
+ },
287
+ );
163
288
  }
164
289
 
165
290
  /**
package/src/config.ts CHANGED
@@ -90,6 +90,7 @@ export const DEFAULT_CONFIG: OverstoryConfig = {
90
90
  rpcTimeoutMs: 5_000, // 5 seconds for RPC getState() calls
91
91
  triageTimeoutMs: 30_000, // 30 seconds for Tier 1 AI triage calls
92
92
  maxEscalationLevel: 3, // Maximum escalation level before termination
93
+ notifyParentOnDeath: true, // Send worker_died mail to parent when watchdog terminates a child
93
94
  },
94
95
  coordinator: {
95
96
  exitTriggers: {
@@ -633,6 +634,16 @@ function validateConfig(config: OverstoryConfig): void {
633
634
  }
634
635
  }
635
636
 
637
+ if (
638
+ config.watchdog.notifyParentOnDeath !== undefined &&
639
+ typeof config.watchdog.notifyParentOnDeath !== "boolean"
640
+ ) {
641
+ throw new ValidationError("watchdog.notifyParentOnDeath must be a boolean", {
642
+ field: "watchdog.notifyParentOnDeath",
643
+ value: config.watchdog.notifyParentOnDeath,
644
+ });
645
+ }
646
+
636
647
  // mulch.primeFormat must be one of the valid options
637
648
  const validFormats = ["markdown", "xml", "json"] as const;
638
649
  if (!validFormats.includes(config.mulch.primeFormat as (typeof validFormats)[number])) {
@@ -774,6 +785,18 @@ function validateConfig(config: OverstoryConfig): void {
774
785
  }
775
786
  }
776
787
 
788
+ // runtime.claudeHeadlessByDefault: must be a boolean if present
789
+ if (
790
+ config.runtime?.claudeHeadlessByDefault !== undefined &&
791
+ typeof config.runtime.claudeHeadlessByDefault !== "boolean"
792
+ ) {
793
+ process.stderr.write(
794
+ `[overstory] WARNING: runtime.claudeHeadlessByDefault must be a boolean. Got: ${typeof config
795
+ .runtime.claudeHeadlessByDefault}. Ignoring.\n`,
796
+ );
797
+ config.runtime.claudeHeadlessByDefault = undefined;
798
+ }
799
+
777
800
  if (config.runtime?.capabilities) {
778
801
  for (const [cap, runtimeName] of Object.entries(config.runtime.capabilities)) {
779
802
  if (runtimeName !== undefined && (typeof runtimeName !== "string" || runtimeName === "")) {
@@ -410,6 +410,112 @@ describe("checkConsistency", () => {
410
410
  expect(checks.find((c) => c.name === "missing-tmux")?.status).toBe("pass");
411
411
  });
412
412
 
413
+ test("orphan-spawns: terminal state with live pid is flagged", async () => {
414
+ const dbPath = join(overstoryDir, "sessions.db");
415
+ const store = createSessionStore(dbPath);
416
+
417
+ store.upsert({
418
+ id: "session-1",
419
+ agentName: "orphaned-agent",
420
+ capability: "builder",
421
+ worktreePath: join(overstoryDir, "worktrees", "orphaned-agent"),
422
+ branchName: "overstory/orphaned-agent/test-123",
423
+ taskId: "test-123",
424
+ tmuxSession: "",
425
+ state: "completed",
426
+ pid: 4242,
427
+ parentAgent: null,
428
+ depth: 0,
429
+ runId: null,
430
+ startedAt: new Date().toISOString(),
431
+ lastActivity: new Date().toISOString(),
432
+ escalationLevel: 0,
433
+ stalledSince: null,
434
+ transcriptPath: null,
435
+ });
436
+ store.close();
437
+
438
+ mockIsProcessAlive.mockReturnValue(true);
439
+ mockListSessions.mockResolvedValue([]);
440
+
441
+ const checks = await checkConsistency(config, overstoryDir, mockDeps);
442
+
443
+ const orphanCheck = checks.find((c) => c.name === "orphan-spawns");
444
+ expect(orphanCheck).toBeDefined();
445
+ expect(orphanCheck?.status).toBe("warn");
446
+ expect(orphanCheck?.message).toContain("1 orphaned spawn");
447
+ expect(orphanCheck?.details?.[0]).toContain("orphaned-agent");
448
+ expect(orphanCheck?.fixable).toBe(true);
449
+ });
450
+
451
+ test("orphan-spawns: tmux dead but pid alive is flagged", async () => {
452
+ const dbPath = join(overstoryDir, "sessions.db");
453
+ const store = createSessionStore(dbPath);
454
+
455
+ store.upsert({
456
+ id: "session-1",
457
+ agentName: "tmux-dead-agent",
458
+ capability: "builder",
459
+ worktreePath: join(overstoryDir, "worktrees", "tmux-dead-agent"),
460
+ branchName: "overstory/tmux-dead-agent/test-123",
461
+ taskId: "test-123",
462
+ tmuxSession: "overstory-testproject-tmux-dead-agent",
463
+ state: "working",
464
+ pid: 4242,
465
+ parentAgent: null,
466
+ depth: 0,
467
+ runId: null,
468
+ startedAt: new Date().toISOString(),
469
+ lastActivity: new Date().toISOString(),
470
+ escalationLevel: 0,
471
+ stalledSince: null,
472
+ transcriptPath: null,
473
+ });
474
+ store.close();
475
+
476
+ mockIsProcessAlive.mockReturnValue(true);
477
+ // tmux server reports no matching session
478
+ mockListSessions.mockResolvedValue([]);
479
+
480
+ const checks = await checkConsistency(config, overstoryDir, mockDeps);
481
+
482
+ const orphanCheck = checks.find((c) => c.name === "orphan-spawns");
483
+ expect(orphanCheck?.status).toBe("warn");
484
+ expect(orphanCheck?.details?.[0]).toContain("tmux session");
485
+ });
486
+
487
+ test("orphan-spawns: passes when terminal-state pid is dead", async () => {
488
+ const dbPath = join(overstoryDir, "sessions.db");
489
+ const store = createSessionStore(dbPath);
490
+
491
+ store.upsert({
492
+ id: "session-1",
493
+ agentName: "clean-completed",
494
+ capability: "builder",
495
+ worktreePath: join(overstoryDir, "worktrees", "clean-completed"),
496
+ branchName: "overstory/clean-completed/test-123",
497
+ taskId: "test-123",
498
+ tmuxSession: "",
499
+ state: "completed",
500
+ pid: 4242,
501
+ parentAgent: null,
502
+ depth: 0,
503
+ runId: null,
504
+ startedAt: new Date().toISOString(),
505
+ lastActivity: new Date().toISOString(),
506
+ escalationLevel: 0,
507
+ stalledSince: null,
508
+ transcriptPath: null,
509
+ });
510
+ store.close();
511
+
512
+ mockIsProcessAlive.mockReturnValue(false);
513
+
514
+ const checks = await checkConsistency(config, overstoryDir, mockDeps);
515
+
516
+ expect(checks.find((c) => c.name === "orphan-spawns")?.status).toBe("pass");
517
+ });
518
+
413
519
  test("handles tmux not installed gracefully", async () => {
414
520
  // Mock tmux listing to throw an error
415
521
  mockListSessions.mockRejectedValue(new Error("tmux: command not found"));
@@ -212,7 +212,9 @@ export async function checkConsistency(
212
212
 
213
213
  // 8. Check for SessionStore entries with missing tmux sessions
214
214
  const existingTmuxNames = new Set(tmuxSessions.map((s) => s.name));
215
- const missingTmux = liveSessions.filter((s) => !existingTmuxNames.has(s.tmuxSession));
215
+ const missingTmux = liveSessions.filter(
216
+ (s) => s.tmuxSession.length > 0 && !existingTmuxNames.has(s.tmuxSession),
217
+ );
216
218
 
217
219
  if (missingTmux.length > 0) {
218
220
  checks.push({
@@ -232,6 +234,51 @@ export async function checkConsistency(
232
234
  });
233
235
  }
234
236
 
237
+ // 8b. Check for orphaned claude spawn PIDs (overstory-505d).
238
+ //
239
+ // An orphan is a session whose pid is still alive but should not be:
240
+ // - the session reached a terminal state (completed/zombie) yet the
241
+ // spawn didn't exit, or
242
+ // - the tmux container is gone but the claude child survived (was
243
+ // reparented to init when its bash wrapper got SIGHUP).
244
+ // Run `ov clean --all` to reap. Distinct from `dead-pids` (the inverse:
245
+ // session is live but its pid already died).
246
+ const orphanedSpawns: Array<{ session: AgentSession; reason: string }> = [];
247
+ for (const s of storeSessions) {
248
+ if (s.pid === null || !isProcessAliveFn(s.pid)) continue;
249
+ if (s.state === "completed" || s.state === "zombie") {
250
+ orphanedSpawns.push({
251
+ session: s,
252
+ reason: `state=${s.state} but pid ${s.pid} still alive`,
253
+ });
254
+ continue;
255
+ }
256
+ if (s.tmuxSession.length > 0 && !existingTmuxNames.has(s.tmuxSession)) {
257
+ orphanedSpawns.push({
258
+ session: s,
259
+ reason: `tmux session "${s.tmuxSession}" missing but pid ${s.pid} alive`,
260
+ });
261
+ }
262
+ }
263
+
264
+ if (orphanedSpawns.length > 0) {
265
+ checks.push({
266
+ name: "orphan-spawns",
267
+ category: "consistency",
268
+ status: "warn",
269
+ message: `Found ${orphanedSpawns.length} orphaned spawn process(es) — run "ov clean --all" to reap`,
270
+ details: orphanedSpawns.map(({ session, reason }) => `${session.agentName}: ${reason}`),
271
+ fixable: true,
272
+ });
273
+ } else {
274
+ checks.push({
275
+ name: "orphan-spawns",
276
+ category: "consistency",
277
+ status: "pass",
278
+ message: "No orphaned spawn processes detected",
279
+ });
280
+ }
281
+
235
282
  // 9. Check reviewer-to-builder ratio per lead
236
283
  const parentGroups = new Map<string, { builders: number; reviewers: number }>();
237
284
  for (const session of storeSessions) {
@@ -0,0 +1,95 @@
1
+ import { afterEach, beforeEach, describe, expect, test } from "bun:test";
2
+ import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from "node:fs";
3
+ import { tmpdir } from "node:os";
4
+ import { join } from "node:path";
5
+ import type { OverstoryConfig } from "../types.ts";
6
+ import { checkServe } from "./serve.ts";
7
+
8
+ describe("checkServe", () => {
9
+ let tempDir: string;
10
+ let mockConfig: OverstoryConfig;
11
+
12
+ beforeEach(() => {
13
+ tempDir = mkdtempSync(join(tmpdir(), "overstory-serve-doctor-test-"));
14
+ mockConfig = {
15
+ project: { name: "test", root: tempDir, canonicalBranch: "main" },
16
+ agents: {
17
+ manifestPath: "",
18
+ baseDir: "",
19
+ maxConcurrent: 5,
20
+ staggerDelayMs: 100,
21
+ maxDepth: 2,
22
+ maxSessionsPerRun: 0,
23
+ maxAgentsPerLead: 5,
24
+ },
25
+ worktrees: { baseDir: "" },
26
+ taskTracker: { backend: "auto", enabled: true },
27
+ mulch: { enabled: true, domains: [], primeFormat: "markdown" },
28
+ merge: { aiResolveEnabled: false, reimagineEnabled: false },
29
+ providers: {
30
+ anthropic: { type: "native" },
31
+ },
32
+ watchdog: {
33
+ tier0Enabled: false,
34
+ tier0IntervalMs: 30000,
35
+ tier1Enabled: false,
36
+ tier2Enabled: false,
37
+ staleThresholdMs: 300000,
38
+ zombieThresholdMs: 600000,
39
+ nudgeIntervalMs: 60000,
40
+ },
41
+ models: {},
42
+ logging: { verbose: false, redactSecrets: true },
43
+ };
44
+ });
45
+
46
+ afterEach(() => {
47
+ rmSync(tempDir, { recursive: true, force: true });
48
+ });
49
+
50
+ test("ui/dist missing — returns warn about missing build", async () => {
51
+ const checks = await checkServe(mockConfig, tempDir);
52
+ const distCheck = checks.find((c) => c.name === "serve ui/dist");
53
+
54
+ expect(distCheck).toBeDefined();
55
+ expect(distCheck?.status).toBe("warn");
56
+ expect(distCheck?.message).toContain("ui/dist not found");
57
+ expect(distCheck?.details?.some((d) => d.includes("ui/dist"))).toBe(true);
58
+ });
59
+
60
+ test("ui/dist exists but index.html missing — returns warn about incomplete build", async () => {
61
+ mkdirSync(join(tempDir, "ui", "dist"), { recursive: true });
62
+ const checks = await checkServe(mockConfig, tempDir);
63
+ const distCheck = checks.find((c) => c.name === "serve ui/dist");
64
+
65
+ expect(distCheck).toBeDefined();
66
+ expect(distCheck?.status).toBe("warn");
67
+ expect(distCheck?.message).toContain("index.html is missing");
68
+ });
69
+
70
+ test("ui/dist with index.html — returns pass", async () => {
71
+ mkdirSync(join(tempDir, "ui", "dist"), { recursive: true });
72
+ writeFileSync(join(tempDir, "ui", "dist", "index.html"), "<html></html>");
73
+ const checks = await checkServe(mockConfig, tempDir);
74
+ const distCheck = checks.find((c) => c.name === "serve ui/dist");
75
+
76
+ expect(distCheck).toBeDefined();
77
+ expect(distCheck?.status).toBe("pass");
78
+ expect(distCheck?.message).toContain("index.html");
79
+ });
80
+
81
+ test("port check included in results", async () => {
82
+ const checks = await checkServe(mockConfig, tempDir);
83
+ const portCheck = checks.find((c) => c.name === "serve port");
84
+
85
+ expect(portCheck).toBeDefined();
86
+ // Server not running — should warn (or pass if something happens to be on the default port)
87
+ expect(portCheck?.status === "warn" || portCheck?.status === "pass").toBe(true);
88
+ });
89
+
90
+ test("returns exactly 2 checks (ui/dist + port)", async () => {
91
+ const checks = await checkServe(mockConfig, tempDir);
92
+ expect(checks).toHaveLength(2);
93
+ expect(checks.map((c) => c.category).every((cat) => cat === "serve")).toBe(true);
94
+ });
95
+ });
@@ -0,0 +1,86 @@
1
+ import { existsSync } from "node:fs";
2
+ import { join } from "node:path";
3
+ import { DEFAULT_SERVE_PORT } from "../commands/serve.ts";
4
+ import type { DoctorCheck, DoctorCheckFn } from "./types.ts";
5
+
6
+ /**
7
+ * ov serve subsystem health checks.
8
+ * Validates ui/dist build output and port reachability.
9
+ */
10
+ export const checkServe: DoctorCheckFn = async (config, _overstoryDir): Promise<DoctorCheck[]> => {
11
+ const checks: DoctorCheck[] = [];
12
+
13
+ // Check 1: ui/dist directory exists (only relevant if a UI has been built)
14
+ const uiDistPath = join(config.project.root, "ui", "dist");
15
+ const uiDistExists = existsSync(uiDistPath);
16
+ const indexHtmlExists = uiDistExists && existsSync(join(uiDistPath, "index.html"));
17
+
18
+ if (!uiDistExists) {
19
+ checks.push({
20
+ name: "serve ui/dist",
21
+ category: "serve",
22
+ status: "warn",
23
+ message: "ui/dist not found — run the UI build before starting ov serve",
24
+ details: [`Expected: ${uiDistPath}`],
25
+ });
26
+ } else if (!indexHtmlExists) {
27
+ checks.push({
28
+ name: "serve ui/dist",
29
+ category: "serve",
30
+ status: "warn",
31
+ message: "ui/dist exists but index.html is missing — UI build may be incomplete",
32
+ details: [`Expected: ${join(uiDistPath, "index.html")}`],
33
+ });
34
+ } else {
35
+ checks.push({
36
+ name: "serve ui/dist",
37
+ category: "serve",
38
+ status: "pass",
39
+ message: "ui/dist is present with index.html",
40
+ });
41
+ }
42
+
43
+ // Check 2: default port reachability (non-blocking probe)
44
+ const port = DEFAULT_SERVE_PORT;
45
+ const host = "127.0.0.1";
46
+ const reachable = await probePort(host, port);
47
+ if (reachable) {
48
+ checks.push({
49
+ name: "serve port",
50
+ category: "serve",
51
+ status: "pass",
52
+ message: `ov serve is reachable on ${host}:${port}`,
53
+ });
54
+ } else {
55
+ checks.push({
56
+ name: "serve port",
57
+ category: "serve",
58
+ status: "warn",
59
+ message: `ov serve is not running on ${host}:${port}`,
60
+ details: [`Start with: ov serve --port ${port}`],
61
+ });
62
+ }
63
+
64
+ return checks;
65
+ };
66
+
67
+ /**
68
+ * Probe whether a TCP port is open by attempting an HTTP connection.
69
+ * Returns true if the server responds, false on any error.
70
+ */
71
+ async function probePort(host: string, port: number): Promise<boolean> {
72
+ try {
73
+ const controller = new AbortController();
74
+ const timeout = setTimeout(() => controller.abort(), 1000);
75
+ try {
76
+ const res = await fetch(`http://${host}:${port}/healthz`, {
77
+ signal: controller.signal,
78
+ });
79
+ return res.ok || res.status < 500;
80
+ } finally {
81
+ clearTimeout(timeout);
82
+ }
83
+ } catch {
84
+ return false;
85
+ }
86
+ }
@@ -15,7 +15,8 @@ export type DoctorCategory =
15
15
  | "version"
16
16
  | "ecosystem"
17
17
  | "providers"
18
- | "watchdog";
18
+ | "watchdog"
19
+ | "serve";
19
20
 
20
21
  /** Result of a single doctor health check. */
21
22
  export interface DoctorCheck {