@os-eco/overstory-cli 0.9.4 → 0.10.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110) hide show
  1. package/README.md +47 -18
  2. package/agents/builder.md +9 -8
  3. package/agents/coordinator.md +6 -6
  4. package/agents/lead.md +98 -82
  5. package/agents/merger.md +25 -14
  6. package/agents/reviewer.md +22 -16
  7. package/agents/scout.md +17 -12
  8. package/package.json +6 -3
  9. package/src/agents/capabilities.test.ts +85 -0
  10. package/src/agents/capabilities.ts +125 -0
  11. package/src/agents/headless-mail-injector.test.ts +448 -0
  12. package/src/agents/headless-mail-injector.ts +211 -0
  13. package/src/agents/headless-prompt.test.ts +102 -0
  14. package/src/agents/headless-prompt.ts +68 -0
  15. package/src/agents/hooks-deployer.test.ts +514 -14
  16. package/src/agents/hooks-deployer.ts +141 -0
  17. package/src/agents/overlay.test.ts +4 -4
  18. package/src/agents/overlay.ts +30 -8
  19. package/src/agents/turn-lock.test.ts +181 -0
  20. package/src/agents/turn-lock.ts +235 -0
  21. package/src/agents/turn-runner-dispatch.test.ts +182 -0
  22. package/src/agents/turn-runner-dispatch.ts +105 -0
  23. package/src/agents/turn-runner.test.ts +1450 -0
  24. package/src/agents/turn-runner.ts +1166 -0
  25. package/src/commands/clean.ts +54 -0
  26. package/src/commands/coordinator.test.ts +127 -0
  27. package/src/commands/coordinator.ts +203 -5
  28. package/src/commands/dashboard.test.ts +188 -0
  29. package/src/commands/dashboard.ts +13 -3
  30. package/src/commands/doctor.ts +3 -1
  31. package/src/commands/group.test.ts +94 -0
  32. package/src/commands/group.ts +49 -20
  33. package/src/commands/init.test.ts +8 -0
  34. package/src/commands/init.ts +8 -1
  35. package/src/commands/log.test.ts +56 -11
  36. package/src/commands/log.ts +134 -69
  37. package/src/commands/mail.test.ts +162 -0
  38. package/src/commands/mail.ts +64 -9
  39. package/src/commands/merge.test.ts +112 -1
  40. package/src/commands/merge.ts +17 -4
  41. package/src/commands/nudge.test.ts +351 -4
  42. package/src/commands/nudge.ts +356 -34
  43. package/src/commands/run.test.ts +43 -7
  44. package/src/commands/serve/build.test.ts +202 -0
  45. package/src/commands/serve/build.ts +206 -0
  46. package/src/commands/serve/coordinator-actions.test.ts +339 -0
  47. package/src/commands/serve/coordinator-actions.ts +408 -0
  48. package/src/commands/serve/dev.test.ts +168 -0
  49. package/src/commands/serve/dev.ts +117 -0
  50. package/src/commands/serve/mail-actions.test.ts +312 -0
  51. package/src/commands/serve/mail-actions.ts +167 -0
  52. package/src/commands/serve/rest.test.ts +1323 -0
  53. package/src/commands/serve/rest.ts +708 -0
  54. package/src/commands/serve/static.ts +51 -0
  55. package/src/commands/serve/ws.test.ts +361 -0
  56. package/src/commands/serve/ws.ts +332 -0
  57. package/src/commands/serve.test.ts +459 -0
  58. package/src/commands/serve.ts +565 -0
  59. package/src/commands/sling.test.ts +73 -1
  60. package/src/commands/sling.ts +149 -64
  61. package/src/commands/status.test.ts +9 -0
  62. package/src/commands/status.ts +12 -4
  63. package/src/commands/stop.test.ts +174 -1
  64. package/src/commands/stop.ts +107 -8
  65. package/src/commands/watch.test.ts +43 -0
  66. package/src/commands/watch.ts +153 -28
  67. package/src/config.ts +23 -0
  68. package/src/doctor/consistency.test.ts +106 -0
  69. package/src/doctor/consistency.ts +48 -1
  70. package/src/doctor/serve.test.ts +95 -0
  71. package/src/doctor/serve.ts +86 -0
  72. package/src/doctor/types.ts +2 -1
  73. package/src/doctor/watchdog.ts +57 -1
  74. package/src/events/tailer.test.ts +234 -1
  75. package/src/events/tailer.ts +90 -0
  76. package/src/index.ts +53 -6
  77. package/src/json.ts +29 -0
  78. package/src/mail/client.ts +15 -2
  79. package/src/mail/store.test.ts +82 -0
  80. package/src/mail/store.ts +41 -4
  81. package/src/merge/lock.test.ts +149 -0
  82. package/src/merge/lock.ts +140 -0
  83. package/src/runtimes/__fixtures__/claude-stream-fixture.ts +22 -0
  84. package/src/runtimes/claude.test.ts +791 -1
  85. package/src/runtimes/claude.ts +323 -1
  86. package/src/runtimes/connections.test.ts +141 -1
  87. package/src/runtimes/connections.ts +73 -4
  88. package/src/runtimes/headless-connection.test.ts +264 -0
  89. package/src/runtimes/headless-connection.ts +158 -0
  90. package/src/runtimes/types.ts +10 -0
  91. package/src/schema-consistency.test.ts +1 -0
  92. package/src/sessions/store.test.ts +390 -24
  93. package/src/sessions/store.ts +184 -19
  94. package/src/test-setup.test.ts +31 -0
  95. package/src/test-setup.ts +28 -0
  96. package/src/types.ts +56 -1
  97. package/src/utils/pid.test.ts +85 -1
  98. package/src/utils/pid.ts +86 -1
  99. package/src/utils/process-scan.test.ts +53 -0
  100. package/src/utils/process-scan.ts +76 -0
  101. package/src/watchdog/daemon.test.ts +1520 -411
  102. package/src/watchdog/daemon.ts +442 -83
  103. package/src/watchdog/health.test.ts +157 -0
  104. package/src/watchdog/health.ts +92 -25
  105. package/src/worktree/process.test.ts +71 -0
  106. package/src/worktree/process.ts +25 -5
  107. package/src/worktree/tmux.test.ts +3 -0
  108. package/src/worktree/tmux.ts +10 -3
  109. package/templates/CLAUDE.md.tmpl +19 -8
  110. package/templates/overlay.md.tmpl +3 -2
@@ -12,12 +12,16 @@
12
12
  * With --clean-worktree, completed agents skip the kill step and proceed to cleanup.
13
13
  */
14
14
 
15
+ import { unlink } from "node:fs/promises";
15
16
  import { join } from "node:path";
16
17
  import { loadConfig } from "../config.ts";
17
18
  import { AgentError, ValidationError } from "../errors.ts";
18
19
  import { jsonOutput } from "../json.ts";
19
20
  import { printSuccess, printWarning } from "../logging/color.ts";
21
+ import { createMailStore } from "../mail/store.ts";
20
22
  import { openSessionStore } from "../sessions/compat.ts";
23
+ import type { MergeReadyPayload } from "../types.ts";
24
+ import { readPidFile } from "../utils/pid.ts";
21
25
  import { removeWorktree } from "../worktree/manager.ts";
22
26
  import { isProcessAlive, isSessionAlive, killProcessTree, killSession } from "../worktree/tmux.ts";
23
27
 
@@ -49,6 +53,56 @@ export interface StopDeps {
49
53
  };
50
54
  }
51
55
 
56
/**
 * Compose the subject line of the coordinator's lead_completed nudge so that it
 * reflects whether the lead sent merge_ready before it exited (overstory-41fe).
 *
 * The merge_ready close-gate (commit 3e21338) stops a lead from running
 * `sd close` without sending merge_ready, yet a lead can still disappear by
 * other routes (process termination, watchdog kill, manual `ov stop`) without
 * ever having sent one. The coordinator needs to tell those cases apart.
 *
 * Possible subjects:
 * - mail store could not be opened or queried: the historical ambiguous wording;
 * - no merge_ready from this agent: "exited — no merge_ready sent";
 * - merge_ready sent but no payload yielded a branch: message count, branch unknown;
 * - exactly one distinct branch: that branch is named;
 * - several distinct branches: distinct-branch count plus a comma-separated list.
 *
 * @param agentName - Name of the lead; used as the `from` filter and in the subject.
 * @param mailDbPath - Path handed to createMailStore.
 * @returns The subject string for the nudge marker. Never throws.
 */
function buildLeadCompletedSubject(agentName: string, mailDbPath: string): string {
	// A Set dedupes on insert (a lead may resend merge_ready for the same
	// branch) while preserving first-seen order for the final listing.
	const branches = new Set<string>();
	let sentCount = 0;

	try {
		const store = createMailStore(mailDbPath);
		try {
			const sent = store.getAll({ from: agentName, type: "merge_ready" });
			sentCount = sent.length;
			for (const message of sent) {
				if (message.payload === null) continue;
				try {
					const body = JSON.parse(message.payload) as Partial<MergeReadyPayload>;
					const branch = body.branch;
					if (typeof branch === "string" && branch.length > 0) {
						branches.add(branch);
					}
				} catch {
					// Unparseable payload: the message still counts toward
					// sentCount but contributes no branch name.
				}
			}
		} finally {
			store.close();
		}
	} catch {
		// The mail store is unusable (corrupt db, permissions, ...). Fall back to
		// the historical ambiguous phrasing rather than blocking the stop.
		return `Lead ${agentName} completed — check mail for merge_ready/worker_done`;
	}

	if (sentCount === 0) {
		return `Lead ${agentName} exited — no merge_ready sent, needs coordinator follow-up`;
	}

	const uniqueBranches = Array.from(branches);
	switch (uniqueBranches.length) {
		case 0:
			return `Lead ${agentName} sent ${sentCount} merge_ready (branch unknown)`;
		case 1:
			return `Lead ${agentName} sent merge_ready for branch ${uniqueBranches[0]}`;
		default:
			return `Lead ${agentName} sent ${uniqueBranches.length} merge_ready (branches: ${uniqueBranches.join(", ")})`;
	}
}
105
+
52
106
  /** Delete a git branch (best-effort, non-fatal). */
53
107
  async function deleteBranchBestEffort(repoRoot: string, branch: string): Promise<boolean> {
54
108
  try {
@@ -115,20 +169,35 @@ export async function stopCommand(
115
169
  }
116
170
 
117
171
  const isZombie = session.state === "zombie";
118
- const isHeadless = session.tmuxSession === "" && session.pid !== null;
172
+ // Headless task-scoped agents (Phase 3 spawn-per-turn): tmuxSession is ""
173
+ // and session.pid is null between turns. The live PID for an in-flight
174
+ // turn is published at .overstory/agents/<name>/turn.pid. Sapling RPC
175
+ // agents still use session.pid for their long-lived process.
176
+ const isHeadless = session.tmuxSession === "";
177
+ const turnPidPath = join(overstoryDir, "agents", agentName, "turn.pid");
119
178
 
120
179
  let tmuxKilled = false;
121
180
  let pidKilled = false;
122
181
 
123
182
  // Skip kill operations for already-completed agents (process/tmux already gone)
124
183
  if (!isAlreadyCompleted) {
125
- if (isHeadless && session.pid !== null) {
126
- // Headless agent: kill via process tree instead of tmux
127
- const alive = proc.isAlive(session.pid);
128
- if (alive) {
129
- await proc.killTree(session.pid);
184
+ if (isHeadless) {
185
+ // Prefer the per-turn PID file (Phase 3) — this catches an in-flight
186
+ // claude turn for any task-scoped capability. Fall back to the
187
+ // session row's pid for legacy/long-lived headless runtimes (Sapling).
188
+ const turnPid = await readPidFile(turnPidPath);
189
+ const targetPid = turnPid ?? session.pid;
190
+ if (targetPid !== null && proc.isAlive(targetPid)) {
191
+ await proc.killTree(targetPid);
130
192
  pidKilled = true;
131
193
  }
194
+ // Reap the turn.pid file so a subsequent ov stop / mail injector
195
+ // doesn't see a stale entry. Idempotent.
196
+ try {
197
+ await unlink(turnPidPath);
198
+ } catch {
199
+ // already gone — non-fatal
200
+ }
132
201
  } else {
133
202
  // TUI agent: kill via tmux session
134
203
  const alive = await tmux.isSessionAlive(session.tmuxSession);
@@ -138,9 +207,39 @@ export async function stopCommand(
138
207
  }
139
208
  }
140
209
 
141
- // Mark session as completed
142
- store.updateState(agentName, "completed");
210
+ // Mark session as completed via the guarded transition. `completed` is
211
+ // reachable from every non-completed state (including zombie, so `ov
212
+ // stop` can promote a watchdog-flagged zombie to a clean completion),
213
+ // so the only way this rejects is if state is already `completed` —
214
+ // which is the no-op we want anyway. Race-safe under overstory-a993.
215
+ store.tryTransitionState(agentName, "completed");
143
216
  store.updateLastActivity(agentName);
217
+
218
+ // Auto-nudge coordinator when a lead truly completes so it wakes up
219
+ // to process merge_ready / worker_done messages without waiting for
220
+ // user input. Fires from `ov stop` (real completion signal) rather
221
+ // than the per-turn Stop hook, which was spamming the coordinator
222
+ // (overstory-49a7).
223
+ if (session.capability === "lead") {
224
+ try {
225
+ const mailDbPath = join(overstoryDir, "mail.db");
226
+ const subject = buildLeadCompletedSubject(agentName, mailDbPath);
227
+ const nudgesDir = join(overstoryDir, "pending-nudges");
228
+ const { mkdir } = await import("node:fs/promises");
229
+ await mkdir(nudgesDir, { recursive: true });
230
+ const markerPath = join(nudgesDir, "coordinator.json");
231
+ const marker = {
232
+ from: agentName,
233
+ reason: "lead_completed",
234
+ subject,
235
+ messageId: `auto-nudge-${agentName}-${Date.now()}`,
236
+ createdAt: new Date().toISOString(),
237
+ };
238
+ await Bun.write(markerPath, `${JSON.stringify(marker, null, "\t")}\n`);
239
+ } catch {
240
+ // Non-fatal: nudge failure should not break stop
241
+ }
242
+ }
144
243
  }
145
244
 
146
245
  // Optionally remove worktree and branch (best-effort, non-fatal)
@@ -88,6 +88,7 @@ describe("watchCommand", () => {
88
88
  expect(out).toContain("watch");
89
89
  expect(out).toContain("--interval");
90
90
  expect(out).toContain("--background");
91
+ expect(out).toContain("--kill-others");
91
92
  expect(out).toContain("Tier 0");
92
93
  });
93
94
 
@@ -113,6 +114,48 @@ describe("watchCommand", () => {
113
114
  expect(process.exitCode).toBe(1);
114
115
  });
115
116
 
117
+ test("foreground mode: refuses when a live foreign PID owns the lock", async () => {
118
+ // Spawn a long-running child to act as the "foreign live process". Its
119
+ // PID will not match our own, so acquirePidLock should refuse rather
120
+ // than treat the existing PID as idempotent self-ownership. The
121
+ // foreground path used to overwrite this file unconditionally — the
122
+ // overstory-8ef6 fix forces it to refuse.
123
+ const sleeper = Bun.spawn(["sleep", "30"], {
124
+ stdout: "ignore",
125
+ stderr: "ignore",
126
+ });
127
+ try {
128
+ const pidFilePath = join(tempDir, ".overstory", "watchdog.pid");
129
+ await Bun.write(pidFilePath, `${sleeper.pid}\n`);
130
+
131
+ // --json for structured output. No --background, so this exercises
132
+ // the foreground exclusion path. A correctly contested lock returns
133
+ // immediately (exit 1) without starting the daemon loop.
134
+ await watchCommand(["--json"]);
135
+
136
+ const out = output();
137
+ const jsonLine = out
138
+ .split("\n")
139
+ .map((l) => l.trim())
140
+ .find((l) => l.startsWith("{"));
141
+ expect(jsonLine).toBeDefined();
142
+ if (jsonLine) {
143
+ const parsed = JSON.parse(jsonLine);
144
+ expect(parsed.running).toBe(true);
145
+ expect(parsed.pid).toBe(sleeper.pid);
146
+ expect(parsed.error).toContain("already running");
147
+ }
148
+ expect(process.exitCode).toBe(1);
149
+
150
+ // PID file untouched — still the foreign owner's PID.
151
+ const fileContent = await Bun.file(pidFilePath).text();
152
+ expect(fileContent.trim()).toBe(`${sleeper.pid}`);
153
+ } finally {
154
+ sleeper.kill("SIGTERM");
155
+ await sleeper.exited.catch(() => {});
156
+ }
157
+ });
158
+
116
159
  test("background mode: stale PID cleanup", async () => {
117
160
  // Write a PID file with a non-running process (999999 is very unlikely to exist)
118
161
  const pidFilePath = join(tempDir, ".overstory", "watchdog.pid");
@@ -13,7 +13,13 @@ import { jsonOutput } from "../json.ts";
13
13
  import { printError, printHint, printSuccess } from "../logging/color.ts";
14
14
  import type { HealthCheck } from "../types.ts";
15
15
  import { resolveOverstoryBin } from "../utils/bin.ts";
16
- import { readPidFile, removePidFile, writePidFile } from "../utils/pid.ts";
16
+ import {
17
+ type AcquirePidLockResult,
18
+ acquirePidLock,
19
+ readPidFile,
20
+ removePidFile,
21
+ } from "../utils/pid.ts";
22
+ import { findRunningWatchdogProcesses, type WatchdogProcess } from "../utils/process-scan.ts";
17
23
  import { startDaemon } from "../watchdog/daemon.ts";
18
24
  import { isProcessRunning } from "../watchdog/health.ts";
19
25
 
@@ -38,6 +44,39 @@ export function formatCheck(check: HealthCheck): string {
38
44
  return line;
39
45
  }
40
46
 
47
/**
 * Format a "lock contested" error consistently across foreground/background.
 *
 * @param existingPid - PID reported as the current lock owner. Any value that
 *   is not a positive integer (0, negative, NaN, fractional) is treated as
 *   "PID could not be read", so the message never prints a bogus PID such as
 *   "PID: NaN".
 * @param pidFilePath - Path of the watchdog PID file, echoed in the
 *   remediation hint.
 * @returns A single-line, human-readable error message.
 */
function formatLockContestedError(existingPid: number, pidFilePath: string): string {
	const hasUsablePid = Number.isInteger(existingPid) && existingPid > 0;
	if (!hasUsablePid) {
		return `Watchdog PID file at ${pidFilePath} is owned by another process (could not read PID). Run 'ov watch --kill-others' or remove the file.`;
	}
	return `Watchdog already running (PID: ${existingPid}). Kill it first, run 'ov watch --kill-others', or remove ${pidFilePath}`;
}
56
+
57
/**
 * Send SIGTERM to every running `ov watch` daemon other than the excluded PID
 * and the current process.
 *
 * The current process is always skipped. This helper runs inside an `ov watch`
 * invocation, and whether findRunningWatchdogProcesses already filters out the
 * caller is not visible from here, so the guard makes sure we never signal
 * ourselves.
 *
 * @param excludedPid - PID to leave alone, or null to exclude nothing beyond
 *   the current process.
 * @returns `killed`: PIDs for which SIGTERM was issued without an error (their
 *   exit is NOT awaited); `surveyed`: every process reported by the scan.
 */
async function killForeignWatchdogs(
	excludedPid: number | null,
): Promise<{ killed: number[]; surveyed: WatchdogProcess[] }> {
	const surveyed = await findRunningWatchdogProcesses();
	const killed: number[] = [];
	for (const proc of surveyed) {
		// Never signal our own process, even when no explicit exclusion is given.
		if (proc.pid === process.pid) {
			continue;
		}
		if (excludedPid !== null && proc.pid === excludedPid) {
			continue;
		}
		try {
			process.kill(proc.pid, "SIGTERM");
			killed.push(proc.pid);
		} catch {
			// Process already gone — not an error. (Any other process.kill
			// failure is swallowed here too and simply not reported as killed.)
		}
	}
	return { killed, surveyed };
}
79
+
41
80
  /**
42
81
  * Core implementation for the watch command.
43
82
  */
@@ -45,6 +84,7 @@ async function runWatch(opts: {
45
84
  interval?: string;
46
85
  background?: boolean;
47
86
  json?: boolean;
87
+ killOthers?: boolean;
48
88
  }): Promise<void> {
49
89
  const cwd = process.cwd();
50
90
  const config = await loadConfig(cwd);
@@ -59,26 +99,46 @@ async function runWatch(opts: {
59
99
 
60
100
  const useJson = opts.json ?? false;
61
101
 
62
- if (opts.background) {
63
- // Check if a watchdog is already running
64
- const existingPid = await readPidFile(pidFilePath);
65
- if (existingPid !== null && isProcessRunning(existingPid)) {
66
- if (useJson) {
67
- jsonOutput("watch", { running: true, pid: existingPid, error: "Watchdog already running" });
68
- } else {
69
- printError(
70
- `Watchdog already running (PID: ${existingPid}). Kill it first or remove ${pidFilePath}`,
71
- );
102
+ // --kill-others: kill any pre-existing `ov watch` daemons before claiming
103
+ // the lock. Useful when an earlier release allowed multi-daemon state.
104
+ if (opts.killOthers) {
105
+ const { killed } = await killForeignWatchdogs(null);
106
+
107
+ // Wait for the just-killed processes to actually exit before reclaiming
108
+ // the PID file. Without this, the next acquirePidLock call sees a still-
109
+ // alive PID in the file and refuses, even though we issued SIGTERM
110
+ // nanoseconds earlier. Poll for up to ~2s.
111
+ const killedSet = new Set(killed);
112
+ if (killedSet.size > 0) {
113
+ const deadline = Date.now() + 2000;
114
+ while (Date.now() < deadline) {
115
+ const stillAlive = killed.filter((p) => isProcessRunning(p));
116
+ if (stillAlive.length === 0) break;
117
+ await Bun.sleep(50);
72
118
  }
73
- process.exitCode = 1;
74
- return;
75
119
  }
76
120
 
77
- // Clean up stale PID file if process is no longer running
121
+ // Reclaim the PID file if it pointed at a process we just killed (it is
122
+ // either already dead or in flight to dead) or at any other dead PID.
123
+ const existingPid = await readPidFile(pidFilePath);
78
124
  if (existingPid !== null) {
79
- await removePidFile(pidFilePath);
125
+ if (killedSet.has(existingPid) || !isProcessRunning(existingPid)) {
126
+ await removePidFile(pidFilePath);
127
+ }
80
128
  }
81
129
 
130
+ if (killed.length > 0) {
131
+ if (useJson) {
132
+ jsonOutput("watch", { killed });
133
+ } else {
134
+ printSuccess(`Killed ${killed.length} foreign watchdog process(es): ${killed.join(", ")}`);
135
+ }
136
+ } else if (!useJson) {
137
+ printHint("No foreign watchdog processes found.");
138
+ }
139
+ }
140
+
141
+ if (opts.background) {
82
142
  // Build the args for the child process, forwarding --interval but not --background
83
143
  const childArgs: string[] = ["watch"];
84
144
  if (opts.interval) {
@@ -88,7 +148,21 @@ async function runWatch(opts: {
88
148
  // Resolve the overstory binary path
89
149
  const overstoryBin = await resolveOverstoryBin();
90
150
 
91
- // Spawn a detached background process running `overstory watch` (without --background)
151
+ // Pre-check: surface "already running" before paying the cost of a spawn.
152
+ // This is only for friendly errors — the authoritative exclusion happens
153
+ // in the atomic acquirePidLock call below.
154
+ const existingPid = await readPidFile(pidFilePath);
155
+ if (existingPid !== null && isProcessRunning(existingPid)) {
156
+ if (useJson) {
157
+ jsonOutput("watch", { running: true, pid: existingPid, error: "Watchdog already running" });
158
+ } else {
159
+ printError(formatLockContestedError(existingPid, pidFilePath));
160
+ }
161
+ process.exitCode = 1;
162
+ return;
163
+ }
164
+
165
+ // Spawn the detached background daemon (foreground mode in the child).
92
166
  const child = Bun.spawn(["bun", "run", overstoryBin, ...childArgs], {
93
167
  cwd,
94
168
  stdout: "ignore",
@@ -96,13 +170,33 @@ async function runWatch(opts: {
96
170
  stdin: "ignore",
97
171
  });
98
172
 
99
- // Unref the child so the parent can exit without waiting for it
100
- child.unref();
101
-
102
173
  const childPid = child.pid;
103
174
 
104
- // Write PID file for later cleanup
105
- await writePidFile(pidFilePath, childPid);
175
+ // Atomically acquire the lock with the child's PID. If another writer
176
+ // raced in between our pre-check and the spawn, we have to kill our
177
+ // child and report contention.
178
+ const lockResult = await acquirePidLock(pidFilePath, childPid, isProcessRunning);
179
+ if (!lockResult.acquired) {
180
+ try {
181
+ child.kill("SIGTERM");
182
+ } catch {
183
+ // Already exited — not an error.
184
+ }
185
+ if (useJson) {
186
+ jsonOutput("watch", {
187
+ running: true,
188
+ pid: lockResult.existingPid,
189
+ error: "Watchdog already running",
190
+ });
191
+ } else {
192
+ printError(formatLockContestedError(lockResult.existingPid, pidFilePath));
193
+ }
194
+ process.exitCode = 1;
195
+ return;
196
+ }
197
+
198
+ // Lock is ours. Detach so this parent invocation can exit independently.
199
+ child.unref();
106
200
 
107
201
  if (useJson) {
108
202
  jsonOutput("watch", { pid: childPid, intervalMs, pidFile: pidFilePath });
@@ -113,7 +207,29 @@ async function runWatch(opts: {
113
207
  return;
114
208
  }
115
209
 
116
- // Foreground mode: show real-time health checks
210
+ // Foreground mode: acquire the lock atomically before announcing anything.
211
+ // In the background-spawn case the parent has already written this PID into
212
+ // the lock file; acquirePidLock detects own-PID and returns acquired=true
213
+ // idempotently.
214
+ const lockResult: AcquirePidLockResult = await acquirePidLock(
215
+ pidFilePath,
216
+ process.pid,
217
+ isProcessRunning,
218
+ );
219
+ if (!lockResult.acquired) {
220
+ if (useJson) {
221
+ jsonOutput("watch", {
222
+ running: true,
223
+ pid: lockResult.existingPid,
224
+ error: "Watchdog already running",
225
+ });
226
+ } else {
227
+ printError(formatLockContestedError(lockResult.existingPid, pidFilePath));
228
+ }
229
+ process.exitCode = 1;
230
+ return;
231
+ }
232
+
117
233
  if (useJson) {
118
234
  jsonOutput("watch", { pid: process.pid, intervalMs, mode: "foreground" });
119
235
  } else {
@@ -121,9 +237,6 @@ async function runWatch(opts: {
121
237
  printHint("Press Ctrl+C to stop.");
122
238
  }
123
239
 
124
- // Write PID file so `--background` check and external tools can find us
125
- await writePidFile(pidFilePath, process.pid);
126
-
127
240
  const { stop } = startDaemon({
128
241
  root: config.project.root,
129
242
  intervalMs,
@@ -131,6 +244,7 @@ async function runWatch(opts: {
131
244
  zombieThresholdMs,
132
245
  nudgeIntervalMs: config.watchdog.nudgeIntervalMs,
133
246
  tier1Enabled: config.watchdog.tier1Enabled,
247
+ notifyParentOnDeath: config.watchdog.notifyParentOnDeath ?? true,
134
248
  onHealthCheck(check) {
135
249
  const timestamp = new Date().toISOString().slice(11, 19);
136
250
  process.stdout.write(`[${timestamp}] ${formatCheck(check)}\n`);
@@ -156,10 +270,21 @@ export function createWatchCommand(): Command {
156
270
  .description("Start Tier 0 mechanical watchdog daemon")
157
271
  .option("--interval <ms>", "Health check interval in milliseconds")
158
272
  .option("--background", "Daemonize (run in background)")
273
+ .option(
274
+ "--kill-others",
275
+ "Kill any pre-existing 'ov watch' processes before starting (for cleanup of multi-daemon state)",
276
+ )
159
277
  .option("--json", "Output as JSON")
160
- .action(async (opts: { interval?: string; background?: boolean; json?: boolean }) => {
161
- await runWatch(opts);
162
- });
278
+ .action(
279
+ async (opts: {
280
+ interval?: string;
281
+ background?: boolean;
282
+ killOthers?: boolean;
283
+ json?: boolean;
284
+ }) => {
285
+ await runWatch(opts);
286
+ },
287
+ );
163
288
  }
164
289
 
165
290
  /**
package/src/config.ts CHANGED
@@ -90,6 +90,7 @@ export const DEFAULT_CONFIG: OverstoryConfig = {
90
90
  rpcTimeoutMs: 5_000, // 5 seconds for RPC getState() calls
91
91
  triageTimeoutMs: 30_000, // 30 seconds for Tier 1 AI triage calls
92
92
  maxEscalationLevel: 3, // Maximum escalation level before termination
93
+ notifyParentOnDeath: true, // Send worker_died mail to parent when watchdog terminates a child
93
94
  },
94
95
  coordinator: {
95
96
  exitTriggers: {
@@ -633,6 +634,16 @@ function validateConfig(config: OverstoryConfig): void {
633
634
  }
634
635
  }
635
636
 
637
+ if (
638
+ config.watchdog.notifyParentOnDeath !== undefined &&
639
+ typeof config.watchdog.notifyParentOnDeath !== "boolean"
640
+ ) {
641
+ throw new ValidationError("watchdog.notifyParentOnDeath must be a boolean", {
642
+ field: "watchdog.notifyParentOnDeath",
643
+ value: config.watchdog.notifyParentOnDeath,
644
+ });
645
+ }
646
+
636
647
  // mulch.primeFormat must be one of the valid options
637
648
  const validFormats = ["markdown", "xml", "json"] as const;
638
649
  if (!validFormats.includes(config.mulch.primeFormat as (typeof validFormats)[number])) {
@@ -774,6 +785,18 @@ function validateConfig(config: OverstoryConfig): void {
774
785
  }
775
786
  }
776
787
 
788
+ // runtime.claudeHeadlessByDefault: must be a boolean if present
789
+ if (
790
+ config.runtime?.claudeHeadlessByDefault !== undefined &&
791
+ typeof config.runtime.claudeHeadlessByDefault !== "boolean"
792
+ ) {
793
+ process.stderr.write(
794
+ `[overstory] WARNING: runtime.claudeHeadlessByDefault must be a boolean. Got: ${typeof config
795
+ .runtime.claudeHeadlessByDefault}. Ignoring.\n`,
796
+ );
797
+ config.runtime.claudeHeadlessByDefault = undefined;
798
+ }
799
+
777
800
  if (config.runtime?.capabilities) {
778
801
  for (const [cap, runtimeName] of Object.entries(config.runtime.capabilities)) {
779
802
  if (runtimeName !== undefined && (typeof runtimeName !== "string" || runtimeName === "")) {
@@ -410,6 +410,112 @@ describe("checkConsistency", () => {
410
410
  expect(checks.find((c) => c.name === "missing-tmux")?.status).toBe("pass");
411
411
  });
412
412
 
413
+ test("orphan-spawns: terminal state with live pid is flagged", async () => {
414
+ const dbPath = join(overstoryDir, "sessions.db");
415
+ const store = createSessionStore(dbPath);
416
+
417
+ store.upsert({
418
+ id: "session-1",
419
+ agentName: "orphaned-agent",
420
+ capability: "builder",
421
+ worktreePath: join(overstoryDir, "worktrees", "orphaned-agent"),
422
+ branchName: "overstory/orphaned-agent/test-123",
423
+ taskId: "test-123",
424
+ tmuxSession: "",
425
+ state: "completed",
426
+ pid: 4242,
427
+ parentAgent: null,
428
+ depth: 0,
429
+ runId: null,
430
+ startedAt: new Date().toISOString(),
431
+ lastActivity: new Date().toISOString(),
432
+ escalationLevel: 0,
433
+ stalledSince: null,
434
+ transcriptPath: null,
435
+ });
436
+ store.close();
437
+
438
+ mockIsProcessAlive.mockReturnValue(true);
439
+ mockListSessions.mockResolvedValue([]);
440
+
441
+ const checks = await checkConsistency(config, overstoryDir, mockDeps);
442
+
443
+ const orphanCheck = checks.find((c) => c.name === "orphan-spawns");
444
+ expect(orphanCheck).toBeDefined();
445
+ expect(orphanCheck?.status).toBe("warn");
446
+ expect(orphanCheck?.message).toContain("1 orphaned spawn");
447
+ expect(orphanCheck?.details?.[0]).toContain("orphaned-agent");
448
+ expect(orphanCheck?.fixable).toBe(true);
449
+ });
450
+
451
+ test("orphan-spawns: tmux dead but pid alive is flagged", async () => {
452
+ const dbPath = join(overstoryDir, "sessions.db");
453
+ const store = createSessionStore(dbPath);
454
+
455
+ store.upsert({
456
+ id: "session-1",
457
+ agentName: "tmux-dead-agent",
458
+ capability: "builder",
459
+ worktreePath: join(overstoryDir, "worktrees", "tmux-dead-agent"),
460
+ branchName: "overstory/tmux-dead-agent/test-123",
461
+ taskId: "test-123",
462
+ tmuxSession: "overstory-testproject-tmux-dead-agent",
463
+ state: "working",
464
+ pid: 4242,
465
+ parentAgent: null,
466
+ depth: 0,
467
+ runId: null,
468
+ startedAt: new Date().toISOString(),
469
+ lastActivity: new Date().toISOString(),
470
+ escalationLevel: 0,
471
+ stalledSince: null,
472
+ transcriptPath: null,
473
+ });
474
+ store.close();
475
+
476
+ mockIsProcessAlive.mockReturnValue(true);
477
+ // tmux server reports no matching session
478
+ mockListSessions.mockResolvedValue([]);
479
+
480
+ const checks = await checkConsistency(config, overstoryDir, mockDeps);
481
+
482
+ const orphanCheck = checks.find((c) => c.name === "orphan-spawns");
483
+ expect(orphanCheck?.status).toBe("warn");
484
+ expect(orphanCheck?.details?.[0]).toContain("tmux session");
485
+ });
486
+
487
+ test("orphan-spawns: passes when terminal-state pid is dead", async () => {
488
+ const dbPath = join(overstoryDir, "sessions.db");
489
+ const store = createSessionStore(dbPath);
490
+
491
+ store.upsert({
492
+ id: "session-1",
493
+ agentName: "clean-completed",
494
+ capability: "builder",
495
+ worktreePath: join(overstoryDir, "worktrees", "clean-completed"),
496
+ branchName: "overstory/clean-completed/test-123",
497
+ taskId: "test-123",
498
+ tmuxSession: "",
499
+ state: "completed",
500
+ pid: 4242,
501
+ parentAgent: null,
502
+ depth: 0,
503
+ runId: null,
504
+ startedAt: new Date().toISOString(),
505
+ lastActivity: new Date().toISOString(),
506
+ escalationLevel: 0,
507
+ stalledSince: null,
508
+ transcriptPath: null,
509
+ });
510
+ store.close();
511
+
512
+ mockIsProcessAlive.mockReturnValue(false);
513
+
514
+ const checks = await checkConsistency(config, overstoryDir, mockDeps);
515
+
516
+ expect(checks.find((c) => c.name === "orphan-spawns")?.status).toBe("pass");
517
+ });
518
+
413
519
  test("handles tmux not installed gracefully", async () => {
414
520
  // Mock tmux listing to throw an error
415
521
  mockListSessions.mockRejectedValue(new Error("tmux: command not found"));