@os-eco/overstory-cli 0.9.4 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124)
  1. package/README.md +50 -19
  2. package/agents/builder.md +19 -9
  3. package/agents/coordinator.md +6 -6
  4. package/agents/lead.md +204 -87
  5. package/agents/merger.md +25 -14
  6. package/agents/reviewer.md +22 -16
  7. package/agents/scout.md +17 -12
  8. package/package.json +6 -3
  9. package/src/agents/capabilities.test.ts +85 -0
  10. package/src/agents/capabilities.ts +125 -0
  11. package/src/agents/headless-mail-injector.test.ts +448 -0
  12. package/src/agents/headless-mail-injector.ts +219 -0
  13. package/src/agents/headless-prompt.test.ts +102 -0
  14. package/src/agents/headless-prompt.ts +68 -0
  15. package/src/agents/hooks-deployer.test.ts +514 -14
  16. package/src/agents/hooks-deployer.ts +141 -0
  17. package/src/agents/mail-poll-detect.test.ts +153 -0
  18. package/src/agents/mail-poll-detect.ts +73 -0
  19. package/src/agents/overlay.test.ts +60 -4
  20. package/src/agents/overlay.ts +63 -8
  21. package/src/agents/scope-detect.test.ts +190 -0
  22. package/src/agents/scope-detect.ts +146 -0
  23. package/src/agents/turn-lock.test.ts +181 -0
  24. package/src/agents/turn-lock.ts +235 -0
  25. package/src/agents/turn-runner-dispatch.test.ts +182 -0
  26. package/src/agents/turn-runner-dispatch.ts +105 -0
  27. package/src/agents/turn-runner.test.ts +2312 -0
  28. package/src/agents/turn-runner.ts +1383 -0
  29. package/src/commands/agents.ts +9 -0
  30. package/src/commands/clean.ts +54 -0
  31. package/src/commands/coordinator.test.ts +254 -0
  32. package/src/commands/coordinator.ts +273 -8
  33. package/src/commands/dashboard.test.ts +188 -0
  34. package/src/commands/dashboard.ts +14 -4
  35. package/src/commands/doctor.ts +3 -1
  36. package/src/commands/group.test.ts +94 -0
  37. package/src/commands/group.ts +49 -20
  38. package/src/commands/init.test.ts +8 -0
  39. package/src/commands/init.ts +8 -1
  40. package/src/commands/log.test.ts +187 -11
  41. package/src/commands/log.ts +171 -71
  42. package/src/commands/mail.test.ts +162 -0
  43. package/src/commands/mail.ts +64 -9
  44. package/src/commands/merge.test.ts +230 -1
  45. package/src/commands/merge.ts +68 -12
  46. package/src/commands/nudge.test.ts +351 -4
  47. package/src/commands/nudge.ts +356 -34
  48. package/src/commands/run.test.ts +43 -7
  49. package/src/commands/serve/build.test.ts +202 -0
  50. package/src/commands/serve/build.ts +206 -0
  51. package/src/commands/serve/coordinator-actions.test.ts +339 -0
  52. package/src/commands/serve/coordinator-actions.ts +408 -0
  53. package/src/commands/serve/dev.test.ts +168 -0
  54. package/src/commands/serve/dev.ts +117 -0
  55. package/src/commands/serve/mail-actions.test.ts +312 -0
  56. package/src/commands/serve/mail-actions.ts +167 -0
  57. package/src/commands/serve/rest.test.ts +1323 -0
  58. package/src/commands/serve/rest.ts +708 -0
  59. package/src/commands/serve/static.ts +51 -0
  60. package/src/commands/serve/ws.test.ts +361 -0
  61. package/src/commands/serve/ws.ts +332 -0
  62. package/src/commands/serve.test.ts +459 -0
  63. package/src/commands/serve.ts +565 -0
  64. package/src/commands/sling.test.ts +177 -1
  65. package/src/commands/sling.ts +243 -71
  66. package/src/commands/status.test.ts +9 -0
  67. package/src/commands/status.ts +12 -4
  68. package/src/commands/stop.test.ts +255 -1
  69. package/src/commands/stop.ts +107 -8
  70. package/src/commands/watch.test.ts +43 -0
  71. package/src/commands/watch.ts +153 -28
  72. package/src/config.ts +23 -0
  73. package/src/doctor/consistency.test.ts +106 -0
  74. package/src/doctor/consistency.ts +48 -1
  75. package/src/doctor/serve.test.ts +95 -0
  76. package/src/doctor/serve.ts +86 -0
  77. package/src/doctor/types.ts +2 -1
  78. package/src/doctor/watchdog.ts +57 -1
  79. package/src/events/tailer.test.ts +234 -1
  80. package/src/events/tailer.ts +90 -0
  81. package/src/index.ts +57 -6
  82. package/src/insights/quality-gates.test.ts +141 -0
  83. package/src/insights/quality-gates.ts +156 -0
  84. package/src/json.ts +29 -0
  85. package/src/logging/theme.ts +4 -0
  86. package/src/mail/client.ts +15 -2
  87. package/src/mail/store.test.ts +82 -0
  88. package/src/mail/store.ts +41 -4
  89. package/src/merge/lock.test.ts +149 -0
  90. package/src/merge/lock.ts +140 -0
  91. package/src/merge/predict.test.ts +387 -0
  92. package/src/merge/predict.ts +249 -0
  93. package/src/merge/resolver.ts +1 -1
  94. package/src/mulch/client.ts +3 -3
  95. package/src/runtimes/__fixtures__/claude-stream-fixture.ts +22 -0
  96. package/src/runtimes/claude.test.ts +791 -1
  97. package/src/runtimes/claude.ts +323 -1
  98. package/src/runtimes/connections.test.ts +141 -1
  99. package/src/runtimes/connections.ts +73 -4
  100. package/src/runtimes/headless-connection.test.ts +264 -0
  101. package/src/runtimes/headless-connection.ts +158 -0
  102. package/src/runtimes/types.ts +10 -0
  103. package/src/schema-consistency.test.ts +1 -0
  104. package/src/sessions/store.test.ts +657 -29
  105. package/src/sessions/store.ts +286 -23
  106. package/src/test-setup.test.ts +31 -0
  107. package/src/test-setup.ts +28 -0
  108. package/src/types.ts +107 -2
  109. package/src/utils/pid.test.ts +85 -1
  110. package/src/utils/pid.ts +86 -1
  111. package/src/utils/process-scan.test.ts +53 -0
  112. package/src/utils/process-scan.ts +76 -0
  113. package/src/watchdog/daemon.test.ts +1607 -376
  114. package/src/watchdog/daemon.ts +462 -88
  115. package/src/watchdog/health.test.ts +282 -0
  116. package/src/watchdog/health.ts +126 -27
  117. package/src/worktree/manager.test.ts +218 -1
  118. package/src/worktree/manager.ts +55 -0
  119. package/src/worktree/process.test.ts +71 -0
  120. package/src/worktree/process.ts +25 -5
  121. package/src/worktree/tmux.test.ts +28 -0
  122. package/src/worktree/tmux.ts +27 -3
  123. package/templates/CLAUDE.md.tmpl +19 -8
  124. package/templates/overlay.md.tmpl +5 -2
package/src/types.ts CHANGED
@@ -108,6 +108,7 @@ export interface OverstoryConfig {
108
108
  rpcTimeoutMs?: number; // Timeout for RPC getState() calls (default 5_000)
109
109
  triageTimeoutMs?: number; // Timeout for Tier 1 AI triage calls (default 30_000)
110
110
  maxEscalationLevel?: number; // Maximum escalation level before termination (default 3)
111
+ notifyParentOnDeath?: boolean; // Send synthetic worker_died mail to parent on watchdog termination (default true)
111
112
  };
112
113
  models: Partial<Record<string, ModelRef>>;
113
114
  logging: {
@@ -141,6 +142,13 @@ export interface OverstoryConfig {
141
142
  * Default: 0 (no delay).
142
143
  */
143
144
  shellInitDelayMs?: number;
145
+ /**
146
+ * Project-level default for spawning Claude Code agents in headless mode
147
+ * (Bun.spawn + stream-json) instead of the tmux interactive runtime.
148
+ * Per-spawn `--headless` / `--no-headless` flags on `ov sling` override this.
149
+ * Default: false (tmux).
150
+ */
151
+ claudeHeadlessByDefault?: boolean;
144
152
  };
145
153
  }
146
154
 
@@ -179,7 +187,49 @@ export type Capability = (typeof SUPPORTED_CAPABILITIES)[number];
179
187
 
180
188
  // === Agent Session ===
181
189
 
182
- export type AgentState = "booting" | "working" | "completed" | "stalled" | "zombie";
190
+ /**
191
+ * Agent lifecycle states.
192
+ *
193
+ * `in_turn` and `between_turns` are spawn-per-turn-specific substates that
194
+ * split the legacy `working` state so the UI can distinguish a worker actively
195
+ * executing a turn from one idling between mail batches (overstory-3087):
196
+ *
197
+ * - `in_turn`: the turn-runner has observed at least one parser event from
198
+ * a live claude subprocess. The agent is mid-execution.
199
+ * - `between_turns`: the turn-runner finished a turn without a terminal
200
+ * mail; the agent still counts as alive (its process is gone, but the
201
+ * session stays pinned) and waits for the next mail batch to spawn a fresh turn.
202
+ *
203
+ * `working` remains the active state for tmux/long-lived headless agents
204
+ * (coordinator, orchestrator, monitor, sapling) which have no per-turn
205
+ * boundary. Spawn-per-turn workers (builder/scout/reviewer/lead/merger
206
+ * under the headless default) transition through `in_turn` ↔ `between_turns`
207
+ * instead.
208
+ */
209
+ export type AgentState =
210
+ | "booting"
211
+ | "working"
212
+ | "in_turn"
213
+ | "between_turns"
214
+ | "completed"
215
+ | "stalled"
216
+ | "zombie";
217
+
218
+ /**
219
+ * Result of a guarded state transition attempt (`SessionStore.tryTransitionState`).
220
+ *
221
+ * Discriminated by `ok`. When `ok` is false, `reason` distinguishes:
222
+ * - `not_found`: no session exists for the given name (this variant carries no `prev`).
223
+ * - `illegal_transition`: a session exists but the matrix forbids prev → attempted.
224
+ *
225
+ * `prev`, when present, is the state observed by the SQL CAS. For `illegal_transition` it
226
+ * is the state that blocked the write (which may differ from what the caller read,
227
+ * if another writer landed first).
228
+ */
229
+ export type TransitionOutcome =
230
+ | { ok: true; prev: AgentState; next: AgentState }
231
+ | { ok: false; reason: "not_found"; attempted: AgentState }
232
+ | { ok: false; reason: "illegal_transition"; prev: AgentState; attempted: AgentState };
183
233
 
184
234
  export interface AgentSession {
185
235
  id: string; // Unique session ID
@@ -200,6 +250,7 @@ export interface AgentSession {
200
250
  stalledSince: string | null; // ISO timestamp when agent first entered stalled state
201
251
  transcriptPath: string | null; // Runtime-provided transcript JSONL path (decoupled from ~/.claude/)
202
252
  promptVersion?: string | null; // Canopy prompt version used at sling time (e.g. "builder@17")
253
+ claudeSessionId?: string | null; // Runtime-provided session_id (Claude stream-json), eagerly pinned on first event
203
254
  }
204
255
 
205
256
  // === Agent Identity ===
@@ -225,6 +276,7 @@ export type MailSemanticType = "status" | "question" | "result" | "error";
225
276
  /** Protocol message types for structured agent coordination. */
226
277
  export type MailProtocolType =
227
278
  | "worker_done"
279
+ | "worker_died"
228
280
  | "merge_ready"
229
281
  | "merged"
230
282
  | "merge_failed"
@@ -244,6 +296,7 @@ export const MAIL_MESSAGE_TYPES: readonly MailMessageType[] = [
244
296
  "result",
245
297
  "error",
246
298
  "worker_done",
299
+ "worker_died",
247
300
  "merge_ready",
248
301
  "merged",
249
302
  "merge_failed",
@@ -278,6 +331,33 @@ export interface WorkerDonePayload {
278
331
  filesModified: string[];
279
332
  }
280
333
 
334
+ /**
335
+ * Signals the parent that one of its children was terminated; `terminatedBy` names the source (watchdog tier or per-turn runner).
336
+ *
337
+ * Synthetic mail, e.g. injected by the Tier 0 daemon when it transitions a worker
338
+ * to `zombie` (overstory-c111). Without this, the parent — typically a lead
339
+ * waiting for `worker_done` from this child — would block indefinitely on
340
+ * mail that will never arrive. The parent reads this on its next mail-injector
341
+ * tick and decides whether to retry, escalate, or report up.
342
+ */
343
+ export interface WorkerDiedPayload {
344
+ agentName: string; // presumably the terminated child's agent name — confirm against the sender
345
+ capability: string; // presumably the child's capability (e.g. builder/scout) — confirm against the sender
346
+ taskId: string; // presumably the task the child was working on — confirm against the sender
347
+ /** Reason the watchdog or runner terminated the child (e.g. "Process terminated"). */
348
+ reason: string;
349
+ /** ISO timestamp of the child's last observed activity. */
350
+ lastActivity: string;
351
+ /**
352
+ * Source that detected the failure.
353
+ * - `tier0`/`tier1`: watchdog daemon detected a dead/stuck process out-of-band.
354
+ * - `runner`: the per-turn runner observed an in-band failure — either an
355
+ * abort/stall that forced SIGTERM/SIGKILL, or a clean exit without the
356
+ * capability's terminal mail (silent-no-op, overstory-4159 / overstory-c772).
357
+ */
358
+ terminatedBy: "tier0" | "tier1" | "runner";
359
+ }
360
+
281
361
  /** Supervisor signals branch is verified and ready for merge. */
282
362
  export interface MergeReadyPayload {
283
363
  branch: string;
@@ -349,6 +429,7 @@ export interface DecisionGatePayload {
349
429
  /** Maps protocol message types to their payload interfaces. */
350
430
  export interface MailPayloadMap {
351
431
  worker_done: WorkerDonePayload;
432
+ worker_died: WorkerDiedPayload;
352
433
  merge_ready: MergeReadyPayload;
353
434
  merged: MergedPayload;
354
435
  merge_failed: MergeFailedPayload;
@@ -391,6 +472,13 @@ export interface OverlayConfig {
391
472
  qualityGates?: QualityGate[];
392
473
  /** Relative path to the instruction file within the worktree (runtime-specific). Defaults to .claude/CLAUDE.md. */
393
474
  instructionPath?: string;
475
+ /**
476
+ * Names of sibling agents dispatched in parallel that may share file scope
477
+ * with this agent. When set, the overlay renders a "Parallel Siblings"
478
+ * section with rebase-before-merge_ready guidance (overstory-f76a). Empty
479
+ * or unset → no overlay section.
480
+ */
481
+ siblings?: string[];
394
482
  }
395
483
 
396
484
  // === Merge Queue ===
@@ -436,6 +524,23 @@ export interface ConflictHistory {
436
524
  predictedConflictFiles: string[];
437
525
  }
438
526
 
527
+ /**
528
+ * Side-effect-free prediction of how `ov merge` would resolve a branch.
529
+ * Produced by `predictConflicts` (src/merge/predict.ts) without touching HEAD,
530
+ * the working tree, or the merge lock — surfaced via `ov merge --dry-run` so a
531
+ * lead/operator/greenhouse can branch on `wouldRequireAgent`.
532
+ */
533
+ export interface ConflictPrediction {
534
+ /** The tier `ov merge` would land in if invoked now. */
535
+ predictedTier: ResolutionTier;
536
+ /** Files predicted to conflict; empty when the merge is predicted to be clean. */
537
+ conflictFiles: string[];
538
+ /** True iff `predictedTier` is "ai-resolve" or "reimagine" (Tier 3+). */
539
+ wouldRequireAgent: boolean;
540
+ /** Short, operator-readable explanation for the predicted tier. */
541
+ reason: string;
542
+ }
543
+
439
544
  // === Watchdog ===
440
545
 
441
546
  export interface HealthCheck {
@@ -446,7 +551,7 @@ export interface HealthCheck {
446
551
  pidAlive: boolean | null; // null when pid is unavailable
447
552
  lastActivity: string;
448
553
  state: AgentState;
449
- action: "none" | "escalate" | "terminate" | "investigate";
554
+ action: "none" | "escalate" | "terminate" | "investigate" | "complete";
450
555
  /** Describes any conflict between observable state and recorded state. */
451
556
  reconciliationNote: string | null;
452
557
  }
@@ -3,7 +3,7 @@ import { mkdtemp } from "node:fs/promises";
3
3
  import { tmpdir } from "node:os";
4
4
  import { join } from "node:path";
5
5
  import { cleanupTempDir } from "../test-helpers.ts";
6
- import { readPidFile, removePidFile, writePidFile } from "./pid.ts";
6
+ import { acquirePidLock, readPidFile, removePidFile, writePidFile } from "./pid.ts";
7
7
 
8
8
  let tempDir: string;
9
9
 
@@ -66,3 +66,87 @@ describe("removePidFile", () => {
66
66
  // No throw = pass
67
67
  });
68
68
  });
69
+
70
+ describe("acquirePidLock", () => { // Covers the acquire / refuse / reclaim / race paths against lock files under tempDir.
71
+ const alwaysAlive = (_pid: number) => true; // isAlive stub: every PID is reported as running
72
+ const alwaysDead = (_pid: number) => false; // isAlive stub: every PID is reported as gone
73
+
74
+ test("acquires when no lock file exists", async () => {
75
+ const pidPath = join(tempDir, "lock.pid");
76
+ const result = await acquirePidLock(pidPath, 1234, alwaysAlive);
77
+ expect(result.acquired).toBe(true);
78
+ expect(await readPidFile(pidPath)).toBe(1234);
79
+ });
80
+
81
+ test("creates parent directory if missing", async () => {
82
+ const pidPath = join(tempDir, "nested", "deeper", "lock.pid");
83
+ const result = await acquirePidLock(pidPath, 555, alwaysAlive);
84
+ expect(result.acquired).toBe(true);
85
+ expect(await readPidFile(pidPath)).toBe(555);
86
+ });
87
+
88
+ test("refuses when a live foreign PID owns the lock", async () => {
89
+ const pidPath = join(tempDir, "lock.pid");
90
+ await Bun.write(pidPath, "9999\n");
91
+ const result = await acquirePidLock(pidPath, 1234, alwaysAlive);
92
+ expect(result.acquired).toBe(false);
93
+ if (!result.acquired) {
94
+ expect(result.existingPid).toBe(9999);
95
+ }
96
+ // The refused call must leave the existing owner's lock file untouched.
97
+ expect(await readPidFile(pidPath)).toBe(9999);
98
+ });
99
+
100
+ test("idempotent when file already contains caller's own PID", async () => {
101
+ const pidPath = join(tempDir, "lock.pid");
102
+ await Bun.write(pidPath, "1234\n");
103
+ // alwaysAlive would say 1234 is alive, but acquirePidLock should detect
104
+ // own-PID first and accept.
105
+ const result = await acquirePidLock(pidPath, 1234, alwaysAlive);
106
+ expect(result.acquired).toBe(true);
107
+ expect(await readPidFile(pidPath)).toBe(1234);
108
+ });
109
+
110
+ test("reclaims stale lock with dead PID", async () => {
111
+ const pidPath = join(tempDir, "lock.pid");
112
+ await Bun.write(pidPath, "9999\n");
113
+ const result = await acquirePidLock(pidPath, 1234, alwaysDead);
114
+ expect(result.acquired).toBe(true);
115
+ expect(await readPidFile(pidPath)).toBe(1234);
116
+ });
117
+
118
+ test("reclaims unreadable/corrupted lock file", async () => {
119
+ const pidPath = join(tempDir, "lock.pid");
120
+ await Bun.write(pidPath, "garbage-not-a-pid\n");
121
+ const result = await acquirePidLock(pidPath, 1234, alwaysAlive); // alwaysAlive on purpose: a file with no parseable PID must be reclaimed regardless of liveness
122
+ expect(result.acquired).toBe(true);
123
+ expect(await readPidFile(pidPath)).toBe(1234);
124
+ });
125
+
126
+ test("two simultaneous acquirers — only one wins", async () => {
127
+ const pidPath = join(tempDir, "lock.pid");
128
+ const [a, b] = await Promise.all([ // both calls are started before either is awaited
129
+ acquirePidLock(pidPath, 1111, alwaysAlive),
130
+ acquirePidLock(pidPath, 2222, alwaysAlive),
131
+ ]);
132
+ const winners = [a, b].filter((r) => r.acquired);
133
+ const losers = [a, b].filter((r) => !r.acquired);
134
+ expect(winners.length).toBe(1);
135
+ expect(losers.length).toBe(1);
136
+ const loser = losers[0];
137
+ if (loser && !loser.acquired) {
138
+ expect([1111, 2222]).toContain(loser.existingPid);
139
+ }
140
+ });
141
+
142
+ test("two simultaneous acquirers — file content matches the winner", async () => {
143
+ const pidPath = join(tempDir, "lock.pid");
144
+ const [a, b] = await Promise.all([
145
+ acquirePidLock(pidPath, 1111, alwaysAlive),
146
+ acquirePidLock(pidPath, 2222, alwaysAlive),
147
+ ]);
148
+ const fileContent = await readPidFile(pidPath);
149
+ const winnerPid = a.acquired ? 1111 : b.acquired ? 2222 : -1; // -1 only if neither call acquired, which then fails the assertion below
150
+ expect(fileContent).toBe(winnerPid);
151
+ });
152
+ });
package/src/utils/pid.ts CHANGED
@@ -1,7 +1,9 @@
1
1
  /**
2
2
  * PID file management for daemon processes.
3
3
  */
4
- import { unlink } from "node:fs/promises";
4
+ import { randomUUID } from "node:crypto";
5
+ import { link, mkdir, unlink, writeFile } from "node:fs/promises";
6
+ import { dirname } from "node:path";
5
7
 
6
8
  /**
7
9
  * Read the PID from a PID file.
@@ -43,3 +45,86 @@ export async function removePidFile(pidFilePath: string): Promise<void> {
43
45
  // File may already be gone — not an error
44
46
  }
45
47
  }
48
+
49
/**
 * Result of acquirePidLock.
 *
 * `acquired: true` — caller owns the lock and is responsible for removing the
 * PID file on shutdown.
 *
 * `acquired: false` — the lock was not obtained; caller must not start.
 * `existingPid` is the PID read from the lock file: either a live foreign
 * owner, or whatever PID was on disk once both attempts were used up.
 * `existingPid === -1` means no PID could be read back at that point.
 */
export type AcquirePidLockResult = { acquired: true } | { acquired: false; existingPid: number };

/**
 * Acquire a PID-file lock, creating the lock file atomically.
 *
 * Uses the write-temp-then-link pattern so the lock file appears at its final
 * path with PID contents already present (no empty-file window): a competing
 * reader can never observe an in-flight write. Behavior:
 *
 * - Lock file does not exist → atomic create via link(). Caller owns the lock.
 * - Lock file exists, contains the caller's own PID → idempotent acquire
 *   (caller already owns it; e.g. background-mode parent wrote child.pid
 *   before spawn).
 * - Lock file exists with a live foreign PID → refuse; return existingPid.
 * - Lock file exists with a dead PID (or unreadable) → reclaim by unlinking
 *   and retrying once. If the lock path is still occupied after the retry,
 *   the call returns acquired=false with whatever PID is on disk (or -1).
 *
 * Parent directory is created if missing (matches the implicit Bun.write
 * behavior the legacy writePidFile relied on).
 *
 * NOTE(review): only the create step is atomic. The stale-lock reclaim reads
 * the PID and then unlinks by path, so a competitor that replaces the stale
 * file in between could have its fresh lock removed — confirm whether callers
 * need a stronger guarantee.
 *
 * @param pidFilePath - Final path of the lock file.
 * @param pid - PID to record as the lock owner.
 * @param isAlive - Liveness probe for a PID found in an existing lock file.
 * @returns Whether the lock was acquired, and the PID on disk when it was not.
 * @throws Any filesystem error other than `EEXIST` from `link()`, plus errors
 *   from creating the directory or staging the temp file.
 */
export async function acquirePidLock(
  pidFilePath: string,
  pid: number,
  isAlive: (pid: number) => boolean,
): Promise<AcquirePidLockResult> {
  await mkdir(dirname(pidFilePath), { recursive: true });

  // Unique staging path: the PID plus a random UUID keeps concurrent callers apart.
  const tempPath = `${pidFilePath}.tmp.${pid}.${randomUUID()}`;

  try {
    // Staged inside the try so a failed or partial write is still cleaned up
    // by the finally block instead of leaking a `.tmp.*` file next to the lock.
    await writeFile(tempPath, `${pid}\n`);

    // Two attempts: first try, then one stale-lock reclaim retry.
    for (let attempt = 0; attempt < 2; attempt++) {
      try {
        // link() fails with EEXIST when the lock path is already taken, which
        // makes creation all-or-nothing.
        await link(tempPath, pidFilePath);
        return { acquired: true };
      } catch (err: unknown) {
        const code = (err as NodeJS.ErrnoException | undefined)?.code;
        if (code !== "EEXIST") {
          throw err;
        }
      }

      const existing = await readPidFile(pidFilePath);
      if (existing === pid) {
        // Idempotent: caller already owns it (parent pre-wrote child PID).
        return { acquired: true };
      }
      if (existing !== null && isAlive(existing)) {
        return { acquired: false, existingPid: existing };
      }
      // Stale (dead owner) or unreadable/corrupted: drop it and retry.
      await removePidFile(pidFilePath);
    }

    // Both attempts hit an occupied lock path. Report what is on disk now;
    // -1 when nothing readable is left.
    const existing = await readPidFile(pidFilePath);
    return { acquired: false, existingPid: existing ?? -1 };
  } finally {
    // Drop the temp inode link (lock path retains the data via the second link).
    await unlink(tempPath).catch(() => {});
  }
}
@@ -0,0 +1,53 @@
1
+ import { describe, expect, test } from "bun:test";
2
+ import { findRunningWatchdogProcesses } from "./process-scan.ts";
3
+
4
+ describe("findRunningWatchdogProcesses", () => { // Smoke tests against the host's real process table; results are environment-dependent.
5
+ test("returns an array (does not throw)", async () => {
6
+ const results = await findRunningWatchdogProcesses();
7
+ expect(Array.isArray(results)).toBe(true);
8
+ // We can't assert specifics — depends on what's running on the host —
9
+ // but each entry should have a numeric pid and string command.
10
+ for (const proc of results) {
11
+ expect(typeof proc.pid).toBe("number");
12
+ expect(proc.pid).toBeGreaterThan(0);
13
+ expect(typeof proc.command).toBe("string");
14
+ }
15
+ });
16
+
17
+ test("excludes own process even if command matches", async () => {
18
+ // The test process itself runs `bun test ...` not `ov watch`, so it
19
+ // would not match anyway. But we still verify own-pid is filtered out
20
+ // by checking no result has our PID.
21
+ const results = await findRunningWatchdogProcesses();
22
+ const ownPid = process.pid;
23
+ for (const proc of results) {
24
+ expect(proc.pid).not.toBe(ownPid);
25
+ }
26
+ });
27
+
28
+ test("matches `ov watch` and `bun run ov watch` invocations", async () => {
29
+ // Spawn a sleeper whose command line contains the `ov watch` substring,
30
+ // then verify the scanner finds it. We use `sh -c` so the argv string
31
+ // passed to ps contains our marker tokens.
32
+ const sleeper = Bun.spawn(["sh", "-c", "exec -a 'bun run ov watch' sleep 30"], {
33
+ stdout: "ignore",
34
+ stderr: "ignore",
35
+ });
36
+ try {
37
+ // Give ps a moment to see the new process.
38
+ await Bun.sleep(150);
39
+ const results = await findRunningWatchdogProcesses();
40
+ const found = results.find((p) => p.pid === sleeper.pid);
41
+ // On macOS BSD ps, `exec -a` may or may not change the displayed
42
+ // argv depending on shell version. We accept either: if the
43
+ // command is detected, it must look right; if not, we don't fail
44
+ // the test (env-dependent). NOTE(review): `exec -a` is a shell extension rather than POSIX sh — where `sh` lacks it the sleeper presumably exits at once and nothing is asserted; confirm on the CI image.
45
+ if (found) {
46
+ expect(found.command).toMatch(/\b(ov|overstory)\b.*\bwatch\b/);
47
+ }
48
+ } finally {
49
+ sleeper.kill("SIGTERM"); // always reap the sleeper, even when an expectation above throws
50
+ await sleeper.exited.catch(() => {});
51
+ }
52
+ });
53
+ });
@@ -0,0 +1,76 @@
1
+ /**
2
+ * Process-table scanning helpers.
3
+ *
4
+ * Used to detect runaway daemon processes that are not tracked by a PID file —
5
+ * for example, the multi-`ov watch` situation observed on 2026-04-30 where
6
+ * three concurrent watchdogs were running because earlier releases had no
7
+ * PID-file exclusion lock.
8
+ *
9
+ * Implementation note: `ps` is used directly because we only need to find
10
+ * processes by command-line substring, and Bun has no built-in process-table
11
+ * API. The `ps -o pid=,command=` form is portable across macOS (BSD) and
12
+ * Linux (procps) for the columns we read.
13
+ */
14
+
15
export interface WatchdogProcess {
  /** Process ID parsed from the first column of the `ps` output. */
  pid: number;
  /** The full command line as reported by `ps`. */
  command: string;
}

/**
 * Find running processes that look like an `ov watch` daemon.
 *
 * Runs `ps -A -o pid=,command=`, keeps the rows whose command line is accepted
 * by `isWatchdogCommand`, and excludes the current process so callers do not
 * accidentally treat themselves as a foreign daemon.
 *
 * Returns an empty list if `ps` is unavailable or fails — callers must not
 * rely on this for correctness, only for diagnostics and `--kill-others`.
 *
 * @returns One entry per matching process; empty when `ps` cannot be spawned,
 *   exits non-zero, or its output cannot be read.
 */
export async function findRunningWatchdogProcesses(): Promise<WatchdogProcess[]> {
  let text: string;
  try {
    const proc = Bun.spawn(["ps", "-A", "-o", "pid=,command="], {
      stdout: "pipe",
      stderr: "ignore",
    });
    // Drain stdout while waiting for exit. Waiting for exit first can hang:
    // once the pipe buffer fills, `ps` blocks on write and never exits.
    const [exitCode, stdout] = await Promise.all([
      proc.exited,
      new Response(proc.stdout).text(),
    ]);
    if (exitCode !== 0) {
      return [];
    }
    text = stdout;
  } catch {
    // Spawn failure (e.g. no `ps` on PATH) or a broken pipe: honor the
    // documented best-effort contract instead of throwing.
    return [];
  }

  const ownPid = process.pid;
  const out: WatchdogProcess[] = [];

  for (const rawLine of text.split("\n")) {
    const line = rawLine.trim();
    if (line === "") continue;

    // Each row is `<pid> <command line>`. Leading padding was removed by
    // trim(); the PID is separated from the command by one or more
    // whitespace characters.
    const match = line.match(/^(\d+)\s+(.+)$/);
    if (!match) continue;
    const pidStr = match[1];
    const command = match[2];
    if (pidStr === undefined || command === undefined) continue;
    const pid = Number.parseInt(pidStr, 10);
    if (!Number.isFinite(pid) || pid <= 0) continue;
    if (pid === ownPid) continue;

    // Match the spawn form: `bun run /path/to/ov watch`. We also tolerate
    // direct invocation `overstory watch` and `ov watch`.
    if (!isWatchdogCommand(command)) continue;

    out.push({ pid, command });
  }

  return out;
}
69
+
70
/**
 * Pattern for an `ov watch` style command line: an `ov` or `overstory` word
 * followed, later on the same line, by `watch` as a standalone
 * whitespace-delimited argument.
 */
const WATCHDOG_COMMAND_PATTERN = /\b(?:ov|overstory)\b[^\n]*[ \t]watch(?=[ \t]|$)/;

/**
 * Decide whether a `ps` command line looks like an `ov watch` daemon.
 *
 * `watch` must be its own argument, not merely a word inside another token.
 * This rejects `watch ov.log` (no preceding `ov`/`overstory`) as well as
 * `tail -f .overstory/logs/watch.log` and `vim ov-watch-notes.md`, where
 * `watch` is only part of a path or file name.
 *
 * @param command - Full command line of one process.
 * @returns True when the command line matches the watchdog invocation form.
 */
function isWatchdogCommand(command: string): boolean {
  return WATCHDOG_COMMAND_PATTERN.test(command);
}