npm - @os-eco/overstory-cli - Versions diffs - 0.9.4 → 0.11.0 - Mend

@os-eco/overstory-cli 0.9.4 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (124) hide show

package/README.md +50 -19
package/agents/builder.md +19 -9
package/agents/coordinator.md +6 -6
package/agents/lead.md +204 -87
package/agents/merger.md +25 -14
package/agents/reviewer.md +22 -16
package/agents/scout.md +17 -12
package/package.json +6 -3
package/src/agents/capabilities.test.ts +85 -0
package/src/agents/capabilities.ts +125 -0
package/src/agents/headless-mail-injector.test.ts +448 -0
package/src/agents/headless-mail-injector.ts +219 -0
package/src/agents/headless-prompt.test.ts +102 -0
package/src/agents/headless-prompt.ts +68 -0
package/src/agents/hooks-deployer.test.ts +514 -14
package/src/agents/hooks-deployer.ts +141 -0
package/src/agents/mail-poll-detect.test.ts +153 -0
package/src/agents/mail-poll-detect.ts +73 -0
package/src/agents/overlay.test.ts +60 -4
package/src/agents/overlay.ts +63 -8
package/src/agents/scope-detect.test.ts +190 -0
package/src/agents/scope-detect.ts +146 -0
package/src/agents/turn-lock.test.ts +181 -0
package/src/agents/turn-lock.ts +235 -0
package/src/agents/turn-runner-dispatch.test.ts +182 -0
package/src/agents/turn-runner-dispatch.ts +105 -0
package/src/agents/turn-runner.test.ts +2312 -0
package/src/agents/turn-runner.ts +1383 -0
package/src/commands/agents.ts +9 -0
package/src/commands/clean.ts +54 -0
package/src/commands/coordinator.test.ts +254 -0
package/src/commands/coordinator.ts +273 -8
package/src/commands/dashboard.test.ts +188 -0
package/src/commands/dashboard.ts +14 -4
package/src/commands/doctor.ts +3 -1
package/src/commands/group.test.ts +94 -0
package/src/commands/group.ts +49 -20
package/src/commands/init.test.ts +8 -0
package/src/commands/init.ts +8 -1
package/src/commands/log.test.ts +187 -11
package/src/commands/log.ts +171 -71
package/src/commands/mail.test.ts +162 -0
package/src/commands/mail.ts +64 -9
package/src/commands/merge.test.ts +230 -1
package/src/commands/merge.ts +68 -12
package/src/commands/nudge.test.ts +351 -4
package/src/commands/nudge.ts +356 -34
package/src/commands/run.test.ts +43 -7
package/src/commands/serve/build.test.ts +202 -0
package/src/commands/serve/build.ts +206 -0
package/src/commands/serve/coordinator-actions.test.ts +339 -0
package/src/commands/serve/coordinator-actions.ts +408 -0
package/src/commands/serve/dev.test.ts +168 -0
package/src/commands/serve/dev.ts +117 -0
package/src/commands/serve/mail-actions.test.ts +312 -0
package/src/commands/serve/mail-actions.ts +167 -0
package/src/commands/serve/rest.test.ts +1323 -0
package/src/commands/serve/rest.ts +708 -0
package/src/commands/serve/static.ts +51 -0
package/src/commands/serve/ws.test.ts +361 -0
package/src/commands/serve/ws.ts +332 -0
package/src/commands/serve.test.ts +459 -0
package/src/commands/serve.ts +565 -0
package/src/commands/sling.test.ts +177 -1
package/src/commands/sling.ts +243 -71
package/src/commands/status.test.ts +9 -0
package/src/commands/status.ts +12 -4
package/src/commands/stop.test.ts +255 -1
package/src/commands/stop.ts +107 -8
package/src/commands/watch.test.ts +43 -0
package/src/commands/watch.ts +153 -28
package/src/config.ts +23 -0
package/src/doctor/consistency.test.ts +106 -0
package/src/doctor/consistency.ts +48 -1
package/src/doctor/serve.test.ts +95 -0
package/src/doctor/serve.ts +86 -0
package/src/doctor/types.ts +2 -1
package/src/doctor/watchdog.ts +57 -1
package/src/events/tailer.test.ts +234 -1
package/src/events/tailer.ts +90 -0
package/src/index.ts +57 -6
package/src/insights/quality-gates.test.ts +141 -0
package/src/insights/quality-gates.ts +156 -0
package/src/json.ts +29 -0
package/src/logging/theme.ts +4 -0
package/src/mail/client.ts +15 -2
package/src/mail/store.test.ts +82 -0
package/src/mail/store.ts +41 -4
package/src/merge/lock.test.ts +149 -0
package/src/merge/lock.ts +140 -0
package/src/merge/predict.test.ts +387 -0
package/src/merge/predict.ts +249 -0
package/src/merge/resolver.ts +1 -1
package/src/mulch/client.ts +3 -3
package/src/runtimes/__fixtures__/claude-stream-fixture.ts +22 -0
package/src/runtimes/claude.test.ts +791 -1
package/src/runtimes/claude.ts +323 -1
package/src/runtimes/connections.test.ts +141 -1
package/src/runtimes/connections.ts +73 -4
package/src/runtimes/headless-connection.test.ts +264 -0
package/src/runtimes/headless-connection.ts +158 -0
package/src/runtimes/types.ts +10 -0
package/src/schema-consistency.test.ts +1 -0
package/src/sessions/store.test.ts +657 -29
package/src/sessions/store.ts +286 -23
package/src/test-setup.test.ts +31 -0
package/src/test-setup.ts +28 -0
package/src/types.ts +107 -2
package/src/utils/pid.test.ts +85 -1
package/src/utils/pid.ts +86 -1
package/src/utils/process-scan.test.ts +53 -0
package/src/utils/process-scan.ts +76 -0
package/src/watchdog/daemon.test.ts +1607 -376
package/src/watchdog/daemon.ts +462 -88
package/src/watchdog/health.test.ts +282 -0
package/src/watchdog/health.ts +126 -27
package/src/worktree/manager.test.ts +218 -1
package/src/worktree/manager.ts +55 -0
package/src/worktree/process.test.ts +71 -0
package/src/worktree/process.ts +25 -5
package/src/worktree/tmux.test.ts +28 -0
package/src/worktree/tmux.ts +27 -3
package/templates/CLAUDE.md.tmpl +19 -8
package/templates/overlay.md.tmpl +5 -2

package/src/types.ts CHANGED Viewed

@@ -108,6 +108,7 @@ export interface OverstoryConfig {
 		rpcTimeoutMs?: number; // Timeout for RPC getState() calls (default 5_000)
 		triageTimeoutMs?: number; // Timeout for Tier 1 AI triage calls (default 30_000)
 		maxEscalationLevel?: number; // Maximum escalation level before termination (default 3)
+		notifyParentOnDeath?: boolean; // Send synthetic worker_died mail to parent on watchdog termination (default true)
 	};
 	models: Partial<Record<string, ModelRef>>;
 	logging: {
@@ -141,6 +142,13 @@ export interface OverstoryConfig {
 		 * Default: 0 (no delay).
 		 */
 		shellInitDelayMs?: number;
+		/**
+		 * Project-level default for spawning Claude Code agents in headless mode
+		 * (Bun.spawn + stream-json) instead of the tmux interactive runtime.
+		 * Per-spawn `--headless` / `--no-headless` flags on `ov sling` override this.
+		 * Default: false (tmux).
+		 */
+		claudeHeadlessByDefault?: boolean;
 	};
 }
@@ -179,7 +187,49 @@ export type Capability = (typeof SUPPORTED_CAPABILITIES)[number];
 // === Agent Session ===
-export type AgentState = "booting" | "working" | "completed" | "stalled" | "zombie";
+/**
+ * Agent lifecycle states.
+ *
+ * `in_turn` and `between_turns` are spawn-per-turn-specific substates that
+ * split the legacy `working` state so the UI can distinguish a worker actively
+ * executing a turn from one idling between mail batches (overstory-3087):
+ *
+ *   - `in_turn`: the turn-runner has observed at least one parser event from
+ *     a live claude subprocess. The agent is mid-execution.
+ *   - `between_turns`: the turn-runner finished a turn without a terminal
+ *     mail; the agent is alive (process gone, session pinned) and waiting
+ *     for the next mail batch to spawn a fresh turn.
+ *
+ * `working` remains the active state for tmux/long-lived headless agents
+ * (coordinator, orchestrator, monitor, sapling) which have no per-turn
+ * boundary. Spawn-per-turn workers (builder/scout/reviewer/lead/merger
+ * under the headless default) transition through in_turn ↔ between_turns
+ * instead.
+ */
+export type AgentState =
+	| "booting"
+	| "working" // tmux / long-lived headless agents: no per-turn boundary
+	| "in_turn" // spawn-per-turn worker: a live turn has emitted at least one parser event
+	| "between_turns" // spawn-per-turn worker: last turn ended without terminal mail; awaiting next batch
+	| "completed"
+	| "stalled"
+	| "zombie";
+/**
+ * Result of a guarded state transition attempt (`SessionStore.tryTransitionState`).
+ *
+ * Discriminated by `ok`. When `ok` is false, `reason` distinguishes:
+ *   - `not_found`: no session exists for the given name. This variant carries
+ *     no `prev`, since there is no session to observe.
+ *   - `illegal_transition`: a session exists but the matrix forbids prev → attempted.
+ *
+ * Where `prev` is present (success and `illegal_transition`) it is the state
+ * observed by the SQL CAS. For `illegal_transition` it is the state that
+ * blocked the write (which may differ from what the caller read, if another
+ * writer landed first).
+ *
+ * The target state is named `next` on success and `attempted` on failure —
+ * presumably both echo the state the caller requested; confirm against
+ * `SessionStore.tryTransitionState`.
+ */
+export type TransitionOutcome =
+	| { ok: true; prev: AgentState; next: AgentState }
+	| { ok: false; reason: "not_found"; attempted: AgentState }
+	| { ok: false; reason: "illegal_transition"; prev: AgentState; attempted: AgentState };
 export interface AgentSession {
 	id: string; // Unique session ID
@@ -200,6 +250,7 @@ export interface AgentSession {
 	stalledSince: string | null; // ISO timestamp when agent first entered stalled state
 	transcriptPath: string | null; // Runtime-provided transcript JSONL path (decoupled from ~/.claude/)
 	promptVersion?: string | null; // Canopy prompt version used at sling time (e.g. "builder@17")
+	claudeSessionId?: string | null; // Runtime-provided session_id (Claude stream-json), eagerly pinned on first event
 }
 // === Agent Identity ===
@@ -225,6 +276,7 @@ export type MailSemanticType = "status" | "question" | "result" | "error";
 /** Protocol message types for structured agent coordination. */
 export type MailProtocolType =
 	| "worker_done"
+	| "worker_died"
 	| "merge_ready"
 	| "merged"
 	| "merge_failed"
@@ -244,6 +296,7 @@ export const MAIL_MESSAGE_TYPES: readonly MailMessageType[] = [
 	"result",
 	"error",
 	"worker_done",
+	"worker_died",
 	"merge_ready",
 	"merged",
 	"merge_failed",
@@ -278,6 +331,33 @@ export interface WorkerDonePayload {
 	filesModified: string[];
 }
+/**
+ * Signals the parent that one of its children was terminated.
+ *
+ * Synthetic mail injected by the Tier 0 daemon when it transitions a worker
+ * to `zombie` (overstory-c111). Without this, the parent — typically a lead
+ * waiting for `worker_done` from this child — would block indefinitely on
+ * mail that will never arrive. The parent reads this on its next mail-injector
+ * tick and decides whether to retry, escalate, or report up.
+ *
+ * `terminatedBy` also admits `tier1` and `runner`, so the payload is not
+ * exclusive to the Tier 0 daemon: the per-turn runner uses the same shape
+ * for in-band failures (see the field comment below).
+ */
+export interface WorkerDiedPayload {
+	/** Name of the child agent that died. */
+	agentName: string;
+	/** Capability of the dead child — presumably a `Capability` value; typed as plain string here. */
+	capability: string;
+	/** Task the child was working on. */
+	taskId: string;
+	/** Reason the watchdog or runner terminated the child (e.g. "Process terminated"). */
+	reason: string;
+	/** ISO timestamp of the child's last observed activity. */
+	lastActivity: string;
+	/**
+	 * Source that detected the failure.
+	 * - `tier0`/`tier1`: watchdog daemon detected a dead/stuck process out-of-band.
+	 * - `runner`: the per-turn runner observed an in-band failure — either an
+	 *   abort/stall that forced SIGTERM/SIGKILL, or a clean exit without the
+	 *   capability's terminal mail (silent-no-op, overstory-4159 / overstory-c772).
+	 */
+	terminatedBy: "tier0" | "tier1" | "runner";
+}
 /** Supervisor signals branch is verified and ready for merge. */
 export interface MergeReadyPayload {
 	branch: string;
@@ -349,6 +429,7 @@ export interface DecisionGatePayload {
 /** Maps protocol message types to their payload interfaces. */
 export interface MailPayloadMap {
 	worker_done: WorkerDonePayload;
+	worker_died: WorkerDiedPayload;
 	merge_ready: MergeReadyPayload;
 	merged: MergedPayload;
 	merge_failed: MergeFailedPayload;
@@ -391,6 +472,13 @@ export interface OverlayConfig {
 	qualityGates?: QualityGate[];
 	/** Relative path to the instruction file within the worktree (runtime-specific). Defaults to .claude/CLAUDE.md. */
 	instructionPath?: string;
+	/**
+	 * Names of sibling agents dispatched in parallel that may share file scope
+	 * with this agent. When set, the overlay renders a "Parallel Siblings"
+	 * section with rebase-before-merge_ready guidance (overstory-f76a). Empty
+	 * or unset → no overlay section.
+	 */
+	siblings?: string[];
 }
 // === Merge Queue ===
@@ -436,6 +524,23 @@ export interface ConflictHistory {
 	predictedConflictFiles: string[];
 }
+/**
+ * Side-effect-free prediction of how `ov merge` would resolve a branch.
+ * Produced by `predictConflicts` (src/merge/predict.ts) without touching HEAD,
+ * the working tree, or the merge lock — surfaced via `ov merge --dry-run` so a
+ * lead/operator/greenhouse can branch on `wouldRequireAgent`.
+ *
+ * `wouldRequireAgent` is fully determined by `predictedTier`; producers must
+ * keep the two consistent.
+ */
+export interface ConflictPrediction {
+	/** The tier `ov merge` would land in if invoked now. */
+	predictedTier: ResolutionTier;
+	/** Files that would conflict — empty for clean-merge. */
+	conflictFiles: string[];
+	/** True iff predictedTier is "ai-resolve" or "reimagine" (Tier 3+). */
+	wouldRequireAgent: boolean;
+	/** Short, operator-readable explanation for the predicted tier. */
+	reason: string;
+}
 // === Watchdog ===
 export interface HealthCheck {
@@ -446,7 +551,7 @@ export interface HealthCheck {
 	pidAlive: boolean | null; // null when pid is unavailable
 	lastActivity: string;
 	state: AgentState;
-	action: "none" | "escalate" | "terminate" | "investigate";
+	action: "none" | "escalate" | "terminate" | "investigate" | "complete";
 	/** Describes any conflict between observable state and recorded state. */
 	reconciliationNote: string | null;
 }

package/src/utils/pid.test.ts CHANGED Viewed

@@ -3,7 +3,7 @@ import { mkdtemp } from "node:fs/promises";
 import { tmpdir } from "node:os";
 import { join } from "node:path";
 import { cleanupTempDir } from "../test-helpers.ts";
-import { readPidFile, removePidFile, writePidFile } from "./pid.ts";
+import { acquirePidLock, readPidFile, removePidFile, writePidFile } from "./pid.ts";
 let tempDir: string;
@@ -66,3 +66,87 @@ describe("removePidFile", () => {
 		// No throw = pass
 	});
 });
+describe("acquirePidLock", () => {
+	// Injected liveness probes: no real process lookup happens in these tests.
+	const everyPidAlive = (_pid: number): boolean => true;
+	const everyPidDead = (_pid: number): boolean => false;
+	// Resolve a path beneath the per-test temp directory.
+	const pathInTemp = (...segments: string[]): string => join(tempDir, ...segments);
+	test("acquires when no lock file exists", async () => {
+		const lockFile = pathInTemp("lock.pid");
+		const outcome = await acquirePidLock(lockFile, 1234, everyPidAlive);
+		expect(outcome.acquired).toBe(true);
+		expect(await readPidFile(lockFile)).toBe(1234);
+	});
+	test("creates parent directory if missing", async () => {
+		const lockFile = pathInTemp("nested", "deeper", "lock.pid");
+		const outcome = await acquirePidLock(lockFile, 555, everyPidAlive);
+		expect(outcome.acquired).toBe(true);
+		expect(await readPidFile(lockFile)).toBe(555);
+	});
+	test("refuses when a live foreign PID owns the lock", async () => {
+		const lockFile = pathInTemp("lock.pid");
+		await Bun.write(lockFile, "9999\n");
+		const outcome = await acquirePidLock(lockFile, 1234, everyPidAlive);
+		expect(outcome.acquired).toBe(false);
+		if (!outcome.acquired) {
+			expect(outcome.existingPid).toBe(9999);
+		}
+		// The foreign owner's file must be left exactly as it was.
+		expect(await readPidFile(lockFile)).toBe(9999);
+	});
+	test("idempotent when file already contains caller's own PID", async () => {
+		const lockFile = pathInTemp("lock.pid");
+		await Bun.write(lockFile, "1234\n");
+		// The probe would report 1234 as alive; the own-PID check has to win
+		// before liveness is ever consulted.
+		const outcome = await acquirePidLock(lockFile, 1234, everyPidAlive);
+		expect(outcome.acquired).toBe(true);
+		expect(await readPidFile(lockFile)).toBe(1234);
+	});
+	test("reclaims stale lock with dead PID", async () => {
+		const lockFile = pathInTemp("lock.pid");
+		await Bun.write(lockFile, "9999\n");
+		const outcome = await acquirePidLock(lockFile, 1234, everyPidDead);
+		expect(outcome.acquired).toBe(true);
+		expect(await readPidFile(lockFile)).toBe(1234);
+	});
+	test("reclaims unreadable/corrupted lock file", async () => {
+		const lockFile = pathInTemp("lock.pid");
+		await Bun.write(lockFile, "garbage-not-a-pid\n");
+		const outcome = await acquirePidLock(lockFile, 1234, everyPidAlive);
+		expect(outcome.acquired).toBe(true);
+		expect(await readPidFile(lockFile)).toBe(1234);
+	});
+	test("two simultaneous acquirers — only one wins", async () => {
+		const lockFile = pathInTemp("lock.pid");
+		const outcomes = await Promise.all([
+			acquirePidLock(lockFile, 1111, everyPidAlive),
+			acquirePidLock(lockFile, 2222, everyPidAlive),
+		]);
+		const granted = outcomes.filter((o) => o.acquired);
+		const rejected = outcomes.filter((o) => !o.acquired);
+		expect(granted.length).toBe(1);
+		expect(rejected.length).toBe(1);
+		const [rejectedOutcome] = rejected;
+		if (rejectedOutcome && !rejectedOutcome.acquired) {
+			expect([1111, 2222]).toContain(rejectedOutcome.existingPid);
+		}
+	});
+	test("two simultaneous acquirers — file content matches the winner", async () => {
+		const lockFile = pathInTemp("lock.pid");
+		const [first, second] = await Promise.all([
+			acquirePidLock(lockFile, 1111, everyPidAlive),
+			acquirePidLock(lockFile, 2222, everyPidAlive),
+		]);
+		// -1 stands for "nobody won", which can never equal a stored PID.
+		let expectedPid = -1;
+		if (first.acquired) {
+			expectedPid = 1111;
+		} else if (second.acquired) {
+			expectedPid = 2222;
+		}
+		expect(await readPidFile(lockFile)).toBe(expectedPid);
+	});
+});

package/src/utils/pid.ts CHANGED Viewed

@@ -1,7 +1,9 @@
 /**
  * PID file management for daemon processes.
  */
-import { unlink } from "node:fs/promises";
+import { randomUUID } from "node:crypto";
+import { link, mkdir, unlink, writeFile } from "node:fs/promises";
+import { dirname } from "node:path";
 /**
  * Read the PID from a PID file.
@@ -43,3 +45,86 @@ export async function removePidFile(pidFilePath: string): Promise<void> {
 		// File may already be gone — not an error
 	}
 }
+/**
+ * Result of acquirePidLock.
+ *
+ * `acquired: true` — caller owns the lock and is responsible for removing the
+ * PID file on shutdown.
+ *
+ * `acquired: false` — the lock was not obtained; caller must not start.
+ * `existingPid` is normally the live foreign owner. When acquirePidLock runs
+ * out of attempts it instead reports whatever PID is readable at the lock
+ * path at that moment (liveness is not re-checked on that path), or `-1`
+ * when no PID can be read there — the file is missing or unparseable.
+ */
+export type AcquirePidLockResult = { acquired: true } | { acquired: false; existingPid: number };
+/**
+ * Atomically acquire a PID-file lock.
+ *
+ * Uses the write-temp-then-link pattern so the lock file appears at its final
+ * path with PID contents already present (no empty-file window): a competing
+ * reader can never observe an in-flight write. Behavior:
+ *
+ * - Lock file does not exist → atomic create via link(). Caller owns the lock.
+ * - Lock file exists, contains the caller's own PID → idempotent acquire
+ *   (caller already owns it; e.g. background-mode parent wrote child.pid
+ *   before spawn).
+ * - Lock file exists with a live foreign PID → refuse; return existingPid.
+ * - Lock file exists with a dead PID (or unreadable) → reclaim by unlinking
+ *   and retrying once. If the retry races and loses to a live foreign
+ *   watchdog, the call returns acquired=false with that foreign PID.
+ *
+ * Parent directory is created if missing (matches the implicit Bun.write
+ * behavior the legacy writePidFile relied on).
+ *
+ * NOTE(review): the reclaim is read-then-unlink, not atomic. Another process
+ * that reclaims the same stale lock between our readPidFile() and our
+ * removePidFile() would have its fresh lock unlinked by us — looks like two
+ * callers could both end up with acquired=true. Confirm whether callers
+ * tolerate this or whether the reclaim needs to be made atomic.
+ *
+ * @param pidFilePath Final path of the lock file.
+ * @param pid PID to record as the owner (written as `<pid>\n`).
+ * @param isAlive Liveness probe, called with the PID found in an existing
+ *   lock file (never called for the caller's own PID).
+ * @returns `{ acquired: true }` on success or idempotent re-acquire;
+ *   otherwise `{ acquired: false, existingPid }`.
+ * @throws Rethrows any link() failure other than EEXIST; errors from mkdir()
+ *   and writeFile() are not caught here either.
+ */
+export async function acquirePidLock(
+	pidFilePath: string,
+	pid: number,
+	isAlive: (pid: number) => boolean,
+): Promise<AcquirePidLockResult> {
+	await mkdir(dirname(pidFilePath), { recursive: true });
+	// Stage the PID content at a unique temp path. After link() succeeds, the
+	// lock path appears with full content already present.
+	const tempPath = `${pidFilePath}.tmp.${pid}.${randomUUID()}`;
+	await writeFile(tempPath, `${pid}\n`);
+	try {
+		// Two attempts: first try, then one stale-lock reclaim retry. A second
+		// EEXIST after reclaim means a live foreign process raced in.
+		for (let attempt = 0; attempt < 2; attempt++) {
+			try {
+				// link() fails with EEXIST when the lock path is already taken,
+				// which is what makes creation exclusive.
+				await link(tempPath, pidFilePath);
+				return { acquired: true };
+			} catch (err: unknown) {
+				const code = (err as NodeJS.ErrnoException | undefined)?.code;
+				if (code !== "EEXIST") {
+					// Anything but "already exists" is unexpected: propagate it.
+					throw err;
+				}
+				const existing = await readPidFile(pidFilePath);
+				if (existing === null) {
+					// Unreadable/corrupted lock file — treat as stale.
+					await removePidFile(pidFilePath);
+					continue;
+				}
+				if (existing === pid) {
+					// Idempotent: caller already owns it (parent pre-wrote child PID).
+					return { acquired: true };
+				}
+				if (isAlive(existing)) {
+					return { acquired: false, existingPid: existing };
+				}
+				// Stale: reclaim and retry once.
+				// NOTE(review): on the second iteration this unlink still runs
+				// but no further link() follows; the call then falls through
+				// to the acquired=false return below.
+				await removePidFile(pidFilePath);
+			}
+		}
+		// Two stale-then-retry attempts both failed. Another writer raced in
+		// between our reclaim and our retry — they own the lock now.
+		// The PID reported here is not liveness-checked; -1 means none was readable.
+		const existing = await readPidFile(pidFilePath);
+		return { acquired: false, existingPid: existing ?? -1 };
+	} finally {
+		// Drop the temp inode link (lock path retains the data via the second link).
+		// Cleanup is best-effort: an unlink failure is deliberately ignored.
+		await unlink(tempPath).catch(() => {});
+	}
+}

package/src/utils/process-scan.test.ts ADDED Viewed

@@ -0,0 +1,53 @@
+import { describe, expect, test } from "bun:test";
+import { findRunningWatchdogProcesses } from "./process-scan.ts";
+// Smoke tests for the process-table scanner. They run against the real host
+// `ps`, so they assert result shape and invariants rather than exact contents.
+describe("findRunningWatchdogProcesses", () => {
+	test("returns an array (does not throw)", async () => {
+		const results = await findRunningWatchdogProcesses();
+		expect(Array.isArray(results)).toBe(true);
+		// We can't assert specifics — depends on what's running on the host —
+		// but each entry should have a numeric pid and string command.
+		for (const proc of results) {
+			expect(typeof proc.pid).toBe("number");
+			expect(proc.pid).toBeGreaterThan(0);
+			expect(typeof proc.command).toBe("string");
+		}
+	});
+	test("excludes own process even if command matches", async () => {
+		// The test process itself runs `bun test ...` not `ov watch`, so it
+		// would not match anyway. But we still verify own-pid is filtered out
+		// by checking no result has our PID.
+		const results = await findRunningWatchdogProcesses();
+		const ownPid = process.pid;
+		for (const proc of results) {
+			expect(proc.pid).not.toBe(ownPid);
+		}
+	});
+	// NOTE(review): despite the title, only the `bun run ov watch` form is
+	// spawned below; the bare `ov watch` form is not exercised here.
+	test("matches `ov watch` and `bun run ov watch` invocations", async () => {
+		// Spawn a sleeper whose command line contains the `ov watch` substring,
+		// then verify the scanner finds it. We use `sh -c` so the argv string
+		// passed to ps contains our marker tokens.
+		// NOTE(review): `exec -a` looks like a shell extension rather than
+		// POSIX `sh` — presumably unsupported by some `sh` implementations, in
+		// which case the sleeper exits at once and nothing is found. Confirm.
+		const sleeper = Bun.spawn(["sh", "-c", "exec -a 'bun run ov watch' sleep 30"], {
+			stdout: "ignore",
+			stderr: "ignore",
+		});
+		try {
+			// Give ps a moment to see the new process.
+			await Bun.sleep(150);
+			const results = await findRunningWatchdogProcesses();
+			const found = results.find((p) => p.pid === sleeper.pid);
+			// On macOS BSD ps, `exec -a` may or may not change the displayed
+			// argv depending on shell version. We accept either: if the
+			// command is detected, it must look right; if not, we don't fail
+			// the test (env-dependent).
+			// Consequence: this test passes vacuously whenever the sleeper is
+			// not found, so it cannot catch a scanner that matches nothing.
+			if (found) {
+				expect(found.command).toMatch(/\b(ov|overstory)\b.*\bwatch\b/);
+			}
+		} finally {
+			// Always reap the sleeper so a failed assertion cannot leak it.
+			sleeper.kill("SIGTERM");
+			await sleeper.exited.catch(() => {});
+		}
+	});
+});

package/src/utils/process-scan.ts ADDED Viewed

@@ -0,0 +1,76 @@
+/**
+ * Process-table scanning helpers.
+ *
+ * Used to detect runaway daemon processes that are not tracked by a PID file —
+ * for example, the multi-`ov watch` situation observed on 2026-04-30 where
+ * three concurrent watchdogs were running because earlier releases had no
+ * PID-file exclusion lock.
+ *
+ * Implementation note: `ps` is used directly because we only need to find
+ * processes by command-line substring, and Bun has no built-in process-table
+ * API. The `ps -o pid=,command=` form is portable across macOS (BSD) and
+ * Linux (procps) for the columns we read.
+ */
+/** One `ps` row that was recognised as an `ov watch` / `overstory watch` daemon. */
+export interface WatchdogProcess {
+	/** Process ID parsed from the `ps` row; always a positive integer and never the scanning process's own PID. */
+	pid: number;
+	/** The full command line as reported by `ps`. */
+	command: string;
+}
+/** Splits one trimmed `ps` row into its leading PID digits and the remaining command text. */
+const PS_ROW_PATTERN = /^(\d+)\s+(.+)$/;
+/**
+ * An `ov` or `overstory` token followed, later on the same line, by a `watch`
+ * token. Requiring that order avoids false positives like "watch ov.log" or
+ * unrelated `watch` commands.
+ */
+const WATCHDOG_COMMAND_PATTERN = /\b(ov|overstory)\b[^\n]*\bwatch\b/;
+/**
+ * Run `ps` and return its stdout, or null when the listing is unavailable.
+ *
+ * stdout is drained concurrently with waiting for exit: waiting for exit
+ * first could stall once the listing outgrows the pipe buffer, because `ps`
+ * would block on write while nothing is reading.
+ */
+async function readProcessTable(): Promise<string | null> {
+	try {
+		const proc = Bun.spawn(["ps", "-A", "-o", "pid=,command="], {
+			stdout: "pipe",
+			stderr: "ignore",
+		});
+		const [exitCode, output] = await Promise.all([
+			proc.exited,
+			new Response(proc.stdout).text(),
+		]);
+		return exitCode === 0 ? output : null;
+	} catch {
+		// Bun.spawn throws when `ps` cannot be launched at all (e.g. not on
+		// PATH). The scan is best-effort, so report "unavailable".
+		return null;
+	}
+}
+/**
+ * Find running processes that look like an `ov watch` daemon.
+ *
+ * Matches on the command-line substring `ov watch` (the daemon spawn form)
+ * and excludes the current process so callers do not accidentally treat
+ * themselves as a foreign daemon.
+ *
+ * Returns an empty list if `ps` is unavailable or fails — callers must not
+ * rely on this for correctness, only for diagnostics and `--kill-others`.
+ *
+ * @returns One entry per matching `ps` row, in `ps` output order.
+ */
+export async function findRunningWatchdogProcesses(): Promise<WatchdogProcess[]> {
+	const text = await readProcessTable();
+	if (text === null) {
+		return [];
+	}
+	const ownPid = process.pid;
+	const out: WatchdogProcess[] = [];
+	for (const rawLine of text.split("\n")) {
+		const line = rawLine.trim();
+		if (line === "") continue;
+		// `ps -o pid=,command=` outputs: `   1234 /path/to/binary args...`
+		// (optional leading whitespace, the PID, one or more whitespace
+		// characters, then the rest of the command).
+		const match = PS_ROW_PATTERN.exec(line);
+		if (!match) continue;
+		const pidStr = match[1];
+		const command = match[2];
+		if (pidStr === undefined || command === undefined) continue;
+		const pid = Number.parseInt(pidStr, 10);
+		if (!Number.isFinite(pid) || pid <= 0) continue;
+		if (pid === ownPid) continue;
+		// Match the spawn form: `bun run /path/to/ov watch`. We also tolerate
+		// direct invocation `overstory watch` and `ov watch`.
+		if (!isWatchdogCommand(command)) continue;
+		out.push({ pid, command });
+	}
+	return out;
+}
+/**
+ * Decide whether a `ps` command line looks like the watchdog daemon.
+ *
+ * @param command Command-line text of a single process.
+ * @returns True when an `ov`/`overstory` token precedes a `watch` token.
+ */
+function isWatchdogCommand(command: string): boolean {
+	// The pattern already requires a `watch` token, so no separate pre-check
+	// is needed. It is non-global, so .test() keeps no state between calls.
+	return WATCHDOG_COMMAND_PATTERN.test(command);
+}