@os-eco/overstory-cli 0.9.4 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124) hide show
  1. package/README.md +50 -19
  2. package/agents/builder.md +19 -9
  3. package/agents/coordinator.md +6 -6
  4. package/agents/lead.md +204 -87
  5. package/agents/merger.md +25 -14
  6. package/agents/reviewer.md +22 -16
  7. package/agents/scout.md +17 -12
  8. package/package.json +6 -3
  9. package/src/agents/capabilities.test.ts +85 -0
  10. package/src/agents/capabilities.ts +125 -0
  11. package/src/agents/headless-mail-injector.test.ts +448 -0
  12. package/src/agents/headless-mail-injector.ts +219 -0
  13. package/src/agents/headless-prompt.test.ts +102 -0
  14. package/src/agents/headless-prompt.ts +68 -0
  15. package/src/agents/hooks-deployer.test.ts +514 -14
  16. package/src/agents/hooks-deployer.ts +141 -0
  17. package/src/agents/mail-poll-detect.test.ts +153 -0
  18. package/src/agents/mail-poll-detect.ts +73 -0
  19. package/src/agents/overlay.test.ts +60 -4
  20. package/src/agents/overlay.ts +63 -8
  21. package/src/agents/scope-detect.test.ts +190 -0
  22. package/src/agents/scope-detect.ts +146 -0
  23. package/src/agents/turn-lock.test.ts +181 -0
  24. package/src/agents/turn-lock.ts +235 -0
  25. package/src/agents/turn-runner-dispatch.test.ts +182 -0
  26. package/src/agents/turn-runner-dispatch.ts +105 -0
  27. package/src/agents/turn-runner.test.ts +2312 -0
  28. package/src/agents/turn-runner.ts +1383 -0
  29. package/src/commands/agents.ts +9 -0
  30. package/src/commands/clean.ts +54 -0
  31. package/src/commands/coordinator.test.ts +254 -0
  32. package/src/commands/coordinator.ts +273 -8
  33. package/src/commands/dashboard.test.ts +188 -0
  34. package/src/commands/dashboard.ts +14 -4
  35. package/src/commands/doctor.ts +3 -1
  36. package/src/commands/group.test.ts +94 -0
  37. package/src/commands/group.ts +49 -20
  38. package/src/commands/init.test.ts +8 -0
  39. package/src/commands/init.ts +8 -1
  40. package/src/commands/log.test.ts +187 -11
  41. package/src/commands/log.ts +171 -71
  42. package/src/commands/mail.test.ts +162 -0
  43. package/src/commands/mail.ts +64 -9
  44. package/src/commands/merge.test.ts +230 -1
  45. package/src/commands/merge.ts +68 -12
  46. package/src/commands/nudge.test.ts +351 -4
  47. package/src/commands/nudge.ts +356 -34
  48. package/src/commands/run.test.ts +43 -7
  49. package/src/commands/serve/build.test.ts +202 -0
  50. package/src/commands/serve/build.ts +206 -0
  51. package/src/commands/serve/coordinator-actions.test.ts +339 -0
  52. package/src/commands/serve/coordinator-actions.ts +408 -0
  53. package/src/commands/serve/dev.test.ts +168 -0
  54. package/src/commands/serve/dev.ts +117 -0
  55. package/src/commands/serve/mail-actions.test.ts +312 -0
  56. package/src/commands/serve/mail-actions.ts +167 -0
  57. package/src/commands/serve/rest.test.ts +1323 -0
  58. package/src/commands/serve/rest.ts +708 -0
  59. package/src/commands/serve/static.ts +51 -0
  60. package/src/commands/serve/ws.test.ts +361 -0
  61. package/src/commands/serve/ws.ts +332 -0
  62. package/src/commands/serve.test.ts +459 -0
  63. package/src/commands/serve.ts +565 -0
  64. package/src/commands/sling.test.ts +177 -1
  65. package/src/commands/sling.ts +243 -71
  66. package/src/commands/status.test.ts +9 -0
  67. package/src/commands/status.ts +12 -4
  68. package/src/commands/stop.test.ts +255 -1
  69. package/src/commands/stop.ts +107 -8
  70. package/src/commands/watch.test.ts +43 -0
  71. package/src/commands/watch.ts +153 -28
  72. package/src/config.ts +23 -0
  73. package/src/doctor/consistency.test.ts +106 -0
  74. package/src/doctor/consistency.ts +48 -1
  75. package/src/doctor/serve.test.ts +95 -0
  76. package/src/doctor/serve.ts +86 -0
  77. package/src/doctor/types.ts +2 -1
  78. package/src/doctor/watchdog.ts +57 -1
  79. package/src/events/tailer.test.ts +234 -1
  80. package/src/events/tailer.ts +90 -0
  81. package/src/index.ts +57 -6
  82. package/src/insights/quality-gates.test.ts +141 -0
  83. package/src/insights/quality-gates.ts +156 -0
  84. package/src/json.ts +29 -0
  85. package/src/logging/theme.ts +4 -0
  86. package/src/mail/client.ts +15 -2
  87. package/src/mail/store.test.ts +82 -0
  88. package/src/mail/store.ts +41 -4
  89. package/src/merge/lock.test.ts +149 -0
  90. package/src/merge/lock.ts +140 -0
  91. package/src/merge/predict.test.ts +387 -0
  92. package/src/merge/predict.ts +249 -0
  93. package/src/merge/resolver.ts +1 -1
  94. package/src/mulch/client.ts +3 -3
  95. package/src/runtimes/__fixtures__/claude-stream-fixture.ts +22 -0
  96. package/src/runtimes/claude.test.ts +791 -1
  97. package/src/runtimes/claude.ts +323 -1
  98. package/src/runtimes/connections.test.ts +141 -1
  99. package/src/runtimes/connections.ts +73 -4
  100. package/src/runtimes/headless-connection.test.ts +264 -0
  101. package/src/runtimes/headless-connection.ts +158 -0
  102. package/src/runtimes/types.ts +10 -0
  103. package/src/schema-consistency.test.ts +1 -0
  104. package/src/sessions/store.test.ts +657 -29
  105. package/src/sessions/store.ts +286 -23
  106. package/src/test-setup.test.ts +31 -0
  107. package/src/test-setup.ts +28 -0
  108. package/src/types.ts +107 -2
  109. package/src/utils/pid.test.ts +85 -1
  110. package/src/utils/pid.ts +86 -1
  111. package/src/utils/process-scan.test.ts +53 -0
  112. package/src/utils/process-scan.ts +76 -0
  113. package/src/watchdog/daemon.test.ts +1607 -376
  114. package/src/watchdog/daemon.ts +462 -88
  115. package/src/watchdog/health.test.ts +282 -0
  116. package/src/watchdog/health.ts +126 -27
  117. package/src/worktree/manager.test.ts +218 -1
  118. package/src/worktree/manager.ts +55 -0
  119. package/src/worktree/process.test.ts +71 -0
  120. package/src/worktree/process.ts +25 -5
  121. package/src/worktree/tmux.test.ts +28 -0
  122. package/src/worktree/tmux.ts +27 -3
  123. package/templates/CLAUDE.md.tmpl +19 -8
  124. package/templates/overlay.md.tmpl +5 -2
@@ -21,6 +21,7 @@
21
21
  */
22
22
 
23
23
  import { join } from "node:path";
24
+ import { isPersistentCapability } from "../agents/capabilities.ts";
24
25
  import { nudgeAgent } from "../commands/nudge.ts";
25
26
  import { createEventStore } from "../events/store.ts";
26
27
  import {
@@ -34,7 +35,14 @@ import { createMulchClient } from "../mulch/client.ts";
34
35
  import { getConnection, removeConnection } from "../runtimes/connections.ts";
35
36
  import type { RuntimeConnection } from "../runtimes/types.ts";
36
37
  import { openSessionStore } from "../sessions/compat.ts";
37
- import type { AgentSession, EventStore, HealthCheck } from "../types.ts";
38
+ import { createRunStore } from "../sessions/store.ts";
39
+ import type {
40
+ AgentSession,
41
+ EventStore,
42
+ HealthCheck,
43
+ RunStore,
44
+ WorkerDiedPayload,
45
+ } from "../types.ts";
38
46
  import { isProcessAlive, isSessionAlive, killProcessTree, killSession } from "../worktree/tmux.ts";
39
47
  import { evaluateHealth, transitionState } from "./health.ts";
40
48
  import { type TriageResult, triageAgent } from "./triage.ts";
@@ -42,12 +50,6 @@ import { type TriageResult, triageAgent } from "./triage.ts";
42
50
  /** Maximum escalation level (terminate). */
43
51
  const MAX_ESCALATION_LEVEL = 3;
44
52
 
45
- /**
46
- * Persistent agent capabilities that are excluded from run-level completion checks.
47
- * These agents are long-running and should not count toward "all workers done".
48
- */
49
- const PERSISTENT_CAPABILITIES = new Set(["coordinator", "orchestrator", "monitor"]);
50
-
51
53
  /**
52
54
  * Module-level registry of active event tailers for headless agents.
53
55
  * Maps agentName → TailerHandle. Persists across daemon ticks so tailers
@@ -55,6 +57,28 @@ const PERSISTENT_CAPABILITIES = new Set(["coordinator", "orchestrator", "monitor
55
57
  */
56
58
  const _defaultTailerRegistry: Map<string, TailerHandle> = new Map();
57
59
 
60
+ /**
61
+ * Per-cause dedup state for `current-run.txt` defensive-read warnings
62
+ * (overstory-87bf). The watchdog reads `.overstory/current-run.txt` once per
63
+ * tick to gate run-completion checks; if the file is missing/empty/unreadable
64
+ * or points to an id with no row in the runs table, the check would silently
65
+ * skip every tick. We log one warning per cause and then continue skipping
66
+ * silently, so an operator can see the run-completion path is wedged without
67
+ * drowning in repeated lines.
68
+ *
69
+ * Module-level by design: warnings should dedupe across ticks within one
70
+ * watchdog process. Overridable via DaemonOptions._runIdWarnState in tests.
71
+ */
72
+ export interface RunIdWarnState {
73
+ missingFileWarned: boolean;
74
+ unknownIds: Set<string>;
75
+ }
76
+
77
+ const _defaultRunIdWarnState: RunIdWarnState = {
78
+ missingFileWarned: false,
79
+ unknownIds: new Set(),
80
+ };
81
+
58
82
  /**
59
83
  * Record an agent failure to mulch for future reference.
60
84
  * Fire-and-forget: never throws, logs errors internally if mulch fails.
@@ -115,6 +139,56 @@ async function readCurrentRunId(overstoryDir: string): Promise<string | null> {
115
139
  }
116
140
  }
117
141
 
142
+ /**
143
+ * Resolve the active run id for run-completion checks, defensively
144
+ * (overstory-87bf). Returns the id only when `current-run.txt` is readable
145
+ * AND points to a row in the runs table. On either failure mode, logs one
146
+ * warning per cause via `warnState` and returns null so the caller can skip
147
+ * the check silently on subsequent ticks.
148
+ *
149
+ * Intentionally narrow: the broader `readCurrentRunId` is unchanged and still
150
+ * powers event-recording paths where a stale id is acceptable as a label.
151
+ */
152
+ async function resolveRunIdForCompletionCheck(
153
+ overstoryDir: string,
154
+ runStore: RunStore | null,
155
+ warnState: RunIdWarnState,
156
+ ): Promise<string | null> {
157
+ const runId = await readCurrentRunId(overstoryDir);
158
+ if (runId === null) {
159
+ if (!warnState.missingFileWarned) {
160
+ warnState.missingFileWarned = true;
161
+ process.stderr.write(
162
+ "[WATCHDOG] current-run.txt missing — run-completion checks disabled until restart\n",
163
+ );
164
+ }
165
+ return null;
166
+ }
167
+ if (runStore === null) {
168
+ // RunStore unavailable (rare — sessions.db open failed). Trust the file
169
+ // and let the downstream nudge path proceed; this is no worse than the
170
+ // pre-87bf behavior.
171
+ return runId;
172
+ }
173
+ let run: ReturnType<RunStore["getRun"]>;
174
+ try {
175
+ run = runStore.getRun(runId);
176
+ } catch {
177
+ // Treat lookup errors as "unknown" — same defensive posture as a missing row.
178
+ run = null;
179
+ }
180
+ if (run === null) {
181
+ if (!warnState.unknownIds.has(runId)) {
182
+ warnState.unknownIds.add(runId);
183
+ process.stderr.write(
184
+ `[WATCHDOG] current-run.txt points to unknown run "${runId}" — run-completion checks disabled until restart\n`,
185
+ );
186
+ }
187
+ return null;
188
+ }
189
+ return runId;
190
+ }
191
+
118
192
  /**
119
193
  * Fire-and-forget: record an event to EventStore. Never throws.
120
194
  */
@@ -147,10 +221,15 @@ function recordEvent(
147
221
  }
148
222
 
149
223
  /**
150
- * Build a phase-aware completion message based on the capabilities of completed workers.
224
+ * Build a phase-aware completion message based on the capabilities of terminal workers.
151
225
  *
152
- * Single-capability batches get targeted messages (e.g. scouts "Ready for next phase"),
153
- * while mixed-capability batches get a generic summary with a breakdown.
226
+ * "Terminal" includes both `completed` (clean exit) and `zombie` (watchdog-killed) —
227
+ * see overstory-e130 for why a zombie counts as run-terminal. Single-capability
228
+ * batches get targeted messages (e.g. scouts → "Ready for next phase"), while
229
+ * mixed-capability batches get a generic summary with a breakdown. When any worker
230
+ * died, the verb changes from "have completed" to "have terminated" and the message
231
+ * carries a "(N completed, M zombie)" qualifier so the coordinator does not mistake
232
+ * a partial failure for a clean batch.
154
233
  */
155
234
  export function buildCompletionMessage(
156
235
  workerSessions: readonly AgentSession[],
@@ -158,32 +237,41 @@ export function buildCompletionMessage(
158
237
  ): string {
159
238
  const capabilities = new Set(workerSessions.map((s) => s.capability));
160
239
  const count = workerSessions.length;
240
+ const zombieCount = workerSessions.filter((s) => s.state === "zombie").length;
241
+ const completedCount = count - zombieCount;
242
+ const verb = zombieCount > 0 ? "have terminated" : "have completed";
243
+ const qualifier = zombieCount > 0 ? ` (${completedCount} completed, ${zombieCount} zombie)` : "";
161
244
 
162
245
  if (capabilities.size === 1) {
163
246
  if (capabilities.has("scout")) {
164
- return `[WATCHDOG] All ${count} scout(s) in run ${runId} have completed. Ready for next phase.`;
247
+ return `[WATCHDOG] All ${count} scout(s) in run ${runId} ${verb}${qualifier}. Ready for next phase.`;
165
248
  }
166
249
  if (capabilities.has("builder")) {
167
- return `[WATCHDOG] All ${count} builder(s) in run ${runId} have completed. Awaiting lead verification.`;
250
+ return `[WATCHDOG] All ${count} builder(s) in run ${runId} ${verb}${qualifier}. Awaiting lead verification.`;
168
251
  }
169
252
  if (capabilities.has("reviewer")) {
170
- return `[WATCHDOG] All ${count} reviewer(s) in run ${runId} have completed. Reviews done.`;
253
+ return `[WATCHDOG] All ${count} reviewer(s) in run ${runId} ${verb}${qualifier}. Reviews done.`;
171
254
  }
172
255
  if (capabilities.has("lead")) {
173
- return `[WATCHDOG] All ${count} lead(s) in run ${runId} have completed. Ready for merge/cleanup.`;
256
+ return `[WATCHDOG] All ${count} lead(s) in run ${runId} ${verb}${qualifier}. Ready for merge/cleanup.`;
174
257
  }
175
258
  if (capabilities.has("merger")) {
176
- return `[WATCHDOG] All ${count} merger(s) in run ${runId} have completed. Merges done.`;
259
+ return `[WATCHDOG] All ${count} merger(s) in run ${runId} ${verb}${qualifier}. Merges done.`;
177
260
  }
178
261
  }
179
262
 
180
263
  const breakdown = Array.from(capabilities).sort().join(", ");
181
- return `[WATCHDOG] All ${count} worker(s) in run ${runId} have completed (${breakdown}). Ready for next steps.`;
264
+ return `[WATCHDOG] All ${count} worker(s) in run ${runId} ${verb}${qualifier} (${breakdown}). Ready for next steps.`;
182
265
  }
183
266
 
184
267
  /**
185
- * Check if all worker sessions for the active run have completed, and if so,
186
- * nudge the coordinator. Fire-and-forget: never throws.
268
+ * Check if every worker session for the active run has reached a terminal state
269
+ * (`completed` or `zombie`), and if so, nudge the coordinator. Fire-and-forget:
270
+ * never throws.
271
+ *
272
+ * Zombie counts as terminal (overstory-e130): a watchdog-killed worker is not
273
+ * coming back, so excluding it would strand the coordinator on a run that mixes
274
+ * clean exits with kills.
187
275
  *
188
276
  * Deduplication: uses a marker file (run-complete-notified.txt) to prevent
189
277
  * repeated nudges for the same run ID.
@@ -204,14 +292,17 @@ async function checkRunCompletion(ctx: {
204
292
  const { store, runId, overstoryDir, root, nudge, eventStore } = ctx;
205
293
 
206
294
  const runSessions = store.getByRun(runId);
207
- const workerSessions = runSessions.filter((s) => !PERSISTENT_CAPABILITIES.has(s.capability));
295
+ const workerSessions = runSessions.filter((s) => !isPersistentCapability(s.capability));
208
296
 
209
297
  if (workerSessions.length === 0) {
210
298
  return;
211
299
  }
212
300
 
213
- const allCompleted = workerSessions.every((s) => s.state === "completed");
214
- if (!allCompleted) {
301
+ // `completed` = clean exit, `zombie` = watchdog-killed. Both are terminal
302
+ // for run-completion: a zombie is not coming back, so blocking on it would
303
+ // strand the coordinator forever (overstory-e130).
304
+ const allTerminal = workerSessions.every((s) => s.state === "completed" || s.state === "zombie");
305
+ if (!allTerminal) {
215
306
  return;
216
307
  }
217
308
 
@@ -240,15 +331,20 @@ async function checkRunCompletion(ctx: {
240
331
  // Record the event
241
332
  const capabilitiesArr = Array.from(new Set(workerSessions.map((s) => s.capability))).sort();
242
333
  const phase = capabilitiesArr.length === 1 ? capabilitiesArr[0] : "mixed";
334
+ const completedAgents = workerSessions
335
+ .filter((s) => s.state === "completed")
336
+ .map((s) => s.agentName);
337
+ const zombieAgents = workerSessions.filter((s) => s.state === "zombie").map((s) => s.agentName);
243
338
  recordEvent(eventStore, {
244
339
  runId,
245
340
  agentName: "watchdog",
246
341
  eventType: "custom",
247
- level: "info",
342
+ level: zombieAgents.length > 0 ? "warn" : "info",
248
343
  data: {
249
344
  type: "run_complete",
250
345
  workerCount: workerSessions.length,
251
- completedAgents: workerSessions.map((s) => s.agentName),
346
+ completedAgents,
347
+ zombieAgents,
252
348
  capabilities: capabilitiesArr,
253
349
  phase,
254
350
  },
@@ -269,6 +365,13 @@ export interface DaemonOptions {
269
365
  zombieThresholdMs: number;
270
366
  nudgeIntervalMs?: number;
271
367
  tier1Enabled?: boolean;
368
+ /**
369
+ * When true (default), the watchdog sends a synthetic `worker_died` mail to
370
+ * `session.parentAgent` the first time it transitions a session to `zombie`
371
+ * (overstory-c111). Without this, the parent — typically a lead waiting for
372
+ * `worker_done` — blocks indefinitely on mail that will never arrive.
373
+ */
374
+ notifyParentOnDeath?: boolean;
272
375
  onHealthCheck?: (check: HealthCheck) => void;
273
376
  /** Dependency injection for testing. Uses real implementations when omitted. */
274
377
  _tmux?: {
@@ -317,6 +420,18 @@ export interface DaemonOptions {
317
420
  _findLatestStdoutLog?: (overstoryDir: string, agentName: string) => Promise<string | null>;
318
421
  /** Dependency injection for testing. Overrides MailStore creation for decision gate detection. */
319
422
  _mailStore?: MailStore | null;
423
+ /**
424
+ * Dependency injection for testing. Overrides the module-level run-id warning
425
+ * state so each test starts with a clean dedup slate (overstory-87bf).
426
+ */
427
+ _runIdWarnState?: RunIdWarnState;
428
+ /**
429
+ * Dependency injection for testing. Overrides RunStore creation. When `null`
430
+ * is passed explicitly, run-id validation is skipped (file presence still
431
+ * gates the warning). When omitted, a real RunStore is opened against
432
+ * `.overstory/sessions.db`.
433
+ */
434
+ _runStore?: RunStore | null;
320
435
  }
321
436
 
322
437
  /**
@@ -369,27 +484,66 @@ export function startDaemon(options: DaemonOptions & { intervalMs: number }): {
369
484
  /**
370
485
  * Kill an agent using the appropriate method based on whether it is headless or TUI.
371
486
  *
372
- * Headless agents (tmuxSession === "" && pid !== null) are killed via PID process tree.
373
- * TUI agents are killed via their named tmux session (only if tmuxAlive).
487
+ * Prefers runtime-agnostic `conn.abort()` when a RuntimeConnection is registered.
488
+ * If abort() succeeds, returns immediately — no PID/tmux kill needed.
489
+ * If abort() throws (e.g. process already exited), falls through to the
490
+ * defense-in-depth path below.
374
491
  *
375
- * This prevents the blast-radius bug where killSession("") with tmux prefix matching
376
- * would kill ALL tmux sessions when a headless agent is terminated.
492
+ * Branching after abort:
493
+ * - tmuxSession === "" (headless): never call tmux.killSession — an empty `-t`
494
+ * prefix-matches every session in the tmux server, wildcard-killing the entire
495
+ * overstory swarm (overstory-74ce). Branch by pid:
496
+ * - pid !== null → kill the process tree (long-lived headless capability).
497
+ * - pid === null → no-op (spawn-per-turn agent between turns; the in-flight
498
+ * process, if any, was already handled by the abort/connection path).
499
+ * - tmuxSession !== "" (TUI): kill the named tmux session, but only when
500
+ * `tmuxAlive` to avoid spurious "session not found" errors.
377
501
  */
378
502
  async function killAgent(ctx: {
379
503
  session: AgentSession;
380
504
  tmuxAlive: boolean;
381
505
  tmux: { killSession: (name: string) => Promise<void> };
382
506
  process: { killTree: (pid: number) => Promise<void> };
507
+ getConnection: (name: string) => RuntimeConnection | undefined;
508
+ removeConnection: (name: string) => void;
383
509
  }): Promise<void> {
384
- const { session, tmuxAlive, tmux, process: proc } = ctx;
385
- const isHeadless = session.tmuxSession === "" && session.pid !== null;
386
- if (isHeadless && session.pid !== null) {
510
+ const { session, tmuxAlive, tmux, process: proc, getConnection, removeConnection } = ctx;
511
+
512
+ // Prefer runtime-agnostic abort() when a connection is registered.
513
+ const conn = getConnection(session.agentName);
514
+ if (conn) {
515
+ let aborted = false;
387
516
  try {
388
- await proc.killTree(session.pid);
517
+ await conn.abort();
518
+ aborted = true;
389
519
  } catch {
390
- // Already exited — not an error
520
+ // abort() failure — fall through to defense-in-depth path
521
+ }
522
+ removeConnection(session.agentName);
523
+ if (aborted) {
524
+ return;
525
+ }
526
+ // abort() threw — fall through to PID/tmux kill below as defense-in-depth
527
+ }
528
+
529
+ // Headless agents (no tmux session) must never reach tmux.killSession.
530
+ // An empty `-t` argument is prefix-matched and would kill every overstory
531
+ // tmux session in the server (overstory-74ce).
532
+ if (session.tmuxSession === "") {
533
+ if (session.pid !== null) {
534
+ try {
535
+ await proc.killTree(session.pid);
536
+ } catch {
537
+ // Already exited — not an error
538
+ }
391
539
  }
392
- } else if (tmuxAlive) {
540
+ // pid === null: spawn-per-turn agent between turns. Any in-flight process
541
+ // was handled by abort/connection above. No-op — next dispatch will spawn fresh.
542
+ return;
543
+ }
544
+
545
+ // Named tmux session path (TUI agents).
546
+ if (tmuxAlive) {
393
547
  try {
394
548
  await tmux.killSession(session.tmuxSession);
395
549
  } catch {
@@ -398,6 +552,70 @@ async function killAgent(ctx: {
398
552
  }
399
553
  }
400
554
 
555
+ /**
556
+ * Send a synthetic `worker_died` mail to the parent of a watchdog-terminated
557
+ * session (overstory-c111). Fire-and-forget: never throws.
558
+ *
559
+ * Called only when `tryTransitionState(..., "zombie")` returns `ok: true`, so
560
+ * the state-machine's idempotence dedupes us — a subsequent watchdog tick that
561
+ * tries to re-zombify a session sees `illegal_transition` and skips notify.
562
+ */
563
+ function notifyParentOfDeath(ctx: {
564
+ session: AgentSession;
565
+ mailStore: MailStore | null;
566
+ reason: string;
567
+ tier: 0 | 1;
568
+ eventStore: EventStore | null;
569
+ runId: string | null;
570
+ }): void {
571
+ const { session, mailStore, reason, tier, eventStore, runId } = ctx;
572
+ if (mailStore === null) return;
573
+ if (session.parentAgent === null) return;
574
+
575
+ const payload: WorkerDiedPayload = {
576
+ agentName: session.agentName,
577
+ capability: session.capability,
578
+ taskId: session.taskId,
579
+ reason,
580
+ lastActivity: session.lastActivity,
581
+ terminatedBy: tier === 0 ? "tier0" : "tier1",
582
+ };
583
+
584
+ try {
585
+ mailStore.insert({
586
+ id: "",
587
+ from: session.agentName,
588
+ to: session.parentAgent,
589
+ subject: `[WATCHDOG] worker_died: ${session.agentName}`,
590
+ body:
591
+ `Worker "${session.agentName}" (${session.capability}) on task ${session.taskId} ` +
592
+ `was terminated by the watchdog. Reason: ${reason}. ` +
593
+ `Last activity: ${session.lastActivity}. ` +
594
+ `Decide whether to retry the work, escalate, or report the failure upstream.`,
595
+ type: "worker_died",
596
+ priority: "high",
597
+ threadId: null,
598
+ payload: JSON.stringify(payload),
599
+ });
600
+ } catch {
601
+ // Mail-send failure must never crash the watchdog.
602
+ return;
603
+ }
604
+
605
+ recordEvent(eventStore, {
606
+ runId,
607
+ agentName: session.agentName,
608
+ eventType: "mail_sent",
609
+ level: "warn",
610
+ data: {
611
+ type: "worker_died",
612
+ parent: session.parentAgent,
613
+ reason,
614
+ tier,
615
+ },
616
+ });
617
+ }
618
+
401
619
  /**
402
620
  * Run a single daemon tick. Exported for testing — allows direct invocation
403
621
  * of the monitoring logic without starting the interval-based daemon loop.
@@ -411,6 +629,7 @@ export async function runDaemonTick(options: DaemonOptions): Promise<void> {
411
629
  zombieThresholdMs,
412
630
  nudgeIntervalMs = 60_000,
413
631
  tier1Enabled = false,
632
+ notifyParentOnDeath = true,
414
633
  onHealthCheck,
415
634
  } = options;
416
635
  const tmux = options._tmux ?? { isSessionAlive, killSession };
@@ -425,10 +644,26 @@ export async function runDaemonTick(options: DaemonOptions): Promise<void> {
425
644
  const findStdoutLog = options._findLatestStdoutLog ?? findLatestStdoutLog;
426
645
  const maxTriagePerTick = options._maxTriagePerTick ?? 3;
427
646
  const triageCount = { value: 0 };
647
+ const runIdWarnState = options._runIdWarnState ?? _defaultRunIdWarnState;
428
648
 
429
649
  const overstoryDir = join(root, ".overstory");
430
650
  const { store } = openSessionStore(overstoryDir);
431
651
 
652
+ // Open RunStore for run-id validation (overstory-87bf). Sharing sessions.db
653
+ // is intentional — same file, WAL mode covers concurrent reads.
654
+ let runStore: RunStore | null = null;
655
+ let ownRunStore = false;
656
+ if (options._runStore !== undefined) {
657
+ runStore = options._runStore;
658
+ } else {
659
+ try {
660
+ runStore = createRunStore(join(overstoryDir, "sessions.db"));
661
+ ownRunStore = true;
662
+ } catch {
663
+ // RunStore creation failure is non-fatal — id validation is then skipped.
664
+ }
665
+ }
666
+
432
667
  // Open MailStore for decision gate detection (fire-and-forget: non-fatal if unavailable)
433
668
  let mailStore: MailStore | null = null;
434
669
  let ownMailStore = false;
@@ -474,6 +709,7 @@ export async function runDaemonTick(options: DaemonOptions): Promise<void> {
474
709
  // Track active headless agents to clean up stale tailers after the loop.
475
710
  const activeHeadlessAgents = new Set<string>();
476
711
  const eventsDbPath = join(overstoryDir, "events.db");
712
+ const sessionsDbPath = join(overstoryDir, "sessions.db");
477
713
 
478
714
  for (const session of sessions) {
479
715
  // Skip completed sessions — they are terminal and don't need monitoring
@@ -488,7 +724,11 @@ export async function runDaemonTick(options: DaemonOptions): Promise<void> {
488
724
  // active headless agent that doesn't already have one running.
489
725
  // Tailers persist between ticks (module-level registry) so events are
490
726
  // continuously written to events.db while the agent is working.
491
- if (session.tmuxSession === "" && session.pid !== null) {
727
+ //
728
+ // Both long-lived headless (pid !== null) and spawn-per-turn workers
729
+ // (pid === null, overstory-7a34) emit stream-json to stdout.log, so
730
+ // either pattern needs a tailer.
731
+ if (session.tmuxSession === "") {
492
732
  activeHeadlessAgents.add(session.agentName);
493
733
  if (!tailerRegistry.has(session.agentName)) {
494
734
  // Discover the latest stdout.log for this agent and start tailing.
@@ -499,41 +739,52 @@ export async function runDaemonTick(options: DaemonOptions): Promise<void> {
499
739
  agentName: session.agentName,
500
740
  runId,
501
741
  eventsDbPath,
742
+ sessionsDbPath,
502
743
  });
503
744
  tailerRegistry.set(session.agentName, handle);
504
745
  }
505
746
  }
506
747
  }
507
748
 
508
- // RPC health check: for headless agents with an active connection,
509
- // call getState() to refresh lastActivity before evaluateHealth().
510
- // This prevents false-positive stale/zombie classification for agents
511
- // that are actively working but haven't updated lastActivity via hooks.
512
- //
513
- // For non-RPC headless agents, fall back to event-based activity detection:
514
- // if events.db has a recent event from this agent within the stale window,
515
- // the agent is considered active and lastActivity is refreshed.
516
- if (session.tmuxSession === "" && session.pid !== null) {
517
- const conn = getConn(session.agentName);
518
- if (conn) {
519
- try {
520
- const state = await Promise.race([
521
- conn.getState(),
522
- new Promise<never>((_, reject) =>
523
- setTimeout(() => reject(new Error("getState timed out")), 5000),
524
- ),
525
- ]);
526
- if (state.status === "idle" || state.status === "working") {
527
- store.updateLastActivity(session.agentName);
528
- // Refresh the session object so evaluateHealth sees updated lastActivity
529
- session.lastActivity = new Date().toISOString();
530
- }
531
- } catch {
532
- // getState() failed or timed out — remove stale connection
533
- removeConn(session.agentName);
749
+ // === Liveness check ===
750
+ // Prefer RuntimeConnection.getState() when a connection is registered. Fall
751
+ // back to tmux liveness when no connection exists. For headless agents without
752
+ // a connection, use event-based activity detection to refresh lastActivity.
753
+ const conn = getConn(session.agentName);
754
+ let tmuxAlive: boolean;
755
+
756
+ if (conn) {
757
+ try {
758
+ const state = await Promise.race([
759
+ conn.getState(),
760
+ new Promise<never>((_, reject) =>
761
+ setTimeout(() => reject(new Error("getState timed out")), 5000),
762
+ ),
763
+ ]);
764
+ // Map ConnectionState liveness:
765
+ // idle | working → alive (running)
766
+ // error → not alive (exited)
767
+ if (state.status === "idle" || state.status === "working") {
768
+ tmuxAlive = true;
769
+ store.updateLastActivity(session.agentName);
770
+ session.lastActivity = new Date().toISOString();
771
+ } else {
772
+ tmuxAlive = false;
534
773
  }
535
- } else if (eventStore) {
536
- // No RPC connection — check events.db for recent activity
774
+ } catch {
775
+ // getState() failed/timed out — drop stale connection, fall back to tmux
776
+ removeConn(session.agentName);
777
+ tmuxAlive = await tmux.isSessionAlive(session.tmuxSession);
778
+ }
779
+ } else {
780
+ tmuxAlive = await tmux.isSessionAlive(session.tmuxSession);
781
+
782
+ // Headless agents without a registered connection: event-based
783
+ // activity detection to avoid false-positive stale. Covers both
784
+ // long-lived headless (e.g. after a process restart) and
785
+ // spawn-per-turn workers between turns where lastActivity is
786
+ // the only liveness signal (overstory-7a34).
787
+ if (session.tmuxSession === "" && eventStore) {
537
788
  try {
538
789
  const recentEvents = eventStore.getByAgent(session.agentName, {
539
790
  since: new Date(Date.now() - staleThresholdMs).toISOString(),
@@ -548,15 +799,28 @@ export async function runDaemonTick(options: DaemonOptions): Promise<void> {
548
799
  }
549
800
  }
550
801
  }
551
-
552
- const tmuxAlive = await tmux.isSessionAlive(session.tmuxSession);
553
802
  const check = evaluateHealth(session, tmuxAlive, thresholds);
554
803
 
555
- // Transition state forward only (investigate action holds state)
804
+ // Snapshot the pre-tick state so the worker_died notify path can
805
+ // dedupe across re-ticks (overstory-c111). Subsequent `tryTransitionState`
806
+ // calls below mutate session.state, and the matrix allows the idempotent
807
+ // `zombie → zombie` self-transition — both would erase the dedup signal.
808
+ const stateBeforeTick = session.state;
809
+
810
+ // Transition state forward only (investigate action holds state).
811
+ // `transitionState` computes the watchdog's preferred target;
812
+ // `tryTransitionState` is the matrix-guarded CAS — `completed → *`
813
+ // is rejected here so a properly-completed agent cannot be
814
+ // reclassified as zombie by a late watchdog tick (overstory-a993).
556
815
  const newState = transitionState(session.state, check);
557
816
  if (newState !== session.state) {
558
- store.updateState(session.agentName, newState);
559
- session.state = newState;
817
+ const outcome = store.tryTransitionState(session.agentName, newState);
818
+ if (outcome.ok) {
819
+ session.state = newState;
820
+ } else if (outcome.reason === "illegal_transition") {
821
+ // Resync local mirror — another writer settled state durably.
822
+ session.state = outcome.prev;
823
+ }
560
824
  }
561
825
 
562
826
  if (onHealthCheck) {
@@ -568,12 +832,41 @@ export async function runDaemonTick(options: DaemonOptions): Promise<void> {
568
832
  const reason = check.reconciliationNote ?? "Process terminated";
569
833
  await recordFailureFn(root, session, reason, 0);
570
834
 
571
- // Kill the agent: headless agents are killed via PID, TUI agents via tmux
572
- await killAgent({ session, tmuxAlive, tmux, process: proc });
573
- store.updateState(session.agentName, "zombie");
835
+ // Kill the agent: prefer conn.abort(), fall back to PID/tmux
836
+ await killAgent({
837
+ session,
838
+ tmuxAlive,
839
+ tmux,
840
+ process: proc,
841
+ getConnection: getConn,
842
+ removeConnection: removeConn,
843
+ });
844
+ // Matrix-guarded: rejected when state is `completed` so a clean
845
+ // `ov stop` cannot be silently downgraded to zombie by a late
846
+ // watchdog termination (overstory-a993).
847
+ const outcome = store.tryTransitionState(session.agentName, "zombie");
574
848
  // Reset escalation tracking on terminal state
575
849
  store.updateEscalation(session.agentName, 0, null);
576
- session.state = "zombie";
850
+ if (outcome.ok) {
851
+ session.state = "zombie";
852
+ // First-time zombify: notify parent so it doesn't block on
853
+ // missing `worker_done` mail (overstory-c111). Dedup uses the
854
+ // pre-tick snapshot because the matrix allows the idempotent
855
+ // zombie → zombie transition (both `outcome.ok` and the earlier
856
+ // transitionState call would otherwise mask re-ticks).
857
+ if (notifyParentOnDeath && stateBeforeTick !== "zombie") {
858
+ notifyParentOfDeath({
859
+ session,
860
+ mailStore,
861
+ reason,
862
+ tier: 0,
863
+ eventStore,
864
+ runId,
865
+ });
866
+ }
867
+ } else if (outcome.reason === "illegal_transition") {
868
+ session.state = outcome.prev;
869
+ }
577
870
  session.escalationLevel = 0;
578
871
  session.stalledSince = null;
579
872
  } else if (check.action === "investigate") {
@@ -581,6 +874,21 @@ export async function runDaemonTick(options: DaemonOptions): Promise<void> {
581
874
  // Log the conflict but do NOT auto-kill.
582
875
  // The onHealthCheck callback surfaces this to the operator.
583
876
  // No state change — keep zombie until a human or higher-tier agent decides.
877
+ } else if (check.action === "complete") {
878
+ // ZFC fallback: tmux/pid is gone AND lastActivity is stale —
879
+ // the agent looks like it finished naturally and only the
880
+ // session-end hook missed (overstory-e74b). Mark completed
881
+ // without killing (process is already gone) and without
882
+ // notifying parents of death (this is not a crash).
883
+ const outcome = store.tryTransitionState(session.agentName, "completed");
884
+ if (outcome.ok) {
885
+ session.state = "completed";
886
+ } else if (outcome.reason === "illegal_transition") {
887
+ session.state = outcome.prev;
888
+ }
889
+ store.updateEscalation(session.agentName, 0, null);
890
+ session.escalationLevel = 0;
891
+ session.stalledSince = null;
584
892
  } else if (check.action === "escalate") {
585
893
  // Decision gate check: if the agent sent a decision_gate message, it is
586
894
  // intentionally paused waiting for a human decision — not a stall.
@@ -635,12 +943,32 @@ export async function runDaemonTick(options: DaemonOptions): Promise<void> {
635
943
  recordFailure: recordFailureFn,
636
944
  triageCount,
637
945
  maxTriagePerTick,
946
+ getConnection: getConn,
947
+ removeConnection: removeConn,
638
948
  });
639
949
 
640
950
  if (actionResult.terminated) {
641
- store.updateState(session.agentName, "zombie");
951
+ // Matrix-guarded: completed → zombie is rejected (overstory-a993).
952
+ const outcome = store.tryTransitionState(session.agentName, "zombie");
642
953
  store.updateEscalation(session.agentName, 0, null);
643
- session.state = "zombie";
954
+ if (outcome.ok) {
955
+ session.state = "zombie";
956
+ // First-time zombify: notify parent so it doesn't block on
957
+ // missing `worker_done` mail (overstory-c111). Dedup via
958
+ // the pre-tick snapshot — see the terminate branch above.
959
+ if (notifyParentOnDeath && stateBeforeTick !== "zombie") {
960
+ notifyParentOfDeath({
961
+ session,
962
+ mailStore,
963
+ reason: actionResult.deathReason ?? "Watchdog escalation terminated agent",
964
+ tier: actionResult.deathTier ?? 0,
965
+ eventStore,
966
+ runId,
967
+ });
968
+ }
969
+ } else if (outcome.reason === "illegal_transition") {
970
+ session.state = outcome.prev;
971
+ }
644
972
  session.escalationLevel = 0;
645
973
  session.stalledSince = null;
646
974
  }
@@ -664,10 +992,18 @@ export async function runDaemonTick(options: DaemonOptions): Promise<void> {
664
992
 
665
993
  // === Run-level completion detection ===
666
994
  // After monitoring individual sessions, check if the entire run is done.
667
- if (runId) {
995
+ // Re-resolve the run id defensively (overstory-87bf): a missing
996
+ // current-run.txt or a stale id (no row in runs table) skips the check
997
+ // and emits one warning per cause for the lifetime of this watchdog.
998
+ const validatedRunId = await resolveRunIdForCompletionCheck(
999
+ overstoryDir,
1000
+ runStore,
1001
+ runIdWarnState,
1002
+ );
1003
+ if (validatedRunId) {
668
1004
  await checkRunCompletion({
669
1005
  store,
670
- runId,
1006
+ runId: validatedRunId,
671
1007
  overstoryDir,
672
1008
  root,
673
1009
  nudge,
@@ -692,6 +1028,14 @@ export async function runDaemonTick(options: DaemonOptions): Promise<void> {
692
1028
  // Non-fatal
693
1029
  }
694
1030
  }
1031
+ // Close RunStore only if we created it (not injected)
1032
+ if (runStore && ownRunStore) {
1033
+ try {
1034
+ runStore.close();
1035
+ } catch {
1036
+ // Non-fatal
1037
+ }
1038
+ }
695
1039
  }
696
1040
  }
697
1041
 
@@ -741,7 +1085,15 @@ async function executeEscalationAction(ctx: {
741
1085
  tier: 0 | 1,
742
1086
  triageSuggestion?: string,
743
1087
  ) => Promise<void>;
744
- }): Promise<{ terminated: boolean; stateChanged: boolean }> {
1088
+ getConnection: (name: string) => RuntimeConnection | undefined;
1089
+ removeConnection: (name: string) => void;
1090
+ }): Promise<{
1091
+ terminated: boolean;
1092
+ stateChanged: boolean;
1093
+ /** Reason and tier of the termination (only set when `terminated` is true). */
1094
+ deathReason?: string;
1095
+ deathTier?: 0 | 1;
1096
+ }> {
745
1097
  const {
746
1098
  session,
747
1099
  root,
@@ -756,6 +1108,8 @@ async function executeEscalationAction(ctx: {
756
1108
  recordFailure,
757
1109
  triageCount,
758
1110
  maxTriagePerTick,
1111
+ getConnection: getConn,
1112
+ removeConnection: removeConn,
759
1113
  } = ctx;
760
1114
 
761
1115
  switch (session.escalationLevel) {
@@ -832,16 +1186,23 @@ async function executeEscalationAction(ctx: {
832
1186
 
833
1187
  if (result.verdict === "terminate") {
834
1188
  // Record the failure via mulch (Tier 1 AI triage)
835
- await recordFailure(
836
- root,
837
- session,
838
- "AI triage classified as terminal failure",
839
- 1,
840
- result.verdict,
841
- );
1189
+ const triageReason = "AI triage classified as terminal failure";
1190
+ await recordFailure(root, session, triageReason, 1, result.verdict);
842
1191
 
843
- await killAgent({ session, tmuxAlive, tmux, process: proc });
844
- return { terminated: true, stateChanged: true };
1192
+ await killAgent({
1193
+ session,
1194
+ tmuxAlive,
1195
+ tmux,
1196
+ process: proc,
1197
+ getConnection: getConn,
1198
+ removeConnection: removeConn,
1199
+ });
1200
+ return {
1201
+ terminated: true,
1202
+ stateChanged: true,
1203
+ deathReason: triageReason,
1204
+ deathTier: 1,
1205
+ };
845
1206
  }
846
1207
 
847
1208
  if (result.verdict === "retry") {
@@ -874,10 +1235,23 @@ async function executeEscalationAction(ctx: {
874
1235
  });
875
1236
 
876
1237
  // Record the failure via mulch (Tier 0: progressive escalation to terminal level)
877
- await recordFailure(root, session, "Progressive escalation reached terminal level", 0);
1238
+ const escalationReason = "Progressive escalation reached terminal level";
1239
+ await recordFailure(root, session, escalationReason, 0);
878
1240
 
879
- await killAgent({ session, tmuxAlive, tmux, process: proc });
880
- return { terminated: true, stateChanged: true };
1241
+ await killAgent({
1242
+ session,
1243
+ tmuxAlive,
1244
+ tmux,
1245
+ process: proc,
1246
+ getConnection: getConn,
1247
+ removeConnection: removeConn,
1248
+ });
1249
+ return {
1250
+ terminated: true,
1251
+ stateChanged: true,
1252
+ deathReason: escalationReason,
1253
+ deathTier: 0,
1254
+ };
881
1255
  }
882
1256
  }
883
1257
  }