@os-eco/overstory-cli 0.9.4 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124) hide show
  1. package/README.md +50 -19
  2. package/agents/builder.md +19 -9
  3. package/agents/coordinator.md +6 -6
  4. package/agents/lead.md +204 -87
  5. package/agents/merger.md +25 -14
  6. package/agents/reviewer.md +22 -16
  7. package/agents/scout.md +17 -12
  8. package/package.json +6 -3
  9. package/src/agents/capabilities.test.ts +85 -0
  10. package/src/agents/capabilities.ts +125 -0
  11. package/src/agents/headless-mail-injector.test.ts +448 -0
  12. package/src/agents/headless-mail-injector.ts +219 -0
  13. package/src/agents/headless-prompt.test.ts +102 -0
  14. package/src/agents/headless-prompt.ts +68 -0
  15. package/src/agents/hooks-deployer.test.ts +514 -14
  16. package/src/agents/hooks-deployer.ts +141 -0
  17. package/src/agents/mail-poll-detect.test.ts +153 -0
  18. package/src/agents/mail-poll-detect.ts +73 -0
  19. package/src/agents/overlay.test.ts +60 -4
  20. package/src/agents/overlay.ts +63 -8
  21. package/src/agents/scope-detect.test.ts +190 -0
  22. package/src/agents/scope-detect.ts +146 -0
  23. package/src/agents/turn-lock.test.ts +181 -0
  24. package/src/agents/turn-lock.ts +235 -0
  25. package/src/agents/turn-runner-dispatch.test.ts +182 -0
  26. package/src/agents/turn-runner-dispatch.ts +105 -0
  27. package/src/agents/turn-runner.test.ts +2312 -0
  28. package/src/agents/turn-runner.ts +1383 -0
  29. package/src/commands/agents.ts +9 -0
  30. package/src/commands/clean.ts +54 -0
  31. package/src/commands/coordinator.test.ts +254 -0
  32. package/src/commands/coordinator.ts +273 -8
  33. package/src/commands/dashboard.test.ts +188 -0
  34. package/src/commands/dashboard.ts +14 -4
  35. package/src/commands/doctor.ts +3 -1
  36. package/src/commands/group.test.ts +94 -0
  37. package/src/commands/group.ts +49 -20
  38. package/src/commands/init.test.ts +8 -0
  39. package/src/commands/init.ts +8 -1
  40. package/src/commands/log.test.ts +187 -11
  41. package/src/commands/log.ts +171 -71
  42. package/src/commands/mail.test.ts +162 -0
  43. package/src/commands/mail.ts +64 -9
  44. package/src/commands/merge.test.ts +230 -1
  45. package/src/commands/merge.ts +68 -12
  46. package/src/commands/nudge.test.ts +351 -4
  47. package/src/commands/nudge.ts +356 -34
  48. package/src/commands/run.test.ts +43 -7
  49. package/src/commands/serve/build.test.ts +202 -0
  50. package/src/commands/serve/build.ts +206 -0
  51. package/src/commands/serve/coordinator-actions.test.ts +339 -0
  52. package/src/commands/serve/coordinator-actions.ts +408 -0
  53. package/src/commands/serve/dev.test.ts +168 -0
  54. package/src/commands/serve/dev.ts +117 -0
  55. package/src/commands/serve/mail-actions.test.ts +312 -0
  56. package/src/commands/serve/mail-actions.ts +167 -0
  57. package/src/commands/serve/rest.test.ts +1323 -0
  58. package/src/commands/serve/rest.ts +708 -0
  59. package/src/commands/serve/static.ts +51 -0
  60. package/src/commands/serve/ws.test.ts +361 -0
  61. package/src/commands/serve/ws.ts +332 -0
  62. package/src/commands/serve.test.ts +459 -0
  63. package/src/commands/serve.ts +565 -0
  64. package/src/commands/sling.test.ts +177 -1
  65. package/src/commands/sling.ts +243 -71
  66. package/src/commands/status.test.ts +9 -0
  67. package/src/commands/status.ts +12 -4
  68. package/src/commands/stop.test.ts +255 -1
  69. package/src/commands/stop.ts +107 -8
  70. package/src/commands/watch.test.ts +43 -0
  71. package/src/commands/watch.ts +153 -28
  72. package/src/config.ts +23 -0
  73. package/src/doctor/consistency.test.ts +106 -0
  74. package/src/doctor/consistency.ts +48 -1
  75. package/src/doctor/serve.test.ts +95 -0
  76. package/src/doctor/serve.ts +86 -0
  77. package/src/doctor/types.ts +2 -1
  78. package/src/doctor/watchdog.ts +57 -1
  79. package/src/events/tailer.test.ts +234 -1
  80. package/src/events/tailer.ts +90 -0
  81. package/src/index.ts +57 -6
  82. package/src/insights/quality-gates.test.ts +141 -0
  83. package/src/insights/quality-gates.ts +156 -0
  84. package/src/json.ts +29 -0
  85. package/src/logging/theme.ts +4 -0
  86. package/src/mail/client.ts +15 -2
  87. package/src/mail/store.test.ts +82 -0
  88. package/src/mail/store.ts +41 -4
  89. package/src/merge/lock.test.ts +149 -0
  90. package/src/merge/lock.ts +140 -0
  91. package/src/merge/predict.test.ts +387 -0
  92. package/src/merge/predict.ts +249 -0
  93. package/src/merge/resolver.ts +1 -1
  94. package/src/mulch/client.ts +3 -3
  95. package/src/runtimes/__fixtures__/claude-stream-fixture.ts +22 -0
  96. package/src/runtimes/claude.test.ts +791 -1
  97. package/src/runtimes/claude.ts +323 -1
  98. package/src/runtimes/connections.test.ts +141 -1
  99. package/src/runtimes/connections.ts +73 -4
  100. package/src/runtimes/headless-connection.test.ts +264 -0
  101. package/src/runtimes/headless-connection.ts +158 -0
  102. package/src/runtimes/types.ts +10 -0
  103. package/src/schema-consistency.test.ts +1 -0
  104. package/src/sessions/store.test.ts +657 -29
  105. package/src/sessions/store.ts +286 -23
  106. package/src/test-setup.test.ts +31 -0
  107. package/src/test-setup.ts +28 -0
  108. package/src/types.ts +107 -2
  109. package/src/utils/pid.test.ts +85 -1
  110. package/src/utils/pid.ts +86 -1
  111. package/src/utils/process-scan.test.ts +53 -0
  112. package/src/utils/process-scan.ts +76 -0
  113. package/src/watchdog/daemon.test.ts +1607 -376
  114. package/src/watchdog/daemon.ts +462 -88
  115. package/src/watchdog/health.test.ts +282 -0
  116. package/src/watchdog/health.ts +126 -27
  117. package/src/worktree/manager.test.ts +218 -1
  118. package/src/worktree/manager.ts +55 -0
  119. package/src/worktree/process.test.ts +71 -0
  120. package/src/worktree/process.ts +25 -5
  121. package/src/worktree/tmux.test.ts +28 -0
  122. package/src/worktree/tmux.ts +27 -3
  123. package/templates/CLAUDE.md.tmpl +19 -8
  124. package/templates/overlay.md.tmpl +5 -2
@@ -21,6 +21,7 @@
21
21
  */
22
22
 
23
23
  import { join } from "node:path";
24
+ import { isPersistentCapability } from "../agents/capabilities.ts";
24
25
  import { nudgeAgent } from "../commands/nudge.ts";
25
26
  import { createEventStore } from "../events/store.ts";
26
27
  import {
@@ -34,7 +35,14 @@ import { createMulchClient } from "../mulch/client.ts";
34
35
  import { getConnection, removeConnection } from "../runtimes/connections.ts";
35
36
  import type { RuntimeConnection } from "../runtimes/types.ts";
36
37
  import { openSessionStore } from "../sessions/compat.ts";
37
- import type { AgentSession, EventStore, HealthCheck } from "../types.ts";
38
+ import { createRunStore } from "../sessions/store.ts";
39
+ import type {
40
+ AgentSession,
41
+ EventStore,
42
+ HealthCheck,
43
+ RunStore,
44
+ WorkerDiedPayload,
45
+ } from "../types.ts";
38
46
  import { isProcessAlive, isSessionAlive, killProcessTree, killSession } from "../worktree/tmux.ts";
39
47
  import { evaluateHealth, transitionState } from "./health.ts";
40
48
  import { type TriageResult, triageAgent } from "./triage.ts";
@@ -42,12 +50,6 @@ import { type TriageResult, triageAgent } from "./triage.ts";
42
50
  /** Maximum escalation level (terminate). */
43
51
  const MAX_ESCALATION_LEVEL = 3;
44
52
 
45
- /**
46
- * Persistent agent capabilities that are excluded from run-level completion checks.
47
- * These agents are long-running and should not count toward "all workers done".
48
- */
49
- const PERSISTENT_CAPABILITIES = new Set(["coordinator", "orchestrator", "monitor"]);
50
-
51
53
  /**
52
54
  * Module-level registry of active event tailers for headless agents.
53
55
  * Maps agentName → TailerHandle. Persists across daemon ticks so tailers
@@ -55,6 +57,28 @@ const PERSISTENT_CAPABILITIES = new Set(["coordinator", "orchestrator", "monitor
55
57
  */
56
58
  const _defaultTailerRegistry: Map<string, TailerHandle> = new Map();
57
59
 
60
+ /**
61
+ * Per-cause dedup state for `current-run.txt` defensive-read warnings
62
+ * (overstory-87bf). The watchdog reads `.overstory/current-run.txt` once per
63
+ * tick to gate run-completion checks; if the file is missing/empty/unreadable
64
+ * or points to an id with no row in the runs table, the check would silently
65
+ * skip every tick. We log one warning per cause and then continue skipping
66
+ * silently, so an operator can see the run-completion path is wedged without
67
+ * drowning in repeated lines.
68
+ *
69
+ * Module-level by design: warnings should dedupe across ticks within one
70
+ * watchdog process. Overridable via DaemonOptions._runIdWarnState in tests.
71
+ */
72
+ export interface RunIdWarnState {
73
+ missingFileWarned: boolean;
74
+ unknownIds: Set<string>;
75
+ }
76
+
77
+ const _defaultRunIdWarnState: RunIdWarnState = {
78
+ missingFileWarned: false,
79
+ unknownIds: new Set(),
80
+ };
81
+
58
82
  /**
59
83
  * Record an agent failure to mulch for future reference.
60
84
  * Fire-and-forget: never throws, logs errors internally if mulch fails.
@@ -115,6 +139,56 @@ async function readCurrentRunId(overstoryDir: string): Promise<string | null> {
115
139
  }
116
140
  }
117
141
 
142
+ /**
143
+ * Resolve the active run id for run-completion checks, defensively
144
+ * (overstory-87bf). Returns the id only when `current-run.txt` is readable
145
+ * AND points to a row in the runs table. On either failure mode, logs one
146
+ * warning per cause via `warnState` and returns null so the caller can skip
147
+ * the check silently on subsequent ticks.
148
+ *
149
+ * Intentionally narrow: the broader `readCurrentRunId` is unchanged and still
150
+ * powers event-recording paths where a stale id is acceptable as a label.
151
+ */
152
+ async function resolveRunIdForCompletionCheck(
153
+ overstoryDir: string,
154
+ runStore: RunStore | null,
155
+ warnState: RunIdWarnState,
156
+ ): Promise<string | null> {
157
+ const runId = await readCurrentRunId(overstoryDir);
158
+ if (runId === null) {
159
+ if (!warnState.missingFileWarned) {
160
+ warnState.missingFileWarned = true;
161
+ process.stderr.write(
162
+ "[WATCHDOG] current-run.txt missing — run-completion checks disabled until restart\n",
163
+ );
164
+ }
165
+ return null;
166
+ }
167
+ if (runStore === null) {
168
+ // RunStore unavailable (rare — sessions.db open failed). Trust the file
169
+ // and let the downstream nudge path proceed; this is no worse than the
170
+ // pre-87bf behavior.
171
+ return runId;
172
+ }
173
+ let run: ReturnType<RunStore["getRun"]>;
174
+ try {
175
+ run = runStore.getRun(runId);
176
+ } catch {
177
+ // Treat lookup errors as "unknown" — same defensive posture as a missing row.
178
+ run = null;
179
+ }
180
+ if (run === null) {
181
+ if (!warnState.unknownIds.has(runId)) {
182
+ warnState.unknownIds.add(runId);
183
+ process.stderr.write(
184
+ `[WATCHDOG] current-run.txt points to unknown run "${runId}" — run-completion checks disabled until restart\n`,
185
+ );
186
+ }
187
+ return null;
188
+ }
189
+ return runId;
190
+ }
191
+
118
192
  /**
119
193
  * Fire-and-forget: record an event to EventStore. Never throws.
120
194
  */
@@ -147,10 +221,15 @@ function recordEvent(
147
221
  }
148
222
 
149
223
  /**
150
- * Build a phase-aware completion message based on the capabilities of completed workers.
224
+ * Build a phase-aware completion message based on the capabilities of terminal workers.
151
225
  *
152
- * Single-capability batches get targeted messages (e.g. scouts "Ready for next phase"),
153
- * while mixed-capability batches get a generic summary with a breakdown.
226
+ * "Terminal" includes both `completed` (clean exit) and `zombie` (watchdog-killed) —
227
+ * see overstory-e130 for why a zombie counts as run-terminal. Single-capability
228
+ * batches get targeted messages (e.g. scouts → "Ready for next phase"), while
229
+ * mixed-capability batches get a generic summary with a breakdown. When any worker
230
+ * died, the verb changes from "have completed" to "have terminated" and the message
231
+ * carries a "(N completed, M zombie)" qualifier so the coordinator does not mistake
232
+ * a partial failure for a clean batch.
154
233
  */
155
234
  export function buildCompletionMessage(
156
235
  workerSessions: readonly AgentSession[],
@@ -158,32 +237,41 @@ export function buildCompletionMessage(
158
237
  ): string {
159
238
  const capabilities = new Set(workerSessions.map((s) => s.capability));
160
239
  const count = workerSessions.length;
240
+ const zombieCount = workerSessions.filter((s) => s.state === "zombie").length;
241
+ const completedCount = count - zombieCount;
242
+ const verb = zombieCount > 0 ? "have terminated" : "have completed";
243
+ const qualifier = zombieCount > 0 ? ` (${completedCount} completed, ${zombieCount} zombie)` : "";
161
244
 
162
245
  if (capabilities.size === 1) {
163
246
  if (capabilities.has("scout")) {
164
- return `[WATCHDOG] All ${count} scout(s) in run ${runId} have completed. Ready for next phase.`;
247
+ return `[WATCHDOG] All ${count} scout(s) in run ${runId} ${verb}${qualifier}. Ready for next phase.`;
165
248
  }
166
249
  if (capabilities.has("builder")) {
167
- return `[WATCHDOG] All ${count} builder(s) in run ${runId} have completed. Awaiting lead verification.`;
250
+ return `[WATCHDOG] All ${count} builder(s) in run ${runId} ${verb}${qualifier}. Awaiting lead verification.`;
168
251
  }
169
252
  if (capabilities.has("reviewer")) {
170
- return `[WATCHDOG] All ${count} reviewer(s) in run ${runId} have completed. Reviews done.`;
253
+ return `[WATCHDOG] All ${count} reviewer(s) in run ${runId} ${verb}${qualifier}. Reviews done.`;
171
254
  }
172
255
  if (capabilities.has("lead")) {
173
- return `[WATCHDOG] All ${count} lead(s) in run ${runId} have completed. Ready for merge/cleanup.`;
256
+ return `[WATCHDOG] All ${count} lead(s) in run ${runId} ${verb}${qualifier}. Ready for merge/cleanup.`;
174
257
  }
175
258
  if (capabilities.has("merger")) {
176
- return `[WATCHDOG] All ${count} merger(s) in run ${runId} have completed. Merges done.`;
259
+ return `[WATCHDOG] All ${count} merger(s) in run ${runId} ${verb}${qualifier}. Merges done.`;
177
260
  }
178
261
  }
179
262
 
180
263
  const breakdown = Array.from(capabilities).sort().join(", ");
181
- return `[WATCHDOG] All ${count} worker(s) in run ${runId} have completed (${breakdown}). Ready for next steps.`;
264
+ return `[WATCHDOG] All ${count} worker(s) in run ${runId} ${verb}${qualifier} (${breakdown}). Ready for next steps.`;
182
265
  }
183
266
 
184
267
  /**
185
- * Check if all worker sessions for the active run have completed, and if so,
186
- * nudge the coordinator. Fire-and-forget: never throws.
268
+ * Check if every worker session for the active run has reached a terminal state
269
+ * (`completed` or `zombie`), and if so, nudge the coordinator. Fire-and-forget:
270
+ * never throws.
271
+ *
272
+ * Zombie counts as terminal (overstory-e130): a watchdog-killed worker is not
273
+ * coming back, so excluding it would strand the coordinator on a run that mixes
274
+ * clean exits with kills.
187
275
  *
188
276
  * Deduplication: uses a marker file (run-complete-notified.txt) to prevent
189
277
  * repeated nudges for the same run ID.
@@ -204,14 +292,17 @@ async function checkRunCompletion(ctx: {
204
292
  const { store, runId, overstoryDir, root, nudge, eventStore } = ctx;
205
293
 
206
294
  const runSessions = store.getByRun(runId);
207
- const workerSessions = runSessions.filter((s) => !PERSISTENT_CAPABILITIES.has(s.capability));
295
+ const workerSessions = runSessions.filter((s) => !isPersistentCapability(s.capability));
208
296
 
209
297
  if (workerSessions.length === 0) {
210
298
  return;
211
299
  }
212
300
 
213
- const allCompleted = workerSessions.every((s) => s.state === "completed");
214
- if (!allCompleted) {
301
+ // `completed` = clean exit, `zombie` = watchdog-killed. Both are terminal
302
+ // for run-completion: a zombie is not coming back, so blocking on it would
303
+ // strand the coordinator forever (overstory-e130).
304
+ const allTerminal = workerSessions.every((s) => s.state === "completed" || s.state === "zombie");
305
+ if (!allTerminal) {
215
306
  return;
216
307
  }
217
308
 
@@ -240,15 +331,20 @@ async function checkRunCompletion(ctx: {
240
331
  // Record the event
241
332
  const capabilitiesArr = Array.from(new Set(workerSessions.map((s) => s.capability))).sort();
242
333
  const phase = capabilitiesArr.length === 1 ? capabilitiesArr[0] : "mixed";
334
+ const completedAgents = workerSessions
335
+ .filter((s) => s.state === "completed")
336
+ .map((s) => s.agentName);
337
+ const zombieAgents = workerSessions.filter((s) => s.state === "zombie").map((s) => s.agentName);
243
338
  recordEvent(eventStore, {
244
339
  runId,
245
340
  agentName: "watchdog",
246
341
  eventType: "custom",
247
- level: "info",
342
+ level: zombieAgents.length > 0 ? "warn" : "info",
248
343
  data: {
249
344
  type: "run_complete",
250
345
  workerCount: workerSessions.length,
251
- completedAgents: workerSessions.map((s) => s.agentName),
346
+ completedAgents,
347
+ zombieAgents,
252
348
  capabilities: capabilitiesArr,
253
349
  phase,
254
350
  },
@@ -269,6 +365,13 @@ export interface DaemonOptions {
269
365
  zombieThresholdMs: number;
270
366
  nudgeIntervalMs?: number;
271
367
  tier1Enabled?: boolean;
368
+ /**
369
+ * When true (default), the watchdog sends a synthetic `worker_died` mail to
370
+ * `session.parentAgent` the first time it transitions a session to `zombie`
371
+ * (overstory-c111). Without this, the parent — typically a lead waiting for
372
+ * `worker_done` — blocks indefinitely on mail that will never arrive.
373
+ */
374
+ notifyParentOnDeath?: boolean;
272
375
  onHealthCheck?: (check: HealthCheck) => void;
273
376
  /** Dependency injection for testing. Uses real implementations when omitted. */
274
377
  _tmux?: {
@@ -317,6 +420,18 @@ export interface DaemonOptions {
317
420
  _findLatestStdoutLog?: (overstoryDir: string, agentName: string) => Promise<string | null>;
318
421
  /** Dependency injection for testing. Overrides MailStore creation for decision gate detection. */
319
422
  _mailStore?: MailStore | null;
423
+ /**
424
+ * Dependency injection for testing. Overrides the module-level run-id warning
425
+ * state so each test starts with a clean dedup slate (overstory-87bf).
426
+ */
427
+ _runIdWarnState?: RunIdWarnState;
428
+ /**
429
+ * Dependency injection for testing. Overrides RunStore creation. When `null`
430
+ * is passed explicitly, run-id validation is skipped (file presence still
431
+ * gates the warning). When omitted, a real RunStore is opened against
432
+ * `.overstory/sessions.db`.
433
+ */
434
+ _runStore?: RunStore | null;
320
435
  }
321
436
 
322
437
  /**
@@ -369,27 +484,66 @@ export function startDaemon(options: DaemonOptions & { intervalMs: number }): {
369
484
  /**
370
485
  * Kill an agent using the appropriate method based on whether it is headless or TUI.
371
486
  *
372
- * Headless agents (tmuxSession === "" && pid !== null) are killed via PID process tree.
373
- * TUI agents are killed via their named tmux session (only if tmuxAlive).
487
+ * Prefers runtime-agnostic `conn.abort()` when a RuntimeConnection is registered.
488
+ * If abort() succeeds, returns immediately — no PID/tmux kill needed.
489
+ * If abort() throws (e.g. process already exited), falls through to the
490
+ * defense-in-depth path below.
374
491
  *
375
- * This prevents the blast-radius bug where killSession("") with tmux prefix matching
376
- * would kill ALL tmux sessions when a headless agent is terminated.
492
+ * Branching after abort:
493
+ * - tmuxSession === "" (headless): never call tmux.killSession — an empty `-t`
494
+ * prefix-matches every session in the tmux server, wildcard-killing the entire
495
+ * overstory swarm (overstory-74ce). Branch by pid:
496
+ * - pid !== null → kill the process tree (long-lived headless capability).
497
+ * - pid === null → no-op (spawn-per-turn agent between turns; the in-flight
498
+ * process, if any, was already handled by the abort/connection path).
499
+ * - tmuxSession !== "" (TUI): kill the named tmux session, but only when
500
+ * `tmuxAlive` to avoid spurious "session not found" errors.
377
501
  */
378
502
  async function killAgent(ctx: {
379
503
  session: AgentSession;
380
504
  tmuxAlive: boolean;
381
505
  tmux: { killSession: (name: string) => Promise<void> };
382
506
  process: { killTree: (pid: number) => Promise<void> };
507
+ getConnection: (name: string) => RuntimeConnection | undefined;
508
+ removeConnection: (name: string) => void;
383
509
  }): Promise<void> {
384
- const { session, tmuxAlive, tmux, process: proc } = ctx;
385
- const isHeadless = session.tmuxSession === "" && session.pid !== null;
386
- if (isHeadless && session.pid !== null) {
510
+ const { session, tmuxAlive, tmux, process: proc, getConnection, removeConnection } = ctx;
511
+
512
+ // Prefer runtime-agnostic abort() when a connection is registered.
513
+ const conn = getConnection(session.agentName);
514
+ if (conn) {
515
+ let aborted = false;
387
516
  try {
388
- await proc.killTree(session.pid);
517
+ await conn.abort();
518
+ aborted = true;
389
519
  } catch {
390
- // Already exited — not an error
520
+ // abort() failure — fall through to defense-in-depth path
521
+ }
522
+ removeConnection(session.agentName);
523
+ if (aborted) {
524
+ return;
525
+ }
526
+ // abort() threw — fall through to PID/tmux kill below as defense-in-depth
527
+ }
528
+
529
+ // Headless agents (no tmux session) must never reach tmux.killSession.
530
+ // An empty `-t` argument is prefix-matched and would kill every overstory
531
+ // tmux session in the server (overstory-74ce).
532
+ if (session.tmuxSession === "") {
533
+ if (session.pid !== null) {
534
+ try {
535
+ await proc.killTree(session.pid);
536
+ } catch {
537
+ // Already exited — not an error
538
+ }
391
539
  }
392
- } else if (tmuxAlive) {
540
+ // pid === null: spawn-per-turn agent between turns. Any in-flight process
541
+ // was handled by abort/connection above. No-op — next dispatch will spawn fresh.
542
+ return;
543
+ }
544
+
545
+ // Named tmux session path (TUI agents).
546
+ if (tmuxAlive) {
393
547
  try {
394
548
  await tmux.killSession(session.tmuxSession);
395
549
  } catch {
@@ -398,6 +552,70 @@ async function killAgent(ctx: {
398
552
  }
399
553
  }
400
554
 
555
+ /**
556
+ * Send a synthetic `worker_died` mail to the parent of a watchdog-terminated
557
+ * session (overstory-c111). Fire-and-forget: never throws.
558
+ *
559
+ * Called only when `tryTransitionState(..., "zombie")` returns `ok: true`, so
560
+ * the state-machine's idempotence dedupes us — a subsequent watchdog tick that
561
+ * tries to re-zombify a session sees `illegal_transition` and skips notify.
562
+ */
563
+ function notifyParentOfDeath(ctx: {
564
+ session: AgentSession;
565
+ mailStore: MailStore | null;
566
+ reason: string;
567
+ tier: 0 | 1;
568
+ eventStore: EventStore | null;
569
+ runId: string | null;
570
+ }): void {
571
+ const { session, mailStore, reason, tier, eventStore, runId } = ctx;
572
+ if (mailStore === null) return;
573
+ if (session.parentAgent === null) return;
574
+
575
+ const payload: WorkerDiedPayload = {
576
+ agentName: session.agentName,
577
+ capability: session.capability,
578
+ taskId: session.taskId,
579
+ reason,
580
+ lastActivity: session.lastActivity,
581
+ terminatedBy: tier === 0 ? "tier0" : "tier1",
582
+ };
583
+
584
+ try {
585
+ mailStore.insert({
586
+ id: "",
587
+ from: session.agentName,
588
+ to: session.parentAgent,
589
+ subject: `[WATCHDOG] worker_died: ${session.agentName}`,
590
+ body:
591
+ `Worker "${session.agentName}" (${session.capability}) on task ${session.taskId} ` +
592
+ `was terminated by the watchdog. Reason: ${reason}. ` +
593
+ `Last activity: ${session.lastActivity}. ` +
594
+ `Decide whether to retry the work, escalate, or report the failure upstream.`,
595
+ type: "worker_died",
596
+ priority: "high",
597
+ threadId: null,
598
+ payload: JSON.stringify(payload),
599
+ });
600
+ } catch {
601
+ // Mail-send failure must never crash the watchdog.
602
+ return;
603
+ }
604
+
605
+ recordEvent(eventStore, {
606
+ runId,
607
+ agentName: session.agentName,
608
+ eventType: "mail_sent",
609
+ level: "warn",
610
+ data: {
611
+ type: "worker_died",
612
+ parent: session.parentAgent,
613
+ reason,
614
+ tier,
615
+ },
616
+ });
617
+ }
618
+
401
619
  /**
402
620
  * Run a single daemon tick. Exported for testing — allows direct invocation
403
621
  * of the monitoring logic without starting the interval-based daemon loop.
@@ -411,6 +629,7 @@ export async function runDaemonTick(options: DaemonOptions): Promise<void> {
411
629
  zombieThresholdMs,
412
630
  nudgeIntervalMs = 60_000,
413
631
  tier1Enabled = false,
632
+ notifyParentOnDeath = true,
414
633
  onHealthCheck,
415
634
  } = options;
416
635
  const tmux = options._tmux ?? { isSessionAlive, killSession };
@@ -425,10 +644,26 @@ export async function runDaemonTick(options: DaemonOptions): Promise<void> {
425
644
  const findStdoutLog = options._findLatestStdoutLog ?? findLatestStdoutLog;
426
645
  const maxTriagePerTick = options._maxTriagePerTick ?? 3;
427
646
  const triageCount = { value: 0 };
647
+ const runIdWarnState = options._runIdWarnState ?? _defaultRunIdWarnState;
428
648
 
429
649
  const overstoryDir = join(root, ".overstory");
430
650
  const { store } = openSessionStore(overstoryDir);
431
651
 
652
+ // Open RunStore for run-id validation (overstory-87bf). Sharing sessions.db
653
+ // is intentional — same file, WAL mode covers concurrent reads.
654
+ let runStore: RunStore | null = null;
655
+ let ownRunStore = false;
656
+ if (options._runStore !== undefined) {
657
+ runStore = options._runStore;
658
+ } else {
659
+ try {
660
+ runStore = createRunStore(join(overstoryDir, "sessions.db"));
661
+ ownRunStore = true;
662
+ } catch {
663
+ // RunStore creation failure is non-fatal — id validation is then skipped.
664
+ }
665
+ }
666
+
432
667
  // Open MailStore for decision gate detection (fire-and-forget: non-fatal if unavailable)
433
668
  let mailStore: MailStore | null = null;
434
669
  let ownMailStore = false;
@@ -474,6 +709,7 @@ export async function runDaemonTick(options: DaemonOptions): Promise<void> {
474
709
  // Track active headless agents to clean up stale tailers after the loop.
475
710
  const activeHeadlessAgents = new Set<string>();
476
711
  const eventsDbPath = join(overstoryDir, "events.db");
712
+ const sessionsDbPath = join(overstoryDir, "sessions.db");
477
713
 
478
714
  for (const session of sessions) {
479
715
  // Skip completed sessions — they are terminal and don't need monitoring
@@ -488,7 +724,11 @@ export async function runDaemonTick(options: DaemonOptions): Promise<void> {
488
724
  // active headless agent that doesn't already have one running.
489
725
  // Tailers persist between ticks (module-level registry) so events are
490
726
  // continuously written to events.db while the agent is working.
491
- if (session.tmuxSession === "" && session.pid !== null) {
727
+ //
728
+ // Both long-lived headless (pid !== null) and spawn-per-turn workers
729
+ // (pid === null, overstory-7a34) emit stream-json to stdout.log, so
730
+ // either pattern needs a tailer.
731
+ if (session.tmuxSession === "") {
492
732
  activeHeadlessAgents.add(session.agentName);
493
733
  if (!tailerRegistry.has(session.agentName)) {
494
734
  // Discover the latest stdout.log for this agent and start tailing.
@@ -499,41 +739,52 @@ export async function runDaemonTick(options: DaemonOptions): Promise<void> {
499
739
  agentName: session.agentName,
500
740
  runId,
501
741
  eventsDbPath,
742
+ sessionsDbPath,
502
743
  });
503
744
  tailerRegistry.set(session.agentName, handle);
504
745
  }
505
746
  }
506
747
  }
507
748
 
508
- // RPC health check: for headless agents with an active connection,
509
- // call getState() to refresh lastActivity before evaluateHealth().
510
- // This prevents false-positive stale/zombie classification for agents
511
- // that are actively working but haven't updated lastActivity via hooks.
512
- //
513
- // For non-RPC headless agents, fall back to event-based activity detection:
514
- // if events.db has a recent event from this agent within the stale window,
515
- // the agent is considered active and lastActivity is refreshed.
516
- if (session.tmuxSession === "" && session.pid !== null) {
517
- const conn = getConn(session.agentName);
518
- if (conn) {
519
- try {
520
- const state = await Promise.race([
521
- conn.getState(),
522
- new Promise<never>((_, reject) =>
523
- setTimeout(() => reject(new Error("getState timed out")), 5000),
524
- ),
525
- ]);
526
- if (state.status === "idle" || state.status === "working") {
527
- store.updateLastActivity(session.agentName);
528
- // Refresh the session object so evaluateHealth sees updated lastActivity
529
- session.lastActivity = new Date().toISOString();
530
- }
531
- } catch {
532
- // getState() failed or timed out — remove stale connection
533
- removeConn(session.agentName);
749
+ // === Liveness check ===
750
+ // Prefer RuntimeConnection.getState() when a connection is registered. Fall
751
+ // back to tmux liveness when no connection exists. For headless agents without
752
+ // a connection, use event-based activity detection to refresh lastActivity.
753
+ const conn = getConn(session.agentName);
754
+ let tmuxAlive: boolean;
755
+
756
+ if (conn) {
757
+ try {
758
+ const state = await Promise.race([
759
+ conn.getState(),
760
+ new Promise<never>((_, reject) =>
761
+ setTimeout(() => reject(new Error("getState timed out")), 5000),
762
+ ),
763
+ ]);
764
+ // Map ConnectionState liveness:
765
+ // idle | working → alive (running)
766
+ // error → not alive (exited)
767
+ if (state.status === "idle" || state.status === "working") {
768
+ tmuxAlive = true;
769
+ store.updateLastActivity(session.agentName);
770
+ session.lastActivity = new Date().toISOString();
771
+ } else {
772
+ tmuxAlive = false;
534
773
  }
535
- } else if (eventStore) {
536
- // No RPC connection — check events.db for recent activity
774
+ } catch {
775
+ // getState() failed/timed out — drop stale connection, fall back to tmux
776
+ removeConn(session.agentName);
777
+ tmuxAlive = await tmux.isSessionAlive(session.tmuxSession);
778
+ }
779
+ } else {
780
+ tmuxAlive = await tmux.isSessionAlive(session.tmuxSession);
781
+
782
+ // Headless agents without a registered connection: event-based
783
+ // activity detection to avoid false-positive stale. Covers both
784
+ // long-lived headless (e.g. after a process restart) and
785
+ // spawn-per-turn workers between turns where lastActivity is
786
+ // the only liveness signal (overstory-7a34).
787
+ if (session.tmuxSession === "" && eventStore) {
537
788
  try {
538
789
  const recentEvents = eventStore.getByAgent(session.agentName, {
539
790
  since: new Date(Date.now() - staleThresholdMs).toISOString(),
@@ -548,15 +799,28 @@ export async function runDaemonTick(options: DaemonOptions): Promise<void> {
548
799
  }
549
800
  }
550
801
  }
551
-
552
- const tmuxAlive = await tmux.isSessionAlive(session.tmuxSession);
553
802
  const check = evaluateHealth(session, tmuxAlive, thresholds);
554
803
 
555
- // Transition state forward only (investigate action holds state)
804
+ // Snapshot the pre-tick state so the worker_died notify path can
805
+ // dedupe across re-ticks (overstory-c111). Subsequent `tryTransitionState`
806
+ // calls below mutate session.state, and the matrix allows the idempotent
807
+ // `zombie → zombie` self-transition — both would erase the dedup signal.
808
+ const stateBeforeTick = session.state;
809
+
810
+ // Transition state forward only (investigate action holds state).
811
+ // `transitionState` computes the watchdog's preferred target;
812
+ // `tryTransitionState` is the matrix-guarded CAS — `completed → *`
813
+ // is rejected here so a properly-completed agent cannot be
814
+ // reclassified as zombie by a late watchdog tick (overstory-a993).
556
815
  const newState = transitionState(session.state, check);
557
816
  if (newState !== session.state) {
558
- store.updateState(session.agentName, newState);
559
- session.state = newState;
817
+ const outcome = store.tryTransitionState(session.agentName, newState);
818
+ if (outcome.ok) {
819
+ session.state = newState;
820
+ } else if (outcome.reason === "illegal_transition") {
821
+ // Resync local mirror — another writer settled state durably.
822
+ session.state = outcome.prev;
823
+ }
560
824
  }
561
825
 
562
826
  if (onHealthCheck) {
@@ -568,12 +832,41 @@ export async function runDaemonTick(options: DaemonOptions): Promise<void> {
568
832
  const reason = check.reconciliationNote ?? "Process terminated";
569
833
  await recordFailureFn(root, session, reason, 0);
570
834
 
571
- // Kill the agent: headless agents are killed via PID, TUI agents via tmux
572
- await killAgent({ session, tmuxAlive, tmux, process: proc });
573
- store.updateState(session.agentName, "zombie");
835
+ // Kill the agent: prefer conn.abort(), fall back to PID/tmux
836
+ await killAgent({
837
+ session,
838
+ tmuxAlive,
839
+ tmux,
840
+ process: proc,
841
+ getConnection: getConn,
842
+ removeConnection: removeConn,
843
+ });
844
+ // Matrix-guarded: rejected when state is `completed` so a clean
845
+ // `ov stop` cannot be silently downgraded to zombie by a late
846
+ // watchdog termination (overstory-a993).
847
+ const outcome = store.tryTransitionState(session.agentName, "zombie");
574
848
  // Reset escalation tracking on terminal state
575
849
  store.updateEscalation(session.agentName, 0, null);
576
- session.state = "zombie";
850
+ if (outcome.ok) {
851
+ session.state = "zombie";
852
+ // First-time zombify: notify parent so it doesn't block on
853
+ // missing `worker_done` mail (overstory-c111). Dedup uses the
854
+ // pre-tick snapshot because the matrix allows the idempotent
855
+ // zombie → zombie transition (both `outcome.ok` and the earlier
856
+ // transitionState call would otherwise mask re-ticks).
857
+ if (notifyParentOnDeath && stateBeforeTick !== "zombie") {
858
+ notifyParentOfDeath({
859
+ session,
860
+ mailStore,
861
+ reason,
862
+ tier: 0,
863
+ eventStore,
864
+ runId,
865
+ });
866
+ }
867
+ } else if (outcome.reason === "illegal_transition") {
868
+ session.state = outcome.prev;
869
+ }
577
870
  session.escalationLevel = 0;
578
871
  session.stalledSince = null;
579
872
  } else if (check.action === "investigate") {
@@ -581,6 +874,21 @@ export async function runDaemonTick(options: DaemonOptions): Promise<void> {
581
874
  // Log the conflict but do NOT auto-kill.
582
875
  // The onHealthCheck callback surfaces this to the operator.
583
876
  // No state change — keep zombie until a human or higher-tier agent decides.
877
+ } else if (check.action === "complete") {
878
+ // ZFC fallback: tmux/pid is gone AND lastActivity is stale —
879
+ // the agent looks like it finished naturally and only the
880
+ // session-end hook missed (overstory-e74b). Mark completed
881
+ // without killing (process is already gone) and without
882
+ // notifying parents of death (this is not a crash).
883
+ const outcome = store.tryTransitionState(session.agentName, "completed");
884
+ if (outcome.ok) {
885
+ session.state = "completed";
886
+ } else if (outcome.reason === "illegal_transition") {
887
+ session.state = outcome.prev;
888
+ }
889
+ store.updateEscalation(session.agentName, 0, null);
890
+ session.escalationLevel = 0;
891
+ session.stalledSince = null;
584
892
  } else if (check.action === "escalate") {
585
893
  // Decision gate check: if the agent sent a decision_gate message, it is
586
894
  // intentionally paused waiting for a human decision — not a stall.
@@ -635,12 +943,32 @@ export async function runDaemonTick(options: DaemonOptions): Promise<void> {
635
943
  recordFailure: recordFailureFn,
636
944
  triageCount,
637
945
  maxTriagePerTick,
946
+ getConnection: getConn,
947
+ removeConnection: removeConn,
638
948
  });
639
949
 
640
950
  if (actionResult.terminated) {
641
- store.updateState(session.agentName, "zombie");
951
+ // Matrix-guarded: completed → zombie is rejected (overstory-a993).
952
+ const outcome = store.tryTransitionState(session.agentName, "zombie");
642
953
  store.updateEscalation(session.agentName, 0, null);
643
- session.state = "zombie";
954
+ if (outcome.ok) {
955
+ session.state = "zombie";
956
+ // First-time zombify: notify parent so it doesn't block on
957
+ // missing `worker_done` mail (overstory-c111). Dedup via
958
+ // the pre-tick snapshot — see the terminate branch above.
959
+ if (notifyParentOnDeath && stateBeforeTick !== "zombie") {
960
+ notifyParentOfDeath({
961
+ session,
962
+ mailStore,
963
+ reason: actionResult.deathReason ?? "Watchdog escalation terminated agent",
964
+ tier: actionResult.deathTier ?? 0,
965
+ eventStore,
966
+ runId,
967
+ });
968
+ }
969
+ } else if (outcome.reason === "illegal_transition") {
970
+ session.state = outcome.prev;
971
+ }
644
972
  session.escalationLevel = 0;
645
973
  session.stalledSince = null;
646
974
  }
@@ -664,10 +992,18 @@ export async function runDaemonTick(options: DaemonOptions): Promise<void> {
664
992
 
665
993
  // === Run-level completion detection ===
666
994
  // After monitoring individual sessions, check if the entire run is done.
667
- if (runId) {
995
+ // Re-resolve the run id defensively (overstory-87bf): a missing
996
+ // current-run.txt or a stale id (no row in runs table) skips the check
997
+ // and emits one warning per cause for the lifetime of this watchdog.
998
+ const validatedRunId = await resolveRunIdForCompletionCheck(
999
+ overstoryDir,
1000
+ runStore,
1001
+ runIdWarnState,
1002
+ );
1003
+ if (validatedRunId) {
668
1004
  await checkRunCompletion({
669
1005
  store,
670
- runId,
1006
+ runId: validatedRunId,
671
1007
  overstoryDir,
672
1008
  root,
673
1009
  nudge,
@@ -692,6 +1028,14 @@ export async function runDaemonTick(options: DaemonOptions): Promise<void> {
692
1028
  // Non-fatal
693
1029
  }
694
1030
  }
1031
+ // Close RunStore only if we created it (not injected)
1032
+ if (runStore && ownRunStore) {
1033
+ try {
1034
+ runStore.close();
1035
+ } catch {
1036
+ // Non-fatal
1037
+ }
1038
+ }
695
1039
  }
696
1040
  }
697
1041
 
@@ -741,7 +1085,15 @@ async function executeEscalationAction(ctx: {
741
1085
  tier: 0 | 1,
742
1086
  triageSuggestion?: string,
743
1087
  ) => Promise<void>;
744
- }): Promise<{ terminated: boolean; stateChanged: boolean }> {
1088
+ getConnection: (name: string) => RuntimeConnection | undefined;
1089
+ removeConnection: (name: string) => void;
1090
+ }): Promise<{
1091
+ terminated: boolean;
1092
+ stateChanged: boolean;
1093
+ /** Reason and tier of the termination (only set when `terminated` is true). */
1094
+ deathReason?: string;
1095
+ deathTier?: 0 | 1;
1096
+ }> {
745
1097
  const {
746
1098
  session,
747
1099
  root,
@@ -756,6 +1108,8 @@ async function executeEscalationAction(ctx: {
756
1108
  recordFailure,
757
1109
  triageCount,
758
1110
  maxTriagePerTick,
1111
+ getConnection: getConn,
1112
+ removeConnection: removeConn,
759
1113
  } = ctx;
760
1114
 
761
1115
  switch (session.escalationLevel) {
@@ -832,16 +1186,23 @@ async function executeEscalationAction(ctx: {
832
1186
 
833
1187
  if (result.verdict === "terminate") {
834
1188
  // Record the failure via mulch (Tier 1 AI triage)
835
- await recordFailure(
836
- root,
837
- session,
838
- "AI triage classified as terminal failure",
839
- 1,
840
- result.verdict,
841
- );
1189
+ const triageReason = "AI triage classified as terminal failure";
1190
+ await recordFailure(root, session, triageReason, 1, result.verdict);
842
1191
 
843
- await killAgent({ session, tmuxAlive, tmux, process: proc });
844
- return { terminated: true, stateChanged: true };
1192
+ await killAgent({
1193
+ session,
1194
+ tmuxAlive,
1195
+ tmux,
1196
+ process: proc,
1197
+ getConnection: getConn,
1198
+ removeConnection: removeConn,
1199
+ });
1200
+ return {
1201
+ terminated: true,
1202
+ stateChanged: true,
1203
+ deathReason: triageReason,
1204
+ deathTier: 1,
1205
+ };
845
1206
  }
846
1207
 
847
1208
  if (result.verdict === "retry") {
@@ -874,10 +1235,23 @@ async function executeEscalationAction(ctx: {
874
1235
  });
875
1236
 
876
1237
  // Record the failure via mulch (Tier 0: progressive escalation to terminal level)
877
- await recordFailure(root, session, "Progressive escalation reached terminal level", 0);
1238
+ const escalationReason = "Progressive escalation reached terminal level";
1239
+ await recordFailure(root, session, escalationReason, 0);
878
1240
 
879
- await killAgent({ session, tmuxAlive, tmux, process: proc });
880
- return { terminated: true, stateChanged: true };
1241
+ await killAgent({
1242
+ session,
1243
+ tmuxAlive,
1244
+ tmux,
1245
+ process: proc,
1246
+ getConnection: getConn,
1247
+ removeConnection: removeConn,
1248
+ });
1249
+ return {
1250
+ terminated: true,
1251
+ stateChanged: true,
1252
+ deathReason: escalationReason,
1253
+ deathTier: 0,
1254
+ };
881
1255
  }
882
1256
  }
883
1257
  }