@os-eco/overstory-cli 0.9.4 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124) hide show
  1. package/README.md +50 -19
  2. package/agents/builder.md +19 -9
  3. package/agents/coordinator.md +6 -6
  4. package/agents/lead.md +204 -87
  5. package/agents/merger.md +25 -14
  6. package/agents/reviewer.md +22 -16
  7. package/agents/scout.md +17 -12
  8. package/package.json +6 -3
  9. package/src/agents/capabilities.test.ts +85 -0
  10. package/src/agents/capabilities.ts +125 -0
  11. package/src/agents/headless-mail-injector.test.ts +448 -0
  12. package/src/agents/headless-mail-injector.ts +219 -0
  13. package/src/agents/headless-prompt.test.ts +102 -0
  14. package/src/agents/headless-prompt.ts +68 -0
  15. package/src/agents/hooks-deployer.test.ts +514 -14
  16. package/src/agents/hooks-deployer.ts +141 -0
  17. package/src/agents/mail-poll-detect.test.ts +153 -0
  18. package/src/agents/mail-poll-detect.ts +73 -0
  19. package/src/agents/overlay.test.ts +60 -4
  20. package/src/agents/overlay.ts +63 -8
  21. package/src/agents/scope-detect.test.ts +190 -0
  22. package/src/agents/scope-detect.ts +146 -0
  23. package/src/agents/turn-lock.test.ts +181 -0
  24. package/src/agents/turn-lock.ts +235 -0
  25. package/src/agents/turn-runner-dispatch.test.ts +182 -0
  26. package/src/agents/turn-runner-dispatch.ts +105 -0
  27. package/src/agents/turn-runner.test.ts +2312 -0
  28. package/src/agents/turn-runner.ts +1383 -0
  29. package/src/commands/agents.ts +9 -0
  30. package/src/commands/clean.ts +54 -0
  31. package/src/commands/coordinator.test.ts +254 -0
  32. package/src/commands/coordinator.ts +273 -8
  33. package/src/commands/dashboard.test.ts +188 -0
  34. package/src/commands/dashboard.ts +14 -4
  35. package/src/commands/doctor.ts +3 -1
  36. package/src/commands/group.test.ts +94 -0
  37. package/src/commands/group.ts +49 -20
  38. package/src/commands/init.test.ts +8 -0
  39. package/src/commands/init.ts +8 -1
  40. package/src/commands/log.test.ts +187 -11
  41. package/src/commands/log.ts +171 -71
  42. package/src/commands/mail.test.ts +162 -0
  43. package/src/commands/mail.ts +64 -9
  44. package/src/commands/merge.test.ts +230 -1
  45. package/src/commands/merge.ts +68 -12
  46. package/src/commands/nudge.test.ts +351 -4
  47. package/src/commands/nudge.ts +356 -34
  48. package/src/commands/run.test.ts +43 -7
  49. package/src/commands/serve/build.test.ts +202 -0
  50. package/src/commands/serve/build.ts +206 -0
  51. package/src/commands/serve/coordinator-actions.test.ts +339 -0
  52. package/src/commands/serve/coordinator-actions.ts +408 -0
  53. package/src/commands/serve/dev.test.ts +168 -0
  54. package/src/commands/serve/dev.ts +117 -0
  55. package/src/commands/serve/mail-actions.test.ts +312 -0
  56. package/src/commands/serve/mail-actions.ts +167 -0
  57. package/src/commands/serve/rest.test.ts +1323 -0
  58. package/src/commands/serve/rest.ts +708 -0
  59. package/src/commands/serve/static.ts +51 -0
  60. package/src/commands/serve/ws.test.ts +361 -0
  61. package/src/commands/serve/ws.ts +332 -0
  62. package/src/commands/serve.test.ts +459 -0
  63. package/src/commands/serve.ts +565 -0
  64. package/src/commands/sling.test.ts +177 -1
  65. package/src/commands/sling.ts +243 -71
  66. package/src/commands/status.test.ts +9 -0
  67. package/src/commands/status.ts +12 -4
  68. package/src/commands/stop.test.ts +255 -1
  69. package/src/commands/stop.ts +107 -8
  70. package/src/commands/watch.test.ts +43 -0
  71. package/src/commands/watch.ts +153 -28
  72. package/src/config.ts +23 -0
  73. package/src/doctor/consistency.test.ts +106 -0
  74. package/src/doctor/consistency.ts +48 -1
  75. package/src/doctor/serve.test.ts +95 -0
  76. package/src/doctor/serve.ts +86 -0
  77. package/src/doctor/types.ts +2 -1
  78. package/src/doctor/watchdog.ts +57 -1
  79. package/src/events/tailer.test.ts +234 -1
  80. package/src/events/tailer.ts +90 -0
  81. package/src/index.ts +57 -6
  82. package/src/insights/quality-gates.test.ts +141 -0
  83. package/src/insights/quality-gates.ts +156 -0
  84. package/src/json.ts +29 -0
  85. package/src/logging/theme.ts +4 -0
  86. package/src/mail/client.ts +15 -2
  87. package/src/mail/store.test.ts +82 -0
  88. package/src/mail/store.ts +41 -4
  89. package/src/merge/lock.test.ts +149 -0
  90. package/src/merge/lock.ts +140 -0
  91. package/src/merge/predict.test.ts +387 -0
  92. package/src/merge/predict.ts +249 -0
  93. package/src/merge/resolver.ts +1 -1
  94. package/src/mulch/client.ts +3 -3
  95. package/src/runtimes/__fixtures__/claude-stream-fixture.ts +22 -0
  96. package/src/runtimes/claude.test.ts +791 -1
  97. package/src/runtimes/claude.ts +323 -1
  98. package/src/runtimes/connections.test.ts +141 -1
  99. package/src/runtimes/connections.ts +73 -4
  100. package/src/runtimes/headless-connection.test.ts +264 -0
  101. package/src/runtimes/headless-connection.ts +158 -0
  102. package/src/runtimes/types.ts +10 -0
  103. package/src/schema-consistency.test.ts +1 -0
  104. package/src/sessions/store.test.ts +657 -29
  105. package/src/sessions/store.ts +286 -23
  106. package/src/test-setup.test.ts +31 -0
  107. package/src/test-setup.ts +28 -0
  108. package/src/types.ts +107 -2
  109. package/src/utils/pid.test.ts +85 -1
  110. package/src/utils/pid.ts +86 -1
  111. package/src/utils/process-scan.test.ts +53 -0
  112. package/src/utils/process-scan.ts +76 -0
  113. package/src/watchdog/daemon.test.ts +1607 -376
  114. package/src/watchdog/daemon.ts +462 -88
  115. package/src/watchdog/health.test.ts +282 -0
  116. package/src/watchdog/health.ts +126 -27
  117. package/src/worktree/manager.test.ts +218 -1
  118. package/src/worktree/manager.ts +55 -0
  119. package/src/worktree/process.test.ts +71 -0
  120. package/src/worktree/process.ts +25 -5
  121. package/src/worktree/tmux.test.ts +28 -0
  122. package/src/worktree/tmux.ts +27 -3
  123. package/templates/CLAUDE.md.tmpl +19 -8
  124. package/templates/overlay.md.tmpl +5 -2
@@ -103,6 +103,67 @@ describe("evaluateHealth", () => {
103
103
  expect(check.reconciliationNote).toBeNull();
104
104
  });
105
105
 
106
+ // --- ZFC Rule 1 fallback: tmux dead + stale lastActivity → completed ---
107
+
108
+ test("ZFC fallback: tmux dead + stale lastActivity (working) → complete (missed signal)", () => {
109
+ const staleActivity = new Date(Date.now() - 60_000).toISOString();
110
+ const session = makeSession({ state: "working", lastActivity: staleActivity });
111
+ const check = evaluateHealth(session, false, THRESHOLDS);
112
+
113
+ expect(check.state).toBe("completed");
114
+ expect(check.action).toBe("complete");
115
+ expect(check.tmuxAlive).toBe(false);
116
+ expect(check.processAlive).toBe(false);
117
+ expect(check.reconciliationNote).toContain("missed session-end signal");
118
+ });
119
+
120
+ test("ZFC fallback: tmux dead + stale lastActivity (stalled) → complete (missed signal)", () => {
121
+ const staleActivity = new Date(Date.now() - 90_000).toISOString();
122
+ const session = makeSession({ state: "stalled", lastActivity: staleActivity });
123
+ const check = evaluateHealth(session, false, THRESHOLDS);
124
+
125
+ expect(check.state).toBe("completed");
126
+ expect(check.action).toBe("complete");
127
+ });
128
+
129
+ test("ZFC: tmux dead + recent lastActivity → still zombie (true crash)", () => {
130
+ const recentActivity = new Date(Date.now() - 1_000).toISOString();
131
+ const session = makeSession({ state: "working", lastActivity: recentActivity });
132
+ const check = evaluateHealth(session, false, THRESHOLDS);
133
+
134
+ expect(check.state).toBe("zombie");
135
+ expect(check.action).toBe("terminate");
136
+ });
137
+
138
+ test("ZFC fallback (headless): pid dead + stale lastActivity → complete", () => {
139
+ const staleActivity = new Date(Date.now() - 60_000).toISOString();
140
+ const session = makeSession({
141
+ state: "working",
142
+ tmuxSession: "",
143
+ pid: DEAD_PID,
144
+ lastActivity: staleActivity,
145
+ });
146
+ const check = evaluateHealth(session, false, THRESHOLDS);
147
+
148
+ expect(check.state).toBe("completed");
149
+ expect(check.action).toBe("complete");
150
+ expect(check.reconciliationNote).toContain("missed session-end signal");
151
+ });
152
+
153
+ test("ZFC (headless): pid dead + recent lastActivity → still zombie", () => {
154
+ const recentActivity = new Date(Date.now() - 1_000).toISOString();
155
+ const session = makeSession({
156
+ state: "working",
157
+ tmuxSession: "",
158
+ pid: DEAD_PID,
159
+ lastActivity: recentActivity,
160
+ });
161
+ const check = evaluateHealth(session, false, THRESHOLDS);
162
+
163
+ expect(check.state).toBe("zombie");
164
+ expect(check.action).toBe("terminate");
165
+ });
166
+
106
167
  // --- ZFC Rule 2: tmux alive + sessions.json says zombie → investigate ---
107
168
 
108
169
  test("ZFC: tmux alive + sessions.json says zombie → investigate (don't auto-kill)", () => {
@@ -432,6 +493,162 @@ describe("headless agents (tmuxSession empty, PID-based lifecycle)", () => {
432
493
  });
433
494
  });
434
495
 
496
+ // === Spawn-per-turn workers (tmuxSession === '' && pid === null) ===
497
+
498
+ describe("spawn-per-turn workers (overstory-7a34)", () => {
499
+ // Spawn-per-turn workers (builder/scout/reviewer/lead/merger under the
500
+ // headless default) have no persistent process between turns. The previous
501
+ // "headless" branch only matched pid !== null, so these sessions fell into
502
+ // the TUI/tmux path where tmuxAlive=false → ZFC Rule 1 → zombie within
503
+ // seconds of sling, despite being actively executing tools (overstory-7a34).
504
+
505
+ test("freshly slung spawn-per-turn lead (booting, no pid, no tmux) → between_turns (overstory-3087)", () => {
506
+ // Spec change: spawn-per-turn workers report `between_turns` instead
507
+ // of `working` for the healthy classification, including the booting
508
+ // → healthy transition. The turn-runner authoritatively writes
509
+ // `in_turn` once the first parser event of a turn arrives.
510
+ const session = makeSession({
511
+ tmuxSession: "",
512
+ pid: null,
513
+ capability: "lead",
514
+ state: "booting",
515
+ lastActivity: new Date().toISOString(),
516
+ });
517
+ const check = evaluateHealth(session, false, THRESHOLDS);
518
+
519
+ expect(check.state).toBe("between_turns");
520
+ expect(check.action).toBe("none");
521
+ expect(check.reconciliationNote).toBeNull();
522
+ });
523
+
524
+ test("legacy spawn-per-turn worker still at 'working' is reported as between_turns (overstory-3087)", () => {
525
+ // A row that predates the substate split (state=working) gets
526
+ // reclassified to `between_turns` by the watchdog's healthy-state
527
+ // reporter. transitionState then promotes the row forward (working
528
+ // and between_turns share rank 1 in STATE_ORDER, so the actual
529
+ // promotion happens via tryTransitionState elsewhere — here we just
530
+ // verify the check itself reports the new substate).
531
+ const session = makeSession({
532
+ tmuxSession: "",
533
+ pid: null,
534
+ capability: "builder",
535
+ state: "working",
536
+ lastActivity: new Date(Date.now() - 5_000).toISOString(),
537
+ });
538
+ const check = evaluateHealth(session, false, THRESHOLDS);
539
+
540
+ expect(check.state).toBe("between_turns");
541
+ expect(check.action).toBe("none");
542
+ });
543
+
544
+ test("spawn-per-turn worker between turns (recent activity) → between_turns, NOT zombie (overstory-3087)", () => {
545
+ // Repro of overstory-7a34: ov sling --capability lead any-task; within
546
+ // ~30s ov dashboard previously showed state='zombie' while ov feed
547
+ // showed live tool calls. The healthy classification now lands
548
+ // between_turns; the test still verifies that recent activity does
549
+ // not trigger zombie classification.
550
+ const session = makeSession({
551
+ tmuxSession: "",
552
+ pid: null,
553
+ capability: "lead",
554
+ state: "working",
555
+ lastActivity: new Date().toISOString(),
556
+ });
557
+ const check = evaluateHealth(session, false, THRESHOLDS);
558
+
559
+ expect(check.state).toBe("between_turns");
560
+ expect(check.action).toBe("none");
561
+ });
562
+
563
+ test("spawn-per-turn worker with stale activity → stalled", () => {
564
+ const session = makeSession({
565
+ tmuxSession: "",
566
+ pid: null,
567
+ capability: "builder",
568
+ state: "working",
569
+ lastActivity: new Date(Date.now() - 60_000).toISOString(),
570
+ });
571
+ const check = evaluateHealth(session, false, THRESHOLDS);
572
+
573
+ expect(check.state).toBe("stalled");
574
+ expect(check.action).toBe("escalate");
575
+ });
576
+
577
+ test("spawn-per-turn worker with zombie-level staleness → zombie, terminate", () => {
578
+ const session = makeSession({
579
+ tmuxSession: "",
580
+ pid: null,
581
+ capability: "builder",
582
+ state: "working",
583
+ lastActivity: new Date(Date.now() - 200_000).toISOString(),
584
+ });
585
+ const check = evaluateHealth(session, false, THRESHOLDS);
586
+
587
+ expect(check.state).toBe("zombie");
588
+ expect(check.action).toBe("terminate");
589
+ });
590
+
591
+ test("spawn-per-turn worker that already completed → skips monitoring", () => {
592
+ const session = makeSession({
593
+ tmuxSession: "",
594
+ pid: null,
595
+ capability: "builder",
596
+ state: "completed",
597
+ });
598
+ const check = evaluateHealth(session, false, THRESHOLDS);
599
+
600
+ expect(check.state).toBe("completed");
601
+ expect(check.action).toBe("none");
602
+ });
603
+
604
+ test("preserves in_turn for healthy spawn-per-turn worker (overstory-3087)", () => {
605
+ // A spawn-per-turn worker the turn-runner has marked in_turn must
606
+ // have its state preserved by the health evaluation when activity is
607
+ // recent — otherwise the watchdog would stomp the substate back to
608
+ // `working` and the UI would lose the distinction between mid-turn
609
+ // and idling.
610
+ const session = makeSession({
611
+ tmuxSession: "",
612
+ pid: null,
613
+ capability: "builder",
614
+ state: "in_turn",
615
+ lastActivity: new Date().toISOString(),
616
+ });
617
+ const check = evaluateHealth(session, false, THRESHOLDS);
618
+
619
+ expect(check.state).toBe("in_turn");
620
+ expect(check.action).toBe("none");
621
+ });
622
+
623
+ test("preserves between_turns for healthy spawn-per-turn worker (overstory-3087)", () => {
624
+ const session = makeSession({
625
+ tmuxSession: "",
626
+ pid: null,
627
+ capability: "builder",
628
+ state: "between_turns",
629
+ lastActivity: new Date().toISOString(),
630
+ });
631
+ const check = evaluateHealth(session, false, THRESHOLDS);
632
+
633
+ expect(check.state).toBe("between_turns");
634
+ expect(check.action).toBe("none");
635
+ });
636
+
637
+ test("escalates an in_turn worker with stale activity to stalled (overstory-3087)", () => {
638
+ const session = makeSession({
639
+ tmuxSession: "",
640
+ pid: null,
641
+ capability: "builder",
642
+ state: "in_turn",
643
+ lastActivity: new Date(Date.now() - 60_000).toISOString(),
644
+ });
645
+ const check = evaluateHealth(session, false, THRESHOLDS);
646
+
647
+ expect(check.state).toBe("stalled");
648
+ expect(check.action).toBe("escalate");
649
+ });
650
+ });
651
+
435
652
  // === transitionState ===
436
653
 
437
654
  describe("transitionState", () => {
@@ -545,4 +762,69 @@ describe("transitionState", () => {
545
762
  // the state should NOT advance
546
763
  expect(transitionState("working", check)).toBe("working");
547
764
  });
765
+
766
+ // --- in_turn / between_turns coexist with working at the active rank (overstory-3087) ---
767
+
768
+ test("preserves in_turn when watchdog reports a healthy 'working' check", () => {
769
+ // The watchdog's healthy-classification check returns state=working;
770
+ // since in_turn shares rank 1 with working, transitionState must not
771
+ // advance and the spawn-per-turn substate the turn-runner wrote stays.
772
+ const check = {
773
+ state: "working" as const,
774
+ agentName: "a",
775
+ timestamp: "",
776
+ tmuxAlive: true,
777
+ pidAlive: true as boolean | null,
778
+ lastActivity: "",
779
+ processAlive: true,
780
+ action: "none" as const,
781
+ reconciliationNote: null,
782
+ };
783
+ expect(transitionState("in_turn", check)).toBe("in_turn");
784
+ });
785
+
786
+ test("preserves between_turns when watchdog reports a healthy 'working' check", () => {
787
+ const check = {
788
+ state: "working" as const,
789
+ agentName: "a",
790
+ timestamp: "",
791
+ tmuxAlive: true,
792
+ pidAlive: true as boolean | null,
793
+ lastActivity: "",
794
+ processAlive: true,
795
+ action: "none" as const,
796
+ reconciliationNote: null,
797
+ };
798
+ expect(transitionState("between_turns", check)).toBe("between_turns");
799
+ });
800
+
801
+ test("advances in_turn → stalled when the watchdog escalates", () => {
802
+ const check = {
803
+ state: "stalled" as const,
804
+ agentName: "a",
805
+ timestamp: "",
806
+ tmuxAlive: true,
807
+ pidAlive: true as boolean | null,
808
+ lastActivity: "",
809
+ processAlive: true,
810
+ action: "escalate" as const,
811
+ reconciliationNote: null,
812
+ };
813
+ expect(transitionState("in_turn", check)).toBe("stalled");
814
+ });
815
+
816
+ test("advances between_turns → zombie when the watchdog terminates", () => {
817
+ const check = {
818
+ state: "zombie" as const,
819
+ agentName: "a",
820
+ timestamp: "",
821
+ tmuxAlive: false,
822
+ pidAlive: false as boolean | null,
823
+ lastActivity: "",
824
+ processAlive: false,
825
+ action: "terminate" as const,
826
+ reconciliationNote: null,
827
+ };
828
+ expect(transitionState("between_turns", check)).toBe("zombie");
829
+ });
548
830
  });
@@ -30,22 +30,25 @@
30
30
  * table are always up-to-date because they reflect real kernel state.
31
31
  */
32
32
 
33
+ import { isPersistentCapability } from "../agents/capabilities.ts";
33
34
  import type { AgentSession, AgentState, HealthCheck } from "../types.ts";
34
35
 
35
36
  /**
36
- * Agent capabilities that run as persistent interactive sessions.
37
- * These agents are expected to have long idle periods (e.g. coordinator waiting
38
- * for worker mail) and should NOT be flagged stale/zombie based on lastActivity.
39
- * Only tmux/pid liveness checks apply to them.
37
+ * Numeric ordering for forward-only state transitions.
40
38
  *
41
- * Shared concept with src/commands/log.ts:PERSISTENT_CAPABILITIES.
39
+ * `in_turn` and `between_turns` share the `working` rank (1) because, from
40
+ * the watchdog's perspective, all three are "agent is alive and active" —
41
+ * they only differ in whether the spawn-per-turn worker is currently
42
+ * mid-execution or idling between mail batches (overstory-3087). Same rank
43
+ * means a healthy-classification check (`check.state === "working"`) will
44
+ * not stomp on the more specific in_turn/between_turns states the
45
+ * turn-runner has already written.
42
46
  */
43
- const PERSISTENT_CAPABILITIES = new Set(["coordinator", "orchestrator", "monitor"]);
44
-
45
- /** Numeric ordering for forward-only state transitions. */
46
47
  const STATE_ORDER: Record<AgentState, number> = {
47
48
  booting: 0,
48
49
  working: 1,
50
+ in_turn: 1,
51
+ between_turns: 1,
49
52
  completed: 2,
50
53
  stalled: 3,
51
54
  zombie: 4,
@@ -71,15 +74,34 @@ export function isProcessRunning(pid: number): boolean {
71
74
  }
72
75
 
73
76
  /**
74
- * Detect whether a session is a headless agent.
77
+ * Detect whether a session is a long-lived headless agent.
75
78
  *
76
- * Headless agents are spawned without a tmux session (tmuxSession === '') and
77
- * are tracked solely by PID. For these agents, PID is the primary liveness signal.
79
+ * Long-lived headless agents (coordinator, orchestrator, monitor, sapling, etc.)
80
+ * have no tmux session (tmuxSession === '') but do have a persistent process
81
+ * so `session.pid` is non-null and PID is the primary liveness signal.
78
82
  */
79
83
  function isHeadlessSession(session: AgentSession): boolean {
80
84
  return session.tmuxSession === "" && session.pid !== null;
81
85
  }
82
86
 
87
+ /**
88
+ * Detect whether a session is a spawn-per-turn worker between turns.
89
+ *
90
+ * Spawn-per-turn workers (task-scoped capabilities under the new headless
91
+ * default — builder/scout/reviewer/lead/merger) have no tmux session AND no
92
+ * persistent process: `tmuxSession === ''` and `session.pid === null` from
93
+ * sling onward. The per-turn claude PID lives in
94
+ * `.overstory/agents/<name>/turn.pid` only while a turn is in flight.
95
+ *
96
+ * "No process" is the normal state between turns, so neither tmux liveness nor
97
+ * pid liveness can be used as a death signal — only `lastActivity` recency
98
+ * (refreshed by the turn-runner on every event and by the watchdog from
99
+ * events.db) can. (overstory-7a34)
100
+ */
101
+ export function isSpawnPerTurnSession(session: AgentSession): boolean {
102
+ return session.tmuxSession === "" && session.pid === null;
103
+ }
104
+
83
105
  /**
84
106
  * Evaluate time-based health (persistent capability exemptions, stale, zombie thresholds,
85
107
  * booting→working transition). Called after liveness is confirmed for both TUI and headless paths.
@@ -98,7 +120,7 @@ function evaluateTimeBased(
98
120
  // Persistent capabilities (coordinator, monitor) are expected to have long idle
99
121
  // periods waiting for mail/events. Skip time-based stale/zombie detection for
100
122
  // them — only tmux/pid liveness matters (checked above).
101
- if (PERSISTENT_CAPABILITIES.has(session.capability)) {
123
+ if (isPersistentCapability(session.capability)) {
102
124
  // Transition booting → working if we reach here (process alive)
103
125
  const state = session.state === "booting" ? "working" : session.state;
104
126
  return {
@@ -135,22 +157,42 @@ function evaluateTimeBased(
135
157
  };
136
158
  }
137
159
 
138
- // booting → transition to working once there's recent activity
160
+ // Spawn-per-turn workers (overstory-3087): healthy classification reports
161
+ // `between_turns` instead of `working`, including the booting → healthy
162
+ // transition. The turn-runner authoritatively writes `in_turn` /
163
+ // `between_turns` while a turn is alive; in_turn is preserved here when
164
+ // already set so a watchdog tick mid-turn does not overwrite it.
165
+ const isSpawnPerTurn = isSpawnPerTurnSession(session);
166
+
167
+ // booting → transition to the healthy state once there's recent activity.
139
168
  if (session.state === "booting") {
140
169
  return {
141
170
  ...base,
142
171
  processAlive: true,
143
- state: "working",
172
+ state: isSpawnPerTurn ? "between_turns" : "working",
144
173
  action: "none",
145
174
  reconciliationNote: null,
146
175
  };
147
176
  }
148
177
 
149
- // Default: healthy and working
178
+ // Default: healthy active state. For spawn-per-turn workers report the
179
+ // existing in_turn/between_turns substate; for tmux/long-lived agents
180
+ // report `working`. The turn-runner is authoritative for in_turn ↔
181
+ // between_turns transitions, so the watchdog must not stomp the more
182
+ // specific state — same rank in STATE_ORDER ensures `transitionState`
183
+ // also leaves the row alone.
184
+ let healthyState: AgentState;
185
+ if (session.state === "in_turn" || session.state === "between_turns") {
186
+ healthyState = session.state;
187
+ } else if (isSpawnPerTurn) {
188
+ healthyState = "between_turns";
189
+ } else {
190
+ healthyState = "working";
191
+ }
150
192
  return {
151
193
  ...base,
152
194
  processAlive: true,
153
- state: "working",
195
+ state: healthyState,
154
196
  action: "none",
155
197
  reconciliationNote: null,
156
198
  };
@@ -165,19 +207,23 @@ function evaluateTimeBased(
165
207
  * Decision logic (in priority order):
166
208
  *
167
209
  * 1. Completed agents skip monitoring entirely.
168
- * 2. Headless agents (tmuxSession === ''): PID is primary liveness signal.
210
+ * 2. Spawn-per-turn workers (tmuxSession === '' && pid === null): no
211
+ * persistent process between turns — fall straight through to time-based
212
+ * checks driven by lastActivity. PID/tmux liveness are meaningless here.
213
+ * 3. Headless agents with persistent process (tmuxSession === '' && pid !== null):
214
+ * PID is primary liveness signal.
169
215
  * - pid dead → zombie, terminate.
170
216
  * - pid alive + state zombie → investigate.
171
217
  * - pid alive → fall through to time-based checks.
172
- * 3. tmux dead → zombie, terminate (regardless of what sessions.json says).
173
- * 4. tmux alive + sessions.json says zombie → investigate (don't auto-kill).
218
+ * 4. tmux dead → zombie, terminate (regardless of what sessions.json says).
219
+ * 5. tmux alive + sessions.json says zombie → investigate (don't auto-kill).
174
220
  * Something external marked this zombie, but the process is still running.
175
- * 5. pid dead + tmux alive → zombie, terminate. The agent process exited but
221
+ * 6. pid dead + tmux alive → zombie, terminate. The agent process exited but
176
222
  * the tmux pane shell survived. The agent is not doing work.
177
- * 6. lastActivity older than zombieMs → zombie, terminate.
178
- * 7. lastActivity older than staleMs → stalled, escalate.
179
- * 8. booting with recent activity → working.
180
- * 9. Otherwise → working, healthy.
223
+ * 7. lastActivity older than zombieMs → zombie, terminate.
224
+ * 8. lastActivity older than staleMs → stalled, escalate.
225
+ * 9. booting with recent activity → working (between_turns for spawn-per-turn workers).
226
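+ * 10. Otherwise → healthy: an existing in_turn/between_turns state is preserved, other spawn-per-turn workers report between_turns, everything else reports working.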
181
227
  *
182
228
  * @param session - The agent session to evaluate
183
229
  * @param tmuxAlive - Whether the agent's tmux session is still running
@@ -222,10 +268,37 @@ export function evaluateHealth(
222
268
  };
223
269
  }
224
270
 
271
+ // === Spawn-per-turn path: no persistent process between turns ===
272
+ // For these workers (overstory-7a34) `session.pid` is null by design and
273
+ // there is no tmux session. Liveness signals reduce to lastActivity
274
+ // recency: the turn-runner updates it on every parser event during a
275
+ // turn, and the watchdog refreshes it from events.db between turns. PID
276
+ // and tmux checks would always say "dead" and false-positive every fresh
277
+ // agent as zombie within seconds of sling.
278
+ if (isSpawnPerTurnSession(session)) {
279
+ return evaluateTimeBased(session, base, elapsedMs, thresholds);
280
+ }
281
+
225
282
  // === Headless path: PID is the primary liveness signal ===
226
283
  if (isHeadlessSession(session)) {
227
- // pid dead → zombie immediately (equivalent to ZFC Rule 1 for headless)
284
+ // pid dead: zombie OR completed-with-missed-signal.
285
+ // Distinguish by lastActivity age — recent activity means the agent
286
+ // crashed mid-work (true zombie); stale activity means it likely
287
+ // finished naturally and only the session-end hook didn't deliver
288
+ // (treat as completed). (overstory-e74b)
228
289
  if (pidAlive === false) {
290
+ if (
291
+ elapsedMs > thresholds.staleMs &&
292
+ (session.state === "working" || session.state === "booting" || session.state === "stalled")
293
+ ) {
294
+ return {
295
+ ...base,
296
+ processAlive: false,
297
+ state: "completed",
298
+ action: "complete",
299
+ reconciliationNote: `ZFC: headless pid ${session.pid} dead + stale lastActivity (${Math.round(elapsedMs / 1000)}s ago) — assumed completed (missed session-end signal)`,
300
+ };
301
+ }
229
302
  return {
230
303
  ...base,
231
304
  processAlive: false,
@@ -253,9 +326,25 @@ export function evaluateHealth(
253
326
 
254
327
  // === TUI/tmux path ===
255
328
 
256
- // ZFC Rule 1: tmux dead → zombie immediately, regardless of recorded state.
257
- // Observable state says the process is gone.
329
+ // ZFC Rule 1: tmux dead → zombie OR completed-with-missed-signal.
330
+ // Distinguish by lastActivity age — recent activity means the agent
331
+ // crashed mid-work (true zombie); stale activity means it likely
332
+ // finished naturally and only the session-end hook didn't deliver
333
+ // (treat as completed). (overstory-e74b)
258
334
  if (!tmuxAlive) {
335
+ if (
336
+ elapsedMs > thresholds.staleMs &&
337
+ (session.state === "working" || session.state === "booting" || session.state === "stalled")
338
+ ) {
339
+ return {
340
+ ...base,
341
+ processAlive: false,
342
+ state: "completed",
343
+ action: "complete",
344
+ reconciliationNote: `ZFC: tmux dead + stale lastActivity (${Math.round(elapsedMs / 1000)}s ago) — assumed completed (missed session-end signal)`,
345
+ };
346
+ }
347
+
259
348
  const note =
260
349
  session.state === "working" || session.state === "booting"
261
350
  ? `ZFC: tmux dead but sessions.json says "${session.state}" — marking zombie (observable state wins)`
@@ -323,6 +412,16 @@ export function transitionState(currentState: AgentState, check: HealthCheck): A
323
412
  return currentState;
324
413
  }
325
414
 
415
+ // `complete` is a terminal classification triggered when observable state
416
+ // proves the agent finished naturally (missed session-end signal —
417
+ // overstory-e74b). It bypasses the forward-only STATE_ORDER guard because
418
+ // `completed` (order 2) sits before `stalled` (order 3) and would
419
+ // otherwise be blocked from advancing the recorded state. The matrix in
420
+ // SessionStore.tryTransitionState still gates the actual write.
421
+ if (check.action === "complete") {
422
+ return check.state;
423
+ }
424
+
326
425
  const currentOrder = STATE_ORDER[currentState];
327
426
  const checkOrder = STATE_ORDER[check.state];
328
427