@os-eco/overstory-cli 0.9.4 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +50 -19
- package/agents/builder.md +19 -9
- package/agents/coordinator.md +6 -6
- package/agents/lead.md +204 -87
- package/agents/merger.md +25 -14
- package/agents/reviewer.md +22 -16
- package/agents/scout.md +17 -12
- package/package.json +6 -3
- package/src/agents/capabilities.test.ts +85 -0
- package/src/agents/capabilities.ts +125 -0
- package/src/agents/headless-mail-injector.test.ts +448 -0
- package/src/agents/headless-mail-injector.ts +219 -0
- package/src/agents/headless-prompt.test.ts +102 -0
- package/src/agents/headless-prompt.ts +68 -0
- package/src/agents/hooks-deployer.test.ts +514 -14
- package/src/agents/hooks-deployer.ts +141 -0
- package/src/agents/mail-poll-detect.test.ts +153 -0
- package/src/agents/mail-poll-detect.ts +73 -0
- package/src/agents/overlay.test.ts +60 -4
- package/src/agents/overlay.ts +63 -8
- package/src/agents/scope-detect.test.ts +190 -0
- package/src/agents/scope-detect.ts +146 -0
- package/src/agents/turn-lock.test.ts +181 -0
- package/src/agents/turn-lock.ts +235 -0
- package/src/agents/turn-runner-dispatch.test.ts +182 -0
- package/src/agents/turn-runner-dispatch.ts +105 -0
- package/src/agents/turn-runner.test.ts +2312 -0
- package/src/agents/turn-runner.ts +1383 -0
- package/src/commands/agents.ts +9 -0
- package/src/commands/clean.ts +54 -0
- package/src/commands/coordinator.test.ts +254 -0
- package/src/commands/coordinator.ts +273 -8
- package/src/commands/dashboard.test.ts +188 -0
- package/src/commands/dashboard.ts +14 -4
- package/src/commands/doctor.ts +3 -1
- package/src/commands/group.test.ts +94 -0
- package/src/commands/group.ts +49 -20
- package/src/commands/init.test.ts +8 -0
- package/src/commands/init.ts +8 -1
- package/src/commands/log.test.ts +187 -11
- package/src/commands/log.ts +171 -71
- package/src/commands/mail.test.ts +162 -0
- package/src/commands/mail.ts +64 -9
- package/src/commands/merge.test.ts +230 -1
- package/src/commands/merge.ts +68 -12
- package/src/commands/nudge.test.ts +351 -4
- package/src/commands/nudge.ts +356 -34
- package/src/commands/run.test.ts +43 -7
- package/src/commands/serve/build.test.ts +202 -0
- package/src/commands/serve/build.ts +206 -0
- package/src/commands/serve/coordinator-actions.test.ts +339 -0
- package/src/commands/serve/coordinator-actions.ts +408 -0
- package/src/commands/serve/dev.test.ts +168 -0
- package/src/commands/serve/dev.ts +117 -0
- package/src/commands/serve/mail-actions.test.ts +312 -0
- package/src/commands/serve/mail-actions.ts +167 -0
- package/src/commands/serve/rest.test.ts +1323 -0
- package/src/commands/serve/rest.ts +708 -0
- package/src/commands/serve/static.ts +51 -0
- package/src/commands/serve/ws.test.ts +361 -0
- package/src/commands/serve/ws.ts +332 -0
- package/src/commands/serve.test.ts +459 -0
- package/src/commands/serve.ts +565 -0
- package/src/commands/sling.test.ts +177 -1
- package/src/commands/sling.ts +243 -71
- package/src/commands/status.test.ts +9 -0
- package/src/commands/status.ts +12 -4
- package/src/commands/stop.test.ts +255 -1
- package/src/commands/stop.ts +107 -8
- package/src/commands/watch.test.ts +43 -0
- package/src/commands/watch.ts +153 -28
- package/src/config.ts +23 -0
- package/src/doctor/consistency.test.ts +106 -0
- package/src/doctor/consistency.ts +48 -1
- package/src/doctor/serve.test.ts +95 -0
- package/src/doctor/serve.ts +86 -0
- package/src/doctor/types.ts +2 -1
- package/src/doctor/watchdog.ts +57 -1
- package/src/events/tailer.test.ts +234 -1
- package/src/events/tailer.ts +90 -0
- package/src/index.ts +57 -6
- package/src/insights/quality-gates.test.ts +141 -0
- package/src/insights/quality-gates.ts +156 -0
- package/src/json.ts +29 -0
- package/src/logging/theme.ts +4 -0
- package/src/mail/client.ts +15 -2
- package/src/mail/store.test.ts +82 -0
- package/src/mail/store.ts +41 -4
- package/src/merge/lock.test.ts +149 -0
- package/src/merge/lock.ts +140 -0
- package/src/merge/predict.test.ts +387 -0
- package/src/merge/predict.ts +249 -0
- package/src/merge/resolver.ts +1 -1
- package/src/mulch/client.ts +3 -3
- package/src/runtimes/__fixtures__/claude-stream-fixture.ts +22 -0
- package/src/runtimes/claude.test.ts +791 -1
- package/src/runtimes/claude.ts +323 -1
- package/src/runtimes/connections.test.ts +141 -1
- package/src/runtimes/connections.ts +73 -4
- package/src/runtimes/headless-connection.test.ts +264 -0
- package/src/runtimes/headless-connection.ts +158 -0
- package/src/runtimes/types.ts +10 -0
- package/src/schema-consistency.test.ts +1 -0
- package/src/sessions/store.test.ts +657 -29
- package/src/sessions/store.ts +286 -23
- package/src/test-setup.test.ts +31 -0
- package/src/test-setup.ts +28 -0
- package/src/types.ts +107 -2
- package/src/utils/pid.test.ts +85 -1
- package/src/utils/pid.ts +86 -1
- package/src/utils/process-scan.test.ts +53 -0
- package/src/utils/process-scan.ts +76 -0
- package/src/watchdog/daemon.test.ts +1607 -376
- package/src/watchdog/daemon.ts +462 -88
- package/src/watchdog/health.test.ts +282 -0
- package/src/watchdog/health.ts +126 -27
- package/src/worktree/manager.test.ts +218 -1
- package/src/worktree/manager.ts +55 -0
- package/src/worktree/process.test.ts +71 -0
- package/src/worktree/process.ts +25 -5
- package/src/worktree/tmux.test.ts +28 -0
- package/src/worktree/tmux.ts +27 -3
- package/templates/CLAUDE.md.tmpl +19 -8
- package/templates/overlay.md.tmpl +5 -2
|
@@ -103,6 +103,67 @@ describe("evaluateHealth", () => {
|
|
|
103
103
|
expect(check.reconciliationNote).toBeNull();
|
|
104
104
|
});
|
|
105
105
|
|
|
106
|
+
// --- ZFC Rule 1 fallback: tmux dead + stale lastActivity → completed ---
|
|
107
|
+
|
|
108
|
+
test("ZFC fallback: tmux dead + stale lastActivity (working) → complete (missed signal)", () => {
|
|
109
|
+
const staleActivity = new Date(Date.now() - 60_000).toISOString();
|
|
110
|
+
const session = makeSession({ state: "working", lastActivity: staleActivity });
|
|
111
|
+
const check = evaluateHealth(session, false, THRESHOLDS);
|
|
112
|
+
|
|
113
|
+
expect(check.state).toBe("completed");
|
|
114
|
+
expect(check.action).toBe("complete");
|
|
115
|
+
expect(check.tmuxAlive).toBe(false);
|
|
116
|
+
expect(check.processAlive).toBe(false);
|
|
117
|
+
expect(check.reconciliationNote).toContain("missed session-end signal");
|
|
118
|
+
});
|
|
119
|
+
|
|
120
|
+
test("ZFC fallback: tmux dead + stale lastActivity (stalled) → complete (missed signal)", () => {
|
|
121
|
+
const staleActivity = new Date(Date.now() - 90_000).toISOString();
|
|
122
|
+
const session = makeSession({ state: "stalled", lastActivity: staleActivity });
|
|
123
|
+
const check = evaluateHealth(session, false, THRESHOLDS);
|
|
124
|
+
|
|
125
|
+
expect(check.state).toBe("completed");
|
|
126
|
+
expect(check.action).toBe("complete");
|
|
127
|
+
});
|
|
128
|
+
|
|
129
|
+
test("ZFC: tmux dead + recent lastActivity → still zombie (true crash)", () => {
|
|
130
|
+
const recentActivity = new Date(Date.now() - 1_000).toISOString();
|
|
131
|
+
const session = makeSession({ state: "working", lastActivity: recentActivity });
|
|
132
|
+
const check = evaluateHealth(session, false, THRESHOLDS);
|
|
133
|
+
|
|
134
|
+
expect(check.state).toBe("zombie");
|
|
135
|
+
expect(check.action).toBe("terminate");
|
|
136
|
+
});
|
|
137
|
+
|
|
138
|
+
test("ZFC fallback (headless): pid dead + stale lastActivity → complete", () => {
|
|
139
|
+
const staleActivity = new Date(Date.now() - 60_000).toISOString();
|
|
140
|
+
const session = makeSession({
|
|
141
|
+
state: "working",
|
|
142
|
+
tmuxSession: "",
|
|
143
|
+
pid: DEAD_PID,
|
|
144
|
+
lastActivity: staleActivity,
|
|
145
|
+
});
|
|
146
|
+
const check = evaluateHealth(session, false, THRESHOLDS);
|
|
147
|
+
|
|
148
|
+
expect(check.state).toBe("completed");
|
|
149
|
+
expect(check.action).toBe("complete");
|
|
150
|
+
expect(check.reconciliationNote).toContain("missed session-end signal");
|
|
151
|
+
});
|
|
152
|
+
|
|
153
|
+
test("ZFC (headless): pid dead + recent lastActivity → still zombie", () => {
|
|
154
|
+
const recentActivity = new Date(Date.now() - 1_000).toISOString();
|
|
155
|
+
const session = makeSession({
|
|
156
|
+
state: "working",
|
|
157
|
+
tmuxSession: "",
|
|
158
|
+
pid: DEAD_PID,
|
|
159
|
+
lastActivity: recentActivity,
|
|
160
|
+
});
|
|
161
|
+
const check = evaluateHealth(session, false, THRESHOLDS);
|
|
162
|
+
|
|
163
|
+
expect(check.state).toBe("zombie");
|
|
164
|
+
expect(check.action).toBe("terminate");
|
|
165
|
+
});
|
|
166
|
+
|
|
106
167
|
// --- ZFC Rule 2: tmux alive + sessions.json says zombie → investigate ---
|
|
107
168
|
|
|
108
169
|
test("ZFC: tmux alive + sessions.json says zombie → investigate (don't auto-kill)", () => {
|
|
@@ -432,6 +493,162 @@ describe("headless agents (tmuxSession empty, PID-based lifecycle)", () => {
|
|
|
432
493
|
});
|
|
433
494
|
});
|
|
434
495
|
|
|
496
|
+
// === Spawn-per-turn workers (tmuxSession === '' && pid === null) ===
|
|
497
|
+
|
|
498
|
+
describe("spawn-per-turn workers (overstory-7a34)", () => {
|
|
499
|
+
// Spawn-per-turn workers (builder/scout/reviewer/lead/merger under the
|
|
500
|
+
// headless default) have no persistent process between turns. The previous
|
|
501
|
+
// "headless" branch only matched pid !== null, so these sessions fell into
|
|
502
|
+
// the TUI/tmux path where tmuxAlive=false → ZFC Rule 1 → zombie within
|
|
503
|
+
// seconds of sling, despite actively executing tools (overstory-7a34).
|
|
504
|
+
|
|
505
|
+
test("freshly slung spawn-per-turn lead (booting, no pid, no tmux) → between_turns (overstory-3087)", () => {
|
|
506
|
+
// Spec change: spawn-per-turn workers report `between_turns` instead
|
|
507
|
+
// of `working` for the healthy classification, including the booting
|
|
508
|
+
// → healthy transition. The turn-runner authoritatively writes
|
|
509
|
+
// `in_turn` once the first parser event of a turn arrives.
|
|
510
|
+
const session = makeSession({
|
|
511
|
+
tmuxSession: "",
|
|
512
|
+
pid: null,
|
|
513
|
+
capability: "lead",
|
|
514
|
+
state: "booting",
|
|
515
|
+
lastActivity: new Date().toISOString(),
|
|
516
|
+
});
|
|
517
|
+
const check = evaluateHealth(session, false, THRESHOLDS);
|
|
518
|
+
|
|
519
|
+
expect(check.state).toBe("between_turns");
|
|
520
|
+
expect(check.action).toBe("none");
|
|
521
|
+
expect(check.reconciliationNote).toBeNull();
|
|
522
|
+
});
|
|
523
|
+
|
|
524
|
+
test("legacy spawn-per-turn worker still at 'working' is reported as between_turns (overstory-3087)", () => {
|
|
525
|
+
// A row that predates the substate split (state=working) gets
|
|
526
|
+
// reclassified to `between_turns` by the watchdog's healthy-state
|
|
527
|
+
// reporter. transitionState alone does not move the row forward (working
|
|
528
|
+
// and between_turns share rank 1 in STATE_ORDER, so the actual
|
|
529
|
+
// promotion happens via tryTransitionState elsewhere — here we just
|
|
530
|
+
// verify the check itself reports the new substate).
|
|
531
|
+
const session = makeSession({
|
|
532
|
+
tmuxSession: "",
|
|
533
|
+
pid: null,
|
|
534
|
+
capability: "builder",
|
|
535
|
+
state: "working",
|
|
536
|
+
lastActivity: new Date(Date.now() - 5_000).toISOString(),
|
|
537
|
+
});
|
|
538
|
+
const check = evaluateHealth(session, false, THRESHOLDS);
|
|
539
|
+
|
|
540
|
+
expect(check.state).toBe("between_turns");
|
|
541
|
+
expect(check.action).toBe("none");
|
|
542
|
+
});
|
|
543
|
+
|
|
544
|
+
test("spawn-per-turn worker between turns (recent activity) → between_turns, NOT zombie (overstory-3087)", () => {
|
|
545
|
+
// Repro of overstory-7a34: ov sling --capability lead any-task; within
|
|
546
|
+
// ~30s ov dashboard previously showed state='zombie' while ov feed
|
|
547
|
+
// showed live tool calls. The healthy classification now lands
|
|
548
|
+
// between_turns; the test still verifies that recent activity does
|
|
549
|
+
// not trigger zombie classification.
|
|
550
|
+
const session = makeSession({
|
|
551
|
+
tmuxSession: "",
|
|
552
|
+
pid: null,
|
|
553
|
+
capability: "lead",
|
|
554
|
+
state: "working",
|
|
555
|
+
lastActivity: new Date().toISOString(),
|
|
556
|
+
});
|
|
557
|
+
const check = evaluateHealth(session, false, THRESHOLDS);
|
|
558
|
+
|
|
559
|
+
expect(check.state).toBe("between_turns");
|
|
560
|
+
expect(check.action).toBe("none");
|
|
561
|
+
});
|
|
562
|
+
|
|
563
|
+
test("spawn-per-turn worker with stale activity → stalled", () => {
|
|
564
|
+
const session = makeSession({
|
|
565
|
+
tmuxSession: "",
|
|
566
|
+
pid: null,
|
|
567
|
+
capability: "builder",
|
|
568
|
+
state: "working",
|
|
569
|
+
lastActivity: new Date(Date.now() - 60_000).toISOString(),
|
|
570
|
+
});
|
|
571
|
+
const check = evaluateHealth(session, false, THRESHOLDS);
|
|
572
|
+
|
|
573
|
+
expect(check.state).toBe("stalled");
|
|
574
|
+
expect(check.action).toBe("escalate");
|
|
575
|
+
});
|
|
576
|
+
|
|
577
|
+
test("spawn-per-turn worker with zombie-level staleness → zombie, terminate", () => {
|
|
578
|
+
const session = makeSession({
|
|
579
|
+
tmuxSession: "",
|
|
580
|
+
pid: null,
|
|
581
|
+
capability: "builder",
|
|
582
|
+
state: "working",
|
|
583
|
+
lastActivity: new Date(Date.now() - 200_000).toISOString(),
|
|
584
|
+
});
|
|
585
|
+
const check = evaluateHealth(session, false, THRESHOLDS);
|
|
586
|
+
|
|
587
|
+
expect(check.state).toBe("zombie");
|
|
588
|
+
expect(check.action).toBe("terminate");
|
|
589
|
+
});
|
|
590
|
+
|
|
591
|
+
test("spawn-per-turn worker that already completed → skips monitoring", () => {
|
|
592
|
+
const session = makeSession({
|
|
593
|
+
tmuxSession: "",
|
|
594
|
+
pid: null,
|
|
595
|
+
capability: "builder",
|
|
596
|
+
state: "completed",
|
|
597
|
+
});
|
|
598
|
+
const check = evaluateHealth(session, false, THRESHOLDS);
|
|
599
|
+
|
|
600
|
+
expect(check.state).toBe("completed");
|
|
601
|
+
expect(check.action).toBe("none");
|
|
602
|
+
});
|
|
603
|
+
|
|
604
|
+
test("preserves in_turn for healthy spawn-per-turn worker (overstory-3087)", () => {
|
|
605
|
+
// A spawn-per-turn worker the turn-runner has marked in_turn must
|
|
606
|
+
// have its state preserved by the health evaluation when activity is
|
|
607
|
+
// recent — otherwise the watchdog would stomp the substate back to
|
|
608
|
+
// `working` and the UI would lose the distinction between mid-turn
|
|
609
|
+
// and idling.
|
|
610
|
+
const session = makeSession({
|
|
611
|
+
tmuxSession: "",
|
|
612
|
+
pid: null,
|
|
613
|
+
capability: "builder",
|
|
614
|
+
state: "in_turn",
|
|
615
|
+
lastActivity: new Date().toISOString(),
|
|
616
|
+
});
|
|
617
|
+
const check = evaluateHealth(session, false, THRESHOLDS);
|
|
618
|
+
|
|
619
|
+
expect(check.state).toBe("in_turn");
|
|
620
|
+
expect(check.action).toBe("none");
|
|
621
|
+
});
|
|
622
|
+
|
|
623
|
+
test("preserves between_turns for healthy spawn-per-turn worker (overstory-3087)", () => {
|
|
624
|
+
const session = makeSession({
|
|
625
|
+
tmuxSession: "",
|
|
626
|
+
pid: null,
|
|
627
|
+
capability: "builder",
|
|
628
|
+
state: "between_turns",
|
|
629
|
+
lastActivity: new Date().toISOString(),
|
|
630
|
+
});
|
|
631
|
+
const check = evaluateHealth(session, false, THRESHOLDS);
|
|
632
|
+
|
|
633
|
+
expect(check.state).toBe("between_turns");
|
|
634
|
+
expect(check.action).toBe("none");
|
|
635
|
+
});
|
|
636
|
+
|
|
637
|
+
test("escalates an in_turn worker with stale activity to stalled (overstory-3087)", () => {
|
|
638
|
+
const session = makeSession({
|
|
639
|
+
tmuxSession: "",
|
|
640
|
+
pid: null,
|
|
641
|
+
capability: "builder",
|
|
642
|
+
state: "in_turn",
|
|
643
|
+
lastActivity: new Date(Date.now() - 60_000).toISOString(),
|
|
644
|
+
});
|
|
645
|
+
const check = evaluateHealth(session, false, THRESHOLDS);
|
|
646
|
+
|
|
647
|
+
expect(check.state).toBe("stalled");
|
|
648
|
+
expect(check.action).toBe("escalate");
|
|
649
|
+
});
|
|
650
|
+
});
|
|
651
|
+
|
|
435
652
|
// === transitionState ===
|
|
436
653
|
|
|
437
654
|
describe("transitionState", () => {
|
|
@@ -545,4 +762,69 @@ describe("transitionState", () => {
|
|
|
545
762
|
// the state should NOT advance
|
|
546
763
|
expect(transitionState("working", check)).toBe("working");
|
|
547
764
|
});
|
|
765
|
+
|
|
766
|
+
// --- in_turn / between_turns coexist with working at the active rank (overstory-3087) ---
|
|
767
|
+
|
|
768
|
+
test("preserves in_turn when watchdog reports a healthy 'working' check", () => {
|
|
769
|
+
// The watchdog's healthy-classification check returns state=working;
|
|
770
|
+
// since in_turn shares rank 1 with working, transitionState must not
|
|
771
|
+
// advance and the spawn-per-turn substate the turn-runner wrote stays.
|
|
772
|
+
const check = {
|
|
773
|
+
state: "working" as const,
|
|
774
|
+
agentName: "a",
|
|
775
|
+
timestamp: "",
|
|
776
|
+
tmuxAlive: true,
|
|
777
|
+
pidAlive: true as boolean | null,
|
|
778
|
+
lastActivity: "",
|
|
779
|
+
processAlive: true,
|
|
780
|
+
action: "none" as const,
|
|
781
|
+
reconciliationNote: null,
|
|
782
|
+
};
|
|
783
|
+
expect(transitionState("in_turn", check)).toBe("in_turn");
|
|
784
|
+
});
|
|
785
|
+
|
|
786
|
+
test("preserves between_turns when watchdog reports a healthy 'working' check", () => {
|
|
787
|
+
const check = {
|
|
788
|
+
state: "working" as const,
|
|
789
|
+
agentName: "a",
|
|
790
|
+
timestamp: "",
|
|
791
|
+
tmuxAlive: true,
|
|
792
|
+
pidAlive: true as boolean | null,
|
|
793
|
+
lastActivity: "",
|
|
794
|
+
processAlive: true,
|
|
795
|
+
action: "none" as const,
|
|
796
|
+
reconciliationNote: null,
|
|
797
|
+
};
|
|
798
|
+
expect(transitionState("between_turns", check)).toBe("between_turns");
|
|
799
|
+
});
|
|
800
|
+
|
|
801
|
+
test("advances in_turn → stalled when the watchdog escalates", () => {
|
|
802
|
+
const check = {
|
|
803
|
+
state: "stalled" as const,
|
|
804
|
+
agentName: "a",
|
|
805
|
+
timestamp: "",
|
|
806
|
+
tmuxAlive: true,
|
|
807
|
+
pidAlive: true as boolean | null,
|
|
808
|
+
lastActivity: "",
|
|
809
|
+
processAlive: true,
|
|
810
|
+
action: "escalate" as const,
|
|
811
|
+
reconciliationNote: null,
|
|
812
|
+
};
|
|
813
|
+
expect(transitionState("in_turn", check)).toBe("stalled");
|
|
814
|
+
});
|
|
815
|
+
|
|
816
|
+
test("advances between_turns → zombie when the watchdog terminates", () => {
|
|
817
|
+
const check = {
|
|
818
|
+
state: "zombie" as const,
|
|
819
|
+
agentName: "a",
|
|
820
|
+
timestamp: "",
|
|
821
|
+
tmuxAlive: false,
|
|
822
|
+
pidAlive: false as boolean | null,
|
|
823
|
+
lastActivity: "",
|
|
824
|
+
processAlive: false,
|
|
825
|
+
action: "terminate" as const,
|
|
826
|
+
reconciliationNote: null,
|
|
827
|
+
};
|
|
828
|
+
expect(transitionState("between_turns", check)).toBe("zombie");
|
|
829
|
+
});
|
|
548
830
|
});
|
package/src/watchdog/health.ts
CHANGED
|
@@ -30,22 +30,25 @@
|
|
|
30
30
|
* table are always up-to-date because they reflect real kernel state.
|
|
31
31
|
*/
|
|
32
32
|
|
|
33
|
+
import { isPersistentCapability } from "../agents/capabilities.ts";
|
|
33
34
|
import type { AgentSession, AgentState, HealthCheck } from "../types.ts";
|
|
34
35
|
|
|
35
36
|
/**
|
|
36
|
-
*
|
|
37
|
-
* These agents are expected to have long idle periods (e.g. coordinator waiting
|
|
38
|
-
* for worker mail) and should NOT be flagged stale/zombie based on lastActivity.
|
|
39
|
-
* Only tmux/pid liveness checks apply to them.
|
|
37
|
+
* Numeric ordering for forward-only state transitions.
|
|
40
38
|
*
|
|
41
|
-
*
|
|
39
|
+
* `in_turn` and `between_turns` share the `working` rank (1) because, from
|
|
40
|
+
* the watchdog's perspective, all three are "agent is alive and active" —
|
|
41
|
+
* they only differ in whether the spawn-per-turn worker is currently
|
|
42
|
+
* mid-execution or idling between mail batches (overstory-3087). Same rank
|
|
43
|
+
* means a healthy-classification check (`check.state === "working"`) will
|
|
44
|
+
* not stomp on the more specific in_turn/between_turns states the
|
|
45
|
+
* turn-runner has already written.
|
|
42
46
|
*/
|
|
43
|
-
const PERSISTENT_CAPABILITIES = new Set(["coordinator", "orchestrator", "monitor"]);
|
|
44
|
-
|
|
45
|
-
/** Numeric ordering for forward-only state transitions. */
|
|
46
47
|
const STATE_ORDER: Record<AgentState, number> = {
|
|
47
48
|
booting: 0,
|
|
48
49
|
working: 1,
|
|
50
|
+
in_turn: 1,
|
|
51
|
+
between_turns: 1,
|
|
49
52
|
completed: 2,
|
|
50
53
|
stalled: 3,
|
|
51
54
|
zombie: 4,
|
|
@@ -71,15 +74,34 @@ export function isProcessRunning(pid: number): boolean {
|
|
|
71
74
|
}
|
|
72
75
|
|
|
73
76
|
/**
|
|
74
|
-
* Detect whether a session is a headless agent.
|
|
77
|
+
* Detect whether a session is a long-lived headless agent.
|
|
75
78
|
*
|
|
76
|
-
*
|
|
77
|
-
*
|
|
79
|
+
* Long-lived headless agents (coordinator, orchestrator, monitor, sapling, etc.)
|
|
80
|
+
* have no tmux session (tmuxSession === '') but do have a persistent process —
|
|
81
|
+
* so `session.pid` is non-null and PID is the primary liveness signal.
|
|
78
82
|
*/
|
|
79
83
|
function isHeadlessSession(session: AgentSession): boolean {
|
|
80
84
|
return session.tmuxSession === "" && session.pid !== null;
|
|
81
85
|
}
|
|
82
86
|
|
|
87
|
+
/**
|
|
88
|
+
* Detect whether a session is a spawn-per-turn worker between turns.
|
|
89
|
+
*
|
|
90
|
+
* Spawn-per-turn workers (task-scoped capabilities under the new headless
|
|
91
|
+
* default — builder/scout/reviewer/lead/merger) have no tmux session AND no
|
|
92
|
+
* persistent process: `tmuxSession === ''` and `session.pid === null` from
|
|
93
|
+
* sling onward. The per-turn claude PID lives in
|
|
94
|
+
* `.overstory/agents/<name>/turn.pid` only while a turn is in flight.
|
|
95
|
+
*
|
|
96
|
+
* "No process" is the normal state between turns, so neither tmux liveness nor
|
|
97
|
+
* pid liveness can be used as a death signal — only `lastActivity` recency
|
|
98
|
+
* (refreshed by the turn-runner on every event and by the watchdog from
|
|
99
|
+
* events.db) can. (overstory-7a34)
|
|
100
|
+
*/
|
|
101
|
+
export function isSpawnPerTurnSession(session: AgentSession): boolean {
|
|
102
|
+
return session.tmuxSession === "" && session.pid === null;
|
|
103
|
+
}
|
|
104
|
+
|
|
83
105
|
/**
|
|
84
106
|
* Evaluate time-based health (persistent capability exemptions, stale, zombie thresholds,
|
|
85
107
|
* booting→working transition). Called after liveness is confirmed for both TUI and headless paths, and directly for spawn-per-turn workers, which have no liveness signal to confirm.
|
|
@@ -98,7 +120,7 @@ function evaluateTimeBased(
|
|
|
98
120
|
// Persistent capabilities (coordinator, monitor) are expected to have long idle
|
|
99
121
|
// periods waiting for mail/events. Skip time-based stale/zombie detection for
|
|
100
122
|
// them — only tmux/pid liveness matters (checked above).
|
|
101
|
-
if (
|
|
123
|
+
if (isPersistentCapability(session.capability)) {
|
|
102
124
|
// Transition booting → working if we reach here (process alive)
|
|
103
125
|
const state = session.state === "booting" ? "working" : session.state;
|
|
104
126
|
return {
|
|
@@ -135,22 +157,42 @@ function evaluateTimeBased(
|
|
|
135
157
|
};
|
|
136
158
|
}
|
|
137
159
|
|
|
138
|
-
//
|
|
160
|
+
// Spawn-per-turn workers (overstory-3087): healthy classification reports
|
|
161
|
+
// `between_turns` instead of `working`, including the booting → healthy
|
|
162
|
+
// transition. The turn-runner authoritatively writes `in_turn` /
|
|
163
|
+
// `between_turns` while a turn is alive; in_turn is preserved here when
|
|
164
|
+
// already set so a watchdog tick mid-turn does not overwrite it.
|
|
165
|
+
const isSpawnPerTurn = isSpawnPerTurnSession(session);
|
|
166
|
+
|
|
167
|
+
// booting → transition to the healthy state once there's recent activity.
|
|
139
168
|
if (session.state === "booting") {
|
|
140
169
|
return {
|
|
141
170
|
...base,
|
|
142
171
|
processAlive: true,
|
|
143
|
-
state: "working",
|
|
172
|
+
state: isSpawnPerTurn ? "between_turns" : "working",
|
|
144
173
|
action: "none",
|
|
145
174
|
reconciliationNote: null,
|
|
146
175
|
};
|
|
147
176
|
}
|
|
148
177
|
|
|
149
|
-
// Default: healthy
|
|
178
|
+
// Default: healthy active state. For spawn-per-turn workers report the
|
|
179
|
+
// existing in_turn/between_turns substate; for tmux/long-lived agents
|
|
180
|
+
// report `working`. The turn-runner is authoritative for in_turn ↔
|
|
181
|
+
// between_turns transitions, so the watchdog must not stomp the more
|
|
182
|
+
// specific state — same rank in STATE_ORDER ensures `transitionState`
|
|
183
|
+
// also leaves the row alone.
|
|
184
|
+
let healthyState: AgentState;
|
|
185
|
+
if (session.state === "in_turn" || session.state === "between_turns") {
|
|
186
|
+
healthyState = session.state;
|
|
187
|
+
} else if (isSpawnPerTurn) {
|
|
188
|
+
healthyState = "between_turns";
|
|
189
|
+
} else {
|
|
190
|
+
healthyState = "working";
|
|
191
|
+
}
|
|
150
192
|
return {
|
|
151
193
|
...base,
|
|
152
194
|
processAlive: true,
|
|
153
|
-
state:
|
|
195
|
+
state: healthyState,
|
|
154
196
|
action: "none",
|
|
155
197
|
reconciliationNote: null,
|
|
156
198
|
};
|
|
@@ -165,19 +207,23 @@ function evaluateTimeBased(
|
|
|
165
207
|
* Decision logic (in priority order):
|
|
166
208
|
*
|
|
167
209
|
* 1. Completed agents skip monitoring entirely.
|
|
168
|
-
* 2.
|
|
210
|
+
* 2. Spawn-per-turn workers (tmuxSession === '' && pid === null): no
|
|
211
|
+
* persistent process between turns — fall straight through to time-based
|
|
212
|
+
* checks driven by lastActivity. PID/tmux liveness are meaningless here.
|
|
213
|
+
* 3. Headless agents with persistent process (tmuxSession === '' && pid !== null):
|
|
214
|
+
* PID is primary liveness signal.
|
|
169
215
|
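* - pid dead → zombie, terminate; or completed when lastActivity is older than staleMs and the recorded state is working, booting, or stalled (missed session-end signal).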
|
|
170
216
|
* - pid alive + state zombie → investigate.
|
|
171
217
|
* - pid alive → fall through to time-based checks.
|
|
172
|
-
*
|
|
173
|
-
*
|
|
218
|
+
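* 4. tmux dead → zombie, terminate (regardless of what sessions.json says); or completed when lastActivity is older than staleMs and the recorded state is working, booting, or stalled (missed session-end signal).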
|
|
219
|
+
* 5. tmux alive + sessions.json says zombie → investigate (don't auto-kill).
|
|
174
220
|
* Something external marked this zombie, but the process is still running.
|
|
175
|
-
*
|
|
221
|
+
* 6. pid dead + tmux alive → zombie, terminate. The agent process exited but
|
|
176
222
|
* the tmux pane shell survived. The agent is not doing work.
|
|
177
|
-
*
|
|
178
|
-
*
|
|
179
|
-
*
|
|
180
|
-
*
|
|
223
|
+
* 7. lastActivity older than zombieMs → zombie, terminate.
|
|
224
|
+
* 8. lastActivity older than staleMs → stalled, escalate.
|
|
225
|
+
* 9. booting with recent activity → working (between_turns for spawn-per-turn workers).
|
|
226
|
+
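* 10. Otherwise → healthy: an existing in_turn/between_turns state is preserved, spawn-per-turn workers report between_turns, everything else reports working.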
|
|
181
227
|
*
|
|
182
228
|
* @param session - The agent session to evaluate
|
|
183
229
|
* @param tmuxAlive - Whether the agent's tmux session is still running
|
|
@@ -222,10 +268,37 @@ export function evaluateHealth(
|
|
|
222
268
|
};
|
|
223
269
|
}
|
|
224
270
|
|
|
271
|
+
// === Spawn-per-turn path: no persistent process between turns ===
|
|
272
|
+
// For these workers (overstory-7a34) `session.pid` is null by design and
|
|
273
|
+
// there is no tmux session. Liveness signals reduce to lastActivity
|
|
274
|
+
// recency: the turn-runner updates it on every parser event during a
|
|
275
|
+
// turn, and the watchdog refreshes it from events.db between turns. PID
|
|
276
|
+
// and tmux checks would always say "dead" and false-positive every fresh
|
|
277
|
+
// agent as zombie within seconds of sling.
|
|
278
|
+
if (isSpawnPerTurnSession(session)) {
|
|
279
|
+
return evaluateTimeBased(session, base, elapsedMs, thresholds);
|
|
280
|
+
}
|
|
281
|
+
|
|
225
282
|
// === Headless path: PID is the primary liveness signal ===
|
|
226
283
|
if (isHeadlessSession(session)) {
|
|
227
|
-
// pid dead
|
|
284
|
+
// pid dead: zombie OR completed-with-missed-signal.
|
|
285
|
+
// Distinguish by lastActivity age — recent activity means the agent
|
|
286
|
+
// crashed mid-work (true zombie); stale activity means it likely
|
|
287
|
+
// finished naturally and only the session-end hook didn't deliver
|
|
288
|
+
// (treat as completed). (overstory-e74b)
|
|
228
289
|
if (pidAlive === false) {
|
|
290
|
+
if (
|
|
291
|
+
elapsedMs > thresholds.staleMs &&
|
|
292
|
+
(session.state === "working" || session.state === "booting" || session.state === "stalled")
|
|
293
|
+
) {
|
|
294
|
+
return {
|
|
295
|
+
...base,
|
|
296
|
+
processAlive: false,
|
|
297
|
+
state: "completed",
|
|
298
|
+
action: "complete",
|
|
299
|
+
reconciliationNote: `ZFC: headless pid ${session.pid} dead + stale lastActivity (${Math.round(elapsedMs / 1000)}s ago) — assumed completed (missed session-end signal)`,
|
|
300
|
+
};
|
|
301
|
+
}
|
|
229
302
|
return {
|
|
230
303
|
...base,
|
|
231
304
|
processAlive: false,
|
|
@@ -253,9 +326,25 @@ export function evaluateHealth(
|
|
|
253
326
|
|
|
254
327
|
// === TUI/tmux path ===
|
|
255
328
|
|
|
256
|
-
// ZFC Rule 1: tmux dead → zombie
|
|
257
|
-
//
|
|
329
|
+
// ZFC Rule 1: tmux dead → zombie OR completed-with-missed-signal.
|
|
330
|
+
// Distinguish by lastActivity age — recent activity means the agent
|
|
331
|
+
// crashed mid-work (true zombie); stale activity means it likely
|
|
332
|
+
// finished naturally and only the session-end hook didn't deliver
|
|
333
|
+
// (treat as completed). (overstory-e74b)
|
|
258
334
|
if (!tmuxAlive) {
|
|
335
|
+
if (
|
|
336
|
+
elapsedMs > thresholds.staleMs &&
|
|
337
|
+
(session.state === "working" || session.state === "booting" || session.state === "stalled")
|
|
338
|
+
) {
|
|
339
|
+
return {
|
|
340
|
+
...base,
|
|
341
|
+
processAlive: false,
|
|
342
|
+
state: "completed",
|
|
343
|
+
action: "complete",
|
|
344
|
+
reconciliationNote: `ZFC: tmux dead + stale lastActivity (${Math.round(elapsedMs / 1000)}s ago) — assumed completed (missed session-end signal)`,
|
|
345
|
+
};
|
|
346
|
+
}
|
|
347
|
+
|
|
259
348
|
const note =
|
|
260
349
|
session.state === "working" || session.state === "booting"
|
|
261
350
|
? `ZFC: tmux dead but sessions.json says "${session.state}" — marking zombie (observable state wins)`
|
|
@@ -323,6 +412,16 @@ export function transitionState(currentState: AgentState, check: HealthCheck): A
|
|
|
323
412
|
return currentState;
|
|
324
413
|
}
|
|
325
414
|
|
|
415
|
+
// `complete` is a terminal classification triggered when observable state
|
|
416
|
+
// proves the agent finished naturally (missed session-end signal —
|
|
417
|
+
// overstory-e74b). It bypasses the forward-only STATE_ORDER guard because
|
|
418
|
+
// `completed` (order 2) sits before `stalled` (order 3) and would
|
|
419
|
+
// otherwise be blocked from advancing the recorded state. The matrix in
|
|
420
|
+
// SessionStore.tryTransitionState still gates the actual write.
|
|
421
|
+
if (check.action === "complete") {
|
|
422
|
+
return check.state;
|
|
423
|
+
}
|
|
424
|
+
|
|
326
425
|
const currentOrder = STATE_ORDER[currentState];
|
|
327
426
|
const checkOrder = STATE_ORDER[check.state];
|
|
328
427
|
|