@os-eco/overstory-cli 0.9.4 → 0.10.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +47 -18
- package/agents/builder.md +9 -8
- package/agents/coordinator.md +6 -6
- package/agents/lead.md +98 -82
- package/agents/merger.md +25 -14
- package/agents/reviewer.md +22 -16
- package/agents/scout.md +17 -12
- package/package.json +6 -3
- package/src/agents/capabilities.test.ts +85 -0
- package/src/agents/capabilities.ts +125 -0
- package/src/agents/headless-mail-injector.test.ts +448 -0
- package/src/agents/headless-mail-injector.ts +211 -0
- package/src/agents/headless-prompt.test.ts +102 -0
- package/src/agents/headless-prompt.ts +68 -0
- package/src/agents/hooks-deployer.test.ts +514 -14
- package/src/agents/hooks-deployer.ts +141 -0
- package/src/agents/overlay.test.ts +4 -4
- package/src/agents/overlay.ts +30 -8
- package/src/agents/turn-lock.test.ts +181 -0
- package/src/agents/turn-lock.ts +235 -0
- package/src/agents/turn-runner-dispatch.test.ts +182 -0
- package/src/agents/turn-runner-dispatch.ts +105 -0
- package/src/agents/turn-runner.test.ts +1450 -0
- package/src/agents/turn-runner.ts +1166 -0
- package/src/commands/clean.ts +54 -0
- package/src/commands/coordinator.test.ts +127 -0
- package/src/commands/coordinator.ts +203 -5
- package/src/commands/dashboard.test.ts +188 -0
- package/src/commands/dashboard.ts +13 -3
- package/src/commands/doctor.ts +3 -1
- package/src/commands/group.test.ts +94 -0
- package/src/commands/group.ts +49 -20
- package/src/commands/init.test.ts +8 -0
- package/src/commands/init.ts +8 -1
- package/src/commands/log.test.ts +56 -11
- package/src/commands/log.ts +134 -69
- package/src/commands/mail.test.ts +162 -0
- package/src/commands/mail.ts +64 -9
- package/src/commands/merge.test.ts +112 -1
- package/src/commands/merge.ts +17 -4
- package/src/commands/nudge.test.ts +351 -4
- package/src/commands/nudge.ts +356 -34
- package/src/commands/run.test.ts +43 -7
- package/src/commands/serve/build.test.ts +202 -0
- package/src/commands/serve/build.ts +206 -0
- package/src/commands/serve/coordinator-actions.test.ts +339 -0
- package/src/commands/serve/coordinator-actions.ts +408 -0
- package/src/commands/serve/dev.test.ts +168 -0
- package/src/commands/serve/dev.ts +117 -0
- package/src/commands/serve/mail-actions.test.ts +312 -0
- package/src/commands/serve/mail-actions.ts +167 -0
- package/src/commands/serve/rest.test.ts +1323 -0
- package/src/commands/serve/rest.ts +708 -0
- package/src/commands/serve/static.ts +51 -0
- package/src/commands/serve/ws.test.ts +361 -0
- package/src/commands/serve/ws.ts +332 -0
- package/src/commands/serve.test.ts +459 -0
- package/src/commands/serve.ts +565 -0
- package/src/commands/sling.test.ts +73 -1
- package/src/commands/sling.ts +149 -64
- package/src/commands/status.test.ts +9 -0
- package/src/commands/status.ts +12 -4
- package/src/commands/stop.test.ts +174 -1
- package/src/commands/stop.ts +107 -8
- package/src/commands/watch.test.ts +43 -0
- package/src/commands/watch.ts +153 -28
- package/src/config.ts +23 -0
- package/src/doctor/consistency.test.ts +106 -0
- package/src/doctor/consistency.ts +48 -1
- package/src/doctor/serve.test.ts +95 -0
- package/src/doctor/serve.ts +86 -0
- package/src/doctor/types.ts +2 -1
- package/src/doctor/watchdog.ts +57 -1
- package/src/events/tailer.test.ts +234 -1
- package/src/events/tailer.ts +90 -0
- package/src/index.ts +53 -6
- package/src/json.ts +29 -0
- package/src/mail/client.ts +15 -2
- package/src/mail/store.test.ts +82 -0
- package/src/mail/store.ts +41 -4
- package/src/merge/lock.test.ts +149 -0
- package/src/merge/lock.ts +140 -0
- package/src/runtimes/__fixtures__/claude-stream-fixture.ts +22 -0
- package/src/runtimes/claude.test.ts +791 -1
- package/src/runtimes/claude.ts +323 -1
- package/src/runtimes/connections.test.ts +141 -1
- package/src/runtimes/connections.ts +73 -4
- package/src/runtimes/headless-connection.test.ts +264 -0
- package/src/runtimes/headless-connection.ts +158 -0
- package/src/runtimes/types.ts +10 -0
- package/src/schema-consistency.test.ts +1 -0
- package/src/sessions/store.test.ts +390 -24
- package/src/sessions/store.ts +184 -19
- package/src/test-setup.test.ts +31 -0
- package/src/test-setup.ts +28 -0
- package/src/types.ts +56 -1
- package/src/utils/pid.test.ts +85 -1
- package/src/utils/pid.ts +86 -1
- package/src/utils/process-scan.test.ts +53 -0
- package/src/utils/process-scan.ts +76 -0
- package/src/watchdog/daemon.test.ts +1520 -411
- package/src/watchdog/daemon.ts +442 -83
- package/src/watchdog/health.test.ts +157 -0
- package/src/watchdog/health.ts +92 -25
- package/src/worktree/process.test.ts +71 -0
- package/src/worktree/process.ts +25 -5
- package/src/worktree/tmux.test.ts +3 -0
- package/src/worktree/tmux.ts +10 -3
- package/templates/CLAUDE.md.tmpl +19 -8
- package/templates/overlay.md.tmpl +3 -2
|
@@ -19,10 +19,16 @@ import { mkdir, mkdtemp } from "node:fs/promises";
|
|
|
19
19
|
import { tmpdir } from "node:os";
|
|
20
20
|
import { join } from "node:path";
|
|
21
21
|
import { createEventStore } from "../events/store.ts";
|
|
22
|
-
import {
|
|
22
|
+
import { createMailStore } from "../mail/store.ts";
|
|
23
|
+
import { createRunStore, createSessionStore } from "../sessions/store.ts";
|
|
23
24
|
import { cleanupTempDir } from "../test-helpers.ts";
|
|
24
|
-
import type { AgentSession, HealthCheck, StoredEvent } from "../types.ts";
|
|
25
|
-
import {
|
|
25
|
+
import type { AgentSession, HealthCheck, StoredEvent, WorkerDiedPayload } from "../types.ts";
|
|
26
|
+
import {
|
|
27
|
+
buildCompletionMessage,
|
|
28
|
+
type RunIdWarnState,
|
|
29
|
+
runDaemonTick,
|
|
30
|
+
startDaemon,
|
|
31
|
+
} from "./daemon.ts";
|
|
26
32
|
|
|
27
33
|
// === Test constants ===
|
|
28
34
|
|
|
@@ -50,6 +56,34 @@ function writeSessionsToStore(root: string, sessions: AgentSession[]): void {
|
|
|
50
56
|
store.close();
|
|
51
57
|
}
|
|
52
58
|
|
|
59
|
+
/**
|
|
60
|
+
* Mark a run as active: write current-run.txt AND insert a row in the runs
|
|
61
|
+
* table (sessions.db). The watchdog now validates the id against the runs
|
|
62
|
+
* table before running the run-completion check (overstory-87bf), so tests
|
|
63
|
+
* must seed both surfaces to mirror production reality.
|
|
64
|
+
*/
|
|
65
|
+
async function setActiveRun(root: string, runId: string): Promise<void> {
|
|
66
|
+
await Bun.write(join(root, ".overstory", "current-run.txt"), runId);
|
|
67
|
+
const runStore = createRunStore(join(root, ".overstory", "sessions.db"));
|
|
68
|
+
try {
|
|
69
|
+
runStore.createRun({
|
|
70
|
+
id: runId,
|
|
71
|
+
startedAt: new Date().toISOString(),
|
|
72
|
+
coordinatorSessionId: null,
|
|
73
|
+
status: "active",
|
|
74
|
+
});
|
|
75
|
+
} catch {
|
|
76
|
+
// Row may already exist (re-seeding within one test) — non-fatal.
|
|
77
|
+
} finally {
|
|
78
|
+
runStore.close();
|
|
79
|
+
}
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
/** Build a fresh, isolated RunIdWarnState for tests (overstory-87bf). */
|
|
83
|
+
function freshRunIdWarnState(): RunIdWarnState {
|
|
84
|
+
return { missingFileWarned: false, unknownIds: new Set() };
|
|
85
|
+
}
|
|
86
|
+
|
|
53
87
|
/** Read sessions from the SessionStore (sessions.db) at the given root. */
|
|
54
88
|
function readSessionsFromStore(root: string): AgentSession[] {
|
|
55
89
|
const dbPath = join(root, ".overstory", "sessions.db");
|
|
@@ -1084,7 +1118,7 @@ describe("daemon event recording", () => {
|
|
|
1084
1118
|
|
|
1085
1119
|
// Write a current-run.txt
|
|
1086
1120
|
const runId = "run-2026-02-13T10-00-00-000Z";
|
|
1087
|
-
await
|
|
1121
|
+
await setActiveRun(tempRoot, runId);
|
|
1088
1122
|
|
|
1089
1123
|
const eventsDbPath = join(tempRoot, ".overstory", "events.db");
|
|
1090
1124
|
const eventStore = createEventStore(eventsDbPath);
|
|
@@ -1421,7 +1455,7 @@ describe("run completion detection", () => {
|
|
|
1421
1455
|
];
|
|
1422
1456
|
|
|
1423
1457
|
writeSessionsToStore(tempRoot, sessions);
|
|
1424
|
-
await
|
|
1458
|
+
await setActiveRun(tempRoot, runId);
|
|
1425
1459
|
|
|
1426
1460
|
const nudgeMock = nudgeTracker();
|
|
1427
1461
|
|
|
@@ -1467,7 +1501,7 @@ describe("run completion detection", () => {
|
|
|
1467
1501
|
];
|
|
1468
1502
|
|
|
1469
1503
|
writeSessionsToStore(tempRoot, sessions);
|
|
1470
|
-
await
|
|
1504
|
+
await setActiveRun(tempRoot, runId);
|
|
1471
1505
|
|
|
1472
1506
|
const nudgeMock = nudgeTracker();
|
|
1473
1507
|
|
|
@@ -1509,7 +1543,7 @@ describe("run completion detection", () => {
|
|
|
1509
1543
|
];
|
|
1510
1544
|
|
|
1511
1545
|
writeSessionsToStore(tempRoot, sessions);
|
|
1512
|
-
await
|
|
1546
|
+
await setActiveRun(tempRoot, runId);
|
|
1513
1547
|
// Pre-write dedup marker
|
|
1514
1548
|
await Bun.write(join(tempRoot, ".overstory", "run-complete-notified.txt"), runId);
|
|
1515
1549
|
|
|
@@ -1613,7 +1647,7 @@ describe("run completion detection", () => {
|
|
|
1613
1647
|
];
|
|
1614
1648
|
|
|
1615
1649
|
writeSessionsToStore(tempRoot, sessions);
|
|
1616
|
-
await
|
|
1650
|
+
await setActiveRun(tempRoot, runId);
|
|
1617
1651
|
|
|
1618
1652
|
const nudgeMock = nudgeTracker();
|
|
1619
1653
|
|
|
@@ -1659,7 +1693,7 @@ describe("run completion detection", () => {
|
|
|
1659
1693
|
];
|
|
1660
1694
|
|
|
1661
1695
|
writeSessionsToStore(tempRoot, sessions);
|
|
1662
|
-
await
|
|
1696
|
+
await setActiveRun(tempRoot, runId);
|
|
1663
1697
|
|
|
1664
1698
|
const nudgeMock = nudgeTracker();
|
|
1665
1699
|
|
|
@@ -1701,7 +1735,7 @@ describe("run completion detection", () => {
|
|
|
1701
1735
|
];
|
|
1702
1736
|
|
|
1703
1737
|
writeSessionsToStore(tempRoot, sessions);
|
|
1704
|
-
await
|
|
1738
|
+
await setActiveRun(tempRoot, runId);
|
|
1705
1739
|
|
|
1706
1740
|
const eventsDbPath = join(tempRoot, ".overstory", "events.db");
|
|
1707
1741
|
const eventStore = createEventStore(eventsDbPath);
|
|
@@ -1759,7 +1793,7 @@ describe("run completion detection", () => {
|
|
|
1759
1793
|
];
|
|
1760
1794
|
|
|
1761
1795
|
writeSessionsToStore(tempRoot, sessions);
|
|
1762
|
-
await
|
|
1796
|
+
await setActiveRun(tempRoot, runId);
|
|
1763
1797
|
|
|
1764
1798
|
await runDaemonTick({
|
|
1765
1799
|
root: tempRoot,
|
|
@@ -1800,7 +1834,7 @@ describe("run completion detection", () => {
|
|
|
1800
1834
|
];
|
|
1801
1835
|
|
|
1802
1836
|
writeSessionsToStore(tempRoot, sessions);
|
|
1803
|
-
await
|
|
1837
|
+
await setActiveRun(tempRoot, runId);
|
|
1804
1838
|
|
|
1805
1839
|
const nudgeMock = nudgeTracker();
|
|
1806
1840
|
|
|
@@ -1846,7 +1880,7 @@ describe("run completion detection", () => {
|
|
|
1846
1880
|
];
|
|
1847
1881
|
|
|
1848
1882
|
writeSessionsToStore(tempRoot, sessions);
|
|
1849
|
-
await
|
|
1883
|
+
await setActiveRun(tempRoot, runId);
|
|
1850
1884
|
|
|
1851
1885
|
const nudgeMock = nudgeTracker();
|
|
1852
1886
|
|
|
@@ -1881,7 +1915,7 @@ describe("run completion detection", () => {
|
|
|
1881
1915
|
];
|
|
1882
1916
|
|
|
1883
1917
|
writeSessionsToStore(tempRoot, sessions);
|
|
1884
|
-
await
|
|
1918
|
+
await setActiveRun(tempRoot, runId);
|
|
1885
1919
|
|
|
1886
1920
|
const nudgeMock = nudgeTracker();
|
|
1887
1921
|
|
|
@@ -1916,7 +1950,7 @@ describe("run completion detection", () => {
|
|
|
1916
1950
|
];
|
|
1917
1951
|
|
|
1918
1952
|
writeSessionsToStore(tempRoot, sessions);
|
|
1919
|
-
await
|
|
1953
|
+
await setActiveRun(tempRoot, runId);
|
|
1920
1954
|
|
|
1921
1955
|
const eventsDbPath = join(tempRoot, ".overstory", "events.db");
|
|
1922
1956
|
const eventStore = createEventStore(eventsDbPath);
|
|
@@ -1950,440 +1984,1344 @@ describe("run completion detection", () => {
|
|
|
1950
1984
|
store.close();
|
|
1951
1985
|
}
|
|
1952
1986
|
});
|
|
1953
|
-
});
|
|
1954
|
-
|
|
1955
|
-
// === buildCompletionMessage unit tests ===
|
|
1956
|
-
|
|
1957
|
-
describe("buildCompletionMessage", () => {
|
|
1958
|
-
const testRunId = "run-test-123";
|
|
1959
|
-
|
|
1960
|
-
test("all scouts → contains 'scout' and 'Ready for next phase'", () => {
|
|
1961
|
-
const sessions = [
|
|
1962
|
-
makeSession({ capability: "scout", agentName: "scout-1" }),
|
|
1963
|
-
makeSession({ capability: "scout", agentName: "scout-2" }),
|
|
1964
|
-
];
|
|
1965
|
-
const msg = buildCompletionMessage(sessions, testRunId);
|
|
1966
|
-
expect(msg).toContain("scout");
|
|
1967
|
-
expect(msg).toContain("Ready for next phase");
|
|
1968
|
-
expect(msg).not.toContain("merge/cleanup");
|
|
1969
|
-
});
|
|
1970
1987
|
|
|
1971
|
-
|
|
1988
|
+
// overstory-e130: a run that mixes `completed` and `zombie` workers must
|
|
1989
|
+
// still notify the coordinator. Before the fix, the every-completed predicate
|
|
1990
|
+
// stranded the coordinator forever whenever the watchdog killed any worker.
|
|
1991
|
+
test("nudges coordinator when workers are a mix of completed and zombie", async () => {
|
|
1972
1992
|
const sessions = [
|
|
1973
|
-
makeSession({
|
|
1974
|
-
|
|
1993
|
+
makeSession({
|
|
1994
|
+
id: "s1",
|
|
1995
|
+
agentName: "builder-one",
|
|
1996
|
+
capability: "builder",
|
|
1997
|
+
tmuxSession: "overstory-agent-fake-builder-one",
|
|
1998
|
+
state: "completed",
|
|
1999
|
+
runId,
|
|
2000
|
+
lastActivity: new Date().toISOString(),
|
|
2001
|
+
}),
|
|
2002
|
+
makeSession({
|
|
2003
|
+
id: "s2",
|
|
2004
|
+
agentName: "builder-two",
|
|
2005
|
+
capability: "builder",
|
|
2006
|
+
tmuxSession: "overstory-agent-fake-builder-two",
|
|
2007
|
+
state: "zombie",
|
|
2008
|
+
runId,
|
|
2009
|
+
lastActivity: new Date().toISOString(),
|
|
2010
|
+
}),
|
|
1975
2011
|
];
|
|
1976
|
-
const msg = buildCompletionMessage(sessions, testRunId);
|
|
1977
|
-
expect(msg).toContain("builder");
|
|
1978
|
-
expect(msg).toContain("Awaiting lead verification");
|
|
1979
|
-
expect(msg).not.toContain("merge/cleanup");
|
|
1980
|
-
});
|
|
1981
|
-
|
|
1982
|
-
test("all reviewers → contains 'reviewer' and 'Reviews done'", () => {
|
|
1983
|
-
const sessions = [makeSession({ capability: "reviewer", agentName: "reviewer-1" })];
|
|
1984
|
-
const msg = buildCompletionMessage(sessions, testRunId);
|
|
1985
|
-
expect(msg).toContain("reviewer");
|
|
1986
|
-
expect(msg).toContain("Reviews done");
|
|
1987
|
-
});
|
|
1988
2012
|
|
|
1989
|
-
|
|
1990
|
-
|
|
1991
|
-
const msg = buildCompletionMessage(sessions, testRunId);
|
|
1992
|
-
expect(msg).toContain("lead");
|
|
1993
|
-
expect(msg).toContain("Ready for merge/cleanup");
|
|
1994
|
-
});
|
|
2013
|
+
writeSessionsToStore(tempRoot, sessions);
|
|
2014
|
+
await setActiveRun(tempRoot, runId);
|
|
1995
2015
|
|
|
1996
|
-
|
|
1997
|
-
const sessions = [makeSession({ capability: "merger", agentName: "merger-1" })];
|
|
1998
|
-
const msg = buildCompletionMessage(sessions, testRunId);
|
|
1999
|
-
expect(msg).toContain("merger");
|
|
2000
|
-
expect(msg).toContain("Merges done");
|
|
2001
|
-
});
|
|
2016
|
+
const nudgeMock = nudgeTracker();
|
|
2002
2017
|
|
|
2003
|
-
|
|
2004
|
-
|
|
2005
|
-
|
|
2006
|
-
|
|
2007
|
-
|
|
2008
|
-
|
|
2009
|
-
|
|
2010
|
-
|
|
2011
|
-
});
|
|
2018
|
+
await runDaemonTick({
|
|
2019
|
+
root: tempRoot,
|
|
2020
|
+
...THRESHOLDS,
|
|
2021
|
+
_tmux: tmuxAllAlive(),
|
|
2022
|
+
_triage: triageAlways("extend"),
|
|
2023
|
+
_nudge: nudgeMock.nudge,
|
|
2024
|
+
_eventStore: null,
|
|
2025
|
+
});
|
|
2012
2026
|
|
|
2013
|
-
|
|
2014
|
-
|
|
2015
|
-
|
|
2016
|
-
expect(
|
|
2027
|
+
const coordinatorNudges = nudgeMock.calls.filter(
|
|
2028
|
+
(c) => c.agentName === "coordinator" && c.message.includes("WATCHDOG"),
|
|
2029
|
+
);
|
|
2030
|
+
expect(coordinatorNudges).toHaveLength(1);
|
|
2031
|
+
expect(coordinatorNudges[0]?.message).toContain("have terminated");
|
|
2032
|
+
expect(coordinatorNudges[0]?.message).toContain("(1 completed, 1 zombie)");
|
|
2017
2033
|
});
|
|
2018
2034
|
|
|
2019
|
-
test("
|
|
2035
|
+
test("nudges coordinator when every worker is zombie", async () => {
|
|
2020
2036
|
const sessions = [
|
|
2021
|
-
makeSession({
|
|
2022
|
-
|
|
2023
|
-
|
|
2037
|
+
makeSession({
|
|
2038
|
+
id: "s1",
|
|
2039
|
+
agentName: "builder-one",
|
|
2040
|
+
capability: "builder",
|
|
2041
|
+
tmuxSession: "overstory-agent-fake-builder-one",
|
|
2042
|
+
state: "zombie",
|
|
2043
|
+
runId,
|
|
2044
|
+
lastActivity: new Date().toISOString(),
|
|
2045
|
+
}),
|
|
2046
|
+
makeSession({
|
|
2047
|
+
id: "s2",
|
|
2048
|
+
agentName: "builder-two",
|
|
2049
|
+
capability: "builder",
|
|
2050
|
+
tmuxSession: "overstory-agent-fake-builder-two",
|
|
2051
|
+
state: "zombie",
|
|
2052
|
+
runId,
|
|
2053
|
+
lastActivity: new Date().toISOString(),
|
|
2054
|
+
}),
|
|
2024
2055
|
];
|
|
2025
|
-
const msg = buildCompletionMessage(sessions, testRunId);
|
|
2026
|
-
expect(msg).toContain("3");
|
|
2027
|
-
});
|
|
2028
|
-
});
|
|
2029
|
-
|
|
2030
|
-
// === Bug fix tests: headless agent kill blast radius + stale detection ===
|
|
2031
|
-
|
|
2032
|
-
describe("headless agent kill blast radius fix (Bug 1)", () => {
|
|
2033
|
-
/**
|
|
2034
|
-
* Track PID kill calls without spawning real processes.
|
|
2035
|
-
* Also surfaces killTree calls so tests can assert on them.
|
|
2036
|
-
*/
|
|
2037
|
-
function processTracker(): {
|
|
2038
|
-
isAlive: (pid: number) => boolean;
|
|
2039
|
-
killTree: (pid: number) => Promise<void>;
|
|
2040
|
-
killed: number[];
|
|
2041
|
-
} {
|
|
2042
|
-
const killed: number[] = [];
|
|
2043
|
-
return {
|
|
2044
|
-
isAlive: (pid: number) => {
|
|
2045
|
-
try {
|
|
2046
|
-
process.kill(pid, 0);
|
|
2047
|
-
return true;
|
|
2048
|
-
} catch {
|
|
2049
|
-
return false;
|
|
2050
|
-
}
|
|
2051
|
-
},
|
|
2052
|
-
killTree: async (pid: number) => {
|
|
2053
|
-
killed.push(pid);
|
|
2054
|
-
},
|
|
2055
|
-
killed,
|
|
2056
|
-
};
|
|
2057
|
-
}
|
|
2058
|
-
|
|
2059
|
-
test("headless agent at escalation level 3 kills PID, not tmux session", async () => {
|
|
2060
|
-
const nudgeIntervalMs = 60_000;
|
|
2061
|
-
// stalledSince is 4 intervals ago — expectedLevel = floor(4) = 4, clamped to MAX (3)
|
|
2062
|
-
const stalledSince = new Date(Date.now() - 4 * nudgeIntervalMs).toISOString();
|
|
2063
|
-
const staleActivity = new Date(Date.now() - THRESHOLDS.staleThresholdMs * 2).toISOString();
|
|
2064
|
-
|
|
2065
|
-
const session = makeSession({
|
|
2066
|
-
agentName: "headless-stalled",
|
|
2067
|
-
tmuxSession: "", // headless
|
|
2068
|
-
pid: process.pid, // alive PID — ZFC won't trigger direct terminate
|
|
2069
|
-
state: "stalled",
|
|
2070
|
-
lastActivity: staleActivity,
|
|
2071
|
-
escalationLevel: 2,
|
|
2072
|
-
stalledSince,
|
|
2073
|
-
});
|
|
2074
2056
|
|
|
2075
|
-
writeSessionsToStore(tempRoot,
|
|
2057
|
+
writeSessionsToStore(tempRoot, sessions);
|
|
2058
|
+
await setActiveRun(tempRoot, runId);
|
|
2076
2059
|
|
|
2077
|
-
const
|
|
2078
|
-
// tmux mock: isSessionAlive("") returns true — simulates prefix-match bug scenario
|
|
2079
|
-
const tmuxMock = tmuxWithLiveness({ "": true });
|
|
2060
|
+
const nudgeMock = nudgeTracker();
|
|
2080
2061
|
|
|
2081
2062
|
await runDaemonTick({
|
|
2082
2063
|
root: tempRoot,
|
|
2083
2064
|
...THRESHOLDS,
|
|
2084
|
-
|
|
2085
|
-
tier1Enabled: false,
|
|
2086
|
-
_tmux: tmuxMock,
|
|
2065
|
+
_tmux: tmuxAllAlive(),
|
|
2087
2066
|
_triage: triageAlways("extend"),
|
|
2088
|
-
|
|
2067
|
+
_nudge: nudgeMock.nudge,
|
|
2089
2068
|
_eventStore: null,
|
|
2090
|
-
_recordFailure: async () => {},
|
|
2091
|
-
_getConnection: () => undefined,
|
|
2092
|
-
_removeConnection: () => {},
|
|
2093
|
-
_tailerRegistry: new Map(),
|
|
2094
|
-
_findLatestStdoutLog: async () => null,
|
|
2095
2069
|
});
|
|
2096
2070
|
|
|
2097
|
-
|
|
2098
|
-
|
|
2099
|
-
|
|
2071
|
+
const coordinatorNudges = nudgeMock.calls.filter(
|
|
2072
|
+
(c) => c.agentName === "coordinator" && c.message.includes("WATCHDOG"),
|
|
2073
|
+
);
|
|
2074
|
+
expect(coordinatorNudges).toHaveLength(1);
|
|
2075
|
+
expect(coordinatorNudges[0]?.message).toContain("(0 completed, 2 zombie)");
|
|
2100
2076
|
});
|
|
2101
2077
|
|
|
2102
|
-
test("
|
|
2103
|
-
|
|
2104
|
-
|
|
2105
|
-
|
|
2106
|
-
|
|
2107
|
-
|
|
2108
|
-
|
|
2109
|
-
|
|
2110
|
-
|
|
2111
|
-
|
|
2078
|
+
test("does not nudge when a working worker remains alongside a zombie", async () => {
|
|
2079
|
+
const sessions = [
|
|
2080
|
+
makeSession({
|
|
2081
|
+
id: "s1",
|
|
2082
|
+
agentName: "builder-one",
|
|
2083
|
+
capability: "builder",
|
|
2084
|
+
tmuxSession: "overstory-agent-fake-builder-one",
|
|
2085
|
+
state: "zombie",
|
|
2086
|
+
runId,
|
|
2087
|
+
lastActivity: new Date().toISOString(),
|
|
2088
|
+
}),
|
|
2089
|
+
makeSession({
|
|
2090
|
+
id: "s2",
|
|
2091
|
+
agentName: "builder-two",
|
|
2092
|
+
capability: "builder",
|
|
2093
|
+
tmuxSession: "overstory-agent-fake-builder-two",
|
|
2094
|
+
state: "working",
|
|
2095
|
+
runId,
|
|
2096
|
+
lastActivity: new Date().toISOString(),
|
|
2097
|
+
}),
|
|
2098
|
+
];
|
|
2112
2099
|
|
|
2113
|
-
writeSessionsToStore(tempRoot,
|
|
2100
|
+
writeSessionsToStore(tempRoot, sessions);
|
|
2101
|
+
await setActiveRun(tempRoot, runId);
|
|
2114
2102
|
|
|
2115
|
-
const
|
|
2116
|
-
// tmux mock: isSessionAlive("") returns true — would kill everything without the fix
|
|
2117
|
-
const tmuxMock = tmuxWithLiveness({ "": true });
|
|
2103
|
+
const nudgeMock = nudgeTracker();
|
|
2118
2104
|
|
|
2119
2105
|
await runDaemonTick({
|
|
2120
2106
|
root: tempRoot,
|
|
2121
2107
|
...THRESHOLDS,
|
|
2122
|
-
_tmux:
|
|
2108
|
+
_tmux: tmuxAllAlive(),
|
|
2123
2109
|
_triage: triageAlways("extend"),
|
|
2124
|
-
|
|
2110
|
+
_nudge: nudgeMock.nudge,
|
|
2125
2111
|
_eventStore: null,
|
|
2126
|
-
_recordFailure: async () => {},
|
|
2127
|
-
_getConnection: () => undefined,
|
|
2128
|
-
_removeConnection: () => {},
|
|
2129
|
-
_tailerRegistry: new Map(),
|
|
2130
|
-
_findLatestStdoutLog: async () => null,
|
|
2131
2112
|
});
|
|
2132
2113
|
|
|
2133
|
-
|
|
2134
|
-
|
|
2135
|
-
|
|
2114
|
+
const coordinatorNudges = nudgeMock.calls.filter(
|
|
2115
|
+
(c) => c.agentName === "coordinator" && c.message.includes("WATCHDOG"),
|
|
2116
|
+
);
|
|
2117
|
+
expect(coordinatorNudges).toHaveLength(0);
|
|
2136
2118
|
});
|
|
2137
2119
|
|
|
2138
|
-
test("
|
|
2139
|
-
const
|
|
2140
|
-
|
|
2141
|
-
|
|
2142
|
-
|
|
2143
|
-
|
|
2144
|
-
|
|
2145
|
-
|
|
2146
|
-
|
|
2147
|
-
|
|
2148
|
-
|
|
2149
|
-
|
|
2150
|
-
|
|
2151
|
-
|
|
2152
|
-
|
|
2153
|
-
|
|
2154
|
-
|
|
2120
|
+
test("run_complete event with zombies records zombieAgents and warn level", async () => {
|
|
2121
|
+
const sessions = [
|
|
2122
|
+
makeSession({
|
|
2123
|
+
id: "s1",
|
|
2124
|
+
agentName: "builder-one",
|
|
2125
|
+
capability: "builder",
|
|
2126
|
+
tmuxSession: "overstory-agent-fake-builder-one",
|
|
2127
|
+
state: "completed",
|
|
2128
|
+
runId,
|
|
2129
|
+
lastActivity: new Date().toISOString(),
|
|
2130
|
+
}),
|
|
2131
|
+
makeSession({
|
|
2132
|
+
id: "s2",
|
|
2133
|
+
agentName: "builder-two",
|
|
2134
|
+
capability: "builder",
|
|
2135
|
+
tmuxSession: "overstory-agent-fake-builder-two",
|
|
2136
|
+
state: "zombie",
|
|
2137
|
+
runId,
|
|
2138
|
+
lastActivity: new Date().toISOString(),
|
|
2139
|
+
}),
|
|
2140
|
+
];
|
|
2155
2141
|
|
|
2156
|
-
|
|
2157
|
-
|
|
2142
|
+
writeSessionsToStore(tempRoot, sessions);
|
|
2143
|
+
await setActiveRun(tempRoot, runId);
|
|
2158
2144
|
|
|
2159
|
-
|
|
2160
|
-
|
|
2161
|
-
...THRESHOLDS,
|
|
2162
|
-
nudgeIntervalMs,
|
|
2163
|
-
tier1Enabled: true,
|
|
2164
|
-
_tmux: tmuxMock,
|
|
2165
|
-
_triage: triageAlways("terminate"), // AI triage says terminate
|
|
2166
|
-
_nudge: nudgeTracker().nudge,
|
|
2167
|
-
_process: proc,
|
|
2168
|
-
_eventStore: null,
|
|
2169
|
-
_recordFailure: async () => {},
|
|
2170
|
-
_getConnection: () => undefined,
|
|
2171
|
-
_removeConnection: () => {},
|
|
2172
|
-
_tailerRegistry: new Map(),
|
|
2173
|
-
_findLatestStdoutLog: async () => null,
|
|
2174
|
-
});
|
|
2145
|
+
const eventsDbPath = join(tempRoot, ".overstory", "events.db");
|
|
2146
|
+
const eventStore = createEventStore(eventsDbPath);
|
|
2175
2147
|
|
|
2176
|
-
|
|
2177
|
-
|
|
2178
|
-
|
|
2148
|
+
try {
|
|
2149
|
+
await runDaemonTick({
|
|
2150
|
+
root: tempRoot,
|
|
2151
|
+
...THRESHOLDS,
|
|
2152
|
+
_tmux: tmuxAllAlive(),
|
|
2153
|
+
_triage: triageAlways("extend"),
|
|
2154
|
+
_nudge: nudgeTracker().nudge,
|
|
2155
|
+
_eventStore: eventStore,
|
|
2156
|
+
});
|
|
2157
|
+
} finally {
|
|
2158
|
+
eventStore.close();
|
|
2159
|
+
}
|
|
2160
|
+
|
|
2161
|
+
const store = createEventStore(eventsDbPath);
|
|
2162
|
+
try {
|
|
2163
|
+
const events = store.getTimeline({ since: "2000-01-01T00:00:00Z" });
|
|
2164
|
+
const runCompleteEvent = events.find((e) => {
|
|
2165
|
+
if (!e.data) return false;
|
|
2166
|
+
const data = JSON.parse(e.data) as Record<string, unknown>;
|
|
2167
|
+
return data.type === "run_complete";
|
|
2168
|
+
});
|
|
2169
|
+
expect(runCompleteEvent).toBeDefined();
|
|
2170
|
+
expect(runCompleteEvent?.level).toBe("warn");
|
|
2171
|
+
const data = JSON.parse(runCompleteEvent?.data ?? "{}") as Record<string, unknown>;
|
|
2172
|
+
expect(data.completedAgents).toEqual(["builder-one"]);
|
|
2173
|
+
expect(data.zombieAgents).toEqual(["builder-two"]);
|
|
2174
|
+
expect(data.workerCount).toBe(2);
|
|
2175
|
+
} finally {
|
|
2176
|
+
store.close();
|
|
2177
|
+
}
|
|
2179
2178
|
});
|
|
2180
|
-
});
|
|
2181
2179
|
|
|
2182
|
-
|
|
2183
|
-
|
|
2184
|
-
|
|
2180
|
+
test("missing current-run.txt: warns once, skips run-completion check (overstory-87bf)", async () => {
|
|
2181
|
+
const sessions = [
|
|
2182
|
+
makeSession({
|
|
2183
|
+
id: "s1",
|
|
2184
|
+
agentName: "builder-one",
|
|
2185
|
+
capability: "builder",
|
|
2186
|
+
tmuxSession: "overstory-agent-fake-builder-one",
|
|
2187
|
+
state: "completed",
|
|
2188
|
+
runId,
|
|
2189
|
+
lastActivity: new Date().toISOString(),
|
|
2190
|
+
}),
|
|
2191
|
+
makeSession({
|
|
2192
|
+
id: "s2",
|
|
2193
|
+
agentName: "builder-two",
|
|
2194
|
+
capability: "builder",
|
|
2195
|
+
tmuxSession: "overstory-agent-fake-builder-two",
|
|
2196
|
+
state: "completed",
|
|
2197
|
+
runId,
|
|
2198
|
+
lastActivity: new Date().toISOString(),
|
|
2199
|
+
}),
|
|
2200
|
+
];
|
|
2185
2201
|
|
|
2186
|
-
|
|
2187
|
-
|
|
2188
|
-
tmuxSession: "", // headless
|
|
2189
|
-
pid: process.pid, // alive
|
|
2190
|
-
state: "working",
|
|
2191
|
-
lastActivity: staleActivity, // stale — would trigger escalate without event fallback
|
|
2192
|
-
});
|
|
2202
|
+
writeSessionsToStore(tempRoot, sessions);
|
|
2203
|
+
// Deliberately do NOT call setActiveRun — current-run.txt absent.
|
|
2193
2204
|
|
|
2194
|
-
|
|
2205
|
+
const nudgeMock = nudgeTracker();
|
|
2206
|
+
const warnState = freshRunIdWarnState();
|
|
2195
2207
|
|
|
2196
|
-
const
|
|
2197
|
-
const
|
|
2208
|
+
const stderrWrites: string[] = [];
|
|
2209
|
+
const originalStderrWrite = process.stderr.write.bind(process.stderr);
|
|
2210
|
+
process.stderr.write = ((chunk: unknown, ...rest: unknown[]) => {
|
|
2211
|
+
stderrWrites.push(typeof chunk === "string" ? chunk : String(chunk));
|
|
2212
|
+
return originalStderrWrite(chunk as string, ...(rest as []));
|
|
2213
|
+
}) as typeof process.stderr.write;
|
|
2198
2214
|
|
|
2199
2215
|
try {
|
|
2200
|
-
|
|
2201
|
-
|
|
2202
|
-
|
|
2203
|
-
|
|
2204
|
-
|
|
2205
|
-
|
|
2206
|
-
|
|
2207
|
-
|
|
2208
|
-
toolDurationMs: 100,
|
|
2209
|
-
level: "info",
|
|
2210
|
-
data: null,
|
|
2216
|
+
await runDaemonTick({
|
|
2217
|
+
root: tempRoot,
|
|
2218
|
+
...THRESHOLDS,
|
|
2219
|
+
_tmux: tmuxAllAlive(),
|
|
2220
|
+
_triage: triageAlways("extend"),
|
|
2221
|
+
_nudge: nudgeMock.nudge,
|
|
2222
|
+
_eventStore: null,
|
|
2223
|
+
_runIdWarnState: warnState,
|
|
2211
2224
|
});
|
|
2212
2225
|
|
|
2213
|
-
|
|
2214
|
-
|
|
2226
|
+
// Tick again to confirm the warning dedupes for the same cause.
|
|
2215
2227
|
await runDaemonTick({
|
|
2216
2228
|
root: tempRoot,
|
|
2217
2229
|
...THRESHOLDS,
|
|
2218
|
-
onHealthCheck: (c) => checks.push(c),
|
|
2219
2230
|
_tmux: tmuxAllAlive(),
|
|
2220
2231
|
_triage: triageAlways("extend"),
|
|
2221
|
-
|
|
2222
|
-
_eventStore:
|
|
2223
|
-
|
|
2224
|
-
_getConnection: () => undefined,
|
|
2225
|
-
_removeConnection: () => {},
|
|
2226
|
-
_tailerRegistry: new Map(),
|
|
2227
|
-
_findLatestStdoutLog: async () => null,
|
|
2232
|
+
_nudge: nudgeMock.nudge,
|
|
2233
|
+
_eventStore: null,
|
|
2234
|
+
_runIdWarnState: warnState,
|
|
2228
2235
|
});
|
|
2229
|
-
|
|
2230
|
-
// Recent events found — lastActivity was refreshed, agent is NOT stalled
|
|
2231
|
-
expect(checks).toHaveLength(1);
|
|
2232
|
-
expect(checks[0]?.action).toBe("none");
|
|
2233
|
-
expect(checks[0]?.state).toBe("working");
|
|
2234
|
-
|
|
2235
|
-
const reloaded = readSessionsFromStore(tempRoot);
|
|
2236
|
-
expect(reloaded[0]?.state).toBe("working");
|
|
2237
2236
|
} finally {
|
|
2238
|
-
|
|
2237
|
+
process.stderr.write = originalStderrWrite;
|
|
2239
2238
|
}
|
|
2240
|
-
});
|
|
2241
2239
|
|
|
2242
|
-
|
|
2243
|
-
const
|
|
2240
|
+
// Run-completion skip is observable: no coordinator nudge was sent.
|
|
2241
|
+
const coordinatorNudges = nudgeMock.calls.filter(
|
|
2242
|
+
(c) => c.agentName === "coordinator" && c.message.includes("WATCHDOG"),
|
|
2243
|
+
);
|
|
2244
|
+
expect(coordinatorNudges).toHaveLength(0);
|
|
2244
2245
|
|
|
2245
|
-
|
|
2246
|
-
|
|
2247
|
-
|
|
2248
|
-
|
|
2249
|
-
|
|
2250
|
-
|
|
2251
|
-
|
|
2246
|
+
// Warning logged exactly once across the two ticks.
|
|
2247
|
+
expect(warnState.missingFileWarned).toBe(true);
|
|
2248
|
+
const missingWarnings = stderrWrites.filter((w) =>
|
|
2249
|
+
w.includes("[WATCHDOG] current-run.txt missing"),
|
|
2250
|
+
);
|
|
2251
|
+
expect(missingWarnings).toHaveLength(1);
|
|
2252
|
+
});
|
|
2252
2253
|
|
|
2253
|
-
|
|
2254
|
+
test("stale current-run.txt id (no row in runs table): warns once per id, skips check (overstory-87bf)", async () => {
|
|
2255
|
+
const staleId = "run-stale-2026-01-01T00-00-00-000Z";
|
|
2256
|
+
const sessions = [
|
|
2257
|
+
makeSession({
|
|
2258
|
+
id: "s1",
|
|
2259
|
+
agentName: "builder-one",
|
|
2260
|
+
capability: "builder",
|
|
2261
|
+
tmuxSession: "overstory-agent-fake-builder-one",
|
|
2262
|
+
state: "completed",
|
|
2263
|
+
runId: staleId,
|
|
2264
|
+
lastActivity: new Date().toISOString(),
|
|
2265
|
+
}),
|
|
2266
|
+
makeSession({
|
|
2267
|
+
id: "s2",
|
|
2268
|
+
agentName: "builder-two",
|
|
2269
|
+
capability: "builder",
|
|
2270
|
+
tmuxSession: "overstory-agent-fake-builder-two",
|
|
2271
|
+
state: "completed",
|
|
2272
|
+
runId: staleId,
|
|
2273
|
+
lastActivity: new Date().toISOString(),
|
|
2274
|
+
}),
|
|
2275
|
+
];
|
|
2254
2276
|
|
|
2255
|
-
|
|
2256
|
-
|
|
2277
|
+
writeSessionsToStore(tempRoot, sessions);
|
|
2278
|
+
// Write current-run.txt but DO NOT seed the runs table — the lookup
|
|
2279
|
+
// will return null, exercising the stale-id branch.
|
|
2280
|
+
await Bun.write(join(tempRoot, ".overstory", "current-run.txt"), staleId);
|
|
2257
2281
|
|
|
2258
|
-
|
|
2259
|
-
|
|
2282
|
+
const nudgeMock = nudgeTracker();
|
|
2283
|
+
const warnState = freshRunIdWarnState();
|
|
2260
2284
|
|
|
2261
|
-
|
|
2285
|
+
const stderrWrites: string[] = [];
|
|
2286
|
+
const originalStderrWrite = process.stderr.write.bind(process.stderr);
|
|
2287
|
+
process.stderr.write = ((chunk: unknown, ...rest: unknown[]) => {
|
|
2288
|
+
stderrWrites.push(typeof chunk === "string" ? chunk : String(chunk));
|
|
2289
|
+
return originalStderrWrite(chunk as string, ...(rest as []));
|
|
2290
|
+
}) as typeof process.stderr.write;
|
|
2262
2291
|
|
|
2292
|
+
try {
|
|
2263
2293
|
await runDaemonTick({
|
|
2264
2294
|
root: tempRoot,
|
|
2265
2295
|
...THRESHOLDS,
|
|
2266
|
-
onHealthCheck: (c) => checks.push(c),
|
|
2267
2296
|
_tmux: tmuxAllAlive(),
|
|
2268
2297
|
_triage: triageAlways("extend"),
|
|
2269
|
-
|
|
2270
|
-
_eventStore:
|
|
2271
|
-
|
|
2272
|
-
_getConnection: () => undefined,
|
|
2273
|
-
_removeConnection: () => {},
|
|
2274
|
-
_tailerRegistry: new Map(),
|
|
2275
|
-
_findLatestStdoutLog: async () => null,
|
|
2298
|
+
_nudge: nudgeMock.nudge,
|
|
2299
|
+
_eventStore: null,
|
|
2300
|
+
_runIdWarnState: warnState,
|
|
2276
2301
|
});
|
|
2277
2302
|
|
|
2278
|
-
|
|
2279
|
-
|
|
2280
|
-
|
|
2303
|
+
await runDaemonTick({
|
|
2304
|
+
root: tempRoot,
|
|
2305
|
+
...THRESHOLDS,
|
|
2306
|
+
_tmux: tmuxAllAlive(),
|
|
2307
|
+
_triage: triageAlways("extend"),
|
|
2308
|
+
_nudge: nudgeMock.nudge,
|
|
2309
|
+
_eventStore: null,
|
|
2310
|
+
_runIdWarnState: warnState,
|
|
2311
|
+
});
|
|
2281
2312
|
} finally {
|
|
2282
|
-
|
|
2313
|
+
process.stderr.write = originalStderrWrite;
|
|
2283
2314
|
}
|
|
2315
|
+
|
|
2316
|
+
// Run-completion skip is observable: no coordinator nudge.
|
|
2317
|
+
const coordinatorNudges = nudgeMock.calls.filter(
|
|
2318
|
+
(c) => c.agentName === "coordinator" && c.message.includes("WATCHDOG"),
|
|
2319
|
+
);
|
|
2320
|
+
expect(coordinatorNudges).toHaveLength(0);
|
|
2321
|
+
|
|
2322
|
+
// Stale-id was recorded once, missing-file path was NOT triggered.
|
|
2323
|
+
expect(warnState.unknownIds.has(staleId)).toBe(true);
|
|
2324
|
+
expect(warnState.missingFileWarned).toBe(false);
|
|
2325
|
+
const staleWarnings = stderrWrites.filter((w) =>
|
|
2326
|
+
w.includes(`points to unknown run "${staleId}"`),
|
|
2327
|
+
);
|
|
2328
|
+
expect(staleWarnings).toHaveLength(1);
|
|
2284
2329
|
});
|
|
2285
2330
|
});
|
|
2286
2331
|
|
|
2287
|
-
//
|
|
2288
|
-
// startDaemon() shutdown cleanup
|
|
2289
|
-
// ============================================================
|
|
2332
|
+
// === buildCompletionMessage unit tests ===
|
|
2290
2333
|
|
|
2291
|
-
describe("
|
|
2292
|
-
|
|
2334
|
+
describe("buildCompletionMessage", () => {
|
|
2335
|
+
const testRunId = "run-test-123";
|
|
2293
2336
|
|
|
2294
|
-
|
|
2295
|
-
|
|
2337
|
+
test("all scouts → contains 'scout' and 'Ready for next phase'", () => {
|
|
2338
|
+
const sessions = [
|
|
2339
|
+
makeSession({ capability: "scout", agentName: "scout-1" }),
|
|
2340
|
+
makeSession({ capability: "scout", agentName: "scout-2" }),
|
|
2341
|
+
];
|
|
2342
|
+
const msg = buildCompletionMessage(sessions, testRunId);
|
|
2343
|
+
expect(msg).toContain("scout");
|
|
2344
|
+
expect(msg).toContain("Ready for next phase");
|
|
2345
|
+
expect(msg).not.toContain("merge/cleanup");
|
|
2296
2346
|
});
|
|
2297
2347
|
|
|
2298
|
-
|
|
2299
|
-
|
|
2348
|
+
test("all builders → contains 'builder' and 'Awaiting lead verification' (not merge authorization)", () => {
|
|
2349
|
+
const sessions = [
|
|
2350
|
+
makeSession({ capability: "builder", agentName: "builder-1" }),
|
|
2351
|
+
makeSession({ capability: "builder", agentName: "builder-2" }),
|
|
2352
|
+
];
|
|
2353
|
+
const msg = buildCompletionMessage(sessions, testRunId);
|
|
2354
|
+
expect(msg).toContain("builder");
|
|
2355
|
+
expect(msg).toContain("Awaiting lead verification");
|
|
2356
|
+
expect(msg).not.toContain("merge/cleanup");
|
|
2300
2357
|
});
|
|
2301
2358
|
|
|
2302
|
-
test("
|
|
2303
|
-
|
|
2304
|
-
const
|
|
2359
|
+
test("all reviewers → contains 'reviewer' and 'Reviews done'", () => {
|
|
2360
|
+
const sessions = [makeSession({ capability: "reviewer", agentName: "reviewer-1" })];
|
|
2361
|
+
const msg = buildCompletionMessage(sessions, testRunId);
|
|
2362
|
+
expect(msg).toContain("reviewer");
|
|
2363
|
+
expect(msg).toContain("Reviews done");
|
|
2364
|
+
});
|
|
2305
2365
|
|
|
2306
|
-
|
|
2307
|
-
|
|
2308
|
-
|
|
2309
|
-
|
|
2310
|
-
|
|
2366
|
+
test("all leads → contains 'lead' and 'Ready for merge/cleanup'", () => {
|
|
2367
|
+
const sessions = [makeSession({ capability: "lead", agentName: "lead-1" })];
|
|
2368
|
+
const msg = buildCompletionMessage(sessions, testRunId);
|
|
2369
|
+
expect(msg).toContain("lead");
|
|
2370
|
+
expect(msg).toContain("Ready for merge/cleanup");
|
|
2371
|
+
});
|
|
2372
|
+
|
|
2373
|
+
test("all mergers → contains 'merger' and 'Merges done'", () => {
|
|
2374
|
+
const sessions = [makeSession({ capability: "merger", agentName: "merger-1" })];
|
|
2375
|
+
const msg = buildCompletionMessage(sessions, testRunId);
|
|
2376
|
+
expect(msg).toContain("merger");
|
|
2377
|
+
expect(msg).toContain("Merges done");
|
|
2378
|
+
});
|
|
2379
|
+
|
|
2380
|
+
test("mixed capabilities → contains breakdown and 'Ready for next steps'", () => {
|
|
2381
|
+
const sessions = [
|
|
2382
|
+
makeSession({ capability: "scout", agentName: "scout-1" }),
|
|
2383
|
+
makeSession({ capability: "builder", agentName: "builder-1" }),
|
|
2384
|
+
];
|
|
2385
|
+
const msg = buildCompletionMessage(sessions, testRunId);
|
|
2386
|
+
expect(msg).toContain("(builder, scout)");
|
|
2387
|
+
expect(msg).toContain("Ready for next steps");
|
|
2388
|
+
});
|
|
2389
|
+
|
|
2390
|
+
test("message includes the run ID", () => {
|
|
2391
|
+
const sessions = [makeSession({ capability: "builder", agentName: "builder-1" })];
|
|
2392
|
+
const msg = buildCompletionMessage(sessions, testRunId);
|
|
2393
|
+
expect(msg).toContain(testRunId);
|
|
2394
|
+
});
|
|
2395
|
+
|
|
2396
|
+
test("message includes the worker count", () => {
|
|
2397
|
+
const sessions = [
|
|
2398
|
+
makeSession({ capability: "scout", agentName: "scout-1" }),
|
|
2399
|
+
makeSession({ capability: "scout", agentName: "scout-2" }),
|
|
2400
|
+
makeSession({ capability: "scout", agentName: "scout-3" }),
|
|
2401
|
+
];
|
|
2402
|
+
const msg = buildCompletionMessage(sessions, testRunId);
|
|
2403
|
+
expect(msg).toContain("3");
|
|
2404
|
+
});
|
|
2405
|
+
|
|
2406
|
+
// overstory-e130: zombie workers must surface in the message so the coordinator
|
|
2407
|
+
// reads "have terminated (...)" instead of being misled into "have completed".
|
|
2408
|
+
test("mix of completed and zombie workers → 'have terminated' with completed/zombie qualifier", () => {
|
|
2409
|
+
const sessions = [
|
|
2410
|
+
makeSession({ capability: "builder", agentName: "builder-1", state: "completed" }),
|
|
2411
|
+
makeSession({ capability: "builder", agentName: "builder-2", state: "zombie" }),
|
|
2412
|
+
makeSession({ capability: "builder", agentName: "builder-3", state: "completed" }),
|
|
2413
|
+
];
|
|
2414
|
+
const msg = buildCompletionMessage(sessions, testRunId);
|
|
2415
|
+
expect(msg).toContain("have terminated");
|
|
2416
|
+
expect(msg).toContain("(2 completed, 1 zombie)");
|
|
2417
|
+
expect(msg).not.toContain("have completed");
|
|
2418
|
+
// Capability-specific suffix is preserved
|
|
2419
|
+
expect(msg).toContain("Awaiting lead verification");
|
|
2420
|
+
});
|
|
2421
|
+
|
|
2422
|
+
test("all-zombie batch → '(0 completed, N zombie)' qualifier", () => {
|
|
2423
|
+
const sessions = [
|
|
2424
|
+
makeSession({ capability: "scout", agentName: "scout-1", state: "zombie" }),
|
|
2425
|
+
makeSession({ capability: "scout", agentName: "scout-2", state: "zombie" }),
|
|
2426
|
+
];
|
|
2427
|
+
const msg = buildCompletionMessage(sessions, testRunId);
|
|
2428
|
+
expect(msg).toContain("have terminated");
|
|
2429
|
+
expect(msg).toContain("(0 completed, 2 zombie)");
|
|
2430
|
+
expect(msg).toContain("Ready for next phase");
|
|
2431
|
+
});
|
|
2432
|
+
|
|
2433
|
+
test("mixed-capability batch with zombies includes both qualifier and capability breakdown", () => {
|
|
2434
|
+
const sessions = [
|
|
2435
|
+
makeSession({ capability: "scout", agentName: "scout-1", state: "completed" }),
|
|
2436
|
+
makeSession({ capability: "builder", agentName: "builder-1", state: "zombie" }),
|
|
2437
|
+
];
|
|
2438
|
+
const msg = buildCompletionMessage(sessions, testRunId);
|
|
2439
|
+
expect(msg).toContain("have terminated");
|
|
2440
|
+
expect(msg).toContain("(1 completed, 1 zombie)");
|
|
2441
|
+
expect(msg).toContain("(builder, scout)");
|
|
2442
|
+
expect(msg).toContain("Ready for next steps");
|
|
2443
|
+
});
|
|
2444
|
+
|
|
2445
|
+
test("all-completed batch keeps existing 'have completed' phrasing (no zombie qualifier)", () => {
|
|
2446
|
+
const sessions = [
|
|
2447
|
+
makeSession({ capability: "builder", agentName: "builder-1", state: "completed" }),
|
|
2448
|
+
makeSession({ capability: "builder", agentName: "builder-2", state: "completed" }),
|
|
2449
|
+
];
|
|
2450
|
+
const msg = buildCompletionMessage(sessions, testRunId);
|
|
2451
|
+
expect(msg).toContain("have completed");
|
|
2452
|
+
expect(msg).not.toContain("have terminated");
|
|
2453
|
+
expect(msg).not.toContain("zombie");
|
|
2454
|
+
});
|
|
2455
|
+
});
|
|
2456
|
+
|
|
2457
|
+
// === Bug fix tests: headless agent kill blast radius + stale detection ===
|
|
2458
|
+
|
|
2459
|
+
describe("headless agent kill blast radius fix (Bug 1)", () => {
|
|
2460
|
+
/**
|
|
2461
|
+
* Track PID kill calls without spawning real processes.
|
|
2462
|
+
* Also surfaces killTree calls so tests can assert on them.
|
|
2463
|
+
*/
|
|
2464
|
+
function processTracker(): {
|
|
2465
|
+
isAlive: (pid: number) => boolean;
|
|
2466
|
+
killTree: (pid: number) => Promise<void>;
|
|
2467
|
+
killed: number[];
|
|
2468
|
+
} {
|
|
2469
|
+
const killed: number[] = [];
|
|
2470
|
+
return {
|
|
2471
|
+
isAlive: (pid: number) => {
|
|
2472
|
+
try {
|
|
2473
|
+
process.kill(pid, 0);
|
|
2474
|
+
return true;
|
|
2475
|
+
} catch {
|
|
2476
|
+
return false;
|
|
2477
|
+
}
|
|
2478
|
+
},
|
|
2479
|
+
killTree: async (pid: number) => {
|
|
2480
|
+
killed.push(pid);
|
|
2481
|
+
},
|
|
2482
|
+
killed,
|
|
2483
|
+
};
|
|
2484
|
+
}
|
|
2485
|
+
|
|
2486
|
+
test("headless agent at escalation level 3 kills PID, not tmux session", async () => {
|
|
2487
|
+
const nudgeIntervalMs = 60_000;
|
|
2488
|
+
// stalledSince is 4 intervals ago — expectedLevel = floor(4) = 4, clamped to MAX (3)
|
|
2489
|
+
const stalledSince = new Date(Date.now() - 4 * nudgeIntervalMs).toISOString();
|
|
2490
|
+
const staleActivity = new Date(Date.now() - THRESHOLDS.staleThresholdMs * 2).toISOString();
|
|
2491
|
+
|
|
2492
|
+
const session = makeSession({
|
|
2493
|
+
agentName: "headless-stalled",
|
|
2494
|
+
tmuxSession: "", // headless
|
|
2495
|
+
pid: process.pid, // alive PID — ZFC won't trigger direct terminate
|
|
2496
|
+
state: "stalled",
|
|
2497
|
+
lastActivity: staleActivity,
|
|
2498
|
+
escalationLevel: 2,
|
|
2499
|
+
stalledSince,
|
|
2500
|
+
});
|
|
2501
|
+
|
|
2502
|
+
writeSessionsToStore(tempRoot, [session]);
|
|
2503
|
+
|
|
2504
|
+
const proc = processTracker();
|
|
2505
|
+
// tmux mock: isSessionAlive("") returns true — simulates prefix-match bug scenario
|
|
2506
|
+
const tmuxMock = tmuxWithLiveness({ "": true });
|
|
2507
|
+
|
|
2508
|
+
await runDaemonTick({
|
|
2509
|
+
root: tempRoot,
|
|
2510
|
+
...THRESHOLDS,
|
|
2511
|
+
nudgeIntervalMs,
|
|
2512
|
+
tier1Enabled: false,
|
|
2513
|
+
_tmux: tmuxMock,
|
|
2514
|
+
_triage: triageAlways("extend"),
|
|
2515
|
+
_process: proc,
|
|
2516
|
+
_eventStore: null,
|
|
2517
|
+
_recordFailure: async () => {},
|
|
2518
|
+
_getConnection: () => undefined,
|
|
2519
|
+
_removeConnection: () => {},
|
|
2520
|
+
_tailerRegistry: new Map(),
|
|
2521
|
+
_findLatestStdoutLog: async () => null,
|
|
2522
|
+
});
|
|
2523
|
+
|
|
2524
|
+
// PID was killed via killTree, NOT via tmux killSession("")
|
|
2525
|
+
expect(proc.killed).toContain(process.pid);
|
|
2526
|
+
expect(tmuxMock.killed).not.toContain("");
|
|
2527
|
+
});
|
|
2528
|
+
|
|
2529
|
+
test("headless agent direct terminate kills PID, not tmux", async () => {
|
|
2530
|
+
// PID 999999 is virtually guaranteed not to exist — health check sees it as dead
|
|
2531
|
+
const deadPid = 999999;
|
|
2532
|
+
const session = makeSession({
|
|
2533
|
+
agentName: "headless-dead-pid",
|
|
2534
|
+
tmuxSession: "", // headless
|
|
2535
|
+
pid: deadPid,
|
|
2536
|
+
state: "working",
|
|
2537
|
+
lastActivity: new Date().toISOString(),
|
|
2538
|
+
});
|
|
2539
|
+
|
|
2540
|
+
writeSessionsToStore(tempRoot, [session]);
|
|
2541
|
+
|
|
2542
|
+
const proc = processTracker();
|
|
2543
|
+
// tmux mock: isSessionAlive("") returns true — would kill everything without the fix
|
|
2544
|
+
const tmuxMock = tmuxWithLiveness({ "": true });
|
|
2545
|
+
|
|
2546
|
+
await runDaemonTick({
|
|
2547
|
+
root: tempRoot,
|
|
2548
|
+
...THRESHOLDS,
|
|
2549
|
+
_tmux: tmuxMock,
|
|
2550
|
+
_triage: triageAlways("extend"),
|
|
2551
|
+
_process: proc,
|
|
2552
|
+
_eventStore: null,
|
|
2553
|
+
_recordFailure: async () => {},
|
|
2554
|
+
_getConnection: () => undefined,
|
|
2555
|
+
_removeConnection: () => {},
|
|
2556
|
+
_tailerRegistry: new Map(),
|
|
2557
|
+
_findLatestStdoutLog: async () => null,
|
|
2558
|
+
});
|
|
2559
|
+
|
|
2560
|
+
// Should have attempted PID kill, NOT tmux killSession("")
|
|
2561
|
+
expect(proc.killed).toContain(deadPid);
|
|
2562
|
+
expect(tmuxMock.killed).not.toContain("");
|
|
2563
|
+
});
|
|
2564
|
+
|
|
2565
|
+
test("triage terminate on headless agent kills PID, not tmux", async () => {
|
|
2566
|
+
const nudgeIntervalMs = 60_000;
|
|
2567
|
+
// stalledSince is 2.5 intervals ago — expectedLevel = floor(2.5) = 2 → triage fires
|
|
2568
|
+
const stalledSince = new Date(Date.now() - 2.5 * nudgeIntervalMs).toISOString();
|
|
2569
|
+
const staleActivity = new Date(Date.now() - THRESHOLDS.staleThresholdMs * 2).toISOString();
|
|
2570
|
+
|
|
2571
|
+
const session = makeSession({
|
|
2572
|
+
agentName: "headless-triage-terminate",
|
|
2573
|
+
tmuxSession: "", // headless
|
|
2574
|
+
pid: process.pid, // alive
|
|
2575
|
+
state: "stalled",
|
|
2576
|
+
lastActivity: staleActivity,
|
|
2577
|
+
escalationLevel: 1,
|
|
2578
|
+
stalledSince,
|
|
2579
|
+
});
|
|
2580
|
+
|
|
2581
|
+
writeSessionsToStore(tempRoot, [session]);
|
|
2582
|
+
|
|
2583
|
+
const proc = processTracker();
|
|
2584
|
+
const tmuxMock = tmuxWithLiveness({ "": true });
|
|
2585
|
+
|
|
2586
|
+
await runDaemonTick({
|
|
2587
|
+
root: tempRoot,
|
|
2588
|
+
...THRESHOLDS,
|
|
2589
|
+
nudgeIntervalMs,
|
|
2590
|
+
tier1Enabled: true,
|
|
2591
|
+
_tmux: tmuxMock,
|
|
2592
|
+
_triage: triageAlways("terminate"), // AI triage says terminate
|
|
2593
|
+
_nudge: nudgeTracker().nudge,
|
|
2594
|
+
_process: proc,
|
|
2595
|
+
_eventStore: null,
|
|
2596
|
+
_recordFailure: async () => {},
|
|
2597
|
+
_getConnection: () => undefined,
|
|
2598
|
+
_removeConnection: () => {},
|
|
2599
|
+
_tailerRegistry: new Map(),
|
|
2600
|
+
_findLatestStdoutLog: async () => null,
|
|
2601
|
+
});
|
|
2602
|
+
|
|
2603
|
+
// Should have killed the PID, not the tmux session
|
|
2604
|
+
expect(proc.killed).toContain(process.pid);
|
|
2605
|
+
expect(tmuxMock.killed).not.toContain("");
|
|
2606
|
+
});
|
|
2607
|
+
});
|
|
2608
|
+
|
|
2609
|
+
describe("headless agent stale detection via events.db (Bug 2)", () => {
|
|
2610
|
+
test("headless agent with recent events in events.db is not flagged stale", async () => {
|
|
2611
|
+
const staleActivity = new Date(Date.now() - THRESHOLDS.staleThresholdMs * 2).toISOString();
|
|
2612
|
+
|
|
2613
|
+
const session = makeSession({
|
|
2614
|
+
agentName: "headless-active",
|
|
2615
|
+
tmuxSession: "", // headless
|
|
2616
|
+
pid: process.pid, // alive
|
|
2617
|
+
state: "working",
|
|
2618
|
+
lastActivity: staleActivity, // stale — would trigger escalate without event fallback
|
|
2619
|
+
});
|
|
2620
|
+
|
|
2621
|
+
writeSessionsToStore(tempRoot, [session]);
|
|
2622
|
+
|
|
2623
|
+
const eventsDbPath = join(tempRoot, ".overstory", "events.db");
|
|
2624
|
+
const eventStore = createEventStore(eventsDbPath);
|
|
2625
|
+
|
|
2626
|
+
try {
|
|
2627
|
+
// Insert a recent event for this agent (within the stale threshold window)
|
|
2628
|
+
eventStore.insert({
|
|
2629
|
+
runId: null,
|
|
2630
|
+
agentName: "headless-active",
|
|
2631
|
+
sessionId: null,
|
|
2632
|
+
eventType: "tool_end",
|
|
2633
|
+
toolName: "Read",
|
|
2634
|
+
toolArgs: null,
|
|
2635
|
+
toolDurationMs: 100,
|
|
2636
|
+
level: "info",
|
|
2637
|
+
data: null,
|
|
2638
|
+
});
|
|
2639
|
+
|
|
2640
|
+
const checks: HealthCheck[] = [];
|
|
2641
|
+
|
|
2642
|
+
await runDaemonTick({
|
|
2643
|
+
root: tempRoot,
|
|
2644
|
+
...THRESHOLDS,
|
|
2645
|
+
onHealthCheck: (c) => checks.push(c),
|
|
2646
|
+
_tmux: tmuxAllAlive(),
|
|
2647
|
+
_triage: triageAlways("extend"),
|
|
2648
|
+
_process: { isAlive: () => true, killTree: async () => {} },
|
|
2649
|
+
_eventStore: eventStore,
|
|
2650
|
+
_recordFailure: async () => {},
|
|
2651
|
+
_getConnection: () => undefined,
|
|
2652
|
+
_removeConnection: () => {},
|
|
2653
|
+
_tailerRegistry: new Map(),
|
|
2654
|
+
_findLatestStdoutLog: async () => null,
|
|
2655
|
+
});
|
|
2656
|
+
|
|
2657
|
+
// Recent events found — lastActivity was refreshed, agent is NOT stalled
|
|
2658
|
+
expect(checks).toHaveLength(1);
|
|
2659
|
+
expect(checks[0]?.action).toBe("none");
|
|
2660
|
+
expect(checks[0]?.state).toBe("working");
|
|
2661
|
+
|
|
2662
|
+
const reloaded = readSessionsFromStore(tempRoot);
|
|
2663
|
+
expect(reloaded[0]?.state).toBe("working");
|
|
2664
|
+
} finally {
|
|
2665
|
+
eventStore.close();
|
|
2666
|
+
}
|
|
2667
|
+
});
|
|
2668
|
+
|
|
2669
|
+
test("headless agent with no recent events IS flagged stale", async () => {
|
|
2670
|
+
const staleActivity = new Date(Date.now() - THRESHOLDS.staleThresholdMs * 2).toISOString();
|
|
2671
|
+
|
|
2672
|
+
const session = makeSession({
|
|
2673
|
+
agentName: "headless-silent",
|
|
2674
|
+
tmuxSession: "", // headless
|
|
2675
|
+
pid: process.pid, // alive
|
|
2676
|
+
state: "working",
|
|
2677
|
+
lastActivity: staleActivity, // stale
|
|
2678
|
+
});
|
|
2679
|
+
|
|
2680
|
+
writeSessionsToStore(tempRoot, [session]);
|
|
2681
|
+
|
|
2682
|
+
const eventsDbPath = join(tempRoot, ".overstory", "events.db");
|
|
2683
|
+
const eventStore = createEventStore(eventsDbPath);
|
|
2684
|
+
|
|
2685
|
+
try {
|
|
2686
|
+
// No events inserted for this agent — event fallback finds nothing
|
|
2687
|
+
|
|
2688
|
+
const checks: HealthCheck[] = [];
|
|
2689
|
+
|
|
2690
|
+
await runDaemonTick({
|
|
2691
|
+
root: tempRoot,
|
|
2692
|
+
...THRESHOLDS,
|
|
2693
|
+
onHealthCheck: (c) => checks.push(c),
|
|
2694
|
+
_tmux: tmuxAllAlive(),
|
|
2695
|
+
_triage: triageAlways("extend"),
|
|
2696
|
+
_process: { isAlive: () => true, killTree: async () => {} },
|
|
2697
|
+
_eventStore: eventStore,
|
|
2698
|
+
_recordFailure: async () => {},
|
|
2699
|
+
_getConnection: () => undefined,
|
|
2700
|
+
_removeConnection: () => {},
|
|
2701
|
+
_tailerRegistry: new Map(),
|
|
2702
|
+
_findLatestStdoutLog: async () => null,
|
|
2703
|
+
});
|
|
2704
|
+
|
|
2705
|
+
// No recent events — lastActivity stays stale, agent IS flagged stalled
|
|
2706
|
+
expect(checks).toHaveLength(1);
|
|
2707
|
+
expect(checks[0]?.action).toBe("escalate");
|
|
2708
|
+
} finally {
|
|
2709
|
+
eventStore.close();
|
|
2710
|
+
}
|
|
2711
|
+
});
|
|
2712
|
+
|
|
2713
|
+
test("spawn-per-turn worker (pid=null) is NOT flagged zombie when actively emitting events (overstory-7a34)", async () => {
|
|
2714
|
+
// Repro: ov sling --capability lead → freshly slung headless lead has
|
|
2715
|
+
// tmuxSession='' AND pid=null (no persistent process between turns).
|
|
2716
|
+
// Previously the daemon's event-based liveness fallback was gated by
|
|
2717
|
+
// `pid !== null`, so spawn-per-turn workers' lastActivity was never
|
|
2718
|
+
// refreshed from events.db and they would flip to stalled / zombie
|
|
2719
|
+
// despite ov feed showing live tool activity.
|
|
2720
|
+
const staleActivity = new Date(Date.now() - THRESHOLDS.staleThresholdMs * 2).toISOString();
|
|
2721
|
+
|
|
2722
|
+
const session = makeSession({
|
|
2723
|
+
agentName: "spawn-per-turn-lead",
|
|
2724
|
+
capability: "lead",
|
|
2725
|
+
tmuxSession: "", // headless
|
|
2726
|
+
pid: null, // spawn-per-turn: no persistent process between turns
|
|
2727
|
+
state: "working",
|
|
2728
|
+
lastActivity: staleActivity, // stale — would flip without event fallback
|
|
2729
|
+
});
|
|
2730
|
+
|
|
2731
|
+
writeSessionsToStore(tempRoot, [session]);
|
|
2732
|
+
|
|
2733
|
+
const eventsDbPath = join(tempRoot, ".overstory", "events.db");
|
|
2734
|
+
const eventStore = createEventStore(eventsDbPath);
|
|
2735
|
+
|
|
2736
|
+
try {
|
|
2737
|
+
// Insert a recent tool event for this agent (matches ov feed activity)
|
|
2738
|
+
eventStore.insert({
|
|
2739
|
+
runId: null,
|
|
2740
|
+
agentName: "spawn-per-turn-lead",
|
|
2741
|
+
sessionId: null,
|
|
2742
|
+
eventType: "tool_end",
|
|
2743
|
+
toolName: "Edit",
|
|
2744
|
+
toolArgs: null,
|
|
2745
|
+
toolDurationMs: 50,
|
|
2746
|
+
level: "info",
|
|
2747
|
+
data: null,
|
|
2748
|
+
});
|
|
2749
|
+
|
|
2750
|
+
const checks: HealthCheck[] = [];
|
|
2751
|
+
|
|
2752
|
+
await runDaemonTick({
|
|
2753
|
+
root: tempRoot,
|
|
2754
|
+
...THRESHOLDS,
|
|
2755
|
+
onHealthCheck: (c) => checks.push(c),
|
|
2756
|
+
_tmux: tmuxAllAlive(),
|
|
2757
|
+
_triage: triageAlways("extend"),
|
|
2758
|
+
_process: { isAlive: () => true, killTree: async () => {} },
|
|
2759
|
+
_eventStore: eventStore,
|
|
2760
|
+
_recordFailure: async () => {},
|
|
2761
|
+
_getConnection: () => undefined,
|
|
2762
|
+
_removeConnection: () => {},
|
|
2763
|
+
_tailerRegistry: new Map(),
|
|
2764
|
+
_findLatestStdoutLog: async () => null,
|
|
2765
|
+
});
|
|
2766
|
+
|
|
2767
|
+
// lastActivity refreshed from events.db → spawn-per-turn evaluation
|
|
2768
|
+
// path keeps the agent in working, NOT zombie.
|
|
2769
|
+
expect(checks).toHaveLength(1);
|
|
2770
|
+
expect(checks[0]?.action).toBe("none");
|
|
2771
|
+
expect(checks[0]?.state).toBe("working");
|
|
2772
|
+
|
|
2773
|
+
const reloaded = readSessionsFromStore(tempRoot);
|
|
2774
|
+
expect(reloaded[0]?.state).toBe("working");
|
|
2775
|
+
} finally {
|
|
2776
|
+
eventStore.close();
|
|
2777
|
+
}
|
|
2778
|
+
});
|
|
2779
|
+
});
|
|
2780
|
+
|
|
2781
|
+
// ============================================================
|
|
2782
|
+
// startDaemon() shutdown cleanup
|
|
2783
|
+
// ============================================================
|
|
2784
|
+
|
|
2785
|
+
describe("startDaemon() stop() cleans up tailer registry", () => {
|
|
2786
|
+
let tempRoot: string;
|
|
2787
|
+
|
|
2788
|
+
beforeEach(async () => {
|
|
2789
|
+
tempRoot = await createTempRoot();
|
|
2790
|
+
});
|
|
2791
|
+
|
|
2792
|
+
afterEach(async () => {
|
|
2793
|
+
await cleanupTempDir(tempRoot);
|
|
2794
|
+
});
|
|
2795
|
+
|
|
2796
|
+
test("stop() calls handle.stop() on all registry entries and empties the map", async () => {
|
|
2797
|
+
// Build a fake tailer registry with two entries.
|
|
2798
|
+
const stopped: Record<string, boolean> = { tailer1: false, tailer2: false };
|
|
2799
|
+
|
|
2800
|
+
const registry = new Map<string, { agentName: string; logPath: string; stop(): void }>([
|
|
2801
|
+
[
|
|
2802
|
+
"agent-one",
|
|
2803
|
+
{
|
|
2804
|
+
agentName: "agent-one",
|
|
2311
2805
|
logPath: "/fake/one/stdout.log",
|
|
2312
2806
|
stop: () => {
|
|
2313
|
-
stopped
|
|
2807
|
+
stopped.tailer1 = true;
|
|
2808
|
+
},
|
|
2809
|
+
},
|
|
2810
|
+
],
|
|
2811
|
+
[
|
|
2812
|
+
"agent-two",
|
|
2813
|
+
{
|
|
2814
|
+
agentName: "agent-two",
|
|
2815
|
+
logPath: "/fake/two/stdout.log",
|
|
2816
|
+
stop: () => {
|
|
2817
|
+
stopped.tailer2 = true;
|
|
2818
|
+
},
|
|
2819
|
+
},
|
|
2820
|
+
],
|
|
2821
|
+
]);
|
|
2822
|
+
|
|
2823
|
+
// Use a long interval so the periodic tick never fires during this test.
|
|
2824
|
+
const daemon = startDaemon({
|
|
2825
|
+
root: tempRoot,
|
|
2826
|
+
intervalMs: 60_000,
|
|
2827
|
+
...THRESHOLDS,
|
|
2828
|
+
_tmux: { isSessionAlive: async () => false, killSession: async () => {} },
|
|
2829
|
+
_nudge: async () => ({ delivered: false }),
|
|
2830
|
+
_process: { isAlive: () => false, killTree: async () => {} },
|
|
2831
|
+
_triage: async () => "extend",
|
|
2832
|
+
_recordFailure: async () => {},
|
|
2833
|
+
_getConnection: () => undefined,
|
|
2834
|
+
_removeConnection: () => {},
|
|
2835
|
+
_eventStore: null,
|
|
2836
|
+
_mailStore: null,
|
|
2837
|
+
_tailerRegistry: registry,
|
|
2838
|
+
_tailerFactory: () => ({ agentName: "", logPath: "", stop: () => {} }),
|
|
2839
|
+
_findLatestStdoutLog: async () => null,
|
|
2840
|
+
});
|
|
2841
|
+
|
|
2842
|
+
// Allow the first (immediate) tick to settle.
|
|
2843
|
+
await new Promise<void>((resolve) => setTimeout(resolve, 20));
|
|
2844
|
+
|
|
2845
|
+
daemon.stop();
|
|
2846
|
+
|
|
2847
|
+
expect(stopped.tailer1).toBe(true);
|
|
2848
|
+
expect(stopped.tailer2).toBe(true);
|
|
2849
|
+
expect(registry.size).toBe(0);
|
|
2850
|
+
});
|
|
2851
|
+
});
|
|
2852
|
+
|
|
2853
|
+
// ============================================================
|
|
2854
|
+
// RPC getState() timeout removes stale connection
|
|
2855
|
+
// ============================================================
|
|
2856
|
+
|
|
2857
|
+
describe("RPC getState() timeout removes stale connection", () => {
|
|
2858
|
+
test("_removeConnection is called when getState() rejects", async () => {
|
|
2859
|
+
const session = makeSession({
|
|
2860
|
+
agentName: "rpc-agent",
|
|
2861
|
+
tmuxSession: "", // headless
|
|
2862
|
+
pid: process.pid, // alive
|
|
2863
|
+
state: "working",
|
|
2864
|
+
lastActivity: new Date().toISOString(),
|
|
2865
|
+
});
|
|
2866
|
+
|
|
2867
|
+
writeSessionsToStore(tempRoot, [session]);
|
|
2868
|
+
|
|
2869
|
+
const removedNames: string[] = [];
|
|
2870
|
+
|
|
2871
|
+
await runDaemonTick({
|
|
2872
|
+
root: tempRoot,
|
|
2873
|
+
...THRESHOLDS,
|
|
2874
|
+
_tmux: { isSessionAlive: async () => false, killSession: async () => {} },
|
|
2875
|
+
_triage: triageAlways("extend"),
|
|
2876
|
+
_process: { isAlive: () => true, killTree: async () => {} },
|
|
2877
|
+
_eventStore: null,
|
|
2878
|
+
_recordFailure: async () => {},
|
|
2879
|
+
_getConnection: (name: string) => {
|
|
2880
|
+
if (name !== "rpc-agent") return undefined;
|
|
2881
|
+
return {
|
|
2882
|
+
getState: () => Promise.reject(new Error("connection error")),
|
|
2883
|
+
sendPrompt: async () => {},
|
|
2884
|
+
followUp: async () => {},
|
|
2885
|
+
abort: async () => {},
|
|
2886
|
+
close: () => {},
|
|
2887
|
+
};
|
|
2888
|
+
},
|
|
2889
|
+
_removeConnection: (name: string) => {
|
|
2890
|
+
removedNames.push(name);
|
|
2891
|
+
},
|
|
2892
|
+
_tailerRegistry: new Map(),
|
|
2893
|
+
_findLatestStdoutLog: async () => null,
|
|
2894
|
+
_mailStore: null,
|
|
2895
|
+
});
|
|
2896
|
+
|
|
2897
|
+
expect(removedNames).toContain("rpc-agent");
|
|
2898
|
+
});
|
|
2899
|
+
});
|
|
2900
|
+
|
|
2901
|
+
// ============================================================
|
|
2902
|
+
// Triage concurrency limit (_maxTriagePerTick)
|
|
2903
|
+
// ============================================================
|
|
2904
|
+
|
|
2905
|
+
describe("triage concurrency limit (_maxTriagePerTick)", () => {
|
|
2906
|
+
test("only _maxTriagePerTick triage calls happen when multiple sessions need level-2 escalation", async () => {
|
|
2907
|
+
const staleActivity = new Date(Date.now() - 60_000).toISOString();
|
|
2908
|
+
const stalledSince = new Date(Date.now() - 130_000).toISOString();
|
|
2909
|
+
|
|
2910
|
+
// 4 sessions all at escalation level 2
|
|
2911
|
+
const sessions: AgentSession[] = [
|
|
2912
|
+
makeSession({
|
|
2913
|
+
id: "s-1",
|
|
2914
|
+
agentName: "agent-1",
|
|
2915
|
+
tmuxSession: "ov-agent-1",
|
|
2916
|
+
state: "stalled",
|
|
2917
|
+
lastActivity: staleActivity,
|
|
2918
|
+
escalationLevel: 2,
|
|
2919
|
+
stalledSince,
|
|
2920
|
+
}),
|
|
2921
|
+
makeSession({
|
|
2922
|
+
id: "s-2",
|
|
2923
|
+
agentName: "agent-2",
|
|
2924
|
+
tmuxSession: "ov-agent-2",
|
|
2925
|
+
state: "stalled",
|
|
2926
|
+
lastActivity: staleActivity,
|
|
2927
|
+
escalationLevel: 2,
|
|
2928
|
+
stalledSince,
|
|
2929
|
+
}),
|
|
2930
|
+
makeSession({
|
|
2931
|
+
id: "s-3",
|
|
2932
|
+
agentName: "agent-3",
|
|
2933
|
+
tmuxSession: "ov-agent-3",
|
|
2934
|
+
state: "stalled",
|
|
2935
|
+
lastActivity: staleActivity,
|
|
2936
|
+
escalationLevel: 2,
|
|
2937
|
+
stalledSince,
|
|
2938
|
+
}),
|
|
2939
|
+
makeSession({
|
|
2940
|
+
id: "s-4",
|
|
2941
|
+
agentName: "agent-4",
|
|
2942
|
+
tmuxSession: "ov-agent-4",
|
|
2943
|
+
state: "stalled",
|
|
2944
|
+
lastActivity: staleActivity,
|
|
2945
|
+
escalationLevel: 2,
|
|
2946
|
+
stalledSince,
|
|
2947
|
+
}),
|
|
2948
|
+
];
|
|
2949
|
+
|
|
2950
|
+
writeSessionsToStore(tempRoot, sessions);
|
|
2951
|
+
|
|
2952
|
+
let triageCallCount = 0;
|
|
2953
|
+
const triageMock = async (_opts: { agentName: string; root: string; lastActivity: string }) => {
|
|
2954
|
+
triageCallCount++;
|
|
2955
|
+
return "extend" as const;
|
|
2956
|
+
};
|
|
2957
|
+
|
|
2958
|
+
await runDaemonTick({
|
|
2959
|
+
root: tempRoot,
|
|
2960
|
+
...THRESHOLDS,
|
|
2961
|
+
nudgeIntervalMs: 60_000,
|
|
2962
|
+
tier1Enabled: true,
|
|
2963
|
+
_maxTriagePerTick: 2,
|
|
2964
|
+
_tmux: tmuxWithLiveness({
|
|
2965
|
+
"ov-agent-1": true,
|
|
2966
|
+
"ov-agent-2": true,
|
|
2967
|
+
"ov-agent-3": true,
|
|
2968
|
+
"ov-agent-4": true,
|
|
2969
|
+
}),
|
|
2970
|
+
_triage: triageMock,
|
|
2971
|
+
_nudge: nudgeTracker().nudge,
|
|
2972
|
+
_eventStore: null,
|
|
2973
|
+
_recordFailure: async () => {},
|
|
2974
|
+
_getConnection: () => undefined,
|
|
2975
|
+
_removeConnection: () => {},
|
|
2976
|
+
_tailerRegistry: new Map(),
|
|
2977
|
+
_findLatestStdoutLog: async () => null,
|
|
2978
|
+
_mailStore: null,
|
|
2979
|
+
});
|
|
2980
|
+
|
|
2981
|
+
// Only 2 of the 4 sessions should have triggered triage
|
|
2982
|
+
expect(triageCallCount).toBe(2);
|
|
2983
|
+
});
|
|
2984
|
+
});
|
|
2985
|
+
|
|
2986
|
+
// ============================================================
|
|
2987
|
+
// RuntimeConnection-aware kill and liveness (overstory-32cd)
|
|
2988
|
+
// ============================================================
|
|
2989
|
+
|
|
2990
|
+
describe("killAgent uses RuntimeConnection.abort() when available", () => {
|
|
2991
|
+
const deadPid = 999999;
|
|
2992
|
+
|
|
2993
|
+
function connProcessTracker(): {
|
|
2994
|
+
isAlive: (pid: number) => boolean;
|
|
2995
|
+
killTree: (pid: number) => Promise<void>;
|
|
2996
|
+
killed: number[];
|
|
2997
|
+
} {
|
|
2998
|
+
const killed: number[] = [];
|
|
2999
|
+
return {
|
|
3000
|
+
isAlive: (pid: number) => {
|
|
3001
|
+
try {
|
|
3002
|
+
process.kill(pid, 0);
|
|
3003
|
+
return true;
|
|
3004
|
+
} catch {
|
|
3005
|
+
return false;
|
|
3006
|
+
}
|
|
3007
|
+
},
|
|
3008
|
+
killTree: async (pid: number) => {
|
|
3009
|
+
killed.push(pid);
|
|
3010
|
+
},
|
|
3011
|
+
killed,
|
|
3012
|
+
};
|
|
3013
|
+
}
|
|
3014
|
+
|
|
3015
|
+
// Test A: killAgent uses connection.abort() when a connection is registered
|
|
3016
|
+
test("Test A: abort() called for ZFC-terminated headless agent with registered connection", async () => {
|
|
3017
|
+
const session = makeSession({
|
|
3018
|
+
agentName: "headless-conn-agent",
|
|
3019
|
+
tmuxSession: "", // headless
|
|
3020
|
+
pid: deadPid, // dead PID → ZFC fires (pidAlive=false)
|
|
3021
|
+
state: "working",
|
|
3022
|
+
lastActivity: new Date().toISOString(),
|
|
3023
|
+
});
|
|
3024
|
+
|
|
3025
|
+
writeSessionsToStore(tempRoot, [session]);
|
|
3026
|
+
|
|
3027
|
+
let abortCount = 0;
|
|
3028
|
+
const removedNames: string[] = [];
|
|
3029
|
+
const proc = connProcessTracker();
|
|
3030
|
+
const tmuxMock = tmuxWithLiveness({ "": true });
|
|
3031
|
+
|
|
3032
|
+
await runDaemonTick({
|
|
3033
|
+
root: tempRoot,
|
|
3034
|
+
...THRESHOLDS,
|
|
3035
|
+
_tmux: tmuxMock,
|
|
3036
|
+
_triage: triageAlways("extend"),
|
|
3037
|
+
_process: proc,
|
|
3038
|
+
_eventStore: null,
|
|
3039
|
+
_recordFailure: async () => {},
|
|
3040
|
+
_getConnection: (name: string) => {
|
|
3041
|
+
if (name !== "headless-conn-agent") return undefined;
|
|
3042
|
+
return {
|
|
3043
|
+
getState: async () => ({ status: "working" as const }),
|
|
3044
|
+
sendPrompt: async () => {},
|
|
3045
|
+
followUp: async () => {},
|
|
3046
|
+
abort: async () => {
|
|
3047
|
+
abortCount++;
|
|
2314
3048
|
},
|
|
2315
|
-
|
|
2316
|
-
|
|
2317
|
-
|
|
2318
|
-
|
|
2319
|
-
|
|
2320
|
-
|
|
2321
|
-
|
|
2322
|
-
|
|
2323
|
-
|
|
3049
|
+
close: () => {},
|
|
3050
|
+
};
|
|
3051
|
+
},
|
|
3052
|
+
_removeConnection: (name: string) => {
|
|
3053
|
+
removedNames.push(name);
|
|
3054
|
+
},
|
|
3055
|
+
_tailerRegistry: new Map(),
|
|
3056
|
+
_findLatestStdoutLog: async () => null,
|
|
3057
|
+
_mailStore: null,
|
|
3058
|
+
});
|
|
3059
|
+
|
|
3060
|
+
// abort() called exactly once
|
|
3061
|
+
expect(abortCount).toBe(1);
|
|
3062
|
+
// killTree NOT called (abort succeeded)
|
|
3063
|
+
expect(proc.killed).toHaveLength(0);
|
|
3064
|
+
// removeConnection called for the agent
|
|
3065
|
+
expect(removedNames).toContain("headless-conn-agent");
|
|
3066
|
+
});
|
|
3067
|
+
|
|
3068
|
+
// Test B: killAgent falls back to killTree when conn.abort() throws
|
|
3069
|
+
test("Test B: killTree called as fallback when abort() throws", async () => {
|
|
3070
|
+
const session = makeSession({
|
|
3071
|
+
agentName: "headless-abort-fail",
|
|
3072
|
+
tmuxSession: "",
|
|
3073
|
+
pid: deadPid,
|
|
3074
|
+
state: "working",
|
|
3075
|
+
lastActivity: new Date().toISOString(),
|
|
3076
|
+
});
|
|
3077
|
+
|
|
3078
|
+
writeSessionsToStore(tempRoot, [session]);
|
|
3079
|
+
|
|
3080
|
+
let abortCalled = false;
|
|
3081
|
+
const removedNames: string[] = [];
|
|
3082
|
+
const proc = connProcessTracker();
|
|
3083
|
+
const tmuxMock = tmuxWithLiveness({ "": true });
|
|
3084
|
+
|
|
3085
|
+
await runDaemonTick({
|
|
3086
|
+
root: tempRoot,
|
|
3087
|
+
...THRESHOLDS,
|
|
3088
|
+
_tmux: tmuxMock,
|
|
3089
|
+
_triage: triageAlways("extend"),
|
|
3090
|
+
_process: proc,
|
|
3091
|
+
_eventStore: null,
|
|
3092
|
+
_recordFailure: async () => {},
|
|
3093
|
+
_getConnection: (name: string) => {
|
|
3094
|
+
if (name !== "headless-abort-fail") return undefined;
|
|
3095
|
+
return {
|
|
3096
|
+
getState: async () => ({ status: "working" as const }),
|
|
3097
|
+
sendPrompt: async () => {},
|
|
3098
|
+
followUp: async () => {},
|
|
3099
|
+
abort: async () => {
|
|
3100
|
+
abortCalled = true;
|
|
3101
|
+
throw new Error("process already dead");
|
|
2324
3102
|
},
|
|
2325
|
-
|
|
2326
|
-
|
|
2327
|
-
|
|
3103
|
+
close: () => {},
|
|
3104
|
+
};
|
|
3105
|
+
},
|
|
3106
|
+
_removeConnection: (name: string) => {
|
|
3107
|
+
removedNames.push(name);
|
|
3108
|
+
},
|
|
3109
|
+
_tailerRegistry: new Map(),
|
|
3110
|
+
_findLatestStdoutLog: async () => null,
|
|
3111
|
+
_mailStore: null,
|
|
3112
|
+
});
|
|
3113
|
+
|
|
3114
|
+
// abort() was attempted
|
|
3115
|
+
expect(abortCalled).toBe(true);
|
|
3116
|
+
// killTree called as defense-in-depth fallback
|
|
3117
|
+
expect(proc.killed).toContain(deadPid);
|
|
3118
|
+
// removeConnection still called (before fallback)
|
|
3119
|
+
expect(removedNames).toContain("headless-abort-fail");
|
|
3120
|
+
});
|
|
3121
|
+
|
|
3122
|
+
// Test C: killAgent uses conn.abort() for triage-terminate path (level 2 → terminate)
|
|
3123
|
+
test("Test C: abort() called in triage-terminate path (level 2 → terminate verdict)", async () => {
|
|
3124
|
+
const nudgeIntervalMs = 60_000;
|
|
3125
|
+
// stalledSince 2.5 intervals ago → expectedLevel = floor(2.5) = 2 → triage fires
|
|
3126
|
+
const stalledSince = new Date(Date.now() - 2.5 * nudgeIntervalMs).toISOString();
|
|
3127
|
+
// staleActivity: 2x staleThreshold (60s) — stale but not zombie, so escalate fires
|
|
3128
|
+
const staleActivity = new Date(Date.now() - THRESHOLDS.staleThresholdMs * 2).toISOString();
|
|
3129
|
+
|
|
3130
|
+
const session = makeSession({
|
|
3131
|
+
agentName: "headless-triage-conn",
|
|
3132
|
+
tmuxSession: "",
|
|
3133
|
+
pid: process.pid, // alive — ZFC won't fire; escalation path triggers triage
|
|
3134
|
+
state: "stalled",
|
|
3135
|
+
lastActivity: staleActivity,
|
|
3136
|
+
escalationLevel: 1,
|
|
3137
|
+
stalledSince,
|
|
3138
|
+
});
|
|
3139
|
+
|
|
3140
|
+
writeSessionsToStore(tempRoot, [session]);
|
|
3141
|
+
|
|
3142
|
+
let abortCount = 0;
|
|
3143
|
+
const removedNames: string[] = [];
|
|
3144
|
+
const proc = connProcessTracker();
|
|
3145
|
+
const tmuxMock = tmuxWithLiveness({ "": true });
|
|
3146
|
+
|
|
3147
|
+
await runDaemonTick({
|
|
3148
|
+
root: tempRoot,
|
|
3149
|
+
...THRESHOLDS,
|
|
3150
|
+
nudgeIntervalMs,
|
|
3151
|
+
tier1Enabled: true,
|
|
3152
|
+
_tmux: tmuxMock,
|
|
3153
|
+
_triage: triageAlways("terminate"),
|
|
3154
|
+
_nudge: nudgeTracker().nudge,
|
|
3155
|
+
_process: proc,
|
|
3156
|
+
_eventStore: null,
|
|
3157
|
+
_recordFailure: async () => {},
|
|
3158
|
+
// getState returns "error" so lastActivity is NOT refreshed — stale condition preserved
|
|
3159
|
+
_getConnection: (name: string) => {
|
|
3160
|
+
if (name !== "headless-triage-conn") return undefined;
|
|
3161
|
+
return {
|
|
3162
|
+
getState: async () => ({ status: "error" as const }),
|
|
3163
|
+
sendPrompt: async () => {},
|
|
3164
|
+
followUp: async () => {},
|
|
3165
|
+
abort: async () => {
|
|
3166
|
+
abortCount++;
|
|
3167
|
+
},
|
|
3168
|
+
close: () => {},
|
|
3169
|
+
};
|
|
3170
|
+
},
|
|
3171
|
+
_removeConnection: (name: string) => {
|
|
3172
|
+
removedNames.push(name);
|
|
3173
|
+
},
|
|
3174
|
+
_tailerRegistry: new Map(),
|
|
3175
|
+
_findLatestStdoutLog: async () => null,
|
|
3176
|
+
_mailStore: null,
|
|
3177
|
+
});
|
|
3178
|
+
|
|
3179
|
+
// abort() called via triage-terminate → killAgent path
|
|
3180
|
+
expect(abortCount).toBe(1);
|
|
3181
|
+
// killTree NOT called (abort succeeded)
|
|
3182
|
+
expect(proc.killed).toHaveLength(0);
|
|
3183
|
+
// tmux killSession NOT called (headless path only)
|
|
3184
|
+
expect(tmuxMock.killed).toHaveLength(0);
|
|
3185
|
+
});
|
|
3186
|
+
|
|
3187
|
+
// Test D: integration — watchdog terminates a hung headless agent without touching tmux
|
|
3188
|
+
test("Test D: conn.abort() called, tmux.killSession and killTree NEVER called, state → zombie", async () => {
|
|
3189
|
+
const session = makeSession({
|
|
3190
|
+
agentName: "headless-zombie-conn",
|
|
3191
|
+
tmuxSession: "",
|
|
3192
|
+
pid: deadPid, // dead PID → ZFC fires
|
|
3193
|
+
state: "working",
|
|
3194
|
+
lastActivity: new Date(Date.now() - THRESHOLDS.zombieThresholdMs * 2).toISOString(),
|
|
3195
|
+
});
|
|
3196
|
+
|
|
3197
|
+
writeSessionsToStore(tempRoot, [session]);
|
|
3198
|
+
|
|
3199
|
+
let abortCount = 0;
|
|
3200
|
+
const proc = connProcessTracker();
|
|
3201
|
+
const tmuxMock = tmuxWithLiveness({ "": true });
|
|
3202
|
+
|
|
3203
|
+
await runDaemonTick({
|
|
3204
|
+
root: tempRoot,
|
|
3205
|
+
...THRESHOLDS,
|
|
3206
|
+
_tmux: tmuxMock,
|
|
3207
|
+
_triage: triageAlways("extend"),
|
|
3208
|
+
_process: proc,
|
|
3209
|
+
_eventStore: null,
|
|
3210
|
+
_recordFailure: async () => {},
|
|
3211
|
+
_getConnection: (name: string) => {
|
|
3212
|
+
if (name !== "headless-zombie-conn") return undefined;
|
|
3213
|
+
return {
|
|
3214
|
+
getState: async () => ({ status: "working" as const }),
|
|
3215
|
+
sendPrompt: async () => {},
|
|
3216
|
+
followUp: async () => {},
|
|
3217
|
+
abort: async () => {
|
|
3218
|
+
abortCount++;
|
|
3219
|
+
},
|
|
3220
|
+
close: () => {},
|
|
3221
|
+
};
|
|
3222
|
+
},
|
|
3223
|
+
_removeConnection: () => {},
|
|
3224
|
+
_tailerRegistry: new Map(),
|
|
3225
|
+
_findLatestStdoutLog: async () => null,
|
|
3226
|
+
_mailStore: null,
|
|
3227
|
+
});
|
|
3228
|
+
|
|
3229
|
+
// abort() called
|
|
3230
|
+
expect(abortCount).toBe(1);
|
|
3231
|
+
// tmux.killSession NEVER called
|
|
3232
|
+
expect(tmuxMock.killed).toHaveLength(0);
|
|
3233
|
+
// killTree NEVER called (abort succeeded)
|
|
3234
|
+
expect(proc.killed).toHaveLength(0);
|
|
3235
|
+
// Agent state transitioned to zombie
|
|
3236
|
+
const reloaded = readSessionsFromStore(tempRoot);
|
|
3237
|
+
expect(reloaded[0]?.state).toBe("zombie");
|
|
3238
|
+
});
|
|
3239
|
+
|
|
3240
|
+
// Test E: liveness — getState() returning error status drives the agent toward zombie
|
|
3241
|
+
test("Test E: getState()=error + dead PID → tmuxAlive=false, state=zombie, terminate, abort called", async () => {
|
|
3242
|
+
const session = makeSession({
|
|
3243
|
+
agentName: "headless-error-conn",
|
|
3244
|
+
tmuxSession: "",
|
|
3245
|
+
pid: deadPid, // dead → ZFC fires: pidAlive=false
|
|
3246
|
+
state: "working",
|
|
3247
|
+
lastActivity: new Date().toISOString(), // fresh — time-based won't fire; ZFC does
|
|
3248
|
+
});
|
|
3249
|
+
|
|
3250
|
+
writeSessionsToStore(tempRoot, [session]);
|
|
3251
|
+
|
|
3252
|
+
let abortCount = 0;
|
|
3253
|
+
const proc = connProcessTracker();
|
|
3254
|
+
const checks: HealthCheck[] = [];
|
|
3255
|
+
const tmuxMock = tmuxWithLiveness({ "": true });
|
|
2328
3256
|
|
|
2329
|
-
|
|
2330
|
-
const daemon = startDaemon({
|
|
3257
|
+
await runDaemonTick({
|
|
2331
3258
|
root: tempRoot,
|
|
2332
|
-
intervalMs: 60_000,
|
|
2333
3259
|
...THRESHOLDS,
|
|
2334
|
-
|
|
2335
|
-
|
|
2336
|
-
|
|
2337
|
-
|
|
3260
|
+
onHealthCheck: (c) => checks.push(c),
|
|
3261
|
+
_tmux: tmuxMock,
|
|
3262
|
+
_triage: triageAlways("extend"),
|
|
3263
|
+
_process: proc,
|
|
3264
|
+
_eventStore: null,
|
|
2338
3265
|
_recordFailure: async () => {},
|
|
2339
|
-
_getConnection: () =>
|
|
3266
|
+
_getConnection: (name: string) => {
|
|
3267
|
+
if (name !== "headless-error-conn") return undefined;
|
|
3268
|
+
return {
|
|
3269
|
+
getState: async () => ({ status: "error" as const }),
|
|
3270
|
+
sendPrompt: async () => {},
|
|
3271
|
+
followUp: async () => {},
|
|
3272
|
+
abort: async () => {
|
|
3273
|
+
abortCount++;
|
|
3274
|
+
},
|
|
3275
|
+
close: () => {},
|
|
3276
|
+
};
|
|
3277
|
+
},
|
|
2340
3278
|
_removeConnection: () => {},
|
|
2341
|
-
|
|
2342
|
-
_mailStore: null,
|
|
2343
|
-
_tailerRegistry: registry,
|
|
2344
|
-
_tailerFactory: () => ({ agentName: "", logPath: "", stop: () => {} }),
|
|
3279
|
+
_tailerRegistry: new Map(),
|
|
2345
3280
|
_findLatestStdoutLog: async () => null,
|
|
3281
|
+
_mailStore: null,
|
|
2346
3282
|
});
|
|
2347
3283
|
|
|
2348
|
-
//
|
|
2349
|
-
|
|
2350
|
-
|
|
2351
|
-
|
|
2352
|
-
|
|
2353
|
-
expect(
|
|
2354
|
-
expect(
|
|
2355
|
-
|
|
3284
|
+
// Health check produced
|
|
3285
|
+
expect(checks).toHaveLength(1);
|
|
3286
|
+
// tmuxAlive=false because getState returned "error"
|
|
3287
|
+
expect(checks[0]?.tmuxAlive).toBe(false);
|
|
3288
|
+
// ZFC fires (pidAlive=false for dead PID) → zombie/terminate
|
|
3289
|
+
expect(checks[0]?.state).toBe("zombie");
|
|
3290
|
+
expect(checks[0]?.action).toBe("terminate");
|
|
3291
|
+
// abort() called via killAgent
|
|
3292
|
+
expect(abortCount).toBe(1);
|
|
3293
|
+
// killTree NOT called (abort succeeded)
|
|
3294
|
+
expect(proc.killed).toHaveLength(0);
|
|
2356
3295
|
});
|
|
2357
|
-
});
|
|
2358
|
-
|
|
2359
|
-
// ============================================================
|
|
2360
|
-
// RPC getState() timeout removes stale connection
|
|
2361
|
-
// ============================================================
|
|
2362
3296
|
|
|
2363
|
-
|
|
2364
|
-
test("
|
|
3297
|
+
// Test F: connection.getState() rejection drops the connection and falls back to tmux
|
|
3298
|
+
test("Test F: getState() rejection → removeConnection called, tmux liveness used as fallback", async () => {
|
|
2365
3299
|
const session = makeSession({
|
|
2366
|
-
agentName: "
|
|
2367
|
-
tmuxSession: "",
|
|
3300
|
+
agentName: "headless-reject-conn",
|
|
3301
|
+
tmuxSession: "",
|
|
2368
3302
|
pid: process.pid, // alive
|
|
2369
3303
|
state: "working",
|
|
2370
|
-
lastActivity: new Date().toISOString(),
|
|
3304
|
+
lastActivity: new Date().toISOString(), // fresh — no stale
|
|
2371
3305
|
});
|
|
2372
3306
|
|
|
2373
3307
|
writeSessionsToStore(tempRoot, [session]);
|
|
2374
3308
|
|
|
2375
3309
|
const removedNames: string[] = [];
|
|
3310
|
+
const checks: HealthCheck[] = [];
|
|
3311
|
+
// tmux returns alive — used as fallback when getState rejects
|
|
3312
|
+
const tmuxMock = tmuxWithLiveness({ "": true });
|
|
2376
3313
|
|
|
2377
3314
|
await runDaemonTick({
|
|
2378
3315
|
root: tempRoot,
|
|
2379
3316
|
...THRESHOLDS,
|
|
2380
|
-
|
|
3317
|
+
onHealthCheck: (c) => checks.push(c),
|
|
3318
|
+
_tmux: tmuxMock,
|
|
2381
3319
|
_triage: triageAlways("extend"),
|
|
2382
3320
|
_process: { isAlive: () => true, killTree: async () => {} },
|
|
2383
3321
|
_eventStore: null,
|
|
2384
3322
|
_recordFailure: async () => {},
|
|
2385
3323
|
_getConnection: (name: string) => {
|
|
2386
|
-
if (name !== "
|
|
3324
|
+
if (name !== "headless-reject-conn") return undefined;
|
|
2387
3325
|
return {
|
|
2388
3326
|
getState: () => Promise.reject(new Error("connection error")),
|
|
2389
3327
|
sendPrompt: async () => {},
|
|
@@ -2400,91 +3338,262 @@ describe("RPC getState() timeout removes stale connection", () => {
|
|
|
2400
3338
|
_mailStore: null,
|
|
2401
3339
|
});
|
|
2402
3340
|
|
|
2403
|
-
|
|
3341
|
+
// removeConnection called (connection dropped after rejection)
|
|
3342
|
+
expect(removedNames).toContain("headless-reject-conn");
|
|
3343
|
+
// Agent is healthy (alive PID, fresh lastActivity, tmux fallback returns alive)
|
|
3344
|
+
expect(checks).toHaveLength(1);
|
|
3345
|
+
expect(checks[0]?.action).toBe("none");
|
|
2404
3346
|
});
|
|
2405
3347
|
});
|
|
2406
3348
|
|
|
2407
3349
|
// ============================================================
|
|
2408
|
-
//
|
|
3350
|
+
// worker_died notification (overstory-c111)
|
|
2409
3351
|
// ============================================================
|
|
2410
3352
|
|
|
2411
|
-
describe("
|
|
2412
|
-
|
|
2413
|
-
const staleActivity = new Date(Date.now() - 60_000).toISOString();
|
|
2414
|
-
const stalledSince = new Date(Date.now() - 130_000).toISOString();
|
|
3353
|
+
describe("worker_died parent notification", () => {
|
|
3354
|
+
let tempRoot: string;
|
|
2415
3355
|
|
|
2416
|
-
|
|
2417
|
-
|
|
2418
|
-
|
|
2419
|
-
id: "s-1",
|
|
2420
|
-
agentName: "agent-1",
|
|
2421
|
-
tmuxSession: "ov-agent-1",
|
|
2422
|
-
state: "stalled",
|
|
2423
|
-
lastActivity: staleActivity,
|
|
2424
|
-
escalationLevel: 2,
|
|
2425
|
-
stalledSince,
|
|
2426
|
-
}),
|
|
2427
|
-
makeSession({
|
|
2428
|
-
id: "s-2",
|
|
2429
|
-
agentName: "agent-2",
|
|
2430
|
-
tmuxSession: "ov-agent-2",
|
|
2431
|
-
state: "stalled",
|
|
2432
|
-
lastActivity: staleActivity,
|
|
2433
|
-
escalationLevel: 2,
|
|
2434
|
-
stalledSince,
|
|
2435
|
-
}),
|
|
2436
|
-
makeSession({
|
|
2437
|
-
id: "s-3",
|
|
2438
|
-
agentName: "agent-3",
|
|
2439
|
-
tmuxSession: "ov-agent-3",
|
|
2440
|
-
state: "stalled",
|
|
2441
|
-
lastActivity: staleActivity,
|
|
2442
|
-
escalationLevel: 2,
|
|
2443
|
-
stalledSince,
|
|
2444
|
-
}),
|
|
2445
|
-
makeSession({
|
|
2446
|
-
id: "s-4",
|
|
2447
|
-
agentName: "agent-4",
|
|
2448
|
-
tmuxSession: "ov-agent-4",
|
|
2449
|
-
state: "stalled",
|
|
2450
|
-
lastActivity: staleActivity,
|
|
2451
|
-
escalationLevel: 2,
|
|
2452
|
-
stalledSince,
|
|
2453
|
-
}),
|
|
2454
|
-
];
|
|
3356
|
+
beforeEach(async () => {
|
|
3357
|
+
tempRoot = await createTempRoot();
|
|
3358
|
+
});
|
|
2455
3359
|
|
|
2456
|
-
|
|
3360
|
+
afterEach(async () => {
|
|
3361
|
+
await cleanupTempDir(tempRoot);
|
|
3362
|
+
});
|
|
2457
3363
|
|
|
2458
|
-
|
|
2459
|
-
const
|
|
2460
|
-
|
|
2461
|
-
|
|
2462
|
-
|
|
3364
|
+
test("terminate path sends worker_died mail to parentAgent on first zombify", async () => {
|
|
3365
|
+
const session = makeSession({
|
|
3366
|
+
agentName: "dead-builder",
|
|
3367
|
+
capability: "builder",
|
|
3368
|
+
parentAgent: "lead-1",
|
|
3369
|
+
tmuxSession: "overstory-dead-builder",
|
|
3370
|
+
state: "working",
|
|
3371
|
+
lastActivity: new Date().toISOString(),
|
|
3372
|
+
});
|
|
2463
3373
|
|
|
2464
|
-
|
|
2465
|
-
|
|
2466
|
-
|
|
2467
|
-
|
|
2468
|
-
|
|
2469
|
-
|
|
2470
|
-
|
|
2471
|
-
|
|
2472
|
-
|
|
2473
|
-
"
|
|
2474
|
-
"
|
|
2475
|
-
|
|
2476
|
-
|
|
2477
|
-
|
|
2478
|
-
|
|
2479
|
-
|
|
2480
|
-
|
|
2481
|
-
|
|
2482
|
-
|
|
2483
|
-
|
|
2484
|
-
|
|
3374
|
+
writeSessionsToStore(tempRoot, [session]);
|
|
3375
|
+
|
|
3376
|
+
const mailDb = join(tempRoot, ".overstory", "mail.db");
|
|
3377
|
+
const mailStore = createMailStore(mailDb);
|
|
3378
|
+
|
|
3379
|
+
try {
|
|
3380
|
+
await runDaemonTick({
|
|
3381
|
+
root: tempRoot,
|
|
3382
|
+
...THRESHOLDS,
|
|
3383
|
+
_tmux: tmuxWithLiveness({ "overstory-dead-builder": false }),
|
|
3384
|
+
_triage: triageAlways("extend"),
|
|
3385
|
+
_recordFailure: async () => {},
|
|
3386
|
+
_mailStore: mailStore,
|
|
3387
|
+
});
|
|
3388
|
+
|
|
3389
|
+
const inbox = mailStore.getUnread("lead-1");
|
|
3390
|
+
expect(inbox).toHaveLength(1);
|
|
3391
|
+
const msg = inbox[0];
|
|
3392
|
+
expect(msg).toBeDefined();
|
|
3393
|
+
if (!msg) return;
|
|
3394
|
+
expect(msg.type).toBe("worker_died");
|
|
3395
|
+
expect(msg.from).toBe("dead-builder");
|
|
3396
|
+
expect(msg.to).toBe("lead-1");
|
|
3397
|
+
expect(msg.priority).toBe("high");
|
|
3398
|
+
expect(msg.payload).not.toBeNull();
|
|
3399
|
+
const payload = JSON.parse(msg.payload ?? "{}") as WorkerDiedPayload;
|
|
3400
|
+
expect(payload.agentName).toBe("dead-builder");
|
|
3401
|
+
expect(payload.capability).toBe("builder");
|
|
3402
|
+
expect(payload.terminatedBy).toBe("tier0");
|
|
3403
|
+
expect(payload.reason).toBeTruthy();
|
|
3404
|
+
} finally {
|
|
3405
|
+
mailStore.close();
|
|
3406
|
+
}
|
|
3407
|
+
});
|
|
3408
|
+
|
|
3409
|
+
test("orphan agent (parentAgent=null) receives no notification", async () => {
|
|
3410
|
+
const session = makeSession({
|
|
3411
|
+
agentName: "orphan-agent",
|
|
3412
|
+
parentAgent: null,
|
|
3413
|
+
tmuxSession: "overstory-orphan-agent",
|
|
3414
|
+
state: "working",
|
|
3415
|
+
lastActivity: new Date().toISOString(),
|
|
2485
3416
|
});
|
|
2486
3417
|
|
|
2487
|
-
|
|
2488
|
-
|
|
3418
|
+
writeSessionsToStore(tempRoot, [session]);
|
|
3419
|
+
|
|
3420
|
+
const mailDb = join(tempRoot, ".overstory", "mail.db");
|
|
3421
|
+
const mailStore = createMailStore(mailDb);
|
|
3422
|
+
|
|
3423
|
+
try {
|
|
3424
|
+
await runDaemonTick({
|
|
3425
|
+
root: tempRoot,
|
|
3426
|
+
...THRESHOLDS,
|
|
3427
|
+
_tmux: tmuxWithLiveness({ "overstory-orphan-agent": false }),
|
|
3428
|
+
_triage: triageAlways("extend"),
|
|
3429
|
+
_recordFailure: async () => {},
|
|
3430
|
+
_mailStore: mailStore,
|
|
3431
|
+
});
|
|
3432
|
+
|
|
3433
|
+
expect(mailStore.getAll({ type: "worker_died" })).toHaveLength(0);
|
|
3434
|
+
} finally {
|
|
3435
|
+
mailStore.close();
|
|
3436
|
+
}
|
|
3437
|
+
});
|
|
3438
|
+
|
|
3439
|
+
test("re-tick on already-zombie session does not send a second worker_died", async () => {
|
|
3440
|
+
// Subsequent ticks see the session already in `zombie`. The state matrix
|
|
3441
|
+
// rejects zombie → zombie transitions, so notify is gated on `outcome.ok`.
|
|
3442
|
+
const session = makeSession({
|
|
3443
|
+
agentName: "re-zombie-agent",
|
|
3444
|
+
parentAgent: "lead-2",
|
|
3445
|
+
tmuxSession: "overstory-re-zombie-agent",
|
|
3446
|
+
state: "working",
|
|
3447
|
+
lastActivity: new Date().toISOString(),
|
|
3448
|
+
});
|
|
3449
|
+
|
|
3450
|
+
writeSessionsToStore(tempRoot, [session]);
|
|
3451
|
+
|
|
3452
|
+
const mailDb = join(tempRoot, ".overstory", "mail.db");
|
|
3453
|
+
const mailStore = createMailStore(mailDb);
|
|
3454
|
+
|
|
3455
|
+
try {
|
|
3456
|
+
const tickOpts = {
|
|
3457
|
+
root: tempRoot,
|
|
3458
|
+
...THRESHOLDS,
|
|
3459
|
+
_tmux: tmuxWithLiveness({ "overstory-re-zombie-agent": false }),
|
|
3460
|
+
_triage: triageAlways("extend"),
|
|
3461
|
+
_recordFailure: async () => {},
|
|
3462
|
+
_mailStore: mailStore,
|
|
3463
|
+
};
|
|
3464
|
+
await runDaemonTick(tickOpts);
|
|
3465
|
+
await runDaemonTick(tickOpts);
|
|
3466
|
+
await runDaemonTick(tickOpts);
|
|
3467
|
+
|
|
3468
|
+
expect(mailStore.getAll({ to: "lead-2", type: "worker_died" })).toHaveLength(1);
|
|
3469
|
+
} finally {
|
|
3470
|
+
mailStore.close();
|
|
3471
|
+
}
|
|
3472
|
+
});
|
|
3473
|
+
|
|
3474
|
+
test("notifyParentOnDeath=false suppresses the synthetic mail", async () => {
|
|
3475
|
+
const session = makeSession({
|
|
3476
|
+
agentName: "opt-out-agent",
|
|
3477
|
+
parentAgent: "lead-3",
|
|
3478
|
+
tmuxSession: "overstory-opt-out-agent",
|
|
3479
|
+
state: "working",
|
|
3480
|
+
lastActivity: new Date().toISOString(),
|
|
3481
|
+
});
|
|
3482
|
+
|
|
3483
|
+
writeSessionsToStore(tempRoot, [session]);
|
|
3484
|
+
|
|
3485
|
+
const mailDb = join(tempRoot, ".overstory", "mail.db");
|
|
3486
|
+
const mailStore = createMailStore(mailDb);
|
|
3487
|
+
|
|
3488
|
+
try {
|
|
3489
|
+
await runDaemonTick({
|
|
3490
|
+
root: tempRoot,
|
|
3491
|
+
...THRESHOLDS,
|
|
3492
|
+
notifyParentOnDeath: false,
|
|
3493
|
+
_tmux: tmuxWithLiveness({ "overstory-opt-out-agent": false }),
|
|
3494
|
+
_triage: triageAlways("extend"),
|
|
3495
|
+
_recordFailure: async () => {},
|
|
3496
|
+
_mailStore: mailStore,
|
|
3497
|
+
});
|
|
3498
|
+
|
|
3499
|
+
expect(mailStore.getAll({ type: "worker_died" })).toHaveLength(0);
|
|
3500
|
+
// State should still transition normally
|
|
3501
|
+
const reloaded = readSessionsFromStore(tempRoot);
|
|
3502
|
+
expect(reloaded[0]?.state).toBe("zombie");
|
|
3503
|
+
} finally {
|
|
3504
|
+
mailStore.close();
|
|
3505
|
+
}
|
|
3506
|
+
});
|
|
3507
|
+
|
|
3508
|
+
test("escalation-level-3 terminate also notifies parent with tier0 reason", async () => {
|
|
3509
|
+
// Stalled agent with alive tmux: progressive escalation drives it to level 3
|
|
3510
|
+
// terminate. The notify path runs through the escalation branch, not the
|
|
3511
|
+
// `check.action === "terminate"` branch.
|
|
3512
|
+
const stalledSince = new Date(Date.now() - 4 * 60_000).toISOString();
|
|
3513
|
+
const lastActivity = new Date(Date.now() - 60_000).toISOString();
|
|
3514
|
+
const session = makeSession({
|
|
3515
|
+
agentName: "escalated-agent",
|
|
3516
|
+
parentAgent: "coordinator",
|
|
3517
|
+
tmuxSession: "overstory-escalated-agent",
|
|
3518
|
+
state: "working",
|
|
3519
|
+
lastActivity,
|
|
3520
|
+
stalledSince,
|
|
3521
|
+
escalationLevel: 3,
|
|
3522
|
+
});
|
|
3523
|
+
|
|
3524
|
+
writeSessionsToStore(tempRoot, [session]);
|
|
3525
|
+
|
|
3526
|
+
const mailDb = join(tempRoot, ".overstory", "mail.db");
|
|
3527
|
+
const mailStore = createMailStore(mailDb);
|
|
3528
|
+
|
|
3529
|
+
try {
|
|
3530
|
+
await runDaemonTick({
|
|
3531
|
+
root: tempRoot,
|
|
3532
|
+
...THRESHOLDS,
|
|
3533
|
+
nudgeIntervalMs: 60_000,
|
|
3534
|
+
_tmux: tmuxWithLiveness({ "overstory-escalated-agent": true }),
|
|
3535
|
+
_triage: triageAlways("extend"),
|
|
3536
|
+
_nudge: async () => ({ delivered: true }),
|
|
3537
|
+
_recordFailure: async () => {},
|
|
3538
|
+
_mailStore: mailStore,
|
|
3539
|
+
});
|
|
3540
|
+
|
|
3541
|
+
const inbox = mailStore.getUnread("coordinator");
|
|
3542
|
+
expect(inbox).toHaveLength(1);
|
|
3543
|
+
const msg = inbox[0];
|
|
3544
|
+
if (!msg) return;
|
|
3545
|
+
expect(msg.type).toBe("worker_died");
|
|
3546
|
+
const payload = JSON.parse(msg.payload ?? "{}") as WorkerDiedPayload;
|
|
3547
|
+
expect(payload.terminatedBy).toBe("tier0");
|
|
3548
|
+
expect(payload.reason).toContain("Progressive escalation");
|
|
3549
|
+
} finally {
|
|
3550
|
+
mailStore.close();
|
|
3551
|
+
}
|
|
3552
|
+
});
|
|
3553
|
+
|
|
3554
|
+
test("tier1 triage terminate sets terminatedBy=tier1 in payload", async () => {
|
|
3555
|
+
// stalledSince must produce expectedLevel==2 from nudgeIntervalMs=60_000:
|
|
3556
|
+
// floor(stalledMs / 60_000) === 2 requires 2*60_000 <= stalledMs < 3*60_000.
|
|
3557
|
+
const stalledSince = new Date(Date.now() - 150_000).toISOString();
|
|
3558
|
+
const lastActivity = new Date(Date.now() - 60_000).toISOString();
|
|
3559
|
+
const session = makeSession({
|
|
3560
|
+
agentName: "triaged-agent",
|
|
3561
|
+
parentAgent: "lead-triage",
|
|
3562
|
+
tmuxSession: "overstory-triaged-agent",
|
|
3563
|
+
state: "working",
|
|
3564
|
+
lastActivity,
|
|
3565
|
+
stalledSince,
|
|
3566
|
+
escalationLevel: 2,
|
|
3567
|
+
});
|
|
3568
|
+
|
|
3569
|
+
writeSessionsToStore(tempRoot, [session]);
|
|
3570
|
+
|
|
3571
|
+
const mailDb = join(tempRoot, ".overstory", "mail.db");
|
|
3572
|
+
const mailStore = createMailStore(mailDb);
|
|
3573
|
+
|
|
3574
|
+
try {
|
|
3575
|
+
await runDaemonTick({
|
|
3576
|
+
root: tempRoot,
|
|
3577
|
+
...THRESHOLDS,
|
|
3578
|
+
nudgeIntervalMs: 60_000,
|
|
3579
|
+
tier1Enabled: true,
|
|
3580
|
+
_tmux: tmuxWithLiveness({ "overstory-triaged-agent": true }),
|
|
3581
|
+
_triage: triageAlways("terminate"),
|
|
3582
|
+
_nudge: async () => ({ delivered: true }),
|
|
3583
|
+
_recordFailure: async () => {},
|
|
3584
|
+
_mailStore: mailStore,
|
|
3585
|
+
});
|
|
3586
|
+
|
|
3587
|
+
const inbox = mailStore.getUnread("lead-triage");
|
|
3588
|
+
expect(inbox).toHaveLength(1);
|
|
3589
|
+
const msg = inbox[0];
|
|
3590
|
+
if (!msg) return;
|
|
3591
|
+
expect(msg.type).toBe("worker_died");
|
|
3592
|
+
const payload = JSON.parse(msg.payload ?? "{}") as WorkerDiedPayload;
|
|
3593
|
+
expect(payload.terminatedBy).toBe("tier1");
|
|
3594
|
+
expect(payload.reason).toContain("AI triage");
|
|
3595
|
+
} finally {
|
|
3596
|
+
mailStore.close();
|
|
3597
|
+
}
|
|
2489
3598
|
});
|
|
2490
3599
|
});
|