@katyella/legio 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (219) hide show
  1. package/CHANGELOG.md +422 -0
  2. package/LICENSE +21 -0
  3. package/README.md +555 -0
  4. package/agents/builder.md +141 -0
  5. package/agents/coordinator.md +351 -0
  6. package/agents/cto.md +196 -0
  7. package/agents/gateway.md +276 -0
  8. package/agents/lead.md +281 -0
  9. package/agents/merger.md +156 -0
  10. package/agents/monitor.md +212 -0
  11. package/agents/reviewer.md +142 -0
  12. package/agents/scout.md +131 -0
  13. package/agents/supervisor.md +416 -0
  14. package/bin/legio.mjs +38 -0
  15. package/package.json +77 -0
  16. package/src/agents/checkpoint.test.ts +88 -0
  17. package/src/agents/checkpoint.ts +102 -0
  18. package/src/agents/hooks-deployer.test.ts +1820 -0
  19. package/src/agents/hooks-deployer.ts +574 -0
  20. package/src/agents/identity.test.ts +614 -0
  21. package/src/agents/identity.ts +385 -0
  22. package/src/agents/lifecycle.test.ts +202 -0
  23. package/src/agents/lifecycle.ts +184 -0
  24. package/src/agents/manifest.test.ts +558 -0
  25. package/src/agents/manifest.ts +297 -0
  26. package/src/agents/overlay.test.ts +592 -0
  27. package/src/agents/overlay.ts +316 -0
  28. package/src/beads/client.test.ts +210 -0
  29. package/src/beads/client.ts +227 -0
  30. package/src/beads/molecules.test.ts +320 -0
  31. package/src/beads/molecules.ts +209 -0
  32. package/src/commands/agents.test.ts +325 -0
  33. package/src/commands/agents.ts +286 -0
  34. package/src/commands/clean.test.ts +730 -0
  35. package/src/commands/clean.ts +653 -0
  36. package/src/commands/completions.test.ts +346 -0
  37. package/src/commands/completions.ts +950 -0
  38. package/src/commands/coordinator.test.ts +1524 -0
  39. package/src/commands/coordinator.ts +880 -0
  40. package/src/commands/costs.test.ts +1015 -0
  41. package/src/commands/costs.ts +473 -0
  42. package/src/commands/dashboard.test.ts +94 -0
  43. package/src/commands/dashboard.ts +607 -0
  44. package/src/commands/doctor.test.ts +295 -0
  45. package/src/commands/doctor.ts +213 -0
  46. package/src/commands/down.test.ts +308 -0
  47. package/src/commands/down.ts +124 -0
  48. package/src/commands/errors.test.ts +648 -0
  49. package/src/commands/errors.ts +255 -0
  50. package/src/commands/feed.test.ts +579 -0
  51. package/src/commands/feed.ts +368 -0
  52. package/src/commands/gateway.test.ts +698 -0
  53. package/src/commands/gateway.ts +419 -0
  54. package/src/commands/group.test.ts +262 -0
  55. package/src/commands/group.ts +539 -0
  56. package/src/commands/hooks.test.ts +292 -0
  57. package/src/commands/hooks.ts +210 -0
  58. package/src/commands/init.test.ts +211 -0
  59. package/src/commands/init.ts +622 -0
  60. package/src/commands/inspect.test.ts +670 -0
  61. package/src/commands/inspect.ts +455 -0
  62. package/src/commands/log.test.ts +1556 -0
  63. package/src/commands/log.ts +752 -0
  64. package/src/commands/logs.test.ts +379 -0
  65. package/src/commands/logs.ts +544 -0
  66. package/src/commands/mail.test.ts +1726 -0
  67. package/src/commands/mail.ts +926 -0
  68. package/src/commands/merge.test.ts +676 -0
  69. package/src/commands/merge.ts +374 -0
  70. package/src/commands/metrics.test.ts +444 -0
  71. package/src/commands/metrics.ts +150 -0
  72. package/src/commands/monitor.test.ts +151 -0
  73. package/src/commands/monitor.ts +394 -0
  74. package/src/commands/nudge.test.ts +230 -0
  75. package/src/commands/nudge.ts +373 -0
  76. package/src/commands/prime.test.ts +467 -0
  77. package/src/commands/prime.ts +386 -0
  78. package/src/commands/replay.test.ts +742 -0
  79. package/src/commands/replay.ts +367 -0
  80. package/src/commands/run.test.ts +443 -0
  81. package/src/commands/run.ts +365 -0
  82. package/src/commands/server.test.ts +626 -0
  83. package/src/commands/server.ts +298 -0
  84. package/src/commands/sling.test.ts +810 -0
  85. package/src/commands/sling.ts +700 -0
  86. package/src/commands/spec.test.ts +206 -0
  87. package/src/commands/spec.ts +171 -0
  88. package/src/commands/status.test.ts +276 -0
  89. package/src/commands/status.ts +339 -0
  90. package/src/commands/stop.test.ts +357 -0
  91. package/src/commands/stop.ts +119 -0
  92. package/src/commands/supervisor.test.ts +186 -0
  93. package/src/commands/supervisor.ts +544 -0
  94. package/src/commands/trace.test.ts +746 -0
  95. package/src/commands/trace.ts +332 -0
  96. package/src/commands/up.test.ts +597 -0
  97. package/src/commands/up.ts +275 -0
  98. package/src/commands/watch.test.ts +152 -0
  99. package/src/commands/watch.ts +238 -0
  100. package/src/commands/worktree.test.ts +648 -0
  101. package/src/commands/worktree.ts +266 -0
  102. package/src/config.test.ts +496 -0
  103. package/src/config.ts +616 -0
  104. package/src/doctor/agents.test.ts +448 -0
  105. package/src/doctor/agents.ts +396 -0
  106. package/src/doctor/config-check.test.ts +184 -0
  107. package/src/doctor/config-check.ts +185 -0
  108. package/src/doctor/consistency.test.ts +645 -0
  109. package/src/doctor/consistency.ts +294 -0
  110. package/src/doctor/databases.test.ts +284 -0
  111. package/src/doctor/databases.ts +211 -0
  112. package/src/doctor/dependencies.test.ts +150 -0
  113. package/src/doctor/dependencies.ts +179 -0
  114. package/src/doctor/logs.test.ts +244 -0
  115. package/src/doctor/logs.ts +295 -0
  116. package/src/doctor/merge-queue.test.ts +210 -0
  117. package/src/doctor/merge-queue.ts +144 -0
  118. package/src/doctor/structure.test.ts +285 -0
  119. package/src/doctor/structure.ts +195 -0
  120. package/src/doctor/types.ts +37 -0
  121. package/src/doctor/version.test.ts +130 -0
  122. package/src/doctor/version.ts +131 -0
  123. package/src/e2e/chat-flow.test.ts +346 -0
  124. package/src/e2e/init-sling-lifecycle.test.ts +288 -0
  125. package/src/errors.test.ts +21 -0
  126. package/src/errors.ts +246 -0
  127. package/src/events/store.test.ts +660 -0
  128. package/src/events/store.ts +344 -0
  129. package/src/events/tool-filter.test.ts +330 -0
  130. package/src/events/tool-filter.ts +126 -0
  131. package/src/global-setup.ts +14 -0
  132. package/src/index.ts +339 -0
  133. package/src/insights/analyzer.test.ts +466 -0
  134. package/src/insights/analyzer.ts +203 -0
  135. package/src/logging/color.test.ts +118 -0
  136. package/src/logging/color.ts +71 -0
  137. package/src/logging/logger.test.ts +812 -0
  138. package/src/logging/logger.ts +266 -0
  139. package/src/logging/reporter.test.ts +258 -0
  140. package/src/logging/reporter.ts +109 -0
  141. package/src/logging/sanitizer.test.ts +190 -0
  142. package/src/logging/sanitizer.ts +57 -0
  143. package/src/mail/broadcast.test.ts +203 -0
  144. package/src/mail/broadcast.ts +92 -0
  145. package/src/mail/client.test.ts +873 -0
  146. package/src/mail/client.ts +236 -0
  147. package/src/mail/store.test.ts +815 -0
  148. package/src/mail/store.ts +402 -0
  149. package/src/merge/queue.test.ts +449 -0
  150. package/src/merge/queue.ts +262 -0
  151. package/src/merge/resolver.test.ts +1453 -0
  152. package/src/merge/resolver.ts +759 -0
  153. package/src/metrics/store.test.ts +1167 -0
  154. package/src/metrics/store.ts +511 -0
  155. package/src/metrics/summary.test.ts +397 -0
  156. package/src/metrics/summary.ts +178 -0
  157. package/src/metrics/transcript.test.ts +643 -0
  158. package/src/metrics/transcript.ts +351 -0
  159. package/src/mulch/client.test.ts +547 -0
  160. package/src/mulch/client.ts +416 -0
  161. package/src/server/audit-store.test.ts +384 -0
  162. package/src/server/audit-store.ts +257 -0
  163. package/src/server/headless.test.ts +180 -0
  164. package/src/server/headless.ts +151 -0
  165. package/src/server/index.test.ts +241 -0
  166. package/src/server/index.ts +317 -0
  167. package/src/server/public/app.js +187 -0
  168. package/src/server/public/apple-touch-icon.png +0 -0
  169. package/src/server/public/components/agent-badge.js +37 -0
  170. package/src/server/public/components/data-table.js +114 -0
  171. package/src/server/public/components/gateway-chat.js +256 -0
  172. package/src/server/public/components/issue-card.js +96 -0
  173. package/src/server/public/components/layout.js +88 -0
  174. package/src/server/public/components/message-bubble.js +120 -0
  175. package/src/server/public/components/stat-card.js +26 -0
  176. package/src/server/public/components/terminal-panel.js +140 -0
  177. package/src/server/public/favicon-16.png +0 -0
  178. package/src/server/public/favicon-32.png +0 -0
  179. package/src/server/public/favicon.ico +0 -0
  180. package/src/server/public/favicon.png +0 -0
  181. package/src/server/public/index.html +64 -0
  182. package/src/server/public/lib/api.js +35 -0
  183. package/src/server/public/lib/markdown.js +8 -0
  184. package/src/server/public/lib/preact-setup.js +8 -0
  185. package/src/server/public/lib/state.js +99 -0
  186. package/src/server/public/lib/utils.js +309 -0
  187. package/src/server/public/lib/ws.js +79 -0
  188. package/src/server/public/views/chat.js +983 -0
  189. package/src/server/public/views/costs.js +692 -0
  190. package/src/server/public/views/dashboard.js +781 -0
  191. package/src/server/public/views/gateway-chat.js +622 -0
  192. package/src/server/public/views/inspect.js +399 -0
  193. package/src/server/public/views/issues.js +470 -0
  194. package/src/server/public/views/setup.js +94 -0
  195. package/src/server/public/views/task-detail.js +422 -0
  196. package/src/server/routes.test.ts +3816 -0
  197. package/src/server/routes.ts +1964 -0
  198. package/src/server/websocket.test.ts +288 -0
  199. package/src/server/websocket.ts +196 -0
  200. package/src/sessions/compat.test.ts +109 -0
  201. package/src/sessions/compat.ts +17 -0
  202. package/src/sessions/store.test.ts +969 -0
  203. package/src/sessions/store.ts +480 -0
  204. package/src/test-helpers.test.ts +97 -0
  205. package/src/test-helpers.ts +143 -0
  206. package/src/types.ts +708 -0
  207. package/src/watchdog/daemon.test.ts +1233 -0
  208. package/src/watchdog/daemon.ts +533 -0
  209. package/src/watchdog/health.test.ts +371 -0
  210. package/src/watchdog/health.ts +248 -0
  211. package/src/watchdog/triage.test.ts +162 -0
  212. package/src/watchdog/triage.ts +193 -0
  213. package/src/worktree/manager.test.ts +444 -0
  214. package/src/worktree/manager.ts +224 -0
  215. package/src/worktree/tmux.test.ts +1238 -0
  216. package/src/worktree/tmux.ts +644 -0
  217. package/templates/CLAUDE.md.tmpl +89 -0
  218. package/templates/hooks.json.tmpl +132 -0
  219. package/templates/overlay.md.tmpl +79 -0
@@ -0,0 +1,371 @@
1
+ import { describe, expect, test } from "vitest";
2
+ import type { AgentSession } from "../types.ts";
3
+ import { evaluateHealth, isProcessRunning, transitionState } from "./health.ts";
4
+
5
+ /**
6
+ * Tests for the ZFC-based health evaluation and state machine.
7
+ *
8
+ * evaluateHealth is a pure function that takes session state + tmux liveness +
9
+ * thresholds and returns a HealthCheck. No mocks needed for the core logic.
10
+ *
11
+ * isProcessRunning uses process.kill(pid, 0) which is safe to test with real PIDs:
12
+ * the current process PID (alive) and a known-dead PID (not alive).
13
+ *
14
+ * Note: evaluateHealth calls isProcessRunning internally. For tests that need
15
+ * to control pid liveness independently of the actual OS process table, we set
16
+ * session.pid to known-alive (current process) or known-dead PIDs.
17
+ */
18
+
19
/** Thresholds handed to every evaluateHealth call in this file (two minutes of inactivity). */
const THRESHOLDS = { zombieMs: 2 * 60 * 1000 };

/** A PID that is certainly alive while the tests run: the test process itself. */
const ALIVE_PID = process.pid;

/**
 * A PID assumed to be dead: the largest 32-bit signed integer.
 *
 * The "pid dead" tests rely on no process owning this PID; if one ever did,
 * those tests would report the PID as alive and fail.
 */
const DEAD_PID = 2 ** 31 - 1;
31
+
32
/**
 * Build an AgentSession fixture for the health tests.
 *
 * Defaults describe a freshly booted builder agent whose pid is the test
 * process and whose timestamps are "now"; any field can be replaced through
 * `overrides`.
 *
 * @param overrides - Fields that should differ from the defaults
 * @returns A complete AgentSession
 */
function makeSession(overrides: Partial<AgentSession> = {}): AgentSession {
  const defaults: AgentSession = {
    id: "session-test",
    agentName: "test-agent",
    capability: "builder",
    worktreePath: "/tmp/test",
    branchName: "legio/test-agent/test-task",
    beadId: "test-task",
    tmuxSession: "legio-test-agent",
    state: "booting",
    pid: ALIVE_PID,
    parentAgent: null,
    depth: 0,
    runId: null,
    startedAt: new Date().toISOString(),
    lastActivity: new Date().toISOString(),
    escalationLevel: 0,
    stalledSince: null,
  };

  return { ...defaults, ...overrides };
}
53
+
54
+ // === isProcessRunning ===
55
+
56
describe("isProcessRunning", () => {
  test("returns true for the current process PID", () => {
    // The test runner's own process is necessarily alive.
    const alive = isProcessRunning(process.pid);

    expect(alive).toBe(true);
  });

  test("returns false for a PID that does not exist", () => {
    // DEAD_PID is the max 32-bit signed integer and is assumed to be unused.
    const alive = isProcessRunning(DEAD_PID);

    expect(alive).toBe(false);
  });
});
66
+
67
+ // === evaluateHealth ===
68
+
69
describe("evaluateHealth", () => {
  /** ISO timestamp lying `ms` milliseconds in the past. */
  const isoAgo = (ms: number): string => new Date(Date.now() - ms).toISOString();

  /** Run evaluateHealth on a fixture session built from `overrides`, using the shared thresholds. */
  const evaluate = (overrides: Partial<AgentSession>, tmuxAlive: boolean) =>
    evaluateHealth(makeSession(overrides), tmuxAlive, THRESHOLDS);

  // --- ZFC Rule 1: tmux dead → zombie (observable state wins) ---

  test("ZFC: tmux dead + sessions.json says working → zombie with reconciliation note", () => {
    const result = evaluate({ state: "working" }, false);

    expect(result.state).toBe("zombie");
    expect(result.action).toBe("terminate");
    expect(result.tmuxAlive).toBe(false);
    expect(result.processAlive).toBe(false);
    for (const fragment of ["ZFC", "tmux dead", '"working"']) {
      expect(result.reconciliationNote).toContain(fragment);
    }
  });

  test("ZFC: tmux dead + sessions.json says booting → zombie with reconciliation note", () => {
    const result = evaluate({ state: "booting" }, false);

    expect(result.state).toBe("zombie");
    expect(result.action).toBe("terminate");
    for (const fragment of ["ZFC", '"booting"']) {
      expect(result.reconciliationNote).toContain(fragment);
    }
  });

  // --- ZFC Rule 2: tmux alive + sessions.json says zombie → investigate ---

  test("ZFC: tmux alive + sessions.json says zombie → investigate (don't auto-kill)", () => {
    const result = evaluate({ state: "zombie", pid: ALIVE_PID }, true);

    expect(result.state).toBe("zombie");
    expect(result.action).toBe("investigate");
    expect(result.processAlive).toBe(true);
    for (const fragment of ["ZFC", "investigation needed", "don't auto-kill"]) {
      expect(result.reconciliationNote).toContain(fragment);
    }
  });

  // --- ZFC Rule 3: pid dead + tmux alive → zombie ---

  test("ZFC: pid dead + tmux alive → zombie (agent process exited, shell survived)", () => {
    const result = evaluate({ state: "working", pid: DEAD_PID }, true);

    expect(result.state).toBe("zombie");
    expect(result.action).toBe("terminate");
    expect(result.processAlive).toBe(false);
    expect(result.pidAlive).toBe(false);
    expect(result.tmuxAlive).toBe(true);
    for (const fragment of ["ZFC", "pid", "shell survived"]) {
      expect(result.reconciliationNote).toContain(fragment);
    }
  });

  // --- pid null (unavailable) ---

  test("pid null does not trigger pid-based zombie detection", () => {
    const result = evaluate({ state: "working", pid: null }, true);

    expect(result.state).toBe("working");
    expect(result.action).toBe("none");
    expect(result.pidAlive).toBeNull();
  });

  // --- Time-based checks (both tmux and pid alive) ---

  test("activity older than zombieMs → zombie", () => {
    // 200s of silence exceeds the 120s zombie threshold.
    const result = evaluate({ state: "working", lastActivity: isoAgo(200_000) }, true);

    expect(result.state).toBe("zombie");
    expect(result.action).toBe("terminate");
    expect(result.reconciliationNote).toBeNull();
  });

  // --- Normal state transitions ---

  test("booting with recent activity → transitions to working", () => {
    const result = evaluate({ state: "booting", lastActivity: isoAgo(5_000) }, true);

    expect(result.state).toBe("working");
    expect(result.action).toBe("none");
    expect(result.reconciliationNote).toBeNull();
  });

  test("working with recent activity → stays working", () => {
    const result = evaluate({ state: "working", lastActivity: isoAgo(5_000) }, true);

    expect(result.state).toBe("working");
    expect(result.action).toBe("none");
  });

  // --- Persistent capabilities (coordinator, monitor, gateway) ---

  test("persistent capability: coordinator with stale activity → still working, no escalation", () => {
    const result = evaluate(
      { capability: "coordinator", state: "working", lastActivity: isoAgo(60_000) },
      true,
    );

    expect(result.state).toBe("working");
    expect(result.action).toBe("none");
  });

  test("persistent capability: coordinator with zombie-level staleness → still working", () => {
    const result = evaluate(
      { capability: "coordinator", state: "working", lastActivity: isoAgo(200_000) },
      true,
    );

    expect(result.state).toBe("working");
    expect(result.action).toBe("none");
  });

  test("persistent capability: monitor with stale activity → still working", () => {
    const result = evaluate(
      { capability: "monitor", state: "working", lastActivity: isoAgo(60_000) },
      true,
    );

    expect(result.state).toBe("working");
    expect(result.action).toBe("none");
  });

  test("persistent capability: gateway with stale activity → still working", () => {
    const result = evaluate(
      { capability: "gateway", state: "working", lastActivity: isoAgo(60_000) },
      true,
    );

    expect(result.state).toBe("working");
    expect(result.action).toBe("none");
  });

  test("persistent capability: coordinator booting → transitions to working", () => {
    const result = evaluate({ capability: "coordinator", state: "booting" }, true);

    expect(result.state).toBe("working");
    expect(result.action).toBe("none");
  });

  test("persistent capability: coordinator with tmux dead → still zombie (ZFC Rule 1 applies)", () => {
    const result = evaluate({ capability: "coordinator", state: "working" }, false);

    expect(result.state).toBe("zombie");
    expect(result.action).toBe("terminate");
  });

  test("persistent capability: coordinator with pid dead → still zombie (ZFC Rule 3 applies)", () => {
    const result = evaluate({ capability: "coordinator", state: "working", pid: DEAD_PID }, true);

    expect(result.state).toBe("zombie");
    expect(result.action).toBe("terminate");
  });

  // --- Completed agents ---

  test("completed agents skip monitoring", () => {
    const result = evaluate({ state: "completed" }, true);

    expect(result.state).toBe("completed");
    expect(result.action).toBe("none");
    expect(result.reconciliationNote).toBeNull();
  });

  // --- pidAlive field is populated ---

  test("pidAlive reflects actual process state for alive PID", () => {
    const result = evaluate({ pid: ALIVE_PID, state: "working" }, true);

    expect(result.pidAlive).toBe(true);
  });

  test("pidAlive reflects actual process state for dead PID", () => {
    // Dead pid combined with dead tmux: the tmux-dead rule decides the state,
    // while pidAlive must still report what the pid probe observed.
    const result = evaluate({ pid: DEAD_PID, state: "working" }, false);

    expect(result.state).toBe("zombie");
    expect(result.pidAlive).toBe(false);
  });
});
287
+
288
+ // === transitionState ===
289
+
290
describe("transitionState", () => {
  /** Shape of the health-check argument accepted by transitionState. */
  type Check = Parameters<typeof transitionState>[1];

  /**
   * Build a health-check fixture: a healthy "working" result with no action,
   * with any field replaceable through `overrides`.
   */
  function makeCheck(overrides: Partial<Check> = {}): Check {
    return {
      state: "working",
      agentName: "a",
      timestamp: "",
      tmuxAlive: true,
      pidAlive: true,
      lastActivity: "",
      processAlive: true,
      action: "none",
      reconciliationNote: null,
      ...overrides,
    };
  }

  test("advances from booting to working", () => {
    const next = transitionState("booting", makeCheck({ state: "working" }));

    expect(next).toBe("working");
  });

  test("never regresses from zombie to booting", () => {
    const next = transitionState("zombie", makeCheck({ state: "booting" }));

    expect(next).toBe("zombie");
  });

  test("same state stays the same", () => {
    const next = transitionState("working", makeCheck({ state: "working" }));

    expect(next).toBe("working");
  });

  // --- ZFC: investigate holds state ---

  test("ZFC: investigate action holds current state (does not advance)", () => {
    const check = makeCheck({
      state: "zombie",
      action: "investigate",
      reconciliationNote: "ZFC: tmux alive but sessions.json says zombie",
    });

    // check.state and the current state are both zombie (same rank);
    // investigate must hold the state rather than advance it.
    expect(transitionState("zombie", check)).toBe("zombie");
  });

  test("ZFC: investigate prevents forward transition", () => {
    const check = makeCheck({
      state: "zombie",
      action: "investigate",
      reconciliationNote: "ZFC conflict",
    });

    // The current state is "working" and the check reports zombie, but because
    // the action is investigate the state must NOT advance.
    expect(transitionState("working", check)).toBe("working");
  });
});
@@ -0,0 +1,248 @@
1
+ /**
2
+ * Health check state machine and evaluation logic for agent monitoring.
3
+ *
4
+ * ZFC Principle (Zero Failure Crash)
5
+ * ==================================
6
+ * Observable state is the source of truth, not recorded state.
7
+ *
8
+ * Signal priority (highest to lowest):
9
+ * 1. tmux session liveness — Is the tmux session actually running?
10
+ * 2. Process liveness (pid) — Is the Claude Code process still alive?
11
+ * 3. Recorded state — What does sessions.json claim?
12
+ *
13
+ * When signals conflict, always trust what you can observe:
14
+ * - tmux dead + sessions.json says "working" → mark zombie immediately.
15
+ * The recorded state is stale; the process is gone.
16
+ * - tmux alive + sessions.json says "zombie" → investigate, don't auto-kill.
17
+ * Something marked it zombie but the process recovered or was misclassified.
18
+ * - pid dead + tmux alive → the pane's shell survived but the agent process
19
+ * exited. Treat as zombie (the agent is not doing work).
20
+ * - pid alive + tmux dead → should not happen (tmux owns the pid), but if it
21
+ * does, trust tmux (the session is gone).
22
+ *
23
+ * The rationale: sessions.json is updated asynchronously by hooks and can become
24
+ * stale if the agent crashes between hook invocations. tmux and the OS process
25
+ * table are always up-to-date because they reflect real kernel state.
26
+ */
27
+
28
+ import type { AgentSession, AgentState, HealthCheck } from "../types.ts";
29
+
30
/**
 * Capabilities whose agents run as persistent interactive sessions.
 *
 * Such agents are expected to sit idle for long stretches (for example a
 * coordinator waiting for worker mail), so lastActivity must NOT be used to
 * flag them as stale or zombie. Only the tmux/pid liveness checks apply.
 *
 * Shared concept with src/commands/log.ts:PERSISTENT_CAPABILITIES.
 */
const PERSISTENT_CAPABILITIES = new Set<string>([
  "coordinator",
  "monitor",
  "gateway",
]);
39
+
40
/**
 * Rank of each agent state, used to enforce forward-only transitions:
 * booting < working < completed < zombie. States without an entry are
 * absent from this table (the map is partial).
 */
const STATE_ORDER: Partial<Record<AgentState, number>> = {
  booting: 0,
  working: 1,
  completed: 2,
  zombie: 3,
};
47
+
48
/**
 * Check whether a process with the given PID is still running.
 *
 * Sends signal 0 through `process.kill`, which performs the existence and
 * permission check without delivering a signal to the target.
 *
 * A failure with code `EPERM` means the process exists but we are not allowed
 * to signal it (e.g. it belongs to another user), so it is reported as
 * running. Any other failure (such as `ESRCH`, or an invalid pid argument) is
 * reported as not running.
 *
 * @param pid - The process ID to check
 * @returns true if the process exists, false otherwise
 */
export function isProcessRunning(pid: number): boolean {
  try {
    // Signal 0 doesn't kill the process — it only probes for its existence.
    process.kill(pid, 0);
    return true;
  } catch (err: unknown) {
    // EPERM: the process is there, we just lack permission to signal it.
    return (
      typeof err === "object" &&
      err !== null &&
      "code" in err &&
      (err as { code?: unknown }).code === "EPERM"
    );
  }
}
66
+
67
/**
 * Evaluate the health of an agent session.
 *
 * Follows the ZFC principle: what can be observed right now (tmux liveness,
 * pid liveness) outranks what sessions.json recorded.
 *
 * Rules, applied in order — the first match decides the result:
 *
 * 1. Recorded state "completed" → reported as completed, no action.
 * 2. tmux dead → zombie, terminate (whatever sessions.json says).
 * 3. tmux alive but recorded state is zombie → zombie, investigate
 *    (never auto-kill; the conflict needs a reviewer).
 * 4. pid known to be dead while tmux is alive → zombie, terminate. The agent
 *    process exited and only the pane's shell is left.
 * 5. Persistent capability → no time-based check; booting becomes working,
 *    any other recorded state is reported unchanged.
 * 6. lastActivity older than zombieMs → zombie, terminate.
 * 7. Anything else (including booting) → working, no action.
 *
 * @param session - The agent session to evaluate
 * @param tmuxAlive - Whether the agent's tmux session is still running
 * @param thresholds - Zombie time threshold in milliseconds
 * @returns A HealthCheck describing the agent's current state and recommended action
 */
export function evaluateHealth(
  session: AgentSession,
  tmuxAlive: boolean,
  thresholds: { zombieMs: number },
): HealthCheck {
  const now = new Date();
  const idleMs = now.getTime() - new Date(session.lastActivity).getTime();

  // Secondary liveness signal; null means no pid was recorded for the session.
  const pidAlive = session.pid === null ? null : isProcessRunning(session.pid);

  // Observations shared by every possible outcome.
  const observed: Pick<
    HealthCheck,
    "agentName" | "timestamp" | "tmuxAlive" | "pidAlive" | "lastActivity"
  > = {
    agentName: session.agentName,
    timestamp: now.toISOString(),
    tmuxAlive,
    pidAlive,
    lastActivity: session.lastActivity,
  };

  // Assemble a HealthCheck from the shared observations plus one verdict.
  const verdict = (
    processAlive: boolean,
    state: HealthCheck["state"],
    action: HealthCheck["action"],
    reconciliationNote: HealthCheck["reconciliationNote"] = null,
  ): HealthCheck => ({ ...observed, processAlive, state, action, reconciliationNote });

  // Completed agents are not monitored.
  if (session.state === "completed") {
    return verdict(tmuxAlive, "completed", "none");
  }

  // ZFC Rule 1: no tmux session means the process is gone, so the recorded
  // state is stale. The note is only attached when the record claimed the
  // agent was still active.
  if (!tmuxAlive) {
    const claimedActive = session.state === "working" || session.state === "booting";
    return verdict(
      false,
      "zombie",
      "terminate",
      claimedActive
        ? `ZFC: tmux dead but sessions.json says "${session.state}" — marking zombie (observable state wins)`
        : null,
    );
  }

  // ZFC Rule 2: something recorded zombie, yet tmux is still running. Leave
  // the decision to a human or higher-tier agent instead of killing it.
  if (session.state === "zombie") {
    return verdict(
      true,
      "zombie",
      "investigate",
      "ZFC: tmux alive but sessions.json says zombie — investigation needed (don't auto-kill)",
    );
  }

  // ZFC Rule 3: the agent process exited but the tmux pane shell survived, so
  // no work is being done. A null pidAlive (pid unknown) does not trigger this.
  if (pidAlive === false) {
    return verdict(
      false,
      "zombie",
      "terminate",
      `ZFC: pid ${session.pid} dead but tmux alive — agent process exited, shell survived`,
    );
  }

  // Persistent capabilities (coordinator, monitor, gateway) idle for long
  // periods waiting for mail/events, so inactivity is never held against them.
  // Reaching this point means the liveness checks above passed, which is
  // enough to promote booting → working.
  if (PERSISTENT_CAPABILITIES.has(session.capability)) {
    return verdict(true, session.state === "booting" ? "working" : session.state, "none");
  }

  // Time-based check: silent for longer than the zombie threshold.
  if (idleMs > thresholds.zombieMs) {
    return verdict(true, "zombie", "terminate");
  }

  // Recent activity: a booting agent is promoted to working, and every other
  // remaining case is reported as healthy and working.
  return verdict(true, "working", "none");
}
215
+
216
/**
 * Compute the next agent state from the latest health check.
 *
 * Transitions are forward-only along the STATE_ORDER ranking:
 * booting(0) → working(1) → completed(2) → zombie(3)
 *
 * The state changes only when the check reports a strictly higher-ranked
 * state; otherwise the current state is kept (a zombie never becomes working
 * again). A state with no STATE_ORDER entry is ranked 0.
 *
 * Exception (ZFC): when the check's action is "investigate" the signals
 * conflict, so the current state is held until a human or higher-tier agent
 * has reviewed it.
 *
 * @param currentState - The agent's current state
 * @param check - The latest health check result
 * @returns Either `check.state` (when it ranks higher) or `currentState`
 */
export function transitionState(currentState: AgentState, check: HealthCheck): AgentState {
  const rankOf = (state: AgentState): number => STATE_ORDER[state] ?? 0;

  const mayAdvance =
    check.action !== "investigate" && rankOf(check.state) > rankOf(currentState);

  return mayAdvance ? check.state : currentState;
}