@katyella/legio 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (219) hide show
  1. package/CHANGELOG.md +422 -0
  2. package/LICENSE +21 -0
  3. package/README.md +555 -0
  4. package/agents/builder.md +141 -0
  5. package/agents/coordinator.md +351 -0
  6. package/agents/cto.md +196 -0
  7. package/agents/gateway.md +276 -0
  8. package/agents/lead.md +281 -0
  9. package/agents/merger.md +156 -0
  10. package/agents/monitor.md +212 -0
  11. package/agents/reviewer.md +142 -0
  12. package/agents/scout.md +131 -0
  13. package/agents/supervisor.md +416 -0
  14. package/bin/legio.mjs +38 -0
  15. package/package.json +77 -0
  16. package/src/agents/checkpoint.test.ts +88 -0
  17. package/src/agents/checkpoint.ts +102 -0
  18. package/src/agents/hooks-deployer.test.ts +1820 -0
  19. package/src/agents/hooks-deployer.ts +574 -0
  20. package/src/agents/identity.test.ts +614 -0
  21. package/src/agents/identity.ts +385 -0
  22. package/src/agents/lifecycle.test.ts +202 -0
  23. package/src/agents/lifecycle.ts +184 -0
  24. package/src/agents/manifest.test.ts +558 -0
  25. package/src/agents/manifest.ts +297 -0
  26. package/src/agents/overlay.test.ts +592 -0
  27. package/src/agents/overlay.ts +316 -0
  28. package/src/beads/client.test.ts +210 -0
  29. package/src/beads/client.ts +227 -0
  30. package/src/beads/molecules.test.ts +320 -0
  31. package/src/beads/molecules.ts +209 -0
  32. package/src/commands/agents.test.ts +325 -0
  33. package/src/commands/agents.ts +286 -0
  34. package/src/commands/clean.test.ts +730 -0
  35. package/src/commands/clean.ts +653 -0
  36. package/src/commands/completions.test.ts +346 -0
  37. package/src/commands/completions.ts +950 -0
  38. package/src/commands/coordinator.test.ts +1524 -0
  39. package/src/commands/coordinator.ts +880 -0
  40. package/src/commands/costs.test.ts +1015 -0
  41. package/src/commands/costs.ts +473 -0
  42. package/src/commands/dashboard.test.ts +94 -0
  43. package/src/commands/dashboard.ts +607 -0
  44. package/src/commands/doctor.test.ts +295 -0
  45. package/src/commands/doctor.ts +213 -0
  46. package/src/commands/down.test.ts +308 -0
  47. package/src/commands/down.ts +124 -0
  48. package/src/commands/errors.test.ts +648 -0
  49. package/src/commands/errors.ts +255 -0
  50. package/src/commands/feed.test.ts +579 -0
  51. package/src/commands/feed.ts +368 -0
  52. package/src/commands/gateway.test.ts +698 -0
  53. package/src/commands/gateway.ts +419 -0
  54. package/src/commands/group.test.ts +262 -0
  55. package/src/commands/group.ts +539 -0
  56. package/src/commands/hooks.test.ts +292 -0
  57. package/src/commands/hooks.ts +210 -0
  58. package/src/commands/init.test.ts +211 -0
  59. package/src/commands/init.ts +622 -0
  60. package/src/commands/inspect.test.ts +670 -0
  61. package/src/commands/inspect.ts +455 -0
  62. package/src/commands/log.test.ts +1556 -0
  63. package/src/commands/log.ts +752 -0
  64. package/src/commands/logs.test.ts +379 -0
  65. package/src/commands/logs.ts +544 -0
  66. package/src/commands/mail.test.ts +1726 -0
  67. package/src/commands/mail.ts +926 -0
  68. package/src/commands/merge.test.ts +676 -0
  69. package/src/commands/merge.ts +374 -0
  70. package/src/commands/metrics.test.ts +444 -0
  71. package/src/commands/metrics.ts +150 -0
  72. package/src/commands/monitor.test.ts +151 -0
  73. package/src/commands/monitor.ts +394 -0
  74. package/src/commands/nudge.test.ts +230 -0
  75. package/src/commands/nudge.ts +373 -0
  76. package/src/commands/prime.test.ts +467 -0
  77. package/src/commands/prime.ts +386 -0
  78. package/src/commands/replay.test.ts +742 -0
  79. package/src/commands/replay.ts +367 -0
  80. package/src/commands/run.test.ts +443 -0
  81. package/src/commands/run.ts +365 -0
  82. package/src/commands/server.test.ts +626 -0
  83. package/src/commands/server.ts +298 -0
  84. package/src/commands/sling.test.ts +810 -0
  85. package/src/commands/sling.ts +700 -0
  86. package/src/commands/spec.test.ts +206 -0
  87. package/src/commands/spec.ts +171 -0
  88. package/src/commands/status.test.ts +276 -0
  89. package/src/commands/status.ts +339 -0
  90. package/src/commands/stop.test.ts +357 -0
  91. package/src/commands/stop.ts +119 -0
  92. package/src/commands/supervisor.test.ts +186 -0
  93. package/src/commands/supervisor.ts +544 -0
  94. package/src/commands/trace.test.ts +746 -0
  95. package/src/commands/trace.ts +332 -0
  96. package/src/commands/up.test.ts +597 -0
  97. package/src/commands/up.ts +275 -0
  98. package/src/commands/watch.test.ts +152 -0
  99. package/src/commands/watch.ts +238 -0
  100. package/src/commands/worktree.test.ts +648 -0
  101. package/src/commands/worktree.ts +266 -0
  102. package/src/config.test.ts +496 -0
  103. package/src/config.ts +616 -0
  104. package/src/doctor/agents.test.ts +448 -0
  105. package/src/doctor/agents.ts +396 -0
  106. package/src/doctor/config-check.test.ts +184 -0
  107. package/src/doctor/config-check.ts +185 -0
  108. package/src/doctor/consistency.test.ts +645 -0
  109. package/src/doctor/consistency.ts +294 -0
  110. package/src/doctor/databases.test.ts +284 -0
  111. package/src/doctor/databases.ts +211 -0
  112. package/src/doctor/dependencies.test.ts +150 -0
  113. package/src/doctor/dependencies.ts +179 -0
  114. package/src/doctor/logs.test.ts +244 -0
  115. package/src/doctor/logs.ts +295 -0
  116. package/src/doctor/merge-queue.test.ts +210 -0
  117. package/src/doctor/merge-queue.ts +144 -0
  118. package/src/doctor/structure.test.ts +285 -0
  119. package/src/doctor/structure.ts +195 -0
  120. package/src/doctor/types.ts +37 -0
  121. package/src/doctor/version.test.ts +130 -0
  122. package/src/doctor/version.ts +131 -0
  123. package/src/e2e/chat-flow.test.ts +346 -0
  124. package/src/e2e/init-sling-lifecycle.test.ts +288 -0
  125. package/src/errors.test.ts +21 -0
  126. package/src/errors.ts +246 -0
  127. package/src/events/store.test.ts +660 -0
  128. package/src/events/store.ts +344 -0
  129. package/src/events/tool-filter.test.ts +330 -0
  130. package/src/events/tool-filter.ts +126 -0
  131. package/src/global-setup.ts +14 -0
  132. package/src/index.ts +339 -0
  133. package/src/insights/analyzer.test.ts +466 -0
  134. package/src/insights/analyzer.ts +203 -0
  135. package/src/logging/color.test.ts +118 -0
  136. package/src/logging/color.ts +71 -0
  137. package/src/logging/logger.test.ts +812 -0
  138. package/src/logging/logger.ts +266 -0
  139. package/src/logging/reporter.test.ts +258 -0
  140. package/src/logging/reporter.ts +109 -0
  141. package/src/logging/sanitizer.test.ts +190 -0
  142. package/src/logging/sanitizer.ts +57 -0
  143. package/src/mail/broadcast.test.ts +203 -0
  144. package/src/mail/broadcast.ts +92 -0
  145. package/src/mail/client.test.ts +873 -0
  146. package/src/mail/client.ts +236 -0
  147. package/src/mail/store.test.ts +815 -0
  148. package/src/mail/store.ts +402 -0
  149. package/src/merge/queue.test.ts +449 -0
  150. package/src/merge/queue.ts +262 -0
  151. package/src/merge/resolver.test.ts +1453 -0
  152. package/src/merge/resolver.ts +759 -0
  153. package/src/metrics/store.test.ts +1167 -0
  154. package/src/metrics/store.ts +511 -0
  155. package/src/metrics/summary.test.ts +397 -0
  156. package/src/metrics/summary.ts +178 -0
  157. package/src/metrics/transcript.test.ts +643 -0
  158. package/src/metrics/transcript.ts +351 -0
  159. package/src/mulch/client.test.ts +547 -0
  160. package/src/mulch/client.ts +416 -0
  161. package/src/server/audit-store.test.ts +384 -0
  162. package/src/server/audit-store.ts +257 -0
  163. package/src/server/headless.test.ts +180 -0
  164. package/src/server/headless.ts +151 -0
  165. package/src/server/index.test.ts +241 -0
  166. package/src/server/index.ts +317 -0
  167. package/src/server/public/app.js +187 -0
  168. package/src/server/public/apple-touch-icon.png +0 -0
  169. package/src/server/public/components/agent-badge.js +37 -0
  170. package/src/server/public/components/data-table.js +114 -0
  171. package/src/server/public/components/gateway-chat.js +256 -0
  172. package/src/server/public/components/issue-card.js +96 -0
  173. package/src/server/public/components/layout.js +88 -0
  174. package/src/server/public/components/message-bubble.js +120 -0
  175. package/src/server/public/components/stat-card.js +26 -0
  176. package/src/server/public/components/terminal-panel.js +140 -0
  177. package/src/server/public/favicon-16.png +0 -0
  178. package/src/server/public/favicon-32.png +0 -0
  179. package/src/server/public/favicon.ico +0 -0
  180. package/src/server/public/favicon.png +0 -0
  181. package/src/server/public/index.html +64 -0
  182. package/src/server/public/lib/api.js +35 -0
  183. package/src/server/public/lib/markdown.js +8 -0
  184. package/src/server/public/lib/preact-setup.js +8 -0
  185. package/src/server/public/lib/state.js +99 -0
  186. package/src/server/public/lib/utils.js +309 -0
  187. package/src/server/public/lib/ws.js +79 -0
  188. package/src/server/public/views/chat.js +983 -0
  189. package/src/server/public/views/costs.js +692 -0
  190. package/src/server/public/views/dashboard.js +781 -0
  191. package/src/server/public/views/gateway-chat.js +622 -0
  192. package/src/server/public/views/inspect.js +399 -0
  193. package/src/server/public/views/issues.js +470 -0
  194. package/src/server/public/views/setup.js +94 -0
  195. package/src/server/public/views/task-detail.js +422 -0
  196. package/src/server/routes.test.ts +3816 -0
  197. package/src/server/routes.ts +1964 -0
  198. package/src/server/websocket.test.ts +288 -0
  199. package/src/server/websocket.ts +196 -0
  200. package/src/sessions/compat.test.ts +109 -0
  201. package/src/sessions/compat.ts +17 -0
  202. package/src/sessions/store.test.ts +969 -0
  203. package/src/sessions/store.ts +480 -0
  204. package/src/test-helpers.test.ts +97 -0
  205. package/src/test-helpers.ts +143 -0
  206. package/src/types.ts +708 -0
  207. package/src/watchdog/daemon.test.ts +1233 -0
  208. package/src/watchdog/daemon.ts +533 -0
  209. package/src/watchdog/health.test.ts +371 -0
  210. package/src/watchdog/health.ts +248 -0
  211. package/src/watchdog/triage.test.ts +162 -0
  212. package/src/watchdog/triage.ts +193 -0
  213. package/src/worktree/manager.test.ts +444 -0
  214. package/src/worktree/manager.ts +224 -0
  215. package/src/worktree/tmux.test.ts +1238 -0
  216. package/src/worktree/tmux.ts +644 -0
  217. package/templates/CLAUDE.md.tmpl +89 -0
  218. package/templates/hooks.json.tmpl +132 -0
  219. package/templates/overlay.md.tmpl +79 -0
@@ -0,0 +1,533 @@
1
+ /**
2
+ * Tier 0 mechanical process monitoring daemon.
3
+ *
4
+ * Runs on a configurable interval, checking the health of all active agent
5
+ * sessions. Detects zombie agents (dead tmux or process) and attempts
6
+ * auto-recovery from checkpoints.
7
+ *
8
+ * Phase 4 tier numbering:
9
+ * Tier 0 = Mechanical daemon (this file)
10
+ * Tier 1 = Triage agent (triage.ts)
11
+ * Tier 2 = Monitor agent (not yet implemented)
12
+ * Tier 3 = Supervisor monitors (per-project)
13
+ *
14
+ * ZFC Principle: Observable state (tmux alive, pid alive) is the source of
15
+ * truth. See health.ts for the full ZFC documentation.
16
+ */
17
+
18
+ import { spawn } from "node:child_process";
19
+ import { mkdir, readFile, writeFile } from "node:fs/promises";
20
+ import { join } from "node:path";
21
+ import { loadCheckpoint } from "../agents/checkpoint.ts";
22
+ import { createEventStore } from "../events/store.ts";
23
+ import { createMulchClient } from "../mulch/client.ts";
24
+ import { openSessionStore } from "../sessions/compat.ts";
25
+ import type { AgentSession, EventStore, HealthCheck, SessionCheckpoint } from "../types.ts";
26
+ import { isSessionAlive, killSession } from "../worktree/tmux.ts";
27
+ import { evaluateHealth, transitionState } from "./health.ts";
28
+
29
/**
 * Persist a watchdog-detected agent failure as a mulch record in the
 * "agents" domain.
 *
 * Best-effort by design: any error raised while creating the client,
 * building the description, or recording is swallowed, so this function
 * never rejects.
 *
 * @param root - Project root directory handed to the mulch client
 * @param session - The agent session that failed
 * @param reason - Human-readable failure reason
 * @param tier - Watchdog tier that detected the failure (0 or 1)
 * @param triageSuggestion - Optional triage verdict; left out of the record when empty
 */
async function recordFailure(
	root: string,
	session: AgentSession,
	reason: string,
	tier: 0 | 1,
	triageSuggestion?: string,
): Promise<void> {
	try {
		const client = createMulchClient(root);
		const detectedBy = tier === 0 ? "Tier 0 (process death)" : "Tier 1 (AI triage)";

		// Assemble the description line by line; the triage line is optional.
		const lines: string[] = [
			`Agent: ${session.agentName}`,
			`Capability: ${session.capability}`,
			`Failure reason: ${reason}`,
		];
		if (triageSuggestion) {
			lines.push(`Triage suggestion: ${triageSuggestion}`);
		}
		lines.push(`Detected by: ${detectedBy}`);

		await client.record("agents", {
			type: "failure",
			description: lines.join("\n"),
			tags: ["watchdog", "auto-recorded"],
			// An empty bead ID is normalised to "no evidence bead".
			evidenceBead: session.beadId || undefined,
		});
	} catch {
		// Deliberately ignored: failure bookkeeping must not break the watchdog.
	}
}
69
+
70
/**
 * Look up the ID of the active run.
 *
 * @param legioDir - Directory expected to contain `current-run.txt`
 * @returns The trimmed file content, or null when the file cannot be read
 *   or contains only whitespace
 */
async function readCurrentRunId(legioDir: string): Promise<string | null> {
	const runFile = join(legioDir, "current-run.txt");

	let contents: string;
	try {
		contents = await readFile(runFile, "utf-8");
	} catch {
		// Unreadable or missing file means there is no active run.
		return null;
	}

	const runId = contents.trim();
	return runId.length > 0 ? runId : null;
}
83
+
84
/**
 * Best-effort insert of a daemon event into the EventStore.
 *
 * Does nothing when no store is available. The event payload is serialised
 * with JSON.stringify; session and tool fields are always stored as null.
 * Any error raised while building or inserting the row is swallowed.
 *
 * @param eventStore - Target store, or null to skip recording
 * @param event - Run/agent identity, event type, level and payload
 */
function recordEvent(
	eventStore: EventStore | null,
	event: {
		runId: string | null;
		agentName: string;
		eventType: "custom" | "mail_sent";
		level: "debug" | "info" | "warn" | "error";
		data: Record<string, unknown>;
	},
): void {
	if (!eventStore) {
		return;
	}

	try {
		const { runId, agentName, eventType, level, data } = event;
		eventStore.insert({
			runId,
			agentName,
			sessionId: null,
			eventType,
			toolName: null,
			toolArgs: null,
			toolDurationMs: null,
			level,
			data: JSON.stringify(data),
		});
	} catch {
		// Deliberately ignored: event recording must never break the daemon.
	}
}
114
+
115
/**
 * Load the persisted recovery attempt counter for an agent.
 *
 * The counter lives in `<agentsDir>/<agentName>/recovery-count`.
 *
 * @param agentsDir - Directory holding one sub-directory per agent
 * @param agentName - Agent whose counter should be read
 * @returns The parsed integer, or 0 when the file cannot be read or does not
 *   start with a number
 */
async function readRecoveryCount(agentsDir: string, agentName: string): Promise<number> {
	let raw: string;
	try {
		raw = await readFile(join(agentsDir, agentName, "recovery-count"), "utf-8");
	} catch {
		// No readable counter file: the agent has not been recovered yet.
		return 0;
	}

	// NaN (unparseable content) falls back to 0.
	const parsed = Number.parseInt(raw.trim(), 10);
	return parsed || 0;
}
127
+
128
/**
 * Persist the recovery attempt counter for an agent.
 *
 * Ensures `<agentsDir>/<agentName>` exists (creating parents as needed) and
 * then overwrites its `recovery-count` file with the decimal count.
 *
 * @param agentsDir - Directory holding one sub-directory per agent
 * @param agentName - Agent whose counter should be written
 * @param count - Value to store
 */
async function writeRecoveryCount(
	agentsDir: string,
	agentName: string,
	count: number,
): Promise<void> {
	const agentDir = join(agentsDir, agentName);
	const counterFile = join(agentDir, "recovery-count");

	await mkdir(agentDir, { recursive: true });
	await writeFile(counterFile, `${count}`, "utf-8");
}
141
+
142
/**
 * Default sling implementation: spawn `legio sling` as a subprocess.
 *
 * The returned promise never rejects because of the child process itself:
 * - a normal exit resolves with the child's exit code (1 when the child was
 *   killed by a signal and therefore has no code);
 * - a spawn failure (e.g. `legio` not on PATH, or `root` does not exist)
 *   resolves with exit code 1 and the error message appended to stderr.
 *
 * @param args - Arguments placed after `legio sling`
 * @param root - Working directory for the subprocess
 * @returns The exit code and everything the child wrote to stderr
 */
async function reSling(
	args: string[],
	root: string,
): Promise<{ exitCode: number; stderr: string }> {
	return new Promise((resolve) => {
		const proc = spawn("legio", ["sling", ...args], {
			cwd: root,
			stdio: ["ignore", "pipe", "pipe"],
		});

		// stdout is piped but its content is not needed. Drain it so a chatty
		// child cannot block on a full pipe and keep "close" from ever firing.
		proc.stdout?.resume();

		// Decode at the stream level so multi-byte characters that straddle a
		// chunk boundary are not corrupted.
		proc.stderr?.setEncoding("utf8");
		let stderr = "";
		proc.stderr?.on("data", (chunk: string) => {
			stderr += chunk;
		});

		// Without this listener a failed spawn is emitted as an unhandled
		// "error" event, which would crash the whole daemon process.
		proc.on("error", (err: Error) => {
			const detail = stderr.length > 0 ? `${stderr}\n${err.message}` : err.message;
			resolve({ exitCode: 1, stderr: detail });
		});

		// A promise settles only once, so a "close" that follows "error" is a no-op.
		proc.on("close", (code) => resolve({ exitCode: code ?? 1, stderr }));
	});
}
161
+
162
/**
 * Default recovery mail implementation: spawn `legio mail send` as a
 * subprocess and wait for it to finish.
 *
 * Delivery is best-effort: the promise resolves whether the child exits
 * successfully, exits with an error, or cannot be spawned at all (e.g.
 * `legio` not on PATH, or `root` does not exist). All child output is
 * discarded.
 *
 * @param args - Arguments placed after `legio mail send`
 * @param root - Working directory for the subprocess
 */
async function sendMailSubprocess(args: string[], root: string): Promise<void> {
	return new Promise((resolve) => {
		const proc = spawn("legio", ["mail", "send", ...args], {
			cwd: root,
			stdio: ["ignore", "ignore", "ignore"],
		});

		// A failed spawn is reported through an "error" event, which a caller's
		// try/catch cannot intercept. Unhandled, it would crash the daemon.
		proc.on("error", () => resolve());
		proc.on("close", () => resolve());
	});
}
174
+
175
/**
 * Try to bring a dead agent back by re-slinging it from its checkpoint.
 *
 * Steps, in order:
 *  1. Load the checkpoint; without one (or if loading throws) give up.
 *  2. Read the persisted recovery counter. If it has reached
 *     `maxRecoveryAttempts`, mail the parent agent (if any) that recovery is
 *     exhausted and give up.
 *  3. Persist the incremented counter (a write failure is tolerated), record
 *     a `recovery_attempt` event and mail the parent agent (if any).
 *  4. Run the sling with arguments rebuilt from the checkpoint and session,
 *     then record `recovery_success` or `recovery_failed`.
 *
 * Mail failures are ignored throughout.
 *
 * @param options - Session to recover, paths, limits and injected collaborators.
 *   `options.root` is accepted but not read by this function.
 * @returns `{ recovered: true }` only when the sling exits with code 0,
 *   `{ recovered: false }` in every other case.
 */
async function attemptRecovery(options: {
	session: AgentSession;
	legioDir: string;
	root: string;
	maxRecoveryAttempts: number;
	eventStore: EventStore | null;
	runId: string | null;
	sling: (args: string[]) => Promise<{ exitCode: number; stderr: string }>;
	loadCheckpointFn: (agentsDir: string, agentName: string) => Promise<SessionCheckpoint | null>;
	sendRecoveryMail: (args: string[]) => Promise<void>;
}): Promise<{ recovered: boolean }> {
	const {
		session,
		legioDir,
		maxRecoveryAttempts,
		eventStore,
		runId,
		sling,
		loadCheckpointFn,
		sendRecoveryMail,
	} = options;
	const agentsDir = join(legioDir, "agents");
	const failed = { recovered: false };

	// Mail the parent agent, if there is one. Delivery errors are ignored.
	const mailParent = async (mailArgs: string[]): Promise<void> => {
		if (!session.parentAgent) {
			return;
		}
		try {
			await sendRecoveryMail(["--to", session.parentAgent, ...mailArgs, "--from", "watchdog"]);
		} catch {
			// Deliberately ignored: mail failure must not break the watchdog.
		}
	};

	// Record a "custom" event for this agent in the current run.
	const emit = (level: "info" | "error", data: Record<string, unknown>): void => {
		recordEvent(eventStore, {
			runId,
			agentName: session.agentName,
			eventType: "custom",
			level,
			data,
		});
	};

	// Step 1: no usable checkpoint means nothing to recover from.
	let checkpoint: SessionCheckpoint | null;
	try {
		checkpoint = await loadCheckpointFn(agentsDir, session.agentName);
	} catch {
		checkpoint = null;
	}
	if (!checkpoint) {
		return failed;
	}

	// Step 2: stop and escalate once the retry budget is spent.
	const previousAttempts = await readRecoveryCount(agentsDir, session.agentName);
	if (previousAttempts >= maxRecoveryAttempts) {
		await mailParent([
			"--subject",
			`Recovery failed: ${session.agentName}`,
			"--body",
			`Auto-recovery exhausted for ${session.agentName} after ${previousAttempts} attempts. Agent marked zombie.`,
			"--type",
			"error",
			"--priority",
			"high",
		]);
		return failed;
	}

	// Step 3: count the attempt before making it, then announce it.
	const attempt = previousAttempts + 1;
	try {
		await writeRecoveryCount(agentsDir, session.agentName, attempt);
	} catch {
		// Non-fatal: the recovery proceeds even if the counter cannot be saved.
	}

	emit("info", { type: "recovery_attempt", attempt, maxAttempts: maxRecoveryAttempts });

	await mailParent([
		"--subject",
		`Recovery: ${session.agentName}`,
		"--body",
		`Watchdog attempting auto-recovery from checkpoint for ${session.agentName} (attempt ${attempt}/${maxRecoveryAttempts}).`,
		"--type",
		"health_check",
	]);

	// Step 4: rebuild the sling command line from checkpoint + session.
	const specPath = join(legioDir, "specs", `${checkpoint.beadId}.md`);
	const slingArgs: string[] = [
		checkpoint.beadId,
		"--capability",
		session.capability,
		"--name",
		session.agentName,
		"--spec",
		specPath,
		...(checkpoint.filesModified.length > 0
			? ["--files", checkpoint.filesModified.join(",")]
			: []),
		...(session.parentAgent ? ["--parent", session.parentAgent] : []),
		"--depth",
		String(session.depth),
	];

	try {
		const { exitCode, stderr } = await sling(slingArgs);
		if (exitCode === 0) {
			emit("info", { type: "recovery_success", attempt });
			return { recovered: true };
		}
		emit("error", { type: "recovery_failed", attempt, stderr });
	} catch {
		// The sling itself threw; there is no stderr to report.
		emit("error", { type: "recovery_failed", attempt });
	}
	return failed;
}
334
+
335
/**
 * Options accepted by both startDaemon and runDaemonTick.
 *
 * Members prefixed with an underscore are dependency-injection hooks meant
 * for tests; the real implementation is used whenever one is omitted.
 */
export interface DaemonOptions {
	/** Project root; the tick works inside its `.legio` sub-directory. */
	root: string;
	/** Passed to the health evaluation as the zombie threshold, in milliseconds. */
	zombieThresholdMs: number;
	/** Max recovery attempts per agent before escalating (default: 1). */
	maxRecoveryAttempts?: number;
	/** Called with the health check of every session that is not "completed". */
	onHealthCheck?: (check: HealthCheck) => void;

	/** Test hook: replaces the tmux liveness probe and session kill. */
	_tmux?: {
		isSessionAlive: (name: string) => Promise<boolean>;
		killSession: (name: string) => Promise<void>;
	};
	/**
	 * Test hook: EventStore to record into. Any value other than undefined
	 * (including null, which disables event recording) is used as-is and is
	 * not closed by the tick.
	 */
	_eventStore?: EventStore | null;
	/** Test hook: replaces the failure recorder. */
	_recordFailure?: (
		root: string,
		session: AgentSession,
		reason: string,
		tier: 0 | 1,
		triageSuggestion?: string,
	) => Promise<void>;
	/** Test hook: replaces the `legio sling` subprocess. */
	_sling?: (args: string[]) => Promise<{ exitCode: number; stderr: string }>;
	/** Test hook: replaces checkpoint loading. */
	_loadCheckpoint?: (agentsDir: string, agentName: string) => Promise<SessionCheckpoint | null>;
	/** Test hook: replaces mail sending for recovery notifications. */
	_sendRecoveryMail?: (args: string[]) => Promise<void>;
}
364
+
365
/**
 * Start the watchdog daemon loop.
 *
 * One tick (see runDaemonTick) is started immediately and another one every
 * `intervalMs` milliseconds afterwards. A rejected tick is swallowed so the
 * loop keeps running. The timer does not wait for a running tick to finish
 * before starting the next one.
 *
 * @param options - Tick options plus the polling interval
 * @param options.root - Project root directory (contains .legio/)
 * @param options.intervalMs - Polling interval in milliseconds
 * @param options.zombieThresholdMs - Zombie threshold forwarded to each tick
 * @param options.onHealthCheck - Optional callback for each health check result
 * @returns An object whose `stop` function cancels the interval timer
 */
export function startDaemon(options: DaemonOptions & { intervalMs: number }): { stop: () => void } {
	const { intervalMs } = options;

	const tick = (): void => {
		runDaemonTick(options).catch(() => {
			// Deliberately ignored: a failing tick must not crash the daemon.
		});
	};

	// First tick right away, subsequent ticks on the timer.
	tick();
	const timer = setInterval(tick, intervalMs);

	return {
		stop: (): void => {
			clearInterval(timer);
		},
	};
}
403
+
404
/**
 * Run one monitoring pass over all sessions. Exported so tests can drive the
 * monitoring logic directly, without the interval loop of startDaemon.
 *
 * For every session that is not "completed":
 *  - probe tmux liveness and evaluate health;
 *  - persist the state transition, if any;
 *  - report the check through `onHealthCheck`;
 *  - on a "terminate" action: record the failure, kill the tmux session if
 *    it is still alive, attempt checkpoint recovery, then store the final
 *    state and reset escalation tracking.
 * Any other action (including "investigate") changes nothing further.
 *
 * The session store is always closed at the end; the EventStore is closed
 * only when this function opened it itself.
 *
 * @param options - Same options as startDaemon (minus intervalMs)
 */
export async function runDaemonTick(options: DaemonOptions): Promise<void> {
	const { root, zombieThresholdMs, onHealthCheck } = options;

	// Resolve collaborators: injected test doubles win over real implementations.
	const tmux = options._tmux ?? { isSessionAlive, killSession };
	const recordFailureFn = options._recordFailure ?? recordFailure;
	const maxRecoveryAttempts = options.maxRecoveryAttempts ?? 1;
	const slingFn = options._sling ?? ((args: string[]) => reSling(args, root));
	const loadCheckpointFn = options._loadCheckpoint ?? loadCheckpoint;
	const sendRecoveryMailFn =
		options._sendRecoveryMail ?? ((args: string[]) => sendMailSubprocess(args, root));

	const legioDir = join(root, ".legio");
	const { store } = openSessionStore(legioDir);

	// EventStore: use the injected one (null included) or open events.db ourselves.
	const ownsEventStore = options._eventStore === undefined;
	let eventStore: EventStore | null = null;
	if (ownsEventStore) {
		try {
			eventStore = createEventStore(join(legioDir, "events.db"));
		} catch {
			// Non-fatal: the daemon runs without event recording.
		}
	} else {
		eventStore = options._eventStore ?? null;
	}

	let runId: string | null = null;
	try {
		runId = await readCurrentRunId(legioDir);
	} catch {
		// Non-fatal: events are recorded without a run ID.
	}

	try {
		const thresholds = { zombieMs: zombieThresholdMs };

		for (const session of store.getAll()) {
			// Completed sessions are terminal and need no monitoring.
			if (session.state === "completed") {
				continue;
			}

			// ZFC: zombies are NOT skipped; tmux liveness is re-checked on every
			// tick so a zombie with a live tmux session gets surfaced.
			const tmuxAlive = await tmux.isSessionAlive(session.tmuxSession);
			const check = evaluateHealth(session, tmuxAlive, thresholds);

			const nextState = transitionState(session.state, check);
			if (nextState !== session.state) {
				store.updateState(session.agentName, nextState);
				session.state = nextState;
			}

			if (onHealthCheck) {
				onHealthCheck(check);
			}

			// Only "terminate" triggers further work. For "investigate" the
			// conflict is surfaced through onHealthCheck above and the state is
			// left untouched; nothing is auto-killed.
			if (check.action !== "terminate") {
				continue;
			}

			// Tier 0 detection: record the failure first.
			await recordFailureFn(root, session, check.reconciliationNote ?? "Process terminated", 0);

			if (tmuxAlive) {
				try {
					await tmux.killSession(session.tmuxSession);
				} catch {
					// The session may have died between the check and the kill.
				}
			}

			// Try checkpoint recovery before settling on the zombie state.
			const { recovered } = await attemptRecovery({
				session,
				legioDir,
				root,
				maxRecoveryAttempts,
				eventStore,
				runId,
				sling: slingFn,
				loadCheckpointFn,
				sendRecoveryMail: sendRecoveryMailFn,
			});

			// NOTE(review): a successful recovery stores "completed" for this
			// agent name — confirm this does not clobber the state of the
			// freshly re-slung session.
			const finalState = recovered ? "completed" : "zombie";
			store.updateState(session.agentName, finalState);
			store.updateEscalation(session.agentName, 0, null);
			session.state = finalState;
			session.escalationLevel = 0;
			session.stalledSince = null;
		}
	} finally {
		store.close();
		// An injected EventStore belongs to the caller and stays open.
		if (eventStore && ownsEventStore) {
			try {
				eventStore.close();
			} catch {
				// Non-fatal
			}
		}
	}
}