@katyella/legio 0.1.3 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112) hide show
  1. package/CHANGELOG.md +61 -3
  2. package/README.md +21 -10
  3. package/agents/builder.md +11 -10
  4. package/agents/coordinator.md +36 -27
  5. package/agents/cto.md +9 -8
  6. package/agents/gateway.md +28 -12
  7. package/agents/lead.md +45 -30
  8. package/agents/merger.md +4 -4
  9. package/agents/monitor.md +10 -9
  10. package/agents/reviewer.md +8 -8
  11. package/agents/scout.md +10 -10
  12. package/agents/supervisor.md +60 -45
  13. package/package.json +2 -2
  14. package/src/agents/hooks-deployer.test.ts +46 -41
  15. package/src/agents/hooks-deployer.ts +10 -9
  16. package/src/agents/manifest.test.ts +6 -2
  17. package/src/agents/overlay.test.ts +9 -7
  18. package/src/agents/overlay.ts +29 -7
  19. package/src/commands/agents.test.ts +1 -5
  20. package/src/commands/clean.test.ts +2 -5
  21. package/src/commands/clean.ts +25 -1
  22. package/src/commands/completions.test.ts +1 -1
  23. package/src/commands/completions.ts +26 -7
  24. package/src/commands/coordinator.test.ts +87 -82
  25. package/src/commands/coordinator.ts +94 -48
  26. package/src/commands/costs.test.ts +2 -6
  27. package/src/commands/dashboard.test.ts +2 -5
  28. package/src/commands/doctor.test.ts +2 -6
  29. package/src/commands/down.ts +3 -3
  30. package/src/commands/errors.test.ts +2 -6
  31. package/src/commands/feed.test.ts +2 -6
  32. package/src/commands/gateway.test.ts +43 -17
  33. package/src/commands/gateway.ts +101 -11
  34. package/src/commands/hooks.test.ts +2 -5
  35. package/src/commands/init.test.ts +4 -13
  36. package/src/commands/inspect.test.ts +2 -6
  37. package/src/commands/log.test.ts +2 -6
  38. package/src/commands/logs.test.ts +2 -9
  39. package/src/commands/mail.test.ts +76 -215
  40. package/src/commands/mail.ts +43 -187
  41. package/src/commands/metrics.test.ts +3 -10
  42. package/src/commands/nudge.ts +15 -0
  43. package/src/commands/prime.test.ts +4 -11
  44. package/src/commands/replay.test.ts +2 -6
  45. package/src/commands/server.test.ts +1 -5
  46. package/src/commands/server.ts +1 -1
  47. package/src/commands/sling.test.ts +6 -1
  48. package/src/commands/sling.ts +42 -17
  49. package/src/commands/spec.test.ts +2 -5
  50. package/src/commands/status.test.ts +2 -4
  51. package/src/commands/stop.test.ts +2 -5
  52. package/src/commands/supervisor.ts +6 -6
  53. package/src/commands/trace.test.ts +2 -6
  54. package/src/commands/up.test.ts +43 -9
  55. package/src/commands/up.ts +15 -11
  56. package/src/commands/watchman.ts +327 -0
  57. package/src/commands/worktree.test.ts +2 -6
  58. package/src/config.test.ts +34 -104
  59. package/src/config.ts +120 -32
  60. package/src/doctor/agents.test.ts +52 -2
  61. package/src/doctor/agents.ts +4 -2
  62. package/src/doctor/config-check.test.ts +7 -2
  63. package/src/doctor/consistency.test.ts +7 -2
  64. package/src/doctor/databases.test.ts +6 -2
  65. package/src/doctor/dependencies.test.ts +18 -13
  66. package/src/doctor/dependencies.ts +23 -94
  67. package/src/doctor/logs.test.ts +7 -2
  68. package/src/doctor/merge-queue.test.ts +6 -2
  69. package/src/doctor/structure.test.ts +7 -2
  70. package/src/doctor/version.test.ts +7 -2
  71. package/src/e2e/init-sling-lifecycle.test.ts +2 -5
  72. package/src/index.ts +7 -7
  73. package/src/mail/pending.ts +120 -0
  74. package/src/mail/store.test.ts +89 -0
  75. package/src/mail/store.ts +11 -0
  76. package/src/merge/resolver.test.ts +518 -489
  77. package/src/server/index.ts +33 -2
  78. package/src/server/public/app.js +3 -3
  79. package/src/server/public/components/message-bubble.js +11 -1
  80. package/src/server/public/components/terminal-panel.js +66 -74
  81. package/src/server/public/views/chat.js +18 -2
  82. package/src/server/public/views/costs.js +5 -5
  83. package/src/server/public/views/dashboard.js +80 -51
  84. package/src/server/public/views/gateway-chat.js +37 -131
  85. package/src/server/public/views/inspect.js +16 -4
  86. package/src/server/public/views/issues.js +16 -12
  87. package/src/server/routes.test.ts +55 -39
  88. package/src/server/routes.ts +38 -26
  89. package/src/test-helpers.ts +6 -3
  90. package/src/tracker/beads.ts +159 -0
  91. package/src/tracker/exec.ts +44 -0
  92. package/src/tracker/factory.test.ts +283 -0
  93. package/src/tracker/factory.ts +59 -0
  94. package/src/tracker/seeds.ts +156 -0
  95. package/src/tracker/types.ts +46 -0
  96. package/src/types.ts +11 -2
  97. package/src/{watchdog → watchman}/daemon.test.ts +421 -515
  98. package/src/watchman/daemon.ts +940 -0
  99. package/src/worktree/tmux.test.ts +2 -1
  100. package/src/worktree/tmux.ts +4 -4
  101. package/templates/hooks.json.tmpl +17 -17
  102. package/src/beads/client.test.ts +0 -210
  103. package/src/commands/merge.test.ts +0 -676
  104. package/src/commands/watch.test.ts +0 -152
  105. package/src/commands/watch.ts +0 -238
  106. package/src/test-helpers.test.ts +0 -97
  107. package/src/watchdog/daemon.ts +0 -533
  108. package/src/watchdog/health.test.ts +0 -371
  109. package/src/watchdog/triage.test.ts +0 -162
  110. package/src/worktree/manager.test.ts +0 -444
  111. /package/src/{watchdog → watchman}/health.ts +0 -0
  112. /package/src/{watchdog → watchman}/triage.ts +0 -0
@@ -0,0 +1,940 @@
1
+ /**
2
+ * Unified daemon ("Watchman") — health monitoring + mail delivery + beacon safety net.
3
+ *
4
+ * Combines three responsibilities into a single process:
5
+ * 1. Health tick (default 30s): session health checks, zombie detection, boot timeout, recovery
6
+ * 2. Mail tick (default 5s): poll for unread mail, nudge agents
7
+ * 3. Beacon safety net (inside health tick): detect stuck beacons and send follow-up Enter
8
+ *
9
+ * Phase 4 tier numbering:
10
+ * Tier 0 = Mechanical daemon (this file)
11
+ * Tier 1 = Triage agent (triage.ts)
12
+ * Tier 2 = Monitor agent (not yet implemented)
13
+ * Tier 3 = Supervisor monitors (per-project)
14
+ *
15
+ * ZFC Principle: Observable state (tmux alive, pid alive) is the source of
16
+ * truth. See health.ts for the full ZFC documentation.
17
+ */
18
+
19
+ import { spawn } from "node:child_process";
20
+ import { mkdir, readFile, unlink, writeFile } from "node:fs/promises";
21
+ import { join } from "node:path";
22
+ import { loadCheckpoint } from "../agents/checkpoint.ts";
23
+ import { nudgeAgent } from "../commands/nudge.ts";
24
+ import { loadConfig } from "../config.ts";
25
+ import { createEventStore } from "../events/store.ts";
26
+ import { isAgentIdle, writePendingNudge } from "../mail/pending.ts";
27
+ import { createMailStore, type MailStore } from "../mail/store.ts";
28
+ import { createMulchClient } from "../mulch/client.ts";
29
+ import { openSessionStore } from "../sessions/compat.ts";
30
+ import type { AgentSession, EventStore, HealthCheck, SessionCheckpoint } from "../types.ts";
31
+ import { capturePaneContent, isSessionAlive, killSession, sendKeys } from "../worktree/tmux.ts";
32
+ import { evaluateHealth, transitionState } from "./health.ts";
33
+
34
/**
 * Persist a record of an agent failure through the mulch client.
 *
 * Best-effort: every step runs inside a single try/catch, so this function
 * never rejects — a failure while recording is silently dropped.
 *
 * @param root - Project root directory handed to the mulch client
 * @param session - The agent session that failed
 * @param reason - Human-readable failure reason
 * @param tier - Which watchman tier detected the failure (0 or 1)
 * @param triageSuggestion - Optional triage verdict; included in the description only when truthy
 */
async function recordFailure(
	root: string,
	session: AgentSession,
	reason: string,
	tier: 0 | 1,
	triageSuggestion?: string,
): Promise<void> {
	try {
		const mulch = createMulchClient(root);

		let detectedBy: string;
		if (tier === 0) {
			detectedBy = "Tier 0 (process death)";
		} else {
			detectedBy = "Tier 1 (AI triage)";
		}

		// Assemble the description line by line; the triage line is optional.
		const lines: string[] = [];
		lines.push(`Agent: ${session.agentName}`);
		lines.push(`Capability: ${session.capability}`);
		lines.push(`Failure reason: ${reason}`);
		if (triageSuggestion) {
			lines.push(`Triage suggestion: ${triageSuggestion}`);
		}
		lines.push(`Detected by: ${detectedBy}`);

		await mulch.record("agents", {
			type: "failure",
			description: lines.join("\n"),
			tags: ["watchdog", "auto-recorded"],
			// A falsy bead id is passed as undefined rather than as an empty value
			evidenceBead: session.beadId || undefined,
		});
	} catch {
		// Fire-and-forget: recording failures must not break the watchman
	}
}
74
+
75
/**
 * Look up the active run ID stored in `<legioDir>/current-run.txt`.
 *
 * @param legioDir - Directory that contains current-run.txt
 * @returns The trimmed file contents, or null when the file cannot be read
 *          or contains only whitespace
 */
async function readCurrentRunId(legioDir: string): Promise<string | null> {
	const runFile = join(legioDir, "current-run.txt");

	let contents: string;
	try {
		contents = await readFile(runFile, "utf-8");
	} catch {
		// Unreadable or missing file means there is no active run
		return null;
	}

	const runId = contents.trim();
	if (runId.length > 0) {
		return runId;
	}
	return null;
}
88
+
89
/**
 * Insert a daemon event into the EventStore on a best-effort basis.
 *
 * Does nothing when no store is supplied. Any error raised while building or
 * inserting the row is swallowed, so this never throws. The `data` payload is
 * serialized with JSON.stringify; session and tool fields are always null.
 *
 * @param eventStore - Target store, or null to skip recording
 * @param event - Event fields to persist
 */
function recordEvent(
	eventStore: EventStore | null,
	event: {
		runId: string | null;
		agentName: string;
		eventType: "custom" | "mail_sent";
		level: "debug" | "info" | "warn" | "error";
		data: Record<string, unknown>;
	},
): void {
	if (!eventStore) {
		return;
	}

	try {
		const { runId, agentName, eventType, level, data } = event;
		eventStore.insert({
			runId,
			agentName,
			sessionId: null,
			eventType,
			toolName: null,
			toolArgs: null,
			toolDurationMs: null,
			level,
			data: JSON.stringify(data),
		});
	} catch {
		// Fire-and-forget: event recording must never break the daemon
	}
}
119
+
120
/**
 * Load the persisted recovery attempt counter for an agent.
 *
 * The counter lives in `<agentsDir>/<agentName>/recovery-count`.
 *
 * @returns The parsed base-10 integer, or 0 when the file cannot be read or
 *          does not start with a number
 */
async function readRecoveryCount(agentsDir: string, agentName: string): Promise<number> {
	try {
		const counterFile = join(agentsDir, agentName, "recovery-count");
		const raw = await readFile(counterFile, "utf-8");
		const parsed = Number.parseInt(raw.trim(), 10);
		return Number.isNaN(parsed) ? 0 : parsed;
	} catch {
		return 0;
	}
}
132
+
133
/**
 * Persist the recovery attempt counter for an agent.
 *
 * Ensures `<agentsDir>/<agentName>` exists (created recursively) and then
 * writes the stringified count to its `recovery-count` file. Rejects if
 * either filesystem call fails.
 */
async function writeRecoveryCount(
	agentsDir: string,
	agentName: string,
	count: number,
): Promise<void> {
	const agentDir = join(agentsDir, agentName);
	const counterFile = join(agentDir, "recovery-count");
	await mkdir(agentDir, { recursive: true });
	await writeFile(counterFile, String(count), "utf-8");
}
146
+
147
/**
 * Default sling implementation: spawn `legio sling` as a subprocess.
 *
 * The returned promise always resolves (never rejects):
 * - on normal termination, with the child's exit code (1 when the code is null)
 *   and everything the child wrote to stderr;
 * - when the process cannot be spawned at all (for example `legio` is not on
 *   PATH or `root` does not exist), with exit code 1 and the spawn error
 *   message appended to stderr.
 *
 * @param args - Arguments appended after `sling`
 * @param root - Working directory for the subprocess
 */
async function reSling(
	args: string[],
	root: string,
): Promise<{ exitCode: number; stderr: string }> {
	return new Promise((resolve) => {
		const proc = spawn("legio", ["sling", ...args], {
			cwd: root,
			stdio: ["ignore", "pipe", "pipe"],
		});
		let stderr = "";
		proc.stderr?.on("data", (chunk: Buffer) => {
			stderr += chunk.toString();
		});
		// stdout is piped but its content is not needed. Drain it so a chatty
		// child cannot block on a full pipe buffer and hang this promise.
		proc.stdout?.resume();
		// Without an "error" listener a failed spawn surfaces as an unhandled
		// "error" event, which would take the whole daemon down.
		proc.on("error", (err: Error) => {
			resolve({ exitCode: 1, stderr: stderr + err.message });
		});
		proc.on("close", (code) => resolve({ exitCode: code ?? 1, stderr }));
	});
}
166
+
167
/**
 * Default recovery mail implementation: spawn `legio mail send` as a subprocess.
 *
 * Best-effort: the returned promise resolves once the child closes, regardless
 * of its exit code, and also resolves when the process cannot be spawned at
 * all (for example `legio` is not on PATH). It never rejects.
 *
 * @param args - Arguments appended after `mail send`
 * @param root - Working directory for the subprocess
 */
async function sendMailSubprocess(args: string[], root: string): Promise<void> {
	return new Promise((resolve) => {
		const proc = spawn("legio", ["mail", "send", ...args], {
			cwd: root,
			stdio: ["ignore", "ignore", "ignore"],
		});
		// Without an "error" listener a failed spawn surfaces as an unhandled
		// "error" event, which would take the whole daemon down.
		proc.on("error", () => resolve());
		proc.on("close", () => resolve());
	});
}
179
+
180
/**
 * Try to bring a dead agent back by re-slinging it from its saved checkpoint.
 *
 * Sequence:
 * 1. Load the checkpoint; without one (or if loading throws) give up.
 * 2. If the persisted recovery count has reached `maxRecoveryAttempts`, mail the
 *    parent agent (when there is one) that recovery is exhausted and give up.
 * 3. Otherwise bump the persisted count, record a `recovery_attempt` event,
 *    notify the parent, and run the injected `sling` with arguments rebuilt
 *    from the checkpoint and the session.
 *
 * Mail and counter-write failures are swallowed; they never abort the attempt.
 * The `root` option is accepted but not read here.
 *
 * @returns `{ recovered: true }` only when sling exits with code 0,
 *          `{ recovered: false }` in every other case.
 */
async function attemptRecovery(options: {
	session: AgentSession;
	legioDir: string;
	root: string;
	maxRecoveryAttempts: number;
	eventStore: EventStore | null;
	runId: string | null;
	sling: (args: string[]) => Promise<{ exitCode: number; stderr: string }>;
	loadCheckpointFn: (agentsDir: string, agentName: string) => Promise<SessionCheckpoint | null>;
	sendRecoveryMail: (args: string[]) => Promise<void>;
}): Promise<{ recovered: boolean }> {
	const {
		session,
		legioDir,
		maxRecoveryAttempts,
		eventStore,
		runId,
		sling,
		loadCheckpointFn,
		sendRecoveryMail,
	} = options;
	const agentsDir = join(legioDir, "agents");

	// Mail the parent agent, if any. Best-effort: delivery errors are ignored.
	const mailParent = async (
		subject: string,
		body: string,
		extraFlags: string[],
	): Promise<void> => {
		if (!session.parentAgent) {
			return;
		}
		try {
			await sendRecoveryMail([
				"--to",
				session.parentAgent,
				"--subject",
				subject,
				"--body",
				body,
				...extraFlags,
				"--from",
				"watchman",
			]);
		} catch {
			// Fire-and-forget: mail failure must not break the watchman
		}
	};

	// Record a custom event attributed to the agent being recovered.
	const logRecovery = (level: "info" | "error", data: Record<string, unknown>): void => {
		recordEvent(eventStore, {
			runId,
			agentName: session.agentName,
			eventType: "custom",
			level,
			data,
		});
	};

	// No checkpoint (or an unreadable one) means there is nothing to recover from
	let checkpoint: SessionCheckpoint | null = null;
	try {
		checkpoint = await loadCheckpointFn(agentsDir, session.agentName);
	} catch {
		return { recovered: false };
	}
	if (!checkpoint) {
		return { recovered: false };
	}

	// Retry budget spent: escalate to the parent and stop
	const recoveryCount = await readRecoveryCount(agentsDir, session.agentName);
	if (recoveryCount >= maxRecoveryAttempts) {
		await mailParent(
			`Recovery failed: ${session.agentName}`,
			`Auto-recovery exhausted for ${session.agentName} after ${recoveryCount} attempts. Agent marked zombie.`,
			["--type", "error", "--priority", "high"],
		);
		return { recovered: false };
	}

	// Persist the incremented count before the attempt itself
	const attempt = recoveryCount + 1;
	try {
		await writeRecoveryCount(agentsDir, session.agentName, attempt);
	} catch {
		// Non-fatal: proceed with recovery even if count write fails
	}

	logRecovery("info", { type: "recovery_attempt", attempt, maxAttempts: maxRecoveryAttempts });

	await mailParent(
		`Recovery: ${session.agentName}`,
		`Watchman attempting auto-recovery from checkpoint for ${session.agentName} (attempt ${attempt}/${maxRecoveryAttempts}).`,
		["--type", "health_check"],
	);

	// Rebuild the sling invocation from checkpoint + session
	const specPath = join(legioDir, "specs", `${checkpoint.beadId}.md`);
	const slingArgs: string[] = [
		checkpoint.beadId,
		"--capability",
		session.capability,
		"--name",
		session.agentName,
		"--spec",
		specPath,
		...(checkpoint.filesModified.length > 0
			? ["--files", checkpoint.filesModified.join(",")]
			: []),
		...(session.parentAgent ? ["--parent", session.parentAgent] : []),
		"--depth",
		String(session.depth),
	];

	try {
		const result = await sling(slingArgs);
		const succeeded = result.exitCode === 0;
		if (succeeded) {
			logRecovery("info", { type: "recovery_success", attempt });
		} else {
			logRecovery("error", { type: "recovery_failed", attempt, stderr: result.stderr });
		}
		return { recovered: succeeded };
	} catch {
		// sling itself threw: record the failure without stderr
		logRecovery("error", { type: "recovery_failed", attempt });
		return { recovered: false };
	}
}
339
+
340
/**
 * Enumerate tmux session names that begin with `prefix`.
 *
 * Runs `tmux list-sessions -F "#{session_name}"` and keeps the non-empty,
 * trimmed output lines that start with the prefix. Resolves to an empty array
 * when tmux cannot be spawned or exits with a non-zero code.
 */
async function listTmuxSessions(prefix: string): Promise<string[]> {
	return new Promise((resolve) => {
		const child = spawn("tmux", ["list-sessions", "-F", "#{session_name}"], {
			stdio: ["ignore", "pipe", "pipe"],
		});

		const pieces: string[] = [];
		child.stdout?.on("data", (chunk: Buffer) => {
			pieces.push(chunk.toString());
		});

		child.on("error", () => resolve([]));

		child.on("close", (code) => {
			if (code !== 0) {
				resolve([]);
				return;
			}
			const matches: string[] = [];
			for (const rawLine of pieces.join("").split("\n")) {
				const name = rawLine.trim();
				if (name.length > 0 && name.startsWith(prefix)) {
					matches.push(name);
				}
			}
			resolve(matches);
		});
	});
}
367
+
368
/**
 * Substrings whose presence in captured tmux pane content is taken as evidence
 * that an agent is actively working rather than sitting at the prompt.
 */
const ACTIVITY_MARKERS = [
	"⏺",
	"Claude",
	"Reading",
	"Searching",
	"Editing",
	"Writing",
];
370
+
371
/**
 * Bookkeeping kept per agent while it has unread mail.
 *
 * The daemon stores one of these per agent name in the map it passes to the
 * mail tick.
 */
export interface AgentMailState {
	/** NOTE(review): presumably the epoch-ms time unread mail was first observed — confirm against the mail tick. */
	firstSeenAt: number;
	/** NOTE(review): presumably the epoch-ms time of the most recent nudge — confirm against the mail tick. */
	lastNudgeAt: number;
	/** Number of nudges counted for this agent. */
	nudgeCount: number;
}
377
+
378
/**
 * Configuration accepted by startDaemon, runDaemonTick and runMailTick.
 *
 * Members prefixed with an underscore are dependency-injection seams for
 * tests; when omitted, the real implementation is used.
 */
export interface WatchmanOptions {
	/** Project root; the daemon works inside its `.legio` directory. */
	root: string;
	/** Zombie threshold in milliseconds, forwarded to the health evaluation. */
	zombieThresholdMs: number;
	/** Invoked with the health check computed for each monitored session. */
	onHealthCheck?: (check: HealthCheck) => void;
	/** Test seam: replaces the tmux liveness probe and session kill. */
	_tmux?: {
		isSessionAlive: (name: string) => Promise<boolean>;
		killSession: (name: string) => Promise<void>;
	};
	/** Test seam: EventStore to use instead of opening events.db (null disables recording). */
	_eventStore?: EventStore | null;
	/** Test seam: replaces the failure recorder. */
	_recordFailure?: (
		root: string,
		session: AgentSession,
		reason: string,
		tier: 0 | 1,
		triageSuggestion?: string,
	) => Promise<void>;
	/** Recovery attempts allowed per agent before escalating (default: 1). */
	maxRecoveryAttempts?: number;
	/** Test seam: replaces the sling subprocess spawn. */
	_sling?: (args: string[]) => Promise<{ exitCode: number; stderr: string }>;
	/** Test seam: replaces checkpoint loading. */
	_loadCheckpoint?: (agentsDir: string, agentName: string) => Promise<SessionCheckpoint | null>;
	/** Test seam: replaces mail sending for recovery and alert notifications. */
	_sendRecoveryMail?: (args: string[]) => Promise<void>;
	/**
	 * How long, in milliseconds, an agent may stay in the "booting" state
	 * (default: 90000). Past this threshold the agent is marked zombie and an
	 * urgent alert is mailed to its parent.
	 */
	bootTimeoutMs?: number;
	/** Test seam: replaces tmux session listing used for unregistered agent detection. */
	_listTmuxSessions?: (prefix: string) => Promise<string[]>;
	/** Test seam: project name to use instead of reading it via loadConfig. */
	_projectName?: string;

	// --- Mail delivery fields ---

	/** Interval between mail polls in ms (default 5_000). */
	mailIntervalMs?: number;
	/** Minimum time between repeated nudges to the same agent (default 10_000). */
	reNudgeIntervalMs?: number;
	/** Warn once unread mail has been sitting this long (default 60_000). */
	warnAfterMs?: number;
	/** Boot time in ms after which a stuck beacon gets a follow-up nudge (default 20_000). */
	beaconNudgeMs?: number;
	/** Invoked when an agent is nudged about unread mail. */
	onNudge?: (agentName: string, nudgeCount: number) => void;
	/** Invoked when an agent's mail has stayed unread for too long. */
	onWarn?: (agentName: string, unreadSinceMs: number) => void;
	/** Test seam: MailStore to use instead of creating one. */
	_mailStore?: MailStore;
	/** Test seam: replaces nudge delivery. */
	_nudge?: (
		projectRoot: string,
		agentName: string,
		message: string,
		force: boolean,
	) => Promise<{ delivered: boolean; reason?: string }>;
	/** Test seam: replaces the isAgentIdle check. */
	_isAgentIdle?: (cwd: string, agentName: string) => Promise<boolean>;
	/** Test seam: replaces writePendingNudge. */
	_writePendingNudge?: (
		cwd: string,
		agentName: string,
		nudge: { from: string; reason: string; subject: string; messageId: string },
	) => Promise<void>;
	/** Test seam: replaces capturePaneContent. */
	_capturePaneContent?: (sessionName: string) => Promise<string>;
	/** Test seam: replaces sendKeys. */
	_sendKeys?: (sessionName: string, keys: string) => Promise<void>;
}
453
+
454
/**
 * Launch the unified watchman daemon.
 *
 * Two independent timers run in the same process:
 * - the health tick (every `intervalMs`), which calls runDaemonTick;
 * - the mail tick (every `mailIntervalMs`, default 5000 ms), which calls
 *   runMailTick with a per-agent state map owned by this daemon instance.
 *
 * Each tick also runs once immediately at startup. A rejected tick is
 * swallowed so the daemon keeps running.
 *
 * @returns An object whose `stop` function clears both timers
 */
export function startDaemon(options: WatchmanOptions & { intervalMs: number }): {
	stop: () => void;
} {
	const healthPeriodMs = options.intervalMs;
	const mailPeriodMs = options.mailIntervalMs ?? 5_000;
	const mailState = new Map<string, AgentMailState>();

	const healthTick = (): void => {
		runDaemonTick(options).catch(() => {
			// Swallow tick errors — daemon must not crash
		});
	};
	const mailTick = (): void => {
		runMailTick(options, mailState).catch(() => {
			// Swallow tick errors — daemon must not crash
		});
	};

	// First health tick right away, then on the timer
	healthTick();
	const healthTimer = setInterval(healthTick, healthPeriodMs);

	// First mail tick right away, then on the timer
	mailTick();
	const mailTimer = setInterval(mailTick, mailPeriodMs);

	return {
		stop(): void {
			clearInterval(healthTimer);
			clearInterval(mailTimer);
		},
	};
}
499
+
500
/**
 * Run a single health daemon tick. Exported for testing — allows direct invocation
 * of the monitoring logic without starting the interval-based daemon loop.
 *
 * Per tick:
 * 1. Every non-completed session is health-checked against tmux liveness.
 * 2. Sessions still "booting" with a live tmux session get a beacon nudge (an
 *    empty sendKeys) once they pass `beaconNudgeMs`, and are marked zombie with
 *    an urgent mail once they pass `bootTimeoutMs`.
 * 3. Sessions whose check says "terminate" are recorded as failures, killed if
 *    tmux is still alive, and given one auto-recovery attempt from checkpoint.
 * 4. Tmux sessions carrying the project prefix but missing from the session
 *    store are tracked with a marker file and reported to the coordinator once
 *    they have been unregistered for more than 3 minutes.
 *
 * The session store is always closed on exit; the EventStore is closed only
 * when it was created here rather than injected.
 *
 * @param options - Same options as startDaemon (minus intervalMs)
 */
export async function runDaemonTick(options: WatchmanOptions): Promise<void> {
	const { root, zombieThresholdMs, onHealthCheck } = options;
	const tmux = options._tmux ?? { isSessionAlive, killSession };
	const recordFailureFn = options._recordFailure ?? recordFailure;
	const maxRecoveryAttempts = options.maxRecoveryAttempts ?? 1;
	const slingFn = options._sling ?? ((args: string[]) => reSling(args, root));
	const loadCheckpointFn = options._loadCheckpoint ?? loadCheckpoint;
	const sendRecoveryMailFn =
		options._sendRecoveryMail ?? ((args: string[]) => sendMailSubprocess(args, root));
	const beaconNudgeMs = options.beaconNudgeMs ?? 20_000;
	// Loop-invariant, so resolved once up front rather than per session
	const bootTimeoutMs = options.bootTimeoutMs ?? 90_000;
	const capturePaneFn = options._capturePaneContent ?? capturePaneContent;
	const sendKeysFn = options._sendKeys ?? sendKeys;

	const legioDir = join(root, ".legio");
	const { store } = openSessionStore(legioDir);

	// Open EventStore for recording daemon events (fire-and-forget)
	let eventStore: EventStore | null = null;
	let runId: string | null = null;
	const useInjectedEventStore = options._eventStore !== undefined;
	if (useInjectedEventStore) {
		eventStore = options._eventStore ?? null;
	} else {
		try {
			const eventsDbPath = join(legioDir, "events.db");
			eventStore = createEventStore(eventsDbPath);
		} catch {
			// EventStore creation failure is non-fatal for the daemon
		}
	}
	try {
		runId = await readCurrentRunId(legioDir);
	} catch {
		// Reading run ID failure is non-fatal
	}

	try {
		const thresholds = {
			zombieMs: zombieThresholdMs,
		};

		const sessions = store.getAll();

		for (const session of sessions) {
			// Skip completed sessions — they are terminal and don't need monitoring
			if (session.state === "completed") {
				continue;
			}

			// ZFC: Don't skip zombies. Re-check tmux liveness on every tick.
			// A zombie with a live tmux session needs investigation, not silence.

			const tmuxAlive = await tmux.isSessionAlive(session.tmuxSession);
			const check = evaluateHealth(session, tmuxAlive, thresholds);

			// Boot timeout detection: agent stuck in booting state beyond threshold.
			// Fires when tmux is alive (the session started) but the agent never
			// transitioned out of "booting" within the allowed window.
			if (session.state === "booting" && tmuxAlive) {
				const bootElapsed = Date.now() - new Date(session.startedAt).getTime();

				// Beacon safety net: if the agent has been booting longer than
				// beaconNudgeMs but less than bootTimeoutMs, check if the beacon
				// is stuck in the tmux input buffer and send a follow-up Enter.
				if (bootElapsed > beaconNudgeMs && bootElapsed <= bootTimeoutMs) {
					try {
						const paneContent = await capturePaneFn(session.tmuxSession);
						const hasActivity = ACTIVITY_MARKERS.some((marker) => paneContent.includes(marker));
						if (!hasActivity && paneContent.trim().length > 0) {
							await sendKeysFn(session.tmuxSession, "");
							recordEvent(eventStore, {
								runId,
								agentName: session.agentName,
								eventType: "custom",
								level: "info",
								data: {
									type: "beacon_nudge",
									bootElapsedMs: bootElapsed,
									beaconNudgeMs,
								},
							});
						}
					} catch {
						// Non-fatal: beacon nudge failure must not break the daemon
					}
				}

				if (bootElapsed > bootTimeoutMs) {
					const notifyTarget = session.parentAgent ?? "coordinator";
					try {
						await sendRecoveryMailFn([
							"--to",
							notifyTarget,
							"--subject",
							`Boot timeout: ${session.agentName}`,
							"--body",
							`Agent ${session.agentName} stuck in booting state for ${Math.round(bootElapsed / 1000)}s (threshold: ${Math.round(bootTimeoutMs / 1000)}s). Marking zombie.`,
							"--type",
							"error",
							"--priority",
							"urgent",
							"--from",
							"watchman",
						]);
					} catch {
						// Fire-and-forget: mail failure must not break the watchman
					}
					recordEvent(eventStore, {
						runId,
						agentName: session.agentName,
						eventType: "custom",
						level: "warn",
						data: { type: "boot_timeout", bootElapsedMs: bootElapsed, bootTimeoutMs },
					});
					store.updateState(session.agentName, "zombie");
					session.state = "zombie";
					if (onHealthCheck) {
						onHealthCheck(check);
					}
					continue;
				}
			}

			// Transition state forward only (investigate action holds state)
			const newState = transitionState(session.state, check);
			if (newState !== session.state) {
				store.updateState(session.agentName, newState);
				session.state = newState;
			}

			if (onHealthCheck) {
				onHealthCheck(check);
			}

			if (check.action === "terminate") {
				// Record the failure via mulch (Tier 0 detection)
				const reason = check.reconciliationNote ?? "Process terminated";
				await recordFailureFn(root, session, reason, 0);

				// Kill the tmux session if it's still alive
				if (tmuxAlive) {
					try {
						await tmux.killSession(session.tmuxSession);
					} catch {
						// Session may have died between check and kill — not an error
					}
				}

				// Attempt auto-recovery from checkpoint before marking zombie
				const { recovered } = await attemptRecovery({
					session,
					legioDir,
					root,
					maxRecoveryAttempts,
					eventStore,
					runId,
					sling: slingFn,
					loadCheckpointFn,
					sendRecoveryMail: sendRecoveryMailFn,
				});

				if (!recovered) {
					store.updateState(session.agentName, "zombie");
					// Reset escalation tracking on terminal state
					store.updateEscalation(session.agentName, 0, null);
					session.state = "zombie";
					session.escalationLevel = 0;
					session.stalledSince = null;
				} else {
					// Recovery succeeded — clear zombie state set by transitionState above
					store.updateState(session.agentName, "completed");
					store.updateEscalation(session.agentName, 0, null);
					session.state = "completed";
					session.escalationLevel = 0;
					session.stalledSince = null;
				}
			} else if (check.action === "investigate") {
				// ZFC: tmux alive but SessionStore says zombie.
				// Log the conflict but do NOT auto-kill.
				// The onHealthCheck callback surfaces this to the operator.
				// No state change — keep zombie until a human or higher-tier agent decides.
			}
		}

		// Unregistered agent detection: find tmux sessions with no DB registration.
		// Compares live tmux sessions against sessions.db. Sessions that appear in
		// tmux but not in the DB may be rogue processes or orphaned from a crash.
		// On first sighting, writes a marker file. On subsequent ticks, if the session
		// has been running unregistered for >3 minutes, sends an urgent alert.
		try {
			let projectName: string;
			if (options._projectName !== undefined) {
				projectName = options._projectName;
			} else {
				const config = await loadConfig(root);
				projectName = config.project.name;
			}
			const sessionPrefix = `legio-${projectName}-`;
			const listSessionsFn = options._listTmuxSessions ?? listTmuxSessions;
			const tmuxSessionNames = await listSessionsFn(sessionPrefix);

			// Build set of all registered tmux session names (including completed/zombie)
			const registeredTmuxSessions = new Set(sessions.map((s) => s.tmuxSession));

			// Persistent coordination agents are excluded — they may not always be in sessions.db
			const EXCLUDED_AGENTS = new Set(["coordinator", "gateway", "monitor"]);
			const unregisteredDir = join(legioDir, "unregistered-agents");

			for (const tmuxSession of tmuxSessionNames) {
				if (registeredTmuxSessions.has(tmuxSession)) continue;

				// Extract agent name by stripping the session prefix
				const agentName = tmuxSession.slice(sessionPrefix.length);
				if (EXCLUDED_AGENTS.has(agentName)) continue;

				const markerPath = join(unregisteredDir, `${agentName}.txt`);
				let firstSeenMs: number | null = null;
				try {
					const content = await readFile(markerPath, "utf-8");
					const parsed = Number.parseInt(content.trim(), 10);
					// NaN > 0 is false, so unparseable content falls through to null
					firstSeenMs = parsed > 0 ? parsed : null;
				} catch {
					// Marker doesn't exist — handled below as a first sighting
				}

				if (firstSeenMs === null) {
					// First sighting, or a marker whose content is unusable (empty or
					// corrupt). Write a fresh timestamp; leaving a bad marker in place
					// would mean this session never ages into an alert.
					try {
						await mkdir(unregisteredDir, { recursive: true });
						await writeFile(markerPath, String(Date.now()), "utf-8");
					} catch {
						// Non-fatal: marker write failure
					}
					continue;
				}

				const elapsed = Date.now() - firstSeenMs;
				if (elapsed > 3 * 60 * 1000) {
					// Session has been unregistered for >3 minutes — send alert
					try {
						await sendRecoveryMailFn([
							"--to",
							"coordinator",
							"--subject",
							`Unregistered agent: ${agentName}`,
							"--body",
							`Tmux session ${tmuxSession} has been running for ${Math.round(elapsed / 60000)}min but is not registered in sessions.db. Possible zombie or rogue process.`,
							"--type",
							"error",
							"--priority",
							"urgent",
							"--from",
							"watchman",
						]);
					} catch {
						// Fire-and-forget: mail failure must not break the watchman
					}
					recordEvent(eventStore, {
						runId,
						agentName,
						eventType: "custom",
						level: "warn",
						data: { type: "unregistered_zombie", tmuxSession, elapsedMs: elapsed },
					});
					// Clean up the marker so we don't re-alert on every tick
					try {
						await unlink(markerPath);
					} catch {
						// Non-fatal: marker cleanup failure
					}
				}
			}
		} catch {
			// Non-fatal: unregistered agent detection must not break the daemon
		}
	} finally {
		store.close();
		// Close EventStore only if we created it (not injected)
		if (eventStore && !useInjectedEventStore) {
			try {
				eventStore.close();
			} catch {
				// Non-fatal
			}
		}
	}
}
791
+
792
/**
 * Perform one pass of the mail-delivery watchman. Exported for testing.
 *
 * During a pass the function:
 *  - asks the mail store which agents currently have unread mail;
 *  - drops tracking state for every agent that is no longer in that list;
 *  - for each listed agent, decides whether a nudge is due (never nudged, or
 *    at least `reNudgeIntervalMs` since the previous one, default 10s). When
 *    due, it always writes a pending-nudge marker, and additionally sends a
 *    tmux nudge if the agent shows no session activity within the last 30s
 *    and the idle check reports it idle;
 *  - calls `onNudge` after every due nudge, and `onWarn` on every pass once
 *    the mail has been unread for `warnAfterMs` or longer (default 60s).
 *
 * Marker-write, session-lookup, idle-check and nudge failures are swallowed
 * so a single misbehaving agent cannot abort the pass.
 *
 * @param options - Watchman configuration; the underscore-prefixed members
 *   override the real mail store, nudge, idle-check and marker-write helpers.
 * @param state - Per-agent nudge bookkeeping, mutated in place across passes.
 */
export async function runMailTick(
	options: WatchmanOptions,
	state: Map<string, AgentMailState>,
): Promise<void> {
	const { root, onNudge, onWarn } = options;
	const reNudgeIntervalMs = options.reNudgeIntervalMs ?? 10_000;
	const warnAfterMs = options.warnAfterMs ?? 60_000;
	const sendNudge = options._nudge ?? nudgeAgent;
	const checkIdle = options._isAgentIdle ?? isAgentIdle;
	const markPendingNudge = options._writePendingNudge ?? writePendingNudge;
	// Agents active within this window have hooks that deliver mail on their
	// next tool call, so a tmux nudge would only add noise.
	const recentActivityWindowMs = 30_000;

	// Mail store: use the injected one when supplied, otherwise open mail.db.
	const useInjectedMailStore = options._mailStore !== undefined;
	const mailStore: MailStore = useInjectedMailStore
		? (options._mailStore as MailStore)
		: createMailStore(join(root, ".legio", "mail.db"));

	// Session store gives access to each agent's lastActivity timestamp.
	const legioDir = join(root, ".legio");
	let sessionStore: ReturnType<typeof openSessionStore>["store"] | null = null;
	try {
		sessionStore = openSessionStore(legioDir).store;
	} catch {
		// Non-fatal: without a session store every agent counts as "not recently
		// active", leaving the idle check as the only gate.
	}

	// True when the session store records activity for the agent inside the
	// recent-activity window. Any lookup problem is treated as "not active".
	const hasRecentActivity = (agentName: string, nowMs: number): boolean => {
		if (!sessionStore) {
			return false;
		}
		try {
			const session = sessionStore.getByName(agentName);
			if (!session?.lastActivity) {
				return false;
			}
			const ageMs = nowMs - new Date(session.lastActivity).getTime();
			return ageMs < recentActivityWindowMs;
		} catch {
			// Non-fatal: session lookup failure
			return false;
		}
	};

	try {
		const unreadAgents = mailStore.getAgentsWithUnread();
		const unreadLookup = new Set(unreadAgents);

		// Forget agents whose mailbox has been emptied since the last pass.
		for (const trackedName of state.keys()) {
			if (!unreadLookup.has(trackedName)) {
				state.delete(trackedName);
			}
		}

		const now = Date.now();

		for (const agentName of unreadAgents) {
			let tracked = state.get(agentName);
			if (!tracked) {
				// Newly observed unread mail: nudgeCount of 0 forces an immediate nudge.
				tracked = { firstSeenAt: now, lastNudgeAt: 0, nudgeCount: 0 };
				state.set(agentName, tracked);
			}

			const sinceLastNudgeMs = now - tracked.lastNudgeAt;
			const nudgeDue = tracked.nudgeCount === 0 || sinceLastNudgeMs >= reNudgeIntervalMs;

			if (nudgeDue) {
				// The marker file is the fallback channel, written regardless of activity.
				try {
					await markPendingNudge(root, agentName, {
						from: "watchman",
						reason: "unread mail",
						subject: "You have unread mail",
						messageId: "",
					});
				} catch {
					// Non-fatal: pending marker write failure
				}

				// tmux nudge only for agents that are both quiet and idle.
				if (!hasRecentActivity(agentName, now)) {
					try {
						if (await checkIdle(root, agentName)) {
							await sendNudge(
								root,
								agentName,
								"[watchman] You have unread mail. Run: legio mail check",
								true, // force: skip debounce
							);
						}
					} catch {
						// Non-fatal: idle check or nudge failure
					}
				}

				tracked.lastNudgeAt = now;
				tracked.nudgeCount++;

				if (onNudge) {
					onNudge(agentName, tracked.nudgeCount);
				}
			}

			// Escalate once the mail has gone unread for too long.
			const unreadForMs = now - tracked.firstSeenAt;
			if (onWarn && unreadForMs >= warnAfterMs) {
				onWarn(agentName, unreadForMs);
			}
		}
	} finally {
		// An injected mail store belongs to the caller; only close our own.
		if (!useInjectedMailStore) {
			mailStore.close();
		}
		if (sessionStore) {
			try {
				sessionStore.close();
			} catch {
				// Non-fatal
			}
		}
	}
}