@katyella/legio 0.1.3 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +61 -3
- package/README.md +21 -10
- package/agents/builder.md +11 -10
- package/agents/coordinator.md +36 -27
- package/agents/cto.md +9 -8
- package/agents/gateway.md +28 -12
- package/agents/lead.md +45 -30
- package/agents/merger.md +4 -4
- package/agents/monitor.md +10 -9
- package/agents/reviewer.md +8 -8
- package/agents/scout.md +10 -10
- package/agents/supervisor.md +60 -45
- package/package.json +2 -2
- package/src/agents/hooks-deployer.test.ts +46 -41
- package/src/agents/hooks-deployer.ts +10 -9
- package/src/agents/manifest.test.ts +6 -2
- package/src/agents/overlay.test.ts +9 -7
- package/src/agents/overlay.ts +29 -7
- package/src/commands/agents.test.ts +1 -5
- package/src/commands/clean.test.ts +2 -5
- package/src/commands/clean.ts +25 -1
- package/src/commands/completions.test.ts +1 -1
- package/src/commands/completions.ts +26 -7
- package/src/commands/coordinator.test.ts +87 -82
- package/src/commands/coordinator.ts +94 -48
- package/src/commands/costs.test.ts +2 -6
- package/src/commands/dashboard.test.ts +2 -5
- package/src/commands/doctor.test.ts +2 -6
- package/src/commands/down.ts +3 -3
- package/src/commands/errors.test.ts +2 -6
- package/src/commands/feed.test.ts +2 -6
- package/src/commands/gateway.test.ts +43 -17
- package/src/commands/gateway.ts +101 -11
- package/src/commands/hooks.test.ts +2 -5
- package/src/commands/init.test.ts +4 -13
- package/src/commands/inspect.test.ts +2 -6
- package/src/commands/log.test.ts +2 -6
- package/src/commands/logs.test.ts +2 -9
- package/src/commands/mail.test.ts +76 -215
- package/src/commands/mail.ts +43 -187
- package/src/commands/metrics.test.ts +3 -10
- package/src/commands/nudge.ts +15 -0
- package/src/commands/prime.test.ts +4 -11
- package/src/commands/replay.test.ts +2 -6
- package/src/commands/server.test.ts +1 -5
- package/src/commands/server.ts +1 -1
- package/src/commands/sling.test.ts +6 -1
- package/src/commands/sling.ts +42 -17
- package/src/commands/spec.test.ts +2 -5
- package/src/commands/status.test.ts +2 -4
- package/src/commands/stop.test.ts +2 -5
- package/src/commands/supervisor.ts +6 -6
- package/src/commands/trace.test.ts +2 -6
- package/src/commands/up.test.ts +43 -9
- package/src/commands/up.ts +15 -11
- package/src/commands/watchman.ts +327 -0
- package/src/commands/worktree.test.ts +2 -6
- package/src/config.test.ts +34 -104
- package/src/config.ts +120 -32
- package/src/doctor/agents.test.ts +52 -2
- package/src/doctor/agents.ts +4 -2
- package/src/doctor/config-check.test.ts +7 -2
- package/src/doctor/consistency.test.ts +7 -2
- package/src/doctor/databases.test.ts +6 -2
- package/src/doctor/dependencies.test.ts +18 -13
- package/src/doctor/dependencies.ts +23 -94
- package/src/doctor/logs.test.ts +7 -2
- package/src/doctor/merge-queue.test.ts +6 -2
- package/src/doctor/structure.test.ts +7 -2
- package/src/doctor/version.test.ts +7 -2
- package/src/e2e/init-sling-lifecycle.test.ts +2 -5
- package/src/index.ts +7 -7
- package/src/mail/pending.ts +120 -0
- package/src/mail/store.test.ts +89 -0
- package/src/mail/store.ts +11 -0
- package/src/merge/resolver.test.ts +518 -489
- package/src/server/index.ts +33 -2
- package/src/server/public/app.js +3 -3
- package/src/server/public/components/message-bubble.js +11 -1
- package/src/server/public/components/terminal-panel.js +66 -74
- package/src/server/public/views/chat.js +18 -2
- package/src/server/public/views/costs.js +5 -5
- package/src/server/public/views/dashboard.js +80 -51
- package/src/server/public/views/gateway-chat.js +37 -131
- package/src/server/public/views/inspect.js +16 -4
- package/src/server/public/views/issues.js +16 -12
- package/src/server/routes.test.ts +55 -39
- package/src/server/routes.ts +38 -26
- package/src/test-helpers.ts +6 -3
- package/src/tracker/beads.ts +159 -0
- package/src/tracker/exec.ts +44 -0
- package/src/tracker/factory.test.ts +283 -0
- package/src/tracker/factory.ts +59 -0
- package/src/tracker/seeds.ts +156 -0
- package/src/tracker/types.ts +46 -0
- package/src/types.ts +11 -2
- package/src/{watchdog → watchman}/daemon.test.ts +421 -515
- package/src/watchman/daemon.ts +940 -0
- package/src/worktree/tmux.test.ts +2 -1
- package/src/worktree/tmux.ts +4 -4
- package/templates/hooks.json.tmpl +17 -17
- package/src/beads/client.test.ts +0 -210
- package/src/commands/merge.test.ts +0 -676
- package/src/commands/watch.test.ts +0 -152
- package/src/commands/watch.ts +0 -238
- package/src/test-helpers.test.ts +0 -97
- package/src/watchdog/daemon.ts +0 -533
- package/src/watchdog/health.test.ts +0 -371
- package/src/watchdog/triage.test.ts +0 -162
- package/src/worktree/manager.test.ts +0 -444
- /package/src/{watchdog → watchman}/health.ts +0 -0
- /package/src/{watchdog → watchman}/triage.ts +0 -0
|
@@ -0,0 +1,940 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Unified daemon ("Watchman") — health monitoring + mail delivery + beacon safety net.
|
|
3
|
+
*
|
|
4
|
+
* Combines three responsibilities into a single process:
|
|
5
|
+
* 1. Health tick (default 30s): session health checks, zombie detection, boot timeout, recovery
|
|
6
|
+
* 2. Mail tick (default 5s): poll for unread mail, nudge agents
|
|
7
|
+
* 3. Beacon safety net (inside health tick): detect stuck beacons and send follow-up Enter
|
|
8
|
+
*
|
|
9
|
+
* Phase 4 tier numbering:
|
|
10
|
+
* Tier 0 = Mechanical daemon (this file)
|
|
11
|
+
* Tier 1 = Triage agent (triage.ts)
|
|
12
|
+
* Tier 2 = Monitor agent (not yet implemented)
|
|
13
|
+
* Tier 3 = Supervisor monitors (per-project)
|
|
14
|
+
*
|
|
15
|
+
* ZFC Principle: Observable state (tmux alive, pid alive) is the source of
|
|
16
|
+
* truth. See health.ts for the full ZFC documentation.
|
|
17
|
+
*/
|
|
18
|
+
|
|
19
|
+
import { spawn } from "node:child_process";
|
|
20
|
+
import { mkdir, readFile, unlink, writeFile } from "node:fs/promises";
|
|
21
|
+
import { join } from "node:path";
|
|
22
|
+
import { loadCheckpoint } from "../agents/checkpoint.ts";
|
|
23
|
+
import { nudgeAgent } from "../commands/nudge.ts";
|
|
24
|
+
import { loadConfig } from "../config.ts";
|
|
25
|
+
import { createEventStore } from "../events/store.ts";
|
|
26
|
+
import { isAgentIdle, writePendingNudge } from "../mail/pending.ts";
|
|
27
|
+
import { createMailStore, type MailStore } from "../mail/store.ts";
|
|
28
|
+
import { createMulchClient } from "../mulch/client.ts";
|
|
29
|
+
import { openSessionStore } from "../sessions/compat.ts";
|
|
30
|
+
import type { AgentSession, EventStore, HealthCheck, SessionCheckpoint } from "../types.ts";
|
|
31
|
+
import { capturePaneContent, isSessionAlive, killSession, sendKeys } from "../worktree/tmux.ts";
|
|
32
|
+
import { evaluateHealth, transitionState } from "./health.ts";
|
|
33
|
+
|
|
34
|
+
/**
 * Persist a record of an agent failure through the mulch client.
 *
 * Best-effort: any error raised while creating the client, building the
 * description, or recording is swallowed silently, so the returned promise
 * never rejects and the watchman keeps running.
 *
 * @param root - Project root directory, handed to createMulchClient
 * @param session - The agent session that failed
 * @param reason - Human-readable failure reason
 * @param tier - Which watchman tier detected the failure (0 or 1)
 * @param triageSuggestion - Optional triage verdict; only included in the
 *   recorded description when it is a non-empty string
 */
async function recordFailure(
  root: string,
  session: AgentSession,
  reason: string,
  tier: 0 | 1,
  triageSuggestion?: string,
): Promise<void> {
  try {
    const client = createMulchClient(root);
    const detectedBy = tier === 0 ? "Tier 0 (process death)" : "Tier 1 (AI triage)";

    // Assemble the description line by line; the triage line is optional.
    const lines: string[] = [
      `Agent: ${session.agentName}`,
      `Capability: ${session.capability}`,
      `Failure reason: ${reason}`,
    ];
    if (triageSuggestion) {
      lines.push(`Triage suggestion: ${triageSuggestion}`);
    }
    lines.push(`Detected by: ${detectedBy}`);

    await client.record("agents", {
      type: "failure",
      description: lines.join("\n"),
      tags: ["watchdog", "auto-recorded"],
      evidenceBead: session.beadId || undefined,
    });
  } catch {
    // Deliberately ignored: failing to record must not break the watchman
  }
}
|
|
74
|
+
|
|
75
|
+
/**
 * Look up the active run ID stored in `current-run.txt` inside `legioDir`.
 *
 * @param legioDir - Directory expected to contain `current-run.txt`
 * @returns The trimmed file contents, or null when the file cannot be read
 *   or contains only whitespace
 */
async function readCurrentRunId(legioDir: string): Promise<string | null> {
  const runFile = join(legioDir, "current-run.txt");

  let contents: string;
  try {
    contents = await readFile(runFile, "utf-8");
  } catch {
    // Unreadable or missing file is treated as "no active run"
    return null;
  }

  const runId = contents.trim();
  if (runId.length === 0) {
    return null;
  }
  return runId;
}
|
|
88
|
+
|
|
89
|
+
/**
 * Best-effort insert of a daemon event into the EventStore.
 *
 * Does nothing when no store is supplied. The event's `data` payload is
 * serialized with JSON.stringify, and the session/tool fields of the inserted
 * row are always null. Any error raised while building or inserting the row
 * is swallowed, so this function never throws.
 *
 * @param eventStore - Destination store, or null to skip recording
 * @param event - Run ID, agent name, event type, level and data payload
 */
function recordEvent(
  eventStore: EventStore | null,
  event: {
    runId: string | null;
    agentName: string;
    eventType: "custom" | "mail_sent";
    level: "debug" | "info" | "warn" | "error";
    data: Record<string, unknown>;
  },
): void {
  if (!eventStore) {
    return;
  }

  try {
    const { runId, agentName, eventType, level, data } = event;
    eventStore.insert({
      runId,
      agentName,
      sessionId: null,
      eventType,
      toolName: null,
      toolArgs: null,
      toolDurationMs: null,
      level,
      data: JSON.stringify(data),
    });
  } catch {
    // Deliberately ignored: event recording must never break the daemon
  }
}
|
|
119
|
+
|
|
120
|
+
/**
 * Load the persisted recovery attempt counter for an agent.
 *
 * The counter lives in `<agentsDir>/<agentName>/recovery-count`.
 *
 * @param agentsDir - Directory holding one sub-directory per agent
 * @param agentName - Agent whose counter should be read
 * @returns The parsed base-10 integer, or 0 when the file cannot be read or
 *   does not start with a number
 */
async function readRecoveryCount(agentsDir: string, agentName: string): Promise<number> {
  let raw: string;
  try {
    raw = await readFile(join(agentsDir, agentName, "recovery-count"), "utf-8");
  } catch {
    return 0;
  }

  const attempts = parseInt(raw.trim(), 10);
  // NaN (unparseable contents) collapses to 0 here
  return attempts || 0;
}
|
|
132
|
+
|
|
133
|
+
/**
 * Persist the recovery attempt counter for an agent.
 *
 * Writes the number as text to `<agentsDir>/<agentName>/recovery-count`,
 * creating the agent directory (and any missing parents) first.
 *
 * @param agentsDir - Directory holding one sub-directory per agent
 * @param agentName - Agent whose counter should be written
 * @param count - Value to store
 */
async function writeRecoveryCount(
  agentsDir: string,
  agentName: string,
  count: number,
): Promise<void> {
  const agentDir = join(agentsDir, agentName);
  await mkdir(agentDir, { recursive: true });

  const counterFile = join(agentDir, "recovery-count");
  const serialized = String(count);
  await writeFile(counterFile, serialized, "utf-8");
}
|
|
146
|
+
|
|
147
|
+
/**
 * Default sling implementation: spawn `legio sling` as a subprocess.
 *
 * The returned promise never rejects. It resolves with the child's exit code
 * and everything it wrote to stderr. If the process cannot be spawned at all
 * (for example `legio` is not on PATH), it resolves with exit code 1 and the
 * spawn error message appended to stderr.
 *
 * @param args - Arguments placed after `sling` on the command line
 * @param root - Working directory for the subprocess
 * @returns The exit code (1 when none is reported) and captured stderr text
 */
async function reSling(
  args: string[],
  root: string,
): Promise<{ exitCode: number; stderr: string }> {
  return new Promise((resolve) => {
    const proc = spawn("legio", ["sling", ...args], {
      cwd: root,
      stdio: ["ignore", "pipe", "pipe"],
    });

    // stdout is piped but unused; drain it so a chatty child cannot block on a full pipe.
    proc.stdout?.resume();

    let stderr = "";
    proc.stderr?.on("data", (chunk: Buffer) => {
      stderr += chunk.toString();
    });

    // Spawn failures are reported through 'error'. Without a listener the event
    // is thrown as an uncaught exception and would take the whole daemon down.
    proc.on("error", (err: Error) => {
      const detail = stderr.length > 0 ? `${stderr}\n${err.message}` : err.message;
      resolve({ exitCode: 1, stderr: detail });
    });

    // A later 'close' after 'error' is harmless: the promise is already settled.
    proc.on("close", (code) => resolve({ exitCode: code ?? 1, stderr }));
  });
}
|
|
166
|
+
|
|
167
|
+
/**
 * Default recovery mail implementation: spawn `legio mail send` as a subprocess.
 *
 * All stdio is ignored and the exit code is not inspected. The returned
 * promise resolves once the process closes, or as soon as spawning fails; it
 * never rejects.
 *
 * @param args - Arguments placed after `mail send` on the command line
 * @param root - Working directory for the subprocess
 */
async function sendMailSubprocess(args: string[], root: string): Promise<void> {
  return new Promise((resolve) => {
    const proc = spawn("legio", ["mail", "send", ...args], {
      cwd: root,
      stdio: ["ignore", "ignore", "ignore"],
    });

    // Spawn failures (e.g. `legio` not on PATH) are reported through 'error'.
    // Without a listener the event is thrown as an uncaught exception and
    // would crash the daemon, so treat it like any other failed delivery.
    proc.on("error", () => resolve());
    proc.on("close", () => resolve());
  });
}
|
|
179
|
+
|
|
180
|
+
/**
 * Try to bring a dead agent back by re-slinging it from its saved checkpoint.
 *
 * Steps, in order:
 *  1. Load the checkpoint; give up if loading throws or nothing is stored.
 *  2. Read the on-disk recovery counter. Once it has reached
 *     `maxRecoveryAttempts`, mail the parent agent (if any) an escalation
 *     and give up.
 *  3. Bump the counter on disk, record a `recovery_attempt` event and mail
 *     the parent agent (if any) about the attempt.
 *  4. Run `sling` with arguments rebuilt from the checkpoint and session,
 *     then record `recovery_success` or `recovery_failed`.
 *
 * Mail failures and counter-write failures are ignored; a throwing `sling`
 * is reported as a failed recovery.
 *
 * @param options - Session to recover, `.legio` directory, attempt limit,
 *   event sink, run ID, and the injected sling / checkpoint / mail functions
 * @returns `{ recovered: true }` if sling exited with code 0, `{ recovered: false }` otherwise.
 */
async function attemptRecovery(options: {
  session: AgentSession;
  legioDir: string;
  root: string;
  maxRecoveryAttempts: number;
  eventStore: EventStore | null;
  runId: string | null;
  sling: (args: string[]) => Promise<{ exitCode: number; stderr: string }>;
  loadCheckpointFn: (agentsDir: string, agentName: string) => Promise<SessionCheckpoint | null>;
  sendRecoveryMail: (args: string[]) => Promise<void>;
}): Promise<{ recovered: boolean }> {
  const {
    session,
    legioDir,
    maxRecoveryAttempts,
    eventStore,
    runId,
    sling,
    loadCheckpointFn,
    sendRecoveryMail,
  } = options;
  const agentsDir = join(legioDir, "agents");
  const agentName = session.agentName;

  // Mail the parent agent when there is one; delivery errors are ignored.
  const mailParent = async (subject: string, body: string, extra: string[]): Promise<void> => {
    if (!session.parentAgent) {
      return;
    }
    try {
      await sendRecoveryMail([
        "--to",
        session.parentAgent,
        "--subject",
        subject,
        "--body",
        body,
        ...extra,
        "--from",
        "watchman",
      ]);
    } catch {
      // Fire-and-forget: mail failure must not break the watchman
    }
  };

  // Record a custom event for this agent in the current run.
  const logEvent = (level: "info" | "error", data: Record<string, unknown>): void => {
    recordEvent(eventStore, { runId, agentName, eventType: "custom", level, data });
  };

  // Step 1: without a checkpoint there is nothing to recover from.
  let checkpoint: SessionCheckpoint | null;
  try {
    checkpoint = await loadCheckpointFn(agentsDir, agentName);
  } catch {
    return { recovered: false };
  }
  if (!checkpoint) {
    return { recovered: false };
  }

  // Step 2: stop and escalate once the retry budget is spent.
  const previousAttempts = await readRecoveryCount(agentsDir, agentName);
  if (previousAttempts >= maxRecoveryAttempts) {
    await mailParent(
      `Recovery failed: ${agentName}`,
      `Auto-recovery exhausted for ${agentName} after ${previousAttempts} attempts. Agent marked zombie.`,
      ["--type", "error", "--priority", "high"],
    );
    return { recovered: false };
  }

  // Step 3: count this attempt before running it, then announce it.
  const attempt = previousAttempts + 1;
  try {
    await writeRecoveryCount(agentsDir, agentName, attempt);
  } catch {
    // Non-fatal: proceed with recovery even if count write fails
  }

  logEvent("info", { type: "recovery_attempt", attempt, maxAttempts: maxRecoveryAttempts });

  await mailParent(
    `Recovery: ${agentName}`,
    `Watchman attempting auto-recovery from checkpoint for ${agentName} (attempt ${attempt}/${maxRecoveryAttempts}).`,
    ["--type", "health_check"],
  );

  // Step 4: rebuild the sling command line from checkpoint + session.
  const slingArgs: string[] = [
    checkpoint.beadId,
    "--capability",
    session.capability,
    "--name",
    agentName,
    "--spec",
    join(legioDir, "specs", `${checkpoint.beadId}.md`),
    ...(checkpoint.filesModified.length > 0
      ? ["--files", checkpoint.filesModified.join(",")]
      : []),
    ...(session.parentAgent ? ["--parent", session.parentAgent] : []),
    "--depth",
    String(session.depth),
  ];

  try {
    const { exitCode, stderr } = await sling(slingArgs);
    const succeeded = exitCode === 0;
    if (succeeded) {
      logEvent("info", { type: "recovery_success", attempt });
    } else {
      logEvent("error", { type: "recovery_failed", attempt, stderr });
    }
    return { recovered: succeeded };
  } catch {
    // sling itself threw: report the failure without stderr details
    logEvent("error", { type: "recovery_failed", attempt });
    return { recovered: false };
  }
}
|
|
339
|
+
|
|
340
|
+
/**
 * Ask tmux for its session names and keep those that start with `prefix`.
 *
 * Runs `tmux list-sessions -F "#{session_name}"`. Resolves with an empty
 * array when tmux cannot be spawned or exits with a non-zero code; the
 * returned promise never rejects.
 *
 * @param prefix - Session-name prefix to match
 * @returns Trimmed, non-empty session names beginning with `prefix`
 */
async function listTmuxSessions(prefix: string): Promise<string[]> {
  return new Promise((resolve) => {
    const child = spawn("tmux", ["list-sessions", "-F", "#{session_name}"], {
      stdio: ["ignore", "pipe", "pipe"],
    });

    const pieces: string[] = [];
    child.stdout?.on("data", (chunk: Buffer) => {
      pieces.push(chunk.toString());
    });

    child.on("close", (code) => {
      if (code !== 0) {
        resolve([]);
        return;
      }

      const matching: string[] = [];
      for (const rawLine of pieces.join("").split("\n")) {
        const name = rawLine.trim();
        if (name.length > 0 && name.startsWith(prefix)) {
          matching.push(name);
        }
      }
      resolve(matching);
    });

    // tmux missing or not executable: report "no sessions" instead of failing
    child.on("error", () => resolve([]));
  });
}
|
|
367
|
+
|
|
368
|
+
/**
 * Substrings searched for in captured tmux pane content. Finding any of them
 * is taken to mean the agent is actively working rather than stuck at the
 * prompt.
 */
const ACTIVITY_MARKERS = [
  "⏺",
  "Claude",
  "Reading",
  "Searching",
  "Editing",
  "Writing",
];
|
|
370
|
+
|
|
371
|
+
/**
 * Bookkeeping kept per agent while it has unread mail awaiting delivery.
 *
 * NOTE(review): the fields are maintained by the mail tick, which is not
 * visible from here; the descriptions below are inferred from the names and
 * should be confirmed against that code.
 */
export interface AgentMailState {
  /** Presumably the timestamp at which unread mail was first noticed — TODO confirm unit (likely epoch ms). */
  firstSeenAt: number;
  /** Presumably the timestamp of the most recent nudge — TODO confirm unit (likely epoch ms). */
  lastNudgeAt: number;
  /** Presumably how many nudges have been sent for the current unread mail. */
  nudgeCount: number;
}
|
|
377
|
+
|
|
378
|
+
/**
 * Configuration accepted by startDaemon, runDaemonTick and runMailTick.
 *
 * Members prefixed with an underscore are dependency-injection seams intended
 * for tests; when one is omitted the real implementation is used.
 */
export interface WatchmanOptions {
  /** Project root; the `.legio` directory is resolved relative to it. */
  root: string;
  /** Zombie threshold in milliseconds, forwarded to the health evaluation. */
  zombieThresholdMs: number;
  /** Invoked with the health check result computed for a session. */
  onHealthCheck?: (check: HealthCheck) => void;

  // --- Health tick: injection seams and tuning ---

  /** Test seam: replaces the tmux liveness check and session kill. */
  _tmux?: {
    isSessionAlive: (name: string) => Promise<boolean>;
    killSession: (name: string) => Promise<void>;
  };
  /**
   * Test seam: replaces EventStore creation. Any value other than
   * `undefined` (including `null`) is used as-is.
   */
  _eventStore?: EventStore | null;
  /** Test seam: replaces the built-in failure recorder. */
  _recordFailure?: (
    root: string,
    session: AgentSession,
    reason: string,
    tier: 0 | 1,
    triageSuggestion?: string,
  ) => Promise<void>;
  /** Recovery attempts allowed per agent before escalating (default: 1). */
  maxRecoveryAttempts?: number;
  /** Test seam: replaces the `legio sling` subprocess spawn. */
  _sling?: (args: string[]) => Promise<{ exitCode: number; stderr: string }>;
  /** Test seam: replaces checkpoint loading. */
  _loadCheckpoint?: (agentsDir: string, agentName: string) => Promise<SessionCheckpoint | null>;
  /** Test seam: replaces mail sending for recovery notifications. */
  _sendRecoveryMail?: (args: string[]) => Promise<void>;
  /**
   * How long, in milliseconds, an agent may stay in the "booting" state
   * (default: 90000). Past this threshold the agent is marked zombie and an
   * urgent alert is mailed to its parent.
   */
  bootTimeoutMs?: number;
  /** Test seam: replaces tmux session listing for unregistered agent detection. */
  _listTmuxSessions?: (prefix: string) => Promise<string[]>;
  /** Test seam: supplies the project name directly (bypasses loadConfig). */
  _projectName?: string;

  // --- Mail delivery fields ---

  /** How often to poll for unread mail, in ms (default 5_000). */
  mailIntervalMs?: number;
  /** Minimum gap between repeated nudges to the same agent, in ms (default 10_000). */
  reNudgeIntervalMs?: number;
  /** Unread mail older than this many ms triggers a warning (default 60_000). */
  warnAfterMs?: number;
  /** Booting time, in ms, after which the beacon safety net kicks in (default 20_000). */
  beaconNudgeMs?: number;
  /** Invoked when an agent is nudged about unread mail. */
  onNudge?: (agentName: string, nudgeCount: number) => void;
  /** Invoked when an agent's mail has stayed unread for too long. */
  onWarn?: (agentName: string, unreadSinceMs: number) => void;
  /** Test seam: replaces MailStore creation. */
  _mailStore?: MailStore;
  /** Test seam: replaces nudge delivery. */
  _nudge?: (
    projectRoot: string,
    agentName: string,
    message: string,
    force: boolean,
  ) => Promise<{ delivered: boolean; reason?: string }>;
  /** Test seam: replaces the isAgentIdle check. */
  _isAgentIdle?: (cwd: string, agentName: string) => Promise<boolean>;
  /** Test seam: replaces writePendingNudge. */
  _writePendingNudge?: (
    cwd: string,
    agentName: string,
    nudge: { from: string; reason: string; subject: string; messageId: string },
  ) => Promise<void>;
  /** Test seam: replaces capturePaneContent. */
  _capturePaneContent?: (sessionName: string) => Promise<string>;
  /** Test seam: replaces sendKeys. */
  _sendKeys?: (sessionName: string, keys: string) => Promise<void>;
}
|
|
453
|
+
|
|
454
|
+
/**
 * Launch the unified watchman daemon: agent health monitoring plus mail delivery.
 *
 * Two timers run independently inside the same process:
 * - Health: runDaemonTick once right away, then every `intervalMs`
 * - Mail: runMailTick once right away, then every `mailIntervalMs` (default 5_000)
 *
 * Every mail tick shares one per-agent state map. A rejected tick is
 * swallowed so that a single failure cannot crash the daemon.
 *
 * @param options - Watchman options plus the health tick interval in ms
 * @returns An object with a `stop` function to halt both intervals
 */
export function startDaemon(options: WatchmanOptions & { intervalMs: number }): {
  stop: () => void;
} {
  const healthPeriodMs = options.intervalMs;
  const mailPeriodMs = options.mailIntervalMs ?? 5_000;
  const mailState = new Map<string, AgentMailState>();

  const ignoreTickFailure = (): void => {
    // Swallow tick errors — daemon must not crash
  };
  const healthTick = (): void => {
    runDaemonTick(options).catch(ignoreTickFailure);
  };
  const mailTick = (): void => {
    runMailTick(options, mailState).catch(ignoreTickFailure);
  };

  // Health: first tick right now, later ticks on the timer
  healthTick();
  const healthTimer = setInterval(healthTick, healthPeriodMs);

  // Mail: first tick right now, later ticks on the timer
  mailTick();
  const mailTimer = setInterval(mailTick, mailPeriodMs);

  return {
    stop(): void {
      clearInterval(healthTimer);
      clearInterval(mailTimer);
    },
  };
}
|
|
499
|
+
|
|
500
|
+
/**
|
|
501
|
+
* Run a single health daemon tick. Exported for testing — allows direct invocation
|
|
502
|
+
* of the monitoring logic without starting the interval-based daemon loop.
|
|
503
|
+
*
|
|
504
|
+
* @param options - Same options as startDaemon (minus intervalMs)
|
|
505
|
+
*/
|
|
506
|
+
export async function runDaemonTick(options: WatchmanOptions): Promise<void> {
|
|
507
|
+
const { root, zombieThresholdMs, onHealthCheck } = options;
|
|
508
|
+
const tmux = options._tmux ?? { isSessionAlive, killSession };
|
|
509
|
+
const recordFailureFn = options._recordFailure ?? recordFailure;
|
|
510
|
+
const maxRecoveryAttempts = options.maxRecoveryAttempts ?? 1;
|
|
511
|
+
const slingFn = options._sling ?? ((args: string[]) => reSling(args, root));
|
|
512
|
+
const loadCheckpointFn = options._loadCheckpoint ?? loadCheckpoint;
|
|
513
|
+
const sendRecoveryMailFn =
|
|
514
|
+
options._sendRecoveryMail ?? ((args: string[]) => sendMailSubprocess(args, root));
|
|
515
|
+
const beaconNudgeMs = options.beaconNudgeMs ?? 20_000;
|
|
516
|
+
const capturePaneFn = options._capturePaneContent ?? capturePaneContent;
|
|
517
|
+
const sendKeysFn = options._sendKeys ?? sendKeys;
|
|
518
|
+
|
|
519
|
+
const legioDir = join(root, ".legio");
|
|
520
|
+
const { store } = openSessionStore(legioDir);
|
|
521
|
+
|
|
522
|
+
// Open EventStore for recording daemon events (fire-and-forget)
|
|
523
|
+
let eventStore: EventStore | null = null;
|
|
524
|
+
let runId: string | null = null;
|
|
525
|
+
const useInjectedEventStore = options._eventStore !== undefined;
|
|
526
|
+
if (useInjectedEventStore) {
|
|
527
|
+
eventStore = options._eventStore ?? null;
|
|
528
|
+
} else {
|
|
529
|
+
try {
|
|
530
|
+
const eventsDbPath = join(legioDir, "events.db");
|
|
531
|
+
eventStore = createEventStore(eventsDbPath);
|
|
532
|
+
} catch {
|
|
533
|
+
// EventStore creation failure is non-fatal for the daemon
|
|
534
|
+
}
|
|
535
|
+
}
|
|
536
|
+
try {
|
|
537
|
+
runId = await readCurrentRunId(legioDir);
|
|
538
|
+
} catch {
|
|
539
|
+
// Reading run ID failure is non-fatal
|
|
540
|
+
}
|
|
541
|
+
|
|
542
|
+
try {
|
|
543
|
+
const thresholds = {
|
|
544
|
+
zombieMs: zombieThresholdMs,
|
|
545
|
+
};
|
|
546
|
+
|
|
547
|
+
const sessions = store.getAll();
|
|
548
|
+
|
|
549
|
+
for (const session of sessions) {
|
|
550
|
+
// Skip completed sessions — they are terminal and don't need monitoring
|
|
551
|
+
if (session.state === "completed") {
|
|
552
|
+
continue;
|
|
553
|
+
}
|
|
554
|
+
|
|
555
|
+
// ZFC: Don't skip zombies. Re-check tmux liveness on every tick.
|
|
556
|
+
// A zombie with a live tmux session needs investigation, not silence.
|
|
557
|
+
|
|
558
|
+
const tmuxAlive = await tmux.isSessionAlive(session.tmuxSession);
|
|
559
|
+
const check = evaluateHealth(session, tmuxAlive, thresholds);
|
|
560
|
+
|
|
561
|
+
// Boot timeout detection: agent stuck in booting state beyond threshold.
|
|
562
|
+
// Fires when tmux is alive (the session started) but the agent never
|
|
563
|
+
// transitioned out of "booting" within the allowed window.
|
|
564
|
+
if (session.state === "booting" && tmuxAlive) {
|
|
565
|
+
const bootElapsed = Date.now() - new Date(session.startedAt).getTime();
|
|
566
|
+
const bootTimeoutMs = options.bootTimeoutMs ?? 90_000;
|
|
567
|
+
|
|
568
|
+
// Beacon safety net: if the agent has been booting longer than
|
|
569
|
+
// beaconNudgeMs but less than bootTimeoutMs, check if the beacon
|
|
570
|
+
// is stuck in the tmux input buffer and send a follow-up Enter.
|
|
571
|
+
if (bootElapsed > beaconNudgeMs && bootElapsed <= bootTimeoutMs) {
|
|
572
|
+
try {
|
|
573
|
+
const paneContent = await capturePaneFn(session.tmuxSession);
|
|
574
|
+
const hasActivity = ACTIVITY_MARKERS.some((marker) => paneContent.includes(marker));
|
|
575
|
+
if (!hasActivity && paneContent.trim().length > 0) {
|
|
576
|
+
await sendKeysFn(session.tmuxSession, "");
|
|
577
|
+
recordEvent(eventStore, {
|
|
578
|
+
runId,
|
|
579
|
+
agentName: session.agentName,
|
|
580
|
+
eventType: "custom",
|
|
581
|
+
level: "info",
|
|
582
|
+
data: {
|
|
583
|
+
type: "beacon_nudge",
|
|
584
|
+
bootElapsedMs: bootElapsed,
|
|
585
|
+
beaconNudgeMs,
|
|
586
|
+
},
|
|
587
|
+
});
|
|
588
|
+
}
|
|
589
|
+
} catch {
|
|
590
|
+
// Non-fatal: beacon nudge failure must not break the daemon
|
|
591
|
+
}
|
|
592
|
+
}
|
|
593
|
+
|
|
594
|
+
if (bootElapsed > bootTimeoutMs) {
|
|
595
|
+
const notifyTarget = session.parentAgent ?? "coordinator";
|
|
596
|
+
try {
|
|
597
|
+
await sendRecoveryMailFn([
|
|
598
|
+
"--to",
|
|
599
|
+
notifyTarget,
|
|
600
|
+
"--subject",
|
|
601
|
+
`Boot timeout: ${session.agentName}`,
|
|
602
|
+
"--body",
|
|
603
|
+
`Agent ${session.agentName} stuck in booting state for ${Math.round(bootElapsed / 1000)}s (threshold: ${Math.round(bootTimeoutMs / 1000)}s). Marking zombie.`,
|
|
604
|
+
"--type",
|
|
605
|
+
"error",
|
|
606
|
+
"--priority",
|
|
607
|
+
"urgent",
|
|
608
|
+
"--from",
|
|
609
|
+
"watchman",
|
|
610
|
+
]);
|
|
611
|
+
} catch {
|
|
612
|
+
// Fire-and-forget: mail failure must not break the watchman
|
|
613
|
+
}
|
|
614
|
+
recordEvent(eventStore, {
|
|
615
|
+
runId,
|
|
616
|
+
agentName: session.agentName,
|
|
617
|
+
eventType: "custom",
|
|
618
|
+
level: "warn",
|
|
619
|
+
data: { type: "boot_timeout", bootElapsedMs: bootElapsed, bootTimeoutMs },
|
|
620
|
+
});
|
|
621
|
+
store.updateState(session.agentName, "zombie");
|
|
622
|
+
session.state = "zombie";
|
|
623
|
+
if (onHealthCheck) {
|
|
624
|
+
onHealthCheck(check);
|
|
625
|
+
}
|
|
626
|
+
continue;
|
|
627
|
+
}
|
|
628
|
+
}
|
|
629
|
+
|
|
630
|
+
// Transition state forward only (investigate action holds state)
|
|
631
|
+
const newState = transitionState(session.state, check);
|
|
632
|
+
if (newState !== session.state) {
|
|
633
|
+
store.updateState(session.agentName, newState);
|
|
634
|
+
session.state = newState;
|
|
635
|
+
}
|
|
636
|
+
|
|
637
|
+
if (onHealthCheck) {
|
|
638
|
+
onHealthCheck(check);
|
|
639
|
+
}
|
|
640
|
+
|
|
641
|
+
if (check.action === "terminate") {
|
|
642
|
+
// Record the failure via mulch (Tier 0 detection)
|
|
643
|
+
const reason = check.reconciliationNote ?? "Process terminated";
|
|
644
|
+
await recordFailureFn(root, session, reason, 0);
|
|
645
|
+
|
|
646
|
+
// Kill the tmux session if it's still alive
|
|
647
|
+
if (tmuxAlive) {
|
|
648
|
+
try {
|
|
649
|
+
await tmux.killSession(session.tmuxSession);
|
|
650
|
+
} catch {
|
|
651
|
+
// Session may have died between check and kill — not an error
|
|
652
|
+
}
|
|
653
|
+
}
|
|
654
|
+
|
|
655
|
+
// Attempt auto-recovery from checkpoint before marking zombie
|
|
656
|
+
const { recovered } = await attemptRecovery({
|
|
657
|
+
session,
|
|
658
|
+
legioDir,
|
|
659
|
+
root,
|
|
660
|
+
maxRecoveryAttempts,
|
|
661
|
+
eventStore,
|
|
662
|
+
runId,
|
|
663
|
+
sling: slingFn,
|
|
664
|
+
loadCheckpointFn,
|
|
665
|
+
sendRecoveryMail: sendRecoveryMailFn,
|
|
666
|
+
});
|
|
667
|
+
|
|
668
|
+
if (!recovered) {
|
|
669
|
+
store.updateState(session.agentName, "zombie");
|
|
670
|
+
// Reset escalation tracking on terminal state
|
|
671
|
+
store.updateEscalation(session.agentName, 0, null);
|
|
672
|
+
session.state = "zombie";
|
|
673
|
+
session.escalationLevel = 0;
|
|
674
|
+
session.stalledSince = null;
|
|
675
|
+
} else {
|
|
676
|
+
// Recovery succeeded — clear zombie state set by transitionState above
|
|
677
|
+
store.updateState(session.agentName, "completed");
|
|
678
|
+
store.updateEscalation(session.agentName, 0, null);
|
|
679
|
+
session.state = "completed";
|
|
680
|
+
session.escalationLevel = 0;
|
|
681
|
+
session.stalledSince = null;
|
|
682
|
+
}
|
|
683
|
+
} else if (check.action === "investigate") {
|
|
684
|
+
// ZFC: tmux alive but SessionStore says zombie.
|
|
685
|
+
// Log the conflict but do NOT auto-kill.
|
|
686
|
+
// The onHealthCheck callback surfaces this to the operator.
|
|
687
|
+
// No state change — keep zombie until a human or higher-tier agent decides.
|
|
688
|
+
}
|
|
689
|
+
}
|
|
690
|
+
|
|
691
|
+
// Unregistered agent detection: find tmux sessions with no DB registration.
|
|
692
|
+
// Compares live tmux sessions against sessions.db. Sessions that appear in
|
|
693
|
+
// tmux but not in the DB may be rogue processes or orphaned from a crash.
|
|
694
|
+
// On first sighting, writes a marker file. On subsequent ticks, if the session
|
|
695
|
+
// has been running unregistered for >3 minutes, sends an urgent alert.
|
|
696
|
+
try {
|
|
697
|
+
let projectName: string;
|
|
698
|
+
if (options._projectName !== undefined) {
|
|
699
|
+
projectName = options._projectName;
|
|
700
|
+
} else {
|
|
701
|
+
const config = await loadConfig(root);
|
|
702
|
+
projectName = config.project.name;
|
|
703
|
+
}
|
|
704
|
+
const sessionPrefix = `legio-${projectName}-`;
|
|
705
|
+
const listSessionsFn = options._listTmuxSessions ?? listTmuxSessions;
|
|
706
|
+
const tmuxSessionNames = await listSessionsFn(sessionPrefix);
|
|
707
|
+
|
|
708
|
+
// Build set of all registered tmux session names (including completed/zombie)
|
|
709
|
+
const registeredTmuxSessions = new Set(sessions.map((s) => s.tmuxSession));
|
|
710
|
+
|
|
711
|
+
// Persistent coordination agents are excluded — they may not always be in sessions.db
|
|
712
|
+
const EXCLUDED_AGENTS = new Set(["coordinator", "gateway", "monitor"]);
|
|
713
|
+
const unregisteredDir = join(legioDir, "unregistered-agents");
|
|
714
|
+
|
|
715
|
+
for (const tmuxSession of tmuxSessionNames) {
|
|
716
|
+
if (registeredTmuxSessions.has(tmuxSession)) continue;
|
|
717
|
+
|
|
718
|
+
// Extract agent name by stripping the session prefix
|
|
719
|
+
const agentName = tmuxSession.slice(sessionPrefix.length);
|
|
720
|
+
if (EXCLUDED_AGENTS.has(agentName)) continue;
|
|
721
|
+
|
|
722
|
+
const markerPath = join(unregisteredDir, `${agentName}.txt`);
|
|
723
|
+
let firstSeenMs: number | null = null;
|
|
724
|
+
try {
|
|
725
|
+
const content = await readFile(markerPath, "utf-8");
|
|
726
|
+
firstSeenMs = Number.parseInt(content.trim(), 10) || null;
|
|
727
|
+
} catch {
|
|
728
|
+
// Marker doesn't exist — first sighting: write the timestamp
|
|
729
|
+
try {
|
|
730
|
+
await mkdir(unregisteredDir, { recursive: true });
|
|
731
|
+
await writeFile(markerPath, String(Date.now()), "utf-8");
|
|
732
|
+
} catch {
|
|
733
|
+
// Non-fatal: marker write failure
|
|
734
|
+
}
|
|
735
|
+
continue;
|
|
736
|
+
}
|
|
737
|
+
|
|
738
|
+
if (firstSeenMs !== null) {
|
|
739
|
+
const elapsed = Date.now() - firstSeenMs;
|
|
740
|
+
if (elapsed > 3 * 60 * 1000) {
|
|
741
|
+
// Session has been unregistered for >3 minutes — send alert
|
|
742
|
+
try {
|
|
743
|
+
await sendRecoveryMailFn([
|
|
744
|
+
"--to",
|
|
745
|
+
"coordinator",
|
|
746
|
+
"--subject",
|
|
747
|
+
`Unregistered agent: ${agentName}`,
|
|
748
|
+
"--body",
|
|
749
|
+
`Tmux session ${tmuxSession} has been running for ${Math.round(elapsed / 60000)}min but is not registered in sessions.db. Possible zombie or rogue process.`,
|
|
750
|
+
"--type",
|
|
751
|
+
"error",
|
|
752
|
+
"--priority",
|
|
753
|
+
"urgent",
|
|
754
|
+
"--from",
|
|
755
|
+
"watchman",
|
|
756
|
+
]);
|
|
757
|
+
} catch {
|
|
758
|
+
// Fire-and-forget: mail failure must not break the watchman
|
|
759
|
+
}
|
|
760
|
+
recordEvent(eventStore, {
|
|
761
|
+
runId,
|
|
762
|
+
agentName,
|
|
763
|
+
eventType: "custom",
|
|
764
|
+
level: "warn",
|
|
765
|
+
data: { type: "unregistered_zombie", tmuxSession, elapsedMs: elapsed },
|
|
766
|
+
});
|
|
767
|
+
// Clean up the marker so we don't re-alert on every tick
|
|
768
|
+
try {
|
|
769
|
+
await unlink(markerPath);
|
|
770
|
+
} catch {
|
|
771
|
+
// Non-fatal: marker cleanup failure
|
|
772
|
+
}
|
|
773
|
+
}
|
|
774
|
+
}
|
|
775
|
+
}
|
|
776
|
+
} catch {
|
|
777
|
+
// Non-fatal: unregistered agent detection must not break the daemon
|
|
778
|
+
}
|
|
779
|
+
} finally {
|
|
780
|
+
store.close();
|
|
781
|
+
// Close EventStore only if we created it (not injected)
|
|
782
|
+
if (eventStore && !useInjectedEventStore) {
|
|
783
|
+
try {
|
|
784
|
+
eventStore.close();
|
|
785
|
+
} catch {
|
|
786
|
+
// Non-fatal
|
|
787
|
+
}
|
|
788
|
+
}
|
|
789
|
+
}
|
|
790
|
+
}
|
|
791
|
+
|
|
792
|
+
/**
 * Run a single mail delivery tick. Exported for testing.
 *
 * Each tick:
 * 1. Opens mail.db (unless a store is injected via `options._mailStore`) and
 *    asks it which agents currently have unread mail.
 * 2. Drops tracking state for agents that no longer have unread mail.
 * 3. For each agent with unread mail, on first sighting and then again every
 *    `reNudgeIntervalMs` (default 10s):
 *    - always writes a pending-nudge marker as a fallback, and
 *    - sends a tmux nudge only when the session store shows no activity in the
 *      last 30s and the idle check reports the agent as idle.
 *    `lastNudgeAt` / `nudgeCount` advance and `onNudge` fires for every such
 *    attempt, including attempts where the tmux nudge itself was skipped.
 * 4. Calls `onWarn` on every tick once mail has been unread for at least
 *    `warnAfterMs` (default 60s).
 *
 * Marker-write, session-lookup, idle-check and nudge failures are swallowed so
 * that one misbehaving agent cannot break the tick.
 *
 * @param options - Watchman options; underscore-prefixed members are injection
 *   points that replace the default implementations.
 * @param state - Per-agent nudge bookkeeping, mutated in place so it carries
 *   over between ticks.
 */
export async function runMailTick(
  options: WatchmanOptions,
  state: Map<string, AgentMailState>,
): Promise<void> {
  const { root, onNudge, onWarn } = options;
  const reNudgeIntervalMs = options.reNudgeIntervalMs ?? 10_000;
  const warnAfterMs = options.warnAfterMs ?? 60_000;
  const nudgeFn = options._nudge ?? nudgeAgent;
  const isIdleFn = options._isAgentIdle ?? isAgentIdle;
  const writePendingNudgeFn = options._writePendingNudge ?? writePendingNudge;
  // Agents with recent activity have hooks that deliver mail — tmux nudges
  // are noisy and redundant for them. Only nudge truly idle agents.
  const activityThresholdMs = 30_000;

  // Open mail store (an injected store is owned by the caller and never closed here)
  let mailStore: MailStore;
  const useInjectedMailStore = options._mailStore !== undefined;
  if (useInjectedMailStore) {
    mailStore = options._mailStore as MailStore;
  } else {
    const dbPath = join(root, ".legio", "mail.db");
    mailStore = createMailStore(dbPath);
  }

  // Open session store to check each agent's lastActivity timestamp
  const legioDir = join(root, ".legio");
  let sessionStore: ReturnType<typeof openSessionStore>["store"] | null = null;
  try {
    const { store: ss } = openSessionStore(legioDir);
    sessionStore = ss;
  } catch {
    // Non-fatal: session store unavailable — fall back to file-based idle check
  }

  try {
    const agentsWithUnread = mailStore.getAgentsWithUnread();
    const agentSet = new Set(agentsWithUnread);

    // Clear state for agents that no longer have unread mail
    for (const [agentName] of state) {
      if (!agentSet.has(agentName)) {
        state.delete(agentName);
      }
    }

    const now = Date.now();

    for (const agentName of agentsWithUnread) {
      let agentState = state.get(agentName);

      if (!agentState) {
        // First time seeing unread mail for this agent — nudge immediately
        agentState = {
          firstSeenAt: now,
          lastNudgeAt: 0,
          nudgeCount: 0,
        };
        state.set(agentName, agentState);
      }

      // Check if it's time to re-nudge
      const timeSinceLastNudge = now - agentState.lastNudgeAt;
      const shouldNudge = agentState.nudgeCount === 0 || timeSinceLastNudge >= reNudgeIntervalMs;

      if (shouldNudge) {
        // Write pending-nudge marker as fallback (always, regardless of activity)
        try {
          await writePendingNudgeFn(root, agentName, {
            from: "watchman",
            reason: "unread mail",
            subject: "You have unread mail",
            messageId: "",
          });
        } catch {
          // Non-fatal: pending marker write failure
        }

        // Check if the agent has recent activity — if so, hooks will deliver
        // the mail on the next tool call. Skip the tmux nudge to avoid noise.
        let recentlyActive = false;
        if (sessionStore) {
          try {
            const session = sessionStore.getByName(agentName);
            if (session?.lastActivity) {
              // An unparseable timestamp yields NaN, which compares false and
              // therefore counts as "not recently active".
              const activityAge = now - new Date(session.lastActivity).getTime();
              recentlyActive = activityAge < activityThresholdMs;
            }
          } catch {
            // Non-fatal: session lookup failure
          }
        }

        // Only send tmux nudge if the agent is NOT recently active AND is idle
        if (!recentlyActive) {
          try {
            const idle = await isIdleFn(root, agentName);
            if (idle) {
              await nudgeFn(
                root,
                agentName,
                "[watchman] You have unread mail. Run: legio mail check",
                true, // force — skip debounce
              );
            }
          } catch {
            // Non-fatal: idle check or nudge failure
          }
        }

        agentState.lastNudgeAt = now;
        agentState.nudgeCount++;

        if (onNudge) {
          onNudge(agentName, agentState.nudgeCount);
        }
      }

      // Warn if unread mail has been sitting too long
      const unreadDuration = now - agentState.firstSeenAt;
      if (unreadDuration >= warnAfterMs && onWarn) {
        onWarn(agentName, unreadDuration);
      }
    }
  } finally {
    try {
      if (!useInjectedMailStore) {
        mailStore.close();
      }
    } finally {
      // Always release the session store, even when closing the mail store
      // throws; the mail-store error still propagates to the caller.
      if (sessionStore) {
        try {
          sessionStore.close();
        } catch {
          // Non-fatal
        }
      }
    }
  }
}
|