@katyella/legio 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +422 -0
- package/LICENSE +21 -0
- package/README.md +555 -0
- package/agents/builder.md +141 -0
- package/agents/coordinator.md +351 -0
- package/agents/cto.md +196 -0
- package/agents/gateway.md +276 -0
- package/agents/lead.md +281 -0
- package/agents/merger.md +156 -0
- package/agents/monitor.md +212 -0
- package/agents/reviewer.md +142 -0
- package/agents/scout.md +131 -0
- package/agents/supervisor.md +416 -0
- package/bin/legio.mjs +38 -0
- package/package.json +77 -0
- package/src/agents/checkpoint.test.ts +88 -0
- package/src/agents/checkpoint.ts +102 -0
- package/src/agents/hooks-deployer.test.ts +1820 -0
- package/src/agents/hooks-deployer.ts +574 -0
- package/src/agents/identity.test.ts +614 -0
- package/src/agents/identity.ts +385 -0
- package/src/agents/lifecycle.test.ts +202 -0
- package/src/agents/lifecycle.ts +184 -0
- package/src/agents/manifest.test.ts +558 -0
- package/src/agents/manifest.ts +297 -0
- package/src/agents/overlay.test.ts +592 -0
- package/src/agents/overlay.ts +316 -0
- package/src/beads/client.test.ts +210 -0
- package/src/beads/client.ts +227 -0
- package/src/beads/molecules.test.ts +320 -0
- package/src/beads/molecules.ts +209 -0
- package/src/commands/agents.test.ts +325 -0
- package/src/commands/agents.ts +286 -0
- package/src/commands/clean.test.ts +730 -0
- package/src/commands/clean.ts +653 -0
- package/src/commands/completions.test.ts +346 -0
- package/src/commands/completions.ts +950 -0
- package/src/commands/coordinator.test.ts +1524 -0
- package/src/commands/coordinator.ts +880 -0
- package/src/commands/costs.test.ts +1015 -0
- package/src/commands/costs.ts +473 -0
- package/src/commands/dashboard.test.ts +94 -0
- package/src/commands/dashboard.ts +607 -0
- package/src/commands/doctor.test.ts +295 -0
- package/src/commands/doctor.ts +213 -0
- package/src/commands/down.test.ts +308 -0
- package/src/commands/down.ts +124 -0
- package/src/commands/errors.test.ts +648 -0
- package/src/commands/errors.ts +255 -0
- package/src/commands/feed.test.ts +579 -0
- package/src/commands/feed.ts +368 -0
- package/src/commands/gateway.test.ts +698 -0
- package/src/commands/gateway.ts +419 -0
- package/src/commands/group.test.ts +262 -0
- package/src/commands/group.ts +539 -0
- package/src/commands/hooks.test.ts +292 -0
- package/src/commands/hooks.ts +210 -0
- package/src/commands/init.test.ts +211 -0
- package/src/commands/init.ts +622 -0
- package/src/commands/inspect.test.ts +670 -0
- package/src/commands/inspect.ts +455 -0
- package/src/commands/log.test.ts +1556 -0
- package/src/commands/log.ts +752 -0
- package/src/commands/logs.test.ts +379 -0
- package/src/commands/logs.ts +544 -0
- package/src/commands/mail.test.ts +1726 -0
- package/src/commands/mail.ts +926 -0
- package/src/commands/merge.test.ts +676 -0
- package/src/commands/merge.ts +374 -0
- package/src/commands/metrics.test.ts +444 -0
- package/src/commands/metrics.ts +150 -0
- package/src/commands/monitor.test.ts +151 -0
- package/src/commands/monitor.ts +394 -0
- package/src/commands/nudge.test.ts +230 -0
- package/src/commands/nudge.ts +373 -0
- package/src/commands/prime.test.ts +467 -0
- package/src/commands/prime.ts +386 -0
- package/src/commands/replay.test.ts +742 -0
- package/src/commands/replay.ts +367 -0
- package/src/commands/run.test.ts +443 -0
- package/src/commands/run.ts +365 -0
- package/src/commands/server.test.ts +626 -0
- package/src/commands/server.ts +298 -0
- package/src/commands/sling.test.ts +810 -0
- package/src/commands/sling.ts +700 -0
- package/src/commands/spec.test.ts +206 -0
- package/src/commands/spec.ts +171 -0
- package/src/commands/status.test.ts +276 -0
- package/src/commands/status.ts +339 -0
- package/src/commands/stop.test.ts +357 -0
- package/src/commands/stop.ts +119 -0
- package/src/commands/supervisor.test.ts +186 -0
- package/src/commands/supervisor.ts +544 -0
- package/src/commands/trace.test.ts +746 -0
- package/src/commands/trace.ts +332 -0
- package/src/commands/up.test.ts +597 -0
- package/src/commands/up.ts +275 -0
- package/src/commands/watch.test.ts +152 -0
- package/src/commands/watch.ts +238 -0
- package/src/commands/worktree.test.ts +648 -0
- package/src/commands/worktree.ts +266 -0
- package/src/config.test.ts +496 -0
- package/src/config.ts +616 -0
- package/src/doctor/agents.test.ts +448 -0
- package/src/doctor/agents.ts +396 -0
- package/src/doctor/config-check.test.ts +184 -0
- package/src/doctor/config-check.ts +185 -0
- package/src/doctor/consistency.test.ts +645 -0
- package/src/doctor/consistency.ts +294 -0
- package/src/doctor/databases.test.ts +284 -0
- package/src/doctor/databases.ts +211 -0
- package/src/doctor/dependencies.test.ts +150 -0
- package/src/doctor/dependencies.ts +179 -0
- package/src/doctor/logs.test.ts +244 -0
- package/src/doctor/logs.ts +295 -0
- package/src/doctor/merge-queue.test.ts +210 -0
- package/src/doctor/merge-queue.ts +144 -0
- package/src/doctor/structure.test.ts +285 -0
- package/src/doctor/structure.ts +195 -0
- package/src/doctor/types.ts +37 -0
- package/src/doctor/version.test.ts +130 -0
- package/src/doctor/version.ts +131 -0
- package/src/e2e/chat-flow.test.ts +346 -0
- package/src/e2e/init-sling-lifecycle.test.ts +288 -0
- package/src/errors.test.ts +21 -0
- package/src/errors.ts +246 -0
- package/src/events/store.test.ts +660 -0
- package/src/events/store.ts +344 -0
- package/src/events/tool-filter.test.ts +330 -0
- package/src/events/tool-filter.ts +126 -0
- package/src/global-setup.ts +14 -0
- package/src/index.ts +339 -0
- package/src/insights/analyzer.test.ts +466 -0
- package/src/insights/analyzer.ts +203 -0
- package/src/logging/color.test.ts +118 -0
- package/src/logging/color.ts +71 -0
- package/src/logging/logger.test.ts +812 -0
- package/src/logging/logger.ts +266 -0
- package/src/logging/reporter.test.ts +258 -0
- package/src/logging/reporter.ts +109 -0
- package/src/logging/sanitizer.test.ts +190 -0
- package/src/logging/sanitizer.ts +57 -0
- package/src/mail/broadcast.test.ts +203 -0
- package/src/mail/broadcast.ts +92 -0
- package/src/mail/client.test.ts +873 -0
- package/src/mail/client.ts +236 -0
- package/src/mail/store.test.ts +815 -0
- package/src/mail/store.ts +402 -0
- package/src/merge/queue.test.ts +449 -0
- package/src/merge/queue.ts +262 -0
- package/src/merge/resolver.test.ts +1453 -0
- package/src/merge/resolver.ts +759 -0
- package/src/metrics/store.test.ts +1167 -0
- package/src/metrics/store.ts +511 -0
- package/src/metrics/summary.test.ts +397 -0
- package/src/metrics/summary.ts +178 -0
- package/src/metrics/transcript.test.ts +643 -0
- package/src/metrics/transcript.ts +351 -0
- package/src/mulch/client.test.ts +547 -0
- package/src/mulch/client.ts +416 -0
- package/src/server/audit-store.test.ts +384 -0
- package/src/server/audit-store.ts +257 -0
- package/src/server/headless.test.ts +180 -0
- package/src/server/headless.ts +151 -0
- package/src/server/index.test.ts +241 -0
- package/src/server/index.ts +317 -0
- package/src/server/public/app.js +187 -0
- package/src/server/public/apple-touch-icon.png +0 -0
- package/src/server/public/components/agent-badge.js +37 -0
- package/src/server/public/components/data-table.js +114 -0
- package/src/server/public/components/gateway-chat.js +256 -0
- package/src/server/public/components/issue-card.js +96 -0
- package/src/server/public/components/layout.js +88 -0
- package/src/server/public/components/message-bubble.js +120 -0
- package/src/server/public/components/stat-card.js +26 -0
- package/src/server/public/components/terminal-panel.js +140 -0
- package/src/server/public/favicon-16.png +0 -0
- package/src/server/public/favicon-32.png +0 -0
- package/src/server/public/favicon.ico +0 -0
- package/src/server/public/favicon.png +0 -0
- package/src/server/public/index.html +64 -0
- package/src/server/public/lib/api.js +35 -0
- package/src/server/public/lib/markdown.js +8 -0
- package/src/server/public/lib/preact-setup.js +8 -0
- package/src/server/public/lib/state.js +99 -0
- package/src/server/public/lib/utils.js +309 -0
- package/src/server/public/lib/ws.js +79 -0
- package/src/server/public/views/chat.js +983 -0
- package/src/server/public/views/costs.js +692 -0
- package/src/server/public/views/dashboard.js +781 -0
- package/src/server/public/views/gateway-chat.js +622 -0
- package/src/server/public/views/inspect.js +399 -0
- package/src/server/public/views/issues.js +470 -0
- package/src/server/public/views/setup.js +94 -0
- package/src/server/public/views/task-detail.js +422 -0
- package/src/server/routes.test.ts +3816 -0
- package/src/server/routes.ts +1964 -0
- package/src/server/websocket.test.ts +288 -0
- package/src/server/websocket.ts +196 -0
- package/src/sessions/compat.test.ts +109 -0
- package/src/sessions/compat.ts +17 -0
- package/src/sessions/store.test.ts +969 -0
- package/src/sessions/store.ts +480 -0
- package/src/test-helpers.test.ts +97 -0
- package/src/test-helpers.ts +143 -0
- package/src/types.ts +708 -0
- package/src/watchdog/daemon.test.ts +1233 -0
- package/src/watchdog/daemon.ts +533 -0
- package/src/watchdog/health.test.ts +371 -0
- package/src/watchdog/health.ts +248 -0
- package/src/watchdog/triage.test.ts +162 -0
- package/src/watchdog/triage.ts +193 -0
- package/src/worktree/manager.test.ts +444 -0
- package/src/worktree/manager.ts +224 -0
- package/src/worktree/tmux.test.ts +1238 -0
- package/src/worktree/tmux.ts +644 -0
- package/templates/CLAUDE.md.tmpl +89 -0
- package/templates/hooks.json.tmpl +132 -0
- package/templates/overlay.md.tmpl +79 -0
|
@@ -0,0 +1,533 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Tier 0 mechanical process monitoring daemon.
|
|
3
|
+
*
|
|
4
|
+
* Runs on a configurable interval, checking the health of all active agent
|
|
5
|
+
* sessions. Detects zombie agents (dead tmux or process) and attempts
|
|
6
|
+
* auto-recovery from checkpoints.
|
|
7
|
+
*
|
|
8
|
+
* Phase 4 tier numbering:
|
|
9
|
+
* Tier 0 = Mechanical daemon (this file)
|
|
10
|
+
* Tier 1 = Triage agent (triage.ts)
|
|
11
|
+
* Tier 2 = Monitor agent (not yet implemented)
|
|
12
|
+
* Tier 3 = Supervisor monitors (per-project)
|
|
13
|
+
*
|
|
14
|
+
* ZFC Principle: Observable state (tmux alive, pid alive) is the source of
|
|
15
|
+
* truth. See health.ts for the full ZFC documentation.
|
|
16
|
+
*/
|
|
17
|
+
|
|
18
|
+
import { spawn } from "node:child_process";
|
|
19
|
+
import { mkdir, readFile, writeFile } from "node:fs/promises";
|
|
20
|
+
import { join } from "node:path";
|
|
21
|
+
import { loadCheckpoint } from "../agents/checkpoint.ts";
|
|
22
|
+
import { createEventStore } from "../events/store.ts";
|
|
23
|
+
import { createMulchClient } from "../mulch/client.ts";
|
|
24
|
+
import { openSessionStore } from "../sessions/compat.ts";
|
|
25
|
+
import type { AgentSession, EventStore, HealthCheck, SessionCheckpoint } from "../types.ts";
|
|
26
|
+
import { isSessionAlive, killSession } from "../worktree/tmux.ts";
|
|
27
|
+
import { evaluateHealth, transitionState } from "./health.ts";
|
|
28
|
+
|
|
29
|
+
/**
 * Persist a failure record for an agent in mulch (domain "agents").
 *
 * Best-effort: any error raised while building or recording the entry is
 * swallowed silently so the watchdog keeps running.
 *
 * @param root - Project root directory
 * @param session - Session of the agent that failed
 * @param reason - Human-readable explanation of the failure
 * @param tier - Watchdog tier that detected the failure (0 or 1)
 * @param triageSuggestion - Optional triage verdict to include in the record
 */
async function recordFailure(
	root: string,
	session: AgentSession,
	reason: string,
	tier: 0 | 1,
	triageSuggestion?: string,
): Promise<void> {
	try {
		const client = createMulchClient(root);
		const detectedBy = tier === 0 ? "Tier 0 (process death)" : "Tier 1 (AI triage)";

		// Assemble the description line by line; the triage line is optional.
		const lines: string[] = [
			`Agent: ${session.agentName}`,
			`Capability: ${session.capability}`,
			`Failure reason: ${reason}`,
		];
		if (triageSuggestion) {
			lines.push(`Triage suggestion: ${triageSuggestion}`);
		}
		lines.push(`Detected by: ${detectedBy}`);

		await client.record("agents", {
			type: "failure",
			description: lines.join("\n"),
			tags: ["watchdog", "auto-recorded"],
			// An empty bead id is reported as "no evidence bead".
			evidenceBead: session.beadId || undefined,
		});
	} catch {
		// Fire-and-forget: recording failures must not break the watchdog
	}
}
|
|
69
|
+
|
|
70
|
+
/**
 * Look up the active run ID stored in `<legioDir>/current-run.txt`.
 *
 * @param legioDir - Path to the `.legio` directory
 * @returns The trimmed file contents, or null when the file cannot be read
 *          or contains only whitespace.
 */
async function readCurrentRunId(legioDir: string): Promise<string | null> {
	const runFile = join(legioDir, "current-run.txt");

	let contents: string;
	try {
		contents = await readFile(runFile, "utf-8");
	} catch {
		// Unreadable or missing file means there is no active run.
		return null;
	}

	const runId = contents.trim();
	if (runId.length === 0) {
		return null;
	}
	return runId;
}
|
|
83
|
+
|
|
84
|
+
/**
 * Insert a daemon event into the EventStore, if one is available.
 *
 * Best-effort: does nothing when no store is supplied, and swallows any
 * error raised while serializing or inserting the event.
 *
 * @param eventStore - Target store, or null to skip recording
 * @param event - Event fields; `data` is serialized with JSON.stringify
 */
function recordEvent(
	eventStore: EventStore | null,
	event: {
		runId: string | null;
		agentName: string;
		eventType: "custom" | "mail_sent";
		level: "debug" | "info" | "warn" | "error";
		data: Record<string, unknown>;
	},
): void {
	if (!eventStore) {
		return;
	}

	try {
		const { runId, agentName, eventType, level, data } = event;
		eventStore.insert({
			runId,
			agentName,
			// The daemon does not act inside an agent session or tool call,
			// so the session/tool fields are always null.
			sessionId: null,
			eventType,
			toolName: null,
			toolArgs: null,
			toolDurationMs: null,
			level,
			data: JSON.stringify(data),
		});
	} catch {
		// Fire-and-forget: event recording must never break the daemon
	}
}
|
|
114
|
+
|
|
115
|
+
/**
 * Load the persisted recovery attempt count for an agent.
 *
 * The count lives in `<agentsDir>/<agentName>/recovery-count`.
 *
 * @param agentsDir - Directory holding per-agent subdirectories
 * @param agentName - Agent whose counter should be read
 * @returns The parsed count, or 0 when the file is missing, unreadable, or
 *          does not start with a base-10 integer.
 */
async function readRecoveryCount(agentsDir: string, agentName: string): Promise<number> {
	try {
		const countFile = join(agentsDir, agentName, "recovery-count");
		const raw = await readFile(countFile, "utf-8");
		const parsed = Number.parseInt(raw.trim(), 10);
		// NaN (unparseable) and zero both collapse to 0.
		return parsed ? parsed : 0;
	} catch {
		return 0;
	}
}
|
|
127
|
+
|
|
128
|
+
/**
 * Persist the recovery attempt count for an agent.
 *
 * Writes to `<agentsDir>/<agentName>/recovery-count`, creating the agent
 * directory (and any missing parents) first.
 *
 * @param agentsDir - Directory holding per-agent subdirectories
 * @param agentName - Agent whose counter should be written
 * @param count - Value to store, written as its decimal string form
 */
async function writeRecoveryCount(
	agentsDir: string,
	agentName: string,
	count: number,
): Promise<void> {
	const agentDir = join(agentsDir, agentName);
	const countFile = join(agentDir, "recovery-count");

	await mkdir(agentDir, { recursive: true });
	await writeFile(countFile, `${count}`, "utf-8");
}
|
|
141
|
+
|
|
142
|
+
/**
 * Default sling implementation: spawn `legio sling` as a subprocess.
 *
 * Never rejects. A spawn failure (for example `legio` missing from PATH or
 * an invalid working directory) is reported as a non-zero exit code with the
 * error message appended to `stderr`, instead of surfacing as an unhandled
 * 'error' event that would take the daemon down.
 *
 * @param args - Arguments passed after `sling`
 * @param root - Working directory for the subprocess
 * @returns The exit code (1 when unknown or when spawning failed) and the
 *          collected stderr output
 */
async function reSling(
	args: string[],
	root: string,
): Promise<{ exitCode: number; stderr: string }> {
	return new Promise((resolve) => {
		let stderr = "";
		try {
			const proc = spawn("legio", ["sling", ...args], {
				cwd: root,
				stdio: ["ignore", "pipe", "pipe"],
			});

			// stdout is piped but its content is unused. Drain it so a chatty
			// child can never block on a full pipe buffer.
			proc.stdout?.resume();

			proc.stderr?.on("data", (chunk: Buffer) => {
				stderr += chunk.toString();
			});

			// Without this listener a failed spawn raises an unhandled 'error'
			// event. Resolving twice (error, then close) is a harmless no-op.
			proc.on("error", (err: Error) => {
				const detail = stderr.length > 0 ? `${stderr}\n${err.message}` : err.message;
				resolve({ exitCode: 1, stderr: detail });
			});

			proc.on("close", (code) => resolve({ exitCode: code ?? 1, stderr }));
		} catch (err) {
			// spawn() can throw synchronously on invalid arguments.
			resolve({ exitCode: 1, stderr: err instanceof Error ? err.message : String(err) });
		}
	});
}
|
|
161
|
+
|
|
162
|
+
/**
 * Default recovery mail implementation: spawn `legio mail send` as a subprocess.
 *
 * Best-effort: the returned promise always resolves, whether the subprocess
 * exits normally, fails to spawn (for example `legio` missing from PATH), or
 * spawn() itself throws. The exit code is not inspected.
 *
 * @param args - Arguments passed after `mail send`
 * @param root - Working directory for the subprocess
 */
async function sendMailSubprocess(args: string[], root: string): Promise<void> {
	return new Promise((resolve) => {
		try {
			const proc = spawn("legio", ["mail", "send", ...args], {
				cwd: root,
				stdio: ["ignore", "ignore", "ignore"],
			});
			// Without this listener a failed spawn raises an unhandled 'error'
			// event, which would crash the daemon.
			proc.on("error", () => resolve());
			proc.on("close", () => resolve());
		} catch {
			// spawn() can throw synchronously on invalid arguments.
			resolve();
		}
	});
}
|
|
174
|
+
|
|
175
|
+
/**
 * Try to bring a dead agent back by re-slinging it from its saved checkpoint.
 *
 * Sequence:
 *   1. Load the checkpoint; without one, recovery is impossible.
 *   2. If the persisted attempt count has reached `maxRecoveryAttempts`,
 *      mail the parent (when there is one) an escalation and give up.
 *   3. Otherwise bump the attempt count, record a `recovery_attempt` event,
 *      mail the parent, and run the sling with arguments rebuilt from the
 *      checkpoint and session.
 *
 * Mail and counter-write failures are swallowed; they never abort recovery.
 *
 * @returns `{ recovered: true }` only when the sling exits with code 0,
 *          `{ recovered: false }` in every other case.
 */
async function attemptRecovery(options: {
	session: AgentSession;
	legioDir: string;
	root: string;
	maxRecoveryAttempts: number;
	eventStore: EventStore | null;
	runId: string | null;
	sling: (args: string[]) => Promise<{ exitCode: number; stderr: string }>;
	loadCheckpointFn: (agentsDir: string, agentName: string) => Promise<SessionCheckpoint | null>;
	sendRecoveryMail: (args: string[]) => Promise<void>;
}): Promise<{ recovered: boolean }> {
	const {
		session,
		legioDir,
		maxRecoveryAttempts,
		eventStore,
		runId,
		sling,
		loadCheckpointFn,
		sendRecoveryMail,
	} = options;
	const { agentName, parentAgent } = session;
	const agentsDir = join(legioDir, "agents");
	const notRecovered = { recovered: false };

	// Mail the parent agent, if any. Mail failures are deliberately ignored.
	const notifyParent = async (
		subject: string,
		body: string,
		typeFlags: string[],
	): Promise<void> => {
		if (!parentAgent) {
			return;
		}
		try {
			await sendRecoveryMail([
				"--to",
				parentAgent,
				"--subject",
				subject,
				"--body",
				body,
				...typeFlags,
				"--from",
				"watchdog",
			]);
		} catch {
			// Fire-and-forget: mail failure must not break the watchdog
		}
	};

	// Record a custom recovery event for this agent.
	const logRecovery = (level: "info" | "error", data: Record<string, unknown>): void => {
		recordEvent(eventStore, { runId, agentName, eventType: "custom", level, data });
	};

	// Step 1: a checkpoint is mandatory.
	let checkpoint: SessionCheckpoint | null;
	try {
		checkpoint = await loadCheckpointFn(agentsDir, agentName);
	} catch {
		checkpoint = null;
	}
	if (!checkpoint) {
		return notRecovered;
	}

	// Step 2: stop and escalate once the retry budget is spent.
	const priorAttempts = await readRecoveryCount(agentsDir, agentName);
	if (priorAttempts >= maxRecoveryAttempts) {
		await notifyParent(
			`Recovery failed: ${agentName}`,
			`Auto-recovery exhausted for ${agentName} after ${priorAttempts} attempts. Agent marked zombie.`,
			["--type", "error", "--priority", "high"],
		);
		return notRecovered;
	}

	// Step 3: count this attempt before making it.
	const attempt = priorAttempts + 1;
	try {
		await writeRecoveryCount(agentsDir, agentName, attempt);
	} catch {
		// Non-fatal: proceed with recovery even if count write fails
	}

	logRecovery("info", { type: "recovery_attempt", attempt, maxAttempts: maxRecoveryAttempts });

	await notifyParent(
		`Recovery: ${agentName}`,
		`Watchdog attempting auto-recovery from checkpoint for ${agentName} (attempt ${attempt}/${maxRecoveryAttempts}).`,
		["--type", "health_check"],
	);

	// Rebuild the sling invocation from the checkpoint and session.
	const slingArgs: string[] = [
		checkpoint.beadId,
		"--capability",
		session.capability,
		"--name",
		agentName,
		"--spec",
		join(legioDir, "specs", `${checkpoint.beadId}.md`),
		...(checkpoint.filesModified.length > 0
			? ["--files", checkpoint.filesModified.join(",")]
			: []),
		...(parentAgent ? ["--parent", parentAgent] : []),
		"--depth",
		String(session.depth),
	];

	try {
		const { exitCode, stderr } = await sling(slingArgs);
		if (exitCode !== 0) {
			logRecovery("error", { type: "recovery_failed", attempt, stderr });
			return notRecovered;
		}
		logRecovery("info", { type: "recovery_success", attempt });
		return { recovered: true };
	} catch {
		logRecovery("error", { type: "recovery_failed", attempt });
		return notRecovered;
	}
}
|
|
334
|
+
|
|
335
|
+
/**
 * Configuration accepted by both startDaemon and runDaemonTick.
 *
 * Members prefixed with `_` are dependency-injection seams intended for
 * tests; when omitted, the real implementations from this module are used.
 */
export interface DaemonOptions {
	/** Project root directory; the daemon works inside `<root>/.legio`. */
	root: string;
	/** Zombie threshold in milliseconds, forwarded to the health evaluation. */
	zombieThresholdMs: number;
	/** Optional callback invoked with the result of every health check. */
	onHealthCheck?: (check: HealthCheck) => void;
	/** Max recovery attempts per agent before escalating (default: 1). */
	maxRecoveryAttempts?: number;

	/** Test seam: replaces the tmux liveness check and session kill. */
	_tmux?: {
		isSessionAlive: (name: string) => Promise<boolean>;
		killSession: (name: string) => Promise<void>;
	};
	/**
	 * Test seam: EventStore to use instead of opening `events.db`.
	 * Passing null disables event recording. An injected store is not
	 * closed by the tick.
	 */
	_eventStore?: EventStore | null;
	/** Test seam: replaces the failure recorder. */
	_recordFailure?: (
		root: string,
		session: AgentSession,
		reason: string,
		tier: 0 | 1,
		triageSuggestion?: string,
	) => Promise<void>;
	/** Test seam: replaces the `legio sling` subprocess spawn. */
	_sling?: (args: string[]) => Promise<{ exitCode: number; stderr: string }>;
	/** Test seam: replaces checkpoint loading. */
	_loadCheckpoint?: (agentsDir: string, agentName: string) => Promise<SessionCheckpoint | null>;
	/** Test seam: replaces mail sending for recovery notifications. */
	_sendRecoveryMail?: (args: string[]) => Promise<void>;
}
|
|
364
|
+
|
|
365
|
+
/**
 * Launch the watchdog daemon: one health-check tick right away, then one
 * every `intervalMs` milliseconds.
 *
 * Each tick delegates to runDaemonTick. A tick that rejects is swallowed so
 * a single bad tick never stops the daemon. Ticks are started on a fixed
 * timer and are not awaited.
 *
 * @param options.root - Project root directory (contains .legio/)
 * @param options.intervalMs - Polling interval in milliseconds
 * @param options.zombieThresholdMs - Zombie threshold passed to each tick
 * @param options.onHealthCheck - Optional callback for each health check result
 * @returns An object with a `stop` function that cancels the interval
 */
export function startDaemon(options: DaemonOptions & { intervalMs: number }): { stop: () => void } {
	const { intervalMs } = options;

	// Shared by the immediate tick and every periodic tick.
	const tick = (): void => {
		runDaemonTick(options).catch(() => {
			// Swallow tick errors — daemon must not crash
		});
	};

	tick();
	const timer = setInterval(tick, intervalMs);

	return {
		stop(): void {
			clearInterval(timer);
		},
	};
}
|
|
403
|
+
|
|
404
|
+
/**
 * Run a single daemon tick. Exported for testing — allows direct invocation
 * of the monitoring logic without starting the interval-based daemon loop.
 *
 * For every session that is not "completed":
 *   1. Check tmux liveness and evaluate health.
 *   2. Persist any state transition and report the check via onHealthCheck.
 *   3. On a "terminate" action: record the failure (Tier 0), kill the tmux
 *      session if it is still alive, then attempt checkpoint recovery and
 *      store the resulting state with escalation tracking reset.
 *   4. On an "investigate" action: do nothing beyond the callback.
 *
 * The session store is always closed on exit. An EventStore opened by this
 * tick is closed too, even when closing the session store throws.
 *
 * @param options - Same options as startDaemon (minus intervalMs)
 */
export async function runDaemonTick(options: DaemonOptions): Promise<void> {
	const { root, zombieThresholdMs, onHealthCheck } = options;
	const tmux = options._tmux ?? { isSessionAlive, killSession };
	const recordFailureFn = options._recordFailure ?? recordFailure;
	const maxRecoveryAttempts = options.maxRecoveryAttempts ?? 1;
	const slingFn = options._sling ?? ((args: string[]) => reSling(args, root));
	const loadCheckpointFn = options._loadCheckpoint ?? loadCheckpoint;
	const sendRecoveryMailFn =
		options._sendRecoveryMail ?? ((args: string[]) => sendMailSubprocess(args, root));

	const legioDir = join(root, ".legio");
	const { store } = openSessionStore(legioDir);

	// Open EventStore for recording daemon events (fire-and-forget).
	// `undefined` means "not injected"; an injected null disables recording.
	const useInjectedEventStore = options._eventStore !== undefined;
	let eventStore: EventStore | null = null;
	if (useInjectedEventStore) {
		eventStore = options._eventStore ?? null;
	} else {
		try {
			eventStore = createEventStore(join(legioDir, "events.db"));
		} catch {
			// EventStore creation failure is non-fatal for the daemon
		}
	}

	let runId: string | null = null;
	try {
		runId = await readCurrentRunId(legioDir);
	} catch {
		// Reading run ID failure is non-fatal
	}

	try {
		const thresholds = { zombieMs: zombieThresholdMs };

		for (const session of store.getAll()) {
			// Skip completed sessions — they are terminal and don't need monitoring
			if (session.state === "completed") {
				continue;
			}

			// ZFC: Don't skip zombies. Re-check tmux liveness on every tick.
			// A zombie with a live tmux session needs investigation, not silence.
			const tmuxAlive = await tmux.isSessionAlive(session.tmuxSession);
			const check = evaluateHealth(session, tmuxAlive, thresholds);

			// Transition state forward only (investigate action holds state)
			const newState = transitionState(session.state, check);
			if (newState !== session.state) {
				store.updateState(session.agentName, newState);
				session.state = newState;
			}

			if (onHealthCheck) {
				onHealthCheck(check);
			}

			// "investigate" (tmux alive but SessionStore says zombie) is only
			// surfaced through onHealthCheck: no auto-kill and no state change.
			// Any other non-terminate action needs no handling here either.
			if (check.action !== "terminate") {
				continue;
			}

			// Record the failure via mulch (Tier 0 detection)
			const reason = check.reconciliationNote ?? "Process terminated";
			await recordFailureFn(root, session, reason, 0);

			// Kill the tmux session if it's still alive
			if (tmuxAlive) {
				try {
					await tmux.killSession(session.tmuxSession);
				} catch {
					// Session may have died between check and kill — not an error
				}
			}

			// Attempt auto-recovery from checkpoint before marking zombie
			const { recovered } = await attemptRecovery({
				session,
				legioDir,
				root,
				maxRecoveryAttempts,
				eventStore,
				runId,
				sling: slingFn,
				loadCheckpointFn,
				sendRecoveryMail: sendRecoveryMailFn,
			});

			// Failed recovery leaves the agent a zombie. Successful recovery
			// marks this session record "completed", so later ticks skip it.
			// NOTE(review): confirm "completed" is the intended state after a
			// successful re-sling; it depends on how sling stores the new
			// session, which this file does not show.
			const finalState = recovered ? "completed" : "zombie";
			store.updateState(session.agentName, finalState);
			// Reset escalation tracking in both outcomes.
			store.updateEscalation(session.agentName, 0, null);
			session.state = finalState;
			session.escalationLevel = 0;
			session.stalledSince = null;
		}
	} finally {
		try {
			store.close();
		} finally {
			// Runs even if store.close() throws, so an EventStore opened by
			// this tick is never leaked. Injected stores belong to the caller.
			if (eventStore && !useInjectedEventStore) {
				try {
					eventStore.close();
				} catch {
					// Non-fatal
				}
			}
		}
	}
}
|