@os-eco/overstory-cli 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +381 -0
- package/agents/builder.md +137 -0
- package/agents/coordinator.md +263 -0
- package/agents/lead.md +301 -0
- package/agents/merger.md +160 -0
- package/agents/monitor.md +214 -0
- package/agents/reviewer.md +140 -0
- package/agents/scout.md +119 -0
- package/agents/supervisor.md +423 -0
- package/package.json +47 -0
- package/src/agents/checkpoint.test.ts +88 -0
- package/src/agents/checkpoint.ts +101 -0
- package/src/agents/hooks-deployer.test.ts +2040 -0
- package/src/agents/hooks-deployer.ts +607 -0
- package/src/agents/identity.test.ts +603 -0
- package/src/agents/identity.ts +384 -0
- package/src/agents/lifecycle.test.ts +196 -0
- package/src/agents/lifecycle.ts +183 -0
- package/src/agents/manifest.test.ts +746 -0
- package/src/agents/manifest.ts +354 -0
- package/src/agents/overlay.test.ts +676 -0
- package/src/agents/overlay.ts +308 -0
- package/src/beads/client.test.ts +217 -0
- package/src/beads/client.ts +202 -0
- package/src/beads/molecules.test.ts +338 -0
- package/src/beads/molecules.ts +198 -0
- package/src/commands/agents.test.ts +322 -0
- package/src/commands/agents.ts +287 -0
- package/src/commands/clean.test.ts +670 -0
- package/src/commands/clean.ts +618 -0
- package/src/commands/completions.test.ts +342 -0
- package/src/commands/completions.ts +887 -0
- package/src/commands/coordinator.test.ts +1530 -0
- package/src/commands/coordinator.ts +733 -0
- package/src/commands/costs.test.ts +1119 -0
- package/src/commands/costs.ts +564 -0
- package/src/commands/dashboard.test.ts +308 -0
- package/src/commands/dashboard.ts +838 -0
- package/src/commands/doctor.test.ts +294 -0
- package/src/commands/doctor.ts +213 -0
- package/src/commands/errors.test.ts +647 -0
- package/src/commands/errors.ts +248 -0
- package/src/commands/feed.test.ts +578 -0
- package/src/commands/feed.ts +361 -0
- package/src/commands/group.test.ts +262 -0
- package/src/commands/group.ts +511 -0
- package/src/commands/hooks.test.ts +458 -0
- package/src/commands/hooks.ts +253 -0
- package/src/commands/init.test.ts +347 -0
- package/src/commands/init.ts +650 -0
- package/src/commands/inspect.test.ts +670 -0
- package/src/commands/inspect.ts +431 -0
- package/src/commands/log.test.ts +1454 -0
- package/src/commands/log.ts +724 -0
- package/src/commands/logs.test.ts +379 -0
- package/src/commands/logs.ts +546 -0
- package/src/commands/mail.test.ts +1270 -0
- package/src/commands/mail.ts +771 -0
- package/src/commands/merge.test.ts +670 -0
- package/src/commands/merge.ts +355 -0
- package/src/commands/metrics.test.ts +444 -0
- package/src/commands/metrics.ts +143 -0
- package/src/commands/monitor.test.ts +191 -0
- package/src/commands/monitor.ts +390 -0
- package/src/commands/nudge.test.ts +230 -0
- package/src/commands/nudge.ts +372 -0
- package/src/commands/prime.test.ts +470 -0
- package/src/commands/prime.ts +381 -0
- package/src/commands/replay.test.ts +741 -0
- package/src/commands/replay.ts +360 -0
- package/src/commands/run.test.ts +431 -0
- package/src/commands/run.ts +351 -0
- package/src/commands/sling.test.ts +657 -0
- package/src/commands/sling.ts +661 -0
- package/src/commands/spec.test.ts +203 -0
- package/src/commands/spec.ts +168 -0
- package/src/commands/status.test.ts +430 -0
- package/src/commands/status.ts +398 -0
- package/src/commands/stop.test.ts +420 -0
- package/src/commands/stop.ts +151 -0
- package/src/commands/supervisor.test.ts +187 -0
- package/src/commands/supervisor.ts +535 -0
- package/src/commands/trace.test.ts +745 -0
- package/src/commands/trace.ts +325 -0
- package/src/commands/watch.test.ts +145 -0
- package/src/commands/watch.ts +247 -0
- package/src/commands/worktree.test.ts +786 -0
- package/src/commands/worktree.ts +311 -0
- package/src/config.test.ts +822 -0
- package/src/config.ts +829 -0
- package/src/doctor/agents.test.ts +454 -0
- package/src/doctor/agents.ts +396 -0
- package/src/doctor/config-check.test.ts +190 -0
- package/src/doctor/config-check.ts +183 -0
- package/src/doctor/consistency.test.ts +651 -0
- package/src/doctor/consistency.ts +294 -0
- package/src/doctor/databases.test.ts +290 -0
- package/src/doctor/databases.ts +218 -0
- package/src/doctor/dependencies.test.ts +184 -0
- package/src/doctor/dependencies.ts +175 -0
- package/src/doctor/logs.test.ts +251 -0
- package/src/doctor/logs.ts +295 -0
- package/src/doctor/merge-queue.test.ts +216 -0
- package/src/doctor/merge-queue.ts +144 -0
- package/src/doctor/structure.test.ts +291 -0
- package/src/doctor/structure.ts +198 -0
- package/src/doctor/types.ts +37 -0
- package/src/doctor/version.test.ts +136 -0
- package/src/doctor/version.ts +129 -0
- package/src/e2e/init-sling-lifecycle.test.ts +277 -0
- package/src/errors.ts +217 -0
- package/src/events/store.test.ts +660 -0
- package/src/events/store.ts +369 -0
- package/src/events/tool-filter.test.ts +330 -0
- package/src/events/tool-filter.ts +126 -0
- package/src/index.ts +316 -0
- package/src/insights/analyzer.test.ts +466 -0
- package/src/insights/analyzer.ts +203 -0
- package/src/logging/color.test.ts +142 -0
- package/src/logging/color.ts +71 -0
- package/src/logging/logger.test.ts +813 -0
- package/src/logging/logger.ts +266 -0
- package/src/logging/reporter.test.ts +259 -0
- package/src/logging/reporter.ts +109 -0
- package/src/logging/sanitizer.test.ts +190 -0
- package/src/logging/sanitizer.ts +57 -0
- package/src/mail/broadcast.test.ts +203 -0
- package/src/mail/broadcast.ts +92 -0
- package/src/mail/client.test.ts +773 -0
- package/src/mail/client.ts +223 -0
- package/src/mail/store.test.ts +705 -0
- package/src/mail/store.ts +387 -0
- package/src/merge/queue.test.ts +359 -0
- package/src/merge/queue.ts +231 -0
- package/src/merge/resolver.test.ts +1345 -0
- package/src/merge/resolver.ts +645 -0
- package/src/metrics/store.test.ts +667 -0
- package/src/metrics/store.ts +445 -0
- package/src/metrics/summary.test.ts +398 -0
- package/src/metrics/summary.ts +178 -0
- package/src/metrics/transcript.test.ts +356 -0
- package/src/metrics/transcript.ts +175 -0
- package/src/mulch/client.test.ts +671 -0
- package/src/mulch/client.ts +332 -0
- package/src/sessions/compat.test.ts +280 -0
- package/src/sessions/compat.ts +104 -0
- package/src/sessions/store.test.ts +873 -0
- package/src/sessions/store.ts +494 -0
- package/src/test-helpers.test.ts +124 -0
- package/src/test-helpers.ts +126 -0
- package/src/tracker/beads.ts +56 -0
- package/src/tracker/factory.test.ts +80 -0
- package/src/tracker/factory.ts +64 -0
- package/src/tracker/seeds.ts +182 -0
- package/src/tracker/types.ts +52 -0
- package/src/types.ts +724 -0
- package/src/watchdog/daemon.test.ts +1975 -0
- package/src/watchdog/daemon.ts +671 -0
- package/src/watchdog/health.test.ts +431 -0
- package/src/watchdog/health.ts +264 -0
- package/src/watchdog/triage.test.ts +164 -0
- package/src/watchdog/triage.ts +179 -0
- package/src/worktree/manager.test.ts +439 -0
- package/src/worktree/manager.ts +198 -0
- package/src/worktree/tmux.test.ts +1009 -0
- package/src/worktree/tmux.ts +509 -0
- package/templates/CLAUDE.md.tmpl +89 -0
- package/templates/hooks.json.tmpl +105 -0
- package/templates/overlay.md.tmpl +81 -0
|
@@ -0,0 +1,671 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Tier 0 mechanical process monitoring daemon.
|
|
3
|
+
*
|
|
4
|
+
* Runs on a configurable interval, checking the health of all active agent
|
|
5
|
+
* sessions. Implements progressive nudging for stalled agents instead of
|
|
6
|
+
* immediately escalating to AI triage:
|
|
7
|
+
*
|
|
8
|
+
* Level 0 (warn): Log warning via onHealthCheck callback, no direct action
|
|
9
|
+
* Level 1 (nudge): Send tmux nudge via nudgeAgent()
|
|
10
|
+
* Level 2 (escalate): Invoke Tier 1 AI triage (if tier1Enabled), else skip
|
|
11
|
+
* Level 3 (terminate): Kill tmux session
|
|
12
|
+
*
|
|
13
|
+
* Phase 4 tier numbering:
|
|
14
|
+
* Tier 0 = Mechanical daemon (this file)
|
|
15
|
+
* Tier 1 = Triage agent (triage.ts)
|
|
16
|
+
* Tier 2 = Monitor agent (not yet implemented)
|
|
17
|
+
* Tier 3 = Supervisor monitors (per-project)
|
|
18
|
+
*
|
|
19
|
+
* ZFC Principle: Observable state (tmux alive, pid alive) is the source of
|
|
20
|
+
* truth. See health.ts for the full ZFC documentation.
|
|
21
|
+
*/
|
|
22
|
+
|
|
23
|
+
import { join } from "node:path";
|
|
24
|
+
import { nudgeAgent } from "../commands/nudge.ts";
|
|
25
|
+
import { createEventStore } from "../events/store.ts";
|
|
26
|
+
import { createMulchClient } from "../mulch/client.ts";
|
|
27
|
+
import { openSessionStore } from "../sessions/compat.ts";
|
|
28
|
+
import type { AgentSession, EventStore, HealthCheck } from "../types.ts";
|
|
29
|
+
import { isSessionAlive, killSession } from "../worktree/tmux.ts";
|
|
30
|
+
import { evaluateHealth, transitionState } from "./health.ts";
|
|
31
|
+
import { triageAgent } from "./triage.ts";
|
|
32
|
+
|
|
33
|
+
/** Maximum escalation level (terminate). Escalation levels run 0..3; see executeEscalationAction. */
const MAX_ESCALATION_LEVEL = 3;

/**
 * Persistent agent capabilities that are excluded from run-level completion checks.
 * These agents are long-running and should not count toward "all workers done".
 */
const PERSISTENT_CAPABILITIES = new Set(["coordinator", "monitor"]);
|
|
41
|
+
|
|
42
|
+
/**
|
|
43
|
+
* Record an agent failure to mulch for future reference.
|
|
44
|
+
* Fire-and-forget: never throws, logs errors internally if mulch fails.
|
|
45
|
+
*
|
|
46
|
+
* @param root - Project root directory
|
|
47
|
+
* @param session - The agent session that failed
|
|
48
|
+
* @param reason - Human-readable failure reason
|
|
49
|
+
* @param tier - Which watchdog tier detected the failure (0 or 1)
|
|
50
|
+
* @param triageSuggestion - Optional triage verdict from Tier 1 AI analysis
|
|
51
|
+
*/
|
|
52
|
+
async function recordFailure(
|
|
53
|
+
root: string,
|
|
54
|
+
session: AgentSession,
|
|
55
|
+
reason: string,
|
|
56
|
+
tier: 0 | 1,
|
|
57
|
+
triageSuggestion?: string,
|
|
58
|
+
): Promise<void> {
|
|
59
|
+
try {
|
|
60
|
+
const mulch = createMulchClient(root);
|
|
61
|
+
const tierLabel = tier === 0 ? "Tier 0 (process death)" : "Tier 1 (AI triage)";
|
|
62
|
+
const description = [
|
|
63
|
+
`Agent: ${session.agentName}`,
|
|
64
|
+
`Capability: ${session.capability}`,
|
|
65
|
+
`Failure reason: ${reason}`,
|
|
66
|
+
triageSuggestion ? `Triage suggestion: ${triageSuggestion}` : null,
|
|
67
|
+
`Detected by: ${tierLabel}`,
|
|
68
|
+
]
|
|
69
|
+
.filter((line) => line !== null)
|
|
70
|
+
.join("\n");
|
|
71
|
+
|
|
72
|
+
await mulch.record("agents", {
|
|
73
|
+
type: "failure",
|
|
74
|
+
description,
|
|
75
|
+
tags: ["watchdog", "auto-recorded"],
|
|
76
|
+
evidenceBead: session.beadId || undefined,
|
|
77
|
+
});
|
|
78
|
+
} catch {
|
|
79
|
+
// Fire-and-forget: recording failures must not break the watchdog
|
|
80
|
+
}
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
/**
|
|
84
|
+
* Read the current run ID from current-run.txt, or null if no active run.
|
|
85
|
+
* Async because it uses Bun.file().
|
|
86
|
+
*/
|
|
87
|
+
async function readCurrentRunId(overstoryDir: string): Promise<string | null> {
|
|
88
|
+
const path = join(overstoryDir, "current-run.txt");
|
|
89
|
+
const file = Bun.file(path);
|
|
90
|
+
if (!(await file.exists())) {
|
|
91
|
+
return null;
|
|
92
|
+
}
|
|
93
|
+
try {
|
|
94
|
+
const text = await file.text();
|
|
95
|
+
const trimmed = text.trim();
|
|
96
|
+
return trimmed.length > 0 ? trimmed : null;
|
|
97
|
+
} catch {
|
|
98
|
+
return null;
|
|
99
|
+
}
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
/**
|
|
103
|
+
* Fire-and-forget: record an event to EventStore. Never throws.
|
|
104
|
+
*/
|
|
105
|
+
function recordEvent(
|
|
106
|
+
eventStore: EventStore | null,
|
|
107
|
+
event: {
|
|
108
|
+
runId: string | null;
|
|
109
|
+
agentName: string;
|
|
110
|
+
eventType: "custom" | "mail_sent";
|
|
111
|
+
level: "debug" | "info" | "warn" | "error";
|
|
112
|
+
data: Record<string, unknown>;
|
|
113
|
+
},
|
|
114
|
+
): void {
|
|
115
|
+
if (!eventStore) return;
|
|
116
|
+
try {
|
|
117
|
+
eventStore.insert({
|
|
118
|
+
runId: event.runId,
|
|
119
|
+
agentName: event.agentName,
|
|
120
|
+
sessionId: null,
|
|
121
|
+
eventType: event.eventType,
|
|
122
|
+
toolName: null,
|
|
123
|
+
toolArgs: null,
|
|
124
|
+
toolDurationMs: null,
|
|
125
|
+
level: event.level,
|
|
126
|
+
data: JSON.stringify(event.data),
|
|
127
|
+
});
|
|
128
|
+
} catch {
|
|
129
|
+
// Fire-and-forget: event recording must never break the daemon
|
|
130
|
+
}
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
/**
|
|
134
|
+
* Build a phase-aware completion message based on the capabilities of completed workers.
|
|
135
|
+
*
|
|
136
|
+
* Single-capability batches get targeted messages (e.g. scouts → "Ready for next phase"),
|
|
137
|
+
* while mixed-capability batches get a generic summary with a breakdown.
|
|
138
|
+
*/
|
|
139
|
+
export function buildCompletionMessage(
|
|
140
|
+
workerSessions: readonly AgentSession[],
|
|
141
|
+
runId: string,
|
|
142
|
+
): string {
|
|
143
|
+
const capabilities = new Set(workerSessions.map((s) => s.capability));
|
|
144
|
+
const count = workerSessions.length;
|
|
145
|
+
|
|
146
|
+
if (capabilities.size === 1) {
|
|
147
|
+
if (capabilities.has("scout")) {
|
|
148
|
+
return `[WATCHDOG] All ${count} scout(s) in run ${runId} have completed. Ready for next phase.`;
|
|
149
|
+
}
|
|
150
|
+
if (capabilities.has("builder")) {
|
|
151
|
+
return `[WATCHDOG] All ${count} builder(s) in run ${runId} have completed. Ready for merge/cleanup.`;
|
|
152
|
+
}
|
|
153
|
+
if (capabilities.has("reviewer")) {
|
|
154
|
+
return `[WATCHDOG] All ${count} reviewer(s) in run ${runId} have completed. Reviews done.`;
|
|
155
|
+
}
|
|
156
|
+
if (capabilities.has("lead")) {
|
|
157
|
+
return `[WATCHDOG] All ${count} lead(s) in run ${runId} have completed. Ready for merge/cleanup.`;
|
|
158
|
+
}
|
|
159
|
+
if (capabilities.has("merger")) {
|
|
160
|
+
return `[WATCHDOG] All ${count} merger(s) in run ${runId} have completed. Merges done.`;
|
|
161
|
+
}
|
|
162
|
+
}
|
|
163
|
+
|
|
164
|
+
const breakdown = Array.from(capabilities).sort().join(", ");
|
|
165
|
+
return `[WATCHDOG] All ${count} worker(s) in run ${runId} have completed (${breakdown}). Ready for next steps.`;
|
|
166
|
+
}
|
|
167
|
+
|
|
168
|
+
/**
|
|
169
|
+
* Check if all worker sessions for the active run have completed, and if so,
|
|
170
|
+
* nudge the coordinator. Fire-and-forget: never throws.
|
|
171
|
+
*
|
|
172
|
+
* Deduplication: uses a marker file (run-complete-notified.txt) to prevent
|
|
173
|
+
* repeated nudges for the same run ID.
|
|
174
|
+
*/
|
|
175
|
+
async function checkRunCompletion(ctx: {
|
|
176
|
+
store: { getByRun: (runId: string) => AgentSession[] };
|
|
177
|
+
runId: string;
|
|
178
|
+
overstoryDir: string;
|
|
179
|
+
root: string;
|
|
180
|
+
nudge: (
|
|
181
|
+
projectRoot: string,
|
|
182
|
+
agentName: string,
|
|
183
|
+
message: string,
|
|
184
|
+
force: boolean,
|
|
185
|
+
) => Promise<{ delivered: boolean; reason?: string }>;
|
|
186
|
+
eventStore: EventStore | null;
|
|
187
|
+
}): Promise<void> {
|
|
188
|
+
const { store, runId, overstoryDir, root, nudge, eventStore } = ctx;
|
|
189
|
+
|
|
190
|
+
const runSessions = store.getByRun(runId);
|
|
191
|
+
const workerSessions = runSessions.filter((s) => !PERSISTENT_CAPABILITIES.has(s.capability));
|
|
192
|
+
|
|
193
|
+
if (workerSessions.length === 0) {
|
|
194
|
+
return;
|
|
195
|
+
}
|
|
196
|
+
|
|
197
|
+
const allCompleted = workerSessions.every((s) => s.state === "completed");
|
|
198
|
+
if (!allCompleted) {
|
|
199
|
+
return;
|
|
200
|
+
}
|
|
201
|
+
|
|
202
|
+
// Dedup: check marker file
|
|
203
|
+
const markerPath = join(overstoryDir, "run-complete-notified.txt");
|
|
204
|
+
try {
|
|
205
|
+
const file = Bun.file(markerPath);
|
|
206
|
+
if (await file.exists()) {
|
|
207
|
+
const existing = await file.text();
|
|
208
|
+
if (existing.trim() === runId) {
|
|
209
|
+
return; // Already notified
|
|
210
|
+
}
|
|
211
|
+
}
|
|
212
|
+
} catch {
|
|
213
|
+
// Read failure is non-fatal — proceed with nudge
|
|
214
|
+
}
|
|
215
|
+
|
|
216
|
+
// Nudge the coordinator
|
|
217
|
+
const message = buildCompletionMessage(workerSessions, runId);
|
|
218
|
+
try {
|
|
219
|
+
await nudge(root, "coordinator", message, true);
|
|
220
|
+
} catch {
|
|
221
|
+
// Nudge delivery failure is non-fatal
|
|
222
|
+
}
|
|
223
|
+
|
|
224
|
+
// Record the event
|
|
225
|
+
const capabilitiesArr = Array.from(new Set(workerSessions.map((s) => s.capability))).sort();
|
|
226
|
+
const phase = capabilitiesArr.length === 1 ? capabilitiesArr[0] : "mixed";
|
|
227
|
+
recordEvent(eventStore, {
|
|
228
|
+
runId,
|
|
229
|
+
agentName: "watchdog",
|
|
230
|
+
eventType: "custom",
|
|
231
|
+
level: "info",
|
|
232
|
+
data: {
|
|
233
|
+
type: "run_complete",
|
|
234
|
+
workerCount: workerSessions.length,
|
|
235
|
+
completedAgents: workerSessions.map((s) => s.agentName),
|
|
236
|
+
capabilities: capabilitiesArr,
|
|
237
|
+
phase,
|
|
238
|
+
},
|
|
239
|
+
});
|
|
240
|
+
|
|
241
|
+
// Write dedup marker
|
|
242
|
+
try {
|
|
243
|
+
await Bun.write(markerPath, runId);
|
|
244
|
+
} catch {
|
|
245
|
+
// Marker write failure is non-fatal
|
|
246
|
+
}
|
|
247
|
+
}
|
|
248
|
+
|
|
249
|
+
/** Options shared between startDaemon and runDaemonTick. */
export interface DaemonOptions {
  // Project root directory (contains .overstory/).
  root: string;
  // Time (ms) after which an agent is considered stale.
  staleThresholdMs: number;
  // Time (ms) after which an agent is considered a zombie.
  zombieThresholdMs: number;
  // Time (ms) between progressive nudge stage transitions (runDaemonTick defaults this to 60000).
  nudgeIntervalMs?: number;
  // Whether Tier 1 AI triage is enabled (runDaemonTick defaults this to false).
  tier1Enabled?: boolean;
  // Optional callback invoked with every health check result.
  onHealthCheck?: (check: HealthCheck) => void;
  /** Dependency injection for testing. Uses real implementations when omitted. */
  _tmux?: {
    isSessionAlive: (name: string) => Promise<boolean>;
    killSession: (name: string) => Promise<void>;
  };
  /** Dependency injection for testing. Uses real triageAgent when omitted. */
  _triage?: (options: {
    agentName: string;
    root: string;
    lastActivity: string;
  }) => Promise<"retry" | "terminate" | "extend">;
  /** Dependency injection for testing. Uses real nudgeAgent when omitted. */
  _nudge?: (
    projectRoot: string,
    agentName: string,
    message: string,
    force: boolean,
  ) => Promise<{ delivered: boolean; reason?: string }>;
  /** Dependency injection for testing. Overrides EventStore creation. Pass null to disable event recording. */
  _eventStore?: EventStore | null;
  /** Dependency injection for testing. Uses real recordFailure when omitted. */
  _recordFailure?: (
    root: string,
    session: AgentSession,
    reason: string,
    tier: 0 | 1,
    triageSuggestion?: string,
  ) => Promise<void>;
}
|
|
286
|
+
|
|
287
|
+
/**
|
|
288
|
+
* Start the watchdog daemon that periodically monitors agent health.
|
|
289
|
+
*
|
|
290
|
+
* On each tick:
|
|
291
|
+
* 1. Loads sessions from SessionStore (sessions.db)
|
|
292
|
+
* 2. For each session (including zombies — ZFC requires re-checking observable
|
|
293
|
+
* state), checks tmux liveness and evaluates health
|
|
294
|
+
* 3. For "terminate" actions: kills tmux session immediately
|
|
295
|
+
* 4. For "investigate" actions: surfaces via onHealthCheck, no auto-kill
|
|
296
|
+
* 5. For "escalate" actions: applies progressive nudging based on escalationLevel
|
|
297
|
+
* 6. Persists updated session states back to SessionStore
|
|
298
|
+
*
|
|
299
|
+
* @param options.root - Project root directory (contains .overstory/)
|
|
300
|
+
* @param options.intervalMs - Polling interval in milliseconds
|
|
301
|
+
* @param options.staleThresholdMs - Time after which an agent is considered stale
|
|
302
|
+
* @param options.zombieThresholdMs - Time after which an agent is considered a zombie
|
|
303
|
+
* @param options.nudgeIntervalMs - Time between progressive nudge stage transitions (default 60000)
|
|
304
|
+
* @param options.tier1Enabled - Whether Tier 1 AI triage is enabled (default false)
|
|
305
|
+
* @param options.onHealthCheck - Optional callback for each health check result
|
|
306
|
+
* @returns An object with a `stop` function to halt the daemon
|
|
307
|
+
*/
|
|
308
|
+
export function startDaemon(options: DaemonOptions & { intervalMs: number }): { stop: () => void } {
|
|
309
|
+
const { intervalMs } = options;
|
|
310
|
+
|
|
311
|
+
// Run the first tick immediately, then on interval
|
|
312
|
+
runDaemonTick(options).catch(() => {
|
|
313
|
+
// Swallow errors in the first tick — daemon must not crash
|
|
314
|
+
});
|
|
315
|
+
|
|
316
|
+
const interval = setInterval(() => {
|
|
317
|
+
runDaemonTick(options).catch(() => {
|
|
318
|
+
// Swallow errors in periodic ticks — daemon must not crash
|
|
319
|
+
});
|
|
320
|
+
}, intervalMs);
|
|
321
|
+
|
|
322
|
+
return {
|
|
323
|
+
stop(): void {
|
|
324
|
+
clearInterval(interval);
|
|
325
|
+
},
|
|
326
|
+
};
|
|
327
|
+
}
|
|
328
|
+
|
|
329
|
+
/**
 * Run a single daemon tick. Exported for testing — allows direct invocation
 * of the monitoring logic without starting the interval-based daemon loop.
 *
 * On each tick:
 * 1. Loads sessions from SessionStore (sessions.db)
 * 2. For each non-completed session (including zombies — ZFC requires
 *    re-checking observable state), checks tmux liveness and evaluates health
 * 3. For "terminate" actions: records the failure, kills the tmux session,
 *    and marks the session zombie with escalation tracking reset
 * 4. For "investigate" actions: surfaces via onHealthCheck, no auto-kill
 * 5. For "escalate" actions: applies progressive nudging based on escalationLevel
 * 6. For "none" actions on a previously-stalled session: resets escalation tracking
 * 7. Finally checks run-level completion and notifies the coordinator
 *
 * The SessionStore is always closed; the EventStore is closed only when this
 * function created it (injected stores are caller-owned).
 *
 * @param options - Same options as startDaemon (minus intervalMs)
 */
export async function runDaemonTick(options: DaemonOptions): Promise<void> {
  const {
    root,
    staleThresholdMs,
    zombieThresholdMs,
    nudgeIntervalMs = 60_000,
    tier1Enabled = false,
    onHealthCheck,
  } = options;
  // Resolve injectable dependencies, defaulting to the real implementations.
  const tmux = options._tmux ?? { isSessionAlive, killSession };
  const triage = options._triage ?? triageAgent;
  const nudge = options._nudge ?? nudgeAgent;
  const recordFailureFn = options._recordFailure ?? recordFailure;

  const overstoryDir = join(root, ".overstory");
  const { store } = openSessionStore(overstoryDir);

  // Open EventStore for recording daemon events (fire-and-forget)
  let eventStore: EventStore | null = null;
  let runId: string | null = null;
  // An explicitly injected store (even null) must not be closed by us below.
  const useInjectedEventStore = options._eventStore !== undefined;
  if (useInjectedEventStore) {
    eventStore = options._eventStore ?? null;
  } else {
    try {
      const eventsDbPath = join(overstoryDir, "events.db");
      eventStore = createEventStore(eventsDbPath);
    } catch {
      // EventStore creation failure is non-fatal for the daemon
    }
  }
  try {
    runId = await readCurrentRunId(overstoryDir);
  } catch {
    // Reading run ID failure is non-fatal
  }

  try {
    const thresholds = {
      staleMs: staleThresholdMs,
      zombieMs: zombieThresholdMs,
    };

    const sessions = store.getAll();

    for (const session of sessions) {
      // Skip completed sessions — they are terminal and don't need monitoring
      if (session.state === "completed") {
        continue;
      }

      // ZFC: Don't skip zombies. Re-check tmux liveness on every tick.
      // A zombie with a live tmux session needs investigation, not silence.

      const tmuxAlive = await tmux.isSessionAlive(session.tmuxSession);
      const check = evaluateHealth(session, tmuxAlive, thresholds);

      // Transition state forward only (investigate action holds state)
      const newState = transitionState(session.state, check);
      if (newState !== session.state) {
        store.updateState(session.agentName, newState);
        session.state = newState;
      }

      if (onHealthCheck) {
        onHealthCheck(check);
      }

      if (check.action === "terminate") {
        // Record the failure via mulch (Tier 0 detection)
        const reason = check.reconciliationNote ?? "Process terminated";
        await recordFailureFn(root, session, reason, 0);

        // Kill the tmux session if it's still alive
        if (tmuxAlive) {
          try {
            await tmux.killSession(session.tmuxSession);
          } catch {
            // Session may have died between check and kill — not an error
          }
        }
        store.updateState(session.agentName, "zombie");
        // Reset escalation tracking on terminal state
        store.updateEscalation(session.agentName, 0, null);
        // Keep the in-memory session object consistent with the store.
        session.state = "zombie";
        session.escalationLevel = 0;
        session.stalledSince = null;
      } else if (check.action === "investigate") {
        // ZFC: tmux alive but SessionStore says zombie.
        // Log the conflict but do NOT auto-kill.
        // The onHealthCheck callback surfaces this to the operator.
        // No state change — keep zombie until a human or higher-tier agent decides.
      } else if (check.action === "escalate") {
        // Progressive nudging: increment escalation level based on elapsed time
        // instead of immediately delegating to AI triage.

        // Initialize stalledSince on first escalation detection
        if (session.stalledSince === null) {
          session.stalledSince = new Date().toISOString();
          session.escalationLevel = 0;
          store.updateEscalation(session.agentName, 0, session.stalledSince);
        }

        // Check if enough time has passed to advance to the next escalation level.
        // Levels advance one per nudgeIntervalMs elapsed, capped at MAX_ESCALATION_LEVEL.
        const stalledMs = Date.now() - new Date(session.stalledSince).getTime();
        const expectedLevel = Math.min(
          Math.floor(stalledMs / nudgeIntervalMs),
          MAX_ESCALATION_LEVEL,
        );

        if (expectedLevel > session.escalationLevel) {
          session.escalationLevel = expectedLevel;
          store.updateEscalation(session.agentName, expectedLevel, session.stalledSince);
        }

        // Execute the action for the current escalation level
        const actionResult = await executeEscalationAction({
          session,
          root,
          tmuxAlive,
          tier1Enabled,
          tmux,
          triage,
          nudge,
          eventStore,
          runId,
          recordFailure: recordFailureFn,
        });

        if (actionResult.terminated) {
          store.updateState(session.agentName, "zombie");
          store.updateEscalation(session.agentName, 0, null);
          session.state = "zombie";
          session.escalationLevel = 0;
          session.stalledSince = null;
        }
      } else if (check.action === "none" && session.stalledSince !== null) {
        // Agent recovered — reset escalation tracking
        store.updateEscalation(session.agentName, 0, null);
        session.stalledSince = null;
        session.escalationLevel = 0;
      }
    }

    // === Run-level completion detection ===
    // After monitoring individual sessions, check if the entire run is done.
    if (runId) {
      await checkRunCompletion({
        store,
        runId,
        overstoryDir,
        root,
        nudge,
        eventStore,
      });
    }
  } finally {
    store.close();
    // Close EventStore only if we created it (not injected)
    if (eventStore && !useInjectedEventStore) {
      try {
        eventStore.close();
      } catch {
        // Non-fatal
      }
    }
  }
}
|
|
503
|
+
|
|
504
|
+
/**
 * Execute the escalation action corresponding to the agent's current escalation level.
 *
 * Level 0 (warn): No direct action — onHealthCheck callback already fired above.
 * Level 1 (nudge): Send a tmux nudge to the agent.
 * Level 2 (escalate): Invoke Tier 1 AI triage (if tier1Enabled; skip otherwise).
 * Level 3+ (terminate, the default case): Kill the tmux session.
 *
 * Each level records a corresponding event to the EventStore (fire-and-forget).
 * The caller (runDaemonTick) is responsible for persisting the zombie state
 * when this function reports terminated=true.
 *
 * @returns Object indicating whether the agent was terminated or state changed.
 */
async function executeEscalationAction(ctx: {
  session: AgentSession;
  root: string;
  tmuxAlive: boolean;
  tier1Enabled: boolean;
  tmux: {
    isSessionAlive: (name: string) => Promise<boolean>;
    killSession: (name: string) => Promise<void>;
  };
  triage: (options: {
    agentName: string;
    root: string;
    lastActivity: string;
  }) => Promise<"retry" | "terminate" | "extend">;
  nudge: (
    projectRoot: string,
    agentName: string,
    message: string,
    force: boolean,
  ) => Promise<{ delivered: boolean; reason?: string }>;
  eventStore: EventStore | null;
  runId: string | null;
  recordFailure: (
    root: string,
    session: AgentSession,
    reason: string,
    tier: 0 | 1,
    triageSuggestion?: string,
  ) => Promise<void>;
}): Promise<{ terminated: boolean; stateChanged: boolean }> {
  const {
    session,
    root,
    tmuxAlive,
    tier1Enabled,
    tmux,
    triage,
    nudge,
    eventStore,
    runId,
    recordFailure,
  } = ctx;

  switch (session.escalationLevel) {
    case 0: {
      // Level 0: warn — onHealthCheck callback already fired, no direct action
      recordEvent(eventStore, {
        runId,
        agentName: session.agentName,
        eventType: "custom",
        level: "warn",
        data: { type: "escalation", escalationLevel: 0, action: "warn" },
      });
      return { terminated: false, stateChanged: false };
    }

    case 1: {
      // Level 1: nudge — send a tmux nudge to the agent
      let delivered = false;
      try {
        const result = await nudge(
          root,
          session.agentName,
          `[WATCHDOG] Agent "${session.agentName}" appears stalled. Please check your current task and report status.`,
          true, // force — skip debounce for watchdog nudges
        );
        delivered = result.delivered;
      } catch {
        // Nudge delivery failure is non-fatal for the watchdog
      }
      // Record the nudge outcome (delivered=false also covers thrown errors).
      recordEvent(eventStore, {
        runId,
        agentName: session.agentName,
        eventType: "custom",
        level: "warn",
        data: { type: "nudge", escalationLevel: 1, delivered },
      });
      return { terminated: false, stateChanged: false };
    }

    case 2: {
      // Level 2: escalate — invoke Tier 1 AI triage if enabled
      if (!tier1Enabled) {
        // Tier 1 disabled — skip triage, progressive nudging continues to level 3
        return { terminated: false, stateChanged: false };
      }

      const verdict = await triage({
        agentName: session.agentName,
        root,
        lastActivity: session.lastActivity,
      });

      recordEvent(eventStore, {
        runId,
        agentName: session.agentName,
        eventType: "custom",
        level: "warn",
        data: { type: "triage", escalationLevel: 2, verdict },
      });

      if (verdict === "terminate") {
        // Record the failure via mulch (Tier 1 AI triage)
        await recordFailure(root, session, "AI triage classified as terminal failure", 1, verdict);

        if (tmuxAlive) {
          try {
            await tmux.killSession(session.tmuxSession);
          } catch {
            // Session may have died — not an error
          }
        }
        return { terminated: true, stateChanged: true };
      }

      if (verdict === "retry") {
        // Send a nudge with a recovery message
        try {
          await nudge(
            root,
            session.agentName,
            "[WATCHDOG] Triage suggests recovery is possible. " +
              "Please retry your current operation or check for errors.",
            true, // force — skip debounce
          );
        } catch {
          // Nudge delivery failure is non-fatal
        }
      }

      // "retry" (after nudge) and "extend" leave the session running
      return { terminated: false, stateChanged: false };
    }

    default: {
      // Level 3+: terminate — kill the tmux session
      recordEvent(eventStore, {
        runId,
        agentName: session.agentName,
        eventType: "custom",
        level: "error",
        data: { type: "escalation", escalationLevel: 3, action: "terminate" },
      });

      // Record the failure via mulch (Tier 0: progressive escalation to terminal level)
      await recordFailure(root, session, "Progressive escalation reached terminal level", 0);

      if (tmuxAlive) {
        try {
          await tmux.killSession(session.tmuxSession);
        } catch {
          // Session may have died — not an error
        }
      }
      return { terminated: true, stateChanged: true };
    }
  }
}
|