@os-eco/overstory-cli 0.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (170) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +381 -0
  3. package/agents/builder.md +137 -0
  4. package/agents/coordinator.md +263 -0
  5. package/agents/lead.md +301 -0
  6. package/agents/merger.md +160 -0
  7. package/agents/monitor.md +214 -0
  8. package/agents/reviewer.md +140 -0
  9. package/agents/scout.md +119 -0
  10. package/agents/supervisor.md +423 -0
  11. package/package.json +47 -0
  12. package/src/agents/checkpoint.test.ts +88 -0
  13. package/src/agents/checkpoint.ts +101 -0
  14. package/src/agents/hooks-deployer.test.ts +2040 -0
  15. package/src/agents/hooks-deployer.ts +607 -0
  16. package/src/agents/identity.test.ts +603 -0
  17. package/src/agents/identity.ts +384 -0
  18. package/src/agents/lifecycle.test.ts +196 -0
  19. package/src/agents/lifecycle.ts +183 -0
  20. package/src/agents/manifest.test.ts +746 -0
  21. package/src/agents/manifest.ts +354 -0
  22. package/src/agents/overlay.test.ts +676 -0
  23. package/src/agents/overlay.ts +308 -0
  24. package/src/beads/client.test.ts +217 -0
  25. package/src/beads/client.ts +202 -0
  26. package/src/beads/molecules.test.ts +338 -0
  27. package/src/beads/molecules.ts +198 -0
  28. package/src/commands/agents.test.ts +322 -0
  29. package/src/commands/agents.ts +287 -0
  30. package/src/commands/clean.test.ts +670 -0
  31. package/src/commands/clean.ts +618 -0
  32. package/src/commands/completions.test.ts +342 -0
  33. package/src/commands/completions.ts +887 -0
  34. package/src/commands/coordinator.test.ts +1530 -0
  35. package/src/commands/coordinator.ts +733 -0
  36. package/src/commands/costs.test.ts +1119 -0
  37. package/src/commands/costs.ts +564 -0
  38. package/src/commands/dashboard.test.ts +308 -0
  39. package/src/commands/dashboard.ts +838 -0
  40. package/src/commands/doctor.test.ts +294 -0
  41. package/src/commands/doctor.ts +213 -0
  42. package/src/commands/errors.test.ts +647 -0
  43. package/src/commands/errors.ts +248 -0
  44. package/src/commands/feed.test.ts +578 -0
  45. package/src/commands/feed.ts +361 -0
  46. package/src/commands/group.test.ts +262 -0
  47. package/src/commands/group.ts +511 -0
  48. package/src/commands/hooks.test.ts +458 -0
  49. package/src/commands/hooks.ts +253 -0
  50. package/src/commands/init.test.ts +347 -0
  51. package/src/commands/init.ts +650 -0
  52. package/src/commands/inspect.test.ts +670 -0
  53. package/src/commands/inspect.ts +431 -0
  54. package/src/commands/log.test.ts +1454 -0
  55. package/src/commands/log.ts +724 -0
  56. package/src/commands/logs.test.ts +379 -0
  57. package/src/commands/logs.ts +546 -0
  58. package/src/commands/mail.test.ts +1270 -0
  59. package/src/commands/mail.ts +771 -0
  60. package/src/commands/merge.test.ts +670 -0
  61. package/src/commands/merge.ts +355 -0
  62. package/src/commands/metrics.test.ts +444 -0
  63. package/src/commands/metrics.ts +143 -0
  64. package/src/commands/monitor.test.ts +191 -0
  65. package/src/commands/monitor.ts +390 -0
  66. package/src/commands/nudge.test.ts +230 -0
  67. package/src/commands/nudge.ts +372 -0
  68. package/src/commands/prime.test.ts +470 -0
  69. package/src/commands/prime.ts +381 -0
  70. package/src/commands/replay.test.ts +741 -0
  71. package/src/commands/replay.ts +360 -0
  72. package/src/commands/run.test.ts +431 -0
  73. package/src/commands/run.ts +351 -0
  74. package/src/commands/sling.test.ts +657 -0
  75. package/src/commands/sling.ts +661 -0
  76. package/src/commands/spec.test.ts +203 -0
  77. package/src/commands/spec.ts +168 -0
  78. package/src/commands/status.test.ts +430 -0
  79. package/src/commands/status.ts +398 -0
  80. package/src/commands/stop.test.ts +420 -0
  81. package/src/commands/stop.ts +151 -0
  82. package/src/commands/supervisor.test.ts +187 -0
  83. package/src/commands/supervisor.ts +535 -0
  84. package/src/commands/trace.test.ts +745 -0
  85. package/src/commands/trace.ts +325 -0
  86. package/src/commands/watch.test.ts +145 -0
  87. package/src/commands/watch.ts +247 -0
  88. package/src/commands/worktree.test.ts +786 -0
  89. package/src/commands/worktree.ts +311 -0
  90. package/src/config.test.ts +822 -0
  91. package/src/config.ts +829 -0
  92. package/src/doctor/agents.test.ts +454 -0
  93. package/src/doctor/agents.ts +396 -0
  94. package/src/doctor/config-check.test.ts +190 -0
  95. package/src/doctor/config-check.ts +183 -0
  96. package/src/doctor/consistency.test.ts +651 -0
  97. package/src/doctor/consistency.ts +294 -0
  98. package/src/doctor/databases.test.ts +290 -0
  99. package/src/doctor/databases.ts +218 -0
  100. package/src/doctor/dependencies.test.ts +184 -0
  101. package/src/doctor/dependencies.ts +175 -0
  102. package/src/doctor/logs.test.ts +251 -0
  103. package/src/doctor/logs.ts +295 -0
  104. package/src/doctor/merge-queue.test.ts +216 -0
  105. package/src/doctor/merge-queue.ts +144 -0
  106. package/src/doctor/structure.test.ts +291 -0
  107. package/src/doctor/structure.ts +198 -0
  108. package/src/doctor/types.ts +37 -0
  109. package/src/doctor/version.test.ts +136 -0
  110. package/src/doctor/version.ts +129 -0
  111. package/src/e2e/init-sling-lifecycle.test.ts +277 -0
  112. package/src/errors.ts +217 -0
  113. package/src/events/store.test.ts +660 -0
  114. package/src/events/store.ts +369 -0
  115. package/src/events/tool-filter.test.ts +330 -0
  116. package/src/events/tool-filter.ts +126 -0
  117. package/src/index.ts +316 -0
  118. package/src/insights/analyzer.test.ts +466 -0
  119. package/src/insights/analyzer.ts +203 -0
  120. package/src/logging/color.test.ts +142 -0
  121. package/src/logging/color.ts +71 -0
  122. package/src/logging/logger.test.ts +813 -0
  123. package/src/logging/logger.ts +266 -0
  124. package/src/logging/reporter.test.ts +259 -0
  125. package/src/logging/reporter.ts +109 -0
  126. package/src/logging/sanitizer.test.ts +190 -0
  127. package/src/logging/sanitizer.ts +57 -0
  128. package/src/mail/broadcast.test.ts +203 -0
  129. package/src/mail/broadcast.ts +92 -0
  130. package/src/mail/client.test.ts +773 -0
  131. package/src/mail/client.ts +223 -0
  132. package/src/mail/store.test.ts +705 -0
  133. package/src/mail/store.ts +387 -0
  134. package/src/merge/queue.test.ts +359 -0
  135. package/src/merge/queue.ts +231 -0
  136. package/src/merge/resolver.test.ts +1345 -0
  137. package/src/merge/resolver.ts +645 -0
  138. package/src/metrics/store.test.ts +667 -0
  139. package/src/metrics/store.ts +445 -0
  140. package/src/metrics/summary.test.ts +398 -0
  141. package/src/metrics/summary.ts +178 -0
  142. package/src/metrics/transcript.test.ts +356 -0
  143. package/src/metrics/transcript.ts +175 -0
  144. package/src/mulch/client.test.ts +671 -0
  145. package/src/mulch/client.ts +332 -0
  146. package/src/sessions/compat.test.ts +280 -0
  147. package/src/sessions/compat.ts +104 -0
  148. package/src/sessions/store.test.ts +873 -0
  149. package/src/sessions/store.ts +494 -0
  150. package/src/test-helpers.test.ts +124 -0
  151. package/src/test-helpers.ts +126 -0
  152. package/src/tracker/beads.ts +56 -0
  153. package/src/tracker/factory.test.ts +80 -0
  154. package/src/tracker/factory.ts +64 -0
  155. package/src/tracker/seeds.ts +182 -0
  156. package/src/tracker/types.ts +52 -0
  157. package/src/types.ts +724 -0
  158. package/src/watchdog/daemon.test.ts +1975 -0
  159. package/src/watchdog/daemon.ts +671 -0
  160. package/src/watchdog/health.test.ts +431 -0
  161. package/src/watchdog/health.ts +264 -0
  162. package/src/watchdog/triage.test.ts +164 -0
  163. package/src/watchdog/triage.ts +179 -0
  164. package/src/worktree/manager.test.ts +439 -0
  165. package/src/worktree/manager.ts +198 -0
  166. package/src/worktree/tmux.test.ts +1009 -0
  167. package/src/worktree/tmux.ts +509 -0
  168. package/templates/CLAUDE.md.tmpl +89 -0
  169. package/templates/hooks.json.tmpl +105 -0
  170. package/templates/overlay.md.tmpl +81 -0
@@ -0,0 +1,671 @@
1
+ /**
2
+ * Tier 0 mechanical process monitoring daemon.
3
+ *
4
+ * Runs on a configurable interval, checking the health of all active agent
5
+ * sessions. Implements progressive nudging for stalled agents instead of
6
+ * immediately escalating to AI triage:
7
+ *
8
+ * Level 0 (warn): Log warning via onHealthCheck callback, no direct action
9
+ * Level 1 (nudge): Send tmux nudge via nudgeAgent()
10
+ * Level 2 (escalate): Invoke Tier 1 AI triage (if tier1Enabled), else skip
11
+ * Level 3 (terminate): Kill tmux session
12
+ *
13
+ * Phase 4 tier numbering:
14
+ * Tier 0 = Mechanical daemon (this file)
15
+ * Tier 1 = Triage agent (triage.ts)
16
+ * Tier 2 = Monitor agent (not yet implemented)
17
+ * Tier 3 = Supervisor monitors (per-project)
18
+ *
19
+ * ZFC Principle: Observable state (tmux alive, pid alive) is the source of
20
+ * truth. See health.ts for the full ZFC documentation.
21
+ */
22
+
23
+ import { join } from "node:path";
24
+ import { nudgeAgent } from "../commands/nudge.ts";
25
+ import { createEventStore } from "../events/store.ts";
26
+ import { createMulchClient } from "../mulch/client.ts";
27
+ import { openSessionStore } from "../sessions/compat.ts";
28
+ import type { AgentSession, EventStore, HealthCheck } from "../types.ts";
29
+ import { isSessionAlive, killSession } from "../worktree/tmux.ts";
30
+ import { evaluateHealth, transitionState } from "./health.ts";
31
+ import { triageAgent } from "./triage.ts";
32
+
33
+ /** Maximum escalation level (3 = terminate); runDaemonTick clamps the computed level to this value. */
34
+ const MAX_ESCALATION_LEVEL = 3;
35
+
36
+ /**
37
+ * Persistent agent capabilities that checkRunCompletion filters out before deciding a run is finished.
38
+ * These agents are long-running and should not count toward "all workers done".
39
+ */
40
+ const PERSISTENT_CAPABILITIES = new Set(["coordinator", "monitor"]);
41
+
/**
 * Persist a watchdog-detected agent failure as a mulch record in the "agents" domain.
 *
 * Best-effort by design: everything, including client creation, runs inside a
 * single try block and any error is discarded, so this function never rejects.
 *
 * @param root - Project root directory handed to the mulch client
 * @param session - Session of the agent that failed
 * @param reason - Human-readable failure reason
 * @param tier - Watchdog tier that detected the failure (0 or 1)
 * @param triageSuggestion - Optional Tier 1 triage verdict; omitted from the record when empty
 */
async function recordFailure(
	root: string,
	session: AgentSession,
	reason: string,
	tier: 0 | 1,
	triageSuggestion?: string,
): Promise<void> {
	try {
		const client = createMulchClient(root);

		let detectedBy = "Tier 1 (AI triage)";
		if (tier === 0) {
			detectedBy = "Tier 0 (process death)";
		}

		// Build the description line by line; the triage line is optional.
		const lines: string[] = [
			`Agent: ${session.agentName}`,
			`Capability: ${session.capability}`,
			`Failure reason: ${reason}`,
		];
		if (triageSuggestion) {
			lines.push(`Triage suggestion: ${triageSuggestion}`);
		}
		lines.push(`Detected by: ${detectedBy}`);

		await client.record("agents", {
			type: "failure",
			description: lines.join("\n"),
			tags: ["watchdog", "auto-recorded"],
			// An empty bead id is recorded as "no evidence" rather than as "".
			evidenceBead: session.beadId || undefined,
		});
	} catch {
		// Deliberately ignored: a failed failure-record must not take the watchdog down.
	}
}
82
+
/**
 * Resolve the active run ID stored in `<overstoryDir>/current-run.txt`.
 *
 * Returns null when the file is missing, cannot be read, or contains only
 * whitespace. Async because the file is accessed through Bun.file().
 */
async function readCurrentRunId(overstoryDir: string): Promise<string | null> {
	const runFile = Bun.file(join(overstoryDir, "current-run.txt"));

	const present = await runFile.exists();
	if (!present) {
		return null;
	}

	let contents: string;
	try {
		contents = await runFile.text();
	} catch {
		// Unreadable file is treated the same as "no active run".
		return null;
	}

	const runId = contents.trim();
	return runId.length === 0 ? null : runId;
}
101
+
/**
 * Best-effort insert of a watchdog event into the EventStore.
 *
 * Does nothing when no store is available. Serialization of `data` and the
 * insert itself both happen inside the try block, so this never throws.
 * The session id and all tool-related columns are always written as null.
 */
function recordEvent(
	eventStore: EventStore | null,
	event: {
		runId: string | null;
		agentName: string;
		eventType: "custom" | "mail_sent";
		level: "debug" | "info" | "warn" | "error";
		data: Record<string, unknown>;
	},
): void {
	if (!eventStore) {
		return;
	}

	try {
		const { runId, agentName, eventType, level, data } = event;
		eventStore.insert({
			runId,
			agentName,
			sessionId: null,
			eventType,
			toolName: null,
			toolArgs: null,
			toolDurationMs: null,
			level,
			data: JSON.stringify(data),
		});
	} catch {
		// Deliberately ignored: event recording must never break the daemon.
	}
}
132
+
/**
 * Compose the coordinator-facing "[WATCHDOG] ... have completed" message for a run.
 *
 * When every worker shares one of the known capabilities (scout, builder,
 * reviewer, lead, merger) the message names that capability and the follow-up
 * phase. Any other combination (mixed capabilities, an unrecognized single
 * capability, or no workers at all) yields the generic message with a sorted,
 * comma-separated capability breakdown.
 *
 * @param workerSessions - Completed worker sessions of the run
 * @param runId - Run identifier embedded in the message
 * @returns The message text
 */
export function buildCompletionMessage(
	workerSessions: readonly AgentSession[],
	runId: string,
): string {
	const count = workerSessions.length;
	const distinct = new Set(workerSessions.map((worker) => worker.capability));

	if (distinct.size === 1) {
		const [only] = distinct;
		switch (only) {
			case "scout":
				return `[WATCHDOG] All ${count} scout(s) in run ${runId} have completed. Ready for next phase.`;
			case "builder":
				return `[WATCHDOG] All ${count} builder(s) in run ${runId} have completed. Ready for merge/cleanup.`;
			case "reviewer":
				return `[WATCHDOG] All ${count} reviewer(s) in run ${runId} have completed. Reviews done.`;
			case "lead":
				return `[WATCHDOG] All ${count} lead(s) in run ${runId} have completed. Ready for merge/cleanup.`;
			case "merger":
				return `[WATCHDOG] All ${count} merger(s) in run ${runId} have completed. Merges done.`;
			default:
				// Unrecognized capability: fall through to the generic summary below.
				break;
		}
	}

	const breakdown = Array.from(distinct).sort().join(", ");
	return `[WATCHDOG] All ${count} worker(s) in run ${runId} have completed (${breakdown}). Ready for next steps.`;
}
167
+
/**
 * Check whether every worker session of the active run has completed and, if
 * so, nudge the coordinator once. Fire-and-forget: never throws.
 *
 * Sessions whose capability is in PERSISTENT_CAPABILITIES are ignored. Nothing
 * happens when the run has no worker sessions or when any worker is not in the
 * "completed" state.
 *
 * Deduplication: a marker file (run-complete-notified.txt) holding the run ID
 * prevents repeated nudges for the same run. The marker is written after the
 * nudge attempt regardless of whether delivery succeeded.
 *
 * The session lookup is guarded as well: if the store read fails, completion
 * cannot be determined on this tick, so the function returns quietly and the
 * check is retried on the next tick.
 *
 * @param ctx.store - Session lookup by run ID
 * @param ctx.runId - Active run ID
 * @param ctx.overstoryDir - Directory that holds the dedup marker file
 * @param ctx.root - Project root passed to the nudge function
 * @param ctx.nudge - Function used to notify the "coordinator" agent
 * @param ctx.eventStore - Optional store that receives a "run_complete" event
 */
async function checkRunCompletion(ctx: {
	store: { getByRun: (runId: string) => AgentSession[] };
	runId: string;
	overstoryDir: string;
	root: string;
	nudge: (
		projectRoot: string,
		agentName: string,
		message: string,
		force: boolean,
	) => Promise<{ delivered: boolean; reason?: string }>;
	eventStore: EventStore | null;
}): Promise<void> {
	const { store, runId, overstoryDir, root, nudge, eventStore } = ctx;

	let workerSessions: AgentSession[];
	try {
		workerSessions = store
			.getByRun(runId)
			.filter((s) => !PERSISTENT_CAPABILITIES.has(s.capability));
	} catch {
		// Store read failure is non-fatal — honor the never-throws contract
		return;
	}

	if (workerSessions.length === 0) {
		return;
	}

	const allCompleted = workerSessions.every((s) => s.state === "completed");
	if (!allCompleted) {
		return;
	}

	// Dedup: check marker file
	const markerPath = join(overstoryDir, "run-complete-notified.txt");
	try {
		const file = Bun.file(markerPath);
		if (await file.exists()) {
			const existing = await file.text();
			if (existing.trim() === runId) {
				return; // Already notified
			}
		}
	} catch {
		// Read failure is non-fatal — proceed with nudge
	}

	// Nudge the coordinator
	const message = buildCompletionMessage(workerSessions, runId);
	try {
		await nudge(root, "coordinator", message, true);
	} catch {
		// Nudge delivery failure is non-fatal
	}

	// Record the event
	const capabilitiesArr = Array.from(new Set(workerSessions.map((s) => s.capability))).sort();
	const phase = capabilitiesArr.length === 1 ? capabilitiesArr[0] : "mixed";
	recordEvent(eventStore, {
		runId,
		agentName: "watchdog",
		eventType: "custom",
		level: "info",
		data: {
			type: "run_complete",
			workerCount: workerSessions.length,
			completedAgents: workerSessions.map((s) => s.agentName),
			capabilities: capabilitiesArr,
			phase,
		},
	});

	// Write dedup marker
	try {
		await Bun.write(markerPath, runId);
	} catch {
		// Marker write failure is non-fatal
	}
}
248
+
249
+ /**
+ * Options shared between startDaemon and runDaemonTick.
+ *
+ * Underscore-prefixed members are dependency-injection seams for tests; when
+ * one is omitted, runDaemonTick falls back to the real implementation.
+ */
250
+ export interface DaemonOptions {
251
+ root: string; // Project root; runDaemonTick works under `<root>/.overstory`
252
+ staleThresholdMs: number; // Passed to evaluateHealth as `staleMs`
253
+ zombieThresholdMs: number; // Passed to evaluateHealth as `zombieMs`
254
+ nudgeIntervalMs?: number; // Stall time per escalation level; runDaemonTick defaults it to 60_000
255
+ tier1Enabled?: boolean; // Enables the level-2 triage call; runDaemonTick defaults it to false
256
+ onHealthCheck?: (check: HealthCheck) => void; // Called with the health check of every non-completed session
257
+ /** Dependency injection for testing. Uses the real tmux isSessionAlive/killSession when omitted. */
258
+ _tmux?: {
259
+ isSessionAlive: (name: string) => Promise<boolean>;
260
+ killSession: (name: string) => Promise<void>;
261
+ };
262
+ /** Dependency injection for testing. Uses real triageAgent when omitted. */
263
+ _triage?: (options: {
264
+ agentName: string;
265
+ root: string;
266
+ lastActivity: string;
267
+ }) => Promise<"retry" | "terminate" | "extend">;
268
+ /** Dependency injection for testing. Uses real nudgeAgent when omitted. */
269
+ _nudge?: (
270
+ projectRoot: string,
271
+ agentName: string,
272
+ message: string,
273
+ force: boolean,
274
+ ) => Promise<{ delivered: boolean; reason?: string }>;
275
+ /**
+ * Dependency injection for testing. Overrides EventStore creation: any value
+ * other than undefined is used as-is (null disables event recording), and
+ * runDaemonTick does not close an injected store.
+ */
276
+ _eventStore?: EventStore | null;
277
+ /** Dependency injection for testing. Uses real recordFailure when omitted. */
278
+ _recordFailure?: (
279
+ root: string,
280
+ session: AgentSession,
281
+ reason: string,
282
+ tier: 0 | 1,
283
+ triageSuggestion?: string,
284
+ ) => Promise<void>;
285
+ }
286
+
/**
 * Start the watchdog daemon that periodically monitors agent health.
 *
 * On each tick (see runDaemonTick):
 * 1. Loads sessions from SessionStore (sessions.db)
 * 2. For each session (including zombies — ZFC requires re-checking observable
 *    state), checks tmux liveness and evaluates health
 * 3. For "terminate" actions: kills tmux session immediately
 * 4. For "investigate" actions: surfaces via onHealthCheck, no auto-kill
 * 5. For "escalate" actions: applies progressive nudging based on escalationLevel
 * 6. Persists updated session states back to SessionStore
 *
 * The first tick runs immediately, then one tick is attempted every
 * `intervalMs`. Ticks never overlap: a tick awaits tmux and (optionally) AI
 * triage, which can take longer than the interval, and two concurrent ticks
 * would act on the same sessions twice (duplicate nudges, triage runs or
 * kills). If the previous tick is still in flight when the timer fires, that
 * firing is skipped. Errors from a tick are swallowed so the daemon keeps
 * running.
 *
 * @param options.root - Project root directory (contains .overstory/)
 * @param options.intervalMs - Polling interval in milliseconds
 * @param options.staleThresholdMs - Time after which an agent is considered stale
 * @param options.zombieThresholdMs - Time after which an agent is considered a zombie
 * @param options.nudgeIntervalMs - Time between progressive nudge stage transitions (default 60000)
 * @param options.tier1Enabled - Whether Tier 1 AI triage is enabled (default false)
 * @param options.onHealthCheck - Optional callback for each health check result
 * @returns An object with a `stop` function that cancels future ticks (a tick
 *   already in flight is allowed to finish)
 */
export function startDaemon(options: DaemonOptions & { intervalMs: number }): { stop: () => void } {
	const { intervalMs } = options;

	// True while a tick's promise is unsettled; used to skip overlapping ticks.
	let tickInFlight = false;

	const tick = (): void => {
		if (tickInFlight) {
			return;
		}
		tickInFlight = true;
		runDaemonTick(options)
			.catch(() => {
				// Swallow tick errors — daemon must not crash
			})
			.finally(() => {
				tickInFlight = false;
			});
	};

	// Run the first tick immediately, then on interval
	tick();
	const interval = setInterval(tick, intervalMs);

	return {
		stop(): void {
			clearInterval(interval);
		},
	};
}
328
+
/**
 * Perform exactly one monitoring pass. Exported so tests can drive the
 * monitoring logic directly, without the interval loop of startDaemon.
 *
 * For every session that is not "completed": probe tmux liveness, evaluate
 * health, persist any state transition, report the check through
 * onHealthCheck, then act on the check's action (terminate, investigate,
 * escalate, or clear escalation tracking after recovery). Afterwards, if a
 * run is active, run-level completion is checked. The session store, and
 * the event store when it was created here rather than injected, are closed
 * in a finally block.
 *
 * @param options - Same options as startDaemon (minus intervalMs)
 */
export async function runDaemonTick(options: DaemonOptions): Promise<void> {
	const {
		root,
		staleThresholdMs,
		zombieThresholdMs,
		nudgeIntervalMs = 60_000,
		tier1Enabled = false,
		onHealthCheck,
	} = options;

	// Injected test doubles take precedence over the real collaborators.
	const tmux = options._tmux ?? { isSessionAlive, killSession };
	const triage = options._triage ?? triageAgent;
	const nudge = options._nudge ?? nudgeAgent;
	const recordFailureFn = options._recordFailure ?? recordFailure;

	const overstoryDir = join(root, ".overstory");
	const { store } = openSessionStore(overstoryDir);

	// EventStore is optional: recording daemon events is fire-and-forget.
	const eventStoreInjected = options._eventStore !== undefined;
	let eventStore: EventStore | null = null;
	if (eventStoreInjected) {
		eventStore = options._eventStore ?? null;
	} else {
		try {
			eventStore = createEventStore(join(overstoryDir, "events.db"));
		} catch {
			// Failing to open the EventStore must not stop the daemon.
		}
	}

	let runId: string | null = null;
	try {
		runId = await readCurrentRunId(overstoryDir);
	} catch {
		// An unreadable run ID simply means "no active run" for this tick.
	}

	// Shared bookkeeping for both termination paths: persist the zombie state,
	// clear escalation tracking, and mirror both on the in-memory session.
	const markZombie = (session: AgentSession): void => {
		store.updateState(session.agentName, "zombie");
		store.updateEscalation(session.agentName, 0, null);
		session.state = "zombie";
		session.escalationLevel = 0;
		session.stalledSince = null;
	};

	try {
		const thresholds = { staleMs: staleThresholdMs, zombieMs: zombieThresholdMs };

		for (const session of store.getAll()) {
			// Completed sessions are terminal and need no monitoring.
			if (session.state === "completed") {
				continue;
			}

			// ZFC: zombies are NOT skipped — tmux liveness is re-checked every tick,
			// because a zombie with a live tmux session needs investigation.
			const tmuxAlive = await tmux.isSessionAlive(session.tmuxSession);
			const check = evaluateHealth(session, tmuxAlive, thresholds);

			// Persist the state transition, if any (investigate holds the state).
			const nextState = transitionState(session.state, check);
			if (nextState !== session.state) {
				store.updateState(session.agentName, nextState);
				session.state = nextState;
			}

			if (onHealthCheck) {
				onHealthCheck(check);
			}

			switch (check.action) {
				case "terminate": {
					// Tier 0 detection: record the failure via mulch first.
					const reason = check.reconciliationNote ?? "Process terminated";
					await recordFailureFn(root, session, reason, 0);

					if (tmuxAlive) {
						try {
							await tmux.killSession(session.tmuxSession);
						} catch {
							// The session may have died between check and kill — not an error.
						}
					}
					markZombie(session);
					break;
				}

				case "investigate":
					// ZFC conflict: tmux is alive but the SessionStore says zombie.
					// onHealthCheck has already surfaced it; do NOT auto-kill and keep
					// the state until a human or higher-tier agent decides.
					break;

				case "escalate": {
					// Progressive nudging: the level grows with time spent stalled
					// instead of delegating to AI triage right away.
					let stalledSince = session.stalledSince;
					if (stalledSince === null) {
						// First detection of the stall: start the clock at level 0.
						stalledSince = new Date().toISOString();
						session.stalledSince = stalledSince;
						session.escalationLevel = 0;
						store.updateEscalation(session.agentName, 0, stalledSince);
					}

					// One level per elapsed nudge interval, capped at the maximum.
					const stalledMs = Date.now() - new Date(stalledSince).getTime();
					const expectedLevel = Math.min(
						Math.floor(stalledMs / nudgeIntervalMs),
						MAX_ESCALATION_LEVEL,
					);
					if (expectedLevel > session.escalationLevel) {
						session.escalationLevel = expectedLevel;
						store.updateEscalation(session.agentName, expectedLevel, stalledSince);
					}

					const outcome = await executeEscalationAction({
						session,
						root,
						tmuxAlive,
						tier1Enabled,
						tmux,
						triage,
						nudge,
						eventStore,
						runId,
						recordFailure: recordFailureFn,
					});
					if (outcome.terminated) {
						markZombie(session);
					}
					break;
				}

				case "none":
					if (session.stalledSince !== null) {
						// The agent recovered: clear escalation tracking.
						store.updateEscalation(session.agentName, 0, null);
						session.stalledSince = null;
						session.escalationLevel = 0;
					}
					break;
			}
		}

		// Run-level completion detection, after all sessions were inspected.
		if (runId) {
			await checkRunCompletion({ store, runId, overstoryDir, root, nudge, eventStore });
		}
	} finally {
		store.close();
		// Only close an EventStore that this tick opened itself.
		if (eventStore && !eventStoreInjected) {
			try {
				eventStore.close();
			} catch {
				// Non-fatal
			}
		}
	}
}
503
+
/**
 * Carry out the action that belongs to the session's current escalation level.
 *
 * Level 0 (warn): records a warn event only — onHealthCheck already fired in the caller.
 * Level 1 (nudge): sends a forced nudge to the agent and records whether it was delivered.
 * Level 2 (escalate): with tier1Enabled, asks Tier 1 triage for a verdict —
 *   "terminate" records the failure and kills the session, "retry" sends a
 *   recovery nudge, "extend" does nothing further. Without tier1Enabled the
 *   level is skipped and no event is recorded.
 * Any other level (3+, terminate): records an error event and the failure,
 *   then kills the tmux session.
 *
 * The tmux session is only killed when `tmuxAlive` is true; kill and nudge
 * failures are ignored.
 *
 * @returns `terminated`/`stateChanged` are both true when the agent was
 *   terminated, both false otherwise.
 */
async function executeEscalationAction(ctx: {
	session: AgentSession;
	root: string;
	tmuxAlive: boolean;
	tier1Enabled: boolean;
	tmux: {
		isSessionAlive: (name: string) => Promise<boolean>;
		killSession: (name: string) => Promise<void>;
	};
	triage: (options: {
		agentName: string;
		root: string;
		lastActivity: string;
	}) => Promise<"retry" | "terminate" | "extend">;
	nudge: (
		projectRoot: string,
		agentName: string,
		message: string,
		force: boolean,
	) => Promise<{ delivered: boolean; reason?: string }>;
	eventStore: EventStore | null;
	runId: string | null;
	recordFailure: (
		root: string,
		session: AgentSession,
		reason: string,
		tier: 0 | 1,
		triageSuggestion?: string,
	) => Promise<void>;
}): Promise<{ terminated: boolean; stateChanged: boolean }> {
	const {
		session,
		root,
		tmuxAlive,
		tier1Enabled,
		tmux,
		triage,
		nudge,
		eventStore,
		runId,
		recordFailure,
	} = ctx;
	const agentName = session.agentName;
	const level = session.escalationLevel;

	// Result builders: a fresh object per return, as callers own the value.
	const stillRunning = () => ({ terminated: false, stateChanged: false });
	const wasTerminated = () => ({ terminated: true, stateChanged: true });

	// Every event emitted here is a "custom" event attributed to this agent.
	const logEvent = (severity: "warn" | "error", data: Record<string, unknown>): void => {
		recordEvent(eventStore, { runId, agentName, eventType: "custom", level: severity, data });
	};

	// Kill the tmux session if the caller observed it alive.
	const killIfAlive = async (): Promise<void> => {
		if (!tmuxAlive) {
			return;
		}
		try {
			await tmux.killSession(session.tmuxSession);
		} catch {
			// The session may already be gone — not an error.
		}
	};

	if (level === 0) {
		// Warn only: no direct action against the agent.
		logEvent("warn", { type: "escalation", escalationLevel: 0, action: "warn" });
		return stillRunning();
	}

	if (level === 1) {
		let delivered = false;
		try {
			const outcome = await nudge(
				root,
				agentName,
				`[WATCHDOG] Agent "${agentName}" appears stalled. Please check your current task and report status.`,
				true, // force — watchdog nudges skip the debounce
			);
			delivered = outcome.delivered;
		} catch {
			// A failed nudge is non-fatal for the watchdog.
		}
		logEvent("warn", { type: "nudge", escalationLevel: 1, delivered });
		return stillRunning();
	}

	if (level === 2) {
		if (!tier1Enabled) {
			// Tier 1 is off: skip triage; progressive nudging continues to level 3.
			return stillRunning();
		}

		const verdict = await triage({
			agentName,
			root,
			lastActivity: session.lastActivity,
		});
		logEvent("warn", { type: "triage", escalationLevel: 2, verdict });

		if (verdict === "terminate") {
			// Tier 1 detection: record the failure via mulch, then kill.
			await recordFailure(root, session, "AI triage classified as terminal failure", 1, verdict);
			await killIfAlive();
			return wasTerminated();
		}

		if (verdict === "retry") {
			try {
				await nudge(
					root,
					agentName,
					"[WATCHDOG] Triage suggests recovery is possible. " +
						"Please retry your current operation or check for errors.",
					true, // force — skip the debounce
				);
			} catch {
				// A failed nudge is non-fatal.
			}
		}

		// Both "retry" (after its nudge) and "extend" leave the session running.
		return stillRunning();
	}

	// Level 3 and above: terminate.
	logEvent("error", { type: "escalation", escalationLevel: 3, action: "terminate" });

	// Tier 0: progressive escalation reached the terminal level.
	await recordFailure(root, session, "Progressive escalation reached terminal level", 0);
	await killIfAlive();
	return wasTerminated();
}