@os-eco/overstory-cli 0.9.3 → 0.10.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +49 -18
- package/agents/builder.md +9 -8
- package/agents/coordinator.md +6 -6
- package/agents/lead.md +98 -82
- package/agents/merger.md +25 -14
- package/agents/reviewer.md +22 -16
- package/agents/scout.md +17 -12
- package/package.json +6 -3
- package/src/agents/capabilities.test.ts +85 -0
- package/src/agents/capabilities.ts +125 -0
- package/src/agents/headless-mail-injector.test.ts +448 -0
- package/src/agents/headless-mail-injector.ts +211 -0
- package/src/agents/headless-prompt.test.ts +102 -0
- package/src/agents/headless-prompt.ts +68 -0
- package/src/agents/hooks-deployer.test.ts +514 -14
- package/src/agents/hooks-deployer.ts +141 -0
- package/src/agents/overlay.test.ts +4 -4
- package/src/agents/overlay.ts +30 -8
- package/src/agents/turn-lock.test.ts +181 -0
- package/src/agents/turn-lock.ts +235 -0
- package/src/agents/turn-runner-dispatch.test.ts +182 -0
- package/src/agents/turn-runner-dispatch.ts +105 -0
- package/src/agents/turn-runner.test.ts +1450 -0
- package/src/agents/turn-runner.ts +1166 -0
- package/src/commands/clean.ts +56 -1
- package/src/commands/completions.test.ts +4 -1
- package/src/commands/coordinator.test.ts +127 -0
- package/src/commands/coordinator.ts +205 -6
- package/src/commands/dashboard.test.ts +188 -0
- package/src/commands/dashboard.ts +13 -3
- package/src/commands/doctor.ts +94 -77
- package/src/commands/group.test.ts +94 -0
- package/src/commands/group.ts +49 -20
- package/src/commands/init.test.ts +8 -0
- package/src/commands/init.ts +8 -1
- package/src/commands/log.test.ts +56 -11
- package/src/commands/log.ts +134 -69
- package/src/commands/mail.test.ts +162 -0
- package/src/commands/mail.ts +64 -9
- package/src/commands/merge.test.ts +112 -1
- package/src/commands/merge.ts +17 -4
- package/src/commands/monitor.ts +2 -1
- package/src/commands/nudge.test.ts +351 -4
- package/src/commands/nudge.ts +356 -34
- package/src/commands/run.test.ts +43 -7
- package/src/commands/serve/build.test.ts +202 -0
- package/src/commands/serve/build.ts +206 -0
- package/src/commands/serve/coordinator-actions.test.ts +339 -0
- package/src/commands/serve/coordinator-actions.ts +408 -0
- package/src/commands/serve/dev.test.ts +168 -0
- package/src/commands/serve/dev.ts +117 -0
- package/src/commands/serve/mail-actions.test.ts +312 -0
- package/src/commands/serve/mail-actions.ts +167 -0
- package/src/commands/serve/rest.test.ts +1323 -0
- package/src/commands/serve/rest.ts +708 -0
- package/src/commands/serve/static.ts +51 -0
- package/src/commands/serve/ws.test.ts +361 -0
- package/src/commands/serve/ws.ts +332 -0
- package/src/commands/serve.test.ts +459 -0
- package/src/commands/serve.ts +565 -0
- package/src/commands/sling.test.ts +85 -1
- package/src/commands/sling.ts +153 -64
- package/src/commands/status.test.ts +9 -0
- package/src/commands/status.ts +12 -4
- package/src/commands/stop.test.ts +174 -1
- package/src/commands/stop.ts +107 -8
- package/src/commands/supervisor.ts +2 -1
- package/src/commands/watch.test.ts +49 -4
- package/src/commands/watch.ts +153 -28
- package/src/commands/worktree.test.ts +319 -3
- package/src/commands/worktree.ts +86 -0
- package/src/config.test.ts +78 -0
- package/src/config.ts +43 -1
- package/src/doctor/consistency.test.ts +106 -0
- package/src/doctor/consistency.ts +50 -3
- package/src/doctor/serve.test.ts +95 -0
- package/src/doctor/serve.ts +86 -0
- package/src/doctor/types.ts +2 -1
- package/src/doctor/watchdog.ts +57 -1
- package/src/events/tailer.test.ts +234 -1
- package/src/events/tailer.ts +90 -0
- package/src/index.ts +53 -6
- package/src/json.ts +29 -0
- package/src/mail/client.ts +15 -2
- package/src/mail/store.test.ts +82 -0
- package/src/mail/store.ts +41 -4
- package/src/merge/lock.test.ts +149 -0
- package/src/merge/lock.ts +140 -0
- package/src/runtimes/__fixtures__/claude-stream-fixture.ts +22 -0
- package/src/runtimes/claude.test.ts +791 -1
- package/src/runtimes/claude.ts +323 -1
- package/src/runtimes/connections.test.ts +141 -1
- package/src/runtimes/connections.ts +73 -4
- package/src/runtimes/headless-connection.test.ts +264 -0
- package/src/runtimes/headless-connection.ts +158 -0
- package/src/runtimes/types.ts +10 -0
- package/src/schema-consistency.test.ts +1 -0
- package/src/sessions/store.test.ts +390 -24
- package/src/sessions/store.ts +184 -19
- package/src/test-setup.test.ts +31 -0
- package/src/test-setup.ts +28 -0
- package/src/types.ts +56 -1
- package/src/utils/pid.test.ts +85 -1
- package/src/utils/pid.ts +86 -1
- package/src/utils/process-scan.test.ts +53 -0
- package/src/utils/process-scan.ts +76 -0
- package/src/watchdog/daemon.test.ts +1520 -411
- package/src/watchdog/daemon.ts +442 -83
- package/src/watchdog/health.test.ts +157 -0
- package/src/watchdog/health.ts +92 -25
- package/src/worktree/process.test.ts +71 -0
- package/src/worktree/process.ts +25 -5
- package/src/worktree/tmux.test.ts +39 -0
- package/src/worktree/tmux.ts +23 -3
- package/templates/CLAUDE.md.tmpl +19 -8
- package/templates/overlay.md.tmpl +3 -2
package/src/watchdog/daemon.ts
CHANGED
|
@@ -21,6 +21,7 @@
|
|
|
21
21
|
*/
|
|
22
22
|
|
|
23
23
|
import { join } from "node:path";
|
|
24
|
+
import { isPersistentCapability } from "../agents/capabilities.ts";
|
|
24
25
|
import { nudgeAgent } from "../commands/nudge.ts";
|
|
25
26
|
import { createEventStore } from "../events/store.ts";
|
|
26
27
|
import {
|
|
@@ -34,7 +35,14 @@ import { createMulchClient } from "../mulch/client.ts";
|
|
|
34
35
|
import { getConnection, removeConnection } from "../runtimes/connections.ts";
|
|
35
36
|
import type { RuntimeConnection } from "../runtimes/types.ts";
|
|
36
37
|
import { openSessionStore } from "../sessions/compat.ts";
|
|
37
|
-
import
|
|
38
|
+
import { createRunStore } from "../sessions/store.ts";
|
|
39
|
+
import type {
|
|
40
|
+
AgentSession,
|
|
41
|
+
EventStore,
|
|
42
|
+
HealthCheck,
|
|
43
|
+
RunStore,
|
|
44
|
+
WorkerDiedPayload,
|
|
45
|
+
} from "../types.ts";
|
|
38
46
|
import { isProcessAlive, isSessionAlive, killProcessTree, killSession } from "../worktree/tmux.ts";
|
|
39
47
|
import { evaluateHealth, transitionState } from "./health.ts";
|
|
40
48
|
import { type TriageResult, triageAgent } from "./triage.ts";
|
|
@@ -42,12 +50,6 @@ import { type TriageResult, triageAgent } from "./triage.ts";
|
|
|
42
50
|
/** Maximum escalation level (terminate). */
|
|
43
51
|
const MAX_ESCALATION_LEVEL = 3;
|
|
44
52
|
|
|
45
|
-
/**
|
|
46
|
-
* Persistent agent capabilities that are excluded from run-level completion checks.
|
|
47
|
-
* These agents are long-running and should not count toward "all workers done".
|
|
48
|
-
*/
|
|
49
|
-
const PERSISTENT_CAPABILITIES = new Set(["coordinator", "orchestrator", "monitor"]);
|
|
50
|
-
|
|
51
53
|
/**
|
|
52
54
|
* Module-level registry of active event tailers for headless agents.
|
|
53
55
|
* Maps agentName → TailerHandle. Persists across daemon ticks so tailers
|
|
@@ -55,6 +57,28 @@ const PERSISTENT_CAPABILITIES = new Set(["coordinator", "orchestrator", "monitor
|
|
|
55
57
|
*/
|
|
56
58
|
const _defaultTailerRegistry: Map<string, TailerHandle> = new Map();
|
|
57
59
|
|
|
60
|
+
/**
 * Dedup bookkeeping for the watchdog's `current-run.txt` defensive-read
 * warnings (overstory-87bf).
 *
 * The watchdog consults `.overstory/current-run.txt` on each tick to gate
 * run-completion checks. When no usable run id comes back — or the id has no
 * row in the runs table — the check is skipped. Without a warning that skip
 * would be invisible; without dedup the warning would repeat every tick. This
 * state records which causes have already been reported so each is logged
 * once and then skipped quietly.
 */
export interface RunIdWarnState {
	/** True once the "current-run.txt missing" warning has been written. */
	missingFileWarned: boolean;
	/** Run ids that were already reported as unknown to the runs table. */
	unknownIds: Set<string>;
}

/**
 * Module-level default warn state. Kept at module scope on purpose so the
 * dedup survives across ticks within a single watchdog process. Tests can
 * substitute their own instance through `DaemonOptions._runIdWarnState`.
 */
const _defaultRunIdWarnState: RunIdWarnState = {
	unknownIds: new Set<string>(),
	missingFileWarned: false,
};
|
|
81
|
+
|
|
58
82
|
/**
|
|
59
83
|
* Record an agent failure to mulch for future reference.
|
|
60
84
|
* Fire-and-forget: never throws, logs errors internally if mulch fails.
|
|
@@ -115,6 +139,56 @@ async function readCurrentRunId(overstoryDir: string): Promise<string | null> {
|
|
|
115
139
|
}
|
|
116
140
|
}
|
|
117
141
|
|
|
142
|
+
/**
 * Defensive lookup of the active run id, used only to gate run-completion
 * checks (overstory-87bf).
 *
 * Outcomes:
 * - `readCurrentRunId` yields `null` → warn once (tracked by
 *   `warnState.missingFileWarned`) and return `null`.
 * - `runStore` is `null` → the id cannot be validated; return it as read.
 * - `runStore.getRun(id)` returns `null` or throws → warn once per id
 *   (tracked by `warnState.unknownIds`) and return `null`.
 * - otherwise → return the id.
 *
 * `readCurrentRunId` itself is left untouched; it still serves the
 * event-recording paths, where a stale id is acceptable as a label.
 *
 * NOTE(review): both warnings say checks are "disabled until restart", but
 * this function re-reads the file on every call — only the warning is
 * latched, not the skip. Confirm whether the wording or the behavior is the
 * intended one.
 *
 * @param overstoryDir - Path of the `.overstory` directory.
 * @param runStore - Store used to validate the id, or `null` to skip validation.
 * @param warnState - Mutable dedup state; updated when a warning is emitted.
 * @returns The run id to check, or `null` when the check should be skipped.
 */
async function resolveRunIdForCompletionCheck(
	overstoryDir: string,
	runStore: RunStore | null,
	warnState: RunIdWarnState,
): Promise<string | null> {
	const candidateId = await readCurrentRunId(overstoryDir);

	// Cause 1: nothing usable could be read from current-run.txt.
	if (candidateId === null) {
		const alreadyWarned = warnState.missingFileWarned;
		warnState.missingFileWarned = true;
		if (!alreadyWarned) {
			process.stderr.write(
				"[WATCHDOG] current-run.txt missing — run-completion checks disabled until restart\n",
			);
		}
		return null;
	}

	// No RunStore (rare — sessions.db failed to open): trust the file so the
	// downstream nudge path still runs, matching the pre-87bf behavior.
	if (runStore === null) {
		return candidateId;
	}

	// A throwing lookup is handled exactly like a missing row.
	let isKnownRun: boolean;
	try {
		isKnownRun = runStore.getRun(candidateId) !== null;
	} catch {
		isKnownRun = false;
	}
	if (isKnownRun) {
		return candidateId;
	}

	// Cause 2: the file names a run the runs table does not contain.
	if (!warnState.unknownIds.has(candidateId)) {
		warnState.unknownIds.add(candidateId);
		process.stderr.write(
			`[WATCHDOG] current-run.txt points to unknown run "${candidateId}" — run-completion checks disabled until restart\n`,
		);
	}
	return null;
}
|
|
191
|
+
|
|
118
192
|
/**
|
|
119
193
|
* Fire-and-forget: record an event to EventStore. Never throws.
|
|
120
194
|
*/
|
|
@@ -147,10 +221,15 @@ function recordEvent(
|
|
|
147
221
|
}
|
|
148
222
|
|
|
149
223
|
/**
|
|
150
|
-
* Build a phase-aware completion message based on the capabilities of
|
|
224
|
+
* Build a phase-aware completion message based on the capabilities of terminal workers.
|
|
151
225
|
*
|
|
152
|
-
*
|
|
153
|
-
*
|
|
226
|
+
* "Terminal" includes both `completed` (clean exit) and `zombie` (watchdog-killed)
|
|
227
|
+
* — see overstory-e130 for why a zombie counts as run-terminal. Single-capability
|
|
228
|
+
* batches get targeted messages (e.g. scouts → "Ready for next phase"), while
|
|
229
|
+
* mixed-capability batches get a generic summary with a breakdown. When any worker
|
|
230
|
+
* died, the verb changes from "have completed" to "have terminated" and the message
|
|
231
|
+
* carries a "(N completed, M zombie)" qualifier so the coordinator does not mistake
|
|
232
|
+
* a partial failure for a clean batch.
|
|
154
233
|
*/
|
|
155
234
|
export function buildCompletionMessage(
|
|
156
235
|
workerSessions: readonly AgentSession[],
|
|
@@ -158,32 +237,41 @@ export function buildCompletionMessage(
|
|
|
158
237
|
): string {
|
|
159
238
|
const capabilities = new Set(workerSessions.map((s) => s.capability));
|
|
160
239
|
const count = workerSessions.length;
|
|
240
|
+
const zombieCount = workerSessions.filter((s) => s.state === "zombie").length;
|
|
241
|
+
const completedCount = count - zombieCount;
|
|
242
|
+
const verb = zombieCount > 0 ? "have terminated" : "have completed";
|
|
243
|
+
const qualifier = zombieCount > 0 ? ` (${completedCount} completed, ${zombieCount} zombie)` : "";
|
|
161
244
|
|
|
162
245
|
if (capabilities.size === 1) {
|
|
163
246
|
if (capabilities.has("scout")) {
|
|
164
|
-
return `[WATCHDOG] All ${count} scout(s) in run ${runId}
|
|
247
|
+
return `[WATCHDOG] All ${count} scout(s) in run ${runId} ${verb}${qualifier}. Ready for next phase.`;
|
|
165
248
|
}
|
|
166
249
|
if (capabilities.has("builder")) {
|
|
167
|
-
return `[WATCHDOG] All ${count} builder(s) in run ${runId}
|
|
250
|
+
return `[WATCHDOG] All ${count} builder(s) in run ${runId} ${verb}${qualifier}. Awaiting lead verification.`;
|
|
168
251
|
}
|
|
169
252
|
if (capabilities.has("reviewer")) {
|
|
170
|
-
return `[WATCHDOG] All ${count} reviewer(s) in run ${runId}
|
|
253
|
+
return `[WATCHDOG] All ${count} reviewer(s) in run ${runId} ${verb}${qualifier}. Reviews done.`;
|
|
171
254
|
}
|
|
172
255
|
if (capabilities.has("lead")) {
|
|
173
|
-
return `[WATCHDOG] All ${count} lead(s) in run ${runId}
|
|
256
|
+
return `[WATCHDOG] All ${count} lead(s) in run ${runId} ${verb}${qualifier}. Ready for merge/cleanup.`;
|
|
174
257
|
}
|
|
175
258
|
if (capabilities.has("merger")) {
|
|
176
|
-
return `[WATCHDOG] All ${count} merger(s) in run ${runId}
|
|
259
|
+
return `[WATCHDOG] All ${count} merger(s) in run ${runId} ${verb}${qualifier}. Merges done.`;
|
|
177
260
|
}
|
|
178
261
|
}
|
|
179
262
|
|
|
180
263
|
const breakdown = Array.from(capabilities).sort().join(", ");
|
|
181
|
-
return `[WATCHDOG] All ${count} worker(s) in run ${runId}
|
|
264
|
+
return `[WATCHDOG] All ${count} worker(s) in run ${runId} ${verb}${qualifier} (${breakdown}). Ready for next steps.`;
|
|
182
265
|
}
|
|
183
266
|
|
|
184
267
|
/**
|
|
185
|
-
* Check if
|
|
186
|
-
* nudge the coordinator. Fire-and-forget:
|
|
268
|
+
* Check if every worker session for the active run has reached a terminal state
|
|
269
|
+
* (`completed` or `zombie`), and if so, nudge the coordinator. Fire-and-forget:
|
|
270
|
+
* never throws.
|
|
271
|
+
*
|
|
272
|
+
* Zombie counts as terminal (overstory-e130): a watchdog-killed worker is not
|
|
273
|
+
* coming back, so excluding it would strand the coordinator on a run that mixes
|
|
274
|
+
* clean exits with kills.
|
|
187
275
|
*
|
|
188
276
|
* Deduplication: uses a marker file (run-complete-notified.txt) to prevent
|
|
189
277
|
* repeated nudges for the same run ID.
|
|
@@ -204,14 +292,17 @@ async function checkRunCompletion(ctx: {
|
|
|
204
292
|
const { store, runId, overstoryDir, root, nudge, eventStore } = ctx;
|
|
205
293
|
|
|
206
294
|
const runSessions = store.getByRun(runId);
|
|
207
|
-
const workerSessions = runSessions.filter((s) => !
|
|
295
|
+
const workerSessions = runSessions.filter((s) => !isPersistentCapability(s.capability));
|
|
208
296
|
|
|
209
297
|
if (workerSessions.length === 0) {
|
|
210
298
|
return;
|
|
211
299
|
}
|
|
212
300
|
|
|
213
|
-
|
|
214
|
-
|
|
301
|
+
// `completed` = clean exit, `zombie` = watchdog-killed. Both are terminal
|
|
302
|
+
// for run-completion: a zombie is not coming back, so blocking on it would
|
|
303
|
+
// strand the coordinator forever (overstory-e130).
|
|
304
|
+
const allTerminal = workerSessions.every((s) => s.state === "completed" || s.state === "zombie");
|
|
305
|
+
if (!allTerminal) {
|
|
215
306
|
return;
|
|
216
307
|
}
|
|
217
308
|
|
|
@@ -240,15 +331,20 @@ async function checkRunCompletion(ctx: {
|
|
|
240
331
|
// Record the event
|
|
241
332
|
const capabilitiesArr = Array.from(new Set(workerSessions.map((s) => s.capability))).sort();
|
|
242
333
|
const phase = capabilitiesArr.length === 1 ? capabilitiesArr[0] : "mixed";
|
|
334
|
+
const completedAgents = workerSessions
|
|
335
|
+
.filter((s) => s.state === "completed")
|
|
336
|
+
.map((s) => s.agentName);
|
|
337
|
+
const zombieAgents = workerSessions.filter((s) => s.state === "zombie").map((s) => s.agentName);
|
|
243
338
|
recordEvent(eventStore, {
|
|
244
339
|
runId,
|
|
245
340
|
agentName: "watchdog",
|
|
246
341
|
eventType: "custom",
|
|
247
|
-
level: "info",
|
|
342
|
+
level: zombieAgents.length > 0 ? "warn" : "info",
|
|
248
343
|
data: {
|
|
249
344
|
type: "run_complete",
|
|
250
345
|
workerCount: workerSessions.length,
|
|
251
|
-
completedAgents
|
|
346
|
+
completedAgents,
|
|
347
|
+
zombieAgents,
|
|
252
348
|
capabilities: capabilitiesArr,
|
|
253
349
|
phase,
|
|
254
350
|
},
|
|
@@ -269,6 +365,13 @@ export interface DaemonOptions {
|
|
|
269
365
|
zombieThresholdMs: number;
|
|
270
366
|
nudgeIntervalMs?: number;
|
|
271
367
|
tier1Enabled?: boolean;
|
|
368
|
+
/**
|
|
369
|
+
* When true (default), the watchdog sends a synthetic `worker_died` mail to
|
|
370
|
+
* `session.parentAgent` the first time it transitions a session to `zombie`
|
|
371
|
+
* (overstory-c111). Without this, the parent — typically a lead waiting for
|
|
372
|
+
* `worker_done` — blocks indefinitely on mail that will never arrive.
|
|
373
|
+
*/
|
|
374
|
+
notifyParentOnDeath?: boolean;
|
|
272
375
|
onHealthCheck?: (check: HealthCheck) => void;
|
|
273
376
|
/** Dependency injection for testing. Uses real implementations when omitted. */
|
|
274
377
|
_tmux?: {
|
|
@@ -317,6 +420,18 @@ export interface DaemonOptions {
|
|
|
317
420
|
_findLatestStdoutLog?: (overstoryDir: string, agentName: string) => Promise<string | null>;
|
|
318
421
|
/** Dependency injection for testing. Overrides MailStore creation for decision gate detection. */
|
|
319
422
|
_mailStore?: MailStore | null;
|
|
423
|
+
/**
|
|
424
|
+
* Dependency injection for testing. Overrides the module-level run-id warning
|
|
425
|
+
* state so each test starts with a clean dedup slate (overstory-87bf).
|
|
426
|
+
*/
|
|
427
|
+
_runIdWarnState?: RunIdWarnState;
|
|
428
|
+
/**
|
|
429
|
+
* Dependency injection for testing. Overrides RunStore creation. When `null`
|
|
430
|
+
* is passed explicitly, run-id validation is skipped (file presence still
|
|
431
|
+
* gates the warning). When omitted, a real RunStore is opened against
|
|
432
|
+
* `.overstory/sessions.db`.
|
|
433
|
+
*/
|
|
434
|
+
_runStore?: RunStore | null;
|
|
320
435
|
}
|
|
321
436
|
|
|
322
437
|
/**
|
|
@@ -369,20 +484,44 @@ export function startDaemon(options: DaemonOptions & { intervalMs: number }): {
|
|
|
369
484
|
/**
|
|
370
485
|
* Kill an agent using the appropriate method based on whether it is headless or TUI.
|
|
371
486
|
*
|
|
372
|
-
*
|
|
373
|
-
*
|
|
487
|
+
* Prefers runtime-agnostic `conn.abort()` when a RuntimeConnection is registered.
|
|
488
|
+
* If abort() succeeds, returns immediately — no PID/tmux kill needed.
|
|
489
|
+
* If abort() throws (e.g. process already exited), falls through to the
|
|
490
|
+
* defense-in-depth path: PID kill for headless agents, tmux kill for TUI agents.
|
|
374
491
|
*
|
|
375
|
-
*
|
|
376
|
-
*
|
|
492
|
+
* Headless agents without a connection (tmuxSession === "" && pid !== null) are
|
|
493
|
+
* killed via PID process tree. TUI agents are killed via their named tmux session
|
|
494
|
+
* (only if tmuxAlive). This prevents the blast-radius bug where killSession("")
|
|
495
|
+
* with tmux prefix matching would kill ALL tmux sessions.
|
|
377
496
|
*/
|
|
378
497
|
async function killAgent(ctx: {
|
|
379
498
|
session: AgentSession;
|
|
380
499
|
tmuxAlive: boolean;
|
|
381
500
|
tmux: { killSession: (name: string) => Promise<void> };
|
|
382
501
|
process: { killTree: (pid: number) => Promise<void> };
|
|
502
|
+
getConnection: (name: string) => RuntimeConnection | undefined;
|
|
503
|
+
removeConnection: (name: string) => void;
|
|
383
504
|
}): Promise<void> {
|
|
384
|
-
const { session, tmuxAlive, tmux, process: proc } = ctx;
|
|
505
|
+
const { session, tmuxAlive, tmux, process: proc, getConnection, removeConnection } = ctx;
|
|
385
506
|
const isHeadless = session.tmuxSession === "" && session.pid !== null;
|
|
507
|
+
|
|
508
|
+
// Prefer runtime-agnostic abort() when a connection is registered.
|
|
509
|
+
const conn = getConnection(session.agentName);
|
|
510
|
+
if (conn) {
|
|
511
|
+
let aborted = false;
|
|
512
|
+
try {
|
|
513
|
+
await conn.abort();
|
|
514
|
+
aborted = true;
|
|
515
|
+
} catch {
|
|
516
|
+
// abort() failure — fall through to defense-in-depth path
|
|
517
|
+
}
|
|
518
|
+
removeConnection(session.agentName);
|
|
519
|
+
if (aborted) {
|
|
520
|
+
return;
|
|
521
|
+
}
|
|
522
|
+
// abort() threw — fall through to PID/tmux kill below as defense-in-depth
|
|
523
|
+
}
|
|
524
|
+
|
|
386
525
|
if (isHeadless && session.pid !== null) {
|
|
387
526
|
try {
|
|
388
527
|
await proc.killTree(session.pid);
|
|
@@ -398,6 +537,70 @@ async function killAgent(ctx: {
|
|
|
398
537
|
}
|
|
399
538
|
}
|
|
400
539
|
|
|
540
|
+
/**
 * Mail a synthetic `worker_died` message to the parent of a session the
 * watchdog has terminated (overstory-c111), then record a `mail_sent` event.
 *
 * Best-effort: returns silently when there is no mail store or the session
 * has no parent, and swallows any error thrown by `mailStore.insert` (in
 * which case no event is recorded either).
 *
 * This function does no deduplication of its own. The visible call site in
 * `runDaemonTick` invokes it only after a successful transition to `zombie`
 * AND only when the session's pre-tick state was not already `zombie`,
 * because the transition matrix permits the idempotent `zombie → zombie`
 * move.
 *
 * @param ctx.session - The terminated worker; the mail is sent in its name.
 * @param ctx.mailStore - Destination store, or `null` to skip notification.
 * @param ctx.reason - Human-readable termination reason.
 * @param ctx.tier - Watchdog tier that performed the kill (0 or 1).
 * @param ctx.eventStore - Event sink for the `mail_sent` record (may be `null`).
 * @param ctx.runId - Run id attached to the recorded event (may be `null`).
 */
function notifyParentOfDeath(ctx: {
	session: AgentSession;
	mailStore: MailStore | null;
	reason: string;
	tier: 0 | 1;
	eventStore: EventStore | null;
	runId: string | null;
}): void {
	const { session: worker, mailStore: mail } = ctx;
	const parent = worker.parentAgent;

	// Nothing to do without a mail store or a parent to address.
	if (mail === null || parent === null) {
		return;
	}

	// Structured copy of the same facts carried in the human-readable body.
	const diedPayload: WorkerDiedPayload = {
		agentName: worker.agentName,
		capability: worker.capability,
		taskId: worker.taskId,
		reason: ctx.reason,
		lastActivity: worker.lastActivity,
		terminatedBy: ctx.tier === 1 ? "tier1" : "tier0",
	};

	try {
		const bodyText = [
			`Worker "${worker.agentName}" (${worker.capability}) on task ${worker.taskId}`,
			`was terminated by the watchdog. Reason: ${ctx.reason}.`,
			`Last activity: ${worker.lastActivity}.`,
			"Decide whether to retry the work, escalate, or report the failure upstream.",
		].join(" ");

		mail.insert({
			// Empty id — presumably assigned by the store; confirm against MailStore.insert.
			id: "",
			from: worker.agentName,
			to: parent,
			subject: `[WATCHDOG] worker_died: ${worker.agentName}`,
			body: bodyText,
			type: "worker_died",
			priority: "high",
			threadId: null,
			payload: JSON.stringify(diedPayload),
		});
	} catch {
		// A mail-send failure must never take the watchdog down.
		return;
	}

	// Only reached when the mail was stored successfully.
	recordEvent(ctx.eventStore, {
		runId: ctx.runId,
		agentName: worker.agentName,
		eventType: "mail_sent",
		level: "warn",
		data: {
			type: "worker_died",
			parent,
			reason: ctx.reason,
			tier: ctx.tier,
		},
	});
}
|
|
603
|
+
|
|
401
604
|
/**
|
|
402
605
|
* Run a single daemon tick. Exported for testing — allows direct invocation
|
|
403
606
|
* of the monitoring logic without starting the interval-based daemon loop.
|
|
@@ -411,6 +614,7 @@ export async function runDaemonTick(options: DaemonOptions): Promise<void> {
|
|
|
411
614
|
zombieThresholdMs,
|
|
412
615
|
nudgeIntervalMs = 60_000,
|
|
413
616
|
tier1Enabled = false,
|
|
617
|
+
notifyParentOnDeath = true,
|
|
414
618
|
onHealthCheck,
|
|
415
619
|
} = options;
|
|
416
620
|
const tmux = options._tmux ?? { isSessionAlive, killSession };
|
|
@@ -425,10 +629,26 @@ export async function runDaemonTick(options: DaemonOptions): Promise<void> {
|
|
|
425
629
|
const findStdoutLog = options._findLatestStdoutLog ?? findLatestStdoutLog;
|
|
426
630
|
const maxTriagePerTick = options._maxTriagePerTick ?? 3;
|
|
427
631
|
const triageCount = { value: 0 };
|
|
632
|
+
const runIdWarnState = options._runIdWarnState ?? _defaultRunIdWarnState;
|
|
428
633
|
|
|
429
634
|
const overstoryDir = join(root, ".overstory");
|
|
430
635
|
const { store } = openSessionStore(overstoryDir);
|
|
431
636
|
|
|
637
|
+
// Open RunStore for run-id validation (overstory-87bf). Sharing sessions.db
|
|
638
|
+
// is intentional — same file, WAL mode covers concurrent reads.
|
|
639
|
+
let runStore: RunStore | null = null;
|
|
640
|
+
let ownRunStore = false;
|
|
641
|
+
if (options._runStore !== undefined) {
|
|
642
|
+
runStore = options._runStore;
|
|
643
|
+
} else {
|
|
644
|
+
try {
|
|
645
|
+
runStore = createRunStore(join(overstoryDir, "sessions.db"));
|
|
646
|
+
ownRunStore = true;
|
|
647
|
+
} catch {
|
|
648
|
+
// RunStore creation failure is non-fatal — id validation is then skipped.
|
|
649
|
+
}
|
|
650
|
+
}
|
|
651
|
+
|
|
432
652
|
// Open MailStore for decision gate detection (fire-and-forget: non-fatal if unavailable)
|
|
433
653
|
let mailStore: MailStore | null = null;
|
|
434
654
|
let ownMailStore = false;
|
|
@@ -474,6 +694,7 @@ export async function runDaemonTick(options: DaemonOptions): Promise<void> {
|
|
|
474
694
|
// Track active headless agents to clean up stale tailers after the loop.
|
|
475
695
|
const activeHeadlessAgents = new Set<string>();
|
|
476
696
|
const eventsDbPath = join(overstoryDir, "events.db");
|
|
697
|
+
const sessionsDbPath = join(overstoryDir, "sessions.db");
|
|
477
698
|
|
|
478
699
|
for (const session of sessions) {
|
|
479
700
|
// Skip completed sessions — they are terminal and don't need monitoring
|
|
@@ -488,7 +709,11 @@ export async function runDaemonTick(options: DaemonOptions): Promise<void> {
|
|
|
488
709
|
// active headless agent that doesn't already have one running.
|
|
489
710
|
// Tailers persist between ticks (module-level registry) so events are
|
|
490
711
|
// continuously written to events.db while the agent is working.
|
|
491
|
-
|
|
712
|
+
//
|
|
713
|
+
// Both long-lived headless (pid !== null) and spawn-per-turn workers
|
|
714
|
+
// (pid === null, overstory-7a34) emit stream-json to stdout.log, so
|
|
715
|
+
// either pattern needs a tailer.
|
|
716
|
+
if (session.tmuxSession === "") {
|
|
492
717
|
activeHeadlessAgents.add(session.agentName);
|
|
493
718
|
if (!tailerRegistry.has(session.agentName)) {
|
|
494
719
|
// Discover the latest stdout.log for this agent and start tailing.
|
|
@@ -499,41 +724,52 @@ export async function runDaemonTick(options: DaemonOptions): Promise<void> {
|
|
|
499
724
|
agentName: session.agentName,
|
|
500
725
|
runId,
|
|
501
726
|
eventsDbPath,
|
|
727
|
+
sessionsDbPath,
|
|
502
728
|
});
|
|
503
729
|
tailerRegistry.set(session.agentName, handle);
|
|
504
730
|
}
|
|
505
731
|
}
|
|
506
732
|
}
|
|
507
733
|
|
|
508
|
-
//
|
|
509
|
-
//
|
|
510
|
-
//
|
|
511
|
-
//
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
// getState() failed or timed out — remove stale connection
|
|
533
|
-
removeConn(session.agentName);
|
|
734
|
+
// === Liveness check ===
|
|
735
|
+
// Prefer RuntimeConnection.getState() when a connection is registered. Fall
|
|
736
|
+
// back to tmux liveness when no connection exists. For headless agents without
|
|
737
|
+
// a connection, use event-based activity detection to refresh lastActivity.
|
|
738
|
+
const conn = getConn(session.agentName);
|
|
739
|
+
let tmuxAlive: boolean;
|
|
740
|
+
|
|
741
|
+
if (conn) {
|
|
742
|
+
try {
|
|
743
|
+
const state = await Promise.race([
|
|
744
|
+
conn.getState(),
|
|
745
|
+
new Promise<never>((_, reject) =>
|
|
746
|
+
setTimeout(() => reject(new Error("getState timed out")), 5000),
|
|
747
|
+
),
|
|
748
|
+
]);
|
|
749
|
+
// Map ConnectionState → liveness:
|
|
750
|
+
// idle | working → alive (running)
|
|
751
|
+
// error → not alive (exited)
|
|
752
|
+
if (state.status === "idle" || state.status === "working") {
|
|
753
|
+
tmuxAlive = true;
|
|
754
|
+
store.updateLastActivity(session.agentName);
|
|
755
|
+
session.lastActivity = new Date().toISOString();
|
|
756
|
+
} else {
|
|
757
|
+
tmuxAlive = false;
|
|
534
758
|
}
|
|
535
|
-
}
|
|
536
|
-
//
|
|
759
|
+
} catch {
|
|
760
|
+
// getState() failed/timed out — drop stale connection, fall back to tmux
|
|
761
|
+
removeConn(session.agentName);
|
|
762
|
+
tmuxAlive = await tmux.isSessionAlive(session.tmuxSession);
|
|
763
|
+
}
|
|
764
|
+
} else {
|
|
765
|
+
tmuxAlive = await tmux.isSessionAlive(session.tmuxSession);
|
|
766
|
+
|
|
767
|
+
// Headless agents without a registered connection: event-based
|
|
768
|
+
// activity detection to avoid false-positive stale. Covers both
|
|
769
|
+
// long-lived headless (e.g. after a process restart) and
|
|
770
|
+
// spawn-per-turn workers between turns where lastActivity is
|
|
771
|
+
// the only liveness signal (overstory-7a34).
|
|
772
|
+
if (session.tmuxSession === "" && eventStore) {
|
|
537
773
|
try {
|
|
538
774
|
const recentEvents = eventStore.getByAgent(session.agentName, {
|
|
539
775
|
since: new Date(Date.now() - staleThresholdMs).toISOString(),
|
|
@@ -548,15 +784,28 @@ export async function runDaemonTick(options: DaemonOptions): Promise<void> {
|
|
|
548
784
|
}
|
|
549
785
|
}
|
|
550
786
|
}
|
|
551
|
-
|
|
552
|
-
const tmuxAlive = await tmux.isSessionAlive(session.tmuxSession);
|
|
553
787
|
const check = evaluateHealth(session, tmuxAlive, thresholds);
|
|
554
788
|
|
|
555
|
-
//
|
|
789
|
+
// Snapshot the pre-tick state so the worker_died notify path can
|
|
790
|
+
// dedupe across re-ticks (overstory-c111). Subsequent `tryTransitionState`
|
|
791
|
+
// calls below mutate session.state, and the matrix allows the idempotent
|
|
792
|
+
// `zombie → zombie` self-transition — both would erase the dedup signal.
|
|
793
|
+
const stateBeforeTick = session.state;
|
|
794
|
+
|
|
795
|
+
// Transition state forward only (investigate action holds state).
|
|
796
|
+
// `transitionState` computes the watchdog's preferred target;
|
|
797
|
+
// `tryTransitionState` is the matrix-guarded CAS — `completed → *`
|
|
798
|
+
// is rejected here so a properly-completed agent cannot be
|
|
799
|
+
// reclassified as zombie by a late watchdog tick (overstory-a993).
|
|
556
800
|
const newState = transitionState(session.state, check);
|
|
557
801
|
if (newState !== session.state) {
|
|
558
|
-
store.
|
|
559
|
-
|
|
802
|
+
const outcome = store.tryTransitionState(session.agentName, newState);
|
|
803
|
+
if (outcome.ok) {
|
|
804
|
+
session.state = newState;
|
|
805
|
+
} else if (outcome.reason === "illegal_transition") {
|
|
806
|
+
// Resync local mirror — another writer settled state durably.
|
|
807
|
+
session.state = outcome.prev;
|
|
808
|
+
}
|
|
560
809
|
}
|
|
561
810
|
|
|
562
811
|
if (onHealthCheck) {
|
|
@@ -568,12 +817,41 @@ export async function runDaemonTick(options: DaemonOptions): Promise<void> {
|
|
|
568
817
|
const reason = check.reconciliationNote ?? "Process terminated";
|
|
569
818
|
await recordFailureFn(root, session, reason, 0);
|
|
570
819
|
|
|
571
|
-
// Kill the agent:
|
|
572
|
-
await killAgent({
|
|
573
|
-
|
|
820
|
+
// Kill the agent: prefer conn.abort(), fall back to PID/tmux
|
|
821
|
+
await killAgent({
|
|
822
|
+
session,
|
|
823
|
+
tmuxAlive,
|
|
824
|
+
tmux,
|
|
825
|
+
process: proc,
|
|
826
|
+
getConnection: getConn,
|
|
827
|
+
removeConnection: removeConn,
|
|
828
|
+
});
|
|
829
|
+
// Matrix-guarded: rejected when state is `completed` so a clean
|
|
830
|
+
// `ov stop` cannot be silently downgraded to zombie by a late
|
|
831
|
+
// watchdog termination (overstory-a993).
|
|
832
|
+
const outcome = store.tryTransitionState(session.agentName, "zombie");
|
|
574
833
|
// Reset escalation tracking on terminal state
|
|
575
834
|
store.updateEscalation(session.agentName, 0, null);
|
|
576
|
-
|
|
835
|
+
if (outcome.ok) {
|
|
836
|
+
session.state = "zombie";
|
|
837
|
+
// First-time zombify: notify parent so it doesn't block on
|
|
838
|
+
// missing `worker_done` mail (overstory-c111). Dedup uses the
|
|
839
|
+
// pre-tick snapshot because the matrix allows the idempotent
|
|
840
|
+
// zombie → zombie transition (both `outcome.ok` and the earlier
|
|
841
|
+
// transitionState call would otherwise mask re-ticks).
|
|
842
|
+
if (notifyParentOnDeath && stateBeforeTick !== "zombie") {
|
|
843
|
+
notifyParentOfDeath({
|
|
844
|
+
session,
|
|
845
|
+
mailStore,
|
|
846
|
+
reason,
|
|
847
|
+
tier: 0,
|
|
848
|
+
eventStore,
|
|
849
|
+
runId,
|
|
850
|
+
});
|
|
851
|
+
}
|
|
852
|
+
} else if (outcome.reason === "illegal_transition") {
|
|
853
|
+
session.state = outcome.prev;
|
|
854
|
+
}
|
|
577
855
|
session.escalationLevel = 0;
|
|
578
856
|
session.stalledSince = null;
|
|
579
857
|
} else if (check.action === "investigate") {
|
|
@@ -581,6 +859,21 @@ export async function runDaemonTick(options: DaemonOptions): Promise<void> {
|
|
|
581
859
|
// Log the conflict but do NOT auto-kill.
|
|
582
860
|
// The onHealthCheck callback surfaces this to the operator.
|
|
583
861
|
// No state change — keep zombie until a human or higher-tier agent decides.
|
|
862
|
+
} else if (check.action === "complete") {
|
|
863
|
+
// ZFC fallback: tmux/pid is gone AND lastActivity is stale —
|
|
864
|
+
// the agent looks like it finished naturally and only the
|
|
865
|
+
// session-end hook missed (overstory-e74b). Mark completed
|
|
866
|
+
// without killing (process is already gone) and without
|
|
867
|
+
// notifying parents of death (this is not a crash).
|
|
868
|
+
const outcome = store.tryTransitionState(session.agentName, "completed");
|
|
869
|
+
if (outcome.ok) {
|
|
870
|
+
session.state = "completed";
|
|
871
|
+
} else if (outcome.reason === "illegal_transition") {
|
|
872
|
+
session.state = outcome.prev;
|
|
873
|
+
}
|
|
874
|
+
store.updateEscalation(session.agentName, 0, null);
|
|
875
|
+
session.escalationLevel = 0;
|
|
876
|
+
session.stalledSince = null;
|
|
584
877
|
} else if (check.action === "escalate") {
|
|
585
878
|
// Decision gate check: if the agent sent a decision_gate message, it is
|
|
586
879
|
// intentionally paused waiting for a human decision — not a stall.
|
|
@@ -635,12 +928,32 @@ export async function runDaemonTick(options: DaemonOptions): Promise<void> {
|
|
|
635
928
|
recordFailure: recordFailureFn,
|
|
636
929
|
triageCount,
|
|
637
930
|
maxTriagePerTick,
|
|
931
|
+
getConnection: getConn,
|
|
932
|
+
removeConnection: removeConn,
|
|
638
933
|
});
|
|
639
934
|
|
|
640
935
|
if (actionResult.terminated) {
|
|
641
|
-
|
|
936
|
+
// Matrix-guarded: completed → zombie is rejected (overstory-a993).
|
|
937
|
+
const outcome = store.tryTransitionState(session.agentName, "zombie");
|
|
642
938
|
store.updateEscalation(session.agentName, 0, null);
|
|
643
|
-
|
|
939
|
+
if (outcome.ok) {
|
|
940
|
+
session.state = "zombie";
|
|
941
|
+
// First-time zombify: notify parent so it doesn't block on
|
|
942
|
+
// missing `worker_done` mail (overstory-c111). Dedup via
|
|
943
|
+
// the pre-tick snapshot — see the terminate branch above.
|
|
944
|
+
if (notifyParentOnDeath && stateBeforeTick !== "zombie") {
|
|
945
|
+
notifyParentOfDeath({
|
|
946
|
+
session,
|
|
947
|
+
mailStore,
|
|
948
|
+
reason: actionResult.deathReason ?? "Watchdog escalation terminated agent",
|
|
949
|
+
tier: actionResult.deathTier ?? 0,
|
|
950
|
+
eventStore,
|
|
951
|
+
runId,
|
|
952
|
+
});
|
|
953
|
+
}
|
|
954
|
+
} else if (outcome.reason === "illegal_transition") {
|
|
955
|
+
session.state = outcome.prev;
|
|
956
|
+
}
|
|
644
957
|
session.escalationLevel = 0;
|
|
645
958
|
session.stalledSince = null;
|
|
646
959
|
}
|
|
@@ -664,10 +977,18 @@ export async function runDaemonTick(options: DaemonOptions): Promise<void> {
|
|
|
664
977
|
|
|
665
978
|
// === Run-level completion detection ===
|
|
666
979
|
// After monitoring individual sessions, check if the entire run is done.
|
|
667
|
-
|
|
980
|
+
// Re-resolve the run id defensively (overstory-87bf): a missing
|
|
981
|
+
// current-run.txt or a stale id (no row in runs table) skips the check
|
|
982
|
+
// and emits one warning per cause for the lifetime of this watchdog.
|
|
983
|
+
const validatedRunId = await resolveRunIdForCompletionCheck(
|
|
984
|
+
overstoryDir,
|
|
985
|
+
runStore,
|
|
986
|
+
runIdWarnState,
|
|
987
|
+
);
|
|
988
|
+
if (validatedRunId) {
|
|
668
989
|
await checkRunCompletion({
|
|
669
990
|
store,
|
|
670
|
-
runId,
|
|
991
|
+
runId: validatedRunId,
|
|
671
992
|
overstoryDir,
|
|
672
993
|
root,
|
|
673
994
|
nudge,
|
|
@@ -692,6 +1013,14 @@ export async function runDaemonTick(options: DaemonOptions): Promise<void> {
|
|
|
692
1013
|
// Non-fatal
|
|
693
1014
|
}
|
|
694
1015
|
}
|
|
1016
|
+
// Close RunStore only if we created it (not injected)
|
|
1017
|
+
if (runStore && ownRunStore) {
|
|
1018
|
+
try {
|
|
1019
|
+
runStore.close();
|
|
1020
|
+
} catch {
|
|
1021
|
+
// Non-fatal
|
|
1022
|
+
}
|
|
1023
|
+
}
|
|
695
1024
|
}
|
|
696
1025
|
}
|
|
697
1026
|
|
|
@@ -741,7 +1070,15 @@ async function executeEscalationAction(ctx: {
|
|
|
741
1070
|
tier: 0 | 1,
|
|
742
1071
|
triageSuggestion?: string,
|
|
743
1072
|
) => Promise<void>;
|
|
744
|
-
|
|
1073
|
+
getConnection: (name: string) => RuntimeConnection | undefined;
|
|
1074
|
+
removeConnection: (name: string) => void;
|
|
1075
|
+
}): Promise<{
|
|
1076
|
+
terminated: boolean;
|
|
1077
|
+
stateChanged: boolean;
|
|
1078
|
+
/** Reason and tier of the termination (only set when `terminated` is true). */
|
|
1079
|
+
deathReason?: string;
|
|
1080
|
+
deathTier?: 0 | 1;
|
|
1081
|
+
}> {
|
|
745
1082
|
const {
|
|
746
1083
|
session,
|
|
747
1084
|
root,
|
|
@@ -756,6 +1093,8 @@ async function executeEscalationAction(ctx: {
|
|
|
756
1093
|
recordFailure,
|
|
757
1094
|
triageCount,
|
|
758
1095
|
maxTriagePerTick,
|
|
1096
|
+
getConnection: getConn,
|
|
1097
|
+
removeConnection: removeConn,
|
|
759
1098
|
} = ctx;
|
|
760
1099
|
|
|
761
1100
|
switch (session.escalationLevel) {
|
|
@@ -832,16 +1171,23 @@ async function executeEscalationAction(ctx: {
|
|
|
832
1171
|
|
|
833
1172
|
if (result.verdict === "terminate") {
|
|
834
1173
|
// Record the failure via mulch (Tier 1 AI triage)
|
|
835
|
-
|
|
836
|
-
|
|
837
|
-
session,
|
|
838
|
-
"AI triage classified as terminal failure",
|
|
839
|
-
1,
|
|
840
|
-
result.verdict,
|
|
841
|
-
);
|
|
1174
|
+
const triageReason = "AI triage classified as terminal failure";
|
|
1175
|
+
await recordFailure(root, session, triageReason, 1, result.verdict);
|
|
842
1176
|
|
|
843
|
-
await killAgent({
|
|
844
|
-
|
|
1177
|
+
await killAgent({
|
|
1178
|
+
session,
|
|
1179
|
+
tmuxAlive,
|
|
1180
|
+
tmux,
|
|
1181
|
+
process: proc,
|
|
1182
|
+
getConnection: getConn,
|
|
1183
|
+
removeConnection: removeConn,
|
|
1184
|
+
});
|
|
1185
|
+
return {
|
|
1186
|
+
terminated: true,
|
|
1187
|
+
stateChanged: true,
|
|
1188
|
+
deathReason: triageReason,
|
|
1189
|
+
deathTier: 1,
|
|
1190
|
+
};
|
|
845
1191
|
}
|
|
846
1192
|
|
|
847
1193
|
if (result.verdict === "retry") {
|
|
@@ -874,10 +1220,23 @@ async function executeEscalationAction(ctx: {
|
|
|
874
1220
|
});
|
|
875
1221
|
|
|
876
1222
|
// Record the failure via mulch (Tier 0: progressive escalation to terminal level)
|
|
877
|
-
|
|
1223
|
+
const escalationReason = "Progressive escalation reached terminal level";
|
|
1224
|
+
await recordFailure(root, session, escalationReason, 0);
|
|
878
1225
|
|
|
879
|
-
await killAgent({
|
|
880
|
-
|
|
1226
|
+
await killAgent({
|
|
1227
|
+
session,
|
|
1228
|
+
tmuxAlive,
|
|
1229
|
+
tmux,
|
|
1230
|
+
process: proc,
|
|
1231
|
+
getConnection: getConn,
|
|
1232
|
+
removeConnection: removeConn,
|
|
1233
|
+
});
|
|
1234
|
+
return {
|
|
1235
|
+
terminated: true,
|
|
1236
|
+
stateChanged: true,
|
|
1237
|
+
deathReason: escalationReason,
|
|
1238
|
+
deathTier: 0,
|
|
1239
|
+
};
|
|
881
1240
|
}
|
|
882
1241
|
}
|
|
883
1242
|
}
|