@os-eco/overstory-cli 0.9.4 → 0.11.0
This diff shows the changes between publicly available versions of the package as released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
- package/README.md +50 -19
- package/agents/builder.md +19 -9
- package/agents/coordinator.md +6 -6
- package/agents/lead.md +204 -87
- package/agents/merger.md +25 -14
- package/agents/reviewer.md +22 -16
- package/agents/scout.md +17 -12
- package/package.json +6 -3
- package/src/agents/capabilities.test.ts +85 -0
- package/src/agents/capabilities.ts +125 -0
- package/src/agents/headless-mail-injector.test.ts +448 -0
- package/src/agents/headless-mail-injector.ts +219 -0
- package/src/agents/headless-prompt.test.ts +102 -0
- package/src/agents/headless-prompt.ts +68 -0
- package/src/agents/hooks-deployer.test.ts +514 -14
- package/src/agents/hooks-deployer.ts +141 -0
- package/src/agents/mail-poll-detect.test.ts +153 -0
- package/src/agents/mail-poll-detect.ts +73 -0
- package/src/agents/overlay.test.ts +60 -4
- package/src/agents/overlay.ts +63 -8
- package/src/agents/scope-detect.test.ts +190 -0
- package/src/agents/scope-detect.ts +146 -0
- package/src/agents/turn-lock.test.ts +181 -0
- package/src/agents/turn-lock.ts +235 -0
- package/src/agents/turn-runner-dispatch.test.ts +182 -0
- package/src/agents/turn-runner-dispatch.ts +105 -0
- package/src/agents/turn-runner.test.ts +2312 -0
- package/src/agents/turn-runner.ts +1383 -0
- package/src/commands/agents.ts +9 -0
- package/src/commands/clean.ts +54 -0
- package/src/commands/coordinator.test.ts +254 -0
- package/src/commands/coordinator.ts +273 -8
- package/src/commands/dashboard.test.ts +188 -0
- package/src/commands/dashboard.ts +14 -4
- package/src/commands/doctor.ts +3 -1
- package/src/commands/group.test.ts +94 -0
- package/src/commands/group.ts +49 -20
- package/src/commands/init.test.ts +8 -0
- package/src/commands/init.ts +8 -1
- package/src/commands/log.test.ts +187 -11
- package/src/commands/log.ts +171 -71
- package/src/commands/mail.test.ts +162 -0
- package/src/commands/mail.ts +64 -9
- package/src/commands/merge.test.ts +230 -1
- package/src/commands/merge.ts +68 -12
- package/src/commands/nudge.test.ts +351 -4
- package/src/commands/nudge.ts +356 -34
- package/src/commands/run.test.ts +43 -7
- package/src/commands/serve/build.test.ts +202 -0
- package/src/commands/serve/build.ts +206 -0
- package/src/commands/serve/coordinator-actions.test.ts +339 -0
- package/src/commands/serve/coordinator-actions.ts +408 -0
- package/src/commands/serve/dev.test.ts +168 -0
- package/src/commands/serve/dev.ts +117 -0
- package/src/commands/serve/mail-actions.test.ts +312 -0
- package/src/commands/serve/mail-actions.ts +167 -0
- package/src/commands/serve/rest.test.ts +1323 -0
- package/src/commands/serve/rest.ts +708 -0
- package/src/commands/serve/static.ts +51 -0
- package/src/commands/serve/ws.test.ts +361 -0
- package/src/commands/serve/ws.ts +332 -0
- package/src/commands/serve.test.ts +459 -0
- package/src/commands/serve.ts +565 -0
- package/src/commands/sling.test.ts +177 -1
- package/src/commands/sling.ts +243 -71
- package/src/commands/status.test.ts +9 -0
- package/src/commands/status.ts +12 -4
- package/src/commands/stop.test.ts +255 -1
- package/src/commands/stop.ts +107 -8
- package/src/commands/watch.test.ts +43 -0
- package/src/commands/watch.ts +153 -28
- package/src/config.ts +23 -0
- package/src/doctor/consistency.test.ts +106 -0
- package/src/doctor/consistency.ts +48 -1
- package/src/doctor/serve.test.ts +95 -0
- package/src/doctor/serve.ts +86 -0
- package/src/doctor/types.ts +2 -1
- package/src/doctor/watchdog.ts +57 -1
- package/src/events/tailer.test.ts +234 -1
- package/src/events/tailer.ts +90 -0
- package/src/index.ts +57 -6
- package/src/insights/quality-gates.test.ts +141 -0
- package/src/insights/quality-gates.ts +156 -0
- package/src/json.ts +29 -0
- package/src/logging/theme.ts +4 -0
- package/src/mail/client.ts +15 -2
- package/src/mail/store.test.ts +82 -0
- package/src/mail/store.ts +41 -4
- package/src/merge/lock.test.ts +149 -0
- package/src/merge/lock.ts +140 -0
- package/src/merge/predict.test.ts +387 -0
- package/src/merge/predict.ts +249 -0
- package/src/merge/resolver.ts +1 -1
- package/src/mulch/client.ts +3 -3
- package/src/runtimes/__fixtures__/claude-stream-fixture.ts +22 -0
- package/src/runtimes/claude.test.ts +791 -1
- package/src/runtimes/claude.ts +323 -1
- package/src/runtimes/connections.test.ts +141 -1
- package/src/runtimes/connections.ts +73 -4
- package/src/runtimes/headless-connection.test.ts +264 -0
- package/src/runtimes/headless-connection.ts +158 -0
- package/src/runtimes/types.ts +10 -0
- package/src/schema-consistency.test.ts +1 -0
- package/src/sessions/store.test.ts +657 -29
- package/src/sessions/store.ts +286 -23
- package/src/test-setup.test.ts +31 -0
- package/src/test-setup.ts +28 -0
- package/src/types.ts +107 -2
- package/src/utils/pid.test.ts +85 -1
- package/src/utils/pid.ts +86 -1
- package/src/utils/process-scan.test.ts +53 -0
- package/src/utils/process-scan.ts +76 -0
- package/src/watchdog/daemon.test.ts +1607 -376
- package/src/watchdog/daemon.ts +462 -88
- package/src/watchdog/health.test.ts +282 -0
- package/src/watchdog/health.ts +126 -27
- package/src/worktree/manager.test.ts +218 -1
- package/src/worktree/manager.ts +55 -0
- package/src/worktree/process.test.ts +71 -0
- package/src/worktree/process.ts +25 -5
- package/src/worktree/tmux.test.ts +28 -0
- package/src/worktree/tmux.ts +27 -3
- package/templates/CLAUDE.md.tmpl +19 -8
- package/templates/overlay.md.tmpl +5 -2
package/src/watchdog/daemon.ts
CHANGED
|
@@ -21,6 +21,7 @@
|
|
|
21
21
|
*/
|
|
22
22
|
|
|
23
23
|
import { join } from "node:path";
|
|
24
|
+
import { isPersistentCapability } from "../agents/capabilities.ts";
|
|
24
25
|
import { nudgeAgent } from "../commands/nudge.ts";
|
|
25
26
|
import { createEventStore } from "../events/store.ts";
|
|
26
27
|
import {
|
|
@@ -34,7 +35,14 @@ import { createMulchClient } from "../mulch/client.ts";
|
|
|
34
35
|
import { getConnection, removeConnection } from "../runtimes/connections.ts";
|
|
35
36
|
import type { RuntimeConnection } from "../runtimes/types.ts";
|
|
36
37
|
import { openSessionStore } from "../sessions/compat.ts";
|
|
37
|
-
import
|
|
38
|
+
import { createRunStore } from "../sessions/store.ts";
|
|
39
|
+
import type {
|
|
40
|
+
AgentSession,
|
|
41
|
+
EventStore,
|
|
42
|
+
HealthCheck,
|
|
43
|
+
RunStore,
|
|
44
|
+
WorkerDiedPayload,
|
|
45
|
+
} from "../types.ts";
|
|
38
46
|
import { isProcessAlive, isSessionAlive, killProcessTree, killSession } from "../worktree/tmux.ts";
|
|
39
47
|
import { evaluateHealth, transitionState } from "./health.ts";
|
|
40
48
|
import { type TriageResult, triageAgent } from "./triage.ts";
|
|
@@ -42,12 +50,6 @@ import { type TriageResult, triageAgent } from "./triage.ts";
|
|
|
42
50
|
/** Maximum escalation level (terminate). */
|
|
43
51
|
const MAX_ESCALATION_LEVEL = 3;
|
|
44
52
|
|
|
45
|
-
/**
|
|
46
|
-
* Persistent agent capabilities that are excluded from run-level completion checks.
|
|
47
|
-
* These agents are long-running and should not count toward "all workers done".
|
|
48
|
-
*/
|
|
49
|
-
const PERSISTENT_CAPABILITIES = new Set(["coordinator", "orchestrator", "monitor"]);
|
|
50
|
-
|
|
51
53
|
/**
|
|
52
54
|
* Module-level registry of active event tailers for headless agents.
|
|
53
55
|
* Maps agentName → TailerHandle. Persists across daemon ticks so tailers
|
|
@@ -55,6 +57,28 @@ const PERSISTENT_CAPABILITIES = new Set(["coordinator", "orchestrator", "monitor
|
|
|
55
57
|
*/
|
|
56
58
|
const _defaultTailerRegistry: Map<string, TailerHandle> = new Map();
|
|
57
59
|
|
|
60
|
+
/**
|
|
61
|
+
* Per-cause dedup state for `current-run.txt` defensive-read warnings
|
|
62
|
+
* (overstory-87bf). The watchdog reads `.overstory/current-run.txt` once per
|
|
63
|
+
* tick to gate run-completion checks; if the file is missing/empty/unreadable
|
|
64
|
+
* or points to an id with no row in the runs table, the check would silently
|
|
65
|
+
* skip every tick. We log one warning per cause and then continue skipping
|
|
66
|
+
* silently, so an operator can see the run-completion path is wedged without
|
|
67
|
+
* drowning in repeated lines.
|
|
68
|
+
*
|
|
69
|
+
* Module-level by design: warnings should dedupe across ticks within one
|
|
70
|
+
* watchdog process. Overridable via DaemonOptions._runIdWarnState in tests.
|
|
71
|
+
*/
|
|
72
|
+
export interface RunIdWarnState {
|
|
73
|
+
missingFileWarned: boolean;
|
|
74
|
+
unknownIds: Set<string>;
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
const _defaultRunIdWarnState: RunIdWarnState = {
|
|
78
|
+
missingFileWarned: false,
|
|
79
|
+
unknownIds: new Set(),
|
|
80
|
+
};
|
|
81
|
+
|
|
58
82
|
/**
|
|
59
83
|
* Record an agent failure to mulch for future reference.
|
|
60
84
|
* Fire-and-forget: never throws, logs errors internally if mulch fails.
|
|
@@ -115,6 +139,56 @@ async function readCurrentRunId(overstoryDir: string): Promise<string | null> {
|
|
|
115
139
|
}
|
|
116
140
|
}
|
|
117
141
|
|
|
142
|
+
/**
|
|
143
|
+
* Resolve the active run id for run-completion checks, defensively
|
|
144
|
+
* (overstory-87bf). Returns the id only when `current-run.txt` is readable
|
|
145
|
+
* AND points to a row in the runs table. On either failure mode, logs one
|
|
146
|
+
* warning per cause via `warnState` and returns null so the caller can skip
|
|
147
|
+
* the check silently on subsequent ticks.
|
|
148
|
+
*
|
|
149
|
+
* Intentionally narrow: the broader `readCurrentRunId` is unchanged and still
|
|
150
|
+
* powers event-recording paths where a stale id is acceptable as a label.
|
|
151
|
+
*/
|
|
152
|
+
async function resolveRunIdForCompletionCheck(
|
|
153
|
+
overstoryDir: string,
|
|
154
|
+
runStore: RunStore | null,
|
|
155
|
+
warnState: RunIdWarnState,
|
|
156
|
+
): Promise<string | null> {
|
|
157
|
+
const runId = await readCurrentRunId(overstoryDir);
|
|
158
|
+
if (runId === null) {
|
|
159
|
+
if (!warnState.missingFileWarned) {
|
|
160
|
+
warnState.missingFileWarned = true;
|
|
161
|
+
process.stderr.write(
|
|
162
|
+
"[WATCHDOG] current-run.txt missing — run-completion checks disabled until restart\n",
|
|
163
|
+
);
|
|
164
|
+
}
|
|
165
|
+
return null;
|
|
166
|
+
}
|
|
167
|
+
if (runStore === null) {
|
|
168
|
+
// RunStore unavailable (rare — sessions.db open failed). Trust the file
|
|
169
|
+
// and let the downstream nudge path proceed; this is no worse than the
|
|
170
|
+
// pre-87bf behavior.
|
|
171
|
+
return runId;
|
|
172
|
+
}
|
|
173
|
+
let run: ReturnType<RunStore["getRun"]>;
|
|
174
|
+
try {
|
|
175
|
+
run = runStore.getRun(runId);
|
|
176
|
+
} catch {
|
|
177
|
+
// Treat lookup errors as "unknown" — same defensive posture as a missing row.
|
|
178
|
+
run = null;
|
|
179
|
+
}
|
|
180
|
+
if (run === null) {
|
|
181
|
+
if (!warnState.unknownIds.has(runId)) {
|
|
182
|
+
warnState.unknownIds.add(runId);
|
|
183
|
+
process.stderr.write(
|
|
184
|
+
`[WATCHDOG] current-run.txt points to unknown run "${runId}" — run-completion checks disabled until restart\n`,
|
|
185
|
+
);
|
|
186
|
+
}
|
|
187
|
+
return null;
|
|
188
|
+
}
|
|
189
|
+
return runId;
|
|
190
|
+
}
|
|
191
|
+
|
|
118
192
|
/**
|
|
119
193
|
* Fire-and-forget: record an event to EventStore. Never throws.
|
|
120
194
|
*/
|
|
@@ -147,10 +221,15 @@ function recordEvent(
|
|
|
147
221
|
}
|
|
148
222
|
|
|
149
223
|
/**
|
|
150
|
-
* Build a phase-aware completion message based on the capabilities of
|
|
224
|
+
* Build a phase-aware completion message based on the capabilities of terminal workers.
|
|
151
225
|
*
|
|
152
|
-
*
|
|
153
|
-
*
|
|
226
|
+
* "Terminal" includes both `completed` (clean exit) and `zombie` (watchdog-killed)
|
|
227
|
+
* — see overstory-e130 for why a zombie counts as run-terminal. Single-capability
|
|
228
|
+
* batches get targeted messages (e.g. scouts → "Ready for next phase"), while
|
|
229
|
+
* mixed-capability batches get a generic summary with a breakdown. When any worker
|
|
230
|
+
* died, the verb changes from "have completed" to "have terminated" and the message
|
|
231
|
+
* carries a "(N completed, M zombie)" qualifier so the coordinator does not mistake
|
|
232
|
+
* a partial failure for a clean batch.
|
|
154
233
|
*/
|
|
155
234
|
export function buildCompletionMessage(
|
|
156
235
|
workerSessions: readonly AgentSession[],
|
|
@@ -158,32 +237,41 @@ export function buildCompletionMessage(
|
|
|
158
237
|
): string {
|
|
159
238
|
const capabilities = new Set(workerSessions.map((s) => s.capability));
|
|
160
239
|
const count = workerSessions.length;
|
|
240
|
+
const zombieCount = workerSessions.filter((s) => s.state === "zombie").length;
|
|
241
|
+
const completedCount = count - zombieCount;
|
|
242
|
+
const verb = zombieCount > 0 ? "have terminated" : "have completed";
|
|
243
|
+
const qualifier = zombieCount > 0 ? ` (${completedCount} completed, ${zombieCount} zombie)` : "";
|
|
161
244
|
|
|
162
245
|
if (capabilities.size === 1) {
|
|
163
246
|
if (capabilities.has("scout")) {
|
|
164
|
-
return `[WATCHDOG] All ${count} scout(s) in run ${runId}
|
|
247
|
+
return `[WATCHDOG] All ${count} scout(s) in run ${runId} ${verb}${qualifier}. Ready for next phase.`;
|
|
165
248
|
}
|
|
166
249
|
if (capabilities.has("builder")) {
|
|
167
|
-
return `[WATCHDOG] All ${count} builder(s) in run ${runId}
|
|
250
|
+
return `[WATCHDOG] All ${count} builder(s) in run ${runId} ${verb}${qualifier}. Awaiting lead verification.`;
|
|
168
251
|
}
|
|
169
252
|
if (capabilities.has("reviewer")) {
|
|
170
|
-
return `[WATCHDOG] All ${count} reviewer(s) in run ${runId}
|
|
253
|
+
return `[WATCHDOG] All ${count} reviewer(s) in run ${runId} ${verb}${qualifier}. Reviews done.`;
|
|
171
254
|
}
|
|
172
255
|
if (capabilities.has("lead")) {
|
|
173
|
-
return `[WATCHDOG] All ${count} lead(s) in run ${runId}
|
|
256
|
+
return `[WATCHDOG] All ${count} lead(s) in run ${runId} ${verb}${qualifier}. Ready for merge/cleanup.`;
|
|
174
257
|
}
|
|
175
258
|
if (capabilities.has("merger")) {
|
|
176
|
-
return `[WATCHDOG] All ${count} merger(s) in run ${runId}
|
|
259
|
+
return `[WATCHDOG] All ${count} merger(s) in run ${runId} ${verb}${qualifier}. Merges done.`;
|
|
177
260
|
}
|
|
178
261
|
}
|
|
179
262
|
|
|
180
263
|
const breakdown = Array.from(capabilities).sort().join(", ");
|
|
181
|
-
return `[WATCHDOG] All ${count} worker(s) in run ${runId}
|
|
264
|
+
return `[WATCHDOG] All ${count} worker(s) in run ${runId} ${verb}${qualifier} (${breakdown}). Ready for next steps.`;
|
|
182
265
|
}
|
|
183
266
|
|
|
184
267
|
/**
|
|
185
|
-
* Check if
|
|
186
|
-
* nudge the coordinator. Fire-and-forget:
|
|
268
|
+
* Check if every worker session for the active run has reached a terminal state
|
|
269
|
+
* (`completed` or `zombie`), and if so, nudge the coordinator. Fire-and-forget:
|
|
270
|
+
* never throws.
|
|
271
|
+
*
|
|
272
|
+
* Zombie counts as terminal (overstory-e130): a watchdog-killed worker is not
|
|
273
|
+
* coming back, so excluding it would strand the coordinator on a run that mixes
|
|
274
|
+
* clean exits with kills.
|
|
187
275
|
*
|
|
188
276
|
* Deduplication: uses a marker file (run-complete-notified.txt) to prevent
|
|
189
277
|
* repeated nudges for the same run ID.
|
|
@@ -204,14 +292,17 @@ async function checkRunCompletion(ctx: {
|
|
|
204
292
|
const { store, runId, overstoryDir, root, nudge, eventStore } = ctx;
|
|
205
293
|
|
|
206
294
|
const runSessions = store.getByRun(runId);
|
|
207
|
-
const workerSessions = runSessions.filter((s) => !
|
|
295
|
+
const workerSessions = runSessions.filter((s) => !isPersistentCapability(s.capability));
|
|
208
296
|
|
|
209
297
|
if (workerSessions.length === 0) {
|
|
210
298
|
return;
|
|
211
299
|
}
|
|
212
300
|
|
|
213
|
-
|
|
214
|
-
|
|
301
|
+
// `completed` = clean exit, `zombie` = watchdog-killed. Both are terminal
|
|
302
|
+
// for run-completion: a zombie is not coming back, so blocking on it would
|
|
303
|
+
// strand the coordinator forever (overstory-e130).
|
|
304
|
+
const allTerminal = workerSessions.every((s) => s.state === "completed" || s.state === "zombie");
|
|
305
|
+
if (!allTerminal) {
|
|
215
306
|
return;
|
|
216
307
|
}
|
|
217
308
|
|
|
@@ -240,15 +331,20 @@ async function checkRunCompletion(ctx: {
|
|
|
240
331
|
// Record the event
|
|
241
332
|
const capabilitiesArr = Array.from(new Set(workerSessions.map((s) => s.capability))).sort();
|
|
242
333
|
const phase = capabilitiesArr.length === 1 ? capabilitiesArr[0] : "mixed";
|
|
334
|
+
const completedAgents = workerSessions
|
|
335
|
+
.filter((s) => s.state === "completed")
|
|
336
|
+
.map((s) => s.agentName);
|
|
337
|
+
const zombieAgents = workerSessions.filter((s) => s.state === "zombie").map((s) => s.agentName);
|
|
243
338
|
recordEvent(eventStore, {
|
|
244
339
|
runId,
|
|
245
340
|
agentName: "watchdog",
|
|
246
341
|
eventType: "custom",
|
|
247
|
-
level: "info",
|
|
342
|
+
level: zombieAgents.length > 0 ? "warn" : "info",
|
|
248
343
|
data: {
|
|
249
344
|
type: "run_complete",
|
|
250
345
|
workerCount: workerSessions.length,
|
|
251
|
-
completedAgents
|
|
346
|
+
completedAgents,
|
|
347
|
+
zombieAgents,
|
|
252
348
|
capabilities: capabilitiesArr,
|
|
253
349
|
phase,
|
|
254
350
|
},
|
|
@@ -269,6 +365,13 @@ export interface DaemonOptions {
|
|
|
269
365
|
zombieThresholdMs: number;
|
|
270
366
|
nudgeIntervalMs?: number;
|
|
271
367
|
tier1Enabled?: boolean;
|
|
368
|
+
/**
|
|
369
|
+
* When true (default), the watchdog sends a synthetic `worker_died` mail to
|
|
370
|
+
* `session.parentAgent` the first time it transitions a session to `zombie`
|
|
371
|
+
* (overstory-c111). Without this, the parent — typically a lead waiting for
|
|
372
|
+
* `worker_done` — blocks indefinitely on mail that will never arrive.
|
|
373
|
+
*/
|
|
374
|
+
notifyParentOnDeath?: boolean;
|
|
272
375
|
onHealthCheck?: (check: HealthCheck) => void;
|
|
273
376
|
/** Dependency injection for testing. Uses real implementations when omitted. */
|
|
274
377
|
_tmux?: {
|
|
@@ -317,6 +420,18 @@ export interface DaemonOptions {
|
|
|
317
420
|
_findLatestStdoutLog?: (overstoryDir: string, agentName: string) => Promise<string | null>;
|
|
318
421
|
/** Dependency injection for testing. Overrides MailStore creation for decision gate detection. */
|
|
319
422
|
_mailStore?: MailStore | null;
|
|
423
|
+
/**
|
|
424
|
+
* Dependency injection for testing. Overrides the module-level run-id warning
|
|
425
|
+
* state so each test starts with a clean dedup slate (overstory-87bf).
|
|
426
|
+
*/
|
|
427
|
+
_runIdWarnState?: RunIdWarnState;
|
|
428
|
+
/**
|
|
429
|
+
* Dependency injection for testing. Overrides RunStore creation. When `null`
|
|
430
|
+
* is passed explicitly, run-id validation is skipped (file presence still
|
|
431
|
+
* gates the warning). When omitted, a real RunStore is opened against
|
|
432
|
+
* `.overstory/sessions.db`.
|
|
433
|
+
*/
|
|
434
|
+
_runStore?: RunStore | null;
|
|
320
435
|
}
|
|
321
436
|
|
|
322
437
|
/**
|
|
@@ -369,27 +484,66 @@ export function startDaemon(options: DaemonOptions & { intervalMs: number }): {
|
|
|
369
484
|
/**
|
|
370
485
|
* Kill an agent using the appropriate method based on whether it is headless or TUI.
|
|
371
486
|
*
|
|
372
|
-
*
|
|
373
|
-
*
|
|
487
|
+
* Prefers runtime-agnostic `conn.abort()` when a RuntimeConnection is registered.
|
|
488
|
+
* If abort() succeeds, returns immediately — no PID/tmux kill needed.
|
|
489
|
+
* If abort() throws (e.g. process already exited), falls through to the
|
|
490
|
+
* defense-in-depth path below.
|
|
374
491
|
*
|
|
375
|
-
*
|
|
376
|
-
*
|
|
492
|
+
* Branching after abort:
|
|
493
|
+
* - tmuxSession === "" (headless): never call tmux.killSession — an empty `-t`
|
|
494
|
+
* prefix-matches every session in the tmux server, wildcard-killing the entire
|
|
495
|
+
* overstory swarm (overstory-74ce). Branch by pid:
|
|
496
|
+
* - pid !== null → kill the process tree (long-lived headless capability).
|
|
497
|
+
* - pid === null → no-op (spawn-per-turn agent between turns; the in-flight
|
|
498
|
+
* process, if any, was already handled by the abort/connection path).
|
|
499
|
+
* - tmuxSession !== "" (TUI): kill the named tmux session, but only when
|
|
500
|
+
* `tmuxAlive` to avoid spurious "session not found" errors.
|
|
377
501
|
*/
|
|
378
502
|
async function killAgent(ctx: {
|
|
379
503
|
session: AgentSession;
|
|
380
504
|
tmuxAlive: boolean;
|
|
381
505
|
tmux: { killSession: (name: string) => Promise<void> };
|
|
382
506
|
process: { killTree: (pid: number) => Promise<void> };
|
|
507
|
+
getConnection: (name: string) => RuntimeConnection | undefined;
|
|
508
|
+
removeConnection: (name: string) => void;
|
|
383
509
|
}): Promise<void> {
|
|
384
|
-
const { session, tmuxAlive, tmux, process: proc } = ctx;
|
|
385
|
-
|
|
386
|
-
|
|
510
|
+
const { session, tmuxAlive, tmux, process: proc, getConnection, removeConnection } = ctx;
|
|
511
|
+
|
|
512
|
+
// Prefer runtime-agnostic abort() when a connection is registered.
|
|
513
|
+
const conn = getConnection(session.agentName);
|
|
514
|
+
if (conn) {
|
|
515
|
+
let aborted = false;
|
|
387
516
|
try {
|
|
388
|
-
await
|
|
517
|
+
await conn.abort();
|
|
518
|
+
aborted = true;
|
|
389
519
|
} catch {
|
|
390
|
-
//
|
|
520
|
+
// abort() failure — fall through to defense-in-depth path
|
|
521
|
+
}
|
|
522
|
+
removeConnection(session.agentName);
|
|
523
|
+
if (aborted) {
|
|
524
|
+
return;
|
|
525
|
+
}
|
|
526
|
+
// abort() threw — fall through to PID/tmux kill below as defense-in-depth
|
|
527
|
+
}
|
|
528
|
+
|
|
529
|
+
// Headless agents (no tmux session) must never reach tmux.killSession.
|
|
530
|
+
// An empty `-t` argument is prefix-matched and would kill every overstory
|
|
531
|
+
// tmux session in the server (overstory-74ce).
|
|
532
|
+
if (session.tmuxSession === "") {
|
|
533
|
+
if (session.pid !== null) {
|
|
534
|
+
try {
|
|
535
|
+
await proc.killTree(session.pid);
|
|
536
|
+
} catch {
|
|
537
|
+
// Already exited — not an error
|
|
538
|
+
}
|
|
391
539
|
}
|
|
392
|
-
|
|
540
|
+
// pid === null: spawn-per-turn agent between turns. Any in-flight process
|
|
541
|
+
// was handled by abort/connection above. No-op — next dispatch will spawn fresh.
|
|
542
|
+
return;
|
|
543
|
+
}
|
|
544
|
+
|
|
545
|
+
// Named tmux session path (TUI agents).
|
|
546
|
+
if (tmuxAlive) {
|
|
393
547
|
try {
|
|
394
548
|
await tmux.killSession(session.tmuxSession);
|
|
395
549
|
} catch {
|
|
@@ -398,6 +552,70 @@ async function killAgent(ctx: {
|
|
|
398
552
|
}
|
|
399
553
|
}
|
|
400
554
|
|
|
555
|
+
/**
|
|
556
|
+
* Send a synthetic `worker_died` mail to the parent of a watchdog-terminated
|
|
557
|
+
* session (overstory-c111). Fire-and-forget: never throws.
|
|
558
|
+
*
|
|
559
|
+
* Dedup is the caller's responsibility: invoke this only when the session's
|
|
560
|
+
* pre-tick state was not already `zombie`. The transition matrix allows the idempotent
|
|
561
|
+
* `zombie → zombie` self-transition, so `ok: true` alone does not distinguish a first kill from a re-tick.
|
|
562
|
+
*/
|
|
563
|
+
function notifyParentOfDeath(ctx: {
|
|
564
|
+
session: AgentSession;
|
|
565
|
+
mailStore: MailStore | null;
|
|
566
|
+
reason: string;
|
|
567
|
+
tier: 0 | 1;
|
|
568
|
+
eventStore: EventStore | null;
|
|
569
|
+
runId: string | null;
|
|
570
|
+
}): void {
|
|
571
|
+
const { session, mailStore, reason, tier, eventStore, runId } = ctx;
|
|
572
|
+
if (mailStore === null) return;
|
|
573
|
+
if (session.parentAgent === null) return;
|
|
574
|
+
|
|
575
|
+
const payload: WorkerDiedPayload = {
|
|
576
|
+
agentName: session.agentName,
|
|
577
|
+
capability: session.capability,
|
|
578
|
+
taskId: session.taskId,
|
|
579
|
+
reason,
|
|
580
|
+
lastActivity: session.lastActivity,
|
|
581
|
+
terminatedBy: tier === 0 ? "tier0" : "tier1",
|
|
582
|
+
};
|
|
583
|
+
|
|
584
|
+
try {
|
|
585
|
+
mailStore.insert({
|
|
586
|
+
id: "",
|
|
587
|
+
from: session.agentName,
|
|
588
|
+
to: session.parentAgent,
|
|
589
|
+
subject: `[WATCHDOG] worker_died: ${session.agentName}`,
|
|
590
|
+
body:
|
|
591
|
+
`Worker "${session.agentName}" (${session.capability}) on task ${session.taskId} ` +
|
|
592
|
+
`was terminated by the watchdog. Reason: ${reason}. ` +
|
|
593
|
+
`Last activity: ${session.lastActivity}. ` +
|
|
594
|
+
`Decide whether to retry the work, escalate, or report the failure upstream.`,
|
|
595
|
+
type: "worker_died",
|
|
596
|
+
priority: "high",
|
|
597
|
+
threadId: null,
|
|
598
|
+
payload: JSON.stringify(payload),
|
|
599
|
+
});
|
|
600
|
+
} catch {
|
|
601
|
+
// Mail-send failure must never crash the watchdog.
|
|
602
|
+
return;
|
|
603
|
+
}
|
|
604
|
+
|
|
605
|
+
recordEvent(eventStore, {
|
|
606
|
+
runId,
|
|
607
|
+
agentName: session.agentName,
|
|
608
|
+
eventType: "mail_sent",
|
|
609
|
+
level: "warn",
|
|
610
|
+
data: {
|
|
611
|
+
type: "worker_died",
|
|
612
|
+
parent: session.parentAgent,
|
|
613
|
+
reason,
|
|
614
|
+
tier,
|
|
615
|
+
},
|
|
616
|
+
});
|
|
617
|
+
}
|
|
618
|
+
|
|
401
619
|
/**
|
|
402
620
|
* Run a single daemon tick. Exported for testing — allows direct invocation
|
|
403
621
|
* of the monitoring logic without starting the interval-based daemon loop.
|
|
@@ -411,6 +629,7 @@ export async function runDaemonTick(options: DaemonOptions): Promise<void> {
|
|
|
411
629
|
zombieThresholdMs,
|
|
412
630
|
nudgeIntervalMs = 60_000,
|
|
413
631
|
tier1Enabled = false,
|
|
632
|
+
notifyParentOnDeath = true,
|
|
414
633
|
onHealthCheck,
|
|
415
634
|
} = options;
|
|
416
635
|
const tmux = options._tmux ?? { isSessionAlive, killSession };
|
|
@@ -425,10 +644,26 @@ export async function runDaemonTick(options: DaemonOptions): Promise<void> {
|
|
|
425
644
|
const findStdoutLog = options._findLatestStdoutLog ?? findLatestStdoutLog;
|
|
426
645
|
const maxTriagePerTick = options._maxTriagePerTick ?? 3;
|
|
427
646
|
const triageCount = { value: 0 };
|
|
647
|
+
const runIdWarnState = options._runIdWarnState ?? _defaultRunIdWarnState;
|
|
428
648
|
|
|
429
649
|
const overstoryDir = join(root, ".overstory");
|
|
430
650
|
const { store } = openSessionStore(overstoryDir);
|
|
431
651
|
|
|
652
|
+
// Open RunStore for run-id validation (overstory-87bf). Sharing sessions.db
|
|
653
|
+
// is intentional — same file, WAL mode covers concurrent reads.
|
|
654
|
+
let runStore: RunStore | null = null;
|
|
655
|
+
let ownRunStore = false;
|
|
656
|
+
if (options._runStore !== undefined) {
|
|
657
|
+
runStore = options._runStore;
|
|
658
|
+
} else {
|
|
659
|
+
try {
|
|
660
|
+
runStore = createRunStore(join(overstoryDir, "sessions.db"));
|
|
661
|
+
ownRunStore = true;
|
|
662
|
+
} catch {
|
|
663
|
+
// RunStore creation failure is non-fatal — id validation is then skipped.
|
|
664
|
+
}
|
|
665
|
+
}
|
|
666
|
+
|
|
432
667
|
// Open MailStore for decision gate detection (fire-and-forget: non-fatal if unavailable)
|
|
433
668
|
let mailStore: MailStore | null = null;
|
|
434
669
|
let ownMailStore = false;
|
|
@@ -474,6 +709,7 @@ export async function runDaemonTick(options: DaemonOptions): Promise<void> {
|
|
|
474
709
|
// Track active headless agents to clean up stale tailers after the loop.
|
|
475
710
|
const activeHeadlessAgents = new Set<string>();
|
|
476
711
|
const eventsDbPath = join(overstoryDir, "events.db");
|
|
712
|
+
const sessionsDbPath = join(overstoryDir, "sessions.db");
|
|
477
713
|
|
|
478
714
|
for (const session of sessions) {
|
|
479
715
|
// Skip completed sessions — they are terminal and don't need monitoring
|
|
@@ -488,7 +724,11 @@ export async function runDaemonTick(options: DaemonOptions): Promise<void> {
|
|
|
488
724
|
// active headless agent that doesn't already have one running.
|
|
489
725
|
// Tailers persist between ticks (module-level registry) so events are
|
|
490
726
|
// continuously written to events.db while the agent is working.
|
|
491
|
-
|
|
727
|
+
//
|
|
728
|
+
// Both long-lived headless (pid !== null) and spawn-per-turn workers
|
|
729
|
+
// (pid === null, overstory-7a34) emit stream-json to stdout.log, so
|
|
730
|
+
// either pattern needs a tailer.
|
|
731
|
+
if (session.tmuxSession === "") {
|
|
492
732
|
activeHeadlessAgents.add(session.agentName);
|
|
493
733
|
if (!tailerRegistry.has(session.agentName)) {
|
|
494
734
|
// Discover the latest stdout.log for this agent and start tailing.
|
|
@@ -499,41 +739,52 @@ export async function runDaemonTick(options: DaemonOptions): Promise<void> {
|
|
|
499
739
|
agentName: session.agentName,
|
|
500
740
|
runId,
|
|
501
741
|
eventsDbPath,
|
|
742
|
+
sessionsDbPath,
|
|
502
743
|
});
|
|
503
744
|
tailerRegistry.set(session.agentName, handle);
|
|
504
745
|
}
|
|
505
746
|
}
|
|
506
747
|
}
|
|
507
748
|
|
|
508
|
-
//
|
|
509
|
-
//
|
|
510
|
-
//
|
|
511
|
-
//
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
// getState() failed or timed out — remove stale connection
|
|
533
|
-
removeConn(session.agentName);
|
|
749
|
+
// === Liveness check ===
|
|
750
|
+
// Prefer RuntimeConnection.getState() when a connection is registered. Fall
|
|
751
|
+
// back to tmux liveness when no connection exists. For headless agents without
|
|
752
|
+
// a connection, use event-based activity detection to refresh lastActivity.
|
|
753
|
+
const conn = getConn(session.agentName);
|
|
754
|
+
let tmuxAlive: boolean;
|
|
755
|
+
|
|
756
|
+
if (conn) {
|
|
757
|
+
try {
|
|
758
|
+
const state = await Promise.race([
|
|
759
|
+
conn.getState(),
|
|
760
|
+
new Promise<never>((_, reject) =>
|
|
761
|
+
setTimeout(() => reject(new Error("getState timed out")), 5000),
|
|
762
|
+
),
|
|
763
|
+
]);
|
|
764
|
+
// Map ConnectionState → liveness:
|
|
765
|
+
// idle | working → alive (running)
|
|
766
|
+
// error → not alive (exited)
|
|
767
|
+
if (state.status === "idle" || state.status === "working") {
|
|
768
|
+
tmuxAlive = true;
|
|
769
|
+
store.updateLastActivity(session.agentName);
|
|
770
|
+
session.lastActivity = new Date().toISOString();
|
|
771
|
+
} else {
|
|
772
|
+
tmuxAlive = false;
|
|
534
773
|
}
|
|
535
|
-
}
|
|
536
|
-
//
|
|
774
|
+
} catch {
|
|
775
|
+
// getState() failed/timed out — drop stale connection, fall back to tmux
|
|
776
|
+
removeConn(session.agentName);
|
|
777
|
+
tmuxAlive = await tmux.isSessionAlive(session.tmuxSession);
|
|
778
|
+
}
|
|
779
|
+
} else {
|
|
780
|
+
tmuxAlive = await tmux.isSessionAlive(session.tmuxSession);
|
|
781
|
+
|
|
782
|
+
// Headless agents without a registered connection: event-based
|
|
783
|
+
// activity detection to avoid false-positive stale. Covers both
|
|
784
|
+
// long-lived headless (e.g. after a process restart) and
|
|
785
|
+
// spawn-per-turn workers between turns where lastActivity is
|
|
786
|
+
// the only liveness signal (overstory-7a34).
|
|
787
|
+
if (session.tmuxSession === "" && eventStore) {
|
|
537
788
|
try {
|
|
538
789
|
const recentEvents = eventStore.getByAgent(session.agentName, {
|
|
539
790
|
since: new Date(Date.now() - staleThresholdMs).toISOString(),
|
|
@@ -548,15 +799,28 @@ export async function runDaemonTick(options: DaemonOptions): Promise<void> {
|
|
|
548
799
|
}
|
|
549
800
|
}
|
|
550
801
|
}
|
|
551
|
-
|
|
552
|
-
const tmuxAlive = await tmux.isSessionAlive(session.tmuxSession);
|
|
553
802
|
const check = evaluateHealth(session, tmuxAlive, thresholds);
|
|
554
803
|
|
|
555
|
-
//
|
|
804
|
+
// Snapshot the pre-tick state so the worker_died notify path can
|
|
805
|
+
// dedupe across re-ticks (overstory-c111). Subsequent `tryTransitionState`
|
|
806
|
+
// calls below mutate session.state, and the matrix allows the idempotent
|
|
807
|
+
// `zombie → zombie` self-transition — both would erase the dedup signal.
|
|
808
|
+
const stateBeforeTick = session.state;
|
|
809
|
+
|
|
810
|
+
// Transition state forward only (investigate action holds state).
|
|
811
|
+
// `transitionState` computes the watchdog's preferred target;
|
|
812
|
+
// `tryTransitionState` is the matrix-guarded CAS — `completed → *`
|
|
813
|
+
// is rejected here so a properly-completed agent cannot be
|
|
814
|
+
// reclassified as zombie by a late watchdog tick (overstory-a993).
|
|
556
815
|
const newState = transitionState(session.state, check);
|
|
557
816
|
if (newState !== session.state) {
|
|
558
|
-
store.
|
|
559
|
-
|
|
817
|
+
const outcome = store.tryTransitionState(session.agentName, newState);
|
|
818
|
+
if (outcome.ok) {
|
|
819
|
+
session.state = newState;
|
|
820
|
+
} else if (outcome.reason === "illegal_transition") {
|
|
821
|
+
// Resync local mirror — another writer settled state durably.
|
|
822
|
+
session.state = outcome.prev;
|
|
823
|
+
}
|
|
560
824
|
}
|
|
561
825
|
|
|
562
826
|
if (onHealthCheck) {
|
|
@@ -568,12 +832,41 @@ export async function runDaemonTick(options: DaemonOptions): Promise<void> {
|
|
|
568
832
|
const reason = check.reconciliationNote ?? "Process terminated";
|
|
569
833
|
await recordFailureFn(root, session, reason, 0);
|
|
570
834
|
|
|
571
|
-
// Kill the agent:
|
|
572
|
-
await killAgent({
|
|
573
|
-
|
|
835
|
+
// Kill the agent: prefer conn.abort(), fall back to PID/tmux
|
|
836
|
+
await killAgent({
|
|
837
|
+
session,
|
|
838
|
+
tmuxAlive,
|
|
839
|
+
tmux,
|
|
840
|
+
process: proc,
|
|
841
|
+
getConnection: getConn,
|
|
842
|
+
removeConnection: removeConn,
|
|
843
|
+
});
|
|
844
|
+
// Matrix-guarded: rejected when state is `completed` so a clean
|
|
845
|
+
// `ov stop` cannot be silently downgraded to zombie by a late
|
|
846
|
+
// watchdog termination (overstory-a993).
|
|
847
|
+
const outcome = store.tryTransitionState(session.agentName, "zombie");
|
|
574
848
|
// Reset escalation tracking on terminal state
|
|
575
849
|
store.updateEscalation(session.agentName, 0, null);
|
|
576
|
-
|
|
850
|
+
if (outcome.ok) {
|
|
851
|
+
session.state = "zombie";
|
|
852
|
+
// First-time zombify: notify parent so it doesn't block on
|
|
853
|
+
// missing `worker_done` mail (overstory-c111). Dedup uses the
|
|
854
|
+
// pre-tick snapshot because the matrix allows the idempotent
|
|
855
|
+
// zombie → zombie transition (both `outcome.ok` and the earlier
|
|
856
|
+
// transitionState call would otherwise mask re-ticks).
|
|
857
|
+
if (notifyParentOnDeath && stateBeforeTick !== "zombie") {
|
|
858
|
+
notifyParentOfDeath({
|
|
859
|
+
session,
|
|
860
|
+
mailStore,
|
|
861
|
+
reason,
|
|
862
|
+
tier: 0,
|
|
863
|
+
eventStore,
|
|
864
|
+
runId,
|
|
865
|
+
});
|
|
866
|
+
}
|
|
867
|
+
} else if (outcome.reason === "illegal_transition") {
|
|
868
|
+
session.state = outcome.prev;
|
|
869
|
+
}
|
|
577
870
|
session.escalationLevel = 0;
|
|
578
871
|
session.stalledSince = null;
|
|
579
872
|
} else if (check.action === "investigate") {
|
|
@@ -581,6 +874,21 @@ export async function runDaemonTick(options: DaemonOptions): Promise<void> {
|
|
|
581
874
|
// Log the conflict but do NOT auto-kill.
|
|
582
875
|
// The onHealthCheck callback surfaces this to the operator.
|
|
583
876
|
// No state change — keep zombie until a human or higher-tier agent decides.
|
|
877
|
+
} else if (check.action === "complete") {
|
|
878
|
+
// ZFC fallback: tmux/pid is gone AND lastActivity is stale —
|
|
879
|
+
// the agent looks like it finished naturally and only the
|
|
880
|
+
// session-end hook missed (overstory-e74b). Mark completed
|
|
881
|
+
// without killing (process is already gone) and without
|
|
882
|
+
// notifying parents of death (this is not a crash).
|
|
883
|
+
const outcome = store.tryTransitionState(session.agentName, "completed");
|
|
884
|
+
if (outcome.ok) {
|
|
885
|
+
session.state = "completed";
|
|
886
|
+
} else if (outcome.reason === "illegal_transition") {
|
|
887
|
+
session.state = outcome.prev;
|
|
888
|
+
}
|
|
889
|
+
store.updateEscalation(session.agentName, 0, null);
|
|
890
|
+
session.escalationLevel = 0;
|
|
891
|
+
session.stalledSince = null;
|
|
584
892
|
} else if (check.action === "escalate") {
|
|
585
893
|
// Decision gate check: if the agent sent a decision_gate message, it is
|
|
586
894
|
// intentionally paused waiting for a human decision — not a stall.
|
|
@@ -635,12 +943,32 @@ export async function runDaemonTick(options: DaemonOptions): Promise<void> {
|
|
|
635
943
|
recordFailure: recordFailureFn,
|
|
636
944
|
triageCount,
|
|
637
945
|
maxTriagePerTick,
|
|
946
|
+
getConnection: getConn,
|
|
947
|
+
removeConnection: removeConn,
|
|
638
948
|
});
|
|
639
949
|
|
|
640
950
|
if (actionResult.terminated) {
|
|
641
|
-
|
|
951
|
+
// Matrix-guarded: completed → zombie is rejected (overstory-a993).
|
|
952
|
+
const outcome = store.tryTransitionState(session.agentName, "zombie");
|
|
642
953
|
store.updateEscalation(session.agentName, 0, null);
|
|
643
|
-
|
|
954
|
+
if (outcome.ok) {
|
|
955
|
+
session.state = "zombie";
|
|
956
|
+
// First-time zombify: notify parent so it doesn't block on
|
|
957
|
+
// missing `worker_done` mail (overstory-c111). Dedup via
|
|
958
|
+
// the pre-tick snapshot — see the terminate branch above.
|
|
959
|
+
if (notifyParentOnDeath && stateBeforeTick !== "zombie") {
|
|
960
|
+
notifyParentOfDeath({
|
|
961
|
+
session,
|
|
962
|
+
mailStore,
|
|
963
|
+
reason: actionResult.deathReason ?? "Watchdog escalation terminated agent",
|
|
964
|
+
tier: actionResult.deathTier ?? 0,
|
|
965
|
+
eventStore,
|
|
966
|
+
runId,
|
|
967
|
+
});
|
|
968
|
+
}
|
|
969
|
+
} else if (outcome.reason === "illegal_transition") {
|
|
970
|
+
session.state = outcome.prev;
|
|
971
|
+
}
|
|
644
972
|
session.escalationLevel = 0;
|
|
645
973
|
session.stalledSince = null;
|
|
646
974
|
}
|
|
@@ -664,10 +992,18 @@ export async function runDaemonTick(options: DaemonOptions): Promise<void> {
|
|
|
664
992
|
|
|
665
993
|
// === Run-level completion detection ===
|
|
666
994
|
// After monitoring individual sessions, check if the entire run is done.
|
|
667
|
-
|
|
995
|
+
// Re-resolve the run id defensively (overstory-87bf): a missing
|
|
996
|
+
// current-run.txt or a stale id (no row in runs table) skips the check
|
|
997
|
+
// and emits one warning per cause for the lifetime of this watchdog.
|
|
998
|
+
const validatedRunId = await resolveRunIdForCompletionCheck(
|
|
999
|
+
overstoryDir,
|
|
1000
|
+
runStore,
|
|
1001
|
+
runIdWarnState,
|
|
1002
|
+
);
|
|
1003
|
+
if (validatedRunId) {
|
|
668
1004
|
await checkRunCompletion({
|
|
669
1005
|
store,
|
|
670
|
-
runId,
|
|
1006
|
+
runId: validatedRunId,
|
|
671
1007
|
overstoryDir,
|
|
672
1008
|
root,
|
|
673
1009
|
nudge,
|
|
@@ -692,6 +1028,14 @@ export async function runDaemonTick(options: DaemonOptions): Promise<void> {
|
|
|
692
1028
|
// Non-fatal
|
|
693
1029
|
}
|
|
694
1030
|
}
|
|
1031
|
+
// Close RunStore only if we created it (not injected)
|
|
1032
|
+
if (runStore && ownRunStore) {
|
|
1033
|
+
try {
|
|
1034
|
+
runStore.close();
|
|
1035
|
+
} catch {
|
|
1036
|
+
// Non-fatal
|
|
1037
|
+
}
|
|
1038
|
+
}
|
|
695
1039
|
}
|
|
696
1040
|
}
|
|
697
1041
|
|
|
@@ -741,7 +1085,15 @@ async function executeEscalationAction(ctx: {
|
|
|
741
1085
|
tier: 0 | 1,
|
|
742
1086
|
triageSuggestion?: string,
|
|
743
1087
|
) => Promise<void>;
|
|
744
|
-
|
|
1088
|
+
getConnection: (name: string) => RuntimeConnection | undefined;
|
|
1089
|
+
removeConnection: (name: string) => void;
|
|
1090
|
+
}): Promise<{
|
|
1091
|
+
terminated: boolean;
|
|
1092
|
+
stateChanged: boolean;
|
|
1093
|
+
/** Reason and tier of the termination (only set when `terminated` is true). */
|
|
1094
|
+
deathReason?: string;
|
|
1095
|
+
deathTier?: 0 | 1;
|
|
1096
|
+
}> {
|
|
745
1097
|
const {
|
|
746
1098
|
session,
|
|
747
1099
|
root,
|
|
@@ -756,6 +1108,8 @@ async function executeEscalationAction(ctx: {
|
|
|
756
1108
|
recordFailure,
|
|
757
1109
|
triageCount,
|
|
758
1110
|
maxTriagePerTick,
|
|
1111
|
+
getConnection: getConn,
|
|
1112
|
+
removeConnection: removeConn,
|
|
759
1113
|
} = ctx;
|
|
760
1114
|
|
|
761
1115
|
switch (session.escalationLevel) {
|
|
@@ -832,16 +1186,23 @@ async function executeEscalationAction(ctx: {
|
|
|
832
1186
|
|
|
833
1187
|
if (result.verdict === "terminate") {
|
|
834
1188
|
// Record the failure via mulch (Tier 1 AI triage)
|
|
835
|
-
|
|
836
|
-
|
|
837
|
-
session,
|
|
838
|
-
"AI triage classified as terminal failure",
|
|
839
|
-
1,
|
|
840
|
-
result.verdict,
|
|
841
|
-
);
|
|
1189
|
+
const triageReason = "AI triage classified as terminal failure";
|
|
1190
|
+
await recordFailure(root, session, triageReason, 1, result.verdict);
|
|
842
1191
|
|
|
843
|
-
await killAgent({
|
|
844
|
-
|
|
1192
|
+
await killAgent({
|
|
1193
|
+
session,
|
|
1194
|
+
tmuxAlive,
|
|
1195
|
+
tmux,
|
|
1196
|
+
process: proc,
|
|
1197
|
+
getConnection: getConn,
|
|
1198
|
+
removeConnection: removeConn,
|
|
1199
|
+
});
|
|
1200
|
+
return {
|
|
1201
|
+
terminated: true,
|
|
1202
|
+
stateChanged: true,
|
|
1203
|
+
deathReason: triageReason,
|
|
1204
|
+
deathTier: 1,
|
|
1205
|
+
};
|
|
845
1206
|
}
|
|
846
1207
|
|
|
847
1208
|
if (result.verdict === "retry") {
|
|
@@ -874,10 +1235,23 @@ async function executeEscalationAction(ctx: {
|
|
|
874
1235
|
});
|
|
875
1236
|
|
|
876
1237
|
// Record the failure via mulch (Tier 0: progressive escalation to terminal level)
|
|
877
|
-
|
|
1238
|
+
const escalationReason = "Progressive escalation reached terminal level";
|
|
1239
|
+
await recordFailure(root, session, escalationReason, 0);
|
|
878
1240
|
|
|
879
|
-
await killAgent({
|
|
880
|
-
|
|
1241
|
+
await killAgent({
|
|
1242
|
+
session,
|
|
1243
|
+
tmuxAlive,
|
|
1244
|
+
tmux,
|
|
1245
|
+
process: proc,
|
|
1246
|
+
getConnection: getConn,
|
|
1247
|
+
removeConnection: removeConn,
|
|
1248
|
+
});
|
|
1249
|
+
return {
|
|
1250
|
+
terminated: true,
|
|
1251
|
+
stateChanged: true,
|
|
1252
|
+
deathReason: escalationReason,
|
|
1253
|
+
deathTier: 0,
|
|
1254
|
+
};
|
|
881
1255
|
}
|
|
882
1256
|
}
|
|
883
1257
|
}
|