@os-eco/overstory-cli 0.9.4 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +50 -19
- package/agents/builder.md +19 -9
- package/agents/coordinator.md +6 -6
- package/agents/lead.md +204 -87
- package/agents/merger.md +25 -14
- package/agents/reviewer.md +22 -16
- package/agents/scout.md +17 -12
- package/package.json +6 -3
- package/src/agents/capabilities.test.ts +85 -0
- package/src/agents/capabilities.ts +125 -0
- package/src/agents/headless-mail-injector.test.ts +448 -0
- package/src/agents/headless-mail-injector.ts +219 -0
- package/src/agents/headless-prompt.test.ts +102 -0
- package/src/agents/headless-prompt.ts +68 -0
- package/src/agents/hooks-deployer.test.ts +514 -14
- package/src/agents/hooks-deployer.ts +141 -0
- package/src/agents/mail-poll-detect.test.ts +153 -0
- package/src/agents/mail-poll-detect.ts +73 -0
- package/src/agents/overlay.test.ts +60 -4
- package/src/agents/overlay.ts +63 -8
- package/src/agents/scope-detect.test.ts +190 -0
- package/src/agents/scope-detect.ts +146 -0
- package/src/agents/turn-lock.test.ts +181 -0
- package/src/agents/turn-lock.ts +235 -0
- package/src/agents/turn-runner-dispatch.test.ts +182 -0
- package/src/agents/turn-runner-dispatch.ts +105 -0
- package/src/agents/turn-runner.test.ts +2312 -0
- package/src/agents/turn-runner.ts +1383 -0
- package/src/commands/agents.ts +9 -0
- package/src/commands/clean.ts +54 -0
- package/src/commands/coordinator.test.ts +254 -0
- package/src/commands/coordinator.ts +273 -8
- package/src/commands/dashboard.test.ts +188 -0
- package/src/commands/dashboard.ts +14 -4
- package/src/commands/doctor.ts +3 -1
- package/src/commands/group.test.ts +94 -0
- package/src/commands/group.ts +49 -20
- package/src/commands/init.test.ts +8 -0
- package/src/commands/init.ts +8 -1
- package/src/commands/log.test.ts +187 -11
- package/src/commands/log.ts +171 -71
- package/src/commands/mail.test.ts +162 -0
- package/src/commands/mail.ts +64 -9
- package/src/commands/merge.test.ts +230 -1
- package/src/commands/merge.ts +68 -12
- package/src/commands/nudge.test.ts +351 -4
- package/src/commands/nudge.ts +356 -34
- package/src/commands/run.test.ts +43 -7
- package/src/commands/serve/build.test.ts +202 -0
- package/src/commands/serve/build.ts +206 -0
- package/src/commands/serve/coordinator-actions.test.ts +339 -0
- package/src/commands/serve/coordinator-actions.ts +408 -0
- package/src/commands/serve/dev.test.ts +168 -0
- package/src/commands/serve/dev.ts +117 -0
- package/src/commands/serve/mail-actions.test.ts +312 -0
- package/src/commands/serve/mail-actions.ts +167 -0
- package/src/commands/serve/rest.test.ts +1323 -0
- package/src/commands/serve/rest.ts +708 -0
- package/src/commands/serve/static.ts +51 -0
- package/src/commands/serve/ws.test.ts +361 -0
- package/src/commands/serve/ws.ts +332 -0
- package/src/commands/serve.test.ts +459 -0
- package/src/commands/serve.ts +565 -0
- package/src/commands/sling.test.ts +177 -1
- package/src/commands/sling.ts +243 -71
- package/src/commands/status.test.ts +9 -0
- package/src/commands/status.ts +12 -4
- package/src/commands/stop.test.ts +255 -1
- package/src/commands/stop.ts +107 -8
- package/src/commands/watch.test.ts +43 -0
- package/src/commands/watch.ts +153 -28
- package/src/config.ts +23 -0
- package/src/doctor/consistency.test.ts +106 -0
- package/src/doctor/consistency.ts +48 -1
- package/src/doctor/serve.test.ts +95 -0
- package/src/doctor/serve.ts +86 -0
- package/src/doctor/types.ts +2 -1
- package/src/doctor/watchdog.ts +57 -1
- package/src/events/tailer.test.ts +234 -1
- package/src/events/tailer.ts +90 -0
- package/src/index.ts +57 -6
- package/src/insights/quality-gates.test.ts +141 -0
- package/src/insights/quality-gates.ts +156 -0
- package/src/json.ts +29 -0
- package/src/logging/theme.ts +4 -0
- package/src/mail/client.ts +15 -2
- package/src/mail/store.test.ts +82 -0
- package/src/mail/store.ts +41 -4
- package/src/merge/lock.test.ts +149 -0
- package/src/merge/lock.ts +140 -0
- package/src/merge/predict.test.ts +387 -0
- package/src/merge/predict.ts +249 -0
- package/src/merge/resolver.ts +1 -1
- package/src/mulch/client.ts +3 -3
- package/src/runtimes/__fixtures__/claude-stream-fixture.ts +22 -0
- package/src/runtimes/claude.test.ts +791 -1
- package/src/runtimes/claude.ts +323 -1
- package/src/runtimes/connections.test.ts +141 -1
- package/src/runtimes/connections.ts +73 -4
- package/src/runtimes/headless-connection.test.ts +264 -0
- package/src/runtimes/headless-connection.ts +158 -0
- package/src/runtimes/types.ts +10 -0
- package/src/schema-consistency.test.ts +1 -0
- package/src/sessions/store.test.ts +657 -29
- package/src/sessions/store.ts +286 -23
- package/src/test-setup.test.ts +31 -0
- package/src/test-setup.ts +28 -0
- package/src/types.ts +107 -2
- package/src/utils/pid.test.ts +85 -1
- package/src/utils/pid.ts +86 -1
- package/src/utils/process-scan.test.ts +53 -0
- package/src/utils/process-scan.ts +76 -0
- package/src/watchdog/daemon.test.ts +1607 -376
- package/src/watchdog/daemon.ts +462 -88
- package/src/watchdog/health.test.ts +282 -0
- package/src/watchdog/health.ts +126 -27
- package/src/worktree/manager.test.ts +218 -1
- package/src/worktree/manager.ts +55 -0
- package/src/worktree/process.test.ts +71 -0
- package/src/worktree/process.ts +25 -5
- package/src/worktree/tmux.test.ts +28 -0
- package/src/worktree/tmux.ts +27 -3
- package/templates/CLAUDE.md.tmpl +19 -8
- package/templates/overlay.md.tmpl +5 -2
|
@@ -0,0 +1,1383 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Per-turn engine for headless Claude Code agents (spawn-per-turn model).
|
|
3
|
+
*
|
|
4
|
+
* Owns a single agent turn end-to-end:
|
|
5
|
+
* - acquires per-agent serialization (in-process mutex + cross-process lease)
|
|
6
|
+
* - re-reads SessionStore under the lock so the prior `claudeSessionId` is fresh
|
|
7
|
+
* - spawns claude via the runtime's `buildDirectSpawn` (with `--resume` when available)
|
|
8
|
+
* - writes the user turn to a real stdin pipe and closes it (claude sees EOF)
|
|
9
|
+
* - drains `runtime.parseEvents` and tees events into events.db
|
|
10
|
+
* - captures the new session id via the parser's `onSessionId` hook
|
|
11
|
+
* - snapshots mail.db before spawn and detects the agent's capability-specific
|
|
12
|
+
* terminal mail (`worker_done` for builder/scout/reviewer/lead;
|
|
13
|
+
* `merged`/`merge_failed` for merger)
|
|
14
|
+
* - applies state-transition rules (booting → working, completed when done)
|
|
15
|
+
* - handles abort signals with SIGTERM → SIGKILL escalation
|
|
16
|
+
* - releases the lock on every exit path
|
|
17
|
+
*
|
|
18
|
+
* This module does NOT decide WHEN to run a turn. The mail injector and nudge
|
|
19
|
+
* command call `runTurn(opts)` when they have a user turn to deliver.
|
|
20
|
+
*/
|
|
21
|
+
|
|
22
|
+
import { Database } from "bun:sqlite";
|
|
23
|
+
import { appendFileSync, existsSync } from "node:fs";
|
|
24
|
+
import { mkdir, unlink } from "node:fs/promises";
|
|
25
|
+
import { join } from "node:path";
|
|
26
|
+
import { extractFileScope } from "../commands/agents.ts";
|
|
27
|
+
import { AgentError } from "../errors.ts";
|
|
28
|
+
import { createEventStore } from "../events/store.ts";
|
|
29
|
+
import { filterToolArgs } from "../events/tool-filter.ts";
|
|
30
|
+
import { createMailStore, type MailStore } from "../mail/store.ts";
|
|
31
|
+
import type { AgentEvent, AgentRuntime, DirectSpawnOpts } from "../runtimes/types.ts";
|
|
32
|
+
import { createSessionStore } from "../sessions/store.ts";
|
|
33
|
+
import type {
|
|
34
|
+
AgentState,
|
|
35
|
+
EventStore,
|
|
36
|
+
EventType,
|
|
37
|
+
ResolvedModel,
|
|
38
|
+
WorkerDiedPayload,
|
|
39
|
+
} from "../types.ts";
|
|
40
|
+
import { terminalMailTypesFor } from "./capabilities.ts";
|
|
41
|
+
import { detectMailPollPattern } from "./mail-poll-detect.ts";
|
|
42
|
+
import {
|
|
43
|
+
type DetectScopeViolationOpts,
|
|
44
|
+
detectScopeViolation as defaultDetectScopeViolation,
|
|
45
|
+
IMPLEMENTATION_CAPABILITIES,
|
|
46
|
+
type ScopeViolationResult,
|
|
47
|
+
} from "./scope-detect.ts";
|
|
48
|
+
import { acquireTurnLock } from "./turn-lock.ts";
|
|
49
|
+
|
|
50
|
+
/** Subprocess shape required by `runTurn`. Compatible with `Bun.spawn`. */
export interface TurnSubprocess {
  /** OS process id of the spawned claude process. */
  readonly pid: number;
  /**
   * Stdin sink for the encoded user turn. The runner writes the turn and then
   * closes the pipe (via `end`) so claude sees EOF. `end`/`flush` are optional
   * because test doubles may not implement them.
   */
  readonly stdin: {
    write(data: string | Uint8Array): number | Promise<number> | unknown;
    end?(): void | Promise<void> | unknown;
    flush?(): unknown;
  };
  /** stream-json output; drained by the runtime's event parser. */
  readonly stdout: ReadableStream<Uint8Array>;
  /** Resolves when the process exits (exit code, or `null` when unavailable). */
  readonly exited: Promise<number | null>;
  /** Delivers a signal to the process (SIGTERM → SIGKILL escalation on abort). */
  kill(signal?: number | string): void;
}
|
|
62
|
+
|
|
63
|
+
/** Spawn function signature. Production passes `Bun.spawn` cast to this type. */
export type TurnSpawnFn = (
  cmd: string[],
  options: {
    cwd: string;
    env: Record<string, string>;
    stdin: "pipe"; // always a real pipe: the runner writes the user turn then closes it
    stdout: "pipe"; // always piped: drained by the runtime's stream-json parser
    stderr: "pipe" | "ignore" | number;
  },
) => TurnSubprocess;
|
|
74
|
+
|
|
75
|
+
/** Severity of an internal runner diagnostic. `error` indicates a contract violation. */
export type RunnerLogLevel = "warn" | "error";

/**
 * Internal runner diagnostic sink. Replaces the swallowed `catch {}` blocks
 * around SessionStore writes and turn.pid I/O so that future failures are
 * visible (overstory-4af3). Test injection point.
 *
 * @param level   severity of the diagnostic (`warn` = degraded, `error` = contract violation)
 * @param message human-readable description of what failed
 * @param err     optional underlying cause (an `Error` or an arbitrary thrown value)
 */
export type RunnerLogger = (level: RunnerLogLevel, message: string, err?: unknown) => void;
|
|
84
|
+
|
|
85
|
+
/** Options for {@link runTurn}. One instance describes a single agent turn. */
export interface RunTurnOpts {
  agentName: string;
  /**
   * Worker capability driving terminal-mail detection (builder/scout/reviewer/
   * merger/lead). The runner uses {@link terminalMailTypesFor} to decide which
   * mail types signal completion for this agent.
   */
  capability: string;
  overstoryDir: string;
  worktreePath: string;
  projectRoot: string;
  /** Task id threaded into worker_died payloads and diagnostics. */
  taskId: string;
  /** Pre-encoded stream-json envelope (from `encodeUserTurn`). Empty string is a no-op. */
  userTurnNdjson: string;
  runtime: AgentRuntime;
  resolvedModel: ResolvedModel;
  runId: string | null;
  /** mail.db path — used for the terminal-mail snapshot baseline and detection. */
  mailDbPath: string;
  /** events.db path — parser events are teed here. */
  eventsDbPath: string;
  /** sessions.db path — re-read under the lock for the prior `claudeSessionId`. */
  sessionsDbPath: string;
  /** Test injection: spawn function. Defaults to `Bun.spawn`. */
  _spawnFn?: TurnSpawnFn;
  /** Test injection: time source. */
  _now?: () => Date;
  /**
   * Test injection: pre-opened MailStore for the parent-notify path.
   * Production opens `mailDbPath` briefly inside the helper and closes it; tests
   * pass a shared in-memory store so they can read what was inserted without
   * reopening the DB file.
   */
  _mailStore?: MailStore;
  /**
   * Test injection: runner diagnostic sink. When omitted, warnings append to
   * `<turnLogDir>/runner.log` and mirror to `process.stderr` with a
   * `[turn-runner:<level>] <agent>:` prefix.
   */
  _logWarning?: RunnerLogger;
  /** Operator-driven kill (e.g. `ov stop`). */
  abortSignal?: AbortSignal;
  /** Time between SIGTERM and SIGKILL on abort. Default 2000ms. */
  sigkillDelayMs?: number;
  /**
   * Mid-stream stall watchdog: max time (ms) between parser events before the
   * runner aborts the turn via SIGTERM (escalates to SIGKILL after
   * `sigkillDelayMs`). Resets on every event from the runtime parser. Default
   * 600000ms (10 minutes) — generous enough to span long tool calls while
   * still bounding hung-claude turns (overstory-ddb3).
   *
   * Set to `0` to disable (test injection / explicit opt-out only).
   */
  eventStallTimeoutMs?: number;
  /**
   * Throttle (ms) for refreshing `session.lastActivity` while events stream
   * from the parser loop. Default `2000` (every 2s). The watchdog at
   * `src/watchdog/health.ts:242-243` documents its design as: "the
   * turn-runner updates [lastActivity] on every parser event during a turn,
   * and the watchdog refreshes it from events.db between turns" — so the
   * runner must drive lastActivity itself or a long turn looks stalled and
   * gets zombified mid-flight (overstory-8e61).
   *
   * Set to `0` to refresh on every event (test injection / explicit opt-out).
   */
  lastActivityRefreshIntervalMs?: number;
  /**
   * Test injection: invoked each time the parser loop fires a mid-turn
   * `lastActivity` refresh (after the throttle gate, before/after the
   * SessionStore write). Used by tests to count refresh attempts directly
   * rather than inferring from observable timestamps (overstory-8e61).
   */
  _onLastActivityRefresh?: () => void;
  /**
   * Test injection: replaces the real `detectScopeViolation` from
   * `scope-detect.ts`. Tests pass a stubbed runner via the wrapper so they
   * can drive the scope-violation observability path without spawning git
   * (overstory-9f4d). Defaults to the real implementation.
   */
  _scopeDetect?: (opts: DetectScopeViolationOpts) => ScopeViolationResult;
}
|
|
163
|
+
|
|
164
|
+
/** Outcome of a single agent turn, as returned by {@link runTurn}. */
export interface TurnResult {
  /** Process exit code. `null` when aborted before exit. */
  exitCode: number | null;
  /** True iff the parser observed a `result` event with `isError: false`. */
  cleanResult: boolean;
  /** Session id captured from this turn's stream-json (may differ from prior). */
  newSessionId: string | null;
  /** True iff a prior session id was requested and the new one differs. */
  resumeMismatch: boolean;
  /**
   * True iff a capability-specific terminal mail from the agent appeared
   * during the turn (`worker_done` for builder/scout/reviewer/lead,
   * `merged`/`merge_failed` for merger).
   */
  terminalMailObserved: boolean;
  /** Wall-clock turn duration in milliseconds. */
  durationMs: number;
  /** AgentState read from SessionStore at the start of the turn. */
  initialState: AgentState;
  /** AgentState computed by the transition rules and persisted on exit. */
  finalState: AgentState;
  /**
   * True iff the per-event stall watchdog fired during the turn — the runner
   * sent SIGTERM/SIGKILL because no parser event arrived for
   * `eventStallTimeoutMs` (overstory-ddb3). Treated like `aborted` for
   * finalState purposes (`zombie`).
   */
  stallAborted: boolean;
  /**
   * True iff claude exited cleanly (`cleanResult` true) without sending the
   * capability-specific terminal mail (overstory-6071). Contract violation:
   * the agent finished its turn but failed to signal completion. Logged at
   * `error` level via the runner diagnostic sink and recorded here for
   * caller-visible auditing.
   */
  terminalMailMissing: boolean;
}
|
|
201
|
+
|
|
202
|
+
const defaultSpawnFn: TurnSpawnFn = (cmd, options) =>
|
|
203
|
+
Bun.spawn(cmd, options) as unknown as TurnSubprocess;
|
|
204
|
+
|
|
205
|
+
function mapAgentEventType(type: string): EventType {
|
|
206
|
+
switch (type) {
|
|
207
|
+
case "tool_use":
|
|
208
|
+
return "tool_start";
|
|
209
|
+
case "tool_result":
|
|
210
|
+
return "tool_end";
|
|
211
|
+
case "status":
|
|
212
|
+
return "session_start";
|
|
213
|
+
case "result":
|
|
214
|
+
return "result";
|
|
215
|
+
case "error":
|
|
216
|
+
return "error";
|
|
217
|
+
case "assistant_message":
|
|
218
|
+
return "progress";
|
|
219
|
+
default:
|
|
220
|
+
return "custom";
|
|
221
|
+
}
|
|
222
|
+
}
|
|
223
|
+
|
|
224
|
+
function recordAgentEvent(
|
|
225
|
+
eventStore: EventStore,
|
|
226
|
+
agentName: string,
|
|
227
|
+
runId: string | null,
|
|
228
|
+
sessionId: string | null,
|
|
229
|
+
event: AgentEvent,
|
|
230
|
+
): void {
|
|
231
|
+
const eventType = mapAgentEventType(event.type);
|
|
232
|
+
let dataStr: string | null;
|
|
233
|
+
try {
|
|
234
|
+
dataStr = JSON.stringify(event);
|
|
235
|
+
} catch {
|
|
236
|
+
dataStr = null;
|
|
237
|
+
}
|
|
238
|
+
|
|
239
|
+
if (event.type === "tool_use") {
|
|
240
|
+
const toolName = typeof event.name === "string" ? event.name : null;
|
|
241
|
+
const toolInput =
|
|
242
|
+
typeof event.input === "object" && event.input !== null
|
|
243
|
+
? (event.input as Record<string, unknown>)
|
|
244
|
+
: {};
|
|
245
|
+
const filtered = toolName ? filterToolArgs(toolName, toolInput) : null;
|
|
246
|
+
eventStore.insert({
|
|
247
|
+
runId,
|
|
248
|
+
agentName,
|
|
249
|
+
sessionId,
|
|
250
|
+
eventType,
|
|
251
|
+
toolName,
|
|
252
|
+
toolArgs: filtered ? JSON.stringify(filtered.args) : null,
|
|
253
|
+
toolDurationMs: null,
|
|
254
|
+
level: "info",
|
|
255
|
+
data: dataStr,
|
|
256
|
+
});
|
|
257
|
+
return;
|
|
258
|
+
}
|
|
259
|
+
|
|
260
|
+
if (event.type === "result") {
|
|
261
|
+
eventStore.insert({
|
|
262
|
+
runId,
|
|
263
|
+
agentName,
|
|
264
|
+
sessionId,
|
|
265
|
+
eventType,
|
|
266
|
+
toolName: null,
|
|
267
|
+
toolArgs: null,
|
|
268
|
+
toolDurationMs: typeof event.durationMs === "number" ? Math.round(event.durationMs) : null,
|
|
269
|
+
level: event.isError === true ? "error" : "info",
|
|
270
|
+
data: dataStr,
|
|
271
|
+
});
|
|
272
|
+
return;
|
|
273
|
+
}
|
|
274
|
+
|
|
275
|
+
eventStore.insert({
|
|
276
|
+
runId,
|
|
277
|
+
agentName,
|
|
278
|
+
sessionId,
|
|
279
|
+
eventType,
|
|
280
|
+
toolName: null,
|
|
281
|
+
toolArgs: null,
|
|
282
|
+
toolDurationMs: null,
|
|
283
|
+
level: event.type === "error" ? "error" : "info",
|
|
284
|
+
data: dataStr,
|
|
285
|
+
});
|
|
286
|
+
}
|
|
287
|
+
|
|
288
|
+
function checkTerminalMailSince(
|
|
289
|
+
mailDbPath: string,
|
|
290
|
+
agentName: string,
|
|
291
|
+
capability: string,
|
|
292
|
+
sinceTs: string,
|
|
293
|
+
): boolean {
|
|
294
|
+
const types = terminalMailTypesFor(capability);
|
|
295
|
+
if (types.length === 0) return false;
|
|
296
|
+
|
|
297
|
+
let db: Database;
|
|
298
|
+
try {
|
|
299
|
+
db = new Database(mailDbPath);
|
|
300
|
+
} catch {
|
|
301
|
+
return false;
|
|
302
|
+
}
|
|
303
|
+
try {
|
|
304
|
+
db.exec("PRAGMA busy_timeout = 5000");
|
|
305
|
+
const placeholders = types.map((_, i) => `$t${i}`).join(",");
|
|
306
|
+
const sql = `SELECT 1 AS c FROM messages WHERE from_agent = $a AND type IN (${placeholders}) AND created_at > $ts LIMIT 1`;
|
|
307
|
+
const stmt = db.prepare<{ c: number }, Record<string, string>>(sql);
|
|
308
|
+
const params: Record<string, string> = { $a: agentName, $ts: sinceTs };
|
|
309
|
+
types.forEach((t, i) => {
|
|
310
|
+
params[`$t${i}`] = t;
|
|
311
|
+
});
|
|
312
|
+
const row = stmt.get(params);
|
|
313
|
+
return row !== null;
|
|
314
|
+
} catch {
|
|
315
|
+
return false;
|
|
316
|
+
} finally {
|
|
317
|
+
try {
|
|
318
|
+
db.close();
|
|
319
|
+
} catch {
|
|
320
|
+
// best-effort
|
|
321
|
+
}
|
|
322
|
+
}
|
|
323
|
+
}
|
|
324
|
+
|
|
325
|
+
/**
|
|
326
|
+
* Check whether the agent has previously sent a `scope_expansion`-prefixed
|
|
327
|
+
* status mail (overstory-9f4d). When such a mail exists, the runner suppresses
|
|
328
|
+
* the soft scope-violation warning — the lead has already been informed.
|
|
329
|
+
*
|
|
330
|
+
* Soft signal — every failure (DB unavailable, missing table, etc.) returns
|
|
331
|
+
* false so observability never breaks the runner.
|
|
332
|
+
*/
|
|
333
|
+
function hasScopeExpansionMail(mailDbPath: string, agentName: string): boolean {
|
|
334
|
+
let db: Database;
|
|
335
|
+
try {
|
|
336
|
+
db = new Database(mailDbPath);
|
|
337
|
+
} catch {
|
|
338
|
+
return false;
|
|
339
|
+
}
|
|
340
|
+
try {
|
|
341
|
+
db.exec("PRAGMA busy_timeout = 5000");
|
|
342
|
+
const stmt = db.prepare<{ c: number }, { $a: string }>(
|
|
343
|
+
"SELECT 1 AS c FROM messages WHERE from_agent = $a AND subject LIKE 'scope_expansion%' LIMIT 1",
|
|
344
|
+
);
|
|
345
|
+
return stmt.get({ $a: agentName }) !== null;
|
|
346
|
+
} catch {
|
|
347
|
+
return false;
|
|
348
|
+
} finally {
|
|
349
|
+
try {
|
|
350
|
+
db.close();
|
|
351
|
+
} catch {
|
|
352
|
+
// best-effort
|
|
353
|
+
}
|
|
354
|
+
}
|
|
355
|
+
}
|
|
356
|
+
|
|
357
|
+
/**
|
|
358
|
+
* Latest `created_at` timestamp of a terminal mail (`worker_done`/`result` for
|
|
359
|
+
* task-scoped workers; `merged`/`merge_failed` for merger) sent by `agentName`.
|
|
360
|
+
*
|
|
361
|
+
* Returns `null` when the agent has no prior terminal mail or the mail DB is
|
|
362
|
+
* unavailable. The runner uses this as the snapshot baseline for the new turn:
|
|
363
|
+
* any terminal mail with `created_at > snapshot` is attributable to the spawn
|
|
364
|
+
* we are about to start. Querying the actual prior timestamp eliminates the
|
|
365
|
+
* misattribution window that `now()` opened — a prior-turn `worker_done` that
|
|
366
|
+
* lands between baseline capture and spawn would have falsely tripped the
|
|
367
|
+
* "terminal mail observed" check (overstory-088b C1).
|
|
368
|
+
*/
|
|
369
|
+
function latestTerminalMailTs(
|
|
370
|
+
mailDbPath: string,
|
|
371
|
+
agentName: string,
|
|
372
|
+
capability: string,
|
|
373
|
+
): string | null {
|
|
374
|
+
const types = terminalMailTypesFor(capability);
|
|
375
|
+
if (types.length === 0) return null;
|
|
376
|
+
|
|
377
|
+
let db: Database;
|
|
378
|
+
try {
|
|
379
|
+
db = new Database(mailDbPath);
|
|
380
|
+
} catch {
|
|
381
|
+
return null;
|
|
382
|
+
}
|
|
383
|
+
try {
|
|
384
|
+
db.exec("PRAGMA busy_timeout = 5000");
|
|
385
|
+
const placeholders = types.map((_, i) => `$t${i}`).join(",");
|
|
386
|
+
const sql = `SELECT MAX(created_at) AS ts FROM messages WHERE from_agent = $a AND type IN (${placeholders})`;
|
|
387
|
+
const stmt = db.prepare<{ ts: string | null }, Record<string, string>>(sql);
|
|
388
|
+
const params: Record<string, string> = { $a: agentName };
|
|
389
|
+
types.forEach((t, i) => {
|
|
390
|
+
params[`$t${i}`] = t;
|
|
391
|
+
});
|
|
392
|
+
const row = stmt.get(params);
|
|
393
|
+
return row?.ts ?? null;
|
|
394
|
+
} catch {
|
|
395
|
+
return null;
|
|
396
|
+
} finally {
|
|
397
|
+
try {
|
|
398
|
+
db.close();
|
|
399
|
+
} catch {
|
|
400
|
+
// best-effort
|
|
401
|
+
}
|
|
402
|
+
}
|
|
403
|
+
}
|
|
404
|
+
|
|
405
|
+
/**
|
|
406
|
+
* Send a synthetic `worker_died` mail to the parent of a session whose turn
|
|
407
|
+
* ended without the capability's terminal mail. Mirrors the watchdog's
|
|
408
|
+
* `notifyParentOfDeath` (overstory-c111) but for in-band runner detection:
|
|
409
|
+
*
|
|
410
|
+
* - **Aborted / stalled** (zombie): operator `ov stop` or the parser-stall
|
|
411
|
+
* watchdog killed the subprocess. The agent never got a chance to send
|
|
412
|
+
* `worker_done`/`merged` (overstory-c772).
|
|
413
|
+
* - **terminalMailMissing**: claude exited cleanly but never sent the terminal
|
|
414
|
+
* mail — the silent-no-op path (overstory-4159).
|
|
415
|
+
*
|
|
416
|
+
* Without this, the lead waits forever for a terminal mail that will never
|
|
417
|
+
* arrive. The watchdog's pre-tick state-snapshot dedup (mx-b0e54b) means a
|
|
418
|
+
* later watchdog tick on the now-zombie session will see `stateBeforeTick ===
|
|
419
|
+
* "zombie"` and skip its own notify, so we won't double-fire.
|
|
420
|
+
*
|
|
421
|
+
* Fire-and-forget: every failure surfaces through `runnerLog` and never
|
|
422
|
+
* propagates. Mail-send must not break the turn.
|
|
423
|
+
*/
|
|
424
|
+
function notifyParentOfRunnerDeath(ctx: {
|
|
425
|
+
mailStore: MailStore | null;
|
|
426
|
+
mailDbPath: string;
|
|
427
|
+
parentAgent: string;
|
|
428
|
+
agentName: string;
|
|
429
|
+
capability: string;
|
|
430
|
+
taskId: string;
|
|
431
|
+
reason: string;
|
|
432
|
+
lastActivity: string;
|
|
433
|
+
runnerLog: RunnerLogger;
|
|
434
|
+
}): void {
|
|
435
|
+
const {
|
|
436
|
+
mailStore,
|
|
437
|
+
mailDbPath,
|
|
438
|
+
parentAgent,
|
|
439
|
+
agentName,
|
|
440
|
+
capability,
|
|
441
|
+
taskId,
|
|
442
|
+
reason,
|
|
443
|
+
lastActivity,
|
|
444
|
+
runnerLog,
|
|
445
|
+
} = ctx;
|
|
446
|
+
|
|
447
|
+
const payload: WorkerDiedPayload = {
|
|
448
|
+
agentName,
|
|
449
|
+
capability,
|
|
450
|
+
taskId,
|
|
451
|
+
reason,
|
|
452
|
+
lastActivity,
|
|
453
|
+
terminatedBy: "runner",
|
|
454
|
+
};
|
|
455
|
+
const subject = `[RUNNER] worker_died: ${agentName}`;
|
|
456
|
+
const body =
|
|
457
|
+
`Worker "${agentName}" (${capability}) on task ${taskId} ended without ` +
|
|
458
|
+
`sending its terminal mail. Reason: ${reason}. Last activity: ${lastActivity}. ` +
|
|
459
|
+
`Decide whether to retry the work, escalate, or report the failure upstream.`;
|
|
460
|
+
|
|
461
|
+
let store: MailStore | null = mailStore;
|
|
462
|
+
let owned = false;
|
|
463
|
+
if (store === null) {
|
|
464
|
+
try {
|
|
465
|
+
store = createMailStore(mailDbPath);
|
|
466
|
+
owned = true;
|
|
467
|
+
} catch (err) {
|
|
468
|
+
runnerLog("warn", "failed to open mail store for parent notify", err);
|
|
469
|
+
return;
|
|
470
|
+
}
|
|
471
|
+
}
|
|
472
|
+
try {
|
|
473
|
+
store.insert({
|
|
474
|
+
id: "",
|
|
475
|
+
from: agentName,
|
|
476
|
+
to: parentAgent,
|
|
477
|
+
subject,
|
|
478
|
+
body,
|
|
479
|
+
type: "worker_died",
|
|
480
|
+
priority: "high",
|
|
481
|
+
threadId: null,
|
|
482
|
+
payload: JSON.stringify(payload),
|
|
483
|
+
});
|
|
484
|
+
} catch (err) {
|
|
485
|
+
runnerLog("warn", "failed to send worker_died mail to parent", err);
|
|
486
|
+
} finally {
|
|
487
|
+
if (owned) {
|
|
488
|
+
try {
|
|
489
|
+
store.close();
|
|
490
|
+
} catch {
|
|
491
|
+
// best-effort
|
|
492
|
+
}
|
|
493
|
+
}
|
|
494
|
+
}
|
|
495
|
+
}
|
|
496
|
+
|
|
497
|
+
/**
|
|
498
|
+
* Guarded state transition for the turn runner. Uses the SessionStore CAS
|
|
499
|
+
* (`tryTransitionState`) so a concurrent writer — `ov stop` writing
|
|
500
|
+
* `completed`, watchdog writing `zombie` — cannot be silently overwritten
|
|
501
|
+
* by the turn-runner's "settle to working/completed/zombie" at end of turn.
|
|
502
|
+
*
|
|
503
|
+
* Returns true when the transition landed. Rejected transitions are not
|
|
504
|
+
* fatal: the SQL CAS preserves whatever the conflicting writer set, which
|
|
505
|
+
* is the correct outcome for this race (overstory-a993).
|
|
506
|
+
*
|
|
507
|
+
* `onError` fires on database/IO failure. `onRejected` fires when the CAS
|
|
508
|
+
* rejected the transition (the row exists but was in a state that disallowed
|
|
509
|
+
* the move). Both are diagnostic-only — the caller need not recover.
|
|
510
|
+
*/
|
|
511
|
+
function updateSessionState(
|
|
512
|
+
sessionsDbPath: string,
|
|
513
|
+
agentName: string,
|
|
514
|
+
state: AgentState,
|
|
515
|
+
onError?: (err: unknown) => void,
|
|
516
|
+
onRejected?: (prev: AgentState, attempted: AgentState) => void,
|
|
517
|
+
): boolean {
|
|
518
|
+
try {
|
|
519
|
+
const store = createSessionStore(sessionsDbPath);
|
|
520
|
+
try {
|
|
521
|
+
const outcome = store.tryTransitionState(agentName, state);
|
|
522
|
+
if (!outcome.ok) {
|
|
523
|
+
if (outcome.reason === "illegal_transition") {
|
|
524
|
+
onRejected?.(outcome.prev, outcome.attempted);
|
|
525
|
+
}
|
|
526
|
+
return false;
|
|
527
|
+
}
|
|
528
|
+
} finally {
|
|
529
|
+
store.close();
|
|
530
|
+
}
|
|
531
|
+
return true;
|
|
532
|
+
} catch (err) {
|
|
533
|
+
onError?.(err);
|
|
534
|
+
return false;
|
|
535
|
+
}
|
|
536
|
+
}
|
|
537
|
+
|
|
538
|
+
function updateSessionLastActivity(
|
|
539
|
+
sessionsDbPath: string,
|
|
540
|
+
agentName: string,
|
|
541
|
+
onError?: (err: unknown) => void,
|
|
542
|
+
): boolean {
|
|
543
|
+
try {
|
|
544
|
+
const store = createSessionStore(sessionsDbPath);
|
|
545
|
+
try {
|
|
546
|
+
store.updateLastActivity(agentName);
|
|
547
|
+
} finally {
|
|
548
|
+
store.close();
|
|
549
|
+
}
|
|
550
|
+
return true;
|
|
551
|
+
} catch (err) {
|
|
552
|
+
onError?.(err);
|
|
553
|
+
return false;
|
|
554
|
+
}
|
|
555
|
+
}
|
|
556
|
+
|
|
557
|
+
function updateSessionClaudeId(
|
|
558
|
+
sessionsDbPath: string,
|
|
559
|
+
agentName: string,
|
|
560
|
+
sessionId: string,
|
|
561
|
+
onError?: (err: unknown) => void,
|
|
562
|
+
): boolean {
|
|
563
|
+
try {
|
|
564
|
+
const store = createSessionStore(sessionsDbPath);
|
|
565
|
+
try {
|
|
566
|
+
store.updateClaudeSessionId(agentName, sessionId);
|
|
567
|
+
} finally {
|
|
568
|
+
store.close();
|
|
569
|
+
}
|
|
570
|
+
return true;
|
|
571
|
+
} catch (err) {
|
|
572
|
+
onError?.(err);
|
|
573
|
+
return false;
|
|
574
|
+
}
|
|
575
|
+
}
|
|
576
|
+
|
|
577
|
+
/**
|
|
578
|
+
* Build the default runner diagnostic sink. Appends to `<turnLogDir>/runner.log`
|
|
579
|
+
* (synchronous, safe inside async functions) and mirrors to `process.stderr`
|
|
580
|
+
* with a `[turn-runner:<level>] <agent>:` prefix. Failures in the sink itself
|
|
581
|
+
* are swallowed — diagnostics must never break the turn.
|
|
582
|
+
*/
|
|
583
|
+
function defaultRunnerLogger(agentName: string, runnerLogPath: string | null): RunnerLogger {
|
|
584
|
+
return (level, message, err) => {
|
|
585
|
+
const ts = new Date().toISOString();
|
|
586
|
+
const detail =
|
|
587
|
+
err instanceof Error
|
|
588
|
+
? `: ${err.message}`
|
|
589
|
+
: err !== undefined && err !== null
|
|
590
|
+
? `: ${String(err)}`
|
|
591
|
+
: "";
|
|
592
|
+
const line = `${ts} [${level}] ${message}${detail}\n`;
|
|
593
|
+
if (runnerLogPath) {
|
|
594
|
+
try {
|
|
595
|
+
appendFileSync(runnerLogPath, line);
|
|
596
|
+
} catch {
|
|
597
|
+
// best-effort; the stderr mirror still surfaces the warning
|
|
598
|
+
}
|
|
599
|
+
}
|
|
600
|
+
try {
|
|
601
|
+
process.stderr.write(`[turn-runner:${level}] ${agentName}: ${message}${detail}\n`);
|
|
602
|
+
} catch {
|
|
603
|
+
// nothing to do if stderr is unwritable
|
|
604
|
+
}
|
|
605
|
+
};
|
|
606
|
+
}
|
|
607
|
+
|
|
608
|
+
async function teeStreamToWriter(
|
|
609
|
+
stream: ReadableStream<Uint8Array> | null | undefined,
|
|
610
|
+
writer: { write(data: Uint8Array): unknown; end(): unknown },
|
|
611
|
+
): Promise<void> {
|
|
612
|
+
const awaitMaybe = async (value: unknown): Promise<void> => {
|
|
613
|
+
if (value && typeof (value as Promise<unknown>).then === "function") {
|
|
614
|
+
await value;
|
|
615
|
+
}
|
|
616
|
+
};
|
|
617
|
+
|
|
618
|
+
if (!stream || typeof stream.getReader !== "function") {
|
|
619
|
+
try {
|
|
620
|
+
await awaitMaybe(writer.end());
|
|
621
|
+
} catch {
|
|
622
|
+
// ignore
|
|
623
|
+
}
|
|
624
|
+
return;
|
|
625
|
+
}
|
|
626
|
+
const reader = stream.getReader();
|
|
627
|
+
try {
|
|
628
|
+
while (true) {
|
|
629
|
+
const { done, value } = await reader.read();
|
|
630
|
+
if (done) break;
|
|
631
|
+
if (value && value.byteLength > 0) {
|
|
632
|
+
try {
|
|
633
|
+
await awaitMaybe(writer.write(value));
|
|
634
|
+
} catch {
|
|
635
|
+
// best-effort; do not block the parser on log file writes
|
|
636
|
+
}
|
|
637
|
+
}
|
|
638
|
+
}
|
|
639
|
+
} catch {
|
|
640
|
+
// ignore stream errors
|
|
641
|
+
} finally {
|
|
642
|
+
try {
|
|
643
|
+
reader.releaseLock();
|
|
644
|
+
} catch {
|
|
645
|
+
// ignore
|
|
646
|
+
}
|
|
647
|
+
try {
|
|
648
|
+
await awaitMaybe(writer.end());
|
|
649
|
+
} catch {
|
|
650
|
+
// ignore
|
|
651
|
+
}
|
|
652
|
+
}
|
|
653
|
+
}
|
|
654
|
+
|
|
655
|
+
/**
 * Spawn one turn for an agent and drive it to completion.
 *
 * Lifecycle: acquire the per-agent turn lock → re-read session state →
 * spawn the runtime subprocess → publish turn.pid → feed the user turn on
 * stdin → drain the event parser (with a stall watchdog and abort wiring) →
 * await exit → derive the final session state → notify the parent on
 * abnormal endings → clean up turn.pid and release the lock.
 *
 * Always releases the per-agent lock, including on thrown errors.
 *
 * @param opts - Turn configuration plus test seams (`_spawnFn`, `_now`,
 *   `_logWarning`, `_scopeDetect`, `_mailStore`, `_onLastActivityRefresh`).
 * @returns A TurnResult summarizing exit code, observed session id,
 *   terminal-mail observations, state transition, and duration.
 * @throws AgentError when the runtime lacks spawn-per-turn support or when
 *   turn.pid cannot be written (the cross-process kill primitive).
 */
export async function runTurn(opts: RunTurnOpts): Promise<TurnResult> {
  const {
    agentName,
    capability,
    overstoryDir,
    worktreePath,
    projectRoot,
    taskId,
    userTurnNdjson,
    runtime,
    resolvedModel,
    runId,
    mailDbPath,
    eventsDbPath,
    sessionsDbPath,
  } = opts;

  // Fail fast: both runtime capabilities are required before taking the lock.
  if (!runtime.buildDirectSpawn) {
    throw new AgentError(
      `Runtime "${runtime.id}" does not support buildDirectSpawn; cannot use spawn-per-turn`,
      { agentName },
    );
  }
  if (!runtime.parseEvents) {
    throw new AgentError(
      `Runtime "${runtime.id}" does not support parseEvents; cannot use spawn-per-turn`,
      { agentName },
    );
  }

  const spawnFn = opts._spawnFn ?? defaultSpawnFn;
  const now = opts._now ?? (() => new Date());
  const sigkillDelayMs = opts.sigkillDelayMs ?? 2000;

  // Pre-lock peek so the empty-input path can short-circuit without
  // paying the lock cost or transitioning state.
  let preInitialState: AgentState = "booting";
  try {
    const preStore = createSessionStore(sessionsDbPath);
    try {
      const session = preStore.getByName(agentName);
      if (session) preInitialState = session.state;
    } finally {
      preStore.close();
    }
  } catch {
    // non-fatal — fall back to "booting"
  }

  // Empty user turn: nothing to run; report a no-op result with state unchanged.
  if (userTurnNdjson === "") {
    return {
      exitCode: null,
      cleanResult: false,
      newSessionId: null,
      resumeMismatch: false,
      terminalMailObserved: false,
      durationMs: 0,
      initialState: preInitialState,
      finalState: preInitialState,
      stallAborted: false,
      terminalMailMissing: false,
    };
  }

  const lock = await acquireTurnLock({ agentName, overstoryDir });
  const startedAtMs = now().getTime();
  let initialState: AgentState = preInitialState;
  let priorSessionId: string | null = null;
  let parentAgent: string | null = null;
  let sessionLastActivity: string | null = null;
  let turnPidPath: string | null = null;
  // Per-turn diagnostic sink. Bound after the turn log dir is created;
  // pre-creation failures (rare — only the lock-held SessionStore re-read)
  // remain silent because the file path doesn't exist yet.
  let runnerLog: RunnerLogger = opts._logWarning ?? defaultRunnerLogger(agentName, null);

  try {
    // Re-read session under the lock — the value passed to the caller may be
    // stale if another process just updated it.
    try {
      const store = createSessionStore(sessionsDbPath);
      try {
        const session = store.getByName(agentName);
        if (session) {
          initialState = session.state;
          priorSessionId = session.claudeSessionId ?? null;
          parentAgent = session.parentAgent ?? null;
          sessionLastActivity = session.lastActivity ?? null;
        }
      } finally {
        store.close();
      }
    } catch {
      // non-fatal — fall back to pre-lock peek
    }

    // Overstory-specific env vars layered on top of the runtime's own env.
    const directEnv: Record<string, string> = {
      ...runtime.buildEnv(resolvedModel),
      OVERSTORY_AGENT_NAME: agentName,
      OVERSTORY_WORKTREE_PATH: worktreePath,
      OVERSTORY_TASK_ID: taskId,
      OVERSTORY_PROJECT_ROOT: projectRoot,
    };
    const spawnEnv: Record<string, string> = {
      ...(process.env as Record<string, string>),
      ...directEnv,
    };

    // Model flag is only passed through on an explicit override; otherwise the
    // runtime's default model selection applies.
    const directOpts: DirectSpawnOpts = {
      cwd: worktreePath,
      env: directEnv,
      ...(resolvedModel.isExplicitOverride ? { model: resolvedModel.model } : {}),
      instructionPath: runtime.instructionPath,
      resumeSessionId: priorSessionId,
    };
    const argv = runtime.buildDirectSpawn(directOpts);

    // Per-turn log dir keyed by a filesystem-safe ISO timestamp.
    const logTimestamp = now().toISOString().replace(/[:.]/g, "-");
    const turnLogDir = join(overstoryDir, "logs", agentName, logTimestamp);
    await mkdir(turnLogDir, { recursive: true });
    const stderrPath = join(turnLogDir, "stderr.log");
    const stderrWriter = Bun.file(stderrPath).writer();

    // Bind the runner-diagnostic sink now that the per-turn log dir exists.
    // Subsequent silent-failure paths (SessionStore writes, turn.pid I/O)
    // route through `runnerLog` so future leaks/contract violations are
    // diagnosable (overstory-4af3).
    const runnerLogPath = join(turnLogDir, "runner.log");
    runnerLog = opts._logWarning ?? defaultRunnerLogger(agentName, runnerLogPath);

    // Per-agent state dir (shared with applied-records.json, identity.yaml).
    // Holds turn.pid while a turn is in flight so other processes (`ov stop`,
    // watchdog) can find and signal the live claude PID.
    const agentStateDir = join(overstoryDir, "agents", agentName);
    await mkdir(agentStateDir, { recursive: true });
    turnPidPath = join(agentStateDir, "turn.pid");

    // Snapshot the terminal-mail baseline at the latest prior terminal mail
    // (`worker_done`/`result` for task workers, `merged`/`merge_failed` for
    // merger). Querying the actual prior timestamp — rather than wall-clock
    // `now()` — closes the misattribution window where a prior turn's
    // terminal mail lands between baseline capture and spawn (overstory-088b
    // C1). Falls back to epoch when no prior terminal mail exists, so the
    // first terminal mail of the agent's lifetime is attributed to this turn.
    const snapshotTs =
      latestTerminalMailTs(mailDbPath, agentName, capability) ?? new Date(0).toISOString();

    // Spawn. Failures here propagate after the finally below releases the lock.
    let proc: TurnSubprocess;
    try {
      proc = spawnFn(argv, {
        cwd: worktreePath,
        env: spawnEnv,
        stdin: "pipe",
        stdout: "pipe",
        stderr: "pipe",
      });
    } catch (err) {
      try {
        await stderrWriter.end();
      } catch {
        // ignore
      }
      throw err;
    }

    // Publish the live claude PID so other processes (`ov stop`, watchdog) can
    // find and signal it. turn.pid is the cross-process kill primitive for
    // headless task-scoped agents — without it, `ov stop` reads null and
    // silently degrades (overstory-62a6). Treat write failure as a contract
    // violation (symmetric with the cleanup-side assertion that turn.pid must
    // not survive the runner): SIGKILL the just-spawned subprocess and abort
    // the turn so the operator sees the failure instead of a half-broken
    // agent that cannot be killed.
    try {
      await Bun.write(turnPidPath, `${proc.pid}\n`);
    } catch (err) {
      runnerLog(
        "error",
        `failed to write turn.pid at ${turnPidPath} — kill primitive unavailable, aborting turn`,
        err,
      );
      try {
        proc.kill("SIGKILL");
      } catch {
        // process may have already exited
      }
      try {
        await stderrWriter.end();
      } catch {
        // ignore
      }
      throw new AgentError(
        `failed to write turn.pid at ${turnPidPath}: ${err instanceof Error ? err.message : String(err)}`,
        { agentName, ...(err instanceof Error ? { cause: err } : {}) },
      );
    }

    // Tee stderr stream into the per-turn stderr.log without blocking the parser.
    const stderrStream = (proc as unknown as { stderr?: ReadableStream<Uint8Array> | null }).stderr;
    const stderrTeePromise = teeStreamToWriter(stderrStream, {
      write: (data) => stderrWriter.write(data),
      end: () => stderrWriter.end(),
    });

    // Write the user turn and close stdin so claude sees EOF.
    // NOTE(review): stdin.write/end may be sync or promise-returning depending
    // on the spawn implementation — both shapes are handled below.
    try {
      const writeRes = proc.stdin.write(userTurnNdjson);
      if (writeRes && typeof (writeRes as Promise<unknown>).then === "function") {
        await writeRes;
      }
      if (typeof proc.stdin.end === "function") {
        const endRes = proc.stdin.end();
        if (endRes && typeof (endRes as Promise<unknown>).then === "function") {
          await endRes;
        }
      }
    } catch (err) {
      try {
        proc.kill();
      } catch {
        // ignore
      }
      throw err;
    }

    // Abort wiring — SIGTERM, then SIGKILL after sigkillDelayMs.
    let aborted = false;
    let sigkillTimer: ReturnType<typeof setTimeout> | null = null;
    const onAbort = (): void => {
      if (aborted) return;
      aborted = true;
      try {
        proc.kill("SIGTERM");
      } catch {
        // process may have already exited
      }
      sigkillTimer = setTimeout(() => {
        try {
          proc.kill("SIGKILL");
        } catch {
          // ignore
        }
      }, sigkillDelayMs);
      // unref so a pending SIGKILL escalation cannot keep the process alive
      (sigkillTimer as { unref?: () => void }).unref?.();
    };
    if (opts.abortSignal) {
      if (opts.abortSignal.aborted) {
        onAbort();
      } else {
        opts.abortSignal.addEventListener("abort", onAbort, { once: true });
      }
    }

    // Drain parser, capture session id, tee events into events.db.
    const eventStore = createEventStore(eventsDbPath);
    let newSessionId: string | null = null;
    let cleanResult = false;
    let observedAnyEvent = false;
    // True iff this turn fired the "first parser event" transition into
    // `in_turn`. Replaces the legacy `bootedToWorking` flag; the trigger
    // now fires from booting OR between_turns OR working (legacy migration)
    // so a resumed spawn-per-turn agent flips back to `in_turn` at the
    // start of every batch (overstory-3087).
    let transitionedToInTurn = false;

    // Stall watchdog (overstory-ddb3): if no parser event arrives for
    // `eventStallTimeoutMs`, abort the turn via SIGTERM/SIGKILL. Otherwise a
    // hung claude (Anthropic API stall, deadlock) hangs the runner forever.
    const eventStallTimeoutMs = opts.eventStallTimeoutMs ?? 600_000;
    let stallAborted = false;
    let stallTimer: ReturnType<typeof setTimeout> | null = null;
    let stallSigkillTimer: ReturnType<typeof setTimeout> | null = null;
    const clearStallTimer = (): void => {
      if (stallTimer) {
        clearTimeout(stallTimer);
        stallTimer = null;
      }
    };
    const armStallTimer = (): void => {
      // Non-positive timeout disables the watchdog entirely.
      if (eventStallTimeoutMs <= 0) return;
      clearStallTimer();
      stallTimer = setTimeout(() => {
        if (aborted || stallAborted) return;
        stallAborted = true;
        runnerLog(
          "error",
          `parser stalled: no event for ${eventStallTimeoutMs}ms — aborting via SIGTERM`,
        );
        try {
          proc.kill("SIGTERM");
        } catch {
          // process may have already exited
        }
        stallSigkillTimer = setTimeout(() => {
          try {
            proc.kill("SIGKILL");
          } catch {
            // ignore
          }
        }, sigkillDelayMs);
        (stallSigkillTimer as { unref?: () => void }).unref?.();
      }, eventStallTimeoutMs);
      (stallTimer as { unref?: () => void }).unref?.();
    };

    // `AgentRuntime.parseEvents` is declared as a 1-param method, but the Claude
    // adapter accepts an `onSessionId` hook. Widen the call site so we can pass
    // the hook without depending on adapter-specific types.
    type ParseEventsWithOpts = (
      stream: ReadableStream<Uint8Array>,
      opts?: { onSessionId?: (sid: string) => void },
    ) => AsyncIterable<AgentEvent>;
    const parseEvents = runtime.parseEvents as unknown as ParseEventsWithOpts;

    // Arm before iteration so a process that never emits also gets caught.
    armStallTimer();

    try {
      const parser = parseEvents(proc.stdout, {
        onSessionId: (sid: string) => {
          newSessionId = sid;
          updateSessionClaudeId(sessionsDbPath, agentName, sid, (err) =>
            runnerLog("warn", "failed to persist claudeSessionId", err),
          );
          // Resume mismatch (overstory-088b C2): the runtime returned a
          // different session id than the one we asked it to resume.
          // `--resume` is best-effort — claude can decide to start a fresh
          // session if it cannot rehydrate the requested one. Surface a
          // structured warning event so observability mirrors the runner
          // diagnostic and downstream tooling can detect the mismatch.
          if (priorSessionId !== null && sid !== priorSessionId) {
            try {
              eventStore.insert({
                runId,
                agentName,
                sessionId: sid,
                eventType: "custom",
                toolName: null,
                toolArgs: null,
                toolDurationMs: null,
                level: "warn",
                data: JSON.stringify({
                  type: "resume_mismatch",
                  requestedSessionId: priorSessionId,
                  observedSessionId: sid,
                }),
              });
            } catch {
              // non-fatal — observability must not break the turn
            }
            runnerLog(
              "warn",
              `resume mismatch: requested ${priorSessionId} but runtime returned ${sid}`,
            );
          }
        },
      });

      // Mid-turn `lastActivity` refresh (overstory-8e61). The watchdog at
      // `src/watchdog/health.ts:242-243` documents that the runner advances
      // lastActivity per parser event; without this the row stayed at
      // `startedAt` for the whole turn and long turns got zombified live.
      const lastActivityRefreshIntervalMs = opts.lastActivityRefreshIntervalMs ?? 2000;
      let lastActivityRefreshMs = 0; // first event always refreshes

      for await (const event of parser) {
        // Every event re-arms the stall watchdog.
        armStallTimer();
        observedAnyEvent = true;

        // Keep `session.lastActivity` advancing while events flow so the
        // watchdog does not zombify a live agent mid-turn — see
        // `src/watchdog/health.ts:242-243` and overstory-8e61.
        const nowMs = now().getTime();
        if (nowMs - lastActivityRefreshMs >= lastActivityRefreshIntervalMs) {
          lastActivityRefreshMs = nowMs;
          updateSessionLastActivity(sessionsDbPath, agentName, (err) =>
            runnerLog("warn", "failed to refresh lastActivity mid-turn", err),
          );
          opts._onLastActivityRefresh?.();
        }

        // First parser event of a turn → settle into `in_turn`. Allowed
        // predecessors are `booting` (initial dispatch), `between_turns`
        // (next mail batch on a healthy worker), or already-`in_turn`
        // (idempotent — covers the case where a prior turn somehow left
        // the row at in_turn). Legacy `working` rows are intentionally
        // not in the matrix predecessor set (overstory-3087): spawn-
        // per-turn workers should not flow through `working`, so the
        // matrix keeps the substate path disjoint and a stale `working`
        // row is left alone rather than silently coerced.
        if (
          !transitionedToInTurn &&
          (initialState === "booting" || initialState === "between_turns")
        ) {
          transitionedToInTurn = true;
          updateSessionState(
            sessionsDbPath,
            agentName,
            "in_turn",
            (err) => runnerLog("warn", `failed to transition ${initialState} → in_turn`, err),
            (prev, attempted) =>
              runnerLog(
                "warn",
                `${initialState} → in_turn rejected: state is now ${prev} (attempted ${attempted})`,
              ),
          );
        }

        // The last `result` event decides whether the turn ended cleanly.
        if (event.type === "result") {
          cleanResult = event.isError !== true;
        }

        // Defense-in-depth (overstory-c92c): detect Bash mail-poll patterns
        // the lead.md prompt forbids (overstory-fa84). Warn-only — emit a
        // custom event before the original tool_use so observability tools
        // see the warning ahead of the offending call. Wrapped in try/catch
        // so detection failure cannot break the turn.
        if (event.type === "tool_use" && event.name === "Bash") {
          try {
            const input =
              typeof event.input === "object" && event.input !== null
                ? (event.input as Record<string, unknown>)
                : null;
            const command = input?.command;
            const detection = detectMailPollPattern(command);
            if (detection.matched) {
              const cmdStr = typeof command === "string" ? command : "";
              const truncated = cmdStr.length > 200 ? `${cmdStr.slice(0, 200)}…` : cmdStr;
              runnerLog(
                "warn",
                `detected mail-poll pattern in Bash command (${detection.reason}): ${truncated}`,
              );
              try {
                eventStore.insert({
                  runId,
                  agentName,
                  sessionId: newSessionId,
                  eventType: "custom",
                  toolName: null,
                  toolArgs: null,
                  toolDurationMs: null,
                  level: "warn",
                  data: JSON.stringify({
                    type: "mail_poll_detected",
                    reason: detection.reason,
                    command: cmdStr,
                  }),
                });
              } catch (insertErr) {
                runnerLog("warn", "failed to insert mail_poll_detected event", insertErr);
              }
            }
          } catch (detectErr) {
            runnerLog("warn", "mail-poll detector threw", detectErr);
          }
        }

        try {
          recordAgentEvent(eventStore, agentName, runId, newSessionId, event);
        } catch {
          // non-fatal — observability must not break the turn
        }
      }
    } catch (err) {
      // Parser iteration threw (malformed stream-json, decoder error, etc.).
      // The subprocess is still running and would orphan past lock.release()
      // if we just propagated the error (overstory-088b C3). Send SIGKILL so
      // it cannot keep producing output or holding resources, then rethrow
      // for the outer finally to clean up turn.pid and release the lock.
      runnerLog("error", "parser iteration threw — killing subprocess to avoid orphan", err);
      try {
        proc.kill("SIGKILL");
      } catch {
        // process may have already exited
      }
      throw err;
    } finally {
      clearStallTimer();
      if (stallSigkillTimer) {
        clearTimeout(stallSigkillTimer);
        stallSigkillTimer = null;
      }
      try {
        eventStore.close();
      } catch {
        // ignore
      }
    }

    let exitCode: number | null;
    try {
      exitCode = await proc.exited;
    } catch (err) {
      runnerLog("warn", "proc.exited rejected", err);
      exitCode = null;
    }
    if (sigkillTimer) {
      clearTimeout(sigkillTimer);
      sigkillTimer = null;
    }
    if (opts.abortSignal && !opts.abortSignal.aborted) {
      opts.abortSignal.removeEventListener("abort", onAbort);
    }
    // An aborted turn reports exitCode null regardless of the raw exit status.
    if (aborted || stallAborted) {
      exitCode = null;
    }

    // Wait for stderr drain so the log file isn't truncated mid-write.
    try {
      await stderrTeePromise;
    } catch {
      // best-effort
    }

    const terminalMailObserved = checkTerminalMailSince(
      mailDbPath,
      agentName,
      capability,
      snapshotTs,
    );

    // Soft scope-violation observability (overstory-9f4d). Builders sometimes
    // expand beyond their declared FILE_SCOPE; the lead needs a way to spot it
    // during merge verification. Surface a warn-level event into events.db
    // when the worker's modified files exceed FILE_SCOPE without an
    // `expansion_reason:` justification (commit body OR prior scope_expansion
    // mail). This is advisory — never aborts the turn, never blocks the
    // completed transition. All errors are swallowed.
    //
    // TODO: baseRef is hard-coded to "main"; a future improvement could
    // resolve the actual session-branch.txt for projects whose canonical
    // branch differs.
    if (terminalMailObserved && IMPLEMENTATION_CAPABILITIES.has(capability)) {
      try {
        const fileScope = await extractFileScope(worktreePath, runtime.instructionPath);
        if (fileScope.length > 0) {
          const detectFn = opts._scopeDetect ?? defaultDetectScopeViolation;
          const { violations, expansionReasons } = detectFn({
            worktreePath,
            baseRef: "main",
            fileScope,
          });
          if (violations.length > 0 && expansionReasons.length === 0) {
            const justified = hasScopeExpansionMail(mailDbPath, agentName);
            if (!justified) {
              runnerLog(
                "warn",
                `agent modified ${violations.length} file(s) outside declared FILE_SCOPE without justification: ${violations.join(", ")}. To suppress, include 'expansion_reason: <why>' in your last commit message OR send a scope_expansion mail to your lead.`,
              );
              try {
                // eventStore is already closed above; open a fresh handle for
                // this one advisory insert.
                const evStore = createEventStore(eventsDbPath);
                try {
                  evStore.insert({
                    runId,
                    agentName,
                    sessionId: newSessionId,
                    eventType: "custom",
                    toolName: null,
                    toolArgs: null,
                    toolDurationMs: null,
                    level: "warn",
                    data: JSON.stringify({
                      type: "scope_violation",
                      violations,
                      fileScope,
                    }),
                  });
                } finally {
                  try {
                    evStore.close();
                  } catch {
                    // best-effort
                  }
                }
              } catch {
                // observability must never break the runner
              }
            }
          }
        }
      } catch {
        // scope detection is advisory — swallow all errors
      }
    }

    const resumeMismatch =
      priorSessionId !== null && newSessionId !== null && newSessionId !== priorSessionId;

    // Contract violation (overstory-6071): claude exited cleanly (saw a
    // `result` event with isError:false) but never sent the capability's
    // terminal mail. Pre-fix this fell through to `working` and stayed
    // there forever — the process is gone but the session looks alive.
    // Surface loudly via the runner diagnostic sink and settle to
    // `completed` so operators don't see a zombie-but-labeled-working row.
    const terminalMailMissing = cleanResult && !terminalMailObserved && !aborted && !stallAborted;
    if (terminalMailMissing) {
      const expected = terminalMailTypesFor(capability).join("|") || "<none>";
      runnerLog(
        "error",
        `agent exited cleanly without sending terminal mail (expected ${expected}); marking completed and surfacing contract violation`,
      );
    }

    // Derive the end-of-turn session state from what the turn observed.
    let finalState: AgentState;
    if (aborted || stallAborted) {
      finalState = "zombie";
    } else if (cleanResult && terminalMailObserved) {
      finalState = "completed";
    } else if (terminalMailMissing) {
      finalState = "completed";
    } else if (observedAnyEvent || transitionedToInTurn) {
      // Turn produced events but did not complete — settle to
      // `between_turns`, NOT `working`, so the UI can distinguish a
      // spawn-per-turn worker waiting for its next mail batch from one
      // mid-execution. The watchdog will flip the row back to `in_turn`
      // on the next batch when the parser fires its first event
      // (overstory-3087).
      finalState = "between_turns";
    } else {
      finalState = initialState;
    }

    if (finalState !== initialState) {
      updateSessionState(
        sessionsDbPath,
        agentName,
        finalState,
        (err) => runnerLog("warn", `failed to transition state to ${finalState}`, err),
        (prev, attempted) =>
          runnerLog(
            "warn",
            `turn-end transition ${initialState} → ${attempted} rejected: state is now ${prev}`,
          ),
      );
    }

    // In-band parent notification (overstory-4159, overstory-c772). When the
    // turn ends without the capability's terminal mail — either because the
    // runner zombified (abort/stall) or claude exited cleanly without sending
    // `worker_done` — synthesize a `worker_died` mail to the parent so the
    // lead does not block forever waiting for a signal that will never come.
    // The watchdog's pre-tick state-snapshot dedup (mx-b0e54b) ensures a
    // later watchdog pass on the now-zombie session does not re-fire.
    const shouldNotifyParent =
      parentAgent !== null && (finalState === "zombie" || terminalMailMissing);
    if (shouldNotifyParent && parentAgent !== null) {
      const reason = aborted
        ? "Aborted by operator (SIGTERM)"
        : stallAborted
          ? "Parser stalled (no events within timeout)"
          : terminalMailMissing
            ? `Clean exit without terminal mail (expected ${terminalMailTypesFor(capability).join("|") || "<none>"})`
            : "Turn ended without terminal mail";
      notifyParentOfRunnerDeath({
        mailStore: opts._mailStore ?? null,
        mailDbPath,
        parentAgent,
        agentName,
        capability,
        taskId,
        reason,
        lastActivity: sessionLastActivity ?? new Date(startedAtMs).toISOString(),
        runnerLog,
      });
    }

    // `lastActivity` advancing past `startedAt` is a turn-cleanup contract
    // invariant — silent failure here was the smoking gun in overstory-4af3.
    const lastActivityOk = updateSessionLastActivity(sessionsDbPath, agentName, (err) =>
      runnerLog("warn", "failed to update lastActivity", err),
    );
    if (!lastActivityOk) {
      runnerLog(
        "error",
        "lastActivity stayed at startedAt — session.lastActivity is unreliable for this turn",
      );
    }

    const durationMs = now().getTime() - startedAtMs;

    return {
      exitCode,
      cleanResult,
      newSessionId,
      resumeMismatch,
      terminalMailObserved,
      durationMs,
      initialState,
      finalState,
      stallAborted,
      terminalMailMissing,
    };
  } finally {
    // PID-file cleanup so a follow-up turn never sees a stale PID (covers
    // thrown errors as well as the happy path). ENOENT is expected on the
    // "spawn never happened" path; any other error is a contract violation
    // because turn.pid is the cross-process kill primitive (overstory-2cf9).
    if (turnPidPath) {
      try {
        await unlink(turnPidPath);
      } catch (err) {
        const code = (err as NodeJS.ErrnoException | undefined)?.code;
        if (code !== "ENOENT") {
          runnerLog("error", `failed to unlink turn.pid at ${turnPidPath}`, err);
        }
      }
      // Contract assertion: turn.pid must NOT survive the runner. A
      // surviving file means a follow-up `ov stop` or watchdog will target
      // a stale PID. Surface the violation loudly (overstory-4af3).
      try {
        if (existsSync(turnPidPath)) {
          runnerLog(
            "error",
            `turn.pid still exists at ${turnPidPath} after cleanup — kill primitive will target stale PID`,
          );
        }
      } catch {
        // existsSync should not throw, but keep diagnostics defensive
      }
    }
    lock.release();
  }
}
|