@os-eco/overstory-cli 0.9.3 → 0.10.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +49 -18
- package/agents/builder.md +9 -8
- package/agents/coordinator.md +6 -6
- package/agents/lead.md +98 -82
- package/agents/merger.md +25 -14
- package/agents/reviewer.md +22 -16
- package/agents/scout.md +17 -12
- package/package.json +6 -3
- package/src/agents/capabilities.test.ts +85 -0
- package/src/agents/capabilities.ts +125 -0
- package/src/agents/headless-mail-injector.test.ts +448 -0
- package/src/agents/headless-mail-injector.ts +211 -0
- package/src/agents/headless-prompt.test.ts +102 -0
- package/src/agents/headless-prompt.ts +68 -0
- package/src/agents/hooks-deployer.test.ts +514 -14
- package/src/agents/hooks-deployer.ts +141 -0
- package/src/agents/overlay.test.ts +4 -4
- package/src/agents/overlay.ts +30 -8
- package/src/agents/turn-lock.test.ts +181 -0
- package/src/agents/turn-lock.ts +235 -0
- package/src/agents/turn-runner-dispatch.test.ts +182 -0
- package/src/agents/turn-runner-dispatch.ts +105 -0
- package/src/agents/turn-runner.test.ts +1450 -0
- package/src/agents/turn-runner.ts +1166 -0
- package/src/commands/clean.ts +56 -1
- package/src/commands/completions.test.ts +4 -1
- package/src/commands/coordinator.test.ts +127 -0
- package/src/commands/coordinator.ts +205 -6
- package/src/commands/dashboard.test.ts +188 -0
- package/src/commands/dashboard.ts +13 -3
- package/src/commands/doctor.ts +94 -77
- package/src/commands/group.test.ts +94 -0
- package/src/commands/group.ts +49 -20
- package/src/commands/init.test.ts +8 -0
- package/src/commands/init.ts +8 -1
- package/src/commands/log.test.ts +56 -11
- package/src/commands/log.ts +134 -69
- package/src/commands/mail.test.ts +162 -0
- package/src/commands/mail.ts +64 -9
- package/src/commands/merge.test.ts +112 -1
- package/src/commands/merge.ts +17 -4
- package/src/commands/monitor.ts +2 -1
- package/src/commands/nudge.test.ts +351 -4
- package/src/commands/nudge.ts +356 -34
- package/src/commands/run.test.ts +43 -7
- package/src/commands/serve/build.test.ts +202 -0
- package/src/commands/serve/build.ts +206 -0
- package/src/commands/serve/coordinator-actions.test.ts +339 -0
- package/src/commands/serve/coordinator-actions.ts +408 -0
- package/src/commands/serve/dev.test.ts +168 -0
- package/src/commands/serve/dev.ts +117 -0
- package/src/commands/serve/mail-actions.test.ts +312 -0
- package/src/commands/serve/mail-actions.ts +167 -0
- package/src/commands/serve/rest.test.ts +1323 -0
- package/src/commands/serve/rest.ts +708 -0
- package/src/commands/serve/static.ts +51 -0
- package/src/commands/serve/ws.test.ts +361 -0
- package/src/commands/serve/ws.ts +332 -0
- package/src/commands/serve.test.ts +459 -0
- package/src/commands/serve.ts +565 -0
- package/src/commands/sling.test.ts +85 -1
- package/src/commands/sling.ts +153 -64
- package/src/commands/status.test.ts +9 -0
- package/src/commands/status.ts +12 -4
- package/src/commands/stop.test.ts +174 -1
- package/src/commands/stop.ts +107 -8
- package/src/commands/supervisor.ts +2 -1
- package/src/commands/watch.test.ts +49 -4
- package/src/commands/watch.ts +153 -28
- package/src/commands/worktree.test.ts +319 -3
- package/src/commands/worktree.ts +86 -0
- package/src/config.test.ts +78 -0
- package/src/config.ts +43 -1
- package/src/doctor/consistency.test.ts +106 -0
- package/src/doctor/consistency.ts +50 -3
- package/src/doctor/serve.test.ts +95 -0
- package/src/doctor/serve.ts +86 -0
- package/src/doctor/types.ts +2 -1
- package/src/doctor/watchdog.ts +57 -1
- package/src/events/tailer.test.ts +234 -1
- package/src/events/tailer.ts +90 -0
- package/src/index.ts +53 -6
- package/src/json.ts +29 -0
- package/src/mail/client.ts +15 -2
- package/src/mail/store.test.ts +82 -0
- package/src/mail/store.ts +41 -4
- package/src/merge/lock.test.ts +149 -0
- package/src/merge/lock.ts +140 -0
- package/src/runtimes/__fixtures__/claude-stream-fixture.ts +22 -0
- package/src/runtimes/claude.test.ts +791 -1
- package/src/runtimes/claude.ts +323 -1
- package/src/runtimes/connections.test.ts +141 -1
- package/src/runtimes/connections.ts +73 -4
- package/src/runtimes/headless-connection.test.ts +264 -0
- package/src/runtimes/headless-connection.ts +158 -0
- package/src/runtimes/types.ts +10 -0
- package/src/schema-consistency.test.ts +1 -0
- package/src/sessions/store.test.ts +390 -24
- package/src/sessions/store.ts +184 -19
- package/src/test-setup.test.ts +31 -0
- package/src/test-setup.ts +28 -0
- package/src/types.ts +56 -1
- package/src/utils/pid.test.ts +85 -1
- package/src/utils/pid.ts +86 -1
- package/src/utils/process-scan.test.ts +53 -0
- package/src/utils/process-scan.ts +76 -0
- package/src/watchdog/daemon.test.ts +1520 -411
- package/src/watchdog/daemon.ts +442 -83
- package/src/watchdog/health.test.ts +157 -0
- package/src/watchdog/health.ts +92 -25
- package/src/worktree/process.test.ts +71 -0
- package/src/worktree/process.ts +25 -5
- package/src/worktree/tmux.test.ts +39 -0
- package/src/worktree/tmux.ts +23 -3
- package/templates/CLAUDE.md.tmpl +19 -8
- package/templates/overlay.md.tmpl +3 -2
|
@@ -0,0 +1,1166 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Per-turn engine for headless Claude Code agents (spawn-per-turn model).
|
|
3
|
+
*
|
|
4
|
+
* Owns a single agent turn end-to-end:
|
|
5
|
+
* - acquires per-agent serialization (in-process mutex + cross-process lease)
|
|
6
|
+
* - re-reads SessionStore under the lock so the prior `claudeSessionId` is fresh
|
|
7
|
+
* - spawns claude via the runtime's `buildDirectSpawn` (with `--resume` when available)
|
|
8
|
+
* - writes the user turn to a real stdin pipe and closes it (claude sees EOF)
|
|
9
|
+
* - drains `runtime.parseEvents` and tees events into events.db
|
|
10
|
+
* - captures the new session id via the parser's `onSessionId` hook
|
|
11
|
+
* - snapshots mail.db before spawn and detects the agent's capability-specific
|
|
12
|
+
* terminal mail (`worker_done` for builder/scout/reviewer/lead;
|
|
13
|
+
* `merged`/`merge_failed` for merger)
|
|
14
|
+
* - applies state-transition rules (booting → working, completed when done)
|
|
15
|
+
* - handles abort signals with SIGTERM → SIGKILL escalation
|
|
16
|
+
* - releases the lock on every exit path
|
|
17
|
+
*
|
|
18
|
+
* This module does NOT decide WHEN to run a turn. The mail injector and nudge
|
|
19
|
+
* command call `runTurn(opts)` when they have a user turn to deliver.
|
|
20
|
+
*/
|
|
21
|
+
|
|
22
|
+
import { Database } from "bun:sqlite";
|
|
23
|
+
import { appendFileSync, existsSync } from "node:fs";
|
|
24
|
+
import { mkdir, unlink } from "node:fs/promises";
|
|
25
|
+
import { join } from "node:path";
|
|
26
|
+
import { AgentError } from "../errors.ts";
|
|
27
|
+
import { createEventStore } from "../events/store.ts";
|
|
28
|
+
import { filterToolArgs } from "../events/tool-filter.ts";
|
|
29
|
+
import { createMailStore, type MailStore } from "../mail/store.ts";
|
|
30
|
+
import type { AgentEvent, AgentRuntime, DirectSpawnOpts } from "../runtimes/types.ts";
|
|
31
|
+
import { createSessionStore } from "../sessions/store.ts";
|
|
32
|
+
import type {
|
|
33
|
+
AgentState,
|
|
34
|
+
EventStore,
|
|
35
|
+
EventType,
|
|
36
|
+
ResolvedModel,
|
|
37
|
+
WorkerDiedPayload,
|
|
38
|
+
} from "../types.ts";
|
|
39
|
+
import { terminalMailTypesFor } from "./capabilities.ts";
|
|
40
|
+
import { acquireTurnLock } from "./turn-lock.ts";
|
|
41
|
+
|
|
42
|
+
/**
 * Minimal child-process surface that `runTurn` relies on. Structurally
 * compatible with the object returned by `Bun.spawn`.
 */
export interface TurnSubprocess {
  readonly pid: number;
  /** Writable stdin handle; `end` and `flush` are optional on the shape. */
  readonly stdin: {
    write(data: string | Uint8Array): number | Promise<number> | unknown;
    end?(): void | Promise<void> | unknown;
    flush?(): unknown;
  };
  /** Byte stream of the child's stdout. */
  readonly stdout: ReadableStream<Uint8Array>;
  /** Settles with the exit code, or `null` when none is available. */
  readonly exited: Promise<number | null>;
  kill(signal?: number | string): void;
}
|
|
54
|
+
|
|
55
|
+
/**
 * Signature of the function used to launch a turn's subprocess. In production
 * `Bun.spawn` is cast to this type; tests may inject a fake.
 */
export type TurnSpawnFn = (
  cmd: string[],
  options: {
    cwd: string;
    env: Record<string, string>;
    stdin: "pipe";
    stdout: "pipe";
    stderr: "pipe" | "ignore" | number;
  },
) => TurnSubprocess;
|
|
66
|
+
|
|
67
|
+
/**
 * Severity levels for internal runner diagnostics. `error` is reserved for
 * contract violations.
 */
export type RunnerLogLevel = "warn" | "error";
|
|
69
|
+
|
|
70
|
+
/**
 * Sink for internal runner diagnostics. It exists so that failures around
 * SessionStore writes and turn.pid I/O are reported instead of being lost in
 * empty `catch {}` blocks (overstory-4af3). Also serves as a test injection
 * point.
 */
export type RunnerLogger = (level: RunnerLogLevel, message: string, err?: unknown) => void;
|
|
76
|
+
|
|
77
|
+
/** Options accepted by `runTurn`. Underscore-prefixed fields are test injection points. */
export interface RunTurnOpts {
  agentName: string;
  /**
   * Capability of the worker (builder/scout/reviewer/merger/lead). Passed to
   * {@link terminalMailTypesFor} to determine which mail types count as this
   * agent's completion signal.
   */
  capability: string;
  overstoryDir: string;
  worktreePath: string;
  projectRoot: string;
  taskId: string;
  /**
   * Stream-json envelope, already encoded by `encodeUserTurn`. Passing an
   * empty string makes the turn a no-op.
   */
  userTurnNdjson: string;
  runtime: AgentRuntime;
  resolvedModel: ResolvedModel;
  runId: string | null;
  mailDbPath: string;
  eventsDbPath: string;
  sessionsDbPath: string;
  /** Test injection: replacement spawn function (default: `Bun.spawn`). */
  _spawnFn?: TurnSpawnFn;
  /** Test injection: clock. */
  _now?: () => Date;
  /**
   * Test injection: an already-open MailStore used by the parent-notify path.
   * In production the helper opens `mailDbPath` for the duration of the send
   * and closes it again; tests hand in a shared in-memory store so inserted
   * mail can be inspected without reopening the DB file.
   */
  _mailStore?: MailStore;
  /**
   * Test injection: sink for runner diagnostics. If absent, warnings are
   * appended to `<turnLogDir>/runner.log` and mirrored to `process.stderr`
   * prefixed with `[turn-runner:<level>] <agent>:`.
   */
  _logWarning?: RunnerLogger;
  /** Abort requested by an operator (e.g. `ov stop`). */
  abortSignal?: AbortSignal;
  /** Delay between SIGTERM and SIGKILL when aborting. Defaults to 2000ms. */
  sigkillDelayMs?: number;
  /**
   * Stall watchdog for the event stream: the longest allowed gap (ms) between
   * parser events. When exceeded, the runner aborts the turn with SIGTERM and
   * escalates to SIGKILL after `sigkillDelayMs`. The timer restarts on every
   * event emitted by the runtime parser. Defaults to 600000ms (10 minutes),
   * which is long enough for slow tool calls yet still bounds turns where
   * claude hangs (overstory-ddb3).
   *
   * `0` disables the watchdog (test injection / explicit opt-out only).
   */
  eventStallTimeoutMs?: number;
}
|
|
129
|
+
|
|
130
|
+
/** Outcome of a single `runTurn` invocation. */
export interface TurnResult {
  /** Exit code of the subprocess; `null` if the turn was aborted before exit. */
  exitCode: number | null;
  /** Set only when the parser saw a `result` event whose `isError` is false. */
  cleanResult: boolean;
  /** Session id read from this turn's stream-json; may not match the prior one. */
  newSessionId: string | null;
  /** Set only when a prior session id was requested and the new id differs from it. */
  resumeMismatch: boolean;
  /**
   * Set only when the agent's capability-specific terminal mail showed up
   * during the turn: `worker_done` for builder/scout/reviewer/lead, or
   * `merged`/`merge_failed` for merger.
   */
  terminalMailObserved: boolean;
  /** Turn duration in wall-clock milliseconds. */
  durationMs: number;
  /** AgentState found in SessionStore when the turn began. */
  initialState: AgentState;
  /** AgentState produced by the transition rules and written on exit. */
  finalState: AgentState;
  /**
   * Set only when the per-event stall watchdog tripped, i.e. the runner sent
   * SIGTERM/SIGKILL after `eventStallTimeoutMs` elapsed with no parser event
   * (overstory-ddb3). For finalState it is handled the same way as `aborted`
   * (`zombie`).
   */
  stallAborted: boolean;
  /**
   * Set only when claude exited cleanly (`cleanResult` true) but never sent
   * the capability-specific terminal mail (overstory-6071). This is a contract
   * violation — the turn finished without a completion signal. It is logged at
   * `error` level through the runner diagnostic sink and exposed here so
   * callers can audit it.
   */
  terminalMailMissing: boolean;
}
|
|
167
|
+
|
|
168
|
+
/** Production spawn function: hands off to `Bun.spawn` and casts the result to `TurnSubprocess`. */
const defaultSpawnFn: TurnSpawnFn = (cmd, options) => {
  const child = Bun.spawn(cmd, options);
  return child as unknown as TurnSubprocess;
};
|
|
170
|
+
|
|
171
|
+
/**
 * Translate a runtime parser event `type` into the `EventType` written to the
 * event store. Types without a dedicated mapping are recorded as `"custom"`.
 */
function mapAgentEventType(type: string): EventType {
  if (type === "tool_use") return "tool_start";
  if (type === "tool_result") return "tool_end";
  if (type === "status") return "session_start";
  if (type === "assistant_message") return "progress";
  // These two names are shared between the parser and the event store.
  if (type === "result" || type === "error") return type;
  return "custom";
}
|
|
189
|
+
|
|
190
|
+
/**
 * Persist one parser event into the event store.
 *
 * The whole event is serialized into `data` (or `null` if serialization
 * throws). Type-specific columns are filled as follows:
 * - `tool_use`: `toolName` from `event.name` (when it is a string) and
 *   `toolArgs` from `filterToolArgs`; level is always `info`.
 * - `result`: `toolDurationMs` from a numeric `event.durationMs` (rounded);
 *   level is `error` when `event.isError === true`.
 * - anything else: level is `error` only for `error` events.
 */
function recordAgentEvent(
  eventStore: EventStore,
  agentName: string,
  runId: string | null,
  sessionId: string | null,
  event: AgentEvent,
): void {
  const eventType = mapAgentEventType(event.type);

  let serialized: string | null = null;
  try {
    serialized = JSON.stringify(event);
  } catch {
    serialized = null;
  }

  // Defaults match the generic (non tool_use / non result) row.
  let toolName: string | null = null;
  let toolArgs: string | null = null;
  let toolDurationMs: number | null = null;
  let level: "info" | "error" = "info";

  if (event.type === "tool_use") {
    toolName = typeof event.name === "string" ? event.name : null;
    const rawInput = event.input;
    const input =
      typeof rawInput === "object" && rawInput !== null
        ? (rawInput as Record<string, unknown>)
        : {};
    // Without a tool name there is nothing to filter against.
    const filtered = toolName ? filterToolArgs(toolName, input) : null;
    toolArgs = filtered ? JSON.stringify(filtered.args) : null;
  } else if (event.type === "result") {
    if (typeof event.durationMs === "number") {
      toolDurationMs = Math.round(event.durationMs);
    }
    if (event.isError === true) {
      level = "error";
    }
  } else if (event.type === "error") {
    level = "error";
  }

  eventStore.insert({
    runId,
    agentName,
    sessionId,
    eventType,
    toolName,
    toolArgs,
    toolDurationMs,
    level,
    data: serialized,
  });
}
|
|
253
|
+
|
|
254
|
+
/**
 * Report whether `agentName` has sent at least one terminal mail (types given
 * by `terminalMailTypesFor(capability)`) with `created_at` strictly after
 * `sinceTs`.
 *
 * Returns `false` when the capability has no terminal mail types, when the
 * mail DB cannot be opened, or when the query fails.
 */
function checkTerminalMailSince(
  mailDbPath: string,
  agentName: string,
  capability: string,
  sinceTs: string,
): boolean {
  const terminalTypes = terminalMailTypesFor(capability);
  if (terminalTypes.length === 0) return false;

  let handle: Database | null = null;
  try {
    handle = new Database(mailDbPath);
    handle.exec("PRAGMA busy_timeout = 5000");

    // One named placeholder per terminal type: $t0, $t1, ...
    const bindings: Record<string, string> = { $a: agentName, $ts: sinceTs };
    const slots: string[] = [];
    for (const [idx, mailType] of terminalTypes.entries()) {
      const slot = `$t${idx}`;
      slots.push(slot);
      bindings[slot] = mailType;
    }

    const query = handle.prepare<{ c: number }, Record<string, string>>(
      `SELECT 1 AS c FROM messages WHERE from_agent = $a AND type IN (${slots.join(",")}) AND created_at > $ts LIMIT 1`,
    );
    return query.get(bindings) !== null;
  } catch {
    return false;
  } finally {
    try {
      handle?.close();
    } catch {
      // best-effort
    }
  }
}
|
|
290
|
+
|
|
291
|
+
/**
 * Most recent `created_at` of any terminal mail sent by `agentName`, where the
 * terminal mail types are those returned by `terminalMailTypesFor(capability)`.
 *
 * Yields `null` if the agent has never sent terminal mail, if the capability
 * has no terminal types, or if the mail DB cannot be opened or queried. The
 * runner treats the value as the baseline snapshot for the upcoming turn:
 * terminal mail with `created_at > snapshot` belongs to the spawn about to
 * start. Using the real prior timestamp rather than `now()` closes the
 * misattribution window in which a prior-turn `worker_done` arriving between
 * baseline capture and spawn would wrongly satisfy the "terminal mail
 * observed" check (overstory-088b C1).
 */
function latestTerminalMailTs(
  mailDbPath: string,
  agentName: string,
  capability: string,
): string | null {
  const terminalTypes = terminalMailTypesFor(capability);
  if (terminalTypes.length === 0) return null;

  let handle: Database | null = null;
  try {
    handle = new Database(mailDbPath);
    handle.exec("PRAGMA busy_timeout = 5000");

    // One named placeholder per terminal type: $t0, $t1, ...
    const bindings: Record<string, string> = { $a: agentName };
    const slots: string[] = [];
    for (const [idx, mailType] of terminalTypes.entries()) {
      const slot = `$t${idx}`;
      slots.push(slot);
      bindings[slot] = mailType;
    }

    const query = handle.prepare<{ ts: string | null }, Record<string, string>>(
      `SELECT MAX(created_at) AS ts FROM messages WHERE from_agent = $a AND type IN (${slots.join(",")})`,
    );
    const latest = query.get(bindings);
    return latest?.ts ?? null;
  } catch {
    return null;
  } finally {
    try {
      handle?.close();
    } catch {
      // best-effort
    }
  }
}
|
|
338
|
+
|
|
339
|
+
/**
 * Deliver a synthetic `worker_died` mail to the parent of a session whose turn
 * finished without the capability's terminal mail. This is the in-band
 * counterpart of the watchdog's `notifyParentOfDeath` (overstory-c111) and
 * covers two cases:
 *
 * - **Aborted / stalled** (zombie): the subprocess was killed by an operator
 *   `ov stop` or by the parser-stall watchdog before the agent could send
 *   `worker_done`/`merged` (overstory-c772).
 * - **terminalMailMissing**: claude exited cleanly yet never sent its terminal
 *   mail — the silent-no-op path (overstory-4159).
 *
 * Otherwise the lead would wait indefinitely for mail that never comes.
 * Double-notification is avoided by the watchdog's pre-tick state-snapshot
 * dedup (mx-b0e54b): a later tick on the now-zombie session observes
 * `stateBeforeTick === "zombie"` and skips its own notify.
 *
 * Fire-and-forget: failures are reported through `runnerLog` and are never
 * rethrown, so a mail-send problem cannot break the turn. When `mailStore` is
 * `null` a store is opened from `mailDbPath` and closed again afterwards; an
 * injected store is left open.
 */
function notifyParentOfRunnerDeath(ctx: {
  mailStore: MailStore | null;
  mailDbPath: string;
  parentAgent: string;
  agentName: string;
  capability: string;
  taskId: string;
  reason: string;
  lastActivity: string;
  runnerLog: RunnerLogger;
}): void {
  const { agentName, capability, taskId, reason, lastActivity, runnerLog } = ctx;

  // Use the injected store when present; otherwise open one we must close.
  let target: MailStore | null = ctx.mailStore;
  let mustClose = false;
  if (target === null) {
    try {
      target = createMailStore(ctx.mailDbPath);
      mustClose = true;
    } catch (err) {
      runnerLog("warn", "failed to open mail store for parent notify", err);
      return;
    }
  }

  const details: WorkerDiedPayload = {
    agentName,
    capability,
    taskId,
    reason,
    lastActivity,
    terminatedBy: "runner",
  };
  const text = [
    `Worker "${agentName}" (${capability}) on task ${taskId} ended without `,
    `sending its terminal mail. Reason: ${reason}. Last activity: ${lastActivity}. `,
    "Decide whether to retry the work, escalate, or report the failure upstream.",
  ].join("");

  try {
    target.insert({
      id: "",
      from: agentName,
      to: ctx.parentAgent,
      subject: `[RUNNER] worker_died: ${agentName}`,
      body: text,
      type: "worker_died",
      priority: "high",
      threadId: null,
      payload: JSON.stringify(details),
    });
  } catch (err) {
    runnerLog("warn", "failed to send worker_died mail to parent", err);
  } finally {
    if (mustClose) {
      try {
        target.close();
      } catch {
        // best-effort
      }
    }
  }
}
|
|
430
|
+
|
|
431
|
+
/**
 * Guarded state transition used by the turn runner. It goes through the
 * SessionStore CAS (`tryTransitionState`) so that a concurrent writer — for
 * example `ov stop` writing `completed` or the watchdog writing `zombie` — is
 * not silently overwritten when the runner settles the session to
 * working/completed/zombie at the end of a turn.
 *
 * Returns `true` only when the transition was applied. A rejected transition
 * is not fatal: the CAS leaves the conflicting writer's value in place, which
 * is the desired outcome of that race (overstory-a993).
 *
 * `onError` is invoked on database/IO failure. `onRejected` is invoked when
 * the CAS reports `illegal_transition` (the row exists but its state does not
 * permit the move). Both callbacks are diagnostic-only; the caller does not
 * need to recover.
 */
function updateSessionState(
  sessionsDbPath: string,
  agentName: string,
  state: AgentState,
  onError?: (err: unknown) => void,
  onRejected?: (prev: AgentState, attempted: AgentState) => void,
): boolean {
  try {
    const sessions = createSessionStore(sessionsDbPath);
    try {
      const result = sessions.tryTransitionState(agentName, state);
      if (result.ok) {
        return true;
      }
      if (result.reason === "illegal_transition") {
        onRejected?.(result.prev, result.attempted);
      }
      return false;
    } finally {
      // A failing close() surfaces through the outer catch as an error.
      sessions.close();
    }
  } catch (err) {
    onError?.(err);
    return false;
  }
}
|
|
471
|
+
|
|
472
|
+
/**
 * Refresh the session's last-activity marker via
 * `SessionStore.updateLastActivity`. Returns `true` on success; on any failure
 * (open, update, or close) `onError` is called and `false` is returned.
 */
function updateSessionLastActivity(
  sessionsDbPath: string,
  agentName: string,
  onError?: (err: unknown) => void,
): boolean {
  try {
    const sessions = createSessionStore(sessionsDbPath);
    try {
      sessions.updateLastActivity(agentName);
      return true;
    } finally {
      // A failing close() surfaces through the outer catch as an error.
      sessions.close();
    }
  } catch (err) {
    onError?.(err);
    return false;
  }
}
|
|
490
|
+
|
|
491
|
+
/**
 * Store `sessionId` as the agent's claude session id via
 * `SessionStore.updateClaudeSessionId`. Returns `true` on success; on any
 * failure (open, update, or close) `onError` is called and `false` is returned.
 */
function updateSessionClaudeId(
  sessionsDbPath: string,
  agentName: string,
  sessionId: string,
  onError?: (err: unknown) => void,
): boolean {
  try {
    const sessions = createSessionStore(sessionsDbPath);
    try {
      sessions.updateClaudeSessionId(agentName, sessionId);
      return true;
    } finally {
      // A failing close() surfaces through the outer catch as an error.
      sessions.close();
    }
  } catch (err) {
    onError?.(err);
    return false;
  }
}
|
|
510
|
+
|
|
511
|
+
/**
 * Build the default runner diagnostic sink.
 *
 * Each call appends `<iso-timestamp> [<level>] <message><detail>` to
 * `runnerLogPath` (synchronously; skipped when the path is `null`) and mirrors
 * `[turn-runner:<level>] <agent>: <message><detail>` to `process.stderr`.
 *
 * `<detail>` is empty when `err` is `undefined`/`null`, `: <message>` for
 * `Error` instances, and `: <String(err)>` otherwise. Plain objects, which
 * would otherwise render as the uninformative `[object Object]`, and values
 * that cannot be converted to a string are rendered as JSON instead.
 *
 * The sink never throws: formatting, file and stderr failures are all
 * swallowed, because diagnostics must never break the turn.
 *
 * @param agentName - Agent name used in the stderr prefix.
 * @param runnerLogPath - File to append to, or `null` for stderr only.
 * @returns A logger bound to the given agent and log file.
 */
function defaultRunnerLogger(agentName: string, runnerLogPath: string | null): RunnerLogger {
  // Render the optional error as a ": detail" suffix without ever throwing.
  const describeError = (err: unknown): string => {
    if (err === undefined || err === null) return "";
    if (err instanceof Error) return `: ${err.message}`;

    let text: string | null = null;
    try {
      text = String(err);
    } catch {
      // e.g. null-prototype objects cannot be converted to a primitive
      text = null;
    }
    if (typeof err === "object" && (text === null || text === "[object Object]")) {
      try {
        const json = JSON.stringify(err);
        if (typeof json === "string") text = json;
      } catch {
        // circular structure etc. — keep whatever text we already have
      }
    }
    return `: ${text ?? "<unprintable error value>"}`;
  };

  return (level, message, err) => {
    const ts = new Date().toISOString();
    const detail = describeError(err);
    if (runnerLogPath) {
      try {
        appendFileSync(runnerLogPath, `${ts} [${level}] ${message}${detail}\n`);
      } catch {
        // best-effort; the stderr mirror still surfaces the warning
      }
    }
    try {
      process.stderr.write(`[turn-runner:${level}] ${agentName}: ${message}${detail}\n`);
    } catch {
      // nothing to do if stderr is unwritable
    }
  };
}
|
|
541
|
+
|
|
542
|
+
/**
 * Copy every non-empty chunk from `stream` into `writer`, then end the writer.
 *
 * The copy is best-effort: write failures, stream read errors, and failures of
 * `releaseLock()` / `end()` are all swallowed. When `stream` is missing or has
 * no `getReader`, the writer is simply ended. Results of `write`/`end` are
 * awaited only when they are thenables.
 */
async function teeStreamToWriter(
  stream: ReadableStream<Uint8Array> | null | undefined,
  writer: { write(data: Uint8Array): unknown; end(): unknown },
): Promise<void> {
  // Await the value only if it looks like a promise.
  const settle = async (maybe: unknown): Promise<void> => {
    if (maybe && typeof (maybe as Promise<unknown>).then === "function") {
      await maybe;
    }
  };
  const closeWriter = async (): Promise<void> => {
    try {
      await settle(writer.end());
    } catch {
      // ignore
    }
  };

  if (!stream || typeof stream.getReader !== "function") {
    await closeWriter();
    return;
  }

  const reader = stream.getReader();
  try {
    for (;;) {
      const chunk = await reader.read();
      if (chunk.done) break;
      const bytes = chunk.value;
      if (!bytes || bytes.byteLength === 0) continue;
      try {
        await settle(writer.write(bytes));
      } catch {
        // best-effort; do not block the parser on log file writes
      }
    }
  } catch {
    // ignore stream errors
  } finally {
    try {
      reader.releaseLock();
    } catch {
      // ignore
    }
    await closeWriter();
  }
}
|
|
588
|
+
|
|
589
|
+
/**
|
|
590
|
+
* Spawn one turn for an agent and drive it to completion.
|
|
591
|
+
*
|
|
592
|
+
* Always releases the per-agent lock, including on thrown errors.
|
|
593
|
+
*/
|
|
594
|
+
export async function runTurn(opts: RunTurnOpts): Promise<TurnResult> {
|
|
595
|
+
const {
|
|
596
|
+
agentName,
|
|
597
|
+
capability,
|
|
598
|
+
overstoryDir,
|
|
599
|
+
worktreePath,
|
|
600
|
+
projectRoot,
|
|
601
|
+
taskId,
|
|
602
|
+
userTurnNdjson,
|
|
603
|
+
runtime,
|
|
604
|
+
resolvedModel,
|
|
605
|
+
runId,
|
|
606
|
+
mailDbPath,
|
|
607
|
+
eventsDbPath,
|
|
608
|
+
sessionsDbPath,
|
|
609
|
+
} = opts;
|
|
610
|
+
|
|
611
|
+
if (!runtime.buildDirectSpawn) {
|
|
612
|
+
throw new AgentError(
|
|
613
|
+
`Runtime "${runtime.id}" does not support buildDirectSpawn; cannot use spawn-per-turn`,
|
|
614
|
+
{ agentName },
|
|
615
|
+
);
|
|
616
|
+
}
|
|
617
|
+
if (!runtime.parseEvents) {
|
|
618
|
+
throw new AgentError(
|
|
619
|
+
`Runtime "${runtime.id}" does not support parseEvents; cannot use spawn-per-turn`,
|
|
620
|
+
{ agentName },
|
|
621
|
+
);
|
|
622
|
+
}
|
|
623
|
+
|
|
624
|
+
const spawnFn = opts._spawnFn ?? defaultSpawnFn;
|
|
625
|
+
const now = opts._now ?? (() => new Date());
|
|
626
|
+
const sigkillDelayMs = opts.sigkillDelayMs ?? 2000;
|
|
627
|
+
|
|
628
|
+
// Pre-lock peek so the empty-input path can short-circuit without
|
|
629
|
+
// paying the lock cost or transitioning state.
|
|
630
|
+
let preInitialState: AgentState = "booting";
|
|
631
|
+
try {
|
|
632
|
+
const preStore = createSessionStore(sessionsDbPath);
|
|
633
|
+
try {
|
|
634
|
+
const session = preStore.getByName(agentName);
|
|
635
|
+
if (session) preInitialState = session.state;
|
|
636
|
+
} finally {
|
|
637
|
+
preStore.close();
|
|
638
|
+
}
|
|
639
|
+
} catch {
|
|
640
|
+
// non-fatal — fall back to "booting"
|
|
641
|
+
}
|
|
642
|
+
|
|
643
|
+
if (userTurnNdjson === "") {
|
|
644
|
+
return {
|
|
645
|
+
exitCode: null,
|
|
646
|
+
cleanResult: false,
|
|
647
|
+
newSessionId: null,
|
|
648
|
+
resumeMismatch: false,
|
|
649
|
+
terminalMailObserved: false,
|
|
650
|
+
durationMs: 0,
|
|
651
|
+
initialState: preInitialState,
|
|
652
|
+
finalState: preInitialState,
|
|
653
|
+
stallAborted: false,
|
|
654
|
+
terminalMailMissing: false,
|
|
655
|
+
};
|
|
656
|
+
}
|
|
657
|
+
|
|
658
|
+
const lock = await acquireTurnLock({ agentName, overstoryDir });
|
|
659
|
+
const startedAtMs = now().getTime();
|
|
660
|
+
let initialState: AgentState = preInitialState;
|
|
661
|
+
let priorSessionId: string | null = null;
|
|
662
|
+
let parentAgent: string | null = null;
|
|
663
|
+
let sessionLastActivity: string | null = null;
|
|
664
|
+
let turnPidPath: string | null = null;
|
|
665
|
+
// Per-turn diagnostic sink. Bound after the turn log dir is created;
|
|
666
|
+
// pre-creation failures (rare — only the lock-held SessionStore re-read)
|
|
667
|
+
// remain silent because the file path doesn't exist yet.
|
|
668
|
+
let runnerLog: RunnerLogger = opts._logWarning ?? defaultRunnerLogger(agentName, null);
|
|
669
|
+
|
|
670
|
+
try {
|
|
671
|
+
// Re-read session under the lock — the value passed to the caller may be
|
|
672
|
+
// stale if another process just updated it.
|
|
673
|
+
try {
|
|
674
|
+
const store = createSessionStore(sessionsDbPath);
|
|
675
|
+
try {
|
|
676
|
+
const session = store.getByName(agentName);
|
|
677
|
+
if (session) {
|
|
678
|
+
initialState = session.state;
|
|
679
|
+
priorSessionId = session.claudeSessionId ?? null;
|
|
680
|
+
parentAgent = session.parentAgent ?? null;
|
|
681
|
+
sessionLastActivity = session.lastActivity ?? null;
|
|
682
|
+
}
|
|
683
|
+
} finally {
|
|
684
|
+
store.close();
|
|
685
|
+
}
|
|
686
|
+
} catch {
|
|
687
|
+
// non-fatal — fall back to pre-lock peek
|
|
688
|
+
}
|
|
689
|
+
|
|
690
|
+
const directEnv: Record<string, string> = {
|
|
691
|
+
...runtime.buildEnv(resolvedModel),
|
|
692
|
+
OVERSTORY_AGENT_NAME: agentName,
|
|
693
|
+
OVERSTORY_WORKTREE_PATH: worktreePath,
|
|
694
|
+
OVERSTORY_TASK_ID: taskId,
|
|
695
|
+
OVERSTORY_PROJECT_ROOT: projectRoot,
|
|
696
|
+
};
|
|
697
|
+
const spawnEnv: Record<string, string> = {
|
|
698
|
+
...(process.env as Record<string, string>),
|
|
699
|
+
...directEnv,
|
|
700
|
+
};
|
|
701
|
+
|
|
702
|
+
const directOpts: DirectSpawnOpts = {
|
|
703
|
+
cwd: worktreePath,
|
|
704
|
+
env: directEnv,
|
|
705
|
+
...(resolvedModel.isExplicitOverride ? { model: resolvedModel.model } : {}),
|
|
706
|
+
instructionPath: runtime.instructionPath,
|
|
707
|
+
resumeSessionId: priorSessionId,
|
|
708
|
+
};
|
|
709
|
+
const argv = runtime.buildDirectSpawn(directOpts);
|
|
710
|
+
|
|
711
|
+
const logTimestamp = now().toISOString().replace(/[:.]/g, "-");
|
|
712
|
+
const turnLogDir = join(overstoryDir, "logs", agentName, logTimestamp);
|
|
713
|
+
await mkdir(turnLogDir, { recursive: true });
|
|
714
|
+
const stderrPath = join(turnLogDir, "stderr.log");
|
|
715
|
+
const stderrWriter = Bun.file(stderrPath).writer();
|
|
716
|
+
|
|
717
|
+
// Bind the runner-diagnostic sink now that the per-turn log dir exists.
|
|
718
|
+
// Subsequent silent-failure paths (SessionStore writes, turn.pid I/O)
|
|
719
|
+
// route through `runnerLog` so future leaks/contract violations are
|
|
720
|
+
// diagnosable (overstory-4af3).
|
|
721
|
+
const runnerLogPath = join(turnLogDir, "runner.log");
|
|
722
|
+
runnerLog = opts._logWarning ?? defaultRunnerLogger(agentName, runnerLogPath);
|
|
723
|
+
|
|
724
|
+
// Per-agent state dir (shared with applied-records.json, identity.yaml).
|
|
725
|
+
// Holds turn.pid while a turn is in flight so other processes (`ov stop`,
|
|
726
|
+
// watchdog) can find and signal the live claude PID.
|
|
727
|
+
const agentStateDir = join(overstoryDir, "agents", agentName);
|
|
728
|
+
await mkdir(agentStateDir, { recursive: true });
|
|
729
|
+
turnPidPath = join(agentStateDir, "turn.pid");
|
|
730
|
+
|
|
731
|
+
// Snapshot the terminal-mail baseline at the latest prior terminal mail
|
|
732
|
+
// (`worker_done`/`result` for task workers, `merged`/`merge_failed` for
|
|
733
|
+
// merger). Querying the actual prior timestamp — rather than wall-clock
|
|
734
|
+
// `now()` — closes the misattribution window where a prior turn's
|
|
735
|
+
// terminal mail lands between baseline capture and spawn (overstory-088b
|
|
736
|
+
// C1). Falls back to epoch when no prior terminal mail exists, so the
|
|
737
|
+
// first terminal mail of the agent's lifetime is attributed to this turn.
|
|
738
|
+
const snapshotTs =
|
|
739
|
+
latestTerminalMailTs(mailDbPath, agentName, capability) ?? new Date(0).toISOString();
|
|
740
|
+
|
|
741
|
+
// Spawn. Failures here propagate after the finally below releases the lock.
|
|
742
|
+
let proc: TurnSubprocess;
|
|
743
|
+
try {
|
|
744
|
+
proc = spawnFn(argv, {
|
|
745
|
+
cwd: worktreePath,
|
|
746
|
+
env: spawnEnv,
|
|
747
|
+
stdin: "pipe",
|
|
748
|
+
stdout: "pipe",
|
|
749
|
+
stderr: "pipe",
|
|
750
|
+
});
|
|
751
|
+
} catch (err) {
|
|
752
|
+
try {
|
|
753
|
+
await stderrWriter.end();
|
|
754
|
+
} catch {
|
|
755
|
+
// ignore
|
|
756
|
+
}
|
|
757
|
+
throw err;
|
|
758
|
+
}
|
|
759
|
+
|
|
760
|
+
// Publish the live claude PID so other processes (`ov stop`, watchdog) can
|
|
761
|
+
// find and signal it. turn.pid is the cross-process kill primitive for
|
|
762
|
+
// headless task-scoped agents — without it, `ov stop` reads null and
|
|
763
|
+
// silently degrades (overstory-62a6). Treat write failure as a contract
|
|
764
|
+
// violation (symmetric with the cleanup-side assertion that turn.pid must
|
|
765
|
+
// not survive the runner): SIGKILL the just-spawned subprocess and abort
|
|
766
|
+
// the turn so the operator sees the failure instead of a half-broken
|
|
767
|
+
// agent that cannot be killed.
|
|
768
|
+
try {
|
|
769
|
+
await Bun.write(turnPidPath, `${proc.pid}\n`);
|
|
770
|
+
} catch (err) {
|
|
771
|
+
runnerLog(
|
|
772
|
+
"error",
|
|
773
|
+
`failed to write turn.pid at ${turnPidPath} — kill primitive unavailable, aborting turn`,
|
|
774
|
+
err,
|
|
775
|
+
);
|
|
776
|
+
try {
|
|
777
|
+
proc.kill("SIGKILL");
|
|
778
|
+
} catch {
|
|
779
|
+
// process may have already exited
|
|
780
|
+
}
|
|
781
|
+
try {
|
|
782
|
+
await stderrWriter.end();
|
|
783
|
+
} catch {
|
|
784
|
+
// ignore
|
|
785
|
+
}
|
|
786
|
+
throw new AgentError(
|
|
787
|
+
`failed to write turn.pid at ${turnPidPath}: ${err instanceof Error ? err.message : String(err)}`,
|
|
788
|
+
{ agentName, ...(err instanceof Error ? { cause: err } : {}) },
|
|
789
|
+
);
|
|
790
|
+
}
|
|
791
|
+
|
|
792
|
+
// Tee stderr stream into the per-turn stderr.log without blocking the parser.
|
|
793
|
+
const stderrStream = (proc as unknown as { stderr?: ReadableStream<Uint8Array> | null }).stderr;
|
|
794
|
+
const stderrTeePromise = teeStreamToWriter(stderrStream, {
|
|
795
|
+
write: (data) => stderrWriter.write(data),
|
|
796
|
+
end: () => stderrWriter.end(),
|
|
797
|
+
});
|
|
798
|
+
|
|
799
|
+
// Write the user turn and close stdin so claude sees EOF.
|
|
800
|
+
try {
|
|
801
|
+
const writeRes = proc.stdin.write(userTurnNdjson);
|
|
802
|
+
if (writeRes && typeof (writeRes as Promise<unknown>).then === "function") {
|
|
803
|
+
await writeRes;
|
|
804
|
+
}
|
|
805
|
+
if (typeof proc.stdin.end === "function") {
|
|
806
|
+
const endRes = proc.stdin.end();
|
|
807
|
+
if (endRes && typeof (endRes as Promise<unknown>).then === "function") {
|
|
808
|
+
await endRes;
|
|
809
|
+
}
|
|
810
|
+
}
|
|
811
|
+
} catch (err) {
|
|
812
|
+
try {
|
|
813
|
+
proc.kill();
|
|
814
|
+
} catch {
|
|
815
|
+
// ignore
|
|
816
|
+
}
|
|
817
|
+
throw err;
|
|
818
|
+
}
|
|
819
|
+
|
|
820
|
+
// Abort wiring — SIGTERM, then SIGKILL after sigkillDelayMs.
|
|
821
|
+
let aborted = false;
|
|
822
|
+
let sigkillTimer: ReturnType<typeof setTimeout> | null = null;
|
|
823
|
+
const onAbort = (): void => {
|
|
824
|
+
if (aborted) return;
|
|
825
|
+
aborted = true;
|
|
826
|
+
try {
|
|
827
|
+
proc.kill("SIGTERM");
|
|
828
|
+
} catch {
|
|
829
|
+
// process may have already exited
|
|
830
|
+
}
|
|
831
|
+
sigkillTimer = setTimeout(() => {
|
|
832
|
+
try {
|
|
833
|
+
proc.kill("SIGKILL");
|
|
834
|
+
} catch {
|
|
835
|
+
// ignore
|
|
836
|
+
}
|
|
837
|
+
}, sigkillDelayMs);
|
|
838
|
+
(sigkillTimer as { unref?: () => void }).unref?.();
|
|
839
|
+
};
|
|
840
|
+
if (opts.abortSignal) {
|
|
841
|
+
if (opts.abortSignal.aborted) {
|
|
842
|
+
onAbort();
|
|
843
|
+
} else {
|
|
844
|
+
opts.abortSignal.addEventListener("abort", onAbort, { once: true });
|
|
845
|
+
}
|
|
846
|
+
}
|
|
847
|
+
|
|
848
|
+
// Drain parser, capture session id, tee events into events.db.
|
|
849
|
+
const eventStore = createEventStore(eventsDbPath);
|
|
850
|
+
let newSessionId: string | null = null;
|
|
851
|
+
let cleanResult = false;
|
|
852
|
+
let observedAnyEvent = false;
|
|
853
|
+
let bootedToWorking = false;
|
|
854
|
+
|
|
855
|
+
// Stall watchdog (overstory-ddb3): if no parser event arrives for
|
|
856
|
+
// `eventStallTimeoutMs`, abort the turn via SIGTERM/SIGKILL. Otherwise a
|
|
857
|
+
// hung claude (Anthropic API stall, deadlock) hangs the runner forever.
|
|
858
|
+
const eventStallTimeoutMs = opts.eventStallTimeoutMs ?? 600_000;
|
|
859
|
+
let stallAborted = false;
|
|
860
|
+
let stallTimer: ReturnType<typeof setTimeout> | null = null;
|
|
861
|
+
let stallSigkillTimer: ReturnType<typeof setTimeout> | null = null;
|
|
862
|
+
const clearStallTimer = (): void => {
|
|
863
|
+
if (stallTimer) {
|
|
864
|
+
clearTimeout(stallTimer);
|
|
865
|
+
stallTimer = null;
|
|
866
|
+
}
|
|
867
|
+
};
|
|
868
|
+
const armStallTimer = (): void => {
|
|
869
|
+
if (eventStallTimeoutMs <= 0) return;
|
|
870
|
+
clearStallTimer();
|
|
871
|
+
stallTimer = setTimeout(() => {
|
|
872
|
+
if (aborted || stallAborted) return;
|
|
873
|
+
stallAborted = true;
|
|
874
|
+
runnerLog(
|
|
875
|
+
"error",
|
|
876
|
+
`parser stalled: no event for ${eventStallTimeoutMs}ms — aborting via SIGTERM`,
|
|
877
|
+
);
|
|
878
|
+
try {
|
|
879
|
+
proc.kill("SIGTERM");
|
|
880
|
+
} catch {
|
|
881
|
+
// process may have already exited
|
|
882
|
+
}
|
|
883
|
+
stallSigkillTimer = setTimeout(() => {
|
|
884
|
+
try {
|
|
885
|
+
proc.kill("SIGKILL");
|
|
886
|
+
} catch {
|
|
887
|
+
// ignore
|
|
888
|
+
}
|
|
889
|
+
}, sigkillDelayMs);
|
|
890
|
+
(stallSigkillTimer as { unref?: () => void }).unref?.();
|
|
891
|
+
}, eventStallTimeoutMs);
|
|
892
|
+
(stallTimer as { unref?: () => void }).unref?.();
|
|
893
|
+
};
|
|
894
|
+
|
|
895
|
+
// `AgentRuntime.parseEvents` is declared as a 1-param method, but the Claude
|
|
896
|
+
// adapter accepts an `onSessionId` hook. Widen the call site so we can pass
|
|
897
|
+
// the hook without depending on adapter-specific types.
|
|
898
|
+
type ParseEventsWithOpts = (
|
|
899
|
+
stream: ReadableStream<Uint8Array>,
|
|
900
|
+
opts?: { onSessionId?: (sid: string) => void },
|
|
901
|
+
) => AsyncIterable<AgentEvent>;
|
|
902
|
+
const parseEvents = runtime.parseEvents as unknown as ParseEventsWithOpts;
|
|
903
|
+
|
|
904
|
+
// Arm before iteration so a process that never emits also gets caught.
|
|
905
|
+
armStallTimer();
|
|
906
|
+
|
|
907
|
+
try {
|
|
908
|
+
const parser = parseEvents(proc.stdout, {
|
|
909
|
+
onSessionId: (sid: string) => {
|
|
910
|
+
newSessionId = sid;
|
|
911
|
+
updateSessionClaudeId(sessionsDbPath, agentName, sid, (err) =>
|
|
912
|
+
runnerLog("warn", "failed to persist claudeSessionId", err),
|
|
913
|
+
);
|
|
914
|
+
// Resume mismatch (overstory-088b C2): the runtime returned a
|
|
915
|
+
// different session id than the one we asked it to resume.
|
|
916
|
+
// `--resume` is best-effort — claude can decide to start a fresh
|
|
917
|
+
// session if it cannot rehydrate the requested one. Surface a
|
|
918
|
+
// structured warning event so observability mirrors the runner
|
|
919
|
+
// diagnostic and downstream tooling can detect the mismatch.
|
|
920
|
+
if (priorSessionId !== null && sid !== priorSessionId) {
|
|
921
|
+
try {
|
|
922
|
+
eventStore.insert({
|
|
923
|
+
runId,
|
|
924
|
+
agentName,
|
|
925
|
+
sessionId: sid,
|
|
926
|
+
eventType: "custom",
|
|
927
|
+
toolName: null,
|
|
928
|
+
toolArgs: null,
|
|
929
|
+
toolDurationMs: null,
|
|
930
|
+
level: "warn",
|
|
931
|
+
data: JSON.stringify({
|
|
932
|
+
type: "resume_mismatch",
|
|
933
|
+
requestedSessionId: priorSessionId,
|
|
934
|
+
observedSessionId: sid,
|
|
935
|
+
}),
|
|
936
|
+
});
|
|
937
|
+
} catch {
|
|
938
|
+
// non-fatal — observability must not break the turn
|
|
939
|
+
}
|
|
940
|
+
runnerLog(
|
|
941
|
+
"warn",
|
|
942
|
+
`resume mismatch: requested ${priorSessionId} but runtime returned ${sid}`,
|
|
943
|
+
);
|
|
944
|
+
}
|
|
945
|
+
},
|
|
946
|
+
});
|
|
947
|
+
|
|
948
|
+
for await (const event of parser) {
|
|
949
|
+
armStallTimer();
|
|
950
|
+
observedAnyEvent = true;
|
|
951
|
+
|
|
952
|
+
if (!bootedToWorking && initialState === "booting") {
|
|
953
|
+
bootedToWorking = true;
|
|
954
|
+
updateSessionState(
|
|
955
|
+
sessionsDbPath,
|
|
956
|
+
agentName,
|
|
957
|
+
"working",
|
|
958
|
+
(err) => runnerLog("warn", "failed to transition booting → working", err),
|
|
959
|
+
(prev, attempted) =>
|
|
960
|
+
runnerLog(
|
|
961
|
+
"warn",
|
|
962
|
+
`booting → working rejected: state is now ${prev} (attempted ${attempted})`,
|
|
963
|
+
),
|
|
964
|
+
);
|
|
965
|
+
}
|
|
966
|
+
|
|
967
|
+
if (event.type === "result") {
|
|
968
|
+
cleanResult = event.isError !== true;
|
|
969
|
+
}
|
|
970
|
+
|
|
971
|
+
try {
|
|
972
|
+
recordAgentEvent(eventStore, agentName, runId, newSessionId, event);
|
|
973
|
+
} catch {
|
|
974
|
+
// non-fatal — observability must not break the turn
|
|
975
|
+
}
|
|
976
|
+
}
|
|
977
|
+
} catch (err) {
|
|
978
|
+
// Parser iteration threw (malformed stream-json, decoder error, etc.).
|
|
979
|
+
// The subprocess is still running and would orphan past lock.release()
|
|
980
|
+
// if we just propagated the error (overstory-088b C3). Send SIGKILL so
|
|
981
|
+
// it cannot keep producing output or holding resources, then rethrow
|
|
982
|
+
// for the outer finally to clean up turn.pid and release the lock.
|
|
983
|
+
runnerLog("error", "parser iteration threw — killing subprocess to avoid orphan", err);
|
|
984
|
+
try {
|
|
985
|
+
proc.kill("SIGKILL");
|
|
986
|
+
} catch {
|
|
987
|
+
// process may have already exited
|
|
988
|
+
}
|
|
989
|
+
throw err;
|
|
990
|
+
} finally {
|
|
991
|
+
clearStallTimer();
|
|
992
|
+
if (stallSigkillTimer) {
|
|
993
|
+
clearTimeout(stallSigkillTimer);
|
|
994
|
+
stallSigkillTimer = null;
|
|
995
|
+
}
|
|
996
|
+
try {
|
|
997
|
+
eventStore.close();
|
|
998
|
+
} catch {
|
|
999
|
+
// ignore
|
|
1000
|
+
}
|
|
1001
|
+
}
|
|
1002
|
+
|
|
1003
|
+
let exitCode: number | null;
|
|
1004
|
+
try {
|
|
1005
|
+
exitCode = await proc.exited;
|
|
1006
|
+
} catch (err) {
|
|
1007
|
+
runnerLog("warn", "proc.exited rejected", err);
|
|
1008
|
+
exitCode = null;
|
|
1009
|
+
}
|
|
1010
|
+
if (sigkillTimer) {
|
|
1011
|
+
clearTimeout(sigkillTimer);
|
|
1012
|
+
sigkillTimer = null;
|
|
1013
|
+
}
|
|
1014
|
+
if (opts.abortSignal && !opts.abortSignal.aborted) {
|
|
1015
|
+
opts.abortSignal.removeEventListener("abort", onAbort);
|
|
1016
|
+
}
|
|
1017
|
+
if (aborted || stallAborted) {
|
|
1018
|
+
exitCode = null;
|
|
1019
|
+
}
|
|
1020
|
+
|
|
1021
|
+
// Wait for stderr drain so the log file isn't truncated mid-write.
|
|
1022
|
+
try {
|
|
1023
|
+
await stderrTeePromise;
|
|
1024
|
+
} catch {
|
|
1025
|
+
// best-effort
|
|
1026
|
+
}
|
|
1027
|
+
|
|
1028
|
+
const terminalMailObserved = checkTerminalMailSince(
|
|
1029
|
+
mailDbPath,
|
|
1030
|
+
agentName,
|
|
1031
|
+
capability,
|
|
1032
|
+
snapshotTs,
|
|
1033
|
+
);
|
|
1034
|
+
|
|
1035
|
+
const resumeMismatch =
|
|
1036
|
+
priorSessionId !== null && newSessionId !== null && newSessionId !== priorSessionId;
|
|
1037
|
+
|
|
1038
|
+
// Contract violation (overstory-6071): claude exited cleanly (saw a
|
|
1039
|
+
// `result` event with isError:false) but never sent the capability's
|
|
1040
|
+
// terminal mail. Pre-fix this fell through to `working` and stayed
|
|
1041
|
+
// there forever — the process is gone but the session looks alive.
|
|
1042
|
+
// Surface loudly via the runner diagnostic sink and settle to
|
|
1043
|
+
// `completed` so operators don't see a zombie-but-labeled-working row.
|
|
1044
|
+
const terminalMailMissing = cleanResult && !terminalMailObserved && !aborted && !stallAborted;
|
|
1045
|
+
if (terminalMailMissing) {
|
|
1046
|
+
const expected = terminalMailTypesFor(capability).join("|") || "<none>";
|
|
1047
|
+
runnerLog(
|
|
1048
|
+
"error",
|
|
1049
|
+
`agent exited cleanly without sending terminal mail (expected ${expected}); marking completed and surfacing contract violation`,
|
|
1050
|
+
);
|
|
1051
|
+
}
|
|
1052
|
+
|
|
1053
|
+
let finalState: AgentState;
|
|
1054
|
+
if (aborted || stallAborted) {
|
|
1055
|
+
finalState = "zombie";
|
|
1056
|
+
} else if (cleanResult && terminalMailObserved) {
|
|
1057
|
+
finalState = "completed";
|
|
1058
|
+
} else if (terminalMailMissing) {
|
|
1059
|
+
finalState = "completed";
|
|
1060
|
+
} else if (observedAnyEvent || bootedToWorking) {
|
|
1061
|
+
finalState = "working";
|
|
1062
|
+
} else {
|
|
1063
|
+
finalState = initialState;
|
|
1064
|
+
}
|
|
1065
|
+
|
|
1066
|
+
if (finalState !== initialState) {
|
|
1067
|
+
updateSessionState(
|
|
1068
|
+
sessionsDbPath,
|
|
1069
|
+
agentName,
|
|
1070
|
+
finalState,
|
|
1071
|
+
(err) => runnerLog("warn", `failed to transition state to ${finalState}`, err),
|
|
1072
|
+
(prev, attempted) =>
|
|
1073
|
+
runnerLog(
|
|
1074
|
+
"warn",
|
|
1075
|
+
`turn-end transition ${initialState} → ${attempted} rejected: state is now ${prev}`,
|
|
1076
|
+
),
|
|
1077
|
+
);
|
|
1078
|
+
}
|
|
1079
|
+
|
|
1080
|
+
// In-band parent notification (overstory-4159, overstory-c772). When the
|
|
1081
|
+
// turn ends without the capability's terminal mail — either because the
|
|
1082
|
+
// runner zombified (abort/stall) or claude exited cleanly without sending
|
|
1083
|
+
// `worker_done` — synthesize a `worker_died` mail to the parent so the
|
|
1084
|
+
// lead does not block forever waiting for a signal that will never come.
|
|
1085
|
+
// The watchdog's pre-tick state-snapshot dedup (mx-b0e54b) ensures a
|
|
1086
|
+
// later watchdog pass on the now-zombie session does not re-fire.
|
|
1087
|
+
const shouldNotifyParent =
|
|
1088
|
+
parentAgent !== null && (finalState === "zombie" || terminalMailMissing);
|
|
1089
|
+
if (shouldNotifyParent && parentAgent !== null) {
|
|
1090
|
+
const reason = aborted
|
|
1091
|
+
? "Aborted by operator (SIGTERM)"
|
|
1092
|
+
: stallAborted
|
|
1093
|
+
? "Parser stalled (no events within timeout)"
|
|
1094
|
+
: terminalMailMissing
|
|
1095
|
+
? `Clean exit without terminal mail (expected ${terminalMailTypesFor(capability).join("|") || "<none>"})`
|
|
1096
|
+
: "Turn ended without terminal mail";
|
|
1097
|
+
notifyParentOfRunnerDeath({
|
|
1098
|
+
mailStore: opts._mailStore ?? null,
|
|
1099
|
+
mailDbPath,
|
|
1100
|
+
parentAgent,
|
|
1101
|
+
agentName,
|
|
1102
|
+
capability,
|
|
1103
|
+
taskId,
|
|
1104
|
+
reason,
|
|
1105
|
+
lastActivity: sessionLastActivity ?? new Date(startedAtMs).toISOString(),
|
|
1106
|
+
runnerLog,
|
|
1107
|
+
});
|
|
1108
|
+
}
|
|
1109
|
+
|
|
1110
|
+
// `lastActivity` advancing past `startedAt` is a turn-cleanup contract
|
|
1111
|
+
// invariant — silent failure here was the smoking gun in overstory-4af3.
|
|
1112
|
+
const lastActivityOk = updateSessionLastActivity(sessionsDbPath, agentName, (err) =>
|
|
1113
|
+
runnerLog("warn", "failed to update lastActivity", err),
|
|
1114
|
+
);
|
|
1115
|
+
if (!lastActivityOk) {
|
|
1116
|
+
runnerLog(
|
|
1117
|
+
"error",
|
|
1118
|
+
"lastActivity stayed at startedAt — session.lastActivity is unreliable for this turn",
|
|
1119
|
+
);
|
|
1120
|
+
}
|
|
1121
|
+
|
|
1122
|
+
const durationMs = now().getTime() - startedAtMs;
|
|
1123
|
+
|
|
1124
|
+
return {
|
|
1125
|
+
exitCode,
|
|
1126
|
+
cleanResult,
|
|
1127
|
+
newSessionId,
|
|
1128
|
+
resumeMismatch,
|
|
1129
|
+
terminalMailObserved,
|
|
1130
|
+
durationMs,
|
|
1131
|
+
initialState,
|
|
1132
|
+
finalState,
|
|
1133
|
+
stallAborted,
|
|
1134
|
+
terminalMailMissing,
|
|
1135
|
+
};
|
|
1136
|
+
} finally {
|
|
1137
|
+
// PID-file cleanup so a follow-up turn never sees a stale PID (covers
|
|
1138
|
+
// thrown errors as well as the happy path). ENOENT is expected on the
|
|
1139
|
+
// "spawn never happened" path; any other error is a contract violation
|
|
1140
|
+
// because turn.pid is the cross-process kill primitive (overstory-2cf9).
|
|
1141
|
+
if (turnPidPath) {
|
|
1142
|
+
try {
|
|
1143
|
+
await unlink(turnPidPath);
|
|
1144
|
+
} catch (err) {
|
|
1145
|
+
const code = (err as NodeJS.ErrnoException | undefined)?.code;
|
|
1146
|
+
if (code !== "ENOENT") {
|
|
1147
|
+
runnerLog("error", `failed to unlink turn.pid at ${turnPidPath}`, err);
|
|
1148
|
+
}
|
|
1149
|
+
}
|
|
1150
|
+
// Contract assertion: turn.pid must NOT survive the runner. A
|
|
1151
|
+
// surviving file means a follow-up `ov stop` or watchdog will target
|
|
1152
|
+
// a stale PID. Surface the violation loudly (overstory-4af3).
|
|
1153
|
+
try {
|
|
1154
|
+
if (existsSync(turnPidPath)) {
|
|
1155
|
+
runnerLog(
|
|
1156
|
+
"error",
|
|
1157
|
+
`turn.pid still exists at ${turnPidPath} after cleanup — kill primitive will target stale PID`,
|
|
1158
|
+
);
|
|
1159
|
+
}
|
|
1160
|
+
} catch {
|
|
1161
|
+
// existsSync should not throw, but keep diagnostics defensive
|
|
1162
|
+
}
|
|
1163
|
+
}
|
|
1164
|
+
lock.release();
|
|
1165
|
+
}
|
|
1166
|
+
}
|