@os-eco/overstory-cli 0.9.4 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124)
  1. package/README.md +50 -19
  2. package/agents/builder.md +19 -9
  3. package/agents/coordinator.md +6 -6
  4. package/agents/lead.md +204 -87
  5. package/agents/merger.md +25 -14
  6. package/agents/reviewer.md +22 -16
  7. package/agents/scout.md +17 -12
  8. package/package.json +6 -3
  9. package/src/agents/capabilities.test.ts +85 -0
  10. package/src/agents/capabilities.ts +125 -0
  11. package/src/agents/headless-mail-injector.test.ts +448 -0
  12. package/src/agents/headless-mail-injector.ts +219 -0
  13. package/src/agents/headless-prompt.test.ts +102 -0
  14. package/src/agents/headless-prompt.ts +68 -0
  15. package/src/agents/hooks-deployer.test.ts +514 -14
  16. package/src/agents/hooks-deployer.ts +141 -0
  17. package/src/agents/mail-poll-detect.test.ts +153 -0
  18. package/src/agents/mail-poll-detect.ts +73 -0
  19. package/src/agents/overlay.test.ts +60 -4
  20. package/src/agents/overlay.ts +63 -8
  21. package/src/agents/scope-detect.test.ts +190 -0
  22. package/src/agents/scope-detect.ts +146 -0
  23. package/src/agents/turn-lock.test.ts +181 -0
  24. package/src/agents/turn-lock.ts +235 -0
  25. package/src/agents/turn-runner-dispatch.test.ts +182 -0
  26. package/src/agents/turn-runner-dispatch.ts +105 -0
  27. package/src/agents/turn-runner.test.ts +2312 -0
  28. package/src/agents/turn-runner.ts +1383 -0
  29. package/src/commands/agents.ts +9 -0
  30. package/src/commands/clean.ts +54 -0
  31. package/src/commands/coordinator.test.ts +254 -0
  32. package/src/commands/coordinator.ts +273 -8
  33. package/src/commands/dashboard.test.ts +188 -0
  34. package/src/commands/dashboard.ts +14 -4
  35. package/src/commands/doctor.ts +3 -1
  36. package/src/commands/group.test.ts +94 -0
  37. package/src/commands/group.ts +49 -20
  38. package/src/commands/init.test.ts +8 -0
  39. package/src/commands/init.ts +8 -1
  40. package/src/commands/log.test.ts +187 -11
  41. package/src/commands/log.ts +171 -71
  42. package/src/commands/mail.test.ts +162 -0
  43. package/src/commands/mail.ts +64 -9
  44. package/src/commands/merge.test.ts +230 -1
  45. package/src/commands/merge.ts +68 -12
  46. package/src/commands/nudge.test.ts +351 -4
  47. package/src/commands/nudge.ts +356 -34
  48. package/src/commands/run.test.ts +43 -7
  49. package/src/commands/serve/build.test.ts +202 -0
  50. package/src/commands/serve/build.ts +206 -0
  51. package/src/commands/serve/coordinator-actions.test.ts +339 -0
  52. package/src/commands/serve/coordinator-actions.ts +408 -0
  53. package/src/commands/serve/dev.test.ts +168 -0
  54. package/src/commands/serve/dev.ts +117 -0
  55. package/src/commands/serve/mail-actions.test.ts +312 -0
  56. package/src/commands/serve/mail-actions.ts +167 -0
  57. package/src/commands/serve/rest.test.ts +1323 -0
  58. package/src/commands/serve/rest.ts +708 -0
  59. package/src/commands/serve/static.ts +51 -0
  60. package/src/commands/serve/ws.test.ts +361 -0
  61. package/src/commands/serve/ws.ts +332 -0
  62. package/src/commands/serve.test.ts +459 -0
  63. package/src/commands/serve.ts +565 -0
  64. package/src/commands/sling.test.ts +177 -1
  65. package/src/commands/sling.ts +243 -71
  66. package/src/commands/status.test.ts +9 -0
  67. package/src/commands/status.ts +12 -4
  68. package/src/commands/stop.test.ts +255 -1
  69. package/src/commands/stop.ts +107 -8
  70. package/src/commands/watch.test.ts +43 -0
  71. package/src/commands/watch.ts +153 -28
  72. package/src/config.ts +23 -0
  73. package/src/doctor/consistency.test.ts +106 -0
  74. package/src/doctor/consistency.ts +48 -1
  75. package/src/doctor/serve.test.ts +95 -0
  76. package/src/doctor/serve.ts +86 -0
  77. package/src/doctor/types.ts +2 -1
  78. package/src/doctor/watchdog.ts +57 -1
  79. package/src/events/tailer.test.ts +234 -1
  80. package/src/events/tailer.ts +90 -0
  81. package/src/index.ts +57 -6
  82. package/src/insights/quality-gates.test.ts +141 -0
  83. package/src/insights/quality-gates.ts +156 -0
  84. package/src/json.ts +29 -0
  85. package/src/logging/theme.ts +4 -0
  86. package/src/mail/client.ts +15 -2
  87. package/src/mail/store.test.ts +82 -0
  88. package/src/mail/store.ts +41 -4
  89. package/src/merge/lock.test.ts +149 -0
  90. package/src/merge/lock.ts +140 -0
  91. package/src/merge/predict.test.ts +387 -0
  92. package/src/merge/predict.ts +249 -0
  93. package/src/merge/resolver.ts +1 -1
  94. package/src/mulch/client.ts +3 -3
  95. package/src/runtimes/__fixtures__/claude-stream-fixture.ts +22 -0
  96. package/src/runtimes/claude.test.ts +791 -1
  97. package/src/runtimes/claude.ts +323 -1
  98. package/src/runtimes/connections.test.ts +141 -1
  99. package/src/runtimes/connections.ts +73 -4
  100. package/src/runtimes/headless-connection.test.ts +264 -0
  101. package/src/runtimes/headless-connection.ts +158 -0
  102. package/src/runtimes/types.ts +10 -0
  103. package/src/schema-consistency.test.ts +1 -0
  104. package/src/sessions/store.test.ts +657 -29
  105. package/src/sessions/store.ts +286 -23
  106. package/src/test-setup.test.ts +31 -0
  107. package/src/test-setup.ts +28 -0
  108. package/src/types.ts +107 -2
  109. package/src/utils/pid.test.ts +85 -1
  110. package/src/utils/pid.ts +86 -1
  111. package/src/utils/process-scan.test.ts +53 -0
  112. package/src/utils/process-scan.ts +76 -0
  113. package/src/watchdog/daemon.test.ts +1607 -376
  114. package/src/watchdog/daemon.ts +462 -88
  115. package/src/watchdog/health.test.ts +282 -0
  116. package/src/watchdog/health.ts +126 -27
  117. package/src/worktree/manager.test.ts +218 -1
  118. package/src/worktree/manager.ts +55 -0
  119. package/src/worktree/process.test.ts +71 -0
  120. package/src/worktree/process.ts +25 -5
  121. package/src/worktree/tmux.test.ts +28 -0
  122. package/src/worktree/tmux.ts +27 -3
  123. package/templates/CLAUDE.md.tmpl +19 -8
  124. package/templates/overlay.md.tmpl +5 -2
@@ -0,0 +1,1383 @@
1
+ /**
2
+ * Per-turn engine for headless Claude Code agents (spawn-per-turn model).
3
+ *
4
+ * Owns a single agent turn end-to-end:
5
+ * - acquires per-agent serialization (in-process mutex + cross-process lease)
6
+ * - re-reads SessionStore under the lock so the prior `claudeSessionId` is fresh
7
+ * - spawns claude via the runtime's `buildDirectSpawn` (with `--resume` when available)
8
+ * - writes the user turn to a real stdin pipe and closes it (claude sees EOF)
9
+ * - drains `runtime.parseEvents` and tees events into events.db
10
+ * - captures the new session id via the parser's `onSessionId` hook
11
+ * - snapshots mail.db before spawn and detects the agent's capability-specific
12
+ * terminal mail (`worker_done` for builder/scout/reviewer/lead;
13
+ * `merged`/`merge_failed` for merger)
14
+ * - applies state-transition rules (booting → working, completed when done)
15
+ * - handles abort signals with SIGTERM → SIGKILL escalation
16
+ * - releases the lock on every exit path
17
+ *
18
+ * This module does NOT decide WHEN to run a turn. The mail injector and nudge
19
+ * command call `runTurn(opts)` when they have a user turn to deliver.
20
+ */
21
+
22
+ import { Database } from "bun:sqlite";
23
+ import { appendFileSync, existsSync } from "node:fs";
24
+ import { mkdir, unlink } from "node:fs/promises";
25
+ import { join } from "node:path";
26
+ import { extractFileScope } from "../commands/agents.ts";
27
+ import { AgentError } from "../errors.ts";
28
+ import { createEventStore } from "../events/store.ts";
29
+ import { filterToolArgs } from "../events/tool-filter.ts";
30
+ import { createMailStore, type MailStore } from "../mail/store.ts";
31
+ import type { AgentEvent, AgentRuntime, DirectSpawnOpts } from "../runtimes/types.ts";
32
+ import { createSessionStore } from "../sessions/store.ts";
33
+ import type {
34
+ AgentState,
35
+ EventStore,
36
+ EventType,
37
+ ResolvedModel,
38
+ WorkerDiedPayload,
39
+ } from "../types.ts";
40
+ import { terminalMailTypesFor } from "./capabilities.ts";
41
+ import { detectMailPollPattern } from "./mail-poll-detect.ts";
42
+ import {
43
+ type DetectScopeViolationOpts,
44
+ detectScopeViolation as defaultDetectScopeViolation,
45
+ IMPLEMENTATION_CAPABILITIES,
46
+ type ScopeViolationResult,
47
+ } from "./scope-detect.ts";
48
+ import { acquireTurnLock } from "./turn-lock.ts";
49
+
50
/**
 * Minimal child-process surface that `runTurn` relies on. Structurally
 * compatible with the object returned by `Bun.spawn`, so tests can hand in a
 * lightweight fake instead of a real process.
 */
export interface TurnSubprocess {
	/** Process id of the child. */
	readonly pid: number;
	/** Writable stdin handle. `end` and `flush` are optional on the handle. */
	readonly stdin: {
		write(data: string | Uint8Array): number | Promise<number> | unknown;
		end?(): void | Promise<void> | unknown;
		flush?(): unknown;
	};
	/** Child stdout as a stream of byte chunks. */
	readonly stdout: ReadableStream<Uint8Array>;
	/** Promise for the child's exit code; may resolve to `null`. */
	readonly exited: Promise<number | null>;
	/** Signal the child; `signal` may be given by number or by name. */
	kill(signal?: number | string): void;
}
62
+
63
/**
 * Signature of the function `runTurn` uses to start the child process.
 * Production supplies `Bun.spawn` cast to this type; tests inject a fake.
 * stdin and stdout are always piped; stderr may be piped, ignored, or sent to
 * a numeric file descriptor.
 */
export type TurnSpawnFn = (
	cmd: string[],
	options: {
		cwd: string;
		env: Record<string, string>;
		stdin: "pipe";
		stdout: "pipe";
		stderr: "pipe" | "ignore" | number;
	},
) => TurnSubprocess;
74
+
75
/**
 * Severity attached to an internal runner diagnostic. `error` is reserved for
 * contract violations; everything else is reported as `warn`.
 */
export type RunnerLogLevel = "warn" | "error";
77
+
78
/**
 * Sink for the runner's own diagnostics. It exists so that failures around
 * SessionStore writes and turn.pid I/O are reported instead of disappearing
 * into empty `catch {}` blocks (overstory-4af3). Tests can inject their own
 * sink to capture what was logged.
 */
export type RunnerLogger = (level: RunnerLogLevel, message: string, err?: unknown) => void;
84
+
85
/** Everything `runTurn` needs to execute one turn for one agent. */
export interface RunTurnOpts {
	/** Name of the agent whose turn is being run. */
	agentName: string;
	/**
	 * The worker's capability (builder/scout/reviewer/merger/lead). Passed to
	 * {@link terminalMailTypesFor} to determine which mail types count as the
	 * completion signal for this agent.
	 */
	capability: string;
	/** Overstory state directory; per-turn logs and per-agent state live beneath it. */
	overstoryDir: string;
	/** Working directory for the spawned process (exported as `OVERSTORY_WORKTREE_PATH`). */
	worktreePath: string;
	/** Exported to the child as `OVERSTORY_PROJECT_ROOT`. */
	projectRoot: string;
	/** Exported to the child as `OVERSTORY_TASK_ID`. */
	taskId: string;
	/**
	 * The user turn, already encoded as a stream-json envelope by
	 * `encodeUserTurn`. An empty string makes the call a no-op.
	 */
	userTurnNdjson: string;
	/** Runtime adapter; must implement `buildDirectSpawn` and `parseEvents`. */
	runtime: AgentRuntime;
	resolvedModel: ResolvedModel;
	runId: string | null;
	mailDbPath: string;
	eventsDbPath: string;
	sessionsDbPath: string;
	/** Test hook: spawn function to use instead of the `Bun.spawn` default. */
	_spawnFn?: TurnSpawnFn;
	/** Test hook: clock. */
	_now?: () => Date;
	/**
	 * Test hook: an already-open MailStore for the parent-notify path. In
	 * production the helper opens `mailDbPath` itself for the duration of the
	 * insert and then closes it. Tests pass a shared in-memory store so they
	 * can inspect inserted mail without reopening the DB file.
	 */
	_mailStore?: MailStore;
	/**
	 * Test hook: diagnostic sink for the runner. Without it, diagnostics are
	 * appended to `<turnLogDir>/runner.log` and echoed to `process.stderr`
	 * prefixed with `[turn-runner:<level>] <agent>:`.
	 */
	_logWarning?: RunnerLogger;
	/** Abort requested by an operator (e.g. `ov stop`). */
	abortSignal?: AbortSignal;
	/** Grace period between SIGTERM and SIGKILL when aborting. Default 2000ms. */
	sigkillDelayMs?: number;
	/**
	 * Stall watchdog for a turn in progress: the longest gap (ms) allowed
	 * between parser events. When exceeded, the runner aborts the turn with
	 * SIGTERM and escalates to SIGKILL after `sigkillDelayMs`. Every event from
	 * the runtime parser resets the timer. Default 600000ms (10 minutes), which
	 * leaves room for long tool calls while still bounding a hung claude
	 * (overstory-ddb3).
	 *
	 * `0` disables the watchdog (test injection / explicit opt-out only).
	 */
	eventStallTimeoutMs?: number;
	/**
	 * Minimum spacing (ms) between `session.lastActivity` refreshes while the
	 * parser loop is streaming events. Default `2000`. The watchdog's design,
	 * documented at `src/watchdog/health.ts:242-243`, is that "the turn-runner
	 * updates [lastActivity] on every parser event during a turn, and the
	 * watchdog refreshes it from events.db between turns". The runner therefore
	 * has to keep lastActivity moving itself; otherwise a long turn looks
	 * stalled and is zombified mid-flight (overstory-8e61).
	 *
	 * `0` refreshes on every event (test injection / explicit opt-out).
	 */
	lastActivityRefreshIntervalMs?: number;
	/**
	 * Test hook: called whenever the parser loop performs a mid-turn
	 * `lastActivity` refresh (after the throttle gate, before/after the
	 * SessionStore write). Lets tests count refresh attempts directly instead
	 * of inferring them from timestamps (overstory-8e61).
	 */
	_onLastActivityRefresh?: () => void;
	/**
	 * Test hook: substitute for the real `detectScopeViolation` in
	 * `scope-detect.ts`, so the scope-violation observability path can be
	 * exercised without spawning git (overstory-9f4d). The real implementation
	 * is used when omitted.
	 */
	_scopeDetect?: (opts: DetectScopeViolationOpts) => ScopeViolationResult;
}
163
+
164
/** Outcome of one `runTurn` call. */
export interface TurnResult {
	/** Exit code of the process, or `null` if the turn was aborted before it exited. */
	exitCode: number | null;
	/** Set only when the parser saw a `result` event whose `isError` was false. */
	cleanResult: boolean;
	/** Session id read from this turn's stream-json; it can differ from the prior one. */
	newSessionId: string | null;
	/** Set only when a prior session id was requested and the captured id differs from it. */
	resumeMismatch: boolean;
	/**
	 * Set only when the agent's capability-specific terminal mail showed up
	 * during the turn (`worker_done` for builder/scout/reviewer/lead,
	 * `merged`/`merge_failed` for merger).
	 */
	terminalMailObserved: boolean;
	/** Elapsed wall-clock time for the turn, in milliseconds. */
	durationMs: number;
	/** The AgentState found in SessionStore when the turn began. */
	initialState: AgentState;
	/** The AgentState produced by the transition rules and persisted on exit. */
	finalState: AgentState;
	/**
	 * Set only when the per-event stall watchdog fired: no parser event arrived
	 * within `eventStallTimeoutMs`, so the runner sent SIGTERM/SIGKILL
	 * (overstory-ddb3). For finalState this is handled like `aborted`
	 * (`zombie`).
	 */
	stallAborted: boolean;
	/**
	 * Set only when claude exited cleanly (`cleanResult` true) yet never sent
	 * the capability-specific terminal mail (overstory-6071). This is a
	 * contract violation: the turn finished without signalling completion. It
	 * is logged at `error` level through the runner diagnostic sink and
	 * surfaced here so callers can audit it.
	 */
	terminalMailMissing: boolean;
}
201
+
202
/**
 * Production spawn implementation: forwards to `Bun.spawn` and casts the
 * result to the structural `TurnSubprocess` shape the runner works against.
 */
const defaultSpawnFn: TurnSpawnFn = (cmd, options) => {
	const child = Bun.spawn(cmd, options);
	return child as unknown as TurnSubprocess;
};
204
+
205
/**
 * Parser event `type` values that have a dedicated events.db `EventType`.
 * A Map is used so that names inherited from `Object.prototype` (for example
 * "constructor") can never be mistaken for a mapping.
 */
const AGENT_EVENT_TYPE_MAP: ReadonlyMap<string, EventType> = new Map<string, EventType>([
	["tool_use", "tool_start"],
	["tool_result", "tool_end"],
	["status", "session_start"],
	["result", "result"],
	["error", "error"],
	["assistant_message", "progress"],
]);

/**
 * Translate a runtime parser event type into the `EventType` recorded in the
 * event store. Any type without a dedicated mapping is recorded as "custom".
 */
function mapAgentEventType(type: string): EventType {
	return AGENT_EVENT_TYPE_MAP.get(type) ?? "custom";
}
223
+
224
/**
 * Persist one parser event into the event store.
 *
 * The whole event is stored as JSON in `data` (or `null` if it cannot be
 * serialized). Only the columns that vary by event type are computed here:
 * - `tool_use`: tool name plus the filtered tool arguments
 * - `result`: rounded duration, and `error` level when `isError` is true
 * - anything else: `error` level only for `error` events
 */
function recordAgentEvent(
	eventStore: EventStore,
	agentName: string,
	runId: string | null,
	sessionId: string | null,
	event: AgentEvent,
): void {
	const eventType = mapAgentEventType(event.type);

	let data: string | null;
	try {
		data = JSON.stringify(event);
	} catch {
		// Unserializable payloads are still recorded, just without `data`.
		data = null;
	}

	let toolName: string | null = null;
	let toolArgs: string | null = null;
	let toolDurationMs: number | null = null;
	let level: "info" | "error" = "info";

	if (event.type === "tool_use") {
		toolName = typeof event.name === "string" ? event.name : null;
		const rawInput: Record<string, unknown> =
			typeof event.input === "object" && event.input !== null
				? (event.input as Record<string, unknown>)
				: {};
		// Arguments are only filtered (and stored) when a non-empty tool name is known.
		const filtered = toolName ? filterToolArgs(toolName, rawInput) : null;
		toolArgs = filtered ? JSON.stringify(filtered.args) : null;
	} else if (event.type === "result") {
		toolDurationMs = typeof event.durationMs === "number" ? Math.round(event.durationMs) : null;
		level = event.isError === true ? "error" : "info";
	} else {
		level = event.type === "error" ? "error" : "info";
	}

	eventStore.insert({
		runId,
		agentName,
		sessionId,
		eventType,
		toolName,
		toolArgs,
		toolDurationMs,
		level,
		data,
	});
}
287
+
288
/**
 * Report whether `agentName` has sent one of its capability's terminal mail
 * types (per `terminalMailTypesFor`) with `created_at` strictly after
 * `sinceTs`.
 *
 * Best-effort: returns false when the capability has no terminal mail types,
 * when the mail DB cannot be opened, or when the query fails for any reason.
 *
 * @param mailDbPath - Path to the mail SQLite database.
 * @param agentName - Sender to look for (`from_agent`).
 * @param capability - Capability used to select the terminal mail types.
 * @param sinceTs - Exclusive lower bound compared against `created_at`.
 * @returns true iff at least one matching row exists.
 */
function checkTerminalMailSince(
	mailDbPath: string,
	agentName: string,
	capability: string,
	sinceTs: string,
): boolean {
	const terminalTypes = terminalMailTypesFor(capability);
	if (terminalTypes.length === 0) return false;

	let db: Database;
	try {
		db = new Database(mailDbPath);
	} catch {
		return false;
	}
	try {
		db.exec("PRAGMA busy_timeout = 5000");
		// One named placeholder per terminal type: $t0, $t1, ...
		const params: Record<string, string> = { $a: agentName, $ts: sinceTs };
		const placeholders = terminalTypes
			.map((mailType, i) => {
				const key = `$t${i}`;
				params[key] = mailType;
				return key;
			})
			.join(",");
		const sql = `SELECT 1 AS c FROM messages WHERE from_agent = $a AND type IN (${placeholders}) AND created_at > $ts LIMIT 1`;
		const row = db.prepare<{ c: number }, Record<string, string>>(sql).get(params);
		// "No row" must be recognised whether the driver reports it as null or
		// as undefined; otherwise an undefined result would be read as a
		// (false) terminal-mail hit.
		return row !== null && row !== undefined;
	} catch {
		return false;
	} finally {
		try {
			db.close();
		} catch {
			// best-effort
		}
	}
}
324
+
325
/**
 * Check whether the agent has ever sent a mail whose subject starts with the
 * literal prefix `scope_expansion` (overstory-9f4d). When such a mail exists
 * the runner suppresses the soft scope-violation warning, because the lead
 * has already been informed.
 *
 * Soft signal: every failure (DB unavailable, missing table, etc.) yields
 * false so that observability can never break the runner.
 *
 * @param mailDbPath - Path to the mail SQLite database.
 * @param agentName - Sender to look for (`from_agent`).
 * @returns true iff at least one matching mail exists.
 */
function hasScopeExpansionMail(mailDbPath: string, agentName: string): boolean {
	let db: Database;
	try {
		db = new Database(mailDbPath);
	} catch {
		return false;
	}
	try {
		db.exec("PRAGMA busy_timeout = 5000");
		// `_` is a single-character wildcard in LIKE, so it is escaped here to
		// match the literal "scope_expansion" prefix only (not e.g.
		// "scope-expansion" or "scopeXexpansion").
		const stmt = db.prepare<{ c: number }, { $a: string }>(
			"SELECT 1 AS c FROM messages WHERE from_agent = $a AND subject LIKE 'scope\\_expansion%' ESCAPE '\\' LIMIT 1",
		);
		const row = stmt.get({ $a: agentName });
		// Treat both null and undefined as "no row".
		return row !== null && row !== undefined;
	} catch {
		return false;
	} finally {
		try {
			db.close();
		} catch {
			// best-effort
		}
	}
}
356
+
357
/**
 * Newest `created_at` among the terminal mails already sent by `agentName`,
 * where "terminal" means the mail types `terminalMailTypesFor(capability)`
 * returns.
 *
 * Yields `null` when the capability has no terminal mail types, when the
 * agent has no such mail yet, or when the mail DB cannot be read. The runner
 * takes this value as the snapshot baseline for the upcoming turn: a terminal
 * mail with `created_at > snapshot` is attributed to the spawn about to
 * start. Using the real prior timestamp rather than `now()` closes a
 * misattribution window, in which a prior-turn `worker_done` landing between
 * baseline capture and spawn would wrongly satisfy the "terminal mail
 * observed" check (overstory-088b C1).
 */
function latestTerminalMailTs(
	mailDbPath: string,
	agentName: string,
	capability: string,
): string | null {
	const terminalTypes = terminalMailTypesFor(capability);
	if (terminalTypes.length === 0) return null;

	let db: Database;
	try {
		db = new Database(mailDbPath);
	} catch {
		return null;
	}
	try {
		db.exec("PRAGMA busy_timeout = 5000");
		// One named placeholder per terminal type: $t0, $t1, ...
		const params: Record<string, string> = { $a: agentName };
		const placeholders = terminalTypes
			.map((mailType, i) => {
				const key = `$t${i}`;
				params[key] = mailType;
				return key;
			})
			.join(",");
		const sql = `SELECT MAX(created_at) AS ts FROM messages WHERE from_agent = $a AND type IN (${placeholders})`;
		const latest = db.prepare<{ ts: string | null }, Record<string, string>>(sql).get(params);
		return latest?.ts ?? null;
	} catch {
		return null;
	} finally {
		try {
			db.close();
		} catch {
			// best-effort
		}
	}
}
404
+
405
/**
 * Insert a synthetic `worker_died` mail addressed to the parent of a session
 * whose turn finished without the capability's terminal mail. This is the
 * in-band (runner-side) counterpart of the watchdog's `notifyParentOfDeath`
 * (overstory-c111) and covers two situations:
 *
 * - **Aborted / stalled** (zombie): the subprocess was killed by operator
 *   `ov stop` or by the parser-stall watchdog before the agent could send
 *   `worker_done`/`merged` (overstory-c772).
 * - **terminalMailMissing**: claude exited cleanly but never sent the
 *   terminal mail, i.e. the silent-no-op path (overstory-4159).
 *
 * Without the notification the lead would wait indefinitely for a terminal
 * mail that is never coming. Double-firing is avoided by the watchdog's
 * pre-tick state-snapshot dedup (mx-b0e54b): a later tick on the now-zombie
 * session sees `stateBeforeTick === "zombie"` and skips its own notify.
 *
 * Fire-and-forget: failures are reported through `runnerLog` and never
 * propagate, because sending mail must not break the turn. When
 * `ctx.mailStore` is `null`, a store is opened on `ctx.mailDbPath` for the
 * insert and closed afterwards; a caller-supplied store is left open.
 */
function notifyParentOfRunnerDeath(ctx: {
	mailStore: MailStore | null;
	mailDbPath: string;
	parentAgent: string;
	agentName: string;
	capability: string;
	taskId: string;
	reason: string;
	lastActivity: string;
	runnerLog: RunnerLogger;
}): void {
	const diedPayload: WorkerDiedPayload = {
		agentName: ctx.agentName,
		capability: ctx.capability,
		taskId: ctx.taskId,
		reason: ctx.reason,
		lastActivity: ctx.lastActivity,
		terminatedBy: "runner",
	};
	const mailSubject = `[RUNNER] worker_died: ${ctx.agentName}`;
	const mailBody = [
		`Worker "${ctx.agentName}" (${ctx.capability}) on task ${ctx.taskId} ended without `,
		`sending its terminal mail. Reason: ${ctx.reason}. Last activity: ${ctx.lastActivity}. `,
		`Decide whether to retry the work, escalate, or report the failure upstream.`,
	].join("");

	// Reuse the caller's store when given; otherwise open one we must close.
	let store: MailStore | null = ctx.mailStore;
	const ownsStore = store === null;
	if (store === null) {
		try {
			store = createMailStore(ctx.mailDbPath);
		} catch (err) {
			ctx.runnerLog("warn", "failed to open mail store for parent notify", err);
			return;
		}
	}

	try {
		store.insert({
			id: "",
			from: ctx.agentName,
			to: ctx.parentAgent,
			subject: mailSubject,
			body: mailBody,
			type: "worker_died",
			priority: "high",
			threadId: null,
			payload: JSON.stringify(diedPayload),
		});
	} catch (err) {
		ctx.runnerLog("warn", "failed to send worker_died mail to parent", err);
	} finally {
		if (ownsStore) {
			try {
				store.close();
			} catch {
				// best-effort
			}
		}
	}
}
496
+
497
/**
 * Attempt a guarded state transition on behalf of the turn runner. The write
 * goes through the SessionStore CAS (`tryTransitionState`), so a concurrent
 * writer (`ov stop` writing `completed`, the watchdog writing `zombie`) is
 * never silently overwritten by the runner's end-of-turn settle to
 * working/completed/zombie.
 *
 * A rejected transition is not fatal: the CAS keeps whatever the conflicting
 * writer stored, which is the right outcome for this race (overstory-a993).
 *
 * @param onError - Diagnostic callback for database/IO failures.
 * @param onRejected - Diagnostic callback for a CAS rejection reported as
 *   `illegal_transition`; receives the previous and the attempted state.
 * @returns true iff the transition landed.
 */
function updateSessionState(
	sessionsDbPath: string,
	agentName: string,
	state: AgentState,
	onError?: (err: unknown) => void,
	onRejected?: (prev: AgentState, attempted: AgentState) => void,
): boolean {
	try {
		const sessions = createSessionStore(sessionsDbPath);
		let landed = false;
		try {
			const result = sessions.tryTransitionState(agentName, state);
			if (result.ok) {
				landed = true;
			} else if (result.reason === "illegal_transition") {
				onRejected?.(result.prev, result.attempted);
			}
		} finally {
			// Always close; a failing close is reported through onError below.
			sessions.close();
		}
		return landed;
	} catch (err) {
		onError?.(err);
		return false;
	}
}
537
+
538
/**
 * Refresh the agent's `lastActivity` via a short-lived SessionStore.
 *
 * @param onError - Diagnostic callback invoked with any error raised while
 *   opening, updating, or closing the store.
 * @returns true when the update and close both succeeded, false otherwise.
 */
function updateSessionLastActivity(
	sessionsDbPath: string,
	agentName: string,
	onError?: (err: unknown) => void,
): boolean {
	let succeeded = false;
	try {
		const sessions = createSessionStore(sessionsDbPath);
		try {
			sessions.updateLastActivity(agentName);
		} finally {
			sessions.close();
		}
		succeeded = true;
	} catch (err) {
		onError?.(err);
	}
	return succeeded;
}
556
+
557
/**
 * Store `sessionId` as the agent's claude session id via a short-lived
 * SessionStore.
 *
 * @param onError - Diagnostic callback invoked with any error raised while
 *   opening, updating, or closing the store.
 * @returns true when the update and close both succeeded, false otherwise.
 */
function updateSessionClaudeId(
	sessionsDbPath: string,
	agentName: string,
	sessionId: string,
	onError?: (err: unknown) => void,
): boolean {
	let succeeded = false;
	try {
		const sessions = createSessionStore(sessionsDbPath);
		try {
			sessions.updateClaudeSessionId(agentName, sessionId);
		} finally {
			sessions.close();
		}
		succeeded = true;
	} catch (err) {
		onError?.(err);
	}
	return succeeded;
}
576
+
577
/**
 * Create the default sink for runner diagnostics.
 *
 * Each call of the returned logger:
 * 1. appends `<iso-timestamp> [<level>] <message><detail>` to `runnerLogPath`
 *    (skipped when the path is null or empty), using a synchronous append;
 * 2. mirrors `[turn-runner:<level>] <agent>: <message><detail>` to
 *    `process.stderr`.
 *
 * `<detail>` is `: <err.message>` for Error instances, `: <String(err)>` for
 * any other non-nullish value, and empty otherwise. Write failures in either
 * destination are swallowed, since diagnostics must never break the turn.
 */
function defaultRunnerLogger(agentName: string, runnerLogPath: string | null): RunnerLogger {
	return (level, message, err) => {
		const stamp = new Date().toISOString();

		let detail = "";
		if (err instanceof Error) {
			detail = `: ${err.message}`;
		} else if (err !== undefined && err !== null) {
			detail = `: ${String(err)}`;
		}

		if (runnerLogPath) {
			try {
				appendFileSync(runnerLogPath, `${stamp} [${level}] ${message}${detail}\n`);
			} catch {
				// best-effort; the stderr mirror below still surfaces the warning
			}
		}

		try {
			process.stderr.write(`[turn-runner:${level}] ${agentName}: ${message}${detail}\n`);
		} catch {
			// nothing to do if stderr is unwritable
		}
	};
}
607
+
608
/**
 * Copy every non-empty chunk of `stream` into `writer`, then end the writer.
 *
 * Strictly best-effort: a failing `write` is ignored per chunk, a failing
 * read stops the copy silently, and a failing `end` is ignored. When `stream`
 * is missing (or has no `getReader`), the writer is simply ended. Results of
 * `write`/`end` are awaited only when they are thenables.
 */
async function teeStreamToWriter(
	stream: ReadableStream<Uint8Array> | null | undefined,
	writer: { write(data: Uint8Array): unknown; end(): unknown },
): Promise<void> {
	// Await `result` only if it looks like a promise.
	const settle = async (result: unknown): Promise<void> => {
		if (result && typeof (result as Promise<unknown>).then === "function") {
			await result;
		}
	};
	const endWriter = async (): Promise<void> => {
		try {
			await settle(writer.end());
		} catch {
			// ignore
		}
	};

	if (!stream || typeof stream.getReader !== "function") {
		await endWriter();
		return;
	}

	const reader = stream.getReader();
	try {
		for (;;) {
			const chunk = await reader.read();
			if (chunk.done) break;
			const bytes = chunk.value;
			if (!bytes || bytes.byteLength === 0) continue;
			try {
				await settle(writer.write(bytes));
			} catch {
				// best-effort; do not block the parser on log file writes
			}
		}
	} catch {
		// ignore stream errors
	} finally {
		try {
			reader.releaseLock();
		} catch {
			// ignore
		}
		await endWriter();
	}
}
654
+
655
+ /**
656
+ * Spawn one turn for an agent and drive it to completion.
657
+ *
658
+ * Always releases the per-agent lock, including on thrown errors.
659
+ */
660
+ export async function runTurn(opts: RunTurnOpts): Promise<TurnResult> {
661
+ const {
662
+ agentName,
663
+ capability,
664
+ overstoryDir,
665
+ worktreePath,
666
+ projectRoot,
667
+ taskId,
668
+ userTurnNdjson,
669
+ runtime,
670
+ resolvedModel,
671
+ runId,
672
+ mailDbPath,
673
+ eventsDbPath,
674
+ sessionsDbPath,
675
+ } = opts;
676
+
677
+ if (!runtime.buildDirectSpawn) {
678
+ throw new AgentError(
679
+ `Runtime "${runtime.id}" does not support buildDirectSpawn; cannot use spawn-per-turn`,
680
+ { agentName },
681
+ );
682
+ }
683
+ if (!runtime.parseEvents) {
684
+ throw new AgentError(
685
+ `Runtime "${runtime.id}" does not support parseEvents; cannot use spawn-per-turn`,
686
+ { agentName },
687
+ );
688
+ }
689
+
690
+ const spawnFn = opts._spawnFn ?? defaultSpawnFn;
691
+ const now = opts._now ?? (() => new Date());
692
+ const sigkillDelayMs = opts.sigkillDelayMs ?? 2000;
693
+
694
+ // Pre-lock peek so the empty-input path can short-circuit without
695
+ // paying the lock cost or transitioning state.
696
+ let preInitialState: AgentState = "booting";
697
+ try {
698
+ const preStore = createSessionStore(sessionsDbPath);
699
+ try {
700
+ const session = preStore.getByName(agentName);
701
+ if (session) preInitialState = session.state;
702
+ } finally {
703
+ preStore.close();
704
+ }
705
+ } catch {
706
+ // non-fatal — fall back to "booting"
707
+ }
708
+
709
+ if (userTurnNdjson === "") {
710
+ return {
711
+ exitCode: null,
712
+ cleanResult: false,
713
+ newSessionId: null,
714
+ resumeMismatch: false,
715
+ terminalMailObserved: false,
716
+ durationMs: 0,
717
+ initialState: preInitialState,
718
+ finalState: preInitialState,
719
+ stallAborted: false,
720
+ terminalMailMissing: false,
721
+ };
722
+ }
723
+
724
+ const lock = await acquireTurnLock({ agentName, overstoryDir });
725
+ const startedAtMs = now().getTime();
726
+ let initialState: AgentState = preInitialState;
727
+ let priorSessionId: string | null = null;
728
+ let parentAgent: string | null = null;
729
+ let sessionLastActivity: string | null = null;
730
+ let turnPidPath: string | null = null;
731
+ // Per-turn diagnostic sink. Bound after the turn log dir is created;
732
+ // pre-creation failures (rare — only the lock-held SessionStore re-read)
733
+ // remain silent because the file path doesn't exist yet.
734
+ let runnerLog: RunnerLogger = opts._logWarning ?? defaultRunnerLogger(agentName, null);
735
+
736
+ try {
737
+ // Re-read session under the lock — the value passed to the caller may be
738
+ // stale if another process just updated it.
739
+ try {
740
+ const store = createSessionStore(sessionsDbPath);
741
+ try {
742
+ const session = store.getByName(agentName);
743
+ if (session) {
744
+ initialState = session.state;
745
+ priorSessionId = session.claudeSessionId ?? null;
746
+ parentAgent = session.parentAgent ?? null;
747
+ sessionLastActivity = session.lastActivity ?? null;
748
+ }
749
+ } finally {
750
+ store.close();
751
+ }
752
+ } catch {
753
+ // non-fatal — fall back to pre-lock peek
754
+ }
755
+
756
+ const directEnv: Record<string, string> = {
757
+ ...runtime.buildEnv(resolvedModel),
758
+ OVERSTORY_AGENT_NAME: agentName,
759
+ OVERSTORY_WORKTREE_PATH: worktreePath,
760
+ OVERSTORY_TASK_ID: taskId,
761
+ OVERSTORY_PROJECT_ROOT: projectRoot,
762
+ };
763
+ const spawnEnv: Record<string, string> = {
764
+ ...(process.env as Record<string, string>),
765
+ ...directEnv,
766
+ };
767
+
768
+ const directOpts: DirectSpawnOpts = {
769
+ cwd: worktreePath,
770
+ env: directEnv,
771
+ ...(resolvedModel.isExplicitOverride ? { model: resolvedModel.model } : {}),
772
+ instructionPath: runtime.instructionPath,
773
+ resumeSessionId: priorSessionId,
774
+ };
775
+ const argv = runtime.buildDirectSpawn(directOpts);
776
+
777
+ const logTimestamp = now().toISOString().replace(/[:.]/g, "-");
778
+ const turnLogDir = join(overstoryDir, "logs", agentName, logTimestamp);
779
+ await mkdir(turnLogDir, { recursive: true });
780
+ const stderrPath = join(turnLogDir, "stderr.log");
781
+ const stderrWriter = Bun.file(stderrPath).writer();
782
+
783
+ // Bind the runner-diagnostic sink now that the per-turn log dir exists.
784
+ // Subsequent silent-failure paths (SessionStore writes, turn.pid I/O)
785
+ // route through `runnerLog` so future leaks/contract violations are
786
+ // diagnosable (overstory-4af3).
787
+ const runnerLogPath = join(turnLogDir, "runner.log");
788
+ runnerLog = opts._logWarning ?? defaultRunnerLogger(agentName, runnerLogPath);
789
+
790
+ // Per-agent state dir (shared with applied-records.json, identity.yaml).
791
+ // Holds turn.pid while a turn is in flight so other processes (`ov stop`,
792
+ // watchdog) can find and signal the live claude PID.
793
+ const agentStateDir = join(overstoryDir, "agents", agentName);
794
+ await mkdir(agentStateDir, { recursive: true });
795
+ turnPidPath = join(agentStateDir, "turn.pid");
796
+
797
+ // Snapshot the terminal-mail baseline at the latest prior terminal mail
798
+ // (`worker_done`/`result` for task workers, `merged`/`merge_failed` for
799
+ // merger). Querying the actual prior timestamp — rather than wall-clock
800
+ // `now()` — closes the misattribution window where a prior turn's
801
+ // terminal mail lands between baseline capture and spawn (overstory-088b
802
+ // C1). Falls back to epoch when no prior terminal mail exists, so the
803
+ // first terminal mail of the agent's lifetime is attributed to this turn.
804
+ const snapshotTs =
805
+ latestTerminalMailTs(mailDbPath, agentName, capability) ?? new Date(0).toISOString();
806
+
807
+ // Spawn. Failures here propagate after the finally below releases the lock.
808
+ let proc: TurnSubprocess;
809
+ try {
810
+ proc = spawnFn(argv, {
811
+ cwd: worktreePath,
812
+ env: spawnEnv,
813
+ stdin: "pipe",
814
+ stdout: "pipe",
815
+ stderr: "pipe",
816
+ });
817
+ } catch (err) {
818
+ try {
819
+ await stderrWriter.end();
820
+ } catch {
821
+ // ignore
822
+ }
823
+ throw err;
824
+ }
825
+
826
+ // Publish the live claude PID so other processes (`ov stop`, watchdog) can
827
+ // find and signal it. turn.pid is the cross-process kill primitive for
828
+ // headless task-scoped agents — without it, `ov stop` reads null and
829
+ // silently degrades (overstory-62a6). Treat write failure as a contract
830
+ // violation (symmetric with the cleanup-side assertion that turn.pid must
831
+ // not survive the runner): SIGKILL the just-spawned subprocess and abort
832
+ // the turn so the operator sees the failure instead of a half-broken
833
+ // agent that cannot be killed.
834
+ try {
835
+ await Bun.write(turnPidPath, `${proc.pid}\n`);
836
+ } catch (err) {
837
+ runnerLog(
838
+ "error",
839
+ `failed to write turn.pid at ${turnPidPath} — kill primitive unavailable, aborting turn`,
840
+ err,
841
+ );
842
+ try {
843
+ proc.kill("SIGKILL");
844
+ } catch {
845
+ // process may have already exited
846
+ }
847
+ try {
848
+ await stderrWriter.end();
849
+ } catch {
850
+ // ignore
851
+ }
852
+ throw new AgentError(
853
+ `failed to write turn.pid at ${turnPidPath}: ${err instanceof Error ? err.message : String(err)}`,
854
+ { agentName, ...(err instanceof Error ? { cause: err } : {}) },
855
+ );
856
+ }
857
+
858
+ // Tee stderr stream into the per-turn stderr.log without blocking the parser.
859
+ const stderrStream = (proc as unknown as { stderr?: ReadableStream<Uint8Array> | null }).stderr;
860
+ const stderrTeePromise = teeStreamToWriter(stderrStream, {
861
+ write: (data) => stderrWriter.write(data),
862
+ end: () => stderrWriter.end(),
863
+ });
864
+
865
+ // Write the user turn and close stdin so claude sees EOF.
866
+ try {
867
+ const writeRes = proc.stdin.write(userTurnNdjson);
868
+ if (writeRes && typeof (writeRes as Promise<unknown>).then === "function") {
869
+ await writeRes;
870
+ }
871
+ if (typeof proc.stdin.end === "function") {
872
+ const endRes = proc.stdin.end();
873
+ if (endRes && typeof (endRes as Promise<unknown>).then === "function") {
874
+ await endRes;
875
+ }
876
+ }
877
+ } catch (err) {
878
+ try {
879
+ proc.kill();
880
+ } catch {
881
+ // ignore
882
+ }
883
+ throw err;
884
+ }
885
+
886
+ // Abort wiring — SIGTERM, then SIGKILL after sigkillDelayMs.
887
+ let aborted = false;
888
+ let sigkillTimer: ReturnType<typeof setTimeout> | null = null;
889
+ const onAbort = (): void => {
890
+ if (aborted) return;
891
+ aborted = true;
892
+ try {
893
+ proc.kill("SIGTERM");
894
+ } catch {
895
+ // process may have already exited
896
+ }
897
+ sigkillTimer = setTimeout(() => {
898
+ try {
899
+ proc.kill("SIGKILL");
900
+ } catch {
901
+ // ignore
902
+ }
903
+ }, sigkillDelayMs);
904
+ (sigkillTimer as { unref?: () => void }).unref?.();
905
+ };
906
+ if (opts.abortSignal) {
907
+ if (opts.abortSignal.aborted) {
908
+ onAbort();
909
+ } else {
910
+ opts.abortSignal.addEventListener("abort", onAbort, { once: true });
911
+ }
912
+ }
913
+
914
+ // Drain parser, capture session id, tee events into events.db.
915
+ const eventStore = createEventStore(eventsDbPath);
916
+ let newSessionId: string | null = null;
917
+ let cleanResult = false;
918
+ let observedAnyEvent = false;
919
+ // True iff this turn fired the "first parser event" transition into
920
+ // `in_turn`. Replaces the legacy `bootedToWorking` flag; the trigger
921
+ // now fires from booting OR between_turns (legacy `working` rows are left alone)
922
+ // so a resumed spawn-per-turn agent flips back to `in_turn` at the
923
+ // start of every batch (overstory-3087).
924
+ let transitionedToInTurn = false;
925
+
926
+ // Stall watchdog (overstory-ddb3): if no parser event arrives for
927
+ // `eventStallTimeoutMs`, abort the turn via SIGTERM/SIGKILL. Otherwise a
928
+ // hung claude (Anthropic API stall, deadlock) hangs the runner forever.
929
+ const eventStallTimeoutMs = opts.eventStallTimeoutMs ?? 600_000;
930
+ let stallAborted = false;
931
+ let stallTimer: ReturnType<typeof setTimeout> | null = null;
932
+ let stallSigkillTimer: ReturnType<typeof setTimeout> | null = null;
933
+ const clearStallTimer = (): void => {
934
+ if (stallTimer) {
935
+ clearTimeout(stallTimer);
936
+ stallTimer = null;
937
+ }
938
+ };
939
+ const armStallTimer = (): void => {
940
+ if (eventStallTimeoutMs <= 0) return;
941
+ clearStallTimer();
942
+ stallTimer = setTimeout(() => {
943
+ if (aborted || stallAborted) return;
944
+ stallAborted = true;
945
+ runnerLog(
946
+ "error",
947
+ `parser stalled: no event for ${eventStallTimeoutMs}ms — aborting via SIGTERM`,
948
+ );
949
+ try {
950
+ proc.kill("SIGTERM");
951
+ } catch {
952
+ // process may have already exited
953
+ }
954
+ stallSigkillTimer = setTimeout(() => {
955
+ try {
956
+ proc.kill("SIGKILL");
957
+ } catch {
958
+ // ignore
959
+ }
960
+ }, sigkillDelayMs);
961
+ (stallSigkillTimer as { unref?: () => void }).unref?.();
962
+ }, eventStallTimeoutMs);
963
+ (stallTimer as { unref?: () => void }).unref?.();
964
+ };
965
+
966
+ // `AgentRuntime.parseEvents` is declared as a 1-param method, but the Claude
967
+ // adapter accepts an `onSessionId` hook. Widen the call site so we can pass
968
+ // the hook without depending on adapter-specific types.
969
+ type ParseEventsWithOpts = (
970
+ stream: ReadableStream<Uint8Array>,
971
+ opts?: { onSessionId?: (sid: string) => void },
972
+ ) => AsyncIterable<AgentEvent>;
973
+ const parseEvents = runtime.parseEvents as unknown as ParseEventsWithOpts;
974
+
975
+ // Arm before iteration so a process that never emits also gets caught.
976
+ armStallTimer();
977
+
978
+ try {
979
+ const parser = parseEvents(proc.stdout, {
980
+ onSessionId: (sid: string) => {
981
+ newSessionId = sid;
982
+ updateSessionClaudeId(sessionsDbPath, agentName, sid, (err) =>
983
+ runnerLog("warn", "failed to persist claudeSessionId", err),
984
+ );
985
+ // Resume mismatch (overstory-088b C2): the runtime returned a
986
+ // different session id than the one we asked it to resume.
987
+ // `--resume` is best-effort — claude can decide to start a fresh
988
+ // session if it cannot rehydrate the requested one. Surface a
989
+ // structured warning event so observability mirrors the runner
990
+ // diagnostic and downstream tooling can detect the mismatch.
991
+ if (priorSessionId !== null && sid !== priorSessionId) {
992
+ try {
993
+ eventStore.insert({
994
+ runId,
995
+ agentName,
996
+ sessionId: sid,
997
+ eventType: "custom",
998
+ toolName: null,
999
+ toolArgs: null,
1000
+ toolDurationMs: null,
1001
+ level: "warn",
1002
+ data: JSON.stringify({
1003
+ type: "resume_mismatch",
1004
+ requestedSessionId: priorSessionId,
1005
+ observedSessionId: sid,
1006
+ }),
1007
+ });
1008
+ } catch {
1009
+ // non-fatal — observability must not break the turn
1010
+ }
1011
+ runnerLog(
1012
+ "warn",
1013
+ `resume mismatch: requested ${priorSessionId} but runtime returned ${sid}`,
1014
+ );
1015
+ }
1016
+ },
1017
+ });
1018
+
1019
+ // Mid-turn `lastActivity` refresh (overstory-8e61). The watchdog at
1020
+ // `src/watchdog/health.ts:242-243` documents that the runner advances
1021
+ // lastActivity per parser event; without this the row stayed at
1022
+ // `startedAt` for the whole turn and long turns got zombified live.
1023
+ const lastActivityRefreshIntervalMs = opts.lastActivityRefreshIntervalMs ?? 2000;
1024
+ let lastActivityRefreshMs = 0; // first event always refreshes
1025
+
1026
+ for await (const event of parser) {
1027
+ armStallTimer();
1028
+ observedAnyEvent = true;
1029
+
1030
+ // Keep `session.lastActivity` advancing while events flow so the
1031
+ // watchdog does not zombify a live agent mid-turn — see
1032
+ // `src/watchdog/health.ts:242-243` and overstory-8e61.
1033
+ const nowMs = now().getTime();
1034
+ if (nowMs - lastActivityRefreshMs >= lastActivityRefreshIntervalMs) {
1035
+ lastActivityRefreshMs = nowMs;
1036
+ updateSessionLastActivity(sessionsDbPath, agentName, (err) =>
1037
+ runnerLog("warn", "failed to refresh lastActivity mid-turn", err),
1038
+ );
1039
+ opts._onLastActivityRefresh?.();
1040
+ }
1041
+
1042
+ // First parser event of a turn → settle into `in_turn`. Allowed
1043
+ // predecessors are `booting` (initial dispatch), `between_turns`
1044
+ // (next mail batch on a healthy worker), or already-`in_turn`
1045
+ // (idempotent — covers the case where a prior turn somehow left
1046
+ // the row at in_turn). Legacy `working` rows are intentionally
1047
+ // not in the matrix predecessor set (overstory-3087): spawn-
1048
+ // per-turn workers should not flow through `working`, so the
1049
+ // matrix keeps the substate path disjoint and a stale `working`
1050
+ // row is left alone rather than silently coerced.
1051
+ if (
1052
+ !transitionedToInTurn &&
1053
+ (initialState === "booting" || initialState === "between_turns")
1054
+ ) {
1055
+ transitionedToInTurn = true;
1056
+ updateSessionState(
1057
+ sessionsDbPath,
1058
+ agentName,
1059
+ "in_turn",
1060
+ (err) => runnerLog("warn", `failed to transition ${initialState} → in_turn`, err),
1061
+ (prev, attempted) =>
1062
+ runnerLog(
1063
+ "warn",
1064
+ `${initialState} → in_turn rejected: state is now ${prev} (attempted ${attempted})`,
1065
+ ),
1066
+ );
1067
+ }
1068
+
1069
+ if (event.type === "result") {
1070
+ cleanResult = event.isError !== true;
1071
+ }
1072
+
1073
+ // Defense-in-depth (overstory-c92c): detect Bash mail-poll patterns
1074
+ // the lead.md prompt forbids (overstory-fa84). Warn-only — emit a
1075
+ // custom event before the original tool_use so observability tools
1076
+ // see the warning ahead of the offending call. Wrapped in try/catch
1077
+ // so detection failure cannot break the turn.
1078
+ if (event.type === "tool_use" && event.name === "Bash") {
1079
+ try {
1080
+ const input =
1081
+ typeof event.input === "object" && event.input !== null
1082
+ ? (event.input as Record<string, unknown>)
1083
+ : null;
1084
+ const command = input?.command;
1085
+ const detection = detectMailPollPattern(command);
1086
+ if (detection.matched) {
1087
+ const cmdStr = typeof command === "string" ? command : "";
1088
+ const truncated = cmdStr.length > 200 ? `${cmdStr.slice(0, 200)}…` : cmdStr;
1089
+ runnerLog(
1090
+ "warn",
1091
+ `detected mail-poll pattern in Bash command (${detection.reason}): ${truncated}`,
1092
+ );
1093
+ try {
1094
+ eventStore.insert({
1095
+ runId,
1096
+ agentName,
1097
+ sessionId: newSessionId,
1098
+ eventType: "custom",
1099
+ toolName: null,
1100
+ toolArgs: null,
1101
+ toolDurationMs: null,
1102
+ level: "warn",
1103
+ data: JSON.stringify({
1104
+ type: "mail_poll_detected",
1105
+ reason: detection.reason,
1106
+ command: cmdStr,
1107
+ }),
1108
+ });
1109
+ } catch (insertErr) {
1110
+ runnerLog("warn", "failed to insert mail_poll_detected event", insertErr);
1111
+ }
1112
+ }
1113
+ } catch (detectErr) {
1114
+ runnerLog("warn", "mail-poll detector threw", detectErr);
1115
+ }
1116
+ }
1117
+
1118
+ try {
1119
+ recordAgentEvent(eventStore, agentName, runId, newSessionId, event);
1120
+ } catch {
1121
+ // non-fatal — observability must not break the turn
1122
+ }
1123
+ }
1124
+ } catch (err) {
1125
+ // Parser iteration threw (malformed stream-json, decoder error, etc.).
1126
+ // The subprocess is still running and would orphan past lock.release()
1127
+ // if we just propagated the error (overstory-088b C3). Send SIGKILL so
1128
+ // it cannot keep producing output or holding resources, then rethrow
1129
+ // for the outer finally to clean up turn.pid and release the lock.
1130
+ runnerLog("error", "parser iteration threw — killing subprocess to avoid orphan", err);
1131
+ try {
1132
+ proc.kill("SIGKILL");
1133
+ } catch {
1134
+ // process may have already exited
1135
+ }
1136
+ throw err;
1137
+ } finally {
1138
+ clearStallTimer();
1139
+ if (stallSigkillTimer) {
1140
+ clearTimeout(stallSigkillTimer);
1141
+ stallSigkillTimer = null;
1142
+ }
1143
+ try {
1144
+ eventStore.close();
1145
+ } catch {
1146
+ // ignore
1147
+ }
1148
+ }
1149
+
1150
+ let exitCode: number | null;
1151
+ try {
1152
+ exitCode = await proc.exited;
1153
+ } catch (err) {
1154
+ runnerLog("warn", "proc.exited rejected", err);
1155
+ exitCode = null;
1156
+ }
1157
+ if (sigkillTimer) {
1158
+ clearTimeout(sigkillTimer);
1159
+ sigkillTimer = null;
1160
+ }
1161
+ if (opts.abortSignal && !opts.abortSignal.aborted) {
1162
+ opts.abortSignal.removeEventListener("abort", onAbort);
1163
+ }
1164
+ if (aborted || stallAborted) {
1165
+ exitCode = null;
1166
+ }
1167
+
1168
+ // Wait for stderr drain so the log file isn't truncated mid-write.
1169
+ try {
1170
+ await stderrTeePromise;
1171
+ } catch {
1172
+ // best-effort
1173
+ }
1174
+
1175
+ const terminalMailObserved = checkTerminalMailSince(
1176
+ mailDbPath,
1177
+ agentName,
1178
+ capability,
1179
+ snapshotTs,
1180
+ );
1181
+
1182
+ // Soft scope-violation observability (overstory-9f4d). Builders sometimes
1183
+ // expand beyond their declared FILE_SCOPE; the lead needs a way to spot it
1184
+ // during merge verification. Surface a warn-level event into events.db
1185
+ // when the worker's modified files exceed FILE_SCOPE without an
1186
+ // `expansion_reason:` justification (commit body OR prior scope_expansion
1187
+ // mail). This is advisory — never aborts the turn, never blocks the
1188
+ // completed transition. All errors are swallowed.
1189
+ //
1190
+ // TODO: baseRef is hard-coded to "main"; a future improvement could
1191
+ // resolve the actual session-branch.txt for projects whose canonical
1192
+ // branch differs.
1193
+ if (terminalMailObserved && IMPLEMENTATION_CAPABILITIES.has(capability)) {
1194
+ try {
1195
+ const fileScope = await extractFileScope(worktreePath, runtime.instructionPath);
1196
+ if (fileScope.length > 0) {
1197
+ const detectFn = opts._scopeDetect ?? defaultDetectScopeViolation;
1198
+ const { violations, expansionReasons } = detectFn({
1199
+ worktreePath,
1200
+ baseRef: "main",
1201
+ fileScope,
1202
+ });
1203
+ if (violations.length > 0 && expansionReasons.length === 0) {
1204
+ const justified = hasScopeExpansionMail(mailDbPath, agentName);
1205
+ if (!justified) {
1206
+ runnerLog(
1207
+ "warn",
1208
+ `agent modified ${violations.length} file(s) outside declared FILE_SCOPE without justification: ${violations.join(", ")}. To suppress, include 'expansion_reason: <why>' in your last commit message OR send a scope_expansion mail to your lead.`,
1209
+ );
1210
+ try {
1211
+ const evStore = createEventStore(eventsDbPath);
1212
+ try {
1213
+ evStore.insert({
1214
+ runId,
1215
+ agentName,
1216
+ sessionId: newSessionId,
1217
+ eventType: "custom",
1218
+ toolName: null,
1219
+ toolArgs: null,
1220
+ toolDurationMs: null,
1221
+ level: "warn",
1222
+ data: JSON.stringify({
1223
+ type: "scope_violation",
1224
+ violations,
1225
+ fileScope,
1226
+ }),
1227
+ });
1228
+ } finally {
1229
+ try {
1230
+ evStore.close();
1231
+ } catch {
1232
+ // best-effort
1233
+ }
1234
+ }
1235
+ } catch {
1236
+ // observability must never break the runner
1237
+ }
1238
+ }
1239
+ }
1240
+ }
1241
+ } catch {
1242
+ // scope detection is advisory — swallow all errors
1243
+ }
1244
+ }
1245
+
1246
+ const resumeMismatch =
1247
+ priorSessionId !== null && newSessionId !== null && newSessionId !== priorSessionId;
1248
+
1249
+ // Contract violation (overstory-6071): claude exited cleanly (saw a
1250
+ // `result` event with isError:false) but never sent the capability's
1251
+ // terminal mail. Pre-fix this fell through to `working` and stayed
1252
+ // there forever — the process is gone but the session looks alive.
1253
+ // Surface loudly via the runner diagnostic sink and settle to
1254
+ // `completed` so operators don't see a zombie-but-labeled-working row.
1255
+ const terminalMailMissing = cleanResult && !terminalMailObserved && !aborted && !stallAborted;
1256
+ if (terminalMailMissing) {
1257
+ const expected = terminalMailTypesFor(capability).join("|") || "<none>";
1258
+ runnerLog(
1259
+ "error",
1260
+ `agent exited cleanly without sending terminal mail (expected ${expected}); marking completed and surfacing contract violation`,
1261
+ );
1262
+ }
1263
+
1264
+ let finalState: AgentState;
1265
+ if (aborted || stallAborted) {
1266
+ finalState = "zombie";
1267
+ } else if (cleanResult && terminalMailObserved) {
1268
+ finalState = "completed";
1269
+ } else if (terminalMailMissing) {
1270
+ finalState = "completed";
1271
+ } else if (observedAnyEvent || transitionedToInTurn) {
1272
+ // Turn produced events but did not complete — settle to
1273
+ // `between_turns`, NOT `working`, so the UI can distinguish a
1274
+ // spawn-per-turn worker waiting for its next mail batch from one
1275
+ // mid-execution. The runner flips the row back to `in_turn`
1276
+ // on the next batch when the parser fires its first event
1277
+ // (overstory-3087).
1278
+ finalState = "between_turns";
1279
+ } else {
1280
+ finalState = initialState;
1281
+ }
1282
+
1283
+ if (finalState !== initialState) {
1284
+ updateSessionState(
1285
+ sessionsDbPath,
1286
+ agentName,
1287
+ finalState,
1288
+ (err) => runnerLog("warn", `failed to transition state to ${finalState}`, err),
1289
+ (prev, attempted) =>
1290
+ runnerLog(
1291
+ "warn",
1292
+ `turn-end transition ${initialState} → ${attempted} rejected: state is now ${prev}`,
1293
+ ),
1294
+ );
1295
+ }
1296
+
1297
+ // In-band parent notification (overstory-4159, overstory-c772). When the
1298
+ // turn ends without the capability's terminal mail — either because the
1299
+ // runner zombified (abort/stall) or claude exited cleanly without sending
1300
+ // `worker_done` — synthesize a `worker_died` mail to the parent so the
1301
+ // lead does not block forever waiting for a signal that will never come.
1302
+ // The watchdog's pre-tick state-snapshot dedup (mx-b0e54b) ensures a
1303
+ // later watchdog pass on the now-zombie session does not re-fire.
1304
+ const shouldNotifyParent =
1305
+ parentAgent !== null && (finalState === "zombie" || terminalMailMissing);
1306
+ if (shouldNotifyParent && parentAgent !== null) {
1307
+ const reason = aborted
1308
+ ? "Aborted by operator (SIGTERM)"
1309
+ : stallAborted
1310
+ ? "Parser stalled (no events within timeout)"
1311
+ : terminalMailMissing
1312
+ ? `Clean exit without terminal mail (expected ${terminalMailTypesFor(capability).join("|") || "<none>"})`
1313
+ : "Turn ended without terminal mail";
1314
+ notifyParentOfRunnerDeath({
1315
+ mailStore: opts._mailStore ?? null,
1316
+ mailDbPath,
1317
+ parentAgent,
1318
+ agentName,
1319
+ capability,
1320
+ taskId,
1321
+ reason,
1322
+ lastActivity: sessionLastActivity ?? new Date(startedAtMs).toISOString(),
1323
+ runnerLog,
1324
+ });
1325
+ }
1326
+
1327
+ // `lastActivity` advancing past `startedAt` is a turn-cleanup contract
1328
+ // invariant — silent failure here was the smoking gun in overstory-4af3.
1329
+ const lastActivityOk = updateSessionLastActivity(sessionsDbPath, agentName, (err) =>
1330
+ runnerLog("warn", "failed to update lastActivity", err),
1331
+ );
1332
+ if (!lastActivityOk) {
1333
+ runnerLog(
1334
+ "error",
1335
+ "lastActivity stayed at startedAt — session.lastActivity is unreliable for this turn",
1336
+ );
1337
+ }
1338
+
1339
+ const durationMs = now().getTime() - startedAtMs;
1340
+
1341
+ return {
1342
+ exitCode,
1343
+ cleanResult,
1344
+ newSessionId,
1345
+ resumeMismatch,
1346
+ terminalMailObserved,
1347
+ durationMs,
1348
+ initialState,
1349
+ finalState,
1350
+ stallAborted,
1351
+ terminalMailMissing,
1352
+ };
1353
+ } finally {
1354
+ // PID-file cleanup so a follow-up turn never sees a stale PID (covers
1355
+ // thrown errors as well as the happy path). ENOENT is expected on the
1356
+ // "spawn never happened" path; any other error is a contract violation
1357
+ // because turn.pid is the cross-process kill primitive (overstory-2cf9).
1358
+ if (turnPidPath) {
1359
+ try {
1360
+ await unlink(turnPidPath);
1361
+ } catch (err) {
1362
+ const code = (err as NodeJS.ErrnoException | undefined)?.code;
1363
+ if (code !== "ENOENT") {
1364
+ runnerLog("error", `failed to unlink turn.pid at ${turnPidPath}`, err);
1365
+ }
1366
+ }
1367
+ // Contract assertion: turn.pid must NOT survive the runner. A
1368
+ // surviving file means a follow-up `ov stop` or watchdog will target
1369
+ // a stale PID. Surface the violation loudly (overstory-4af3).
1370
+ try {
1371
+ if (existsSync(turnPidPath)) {
1372
+ runnerLog(
1373
+ "error",
1374
+ `turn.pid still exists at ${turnPidPath} after cleanup — kill primitive will target stale PID`,
1375
+ );
1376
+ }
1377
+ } catch {
1378
+ // existsSync should not throw, but keep diagnostics defensive
1379
+ }
1380
+ }
1381
+ lock.release();
1382
+ }
1383
+ }