@os-eco/overstory-cli 0.9.3 → 0.10.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116) hide show
  1. package/README.md +49 -18
  2. package/agents/builder.md +9 -8
  3. package/agents/coordinator.md +6 -6
  4. package/agents/lead.md +98 -82
  5. package/agents/merger.md +25 -14
  6. package/agents/reviewer.md +22 -16
  7. package/agents/scout.md +17 -12
  8. package/package.json +6 -3
  9. package/src/agents/capabilities.test.ts +85 -0
  10. package/src/agents/capabilities.ts +125 -0
  11. package/src/agents/headless-mail-injector.test.ts +448 -0
  12. package/src/agents/headless-mail-injector.ts +211 -0
  13. package/src/agents/headless-prompt.test.ts +102 -0
  14. package/src/agents/headless-prompt.ts +68 -0
  15. package/src/agents/hooks-deployer.test.ts +514 -14
  16. package/src/agents/hooks-deployer.ts +141 -0
  17. package/src/agents/overlay.test.ts +4 -4
  18. package/src/agents/overlay.ts +30 -8
  19. package/src/agents/turn-lock.test.ts +181 -0
  20. package/src/agents/turn-lock.ts +235 -0
  21. package/src/agents/turn-runner-dispatch.test.ts +182 -0
  22. package/src/agents/turn-runner-dispatch.ts +105 -0
  23. package/src/agents/turn-runner.test.ts +1450 -0
  24. package/src/agents/turn-runner.ts +1166 -0
  25. package/src/commands/clean.ts +56 -1
  26. package/src/commands/completions.test.ts +4 -1
  27. package/src/commands/coordinator.test.ts +127 -0
  28. package/src/commands/coordinator.ts +205 -6
  29. package/src/commands/dashboard.test.ts +188 -0
  30. package/src/commands/dashboard.ts +13 -3
  31. package/src/commands/doctor.ts +94 -77
  32. package/src/commands/group.test.ts +94 -0
  33. package/src/commands/group.ts +49 -20
  34. package/src/commands/init.test.ts +8 -0
  35. package/src/commands/init.ts +8 -1
  36. package/src/commands/log.test.ts +56 -11
  37. package/src/commands/log.ts +134 -69
  38. package/src/commands/mail.test.ts +162 -0
  39. package/src/commands/mail.ts +64 -9
  40. package/src/commands/merge.test.ts +112 -1
  41. package/src/commands/merge.ts +17 -4
  42. package/src/commands/monitor.ts +2 -1
  43. package/src/commands/nudge.test.ts +351 -4
  44. package/src/commands/nudge.ts +356 -34
  45. package/src/commands/run.test.ts +43 -7
  46. package/src/commands/serve/build.test.ts +202 -0
  47. package/src/commands/serve/build.ts +206 -0
  48. package/src/commands/serve/coordinator-actions.test.ts +339 -0
  49. package/src/commands/serve/coordinator-actions.ts +408 -0
  50. package/src/commands/serve/dev.test.ts +168 -0
  51. package/src/commands/serve/dev.ts +117 -0
  52. package/src/commands/serve/mail-actions.test.ts +312 -0
  53. package/src/commands/serve/mail-actions.ts +167 -0
  54. package/src/commands/serve/rest.test.ts +1323 -0
  55. package/src/commands/serve/rest.ts +708 -0
  56. package/src/commands/serve/static.ts +51 -0
  57. package/src/commands/serve/ws.test.ts +361 -0
  58. package/src/commands/serve/ws.ts +332 -0
  59. package/src/commands/serve.test.ts +459 -0
  60. package/src/commands/serve.ts +565 -0
  61. package/src/commands/sling.test.ts +85 -1
  62. package/src/commands/sling.ts +153 -64
  63. package/src/commands/status.test.ts +9 -0
  64. package/src/commands/status.ts +12 -4
  65. package/src/commands/stop.test.ts +174 -1
  66. package/src/commands/stop.ts +107 -8
  67. package/src/commands/supervisor.ts +2 -1
  68. package/src/commands/watch.test.ts +49 -4
  69. package/src/commands/watch.ts +153 -28
  70. package/src/commands/worktree.test.ts +319 -3
  71. package/src/commands/worktree.ts +86 -0
  72. package/src/config.test.ts +78 -0
  73. package/src/config.ts +43 -1
  74. package/src/doctor/consistency.test.ts +106 -0
  75. package/src/doctor/consistency.ts +50 -3
  76. package/src/doctor/serve.test.ts +95 -0
  77. package/src/doctor/serve.ts +86 -0
  78. package/src/doctor/types.ts +2 -1
  79. package/src/doctor/watchdog.ts +57 -1
  80. package/src/events/tailer.test.ts +234 -1
  81. package/src/events/tailer.ts +90 -0
  82. package/src/index.ts +53 -6
  83. package/src/json.ts +29 -0
  84. package/src/mail/client.ts +15 -2
  85. package/src/mail/store.test.ts +82 -0
  86. package/src/mail/store.ts +41 -4
  87. package/src/merge/lock.test.ts +149 -0
  88. package/src/merge/lock.ts +140 -0
  89. package/src/runtimes/__fixtures__/claude-stream-fixture.ts +22 -0
  90. package/src/runtimes/claude.test.ts +791 -1
  91. package/src/runtimes/claude.ts +323 -1
  92. package/src/runtimes/connections.test.ts +141 -1
  93. package/src/runtimes/connections.ts +73 -4
  94. package/src/runtimes/headless-connection.test.ts +264 -0
  95. package/src/runtimes/headless-connection.ts +158 -0
  96. package/src/runtimes/types.ts +10 -0
  97. package/src/schema-consistency.test.ts +1 -0
  98. package/src/sessions/store.test.ts +390 -24
  99. package/src/sessions/store.ts +184 -19
  100. package/src/test-setup.test.ts +31 -0
  101. package/src/test-setup.ts +28 -0
  102. package/src/types.ts +56 -1
  103. package/src/utils/pid.test.ts +85 -1
  104. package/src/utils/pid.ts +86 -1
  105. package/src/utils/process-scan.test.ts +53 -0
  106. package/src/utils/process-scan.ts +76 -0
  107. package/src/watchdog/daemon.test.ts +1520 -411
  108. package/src/watchdog/daemon.ts +442 -83
  109. package/src/watchdog/health.test.ts +157 -0
  110. package/src/watchdog/health.ts +92 -25
  111. package/src/worktree/process.test.ts +71 -0
  112. package/src/worktree/process.ts +25 -5
  113. package/src/worktree/tmux.test.ts +39 -0
  114. package/src/worktree/tmux.ts +23 -3
  115. package/templates/CLAUDE.md.tmpl +19 -8
  116. package/templates/overlay.md.tmpl +3 -2
@@ -79,6 +79,44 @@ function generateGroupId(): string {
79
79
  return `group-${crypto.randomUUID().slice(0, 8)}`;
80
80
  }
81
81
 
82
+ /**
83
+ * Resolve a group by ID or name.
84
+ *
85
+ * Names are not enforced unique by `createGroup`, so live `groups.json` files
86
+ * contain duplicate names — a naive name lookup would silently pick the wrong
87
+ * group. Resolution precedence:
88
+ * 1. Exact ID match wins (UUIDs are unambiguous).
89
+ * 2. Otherwise filter by name. If exactly one match, return it.
90
+ * 3. If multiple name matches, prefer a single `active` one. If still
91
+ * ambiguous, throw with the matching IDs so the caller can disambiguate
92
+ * by passing the UUID.
93
+ *
94
+ * @internal Exported for testing.
95
+ */
96
+ export function resolveGroup(groups: TaskGroup[], identifier: string): TaskGroup {
97
+ const byId = groups.find((g) => g.id === identifier);
98
+ if (byId) return byId;
99
+
100
+ const byName = groups.filter((g) => g.name === identifier);
101
+ if (byName.length === 1) {
102
+ const only = byName[0];
103
+ if (only) return only;
104
+ }
105
+ if (byName.length > 1) {
106
+ const active = byName.filter((g) => g.status === "active");
107
+ if (active.length === 1) {
108
+ const only = active[0];
109
+ if (only) return only;
110
+ }
111
+ const ids = byName.map((g) => g.id).join(", ");
112
+ throw new GroupError(
113
+ `Group name "${identifier}" is ambiguous (matches: ${ids}). Use the group ID.`,
114
+ { groupId: identifier },
115
+ );
116
+ }
117
+ throw new GroupError(`Group "${identifier}" not found`, { groupId: identifier });
118
+ }
119
+
82
120
  /**
83
121
  * Create a new task group.
84
122
  * @internal Exported for testing.
@@ -140,16 +178,13 @@ export async function addToGroup(
140
178
  }
141
179
 
142
180
  const groups = await loadGroups(projectRoot);
143
- const group = groups.find((g) => g.id === groupId);
144
- if (!group) {
145
- throw new GroupError(`Group "${groupId}" not found`, { groupId });
146
- }
181
+ const group = resolveGroup(groups, groupId);
147
182
 
148
183
  // Check for duplicates against existing members
149
184
  for (const id of issueIds) {
150
185
  if (group.memberIssueIds.includes(id)) {
151
- throw new GroupError(`Issue "${id}" is already a member of group "${groupId}"`, {
152
- groupId,
186
+ throw new GroupError(`Issue "${id}" is already a member of group "${group.id}"`, {
187
+ groupId: group.id,
153
188
  });
154
189
  }
155
190
  }
@@ -187,16 +222,13 @@ export async function removeFromGroup(
187
222
  }
188
223
 
189
224
  const groups = await loadGroups(projectRoot);
190
- const group = groups.find((g) => g.id === groupId);
191
- if (!group) {
192
- throw new GroupError(`Group "${groupId}" not found`, { groupId });
193
- }
225
+ const group = resolveGroup(groups, groupId);
194
226
 
195
227
  // Validate all issues are members
196
228
  for (const id of issueIds) {
197
229
  if (!group.memberIssueIds.includes(id)) {
198
- throw new GroupError(`Issue "${id}" is not a member of group "${groupId}"`, {
199
- groupId,
230
+ throw new GroupError(`Issue "${id}" is not a member of group "${group.id}"`, {
231
+ groupId: group.id,
200
232
  });
201
233
  }
202
234
  }
@@ -204,7 +236,7 @@ export async function removeFromGroup(
204
236
  // Check that removal won't empty the group
205
237
  const remaining = group.memberIssueIds.filter((id) => !issueIds.includes(id));
206
238
  if (remaining.length === 0) {
207
- throw new GroupError("Cannot remove all issues from a group", { groupId });
239
+ throw new GroupError("Cannot remove all issues from a group", { groupId: group.id });
208
240
  }
209
241
 
210
242
  group.memberIssueIds = remaining;
@@ -347,7 +379,7 @@ export function createGroupCommand(): Command {
347
379
  cmd
348
380
  .command("status")
349
381
  .description("Show progress for one or all groups")
350
- .argument("[group-id]", "Group ID (optional, shows all if omitted)")
382
+ .argument("[group-id-or-name]", "Group ID or name (optional, shows all if omitted)")
351
383
  .option("--json", "Output as JSON")
352
384
  .option("--skip-validation", "Skip task validation (for offline use)")
353
385
  .action(
@@ -361,10 +393,7 @@ export function createGroupCommand(): Command {
361
393
  const groups = await loadGroups(projectRoot);
362
394
 
363
395
  if (groupId) {
364
- const group = groups.find((g) => g.id === groupId);
365
- if (!group) {
366
- throw new GroupError(`Group "${groupId}" not found`, { groupId });
367
- }
396
+ const group = resolveGroup(groups, groupId);
368
397
  const progress = await getGroupProgress(projectRoot, group, groups, tracker);
369
398
  if (json) {
370
399
  jsonOutput("group status", { ...progress });
@@ -401,7 +430,7 @@ export function createGroupCommand(): Command {
401
430
  cmd
402
431
  .command("add")
403
432
  .description("Add issues to a group")
404
- .argument("<group-id>", "Group ID")
433
+ .argument("<group-id-or-name>", "Group ID or name")
405
434
  .argument("<ids...>", "Issue IDs to add")
406
435
  .option("--json", "Output as JSON")
407
436
  .option("--skip-validation", "Skip task validation (for offline use)")
@@ -437,7 +466,7 @@ export function createGroupCommand(): Command {
437
466
  cmd
438
467
  .command("remove")
439
468
  .description("Remove issues from a group")
440
- .argument("<group-id>", "Group ID")
469
+ .argument("<group-id-or-name>", "Group ID or name")
441
470
  .argument("<ids...>", "Issue IDs to remove")
442
471
  .option("--json", "Output as JSON")
443
472
  .action(async (groupId: string, ids: string[], opts: { json?: boolean }) => {
@@ -353,6 +353,14 @@ describe("initCommand: canonical branch detection", () => {
353
353
  const content = await Bun.file(configPath).text();
354
354
  expect(content).toContain("canonicalBranch: main");
355
355
  });
356
+
357
+ test("generated config opts into headless Claude by default (overstory-caec)", async () => {
358
+ await initCommand({ _spawner: noopSpawner });
359
+
360
+ const configPath = join(tempDir, ".overstory", "config.yaml");
361
+ const content = await Bun.file(configPath).text();
362
+ expect(content).toContain("claudeHeadlessByDefault: true");
363
+ });
356
364
  });
357
365
 
358
366
  describe("initCommand: --yes flag", () => {
@@ -816,6 +816,10 @@ export async function initCommand(opts: InitOptions): Promise<void> {
816
816
  config.project.canonicalBranch = canonicalBranch;
817
817
  if (config.runtime) {
818
818
  config.runtime.default = defaultRuntime;
819
+ // New projects default to headless Claude spawns; the UI (`ov serve`) is the
820
+ // primary operator surface and tmux is opt-in via `--no-headless`. Existing
821
+ // projects keep tmux until they edit their config (overstory-caec).
822
+ config.runtime.claudeHeadlessByDefault = true;
819
823
  }
820
824
 
821
825
  const configYaml = serializeConfigToYaml(config);
@@ -956,5 +960,8 @@ export async function initCommand(opts: InitOptions): Promise<void> {
956
960
 
957
961
  printSuccess("Initialized");
958
962
  printHint("Next: run `ov hooks install` to enable Claude Code hooks.");
959
- printHint("Then: run `ov status` to see the current state.");
963
+ printHint("Then: `ov coordinator start` and `ov serve` open http://localhost:7321");
964
+ printHint(
965
+ " (UI is the primary operator surface; pass `--no-headless` to ov sling for tmux attach)",
966
+ );
960
967
  }
@@ -633,8 +633,55 @@ describe("logCommand", () => {
633
633
  });
634
634
  });
635
635
 
636
- test("session-end writes pending-nudge marker for coordinator when lead completes", async () => {
637
- // Create sessions.db with a lead agent
636
+ test("session-end does NOT transition lead to completed (persistent agent)", async () => {
637
+ // Regression test for overstory-49a7:
638
+ // The lead's Stop hook fires every turn (interactive Claude Code), not just at
639
+ // true session end. session-end must NOT mark leads completed, or they vanish
640
+ // from getActive() after their first turn while their tmux is still alive.
641
+ const dbPath = join(tempDir, ".overstory", "sessions.db");
642
+ const session: AgentSession = {
643
+ id: "session-lead",
644
+ agentName: "lead-alpha",
645
+ capability: "lead",
646
+ worktreePath: tempDir,
647
+ branchName: "lead-alpha-branch",
648
+ taskId: "bead-lead-001",
649
+ tmuxSession: "overstory-lead-alpha",
650
+ state: "working",
651
+ pid: 33333,
652
+ parentAgent: null,
653
+ depth: 0,
654
+ runId: null,
655
+ startedAt: new Date().toISOString(),
656
+ lastActivity: new Date(Date.now() - 60_000).toISOString(),
657
+ escalationLevel: 0,
658
+ stalledSince: null,
659
+ transcriptPath: null,
660
+ };
661
+ const store = createSessionStore(dbPath);
662
+ store.upsert(session);
663
+ store.close();
664
+
665
+ await logCommand(["session-end", "--agent", "lead-alpha"]);
666
+
667
+ // Lead should remain 'working', not transition to 'completed'
668
+ const readStore = createSessionStore(dbPath);
669
+ const updatedSession = readStore.getByName("lead-alpha");
670
+ readStore.close();
671
+
672
+ expect(updatedSession).toBeDefined();
673
+ expect(updatedSession?.state).toBe("working");
674
+ // But lastActivity should be updated
675
+ expect(new Date(updatedSession?.lastActivity ?? "").getTime()).toBeGreaterThan(
676
+ new Date(session.lastActivity).getTime(),
677
+ );
678
+ });
679
+
680
+ test("session-end does NOT write pending-nudge marker for leads (moved to ov stop)", async () => {
681
+ // Regression test for overstory-49a7:
682
+ // The lead_completed nudge used to fire from the per-turn Stop hook, spamming
683
+ // the coordinator with false completion signals every turn. It is now emitted
684
+ // only by `ov stop <lead>` (the real completion signal).
638
685
  const dbPath = join(tempDir, ".overstory", "sessions.db");
639
686
  const session: AgentSession = {
640
687
  id: "session-lead",
@@ -661,17 +708,10 @@ describe("logCommand", () => {
661
708
 
662
709
  await logCommand(["session-end", "--agent", "lead-alpha"]);
663
710
 
664
- // Verify the pending-nudge marker was written for the coordinator
711
+ // No pending-nudge marker should be written from session-end
665
712
  const markerPath = join(tempDir, ".overstory", "pending-nudges", "coordinator.json");
666
713
  const markerFile = Bun.file(markerPath);
667
- expect(await markerFile.exists()).toBe(true);
668
-
669
- const marker = JSON.parse(await markerFile.text());
670
- expect(marker.from).toBe("lead-alpha");
671
- expect(marker.reason).toBe("lead_completed");
672
- expect(marker.subject).toContain("lead-alpha");
673
- expect(marker.messageId).toContain("auto-nudge-lead-alpha-");
674
- expect(marker.createdAt).toBeDefined();
714
+ expect(await markerFile.exists()).toBe(false);
675
715
  });
676
716
 
677
717
  test("session-end does NOT write pending-nudge marker for non-lead agents", async () => {
@@ -1312,6 +1352,10 @@ try {
1312
1352
  stdin: "pipe",
1313
1353
  stdout: "pipe",
1314
1354
  stderr: "pipe",
1355
+ // Pin project root to tempDir. Without this, a subprocess started from
1356
+ // inside an `ov sling`-spawned worktree inherits OVERSTORY_PROJECT_ROOT
1357
+ // pointing at the parent project, and writes events to prod's events.db.
1358
+ env: { ...process.env, OVERSTORY_PROJECT_ROOT: tempDir },
1315
1359
  });
1316
1360
 
1317
1361
  // Write the JSON payload to stdin and close
@@ -1501,6 +1545,7 @@ try {
1501
1545
  stdin: "pipe",
1502
1546
  stdout: "pipe",
1503
1547
  stderr: "pipe",
1548
+ env: { ...process.env, OVERSTORY_PROJECT_ROOT: tempDir },
1504
1549
  });
1505
1550
 
1506
1551
  // Write empty string and close immediately
@@ -12,6 +12,7 @@
12
12
 
13
13
  import { join } from "node:path";
14
14
  import { Command } from "commander";
15
+ import { isStopHookPersistentCapability } from "../agents/capabilities.ts";
15
16
  import { updateIdentity } from "../agents/identity.ts";
16
17
  import { loadConfig } from "../config.ts";
17
18
  import { ValidationError } from "../errors.ts";
@@ -66,8 +67,12 @@ function updateLastActivity(projectRoot: string, agentName: string): void {
66
67
  const session = store.getByName(agentName);
67
68
  if (session) {
68
69
  store.updateLastActivity(agentName);
69
- if (session.state === "booting" || session.state === "zombie") {
70
- store.updateState(agentName, "working");
70
+ // Tool-use observed: try booting → working. Matrix-guarded so a
71
+ // zombie classification (set by watchdog) is NOT silently revived
72
+ // here — that revival was a contributor to the schizophrenic
73
+ // state=zombie + tool-use-active symptom in overstory-a993.
74
+ if (session.state === "booting") {
75
+ store.tryTransitionState(agentName, "working");
71
76
  }
72
77
  }
73
78
  } finally {
@@ -79,63 +84,144 @@ function updateLastActivity(projectRoot: string, agentName: string): void {
79
84
  }
80
85
 
81
86
  /**
82
- * Agent capabilities that run as persistent interactive sessions.
83
- * The Stop hook fires every turn for these agents (not just at session end),
84
- * so they must NOT auto-transition to 'completed' on session-end events.
87
+ * Maximum retry attempts for the session-end transition.
88
+ *
89
+ * The Stop hook is the only signal that turns sessions.db state from
90
+ * "working" to "completed" for headless legacy paths and tmux sessions.
91
+ * If it loses that signal due to a transient SQLite contention error
92
+ * (e.g. "database is locked" while the watchdog ticks against the same
93
+ * file), the row stays in "working" forever and the watchdog later
94
+ * promotes it to "zombie". Retrying with exponential backoff lets brief
95
+ * lock contention resolve before we give up. (overstory-e74b)
85
96
  */
86
- const PERSISTENT_CAPABILITIES = new Set(["coordinator", "orchestrator", "monitor"]);
97
+ const TRANSITION_MAX_ATTEMPTS = 5;
98
+ const TRANSITION_BACKOFF_BASE_MS = 50;
87
99
 
88
100
  /**
89
- * Transition agent state to 'completed' in the SessionStore.
90
- * Called when session-end event fires.
91
- *
92
- * Skips the transition for persistent agent types (coordinator, orchestrator, monitor)
93
- * whose Stop hook fires every turn, not just at true session end.
101
+ * One attempt at the session-end state transition.
94
102
  *
95
- * Non-fatal: silently ignores errors to avoid breaking hook execution.
103
+ * Throws on transient failures (e.g. SQLite "database is locked") so the
104
+ * caller can retry. The body is the original logic from
105
+ * `transitionToCompleted`.
96
106
  */
97
- function transitionToCompleted(projectRoot: string, agentName: string): void {
107
+ function transitionToCompletedOnce(projectRoot: string, agentName: string): void {
108
+ const overstoryDir = join(projectRoot, ".overstory");
109
+ const { store } = openSessionStore(overstoryDir);
98
110
  try {
99
- const overstoryDir = join(projectRoot, ".overstory");
100
- const { store } = openSessionStore(overstoryDir);
101
- try {
102
- const session = store.getByName(agentName);
103
- if (session && PERSISTENT_CAPABILITIES.has(session.capability)) {
104
- // Check if a persistent top-level agent self-exited by verifying the run
105
- // is already completed.
106
- // If `ov run complete` was called before session-end, the run status is 'completed'
107
- // and we should transition the persistent session to completed too.
108
- if (
109
- (session.capability === "coordinator" || session.capability === "orchestrator") &&
110
- session.runId
111
- ) {
112
- const runStore = createRunStore(join(overstoryDir, "sessions.db"));
113
- try {
114
- const run = runStore.getRun(session.runId);
115
- if (run && run.status === "completed") {
116
- // Self-exit: the persistent agent called ov run complete before session ended
117
- store.updateState(agentName, "completed");
118
- store.updateLastActivity(agentName);
119
- return;
120
- }
121
- } finally {
122
- runStore.close();
111
+ const session = store.getByName(agentName);
112
+ if (session && isStopHookPersistentCapability(session.capability)) {
113
+ // Check if a persistent top-level agent self-exited by verifying the run
114
+ // is already completed.
115
+ // If `ov run complete` was called before session-end, the run status is 'completed'
116
+ // and we should transition the persistent session to completed too.
117
+ if (
118
+ (session.capability === "coordinator" || session.capability === "orchestrator") &&
119
+ session.runId
120
+ ) {
121
+ const runStore = createRunStore(join(overstoryDir, "sessions.db"));
122
+ try {
123
+ const run = runStore.getRun(session.runId);
124
+ if (run && run.status === "completed") {
125
+ // Self-exit: the persistent agent called ov run complete before session ended
126
+ store.updateState(agentName, "completed");
127
+ store.updateLastActivity(agentName);
128
+ return;
123
129
  }
130
+ } finally {
131
+ runStore.close();
124
132
  }
125
- // Normal persistent agent: only update activity, don't mark completed
126
- store.updateLastActivity(agentName);
127
- return;
128
133
  }
129
- store.updateState(agentName, "completed");
134
+ // Normal persistent agent: only update activity, don't mark completed
130
135
  store.updateLastActivity(agentName);
136
+ return;
137
+ }
138
+ store.updateState(agentName, "completed");
139
+ store.updateLastActivity(agentName);
140
+ } finally {
141
+ store.close();
142
+ }
143
+ }
144
+
145
+ /**
146
+ * Best-effort: log a session-end hook failure to events.db so it surfaces in
147
+ * `ov errors` and trace timelines. Swallows secondary errors (events.db may
148
+ * also be locked when the primary write failed).
149
+ */
150
+ async function logHookFailure(
151
+ projectRoot: string,
152
+ agentName: string,
153
+ hookName: string,
154
+ error: unknown,
155
+ attempts: number,
156
+ ): Promise<void> {
157
+ try {
158
+ const eventsDbPath = join(projectRoot, ".overstory", "events.db");
159
+ const eventStore = createEventStore(eventsDbPath);
160
+ try {
161
+ eventStore.insert({
162
+ runId: null,
163
+ agentName,
164
+ sessionId: null,
165
+ eventType: "error",
166
+ toolName: null,
167
+ toolArgs: null,
168
+ toolDurationMs: null,
169
+ level: "error",
170
+ data: JSON.stringify({
171
+ hook: hookName,
172
+ attempts,
173
+ message: error instanceof Error ? error.message : String(error),
174
+ }),
175
+ });
131
176
  } finally {
132
- store.close();
177
+ eventStore.close();
133
178
  }
134
179
  } catch {
135
- // Non-fatal: don't break logging if session update fails
180
+ // Non-fatal: events.db may also be unavailable when the primary write failed.
136
181
  }
137
182
  }
138
183
 
184
+ /**
185
+ * Transition agent state to 'completed' in the SessionStore.
186
+ * Called when session-end event fires.
187
+ *
188
+ * Retries on transient SQLite contention with exponential backoff
189
+ * (50/100/200/400ms between the 5 attempts). On persistent failure, records an `error` event
190
+ * to events.db so the missed signal shows up in observability tooling and
191
+ * the watchdog's stale-but-tmux-dead fallback can recognize it.
192
+ * (overstory-e74b)
193
+ *
194
+ * Skips the transition for capabilities in `STOP_HOOK_PERSISTENT_CAPABILITIES`
195
+ * (coordinator, orchestrator, monitor, lead) whose Stop hook fires every model
196
+ * turn rather than once at true session end. See
197
+ * `src/agents/capabilities.ts` for the full rationale and consumer list.
198
+ *
199
+ * Non-fatal: never throws — a failure that survives all retries is logged best-effort, not propagated, to avoid breaking hook execution.
200
+ */
201
+ async function transitionToCompleted(projectRoot: string, agentName: string): Promise<void> {
202
+ let lastError: unknown;
203
+ for (let attempt = 0; attempt < TRANSITION_MAX_ATTEMPTS; attempt++) {
204
+ try {
205
+ transitionToCompletedOnce(projectRoot, agentName);
206
+ return;
207
+ } catch (err) {
208
+ lastError = err;
209
+ if (attempt < TRANSITION_MAX_ATTEMPTS - 1) {
210
+ await Bun.sleep(TRANSITION_BACKOFF_BASE_MS * 2 ** attempt);
211
+ }
212
+ }
213
+ }
214
+
215
+ // All retries failed — surface the missed signal via events.db.
216
+ await logHookFailure(
217
+ projectRoot,
218
+ agentName,
219
+ "session-end:transitionToCompleted",
220
+ lastError,
221
+ TRANSITION_MAX_ATTEMPTS,
222
+ );
223
+ }
224
+
139
225
  /**
140
226
  * Look up an agent's session record.
141
227
  * Returns null if not found.
@@ -629,8 +715,9 @@ async function runLog(opts: {
629
715
  }
630
716
  case "session-end":
631
717
  logger.info("session.end", { agentName: opts.agent });
632
- // Transition agent state to completed
633
- transitionToCompleted(config.project.root, opts.agent);
718
+ // Transition agent state to completed (with retry/backoff and
719
+ // events.db fallback on persistent failure — overstory-e74b).
720
+ await transitionToCompleted(config.project.root, opts.agent);
634
721
  // Look up agent session for identity update and metrics recording
635
722
  {
636
723
  const agentSession = getAgentSession(config.project.root, opts.agent);
@@ -647,28 +734,6 @@ async function runLog(opts: {
647
734
  // Non-fatal: identity may not exist for this agent
648
735
  }
649
736
 
650
- // Auto-nudge coordinator when a lead completes so it wakes up
651
- // to process merge_ready / worker_done messages without waiting
652
- // for user input (see decision mx-728f8d).
653
- if (agentSession?.capability === "lead") {
654
- try {
655
- const nudgesDir = join(config.project.root, ".overstory", "pending-nudges");
656
- const { mkdir } = await import("node:fs/promises");
657
- await mkdir(nudgesDir, { recursive: true });
658
- const markerPath = join(nudgesDir, "coordinator.json");
659
- const marker = {
660
- from: opts.agent,
661
- reason: "lead_completed",
662
- subject: `Lead ${opts.agent} completed — check mail for merge_ready/worker_done`,
663
- messageId: `auto-nudge-${opts.agent}-${Date.now()}`,
664
- createdAt: new Date().toISOString(),
665
- };
666
- await Bun.write(markerPath, `${JSON.stringify(marker, null, "\t")}\n`);
667
- } catch {
668
- // Non-fatal: nudge failure should not break session-end
669
- }
670
- }
671
-
672
737
  // Record session metrics (with optional token data from transcript)
673
738
  if (agentSession) {
674
739
  // NOTE: We intentionally do NOT auto-complete the run here for coordinator agents.
@@ -730,7 +795,7 @@ async function runLog(opts: {
730
795
 
731
796
  // Auto-record expertise via mulch learn + record (post-session).
732
797
  // Skip persistent agents whose Stop hook fires every turn.
733
- if (!PERSISTENT_CAPABILITIES.has(agentSession.capability)) {
798
+ if (!isStopHookPersistentCapability(agentSession.capability)) {
734
799
  try {
735
800
  const mulchClient = createMulchClient(config.project.root);
736
801
  const mailDbPath = join(config.project.root, ".overstory", "mail.db");
@@ -751,7 +816,7 @@ async function runLog(opts: {
751
816
 
752
817
  // Append outcomes to applied mulch records (outcome feedback loop).
753
818
  // Reads applied-records.json written by sling.ts at spawn time.
754
- if (!PERSISTENT_CAPABILITIES.has(agentSession.capability)) {
819
+ if (!isStopHookPersistentCapability(agentSession.capability)) {
755
820
  try {
756
821
  const mulchClient = createMulchClient(config.project.root);
757
822
  await appendOutcomeToAppliedRecords({