stagent 0.9.5 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (277) hide show
  1. package/README.md +5 -42
  2. package/dist/cli.js +42 -18
  3. package/docs/.coverage-gaps.json +13 -55
  4. package/docs/.last-generated +1 -1
  5. package/docs/features/provider-runtimes.md +4 -0
  6. package/docs/features/schedules.md +32 -4
  7. package/docs/features/settings.md +28 -5
  8. package/docs/features/tables.md +9 -2
  9. package/docs/features/workflows.md +10 -4
  10. package/docs/journeys/developer.md +15 -1
  11. package/docs/journeys/personal-use.md +21 -4
  12. package/docs/superpowers/plans/2026-04-07-instance-bootstrap.md +1691 -0
  13. package/docs/superpowers/plans/2026-04-08-schedule-orchestration.md +2983 -0
  14. package/docs/superpowers/plans/2026-04-11-schedule-maxturns-api-control.md +551 -0
  15. package/docs/superpowers/plans/2026-04-11-task-create-profile-validation.md +864 -0
  16. package/docs/superpowers/plans/2026-04-11-task-runtime-stagent-mcp-injection.md +739 -0
  17. package/docs/superpowers/specs/2026-04-08-chat-sse-resilience-hotfix-design.md +201 -0
  18. package/docs/superpowers/specs/2026-04-08-schedule-orchestration-design.md +371 -0
  19. package/docs/superpowers/specs/2026-04-08-swarm-visibility-design.md +213 -0
  20. package/package.json +3 -2
  21. package/src/__tests__/instrumentation-smoke.test.ts +15 -0
  22. package/src/app/analytics/page.tsx +1 -21
  23. package/src/app/api/chat/conversations/[id]/messages/route.ts +22 -1
  24. package/src/app/api/diagnostics/chat-streams/route.ts +65 -0
  25. package/src/app/api/instance/config/route.ts +41 -0
  26. package/src/app/api/instance/init/route.ts +34 -0
  27. package/src/app/api/instance/upgrade/check/route.ts +26 -0
  28. package/src/app/api/instance/upgrade/route.ts +96 -0
  29. package/src/app/api/instance/upgrade/status/route.ts +35 -0
  30. package/src/app/api/memory/route.ts +0 -11
  31. package/src/app/api/notifications/route.ts +4 -2
  32. package/src/app/api/projects/[id]/route.ts +5 -155
  33. package/src/app/api/projects/__tests__/delete-project.test.ts +10 -19
  34. package/src/app/api/schedules/[id]/execute/route.ts +111 -0
  35. package/src/app/api/schedules/[id]/route.ts +9 -1
  36. package/src/app/api/schedules/__tests__/execute-route.test.ts +118 -0
  37. package/src/app/api/schedules/route.ts +3 -12
  38. package/src/app/api/settings/openai/login/route.ts +22 -0
  39. package/src/app/api/settings/openai/logout/route.ts +7 -0
  40. package/src/app/api/settings/openai/route.ts +21 -1
  41. package/src/app/api/settings/providers/route.ts +35 -8
  42. package/src/app/api/tables/[id]/enrich/__tests__/route.test.ts +153 -0
  43. package/src/app/api/tables/[id]/enrich/plan/route.ts +98 -0
  44. package/src/app/api/tables/[id]/enrich/route.ts +147 -0
  45. package/src/app/api/tables/[id]/enrich/runs/route.ts +25 -0
  46. package/src/app/api/tasks/[id]/execute/route.ts +0 -21
  47. package/src/app/api/workflows/[id]/resume/route.ts +59 -0
  48. package/src/app/api/workflows/[id]/status/route.ts +22 -8
  49. package/src/app/api/workspace/context/route.ts +2 -0
  50. package/src/app/api/workspace/fix-data-dir/route.ts +81 -0
  51. package/src/app/chat/page.tsx +11 -0
  52. package/src/app/inbox/page.tsx +12 -5
  53. package/src/app/layout.tsx +42 -21
  54. package/src/app/page.tsx +0 -2
  55. package/src/app/settings/page.tsx +6 -9
  56. package/src/components/chat/__tests__/chat-session-provider.test.tsx +408 -0
  57. package/src/components/chat/chat-command-popover.tsx +2 -2
  58. package/src/components/chat/chat-input.tsx +2 -3
  59. package/src/components/chat/chat-session-provider.tsx +720 -0
  60. package/src/components/chat/chat-shell.tsx +92 -401
  61. package/src/components/instance/__tests__/instance-section.test.tsx +125 -0
  62. package/src/components/instance/instance-section.tsx +382 -0
  63. package/src/components/instance/upgrade-badge.tsx +219 -0
  64. package/src/components/notifications/__tests__/batch-proposal-review.test.tsx +95 -0
  65. package/src/components/notifications/__tests__/notification-item.test.tsx +106 -0
  66. package/src/components/notifications/batch-proposal-review.tsx +20 -5
  67. package/src/components/notifications/inbox-list.tsx +11 -2
  68. package/src/components/notifications/notification-item.tsx +56 -2
  69. package/src/components/notifications/pending-approval-host.tsx +56 -37
  70. package/src/components/schedules/schedule-create-sheet.tsx +19 -1
  71. package/src/components/schedules/schedule-edit-sheet.tsx +20 -1
  72. package/src/components/schedules/schedule-form.tsx +31 -0
  73. package/src/components/settings/__tests__/providers-runtimes-section.test.tsx +149 -0
  74. package/src/components/settings/auth-method-selector.tsx +19 -4
  75. package/src/components/settings/auth-status-badge.tsx +28 -3
  76. package/src/components/settings/openai-chatgpt-auth-control.tsx +278 -0
  77. package/src/components/settings/openai-runtime-section.tsx +7 -1
  78. package/src/components/settings/providers-runtimes-section.tsx +138 -19
  79. package/src/components/shared/app-sidebar.tsx +4 -3
  80. package/src/components/shared/command-palette.tsx +4 -5
  81. package/src/components/shared/theme-toggle.tsx +5 -24
  82. package/src/components/shared/workspace-indicator.tsx +61 -2
  83. package/src/components/tables/__tests__/table-enrichment-sheet.test.tsx +130 -0
  84. package/src/components/tables/table-create-sheet.tsx +4 -0
  85. package/src/components/tables/table-enrichment-runs.tsx +103 -0
  86. package/src/components/tables/table-enrichment-sheet.tsx +538 -0
  87. package/src/components/tables/table-spreadsheet.tsx +29 -5
  88. package/src/components/tables/table-toolbar.tsx +10 -1
  89. package/src/components/tasks/kanban-board.tsx +1 -0
  90. package/src/components/tasks/kanban-column.tsx +53 -14
  91. package/src/components/tasks/task-bento-grid.tsx +19 -0
  92. package/src/components/tasks/task-card.tsx +26 -3
  93. package/src/components/tasks/task-chip-bar.tsx +24 -0
  94. package/src/components/tasks/task-result-renderer.tsx +1 -1
  95. package/src/components/workflows/delay-step-body.tsx +109 -0
  96. package/src/components/workflows/hooks/use-workflow-status.ts +50 -0
  97. package/src/components/workflows/loop-status-view.tsx +1 -1
  98. package/src/components/workflows/shared/step-result.tsx +78 -0
  99. package/src/components/workflows/shared/workflow-header.tsx +141 -0
  100. package/src/components/workflows/shared/workflow-loading-skeleton.tsx +36 -0
  101. package/src/components/workflows/swarm-dashboard.tsx +2 -15
  102. package/src/components/workflows/views/loop-pattern-view.tsx +137 -0
  103. package/src/components/workflows/views/sequence-pattern-view.tsx +511 -0
  104. package/src/components/workflows/workflow-form-view.tsx +133 -16
  105. package/src/components/workflows/workflow-status-view.tsx +30 -740
  106. package/src/instrumentation-node.ts +94 -0
  107. package/src/instrumentation.ts +4 -48
  108. package/src/lib/agents/__tests__/claude-agent.test.ts +199 -0
  109. package/src/lib/agents/__tests__/execution-manager.test.ts +1 -27
  110. package/src/lib/agents/__tests__/failure-reason.test.ts +68 -0
  111. package/src/lib/agents/__tests__/learned-context.test.ts +0 -11
  112. package/src/lib/agents/__tests__/learning-session.test.ts +158 -0
  113. package/src/lib/agents/__tests__/pattern-extractor.test.ts +48 -0
  114. package/src/lib/agents/claude-agent.ts +155 -18
  115. package/src/lib/agents/execution-manager.ts +0 -35
  116. package/src/lib/agents/learned-context.ts +0 -12
  117. package/src/lib/agents/learning-session.ts +18 -5
  118. package/src/lib/agents/profiles/__tests__/registry.test.ts +6 -4
  119. package/src/lib/agents/profiles/builtins/upgrade-assistant/SKILL.md +70 -0
  120. package/src/lib/agents/profiles/builtins/upgrade-assistant/profile.yaml +32 -0
  121. package/src/lib/agents/runtime/__tests__/openai-codex-auth.test.ts +118 -0
  122. package/src/lib/agents/runtime/codex-app-server-client.ts +11 -5
  123. package/src/lib/agents/runtime/openai-codex-auth.ts +389 -0
  124. package/src/lib/agents/runtime/openai-codex.ts +29 -60
  125. package/src/lib/agents/runtime/types.ts +8 -0
  126. package/src/lib/book/chapter-mapping.ts +11 -0
  127. package/src/lib/book/content.ts +10 -0
  128. package/src/lib/chat/__tests__/active-streams.test.ts +49 -0
  129. package/src/lib/chat/__tests__/finalize-safety-net.test.ts +139 -0
  130. package/src/lib/chat/__tests__/reconcile.test.ts +137 -0
  131. package/src/lib/chat/__tests__/stream-telemetry.test.ts +151 -0
  132. package/src/lib/chat/active-streams.ts +27 -0
  133. package/src/lib/chat/codex-engine.ts +16 -17
  134. package/src/lib/chat/context-builder.ts +5 -3
  135. package/src/lib/chat/engine.ts +50 -3
  136. package/src/lib/chat/reconcile.ts +117 -0
  137. package/src/lib/chat/stagent-tools.ts +1 -0
  138. package/src/lib/chat/stream-telemetry.ts +132 -0
  139. package/src/lib/chat/suggested-prompts.ts +28 -1
  140. package/src/lib/chat/system-prompt.ts +26 -1
  141. package/src/lib/chat/tool-catalog.ts +2 -1
  142. package/src/lib/chat/tools/__tests__/enrich-table-tool.test.ts +127 -0
  143. package/src/lib/chat/tools/__tests__/schedule-tools.test.ts +261 -0
  144. package/src/lib/chat/tools/__tests__/task-tools.test.ts +352 -0
  145. package/src/lib/chat/tools/__tests__/workflow-tools-dedup.test.ts +217 -0
  146. package/src/lib/chat/tools/document-tools.ts +29 -13
  147. package/src/lib/chat/tools/helpers.ts +39 -0
  148. package/src/lib/chat/tools/notification-tools.ts +9 -5
  149. package/src/lib/chat/tools/project-tools.ts +33 -0
  150. package/src/lib/chat/tools/schedule-tools.ts +44 -11
  151. package/src/lib/chat/tools/table-tools.ts +71 -0
  152. package/src/lib/chat/tools/task-tools.ts +84 -20
  153. package/src/lib/chat/tools/workflow-tools.ts +234 -32
  154. package/src/lib/constants/settings.ts +8 -18
  155. package/src/lib/data/__tests__/clear.test.ts +56 -2
  156. package/src/lib/data/clear.ts +20 -15
  157. package/src/lib/data/delete-project.ts +171 -0
  158. package/src/lib/db/__tests__/bootstrap.test.ts +1 -1
  159. package/src/lib/db/bootstrap.ts +45 -16
  160. package/src/lib/db/index.ts +5 -0
  161. package/src/lib/db/migrations/0009_add_app_instances.sql +25 -0
  162. package/src/lib/db/migrations/0024_add_workflow_resume_at.sql +10 -0
  163. package/src/lib/db/migrations/0025_drop_app_instances.sql +3 -0
  164. package/src/lib/db/migrations/0026_drop_license.sql +3 -0
  165. package/src/lib/db/migrations/meta/_journal.json +21 -0
  166. package/src/lib/db/schema.ts +68 -23
  167. package/src/lib/environment/workspace-context.ts +13 -1
  168. package/src/lib/import/dedup.ts +4 -54
  169. package/src/lib/instance/__tests__/bootstrap.test.ts +362 -0
  170. package/src/lib/instance/__tests__/detect.test.ts +115 -0
  171. package/src/lib/instance/__tests__/fingerprint.test.ts +48 -0
  172. package/src/lib/instance/__tests__/git-ops.test.ts +95 -0
  173. package/src/lib/instance/__tests__/settings.test.ts +83 -0
  174. package/src/lib/instance/__tests__/upgrade-poller.test.ts +131 -0
  175. package/src/lib/instance/bootstrap.ts +270 -0
  176. package/src/lib/instance/detect.ts +49 -0
  177. package/src/lib/instance/fingerprint.ts +78 -0
  178. package/src/lib/instance/git-ops.ts +95 -0
  179. package/src/lib/instance/settings.ts +61 -0
  180. package/src/lib/instance/types.ts +77 -0
  181. package/src/lib/instance/upgrade-poller.ts +153 -0
  182. package/src/lib/notifications/__tests__/visibility.test.ts +51 -0
  183. package/src/lib/notifications/visibility.ts +33 -0
  184. package/src/lib/schedules/__tests__/collision-check.test.ts +93 -0
  185. package/src/lib/schedules/__tests__/config.test.ts +62 -0
  186. package/src/lib/schedules/__tests__/firing-metrics.test.ts +99 -0
  187. package/src/lib/schedules/__tests__/integration.test.ts +82 -0
  188. package/src/lib/schedules/__tests__/slot-claim.test.ts +242 -0
  189. package/src/lib/schedules/__tests__/tick-scheduler.test.ts +102 -0
  190. package/src/lib/schedules/__tests__/turn-budget.test.ts +228 -0
  191. package/src/lib/schedules/collision-check.ts +105 -0
  192. package/src/lib/schedules/config.ts +53 -0
  193. package/src/lib/schedules/scheduler.ts +232 -13
  194. package/src/lib/schedules/slot-claim.ts +105 -0
  195. package/src/lib/settings/__tests__/openai-auth.test.ts +101 -0
  196. package/src/lib/settings/__tests__/openai-login-manager.test.ts +64 -0
  197. package/src/lib/settings/__tests__/runtime-setup.test.ts +33 -0
  198. package/src/lib/settings/openai-auth.ts +105 -10
  199. package/src/lib/settings/openai-login-manager.ts +260 -0
  200. package/src/lib/settings/runtime-setup.ts +14 -4
  201. package/src/lib/tables/__tests__/enrichment-planner.test.ts +124 -0
  202. package/src/lib/tables/__tests__/enrichment.test.ts +147 -0
  203. package/src/lib/tables/enrichment-planner.ts +454 -0
  204. package/src/lib/tables/enrichment.ts +328 -0
  205. package/src/lib/tables/query-builder.ts +5 -2
  206. package/src/lib/tables/trigger-evaluator.ts +3 -2
  207. package/src/lib/theme.ts +71 -0
  208. package/src/lib/usage/ledger.ts +2 -18
  209. package/src/lib/util/__tests__/similarity.test.ts +106 -0
  210. package/src/lib/util/similarity.ts +77 -0
  211. package/src/lib/utils/format-timestamp.ts +24 -0
  212. package/src/lib/utils/stagent-paths.ts +12 -0
  213. package/src/lib/validators/__tests__/blueprint.test.ts +172 -0
  214. package/src/lib/validators/__tests__/settings.test.ts +10 -0
  215. package/src/lib/validators/blueprint.ts +70 -9
  216. package/src/lib/validators/profile.ts +2 -2
  217. package/src/lib/validators/settings.ts +3 -1
  218. package/src/lib/workflows/__tests__/delay.test.ts +196 -0
  219. package/src/lib/workflows/__tests__/engine.test.ts +8 -0
  220. package/src/lib/workflows/__tests__/loop-executor.test.ts +54 -0
  221. package/src/lib/workflows/__tests__/post-action.test.ts +108 -0
  222. package/src/lib/workflows/blueprints/instantiator.ts +22 -1
  223. package/src/lib/workflows/blueprints/types.ts +10 -2
  224. package/src/lib/workflows/delay.ts +106 -0
  225. package/src/lib/workflows/engine.ts +207 -4
  226. package/src/lib/workflows/loop-executor.ts +349 -24
  227. package/src/lib/workflows/post-action.ts +91 -0
  228. package/src/lib/workflows/types.ts +166 -1
  229. package/src/app/api/license/checkout/route.ts +0 -28
  230. package/src/app/api/license/portal/route.ts +0 -26
  231. package/src/app/api/license/route.ts +0 -89
  232. package/src/app/api/license/usage/route.ts +0 -63
  233. package/src/app/api/marketplace/browse/route.ts +0 -15
  234. package/src/app/api/marketplace/import/route.ts +0 -28
  235. package/src/app/api/marketplace/publish/route.ts +0 -40
  236. package/src/app/api/onboarding/email/route.ts +0 -53
  237. package/src/app/api/settings/telemetry/route.ts +0 -14
  238. package/src/app/api/sync/export/route.ts +0 -54
  239. package/src/app/api/sync/restore/route.ts +0 -37
  240. package/src/app/api/sync/sessions/route.ts +0 -24
  241. package/src/app/auth/callback/route.ts +0 -73
  242. package/src/app/marketplace/page.tsx +0 -19
  243. package/src/components/analytics/analytics-gate-card.tsx +0 -101
  244. package/src/components/marketplace/blueprint-card.tsx +0 -61
  245. package/src/components/marketplace/marketplace-browser.tsx +0 -131
  246. package/src/components/onboarding/email-capture-card.tsx +0 -104
  247. package/src/components/settings/activation-form.tsx +0 -95
  248. package/src/components/settings/cloud-account-section.tsx +0 -147
  249. package/src/components/settings/cloud-sync-section.tsx +0 -155
  250. package/src/components/settings/subscription-section.tsx +0 -410
  251. package/src/components/settings/telemetry-section.tsx +0 -80
  252. package/src/components/shared/premium-gate-overlay.tsx +0 -50
  253. package/src/components/shared/schedule-gate-dialog.tsx +0 -64
  254. package/src/components/shared/upgrade-banner.tsx +0 -112
  255. package/src/hooks/use-supabase-auth.ts +0 -79
  256. package/src/lib/billing/email.ts +0 -54
  257. package/src/lib/billing/products.ts +0 -80
  258. package/src/lib/billing/stripe.ts +0 -101
  259. package/src/lib/cloud/supabase-browser.ts +0 -32
  260. package/src/lib/cloud/supabase-client.ts +0 -56
  261. package/src/lib/license/__tests__/features.test.ts +0 -56
  262. package/src/lib/license/__tests__/key-format.test.ts +0 -88
  263. package/src/lib/license/__tests__/manager.test.ts +0 -64
  264. package/src/lib/license/__tests__/tier-limits.test.ts +0 -79
  265. package/src/lib/license/cloud-validation.ts +0 -60
  266. package/src/lib/license/features.ts +0 -44
  267. package/src/lib/license/key-format.ts +0 -101
  268. package/src/lib/license/limit-check.ts +0 -111
  269. package/src/lib/license/limit-queries.ts +0 -51
  270. package/src/lib/license/manager.ts +0 -345
  271. package/src/lib/license/notifications.ts +0 -59
  272. package/src/lib/license/tier-limits.ts +0 -71
  273. package/src/lib/marketplace/marketplace-client.ts +0 -107
  274. package/src/lib/sync/cloud-sync.ts +0 -235
  275. package/src/lib/telemetry/conversion-events.ts +0 -71
  276. package/src/lib/telemetry/queue.ts +0 -122
  277. package/src/lib/validators/license.ts +0 -33
@@ -0,0 +1,201 @@
1
+ # Spec B — Chat SSE Resilience Hotfix
2
+
3
+ **Status:** Approved
4
+ **Created:** 2026-04-08
5
+ **Scope mode:** REDUCE
6
+ **Related:** [Schedule Orchestration (Spec A)](./2026-04-08-schedule-orchestration-design.md), [Swarm Visibility (Spec C)](./2026-04-08-swarm-visibility-design.md)
7
+
8
+ ## Context
9
+
10
+ On 2026-04-08 at 12:20:49 UTC, five scheduled agents fired simultaneously and consumed ~12,600 combined turns on Claude Opus 4.6. A user sent a chat message ~66 seconds later; the SSE stream dropped mid-stream and the assistant message persisted with `content: ""` and `status: "streaming"`. The user saw the conversation "jank and reset."
11
+
12
+ This hotfix addresses the symptom — placeholder chat messages left in an empty/streaming state — independent of the underlying schedule-orchestration work (Spec A). It is a ~40 LOC defensive change that can ship in hours, in parallel with Spec A implementation.
13
+
14
+ ## Goal
15
+
16
+ Uphold the invariant:
17
+
18
+ > After `sendMessage()` returns or throws, no `chat_messages` row for that conversation remains with `status='streaming'` and `content=''`.
19
+
20
+ ## Root cause analysis
21
+
22
+ Code inspection of `src/app/api/chat/conversations/[id]/messages/route.ts` and `src/lib/chat/engine.ts` reveals four ways the invariant can be broken or left broken:
23
+
24
+ 1. **Finally-block bypass via iterator abandonment.** When the route handler consumer `break`s out of the `for await` loop (route.ts:83), the async iterator's `return()` method is invoked. In an async generator, `return()` jumps to the `finally` block, **skipping the `catch` block entirely**. Engine.ts's catch at line 644 never runs, so `updateMessageContent()` is never called. The placeholder row from engine.ts:246 stays at `content=''`.
25
+
26
+ 2. **Defensive fallback gap in error path.** engine.ts:680 writes `fullText || errorMessage`. If both are empty strings (e.g., `diagnoseProcessError()` returns empty from a blank stderr), the DB gets `content=''`.
27
+
28
+ 3. **DB write hang under contention.** Under WAL contention from concurrent schedulers, `await updateMessageContent()` in the catch path can block past the HTTP request lifetime. Next.js tears down the request before it resolves; the update never commits.
29
+
30
+ 4. **No orphan reconciliation.** Historical `streaming` rows from crashed processes or prior bugs remain visible in the UI forever.
31
+
32
+ ## Fix design
33
+
34
+ ### Change 1 — Finally-block safety net
35
+
36
+ In `src/lib/chat/engine.ts`, modify the top-level `finally` block (currently line 700, containing only `cleanupConversation(conversationId)`):
37
+
38
+ ```typescript
39
+ } finally {
40
+ try {
41
+ const current = await getMessage(assistantMsg.id);
42
+ if (current && current.status === "streaming") {
43
+ const salvage =
44
+ fullText && fullText.trim().length > 0
45
+ ? fullText
46
+ : "(Response interrupted. Please try again.)";
47
+ await updateMessageContent(assistantMsg.id, salvage);
48
+ await updateMessageStatus(
49
+ assistantMsg.id,
50
+ fullText && fullText.length > 50 ? "complete" : "error",
51
+ );
52
+ }
53
+ } catch (finalizeErr) {
54
+ console.error("[chat] finalize safety net failed:", finalizeErr);
55
+ }
56
+ cleanupConversation(conversationId);
57
+ }
58
+ ```
59
+
60
+ **Why at the finally level:** catches every code path — happy path (already `complete`, safety net is no-op), engine catch path (already wrote content, safety net is no-op), abandoned iterator path (NEW — catches the bug), generator throw path (NEW — catches the bug).
61
+
62
+ ### Change 2 — Defensive fallback in error path
63
+
64
+ At `src/lib/chat/engine.ts:680`, replace `fullText || errorMessage` with:
65
+
66
+ ```typescript
67
+ fullText || errorMessage || "(Response failed — no error detail available.)"
68
+ ```
69
+
70
+ Eliminates the empty-string write even if both sources are blank.
71
+
72
+ ### Change 3 — Truncate oversized errorMessage
73
+
74
+ Before writing `errorMessage` to the DB, truncate at 4KB:
75
+
76
+ ```typescript
77
+ const safeErrorMessage = errorMessage.length > 4096
78
+ ? errorMessage.slice(0, 4096) + "... (truncated)"
79
+ : errorMessage;
80
+ ```
81
+
82
+ Prevents bloat from multi-MB stderr dumps.
83
+
84
+ ### Change 4 — Orphan reconciliation sweep
85
+
86
+ Add a helper in `src/lib/chat/reconcile.ts` (new file):
87
+
88
+ ```typescript
89
+ export async function reconcileStreamingMessages(): Promise<number> {
90
+ const cutoff = new Date(Date.now() - 10 * 60 * 1000); // 10 minutes ago
91
+ const orphans = await db
92
+ .select()
93
+ .from(chatMessages)
94
+ .where(
95
+ and(
96
+ eq(chatMessages.status, "streaming"),
97
+ lt(chatMessages.createdAt, cutoff),
98
+ ),
99
+ );
100
+
101
+ for (const row of orphans) {
102
+ await db
103
+ .update(chatMessages)
104
+ .set({
105
+ status: "error",
106
+ content:
107
+ row.content && row.content.length > 0
108
+ ? row.content
109
+ : "(Interrupted — this response was not completed. Please retry.)",
110
+ })
111
+ .where(eq(chatMessages.id, row.id));
112
+ }
113
+
114
+ return orphans.length;
115
+ }
116
+ ```
117
+
118
+ Call from the chat conversations page loader (fire-and-forget). 10-min cutoff is far longer than any legitimate streaming duration — no risk of clobbering in-flight responses.
119
+
120
+ ### Change 5 — Route handler cleanup
121
+
122
+ In `src/app/api/chat/conversations/[id]/messages/route.ts:95-98`, wrap `controller.close()` in a try/catch so a throw during close doesn't mask earlier errors:
123
+
124
+ ```typescript
125
+ } finally {
126
+ clearInterval(keepalive);
127
+ try {
128
+ controller.close();
129
+ } catch {
130
+ // Already closed; nothing to do
131
+ }
132
+ }
133
+ ```
134
+
135
+ ## Data model changes
136
+
137
+ **None.** Uses existing schema.
138
+
139
+ ## Tests
140
+
141
+ ### Unit tests (new)
142
+
143
+ **`src/lib/chat/__tests__/engine.finalize-safety-net.test.ts`:**
144
+
145
+ 1. **Mid-stream SDK throw with partial content**: mock SDK to yield 3 chunks then throw; assert placeholder ends up with salvaged `fullText` as content and `status='complete'` (because fullText > 50 chars).
146
+ 2. **Mid-stream SDK throw with no content**: mock SDK to throw before any text; assert placeholder ends up with fallback string and `status='error'`.
147
+ 3. **Empty errorMessage AND empty fullText**: mock `diagnoseProcessError` to return empty and SDK to throw immediately; assert the line-680 fallback string is written, never `''`.
148
+ 4. **Iterator abandonment (consumer break)**: mock consumer that breaks on first yield; assert finally-block safety net salvages the row even though catch didn't run.
149
+ 5. **Happy path no-op**: mock SDK to complete normally; assert finally-block safety net sees `status='complete'` and does nothing.
150
+
151
+ **`src/lib/chat/__tests__/reconcile.test.ts`:**
152
+
153
+ 6. **20-min-old streaming row**: seed a row with `status='streaming'`, `createdAt = now - 20min`; assert reconcile marks it `error` with fallback content.
154
+ 7. **30-sec-old streaming row**: seed a row with `status='streaming'`, `createdAt = now - 30s`; assert reconcile leaves it untouched.
155
+ 8. **Partial content preservation**: seed a row with `status='streaming'`, `content='Hello wor'`, old timestamp; assert reconcile preserves the partial content, marks `error`.
156
+
157
+ ### Integration
158
+
159
+ 9. **Manual repro**: open chat, start a long prompt, send `SIGSTOP` to Next.js mid-stream for 15s, resume → assert assistant message ends finalized (never `streaming`/`content=''`).
160
+ 10. **Spec A interaction**: after Spec A lands, fire 5 schedules via `POST /api/schedules/:id/execute?force=true`, send a chat message, force the SSE to drop → assert no `chat_messages` row with `content=''` remains.
161
+
162
+ ## Error & Rescue Registry
163
+
164
+ | Error | Trigger | Impact | Rescue |
165
+ |---|---|---|---|
166
+ | Finalize safety-net DB write itself fails | Disk full, WAL locked | Placeholder stays empty (regression) | `try/catch` around the finalize block; log to console; `cleanupConversation` still runs |
167
+ | `getMessage()` returns undefined in finally | Race with delete | TypeError | Null-check (`if (current && ...)`) |
168
+ | Orphan sweep marks a legitimate in-flight row as `error` | 10-min window too tight | User sees interrupted message falsely | Use 10 min (far longer than any real SDK turn); monitor sweep hits post-ship |
169
+ | `errorMessage` is a multi-MB stderr dump | `diagnoseProcessError` returns huge string | Bloated chat_messages row | Truncate at 4KB (Change 3) |
170
+ | Reconcile runs concurrently with a new message | Race between page load and new send | Double-write | Reconcile's UPDATE is idempotent; only touches rows matching `status='streaming' AND createdAt < cutoff` |
171
+ | `controller.close()` throws in finally | Stream already closed by peer | Unhandled rejection | try/catch (Change 5) |
172
+
173
+ ## NOT in scope (deferred)
174
+
175
+ - **SSE client-side reconnect / replay from last event ID** — future spec "Chat Streaming v2"
176
+ - **Heartbeat-based client timeout detection** — future spec "Chat Streaming v2"
177
+ - **Moving chat off the shared Node event loop** (worker isolation) — addressed by Spec A's concurrency cap instead
178
+ - **Refactor of `diagnoseProcessError()`** — use fallback string at call site instead
179
+ - **Adding `lastHeartbeatAt` column for more precise orphan detection** — defer until 10-min cutoff proves insufficient
180
+
181
+ ## Files touched
182
+
183
+ - `src/lib/chat/engine.ts` — finally block (Change 1), error-path fallback (Change 2), truncation (Change 3)
184
+ - `src/app/api/chat/conversations/[id]/messages/route.ts` — controller.close try/catch (Change 5)
185
+ - `src/lib/chat/reconcile.ts` — NEW file with `reconcileStreamingMessages()` (Change 4)
186
+ - `src/app/chat/page.tsx` — call `reconcileStreamingMessages()` in loader (fire-and-forget)
187
+ - `src/lib/chat/__tests__/engine.finalize-safety-net.test.ts` — NEW
188
+ - `src/lib/chat/__tests__/reconcile.test.ts` — NEW
189
+
190
+ ## Verification
191
+
192
+ 1. All new unit tests pass.
193
+ 2. Full chat test suite regression green.
194
+ 3. Manual SIGSTOP repro (test 9 above) shows no orphaned `streaming` rows.
195
+ 4. Post-ship query: `SELECT COUNT(*) FROM chat_messages WHERE content='' AND status IN ('streaming','pending')` stays at 0 after first full chat page reload.
196
+
197
+ ## Ship plan
198
+
199
+ - No feature flag — hotfix is unconditional safety.
200
+ - Ships independently of Spec A (zero shared code).
201
+ - Ship as a standalone PR; commit separately from orchestration work for clean bisect-ability.
@@ -0,0 +1,371 @@
1
+ # Spec A — Schedule Orchestration
2
+
3
+ **Status:** Approved
4
+ **Created:** 2026-04-08
5
+ **Scope mode:** HOLD (maximum rigor)
6
+ **Related:** [Chat SSE Resilience Hotfix (Spec B)](./2026-04-08-chat-sse-resilience-hotfix-design.md), [Swarm Visibility (Spec C)](./2026-04-08-swarm-visibility-design.md)
7
+
8
+ ## Context
9
+
10
+ On 2026-04-08 at 12:20:49 UTC, five scheduled agents fired simultaneously and consumed ~12,600 combined turns on Claude Opus 4.6 via the claude-code runtime. The concurrent load saturated the single Node.js event loop that hosts both scheduled tasks and interactive chat. A user's chat message sent at 12:21:55 dropped its SSE stream and persisted with `content: ""`.
11
+
12
+ The root cause is twofold: (1) schedules fire independently with no concurrency control beyond a title-pattern sibling guard, and (2) in-prompt instructions like "MAX 18 turns" are model hints, not runtime-enforced limits. This spec introduces a global concurrency cap, per-schedule turn budgets, lease-based timeouts, a minimal collision warning, and time-series metrics for evidence-based tuning.
13
+
14
+ **Key codebase discoveries that shape this design:**
15
+
16
+ 1. **"Turn" = one SDK assistant message.** `src/lib/agents/claude-agent.ts:181` increments `turnCount` on `message.type === "assistant"`. No runtime enforcement today.
17
+ 2. **Active execution tracking already exists.** `src/lib/agents/execution-manager.ts:14-62` maintains a `Map<taskId, RunningExecution>` with `getAllExecutions()`.
18
+ 3. **Scheduler already atomically claims schedules** at `src/lib/schedules/scheduler.ts:238-252` via conditional UPDATE, and serializes drain via `drainQueue()` at line 51. The existing `.then(drainQueue)` chain at line 420 runs concurrently with the tick loop — any new coordination primitive must be correct under that interleaving.
19
+ 4. **Turn budget header infrastructure exists.** `buildTurnBudgetHeader()` at claude-agent.ts:103 reads a global `MAX_TURNS` setting.
20
+ 5. **Failure detection + auto-pause already shipped.** `detectFailureReason()` at scheduler.ts:122 parses error text; auto-pause after 3-streak exists.
21
+
22
+ ## Goals
23
+
24
+ 1. **Prevent concurrent schedule overload from starving chat.** No more than `SCHEDULE_MAX_CONCURRENT` scheduled agents run simultaneously.
25
+ 2. **Enforce per-schedule turn budgets at runtime**, not via prompt hints.
26
+ 3. **Prevent permanent lock holder hangs.** Every slot and lock carries a lease; a reaper cleans expired leases.
27
+ 4. **Give users pre-flight awareness of cron overlaps** without forcing them to auto-stagger.
28
+ 5. **Collect enough telemetry to tune the concurrency cap from evidence**, not intuition.
29
+
30
+ ## Non-goals (NOT in scope)
31
+
32
+ These are deferred to follow-up specs to keep the initial ship focused and de-risked:
33
+
34
+ - **`concurrencyGroup` column and group locks** — future spec "Schedule Concurrency Groups". The incident was a global-cap problem, not a group problem.
35
+ - **Auto-stagger endpoint, 48h forecast report, collision-forecast notifications** — future spec "Schedule Predictability & Forecasting".
36
+ - **Turn drift detection, efficiency scoring (`useful_actions / total_turns`)** — future spec "Schedule Observability".
37
+ - **`turnBudgetAction: 'optimize'` meta-agent prompt rewriter** — future spec "Agent Self-Optimization".
38
+ - **Hard chat priority / `pauseSchedulesDuringChat` setting** — only if the AR1b soft pressure signal (below) proves insufficient.
39
+ - **Dynamic adaptive cap** based on measured P99 chat latency — the architect explicitly recommended against it until the static cap proves insufficient.
40
+ - **`usage_ledger.turn_count` column** — derivable from `schedule_firing_metrics` and `agent_logs`.
41
+
42
+ ## Design
43
+
44
+ ### A.1 Concurrency limiter
45
+
46
+ **Cap:** `SCHEDULE_MAX_CONCURRENT` env var, default **2** for initial ship. Raise to 3 after one week of telemetry validates chat SSE P99 under load.
47
+
48
+ **Primitive:** atomic single-SQL conditional UPDATE. Check-then-act is forbidden — the tick loop and `drainQueue()` run concurrently via the `.then()` chain at scheduler.ts:420, and a `SELECT count(*) ... then fire` sequence will allow two callers to both see `activeCount < cap` and both fire.
49
+
50
+ Correct claim:
51
+
52
+ ```sql
53
+ UPDATE tasks
54
+ SET status = 'running',
55
+ slot_claimed_at = :now,
56
+ lease_expires_at = :now + :leaseSec
57
+ WHERE id = :taskId
58
+ AND status = 'queued'
59
+ AND (SELECT COUNT(*)
60
+ FROM tasks
61
+ WHERE status = 'running'
62
+ AND source_type = 'scheduled') < :cap;
63
+ ```
64
+
65
+ `changes = 1` → proceed to `executeTaskWithRuntime()`. `changes = 0` → leave the task in `queued` state; `drainQueue()` will retry it after the next completion.
66
+
67
+ The primitive lives in a new helper `src/lib/schedules/slot-claim.ts` and is called by:
68
+ - `fireSchedule()` in scheduler.ts at line 412 (replace direct `executeTaskWithRuntime` call with `claimSlotThenExecute`)
69
+ - `drainQueue()` in scheduler.ts at line 74 (same)
70
+ - `POST /api/schedules/:id/execute` route handler (honors cap by default — see A.1.1)
71
+
72
+ ### A.1.1 Manual execute
73
+
74
+ `POST /api/schedules/:id/execute` honors the cap by default. Behavior:
75
+
76
+ - **Slot available:** claim and run normally.
77
+ - **Cap full:** return `429 Too Many Requests` with body `{ error: 'capacity_full', slotEtaSec: N }` where `N` is the minimum `lease_expires_at - now()` across running slots.
78
+ - **Explicit bypass:** the `?force=true` query parameter bypasses the cap and writes an audit-log entry to `usage_ledger` with `activityType = 'manual_force_bypass'`. The UI shows a confirmation modal before sending a forced request (handled client-side).
79
+
80
+ This closes the footgun where a user clicking "Run now" five times in 2 seconds could spawn five concurrent Opus runs.
81
+
82
+ ### A.1.2 Chat soft pressure signal (AR1b)
83
+
84
+ An in-memory `activeChatStreams: Set<string>` lives in a new `src/lib/chat/active-streams.ts`. The chat engine at `src/lib/chat/engine.ts` adds to the set at stream start and removes at stream end (in the finally block — safe because Spec B already guarantees finally runs).
85
+
86
+ `tickScheduler()` calls `applyChatPressure()` before processing due schedules: if `activeChatStreams.size > 0`, any schedule whose `nextFireAt` is due gets its `nextFireAt` pushed forward by `SCHEDULE_CHAT_PRESSURE_DELAY_SEC` (default 30s) and skipped this tick. In-flight scheduled runs are not affected.
87
+
88
+ This is a soft signal, not a hard block — chat never starves schedules indefinitely because the delay is per-tick and one-shot.
89
+
90
+ ### A.2 Lease + timeout + reaper
91
+
92
+ Every claimed slot carries a lease. The reaper runs at each `tickScheduler()` pass (60s cadence) and reaps expired leases.
93
+
94
+ **Schema additions to `tasks`:**
95
+ - `slot_claimed_at TIMESTAMP` — set atomically with the slot claim
96
+ - `lease_expires_at TIMESTAMP` — `slot_claimed_at + max_run_duration_sec`
97
+ - `failure_reason TEXT` — written explicitly by runtime adapter at terminal transitions
98
+
99
+ **Schema additions to `schedules`:**
100
+ - `max_run_duration_sec INTEGER DEFAULT NULL` — NULL inherits global default (1200s = 20 min)
101
+
102
+ **Reaper query:**
103
+
104
+ ```sql
105
+ SELECT id FROM tasks
106
+ WHERE status = 'running'
107
+ AND source_type = 'scheduled'
108
+ AND lease_expires_at < :now;
109
+ ```
110
+
111
+ For each expired task: call `abortController.abort()` via the `RunningExecution` map at `execution-manager.ts:5`, then `UPDATE tasks SET status='failed', failure_reason='lease_expired', completed_at=:now`. The slot is freed automatically by the status change (the claim SQL counts `status='running'` rows).
112
+
113
+ **Runtime adapter change:** thread `AbortSignal` from `RunningExecution.abortController` into the SDK `query()` options in the scheduled runtime adapter. Chat already does this at `src/lib/chat/engine.ts:300`; mirror the pattern.
114
+
115
+ ### A.3 Turn budget
116
+
117
+ **Schema addition to `schedules`:**
118
+
119
+ ```sql
120
+ ALTER TABLE schedules ADD COLUMN max_turns INTEGER DEFAULT NULL
121
+ CHECK (max_turns IS NULL OR (max_turns BETWEEN 1 AND 10000));
122
+ ALTER TABLE schedules ADD COLUMN max_turns_set_at TIMESTAMP;
123
+ ALTER TABLE schedules ADD COLUMN turn_budget_breach_streak INTEGER DEFAULT 0;
124
+ ```
125
+
126
+ NULL `max_turns` inherits from the global `MAX_TURNS` setting already read by `buildTurnBudgetHeader()`.
127
+
128
+ **Enforcement:** pass `maxTurns` to SDK `query()` options in the scheduled runtime adapter. The SDK hard-stops at the limit (same mechanism chat uses at engine.ts:299).
129
+
130
+ **On breach — footgun-mitigated flow:**
131
+
132
+ 1. **First-breach grace:** if `tasks.completed_at < schedules.max_turns_set_at + 2 × cron_interval`, the breach is logged only — it does not increment `turn_budget_breach_streak`. Protects users from tripping auto-pause on the very first firing after a config edit.
133
+ 2. **Drift warning at streak ≥ 2:** send a notification: "Schedule X used {lastTurnCount}/{maxTurns} agent steps. Consider raising the budget or reducing the prompt scope."
134
+ 3. **Auto-pause at streak ≥ 5** (higher than generic failure's 3): "Schedule X paused — 5 consecutive runs exceeded the {N}-step budget. Budget may be too low; typical runs use {avgTurnsPerFiring} steps."
135
+
136
+ The separate `turn_budget_breach_streak` counter is critical: conflating budget breaches with generic failures would let a user trip auto-pause after just 3 firings by setting `maxTurns=10` on a schedule that averages 40.
137
+
138
+ **Explicit `failure_reason` writes:** the runtime adapter writes `failure_reason` directly at terminal transitions (`turn_limit_exceeded`, `lease_expired`, `sdk_error`, `aborted`, etc.). `detectFailureReason()` at scheduler.ts:122 remains as a fallback for legacy or unknown cases but is no longer the primary classifier, because string-matching on error text is fragile.
139
+
140
+ ### A.4 UI: rename + tooltips + calibration hint (PM recommendation)
141
+
142
+ - **Schedule form field rename:** "Max turns per firing" → **"Max agent steps per run"**. Keep `maxTurns` in code/API.
143
+ - **Tooltip on field:** "One step = one agent action (message, tool call, or sub-response). Most schedules use 50–500 steps; heavy research runs 2,000+."
144
+ - **Tooltip on prompt field:** "Note: writing 'MAX N turns' in your prompt is a hint to the model, not a runtime limit. Use Max agent steps below to enforce a budget."
145
+ - **Inline calibration hint:** when a user types a prompt, show "Schedules like this average ~{N} steps" derived from `avgTurnsPerFiring` across schedules with similar characteristics. Cheap — data already exists.
146
+
147
+ ### A.5 Collision warning (PR1b — minimal, restored to scope)
148
+
149
+ **Trigger:** `POST /api/schedules` and `PUT /api/schedules/:id`.
150
+
151
+ **Check:** expand the incoming `cronExpression` over the next 24h using the existing cron parser at `src/lib/schedules/interval-parser.ts`. Bucket fire times by 5-minute windows. Compare against all other active schedules in the same project. If any 5-min bucket contains ≥2 schedules whose combined `avgTurnsPerFiring` exceeds 3000, return a warning.
152
+
153
+ **Response shape:** `200 OK` with the saved schedule plus:
154
+
155
+ ```json
156
+ {
157
+ "warnings": [{
158
+ "type": "cron_collision",
159
+ "overlappingSchedules": ["Price Monitor", "News Sentinel"],
160
+ "nextCollisionAt": "2026-04-09T12:20:00Z",
161
+ "estimatedConcurrentSteps": 6878
162
+ }]
163
+ }
164
+ ```
165
+
166
+ **UI:** the create/edit sheet renders a dismissible amber banner inside `SheetContent` (with `px-6 pb-6` per the recurring shadcn Sheet padding issue logged in MEMORY.md). Copy: "This schedule overlaps with Price Monitor and News Sentinel at {time}. They'll take turns; the last to run may be delayed ~2–4 min." One action: "[Save anyway]".
167
+
168
+ **Non-blocking:** the warning does not prevent save. It informs.
169
+
170
+ **Deferred:** auto-stagger endpoint, 48h forecast, collision-forecast notifications.
171
+
172
+ ### A.6 Time-series metrics (AR3b)
173
+
174
+ New table `schedule_firing_metrics` for evidence-based cap tuning and post-hoc incident forensics. A single-row EMA (exponential moving average) would erase the per-firing signal we need, so each firing gets its own row.
175
+
176
+ ```sql
177
+ CREATE TABLE schedule_firing_metrics (
178
+ id TEXT PRIMARY KEY,
179
+ schedule_id TEXT NOT NULL REFERENCES schedules(id),
180
+ task_id TEXT REFERENCES tasks(id),
181
+ fired_at TIMESTAMP NOT NULL,
182
+ slot_claimed_at TIMESTAMP,
183
+ completed_at TIMESTAMP,
184
+ slot_wait_ms INTEGER, -- fired_at → slot_claimed_at
185
+ duration_ms INTEGER, -- slot_claimed_at → completed_at
186
+ turn_count INTEGER,
187
+ max_turns_at_firing INTEGER,
188
+ event_loop_lag_ms REAL, -- perf_hooks.monitorEventLoopDelay p99 during run
189
+ peak_rss_mb INTEGER,
190
+ chat_streams_active INTEGER, -- count at slot claim
191
+ concurrent_schedules INTEGER, -- count at slot claim
192
+ failure_reason TEXT
193
+ );
194
+ CREATE INDEX idx_sfm_schedule_time ON schedule_firing_metrics(schedule_id, fired_at DESC);
195
+ ```
196
+
197
+ Insert a row in `recordFiringMetrics()` at scheduler.ts:419, on every completion (success or failure).
198
+
199
+ **Critical:** add matching bootstrap `CREATE TABLE IF NOT EXISTS` in `src/lib/db/index.ts` (per CLAUDE.md's recurring-issue note about bootstrap vs migrations). Also add `db.delete()` call in `src/lib/data/clear.ts` in FK-safe order (delete from `schedule_firing_metrics` before `schedules`).
200
+
201
+ ### A.7 Data model — consolidated
202
+
203
+ ```sql
204
+ -- schedules table
205
+ ALTER TABLE schedules ADD COLUMN max_turns INTEGER DEFAULT NULL
206
+ CHECK (max_turns IS NULL OR (max_turns BETWEEN 1 AND 10000));
207
+ ALTER TABLE schedules ADD COLUMN max_turns_set_at TIMESTAMP;
208
+ ALTER TABLE schedules ADD COLUMN max_run_duration_sec INTEGER DEFAULT NULL;
209
+ ALTER TABLE schedules ADD COLUMN turn_budget_breach_streak INTEGER DEFAULT 0;
210
+
211
+ -- tasks table
212
+ ALTER TABLE tasks ADD COLUMN slot_claimed_at TIMESTAMP;
213
+ ALTER TABLE tasks ADD COLUMN lease_expires_at TIMESTAMP;
214
+ ALTER TABLE tasks ADD COLUMN failure_reason TEXT;
215
+ CREATE INDEX idx_tasks_slot_running
216
+ ON tasks(status, source_type, lease_expires_at)
217
+ WHERE status = 'running';
218
+
219
+ -- schedule_firing_metrics (new)
220
+ -- [see A.6]
221
+ ```
222
+
223
+ **Settings (existing key-value table, no schema change):**
224
+ - `schedule.maxConcurrent` default: `2`
225
+ - `schedule.maxRunDurationSec` default: `1200`
226
+ - `schedule.chatPressureDelaySec` default: `30`
227
+
228
+ ### A.8 API surface
229
+
230
+ | Method | Path | Change |
231
+ |---|---|---|
232
+ | POST | `/api/schedules` | Response includes `warnings: [...]` from collision check |
233
+ | PUT | `/api/schedules/:id` | Same |
234
+ | POST | `/api/schedules/:id/execute` | Honors global cap by default; `?force=true` bypasses with audit log; returns `429 {error, slotEtaSec}` when full |
235
+ | GET | `/api/schedules/:id/metrics` | Returns recent `schedule_firing_metrics` rows for tuning/debug |
236
+
237
+ No new endpoints for orchestration proper. `/api/swarm-status` is defined in Spec C.
238
+
239
+ ## Data flow — scheduler tick + slot claim
240
+
241
+ ```
242
+ tickScheduler() (every 60s)
243
+ |
244
+ v
245
+ ┌──────────────────────────────┐
246
+ │ reapExpiredLeases() │ ── abort via RunningExecution
247
+ │ UPDATE tasks SET status= │ + mark lease_expired
248
+ │ 'failed' WHERE status= │
249
+ │ 'running' AND │
250
+ │ lease_expires_at < now() │
251
+ └──────────────┬───────────────┘
252
+ |
253
+ v
254
+ ┌──────────────────────────────┐
255
+ │ findDueSchedules() │
256
+ │ SELECT ... WHERE │
257
+ │ next_fire_at <= now() │
258
+ └──────────────┬───────────────┘
259
+ |
260
+ v
261
+ ┌──────────────────────────────┐
262
+ │ applyChatPressure() [AR1b] │
263
+ │ if activeChatStreams > 0: │
264
+ │ push nextFireAt +30s, │
265
+ │ skip this tick │
266
+ └──────────────┬───────────────┘
267
+ |
268
+ v
269
+ ┌──────────────────────────────┐
270
+ │ for each due schedule: │
271
+ │ insertQueuedTask() │
272
+ │ atomicSlotClaim() ◄──────┐│
273
+ │ UPDATE tasks SET ││
274
+ │ status='running', ││ (single SQL, guarantees cap)
275
+ │ slot_claimed_at=now(),││
276
+ │ lease_expires_at=... ││
277
+ │ WHERE id=? AND ││
278
+ │ status='queued' AND ││
279
+ │ (SELECT COUNT(*) ││
280
+ │ FROM tasks WHERE ││
281
+ │ status='running' ││
282
+ │ AND source_type= ││
283
+ │ 'scheduled') < :cap ││
284
+ │ if changes=0: ││
285
+ │ leave in queued, ││
286
+ │ drain will retry ││
287
+ │ if changes=1: ││
288
+ │ executeTaskWithRuntime ││
289
+ │ .then(recordMetrics) ││
290
+ │ .then(drainQueue) ─────┘│
291
+ └──────────────────────────────┘
292
+ ```
293
+
294
+ ## Error & Rescue Registry
295
+
296
+ | Error | Trigger | Impact | Rescue |
297
+ |---|---|---|---|
298
+ | Two ticks race on slot claim | `drainQueue()` concurrent with `tickScheduler()` | Cap breached (3 running when cap=2) | Atomic single-SQL claim (A.1); `changes=0` means lost the race — leave in queued |
299
+ | SDK hangs mid-run | Upstream Opus stall, network partition | Slot held forever, cap permanently reduced | Lease expiry + reaper aborts via AbortController after `max_run_duration_sec` |
300
+ | Reaper fails to fire | `tickScheduler` crashes or paused | Expired leases accumulate | Reaper is idempotent; runs at next tick. If tickScheduler itself is down, `bootstrapNextFireTimes` at startup repairs state |
301
+ | User sets `maxTurns=10` on schedule averaging 40 | Config footgun | Auto-pause in 3 firings (under naive design) | First-breach grace + separate `turn_budget_breach_streak` counter with threshold 5 + drift warning at streak 2 |
302
+ | `detectFailureReason` misclassifies | SDK error message format changes | Wrong streak incremented | Runtime adapter writes explicit `failure_reason` at terminal transitions; string-match is fallback only |
303
+ | Manual execute spammed | User double-clicks Run now 5× | Could exceed cap under naive design | Manual honors cap by default; `429 + slotEtaSec`; explicit `?force=true` for deliberate bypass with audit log |
304
+ | Chat pressure delay causes schedule to miss a cron interval | User has `* * * * *` cron, chat is streaming for 45s | Minute skipped | 30s delay is one-shot per tick; next tick re-evaluates. Document in UI help text |
305
+ | `schedule_firing_metrics` table unbounded growth | High-frequency schedules over months | Disk bloat | Periodic cleanup: `DELETE WHERE fired_at < now() - 30 days`. Deferred to follow-up if general maintenance sweep doesn't exist yet |
306
+ | Clock skew between scheduler and DB | Container restart, NTP drift | `lease_expires_at` mismatches | Use SQLite `CURRENT_TIMESTAMP` consistently; avoid mixing JS `Date.now()` |
307
+ | Collision check false positive under chat pressure | A delayed schedule shifts into a bucket that was previously clear | Confusing warning | Collision check runs against *nominal* cron expansion, not chat-pressure-adjusted times. Warning remains deterministic |
308
+ | Cap env var typo | User sets `SCHEDULE_MAX_CONCURRENT=abc` | Silent fallback to default | Parse with `parseInt`, log warning on NaN, use default; add settings-page validation UI |
309
+ | Lease expiry fires during a legitimate long run | Schedule takes 25 min, default lease 20 min | Run aborted falsely | Per-schedule `max_run_duration_sec` override; drift warning at 80% of lease |
310
+
311
+ ## Telemetry / 48h post-ship watchlist
312
+
313
+ 1. `SELECT COUNT(*) FROM chat_messages WHERE content='' AND status IN ('streaming','pending')` — must be 0 (Spec B success signal)
314
+ 2. `schedule_firing_metrics.slot_wait_ms` — p50/p95 per schedule. If p95 > 300,000 ms (300 s), the cap is too tight
315
+ 3. `schedules.failure_streak >= 3` count — auto-pause rate vs baseline
316
+ 4. `schedules.turn_budget_breach_streak > 0` count — tracks `maxTurns` misconfig rate
317
+ 5. `schedule_firing_metrics.failure_reason = 'lease_expired'` count — indicates timeouts too tight or SDK hangs
318
+ 6. Chat SSE completion rate (`status='complete'` / total) — must stay at or above pre-incident baseline
319
+ 7. `schedule_firing_metrics.event_loop_lag_ms` p99 — validates/falsifies cap=2; if always <50ms, raise to 3
320
+ 8. Collision-warning acceptance rate (how often users save despite warning)
321
+ 9. Manual `?force=true` bypass frequency — should be near-zero; alert if >5/week
322
+
323
+ ## TDRs to capture
324
+
325
+ Create in `.claude/skills/architect/references/`:
326
+
327
+ 1. **TDR: Concurrency slot claim is a single SQL statement, not check-then-act.** References the 2026-04-08 incident.
328
+ 2. **TDR: Scheduler cap is static and evidence-based.** Changes require re-running the load test.
329
+ 3. **TDR: Auto-pause streak counts per failure class.** Forces future failure modes to reason about whether they feed the generic streak or a dedicated one.
330
+ 4. **TDR: Manual execute honors the global cap by default.** Operational controls prefer safety over convenience.
331
+ 5. **TDR: All lock holders carry lease expiries + reapers.** Generalize beyond concurrency slots.
332
+ 6. **TDR: Chat and scheduled agents compete for the same Node event loop.** Architectural constraint; future features must not starve chat.
333
+
334
+ ## Tests
335
+
336
+ 1. **Race-condition test:** spawn 10 concurrent `fireSchedule` calls against cap=3; assert exactly 3 slots claimed, no breach.
337
+ 2. **Lease reaper test:** set tiny lease, trigger run, wait, assert reaper marks `failed`/`lease_expired` and frees slot.
338
+ 3. **Turn budget enforcement test:** `maxTurns=5`, prompt that needs 50 turns, assert SDK hard-stops and `turn_budget_breach_streak` increments.
339
+ 4. **First-breach grace test:** set new `maxTurns`, first firing breaches, assert streak stays at 0.
340
+ 5. **Manual execute cap test:** fill cap, POST execute, assert 429 + `slotEtaSec`. POST with `?force=true`, assert 200 + audit log entry.
341
+ 6. **Chat pressure test:** start fake chat stream, trigger scheduler tick, assert due schedules get `next_fire_at` pushed forward 30s.
342
+ 7. **Collision warning test:** create overlapping cron, assert `warnings` array populated.
343
+ 8. **Load test (validation of cap):** 5 schedules × 500-turn dummy prompts, measure chat SSE P99 first-token with cap=2. Assert P99 < 2s.
344
+ 9. **Incident reproduction:** fire 5 real schedules with the default cap=2 → 2 run, 3 wait in the queue → chat message sent → chat SSE stays responsive → no `content=''` row.
345
+
346
+ ## Files touched
347
+
348
+ ### Modify
349
+ - `src/lib/schedules/scheduler.ts` — tick loop, drain queue, reaper, firing metrics recording, chat pressure application
350
+ - `src/lib/agents/execution-manager.ts` — abortController surface (already exists, just wire)
351
+ - `src/lib/agents/claude-agent.ts` — runtime adapter turn budget + failure reason writes
352
+ - `src/lib/db/schema.ts` — new columns + table
353
+ - `src/lib/db/index.ts` — bootstrap CREATE TABLE IF NOT EXISTS for `schedule_firing_metrics`
354
+ - `src/lib/data/clear.ts` — add delete for new table (FK-ordered)
355
+ - `src/lib/schedules/interval-parser.ts` — reuse for collision check
356
+ - `src/app/api/schedules/route.ts` + `[id]/route.ts` — collision warning response shape
357
+ - `src/app/api/schedules/[id]/execute/route.ts` — cap check + force bypass
358
+ - `src/components/schedules/schedule-form.tsx` — new "Max agent steps" field + rename + tooltip + calibration hint
359
+
360
+ ### New
361
+ - `src/lib/schedules/slot-claim.ts` — atomic primitive
362
+ - `src/lib/chat/active-streams.ts` — in-memory set for chat pressure signal
363
+ - `src/lib/schedules/collision-check.ts` — 24h cron expansion + bucket compare
364
+ - `.claude/skills/architect/references/tdr-*.md` — 6 new TDRs
365
+
366
+ ## Ship plan
367
+
368
+ - **Feature flag:** `SCHEDULE_MAX_CONCURRENT` env var, default 2. Raise it via override once post-ship telemetry supports a higher cap.
369
+ - **Parallel with Spec B** — zero shared code; Spec B is a separate commit/PR.
370
+ - **After 1 week of telemetry:** raise cap from 2 → 3 if metrics healthy.
371
+ - **Update `features/roadmap.md`** post-ship with a "Schedule Orchestration Resilience" subsection including A/B/C completed entries plus future `schedule-collision-prevention` and `schedule-forecasting` entries.