stagent 0.9.5 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (277) hide show
  1. package/README.md +5 -42
  2. package/dist/cli.js +42 -18
  3. package/docs/.coverage-gaps.json +13 -55
  4. package/docs/.last-generated +1 -1
  5. package/docs/features/provider-runtimes.md +4 -0
  6. package/docs/features/schedules.md +32 -4
  7. package/docs/features/settings.md +28 -5
  8. package/docs/features/tables.md +9 -2
  9. package/docs/features/workflows.md +10 -4
  10. package/docs/journeys/developer.md +15 -1
  11. package/docs/journeys/personal-use.md +21 -4
  12. package/docs/superpowers/plans/2026-04-07-instance-bootstrap.md +1691 -0
  13. package/docs/superpowers/plans/2026-04-08-schedule-orchestration.md +2983 -0
  14. package/docs/superpowers/plans/2026-04-11-schedule-maxturns-api-control.md +551 -0
  15. package/docs/superpowers/plans/2026-04-11-task-create-profile-validation.md +864 -0
  16. package/docs/superpowers/plans/2026-04-11-task-runtime-stagent-mcp-injection.md +739 -0
  17. package/docs/superpowers/specs/2026-04-08-chat-sse-resilience-hotfix-design.md +201 -0
  18. package/docs/superpowers/specs/2026-04-08-schedule-orchestration-design.md +371 -0
  19. package/docs/superpowers/specs/2026-04-08-swarm-visibility-design.md +213 -0
  20. package/package.json +3 -2
  21. package/src/__tests__/instrumentation-smoke.test.ts +15 -0
  22. package/src/app/analytics/page.tsx +1 -21
  23. package/src/app/api/chat/conversations/[id]/messages/route.ts +22 -1
  24. package/src/app/api/diagnostics/chat-streams/route.ts +65 -0
  25. package/src/app/api/instance/config/route.ts +41 -0
  26. package/src/app/api/instance/init/route.ts +34 -0
  27. package/src/app/api/instance/upgrade/check/route.ts +26 -0
  28. package/src/app/api/instance/upgrade/route.ts +96 -0
  29. package/src/app/api/instance/upgrade/status/route.ts +35 -0
  30. package/src/app/api/memory/route.ts +0 -11
  31. package/src/app/api/notifications/route.ts +4 -2
  32. package/src/app/api/projects/[id]/route.ts +5 -155
  33. package/src/app/api/projects/__tests__/delete-project.test.ts +10 -19
  34. package/src/app/api/schedules/[id]/execute/route.ts +111 -0
  35. package/src/app/api/schedules/[id]/route.ts +9 -1
  36. package/src/app/api/schedules/__tests__/execute-route.test.ts +118 -0
  37. package/src/app/api/schedules/route.ts +3 -12
  38. package/src/app/api/settings/openai/login/route.ts +22 -0
  39. package/src/app/api/settings/openai/logout/route.ts +7 -0
  40. package/src/app/api/settings/openai/route.ts +21 -1
  41. package/src/app/api/settings/providers/route.ts +35 -8
  42. package/src/app/api/tables/[id]/enrich/__tests__/route.test.ts +153 -0
  43. package/src/app/api/tables/[id]/enrich/plan/route.ts +98 -0
  44. package/src/app/api/tables/[id]/enrich/route.ts +147 -0
  45. package/src/app/api/tables/[id]/enrich/runs/route.ts +25 -0
  46. package/src/app/api/tasks/[id]/execute/route.ts +0 -21
  47. package/src/app/api/workflows/[id]/resume/route.ts +59 -0
  48. package/src/app/api/workflows/[id]/status/route.ts +22 -8
  49. package/src/app/api/workspace/context/route.ts +2 -0
  50. package/src/app/api/workspace/fix-data-dir/route.ts +81 -0
  51. package/src/app/chat/page.tsx +11 -0
  52. package/src/app/inbox/page.tsx +12 -5
  53. package/src/app/layout.tsx +42 -21
  54. package/src/app/page.tsx +0 -2
  55. package/src/app/settings/page.tsx +6 -9
  56. package/src/components/chat/__tests__/chat-session-provider.test.tsx +408 -0
  57. package/src/components/chat/chat-command-popover.tsx +2 -2
  58. package/src/components/chat/chat-input.tsx +2 -3
  59. package/src/components/chat/chat-session-provider.tsx +720 -0
  60. package/src/components/chat/chat-shell.tsx +92 -401
  61. package/src/components/instance/__tests__/instance-section.test.tsx +125 -0
  62. package/src/components/instance/instance-section.tsx +382 -0
  63. package/src/components/instance/upgrade-badge.tsx +219 -0
  64. package/src/components/notifications/__tests__/batch-proposal-review.test.tsx +95 -0
  65. package/src/components/notifications/__tests__/notification-item.test.tsx +106 -0
  66. package/src/components/notifications/batch-proposal-review.tsx +20 -5
  67. package/src/components/notifications/inbox-list.tsx +11 -2
  68. package/src/components/notifications/notification-item.tsx +56 -2
  69. package/src/components/notifications/pending-approval-host.tsx +56 -37
  70. package/src/components/schedules/schedule-create-sheet.tsx +19 -1
  71. package/src/components/schedules/schedule-edit-sheet.tsx +20 -1
  72. package/src/components/schedules/schedule-form.tsx +31 -0
  73. package/src/components/settings/__tests__/providers-runtimes-section.test.tsx +149 -0
  74. package/src/components/settings/auth-method-selector.tsx +19 -4
  75. package/src/components/settings/auth-status-badge.tsx +28 -3
  76. package/src/components/settings/openai-chatgpt-auth-control.tsx +278 -0
  77. package/src/components/settings/openai-runtime-section.tsx +7 -1
  78. package/src/components/settings/providers-runtimes-section.tsx +138 -19
  79. package/src/components/shared/app-sidebar.tsx +4 -3
  80. package/src/components/shared/command-palette.tsx +4 -5
  81. package/src/components/shared/theme-toggle.tsx +5 -24
  82. package/src/components/shared/workspace-indicator.tsx +61 -2
  83. package/src/components/tables/__tests__/table-enrichment-sheet.test.tsx +130 -0
  84. package/src/components/tables/table-create-sheet.tsx +4 -0
  85. package/src/components/tables/table-enrichment-runs.tsx +103 -0
  86. package/src/components/tables/table-enrichment-sheet.tsx +538 -0
  87. package/src/components/tables/table-spreadsheet.tsx +29 -5
  88. package/src/components/tables/table-toolbar.tsx +10 -1
  89. package/src/components/tasks/kanban-board.tsx +1 -0
  90. package/src/components/tasks/kanban-column.tsx +53 -14
  91. package/src/components/tasks/task-bento-grid.tsx +19 -0
  92. package/src/components/tasks/task-card.tsx +26 -3
  93. package/src/components/tasks/task-chip-bar.tsx +24 -0
  94. package/src/components/tasks/task-result-renderer.tsx +1 -1
  95. package/src/components/workflows/delay-step-body.tsx +109 -0
  96. package/src/components/workflows/hooks/use-workflow-status.ts +50 -0
  97. package/src/components/workflows/loop-status-view.tsx +1 -1
  98. package/src/components/workflows/shared/step-result.tsx +78 -0
  99. package/src/components/workflows/shared/workflow-header.tsx +141 -0
  100. package/src/components/workflows/shared/workflow-loading-skeleton.tsx +36 -0
  101. package/src/components/workflows/swarm-dashboard.tsx +2 -15
  102. package/src/components/workflows/views/loop-pattern-view.tsx +137 -0
  103. package/src/components/workflows/views/sequence-pattern-view.tsx +511 -0
  104. package/src/components/workflows/workflow-form-view.tsx +133 -16
  105. package/src/components/workflows/workflow-status-view.tsx +30 -740
  106. package/src/instrumentation-node.ts +94 -0
  107. package/src/instrumentation.ts +4 -48
  108. package/src/lib/agents/__tests__/claude-agent.test.ts +199 -0
  109. package/src/lib/agents/__tests__/execution-manager.test.ts +1 -27
  110. package/src/lib/agents/__tests__/failure-reason.test.ts +68 -0
  111. package/src/lib/agents/__tests__/learned-context.test.ts +0 -11
  112. package/src/lib/agents/__tests__/learning-session.test.ts +158 -0
  113. package/src/lib/agents/__tests__/pattern-extractor.test.ts +48 -0
  114. package/src/lib/agents/claude-agent.ts +155 -18
  115. package/src/lib/agents/execution-manager.ts +0 -35
  116. package/src/lib/agents/learned-context.ts +0 -12
  117. package/src/lib/agents/learning-session.ts +18 -5
  118. package/src/lib/agents/profiles/__tests__/registry.test.ts +6 -4
  119. package/src/lib/agents/profiles/builtins/upgrade-assistant/SKILL.md +70 -0
  120. package/src/lib/agents/profiles/builtins/upgrade-assistant/profile.yaml +32 -0
  121. package/src/lib/agents/runtime/__tests__/openai-codex-auth.test.ts +118 -0
  122. package/src/lib/agents/runtime/codex-app-server-client.ts +11 -5
  123. package/src/lib/agents/runtime/openai-codex-auth.ts +389 -0
  124. package/src/lib/agents/runtime/openai-codex.ts +29 -60
  125. package/src/lib/agents/runtime/types.ts +8 -0
  126. package/src/lib/book/chapter-mapping.ts +11 -0
  127. package/src/lib/book/content.ts +10 -0
  128. package/src/lib/chat/__tests__/active-streams.test.ts +49 -0
  129. package/src/lib/chat/__tests__/finalize-safety-net.test.ts +139 -0
  130. package/src/lib/chat/__tests__/reconcile.test.ts +137 -0
  131. package/src/lib/chat/__tests__/stream-telemetry.test.ts +151 -0
  132. package/src/lib/chat/active-streams.ts +27 -0
  133. package/src/lib/chat/codex-engine.ts +16 -17
  134. package/src/lib/chat/context-builder.ts +5 -3
  135. package/src/lib/chat/engine.ts +50 -3
  136. package/src/lib/chat/reconcile.ts +117 -0
  137. package/src/lib/chat/stagent-tools.ts +1 -0
  138. package/src/lib/chat/stream-telemetry.ts +132 -0
  139. package/src/lib/chat/suggested-prompts.ts +28 -1
  140. package/src/lib/chat/system-prompt.ts +26 -1
  141. package/src/lib/chat/tool-catalog.ts +2 -1
  142. package/src/lib/chat/tools/__tests__/enrich-table-tool.test.ts +127 -0
  143. package/src/lib/chat/tools/__tests__/schedule-tools.test.ts +261 -0
  144. package/src/lib/chat/tools/__tests__/task-tools.test.ts +352 -0
  145. package/src/lib/chat/tools/__tests__/workflow-tools-dedup.test.ts +217 -0
  146. package/src/lib/chat/tools/document-tools.ts +29 -13
  147. package/src/lib/chat/tools/helpers.ts +39 -0
  148. package/src/lib/chat/tools/notification-tools.ts +9 -5
  149. package/src/lib/chat/tools/project-tools.ts +33 -0
  150. package/src/lib/chat/tools/schedule-tools.ts +44 -11
  151. package/src/lib/chat/tools/table-tools.ts +71 -0
  152. package/src/lib/chat/tools/task-tools.ts +84 -20
  153. package/src/lib/chat/tools/workflow-tools.ts +234 -32
  154. package/src/lib/constants/settings.ts +8 -18
  155. package/src/lib/data/__tests__/clear.test.ts +56 -2
  156. package/src/lib/data/clear.ts +20 -15
  157. package/src/lib/data/delete-project.ts +171 -0
  158. package/src/lib/db/__tests__/bootstrap.test.ts +1 -1
  159. package/src/lib/db/bootstrap.ts +45 -16
  160. package/src/lib/db/index.ts +5 -0
  161. package/src/lib/db/migrations/0009_add_app_instances.sql +25 -0
  162. package/src/lib/db/migrations/0024_add_workflow_resume_at.sql +10 -0
  163. package/src/lib/db/migrations/0025_drop_app_instances.sql +3 -0
  164. package/src/lib/db/migrations/0026_drop_license.sql +3 -0
  165. package/src/lib/db/migrations/meta/_journal.json +21 -0
  166. package/src/lib/db/schema.ts +68 -23
  167. package/src/lib/environment/workspace-context.ts +13 -1
  168. package/src/lib/import/dedup.ts +4 -54
  169. package/src/lib/instance/__tests__/bootstrap.test.ts +362 -0
  170. package/src/lib/instance/__tests__/detect.test.ts +115 -0
  171. package/src/lib/instance/__tests__/fingerprint.test.ts +48 -0
  172. package/src/lib/instance/__tests__/git-ops.test.ts +95 -0
  173. package/src/lib/instance/__tests__/settings.test.ts +83 -0
  174. package/src/lib/instance/__tests__/upgrade-poller.test.ts +131 -0
  175. package/src/lib/instance/bootstrap.ts +270 -0
  176. package/src/lib/instance/detect.ts +49 -0
  177. package/src/lib/instance/fingerprint.ts +78 -0
  178. package/src/lib/instance/git-ops.ts +95 -0
  179. package/src/lib/instance/settings.ts +61 -0
  180. package/src/lib/instance/types.ts +77 -0
  181. package/src/lib/instance/upgrade-poller.ts +153 -0
  182. package/src/lib/notifications/__tests__/visibility.test.ts +51 -0
  183. package/src/lib/notifications/visibility.ts +33 -0
  184. package/src/lib/schedules/__tests__/collision-check.test.ts +93 -0
  185. package/src/lib/schedules/__tests__/config.test.ts +62 -0
  186. package/src/lib/schedules/__tests__/firing-metrics.test.ts +99 -0
  187. package/src/lib/schedules/__tests__/integration.test.ts +82 -0
  188. package/src/lib/schedules/__tests__/slot-claim.test.ts +242 -0
  189. package/src/lib/schedules/__tests__/tick-scheduler.test.ts +102 -0
  190. package/src/lib/schedules/__tests__/turn-budget.test.ts +228 -0
  191. package/src/lib/schedules/collision-check.ts +105 -0
  192. package/src/lib/schedules/config.ts +53 -0
  193. package/src/lib/schedules/scheduler.ts +232 -13
  194. package/src/lib/schedules/slot-claim.ts +105 -0
  195. package/src/lib/settings/__tests__/openai-auth.test.ts +101 -0
  196. package/src/lib/settings/__tests__/openai-login-manager.test.ts +64 -0
  197. package/src/lib/settings/__tests__/runtime-setup.test.ts +33 -0
  198. package/src/lib/settings/openai-auth.ts +105 -10
  199. package/src/lib/settings/openai-login-manager.ts +260 -0
  200. package/src/lib/settings/runtime-setup.ts +14 -4
  201. package/src/lib/tables/__tests__/enrichment-planner.test.ts +124 -0
  202. package/src/lib/tables/__tests__/enrichment.test.ts +147 -0
  203. package/src/lib/tables/enrichment-planner.ts +454 -0
  204. package/src/lib/tables/enrichment.ts +328 -0
  205. package/src/lib/tables/query-builder.ts +5 -2
  206. package/src/lib/tables/trigger-evaluator.ts +3 -2
  207. package/src/lib/theme.ts +71 -0
  208. package/src/lib/usage/ledger.ts +2 -18
  209. package/src/lib/util/__tests__/similarity.test.ts +106 -0
  210. package/src/lib/util/similarity.ts +77 -0
  211. package/src/lib/utils/format-timestamp.ts +24 -0
  212. package/src/lib/utils/stagent-paths.ts +12 -0
  213. package/src/lib/validators/__tests__/blueprint.test.ts +172 -0
  214. package/src/lib/validators/__tests__/settings.test.ts +10 -0
  215. package/src/lib/validators/blueprint.ts +70 -9
  216. package/src/lib/validators/profile.ts +2 -2
  217. package/src/lib/validators/settings.ts +3 -1
  218. package/src/lib/workflows/__tests__/delay.test.ts +196 -0
  219. package/src/lib/workflows/__tests__/engine.test.ts +8 -0
  220. package/src/lib/workflows/__tests__/loop-executor.test.ts +54 -0
  221. package/src/lib/workflows/__tests__/post-action.test.ts +108 -0
  222. package/src/lib/workflows/blueprints/instantiator.ts +22 -1
  223. package/src/lib/workflows/blueprints/types.ts +10 -2
  224. package/src/lib/workflows/delay.ts +106 -0
  225. package/src/lib/workflows/engine.ts +207 -4
  226. package/src/lib/workflows/loop-executor.ts +349 -24
  227. package/src/lib/workflows/post-action.ts +91 -0
  228. package/src/lib/workflows/types.ts +166 -1
  229. package/src/app/api/license/checkout/route.ts +0 -28
  230. package/src/app/api/license/portal/route.ts +0 -26
  231. package/src/app/api/license/route.ts +0 -89
  232. package/src/app/api/license/usage/route.ts +0 -63
  233. package/src/app/api/marketplace/browse/route.ts +0 -15
  234. package/src/app/api/marketplace/import/route.ts +0 -28
  235. package/src/app/api/marketplace/publish/route.ts +0 -40
  236. package/src/app/api/onboarding/email/route.ts +0 -53
  237. package/src/app/api/settings/telemetry/route.ts +0 -14
  238. package/src/app/api/sync/export/route.ts +0 -54
  239. package/src/app/api/sync/restore/route.ts +0 -37
  240. package/src/app/api/sync/sessions/route.ts +0 -24
  241. package/src/app/auth/callback/route.ts +0 -73
  242. package/src/app/marketplace/page.tsx +0 -19
  243. package/src/components/analytics/analytics-gate-card.tsx +0 -101
  244. package/src/components/marketplace/blueprint-card.tsx +0 -61
  245. package/src/components/marketplace/marketplace-browser.tsx +0 -131
  246. package/src/components/onboarding/email-capture-card.tsx +0 -104
  247. package/src/components/settings/activation-form.tsx +0 -95
  248. package/src/components/settings/cloud-account-section.tsx +0 -147
  249. package/src/components/settings/cloud-sync-section.tsx +0 -155
  250. package/src/components/settings/subscription-section.tsx +0 -410
  251. package/src/components/settings/telemetry-section.tsx +0 -80
  252. package/src/components/shared/premium-gate-overlay.tsx +0 -50
  253. package/src/components/shared/schedule-gate-dialog.tsx +0 -64
  254. package/src/components/shared/upgrade-banner.tsx +0 -112
  255. package/src/hooks/use-supabase-auth.ts +0 -79
  256. package/src/lib/billing/email.ts +0 -54
  257. package/src/lib/billing/products.ts +0 -80
  258. package/src/lib/billing/stripe.ts +0 -101
  259. package/src/lib/cloud/supabase-browser.ts +0 -32
  260. package/src/lib/cloud/supabase-client.ts +0 -56
  261. package/src/lib/license/__tests__/features.test.ts +0 -56
  262. package/src/lib/license/__tests__/key-format.test.ts +0 -88
  263. package/src/lib/license/__tests__/manager.test.ts +0 -64
  264. package/src/lib/license/__tests__/tier-limits.test.ts +0 -79
  265. package/src/lib/license/cloud-validation.ts +0 -60
  266. package/src/lib/license/features.ts +0 -44
  267. package/src/lib/license/key-format.ts +0 -101
  268. package/src/lib/license/limit-check.ts +0 -111
  269. package/src/lib/license/limit-queries.ts +0 -51
  270. package/src/lib/license/manager.ts +0 -345
  271. package/src/lib/license/notifications.ts +0 -59
  272. package/src/lib/license/tier-limits.ts +0 -71
  273. package/src/lib/marketplace/marketplace-client.ts +0 -107
  274. package/src/lib/sync/cloud-sync.ts +0 -235
  275. package/src/lib/telemetry/conversion-events.ts +0 -71
  276. package/src/lib/telemetry/queue.ts +0 -122
  277. package/src/lib/validators/license.ts +0 -33
@@ -0,0 +1,2983 @@
1
+ # Schedule Orchestration Implementation Plan
2
+
3
+ > **For agentic workers:** REQUIRED SUB-SKILL: Use `superpowers:subagent-driven-development` (recommended) or `superpowers:executing-plans` to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
4
+
5
+ **Goal:** Prevent concurrent scheduled agents from starving the chat SSE stream by introducing a global concurrency cap enforced via atomic slot claim, per-schedule turn budgets, lease-based timeouts with a reaper, a minimal pre-flight collision warning, and a time-series metrics table for evidence-based cap tuning.
6
+
7
+ **Architecture:** Changes are concentrated in `src/lib/schedules/scheduler.ts`, a new `src/lib/schedules/slot-claim.ts` primitive, and the Claude runtime adapter. Coordination uses atomic single-SQL conditional updates (no check-then-act). Lease expiry is enforced by a reaper that runs at each tick and aborts expired tasks via their `AbortController`. Per-schedule turn budget is a new `max_turns` column propagated into the `tasks` row at firing time and threaded into the SDK `query()` call. Chat soft pressure uses a module-level `Set<conversationId>` in `src/lib/chat/active-streams.ts` checked by `tickScheduler()` to defer (not block) new firings.
8
+
9
+ **Tech Stack:** TypeScript, better-sqlite3 (synchronous), Drizzle ORM, `@anthropic-ai/claude-agent-sdk`, `cron-parser` (already in tree for `expandCronMinutes`), vitest with real temp-dir SQLite, Next.js `register()` instrumentation hook.
10
+
11
+ **Worktree guidance:** This plan makes invasive changes to scheduler semantics. Run it in a dedicated worktree:
12
+
13
+ ```bash
14
+ git worktree add -b schedule-orchestration ../stagent-schedule-orchestration main
15
+ cd ../stagent-schedule-orchestration
16
+ ```
17
+
18
+ ---
19
+
20
+ ## NOT in scope
21
+
22
+ Explicit deferrals to prevent scope re-creep during execution:
23
+
24
+ - **`concurrencyGroup` column and per-group locks** — deferred to a follow-up spec. The 2026-04-08 incident was a global-cap problem; groups add a second coordination primitive whose correctness depends on solving the first. Ship global cap alone.
25
+ - **Auto-stagger endpoint, 48h forecast, collision-forecast notifications** — future spec "Schedule Predictability & Forecasting". Only the minimal save-time collision *warning* is in this plan.
26
+ - **Turn drift detection across 3-run moving window, efficiency scoring (`useful_actions / total_turns`)** — future spec "Schedule Observability".
27
+ - **`turnBudgetAction: 'optimize'` meta-agent prompt rewriter** — future spec "Agent Self-Optimization".
28
+ - **Hard chat priority / `pauseSchedulesDuringChat` setting** — only if the soft pressure signal proves insufficient post-launch.
29
+ - **Dynamic adaptive cap** based on measured P99 chat latency — architect explicitly recommended against until static cap proves insufficient.
30
+ - **`usage_ledger.turn_count` column** — derivable from `schedule_firing_metrics` and `agent_logs`.
31
+ - **`swarm_snapshots` time-series table** — deferred to Spec C follow-ups ("Swarm Activity Feed").
32
+ - **Worker-thread isolation for the agent runtime** — architectural bet, separate design effort.
33
+ - **UI visibility layer** — delivered by Spec C (Swarm Visibility), which depends on this plan's API shape but is a separate plan.
34
+
35
+ ## What already exists
36
+
37
+ Reusable code and patterns confirmed during exploration. Do not rebuild these:
38
+
39
+ - **`src/lib/schedules/scheduler.ts:238-252`** — atomic per-schedule claim via conditional WHERE UPDATE. The new global-cap claim follows the same single-SQL-statement pattern.
40
+ - **`src/lib/schedules/scheduler.ts:51-95`** — `drainQueue()` with module-level `draining` flag. The new atomic claim must be correct under the drain + tick interleaving.
41
+ - **`src/lib/schedules/scheduler.ts:304-322`** — existing title-pattern sibling guard. Keep as-is; global cap layers on top.
42
+ - **`src/lib/schedules/scheduler.ts:122-133`** — `detectFailureReason()`. Keep as fallback; runtime adapter will write explicit `failure_reason` at terminal transitions.
43
+ - **`src/lib/schedules/scheduler.ts:140-186`** — `recordFiringMetrics()` is the natural hook for inserting into the new firing-metrics table.
44
+ - **`src/lib/schedules/interval-parser.ts:92`** — `expandCronMinutes()` expands a cron into the list of fire minutes. Reuse for collision-check bucketing.
45
+ - **`src/lib/agents/execution-manager.ts:14-62`** — in-memory `Map<taskId, RunningExecution>` with `abortController` on each entry. The reaper uses this to abort expired leases.
46
+ - **`src/lib/agents/claude-agent.ts:444-470`** — SDK `query()` invocation. `maxTurns` is passed through `ctx.maxTurns`. Override when a task came from a schedule with its own `max_turns`.
47
+ - **`src/lib/agents/claude-agent.ts:358-414`** — `buildTaskQueryContext()` resolves `maxTurns` from profile fallback. Keep as default; schedule-level override takes precedence.
48
+ - **`src/lib/db/bootstrap.ts:266-275`** — `addColumnIfMissing()` helper: tolerates `duplicate column` errors so `ALTER TABLE ADD COLUMN` is idempotent across dev and deployed DBs. Use this for all new columns.
49
+ - **`src/lib/data/clear.ts`** — FK-safe deletion order. Tests enforce that every schema table is deleted. New tables must be added here.
50
+ - **`src/lib/settings/helpers.ts:12`** — `getSettingSync(key)` for in-process reads. Use sync helpers inside hot scheduler paths to avoid needless awaits.
51
+ - **`src/lib/constants/settings.ts`** — `SETTINGS_KEYS` enum. Add new keys here.
52
+ - **`src/lib/chat/engine.ts:256`** — chat stream start point (where `fullText = ""` is initialized and streaming begins). The `active-streams.ts` set will be populated here and cleared in the finally block alongside `cleanupConversation()`.
53
+ - **`src/test/setup.ts:6-10`** — vitest setup creates a temp-dir SQLite per run via `STAGENT_DATA_DIR`. Tests can freely insert/query against a real DB.
54
+ - **`src/lib/chat/reconcile.ts`** (NEW from Spec B hotfix, already committed) — `finalizeStreamingMessage()` and `reconcileStreamingMessages()`. Pattern reference for pure DB-only helpers tested in isolation.
55
+
56
+ ## Error & Rescue Registry
57
+
58
+ HOLD-mode feature — each primitive's failure path is enumerated and rescued.
59
+
60
+ | Error | Trigger | Impact | Rescue |
61
+ |---|---|---|---|
62
+ | Two ticks race on slot claim | `drainQueue()` concurrent with `tickScheduler()` via `.then()` chain at scheduler.ts:420 | Cap breached if naive check-then-act | Atomic single-SQL claim — `changes=0` means lost the race; leave row in `queued`, let next drain retry |
63
+ | SDK hangs mid-run | Upstream Opus stall, network partition, subprocess deadlock | Slot held forever, cap permanently reduced (e.g. 2→1 effective) | Lease expiry + reaper at each tick aborts via `execution-manager.RunningExecution.abortController`; DB update to `failed`/`lease_expired` frees the slot |
64
+ | Reaper itself throws | Rare DB error during `SELECT expired` or per-task `UPDATE` | Expired leases accumulate | Reaper catches per-task errors; the sweep continues to the next expired row. Next tick retries anything missed |
65
+ | Reaper aborts a task that legitimately needs 25 min | Per-schedule `max_run_duration_sec` not configured; default 20 min too tight | Legitimate run killed | Per-schedule `max_run_duration_sec` override. Drift-warn when a run completes at >80% of lease on 3 consecutive firings so users raise the cap |
66
+ | User sets `max_turns=10` on schedule averaging 40 | Config footgun | Would trip auto-pause in 3 firings under a shared failure streak | First-breach grace + separate `turn_budget_breach_streak` with threshold 5; drift warning at streak 2 advising the user to raise `max_turns` |
67
+ | `detectFailureReason()` misclassifies | SDK error text changes format | Wrong streak incremented | Runtime adapter writes explicit `failure_reason` at terminal transitions; string-match is fallback only |
68
+ | Manual execute spammed | User double-clicks "Run now" 5× | Could exceed cap under naive design | Manual honors cap by default; returns `429 {error, slotEtaSec}`; `?force=true` bypasses with audit log |
69
+ | Chat pressure delay causes schedule to miss a minute | User has `* * * * *` cron, chat is streaming for 45s | Minute skipped | 30s delay is one-shot per tick; next tick re-evaluates. Documented in UI help text |
70
+ | Firing metrics unbounded growth | High-frequency schedules over months | Disk bloat | Periodic cleanup `DELETE WHERE fired_at < now() - 30 days` in a post-tick maintenance pass |
71
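+ | Clock skew between JS `Date.now()` and SQLite `CURRENT_TIMESTAMP` | Container restart, NTP drift | `lease_expires_at` mismatches | Use consistent Unix-ms integers everywhere; no mixing of SQL clock and JS clock inside one comparison. Note: Drizzle `{ mode: "timestamp" }` columns persist Unix seconds, so convert explicitly whenever a raw-SQL comparison touches those columns |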
72
+ | Collision check runs against in-flight chat-pressure-shifted fire time | Deterministic warning becomes nondeterministic | Confusing UX | Collision check always runs against *nominal* cron expansion, never adjusted times |
73
+ | `SCHEDULE_MAX_CONCURRENT` env var typo | User sets `=abc` | Silent fallback to default | `parseInt` with NaN guard; log warning; use default. Same pattern as existing SDK timeout handling |
74
+ | Tests pollute each other via shared temp DB | Multiple test files hitting same tables | Flaky tests | Every test file uses `beforeEach` to delete in FK-safe order (pattern from `src/lib/chat/__tests__/reconcile.test.ts`) |
75
+
76
+ ---
77
+
78
+ ## File Structure
79
+
80
+ **New files:**
81
+
82
+ ```
83
+ src/lib/schedules/slot-claim.ts — Atomic claim primitive + reap helper
84
+ src/lib/schedules/collision-check.ts — 24h cron expansion + 5-min bucket overlap detector
85
+ src/lib/schedules/config.ts — Config reader helpers for the new settings keys
86
+ src/lib/chat/active-streams.ts — Module-level Set tracking in-flight chat streams
87
+ src/app/api/schedules/[id]/execute/route.ts — Manual fire endpoint (does not exist today)
88
+
89
+ src/lib/schedules/__tests__/slot-claim.test.ts — Race + reap tests
90
+ src/lib/schedules/__tests__/collision-check.test.ts — Overlap detection tests
91
+ src/lib/schedules/__tests__/turn-budget.test.ts — First-breach grace + streak threshold
92
+ src/lib/schedules/__tests__/tick-scheduler.test.ts — Cap + chat pressure
93
+ src/lib/schedules/__tests__/firing-metrics.test.ts — Metrics insertion
94
+ src/lib/schedules/__tests__/integration.test.ts — End-to-end
95
+ src/lib/schedules/__tests__/config.test.ts — Config reader
96
+ src/lib/chat/__tests__/active-streams.test.ts — Set lifecycle
97
+ src/lib/agents/__tests__/failure-reason.test.ts — Classifier
98
+ src/app/api/schedules/__tests__/execute-route.test.ts — 429 + force bypass
99
+
100
+ .claude/skills/architect/references/tdr-atomic-slot-claim.md
101
+ .claude/skills/architect/references/tdr-evidence-based-cap.md
102
+ .claude/skills/architect/references/tdr-failure-class-streaks.md
103
+ .claude/skills/architect/references/tdr-manual-honors-cap.md
104
+ .claude/skills/architect/references/tdr-lock-holders-leased.md
105
+ .claude/skills/architect/references/tdr-chat-shares-event-loop.md
106
+ ```
107
+
108
+ **Modified files:**
109
+
110
+ ```
111
+ src/lib/db/schema.ts — New columns on tasks + schedules, new table
112
+ src/lib/db/bootstrap.ts — CREATE TABLE + addColumnIfMissing calls
113
+ src/lib/data/clear.ts — Delete new table in FK-safe order
114
+ src/lib/constants/settings.ts — SCHEDULE_MAX_CONCURRENT etc.
115
+ src/lib/schedules/scheduler.ts — Wire slot claim + reaper + chat pressure + metrics
116
+ src/lib/agents/claude-agent.ts — Override maxTurns from tasks.maxTurns; write failure_reason
117
+ src/lib/chat/engine.ts — Register/unregister in activeChatStreams
118
+ src/app/api/schedules/route.ts — Attach collision warnings in POST response
119
+ src/app/api/schedules/[id]/route.ts — Attach collision warnings in PUT response
120
+ src/components/schedules/schedule-form.tsx — New "Max agent steps" field, tooltip, calibration hint
121
+ src/components/schedules/schedule-create-sheet.tsx — Render collision warning banner
122
+ src/components/schedules/schedule-edit-sheet.tsx — Render collision warning banner
123
+ ```
124
+
125
+ ---
126
+
127
+ ## Task 1: Add schema columns + new firing-metrics table
128
+
129
+ **Files:**
130
+ - Modify: `src/lib/db/schema.ts`
131
+ - Modify: `src/lib/db/bootstrap.ts`
132
+ - Modify: `src/lib/data/clear.ts`
133
+
134
+ - [ ] **Step 1.1: Add Drizzle schema definitions**
135
+
136
+ Edit `src/lib/db/schema.ts`. Inside the `tasks` table definition (around lines 16-53), add these columns before `createdAt`:
137
+
138
+ ```typescript
139
+ /** When the slot for this task was atomically claimed */
140
+ slotClaimedAt: integer("slot_claimed_at", { mode: "timestamp" }),
141
+ /** Wall-clock expiry; reaper aborts tasks whose lease has passed */
142
+ leaseExpiresAt: integer("lease_expires_at", { mode: "timestamp" }),
143
+ /** Explicit terminal-state reason written by the runtime adapter */
144
+ failureReason: text("failure_reason"),
145
+ /** Per-task turn budget copied from schedules.maxTurns at firing time */
146
+ maxTurns: integer("max_turns"),
147
+ ```
148
+
149
+ Add a new index at the end of the `tasks` table definition's index array:
150
+
151
+ ```typescript
152
+ index("idx_tasks_running_scheduled").on(table.status, table.sourceType, table.leaseExpiresAt),
153
+ ```
154
+
155
+ Inside the `schedules` table definition (around lines 165-228), add these columns before `createdAt`:
156
+
157
+ ```typescript
158
+ /** Hard cap on turns per firing; NULL inherits the global MAX_TURNS setting */
159
+ maxTurns: integer("max_turns"),
160
+ /** Timestamp when maxTurns was last edited — drives first-breach grace */
161
+ maxTurnsSetAt: integer("max_turns_set_at", { mode: "timestamp" }),
162
+ /** Wall-clock lease override in seconds; NULL inherits global default (1200s) */
163
+ maxRunDurationSec: integer("max_run_duration_sec"),
164
+ /** Counter separate from failureStreak — only increments on maxTurns breach */
165
+ turnBudgetBreachStreak: integer("turn_budget_breach_streak").default(0).notNull(),
166
+ ```
167
+
168
+ Append a new table definition at the bottom of `schema.ts`, before the `export type` block:
169
+
170
+ ```typescript
171
+ export const scheduleFiringMetrics = sqliteTable(
172
+ "schedule_firing_metrics",
173
+ {
174
+ id: text("id").primaryKey(),
175
+ scheduleId: text("schedule_id")
176
+ .references(() => schedules.id)
177
+ .notNull(),
178
+ taskId: text("task_id").references(() => tasks.id),
179
+ firedAt: integer("fired_at", { mode: "timestamp" }).notNull(),
180
+ slotClaimedAt: integer("slot_claimed_at", { mode: "timestamp" }),
181
+ completedAt: integer("completed_at", { mode: "timestamp" }),
182
+ slotWaitMs: integer("slot_wait_ms"),
183
+ durationMs: integer("duration_ms"),
184
+ turnCount: integer("turn_count"),
185
+ maxTurnsAtFiring: integer("max_turns_at_firing"),
186
+ eventLoopLagMs: real("event_loop_lag_ms"),
187
+ peakRssMb: integer("peak_rss_mb"),
188
+ chatStreamsActive: integer("chat_streams_active"),
189
+ concurrentSchedules: integer("concurrent_schedules"),
190
+ failureReason: text("failure_reason"),
191
+ },
192
+ (table) => [
193
+ index("idx_sfm_schedule_time").on(table.scheduleId, table.firedAt),
194
+ ]
195
+ );
196
+
197
+ export type ScheduleFiringMetricRow = InferSelectModel<typeof scheduleFiringMetrics>;
198
+ ```
199
+
200
+ - [ ] **Step 1.2: Add bootstrap CREATE TABLE + addColumnIfMissing calls**
201
+
202
+ Edit `src/lib/db/bootstrap.ts`. Inside the `STAGENT_TABLES` const (around line 4-51), append `"schedule_firing_metrics"` to the array.
203
+
204
+ Inside `bootstrapStagentDatabase()`, after the `schedules` CREATE TABLE (around line 190), add a new `CREATE TABLE IF NOT EXISTS schedule_firing_metrics (...)` with columns matching the Drizzle schema above. Also add `CREATE INDEX IF NOT EXISTS idx_sfm_schedule_time ON schedule_firing_metrics(schedule_id, fired_at);`.
205
+
206
+ At the end of the `addColumnIfMissing` call block (around line 558), add:
207
+
208
+ ```typescript
209
+ addColumnIfMissing(`ALTER TABLE tasks ADD COLUMN slot_claimed_at INTEGER;`);
210
+ addColumnIfMissing(`ALTER TABLE tasks ADD COLUMN lease_expires_at INTEGER;`);
211
+ addColumnIfMissing(`ALTER TABLE tasks ADD COLUMN failure_reason TEXT;`);
212
+ addColumnIfMissing(`ALTER TABLE tasks ADD COLUMN max_turns INTEGER;`);
213
+ addColumnIfMissing(`ALTER TABLE schedules ADD COLUMN max_turns INTEGER;`);
214
+ addColumnIfMissing(`ALTER TABLE schedules ADD COLUMN max_turns_set_at INTEGER;`);
215
+ addColumnIfMissing(`ALTER TABLE schedules ADD COLUMN max_run_duration_sec INTEGER;`);
216
+ addColumnIfMissing(`ALTER TABLE schedules ADD COLUMN turn_budget_breach_streak INTEGER DEFAULT 0 NOT NULL;`);
217
+ ```
218
+
219
+ Also add an index creation line (using the existing sqlite handle):
220
+
221
+ ```typescript
222
+ sqlite.exec(`CREATE INDEX IF NOT EXISTS idx_tasks_running_scheduled ON tasks(status, source_type, lease_expires_at);`);
223
+ ```
224
+
225
+ - [ ] **Step 1.3: Add firing-metrics delete to clear.ts (FK-safe order)**
226
+
227
+ Edit `src/lib/data/clear.ts`. Add `scheduleFiringMetrics` to the imports from `@/lib/db/schema`. Add the delete call BEFORE the existing `schedulesDeleted = db.delete(schedules)...` line — and before the existing `tasks` delete — because `schedule_firing_metrics` has foreign keys to both `schedules` and `tasks`:
228
+
229
+ ```typescript
230
+ const scheduleFiringMetricsDeleted = db.delete(scheduleFiringMetrics).run().changes;
231
+ const schedulesDeleted = db.delete(schedules).run().changes;
232
+ ```
233
+
234
+ Include the count in the returned object at the end of `clearAllData`:
235
+
236
+ ```typescript
237
+ return {
238
+ // ... existing keys ...
239
+ scheduleFiringMetrics: scheduleFiringMetricsDeleted,
240
+ };
241
+ ```
242
+
243
+ - [ ] **Step 1.4: Run the clear.ts safety-net test**
244
+
245
+ Run: `npx vitest run src/lib/data/__tests__/clear.test.ts`
246
+ Expected: PASS. The safety-net test verifies every schema table has a `db.delete()` call. If it fails, you forgot to add `scheduleFiringMetrics` to clear.ts.
247
+
248
+ - [ ] **Step 1.5: Run full test suite**
249
+
250
+ Run: `npx vitest run`
251
+ Expected: PASS — all existing tests still pass. New columns are either nullable or (for `turn_budget_breach_streak`) NOT NULL with a default of 0, so no existing seeds break.
252
+
253
+ - [ ] **Step 1.6: Commit**
254
+
255
+ ```bash
256
+ git add src/lib/db/schema.ts src/lib/db/bootstrap.ts src/lib/data/clear.ts
257
+ git commit -m "feat(schedules): add schema columns + firing metrics table"
258
+ ```
259
+
260
+ ---
261
+
262
+ ## Task 2: Settings keys + config reader helpers
263
+
264
+ **Files:**
265
+ - Modify: `src/lib/constants/settings.ts`
266
+ - Create: `src/lib/schedules/config.ts`
267
+ - Test: `src/lib/schedules/__tests__/config.test.ts`
268
+
269
+ - [ ] **Step 2.1: Write failing config reader tests**
270
+
271
+ Create `src/lib/schedules/__tests__/config.test.ts`:
272
+
273
+ ```typescript
274
+ import { describe, it, expect, beforeEach } from "vitest";
275
+ import { db } from "@/lib/db";
276
+ import { settings } from "@/lib/db/schema";
277
+ import { eq } from "drizzle-orm";
278
+ import {
279
+ getScheduleMaxConcurrent,
280
+ getScheduleMaxRunDurationSec,
281
+ getScheduleChatPressureDelaySec,
282
+ } from "../config";
283
+
284
+ describe("schedule config", () => {
285
+ beforeEach(() => {
286
+ db.delete(settings).where(eq(settings.key, "schedule.maxConcurrent")).run();
287
+ db.delete(settings).where(eq(settings.key, "schedule.maxRunDurationSec")).run();
288
+ db.delete(settings).where(eq(settings.key, "schedule.chatPressureDelaySec")).run();
289
+ });
290
+
291
+ it("returns default max concurrent of 2 when setting is absent", () => {
292
+ expect(getScheduleMaxConcurrent()).toBe(2);
293
+ });
294
+
295
+ it("reads max concurrent from settings when set", () => {
296
+ db.insert(settings)
297
+ .values({
298
+ key: "schedule.maxConcurrent",
299
+ value: "3",
300
+ updatedAt: new Date(),
301
+ })
302
+ .run();
303
+ expect(getScheduleMaxConcurrent()).toBe(3);
304
+ });
305
+
306
+ it("reads max concurrent from SCHEDULE_MAX_CONCURRENT env var", () => {
307
+ const original = process.env.SCHEDULE_MAX_CONCURRENT;
308
+ process.env.SCHEDULE_MAX_CONCURRENT = "5";
309
+ try {
310
+ expect(getScheduleMaxConcurrent()).toBe(5);
311
+ } finally {
312
+ if (original === undefined) delete process.env.SCHEDULE_MAX_CONCURRENT;
313
+ else process.env.SCHEDULE_MAX_CONCURRENT = original;
314
+ }
315
+ });
316
+
317
+ it("falls back to default when env var is NaN", () => {
318
+ const original = process.env.SCHEDULE_MAX_CONCURRENT;
319
+ process.env.SCHEDULE_MAX_CONCURRENT = "abc";
320
+ try {
321
+ expect(getScheduleMaxConcurrent()).toBe(2);
322
+ } finally {
323
+ if (original === undefined) delete process.env.SCHEDULE_MAX_CONCURRENT;
324
+ else process.env.SCHEDULE_MAX_CONCURRENT = original;
325
+ }
326
+ });
327
+
328
+ it("returns default max run duration of 1200s", () => {
329
+ expect(getScheduleMaxRunDurationSec()).toBe(1200);
330
+ });
331
+
332
+ it("returns default chat pressure delay of 30s", () => {
333
+ expect(getScheduleChatPressureDelaySec()).toBe(30);
334
+ });
335
+ });
336
+ ```
337
+
338
+ - [ ] **Step 2.2: Run to verify RED**
339
+
340
+ Run: `npx vitest run src/lib/schedules/__tests__/config.test.ts`
341
+ Expected: FAIL — module `../config` does not exist.
342
+
343
+ - [ ] **Step 2.3: Add settings keys**
344
+
345
+ Edit `src/lib/constants/settings.ts`. Add inside the `SETTINGS_KEYS` const:
346
+
347
+ ```typescript
348
+ SCHEDULE_MAX_CONCURRENT: "schedule.maxConcurrent",
349
+ SCHEDULE_MAX_RUN_DURATION_SEC: "schedule.maxRunDurationSec",
350
+ SCHEDULE_CHAT_PRESSURE_DELAY_SEC: "schedule.chatPressureDelaySec",
351
+ ```
352
+
353
+ - [ ] **Step 2.4: Implement config helpers**
354
+
355
+ Create `src/lib/schedules/config.ts`:
356
+
357
+ ```typescript
358
+ import { getSettingSync } from "@/lib/settings/helpers";
359
+ import { SETTINGS_KEYS } from "@/lib/constants/settings";
360
+
361
+ const DEFAULT_MAX_CONCURRENT = 2;
362
+ const DEFAULT_MAX_RUN_DURATION_SEC = 1200; // 20 minutes
363
+ const DEFAULT_CHAT_PRESSURE_DELAY_SEC = 30;
364
+
365
+ // Resolves an integer option: the env var wins, then the stored setting, then the default.
366
+ // Only plain positive decimal integers count; parseInt alone would accept "5abc" or "2.9".
367
+ function readIntConfig(
368
+   envVar: string,
369
+   settingKey: string,
370
+   defaultValue: number,
371
+ ): number {
372
+   const parsePositiveInt = (raw: string): number | null => {
373
+     const value = /^\+?\d+$/.test(raw.trim()) ? Number(raw.trim()) : NaN;
374
+     return Number.isSafeInteger(value) && value > 0 ? value : null;
375
+   };
376
+   const envRaw = process.env[envVar];
377
+   if (envRaw !== undefined) {
378
+     const fromEnv = parsePositiveInt(envRaw);
379
+     if (fromEnv !== null) return fromEnv;
380
+     console.warn(
381
+       `[schedule-config] ${envVar}="${envRaw}" is not a positive integer; falling back to the stored setting or default ${defaultValue}`,
382
+     );
383
+   }
384
+   const settingRaw = getSettingSync(settingKey);
385
+   return (settingRaw == null ? null : parsePositiveInt(settingRaw)) ?? defaultValue;
386
+ }
387
+
388
+ export function getScheduleMaxConcurrent(): number { // resolves the "schedule.maxConcurrent" option
389
+ return readIntConfig(
390
+ "SCHEDULE_MAX_CONCURRENT", // environment variable, consulted first
391
+ SETTINGS_KEYS.SCHEDULE_MAX_CONCURRENT, // stored setting, consulted second
392
+ DEFAULT_MAX_CONCURRENT, // 2 — used when neither source yields a positive integer
393
+ );
394
+ }
395
+
396
+ export function getScheduleMaxRunDurationSec(): number { // resolves the "schedule.maxRunDurationSec" option, in seconds
397
+ return readIntConfig(
398
+ "SCHEDULE_MAX_RUN_DURATION_SEC", // environment variable, consulted first
399
+ SETTINGS_KEYS.SCHEDULE_MAX_RUN_DURATION_SEC, // stored setting, consulted second
400
+ DEFAULT_MAX_RUN_DURATION_SEC, // 1200 — used when neither source yields a positive integer
401
+ );
402
+ }
403
+
404
+ export function getScheduleChatPressureDelaySec(): number { // resolves the "schedule.chatPressureDelaySec" option, in seconds
405
+ return readIntConfig(
406
+ "SCHEDULE_CHAT_PRESSURE_DELAY_SEC", // environment variable, consulted first
407
+ SETTINGS_KEYS.SCHEDULE_CHAT_PRESSURE_DELAY_SEC, // stored setting, consulted second
408
+ DEFAULT_CHAT_PRESSURE_DELAY_SEC, // 30 — used when neither source yields a positive integer
409
+ );
410
+ }
411
+ ```
412
+
413
+ - [ ] **Step 2.5: Run to verify GREEN**
414
+
415
+ Run: `npx vitest run src/lib/schedules/__tests__/config.test.ts`
416
+ Expected: PASS — 6 tests pass.
417
+
418
+ - [ ] **Step 2.6: Commit**
419
+
420
+ ```bash
421
+ git add src/lib/constants/settings.ts src/lib/schedules/config.ts src/lib/schedules/__tests__/config.test.ts
422
+ git commit -m "feat(schedules): add concurrency + lease + chat-pressure config readers"
423
+ ```
424
+
425
+ ---
426
+
427
+ ## Task 3: Atomic slot claim primitive
428
+
429
+ **Files:**
430
+ - Create: `src/lib/schedules/slot-claim.ts`
431
+ - Test: `src/lib/schedules/__tests__/slot-claim.test.ts`
432
+
433
+ This is the load-bearing primitive. The atomic claim MUST be a single SQL statement — check-then-act is forbidden because `tickScheduler()` and `drainQueue()` run concurrently via the `.then()` chain at scheduler.ts:420.
434
+
435
+ - [ ] **Step 3.1: Write failing tests**
436
+
437
+ Create `src/lib/schedules/__tests__/slot-claim.test.ts`:
438
+
439
+ ```typescript
440
+ import { describe, it, expect, beforeEach } from "vitest";
441
+ import { db } from "@/lib/db";
442
+ import { tasks, schedules, projects, settings } from "@/lib/db/schema";
443
+ import { eq } from "drizzle-orm";
444
+ import { randomUUID } from "crypto";
445
+ import { claimSlot, countRunningScheduledSlots } from "../slot-claim";
446
+
447
+ function seedProject(): string {
448
+ const id = randomUUID();
449
+ const now = new Date();
450
+ db.insert(projects)
451
+ .values({ id, name: "test", status: "active", createdAt: now, updatedAt: now })
452
+ .run();
453
+ return id;
454
+ }
455
+
456
+ function seedSchedule(projectId: string): string {
457
+ const id = randomUUID();
458
+ const now = new Date();
459
+ db.insert(schedules)
460
+ .values({
461
+ id,
462
+ projectId,
463
+ name: `sched-${id.slice(0, 4)}`,
464
+ prompt: "test",
465
+ cronExpression: "* * * * *",
466
+ status: "active",
467
+ type: "scheduled",
468
+ firingCount: 0,
469
+ suppressionCount: 0,
470
+ heartbeatSpentToday: 0,
471
+ failureStreak: 0,
472
+ turnBudgetBreachStreak: 0,
473
+ createdAt: now,
474
+ updatedAt: now,
475
+ })
476
+ .run();
477
+ return id;
478
+ }
479
+
480
+ function seedQueuedTask(scheduleId: string): string {
481
+ const id = randomUUID();
482
+ const now = new Date();
483
+ db.insert(tasks)
484
+ .values({
485
+ id,
486
+ scheduleId,
487
+ title: "test firing",
488
+ status: "queued",
489
+ priority: 2,
490
+ sourceType: "scheduled",
491
+ resumeCount: 0,
492
+ createdAt: now,
493
+ updatedAt: now,
494
+ })
495
+ .run();
496
+ return id;
497
+ }
498
+
499
+ describe("claimSlot", () => {
500
+ beforeEach(() => {
501
+ db.delete(tasks).run();
502
+ db.delete(schedules).run();
503
+ db.delete(projects).run();
504
+ db.delete(settings).where(eq(settings.key, "schedule.maxConcurrent")).run();
505
+ });
506
+
507
+ it("claims a slot when capacity available, transitioning queued→running", () => {
508
+ const pid = seedProject();
509
+ const sid = seedSchedule(pid);
510
+ const tid = seedQueuedTask(sid);
511
+
512
+ const result = claimSlot(tid, 2, 1200);
513
+
514
+ expect(result.claimed).toBe(true);
515
+ const row = db.select().from(tasks).where(eq(tasks.id, tid)).get();
516
+ expect(row?.status).toBe("running");
517
+ expect(row?.slotClaimedAt).not.toBeNull();
518
+ expect(row?.leaseExpiresAt).not.toBeNull();
519
+ });
520
+
521
+ it("refuses to claim when cap=0", () => {
522
+ const pid = seedProject();
523
+ const sid = seedSchedule(pid);
524
+ const tid = seedQueuedTask(sid);
525
+
526
+ const result = claimSlot(tid, 0, 1200);
527
+
528
+ expect(result.claimed).toBe(false);
529
+ const row = db.select().from(tasks).where(eq(tasks.id, tid)).get();
530
+ expect(row?.status).toBe("queued");
531
+ });
532
+
533
+ it("refuses when cap already full", () => {
534
+ const pid = seedProject();
535
+ const sid1 = seedSchedule(pid);
536
+ const sid2 = seedSchedule(pid);
537
+ const tid1 = seedQueuedTask(sid1);
538
+ const tid2 = seedQueuedTask(sid2);
539
+
540
+ expect(claimSlot(tid1, 1, 1200).claimed).toBe(true);
541
+ expect(claimSlot(tid2, 1, 1200).claimed).toBe(false);
542
+
543
+ const row2 = db.select().from(tasks).where(eq(tasks.id, tid2)).get();
544
+ expect(row2?.status).toBe("queued");
545
+ });
546
+
547
+ it("two concurrent claim attempts for the same task yield exactly one winner", () => {
548
+ const pid = seedProject();
549
+ const sid = seedSchedule(pid);
550
+ const tid = seedQueuedTask(sid);
551
+
552
+ const first = claimSlot(tid, 10, 1200);
553
+ const second = claimSlot(tid, 10, 1200);
554
+
555
+ expect(first.claimed).toBe(true);
556
+ expect(second.claimed).toBe(false); // task already running, can't re-claim
557
+ });
558
+
559
+ it("respects cap across multiple tasks from different schedules", () => {
560
+ const pid = seedProject();
561
+ const tids: string[] = [];
562
+ for (let i = 0; i < 5; i++) {
563
+ const sid = seedSchedule(pid);
564
+ tids.push(seedQueuedTask(sid));
565
+ }
566
+
567
+ // Cap of 3 → first 3 claim, last 2 fail
568
+ const results = tids.map((tid) => claimSlot(tid, 3, 1200));
569
+ expect(results.filter((r) => r.claimed).length).toBe(3);
570
+ expect(results.filter((r) => !r.claimed).length).toBe(2);
571
+
572
+ expect(countRunningScheduledSlots()).toBe(3);
573
+ });
574
+
575
+ it("countRunningScheduledSlots ignores non-scheduled tasks", () => {
576
+ const pid = seedProject();
577
+ const sid = seedSchedule(pid);
578
+ const schedTid = seedQueuedTask(sid);
579
+ claimSlot(schedTid, 10, 1200);
580
+
581
+ // Insert a manual running task — must not count against scheduled cap
582
+ const manualId = randomUUID();
583
+ const now = new Date();
584
+ db.insert(tasks)
585
+ .values({
586
+ id: manualId,
587
+ title: "manual",
588
+ status: "running",
589
+ priority: 2,
590
+ sourceType: "manual",
591
+ resumeCount: 0,
592
+ createdAt: now,
593
+ updatedAt: now,
594
+ })
595
+ .run();
596
+
597
+ expect(countRunningScheduledSlots()).toBe(1);
598
+ });
599
+
600
+ it("writes leaseExpiresAt = slotClaimedAt + leaseSec", () => {
601
+ const pid = seedProject();
602
+ const sid = seedSchedule(pid);
603
+ const tid = seedQueuedTask(sid);
604
+
605
+ const before = Date.now();
606
+ claimSlot(tid, 10, 60);
607
+ const row = db.select().from(tasks).where(eq(tasks.id, tid)).get();
608
+
609
+ expect(row?.slotClaimedAt?.getTime()).toBeGreaterThanOrEqual(before);
610
+ expect(
611
+ row!.leaseExpiresAt!.getTime() - row!.slotClaimedAt!.getTime(),
612
+ ).toBe(60 * 1000);
613
+ });
614
+ });
615
+ ```
616
+
617
+ - [ ] **Step 3.2: Run to verify RED**
618
+
619
+ Run: `npx vitest run src/lib/schedules/__tests__/slot-claim.test.ts`
620
+ Expected: FAIL — module `../slot-claim` does not exist.
621
+
622
+ - [ ] **Step 3.3: Implement slot-claim.ts**
623
+
624
+ Create `src/lib/schedules/slot-claim.ts`:
625
+
626
+ ```typescript
627
+ import { sqlite } from "@/lib/db";
628
+
629
+ export interface ClaimResult {
630
+ claimed: boolean;
631
+ }
632
+
633
+ /**
634
+ * Atomic slot claim: moves a queued scheduled/heartbeat task to running IFF
635
+ * fewer than `cap` scheduled/heartbeat tasks are currently running.
636
+ *
637
+ * Must be a single SQL statement — check-then-act would race between the
638
+ * scheduler tick loop and the drain loop that scheduler.ts currently dispatches
639
+ * concurrently, so the count subquery lives inside the UPDATE's WHERE clause.
640
+ *
641
+ * Returns `{ claimed: true }` when exactly one row transitioned; `{ claimed: false }`
642
+ * when the task is not queued (already claimed), is not a scheduled/heartbeat
643
+ * task, or the cap is already reached (so any `cap <= 0` never claims).
644
+ * NOTE(review): timestamps are written as Date.now() milliseconds, while Step 1.1
645
+ * declares these columns with Drizzle `mode: "timestamp"` — confirm the units agree.
646
+ */
647
+ export function claimSlot(
648
+ taskId: string,
649
+ cap: number,
650
+ leaseSec: number,
651
+ ): ClaimResult {
652
+ const now = Date.now(); // epoch milliseconds
653
+ const leaseExpires = now + leaseSec * 1000; // lease length converted from seconds
654
+
655
+ const stmt = sqlite.prepare(
656
+ "UPDATE tasks SET status = 'running', slot_claimed_at = ?, lease_expires_at = ?, updated_at = ? WHERE id = ? AND status = 'queued' AND source_type IN ('scheduled', 'heartbeat') AND (SELECT COUNT(*) FROM tasks WHERE status = 'running' AND source_type IN ('scheduled', 'heartbeat')) < ?",
657
+ );
658
+
659
+ const result = stmt.run(now, leaseExpires, now, taskId, cap); // bound in placeholder order
660
+ return { claimed: result.changes === 1 };
661
+ }
662
+
663
+ /**
664
+ * Returns how many tasks have status 'running' and a source type of 'scheduled'
665
+ * or 'heartbeat' — for the drain loop, manual-execute endpoint, and telemetry.
666
+ */
667
+ export function countRunningScheduledSlots(): number {
668
+   const countSql =
669
+     "SELECT COUNT(*) AS n FROM tasks WHERE status = 'running' AND source_type IN ('scheduled', 'heartbeat')";
670
+   const counted = sqlite.prepare(countSql).get() as { n: number } | undefined;
671
+   // A missing row is reported as zero running slots.
672
+   if (counted == null) return 0;
673
+   return counted.n ?? 0;
674
+ }
675
+ ```
676
+
677
+ - [ ] **Step 3.4: Run to verify GREEN**
678
+
679
+ Run: `npx vitest run src/lib/schedules/__tests__/slot-claim.test.ts`
680
+ Expected: PASS — 7 tests pass.
681
+
682
+ - [ ] **Step 3.5: Commit**
683
+
684
+ ```bash
685
+ git add src/lib/schedules/slot-claim.ts src/lib/schedules/__tests__/slot-claim.test.ts
686
+ git commit -m "feat(schedules): atomic slot claim primitive with race coverage"
687
+ ```
688
+
689
+ ---
690
+
691
+ ## Task 4: Lease reaper
692
+
693
+ **Files:**
694
+ - Modify: `src/lib/schedules/slot-claim.ts`
695
+ - Modify: `src/lib/schedules/__tests__/slot-claim.test.ts` (extend)
696
+
697
+ - [ ] **Step 4.1: Append failing reaper tests**
698
+
699
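+ Append to `src/lib/schedules/__tests__/slot-claim.test.ts` (the `import` line shown first belongs with the existing `../slot-claim` import at the top of the file, not mid-file):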
700
+
701
+ ```typescript
702
+ import { reapExpiredLeases } from "../slot-claim";
703
+
704
+ describe("reapExpiredLeases", () => {
705
+ beforeEach(() => {
706
+ db.delete(tasks).run();
707
+ db.delete(schedules).run();
708
+ db.delete(projects).run();
709
+ });
710
+
711
+ it("marks an expired running task as failed with failure_reason=lease_expired", () => {
712
+ const pid = seedProject();
713
+ const sid = seedSchedule(pid);
714
+ const tid = seedQueuedTask(sid);
715
+
716
+ // Claim with a 1-second lease, then fast-forward via direct DB edit
717
+ claimSlot(tid, 10, 1);
718
+ const past = new Date(Date.now() - 5000);
719
+ db.update(tasks)
720
+ .set({ leaseExpiresAt: past })
721
+ .where(eq(tasks.id, tid))
722
+ .run();
723
+
724
+ const reaped = reapExpiredLeases();
725
+
726
+ expect(reaped).toEqual([tid]);
727
+ const row = db.select().from(tasks).where(eq(tasks.id, tid)).get();
728
+ expect(row?.status).toBe("failed");
729
+ expect(row?.failureReason).toBe("lease_expired");
730
+ });
731
+
732
+ it("leaves fresh running tasks alone", () => {
733
+ const pid = seedProject();
734
+ const sid = seedSchedule(pid);
735
+ const tid = seedQueuedTask(sid);
736
+
737
+ claimSlot(tid, 10, 3600); // 1-hour lease
738
+
739
+ const reaped = reapExpiredLeases();
740
+
741
+ expect(reaped).toEqual([]);
742
+ const row = db.select().from(tasks).where(eq(tasks.id, tid)).get();
743
+ expect(row?.status).toBe("running");
744
+ });
745
+
746
+ it("reaps multiple expired tasks in one sweep", () => {
747
+ const pid = seedProject();
748
+ const tids: string[] = [];
749
+ for (let i = 0; i < 3; i++) {
750
+ const sid = seedSchedule(pid);
751
+ const tid = seedQueuedTask(sid);
752
+ claimSlot(tid, 10, 1);
753
+ tids.push(tid);
754
+ }
755
+ const past = new Date(Date.now() - 5000);
756
+ for (const tid of tids) {
757
+ db.update(tasks)
758
+ .set({ leaseExpiresAt: past })
759
+ .where(eq(tasks.id, tid))
760
+ .run();
761
+ }
762
+
763
+ const reaped = reapExpiredLeases();
764
+
765
+ expect(reaped.sort()).toEqual([...tids].sort());
766
+ expect(countRunningScheduledSlots()).toBe(0);
767
+ });
768
+ });
769
+ ```
770
+
771
+ - [ ] **Step 4.2: Run to verify RED**
772
+
773
+ Run: `npx vitest run src/lib/schedules/__tests__/slot-claim.test.ts`
774
+ Expected: FAIL — `reapExpiredLeases` not exported.
775
+
776
+ - [ ] **Step 4.3: Implement reapExpiredLeases**
777
+
778
+ Append to `src/lib/schedules/slot-claim.ts` (place the `import` line with the existing imports at the top of the file):
779
+
780
+ ```typescript
781
+ import { getExecution } from "@/lib/agents/execution-manager";
782
+
783
+ /**
784
+ * Reap running scheduled/heartbeat tasks whose lease has expired. For each
785
+ * expired task: (1) abort the in-memory execution via its AbortController, if
786
+ * one is registered, (2) mark the DB row as failed with
787
+ * failure_reason='lease_expired'. Returns the IDs whose row actually changed.
788
+ * NOTE(review): compares lease_expires_at against Date.now() milliseconds —
789
+ * confirm every writer of that column uses the same unit.
790
+ */
791
+ export function reapExpiredLeases(): string[] {
792
+   const now = Date.now();
793
+   const expiredRows = sqlite
794
+     .prepare(
795
+       "SELECT id FROM tasks WHERE status = 'running' AND source_type IN ('scheduled', 'heartbeat') AND lease_expires_at IS NOT NULL AND lease_expires_at < ?",
796
+     )
797
+     .all(now) as Array<{ id: string }>;
798
+   if (expiredRows.length === 0) return [];
799
+
800
+   // Prepared once per sweep rather than once per expired task.
801
+   const markFailed = sqlite.prepare(
802
+     "UPDATE tasks SET status = 'failed', failure_reason = 'lease_expired', updated_at = ? WHERE id = ? AND status = 'running'",
803
+   );
804
+
805
+   const reaped: string[] = [];
806
+   for (const { id } of expiredRows) {
807
+     // Abort the in-process execution so the SDK stops immediately
808
+     const execution = getExecution(id);
809
+     if (execution) {
810
+       try {
811
+         execution.abortController.abort();
812
+       } catch {
813
+         // Best effort — a failed abort must not block the DB update
814
+       }
815
+     }
816
+     // The `status = 'running'` guard keeps repeated sweeps idempotent.
817
+     if (markFailed.run(now, id).changes === 1) reaped.push(id);
818
+   }
819
+   return reaped;
820
+ }
821
+ ```
822
+
823
+ - [ ] **Step 4.4: Run to verify GREEN**
824
+
825
+ Run: `npx vitest run src/lib/schedules/__tests__/slot-claim.test.ts`
826
+ Expected: PASS — 10 tests (7 claim + 3 reap).
827
+
828
+ - [ ] **Step 4.5: Commit**
829
+
830
+ ```bash
831
+ git add src/lib/schedules/slot-claim.ts src/lib/schedules/__tests__/slot-claim.test.ts
832
+ git commit -m "feat(schedules): lease reaper aborts hung runs via AbortController"
833
+ ```
834
+
835
+ ---
836
+
837
+ ## Task 5: Chat active-streams tracker
838
+
839
+ **Files:**
840
+ - Create: `src/lib/chat/active-streams.ts`
841
+ - Test: `src/lib/chat/__tests__/active-streams.test.ts`
842
+ - Modify: `src/lib/chat/engine.ts` (register/unregister)
843
+
844
+ - [ ] **Step 5.1: Write failing tests**
845
+
846
+ Create `src/lib/chat/__tests__/active-streams.test.ts`:
847
+
848
+ ```typescript
849
+ import { describe, it, expect, beforeEach } from "vitest";
850
+ import {
851
+ registerChatStream,
852
+ unregisterChatStream,
853
+ getActiveChatStreamCount,
854
+ isAnyChatStreaming,
855
+ } from "../active-streams";
856
+
857
+ describe("active chat streams", () => {
858
+ beforeEach(() => {
859
+ for (const id of ["a", "b", "c"]) unregisterChatStream(id);
860
+ });
861
+
862
+ it("starts empty", () => {
863
+ expect(getActiveChatStreamCount()).toBe(0);
864
+ expect(isAnyChatStreaming()).toBe(false);
865
+ });
866
+
867
+ it("tracks a single registered stream", () => {
868
+ registerChatStream("a");
869
+ expect(getActiveChatStreamCount()).toBe(1);
870
+ expect(isAnyChatStreaming()).toBe(true);
871
+ });
872
+
873
+ it("tracks multiple streams independently", () => {
874
+ registerChatStream("a");
875
+ registerChatStream("b");
876
+ expect(getActiveChatStreamCount()).toBe(2);
877
+ });
878
+
879
+ it("is idempotent — registering the same id twice still counts as one", () => {
880
+ registerChatStream("a");
881
+ registerChatStream("a");
882
+ expect(getActiveChatStreamCount()).toBe(1);
883
+ });
884
+
885
+ it("unregisters by id", () => {
886
+ registerChatStream("a");
887
+ registerChatStream("b");
888
+ unregisterChatStream("a");
889
+ expect(getActiveChatStreamCount()).toBe(1);
890
+ expect(isAnyChatStreaming()).toBe(true);
891
+ });
892
+
893
+ it("unregistering a non-existent id is a no-op", () => {
894
+ expect(() => unregisterChatStream("never-registered")).not.toThrow();
895
+ expect(getActiveChatStreamCount()).toBe(0);
896
+ });
897
+ });
898
+ ```
899
+
900
+ - [ ] **Step 5.2: Run to verify RED**
901
+
902
+ Run: `npx vitest run src/lib/chat/__tests__/active-streams.test.ts`
903
+ Expected: FAIL — module does not exist.
904
+
905
+ - [ ] **Step 5.3: Implement active-streams.ts**
906
+
907
+ Create `src/lib/chat/active-streams.ts`:
908
+
909
+ ```typescript
910
+ /**
911
+ * In-memory tracker for chat conversations that currently have an SSE stream
912
+ * in flight. Used by the scheduler tick loop to apply a soft pressure signal
913
+ * — when chat is active, new schedule firings are deferred by N seconds to
914
+ * keep the Node event loop responsive for the user's conversation.
915
+ * NOTE(review): tracked per conversation id, not per stream — overlapping streams for one conversation would be cleared by the first unregister; confirm that cannot happen.
916
+ * Module-level state; single-process (same Node instance as the scheduler).
917
+ * Must NOT be persisted — crash recovery relies on the set starting empty.
918
+ */
919
+
920
+ const activeStreams = new Set<string>(); // conversation ids; a Set, so each id counts at most once
921
+
922
+ export function registerChatStream(conversationId: string): void { // mark a conversation as streaming
923
+ activeStreams.add(conversationId); // no effect if the id is already present
924
+ }
925
+
926
+ export function unregisterChatStream(conversationId: string): void { // mark a conversation as no longer streaming
927
+ activeStreams.delete(conversationId); // no effect if the id is absent
928
+ }
929
+
930
+ export function getActiveChatStreamCount(): number { // number of distinct registered conversation ids
931
+ return activeStreams.size;
932
+ }
933
+
934
+ export function isAnyChatStreaming(): boolean { // true when at least one conversation id is registered
935
+ return activeStreams.size > 0;
936
+ }
937
+ ```
938
+
939
+ - [ ] **Step 5.4: Run to verify GREEN**
940
+
941
+ Run: `npx vitest run src/lib/chat/__tests__/active-streams.test.ts`
942
+ Expected: PASS — 6 tests.
943
+
944
+ - [ ] **Step 5.5: Wire engine.ts to register/unregister**
945
+
946
+ Edit `src/lib/chat/engine.ts`. Add import at the top:
947
+
948
+ ```typescript
949
+ import { registerChatStream, unregisterChatStream } from "./active-streams";
950
+ ```
951
+
952
+ Find the stream start point (just before `yield { type: "status", phase: "connecting" ... }` near line 280) and call:
953
+
954
+ ```typescript
955
+ registerChatStream(conversationId);
956
+ ```
957
+
958
+ In the top-level `finally` block (the one that already calls `finalizeStreamingMessage()` from Spec B), add `unregisterChatStream(conversationId);` alongside the finalize call so the set is cleared even on abnormal exit.
959
+
960
+ - [ ] **Step 5.6: Run chat tests**
961
+
962
+ Run: `npx vitest run src/lib/chat`
963
+ Expected: PASS — all existing chat tests still pass; new active-streams tests pass.
964
+
965
+ - [ ] **Step 5.7: Commit**
966
+
967
+ ```bash
968
+ git add src/lib/chat/active-streams.ts src/lib/chat/__tests__/active-streams.test.ts src/lib/chat/engine.ts
969
+ git commit -m "feat(chat): track active streams for scheduler pressure signal"
970
+ ```
971
+
972
+ ---
973
+
974
+ ## Task 6: Wire slot claim + reaper + chat pressure into tickScheduler
975
+
976
+ **Files:**
977
+ - Modify: `src/lib/schedules/scheduler.ts`
978
+ - Test: `src/lib/schedules/__tests__/tick-scheduler.test.ts`
979
+
980
+ - [ ] **Step 6.1: Write failing tick-scheduler tests**
981
+
982
+ Create `src/lib/schedules/__tests__/tick-scheduler.test.ts`:
983
+
984
+ ```typescript
985
+ import { describe, it, expect, beforeEach, vi } from "vitest";
986
+ import { db } from "@/lib/db";
987
+ import { tasks, schedules, projects, settings } from "@/lib/db/schema";
988
+ import { eq } from "drizzle-orm";
989
+ import { randomUUID } from "crypto";
990
+ import { tickScheduler } from "../scheduler";
991
+ import { registerChatStream, unregisterChatStream } from "@/lib/chat/active-streams";
992
+
993
+ // Stub the runtime — we're testing coordination, not the SDK
994
+ vi.mock("@/lib/agents/runtime", () => ({
995
+ executeTaskWithRuntime: vi.fn().mockResolvedValue(undefined),
996
+ }));
997
+
998
+ function seedProject(): string {
999
+ const id = randomUUID();
1000
+ const now = new Date();
1001
+ db.insert(projects)
1002
+ .values({ id, name: "test", status: "active", createdAt: now, updatedAt: now })
1003
+ .run();
1004
+ return id;
1005
+ }
1006
+
1007
+ function seedScheduleDue(projectId: string, nextFireAt: Date): string {
1008
+ const id = randomUUID();
1009
+ const now = new Date();
1010
+ db.insert(schedules)
1011
+ .values({
1012
+ id,
1013
+ projectId,
1014
+ name: `sched-${id.slice(0, 4)}`,
1015
+ prompt: "test prompt",
1016
+ cronExpression: "* * * * *",
1017
+ status: "active",
1018
+ type: "scheduled",
1019
+ firingCount: 0,
1020
+ suppressionCount: 0,
1021
+ heartbeatSpentToday: 0,
1022
+ failureStreak: 0,
1023
+ turnBudgetBreachStreak: 0,
1024
+ nextFireAt,
1025
+ createdAt: now,
1026
+ updatedAt: now,
1027
+ })
1028
+ .run();
1029
+ return id;
1030
+ }
1031
+
1032
+ describe("tickScheduler with concurrency cap", () => {
1033
+ beforeEach(() => {
1034
+ db.delete(tasks).run();
1035
+ db.delete(schedules).run();
1036
+ db.delete(projects).run();
1037
+ db.delete(settings).where(eq(settings.key, "schedule.maxConcurrent")).run();
1038
+ db.insert(settings)
1039
+ .values({ key: "schedule.maxConcurrent", value: "2", updatedAt: new Date() })
1040
+ .run();
1041
+ for (const id of ["x", "y", "z"]) unregisterChatStream(id);
1042
+ });
1043
+
1044
+ it("fires up to cap schedules, queues the rest", async () => {
1045
+ const pid = seedProject();
1046
+ const past = new Date(Date.now() - 10_000);
1047
+ for (let i = 0; i < 5; i++) seedScheduleDue(pid, past);
1048
+
1049
+ await tickScheduler();
1050
+
1051
+ const runningCount = db
1052
+ .select()
1053
+ .from(tasks)
1054
+ .where(eq(tasks.status, "running"))
1055
+ .all().length;
1056
+ const queuedCount = db
1057
+ .select()
1058
+ .from(tasks)
1059
+ .where(eq(tasks.status, "queued"))
1060
+ .all().length;
1061
+
1062
+ expect(runningCount).toBe(2); // cap=2
1063
+ expect(queuedCount).toBe(3); // remaining 3 waiting
1064
+ });
1065
+
1066
+ it("defers new firings when chat is active", async () => {
1067
+ const pid = seedProject();
1068
+ const past = new Date(Date.now() - 10_000);
1069
+ const sid = seedScheduleDue(pid, past);
1070
+
1071
+ registerChatStream("x");
1072
+
1073
+ await tickScheduler();
1074
+
1075
+ // No task should have been created
1076
+ const taskCount = db.select().from(tasks).all().length;
1077
+ expect(taskCount).toBe(0);
1078
+
1079
+ // The schedule's next_fire_at should have been pushed forward ≥25s
1080
+ const row = db.select().from(schedules).where(eq(schedules.id, sid)).get();
1081
+ expect(row?.nextFireAt?.getTime()).toBeGreaterThan(Date.now() + 25 * 1000);
1082
+
1083
+ unregisterChatStream("x");
1084
+ });
1085
+ });
1086
+ ```
1087
+
1088
+ - [ ] **Step 6.2: Run to verify RED**
1089
+
1090
+ Run: `npx vitest run src/lib/schedules/__tests__/tick-scheduler.test.ts`
1091
+ Expected: FAIL — cap enforcement not yet wired; all 5 schedules would fire.
1092
+
1093
+ - [ ] **Step 6.3: Wire reaper + chat pressure + atomic claim into scheduler.ts**
1094
+
1095
+ Edit `src/lib/schedules/scheduler.ts`. Add imports at the top alongside existing imports:
1096
+
1097
+ ```typescript
1098
+ import { claimSlot, reapExpiredLeases, countRunningScheduledSlots } from "./slot-claim";
1099
+ import { isAnyChatStreaming } from "@/lib/chat/active-streams";
1100
+ import {
1101
+ getScheduleMaxConcurrent,
1102
+ getScheduleMaxRunDurationSec,
1103
+ getScheduleChatPressureDelaySec,
1104
+ } from "./config";
1105
+ ```
1106
+
1107
+ At the top of `tickScheduler()` (around line 221), add the reaper pass:
1108
+
1109
+ ```typescript
1110
+ export async function tickScheduler(): Promise<void> {
1111
+ // Reap any running tasks whose lease has expired before claiming new slots.
1112
+ try {
1113
+ const reaped = reapExpiredLeases();
1114
+ if (reaped.length > 0) {
1115
+ console.warn(
1116
+ `[scheduler] reaped ${reaped.length} expired lease(s): ${reaped.join(", ")}`,
1117
+ );
1118
+ }
1119
+ } catch (err) {
1120
+ console.error("[scheduler] lease reaper error:", err);
1121
+ }
1122
+
1123
+ const now = new Date();
1124
+ // ... existing function body continues unchanged ...
1125
+ ```
1126
+
1127
+ Right after fetching `dueSchedules` (around line 224-232) and before the `for` loop, add the chat pressure check:
1128
+
1129
+ ```typescript
1130
+ // Chat soft pressure: defer new firings by N seconds when a chat stream is
1131
+ // in flight. In-flight scheduled runs are NOT affected — this only gates
1132
+ // new claims.
1133
+ if (isAnyChatStreaming() && dueSchedules.length > 0) {
1134
+ const delayMs = getScheduleChatPressureDelaySec() * 1000;
1135
+ const deferredUntil = new Date(now.getTime() + delayMs);
1136
+ for (const schedule of dueSchedules) {
1137
+ await db
1138
+ .update(schedules)
1139
+ .set({ nextFireAt: deferredUntil, updatedAt: now })
1140
+ .where(eq(schedules.id, schedule.id));
1141
+ }
1142
+ console.log(
1143
+ `[scheduler] chat streaming — deferred ${dueSchedules.length} firings by ${delayMs}ms`,
1144
+ );
1145
+ return;
1146
+ }
1147
+ ```
1148
+
1149
+ In `fireSchedule()` (around line 300-445), after the task INSERT and BEFORE the `executeTaskWithRuntime(taskId)` call at line 412, add the atomic claim:
1150
+
1151
+ ```typescript
1152
+ // Atomic slot claim — if the global cap is full, leave the task in queued
1153
+ // state. drainQueue will pick it up when a running slot frees.
1154
+ const cap = getScheduleMaxConcurrent();
1155
+ const leaseSec = schedule.maxRunDurationSec ?? getScheduleMaxRunDurationSec();
1156
+ const { claimed } = claimSlot(taskId, cap, leaseSec);
1157
+
1158
+ if (!claimed) {
1159
+ console.log(
1160
+ `[scheduler] schedule "${schedule.name}" queued — cap full (${countRunningScheduledSlots()}/${cap})`,
1161
+ );
1162
+ return;
1163
+ }
1164
+ ```
1165
+
1166
+ In `drainQueue()` (around line 51-95), replace the body of the `while (true)` loop so it claims slots atomically and stops when the cap is full:
1167
+
1168
+ ```typescript
1169
+ while (true) {
1170
+ const cap = getScheduleMaxConcurrent();
1171
+ if (countRunningScheduledSlots() >= cap) return;
1172
+
1173
+ const [nextQueued] = await db
1174
+ .select({ id: tasks.id })
1175
+ .from(tasks)
1176
+ .where(
1177
+ and(
1178
+ eq(tasks.status, "queued"),
1179
+ inArray(tasks.sourceType, ["scheduled", "heartbeat"])
1180
+ )
1181
+ )
1182
+ .orderBy(asc(tasks.createdAt))
1183
+ .limit(1);
1184
+
1185
+ if (!nextQueued) return;
1186
+
1187
+ const leaseSec = getScheduleMaxRunDurationSec();
1188
+ const { claimed } = claimSlot(nextQueued.id, cap, leaseSec);
1189
+ if (!claimed) return; // lost race or cap filled again
1190
+
1191
+ console.log(`[scheduler] draining queue → running task ${nextQueued.id}`);
1192
+ try {
1193
+ await executeTaskWithRuntime(nextQueued.id);
1194
+ } catch (err) {
1195
+ console.error(`[scheduler] drain task ${nextQueued.id} failed:`, err);
1196
+ }
1197
+
1198
+ try {
1199
+ const [taskRow] = await db
1200
+ .select({ scheduleId: tasks.scheduleId })
1201
+ .from(tasks)
1202
+ .where(eq(tasks.id, nextQueued.id));
1203
+ if (taskRow?.scheduleId) {
1204
+ await recordFiringMetrics(taskRow.scheduleId, nextQueued.id);
1205
+ }
1206
+ } catch (err) {
1207
+ console.error(`[scheduler] metrics recording failed for ${nextQueued.id}:`, err);
1208
+ }
1209
+ }
1210
+ ```
1211
+
1212
+ - [ ] **Step 6.4: Run tick-scheduler tests to verify GREEN**
1213
+
1214
+ Run: `npx vitest run src/lib/schedules/__tests__/tick-scheduler.test.ts`
1215
+ Expected: PASS — both cap-enforcement and chat-pressure tests.
1216
+
1217
+ - [ ] **Step 6.5: Run full scheduler test suite**
1218
+
1219
+ Run: `npx vitest run src/lib/schedules`
1220
+ Expected: PASS — no regressions.
1221
+
1222
+ - [ ] **Step 6.6: Run full test suite**
1223
+
1224
+ Run: `npx vitest run`
1225
+ Expected: PASS across the codebase.
1226
+
1227
+ - [ ] **Step 6.7: Commit**
1228
+
1229
+ ```bash
1230
+ git add src/lib/schedules/scheduler.ts src/lib/schedules/__tests__/tick-scheduler.test.ts
1231
+ git commit -m "feat(schedules): enforce global concurrency cap with lease reaper + chat pressure"
1232
+ ```
1233
+
1234
+ ---
1235
+
1236
+ ## Task 7: Per-schedule turn budget propagation
1237
+
1238
+ **Files:**
1239
+ - Modify: `src/lib/schedules/scheduler.ts` (populate `tasks.max_turns` at firing)
1240
+ - Modify: `src/lib/agents/claude-agent.ts` (override `ctx.maxTurns` from `task.maxTurns`)
1241
+ - Test: `src/lib/schedules/__tests__/turn-budget.test.ts`
1242
+
1243
+ - [ ] **Step 7.1: Write failing test for task.maxTurns propagation**
1244
+
1245
+ Create `src/lib/schedules/__tests__/turn-budget.test.ts`:
1246
+
1247
+ ```typescript
1248
+ import { describe, it, expect, beforeEach, vi } from "vitest";
1249
+ import { db } from "@/lib/db";
1250
+ import { tasks, schedules, projects, settings } from "@/lib/db/schema";
1251
+ import { eq } from "drizzle-orm";
1252
+ import { randomUUID } from "crypto";
1253
+ import { tickScheduler } from "../scheduler";
1254
+
1255
+ vi.mock("@/lib/agents/runtime", () => ({
1256
+ executeTaskWithRuntime: vi.fn().mockResolvedValue(undefined),
1257
+ }));
1258
+
1259
+ describe("per-schedule turn budget propagation", () => {
1260
+ beforeEach(() => {
1261
+ db.delete(tasks).run();
1262
+ db.delete(schedules).run();
1263
+ db.delete(projects).run();
1264
+ db.delete(settings).where(eq(settings.key, "schedule.maxConcurrent")).run();
1265
+ db.insert(settings)
1266
+ .values({ key: "schedule.maxConcurrent", value: "10", updatedAt: new Date() })
1267
+ .run();
1268
+ });
1269
+
1270
+ it("copies schedules.max_turns into tasks.max_turns at firing time", async () => {
1271
+ const pid = randomUUID();
1272
+ const sid = randomUUID();
1273
+ const now = new Date();
1274
+ db.insert(projects)
1275
+ .values({ id: pid, name: "p", status: "active", createdAt: now, updatedAt: now })
1276
+ .run();
1277
+ db.insert(schedules)
1278
+ .values({
1279
+ id: sid,
1280
+ projectId: pid,
1281
+ name: "bounded",
1282
+ prompt: "test",
1283
+ cronExpression: "* * * * *",
1284
+ status: "active",
1285
+ type: "scheduled",
1286
+ firingCount: 0,
1287
+ suppressionCount: 0,
1288
+ heartbeatSpentToday: 0,
1289
+ failureStreak: 0,
1290
+ turnBudgetBreachStreak: 0,
1291
+ nextFireAt: new Date(now.getTime() - 10_000),
1292
+ maxTurns: 42,
1293
+ createdAt: now,
1294
+ updatedAt: now,
1295
+ })
1296
+ .run();
1297
+
1298
+ await tickScheduler();
1299
+
1300
+ const [task] = db.select().from(tasks).where(eq(tasks.scheduleId, sid)).all();
1301
+ expect(task?.maxTurns).toBe(42);
1302
+ });
1303
+
1304
+ it("leaves tasks.max_turns null when schedules.max_turns is null", async () => {
1305
+ const pid = randomUUID();
1306
+ const sid = randomUUID();
1307
+ const now = new Date();
1308
+ db.insert(projects)
1309
+ .values({ id: pid, name: "p", status: "active", createdAt: now, updatedAt: now })
1310
+ .run();
1311
+ db.insert(schedules)
1312
+ .values({
1313
+ id: sid,
1314
+ projectId: pid,
1315
+ name: "unbounded",
1316
+ prompt: "test",
1317
+ cronExpression: "* * * * *",
1318
+ status: "active",
1319
+ type: "scheduled",
1320
+ firingCount: 0,
1321
+ suppressionCount: 0,
1322
+ heartbeatSpentToday: 0,
1323
+ failureStreak: 0,
1324
+ turnBudgetBreachStreak: 0,
1325
+ nextFireAt: new Date(now.getTime() - 10_000),
1326
+ createdAt: now,
1327
+ updatedAt: now,
1328
+ })
1329
+ .run();
1330
+
1331
+ await tickScheduler();
1332
+
1333
+ const [task] = db.select().from(tasks).where(eq(tasks.scheduleId, sid)).all();
1334
+ expect(task?.maxTurns).toBeNull();
1335
+ });
1336
+ });
1337
+ ```
1338
+
1339
+ - [ ] **Step 7.2: Run to verify RED**
1340
+
1341
+ Run: `npx vitest run src/lib/schedules/__tests__/turn-budget.test.ts`
1342
+ Expected: FAIL — `task.maxTurns` will be null because the scheduler doesn't copy it yet.
1343
+
1344
+ - [ ] **Step 7.3: Populate tasks.max_turns in fireSchedule**
1345
+
1346
+ Edit `src/lib/schedules/scheduler.ts`, inside `fireSchedule()` at the `db.insert(tasks).values({ ... })` call (around line 350-364). Add `maxTurns: schedule.maxTurns,` to the inserted values, placed before `createdAt`:
1347
+
1348
+ ```typescript
1349
+ await db.insert(tasks).values({
1350
+ id: taskId,
1351
+ projectId: schedule.projectId,
1352
+ workflowId: null,
1353
+ scheduleId: schedule.id,
1354
+ title: `${schedule.name} — firing #${firingNumber}`,
1355
+ description: budgetHeader + schedule.prompt,
1356
+ status: "queued",
1357
+ assignedAgent: schedule.assignedAgent,
1358
+ agentProfile: schedule.agentProfile,
1359
+ priority: 2,
1360
+ sourceType: "scheduled",
1361
+ maxTurns: schedule.maxTurns, // per-schedule override, NULL = inherit global
1362
+ createdAt: now,
1363
+ updatedAt: now,
1364
+ });
1365
+ ```
1366
+
1367
+ Make the same change in `fireHeartbeat()` (around lines 528-542) for the heartbeat task insert.
1368
+
1369
+ - [ ] **Step 7.4: Run to verify GREEN**
1370
+
1371
+ Run: `npx vitest run src/lib/schedules/__tests__/turn-budget.test.ts`
1372
+ Expected: PASS — both propagation tests pass.
1373
+
1374
+ - [ ] **Step 7.5: Override ctx.maxTurns in executeClaudeTask**
1375
+
1376
+ Edit `src/lib/agents/claude-agent.ts`. Find `executeClaudeTask()` (line 416-499). After `const ctx = await buildTaskQueryContext(task, agentProfileId);` (around line 433), add:
1377
+
1378
+ ```typescript
1379
+ // Per-schedule override: if the task carries its own maxTurns (set by
1380
+ // fireSchedule from schedules.maxTurns), it takes precedence over the
1381
+ // profile default. This is the runtime-enforced budget cap.
1382
+ const effectiveMaxTurns = task.maxTurns ?? ctx.maxTurns;
1383
+ ```
1384
+
1385
+ In the SDK `query()` call options (around line 456), replace `maxTurns: ctx.maxTurns,` with `maxTurns: effectiveMaxTurns,`.
1386
+
1387
+ Do the same in `resumeClaudeTask()` (around line 570).
1388
+
1389
+ - [ ] **Step 7.6: Run full test suite**
1390
+
1391
+ Run: `npx vitest run`
1392
+ Expected: PASS — no regressions.
1393
+
1394
+ - [ ] **Step 7.7: Commit**
1395
+
1396
+ ```bash
1397
+ git add src/lib/schedules/scheduler.ts src/lib/agents/claude-agent.ts src/lib/schedules/__tests__/turn-budget.test.ts
1398
+ git commit -m "feat(schedules): propagate per-schedule max_turns into SDK query options"
1399
+ ```
1400
+
1401
+ ---
1402
+
1403
+ ## Task 8: Separate `turnBudgetBreachStreak` with first-breach grace
1404
+
1405
+ **Files:**
1406
+ - Modify: `src/lib/schedules/scheduler.ts` (`recordFiringMetrics`)
1407
+ - Modify: `src/lib/schedules/__tests__/turn-budget.test.ts` (extend)
1408
+
1409
+ The existing `recordFiringMetrics()` at scheduler.ts:140-186 uses a single `failureStreak`. Split turn-budget breaches into their own counter so a misconfigured `maxTurns` doesn't auto-pause via the generic threshold of 3.
1410
+
1411
+ - [ ] **Step 8.1: Append failing tests for streak split + grace + auto-pause**
1412
+
1413
+ Append to `src/lib/schedules/__tests__/turn-budget.test.ts`:
1414
+
1415
+ ```typescript
1416
+ import { recordFiringMetrics } from "../scheduler";
1417
+
1418
/**
 * Test fixture: inserts a task row that has already failed with an explicit
 * `turn_limit_exceeded` failure reason and is linked to the given schedule.
 *
 * @param scheduleId - id of the schedule the failed firing belongs to
 * @returns the id of the inserted task
 */
async function seedBreachedTask(scheduleId: string): Promise<string> {
  const taskId = randomUUID();
  const createdAt = new Date();

  db.insert(tasks)
    .values({
      id: taskId,
      scheduleId,
      title: "firing",
      status: "failed",
      result: "Agent exhausted its turn limit (42 turns used)",
      priority: 2,
      sourceType: "scheduled",
      resumeCount: 0,
      failureReason: "turn_limit_exceeded",
      createdAt,
      updatedAt: createdAt,
    })
    .run();

  return taskId;
}
1438
+
1439
+ describe("turn_budget_breach_streak", () => {
1440
+ beforeEach(() => {
1441
+ db.delete(tasks).run();
1442
+ db.delete(schedules).run();
1443
+ db.delete(projects).run();
1444
+ });
1445
+
1446
+ it("does NOT increment generic failureStreak on turn-budget breach", async () => {
1447
+ const pid = randomUUID();
1448
+ const sid = randomUUID();
1449
+ const now = new Date();
1450
+ db.insert(projects)
1451
+ .values({ id: pid, name: "p", status: "active", createdAt: now, updatedAt: now })
1452
+ .run();
1453
+ db.insert(schedules)
1454
+ .values({
1455
+ id: sid,
1456
+ projectId: pid,
1457
+ name: "bounded",
1458
+ prompt: "test",
1459
+ cronExpression: "* * * * *",
1460
+ status: "active",
1461
+ type: "scheduled",
1462
+ firingCount: 1,
1463
+ suppressionCount: 0,
1464
+ heartbeatSpentToday: 0,
1465
+ failureStreak: 0,
1466
+ turnBudgetBreachStreak: 0,
1467
+ maxTurns: 20,
1468
+ maxTurnsSetAt: new Date(now.getTime() - 86400_000), // yesterday
1469
+ createdAt: now,
1470
+ updatedAt: now,
1471
+ })
1472
+ .run();
1473
+
1474
+ const tid = await seedBreachedTask(sid);
1475
+ await recordFiringMetrics(sid, tid);
1476
+
1477
+ const row = db.select().from(schedules).where(eq(schedules.id, sid)).get();
1478
+ expect(row?.failureStreak).toBe(0);
1479
+ expect(row?.turnBudgetBreachStreak).toBe(1);
1480
+ });
1481
+
1482
+ it("applies first-breach grace when maxTurns was set recently", async () => {
1483
+ const pid = randomUUID();
1484
+ const sid = randomUUID();
1485
+ const now = new Date();
1486
+ db.insert(projects)
1487
+ .values({ id: pid, name: "p", status: "active", createdAt: now, updatedAt: now })
1488
+ .run();
1489
+ db.insert(schedules)
1490
+ .values({
1491
+ id: sid,
1492
+ projectId: pid,
1493
+ name: "bounded",
1494
+ prompt: "test",
1495
+ cronExpression: "0 * * * *", // hourly
1496
+ status: "active",
1497
+ type: "scheduled",
1498
+ firingCount: 1,
1499
+ suppressionCount: 0,
1500
+ heartbeatSpentToday: 0,
1501
+ failureStreak: 0,
1502
+ turnBudgetBreachStreak: 0,
1503
+ maxTurns: 20,
1504
+ // maxTurnsSetAt 30 min ago → first firing after edit → grace applies
1505
+ maxTurnsSetAt: new Date(now.getTime() - 30 * 60 * 1000),
1506
+ createdAt: now,
1507
+ updatedAt: now,
1508
+ })
1509
+ .run();
1510
+
1511
+ const tid = await seedBreachedTask(sid);
1512
+ await recordFiringMetrics(sid, tid);
1513
+
1514
+ const row = db.select().from(schedules).where(eq(schedules.id, sid)).get();
1515
+ expect(row?.turnBudgetBreachStreak).toBe(0); // grace applied
1516
+ });
1517
+
1518
+ it("auto-pauses at turn_budget_breach_streak >= 5", async () => {
1519
+ const pid = randomUUID();
1520
+ const sid = randomUUID();
1521
+ const now = new Date();
1522
+ db.insert(projects)
1523
+ .values({ id: pid, name: "p", status: "active", createdAt: now, updatedAt: now })
1524
+ .run();
1525
+ db.insert(schedules)
1526
+ .values({
1527
+ id: sid,
1528
+ projectId: pid,
1529
+ name: "bounded",
1530
+ prompt: "test",
1531
+ cronExpression: "* * * * *",
1532
+ status: "active",
1533
+ type: "scheduled",
1534
+ firingCount: 5,
1535
+ suppressionCount: 0,
1536
+ heartbeatSpentToday: 0,
1537
+ failureStreak: 0,
1538
+ turnBudgetBreachStreak: 4, // next breach trips the threshold
1539
+ maxTurns: 20,
1540
+ maxTurnsSetAt: new Date(now.getTime() - 86400_000),
1541
+ createdAt: now,
1542
+ updatedAt: now,
1543
+ })
1544
+ .run();
1545
+
1546
+ const tid = await seedBreachedTask(sid);
1547
+ await recordFiringMetrics(sid, tid);
1548
+
1549
+ const row = db.select().from(schedules).where(eq(schedules.id, sid)).get();
1550
+ expect(row?.status).toBe("paused");
1551
+ expect(row?.turnBudgetBreachStreak).toBe(5);
1552
+ });
1553
+ });
1554
+ ```
1555
+
1556
+ - [ ] **Step 8.2: Run to verify RED**
1557
+
1558
+ Run: `npx vitest run src/lib/schedules/__tests__/turn-budget.test.ts`
1559
+ Expected: FAIL — `recordFiringMetrics` doesn't split streaks yet.
1560
+
1561
+ - [ ] **Step 8.3: Refactor recordFiringMetrics**
1562
+
1563
+ Edit `src/lib/schedules/scheduler.ts`. Replace the body of `recordFiringMetrics()` (lines 140-186) with a version that splits streaks and honors first-breach grace:
1564
+
1565
+ ```typescript
1566
const TURN_BUDGET_BREACH_AUTO_PAUSE_THRESHOLD = 5;
const GRACE_PERIOD_MULTIPLIER = 2; // grace window = 2 × cron interval

/**
 * Update a schedule's per-firing metrics after one of its tasks has finished.
 *
 * Records the turn count and a moving average, maintains two independent
 * streak counters (generic failures and turn-budget breaches), and pauses an
 * active schedule when either streak reaches its threshold. Returns without
 * writing anything when the task or the schedule row cannot be found.
 *
 * @param scheduleId - schedule whose metrics are updated
 * @param taskId - task produced by the firing being recorded
 */
export async function recordFiringMetrics(
  scheduleId: string,
  taskId: string,
): Promise<void> {
  const [task] = await db
    .select({
      status: tasks.status,
      result: tasks.result,
      failureReason: tasks.failureReason,
      updatedAt: tasks.updatedAt,
    })
    .from(tasks)
    .where(eq(tasks.id, taskId));
  if (!task) return;

  const [schedule] = await db
    .select()
    .from(schedules)
    .where(eq(schedules.id, scheduleId));
  if (!schedule) return;

  // The turn count is taken as the number of agent_logs rows for this task.
  const turnCountResult = await db
    .select({ count: sql<number>`count(*)` })
    .from(agentLogs)
    .where(eq(agentLogs.taskId, taskId));
  const turns = Number(turnCountResult[0]?.count ?? 0);

  // Moving average weighted 70% history / 30% this firing; the first firing
  // seeds the average with its own turn count.
  const prevAvg = schedule.avgTurnsPerFiring ?? turns;
  const newAvg = Math.round(prevAvg * 0.7 + turns * 0.3);

  // Prefer the explicit failure_reason column; fall back to text detection
  // only for failed tasks that carry no explicit reason.
  const isFailure = task.status === "failed";
  const failureReason =
    task.failureReason ?? (isFailure ? detectFailureReason(task.result) : null);
  const isTurnBudgetBreach = failureReason === "turn_limit_exceeded";
  const isGenericFailure = isFailure && !isTurnBudgetBreach;

  // Generic streak: grows on generic failures, resets on anything else.
  const genericFailureThreshold = 3;
  const newFailureStreak = isGenericFailure ? (schedule.failureStreak ?? 0) + 1 : 0;

  // Budget streak: a breach inside the grace window leaves the streak as it
  // was, a breach outside it increments, and any non-breach outcome resets.
  // Normalising a null column to 0 here keeps both breach branches numeric.
  const previousBudgetStreak = schedule.turnBudgetBreachStreak ?? 0;
  let newBudgetStreak = 0;
  if (isTurnBudgetBreach) {
    const graceApplies = shouldApplyGrace(
      schedule.maxTurnsSetAt,
      schedule.cronExpression,
      task.updatedAt,
    );
    newBudgetStreak = graceApplies ? previousBudgetStreak : previousBudgetStreak + 1;
  }

  // Only an active schedule is auto-paused.
  const isActive = schedule.status === "active";
  const shouldAutoPauseGeneric =
    isGenericFailure && newFailureStreak >= genericFailureThreshold && isActive;
  const shouldAutoPauseBudget =
    newBudgetStreak >= TURN_BUDGET_BREACH_AUTO_PAUSE_THRESHOLD && isActive;
  const shouldAutoPause = shouldAutoPauseGeneric || shouldAutoPauseBudget;

  await db
    .update(schedules)
    .set({
      lastTurnCount: turns,
      avgTurnsPerFiring: newAvg,
      failureStreak: newFailureStreak,
      turnBudgetBreachStreak: newBudgetStreak,
      lastFailureReason: failureReason,
      status: shouldAutoPause ? "paused" : schedule.status,
      updatedAt: new Date(),
    })
    .where(eq(schedules.id, scheduleId));

  if (shouldAutoPauseGeneric) {
    console.warn(
      `[scheduler] auto-paused "${schedule.name}" after ${genericFailureThreshold} consecutive failures`,
    );
  }
  if (shouldAutoPauseBudget) {
    console.warn(
      `[scheduler] auto-paused "${schedule.name}" after ${TURN_BUDGET_BREACH_AUTO_PAUSE_THRESHOLD} consecutive turn-budget breaches (avg: ${newAvg} steps, cap: ${schedule.maxTurns})`,
    );
  }
}
1654
+
1655
/**
 * First-breach grace: decide whether a turn-budget breach should be exempt
 * from the auto-pause streak because `maxTurns` was edited recently.
 *
 * Grace applies when the task finished no later than
 * `GRACE_PERIOD_MULTIPLIER` cron intervals after `maxTurnsSetAt`.
 *
 * @param maxTurnsSetAt - when the schedule's maxTurns was last set, or null
 * @param cronExpression - the schedule's cron expression
 * @param completedAt - when the breaching task finished, or null
 * @returns true when the breach falls inside the grace window; false when
 *   either timestamp is missing or the cron expression cannot be evaluated
 */
function shouldApplyGrace(
  maxTurnsSetAt: Date | null,
  cronExpression: string,
  completedAt: Date | null,
): boolean {
  if (!maxTurnsSetAt || !completedAt) return false;
  try {
    // Measure the interval between two consecutive fire times. The distance
    // from maxTurnsSetAt to the next fire is NOT the interval: it shrinks
    // toward zero when the edit lands just before a fire, which would close
    // the grace window almost immediately.
    const firstFire = computeNextFireTime(cronExpression, maxTurnsSetAt);
    // Step 1 ms past firstFire so the second lookup yields the following
    // fire whether or not the helper treats its reference time as inclusive
    // (NOTE(review): confirm computeNextFireTime's boundary behavior).
    const secondFire = computeNextFireTime(
      cronExpression,
      new Date(firstFire.getTime() + 1),
    );
    const cronIntervalMs = secondFire.getTime() - firstFire.getTime();
    const graceWindowEndMs =
      maxTurnsSetAt.getTime() + GRACE_PERIOD_MULTIPLIER * cronIntervalMs;
    return completedAt.getTime() <= graceWindowEndMs;
  } catch {
    // Treat any evaluation error as "no grace" so the breach is counted.
    return false;
  }
}
1677
+ ```
1678
+
1679
+ - [ ] **Step 8.4: Run to verify GREEN**
1680
+
1681
+ Run: `npx vitest run src/lib/schedules/__tests__/turn-budget.test.ts`
1682
+ Expected: PASS — all turn-budget tests (propagation + streak + grace + auto-pause).
1683
+
1684
+ - [ ] **Step 8.5: Run full scheduler suite**
1685
+
1686
+ Run: `npx vitest run src/lib/schedules`
1687
+ Expected: PASS — no regressions.
1688
+
1689
+ - [ ] **Step 8.6: Commit**
1690
+
1691
+ ```bash
1692
+ git add src/lib/schedules/scheduler.ts src/lib/schedules/__tests__/turn-budget.test.ts
1693
+ git commit -m "feat(schedules): separate turn-budget breach streak with first-breach grace"
1694
+ ```
1695
+
1696
+ ---
1697
+
1698
+ ## Task 9: Runtime adapter writes explicit `failure_reason`
1699
+
1700
+ **Files:**
1701
+ - Modify: `src/lib/agents/claude-agent.ts`
1702
+ - Test: `src/lib/agents/__tests__/failure-reason.test.ts`
1703
+
1704
+ The runtime adapter currently catches errors in `handleExecutionError()` but does not write `tasks.failure_reason`. The reaper and recordFiringMetrics rely on this column. Populate it at terminal-state transitions so `detectFailureReason()` (scheduler.ts:122) becomes a fallback, not the primary classifier.
1705
+
1706
+ - [ ] **Step 9.1: Write failing classifier tests**
1707
+
1708
+ Create `src/lib/agents/__tests__/failure-reason.test.ts`:
1709
+
1710
+ ```typescript
1711
+ import { describe, it, expect, beforeEach } from "vitest";
1712
+ import { db } from "@/lib/db";
1713
+ import { tasks, projects } from "@/lib/db/schema";
1714
+ import { eq } from "drizzle-orm";
1715
+ import { randomUUID } from "crypto";
1716
+ import { writeTerminalFailureReason } from "../claude-agent";
1717
+
1718
/**
 * Test fixture: inserts an active project and one task in the "running"
 * state under it.
 *
 * @returns the id of the inserted task
 */
function seedRunningTask(): string {
  const projectId = randomUUID();
  const taskId = randomUUID();
  const createdAt = new Date();

  db.insert(projects)
    .values({
      id: projectId,
      name: "p",
      status: "active",
      createdAt,
      updatedAt: createdAt,
    })
    .run();

  db.insert(tasks)
    .values({
      id: taskId,
      projectId,
      title: "t",
      status: "running",
      priority: 2,
      resumeCount: 0,
      createdAt,
      updatedAt: createdAt,
    })
    .run();

  return taskId;
}
1739
+
1740
+ describe("writeTerminalFailureReason", () => {
1741
+ beforeEach(() => {
1742
+ db.delete(tasks).run();
1743
+ db.delete(projects).run();
1744
+ });
1745
+
1746
+ it("writes 'turn_limit_exceeded' on turn limit errors", async () => {
1747
+ const tid = seedRunningTask();
1748
+ await writeTerminalFailureReason(
1749
+ tid,
1750
+ new Error("Agent exhausted its turn limit (42 turns used)"),
1751
+ );
1752
+ const row = db.select().from(tasks).where(eq(tasks.id, tid)).get();
1753
+ expect(row?.failureReason).toBe("turn_limit_exceeded");
1754
+ });
1755
+
1756
+ it("writes 'aborted' on AbortError", async () => {
1757
+ const tid = seedRunningTask();
1758
+ const err = new Error("aborted");
1759
+ err.name = "AbortError";
1760
+ await writeTerminalFailureReason(tid, err);
1761
+ const row = db.select().from(tasks).where(eq(tasks.id, tid)).get();
1762
+ expect(row?.failureReason).toBe("aborted");
1763
+ });
1764
+
1765
+ it("writes 'sdk_error' for unknown errors", async () => {
1766
+ const tid = seedRunningTask();
1767
+ await writeTerminalFailureReason(tid, new Error("something weird"));
1768
+ const row = db.select().from(tasks).where(eq(tasks.id, tid)).get();
1769
+ expect(row?.failureReason).toBe("sdk_error");
1770
+ });
1771
+
1772
+ it("writes 'rate_limited' on 429 errors", async () => {
1773
+ const tid = seedRunningTask();
1774
+ await writeTerminalFailureReason(tid, new Error("HTTP 429 rate limit"));
1775
+ const row = db.select().from(tasks).where(eq(tasks.id, tid)).get();
1776
+ expect(row?.failureReason).toBe("rate_limited");
1777
+ });
1778
+ });
1779
+ ```
1780
+
1781
+ - [ ] **Step 9.2: Run to verify RED**
1782
+
1783
+ Run: `npx vitest run src/lib/agents/__tests__/failure-reason.test.ts`
1784
+ Expected: FAIL — `writeTerminalFailureReason` not exported.
1785
+
1786
+ - [ ] **Step 9.3: Add writeTerminalFailureReason helper**
1787
+
1788
+ Edit `src/lib/agents/claude-agent.ts`. Add this helper near the other top-level exports (after the imports block):
1789
+
1790
+ ```typescript
1791
/**
 * Persist an explicit failure_reason on a task row at a terminal-state
 * transition.
 *
 * The reason is derived from the error by classifyError and written together
 * with a fresh updatedAt timestamp. Storing the reason explicitly is preferred
 * over re-deriving it from result text via detectFailureReason in
 * scheduler.ts, which is fragile to SDK message changes.
 *
 * @param taskId - id of the task row to update
 * @param error - the caught value (not necessarily an Error instance)
 */
export async function writeTerminalFailureReason(
  taskId: string,
  error: unknown,
): Promise<void> {
  const failureReason = classifyError(error);
  const patch = { failureReason, updatedAt: new Date() };
  await db.update(tasks).set(patch).where(eq(tasks.id, taskId));
}
1807
+
1808
/**
 * Matches "turn"/"turns" as its own token. It may be prefixed by "max" (so
 * "maxturns" and "max_turns" still match) but must not be embedded in a
 * longer word such as "returned" or "turnaround".
 */
const TURN_TOKEN_PATTERN = /(?:^|[^a-z]|max)turns?(?![a-z])/;

/**
 * Map a caught value to one of the failure_reason strings stored on tasks.
 *
 * Checks run in priority order: abort, turn limit, timeout, budget,
 * authentication, rate limit. Anything unrecognised, including values that
 * are not Error instances, is reported as "sdk_error".
 *
 * @param error - the caught value
 * @returns the failure reason string
 */
function classifyError(error: unknown): string {
  if (!(error instanceof Error)) return "sdk_error";

  // The abort check uses the raw message, so the "aborted" match is case-sensitive.
  if (error.name === "AbortError" || error.message.includes("aborted")) {
    return "aborted";
  }

  const lower = error.message.toLowerCase();

  // Require "turn" as a token: a bare substring test also matches
  // "returned", which misfiles messages like "API returned 429 rate limit"
  // (it contains both "turn" and "limit") as turn-limit failures.
  const mentionsTurns = TURN_TOKEN_PATTERN.test(lower);
  const mentionsCap =
    lower.includes("limit") || lower.includes("exhausted") || lower.includes("max");
  if (mentionsTurns && mentionsCap) return "turn_limit_exceeded";

  if (lower.includes("timeout") || lower.includes("timed out")) return "timeout";
  if (lower.includes("budget")) return "budget_exceeded";
  if (lower.includes("authentication") || lower.includes("oauth")) {
    return "auth_failed";
  }
  if (lower.includes("rate limit") || lower.includes("429")) {
    return "rate_limited";
  }
  return "sdk_error";
}
1830
+ ```
1831
+
1832
+ - [ ] **Step 9.4: Call it from handleExecutionError**
1833
+
1834
+ Still in `claude-agent.ts`, find `handleExecutionError()`. At the point where it updates `tasks.status = 'failed'`, add an awaited call to `writeTerminalFailureReason(taskId, error)` alongside the status update, so the reason is persisted before `recordFiringMetrics` reads the task row.
1835
+
1836
+ - [ ] **Step 9.5: Run to verify GREEN**
1837
+
1838
+ Run: `npx vitest run src/lib/agents/__tests__/failure-reason.test.ts`
1839
+ Expected: PASS — 4 classification tests.
1840
+
1841
+ - [ ] **Step 9.6: Run full suite**
1842
+
1843
+ Run: `npx vitest run`
1844
+ Expected: PASS.
1845
+
1846
+ - [ ] **Step 9.7: Commit**
1847
+
1848
+ ```bash
1849
+ git add src/lib/agents/claude-agent.ts src/lib/agents/__tests__/failure-reason.test.ts
1850
+ git commit -m "feat(agents): runtime adapter writes explicit failure_reason at terminal states"
1851
+ ```
1852
+
1853
+ ---
1854
+
1855
+ ## Task 10: Manual execute endpoint with cap + force bypass
1856
+
1857
+ **Files:**
1858
+ - Create: `src/app/api/schedules/[id]/execute/route.ts`
1859
+ - Test: `src/app/api/schedules/__tests__/execute-route.test.ts`
1860
+
1861
+ No manual-execute endpoint exists today. Build one that honors the cap by default with explicit `?force=true` bypass.
1862
+
1863
+ - [ ] **Step 10.1: Write failing route tests**
1864
+
1865
+ Create `src/app/api/schedules/__tests__/execute-route.test.ts`:
1866
+
1867
+ ```typescript
1868
+ import { describe, it, expect, beforeEach, vi } from "vitest";
1869
+ import { db } from "@/lib/db";
1870
+ import { tasks, schedules, projects, settings, usageLedger } from "@/lib/db/schema";
1871
+ import { eq } from "drizzle-orm";
1872
+ import { randomUUID } from "crypto";
1873
+ import { NextRequest } from "next/server";
1874
+ import { POST } from "../[id]/execute/route";
1875
+
1876
+ vi.mock("@/lib/agents/runtime", () => ({
1877
+ executeTaskWithRuntime: vi.fn().mockResolvedValue(undefined),
1878
+ }));
1879
+
1880
/**
 * Test helper: builds a NextRequest for the given path or URL, resolving
 * relative paths against http://localhost.
 */
function req(url: string): NextRequest {
  const target = new URL(url, "http://localhost");
  return new NextRequest(target);
}
1883
+
1884
/**
 * Test fixture: inserts an active project and an active "scheduled"-type
 * schedule under it, with all counters and streaks at zero.
 *
 * @returns the id of the inserted schedule
 */
function seedSchedule(): string {
  const projectId = randomUUID();
  const scheduleId = randomUUID();
  const createdAt = new Date();

  db.insert(projects)
    .values({
      id: projectId,
      name: "p",
      status: "active",
      createdAt,
      updatedAt: createdAt,
    })
    .run();

  db.insert(schedules)
    .values({
      id: scheduleId,
      projectId,
      name: "manual",
      prompt: "test",
      cronExpression: "0 0 * * *",
      status: "active",
      type: "scheduled",
      firingCount: 0,
      suppressionCount: 0,
      heartbeatSpentToday: 0,
      failureStreak: 0,
      turnBudgetBreachStreak: 0,
      createdAt,
      updatedAt: createdAt,
    })
    .run();

  return scheduleId;
}
1911
+
1912
+ describe("POST /api/schedules/:id/execute", () => {
1913
+ beforeEach(() => {
1914
+ db.delete(usageLedger).run();
1915
+ db.delete(tasks).run();
1916
+ db.delete(schedules).run();
1917
+ db.delete(projects).run();
1918
+ db.delete(settings).where(eq(settings.key, "schedule.maxConcurrent")).run();
1919
+ db.insert(settings)
1920
+ .values({ key: "schedule.maxConcurrent", value: "1", updatedAt: new Date() })
1921
+ .run();
1922
+ });
1923
+
1924
+ it("fires when capacity available, returns 200 with taskId", async () => {
1925
+ const sid = seedSchedule();
1926
+ const res = await POST(req(`/api/schedules/${sid}/execute`), {
1927
+ params: Promise.resolve({ id: sid }),
1928
+ });
1929
+ expect(res.status).toBe(200);
1930
+ const body = await res.json();
1931
+ expect(body.taskId).toBeDefined();
1932
+ });
1933
+
1934
+ it("returns 429 when cap is full", async () => {
1935
+ const sid1 = seedSchedule();
1936
+ const sid2 = seedSchedule();
1937
+
1938
+ const res1 = await POST(req(`/api/schedules/${sid1}/execute`), {
1939
+ params: Promise.resolve({ id: sid1 }),
1940
+ });
1941
+ expect(res1.status).toBe(200);
1942
+
1943
+ const res2 = await POST(req(`/api/schedules/${sid2}/execute`), {
1944
+ params: Promise.resolve({ id: sid2 }),
1945
+ });
1946
+ expect(res2.status).toBe(429);
1947
+ const body = await res2.json();
1948
+ expect(body.error).toBe("capacity_full");
1949
+ expect(body.slotEtaSec).toBeGreaterThanOrEqual(0);
1950
+ });
1951
+
1952
+ it("bypasses the cap when ?force=true and writes audit-log entry", async () => {
1953
+ const sid1 = seedSchedule();
1954
+ const sid2 = seedSchedule();
1955
+
1956
+ await POST(req(`/api/schedules/${sid1}/execute`), {
1957
+ params: Promise.resolve({ id: sid1 }),
1958
+ });
1959
+
1960
+ const res2 = await POST(
1961
+ req(`/api/schedules/${sid2}/execute?force=true`),
1962
+ { params: Promise.resolve({ id: sid2 }) },
1963
+ );
1964
+ expect(res2.status).toBe(200);
1965
+
1966
+ const ledger = db
1967
+ .select()
1968
+ .from(usageLedger)
1969
+ .where(eq(usageLedger.activityType, "manual_force_bypass"))
1970
+ .all();
1971
+ expect(ledger.length).toBe(1);
1972
+ });
1973
+
1974
+ it("returns 404 when the schedule does not exist", async () => {
1975
+ const res = await POST(req("/api/schedules/nonexistent/execute"), {
1976
+ params: Promise.resolve({ id: "nonexistent" }),
1977
+ });
1978
+ expect(res.status).toBe(404);
1979
+ });
1980
+ });
1981
+ ```
1982
+
1983
+ - [ ] **Step 10.2: Run to verify RED**
1984
+
1985
+ Run: `npx vitest run src/app/api/schedules/__tests__/execute-route.test.ts`
1986
+ Expected: FAIL — route module does not exist.
1987
+
1988
+ - [ ] **Step 10.3: Implement the manual-execute route**
1989
+
1990
+ Create `src/app/api/schedules/[id]/execute/route.ts`:
1991
+
1992
+ ```typescript
1993
+ import { NextRequest, NextResponse } from "next/server";
1994
+ import { db } from "@/lib/db";
1995
+ import { schedules, tasks, usageLedger } from "@/lib/db/schema";
1996
+ import { eq } from "drizzle-orm";
1997
+ import { executeTaskWithRuntime } from "@/lib/agents/runtime";
1998
+ import { claimSlot, countRunningScheduledSlots } from "@/lib/schedules/slot-claim";
1999
+ import {
2000
+ getScheduleMaxConcurrent,
2001
+ getScheduleMaxRunDurationSec,
2002
+ } from "@/lib/schedules/config";
2003
+ import { randomUUID } from "crypto";
2004
+
2005
+ /**
2006
+ * Manually fire a schedule. Honors the global concurrency cap by default.
2007
+ * Use `?force=true` to bypass the cap (logged to usage_ledger as
2008
+ * manual_force_bypass for audit).
2009
+ */
2010
+ export async function POST(
2011
+ req: NextRequest,
2012
+ { params }: { params: Promise<{ id: string }> },
2013
+ ) {
2014
+ const { id: scheduleId } = await params;
2015
+ const force = req.nextUrl.searchParams.get("force") === "true";
2016
+
2017
+ const [schedule] = await db
2018
+ .select()
2019
+ .from(schedules)
2020
+ .where(eq(schedules.id, scheduleId));
2021
+ if (!schedule) {
2022
+ return NextResponse.json({ error: "schedule_not_found" }, { status: 404 });
2023
+ }
2024
+
2025
+ const taskId = randomUUID();
2026
+ const firingNumber = schedule.firingCount + 1;
2027
+ const now = new Date();
2028
+
2029
+ await db.insert(tasks).values({
2030
+ id: taskId,
2031
+ projectId: schedule.projectId,
2032
+ workflowId: null,
2033
+ scheduleId: schedule.id,
2034
+ title: `${schedule.name} — manual firing #${firingNumber}`,
2035
+ description: schedule.prompt,
2036
+ status: "queued",
2037
+ assignedAgent: schedule.assignedAgent,
2038
+ agentProfile: schedule.agentProfile,
2039
+ priority: 2,
2040
+ sourceType: "scheduled",
2041
+ maxTurns: schedule.maxTurns,
2042
+ createdAt: now,
2043
+ updatedAt: now,
2044
+ });
2045
+
2046
+ const cap = getScheduleMaxConcurrent();
2047
+ const leaseSec = schedule.maxRunDurationSec ?? getScheduleMaxRunDurationSec();
2048
+
2049
+ const effectiveCap = force ? Number.MAX_SAFE_INTEGER : cap;
2050
+ const { claimed } = claimSlot(taskId, effectiveCap, leaseSec);
2051
+
2052
+ if (!claimed) {
2053
+ await db.delete(tasks).where(eq(tasks.id, taskId));
2054
+ const slotEtaSec = 60;
2055
+ return NextResponse.json(
2056
+ {
2057
+ error: "capacity_full",
2058
+ message: `Swarm at capacity (${countRunningScheduledSlots()}/${cap}). Retry in ~${slotEtaSec}s or add ?force=true to bypass.`,
2059
+ slotEtaSec,
2060
+ },
2061
+ { status: 429 },
2062
+ );
2063
+ }
2064
+
2065
+ if (force) {
2066
+ await db.insert(usageLedger).values({
2067
+ id: randomUUID(),
2068
+ taskId,
2069
+ scheduleId: schedule.id,
2070
+ projectId: schedule.projectId,
2071
+ activityType: "manual_force_bypass",
2072
+ runtimeId: schedule.assignedAgent ?? null,
2073
+ providerId: null,
2074
+ modelId: null,
2075
+ inputTokens: null,
2076
+ outputTokens: null,
2077
+ totalTokens: null,
2078
+ costMicros: 0,
2079
+ status: "completed",
2080
+ startedAt: now,
2081
+ finishedAt: now,
2082
+ } as typeof usageLedger.$inferInsert);
2083
+ }
2084
+
2085
+ executeTaskWithRuntime(taskId).catch((err) => {
2086
+ console.error(`[api/schedules/execute] task ${taskId} failed:`, err);
2087
+ });
2088
+
2089
+ return NextResponse.json({ taskId, forced: force });
2090
+ }
2091
+ ```
2092
+
2093
+ **Note:** if the `usageLedger` schema columns differ from the insert shape above, read `src/lib/db/schema.ts` lines 297-340 for the actual column names and adjust. The `manual_force_bypass` value may need to be added to the `activityType` enum — check the `tasks.status` pattern for reference.
2094
+
2095
+ - [ ] **Step 10.4: Run to verify GREEN**
2096
+
2097
+ Run: `npx vitest run src/app/api/schedules/__tests__/execute-route.test.ts`
2098
+ Expected: PASS — 4 tests.
2099
+
2100
+ - [ ] **Step 10.5: Commit**
2101
+
2102
+ ```bash
2103
+ git add "src/app/api/schedules/[id]/execute/route.ts" src/app/api/schedules/__tests__/execute-route.test.ts
2104
+ git commit -m "feat(schedules): manual execute endpoint honors cap with force bypass + audit"
2105
+ ```
2106
+
2107
+ ---
2108
+
2109
+ ## Task 11: Firing metrics insertion
2110
+
2111
+ **Files:**
2112
+ - Modify: `src/lib/schedules/scheduler.ts` (`recordFiringMetrics` — add insert at end)
2113
+ - Test: `src/lib/schedules/__tests__/firing-metrics.test.ts`
2114
+
2115
+ - [ ] **Step 11.1: Write failing metrics test**
2116
+
2117
+ Create `src/lib/schedules/__tests__/firing-metrics.test.ts`:
2118
+
2119
+ ```typescript
2120
+ import { describe, it, expect, beforeEach } from "vitest";
2121
+ import { db } from "@/lib/db";
2122
+ import {
2123
+ tasks,
2124
+ schedules,
2125
+ projects,
2126
+ scheduleFiringMetrics,
2127
+ agentLogs,
2128
+ } from "@/lib/db/schema";
2129
+ import { eq } from "drizzle-orm";
2130
+ import { randomUUID } from "crypto";
2131
+ import { recordFiringMetrics } from "../scheduler";
2132
+
2133
+ describe("schedule_firing_metrics insertion", () => {
2134
+ beforeEach(() => {
2135
+ db.delete(scheduleFiringMetrics).run();
2136
+ db.delete(agentLogs).run();
2137
+ db.delete(tasks).run();
2138
+ db.delete(schedules).run();
2139
+ db.delete(projects).run();
2140
+ });
2141
+
2142
+ it("inserts a row for every firing with slot_wait_ms and duration_ms", async () => {
2143
+ const pid = randomUUID();
2144
+ const sid = randomUUID();
2145
+ const tid = randomUUID();
2146
+ const firedAt = new Date(Date.now() - 5000);
2147
+ const slotClaimedAt = new Date(Date.now() - 4000);
2148
+ const completedAt = new Date(Date.now() - 100);
2149
+
2150
+ db.insert(projects)
2151
+ .values({
2152
+ id: pid,
2153
+ name: "p",
2154
+ status: "active",
2155
+ createdAt: firedAt,
2156
+ updatedAt: firedAt,
2157
+ })
2158
+ .run();
2159
+ db.insert(schedules)
2160
+ .values({
2161
+ id: sid,
2162
+ projectId: pid,
2163
+ name: "test",
2164
+ prompt: "x",
2165
+ cronExpression: "* * * * *",
2166
+ status: "active",
2167
+ type: "scheduled",
2168
+ firingCount: 1,
2169
+ suppressionCount: 0,
2170
+ heartbeatSpentToday: 0,
2171
+ failureStreak: 0,
2172
+ turnBudgetBreachStreak: 0,
2173
+ maxTurns: 50,
2174
+ createdAt: firedAt,
2175
+ updatedAt: firedAt,
2176
+ })
2177
+ .run();
2178
+ db.insert(tasks)
2179
+ .values({
2180
+ id: tid,
2181
+ scheduleId: sid,
2182
+ title: "firing",
2183
+ status: "completed",
2184
+ priority: 2,
2185
+ sourceType: "scheduled",
2186
+ resumeCount: 0,
2187
+ slotClaimedAt,
2188
+ createdAt: firedAt,
2189
+ updatedAt: completedAt,
2190
+ })
2191
+ .run();
2192
+ for (let i = 0; i < 7; i++) {
2193
+ db.insert(agentLogs)
2194
+ .values({
2195
+ id: randomUUID(),
2196
+ taskId: tid,
2197
+ agentType: "test",
2198
+ event: "assistant_message",
2199
+ timestamp: new Date(),
2200
+ })
2201
+ .run();
2202
+ }
2203
+
2204
+ await recordFiringMetrics(sid, tid);
2205
+
2206
+ const rows = db
2207
+ .select()
2208
+ .from(scheduleFiringMetrics)
2209
+ .where(eq(scheduleFiringMetrics.scheduleId, sid))
2210
+ .all();
2211
+
2212
+ expect(rows.length).toBe(1);
2213
+ expect(rows[0].turnCount).toBe(7);
2214
+ expect(rows[0].maxTurnsAtFiring).toBe(50);
2215
+ expect(rows[0].slotWaitMs).toBeGreaterThan(0);
2216
+ expect(rows[0].durationMs).toBeGreaterThan(0);
2217
+ });
2218
+ });
2219
+ ```
2220
+
2221
+ - [ ] **Step 11.2: Run to verify RED**
2222
+
2223
+ Run: `npx vitest run src/lib/schedules/__tests__/firing-metrics.test.ts`
2224
+ Expected: FAIL — no row inserted.
2225
+
2226
+ - [ ] **Step 11.3: Append metric insertion to recordFiringMetrics**
2227
+
2228
+ Edit `src/lib/schedules/scheduler.ts`. At the end of `recordFiringMetrics()` (after the schedule UPDATE), add the metric insertion:
2229
+
2230
+ ```typescript
2231
+ try {
2232
+ const [taskRow] = await db
2233
+ .select()
2234
+ .from(tasks)
2235
+ .where(eq(tasks.id, taskId));
2236
+ if (taskRow) {
2237
+ const firedAtDate = taskRow.createdAt;
2238
+ const slotClaimedAt = taskRow.slotClaimedAt;
2239
+ const completedAt = taskRow.updatedAt;
2240
+ const slotWaitMs =
2241
+ slotClaimedAt && firedAtDate
2242
+ ? slotClaimedAt.getTime() - firedAtDate.getTime()
2243
+ : null;
2244
+ const durationMs =
2245
+ slotClaimedAt && completedAt
2246
+ ? completedAt.getTime() - slotClaimedAt.getTime()
2247
+ : null;
2248
+
2249
+ await db.insert(scheduleFiringMetrics).values({
2250
+ id: crypto.randomUUID(),
2251
+ scheduleId,
2252
+ taskId,
2253
+ firedAt: firedAtDate,
2254
+ slotClaimedAt,
2255
+ completedAt,
2256
+ slotWaitMs,
2257
+ durationMs,
2258
+ turnCount: turns,
2259
+ maxTurnsAtFiring: schedule.maxTurns,
2260
+ eventLoopLagMs: null,
2261
+ peakRssMb: null,
2262
+ chatStreamsActive: null,
2263
+ concurrentSchedules: null,
2264
+ failureReason,
2265
+ });
2266
+ }
2267
+ } catch (err) {
2268
+ console.error(`[scheduler] failed to insert firing metrics for ${taskId}:`, err);
2269
+ }
2270
+ ```
2271
+
2272
+ Remember to import `scheduleFiringMetrics` at the top of `scheduler.ts`:
2273
+
2274
+ ```typescript
2275
+ import { schedules, tasks, agentLogs, scheduleFiringMetrics, scheduleDocumentInputs, documents, workflows } from "@/lib/db/schema";
2276
+ ```
2277
+
2278
+ - [ ] **Step 11.4: Run to verify GREEN**
2279
+
2280
+ Run: `npx vitest run src/lib/schedules/__tests__/firing-metrics.test.ts`
2281
+ Expected: PASS.
2282
+
2283
+ - [ ] **Step 11.5: Commit**
2284
+
2285
+ ```bash
2286
+ git add src/lib/schedules/scheduler.ts src/lib/schedules/__tests__/firing-metrics.test.ts
2287
+ git commit -m "feat(schedules): insert schedule_firing_metrics rows for tuning + forensics"
2288
+ ```
2289
+
2290
+ ---
2291
+
2292
+ ## Task 12: Collision warning helper + API wiring
2293
+
2294
+ **Files:**
2295
+ - Create: `src/lib/schedules/collision-check.ts`
2296
+ - Test: `src/lib/schedules/__tests__/collision-check.test.ts`
2297
+ - Modify: `src/app/api/schedules/route.ts`
2298
+ - Modify: `src/app/api/schedules/[id]/route.ts`
2299
+
2300
+ - [ ] **Step 12.1: Write failing collision-check tests**
2301
+
2302
+ Create `src/lib/schedules/__tests__/collision-check.test.ts`:
2303
+
2304
+ ```typescript
2305
+ import { describe, it, expect, beforeEach } from "vitest";
2306
+ import { db } from "@/lib/db";
2307
+ import { schedules, projects } from "@/lib/db/schema";
2308
+ import { randomUUID } from "crypto";
2309
+ import { checkCollision } from "../collision-check";
2310
+
2311
+ function seedSchedule(opts: {
2312
+ cron: string;
2313
+ avgTurns: number;
2314
+ projectId: string;
2315
+ status?: "active" | "paused";
2316
+ }): string {
2317
+ const id = randomUUID();
2318
+ const now = new Date();
2319
+ db.insert(schedules)
2320
+ .values({
2321
+ id,
2322
+ projectId: opts.projectId,
2323
+ name: `s-${id.slice(0, 4)}`,
2324
+ prompt: "test",
2325
+ cronExpression: opts.cron,
2326
+ status: opts.status ?? "active",
2327
+ type: "scheduled",
2328
+ firingCount: 0,
2329
+ suppressionCount: 0,
2330
+ heartbeatSpentToday: 0,
2331
+ failureStreak: 0,
2332
+ turnBudgetBreachStreak: 0,
2333
+ avgTurnsPerFiring: opts.avgTurns,
2334
+ createdAt: now,
2335
+ updatedAt: now,
2336
+ })
2337
+ .run();
2338
+ return id;
2339
+ }
2340
+
2341
+ function seedProject(): string {
2342
+ const id = randomUUID();
2343
+ const now = new Date();
2344
+ db.insert(projects)
2345
+ .values({ id, name: "p", status: "active", createdAt: now, updatedAt: now })
2346
+ .run();
2347
+ return id;
2348
+ }
2349
+
2350
+ describe("checkCollision", () => {
2351
+ beforeEach(() => {
2352
+ db.delete(schedules).run();
2353
+ db.delete(projects).run();
2354
+ });
2355
+
2356
+ it("returns no warnings when no overlap exists", () => {
2357
+ const pid = seedProject();
2358
+ seedSchedule({ cron: "0 3 * * *", avgTurns: 500, projectId: pid });
2359
+ expect(checkCollision("0 15 * * *", 500, pid, null)).toEqual([]);
2360
+ });
2361
+
2362
+ it("detects overlap when two heavy schedules share a 5-min bucket", () => {
2363
+ const pid = seedProject();
2364
+ seedSchedule({ cron: "2 * * * *", avgTurns: 2000, projectId: pid });
2365
+ const warnings = checkCollision("0 * * * *", 2000, pid, null);
2366
+ expect(warnings.length).toBe(1);
2367
+ expect(warnings[0].type).toBe("cron_collision");
2368
+ expect(warnings[0].estimatedConcurrentSteps).toBeGreaterThanOrEqual(4000);
2369
+ });
2370
+
2371
+ it("ignores paused schedules", () => {
2372
+ const pid = seedProject();
2373
+ seedSchedule({
2374
+ cron: "2 * * * *",
2375
+ avgTurns: 2000,
2376
+ projectId: pid,
2377
+ status: "paused",
2378
+ });
2379
+ expect(checkCollision("0 * * * *", 2000, pid, null)).toEqual([]);
2380
+ });
2381
+
2382
+ it("excludes the excludeScheduleId (for PUT updates)", () => {
2383
+ const pid = seedProject();
2384
+ const existing = seedSchedule({
2385
+ cron: "0 * * * *",
2386
+ avgTurns: 3000,
2387
+ projectId: pid,
2388
+ });
2389
+ expect(checkCollision("0 * * * *", 3000, pid, existing)).toEqual([]);
2390
+ });
2391
+
2392
+ it("does not warn when combined steps are below the threshold", () => {
2393
+ const pid = seedProject();
2394
+ seedSchedule({ cron: "2 * * * *", avgTurns: 500, projectId: pid });
2395
+ expect(checkCollision("0 * * * *", 500, pid, null)).toEqual([]);
2396
+ });
2397
+ });
2398
+ ```
2399
+
2400
+ - [ ] **Step 12.2: Run to verify RED**
2401
+
2402
+ Run: `npx vitest run src/lib/schedules/__tests__/collision-check.test.ts`
2403
+ Expected: FAIL — module doesn't exist.
2404
+
2405
+ - [ ] **Step 12.3: Implement collision-check**
2406
+
2407
+ Create `src/lib/schedules/collision-check.ts`:
2408
+
2409
+ ```typescript
2410
+ import { db } from "@/lib/db";
2411
+ import { schedules } from "@/lib/db/schema";
2412
+ import { and, eq, ne } from "drizzle-orm";
2413
+ import { expandCronMinutes } from "./interval-parser";
2414
+
2415
+ const BUCKET_SIZE_MIN = 5;
2416
+ const COLLISION_THRESHOLD_TURNS = 3000;
2417
+
2418
+ export interface CronCollisionWarning {
2419
+ type: "cron_collision";
2420
+ overlappingSchedules: string[];
2421
+ overlappingMinutes: number[];
2422
+ estimatedConcurrentSteps: number;
2423
+ }
2424
+
2425
+ /**
2426
+ * Check if a candidate cron collides with existing active schedules in the
2427
+ * same project inside a 5-minute bucket, weighted by the sum of their
2428
+ * avgTurnsPerFiring. Warns only when combined weight reaches or exceeds 3000 steps.
2429
+ *
2430
+ * Passing an excludeScheduleId skips that schedule (for PUT flows where a
2431
+ * schedule should not collide with its own prior state).
2432
+ *
2433
+ * Deterministic — runs against nominal cron expansion, not chat-pressure
2434
+ * adjusted times.
2435
+ */
2436
+ export function checkCollision(
2437
+ candidateCron: string,
2438
+ candidateAvgTurns: number,
2439
+ projectId: string | null,
2440
+ excludeScheduleId: string | null,
2441
+ ): CronCollisionWarning[] {
2442
+ let candidateMinutes: number[];
2443
+ try {
2444
+ candidateMinutes = expandCronMinutes(candidateCron);
2445
+ } catch {
2446
+ return [];
2447
+ }
2448
+
2449
+ const candidateBuckets = new Set(
2450
+ candidateMinutes.map((m) => Math.floor(m / BUCKET_SIZE_MIN)),
2451
+ );
2452
+
2453
+ const conditions = [eq(schedules.status, "active")];
2454
+ if (projectId !== null) {
2455
+ conditions.push(eq(schedules.projectId, projectId));
2456
+ }
2457
+ if (excludeScheduleId !== null) {
2458
+ conditions.push(ne(schedules.id, excludeScheduleId));
2459
+ }
2460
+
2461
+ const others = db
2462
+ .select({
2463
+ id: schedules.id,
2464
+ name: schedules.name,
2465
+ cronExpression: schedules.cronExpression,
2466
+ avgTurnsPerFiring: schedules.avgTurnsPerFiring,
2467
+ })
2468
+ .from(schedules)
2469
+ .where(and(...conditions))
2470
+ .all();
2471
+
2472
+ const overlappingNames: string[] = [];
2473
+ const overlappingMinutesSet = new Set<number>();
2474
+ let totalOtherTurns = 0;
2475
+
2476
+ for (const other of others) {
2477
+ let otherMinutes: number[];
2478
+ try {
2479
+ otherMinutes = expandCronMinutes(other.cronExpression);
2480
+ } catch {
2481
+ continue;
2482
+ }
2483
+ const otherBuckets = new Set(
2484
+ otherMinutes.map((m) => Math.floor(m / BUCKET_SIZE_MIN)),
2485
+ );
2486
+ const sharedBuckets = [...otherBuckets].filter((b) =>
2487
+ candidateBuckets.has(b),
2488
+ );
2489
+ if (sharedBuckets.length > 0) {
2490
+ overlappingNames.push(other.name);
2491
+ totalOtherTurns += other.avgTurnsPerFiring ?? 0;
2492
+ for (const b of sharedBuckets) {
2493
+ overlappingMinutesSet.add(b * BUCKET_SIZE_MIN);
2494
+ }
2495
+ }
2496
+ }
2497
+
2498
+ const combinedTurns = candidateAvgTurns + totalOtherTurns;
2499
+ if (
2500
+ overlappingNames.length === 0 ||
2501
+ combinedTurns < COLLISION_THRESHOLD_TURNS
2502
+ ) {
2503
+ return [];
2504
+ }
2505
+
2506
+ return [
2507
+ {
2508
+ type: "cron_collision",
2509
+ overlappingSchedules: overlappingNames,
2510
+ overlappingMinutes: [...overlappingMinutesSet].sort((a, b) => a - b),
2511
+ estimatedConcurrentSteps: combinedTurns,
2512
+ },
2513
+ ];
2514
+ }
2515
+ ```
2516
+
2517
+ - [ ] **Step 12.4: Run to verify GREEN**
2518
+
2519
+ Run: `npx vitest run src/lib/schedules/__tests__/collision-check.test.ts`
2520
+ Expected: PASS — 5 tests.
2521
+
2522
+ - [ ] **Step 12.5: Wire collision warnings into POST /api/schedules**
2523
+
2524
+ Edit `src/app/api/schedules/route.ts`. Add the import:
2525
+
2526
+ ```typescript
2527
+ import { checkCollision } from "@/lib/schedules/collision-check";
2528
+ ```
2529
+
2530
+ At the end of the POST handler, after the schedule is inserted, compute and attach warnings. Find the existing `return NextResponse.json(row)` and replace with:
2531
+
2532
+ ```typescript
2533
+ const warnings = checkCollision(cronExpression, 0, projectId ?? null, null);
2534
+ return NextResponse.json({ schedule: row, warnings });
2535
+ ```
2536
+
2537
+ - [ ] **Step 12.6: Wire collision warnings into PUT /api/schedules/:id**
2538
+
2539
+ Edit `src/app/api/schedules/[id]/route.ts`. Make the same change at the end of the PUT handler, replacing its final `return` with:
2540
+
2541
+ ```typescript
2542
+ const warnings = checkCollision(
2543
+ cronExpression,
2544
+ schedule.avgTurnsPerFiring ?? 0,
2545
+ schedule.projectId ?? null,
2546
+ schedule.id,
2547
+ );
2548
+ return NextResponse.json({ schedule: updatedRow, warnings });
2549
+ ```
2550
+
2551
+ **Note:** these are response-shape changes. Existing consumers of these endpoints expect the schedule directly. The schedule form in Task 13 will read `res.schedule` and `res.warnings`. If preserving backwards compat is required, spread instead: `{ ...row, warnings }`.
2552
+
2553
+ - [ ] **Step 12.7: Run tests**
2554
+
2555
+ Run: `npx vitest run src/lib/schedules src/app/api/schedules`
2556
+ Expected: PASS.
2557
+
2558
+ - [ ] **Step 12.8: Commit**
2559
+
2560
+ ```bash
2561
+ git add src/lib/schedules/collision-check.ts src/lib/schedules/__tests__/collision-check.test.ts src/app/api/schedules/route.ts "src/app/api/schedules/[id]/route.ts"
2562
+ git commit -m "feat(schedules): pre-flight cron collision warning at save time"
2563
+ ```
2564
+
2565
+ ---
2566
+
2567
+ ## Task 13: Schedule form — rename + tooltips + calibration hint + warning banner
2568
+
2569
+ **Files:**
2570
+ - Modify: `src/components/schedules/schedule-form.tsx`
2571
+ - Modify: `src/components/schedules/schedule-create-sheet.tsx`
2572
+ - Modify: `src/components/schedules/schedule-edit-sheet.tsx`
2573
+
2574
+ This task is UI-heavy. Steps describe form changes without full component test coverage; smoke-test manually via `npm run dev`.
2575
+
2576
+ - [ ] **Step 13.1: Add Max agent steps field to schedule-form.tsx**
2577
+
2578
+ Edit `src/components/schedules/schedule-form.tsx`. Locate the form's state block and add:
2579
+
2580
+ ```typescript
2581
+ const [maxTurns, setMaxTurns] = useState<number | null>(initial?.maxTurns ?? null);
2582
+ ```
2583
+
2584
+ Add the form field near the existing budget/tuning fields:
2585
+
2586
+ ```tsx
2587
+ <div className="space-y-2">
2588
+ <Label htmlFor="max-turns">Max agent steps per run</Label>
2589
+ <Input
2590
+ id="max-turns"
2591
+ type="number"
2592
+ min={1}
2593
+ max={10000}
2594
+ placeholder="Inherits global default"
2595
+ value={maxTurns ?? ""}
2596
+ onChange={(e) =>
2597
+ setMaxTurns(e.target.value ? parseInt(e.target.value, 10) : null)
2598
+ }
2599
+ />
2600
+ <p className="text-muted-foreground text-xs">
2601
+ One step = one agent action (message, tool call, or sub-response). Most
2602
+ schedules use 50–500 steps; heavy research runs 2,000+.
2603
+ </p>
2604
+ </div>
2605
+ ```
2606
+
2607
+ Include `maxTurns` in the submit payload sent to `/api/schedules`.
2608
+
2609
+ - [ ] **Step 13.2: Add prompt-field tooltip**
2610
+
2611
+ Near the prompt textarea, add below it:
2612
+
2613
+ ```tsx
2614
+ <p className="text-muted-foreground text-xs">
2615
+ Note: writing &quot;MAX N turns&quot; in your prompt is a hint to the model,
2616
+ not a runtime limit. Use <strong>Max agent steps</strong> below to enforce
2617
+ a budget.
2618
+ </p>
2619
+ ```
2620
+
2621
+ - [ ] **Step 13.3: Add inline calibration hint**
2622
+
2623
+ Below the prompt field, add a calibration hint that reads from the schedule list if a similar schedule exists (same `agentProfile` and non-null `avgTurnsPerFiring`). For v1, compute client-side:
2624
+
2625
+ ```tsx
2626
+ {suggestedSteps !== null && (
2627
+ <p className="text-muted-foreground text-xs">
2628
+ Schedules like this average ~{suggestedSteps} steps.
2629
+ </p>
2630
+ )}
2631
+ ```
2632
+
2633
+ `suggestedSteps` is a `useMemo` over the schedule list prop (or fetched once on mount) — pick the median `avgTurnsPerFiring` among existing schedules with the same `agentProfile`.
2634
+
2635
+ - [ ] **Step 13.4: Render collision warning banner in sheets**
2636
+
2637
+ Edit `src/components/schedules/schedule-create-sheet.tsx`. After the POST call, read `res.warnings` from the response. If non-empty, render an amber banner inside `SheetContent` (remember the recurring shadcn issue: body must have `px-6`):
2638
+
2639
+ ```tsx
2640
+ {warnings.length > 0 && (
2641
+ <div className="mx-6 mb-4 rounded-lg border border-amber-500/40 bg-amber-50 p-3 text-sm">
2642
+ <p className="font-medium text-amber-900">
2643
+ Overlap detected with: {warnings[0].overlappingSchedules.join(", ")}
2644
+ </p>
2645
+ <p className="text-amber-800">
2646
+ Combined load: ~{warnings[0].estimatedConcurrentSteps} agent steps.
2647
+ Schedules will take turns; the last to run may be delayed.
2648
+ </p>
2649
+ </div>
2650
+ )}
2651
+ ```
2652
+
2653
+ Do the same in `schedule-edit-sheet.tsx`.
2654
+
2655
+ Also update the POST/PUT result handlers to read `res.schedule` instead of the top-level response (response shape changed in Task 12).
2656
+
2657
+ - [ ] **Step 13.5: Smoke test**
2658
+
2659
+ Run: `npm run dev`
2660
+ Navigate to `/schedules`, create two schedules with overlapping crons (e.g. both at `0 * * * *`, with a high `avgTurnsPerFiring` seeded manually in the DB), and verify the warning banner renders. Then submit with a `maxTurns` value set and verify it persists by running `SELECT max_turns FROM schedules` in the sqlite3 CLI.
2661
+
2662
+ - [ ] **Step 13.6: Commit**
2663
+
2664
+ ```bash
2665
+ git add src/components/schedules/schedule-form.tsx src/components/schedules/schedule-create-sheet.tsx src/components/schedules/schedule-edit-sheet.tsx
2666
+ git commit -m "feat(schedules): max agent steps field + tooltips + collision warning banner"
2667
+ ```
2668
+
2669
+ ---
2670
+
2671
+ ## Task 14: Architectural decision records (TDRs)
2672
+
2673
+ **Files:**
2674
+ - Create: `.claude/skills/architect/references/tdr-atomic-slot-claim.md`
2675
+ - Create: `.claude/skills/architect/references/tdr-evidence-based-cap.md`
2676
+ - Create: `.claude/skills/architect/references/tdr-failure-class-streaks.md`
2677
+ - Create: `.claude/skills/architect/references/tdr-manual-honors-cap.md`
2678
+ - Create: `.claude/skills/architect/references/tdr-lock-holders-leased.md`
2679
+ - Create: `.claude/skills/architect/references/tdr-chat-shares-event-loop.md`
2680
+
2681
+ Each TDR is a short markdown file capturing the architectural principle and its motivating incident. Pattern: Title / Status / Date / Context / Decision / Consequences.
2682
+
2683
+ - [ ] **Step 14.1: TDR 1 — atomic slot claim**
2684
+
2685
+ Create `.claude/skills/architect/references/tdr-atomic-slot-claim.md` with this body:
2686
+
2687
+ ```markdown
2688
+ # TDR: Concurrency slot claim is a single SQL statement, not check-then-act
2689
+
2690
+ **Status:** Accepted
2691
+ **Date:** 2026-04-08
2692
+ **Incident:** 2026-04-08 schedule starvation (5 concurrent firings consumed ~12,600 turns, killed chat SSE)
2693
+
2694
+ ## Context
2695
+
2696
+ The scheduler has two concurrent coordination points: `tickScheduler()` (the poll loop) and `drainQueue()` (the post-completion chain at `src/lib/schedules/scheduler.ts:420`). Both need to check "is the global cap full?" before firing a new task. A naive SELECT then INSERT across these two entry points races and allows the cap to be exceeded.
2697
+
2698
+ ## Decision
2699
+
2700
+ The slot claim MUST be a single SQL statement. We use an atomic conditional UPDATE with a subquery inside the WHERE clause, exploiting SQLite's serialized write lock to guarantee two concurrent claim attempts cannot both succeed.
2701
+
2702
+ The implementation lives in `src/lib/schedules/slot-claim.ts`.
2703
+
2704
+ ## Consequences
2705
+
2706
+ - Future coordination primitives must also use single-statement atomic claims. Never SELECT then UPDATE.
2707
+ - The approach is SQLite-specific. If the backend moves to Postgres, revisit with SELECT ... FOR UPDATE or advisory locks.
2708
+ - `changes = 0` is a normal outcome meaning "lost the race" — callers must handle it as "leave in queued, retry via drain."
2709
+ ```
2710
+
2711
+ - [ ] **Step 14.2: TDR 2 — evidence-based cap**
2712
+
2713
+ Create `.claude/skills/architect/references/tdr-evidence-based-cap.md`:
2714
+
2715
+ ```markdown
2716
+ # TDR: Scheduler cap is static and evidence-based
2717
+
2718
+ **Status:** Accepted
2719
+ **Date:** 2026-04-08
2720
+
2721
+ ## Context
2722
+
2723
+ The 2026-04-08 incident showed 5 concurrent schedules starved the chat SSE stream. The cap of 2 (later 3) was chosen as a guess, not a measurement. Without `schedule_firing_metrics` we have no way to validate or refine it.
2724
+
2725
+ ## Decision
2726
+
2727
+ The cap starts at 2 and is raised to 3 only after one week of `schedule_firing_metrics` telemetry shows:
2728
+ - Chat SSE P99 first-token latency stays below 2 seconds
2729
+ - `event_loop_lag_ms` p99 stays below 50ms
2730
+ - `slot_wait_ms` p95 stays below 60s under typical load
2731
+
2732
+ Any future change to the cap requires re-running the validation against the metrics table.
2733
+
2734
+ ## Consequences
2735
+
2736
+ - `schedule_firing_metrics` is load-bearing. Never cut it from follow-up specs.
2737
+ - Dynamic cap adjustment is deferred until the static cap proves insufficient. Dynamic control loops have failure modes (oscillation, thundering herd) that don't belong in a first ship.
2738
+ ```
2739
+
2740
+ - [ ] **Step 14.3: TDR 3 — failure class streaks**
2741
+
2742
+ Create `.claude/skills/architect/references/tdr-failure-class-streaks.md`:
2743
+
2744
+ ```markdown
2745
+ # TDR: Auto-pause streak counts per failure class
2746
+
2747
+ **Status:** Accepted
2748
+ **Date:** 2026-04-08
2749
+
2750
+ ## Context
2751
+
2752
+ The original scheduler had a single `failureStreak` that tripped auto-pause after 3 consecutive failures regardless of cause. Sharing this counter across genuinely-failing runs and misconfigured `maxTurns` values is a footgun: a user who sets `maxTurns=10` on a schedule averaging 40 would trip auto-pause in 3 firings — potentially within 3 minutes on a `* * * * *` cron — before they realized the config took effect.
2753
+
2754
+ ## Decision
2755
+
2756
+ Split the streak counter per failure class:
2757
+ - `failureStreak` — generic failures (SDK error, timeout, auth, etc.). Auto-pause threshold: 3.
2758
+ - `turnBudgetBreachStreak` — turn-limit exceeded. Auto-pause threshold: 5, with first-breach grace: breaches in the first 2 cron intervals after a `maxTurnsSetAt` edit are logged only.
2759
+
2760
+ Future failure modes (e.g. context window exceeded, MCP tool failures) should each get their own counter if the appropriate auto-pause threshold differs from the generic 3.
2761
+
2762
+ ## Consequences
2763
+
2764
+ - `schedules` schema grows one counter column per named failure class.
2765
+ - The runtime adapter must write explicit `failure_reason` at terminal transitions so the classifier has reliable input — string-matching error text is fragile.
2766
+ ```
2767
+
2768
+ - [ ] **Step 14.4: TDR 4 — manual honors cap**
2769
+
2770
+ Create `.claude/skills/architect/references/tdr-manual-honors-cap.md`:
2771
+
2772
+ ```markdown
2773
+ # TDR: Manual execute honors the global cap by default
2774
+
2775
+ **Status:** Accepted
2776
+ **Date:** 2026-04-08
2777
+
2778
+ ## Context
2779
+
2780
+ Operational controls like "Run now" buttons are tempting to implement as cap-bypassing shortcuts, but a user who clicks them 5 times in 2 seconds can reproduce the exact incident profile that motivated the cap in the first place (2026-04-08: 5 concurrent Opus runs, ~12,600 turns, starved chat).
2781
+
2782
+ ## Decision
2783
+
2784
+ `POST /api/schedules/:id/execute` honors `SCHEDULE_MAX_CONCURRENT` by default. When the cap is full, return `429` with an ETA for the next free slot. An explicit `?force=true` query parameter bypasses the cap, logged to `usage_ledger` as `activityType='manual_force_bypass'` for audit.
2785
+
2786
+ ## Consequences
2787
+
2788
+ - Future operational endpoints (bulk re-run, workflow force-trigger) should follow the same pattern: honor cap + explicit force flag + audit log.
2789
+ - Users who genuinely need rapid-fire execution have an escape hatch, but the happy path defaults to safety.
2790
+ - Audit log entries can be queried to detect abusive or automated bypass patterns.
2791
+ ```
2792
+
2793
+ - [ ] **Step 14.5: TDR 5 — lock holders leased**
2794
+
2795
+ Create `.claude/skills/architect/references/tdr-lock-holders-leased.md`:
2796
+
2797
+ ```markdown
2798
+ # TDR: All lock holders carry lease expiries + reapers
2799
+
2800
+ **Status:** Accepted
2801
+ **Date:** 2026-04-08
2802
+
2803
+ ## Context
2804
+
2805
+ A hung SDK call can permanently wedge any lock: group locks, concurrency slots, even the existing per-schedule claim (which sets `nextFireAt = NULL` as a lock at `src/lib/schedules/scheduler.ts:240`; if `fireSchedule` throws before writing the new `nextFireAt`, the schedule is stuck until process restart).
2806
+
2807
+ ## Decision
2808
+
2809
+ Every lock primitive in the scheduler pipeline must carry a lease expiry and a reaper:
2810
+ 1. **Concurrency slots** — `tasks.lease_expires_at` reaped at each `tickScheduler()` call. Expired leases are aborted via the execution-manager AbortController and marked failed/lease_expired.
2811
+ 2. **Per-schedule claim** — currently relies on `bootstrapNextFireTimes()` at startup; future work should add a time-based reaper.
2812
+ 3. **New locks** — any future coordination primitive must ship with a reaper from day one.
2813
+
2814
+ Default lease: 20 minutes. Override per-schedule via `schedules.max_run_duration_sec`.
2815
+
2816
+ ## Consequences
2817
+
2818
+ - Lock holders cannot rely on "the other code path will clean this up." Every claim must be either released normally (on completion) or reaped (on lease expiry).
2819
+ - The reaper is idempotent — safe to run at every tick.
2820
+ - Aborting via AbortController requires the runtime adapter to honor the signal; all SDK query calls must pass through the abort controller from execution-manager.
2821
+ ```
2822
+
2823
+ - [ ] **Step 14.6: TDR 6 — chat shares event loop**
2824
+
2825
+ Create `.claude/skills/architect/references/tdr-chat-shares-event-loop.md`:
2826
+
2827
+ ```markdown
2828
+ # TDR: Chat and scheduled agents compete for the same Node event loop
2829
+
2830
+ **Status:** Accepted
2831
+ **Date:** 2026-04-08
2832
+
2833
+ ## Context
2834
+
2835
+ Stagent runs chat and scheduled tasks in the same Node process, on the same event loop. The 2026-04-08 incident showed this is a critical architectural constraint: when 5 schedules saturated the event loop, a user's chat SSE stream was starved and dropped mid-stream.
2836
+
2837
+ ## Decision
2838
+
2839
+ This is a known and intentional constraint until a worker-thread isolation architecture is designed. Any feature that adds agent-like workloads (image pipelines, MCP servers, streaming tools) must assume chat is on the critical path and must not starve it.
2840
+
2841
+ Mitigations:
2842
+ 1. Global concurrency cap limits scheduled agents to `SCHEDULE_MAX_CONCURRENT` (default 2).
2843
+ 2. Chat soft pressure signal — when chat is streaming, the scheduler defers new firings by 30s (`src/lib/chat/active-streams.ts` + `scheduler.ts:applyChatPressure`).
2844
+ 3. Spec B hotfix guarantees chat messages never persist as empty content even under worst-case contention.
2845
+
2846
+ ## Consequences
2847
+
2848
+ - Future high-throughput features must evaluate event-loop impact before shipping.
2849
+ - Worker-thread isolation is tracked as an architectural follow-up. This TDR is the anchor point for that future work.
2850
+ - Profiling under load should measure `event_loop_lag_ms` and alert when p99 exceeds 50ms.
2851
+ ```
2852
+
2853
+ - [ ] **Step 14.7: Commit**
2854
+
2855
+ ```bash
2856
+ git add .claude/skills/architect/references/tdr-atomic-slot-claim.md .claude/skills/architect/references/tdr-evidence-based-cap.md .claude/skills/architect/references/tdr-failure-class-streaks.md .claude/skills/architect/references/tdr-manual-honors-cap.md .claude/skills/architect/references/tdr-lock-holders-leased.md .claude/skills/architect/references/tdr-chat-shares-event-loop.md
2857
+ git commit -m "docs(architect): 6 TDRs for schedule orchestration principles"
2858
+ ```
2859
+
2860
+ ---
2861
+
2862
+ ## Task 15: End-to-end integration test
2863
+
2864
+ **Files:**
2865
+ - Create: `src/lib/schedules/__tests__/integration.test.ts`
2866
+
2867
+ Final integration test validating that the full cap + queue + reap path composes correctly. No Opus calls — the runtime is mocked.
2868
+
2869
+ - [ ] **Step 15.1: Write integration test**
2870
+
2871
+ Create `src/lib/schedules/__tests__/integration.test.ts`:
2872
+
2873
+ ```typescript
2874
+ import { describe, it, expect, beforeEach, vi } from "vitest";
2875
+ import { db } from "@/lib/db";
2876
+ import {
2877
+ tasks,
2878
+ schedules,
2879
+ projects,
2880
+ settings,
2881
+ scheduleFiringMetrics,
2882
+ agentLogs,
2883
+ } from "@/lib/db/schema";
2884
+ import { eq } from "drizzle-orm";
2885
+ import { randomUUID } from "crypto";
2886
+ import { tickScheduler } from "../scheduler";
2887
+ import { countRunningScheduledSlots } from "../slot-claim";
2888
+
2889
+ vi.mock("@/lib/agents/runtime", () => ({
2890
+ executeTaskWithRuntime: vi.fn(async () => {
2891
+ // Simulate a short-running task
2892
+ await new Promise((r) => setTimeout(r, 20));
2893
+ }),
2894
+ }));
2895
+
2896
+ describe("schedule orchestration end-to-end", () => {
2897
+ beforeEach(() => {
2898
+ db.delete(scheduleFiringMetrics).run();
2899
+ db.delete(agentLogs).run();
2900
+ db.delete(tasks).run();
2901
+ db.delete(schedules).run();
2902
+ db.delete(projects).run();
2903
+ db.delete(settings).where(eq(settings.key, "schedule.maxConcurrent")).run();
2904
+ db.insert(settings)
2905
+ .values({ key: "schedule.maxConcurrent", value: "2", updatedAt: new Date() })
2906
+ .run();
2907
+ });
2908
+
2909
+ it("5 schedules firing at once → exactly 2 run, 3 queue", async () => {
2910
+ const pid = randomUUID();
2911
+ const now = new Date();
2912
+ db.insert(projects)
2913
+ .values({ id: pid, name: "p", status: "active", createdAt: now, updatedAt: now })
2914
+ .run();
2915
+
2916
+ const past = new Date(now.getTime() - 10_000);
2917
+ for (let i = 0; i < 5; i++) {
2918
+ db.insert(schedules)
2919
+ .values({
2920
+ id: randomUUID(),
2921
+ projectId: pid,
2922
+ name: `sched-${i}`,
2923
+ prompt: "test",
2924
+ cronExpression: "* * * * *",
2925
+ status: "active",
2926
+ type: "scheduled",
2927
+ firingCount: 0,
2928
+ suppressionCount: 0,
2929
+ heartbeatSpentToday: 0,
2930
+ failureStreak: 0,
2931
+ turnBudgetBreachStreak: 0,
2932
+ nextFireAt: past,
2933
+ createdAt: now,
2934
+ updatedAt: now,
2935
+ })
2936
+ .run();
2937
+ }
2938
+
2939
+ await tickScheduler();
2940
+
2941
+ expect(countRunningScheduledSlots()).toBe(2);
2942
+ const queued = db
2943
+ .select()
2944
+ .from(tasks)
2945
+ .where(eq(tasks.status, "queued"))
2946
+ .all();
2947
+ expect(queued.length).toBe(3);
2948
+ });
2949
+ });
2950
+ ```
2951
+
2952
+ - [ ] **Step 15.2: Run and verify PASS**
2953
+
2954
+ Run: `npx vitest run src/lib/schedules/__tests__/integration.test.ts`
2955
+ Expected: PASS.
2956
+
2957
+ - [ ] **Step 15.3: Run full suite for final regression check**
2958
+
2959
+ Run: `npx vitest run`
2960
+ Expected: PASS across all test files.
2961
+
2962
+ - [ ] **Step 15.4: Commit**
2963
+
2964
+ ```bash
2965
+ git add src/lib/schedules/__tests__/integration.test.ts
2966
+ git commit -m "test(schedules): end-to-end cap + queue integration test"
2967
+ ```
2968
+
2969
+ ---
2970
+
2971
+ ## Final verification
2972
+
2973
+ After all tasks complete:
2974
+
2975
+ - [ ] **Full test suite**: `npx vitest run` — all green, no regressions
2976
+ - [ ] **TypeScript check**: `npx tsc --noEmit` — zero errors
2977
+ - [ ] **Manual smoke test**: `npm run dev`, create 3 schedules with overlapping crons, observe that cap enforcement kicks in and that queued schedules drain sequentially
2978
+ - [ ] **Incident reproduction**: manually fire 5 schedules via `POST /api/schedules/:id/execute?force=true` in rapid succession, send a chat message, confirm the chat SSE stream stays responsive and no empty-content rows are left behind
2979
+ - [ ] **Telemetry check**: query `SELECT * FROM schedule_firing_metrics` — confirm rows exist with non-null `slot_wait_ms` and `duration_ms`
2980
+ - [ ] **Roadmap update**: append a "Schedule Orchestration Resilience" subsection to `features/roadmap.md` with A/B/C completed entries plus future `schedule-collision-prevention` and `schedule-forecasting` entries
2981
+ - [ ] **Ship**: push the branch, open a PR, wait for CI, merge. Leave `SCHEDULE_MAX_CONCURRENT=2` for the first week of telemetry
2982
+
2983
+ If all items pass, the feature is ready to ship. After one week of telemetry showing chat SSE p99 < 2s and event-loop lag p99 < 50ms, raise `SCHEDULE_MAX_CONCURRENT` from 2 to 3 and continue monitoring.