stagent 0.9.5 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -42
- package/dist/cli.js +42 -18
- package/docs/.coverage-gaps.json +13 -55
- package/docs/.last-generated +1 -1
- package/docs/features/provider-runtimes.md +4 -0
- package/docs/features/schedules.md +32 -4
- package/docs/features/settings.md +28 -5
- package/docs/features/tables.md +9 -2
- package/docs/features/workflows.md +10 -4
- package/docs/journeys/developer.md +15 -1
- package/docs/journeys/personal-use.md +21 -4
- package/docs/superpowers/plans/2026-04-07-instance-bootstrap.md +1691 -0
- package/docs/superpowers/plans/2026-04-08-schedule-orchestration.md +2983 -0
- package/docs/superpowers/plans/2026-04-11-schedule-maxturns-api-control.md +551 -0
- package/docs/superpowers/plans/2026-04-11-task-create-profile-validation.md +864 -0
- package/docs/superpowers/plans/2026-04-11-task-runtime-stagent-mcp-injection.md +739 -0
- package/docs/superpowers/specs/2026-04-08-chat-sse-resilience-hotfix-design.md +201 -0
- package/docs/superpowers/specs/2026-04-08-schedule-orchestration-design.md +371 -0
- package/docs/superpowers/specs/2026-04-08-swarm-visibility-design.md +213 -0
- package/package.json +3 -2
- package/src/__tests__/instrumentation-smoke.test.ts +15 -0
- package/src/app/analytics/page.tsx +1 -21
- package/src/app/api/chat/conversations/[id]/messages/route.ts +22 -1
- package/src/app/api/diagnostics/chat-streams/route.ts +65 -0
- package/src/app/api/instance/config/route.ts +41 -0
- package/src/app/api/instance/init/route.ts +34 -0
- package/src/app/api/instance/upgrade/check/route.ts +26 -0
- package/src/app/api/instance/upgrade/route.ts +96 -0
- package/src/app/api/instance/upgrade/status/route.ts +35 -0
- package/src/app/api/memory/route.ts +0 -11
- package/src/app/api/notifications/route.ts +4 -2
- package/src/app/api/projects/[id]/route.ts +5 -155
- package/src/app/api/projects/__tests__/delete-project.test.ts +10 -19
- package/src/app/api/schedules/[id]/execute/route.ts +111 -0
- package/src/app/api/schedules/[id]/route.ts +9 -1
- package/src/app/api/schedules/__tests__/execute-route.test.ts +118 -0
- package/src/app/api/schedules/route.ts +3 -12
- package/src/app/api/settings/openai/login/route.ts +22 -0
- package/src/app/api/settings/openai/logout/route.ts +7 -0
- package/src/app/api/settings/openai/route.ts +21 -1
- package/src/app/api/settings/providers/route.ts +35 -8
- package/src/app/api/tables/[id]/enrich/__tests__/route.test.ts +153 -0
- package/src/app/api/tables/[id]/enrich/plan/route.ts +98 -0
- package/src/app/api/tables/[id]/enrich/route.ts +147 -0
- package/src/app/api/tables/[id]/enrich/runs/route.ts +25 -0
- package/src/app/api/tasks/[id]/execute/route.ts +0 -21
- package/src/app/api/workflows/[id]/resume/route.ts +59 -0
- package/src/app/api/workflows/[id]/status/route.ts +22 -8
- package/src/app/api/workspace/context/route.ts +2 -0
- package/src/app/api/workspace/fix-data-dir/route.ts +81 -0
- package/src/app/chat/page.tsx +11 -0
- package/src/app/inbox/page.tsx +12 -5
- package/src/app/layout.tsx +42 -21
- package/src/app/page.tsx +0 -2
- package/src/app/settings/page.tsx +6 -9
- package/src/components/chat/__tests__/chat-session-provider.test.tsx +408 -0
- package/src/components/chat/chat-command-popover.tsx +2 -2
- package/src/components/chat/chat-input.tsx +2 -3
- package/src/components/chat/chat-session-provider.tsx +720 -0
- package/src/components/chat/chat-shell.tsx +92 -401
- package/src/components/instance/__tests__/instance-section.test.tsx +125 -0
- package/src/components/instance/instance-section.tsx +382 -0
- package/src/components/instance/upgrade-badge.tsx +219 -0
- package/src/components/notifications/__tests__/batch-proposal-review.test.tsx +95 -0
- package/src/components/notifications/__tests__/notification-item.test.tsx +106 -0
- package/src/components/notifications/batch-proposal-review.tsx +20 -5
- package/src/components/notifications/inbox-list.tsx +11 -2
- package/src/components/notifications/notification-item.tsx +56 -2
- package/src/components/notifications/pending-approval-host.tsx +56 -37
- package/src/components/schedules/schedule-create-sheet.tsx +19 -1
- package/src/components/schedules/schedule-edit-sheet.tsx +20 -1
- package/src/components/schedules/schedule-form.tsx +31 -0
- package/src/components/settings/__tests__/providers-runtimes-section.test.tsx +149 -0
- package/src/components/settings/auth-method-selector.tsx +19 -4
- package/src/components/settings/auth-status-badge.tsx +28 -3
- package/src/components/settings/openai-chatgpt-auth-control.tsx +278 -0
- package/src/components/settings/openai-runtime-section.tsx +7 -1
- package/src/components/settings/providers-runtimes-section.tsx +138 -19
- package/src/components/shared/app-sidebar.tsx +4 -3
- package/src/components/shared/command-palette.tsx +4 -5
- package/src/components/shared/theme-toggle.tsx +5 -24
- package/src/components/shared/workspace-indicator.tsx +61 -2
- package/src/components/tables/__tests__/table-enrichment-sheet.test.tsx +130 -0
- package/src/components/tables/table-create-sheet.tsx +4 -0
- package/src/components/tables/table-enrichment-runs.tsx +103 -0
- package/src/components/tables/table-enrichment-sheet.tsx +538 -0
- package/src/components/tables/table-spreadsheet.tsx +29 -5
- package/src/components/tables/table-toolbar.tsx +10 -1
- package/src/components/tasks/kanban-board.tsx +1 -0
- package/src/components/tasks/kanban-column.tsx +53 -14
- package/src/components/tasks/task-bento-grid.tsx +19 -0
- package/src/components/tasks/task-card.tsx +26 -3
- package/src/components/tasks/task-chip-bar.tsx +24 -0
- package/src/components/tasks/task-result-renderer.tsx +1 -1
- package/src/components/workflows/delay-step-body.tsx +109 -0
- package/src/components/workflows/hooks/use-workflow-status.ts +50 -0
- package/src/components/workflows/loop-status-view.tsx +1 -1
- package/src/components/workflows/shared/step-result.tsx +78 -0
- package/src/components/workflows/shared/workflow-header.tsx +141 -0
- package/src/components/workflows/shared/workflow-loading-skeleton.tsx +36 -0
- package/src/components/workflows/swarm-dashboard.tsx +2 -15
- package/src/components/workflows/views/loop-pattern-view.tsx +137 -0
- package/src/components/workflows/views/sequence-pattern-view.tsx +511 -0
- package/src/components/workflows/workflow-form-view.tsx +133 -16
- package/src/components/workflows/workflow-status-view.tsx +30 -740
- package/src/instrumentation-node.ts +94 -0
- package/src/instrumentation.ts +4 -48
- package/src/lib/agents/__tests__/claude-agent.test.ts +199 -0
- package/src/lib/agents/__tests__/execution-manager.test.ts +1 -27
- package/src/lib/agents/__tests__/failure-reason.test.ts +68 -0
- package/src/lib/agents/__tests__/learned-context.test.ts +0 -11
- package/src/lib/agents/__tests__/learning-session.test.ts +158 -0
- package/src/lib/agents/__tests__/pattern-extractor.test.ts +48 -0
- package/src/lib/agents/claude-agent.ts +155 -18
- package/src/lib/agents/execution-manager.ts +0 -35
- package/src/lib/agents/learned-context.ts +0 -12
- package/src/lib/agents/learning-session.ts +18 -5
- package/src/lib/agents/profiles/__tests__/registry.test.ts +6 -4
- package/src/lib/agents/profiles/builtins/upgrade-assistant/SKILL.md +70 -0
- package/src/lib/agents/profiles/builtins/upgrade-assistant/profile.yaml +32 -0
- package/src/lib/agents/runtime/__tests__/openai-codex-auth.test.ts +118 -0
- package/src/lib/agents/runtime/codex-app-server-client.ts +11 -5
- package/src/lib/agents/runtime/openai-codex-auth.ts +389 -0
- package/src/lib/agents/runtime/openai-codex.ts +29 -60
- package/src/lib/agents/runtime/types.ts +8 -0
- package/src/lib/book/chapter-mapping.ts +11 -0
- package/src/lib/book/content.ts +10 -0
- package/src/lib/chat/__tests__/active-streams.test.ts +49 -0
- package/src/lib/chat/__tests__/finalize-safety-net.test.ts +139 -0
- package/src/lib/chat/__tests__/reconcile.test.ts +137 -0
- package/src/lib/chat/__tests__/stream-telemetry.test.ts +151 -0
- package/src/lib/chat/active-streams.ts +27 -0
- package/src/lib/chat/codex-engine.ts +16 -17
- package/src/lib/chat/context-builder.ts +5 -3
- package/src/lib/chat/engine.ts +50 -3
- package/src/lib/chat/reconcile.ts +117 -0
- package/src/lib/chat/stagent-tools.ts +1 -0
- package/src/lib/chat/stream-telemetry.ts +132 -0
- package/src/lib/chat/suggested-prompts.ts +28 -1
- package/src/lib/chat/system-prompt.ts +26 -1
- package/src/lib/chat/tool-catalog.ts +2 -1
- package/src/lib/chat/tools/__tests__/enrich-table-tool.test.ts +127 -0
- package/src/lib/chat/tools/__tests__/schedule-tools.test.ts +261 -0
- package/src/lib/chat/tools/__tests__/task-tools.test.ts +352 -0
- package/src/lib/chat/tools/__tests__/workflow-tools-dedup.test.ts +217 -0
- package/src/lib/chat/tools/document-tools.ts +29 -13
- package/src/lib/chat/tools/helpers.ts +39 -0
- package/src/lib/chat/tools/notification-tools.ts +9 -5
- package/src/lib/chat/tools/project-tools.ts +33 -0
- package/src/lib/chat/tools/schedule-tools.ts +44 -11
- package/src/lib/chat/tools/table-tools.ts +71 -0
- package/src/lib/chat/tools/task-tools.ts +84 -20
- package/src/lib/chat/tools/workflow-tools.ts +234 -32
- package/src/lib/constants/settings.ts +8 -18
- package/src/lib/data/__tests__/clear.test.ts +56 -2
- package/src/lib/data/clear.ts +20 -15
- package/src/lib/data/delete-project.ts +171 -0
- package/src/lib/db/__tests__/bootstrap.test.ts +1 -1
- package/src/lib/db/bootstrap.ts +45 -16
- package/src/lib/db/index.ts +5 -0
- package/src/lib/db/migrations/0009_add_app_instances.sql +25 -0
- package/src/lib/db/migrations/0024_add_workflow_resume_at.sql +10 -0
- package/src/lib/db/migrations/0025_drop_app_instances.sql +3 -0
- package/src/lib/db/migrations/0026_drop_license.sql +3 -0
- package/src/lib/db/migrations/meta/_journal.json +21 -0
- package/src/lib/db/schema.ts +68 -23
- package/src/lib/environment/workspace-context.ts +13 -1
- package/src/lib/import/dedup.ts +4 -54
- package/src/lib/instance/__tests__/bootstrap.test.ts +362 -0
- package/src/lib/instance/__tests__/detect.test.ts +115 -0
- package/src/lib/instance/__tests__/fingerprint.test.ts +48 -0
- package/src/lib/instance/__tests__/git-ops.test.ts +95 -0
- package/src/lib/instance/__tests__/settings.test.ts +83 -0
- package/src/lib/instance/__tests__/upgrade-poller.test.ts +131 -0
- package/src/lib/instance/bootstrap.ts +270 -0
- package/src/lib/instance/detect.ts +49 -0
- package/src/lib/instance/fingerprint.ts +78 -0
- package/src/lib/instance/git-ops.ts +95 -0
- package/src/lib/instance/settings.ts +61 -0
- package/src/lib/instance/types.ts +77 -0
- package/src/lib/instance/upgrade-poller.ts +153 -0
- package/src/lib/notifications/__tests__/visibility.test.ts +51 -0
- package/src/lib/notifications/visibility.ts +33 -0
- package/src/lib/schedules/__tests__/collision-check.test.ts +93 -0
- package/src/lib/schedules/__tests__/config.test.ts +62 -0
- package/src/lib/schedules/__tests__/firing-metrics.test.ts +99 -0
- package/src/lib/schedules/__tests__/integration.test.ts +82 -0
- package/src/lib/schedules/__tests__/slot-claim.test.ts +242 -0
- package/src/lib/schedules/__tests__/tick-scheduler.test.ts +102 -0
- package/src/lib/schedules/__tests__/turn-budget.test.ts +228 -0
- package/src/lib/schedules/collision-check.ts +105 -0
- package/src/lib/schedules/config.ts +53 -0
- package/src/lib/schedules/scheduler.ts +232 -13
- package/src/lib/schedules/slot-claim.ts +105 -0
- package/src/lib/settings/__tests__/openai-auth.test.ts +101 -0
- package/src/lib/settings/__tests__/openai-login-manager.test.ts +64 -0
- package/src/lib/settings/__tests__/runtime-setup.test.ts +33 -0
- package/src/lib/settings/openai-auth.ts +105 -10
- package/src/lib/settings/openai-login-manager.ts +260 -0
- package/src/lib/settings/runtime-setup.ts +14 -4
- package/src/lib/tables/__tests__/enrichment-planner.test.ts +124 -0
- package/src/lib/tables/__tests__/enrichment.test.ts +147 -0
- package/src/lib/tables/enrichment-planner.ts +454 -0
- package/src/lib/tables/enrichment.ts +328 -0
- package/src/lib/tables/query-builder.ts +5 -2
- package/src/lib/tables/trigger-evaluator.ts +3 -2
- package/src/lib/theme.ts +71 -0
- package/src/lib/usage/ledger.ts +2 -18
- package/src/lib/util/__tests__/similarity.test.ts +106 -0
- package/src/lib/util/similarity.ts +77 -0
- package/src/lib/utils/format-timestamp.ts +24 -0
- package/src/lib/utils/stagent-paths.ts +12 -0
- package/src/lib/validators/__tests__/blueprint.test.ts +172 -0
- package/src/lib/validators/__tests__/settings.test.ts +10 -0
- package/src/lib/validators/blueprint.ts +70 -9
- package/src/lib/validators/profile.ts +2 -2
- package/src/lib/validators/settings.ts +3 -1
- package/src/lib/workflows/__tests__/delay.test.ts +196 -0
- package/src/lib/workflows/__tests__/engine.test.ts +8 -0
- package/src/lib/workflows/__tests__/loop-executor.test.ts +54 -0
- package/src/lib/workflows/__tests__/post-action.test.ts +108 -0
- package/src/lib/workflows/blueprints/instantiator.ts +22 -1
- package/src/lib/workflows/blueprints/types.ts +10 -2
- package/src/lib/workflows/delay.ts +106 -0
- package/src/lib/workflows/engine.ts +207 -4
- package/src/lib/workflows/loop-executor.ts +349 -24
- package/src/lib/workflows/post-action.ts +91 -0
- package/src/lib/workflows/types.ts +166 -1
- package/src/app/api/license/checkout/route.ts +0 -28
- package/src/app/api/license/portal/route.ts +0 -26
- package/src/app/api/license/route.ts +0 -89
- package/src/app/api/license/usage/route.ts +0 -63
- package/src/app/api/marketplace/browse/route.ts +0 -15
- package/src/app/api/marketplace/import/route.ts +0 -28
- package/src/app/api/marketplace/publish/route.ts +0 -40
- package/src/app/api/onboarding/email/route.ts +0 -53
- package/src/app/api/settings/telemetry/route.ts +0 -14
- package/src/app/api/sync/export/route.ts +0 -54
- package/src/app/api/sync/restore/route.ts +0 -37
- package/src/app/api/sync/sessions/route.ts +0 -24
- package/src/app/auth/callback/route.ts +0 -73
- package/src/app/marketplace/page.tsx +0 -19
- package/src/components/analytics/analytics-gate-card.tsx +0 -101
- package/src/components/marketplace/blueprint-card.tsx +0 -61
- package/src/components/marketplace/marketplace-browser.tsx +0 -131
- package/src/components/onboarding/email-capture-card.tsx +0 -104
- package/src/components/settings/activation-form.tsx +0 -95
- package/src/components/settings/cloud-account-section.tsx +0 -147
- package/src/components/settings/cloud-sync-section.tsx +0 -155
- package/src/components/settings/subscription-section.tsx +0 -410
- package/src/components/settings/telemetry-section.tsx +0 -80
- package/src/components/shared/premium-gate-overlay.tsx +0 -50
- package/src/components/shared/schedule-gate-dialog.tsx +0 -64
- package/src/components/shared/upgrade-banner.tsx +0 -112
- package/src/hooks/use-supabase-auth.ts +0 -79
- package/src/lib/billing/email.ts +0 -54
- package/src/lib/billing/products.ts +0 -80
- package/src/lib/billing/stripe.ts +0 -101
- package/src/lib/cloud/supabase-browser.ts +0 -32
- package/src/lib/cloud/supabase-client.ts +0 -56
- package/src/lib/license/__tests__/features.test.ts +0 -56
- package/src/lib/license/__tests__/key-format.test.ts +0 -88
- package/src/lib/license/__tests__/manager.test.ts +0 -64
- package/src/lib/license/__tests__/tier-limits.test.ts +0 -79
- package/src/lib/license/cloud-validation.ts +0 -60
- package/src/lib/license/features.ts +0 -44
- package/src/lib/license/key-format.ts +0 -101
- package/src/lib/license/limit-check.ts +0 -111
- package/src/lib/license/limit-queries.ts +0 -51
- package/src/lib/license/manager.ts +0 -345
- package/src/lib/license/notifications.ts +0 -59
- package/src/lib/license/tier-limits.ts +0 -71
- package/src/lib/marketplace/marketplace-client.ts +0 -107
- package/src/lib/sync/cloud-sync.ts +0 -235
- package/src/lib/telemetry/conversion-events.ts +0 -71
- package/src/lib/telemetry/queue.ts +0 -122
- package/src/lib/validators/license.ts +0 -33
|
@@ -0,0 +1,2983 @@
|
|
|
1
|
+
# Schedule Orchestration Implementation Plan
|
|
2
|
+
|
|
3
|
+
> **For agentic workers:** REQUIRED SUB-SKILL: Use `superpowers:subagent-driven-development` (recommended) or `superpowers:executing-plans` to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
|
|
4
|
+
|
|
5
|
+
**Goal:** Prevent concurrent scheduled agents from starving the chat SSE stream by introducing a global concurrency cap enforced via atomic slot claim, per-schedule turn budgets, lease-based timeouts with a reaper, a minimal pre-flight collision warning, and a time-series metrics table for evidence-based cap tuning.
|
|
6
|
+
|
|
7
|
+
**Architecture:** Changes are concentrated in `src/lib/schedules/scheduler.ts`, a new `src/lib/schedules/slot-claim.ts` primitive, and the claude runtime adapter. Coordination uses atomic single-SQL conditional updates (no check-then-act). Lease expiry via `AbortController` reaped at each tick. Per-schedule turn budget is a new `max_turns` column propagated into the `tasks` row at firing time and threaded into the SDK `query()` call. Chat soft pressure uses a module-level `Set<conversationId>` in `src/lib/chat/active-streams.ts` checked by `tickScheduler()` to defer (not block) new firings.
|
|
8
|
+
|
|
9
|
+
**Tech Stack:** TypeScript, better-sqlite3 (synchronous), Drizzle ORM, `@anthropic-ai/claude-agent-sdk`, `cron-parser` (already in tree for `expandCronMinutes`), vitest with real temp-dir SQLite, Next.js `register()` instrumentation hook.
|
|
10
|
+
|
|
11
|
+
**Worktree guidance:** This plan makes invasive changes to scheduler semantics. Run it in a dedicated worktree:
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
git worktree add -b schedule-orchestration ../stagent-schedule-orchestration main
|
|
15
|
+
cd ../stagent-schedule-orchestration
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
---
|
|
19
|
+
|
|
20
|
+
## NOT in scope
|
|
21
|
+
|
|
22
|
+
Explicit deferrals to prevent scope re-creep during execution:
|
|
23
|
+
|
|
24
|
+
- **`concurrencyGroup` column and per-group locks** — deferred to a follow-up spec. The 2026-04-08 incident was a global-cap problem; groups add a second coordination primitive whose correctness depends on solving the first. Ship global cap alone.
|
|
25
|
+
- **Auto-stagger endpoint, 48h forecast, collision-forecast notifications** — future spec "Schedule Predictability & Forecasting". Only the minimal save-time collision *warning* is in this plan.
|
|
26
|
+
- **Turn drift detection across 3-run moving window, efficiency scoring (`useful_actions / total_turns`)** — future spec "Schedule Observability".
|
|
27
|
+
- **`turnBudgetAction: 'optimize'` meta-agent prompt rewriter** — future spec "Agent Self-Optimization".
|
|
28
|
+
- **Hard chat priority / `pauseSchedulesDuringChat` setting** — only if the soft pressure signal proves insufficient post-launch.
|
|
29
|
+
- **Dynamic adaptive cap** based on measured P99 chat latency — architect explicitly recommended against until static cap proves insufficient.
|
|
30
|
+
- **`usage_ledger.turn_count` column** — derivable from `schedule_firing_metrics` and `agent_logs`.
|
|
31
|
+
- **`swarm_snapshots` time-series table** — deferred to Spec C follow-ups ("Swarm Activity Feed").
|
|
32
|
+
- **Worker-thread isolation for the agent runtime** — architectural bet, separate design effort.
|
|
33
|
+
- **UI visibility layer** — delivered by Spec C (Swarm Visibility), which depends on this plan's API shape but is a separate plan.
|
|
34
|
+
|
|
35
|
+
## What already exists
|
|
36
|
+
|
|
37
|
+
Reusable code and patterns confirmed during exploration. Do not rebuild these:
|
|
38
|
+
|
|
39
|
+
- **`src/lib/schedules/scheduler.ts:238-252`** — atomic per-schedule claim via conditional WHERE UPDATE. The new global-cap claim follows the same single-SQL-statement pattern.
|
|
40
|
+
- **`src/lib/schedules/scheduler.ts:51-95`** — `drainQueue()` with module-level `draining` flag. The new atomic claim must be correct under the drain + tick interleaving.
|
|
41
|
+
- **`src/lib/schedules/scheduler.ts:304-322`** — existing title-pattern sibling guard. Keep as-is; global cap layers on top.
|
|
42
|
+
- **`src/lib/schedules/scheduler.ts:122-133`** — `detectFailureReason()`. Keep as fallback; runtime adapter will write explicit `failure_reason` at terminal transitions.
|
|
43
|
+
- **`src/lib/schedules/scheduler.ts:140-186`** — `recordFiringMetrics()` is the natural hook for inserting into the new firing-metrics table.
|
|
44
|
+
- **`src/lib/schedules/interval-parser.ts:92`** — `expandCronMinutes()` expands a cron into the list of fire minutes. Reuse for collision-check bucketing.
|
|
45
|
+
- **`src/lib/agents/execution-manager.ts:14-62`** — in-memory `Map<taskId, RunningExecution>` with `abortController` on each entry. The reaper uses this to abort expired leases.
|
|
46
|
+
- **`src/lib/agents/claude-agent.ts:444-470`** — SDK `query()` invocation. `maxTurns` is passed through `ctx.maxTurns`. Override when a task came from a schedule with its own `max_turns`.
|
|
47
|
+
- **`src/lib/agents/claude-agent.ts:358-414`** — `buildTaskQueryContext()` resolves `maxTurns` from profile fallback. Keep as default; schedule-level override takes precedence.
|
|
48
|
+
- **`src/lib/db/bootstrap.ts:266-275`** — `addColumnIfMissing()` helper: tolerates `duplicate column` errors so `ALTER TABLE ADD COLUMN` is idempotent across dev and deployed DBs. Use this for all new columns.
|
|
49
|
+
- **`src/lib/data/clear.ts`** — FK-safe deletion order. Tests enforce that every schema table is deleted. New tables must be added here.
|
|
50
|
+
- **`src/lib/settings/helpers.ts:12`** — `getSettingSync(key)` for in-process reads. Use sync helpers inside hot scheduler paths to avoid needless awaits.
|
|
51
|
+
- **`src/lib/constants/settings.ts`** — `SETTINGS_KEYS` enum. Add new keys here.
|
|
52
|
+
- **`src/lib/chat/engine.ts:256`** — chat stream start point (where `fullText = ""` is initialized and streaming begins). The `active-streams.ts` set will be populated here and cleared in the finally block alongside `cleanupConversation()`.
|
|
53
|
+
- **`src/test/setup.ts:6-10`** — vitest setup creates a temp-dir SQLite per run via `STAGENT_DATA_DIR`. Tests can freely insert/query against a real DB.
|
|
54
|
+
- **`src/lib/chat/reconcile.ts`** (NEW from Spec B hotfix, already committed) — `finalizeStreamingMessage()` and `reconcileStreamingMessages()`. Pattern reference for pure DB-only helpers tested in isolation.
|
|
55
|
+
|
|
56
|
+
## Error & Rescue Registry
|
|
57
|
+
|
|
58
|
+
HOLD-mode feature — each primitive's failure path is enumerated and rescued.
|
|
59
|
+
|
|
60
|
+
| Error | Trigger | Impact | Rescue |
|
|
61
|
+
|---|---|---|---|
|
|
62
|
+
| Two ticks race on slot claim | `drainQueue()` concurrent with `tickScheduler()` via `.then()` chain at scheduler.ts:420 | Cap breached if naive check-then-act | Atomic single-SQL claim — `changes=0` means lost the race; leave row in `queued`, let next drain retry |
|
|
63
|
+
| SDK hangs mid-run | Upstream Opus stall, network partition, subprocess deadlock | Slot held forever, cap permanently reduced (e.g. 2→1 effective) | Lease expiry + reaper at each tick aborts via `execution-manager.RunningExecution.abortController`; DB update to `failed`/`lease_expired` frees the slot |
|
|
64
|
+
| Reaper itself throws | Rare DB error during `SELECT expired` or per-task `UPDATE` | Expired leases accumulate | Reaper catches per-task errors; the sweep continues to the next expired row. Next tick retries anything missed |
|
|
65
|
+
| Reaper aborts a task that legitimately needs 25 min | Per-schedule `max_run_duration_sec` not configured; default 20 min too tight | Legitimate run killed | Per-schedule `max_run_duration_sec` override. Drift-warn when a run completes at >80% of lease on 3 consecutive firings so users raise the cap |
|
|
66
|
+
| User sets `max_turns=10` on schedule averaging 40 | Config footgun | Would trip auto-pause in 3 firings under shared streak | First-breach grace + separate `turn_budget_breach_streak` with threshold 5; drift warning at 2 advising raise |
|
|
67
|
+
| `detectFailureReason()` misclassifies | SDK error text changes format | Wrong streak incremented | Runtime adapter writes explicit `failure_reason` at terminal transitions; string-match is fallback only |
|
|
68
|
+
| Manual execute spammed | User double-clicks "Run now" 5× | Could exceed cap under naive design | Manual honors cap by default; returns `429 {error, slotEtaSec}`; `?force=true` bypasses with audit log |
|
|
69
|
+
| Chat pressure delay causes schedule to miss a minute | User has `* * * * *` cron, chat is streaming for 45s | Minute skipped | 30s delay is one-shot per tick; next tick re-evaluates. Documented in UI help text |
|
|
70
|
+
| Firing metrics unbounded growth | High-frequency schedules over months | Disk bloat | Periodic cleanup `DELETE WHERE fired_at < now() - 30 days` in a post-tick maintenance pass |
|
|
71
|
+
| Clock skew between JS `Date.now()` and SQLite `CURRENT_TIMESTAMP` | Container restart, NTP drift | `lease_expires_at` mismatches | Use consistent Unix-ms integers everywhere; no mixing of SQL clock and JS clock inside one comparison |
|
|
72
|
+
| Collision check runs against in-flight chat-pressure-shifted fire time | Deterministic warning becomes nondeterministic | Confusing UX | Collision check always runs against *nominal* cron expansion, never adjusted times |
|
|
73
|
+
| `SCHEDULE_MAX_CONCURRENT` env var typo | User sets `=abc` | Silent fallback to default | `parseInt` with NaN guard; log warning; use default. Same pattern as existing SDK timeout handling |
|
|
74
|
+
| Tests pollute each other via shared temp DB | Multiple test files hitting same tables | Flaky tests | Every test file uses `beforeEach` to delete in FK-safe order (pattern from `src/lib/chat/__tests__/reconcile.test.ts`) |
|
|
75
|
+
|
|
76
|
+
---
|
|
77
|
+
|
|
78
|
+
## File Structure
|
|
79
|
+
|
|
80
|
+
**New files:**
|
|
81
|
+
|
|
82
|
+
```
|
|
83
|
+
src/lib/schedules/slot-claim.ts — Atomic claim primitive + reap helper
|
|
84
|
+
src/lib/schedules/collision-check.ts — 24h cron expansion + 5-min bucket overlap detector
|
|
85
|
+
src/lib/schedules/config.ts — Config reader helpers for the new settings keys
|
|
86
|
+
src/lib/chat/active-streams.ts — Module-level Set tracking in-flight chat streams
|
|
87
|
+
src/app/api/schedules/[id]/execute/route.ts — Manual fire endpoint (does not exist today)
|
|
88
|
+
|
|
89
|
+
src/lib/schedules/__tests__/slot-claim.test.ts — Race + reap tests
|
|
90
|
+
src/lib/schedules/__tests__/collision-check.test.ts — Overlap detection tests
|
|
91
|
+
src/lib/schedules/__tests__/turn-budget.test.ts — First-breach grace + streak threshold
|
|
92
|
+
src/lib/schedules/__tests__/tick-scheduler.test.ts — Cap + chat pressure
|
|
93
|
+
src/lib/schedules/__tests__/firing-metrics.test.ts — Metrics insertion
|
|
94
|
+
src/lib/schedules/__tests__/integration.test.ts — End-to-end
|
|
95
|
+
src/lib/schedules/__tests__/config.test.ts — Config reader
|
|
96
|
+
src/lib/chat/__tests__/active-streams.test.ts — Set lifecycle
|
|
97
|
+
src/lib/agents/__tests__/failure-reason.test.ts — Classifier
|
|
98
|
+
src/app/api/schedules/__tests__/execute-route.test.ts — 429 + force bypass
|
|
99
|
+
|
|
100
|
+
.claude/skills/architect/references/tdr-atomic-slot-claim.md
|
|
101
|
+
.claude/skills/architect/references/tdr-evidence-based-cap.md
|
|
102
|
+
.claude/skills/architect/references/tdr-failure-class-streaks.md
|
|
103
|
+
.claude/skills/architect/references/tdr-manual-honors-cap.md
|
|
104
|
+
.claude/skills/architect/references/tdr-lock-holders-leased.md
|
|
105
|
+
.claude/skills/architect/references/tdr-chat-shares-event-loop.md
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
**Modified files:**
|
|
109
|
+
|
|
110
|
+
```
|
|
111
|
+
src/lib/db/schema.ts — New columns on tasks + schedules, new table
|
|
112
|
+
src/lib/db/bootstrap.ts — CREATE TABLE + addColumnIfMissing calls
|
|
113
|
+
src/lib/data/clear.ts — Delete new table in FK-safe order
|
|
114
|
+
src/lib/constants/settings.ts — SCHEDULE_MAX_CONCURRENT etc.
|
|
115
|
+
src/lib/schedules/scheduler.ts — Wire slot claim + reaper + chat pressure + metrics
|
|
116
|
+
src/lib/agents/claude-agent.ts — Override maxTurns from tasks.maxTurns; write failure_reason
|
|
117
|
+
src/lib/chat/engine.ts — Register/unregister in activeChatStreams
|
|
118
|
+
src/app/api/schedules/route.ts — Attach collision warnings in POST response
|
|
119
|
+
src/app/api/schedules/[id]/route.ts — Attach collision warnings in PUT response
|
|
120
|
+
src/components/schedules/schedule-form.tsx — New "Max agent steps" field, tooltip, calibration hint
|
|
121
|
+
src/components/schedules/schedule-create-sheet.tsx — Render collision warning banner
|
|
122
|
+
src/components/schedules/schedule-edit-sheet.tsx — Render collision warning banner
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
---
|
|
126
|
+
|
|
127
|
+
## Task 1: Add schema columns + new firing-metrics table
|
|
128
|
+
|
|
129
|
+
**Files:**
|
|
130
|
+
- Modify: `src/lib/db/schema.ts`
|
|
131
|
+
- Modify: `src/lib/db/bootstrap.ts`
|
|
132
|
+
- Modify: `src/lib/data/clear.ts`
|
|
133
|
+
|
|
134
|
+
- [ ] **Step 1.1: Add Drizzle schema definitions**
|
|
135
|
+
|
|
136
|
+
Edit `src/lib/db/schema.ts`. Inside the `tasks` table definition (around line 16-53), add these columns before `createdAt`:
|
|
137
|
+
|
|
138
|
+
```typescript
|
|
139
|
+
/** When the slot for this task was atomically claimed */
|
|
140
|
+
slotClaimedAt: integer("slot_claimed_at", { mode: "timestamp" }),
|
|
141
|
+
/** Wall-clock expiry; reaper aborts tasks whose lease has passed */
|
|
142
|
+
leaseExpiresAt: integer("lease_expires_at", { mode: "timestamp" }),
|
|
143
|
+
/** Explicit terminal-state reason written by the runtime adapter */
|
|
144
|
+
failureReason: text("failure_reason"),
|
|
145
|
+
/** Per-task turn budget copied from schedules.maxTurns at firing time */
|
|
146
|
+
maxTurns: integer("max_turns"),
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
Add a new index at the end of the `tasks` table definition's index array:
|
|
150
|
+
|
|
151
|
+
```typescript
|
|
152
|
+
index("idx_tasks_running_scheduled").on(table.status, table.sourceType, table.leaseExpiresAt),
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
Inside the `schedules` table definition (around line 165-228), add these columns before `createdAt`:
|
|
156
|
+
|
|
157
|
+
```typescript
|
|
158
|
+
/** Hard cap on turns per firing; NULL inherits the global MAX_TURNS setting */
|
|
159
|
+
maxTurns: integer("max_turns"),
|
|
160
|
+
/** Timestamp when maxTurns was last edited — drives first-breach grace */
|
|
161
|
+
maxTurnsSetAt: integer("max_turns_set_at", { mode: "timestamp" }),
|
|
162
|
+
/** Wall-clock lease override in seconds; NULL inherits global default (1200s) */
|
|
163
|
+
maxRunDurationSec: integer("max_run_duration_sec"),
|
|
164
|
+
/** Counter separate from failureStreak — only increments on maxTurns breach */
|
|
165
|
+
turnBudgetBreachStreak: integer("turn_budget_breach_streak").default(0).notNull(),
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
Append a new table definition at the bottom of `schema.ts`, before the `export type` block:
|
|
169
|
+
|
|
170
|
+
```typescript
|
|
171
|
+
export const scheduleFiringMetrics = sqliteTable(
|
|
172
|
+
"schedule_firing_metrics",
|
|
173
|
+
{
|
|
174
|
+
id: text("id").primaryKey(),
|
|
175
|
+
scheduleId: text("schedule_id")
|
|
176
|
+
.references(() => schedules.id)
|
|
177
|
+
.notNull(),
|
|
178
|
+
taskId: text("task_id").references(() => tasks.id),
|
|
179
|
+
firedAt: integer("fired_at", { mode: "timestamp" }).notNull(),
|
|
180
|
+
slotClaimedAt: integer("slot_claimed_at", { mode: "timestamp" }),
|
|
181
|
+
completedAt: integer("completed_at", { mode: "timestamp" }),
|
|
182
|
+
slotWaitMs: integer("slot_wait_ms"),
|
|
183
|
+
durationMs: integer("duration_ms"),
|
|
184
|
+
turnCount: integer("turn_count"),
|
|
185
|
+
maxTurnsAtFiring: integer("max_turns_at_firing"),
|
|
186
|
+
eventLoopLagMs: real("event_loop_lag_ms"),
|
|
187
|
+
peakRssMb: integer("peak_rss_mb"),
|
|
188
|
+
chatStreamsActive: integer("chat_streams_active"),
|
|
189
|
+
concurrentSchedules: integer("concurrent_schedules"),
|
|
190
|
+
failureReason: text("failure_reason"),
|
|
191
|
+
},
|
|
192
|
+
(table) => [
|
|
193
|
+
index("idx_sfm_schedule_time").on(table.scheduleId, table.firedAt),
|
|
194
|
+
]
|
|
195
|
+
);
|
|
196
|
+
|
|
197
|
+
export type ScheduleFiringMetricRow = InferSelectModel<typeof scheduleFiringMetrics>;
|
|
198
|
+
```
|
|
199
|
+
|
|
200
|
+
- [ ] **Step 1.2: Add bootstrap CREATE TABLE + addColumnIfMissing calls**
|
|
201
|
+
|
|
202
|
+
Edit `src/lib/db/bootstrap.ts`. Inside the `STAGENT_TABLES` const (around line 4-51), append `"schedule_firing_metrics"` to the array.
|
|
203
|
+
|
|
204
|
+
Inside `bootstrapStagentDatabase()`, after the `schedules` CREATE TABLE (around line 190), add a new `CREATE TABLE IF NOT EXISTS schedule_firing_metrics (...)` with columns matching the Drizzle schema above. Also add `CREATE INDEX IF NOT EXISTS idx_sfm_schedule_time ON schedule_firing_metrics(schedule_id, fired_at);`.
|
|
205
|
+
|
|
206
|
+
At the end of the `addColumnIfMissing` call block (around line 558), add:
|
|
207
|
+
|
|
208
|
+
```typescript
|
|
209
|
+
addColumnIfMissing(`ALTER TABLE tasks ADD COLUMN slot_claimed_at INTEGER;`);
|
|
210
|
+
addColumnIfMissing(`ALTER TABLE tasks ADD COLUMN lease_expires_at INTEGER;`);
|
|
211
|
+
addColumnIfMissing(`ALTER TABLE tasks ADD COLUMN failure_reason TEXT;`);
|
|
212
|
+
addColumnIfMissing(`ALTER TABLE tasks ADD COLUMN max_turns INTEGER;`);
|
|
213
|
+
addColumnIfMissing(`ALTER TABLE schedules ADD COLUMN max_turns INTEGER;`);
|
|
214
|
+
addColumnIfMissing(`ALTER TABLE schedules ADD COLUMN max_turns_set_at INTEGER;`);
|
|
215
|
+
addColumnIfMissing(`ALTER TABLE schedules ADD COLUMN max_run_duration_sec INTEGER;`);
|
|
216
|
+
addColumnIfMissing(`ALTER TABLE schedules ADD COLUMN turn_budget_breach_streak INTEGER DEFAULT 0 NOT NULL;`);
|
|
217
|
+
```
|
|
218
|
+
|
|
219
|
+
Also add an index creation line (using the existing sqlite handle):
|
|
220
|
+
|
|
221
|
+
```typescript
|
|
222
|
+
sqlite.exec(`CREATE INDEX IF NOT EXISTS idx_tasks_running_scheduled ON tasks(status, source_type, lease_expires_at);`);
|
|
223
|
+
```
|
|
224
|
+
|
|
225
|
+
- [ ] **Step 1.3: Add firing-metrics delete to clear.ts (FK-safe order)**
|
|
226
|
+
|
|
227
|
+
Edit `src/lib/data/clear.ts`. Add `scheduleFiringMetrics` to the imports from `@/lib/db/schema`. Add the delete call BEFORE the existing `schedulesDeleted = db.delete(schedules)...` line, because it references `schedules`:
|
|
228
|
+
|
|
229
|
+
```typescript
|
|
230
|
+
const scheduleFiringMetricsDeleted = db.delete(scheduleFiringMetrics).run().changes;
|
|
231
|
+
const schedulesDeleted = db.delete(schedules).run().changes;
|
|
232
|
+
```
|
|
233
|
+
|
|
234
|
+
Include the count in the returned object at the end of `clearAllData`:
|
|
235
|
+
|
|
236
|
+
```typescript
|
|
237
|
+
return {
|
|
238
|
+
// ... existing keys ...
|
|
239
|
+
scheduleFiringMetrics: scheduleFiringMetricsDeleted,
|
|
240
|
+
};
|
|
241
|
+
```
|
|
242
|
+
|
|
243
|
+
- [ ] **Step 1.4: Run the clear.ts safety-net test**
|
|
244
|
+
|
|
245
|
+
Run: `npx vitest run src/lib/data/__tests__/clear.test.ts`
|
|
246
|
+
Expected: PASS. The safety-net test verifies every schema table has a `db.delete()` call. If it fails, you forgot to add `scheduleFiringMetrics` to clear.ts.
|
|
247
|
+
|
|
248
|
+
- [ ] **Step 1.5: Run full test suite**
|
|
249
|
+
|
|
250
|
+
Run: `npx vitest run`
|
|
251
|
+
Expected: PASS — all existing tests still pass. New columns are nullable so no existing seeds break.
|
|
252
|
+
|
|
253
|
+
- [ ] **Step 1.6: Commit**
|
|
254
|
+
|
|
255
|
+
```bash
|
|
256
|
+
git add src/lib/db/schema.ts src/lib/db/bootstrap.ts src/lib/data/clear.ts
|
|
257
|
+
git commit -m "feat(schedules): add schema columns + firing metrics table"
|
|
258
|
+
```
|
|
259
|
+
|
|
260
|
+
---
|
|
261
|
+
|
|
262
|
+
## Task 2: Settings keys + config reader helpers
|
|
263
|
+
|
|
264
|
+
**Files:**
|
|
265
|
+
- Modify: `src/lib/constants/settings.ts`
|
|
266
|
+
- Create: `src/lib/schedules/config.ts`
|
|
267
|
+
- Test: `src/lib/schedules/__tests__/config.test.ts`
|
|
268
|
+
|
|
269
|
+
- [ ] **Step 2.1: Write failing config reader tests**
|
|
270
|
+
|
|
271
|
+
Create `src/lib/schedules/__tests__/config.test.ts`:
|
|
272
|
+
|
|
273
|
+
```typescript
|
|
274
|
+
import { describe, it, expect, beforeEach } from "vitest";
|
|
275
|
+
import { db } from "@/lib/db";
|
|
276
|
+
import { settings } from "@/lib/db/schema";
|
|
277
|
+
import { eq } from "drizzle-orm";
|
|
278
|
+
import {
|
|
279
|
+
getScheduleMaxConcurrent,
|
|
280
|
+
getScheduleMaxRunDurationSec,
|
|
281
|
+
getScheduleChatPressureDelaySec,
|
|
282
|
+
} from "../config";
|
|
283
|
+
|
|
284
|
+
describe("schedule config", () => {
|
|
285
|
+
beforeEach(() => {
|
|
286
|
+
db.delete(settings).where(eq(settings.key, "schedule.maxConcurrent")).run();
|
|
287
|
+
db.delete(settings).where(eq(settings.key, "schedule.maxRunDurationSec")).run();
|
|
288
|
+
db.delete(settings).where(eq(settings.key, "schedule.chatPressureDelaySec")).run();
|
|
289
|
+
});
|
|
290
|
+
|
|
291
|
+
it("returns default max concurrent of 2 when setting is absent", () => {
|
|
292
|
+
expect(getScheduleMaxConcurrent()).toBe(2);
|
|
293
|
+
});
|
|
294
|
+
|
|
295
|
+
it("reads max concurrent from settings when set", () => {
|
|
296
|
+
db.insert(settings)
|
|
297
|
+
.values({
|
|
298
|
+
key: "schedule.maxConcurrent",
|
|
299
|
+
value: "3",
|
|
300
|
+
updatedAt: new Date(),
|
|
301
|
+
})
|
|
302
|
+
.run();
|
|
303
|
+
expect(getScheduleMaxConcurrent()).toBe(3);
|
|
304
|
+
});
|
|
305
|
+
|
|
306
|
+
it("reads max concurrent from SCHEDULE_MAX_CONCURRENT env var", () => {
|
|
307
|
+
const original = process.env.SCHEDULE_MAX_CONCURRENT;
|
|
308
|
+
process.env.SCHEDULE_MAX_CONCURRENT = "5";
|
|
309
|
+
try {
|
|
310
|
+
expect(getScheduleMaxConcurrent()).toBe(5);
|
|
311
|
+
} finally {
|
|
312
|
+
if (original === undefined) delete process.env.SCHEDULE_MAX_CONCURRENT;
|
|
313
|
+
else process.env.SCHEDULE_MAX_CONCURRENT = original;
|
|
314
|
+
}
|
|
315
|
+
});
|
|
316
|
+
|
|
317
|
+
it("falls back to default when env var is NaN", () => {
|
|
318
|
+
const original = process.env.SCHEDULE_MAX_CONCURRENT;
|
|
319
|
+
process.env.SCHEDULE_MAX_CONCURRENT = "abc";
|
|
320
|
+
try {
|
|
321
|
+
expect(getScheduleMaxConcurrent()).toBe(2);
|
|
322
|
+
} finally {
|
|
323
|
+
if (original === undefined) delete process.env.SCHEDULE_MAX_CONCURRENT;
|
|
324
|
+
else process.env.SCHEDULE_MAX_CONCURRENT = original;
|
|
325
|
+
}
|
|
326
|
+
});
|
|
327
|
+
|
|
328
|
+
it("returns default max run duration of 1200s", () => {
|
|
329
|
+
expect(getScheduleMaxRunDurationSec()).toBe(1200);
|
|
330
|
+
});
|
|
331
|
+
|
|
332
|
+
it("returns default chat pressure delay of 30s", () => {
|
|
333
|
+
expect(getScheduleChatPressureDelaySec()).toBe(30);
|
|
334
|
+
});
|
|
335
|
+
});
|
|
336
|
+
```
|
|
337
|
+
|
|
338
|
+
- [ ] **Step 2.2: Run to verify RED**
|
|
339
|
+
|
|
340
|
+
Run: `npx vitest run src/lib/schedules/__tests__/config.test.ts`
|
|
341
|
+
Expected: FAIL — module `../config` does not exist.
|
|
342
|
+
|
|
343
|
+
- [ ] **Step 2.3: Add settings keys**
|
|
344
|
+
|
|
345
|
+
Edit `src/lib/constants/settings.ts`. Add inside the `SETTINGS_KEYS` const:
|
|
346
|
+
|
|
347
|
+
```typescript
|
|
348
|
+
SCHEDULE_MAX_CONCURRENT: "schedule.maxConcurrent",
|
|
349
|
+
SCHEDULE_MAX_RUN_DURATION_SEC: "schedule.maxRunDurationSec",
|
|
350
|
+
SCHEDULE_CHAT_PRESSURE_DELAY_SEC: "schedule.chatPressureDelaySec",
|
|
351
|
+
```
|
|
352
|
+
|
|
353
|
+
- [ ] **Step 2.4: Implement config helpers**
|
|
354
|
+
|
|
355
|
+
Create `src/lib/schedules/config.ts`:
|
|
356
|
+
|
|
357
|
+
```typescript
|
|
358
|
+
import { getSettingSync } from "@/lib/settings/helpers";
|
|
359
|
+
import { SETTINGS_KEYS } from "@/lib/constants/settings";
|
|
360
|
+
|
|
361
|
+
const DEFAULT_MAX_CONCURRENT = 2;
|
|
362
|
+
const DEFAULT_MAX_RUN_DURATION_SEC = 1200; // 20 minutes
|
|
363
|
+
const DEFAULT_CHAT_PRESSURE_DELAY_SEC = 30;
|
|
364
|
+
|
|
365
|
+
function readIntConfig(
|
|
366
|
+
envVar: string,
|
|
367
|
+
settingKey: string,
|
|
368
|
+
defaultValue: number,
|
|
369
|
+
): number {
|
|
370
|
+
const envRaw = process.env[envVar];
|
|
371
|
+
if (envRaw !== undefined) {
|
|
372
|
+
const parsed = parseInt(envRaw, 10);
|
|
373
|
+
if (Number.isFinite(parsed) && parsed > 0) return parsed;
|
|
374
|
+
console.warn(
|
|
375
|
+
`[schedule-config] ${envVar}="${envRaw}" is not a positive integer; using default ${defaultValue}`,
|
|
376
|
+
);
|
|
377
|
+
}
|
|
378
|
+
|
|
379
|
+
const settingRaw = getSettingSync(settingKey);
|
|
380
|
+
if (settingRaw !== null) {
|
|
381
|
+
const parsed = parseInt(settingRaw, 10);
|
|
382
|
+
if (Number.isFinite(parsed) && parsed > 0) return parsed;
|
|
383
|
+
}
|
|
384
|
+
|
|
385
|
+
return defaultValue;
|
|
386
|
+
}
|
|
387
|
+
|
|
388
|
+
export function getScheduleMaxConcurrent(): number {
|
|
389
|
+
return readIntConfig(
|
|
390
|
+
"SCHEDULE_MAX_CONCURRENT",
|
|
391
|
+
SETTINGS_KEYS.SCHEDULE_MAX_CONCURRENT,
|
|
392
|
+
DEFAULT_MAX_CONCURRENT,
|
|
393
|
+
);
|
|
394
|
+
}
|
|
395
|
+
|
|
396
|
+
export function getScheduleMaxRunDurationSec(): number {
|
|
397
|
+
return readIntConfig(
|
|
398
|
+
"SCHEDULE_MAX_RUN_DURATION_SEC",
|
|
399
|
+
SETTINGS_KEYS.SCHEDULE_MAX_RUN_DURATION_SEC,
|
|
400
|
+
DEFAULT_MAX_RUN_DURATION_SEC,
|
|
401
|
+
);
|
|
402
|
+
}
|
|
403
|
+
|
|
404
|
+
export function getScheduleChatPressureDelaySec(): number {
|
|
405
|
+
return readIntConfig(
|
|
406
|
+
"SCHEDULE_CHAT_PRESSURE_DELAY_SEC",
|
|
407
|
+
SETTINGS_KEYS.SCHEDULE_CHAT_PRESSURE_DELAY_SEC,
|
|
408
|
+
DEFAULT_CHAT_PRESSURE_DELAY_SEC,
|
|
409
|
+
);
|
|
410
|
+
}
|
|
411
|
+
```
|
|
412
|
+
|
|
413
|
+
- [ ] **Step 2.5: Run to verify GREEN**
|
|
414
|
+
|
|
415
|
+
Run: `npx vitest run src/lib/schedules/__tests__/config.test.ts`
|
|
416
|
+
Expected: PASS — 6 tests pass.
|
|
417
|
+
|
|
418
|
+
- [ ] **Step 2.6: Commit**
|
|
419
|
+
|
|
420
|
+
```bash
|
|
421
|
+
git add src/lib/constants/settings.ts src/lib/schedules/config.ts src/lib/schedules/__tests__/config.test.ts
|
|
422
|
+
git commit -m "feat(schedules): add concurrency + lease + chat-pressure config readers"
|
|
423
|
+
```
|
|
424
|
+
|
|
425
|
+
---
|
|
426
|
+
|
|
427
|
+
## Task 3: Atomic slot claim primitive
|
|
428
|
+
|
|
429
|
+
**Files:**
|
|
430
|
+
- Create: `src/lib/schedules/slot-claim.ts`
|
|
431
|
+
- Test: `src/lib/schedules/__tests__/slot-claim.test.ts`
|
|
432
|
+
|
|
433
|
+
This is the load-bearing primitive. The atomic claim MUST be a single SQL statement — check-then-act is forbidden because `tickScheduler()` and `drainQueue()` run concurrently via the `.then()` chain at scheduler.ts:420.
|
|
434
|
+
|
|
435
|
+
- [ ] **Step 3.1: Write failing tests**
|
|
436
|
+
|
|
437
|
+
Create `src/lib/schedules/__tests__/slot-claim.test.ts`:
|
|
438
|
+
|
|
439
|
+
```typescript
|
|
440
|
+
import { describe, it, expect, beforeEach } from "vitest";
|
|
441
|
+
import { db } from "@/lib/db";
|
|
442
|
+
import { tasks, schedules, projects, settings } from "@/lib/db/schema";
|
|
443
|
+
import { eq } from "drizzle-orm";
|
|
444
|
+
import { randomUUID } from "crypto";
|
|
445
|
+
import { claimSlot, countRunningScheduledSlots } from "../slot-claim";
|
|
446
|
+
|
|
447
|
+
function seedProject(): string {
|
|
448
|
+
const id = randomUUID();
|
|
449
|
+
const now = new Date();
|
|
450
|
+
db.insert(projects)
|
|
451
|
+
.values({ id, name: "test", status: "active", createdAt: now, updatedAt: now })
|
|
452
|
+
.run();
|
|
453
|
+
return id;
|
|
454
|
+
}
|
|
455
|
+
|
|
456
|
+
function seedSchedule(projectId: string): string {
|
|
457
|
+
const id = randomUUID();
|
|
458
|
+
const now = new Date();
|
|
459
|
+
db.insert(schedules)
|
|
460
|
+
.values({
|
|
461
|
+
id,
|
|
462
|
+
projectId,
|
|
463
|
+
name: `sched-${id.slice(0, 4)}`,
|
|
464
|
+
prompt: "test",
|
|
465
|
+
cronExpression: "* * * * *",
|
|
466
|
+
status: "active",
|
|
467
|
+
type: "scheduled",
|
|
468
|
+
firingCount: 0,
|
|
469
|
+
suppressionCount: 0,
|
|
470
|
+
heartbeatSpentToday: 0,
|
|
471
|
+
failureStreak: 0,
|
|
472
|
+
turnBudgetBreachStreak: 0,
|
|
473
|
+
createdAt: now,
|
|
474
|
+
updatedAt: now,
|
|
475
|
+
})
|
|
476
|
+
.run();
|
|
477
|
+
return id;
|
|
478
|
+
}
|
|
479
|
+
|
|
480
|
+
function seedQueuedTask(scheduleId: string): string {
|
|
481
|
+
const id = randomUUID();
|
|
482
|
+
const now = new Date();
|
|
483
|
+
db.insert(tasks)
|
|
484
|
+
.values({
|
|
485
|
+
id,
|
|
486
|
+
scheduleId,
|
|
487
|
+
title: "test firing",
|
|
488
|
+
status: "queued",
|
|
489
|
+
priority: 2,
|
|
490
|
+
sourceType: "scheduled",
|
|
491
|
+
resumeCount: 0,
|
|
492
|
+
createdAt: now,
|
|
493
|
+
updatedAt: now,
|
|
494
|
+
})
|
|
495
|
+
.run();
|
|
496
|
+
return id;
|
|
497
|
+
}
|
|
498
|
+
|
|
499
|
+
describe("claimSlot", () => {
|
|
500
|
+
beforeEach(() => {
|
|
501
|
+
db.delete(tasks).run();
|
|
502
|
+
db.delete(schedules).run();
|
|
503
|
+
db.delete(projects).run();
|
|
504
|
+
db.delete(settings).where(eq(settings.key, "schedule.maxConcurrent")).run();
|
|
505
|
+
});
|
|
506
|
+
|
|
507
|
+
it("claims a slot when capacity available, transitioning queued→running", () => {
|
|
508
|
+
const pid = seedProject();
|
|
509
|
+
const sid = seedSchedule(pid);
|
|
510
|
+
const tid = seedQueuedTask(sid);
|
|
511
|
+
|
|
512
|
+
const result = claimSlot(tid, 2, 1200);
|
|
513
|
+
|
|
514
|
+
expect(result.claimed).toBe(true);
|
|
515
|
+
const row = db.select().from(tasks).where(eq(tasks.id, tid)).get();
|
|
516
|
+
expect(row?.status).toBe("running");
|
|
517
|
+
expect(row?.slotClaimedAt).not.toBeNull();
|
|
518
|
+
expect(row?.leaseExpiresAt).not.toBeNull();
|
|
519
|
+
});
|
|
520
|
+
|
|
521
|
+
it("refuses to claim when cap=0", () => {
|
|
522
|
+
const pid = seedProject();
|
|
523
|
+
const sid = seedSchedule(pid);
|
|
524
|
+
const tid = seedQueuedTask(sid);
|
|
525
|
+
|
|
526
|
+
const result = claimSlot(tid, 0, 1200);
|
|
527
|
+
|
|
528
|
+
expect(result.claimed).toBe(false);
|
|
529
|
+
const row = db.select().from(tasks).where(eq(tasks.id, tid)).get();
|
|
530
|
+
expect(row?.status).toBe("queued");
|
|
531
|
+
});
|
|
532
|
+
|
|
533
|
+
it("refuses when cap already full", () => {
|
|
534
|
+
const pid = seedProject();
|
|
535
|
+
const sid1 = seedSchedule(pid);
|
|
536
|
+
const sid2 = seedSchedule(pid);
|
|
537
|
+
const tid1 = seedQueuedTask(sid1);
|
|
538
|
+
const tid2 = seedQueuedTask(sid2);
|
|
539
|
+
|
|
540
|
+
expect(claimSlot(tid1, 1, 1200).claimed).toBe(true);
|
|
541
|
+
expect(claimSlot(tid2, 1, 1200).claimed).toBe(false);
|
|
542
|
+
|
|
543
|
+
const row2 = db.select().from(tasks).where(eq(tasks.id, tid2)).get();
|
|
544
|
+
expect(row2?.status).toBe("queued");
|
|
545
|
+
});
|
|
546
|
+
|
|
547
|
+
it("two concurrent claim attempts for the same task yield exactly one winner", () => {
|
|
548
|
+
const pid = seedProject();
|
|
549
|
+
const sid = seedSchedule(pid);
|
|
550
|
+
const tid = seedQueuedTask(sid);
|
|
551
|
+
|
|
552
|
+
const first = claimSlot(tid, 10, 1200);
|
|
553
|
+
const second = claimSlot(tid, 10, 1200);
|
|
554
|
+
|
|
555
|
+
expect(first.claimed).toBe(true);
|
|
556
|
+
expect(second.claimed).toBe(false); // task already running, can't re-claim
|
|
557
|
+
});
|
|
558
|
+
|
|
559
|
+
it("respects cap across multiple tasks from different schedules", () => {
|
|
560
|
+
const pid = seedProject();
|
|
561
|
+
const tids: string[] = [];
|
|
562
|
+
for (let i = 0; i < 5; i++) {
|
|
563
|
+
const sid = seedSchedule(pid);
|
|
564
|
+
tids.push(seedQueuedTask(sid));
|
|
565
|
+
}
|
|
566
|
+
|
|
567
|
+
// Cap of 3 → first 3 claim, last 2 fail
|
|
568
|
+
const results = tids.map((tid) => claimSlot(tid, 3, 1200));
|
|
569
|
+
expect(results.filter((r) => r.claimed).length).toBe(3);
|
|
570
|
+
expect(results.filter((r) => !r.claimed).length).toBe(2);
|
|
571
|
+
|
|
572
|
+
expect(countRunningScheduledSlots()).toBe(3);
|
|
573
|
+
});
|
|
574
|
+
|
|
575
|
+
it("countRunningScheduledSlots ignores non-scheduled tasks", () => {
|
|
576
|
+
const pid = seedProject();
|
|
577
|
+
const sid = seedSchedule(pid);
|
|
578
|
+
const schedTid = seedQueuedTask(sid);
|
|
579
|
+
claimSlot(schedTid, 10, 1200);
|
|
580
|
+
|
|
581
|
+
// Insert a manual running task — must not count against scheduled cap
|
|
582
|
+
const manualId = randomUUID();
|
|
583
|
+
const now = new Date();
|
|
584
|
+
db.insert(tasks)
|
|
585
|
+
.values({
|
|
586
|
+
id: manualId,
|
|
587
|
+
title: "manual",
|
|
588
|
+
status: "running",
|
|
589
|
+
priority: 2,
|
|
590
|
+
sourceType: "manual",
|
|
591
|
+
resumeCount: 0,
|
|
592
|
+
createdAt: now,
|
|
593
|
+
updatedAt: now,
|
|
594
|
+
})
|
|
595
|
+
.run();
|
|
596
|
+
|
|
597
|
+
expect(countRunningScheduledSlots()).toBe(1);
|
|
598
|
+
});
|
|
599
|
+
|
|
600
|
+
it("writes leaseExpiresAt = slotClaimedAt + leaseSec", () => {
|
|
601
|
+
const pid = seedProject();
|
|
602
|
+
const sid = seedSchedule(pid);
|
|
603
|
+
const tid = seedQueuedTask(sid);
|
|
604
|
+
|
|
605
|
+
const before = Date.now();
|
|
606
|
+
claimSlot(tid, 10, 60);
|
|
607
|
+
const row = db.select().from(tasks).where(eq(tasks.id, tid)).get();
|
|
608
|
+
|
|
609
|
+
expect(row?.slotClaimedAt?.getTime()).toBeGreaterThanOrEqual(before);
|
|
610
|
+
expect(
|
|
611
|
+
row!.leaseExpiresAt!.getTime() - row!.slotClaimedAt!.getTime(),
|
|
612
|
+
).toBe(60 * 1000);
|
|
613
|
+
});
|
|
614
|
+
});
|
|
615
|
+
```
|
|
616
|
+
|
|
617
|
+
- [ ] **Step 3.2: Run to verify RED**
|
|
618
|
+
|
|
619
|
+
Run: `npx vitest run src/lib/schedules/__tests__/slot-claim.test.ts`
|
|
620
|
+
Expected: FAIL — module `../slot-claim` does not exist.
|
|
621
|
+
|
|
622
|
+
- [ ] **Step 3.3: Implement slot-claim.ts**
|
|
623
|
+
|
|
624
|
+
Create `src/lib/schedules/slot-claim.ts`:
|
|
625
|
+
|
|
626
|
+
```typescript
|
|
627
|
+
import { sqlite } from "@/lib/db";
|
|
628
|
+
|
|
629
|
+
export interface ClaimResult {
|
|
630
|
+
claimed: boolean;
|
|
631
|
+
}
|
|
632
|
+
|
|
633
|
+
/**
|
|
634
|
+
* Atomic slot claim: transitions a queued scheduled task to running IFF the
|
|
635
|
+
* global cap of concurrent running scheduled tasks is not exceeded.
|
|
636
|
+
*
|
|
637
|
+
* Must be a single SQL statement — check-then-act would race between the
|
|
638
|
+
* scheduler tick loop and the drain loop that scheduler.ts currently dispatches
|
|
639
|
+
* concurrently. Using a subquery inside the WHERE clause guarantees SQLite
|
|
640
|
+
* serializes the count and update under its write lock, so two concurrent
|
|
641
|
+
* claim attempts cannot both succeed against the same cap.
|
|
642
|
+
*
|
|
643
|
+
* Returns `{ claimed: true }` when the task transitioned; `{ claimed: false }`
|
|
644
|
+
* when either (a) the task is no longer in queued state (already claimed) or
|
|
645
|
+
* (b) the global cap is full.
|
|
646
|
+
*/
|
|
647
|
+
export function claimSlot(
|
|
648
|
+
taskId: string,
|
|
649
|
+
cap: number,
|
|
650
|
+
leaseSec: number,
|
|
651
|
+
): ClaimResult {
|
|
652
|
+
const now = Date.now();
|
|
653
|
+
const leaseExpires = now + leaseSec * 1000;
|
|
654
|
+
|
|
655
|
+
const stmt = sqlite.prepare(
|
|
656
|
+
"UPDATE tasks SET status = 'running', slot_claimed_at = ?, lease_expires_at = ?, updated_at = ? WHERE id = ? AND status = 'queued' AND source_type IN ('scheduled', 'heartbeat') AND (SELECT COUNT(*) FROM tasks WHERE status = 'running' AND source_type IN ('scheduled', 'heartbeat')) < ?",
|
|
657
|
+
);
|
|
658
|
+
|
|
659
|
+
const result = stmt.run(now, leaseExpires, now, taskId, cap);
|
|
660
|
+
return { claimed: result.changes === 1 };
|
|
661
|
+
}
|
|
662
|
+
|
|
663
|
+
/**
|
|
664
|
+
* Count currently running scheduled/heartbeat tasks — used by the drain loop,
|
|
665
|
+
* manual-execute endpoint, and telemetry.
|
|
666
|
+
*/
|
|
667
|
+
export function countRunningScheduledSlots(): number {
|
|
668
|
+
const row = sqlite
|
|
669
|
+
.prepare(
|
|
670
|
+
"SELECT COUNT(*) AS n FROM tasks WHERE status = 'running' AND source_type IN ('scheduled', 'heartbeat')",
|
|
671
|
+
)
|
|
672
|
+
.get() as { n: number } | undefined;
|
|
673
|
+
return row?.n ?? 0;
|
|
674
|
+
}
|
|
675
|
+
```
|
|
676
|
+
|
|
677
|
+
- [ ] **Step 3.4: Run to verify GREEN**
|
|
678
|
+
|
|
679
|
+
Run: `npx vitest run src/lib/schedules/__tests__/slot-claim.test.ts`
|
|
680
|
+
Expected: PASS — 7 tests pass.
|
|
681
|
+
|
|
682
|
+
- [ ] **Step 3.5: Commit**
|
|
683
|
+
|
|
684
|
+
```bash
|
|
685
|
+
git add src/lib/schedules/slot-claim.ts src/lib/schedules/__tests__/slot-claim.test.ts
|
|
686
|
+
git commit -m "feat(schedules): atomic slot claim primitive with race coverage"
|
|
687
|
+
```
|
|
688
|
+
|
|
689
|
+
---
|
|
690
|
+
|
|
691
|
+
## Task 4: Lease reaper
|
|
692
|
+
|
|
693
|
+
**Files:**
|
|
694
|
+
- Modify: `src/lib/schedules/slot-claim.ts`
|
|
695
|
+
- Modify: `src/lib/schedules/__tests__/slot-claim.test.ts` (extend)
|
|
696
|
+
|
|
697
|
+
- [ ] **Step 4.1: Append failing reaper tests**
|
|
698
|
+
|
|
699
|
+
Append to `src/lib/schedules/__tests__/slot-claim.test.ts`:
|
|
700
|
+
|
|
701
|
+
```typescript
|
|
702
|
+
import { reapExpiredLeases } from "../slot-claim";
|
|
703
|
+
|
|
704
|
+
describe("reapExpiredLeases", () => {
|
|
705
|
+
beforeEach(() => {
|
|
706
|
+
db.delete(tasks).run();
|
|
707
|
+
db.delete(schedules).run();
|
|
708
|
+
db.delete(projects).run();
|
|
709
|
+
});
|
|
710
|
+
|
|
711
|
+
it("marks an expired running task as failed with failure_reason=lease_expired", () => {
|
|
712
|
+
const pid = seedProject();
|
|
713
|
+
const sid = seedSchedule(pid);
|
|
714
|
+
const tid = seedQueuedTask(sid);
|
|
715
|
+
|
|
716
|
+
// Claim with a 1-second lease, then fast-forward via direct DB edit
|
|
717
|
+
claimSlot(tid, 10, 1);
|
|
718
|
+
const past = new Date(Date.now() - 5000);
|
|
719
|
+
db.update(tasks)
|
|
720
|
+
.set({ leaseExpiresAt: past })
|
|
721
|
+
.where(eq(tasks.id, tid))
|
|
722
|
+
.run();
|
|
723
|
+
|
|
724
|
+
const reaped = reapExpiredLeases();
|
|
725
|
+
|
|
726
|
+
expect(reaped).toEqual([tid]);
|
|
727
|
+
const row = db.select().from(tasks).where(eq(tasks.id, tid)).get();
|
|
728
|
+
expect(row?.status).toBe("failed");
|
|
729
|
+
expect(row?.failureReason).toBe("lease_expired");
|
|
730
|
+
});
|
|
731
|
+
|
|
732
|
+
it("leaves fresh running tasks alone", () => {
|
|
733
|
+
const pid = seedProject();
|
|
734
|
+
const sid = seedSchedule(pid);
|
|
735
|
+
const tid = seedQueuedTask(sid);
|
|
736
|
+
|
|
737
|
+
claimSlot(tid, 10, 3600); // 1-hour lease
|
|
738
|
+
|
|
739
|
+
const reaped = reapExpiredLeases();
|
|
740
|
+
|
|
741
|
+
expect(reaped).toEqual([]);
|
|
742
|
+
const row = db.select().from(tasks).where(eq(tasks.id, tid)).get();
|
|
743
|
+
expect(row?.status).toBe("running");
|
|
744
|
+
});
|
|
745
|
+
|
|
746
|
+
it("reaps multiple expired tasks in one sweep", () => {
|
|
747
|
+
const pid = seedProject();
|
|
748
|
+
const tids: string[] = [];
|
|
749
|
+
for (let i = 0; i < 3; i++) {
|
|
750
|
+
const sid = seedSchedule(pid);
|
|
751
|
+
const tid = seedQueuedTask(sid);
|
|
752
|
+
claimSlot(tid, 10, 1);
|
|
753
|
+
tids.push(tid);
|
|
754
|
+
}
|
|
755
|
+
const past = new Date(Date.now() - 5000);
|
|
756
|
+
for (const tid of tids) {
|
|
757
|
+
db.update(tasks)
|
|
758
|
+
.set({ leaseExpiresAt: past })
|
|
759
|
+
.where(eq(tasks.id, tid))
|
|
760
|
+
.run();
|
|
761
|
+
}
|
|
762
|
+
|
|
763
|
+
const reaped = reapExpiredLeases();
|
|
764
|
+
|
|
765
|
+
expect(reaped.sort()).toEqual([...tids].sort());
|
|
766
|
+
expect(countRunningScheduledSlots()).toBe(0);
|
|
767
|
+
});
|
|
768
|
+
});
|
|
769
|
+
```
|
|
770
|
+
|
|
771
|
+
- [ ] **Step 4.2: Run to verify RED**
|
|
772
|
+
|
|
773
|
+
Run: `npx vitest run src/lib/schedules/__tests__/slot-claim.test.ts`
|
|
774
|
+
Expected: FAIL — `reapExpiredLeases` not exported.
|
|
775
|
+
|
|
776
|
+
- [ ] **Step 4.3: Implement reapExpiredLeases**
|
|
777
|
+
|
|
778
|
+
Append to `src/lib/schedules/slot-claim.ts`:
|
|
779
|
+
|
|
780
|
+
```typescript
|
|
781
|
+
import { getExecution } from "@/lib/agents/execution-manager";
|
|
782
|
+
|
|
783
|
+
/**
|
|
784
|
+
* Reap running scheduled tasks whose lease has expired. For each expired
|
|
785
|
+
* task: (1) abort the in-memory execution via AbortController, (2) mark
|
|
786
|
+
* the DB row as failed with failure_reason='lease_expired'. Returns the
|
|
787
|
+
* list of reaped task IDs for logging.
|
|
788
|
+
*
|
|
789
|
+
* Idempotent — safe to call on every scheduler tick.
|
|
790
|
+
*/
|
|
791
|
+
export function reapExpiredLeases(): string[] {
|
|
792
|
+
const now = Date.now();
|
|
793
|
+
const expiredRows = sqlite
|
|
794
|
+
.prepare(
|
|
795
|
+
"SELECT id FROM tasks WHERE status = 'running' AND source_type IN ('scheduled', 'heartbeat') AND lease_expires_at IS NOT NULL AND lease_expires_at < ?",
|
|
796
|
+
)
|
|
797
|
+
.all(now) as Array<{ id: string }>;
|
|
798
|
+
|
|
799
|
+
const reaped: string[] = [];
|
|
800
|
+
for (const { id } of expiredRows) {
|
|
801
|
+
// Abort the in-process execution so the SDK stops immediately
|
|
802
|
+
const execution = getExecution(id);
|
|
803
|
+
if (execution) {
|
|
804
|
+
try {
|
|
805
|
+
execution.abortController.abort();
|
|
806
|
+
} catch {
|
|
807
|
+
// Already aborted — safe to ignore
|
|
808
|
+
}
|
|
809
|
+
}
|
|
810
|
+
|
|
811
|
+
const updateResult = sqlite
|
|
812
|
+
.prepare(
|
|
813
|
+
"UPDATE tasks SET status = 'failed', failure_reason = 'lease_expired', updated_at = ? WHERE id = ? AND status = 'running'",
|
|
814
|
+
)
|
|
815
|
+
.run(now, id);
|
|
816
|
+
if (updateResult.changes === 1) reaped.push(id);
|
|
817
|
+
}
|
|
818
|
+
|
|
819
|
+
return reaped;
|
|
820
|
+
}
|
|
821
|
+
```
|
|
822
|
+
|
|
823
|
+
- [ ] **Step 4.4: Run to verify GREEN**
|
|
824
|
+
|
|
825
|
+
Run: `npx vitest run src/lib/schedules/__tests__/slot-claim.test.ts`
|
|
826
|
+
Expected: PASS — 10 tests (7 claim + 3 reap).
|
|
827
|
+
|
|
828
|
+
- [ ] **Step 4.5: Commit**
|
|
829
|
+
|
|
830
|
+
```bash
|
|
831
|
+
git add src/lib/schedules/slot-claim.ts src/lib/schedules/__tests__/slot-claim.test.ts
|
|
832
|
+
git commit -m "feat(schedules): lease reaper aborts hung runs via AbortController"
|
|
833
|
+
```
|
|
834
|
+
|
|
835
|
+
---
|
|
836
|
+
|
|
837
|
+
## Task 5: Chat active-streams tracker
|
|
838
|
+
|
|
839
|
+
**Files:**
|
|
840
|
+
- Create: `src/lib/chat/active-streams.ts`
|
|
841
|
+
- Test: `src/lib/chat/__tests__/active-streams.test.ts`
|
|
842
|
+
- Modify: `src/lib/chat/engine.ts` (register/unregister)
|
|
843
|
+
|
|
844
|
+
- [ ] **Step 5.1: Write failing tests**
|
|
845
|
+
|
|
846
|
+
Create `src/lib/chat/__tests__/active-streams.test.ts`:
|
|
847
|
+
|
|
848
|
+
```typescript
|
|
849
|
+
import { describe, it, expect, beforeEach } from "vitest";
|
|
850
|
+
import {
|
|
851
|
+
registerChatStream,
|
|
852
|
+
unregisterChatStream,
|
|
853
|
+
getActiveChatStreamCount,
|
|
854
|
+
isAnyChatStreaming,
|
|
855
|
+
} from "../active-streams";
|
|
856
|
+
|
|
857
|
+
describe("active chat streams", () => {
|
|
858
|
+
beforeEach(() => {
|
|
859
|
+
for (const id of ["a", "b", "c"]) unregisterChatStream(id);
|
|
860
|
+
});
|
|
861
|
+
|
|
862
|
+
it("starts empty", () => {
|
|
863
|
+
expect(getActiveChatStreamCount()).toBe(0);
|
|
864
|
+
expect(isAnyChatStreaming()).toBe(false);
|
|
865
|
+
});
|
|
866
|
+
|
|
867
|
+
it("tracks a single registered stream", () => {
|
|
868
|
+
registerChatStream("a");
|
|
869
|
+
expect(getActiveChatStreamCount()).toBe(1);
|
|
870
|
+
expect(isAnyChatStreaming()).toBe(true);
|
|
871
|
+
});
|
|
872
|
+
|
|
873
|
+
it("tracks multiple streams independently", () => {
|
|
874
|
+
registerChatStream("a");
|
|
875
|
+
registerChatStream("b");
|
|
876
|
+
expect(getActiveChatStreamCount()).toBe(2);
|
|
877
|
+
});
|
|
878
|
+
|
|
879
|
+
it("is idempotent — registering the same id twice still counts as one", () => {
|
|
880
|
+
registerChatStream("a");
|
|
881
|
+
registerChatStream("a");
|
|
882
|
+
expect(getActiveChatStreamCount()).toBe(1);
|
|
883
|
+
});
|
|
884
|
+
|
|
885
|
+
it("unregisters by id", () => {
|
|
886
|
+
registerChatStream("a");
|
|
887
|
+
registerChatStream("b");
|
|
888
|
+
unregisterChatStream("a");
|
|
889
|
+
expect(getActiveChatStreamCount()).toBe(1);
|
|
890
|
+
expect(isAnyChatStreaming()).toBe(true);
|
|
891
|
+
});
|
|
892
|
+
|
|
893
|
+
it("unregistering a non-existent id is a no-op", () => {
|
|
894
|
+
expect(() => unregisterChatStream("never-registered")).not.toThrow();
|
|
895
|
+
expect(getActiveChatStreamCount()).toBe(0);
|
|
896
|
+
});
|
|
897
|
+
});
|
|
898
|
+
```
|
|
899
|
+
|
|
900
|
+
- [ ] **Step 5.2: Run to verify RED**
|
|
901
|
+
|
|
902
|
+
Run: `npx vitest run src/lib/chat/__tests__/active-streams.test.ts`
|
|
903
|
+
Expected: FAIL — module does not exist.
|
|
904
|
+
|
|
905
|
+
- [ ] **Step 5.3: Implement active-streams.ts**
|
|
906
|
+
|
|
907
|
+
Create `src/lib/chat/active-streams.ts`:
|
|
908
|
+
|
|
909
|
+
```typescript
|
|
910
|
+
/**
|
|
911
|
+
* In-memory tracker for chat conversations that currently have an SSE stream
|
|
912
|
+
* in flight. Used by the scheduler tick loop to apply a soft pressure signal
|
|
913
|
+
* — when chat is active, new schedule firings are deferred by N seconds to
|
|
914
|
+
* keep the Node event loop responsive for the user's conversation.
|
|
915
|
+
*
|
|
916
|
+
* Module-level state; single-process (same Node instance as the scheduler).
|
|
917
|
+
* Must NOT be persisted — crash recovery relies on the set starting empty.
|
|
918
|
+
*/
|
|
919
|
+
|
|
920
|
+
const activeStreams = new Set<string>();
|
|
921
|
+
|
|
922
|
+
export function registerChatStream(conversationId: string): void {
|
|
923
|
+
activeStreams.add(conversationId);
|
|
924
|
+
}
|
|
925
|
+
|
|
926
|
+
export function unregisterChatStream(conversationId: string): void {
|
|
927
|
+
activeStreams.delete(conversationId);
|
|
928
|
+
}
|
|
929
|
+
|
|
930
|
+
export function getActiveChatStreamCount(): number {
|
|
931
|
+
return activeStreams.size;
|
|
932
|
+
}
|
|
933
|
+
|
|
934
|
+
export function isAnyChatStreaming(): boolean {
|
|
935
|
+
return activeStreams.size > 0;
|
|
936
|
+
}
|
|
937
|
+
```
|
|
938
|
+
|
|
939
|
+
- [ ] **Step 5.4: Run to verify GREEN**
|
|
940
|
+
|
|
941
|
+
Run: `npx vitest run src/lib/chat/__tests__/active-streams.test.ts`
|
|
942
|
+
Expected: PASS — 6 tests.
|
|
943
|
+
|
|
944
|
+
- [ ] **Step 5.5: Wire engine.ts to register/unregister**
|
|
945
|
+
|
|
946
|
+
Edit `src/lib/chat/engine.ts`. Add import at the top:
|
|
947
|
+
|
|
948
|
+
```typescript
|
|
949
|
+
import { registerChatStream, unregisterChatStream } from "./active-streams";
|
|
950
|
+
```
|
|
951
|
+
|
|
952
|
+
Find the stream start point (just before `yield { type: "status", phase: "connecting" ... }` near line 280) and call:
|
|
953
|
+
|
|
954
|
+
```typescript
|
|
955
|
+
registerChatStream(conversationId);
|
|
956
|
+
```
|
|
957
|
+
|
|
958
|
+
In the top-level `finally` block (the one that already calls `finalizeStreamingMessage()` from Spec B), add `unregisterChatStream(conversationId);` alongside the finalize call so the set is cleared even on abnormal exit.
|
|
959
|
+
|
|
960
|
+
- [ ] **Step 5.6: Run chat tests**
|
|
961
|
+
|
|
962
|
+
Run: `npx vitest run src/lib/chat`
|
|
963
|
+
Expected: PASS — all existing chat tests still pass; new active-streams tests pass.
|
|
964
|
+
|
|
965
|
+
- [ ] **Step 5.7: Commit**
|
|
966
|
+
|
|
967
|
+
```bash
|
|
968
|
+
git add src/lib/chat/active-streams.ts src/lib/chat/__tests__/active-streams.test.ts src/lib/chat/engine.ts
|
|
969
|
+
git commit -m "feat(chat): track active streams for scheduler pressure signal"
|
|
970
|
+
```
|
|
971
|
+
|
|
972
|
+
---
|
|
973
|
+
|
|
974
|
+
## Task 6: Wire slot claim + reaper + chat pressure into tickScheduler
|
|
975
|
+
|
|
976
|
+
**Files:**
|
|
977
|
+
- Modify: `src/lib/schedules/scheduler.ts`
|
|
978
|
+
- Test: `src/lib/schedules/__tests__/tick-scheduler.test.ts`
|
|
979
|
+
|
|
980
|
+
- [ ] **Step 6.1: Write failing tick-scheduler tests**
|
|
981
|
+
|
|
982
|
+
Create `src/lib/schedules/__tests__/tick-scheduler.test.ts`:
|
|
983
|
+
|
|
984
|
+
```typescript
|
|
985
|
+
import { describe, it, expect, beforeEach, vi } from "vitest";
|
|
986
|
+
import { db } from "@/lib/db";
|
|
987
|
+
import { tasks, schedules, projects, settings } from "@/lib/db/schema";
|
|
988
|
+
import { eq } from "drizzle-orm";
|
|
989
|
+
import { randomUUID } from "crypto";
|
|
990
|
+
import { tickScheduler } from "../scheduler";
|
|
991
|
+
import { registerChatStream, unregisterChatStream } from "@/lib/chat/active-streams";
|
|
992
|
+
|
|
993
|
+
// Stub the runtime — we're testing coordination, not the SDK
|
|
994
|
+
vi.mock("@/lib/agents/runtime", () => ({
|
|
995
|
+
executeTaskWithRuntime: vi.fn().mockResolvedValue(undefined),
|
|
996
|
+
}));
|
|
997
|
+
|
|
998
|
+
function seedProject(): string {
|
|
999
|
+
const id = randomUUID();
|
|
1000
|
+
const now = new Date();
|
|
1001
|
+
db.insert(projects)
|
|
1002
|
+
.values({ id, name: "test", status: "active", createdAt: now, updatedAt: now })
|
|
1003
|
+
.run();
|
|
1004
|
+
return id;
|
|
1005
|
+
}
|
|
1006
|
+
|
|
1007
|
+
function seedScheduleDue(projectId: string, nextFireAt: Date): string {
|
|
1008
|
+
const id = randomUUID();
|
|
1009
|
+
const now = new Date();
|
|
1010
|
+
db.insert(schedules)
|
|
1011
|
+
.values({
|
|
1012
|
+
id,
|
|
1013
|
+
projectId,
|
|
1014
|
+
name: `sched-${id.slice(0, 4)}`,
|
|
1015
|
+
prompt: "test prompt",
|
|
1016
|
+
cronExpression: "* * * * *",
|
|
1017
|
+
status: "active",
|
|
1018
|
+
type: "scheduled",
|
|
1019
|
+
firingCount: 0,
|
|
1020
|
+
suppressionCount: 0,
|
|
1021
|
+
heartbeatSpentToday: 0,
|
|
1022
|
+
failureStreak: 0,
|
|
1023
|
+
turnBudgetBreachStreak: 0,
|
|
1024
|
+
nextFireAt,
|
|
1025
|
+
createdAt: now,
|
|
1026
|
+
updatedAt: now,
|
|
1027
|
+
})
|
|
1028
|
+
.run();
|
|
1029
|
+
return id;
|
|
1030
|
+
}
|
|
1031
|
+
|
|
1032
|
+
describe("tickScheduler with concurrency cap", () => {
|
|
1033
|
+
beforeEach(() => {
|
|
1034
|
+
db.delete(tasks).run();
|
|
1035
|
+
db.delete(schedules).run();
|
|
1036
|
+
db.delete(projects).run();
|
|
1037
|
+
db.delete(settings).where(eq(settings.key, "schedule.maxConcurrent")).run();
|
|
1038
|
+
db.insert(settings)
|
|
1039
|
+
.values({ key: "schedule.maxConcurrent", value: "2", updatedAt: new Date() })
|
|
1040
|
+
.run();
|
|
1041
|
+
for (const id of ["x", "y", "z"]) unregisterChatStream(id);
|
|
1042
|
+
});
|
|
1043
|
+
|
|
1044
|
+
it("fires up to cap schedules, queues the rest", async () => {
|
|
1045
|
+
const pid = seedProject();
|
|
1046
|
+
const past = new Date(Date.now() - 10_000);
|
|
1047
|
+
for (let i = 0; i < 5; i++) seedScheduleDue(pid, past);
|
|
1048
|
+
|
|
1049
|
+
await tickScheduler();
|
|
1050
|
+
|
|
1051
|
+
const runningCount = db
|
|
1052
|
+
.select()
|
|
1053
|
+
.from(tasks)
|
|
1054
|
+
.where(eq(tasks.status, "running"))
|
|
1055
|
+
.all().length;
|
|
1056
|
+
const queuedCount = db
|
|
1057
|
+
.select()
|
|
1058
|
+
.from(tasks)
|
|
1059
|
+
.where(eq(tasks.status, "queued"))
|
|
1060
|
+
.all().length;
|
|
1061
|
+
|
|
1062
|
+
expect(runningCount).toBe(2); // cap=2
|
|
1063
|
+
expect(queuedCount).toBe(3); // remaining 3 waiting
|
|
1064
|
+
});
|
|
1065
|
+
|
|
1066
|
+
it("defers new firings when chat is active", async () => {
|
|
1067
|
+
const pid = seedProject();
|
|
1068
|
+
const past = new Date(Date.now() - 10_000);
|
|
1069
|
+
const sid = seedScheduleDue(pid, past);
|
|
1070
|
+
|
|
1071
|
+
registerChatStream("x");
|
|
1072
|
+
|
|
1073
|
+
await tickScheduler();
|
|
1074
|
+
|
|
1075
|
+
// No task should have been created
|
|
1076
|
+
const taskCount = db.select().from(tasks).all().length;
|
|
1077
|
+
expect(taskCount).toBe(0);
|
|
1078
|
+
|
|
1079
|
+
// The schedule's next_fire_at should have been pushed forward ≥25s
|
|
1080
|
+
const row = db.select().from(schedules).where(eq(schedules.id, sid)).get();
|
|
1081
|
+
expect(row?.nextFireAt?.getTime()).toBeGreaterThan(Date.now() + 25 * 1000);
|
|
1082
|
+
|
|
1083
|
+
unregisterChatStream("x");
|
|
1084
|
+
});
|
|
1085
|
+
});
|
|
1086
|
+
```
|
|
1087
|
+
|
|
1088
|
+
- [ ] **Step 6.2: Run to verify RED**
|
|
1089
|
+
|
|
1090
|
+
Run: `npx vitest run src/lib/schedules/__tests__/tick-scheduler.test.ts`
|
|
1091
|
+
Expected: FAIL — neither cap enforcement nor chat-pressure deferral is wired yet: all 5 schedules would fire in the first test, and no deferral happens in the second.
|
|
1092
|
+
|
|
1093
|
+
- [ ] **Step 6.3: Wire reaper + chat pressure + atomic claim into scheduler.ts**
|
|
1094
|
+
|
|
1095
|
+
Edit `src/lib/schedules/scheduler.ts`. Add imports at the top alongside existing imports:
|
|
1096
|
+
|
|
1097
|
+
```typescript
|
|
1098
|
+
import { claimSlot, reapExpiredLeases, countRunningScheduledSlots } from "./slot-claim";
|
|
1099
|
+
import { isAnyChatStreaming } from "@/lib/chat/active-streams";
|
|
1100
|
+
import {
|
|
1101
|
+
getScheduleMaxConcurrent,
|
|
1102
|
+
getScheduleMaxRunDurationSec,
|
|
1103
|
+
getScheduleChatPressureDelaySec,
|
|
1104
|
+
} from "./config";
|
|
1105
|
+
```
|
|
1106
|
+
|
|
1107
|
+
At the top of `tickScheduler()` (around line 221), add the reaper pass:
|
|
1108
|
+
|
|
1109
|
+
```typescript
|
|
1110
|
+
export async function tickScheduler(): Promise<void> {
|
|
1111
|
+
// Reap any running tasks whose lease has expired before claiming new slots.
|
|
1112
|
+
try {
|
|
1113
|
+
const reaped = reapExpiredLeases();
|
|
1114
|
+
if (reaped.length > 0) {
|
|
1115
|
+
console.warn(
|
|
1116
|
+
`[scheduler] reaped ${reaped.length} expired lease(s): ${reaped.join(", ")}`,
|
|
1117
|
+
);
|
|
1118
|
+
}
|
|
1119
|
+
} catch (err) {
|
|
1120
|
+
console.error("[scheduler] lease reaper error:", err);
|
|
1121
|
+
}
|
|
1122
|
+
|
|
1123
|
+
const now = new Date();
|
|
1124
|
+
// ... existing function body continues unchanged ...
|
|
1125
|
+
```
|
|
1126
|
+
|
|
1127
|
+
Right after fetching `dueSchedules` (around line 224-232) and before the `for` loop, add the chat pressure check:
|
|
1128
|
+
|
|
1129
|
+
```typescript
|
|
1130
|
+
// Chat soft pressure: defer new firings by N seconds when a chat stream is
|
|
1131
|
+
// in flight. In-flight scheduled runs are NOT affected — this only gates
|
|
1132
|
+
// new claims.
|
|
1133
|
+
if (isAnyChatStreaming() && dueSchedules.length > 0) {
|
|
1134
|
+
const delayMs = getScheduleChatPressureDelaySec() * 1000;
|
|
1135
|
+
const deferredUntil = new Date(now.getTime() + delayMs);
|
|
1136
|
+
for (const schedule of dueSchedules) {
|
|
1137
|
+
await db
|
|
1138
|
+
.update(schedules)
|
|
1139
|
+
.set({ nextFireAt: deferredUntil, updatedAt: now })
|
|
1140
|
+
.where(eq(schedules.id, schedule.id));
|
|
1141
|
+
}
|
|
1142
|
+
console.log(
|
|
1143
|
+
`[scheduler] chat streaming — deferred ${dueSchedules.length} firings by ${delayMs}ms`,
|
|
1144
|
+
);
|
|
1145
|
+
return;
|
|
1146
|
+
}
|
|
1147
|
+
```
|
|
1148
|
+
|
|
1149
|
+
In `fireSchedule()` (around line 300-445), after the task INSERT and BEFORE the `executeTaskWithRuntime(taskId)` call at line 412, add the atomic claim:
|
|
1150
|
+
|
|
1151
|
+
```typescript
|
|
1152
|
+
// Atomic slot claim — if the global cap is full, leave the task in queued
|
|
1153
|
+
// state. drainQueue will pick it up when a running slot frees.
|
|
1154
|
+
const cap = getScheduleMaxConcurrent();
|
|
1155
|
+
const leaseSec = schedule.maxRunDurationSec ?? getScheduleMaxRunDurationSec();
|
|
1156
|
+
const { claimed } = claimSlot(taskId, cap, leaseSec);
|
|
1157
|
+
|
|
1158
|
+
if (!claimed) {
|
|
1159
|
+
console.log(
|
|
1160
|
+
`[scheduler] schedule "${schedule.name}" queued — cap full (${countRunningScheduledSlots()}/${cap})`,
|
|
1161
|
+
);
|
|
1162
|
+
return;
|
|
1163
|
+
}
|
|
1164
|
+
```
|
|
1165
|
+
|
|
1166
|
+
In `drainQueue()` (around line 51-95), replace the body of the `while (true)` loop so it claims slots atomically and stops when the cap is full:
|
|
1167
|
+
|
|
1168
|
+
```typescript
|
|
1169
|
+
while (true) {
|
|
1170
|
+
const cap = getScheduleMaxConcurrent();
|
|
1171
|
+
if (countRunningScheduledSlots() >= cap) return;
|
|
1172
|
+
|
|
1173
|
+
const [nextQueued] = await db
|
|
1174
|
+
.select({ id: tasks.id })
|
|
1175
|
+
.from(tasks)
|
|
1176
|
+
.where(
|
|
1177
|
+
and(
|
|
1178
|
+
eq(tasks.status, "queued"),
|
|
1179
|
+
inArray(tasks.sourceType, ["scheduled", "heartbeat"])
|
|
1180
|
+
)
|
|
1181
|
+
)
|
|
1182
|
+
.orderBy(asc(tasks.createdAt))
|
|
1183
|
+
.limit(1);
|
|
1184
|
+
|
|
1185
|
+
if (!nextQueued) return;
|
|
1186
|
+
|
|
1187
|
+
const leaseSec = getScheduleMaxRunDurationSec();
|
|
1188
|
+
const { claimed } = claimSlot(nextQueued.id, cap, leaseSec);
|
|
1189
|
+
if (!claimed) return; // lost race or cap filled again
|
|
1190
|
+
|
|
1191
|
+
console.log(`[scheduler] draining queue → running task ${nextQueued.id}`);
|
|
1192
|
+
try {
|
|
1193
|
+
await executeTaskWithRuntime(nextQueued.id);
|
|
1194
|
+
} catch (err) {
|
|
1195
|
+
console.error(`[scheduler] drain task ${nextQueued.id} failed:`, err);
|
|
1196
|
+
}
|
|
1197
|
+
|
|
1198
|
+
try {
|
|
1199
|
+
const [taskRow] = await db
|
|
1200
|
+
.select({ scheduleId: tasks.scheduleId })
|
|
1201
|
+
.from(tasks)
|
|
1202
|
+
.where(eq(tasks.id, nextQueued.id));
|
|
1203
|
+
if (taskRow?.scheduleId) {
|
|
1204
|
+
await recordFiringMetrics(taskRow.scheduleId, nextQueued.id);
|
|
1205
|
+
}
|
|
1206
|
+
} catch (err) {
|
|
1207
|
+
console.error(`[scheduler] metrics recording failed for ${nextQueued.id}:`, err);
|
|
1208
|
+
}
|
|
1209
|
+
}
|
|
1210
|
+
```
|
|
1211
|
+
|
|
1212
|
+
- [ ] **Step 6.4: Run tick-scheduler tests to verify GREEN**
|
|
1213
|
+
|
|
1214
|
+
Run: `npx vitest run src/lib/schedules/__tests__/tick-scheduler.test.ts`
|
|
1215
|
+
Expected: PASS — both cap-enforcement and chat-pressure tests.
|
|
1216
|
+
|
|
1217
|
+
- [ ] **Step 6.5: Run full scheduler test suite**
|
|
1218
|
+
|
|
1219
|
+
Run: `npx vitest run src/lib/schedules`
|
|
1220
|
+
Expected: PASS — no regressions.
|
|
1221
|
+
|
|
1222
|
+
- [ ] **Step 6.6: Run full test suite**
|
|
1223
|
+
|
|
1224
|
+
Run: `npx vitest run`
|
|
1225
|
+
Expected: PASS across the codebase.
|
|
1226
|
+
|
|
1227
|
+
- [ ] **Step 6.7: Commit**
|
|
1228
|
+
|
|
1229
|
+
```bash
|
|
1230
|
+
git add src/lib/schedules/scheduler.ts src/lib/schedules/__tests__/tick-scheduler.test.ts
|
|
1231
|
+
git commit -m "feat(schedules): enforce global concurrency cap with lease reaper + chat pressure"
|
|
1232
|
+
```
|
|
1233
|
+
|
|
1234
|
+
---
|
|
1235
|
+
|
|
1236
|
+
## Task 7: Per-schedule turn budget propagation
|
|
1237
|
+
|
|
1238
|
+
**Files:**
|
|
1239
|
+
- Modify: `src/lib/schedules/scheduler.ts` (populate `tasks.max_turns` at firing)
|
|
1240
|
+
- Modify: `src/lib/agents/claude-agent.ts` (override `ctx.maxTurns` from `task.maxTurns`)
|
|
1241
|
+
- Test: `src/lib/schedules/__tests__/turn-budget.test.ts`
|
|
1242
|
+
|
|
1243
|
+
- [ ] **Step 7.1: Write failing test for task.maxTurns propagation**
|
|
1244
|
+
|
|
1245
|
+
Create `src/lib/schedules/__tests__/turn-budget.test.ts`:
|
|
1246
|
+
|
|
1247
|
+
```typescript
|
|
1248
|
+
import { describe, it, expect, beforeEach, vi } from "vitest";
|
|
1249
|
+
import { db } from "@/lib/db";
|
|
1250
|
+
import { tasks, schedules, projects, settings } from "@/lib/db/schema";
|
|
1251
|
+
import { eq } from "drizzle-orm";
|
|
1252
|
+
import { randomUUID } from "crypto";
|
|
1253
|
+
import { tickScheduler } from "../scheduler";
|
|
1254
|
+
|
|
1255
|
+
vi.mock("@/lib/agents/runtime", () => ({
|
|
1256
|
+
executeTaskWithRuntime: vi.fn().mockResolvedValue(undefined),
|
|
1257
|
+
}));
|
|
1258
|
+
|
|
1259
|
+
describe("per-schedule turn budget propagation", () => {
|
|
1260
|
+
beforeEach(() => {
|
|
1261
|
+
db.delete(tasks).run();
|
|
1262
|
+
db.delete(schedules).run();
|
|
1263
|
+
db.delete(projects).run();
|
|
1264
|
+
db.delete(settings).where(eq(settings.key, "schedule.maxConcurrent")).run();
|
|
1265
|
+
db.insert(settings)
|
|
1266
|
+
.values({ key: "schedule.maxConcurrent", value: "10", updatedAt: new Date() })
|
|
1267
|
+
.run();
|
|
1268
|
+
});
|
|
1269
|
+
|
|
1270
|
+
it("copies schedules.max_turns into tasks.max_turns at firing time", async () => {
|
|
1271
|
+
const pid = randomUUID();
|
|
1272
|
+
const sid = randomUUID();
|
|
1273
|
+
const now = new Date();
|
|
1274
|
+
db.insert(projects)
|
|
1275
|
+
.values({ id: pid, name: "p", status: "active", createdAt: now, updatedAt: now })
|
|
1276
|
+
.run();
|
|
1277
|
+
db.insert(schedules)
|
|
1278
|
+
.values({
|
|
1279
|
+
id: sid,
|
|
1280
|
+
projectId: pid,
|
|
1281
|
+
name: "bounded",
|
|
1282
|
+
prompt: "test",
|
|
1283
|
+
cronExpression: "* * * * *",
|
|
1284
|
+
status: "active",
|
|
1285
|
+
type: "scheduled",
|
|
1286
|
+
firingCount: 0,
|
|
1287
|
+
suppressionCount: 0,
|
|
1288
|
+
heartbeatSpentToday: 0,
|
|
1289
|
+
failureStreak: 0,
|
|
1290
|
+
turnBudgetBreachStreak: 0,
|
|
1291
|
+
nextFireAt: new Date(now.getTime() - 10_000),
|
|
1292
|
+
maxTurns: 42,
|
|
1293
|
+
createdAt: now,
|
|
1294
|
+
updatedAt: now,
|
|
1295
|
+
})
|
|
1296
|
+
.run();
|
|
1297
|
+
|
|
1298
|
+
await tickScheduler();
|
|
1299
|
+
|
|
1300
|
+
const [task] = db.select().from(tasks).where(eq(tasks.scheduleId, sid)).all();
|
|
1301
|
+
expect(task?.maxTurns).toBe(42);
|
|
1302
|
+
});
|
|
1303
|
+
|
|
1304
|
+
it("leaves tasks.max_turns null when schedules.max_turns is null", async () => {
|
|
1305
|
+
const pid = randomUUID();
|
|
1306
|
+
const sid = randomUUID();
|
|
1307
|
+
const now = new Date();
|
|
1308
|
+
db.insert(projects)
|
|
1309
|
+
.values({ id: pid, name: "p", status: "active", createdAt: now, updatedAt: now })
|
|
1310
|
+
.run();
|
|
1311
|
+
db.insert(schedules)
|
|
1312
|
+
.values({
|
|
1313
|
+
id: sid,
|
|
1314
|
+
projectId: pid,
|
|
1315
|
+
name: "unbounded",
|
|
1316
|
+
prompt: "test",
|
|
1317
|
+
cronExpression: "* * * * *",
|
|
1318
|
+
status: "active",
|
|
1319
|
+
type: "scheduled",
|
|
1320
|
+
firingCount: 0,
|
|
1321
|
+
suppressionCount: 0,
|
|
1322
|
+
heartbeatSpentToday: 0,
|
|
1323
|
+
failureStreak: 0,
|
|
1324
|
+
turnBudgetBreachStreak: 0,
|
|
1325
|
+
nextFireAt: new Date(now.getTime() - 10_000),
|
|
1326
|
+
createdAt: now,
|
|
1327
|
+
updatedAt: now,
|
|
1328
|
+
})
|
|
1329
|
+
.run();
|
|
1330
|
+
|
|
1331
|
+
await tickScheduler();
|
|
1332
|
+
|
|
1333
|
+
const [task] = db.select().from(tasks).where(eq(tasks.scheduleId, sid)).all();
|
|
1334
|
+
expect(task?.maxTurns).toBeNull();
|
|
1335
|
+
});
|
|
1336
|
+
});
|
|
1337
|
+
```
|
|
1338
|
+
|
|
1339
|
+
- [ ] **Step 7.2: Run to verify RED**
|
|
1340
|
+
|
|
1341
|
+
Run: `npx vitest run src/lib/schedules/__tests__/turn-budget.test.ts`
|
|
1342
|
+
Expected: FAIL — the first test fails because `task.maxTurns` will be null (the scheduler doesn't copy it yet); the null-passthrough test passes trivially.
|
|
1343
|
+
|
|
1344
|
+
- [ ] **Step 7.3: Populate tasks.max_turns in fireSchedule**
|
|
1345
|
+
|
|
1346
|
+
Edit `src/lib/schedules/scheduler.ts`, inside `fireSchedule()` at the `db.insert(tasks).values({ ... })` call (around line 350-364). Add `maxTurns: schedule.maxTurns,` to the inserted values, placed before `createdAt`:
|
|
1347
|
+
|
|
1348
|
+
```typescript
|
|
1349
|
+
await db.insert(tasks).values({
|
|
1350
|
+
id: taskId,
|
|
1351
|
+
projectId: schedule.projectId,
|
|
1352
|
+
workflowId: null,
|
|
1353
|
+
scheduleId: schedule.id,
|
|
1354
|
+
title: `${schedule.name} — firing #${firingNumber}`,
|
|
1355
|
+
description: budgetHeader + schedule.prompt,
|
|
1356
|
+
status: "queued",
|
|
1357
|
+
assignedAgent: schedule.assignedAgent,
|
|
1358
|
+
agentProfile: schedule.agentProfile,
|
|
1359
|
+
priority: 2,
|
|
1360
|
+
sourceType: "scheduled",
|
|
1361
|
+
maxTurns: schedule.maxTurns, // per-schedule override, NULL = inherit global
|
|
1362
|
+
createdAt: now,
|
|
1363
|
+
updatedAt: now,
|
|
1364
|
+
});
|
|
1365
|
+
```
|
|
1366
|
+
|
|
1367
|
+
Do the same change in `fireHeartbeat()` (around line 528-542) for the heartbeat task insert.
|
|
1368
|
+
|
|
1369
|
+
- [ ] **Step 7.4: Run to verify GREEN**
|
|
1370
|
+
|
|
1371
|
+
Run: `npx vitest run src/lib/schedules/__tests__/turn-budget.test.ts`
|
|
1372
|
+
Expected: PASS — both propagation tests pass.
|
|
1373
|
+
|
|
1374
|
+
- [ ] **Step 7.5: Override ctx.maxTurns in executeClaudeTask**
|
|
1375
|
+
|
|
1376
|
+
Edit `src/lib/agents/claude-agent.ts`. Find `executeClaudeTask()` (line 416-499). After `const ctx = await buildTaskQueryContext(task, agentProfileId);` (around line 433), add:
|
|
1377
|
+
|
|
1378
|
+
```typescript
|
|
1379
|
+
// Per-schedule override: if the task carries its own maxTurns (set by
|
|
1380
|
+
// fireSchedule from schedules.maxTurns), it takes precedence over the
|
|
1381
|
+
// profile default. This is the runtime-enforced budget cap.
|
|
1382
|
+
const effectiveMaxTurns = task.maxTurns ?? ctx.maxTurns;
|
|
1383
|
+
```
|
|
1384
|
+
|
|
1385
|
+
In the SDK `query()` call options (around line 456), replace `maxTurns: ctx.maxTurns,` with `maxTurns: effectiveMaxTurns,`.
|
|
1386
|
+
|
|
1387
|
+
Do the same in `resumeClaudeTask()` (around line 570).
|
|
1388
|
+
|
|
1389
|
+
- [ ] **Step 7.6: Run full test suite**
|
|
1390
|
+
|
|
1391
|
+
Run: `npx vitest run`
|
|
1392
|
+
Expected: PASS — no regressions.
|
|
1393
|
+
|
|
1394
|
+
- [ ] **Step 7.7: Commit**
|
|
1395
|
+
|
|
1396
|
+
```bash
|
|
1397
|
+
git add src/lib/schedules/scheduler.ts src/lib/agents/claude-agent.ts src/lib/schedules/__tests__/turn-budget.test.ts
|
|
1398
|
+
git commit -m "feat(schedules): propagate per-schedule max_turns into SDK query options"
|
|
1399
|
+
```
|
|
1400
|
+
|
|
1401
|
+
---
|
|
1402
|
+
|
|
1403
|
+
## Task 8: Separate `turnBudgetBreachStreak` with first-breach grace
|
|
1404
|
+
|
|
1405
|
+
**Files:**
|
|
1406
|
+
- Modify: `src/lib/schedules/scheduler.ts` (`recordFiringMetrics`)
|
|
1407
|
+
- Modify: `src/lib/schedules/__tests__/turn-budget.test.ts` (extend)
|
|
1408
|
+
|
|
1409
|
+
The existing `recordFiringMetrics()` at scheduler.ts:140-186 uses a single `failureStreak`. Split turn-budget breaches into their own counter so a misconfigured `maxTurns` doesn't auto-pause via the generic threshold of 3.
|
|
1410
|
+
|
|
1411
|
+
- [ ] **Step 8.1: Append failing tests for streak split + grace + auto-pause**
|
|
1412
|
+
|
|
1413
|
+
Append to `src/lib/schedules/__tests__/turn-budget.test.ts`:
|
|
1414
|
+
|
|
1415
|
+
```typescript
|
|
1416
|
+
import { recordFiringMetrics } from "../scheduler";
|
|
1417
|
+
|
|
1418
|
+
async function seedBreachedTask(scheduleId: string): Promise<string> {
|
|
1419
|
+
const id = randomUUID();
|
|
1420
|
+
const now = new Date();
|
|
1421
|
+
db.insert(tasks)
|
|
1422
|
+
.values({
|
|
1423
|
+
id,
|
|
1424
|
+
scheduleId,
|
|
1425
|
+
title: "firing",
|
|
1426
|
+
status: "failed",
|
|
1427
|
+
result: "Agent exhausted its turn limit (42 turns used)",
|
|
1428
|
+
priority: 2,
|
|
1429
|
+
sourceType: "scheduled",
|
|
1430
|
+
resumeCount: 0,
|
|
1431
|
+
failureReason: "turn_limit_exceeded",
|
|
1432
|
+
createdAt: now,
|
|
1433
|
+
updatedAt: now,
|
|
1434
|
+
})
|
|
1435
|
+
.run();
|
|
1436
|
+
return id;
|
|
1437
|
+
}
|
|
1438
|
+
|
|
1439
|
+
describe("turn_budget_breach_streak", () => {
|
|
1440
|
+
beforeEach(() => {
|
|
1441
|
+
db.delete(tasks).run();
|
|
1442
|
+
db.delete(schedules).run();
|
|
1443
|
+
db.delete(projects).run();
|
|
1444
|
+
});
|
|
1445
|
+
|
|
1446
|
+
it("does NOT increment generic failureStreak on turn-budget breach", async () => {
|
|
1447
|
+
const pid = randomUUID();
|
|
1448
|
+
const sid = randomUUID();
|
|
1449
|
+
const now = new Date();
|
|
1450
|
+
db.insert(projects)
|
|
1451
|
+
.values({ id: pid, name: "p", status: "active", createdAt: now, updatedAt: now })
|
|
1452
|
+
.run();
|
|
1453
|
+
db.insert(schedules)
|
|
1454
|
+
.values({
|
|
1455
|
+
id: sid,
|
|
1456
|
+
projectId: pid,
|
|
1457
|
+
name: "bounded",
|
|
1458
|
+
prompt: "test",
|
|
1459
|
+
cronExpression: "* * * * *",
|
|
1460
|
+
status: "active",
|
|
1461
|
+
type: "scheduled",
|
|
1462
|
+
firingCount: 1,
|
|
1463
|
+
suppressionCount: 0,
|
|
1464
|
+
heartbeatSpentToday: 0,
|
|
1465
|
+
failureStreak: 0,
|
|
1466
|
+
turnBudgetBreachStreak: 0,
|
|
1467
|
+
maxTurns: 20,
|
|
1468
|
+
maxTurnsSetAt: new Date(now.getTime() - 86400_000), // yesterday
|
|
1469
|
+
createdAt: now,
|
|
1470
|
+
updatedAt: now,
|
|
1471
|
+
})
|
|
1472
|
+
.run();
|
|
1473
|
+
|
|
1474
|
+
const tid = await seedBreachedTask(sid);
|
|
1475
|
+
await recordFiringMetrics(sid, tid);
|
|
1476
|
+
|
|
1477
|
+
const row = db.select().from(schedules).where(eq(schedules.id, sid)).get();
|
|
1478
|
+
expect(row?.failureStreak).toBe(0);
|
|
1479
|
+
expect(row?.turnBudgetBreachStreak).toBe(1);
|
|
1480
|
+
});
|
|
1481
|
+
|
|
1482
|
+
it("applies first-breach grace when maxTurns was set recently", async () => {
|
|
1483
|
+
const pid = randomUUID();
|
|
1484
|
+
const sid = randomUUID();
|
|
1485
|
+
const now = new Date();
|
|
1486
|
+
db.insert(projects)
|
|
1487
|
+
.values({ id: pid, name: "p", status: "active", createdAt: now, updatedAt: now })
|
|
1488
|
+
.run();
|
|
1489
|
+
db.insert(schedules)
|
|
1490
|
+
.values({
|
|
1491
|
+
id: sid,
|
|
1492
|
+
projectId: pid,
|
|
1493
|
+
name: "bounded",
|
|
1494
|
+
prompt: "test",
|
|
1495
|
+
cronExpression: "0 * * * *", // hourly
|
|
1496
|
+
status: "active",
|
|
1497
|
+
type: "scheduled",
|
|
1498
|
+
firingCount: 1,
|
|
1499
|
+
suppressionCount: 0,
|
|
1500
|
+
heartbeatSpentToday: 0,
|
|
1501
|
+
failureStreak: 0,
|
|
1502
|
+
turnBudgetBreachStreak: 0,
|
|
1503
|
+
maxTurns: 20,
|
|
1504
|
+
// maxTurnsSetAt 30 min ago → first firing after edit → grace applies
|
|
1505
|
+
maxTurnsSetAt: new Date(now.getTime() - 30 * 60 * 1000),
|
|
1506
|
+
createdAt: now,
|
|
1507
|
+
updatedAt: now,
|
|
1508
|
+
})
|
|
1509
|
+
.run();
|
|
1510
|
+
|
|
1511
|
+
const tid = await seedBreachedTask(sid);
|
|
1512
|
+
await recordFiringMetrics(sid, tid);
|
|
1513
|
+
|
|
1514
|
+
const row = db.select().from(schedules).where(eq(schedules.id, sid)).get();
|
|
1515
|
+
expect(row?.turnBudgetBreachStreak).toBe(0); // grace applied
|
|
1516
|
+
});
|
|
1517
|
+
|
|
1518
|
+
it("auto-pauses at turn_budget_breach_streak >= 5", async () => {
|
|
1519
|
+
const pid = randomUUID();
|
|
1520
|
+
const sid = randomUUID();
|
|
1521
|
+
const now = new Date();
|
|
1522
|
+
db.insert(projects)
|
|
1523
|
+
.values({ id: pid, name: "p", status: "active", createdAt: now, updatedAt: now })
|
|
1524
|
+
.run();
|
|
1525
|
+
db.insert(schedules)
|
|
1526
|
+
.values({
|
|
1527
|
+
id: sid,
|
|
1528
|
+
projectId: pid,
|
|
1529
|
+
name: "bounded",
|
|
1530
|
+
prompt: "test",
|
|
1531
|
+
cronExpression: "* * * * *",
|
|
1532
|
+
status: "active",
|
|
1533
|
+
type: "scheduled",
|
|
1534
|
+
firingCount: 5,
|
|
1535
|
+
suppressionCount: 0,
|
|
1536
|
+
heartbeatSpentToday: 0,
|
|
1537
|
+
failureStreak: 0,
|
|
1538
|
+
turnBudgetBreachStreak: 4, // next breach trips the threshold
|
|
1539
|
+
maxTurns: 20,
|
|
1540
|
+
maxTurnsSetAt: new Date(now.getTime() - 86400_000),
|
|
1541
|
+
createdAt: now,
|
|
1542
|
+
updatedAt: now,
|
|
1543
|
+
})
|
|
1544
|
+
.run();
|
|
1545
|
+
|
|
1546
|
+
const tid = await seedBreachedTask(sid);
|
|
1547
|
+
await recordFiringMetrics(sid, tid);
|
|
1548
|
+
|
|
1549
|
+
const row = db.select().from(schedules).where(eq(schedules.id, sid)).get();
|
|
1550
|
+
expect(row?.status).toBe("paused");
|
|
1551
|
+
expect(row?.turnBudgetBreachStreak).toBe(5);
|
|
1552
|
+
});
|
|
1553
|
+
});
|
|
1554
|
+
```
|
|
1555
|
+
|
|
1556
|
+
- [ ] **Step 8.2: Run to verify RED**
|
|
1557
|
+
|
|
1558
|
+
Run: `npx vitest run src/lib/schedules/__tests__/turn-budget.test.ts`
|
|
1559
|
+
Expected: FAIL — `recordFiringMetrics` doesn't split streaks yet.
|
|
1560
|
+
|
|
1561
|
+
- [ ] **Step 8.3: Refactor recordFiringMetrics**
|
|
1562
|
+
|
|
1563
|
+
Edit `src/lib/schedules/scheduler.ts`. Replace the body of `recordFiringMetrics()` (lines 140-186) with a version that splits streaks and honors first-breach grace:
|
|
1564
|
+
|
|
1565
|
+
```typescript
|
|
1566
|
+
const TURN_BUDGET_BREACH_AUTO_PAUSE_THRESHOLD = 5;
|
|
1567
|
+
const GRACE_PERIOD_MULTIPLIER = 2; // grace window = 2 × cron interval
|
|
1568
|
+
|
|
1569
|
+
export async function recordFiringMetrics(
|
|
1570
|
+
scheduleId: string,
|
|
1571
|
+
taskId: string,
|
|
1572
|
+
): Promise<void> {
|
|
1573
|
+
const [task] = await db
|
|
1574
|
+
.select({
|
|
1575
|
+
status: tasks.status,
|
|
1576
|
+
result: tasks.result,
|
|
1577
|
+
failureReason: tasks.failureReason,
|
|
1578
|
+
updatedAt: tasks.updatedAt,
|
|
1579
|
+
})
|
|
1580
|
+
.from(tasks)
|
|
1581
|
+
.where(eq(tasks.id, taskId));
|
|
1582
|
+
if (!task) return;
|
|
1583
|
+
|
|
1584
|
+
const [schedule] = await db
|
|
1585
|
+
.select()
|
|
1586
|
+
.from(schedules)
|
|
1587
|
+
.where(eq(schedules.id, scheduleId));
|
|
1588
|
+
if (!schedule) return;
|
|
1589
|
+
|
|
1590
|
+
const turnCountResult = await db
|
|
1591
|
+
.select({ count: sql<number>`count(*)` })
|
|
1592
|
+
.from(agentLogs)
|
|
1593
|
+
.where(eq(agentLogs.taskId, taskId));
|
|
1594
|
+
const turns = Number(turnCountResult[0]?.count ?? 0);
|
|
1595
|
+
|
|
1596
|
+
const prevAvg = schedule.avgTurnsPerFiring ?? turns;
|
|
1597
|
+
const newAvg = Math.round(prevAvg * 0.7 + turns * 0.3);
|
|
1598
|
+
|
|
1599
|
+
const isFailure = task.status === "failed";
|
|
1600
|
+
const failureReason =
|
|
1601
|
+
task.failureReason ?? (isFailure ? detectFailureReason(task.result) : null);
|
|
1602
|
+
const isTurnBudgetBreach = failureReason === "turn_limit_exceeded";
|
|
1603
|
+
const isGenericFailure = isFailure && !isTurnBudgetBreach;
|
|
1604
|
+
|
|
1605
|
+
// First-breach grace
|
|
1606
|
+
let turnBudgetStreakDelta = 0;
|
|
1607
|
+
if (isTurnBudgetBreach) {
|
|
1608
|
+
const graceApplies = shouldApplyGrace(
|
|
1609
|
+
schedule.maxTurnsSetAt,
|
|
1610
|
+
schedule.cronExpression,
|
|
1611
|
+
task.updatedAt,
|
|
1612
|
+
);
|
|
1613
|
+
if (!graceApplies) turnBudgetStreakDelta = 1;
|
|
1614
|
+
}
|
|
1615
|
+
|
|
1616
|
+
const newFailureStreak = isGenericFailure ? (schedule.failureStreak ?? 0) + 1 : 0;
|
|
1617
|
+
const newBudgetStreak =
|
|
1618
|
+
turnBudgetStreakDelta > 0
|
|
1619
|
+
? (schedule.turnBudgetBreachStreak ?? 0) + 1
|
|
1620
|
+
: isTurnBudgetBreach
|
|
1621
|
+
? schedule.turnBudgetBreachStreak
|
|
1622
|
+
: 0;
|
|
1623
|
+
const shouldAutoPauseGeneric =
|
|
1624
|
+
isGenericFailure && newFailureStreak >= 3 && schedule.status === "active";
|
|
1625
|
+
const shouldAutoPauseBudget =
|
|
1626
|
+
newBudgetStreak >= TURN_BUDGET_BREACH_AUTO_PAUSE_THRESHOLD &&
|
|
1627
|
+
schedule.status === "active";
|
|
1628
|
+
const shouldAutoPause = shouldAutoPauseGeneric || shouldAutoPauseBudget;
|
|
1629
|
+
|
|
1630
|
+
await db
|
|
1631
|
+
.update(schedules)
|
|
1632
|
+
.set({
|
|
1633
|
+
lastTurnCount: turns,
|
|
1634
|
+
avgTurnsPerFiring: newAvg,
|
|
1635
|
+
failureStreak: newFailureStreak,
|
|
1636
|
+
turnBudgetBreachStreak: newBudgetStreak,
|
|
1637
|
+
lastFailureReason: failureReason,
|
|
1638
|
+
status: shouldAutoPause ? "paused" : schedule.status,
|
|
1639
|
+
updatedAt: new Date(),
|
|
1640
|
+
})
|
|
1641
|
+
.where(eq(schedules.id, scheduleId));
|
|
1642
|
+
|
|
1643
|
+
if (shouldAutoPauseGeneric) {
|
|
1644
|
+
console.warn(
|
|
1645
|
+
`[scheduler] auto-paused "${schedule.name}" after 3 consecutive failures`,
|
|
1646
|
+
);
|
|
1647
|
+
}
|
|
1648
|
+
if (shouldAutoPauseBudget) {
|
|
1649
|
+
console.warn(
|
|
1650
|
+
`[scheduler] auto-paused "${schedule.name}" after 5 consecutive turn-budget breaches (avg: ${newAvg} steps, cap: ${schedule.maxTurns})`,
|
|
1651
|
+
);
|
|
1652
|
+
}
|
|
1653
|
+
}
|
|
1654
|
+
|
|
1655
|
+
/**
|
|
1656
|
+
* First-breach grace: if maxTurnsSetAt was recent enough that this is the
|
|
1657
|
+
* first-or-second firing after the edit, don't count the breach toward the
|
|
1658
|
+
* auto-pause streak.
|
|
1659
|
+
*/
|
|
1660
|
+
function shouldApplyGrace(
|
|
1661
|
+
maxTurnsSetAt: Date | null,
|
|
1662
|
+
cronExpression: string,
|
|
1663
|
+
completedAt: Date | null,
|
|
1664
|
+
): boolean {
|
|
1665
|
+
if (!maxTurnsSetAt || !completedAt) return false;
|
|
1666
|
+
try {
|
|
1667
|
+
const nextAfterSet = computeNextFireTime(cronExpression, maxTurnsSetAt);
|
|
1668
|
+
const cronIntervalMs = nextAfterSet.getTime() - maxTurnsSetAt.getTime();
|
|
1669
|
+
const graceWindowEnd = new Date(
|
|
1670
|
+
maxTurnsSetAt.getTime() + GRACE_PERIOD_MULTIPLIER * cronIntervalMs,
|
|
1671
|
+
);
|
|
1672
|
+
return completedAt <= graceWindowEnd;
|
|
1673
|
+
} catch {
|
|
1674
|
+
return false;
|
|
1675
|
+
}
|
|
1676
|
+
}
|
|
1677
|
+
```
|
|
1678
|
+
|
|
1679
|
+
- [ ] **Step 8.4: Run to verify GREEN**
|
|
1680
|
+
|
|
1681
|
+
Run: `npx vitest run src/lib/schedules/__tests__/turn-budget.test.ts`
|
|
1682
|
+
Expected: PASS — all turn-budget tests (propagation + streak + grace + auto-pause).
|
|
1683
|
+
|
|
1684
|
+
- [ ] **Step 8.5: Run full scheduler suite**
|
|
1685
|
+
|
|
1686
|
+
Run: `npx vitest run src/lib/schedules`
|
|
1687
|
+
Expected: PASS — no regressions.
|
|
1688
|
+
|
|
1689
|
+
- [ ] **Step 8.6: Commit**
|
|
1690
|
+
|
|
1691
|
+
```bash
|
|
1692
|
+
git add src/lib/schedules/scheduler.ts src/lib/schedules/__tests__/turn-budget.test.ts
|
|
1693
|
+
git commit -m "feat(schedules): separate turn-budget breach streak with first-breach grace"
|
|
1694
|
+
```
|
|
1695
|
+
|
|
1696
|
+
---
|
|
1697
|
+
|
|
1698
|
+
## Task 9: Runtime adapter writes explicit `failure_reason`
|
|
1699
|
+
|
|
1700
|
+
**Files:**
|
|
1701
|
+
- Modify: `src/lib/agents/claude-agent.ts`
|
|
1702
|
+
- Test: `src/lib/agents/__tests__/failure-reason.test.ts`
|
|
1703
|
+
|
|
1704
|
+
The runtime adapter currently catches errors in `handleExecutionError()` but does not write `tasks.failure_reason`. The reaper and recordFiringMetrics rely on this column. Populate it at terminal-state transitions so `detectFailureReason()` (scheduler.ts:122) becomes a fallback, not the primary classifier.
|
|
1705
|
+
|
|
1706
|
+
- [ ] **Step 9.1: Write failing classifier tests**
|
|
1707
|
+
|
|
1708
|
+
Create `src/lib/agents/__tests__/failure-reason.test.ts`:
|
|
1709
|
+
|
|
1710
|
+
```typescript
|
|
1711
|
+
import { describe, it, expect, beforeEach } from "vitest";
|
|
1712
|
+
import { db } from "@/lib/db";
|
|
1713
|
+
import { tasks, projects } from "@/lib/db/schema";
|
|
1714
|
+
import { eq } from "drizzle-orm";
|
|
1715
|
+
import { randomUUID } from "crypto";
|
|
1716
|
+
import { writeTerminalFailureReason } from "../claude-agent";
|
|
1717
|
+
|
|
1718
|
+
function seedRunningTask(): string {
|
|
1719
|
+
const pid = randomUUID();
|
|
1720
|
+
const tid = randomUUID();
|
|
1721
|
+
const now = new Date();
|
|
1722
|
+
db.insert(projects)
|
|
1723
|
+
.values({ id: pid, name: "p", status: "active", createdAt: now, updatedAt: now })
|
|
1724
|
+
.run();
|
|
1725
|
+
db.insert(tasks)
|
|
1726
|
+
.values({
|
|
1727
|
+
id: tid,
|
|
1728
|
+
projectId: pid,
|
|
1729
|
+
title: "t",
|
|
1730
|
+
status: "running",
|
|
1731
|
+
priority: 2,
|
|
1732
|
+
resumeCount: 0,
|
|
1733
|
+
createdAt: now,
|
|
1734
|
+
updatedAt: now,
|
|
1735
|
+
})
|
|
1736
|
+
.run();
|
|
1737
|
+
return tid;
|
|
1738
|
+
}
|
|
1739
|
+
|
|
1740
|
+
describe("writeTerminalFailureReason", () => {
|
|
1741
|
+
beforeEach(() => {
|
|
1742
|
+
db.delete(tasks).run();
|
|
1743
|
+
db.delete(projects).run();
|
|
1744
|
+
});
|
|
1745
|
+
|
|
1746
|
+
it("writes 'turn_limit_exceeded' on turn limit errors", async () => {
|
|
1747
|
+
const tid = seedRunningTask();
|
|
1748
|
+
await writeTerminalFailureReason(
|
|
1749
|
+
tid,
|
|
1750
|
+
new Error("Agent exhausted its turn limit (42 turns used)"),
|
|
1751
|
+
);
|
|
1752
|
+
const row = db.select().from(tasks).where(eq(tasks.id, tid)).get();
|
|
1753
|
+
expect(row?.failureReason).toBe("turn_limit_exceeded");
|
|
1754
|
+
});
|
|
1755
|
+
|
|
1756
|
+
it("writes 'aborted' on AbortError", async () => {
|
|
1757
|
+
const tid = seedRunningTask();
|
|
1758
|
+
const err = new Error("aborted");
|
|
1759
|
+
err.name = "AbortError";
|
|
1760
|
+
await writeTerminalFailureReason(tid, err);
|
|
1761
|
+
const row = db.select().from(tasks).where(eq(tasks.id, tid)).get();
|
|
1762
|
+
expect(row?.failureReason).toBe("aborted");
|
|
1763
|
+
});
|
|
1764
|
+
|
|
1765
|
+
it("writes 'sdk_error' for unknown errors", async () => {
|
|
1766
|
+
const tid = seedRunningTask();
|
|
1767
|
+
await writeTerminalFailureReason(tid, new Error("something weird"));
|
|
1768
|
+
const row = db.select().from(tasks).where(eq(tasks.id, tid)).get();
|
|
1769
|
+
expect(row?.failureReason).toBe("sdk_error");
|
|
1770
|
+
});
|
|
1771
|
+
|
|
1772
|
+
it("writes 'rate_limited' on 429 errors", async () => {
|
|
1773
|
+
const tid = seedRunningTask();
|
|
1774
|
+
await writeTerminalFailureReason(tid, new Error("HTTP 429 rate limit"));
|
|
1775
|
+
const row = db.select().from(tasks).where(eq(tasks.id, tid)).get();
|
|
1776
|
+
expect(row?.failureReason).toBe("rate_limited");
|
|
1777
|
+
});
|
|
1778
|
+
});
|
|
1779
|
+
```
|
|
1780
|
+
|
|
1781
|
+
- [ ] **Step 9.2: Run to verify RED**
|
|
1782
|
+
|
|
1783
|
+
Run: `npx vitest run src/lib/agents/__tests__/failure-reason.test.ts`
|
|
1784
|
+
Expected: FAIL — `writeTerminalFailureReason` not exported.
|
|
1785
|
+
|
|
1786
|
+
- [ ] **Step 9.3: Add writeTerminalFailureReason helper**
|
|
1787
|
+
|
|
1788
|
+
Edit `src/lib/agents/claude-agent.ts`. Add this helper near the other top-level exports (after the imports block):
|
|
1789
|
+
|
|
1790
|
+
```typescript
|
|
1791
|
+
/**
|
|
1792
|
+
* Write an explicit failure_reason to tasks at terminal-state transitions.
|
|
1793
|
+
* Called from handleExecutionError and the execute/resume functions on known
|
|
1794
|
+
* error classes. Prefer this over reverse-engineering reasons from text via
|
|
1795
|
+
* detectFailureReason in scheduler.ts, which is fragile to SDK message changes.
|
|
1796
|
+
*/
|
|
1797
|
+
export async function writeTerminalFailureReason(
|
|
1798
|
+
taskId: string,
|
|
1799
|
+
error: unknown,
|
|
1800
|
+
): Promise<void> {
|
|
1801
|
+
const reason = classifyError(error);
|
|
1802
|
+
await db
|
|
1803
|
+
.update(tasks)
|
|
1804
|
+
.set({ failureReason: reason, updatedAt: new Date() })
|
|
1805
|
+
.where(eq(tasks.id, taskId));
|
|
1806
|
+
}
|
|
1807
|
+
|
|
1808
|
+
function classifyError(error: unknown): string {
|
|
1809
|
+
if (!(error instanceof Error)) return "sdk_error";
|
|
1810
|
+
if (error.name === "AbortError" || error.message.includes("aborted")) {
|
|
1811
|
+
return "aborted";
|
|
1812
|
+
}
|
|
1813
|
+
const lower = error.message.toLowerCase();
|
|
1814
|
+
if (
|
|
1815
|
+
lower.includes("turn") &&
|
|
1816
|
+
(lower.includes("limit") || lower.includes("exhausted") || lower.includes("max"))
|
|
1817
|
+
) {
|
|
1818
|
+
return "turn_limit_exceeded";
|
|
1819
|
+
}
|
|
1820
|
+
if (lower.includes("timeout") || lower.includes("timed out")) return "timeout";
|
|
1821
|
+
if (lower.includes("budget")) return "budget_exceeded";
|
|
1822
|
+
if (lower.includes("authentication") || lower.includes("oauth")) {
|
|
1823
|
+
return "auth_failed";
|
|
1824
|
+
}
|
|
1825
|
+
if (lower.includes("rate limit") || lower.includes("429")) {
|
|
1826
|
+
return "rate_limited";
|
|
1827
|
+
}
|
|
1828
|
+
return "sdk_error";
|
|
1829
|
+
}
|
|
1830
|
+
```
|
|
1831
|
+
|
|
1832
|
+
- [ ] **Step 9.4: Call it from handleExecutionError**
|
|
1833
|
+
|
|
1834
|
+
Still in `claude-agent.ts`, find `handleExecutionError()`. At the point where it updates `tasks.status = 'failed'`, add an awaited call to `writeTerminalFailureReason(taskId, error)` alongside the status update (the helper is async — do not fire-and-forget, or the reason may not be committed before the reaper reads it).
|
|
1835
|
+
|
|
1836
|
+
- [ ] **Step 9.5: Run to verify GREEN**
|
|
1837
|
+
|
|
1838
|
+
Run: `npx vitest run src/lib/agents/__tests__/failure-reason.test.ts`
|
|
1839
|
+
Expected: PASS — 4 classification tests.
|
|
1840
|
+
|
|
1841
|
+
- [ ] **Step 9.6: Run full suite**
|
|
1842
|
+
|
|
1843
|
+
Run: `npx vitest run`
|
|
1844
|
+
Expected: PASS.
|
|
1845
|
+
|
|
1846
|
+
- [ ] **Step 9.7: Commit**
|
|
1847
|
+
|
|
1848
|
+
```bash
|
|
1849
|
+
git add src/lib/agents/claude-agent.ts src/lib/agents/__tests__/failure-reason.test.ts
|
|
1850
|
+
git commit -m "feat(agents): runtime adapter writes explicit failure_reason at terminal states"
|
|
1851
|
+
```
|
|
1852
|
+
|
|
1853
|
+
---
|
|
1854
|
+
|
|
1855
|
+
## Task 10: Manual execute endpoint with cap + force bypass
|
|
1856
|
+
|
|
1857
|
+
**Files:**
|
|
1858
|
+
- Create: `src/app/api/schedules/[id]/execute/route.ts`
|
|
1859
|
+
- Test: `src/app/api/schedules/__tests__/execute-route.test.ts`
|
|
1860
|
+
|
|
1861
|
+
No manual-execute endpoint exists today. Build one that honors the cap by default with explicit `?force=true` bypass.
|
|
1862
|
+
|
|
1863
|
+
- [ ] **Step 10.1: Write failing route tests**
|
|
1864
|
+
|
|
1865
|
+
Create `src/app/api/schedules/__tests__/execute-route.test.ts`:
|
|
1866
|
+
|
|
1867
|
+
```typescript
|
|
1868
|
+
import { describe, it, expect, beforeEach, vi } from "vitest";
|
|
1869
|
+
import { db } from "@/lib/db";
|
|
1870
|
+
import { tasks, schedules, projects, settings, usageLedger } from "@/lib/db/schema";
|
|
1871
|
+
import { eq } from "drizzle-orm";
|
|
1872
|
+
import { randomUUID } from "crypto";
|
|
1873
|
+
import { NextRequest } from "next/server";
|
|
1874
|
+
import { POST } from "../[id]/execute/route";
|
|
1875
|
+
|
|
1876
|
+
vi.mock("@/lib/agents/runtime", () => ({
|
|
1877
|
+
executeTaskWithRuntime: vi.fn().mockResolvedValue(undefined),
|
|
1878
|
+
}));
|
|
1879
|
+
|
|
1880
|
+
function req(url: string): NextRequest {
|
|
1881
|
+
return new NextRequest(new URL(url, "http://localhost"));
|
|
1882
|
+
}
|
|
1883
|
+
|
|
1884
|
+
function seedSchedule(): string {
|
|
1885
|
+
const pid = randomUUID();
|
|
1886
|
+
const sid = randomUUID();
|
|
1887
|
+
const now = new Date();
|
|
1888
|
+
db.insert(projects)
|
|
1889
|
+
.values({ id: pid, name: "p", status: "active", createdAt: now, updatedAt: now })
|
|
1890
|
+
.run();
|
|
1891
|
+
db.insert(schedules)
|
|
1892
|
+
.values({
|
|
1893
|
+
id: sid,
|
|
1894
|
+
projectId: pid,
|
|
1895
|
+
name: "manual",
|
|
1896
|
+
prompt: "test",
|
|
1897
|
+
cronExpression: "0 0 * * *",
|
|
1898
|
+
status: "active",
|
|
1899
|
+
type: "scheduled",
|
|
1900
|
+
firingCount: 0,
|
|
1901
|
+
suppressionCount: 0,
|
|
1902
|
+
heartbeatSpentToday: 0,
|
|
1903
|
+
failureStreak: 0,
|
|
1904
|
+
turnBudgetBreachStreak: 0,
|
|
1905
|
+
createdAt: now,
|
|
1906
|
+
updatedAt: now,
|
|
1907
|
+
})
|
|
1908
|
+
.run();
|
|
1909
|
+
return sid;
|
|
1910
|
+
}
|
|
1911
|
+
|
|
1912
|
+
describe("POST /api/schedules/:id/execute", () => {
|
|
1913
|
+
beforeEach(() => {
|
|
1914
|
+
db.delete(usageLedger).run();
|
|
1915
|
+
db.delete(tasks).run();
|
|
1916
|
+
db.delete(schedules).run();
|
|
1917
|
+
db.delete(projects).run();
|
|
1918
|
+
db.delete(settings).where(eq(settings.key, "schedule.maxConcurrent")).run();
|
|
1919
|
+
db.insert(settings)
|
|
1920
|
+
.values({ key: "schedule.maxConcurrent", value: "1", updatedAt: new Date() })
|
|
1921
|
+
.run();
|
|
1922
|
+
});
|
|
1923
|
+
|
|
1924
|
+
it("fires when capacity available, returns 200 with taskId", async () => {
|
|
1925
|
+
const sid = seedSchedule();
|
|
1926
|
+
const res = await POST(req(`/api/schedules/${sid}/execute`), {
|
|
1927
|
+
params: Promise.resolve({ id: sid }),
|
|
1928
|
+
});
|
|
1929
|
+
expect(res.status).toBe(200);
|
|
1930
|
+
const body = await res.json();
|
|
1931
|
+
expect(body.taskId).toBeDefined();
|
|
1932
|
+
});
|
|
1933
|
+
|
|
1934
|
+
it("returns 429 when cap is full", async () => {
|
|
1935
|
+
const sid1 = seedSchedule();
|
|
1936
|
+
const sid2 = seedSchedule();
|
|
1937
|
+
|
|
1938
|
+
const res1 = await POST(req(`/api/schedules/${sid1}/execute`), {
|
|
1939
|
+
params: Promise.resolve({ id: sid1 }),
|
|
1940
|
+
});
|
|
1941
|
+
expect(res1.status).toBe(200);
|
|
1942
|
+
|
|
1943
|
+
const res2 = await POST(req(`/api/schedules/${sid2}/execute`), {
|
|
1944
|
+
params: Promise.resolve({ id: sid2 }),
|
|
1945
|
+
});
|
|
1946
|
+
expect(res2.status).toBe(429);
|
|
1947
|
+
const body = await res2.json();
|
|
1948
|
+
expect(body.error).toBe("capacity_full");
|
|
1949
|
+
expect(body.slotEtaSec).toBeGreaterThanOrEqual(0);
|
|
1950
|
+
});
|
|
1951
|
+
|
|
1952
|
+
it("bypasses the cap when ?force=true and writes audit-log entry", async () => {
|
|
1953
|
+
const sid1 = seedSchedule();
|
|
1954
|
+
const sid2 = seedSchedule();
|
|
1955
|
+
|
|
1956
|
+
await POST(req(`/api/schedules/${sid1}/execute`), {
|
|
1957
|
+
params: Promise.resolve({ id: sid1 }),
|
|
1958
|
+
});
|
|
1959
|
+
|
|
1960
|
+
const res2 = await POST(
|
|
1961
|
+
req(`/api/schedules/${sid2}/execute?force=true`),
|
|
1962
|
+
{ params: Promise.resolve({ id: sid2 }) },
|
|
1963
|
+
);
|
|
1964
|
+
expect(res2.status).toBe(200);
|
|
1965
|
+
|
|
1966
|
+
const ledger = db
|
|
1967
|
+
.select()
|
|
1968
|
+
.from(usageLedger)
|
|
1969
|
+
.where(eq(usageLedger.activityType, "manual_force_bypass"))
|
|
1970
|
+
.all();
|
|
1971
|
+
expect(ledger.length).toBe(1);
|
|
1972
|
+
});
|
|
1973
|
+
|
|
1974
|
+
it("returns 404 when the schedule does not exist", async () => {
|
|
1975
|
+
const res = await POST(req("/api/schedules/nonexistent/execute"), {
|
|
1976
|
+
params: Promise.resolve({ id: "nonexistent" }),
|
|
1977
|
+
});
|
|
1978
|
+
expect(res.status).toBe(404);
|
|
1979
|
+
});
|
|
1980
|
+
});
|
|
1981
|
+
```
|
|
1982
|
+
|
|
1983
|
+
- [ ] **Step 10.2: Run to verify RED**
|
|
1984
|
+
|
|
1985
|
+
Run: `npx vitest run src/app/api/schedules/__tests__/execute-route.test.ts`
|
|
1986
|
+
Expected: FAIL — route module does not exist.
|
|
1987
|
+
|
|
1988
|
+
- [ ] **Step 10.3: Implement the manual-execute route**
|
|
1989
|
+
|
|
1990
|
+
Create `src/app/api/schedules/[id]/execute/route.ts`:
|
|
1991
|
+
|
|
1992
|
+
```typescript
|
|
1993
|
+
import { NextRequest, NextResponse } from "next/server";
|
|
1994
|
+
import { db } from "@/lib/db";
|
|
1995
|
+
import { schedules, tasks, usageLedger } from "@/lib/db/schema";
|
|
1996
|
+
import { eq } from "drizzle-orm";
|
|
1997
|
+
import { executeTaskWithRuntime } from "@/lib/agents/runtime";
|
|
1998
|
+
import { claimSlot, countRunningScheduledSlots } from "@/lib/schedules/slot-claim";
|
|
1999
|
+
import {
|
|
2000
|
+
getScheduleMaxConcurrent,
|
|
2001
|
+
getScheduleMaxRunDurationSec,
|
|
2002
|
+
} from "@/lib/schedules/config";
|
|
2003
|
+
import { randomUUID } from "crypto";
|
|
2004
|
+
|
|
2005
|
+
/**
|
|
2006
|
+
* Manually fire a schedule. Honors the global concurrency cap by default.
|
|
2007
|
+
* Use `?force=true` to bypass the cap (logged to usage_ledger as
|
|
2008
|
+
* manual_force_bypass for audit).
|
|
2009
|
+
*/
|
|
2010
|
+
export async function POST(
|
|
2011
|
+
req: NextRequest,
|
|
2012
|
+
{ params }: { params: Promise<{ id: string }> },
|
|
2013
|
+
) {
|
|
2014
|
+
const { id: scheduleId } = await params;
|
|
2015
|
+
const force = req.nextUrl.searchParams.get("force") === "true";
|
|
2016
|
+
|
|
2017
|
+
const [schedule] = await db
|
|
2018
|
+
.select()
|
|
2019
|
+
.from(schedules)
|
|
2020
|
+
.where(eq(schedules.id, scheduleId));
|
|
2021
|
+
if (!schedule) {
|
|
2022
|
+
return NextResponse.json({ error: "schedule_not_found" }, { status: 404 });
|
|
2023
|
+
}
|
|
2024
|
+
|
|
2025
|
+
const taskId = randomUUID();
|
|
2026
|
+
const firingNumber = schedule.firingCount + 1;
|
|
2027
|
+
const now = new Date();
|
|
2028
|
+
|
|
2029
|
+
await db.insert(tasks).values({
|
|
2030
|
+
id: taskId,
|
|
2031
|
+
projectId: schedule.projectId,
|
|
2032
|
+
workflowId: null,
|
|
2033
|
+
scheduleId: schedule.id,
|
|
2034
|
+
title: `${schedule.name} — manual firing #${firingNumber}`,
|
|
2035
|
+
description: schedule.prompt,
|
|
2036
|
+
status: "queued",
|
|
2037
|
+
assignedAgent: schedule.assignedAgent,
|
|
2038
|
+
agentProfile: schedule.agentProfile,
|
|
2039
|
+
priority: 2,
|
|
2040
|
+
sourceType: "scheduled",
|
|
2041
|
+
maxTurns: schedule.maxTurns,
|
|
2042
|
+
createdAt: now,
|
|
2043
|
+
updatedAt: now,
|
|
2044
|
+
});
|
|
2045
|
+
|
|
2046
|
+
const cap = getScheduleMaxConcurrent();
|
|
2047
|
+
const leaseSec = schedule.maxRunDurationSec ?? getScheduleMaxRunDurationSec();
|
|
2048
|
+
|
|
2049
|
+
const effectiveCap = force ? Number.MAX_SAFE_INTEGER : cap;
|
|
2050
|
+
const { claimed } = claimSlot(taskId, effectiveCap, leaseSec);
|
|
2051
|
+
|
|
2052
|
+
if (!claimed) {
|
|
2053
|
+
await db.delete(tasks).where(eq(tasks.id, taskId));
|
|
2054
|
+
const slotEtaSec = 60;
|
|
2055
|
+
return NextResponse.json(
|
|
2056
|
+
{
|
|
2057
|
+
error: "capacity_full",
|
|
2058
|
+
message: `Swarm at capacity (${countRunningScheduledSlots()}/${cap}). Retry in ~${slotEtaSec}s or add ?force=true to bypass.`,
|
|
2059
|
+
slotEtaSec,
|
|
2060
|
+
},
|
|
2061
|
+
{ status: 429 },
|
|
2062
|
+
);
|
|
2063
|
+
}
|
|
2064
|
+
|
|
2065
|
+
if (force) {
|
|
2066
|
+
await db.insert(usageLedger).values({
|
|
2067
|
+
id: randomUUID(),
|
|
2068
|
+
taskId,
|
|
2069
|
+
scheduleId: schedule.id,
|
|
2070
|
+
projectId: schedule.projectId,
|
|
2071
|
+
activityType: "manual_force_bypass",
|
|
2072
|
+
runtimeId: schedule.assignedAgent ?? null,
|
|
2073
|
+
providerId: null,
|
|
2074
|
+
modelId: null,
|
|
2075
|
+
inputTokens: null,
|
|
2076
|
+
outputTokens: null,
|
|
2077
|
+
totalTokens: null,
|
|
2078
|
+
costMicros: 0,
|
|
2079
|
+
status: "completed",
|
|
2080
|
+
startedAt: now,
|
|
2081
|
+
finishedAt: now,
|
|
2082
|
+
} as typeof usageLedger.$inferInsert);
|
|
2083
|
+
}
|
|
2084
|
+
|
|
2085
|
+
executeTaskWithRuntime(taskId).catch((err) => {
|
|
2086
|
+
console.error(`[api/schedules/execute] task ${taskId} failed:`, err);
|
|
2087
|
+
});
|
|
2088
|
+
|
|
2089
|
+
return NextResponse.json({ taskId, forced: force });
|
|
2090
|
+
}
|
|
2091
|
+
```
|
|
2092
|
+
|
|
2093
|
+
**Note:** if `usageLedger` schema columns differ from the insert shape above, read `src/lib/db/schema.ts` lines 297-340 for the actual column names and adjust. The `manual_force_bypass` value may need to be added to the `activityType` column's allowed values — follow the pattern used for `tasks.status` as a reference.
|
|
2094
|
+
|
|
2095
|
+
- [ ] **Step 10.4: Run to verify GREEN**
|
|
2096
|
+
|
|
2097
|
+
Run: `npx vitest run src/app/api/schedules/__tests__/execute-route.test.ts`
|
|
2098
|
+
Expected: PASS — 4 tests.
|
|
2099
|
+
|
|
2100
|
+
- [ ] **Step 10.5: Commit**
|
|
2101
|
+
|
|
2102
|
+
```bash
|
|
2103
|
+
git add src/app/api/schedules/[id]/execute/route.ts src/app/api/schedules/__tests__/execute-route.test.ts
|
|
2104
|
+
git commit -m "feat(schedules): manual execute endpoint honors cap with force bypass + audit"
|
|
2105
|
+
```
|
|
2106
|
+
|
|
2107
|
+
---
|
|
2108
|
+
|
|
2109
|
+
## Task 11: Firing metrics insertion
|
|
2110
|
+
|
|
2111
|
+
**Files:**
|
|
2112
|
+
- Modify: `src/lib/schedules/scheduler.ts` (`recordFiringMetrics` — add insert at end)
|
|
2113
|
+
- Test: `src/lib/schedules/__tests__/firing-metrics.test.ts`
|
|
2114
|
+
|
|
2115
|
+
- [ ] **Step 11.1: Write failing metrics test**
|
|
2116
|
+
|
|
2117
|
+
Create `src/lib/schedules/__tests__/firing-metrics.test.ts`:
|
|
2118
|
+
|
|
2119
|
+
```typescript
|
|
2120
|
+
import { describe, it, expect, beforeEach } from "vitest";
|
|
2121
|
+
import { db } from "@/lib/db";
|
|
2122
|
+
import {
|
|
2123
|
+
tasks,
|
|
2124
|
+
schedules,
|
|
2125
|
+
projects,
|
|
2126
|
+
scheduleFiringMetrics,
|
|
2127
|
+
agentLogs,
|
|
2128
|
+
} from "@/lib/db/schema";
|
|
2129
|
+
import { eq } from "drizzle-orm";
|
|
2130
|
+
import { randomUUID } from "crypto";
|
|
2131
|
+
import { recordFiringMetrics } from "../scheduler";
|
|
2132
|
+
|
|
2133
|
+
describe("schedule_firing_metrics insertion", () => {
|
|
2134
|
+
beforeEach(() => {
|
|
2135
|
+
db.delete(scheduleFiringMetrics).run();
|
|
2136
|
+
db.delete(agentLogs).run();
|
|
2137
|
+
db.delete(tasks).run();
|
|
2138
|
+
db.delete(schedules).run();
|
|
2139
|
+
db.delete(projects).run();
|
|
2140
|
+
});
|
|
2141
|
+
|
|
2142
|
+
it("inserts a row for every firing with slot_wait_ms and duration_ms", async () => {
|
|
2143
|
+
const pid = randomUUID();
|
|
2144
|
+
const sid = randomUUID();
|
|
2145
|
+
const tid = randomUUID();
|
|
2146
|
+
const firedAt = new Date(Date.now() - 5000);
|
|
2147
|
+
const slotClaimedAt = new Date(Date.now() - 4000);
|
|
2148
|
+
const completedAt = new Date(Date.now() - 100);
|
|
2149
|
+
|
|
2150
|
+
db.insert(projects)
|
|
2151
|
+
.values({
|
|
2152
|
+
id: pid,
|
|
2153
|
+
name: "p",
|
|
2154
|
+
status: "active",
|
|
2155
|
+
createdAt: firedAt,
|
|
2156
|
+
updatedAt: firedAt,
|
|
2157
|
+
})
|
|
2158
|
+
.run();
|
|
2159
|
+
db.insert(schedules)
|
|
2160
|
+
.values({
|
|
2161
|
+
id: sid,
|
|
2162
|
+
projectId: pid,
|
|
2163
|
+
name: "test",
|
|
2164
|
+
prompt: "x",
|
|
2165
|
+
cronExpression: "* * * * *",
|
|
2166
|
+
status: "active",
|
|
2167
|
+
type: "scheduled",
|
|
2168
|
+
firingCount: 1,
|
|
2169
|
+
suppressionCount: 0,
|
|
2170
|
+
heartbeatSpentToday: 0,
|
|
2171
|
+
failureStreak: 0,
|
|
2172
|
+
turnBudgetBreachStreak: 0,
|
|
2173
|
+
maxTurns: 50,
|
|
2174
|
+
createdAt: firedAt,
|
|
2175
|
+
updatedAt: firedAt,
|
|
2176
|
+
})
|
|
2177
|
+
.run();
|
|
2178
|
+
db.insert(tasks)
|
|
2179
|
+
.values({
|
|
2180
|
+
id: tid,
|
|
2181
|
+
scheduleId: sid,
|
|
2182
|
+
title: "firing",
|
|
2183
|
+
status: "completed",
|
|
2184
|
+
priority: 2,
|
|
2185
|
+
sourceType: "scheduled",
|
|
2186
|
+
resumeCount: 0,
|
|
2187
|
+
slotClaimedAt,
|
|
2188
|
+
createdAt: firedAt,
|
|
2189
|
+
updatedAt: completedAt,
|
|
2190
|
+
})
|
|
2191
|
+
.run();
|
|
2192
|
+
for (let i = 0; i < 7; i++) {
|
|
2193
|
+
db.insert(agentLogs)
|
|
2194
|
+
.values({
|
|
2195
|
+
id: randomUUID(),
|
|
2196
|
+
taskId: tid,
|
|
2197
|
+
agentType: "test",
|
|
2198
|
+
event: "assistant_message",
|
|
2199
|
+
timestamp: new Date(),
|
|
2200
|
+
})
|
|
2201
|
+
.run();
|
|
2202
|
+
}
|
|
2203
|
+
|
|
2204
|
+
await recordFiringMetrics(sid, tid);
|
|
2205
|
+
|
|
2206
|
+
const rows = db
|
|
2207
|
+
.select()
|
|
2208
|
+
.from(scheduleFiringMetrics)
|
|
2209
|
+
.where(eq(scheduleFiringMetrics.scheduleId, sid))
|
|
2210
|
+
.all();
|
|
2211
|
+
|
|
2212
|
+
expect(rows.length).toBe(1);
|
|
2213
|
+
expect(rows[0].turnCount).toBe(7);
|
|
2214
|
+
expect(rows[0].maxTurnsAtFiring).toBe(50);
|
|
2215
|
+
expect(rows[0].slotWaitMs).toBeGreaterThan(0);
|
|
2216
|
+
expect(rows[0].durationMs).toBeGreaterThan(0);
|
|
2217
|
+
});
|
|
2218
|
+
});
|
|
2219
|
+
```
|
|
2220
|
+
|
|
2221
|
+
- [ ] **Step 11.2: Run to verify RED**
|
|
2222
|
+
|
|
2223
|
+
Run: `npx vitest run src/lib/schedules/__tests__/firing-metrics.test.ts`
|
|
2224
|
+
Expected: FAIL — no row inserted.
|
|
2225
|
+
|
|
2226
|
+
- [ ] **Step 11.3: Append metric insertion to recordFiringMetrics**
|
|
2227
|
+
|
|
2228
|
+
Edit `src/lib/schedules/scheduler.ts`. At the end of `recordFiringMetrics()` (after the schedule UPDATE), add the metric insertion:
|
|
2229
|
+
|
|
2230
|
+
```typescript
|
|
2231
|
+
try {
|
|
2232
|
+
const [taskRow] = await db
|
|
2233
|
+
.select()
|
|
2234
|
+
.from(tasks)
|
|
2235
|
+
.where(eq(tasks.id, taskId));
|
|
2236
|
+
if (taskRow) {
|
|
2237
|
+
const firedAtDate = taskRow.createdAt;
|
|
2238
|
+
const slotClaimedAt = taskRow.slotClaimedAt;
|
|
2239
|
+
const completedAt = taskRow.updatedAt;
|
|
2240
|
+
const slotWaitMs =
|
|
2241
|
+
slotClaimedAt && firedAtDate
|
|
2242
|
+
? slotClaimedAt.getTime() - firedAtDate.getTime()
|
|
2243
|
+
: null;
|
|
2244
|
+
const durationMs =
|
|
2245
|
+
slotClaimedAt && completedAt
|
|
2246
|
+
? completedAt.getTime() - slotClaimedAt.getTime()
|
|
2247
|
+
: null;
|
|
2248
|
+
|
|
2249
|
+
await db.insert(scheduleFiringMetrics).values({
|
|
2250
|
+
id: crypto.randomUUID(),
|
|
2251
|
+
scheduleId,
|
|
2252
|
+
taskId,
|
|
2253
|
+
firedAt: firedAtDate,
|
|
2254
|
+
slotClaimedAt,
|
|
2255
|
+
completedAt,
|
|
2256
|
+
slotWaitMs,
|
|
2257
|
+
durationMs,
|
|
2258
|
+
turnCount: turns,
|
|
2259
|
+
maxTurnsAtFiring: schedule.maxTurns,
|
|
2260
|
+
eventLoopLagMs: null,
|
|
2261
|
+
peakRssMb: null,
|
|
2262
|
+
chatStreamsActive: null,
|
|
2263
|
+
concurrentSchedules: null,
|
|
2264
|
+
failureReason,
|
|
2265
|
+
});
|
|
2266
|
+
}
|
|
2267
|
+
} catch (err) {
|
|
2268
|
+
console.error(`[scheduler] failed to insert firing metrics for ${taskId}:`, err);
|
|
2269
|
+
}
|
|
2270
|
+
```
|
|
2271
|
+
|
|
2272
|
+
Remember to import `scheduleFiringMetrics` at the top of `scheduler.ts`:
|
|
2273
|
+
|
|
2274
|
+
```typescript
|
|
2275
|
+
import { schedules, tasks, agentLogs, scheduleFiringMetrics, scheduleDocumentInputs, documents, workflows } from "@/lib/db/schema";
|
|
2276
|
+
```
|
|
2277
|
+
|
|
2278
|
+
- [ ] **Step 11.4: Run to verify GREEN**
|
|
2279
|
+
|
|
2280
|
+
Run: `npx vitest run src/lib/schedules/__tests__/firing-metrics.test.ts`
|
|
2281
|
+
Expected: PASS.
|
|
2282
|
+
|
|
2283
|
+
- [ ] **Step 11.5: Commit**
|
|
2284
|
+
|
|
2285
|
+
```bash
|
|
2286
|
+
git add src/lib/schedules/scheduler.ts src/lib/schedules/__tests__/firing-metrics.test.ts
|
|
2287
|
+
git commit -m "feat(schedules): insert schedule_firing_metrics rows for tuning + forensics"
|
|
2288
|
+
```
|
|
2289
|
+
|
|
2290
|
+
---
|
|
2291
|
+
|
|
2292
|
+
## Task 12: Collision warning helper + API wiring
|
|
2293
|
+
|
|
2294
|
+
**Files:**
|
|
2295
|
+
- Create: `src/lib/schedules/collision-check.ts`
|
|
2296
|
+
- Test: `src/lib/schedules/__tests__/collision-check.test.ts`
|
|
2297
|
+
- Modify: `src/app/api/schedules/route.ts`
|
|
2298
|
+
- Modify: `src/app/api/schedules/[id]/route.ts`
|
|
2299
|
+
|
|
2300
|
+
- [ ] **Step 12.1: Write failing collision-check tests**
|
|
2301
|
+
|
|
2302
|
+
Create `src/lib/schedules/__tests__/collision-check.test.ts`:
|
|
2303
|
+
|
|
2304
|
+
```typescript
|
|
2305
|
+
import { describe, it, expect, beforeEach } from "vitest";
|
|
2306
|
+
import { db } from "@/lib/db";
|
|
2307
|
+
import { schedules, projects } from "@/lib/db/schema";
|
|
2308
|
+
import { randomUUID } from "crypto";
|
|
2309
|
+
import { checkCollision } from "../collision-check";
|
|
2310
|
+
|
|
2311
|
+
function seedSchedule(opts: {
|
|
2312
|
+
cron: string;
|
|
2313
|
+
avgTurns: number;
|
|
2314
|
+
projectId: string;
|
|
2315
|
+
status?: "active" | "paused";
|
|
2316
|
+
}): string {
|
|
2317
|
+
const id = randomUUID();
|
|
2318
|
+
const now = new Date();
|
|
2319
|
+
db.insert(schedules)
|
|
2320
|
+
.values({
|
|
2321
|
+
id,
|
|
2322
|
+
projectId: opts.projectId,
|
|
2323
|
+
name: `s-${id.slice(0, 4)}`,
|
|
2324
|
+
prompt: "test",
|
|
2325
|
+
cronExpression: opts.cron,
|
|
2326
|
+
status: opts.status ?? "active",
|
|
2327
|
+
type: "scheduled",
|
|
2328
|
+
firingCount: 0,
|
|
2329
|
+
suppressionCount: 0,
|
|
2330
|
+
heartbeatSpentToday: 0,
|
|
2331
|
+
failureStreak: 0,
|
|
2332
|
+
turnBudgetBreachStreak: 0,
|
|
2333
|
+
avgTurnsPerFiring: opts.avgTurns,
|
|
2334
|
+
createdAt: now,
|
|
2335
|
+
updatedAt: now,
|
|
2336
|
+
})
|
|
2337
|
+
.run();
|
|
2338
|
+
return id;
|
|
2339
|
+
}
|
|
2340
|
+
|
|
2341
|
+
function seedProject(): string {
|
|
2342
|
+
const id = randomUUID();
|
|
2343
|
+
const now = new Date();
|
|
2344
|
+
db.insert(projects)
|
|
2345
|
+
.values({ id, name: "p", status: "active", createdAt: now, updatedAt: now })
|
|
2346
|
+
.run();
|
|
2347
|
+
return id;
|
|
2348
|
+
}
|
|
2349
|
+
|
|
2350
|
+
describe("checkCollision", () => {
|
|
2351
|
+
beforeEach(() => {
|
|
2352
|
+
db.delete(schedules).run();
|
|
2353
|
+
db.delete(projects).run();
|
|
2354
|
+
});
|
|
2355
|
+
|
|
2356
|
+
it("returns no warnings when no overlap exists", () => {
|
|
2357
|
+
const pid = seedProject();
|
|
2358
|
+
seedSchedule({ cron: "0 3 * * *", avgTurns: 500, projectId: pid });
|
|
2359
|
+
expect(checkCollision("0 15 * * *", 500, pid, null)).toEqual([]);
|
|
2360
|
+
});
|
|
2361
|
+
|
|
2362
|
+
it("detects overlap when two heavy schedules share a 5-min bucket", () => {
|
|
2363
|
+
const pid = seedProject();
|
|
2364
|
+
seedSchedule({ cron: "2 * * * *", avgTurns: 2000, projectId: pid });
|
|
2365
|
+
const warnings = checkCollision("0 * * * *", 2000, pid, null);
|
|
2366
|
+
expect(warnings.length).toBe(1);
|
|
2367
|
+
expect(warnings[0].type).toBe("cron_collision");
|
|
2368
|
+
expect(warnings[0].estimatedConcurrentSteps).toBeGreaterThanOrEqual(4000);
|
|
2369
|
+
});
|
|
2370
|
+
|
|
2371
|
+
it("ignores paused schedules", () => {
|
|
2372
|
+
const pid = seedProject();
|
|
2373
|
+
seedSchedule({
|
|
2374
|
+
cron: "2 * * * *",
|
|
2375
|
+
avgTurns: 2000,
|
|
2376
|
+
projectId: pid,
|
|
2377
|
+
status: "paused",
|
|
2378
|
+
});
|
|
2379
|
+
expect(checkCollision("0 * * * *", 2000, pid, null)).toEqual([]);
|
|
2380
|
+
});
|
|
2381
|
+
|
|
2382
|
+
it("excludes the excludeScheduleId (for PUT updates)", () => {
|
|
2383
|
+
const pid = seedProject();
|
|
2384
|
+
const existing = seedSchedule({
|
|
2385
|
+
cron: "0 * * * *",
|
|
2386
|
+
avgTurns: 3000,
|
|
2387
|
+
projectId: pid,
|
|
2388
|
+
});
|
|
2389
|
+
expect(checkCollision("0 * * * *", 3000, pid, existing)).toEqual([]);
|
|
2390
|
+
});
|
|
2391
|
+
|
|
2392
|
+
it("does not warn when combined steps are below the threshold", () => {
|
|
2393
|
+
const pid = seedProject();
|
|
2394
|
+
seedSchedule({ cron: "2 * * * *", avgTurns: 500, projectId: pid });
|
|
2395
|
+
expect(checkCollision("0 * * * *", 500, pid, null)).toEqual([]);
|
|
2396
|
+
});
|
|
2397
|
+
});
|
|
2398
|
+
```
|
|
2399
|
+
|
|
2400
|
+
- [ ] **Step 12.2: Run to verify RED**
|
|
2401
|
+
|
|
2402
|
+
Run: `npx vitest run src/lib/schedules/__tests__/collision-check.test.ts`
|
|
2403
|
+
Expected: FAIL — module doesn't exist.
|
|
2404
|
+
|
|
2405
|
+
- [ ] **Step 12.3: Implement collision-check**
|
|
2406
|
+
|
|
2407
|
+
Create `src/lib/schedules/collision-check.ts`:
|
|
2408
|
+
|
|
2409
|
+
```typescript
|
|
2410
|
+
import { db } from "@/lib/db";
|
|
2411
|
+
import { schedules } from "@/lib/db/schema";
|
|
2412
|
+
import { and, eq, ne } from "drizzle-orm";
|
|
2413
|
+
import { expandCronMinutes } from "./interval-parser";
|
|
2414
|
+
|
|
2415
|
+
const BUCKET_SIZE_MIN = 5;
|
|
2416
|
+
const COLLISION_THRESHOLD_TURNS = 3000;
|
|
2417
|
+
|
|
2418
|
+
export interface CronCollisionWarning {
|
|
2419
|
+
type: "cron_collision";
|
|
2420
|
+
overlappingSchedules: string[];
|
|
2421
|
+
overlappingMinutes: number[];
|
|
2422
|
+
estimatedConcurrentSteps: number;
|
|
2423
|
+
}
|
|
2424
|
+
|
|
2425
|
+
/**
|
|
2426
|
+
* Check if a candidate cron collides with existing active schedules in the
|
|
2427
|
+
* same project inside a 5-minute bucket, weighted by the sum of their
|
|
2428
|
+
* avgTurnsPerFiring. Warns only when combined weight exceeds 3000 steps.
|
|
2429
|
+
*
|
|
2430
|
+
* Passing an excludeScheduleId skips that schedule (for PUT flows where a
|
|
2431
|
+
* schedule should not collide with its own prior state).
|
|
2432
|
+
*
|
|
2433
|
+
* Deterministic — runs against nominal cron expansion, not chat-pressure
|
|
2434
|
+
* adjusted times.
|
|
2435
|
+
*/
|
|
2436
|
+
export function checkCollision(
|
|
2437
|
+
candidateCron: string,
|
|
2438
|
+
candidateAvgTurns: number,
|
|
2439
|
+
projectId: string | null,
|
|
2440
|
+
excludeScheduleId: string | null,
|
|
2441
|
+
): CronCollisionWarning[] {
|
|
2442
|
+
let candidateMinutes: number[];
|
|
2443
|
+
try {
|
|
2444
|
+
candidateMinutes = expandCronMinutes(candidateCron);
|
|
2445
|
+
} catch {
|
|
2446
|
+
return [];
|
|
2447
|
+
}
|
|
2448
|
+
|
|
2449
|
+
const candidateBuckets = new Set(
|
|
2450
|
+
candidateMinutes.map((m) => Math.floor(m / BUCKET_SIZE_MIN)),
|
|
2451
|
+
);
|
|
2452
|
+
|
|
2453
|
+
const conditions = [eq(schedules.status, "active")];
|
|
2454
|
+
if (projectId !== null) {
|
|
2455
|
+
conditions.push(eq(schedules.projectId, projectId));
|
|
2456
|
+
}
|
|
2457
|
+
if (excludeScheduleId !== null) {
|
|
2458
|
+
conditions.push(ne(schedules.id, excludeScheduleId));
|
|
2459
|
+
}
|
|
2460
|
+
|
|
2461
|
+
const others = db
|
|
2462
|
+
.select({
|
|
2463
|
+
id: schedules.id,
|
|
2464
|
+
name: schedules.name,
|
|
2465
|
+
cronExpression: schedules.cronExpression,
|
|
2466
|
+
avgTurnsPerFiring: schedules.avgTurnsPerFiring,
|
|
2467
|
+
})
|
|
2468
|
+
.from(schedules)
|
|
2469
|
+
.where(and(...conditions))
|
|
2470
|
+
.all();
|
|
2471
|
+
|
|
2472
|
+
const overlappingNames: string[] = [];
|
|
2473
|
+
const overlappingMinutesSet = new Set<number>();
|
|
2474
|
+
let totalOtherTurns = 0;
|
|
2475
|
+
|
|
2476
|
+
for (const other of others) {
|
|
2477
|
+
let otherMinutes: number[];
|
|
2478
|
+
try {
|
|
2479
|
+
otherMinutes = expandCronMinutes(other.cronExpression);
|
|
2480
|
+
} catch {
|
|
2481
|
+
continue;
|
|
2482
|
+
}
|
|
2483
|
+
const otherBuckets = new Set(
|
|
2484
|
+
otherMinutes.map((m) => Math.floor(m / BUCKET_SIZE_MIN)),
|
|
2485
|
+
);
|
|
2486
|
+
const sharedBuckets = [...otherBuckets].filter((b) =>
|
|
2487
|
+
candidateBuckets.has(b),
|
|
2488
|
+
);
|
|
2489
|
+
if (sharedBuckets.length > 0) {
|
|
2490
|
+
overlappingNames.push(other.name);
|
|
2491
|
+
totalOtherTurns += other.avgTurnsPerFiring ?? 0;
|
|
2492
|
+
for (const b of sharedBuckets) {
|
|
2493
|
+
overlappingMinutesSet.add(b * BUCKET_SIZE_MIN);
|
|
2494
|
+
}
|
|
2495
|
+
}
|
|
2496
|
+
}
|
|
2497
|
+
|
|
2498
|
+
const combinedTurns = candidateAvgTurns + totalOtherTurns;
|
|
2499
|
+
if (
|
|
2500
|
+
overlappingNames.length === 0 ||
|
|
2501
|
+
combinedTurns < COLLISION_THRESHOLD_TURNS
|
|
2502
|
+
) {
|
|
2503
|
+
return [];
|
|
2504
|
+
}
|
|
2505
|
+
|
|
2506
|
+
return [
|
|
2507
|
+
{
|
|
2508
|
+
type: "cron_collision",
|
|
2509
|
+
overlappingSchedules: overlappingNames,
|
|
2510
|
+
overlappingMinutes: [...overlappingMinutesSet].sort((a, b) => a - b),
|
|
2511
|
+
estimatedConcurrentSteps: combinedTurns,
|
|
2512
|
+
},
|
|
2513
|
+
];
|
|
2514
|
+
}
|
|
2515
|
+
```
|
|
2516
|
+
|
|
2517
|
+
- [ ] **Step 12.4: Run to verify GREEN**
|
|
2518
|
+
|
|
2519
|
+
Run: `npx vitest run src/lib/schedules/__tests__/collision-check.test.ts`
|
|
2520
|
+
Expected: PASS — 5 tests.
|
|
2521
|
+
|
|
2522
|
+
- [ ] **Step 12.5: Wire collision warnings into POST /api/schedules**
|
|
2523
|
+
|
|
2524
|
+
Edit `src/app/api/schedules/route.ts`. Add the import:
|
|
2525
|
+
|
|
2526
|
+
```typescript
|
|
2527
|
+
import { checkCollision } from "@/lib/schedules/collision-check";
|
|
2528
|
+
```
|
|
2529
|
+
|
|
2530
|
+
At the end of the POST handler, after the schedule is inserted, compute and attach warnings. Find the existing `return NextResponse.json(row)` and replace with:
|
|
2531
|
+
|
|
2532
|
+
```typescript
|
|
2533
|
+
const warnings = checkCollision(cronExpression, 0, projectId ?? null, null);
|
|
2534
|
+
return NextResponse.json({ schedule: row, warnings });
|
|
2535
|
+
```
|
|
2536
|
+
|
|
2537
|
+
- [ ] **Step 12.6: Wire collision warnings into PUT /api/schedules/:id**
|
|
2538
|
+
|
|
2539
|
+
Edit `src/app/api/schedules/[id]/route.ts`. Similar change at the end of the PUT handler:
|
|
2540
|
+
|
|
2541
|
+
```typescript
|
|
2542
|
+
const warnings = checkCollision(
|
|
2543
|
+
cronExpression,
|
|
2544
|
+
schedule.avgTurnsPerFiring ?? 0,
|
|
2545
|
+
schedule.projectId ?? null,
|
|
2546
|
+
schedule.id,
|
|
2547
|
+
);
|
|
2548
|
+
return NextResponse.json({ schedule: updatedRow, warnings });
|
|
2549
|
+
```
|
|
2550
|
+
|
|
2551
|
+
**Note:** these are response-shape changes. Existing consumers of these endpoints expect the schedule directly. The schedule form in Task 13 will read `res.schedule` and `res.warnings`. If preserving backwards compat is required, spread instead: `{ ...row, warnings }`.
|
|
2552
|
+
|
|
2553
|
+
- [ ] **Step 12.7: Run tests**
|
|
2554
|
+
|
|
2555
|
+
Run: `npx vitest run src/lib/schedules src/app/api/schedules`
|
|
2556
|
+
Expected: PASS.
|
|
2557
|
+
|
|
2558
|
+
- [ ] **Step 12.8: Commit**
|
|
2559
|
+
|
|
2560
|
+
```bash
|
|
2561
|
+
git add src/lib/schedules/collision-check.ts src/lib/schedules/__tests__/collision-check.test.ts src/app/api/schedules/route.ts src/app/api/schedules/[id]/route.ts
|
|
2562
|
+
git commit -m "feat(schedules): pre-flight cron collision warning at save time"
|
|
2563
|
+
```
|
|
2564
|
+
|
|
2565
|
+
---
|
|
2566
|
+
|
|
2567
|
+
## Task 13: Schedule form — rename + tooltips + calibration hint + warning banner
|
|
2568
|
+
|
|
2569
|
+
**Files:**
|
|
2570
|
+
- Modify: `src/components/schedules/schedule-form.tsx`
|
|
2571
|
+
- Modify: `src/components/schedules/schedule-create-sheet.tsx`
|
|
2572
|
+
- Modify: `src/components/schedules/schedule-edit-sheet.tsx`
|
|
2573
|
+
|
|
2574
|
+
This task is UI-heavy. Steps describe form changes without full component test coverage; smoke-test manually via `npm run dev`.
|
|
2575
|
+
|
|
2576
|
+
- [ ] **Step 13.1: Add Max agent steps field to schedule-form.tsx**
|
|
2577
|
+
|
|
2578
|
+
Edit `src/components/schedules/schedule-form.tsx`. Locate the form's state block and add:
|
|
2579
|
+
|
|
2580
|
+
```typescript
|
|
2581
|
+
const [maxTurns, setMaxTurns] = useState<number | null>(initial?.maxTurns ?? null);
|
|
2582
|
+
```
|
|
2583
|
+
|
|
2584
|
+
Add the form field near the existing budget/tuning fields:
|
|
2585
|
+
|
|
2586
|
+
```tsx
|
|
2587
|
+
<div className="space-y-2">
|
|
2588
|
+
<Label htmlFor="max-turns">Max agent steps per run</Label>
|
|
2589
|
+
<Input
|
|
2590
|
+
id="max-turns"
|
|
2591
|
+
type="number"
|
|
2592
|
+
min={1}
|
|
2593
|
+
max={10000}
|
|
2594
|
+
placeholder="Inherits global default"
|
|
2595
|
+
value={maxTurns ?? ""}
|
|
2596
|
+
onChange={(e) =>
|
|
2597
|
+
setMaxTurns(e.target.value ? parseInt(e.target.value, 10) : null)
|
|
2598
|
+
}
|
|
2599
|
+
/>
|
|
2600
|
+
<p className="text-muted-foreground text-xs">
|
|
2601
|
+
One step = one agent action (message, tool call, or sub-response). Most
|
|
2602
|
+
schedules use 50–500 steps; heavy research runs 2,000+.
|
|
2603
|
+
</p>
|
|
2604
|
+
</div>
|
|
2605
|
+
```
|
|
2606
|
+
|
|
2607
|
+
Include `maxTurns` in the submit payload sent to `/api/schedules`.
|
|
2608
|
+
|
|
2609
|
+
- [ ] **Step 13.2: Add prompt-field tooltip**
|
|
2610
|
+
|
|
2611
|
+
Near the prompt textarea, add below it:
|
|
2612
|
+
|
|
2613
|
+
```tsx
|
|
2614
|
+
<p className="text-muted-foreground text-xs">
|
|
2615
|
+
Note: writing "MAX N turns" in your prompt is a hint to the model,
|
|
2616
|
+
not a runtime limit. Use <strong>Max agent steps</strong> below to enforce
|
|
2617
|
+
a budget.
|
|
2618
|
+
</p>
|
|
2619
|
+
```
|
|
2620
|
+
|
|
2621
|
+
- [ ] **Step 13.3: Add inline calibration hint**
|
|
2622
|
+
|
|
2623
|
+
Below the prompt field, add a calibration hint that reads from the schedule list if a similar schedule exists (same `agentProfile` and non-null `avgTurnsPerFiring`). For v1, compute client-side:
|
|
2624
|
+
|
|
2625
|
+
```tsx
|
|
2626
|
+
{suggestedSteps !== null && (
|
|
2627
|
+
<p className="text-muted-foreground text-xs">
|
|
2628
|
+
Schedules like this average ~{suggestedSteps} steps.
|
|
2629
|
+
</p>
|
|
2630
|
+
)}
|
|
2631
|
+
```
|
|
2632
|
+
|
|
2633
|
+
`suggestedSteps` is a `useMemo` over the schedule list prop (or fetched once on mount) — pick the median `avgTurnsPerFiring` among existing schedules with the same `agentProfile`.
|
|
2634
|
+
|
|
2635
|
+
- [ ] **Step 13.4: Render collision warning banner in sheets**
|
|
2636
|
+
|
|
2637
|
+
Edit `src/components/schedules/schedule-create-sheet.tsx`. After the POST call, read `res.warnings` from the response. If non-empty, render an amber banner inside `SheetContent` (remember the recurring shadcn issue: body must have `px-6`):
|
|
2638
|
+
|
|
2639
|
+
```tsx
|
|
2640
|
+
{warnings.length > 0 && (
|
|
2641
|
+
<div className="mx-6 mb-4 rounded-lg border border-amber-500/40 bg-amber-50 p-3 text-sm">
|
|
2642
|
+
<p className="font-medium text-amber-900">
|
|
2643
|
+
Overlap detected with: {warnings[0].overlappingSchedules.join(", ")}
|
|
2644
|
+
</p>
|
|
2645
|
+
<p className="text-amber-800">
|
|
2646
|
+
Combined load: ~{warnings[0].estimatedConcurrentSteps} agent steps.
|
|
2647
|
+
Schedules will take turns; the last to run may be delayed.
|
|
2648
|
+
</p>
|
|
2649
|
+
</div>
|
|
2650
|
+
)}
|
|
2651
|
+
```
|
|
2652
|
+
|
|
2653
|
+
Do the same in `schedule-edit-sheet.tsx`.
|
|
2654
|
+
|
|
2655
|
+
Also update the POST/PUT result handlers to read `res.schedule` instead of the top-level response (response shape changed in Task 12).
|
|
2656
|
+
|
|
2657
|
+
- [ ] **Step 13.5: Smoke test**
|
|
2658
|
+
|
|
2659
|
+
Run: `npm run dev`
|
|
2660
|
+
Navigate to `/schedules`, create two schedules with overlapping crons (e.g. both at `0 * * * *` with high avgTurns seeded manually in the DB), verify the warning banner renders. Submit with a `maxTurns` value set and verify it persists via `SELECT max_turns FROM schedules` in sqlite CLI.
|
|
2661
|
+
|
|
2662
|
+
- [ ] **Step 13.6: Commit**
|
|
2663
|
+
|
|
2664
|
+
```bash
|
|
2665
|
+
git add src/components/schedules/schedule-form.tsx src/components/schedules/schedule-create-sheet.tsx src/components/schedules/schedule-edit-sheet.tsx
|
|
2666
|
+
git commit -m "feat(schedules): max agent steps field + tooltips + collision warning banner"
|
|
2667
|
+
```
|
|
2668
|
+
|
|
2669
|
+
---
|
|
2670
|
+
|
|
2671
|
+
## Task 14: Technical decision records (TDRs)
|
|
2672
|
+
|
|
2673
|
+
**Files:**
|
|
2674
|
+
- Create: `.claude/skills/architect/references/tdr-atomic-slot-claim.md`
|
|
2675
|
+
- Create: `.claude/skills/architect/references/tdr-evidence-based-cap.md`
|
|
2676
|
+
- Create: `.claude/skills/architect/references/tdr-failure-class-streaks.md`
|
|
2677
|
+
- Create: `.claude/skills/architect/references/tdr-manual-honors-cap.md`
|
|
2678
|
+
- Create: `.claude/skills/architect/references/tdr-lock-holders-leased.md`
|
|
2679
|
+
- Create: `.claude/skills/architect/references/tdr-chat-shares-event-loop.md`
|
|
2680
|
+
|
|
2681
|
+
Each TDR is a short markdown file capturing the architectural principle and its motivating incident. Pattern: Title / Status / Date / Context / Decision / Consequences.
|
|
2682
|
+
|
|
2683
|
+
- [ ] **Step 14.1: TDR 1 — atomic slot claim**
|
|
2684
|
+
|
|
2685
|
+
Create `.claude/skills/architect/references/tdr-atomic-slot-claim.md` with this body:
|
|
2686
|
+
|
|
2687
|
+
```markdown
|
|
2688
|
+
# TDR: Concurrency slot claim is a single SQL statement, not check-then-act
|
|
2689
|
+
|
|
2690
|
+
**Status:** Accepted
|
|
2691
|
+
**Date:** 2026-04-08
|
|
2692
|
+
**Incident:** 2026-04-08 schedule starvation (5 concurrent firings consumed ~12,600 turns, killed chat SSE)
|
|
2693
|
+
|
|
2694
|
+
## Context
|
|
2695
|
+
|
|
2696
|
+
The scheduler has two concurrent coordination points: `tickScheduler()` (the poll loop) and `drainQueue()` (the post-completion chain at `src/lib/schedules/scheduler.ts:420`). Both need to check "is the global cap full?" before firing a new task. A naive SELECT then INSERT across these two entry points races and allows the cap to be exceeded.
|
|
2697
|
+
|
|
2698
|
+
## Decision
|
|
2699
|
+
|
|
2700
|
+
The slot claim MUST be a single SQL statement. We use an atomic conditional UPDATE with a subquery inside the WHERE clause, exploiting SQLite's serialized write lock to guarantee two concurrent claim attempts cannot both succeed.
|
|
2701
|
+
|
|
2702
|
+
The implementation lives in `src/lib/schedules/slot-claim.ts`.
|
|
2703
|
+
|
|
2704
|
+
## Consequences
|
|
2705
|
+
|
|
2706
|
+
- Future coordination primitives must also use single-statement atomic claims. Never SELECT then UPDATE.
|
|
2707
|
+
- The approach is SQLite-specific. If the backend moves to Postgres, revisit with SELECT ... FOR UPDATE or advisory locks.
|
|
2708
|
+
- `changes = 0` is a normal outcome meaning "lost the race" — callers must handle it as "leave in queued, retry via drain."
|
|
2709
|
+
```
|
|
2710
|
+
|
|
2711
|
+
- [ ] **Step 14.2: TDR 2 — evidence-based cap**
|
|
2712
|
+
|
|
2713
|
+
Create `.claude/skills/architect/references/tdr-evidence-based-cap.md`:
|
|
2714
|
+
|
|
2715
|
+
```markdown
|
|
2716
|
+
# TDR: Scheduler cap is static and evidence-based
|
|
2717
|
+
|
|
2718
|
+
**Status:** Accepted
|
|
2719
|
+
**Date:** 2026-04-08
|
|
2720
|
+
|
|
2721
|
+
## Context
|
|
2722
|
+
|
|
2723
|
+
The 2026-04-08 incident showed 5 concurrent schedules starved the chat SSE stream. The cap of 2 (later 3) was chosen as a guess, not a measurement. Without `schedule_firing_metrics` we have no way to validate or refine it.
|
|
2724
|
+
|
|
2725
|
+
## Decision
|
|
2726
|
+
|
|
2727
|
+
The cap starts at 2 and is raised to 3 only after one week of `schedule_firing_metrics` telemetry shows:
|
|
2728
|
+
- Chat SSE P99 first-token latency stays below 2 seconds
|
|
2729
|
+
- `event_loop_lag_ms` p99 stays below 50ms
|
|
2730
|
+
- `slot_wait_ms` p95 stays below 60s under typical load
|
|
2731
|
+
|
|
2732
|
+
Any future change to the cap requires re-running the validation against the metrics table.
|
|
2733
|
+
|
|
2734
|
+
## Consequences
|
|
2735
|
+
|
|
2736
|
+
- `schedule_firing_metrics` is load-bearing. Never cut it from follow-up specs.
|
|
2737
|
+
- Dynamic cap adjustment is deferred until the static cap proves insufficient. Dynamic control loops have failure modes (oscillation, thundering herd) that don't belong in a first ship.
|
|
2738
|
+
```
|
|
2739
|
+
|
|
2740
|
+
- [ ] **Step 14.3: TDR 3 — failure class streaks**
|
|
2741
|
+
|
|
2742
|
+
Create `.claude/skills/architect/references/tdr-failure-class-streaks.md`:
|
|
2743
|
+
|
|
2744
|
+
```markdown
|
|
2745
|
+
# TDR: Auto-pause streak counts per failure class
|
|
2746
|
+
|
|
2747
|
+
**Status:** Accepted
|
|
2748
|
+
**Date:** 2026-04-08
|
|
2749
|
+
|
|
2750
|
+
## Context
|
|
2751
|
+
|
|
2752
|
+
The original scheduler had a single `failureStreak` that tripped auto-pause after 3 consecutive failures regardless of cause. Sharing this counter across genuinely-failing runs and misconfigured `maxTurns` values is a footgun: a user who sets `maxTurns=10` on a schedule averaging 40 would trip auto-pause in 3 firings — potentially within 3 minutes on a `* * * * *` cron — before they realized the config took effect.
|
|
2753
|
+
|
|
2754
|
+
## Decision
|
|
2755
|
+
|
|
2756
|
+
Split the streak counter per failure class:
|
|
2757
|
+
- `failureStreak` — generic failures (SDK error, timeout, auth, etc.). Auto-pause threshold: 3.
|
|
2758
|
+
- `turnBudgetBreachStreak` — turn-limit exceeded. Auto-pause threshold: 5, with first-breach grace: breaches in the first 2 cron intervals after a `maxTurnsSetAt` edit are logged only.
|
|
2759
|
+
|
|
2760
|
+
Future failure modes (e.g. context window exceeded, MCP tool failures) should each get their own counter if the appropriate auto-pause threshold differs from the generic 3.
|
|
2761
|
+
|
|
2762
|
+
## Consequences
|
|
2763
|
+
|
|
2764
|
+
- `schedules` schema grows one counter column per named failure class.
|
|
2765
|
+
- The runtime adapter must write explicit `failure_reason` at terminal transitions so the classifier has reliable input — string-matching error text is fragile.
|
|
2766
|
+
```
|
|
2767
|
+
|
|
2768
|
+
- [ ] **Step 14.4: TDR 4 — manual honors cap**
|
|
2769
|
+
|
|
2770
|
+
Create `.claude/skills/architect/references/tdr-manual-honors-cap.md`:
|
|
2771
|
+
|
|
2772
|
+
```markdown
|
|
2773
|
+
# TDR: Manual execute honors the global cap by default
|
|
2774
|
+
|
|
2775
|
+
**Status:** Accepted
|
|
2776
|
+
**Date:** 2026-04-08
|
|
2777
|
+
|
|
2778
|
+
## Context
|
|
2779
|
+
|
|
2780
|
+
Operational controls like "Run now" buttons are tempting to implement as cap-bypassing shortcuts, but a user who clicks them 5 times in 2 seconds can reproduce the exact incident profile that motivated the cap in the first place (2026-04-08: 5 concurrent Opus runs, ~12,600 turns, starved chat).
|
|
2781
|
+
|
|
2782
|
+
## Decision
|
|
2783
|
+
|
|
2784
|
+
`POST /api/schedules/:id/execute` honors `SCHEDULE_MAX_CONCURRENT` by default. When the cap is full, return `429` with an ETA for the next free slot. An explicit `?force=true` query parameter bypasses the cap, logged to `usage_ledger` as `activityType='manual_force_bypass'` for audit.
|
|
2785
|
+
|
|
2786
|
+
## Consequences
|
|
2787
|
+
|
|
2788
|
+
- Future operational endpoints (bulk re-run, workflow force-trigger) should follow the same pattern: honor cap + explicit force flag + audit log.
|
|
2789
|
+
- Users who genuinely need rapid-fire execution have an escape hatch, but the happy path defaults to safety.
|
|
2790
|
+
- Audit log entries can be queried to detect abusive or automated bypass patterns.
|
|
2791
|
+
```
|
|
2792
|
+
|
|
2793
|
+
- [ ] **Step 14.5: TDR 5 — lock holders leased**
|
|
2794
|
+
|
|
2795
|
+
Create `.claude/skills/architect/references/tdr-lock-holders-leased.md`:
|
|
2796
|
+
|
|
2797
|
+
```markdown
|
|
2798
|
+
# TDR: All lock holders carry lease expiries + reapers
|
|
2799
|
+
|
|
2800
|
+
**Status:** Accepted
|
|
2801
|
+
**Date:** 2026-04-08
|
|
2802
|
+
|
|
2803
|
+
## Context
|
|
2804
|
+
|
|
2805
|
+
A hung SDK call can permanently wedge any lock: group locks, concurrency slots, even the existing per-schedule claim (which sets `nextFireAt = NULL` as a lock at `src/lib/schedules/scheduler.ts:240`; if `fireSchedule` throws before writing the new `nextFireAt`, the schedule is stuck until process restart).
|
|
2806
|
+
|
|
2807
|
+
## Decision
|
|
2808
|
+
|
|
2809
|
+
Every lock primitive in the scheduler pipeline must carry a lease expiry and a reaper:
|
|
2810
|
+
1. **Concurrency slots** — `tasks.lease_expires_at` reaped at each `tickScheduler()` call. Expired leases are aborted via the execution-manager AbortController and marked failed/lease_expired.
|
|
2811
|
+
2. **Per-schedule claim** — currently relies on `bootstrapNextFireTimes()` at startup; future work should add a time-based reaper.
|
|
2812
|
+
3. **New locks** — any future coordination primitive must ship with a reaper from day one.
|
|
2813
|
+
|
|
2814
|
+
Default lease: 20 minutes. Override per-schedule via `schedules.max_run_duration_sec`.
|
|
2815
|
+
|
|
2816
|
+
## Consequences
|
|
2817
|
+
|
|
2818
|
+
- Lock holders cannot rely on "the other code path will clean this up." Every claim must be either released normally (on completion) or reaped (on lease expiry).
|
|
2819
|
+
- The reaper is idempotent — safe to run at every tick.
|
|
2820
|
+
- Aborting via AbortController requires the runtime adapter to honor the signal; all SDK query calls must pass through the abort controller from execution-manager.
|
|
2821
|
+
```
|
|
2822
|
+
|
|
2823
|
+
- [ ] **Step 14.6: TDR 6 — chat shares event loop**
|
|
2824
|
+
|
|
2825
|
+
Create `.claude/skills/architect/references/tdr-chat-shares-event-loop.md`:
|
|
2826
|
+
|
|
2827
|
+
```markdown
|
|
2828
|
+
# TDR: Chat and scheduled agents compete for the same Node event loop
|
|
2829
|
+
|
|
2830
|
+
**Status:** Accepted
|
|
2831
|
+
**Date:** 2026-04-08
|
|
2832
|
+
|
|
2833
|
+
## Context
|
|
2834
|
+
|
|
2835
|
+
Stagent runs chat and scheduled tasks in the same Node process, on the same event loop. The 2026-04-08 incident showed this is a critical architectural constraint: when 5 schedules saturated the event loop, a user's chat SSE stream was starved and dropped mid-stream.
|
|
2836
|
+
|
|
2837
|
+
## Decision
|
|
2838
|
+
|
|
2839
|
+
This is a known and intentional constraint until a worker-thread isolation architecture is designed. Any feature that adds agent-like workloads (image pipelines, MCP servers, streaming tools) must assume chat is on the critical path and must not starve it.
|
|
2840
|
+
|
|
2841
|
+
Mitigations:
|
|
2842
|
+
1. Global concurrency cap limits scheduled agents to `SCHEDULE_MAX_CONCURRENT` (default 2).
|
|
2843
|
+
2. Chat soft pressure signal — when chat is streaming, the scheduler defers new firings by 30s (`src/lib/chat/active-streams.ts` + `scheduler.ts:applyChatPressure`).
|
|
2844
|
+
3. Spec B hotfix guarantees chat messages never persist as empty content even under worst-case contention.
|
|
2845
|
+
|
|
2846
|
+
## Consequences
|
|
2847
|
+
|
|
2848
|
+
- Future high-throughput features must evaluate event-loop impact before shipping.
|
|
2849
|
+
- Worker-thread isolation is tracked as an architectural follow-up. This TDR is the anchor point for that future work.
|
|
2850
|
+
- Profiling under load should measure `event_loop_lag_ms` and alert when p99 exceeds 50ms.
|
|
2851
|
+
```
|
|
2852
|
+
|
|
2853
|
+
- [ ] **Step 14.7: Commit**
|
|
2854
|
+
|
|
2855
|
+
```bash
|
|
2856
|
+
git add .claude/skills/architect/references/tdr-atomic-slot-claim.md .claude/skills/architect/references/tdr-evidence-based-cap.md .claude/skills/architect/references/tdr-failure-class-streaks.md .claude/skills/architect/references/tdr-manual-honors-cap.md .claude/skills/architect/references/tdr-lock-holders-leased.md .claude/skills/architect/references/tdr-chat-shares-event-loop.md
|
|
2857
|
+
git commit -m "docs(architect): 6 TDRs for schedule orchestration principles"
|
|
2858
|
+
```
|
|
2859
|
+
|
|
2860
|
+
---
|
|
2861
|
+
|
|
2862
|
+
## Task 15: End-to-end integration test
|
|
2863
|
+
|
|
2864
|
+
**Files:**
|
|
2865
|
+
- Create: `src/lib/schedules/__tests__/integration.test.ts`
|
|
2866
|
+
|
|
2867
|
+
Final integration test validating that the full cap + queue + reap path composes correctly. No Opus calls — the runtime is mocked.
|
|
2868
|
+
|
|
2869
|
+
- [ ] **Step 15.1: Write integration test**
|
|
2870
|
+
|
|
2871
|
+
Create `src/lib/schedules/__tests__/integration.test.ts`:
|
|
2872
|
+
|
|
2873
|
+
```typescript
|
|
2874
|
+
import { describe, it, expect, beforeEach, vi } from "vitest";
|
|
2875
|
+
import { db } from "@/lib/db";
|
|
2876
|
+
import {
|
|
2877
|
+
tasks,
|
|
2878
|
+
schedules,
|
|
2879
|
+
projects,
|
|
2880
|
+
settings,
|
|
2881
|
+
scheduleFiringMetrics,
|
|
2882
|
+
agentLogs,
|
|
2883
|
+
} from "@/lib/db/schema";
|
|
2884
|
+
import { eq } from "drizzle-orm";
|
|
2885
|
+
import { randomUUID } from "crypto";
|
|
2886
|
+
import { tickScheduler } from "../scheduler";
|
|
2887
|
+
import { countRunningScheduledSlots } from "../slot-claim";
|
|
2888
|
+
|
|
2889
|
+
vi.mock("@/lib/agents/runtime", () => ({
|
|
2890
|
+
executeTaskWithRuntime: vi.fn(async () => {
|
|
2891
|
+
// Simulate a short-running task
|
|
2892
|
+
await new Promise((r) => setTimeout(r, 20));
|
|
2893
|
+
}),
|
|
2894
|
+
}));
|
|
2895
|
+
|
|
2896
|
+
describe("schedule orchestration end-to-end", () => {
|
|
2897
|
+
beforeEach(() => {
|
|
2898
|
+
db.delete(scheduleFiringMetrics).run();
|
|
2899
|
+
db.delete(agentLogs).run();
|
|
2900
|
+
db.delete(tasks).run();
|
|
2901
|
+
db.delete(schedules).run();
|
|
2902
|
+
db.delete(projects).run();
|
|
2903
|
+
db.delete(settings).where(eq(settings.key, "schedule.maxConcurrent")).run();
|
|
2904
|
+
db.insert(settings)
|
|
2905
|
+
.values({ key: "schedule.maxConcurrent", value: "2", updatedAt: new Date() })
|
|
2906
|
+
.run();
|
|
2907
|
+
});
|
|
2908
|
+
|
|
2909
|
+
it("5 schedules firing at once → exactly 2 run, 3 queue", async () => {
|
|
2910
|
+
const pid = randomUUID();
|
|
2911
|
+
const now = new Date();
|
|
2912
|
+
db.insert(projects)
|
|
2913
|
+
.values({ id: pid, name: "p", status: "active", createdAt: now, updatedAt: now })
|
|
2914
|
+
.run();
|
|
2915
|
+
|
|
2916
|
+
const past = new Date(now.getTime() - 10_000);
|
|
2917
|
+
for (let i = 0; i < 5; i++) {
|
|
2918
|
+
db.insert(schedules)
|
|
2919
|
+
.values({
|
|
2920
|
+
id: randomUUID(),
|
|
2921
|
+
projectId: pid,
|
|
2922
|
+
name: `sched-${i}`,
|
|
2923
|
+
prompt: "test",
|
|
2924
|
+
cronExpression: "* * * * *",
|
|
2925
|
+
status: "active",
|
|
2926
|
+
type: "scheduled",
|
|
2927
|
+
firingCount: 0,
|
|
2928
|
+
suppressionCount: 0,
|
|
2929
|
+
heartbeatSpentToday: 0,
|
|
2930
|
+
failureStreak: 0,
|
|
2931
|
+
turnBudgetBreachStreak: 0,
|
|
2932
|
+
nextFireAt: past,
|
|
2933
|
+
createdAt: now,
|
|
2934
|
+
updatedAt: now,
|
|
2935
|
+
})
|
|
2936
|
+
.run();
|
|
2937
|
+
}
|
|
2938
|
+
|
|
2939
|
+
await tickScheduler();
|
|
2940
|
+
|
|
2941
|
+
expect(countRunningScheduledSlots()).toBe(2);
|
|
2942
|
+
const queued = db
|
|
2943
|
+
.select()
|
|
2944
|
+
.from(tasks)
|
|
2945
|
+
.where(eq(tasks.status, "queued"))
|
|
2946
|
+
.all();
|
|
2947
|
+
expect(queued.length).toBe(3);
|
|
2948
|
+
});
|
|
2949
|
+
});
|
|
2950
|
+
```
|
|
2951
|
+
|
|
2952
|
+
- [ ] **Step 15.2: Run and verify PASS**
|
|
2953
|
+
|
|
2954
|
+
Run: `npx vitest run src/lib/schedules/__tests__/integration.test.ts`
|
|
2955
|
+
Expected: PASS.
|
|
2956
|
+
|
|
2957
|
+
- [ ] **Step 15.3: Run full suite for final regression check**
|
|
2958
|
+
|
|
2959
|
+
Run: `npx vitest run`
|
|
2960
|
+
Expected: PASS across all test files.
|
|
2961
|
+
|
|
2962
|
+
- [ ] **Step 15.4: Commit**
|
|
2963
|
+
|
|
2964
|
+
```bash
|
|
2965
|
+
git add src/lib/schedules/__tests__/integration.test.ts
|
|
2966
|
+
git commit -m "test(schedules): end-to-end cap + queue integration test"
|
|
2967
|
+
```
|
|
2968
|
+
|
|
2969
|
+
---
|
|
2970
|
+
|
|
2971
|
+
## Final verification
|
|
2972
|
+
|
|
2973
|
+
After all tasks complete:
|
|
2974
|
+
|
|
2975
|
+
- [ ] **Full test suite**: `npx vitest run` — all green, no regressions
|
|
2976
|
+
- [ ] **TypeScript check**: `npx tsc --noEmit` — zero errors
|
|
2977
|
+
- [ ] **Manual smoke test**: `npm run dev`, create 3 schedules with overlapping crons, observe that cap enforcement kicks in and that queued schedules drain sequentially
|
|
2978
|
+
- [ ] **Incident reproduction**: manually fire 5 schedules via `POST /api/schedules/:id/execute?force=true` in rapid succession, send a chat message, confirm the chat SSE stream stays responsive and no empty-content rows are left behind
|
|
2979
|
+
- [ ] **Telemetry check**: query `SELECT * FROM schedule_firing_metrics` — confirm rows exist with non-null `slot_wait_ms` and `duration_ms`
|
|
2980
|
+
- [ ] **Roadmap update**: append a "Schedule Orchestration Resilience" subsection to `features/roadmap.md` with A/B/C completed entries plus future `schedule-collision-prevention` and `schedule-forecasting` entries
|
|
2981
|
+
- [ ] **Ship**: push the branch, open a PR, wait for CI, merge. Leave `SCHEDULE_MAX_CONCURRENT=2` for the first week of telemetry
|
|
2982
|
+
|
|
2983
|
+
If all items pass, the feature is ready to ship. After one week of telemetry showing chat SSE p99 < 2s and event-loop lag p99 < 50ms, raise `SCHEDULE_MAX_CONCURRENT` from 2 to 3 and continue monitoring.
|