@checkstack/automation-backend 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (125) hide show
  1. package/CHANGELOG.md +544 -0
  2. package/drizzle/0003_sparkling_xorn.sql +17 -0
  3. package/drizzle/0004_cultured_spyke.sql +2 -0
  4. package/drizzle/0005_classy_the_hand.sql +19 -0
  5. package/drizzle/0006_burly_wallop.sql +10 -0
  6. package/drizzle/0007_nappy_jackal.sql +1 -0
  7. package/drizzle/0008_remove_seeded_auto_incident_automations.sql +13 -0
  8. package/drizzle/0009_steady_liz_osborn.sql +12 -0
  9. package/drizzle/0010_chunky_changeling.sql +2 -0
  10. package/drizzle/meta/0003_snapshot.json +1007 -0
  11. package/drizzle/meta/0004_snapshot.json +1028 -0
  12. package/drizzle/meta/0005_snapshot.json +1164 -0
  13. package/drizzle/meta/0006_snapshot.json +1261 -0
  14. package/drizzle/meta/0007_snapshot.json +1215 -0
  15. package/drizzle/meta/0008_snapshot.json +1215 -0
  16. package/drizzle/meta/0009_snapshot.json +1328 -0
  17. package/drizzle/meta/0010_snapshot.json +1349 -0
  18. package/drizzle/meta/_journal.json +56 -0
  19. package/package.json +23 -12
  20. package/src/action-types.ts +23 -0
  21. package/src/artifact-store.ts +16 -1
  22. package/src/automation-store.test.ts +143 -0
  23. package/src/automation-store.ts +30 -8
  24. package/src/builtin-triggers.test.ts +77 -74
  25. package/src/builtin-triggers.ts +105 -108
  26. package/src/dispatch/action-kind.ts +2 -0
  27. package/src/dispatch/assemble-get-service.ts +31 -0
  28. package/src/dispatch/cancel-resurrect.test.ts +147 -0
  29. package/src/dispatch/concurrency-race.test.ts +255 -0
  30. package/src/dispatch/concurrency-scope.test.ts +166 -0
  31. package/src/dispatch/condition.ts +24 -5
  32. package/src/dispatch/dwell-queue.ts +65 -0
  33. package/src/dispatch/dwell-store.ts +154 -0
  34. package/src/dispatch/dwell.it.test.ts +142 -0
  35. package/src/dispatch/dwell.test.ts +799 -0
  36. package/src/dispatch/dwell.ts +257 -0
  37. package/src/dispatch/engine.test.ts +189 -2
  38. package/src/dispatch/engine.ts +555 -9
  39. package/src/dispatch/entity-scope.test.ts +176 -0
  40. package/src/dispatch/get-service-wiring.test.ts +318 -0
  41. package/src/dispatch/numeric.test.ts +71 -0
  42. package/src/dispatch/numeric.ts +96 -0
  43. package/src/dispatch/render.test.ts +34 -0
  44. package/src/dispatch/render.ts +31 -11
  45. package/src/dispatch/reseed-run-secrets.ts +230 -0
  46. package/src/dispatch/run-secret-registry.test.ts +189 -0
  47. package/src/dispatch/run-secret-registry.ts +247 -0
  48. package/src/dispatch/run-state-masking.test.ts +376 -0
  49. package/src/dispatch/run-state-store.ts +95 -38
  50. package/src/dispatch/run-state.ts +226 -59
  51. package/src/dispatch/scope-artifact-masking.test.ts +138 -0
  52. package/src/dispatch/secret-ref-ids.test.ts +19 -0
  53. package/src/dispatch/secret-ref-ids.ts +17 -0
  54. package/src/dispatch/snapshots.test.ts +86 -0
  55. package/src/dispatch/snapshots.ts +79 -0
  56. package/src/dispatch/stage1-router.test.ts +324 -0
  57. package/src/dispatch/stage1-router.ts +152 -0
  58. package/src/dispatch/stage1.it.test.ts +84 -0
  59. package/src/dispatch/stage2-dispatch.test.ts +285 -0
  60. package/src/dispatch/stage2-dispatch.ts +207 -0
  61. package/src/dispatch/stage2-stalled.it.test.ts +132 -0
  62. package/src/dispatch/stalled-sweeper.test.ts +197 -0
  63. package/src/dispatch/stalled-sweeper.ts +112 -5
  64. package/src/dispatch/state-scope.test.ts +234 -0
  65. package/src/dispatch/state-scope.ts +322 -0
  66. package/src/dispatch/structured-conditions.test.ts +246 -0
  67. package/src/dispatch/structured-conditions.ts +146 -0
  68. package/src/dispatch/test-fixtures.ts +306 -38
  69. package/src/dispatch/trigger-fanin.test.ts +111 -0
  70. package/src/dispatch/trigger-subscriber.ts +316 -14
  71. package/src/dispatch/types.ts +263 -8
  72. package/src/dispatch/wait-timeout-queue.ts +89 -0
  73. package/src/dispatch/wait-until-entity-wake.test.ts +544 -0
  74. package/src/dispatch/wait-until.test.ts +540 -0
  75. package/src/dispatch/wake-refs.test.ts +158 -0
  76. package/src/dispatch/wake-refs.ts +348 -0
  77. package/src/dispatch/window-gate.test.ts +513 -0
  78. package/src/dispatch/window-store.test.ts +162 -0
  79. package/src/dispatch/window-store.ts +102 -0
  80. package/src/entity/change-derivers.test.ts +148 -0
  81. package/src/entity/change-derivers.ts +143 -0
  82. package/src/entity/change-emitter.test.ts +66 -0
  83. package/src/entity/change-emitter.ts +76 -0
  84. package/src/entity/create-handle.ts +344 -0
  85. package/src/entity/cross-pod-read-consistency.it.test.ts +281 -0
  86. package/src/entity/define-entity.ts +157 -0
  87. package/src/entity/diff.test.ts +57 -0
  88. package/src/entity/diff.ts +54 -0
  89. package/src/entity/entity-store.test.ts +30 -0
  90. package/src/entity/entity-store.ts +171 -0
  91. package/src/entity/extension-point.ts +56 -0
  92. package/src/entity/fake-entity-store.ts +130 -0
  93. package/src/entity/hook.ts +19 -0
  94. package/src/entity/index.ts +50 -0
  95. package/src/entity/mutate-handle.test.ts +517 -0
  96. package/src/entity/on-entity-changed.test.ts +189 -0
  97. package/src/entity/on-entity-changed.ts +214 -0
  98. package/src/entity/registry.test.ts +181 -0
  99. package/src/entity/registry.ts +200 -0
  100. package/src/entity/stable-stringify.test.ts +55 -0
  101. package/src/entity/stable-stringify.ts +49 -0
  102. package/src/entity/wake-index.it.test.ts +251 -0
  103. package/src/entity/with-entity-write.test.ts +100 -0
  104. package/src/entity/with-entity-write.ts +69 -0
  105. package/src/entity-driven-trigger.ts +46 -0
  106. package/src/extension-points.ts +35 -0
  107. package/src/gitops-docs.test.ts +215 -0
  108. package/src/gitops-docs.ts +151 -0
  109. package/src/gitops-kinds.test.ts +174 -0
  110. package/src/gitops-kinds.ts +137 -0
  111. package/src/index.ts +355 -11
  112. package/src/migration/flapping-to-window.test.ts +123 -0
  113. package/src/migration/flapping-to-window.ts +205 -0
  114. package/src/router.test.ts +182 -1
  115. package/src/router.ts +73 -2
  116. package/src/schema.ts +236 -3
  117. package/src/script-test-replay.test.ts +88 -0
  118. package/src/script-test-replay.ts +100 -0
  119. package/src/script-test-shell-env.test.ts +41 -0
  120. package/src/script-test-shell-env.ts +89 -0
  121. package/src/script-test.test.ts +386 -0
  122. package/src/script-test.ts +258 -0
  123. package/src/trigger-registry.ts +2 -0
  124. package/src/validate-definition.test.ts +1 -0
  125. package/tsconfig.json +24 -0
@@ -0,0 +1,207 @@
1
+ /**
2
+ * Stage-2 dispatch fan-out (reactive automation engine §7, §13).
3
+ *
4
+ * Stage 1 (the `ENTITY_CHANGED` work-queue router) does only cheap indexed
5
+ * routing and enqueues one Stage-2 job per interested automation / waiting
6
+ * run onto the `automation-dispatch` queue. This consumer runs those jobs:
7
+ * any instance picks one up, so execution load spreads while Stage 1 stays
8
+ * fast.
9
+ *
10
+ * The job is a validated {@link DispatchJob}; the handler routes on `reason`:
11
+ * - `"trigger"` → start fresh run(s) for the matched automation
12
+ * (`startRunsForAutomationEvent`, which honours the trigger config gate,
13
+ * filter, dwell, and concurrency mode).
14
+ * - `"wake"` → resume the suspended `wait_until` (`checkWaitUntil`,
15
+ * which re-enriches scope, re-evaluates, and resumes or fails-on-timeout;
16
+ * idempotent via the per-run advisory lock).
17
+ *
18
+ * Mirrors the delay / dwell consumer wiring. `consumerGroup:
19
+ * "automation-dispatch-run"`, `maxRetries: 3`.
20
+ */
21
+ import type { Logger } from "@checkstack/backend-api";
22
+ import {
23
+ DispatchJobSchema,
24
+ type DispatchJob,
25
+ type EntityChanged,
26
+ } from "@checkstack/automation-common";
27
+
28
+ import type { AutomationStore } from "../automation-store";
29
+ import type { ChangeDeriverRegistry } from "../entity/change-derivers";
30
+ import { checkWaitUntil } from "./engine";
31
+ import { startRunsForAutomationEvent } from "./trigger-subscriber";
32
+ import type { DispatchDeps, LoadedAutomation } from "./types";
33
+
34
/** Name of the durable Stage-2 queue (reactive automation engine §13.1). */
export const DISPATCH_QUEUE_NAME = "automation-dispatch";

/** Inputs required to start the Stage-2 queue consumer. */
export interface DispatchQueueConsumerArgs {
  deps: DispatchDeps;
  automationStore: AutomationStore;
  /**
   * Per-kind payload mappers (registered alongside derivers). If the changed
   * kind has a registered `toPayload`, the fresh-run `trigger.payload`
   * follows that kind's domain-named `payloadSchema` (so
   * `trigger.payload.incidentId` / `.systemId` / `.previousStatus` resolve).
   * Kinds without a mapper get the generic change shape instead.
   */
  changeDerivers: ChangeDeriverRegistry;
  logger: Logger;
}

/** Handle returned once the consumer is running. */
export interface DispatchQueueConsumer {
  /** Stops the underlying queue. */
  stop: () => Promise<void>;
}
54
+
55
/**
 * Fallback trigger payload for entity kinds that have no registered payload
 * mapper.
 *
 * The result carries the change envelope (`kind`, `id`, `prev`, `next`,
 * `delta`, `changedFields`). When `next` is not `null`, every field of the
 * new state is also copied to the top level so payload-reading templates
 * such as `trigger.payload.status` keep resolving on a state update.
 *
 * Note: the state fields are spread last, so a state field that shares a
 * name with an envelope key overrides that key.
 */
function genericChangedPayload(
  changed: EntityChanged,
): Record<string, unknown> {
  const { kind, id, prev, next, delta, changedFields } = changed;
  const envelope: Record<string, unknown> = {
    kind,
    id,
    prev,
    next,
    delta,
    changedFields,
  };
  // A `null` next state contributes nothing beyond the envelope.
  return next === null ? envelope : { ...envelope, ...next };
}
76
+
77
/**
 * Turn an entity change into the trigger payload of a fresh run.
 *
 * The per-kind domain mapper (`registerChangeDeriver({ toPayload })`) is
 * consulted first so the runtime `trigger.payload` matches the migrated
 * trigger's declared `payloadSchema` and keeps the legacy domain keys
 * operators read (`incidentId`, `systemId`, `previousStatus`, …). When the
 * registry yields no payload for the kind, the generic change shape is used.
 */
function changedToPayload(
  changeDerivers: ChangeDeriverRegistry,
  changed: EntityChanged,
): Record<string, unknown> {
  const mapped = changeDerivers.payload(changed);
  if (mapped !== null && mapped !== undefined) {
    return mapped;
  }
  return genericChangedPayload(changed);
}
92
+
93
/**
 * Fetch an automation by id and project it onto the `LoadedAutomation`
 * shape (`id`, `name`, `status`, `definition`).
 *
 * @returns the projected automation, or `undefined` when the store has no
 *   row for `automationId`.
 */
async function loadAutomation(
  automationStore: AutomationStore,
  automationId: string,
): Promise<LoadedAutomation | undefined> {
  const row = await automationStore.getById(automationId);
  if (!row) {
    return undefined;
  }
  const { id, name, status, definition } = row;
  return { id, name, status, definition };
}
106
+
107
/**
 * Process a single Stage-2 job. Exported separately from the consumer so
 * tests can call it directly without a real queue.
 *
 * - `reason === "trigger"`: loads the automation and, when it exists and is
 *   enabled, starts run(s) through `startRunsForAutomationEvent`. A missing
 *   automation is logged at debug level and the job is dropped; a
 *   non-enabled automation is dropped silently.
 * - otherwise (wake): resumes a suspended `wait_until` through
 *   `checkWaitUntil`, after cleaning up when the lock, run or automation no
 *   longer exists.
 */
export async function handleDispatchJob(args: {
  deps: DispatchDeps;
  automationStore: AutomationStore;
  changeDerivers: ChangeDeriverRegistry;
  job: DispatchJob;
}): Promise<void> {
  const { deps, automationStore, changeDerivers, job } = args;

  if (job.reason === "trigger") {
    const target = await loadAutomation(automationStore, job.automationId);
    if (target === undefined) {
      deps.logger.debug(
        `stage2: automation ${job.automationId} gone; dropping trigger job`,
      );
      return;
    }
    // Disabled means paused: only enabled automations start runs.
    if (target.status === "enabled") {
      await startRunsForAutomationEvent({
        deps,
        automation: target,
        eventId: job.triggerId,
        triggerPayload: changedToPayload(changeDerivers, job.changed),
        actor: job.changed.actor,
        contextKey: job.changed.id,
      });
    }
    return;
  }

  // Wake path: resume a suspended wait_until.
  const { runId, waitLockId } = job;

  const waitLock = await deps.runStore.loadWaitLock(waitLockId);
  if (!(waitLock && waitLock.kind === "until")) {
    deps.logger.debug(
      `stage2: wait lock ${job.waitLockId} gone (resumed / cancelled); dropping wake job`,
    );
    return;
  }

  const suspendedRun = await deps.runStore.loadRun(runId);
  if (!suspendedRun) {
    // No run row left: the lock is orphaned, remove it.
    await deps.runStore.deleteWaitLock(waitLockId);
    return;
  }

  const owner = await loadAutomation(automationStore, suspendedRun.automationId);
  if (!owner) {
    // The automation was removed while the run was suspended: drop the lock,
    // fail the run, then clear its run state.
    await deps.runStore.deleteWaitLock(waitLockId);
    await deps.runStore.updateRunStatus(
      runId,
      "failed",
      "automation deleted while run was suspended on wait_until",
    );
    await deps.runStateStore.clear(runId);
    return;
  }

  // checkWaitUntil re-enriches scope (every ref the wait depends on plus the
  // changed ref), re-evaluates the full condition, and resumes (or applies
  // the timeout). Idempotent: it deletes the lock before resuming and
  // resumeRun takes the per-run advisory lock.
  await checkWaitUntil(deps, {
    runId,
    waitLockId,
    automation: owner,
    changedRef: job.ref,
  });
}
175
+
176
/**
 * Attach the Stage-2 consumer to the `automation-dispatch` queue.
 *
 * Each raw job is validated against `DispatchJobSchema`. A malformed job is
 * logged at warn level and dropped; a valid one is passed to
 * `handleDispatchJob`.
 *
 * @returns a handle whose `stop` stops the queue.
 */
export async function startDispatchQueueConsumer(
  args: DispatchQueueConsumerArgs,
): Promise<DispatchQueueConsumer> {
  const queue = args.deps.queueManager.getQueue<DispatchJob>(
    DISPATCH_QUEUE_NAME,
  );

  await queue.consume(
    async (rawJob) => {
      const result = DispatchJobSchema.safeParse(rawJob.data);
      if (result.success) {
        await handleDispatchJob({
          deps: args.deps,
          automationStore: args.automationStore,
          changeDerivers: args.changeDerivers,
          job: result.data,
        });
        return;
      }
      args.logger.warn(
        `stage2: dropping malformed automation-dispatch job: ${result.error.message}`,
      );
    },
    { consumerGroup: "automation-dispatch-run", maxRetries: 3 },
  );

  const stop = async (): Promise<void> => {
    await queue.stop();
  };
  return { stop };
}
@@ -0,0 +1,132 @@
1
+ /**
2
+ * Integration test (real Redis / BullMQ) for Stage-2 stalled redelivery.
3
+ *
4
+ * Part of the surgical integration lane (plan §14.4 #5, load-bearing for
5
+ * §15.5). The Stage-2 `automation-dispatch` queue relies on BullMQ
6
+ * redelivering a job whose worker died holding it (in-flight crash recovery
7
+ * via stalled-job redelivery, §17). This pins that third-party contract: a
8
+ * worker that claims a job and then DIES without completing it (we close it
9
+ * mid-flight while suppressing lock renewal via a long processing block)
10
+ * must have its job redelivered to a second worker and completed exactly
11
+ * once.
12
+ *
13
+ * To make the stall observable in a bounded test, the worker is configured
14
+ * with a short `lockDuration` + `stalledInterval` (the production worker uses
15
+ * 30s, §15.4 — too long for a test). We assert the SECOND worker eventually
16
+ * completes the job and that the side effect happens once.
17
+ *
18
+ * Gated behind `CHECKSTACK_IT=1` so the default `bun test` never runs it.
19
+ * Connection comes from `CHECKSTACK_IT_REDIS_URL` (defaulting to the
20
+ * `docker-compose-dev.yml` Redis port).
21
+ */
22
+ import { afterAll, beforeAll, describe, expect, it } from "bun:test";
23
+ import { Queue, Worker, type ConnectionOptions } from "bullmq";
24
+
25
+ function redisConnection(): ConnectionOptions {
26
+ const url = new URL(
27
+ process.env.CHECKSTACK_IT_REDIS_URL ?? "redis://localhost:6379",
28
+ );
29
+ return {
30
+ host: url.hostname,
31
+ port: Number(url.port || 6379),
32
+ password: url.password || undefined,
33
+ };
34
+ }
35
+
36
// Unique queue name + key prefix per test process so parallel runs never
// share Redis keys.
const QUEUE = `it_stage2_${crypto.randomUUID().split("-").join("")}`;
const PREFIX = `it:${crypto.randomUUID().split("-").join("")}`;

describe.skipIf(!process.env.CHECKSTACK_IT)(
  "Stage-2 stalled redelivery (real Redis)",
  () => {
    let queue: Queue;
    const workers: Worker[] = [];

    /** Resolve after `ms` milliseconds. */
    const sleep = (ms: number): Promise<void> =>
      new Promise((resolve) => {
        setTimeout(resolve, ms);
      });

    /** Poll `done` every `stepMs` until it holds or `budgetMs` has elapsed. */
    const pollUntil = async (
      done: () => boolean,
      budgetMs: number,
      stepMs: number,
    ): Promise<void> => {
      const begin = Date.now();
      while (!done() && Date.now() - begin < budgetMs) {
        await sleep(stepMs);
      }
    };

    beforeAll(() => {
      queue = new Queue(QUEUE, {
        connection: redisConnection(),
        prefix: PREFIX,
      });
    });

    afterAll(async () => {
      // Best-effort teardown: close every worker, wipe the queue, close it.
      for (const worker of workers) {
        await worker.close().catch(() => {});
      }
      await queue.obliterate({ force: true }).catch(() => {});
      await queue.close();
    });

    it("a dead worker's job is redelivered to another worker and completed once", async () => {
      let completedBy = 0;
      const completions: string[] = [];

      // Short lock + stalled interval so the stall shows up quickly.
      const sharedOpts = {
        connection: redisConnection(),
        prefix: PREFIX,
        lockDuration: 1000,
        stalledInterval: 1000,
        maxStalledCount: 1,
      } as const;

      // Worker A claims the job and then "dies": its processor never
      // resolves within the test, and it is force-closed below so the job is
      // left unfinished.
      //
      // Determinism: if both workers were started up front, BullMQ could hand
      // the single job to the healthy worker B first, which would make the
      // "A dies mid-flight -> B redelivers" assertion flaky. So only A is
      // started, we wait for its claim, and only then is B started and A
      // killed. A is the guaranteed first claimer and the stalled-redelivery
      // path is still what gets exercised.
      let aClaimed = false;
      const workerA = new Worker(
        QUEUE,
        async () => {
          aClaimed = true;
          // Block far longer than lockDuration — a hung/dead processor. The
          // forced close below pulls the rug out.
          await sleep(60_000);
        },
        sharedOpts,
      );
      workers.push(workerA);

      await workerA.waitUntilReady();

      await queue.add("dispatch", { reason: "wake", runId: "run-1" });

      // A is the only worker at this point, so it is the one that claims.
      await pollUntil(() => aClaimed, 5000, 50);
      expect(aClaimed).toBe(true);

      // Worker B is the healthy worker expected to receive the stalled job.
      // It is started only after A's claim so it cannot get the job first.
      const workerB = new Worker(
        QUEUE,
        async (job) => {
          completedBy += 1;
          completions.push(String(job.id));
        },
        sharedOpts,
      );
      workers.push(workerB);
      await workerB.waitUntilReady();

      // Kill worker A mid-flight.
      await workerA.close(true);

      // Give the stalled job time to be redelivered to and finished by B.
      await pollUntil(() => completedBy !== 0, 15_000, 100);

      expect(completedBy).toBe(1);
      expect(completions).toHaveLength(1);
    });
  },
);
@@ -0,0 +1,197 @@
1
+ import { describe, expect, it } from "bun:test";
2
+ import { AutomationDefinitionSchema } from "@checkstack/automation-common";
3
+ import type { AutomationStore } from "../automation-store";
4
+ import { createActionRegistry } from "../action-registry";
5
+ import { dispatchTrigger, recoverStalledRun } from "./engine";
6
+ import { startStalledSweeper } from "./stalled-sweeper";
7
+ import {
8
+ makeDispatchDeps,
9
+ makeRecordingAction,
10
+ testPlugin,
11
+ } from "./test-fixtures";
12
+ import type { LoadedAutomation } from "./types";
13
+
14
/**
 * Build an enabled `LoadedAutomation` fixture (id `auto-1`) whose definition
 * is parsed from a single `test.event` trigger, no conditions, the given
 * `actions`, `mode: "single"` and `max_runs: 10`.
 */
function automation(actions: unknown[]): LoadedAutomation {
  const name = "Sweeper test";
  return {
    id: "auto-1",
    name,
    status: "enabled",
    definition: AutomationDefinitionSchema.parse({
      name,
      triggers: [{ event: "test.event" }],
      conditions: [],
      actions,
      mode: "single",
      max_runs: 10,
    }),
  };
}
25
+
26
/**
 * Minimal in-memory `AutomationStore` stub serving exactly one automation.
 *
 * `create`, `update` and `toggle` reject with `Error("nope")`; `delete` is a
 * no-op; `getById` answers only for `auto.id`; `list` / `listGroups` are
 * empty; `findEnabledByTriggerEvent` / `listEnabled` return `[auto]`.
 */
function storeFor(auto: LoadedAutomation): AutomationStore {
  const unsupported = async (): Promise<never> => {
    throw new Error("nope");
  };

  return {
    create: unsupported,
    update: unsupported,
    delete: async () => {},
    toggle: unsupported,
    getById: async (id) => {
      if (id !== auto.id) {
        return undefined;
      }
      return {
        id: auto.id,
        name: auto.name,
        description: undefined,
        status: auto.status,
        definition: auto.definition,
        managedBy: undefined,
        createdAt: new Date(),
        updatedAt: new Date(),
      };
    },
    list: async () => ({ items: [], total: 0 }),
    listGroups: async () => [],
    findEnabledByTriggerEvent: async () => [auto],
    listEnabled: async () => [auto],
  };
}
57
+
58
describe("stalled sweeper — C1: must not re-walk an intentional wait", () => {
  it("does not re-run pre-wait actions or leak wait locks when sweeping a mid-wait run", async () => {
    const actionsReg = createActionRegistry();
    const rec = makeRecordingAction();
    actionsReg.register(rec.definition, testPlugin);
    const { deps, runs, state } = makeDispatchDeps({ actions: actionsReg });

    const auto = automation([
      { action: "test.record", config: { value: "before-delay" } },
      { delay: { seconds: 3600 } },
      { action: "test.record", config: { value: "after-delay" } },
    ]);

    // Genuinely dispatch — the run suspends on the delay (one wait lock, a
    // checkpoint at the delay's path).
    const result = await dispatchTrigger(deps, {
      automation: auto,
      triggerId: "test_event",
      triggerEventId: "test.event",
      payload: {},
      contextKey: "ck-1",
    });
    expect(result.status).toBe("waiting");
    expect(rec.calls.map((c) => c.value)).toEqual(["before-delay"]);
    expect(runs.waitLocks.size).toBe(1);

    // Narrow both rows once instead of scattering non-null assertions: a
    // missing row is a fixture failure and should say so explicitly.
    const runRow = runs.runs.get(result.runId);
    const stateRow = state.states.get(result.runId);
    if (!runRow || !stateRow) {
      throw new Error("expected run + run-state rows after dispatch");
    }

    // Simulate the crash window: the heartbeat went cold and the run row
    // still reads "running" (the engine flips to running before each step;
    // a mid-wait crash can leave it that way while the wait lock lives on).
    // This is exactly the state that made the OLD sweeper re-walk from the
    // top — `findStalledRunIds` returned it AND recoverStalledRun accepted
    // it despite the live wait lock.
    runRow.status = "running";
    stateRow.lastHeartbeatAt = new Date(Date.now() - 10 * 60_000);

    // Run the REAL sweeper. The delay-expiry sweep won't fire (timeoutAt is
    // an hour out), so only the stalled-run sweep can touch this run.
    const sweeper = startStalledSweeper({
      deps,
      automationStore: storeFor(auto),
      logger: deps.logger,
      staleAfterMs: 1,
      intervalMs: 1_000_000,
    });
    try {
      await sweeper.sweep();
    } finally {
      // Stop the sweeper even when the sweep rejects, so a failing sweep
      // does not leave it running behind the failed test.
      sweeper.stop();
    }

    // The pre-wait action must NOT have re-executed, and no extra wait lock
    // may have accumulated (no duplicate delay job either).
    expect(rec.calls.map((c) => c.value)).toEqual(["before-delay"]);
    expect(runs.waitLocks.size).toBe(1);
  });
});
113
+
114
describe("stalled sweeper — H4 + C1c: recoverStalledRun refuses a run holding a live wait lock", () => {
  it("does not re-walk or create a duplicate lock for a crash-mid-wait run", async () => {
    // Registry with a single recording action so re-execution is observable.
    const recorder = makeRecordingAction();
    const registry = createActionRegistry();
    registry.register(recorder.definition, testPlugin);
    const { deps, runs } = makeDispatchDeps({ actions: registry });

    const recordedValues = (): unknown[] =>
      recorder.calls.map((call) => call.value);

    const auto = automation([
      { action: "test.record", config: { value: "before-delay" } },
      { delay: { seconds: 3600 } },
      { action: "test.record", config: { value: "after-delay" } },
    ]);

    const dispatched = await dispatchTrigger(deps, {
      automation: auto,
      triggerId: "test_event",
      triggerEventId: "test.event",
      payload: {},
      contextKey: "ck-1",
    });
    expect(dispatched.status).toBe("waiting");
    expect(runs.waitLocks.size).toBe(1);

    // Simulate a crash that left the run marked `running` while it still
    // holds its wait lock (a status the wait paths hadn't yet cleared).
    runs.runs.get(dispatched.runId)!.status = "running";

    const recovered = await recoverStalledRun(deps, {
      runId: dispatched.runId,
      automation: auto,
    });

    // Recovery must refuse: no re-walk, no second wait lock, no duplicate
    // delay job.
    expect(recovered.status).toBe("running");
    expect(recordedValues()).toEqual(["before-delay"]);
    expect(runs.waitLocks.size).toBe(1);
  });
});
153
+
154
describe("stalled sweeper — windowed-count occurrence prune", () => {
  it("deletes occurrence rows older than the 24h cap, keeping fresh ones", async () => {
    const { deps, windows } = makeDispatchDeps({});

    // One stale row (25h ago) + one fresh row (now).
    await windows.store.recordAndCount({
      automationId: "auto-1",
      triggerId: "f",
      eventId: "e",
      contextKey: "sys-1",
      occurredAt: new Date(Date.now() - 25 * 60 * 60_000),
      windowMinutes: 60,
      threshold: 1,
      refire: "every",
    });
    await windows.store.recordAndCount({
      automationId: "auto-1",
      triggerId: "f",
      eventId: "e",
      contextKey: "sys-1",
      occurredAt: new Date(),
      windowMinutes: 60,
      threshold: 1,
      refire: "every",
    });
    expect(windows.events).toHaveLength(2);

    const sweeper = startStalledSweeper({
      deps,
      automationStore: storeFor(automation([])),
      logger: deps.logger,
      staleAfterMs: 1,
      intervalMs: 1_000_000,
    });
    try {
      await sweeper.sweep();
    } finally {
      // Stop the sweeper even when the sweep rejects, so a failing sweep
      // does not leave it running behind the failed test.
      sweeper.stop();
    }

    // The stale row is pruned; the fresh one survives.
    expect(windows.events).toHaveLength(1);
    expect(windows.events[0]!.occurredAt.getTime()).toBeGreaterThan(
      Date.now() - 60_000,
    );
  });
});
@@ -13,11 +13,22 @@
13
13
  * the queue scheduler lost the job).
14
14
  * - `kind: "trigger"` locks past `timeoutAt` fail the run with a
15
15
  * clear "wait timed out" error.
16
+ * - `kind: "until"` locks past `timeoutAt` apply the wait_until timeout
17
+ * policy via `checkWaitUntil` (continue / fail). This is the BACKSTOP
18
+ * for a lost timeout-timer job — a reactive `wait_until` is otherwise
19
+ * event-driven (Stage-1 wake), with no periodic re-check (reactive
20
+ * automation engine §7).
21
+ *
22
+ * And expired `for:` dwell timers whose `automation-dwell` queue job was
23
+ * lost: each is fired via `fireDwell` (which re-confirms state before
24
+ * starting the run). Idempotent via the dwell row's delete-on-fire.
16
25
  */
17
26
  import type { Logger } from "@checkstack/backend-api";
18
27
 
19
28
  import type { AutomationStore } from "../automation-store";
20
- import { recoverStalledRun, resumeRun } from "./engine";
29
+ import { checkWaitUntil, recoverStalledRun, resumeRun } from "./engine";
30
+ import { fireDwell } from "./dwell";
31
+ import { startRunRespectingMode } from "./trigger-subscriber";
21
32
  import type { DispatchDeps } from "./types";
22
33
 
23
34
  export interface StalledSweeperArgs {
@@ -40,6 +51,15 @@ export interface StalledSweeper {
40
51
  const DEFAULT_STALE_MS = 60_000; // 1 minute
41
52
  const DEFAULT_INTERVAL_MS = 30_000; // every 30 seconds
42
53
 
54
/**
 * Retention for windowed-count occurrence rows, in milliseconds.
 *
 * Set to the largest window any trigger can configure (the 1440-minute /
 * 24h `WindowSchema` cap): a row older than that can no longer count toward
 * an in-window total, so it is dead and safe to prune. Because the value is
 * the schema cap, pruning never needs to read an automation's actual window.
 */
const WINDOW_EVENT_TTL_MS = 24 * 60 * 60 * 1000; // 24 hours (the WindowSchema cap)
62
+
43
63
  export function startStalledSweeper(
44
64
  args: StalledSweeperArgs,
45
65
  ): StalledSweeper {
@@ -47,8 +67,15 @@ export function startStalledSweeper(
47
67
  const intervalMs = args.intervalMs ?? DEFAULT_INTERVAL_MS;
48
68
 
49
69
  const sweep = async (): Promise<void> => {
50
- await sweepStalledRuns(args, staleMs);
70
+ // Wait-aware sweeps run FIRST: they own `waiting` runs (delay / trigger
71
+ // / until expiry + resume). The stalled-run sweep is strictly for
72
+ // genuinely-`running` crashes and must not race ahead of them. (It now
73
+ // also filters to status='running', so it can't pick up a waiting run,
74
+ // but ordering keeps the wait paths authoritative within a cycle.)
51
75
  await sweepExpiredWaitLocks(args);
76
+ await sweepExpiredDwells(args);
77
+ await sweepExpiredWindowEvents(args);
78
+ await sweepStalledRuns(args, staleMs);
52
79
  };
53
80
 
54
81
  let timer: ReturnType<typeof setInterval> | undefined = setInterval(() => {
@@ -82,8 +109,8 @@ async function sweepStalledRuns(
82
109
  );
83
110
 
84
111
  for (const runId of stalled) {
85
- const acquired = await args.deps.runStateStore.tryAdvisoryLock(runId);
86
- if (!acquired) continue; // another instance already on it
112
+ const lock = await args.deps.runStateStore.tryAdvisoryLock(runId);
113
+ if (!lock) continue; // another instance already on it
87
114
  try {
88
115
  const run = await args.deps.runStore.loadRun(runId);
89
116
  if (!run) continue;
@@ -112,7 +139,7 @@ async function sweepStalledRuns(
112
139
  `automation sweeper failed to recover ${runId}: ${(error as Error).message}`,
113
140
  );
114
141
  } finally {
115
- await args.deps.runStateStore.releaseAdvisoryLock(runId);
142
+ await lock.release();
116
143
  }
117
144
  }
118
145
  }
@@ -125,6 +152,40 @@ async function sweepExpiredWaitLocks(
125
152
  if (expired.length === 0) return;
126
153
 
127
154
  for (const lock of expired) {
155
+ if (lock.kind === "until") {
156
+ // Backstop for a lost timeout-timer job: apply the wait_until timeout
157
+ // policy via checkWaitUntil (it re-evaluates one last time, then
158
+ // resumes-or-fails per continue_on_timeout). Idempotent. A reactive
159
+ // `until` lock without a timeout has no `timeoutAt`, so it never lands
160
+ // here — it is purely event-driven (Stage-1 wake).
161
+ const run = await args.deps.runStore.loadRun(lock.runId);
162
+ if (!run) {
163
+ await args.deps.runStore.deleteWaitLock(lock.id);
164
+ continue;
165
+ }
166
+ const automation = await args.automationStore.getById(run.automationId);
167
+ if (!automation) {
168
+ await args.deps.runStore.deleteWaitLock(lock.id);
169
+ await args.deps.runStore.updateRunStatus(
170
+ lock.runId,
171
+ "failed",
172
+ "automation deleted while run was suspended on wait_until",
173
+ );
174
+ await args.deps.runStateStore.clear(lock.runId);
175
+ continue;
176
+ }
177
+ await checkWaitUntil(args.deps, {
178
+ runId: lock.runId,
179
+ waitLockId: lock.id,
180
+ automation: {
181
+ id: automation.id,
182
+ name: automation.name,
183
+ status: automation.status,
184
+ definition: automation.definition,
185
+ },
186
+ });
187
+ continue;
188
+ }
128
189
  if (lock.kind === "delay") {
129
190
  // The queue scheduler may have lost the job — wake the run
130
191
  // ourselves. Idempotent: resumeRun takes the advisory lock and
@@ -162,3 +223,49 @@ async function sweepExpiredWaitLocks(
162
223
  await args.deps.runStateStore.clear(lock.runId);
163
224
  }
164
225
  }
226
+
227
/**
 * Fire every dwell the dwell store reports as expired at the current time.
 *
 * Each expired dwell is passed to `fireDwell` together with
 * `startRunRespectingMode` as the run starter. A failure on one dwell is
 * logged at warn level and does not stop the remaining dwells from being
 * processed.
 */
async function sweepExpiredDwells(
  args: StalledSweeperArgs,
): Promise<void> {
  const now = new Date();
  const expired = await args.deps.dwellStore.sweepExpired(now);
  if (expired.length === 0) return;
  args.logger.debug(
    `automation sweeper: ${expired.length} expired dwell(s) detected`,
  );

  for (const dwell of expired) {
    try {
      await fireDwell({
        deps: args.deps,
        automationStore: args.automationStore,
        dwell,
        startRun: startRunRespectingMode,
      });
    } catch (error: unknown) {
      // A rejection is not guaranteed to be an Error; narrow instead of
      // casting so a thrown string/object is not logged as "undefined".
      const reason = error instanceof Error ? error.message : String(error);
      args.logger.warn(
        `automation sweeper failed to fire dwell ${dwell.id}: ${reason}`,
      );
    }
  }
}
252
+
253
/**
 * Prune windowed-count occurrence rows older than the 24h `WindowSchema`
 * cap (`WINDOW_EVENT_TTL_MS` before now). Such rows can never contribute to
 * any in-window count, so the delete does not depend on any automation's
 * configuration.
 *
 * A store failure is logged at warn level and swallowed so the rest of the
 * sweep cycle still runs.
 */
async function sweepExpiredWindowEvents(
  args: StalledSweeperArgs,
): Promise<void> {
  const cutoff = new Date(Date.now() - WINDOW_EVENT_TTL_MS);
  try {
    await args.deps.windowStore.sweepExpired(cutoff);
  } catch (error: unknown) {
    // A rejection is not guaranteed to be an Error; narrow instead of
    // casting so a thrown string/object is not logged as "undefined".
    const reason = error instanceof Error ? error.message : String(error);
    args.logger.warn(
      `automation sweeper failed to prune window events: ${reason}`,
    );
  }
}
271
+