npm - @checkstack/automation-backend - Versions diffs - 0.2.0 → 0.3.0 - Mend

@checkstack/automation-backend 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (125) hide show

package/CHANGELOG.md +544 -0
package/drizzle/0003_sparkling_xorn.sql +17 -0
package/drizzle/0004_cultured_spyke.sql +2 -0
package/drizzle/0005_classy_the_hand.sql +19 -0
package/drizzle/0006_burly_wallop.sql +10 -0
package/drizzle/0007_nappy_jackal.sql +1 -0
package/drizzle/0008_remove_seeded_auto_incident_automations.sql +13 -0
package/drizzle/0009_steady_liz_osborn.sql +12 -0
package/drizzle/0010_chunky_changeling.sql +2 -0
package/drizzle/meta/0003_snapshot.json +1007 -0
package/drizzle/meta/0004_snapshot.json +1028 -0
package/drizzle/meta/0005_snapshot.json +1164 -0
package/drizzle/meta/0006_snapshot.json +1261 -0
package/drizzle/meta/0007_snapshot.json +1215 -0
package/drizzle/meta/0008_snapshot.json +1215 -0
package/drizzle/meta/0009_snapshot.json +1328 -0
package/drizzle/meta/0010_snapshot.json +1349 -0
package/drizzle/meta/_journal.json +56 -0
package/package.json +23 -12
package/src/action-types.ts +23 -0
package/src/artifact-store.ts +16 -1
package/src/automation-store.test.ts +143 -0
package/src/automation-store.ts +30 -8
package/src/builtin-triggers.test.ts +77 -74
package/src/builtin-triggers.ts +105 -108
package/src/dispatch/action-kind.ts +2 -0
package/src/dispatch/assemble-get-service.ts +31 -0
package/src/dispatch/cancel-resurrect.test.ts +147 -0
package/src/dispatch/concurrency-race.test.ts +255 -0
package/src/dispatch/concurrency-scope.test.ts +166 -0
package/src/dispatch/condition.ts +24 -5
package/src/dispatch/dwell-queue.ts +65 -0
package/src/dispatch/dwell-store.ts +154 -0
package/src/dispatch/dwell.it.test.ts +142 -0
package/src/dispatch/dwell.test.ts +799 -0
package/src/dispatch/dwell.ts +257 -0
package/src/dispatch/engine.test.ts +189 -2
package/src/dispatch/engine.ts +555 -9
package/src/dispatch/entity-scope.test.ts +176 -0
package/src/dispatch/get-service-wiring.test.ts +318 -0
package/src/dispatch/numeric.test.ts +71 -0
package/src/dispatch/numeric.ts +96 -0
package/src/dispatch/render.test.ts +34 -0
package/src/dispatch/render.ts +31 -11
package/src/dispatch/reseed-run-secrets.ts +230 -0
package/src/dispatch/run-secret-registry.test.ts +189 -0
package/src/dispatch/run-secret-registry.ts +247 -0
package/src/dispatch/run-state-masking.test.ts +376 -0
package/src/dispatch/run-state-store.ts +95 -38
package/src/dispatch/run-state.ts +226 -59
package/src/dispatch/scope-artifact-masking.test.ts +138 -0
package/src/dispatch/secret-ref-ids.test.ts +19 -0
package/src/dispatch/secret-ref-ids.ts +17 -0
package/src/dispatch/snapshots.test.ts +86 -0
package/src/dispatch/snapshots.ts +79 -0
package/src/dispatch/stage1-router.test.ts +324 -0
package/src/dispatch/stage1-router.ts +152 -0
package/src/dispatch/stage1.it.test.ts +84 -0
package/src/dispatch/stage2-dispatch.test.ts +285 -0
package/src/dispatch/stage2-dispatch.ts +207 -0
package/src/dispatch/stage2-stalled.it.test.ts +132 -0
package/src/dispatch/stalled-sweeper.test.ts +197 -0
package/src/dispatch/stalled-sweeper.ts +112 -5
package/src/dispatch/state-scope.test.ts +234 -0
package/src/dispatch/state-scope.ts +322 -0
package/src/dispatch/structured-conditions.test.ts +246 -0
package/src/dispatch/structured-conditions.ts +146 -0
package/src/dispatch/test-fixtures.ts +306 -38
package/src/dispatch/trigger-fanin.test.ts +111 -0
package/src/dispatch/trigger-subscriber.ts +316 -14
package/src/dispatch/types.ts +263 -8
package/src/dispatch/wait-timeout-queue.ts +89 -0
package/src/dispatch/wait-until-entity-wake.test.ts +544 -0
package/src/dispatch/wait-until.test.ts +540 -0
package/src/dispatch/wake-refs.test.ts +158 -0
package/src/dispatch/wake-refs.ts +348 -0
package/src/dispatch/window-gate.test.ts +513 -0
package/src/dispatch/window-store.test.ts +162 -0
package/src/dispatch/window-store.ts +102 -0
package/src/entity/change-derivers.test.ts +148 -0
package/src/entity/change-derivers.ts +143 -0
package/src/entity/change-emitter.test.ts +66 -0
package/src/entity/change-emitter.ts +76 -0
package/src/entity/create-handle.ts +344 -0
package/src/entity/cross-pod-read-consistency.it.test.ts +281 -0
package/src/entity/define-entity.ts +157 -0
package/src/entity/diff.test.ts +57 -0
package/src/entity/diff.ts +54 -0
package/src/entity/entity-store.test.ts +30 -0
package/src/entity/entity-store.ts +171 -0
package/src/entity/extension-point.ts +56 -0
package/src/entity/fake-entity-store.ts +130 -0
package/src/entity/hook.ts +19 -0
package/src/entity/index.ts +50 -0
package/src/entity/mutate-handle.test.ts +517 -0
package/src/entity/on-entity-changed.test.ts +189 -0
package/src/entity/on-entity-changed.ts +214 -0
package/src/entity/registry.test.ts +181 -0
package/src/entity/registry.ts +200 -0
package/src/entity/stable-stringify.test.ts +55 -0
package/src/entity/stable-stringify.ts +49 -0
package/src/entity/wake-index.it.test.ts +251 -0
package/src/entity/with-entity-write.test.ts +100 -0
package/src/entity/with-entity-write.ts +69 -0
package/src/entity-driven-trigger.ts +46 -0
package/src/extension-points.ts +35 -0
package/src/gitops-docs.test.ts +215 -0
package/src/gitops-docs.ts +151 -0
package/src/gitops-kinds.test.ts +174 -0
package/src/gitops-kinds.ts +137 -0
package/src/index.ts +355 -11
package/src/migration/flapping-to-window.test.ts +123 -0
package/src/migration/flapping-to-window.ts +205 -0
package/src/router.test.ts +182 -1
package/src/router.ts +73 -2
package/src/schema.ts +236 -3
package/src/script-test-replay.test.ts +88 -0
package/src/script-test-replay.ts +100 -0
package/src/script-test-shell-env.test.ts +41 -0
package/src/script-test-shell-env.ts +89 -0
package/src/script-test.test.ts +386 -0
package/src/script-test.ts +258 -0
package/src/trigger-registry.ts +2 -0
package/src/validate-definition.test.ts +1 -0
package/tsconfig.json +24 -0

package/src/dispatch/stage2-dispatch.ts ADDED Viewed

@@ -0,0 +1,207 @@
+/**
+ * Stage-2 dispatch fan-out (reactive automation engine §7, §13).
+ *
+ * Stage 1 (the `ENTITY_CHANGED` work-queue router) does only cheap indexed
+ * routing and enqueues one Stage-2 job per interested automation / waiting
+ * run onto the `automation-dispatch` queue. This consumer runs those jobs:
+ * any instance picks one up, so execution load spreads while Stage 1 stays
+ * fast.
+ *
+ * The job is a validated {@link DispatchJob}; the handler routes on `reason`:
+ *   - `"trigger"` → start fresh run(s) for the matched automation
+ *     (`startRunsForAutomationEvent`, which honours the trigger config gate,
+ *     filter, dwell, and concurrency mode).
+ *   - `"wake"`    → resume the suspended `wait_until` (`checkWaitUntil`,
+ *     which re-enriches scope, re-evaluates, and resumes or fails-on-timeout;
+ *     idempotent via the per-run advisory lock).
+ *
+ * Mirrors the delay / dwell consumer wiring. `consumerGroup:
+ * "automation-dispatch-run"`, `maxRetries: 3`.
+ */
+import type { Logger } from "@checkstack/backend-api";
+import {
+  DispatchJobSchema,
+  type DispatchJob,
+  type EntityChanged,
+} from "@checkstack/automation-common";
+import type { AutomationStore } from "../automation-store";
+import type { ChangeDeriverRegistry } from "../entity/change-derivers";
+import { checkWaitUntil } from "./engine";
+import { startRunsForAutomationEvent } from "./trigger-subscriber";
+import type { DispatchDeps, LoadedAutomation } from "./types";
+/** Durable Stage-2 queue name (reactive automation engine §13.1). */
+export const DISPATCH_QUEUE_NAME = "automation-dispatch";
+/** Arguments accepted by `startDispatchQueueConsumer`. */
+export interface DispatchQueueConsumerArgs {
+  /**
+   * Shared dispatch dependencies. The consumer obtains its queue from
+   * `deps.queueManager` and passes `deps` on to every job it handles.
+   */
+  deps: DispatchDeps;
+  /** Store the job handler loads automations from (by id). */
+  automationStore: AutomationStore;
+  /**
+   * The per-kind payload mappers (registered alongside derivers). When the
+   * changed kind has a registered `toPayload`, the fresh-run `trigger.payload`
+   * matches that kind's domain-named `payloadSchema` (so `trigger.payload.
+   * incidentId` / `.systemId` / `.previousStatus` resolve); otherwise the
+   * generic change shape is used.
+   */
+  changeDerivers: ChangeDeriverRegistry;
+  /** Receives the warning logged when a malformed job is dropped. */
+  logger: Logger;
+}
+/** Handle returned by `startDispatchQueueConsumer`. */
+export interface DispatchQueueConsumer {
+  /** Stops the consumer by awaiting the underlying queue's `stop()`. */
+  stop: () => Promise<void>;
+}
+/**
+ * Builds the fallback trigger payload for an entity kind that has no
+ * registered payload mapper.
+ *
+ * The result is the change envelope (`kind`, `id`, `prev`, `next`, `delta`,
+ * `changedFields`) with the fields of the new state merged on top, so
+ * existing templates can keep reading them directly
+ * (`trigger.payload.status`). Because the state is merged last, a state
+ * field that shares its name with an envelope key replaces that key. When
+ * `next` is `null` only the envelope is returned.
+ */
+function genericChangedPayload(
+  changed: EntityChanged,
+): Record<string, unknown> {
+  const { kind, id, prev, next, delta, changedFields } = changed;
+  const envelope: Record<string, unknown> = {
+    kind,
+    id,
+    prev,
+    next,
+    delta,
+    changedFields,
+  };
+  // A `null` next state contributes nothing beyond the envelope itself.
+  if (next === null) return envelope;
+  return { ...envelope, ...next };
+}
+/**
+ * Resolves the `trigger.payload` of a fresh run from an entity change.
+ *
+ * The per-kind domain payload mapper (`registerChangeDeriver({ toPayload })`)
+ * is consulted first so the runtime payload matches the trigger's declared
+ * `payloadSchema` and keeps the legacy domain keys operators read
+ * (`incidentId`, `systemId`, `previousStatus`, …). When the registry yields
+ * `null` / `undefined` for the change, the generic change shape is used.
+ */
+function changedToPayload(
+  changeDerivers: ChangeDeriverRegistry,
+  changed: EntityChanged,
+): Record<string, unknown> {
+  const mapped = changeDerivers.payload(changed);
+  return mapped ?? genericChangedPayload(changed);
+}
+/**
+ * Fetches an automation by id and projects it onto the fields of
+ * `LoadedAutomation` (`id`, `name`, `status`, `definition`).
+ *
+ * @returns the projection, or `undefined` when `getById` returns nothing.
+ */
+async function loadAutomation(
+  automationStore: AutomationStore,
+  automationId: string,
+): Promise<LoadedAutomation | undefined> {
+  const stored = await automationStore.getById(automationId);
+  if (!stored) return undefined;
+  const { id, name, status, definition } = stored;
+  return { id, name, status, definition };
+}
+/**
+ * Processes a single Stage-2 job. Exported separately from the queue
+ * consumer so tests can call it directly without a real queue.
+ *
+ * - `reason === "trigger"`: loads the automation and, when it still exists
+ *   and is `enabled`, starts run(s) via `startRunsForAutomationEvent` using
+ *   the change-derived payload.
+ * - otherwise (a wake job): checks that the `until` wait lock, the run and
+ *   the automation are all still present, then hands over to
+ *   `checkWaitUntil`. A missing run drops the wait lock; a missing automation
+ *   additionally marks the run `failed` and clears its run state.
+ */
+export async function handleDispatchJob(args: {
+  deps: DispatchDeps;
+  automationStore: AutomationStore;
+  changeDerivers: ChangeDeriverRegistry;
+  job: DispatchJob;
+}): Promise<void> {
+  const { deps, automationStore, changeDerivers, job } = args;
+  const { logger, runStore, runStateStore } = deps;
+
+  if (job.reason === "trigger") {
+    const target = await loadAutomation(automationStore, job.automationId);
+    if (!target) {
+      logger.debug(
+        `stage2: automation ${job.automationId} gone; dropping trigger job`,
+      );
+      return;
+    }
+    // A disabled automation is paused: nothing is dispatched for it.
+    if (target.status !== "enabled") return;
+    const { changed } = job;
+    await startRunsForAutomationEvent({
+      deps,
+      automation: target,
+      eventId: job.triggerId,
+      triggerPayload: changedToPayload(changeDerivers, changed),
+      actor: changed.actor,
+      contextKey: changed.id,
+    });
+    return;
+  }
+
+  // Wake job: resume a suspended wait_until.
+  const waitLock = await runStore.loadWaitLock(job.waitLockId);
+  if (waitLock?.kind !== "until") {
+    logger.debug(
+      `stage2: wait lock ${job.waitLockId} gone (resumed / cancelled); dropping wake job`,
+    );
+    return;
+  }
+
+  const suspendedRun = await runStore.loadRun(job.runId);
+  if (!suspendedRun) {
+    // The run no longer exists, so its wait lock is orphaned.
+    await runStore.deleteWaitLock(job.waitLockId);
+    return;
+  }
+
+  const owner = await loadAutomation(
+    automationStore,
+    suspendedRun.automationId,
+  );
+  if (!owner) {
+    await runStore.deleteWaitLock(job.waitLockId);
+    await runStore.updateRunStatus(
+      job.runId,
+      "failed",
+      "automation deleted while run was suspended on wait_until",
+    );
+    await runStateStore.clear(job.runId);
+    return;
+  }
+
+  // checkWaitUntil re-enriches scope (with every ref the wait depends on +
+  // the changed ref), re-evaluates the full condition, and resumes (or
+  // applies timeout). Idempotent: it deletes the lock before resuming and
+  // resumeRun takes the per-run advisory lock.
+  await checkWaitUntil(deps, {
+    runId: job.runId,
+    waitLockId: job.waitLockId,
+    automation: owner,
+    changedRef: job.ref,
+  });
+}
+/**
+ * Subscribes to the `automation-dispatch` queue (consumer group
+ * `automation-dispatch-run`, `maxRetries: 3`) and runs each job through
+ * `handleDispatchJob`.
+ *
+ * Job data is validated against `DispatchJobSchema` first; data that fails
+ * validation is logged as a warning and dropped (the callback returns
+ * without throwing).
+ *
+ * @returns a handle whose `stop()` awaits the queue's `stop()`.
+ */
+export async function startDispatchQueueConsumer(
+  args: DispatchQueueConsumerArgs,
+): Promise<DispatchQueueConsumer> {
+  const { deps, automationStore, changeDerivers, logger } = args;
+  const dispatchQueue = deps.queueManager.getQueue<DispatchJob>(
+    DISPATCH_QUEUE_NAME,
+  );
+
+  await dispatchQueue.consume(
+    async (rawJob) => {
+      const parsed = DispatchJobSchema.safeParse(rawJob.data);
+      if (parsed.success) {
+        await handleDispatchJob({
+          deps,
+          automationStore,
+          changeDerivers,
+          job: parsed.data,
+        });
+        return;
+      }
+      logger.warn(
+        `stage2: dropping malformed automation-dispatch job: ${parsed.error.message}`,
+      );
+    },
+    { consumerGroup: "automation-dispatch-run", maxRetries: 3 },
+  );
+
+  const stop = async (): Promise<void> => {
+    await dispatchQueue.stop();
+  };
+  return { stop };
+}

package/src/dispatch/stage2-stalled.it.test.ts ADDED Viewed

@@ -0,0 +1,132 @@
+/**
+ * Integration test (real Redis / BullMQ) for Stage-2 stalled redelivery.
+ *
+ * Part of the surgical integration lane (plan §14.4 #5, load-bearing for
+ * §15.5). The Stage-2 `automation-dispatch` queue relies on BullMQ
+ * redelivering a job whose worker died holding it (in-flight crash recovery
+ * via stalled-job redelivery, §17). This pins that third-party contract: a
+ * worker that claims a job and then DIES without completing it (we close it
+ * mid-flight while suppressing lock renewal via a long processing block)
+ * must have its job redelivered to a second worker and completed exactly
+ * once.
+ *
+ * To make the stall observable in a bounded test, the worker is configured
+ * with a short `lockDuration` + `stalledInterval` (the production worker uses
+ * 30s, §15.4 — too long for a test). We assert the SECOND worker eventually
+ * completes the job and that the side effect happens once.
+ *
+ * Gated behind `CHECKSTACK_IT=1` so the default `bun test` never runs it.
+ * Connection comes from `CHECKSTACK_IT_REDIS_URL` (defaulting to the
+ * `docker-compose-dev.yml` Redis port).
+ */
+import { afterAll, beforeAll, describe, expect, it } from "bun:test";
+import { Queue, Worker, type ConnectionOptions } from "bullmq";
+/**
+ * Builds the BullMQ connection options for the integration-test Redis.
+ *
+ * Reads `CHECKSTACK_IT_REDIS_URL`, defaulting to `redis://localhost:6379`;
+ * a URL without an explicit port falls back to 6379.
+ *
+ * `URL.password` is returned in percent-encoded form, so it is decoded before
+ * being handed to the client — otherwise a password containing reserved
+ * characters (`@`, `:`, `/`, `#`, …) would be sent still encoded and
+ * authentication would fail. A password that is not valid percent-encoding
+ * (e.g. a literal `%`) is passed through unchanged rather than throwing.
+ */
+function redisConnection(): ConnectionOptions {
+  const url = new URL(
+    process.env.CHECKSTACK_IT_REDIS_URL ?? "redis://localhost:6379",
+  );
+  let password: string | undefined;
+  if (url.password) {
+    try {
+      password = decodeURIComponent(url.password);
+    } catch {
+      // Not valid percent-encoding: keep the raw value, as before.
+      password = url.password;
+    }
+  }
+  return {
+    host: url.hostname,
+    port: Number(url.port || 6379),
+    password,
+  };
+}
+// Queue name and Redis key prefix are suffixed with a fresh UUID (dashes
+// stripped) each time this module loads — presumably so repeated or
+// concurrent runs against the same Redis do not see each other's jobs.
+const QUEUE = `it_stage2_${crypto.randomUUID().replace(/-/g, "")}`;
+const PREFIX = `it:${crypto.randomUUID().replace(/-/g, "")}`;
+describe.skipIf(!process.env.CHECKSTACK_IT)(
+  "Stage-2 stalled redelivery (real Redis)",
+  () => {
+    // The test polls for up to 5s (claim) + 15s (redelivery). Bun's default
+    // per-test timeout is 5s, which would abort the test before those waits
+    // can run their course, so an explicit, larger timeout is passed to `it`.
+    const TEST_TIMEOUT_MS = 30_000;
+    let queue: Queue;
+    const workers: Worker[] = [];
+    beforeAll(() => {
+      queue = new Queue(QUEUE, {
+        connection: redisConnection(),
+        prefix: PREFIX,
+      });
+    });
+    // Best-effort cleanup: close every worker, wipe the queue, close it.
+    afterAll(async () => {
+      for (const w of workers) await w.close().catch(() => {});
+      await queue.obliterate({ force: true }).catch(() => {});
+      await queue.close();
+    });
+    it(
+      "a dead worker's job is redelivered to another worker and completed once",
+      async () => {
+        let completedBy = 0;
+        const completions: string[] = [];
+        // Short lock + stalled interval so the stall is observable quickly.
+        const sharedOpts = {
+          connection: redisConnection(),
+          prefix: PREFIX,
+          lockDuration: 1000,
+          stalledInterval: 1000,
+          maxStalledCount: 1,
+        } as const;
+        // Worker A: claims the job, then "dies" — it never resolves its
+        // processor (simulating a crash). We force-close it (without letting
+        // it finish) so the lock expires and BullMQ marks the job stalled.
+        //
+        // Determinism: with a single job and one healthy worker, starting
+        // BOTH workers up front lets BullMQ hand the job to EITHER one — a
+        // healthy B claim makes the "A dies mid-flight → B redelivers"
+        // assertion flaky even though production is correct. So we start
+        // ONLY A, wait until it has claimed the job, and only THEN start B
+        // and kill A. A is the guaranteed first claimer, and the real
+        // stalled-redelivery path is still exercised.
+        let aClaimed = false;
+        const workerA = new Worker(
+          QUEUE,
+          async () => {
+            aClaimed = true;
+            // Block far longer than lockDuration without renewing — simulate
+            // a hung/dead processor. The close() below pulls the rug out.
+            await new Promise((r) => setTimeout(r, 60_000));
+          },
+          sharedOpts,
+        );
+        workers.push(workerA);
+        await workerA.waitUntilReady();
+        await queue.add("dispatch", { reason: "wake", runId: "run-1" });
+        // Wait until A has claimed it (A is the only worker, so it WILL
+        // claim).
+        const start = Date.now();
+        while (!aClaimed && Date.now() - start < 5000) {
+          await new Promise((r) => setTimeout(r, 50));
+        }
+        expect(aClaimed).toBe(true);
+        // Only now start worker B — the healthy worker that should redeliver
+        // + complete the stalled job. Starting it after A claimed guarantees
+        // the job is not handed to B first.
+        const workerB = new Worker(
+          QUEUE,
+          async (job) => {
+            completedBy += 1;
+            completions.push(String(job.id));
+          },
+          sharedOpts,
+        );
+        workers.push(workerB);
+        await workerB.waitUntilReady();
+        // Kill worker A mid-flight (the rug-pull) so its lock can't renew.
+        await workerA.close(true);
+        // Wait for the stalled job to be redelivered to + completed by B.
+        const waitStart = Date.now();
+        while (completedBy === 0 && Date.now() - waitStart < 15_000) {
+          await new Promise((r) => setTimeout(r, 100));
+        }
+        expect(completedBy).toBe(1);
+        expect(completions).toHaveLength(1);
+      },
+      TEST_TIMEOUT_MS,
+    );
+  },
+);

package/src/dispatch/stalled-sweeper.test.ts ADDED Viewed

@@ -0,0 +1,197 @@
+import { describe, expect, it } from "bun:test";
+import { AutomationDefinitionSchema } from "@checkstack/automation-common";
+import type { AutomationStore } from "../automation-store";
+import { createActionRegistry } from "../action-registry";
+import { dispatchTrigger, recoverStalledRun } from "./engine";
+import { startStalledSweeper } from "./stalled-sweeper";
+import {
+  makeDispatchDeps,
+  makeRecordingAction,
+  testPlugin,
+} from "./test-fixtures";
+import type { LoadedAutomation } from "./types";
+/**
+ * Test fixture: an enabled `LoadedAutomation` with id `auto-1`, a single
+ * `test.event` trigger, no conditions, `single` mode and `max_runs: 10`,
+ * wrapping the given actions. The definition is validated through
+ * `AutomationDefinitionSchema.parse`.
+ */
+function automation(actions: unknown[]): LoadedAutomation {
+  const name = "Sweeper test";
+  return {
+    id: "auto-1",
+    name,
+    status: "enabled",
+    definition: AutomationDefinitionSchema.parse({
+      name,
+      triggers: [{ event: "test.event" }],
+      conditions: [],
+      actions,
+      mode: "single",
+      max_runs: 10,
+    }),
+  };
+}
+/**
+ * Test fixture: a minimal `AutomationStore` that only knows `auto`.
+ *
+ * `getById` resolves a full record for `auto.id` (fresh `createdAt` /
+ * `updatedAt` on every call) and `undefined` for any other id; the
+ * enabled-automation lookups return `[auto]`; `list` / `listGroups` are
+ * empty; `delete` is a no-op; `create` / `update` / `toggle` reject with
+ * `Error("nope")`.
+ */
+function storeFor(auto: LoadedAutomation): AutomationStore {
+  // Shared rejecting stub for the mutating operations these tests never use.
+  const unsupported = async (): Promise<never> => {
+    throw new Error("nope");
+  };
+  const onlyAuto = async () => [auto];
+  return {
+    create: unsupported,
+    update: unsupported,
+    delete: async () => {},
+    toggle: unsupported,
+    getById: async (id) => {
+      if (id !== auto.id) return undefined;
+      return {
+        id: auto.id,
+        name: auto.name,
+        description: undefined,
+        status: auto.status,
+        definition: auto.definition,
+        managedBy: undefined,
+        createdAt: new Date(),
+        updatedAt: new Date(),
+      };
+    },
+    list: async () => ({ items: [], total: 0 }),
+    listGroups: async () => [],
+    findEnabledByTriggerEvent: onlyAuto,
+    listEnabled: onlyAuto,
+  };
+}
+// Regression guard: dispatches an automation that suspends on a one-hour
+// delay, forces the run row back to "running" with a ten-minute-old
+// heartbeat, runs one real sweep, and asserts the pre-delay action was not
+// executed again and the wait-lock count is still exactly one.
+describe("stalled sweeper — C1: must not re-walk an intentional wait", () => {
+  it("does not re-run pre-wait actions or leak wait locks when sweeping a mid-wait run", async () => {
+    const actionsReg = createActionRegistry();
+    const rec = makeRecordingAction();
+    actionsReg.register(rec.definition, testPlugin);
+    const { deps, runs, state } = makeDispatchDeps({ actions: actionsReg });
+    const auto = automation([
+      { action: "test.record", config: { value: "before-delay" } },
+      { delay: { seconds: 3600 } },
+      { action: "test.record", config: { value: "after-delay" } },
+    ]);
+    // Genuinely dispatch — the run suspends on the delay (one wait lock, a
+    // checkpoint at the delay's path).
+    const result = await dispatchTrigger(deps, {
+      automation: auto,
+      triggerId: "test_event",
+      triggerEventId: "test.event",
+      payload: {},
+      contextKey: "ck-1",
+    });
+    expect(result.status).toBe("waiting");
+    expect(rec.calls.map((c) => c.value)).toEqual(["before-delay"]);
+    expect(runs.waitLocks.size).toBe(1);
+    // Simulate the crash window: the heartbeat went cold and the run row
+    // still reads "running" (the engine flips to running before each step;
+    // a mid-wait crash can leave it that way while the wait lock lives on).
+    // This is exactly the state that made the OLD sweeper re-walk from the
+    // top — `findStalledRunIds` returned it AND recoverStalledRun accepted
+    // it despite the live wait lock.
+    runs.runs.get(result.runId)!.status = "running";
+    state.states.get(result.runId)!.lastHeartbeatAt = new Date(
+      Date.now() - 10 * 60_000,
+    );
+    // Run the REAL sweeper. The delay-expiry sweep won't fire (timeoutAt is
+    // an hour out), so only the stalled-run sweep can touch this run.
+    // `intervalMs` is set very large so only the explicit `sweep()` below
+    // runs during the test.
+    const sweeper = startStalledSweeper({
+      deps,
+      automationStore: storeFor(auto),
+      logger: deps.logger,
+      staleAfterMs: 1,
+      intervalMs: 1_000_000,
+    });
+    await sweeper.sweep();
+    sweeper.stop();
+    // The pre-wait action must NOT have re-executed, and no extra wait lock
+    // may have accumulated (no duplicate delay job either).
+    expect(rec.calls.map((c) => c.value)).toEqual(["before-delay"]);
+    expect(runs.waitLocks.size).toBe(1);
+  });
+});
+// Exercises `recoverStalledRun` directly (no sweeper): a run that is marked
+// "running" but still holds its delay wait lock must come back with status
+// "running", without re-executing the pre-delay action or adding a lock.
+describe("stalled sweeper — H4 + C1c: recoverStalledRun refuses a run holding a live wait lock", () => {
+  it("does not re-walk or create a duplicate lock for a crash-mid-wait run", async () => {
+    const actionsReg = createActionRegistry();
+    const rec = makeRecordingAction();
+    actionsReg.register(rec.definition, testPlugin);
+    const { deps, runs } = makeDispatchDeps({ actions: actionsReg });
+    const auto = automation([
+      { action: "test.record", config: { value: "before-delay" } },
+      { delay: { seconds: 3600 } },
+      { action: "test.record", config: { value: "after-delay" } },
+    ]);
+    const result = await dispatchTrigger(deps, {
+      automation: auto,
+      triggerId: "test_event",
+      triggerEventId: "test.event",
+      payload: {},
+      contextKey: "ck-1",
+    });
+    expect(result.status).toBe("waiting");
+    expect(runs.waitLocks.size).toBe(1);
+    // Simulate a crash that left the run marked `running` while still
+    // holding its wait lock (a status the wait paths hadn't yet cleared).
+    runs.runs.get(result.runId)!.status = "running";
+    const recovered = await recoverStalledRun(deps, {
+      runId: result.runId,
+      automation: auto,
+    });
+    // Recovery must refuse: no re-walk, no second wait lock, no duplicate
+    // delay job.
+    expect(recovered.status).toBe("running");
+    expect(rec.calls.map((c) => c.value)).toEqual(["before-delay"]);
+    expect(runs.waitLocks.size).toBe(1);
+  });
+});
+// Records two window occurrences for the same automation / trigger /
+// context — one dated 25 hours ago, one dated now — runs a single sweep,
+// and asserts only the recent occurrence is left.
+describe("stalled sweeper — windowed-count occurrence prune", () => {
+  it("deletes occurrence rows older than the 24h cap, keeping fresh ones", async () => {
+    const { deps, windows } = makeDispatchDeps({});
+    // One stale row (25h ago) + one fresh row (now).
+    await windows.store.recordAndCount({
+      automationId: "auto-1",
+      triggerId: "f",
+      eventId: "e",
+      contextKey: "sys-1",
+      occurredAt: new Date(Date.now() - 25 * 60 * 60_000),
+      windowMinutes: 60,
+      threshold: 1,
+      refire: "every",
+    });
+    await windows.store.recordAndCount({
+      automationId: "auto-1",
+      triggerId: "f",
+      eventId: "e",
+      contextKey: "sys-1",
+      occurredAt: new Date(),
+      windowMinutes: 60,
+      threshold: 1,
+      refire: "every",
+    });
+    expect(windows.events).toHaveLength(2);
+    const sweeper = startStalledSweeper({
+      deps,
+      automationStore: storeFor(automation([])),
+      logger: deps.logger,
+      staleAfterMs: 1,
+      intervalMs: 1_000_000,
+    });
+    await sweeper.sweep();
+    sweeper.stop();
+    // The stale row is pruned; the fresh one survives.
+    expect(windows.events).toHaveLength(1);
+    // The survivor's timestamp is within the last minute, i.e. it is the
+    // row recorded with `new Date()` above.
+    expect(windows.events[0]!.occurredAt.getTime()).toBeGreaterThan(
+      Date.now() - 60_000,
+    );
+  });
+});

package/src/dispatch/stalled-sweeper.ts CHANGED Viewed

@@ -13,11 +13,22 @@
  *     the queue scheduler lost the job).
  *   - `kind: "trigger"` locks past `timeoutAt` fail the run with a
  *     clear "wait timed out" error.
+ *   - `kind: "until"` locks past `timeoutAt` apply the wait_until timeout
+ *     policy via `checkWaitUntil` (continue / fail). This is the BACKSTOP
+ *     for a lost timeout-timer job — a reactive `wait_until` is otherwise
+ *     event-driven (Stage-1 wake), with no periodic re-check (reactive
+ *     automation engine §7).
+ *
+ * And expired `for:` dwell timers whose `automation-dwell` queue job was
+ * lost: each is fired via `fireDwell` (which re-confirms state before
+ * starting the run). Idempotent via the dwell row's delete-on-fire.
  */
 import type { Logger } from "@checkstack/backend-api";
 import type { AutomationStore } from "../automation-store";
-import { recoverStalledRun, resumeRun } from "./engine";
+import { checkWaitUntil, recoverStalledRun, resumeRun } from "./engine";
+import { fireDwell } from "./dwell";
+import { startRunRespectingMode } from "./trigger-subscriber";
 import type { DispatchDeps } from "./types";
 export interface StalledSweeperArgs {
@@ -40,6 +51,15 @@ export interface StalledSweeper {
 const DEFAULT_STALE_MS = 60_000; // 1 minute
 const DEFAULT_INTERVAL_MS = 30_000; // every 30 seconds
+/**
+ * TTL for windowed-count occurrence rows. A row older than the maximum
+ * window any trigger can configure (the 1440-minute / 24h `WindowSchema`
+ * cap) can never contribute to an in-window count, so it is dead and prunable.
+ * Config-independent: pruning at the schema cap is always safe without
+ * reading any automation's actual window.
+ */
+const WINDOW_EVENT_TTL_MS = 24 * 60 * 60_000; // 24 hours (the WindowSchema cap)
 export function startStalledSweeper(
   args: StalledSweeperArgs,
 ): StalledSweeper {
@@ -47,8 +67,15 @@ export function startStalledSweeper(
   const intervalMs = args.intervalMs ?? DEFAULT_INTERVAL_MS;
   const sweep = async (): Promise<void> => {
-    await sweepStalledRuns(args, staleMs);
+    // Wait-aware sweeps run FIRST: they own `waiting` runs (delay / trigger
+    // / until expiry + resume). The stalled-run sweep is strictly for
+    // genuinely-`running` crashes and must not race ahead of them. (It now
+    // also filters to status='running', so it can't pick up a waiting run,
+    // but ordering keeps the wait paths authoritative within a cycle.)
     await sweepExpiredWaitLocks(args);
+    await sweepExpiredDwells(args);
+    await sweepExpiredWindowEvents(args);
+    await sweepStalledRuns(args, staleMs);
   };
   let timer: ReturnType<typeof setInterval> | undefined = setInterval(() => {
@@ -82,8 +109,8 @@ async function sweepStalledRuns(
   );
   for (const runId of stalled) {
-    const acquired = await args.deps.runStateStore.tryAdvisoryLock(runId);
-    if (!acquired) continue; // another instance already on it
+    const lock = await args.deps.runStateStore.tryAdvisoryLock(runId);
+    if (!lock) continue; // another instance already on it
     try {
       const run = await args.deps.runStore.loadRun(runId);
       if (!run) continue;
@@ -112,7 +139,7 @@ async function sweepStalledRuns(
         `automation sweeper failed to recover ${runId}: ${(error as Error).message}`,
       );
     } finally {
-      await args.deps.runStateStore.releaseAdvisoryLock(runId);
+      await lock.release();
     }
   }
 }
@@ -125,6 +152,40 @@ async function sweepExpiredWaitLocks(
   if (expired.length === 0) return;
   for (const lock of expired) {
+    if (lock.kind === "until") {
+      // Backstop for a lost timeout-timer job: apply the wait_until timeout
+      // policy via checkWaitUntil (it re-evaluates one last time, then
+      // resumes-or-fails per continue_on_timeout). Idempotent. A reactive
+      // `until` lock without a timeout has no `timeoutAt`, so it never lands
+      // here — it is purely event-driven (Stage-1 wake).
+      const run = await args.deps.runStore.loadRun(lock.runId);
+      if (!run) {
+        await args.deps.runStore.deleteWaitLock(lock.id);
+        continue;
+      }
+      const automation = await args.automationStore.getById(run.automationId);
+      if (!automation) {
+        await args.deps.runStore.deleteWaitLock(lock.id);
+        await args.deps.runStore.updateRunStatus(
+          lock.runId,
+          "failed",
+          "automation deleted while run was suspended on wait_until",
+        );
+        await args.deps.runStateStore.clear(lock.runId);
+        continue;
+      }
+      await checkWaitUntil(args.deps, {
+        runId: lock.runId,
+        waitLockId: lock.id,
+        automation: {
+          id: automation.id,
+          name: automation.name,
+          status: automation.status,
+          definition: automation.definition,
+        },
+      });
+      continue;
+    }
     if (lock.kind === "delay") {
       // The queue scheduler may have lost the job — wake the run
       // ourselves. Idempotent: resumeRun takes the advisory lock and
@@ -162,3 +223,49 @@ async function sweepExpiredWaitLocks(
     await args.deps.runStateStore.clear(lock.runId);
   }
 }
+/**
+ * Fires every dwell returned by `dwellStore.sweepExpired(now)` through
+ * `fireDwell`, starting runs with `startRunRespectingMode`.
+ *
+ * Dwells are processed one at a time. A failure while firing one dwell is
+ * logged as a warning and does not stop the remaining dwells from being
+ * processed. Returns without logging when nothing has expired.
+ */
+async function sweepExpiredDwells(
+  args: StalledSweeperArgs,
+): Promise<void> {
+  const now = new Date();
+  const expired = await args.deps.dwellStore.sweepExpired(now);
+  if (expired.length === 0) return;
+  args.logger.debug(
+    `automation sweeper: ${expired.length} expired dwell(s) detected`,
+  );
+  for (const dwell of expired) {
+    try {
+      await fireDwell({
+        deps: args.deps,
+        automationStore: args.automationStore,
+        dwell,
+        startRun: startRunRespectingMode,
+      });
+    } catch (error) {
+      // The caught value is `unknown`: anything other than an `Error` has no
+      // `.message`, so fall back to its string form instead of logging
+      // "undefined".
+      const reason = error instanceof Error ? error.message : String(error);
+      args.logger.warn(
+        `automation sweeper failed to fire dwell ${dwell.id}: ${reason}`,
+      );
+    }
+  }
+}
+/**
+ * Prune windowed-count occurrence rows older than the 24h `WindowSchema`
+ * cap. Such rows can never contribute to any in-window count, so the delete
+ * is config-independent and safe. A bulk indexed range delete (`pruneIdx`);
+ * idempotent and cheap when there's nothing to prune.
+ *
+ * The cutoff is `Date.now() - WINDOW_EVENT_TTL_MS`. A failure of the store
+ * call is logged as a warning and swallowed, so it never rejects the sweep.
+ */
+async function sweepExpiredWindowEvents(
+  args: StalledSweeperArgs,
+): Promise<void> {
+  const cutoff = new Date(Date.now() - WINDOW_EVENT_TTL_MS);
+  try {
+    await args.deps.windowStore.sweepExpired(cutoff);
+  } catch (error) {
+    // The caught value is `unknown`: anything other than an `Error` has no
+    // `.message`, so fall back to its string form instead of logging
+    // "undefined".
+    const reason = error instanceof Error ? error.message : String(error);
+    args.logger.warn(
+      `automation sweeper failed to prune window events: ${reason}`,
+    );
+  }
+}