npm - @checkstack/automation-backend - Versions diffs - 0.2.0 → 0.3.0 - Mend

@checkstack/automation-backend 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (125) hide show

package/CHANGELOG.md +544 -0
package/drizzle/0003_sparkling_xorn.sql +17 -0
package/drizzle/0004_cultured_spyke.sql +2 -0
package/drizzle/0005_classy_the_hand.sql +19 -0
package/drizzle/0006_burly_wallop.sql +10 -0
package/drizzle/0007_nappy_jackal.sql +1 -0
package/drizzle/0008_remove_seeded_auto_incident_automations.sql +13 -0
package/drizzle/0009_steady_liz_osborn.sql +12 -0
package/drizzle/0010_chunky_changeling.sql +2 -0
package/drizzle/meta/0003_snapshot.json +1007 -0
package/drizzle/meta/0004_snapshot.json +1028 -0
package/drizzle/meta/0005_snapshot.json +1164 -0
package/drizzle/meta/0006_snapshot.json +1261 -0
package/drizzle/meta/0007_snapshot.json +1215 -0
package/drizzle/meta/0008_snapshot.json +1215 -0
package/drizzle/meta/0009_snapshot.json +1328 -0
package/drizzle/meta/0010_snapshot.json +1349 -0
package/drizzle/meta/_journal.json +56 -0
package/package.json +23 -12
package/src/action-types.ts +23 -0
package/src/artifact-store.ts +16 -1
package/src/automation-store.test.ts +143 -0
package/src/automation-store.ts +30 -8
package/src/builtin-triggers.test.ts +77 -74
package/src/builtin-triggers.ts +105 -108
package/src/dispatch/action-kind.ts +2 -0
package/src/dispatch/assemble-get-service.ts +31 -0
package/src/dispatch/cancel-resurrect.test.ts +147 -0
package/src/dispatch/concurrency-race.test.ts +255 -0
package/src/dispatch/concurrency-scope.test.ts +166 -0
package/src/dispatch/condition.ts +24 -5
package/src/dispatch/dwell-queue.ts +65 -0
package/src/dispatch/dwell-store.ts +154 -0
package/src/dispatch/dwell.it.test.ts +142 -0
package/src/dispatch/dwell.test.ts +799 -0
package/src/dispatch/dwell.ts +257 -0
package/src/dispatch/engine.test.ts +189 -2
package/src/dispatch/engine.ts +555 -9
package/src/dispatch/entity-scope.test.ts +176 -0
package/src/dispatch/get-service-wiring.test.ts +318 -0
package/src/dispatch/numeric.test.ts +71 -0
package/src/dispatch/numeric.ts +96 -0
package/src/dispatch/render.test.ts +34 -0
package/src/dispatch/render.ts +31 -11
package/src/dispatch/reseed-run-secrets.ts +230 -0
package/src/dispatch/run-secret-registry.test.ts +189 -0
package/src/dispatch/run-secret-registry.ts +247 -0
package/src/dispatch/run-state-masking.test.ts +376 -0
package/src/dispatch/run-state-store.ts +95 -38
package/src/dispatch/run-state.ts +226 -59
package/src/dispatch/scope-artifact-masking.test.ts +138 -0
package/src/dispatch/secret-ref-ids.test.ts +19 -0
package/src/dispatch/secret-ref-ids.ts +17 -0
package/src/dispatch/snapshots.test.ts +86 -0
package/src/dispatch/snapshots.ts +79 -0
package/src/dispatch/stage1-router.test.ts +324 -0
package/src/dispatch/stage1-router.ts +152 -0
package/src/dispatch/stage1.it.test.ts +84 -0
package/src/dispatch/stage2-dispatch.test.ts +285 -0
package/src/dispatch/stage2-dispatch.ts +207 -0
package/src/dispatch/stage2-stalled.it.test.ts +132 -0
package/src/dispatch/stalled-sweeper.test.ts +197 -0
package/src/dispatch/stalled-sweeper.ts +112 -5
package/src/dispatch/state-scope.test.ts +234 -0
package/src/dispatch/state-scope.ts +322 -0
package/src/dispatch/structured-conditions.test.ts +246 -0
package/src/dispatch/structured-conditions.ts +146 -0
package/src/dispatch/test-fixtures.ts +306 -38
package/src/dispatch/trigger-fanin.test.ts +111 -0
package/src/dispatch/trigger-subscriber.ts +316 -14
package/src/dispatch/types.ts +263 -8
package/src/dispatch/wait-timeout-queue.ts +89 -0
package/src/dispatch/wait-until-entity-wake.test.ts +544 -0
package/src/dispatch/wait-until.test.ts +540 -0
package/src/dispatch/wake-refs.test.ts +158 -0
package/src/dispatch/wake-refs.ts +348 -0
package/src/dispatch/window-gate.test.ts +513 -0
package/src/dispatch/window-store.test.ts +162 -0
package/src/dispatch/window-store.ts +102 -0
package/src/entity/change-derivers.test.ts +148 -0
package/src/entity/change-derivers.ts +143 -0
package/src/entity/change-emitter.test.ts +66 -0
package/src/entity/change-emitter.ts +76 -0
package/src/entity/create-handle.ts +344 -0
package/src/entity/cross-pod-read-consistency.it.test.ts +281 -0
package/src/entity/define-entity.ts +157 -0
package/src/entity/diff.test.ts +57 -0
package/src/entity/diff.ts +54 -0
package/src/entity/entity-store.test.ts +30 -0
package/src/entity/entity-store.ts +171 -0
package/src/entity/extension-point.ts +56 -0
package/src/entity/fake-entity-store.ts +130 -0
package/src/entity/hook.ts +19 -0
package/src/entity/index.ts +50 -0
package/src/entity/mutate-handle.test.ts +517 -0
package/src/entity/on-entity-changed.test.ts +189 -0
package/src/entity/on-entity-changed.ts +214 -0
package/src/entity/registry.test.ts +181 -0
package/src/entity/registry.ts +200 -0
package/src/entity/stable-stringify.test.ts +55 -0
package/src/entity/stable-stringify.ts +49 -0
package/src/entity/wake-index.it.test.ts +251 -0
package/src/entity/with-entity-write.test.ts +100 -0
package/src/entity/with-entity-write.ts +69 -0
package/src/entity-driven-trigger.ts +46 -0
package/src/extension-points.ts +35 -0
package/src/gitops-docs.test.ts +215 -0
package/src/gitops-docs.ts +151 -0
package/src/gitops-kinds.test.ts +174 -0
package/src/gitops-kinds.ts +137 -0
package/src/index.ts +355 -11
package/src/migration/flapping-to-window.test.ts +123 -0
package/src/migration/flapping-to-window.ts +205 -0
package/src/router.test.ts +182 -1
package/src/router.ts +73 -2
package/src/schema.ts +236 -3
package/src/script-test-replay.test.ts +88 -0
package/src/script-test-replay.ts +100 -0
package/src/script-test-shell-env.test.ts +41 -0
package/src/script-test-shell-env.ts +89 -0
package/src/script-test.test.ts +386 -0
package/src/script-test.ts +258 -0
package/src/trigger-registry.ts +2 -0
package/src/validate-definition.test.ts +1 -0
package/tsconfig.json +24 -0

package/src/dispatch/concurrency-race.test.ts ADDED Viewed

@@ -0,0 +1,255 @@
+import { describe, expect, it } from "bun:test";
+import { SYSTEM_ACTOR } from "@checkstack/common";
+import { AutomationDefinitionSchema } from "@checkstack/automation-common";
+import type { AutomationStore } from "../automation-store";
+import { createActionRegistry } from "../action-registry";
+import { recoverStalledRun, resumeRun } from "./engine";
+import { handleTriggerFiring } from "./trigger-subscriber";
+import { makeDispatchDeps, makeRecordingAction, testPlugin } from "./test-fixtures";
+import type { DispatchDeps, LoadedAutomation } from "./types";
+const EVENT = "test.event"; // trigger event id shared by every automation and firing in this file
+/**
+ * Builds the fixture automation for the race test: `mode: "single"` with a
+ * lone `wait_for_trigger` step on the event `never.fires`, so a started run
+ * is expected to stay active for the rest of the test.
+ */
+function buildAutomation(): LoadedAutomation {
+  const name = "Race test";
+  const waitStep = { wait_for_trigger: { event: "never.fires" } };
+  return {
+    id: "auto-1",
+    name,
+    status: "enabled",
+    definition: AutomationDefinitionSchema.parse({
+      name,
+      triggers: [{ event: EVENT }],
+      conditions: [],
+      actions: [waitStep],
+      mode: "single",
+      max_runs: 10,
+    }),
+  };
+}
+/**
+ * Minimal `AutomationStore` stub: both enabled-automation lookups resolve to
+ * `[auto]`, the read methods return empty results, `delete` is a no-op, and
+ * `create` / `update` / `toggle` reject with `Error("nope")`.
+ */
+function storeFor(auto: LoadedAutomation): AutomationStore {
+  // Shared rejecting stub for the mutators the tests never expect to hit.
+  const reject = async (): Promise<never> => {
+    throw new Error("nope");
+  };
+  // Fresh single-element array on every call, like the original lookups.
+  const onlyAuto = async () => [auto];
+  return {
+    create: reject,
+    update: reject,
+    delete: async () => {},
+    toggle: reject,
+    getById: async () => undefined,
+    list: async () => ({ items: [], total: 0 }),
+    listGroups: async () => [],
+    findEnabledByTriggerEvent: onlyAuto,
+    listEnabled: onlyAuto,
+  };
+}
+/** Counts the runs in the fake run store whose status is `pending`, `running`, or `waiting`. */
+function activeCount(runs: ReturnType<typeof makeDispatchDeps>["runs"]): number {
+  let active = 0;
+  for (const run of runs.runs.values()) {
+    if (
+      run.status === "pending" ||
+      run.status === "running" ||
+      run.status === "waiting"
+    ) {
+      active += 1;
+    }
+  }
+  return active;
+}
+describe("M1 — concurrency check-then-create race (single mode)", () => {
+  it("two concurrent fires create exactly one run", async () => {
+    const actionsReg = createActionRegistry();
+    actionsReg.register(makeRecordingAction().definition, testPlugin);
+    const { deps, runs } = makeDispatchDeps({
+      actions: actionsReg,
+      withConcurrencyLock: true, // NOTE(review): presumably enables the lock the comment below calls "WITH the lock" — confirm in test-fixtures
+    });
+    const auto = buildAutomation();
+    // Widen the check-then-create window with a real async gap, so that
+    // WITHOUT serialization both fires can complete their "is a run active?"
+    // check before either has created its run — the exact interleaving that
+    // double-runs a single-mode automation. WITH the lock, the second fire
+    // blocks at lock-acquire and only checks after the first committed, so
+    // the gap is harmless. (Macrotask yield, not a 2-party barrier, so it
+    // works in both the locked and unlocked variants without deadlock.)
+    const realHasActiveRun = deps.runStore.hasActiveRun.bind(deps.runStore); // keep the original, bound to the run store
+    deps.runStore.hasActiveRun = async (automationId, contextKey) => {
+      const result = await realHasActiveRun(automationId, contextKey);
+      await new Promise((r) => setTimeout(r, 5)); // 5 ms timer: the async gap described above
+      return result;
+    };
+    const fire = () =>
+      handleTriggerFiring({
+        deps,
+        automationStore: storeFor(auto),
+        qualifiedEventId: EVENT,
+        triggerPayload: { id: "sys-1" },
+        actor: SYSTEM_ACTOR,
+        contextKey: "sys-1",
+      });
+    await Promise.all([fire(), fire()]); // both firings are started before either is awaited
+    expect(activeCount(runs)).toBe(1); // the two identical firings must leave exactly one active run
+  });
+});
+// ─── Resume-vs-recover same-run race ─────────────────────────────────────
+/**
+ * Test stand-in for a sweeper-driven recovery (modelled, per the original
+ * author's note, on how `stalled-sweeper.ts` wraps a recover): take the
+ * per-run advisory lock first, recover, and always release the lock.
+ *
+ * Resolves to `{ acted: false }` without calling `recoverStalledRun` when the
+ * lock is not granted, and to `{ acted: true }` once recovery has completed.
+ * Racing callers all go through `deps.runStateStore`, so the lock is the
+ * arbiter that lets exactly one of them execute.
+ */
+async function sweeperRecover(
+  deps: DispatchDeps,
+  args: { runId: string; automation: LoadedAutomation },
+): Promise<{ acted: boolean }> {
+  const held = await deps.runStateStore.tryAdvisoryLock(args.runId);
+  if (held) {
+    try {
+      await recoverStalledRun(deps, args);
+      return { acted: true };
+    } finally {
+      // Runs on success and on a thrown recover alike.
+      await held.release();
+    }
+  }
+  // Lock not granted: another caller is already working on this run.
+  return { acted: false };
+}
+/**
+ * Registers a fresh recording action on `actionsReg` and builds a single-mode
+ * automation whose only step is one `test.record` action (this definition
+ * contains no wait step).
+ *
+ * Returns the automation together with `recorded()`, which reports how many
+ * calls the recording action has captured so far.
+ */
+function recoverableAutomation(actionsReg: ReturnType<typeof createActionRegistry>): {
+  auto: LoadedAutomation;
+  recorded: () => number;
+} {
+  const recorder = makeRecordingAction();
+  actionsReg.register(recorder.definition, testPlugin);
+  const name = "Recover race";
+  const auto: LoadedAutomation = {
+    id: "auto-1",
+    name,
+    status: "enabled",
+    definition: AutomationDefinitionSchema.parse({
+      name,
+      triggers: [{ event: EVENT }],
+      conditions: [],
+      actions: [
+        { action: "test.record", config: { value: "after-recover" } },
+      ],
+      mode: "single",
+      max_runs: 10,
+    }),
+  };
+  return { auto, recorded: () => recorder.calls.length };
+}
+describe("M2 — resume-vs-recover same-run race (shared advisory lock)", () => {
+  it("two sweeper recoveries of one stalled run: exactly one executes", async () => {
+    const actionsReg = createActionRegistry();
+    const { auto, recorded } = recoverableAutomation(actionsReg);
+    const { deps, runs, state } = makeDispatchDeps({ actions: actionsReg });
+    // A genuinely-stalled run: status `running`, a persisted snapshot, and no
+    // wait lock — exactly what `recoverStalledRun` is allowed to re-walk.
+    const runId = "run-stalled";
+    runs.runs.set(runId, {
+      id: runId,
+      automationId: auto.id,
+      triggerId: "t",
+      triggerEventId: EVENT,
+      triggerPayload: {},
+      contextKey: null,
+      status: "running",
+      errorMessage: null,
+      startedAt: new Date(),
+      finishedAt: null,
+    });
+    state.states.set(runId, {
+      scopeSnapshot: { trigger: { id: "t", event: EVENT, payload: {} } },
+      lastActionPath: null, // crashed before the first step → from the top
+      lastHeartbeatAt: new Date(0), // Unix epoch, i.e. a heartbeat far in the past
+    });
+    // Two pods sweep the same stalled run at once; the shared `locks` Set
+    // (state.locks) arbitrates.
+    const [a, b] = await Promise.all([
+      sweeperRecover(deps, { runId, automation: auto }),
+      sweeperRecover(deps, { runId, automation: auto }),
+    ]);
+    expect([a.acted, b.acted].filter(Boolean)).toHaveLength(1); // exactly one caller reported `acted: true`
+    expect(recorded()).toBe(1); // the action ran exactly once
+    expect(runs.runs.get(runId)!.status).toBe("success");
+    expect(state.locks.size).toBe(0); // lock released by the winner
+  });
+  it("a resume racing a recover for the same waiting run: the wake wins, recover no-ops", async () => {
+    const actionsReg = createActionRegistry();
+    const recording = makeRecordingAction();
+    actionsReg.register(recording.definition, testPlugin);
+    const definition = AutomationDefinitionSchema.parse({
+      name: "Resume race",
+      triggers: [{ event: EVENT }],
+      conditions: [],
+      actions: [
+        { wait_for_trigger: { event: "wake.event" } },
+        { action: "test.record", config: { value: "post-wait" } },
+      ],
+      mode: "single",
+      max_runs: 10,
+    });
+    const auto: LoadedAutomation = {
+      id: "auto-1",
+      name: "Resume race",
+      status: "enabled",
+      definition,
+    };
+    const { deps, runs, state } = makeDispatchDeps({ actions: actionsReg });
+    // A run intentionally suspended at the wait: status `waiting`, snapshot at
+    // the wait, plus a wait lock. `resumeRun` (the wake path) owns it; a
+    // sweeper recover must refuse (status not `running` + a live wait lock).
+    const runId = "run-waiting";
+    runs.runs.set(runId, {
+      id: runId,
+      automationId: auto.id,
+      triggerId: "t",
+      triggerEventId: EVENT,
+      triggerPayload: {},
+      contextKey: null,
+      status: "waiting",
+      errorMessage: null,
+      startedAt: new Date(),
+      finishedAt: null,
+    });
+    state.states.set(runId, {
+      scopeSnapshot: { trigger: { id: "t", event: EVENT, payload: {} } },
+      lastActionPath: "actions[0]", // same path as the wait lock created below
+      lastHeartbeatAt: new Date(),
+    });
+    await deps.runStore.createWaitLock({
+      runId,
+      actionPath: "actions[0]",
+      kind: "trigger",
+      eventId: "wake.event",
+      contextKey: null,
+      filterTemplate: null,
+      timeoutAt: null,
+    });
+    // `recoverStalledRun` is invoked DIRECTLY (not under the sweeper lock
+    // wrapper) — faithful to production, where the sweeper only ever
+    // *recovers* `running` runs and never competes for a `waiting` run's
+    // lock (it filters on status and runs the wait paths first). So recover
+    // here must refuse on its own status / wait-lock guard, leaving the
+    // wake-driven `resumeRun` to own + complete the run.
+    const [resumeOut, recoverOut] = await Promise.all([
+      resumeRun(deps, {
+        runId,
+        automation: auto,
+        waitedAtPath: "actions[0]",
+      }),
+      recoverStalledRun(deps, { runId, automation: auto }),
+    ]);
+    // Recover refused (saw a non-`running` / wait-locked run); resume woke +
+    // completed it. The post-wait action ran EXACTLY once.
+    expect(recoverOut.status).toBe("waiting"); // refused, did not re-walk
+    expect(recording.calls).toHaveLength(1);
+    expect(resumeOut.status).toBe("success");
+    expect(runs.runs.get(runId)!.status).toBe("success");
+    expect(state.locks.size).toBe(0); // no advisory lock is left held
+  });
+});

package/src/dispatch/concurrency-scope.test.ts ADDED Viewed

@@ -0,0 +1,166 @@
+import { describe, it, expect } from "bun:test";
+import { SYSTEM_ACTOR } from "@checkstack/common";
+import {
+  AutomationDefinitionSchema,
+  type Automation,
+  type ConcurrencyScope,
+} from "@checkstack/automation-common";
+import type { AutomationStore } from "../automation-store";
+import { handleTriggerFiring } from "./trigger-subscriber";
+import { makeDispatchDeps, makeRecordingAction, testPlugin } from "./test-fixtures";
+import { createActionRegistry } from "../action-registry";
+import type { LoadedAutomation } from "./types";
+const EVENT = "test.event"; // trigger event id used by the fixture automation and every firing in this file
+/**
+ * Builds the fixture automation for the concurrency tests. Its only action
+ * is a `wait_for_trigger` on `never.fires`, so a started run is expected to
+ * remain `waiting` (active) and concurrency dedup can be observed.
+ *
+ * `scope` becomes `concurrency_scope`; `opts.mode` falls back to `"single"`
+ * and `opts.maxRuns` to `10` when nullish.
+ */
+function buildAutomation(
+  scope: ConcurrencyScope,
+  opts: { mode?: string; maxRuns?: number } = {},
+): Automation {
+  const name = "Concurrency test";
+  const mode = opts.mode ?? "single";
+  const maxRuns = opts.maxRuns ?? 10;
+  return {
+    id: "auto-1",
+    name,
+    status: "enabled",
+    definition: AutomationDefinitionSchema.parse({
+      name,
+      triggers: [{ event: EVENT }],
+      conditions: [],
+      actions: [{ wait_for_trigger: { event: "never.fires" } }],
+      mode,
+      concurrency_scope: scope,
+      max_runs: maxRuns,
+    }),
+    createdAt: new Date(),
+    updatedAt: new Date(),
+  };
+}
+/**
+ * `AutomationStore` stub wrapping a single automation: the enabled lookups
+ * return its `LoadedAutomation` projection, `getById` returns a full record
+ * only for the matching id, `list` returns the automation itself, `delete`
+ * is a no-op, and `create` / `update` / `toggle` reject with `Error("nope")`.
+ */
+function makeStore(auto: Automation): AutomationStore {
+  const { id, name, status, definition } = auto;
+  const loaded: LoadedAutomation = { id, name, status, definition };
+  // Shared rejecting stub for the mutators.
+  const reject = async (): Promise<never> => {
+    throw new Error("nope");
+  };
+  const onlyLoaded = async () => [loaded];
+  return {
+    create: reject,
+    update: reject,
+    delete: async () => {},
+    toggle: reject,
+    getById: async (requestedId) => {
+      if (requestedId !== auto.id) {
+        return undefined;
+      }
+      // Timestamps are regenerated on every lookup, as in the original stub.
+      return {
+        id: auto.id,
+        name: auto.name,
+        description: undefined,
+        status: auto.status,
+        definition: auto.definition,
+        managedBy: undefined,
+        createdAt: new Date(),
+        updatedAt: new Date(),
+      };
+    },
+    list: async () => ({ items: [auto], total: 1 }),
+    listGroups: async () => [],
+    findEnabledByTriggerEvent: onlyLoaded,
+    listEnabled: onlyLoaded,
+  };
+}
+/**
+ * Wires one test scenario: an action registry with a recording action,
+ * dispatch deps from `makeDispatchDeps`, and a store holding the fixture
+ * automation built from `scope` / `opts`.
+ *
+ * The returned `fire(systemId)` sends one `EVENT` firing through
+ * `handleTriggerFiring`, using `systemId` as both payload id and context key.
+ */
+function setup(
+  scope: ConcurrencyScope,
+  opts: { mode?: string; maxRuns?: number } = {},
+) {
+  const registry = createActionRegistry();
+  registry.register(makeRecordingAction().definition, testPlugin);
+  const { deps, runs } = makeDispatchDeps({ actions: registry });
+  const automationStore = makeStore(buildAutomation(scope, opts));
+  function fire(systemId: string) {
+    return handleTriggerFiring({
+      deps,
+      automationStore,
+      qualifiedEventId: EVENT,
+      triggerPayload: { id: systemId },
+      actor: SYSTEM_ACTOR,
+      contextKey: systemId,
+    });
+  }
+  return { deps, runs, fire };
+}
+/**
+ * Number of runs in the fake run store that are still active, i.e. whose
+ * status is `pending`, `running`, or `waiting` (a started run waits forever
+ * in these tests).
+ */
+function activeCount(
+  runs: ReturnType<typeof makeDispatchDeps>["runs"],
+): number {
+  const activeStatuses = new Set<string>(["pending", "running", "waiting"]);
+  let total = 0;
+  for (const { status } of runs.runs.values()) {
+    if (activeStatuses.has(status)) {
+      total += 1;
+    }
+  }
+  return total;
+}
+describe("concurrency_scope: automation (default)", () => {
+  it("single mode dedups across ALL systems (one active run total)", async () => {
+    const { runs, fire } = setup("automation");
+    await fire("sys-a"); // first firing: expected to start the only run
+    await fire("sys-b"); // different system, but per-automation single -> skipped
+    await fire("sys-a"); // same system again: still expected to be skipped
+    expect(activeCount(runs)).toBe(1);
+  });
+});
+describe("concurrency_scope: context_key", () => {
+  it("single mode dedups per system but runs different systems concurrently", async () => {
+    const { runs, fire } = setup("context_key");
+    await fire("sys-a"); // starts run for A
+    await fire("sys-b"); // starts run for B (different key)
+    await fire("sys-a"); // A already active -> deduped
+    await fire("sys-b"); // B already active -> deduped
+    // One active run per distinct system, no duplicates.
+    expect(activeCount(runs)).toBe(2);
+    const byContext = new Map<string | null, number>(); // active-run tally keyed by each run's contextKey
+    for (const r of runs.runs.values()) {
+      if (!["pending", "running", "waiting"].includes(r.status)) continue; // ignore runs that are no longer active
+      byContext.set(r.contextKey, (byContext.get(r.contextKey) ?? 0) + 1);
+    }
+    expect(byContext.get("sys-a")).toBe(1);
+    expect(byContext.get("sys-b")).toBe(1);
+  });
+});
+describe("concurrency modes (automation scope)", () => {
+  it("parallel mode allows up to max_runs concurrent runs, then caps", async () => {
+    const { runs, fire } = setup("automation", { mode: "parallel", maxRuns: 2 });
+    await fire("a");
+    await fire("b");
+    await fire("c"); // over the cap → skipped
+    expect(activeCount(runs)).toBe(2); // equals maxRuns
+  });
+  it("queued mode caps at max_runs (v1 behaves like parallel)", async () => {
+    const { runs, fire } = setup("automation", { mode: "queued", maxRuns: 1 });
+    await fire("a");
+    await fire("b"); // over the cap → skipped
+    expect(activeCount(runs)).toBe(1); // equals maxRuns
+  });
+  it("restart mode cancels the prior active run and starts fresh", async () => {
+    const { runs, fire } = setup("automation", { mode: "restart" });
+    await fire("a");
+    const firstId = [...runs.runs.values()][0]!.id; // only one run exists at this point
+    await fire("b"); // cancels the first, starts a new run
+    expect(runs.runs.get(firstId)?.status).toBe("cancelled");
+    // Exactly one active run (the fresh one).
+    expect(activeCount(runs)).toBe(1);
+  });
+});

package/src/dispatch/condition.ts CHANGED Viewed

@@ -1,14 +1,16 @@
 /**
  * Condition evaluation for the dispatch engine.
  *
- * Conditions come in two shapes from the schema:
+ * Conditions come in several shapes from the schema:
  *
  *   - A template string returning truthy/falsy.
  *   - A combinator object — `{ and: [...] }`, `{ or: [...] }`, or
  *     `{ not: condition }` — recursing into nested conditions.
+ *   - A structured variant — `{ numeric_state }`, `{ time }`, `{ state }`.
  *
- * Both forms eval against the current dispatch scope through the shared
- * template engine.
+ * All forms eval against the current dispatch scope through the shared
+ * template engine. Structured variants additionally compute a fresh `now`
+ * per evaluation (the `time` variant) rather than reading scope `now`.
  */
 import {
   evaluateBoolean,
@@ -18,6 +20,12 @@ import {
 } from "@checkstack/template-engine";
 import type { Condition } from "@checkstack/automation-common";
+import {
+  evaluateNumericStateCondition,
+  evaluateStateCondition,
+  evaluateTimeCondition,
+} from "./structured-conditions";
 /**
  * Evaluate a condition to boolean.
  *
@@ -40,8 +48,19 @@ export function evaluateCondition(
   if ("or" in condition) {
     return condition.or.some((c) => evaluateCondition(c, context, filters));
   }
-  // not
-  return !evaluateCondition(condition.not, context, filters);
+  if ("not" in condition) {
+    return !evaluateCondition(condition.not, context, filters);
+  }
+  if ("numeric_state" in condition) {
+    return evaluateNumericStateCondition(condition, context, filters);
+  }
+  if ("time" in condition) {
+    // Fresh `now` per evaluation (constraint 7) — time-of-day gating must
+    // never read the frozen scope timestamp.
+    return evaluateTimeCondition(condition, new Date());
+  }
+  // state
+  return evaluateStateCondition(condition, context);
 }
 /**

package/src/dispatch/dwell-queue.ts ADDED Viewed

@@ -0,0 +1,65 @@
+/**
+ * Queue consumer that fires `for:` dwell timers.
+ *
+ * `armDwell` persists a dwell row and enqueues an `automation-dwell` job
+ * with the matching `startDelay`. When the scheduler fires the job, this
+ * consumer:
+ *
+ *   1. Loads the dwell row; bails if it's gone (cancelled, re-armed under
+ *      a different id, or already fired) — the row is the source of truth.
+ *   2. Delegates to `fireDwell`, which re-confirms the matched state
+ *      still holds, then starts the run via `startRunRespectingMode`.
+ *
+ * The stalled sweeper also catches expired dwell rows in case the queue
+ * job is lost; both paths are idempotent via delete-on-fire.
+ */
+import type { Logger } from "@checkstack/backend-api";
+import type { AutomationStore } from "../automation-store";
+import { DWELL_QUEUE_NAME, fireDwell, type DwellFireJob } from "./dwell";
+import { startRunRespectingMode } from "./trigger-subscriber";
+import type { DispatchDeps } from "./types";
+export interface DwellQueueConsumerArgs {
+  deps: DispatchDeps; // supplies `queueManager` (to obtain the dwell queue) and `dwellStore` (row lookup)
+  automationStore: AutomationStore; // forwarded unchanged to `fireDwell`
+  logger: Logger; // receives a debug line when a job's dwell row no longer exists
+}
+export interface DwellQueueConsumer {
+  stop: () => Promise<void>; // the implementation in this file awaits `queue.stop()` on the dwell queue
+}
+/**
+ * Starts consuming the dwell queue (`DWELL_QUEUE_NAME`).
+ *
+ * For each job the dwell row named by `dwellId` is loaded. If the row is
+ * gone, the job is dropped with a debug log; otherwise the row is handed to
+ * `fireDwell` with `startRunRespectingMode` as the run starter.
+ *
+ * Resolves, once `consume` has been awaited, to a handle whose `stop()`
+ * awaits `queue.stop()`.
+ */
+export async function startDwellQueueConsumer(
+  args: DwellQueueConsumerArgs,
+): Promise<DwellQueueConsumer> {
+  const dwellQueue = args.deps.queueManager.getQueue<DwellFireJob>(
+    DWELL_QUEUE_NAME,
+  );
+  await dwellQueue.consume(
+    async ({ data }) => {
+      const { dwellId } = data;
+      const row = await args.deps.dwellStore.load(dwellId);
+      if (row) {
+        await fireDwell({
+          deps: args.deps,
+          automationStore: args.automationStore,
+          dwell: row,
+          startRun: startRunRespectingMode,
+        });
+        return;
+      }
+      // The row is the source of truth; without it there is nothing to fire.
+      args.logger.debug(
+        `dwell-queue: dwell ${dwellId} no longer exists (cancelled / re-armed / already fired)`,
+      );
+    },
+    { consumerGroup: "automation-dwell-fire", maxRetries: 3 },
+  );
+  const stop = async (): Promise<void> => {
+    await dwellQueue.stop();
+  };
+  return { stop };
+}

package/src/dispatch/dwell-store.ts ADDED Viewed

@@ -0,0 +1,154 @@
+/**
+ * Drizzle-backed implementation of `DwellStore` — pre-run `for:` dwell
+ * timers. Kept thin: each method maps almost 1:1 to a DB statement. The
+ * row is the source of truth; the queue job is just the wake signal and
+ * cancellation is a row delete (constraint 2).
+ */
+import { and, eq, isNull, lte } from "drizzle-orm";
+import type { SafeDatabase } from "@checkstack/backend-api";
+import { automationDwellTimers } from "../schema";
+import type { DwellStore, LoadedDwell, UpsertDwellInput } from "./types";
+type Schema = { automationDwellTimers: typeof automationDwellTimers }; // schema slice handed to `SafeDatabase` by `createDwellStore`
+/** Copies the fields of a selected `automationDwellTimers` row into a `LoadedDwell`. */
+function mapRow(row: typeof automationDwellTimers.$inferSelect): LoadedDwell {
+  const {
+    id,
+    automationId,
+    triggerId,
+    eventId,
+    contextKey,
+    armedStatus,
+    payloadSnapshot,
+    actorSnapshot,
+    fireAt,
+    createdAt,
+  } = row;
+  return {
+    id,
+    automationId,
+    triggerId,
+    eventId,
+    contextKey,
+    armedStatus,
+    payloadSnapshot,
+    actorSnapshot,
+    fireAt,
+    createdAt,
+  };
+}
+/**
+ * Predicate matching one dwell key `(automationId, triggerId, contextKey)`.
+ * A `null` context key is matched with `IS NULL` rather than an equality
+ * comparison.
+ */
+function keyWhere(
+  automationId: string,
+  triggerId: string,
+  contextKey: string | null,
+) {
+  const byAutomation = eq(automationDwellTimers.automationId, automationId);
+  const byTrigger = eq(automationDwellTimers.triggerId, triggerId);
+  const byContext =
+    contextKey === null
+      ? isNull(automationDwellTimers.contextKey)
+      : eq(automationDwellTimers.contextKey, contextKey);
+  return and(byAutomation, byTrigger, byContext);
+}
+/**
+ * Creates the Drizzle-backed `DwellStore` over the `automationDwellTimers`
+ * table. Each method issues one or a few statements against `db`; no state
+ * is kept in memory.
+ */
+export function createDwellStore(db: SafeDatabase<Schema>): DwellStore {
+  /** Reads at most one raw row for the `(automationId, triggerId, contextKey)` key. */
+  const selectByKey = async (
+    automationId: string,
+    triggerId: string,
+    contextKey: string | null,
+  ) => {
+    const [first] = await db
+      .select()
+      .from(automationDwellTimers)
+      .where(keyWhere(automationId, triggerId, contextKey))
+      .limit(1);
+    return first;
+  };
+  return {
+    async arm(input: UpsertDwellInput) {
+      const { automationId, triggerId, contextKey } = input;
+      // Insert-if-absent. An already-armed dwell for this key is returned
+      // untouched, so its original `fireAt` stands and the `for:` window
+      // measures "continuously matched since first arm" rather than "since
+      // the latest matching event". This lookup also handles a null context
+      // key, which ON CONFLICT cannot match because NULLs are distinct in a
+      // Postgres unique index.
+      // NOTE(review): for a null `contextKey`, two arms racing past this
+      // lookup would presumably both insert, since the conflict target
+      // cannot match — confirm against the table's unique index definition.
+      const existing = await selectByKey(automationId, triggerId, contextKey);
+      if (existing) {
+        return { id: existing.id, created: false, fireAt: existing.fireAt };
+      }
+      // Nothing armed yet: insert. ON CONFLICT DO NOTHING covers a concurrent
+      // arm that inserted between the lookup and this statement; `returning`
+      // is then empty and the winner's row is re-read below.
+      const [inserted] = await db
+        .insert(automationDwellTimers)
+        .values({
+          automationId: input.automationId,
+          triggerId: input.triggerId,
+          eventId: input.eventId,
+          contextKey: input.contextKey,
+          armedStatus: input.armedStatus,
+          payloadSnapshot: input.payloadSnapshot,
+          actorSnapshot: input.actorSnapshot,
+          fireAt: input.fireAt,
+        })
+        .onConflictDoNothing({
+          target: [
+            automationDwellTimers.automationId,
+            automationDwellTimers.triggerId,
+            automationDwellTimers.contextKey,
+          ],
+        })
+        .returning();
+      if (inserted) {
+        return { id: inserted.id, created: true, fireAt: inserted.fireAt };
+      }
+      // Lost the insert race: report the row the other arm created.
+      const winner = await selectByKey(automationId, triggerId, contextKey);
+      if (!winner) throw new Error("arm dwell: row vanished after conflict");
+      return { id: winner.id, created: false, fireAt: winner.fireAt };
+    },
+    async load(id) {
+      const [found] = await db
+        .select()
+        .from(automationDwellTimers)
+        .where(eq(automationDwellTimers.id, id))
+        .limit(1);
+      if (!found) return undefined;
+      return mapRow(found);
+    },
+    async findByKey(automationId, triggerId, contextKey) {
+      const found = await selectByKey(automationId, triggerId, contextKey);
+      if (!found) return undefined;
+      return mapRow(found);
+    },
+    async delete(id) {
+      // `RETURNING id` turns the delete into an atomic claim: only the caller
+      // whose statement actually removed the row gets a row back; any other
+      // concurrent caller gets an empty result. `fireDwell` relies on this so
+      // two pods (or the sweeper and the queue consumer) cannot both fire the
+      // same dwell.
+      const removed = await db
+        .delete(automationDwellTimers)
+        .where(eq(automationDwellTimers.id, id))
+        .returning({ id: automationDwellTimers.id });
+      return removed.length > 0;
+    },
+    async deleteByKey(automationId, triggerId, contextKey) {
+      const matchesKey = keyWhere(automationId, triggerId, contextKey);
+      await db.delete(automationDwellTimers).where(matchesKey);
+    },
+    async deleteForAutomation(automationId) {
+      const ownedByAutomation = eq(
+        automationDwellTimers.automationId,
+        automationId,
+      );
+      await db.delete(automationDwellTimers).where(ownedByAutomation);
+    },
+    async sweepExpired(now) {
+      // Every row whose `fireAt` is at or before `now`.
+      const due = await db
+        .select()
+        .from(automationDwellTimers)
+        .where(lte(automationDwellTimers.fireAt, now));
+      return due.map((dueRow) => mapRow(dueRow));
+    },
+  };
+}