npm - @checkstack/automation-backend - Versions diffs - 0.2.0 → 0.3.0 - Mend

@checkstack/automation-backend 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (125)

package/CHANGELOG.md +544 -0
package/drizzle/0003_sparkling_xorn.sql +17 -0
package/drizzle/0004_cultured_spyke.sql +2 -0
package/drizzle/0005_classy_the_hand.sql +19 -0
package/drizzle/0006_burly_wallop.sql +10 -0
package/drizzle/0007_nappy_jackal.sql +1 -0
package/drizzle/0008_remove_seeded_auto_incident_automations.sql +13 -0
package/drizzle/0009_steady_liz_osborn.sql +12 -0
package/drizzle/0010_chunky_changeling.sql +2 -0
package/drizzle/meta/0003_snapshot.json +1007 -0
package/drizzle/meta/0004_snapshot.json +1028 -0
package/drizzle/meta/0005_snapshot.json +1164 -0
package/drizzle/meta/0006_snapshot.json +1261 -0
package/drizzle/meta/0007_snapshot.json +1215 -0
package/drizzle/meta/0008_snapshot.json +1215 -0
package/drizzle/meta/0009_snapshot.json +1328 -0
package/drizzle/meta/0010_snapshot.json +1349 -0
package/drizzle/meta/_journal.json +56 -0
package/package.json +23 -12
package/src/action-types.ts +23 -0
package/src/artifact-store.ts +16 -1
package/src/automation-store.test.ts +143 -0
package/src/automation-store.ts +30 -8
package/src/builtin-triggers.test.ts +77 -74
package/src/builtin-triggers.ts +105 -108
package/src/dispatch/action-kind.ts +2 -0
package/src/dispatch/assemble-get-service.ts +31 -0
package/src/dispatch/cancel-resurrect.test.ts +147 -0
package/src/dispatch/concurrency-race.test.ts +255 -0
package/src/dispatch/concurrency-scope.test.ts +166 -0
package/src/dispatch/condition.ts +24 -5
package/src/dispatch/dwell-queue.ts +65 -0
package/src/dispatch/dwell-store.ts +154 -0
package/src/dispatch/dwell.it.test.ts +142 -0
package/src/dispatch/dwell.test.ts +799 -0
package/src/dispatch/dwell.ts +257 -0
package/src/dispatch/engine.test.ts +189 -2
package/src/dispatch/engine.ts +555 -9
package/src/dispatch/entity-scope.test.ts +176 -0
package/src/dispatch/get-service-wiring.test.ts +318 -0
package/src/dispatch/numeric.test.ts +71 -0
package/src/dispatch/numeric.ts +96 -0
package/src/dispatch/render.test.ts +34 -0
package/src/dispatch/render.ts +31 -11
package/src/dispatch/reseed-run-secrets.ts +230 -0
package/src/dispatch/run-secret-registry.test.ts +189 -0
package/src/dispatch/run-secret-registry.ts +247 -0
package/src/dispatch/run-state-masking.test.ts +376 -0
package/src/dispatch/run-state-store.ts +95 -38
package/src/dispatch/run-state.ts +226 -59
package/src/dispatch/scope-artifact-masking.test.ts +138 -0
package/src/dispatch/secret-ref-ids.test.ts +19 -0
package/src/dispatch/secret-ref-ids.ts +17 -0
package/src/dispatch/snapshots.test.ts +86 -0
package/src/dispatch/snapshots.ts +79 -0
package/src/dispatch/stage1-router.test.ts +324 -0
package/src/dispatch/stage1-router.ts +152 -0
package/src/dispatch/stage1.it.test.ts +84 -0
package/src/dispatch/stage2-dispatch.test.ts +285 -0
package/src/dispatch/stage2-dispatch.ts +207 -0
package/src/dispatch/stage2-stalled.it.test.ts +132 -0
package/src/dispatch/stalled-sweeper.test.ts +197 -0
package/src/dispatch/stalled-sweeper.ts +112 -5
package/src/dispatch/state-scope.test.ts +234 -0
package/src/dispatch/state-scope.ts +322 -0
package/src/dispatch/structured-conditions.test.ts +246 -0
package/src/dispatch/structured-conditions.ts +146 -0
package/src/dispatch/test-fixtures.ts +306 -38
package/src/dispatch/trigger-fanin.test.ts +111 -0
package/src/dispatch/trigger-subscriber.ts +316 -14
package/src/dispatch/types.ts +263 -8
package/src/dispatch/wait-timeout-queue.ts +89 -0
package/src/dispatch/wait-until-entity-wake.test.ts +544 -0
package/src/dispatch/wait-until.test.ts +540 -0
package/src/dispatch/wake-refs.test.ts +158 -0
package/src/dispatch/wake-refs.ts +348 -0
package/src/dispatch/window-gate.test.ts +513 -0
package/src/dispatch/window-store.test.ts +162 -0
package/src/dispatch/window-store.ts +102 -0
package/src/entity/change-derivers.test.ts +148 -0
package/src/entity/change-derivers.ts +143 -0
package/src/entity/change-emitter.test.ts +66 -0
package/src/entity/change-emitter.ts +76 -0
package/src/entity/create-handle.ts +344 -0
package/src/entity/cross-pod-read-consistency.it.test.ts +281 -0
package/src/entity/define-entity.ts +157 -0
package/src/entity/diff.test.ts +57 -0
package/src/entity/diff.ts +54 -0
package/src/entity/entity-store.test.ts +30 -0
package/src/entity/entity-store.ts +171 -0
package/src/entity/extension-point.ts +56 -0
package/src/entity/fake-entity-store.ts +130 -0
package/src/entity/hook.ts +19 -0
package/src/entity/index.ts +50 -0
package/src/entity/mutate-handle.test.ts +517 -0
package/src/entity/on-entity-changed.test.ts +189 -0
package/src/entity/on-entity-changed.ts +214 -0
package/src/entity/registry.test.ts +181 -0
package/src/entity/registry.ts +200 -0
package/src/entity/stable-stringify.test.ts +55 -0
package/src/entity/stable-stringify.ts +49 -0
package/src/entity/wake-index.it.test.ts +251 -0
package/src/entity/with-entity-write.test.ts +100 -0
package/src/entity/with-entity-write.ts +69 -0
package/src/entity-driven-trigger.ts +46 -0
package/src/extension-points.ts +35 -0
package/src/gitops-docs.test.ts +215 -0
package/src/gitops-docs.ts +151 -0
package/src/gitops-kinds.test.ts +174 -0
package/src/gitops-kinds.ts +137 -0
package/src/index.ts +355 -11
package/src/migration/flapping-to-window.test.ts +123 -0
package/src/migration/flapping-to-window.ts +205 -0
package/src/router.test.ts +182 -1
package/src/router.ts +73 -2
package/src/schema.ts +236 -3
package/src/script-test-replay.test.ts +88 -0
package/src/script-test-replay.ts +100 -0
package/src/script-test-shell-env.test.ts +41 -0
package/src/script-test-shell-env.ts +89 -0
package/src/script-test.test.ts +386 -0
package/src/script-test.ts +258 -0
package/src/trigger-registry.ts +2 -0
package/src/validate-definition.test.ts +1 -0
package/tsconfig.json +24 -0

package/src/dispatch/run-state-store.ts — CHANGED

@@ -11,10 +11,15 @@
  * at a time. The lock auto-releases when the holding connection dies —
  * exactly what we want during crash recovery.
  */
-import { lt, eq, sql } from "drizzle-orm";
-import type { SafeDatabase } from "@checkstack/backend-api";
+import { and, eq, lt } from "drizzle-orm";
+import type {
+  AdvisoryLockHandle,
+  AdvisoryLockService,
+  SafeDatabase,
+} from "@checkstack/backend-api";
-import { automationRunState } from "../schema";
+import { automationRunState, automationRuns } from "../schema";
+import type { RunSecretRegistry } from "./run-secret-registry";
 export interface RunStateSnapshot {
   scopeSnapshot: Record<string, unknown>;
@@ -27,11 +32,18 @@ export interface RunStateStore {
    * Write or update the per-run durable state. `lastActionPath` is the
    * path of the most recently completed action — resume walks the tree
    * looking for this path and treats the action at it as already done.
+   *
+   * Omitting `lastActionPath` (vs. passing `null`) on an UPDATE preserves
+   * the existing checkpoint. This matters at suspend-finalisation: the
+   * checkpoint written by the suspending action (its real path) must
+   * survive so a crash-recovery resumes from it rather than re-walking
+   * from `actions[0]`. Passing `null` explicitly still clobbers it (used
+   * only for the initial pre-first-step snapshot).
    */
   upsert(input: {
     runId: string;
     scopeSnapshot: Record<string, unknown>;
-    lastActionPath: string | null;
+    lastActionPath?: string | null;
   }): Promise<void>;
   load(runId: string): Promise<RunStateSnapshot | undefined>;
@@ -43,46 +55,89 @@ export interface RunStateStore {
   heartbeat(runId: string): Promise<void>;
   /**
-   * Run ids whose heartbeat is older than `threshold`. Returned in
-   * heartbeat-ascending order so the sweeper processes the most
-   * stale first.
+   * Run ids of `status = 'running'` runs whose heartbeat is older than
+   * `threshold`. Returned in heartbeat-ascending order so the sweeper
+   * processes the most stale first.
+   *
+   * The status filter is load-bearing: `waiting` runs (suspended on a
+   * `delay` / `wait_for_trigger` / `wait_until`) keep their state row but
+   * are NOT stalled - they are owned by the wait-lock / queue resume
+   * paths. Returning them here would let the sweeper re-walk an
+   * intentional wait every cycle, re-firing pre-wait side effects and
+   * leaking wait locks. Only a `running` run whose heartbeat went cold is
+   * a genuine crash.
    */
   findStalledRunIds(threshold: Date): Promise<string[]>;
   /**
-   * Try to acquire a Postgres session-level advisory lock for the run.
-   * Returns true on acquisition. The lock auto-releases when the holding
-   * DB session closes (e.g. on process crash), so dead instances don't
+   * Try to acquire a Postgres session-level advisory lock for the run on a
+   * dedicated pooled client. Returns a handle on acquisition (release it in
+   * a `finally`), or `null` if another instance already holds it.
+   *
+   * A dedicated client is required because the lock is held across the whole
+   * resume (which executes the run's actions — potentially long and
+   * involving external calls), so a transaction-scoped lock would mean a
+   * minutes-long open transaction. The session lock auto-releases when the
+   * holding connection dies (e.g. on process crash), so dead instances don't
    * leak locks.
    */
-  tryAdvisoryLock(runId: string): Promise<boolean>;
-  /** Release a previously-acquired advisory lock. */
-  releaseAdvisoryLock(runId: string): Promise<void>;
+  tryAdvisoryLock(runId: string): Promise<AdvisoryLockHandle | null>;
 }
-type Schema = { automationRunState: typeof automationRunState };
+type Schema = {
+  automationRunState: typeof automationRunState;
+  automationRuns: typeof automationRuns;
+};
+/** Namespace run locks in the global advisory-lock space. */
+function runLockKey(runId: string): string {
+  return `automation.run:${runId}`;
+}
 export function createRunStateStore(
   db: SafeDatabase<Schema>,
+  advisoryLock: AdvisoryLockService,
+  /**
+   * Run-scoped secret values accumulated during dispatch. When provided,
+   * the persisted `scopeSnapshot` is masked (Jenkins-style, by-value)
+   * BEFORE write — so a resolved connection credential threaded into
+   * `scope.variables` / `scope.artifacts` can't reach a replay reader
+   * (`getRunScopeForReplay`) unmasked. The registry is in-memory and gone
+   * by replay time, so persist-time is the only place masking can happen.
+   * Optional so tests / older boots degrade to no masking.
+   */
+  secretRegistry?: RunSecretRegistry,
 ): RunStateStore {
   return {
     async upsert(input) {
+      // Mask the scope snapshot at the persistence choke point — same
+      // pattern the run store uses for step / run output.
+      const maskedScope = (secretRegistry?.maskDeep(
+        input.runId,
+        input.scopeSnapshot,
+      ) ?? input.scopeSnapshot) as Record<string, unknown>;
+      // Omitting `lastActionPath` preserves the existing checkpoint on an
+      // UPDATE (so a suspend-finalisation doesn't clobber the suspending
+      // action's path to null). The INSERT still needs a value, so a fresh
+      // row defaults to null.
+      const updateSet: Record<string, unknown> = {
+        scopeSnapshot: maskedScope,
+        lastHeartbeatAt: new Date(),
+        updatedAt: new Date(),
+      };
+      if (input.lastActionPath !== undefined) {
+        updateSet.lastActionPath = input.lastActionPath;
+      }
       await db
         .insert(automationRunState)
         .values({
           runId: input.runId,
-          scopeSnapshot: input.scopeSnapshot,
-          lastActionPath: input.lastActionPath,
+          scopeSnapshot: maskedScope,
+          lastActionPath: input.lastActionPath ?? null,
         })
         .onConflictDoUpdate({
           target: automationRunState.runId,
-          set: {
-            scopeSnapshot: input.scopeSnapshot,
-            lastActionPath: input.lastActionPath,
-            lastHeartbeatAt: new Date(),
-            updatedAt: new Date(),
-          },
+          set: updateSet,
         });
     },
@@ -115,29 +170,31 @@ export function createRunStateStore(
     },
     async findStalledRunIds(threshold) {
+      // Join the run row so we only return runs that are actually
+      // `running`. A `waiting` run keeps its state snapshot but must NOT
+      // be re-walked by the sweeper - it is owned by the wait-lock /
+      // queue resume paths.
       const rows = await db
         .select({ runId: automationRunState.runId })
         .from(automationRunState)
-        .where(lt(automationRunState.lastHeartbeatAt, threshold))
+        .innerJoin(
+          automationRuns,
+          eq(automationRuns.id, automationRunState.runId),
+        )
+        .where(
+          and(
+            lt(automationRunState.lastHeartbeatAt, threshold),
+            eq(automationRuns.status, "running"),
+          ),
+        )
         .orderBy(automationRunState.lastHeartbeatAt);
       return rows.map((r) => r.runId);
     },
     async tryAdvisoryLock(runId) {
-      // hashtextextended returns int8 in Postgres, which pg_try_advisory_lock
-      // accepts directly. Using a deterministic hash means the same runId
-      // always maps to the same lock key across processes.
-      const result = await db.execute<{ ok: boolean }>(sql`
-        SELECT pg_try_advisory_lock(hashtextextended(${runId}, 0)) AS ok
-      `);
-      const rows = result as unknown as { rows: Array<{ ok: boolean }> };
-      return Boolean(rows.rows?.[0]?.ok);
-    },
-    async releaseAdvisoryLock(runId) {
-      await db.execute(sql`
-        SELECT pg_advisory_unlock(hashtextextended(${runId}, 0))
-      `);
+      // Acquire on a dedicated client (see interface doc) — the lock is held
+      // for the whole resume, so it must not ride a long-open transaction.
+      return advisoryLock.tryAcquire(runLockKey(runId));
     },
   };
 }

package/src/dispatch/run-state.ts — CHANGED

@@ -8,32 +8,83 @@
  * trigger subscriber).
  */
 import { and, desc, eq, inArray, isNotNull, isNull, lte, sql } from "drizzle-orm";
-import type { SafeDatabase } from "@checkstack/backend-api";
+import type { Logger, SafeDatabase } from "@checkstack/backend-api";
 import {
+  automationRunState,
   automationRunSteps,
   automationRuns,
   automationWaitLocks,
+  automationWakeIndex,
 } from "../schema";
 import type {
   CreateRunInput,
   CreateStepInput,
   CreateWaitLockInput,
+  CreateWaitLockWithRefsInput,
   LoadedRun,
   LoadedStep,
   LoadedWaitLock,
   RunStore,
+  WaitLockKind,
 } from "./types";
+import { parseWaitConfig } from "./snapshots";
+import type { RunSecretRegistry } from "./run-secret-registry";
 type Schema = {
   automationRuns: typeof automationRuns;
   automationRunSteps: typeof automationRunSteps;
   automationWaitLocks: typeof automationWaitLocks;
+  automationRunState: typeof automationRunState;
+  automationWakeIndex: typeof automationWakeIndex;
 };
+/** The kind-level wildcard ref for a `${kind}:${id}` ref. */
+function wildcardRefFor(ref: string): string {
+  const colon = ref.indexOf(":");
+  const kind = colon === -1 ? ref : ref.slice(0, colon);
+  return `${kind}:*`;
+}
 const ACTIVE_STATUSES = ["pending", "running", "waiting"] as const;
-export function createRunStore(db: SafeDatabase<Schema>): RunStore {
+/**
+ * Predicate for "active runs of this automation". When `contextKey` is
+ * `undefined` the filter is per-automation (the default concurrency
+ * scope); when provided (string or `null`) it additionally narrows to
+ * that context key (the per-context-key scope) - `null` matches runs
+ * with no context key.
+ */
+function activeRunsPredicate(
+  automationId: string,
+  contextKey: string | null | undefined,
+) {
+  const conditions = [
+    eq(automationRuns.automationId, automationId),
+    inArray(automationRuns.status, [...ACTIVE_STATUSES]),
+  ];
+  if (contextKey !== undefined) {
+    conditions.push(
+      contextKey === null
+        ? isNull(automationRuns.contextKey)
+        : eq(automationRuns.contextKey, contextKey),
+    );
+  }
+  return and(...conditions);
+}
+export function createRunStore(
+  db: SafeDatabase<Schema>,
+  logger?: Logger,
+  /**
+   * Run-scoped secret values accumulated during dispatch. When provided,
+   * step `resultPayload` / `errorMessage` and run-level `errorMessage` are
+   * masked (Jenkins-style, by-value) BEFORE persistence, so no resolved
+   * secret can reach a DTO / run-detail page. Optional so tests / older
+   * boots degrade to no masking.
+   */
+  secretRegistry?: RunSecretRegistry,
+): RunStore {
   return {
     async createRun(input: CreateRunInput): Promise<string> {
       const [row] = await db
@@ -57,14 +108,22 @@ export function createRunStore(db: SafeDatabase<Schema>): RunStore {
         status === "failed" ||
         status === "cancelled" ||
         status === "skipped";
+      // Mask the run-level error before persisting (a provider HTTP error
+      // could embed a resolved credential).
+      const maskedError =
+        errorMessage === undefined
+          ? null
+          : (secretRegistry?.maskText(runId, errorMessage) ?? errorMessage);
       await db
         .update(automationRuns)
         .set({
           status,
-          errorMessage: errorMessage ?? null,
+          errorMessage: maskedError,
           finishedAt: isTerminal ? new Date() : null,
         })
         .where(eq(automationRuns.id, runId));
+      // Drop the run's accumulated mask set once it is terminal (memory-only).
+      if (isTerminal) secretRegistry?.drop(runId);
     },
     async loadRun(runId: string): Promise<LoadedRun | undefined> {
@@ -89,29 +148,25 @@ export function createRunStore(db: SafeDatabase<Schema>): RunStore {
       };
     },
-    async countActiveRuns(automationId: string): Promise<number> {
+    async countActiveRuns(
+      automationId: string,
+      contextKey?: string | null,
+    ): Promise<number> {
       const rows = await db
         .select({ count: sql<number>`count(*)::int` })
         .from(automationRuns)
-        .where(
-          and(
-            eq(automationRuns.automationId, automationId),
-            inArray(automationRuns.status, [...ACTIVE_STATUSES]),
-          ),
-        );
+        .where(activeRunsPredicate(automationId, contextKey));
       return rows[0]?.count ?? 0;
     },
-    async hasActiveRun(automationId: string): Promise<boolean> {
+    async hasActiveRun(
+      automationId: string,
+      contextKey?: string | null,
+    ): Promise<boolean> {
       const rows = await db
         .select({ id: automationRuns.id })
         .from(automationRuns)
-        .where(
-          and(
-            eq(automationRuns.automationId, automationId),
-            inArray(automationRuns.status, [...ACTIVE_STATUSES]),
-          ),
-        )
+        .where(activeRunsPredicate(automationId, contextKey))
         .limit(1);
       return rows.length > 0;
     },
@@ -119,6 +174,7 @@ export function createRunStore(db: SafeDatabase<Schema>): RunStore {
     async cancelActiveRuns(
       automationId: string,
       reason: string,
+      contextKey?: string | null,
     ): Promise<string[]> {
       const rows = await db
         .update(automationRuns)
@@ -127,14 +183,26 @@ export function createRunStore(db: SafeDatabase<Schema>): RunStore {
           errorMessage: reason,
           finishedAt: new Date(),
         })
-        .where(
-          and(
-            eq(automationRuns.automationId, automationId),
-            inArray(automationRuns.status, [...ACTIVE_STATUSES]),
-          ),
-        )
+        .where(activeRunsPredicate(automationId, contextKey))
         .returning({ id: automationRuns.id });
-      return rows.map((r) => r.id);
+      const ids = rows.map((r) => r.id);
+      // Tear down the cancelled runs' suspension state in the SAME
+      // operation: delete their wait locks and durable run-state so a
+      // later wake (wakeWaitingRuns / delay-expiry / a racing queue job)
+      // can't resurrect a cancelled run. Mirrors the operator cancelRun
+      // path. (resumeRun also guards on status, but cleaning up here stops
+      // the sweeper from even re-ticking an orphaned lock.)
+      if (ids.length > 0) {
+        await db
+          .delete(automationWaitLocks)
+          .where(inArray(automationWaitLocks.runId, ids));
+        await db
+          .delete(automationRunState)
+          .where(inArray(automationRunState.runId, ids));
+        // Drop each run's in-memory mask set (terminal).
+        for (const id of ids) secretRegistry?.drop(id);
+      }
+      return ids;
     },
     async createStep(input: CreateStepInput): Promise<string> {
@@ -151,6 +219,9 @@ export function createRunStore(db: SafeDatabase<Schema>): RunStore {
         })
         .returning({ id: automationRunSteps.id });
       if (!row) throw new Error("createStep: insert returned no rows");
+      // Link the step to its run so updateStep (which carries only stepId)
+      // can find the run's mask set.
+      secretRegistry?.linkStep(row.id, input.runId);
       return row.id;
     },
@@ -159,10 +230,23 @@ export function createRunStore(db: SafeDatabase<Schema>): RunStore {
         patch.status === "success" ||
         patch.status === "failed" ||
         patch.status === "skipped";
+      // Mask resolved secret values out of the step output BEFORE persist —
+      // this is the run-wide choke point covering ALL actions (provider,
+      // log, etc.), not just the script/collector source-side masking.
+      const maskedError =
+        patch.errorMessage === undefined
+          ? null
+          : (secretRegistry?.maskTextForStep(stepId, patch.errorMessage) ??
+            patch.errorMessage);
+      const maskedPayload =
+        patch.resultPayload === undefined
+          ? null
+          : (secretRegistry?.maskDeepForStep(stepId, patch.resultPayload) ??
+            patch.resultPayload);
       const set: Record<string, unknown> = {
         status: patch.status,
-        errorMessage: patch.errorMessage ?? null,
-        resultPayload: patch.resultPayload ?? null,
+        errorMessage: maskedError,
+        resultPayload: maskedPayload,
       };
       if (isTerminal) set.finishedAt = new Date();
       if (patch.incrementAttempts) {
@@ -214,12 +298,58 @@ export function createRunStore(db: SafeDatabase<Schema>): RunStore {
           contextKey: input.contextKey,
           filterTemplate: input.filterTemplate,
           timeoutAt: input.timeoutAt,
+          // Serialisation boundary: UntilWaitConfig is a plain JSON object
+          // but its `condition` union isn't structurally a Record, so cast.
+          waitConfig: input.waitConfig
+            ? (input.waitConfig as unknown as Record<string, unknown>)
+            : undefined,
         })
         .returning({ id: automationWaitLocks.id });
       if (!row) throw new Error("createWaitLock: insert returned no rows");
       return row.id;
     },
+    async createWaitLockWithWakeRefs(
+      input: CreateWaitLockWithRefsInput,
+    ): Promise<string> {
+      return db.transaction(async (tx) => {
+        const [row] = await tx
+          .insert(automationWaitLocks)
+          .values({
+            runId: input.runId,
+            actionPath: input.actionPath,
+            kind: "until",
+            eventId: input.eventId,
+            contextKey: input.contextKey,
+            filterTemplate: null,
+            timeoutAt: input.timeoutAt,
+            // Serialisation boundary — see createWaitLock.
+            waitConfig: input.waitConfig as unknown as Record<string, unknown>,
+          })
+          .returning({ id: automationWaitLocks.id });
+        if (!row) {
+          throw new Error("createWaitLockWithWakeRefs: insert returned no rows");
+        }
+        // De-dupe refs in-process before the insert (the unique index is the
+        // cross-process arm-race guard; this keeps the VALUES list tight).
+        const uniqueRefs = [...new Set(input.wakeRefs)];
+        if (uniqueRefs.length > 0) {
+          await tx
+            .insert(automationWakeIndex)
+            .values(
+              uniqueRefs.map((ref) => ({ waitLockId: row.id, ref })),
+            )
+            .onConflictDoNothing({
+              target: [
+                automationWakeIndex.waitLockId,
+                automationWakeIndex.ref,
+              ],
+            });
+        }
+        return row.id;
+      });
+    },
     async loadWaitLock(id) {
       const rows = await db
         .select()
@@ -228,17 +358,7 @@ export function createRunStore(db: SafeDatabase<Schema>): RunStore {
         .limit(1);
       const row = rows[0];
       if (!row) return;
-      return {
-        id: row.id,
-        runId: row.runId,
-        actionPath: row.actionPath,
-        kind: row.kind as "trigger" | "delay",
-        eventId: row.eventId,
-        contextKey: row.contextKey,
-        filterTemplate: row.filterTemplate,
-        timeoutAt: row.timeoutAt,
-        createdAt: row.createdAt,
-      };
+      return mapWaitLock(row, logger);
     },
     async findWaitLocksFor(
@@ -255,17 +375,48 @@ export function createRunStore(db: SafeDatabase<Schema>): RunStore {
         .select()
         .from(automationWaitLocks)
         .where(and(...filters));
-      return rows.map((r) => ({
-        id: r.id,
-        runId: r.runId,
-        actionPath: r.actionPath,
-        kind: r.kind as "trigger" | "delay",
-        eventId: r.eventId,
-        contextKey: r.contextKey,
-        filterTemplate: r.filterTemplate,
-        timeoutAt: r.timeoutAt,
-        createdAt: r.createdAt,
-      }));
+      return rows.map((r) => mapWaitLock(r, logger));
+    },
+    async findWaitLocksByWakeRef(ref: string): Promise<LoadedWaitLock[]> {
+      // The generalized form of findWaitLocksFor: join the wake-index onto
+      // the wait locks and match the exact ref OR the kind-level wildcard.
+      const wildcard = wildcardRefFor(ref);
+      const rows = await db
+        .select({ lock: automationWaitLocks })
+        .from(automationWaitLocks)
+        .innerJoin(
+          automationWakeIndex,
+          eq(automationWakeIndex.waitLockId, automationWaitLocks.id),
+        )
+        .where(
+          and(
+            eq(automationWaitLocks.kind, "until"),
+            inArray(automationWakeIndex.ref, [ref, wildcard]),
+          ),
+        );
+      // A wait may match on both the exact ref and the wildcard; de-dupe by id.
+      const byId = new Map<string, LoadedWaitLock>();
+      for (const r of rows) {
+        if (!byId.has(r.lock.id)) byId.set(r.lock.id, mapWaitLock(r.lock, logger));
+      }
+      return [...byId.values()];
+    },
+    async findWaitLocksByKind(kind): Promise<LoadedWaitLock[]> {
+      const rows = await db
+        .select()
+        .from(automationWaitLocks)
+        .where(eq(automationWaitLocks.kind, kind));
+      return rows.map((r) => mapWaitLock(r, logger));
+    },
+    async findWaitLocksByRun(runId): Promise<LoadedWaitLock[]> {
+      const rows = await db
+        .select()
+        .from(automationWaitLocks)
+        .where(eq(automationWaitLocks.runId, runId));
+      return rows.map((r) => mapWaitLock(r, logger));
     },
     async deleteWaitLock(id: string): Promise<void> {
@@ -282,17 +433,33 @@ export function createRunStore(db: SafeDatabase<Schema>): RunStore {
             lte(automationWaitLocks.timeoutAt, now),
           ),
         );
-      return rows.map((r) => ({
-        id: r.id,
-        runId: r.runId,
-        actionPath: r.actionPath,
-        kind: r.kind as "trigger" | "delay",
-        eventId: r.eventId,
-        contextKey: r.contextKey,
-        filterTemplate: r.filterTemplate,
-        timeoutAt: r.timeoutAt,
-        createdAt: r.createdAt,
-      }));
+      return rows.map((r) => mapWaitLock(r, logger));
     },
   };
 }
+/** Map a wait-lock row to the engine's {@link LoadedWaitLock}. */
+function mapWaitLock(
+  row: typeof automationWaitLocks.$inferSelect,
+  logger?: Logger,
+): LoadedWaitLock {
+  return {
+    id: row.id,
+    runId: row.runId,
+    actionPath: row.actionPath,
+    kind: row.kind as WaitLockKind,
+    eventId: row.eventId,
+    contextKey: row.contextKey,
+    filterTemplate: row.filterTemplate,
+    timeoutAt: row.timeoutAt,
+    // Parse the stored config on load — a drifted/hand-edited row degrades
+    // to null (engine treats the `until` lock as gone) instead of being
+    // trusted as a wrongly-typed UntilWaitConfig.
+    waitConfig: parseWaitConfig({
+      value: row.waitConfig,
+      logger,
+      context: `Wait lock ${row.id}`,
+    }),
+    createdAt: row.createdAt,
+  };
+}