npm - @checkstack/automation-backend - Versions diffs - 0.2.0 → 0.3.0 - Mend

@checkstack/automation-backend 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (125) hide show

package/CHANGELOG.md +544 -0
package/drizzle/0003_sparkling_xorn.sql +17 -0
package/drizzle/0004_cultured_spyke.sql +2 -0
package/drizzle/0005_classy_the_hand.sql +19 -0
package/drizzle/0006_burly_wallop.sql +10 -0
package/drizzle/0007_nappy_jackal.sql +1 -0
package/drizzle/0008_remove_seeded_auto_incident_automations.sql +13 -0
package/drizzle/0009_steady_liz_osborn.sql +12 -0
package/drizzle/0010_chunky_changeling.sql +2 -0
package/drizzle/meta/0003_snapshot.json +1007 -0
package/drizzle/meta/0004_snapshot.json +1028 -0
package/drizzle/meta/0005_snapshot.json +1164 -0
package/drizzle/meta/0006_snapshot.json +1261 -0
package/drizzle/meta/0007_snapshot.json +1215 -0
package/drizzle/meta/0008_snapshot.json +1215 -0
package/drizzle/meta/0009_snapshot.json +1328 -0
package/drizzle/meta/0010_snapshot.json +1349 -0
package/drizzle/meta/_journal.json +56 -0
package/package.json +23 -12
package/src/action-types.ts +23 -0
package/src/artifact-store.ts +16 -1
package/src/automation-store.test.ts +143 -0
package/src/automation-store.ts +30 -8
package/src/builtin-triggers.test.ts +77 -74
package/src/builtin-triggers.ts +105 -108
package/src/dispatch/action-kind.ts +2 -0
package/src/dispatch/assemble-get-service.ts +31 -0
package/src/dispatch/cancel-resurrect.test.ts +147 -0
package/src/dispatch/concurrency-race.test.ts +255 -0
package/src/dispatch/concurrency-scope.test.ts +166 -0
package/src/dispatch/condition.ts +24 -5
package/src/dispatch/dwell-queue.ts +65 -0
package/src/dispatch/dwell-store.ts +154 -0
package/src/dispatch/dwell.it.test.ts +142 -0
package/src/dispatch/dwell.test.ts +799 -0
package/src/dispatch/dwell.ts +257 -0
package/src/dispatch/engine.test.ts +189 -2
package/src/dispatch/engine.ts +555 -9
package/src/dispatch/entity-scope.test.ts +176 -0
package/src/dispatch/get-service-wiring.test.ts +318 -0
package/src/dispatch/numeric.test.ts +71 -0
package/src/dispatch/numeric.ts +96 -0
package/src/dispatch/render.test.ts +34 -0
package/src/dispatch/render.ts +31 -11
package/src/dispatch/reseed-run-secrets.ts +230 -0
package/src/dispatch/run-secret-registry.test.ts +189 -0
package/src/dispatch/run-secret-registry.ts +247 -0
package/src/dispatch/run-state-masking.test.ts +376 -0
package/src/dispatch/run-state-store.ts +95 -38
package/src/dispatch/run-state.ts +226 -59
package/src/dispatch/scope-artifact-masking.test.ts +138 -0
package/src/dispatch/secret-ref-ids.test.ts +19 -0
package/src/dispatch/secret-ref-ids.ts +17 -0
package/src/dispatch/snapshots.test.ts +86 -0
package/src/dispatch/snapshots.ts +79 -0
package/src/dispatch/stage1-router.test.ts +324 -0
package/src/dispatch/stage1-router.ts +152 -0
package/src/dispatch/stage1.it.test.ts +84 -0
package/src/dispatch/stage2-dispatch.test.ts +285 -0
package/src/dispatch/stage2-dispatch.ts +207 -0
package/src/dispatch/stage2-stalled.it.test.ts +132 -0
package/src/dispatch/stalled-sweeper.test.ts +197 -0
package/src/dispatch/stalled-sweeper.ts +112 -5
package/src/dispatch/state-scope.test.ts +234 -0
package/src/dispatch/state-scope.ts +322 -0
package/src/dispatch/structured-conditions.test.ts +246 -0
package/src/dispatch/structured-conditions.ts +146 -0
package/src/dispatch/test-fixtures.ts +306 -38
package/src/dispatch/trigger-fanin.test.ts +111 -0
package/src/dispatch/trigger-subscriber.ts +316 -14
package/src/dispatch/types.ts +263 -8
package/src/dispatch/wait-timeout-queue.ts +89 -0
package/src/dispatch/wait-until-entity-wake.test.ts +544 -0
package/src/dispatch/wait-until.test.ts +540 -0
package/src/dispatch/wake-refs.test.ts +158 -0
package/src/dispatch/wake-refs.ts +348 -0
package/src/dispatch/window-gate.test.ts +513 -0
package/src/dispatch/window-store.test.ts +162 -0
package/src/dispatch/window-store.ts +102 -0
package/src/entity/change-derivers.test.ts +148 -0
package/src/entity/change-derivers.ts +143 -0
package/src/entity/change-emitter.test.ts +66 -0
package/src/entity/change-emitter.ts +76 -0
package/src/entity/create-handle.ts +344 -0
package/src/entity/cross-pod-read-consistency.it.test.ts +281 -0
package/src/entity/define-entity.ts +157 -0
package/src/entity/diff.test.ts +57 -0
package/src/entity/diff.ts +54 -0
package/src/entity/entity-store.test.ts +30 -0
package/src/entity/entity-store.ts +171 -0
package/src/entity/extension-point.ts +56 -0
package/src/entity/fake-entity-store.ts +130 -0
package/src/entity/hook.ts +19 -0
package/src/entity/index.ts +50 -0
package/src/entity/mutate-handle.test.ts +517 -0
package/src/entity/on-entity-changed.test.ts +189 -0
package/src/entity/on-entity-changed.ts +214 -0
package/src/entity/registry.test.ts +181 -0
package/src/entity/registry.ts +200 -0
package/src/entity/stable-stringify.test.ts +55 -0
package/src/entity/stable-stringify.ts +49 -0
package/src/entity/wake-index.it.test.ts +251 -0
package/src/entity/with-entity-write.test.ts +100 -0
package/src/entity/with-entity-write.ts +69 -0
package/src/entity-driven-trigger.ts +46 -0
package/src/extension-points.ts +35 -0
package/src/gitops-docs.test.ts +215 -0
package/src/gitops-docs.ts +151 -0
package/src/gitops-kinds.test.ts +174 -0
package/src/gitops-kinds.ts +137 -0
package/src/index.ts +355 -11
package/src/migration/flapping-to-window.test.ts +123 -0
package/src/migration/flapping-to-window.ts +205 -0
package/src/router.test.ts +182 -1
package/src/router.ts +73 -2
package/src/schema.ts +236 -3
package/src/script-test-replay.test.ts +88 -0
package/src/script-test-replay.ts +100 -0
package/src/script-test-shell-env.test.ts +41 -0
package/src/script-test-shell-env.ts +89 -0
package/src/script-test.test.ts +386 -0
package/src/script-test.ts +258 -0
package/src/trigger-registry.ts +2 -0
package/src/validate-definition.test.ts +1 -0
package/tsconfig.json +24 -0

package/src/dispatch/engine.ts CHANGED Viewed

@@ -49,6 +49,7 @@
 import type {
   Action,
   ChooseInput,
+  Condition,
   ConditionGuardInput,
   DelayInput,
   ParallelInput,
@@ -58,6 +59,7 @@ import type {
   StopInput,
   VariablesInput,
   WaitForTriggerInput,
+  WaitUntilInput,
 } from "@checkstack/automation-common";
 import { SYSTEM_ACTOR, type Actor } from "@checkstack/common";
 import type {
@@ -66,6 +68,8 @@ import type {
 import type { ActionRunScope } from "../action-types";
 import { detectActionKind, type ActionKind } from "./action-kind";
+import { wrapGetServiceForRun } from "./run-secret-registry";
+import { reseedRunSecretRegistry } from "./reseed-run-secrets";
 import { evaluateCondition } from "./condition";
 import { parseActionPath } from "./path-nav";
 import {
@@ -80,6 +84,16 @@ import {
   resolveConsumedArtifacts,
   withRepeatContext,
 } from "./scope";
+import {
+  enrichScopeWithEntities,
+  enrichScopeWithState,
+  type EntityRef,
+} from "./state-scope";
+import {
+  extractWakeRefs,
+  refToString,
+  HEALTH_ENTITY_KIND,
+} from "./wake-refs";
 import {
   formatActionPath,
   type ActionPath,
@@ -90,6 +104,72 @@ import {
   type StepOutcome,
 } from "./types";
+/**
+ * Produce the dependency bundle a single run should execute with.
+ *
+ * When run-wide output masking is fully wired — a secret registry plus
+ * both the secret-resolver and connection-store ref ids — the result is a
+ * shallow copy of `deps` whose `getService` is the one built by
+ * `wrapGetServiceForRun` for this `runId` (so resolved secret values get
+ * registered against the run). If any of the three is missing (tests /
+ * minimal installs) the very same `deps` object is handed back untouched.
+ */
+function withRunSecretCapture(
+  deps: DispatchDeps,
+  runId: string,
+): DispatchDeps {
+  const {
+    secretRegistry: registry,
+    secretResolverRefId: resolverRefId,
+    connectionStoreRefId,
+  } = deps;
+  // Masking needs all three pieces; otherwise pass the deps straight through.
+  if (registry && resolverRefId && connectionStoreRefId) {
+    return {
+      ...deps,
+      getService: wrapGetServiceForRun({
+        getService: deps.getService,
+        runId,
+        registry,
+        resolverRefId,
+        connectionStoreRefId,
+      }),
+    };
+  }
+  return deps;
+}
+/**
+ * Rebuild the run's secret mask set on the pod that is about to resume or
+ * recover it, using the secret refs the automation declares.
+ *
+ * The masking registry lives in memory, per process. A pod that never
+ * resolved this run's secrets (resume / stalled-recovery) therefore starts
+ * with an EMPTY mask set, which would let a carried scope value, artifact
+ * or error be persisted unmasked. Re-resolving the declared `secretEnv`
+ * mappings + connection refs through the run's wrapped `getService` (which
+ * auto-registers) restores the same least-privilege set before the walk
+ * and any persistence.
+ *
+ * Does nothing when masking isn't wired (tests / minimal installs), i.e.
+ * when the registry or either ref id is absent on `deps`.
+ */
+async function reseedRunMaskSet(
+  deps: DispatchDeps,
+  wrappedDeps: DispatchDeps,
+  runId: string,
+  automation: LoadedAutomation,
+): Promise<void> {
+  const {
+    secretRegistry: registry,
+    secretResolverRefId: resolverRefId,
+    connectionStoreRefId,
+  } = deps;
+  if (registry && resolverRefId && connectionStoreRefId) {
+    await reseedRunSecretRegistry({
+      // Hand over the WRAPPED getService — it is the one that registers, so
+      // the re-resolved values end up in this run's mask set.
+      getService: wrappedDeps.getService,
+      registry,
+      runId,
+      definition: automation.definition,
+      resolverRefId,
+      connectionStoreRefId,
+      logger: deps.logger,
+    });
+  }
+}
 /**
  * Name of the durable queue we use for crash-safe delays. Exported —
  * presumably so the code that schedules a delay and the code that consumes
  * it refer to the same queue by name (confirm against callers).
  */
 export const DELAY_QUEUE_NAME = "automation-delay";
@@ -102,6 +182,27 @@ export interface DelayResumeJob {
   waitLockId: string;
 }
+/**
+ * Name of the durable queue carrying a reactive `wait_until`'s single
+ * timeout timer (reactive automation engine §7, §13.1). A `wait_until` is
+ * now reactive: a relevant `ENTITY_CHANGED` wakes it (Stage 1 →
+ * `checkWaitUntil`). This queue is NOT a re-check loop — it holds at most
+ * one job per suspended wait, scheduled at the deadline, mirroring the
+ * dwell timer pattern. On fire the consumer applies the timeout policy
+ * (continue/fail) via `checkWaitUntil` (which also re-evaluates the
+ * condition one last time).
+ *
+ * Producer side: `executeWaitUntil` enqueues a `WaitTimeoutJob` here only
+ * when the wait has a deadline. The job's `startDelay` is the remaining
+ * time in milliseconds divided by 1000, rounded up and clamped at 0, and
+ * its `jobId` is `${runId}:${waitLockId}:timeout`.
+ */
+export const WAIT_TIMEOUT_QUEUE_NAME = "automation-wait-timeout";
+/**
+ * Job payload for a `wait_until` timeout timer. Carries the run + lock so
+ * the consumer can re-evaluate one final time and apply the timeout policy.
+ *
+ *   - `runId`      → id of the suspended run the timer belongs to.
+ *   - `waitLockId` → id of the wait lock created when the run suspended;
+ *                    `checkWaitUntil` loads this lock to obtain the
+ *                    condition, the deadline and the timeout policy.
+ */
+export interface WaitTimeoutJob {
+  runId: string;
+  waitLockId: string;
+}
 // ─── Public entry points ──────────────────────────────────────────────────
 export interface DispatchTriggerArgs {
@@ -142,7 +243,7 @@ export async function dispatchTrigger(
   });
   const ctx: DispatchContext = {
-    deps,
+    deps: withRunSecretCapture(deps, runId),
     run: {
       runId,
       automation: args.automation,
@@ -162,6 +263,18 @@ export async function dispatchTrigger(
     resuming: false,
   };
+  // Pre-resolve live health state into scope before any condition or
+  // template evaluation (the engine is sync, so this is the only place
+  // live state can be fetched). Fail-open inside the helper.
+  await enrichScopeWithState({
+    scope: ctx.scope,
+    client: deps.healthCheckClient,
+    logger: deps.logger,
+    contextKey: args.contextKey,
+    usesState: args.automation.definition.uses_state,
+    transitionWindowMinutes: args.automation.definition.state_window_minutes,
+  });
   // Initial scope snapshot — gives the stalled sweeper something to
   // work with even if we crash before the first step finishes.
   await deps.runStateStore.upsert({
@@ -215,11 +328,27 @@ export async function resumeRun(
   const run = await deps.runStore.loadRun(args.runId);
   if (!run) throw new Error(`Cannot resume — run ${args.runId} not found`);
+  // Only a `waiting` run may be resumed. A run that was cancelled (restart
+  // mode / operator cancel) or already reached a terminal state must NEVER
+  // be resurrected by a late wake (wakeWaitingRuns, delay-expiry sweep, a
+  // racing queue job). Drop any stale wait lock for the run and return —
+  // mirrors the guard `checkWaitUntil` already applies for `until` locks.
+  if (run.status !== "waiting") {
+    const stale = await deps.runStore.findWaitLocksByRun(args.runId);
+    for (const lock of stale) {
+      await deps.runStore.deleteWaitLock(lock.id);
+    }
+    deps.logger.debug(
+      `resumeRun: run ${args.runId} is "${run.status}", not "waiting"; dropped ${stale.length} stale wait lock(s) and skipped resume`,
+    );
+    return { status: run.status };
+  }
   const waitedAt = parseActionPath(args.waitedAtPath);
   // Try to acquire the advisory lock so two resumers don't race.
-  const acquired = await deps.runStateStore.tryAdvisoryLock(args.runId);
-  if (!acquired) {
+  const lock = await deps.runStateStore.tryAdvisoryLock(args.runId);
+  if (!lock) {
     deps.logger.debug(
       `resumeRun: another instance already holds the lock for run ${args.runId}; skipping`,
     );
@@ -240,11 +369,30 @@ export async function resumeRun(
       scope.resume = { payload: args.payload };
     }
+    // Re-resolve live state on resume: the system may have changed during
+    // the wait, so conditions after a wait must see current state, not
+    // the snapshot taken at suspension time.
+    await enrichScopeWithState({
+      scope,
+      client: deps.healthCheckClient,
+      logger: deps.logger,
+      contextKey: run.contextKey,
+      usesState: args.automation.definition.uses_state,
+      transitionWindowMinutes: args.automation.definition.state_window_minutes,
+    });
     await deps.runStore.updateRunStatus(args.runId, "running");
     await deps.runStateStore.heartbeat(args.runId);
+    const wrappedDeps = withRunSecretCapture(deps, args.runId);
+    // Cross-pod mask re-seed: this pod may not be the one that resolved the
+    // run's secrets, so re-populate its (empty) mask set from the declared
+    // refs BEFORE walking / persisting — otherwise carried scope / artifact
+    // values would persist unmasked here. See `reseed-run-secrets.ts`.
+    await reseedRunMaskSet(deps, wrappedDeps, args.runId, args.automation);
     const ctx: DispatchContext = {
-      deps,
+      deps: wrappedDeps,
       run: {
         runId: args.runId,
         automation: args.automation,
@@ -271,7 +419,7 @@ export async function resumeRun(
     return await finaliseRun(ctx, outcome);
   } finally {
-    await deps.runStateStore.releaseAdvisoryLock(args.runId);
+    await lock.release();
   }
 }
@@ -287,7 +435,23 @@ export async function recoverStalledRun(
 ): Promise<{ status: string }> {
   const run = await deps.runStore.loadRun(args.runId);
   if (!run) throw new Error(`recoverStalledRun: run ${args.runId} not found`);
-  if (run.status !== "running" && run.status !== "waiting") {
+  if (run.status !== "running") {
+    // Only genuinely-running runs are recoverable here. A `waiting` run is
+    // owned by the wait-lock / queue resume paths; recovering it would
+    // re-walk an intentional wait. (The sweeper now filters to `running`,
+    // but guard here too so a direct caller can't resurrect a wait.)
+    return { status: run.status };
+  }
+  // A live wait lock means this run is intentionally suspended (a wait the
+  // status update may not yet reflect, or a racing path). Refuse rather
+  // than from-top re-walk: re-running pre-wait actions has observable side
+  // effects. The wait-lock / queue resume paths own this run.
+  const existingLocks = await deps.runStore.findWaitLocksByRun(args.runId);
+  if (existingLocks.length > 0) {
+    deps.logger.debug(
+      `recoverStalledRun: run ${args.runId} holds ${existingLocks.length} live wait lock(s); leaving it to the wait/resume paths`,
+    );
     return { status: run.status };
   }
@@ -307,8 +471,14 @@ export async function recoverStalledRun(
   await deps.runStore.updateRunStatus(args.runId, "running");
   await deps.runStateStore.heartbeat(args.runId);
+  const wrappedDeps = withRunSecretCapture(deps, args.runId);
+  // Cross-pod mask re-seed (see `reseedRunMaskSet` in `resumeRun`): the
+  // sweeper pod recovering this stalled run did not resolve its secrets, so
+  // re-populate the mask set from the declared refs before re-walking.
+  await reseedRunMaskSet(deps, wrappedDeps, args.runId, args.automation);
   const ctx: DispatchContext = {
-    deps,
+    deps: wrappedDeps,
     run: {
       runId: args.runId,
       automation: args.automation,
@@ -364,6 +534,206 @@ export async function recoverStalledRun(
   return await finaliseRun(ctx, outcome);
 }
+/**
+ * Outcome of a single `wait_until` re-check (the return value of
+ * `checkWaitUntil`).
+ *   - "resumed"      → condition satisfied, or timed out with
+ *                      continue_on_timeout=true; the wait lock was deleted
+ *                      and `resumeRun` was called for the waited path.
+ *   - "failed"       → timed out with continue_on_timeout=false; the wait
+ *                      lock was deleted, the run was marked failed and its
+ *                      run state cleared.
+ *   - "still-waiting"→ not yet true and not timed out; the wait lock is
+ *                      left in place. NOTE(review): this used to say the
+ *                      caller re-enqueues, but `wait_until` is documented
+ *                      as reactive with no re-check loop — confirm what
+ *                      callers do with this value.
+ *   - "gone"         → nothing to do: the lock is missing, is not an
+ *                      `until` lock or has no wait config, or the run is
+ *                      missing / no longer `waiting` (in the latter two
+ *                      cases the stale lock is deleted).
+ */
+export type WaitUntilCheckOutcome =
+  | "resumed"
+  | "failed"
+  | "still-waiting"
+  | "gone";
+/**
+ * Re-enrich a suspended `wait_until`'s scope before re-evaluation so the
+ * condition sees CURRENT state, not the value at suspension time. Two
+ * sources, kind-aware:
+ *
+ *   1. Health — resolved via the RPC `healthCheckClient`
+ *      (`enrichScopeWithState`), since the health aggregate is computed on
+ *      read and not stored as a framework entity row. Sets the rich
+ *      `scope.health.*` condition snapshot.
+ *   2. Every OTHER `state.<kind>.<id>` ref the wait depends on — resolved
+ *      kind-agnostically through the entity store
+ *      (`enrichScopeWithEntities` + `deps.entityResolverFor`), folding into
+ *      `scope.state.<kind>.<id>.<field>`. The refs are statically extracted
+ *      from the condition (concrete ids only — wildcards carry no id) PLUS
+ *      the concrete `changedRef` that woke this wait (so a wildcard wait on a
+ *      dynamic id still resolves the entity that actually changed).
+ *
+ * The `scope` object is handed to both helpers as-is (no copy is made
+ * here), so the caller's object is the one that gets enriched. Health is
+ * always resolved first; the entity step is skipped entirely when no
+ * non-health, non-wildcard ref remains after de-duplication.
+ *
+ * @param args.deps        Supplies `healthCheckClient`, `logger` and the
+ *                         optional `entityResolverFor` lookup.
+ * @param args.scope       Scope to enrich in place.
+ * @param args.automation  Source of `definition.uses_state` and
+ *                         `definition.state_window_minutes`.
+ * @param args.contextKey  The run's context key, forwarded to the health
+ *                         enrichment (may be `null`).
+ * @param args.condition   The wait's condition; its refs are extracted
+ *                         with `extractWakeRefs`.
+ * @param args.changedRef  Optional `${kind}:${id}` of the change that woke
+ *                         the wait. Split at the FIRST colon, so the id
+ *                         part may itself contain colons; a value with no
+ *                         colon, or with the colon as its first character,
+ *                         is ignored.
+ */
+async function reEnrichWaitScope(args: {
+  deps: DispatchDeps;
+  scope: Record<string, unknown>;
+  automation: LoadedAutomation;
+  contextKey: string | null;
+  condition: Condition;
+  changedRef?: string;
+}): Promise<void> {
+  const { deps, scope, automation, contextKey, condition, changedRef } = args;
+  // Split the changed ref into its `${kind}:${id}` parts once — reused by
+  // both the health-resolution injection below and the entity-ref collection.
+  let changedKind: string | undefined;
+  let changedId: string | undefined;
+  if (changedRef) {
+    const colon = changedRef.indexOf(":");
+    if (colon > 0) {
+      changedKind = changedRef.slice(0, colon);
+      changedId = changedRef.slice(colon + 1);
+    }
+  }
+  // 1. Health: the rich condition snapshot, RPC-resolved. Sets scope.health.*.
+  //    A WILDCARD health wait (`health:*`) is woken by a concrete `health:sysX`
+  //    whose id may be NEITHER the contextKey NOR in `uses_state`. The health
+  //    aggregate is computed-on-read and is only resolved here for the systems
+  //    we pass in, so without the changed id the wait re-evaluates against an
+  //    empty `scope.health.systems[sysX]` and never resumes. Inject the changed
+  //    system's concrete id so a wildcard wake always resolves the system that
+  //    actually changed (deduped inside `enrichScopeWithState`).
+  const usesState =
+    changedKind === HEALTH_ENTITY_KIND && changedId && changedId !== "*"
+      ? [...(automation.definition.uses_state ?? []), changedId]
+      : automation.definition.uses_state;
+  await enrichScopeWithState({
+    scope,
+    client: deps.healthCheckClient,
+    logger: deps.logger,
+    contextKey,
+    usesState,
+    transitionWindowMinutes: automation.definition.state_window_minutes,
+  });
+  // 2. Kind-agnostic entity refs (entity-store-resolved). Collect the
+  //    concrete refs the condition reads plus the changed ref, drop the
+  //    health kind (already resolved above via the rich RPC path — excluding
+  //    it here keeps health resolved exactly once per re-enrichment) and any
+  //    wildcard (no concrete id).
+  const refs: EntityRef[] = [];
+  const seen = new Set<string>();
+  const addRef = (kind: string, id: string) => {
+    if (kind === HEALTH_ENTITY_KIND || id === "*" || id.length === 0) return;
+    const key = `${kind}:${id}`;
+    if (seen.has(key)) return;
+    seen.add(key);
+    refs.push({ kind, id });
+  };
+  for (const ref of extractWakeRefs(condition).refs) addRef(ref.kind, ref.id);
+  if (changedKind && changedId) addRef(changedKind, changedId);
+  if (refs.length === 0) return;
+  await enrichScopeWithEntities({
+    scope,
+    logger: deps.logger,
+    refs,
+    resolverFor: (kind) => deps.entityResolverFor?.(kind),
+  });
+}
+/**
+ * Re-check a suspended `wait_until`: re-enrich scope, evaluate the
+ * condition, and either resume the run (satisfied or timeout-continue),
+ * fail it (timeout-fail), or report "still-waiting" and leave the wait
+ * lock in place.
+ *
+ * Steps, in order:
+ *   1. Load the wait lock. A missing lock, a lock whose kind is not
+ *      "until", or one without a wait config yields "gone" (nothing is
+ *      deleted in this case).
+ *   2. Load the run. A missing run, or a run whose status is not
+ *      "waiting", deletes the lock and yields "gone".
+ *   3. Rebuild the scope from a shallow copy of the persisted snapshot (or
+ *      from `buildInitialScope` when there is none) and re-enrich it via
+ *      `reEnrichWaitScope`, passing `changedRef` through.
+ *   4. Evaluate the condition. A throw is logged at `warn` and treated as
+ *      "not yet satisfied".
+ *   5. Satisfied, or past the deadline with `continueOnTimeout` set:
+ *      delete the lock, call `resumeRun` at the lock's action path, and
+ *      return "resumed".
+ *   6. Past the deadline otherwise: delete the lock, mark the run
+ *      "failed", clear its run state, and return "failed".
+ *   7. Otherwise return "still-waiting".
+ *
+ * A lock with a null `timeoutAt` never counts as timed out.
+ *
+ * Apart from dropping stale locks it only reads until it acts;
+ * `resumeRun` takes the per-run advisory lock so a concurrent re-check /
+ * sweep can't double-resume. Idempotent: the lock is deleted before
+ * resuming, so a duplicate check finds nothing.
+ *
+ * NOTE(review): the value returned by `resumeRun` is not inspected, so
+ * "resumed" is reported even if `resumeRun` skipped the resume (for
+ * example because it could not take the advisory lock) — and by then the
+ * wait lock has already been deleted. Confirm a skipped resume cannot
+ * leave the run stranded in "waiting". The timeout-fail branch also
+ * updates the run status without taking the advisory lock — confirm that
+ * is intended.
+ *
+ * @param deps  Dispatch dependencies (run store, run-state store, logger,
+ *              filters, and whatever `reEnrichWaitScope` / `resumeRun` use).
+ * @param args.runId       Run to re-check.
+ * @param args.waitLockId  The `until` wait lock guarding the run.
+ * @param args.automation  The loaded automation the run belongs to.
+ * @returns The outcome of this single re-check.
+ */
+export async function checkWaitUntil(
+  deps: DispatchDeps,
+  args: {
+    runId: string;
+    waitLockId: string;
+    automation: LoadedAutomation;
+    /**
+     * The `${kind}:${id}` ref of the change that woke this wait (Stage-2
+     * `wake` job). Included in the re-enrichment so the changed entity is
+     * always resolved into scope — essential for a wildcard wait whose
+     * condition reads a dynamic id (the ref isn't statically extractable).
+     */
+    changedRef?: string;
+  },
+): Promise<WaitUntilCheckOutcome> {
+  const lock = await deps.runStore.loadWaitLock(args.waitLockId);
+  if (!lock || lock.kind !== "until" || !lock.waitConfig) return "gone";
+  const run = await deps.runStore.loadRun(args.runId);
+  if (!run) {
+    await deps.runStore.deleteWaitLock(args.waitLockId);
+    return "gone";
+  }
+  if (run.status !== "waiting") {
+    // Already resumed / cancelled / terminal — drop the stale lock.
+    await deps.runStore.deleteWaitLock(args.waitLockId);
+    return "gone";
+  }
+  // Rebuild the scope from the snapshot + re-enrich live state so the
+  // condition sees CURRENT state, not the value at suspension time.
+  const persisted = await deps.runStateStore.load(args.runId);
+  const scope = persisted?.scopeSnapshot
+    ? { ...persisted.scopeSnapshot }
+    : buildInitialScope({
+        triggerId: run.triggerId,
+        triggerEventId: run.triggerEventId,
+        payload: run.triggerPayload,
+        startedAt: run.startedAt,
+      });
+  await reEnrichWaitScope({
+    deps,
+    scope,
+    automation: args.automation,
+    contextKey: run.contextKey,
+    condition: lock.waitConfig.condition,
+    changedRef: args.changedRef,
+  });
+  let satisfied = false;
+  try {
+    satisfied = evaluateCondition(
+      lock.waitConfig.condition,
+      scope as TemplateContext,
+      deps.filters,
+    );
+  } catch (error) {
+    deps.logger.warn(
+      `wait_until re-check threw (treating as not-yet): ${(error as Error).message}`,
+    );
+  }
+  const timedOut =
+    lock.timeoutAt !== null && lock.timeoutAt.getTime() <= Date.now();
+  if (satisfied || (timedOut && lock.waitConfig.continueOnTimeout)) {
+    await deps.runStore.deleteWaitLock(args.waitLockId);
+    await resumeRun(deps, {
+      runId: args.runId,
+      automation: args.automation,
+      waitedAtPath: lock.actionPath,
+    });
+    return "resumed";
+  }
+  if (timedOut) {
+    // continue_on_timeout = false → fail the run.
+    await deps.runStore.deleteWaitLock(args.waitLockId);
+    await deps.runStore.updateRunStatus(
+      args.runId,
+      "failed",
+      `wait_until timed out after waiting for its condition`,
+    );
+    await deps.runStateStore.clear(args.runId);
+    return "failed";
+  }
+  return "still-waiting";
+}
 // ─── Run finalisation ─────────────────────────────────────────────────────
 async function finaliseRun(
@@ -393,11 +763,14 @@ async function finaliseRun(
     errorMessage,
   );
   // Terminal runs drop their durable state. Suspended runs keep it so
-  // resumption has the scope to work with.
+  // resumption has the scope to work with — but we must NOT clobber
+  // `lastActionPath`: the suspending action already checkpointed its real
+  // path, and a crash recovery needs that to resume from the wait rather
+  // than re-walking from actions[0] (which would re-fire pre-wait side
+  // effects). Omit it so the existing checkpoint survives.
   await (status === "waiting" ? ctx.deps.runStateStore.upsert({
       runId: ctx.run.runId,
       scopeSnapshot: ctx.scope,
-      lastActionPath: null,
     }) : ctx.deps.runStateStore.clear(ctx.run.runId));
   return { runId: ctx.run.runId, status };
 }
@@ -578,6 +951,9 @@ async function executeAction(
         ctx,
       );
     }
+    case "wait_until": {
+      return await executeWaitUntil(action as WaitUntilInput, path, ctx);
+    }
     case "sequence": {
       return await executeSequence(
         action as SequenceInput,
@@ -1596,6 +1972,176 @@ async function executeWaitForTrigger(
   return { kind: "suspended", stepId };
 }
+// ─── Primitive: `wait_until` ─────────────────────────────────────────────
+/**
+ * Suspend the run until a condition becomes true, with an optional
+ * timeout. Unlike `wait_for_trigger` (wait for a named event), `wait_until`
+ * is REACTIVE (reactive automation engine §7): the engine statically
+ * extracts the `state.*` refs the condition reads (§8.3), persists a
+ * `kind: "until"` wait lock plus one wake-index row per ref (§8.1), and
+ * suspends with NO active job and NO polling. A relevant `ENTITY_CHANGED`
+ * wakes it (Stage 1 → `checkWaitUntil` re-evaluates the full condition and
+ * resumes if it now holds).
+ *
+ * Fast path: if the condition is ALREADY true against the current
+ * (enriched) scope, continue inline without suspending.
+ *
+ * Arm-window guard: after the wait lock is created and the scope is
+ * checkpointed, the scope is re-enriched and the condition evaluated once
+ * more; if it now holds, the lock is deleted and the walk continues inline.
+ *
+ * Timeout: a single durable timer job at `timeoutAt` (NOT a re-check loop)
+ * applies the continue/fail policy. `continue_on_timeout` defaults to
+ * `true`. When ref extraction is wholly indeterminate (no
+ * concrete-or-wildcard ref) we log at `warn` so it is never silent (§8.3):
+ * one message when a timeout exists (the timer is the only release), a
+ * different one when there is no timeout (the wait can never wake).
+ *
+ * Order of effects when the fast path does not apply: create the step →
+ * create the wait lock with its wake refs → checkpoint → arm-window
+ * re-evaluation → enqueue the timeout job (only if a deadline exists) →
+ * mark the step `waiting`.
+ *
+ * NOTE(review): `timeout_seconds` is tested for truthiness, so `0` behaves
+ * like an absent value (no deadline, no timer) — confirm the schema
+ * forbids 0 or that this is intended.
+ *
+ * @param action  The `wait_until` action; `action.wait_until` carries the
+ *                condition, `timeout_seconds` and `continue_on_timeout`.
+ * @param path    Path of this action within the automation; recorded on
+ *                the step and the wait lock via `formatActionPath`.
+ * @param ctx     Dispatch context for the current run.
+ * @returns `{ kind: "ok" }` when the condition held on the fast path or in
+ *          the arm window; otherwise `{ kind: "suspended", stepId }`.
+ */
+async function executeWaitUntil(
+  action: WaitUntilInput,
+  path: ActionPath,
+  ctx: DispatchContext,
+): Promise<StepOutcome> {
+  const stepId = await ctx.deps.runStore.createStep({
+    runId: ctx.run.runId,
+    actionPath: formatActionPath(path),
+    actionId: action.id ?? null,
+    actionKind: "wait_until",
+    providerActionId: null,
+  });
+  const cfg = action.wait_until;
+  // Fast path — already satisfied. Evaluate against the current scope
+  // (enriched at run start / resume). Errors are treated as "not yet".
+  let satisfied = false;
+  try {
+    satisfied = evaluateCondition(
+      cfg.condition,
+      templateContext(ctx),
+      ctx.deps.filters,
+    );
+  } catch (error) {
+    ctx.deps.logger.debug(
+      `wait_until initial eval threw (treating as not-yet): ${(error as Error).message}`,
+    );
+  }
+  if (satisfied) {
+    await ctx.deps.runStore.updateStep(stepId, {
+      status: "success",
+      resultPayload: { satisfied: true, immediate: true },
+    });
+    return { kind: "ok" };
+  }
+  const continueOnTimeout = cfg.continue_on_timeout ?? true;
+  const timeoutAt = cfg.timeout_seconds
+    ? new Date(Date.now() + cfg.timeout_seconds * 1000)
+    : null;
+  // Static reference extraction → wake-index dependency refs (§8.3).
+  const extracted = extractWakeRefs(cfg.condition);
+  const wakeRefs = extracted.refs.map((ref) => refToString(ref));
+  if (extracted.indeterminate && wakeRefs.length === 0) {
+    // The condition reads live state but no concrete-or-wildcard ref could
+    // be derived: the wait can only ever be released by the timeout timer.
+    // Never silent (§8.3, §12).
+    if (timeoutAt) {
+      ctx.deps.logger.warn(
+        `wait_until at ${formatActionPath(path)} (run ${ctx.run.runId}): could not extract any state ref from the condition; relying on the timeout timer only — it will not wake on state changes.`,
+      );
+    } else {
+      ctx.deps.logger.warn(
+        `wait_until at ${formatActionPath(path)} (run ${ctx.run.runId}): could not extract any state ref AND no timeout is set; this wait will never wake. Add a timeout or a concrete state.* reference.`,
+      );
+    }
+  }
+  const waitLockId = await ctx.deps.runStore.createWaitLockWithWakeRefs({
+    runId: ctx.run.runId,
+    actionPath: formatActionPath(path),
+    // Synthetic marker — reactive `until` locks aren't woken by named events.
+    eventId: `@@until:${ctx.run.runId}:${formatActionPath(path)}`,
+    contextKey: ctx.run.contextKey,
+    timeoutAt,
+    waitConfig: {
+      condition: cfg.condition,
+      continueOnTimeout,
+    },
+    wakeRefs,
+  });
+  // Persist scope before suspending so the wake re-check rebuilds it.
+  await checkpoint(ctx, path);
+  // Re-evaluate-on-registration guard (reactive automation engine §17).
+  // The condition was checked above (fast path), THEN the wait lock + its
+  // wake-index rows were committed. A relevant `ENTITY_CHANGED` landing in
+  // that arm window is routed by Stage 1 against the just-now-visible lock,
+  // but if the change committed BEFORE our wake rows were visible, Stage 1
+  // found no lock and enqueued no wake job — a lost wakeup. For a no-timeout
+  // wait nothing would ever re-check it (the sweeper filters `isNotNull
+  // (timeoutAt)`), so the run would stall permanently. Guard against this by
+  // re-evaluating ONCE against freshly re-enriched scope now that the lock is
+  // armed: any change that landed during the window is now observable. If the
+  // condition already holds, drop the lock (its wake-index rows cascade) and
+  // continue the current walk inline. Idempotent: the lock delete + the
+  // per-run advisory lock taken by any concurrent wake/resume path serialise
+  // this with a racing Stage-2 wake (whichever deletes the lock first wins;
+  // the loser sees `gone`).
+  let armedSatisfied = false;
+  try {
+    await reEnrichWaitScope({
+      deps: ctx.deps,
+      scope: ctx.scope,
+      automation: ctx.run.automation,
+      contextKey: ctx.run.contextKey,
+      condition: cfg.condition,
+    });
+    armedSatisfied = evaluateCondition(
+      cfg.condition,
+      templateContext(ctx),
+      ctx.deps.filters,
+    );
+  } catch (error) {
+    ctx.deps.logger.debug(
+      `wait_until arm-window re-eval threw (treating as not-yet): ${(error as Error).message}`,
+    );
+  }
+  if (armedSatisfied) {
+    await ctx.deps.runStore.deleteWaitLock(waitLockId);
+    await ctx.deps.runStore.updateStep(stepId, {
+      status: "success",
+      resultPayload: { satisfied: true, armWindow: true },
+    });
+    return { kind: "ok" };
+  }
+  // Single durable timeout timer (NOT a poll loop). Only armed when a
+  // deadline exists; otherwise the wait is purely event-driven.
+  if (timeoutAt) {
+    const queue = ctx.deps.queueManager.getQueue<WaitTimeoutJob>(
+      WAIT_TIMEOUT_QUEUE_NAME,
+    );
+    await queue.enqueue(
+      { runId: ctx.run.runId, waitLockId },
+      {
+        startDelay: Math.max(
+          Math.ceil((timeoutAt.getTime() - Date.now()) / 1000),
+          0,
+        ),
+        jobId: `${ctx.run.runId}:${waitLockId}:timeout`,
+      },
+    );
+  }
+  await ctx.deps.runStore.updateStep(stepId, {
+    status: "waiting",
+    resultPayload: {
+      waitLockId,
+      wakeRefs,
+      timeoutAt: timeoutAt?.toISOString(),
+    },
+  });
+  return { kind: "suspended", stepId };
+}
 // ─── Helpers ─────────────────────────────────────────────────────────────
 // ─── Primitive: `sequence` ───────────────────────────────────────────────