@checkstack/automation-backend 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (125) hide show
  1. package/CHANGELOG.md +544 -0
  2. package/drizzle/0003_sparkling_xorn.sql +17 -0
  3. package/drizzle/0004_cultured_spyke.sql +2 -0
  4. package/drizzle/0005_classy_the_hand.sql +19 -0
  5. package/drizzle/0006_burly_wallop.sql +10 -0
  6. package/drizzle/0007_nappy_jackal.sql +1 -0
  7. package/drizzle/0008_remove_seeded_auto_incident_automations.sql +13 -0
  8. package/drizzle/0009_steady_liz_osborn.sql +12 -0
  9. package/drizzle/0010_chunky_changeling.sql +2 -0
  10. package/drizzle/meta/0003_snapshot.json +1007 -0
  11. package/drizzle/meta/0004_snapshot.json +1028 -0
  12. package/drizzle/meta/0005_snapshot.json +1164 -0
  13. package/drizzle/meta/0006_snapshot.json +1261 -0
  14. package/drizzle/meta/0007_snapshot.json +1215 -0
  15. package/drizzle/meta/0008_snapshot.json +1215 -0
  16. package/drizzle/meta/0009_snapshot.json +1328 -0
  17. package/drizzle/meta/0010_snapshot.json +1349 -0
  18. package/drizzle/meta/_journal.json +56 -0
  19. package/package.json +23 -12
  20. package/src/action-types.ts +23 -0
  21. package/src/artifact-store.ts +16 -1
  22. package/src/automation-store.test.ts +143 -0
  23. package/src/automation-store.ts +30 -8
  24. package/src/builtin-triggers.test.ts +77 -74
  25. package/src/builtin-triggers.ts +105 -108
  26. package/src/dispatch/action-kind.ts +2 -0
  27. package/src/dispatch/assemble-get-service.ts +31 -0
  28. package/src/dispatch/cancel-resurrect.test.ts +147 -0
  29. package/src/dispatch/concurrency-race.test.ts +255 -0
  30. package/src/dispatch/concurrency-scope.test.ts +166 -0
  31. package/src/dispatch/condition.ts +24 -5
  32. package/src/dispatch/dwell-queue.ts +65 -0
  33. package/src/dispatch/dwell-store.ts +154 -0
  34. package/src/dispatch/dwell.it.test.ts +142 -0
  35. package/src/dispatch/dwell.test.ts +799 -0
  36. package/src/dispatch/dwell.ts +257 -0
  37. package/src/dispatch/engine.test.ts +189 -2
  38. package/src/dispatch/engine.ts +555 -9
  39. package/src/dispatch/entity-scope.test.ts +176 -0
  40. package/src/dispatch/get-service-wiring.test.ts +318 -0
  41. package/src/dispatch/numeric.test.ts +71 -0
  42. package/src/dispatch/numeric.ts +96 -0
  43. package/src/dispatch/render.test.ts +34 -0
  44. package/src/dispatch/render.ts +31 -11
  45. package/src/dispatch/reseed-run-secrets.ts +230 -0
  46. package/src/dispatch/run-secret-registry.test.ts +189 -0
  47. package/src/dispatch/run-secret-registry.ts +247 -0
  48. package/src/dispatch/run-state-masking.test.ts +376 -0
  49. package/src/dispatch/run-state-store.ts +95 -38
  50. package/src/dispatch/run-state.ts +226 -59
  51. package/src/dispatch/scope-artifact-masking.test.ts +138 -0
  52. package/src/dispatch/secret-ref-ids.test.ts +19 -0
  53. package/src/dispatch/secret-ref-ids.ts +17 -0
  54. package/src/dispatch/snapshots.test.ts +86 -0
  55. package/src/dispatch/snapshots.ts +79 -0
  56. package/src/dispatch/stage1-router.test.ts +324 -0
  57. package/src/dispatch/stage1-router.ts +152 -0
  58. package/src/dispatch/stage1.it.test.ts +84 -0
  59. package/src/dispatch/stage2-dispatch.test.ts +285 -0
  60. package/src/dispatch/stage2-dispatch.ts +207 -0
  61. package/src/dispatch/stage2-stalled.it.test.ts +132 -0
  62. package/src/dispatch/stalled-sweeper.test.ts +197 -0
  63. package/src/dispatch/stalled-sweeper.ts +112 -5
  64. package/src/dispatch/state-scope.test.ts +234 -0
  65. package/src/dispatch/state-scope.ts +322 -0
  66. package/src/dispatch/structured-conditions.test.ts +246 -0
  67. package/src/dispatch/structured-conditions.ts +146 -0
  68. package/src/dispatch/test-fixtures.ts +306 -38
  69. package/src/dispatch/trigger-fanin.test.ts +111 -0
  70. package/src/dispatch/trigger-subscriber.ts +316 -14
  71. package/src/dispatch/types.ts +263 -8
  72. package/src/dispatch/wait-timeout-queue.ts +89 -0
  73. package/src/dispatch/wait-until-entity-wake.test.ts +544 -0
  74. package/src/dispatch/wait-until.test.ts +540 -0
  75. package/src/dispatch/wake-refs.test.ts +158 -0
  76. package/src/dispatch/wake-refs.ts +348 -0
  77. package/src/dispatch/window-gate.test.ts +513 -0
  78. package/src/dispatch/window-store.test.ts +162 -0
  79. package/src/dispatch/window-store.ts +102 -0
  80. package/src/entity/change-derivers.test.ts +148 -0
  81. package/src/entity/change-derivers.ts +143 -0
  82. package/src/entity/change-emitter.test.ts +66 -0
  83. package/src/entity/change-emitter.ts +76 -0
  84. package/src/entity/create-handle.ts +344 -0
  85. package/src/entity/cross-pod-read-consistency.it.test.ts +281 -0
  86. package/src/entity/define-entity.ts +157 -0
  87. package/src/entity/diff.test.ts +57 -0
  88. package/src/entity/diff.ts +54 -0
  89. package/src/entity/entity-store.test.ts +30 -0
  90. package/src/entity/entity-store.ts +171 -0
  91. package/src/entity/extension-point.ts +56 -0
  92. package/src/entity/fake-entity-store.ts +130 -0
  93. package/src/entity/hook.ts +19 -0
  94. package/src/entity/index.ts +50 -0
  95. package/src/entity/mutate-handle.test.ts +517 -0
  96. package/src/entity/on-entity-changed.test.ts +189 -0
  97. package/src/entity/on-entity-changed.ts +214 -0
  98. package/src/entity/registry.test.ts +181 -0
  99. package/src/entity/registry.ts +200 -0
  100. package/src/entity/stable-stringify.test.ts +55 -0
  101. package/src/entity/stable-stringify.ts +49 -0
  102. package/src/entity/wake-index.it.test.ts +251 -0
  103. package/src/entity/with-entity-write.test.ts +100 -0
  104. package/src/entity/with-entity-write.ts +69 -0
  105. package/src/entity-driven-trigger.ts +46 -0
  106. package/src/extension-points.ts +35 -0
  107. package/src/gitops-docs.test.ts +215 -0
  108. package/src/gitops-docs.ts +151 -0
  109. package/src/gitops-kinds.test.ts +174 -0
  110. package/src/gitops-kinds.ts +137 -0
  111. package/src/index.ts +355 -11
  112. package/src/migration/flapping-to-window.test.ts +123 -0
  113. package/src/migration/flapping-to-window.ts +205 -0
  114. package/src/router.test.ts +182 -1
  115. package/src/router.ts +73 -2
  116. package/src/schema.ts +236 -3
  117. package/src/script-test-replay.test.ts +88 -0
  118. package/src/script-test-replay.ts +100 -0
  119. package/src/script-test-shell-env.test.ts +41 -0
  120. package/src/script-test-shell-env.ts +89 -0
  121. package/src/script-test.test.ts +386 -0
  122. package/src/script-test.ts +258 -0
  123. package/src/trigger-registry.ts +2 -0
  124. package/src/validate-definition.test.ts +1 -0
  125. package/tsconfig.json +24 -0
@@ -49,6 +49,7 @@
49
49
  import type {
50
50
  Action,
51
51
  ChooseInput,
52
+ Condition,
52
53
  ConditionGuardInput,
53
54
  DelayInput,
54
55
  ParallelInput,
@@ -58,6 +59,7 @@ import type {
58
59
  StopInput,
59
60
  VariablesInput,
60
61
  WaitForTriggerInput,
62
+ WaitUntilInput,
61
63
  } from "@checkstack/automation-common";
62
64
  import { SYSTEM_ACTOR, type Actor } from "@checkstack/common";
63
65
  import type {
@@ -66,6 +68,8 @@ import type {
66
68
 
67
69
  import type { ActionRunScope } from "../action-types";
68
70
  import { detectActionKind, type ActionKind } from "./action-kind";
71
+ import { wrapGetServiceForRun } from "./run-secret-registry";
72
+ import { reseedRunSecretRegistry } from "./reseed-run-secrets";
69
73
  import { evaluateCondition } from "./condition";
70
74
  import { parseActionPath } from "./path-nav";
71
75
  import {
@@ -80,6 +84,16 @@ import {
80
84
  resolveConsumedArtifacts,
81
85
  withRepeatContext,
82
86
  } from "./scope";
87
+ import {
88
+ enrichScopeWithEntities,
89
+ enrichScopeWithState,
90
+ type EntityRef,
91
+ } from "./state-scope";
92
+ import {
93
+ extractWakeRefs,
94
+ refToString,
95
+ HEALTH_ENTITY_KIND,
96
+ } from "./wake-refs";
83
97
  import {
84
98
  formatActionPath,
85
99
  type ActionPath,
@@ -90,6 +104,72 @@ import {
90
104
  type StepOutcome,
91
105
  } from "./types";
92
106
 
/**
 * Build the dependency bundle a single run dispatches with.
 *
 * When the secret registry and both ref ids are present on `deps`, the
 * returned copy swaps `getService` for the one produced by
 * `wrapGetServiceForRun` for this `runId` (intended to register resolved
 * secret values in the run-scoped registry so run output can be masked).
 * When any of the three is missing (tests / minimal installs) the very same
 * `deps` object is handed back untouched.
 *
 * @param deps  The process-wide dispatch dependencies.
 * @param runId The run the wrapped `getService` is bound to.
 * @returns `deps` itself, or a shallow copy with a run-bound `getService`.
 */
function withRunSecretCapture(
  deps: DispatchDeps,
  runId: string,
): DispatchDeps {
  const { secretRegistry, secretResolverRefId, connectionStoreRefId } = deps;

  if (secretRegistry && secretResolverRefId && connectionStoreRefId) {
    const getService = wrapGetServiceForRun({
      getService: deps.getService,
      runId,
      registry: secretRegistry,
      resolverRefId: secretResolverRefId,
      connectionStoreRefId,
    });
    return { ...deps, getService };
  }

  // Masking is not wired — pass the deps through unchanged.
  return deps;
}
135
+
/**
 * Re-populate a run's mask set on the pod that is about to resume / recover
 * it.
 *
 * The masking registry lives in process memory, so a pod that did not
 * originally resolve the run's secrets starts with an empty mask set and
 * could persist a carried scope value / artifact / error unmasked. This
 * delegates to `reseedRunSecretRegistry`, handing it the WRAPPED
 * `getService` (the registering one) together with the automation's
 * definition, so the declared secret refs are resolved again and land in
 * the run's mask set before the caller walks + persists.
 *
 * Does nothing when the registry or either ref id is absent from `deps`
 * (tests / minimal installs).
 *
 * @param deps        Unwrapped deps — source of the registry, ref ids, logger.
 * @param wrappedDeps Deps whose `getService` is the run-bound wrapper.
 * @param runId       Run whose mask set is re-seeded.
 * @param automation  Automation whose `definition` declares the secret refs.
 */
async function reseedRunMaskSet(
  deps: DispatchDeps,
  wrappedDeps: DispatchDeps,
  runId: string,
  automation: LoadedAutomation,
): Promise<void> {
  const { secretRegistry, secretResolverRefId, connectionStoreRefId, logger } =
    deps;

  if (secretRegistry && secretResolverRefId && connectionStoreRefId) {
    await reseedRunSecretRegistry({
      getService: wrappedDeps.getService,
      registry: secretRegistry,
      runId,
      definition: automation.definition,
      resolverRefId: secretResolverRefId,
      connectionStoreRefId,
      logger,
    });
  }
}
172
+
/** Queue name under which crash-safe (durable) delay jobs are scheduled. */
export const DELAY_QUEUE_NAME = "automation-delay";
95
175
 
@@ -102,6 +182,27 @@ export interface DelayResumeJob {
102
182
  waitLockId: string;
103
183
  }
104
184
 
/**
 * Queue name for the timeout timer of a reactive `wait_until`
 * (reactive automation engine §7, §13.1).
 *
 * A suspended `wait_until` is woken by a relevant `ENTITY_CHANGED`
 * (Stage 1 → `checkWaitUntil`); this queue is not a polling / re-check
 * loop. It carries at most one job per suspended wait, scheduled for the
 * wait's deadline — the same shape as the dwell timer. When that job fires,
 * the consumer goes through `checkWaitUntil`, which evaluates the condition
 * one last time and then applies the continue/fail timeout policy.
 */
export const WAIT_TIMEOUT_QUEUE_NAME = "automation-wait-timeout";
196
+
/**
 * Payload of a `wait_until` timeout-timer job: just enough for the consumer
 * to locate the suspended wait, re-evaluate it a final time and apply the
 * timeout policy.
 */
export interface WaitTimeoutJob {
  /** Run that is suspended on the `wait_until`. */
  runId: string;
  /** Wait lock created for that suspension. */
  waitLockId: string;
}
205
+
105
206
  // ─── Public entry points ──────────────────────────────────────────────────
106
207
 
107
208
  export interface DispatchTriggerArgs {
@@ -142,7 +243,7 @@ export async function dispatchTrigger(
142
243
  });
143
244
 
144
245
  const ctx: DispatchContext = {
145
- deps,
246
+ deps: withRunSecretCapture(deps, runId),
146
247
  run: {
147
248
  runId,
148
249
  automation: args.automation,
@@ -162,6 +263,18 @@ export async function dispatchTrigger(
162
263
  resuming: false,
163
264
  };
164
265
 
266
+ // Pre-resolve live health state into scope before any condition or
267
+ // template evaluation (the engine is sync, so this is the only place
268
+ // live state can be fetched). Fail-open inside the helper.
269
+ await enrichScopeWithState({
270
+ scope: ctx.scope,
271
+ client: deps.healthCheckClient,
272
+ logger: deps.logger,
273
+ contextKey: args.contextKey,
274
+ usesState: args.automation.definition.uses_state,
275
+ transitionWindowMinutes: args.automation.definition.state_window_minutes,
276
+ });
277
+
165
278
  // Initial scope snapshot — gives the stalled sweeper something to
166
279
  // work with even if we crash before the first step finishes.
167
280
  await deps.runStateStore.upsert({
@@ -215,11 +328,27 @@ export async function resumeRun(
215
328
  const run = await deps.runStore.loadRun(args.runId);
216
329
  if (!run) throw new Error(`Cannot resume — run ${args.runId} not found`);
217
330
 
331
+ // Only a `waiting` run may be resumed. A run that was cancelled (restart
332
+ // mode / operator cancel) or already reached a terminal state must NEVER
333
+ // be resurrected by a late wake (wakeWaitingRuns, delay-expiry sweep, a
334
+ // racing queue job). Drop any stale wait lock for the run and return —
335
+ // mirrors the guard `checkWaitUntil` already applies for `until` locks.
336
+ if (run.status !== "waiting") {
337
+ const stale = await deps.runStore.findWaitLocksByRun(args.runId);
338
+ for (const lock of stale) {
339
+ await deps.runStore.deleteWaitLock(lock.id);
340
+ }
341
+ deps.logger.debug(
342
+ `resumeRun: run ${args.runId} is "${run.status}", not "waiting"; dropped ${stale.length} stale wait lock(s) and skipped resume`,
343
+ );
344
+ return { status: run.status };
345
+ }
346
+
218
347
  const waitedAt = parseActionPath(args.waitedAtPath);
219
348
 
220
349
  // Try to acquire the advisory lock so two resumers don't race.
221
- const acquired = await deps.runStateStore.tryAdvisoryLock(args.runId);
222
- if (!acquired) {
350
+ const lock = await deps.runStateStore.tryAdvisoryLock(args.runId);
351
+ if (!lock) {
223
352
  deps.logger.debug(
224
353
  `resumeRun: another instance already holds the lock for run ${args.runId}; skipping`,
225
354
  );
@@ -240,11 +369,30 @@ export async function resumeRun(
240
369
  scope.resume = { payload: args.payload };
241
370
  }
242
371
 
372
+ // Re-resolve live state on resume: the system may have changed during
373
+ // the wait, so conditions after a wait must see current state, not
374
+ // the snapshot taken at suspension time.
375
+ await enrichScopeWithState({
376
+ scope,
377
+ client: deps.healthCheckClient,
378
+ logger: deps.logger,
379
+ contextKey: run.contextKey,
380
+ usesState: args.automation.definition.uses_state,
381
+ transitionWindowMinutes: args.automation.definition.state_window_minutes,
382
+ });
383
+
243
384
  await deps.runStore.updateRunStatus(args.runId, "running");
244
385
  await deps.runStateStore.heartbeat(args.runId);
245
386
 
387
+ const wrappedDeps = withRunSecretCapture(deps, args.runId);
388
+ // Cross-pod mask re-seed: this pod may not be the one that resolved the
389
+ // run's secrets, so re-populate its (empty) mask set from the declared
390
+ // refs BEFORE walking / persisting — otherwise carried scope / artifact
391
+ // values would persist unmasked here. See `reseed-run-secrets.ts`.
392
+ await reseedRunMaskSet(deps, wrappedDeps, args.runId, args.automation);
393
+
246
394
  const ctx: DispatchContext = {
247
- deps,
395
+ deps: wrappedDeps,
248
396
  run: {
249
397
  runId: args.runId,
250
398
  automation: args.automation,
@@ -271,7 +419,7 @@ export async function resumeRun(
271
419
 
272
420
  return await finaliseRun(ctx, outcome);
273
421
  } finally {
274
- await deps.runStateStore.releaseAdvisoryLock(args.runId);
422
+ await lock.release();
275
423
  }
276
424
  }
277
425
 
@@ -287,7 +435,23 @@ export async function recoverStalledRun(
287
435
  ): Promise<{ status: string }> {
288
436
  const run = await deps.runStore.loadRun(args.runId);
289
437
  if (!run) throw new Error(`recoverStalledRun: run ${args.runId} not found`);
290
- if (run.status !== "running" && run.status !== "waiting") {
438
+ if (run.status !== "running") {
439
+ // Only genuinely-running runs are recoverable here. A `waiting` run is
440
+ // owned by the wait-lock / queue resume paths; recovering it would
441
+ // re-walk an intentional wait. (The sweeper now filters to `running`,
442
+ // but guard here too so a direct caller can't resurrect a wait.)
443
+ return { status: run.status };
444
+ }
445
+
446
+ // A live wait lock means this run is intentionally suspended (a wait the
447
+ // status update may not yet reflect, or a racing path). Refuse rather
448
+ // than from-top re-walk: re-running pre-wait actions has observable side
449
+ // effects. The wait-lock / queue resume paths own this run.
450
+ const existingLocks = await deps.runStore.findWaitLocksByRun(args.runId);
451
+ if (existingLocks.length > 0) {
452
+ deps.logger.debug(
453
+ `recoverStalledRun: run ${args.runId} holds ${existingLocks.length} live wait lock(s); leaving it to the wait/resume paths`,
454
+ );
291
455
  return { status: run.status };
292
456
  }
293
457
 
@@ -307,8 +471,14 @@ export async function recoverStalledRun(
307
471
  await deps.runStore.updateRunStatus(args.runId, "running");
308
472
  await deps.runStateStore.heartbeat(args.runId);
309
473
 
474
+ const wrappedDeps = withRunSecretCapture(deps, args.runId);
475
+ // Cross-pod mask re-seed (see `reseedRunMaskSet` in `resumeRun`): the
476
+ // sweeper pod recovering this stalled run did not resolve its secrets, so
477
+ // re-populate the mask set from the declared refs before re-walking.
478
+ await reseedRunMaskSet(deps, wrappedDeps, args.runId, args.automation);
479
+
310
480
  const ctx: DispatchContext = {
311
- deps,
481
+ deps: wrappedDeps,
312
482
  run: {
313
483
  runId: args.runId,
314
484
  automation: args.automation,
@@ -364,6 +534,206 @@ export async function recoverStalledRun(
364
534
  return await finaliseRun(ctx, outcome);
365
535
  }
366
536
 
/**
 * Result of one `wait_until` re-check (`checkWaitUntil`).
 *
 * - `"resumed"`: the condition held, or the deadline passed with
 *   `continueOnTimeout` set; the wait lock was deleted and `resumeRun` was
 *   invoked for the run.
 * - `"failed"`: the deadline passed without `continueOnTimeout`; the wait
 *   lock was deleted and the run was marked failed.
 * - `"still-waiting"`: the condition does not hold yet and the deadline has
 *   not passed; the wait lock is left in place.
 * - `"gone"`: the lock is missing / not an `until` lock, or the run is
 *   missing or no longer `waiting`; there is nothing to act on.
 */
export type WaitUntilCheckOutcome =
  | "gone"
  | "still-waiting"
  | "resumed"
  | "failed";
550
+
/**
 * Refresh the live-state portion of a `wait_until` scope in place, so the
 * condition is evaluated against CURRENT state rather than whatever was in
 * scope when the wait suspended.
 *
 * Two passes, split by entity kind:
 *
 *  1. Health goes through `enrichScopeWithState` and the RPC
 *     `healthCheckClient` (the health aggregate is computed on read, not
 *     stored as an entity row). When the change that woke the wait is a
 *     concrete health ref, its id is appended to the automation's
 *     `uses_state` so a wildcard health wait still resolves the system that
 *     actually changed.
 *  2. Every other kind goes through `enrichScopeWithEntities` with
 *     `deps.entityResolverFor`. The refs are the ones statically extracted
 *     from the condition plus the changed ref; health refs, wildcard ids and
 *     empty ids are dropped, and duplicates are collapsed.
 *
 * @param args.changedRef Optional `${kind}:${id}` ref of the change that
 *   triggered the re-check. A value without a `:` after the first character
 *   is ignored.
 */
async function reEnrichWaitScope(args: {
  deps: DispatchDeps;
  scope: Record<string, unknown>;
  automation: LoadedAutomation;
  contextKey: string | null;
  condition: Condition;
  changedRef?: string;
}): Promise<void> {
  const { deps, scope, automation, contextKey, condition } = args;
  const rawChanged = args.changedRef;

  // Parse `${kind}:${id}` once; both passes below consult the result.
  const separatorAt = rawChanged ? rawChanged.indexOf(":") : -1;
  const changed =
    rawChanged && separatorAt > 0
      ? {
          kind: rawChanged.slice(0, separatorAt),
          id: rawChanged.slice(separatorAt + 1),
        }
      : undefined;

  // ── Pass 1: health (RPC-resolved, populates scope.health.*) ────────────
  // A wildcard health wait can be woken by a system that is neither the
  // contextKey nor listed in `uses_state`; without injecting its id the
  // re-evaluation would look at an unresolved system and never resume.
  const declaredState = automation.definition.uses_state;
  const usesState =
    changed &&
    changed.kind === HEALTH_ENTITY_KIND &&
    changed.id !== "" &&
    changed.id !== "*"
      ? [...(declaredState ?? []), changed.id]
      : declaredState;

  await enrichScopeWithState({
    scope,
    client: deps.healthCheckClient,
    logger: deps.logger,
    contextKey,
    usesState,
    transitionWindowMinutes: automation.definition.state_window_minutes,
  });

  // ── Pass 2: every non-health kind (entity-store-resolved) ──────────────
  // Keyed by `${kind}:${id}` so each entity is resolved at most once while
  // first-seen order is preserved.
  const uniqueRefs = new Map<string, EntityRef>();
  const collect = (kind: string, id: string): void => {
    const resolvable =
      kind !== HEALTH_ENTITY_KIND && id !== "*" && id.length > 0;
    if (!resolvable) return;
    const key = `${kind}:${id}`;
    if (!uniqueRefs.has(key)) uniqueRefs.set(key, { kind, id });
  };

  for (const extracted of extractWakeRefs(condition).refs) {
    collect(extracted.kind, extracted.id);
  }
  if (changed && changed.id) collect(changed.kind, changed.id);

  if (uniqueRefs.size === 0) return;

  await enrichScopeWithEntities({
    scope,
    logger: deps.logger,
    refs: Array.from(uniqueRefs.values()),
    resolverFor: (kind) => deps.entityResolverFor?.(kind),
  });
}
636
+
/**
 * Re-check a suspended `wait_until`.
 *
 * Loads the wait lock and the run, rebuilds the scope from the persisted
 * snapshot (or an initial scope when no snapshot exists), re-enriches live
 * state, evaluates the stored condition and then:
 *
 *  - condition holds, or the deadline passed with `continueOnTimeout`:
 *    deletes the wait lock, calls `resumeRun`, returns `"resumed"`;
 *  - deadline passed without `continueOnTimeout`: deletes the wait lock,
 *    marks the run `failed`, clears its durable state, returns `"failed"`;
 *  - otherwise: leaves everything in place and returns `"still-waiting"`.
 *
 * Returns `"gone"` when the lock is missing, is not an `until` lock or has
 * no `waitConfig`, or when the run is missing or no longer `waiting` (in the
 * latter two cases the stale lock is deleted).
 *
 * A condition that throws during evaluation is logged at `warn` and treated
 * as "not satisfied yet".
 *
 * @param deps Dispatch dependencies (stores, logger, filters, clients).
 * @param args.runId      Run suspended on the wait.
 * @param args.waitLockId Wait lock created for that suspension.
 * @param args.automation Automation the run belongs to.
 * @param args.changedRef `${kind}:${id}` ref of the change that woke this
 *   wait (Stage-2 `wake` job). Passed to the re-enrichment so the changed
 *   entity is always resolved into scope — needed for a wildcard wait whose
 *   condition reads a dynamic id that cannot be extracted statically.
 */
export async function checkWaitUntil(
  deps: DispatchDeps,
  args: {
    runId: string;
    waitLockId: string;
    automation: LoadedAutomation;
    changedRef?: string;
  },
): Promise<WaitUntilCheckOutcome> {
  const lock = await deps.runStore.loadWaitLock(args.waitLockId);
  if (!lock || lock.kind !== "until" || !lock.waitConfig) return "gone";

  const run = await deps.runStore.loadRun(args.runId);
  if (!run || run.status !== "waiting") {
    // Run vanished, or was already resumed / cancelled / reached a terminal
    // state — the lock is stale, drop it.
    // NOTE(review): a run that is still arming this very wait would also be
    // seen here as not-"waiting" if its status is only written after the
    // lock is created — confirm callers cannot reach this during arming.
    await deps.runStore.deleteWaitLock(args.waitLockId);
    return "gone";
  }

  // Rebuild the scope from the snapshot + re-enrich live state so the
  // condition sees CURRENT state, not the value at suspension time.
  const persisted = await deps.runStateStore.load(args.runId);
  const scope = persisted?.scopeSnapshot
    ? { ...persisted.scopeSnapshot }
    : buildInitialScope({
        triggerId: run.triggerId,
        triggerEventId: run.triggerEventId,
        payload: run.triggerPayload,
        startedAt: run.startedAt,
      });
  await reEnrichWaitScope({
    deps,
    scope,
    automation: args.automation,
    contextKey: run.contextKey,
    condition: lock.waitConfig.condition,
    changedRef: args.changedRef,
  });

  let satisfied = false;
  try {
    satisfied = evaluateCondition(
      lock.waitConfig.condition,
      scope as TemplateContext,
      deps.filters,
    );
  } catch (error) {
    // A caught value is not guaranteed to be an Error; narrow before reading
    // `.message` so a thrown string / object is still logged meaningfully.
    const reason = error instanceof Error ? error.message : String(error);
    deps.logger.warn(
      `wait_until re-check threw (treating as not-yet): ${reason}`,
    );
  }

  const timedOut =
    lock.timeoutAt !== null && lock.timeoutAt.getTime() <= Date.now();

  if (satisfied || (timedOut && lock.waitConfig.continueOnTimeout)) {
    // The lock is removed first so a duplicate check finds nothing to act on.
    await deps.runStore.deleteWaitLock(args.waitLockId);
    // NOTE(review): the result of `resumeRun` is not inspected; "resumed" is
    // reported even if `resumeRun` decided to skip — verify that is intended.
    await resumeRun(deps, {
      runId: args.runId,
      automation: args.automation,
      waitedAtPath: lock.actionPath,
    });
    return "resumed";
  }

  if (timedOut) {
    // continue_on_timeout = false → fail the run.
    await deps.runStore.deleteWaitLock(args.waitLockId);
    await deps.runStore.updateRunStatus(
      args.runId,
      "failed",
      `wait_until timed out after waiting for its condition`,
    );
    await deps.runStateStore.clear(args.runId);
    return "failed";
  }

  return "still-waiting";
}
736
+
367
737
  // ─── Run finalisation ─────────────────────────────────────────────────────
368
738
 
369
739
  async function finaliseRun(
@@ -393,11 +763,14 @@ async function finaliseRun(
393
763
  errorMessage,
394
764
  );
395
765
  // Terminal runs drop their durable state. Suspended runs keep it so
396
- // resumption has the scope to work with.
766
+ // resumption has the scope to work with — but we must NOT clobber
767
+ // `lastActionPath`: the suspending action already checkpointed its real
768
+ // path, and a crash recovery needs that to resume from the wait rather
769
+ // than re-walking from actions[0] (which would re-fire pre-wait side
770
+ // effects). Omit it so the existing checkpoint survives.
397
771
  await (status === "waiting" ? ctx.deps.runStateStore.upsert({
398
772
  runId: ctx.run.runId,
399
773
  scopeSnapshot: ctx.scope,
400
- lastActionPath: null,
401
774
  }) : ctx.deps.runStateStore.clear(ctx.run.runId));
402
775
  return { runId: ctx.run.runId, status };
403
776
  }
@@ -578,6 +951,9 @@ async function executeAction(
578
951
  ctx,
579
952
  );
580
953
  }
954
+ case "wait_until": {
955
+ return await executeWaitUntil(action as WaitUntilInput, path, ctx);
956
+ }
581
957
  case "sequence": {
582
958
  return await executeSequence(
583
959
  action as SequenceInput,
@@ -1596,6 +1972,176 @@ async function executeWaitForTrigger(
1596
1972
  return { kind: "suspended", stepId };
1597
1973
  }
1598
1974
 
1975
+ // ─── Primitive: `wait_until` ─────────────────────────────────────────────
1976
+
/**
 * Primitive `wait_until`: suspend the run until a condition becomes true,
 * with an optional timeout.
 *
 * Unlike `wait_for_trigger` (wait for a named event), `wait_until` is
 * REACTIVE (reactive automation engine §7). The steps, in order:
 *
 *  1. Record a `wait_until` step.
 *  2. Fast path — if the condition already holds against the current scope,
 *     mark the step successful and continue inline.
 *  3. Statically extract the refs the condition reads (§8.3) and create a
 *     wait lock together with its wake refs (§8.1).
 *  4. Checkpoint so a later wake re-check can rebuild the scope.
 *  5. Arm-window guard (§17) — re-enrich and re-evaluate once now that the
 *     lock exists; if the condition holds, delete the lock and continue
 *     inline.
 *  6. If a deadline exists, enqueue a single timeout job (not a poll loop).
 *  7. Mark the step `waiting` and report the run as suspended.
 *
 * An evaluation that throws is treated as "not satisfied yet" in both
 * places the condition is evaluated. When no ref could be extracted from a
 * state-reading condition, a `warn` is logged so the wait is never silently
 * unwakeable (§8.3).
 *
 * @param action The `wait_until` action; `action.wait_until` holds the
 *   condition, `timeout_seconds` and `continue_on_timeout`.
 * @param path   Position of the action within the automation.
 * @param ctx    Dispatch context of the current run.
 * @returns `{ kind: "ok" }` when the condition held (immediately or in the
 *   arm window), otherwise `{ kind: "suspended", stepId }`.
 */
async function executeWaitUntil(
  action: WaitUntilInput,
  path: ActionPath,
  ctx: DispatchContext,
): Promise<StepOutcome> {
  // Formatted once; reused for the step, the lock, its marker and the logs.
  const actionPath = formatActionPath(path);

  const stepId = await ctx.deps.runStore.createStep({
    runId: ctx.run.runId,
    actionPath,
    actionId: action.id ?? null,
    actionKind: "wait_until",
    providerActionId: null,
  });

  const cfg = action.wait_until;

  // Fast path — already satisfied. Evaluate against the current scope
  // (enriched at run start / resume). Errors are treated as "not yet".
  let satisfied = false;
  try {
    satisfied = evaluateCondition(
      cfg.condition,
      templateContext(ctx),
      ctx.deps.filters,
    );
  } catch (error) {
    // Narrow instead of casting: a thrown non-Error must not log "undefined".
    const reason = error instanceof Error ? error.message : String(error);
    ctx.deps.logger.debug(
      `wait_until initial eval threw (treating as not-yet): ${reason}`,
    );
  }
  if (satisfied) {
    await ctx.deps.runStore.updateStep(stepId, {
      status: "success",
      resultPayload: { satisfied: true, immediate: true },
    });
    return { kind: "ok" };
  }

  const continueOnTimeout = cfg.continue_on_timeout ?? true;
  // Truthiness check on purpose: an absent AND a zero `timeout_seconds` both
  // result in no deadline.
  const timeoutAt = cfg.timeout_seconds
    ? new Date(Date.now() + cfg.timeout_seconds * 1000)
    : null;

  // Static reference extraction → wake-index dependency refs (§8.3).
  const extracted = extractWakeRefs(cfg.condition);
  const wakeRefs = extracted.refs.map((ref) => refToString(ref));

  if (extracted.indeterminate && wakeRefs.length === 0) {
    // The condition reads live state but no concrete-or-wildcard ref could
    // be derived: the wait can only ever be released by the timeout timer.
    // Never silent (§8.3, §12).
    if (timeoutAt) {
      ctx.deps.logger.warn(
        `wait_until at ${actionPath} (run ${ctx.run.runId}): could not extract any state ref from the condition; relying on the timeout timer only — it will not wake on state changes.`,
      );
    } else {
      ctx.deps.logger.warn(
        `wait_until at ${actionPath} (run ${ctx.run.runId}): could not extract any state ref AND no timeout is set; this wait will never wake. Add a timeout or a concrete state.* reference.`,
      );
    }
  }

  const waitLockId = await ctx.deps.runStore.createWaitLockWithWakeRefs({
    runId: ctx.run.runId,
    actionPath,
    // Synthetic marker — reactive `until` locks aren't woken by named events.
    eventId: `@@until:${ctx.run.runId}:${actionPath}`,
    contextKey: ctx.run.contextKey,
    timeoutAt,
    waitConfig: {
      condition: cfg.condition,
      continueOnTimeout,
    },
    wakeRefs,
  });

  // Persist scope before suspending so the wake re-check rebuilds it.
  await checkpoint(ctx, path);

  // Re-evaluate-on-registration guard (reactive automation engine §17).
  // The condition was checked above, THEN the lock + its wake refs were
  // committed. A relevant change that committed before those rows became
  // visible found no lock and enqueued no wake job — a lost wakeup that a
  // no-timeout wait would never recover from. Re-evaluating ONCE against
  // freshly re-enriched scope, now that the lock is armed, makes any such
  // change observable. If the condition holds, drop the lock and continue
  // the current walk inline.
  let armedSatisfied = false;
  try {
    await reEnrichWaitScope({
      deps: ctx.deps,
      scope: ctx.scope,
      automation: ctx.run.automation,
      contextKey: ctx.run.contextKey,
      condition: cfg.condition,
    });
    armedSatisfied = evaluateCondition(
      cfg.condition,
      templateContext(ctx),
      ctx.deps.filters,
    );
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    ctx.deps.logger.debug(
      `wait_until arm-window re-eval threw (treating as not-yet): ${reason}`,
    );
  }
  if (armedSatisfied) {
    await ctx.deps.runStore.deleteWaitLock(waitLockId);
    await ctx.deps.runStore.updateStep(stepId, {
      status: "success",
      resultPayload: { satisfied: true, armWindow: true },
    });
    return { kind: "ok" };
  }

  // Single durable timeout timer (NOT a poll loop). Only armed when a
  // deadline exists; otherwise the wait is purely event-driven.
  if (timeoutAt) {
    const queue = ctx.deps.queueManager.getQueue<WaitTimeoutJob>(
      WAIT_TIMEOUT_QUEUE_NAME,
    );
    // Remaining time in whole seconds, rounded up and never negative.
    // NOTE(review): presumes `startDelay` is expressed in seconds — confirm
    // against the queue API.
    const remainingSeconds = Math.max(
      Math.ceil((timeoutAt.getTime() - Date.now()) / 1000),
      0,
    );
    await queue.enqueue(
      { runId: ctx.run.runId, waitLockId },
      {
        startDelay: remainingSeconds,
        jobId: `${ctx.run.runId}:${waitLockId}:timeout`,
      },
    );
  }

  await ctx.deps.runStore.updateStep(stepId, {
    status: "waiting",
    resultPayload: {
      waitLockId,
      wakeRefs,
      timeoutAt: timeoutAt?.toISOString(),
    },
  });
  return { kind: "suspended", stepId };
}
2144
+
1599
2145
  // ─── Helpers ─────────────────────────────────────────────────────────────
1600
2146
 
1601
2147
  // ─── Primitive: `sequence` ───────────────────────────────────────────────