@checkstack/automation-backend 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +544 -0
- package/drizzle/0003_sparkling_xorn.sql +17 -0
- package/drizzle/0004_cultured_spyke.sql +2 -0
- package/drizzle/0005_classy_the_hand.sql +19 -0
- package/drizzle/0006_burly_wallop.sql +10 -0
- package/drizzle/0007_nappy_jackal.sql +1 -0
- package/drizzle/0008_remove_seeded_auto_incident_automations.sql +13 -0
- package/drizzle/0009_steady_liz_osborn.sql +12 -0
- package/drizzle/0010_chunky_changeling.sql +2 -0
- package/drizzle/meta/0003_snapshot.json +1007 -0
- package/drizzle/meta/0004_snapshot.json +1028 -0
- package/drizzle/meta/0005_snapshot.json +1164 -0
- package/drizzle/meta/0006_snapshot.json +1261 -0
- package/drizzle/meta/0007_snapshot.json +1215 -0
- package/drizzle/meta/0008_snapshot.json +1215 -0
- package/drizzle/meta/0009_snapshot.json +1328 -0
- package/drizzle/meta/0010_snapshot.json +1349 -0
- package/drizzle/meta/_journal.json +56 -0
- package/package.json +23 -12
- package/src/action-types.ts +23 -0
- package/src/artifact-store.ts +16 -1
- package/src/automation-store.test.ts +143 -0
- package/src/automation-store.ts +30 -8
- package/src/builtin-triggers.test.ts +77 -74
- package/src/builtin-triggers.ts +105 -108
- package/src/dispatch/action-kind.ts +2 -0
- package/src/dispatch/assemble-get-service.ts +31 -0
- package/src/dispatch/cancel-resurrect.test.ts +147 -0
- package/src/dispatch/concurrency-race.test.ts +255 -0
- package/src/dispatch/concurrency-scope.test.ts +166 -0
- package/src/dispatch/condition.ts +24 -5
- package/src/dispatch/dwell-queue.ts +65 -0
- package/src/dispatch/dwell-store.ts +154 -0
- package/src/dispatch/dwell.it.test.ts +142 -0
- package/src/dispatch/dwell.test.ts +799 -0
- package/src/dispatch/dwell.ts +257 -0
- package/src/dispatch/engine.test.ts +189 -2
- package/src/dispatch/engine.ts +555 -9
- package/src/dispatch/entity-scope.test.ts +176 -0
- package/src/dispatch/get-service-wiring.test.ts +318 -0
- package/src/dispatch/numeric.test.ts +71 -0
- package/src/dispatch/numeric.ts +96 -0
- package/src/dispatch/render.test.ts +34 -0
- package/src/dispatch/render.ts +31 -11
- package/src/dispatch/reseed-run-secrets.ts +230 -0
- package/src/dispatch/run-secret-registry.test.ts +189 -0
- package/src/dispatch/run-secret-registry.ts +247 -0
- package/src/dispatch/run-state-masking.test.ts +376 -0
- package/src/dispatch/run-state-store.ts +95 -38
- package/src/dispatch/run-state.ts +226 -59
- package/src/dispatch/scope-artifact-masking.test.ts +138 -0
- package/src/dispatch/secret-ref-ids.test.ts +19 -0
- package/src/dispatch/secret-ref-ids.ts +17 -0
- package/src/dispatch/snapshots.test.ts +86 -0
- package/src/dispatch/snapshots.ts +79 -0
- package/src/dispatch/stage1-router.test.ts +324 -0
- package/src/dispatch/stage1-router.ts +152 -0
- package/src/dispatch/stage1.it.test.ts +84 -0
- package/src/dispatch/stage2-dispatch.test.ts +285 -0
- package/src/dispatch/stage2-dispatch.ts +207 -0
- package/src/dispatch/stage2-stalled.it.test.ts +132 -0
- package/src/dispatch/stalled-sweeper.test.ts +197 -0
- package/src/dispatch/stalled-sweeper.ts +112 -5
- package/src/dispatch/state-scope.test.ts +234 -0
- package/src/dispatch/state-scope.ts +322 -0
- package/src/dispatch/structured-conditions.test.ts +246 -0
- package/src/dispatch/structured-conditions.ts +146 -0
- package/src/dispatch/test-fixtures.ts +306 -38
- package/src/dispatch/trigger-fanin.test.ts +111 -0
- package/src/dispatch/trigger-subscriber.ts +316 -14
- package/src/dispatch/types.ts +263 -8
- package/src/dispatch/wait-timeout-queue.ts +89 -0
- package/src/dispatch/wait-until-entity-wake.test.ts +544 -0
- package/src/dispatch/wait-until.test.ts +540 -0
- package/src/dispatch/wake-refs.test.ts +158 -0
- package/src/dispatch/wake-refs.ts +348 -0
- package/src/dispatch/window-gate.test.ts +513 -0
- package/src/dispatch/window-store.test.ts +162 -0
- package/src/dispatch/window-store.ts +102 -0
- package/src/entity/change-derivers.test.ts +148 -0
- package/src/entity/change-derivers.ts +143 -0
- package/src/entity/change-emitter.test.ts +66 -0
- package/src/entity/change-emitter.ts +76 -0
- package/src/entity/create-handle.ts +344 -0
- package/src/entity/cross-pod-read-consistency.it.test.ts +281 -0
- package/src/entity/define-entity.ts +157 -0
- package/src/entity/diff.test.ts +57 -0
- package/src/entity/diff.ts +54 -0
- package/src/entity/entity-store.test.ts +30 -0
- package/src/entity/entity-store.ts +171 -0
- package/src/entity/extension-point.ts +56 -0
- package/src/entity/fake-entity-store.ts +130 -0
- package/src/entity/hook.ts +19 -0
- package/src/entity/index.ts +50 -0
- package/src/entity/mutate-handle.test.ts +517 -0
- package/src/entity/on-entity-changed.test.ts +189 -0
- package/src/entity/on-entity-changed.ts +214 -0
- package/src/entity/registry.test.ts +181 -0
- package/src/entity/registry.ts +200 -0
- package/src/entity/stable-stringify.test.ts +55 -0
- package/src/entity/stable-stringify.ts +49 -0
- package/src/entity/wake-index.it.test.ts +251 -0
- package/src/entity/with-entity-write.test.ts +100 -0
- package/src/entity/with-entity-write.ts +69 -0
- package/src/entity-driven-trigger.ts +46 -0
- package/src/extension-points.ts +35 -0
- package/src/gitops-docs.test.ts +215 -0
- package/src/gitops-docs.ts +151 -0
- package/src/gitops-kinds.test.ts +174 -0
- package/src/gitops-kinds.ts +137 -0
- package/src/index.ts +355 -11
- package/src/migration/flapping-to-window.test.ts +123 -0
- package/src/migration/flapping-to-window.ts +205 -0
- package/src/router.test.ts +182 -1
- package/src/router.ts +73 -2
- package/src/schema.ts +236 -3
- package/src/script-test-replay.test.ts +88 -0
- package/src/script-test-replay.ts +100 -0
- package/src/script-test-shell-env.test.ts +41 -0
- package/src/script-test-shell-env.ts +89 -0
- package/src/script-test.test.ts +386 -0
- package/src/script-test.ts +258 -0
- package/src/trigger-registry.ts +2 -0
- package/src/validate-definition.test.ts +1 -0
- package/tsconfig.json +24 -0
package/src/dispatch/engine.ts
CHANGED
|
@@ -49,6 +49,7 @@
|
|
|
49
49
|
import type {
|
|
50
50
|
Action,
|
|
51
51
|
ChooseInput,
|
|
52
|
+
Condition,
|
|
52
53
|
ConditionGuardInput,
|
|
53
54
|
DelayInput,
|
|
54
55
|
ParallelInput,
|
|
@@ -58,6 +59,7 @@ import type {
|
|
|
58
59
|
StopInput,
|
|
59
60
|
VariablesInput,
|
|
60
61
|
WaitForTriggerInput,
|
|
62
|
+
WaitUntilInput,
|
|
61
63
|
} from "@checkstack/automation-common";
|
|
62
64
|
import { SYSTEM_ACTOR, type Actor } from "@checkstack/common";
|
|
63
65
|
import type {
|
|
@@ -66,6 +68,8 @@ import type {
|
|
|
66
68
|
|
|
67
69
|
import type { ActionRunScope } from "../action-types";
|
|
68
70
|
import { detectActionKind, type ActionKind } from "./action-kind";
|
|
71
|
+
import { wrapGetServiceForRun } from "./run-secret-registry";
|
|
72
|
+
import { reseedRunSecretRegistry } from "./reseed-run-secrets";
|
|
69
73
|
import { evaluateCondition } from "./condition";
|
|
70
74
|
import { parseActionPath } from "./path-nav";
|
|
71
75
|
import {
|
|
@@ -80,6 +84,16 @@ import {
|
|
|
80
84
|
resolveConsumedArtifacts,
|
|
81
85
|
withRepeatContext,
|
|
82
86
|
} from "./scope";
|
|
87
|
+
import {
|
|
88
|
+
enrichScopeWithEntities,
|
|
89
|
+
enrichScopeWithState,
|
|
90
|
+
type EntityRef,
|
|
91
|
+
} from "./state-scope";
|
|
92
|
+
import {
|
|
93
|
+
extractWakeRefs,
|
|
94
|
+
refToString,
|
|
95
|
+
HEALTH_ENTITY_KIND,
|
|
96
|
+
} from "./wake-refs";
|
|
83
97
|
import {
|
|
84
98
|
formatActionPath,
|
|
85
99
|
type ActionPath,
|
|
@@ -90,6 +104,72 @@ import {
|
|
|
90
104
|
type StepOutcome,
|
|
91
105
|
} from "./types";
|
|
92
106
|
|
|
107
|
+
/**
|
|
108
|
+
* Per-run deps whose `getService` registers every resolved secret value
|
|
109
|
+
* into the run-scoped secret registry (for run-wide output masking). When
|
|
110
|
+
* the registry / ref-ids aren't configured (tests / minimal installs),
|
|
111
|
+
* the deps pass through unchanged.
|
|
112
|
+
*/
|
|
113
|
+
function withRunSecretCapture(
|
|
114
|
+
deps: DispatchDeps,
|
|
115
|
+
runId: string,
|
|
116
|
+
): DispatchDeps {
|
|
117
|
+
if (
|
|
118
|
+
!deps.secretRegistry ||
|
|
119
|
+
!deps.secretResolverRefId ||
|
|
120
|
+
!deps.connectionStoreRefId
|
|
121
|
+
) {
|
|
122
|
+
return deps;
|
|
123
|
+
}
|
|
124
|
+
return {
|
|
125
|
+
...deps,
|
|
126
|
+
getService: wrapGetServiceForRun({
|
|
127
|
+
getService: deps.getService,
|
|
128
|
+
runId,
|
|
129
|
+
registry: deps.secretRegistry,
|
|
130
|
+
resolverRefId: deps.secretResolverRefId,
|
|
131
|
+
connectionStoreRefId: deps.connectionStoreRefId,
|
|
132
|
+
}),
|
|
133
|
+
};
|
|
134
|
+
}
|
|
135
|
+
|
|
136
|
+
/**
|
|
137
|
+
* Re-seed a resuming pod's run mask set from the automation's declared
|
|
138
|
+
* secret refs. The run's masking registry is in-memory and per-process, so
|
|
139
|
+
* a pod that did NOT originally resolve the run's secrets (the resume /
|
|
140
|
+
* stalled-recovery case) starts with an EMPTY mask set — letting a carried
|
|
141
|
+
* scope value / artifact / error persist unmasked. Re-resolving the
|
|
142
|
+
* declared `secretEnv` mappings + connection refs through the run's wrapped
|
|
143
|
+
* `getService` (which auto-registers) re-populates the same least-privilege
|
|
144
|
+
* set before we walk + persist. No-op when masking isn't wired (tests /
|
|
145
|
+
* minimal installs) or when `ctx.deps.getService` wasn't wrapped.
|
|
146
|
+
*/
|
|
147
|
+
async function reseedRunMaskSet(
|
|
148
|
+
deps: DispatchDeps,
|
|
149
|
+
wrappedDeps: DispatchDeps,
|
|
150
|
+
runId: string,
|
|
151
|
+
automation: LoadedAutomation,
|
|
152
|
+
): Promise<void> {
|
|
153
|
+
if (
|
|
154
|
+
!deps.secretRegistry ||
|
|
155
|
+
!deps.secretResolverRefId ||
|
|
156
|
+
!deps.connectionStoreRefId
|
|
157
|
+
) {
|
|
158
|
+
return;
|
|
159
|
+
}
|
|
160
|
+
await reseedRunSecretRegistry({
|
|
161
|
+
// The WRAPPED getService is the registering one — feed it so re-resolved
|
|
162
|
+
// values land in the run's mask set.
|
|
163
|
+
getService: wrappedDeps.getService,
|
|
164
|
+
registry: deps.secretRegistry,
|
|
165
|
+
runId,
|
|
166
|
+
definition: automation.definition,
|
|
167
|
+
resolverRefId: deps.secretResolverRefId,
|
|
168
|
+
connectionStoreRefId: deps.connectionStoreRefId,
|
|
169
|
+
logger: deps.logger,
|
|
170
|
+
});
|
|
171
|
+
}
|
|
172
|
+
|
|
93
173
|
/** Name of the durable queue we use for crash-safe delays. */
|
|
94
174
|
export const DELAY_QUEUE_NAME = "automation-delay";
|
|
95
175
|
|
|
@@ -102,6 +182,27 @@ export interface DelayResumeJob {
|
|
|
102
182
|
waitLockId: string;
|
|
103
183
|
}
|
|
104
184
|
|
|
185
|
+
/**
|
|
186
|
+
* Name of the durable queue carrying a reactive `wait_until`'s single
|
|
187
|
+
* timeout timer (reactive automation engine §7, §13.1). A `wait_until` is
|
|
188
|
+
* now reactive: a relevant `ENTITY_CHANGED` wakes it (Stage 1 →
|
|
189
|
+
* `checkWaitUntil`). This queue is NOT a re-check loop — it holds at most
|
|
190
|
+
* one job per suspended wait, scheduled at the deadline, mirroring the
|
|
191
|
+
* dwell timer pattern. On fire the consumer applies the timeout policy
|
|
192
|
+
* (continue/fail) via `checkWaitUntil` (which also re-evaluates the
|
|
193
|
+
* condition one last time).
|
|
194
|
+
*/
|
|
195
|
+
export const WAIT_TIMEOUT_QUEUE_NAME = "automation-wait-timeout";
|
|
196
|
+
|
|
197
|
+
/**
|
|
198
|
+
* Job payload for a `wait_until` timeout timer. Carries the run + lock so
|
|
199
|
+
* the consumer can re-evaluate one final time and apply the timeout policy.
|
|
200
|
+
*/
|
|
201
|
+
export interface WaitTimeoutJob {
|
|
202
|
+
runId: string;
|
|
203
|
+
waitLockId: string;
|
|
204
|
+
}
|
|
205
|
+
|
|
105
206
|
// ─── Public entry points ──────────────────────────────────────────────────
|
|
106
207
|
|
|
107
208
|
export interface DispatchTriggerArgs {
|
|
@@ -142,7 +243,7 @@ export async function dispatchTrigger(
|
|
|
142
243
|
});
|
|
143
244
|
|
|
144
245
|
const ctx: DispatchContext = {
|
|
145
|
-
deps,
|
|
246
|
+
deps: withRunSecretCapture(deps, runId),
|
|
146
247
|
run: {
|
|
147
248
|
runId,
|
|
148
249
|
automation: args.automation,
|
|
@@ -162,6 +263,18 @@ export async function dispatchTrigger(
|
|
|
162
263
|
resuming: false,
|
|
163
264
|
};
|
|
164
265
|
|
|
266
|
+
// Pre-resolve live health state into scope before any condition or
|
|
267
|
+
// template evaluation (the engine is sync, so this is the only place
|
|
268
|
+
// live state can be fetched). Fail-open inside the helper.
|
|
269
|
+
await enrichScopeWithState({
|
|
270
|
+
scope: ctx.scope,
|
|
271
|
+
client: deps.healthCheckClient,
|
|
272
|
+
logger: deps.logger,
|
|
273
|
+
contextKey: args.contextKey,
|
|
274
|
+
usesState: args.automation.definition.uses_state,
|
|
275
|
+
transitionWindowMinutes: args.automation.definition.state_window_minutes,
|
|
276
|
+
});
|
|
277
|
+
|
|
165
278
|
// Initial scope snapshot — gives the stalled sweeper something to
|
|
166
279
|
// work with even if we crash before the first step finishes.
|
|
167
280
|
await deps.runStateStore.upsert({
|
|
@@ -215,11 +328,27 @@ export async function resumeRun(
|
|
|
215
328
|
const run = await deps.runStore.loadRun(args.runId);
|
|
216
329
|
if (!run) throw new Error(`Cannot resume — run ${args.runId} not found`);
|
|
217
330
|
|
|
331
|
+
// Only a `waiting` run may be resumed. A run that was cancelled (restart
|
|
332
|
+
// mode / operator cancel) or already reached a terminal state must NEVER
|
|
333
|
+
// be resurrected by a late wake (wakeWaitingRuns, delay-expiry sweep, a
|
|
334
|
+
// racing queue job). Drop any stale wait lock for the run and return —
|
|
335
|
+
// mirrors the guard `checkWaitUntil` already applies for `until` locks.
|
|
336
|
+
if (run.status !== "waiting") {
|
|
337
|
+
const stale = await deps.runStore.findWaitLocksByRun(args.runId);
|
|
338
|
+
for (const lock of stale) {
|
|
339
|
+
await deps.runStore.deleteWaitLock(lock.id);
|
|
340
|
+
}
|
|
341
|
+
deps.logger.debug(
|
|
342
|
+
`resumeRun: run ${args.runId} is "${run.status}", not "waiting"; dropped ${stale.length} stale wait lock(s) and skipped resume`,
|
|
343
|
+
);
|
|
344
|
+
return { status: run.status };
|
|
345
|
+
}
|
|
346
|
+
|
|
218
347
|
const waitedAt = parseActionPath(args.waitedAtPath);
|
|
219
348
|
|
|
220
349
|
// Try to acquire the advisory lock so two resumers don't race.
|
|
221
|
-
const
|
|
222
|
-
if (!
|
|
350
|
+
const lock = await deps.runStateStore.tryAdvisoryLock(args.runId);
|
|
351
|
+
if (!lock) {
|
|
223
352
|
deps.logger.debug(
|
|
224
353
|
`resumeRun: another instance already holds the lock for run ${args.runId}; skipping`,
|
|
225
354
|
);
|
|
@@ -240,11 +369,30 @@ export async function resumeRun(
|
|
|
240
369
|
scope.resume = { payload: args.payload };
|
|
241
370
|
}
|
|
242
371
|
|
|
372
|
+
// Re-resolve live state on resume: the system may have changed during
|
|
373
|
+
// the wait, so conditions after a wait must see current state, not
|
|
374
|
+
// the snapshot taken at suspension time.
|
|
375
|
+
await enrichScopeWithState({
|
|
376
|
+
scope,
|
|
377
|
+
client: deps.healthCheckClient,
|
|
378
|
+
logger: deps.logger,
|
|
379
|
+
contextKey: run.contextKey,
|
|
380
|
+
usesState: args.automation.definition.uses_state,
|
|
381
|
+
transitionWindowMinutes: args.automation.definition.state_window_minutes,
|
|
382
|
+
});
|
|
383
|
+
|
|
243
384
|
await deps.runStore.updateRunStatus(args.runId, "running");
|
|
244
385
|
await deps.runStateStore.heartbeat(args.runId);
|
|
245
386
|
|
|
387
|
+
const wrappedDeps = withRunSecretCapture(deps, args.runId);
|
|
388
|
+
// Cross-pod mask re-seed: this pod may not be the one that resolved the
|
|
389
|
+
// run's secrets, so re-populate its (empty) mask set from the declared
|
|
390
|
+
// refs BEFORE walking / persisting — otherwise carried scope / artifact
|
|
391
|
+
// values would persist unmasked here. See `reseed-run-secrets.ts`.
|
|
392
|
+
await reseedRunMaskSet(deps, wrappedDeps, args.runId, args.automation);
|
|
393
|
+
|
|
246
394
|
const ctx: DispatchContext = {
|
|
247
|
-
deps,
|
|
395
|
+
deps: wrappedDeps,
|
|
248
396
|
run: {
|
|
249
397
|
runId: args.runId,
|
|
250
398
|
automation: args.automation,
|
|
@@ -271,7 +419,7 @@ export async function resumeRun(
|
|
|
271
419
|
|
|
272
420
|
return await finaliseRun(ctx, outcome);
|
|
273
421
|
} finally {
|
|
274
|
-
await
|
|
422
|
+
await lock.release();
|
|
275
423
|
}
|
|
276
424
|
}
|
|
277
425
|
|
|
@@ -287,7 +435,23 @@ export async function recoverStalledRun(
|
|
|
287
435
|
): Promise<{ status: string }> {
|
|
288
436
|
const run = await deps.runStore.loadRun(args.runId);
|
|
289
437
|
if (!run) throw new Error(`recoverStalledRun: run ${args.runId} not found`);
|
|
290
|
-
if (run.status !== "running"
|
|
438
|
+
if (run.status !== "running") {
|
|
439
|
+
// Only genuinely-running runs are recoverable here. A `waiting` run is
|
|
440
|
+
// owned by the wait-lock / queue resume paths; recovering it would
|
|
441
|
+
// re-walk an intentional wait. (The sweeper now filters to `running`,
|
|
442
|
+
// but guard here too so a direct caller can't resurrect a wait.)
|
|
443
|
+
return { status: run.status };
|
|
444
|
+
}
|
|
445
|
+
|
|
446
|
+
// A live wait lock means this run is intentionally suspended (a wait the
|
|
447
|
+
// status update may not yet reflect, or a racing path). Refuse rather
|
|
448
|
+
// than from-top re-walk: re-running pre-wait actions has observable side
|
|
449
|
+
// effects. The wait-lock / queue resume paths own this run.
|
|
450
|
+
const existingLocks = await deps.runStore.findWaitLocksByRun(args.runId);
|
|
451
|
+
if (existingLocks.length > 0) {
|
|
452
|
+
deps.logger.debug(
|
|
453
|
+
`recoverStalledRun: run ${args.runId} holds ${existingLocks.length} live wait lock(s); leaving it to the wait/resume paths`,
|
|
454
|
+
);
|
|
291
455
|
return { status: run.status };
|
|
292
456
|
}
|
|
293
457
|
|
|
@@ -307,8 +471,14 @@ export async function recoverStalledRun(
|
|
|
307
471
|
await deps.runStore.updateRunStatus(args.runId, "running");
|
|
308
472
|
await deps.runStateStore.heartbeat(args.runId);
|
|
309
473
|
|
|
474
|
+
const wrappedDeps = withRunSecretCapture(deps, args.runId);
|
|
475
|
+
// Cross-pod mask re-seed (see `reseedRunMaskSet` in `resumeRun`): the
|
|
476
|
+
// sweeper pod recovering this stalled run did not resolve its secrets, so
|
|
477
|
+
// re-populate the mask set from the declared refs before re-walking.
|
|
478
|
+
await reseedRunMaskSet(deps, wrappedDeps, args.runId, args.automation);
|
|
479
|
+
|
|
310
480
|
const ctx: DispatchContext = {
|
|
311
|
-
deps,
|
|
481
|
+
deps: wrappedDeps,
|
|
312
482
|
run: {
|
|
313
483
|
runId: args.runId,
|
|
314
484
|
automation: args.automation,
|
|
@@ -364,6 +534,206 @@ export async function recoverStalledRun(
|
|
|
364
534
|
return await finaliseRun(ctx, outcome);
|
|
365
535
|
}
|
|
366
536
|
|
|
537
|
+
/**
|
|
538
|
+
* Outcome of a single `wait_until` re-check.
|
|
539
|
+
* - "resumed" → condition satisfied (or timed-out-continue); the run
|
|
540
|
+
* was resumed past the wait_until.
|
|
541
|
+
* - "failed" → timed out with continue_on_timeout=false; run failed.
|
|
542
|
+
* - "still-waiting"→ not yet true and not timed out; caller re-enqueues.
|
|
543
|
+
* - "gone" → lock/run/automation no longer valid; nothing to do.
|
|
544
|
+
*/
|
|
545
|
+
export type WaitUntilCheckOutcome =
|
|
546
|
+
| "resumed"
|
|
547
|
+
| "failed"
|
|
548
|
+
| "still-waiting"
|
|
549
|
+
| "gone";
|
|
550
|
+
|
|
551
|
+
/**
|
|
552
|
+
* Re-enrich a suspended `wait_until`'s scope before re-evaluation so the
|
|
553
|
+
* condition sees CURRENT state, not the value at suspension time. Two
|
|
554
|
+
* sources, kind-aware:
|
|
555
|
+
*
|
|
556
|
+
* 1. Health — resolved via the RPC `healthCheckClient`
|
|
557
|
+
* (`enrichScopeWithState`), since the health aggregate is computed on
|
|
558
|
+
* read and not stored as a framework entity row. Sets the rich
|
|
559
|
+
* `scope.health.*` condition snapshot.
|
|
560
|
+
* 2. Every OTHER `state.<kind>.<id>` ref the wait depends on — resolved
|
|
561
|
+
* kind-agnostically through the entity store
|
|
562
|
+
* (`enrichScopeWithEntities` + `deps.entityResolverFor`), folding into
|
|
563
|
+
* `scope.state.<kind>.<id>.<field>`. The refs are statically extracted
|
|
564
|
+
* from the condition (concrete ids only — wildcards carry no id) PLUS
|
|
565
|
+
* the concrete `changedRef` that woke this wait (so a wildcard wait on a
|
|
566
|
+
* dynamic id still resolves the entity that actually changed).
|
|
567
|
+
*/
|
|
568
|
+
async function reEnrichWaitScope(args: {
|
|
569
|
+
deps: DispatchDeps;
|
|
570
|
+
scope: Record<string, unknown>;
|
|
571
|
+
automation: LoadedAutomation;
|
|
572
|
+
contextKey: string | null;
|
|
573
|
+
condition: Condition;
|
|
574
|
+
changedRef?: string;
|
|
575
|
+
}): Promise<void> {
|
|
576
|
+
const { deps, scope, automation, contextKey, condition, changedRef } = args;
|
|
577
|
+
|
|
578
|
+
// Split the changed ref into its `${kind}:${id}` parts once — reused by
|
|
579
|
+
// both the health-resolution injection below and the entity-ref collection.
|
|
580
|
+
let changedKind: string | undefined;
|
|
581
|
+
let changedId: string | undefined;
|
|
582
|
+
if (changedRef) {
|
|
583
|
+
const colon = changedRef.indexOf(":");
|
|
584
|
+
if (colon > 0) {
|
|
585
|
+
changedKind = changedRef.slice(0, colon);
|
|
586
|
+
changedId = changedRef.slice(colon + 1);
|
|
587
|
+
}
|
|
588
|
+
}
|
|
589
|
+
|
|
590
|
+
// 1. Health: the rich condition snapshot, RPC-resolved. Sets scope.health.*.
|
|
591
|
+
// A WILDCARD health wait (`health:*`) is woken by a concrete `health:sysX`
|
|
592
|
+
// whose id may be NEITHER the contextKey NOR in `uses_state`. The health
|
|
593
|
+
// aggregate is computed-on-read and is only resolved here for the systems
|
|
594
|
+
// we pass in, so without the changed id the wait re-evaluates against an
|
|
595
|
+
// empty `scope.health.systems[sysX]` and never resumes. Inject the changed
|
|
596
|
+
// system's concrete id so a wildcard wake always resolves the system that
|
|
597
|
+
// actually changed (deduped inside `enrichScopeWithState`).
|
|
598
|
+
const usesState =
|
|
599
|
+
changedKind === HEALTH_ENTITY_KIND && changedId && changedId !== "*"
|
|
600
|
+
? [...(automation.definition.uses_state ?? []), changedId]
|
|
601
|
+
: automation.definition.uses_state;
|
|
602
|
+
await enrichScopeWithState({
|
|
603
|
+
scope,
|
|
604
|
+
client: deps.healthCheckClient,
|
|
605
|
+
logger: deps.logger,
|
|
606
|
+
contextKey,
|
|
607
|
+
usesState,
|
|
608
|
+
transitionWindowMinutes: automation.definition.state_window_minutes,
|
|
609
|
+
});
|
|
610
|
+
|
|
611
|
+
// 2. Kind-agnostic entity refs (entity-store-resolved). Collect the
|
|
612
|
+
// concrete refs the condition reads plus the changed ref, drop the
|
|
613
|
+
// health kind (already resolved above via the rich RPC path — excluding
|
|
614
|
+
// it here keeps health resolved exactly once per re-enrichment) and any
|
|
615
|
+
// wildcard (no concrete id).
|
|
616
|
+
const refs: EntityRef[] = [];
|
|
617
|
+
const seen = new Set<string>();
|
|
618
|
+
const addRef = (kind: string, id: string) => {
|
|
619
|
+
if (kind === HEALTH_ENTITY_KIND || id === "*" || id.length === 0) return;
|
|
620
|
+
const key = `${kind}:${id}`;
|
|
621
|
+
if (seen.has(key)) return;
|
|
622
|
+
seen.add(key);
|
|
623
|
+
refs.push({ kind, id });
|
|
624
|
+
};
|
|
625
|
+
for (const ref of extractWakeRefs(condition).refs) addRef(ref.kind, ref.id);
|
|
626
|
+
if (changedKind && changedId) addRef(changedKind, changedId);
|
|
627
|
+
if (refs.length === 0) return;
|
|
628
|
+
|
|
629
|
+
await enrichScopeWithEntities({
|
|
630
|
+
scope,
|
|
631
|
+
logger: deps.logger,
|
|
632
|
+
refs,
|
|
633
|
+
resolverFor: (kind) => deps.entityResolverFor?.(kind),
|
|
634
|
+
});
|
|
635
|
+
}
|
|
636
|
+
|
|
637
|
+
/**
|
|
638
|
+
* Re-check a suspended `wait_until`: re-enrich scope, evaluate the
|
|
639
|
+
* condition, and either resume the run (satisfied or timeout-continue),
|
|
640
|
+
* fail it (timeout-fail), or report "still waiting" so the caller
|
|
641
|
+
* re-schedules another check.
|
|
642
|
+
*
|
|
643
|
+
* Read-only until it acts; `resumeRun` takes the per-run advisory lock so
|
|
644
|
+
* a concurrent re-check / sweep can't double-resume. Idempotent: the lock
|
|
645
|
+
* is deleted before resuming, so a duplicate check finds nothing.
|
|
646
|
+
*/
|
|
647
|
+
export async function checkWaitUntil(
|
|
648
|
+
deps: DispatchDeps,
|
|
649
|
+
args: {
|
|
650
|
+
runId: string;
|
|
651
|
+
waitLockId: string;
|
|
652
|
+
automation: LoadedAutomation;
|
|
653
|
+
/**
|
|
654
|
+
* The `${kind}:${id}` ref of the change that woke this wait (Stage-2
|
|
655
|
+
* `wake` job). Included in the re-enrichment so the changed entity is
|
|
656
|
+
* always resolved into scope — essential for a wildcard wait whose
|
|
657
|
+
* condition reads a dynamic id (the ref isn't statically extractable).
|
|
658
|
+
*/
|
|
659
|
+
changedRef?: string;
|
|
660
|
+
},
|
|
661
|
+
): Promise<WaitUntilCheckOutcome> {
|
|
662
|
+
const lock = await deps.runStore.loadWaitLock(args.waitLockId);
|
|
663
|
+
if (!lock || lock.kind !== "until" || !lock.waitConfig) return "gone";
|
|
664
|
+
|
|
665
|
+
const run = await deps.runStore.loadRun(args.runId);
|
|
666
|
+
if (!run) {
|
|
667
|
+
await deps.runStore.deleteWaitLock(args.waitLockId);
|
|
668
|
+
return "gone";
|
|
669
|
+
}
|
|
670
|
+
if (run.status !== "waiting") {
|
|
671
|
+
// Already resumed / cancelled / terminal — drop the stale lock.
|
|
672
|
+
await deps.runStore.deleteWaitLock(args.waitLockId);
|
|
673
|
+
return "gone";
|
|
674
|
+
}
|
|
675
|
+
|
|
676
|
+
// Rebuild the scope from the snapshot + re-enrich live state so the
|
|
677
|
+
// condition sees CURRENT state, not the value at suspension time.
|
|
678
|
+
const persisted = await deps.runStateStore.load(args.runId);
|
|
679
|
+
const scope = persisted?.scopeSnapshot
|
|
680
|
+
? { ...persisted.scopeSnapshot }
|
|
681
|
+
: buildInitialScope({
|
|
682
|
+
triggerId: run.triggerId,
|
|
683
|
+
triggerEventId: run.triggerEventId,
|
|
684
|
+
payload: run.triggerPayload,
|
|
685
|
+
startedAt: run.startedAt,
|
|
686
|
+
});
|
|
687
|
+
await reEnrichWaitScope({
|
|
688
|
+
deps,
|
|
689
|
+
scope,
|
|
690
|
+
automation: args.automation,
|
|
691
|
+
contextKey: run.contextKey,
|
|
692
|
+
condition: lock.waitConfig.condition,
|
|
693
|
+
changedRef: args.changedRef,
|
|
694
|
+
});
|
|
695
|
+
|
|
696
|
+
let satisfied = false;
|
|
697
|
+
try {
|
|
698
|
+
satisfied = evaluateCondition(
|
|
699
|
+
lock.waitConfig.condition,
|
|
700
|
+
scope as TemplateContext,
|
|
701
|
+
deps.filters,
|
|
702
|
+
);
|
|
703
|
+
} catch (error) {
|
|
704
|
+
deps.logger.warn(
|
|
705
|
+
`wait_until re-check threw (treating as not-yet): ${(error as Error).message}`,
|
|
706
|
+
);
|
|
707
|
+
}
|
|
708
|
+
|
|
709
|
+
const timedOut =
|
|
710
|
+
lock.timeoutAt !== null && lock.timeoutAt.getTime() <= Date.now();
|
|
711
|
+
|
|
712
|
+
if (satisfied || (timedOut && lock.waitConfig.continueOnTimeout)) {
|
|
713
|
+
await deps.runStore.deleteWaitLock(args.waitLockId);
|
|
714
|
+
await resumeRun(deps, {
|
|
715
|
+
runId: args.runId,
|
|
716
|
+
automation: args.automation,
|
|
717
|
+
waitedAtPath: lock.actionPath,
|
|
718
|
+
});
|
|
719
|
+
return "resumed";
|
|
720
|
+
}
|
|
721
|
+
|
|
722
|
+
if (timedOut) {
|
|
723
|
+
// continue_on_timeout = false → fail the run.
|
|
724
|
+
await deps.runStore.deleteWaitLock(args.waitLockId);
|
|
725
|
+
await deps.runStore.updateRunStatus(
|
|
726
|
+
args.runId,
|
|
727
|
+
"failed",
|
|
728
|
+
`wait_until timed out after waiting for its condition`,
|
|
729
|
+
);
|
|
730
|
+
await deps.runStateStore.clear(args.runId);
|
|
731
|
+
return "failed";
|
|
732
|
+
}
|
|
733
|
+
|
|
734
|
+
return "still-waiting";
|
|
735
|
+
}
|
|
736
|
+
|
|
367
737
|
// ─── Run finalisation ─────────────────────────────────────────────────────
|
|
368
738
|
|
|
369
739
|
async function finaliseRun(
|
|
@@ -393,11 +763,14 @@ async function finaliseRun(
|
|
|
393
763
|
errorMessage,
|
|
394
764
|
);
|
|
395
765
|
// Terminal runs drop their durable state. Suspended runs keep it so
|
|
396
|
-
// resumption has the scope to work with
|
|
766
|
+
// resumption has the scope to work with — but we must NOT clobber
|
|
767
|
+
// `lastActionPath`: the suspending action already checkpointed its real
|
|
768
|
+
// path, and a crash recovery needs that to resume from the wait rather
|
|
769
|
+
// than re-walking from actions[0] (which would re-fire pre-wait side
|
|
770
|
+
// effects). Omit it so the existing checkpoint survives.
|
|
397
771
|
await (status === "waiting" ? ctx.deps.runStateStore.upsert({
|
|
398
772
|
runId: ctx.run.runId,
|
|
399
773
|
scopeSnapshot: ctx.scope,
|
|
400
|
-
lastActionPath: null,
|
|
401
774
|
}) : ctx.deps.runStateStore.clear(ctx.run.runId));
|
|
402
775
|
return { runId: ctx.run.runId, status };
|
|
403
776
|
}
|
|
@@ -578,6 +951,9 @@ async function executeAction(
|
|
|
578
951
|
ctx,
|
|
579
952
|
);
|
|
580
953
|
}
|
|
954
|
+
case "wait_until": {
|
|
955
|
+
return await executeWaitUntil(action as WaitUntilInput, path, ctx);
|
|
956
|
+
}
|
|
581
957
|
case "sequence": {
|
|
582
958
|
return await executeSequence(
|
|
583
959
|
action as SequenceInput,
|
|
@@ -1596,6 +1972,176 @@ async function executeWaitForTrigger(
|
|
|
1596
1972
|
return { kind: "suspended", stepId };
|
|
1597
1973
|
}
|
|
1598
1974
|
|
|
1975
|
+
// ─── Primitive: `wait_until` ─────────────────────────────────────────────
|
|
1976
|
+
|
|
1977
|
+
/**
 * Suspend the run until a condition becomes true, with an optional
 * timeout. Unlike `wait_for_trigger` (wait for a named event), `wait_until`
 * is REACTIVE (reactive automation engine §7): the engine statically
 * extracts the `state.*` refs the condition reads (§8.3), persists a
 * `kind: "until"` wait lock plus one wake-index row per ref (§8.1), and
 * suspends with NO active job and NO polling. A relevant `ENTITY_CHANGED`
 * wakes it (Stage 1 → `checkWaitUntil` re-evaluates the full condition and
 * resumes if it now holds).
 *
 * Fast path: if the condition is ALREADY true against the current
 * (enriched) scope, continue inline without suspending.
 *
 * Timeout: a single durable timer job at `timeoutAt` (NOT a re-check loop)
 * applies the continue/fail policy. When ref extraction is wholly
 * indeterminate (no concrete-or-wildcard ref) AND there is no timeout, the
 * wait could never wake — we log at `warn` so it is never silent (§8.3).
 *
 * @param action - The `wait_until` action; `action.wait_until` supplies
 *   `condition`, `timeout_seconds` and `continue_on_timeout`.
 * @param path - Position of this action in the automation; its formatted
 *   form is stored on the step and the wait lock.
 * @param ctx - Dispatch context providing the run, scope and dependencies.
 * @returns `{ kind: "ok" }` when the condition already holds (fast path or
 *   arm-window re-check), otherwise `{ kind: "suspended", stepId }` once the
 *   wait lock is armed.
 */
async function executeWaitUntil(
  action: WaitUntilInput,
  path: ActionPath,
  ctx: DispatchContext,
): Promise<StepOutcome> {
  // Formatted once and reused for the step row, the wait lock, the synthetic
  // event id and the diagnostics below.
  const actionPath = formatActionPath(path);

  // Caught values are `unknown`: narrow before reading `.message` so a
  // non-Error throw is still logged meaningfully instead of as "undefined".
  const describeError = (error: unknown): string =>
    error instanceof Error ? error.message : String(error);

  const stepId = await ctx.deps.runStore.createStep({
    runId: ctx.run.runId,
    actionPath,
    actionId: action.id ?? null,
    actionKind: "wait_until",
    providerActionId: null,
  });

  const cfg = action.wait_until;

  // Fast path — already satisfied. Evaluate against the current scope
  // (enriched at run start / resume). Errors are treated as "not yet".
  let satisfied = false;
  try {
    satisfied = evaluateCondition(
      cfg.condition,
      templateContext(ctx),
      ctx.deps.filters,
    );
  } catch (error) {
    ctx.deps.logger.debug(
      `wait_until initial eval threw (treating as not-yet): ${describeError(error)}`,
    );
  }
  if (satisfied) {
    await ctx.deps.runStore.updateStep(stepId, {
      status: "success",
      resultPayload: { satisfied: true, immediate: true },
    });
    return { kind: "ok" };
  }

  const continueOnTimeout = cfg.continue_on_timeout ?? true;
  // A falsy `timeout_seconds` (absent or 0) means "no deadline": the wait is
  // then purely event-driven and no timer job is enqueued below.
  const timeoutAt = cfg.timeout_seconds
    ? new Date(Date.now() + cfg.timeout_seconds * 1000)
    : null;

  // Static reference extraction → wake-index dependency refs (§8.3).
  const extracted = extractWakeRefs(cfg.condition);
  const wakeRefs = extracted.refs.map((ref) => refToString(ref));

  if (extracted.indeterminate && wakeRefs.length === 0) {
    // The condition reads live state but no concrete-or-wildcard ref could
    // be derived: the wait can only ever be released by the timeout timer.
    // Never silent (§8.3, §12).
    if (timeoutAt) {
      ctx.deps.logger.warn(
        `wait_until at ${actionPath} (run ${ctx.run.runId}): could not extract any state ref from the condition; relying on the timeout timer only — it will not wake on state changes.`,
      );
    } else {
      ctx.deps.logger.warn(
        `wait_until at ${actionPath} (run ${ctx.run.runId}): could not extract any state ref AND no timeout is set; this wait will never wake. Add a timeout or a concrete state.* reference.`,
      );
    }
  }

  const waitLockId = await ctx.deps.runStore.createWaitLockWithWakeRefs({
    runId: ctx.run.runId,
    actionPath,
    // Synthetic marker — reactive `until` locks aren't woken by named events.
    eventId: `@@until:${ctx.run.runId}:${actionPath}`,
    contextKey: ctx.run.contextKey,
    timeoutAt,
    waitConfig: {
      condition: cfg.condition,
      continueOnTimeout,
    },
    wakeRefs,
  });

  // Persist scope before suspending so the wake re-check rebuilds it.
  await checkpoint(ctx, path);

  // Re-evaluate-on-registration guard (reactive automation engine §17).
  // The condition was checked above (fast path), THEN the wait lock + its
  // wake-index rows were committed. A relevant `ENTITY_CHANGED` landing in
  // that arm window is routed by Stage 1 against the just-now-visible lock,
  // but if the change committed BEFORE our wake rows were visible, Stage 1
  // found no lock and enqueued no wake job — a lost wakeup. For a no-timeout
  // wait nothing would ever re-check it (the sweeper filters `isNotNull
  // (timeoutAt)`), so the run would stall permanently. Guard against this by
  // re-evaluating ONCE against freshly re-enriched scope now that the lock is
  // armed: any change that landed during the window is now observable. If the
  // condition already holds, drop the lock (its wake-index rows cascade) and
  // continue the current walk inline. Idempotent: the lock delete + the
  // per-run advisory lock taken by any concurrent wake/resume path serialise
  // this with a racing Stage-2 wake (whichever deletes the lock first wins;
  // the loser sees `gone`).
  let armedSatisfied = false;
  try {
    await reEnrichWaitScope({
      deps: ctx.deps,
      scope: ctx.scope,
      automation: ctx.run.automation,
      contextKey: ctx.run.contextKey,
      condition: cfg.condition,
    });
    armedSatisfied = evaluateCondition(
      cfg.condition,
      templateContext(ctx),
      ctx.deps.filters,
    );
  } catch (error) {
    // Covers both a failed re-enrichment and a throwing evaluation; either
    // way the wait stays armed.
    ctx.deps.logger.debug(
      `wait_until arm-window re-eval threw (treating as not-yet): ${describeError(error)}`,
    );
  }
  if (armedSatisfied) {
    await ctx.deps.runStore.deleteWaitLock(waitLockId);
    await ctx.deps.runStore.updateStep(stepId, {
      status: "success",
      resultPayload: { satisfied: true, armWindow: true },
    });
    return { kind: "ok" };
  }

  // Single durable timeout timer (NOT a poll loop). Only armed when a
  // deadline exists; otherwise the wait is purely event-driven.
  if (timeoutAt) {
    const queue = ctx.deps.queueManager.getQueue<WaitTimeoutJob>(
      WAIT_TIMEOUT_QUEUE_NAME,
    );
    await queue.enqueue(
      { runId: ctx.run.runId, waitLockId },
      {
        // Remaining whole seconds until the deadline, rounded up and clamped
        // at zero in case the deadline passed while the lock was being armed.
        startDelay: Math.max(
          Math.ceil((timeoutAt.getTime() - Date.now()) / 1000),
          0,
        ),
        jobId: `${ctx.run.runId}:${waitLockId}:timeout`,
      },
    );
  }

  await ctx.deps.runStore.updateStep(stepId, {
    status: "waiting",
    resultPayload: {
      waitLockId,
      wakeRefs,
      timeoutAt: timeoutAt?.toISOString(),
    },
  });
  return { kind: "suspended", stepId };
}
|
|
2144
|
+
|
|
1599
2145
|
// ─── Helpers ─────────────────────────────────────────────────────────────
|
|
1600
2146
|
|
|
1601
2147
|
// ─── Primitive: `sequence` ───────────────────────────────────────────────
|