@checkstack/automation-backend 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (125) hide show
  1. package/CHANGELOG.md +544 -0
  2. package/drizzle/0003_sparkling_xorn.sql +17 -0
  3. package/drizzle/0004_cultured_spyke.sql +2 -0
  4. package/drizzle/0005_classy_the_hand.sql +19 -0
  5. package/drizzle/0006_burly_wallop.sql +10 -0
  6. package/drizzle/0007_nappy_jackal.sql +1 -0
  7. package/drizzle/0008_remove_seeded_auto_incident_automations.sql +13 -0
  8. package/drizzle/0009_steady_liz_osborn.sql +12 -0
  9. package/drizzle/0010_chunky_changeling.sql +2 -0
  10. package/drizzle/meta/0003_snapshot.json +1007 -0
  11. package/drizzle/meta/0004_snapshot.json +1028 -0
  12. package/drizzle/meta/0005_snapshot.json +1164 -0
  13. package/drizzle/meta/0006_snapshot.json +1261 -0
  14. package/drizzle/meta/0007_snapshot.json +1215 -0
  15. package/drizzle/meta/0008_snapshot.json +1215 -0
  16. package/drizzle/meta/0009_snapshot.json +1328 -0
  17. package/drizzle/meta/0010_snapshot.json +1349 -0
  18. package/drizzle/meta/_journal.json +56 -0
  19. package/package.json +23 -12
  20. package/src/action-types.ts +23 -0
  21. package/src/artifact-store.ts +16 -1
  22. package/src/automation-store.test.ts +143 -0
  23. package/src/automation-store.ts +30 -8
  24. package/src/builtin-triggers.test.ts +77 -74
  25. package/src/builtin-triggers.ts +105 -108
  26. package/src/dispatch/action-kind.ts +2 -0
  27. package/src/dispatch/assemble-get-service.ts +31 -0
  28. package/src/dispatch/cancel-resurrect.test.ts +147 -0
  29. package/src/dispatch/concurrency-race.test.ts +255 -0
  30. package/src/dispatch/concurrency-scope.test.ts +166 -0
  31. package/src/dispatch/condition.ts +24 -5
  32. package/src/dispatch/dwell-queue.ts +65 -0
  33. package/src/dispatch/dwell-store.ts +154 -0
  34. package/src/dispatch/dwell.it.test.ts +142 -0
  35. package/src/dispatch/dwell.test.ts +799 -0
  36. package/src/dispatch/dwell.ts +257 -0
  37. package/src/dispatch/engine.test.ts +189 -2
  38. package/src/dispatch/engine.ts +555 -9
  39. package/src/dispatch/entity-scope.test.ts +176 -0
  40. package/src/dispatch/get-service-wiring.test.ts +318 -0
  41. package/src/dispatch/numeric.test.ts +71 -0
  42. package/src/dispatch/numeric.ts +96 -0
  43. package/src/dispatch/render.test.ts +34 -0
  44. package/src/dispatch/render.ts +31 -11
  45. package/src/dispatch/reseed-run-secrets.ts +230 -0
  46. package/src/dispatch/run-secret-registry.test.ts +189 -0
  47. package/src/dispatch/run-secret-registry.ts +247 -0
  48. package/src/dispatch/run-state-masking.test.ts +376 -0
  49. package/src/dispatch/run-state-store.ts +95 -38
  50. package/src/dispatch/run-state.ts +226 -59
  51. package/src/dispatch/scope-artifact-masking.test.ts +138 -0
  52. package/src/dispatch/secret-ref-ids.test.ts +19 -0
  53. package/src/dispatch/secret-ref-ids.ts +17 -0
  54. package/src/dispatch/snapshots.test.ts +86 -0
  55. package/src/dispatch/snapshots.ts +79 -0
  56. package/src/dispatch/stage1-router.test.ts +324 -0
  57. package/src/dispatch/stage1-router.ts +152 -0
  58. package/src/dispatch/stage1.it.test.ts +84 -0
  59. package/src/dispatch/stage2-dispatch.test.ts +285 -0
  60. package/src/dispatch/stage2-dispatch.ts +207 -0
  61. package/src/dispatch/stage2-stalled.it.test.ts +132 -0
  62. package/src/dispatch/stalled-sweeper.test.ts +197 -0
  63. package/src/dispatch/stalled-sweeper.ts +112 -5
  64. package/src/dispatch/state-scope.test.ts +234 -0
  65. package/src/dispatch/state-scope.ts +322 -0
  66. package/src/dispatch/structured-conditions.test.ts +246 -0
  67. package/src/dispatch/structured-conditions.ts +146 -0
  68. package/src/dispatch/test-fixtures.ts +306 -38
  69. package/src/dispatch/trigger-fanin.test.ts +111 -0
  70. package/src/dispatch/trigger-subscriber.ts +316 -14
  71. package/src/dispatch/types.ts +263 -8
  72. package/src/dispatch/wait-timeout-queue.ts +89 -0
  73. package/src/dispatch/wait-until-entity-wake.test.ts +544 -0
  74. package/src/dispatch/wait-until.test.ts +540 -0
  75. package/src/dispatch/wake-refs.test.ts +158 -0
  76. package/src/dispatch/wake-refs.ts +348 -0
  77. package/src/dispatch/window-gate.test.ts +513 -0
  78. package/src/dispatch/window-store.test.ts +162 -0
  79. package/src/dispatch/window-store.ts +102 -0
  80. package/src/entity/change-derivers.test.ts +148 -0
  81. package/src/entity/change-derivers.ts +143 -0
  82. package/src/entity/change-emitter.test.ts +66 -0
  83. package/src/entity/change-emitter.ts +76 -0
  84. package/src/entity/create-handle.ts +344 -0
  85. package/src/entity/cross-pod-read-consistency.it.test.ts +281 -0
  86. package/src/entity/define-entity.ts +157 -0
  87. package/src/entity/diff.test.ts +57 -0
  88. package/src/entity/diff.ts +54 -0
  89. package/src/entity/entity-store.test.ts +30 -0
  90. package/src/entity/entity-store.ts +171 -0
  91. package/src/entity/extension-point.ts +56 -0
  92. package/src/entity/fake-entity-store.ts +130 -0
  93. package/src/entity/hook.ts +19 -0
  94. package/src/entity/index.ts +50 -0
  95. package/src/entity/mutate-handle.test.ts +517 -0
  96. package/src/entity/on-entity-changed.test.ts +189 -0
  97. package/src/entity/on-entity-changed.ts +214 -0
  98. package/src/entity/registry.test.ts +181 -0
  99. package/src/entity/registry.ts +200 -0
  100. package/src/entity/stable-stringify.test.ts +55 -0
  101. package/src/entity/stable-stringify.ts +49 -0
  102. package/src/entity/wake-index.it.test.ts +251 -0
  103. package/src/entity/with-entity-write.test.ts +100 -0
  104. package/src/entity/with-entity-write.ts +69 -0
  105. package/src/entity-driven-trigger.ts +46 -0
  106. package/src/extension-points.ts +35 -0
  107. package/src/gitops-docs.test.ts +215 -0
  108. package/src/gitops-docs.ts +151 -0
  109. package/src/gitops-kinds.test.ts +174 -0
  110. package/src/gitops-kinds.ts +137 -0
  111. package/src/index.ts +355 -11
  112. package/src/migration/flapping-to-window.test.ts +123 -0
  113. package/src/migration/flapping-to-window.ts +205 -0
  114. package/src/router.test.ts +182 -1
  115. package/src/router.ts +73 -2
  116. package/src/schema.ts +236 -3
  117. package/src/script-test-replay.test.ts +88 -0
  118. package/src/script-test-replay.ts +100 -0
  119. package/src/script-test-shell-env.test.ts +41 -0
  120. package/src/script-test-shell-env.ts +89 -0
  121. package/src/script-test.test.ts +386 -0
  122. package/src/script-test.ts +258 -0
  123. package/src/trigger-registry.ts +2 -0
  124. package/src/validate-definition.test.ts +1 -0
  125. package/tsconfig.json +24 -0
@@ -11,10 +11,15 @@
11
11
  * at a time. The lock auto-releases when the holding connection dies —
12
12
  * exactly what we want during crash recovery.
13
13
  */
14
- import { lt, eq, sql } from "drizzle-orm";
15
- import type { SafeDatabase } from "@checkstack/backend-api";
14
+ import { and, eq, lt } from "drizzle-orm";
15
+ import type {
16
+ AdvisoryLockHandle,
17
+ AdvisoryLockService,
18
+ SafeDatabase,
19
+ } from "@checkstack/backend-api";
16
20
 
17
- import { automationRunState } from "../schema";
21
+ import { automationRunState, automationRuns } from "../schema";
22
+ import type { RunSecretRegistry } from "./run-secret-registry";
18
23
 
19
24
  export interface RunStateSnapshot {
20
25
  scopeSnapshot: Record<string, unknown>;
@@ -27,11 +32,18 @@ export interface RunStateStore {
27
32
  * Write or update the per-run durable state. `lastActionPath` is the
28
33
  * path of the most recently completed action — resume walks the tree
29
34
  * looking for this path and treats the action at it as already done.
35
+ *
36
+ * Omitting `lastActionPath` (vs. passing `null`) on an UPDATE preserves
37
+ * the existing checkpoint. This matters at suspend-finalisation: the
38
+ * checkpoint written by the suspending action (its real path) must
39
+ * survive so a crash-recovery resumes from it rather than re-walking
40
+ * from `actions[0]`. Passing `null` explicitly still clobbers it (used
41
+ * only for the initial pre-first-step snapshot).
30
42
  */
31
43
  upsert(input: {
32
44
  runId: string;
33
45
  scopeSnapshot: Record<string, unknown>;
34
- lastActionPath: string | null;
46
+ lastActionPath?: string | null;
35
47
  }): Promise<void>;
36
48
 
37
49
  load(runId: string): Promise<RunStateSnapshot | undefined>;
@@ -43,46 +55,89 @@ export interface RunStateStore {
43
55
  heartbeat(runId: string): Promise<void>;
44
56
 
45
57
  /**
46
- * Run ids whose heartbeat is older than `threshold`. Returned in
47
- * heartbeat-ascending order so the sweeper processes the most
48
- * stale first.
58
+ * Run ids of `status = 'running'` runs whose heartbeat is older than
59
+ * `threshold`. Returned in heartbeat-ascending order so the sweeper
60
+ * processes the most stale first.
61
+ *
62
+ * The status filter is load-bearing: `waiting` runs (suspended on a
63
+ * `delay` / `wait_for_trigger` / `wait_until`) keep their state row but
64
+ * are NOT stalled - they are owned by the wait-lock / queue resume
65
+ * paths. Returning them here would let the sweeper re-walk an
66
+ * intentional wait every cycle, re-firing pre-wait side effects and
67
+ * leaking wait locks. Only a `running` run whose heartbeat went cold is
68
+ * a genuine crash.
49
69
  */
50
70
  findStalledRunIds(threshold: Date): Promise<string[]>;
51
71
 
52
72
  /**
53
- * Try to acquire a Postgres session-level advisory lock for the run.
54
- * Returns true on acquisition. The lock auto-releases when the holding
55
- * DB session closes (e.g. on process crash), so dead instances don't
73
+ * Try to acquire a Postgres session-level advisory lock for the run on a
74
+ * dedicated pooled client. Returns a handle on acquisition (release it in
75
+ * a `finally`), or `null` if another instance already holds it.
76
+ *
77
+ * A dedicated client is required because the lock is held across the whole
78
+ * resume (which executes the run's actions — potentially long and
79
+ * involving external calls), so a transaction-scoped lock would mean a
80
+ * minutes-long open transaction. The session lock auto-releases when the
81
+ * holding connection dies (e.g. on process crash), so dead instances don't
56
82
  * leak locks.
57
83
  */
58
- tryAdvisoryLock(runId: string): Promise<boolean>;
59
-
60
- /** Release a previously-acquired advisory lock. */
61
- releaseAdvisoryLock(runId: string): Promise<void>;
84
+ tryAdvisoryLock(runId: string): Promise<AdvisoryLockHandle | null>;
62
85
  }
63
86
 
64
- type Schema = { automationRunState: typeof automationRunState };
87
+ type Schema = {
88
+ automationRunState: typeof automationRunState;
89
+ automationRuns: typeof automationRuns;
90
+ };
91
+
92
+ /** Namespace run locks in the global advisory-lock space. */
93
+ function runLockKey(runId: string): string {
94
+ return `automation.run:${runId}`;
95
+ }
65
96
 
66
97
  export function createRunStateStore(
67
98
  db: SafeDatabase<Schema>,
99
+ advisoryLock: AdvisoryLockService,
100
+ /**
101
+ * Run-scoped secret values accumulated during dispatch. When provided,
102
+ * the persisted `scopeSnapshot` is masked (Jenkins-style, by-value)
103
+ * BEFORE write — so a resolved connection credential threaded into
104
+ * `scope.variables` / `scope.artifacts` can't reach a replay reader
105
+ * (`getRunScopeForReplay`) unmasked. The registry is in-memory and gone
106
+ * by replay time, so persist-time is the only place masking can happen.
107
+ * Optional so tests / older boots degrade to no masking.
108
+ */
109
+ secretRegistry?: RunSecretRegistry,
68
110
  ): RunStateStore {
69
111
  return {
70
112
  async upsert(input) {
113
+ // Mask the scope snapshot at the persistence choke point — same
114
+ // pattern the run store uses for step / run output.
115
+ const maskedScope = (secretRegistry?.maskDeep(
116
+ input.runId,
117
+ input.scopeSnapshot,
118
+ ) ?? input.scopeSnapshot) as Record<string, unknown>;
119
+ // Omitting `lastActionPath` preserves the existing checkpoint on an
120
+ // UPDATE (so a suspend-finalisation doesn't clobber the suspending
121
+ // action's path to null). The INSERT still needs a value, so a fresh
122
+ // row defaults to null.
123
+ const updateSet: Record<string, unknown> = {
124
+ scopeSnapshot: maskedScope,
125
+ lastHeartbeatAt: new Date(),
126
+ updatedAt: new Date(),
127
+ };
128
+ if (input.lastActionPath !== undefined) {
129
+ updateSet.lastActionPath = input.lastActionPath;
130
+ }
71
131
  await db
72
132
  .insert(automationRunState)
73
133
  .values({
74
134
  runId: input.runId,
75
- scopeSnapshot: input.scopeSnapshot,
76
- lastActionPath: input.lastActionPath,
135
+ scopeSnapshot: maskedScope,
136
+ lastActionPath: input.lastActionPath ?? null,
77
137
  })
78
138
  .onConflictDoUpdate({
79
139
  target: automationRunState.runId,
80
- set: {
81
- scopeSnapshot: input.scopeSnapshot,
82
- lastActionPath: input.lastActionPath,
83
- lastHeartbeatAt: new Date(),
84
- updatedAt: new Date(),
85
- },
140
+ set: updateSet,
86
141
  });
87
142
  },
88
143
 
@@ -115,29 +170,31 @@ export function createRunStateStore(
115
170
  },
116
171
 
117
172
  async findStalledRunIds(threshold) {
173
+ // Join the run row so we only return runs that are actually
174
+ // `running`. A `waiting` run keeps its state snapshot but must NOT
175
+ // be re-walked by the sweeper - it is owned by the wait-lock /
176
+ // queue resume paths.
118
177
  const rows = await db
119
178
  .select({ runId: automationRunState.runId })
120
179
  .from(automationRunState)
121
- .where(lt(automationRunState.lastHeartbeatAt, threshold))
180
+ .innerJoin(
181
+ automationRuns,
182
+ eq(automationRuns.id, automationRunState.runId),
183
+ )
184
+ .where(
185
+ and(
186
+ lt(automationRunState.lastHeartbeatAt, threshold),
187
+ eq(automationRuns.status, "running"),
188
+ ),
189
+ )
122
190
  .orderBy(automationRunState.lastHeartbeatAt);
123
191
  return rows.map((r) => r.runId);
124
192
  },
125
193
 
126
194
  async tryAdvisoryLock(runId) {
127
- // hashtextextended returns int8 in Postgres, which pg_try_advisory_lock
128
- // accepts directly. Using a deterministic hash means the same runId
129
- // always maps to the same lock key across processes.
130
- const result = await db.execute<{ ok: boolean }>(sql`
131
- SELECT pg_try_advisory_lock(hashtextextended(${runId}, 0)) AS ok
132
- `);
133
- const rows = result as unknown as { rows: Array<{ ok: boolean }> };
134
- return Boolean(rows.rows?.[0]?.ok);
135
- },
136
-
137
- async releaseAdvisoryLock(runId) {
138
- await db.execute(sql`
139
- SELECT pg_advisory_unlock(hashtextextended(${runId}, 0))
140
- `);
195
+ // Acquire on a dedicated client (see interface doc) — the lock is held
196
+ // for the whole resume, so it must not ride a long-open transaction.
197
+ return advisoryLock.tryAcquire(runLockKey(runId));
141
198
  },
142
199
  };
143
200
  }
@@ -8,32 +8,83 @@
8
8
  * trigger subscriber).
9
9
  */
10
10
  import { and, desc, eq, inArray, isNotNull, isNull, lte, sql } from "drizzle-orm";
11
- import type { SafeDatabase } from "@checkstack/backend-api";
11
+ import type { Logger, SafeDatabase } from "@checkstack/backend-api";
12
12
 
13
13
  import {
14
+ automationRunState,
14
15
  automationRunSteps,
15
16
  automationRuns,
16
17
  automationWaitLocks,
18
+ automationWakeIndex,
17
19
  } from "../schema";
18
20
  import type {
19
21
  CreateRunInput,
20
22
  CreateStepInput,
21
23
  CreateWaitLockInput,
24
+ CreateWaitLockWithRefsInput,
22
25
  LoadedRun,
23
26
  LoadedStep,
24
27
  LoadedWaitLock,
25
28
  RunStore,
29
+ WaitLockKind,
26
30
  } from "./types";
31
+ import { parseWaitConfig } from "./snapshots";
32
+ import type { RunSecretRegistry } from "./run-secret-registry";
27
33
 
28
34
  type Schema = {
29
35
  automationRuns: typeof automationRuns;
30
36
  automationRunSteps: typeof automationRunSteps;
31
37
  automationWaitLocks: typeof automationWaitLocks;
38
+ automationRunState: typeof automationRunState;
39
+ automationWakeIndex: typeof automationWakeIndex;
32
40
  };
33
41
 
42
+ /** The kind-level wildcard ref for a `${kind}:${id}` ref. */
43
+ function wildcardRefFor(ref: string): string {
44
+ const colon = ref.indexOf(":");
45
+ const kind = colon === -1 ? ref : ref.slice(0, colon);
46
+ return `${kind}:*`;
47
+ }
48
+
34
49
  const ACTIVE_STATUSES = ["pending", "running", "waiting"] as const;
35
50
 
36
- export function createRunStore(db: SafeDatabase<Schema>): RunStore {
51
+ /**
52
+ * Predicate for "active runs of this automation". When `contextKey` is
53
+ * `undefined` the filter is per-automation (the default concurrency
54
+ * scope); when provided (string or `null`) it additionally narrows to
55
+ * that context key (the per-context-key scope) - `null` matches runs
56
+ * with no context key.
57
+ */
58
+ function activeRunsPredicate(
59
+ automationId: string,
60
+ contextKey: string | null | undefined,
61
+ ) {
62
+ const conditions = [
63
+ eq(automationRuns.automationId, automationId),
64
+ inArray(automationRuns.status, [...ACTIVE_STATUSES]),
65
+ ];
66
+ if (contextKey !== undefined) {
67
+ conditions.push(
68
+ contextKey === null
69
+ ? isNull(automationRuns.contextKey)
70
+ : eq(automationRuns.contextKey, contextKey),
71
+ );
72
+ }
73
+ return and(...conditions);
74
+ }
75
+
76
+ export function createRunStore(
77
+ db: SafeDatabase<Schema>,
78
+ logger?: Logger,
79
+ /**
80
+ * Run-scoped secret values accumulated during dispatch. When provided,
81
+ * step `resultPayload` / `errorMessage` and run-level `errorMessage` are
82
+ * masked (Jenkins-style, by-value) BEFORE persistence, so no resolved
83
+ * secret can reach a DTO / run-detail page. Optional so tests / older
84
+ * boots degrade to no masking.
85
+ */
86
+ secretRegistry?: RunSecretRegistry,
87
+ ): RunStore {
37
88
  return {
38
89
  async createRun(input: CreateRunInput): Promise<string> {
39
90
  const [row] = await db
@@ -57,14 +108,22 @@ export function createRunStore(db: SafeDatabase<Schema>): RunStore {
57
108
  status === "failed" ||
58
109
  status === "cancelled" ||
59
110
  status === "skipped";
111
+ // Mask the run-level error before persisting (a provider HTTP error
112
+ // could embed a resolved credential).
113
+ const maskedError =
114
+ errorMessage === undefined
115
+ ? null
116
+ : (secretRegistry?.maskText(runId, errorMessage) ?? errorMessage);
60
117
  await db
61
118
  .update(automationRuns)
62
119
  .set({
63
120
  status,
64
- errorMessage: errorMessage ?? null,
121
+ errorMessage: maskedError,
65
122
  finishedAt: isTerminal ? new Date() : null,
66
123
  })
67
124
  .where(eq(automationRuns.id, runId));
125
+ // Drop the run's accumulated mask set once it is terminal (memory-only).
126
+ if (isTerminal) secretRegistry?.drop(runId);
68
127
  },
69
128
 
70
129
  async loadRun(runId: string): Promise<LoadedRun | undefined> {
@@ -89,29 +148,25 @@ export function createRunStore(db: SafeDatabase<Schema>): RunStore {
89
148
  };
90
149
  },
91
150
 
92
- async countActiveRuns(automationId: string): Promise<number> {
151
+ async countActiveRuns(
152
+ automationId: string,
153
+ contextKey?: string | null,
154
+ ): Promise<number> {
93
155
  const rows = await db
94
156
  .select({ count: sql<number>`count(*)::int` })
95
157
  .from(automationRuns)
96
- .where(
97
- and(
98
- eq(automationRuns.automationId, automationId),
99
- inArray(automationRuns.status, [...ACTIVE_STATUSES]),
100
- ),
101
- );
158
+ .where(activeRunsPredicate(automationId, contextKey));
102
159
  return rows[0]?.count ?? 0;
103
160
  },
104
161
 
105
- async hasActiveRun(automationId: string): Promise<boolean> {
162
+ async hasActiveRun(
163
+ automationId: string,
164
+ contextKey?: string | null,
165
+ ): Promise<boolean> {
106
166
  const rows = await db
107
167
  .select({ id: automationRuns.id })
108
168
  .from(automationRuns)
109
- .where(
110
- and(
111
- eq(automationRuns.automationId, automationId),
112
- inArray(automationRuns.status, [...ACTIVE_STATUSES]),
113
- ),
114
- )
169
+ .where(activeRunsPredicate(automationId, contextKey))
115
170
  .limit(1);
116
171
  return rows.length > 0;
117
172
  },
@@ -119,6 +174,7 @@ export function createRunStore(db: SafeDatabase<Schema>): RunStore {
119
174
  async cancelActiveRuns(
120
175
  automationId: string,
121
176
  reason: string,
177
+ contextKey?: string | null,
122
178
  ): Promise<string[]> {
123
179
  const rows = await db
124
180
  .update(automationRuns)
@@ -127,14 +183,26 @@ export function createRunStore(db: SafeDatabase<Schema>): RunStore {
127
183
  errorMessage: reason,
128
184
  finishedAt: new Date(),
129
185
  })
130
- .where(
131
- and(
132
- eq(automationRuns.automationId, automationId),
133
- inArray(automationRuns.status, [...ACTIVE_STATUSES]),
134
- ),
135
- )
186
+ .where(activeRunsPredicate(automationId, contextKey))
136
187
  .returning({ id: automationRuns.id });
137
- return rows.map((r) => r.id);
188
+ const ids = rows.map((r) => r.id);
189
+ // Tear down the cancelled runs' suspension state in the SAME
190
+ // operation: delete their wait locks and durable run-state so a
191
+ // later wake (wakeWaitingRuns / delay-expiry / a racing queue job)
192
+ // can't resurrect a cancelled run. Mirrors the operator cancelRun
193
+ // path. (resumeRun also guards on status, but cleaning up here stops
194
+ // the sweeper from even re-ticking an orphaned lock.)
195
+ if (ids.length > 0) {
196
+ await db
197
+ .delete(automationWaitLocks)
198
+ .where(inArray(automationWaitLocks.runId, ids));
199
+ await db
200
+ .delete(automationRunState)
201
+ .where(inArray(automationRunState.runId, ids));
202
+ // Drop each run's in-memory mask set (terminal).
203
+ for (const id of ids) secretRegistry?.drop(id);
204
+ }
205
+ return ids;
138
206
  },
139
207
 
140
208
  async createStep(input: CreateStepInput): Promise<string> {
@@ -151,6 +219,9 @@ export function createRunStore(db: SafeDatabase<Schema>): RunStore {
151
219
  })
152
220
  .returning({ id: automationRunSteps.id });
153
221
  if (!row) throw new Error("createStep: insert returned no rows");
222
+ // Link the step to its run so updateStep (which carries only stepId)
223
+ // can find the run's mask set.
224
+ secretRegistry?.linkStep(row.id, input.runId);
154
225
  return row.id;
155
226
  },
156
227
 
@@ -159,10 +230,23 @@ export function createRunStore(db: SafeDatabase<Schema>): RunStore {
159
230
  patch.status === "success" ||
160
231
  patch.status === "failed" ||
161
232
  patch.status === "skipped";
233
+ // Mask resolved secret values out of the step output BEFORE persist —
234
+ // this is the run-wide choke point covering ALL actions (provider,
235
+ // log, etc.), not just the script/collector source-side masking.
236
+ const maskedError =
237
+ patch.errorMessage === undefined
238
+ ? null
239
+ : (secretRegistry?.maskTextForStep(stepId, patch.errorMessage) ??
240
+ patch.errorMessage);
241
+ const maskedPayload =
242
+ patch.resultPayload === undefined
243
+ ? null
244
+ : (secretRegistry?.maskDeepForStep(stepId, patch.resultPayload) ??
245
+ patch.resultPayload);
162
246
  const set: Record<string, unknown> = {
163
247
  status: patch.status,
164
- errorMessage: patch.errorMessage ?? null,
165
- resultPayload: patch.resultPayload ?? null,
248
+ errorMessage: maskedError,
249
+ resultPayload: maskedPayload,
166
250
  };
167
251
  if (isTerminal) set.finishedAt = new Date();
168
252
  if (patch.incrementAttempts) {
@@ -214,12 +298,58 @@ export function createRunStore(db: SafeDatabase<Schema>): RunStore {
214
298
  contextKey: input.contextKey,
215
299
  filterTemplate: input.filterTemplate,
216
300
  timeoutAt: input.timeoutAt,
301
+ // Serialisation boundary: UntilWaitConfig is a plain JSON object
302
+ // but its `condition` union isn't structurally a Record, so cast.
303
+ waitConfig: input.waitConfig
304
+ ? (input.waitConfig as unknown as Record<string, unknown>)
305
+ : undefined,
217
306
  })
218
307
  .returning({ id: automationWaitLocks.id });
219
308
  if (!row) throw new Error("createWaitLock: insert returned no rows");
220
309
  return row.id;
221
310
  },
222
311
 
312
+ async createWaitLockWithWakeRefs(
313
+ input: CreateWaitLockWithRefsInput,
314
+ ): Promise<string> {
315
+ return db.transaction(async (tx) => {
316
+ const [row] = await tx
317
+ .insert(automationWaitLocks)
318
+ .values({
319
+ runId: input.runId,
320
+ actionPath: input.actionPath,
321
+ kind: "until",
322
+ eventId: input.eventId,
323
+ contextKey: input.contextKey,
324
+ filterTemplate: null,
325
+ timeoutAt: input.timeoutAt,
326
+ // Serialisation boundary — see createWaitLock.
327
+ waitConfig: input.waitConfig as unknown as Record<string, unknown>,
328
+ })
329
+ .returning({ id: automationWaitLocks.id });
330
+ if (!row) {
331
+ throw new Error("createWaitLockWithWakeRefs: insert returned no rows");
332
+ }
333
+ // De-dupe refs in-process before the insert (the unique index is the
334
+ // cross-process arm-race guard; this keeps the VALUES list tight).
335
+ const uniqueRefs = [...new Set(input.wakeRefs)];
336
+ if (uniqueRefs.length > 0) {
337
+ await tx
338
+ .insert(automationWakeIndex)
339
+ .values(
340
+ uniqueRefs.map((ref) => ({ waitLockId: row.id, ref })),
341
+ )
342
+ .onConflictDoNothing({
343
+ target: [
344
+ automationWakeIndex.waitLockId,
345
+ automationWakeIndex.ref,
346
+ ],
347
+ });
348
+ }
349
+ return row.id;
350
+ });
351
+ },
352
+
223
353
  async loadWaitLock(id) {
224
354
  const rows = await db
225
355
  .select()
@@ -228,17 +358,7 @@ export function createRunStore(db: SafeDatabase<Schema>): RunStore {
228
358
  .limit(1);
229
359
  const row = rows[0];
230
360
  if (!row) return;
231
- return {
232
- id: row.id,
233
- runId: row.runId,
234
- actionPath: row.actionPath,
235
- kind: row.kind as "trigger" | "delay",
236
- eventId: row.eventId,
237
- contextKey: row.contextKey,
238
- filterTemplate: row.filterTemplate,
239
- timeoutAt: row.timeoutAt,
240
- createdAt: row.createdAt,
241
- };
361
+ return mapWaitLock(row, logger);
242
362
  },
243
363
 
244
364
  async findWaitLocksFor(
@@ -255,17 +375,48 @@ export function createRunStore(db: SafeDatabase<Schema>): RunStore {
255
375
  .select()
256
376
  .from(automationWaitLocks)
257
377
  .where(and(...filters));
258
- return rows.map((r) => ({
259
- id: r.id,
260
- runId: r.runId,
261
- actionPath: r.actionPath,
262
- kind: r.kind as "trigger" | "delay",
263
- eventId: r.eventId,
264
- contextKey: r.contextKey,
265
- filterTemplate: r.filterTemplate,
266
- timeoutAt: r.timeoutAt,
267
- createdAt: r.createdAt,
268
- }));
378
+ return rows.map((r) => mapWaitLock(r, logger));
379
+ },
380
+
381
+ async findWaitLocksByWakeRef(ref: string): Promise<LoadedWaitLock[]> {
382
+ // The generalized form of findWaitLocksFor: join the wake-index onto
383
+ // the wait locks and match the exact ref OR the kind-level wildcard.
384
+ const wildcard = wildcardRefFor(ref);
385
+ const rows = await db
386
+ .select({ lock: automationWaitLocks })
387
+ .from(automationWaitLocks)
388
+ .innerJoin(
389
+ automationWakeIndex,
390
+ eq(automationWakeIndex.waitLockId, automationWaitLocks.id),
391
+ )
392
+ .where(
393
+ and(
394
+ eq(automationWaitLocks.kind, "until"),
395
+ inArray(automationWakeIndex.ref, [ref, wildcard]),
396
+ ),
397
+ );
398
+ // A wait may match on both the exact ref and the wildcard; de-dupe by id.
399
+ const byId = new Map<string, LoadedWaitLock>();
400
+ for (const r of rows) {
401
+ if (!byId.has(r.lock.id)) byId.set(r.lock.id, mapWaitLock(r.lock, logger));
402
+ }
403
+ return [...byId.values()];
404
+ },
405
+
406
+ async findWaitLocksByKind(kind): Promise<LoadedWaitLock[]> {
407
+ const rows = await db
408
+ .select()
409
+ .from(automationWaitLocks)
410
+ .where(eq(automationWaitLocks.kind, kind));
411
+ return rows.map((r) => mapWaitLock(r, logger));
412
+ },
413
+
414
+ async findWaitLocksByRun(runId): Promise<LoadedWaitLock[]> {
415
+ const rows = await db
416
+ .select()
417
+ .from(automationWaitLocks)
418
+ .where(eq(automationWaitLocks.runId, runId));
419
+ return rows.map((r) => mapWaitLock(r, logger));
269
420
  },
270
421
 
271
422
  async deleteWaitLock(id: string): Promise<void> {
@@ -282,17 +433,33 @@ export function createRunStore(db: SafeDatabase<Schema>): RunStore {
282
433
  lte(automationWaitLocks.timeoutAt, now),
283
434
  ),
284
435
  );
285
- return rows.map((r) => ({
286
- id: r.id,
287
- runId: r.runId,
288
- actionPath: r.actionPath,
289
- kind: r.kind as "trigger" | "delay",
290
- eventId: r.eventId,
291
- contextKey: r.contextKey,
292
- filterTemplate: r.filterTemplate,
293
- timeoutAt: r.timeoutAt,
294
- createdAt: r.createdAt,
295
- }));
436
+ return rows.map((r) => mapWaitLock(r, logger));
296
437
  },
297
438
  };
298
439
  }
440
+
441
+ /** Map a wait-lock row to the engine's {@link LoadedWaitLock}. */
442
+ function mapWaitLock(
443
+ row: typeof automationWaitLocks.$inferSelect,
444
+ logger?: Logger,
445
+ ): LoadedWaitLock {
446
+ return {
447
+ id: row.id,
448
+ runId: row.runId,
449
+ actionPath: row.actionPath,
450
+ kind: row.kind as WaitLockKind,
451
+ eventId: row.eventId,
452
+ contextKey: row.contextKey,
453
+ filterTemplate: row.filterTemplate,
454
+ timeoutAt: row.timeoutAt,
455
+ // Parse the stored config on load — a drifted/hand-edited row degrades
456
+ // to null (engine treats the `until` lock as gone) instead of being
457
+ // trusted as a wrongly-typed UntilWaitConfig.
458
+ waitConfig: parseWaitConfig({
459
+ value: row.waitConfig,
460
+ logger,
461
+ context: `Wait lock ${row.id}`,
462
+ }),
463
+ createdAt: row.createdAt,
464
+ };
465
+ }