@checkstack/automation-backend 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (125)
  1. package/CHANGELOG.md +544 -0
  2. package/drizzle/0003_sparkling_xorn.sql +17 -0
  3. package/drizzle/0004_cultured_spyke.sql +2 -0
  4. package/drizzle/0005_classy_the_hand.sql +19 -0
  5. package/drizzle/0006_burly_wallop.sql +10 -0
  6. package/drizzle/0007_nappy_jackal.sql +1 -0
  7. package/drizzle/0008_remove_seeded_auto_incident_automations.sql +13 -0
  8. package/drizzle/0009_steady_liz_osborn.sql +12 -0
  9. package/drizzle/0010_chunky_changeling.sql +2 -0
  10. package/drizzle/meta/0003_snapshot.json +1007 -0
  11. package/drizzle/meta/0004_snapshot.json +1028 -0
  12. package/drizzle/meta/0005_snapshot.json +1164 -0
  13. package/drizzle/meta/0006_snapshot.json +1261 -0
  14. package/drizzle/meta/0007_snapshot.json +1215 -0
  15. package/drizzle/meta/0008_snapshot.json +1215 -0
  16. package/drizzle/meta/0009_snapshot.json +1328 -0
  17. package/drizzle/meta/0010_snapshot.json +1349 -0
  18. package/drizzle/meta/_journal.json +56 -0
  19. package/package.json +23 -12
  20. package/src/action-types.ts +23 -0
  21. package/src/artifact-store.ts +16 -1
  22. package/src/automation-store.test.ts +143 -0
  23. package/src/automation-store.ts +30 -8
  24. package/src/builtin-triggers.test.ts +77 -74
  25. package/src/builtin-triggers.ts +105 -108
  26. package/src/dispatch/action-kind.ts +2 -0
  27. package/src/dispatch/assemble-get-service.ts +31 -0
  28. package/src/dispatch/cancel-resurrect.test.ts +147 -0
  29. package/src/dispatch/concurrency-race.test.ts +255 -0
  30. package/src/dispatch/concurrency-scope.test.ts +166 -0
  31. package/src/dispatch/condition.ts +24 -5
  32. package/src/dispatch/dwell-queue.ts +65 -0
  33. package/src/dispatch/dwell-store.ts +154 -0
  34. package/src/dispatch/dwell.it.test.ts +142 -0
  35. package/src/dispatch/dwell.test.ts +799 -0
  36. package/src/dispatch/dwell.ts +257 -0
  37. package/src/dispatch/engine.test.ts +189 -2
  38. package/src/dispatch/engine.ts +555 -9
  39. package/src/dispatch/entity-scope.test.ts +176 -0
  40. package/src/dispatch/get-service-wiring.test.ts +318 -0
  41. package/src/dispatch/numeric.test.ts +71 -0
  42. package/src/dispatch/numeric.ts +96 -0
  43. package/src/dispatch/render.test.ts +34 -0
  44. package/src/dispatch/render.ts +31 -11
  45. package/src/dispatch/reseed-run-secrets.ts +230 -0
  46. package/src/dispatch/run-secret-registry.test.ts +189 -0
  47. package/src/dispatch/run-secret-registry.ts +247 -0
  48. package/src/dispatch/run-state-masking.test.ts +376 -0
  49. package/src/dispatch/run-state-store.ts +95 -38
  50. package/src/dispatch/run-state.ts +226 -59
  51. package/src/dispatch/scope-artifact-masking.test.ts +138 -0
  52. package/src/dispatch/secret-ref-ids.test.ts +19 -0
  53. package/src/dispatch/secret-ref-ids.ts +17 -0
  54. package/src/dispatch/snapshots.test.ts +86 -0
  55. package/src/dispatch/snapshots.ts +79 -0
  56. package/src/dispatch/stage1-router.test.ts +324 -0
  57. package/src/dispatch/stage1-router.ts +152 -0
  58. package/src/dispatch/stage1.it.test.ts +84 -0
  59. package/src/dispatch/stage2-dispatch.test.ts +285 -0
  60. package/src/dispatch/stage2-dispatch.ts +207 -0
  61. package/src/dispatch/stage2-stalled.it.test.ts +132 -0
  62. package/src/dispatch/stalled-sweeper.test.ts +197 -0
  63. package/src/dispatch/stalled-sweeper.ts +112 -5
  64. package/src/dispatch/state-scope.test.ts +234 -0
  65. package/src/dispatch/state-scope.ts +322 -0
  66. package/src/dispatch/structured-conditions.test.ts +246 -0
  67. package/src/dispatch/structured-conditions.ts +146 -0
  68. package/src/dispatch/test-fixtures.ts +306 -38
  69. package/src/dispatch/trigger-fanin.test.ts +111 -0
  70. package/src/dispatch/trigger-subscriber.ts +316 -14
  71. package/src/dispatch/types.ts +263 -8
  72. package/src/dispatch/wait-timeout-queue.ts +89 -0
  73. package/src/dispatch/wait-until-entity-wake.test.ts +544 -0
  74. package/src/dispatch/wait-until.test.ts +540 -0
  75. package/src/dispatch/wake-refs.test.ts +158 -0
  76. package/src/dispatch/wake-refs.ts +348 -0
  77. package/src/dispatch/window-gate.test.ts +513 -0
  78. package/src/dispatch/window-store.test.ts +162 -0
  79. package/src/dispatch/window-store.ts +102 -0
  80. package/src/entity/change-derivers.test.ts +148 -0
  81. package/src/entity/change-derivers.ts +143 -0
  82. package/src/entity/change-emitter.test.ts +66 -0
  83. package/src/entity/change-emitter.ts +76 -0
  84. package/src/entity/create-handle.ts +344 -0
  85. package/src/entity/cross-pod-read-consistency.it.test.ts +281 -0
  86. package/src/entity/define-entity.ts +157 -0
  87. package/src/entity/diff.test.ts +57 -0
  88. package/src/entity/diff.ts +54 -0
  89. package/src/entity/entity-store.test.ts +30 -0
  90. package/src/entity/entity-store.ts +171 -0
  91. package/src/entity/extension-point.ts +56 -0
  92. package/src/entity/fake-entity-store.ts +130 -0
  93. package/src/entity/hook.ts +19 -0
  94. package/src/entity/index.ts +50 -0
  95. package/src/entity/mutate-handle.test.ts +517 -0
  96. package/src/entity/on-entity-changed.test.ts +189 -0
  97. package/src/entity/on-entity-changed.ts +214 -0
  98. package/src/entity/registry.test.ts +181 -0
  99. package/src/entity/registry.ts +200 -0
  100. package/src/entity/stable-stringify.test.ts +55 -0
  101. package/src/entity/stable-stringify.ts +49 -0
  102. package/src/entity/wake-index.it.test.ts +251 -0
  103. package/src/entity/with-entity-write.test.ts +100 -0
  104. package/src/entity/with-entity-write.ts +69 -0
  105. package/src/entity-driven-trigger.ts +46 -0
  106. package/src/extension-points.ts +35 -0
  107. package/src/gitops-docs.test.ts +215 -0
  108. package/src/gitops-docs.ts +151 -0
  109. package/src/gitops-kinds.test.ts +174 -0
  110. package/src/gitops-kinds.ts +137 -0
  111. package/src/index.ts +355 -11
  112. package/src/migration/flapping-to-window.test.ts +123 -0
  113. package/src/migration/flapping-to-window.ts +205 -0
  114. package/src/router.test.ts +182 -1
  115. package/src/router.ts +73 -2
  116. package/src/schema.ts +236 -3
  117. package/src/script-test-replay.test.ts +88 -0
  118. package/src/script-test-replay.ts +100 -0
  119. package/src/script-test-shell-env.test.ts +41 -0
  120. package/src/script-test-shell-env.ts +89 -0
  121. package/src/script-test.test.ts +386 -0
  122. package/src/script-test.ts +258 -0
  123. package/src/trigger-registry.ts +2 -0
  124. package/src/validate-definition.test.ts +1 -0
  125. package/tsconfig.json +24 -0
package/CHANGELOG.md CHANGED
@@ -1,5 +1,549 @@
1
1
  # @checkstack/automation-backend
2
2
 
3
+ ## 0.3.0
4
+
5
+ ### Minor Changes
6
+
7
+ - 270ef29: Fix automation provider actions and `secretEnv` script actions throwing in production.
8
+
9
+ The automation dispatch engine resolved provider-action dependencies (the integration connection store, the secret resolver) through a `getService` that was a throwing stub, so Jira / Teams / Webex actions and `secretEnv` script actions threw at execute time in production. The whole dispatch test suite stubbed `getService`, so the break was invisible.
10
+
11
+ Root cause: the plugin `env` exposed `registerService` but no resolver, so the dispatch path (the only context that resolves arbitrary cross-plugin refs outside an RPC handler) had nothing real to call.
12
+
13
+ Changes:
14
+
15
+ - `@checkstack/backend-api`: add `getService<S>(ref: ServiceRef<S>): Promise<S>` to the plugin `env` (`BackendPluginRegistry`). It resolves a service registered by any plugin through the real `ServiceRegistry` using the calling plugin's identity, and throws a clear error if the ref is not registered (never silently `undefined`). **NEW PLUGIN-AUTHOR CONTRACT**: `env.getService` is now available to resolve arbitrary cross-plugin service refs at init / afterPluginsReady time.
16
+ - `@checkstack/backend`: implement `env.getService` in both the plugin loader and the runtime single-plugin registration path, backed by `ServiceRegistry.get(ref, { pluginId })`.
17
+ - `@checkstack/automation-backend`: wire the dispatch `getService` to `env.getService` (was a throwing stub). This also activates run-wide provider-credential masking, because resolving the connection store / secret resolver now flows through the run's masking interceptor.
18
+
19
+ Also fixes a test-only seam where the `core/backend` test preload registered a no-op `registerRouter`, silently disabling oRPC router registration across the suite.
20
+
21
+ - b995afb: Show auto-generated trigger ids in the automation editor without clicking the field.
22
+
23
+ Previously, loading a stored definition (a seeded default, a GitOps-managed automation, or hand-written YAML) whose triggers carried no `id` left the Id field blank until the operator focused and blurred it. The editor now materializes the derived id eagerly on load - the same way the starter automation and "Add step" path already do - so the id is shown (and referenceable as `trigger.id`) immediately. The runtime already derived these ids, so saved definitions are unchanged.
24
+
25
+ The auto-incident migration also now writes explicit trigger ids (matching `deriveTriggerId(event)`) into the seeded sustained and flapping automations, so newly seeded defaults carry the same id the editor shows.
26
+
27
+ - b995afb: Surface per-variant config documentation for the `Automation` GitOps kind.
28
+
29
+ The GitOps editor and Kind Registry Browser now show the right config schema
30
+ for each automation trigger and provider action when authoring an
31
+ `Automation` YAML, mirroring how the `Healthcheck` kind documents its
32
+ strategy/collector configs:
33
+
34
+ - `triggers[].config` — one entry per registered trigger that declares a
35
+ `configSchema`, conditioned on the chosen `triggers[].event`.
36
+ - `actions[].config` — one entry per registered provider action,
37
+ conditioned on the chosen `actions[].action`.
38
+
39
+ New plugin-author contract on the entity kind registry:
40
+
41
+ - `@checkstack/gitops-common` / `@checkstack/gitops-backend`: add
42
+ `EntityKindRegistry.registerSpecSchemaDocumentationProvider(provider)`. The
43
+ provider is a thunk invoked on every `describeKinds()` (i.e. each time the
44
+ kind-browser RPC is queried), so the docs it returns reflect the current
45
+ state of whatever it reads — order-independent.
46
+
47
+ Why a lazy provider (and not the existing eager
48
+ `registerSpecSchemaDocumentation`): unlike Healthcheck, whose
49
+ strategy/collector registries are core services fully populated before any
50
+ plugin's `afterPluginsReady`, the automation trigger/action registries are
51
+ filled by other plugins across their `init` / `afterPluginsReady` phases with
52
+ no guaranteed ordering. Several plugins (catalog/maintenance/notification)
53
+ register their provider actions in their own `afterPluginsReady`, so the
54
+ previous one-shot eager registration snapshotted a half-populated (often
55
+ empty) registry and the Automation kind's "Additional Schemas" came up empty.
56
+ automation-backend now registers a provider instead, so trigger/action config
57
+ docs always reflect the fully-populated registries.
58
+
59
+ Documentation-only surface; no runtime reconcile behaviour changes.
60
+
61
+ - b995afb: Add grouping to automations so they are easier to find.
62
+
63
+ Each automation now carries an optional single free-text `group` label (a Home Assistant (HA)-style "category"), stored as its own column on the `automations` row alongside `name` / `description` / `status` - it is NOT part of the definition / YAML. The automations list renders one collapsible section per group (sorted alphabetically, with an implicit "Ungrouped" bucket last), and the edit page gains a type-new-or-pick-existing group picker fed by the new `listAutomationGroups` query. `listAutomations` accepts an optional `group` filter.
64
+
65
+ Declaratively managed automations express their group via GitOps `metadata.labels.group`; the reconciler threads it onto the row (blank clears it).
66
+
67
+ A Drizzle migration adds the nullable `"group"` column and an index. Existing automations default to no group (Ungrouped) and behave exactly as before.
68
+
69
+ - 270ef29: Add live state in scope plus duration helpers to the automation sensing layer (Wave 2 Phase 14).
70
+
71
+ - `@checkstack/template-engine` ships four pure, synchronous duration filters: `minutes` and `hours` (number to milliseconds), `duration_since` (ms elapsed since an ISO timestamp), and `older_than(thresholdMs)` (boolean dwell check). They compute against real time at call time, so "now" is fresh per evaluation. Fail-safe on null/unparseable input.
72
+ - The dispatch engine pre-resolves live health state into scope before any condition or template evaluation (the engine is synchronous, so inline state queries are impossible). State is folded under a `health` namespace - `health.system.*` for the trigger's context system and `health.systems[<id>]` for ids listed in the automation's new `uses_state` field. One batched `getBulkHealthState` query per evaluation, wired at the fresh-run, resume, and trigger-gate sites. Fail-open: a missing client or provider error yields an empty namespace and a warning, never wedging unrelated automations.
73
+ - New `automationFilterExtensionPoint` lets plugins contribute pure template filters without forking the engine's default registry. Name collisions with built-ins are skipped with a warning.
74
+ - The editor variable-scope resolver and autocomplete catalogue now surface the `health.*` namespace and the new duration filters.
75
+
76
+ With this phase alone, an operator can build "notify me when a system has been unhealthy for 30 minutes" using an interval trigger plus a single `health.*` condition - no dwell timer required (the precise event-driven path lands in Phase 15).
77
+
78
+ - 270ef29: Add the `for:` dwell on triggers (Wave 2 Phase 15) - precise, event-driven, restart-safe "fire only if the matched state still holds after Y".
79
+
80
+ - New first-class `TriggerSchema.for` (decision D1): a single-unit duration (`{ seconds | minutes | hours }`) or `{ template }` rendering to seconds. A `durationToMs` helper resolves it. Not buried in `config`.
81
+ - New pre-run `automation_dwell_timers` table (decision D5): a dwell arms before any run exists, so it cannot reuse the run-scoped wait locks. Unique on `(automationId, triggerId, contextKey)` so a re-fire re-arms (pushes `fireAt`) rather than stacking timers.
82
+ - Arm / re-arm / fire / cancel wired into the trigger fan-in. When a `for:` trigger fires and its filter passes, the engine snapshots the current status, upserts the dwell row, and enqueues an `automation-dwell` wake job with the matching `startDelay` - no run starts yet.
83
+ - At expiry the dwell re-confirms (via the Phase 13 health-state provider) that the system is still in the armed status, then re-checks the automation's pre-run conditions, then starts the run honouring the concurrency mode. A recovery within the window cancels the pending fire even without an explicit inverse event.
84
+ - Cancellation is DB-side (delete the row; the queue job no-ops when it pops, since queue jobs are not cancellable). A contradicting state-change event eagerly deletes a stale dwell. Deleted automations drop their dwells via FK cascade; disabled automations drop them at fire time.
85
+ - Durability: the dwell row is the source of truth. A new `automation-dwell` queue consumer fires dwells, and the stalled sweeper catches expired rows whose job was lost. Both paths are idempotent via delete-on-fire, so a dwell fires at most once and survives restart.
86
+
87
+ Example:
88
+
89
+ ```yaml
90
+ triggers:
91
+ - event: healthcheck.system.degraded
92
+ for: { minutes: 30 }
93
+ actions:
94
+ - action: incident.create
95
+ config:
96
+ title: "{{ trigger.payload.systemName }} is critical"
97
+ severity: critical
98
+ systemIds: ["{{ trigger.payload.systemId }}"]
99
+ ```
100
+
101
+ - 270ef29: Add the `numeric_state` trigger and three structured condition variants (Wave 2 Phase 16, backend-only).
102
+
103
+ - New built-in `numeric_state` trigger: hook-backed on `healthcheck.check.completed`, fires when a numeric field (`latencyMs` top-level, or a `collectors.<id>.<field>` path) crosses an `above` / `below` threshold. The per-automation threshold is enforced by a new structured config gate (`TriggerDefinition.evaluateConfig`) that runs before the operator's template filter. Pairs with a trigger-level `for:` (Phase 15) for sustained thresholds. v1 is level-triggered; edge de-duplication is deferred. (Per-check `p95LatencyMs` is not in the hook payload; read windowed p95 via a `numeric_state` _condition_ against `health.system.p95_latency_ms` instead.)
104
+ - Corrected the Phase 15 dwell `arm` semantics to be insert-if-absent: a re-fire while a dwell is still armed PRESERVES the original `fireAt` instead of pushing it. Required for the level-triggered `numeric_state` trigger above - otherwise a trigger firing on every check completion (e.g. every 60s) with `for: 10m` would re-arm and push the deadline forward indefinitely, never elapsing. A genuine recover-then-recur still deletes the row (re-confirm / inverse-cancel) so a fresh window starts.
105
+ - Extended the condition grammar (`ConditionInput`) beyond `string | and | or | not` with three typed variants evaluated over the pre-resolved `health.*` scope plus a FRESH `now` per evaluation:
106
+ - `numeric_state`: `{ value, above?, below? }` (value is a literal number or a template/path string).
107
+ - `time`: `{ after?, before?, weekday?[], timezone? }` for on-call / quiet-hours gating, including overnight windows wrapping midnight, weekday filtering, and IANA timezone resolution via `Intl`.
108
+ - `state`: `{ entity, status, for? }` - a condition-side dwell read from `health.systems[entity].in_status_for_ms` (no new timer; it reads, it doesn't time).
109
+ - The raw template string stays the escape hatch. Everything round-trips through zod and YAML.
110
+
111
+ Editor widgets (ConditionEditor branches, duration/time-of-day inputs, operator selects) are intentionally deferred to Phase 19; the YAML editor already round-trips the new schema, so the feature is fully usable and testable via YAML today.
112
+
113
+ - 270ef29: Add the `wait_until` action primitive (Wave 2 Phase 17) - suspend a running automation until a condition becomes true, with an optional timeout (HA's `wait_template`).
114
+
115
+ - New `wait_until: { condition, timeout_seconds?, continue_on_timeout? }` primitive. `continue_on_timeout` defaults to true (HA semantics). Added to the schema, the action union, and `detectActionKind`. (The wait is fully reactive - see the reactive-dispatch-pipeline changeset; there is no `poll_seconds`.)
116
+ - `condition` accepts any condition shape - a template string or the Phase 16 structured `numeric_state` / `time` / `state` variants.
117
+ - Reactive resume: if the condition is already true it continues inline; otherwise it persists a `kind: "until"` wait lock (carrying the condition + timeout policy in a new `wait_config` jsonb column). The reactive-dispatch-pipeline changeset replaces the original poll-based re-check with a wake-index + a single timeout timer, so the wait is woken by a relevant entity change rather than ticked on an interval. Resumes take the per-run advisory lock so a wake and a sweep can't double-resume.
118
+ - Survives restart: the wait lock is the source of truth, and the stalled sweeper applies the timeout policy as a backstop if the wake/timer signal is lost.
119
+ - Works nested inside `choose` / `parallel` / `repeat` via the existing resume-remainder mechanism.
120
+ - Editor: a `wait_until` action card (frontend) mirroring `wait_for_trigger` - a `ConditionEditor` plus timeout and continue-on-timeout inputs. The structured numeric/time/state ConditionEditor branches land with the rest of the sensing-layer editor work; the card uses the expression-based editor for now.
121
+
122
+ - 270ef29: Add a windowed transition count to the health provider - the building block for custom flapping rules (Wave 2 Phase 18).
123
+
124
+ Flapping is already buildable today via the built-in `healthcheck.flapping_detected` trigger; this phase ships the GENERALIZATION for arbitrary "N status changes in M minutes" rules.
125
+
126
+ - `countStateTransitionsInWindow` counts aggregate status transitions for a system over a trailing window (from the Phase 13 `health_check_state_transitions` table - all statuses, generalizing the unhealthy-only flapping detector). Fail-safe to 0.
127
+ - `getHealthState` / `getBulkHealthState` now return `transitionsInWindow` + `transitionWindowMinutes`, and accept an optional `transitionWindowMinutes` input (default 60).
128
+ - The automation definition gains an optional top-level `state_window_minutes` (default 60), threaded through `enrichScopeWithState` so `health.system.transitions_in_window` / `health.system.transition_window_minutes` are folded into scope per evaluation.
129
+ - Operators author custom flapping as a `numeric_state` condition over `health.system.transitions_in_window` - no new condition variant, no editor change. The variable-scope resolver surfaces the new fields for autocomplete.
130
+
131
+ - 270ef29: Replace the hardcoded auto-incident path with default automations (Wave 2 Phase 20).
132
+
133
+ BREAKING CHANGES: Auto-incident is now automation-driven. The hardcoded background path that opened incidents on sustained-unhealthy / flapping and closed them after a cooldown (`auto-incident.ts`, `auto-incident-close-job.ts`) is removed. On upgrade, an idempotent, threshold-preserving migration seeds equivalent default automations from each assignment's existing `NotificationPolicy`, so alerting behaviour is preserved 1:1:
134
+
135
+ - `sustainedUnhealthyTrigger.durationMinutes` -> the `for:` dwell on a `healthcheck.system_degraded` trigger -> `incident.create`.
136
+ - auto-close `autoCloseAfterMinutes` -> a `wait_until` (healthy continuously for the cooldown) -> `incident.resolve`.
137
+ - `useNotificationSuppression` -> the incident's `suppressNotifications`.
138
+ - `skipDuringMaintenance` -> a `{{ !health.system.in_maintenance }}` pre-run condition.
139
+ - `flappingTrigger.{transitions,windowMinutes}` -> a second automation on the `healthcheck.flapping_detected` trigger -> `incident.create`.
140
+
141
+ Auto-incidents remain ONE OPEN INCIDENT PER SYSTEM, faithful to the old behaviour. `incident.create` gains an opt-in `dedupe_open_for_system` config flag (default false, so existing/custom automations are unaffected): when true, it reuses an existing open incident on the target system instead of opening a duplicate (the old `findActiveAutoIncident(systemId)` semantic), returning the reused incident as the produced `incident` artifact. The seeded default automations set this flag, so a system with several failing checks - sustained and/or flapping - still gets a single open incident; whichever check crosses its threshold first opens it, and the rest dedupe to it. Both sustained and flapping default automations open at `critical` severity (parity with the old path). Per-system run dedup within an automation uses `concurrency_scope: "context_key"` + `mode: "single"`.
142
+
143
+ Operators can read, edit, disable, and extend these automations (see the "Customise auto-incident" guide). Seeded automations are tagged via `managedBy` (`auto-incident:<systemId>:<configurationId>:<kind>`) so the migration is a no-op on re-runs; anything unmappable is recorded as a migration-failure row.
144
+
145
+ Flapping DETECTION (transition recording + the `healthcheck.flapping_detected` emit) is relocated into `flapping-detector.ts` and survives; the emit now fires unconditionally on a threshold cross (no longer gated on `autoOpenIncidentOnUnhealthy`), matching the hook's documented intent and required for the flapping default automation. The legacy `health_check_auto_incidents` mapping table is no longer written or read (it will be dropped in a follow-up migration); `health_check_unhealthy_transitions` is retained for the flapping detector.
146
+
147
+ New service-typed `HealthCheckApi.listAutoIncidentPolicies` RPC exposes each assignment's effective notification policy for the migration. `incident.create` adds the `dedupe_open_for_system` flag (additive, defaults off).
148
+
149
+ - 270ef29: Add the GitOps `Automation` entity kind (Wave 2 Phase 21).
150
+
151
+ - `automation-backend` registers an `Automation` kind with the GitOps entity-kind registry (`specSchema: AutomationDefinitionSchema`). Reconcile upserts by name (identity tracked via the returned entity id + provenance); reconciled rows are tagged `managed_by = "gitops"`. Delete is guarded to GitOps-managed rows. An automation's full definition - triggers (with `for:` dwells), structured conditions, the action catalog, mode, `concurrency_scope`, `uses_state`, `state_window_minutes` - can now be declared in Git.
152
+ - `automation-frontend`: the editor reads the GitOps provenance lock (`useProvenanceLock({ kind: "Automation", entityId })`) and, when locked, disables Save / Run-now / Delete and the form fields and shows a `GitOpsLockBanner`.
153
+ - Documented the `Automation` YAML format under the GitOps kinds reference, plus new automation platform overview + plugin-author ("extending") developer-guide pages.
154
+
155
+ - 270ef29: Add per-context-key concurrency scope to automations (Phase 20 prerequisite).
156
+
157
+ A new optional `concurrency_scope: "automation" | "context_key"` field on the automation definition controls the bucket the concurrency `mode` is evaluated over:
158
+
159
+ - `automation` (default, backward-compatible): one bucket for the whole automation - `single` allows one in-flight run total, `restart` cancels every active run. Existing automations are unchanged.
160
+ - `context_key`: an independent bucket per `contextKey` (typically per system / incident) - `single` allows one in-flight run _per context key_ (system A and system B run concurrently, but a second run for system A is deduped), and `restart` cancels only the active runs sharing the incoming context key.
161
+
162
+ `RunStore.hasActiveRun` / `countActiveRuns` / `cancelActiveRuns` gain an optional `contextKey` filter (the `automation_runs.context_key` column already exists, so no migration). `respectConcurrencyMode` threads the scope through. This is the primitive the default auto-incident automations need for faithful per-system deduplication.
163
+
164
+ - b995afb: Reactive two-stage dispatch pipeline + wake-index (reactive automation engine Phase 5).
165
+
166
+ The automation engine now reacts to entity-state changes through a two-stage work-queue pipeline instead of polling. State changes flow `ENTITY_CHANGED` → Stage-1 route (one instance claims) → Stage-2 dispatch fan-out (any instance runs one run).
167
+
168
+ - **Wake-index** (`automation_wake_index` child table of `automation_wait_locks`): a suspended `wait_until` records the `state.*` refs its condition reads (`${kind}:${id}`, or the kind-level wildcard `${kind}:*` when an id is dynamic), and a relevant change wakes it via an indexed intersection lookup. Reference extraction (`wake-refs.ts`) covers structured `state` / `numeric_state` conditions and template member-expressions rooted at `state.<kind>.<id>` or back-compat `health.*`; an indeterminate extraction logs at `warn` and falls back to the timeout timer only (never silent).
169
+ - **Reactive `wait_until`**: on suspend the engine inserts the wait lock + wake-index rows in a transaction and arms a single durable timeout timer at the deadline (queue `automation-wait-timeout`). A wake re-enriches scope **kind-agnostically** — health via the RPC client (`scope.health.*`, back-compat) AND every other `state.<kind>.<id>` ref the wait depends on (plus the changed ref) resolved through each kind's `read` accessor into `scope.state.<kind>.<id>.<field>` — then synchronously re-evaluates the full condition and resumes only if it now holds. This makes waits on non-health entities (incident, slo, …) resolve correctly when that kind changes, not just health. The stalled sweeper applies the timeout policy as a backstop if the timer job is lost.
170
+ - **Two-stage queues**: Stage 1 subscribes to `ENTITY_CHANGED` in work-queue mode (`workerGroup: "automation-entity-route"`) and does only indexed routing (wake-index intersection + trigger-event derivation), enqueuing per-run Stage-2 jobs onto `automation-dispatch` (`consumerGroup: "automation-dispatch-run"`, `maxRetries: 3`), which routes on `reason` to `dispatchTrigger` (trigger) or `resumeRun` (wake).
171
+ - **Entity-change → trigger-event derivation registry** (`registerChangeDeriver` on the `automation.entity` extension point): domains register a per-kind deriver mapping a change to the qualified trigger event id(s) Stage-1 routing fans out. No real domains are migrated in this phase, so production routing is a no-op until Phase 4 supplies the derivers.
172
+ - **Public `onEntityChanged({ kind, handler, delivery? })`** on the entity extension point: other plugins react to another domain's entity changes without touching the internal (unexported) `ENTITY_CHANGED` hook. Default delivery is `broadcast` (every instance); opt into `work-queue` (with a `workerGroup`) for exactly-once-per-cluster work.
173
+
174
+ BREAKING CHANGES:
175
+
176
+ - The polling `template` built-in trigger is removed. Its real cases are covered reactively by the `numeric_state` / `state` triggers + conditions. Re-author any `template` triggers as `numeric_state` / `state`.
177
+ - `wait_until` changed from interval polling to reactive wake-on-change. Semantics are preserved (wakes when the condition becomes true; times out at the deadline) but the `poll_seconds` field is now inert — a wait no longer re-checks on a timer, it is woken by a relevant `ENTITY_CHANGED` (with the durable timeout timer + sweeper as the deadline backstop).
178
+ - The `automation-wait-until` re-check queue and its consumer are removed (`wait-until-queue.ts`), along with the stalled sweeper's periodic `until`-lock re-tick. Reactive `wait_until` uses the wake-index + a single `automation-wait-timeout` timer instead.
179
+
180
+ - b995afb: fix(automation): preserve `${{ secrets.NAME }}` references in secret config fields during dispatch
181
+
182
+ The dispatch engine renders an action's `config` through the `{{ }}` template
183
+ engine before validating it. The secret-reference syntax `${{ secrets.NAME }}`
184
+ embeds `{{ secrets.NAME }}`, so the engine evaluated that inner expression
185
+ against a scope with no `secrets`, collapsing the value to `$` and failing
186
+ config validation (`invalid_union` on the secret field) for any real run that
187
+ used a `secretEnv` mapping or an `x-secret` field. The in-UI "Test Script"
188
+ path was unaffected because it never renders config.
189
+
190
+ `renderConfig` now passes fields annotated `x-secret` or `x-secret-env` through
191
+ verbatim (the same treatment as native-code `x-editor-types` fields), so the
192
+ secret reference reaches the secret resolver intact. Resolution and output
193
+ masking are unchanged.
194
+
195
+ - 270ef29: Fix cross-pod secret leak when a suspended automation run resumes on a different instance (security).
196
+
197
+ The run-wide output-masking registry is in-memory and per-process: it only holds the secret values a run resolved on the pod that originally ran it. When a run suspended (`wait_for_trigger` / `delay` / `wait_until`) on pod A and later resumed — via the wake path (`resumeRun`) or the stalled-run sweeper (`recoverStalledRun`) — on pod B with a fresh, empty registry, every masking choke point on pod B (step output, run error, scope snapshot, artifact data) ran against an EMPTY mask set. Any value still carrying pod A's resolved credential (a carried-over scope variable, an artifact echoing it, a provider error string) was therefore persisted UNMASKED, where `getRunScopeForReplay` and the run-detail UI could read it. This was the deferred "L2 cross-pod masking" gap.
198
+
199
+ Fix: on `resumeRun` / `recoverStalledRun`, RE-SEED the resuming pod's mask registry BEFORE walking or persisting. The engine re-resolves the automation's declared secret refs — the `secretEnv` mappings and `connectionId` references its action configs use, collected by walking the full nested action tree — through the run's already-wrapped `getService`, which auto-registers each resolved value. This re-populates exactly the least-privilege, by-value mask set the run is allowed to see (re-resolving yields the same set the run resolves during normal execution, so it grants no extra access). Re-seeding is best-effort: a rotated/deleted secret simply isn't added to the mask set (the action's own re-run would surface a genuinely-missing secret), and a resolution failure never aborts the resume. No-op when masking isn't wired (tests / minimal installs).
200
+
201
+ - b995afb: Make `dependency-edge` a plugin-backed reactive entity via the Model-B entity state machine + rewire cross-plugin consumers.
202
+
203
+ Dependency defines a `dependency-edge` entity `{ sourceSystemId, targetSystemId, impactType, transitive }` keyed by dependency id. The `dependencies` table is BOTH authoritative AND the entity's current-state storage - there is no framework `entity_state` row for a dependency edge. `defineEntity` is given a plugin `read` accessor (`DependencyService.getManyEntityStates`) that projects the reactive subset straight off that table, and every reactive-state write goes through `handle.mutate` / `handle.remove`: `apply` performs the REAL `dependencies` write (the plugin's own db/tx, including the cycle/duplicate validation that may throw) and returns the new state; the framework snapshots `prev` via `read` BEFORE the write, appends the transition log, and emits `ENTITY_CHANGED` AFTER the write commits. Covered sites: create, update, delete (tombstone), plus the `dependency.create` / `dependency.remove` automation actions. Create sites pre-generate the id so the create's `prev` snapshot reads the not-yet-existing row as absent; `createDependency` accepts an optional pre-generated `id` (server-owned either way). The `dependency_derived_states` propagation cursor is declared non-reactive (bookkeeping).
204
+
205
+ A change -> trigger-event deriver reproduces the existing `dependency.created` / `.updated` / `.deleted` qualified events so automations keep firing. The old `dependency.created` / `.updated` / `.deleted` change hooks are removed; the catalog + healthcheck consumers switched from `onHook(<hook>)` to `onEntityChanged({ kind })`, all keeping `work-queue` delivery (cleanup + downstream-propagation are side-effecting writes that must run once per cluster):
206
+
207
+ - `dependency-system-cleanup`: reacts to `catalog-system` tombstones (`change.next === null`).
208
+ - `dependency-notification-evaluator` / `-recovery`: react to `health` changes filtered to a degraded / recovered transition via `classifyHealthChange`, reproducing the old `systemDegraded` / `systemHealthy` predicates.
209
+
210
+ `@checkstack/automation-backend` adds `makeEntityDrivenTriggerSetup()` - a no-op `setup` factory so a migrated domain's lifecycle triggers stay in the editor's trigger catalog (and register cleanly) while being fired by the entity change deriver via Stage-1 routing rather than a hook.
211
+
212
+ BREAKING CHANGES:
213
+
214
+ - The `dependency.created` / `dependency.updated` / `dependency.deleted` cross-plugin hooks (the `createHook` descriptors) are removed. Dependency lifecycle is now the reactive `dependency-edge` entity; the matching trigger events still fire (via the entity change deriver), so existing automations on `dependency.created/.updated/.deleted` keep working. The `dependency.impact_propagated` hook is KEPT (a derived fan-out signal, not a single mutable field). No in-repo plugin subscribed to the removed hooks.
215
+ - On the RPC create path, the `dependency.created` entity emit (via `mutate`) now precedes the `DEPENDENCY_CHANGED` realtime signal broadcast (previously the signal fired first, then the mirror); both still fire on a successful create.
216
+ - NARROWING: `dependency.updated` now fires only on a change to the REACTIVE state (`impactType`, `sourceSystemId`, `targetSystemId`, or `transitive`). A label-only edit no longer fires `dependency.updated` (the label is not reactive entity state). Any automation that needed to react to a label-only dependency edit must be re-authored against a different signal.
217
+
218
+ - 270ef29: Fix suspend/resume durability + complete the run-wide secret-masking guarantee.
219
+
220
+ A panel review confirmed several defects in the automation dispatch engine's suspend/resume durability and in the run-wide masking choke point. These survived because the unit suite stubbed the seam under test; the fixes ship with tests that exercise the real suspend / sweep / resume paths.
221
+
222
+ Suspend/resume durability:
223
+
224
+ - **Stalled sweeper no longer re-runs intentional waits.** `findStalledRunIds` now joins `automation_runs` and returns only `status = 'running'` runs, and suspend-finalisation no longer clobbers the run's `lastActionPath` checkpoint to `null`. Previously any wait longer than the stale window (>60s) was re-walked from the top every sweep cycle, re-firing pre-wait side effects and leaking wait locks. The wait-aware sweeps now also run before the stalled-run sweep.
225
+ - **Stalled recovery refuses a run holding a live wait lock.** `recoverStalledRun` now only recovers a genuinely-`running` run with no wait lock; a crash-mid-wait recovery is left to the wait/resume paths instead of re-walking from the top and creating a duplicate lock + duplicate delay job.
226
+ - **Cancelled runs can no longer resurrect.** `resumeRun` guards on `status === 'waiting'` (mirroring `checkWaitUntil`) and drops any stale lock for a non-waiting run, so `wakeWaitingRuns` / delay-expiry / a racing queue job can't wake a cancelled or terminal run. `cancelActiveRuns` (restart mode) now deletes the cancelled runs' wait locks + run-state in the same operation.
227
+ - **Concurrency check-then-create is serialized.** The `mode` check + `createRun` now run under a transaction-scoped advisory lock keyed on `(automationId, scope)`, so two concurrent fires can't both pass a `single`-mode "no active run" check and double-run.
228
+
229
+ Masking guarantee (now genuinely covers scope + artifacts):
230
+
231
+ - **The run-wide masking choke point now also masks the durable scope snapshot and produced artifacts.** The `RunSecretRegistry` is threaded into `RunStateStore.upsert` (masks `scopeSnapshot`) and `ArtifactStore.record` (masks `data`) so a resolved connection credential threaded into `scope.variables` or surfaced into an artifact is redacted before persist - and therefore cannot reach a read-only user via `getRunScopeForReplay`. **GUARANTEE CHANGE**: run-wide masking now covers step output, run error, scope snapshot, and artifact data for every action.
232
+ - **`testConnection` / `testProviderConnection` mask provider errors.** These RPCs run outside a dispatch run, so they build a per-call mask set from the resolved/submitted connection config and run any provider error through it before returning, so a provider error echoing a token can't cross back to the browser.
233
+ - **Short secrets surface a warning.** `setSecret` now warns that a value shorter than `MIN_MASKABLE_LENGTH` (4) cannot be auto-redacted (the threshold is intentionally not lowered).
234
+
235
+ Internal:
236
+
237
+ - `@checkstack/backend-api`: `withXactLock`'s `fn` now receives the transaction handle `tx` so a critical section can run on the locked connection; the doc clarifies why running on the pool inside the lock window is still safe. The incident dedup caller's comment is corrected accordingly. `RunStore` gains `findWaitLocksByRun`.
238
+
239
+ - b995afb: Add the entity state machine core (`defineEntity`) - the foundational primitive of the reactive automation engine - as a Model-B plugin-backed reactive WRAPPER with NO framework-owned current-state storage.
240
+
241
+ `defineEntity` owns NO current-state storage of its own. Each kind declares a required plugin `read` accessor pointing at wherever its state lives (its own durable table, or a value computed on read from its own durable tables), and `defineEntity` makes that state reactive. There is no framework current-state store and no "homeless" fallback: every kind is plugin-backed. This makes a non-reactive write structurally impossible and guarantees every transition is durably logged without duplicating the plugin's state.
242
+
243
+ - `@checkstack/automation-backend`:
244
+
245
+ - New `automation.entity` extension point exposing `defineEntity(input)`, `declareNonReactiveState(input)`, `onEntityChanged(...)`, and `registerChangeDeriver(...)`. automation-backend registers the impl in `register`, so other plugins can resolve it and declare entities during their own `register`/`init` (Proxy-buffered until the impl registers).
246
+ - **Driven single mutation entry point.** All reactive-state writes go through `handle.mutate({ id, opts?, apply: () => Promise<TState> })`. The handle snapshots `prev` via `read` BEFORE the write, runs the plugin's `apply` (the actual write, committed in the PLUGIN's own transaction, returning the resulting state), validates `next` (zod), masks run-originated writes through the run-secret registry, diffs prev to next, and on a real diff appends the field-level transition rows to `entity_transitions` and emits `ENTITY_CHANGED` - both AFTER the plugin write commits (never on a rolled-back / throwing write). A structurally-unchanged write is a no-op. `handle.remove({ id, opts?, apply: () => Promise<void> })` is the tombstone counterpart (records the tombstone transition, emits next = null).
247
+ - **Cross-plugin transaction boundary.** `apply` takes NO framework tx: a plugin-backed kind lives behind a DIFFERENT drizzle client than `entity_transitions`, and two clients cannot share one transaction. The plugin write is authoritative; the transition-log append runs in the framework's own transaction AFTER the plugin write commits. A failure between them leaves correct plugin state with a missing history row (a gap, never a corruption).
248
+ - **`get` / `getMany`** route to the kind's `read`; **`inStateSince` / `inStateForMs` / `transitionCount`** read the per-field `entity_transitions` log (generalizing Phase-13 health transitions to any entity).
249
+ - **No framework keyed store.** There is no generic `entity_state` table, no `createKeyedStore`, and no `entityKeyedStoreServiceRef`: kinds whose state has no domain table of their own (the `health` aggregate, the `slo` budget/streak view) compute their `read` on demand from their own durable data instead of materializing a framework copy. `entity_transitions` (the change-history log) is the framework's ONLY persistent table and is written for EVERY kind regardless of where current state lives.
250
+ - **`entityResolverFor(kind)`** routes scope enrichment + the reactive `wait_until` wake re-eval to each kind's `read` accessor. Generalized scope enrichment (`enrichScopeWithEntities`) folds any `state.<kind>.<id>` ref into `scope.state.<kind>.<id>.<field>`. The rich `scope.health.*` condition snapshot (status, latency, success rate, in-maintenance, transitions-in-window, ...) is resolved EXCLUSIVELY through the healthcheck RPC path (the health aggregate is computed on read, not stored as a framework row) and the generic entity pass never writes `scope.health`; `state.health.*` remains the minimal reactive entity view. These are two complementary projections by design, not a migration shim.
251
+ - **Horizontal-scale read-consistency guard.** A reactive entity's current state MUST be globally readable from shared/durable storage, never process-local memory (`.agent/rules/state-and-scale.md`). Enforced by the `checkstack/no-pod-local-entity-state` ESLint tripwire at the `defineEntity({ read })` boundary (wired at `warn`) and the deterministic `cross-pod-read-consistency.it.test.ts` integration test.
252
+ - Load-time validation hard-fails a malformed registration (non-`z.object` state, missing/duplicate `kind`, or a missing / non-function `read`).
253
+ - The `ENTITY_CHANGED` hook is internal (not exported); the change emitter buffers events produced during the init window and flushes them in order once the hook wiring is available in `afterPluginsReady`.
254
+
255
+ - `@checkstack/automation-common`:
256
+
257
+ - New `EntityChangedSchema` (the `ENTITY_CHANGED` payload - `kind`, `id`, `prev`, `next`, `delta`, `changedFields`, `actor`, `occurredAt`) and `DispatchJobSchema` (the Stage-2 `trigger` / `wake` dispatch job).
258
+
259
+ - `@checkstack/automation-frontend`: the `wait_until` editor no longer offers the inert `poll_seconds` field (reactive waits don't poll).
260
+
261
+ This phase adds the primitive only: domains are migrated in their own changesets. No external behavior changes for existing automations.
262
+
263
+ BREAKING CHANGES: There is no framework current-state store. Any out-of-tree plugin must own its entity state in its own durable storage (its own table, or a compute-on-read over it) and pass a `read` accessor to `defineEntity`. `createKeyedStore` / `KeyedStore` / `entityKeyedStoreServiceRef` / `EntityKeyedStoreService` do not exist, and there is no `entity_state` table. `handle.set` / `handle.patch` and the `indexes` option do not exist; all writes go through `handle.mutate` / `handle.remove`.
264
+
265
+ - b995afb: Extract a shared `withEntityWrite` / `withEntityRemove` guard for PLUGIN-BACKED (Model B) reactive entities and refactor the per-domain copies onto it.
266
+
267
+ Every plugin-backed domain (incident, catalog, dependency, maintenance, slo, satellite) reimplemented the same "no handle wired → run the plugin write directly; handle wired → route through `handle.mutate` / `handle.remove`" guard, varying only in the id-key name. `@checkstack/automation-backend` now exports `withEntityWrite` / `withEntityRemove` (from the entity barrel) and each domain's thin, well-named wrappers (`writeIncidentEntity`, `writeMaintenanceEntity`, satellite's `mirror`, …) delegate to it, so the branch lives in exactly one place. Behavior is unchanged.
268
+
269
+ `writeHealthEntity` (healthcheck-backend) is intentionally NOT migrated onto the helper — it is genuinely bespoke (closure-captured durable state, distinct rethrow-vs-fail-soft branches, a per-system serializer, and it returns the computed state). SLO keeps its fail-soft `onError` wrapper around the shared guard.
270
+
271
+ - 270ef29: Fix several correctness defects around distributed coordination and stored-data handling.
272
+
273
+ - Dwell `for:` timers now fire via an atomic `DELETE ... RETURNING` claim, so two pods (or the stalled sweeper vs the queue consumer) can no longer both fire the same dwell.
274
+ - Postgres session-level advisory locks now keep connection affinity. A shared `AdvisoryLockService` (backed by a dedicated pooled client) replaces the previous acquire/release-on-different-connection pattern that leaked locks. Used by the script-packages installer election, the automation run resume + stalled sweeper, and (via a new transaction-scoped `withXactLock`) incident dedup.
275
+ - A storage migration that crashed mid-flight is now resumed on startup under the installer-election lock, instead of permanently wedging installs.
276
+ - Distributed script-package blobs carry a `blobSha256` and are verified before extraction (the SRI `integrity` hashes the npm tarball, not the transported archive). Backward-safe: entries without the field skip verification until a re-install regenerates the manifest.
277
+ - Archive extraction rejects zip-slip paths (absolute or `..` entries) before writing anything.
278
+ - `incident.create` with `dedupe_open_for_system` serializes its check-then-create per system, so concurrent triggers for the same system can't both open a duplicate incident.
279
+ - Seeded auto-incident filter expressions JSON-encode interpolated ids so a quote/backslash can't corrupt the expression.
280
+ - Stored jsonb snapshots (dwell `actorSnapshot`, wait-lock `waitConfig`) are validated with zod on load and degrade safely instead of flowing through as the wrong type.
281
+
282
+ - b995afb: Fix the `Automation` kind showing an empty "Additional Schemas" section in the GitOps Entity Kind Registry. The spec-schema documentation for `triggers[].config` and `actions[].config` was registered with `conditions` pointing at the `triggers[].event` / `actions[].action` discriminators. Those discriminators have no variant-selector group of their own in the kind browser, so the conditions could never be satisfied and every entry was filtered out (the section rendered empty even though the docs were registered).
283
+
284
+ The trigger/action config docs are now emitted as standalone variants (no `conditions`), mirroring how Healthcheck surfaces its primary `config` (strategy) field. Each field now renders its own variant dropdown so operators can browse every trigger and provider-action config schema directly.
285
+
286
+ - b995afb: Move health-check flapping configuration from the per-assignment notification policy onto the `healthcheck.flapping_detected` automation trigger.
287
+
288
+ Flapping thresholds (`transitions`, `windowMinutes`) are now configured on the trigger itself, next to the automation that reacts to them, instead of on each check assignment. The health-check executor still owns the windowed transition counting (it writes `health_check_unhealthy_transitions` and runs the window query), but it now SOURCES the thresholds from the subscribed automations' trigger config:
289
+
290
+ - On a transition-to-unhealthy it records the transition unconditionally (keeping history warm), then looks up the enabled automations subscribed to `healthcheck.flapping_detected`, collects the distinct set of configured windows, counts transitions once per distinct window, and emits one `healthcheck.flapping_detected` per window. The trigger's exact-window `evaluateConfig` gate then fires each automation only for its own window and transition threshold.
291
+ - A missing or partial flapping trigger config defaults to `{ transitions: 3, windowMinutes: 60 }`, so automations created before the trigger carried config keep working unchanged.
292
+ - `automation-backend` exposes a new backend-only, read-only `automationSubscriptionsRef` service ref (`findEnabledByTriggerEvent`) so a plugin that owns a trigger's underlying event can discover its subscribers' trigger config. It is never browser-exposed.
293
+
294
+ **BREAKING CHANGES**
295
+
296
+ - The per-assignment `notificationPolicy.flappingTrigger` field is removed. `NotificationPolicy` is now `{ suppressDeEscalations }` only. Stored rows that still carry a `flappingTrigger` key parse cleanly - the key is stripped on read - so no data migration is required, but the per-check flapping toggle/threshold in the assignment Notifications tab is gone; configure flapping on the trigger instead.
297
+ - The GitOps `System.healthcheck[].notificationPolicy.flappingTrigger` field is removed. A `flappingTrigger` block in a manifest is ignored. Move the thresholds to the `transitions` / `windowMinutes` config of your `healthcheck.flapping_detected` automation trigger.
298
+ - The standalone `enabled` flag for flapping is gone: flapping is "enabled" precisely when at least one enabled automation subscribes to `healthcheck.flapping_detected`. With no subscriber, the transition is still recorded but nothing is counted or emitted.
299
+
300
+ - b995afb: Fix four reactive-automation-engine defects in the `wait_until` / entity-change dispatch path.
301
+
302
+ - **Lost-wakeup re-evaluate-on-registration guard (HIGH, data-loss race).** `executeWaitUntil` evaluated its condition, then committed the wait lock + wake-index rows with NO re-evaluation after arming. An `ENTITY_CHANGED` for a relevant ref landing in that arm window was routed by Stage-1 against a not-yet-visible lock, enqueued no wake job, and — for a no-timeout wait (`timeoutAt` null, skipped by the sweeper) — the run stalled permanently (silent run leak). After arming the lock the engine now re-evaluates ONCE against freshly re-enriched scope; if the condition already holds it deletes the lock (its wake-index rows cascade) and continues the run inline. Idempotent via the lock delete + the per-run advisory lock.
303
+
304
+ - **Wildcard health wake drops the changed system (MEDIUM, correctness).** `reEnrichWaitScope` resolved health only for the trigger `contextKey` + `uses_state` ids and excluded the changed ref from health resolution. A wildcard health wait (`health:*`) woken by `health:sysX` — where `sysX` was neither the contextKey nor in `uses_state` — never had `scope.health.systems[sysX]` populated, so the condition read stale/empty state and failed to resume. The changed system's concrete id is now injected into health resolution during a wildcard wake.
305
+
306
+ - **`changeId` for dispatch dedup (LOW, correctness).** The Stage-2 trigger `jobId` embedded `changed.occurredAt` (millisecond granularity), so two DISTINCT changes to the same entity within one millisecond collapsed onto one job (the second run silently dropped). `EntityChangedSchema` gains an additive, back-compatible `changeId` (generated ONCE at emit time so it travels with redeliveries of the same change); the Stage-2 jobId now uses `changed.changeId` (falling back to `occurredAt` for legacy payloads). Redeliveries of one change still dedup; two real changes stay distinct.
307
+
308
+ - **Run-originated `mutate` returns the unmasked next state (LOW, correctness).** `handle.mutate` returned the `maskForRun`-masked next state, contradicting its "returns the resulting state" contract. Masking is now confined to the emitted `ENTITY_CHANGED` payload and the `entity_transitions` rows only; `mutate` returns the unmasked, zod-validated resulting state.
309
+
310
+ BREAKING CHANGES: none. The `changeId` field is additive and optional; all changes are behavior-preserving except where they fix the defects above.
311
+
312
+ - b995afb: Restore the documented domain payload fields on entity-driven automation triggers.
313
+
314
+ Migrated triggers declare domain-named `payloadSchema`s (incident `incidentId`; health `systemId` / `previousStatus`; catalog `systemId` / `changedFields`; dependency `dependencyId`), but Stage-2 dispatch built `trigger.payload` from the generic entity-change shape (`{ kind, id, prev, next, delta, ...next }`). Operator filters and templates reading `trigger.payload.incidentId` / `.systemId` / `.previousStatus` silently resolved to `undefined` — a regression vs the legacy hook payloads.
315
+
316
+ Changes:
317
+
318
+ - `@checkstack/automation-backend`: `registerChangeDeriver` now accepts an optional per-kind `toPayload(changed) => Record<string, unknown>` mapper (at most one per kind; a second distinct mapper throws). Stage-2's `changedToPayload` uses the registered mapper to build `trigger.payload` so it matches the kind's declared `payloadSchema`, falling back to the generic change shape for kinds without a mapper. New exported type `EntityChangePayloadMapper`.
319
+ - `@checkstack/incident-backend`, `@checkstack/healthcheck-backend`, `@checkstack/catalog-backend`, `@checkstack/dependency-backend`: implement and register a `toPayload` for each entity-driven kind so `trigger.payload` carries the legacy domain keys again.
320
+
321
+ Descriptive incident payload fields not derivable from the reactive entity state (`title`, `description`, `createdAt`, `resolvedAt`) are now OPTIONAL on the incident trigger `payloadSchema`s — they were always absent from an entity-driven payload.
322
+
323
+ - b995afb: Remove the legacy per-assignment auto-incident system. Auto-incidents are now built entirely by user-authored automations; nothing is seeded or hardcoded.
324
+
325
+ What was removed:
326
+
327
+ - The one-time migration that auto-seeded "sustained unhealthy" and "flapping" default automations from each assignment's notification policy, plus the `listAutoIncidentPolicies` RPC it consumed.
328
+ - The seeder-only notification-policy settings and their UI: `autoOpenIncidentOnUnhealthy`, `useNotificationSuppression`, `skipDuringMaintenance`, `sustainedUnhealthyTrigger`, and `autoCloseAfterMinutes`. The assignment **Notifications** tab now exposes only the two live settings: **Suppress de-escalation notifications** and the **flapping-detection** thresholds.
329
+ - The dead `health_check_auto_incidents` table (no longer written or read; dropped via migration).
330
+
331
+ What is preserved: flapping detection (`healthcheck.flapping_detected`) and de-escalation suppression are unchanged. The `flappingTrigger` and `suppressDeEscalations` policy fields stay exactly as before.
332
+
333
+ > [!NOTE]
334
+ > One-time cleanup: an automation-backend migration deletes the historically auto-seeded incident automations (`managed_by LIKE 'auto-incident:%'`) from existing databases. This is intentional and destructive - those automations were no longer managed by anything. If you had edited a seeded automation and want to keep it, re-create it as a normal automation before upgrading. See the "Build auto-incident automations" guide for templates.
335
+
336
+ > [!IMPORTANT]
337
+ > NARROWING: `NotificationPolicySchema` is narrowed to `{ suppressDeEscalations, flappingTrigger }`. Stored rows that still carry the removed legacy keys parse cleanly - zod strips the unknown keys on read - so no data migration is required for the `system_health_checks.notification_policy` column. GitOps `notificationPolicy` specs that set the removed fields are no longer accepted for those keys.
338
+
339
+ - 270ef29: Add in-UI script testing for automation `run_script` / `run_shell` actions.
340
+
341
+ A new `testScript` RPC runs a TypeScript or shell script against an
342
+ editable, auto-seeded sample context using the same sandboxed runner the
343
+ real action uses, so operators can test scripts directly in the editor
344
+ without dispatching a whole automation. Surfaces beneath any script field
345
+ flagged `x-script-testable` via the new `ScriptTestPanel` /
346
+ `ContextSampleEditor` components in `@checkstack/ui` and the
347
+ `scriptTestRenderer` prop threaded through `DynamicForm`.
348
+
349
+ - `@checkstack/automation-common`: adds the `testScript` contract +
350
+ `ScriptTest*` schemas (gated by `automation.manage`).
351
+ - `@checkstack/automation-backend`: implements `testScript` reusing the
352
+ shared ESM / shell runners; central-only, time-bounded.
353
+ - `@checkstack/backend-api`: new `x-script-testable` config-schema
354
+ metadata propagated to the frontend JSON Schema.
355
+ - `@checkstack/ui`: new `ScriptTestPanel` + `ContextSampleEditor`
356
+ components and a `scriptTestRenderer` prop on `DynamicForm`.
357
+ - `@checkstack/automation-frontend`: wires the test panel into the action
358
+ editor.
359
+ - `@checkstack/integration-script-backend`: marks the `run_script` /
360
+ `run_shell` script fields as testable.
361
+
362
+ - 270ef29: Extend in-UI script testing to health-check collectors, and add
363
+ load-from-run replay for automation script tests.
364
+
365
+ - Health-check collectors: a new `testCollectorScript` RPC runs the
366
+ inline-script (TypeScript) collector and the shell `script` collector
367
+ against an editable, auto-seeded sample context using the same
368
+ sandboxed runner the real collector uses. Surfaces beneath the
369
+ collector script fields in the collector editor (both marked
370
+ `x-script-testable`). Gated by `healthcheck.configuration.manage`.
371
+ - Automation replay: a new `getRunScopeForReplay` RPC reconstructs an
372
+ editable test context from a real run (trigger + persisted artifacts,
373
+ plus the durable scope snapshot when the run is still in-flight), and
374
+ the script-test panel gains a "Load from run" picker that seeds the
375
+ sample context from a past run.
376
+
377
+ Note: health-check executions do not persist the script / config /
378
+ check / system that produced a result, so there is no health-check
379
+ replay - auto-seed is the only context source for collector tests. This
380
+ is by design; see the feature plan.
381
+
382
+ - 270ef29: Activate npm packages in script execution: thread the managed
383
+ `resolutionRoot` into every user-script call site so an allowlisted package
384
+ can actually be `import`ed.
385
+
386
+ - `@checkstack/backend-api`: the ESM runner now always writes a per-run
387
+ `bunfig.toml` with `[install] auto = "disable"` and runs with that dir as
388
+ CWD. Without this Bun silently auto-installs any imported package from the
389
+ registry (verified), defeating the allowlist; with it, imports resolve
390
+ only against the reconciled `current/node_modules` (when a `resolutionRoot`
391
+ is set) and otherwise fail fast.
392
+ - `@checkstack/script-packages-backend`: `resolveResolutionRoot` /
393
+ `resolveResolutionRootFromStore` / `resolveResolutionRootForHost` decide a
394
+ host's resolution-root status (`none` / `ready` / `notReady`) from the
395
+ local `<store>/current`.
396
+ - `run_script` (integration-script-backend), the inline-script collector
397
+ (healthcheck-script-backend, core + satellite), and the in-UI `testScript`
398
+ / `testCollectorScript` endpoints all resolve the root per run and pass it
399
+ to the runner; `run_script` surfaces a clear "npm packages not ready"
400
+ error when configured-but-unsynced. Shell paths are unaffected (no module
401
+ resolution).
402
+
403
+ An opt-in end-to-end test (`CHECKSTACK_E2E_NETWORK=1`) proves an allowlisted
404
+ package imports successfully through the real `run_script` action execute
405
+ path; the non-network degradation tests always run.
406
+
407
+ BREAKING CHANGES: `@checkstack/backend-api`'s `defaultEsmScriptRunner` now
408
+ always disables Bun auto-install for the user subprocess. A script that
409
+ previously relied on Bun silently fetching an un-vendored package from the
410
+ registry at import time will now fail to resolve it. This is intentional -
411
+ package availability is governed by the admin allowlist - but any caller
412
+ depending on the old implicit auto-install behavior must add the package to
413
+ the allowlist instead. The new `EsmScriptRunOptions.resolutionRoot` field is
414
+ optional and additive (defaults to today's `os.tmpdir()` behavior when
415
+ unset), so the runner API itself is source-compatible.
416
+
417
+ - 270ef29: Add the Secrets platform (Phase 1): a central, plugin-agnostic secret manager with a pluggable backend extension point, a cross-plugin resolver service, and a universal Jenkins-style masking layer.
418
+
419
+ - New packages: `secrets-common` (schemas, contract, `secrets.read`/`secrets.manage`, masking utils), `secrets-backend` (`SecretBackend` extension point, `secretResolverRef`/`secretAdminRef` services, run-scoped masking context, RPC router), `secrets-backend-local` (default AES-256-GCM backend, owns the `secrets` table promoted from gitops), `secrets-frontend` (admin Settings page).
420
+ - Resolution machinery (`resolveSecretsBySchema`, `SecretStore`, `${{ secrets.NAME }}` / `x-secret`) is promoted out of `gitops-backend` into `secrets-backend`. GitOps now resolves and manages secrets through the platform's service refs (single source of truth); its secret table is migrated without loss.
421
+ - Universal masking seam wired at the central script-output boundaries: automation `run_script` / `run_shell` artifacts and the in-UI test panel redact run-scoped secret values from `result`/`stdout`/`stderr`/`error` before persist/return. Phase 1 resolves no run-scoped secrets yet, so masking is a no-op until Phase 2; the seam guarantees the boundary exists.
422
+ - No endpoint returns a secret value to a browser: DTOs expose only name/metadata/`hasValue`.
423
+
424
+ BREAKING CHANGES: `gitops-backend` now depends on `secrets-backend` and resolves/manages secrets through it. The `secrets` table is owned by `secrets-backend-local`; the gitops `secrets` table is retained as a migration source but is no longer the source of truth.
425
+
426
+ - 270ef29: Secrets platform Phase 2: secret -> env-var mapping with central resolve, inject, and mask.
427
+
428
+ - Script consumers declare a least-privilege `secretEnv` allowlist
429
+ (`{ ENV_NAME: "${{ secrets.NAME }}" }`). The automation `run_script` /
430
+ `run_shell` actions resolve ONLY the declared secrets via
431
+ `secretResolverRef.resolveForRun`, inject them into the runner env for
432
+ that run (memory-only; the ESM runner gained a per-run `env` option), and
433
+ mask their values out of stdout/stderr/result/error via the run-scoped
434
+ masking context. A missing required secret fails the run clearly. No
435
+ ambient secret access.
436
+ - Test panel: `testScript` / `testCollectorScript` inject named
437
+ `__SECRET_<NAME>__` placeholders by default, or user-supplied per-secret
438
+ overrides; real production values are never resolved in the test path,
439
+ and overrides are masked out of the result.
440
+ - Healthcheck collectors carry the `secretEnv` field for authoring +
441
+ the test panel; runtime injection on satellites lands in Phase 3.
442
+ - Editor UX: a new `@checkstack/ui` `SecretEnvEditor` renders `x-secret-env`
443
+ record fields with `${{ secrets.* }}` name autocomplete (from
444
+ `listSecretNames`), wired into the automation action editor and the
445
+ healthcheck collector editor. New `withConfigMeta` helper +
446
+ `x-secret-env` config-meta key in `@checkstack/backend-api`.
447
+
448
+ - 270ef29: Secrets platform Phase 5c: run-wide secret masking at the automation persistence choke point.
449
+
450
+ Every step writes `result_payload` / `error_message` (and the run writes a
451
+ run-level `error_message`) to `automation_run_steps` / `automation_runs`.
452
+ Previously only the script-action and satellite-collector output paths were
453
+ masked, so a provider HTTP error that embedded a resolved connection
454
+ credential could reach the run-detail UI unmasked.
455
+
456
+ Now the dispatch run accumulates every secret value it resolves
457
+ (`RunSecretRegistry`) by wrapping each run's `getService` so the secret
458
+ resolver (`resolveSecret` / `resolveForRun` / `resolveBySchema`) and the
459
+ connection store (`getConnectionWithCredentials`) register their resolved
460
+ values — least-privilege (only what this run resolved), in memory only,
461
+ dropped when the run goes terminal. The run-state store masks step + run
462
+ output with these values BEFORE persistence, so every downstream read / DTO
463
+ / run-detail page is masked by construction across ALL actions. The
464
+ existing script / satellite-collector source-side masking is kept as
465
+ defense in depth.
466
+
467
+ - b995afb: Add an optional `partitionBy` override to the windowed-count trigger gate.
468
+
469
+ A trigger's `window` block now accepts `partitionBy`, a bare expression (same flavour as `filter`, no `{{ }}`) that controls the key the occurrence count is bucketed by. When omitted, the gate keys by the trigger's built-in context key exactly as before (per system for health triggers), so existing automations are unchanged. When set, the expression is evaluated against the same trigger scope `filter` uses and coerced to a string - e.g. `trigger.payload.severity` for a per-severity rate, or `trigger.payload.systemId + ":" + trigger.payload.checkId` for a composite key. If the expression evaluates to null/undefined/empty or fails to evaluate, the gate falls back to the built-in context key (never global counting); eval errors are logged, matching the gate's fail-open posture.
470
+
471
+ Triggers can now declare `contextKeyLabel` (a UI hint, e.g. `"system"`) describing their built-in context dimension. It is surfaced through `TriggerInfo` so the editor's window "Partition by" field shows the default partition ("Leave blank to count per system", or "per automation" when a trigger has no context key). The healthcheck system triggers (`system_health_changed`, `system_degraded`, `system_healthy`, `check_failed`) and the built-in `numeric_state` trigger set it to `"system"`. This is a pure UI hint with no runtime behaviour.
472
+
473
+ The automation editor's window block gains a "Partition by" expression input (reusing the trigger filter's `trigger.payload.*` autocomplete), and the collapsed trigger card summary shows the partition when set.
474
+
475
+ - b995afb: Add a generic windowed-count / rate trigger gate, and express flapping detection on it.
476
+
477
+ Any trigger can now carry a `window: { count, minutes, refire }` block: the automation engine records each qualifying occurrence (after the structured config gate and the operator's `filter`) in a durable append log and counts rows within the trailing sliding window, scoped per context key (e.g. per system). `refire: "every"` (default) fires on every occurrence at/over the threshold; `refire: "once"` fires only on the crossing edge and re-arms as old occurrences age out. The gate runs in `maybeStartRun` after `filter` and before the `for:` dwell, so it composes with both.
478
+
479
+ Flapping is now an instance of this mechanism rather than a bespoke detector. The healthcheck `system_health_changed` raw change event plus a `filter` (`trigger.payload.newStatus != "healthy"`) plus `window: { count: 3, minutes: 60, refire: "once" }` reproduces flapping in the engine.
480
+
481
+ State-and-scale: window state lives in the new `automation_window_events` Postgres table (FK-cascade on the automation, the same delete-lifecycle as `automation_dwell_timers`). The count is read with pure SQL so every pod computes the same answer; the work-queue claim gives exactly one INSERT per emission, so there is no double-count. Rows older than the 24h schema cap are pruned by the existing stalled-sweeper. The `once` policy is best-effort under at-least-once redelivery (a redelivered emission can skip the exact crossing edge; `every` is redelivery-tolerant).
482
+
483
+ **BREAKING CHANGES:**
484
+
485
+ - The `healthcheck.flapping_detected` automation trigger and the `healthcheck.flapping_detected` hook are REMOVED. Flapping is now detected by the windowed-count gate on the `healthcheck.system_health_changed` trigger (`window` block, `refire: "once"`).
486
+ - Flapping is now PER-SYSTEM (the aggregated `health` entity), not per-`(system, configuration)`. Subscribe to `check_failed` with a `window` instead if you need per-check rate detection.
487
+ - The healthcheck `health_check_unhealthy_transitions` table is DROPPED (the per-check flapping audit log is no longer kept; counting moved into the engine).
488
+ - The backend-only `automation.subscriptions` service ref (`automationSubscriptionsRef` / `AutomationSubscriptions`) is REMOVED. The engine enumerates subscribers internally and the window gate runs per-automation inside `maybeStartRun`, so the external read-ref is no longer needed.
489
+ - Existing user-created flapping automations are AUTO-MIGRATED on boot: any trigger on `healthcheck.flapping_detected` is rewritten to `healthcheck.system_health_changed` + the canonical unhealthy-transition filter + `window: { count: transitions ?? 3, minutes: windowMinutes ?? 60, refire: "once" }`, dropping the old `config`. A pre-existing trigger filter is replaced with the canonical one (logged per row). An enabled automation that still references the removed event after migration logs a warning.
490
+
491
+ ### Patch Changes
492
+
493
+ - Updated dependencies [270ef29]
494
+ - Updated dependencies [b995afb]
495
+ - Updated dependencies [b995afb]
496
+ - Updated dependencies [270ef29]
497
+ - Updated dependencies [270ef29]
498
+ - Updated dependencies [270ef29]
499
+ - Updated dependencies [270ef29]
500
+ - Updated dependencies [270ef29]
501
+ - Updated dependencies [270ef29]
502
+ - Updated dependencies [270ef29]
503
+ - Updated dependencies [270ef29]
504
+ - Updated dependencies [270ef29]
505
+ - Updated dependencies [b995afb]
506
+ - Updated dependencies [270ef29]
507
+ - Updated dependencies [b995afb]
508
+ - Updated dependencies [270ef29]
509
+ - Updated dependencies [b995afb]
510
+ - Updated dependencies [b995afb]
511
+ - Updated dependencies [b995afb]
512
+ - Updated dependencies [270ef29]
513
+ - Updated dependencies [270ef29]
514
+ - Updated dependencies [270ef29]
515
+ - Updated dependencies [270ef29]
516
+ - Updated dependencies [270ef29]
517
+ - Updated dependencies [b995afb]
518
+ - Updated dependencies [b995afb]
519
+ - Updated dependencies [270ef29]
520
+ - Updated dependencies [270ef29]
521
+ - Updated dependencies [270ef29]
522
+ - Updated dependencies [b995afb]
523
+ - Updated dependencies [270ef29]
524
+ - Updated dependencies [b995afb]
525
+ - Updated dependencies [270ef29]
526
+ - Updated dependencies [270ef29]
527
+ - Updated dependencies [b995afb]
528
+ - Updated dependencies [270ef29]
529
+ - Updated dependencies [270ef29]
530
+ - Updated dependencies [270ef29]
531
+ - Updated dependencies [270ef29]
532
+ - Updated dependencies [270ef29]
533
+ - Updated dependencies [b995afb]
534
+ - Updated dependencies [b995afb]
535
+ - Updated dependencies [b995afb]
536
+ - @checkstack/backend-api@0.19.0
537
+ - @checkstack/gitops-common@0.5.0
538
+ - @checkstack/gitops-backend@0.4.0
539
+ - @checkstack/automation-common@0.3.0
540
+ - @checkstack/healthcheck-common@1.4.0
541
+ - @checkstack/template-engine@0.3.0
542
+ - @checkstack/script-packages-backend@0.2.0
543
+ - @checkstack/secrets-common@0.1.0
544
+ - @checkstack/command-backend@0.1.32
545
+ - @checkstack/queue-api@0.3.7
546
+
3
547
  ## 0.2.0
4
548
 
5
549
  ### Minor Changes
@@ -0,0 +1,17 @@
-- Creates the "automation_dwell_timers" table, then adds its foreign key and indexes.
-- "context_key" and "armed_status" are the only nullable columns; every other column is NOT NULL.
-- NOTE(review): "fire_at" / "created_at" are plain `timestamp` (no time zone) - presumably UTC; confirm against the writer.
1
+ CREATE TABLE "automation_dwell_timers" (
2
+ "id" text PRIMARY KEY NOT NULL,
3
+ "automation_id" text NOT NULL,
4
+ "trigger_id" text NOT NULL,
5
+ "event_id" text NOT NULL,
6
+ "context_key" text,
7
+ "armed_status" text,
8
+ "payload_snapshot" jsonb NOT NULL,
9
+ "actor_snapshot" jsonb NOT NULL,
10
+ "fire_at" timestamp NOT NULL,
11
+ "created_at" timestamp DEFAULT now() NOT NULL
12
+ );
13
+ --> statement-breakpoint
-- Deleting a row in "automations" cascades to its dwell-timer rows (ON DELETE cascade).
14
+ ALTER TABLE "automation_dwell_timers" ADD CONSTRAINT "automation_dwell_timers_automation_id_automations_id_fk" FOREIGN KEY ("automation_id") REFERENCES "automations"("id") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint
-- NOTE(review): "context_key" is nullable. Assuming PostgreSQL defaults, NULLs count as distinct in a
-- unique index, so this index does not stop duplicate (automation_id, trigger_id) rows whose
-- context_key is NULL - confirm that is intended.
15
+ CREATE UNIQUE INDEX "automation_dwell_timers_key_unique" ON "automation_dwell_timers" USING btree ("automation_id","trigger_id","context_key");--> statement-breakpoint
16
+ CREATE INDEX "automation_dwell_timers_fire_at_idx" ON "automation_dwell_timers" USING btree ("fire_at");--> statement-breakpoint
-- NOTE(review): "automation_id" is already the leading column of the unique index in this migration;
-- this separate index may be redundant - verify before relying on it.
17
+ CREATE INDEX "automation_dwell_timers_automation_idx" ON "automation_dwell_timers" USING btree ("automation_id");
@@ -0,0 +1,2 @@
-- Adds a nullable jsonb "wait_config" column to "automation_wait_locks" (no default is declared, so
-- existing rows hold NULL) and a btree index on the table's "kind" column.
-- The "kind" column itself is defined outside this migration.
1
+ ALTER TABLE "automation_wait_locks" ADD COLUMN "wait_config" jsonb;--> statement-breakpoint
2
+ CREATE INDEX "automation_wait_locks_kind_idx" ON "automation_wait_locks" USING btree ("kind");