npm - @checkstack/automation-backend - Versions diffs - 0.2.0 → 0.3.0 - Mend

@checkstack/automation-backend 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (125)

package/CHANGELOG.md +544 -0
package/drizzle/0003_sparkling_xorn.sql +17 -0
package/drizzle/0004_cultured_spyke.sql +2 -0
package/drizzle/0005_classy_the_hand.sql +19 -0
package/drizzle/0006_burly_wallop.sql +10 -0
package/drizzle/0007_nappy_jackal.sql +1 -0
package/drizzle/0008_remove_seeded_auto_incident_automations.sql +13 -0
package/drizzle/0009_steady_liz_osborn.sql +12 -0
package/drizzle/0010_chunky_changeling.sql +2 -0
package/drizzle/meta/0003_snapshot.json +1007 -0
package/drizzle/meta/0004_snapshot.json +1028 -0
package/drizzle/meta/0005_snapshot.json +1164 -0
package/drizzle/meta/0006_snapshot.json +1261 -0
package/drizzle/meta/0007_snapshot.json +1215 -0
package/drizzle/meta/0008_snapshot.json +1215 -0
package/drizzle/meta/0009_snapshot.json +1328 -0
package/drizzle/meta/0010_snapshot.json +1349 -0
package/drizzle/meta/_journal.json +56 -0
package/package.json +23 -12
package/src/action-types.ts +23 -0
package/src/artifact-store.ts +16 -1
package/src/automation-store.test.ts +143 -0
package/src/automation-store.ts +30 -8
package/src/builtin-triggers.test.ts +77 -74
package/src/builtin-triggers.ts +105 -108
package/src/dispatch/action-kind.ts +2 -0
package/src/dispatch/assemble-get-service.ts +31 -0
package/src/dispatch/cancel-resurrect.test.ts +147 -0
package/src/dispatch/concurrency-race.test.ts +255 -0
package/src/dispatch/concurrency-scope.test.ts +166 -0
package/src/dispatch/condition.ts +24 -5
package/src/dispatch/dwell-queue.ts +65 -0
package/src/dispatch/dwell-store.ts +154 -0
package/src/dispatch/dwell.it.test.ts +142 -0
package/src/dispatch/dwell.test.ts +799 -0
package/src/dispatch/dwell.ts +257 -0
package/src/dispatch/engine.test.ts +189 -2
package/src/dispatch/engine.ts +555 -9
package/src/dispatch/entity-scope.test.ts +176 -0
package/src/dispatch/get-service-wiring.test.ts +318 -0
package/src/dispatch/numeric.test.ts +71 -0
package/src/dispatch/numeric.ts +96 -0
package/src/dispatch/render.test.ts +34 -0
package/src/dispatch/render.ts +31 -11
package/src/dispatch/reseed-run-secrets.ts +230 -0
package/src/dispatch/run-secret-registry.test.ts +189 -0
package/src/dispatch/run-secret-registry.ts +247 -0
package/src/dispatch/run-state-masking.test.ts +376 -0
package/src/dispatch/run-state-store.ts +95 -38
package/src/dispatch/run-state.ts +226 -59
package/src/dispatch/scope-artifact-masking.test.ts +138 -0
package/src/dispatch/secret-ref-ids.test.ts +19 -0
package/src/dispatch/secret-ref-ids.ts +17 -0
package/src/dispatch/snapshots.test.ts +86 -0
package/src/dispatch/snapshots.ts +79 -0
package/src/dispatch/stage1-router.test.ts +324 -0
package/src/dispatch/stage1-router.ts +152 -0
package/src/dispatch/stage1.it.test.ts +84 -0
package/src/dispatch/stage2-dispatch.test.ts +285 -0
package/src/dispatch/stage2-dispatch.ts +207 -0
package/src/dispatch/stage2-stalled.it.test.ts +132 -0
package/src/dispatch/stalled-sweeper.test.ts +197 -0
package/src/dispatch/stalled-sweeper.ts +112 -5
package/src/dispatch/state-scope.test.ts +234 -0
package/src/dispatch/state-scope.ts +322 -0
package/src/dispatch/structured-conditions.test.ts +246 -0
package/src/dispatch/structured-conditions.ts +146 -0
package/src/dispatch/test-fixtures.ts +306 -38
package/src/dispatch/trigger-fanin.test.ts +111 -0
package/src/dispatch/trigger-subscriber.ts +316 -14
package/src/dispatch/types.ts +263 -8
package/src/dispatch/wait-timeout-queue.ts +89 -0
package/src/dispatch/wait-until-entity-wake.test.ts +544 -0
package/src/dispatch/wait-until.test.ts +540 -0
package/src/dispatch/wake-refs.test.ts +158 -0
package/src/dispatch/wake-refs.ts +348 -0
package/src/dispatch/window-gate.test.ts +513 -0
package/src/dispatch/window-store.test.ts +162 -0
package/src/dispatch/window-store.ts +102 -0
package/src/entity/change-derivers.test.ts +148 -0
package/src/entity/change-derivers.ts +143 -0
package/src/entity/change-emitter.test.ts +66 -0
package/src/entity/change-emitter.ts +76 -0
package/src/entity/create-handle.ts +344 -0
package/src/entity/cross-pod-read-consistency.it.test.ts +281 -0
package/src/entity/define-entity.ts +157 -0
package/src/entity/diff.test.ts +57 -0
package/src/entity/diff.ts +54 -0
package/src/entity/entity-store.test.ts +30 -0
package/src/entity/entity-store.ts +171 -0
package/src/entity/extension-point.ts +56 -0
package/src/entity/fake-entity-store.ts +130 -0
package/src/entity/hook.ts +19 -0
package/src/entity/index.ts +50 -0
package/src/entity/mutate-handle.test.ts +517 -0
package/src/entity/on-entity-changed.test.ts +189 -0
package/src/entity/on-entity-changed.ts +214 -0
package/src/entity/registry.test.ts +181 -0
package/src/entity/registry.ts +200 -0
package/src/entity/stable-stringify.test.ts +55 -0
package/src/entity/stable-stringify.ts +49 -0
package/src/entity/wake-index.it.test.ts +251 -0
package/src/entity/with-entity-write.test.ts +100 -0
package/src/entity/with-entity-write.ts +69 -0
package/src/entity-driven-trigger.ts +46 -0
package/src/extension-points.ts +35 -0
package/src/gitops-docs.test.ts +215 -0
package/src/gitops-docs.ts +151 -0
package/src/gitops-kinds.test.ts +174 -0
package/src/gitops-kinds.ts +137 -0
package/src/index.ts +355 -11
package/src/migration/flapping-to-window.test.ts +123 -0
package/src/migration/flapping-to-window.ts +205 -0
package/src/router.test.ts +182 -1
package/src/router.ts +73 -2
package/src/schema.ts +236 -3
package/src/script-test-replay.test.ts +88 -0
package/src/script-test-replay.ts +100 -0
package/src/script-test-shell-env.test.ts +41 -0
package/src/script-test-shell-env.ts +89 -0
package/src/script-test.test.ts +386 -0
package/src/script-test.ts +258 -0
package/src/trigger-registry.ts +2 -0
package/src/validate-definition.test.ts +1 -0
package/tsconfig.json +24 -0

package/src/dispatch/trigger-subscriber.ts — CHANGED

@@ -21,10 +21,14 @@ import type {
 import { SYSTEM_ACTOR, type Actor } from "@checkstack/common";
 import type { AutomationStore } from "../automation-store";
+import { evaluate, parseCondition } from "@checkstack/template-engine";
 import { dispatchTrigger, resumeRun } from "./engine";
 import { evaluateCondition } from "./condition";
 import { renderString } from "./render";
 import { buildInitialScope } from "./scope";
+import { enrichScopeWithState } from "./state-scope";
+import { armDwell, type StartRunFromDwell } from "./dwell";
 import type {
   DispatchDeps,
   LoadedAutomation,
@@ -147,7 +151,7 @@ export async function setupTriggerSubscriptions(
   };
 }
-interface HandleTriggerFiringArgs {
+export interface HandleTriggerFiringArgs {
   deps: DispatchDeps;
   automationStore: AutomationStore;
   qualifiedEventId: string;
@@ -156,7 +160,7 @@ interface HandleTriggerFiringArgs {
   contextKey: string | null;
 }
-async function handleTriggerFiring(
+export async function handleTriggerFiring(
   args: HandleTriggerFiringArgs,
 ): Promise<void> {
   // ── Step 1: resume any waiting runs ──
@@ -182,6 +186,65 @@ async function handleTriggerFiring(
       });
     }
   }
+  // ── Step 3: eager inverse-cancel ──
+  // A state-change event may be the natural inverse of an armed dwell
+  // (e.g. `system.healthy` cancels a `system.degraded` + for: dwell on
+  // the same automation + system). The expiry re-confirm would catch
+  // this anyway, but cancelling now deletes the dwell row so its queue
+  // job no-ops promptly instead of waking and re-checking later.
+  await cancelStaleDwells(args);
+}
+/**
+ * For every automation referencing the firing event with a `for:` dwell
+ * armed on the same context key, re-confirm the system's current status;
+ * if it no longer matches the dwell's `armedStatus`, cancel the dwell.
+ * Bounded to the matching automations and skipped entirely when no
+ * health client is wired (nothing to re-confirm against).
+ */
+async function cancelStaleDwells(
+  args: HandleTriggerFiringArgs,
+): Promise<void> {
+  const client = args.deps.healthCheckClient;
+  if (!client || args.contextKey === null) return;
+  const matches = await args.automationStore.findEnabledByTriggerEvent(
+    args.qualifiedEventId,
+  );
+  let currentStatus: string | undefined;
+  for (const automation of matches) {
+    for (const trigger of automation.definition.triggers) {
+      if (!trigger.for) continue;
+      const triggerId = trigger.id ?? deriveTriggerId(trigger);
+      const dwell = await args.deps.dwellStore.findByKey(
+        automation.id,
+        triggerId,
+        args.contextKey,
+      );
+      if (!dwell || dwell.armedStatus === null) continue;
+      // Resolve current status once per firing (cheap memoised lookup).
+      if (currentStatus === undefined) {
+        try {
+          const state = await client.getHealthState({
+            systemId: args.contextKey,
+          });
+          currentStatus = state.status;
+        } catch {
+          return; // can't re-confirm — leave the dwell for expiry.
+        }
+      }
+      if (currentStatus !== dwell.armedStatus) {
+        await args.deps.dwellStore.delete(dwell.id);
+        args.deps.logger.debug(
+          `Cancelled dwell ${dwell.id} (${automation.id}/${triggerId}): system ${args.contextKey} left status "${dwell.armedStatus}" (now "${currentStatus}")`,
+        );
+      }
+    }
+  }
 }
 async function wakeWaitingRuns(args: HandleTriggerFiringArgs): Promise<void> {
@@ -202,6 +265,12 @@ async function wakeWaitingRuns(args: HandleTriggerFiringArgs): Promise<void> {
           actor: args.actor,
           startedAt: new Date(),
         });
+        await enrichScopeWithState({
+          scope: ctx,
+          client: args.deps.healthCheckClient,
+          logger: args.deps.logger,
+          contextKey: args.contextKey,
+        });
         const pass = evaluateCondition(
           lock.filterTemplate,
           ctx,
@@ -237,6 +306,39 @@ async function wakeWaitingRuns(args: HandleTriggerFiringArgs): Promise<void> {
   }
 }
+/**
+ * Stage-2 entry (reactive automation engine §13.3): start fresh runs for ONE
+ * already-resolved automation whose trigger references `eventId`, using the
+ * entity-change as the trigger payload. Mirrors the per-automation inner of
+ * `handleTriggerFiring` step 2, but scoped to a single automation so the
+ * Stage-2 fan-out job (one automation + one entity change) runs in isolation.
+ *
+ * Each matching trigger goes through `maybeStartRun` (config gate, filter,
+ * dwell, concurrency mode) exactly as the hook-driven path does.
+ */
+export async function startRunsForAutomationEvent(args: {
+  deps: DispatchDeps;
+  automation: LoadedAutomation;
+  eventId: string;
+  triggerPayload: Record<string, unknown>;
+  actor: Actor;
+  contextKey: string | null;
+}): Promise<void> {
+  for (const trigger of args.automation.definition.triggers.filter(
+    (t) => t.event === args.eventId,
+  )) {
+    await maybeStartRun({
+      deps: args.deps,
+      automation: args.automation,
+      trigger,
+      triggerPayload: args.triggerPayload,
+      actor: args.actor,
+      contextKey: args.contextKey,
+      eventId: args.eventId,
+    });
+  }
+}
 interface MaybeStartRunArgs {
   deps: DispatchDeps;
   automation: LoadedAutomation;
@@ -248,20 +350,51 @@ interface MaybeStartRunArgs {
 }
 async function maybeStartRun(args: MaybeStartRunArgs): Promise<void> {
-  // Trigger-level filter check.
+  // Structured config gate (e.g. numeric_state's above/below threshold).
+  // Runs before the operator's template filter. A registered trigger that
+  // declares `evaluateConfig` decides per-automation whether this payload
+  // fires, using the trigger's typed `config`.
+  const registered = args.deps.registries.triggers.getTrigger(args.eventId);
+  if (registered?.evaluateConfig) {
+    let pass: boolean;
+    try {
+      pass = registered.evaluateConfig(
+        args.triggerPayload,
+        args.trigger.config,
+      );
+    } catch (error) {
+      args.deps.logger.warn(
+        `Trigger config gate threw; skipping firing: ${(error as Error).message}`,
+      );
+      return;
+    }
+    if (!pass) return;
+  }
+  // Trigger-level filter gates BOTH the immediate run and arming a dwell.
+  // (Conditions, by contrast, gate the run itself and are evaluated at
+  // fire time so a dwell re-checks them after the duration.)
   if (args.trigger.filter) {
-    const ctx = buildInitialScope({
+    const filterScope = buildInitialScope({
       triggerId: args.trigger.id ?? deriveTriggerId(args.trigger),
       triggerEventId: args.eventId,
       payload: args.triggerPayload,
       actor: args.actor,
       startedAt: new Date(),
     });
+    await enrichScopeWithState({
+      scope: filterScope,
+      client: args.deps.healthCheckClient,
+      logger: args.deps.logger,
+      contextKey: args.contextKey,
+      usesState: args.automation.definition.uses_state,
+      transitionWindowMinutes: args.automation.definition.state_window_minutes,
+    });
     let pass: boolean;
     try {
       pass = evaluateCondition(
         args.trigger.filter,
-        ctx,
+        filterScope,
         args.deps.filters,
       );
     } catch (error) {
@@ -273,20 +406,100 @@ async function maybeStartRun(args: MaybeStartRunArgs): Promise<void> {
     if (!pass) return;
   }
-  // Top-level conditions gate the run.
-  if (args.automation.definition.conditions.length > 0) {
-    const ctx = buildInitialScope({
+  // Windowed-count / rate gate — runs AFTER the structured config gate + the
+  // operator's `filter` (so only QUALIFYING occurrences count) and BEFORE the
+  // `for:` dwell (so a window can compose with a dwell). Records this
+  // occurrence in the durable append log and counts rows in the trailing
+  // window; fires per the re-fire policy.
+  //
+  // Cross-pod: the work-queue claim gives exactly one INSERT per emission, and
+  // the COUNT is a pure DB read, so every pod agrees on whether the threshold
+  // was crossed (state-and-scale rule). No process-local state.
+  if (args.trigger.window) {
+    // Partition key the count buckets by. Defaults to the trigger's built-in
+    // context key (e.g. systemId); `partitionBy` overrides it with a bare
+    // expression evaluated against the SAME scope `filter` uses. An
+    // empty/undefined result or an eval error falls back to the built-in key
+    // (never accidental global counting).
+    const partitionKey = await resolvePartitionKey(args);
+    let fired: boolean;
+    try {
+      fired = await args.deps.windowStore.recordAndCount({
+        automationId: args.automation.id,
+        triggerId: args.trigger.id ?? deriveTriggerId(args.trigger),
+        eventId: args.eventId,
+        contextKey: partitionKey,
+        occurredAt: new Date(),
+        windowMinutes: args.trigger.window.minutes,
+        threshold: args.trigger.window.count,
+        refire: args.trigger.window.refire,
+      });
+    } catch (error) {
+      args.deps.logger.warn(
+        `Trigger window gate failed; skipping firing: ${(error as Error).message}`,
+      );
+      return;
+    }
+    if (!fired) return;
+  }
+  // `for:` dwell — arm (or re-arm) instead of starting the run now. The
+  // run starts only if the matched state still holds after the duration.
+  if (args.trigger.for) {
+    await armDwell({
+      deps: args.deps,
+      automation: args.automation,
+      trigger: args.trigger,
       triggerId: args.trigger.id ?? deriveTriggerId(args.trigger),
+      eventId: args.eventId,
+      contextKey: args.contextKey,
+      triggerPayload: args.triggerPayload,
+      actor: args.actor,
+    });
+    return;
+  }
+  await startRunRespectingMode({
+    deps: args.deps,
+    automation: args.automation,
+    trigger: args.trigger,
+    triggerId: args.trigger.id ?? deriveTriggerId(args.trigger),
+    eventId: args.eventId,
+    contextKey: args.contextKey,
+    triggerPayload: args.triggerPayload,
+    actor: args.actor,
+  });
+}
+/**
+ * Evaluate the automation's pre-run conditions (against freshly-enriched
+ * scope) and, if they pass, dispatch a run honouring the concurrency
+ * mode. Shared by the immediate trigger path and the dwell-fire path
+ * (so a dwell re-checks conditions at expiry, not at arm time).
+ */
+export const startRunRespectingMode: StartRunFromDwell = async (args) => {
+  // Top-level conditions gate the run, evaluated against enriched scope.
+  if (args.automation.definition.conditions.length > 0) {
+    const gateScope = buildInitialScope({
+      triggerId: args.triggerId,
       triggerEventId: args.eventId,
       payload: args.triggerPayload,
       actor: args.actor,
       startedAt: new Date(),
     });
+    await enrichScopeWithState({
+      scope: gateScope,
+      client: args.deps.healthCheckClient,
+      logger: args.deps.logger,
+      contextKey: args.contextKey,
+      usesState: args.automation.definition.uses_state,
+      transitionWindowMinutes: args.automation.definition.state_window_minutes,
+    });
     for (const condition of args.automation.definition.conditions) {
       try {
         const pass = evaluateCondition(
           condition,
-          ctx,
+          gateScope,
           args.deps.filters,
         );
         if (!pass) return;
@@ -301,14 +514,14 @@ async function maybeStartRun(args: MaybeStartRunArgs): Promise<void> {
     automationId: args.automation.id,
     mode: args.automation.definition.mode,
     maxRuns: args.automation.definition.max_runs,
-    triggerId: args.trigger.id ?? deriveTriggerId(args.trigger),
+    triggerId: args.triggerId,
     triggerEventId: args.eventId,
     triggerPayload: args.triggerPayload,
     actor: args.actor,
     contextKey: args.contextKey,
     automation: args.automation,
   });
-}
+};
 interface RespectConcurrencyArgs {
   deps: DispatchDeps;
@@ -325,10 +538,43 @@ interface RespectConcurrencyArgs {
 async function respectConcurrencyMode(
   args: RespectConcurrencyArgs,
+): Promise<void> {
+  // Per the automation's concurrency scope, the active-run bucket is
+  // either the whole automation (`undefined` → no context filter) or just
+  // the incoming context key. Passing `undefined` keeps the original
+  // per-automation behaviour for the default scope.
+  const scopeKey =
+    args.automation.definition.concurrency_scope === "context_key"
+      ? args.contextKey
+      : undefined;
+  // Serialize the check-then-create. Without a lock, two concurrent fires
+  // (two trigger events, a dwell-fire racing a fresh fire, or two pods) can
+  // both read "no active run" and both `dispatchTrigger`, double-running a
+  // `single`-mode automation. The lock is keyed on (automationId, scope) so
+  // it doesn't serialize unrelated automations or distinct context keys.
+  const lockKey = `automation.concurrency:${args.automationId}:${
+    scopeKey ?? "@@all"
+  }`;
+  const run = args.deps.withConcurrencyLock
+    ? <T>(fn: () => Promise<T>) => args.deps.withConcurrencyLock!(lockKey, fn)
+    : <T>(fn: () => Promise<T>) => fn();
+  await run(async () => {
+    await respectConcurrencyModeInner(args, scopeKey);
+  });
+}
+async function respectConcurrencyModeInner(
+  args: RespectConcurrencyArgs,
+  scopeKey: string | null | undefined,
 ): Promise<void> {
   switch (args.mode) {
     case "single": {
-      const active = await args.deps.runStore.hasActiveRun(args.automationId);
+      const active = await args.deps.runStore.hasActiveRun(
+        args.automationId,
+        scopeKey,
+      );
       if (active) {
         args.deps.logger.debug(
           `Skipping trigger for ${args.automationId} — single mode and a run is active`,
@@ -338,7 +584,10 @@ async function respectConcurrencyMode(
       break;
     }
     case "parallel": {
-      const count = await args.deps.runStore.countActiveRuns(args.automationId);
+      const count = await args.deps.runStore.countActiveRuns(
+        args.automationId,
+        scopeKey,
+      );
       if (count >= args.maxRuns) {
         args.deps.logger.debug(
           `Skipping trigger for ${args.automationId} — parallel limit reached (${count}/${args.maxRuns})`,
@@ -352,7 +601,10 @@ async function respectConcurrencyMode(
       // queueing requires its own coordination queue, which we add in a
       // follow-up. Behaviour stays correct (no double-fire) under the
       // existing work-queue mode.
-      const count = await args.deps.runStore.countActiveRuns(args.automationId);
+      const count = await args.deps.runStore.countActiveRuns(
+        args.automationId,
+        scopeKey,
+      );
       if (count >= args.maxRuns) return;
       break;
     }
@@ -360,6 +612,7 @@ async function respectConcurrencyMode(
       const cancelled = await args.deps.runStore.cancelActiveRuns(
         args.automationId,
         "restart — superseded by newer trigger",
+        scopeKey,
       );
       if (cancelled.length > 0) {
         args.deps.logger.debug(
@@ -386,6 +639,55 @@ async function respectConcurrencyMode(
 // is convenient for future filter expressions.
 void renderString;
+/**
+ * Resolve the partition key the windowed-count gate buckets the occurrence
+ * count by.
+ *
+ *  - No `window.partitionBy` → the trigger's built-in context key
+ *    (`args.contextKey`, e.g. systemId). Existing behaviour, unchanged.
+ *  - `window.partitionBy` set → evaluate it as a BARE expression (same flavour
+ *    as `filter`, no `{{ }}`) against the SAME scope `filter` uses, then
+ *    coerce the result to a string.
+ *  - The evaluated value is null/undefined/empty, OR evaluation throws →
+ *    fall back to `args.contextKey` (never accidental global counting). An
+ *    eval error is logged, matching the gate's fail-open posture.
+ */
+async function resolvePartitionKey(
+  args: MaybeStartRunArgs,
+): Promise<string | null> {
+  const expression = args.trigger.window?.partitionBy;
+  if (expression === undefined) return args.contextKey;
+  try {
+    const scope = buildInitialScope({
+      triggerId: args.trigger.id ?? deriveTriggerId(args.trigger),
+      triggerEventId: args.eventId,
+      payload: args.triggerPayload,
+      actor: args.actor,
+      startedAt: new Date(),
+    });
+    await enrichScopeWithState({
+      scope,
+      client: args.deps.healthCheckClient,
+      logger: args.deps.logger,
+      contextKey: args.contextKey,
+      usesState: args.automation.definition.uses_state,
+      transitionWindowMinutes: args.automation.definition.state_window_minutes,
+    });
+    const value = evaluate(parseCondition(expression), scope, {
+      filters: args.deps.filters,
+    });
+    if (value === null || value === undefined) return args.contextKey;
+    const key = String(value).trim();
+    return key.length > 0 ? key : args.contextKey;
+  } catch (error) {
+    args.deps.logger.warn(
+      `Trigger window partitionBy failed to evaluate; falling back to the built-in context key: ${(error as Error).message}`,
+    );
+    return args.contextKey;
+  }
+}
 /**
  * Derive a stable trigger id from the trigger declaration when the
  * operator hasn't assigned one. Slugifies the event id; collisions