npm - @checkstack/backend-api - Versions diffs - 0.17.1 → 0.19.0 - Mend

@checkstack/backend-api 0.17.1 → 0.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

package/CHANGELOG.md +295 -0
package/package.json +10 -8
package/src/actor.test.ts +29 -0
package/src/actor.ts +27 -0
package/src/advisory-lock.it.test.ts +111 -0
package/src/advisory-lock.test.ts +132 -0
package/src/advisory-lock.ts +174 -0
package/src/collector-strategy.ts +21 -0
package/src/core-services.ts +7 -0
package/src/esm-script-runner.test.ts +93 -1
package/src/esm-script-runner.ts +53 -2
package/src/event-bus-types.ts +13 -4
package/src/hooks.ts +14 -1
package/src/index.ts +2 -0
package/src/plugin-system.ts +21 -2
package/src/schema-utils.test.ts +44 -0
package/src/schema-utils.ts +6 -0
package/src/zod-config.ts +33 -0

package/CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,300 @@
 # @checkstack/backend-api
+## 0.19.0
+### Minor Changes
+- 270ef29: Fix automation provider actions and `secretEnv` script actions throwing in production.
+  The automation dispatch engine resolved provider-action dependencies (the integration connection store, the secret resolver) through a `getService` that was a throwing stub, so Jira / Teams / Webex actions and `secretEnv` script actions threw at execute time in production. The whole dispatch test suite stubbed `getService`, so the break was invisible.
+  Root cause: the plugin `env` exposed `registerService` but no resolver, so the dispatch path (the only context that resolves arbitrary cross-plugin refs outside an RPC handler) had nothing real to call.
+  Changes:
+  - `@checkstack/backend-api`: add `getService<S>(ref: ServiceRef<S>): Promise<S>` to the plugin `env` (`BackendPluginRegistry`). It resolves a service registered by any plugin through the real `ServiceRegistry` using the calling plugin's identity, and throws a clear error if the ref is not registered (never silently `undefined`). **NEW PLUGIN-AUTHOR CONTRACT**: `env.getService` is now available to resolve arbitrary cross-plugin service refs at init / afterPluginsReady time.
+  - `@checkstack/backend`: implement `env.getService` in both the plugin loader and the runtime single-plugin registration path, backed by `ServiceRegistry.get(ref, { pluginId })`.
+  - `@checkstack/automation-backend`: wire the dispatch `getService` to `env.getService` (was a throwing stub). This also activates run-wide provider-credential masking, because resolving the connection store / secret resolver now flows through the run's masking interceptor.
+  Also fixes a test-only seam where the `core/backend` test preload registered a no-op `registerRouter`, silently disabling oRPC router registration across the suite.
+- 270ef29: Fix suspend/resume durability + complete the run-wide secret-masking guarantee.
+  A panel review confirmed several defects in the automation dispatch engine's suspend/resume durability and in the run-wide masking choke point. These survived because the unit suite stubbed the seam under test; the fixes ship with tests that exercise the real suspend / sweep / resume paths.
+  Suspend/resume durability:
+  - **Stalled sweeper no longer re-runs intentional waits.** `findStalledRunIds` now joins `automation_runs` and returns only `status = 'running'` runs, and suspend-finalisation no longer clobbers the run's `lastActionPath` checkpoint to `null`. Previously any wait longer than the stale window (>60s) was re-walked from the top every sweep cycle, re-firing pre-wait side effects and leaking wait locks. The wait-aware sweeps now also run before the stalled-run sweep.
+  - **Stalled recovery refuses a run holding a live wait lock.** `recoverStalledRun` now only recovers a genuinely-`running` run with no wait lock; a crash-mid-wait recovery is left to the wait/resume paths instead of re-walking from the top and creating a duplicate lock + duplicate delay job.
+  - **Cancelled runs can no longer resurrect.** `resumeRun` guards on `status === 'waiting'` (mirroring `checkWaitUntil`) and drops any stale lock for a non-waiting run, so `wakeWaitingRuns` / delay-expiry / a racing queue job can't wake a cancelled or terminal run. `cancelActiveRuns` (restart mode) now deletes the cancelled runs' wait locks + run-state in the same operation.
+  - **Concurrency check-then-create is serialized.** The `mode` check + `createRun` now run under a transaction-scoped advisory lock keyed on `(automationId, scope)`, so two concurrent fires can't both pass a `single`-mode "no active run" check and double-run.
+  Masking guarantee (now genuinely covers scope + artifacts):
+  - **The run-wide masking choke point now also masks the durable scope snapshot and produced artifacts.** The `RunSecretRegistry` is threaded into `RunStateStore.upsert` (masks `scopeSnapshot`) and `ArtifactStore.record` (masks `data`) so a resolved connection credential threaded into `scope.variables` or surfaced into an artifact is redacted before persist - and therefore cannot reach a read-only user via `getRunScopeForReplay`. **GUARANTEE CHANGE**: run-wide masking now covers step output, run error, scope snapshot, and artifact data for every action.
+  - **`testConnection` / `testProviderConnection` mask provider errors.** These RPCs run outside a dispatch run, so they build a per-call mask set from the resolved/submitted connection config and run any provider error through it before returning, so a provider error echoing a token can't cross back to the browser.
+  - **Short secrets surface a warning.** `setSecret` now warns when a value is shorter than `MIN_MASKABLE_LENGTH` (4), because such a value cannot be auto-redacted (the threshold is intentionally not lowered).
+  Internal:
+  - `@checkstack/backend-api`: `withXactLock`'s `fn` now receives the transaction handle `tx` so a critical section can run on the locked connection; the doc clarifies why running on the pool inside the lock window is still safe. The incident dedup caller's comment is corrected accordingly. `RunStore` gains `findWaitLocksByRun`.
+- 270ef29: Fix several correctness defects around distributed coordination and stored-data handling.
+  - Dwell `for:` timers now fire via an atomic `DELETE ... RETURNING` claim, so two pods (or the stalled sweeper vs the queue consumer) can no longer both fire the same dwell.
+  - Postgres session-level advisory locks now keep connection affinity. A shared `AdvisoryLockService` (backed by a dedicated pooled client) replaces the previous acquire/release-on-different-connection pattern that leaked locks. Used by the script-packages installer election, the automation run resume + stalled sweeper, and (via a new transaction-scoped `withXactLock`) incident dedup.
+  - A storage migration that crashed mid-flight is now resumed on startup under the installer-election lock, instead of permanently wedging installs.
+  - Distributed script-package blobs carry a `blobSha256` and are verified before extraction (the SRI `integrity` hashes the npm tarball, not the transported archive). Backward-safe: entries without the field skip verification until a re-install regenerates the manifest.
+  - Archive extraction rejects zip-slip paths (absolute or `..` entries) before writing anything.
+  - `incident.create` with `dedupe_open_for_system` serializes its check-then-create per system, so concurrent triggers for the same system can't both open a duplicate incident.
+  - Seeded auto-incident filter expressions JSON-encode interpolated ids so a quote/backslash can't corrupt the expression.
+  - Stored jsonb snapshots (dwell `actorSnapshot`, wait-lock `waitConfig`) are validated with zod on load and degrade safely instead of flowing through as the wrong type.
+- b995afb: Harden the advisory-lock service against holder-connection termination.
+  A session-level advisory lock is held on a dedicated checked-out pool client.
+  If that backend is terminated (admin kill, failover, network drop) while the
+  lock is held, `pg` emits an `'error'` on the client; with no listener attached
+  that error is re-thrown by the EventEmitter and crashes the pod. The service
+  now attaches an error listener to the held client so the loss degrades
+  gracefully - the session lock is auto-released server-side when the backend
+  dies, and the key simply becomes acquirable again.
+  Also de-flaked the advisory-lock integration test: it now terminates only the
+  lock-holding backend (found via `pg_locks`) instead of every backend in the
+  database - the old blanket kill also tore down the pool's idle connections,
+  whose async errors flaked the run and left the pool unusable.
+- 270ef29: Add in-UI script testing for automation `run_script` / `run_shell` actions.
+  A new `testScript` RPC runs a TypeScript or shell script against an
+  editable, auto-seeded sample context using the same sandboxed runner the
+  real action uses, so operators can test scripts directly in the editor
+  without dispatching a whole automation. Surfaces beneath any script field
+  flagged `x-script-testable` via the new `ScriptTestPanel` /
+  `ContextSampleEditor` components in `@checkstack/ui` and the
+  `scriptTestRenderer` prop threaded through `DynamicForm`.
+  - `@checkstack/automation-common`: adds the `testScript` contract +
+    `ScriptTest*` schemas (gated by `automation.manage`).
+  - `@checkstack/automation-backend`: implements `testScript` reusing the
+    shared ESM / shell runners; central-only, time-bounded.
+  - `@checkstack/backend-api`: new `x-script-testable` config-schema
+    metadata propagated to the frontend JSON Schema.
+  - `@checkstack/ui`: new `ScriptTestPanel` + `ContextSampleEditor`
+    components and a `scriptTestRenderer` prop on `DynamicForm`.
+  - `@checkstack/automation-frontend`: wires the test panel into the action
+    editor.
+  - `@checkstack/integration-script-backend`: marks the `run_script` /
+    `run_shell` script fields as testable.
+- 270ef29: Activate npm packages in script execution: thread the managed
+  `resolutionRoot` into every user-script call site so an allowlisted package
+  can actually be `import`ed.
+  - `@checkstack/backend-api`: the ESM runner now always writes a per-run
+    `bunfig.toml` with `[install] auto = "disable"` and runs with that dir as
+    CWD. Without this Bun silently auto-installs any imported package from the
+    registry (verified), defeating the allowlist; with it, imports resolve
+    only against the reconciled `current/node_modules` (when a `resolutionRoot`
+    is set) and otherwise fail fast.
+  - `@checkstack/script-packages-backend`: `resolveResolutionRoot` /
+    `resolveResolutionRootFromStore` / `resolveResolutionRootForHost` decide a
+    host's resolution-root status (`none` / `ready` / `notReady`) from the
+    local `<store>/current`.
+  - `run_script` (integration-script-backend), the inline-script collector
+    (healthcheck-script-backend, core + satellite), and the in-UI `testScript`
+    / `testCollectorScript` endpoints all resolve the root per run and pass it
+    to the runner; `run_script` surfaces a clear "npm packages not ready"
+    error when configured-but-unsynced. Shell paths are unaffected (no module
+    resolution).
+  An opt-in end-to-end test (`CHECKSTACK_E2E_NETWORK=1`) proves an allowlisted
+  package imports successfully through the real `run_script` action execute
+  path, with non-network degradation tests running always.
+  BREAKING CHANGES: `@checkstack/backend-api`'s `defaultEsmScriptRunner` now
+  always disables Bun auto-install for the user subprocess. A script that
+  previously relied on Bun silently fetching an un-vendored package from the
+  registry at import time will now fail to resolve it. This is intentional -
+  package availability is governed by the admin allowlist - but any caller
+  depending on the old implicit auto-install behavior must add the package to
+  the allowlist instead. The new `EsmScriptRunOptions.resolutionRoot` field is
+  optional and additive (defaults to today's `os.tmpdir()` behavior when
+  unset), so the runner API itself is source-compatible.
+- 270ef29: Add the per-host script-package reconciler and the runner resolution root.
+  - `@checkstack/backend-api`: `EsmScriptRunOptions.resolutionRoot` - when
+    set, the per-run temp dir is created inside it so module resolution walks
+    up to `<resolutionRoot>/node_modules` and user scripts can `import`
+    managed npm packages. Defaults to today's `os.tmpdir()` behavior when
+    unset (backward-compatible; isolation unchanged - the subprocess still
+    only sees `SAFE_ENV_VARS`).
+  - `@checkstack/script-packages-backend`: content-addressed cache archive
+    (tar+gzip per package), pure delta diff (`computeMissingBlobs`), atomic
+    `current` symlink swap, the host reconciler (`reconcileToHash` -
+    idempotent: pull only missing blobs, materialize a versioned tree via
+    `bun install --offline`, atomically flip `current`), the concrete fs/Bun
+    adapter, the central install resolver, and the `script-packages.changed`
+    broadcast hook. An opt-in end-to-end test
+    (`CHECKSTACK_E2E_NETWORK=1`) proves resolve -> publish -> cold reconcile
+    (no registry) -> offline materialize -> import.
+- 270ef29: Secrets platform Phase 2: secret -> env-var mapping with central resolve, inject, and mask.
+  - Script consumers declare a least-privilege `secretEnv` allowlist
+    (`{ ENV_NAME: "${{ secrets.NAME }}" }`). The automation `run_script` /
+    `run_shell` actions resolve ONLY the declared secrets via
+    `secretResolverRef.resolveForRun`, inject them into the runner env for
+    that run (memory-only; the ESM runner gained a per-run `env` option), and
+    mask their values out of stdout/stderr/result/error via the run-scoped
+    masking context. A missing required secret fails the run clearly. No
+    ambient secret access.
+  - Test panel: `testScript` / `testCollectorScript` inject named
+    `__SECRET_<NAME>__` placeholders by default, or user-supplied per-secret
+    overrides; real production values are never resolved in the test path,
+    and overrides are masked out of the result.
+  - Healthcheck collectors carry the `secretEnv` field for authoring +
+    the test panel; runtime injection on satellites lands in Phase 3.
+  - Editor UX: a new `@checkstack/ui` `SecretEnvEditor` renders `x-secret-env`
+    record fields with `${{ secrets.* }}` name autocomplete (from
+    `listSecretNames`), wired into the automation action editor and the
+    healthcheck collector editor. New `withConfigMeta` helper +
+    `x-secret-env` config-meta key in `@checkstack/backend-api`.
+- 270ef29: Secrets platform Phase 3: just-in-time secret delivery to satellites + source-side masking, and central-execution injection for healthcheck collectors.
+  - New satellite WS messages `request_run_secrets` / `run_secrets`: just
+    before a satellite runs a collector that declares a `secretEnv`, it asks
+    core for that collector's resolved env; core resolves ONLY the secrets the
+    collector's OWN persisted assignment declares (least-privilege — the
+    satellite cannot choose) and replies with the env map (or a clear error).
+    The satellite injects it memory-only for the run and drops it on
+    completion. Secrets never ride the persisted assignment and never touch
+    disk.
+  - Source-side masking: the satellite runs `maskSecrets` over the collector's
+    stdout/stderr/result/error using the run's delivered values BEFORE the
+    result leaves the satellite (defense in depth).
+  - `CollectorStrategy.execute` gains an optional `secretEnv`. The
+    inline-script and shell collectors inject it into the runner
+    (`process.env` / `$VAR`) and mask the values out of their output.
+  - Healthcheck collectors running centrally (the queue executor) also resolve
+    and inject `secretEnv` via `secretResolverRef`, closing the gap where a
+    centrally-run secretEnv collector got no secrets. A missing required
+    secret fails the run clearly in all paths.
+### Patch Changes
+- Updated dependencies [270ef29]
+- Updated dependencies [270ef29]
+- Updated dependencies [270ef29]
+- Updated dependencies [b995afb]
+- Updated dependencies [b995afb]
+- Updated dependencies [270ef29]
+- Updated dependencies [270ef29]
+  - @checkstack/healthcheck-common@1.4.0
+  - @checkstack/cache-api@0.3.7
+  - @checkstack/queue-api@0.3.7
+## 0.18.0
+### Minor Changes
+- 6d52276: feat(automation): expose `trigger.actor` so automations can filter on who/what caused an event
+  Every platform event now carries an **actor** - the user, application (API
+  client), service (backend-to-backend), or `system` (background /
+  unauthenticated) that caused it - and the automation engine surfaces it to
+  automations as `trigger.actor`. This lets a trigger filter gate on the
+  origin of the event it reacts to:
+  ```text
+  {{ trigger.actor.type == "system" }}      # auto-created by the platform
+  {{ trigger.actor.type == "user" }}         # a human
+  {{ trigger.actor.id == "app-deploybot" }}  # a specific application
+  ```
+  `trigger.actor` is available on **every** trigger - it is injected by the
+  platform, not declared per trigger - and editor autocomplete + Run Script
+  context types include `trigger.actor.{type,id,name}`.
+  How it works:
+  - **`@checkstack/common`** adds the canonical `Actor` type / `ActorSchema`
+    and `SYSTEM_ACTOR`.
+  - **`@checkstack/backend-api`** adds `resolveActor(user)` and a
+    `HookEventMeta` envelope. The hook listener / `onHook` signature gains an
+    optional second `meta` argument (additive, backward compatible).
+  - **`@checkstack/backend`** wraps emitted hooks in an envelope so the actor
+    travels with the payload through the distributed queue, unwrapping it
+    before delivery. The RPC emit path captures the authenticated caller;
+    background emits default to the system actor. Raw/legacy queue data is
+    treated as a system-actor payload, so delivery stays backward compatible.
+  - **`@checkstack/automation-backend`** threads the actor into the dispatch
+    scope (`trigger.actor`), available to trigger filters, top-level
+    conditions, and all run templates, and persisted in the run's scope
+    snapshot. Manual runs are attributed to the invoking user.
+  - **`@checkstack/automation-common`** / **`@checkstack/automation-frontend`**
+    expose `trigger.actor` in the editor variable scope and the generated
+    Run Script `context.trigger.actor` types.
+  No database migration and no per-trigger schema changes: the actor rides as
+  event-envelope metadata and in the run scope snapshot.
+- 35bc682: feat(healthcheck): expose check + system run-context to script collectors
+  Script health checks can now read which check and system a run is for.
+  Previously shell scripts got only a curated env whitelist and inline
+  scripts only `context.config`, so a script had no built-in way to know
+  its own check name or the system it was checking.
+  - `@checkstack/backend-api`: new `CollectorRunContext` type
+    (`{ check: { id, name, intervalSeconds }, system: { id, name } }`) and
+    an optional `runContext` param on `CollectorStrategy.execute`. Optional,
+    so existing collector implementations are unaffected.
+  - Shell-script collector: injects reserved `CHECKSTACK_CHECK_ID`,
+    `CHECKSTACK_CHECK_NAME`, `CHECKSTACK_CHECK_INTERVAL_SECONDS`,
+    `CHECKSTACK_SYSTEM_ID`, `CHECKSTACK_SYSTEM_NAME` env vars (user-supplied
+    `env` still wins on collision).
+  - Inline-script collector: exposes `context.check` and `context.system`
+    alongside `context.config`; the inline-script editor now types them for
+    autocomplete.
+  - Shell editors (health-check collectors and automation shell actions) now
+    also suggest the user's own `env` (JSON) keys as `$NAME` completions, via
+    the new exported `customShellEnvVars` helper. Keys that aren't valid shell
+    identifiers are omitted.
+  - Fix: the Typefox `CodeEditor` captured a stale `onChange` at editor start,
+    so editing one `DynamicForm` field reverted sibling fields changed since
+    mount (e.g. typing in a shell `script` field wiped an unsaved `env` value,
+    or deleted a sibling automation action added after mount). The change
+    handler now routes through a ref to the current `onChange`.
+  - Fix: focusing a JSON editor threw "LanguageStatusService.addStatus is not
+    supported" because the standalone service set omitted `ILanguageStatusService`.
+    That one service is now registered via `serviceOverrides`.
+  - Fix: the automation trigger card nested a `<Badge>` (a `<div>`) inside a
+    `<p>`, producing a `validateDOMNesting` warning. Switched the wrapper to a
+    `<div>`.
+  - Local runs (`queue-executor`) and satellite runs both populate the
+    context. `SatelliteAssignment` (and the `getAssignmentsForSatellite`
+    RPC output) gained optional `configName` / `systemName` so the metadata
+    reaches satellite-side execution; `HealthCheckService` resolves the
+    system name via the catalog client.
+  BREAKING CHANGE: `createHealthCheckRouter` now requires a `catalogClient`
+  option (used to resolve system names for satellite assignments). Update
+  call sites to pass the catalog RPC client.
+### Patch Changes
+- Updated dependencies [6d52276]
+- Updated dependencies [35bc682]
+  - @checkstack/common@0.12.0
+  - @checkstack/healthcheck-common@1.3.0
+  - @checkstack/signal-common@0.2.5
+  - @checkstack/cache-api@0.3.6
+  - @checkstack/queue-api@0.3.6
 ## 0.17.1
 ### Patch Changes

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@checkstack/backend-api",
-  "version": "0.17.1",
+  "version": "0.19.0",
   "license": "Elastic-2.0",
   "type": "module",
   "main": "./src/index.ts",
@@ -10,11 +10,11 @@
     "lint:code": "eslint . --max-warnings 0"
   },
   "dependencies": {
-    "@checkstack/common": "0.11.0",
-    "@checkstack/healthcheck-common": "1.1.2",
-    "@checkstack/cache-api": "0.3.4",
-    "@checkstack/queue-api": "0.3.4",
-    "@checkstack/signal-common": "0.2.4",
+    "@checkstack/common": "0.12.0",
+    "@checkstack/healthcheck-common": "1.3.0",
+    "@checkstack/cache-api": "0.3.6",
+    "@checkstack/queue-api": "0.3.6",
+    "@checkstack/signal-common": "0.2.5",
     "@orpc/client": "^1.13.14",
     "@orpc/contract": "^1.13.14",
     "@orpc/openapi": "^1.13.2",
@@ -26,9 +26,11 @@
     "zod": "^4.2.1"
   },
   "devDependencies": {
-    "@types/bun": "latest",
+    "@checkstack/scripts": "0.3.4",
     "@checkstack/tsconfig": "0.0.7",
-    "@checkstack/scripts": "0.3.3"
+    "@types/bun": "latest",
+    "@types/pg": "^8.20.0",
+    "pg": "^8.21.0"
   },
   "peerDependencies": {
     "hono": "^4.12.14",

package/src/actor.test.ts ADDED Viewed

@@ -0,0 +1,29 @@
+import { describe, it, expect } from "bun:test";
+import { SYSTEM_ACTOR } from "@checkstack/common";
+import { resolveActor } from "./actor";
+describe("resolveActor", () => {
+  it("falls back to the system actor when there is no caller", () => {
+    expect(resolveActor(undefined)).toEqual(SYSTEM_ACTOR);
+  });
+  it("maps a real (human) user", () => {
+    expect(
+      resolveActor({ type: "user", id: "user-1", name: "Nico" }),
+    ).toEqual({ type: "user", id: "user-1", name: "Nico" });
+  });
+  it("maps an application (API client)", () => {
+    expect(
+      resolveActor({ type: "application", id: "app-deploybot", name: "Deploy Bot" }),
+    ).toEqual({ type: "application", id: "app-deploybot", name: "Deploy Bot" });
+  });
+  it("maps a service to its originating plugin id", () => {
+    expect(resolveActor({ type: "service", pluginId: "healthcheck" })).toEqual({
+      type: "service",
+      id: "healthcheck",
+      name: "healthcheck",
+    });
+  });
+});

package/src/actor.ts ADDED Viewed

@@ -0,0 +1,27 @@
+import { SYSTEM_ACTOR, type Actor } from "@checkstack/common";
+import type { AuthUser } from "./types";
+/**
+ * Resolve the canonical platform {@link Actor} for an event from the
+ * authenticated caller. Background / unauthenticated emits (no `user`)
+ * resolve to the system actor, so every emitted event carries an actor.
+ *
+ * - {@link RealUser}        -> `{ type: "user" }`
+ * - {@link ApplicationUser} -> `{ type: "application" }`
+ * - {@link ServiceUser}     -> `{ type: "service", id: pluginId }`
+ * - `undefined`             -> {@link SYSTEM_ACTOR}
+ */
+export function resolveActor(user?: AuthUser): Actor {
+  if (!user) return SYSTEM_ACTOR;
+  switch (user.type) {
+    case "user": {
+      return { type: "user", id: user.id, name: user.name };
+    }
+    case "application": {
+      return { type: "application", id: user.id, name: user.name };
+    }
+    case "service": {
+      return { type: "service", id: user.pluginId, name: user.pluginId };
+    }
+  }
+}

package/src/advisory-lock.it.test.ts ADDED Viewed

@@ -0,0 +1,111 @@
+/**
+ * Integration test (real Postgres) for the advisory-lock service.
+ *
+ * This is part of the surgical integration lane (plan §14.4 #1). It pins the
+ * one behaviour fakes cannot model faithfully: Postgres session-level advisory
+ * locks are tied to the DB *connection* that acquired them, so the holding
+ * client must be the same one that releases — and killing the holding
+ * connection must auto-release the lock.
+ *
+ * Gated behind `CHECKSTACK_IT=1` so the default `bun test` never runs it. The
+ * `integration` CI job sets that flag and provides a real Postgres service
+ * container. Connection comes from `CHECKSTACK_IT_PG_URL` (defaulting to the
+ * `docker-compose-dev.yml` Postgres port).
+ */
+import { afterAll, beforeAll, describe, expect, it } from "bun:test";
+import { Pool } from "pg";
+import { createAdvisoryLockService } from "./advisory-lock";
+const PG_URL =
+  process.env.CHECKSTACK_IT_PG_URL ??
+  "postgres://postgres:postgres@localhost:5432/postgres";
+describe.skipIf(!process.env.CHECKSTACK_IT)(
+  "advisory-lock (real Postgres)",
+  () => {
+    let pool: Pool;
+    beforeAll(() => {
+      pool = new Pool({ connectionString: PG_URL });
+      // A pooled client can error asynchronously while idle (e.g. its backend
+      // is terminated by the kill test below). pg emits that on the pool; with
+      // no handler it surfaces as an unhandled "Connection terminated
+      // unexpectedly" error that fails the whole test file. Swallowing idle-
+      // client errors is the documented pg pattern - the tests still assert
+      // behaviour through fresh checkouts.
+      pool.on("error", () => {});
+    });
+    afterAll(async () => {
+      await pool.end();
+    });
+    it("a second tryAcquire of the same key returns null until release", async () => {
+      const service = createAdvisoryLockService(pool);
+      const key = `it-advisory-lock:${crypto.randomUUID()}`;
+      const first = await service.tryAcquire(key);
+      expect(first).not.toBeNull();
+      // The lock is held — a concurrent acquire of the SAME key must fail.
+      const second = await service.tryAcquire(key);
+      expect(second).toBeNull();
+      // After release, a third acquire succeeds.
+      await first?.release();
+      const third = await service.tryAcquire(key);
+      expect(third).not.toBeNull();
+      await third?.release();
+    });
+    it("killing the holding connection auto-releases the lock", async () => {
+      const service = createAdvisoryLockService(pool);
+      const key = `it-advisory-lock:${crypto.randomUUID()}`;
+      // Acquire on a dedicated client owned by the handle.
+      const held = await service.tryAcquire(key);
+      expect(held).not.toBeNull();
+      // While held, the key is unavailable.
+      const blocked = await service.tryAcquire(key);
+      expect(blocked).toBeNull();
+      // Terminate the backend holding the advisory lock - found via
+      // `pg_locks` - from a fresh connection. Dropping that session makes
+      // Postgres auto-release the lock. We deliberately do NOT kill every other
+      // backend (the old approach): that also terminated the pool's idle
+      // connections, whose async "connection terminated" errors flaked the test
+      // and left the pool unusable. The handle holds exactly one advisory lock,
+      // so this targets precisely the holder - NOTE: the query matches any other
+      // backend with an advisory-lock entry in `pg_locks`, so this assumes no
+      // other session on the server holds an advisory lock while the test runs.
+      const killer = await pool.connect();
+      try {
+        await killer.query(
+          `SELECT pg_terminate_backend(pid)
+             FROM pg_locks
+            WHERE locktype = 'advisory'
+              AND pid <> pg_backend_pid()`,
+        );
+      } finally {
+        killer.release();
+      }
+      // The lock should now be acquirable again. Retry briefly because the
+      // server takes a moment to reap the terminated backend's session locks.
+      let reacquired: Awaited<ReturnType<typeof service.tryAcquire>> = null;
+      for (let attempt = 0; attempt < 20 && reacquired === null; attempt++) {
+        reacquired = await service.tryAcquire(key);
+        if (reacquired === null) {
+          await new Promise((resolve) => setTimeout(resolve, 50));
+        }
+      }
+      expect(reacquired).not.toBeNull();
+      await reacquired?.release();
+      // The `held` handle still owns its (now-terminated) client. Release it so
+      // the dead client is returned to the pool - otherwise `pool.end()` in
+      // afterAll blocks waiting for the checked-out client to drain. The unlock
+      // query runs against a dead connection and rejects; that's expected.
+      await held?.release().catch(() => {});
+    });
+  },
+);

package/src/advisory-lock.test.ts ADDED Viewed

@@ -0,0 +1,132 @@
+import { describe, it, expect } from "bun:test";
+import {
+  createAdvisoryLockService,
+  type AdvisoryLockPool,
+  type AdvisoryLockPoolClient,
+} from "./advisory-lock";
+/**
+ * In-memory stand-in for a `pg.Pool` that reproduces Postgres' SESSION
+ * advisory-lock rules, where a lock belongs to the connection that took it:
+ *
+ *   - at most one connection owns a given key at any time;
+ *   - `pg_try_advisory_lock` reports `ok: true` only when the key is free,
+ *     and then records the calling connection as the owner;
+ *   - `pg_advisory_unlock` frees the key only when issued by the owning
+ *     connection; on any other connection it changes nothing. That is the
+ *     exact failure mode under test: an unlock sent on the wrong connection
+ *     silently leaves the lock held.
+ *
+ * With these rules the suite can prove the service runs acquire and release
+ * on ONE client. `checkedOut` counts `connect()` calls and `released` counts
+ * `client.release()` calls.
+ */
+interface FakePool extends AdvisoryLockPool {
+  checkedOut: number;
+  released: number;
+}
+function makeFakePool(): FakePool {
+  // Lock table: key -> id of the connection that owns it (absent = free).
+  const lockOwners = new Map<string, number>();
+  const stats = { checkedOut: 0, released: 0 };
+  let connectionSeq = 0;
+  // `hashtextextended($1, 0)` is not modelled; the raw key string stands in
+  // for the hash, which is faithful because the SQL is deterministic per key.
+  const keyOf = (values: unknown[] | undefined): string => String(values?.[0]);
+  // Build the client for one checked-out connection, identified by `connId`.
+  const createClient = (connId: number): AdvisoryLockPoolClient => ({
+    async query<T>(queryText: string, values?: unknown[]) {
+      const key = keyOf(values);
+      if (queryText.includes("pg_try_advisory_lock")) {
+        // Granted only when nobody owns the key yet.
+        const ok = !lockOwners.has(key);
+        if (ok) {
+          lockOwners.set(key, connId);
+        }
+        return { rows: [{ ok } as unknown as T] };
+      }
+      if (queryText.includes("pg_advisory_unlock")) {
+        // Only the owner's unlock has any effect (this models the leak bug);
+        // the reply is `ok: true` either way.
+        const ownsKey = lockOwners.get(key) === connId;
+        if (ownsKey) {
+          lockOwners.delete(key);
+        }
+        return { rows: [{ ok: true } as unknown as T] };
+      }
+      // Any other statement yields an empty result set.
+      return { rows: [] };
+    },
+    release() {
+      stats.released += 1;
+    },
+    on() {
+      // This fake never emits async client errors. The real client's
+      // `on('error')` hardening is covered by the integration test against
+      // real Postgres (killing the holding connection).
+    },
+  });
+  return {
+    get checkedOut() {
+      return stats.checkedOut;
+    },
+    get released() {
+      return stats.released;
+    },
+    async connect(): Promise<AdvisoryLockPoolClient> {
+      const connId = connectionSeq;
+      connectionSeq += 1;
+      stats.checkedOut += 1;
+      return createClient(connId);
+    },
+  };
+}
+// Unit suite for the session-lock service, run against the connection-aware
+// fake pool so connection affinity between acquire and release is observable.
+describe("createAdvisoryLockService", () => {
+  // Every case starts from a fresh fake pool with a service wired to it.
+  const setup = () => {
+    const pool = makeFakePool();
+    return { pool, svc: createAdvisoryLockService(pool) };
+  };
+  it("acquire → second acquire fails while held → release → third acquire succeeds", async () => {
+    const { svc } = setup();
+    const holder = await svc.tryAcquire("k");
+    expect(holder).not.toBeNull();
+    // While the key is held, an attempt from another pooled connection must
+    // be refused.
+    const contender = await svc.tryAcquire("k");
+    expect(contender).toBeNull();
+    // The unlock has to run on the client that acquired; the bug being
+    // guarded against is a release that no-ops on a different connection.
+    await holder!.release();
+    const afterRelease = await svc.tryAcquire("k");
+    expect(afterRelease).not.toBeNull();
+    await afterRelease!.release();
+  });
+  it("returns the client to the pool on both the failed-acquire and release paths", async () => {
+    const { pool, svc } = setup();
+    const holder = await svc.tryAcquire("k");
+    // This attempt fails, so the service must hand its client straight back.
+    const refused = await svc.tryAcquire("k");
+    expect(refused).toBeNull();
+    await holder!.release();
+    // Two checkouts (one held then released, one refused then released)
+    // must be matched by two releases.
+    expect(pool.checkedOut).toBe(2);
+    expect(pool.released).toBe(2);
+  });
+  it("release is idempotent", async () => {
+    const { pool, svc } = setup();
+    const handle = await svc.tryAcquire("k");
+    await handle!.release();
+    // The second call must neither throw nor release the client again.
+    await handle!.release();
+    expect(pool.released).toBe(1);
+  });
+  it("different keys do not block each other", async () => {
+    const { svc } = setup();
+    const lockA = await svc.tryAcquire("a");
+    const lockB = await svc.tryAcquire("b");
+    expect(lockA).not.toBeNull();
+    expect(lockB).not.toBeNull();
+    await lockA!.release();
+    await lockB!.release();
+  });
+});

package/src/advisory-lock.ts ADDED Viewed

@@ -0,0 +1,174 @@
+/**
+ * Postgres advisory-lock helpers with correct connection affinity.
+ *
+ * Postgres session-level advisory locks (`pg_try_advisory_lock` /
+ * `pg_advisory_unlock`) are tied to the DB *session* (connection) that
+ * acquired them. The platform runs every plugin query through a
+ * schema-scoped proxy that wraps each statement in its own short
+ * transaction on a connection borrowed from the shared pool and returned
+ * immediately. Acquiring a session lock through that proxy therefore runs
+ * the lock on one pooled connection and the unlock on a *different* one —
+ * so the unlock no-ops and the lock leaks until the original connection is
+ * recycled. This module fixes that two ways:
+ *
+ *   - {@link AdvisoryLockService.tryAcquire} checks out ONE dedicated
+ *     client from the pool, acquires the session lock on it, and returns a
+ *     handle that owns that client. `release()` runs the unlock on the SAME
+ *     client and then returns it to the pool. Use this for long-held locks
+ *     (e.g. an installer election held across a minutes-long `bun install`)
+ *     where a long-open transaction would be unacceptable.
+ *
+ *   - {@link withXactLock} wraps acquire + work + release in a single
+ *     transaction using `pg_advisory_xact_lock`, which auto-releases at
+ *     COMMIT/ROLLBACK. Use this for SHORT critical sections (e.g. a
+ *     find-then-create dedup) where holding a transaction for the duration
+ *     is fine and the auto-release removes any chance of a leak.
+ *
+ * Keys are arbitrary strings hashed to Postgres' 64-bit lock space via
+ * `hashtextextended(key, 0)`. Callers SHOULD namespace keys (e.g.
+ * `"script-packages.installer"`, `"incident.dedupe:<systemId>"`) since the
+ * advisory-lock space is global to the database server, not schema-scoped.
+ */
+import { sql } from "drizzle-orm";
+import type { SafeDatabase } from "./plugin-system";
+/**
+ * Minimal pool surface this module needs. Modelled on `pg.Pool` /
+ * `pg.PoolClient` without importing `pg` directly so the helper stays a
+ * pure type-level contract; the backend wires in the real `adminPool`.
+ */
+export interface AdvisoryLockPoolClient {
+  /**
+   * Run one statement on this client's connection. `values` supplies the
+   * positional parameters (`$1`, ...) referenced by `queryText`; the resolved
+   * object exposes the returned rows typed as `T`.
+   */
+  query<T>(
+    queryText: string,
+    values?: unknown[],
+  ): Promise<{ rows: T[] }>;
+  /** Return the client to the pool. */
+  release(): void;
+  /**
+   * Subscribe to async client errors. A session-lock client is held for a long
+   * time; if its backend dies (admin termination, failover, network drop) `pg`
+   * emits `'error'` on the client, and an `'error'` with no listener is
+   * re-thrown by the EventEmitter and would crash the pod. We attach a listener
+   * so that loss degrades gracefully instead. Modelled on `pg.Client.on`.
+   */
+  on(event: "error", listener: (err: Error) => void): void;
+}
+/**
+ * Source of dedicated clients for session locks. Only `connect()` is needed:
+ * each call checks out one client, which the caller owns until it calls that
+ * client's `release()`.
+ */
+export interface AdvisoryLockPool {
+  /** Check out one client from the pool. */
+  connect(): Promise<AdvisoryLockPoolClient>;
+}
+/** A held session-level advisory lock that owns its dedicated client. */
+export interface AdvisoryLockHandle {
+  /**
+   * Release the lock (`pg_advisory_unlock` on the SAME client) and return
+   * the client to the pool. Idempotent: a second call is a no-op.
+   *
+   * In the implementation built by `createAdvisoryLockService`, a failing
+   * unlock query (e.g. the connection already died) still returns the client
+   * to the pool, and the rejection then propagates to the caller.
+   */
+  release(): Promise<void>;
+}
+/** Entry point for taking session-level advisory locks by string key. */
+export interface AdvisoryLockService {
+  /**
+   * Try to acquire a session-level advisory lock for `key` on a dedicated
+   * pooled client. Returns a handle on success, or `null` if the lock is
+   * already held (by this or another process). The handle owns the client
+   * until `release()` is called, so callers MUST always release in a
+   * `finally`.
+   *
+   * This call never waits for the lock: a contended key resolves to `null`
+   * immediately rather than blocking.
+   */
+  tryAcquire(key: string): Promise<AdvisoryLockHandle | null>;
+}
+/**
+ * Clients that already carry this module's no-op `'error'` listener.
+ *
+ * A pool that reuses client objects (as `pg.Pool` does) hands the same client
+ * back on later checkouts, and a listener added by the borrower is not removed
+ * when the client is released. Tracking hardened clients here guarantees the
+ * listener is attached at most once per client instead of once per
+ * `tryAcquire`, which would otherwise leak closures and eventually trip
+ * Node's MaxListenersExceededWarning. A WeakSet lets destroyed clients be
+ * garbage-collected.
+ */
+const errorHardenedClients = new WeakSet<AdvisoryLockPoolClient>();
+/**
+ * Build an {@link AdvisoryLockService} backed by a pool. The backend
+ * provides the real admin pool; tests can provide a faithful fake that
+ * models per-connection session-lock semantics.
+ *
+ * @param pool - Source of dedicated clients; one client is checked out per
+ *   `tryAcquire` call and held for as long as the lock is held.
+ * @returns A service whose `tryAcquire` resolves to a handle when the lock was
+ *   granted, resolves to `null` when the key is already locked, and rejects
+ *   when the checkout or the lock query fails.
+ */
+export function createAdvisoryLockService(
+  pool: AdvisoryLockPool,
+): AdvisoryLockService {
+  return {
+    async tryAcquire(key) {
+      const client = await pool.connect();
+      // A held session lock keeps this client checked out (not idle), so the
+      // pool's own error handler won't cover it. If this backend is terminated
+      // (admin kill / failover) while the lock is held, `pg` emits `'error'`
+      // here; without a listener the process crashes. Swallow it: the session
+      // lock is auto-released server-side when the backend dies, so the loss
+      // surfaces as the key simply becoming acquirable again. (A later
+      // `release()` on the dead client still returns it to the pool, but its
+      // unlock query rejects.) Attach the listener only once per client
+      // object so reused pooled clients do not accumulate listeners.
+      if (!errorHardenedClients.has(client)) {
+        errorHardenedClients.add(client);
+        client.on("error", () => {});
+      }
+      let acquired = false;
+      try {
+        const result = await client.query<{ ok: boolean }>(
+          "SELECT pg_try_advisory_lock(hashtextextended($1, 0)) AS ok",
+          [key],
+        );
+        acquired = Boolean(result.rows[0]?.ok);
+      } catch (error) {
+        // The lock query failed, so nothing is held: hand the client back
+        // before surfacing the error so the checkout cannot leak.
+        client.release();
+        throw error;
+      }
+      if (!acquired) {
+        // Did not get the lock — return the client immediately. (A failed
+        // pg_try_advisory_lock acquires nothing, so there is nothing to
+        // unlock.)
+        client.release();
+        return null;
+      }
+      // Guards `release()` so the unlock and the client hand-back each run at
+      // most once, even if the caller releases twice.
+      let released = false;
+      return {
+        async release() {
+          if (released) return;
+          released = true;
+          try {
+            // Must run on the SAME client that acquired: session locks are
+            // bound to the connection.
+            await client.query(
+              "SELECT pg_advisory_unlock(hashtextextended($1, 0))",
+              [key],
+            );
+          } finally {
+            // Always return the client, even when the unlock rejects; the
+            // rejection then propagates to the caller.
+            client.release();
+          }
+        },
+      };
+    },
+  };
+}
+/**
+ * Execute `fn` under a transaction-scoped advisory lock on `key`.
+ *
+ * A transaction is opened on `db`, `pg_advisory_xact_lock` is issued inside
+ * it (this call BLOCKS until the lock is granted), and then `fn` runs in the
+ * same transaction. Postgres drops the lock automatically at COMMIT or
+ * ROLLBACK, so no explicit unlock exists that could leak. The lock lasts for
+ * the whole transaction, so keep the critical section SHORT.
+ *
+ * The scoped DB runs an entire `transaction()` callback on one dedicated
+ * connection, so taking the lock, doing the work and the implicit release
+ * all happen on a single session, which is the affinity these locks need.
+ *
+ * `fn` is handed the transaction handle `tx` and MUST perform its
+ * read-then-write critical section through it, never through the outer
+ * pool. Work sent to the pool would land on a DIFFERENT connection from the
+ * one holding the lock, letting two concurrent callers interleave even
+ * though each "holds" the lock. Going through `tx` keeps the read-check and
+ * the write atomic with respect to the lock.
+ *
+ * @returns Whatever `fn` resolves to.
+ */
+export async function withXactLock<
+  S extends Record<string, unknown>,
+  T,
+>({
+  db,
+  key,
+  fn,
+}: {
+  db: SafeDatabase<S>;
+  key: string;
+  fn: (tx: Parameters<Parameters<SafeDatabase<S>["transaction"]>[0]>[0]) => Promise<T>;
+}): Promise<T> {
+  // Build the lock statement up front; `key` is bound as a query parameter.
+  const lockStatement = sql`SELECT pg_advisory_xact_lock(hashtextextended(${key}, 0))`;
+  return db.transaction(async (tx) => {
+    // Take the lock first so everything `fn` does is serialized on `key`.
+    await tx.execute(lockStatement);
+    const outcome = await fn(tx);
+    return outcome;
+  });
+}

package/src/collector-strategy.ts CHANGED Viewed

@@ -17,6 +17,15 @@ export interface CollectorResult<TResult> {
   error?: string;
 }
+/**
+ * Curated, read-only metadata about the health check + system a collector
+ * run is for. Metadata only - never secrets/config.
+ */
+export interface CollectorRunContext {
+  // The health check this run belongs to. NOTE(review): `intervalSeconds` is
+  // presumably the check's configured run interval - confirm against the
+  // producer of this context.
+  check: { id: string; name: string; intervalSeconds: number };
+  // The system the health check is attached to.
+  system: { id: string; name: string };
+}
 /**
  * Generic collector strategy interface.
  *
@@ -71,12 +80,24 @@ export interface CollectorStrategy<
    * @param params.config - Validated collector configuration
    * @param params.client - Connected transport client
    * @param params.pluginId - ID of the transport strategy invoking this collector
+   * @param params.runContext - Curated, read-only metadata about the health
+   *   check + system this run is for (metadata only, never secrets/config)
    * @returns Collector result with typed metadata
    */
   execute(params: {
     config: TConfig;
     client: TClient;
     pluginId: string;
+    runContext?: CollectorRunContext;
+    /**
+     * Resolved secret env for THIS run (the collector's declared
+     * `secretEnv` mapped to values), delivered just-in-time. Injected into
+     * the collector's script execution env and never persisted. Empty /
+     * absent when the collector declares no secrets. The collector is
+     * responsible for masking these values out of its returned output
+     * (source-side defense in depth).
+     */
+    secretEnv?: Record<string, string>;
   }): Promise<CollectorResult<TResult>>;
   /**

package/src/core-services.ts CHANGED Viewed

@@ -16,6 +16,7 @@ import type { PluginArtifactStore } from "./plugin-artifact-store";
 import type { EventBus } from "./event-bus-types";
 import type { WebSocketRouteRegistry } from "./ws-registry";
 import type { ReadinessRegistry } from "./readiness-registry";
+import type { AdvisoryLockService } from "./advisory-lock";
 export * from "./types";
@@ -66,4 +67,10 @@ export const coreServices = {
   readinessRegistry: createServiceRef<ReadinessRegistry>(
     "core.readinessRegistry",
   ),
+  /**
+   * Postgres advisory-lock service backed by a dedicated pooled client, so
+   * session-level locks keep connection affinity across acquire/release.
+   * See {@link AdvisoryLockService}.
+   */
+  advisoryLock: createServiceRef<AdvisoryLockService>("core.advisoryLock"),
 };

package/src/esm-script-runner.test.ts CHANGED Viewed

@@ -1,5 +1,9 @@
-import { describe, expect, it } from "bun:test";
+import { afterAll, beforeAll, describe, expect, it } from "bun:test";
+import { mkdtemp, mkdir, writeFile, rm } from "node:fs/promises";
+import { tmpdir } from "node:os";
+import path from "node:path";
 import {
+  defaultEsmScriptRunner,
   normaliseUserScript,
   rewriteHelperImports,
 } from "./esm-script-runner";
@@ -167,3 +171,91 @@ describe("rewriteHelperImports", () => {
     expect(out).toBe(`import x from "${HELPER_URL}";`);
   });
 });
+// Covers how `resolutionRoot` controls which npm packages a user script can
+// import when run through the default runner.
+describe("defaultEsmScriptRunner resolutionRoot", () => {
+  // Script shared by the resolution cases: re-export a value from `fake-pkg`.
+  const importFakePkgScript = `import { greeting } from "fake-pkg";\nexport default greeting;`;
+  let root: string;
+  beforeAll(async () => {
+    // Build a disposable "store": a directory whose node_modules contains a
+    // single hand-written package.
+    root = await mkdtemp(path.join(tmpdir(), "cs-resroot-"));
+    const pkgDir = path.join(root, "node_modules", "fake-pkg");
+    const manifest = JSON.stringify({ name: "fake-pkg", version: "1.0.0", main: "index.mjs" });
+    await mkdir(pkgDir, { recursive: true });
+    await writeFile(path.join(pkgDir, "package.json"), manifest);
+    await writeFile(
+      path.join(pkgDir, "index.mjs"),
+      "export const greeting = 'hello-from-pkg';\n",
+    );
+  });
+  afterAll(async () => {
+    await rm(root, { recursive: true, force: true });
+  });
+  it("lets a script import a package from <resolutionRoot>/node_modules", async () => {
+    const outcome = await defaultEsmScriptRunner.run({
+      script: importFakePkgScript,
+      context: {},
+      timeoutMs: 15_000,
+      resolutionRoot: root,
+    });
+    expect(outcome.error).toBeUndefined();
+    expect(outcome.result).toBe("hello-from-pkg");
+  });
+  it("cannot resolve the package without a resolutionRoot (backward-compatible isolation)", async () => {
+    const outcome = await defaultEsmScriptRunner.run({
+      script: importFakePkgScript,
+      context: {},
+      timeoutMs: 15_000,
+    });
+    // With no resolutionRoot the run happens under os.tmpdir(), where no
+    // node_modules is visible, so the import fails: no result is produced
+    // and an error is reported.
+    expect(outcome.result).toBeUndefined();
+    expect(outcome.error).toBeDefined();
+  });
+  it("does NOT auto-install a missing package from the registry (degradation)", async () => {
+    // `is-odd` is a real, installable package that is NOT in any
+    // resolutionRoot. Because the per-run bunfig disables auto-install, Bun
+    // has to fail the import rather than quietly fetch it from the registry.
+    const outcome = await defaultEsmScriptRunner.run({
+      script: `import isodd from "is-odd";\nexport default typeof isodd;`,
+      context: {},
+      timeoutMs: 20_000,
+    });
+    expect(outcome.result).toBeUndefined();
+    expect(outcome.error).toBeDefined();
+  });
+});
+// Covers per-run env injection: injected vars reach the script, ambient
+// backend vars do not.
+describe("defaultEsmScriptRunner injected env", () => {
+  it("exposes injected env vars as process.env in the subprocess", async () => {
+    const outcome = await defaultEsmScriptRunner.run({
+      script: `export default process.env.API_TOKEN ?? null;`,
+      context: {},
+      timeoutMs: 15_000,
+      env: { API_TOKEN: "injected-secret-value" },
+    });
+    expect(outcome.error).toBeUndefined();
+    expect(outcome.result).toBe("injected-secret-value");
+  });
+  it("does NOT expose backend env that was not injected (isolation intact)", async () => {
+    // Plant a secret in the parent (backend) process. It was not injected for
+    // this run, so the script must not be able to read it.
+    process.env.__CS_TEST_BACKEND_SECRET = "must-not-leak";
+    try {
+      const outcome = await defaultEsmScriptRunner.run({
+        script: `export default process.env.__CS_TEST_BACKEND_SECRET ?? null;`,
+        context: {},
+        timeoutMs: 15_000,
+      });
+      expect(outcome.result).toBeNull();
+    } finally {
+      // Always remove the planted variable so it cannot affect other tests.
+      delete process.env.__CS_TEST_BACKEND_SECRET;
+    }
+  });
+});

package/src/esm-script-runner.ts CHANGED Viewed

@@ -86,6 +86,34 @@ export interface EsmScriptRunOptions {
   helperModuleName?: string;
   /** Name of the helper function injected as a global AND exported by the virtual module. */
   helperFunctionName?: string;
+  /**
+   * Optional directory the per-run temp dir is created *inside*, so Node /
+   * Bun module resolution walks up to `<resolutionRoot>/node_modules` and
+   * the user's script can `import` managed npm packages.
+   *
+   * When unset (the default), the per-run dir is created under
+   * `os.tmpdir()` exactly as before - backward-compatible, no node_modules
+   * visible, isolation unchanged. The script-packages reconciler points
+   * this at `<store>/current` (the atomically-flipped symlink to the
+   * active materialized tree).
+   *
+   * Execution isolation is unchanged either way: the subprocess still gets
+   * only `SAFE_ENV_VARS`, so packages cannot read backend secrets.
+   */
+  resolutionRoot?: string;
+  /**
+   * Extra environment variables injected into the subprocess for THIS run
+   * only, merged on top of `SAFE_ENV_VARS`. The Secrets platform uses this
+   * to inject a run's resolved secret -> env allowlist (decision 5,
+   * least-privilege): only the consumer's declared secrets are injected,
+   * memory-only, for the lifetime of this run. It deliberately does NOT
+   * widen the ambient `SAFE_ENV_VARS` whitelist — the values live only in
+   * this options object and the spawned process env.
+   *
+   * The user's script reads these as `process.env.ENV_NAME`. On a key
+   * collision with a safe var, the injected value wins.
+   */
+  env?: Record<string, string>;
 }
 /**
@@ -301,14 +329,22 @@ export const defaultEsmScriptRunner: EsmScriptRunner = {
     timeoutMs,
     helperModuleName,
     helperFunctionName,
+    resolutionRoot,
+    env: injectedEnv,
   }) {
     const sessionId = randomUUID();
     const markerStart = `##__CS_SCRIPT_RESULT_${sessionId}_START__##`;
     const markerEnd = `##__CS_SCRIPT_RESULT_${sessionId}_END__##`;
-    const tmpDir = await mkdtemp(path.join(tmpdir(), "checkstack-script-"));
+    // When a `resolutionRoot` is given, create the per-run dir *inside* it
+    // so module resolution walks up to `<resolutionRoot>/node_modules`.
+    // Otherwise fall back to `os.tmpdir()` (today's behavior - no
+    // node_modules visible, fully backward compatible).
+    const tmpBase = resolutionRoot ?? tmpdir();
+    const tmpDir = await mkdtemp(path.join(tmpBase, "checkstack-script-"));
     const userScriptPath = path.join(tmpDir, "user.mjs");
     const runnerPath = path.join(tmpDir, "runner.mjs");
+    const bunfigPath = path.join(tmpDir, "bunfig.toml");
     const hasHelper =
       typeof helperModuleName === "string" &&
@@ -348,6 +384,14 @@ export const defaultEsmScriptRunner: EsmScriptRunner = {
             })
           : normalisedSource;
+      // Disable Bun auto-install in the per-run dir ALWAYS. Without this,
+      // `import "any-package"` silently fetches from the registry (verified
+      // empirically), defeating the whole managed-allowlist model. With it,
+      // an import resolves ONLY against the reconciled `<resolutionRoot>/
+      // node_modules` (when set) and otherwise fails fast - the clear
+      // degradation the package feature requires.
+      await writeFile(bunfigPath, '[install]\nauto = "disable"\n', "utf8");
       await writeFile(userScriptPath, userSource, "utf8");
       await writeFile(
         runnerPath,
@@ -363,7 +407,14 @@ export const defaultEsmScriptRunner: EsmScriptRunner = {
       proc = spawn({
         cmd: [process.execPath, runnerPath],
-        env: pickSafeEnv(),
+        // CWD = the per-run dir so Bun reads its `bunfig.toml`
+        // (auto-install disabled) and resolves modules from
+        // `<resolutionRoot>/node_modules` when set.
+        cwd: tmpDir,
+        // Per-run injected env wins over the safe-vars whitelist. The
+        // injected secret values live only here + the child process; they
+        // never widen the ambient SAFE_ENV_VARS.
+        env: { ...pickSafeEnv(), ...injectedEnv },
         stdout: "pipe",
         stderr: "pipe",
       });

package/src/event-bus-types.ts CHANGED Viewed

@@ -1,4 +1,9 @@
-import type { Hook, HookSubscribeOptions, HookUnsubscribe } from "./hooks";
+import type {
+  Hook,
+  HookEventMeta,
+  HookSubscribeOptions,
+  HookUnsubscribe,
+} from "./hooks";
 /**
  * EventBus interface for dependency injection
@@ -7,22 +12,26 @@ export interface EventBus {
   subscribe<T>(
     pluginId: string,
     hook: Hook<T>,
-    listener: (payload: T) => Promise<void>,
+    listener: (payload: T, meta?: HookEventMeta) => Promise<void>,
     options?: HookSubscribeOptions
   ): Promise<HookUnsubscribe>;
   /**
    * Emit a hook through the distributed queue system.
    * All instances receive broadcast hooks; one instance handles work-queue hooks.
+   *
+   * `meta` carries event-envelope metadata (the acting `actor`). When omitted,
+   * the bus defaults to the system actor, so every emitted hook carries an
+   * actor even when emitted from a background/unauthenticated context.
    */
-  emit<T>(hook: Hook<T>, payload: T): Promise<void>;
+  emit<T>(hook: Hook<T>, payload: T, meta?: HookEventMeta): Promise<void>;
   /**
    * Emit a hook locally only (not distributed).
    * Use for instance-local hooks that should only run on THIS instance.
    * Uses Promise.allSettled to ensure one listener error doesn't block others.
    */
-  emitLocal<T>(hook: Hook<T>, payload: T): Promise<void>;
+  emitLocal<T>(hook: Hook<T>, payload: T, meta?: HookEventMeta): Promise<void>;
   shutdown(): Promise<void>;
 }

package/src/hooks.ts CHANGED Viewed

@@ -1,4 +1,4 @@
-import type { AccessRule } from "@checkstack/common";
+import type { AccessRule, Actor } from "@checkstack/common";
 /**
  * Hook definition for type-safe event emission and subscription
@@ -8,6 +8,19 @@ export interface Hook<T = unknown> {
   _type?: T; // Phantom type for TypeScript inference
 }
+/**
+ * Envelope metadata that travels alongside every emitted hook payload,
+ * independent of the hook's typed payload. Injected centrally at emit time
+ * (from the request context, defaulting to the system actor) and delivered to
+ * subscribers as the optional second listener argument.
+ *
+ * The automation engine reads `actor` and exposes it to automations as
+ * `trigger.actor`, so a trigger filter can gate on who/what caused the event.
+ */
+export interface HookEventMeta {
+  /** Who or what caused the event; the system actor when none was supplied. */
+  actor: Actor;
+}
 /**
  * Create a typed hook
  */

package/src/index.ts CHANGED Viewed

@@ -17,6 +17,7 @@ export * from "./rpc";
 export * from "./test-utils";
 export * from "./hooks";
 export * from "./event-bus-types";
+export * from "./actor";
 export * from "./plugin-source";
 export * from "./plugin-artifact-store";
 export * from "./notification-strategy";
@@ -32,3 +33,4 @@ export * from "./incremental-aggregation";
 export * from "./aggregated-result";
 export * from "./ws-registry";
 export * from "./readiness-registry";
+export * from "./advisory-lock";

package/src/plugin-system.ts CHANGED Viewed

@@ -2,7 +2,12 @@ import { NodePgDatabase } from "drizzle-orm/node-postgres";
 import { ServiceRef } from "./service-ref";
 import { ExtensionPoint } from "./extension-point";
 import type { AccessRule, PluginMetadata } from "@checkstack/common";
-import type { Hook, HookSubscribeOptions, HookUnsubscribe } from "./hooks";
+import type {
+  Hook,
+  HookEventMeta,
+  HookSubscribeOptions,
+  HookUnsubscribe,
+} from "./hooks";
 import { Router } from "@orpc/server";
 import { RpcContext } from "./rpc";
 import { AnyContractRouter } from "@orpc/contract";
@@ -48,7 +53,7 @@ export type AfterPluginsReadyContext = {
    */
   onHook: <T>(
     hook: Hook<T>,
-    listener: (payload: T) => Promise<void>,
+    listener: (payload: T, meta?: HookEventMeta) => Promise<void>,
     options?: HookSubscribeOptions,
   ) => HookUnsubscribe;
   /**
@@ -80,6 +85,20 @@ export type BackendPluginRegistry = {
     ) => Promise<void>;
   }) => void;
   registerService: <S>(ref: ServiceRef<S>, impl: S) => void;
+  /**
+   * Resolve a platform service registered by another plugin under `ref`,
+   * using THIS plugin's identity as the consumer (for audit / scoped
+   * factories). Mirrors the standard dependency-injection resolution used
+   * for declared `deps`, but allows resolving ARBITRARY cross-plugin refs
+   * at runtime — the path used by the automation dispatch engine to hand
+   * `getService` to provider actions at execute time.
+   *
+   * Resolves the service, or throws a clear error if `ref` is not
+   * registered (it never silently returns `undefined`). Safe to call from
+   * `init` / `afterPluginsReady` onward, by which point services are
+   * registered.
+   */
+  getService: <S>(ref: ServiceRef<S>) => Promise<S>;
   registerExtensionPoint: <T>(ref: ExtensionPoint<T>, impl: T) => void;
   getExtensionPoint: <T>(ref: ExtensionPoint<T>) => T;
   /**

package/src/schema-utils.test.ts ADDED Viewed

@@ -0,0 +1,44 @@
+import { describe, expect, test } from "bun:test";
+import { z } from "zod";
+import { configString, withConfigMeta } from "./zod-config";
+import { toJsonSchema } from "./schema-utils";
+// Verifies that `x-*` config metadata registered on a zod field appears on
+// the matching property of the generated JSON schema.
+describe("toJsonSchema x-* metadata", () => {
+  // Shape of the generated schema that these cases inspect.
+  type JsonWithProps = {
+    properties: Record<string, Record<string, unknown>>;
+  };
+  // View a generated schema through the shape above and return its properties.
+  const propsOf = (json: unknown) => (json as JsonWithProps).properties;
+  test("propagates x-script-testable and x-editor-types onto the field", () => {
+    const schema = z.object({
+      script: configString({
+        "x-editor-types": ["typescript"],
+        "x-script-testable": true,
+      }),
+    });
+    const scriptField = propsOf(toJsonSchema(schema)).script;
+    expect(scriptField?.["x-script-testable"]).toBe(true);
+    expect(scriptField?.["x-editor-types"]).toEqual(["typescript"]);
+  });
+  test("omits x-script-testable when not set", () => {
+    const schema = z.object({
+      plain: configString({}),
+    });
+    const plainField = propsOf(toJsonSchema(schema)).plain ?? {};
+    expect("x-script-testable" in plainField).toBe(false);
+  });
+  test("propagates x-secret-env onto a record field via withConfigMeta", () => {
+    const mapping = withConfigMeta(z.record(z.string(), z.string()), {
+      "x-secret-env": true,
+    });
+    const schema = z.object({ secretEnv: mapping });
+    const secretEnvField = propsOf(toJsonSchema(schema)).secretEnv;
+    expect(secretEnvField?.["x-secret-env"]).toBe(true);
+  });
+});

package/src/schema-utils.ts CHANGED Viewed

@@ -67,6 +67,12 @@ function addSchemaMetadata(
       if (meta["x-editor-types"]) {
         jsonField["x-editor-types"] = meta["x-editor-types"];
       }
+      if (meta["x-script-testable"]) {
+        jsonField["x-script-testable"] = true;
+      }
+      if (meta["x-secret-env"]) {
+        jsonField["x-secret-env"] = true;
+      }
       if (meta["x-hidden-when"]) {
         jsonField["x-hidden-when"] = meta["x-hidden-when"];
       }

package/src/zod-config.ts CHANGED Viewed

@@ -38,6 +38,22 @@ export interface ConfigMeta {
    * - "formdata": Key/value pair editor (URL-encoded)
    */
   "x-editor-types"?: EditorType[];
+  /**
+   * Mark this field as an inline script that can be tested in-UI. When the
+   * editor renders the field (via `MultiTypeEditorField`) and the owning
+   * page supplies a `scriptTestRenderer`, a test panel appears beneath the
+   * editor so operators can run the script against a sample context.
+   */
+  "x-script-testable"?: boolean;
+  /**
+   * Mark a record field as a secret -> env mapping
+   * (`{ ENV_NAME: "${{ secrets.NAME }}" }`). The editor renders a
+   * dedicated key (env name) + secret-name picker, with the available
+   * names supplied to `DynamicForm` via `secretNames` (from the secrets
+   * plugin's `listSecretNames`). Without the marker the record falls back
+   * to the plain JSON editor.
+   */
+  "x-secret-env"?: boolean;
 }
 /**
@@ -164,3 +180,20 @@ export function configBoolean(meta: ConfigMeta) {
   schema.register(configRegistry, meta);
   return schema;
 }
+/**
+ * Register config metadata for a schema that was built elsewhere (e.g. a
+ * `z.record`, or `secretEnvMappingSchema` from `@checkstack/secrets-common`)
+ * and hand the same schema back, so it can carry editor metadata such as
+ * `x-secret-env`.
+ *
+ * @param schema - The existing schema to annotate.
+ * @param meta - Metadata stored for `schema` in `configRegistry`.
+ * @returns The `schema` argument itself, with its type `T` preserved.
+ */
+export function withConfigMeta<T extends z.ZodTypeAny>(
+  schema: T,
+  meta: ConfigMeta,
+): T {
+  // `configRegistry` is typed `z.registry<ConfigMeta>()`, so storing `meta`
+  // is sound. Calling `.register` on the generic `T` trips zod's conditional
+  // overload, so go through a binding typed as the base schema.
+  const baseSchema: z.ZodTypeAny = schema;
+  baseSchema.register(configRegistry, meta);
+  return schema;
+}