npm - @checkstack/backend-api - Versions diffs - 0.19.0 → 0.21.0 - Mend

@checkstack/backend-api 0.19.0 → 0.21.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (54) hide show

package/CHANGELOG.md +205 -0
package/package.json +12 -11
package/src/advisory-lock-pool.it.test.ts +282 -0
package/src/advisory-lock.test.ts +144 -3
package/src/advisory-lock.ts +97 -55
package/src/auth-strategy.ts +6 -3
package/src/bearer-token.ts +13 -0
package/src/collector-strategy.ts +9 -0
package/src/config-versioning.test.ts +227 -0
package/src/config-versioning.ts +172 -0
package/src/core-services.ts +14 -0
package/src/esm-script-runner.test.ts +55 -16
package/src/esm-script-runner.ts +212 -55
package/src/index.ts +3 -0
package/src/render-templatable-config.test.ts +168 -0
package/src/render-templatable-config.ts +193 -0
package/src/schema-utils.ts +3 -0
package/src/script-sandbox/capabilities.test.ts +122 -0
package/src/script-sandbox/capabilities.ts +372 -0
package/src/script-sandbox/capped-output.test.ts +116 -0
package/src/script-sandbox/capped-output.ts +172 -0
package/src/script-sandbox/env-guard.test.ts +105 -0
package/src/script-sandbox/env-guard.ts +129 -0
package/src/script-sandbox/filesystem.test.ts +437 -0
package/src/script-sandbox/filesystem.ts +514 -0
package/src/script-sandbox/forkbomb.it.test.ts +121 -0
package/src/script-sandbox/global-default.test.ts +161 -0
package/src/script-sandbox/global-default.ts +100 -0
package/src/script-sandbox/index.ts +14 -0
package/src/script-sandbox/network.test.ts +356 -0
package/src/script-sandbox/network.ts +373 -0
package/src/script-sandbox/observability.test.ts +210 -0
package/src/script-sandbox/observability.ts +168 -0
package/src/script-sandbox/output-truncation.test.ts +53 -0
package/src/script-sandbox/output-truncation.ts +69 -0
package/src/script-sandbox/policy.test.ts +189 -0
package/src/script-sandbox/policy.ts +220 -0
package/src/script-sandbox/provider.test.ts +61 -0
package/src/script-sandbox/provider.ts +134 -0
package/src/script-sandbox/readiness.test.ts +80 -0
package/src/script-sandbox/readiness.ts +117 -0
package/src/script-sandbox/report.ts +88 -0
package/src/script-sandbox/rootless-egress.it.test.ts +86 -0
package/src/script-sandbox/rootless-egress.test.ts +99 -0
package/src/script-sandbox/rootless-egress.ts +218 -0
package/src/script-sandbox/shell-quote.test.ts +32 -0
package/src/script-sandbox/shell-quote.ts +10 -0
package/src/script-sandbox/wrapper.test.ts +1194 -0
package/src/script-sandbox/wrapper.ts +714 -0
package/src/shell-script-runner.test.ts +243 -0
package/src/shell-script-runner.ts +210 -45
package/src/zod-config.test.ts +60 -0
package/src/zod-config.ts +38 -14
package/tsconfig.json +3 -0

package/CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,210 @@
 # @checkstack/backend-api
+## 0.21.0
+### Minor Changes
+- 9dcc848: Add the AI platform: a transport-agnostic tool spine, an OAuth Authorization Server + read-only MCP server, a propose/apply flow with audit log, a streaming in-app chat agent, per-conversation permission modes, per-integration spend caps, and user-scoped tool authorization.
+  Two new packages are added: `@checkstack/ai-common` (the `AiTool` contract, `read`/`mutate`/`destructive` effect classification, the `ai.*` access rules, the OpenAI-compatible connection shape, and the wire contracts) and `@checkstack/ai-backend` (the tool registry, extension points, principal-to-tool resolver, shared zod-to-JSON-Schema serializer, and all transports). The OpenAI-compatible integration provider registers through the existing integration provider extension point, so its API key is stored in the Secrets Vault and configured in the generic Connections UI.
+  What ships:
+  - Tool spine and extension points: `aiToolExtensionPoint.registerTool` (hand-authored composite tools) and `aiToolProjectionExtensionPoint.expose` (opt-in projections of existing oRPC procedures). Authorization mirrors `autoAuthMiddleware` exactly - a tool is surfaced only when every `requiredAccessRules` entry is satisfied, so a scope-narrowed principal can only ever see fewer tools.
+  - OAuth + MCP: Checkstack can act as its own OAuth 2.1 Authorization Server (authorization code + PKCE, consent screen, Dynamic Client Registration) and expose a read-only MCP server over Streamable HTTP at `/api/ai/mcp`. Off by default, enabled by the admin `ai.mcp-oauth` setting. A Bearer OAuth-token branch is added to the auth strategy; token scopes are intersected live with the bound user's access rules on every call. A shared-Postgres rate limiter throttles the DCR endpoint per client IP. `getMcpOAuthSettings` / `setMcpOAuthSettings` contracts added to `@checkstack/auth-common`. A minimal OAuth consent page (`/auth/oauth-consent`) renders the requesting client and scopes.
+  - Propose/apply + audit: a transport-agnostic two-step service - `propose` re-checks authz, runs the tool's `dryRun` without mutating, and returns a single-use proposal token (the `proposed` audit row IS the token store, 10-minute TTL, atomic single-use); `apply` re-parses the server-stored payload, re-checks authz, and atomically commits. The `ai_tool_calls` audit table records every call across both transports with a SHA-256 args hash (never raw arguments) and stamps who proposed and who applied. An `ai.toolCalled` event carries metadata only.
+  - In-app chat: a server-side, provider-agnostic Vercel AI SDK agent loop (OpenAI, Azure, OpenRouter, Ollama, vLLM, LM Studio, ...). The model provider is built on the backend from the integration credentials, so the API key never leaves the backend. The loop offers only resolver-allowed tools, auto-runs read tools (re-entering the live router as the logged-in user) and routes mutating / destructive tools through propose/apply. Durable conversation persistence (`ai_conversations`, `ai_messages`, owner-scoped RPCs) plus a streaming chat UI with a confirm-card component and per-integration model picker.
+  - Per-conversation permission mode (Claude-Code-style approve/auto), a durable `permission_mode` column on `ai_conversations` (default `approve`). `read` always auto-runs in both modes; `mutate` inherits the mode (auto-applies server-side in `auto`, confirm-carded in `approve`); `destructive` ALWAYS requires the human `applyTool` in both modes. Security invariant (structural + tested): the mode is consulted only on the `mutate` branch, so no `(effect, mode)` pair routes a destructive tool to auto-apply.
+  - Per-integration LLM spend cap (optional `spendCap` = `tokenBudget` + `windowMinutes`, default OFF). Spend is tracked in a shared-Postgres `ai_spend` ledger; enforcement is a rolling-window SUM run before each turn (HTTP 429 over budget). Per-principal tool rate-limit budgets are a rolling COUNT over `ai_tool_calls`, enforced on both transports. An absent / empty / incomplete `spendCap` is treated as "no cap" rather than rejected.
+  - Full tool-call replay: `ai_messages.model_messages` (jsonb) persists the canonical AI-SDK `ResponseMessage[]` per turn and replays them verbatim on the next turn; legacy rows fall back to text-only replay.
+  - Enforced no-secret-leak scrubbing: `appendMessage` runs `scrubContent` on every write, redacting credential-shaped keys and high-confidence credential values; a canary regression test asserts injected secrets are stripped. A hardening test suite asserts no secret appears in any AI-surface DTO and that handler-side authz holds when the model misbehaves.
+  - Provider correctness: the chat provider uses `@ai-sdk/openai-compatible`'s `chatModel` (plain `/chat/completions`), so OpenAI-compatible gateways (OpenRouter, DeepSeek, Ollama, vLLM) no longer reject turns with `invalid_prompt`; `@ai-sdk/openai` is removed.
+  BREAKING CHANGES:
+  - The `AiTool` contract (`@checkstack/ai-common`) gained a `TRpc` type parameter, and both `dryRun` and `execute` now receive a USER-SCOPED `rpcClient` arg bound to the originating user. Every plugin procedure a tool calls re-enters the live router AS THAT USER, so handler-side authorization (access rules AND per-resource/team scope) is enforced exactly as a direct UI/RPC call - closing a prior privilege-escalation where tools captured a trusted service client at construction. A hand-authored tool MUST resolve its plugin client from this per-call arg and MUST NOT capture a trusted service client at factory scope. Tool factories that previously took `{ rpcClient }` should drop that parameter.
+  - `AiToolProjectionExtensionPoint.expose` no longer takes a second `pluginMetadata` argument; the owning metadata lives on `input.sourcePluginMetadata`. Callers must drop the second argument.
+  State and scale: conversations, messages, the audit log, proposal tokens, the rate-limit counter, and the spend ledger all live in shared Postgres, so every pod answers identically and the agent loop is resumable on any pod. The only pod-local state is the live MCP connection registry (bookkeeping, never a source of truth). Cross-pod conversation readback, the spend cap, and the tool budget are verified by env-gated two-pod integration tests.
+  This is a beta minor.
+- 9dcc848: Automations now run as a configured service account, removing implicit god-mode from the dispatch path.
+  BREAKING: every automation must declare a `runAs` application (service account). Previously every automation action ran as the trusted service client, bypassing all access-rule, per-resource, and team-scope checks - so an automation could touch any team's data. Now each automation runs as a bounded `application` principal, and every data-access call an action makes is authorized exactly as that identity. An automation with no `runAs` fails to run with a clear error rather than falling back to the trusted client; legacy automations must be assigned a service account before they run again.
+  What changed:
+  - New top-level field `runAs` on automations (a `run_as_application_id` column + create/update inputs; `AutomationSchema.runAs`). Required on create; GitOps sets it via the `run-as` metadata label.
+  - A new `coreServices.rpcClientAs(applicationId)` mints a short-lived, backend-signed app-principal token; the auth service resolves it LIVE to an `application` principal (reusing `enrichApplicationPrincipal`), so it flows through full `autoAuthMiddleware` enforcement. The dispatch engine threads this client into every action's `execute` as the required `context.rpcClient`.
+  - Bind authority (anti-escalation): a user may only bind an application whose access rules are a subset of their own (`isApplicationBindable`); `getBindableApplications` lists only bindable apps, and the create/update handlers enforce the check.
+  - `notification.sendTransactional` moves from service-only to access-gated (`notification.send`, a new access rule), so an automation's `runAs` can call the built-in `notify_user` / `notification.send` actions; trusted services still bypass via short-circuit.
+  - A "Run as (Service Account)" picker in the automation editor, populated from `getBindableApplications` (server-side filtered to bindable apps), seeding from the loaded `runAs` on edit and passing it into create + update. First-class teaching UX: an inline info banner, a blocked Save with an inline hint until one is chosen, and an empty state linking to the Applications admin + docs when none are bindable.
+  State and scale: `runAs` resolution is a pure read over shared tables; the app-principal token is self-contained and verified statelessly, so the per-run client is correct under horizontal scale.
+  This is a beta minor.
+- 9dcc848: Harden config-versioning so stored configs always migrate-then-validate and broken migration chains fail fast at boot.
+  - `@checkstack/backend-api` `Versioned<T>` gains `parseAssumingV1` (migrate-from-v1 then validate leniently, runtime path), `parseStrictAssumingV1` (migrate then validate strictly, editor path), and `validateMigrationChainFromV1()`. A standalone pure helper `assertMigrationChainFromV1({ version, migrations })` is the single shared implementation behind the constructor guard and `validateMigrationChainFromV1`.
+  - `Versioned` now validates its own v1 -> `version` chain in the constructor, which runs at module import / plugin registration. A new `no-restricted-syntax` ESLint rule bans calling `parse` / `safeParse` / `parseAsync` / `strict` directly on a `Versioned`'s `.schema` member.
+  - Auth strategy migration chains are validated at the `betterAuthExtensionPoint.addStrategy` chokepoint (`@checkstack/auth-backend`).
+  - Automation action AND trigger configs migrate-then-validate (lenient at dispatch, strict in the editor validator, recursing into `choose`/`parallel`/`repeat`/`sequence` blocks). The `run_script` / `run_shell` action configs bump to `version: 2` dropping the removed `sandbox` key, fixing the editor's `Unrecognized key: sandbox` error.
+  - Anomaly read path now validates: `getAnomalyConfig` / `getAnomalyAssignmentConfig` run stored records through `Versioned.parseRecord`; `PartialAnomalySettingsSchema` moved to `@checkstack/anomaly-common`. Notification ConfigService reads thread the migrations argument, and per-strategy `userConfig` is migrate-then-validated before `send()`.
+  - gitops-apply migrate-then-validates authored health-check config; integration connection validation routes through `safeValidate`. The latent HTTP health-check `result` schema (at `version: 3` with no migrations) now ships a pass-through v1 -> v2 -> v3 chain.
+  BREAKING CHANGES (fail-fast at boot, intended):
+  - Any `Versioned` config with `version > 1` and an incomplete or non-contiguous migration chain now throws at construction (boot) instead of failing lazily on first read. This covers every `Versioned` instance repo-wide, including future plugin types. Out-of-tree plugins shipping such a config must add the missing migration step(s); all in-repo strategies already have complete chains.
+  - An auth strategy declaring `configVersion > 1` without a complete chain throws at registration.
+  - A trigger's per-automation config is now a versioned `config: Versioned<TConfig>` instead of a bare `configSchema?`. Plugins registering triggers with `configSchema:` must wrap it: `config: new Versioned({ version: 1, schema })`. The underlying schema stays reachable via `config.schema`; triggers without per-automation config are unaffected.
+  State and scale: all affected reads resolve from shared Postgres / in-process registries, so every pod sees the same migrated answer. No new framework-owned current-state store.
+  This is a beta minor.
+- 9dcc848: Add environments as a first-class catalog primitive, with per-environment health-check fan-out, config templating, per-environment reactive health, and script run-context exposure.
+  - Catalog primitive: an environment is a sibling of groups - a named, instance-global record carrying free-form custom fields (baseUrl, region, tier, ...) that any system can belong to many-to-many. New `environments` + `systems_environments` tables, `EnvironmentSchema` + create/update schemas, `EntityService` environment CRUD and membership joins, RPC endpoints gated by a new `catalogAccess.environment` access rule, a GitOps `Environment` kind + `System.environments` extension, and frontend management (an `EnvironmentEditor`, an Environments management panel, and a per-system environment picker). The Environments card's Add/Edit/Delete affordances are gated on `catalogAccess.environment.manage`.
+  - Per-environment fan-out: run identity becomes `(systemId, configurationId, environmentId)`. Runs, aggregates, and state transitions gain a nullable `environmentId`. The health-check assignment gains an `environmentIds` selector with three modes (All / Specific / None; `null` and `[]` are distinct). The queue executor resolves the effective environment set via the catalog `resolveSystemEnvironments` read and executes one isolated run per environment.
+  - Config templating: a new `x-templatable` config-field marker renders a string field through the template engine at execute time, against `{ environment, check, system }`. A shared `renderTemplatableConfig` and a `renderTemplatePreview` helper (re-exported from `@checkstack/template-engine`) keep editor previews identical to the run-time render. The HTTP collector's `url`, `headers[].value`, and `body` are templatable, rendered per environment (the strategy client build moves inside the per-env loop); the `url`'s `.url()` validation moves post-render. Secrets resolve before templating; a field marked both secret and `x-templatable` is rejected at plugin load. `DynamicForm` shows a live "Preview" line, and the catalog `EnvironmentPreviewPicker` ("Preview as: <environment>") drives it in the collector editor (only when the schema has a templatable field).
+  - Script run-context: `CollectorRunContext` gains an optional `environment` field (`{ id, name, fields }`, metadata only). Shell collectors receive `CHECKSTACK_ENV_ID` / `_NAME` / `CHECKSTACK_ENV_<FIELD>` vars; inline TS collectors read `globalThis.context.environment`; the editor test panel mirrors both. The env-less path is unchanged.
+  - Per-environment reactive health (see BREAKING below), env-keyed read/write paths, env-qualified serialization locks, an optional `trigger.payload.environmentId`, per-environment isolation, and an `ENVIRONMENT_RESOLUTION_FAILED` signal when catalog resolution degrades to a single env-less run.
+  BREAKING CHANGES: the reactive `health` entity's id-shape and cardinality change. It now encodes two views: per-environment (id `"<systemId>::<environmentId>"`) and a system rollup (id `"<systemId>"`, the worst status across environments + env-less runs). The rollup PRESERVES the pre-existing system-level contract - dashboards, status badges, and automations referencing health by `systemId` keep working without re-authoring - but the entity's contract surface changed (new id-shape, higher cardinality, new payload field), so it is flagged breaking. `getBulkHealthState` parses env-qualified ids and keys results by the original id.
+  State and scale: membership and custom fields live only in catalog Postgres and are re-read every tick via the cross-plugin RPC; env-keyed health reads from shared `health_check_runs` / aggregates / transitions (compute-on-read). Every pod resolves the same effective set and the same per-environment health. No pod-local environment state.
+  Also: `unwrapSchema` in `zod-config.ts` loops instead of single-pass-stripping so multi-layer wrappers (`.optional().default()`) still resolve `x-templatable` meta. The env-less `{{ environment.* }}` run notice logs at `debug` (a legitimate recurring configuration), while the post-render HTTP `.url()` check still fails a genuinely-broken empty render with a clear "Rendered URL is invalid" error.
+  This is a beta minor.
+- 9dcc848: Cut initial-load JS: lazy plugin contributions, a hardened lazy-by-default contribution contract, on-demand Monaco, and a lighter icon/chart load.
+  - Lazy plugin route pages: each plugin's route `element` references a `React.lazy`-wrapped page rendered inside a shared `<Suspense>` boundary. Plugins still register synchronously, so nav, slots, commands, API factories, and `foreignSignals` are available on first paint. This moves ~37 route-page chunks (~600 KB) out of the entry; the entry chunk drops from ~2.4 MB to ~190 KB. Auth flow pages stay eager. The `@checkstack/scripts` scaffold template generates lazy route pages too.
+  - Hardened contribution contract (BREAKING, frontend plugin contract): plugins declare contributions lazily and let the framework own code-splitting, Suspense, and per-plugin error isolation. Routes use `load: () => import("./Page").then((m) => ({ default: m.Page }))` instead of `element: <Page />` (`element` is still accepted for the rare page that must paint without a chunk fetch; provide exactly one). Slot extensions accept either an eager `component` or a lazy `load`; new `getLazyContribution` + `ExtensionComponent` exports from `@checkstack/frontend-api` render either kind. This also fixes runtime-installed plugins: `ExtensionSlot` subscribes to the plugin registry, and the API registry rebuilds when the plugin set changes (`getPlugins()` returns an immutable snapshot via `useSyncExternalStore`). A per-plugin error boundary contains a bad contribution.
+  - On-demand Monaco: the `@checkstack/ui` barrel no longer pulls the `@codingame/*` / `monaco-languageclient` stack into the initial load. `CodeEditor` lazy-loads its Monaco-backed editor behind `React.lazy` + Suspense, `validateTypeScriptSources` imports the editor API via in-body `await import(...)`, and the "vscode services ready" signal moved to a Monaco-free module. The ~10 MB editor body loads only when a `CodeEditor` mounts. A `react-vendor` `manualChunks` split was added for stable vendor caching.
+  - lucide-react 1.x + lighter icons/charts (BREAKING for icon consumers): lucide-react unified from three drifting ranges to `^1.17.0`. lucide v1 removed brand icons, so the GitHub/GitLab marks are vendored in `@checkstack/ui` (`GithubIcon`, `GitlabIcon`, `brandIcons`); a new `IconName` type (`LucideIconName | BrandIconName`) in `@checkstack/common` is canonical, accepted by `AuthStrategy.icon` and the card components, so data-driven brand names keep working. `DynamicIcon` no longer eagerly imports lucide's ~1600-icon map (~1 MB) - it lives in a `React.lazy` `iconRegistry` chunk fetched on first data-driven render, while statically named-imported icons tree-shake normally. The recharts-backed health-check charts (~300 KB) and the `HealthCheckSystemOverview` drawer leave the initial load.
+  BREAKING CHANGES:
+  - Frontend plugin contract: routes/slot contributions are lazy-by-default (`load` instead of `element`/eager elements) as described above.
+  - Any external consumer importing a brand icon from `lucide-react` (e.g. `import { Github } from "lucide-react"`) must switch to the vendored `@checkstack/ui` brand icons or a custom SVG.
+  This is a beta minor.
+- 9dcc848: Layered OS-level script sandbox, secure and fail-closed by default (epic #247).
+  Script and shell health checks and the `run_shell` / `run_script` automation actions now run inside a layered OS-level sandbox by default. The sandbox lives in `core/backend-api/src/script-sandbox/` (the single source of truth) and is enforced inside the shared runners, so it applies wherever a job runs.
+  Layers:
+  - Resource caps (CPU / memory / PID / FD / file-size, via `prlimit` on capable Linux; ESM JS-heap cap via `--max-old-space-size`; portable wall-clock timeout) and an OOM-safe streaming output cap.
+  - Privilege drop via a NON-ROOT supervisor model: the shipped images run the supervisor as non-root uid `65532`, so every sandboxed script inherits non-root and can never be host-root; filesystem + network confinement is delivered by ROOTLESS `bwrap`/`nsjail` via unprivileged user namespaces. `enforced.privilege` is truthful (true only when the child cannot run as host-root). Runners no longer pass `uid`/`gid` to `Bun.spawn` (a silent no-op and a forward-compat hazard).
+  - Filesystem isolation (`scratch-only` / `scratch-plus-ro`) confining the child to its per-run scratch dir over a read-only base; the interpreter path is RO-bound so the runtime execs, and `TMPDIR` is pinned to the in-namespace tmpfs.
+  - Network egress control: `deny` (routeless loopback-only netns), `allowlist` (real plumbed egress via macvlan OR rootless slirp4netns + an in-kernel nftables filter), and an always-on metadata / link-local block (`169.254.0.0/16`, `fe80::/10`, `fc00::/7`). No-blackhole invariant: `enforced.network` is never true when egress is actually severed or unfiltered; unpluggable egress degrades to surfaced host net.
+  - Per-run fork-bomb containment via `RLIMIT_NPROC` inside the fresh per-run user+PID namespace; a centralized forbidden-env denylist (`LD_PRELOAD`, `LD_LIBRARY_PATH`, `DYLD_*`, `NODE_OPTIONS`, `BUN_*`, caller `PATH` overrides).
+  - A validated tuned seccomp profile (`deploy/seccomp/checkstack-userns.json`) and a live `clone(CLONE_NEWUSER|CLONE_NEWNET)` capability probe (not the static sysctl), shipped by default in both Dockerfiles, `docker-compose.yml`, and `deploy/k8s/checkstack-sandbox.yaml`.
+  Global policy and operator surface:
+  - The global sandbox policy lives in ONE durable row owned by `script-packages` (its `ConfigService` row in shared `plugin_configs`). A single process-wide provider serves every runner; the two script plugins no longer register competing providers. A dedicated admin-only `script-sandbox.manage` permission gates both reading and writing the policy. New `getSandboxPolicy` / `setSandboxPolicy` endpoints and a Settings -> Script Sandbox admin UI (`enabled`, `onUnavailable`, network/filesystem/privilege modes, allow list, metadata block, resource caps). The startup capability/readiness log is emitted in-process by `script-packages-backend` (no fragile init-order RPC self-loop), and on a host that cannot enforce a layer a one-time startup warning explains the two local-dev paths (Docker, or set the global policy to `degrade`).
+  - Satellite relay: the WS protocol carries the resolved policy in the `authenticated` message and a `sandbox_policy` push-on-change; a satellite caches the last relayed policy and resolves every run through it.
+  BREAKING CHANGES (platform in BETA, shipped as minor):
+  - Scripts run sandboxed by default. The shipped global default is FAIL-CLOSED (`onUnavailable: "fail"`): when a requested layer cannot be enforced the run is REFUSED (clean `exitCode: -1`, never an unsandboxed spawn) rather than silently degrading. Deployments on hosts that cannot enforce a layer (no bubblewrap, user namespaces blocked, no `/proc` unmask) must run the official images with the documented runtime flags (the bundled seccomp profile + `systempaths=unconfined`, or k8s `procMount: Unmasked`), or set the global policy to `degrade`. On macOS / restricted containers the strong layers degrade to the portable subset and are surfaced per run.
+  - Default network posture is deny-egress (`allowlist` with an empty allow list, which resolves to the routeless `deny` path). Scripts calling external endpoints fail until those destinations are allowlisted in the global default. The always-on metadata / link-local block applies even under looser modes.
+  - The per-action / per-check `sandbox` config override and the transport `ScriptRequest.sandbox` field are removed; policy is global-only, so an automation/check author can no longer weaken the sandbox on their own item. Stored configs carrying a stray `sandbox` key are tolerated (stripped on parse).
+  - The shared runners' `run()` no longer accepts a `sandbox` option; callers rely on the global policy provider.
+  - A satellite fails closed (most restrictive profile) until it receives the first relayed policy; a relay-read failure or an older core keeps it fail-closed. A relay failure can never loosen a satellite's sandbox.
+  State and scale: the global policy is a single durable Postgres row read identically on every pod. Capability detection is per-process, deterministic from the host kernel, and surfaced per run via the `EffectiveSandbox` report (a Linux pod and a macOS satellite may legitimately differ). `CHECKSTACK_SANDBOX_UID/GID` and macvlan addressing are genuinely per-host infrastructure, surfaced per run, not the queryable policy. The satellite's policy cache is satellite-local transport state. No new pod-local current-state.
+  This is a beta minor.
+- 9dcc848: Align workspace dependency versions and migrate React Router to v7.
+  BREAKING CHANGES (React Router v7): All frontend packages now depend on `react-router-dom@^7.16.0`. Previously the workspace declared four divergent ranges (`^6.20.0`, `^6.22.0`, `^7.1.1`, `^7.14.2`), which resolved both `react-router@6` and `react-router@7` into a single bundle. Everything is now unified on v7. The public imports the app uses (`BrowserRouter`, `Routes`, `Route`, `Link`, `NavLink`, `MemoryRouter`, `useNavigate`, `useParams`, `useSearchParams`, `useLocation`) are unchanged between v6 and v7, so no source rewrites were required - but any out-of-tree plugin still on react-router v6 should upgrade to v7 (see the React Router v6 -> v7 upgrade guide) to share the host's single router instance via the import map.
+  Other unified ranges (no API change): `react` -> `^18.3.1`, the `@orpc/*` family (`contract`, `server`, `client`, `tanstack-query`, `openapi`, `zod`) -> `^1.14.4`, and `better-auth` -> `^1.6.13`.
+  Removed the pre-rename `@orpc/react-query` leftover from `@checkstack/frontend-api`; its `createRouterUtils` / `RouterUtils` / `ProcedureUtils` now come from `@orpc/tanstack-query` (the package already in use).
+  Stale in-range runtime deps pulled up to current published versions: `hono` `^4.12.23`, `@tanstack/react-query` (+devtools) `^5.100.14`, `date-fns` `^4.4.0`, `jose` `^6.2.3`, `tar` `^7.5.16`, `semver` `^7.8.1`, `@xyflow/react` `^12.11.0`.
+### Patch Changes
+- Updated dependencies [9dcc848]
+- Updated dependencies [9dcc848]
+- Updated dependencies [9dcc848]
+- Updated dependencies [9dcc848]
+- Updated dependencies [9dcc848]
+- Updated dependencies [9dcc848]
+- Updated dependencies [9dcc848]
+- Updated dependencies [9dcc848]
+  - @checkstack/healthcheck-common@1.5.0
+  - @checkstack/common@0.13.0
+  - @checkstack/template-engine@0.4.0
+  - @checkstack/cache-api@0.3.9
+  - @checkstack/queue-api@0.3.9
+  - @checkstack/signal-common@0.2.6
+## 0.20.0
+### Minor Changes
+- a57f7db: fix(backend): give advisory locks a dedicated connection pool to prevent pool-starvation deadlock
+  Both the session-lock service and `withXactLock` HOLD a Postgres connection for
+  the lock's whole lifetime while the gated work runs on a _different_ connection.
+  Both lock and work were drawing from the single shared `adminPool` (which, with
+  no explicit config, defaulted to `max: 10` and `connectionTimeoutMillis: 0` -
+  wait forever). Under concurrency >= pool size, every slot became a lock-holding
+  connection waiting for a work connection that could never free up: a permanent
+  deadlock. It surfaced as all connections stuck `idle in transaction` on
+  `pg_advisory_xact_lock` and every API request hanging into an upstream 502,
+  only after the server had been running long enough to hit that concurrency
+  (e.g. a burst of health-check evaluations or incident dedups).
+  Advisory locks now run on a dedicated `lockPool`, separate from `adminPool`, so
+  the acquire graph is acyclic (`lockPool -> adminPool`, never back) and the
+  deadlock class is impossible. `AdvisoryLockService` gains a pooled
+  `withXactLock({ key, fn })` method (lock on the lock pool, work on the admin
+  pool); healthcheck's per-system serializer, incident's dedup-create, and the
+  automation single-mode concurrency lock now use it. The deadlock-prone
+  standalone `withXactLock({ db, ... })` helper is REMOVED.
+  Both pools are explicitly configured with `connectionTimeoutMillis` so any
+  future exhaustion fails fast and self-heals instead of hanging, and both get a
+  pool-level `error` handler (an idle pooled client whose backend dies otherwise
+  crashes the pod). The lock pool additionally sets
+  `idle_in_transaction_session_timeout` and `lock_timeout` so a stalled critical
+  section is reaped server-side (auto-releasing the lock) rather than stranding a
+  key forever. The advisory-lock service also now removes its per-client error
+  listener on release (it previously leaked one listener per acquisition on each
+  reused pooled connection - an unbounded `MaxListenersExceeded` leak).
+  New env vars (all optional): `DATABASE_POOL_MAX` (default 20),
+  `DATABASE_LOCK_POOL_MAX` (default 10), `DATABASE_POOL_CONNECTION_TIMEOUT_MS`
+  (default 10000), `DATABASE_POOL_IDLE_TIMEOUT_MS` (default 30000),
+  `DATABASE_LOCK_IDLE_TX_TIMEOUT_MS` (default 30000), `DATABASE_LOCK_TIMEOUT_MS`
+  (default 30000). Size the pools so that
+  `N_pods * (DATABASE_POOL_MAX + DATABASE_LOCK_POOL_MAX) <= max_connections`.
+  BREAKING CHANGE: the standalone `withXactLock({ db, key, fn })` export is
+  removed - use `coreServices.advisoryLock.withXactLock({ key, fn })` instead.
+  `IncidentService`'s constructor now requires an `AdvisoryLockService` as its
+  second argument, and the healthcheck `createHealthEntitySerializer` /
+  `executeHealthCheckJob` / `setupHealthCheckWorker` helpers take `advisoryLock`
+  instead of `db` for the serializer.
+### Patch Changes
+- @checkstack/cache-api@0.3.8
+- @checkstack/queue-api@0.3.8
 ## 0.19.0
 ### Minor Changes

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@checkstack/backend-api",
-  "version": "0.19.0",
+  "version": "0.21.0",
   "license": "Elastic-2.0",
   "type": "module",
   "main": "./src/index.ts",
@@ -10,18 +10,19 @@
     "lint:code": "eslint . --max-warnings 0"
   },
   "dependencies": {
+    "@checkstack/cache-api": "0.3.8",
     "@checkstack/common": "0.12.0",
-    "@checkstack/healthcheck-common": "1.3.0",
-    "@checkstack/cache-api": "0.3.6",
-    "@checkstack/queue-api": "0.3.6",
+    "@checkstack/healthcheck-common": "1.4.0",
+    "@checkstack/queue-api": "0.3.8",
     "@checkstack/signal-common": "0.2.5",
-    "@orpc/client": "^1.13.14",
-    "@orpc/contract": "^1.13.14",
-    "@orpc/openapi": "^1.13.2",
-    "@orpc/server": "^1.13.2",
-    "@orpc/zod": "^1.13.2",
+    "@checkstack/template-engine": "0.3.0",
+    "@orpc/client": "^1.14.4",
+    "@orpc/contract": "^1.14.4",
+    "@orpc/openapi": "^1.14.4",
+    "@orpc/server": "^1.14.4",
+    "@orpc/zod": "^1.14.4",
     "drizzle-orm": "^0.45.0",
-    "hono": "^4.12.14",
+    "hono": "^4.12.23",
     "marked": "^17.0.1",
     "zod": "^4.2.1"
   },
@@ -33,7 +34,7 @@
     "pg": "^8.21.0"
   },
   "peerDependencies": {
-    "hono": "^4.12.14",
+    "hono": "^4.12.23",
     "drizzle-orm": "^0.45.0"
   },
   "checkstack": {

package/src/advisory-lock-pool.it.test.ts ADDED Viewed

@@ -0,0 +1,282 @@
+/**
+ * Integration test (real Postgres) for the advisory-lock CONNECTION-POOL
+ * contract — the behaviour that silently wedged production and that fakes
+ * cannot model: a held advisory lock keeps its connection checked out while the
+ * gated work runs on a *different* connection, so lock-pool / work-pool sizing
+ * decides whether the system makes progress or deadlocks.
+ *
+ * It pins three things against a live server:
+ *
+ *   1. REPRODUCE THE BUG: when the lock and its work share ONE pool, concurrency
+ *      at the pool size deadlocks (every slot is a lock-holder waiting for a
+ *      work connection that can never free up). This is a guard — if a refactor
+ *      makes this stop deadlocking, the throughput test below is no longer
+ *      proving anything.
+ *   2. THE FIX: with the lock on a DEDICATED pool, the same (and much higher)
+ *      concurrency completes with zero failures.
+ *   3. CORRECTNESS ACROSS INSTANCES: independent service instances with their
+ *      OWN pools (simulating N pods on one database) serialize a find-then-
+ *      create on a shared key down to exactly ONE row — with a no-lock control
+ *      proving the lock is what enforces it.
+ *
+ * Gated behind `CHECKSTACK_IT=1`; the integration CI job provides the Postgres
+ * service container. Connection from `CHECKSTACK_IT_PG_URL`.
+ */
+import { afterAll, beforeAll, describe, expect, it } from "bun:test";
+import { Pool } from "pg";
+import { createAdvisoryLockService } from "./advisory-lock";
+const PG_URL =
+  process.env.CHECKSTACK_IT_PG_URL ??
+  "postgres://postgres:postgres@localhost:5432/postgres";
+const DEDUP_TABLE = "it_advisory_dedup";
+describe.skipIf(!process.env.CHECKSTACK_IT)(
+  "advisory-lock pool contract (real Postgres)",
+  () => {
+    /** Pools created during a test; ended in afterEach-style cleanup helpers. */
+    const tracked: Pool[] = [];
+    /**
+     * Builds a pg pool against `PG_URL` and records it in `tracked` so it can
+     * be ended later.
+     *
+     * @param max upper bound on clients the pool may hold
+     * @param connectionTimeoutMillis how long a checkout may wait (default 5000)
+     * @returns the newly created pool
+     */
+    function makePool(max: number, connectionTimeoutMillis = 5000): Pool {
+      const options = {
+        connectionString: PG_URL,
+        max: max,
+        connectionTimeoutMillis: connectionTimeoutMillis,
+        idleTimeoutMillis: 1000,
+      };
+      const created = new Pool(options);
+      // Clients holding a lock may fail asynchronously (timeout / termination).
+      // The no-op listener keeps such failures from becoming unhandled errors
+      // that would fail the whole test file.
+      created.on("error", () => {});
+      tracked.push(created);
+      return created;
+    }
+    /**
+     * Empties `tracked` and ends every pool it held. A failure to end one pool
+     * is ignored so the remaining pools are still closed.
+     */
+    async function endTrackedPools(): Promise<void> {
+      const pending = tracked.splice(0);
+      const closing = pending.map(async (pool) => {
+        try {
+          await pool.end();
+        } catch {
+          // Best-effort teardown: an end() failure is deliberately ignored.
+        }
+      });
+      await Promise.all(closing);
+    }
+    let setupPool: Pool;
+    // One-time setup: open the bookkeeping pool and make sure the dedup table
+    // used by the find-then-create tests exists.
+    beforeAll(async () => {
+      const createTableSql = `CREATE TABLE IF NOT EXISTS ${DEDUP_TABLE} (lock_key text NOT NULL, id text NOT NULL)`;
+      setupPool = new Pool({ connectionString: PG_URL });
+      await setupPool.query(createTableSql);
+    });
+    // One-time teardown: drop the dedup table, then close every pool. Each
+    // step runs in a `finally` so a rejected DROP (or a rejected end()) cannot
+    // skip the remaining cleanup and leave pools open.
+    afterAll(async () => {
+      try {
+        await setupPool.query(`DROP TABLE IF EXISTS ${DEDUP_TABLE}`);
+      } finally {
+        try {
+          await setupPool.end();
+        } finally {
+          await endTrackedPools();
+        }
+      }
+    });
+    /**
+     * Check-then-insert for `key` using a client checked out of `workPool`.
+     * Resolves `true` when this call inserted the row, `false` when a row for
+     * the key was already present. A 15ms pause separates the SELECT from the
+     * INSERT to widen the race window, so callers that are NOT serialized
+     * reliably insert twice — which is what makes the lock's effect visible.
+     */
+    async function dedupCreate(workPool: Pool, key: string): Promise<boolean> {
+      const conn = await workPool.connect();
+      try {
+        const existing = await conn.query(
+          `SELECT id FROM ${DEDUP_TABLE} WHERE lock_key = $1 LIMIT 1`,
+          [key],
+        );
+        if (existing.rows.length === 0) {
+          await new Promise((resolve) => setTimeout(resolve, 15));
+          await conn.query(
+            `INSERT INTO ${DEDUP_TABLE} (lock_key, id) VALUES ($1, $2)`,
+            [key, crypto.randomUUID()],
+          );
+          return true;
+        }
+        return false;
+      } finally {
+        // Hand the client back whether or not the queries succeeded.
+        conn.release();
+      }
+    }
+    /**
+     * Counts the rows stored for `key`, queried through `setupPool`.
+     * The count is selected as text and converted to a number; a missing row
+     * is treated as zero.
+     */
+    async function countFor(key: string): Promise<number> {
+      const result = await setupPool.query<{ n: string }>(
+        `SELECT count(*)::text AS n FROM ${DEDUP_TABLE} WHERE lock_key = $1`,
+        [key],
+      );
+      const [first] = result.rows;
+      return Number(first?.n ?? "0");
+    }
+    it(
+      "REPRODUCES the deadlock when lock + work share one pool (the bug)",
+      async () => {
+        const POOL_MAX = 4;
+        // Pre-fix wiring: ONE pool serves both the lock client and the work
+        // client. The connect timeout is kept short so the deadlock shows up
+        // as a quick rejection instead of a long hang.
+        const sharedPool = makePool(POOL_MAX, 1500);
+        const lockService = createAdvisoryLockService(sharedPool);
+        const runId = crypto.randomUUID();
+        // Gated work: check a second client out of the SAME pool and ping.
+        const pingOnSharedPool = async (): Promise<void> => {
+          const workClient = await sharedPool.connect();
+          try {
+            await workClient.query("SELECT 1");
+          } finally {
+            workClient.release();
+          }
+        };
+        // Start exactly POOL_MAX operations, each on its own key, so lock
+        // contention is ruled out and only connection accounting can stall:
+        // every operation holds a lock client while waiting for a work client
+        // that the exhausted pool can never supply.
+        const inFlight: Promise<unknown>[] = [];
+        for (let i = 0; i < POOL_MAX; i += 1) {
+          inFlight.push(
+            lockService.withXactLock({
+              key: `deadlock:${runId}:${i}`,
+              fn: pingOnSharedPool,
+            }),
+          );
+        }
+        const outcomes = await Promise.allSettled(inFlight);
+        let rejected = 0;
+        for (const outcome of outcomes) {
+          if (outcome.status === "rejected") rejected += 1;
+        }
+        // The deadlock is observed as acquire timeouts on the work checkout.
+        // Should this count ever reach 0, the single-pool design has stopped
+        // deadlocking and the throughput proof below needs re-examining.
+        expect(rejected).toBeGreaterThan(0);
+        await endTrackedPools();
+      },
+      30_000,
+    );
+    it(
+      "does NOT deadlock under high throughput with a dedicated lock pool (the fix)",
+      async () => {
+        // Both pools are intentionally tiny so a deadlock would show at once;
+        // the fix under test is that lock and work use separate pools.
+        const lockPool = makePool(4);
+        const workPool = makePool(4);
+        const lockService = createAdvisoryLockService(lockPool);
+        const runId = crypto.randomUUID();
+        const CONCURRENCY = 200;
+        // Gated work: a trivial query on a client from the WORK pool.
+        const pingOnWorkPool = async (): Promise<void> => {
+          const workClient = await workPool.connect();
+          try {
+            await workClient.query("SELECT 1");
+          } finally {
+            workClient.release();
+          }
+        };
+        const inFlight: Promise<unknown>[] = [];
+        for (let i = 0; i < CONCURRENCY; i += 1) {
+          inFlight.push(
+            lockService.withXactLock({
+              key: `throughput:${runId}:${i}`,
+              fn: pingOnWorkPool,
+            }),
+          );
+        }
+        const outcomes = await Promise.allSettled(inFlight);
+        const rejected = outcomes.filter(
+          (outcome) => outcome.status === "rejected",
+        );
+        // No operation may fail: no deadlock and no timeout.
+        expect(rejected).toHaveLength(0);
+        await endTrackedPools();
+      },
+      30_000,
+    );
+    it(
+      "serializes find-then-create across INSTANCES to exactly one row",
+      async () => {
+        // A "pod" here is an independent service instance with its OWN work
+        // pool and lock pool, all aimed at the same database (the real
+        // multi-instance topology). Advisory locks are global to the server,
+        // so the pods have to serialize against each other.
+        const PODS = 6;
+        const ATTEMPTS_PER_POD = 5;
+        const key = `dedupe:${crypto.randomUUID()}`;
+        const pods = Array.from({ length: PODS }, () => {
+          const workPool = makePool(2);
+          const lockPool = makePool(2);
+          return { workPool, svc: createAdvisoryLockService(lockPool) };
+        });
+        // Launch every attempt up front, pod by pod, all on the same key.
+        const attempts: Promise<unknown>[] = [];
+        for (const pod of pods) {
+          for (let n = 0; n < ATTEMPTS_PER_POD; n += 1) {
+            attempts.push(
+              pod.svc.withXactLock({
+                key,
+                fn: () => dedupCreate(pod.workPool, key),
+              }),
+            );
+          }
+        }
+        const settled = await Promise.allSettled(attempts);
+        let created = 0;
+        for (const outcome of settled) {
+          if (outcome.status === "fulfilled" && outcome.value === true) {
+            created += 1;
+          }
+        }
+        // Only one attempt may insert; all others must see the row and no-op.
+        expect(await countFor(key)).toBe(1);
+        expect(created).toBe(1);
+        await endTrackedPools();
+      },
+      30_000,
+    );
+    it(
+      "a STALLED critical section is reaped by idle_in_transaction_session_timeout, freeing the key",
+      async () => {
+        // The lock pool sets a short idle-in-transaction timeout. A held lock
+        // sits "idle in transaction" for the whole time `fn` runs, so a hung
+        // `fn` trips it: Postgres aborts the session, auto-releasing the lock -
+        // proving a stall self-heals instead of stranding the key forever.
+        const lockPool = new Pool({
+          connectionString: PG_URL,
+          max: 4,
+          connectionTimeoutMillis: 5000,
+          idle_in_transaction_session_timeout: 1000,
+        });
+        lockPool.on("error", () => {});
+        tracked.push(lockPool);
+        const svc = createAdvisoryLockService(lockPool);
+        const key = `stall:${crypto.randomUUID()}`;
+        let releaseHang!: () => void;
+        const hang = new Promise<void>((r) => (releaseHang = r));
+        // Holder whose critical section hangs (never issues another query).
+        const stalled = svc
+          .withXactLock({ key, fn: () => hang })
+          .catch(() => "rejected-as-expected");
+        try {
+          // Wait past the 1s idle timeout so the server reaps the stalled
+          // holder.
+          await new Promise((r) => setTimeout(r, 1800));
+          // The key must be acquirable again now that the stalled session was
+          // aborted server-side.
+          const t0 = Date.now();
+          const got = await svc.withXactLock({ key, fn: async () => "ok" });
+          expect(got).toBe("ok");
+          expect(Date.now() - t0).toBeLessThan(3000);
+        } finally {
+          // Always unblock the hung `fn`, even when an assertion above throws.
+          // Otherwise the stalled call stays pending for the rest of the run
+          // and may keep pool teardown from completing (NOTE(review): depends
+          // on how withXactLock releases its client - confirm).
+          releaseHang();
+          await stalled; // let the stalled call unwind (COMMIT fails on dead conn)
+          await endTrackedPools();
+        }
+      },
+      30_000,
+    );
+    it(
+      "CONTROL: the same workload WITHOUT the lock races into duplicates",
+      async () => {
+        // Control run: shows that the advisory lock, rather than incidental
+        // ordering, is what enforces single creation in the test above. The
+        // same widened-window find-then-create is run concurrently with NO
+        // lock and is expected to insert more than once.
+        const workPool = makePool(8);
+        const key = `dedupe-nolock:${crypto.randomUUID()}`;
+        const racers: Promise<boolean>[] = [];
+        for (let i = 0; i < 8; i += 1) {
+          racers.push(dedupCreate(workPool, key));
+        }
+        await Promise.all(racers);
+        const rowCount = await countFor(key);
+        expect(rowCount).toBeGreaterThan(1);
+        await endTrackedPools();
+      },
+      30_000,
+    );
+  },
+);