npm - @checkstack/backend - Versions diffs - 0.14.0 → 0.15.0 - Mend

@checkstack/backend 0.14.0 → 0.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4) hide show

package/CHANGELOG.md +57 -0
package/package.json +1 -1
package/src/db.ts +109 -2
package/src/plugin-manager/core-services.ts +9 -6

package/CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,62 @@
 # @checkstack/backend
+## 0.15.0
+### Minor Changes
+- a57f7db: fix(backend): give advisory locks a dedicated connection pool to prevent pool-starvation deadlock
+  Both the session-lock service and `withXactLock` HOLD a Postgres connection for
+  the lock's whole lifetime while the gated work runs on a _different_ connection.
+  Both lock and work were drawing from the single shared `adminPool` (which, with
+  no explicit config, defaulted to `max: 10` and `connectionTimeoutMillis: 0` -
+  wait forever). Under concurrency >= pool size, every slot became a lock-holding
+  connection waiting for a work connection that could never free up: a permanent
+  deadlock. It surfaced as all connections stuck `idle in transaction` on
+  `pg_advisory_xact_lock` and every API request hanging into an upstream 502,
+  only after the server had been running long enough to hit that concurrency
+  (e.g. a burst of health-check evaluations or incident dedups).
+  Advisory locks now run on a dedicated `lockPool`, separate from `adminPool`, so
+  the acquire graph is acyclic (`lockPool -> adminPool`, never back) and the
+  deadlock class is impossible. `AdvisoryLockService` gains a pooled
+  `withXactLock({ key, fn })` method (lock on the lock pool, work on the admin
+  pool); healthcheck's per-system serializer, incident's dedup-create, and the
+  automation single-mode concurrency lock now use it. The deadlock-prone
+  standalone `withXactLock({ db, ... })` helper is REMOVED.
+  Both pools are explicitly configured with `connectionTimeoutMillis` so any
+  future exhaustion fails fast and self-heals instead of hanging, and both get a
+  pool-level `error` handler (an idle pooled client whose backend dies otherwise
+  crashes the pod). The lock pool additionally sets
+  `idle_in_transaction_session_timeout` and `lock_timeout` so a stalled critical
+  section is reaped server-side (auto-releasing the lock) rather than stranding a
+  key forever. The advisory-lock service also now removes its per-client error
+  listener on release (it previously leaked one listener per acquisition on each
+  reused pooled connection - an unbounded `MaxListenersExceeded` leak).
+  New env vars (all optional): `DATABASE_POOL_MAX` (default 20),
+  `DATABASE_LOCK_POOL_MAX` (default 10), `DATABASE_POOL_CONNECTION_TIMEOUT_MS`
+  (default 10000), `DATABASE_POOL_IDLE_TIMEOUT_MS` (default 30000),
+  `DATABASE_LOCK_IDLE_TX_TIMEOUT_MS` (default 30000), `DATABASE_LOCK_TIMEOUT_MS`
+  (default 30000). Size the pools so that
+  `N_pods * (DATABASE_POOL_MAX + DATABASE_LOCK_POOL_MAX) <= max_connections`,
+  leaving some headroom below `max_connections`.
+  BREAKING CHANGE: the standalone `withXactLock({ db, key, fn })` export is
+  removed - use `coreServices.advisoryLock.withXactLock({ key, fn })` instead.
+  `IncidentService`'s constructor now requires an `AdvisoryLockService` as its
+  second argument, and the healthcheck `createHealthEntitySerializer` /
+  `executeHealthCheckJob` / `setupHealthCheckWorker` helpers take `advisoryLock`
+  instead of `db` for the serializer.
+### Patch Changes
+- Updated dependencies [a57f7db]
+  - @checkstack/backend-api@0.20.0
+  - @checkstack/cache-api@0.3.8
+  - @checkstack/queue-api@0.3.8
+  - @checkstack/signal-backend@0.2.12
 ## 0.14.0
 ### Minor Changes

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@checkstack/backend",
-  "version": "0.14.0",
+  "version": "0.15.0",
   "license": "Elastic-2.0",
   "checkstack": {
     "type": "backend"

package/src/db.ts CHANGED Viewed

@@ -1,6 +1,7 @@
 import { drizzle } from "drizzle-orm/node-postgres";
-import { Pool } from "pg";
+import { Pool, type PoolConfig } from "pg";
 import * as schema from "./schema";
+import { rootLogger } from "./logger";
 // Basic connection string sometimes fails with Bun + pg + docker SASL
 // parsing manually or relying on pg to pick up ENV variables if we don't pass anything
@@ -13,8 +14,114 @@ if (!connectionString) {
   throw new Error("DATABASE_URL is not defined");
 }
-export const adminPool = new Pool({
+/**
+ * Read a positive-integer setting from the environment.
+ *
+ * The value must consist solely of decimal digits (surrounding whitespace and
+ * a leading `+` are tolerated). `Number.parseInt` on its own is too lenient for
+ * configuration: it stops at the first non-digit, so `"1e4"` would be read as
+ * `1` and `"20ms"` as `20`. Such values are treated as invalid here.
+ *
+ * @param name - Environment variable to read.
+ * @param fallback - Returned when the variable is unset, empty, malformed,
+ *   zero, or too large to be represented as a safe integer.
+ * @returns The parsed integer, or `fallback`.
+ */
+function intFromEnv(name: string, fallback: number): number {
+  const raw = process.env[name]?.trim();
+  // Validate the WHOLE string; this also rejects "", "-5", "10.5" and "1e4".
+  if (raw === undefined || !/^\+?\d+$/.test(raw)) return fallback;
+  const parsed = Number(raw);
+  return Number.isSafeInteger(parsed) && parsed > 0 ? parsed : fallback;
+}
+/**
+ * ## Connection budget (read this before bumping the defaults)
+ *
+ * The platform runs as N horizontally-scaled pods sharing ONE Postgres. The
+ * server-wide ceiling is `max_connections`, so the real budget is:
+ *
+ *   N_pods * (adminPool.max + lockPool.max) <= max_connections - headroom
+ *
+ * Size the pools off that budget (via the env vars below), NOT off the number
+ * of plugins: connections are never pinned per-plugin. The scoped-db proxy sets
+ * `SET LOCAL search_path` per transaction on a borrowed-then-returned
+ * connection, so a connection is occupied only for one transaction (or one
+ * held advisory lock), never for a plugin's lifetime.
+ *
+ * ## Why two pools
+ *
+ * Session/transaction advisory locks (see `advisory-lock.ts`) HOLD a connection
+ * for the lock's whole lifetime while the locked work runs on a *different*
+ * connection. If both came from one pool, then under concurrency >= pool size
+ * every slot becomes a lock-holding connection waiting for a work connection
+ * that can never free up - a self-inflicted deadlock (observed: 10 connections
+ * all `idle in transaction` on `pg_advisory_xact_lock`). Giving advisory locks
+ * their own `lockPool` makes the acquire graph acyclic (lockPool -> adminPool,
+ * never back), so that deadlock class is impossible.
+ *
+ * `connectionTimeoutMillis` is the universal safety net: a pg Pool defaults to
+ * waiting FOREVER for a free connection, which turns any future exhaustion into
+ * a permanent hang (and an upstream 502). With a finite timeout, an exhausted
+ * acquire throws, its holder unwinds and releases, and the pool self-heals.
+ */
+// Settings shared by both pools: each `new Pool(...)` in this file spreads this
+// object and adds its own `max`. `satisfies` type-checks the literal against
+// `PoolConfig` without widening the constant's inferred type.
+const COMMON_POOL_CONFIG = {
   connectionString,
+  // Override with `DATABASE_POOL_CONNECTION_TIMEOUT_MS`; 10_000 when unset/invalid.
+  connectionTimeoutMillis: intFromEnv(
+    "DATABASE_POOL_CONNECTION_TIMEOUT_MS",
+    10_000,
+  ),
+  // Override with `DATABASE_POOL_IDLE_TIMEOUT_MS`; 30_000 when unset/invalid.
+  idleTimeoutMillis: intFromEnv("DATABASE_POOL_IDLE_TIMEOUT_MS", 30_000),
+} satisfies PoolConfig;
+/**
+ * The main pool: serves every plugin query and all API request work. Never used
+ * to hold an advisory lock open (that is `lockPool`'s job).
+ *
+ * Also the client behind the exported drizzle `db` instance in this module.
+ */
+export const adminPool = new Pool({
+  ...COMMON_POOL_CONFIG,
+  // Maximum clients in this pool; override with `DATABASE_POOL_MAX` (default 20).
+  max: intFromEnv("DATABASE_POOL_MAX", 20),
+});
+/**
+ * Dedicated pool for advisory locks ONLY (the session-lock service and
+ * `withXactLock`). Kept separate from `adminPool` so a held lock connection
+ * and the work it gates draw from different pools - see the deadlock note
+ * above. Sized for peak concurrent held locks INCLUDING nesting: one logical
+ * operation can hold up to two locks at once (an automation run lock wrapping
+ * an `incident.dedupe-open-for-system` lock), so this needs headroom above the
+ * raw concurrent-operation count.
+ *
+ * ## Stall backstops (server-enforced, can't be skipped by a hung process)
+ *
+ * Advisory locks lock no rows or tables - only other callers of the SAME key
+ * block. But a critical section whose `fn` hangs (e.g. an unbounded await)
+ * would hold its key + this connection open indefinitely, blocking same-key
+ * callers. Two connection-level timeouts bound that, set ONLY on this pool
+ * (where every transaction is a short lock critical section, so the limits are
+ * safe; the admin pool runs arbitrary plugin work and must NOT inherit them):
+ *
+ *   - `idle_in_transaction_session_timeout`: the lock transaction sits "idle in
+ *     transaction" for the whole time `fn` runs (it only issued BEGIN + the
+ *     lock). If `fn` hangs past this, Postgres ABORTS the session, which
+ *     auto-releases the advisory lock and frees the connection - so a stall
+ *     self-heals instead of stranding the lock forever.
+ *   - `lock_timeout`: a caller BLOCKED waiting to acquire a contended/stalled
+ *     key aborts after this (verified to apply to `pg_advisory_xact_lock`),
+ *     surfacing as a retryable error rather than an indefinite block that also
+ *     ties up a lock-pool connection.
+ *
+ * Both default high enough that a healthy short critical section never trips
+ * them; tune via env if your critical sections are unusually long.
+ */
+export const lockPool = new Pool({
+  ...COMMON_POOL_CONFIG,
+  // Maximum clients in this pool; override with `DATABASE_LOCK_POOL_MAX`
+  // (default 10).
+  max: intFromEnv("DATABASE_LOCK_POOL_MAX", 10),
+  // Override with `DATABASE_LOCK_IDLE_TX_TIMEOUT_MS` (default 30_000).
+  idle_in_transaction_session_timeout: intFromEnv(
+    "DATABASE_LOCK_IDLE_TX_TIMEOUT_MS",
+    30_000,
+  ),
+  // Override with `DATABASE_LOCK_TIMEOUT_MS` (default 30_000).
+  lock_timeout: intFromEnv("DATABASE_LOCK_TIMEOUT_MS", 30_000),
+});
+// A pg Pool emits 'error' on behalf of IDLE clients whose backend dies (admin
+// termination, failover, network drop). With no listener, that 'error' is
+// unhandled and CRASHES the process. Log and swallow: the pool discards the
+// dead client and hands out a fresh one on the next checkout. (Checked-out
+// clients are covered separately - the scoped-db transaction wrapper and the
+// advisory-lock service attach their own per-client handlers while held.)
+adminPool.on("error", (error) => {
+  rootLogger.warn("adminPool idle client error (recovered)", error);
+});
+// Registered separately: each Pool is its own emitter, so the listener above
+// does not cover the lock pool.
+lockPool.on("error", (error) => {
+  rootLogger.warn("lockPool idle client error (recovered)", error);
 });
 export const db = drizzle({ client: adminPool, schema });

package/src/plugin-manager/core-services.ts CHANGED Viewed

@@ -14,7 +14,7 @@ import {
 import { AuthApi } from "@checkstack/auth-common";
 import type { ServiceRegistry } from "../services/service-registry";
 import { rootLogger } from "../logger";
-import { db } from "../db";
+import { db, lockPool } from "../db";
 import { jwtService } from "../services/jwt";
 import {
   CoreHealthCheckRegistry,
@@ -98,11 +98,14 @@ export function registerCoreServices({
     return createScopedDb(db, assignedSchema);
   });
-  // 1b. Advisory Lock Factory (server-global, backed by the shared admin
-  // pool). Session locks need connection affinity, so the service checks
-  // out a dedicated client per acquired lock and releases on the SAME
-  // client — the scoped per-query DB proxy can't provide that.
-  const advisoryLockService = createAdvisoryLockService(adminPool);
+  // 1b. Advisory Lock Factory (server-global, backed by the DEDICATED
+  // `lockPool`, NOT `adminPool`). Both session locks (`tryAcquire`) and the
+  // transaction-scoped `withXactLock` HOLD a connection for the lock's whole
+  // lifetime while the locked work runs on `adminPool`. Drawing the lock
+  // connection from a separate pool keeps the acquire graph acyclic
+  // (lockPool -> adminPool, never back), so a held lock can never starve the
+  // work pool into the `idle in transaction` deadlock. See `db.ts`.
+  const advisoryLockService = createAdvisoryLockService(lockPool);
   registry.registerFactory(
     coreServices.advisoryLock,
     () => advisoryLockService,