@checkstack/backend 0.14.0 → 0.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,62 @@
1
1
  # @checkstack/backend
2
2
 
3
+ ## 0.15.0
4
+
5
+ ### Minor Changes
6
+
7
+ - a57f7db: fix(backend): give advisory locks a dedicated connection pool to prevent pool-starvation deadlock
8
+
9
+ Both the session-lock service and `withXactLock` HOLD a Postgres connection for
10
+ the lock's whole lifetime while the gated work runs on a _different_ connection.
11
+ Both lock and work were drawing from the single shared `adminPool` (which, with
12
+ no explicit config, defaulted to `max: 10` and `connectionTimeoutMillis: 0` -
13
+ wait forever). Under concurrency >= pool size, every slot became a lock-holding
14
+ connection waiting for a work connection that could never free up: a permanent
15
+ deadlock. It surfaced as all connections stuck `idle in transaction` on
16
+ `pg_advisory_xact_lock` and every API request hanging into an upstream 502,
17
+ only after the server had been running long enough to hit that concurrency
18
+ (e.g. a burst of health-check evaluations or incident dedups).
19
+
20
+ Advisory locks now run on a dedicated `lockPool`, separate from `adminPool`, so
21
+ the acquire graph is acyclic (`lockPool -> adminPool`, never back) and the
22
+ deadlock class is impossible. `AdvisoryLockService` gains a pooled
23
+ `withXactLock({ key, fn })` method (lock on the lock pool, work on the admin
24
+ pool); healthcheck's per-system serializer, incident's dedup-create, and the
25
+ automation single-mode concurrency lock now use it. The deadlock-prone
26
+ standalone `withXactLock({ db, ... })` helper is REMOVED.
27
+
28
+ Both pools are explicitly configured with `connectionTimeoutMillis` so any
29
+ future exhaustion fails fast and self-heals instead of hanging, and both get a
30
+ pool-level `error` handler (an idle pooled client whose backend dies otherwise
31
+ crashes the pod). The lock pool additionally sets
32
+ `idle_in_transaction_session_timeout` and `lock_timeout` so a stalled critical
33
+ section is reaped server-side (auto-releasing the lock) rather than stranding a
34
+ key forever. The advisory-lock service also now removes its per-client error
35
+ listener on release (it previously leaked one listener per acquisition on each
36
+ reused pooled connection - an unbounded `MaxListenersExceeded` leak).
37
+
38
+ New env vars (all optional): `DATABASE_POOL_MAX` (default 20),
39
+ `DATABASE_LOCK_POOL_MAX` (default 10), `DATABASE_POOL_CONNECTION_TIMEOUT_MS`
40
+ (default 10000), `DATABASE_POOL_IDLE_TIMEOUT_MS` (default 30000),
41
+ `DATABASE_LOCK_IDLE_TX_TIMEOUT_MS` (default 30000), `DATABASE_LOCK_TIMEOUT_MS`
42
+ (default 30000). Size the pools so that
43
+ `N_pods * (DATABASE_POOL_MAX + DATABASE_LOCK_POOL_MAX) <= max_connections - headroom`.
44
+
45
+ BREAKING CHANGE: the standalone `withXactLock({ db, key, fn })` export is
46
+ removed - use `coreServices.advisoryLock.withXactLock({ key, fn })` instead.
47
+ `IncidentService`'s constructor now requires an `AdvisoryLockService` as its
48
+ second argument, and the healthcheck `createHealthEntitySerializer` /
49
+ `executeHealthCheckJob` / `setupHealthCheckWorker` helpers take `advisoryLock`
50
+ instead of `db` for the serializer.
51
+
52
+ ### Patch Changes
53
+
54
+ - Updated dependencies [a57f7db]
55
+ - @checkstack/backend-api@0.20.0
56
+ - @checkstack/cache-api@0.3.8
57
+ - @checkstack/queue-api@0.3.8
58
+ - @checkstack/signal-backend@0.2.12
59
+
3
60
  ## 0.14.0
4
61
 
5
62
  ### Minor Changes
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@checkstack/backend",
3
- "version": "0.14.0",
3
+ "version": "0.15.0",
4
4
  "license": "Elastic-2.0",
5
5
  "checkstack": {
6
6
  "type": "backend"
package/src/db.ts CHANGED
@@ -1,6 +1,7 @@
1
1
  import { drizzle } from "drizzle-orm/node-postgres";
2
- import { Pool } from "pg";
2
+ import { Pool, type PoolConfig } from "pg";
3
3
  import * as schema from "./schema";
4
+ import { rootLogger } from "./logger";
4
5
 
5
6
  // Basic connection string sometimes fails with Bun + pg + docker SASL
6
7
  // parsing manually or relying on pg to pick up ENV variables if we don't pass anything
@@ -13,8 +14,114 @@ if (!connectionString) {
13
14
  throw new Error("DATABASE_URL is not defined");
14
15
  }
15
16
 
16
- export const adminPool = new Pool({
17
+ /** Parse a positive-integer env var, falling back to `fallback` when unset, empty, unparsable, or <= 0 (`parseInt` ignores trailing non-digits). */
18
+ function intFromEnv(name: string, fallback: number): number {
19
+ const raw = process.env[name];
20
+ if (raw === undefined || raw === "") return fallback;
21
+ const parsed = Number.parseInt(raw, 10);
22
+ return Number.isFinite(parsed) && parsed > 0 ? parsed : fallback;
23
+ }
24
+
25
+ /**
26
+ * ## Connection budget (read this before bumping the defaults)
27
+ *
28
+ * The platform runs as N horizontally-scaled pods sharing ONE Postgres. The
29
+ * server-wide ceiling is `max_connections`, so the real budget is:
30
+ *
31
+ * N_pods * (adminPool.max + lockPool.max) <= max_connections - headroom
32
+ *
33
+ * Size the pools off that budget (via the env vars below), NOT off the number
34
+ * of plugins: connections are never pinned per-plugin. The scoped-db proxy sets
35
+ * `SET LOCAL search_path` per transaction on a borrowed-then-returned
36
+ * connection, so a connection is occupied only for one transaction (or one
37
+ * held advisory lock), never for a plugin's lifetime.
38
+ *
39
+ * ## Why two pools
40
+ *
41
+ * Session/transaction advisory locks (see `advisory-lock.ts`) HOLD a connection
42
+ * for the lock's whole lifetime while the locked work runs on a *different*
43
+ * connection. If both came from one pool, then under concurrency >= pool size
44
+ * every slot becomes a lock-holding connection waiting for a work connection
45
+ * that can never free up - a self-inflicted deadlock (observed: 10 connections
46
+ * all `idle in transaction` on `pg_advisory_xact_lock`). Giving advisory locks
47
+ * their own `lockPool` makes the acquire graph acyclic (lockPool -> adminPool,
48
+ * never back), so that deadlock class is impossible.
49
+ *
50
+ * `connectionTimeoutMillis` is the universal safety net: a pg Pool defaults to
51
+ * waiting FOREVER for a free connection, which turns any future exhaustion into
52
+ * a permanent hang (and an upstream 502). With a finite timeout, an exhausted
53
+ * acquire throws, its holder unwinds and releases, and the pool self-heals.
54
+ */
55
+ const COMMON_POOL_CONFIG = {
17
56
  connectionString,
57
+ connectionTimeoutMillis: intFromEnv(
58
+ "DATABASE_POOL_CONNECTION_TIMEOUT_MS",
59
+ 10_000,
60
+ ),
61
+ idleTimeoutMillis: intFromEnv("DATABASE_POOL_IDLE_TIMEOUT_MS", 30_000),
62
+ } satisfies PoolConfig;
63
+
64
+ /**
65
+ * The main pool: serves every plugin query and all API request work. Never used
66
+ * to hold an advisory lock open (that is `lockPool`'s job).
67
+ */
68
+ export const adminPool = new Pool({
69
+ ...COMMON_POOL_CONFIG,
70
+ max: intFromEnv("DATABASE_POOL_MAX", 20),
71
+ });
72
+
73
+ /**
74
+ * Dedicated pool for advisory locks ONLY (the session-lock service and
75
+ * `withXactLock`). Kept separate from `adminPool` so a held lock connection
76
+ * and the work it gates draw from different pools - see the deadlock note
77
+ * above. Sized for peak concurrent held locks INCLUDING nesting: one logical
78
+ * operation can hold up to two locks at once (an automation run lock wrapping
79
+ * an `incident.dedupe-open-for-system` lock), so this needs headroom above the
80
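+ * raw concurrent-operation count (a nested acquire takes a second connection from this same pool; if the pool is exhausted, `connectionTimeoutMillis` makes the acquire fail instead of waiting forever).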
81
+ *
82
+ * ## Stall backstops (server-enforced, can't be skipped by a hung process)
83
+ *
84
+ * Advisory locks lock no rows or tables - only other callers of the SAME key
85
+ * block. But a critical section whose `fn` hangs (e.g. an unbounded await)
86
+ * would hold its key + this connection open indefinitely, blocking same-key
87
+ * callers. Two connection-level timeouts bound that, set ONLY on this pool
88
+ * (where every transaction is a short lock critical section, so the limits are
89
+ * safe; the admin pool runs arbitrary plugin work and must NOT inherit them):
90
+ *
91
+ * - `idle_in_transaction_session_timeout`: the lock transaction sits "idle in
92
+ * transaction" for the whole time `fn` runs (it only issued BEGIN + the
93
+ * lock). If `fn` hangs past this, Postgres ABORTS the session, which
94
+ * auto-releases the advisory lock and frees the connection - so a stall
95
+ * self-heals instead of stranding the lock forever.
96
+ * - `lock_timeout`: a caller BLOCKED waiting to acquire a contended/stalled
97
+ * key aborts after this (verified to apply to `pg_advisory_xact_lock`),
98
+ * surfacing as a retryable error rather than an indefinite block that also
99
+ * ties up a lock-pool connection.
100
+ *
101
+ * Both default high enough that a healthy short critical section never trips
102
+ * them; tune via env if your critical sections are unusually long.
103
+ */
104
+ export const lockPool = new Pool({
105
+ ...COMMON_POOL_CONFIG,
106
+ max: intFromEnv("DATABASE_LOCK_POOL_MAX", 10),
107
+ idle_in_transaction_session_timeout: intFromEnv(
108
+ "DATABASE_LOCK_IDLE_TX_TIMEOUT_MS",
109
+ 30_000,
110
+ ),
111
+ lock_timeout: intFromEnv("DATABASE_LOCK_TIMEOUT_MS", 30_000),
112
+ });
113
+
114
+ // A pg Pool emits 'error' on behalf of IDLE clients whose backend dies (admin
115
+ // termination, failover, network drop). With no listener, that 'error' is
116
+ // unhandled and CRASHES the process. Log and swallow: the pool discards the
117
+ // dead client and hands out a fresh one on the next checkout. (Checked-out
118
+ // clients are covered separately - the scoped-db transaction wrapper and the
119
+ // advisory-lock service attach their own per-client handlers while held.)
120
+ adminPool.on("error", (error) => {
121
+ rootLogger.warn("adminPool idle client error (recovered)", error);
122
+ });
123
+ lockPool.on("error", (error) => {
124
+ rootLogger.warn("lockPool idle client error (recovered)", error);
18
125
  });
19
126
 
20
127
  export const db = drizzle({ client: adminPool, schema });
@@ -14,7 +14,7 @@ import {
14
14
  import { AuthApi } from "@checkstack/auth-common";
15
15
  import type { ServiceRegistry } from "../services/service-registry";
16
16
  import { rootLogger } from "../logger";
17
- import { db } from "../db";
17
+ import { db, lockPool } from "../db";
18
18
  import { jwtService } from "../services/jwt";
19
19
  import {
20
20
  CoreHealthCheckRegistry,
@@ -98,11 +98,14 @@ export function registerCoreServices({
98
98
  return createScopedDb(db, assignedSchema);
99
99
  });
100
100
 
101
- // 1b. Advisory Lock Factory (server-global, backed by the shared admin
102
- // pool). Session locks need connection affinity, so the service checks
103
- // out a dedicated client per acquired lock and releases on the SAME
104
- // client - the scoped per-query DB proxy can't provide that.
105
- const advisoryLockService = createAdvisoryLockService(adminPool);
101
+ // 1b. Advisory Lock Factory (server-global, backed by the DEDICATED
102
+ // `lockPool`, NOT `adminPool`). Both session locks (`tryAcquire`) and the
103
+ // transaction-scoped `withXactLock` HOLD a connection for the lock's whole
104
+ // lifetime while the locked work runs on `adminPool`. Drawing the lock
105
+ // connection from a separate pool keeps the acquire graph acyclic
106
+ // (lockPool -> adminPool, never back), so a held lock can never starve the
107
+ // work pool into the `idle in transaction` deadlock. See `db.ts`.
108
+ const advisoryLockService = createAdvisoryLockService(lockPool);
106
109
  registry.registerFactory(
107
110
  coreServices.advisoryLock,
108
111
  () => advisoryLockService,