@checkstack/backend-api 0.19.0 → 0.20.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,59 @@
1
1
  # @checkstack/backend-api
2
2
 
3
+ ## 0.20.0
4
+
5
+ ### Minor Changes
6
+
7
+ - a57f7db: fix(backend): give advisory locks a dedicated connection pool to prevent pool-starvation deadlock
8
+
9
+ Both the session-lock service and `withXactLock` HOLD a Postgres connection for
10
+ the lock's whole lifetime while the gated work runs on a _different_ connection.
11
+ Both lock and work were drawing from the single shared `adminPool` (which, with
12
+ no explicit config, defaulted to `max: 10` and `connectionTimeoutMillis: 0` -
13
+ wait forever). Under concurrency >= pool size, every slot became a lock-holding
14
+ connection waiting for a work connection that could never free up: a permanent
15
+ deadlock. It surfaced as all connections stuck `idle in transaction` on
16
+ `pg_advisory_xact_lock` and every API request hanging into an upstream 502,
17
+ only after the server had been running long enough to hit that concurrency
18
+ (e.g. a burst of health-check evaluations or incident dedups).
19
+
20
+ Advisory locks now run on a dedicated `lockPool`, separate from `adminPool`, so
21
+ the acquire graph is acyclic (`lockPool -> adminPool`, never back) and the
22
+ deadlock class is impossible. `AdvisoryLockService` gains a pooled
23
+ `withXactLock({ key, fn })` method (lock on the lock pool, work on the admin
24
+ pool); healthcheck's per-system serializer, incident's dedup-create, and the
25
+ automation single-mode concurrency lock now use it. The deadlock-prone
26
+ standalone `withXactLock({ db, ... })` helper is REMOVED.
27
+
28
+ Both pools are explicitly configured with `connectionTimeoutMillis` so any
29
+ future exhaustion fails fast and self-heals instead of hanging, and both get a
30
+ pool-level `error` handler (an idle pooled client whose backend dies otherwise
31
+ crashes the pod). The lock pool additionally sets
32
+ `idle_in_transaction_session_timeout` and `lock_timeout` so a stalled critical
33
+ section is reaped server-side (auto-releasing the lock) rather than stranding a
34
+ key forever. The advisory-lock service also now removes its per-client error
35
+ listener on release (it previously leaked one listener per acquisition on each
36
+ reused pooled connection - an unbounded `MaxListenersExceeded` leak).
37
+
38
+ New env vars (all optional): `DATABASE_POOL_MAX` (default 20),
39
+ `DATABASE_LOCK_POOL_MAX` (default 10), `DATABASE_POOL_CONNECTION_TIMEOUT_MS`
40
+ (default 10000), `DATABASE_POOL_IDLE_TIMEOUT_MS` (default 30000),
41
+ `DATABASE_LOCK_IDLE_TX_TIMEOUT_MS` (default 30000), `DATABASE_LOCK_TIMEOUT_MS`
42
+ (default 30000). Size the pools so that
43
+ `N_pods * (DATABASE_POOL_MAX + DATABASE_LOCK_POOL_MAX) <= max_connections`.
44
+
45
+ BREAKING CHANGE: the standalone `withXactLock({ db, key, fn })` export is
46
+ removed - use `coreServices.advisoryLock.withXactLock({ key, fn })` instead.
47
+ `IncidentService`'s constructor now requires an `AdvisoryLockService` as its
48
+ second argument, and the healthcheck `createHealthEntitySerializer` /
49
+ `executeHealthCheckJob` / `setupHealthCheckWorker` helpers take `advisoryLock`
50
+ instead of `db` for the serializer.
51
+
52
+ ### Patch Changes
53
+
54
+ - @checkstack/cache-api@0.3.8
55
+ - @checkstack/queue-api@0.3.8
56
+
3
57
  ## 0.19.0
4
58
 
5
59
  ### Minor Changes
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@checkstack/backend-api",
3
- "version": "0.19.0",
3
+ "version": "0.20.0",
4
4
  "license": "Elastic-2.0",
5
5
  "type": "module",
6
6
  "main": "./src/index.ts",
@@ -0,0 +1,282 @@
1
+ /**
2
+ * Integration test (real Postgres) for the advisory-lock CONNECTION-POOL
3
+ * contract — the behaviour that silently wedged production and that fakes
4
+ * cannot model: a held advisory lock keeps its connection checked out while the
5
+ * gated work runs on a *different* connection, so lock-pool / work-pool sizing
6
+ * decides whether the system makes progress or deadlocks.
7
+ *
8
+ * It pins three core things against a live server (a fourth test further down also checks that a stalled lock holder is reaped server-side):
9
+ *
10
+ * 1. REPRODUCE THE BUG: when the lock and its work share ONE pool, concurrency
11
+ * at the pool size deadlocks (every slot is a lock-holder waiting for a
12
+ * work connection that can never free up). This is a guard — if a refactor
13
+ * makes this stop deadlocking, the throughput test below is no longer
14
+ * proving anything.
15
+ * 2. THE FIX: with the lock on a DEDICATED pool, the same (and much higher)
16
+ * concurrency completes with zero failures.
17
+ * 3. CORRECTNESS ACROSS INSTANCES: independent service instances with their
18
+ * OWN pools (simulating N pods on one database) serialize a find-then-
19
+ * create on a shared key down to exactly ONE row — with a no-lock control
20
+ * proving the lock is what enforces it.
21
+ *
22
+ * Gated behind `CHECKSTACK_IT=1`; the integration CI job provides the Postgres
23
+ * service container. Connection from `CHECKSTACK_IT_PG_URL`.
24
+ */
25
+ import { afterAll, beforeAll, describe, expect, it } from "bun:test";
26
+ import { Pool } from "pg";
27
+ import { createAdvisoryLockService } from "./advisory-lock";
28
+
29
+ const PG_URL =
30
+ process.env.CHECKSTACK_IT_PG_URL ??
31
+ "postgres://postgres:postgres@localhost:5432/postgres";
32
+
33
+ const DEDUP_TABLE = "it_advisory_dedup";
34
+
35
+ describe.skipIf(!process.env.CHECKSTACK_IT)(
36
+ "advisory-lock pool contract (real Postgres)",
37
+ () => {
38
+ /** Pools created during a test; each test ends them via `endTrackedPools()`, which `afterAll` also calls as a safety net. */
39
+ const tracked: Pool[] = [];
40
+ function makePool(max: number, connectionTimeoutMillis = 5000): Pool {
41
+ const pool = new Pool({
42
+ connectionString: PG_URL,
43
+ max,
44
+ connectionTimeoutMillis,
45
+ idleTimeoutMillis: 1000,
46
+ });
47
+ // A held-lock client can error asynchronously (timeout / termination);
48
+ // swallow so it never surfaces as an unhandled error and fails the file.
49
+ pool.on("error", () => {});
50
+ tracked.push(pool);
51
+ return pool;
52
+ }
53
+ async function endTrackedPools(): Promise<void> {
54
+ await Promise.all(tracked.splice(0).map((p) => p.end().catch(() => {})));
55
+ }
56
+
57
+ let setupPool: Pool;
58
+ beforeAll(async () => {
59
+ setupPool = new Pool({ connectionString: PG_URL });
60
+ await setupPool.query(
61
+ `CREATE TABLE IF NOT EXISTS ${DEDUP_TABLE} (lock_key text NOT NULL, id text NOT NULL)`,
62
+ );
63
+ });
64
+ afterAll(async () => {
65
+ await setupPool.query(`DROP TABLE IF EXISTS ${DEDUP_TABLE}`);
66
+ await setupPool.end();
67
+ await endTrackedPools();
68
+ });
69
+
70
+ /**
71
+ * Find-then-create on `workPool`: insert exactly once per key. The 15ms gap
72
+ * between the read and the write widens the race window so an UNSERIALIZED
73
+ * run reliably double-inserts — making the lock's effect observable.
74
+ */
75
+ async function dedupCreate(workPool: Pool, key: string): Promise<boolean> {
76
+ const client = await workPool.connect();
77
+ try {
78
+ const { rows } = await client.query(
79
+ `SELECT id FROM ${DEDUP_TABLE} WHERE lock_key = $1 LIMIT 1`,
80
+ [key],
81
+ );
82
+ if (rows.length > 0) return false;
83
+ await new Promise((r) => setTimeout(r, 15));
84
+ await client.query(
85
+ `INSERT INTO ${DEDUP_TABLE} (lock_key, id) VALUES ($1, $2)`,
86
+ [key, crypto.randomUUID()],
87
+ );
88
+ return true;
89
+ } finally {
90
+ client.release();
91
+ }
92
+ }
93
+
94
+ async function countFor(key: string): Promise<number> {
95
+ const { rows } = await setupPool.query<{ n: string }>(
96
+ `SELECT count(*)::text AS n FROM ${DEDUP_TABLE} WHERE lock_key = $1`,
97
+ [key],
98
+ );
99
+ return Number(rows[0]?.n ?? "0");
100
+ }
101
+
102
+ it(
103
+ "REPRODUCES the deadlock when lock + work share one pool (the bug)",
104
+ async () => {
105
+ const POOL_MAX = 4;
106
+ // Single shared pool — the pre-fix wiring. The lock client AND the work
107
+ // client both come from here. Short connect timeout so the deadlock
108
+ // surfaces as a fast rejection rather than a long hang.
109
+ const pool = makePool(POOL_MAX, 1500);
110
+ const svc = createAdvisoryLockService(pool);
111
+ const runId = crypto.randomUUID();
112
+
113
+ // Exactly POOL_MAX concurrent ops, each on a DISTINCT key (so there is
114
+ // NO lock contention — the only thing that can stall is connection
115
+ // accounting). Each holds a lock client, then asks the same pool for a
116
+ // work client that will never come.
117
+ const results = await Promise.allSettled(
118
+ Array.from({ length: POOL_MAX }, (_, i) =>
119
+ svc.withXactLock({
120
+ key: `deadlock:${runId}:${i}`,
121
+ fn: async () => {
122
+ const c = await pool.connect();
123
+ try {
124
+ await c.query("SELECT 1");
125
+ } finally {
126
+ c.release();
127
+ }
128
+ },
129
+ }),
130
+ ),
131
+ );
132
+
133
+ const rejected = results.filter((r) => r.status === "rejected").length;
134
+ // The deadlock manifests as connection-acquire timeouts on the work
135
+ // checkout. If this ever becomes 0, the single-pool design no longer
136
+ // deadlocks and the throughput proof below must be re-examined.
137
+ expect(rejected).toBeGreaterThan(0);
138
+
139
+ await endTrackedPools();
140
+ },
141
+ 30_000,
142
+ );
143
+
144
+ it(
145
+ "does NOT deadlock under high throughput with a dedicated lock pool (the fix)",
146
+ async () => {
147
+ // Deliberately TINY pools so any deadlock would be hit immediately; the
148
+ // fix is that lock and work draw from different pools.
149
+ const lockPool = makePool(4);
150
+ const workPool = makePool(4);
151
+ const svc = createAdvisoryLockService(lockPool);
152
+ const runId = crypto.randomUUID();
153
+
154
+ const CONCURRENCY = 200;
155
+ const results = await Promise.allSettled(
156
+ Array.from({ length: CONCURRENCY }, (_, i) =>
157
+ svc.withXactLock({
158
+ key: `throughput:${runId}:${i}`,
159
+ fn: async () => {
160
+ const c = await workPool.connect();
161
+ try {
162
+ await c.query("SELECT 1");
163
+ } finally {
164
+ c.release();
165
+ }
166
+ },
167
+ }),
168
+ ),
169
+ );
170
+
171
+ const rejected = results.filter((r) => r.status === "rejected");
172
+ // Every single operation must complete: no deadlock, no timeout.
173
+ expect(rejected).toHaveLength(0);
174
+
175
+ await endTrackedPools();
176
+ },
177
+ 30_000,
178
+ );
179
+
180
+ it(
181
+ "serializes find-then-create across INSTANCES to exactly one row",
182
+ async () => {
183
+ // Each "pod" is an independent service instance with its OWN pools, all
184
+ // pointing at the same database — the real multi-instance topology. The
185
+ // advisory lock space is global to the server, so they must serialize.
186
+ const PODS = 6;
187
+ const ATTEMPTS_PER_POD = 5;
188
+ const key = `dedupe:${crypto.randomUUID()}`;
189
+
190
+ const pods = Array.from({ length: PODS }, () => {
191
+ const workPool = makePool(2);
192
+ const svc = createAdvisoryLockService(makePool(2));
193
+ return { workPool, svc };
194
+ });
195
+
196
+ const attempts = pods.flatMap((pod) =>
197
+ Array.from({ length: ATTEMPTS_PER_POD }, () =>
198
+ pod.svc.withXactLock({
199
+ key,
200
+ fn: () => dedupCreate(pod.workPool, key),
201
+ }),
202
+ ),
203
+ );
204
+
205
+ const settled = await Promise.allSettled(attempts);
206
+ const created = settled.filter(
207
+ (r) => r.status === "fulfilled" && r.value === true,
208
+ ).length;
209
+
210
+ // Exactly one attempt created the row; the rest observed it and no-oped.
211
+ expect(await countFor(key)).toBe(1);
212
+ expect(created).toBe(1);
213
+
214
+ await endTrackedPools();
215
+ },
216
+ 30_000,
217
+ );
218
+
219
+ it(
220
+ "a STALLED critical section is reaped by idle_in_transaction_session_timeout, freeing the key",
221
+ async () => {
222
+ // The lock pool sets a short idle-in-transaction timeout. A held lock
223
+ // sits "idle in transaction" for the whole time `fn` runs, so a hung
224
+ // `fn` trips it: Postgres aborts the session, auto-releasing the lock -
225
+ // proving a stall self-heals instead of stranding the key forever.
226
+ const lockPool = new Pool({
227
+ connectionString: PG_URL,
228
+ max: 4,
229
+ connectionTimeoutMillis: 5000,
230
+ idle_in_transaction_session_timeout: 1000,
231
+ });
232
+ lockPool.on("error", () => {});
233
+ tracked.push(lockPool);
234
+ const svc = createAdvisoryLockService(lockPool);
235
+ const key = `stall:${crypto.randomUUID()}`;
236
+
237
+ let releaseHang!: () => void;
238
+ const hang = new Promise<void>((r) => (releaseHang = r));
239
+
240
+ // Holder whose critical section hangs (never issues another query).
241
+ const stalled = svc
242
+ .withXactLock({ key, fn: () => hang })
243
+ .catch(() => "rejected-as-expected");
244
+
245
+ // Wait past the 1s idle timeout so the server reaps the stalled holder.
246
+ await new Promise((r) => setTimeout(r, 1800));
247
+
248
+ // The key must be acquirable again now that the stalled session was
249
+ // aborted server-side.
250
+ const t0 = Date.now();
251
+ const got = await svc.withXactLock({ key, fn: async () => "ok" });
252
+ expect(got).toBe("ok");
253
+ expect(Date.now() - t0).toBeLessThan(3000);
254
+
255
+ releaseHang();
256
+ await stalled; // let the stalled call unwind (COMMIT fails on dead conn)
257
+ await endTrackedPools();
258
+ },
259
+ 30_000,
260
+ );
261
+
262
+ it(
263
+ "CONTROL: the same workload WITHOUT the lock races into duplicates",
264
+ async () => {
265
+ // Proves the lock — not some incidental ordering — is what enforces
266
+ // single-creation above. Same widened-window find-then-create, run
267
+ // concurrently with NO advisory lock, must double-insert.
268
+ const workPool = makePool(8);
269
+ const key = `dedupe-nolock:${crypto.randomUUID()}`;
270
+
271
+ await Promise.all(
272
+ Array.from({ length: 8 }, () => dedupCreate(workPool, key)),
273
+ );
274
+
275
+ expect(await countFor(key)).toBeGreaterThan(1);
276
+
277
+ await endTrackedPools();
278
+ },
279
+ 30_000,
280
+ );
281
+ },
282
+ );
@@ -7,8 +7,9 @@ import {
7
7
 
8
8
  /**
9
9
  * Faithful fake of a `pg.Pool` that models Postgres' per-connection
10
- * SESSION advisory-lock semantics:
10
+ * advisory-lock semantics for BOTH lock flavours:
11
11
  *
12
+ * SESSION locks (`tryAcquire`):
12
13
  * - A key can be held by at most one connection at a time.
13
14
  * - `pg_try_advisory_lock` succeeds only if the key is free; it then
14
15
  * binds the key to the acquiring connection.
@@ -16,8 +17,15 @@ import {
16
17
  * (a no-op otherwise) — exactly the bug we are guarding against: an
17
18
  * unlock issued on a different connection does nothing.
18
19
  *
19
- * This lets the test prove the service keeps acquire + release on ONE
20
- * client.
20
+ * TRANSACTION locks (`withXactLock`):
21
+ * - `pg_advisory_xact_lock` BLOCKS until the key is free, then binds it to
22
+ * the acquiring connection's transaction.
23
+ * - `COMMIT` / `ROLLBACK` release every xact lock held by that connection
24
+ * and wake the next blocked waiter (FIFO) — modelling auto-release and
25
+ * the serialization guarantee.
26
+ *
27
+ * This lets the tests prove the service keeps acquire + release on ONE
28
+ * client and that concurrent `withXactLock` callers serialize.
21
29
  */
22
30
  interface FakePool extends AdvisoryLockPool {
23
31
  checkedOut: number;
@@ -27,6 +35,9 @@ interface FakePool extends AdvisoryLockPool {
27
35
  function makeFakePool(): FakePool {
28
36
  // key -> owning connection id (or absent if free)
29
37
  const heldBy = new Map<string, number>();
38
+ // xact key -> owning connection id; waiters queued FIFO per key.
39
+ const xactHeldBy = new Map<string, number>();
40
+ const xactWaiters = new Map<string, Array<() => void>>();
30
41
  let nextConnId = 0;
31
42
  const counters = { checkedOut: 0, released: 0 };
32
43
 
@@ -46,8 +57,21 @@ function makeFakePool(): FakePool {
46
57
  async connect(): Promise<AdvisoryLockPoolClient> {
47
58
  const connId = nextConnId++;
48
59
  counters.checkedOut++;
60
+ const releaseXactLocks = () => {
61
+ for (const [key, owner] of [...xactHeldBy.entries()]) {
62
+ if (owner !== connId) continue;
63
+ xactHeldBy.delete(key);
64
+ const next = xactWaiters.get(key)?.shift();
65
+ if (next) next();
66
+ }
67
+ };
49
68
  return {
50
69
  async query<T>(queryText: string, values?: unknown[]) {
70
+ if (queryText === "BEGIN") return { rows: [] };
71
+ if (queryText === "COMMIT" || queryText === "ROLLBACK") {
72
+ releaseXactLocks();
73
+ return { rows: [] };
74
+ }
51
75
  const key = keyOf(values);
52
76
  if (queryText.includes("pg_try_advisory_lock")) {
53
77
  const owner = heldBy.get(key);
@@ -55,6 +79,22 @@ function makeFakePool(): FakePool {
55
79
  if (ok) heldBy.set(key, connId);
56
80
  return { rows: [{ ok } as unknown as T] };
57
81
  }
82
+ if (queryText.includes("pg_advisory_xact_lock")) {
83
+ if (!xactHeldBy.has(key)) {
84
+ xactHeldBy.set(key, connId);
85
+ return { rows: [] };
86
+ }
87
+ // Blocked: enqueue and wait until a holder commits/rolls back.
88
+ await new Promise<void>((resolve) => {
89
+ const q = xactWaiters.get(key) ?? [];
90
+ q.push(() => {
91
+ xactHeldBy.set(key, connId);
92
+ resolve();
93
+ });
94
+ xactWaiters.set(key, q);
95
+ });
96
+ return { rows: [] };
97
+ }
58
98
  if (queryText.includes("pg_advisory_unlock")) {
59
99
  // Only the owning connection can release — model the leak bug.
60
100
  if (heldBy.get(key) === connId) heldBy.delete(key);
@@ -70,6 +110,10 @@ function makeFakePool(): FakePool {
70
110
  // `on('error')` hardening is exercised by the IT against real
71
111
  // Postgres (killing the holding connection).
72
112
  },
113
+ off() {
114
+ // Counterpart to `on`; the service detaches its error listener on
115
+ // release. No-op here since the fake never attaches one.
116
+ },
73
117
  };
74
118
  },
75
119
  };
@@ -130,3 +174,100 @@ describe("createAdvisoryLockService", () => {
130
174
  await b!.release();
131
175
  });
132
176
  });
177
+
178
+ describe("createAdvisoryLockService.withXactLock", () => {
179
+ const tick = (ms = 5) => new Promise((r) => setTimeout(r, ms));
180
+
181
+ it("runs fn, returns its value, and releases the client", async () => {
182
+ const pool = makeFakePool();
183
+ const svc = createAdvisoryLockService(pool);
184
+ const result = await svc.withXactLock({ key: "k", fn: async () => 42 });
185
+ expect(result).toBe(42);
186
+ expect(pool.checkedOut).toBe(1);
187
+ expect(pool.released).toBe(1);
188
+ });
189
+
190
+ it("serializes concurrent calls on the same key (second fn waits for first to commit)", async () => {
191
+ const pool = makeFakePool();
192
+ const svc = createAdvisoryLockService(pool);
193
+ const order: string[] = [];
194
+
195
+ let releaseFirst!: () => void;
196
+ const firstHeld = new Promise<void>((r) => (releaseFirst = r));
197
+
198
+ const p1 = svc.withXactLock({
199
+ key: "k",
200
+ fn: async () => {
201
+ order.push("1-start");
202
+ await firstHeld;
203
+ order.push("1-end");
204
+ },
205
+ });
206
+
207
+ // Let p1 acquire the lock before p2 attempts it.
208
+ await tick();
209
+ const p2 = svc.withXactLock({
210
+ key: "k",
211
+ fn: async () => {
212
+ order.push("2-start");
213
+ },
214
+ });
215
+
216
+ // While p1 holds the lock, p2's fn must NOT have started.
217
+ await tick();
218
+ expect(order).toEqual(["1-start"]);
219
+
220
+ releaseFirst();
221
+ await Promise.all([p1, p2]);
222
+ expect(order).toEqual(["1-start", "1-end", "2-start"]);
223
+ expect(pool.released).toBe(2);
224
+ });
225
+
226
+ it("rolls back and releases the client when fn throws, freeing the lock", async () => {
227
+ const pool = makeFakePool();
228
+ const svc = createAdvisoryLockService(pool);
229
+
230
+ await expect(
231
+ svc.withXactLock({
232
+ key: "k",
233
+ fn: async () => {
234
+ throw new Error("boom");
235
+ },
236
+ }),
237
+ ).rejects.toThrow("boom");
238
+
239
+ // Lock was released on rollback: a subsequent acquire succeeds promptly.
240
+ const after = await svc.withXactLock({ key: "k", fn: async () => "ok" });
241
+ expect(after).toBe("ok");
242
+ expect(pool.released).toBe(2);
243
+ });
244
+
245
+ it("different keys do not serialize", async () => {
246
+ const pool = makeFakePool();
247
+ const svc = createAdvisoryLockService(pool);
248
+ const started: string[] = [];
249
+
250
+ let release!: () => void;
251
+ const held = new Promise<void>((r) => (release = r));
252
+
253
+ const pA = svc.withXactLock({
254
+ key: "a",
255
+ fn: async () => {
256
+ started.push("a");
257
+ await held;
258
+ },
259
+ });
260
+ await tick();
261
+ // Key "b" must run even while "a" is still held.
262
+ await svc.withXactLock({
263
+ key: "b",
264
+ fn: async () => {
265
+ started.push("b");
266
+ },
267
+ });
268
+ expect(started).toContain("b");
269
+
270
+ release();
271
+ await pA;
272
+ });
273
+ });
@@ -18,19 +18,25 @@
18
18
  * (e.g. an installer election held across a minutes-long `bun install`)
19
19
  * where a long-open transaction would be unacceptable.
20
20
  *
21
- * - {@link withXactLock} wraps acquire + work + release in a single
22
- * transaction using `pg_advisory_xact_lock`, which auto-releases at
23
- * COMMIT/ROLLBACK. Use this for SHORT critical sections (e.g. a
24
- * find-then-create dedup) where holding a transaction for the duration
25
- * is fine and the auto-release removes any chance of a leak.
21
+ * - {@link AdvisoryLockService.withXactLock} wraps acquire + work + release
22
+ * in a single transaction using `pg_advisory_xact_lock`, which auto-
23
+ * releases at COMMIT/ROLLBACK. Use this for SHORT critical sections (e.g. a
24
+ * find-then-create dedup) where holding a transaction for the duration is
25
+ * fine and the auto-release removes any chance of a leak.
26
+ *
27
+ * BOTH run on the service's pool, which MUST be a pool dedicated to advisory
28
+ * locks (separate from the pool the locked work runs on). A held lock keeps its
29
+ * connection checked out for the lock's lifetime; if lock and work shared one
30
+ * pool, concurrency >= pool size would deadlock (every slot a lock-holder
31
+ * waiting for a work connection). The backend wires this to a dedicated
32
+ * `lockPool`; that pool also sets `idle_in_transaction_session_timeout` /
33
+ * `lock_timeout` so a stalled critical section cannot strand a lock forever.
26
34
  *
27
35
  * Keys are arbitrary strings hashed to Postgres' 64-bit lock space via
28
36
  * `hashtextextended(key, 0)`. Callers SHOULD namespace keys (e.g.
29
37
  * `"script-packages.installer"`, `"incident.dedupe:<systemId>"`) since the
30
38
  * advisory-lock space is global to the database server, not schema-scoped.
31
39
  */
32
- import { sql } from "drizzle-orm";
33
- import type { SafeDatabase } from "./plugin-system";
34
40
 
35
41
  /**
36
42
  * Minimal pool surface this module needs. Modelled on `pg.Pool` /
@@ -45,13 +51,22 @@ export interface AdvisoryLockPoolClient {
45
51
  /** Return the client to the pool. */
46
52
  release(): void;
47
53
  /**
48
- * Subscribe to async client errors. A session-lock client is held for a long
49
- * time; if its backend dies (admin termination, failover, network drop) `pg`
50
- * emits `'error'` on the client, and an `'error'` with no listener is
51
- * re-thrown by the EventEmitter and would crash the pod. We attach a listener
52
- * so that loss degrades gracefully instead. Modelled on `pg.Client.on`.
54
+ * Subscribe to async client errors. A held client (session lock, or an open
55
+ * xact-lock transaction) is checked out for a while; if its backend dies
56
+ * (admin termination, failover, network drop) `pg` emits `'error'` on the
57
+ * client, and an `'error'` with no listener is re-thrown by the EventEmitter
58
+ * and would crash the pod. We attach a listener so that loss degrades
59
+ * gracefully instead. Modelled on `pg.Client.on`.
53
60
  */
54
61
  on(event: "error", listener: (err: Error) => void): void;
62
+ /**
63
+ * Detach a previously-attached error listener. MUST be called before
64
+ * returning the client to the pool: pooled clients are reused, so attaching a
65
+ * fresh listener on every checkout WITHOUT removing it on release leaks one
66
+ * listener per acquisition on each long-lived physical connection (an
67
+ * unbounded `MaxListenersExceeded` leak). Modelled on `pg.Client.off`.
68
+ */
69
+ off(event: "error", listener: (err: Error) => void): void;
55
70
  }
56
71
 
57
72
  export interface AdvisoryLockPool {
@@ -76,8 +91,30 @@ export interface AdvisoryLockService {
76
91
  * `finally`.
77
92
  */
78
93
  tryAcquire(key: string): Promise<AdvisoryLockHandle | null>;
94
+ /**
95
+ * Run `fn` while holding a transaction-scoped advisory lock for `key`,
96
+ * acquired with `pg_advisory_xact_lock` (which BLOCKS until granted) on a
97
+ * dedicated client from THIS service's pool, and auto-released when that
98
+ * transaction commits/rolls back after `fn` settles.
99
+ *
100
+ * The lock transaction runs on this service's (dedicated lock) pool, while
101
+ * `fn` does its real work on whatever database it already holds (typically
102
+ * the shared admin pool). Because the held lock connection and the work
103
+ * connection come from DIFFERENT pools, the nested acquisition can never
104
+ * deadlock the work pool. Use this for SHORT critical sections that gate a
105
+ * read-then-write on another connection.
106
+ */
107
+ withXactLock<T>(args: { key: string; fn: () => Promise<T> }): Promise<T>;
79
108
  }
80
109
 
110
+ /**
111
+ * Shared no-op `'error'` listener for held clients. A single module-level
112
+ * reference (rather than a fresh closure per acquisition) is what lets `off`
113
+ * detach exactly the listener `on` attached, and avoids allocating one per
114
+ * lock. It captures nothing, so sharing it is safe.
115
+ */
116
+ const swallowClientError = (): void => {};
117
+
81
118
  /**
82
119
  * Build an {@link AdvisoryLockService} backed by a pool. The backend
83
120
  * provides its dedicated lock pool; tests can provide a faithful fake that
@@ -95,8 +132,13 @@ export function createAdvisoryLockService(
95
132
  // here; without a listener the process crashes. Swallow it - the session
96
133
  // lock is auto-released server-side when the backend dies, and a stale
97
134
  // `release()` is already a no-op-safe `finally`, so the loss surfaces as
98
- // the key simply becoming acquirable again.
99
- client.on("error", () => {});
135
+ // the key simply becoming acquirable again. The listener is removed on
136
+ // release so it does not accumulate on the reused pooled connection.
137
+ client.on("error", swallowClientError);
138
+ const releaseClient = () => {
139
+ client.off("error", swallowClientError);
140
+ client.release();
141
+ };
100
142
  let acquired = false;
101
143
  try {
102
144
  const result = await client.query<{ ok: boolean }>(
@@ -105,14 +147,14 @@ export function createAdvisoryLockService(
105
147
  );
106
148
  acquired = Boolean(result.rows[0]?.ok);
107
149
  } catch (error) {
108
- client.release();
150
+ releaseClient();
109
151
  throw error;
110
152
  }
111
153
  if (!acquired) {
112
154
  // Did not get the lock — return the client immediately. (A failed
113
155
  // pg_try_advisory_lock acquires nothing, so there is nothing to
114
156
  // unlock.)
115
- client.release();
157
+ releaseClient();
116
158
  return null;
117
159
  }
118
160
 
@@ -127,48 +169,48 @@ export function createAdvisoryLockService(
127
169
  [key],
128
170
  );
129
171
  } finally {
130
- client.release();
172
+ releaseClient();
131
173
  }
132
174
  },
133
175
  };
134
176
  },
135
- };
136
- }
137
177
 
138
- /**
139
- * Run `fn` while holding a transaction-scoped advisory lock for `key`. The
140
- * lock is acquired with `pg_advisory_xact_lock` (which BLOCKS until granted)
141
- * inside a transaction and auto-released at COMMIT/ROLLBACK, so there is no
142
- * unlock to leak. Use only for SHORT critical sections — the lock is held
143
- * for the whole transaction.
144
- *
145
- * Because the scoped DB runs an entire `transaction()` callback on a single
146
- * dedicated connection, the lock + the work + the implicit release all share
147
- * one session, which is exactly the affinity session locks require.
148
- *
149
- * `fn` receives the transaction handle `tx` and MUST run its
150
- * read-then-write critical section on it (not on the outer pool). Running
151
- * the work on the pool would put it on a DIFFERENT connection than the one
152
- * holding the lock — so two concurrent callers' critical sections could
153
- * interleave even though both "hold" the lock. Using `tx` keeps the
154
- * read-check + write atomic with respect to the lock.
155
- */
156
- export async function withXactLock<
157
- S extends Record<string, unknown>,
158
- T,
159
- >({
160
- db,
161
- key,
162
- fn,
163
- }: {
164
- db: SafeDatabase<S>;
165
- key: string;
166
- fn: (tx: Parameters<Parameters<SafeDatabase<S>["transaction"]>[0]>[0]) => Promise<T>;
167
- }): Promise<T> {
168
- return db.transaction(async (tx) => {
169
- await tx.execute(
170
- sql`SELECT pg_advisory_xact_lock(hashtextextended(${key}, 0))`,
171
- );
172
- return fn(tx);
173
- });
178
+ async withXactLock({ key, fn }) {
179
+ const client = await pool.connect();
180
+ // Same rationale as tryAcquire: the lock transaction keeps this client
181
+ // checked out (idle in transaction) while `fn` runs, so attach an error
182
+ // listener to survive a backend termination instead of crashing the pod.
183
+ // Removed in the finally so it does not accumulate on the reused client.
184
+ client.on("error", swallowClientError);
185
+ try {
186
+ await client.query("BEGIN");
187
+ try {
188
+ // BLOCKS on this dedicated client until the lock is granted; auto-
189
+ // released by the COMMIT/ROLLBACK below. `fn`'s own work runs on a
190
+ // DIFFERENT pool, so no same-pool nested-acquisition deadlock.
191
+ await client.query(
192
+ "SELECT pg_advisory_xact_lock(hashtextextended($1, 0))",
193
+ [key],
194
+ );
195
+ const result = await fn();
196
+ await client.query("COMMIT");
197
+ return result;
198
+ } catch (error) {
199
+ // Roll back so the xact lock releases and nothing partial lingers on
200
+ // this connection before it returns to the pool. Best-effort: if the
201
+ // backend already died, ROLLBACK throws but release() still frees the
202
+ // slot and the lock is auto-released server-side.
203
+ try {
204
+ await client.query("ROLLBACK");
205
+ } catch (rollbackError) {
206
+ void rollbackError;
207
+ }
208
+ throw error;
209
+ }
210
+ } finally {
211
+ client.off("error", swallowClientError);
212
+ client.release();
213
+ }
214
+ },
215
+ };
174
216
  }