npm - @checkstack/backend-api - Versions diffs - 0.19.0 → 0.20.0 - Mend

@checkstack/backend-api 0.19.0 → 0.20.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

package/CHANGELOG.md +54 -0
package/package.json +1 -1
package/src/advisory-lock-pool.it.test.ts +282 -0
package/src/advisory-lock.test.ts +144 -3
package/src/advisory-lock.ts +97 -55

package/CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,59 @@
 # @checkstack/backend-api
+## 0.20.0
+### Minor Changes
+- a57f7db: fix(backend): give advisory locks a dedicated connection pool to prevent pool-starvation deadlock
+  Both the session-lock service and `withXactLock` HOLD a Postgres connection for
+  the lock's whole lifetime while the gated work runs on a _different_ connection.
+  Both lock and work were drawing from the single shared `adminPool` (which, with
+  no explicit config, defaulted to `max: 10` and `connectionTimeoutMillis: 0` -
+  wait forever). Under concurrency >= pool size, every slot became a lock-holding
+  connection waiting for a work connection that could never free up: a permanent
+  deadlock. It surfaced as all connections stuck `idle in transaction` on
+  `pg_advisory_xact_lock` and every API request hanging into an upstream 502,
+  only after the server had been running long enough to hit that concurrency
+  (e.g. a burst of health-check evaluations or incident dedups).
+  Advisory locks now run on a dedicated `lockPool`, separate from `adminPool`, so
+  the acquire graph is acyclic (`lockPool -> adminPool`, never back) and the
+  deadlock class is impossible. `AdvisoryLockService` gains a pooled
+  `withXactLock({ key, fn })` method (lock on the lock pool, work on the admin
+  pool); healthcheck's per-system serializer, incident's dedup-create, and the
+  automation single-mode concurrency lock now use it. The deadlock-prone
+  standalone `withXactLock({ db, ... })` helper is REMOVED.
+  Both pools are explicitly configured with `connectionTimeoutMillis` so any
+  future exhaustion fails fast and self-heals instead of hanging, and both get a
+  pool-level `error` handler (an idle pooled client whose backend dies otherwise
+  crashes the pod). The lock pool additionally sets
+  `idle_in_transaction_session_timeout` and `lock_timeout` so a stalled critical
+  section is reaped server-side (auto-releasing the lock) rather than stranding a
+  key forever. The advisory-lock service also now removes its per-client error
+  listener on release (it previously leaked one listener per acquisition on each
+  reused pooled connection - an unbounded `MaxListenersExceeded` leak).
+  New env vars (all optional): `DATABASE_POOL_MAX` (default 20),
+  `DATABASE_LOCK_POOL_MAX` (default 10), `DATABASE_POOL_CONNECTION_TIMEOUT_MS`
+  (default 10000), `DATABASE_POOL_IDLE_TIMEOUT_MS` (default 30000),
+  `DATABASE_LOCK_IDLE_TX_TIMEOUT_MS` (default 30000), `DATABASE_LOCK_TIMEOUT_MS`
+  (default 30000). Size the pools so that
+  `N_pods * (DATABASE_POOL_MAX + DATABASE_LOCK_POOL_MAX) <= max_connections`.
+  BREAKING CHANGE: the standalone `withXactLock({ db, key, fn })` export is
+  removed - use `coreServices.advisoryLock.withXactLock({ key, fn })` instead.
+  `IncidentService`'s constructor now requires an `AdvisoryLockService` as its
+  second argument, and the healthcheck `createHealthEntitySerializer` /
+  `executeHealthCheckJob` / `setupHealthCheckWorker` helpers take `advisoryLock`
+  instead of `db` for the serializer.
+### Patch Changes
+- @checkstack/cache-api@0.3.8
+- @checkstack/queue-api@0.3.8
 ## 0.19.0
 ### Minor Changes

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@checkstack/backend-api",
-  "version": "0.19.0",
+  "version": "0.20.0",
   "license": "Elastic-2.0",
   "type": "module",
   "main": "./src/index.ts",

package/src/advisory-lock-pool.it.test.ts ADDED Viewed

@@ -0,0 +1,282 @@
+/**
+ * Integration test (real Postgres) for the advisory-lock CONNECTION-POOL
+ * contract — the behaviour that silently wedged production and that fakes
+ * cannot model: a held advisory lock keeps its connection checked out while the
+ * gated work runs on a *different* connection, so lock-pool / work-pool sizing
+ * decides whether the system makes progress or deadlocks.
+ *
+ * It pins three things against a live server (plus a stalled-holder reaping test):
+ *
+ *   1. REPRODUCE THE BUG: when the lock and its work share ONE pool, concurrency
+ *      at the pool size deadlocks (every slot is a lock-holder waiting for a
+ *      work connection that can never free up). This is a guard — if a refactor
+ *      makes this stop deadlocking, the throughput test below is no longer
+ *      proving anything.
+ *   2. THE FIX: with the lock on a DEDICATED pool, the same (and much higher)
+ *      concurrency completes with zero failures.
+ *   3. CORRECTNESS ACROSS INSTANCES: independent service instances with their
+ *      OWN pools (simulating N pods on one database) serialize a find-then-
+ *      create on a shared key down to exactly ONE row — with a no-lock control
+ *      proving the lock is what enforces it.
+ *
+ * Gated behind `CHECKSTACK_IT=1`; the integration CI job provides the Postgres
+ * service container. Connection from `CHECKSTACK_IT_PG_URL`.
+ */
+import { afterAll, beforeAll, describe, expect, it } from "bun:test";
+import { Pool } from "pg";
+import { createAdvisoryLockService } from "./advisory-lock";
+const PG_URL =
+  process.env.CHECKSTACK_IT_PG_URL ??
+  "postgres://postgres:postgres@localhost:5432/postgres";
+const DEDUP_TABLE = "it_advisory_dedup";
+describe.skipIf(!process.env.CHECKSTACK_IT)(
+  "advisory-lock pool contract (real Postgres)",
+  () => {
+    /** Pools created during a test; ended by endTrackedPools() (each test, then afterAll). */
+    const tracked: Pool[] = [];
+    function makePool(max: number, connectionTimeoutMillis = 5000): Pool {
+      const pool = new Pool({
+        connectionString: PG_URL,
+        max,
+        connectionTimeoutMillis,
+        idleTimeoutMillis: 1000,
+      });
+      // A held-lock client can error asynchronously (timeout / termination);
+      // swallow so it never surfaces as an unhandled error and fails the file.
+      pool.on("error", () => {});
+      tracked.push(pool);
+      return pool;
+    }
+    async function endTrackedPools(): Promise<void> {
+      await Promise.all(tracked.splice(0).map((p) => p.end().catch(() => {})));
+    }
+    let setupPool: Pool;
+    beforeAll(async () => {
+      setupPool = new Pool({ connectionString: PG_URL });
+      await setupPool.query(
+        `CREATE TABLE IF NOT EXISTS ${DEDUP_TABLE} (lock_key text NOT NULL, id text NOT NULL)`,
+      );
+    });
+    afterAll(async () => {
+      await setupPool.query(`DROP TABLE IF EXISTS ${DEDUP_TABLE}`);
+      await setupPool.end();
+      await endTrackedPools();
+    });
+    /**
+     * Find-then-create on `workPool`: insert a row only if none exists for the key. The 15ms gap
+     * between the read and the write widens the race window so an UNSERIALIZED
+     * run reliably double-inserts — making the lock's effect observable.
+     */
+    async function dedupCreate(workPool: Pool, key: string): Promise<boolean> {
+      const client = await workPool.connect();
+      try {
+        const { rows } = await client.query(
+          `SELECT id FROM ${DEDUP_TABLE} WHERE lock_key = $1 LIMIT 1`,
+          [key],
+        );
+        if (rows.length > 0) return false;
+        await new Promise((r) => setTimeout(r, 15));
+        await client.query(
+          `INSERT INTO ${DEDUP_TABLE} (lock_key, id) VALUES ($1, $2)`,
+          [key, crypto.randomUUID()],
+        );
+        return true;
+      } finally {
+        client.release();
+      }
+    }
+    async function countFor(key: string): Promise<number> {
+      const { rows } = await setupPool.query<{ n: string }>(
+        `SELECT count(*)::text AS n FROM ${DEDUP_TABLE} WHERE lock_key = $1`,
+        [key],
+      );
+      return Number(rows[0]?.n ?? "0");
+    }
+    it(
+      "REPRODUCES the deadlock when lock + work share one pool (the bug)",
+      async () => {
+        const POOL_MAX = 4;
+        // Single shared pool — the pre-fix wiring. The lock client AND the work
+        // client both come from here. Short connect timeout so the deadlock
+        // surfaces as a fast rejection rather than a long hang.
+        const pool = makePool(POOL_MAX, 1500);
+        const svc = createAdvisoryLockService(pool);
+        const runId = crypto.randomUUID();
+        // Exactly POOL_MAX concurrent ops, each on a DISTINCT key (so there is
+        // NO lock contention — the only thing that can stall is connection
+        // accounting). Each holds a lock client, then asks the same pool for a
+        // work client that will never come.
+        const results = await Promise.allSettled(
+          Array.from({ length: POOL_MAX }, (_, i) =>
+            svc.withXactLock({
+              key: `deadlock:${runId}:${i}`,
+              fn: async () => {
+                const c = await pool.connect();
+                try {
+                  await c.query("SELECT 1");
+                } finally {
+                  c.release();
+                }
+              },
+            }),
+          ),
+        );
+        const rejected = results.filter((r) => r.status === "rejected").length;
+        // The deadlock manifests as connection-acquire timeouts on the work
+        // checkout. If this ever becomes 0, the single-pool design no longer
+        // deadlocks and the throughput proof below must be re-examined.
+        expect(rejected).toBeGreaterThan(0);
+        await endTrackedPools();
+      },
+      30_000,
+    );
+    it(
+      "does NOT deadlock under high throughput with a dedicated lock pool (the fix)",
+      async () => {
+        // Deliberately TINY pools so any deadlock would be hit immediately; the
+        // fix is that lock and work draw from different pools.
+        const lockPool = makePool(4);
+        const workPool = makePool(4);
+        const svc = createAdvisoryLockService(lockPool);
+        const runId = crypto.randomUUID();
+        const CONCURRENCY = 200;
+        const results = await Promise.allSettled(
+          Array.from({ length: CONCURRENCY }, (_, i) =>
+            svc.withXactLock({
+              key: `throughput:${runId}:${i}`,
+              fn: async () => {
+                const c = await workPool.connect();
+                try {
+                  await c.query("SELECT 1");
+                } finally {
+                  c.release();
+                }
+              },
+            }),
+          ),
+        );
+        const rejected = results.filter((r) => r.status === "rejected");
+        // Every single operation must complete: no deadlock, no timeout.
+        expect(rejected).toHaveLength(0);
+        await endTrackedPools();
+      },
+      30_000,
+    );
+    it(
+      "serializes find-then-create across INSTANCES to exactly one row",
+      async () => {
+        // Each "pod" is an independent service instance with its OWN pools, all
+        // pointing at the same database — the real multi-instance topology. The
+        // advisory lock space is global to the server, so they must serialize.
+        const PODS = 6;
+        const ATTEMPTS_PER_POD = 5;
+        const key = `dedupe:${crypto.randomUUID()}`;
+        const pods = Array.from({ length: PODS }, () => {
+          const workPool = makePool(2);
+          const svc = createAdvisoryLockService(makePool(2));
+          return { workPool, svc };
+        });
+        const attempts = pods.flatMap((pod) =>
+          Array.from({ length: ATTEMPTS_PER_POD }, () =>
+            pod.svc.withXactLock({
+              key,
+              fn: () => dedupCreate(pod.workPool, key),
+            }),
+          ),
+        );
+        const settled = await Promise.allSettled(attempts);
+        const created = settled.filter(
+          (r) => r.status === "fulfilled" && r.value === true,
+        ).length;
+        // Exactly one attempt created the row; the rest observed it and no-oped.
+        expect(await countFor(key)).toBe(1);
+        expect(created).toBe(1);
+        await endTrackedPools();
+      },
+      30_000,
+    );
+    it(
+      "a STALLED critical section is reaped by idle_in_transaction_session_timeout, freeing the key",
+      async () => {
+        // The lock pool sets a short idle-in-transaction timeout. A held lock
+        // sits "idle in transaction" for the whole time `fn` runs, so a hung
+        // `fn` trips it: Postgres aborts the session, auto-releasing the lock -
+        // proving a stall self-heals instead of stranding the key forever.
+        const lockPool = new Pool({
+          connectionString: PG_URL,
+          max: 4,
+          connectionTimeoutMillis: 5000,
+          idle_in_transaction_session_timeout: 1000,
+        });
+        lockPool.on("error", () => {});
+        tracked.push(lockPool);
+        const svc = createAdvisoryLockService(lockPool);
+        const key = `stall:${crypto.randomUUID()}`;
+        let releaseHang!: () => void;
+        const hang = new Promise<void>((r) => (releaseHang = r));
+        // Holder whose critical section hangs (never issues another query).
+        const stalled = svc
+          .withXactLock({ key, fn: () => hang })
+          .catch(() => "rejected-as-expected");
+        // Wait past the 1s idle timeout so the server reaps the stalled holder.
+        await new Promise((r) => setTimeout(r, 1800));
+        // The key must be acquirable again now that the stalled session was
+        // aborted server-side.
+        const t0 = Date.now();
+        const got = await svc.withXactLock({ key, fn: async () => "ok" });
+        expect(got).toBe("ok");
+        expect(Date.now() - t0).toBeLessThan(3000);
+        releaseHang();
+        await stalled; // let the stalled call unwind (COMMIT fails on dead conn)
+        await endTrackedPools();
+      },
+      30_000,
+    );
+    it(
+      "CONTROL: the same workload WITHOUT the lock races into duplicates",
+      async () => {
+        // Proves the lock — not some incidental ordering — is what enforces
+        // single-creation above. Same widened-window find-then-create, run
+        // concurrently with NO advisory lock, must double-insert.
+        const workPool = makePool(8);
+        const key = `dedupe-nolock:${crypto.randomUUID()}`;
+        await Promise.all(
+          Array.from({ length: 8 }, () => dedupCreate(workPool, key)),
+        );
+        expect(await countFor(key)).toBeGreaterThan(1);
+        await endTrackedPools();
+      },
+      30_000,
+    );
+  },
+);

package/src/advisory-lock.test.ts CHANGED Viewed

@@ -7,8 +7,9 @@ import {
 /**
  * Faithful fake of a `pg.Pool` that models Postgres' per-connection
- * SESSION advisory-lock semantics:
+ * advisory-lock semantics for BOTH lock flavours:
  *
+ * SESSION locks (`tryAcquire`):
  *   - A key can be held by at most one connection at a time.
  *   - `pg_try_advisory_lock` succeeds only if the key is free; it then
  *     binds the key to the acquiring connection.
@@ -16,8 +17,15 @@ import {
  *     (a no-op otherwise) — exactly the bug we are guarding against: an
  *     unlock issued on a different connection does nothing.
  *
- * This lets the test prove the service keeps acquire + release on ONE
- * client.
+ * TRANSACTION locks (`withXactLock`):
+ *   - `pg_advisory_xact_lock` BLOCKS until the key is free, then binds it to
+ *     the acquiring connection's transaction.
+ *   - `COMMIT` / `ROLLBACK` release every xact lock held by that connection
+ *     and wake the next blocked waiter (FIFO) — modelling auto-release and
+ *     the serialization guarantee.
+ *
+ * This lets the tests prove the service keeps acquire + release on ONE
+ * client and that concurrent `withXactLock` callers serialize.
  */
 interface FakePool extends AdvisoryLockPool {
   checkedOut: number;
@@ -27,6 +35,9 @@ interface FakePool extends AdvisoryLockPool {
 function makeFakePool(): FakePool {
   // key -> owning connection id (or absent if free)
   const heldBy = new Map<string, number>();
+  // xact key -> owning connection id; waiters queued FIFO per key.
+  const xactHeldBy = new Map<string, number>();
+  const xactWaiters = new Map<string, Array<() => void>>();
   let nextConnId = 0;
   const counters = { checkedOut: 0, released: 0 };
@@ -46,8 +57,21 @@ function makeFakePool(): FakePool {
     async connect(): Promise<AdvisoryLockPoolClient> {
       const connId = nextConnId++;
       counters.checkedOut++;
+      const releaseXactLocks = () => {
+        for (const [key, owner] of [...xactHeldBy.entries()]) {
+          if (owner !== connId) continue;
+          xactHeldBy.delete(key);
+          const next = xactWaiters.get(key)?.shift();
+          if (next) next();
+        }
+      };
       return {
         async query<T>(queryText: string, values?: unknown[]) {
+          if (queryText === "BEGIN") return { rows: [] };
+          if (queryText === "COMMIT" || queryText === "ROLLBACK") {
+            releaseXactLocks();
+            return { rows: [] };
+          }
           const key = keyOf(values);
           if (queryText.includes("pg_try_advisory_lock")) {
             const owner = heldBy.get(key);
@@ -55,6 +79,22 @@ function makeFakePool(): FakePool {
             if (ok) heldBy.set(key, connId);
             return { rows: [{ ok } as unknown as T] };
           }
+          if (queryText.includes("pg_advisory_xact_lock")) {
+            if (!xactHeldBy.has(key)) {
+              xactHeldBy.set(key, connId);
+              return { rows: [] };
+            }
+            // Blocked: enqueue and wait until a holder commits/rolls back.
+            await new Promise<void>((resolve) => {
+              const q = xactWaiters.get(key) ?? [];
+              q.push(() => {
+                xactHeldBy.set(key, connId);
+                resolve();
+              });
+              xactWaiters.set(key, q);
+            });
+            return { rows: [] };
+          }
           if (queryText.includes("pg_advisory_unlock")) {
             // Only the owning connection can release — model the leak bug.
             if (heldBy.get(key) === connId) heldBy.delete(key);
@@ -70,6 +110,10 @@ function makeFakePool(): FakePool {
           // `on('error')` hardening is exercised by the IT against real
           // Postgres (killing the holding connection).
         },
+        off() {
+          // Counterpart to `on`; the service detaches its error listener on
+          // release. No-op here since the fake never attaches one.
+        },
       };
     },
   };
@@ -130,3 +174,100 @@ describe("createAdvisoryLockService", () => {
     await b!.release();
   });
 });
+describe("createAdvisoryLockService.withXactLock", () => {
+  const tick = (ms = 5) => new Promise((r) => setTimeout(r, ms));
+  it("runs fn, returns its value, and releases the client", async () => {
+    const pool = makeFakePool();
+    const svc = createAdvisoryLockService(pool);
+    const result = await svc.withXactLock({ key: "k", fn: async () => 42 });
+    expect(result).toBe(42);
+    expect(pool.checkedOut).toBe(1);
+    expect(pool.released).toBe(1);
+  });
+  it("serializes concurrent calls on the same key (second fn waits for first to commit)", async () => {
+    const pool = makeFakePool();
+    const svc = createAdvisoryLockService(pool);
+    const order: string[] = [];
+    let releaseFirst!: () => void;
+    const firstHeld = new Promise<void>((r) => (releaseFirst = r));
+    const p1 = svc.withXactLock({
+      key: "k",
+      fn: async () => {
+        order.push("1-start");
+        await firstHeld;
+        order.push("1-end");
+      },
+    });
+    // Let p1 acquire the lock before p2 attempts it.
+    await tick();
+    const p2 = svc.withXactLock({
+      key: "k",
+      fn: async () => {
+        order.push("2-start");
+      },
+    });
+    // While p1 holds the lock, p2's fn must NOT have started.
+    await tick();
+    expect(order).toEqual(["1-start"]);
+    releaseFirst();
+    await Promise.all([p1, p2]);
+    expect(order).toEqual(["1-start", "1-end", "2-start"]);
+    expect(pool.released).toBe(2);
+  });
+  it("rolls back and releases the client when fn throws, freeing the lock", async () => {
+    const pool = makeFakePool();
+    const svc = createAdvisoryLockService(pool);
+    await expect(
+      svc.withXactLock({
+        key: "k",
+        fn: async () => {
+          throw new Error("boom");
+        },
+      }),
+    ).rejects.toThrow("boom");
+    // Lock was released on rollback: a subsequent acquire succeeds promptly.
+    const after = await svc.withXactLock({ key: "k", fn: async () => "ok" });
+    expect(after).toBe("ok");
+    expect(pool.released).toBe(2);
+  });
+  it("different keys do not serialize", async () => {
+    const pool = makeFakePool();
+    const svc = createAdvisoryLockService(pool);
+    const started: string[] = [];
+    let release!: () => void;
+    const held = new Promise<void>((r) => (release = r));
+    const pA = svc.withXactLock({
+      key: "a",
+      fn: async () => {
+        started.push("a");
+        await held;
+      },
+    });
+    await tick();
+    // Key "b" must run even while "a" is still held.
+    await svc.withXactLock({
+      key: "b",
+      fn: async () => {
+        started.push("b");
+      },
+    });
+    expect(started).toContain("b");
+    release();
+    await pA;
+  });
+});

package/src/advisory-lock.ts CHANGED Viewed

@@ -18,19 +18,25 @@
  *     (e.g. an installer election held across a minutes-long `bun install`)
  *     where a long-open transaction would be unacceptable.
  *
- *   - {@link withXactLock} wraps acquire + work + release in a single
- *     transaction using `pg_advisory_xact_lock`, which auto-releases at
- *     COMMIT/ROLLBACK. Use this for SHORT critical sections (e.g. a
- *     find-then-create dedup) where holding a transaction for the duration
- *     is fine and the auto-release removes any chance of a leak.
+ *   - {@link AdvisoryLockService.withXactLock} wraps acquire + work + release
+ *     in a single transaction using `pg_advisory_xact_lock`, which auto-
+ *     releases at COMMIT/ROLLBACK. Use this for SHORT critical sections (e.g. a
+ *     find-then-create dedup) where holding a transaction for the duration is
+ *     fine and the auto-release removes any chance of a leak.
+ *
+ * BOTH run on the service's pool, which MUST be a pool dedicated to advisory
+ * locks (separate from the pool the locked work runs on). A held lock keeps its
+ * connection checked out for the lock's lifetime; if lock and work shared one
+ * pool, concurrency >= pool size would deadlock (every slot a lock-holder
+ * waiting for a work connection). The backend wires this to a dedicated
+ * `lockPool`; that pool also sets `idle_in_transaction_session_timeout` /
+ * `lock_timeout` so a stalled critical section cannot strand a lock forever.
  *
  * Keys are arbitrary strings hashed to Postgres' 64-bit lock space via
  * `hashtextextended(key, 0)`. Callers SHOULD namespace keys (e.g.
  * `"script-packages.installer"`, `"incident.dedupe:<systemId>"`) since the
  * advisory-lock space is global to the database server, not schema-scoped.
  */
-import { sql } from "drizzle-orm";
-import type { SafeDatabase } from "./plugin-system";
 /**
  * Minimal pool surface this module needs. Modelled on `pg.Pool` /
@@ -45,13 +51,22 @@ export interface AdvisoryLockPoolClient {
   /** Return the client to the pool. */
   release(): void;
   /**
-   * Subscribe to async client errors. A session-lock client is held for a long
-   * time; if its backend dies (admin termination, failover, network drop) `pg`
-   * emits `'error'` on the client, and an `'error'` with no listener is
-   * re-thrown by the EventEmitter and would crash the pod. We attach a listener
-   * so that loss degrades gracefully instead. Modelled on `pg.Client.on`.
+   * Subscribe to async client errors. A held client (session lock, or an open
+   * xact-lock transaction) is checked out for a while; if its backend dies
+   * (admin termination, failover, network drop) `pg` emits `'error'` on the
+   * client, and an `'error'` with no listener is re-thrown by the EventEmitter
+   * and would crash the pod. We attach a listener so that loss degrades
+   * gracefully instead. Modelled on `pg.Client.on`.
    */
   on(event: "error", listener: (err: Error) => void): void;
+  /**
+   * Detach a previously-attached error listener. MUST be called before
+   * returning the client to the pool: pooled clients are reused, so attaching a
+   * fresh listener on every checkout WITHOUT removing it on release leaks one
+   * listener per acquisition on each long-lived physical connection (an
+   * unbounded `MaxListenersExceeded` leak). Modelled on `pg.Client.off`.
+   */
+  off(event: "error", listener: (err: Error) => void): void;
 }
 export interface AdvisoryLockPool {
@@ -76,8 +91,30 @@ export interface AdvisoryLockService {
    * `finally`.
    */
   tryAcquire(key: string): Promise<AdvisoryLockHandle | null>;
+  /**
+   * Run `fn` while holding a transaction-scoped advisory lock for `key`,
+   * acquired with `pg_advisory_xact_lock` (which BLOCKS until granted) on a
+   * dedicated client from THIS service's pool, and auto-released when that
+   * transaction commits/rolls back after `fn` settles.
+   *
+   * The lock transaction runs on this service's (dedicated lock) pool, while
+   * `fn` does its real work on whatever database it already holds (typically
+   * the shared admin pool). Because the held lock connection and the work
+   * connection come from DIFFERENT pools, the nested acquisition can never
+   * deadlock the work pool. Use this for SHORT critical sections that gate a
+   * read-then-write on another connection.
+   */
+  withXactLock<T>(args: { key: string; fn: () => Promise<T> }): Promise<T>;
 }
+/**
+ * Shared no-op `'error'` listener for held clients. A single module-level
+ * reference (rather than a fresh closure per acquisition) is what lets `off`
+ * detach exactly the listener `on` attached, and avoids allocating one per
+ * lock. It captures nothing, so sharing it is safe.
+ */
+const swallowClientError = (): void => {};
 /**
  * Build an {@link AdvisoryLockService} backed by a pool. The backend
 * provides its dedicated lock pool (not the shared admin pool); tests can provide a faithful fake that
@@ -95,8 +132,13 @@ export function createAdvisoryLockService(
       // here; without a listener the process crashes. Swallow it - the session
       // lock is auto-released server-side when the backend dies, and a stale
       // `release()` is already a no-op-safe `finally`, so the loss surfaces as
-      // the key simply becoming acquirable again.
-      client.on("error", () => {});
+      // the key simply becoming acquirable again. The listener is removed on
+      // release so it does not accumulate on the reused pooled connection.
+      client.on("error", swallowClientError);
+      const releaseClient = () => {
+        client.off("error", swallowClientError);
+        client.release();
+      };
       let acquired = false;
       try {
         const result = await client.query<{ ok: boolean }>(
@@ -105,14 +147,14 @@ export function createAdvisoryLockService(
         );
         acquired = Boolean(result.rows[0]?.ok);
       } catch (error) {
-        client.release();
+        releaseClient();
         throw error;
       }
       if (!acquired) {
         // Did not get the lock — return the client immediately. (A failed
         // pg_try_advisory_lock acquires nothing, so there is nothing to
         // unlock.)
-        client.release();
+        releaseClient();
         return null;
       }
@@ -127,48 +169,48 @@ export function createAdvisoryLockService(
               [key],
             );
           } finally {
-            client.release();
+            releaseClient();
           }
         },
       };
     },
-  };
-}
-/**
- * Run `fn` while holding a transaction-scoped advisory lock for `key`. The
- * lock is acquired with `pg_advisory_xact_lock` (which BLOCKS until granted)
- * inside a transaction and auto-released at COMMIT/ROLLBACK, so there is no
- * unlock to leak. Use only for SHORT critical sections — the lock is held
- * for the whole transaction.
- *
- * Because the scoped DB runs an entire `transaction()` callback on a single
- * dedicated connection, the lock + the work + the implicit release all share
- * one session, which is exactly the affinity session locks require.
- *
- * `fn` receives the transaction handle `tx` and MUST run its
- * read-then-write critical section on it (not on the outer pool). Running
- * the work on the pool would put it on a DIFFERENT connection than the one
- * holding the lock — so two concurrent callers' critical sections could
- * interleave even though both "hold" the lock. Using `tx` keeps the
- * read-check + write atomic with respect to the lock.
- */
-export async function withXactLock<
-  S extends Record<string, unknown>,
-  T,
->({
-  db,
-  key,
-  fn,
-}: {
-  db: SafeDatabase<S>;
-  key: string;
-  fn: (tx: Parameters<Parameters<SafeDatabase<S>["transaction"]>[0]>[0]) => Promise<T>;
-}): Promise<T> {
-  return db.transaction(async (tx) => {
-    await tx.execute(
-      sql`SELECT pg_advisory_xact_lock(hashtextextended(${key}, 0))`,
-    );
-    return fn(tx);
-  });
+    async withXactLock({ key, fn }) {
+      const client = await pool.connect();
+      // Same rationale as tryAcquire: the lock transaction keeps this client
+      // checked out (idle in transaction) while `fn` runs, so attach an error
+      // listener to survive a backend termination instead of crashing the pod.
+      // Removed in the finally so it does not accumulate on the reused client.
+      client.on("error", swallowClientError);
+      try {
+        await client.query("BEGIN");
+        try {
+          // BLOCKS on this dedicated client until the lock is granted; auto-
+          // released by the COMMIT/ROLLBACK below. `fn`'s own work runs on a
+          // DIFFERENT pool, so no same-pool nested-acquisition deadlock.
+          await client.query(
+            "SELECT pg_advisory_xact_lock(hashtextextended($1, 0))",
+            [key],
+          );
+          const result = await fn();
+          await client.query("COMMIT");
+          return result;
+        } catch (error) {
+          // Roll back so the xact lock releases and nothing partial lingers on
+          // this connection before it returns to the pool. Best-effort: if the
+          // backend already died, ROLLBACK throws but release() still frees the
+          // slot and the lock is auto-released server-side.
+          try {
+            await client.query("ROLLBACK");
+          } catch (rollbackError) {
+            void rollbackError;
+          }
+          throw error;
+        }
+      } finally {
+        client.off("error", swallowClientError);
+        client.release();
+      }
+    },
+  };
 }