@checkstack/incident-backend 1.4.0 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,65 @@
1
1
  # @checkstack/incident-backend
2
2
 
3
+ ## 1.5.0
4
+
5
+ ### Minor Changes
6
+
7
+ - a57f7db: fix(backend): give advisory locks a dedicated connection pool to prevent pool-starvation deadlock
8
+
9
+ Both the session-lock service and `withXactLock` HOLD a Postgres connection for
10
+ the lock's whole lifetime while the gated work runs on a _different_ connection.
11
+ Both lock and work were drawing from the single shared `adminPool` (which, with
12
+ no explicit config, defaulted to `max: 10` and `connectionTimeoutMillis: 0` -
13
+ wait forever). Under concurrency >= pool size, every slot became a lock-holding
14
+ connection waiting for a work connection that could never free up: a permanent
15
+ deadlock. It surfaced as all connections stuck `idle in transaction` on
16
+ `pg_advisory_xact_lock` and every API request hanging into an upstream 502,
17
+ only after the server had been running long enough to hit that concurrency
18
+ (e.g. a burst of health-check evaluations or incident dedups).
19
+
20
+ Advisory locks now run on a dedicated `lockPool`, separate from `adminPool`, so
21
+ the acquire graph is acyclic (`lockPool -> adminPool`, never back) and the
22
+ deadlock class is impossible. `AdvisoryLockService` gains a pooled
23
+ `withXactLock({ key, fn })` method (lock on the lock pool, work on the admin
24
+ pool); healthcheck's per-system serializer, incident's dedup-create, and the
25
+ automation single-mode concurrency lock now use it. The deadlock-prone
26
+ standalone `withXactLock({ db, ... })` helper is REMOVED.
27
+
28
+ Both pools are explicitly configured with `connectionTimeoutMillis` so any
29
+ future exhaustion fails fast and self-heals instead of hanging, and both get a
30
+ pool-level `error` handler (an idle pooled client whose backend dies otherwise
31
+ crashes the pod). The lock pool additionally sets
32
+ `idle_in_transaction_session_timeout` and `lock_timeout` so a stalled critical
33
+ section is reaped server-side (auto-releasing the lock) rather than stranding a
34
+ key forever. The advisory-lock service also now removes its per-client error
35
+ listener on release (it previously leaked one listener per acquisition on each
36
+ reused pooled connection - an unbounded `MaxListenersExceededWarning` leak).
37
+
38
+ New env vars (all optional): `DATABASE_POOL_MAX` (default 20),
39
+ `DATABASE_LOCK_POOL_MAX` (default 10), `DATABASE_POOL_CONNECTION_TIMEOUT_MS`
40
+ (default 10000), `DATABASE_POOL_IDLE_TIMEOUT_MS` (default 30000),
41
+ `DATABASE_LOCK_IDLE_TX_TIMEOUT_MS` (default 30000), `DATABASE_LOCK_TIMEOUT_MS`
42
+ (default 30000). Size pools so that
43
+ `N_pods * (DATABASE_POOL_MAX + DATABASE_LOCK_POOL_MAX) <= max_connections`.
44
+
45
+ BREAKING CHANGE: the standalone `withXactLock({ db, key, fn })` export is
46
+ removed - use `coreServices.advisoryLock.withXactLock({ key, fn })` instead.
47
+ `IncidentService`'s constructor now requires an `AdvisoryLockService` as its
48
+ second argument, and the healthcheck `createHealthEntitySerializer` /
49
+ `executeHealthCheckJob` / `setupHealthCheckWorker` helpers take `advisoryLock`
50
+ instead of `db` for the serializer.
51
+
52
+ ### Patch Changes
53
+
54
+ - Updated dependencies [a57f7db]
55
+ - @checkstack/backend-api@0.20.0
56
+ - @checkstack/automation-backend@0.4.0
57
+ - @checkstack/cache-api@0.3.8
58
+ - @checkstack/catalog-backend@1.3.1
59
+ - @checkstack/command-backend@0.1.33
60
+ - @checkstack/integration-backend@0.3.1
61
+ - @checkstack/cache-utils@0.2.13
62
+
3
63
  ## 1.4.0
4
64
 
5
65
  ### Minor Changes
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@checkstack/incident-backend",
3
- "version": "1.4.0",
3
+ "version": "1.5.0",
4
4
  "license": "Elastic-2.0",
5
5
  "type": "module",
6
6
  "main": "src/index.ts",
package/src/index.ts CHANGED
@@ -127,6 +127,7 @@ export default createBackendPlugin({
127
127
  rpcClient: coreServices.rpcClient,
128
128
  signalService: coreServices.signalService,
129
129
  cacheManager: coreServices.cacheManager,
130
+ advisoryLock: coreServices.advisoryLock,
130
131
  },
131
132
  init: async ({
132
133
  logger,
@@ -135,6 +136,7 @@ export default createBackendPlugin({
135
136
  rpcClient,
136
137
  signalService,
137
138
  cacheManager,
139
+ advisoryLock,
138
140
  }) => {
139
141
  logger.debug("🔧 Initializing Incident Backend...");
140
142
 
@@ -144,6 +146,7 @@ export default createBackendPlugin({
144
146
 
145
147
  const service = new IncidentService(
146
148
  database as SafeDatabase<typeof schema>,
149
+ advisoryLock,
147
150
  );
148
151
  // Publish the service for the PLUGIN-BACKED entity `read` accessor
149
152
  // (defined in register()). Mutations only run from here onward.
@@ -208,9 +211,14 @@ export default createBackendPlugin({
208
211
  // associations) + register subscription specs. Per-system /
209
212
  // per-group notification group lifecycle is fully owned by
210
213
  // notification-backend now — incident never touches it.
211
- afterPluginsReady: async ({ database, logger, rpcClient }) => {
214
+ afterPluginsReady: async ({
215
+ database,
216
+ logger,
217
+ rpcClient,
218
+ advisoryLock,
219
+ }) => {
212
220
  const typedDb = database as SafeDatabase<typeof schema>;
213
- const service = new IncidentService(typedDb);
221
+ const service = new IncidentService(typedDb, advisoryLock);
214
222
  const notificationClient = rpcClient.forPlugin(NotificationApi);
215
223
 
216
224
  await Promise.all([
@@ -1,4 +1,5 @@
1
1
  import { describe, it, expect, mock, beforeEach } from "bun:test";
2
+ import type { AdvisoryLockService } from "@checkstack/backend-api";
2
3
  import { IncidentService } from "./service";
3
4
  import {
4
5
  incidents,
@@ -7,6 +8,40 @@ import {
7
8
  incidentLinks,
8
9
  } from "./schema";
9
10
 
11
+ /**
12
+ * In-memory {@link AdvisoryLockService} that faithfully serializes
13
+ * `withXactLock` calls per key (a racing call on the same key cannot run its
14
+ * `fn` until the prior call's `fn` settles) — modelling `pg_advisory_xact_lock`
15
+ * without a real connection. Different keys are independent.
16
+ */
17
+ function makeFakeAdvisoryLock(): AdvisoryLockService {
18
+ const tails = new Map<string, Promise<unknown>>();
19
+ return {
20
+ tryAcquire: async () => ({ release: async () => {} }),
21
+ withXactLock<T>({
22
+ key,
23
+ fn,
24
+ }: {
25
+ key: string;
26
+ fn: () => Promise<T>;
27
+ }): Promise<T> {
28
+ const prior = tails.get(key) ?? Promise.resolve();
29
+ const result = prior.then(
30
+ () => fn(),
31
+ () => fn(),
32
+ );
33
+ tails.set(
34
+ key,
35
+ result.then(
36
+ () => undefined,
37
+ () => undefined,
38
+ ),
39
+ );
40
+ return result;
41
+ },
42
+ };
43
+ }
44
+
10
45
  /**
11
46
  * Programmable mock DB that records each `select(...).from(...).where(...)`
12
47
  * (and optional `.limit(...)`) chain and returns a configurable row array
@@ -48,7 +83,7 @@ describe("IncidentService.hasActiveIncidentWithSuppression", () => {
48
83
 
49
84
  const setup = (resultsByCall: unknown[][]) => {
50
85
  dbHelper = createProgrammableSelectDb(resultsByCall);
51
- service = new IncidentService(dbHelper.db as never);
86
+ service = new IncidentService(dbHelper.db as never, makeFakeAdvisoryLock());
52
87
  };
53
88
 
54
89
  beforeEach(() => {
@@ -134,7 +169,7 @@ describe("IncidentService.hasActiveIncidentWithSuppression", () => {
134
169
  describe("IncidentService.getManyEntityStates (plugin-backed entity read)", () => {
135
170
  it("returns {} for an empty id set without querying", async () => {
136
171
  const dbHelper = createProgrammableSelectDb([]);
137
- const service = new IncidentService(dbHelper.db as never);
172
+ const service = new IncidentService(dbHelper.db as never, makeFakeAdvisoryLock());
138
173
  expect(await service.getManyEntityStates([])).toEqual({});
139
174
  expect(dbHelper.getCallCount()).toBe(0);
140
175
  });
@@ -153,7 +188,7 @@ describe("IncidentService.getManyEntityStates (plugin-backed entity read)", () =
153
188
  { incidentId: "inc-2", systemId: "sys-c" },
154
189
  ],
155
190
  ]);
156
- const service = new IncidentService(dbHelper.db as never);
191
+ const service = new IncidentService(dbHelper.db as never, makeFakeAdvisoryLock());
157
192
  const out = await service.getManyEntityStates(["inc-1", "inc-2", "inc-x"]);
158
193
  expect(out).toEqual({
159
194
  "inc-1": {
@@ -172,7 +207,7 @@ describe("IncidentService.getManyEntityStates (plugin-backed entity read)", () =
172
207
  // incidents query returns nothing → no second query.
173
208
  [],
174
209
  ]);
175
- const service = new IncidentService(dbHelper.db as never);
210
+ const service = new IncidentService(dbHelper.db as never, makeFakeAdvisoryLock());
176
211
  expect(await service.getManyEntityStates(["ghost"])).toEqual({});
177
212
  expect(dbHelper.getCallCount()).toBe(1);
178
213
  });
@@ -182,7 +217,7 @@ describe("IncidentService.getManyEntityStates (plugin-backed entity read)", () =
182
217
  [{ id: "inc-1", status: "monitoring", severity: "critical" }],
183
218
  [], // no junction rows
184
219
  ]);
185
- const service = new IncidentService(dbHelper.db as never);
220
+ const service = new IncidentService(dbHelper.db as never, makeFakeAdvisoryLock());
186
221
  const out = await service.getManyEntityStates(["inc-1"]);
187
222
  expect(out["inc-1"]).toEqual({
188
223
  status: "monitoring",
@@ -300,7 +335,7 @@ function createDedupFakeDb() {
300
335
  describe("IncidentService.createIncidentDedupedForSystem (M3)", () => {
301
336
  it("two concurrent dedupe creates for one system open exactly ONE incident", async () => {
302
337
  const { db, store } = createDedupFakeDb();
303
- const service = new IncidentService(db as never);
338
+ const service = new IncidentService(db as never, makeFakeAdvisoryLock());
304
339
 
305
340
  const input = {
306
341
  title: "Down",
package/src/service.ts CHANGED
@@ -1,5 +1,5 @@
1
1
  import { eq, and, inArray, ne } from "drizzle-orm";
2
- import { withXactLock, type SafeDatabase } from "@checkstack/backend-api";
2
+ import type { AdvisoryLockService, SafeDatabase } from "@checkstack/backend-api";
3
3
  import * as schema from "./schema";
4
4
  import {
5
5
  incidents,
@@ -27,7 +27,10 @@ function generateId(): string {
27
27
  }
28
28
 
29
29
  export class IncidentService {
30
- constructor(private db: Db) {}
30
+ constructor(
31
+ private db: Db,
32
+ private advisoryLock: AdvisoryLockService,
33
+ ) {}
31
34
 
32
35
  /**
33
36
  * List incidents with optional filters
@@ -528,15 +531,16 @@ export class IncidentService {
528
531
  create: () => Promise<IncidentWithSystems>,
529
532
  ) => Promise<IncidentWithSystems> = (create) => create(),
530
533
  ): Promise<{ incident: IncidentWithSystems; reused: boolean }> {
531
- return withXactLock({
532
- db: this.db,
534
+ return this.advisoryLock.withXactLock({
533
535
  key: `incident.dedupe-open-for-system:${dedupeSystemId}`,
534
- // The find + create run on `this.db` (the pool), NOT on `tx`. That is
535
- // safe here because `pg_advisory_xact_lock` BLOCKS every other holder
536
- // of this key until this transaction commits: a racing caller waits
537
- // at lock-acquire, so its find can't observe "no open incident" until
538
- // ours has already committed the insert. The critical section is thus
539
- // serialized by the lock window even though it doesn't ride `tx`.
536
+ // The find + create deliberately run on `this.db` (the admin pool), NOT
537
+ // on the lock connection. That is safe because `pg_advisory_xact_lock`
538
+ // BLOCKS every other acquirer of this key until this lock transaction
539
+ // commits: a racing caller waits at lock-acquire, so its find can't
540
+ // observe "no open incident" until ours has already committed the
541
+ // insert. Crucially, the lock transaction lives on the DEDICATED lock
542
+ // pool (see `createAdvisoryLockService(lockPool)`), so holding it open
543
+ // while the work runs on the admin pool cannot starve the admin pool.
540
544
  fn: async () => {
541
545
  const existing = await this.findActiveIncidentForSystem(dedupeSystemId);
542
546
  if (existing) {