@checkstack/incident-backend 1.4.0 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +60 -0
- package/package.json +1 -1
- package/src/index.ts +10 -2
- package/src/service.test.ts +41 -6
- package/src/service.ts +14 -10
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,65 @@
|
|
|
1
1
|
# @checkstack/incident-backend
|
|
2
2
|
|
|
3
|
+
## 1.5.0
|
|
4
|
+
|
|
5
|
+
### Minor Changes
|
|
6
|
+
|
|
7
|
+
- a57f7db: fix(backend): give advisory locks a dedicated connection pool to prevent pool-starvation deadlock
|
|
8
|
+
|
|
9
|
+
Both the session-lock service and `withXactLock` HOLD a Postgres connection for
|
|
10
|
+
the lock's whole lifetime while the gated work runs on a _different_ connection.
|
|
11
|
+
Both lock and work were drawing from the single shared `adminPool` (which, with
|
|
12
|
+
no explicit config, defaulted to `max: 10` and `connectionTimeoutMillis: 0`, i.e.
|
|
13
|
+
wait forever). Under concurrency >= pool size, every slot became a lock-holding
|
|
14
|
+
connection waiting for a work connection that could never free up: a permanent
|
|
15
|
+
deadlock. It surfaced as all connections stuck `idle in transaction` on
|
|
16
|
+
`pg_advisory_xact_lock` and every API request hanging into an upstream 502,
|
|
17
|
+
only after the server had been running long enough to hit that concurrency
|
|
18
|
+
(e.g. a burst of health-check evaluations or incident dedups).
|
|
19
|
+
|
|
20
|
+
Advisory locks now run on a dedicated `lockPool`, separate from `adminPool`, so
|
|
21
|
+
the acquire graph is acyclic (`lockPool -> adminPool`, never back) and the
|
|
22
|
+
deadlock class is impossible. `AdvisoryLockService` gains a pooled
|
|
23
|
+
`withXactLock({ key, fn })` method (lock on the lock pool, work on the admin
|
|
24
|
+
pool); healthcheck's per-system serializer, incident's dedup-create, and the
|
|
25
|
+
automation single-mode concurrency lock now use it. The deadlock-prone
|
|
26
|
+
standalone `withXactLock({ db, ... })` helper is REMOVED.
|
|
27
|
+
|
|
28
|
+
Both pools are explicitly configured with `connectionTimeoutMillis` so any
|
|
29
|
+
future exhaustion fails fast and self-heals instead of hanging, and both get a
|
|
30
|
+
pool-level `error` handler (an idle pooled client whose backend dies otherwise
|
|
31
|
+
crashes the pod). The lock pool additionally sets
|
|
32
|
+
`idle_in_transaction_session_timeout` and `lock_timeout` so a stalled critical
|
|
33
|
+
section is reaped server-side (auto-releasing the lock) rather than stranding a
|
|
34
|
+
key forever. The advisory-lock service also now removes its per-client error
|
|
35
|
+
listener on release (it previously leaked one listener per acquisition on each
|
|
36
|
+
reused pooled connection - an unbounded listener leak that eventually triggers Node's `MaxListenersExceededWarning`).
|
|
37
|
+
|
|
38
|
+
New env vars (all optional): `DATABASE_POOL_MAX` (default 20),
|
|
39
|
+
`DATABASE_LOCK_POOL_MAX` (default 10), `DATABASE_POOL_CONNECTION_TIMEOUT_MS`
|
|
40
|
+
(default 10000), `DATABASE_POOL_IDLE_TIMEOUT_MS` (default 30000),
|
|
41
|
+
`DATABASE_LOCK_IDLE_TX_TIMEOUT_MS` (default 30000), `DATABASE_LOCK_TIMEOUT_MS`
|
|
42
|
+
(default 30000). Size the pools so that
|
|
43
|
+
`N_pods * (DATABASE_POOL_MAX + DATABASE_LOCK_POOL_MAX) <= max_connections`.
|
|
44
|
+
|
|
45
|
+
BREAKING CHANGE: the standalone `withXactLock({ db, key, fn })` export is
|
|
46
|
+
removed - use `coreServices.advisoryLock.withXactLock({ key, fn })` instead.
|
|
47
|
+
`IncidentService`'s constructor now requires an `AdvisoryLockService` as its
|
|
48
|
+
second argument, and the healthcheck `createHealthEntitySerializer` /
|
|
49
|
+
`executeHealthCheckJob` / `setupHealthCheckWorker` helpers take `advisoryLock`
|
|
50
|
+
instead of `db` for the serializer.
|
|
51
|
+
|
|
52
|
+
### Patch Changes
|
|
53
|
+
|
|
54
|
+
- Updated dependencies [a57f7db]
|
|
55
|
+
- @checkstack/backend-api@0.20.0
|
|
56
|
+
- @checkstack/automation-backend@0.4.0
|
|
57
|
+
- @checkstack/cache-api@0.3.8
|
|
58
|
+
- @checkstack/catalog-backend@1.3.1
|
|
59
|
+
- @checkstack/command-backend@0.1.33
|
|
60
|
+
- @checkstack/integration-backend@0.3.1
|
|
61
|
+
- @checkstack/cache-utils@0.2.13
|
|
62
|
+
|
|
3
63
|
## 1.4.0
|
|
4
64
|
|
|
5
65
|
### Minor Changes
|
package/package.json
CHANGED
package/src/index.ts
CHANGED
|
@@ -127,6 +127,7 @@ export default createBackendPlugin({
|
|
|
127
127
|
rpcClient: coreServices.rpcClient,
|
|
128
128
|
signalService: coreServices.signalService,
|
|
129
129
|
cacheManager: coreServices.cacheManager,
|
|
130
|
+
advisoryLock: coreServices.advisoryLock,
|
|
130
131
|
},
|
|
131
132
|
init: async ({
|
|
132
133
|
logger,
|
|
@@ -135,6 +136,7 @@ export default createBackendPlugin({
|
|
|
135
136
|
rpcClient,
|
|
136
137
|
signalService,
|
|
137
138
|
cacheManager,
|
|
139
|
+
advisoryLock,
|
|
138
140
|
}) => {
|
|
139
141
|
logger.debug("🔧 Initializing Incident Backend...");
|
|
140
142
|
|
|
@@ -144,6 +146,7 @@ export default createBackendPlugin({
|
|
|
144
146
|
|
|
145
147
|
const service = new IncidentService(
|
|
146
148
|
database as SafeDatabase<typeof schema>,
|
|
149
|
+
advisoryLock,
|
|
147
150
|
);
|
|
148
151
|
// Publish the service for the PLUGIN-BACKED entity `read` accessor
|
|
149
152
|
// (defined in register()). Mutations only run from here onward.
|
|
@@ -208,9 +211,14 @@ export default createBackendPlugin({
|
|
|
208
211
|
// associations) + register subscription specs. Per-system /
|
|
209
212
|
// per-group notification group lifecycle is fully owned by
|
|
210
213
|
// notification-backend now — incident never touches it.
|
|
211
|
-
afterPluginsReady: async ({
|
|
214
|
+
afterPluginsReady: async ({
|
|
215
|
+
database,
|
|
216
|
+
logger,
|
|
217
|
+
rpcClient,
|
|
218
|
+
advisoryLock,
|
|
219
|
+
}) => {
|
|
212
220
|
const typedDb = database as SafeDatabase<typeof schema>;
|
|
213
|
-
const service = new IncidentService(typedDb);
|
|
221
|
+
const service = new IncidentService(typedDb, advisoryLock);
|
|
214
222
|
const notificationClient = rpcClient.forPlugin(NotificationApi);
|
|
215
223
|
|
|
216
224
|
await Promise.all([
|
package/src/service.test.ts
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import { describe, it, expect, mock, beforeEach } from "bun:test";
|
|
2
|
+
import type { AdvisoryLockService } from "@checkstack/backend-api";
|
|
2
3
|
import { IncidentService } from "./service";
|
|
3
4
|
import {
|
|
4
5
|
incidents,
|
|
@@ -7,6 +8,40 @@ import {
|
|
|
7
8
|
incidentLinks,
|
|
8
9
|
} from "./schema";
|
|
9
10
|
|
|
11
|
+
/**
|
|
12
|
+
* In-memory {@link AdvisoryLockService} that faithfully serializes
|
|
13
|
+
* `withXactLock` calls per key (a racing call on the same key cannot run its
|
|
14
|
+
* `fn` until the prior call's `fn` settles) — modelling `pg_advisory_xact_lock`
|
|
15
|
+
* without a real connection. Different keys are independent.
|
|
16
|
+
*/
|
|
17
|
+
function makeFakeAdvisoryLock(): AdvisoryLockService {
|
|
18
|
+
const tails = new Map<string, Promise<unknown>>();
|
|
19
|
+
return {
|
|
20
|
+
tryAcquire: async () => ({ release: async () => {} }),
|
|
21
|
+
withXactLock<T>({
|
|
22
|
+
key,
|
|
23
|
+
fn,
|
|
24
|
+
}: {
|
|
25
|
+
key: string;
|
|
26
|
+
fn: () => Promise<T>;
|
|
27
|
+
}): Promise<T> {
|
|
28
|
+
const prior = tails.get(key) ?? Promise.resolve();
|
|
29
|
+
const result = prior.then(
|
|
30
|
+
() => fn(),
|
|
31
|
+
() => fn(),
|
|
32
|
+
);
|
|
33
|
+
tails.set(
|
|
34
|
+
key,
|
|
35
|
+
result.then(
|
|
36
|
+
() => undefined,
|
|
37
|
+
() => undefined,
|
|
38
|
+
),
|
|
39
|
+
);
|
|
40
|
+
return result;
|
|
41
|
+
},
|
|
42
|
+
};
|
|
43
|
+
}
|
|
44
|
+
|
|
10
45
|
/**
|
|
11
46
|
* Programmable mock DB that records each `select(...).from(...).where(...)`
|
|
12
47
|
* (and optional `.limit(...)`) chain and returns a configurable row array
|
|
@@ -48,7 +83,7 @@ describe("IncidentService.hasActiveIncidentWithSuppression", () => {
|
|
|
48
83
|
|
|
49
84
|
const setup = (resultsByCall: unknown[][]) => {
|
|
50
85
|
dbHelper = createProgrammableSelectDb(resultsByCall);
|
|
51
|
-
service = new IncidentService(dbHelper.db as never);
|
|
86
|
+
service = new IncidentService(dbHelper.db as never, makeFakeAdvisoryLock());
|
|
52
87
|
};
|
|
53
88
|
|
|
54
89
|
beforeEach(() => {
|
|
@@ -134,7 +169,7 @@ describe("IncidentService.hasActiveIncidentWithSuppression", () => {
|
|
|
134
169
|
describe("IncidentService.getManyEntityStates (plugin-backed entity read)", () => {
|
|
135
170
|
it("returns {} for an empty id set without querying", async () => {
|
|
136
171
|
const dbHelper = createProgrammableSelectDb([]);
|
|
137
|
-
const service = new IncidentService(dbHelper.db as never);
|
|
172
|
+
const service = new IncidentService(dbHelper.db as never, makeFakeAdvisoryLock());
|
|
138
173
|
expect(await service.getManyEntityStates([])).toEqual({});
|
|
139
174
|
expect(dbHelper.getCallCount()).toBe(0);
|
|
140
175
|
});
|
|
@@ -153,7 +188,7 @@ describe("IncidentService.getManyEntityStates (plugin-backed entity read)", () =
|
|
|
153
188
|
{ incidentId: "inc-2", systemId: "sys-c" },
|
|
154
189
|
],
|
|
155
190
|
]);
|
|
156
|
-
const service = new IncidentService(dbHelper.db as never);
|
|
191
|
+
const service = new IncidentService(dbHelper.db as never, makeFakeAdvisoryLock());
|
|
157
192
|
const out = await service.getManyEntityStates(["inc-1", "inc-2", "inc-x"]);
|
|
158
193
|
expect(out).toEqual({
|
|
159
194
|
"inc-1": {
|
|
@@ -172,7 +207,7 @@ describe("IncidentService.getManyEntityStates (plugin-backed entity read)", () =
|
|
|
172
207
|
// incidents query returns nothing → no second query.
|
|
173
208
|
[],
|
|
174
209
|
]);
|
|
175
|
-
const service = new IncidentService(dbHelper.db as never);
|
|
210
|
+
const service = new IncidentService(dbHelper.db as never, makeFakeAdvisoryLock());
|
|
176
211
|
expect(await service.getManyEntityStates(["ghost"])).toEqual({});
|
|
177
212
|
expect(dbHelper.getCallCount()).toBe(1);
|
|
178
213
|
});
|
|
@@ -182,7 +217,7 @@ describe("IncidentService.getManyEntityStates (plugin-backed entity read)", () =
|
|
|
182
217
|
[{ id: "inc-1", status: "monitoring", severity: "critical" }],
|
|
183
218
|
[], // no junction rows
|
|
184
219
|
]);
|
|
185
|
-
const service = new IncidentService(dbHelper.db as never);
|
|
220
|
+
const service = new IncidentService(dbHelper.db as never, makeFakeAdvisoryLock());
|
|
186
221
|
const out = await service.getManyEntityStates(["inc-1"]);
|
|
187
222
|
expect(out["inc-1"]).toEqual({
|
|
188
223
|
status: "monitoring",
|
|
@@ -300,7 +335,7 @@ function createDedupFakeDb() {
|
|
|
300
335
|
describe("IncidentService.createIncidentDedupedForSystem (M3)", () => {
|
|
301
336
|
it("two concurrent dedupe creates for one system open exactly ONE incident", async () => {
|
|
302
337
|
const { db, store } = createDedupFakeDb();
|
|
303
|
-
const service = new IncidentService(db as never);
|
|
338
|
+
const service = new IncidentService(db as never, makeFakeAdvisoryLock());
|
|
304
339
|
|
|
305
340
|
const input = {
|
|
306
341
|
title: "Down",
|
package/src/service.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { eq, and, inArray, ne } from "drizzle-orm";
|
|
2
|
-
import {
|
|
2
|
+
import type { AdvisoryLockService, SafeDatabase } from "@checkstack/backend-api";
|
|
3
3
|
import * as schema from "./schema";
|
|
4
4
|
import {
|
|
5
5
|
incidents,
|
|
@@ -27,7 +27,10 @@ function generateId(): string {
|
|
|
27
27
|
}
|
|
28
28
|
|
|
29
29
|
export class IncidentService {
|
|
30
|
-
constructor(
|
|
30
|
+
constructor(
|
|
31
|
+
private db: Db,
|
|
32
|
+
private advisoryLock: AdvisoryLockService,
|
|
33
|
+
) {}
|
|
31
34
|
|
|
32
35
|
/**
|
|
33
36
|
* List incidents with optional filters
|
|
@@ -528,15 +531,16 @@ export class IncidentService {
|
|
|
528
531
|
create: () => Promise<IncidentWithSystems>,
|
|
529
532
|
) => Promise<IncidentWithSystems> = (create) => create(),
|
|
530
533
|
): Promise<{ incident: IncidentWithSystems; reused: boolean }> {
|
|
531
|
-
return withXactLock({
|
|
532
|
-
db: this.db,
|
|
534
|
+
return this.advisoryLock.withXactLock({
|
|
533
535
|
key: `incident.dedupe-open-for-system:${dedupeSystemId}`,
|
|
534
|
-
// The find + create run on `this.db` (the pool), NOT
|
|
535
|
-
// safe
|
|
536
|
-
// of this key until this transaction
|
|
537
|
-
// at lock-acquire, so its find can't
|
|
538
|
-
// ours has already committed the
|
|
539
|
-
//
|
|
536
|
+
// The find + create deliberately run on `this.db` (the admin pool), NOT
|
|
537
|
+
// on the lock connection. That is safe because `pg_advisory_xact_lock`
|
|
538
|
+
// BLOCKS every other holder of this key until this lock transaction
|
|
539
|
+
// commits: a racing caller waits at lock-acquire, so its find can't
|
|
540
|
+
// observe "no open incident" until ours has already committed the
|
|
541
|
+
// insert. Crucially, the lock transaction lives on the DEDICATED lock
|
|
542
|
+
// pool (see `createAdvisoryLockService(lockPool)`), so holding it open
|
|
543
|
+
// while the work runs on the admin pool cannot starve the admin pool.
|
|
540
544
|
fn: async () => {
|
|
541
545
|
const existing = await this.findActiveIncidentForSystem(dedupeSystemId);
|
|
542
546
|
if (existing) {
|