@checkstack/healthcheck-backend 1.4.0 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,85 @@
1
1
  # @checkstack/healthcheck-backend
2
2
 
3
+ ## 1.5.0
4
+
5
+ ### Minor Changes
6
+
7
+ - a57f7db: fix(backend): give advisory locks a dedicated connection pool to prevent pool-starvation deadlock
8
+
9
+ Both the session-lock service and `withXactLock` HOLD a Postgres connection for
10
+ the lock's whole lifetime while the gated work runs on a _different_ connection.
11
+ Both lock and work were drawing from the single shared `adminPool` (which, with
12
+ no explicit config, defaulted to `max: 10` and `connectionTimeoutMillis: 0` -
13
+ wait forever). Under concurrency >= pool size, every slot became a lock-holding
14
+ connection waiting for a work connection that could never free up: a permanent
15
+ deadlock. It surfaced as all connections stuck `idle in transaction` on
16
+ `pg_advisory_xact_lock` and every API request hanging into an upstream 502,
17
+ only after the server had been running long enough to hit that concurrency
18
+ (e.g. a burst of health-check evaluations or incident dedups).
19
+
20
+ Advisory locks now run on a dedicated `lockPool`, separate from `adminPool`, so
21
+ the acquire graph is acyclic (`lockPool -> adminPool`, never back) and the
22
+ deadlock class is impossible. `AdvisoryLockService` gains a pooled
23
+ `withXactLock({ key, fn })` method (lock on the lock pool, work on the admin
24
+ pool); healthcheck's per-system serializer, incident's dedup-create, and the
25
+ automation single-mode concurrency lock now use it. The deadlock-prone
26
+ standalone `withXactLock({ db, ... })` helper is REMOVED.
27
+
28
+ Both pools are explicitly configured with `connectionTimeoutMillis` so any
29
+ future exhaustion fails fast and self-heals instead of hanging, and both get a
30
+ pool-level `error` handler (without one, an idle pooled client whose backend
31
+ dies crashes the pod). The lock pool additionally sets
32
+ `idle_in_transaction_session_timeout` and `lock_timeout` so a stalled critical
33
+ section is reaped server-side (auto-releasing the lock) rather than stranding a
34
+ key forever. The advisory-lock service also now removes its per-client error
35
+ listener on release (it previously leaked one listener per acquisition on each
36
+ reused pooled connection - an unbounded `MaxListenersExceeded` leak).
37
+
38
+ New env vars (all optional): `DATABASE_POOL_MAX` (default 20),
39
+ `DATABASE_LOCK_POOL_MAX` (default 10), `DATABASE_POOL_CONNECTION_TIMEOUT_MS`
40
+ (default 10000), `DATABASE_POOL_IDLE_TIMEOUT_MS` (default 30000),
41
+ `DATABASE_LOCK_IDLE_TX_TIMEOUT_MS` (default 30000), `DATABASE_LOCK_TIMEOUT_MS`
42
+ (default 30000). Size the pools so that
43
+ `N_pods * (DATABASE_POOL_MAX + DATABASE_LOCK_POOL_MAX) <= max_connections`.
44
+
45
+ BREAKING CHANGE: the standalone `withXactLock({ db, key, fn })` export is
46
+ removed - use `coreServices.advisoryLock.withXactLock({ key, fn })` instead.
47
+ `IncidentService`'s constructor now requires an `AdvisoryLockService` as its
48
+ second argument, and the healthcheck `createHealthEntitySerializer` /
49
+ `executeHealthCheckJob` / `setupHealthCheckWorker` helpers take `advisoryLock`
50
+ instead of `db` for the serializer.
51
+
52
+ - 0d9e5d8: fix: stop a single transient health check failure from escalating to "unhealthy"
53
+
54
+ In consecutive threshold mode, when a run failed but the failure streak had
55
+ not yet reached the configured degraded threshold (and there were not yet
56
+ enough successes to confirm healthy), the evaluator fell back to the raw
57
+ status of the latest run. A single failing run (e.g. a check timeout) that
58
+ recovered on the next run therefore flipped the system to "unhealthy" and
59
+ fired a spurious "System health critical" notification before the configured
60
+ consecutive-failure count (default 2 for degraded, 5 for unhealthy) was
61
+ reached.
62
+
63
+ The evaluator now falls back to "healthy" in this case, matching window mode's
64
+ behaviour and the intent of the thresholds: a transient blip below the
65
+ degraded threshold no longer escalates the system status.
66
+
67
+ ### Patch Changes
68
+
69
+ - Updated dependencies [a57f7db]
70
+ - @checkstack/backend-api@0.20.0
71
+ - @checkstack/incident-backend@1.5.0
72
+ - @checkstack/automation-backend@0.4.0
73
+ - @checkstack/secrets-backend@0.1.1
74
+ - @checkstack/cache-api@0.3.8
75
+ - @checkstack/catalog-backend@1.3.1
76
+ - @checkstack/command-backend@0.1.33
77
+ - @checkstack/gitops-backend@0.4.1
78
+ - @checkstack/queue-api@0.3.8
79
+ - @checkstack/satellite-backend@0.5.1
80
+ - @checkstack/script-packages-backend@0.2.1
81
+ - @checkstack/cache-utils@0.2.13
82
+
3
83
  ## 1.4.0
4
84
 
5
85
  ### Minor Changes
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@checkstack/healthcheck-backend",
3
- "version": "1.4.0",
3
+ "version": "1.5.0",
4
4
  "license": "Elastic-2.0",
5
5
  "type": "module",
6
6
  "main": "src/index.ts",
@@ -663,36 +663,32 @@ describe("per-system serialization (Defect 2 regression)", () => {
663
663
  expect(emitted[0].next.status).toBe("unhealthy");
664
664
  });
665
665
 
666
- it("createHealthEntitySerializer keys the advisory lock `health:<systemId>` and runs work in a transaction", async () => {
667
- // Intercept `db.transaction` + the advisory-lock SQL the serializer's
668
- // `withXactLock` issues. The fake runs `fn(tx)` inline (single connection),
669
- // mirroring `withXactLock`'s single-session contract. We assert the
670
- // namespaced key flows into `pg_advisory_xact_lock(...)`.
671
- const executedKeys: string[] = [];
672
- let transactionRan = false;
673
- const fakeDb = {
674
- transaction: async (
675
- cb: (tx: { execute: (q: unknown) => Promise<void> }) => Promise<unknown>,
676
- ) => {
677
- transactionRan = true;
678
- return cb({
679
- execute: async (q) => {
680
- // The bound key is a plain string chunk in the drizzle template.
681
- const chunks = (q as { queryChunks?: unknown[] }).queryChunks ?? [];
682
- for (const c of chunks) {
683
- if (typeof c === "string") executedKeys.push(c);
684
- }
685
- },
686
- });
666
+ it("createHealthEntitySerializer routes work through the advisory lock keyed `health:<systemId>`", async () => {
667
+ // The serializer now delegates to the shared AdvisoryLockService's
668
+ // `withXactLock` (lock held on the dedicated lock pool, work on the admin
669
+ // pool). Assert the per-system namespaced key flows through and `fn` runs.
670
+ const keys: string[] = [];
671
+ const advisoryLock = {
672
+ tryAcquire: async () => ({ release: async () => {} }),
673
+ withXactLock<T>({
674
+ key,
675
+ fn,
676
+ }: {
677
+ key: string;
678
+ fn: () => Promise<T>;
679
+ }): Promise<T> {
680
+ keys.push(key);
681
+ return fn();
687
682
  },
688
- } as unknown as Parameters<typeof createHealthEntitySerializer>[0]["db"];
683
+ } satisfies Parameters<
684
+ typeof createHealthEntitySerializer
685
+ >[0]["advisoryLock"];
689
686
 
690
- const serializer = createHealthEntitySerializer({ db: fakeDb });
687
+ const serializer = createHealthEntitySerializer({ advisoryLock });
691
688
  const result = await serializer("sys-42")(async () => "ok");
692
689
 
693
690
  expect(result).toBe("ok");
694
- expect(transactionRan).toBe(true);
695
691
  // The advisory lock was acquired with the per-system namespaced key.
696
- expect(executedKeys).toContain("health:sys-42");
692
+ expect(keys).toContain("health:sys-42");
697
693
  });
698
694
  });
@@ -23,7 +23,7 @@
23
23
  */
24
24
  import { z } from "zod";
25
25
  import { HealthCheckStatusSchema } from "@checkstack/healthcheck-common";
26
- import { withXactLock, type SafeDatabase } from "@checkstack/backend-api";
26
+ import type { AdvisoryLockService } from "@checkstack/backend-api";
27
27
  import type {
28
28
  EntityChangeDeriver,
29
29
  EntityChangePayloadMapper,
@@ -31,11 +31,6 @@ import type {
31
31
  EntityRead,
32
32
  } from "@checkstack/automation-backend";
33
33
  import type { HealthCheckService } from "./service";
34
- import * as schema from "./schema";
35
- // Re-export the change type through automation-backend's barrel (it
36
- // re-exports it from automation-common) so this domain needs no extra dep.
37
-
38
- type Db = SafeDatabase<typeof schema>;
39
34
 
40
35
  /** Entity kind id for the per-system aggregated health. */
41
36
  export const HEALTH_ENTITY_KIND = "health";
@@ -360,10 +355,13 @@ export function healthSystemLockKey(systemId: string): string {
360
355
  * commits.
361
356
  */
362
357
  export function createHealthEntitySerializer(deps: {
363
- db: Db;
358
+ advisoryLock: AdvisoryLockService;
364
359
  }): (systemId: string) => <T>(fn: () => Promise<T>) => Promise<T> {
365
- const { db } = deps;
360
+ const { advisoryLock } = deps;
366
361
  return (systemId) =>
367
362
  <T>(fn: () => Promise<T>) =>
368
- withXactLock({ db, key: healthSystemLockKey(systemId), fn: () => fn() });
363
+ advisoryLock.withXactLock({
364
+ key: healthSystemLockKey(systemId),
365
+ fn: () => fn(),
366
+ });
369
367
  }
package/src/index.ts CHANGED
@@ -198,6 +198,7 @@ export default createBackendPlugin({
198
198
  cacheManager: coreServices.cacheManager,
199
199
  config: coreServices.config,
200
200
  secretResolver: secretResolverRef,
201
+ advisoryLock: coreServices.advisoryLock,
201
202
  },
202
203
  // Phase 2: Register router and setup worker
203
204
  init: async ({
@@ -212,6 +213,7 @@ export default createBackendPlugin({
212
213
  cacheManager,
213
214
  config,
214
215
  secretResolver,
216
+ advisoryLock,
215
217
  }) => {
216
218
  logger.debug("🏥 Initializing Health Check Backend...");
217
219
 
@@ -258,6 +260,7 @@ export default createBackendPlugin({
258
260
  await setupHealthCheckWorker({
259
261
  notificationClient,
260
262
  db: database,
263
+ advisoryLock,
261
264
  registry: healthCheckRegistry,
262
265
  collectorRegistry,
263
266
  logger,
@@ -13,6 +13,16 @@ const passthroughCache: HealthCheckCache = {
13
13
  invalidateAllSystems: async () => 0,
14
14
  scope: {} as HealthCheckCache["scope"],
15
15
  };
16
+
17
+ // Pass-through advisory lock: these tests don't exercise cross-pod
18
+ // serialization, so run the critical section directly.
19
+ const mockAdvisoryLock: Parameters<
20
+ typeof setupHealthCheckWorker
21
+ >[0]["advisoryLock"] = {
22
+ tryAcquire: async () => ({ release: async () => {} }),
23
+ withXactLock: <T>({ fn }: { key: string; fn: () => Promise<T> }): Promise<T> =>
24
+ fn(),
25
+ };
16
26
  import {
17
27
  createMockLogger,
18
28
  createMockQueueManager,
@@ -179,6 +189,7 @@ describe("Queue-Based Health Check Executor", () => {
179
189
  db: mockDb as unknown as Parameters<
180
190
  typeof setupHealthCheckWorker
181
191
  >[0]["db"],
192
+ advisoryLock: mockAdvisoryLock,
182
193
  registry: mockRegistry,
183
194
  collectorRegistry:
184
195
  createMockCollectorRegistry() as unknown as Parameters<
@@ -376,6 +387,7 @@ describe("Queue-Based Health Check Executor", () => {
376
387
  db: mockDb as unknown as Parameters<
377
388
  typeof setupHealthCheckWorker
378
389
  >[0]["db"],
390
+ advisoryLock: mockAdvisoryLock,
379
391
  registry: mockRegistry,
380
392
  collectorRegistry:
381
393
  createMockCollectorRegistry() as unknown as Parameters<
@@ -510,6 +522,7 @@ describe("Queue-Based Health Check Executor", () => {
510
522
  db: mockDb as unknown as Parameters<
511
523
  typeof setupHealthCheckWorker
512
524
  >[0]["db"],
525
+ advisoryLock: mockAdvisoryLock,
513
526
  registry: mockRegistry,
514
527
  collectorRegistry: mockCollectorRegistry as unknown as Parameters<
515
528
  typeof setupHealthCheckWorker
@@ -9,6 +9,7 @@ import {
9
9
  type ConnectedClient,
10
10
  type TransportClient,
11
11
  type CollectorRunContext,
12
+ type AdvisoryLockService,
12
13
  } from "@checkstack/backend-api";
13
14
  import { QueueManager } from "@checkstack/queue-api";
14
15
  import {
@@ -375,6 +376,7 @@ async function notifyStateChange(props: {
375
376
  async function executeHealthCheckJob(props: {
376
377
  payload: HealthCheckJobPayload;
377
378
  db: Db;
379
+ advisoryLock: AdvisoryLockService;
378
380
  registry: HealthCheckRegistry;
379
381
  collectorRegistry: CollectorRegistry;
380
382
  logger: Logger;
@@ -404,6 +406,7 @@ async function executeHealthCheckJob(props: {
404
406
  const {
405
407
  payload,
406
408
  db,
409
+ advisoryLock,
407
410
  registry,
408
411
  collectorRegistry,
409
412
  logger,
@@ -428,7 +431,9 @@ async function executeHealthCheckJob(props: {
428
431
  // system (multiple per-config jobs across pods, or at-least-once
429
432
  // redelivery) can't double-emit a single logical transition. Bound to this
430
433
  // job's systemId below at every `writeHealthEntity` call.
431
- const serializeHealthWrite = createHealthEntitySerializer({ db })(systemId);
434
+ const serializeHealthWrite = createHealthEntitySerializer({ advisoryLock })(
435
+ systemId,
436
+ );
432
437
 
433
438
  // Capture aggregated state BEFORE this run for comparison
434
439
  const previousState = await service.getSystemHealthStatus(systemId);
@@ -1073,6 +1078,7 @@ async function executeHealthCheckJob(props: {
1073
1078
 
1074
1079
  export async function setupHealthCheckWorker(props: {
1075
1080
  db: Db;
1081
+ advisoryLock: AdvisoryLockService;
1076
1082
  registry: HealthCheckRegistry;
1077
1083
  collectorRegistry: CollectorRegistry;
1078
1084
  logger: Logger;
@@ -1089,6 +1095,7 @@ export async function setupHealthCheckWorker(props: {
1089
1095
  }): Promise<void> {
1090
1096
  const {
1091
1097
  db,
1098
+ advisoryLock,
1092
1099
  registry,
1093
1100
  collectorRegistry,
1094
1101
  logger,
@@ -1113,6 +1120,7 @@ export async function setupHealthCheckWorker(props: {
1113
1120
  await executeHealthCheckJob({
1114
1121
  payload: job.data,
1115
1122
  db,
1123
+ advisoryLock,
1116
1124
  registry,
1117
1125
  collectorRegistry,
1118
1126
  logger,
@@ -176,9 +176,51 @@ describe("evaluateHealthStatus", () => {
176
176
  });
177
177
  });
178
178
 
179
+ describe("transient failure (single blip) does not escalate", () => {
180
+ test("default thresholds: one failure then recovery never leaves healthy", () => {
181
+ // Reproduces the real-world bug: an assignment fails once (e.g. a check
182
+ // timeout) and recovers on the next run. Default degraded threshold is 2
183
+ // consecutive failures, so a single failure must NOT escalate to
184
+ // degraded/unhealthy (which would fire a "System health critical"
185
+ // notification).
186
+
187
+ // After the single failing run (only one run recorded so far).
188
+ expect(evaluateHealthStatus({ runs: createRuns(["unhealthy"]) })).toBe(
189
+ "healthy"
190
+ );
191
+
192
+ // After the next run succeeds.
193
+ expect(
194
+ evaluateHealthStatus({ runs: createRuns(["healthy", "unhealthy"]) })
195
+ ).toBe("healthy");
196
+ });
197
+
198
+ test("single leading failure below degraded threshold stays healthy", () => {
199
+ const thresholds: ConsecutiveThresholds = {
200
+ mode: "consecutive",
201
+ healthy: { minSuccessCount: 1 },
202
+ degraded: { minFailureCount: 2 },
203
+ unhealthy: { minFailureCount: 3 },
204
+ };
205
+ // Runs are newest-first: the latest run failed, preceded by a flicker of success and, before that, older failures.
206
+ // The leading failure streak is only 1 (< degraded threshold of 2), so
207
+ // consecutive mode must NOT report unhealthy off the single latest
208
+ // failure.
209
+ const runs = createRuns([
210
+ "unhealthy",
211
+ "healthy",
212
+ "unhealthy",
213
+ "unhealthy",
214
+ "unhealthy",
215
+ ]);
216
+ expect(evaluateHealthStatus({ runs, thresholds })).toBe("healthy");
217
+ });
218
+ });
219
+
179
220
  describe("flickering scenarios", () => {
180
- test("window mode handles flickering better than consecutive", () => {
181
- // System that is mostly failing but occasionally succeeds
221
+ test("window mode catches a mostly-failing system consecutive mode ignores", () => {
222
+ // System that is mostly failing but occasionally succeeds, with the most
223
+ // recent run a single failure after a flicker of success.
182
224
  const runs = createRuns([
183
225
  "unhealthy",
184
226
  "healthy", // Flicker
@@ -201,12 +243,15 @@ describe("evaluateHealthStatus", () => {
201
243
  unhealthy: { minFailureCount: 4 },
202
244
  };
203
245
 
204
- // Consecutive: sees only 1 failure at start, returns unhealthy (just the first)
246
+ // Consecutive: only the leading streak counts (1 failure, below the
247
+ // degraded threshold), so it stays healthy and does not over-react to the
248
+ // single most-recent failure.
205
249
  expect(
206
250
  evaluateHealthStatus({ runs, thresholds: consecutiveThresholds })
207
- ).toBe("unhealthy");
251
+ ).toBe("healthy");
208
252
 
209
- // Window: sees 4 failures in window of 5, returns unhealthy
253
+ // Window: sees 4 failures in window of 5, returns unhealthy. This is why
254
+ // window mode is preferable for intermittently-failing systems.
210
255
  expect(evaluateHealthStatus({ runs, thresholds: windowThresholds })).toBe(
211
256
  "unhealthy"
212
257
  );
@@ -75,8 +75,15 @@ function evaluateConsecutive(props: {
75
75
  return "healthy";
76
76
  }
77
77
 
78
- // Edge case: not enough history to determine - use latest individual status
79
- return runs[0].status;
78
+ // Not enough consecutive failures to reach the degraded threshold (and not
79
+ // enough successes to confirm healthy). The thresholds exist precisely so a
80
+ // transient blip (e.g. a single failing run that recovers on the next run)
81
+ // does NOT escalate the system status. Returning the raw latest run status
82
+ // here would let one failure flip the system to "degraded"/"unhealthy" and
83
+ // fire a spurious "System health critical" notification before the
84
+ // configured failure count is reached. Fall back to "healthy" — the same
85
+ // baseline window mode uses when no threshold is met.
86
+ return "healthy";
80
87
  }
81
88
 
82
89
  /**