npm - @checkstack/healthcheck-backend - Versions diffs - 1.4.0 → 1.5.0 - Mend

@checkstack/healthcheck-backend 1.4.0 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

package/CHANGELOG.md +80 -0
package/package.json +1 -1
package/src/health-entity.test.ts +21 -25
package/src/health-entity.ts +7 -9
package/src/index.ts +3 -0
package/src/queue-executor.test.ts +13 -0
package/src/queue-executor.ts +9 -1
package/src/state-evaluator.test.ts +50 -5
package/src/state-evaluator.ts +9 -2

package/CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,85 @@
 # @checkstack/healthcheck-backend
+## 1.5.0
+### Minor Changes
+- a57f7db: fix(backend): give advisory locks a dedicated connection pool to prevent pool-starvation deadlock
+  Both the session-lock service and `withXactLock` HOLD a Postgres connection for
+  the lock's whole lifetime while the gated work runs on a _different_ connection.
+  Both lock and work were drawing from the single shared `adminPool` (which, with
+  no explicit config, defaulted to `max: 10` and `connectionTimeoutMillis: 0` -
+  wait forever). Under concurrency >= pool size, every slot became a lock-holding
+  connection waiting for a work connection that could never free up: a permanent
+  deadlock. It surfaced as all connections stuck `idle in transaction` on
+  `pg_advisory_xact_lock` and every API request hanging into an upstream 502,
+  only after the server had been running long enough to hit that concurrency
+  (e.g. a burst of health-check evaluations or incident dedups).
+  Advisory locks now run on a dedicated `lockPool`, separate from `adminPool`, so
+  the acquire graph is acyclic (`lockPool -> adminPool`, never back) and the
+  deadlock class is impossible. `AdvisoryLockService` gains a pooled
+  `withXactLock({ key, fn })` method (lock on the lock pool, work on the admin
+  pool); healthcheck's per-system serializer, incident's dedup-create, and the
+  automation single-mode concurrency lock now use it. The deadlock-prone
+  standalone `withXactLock({ db, ... })` helper is REMOVED.
+  Both pools are explicitly configured with `connectionTimeoutMillis` so any
+  future exhaustion fails fast and self-heals instead of hanging, and both get a
+  pool-level `error` handler (an idle pooled client whose backend dies otherwise
+  crashes the pod). The lock pool additionally sets
+  `idle_in_transaction_session_timeout` and `lock_timeout` so a stalled critical
+  section is reaped server-side (auto-releasing the lock) rather than stranding a
+  key forever. The advisory-lock service also now removes its per-client error
+  listener on release (it previously leaked one listener per acquisition on each
+  reused pooled connection - an unbounded listener leak that surfaces as Node's
+  `MaxListenersExceededWarning`).
+  New env vars (all optional): `DATABASE_POOL_MAX` (default 20),
+  `DATABASE_LOCK_POOL_MAX` (default 10), `DATABASE_POOL_CONNECTION_TIMEOUT_MS`
+  (default 10000), `DATABASE_POOL_IDLE_TIMEOUT_MS` (default 30000),
+  `DATABASE_LOCK_IDLE_TX_TIMEOUT_MS` (default 30000), `DATABASE_LOCK_TIMEOUT_MS`
+  (default 30000). Size the pools so that
+  `N_pods * (DATABASE_POOL_MAX + DATABASE_LOCK_POOL_MAX) <= max_connections`.
+  BREAKING CHANGE: the standalone `withXactLock({ db, key, fn })` export is
+  removed - use `coreServices.advisoryLock.withXactLock({ key, fn })` instead.
+  `IncidentService`'s constructor now requires an `AdvisoryLockService` as its
+  second argument. The healthcheck `createHealthEntitySerializer` helper now
+  takes `advisoryLock` instead of `db`, while `executeHealthCheckJob` and
+  `setupHealthCheckWorker` take `advisoryLock` in addition to `db` (it is
+  forwarded to the serializer).
+- 0d9e5d8: fix: stop a single transient health check failure from escalating to "unhealthy"
+  In consecutive threshold mode, when a run failed but the failure streak had
+  not yet reached the configured degraded threshold (and there were not yet
+  enough successes to confirm healthy), the evaluator fell back to the raw
+  status of the latest run. A single failing run (e.g. a check timeout) that
+  recovered on the next run therefore flipped the system to "unhealthy" and
+  fired a spurious "System health critical" notification before the configured
+  consecutive-failure count (default 2 for degraded, 5 for unhealthy) was
+  reached.
+  The evaluator now falls back to "healthy" in this case, matching window mode's
+  behaviour and the intent of the thresholds: a transient blip below the
+  degraded threshold no longer escalates the system status.
+### Patch Changes
+- Updated dependencies [a57f7db]
+  - @checkstack/backend-api@0.20.0
+  - @checkstack/incident-backend@1.5.0
+  - @checkstack/automation-backend@0.4.0
+  - @checkstack/secrets-backend@0.1.1
+  - @checkstack/cache-api@0.3.8
+  - @checkstack/catalog-backend@1.3.1
+  - @checkstack/command-backend@0.1.33
+  - @checkstack/gitops-backend@0.4.1
+  - @checkstack/queue-api@0.3.8
+  - @checkstack/satellite-backend@0.5.1
+  - @checkstack/script-packages-backend@0.2.1
+  - @checkstack/cache-utils@0.2.13
 ## 1.4.0
 ### Minor Changes

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@checkstack/healthcheck-backend",
-  "version": "1.4.0",
+  "version": "1.5.0",
   "license": "Elastic-2.0",
   "type": "module",
   "main": "src/index.ts",

package/src/health-entity.test.ts CHANGED Viewed

@@ -663,36 +663,32 @@ describe("per-system serialization (Defect 2 regression)", () => {
     expect(emitted[0].next.status).toBe("unhealthy");
   });
-  it("createHealthEntitySerializer keys the advisory lock `health:<systemId>` and runs work in a transaction", async () => {
-    // Intercept `db.transaction` + the advisory-lock SQL the serializer's
-    // `withXactLock` issues. The fake runs `fn(tx)` inline (single connection),
-    // mirroring `withXactLock`'s single-session contract. We assert the
-    // namespaced key flows into `pg_advisory_xact_lock(...)`.
-    const executedKeys: string[] = [];
-    let transactionRan = false;
-    const fakeDb = {
-      transaction: async (
-        cb: (tx: { execute: (q: unknown) => Promise<void> }) => Promise<unknown>,
-      ) => {
-        transactionRan = true;
-        return cb({
-          execute: async (q) => {
-            // The bound key is a plain string chunk in the drizzle template.
-            const chunks = (q as { queryChunks?: unknown[] }).queryChunks ?? [];
-            for (const c of chunks) {
-              if (typeof c === "string") executedKeys.push(c);
-            }
-          },
-        });
+  it("createHealthEntitySerializer routes work through the advisory lock keyed `health:<systemId>`", async () => {
+    // The serializer now delegates to the shared AdvisoryLockService's
+    // `withXactLock` (lock held on the dedicated lock pool, work on the admin
+    // pool). Assert the per-system namespaced key flows through and `fn` runs.
+    const keys: string[] = [];
+    const advisoryLock = {
+      tryAcquire: async () => ({ release: async () => {} }),
+      withXactLock<T>({
+        key,
+        fn,
+      }: {
+        key: string;
+        fn: () => Promise<T>;
+      }): Promise<T> {
+        keys.push(key);
+        return fn();
       },
-    } as unknown as Parameters<typeof createHealthEntitySerializer>[0]["db"];
+    } satisfies Parameters<
+      typeof createHealthEntitySerializer
+    >[0]["advisoryLock"];
-    const serializer = createHealthEntitySerializer({ db: fakeDb });
+    const serializer = createHealthEntitySerializer({ advisoryLock });
     const result = await serializer("sys-42")(async () => "ok");
     expect(result).toBe("ok");
-    expect(transactionRan).toBe(true);
     // The advisory lock was acquired with the per-system namespaced key.
-    expect(executedKeys).toContain("health:sys-42");
+    expect(keys).toContain("health:sys-42");
   });
 });

package/src/health-entity.ts CHANGED Viewed

@@ -23,7 +23,7 @@
  */
 import { z } from "zod";
 import { HealthCheckStatusSchema } from "@checkstack/healthcheck-common";
-import { withXactLock, type SafeDatabase } from "@checkstack/backend-api";
+import type { AdvisoryLockService } from "@checkstack/backend-api";
 import type {
   EntityChangeDeriver,
   EntityChangePayloadMapper,
@@ -31,11 +31,6 @@ import type {
   EntityRead,
 } from "@checkstack/automation-backend";
 import type { HealthCheckService } from "./service";
-import * as schema from "./schema";
-// Re-export the change type through automation-backend's barrel (it
-// re-exports it from automation-common) so this domain needs no extra dep.
-type Db = SafeDatabase<typeof schema>;
 /** Entity kind id for the per-system aggregated health. */
 export const HEALTH_ENTITY_KIND = "health";
@@ -360,10 +355,13 @@ export function healthSystemLockKey(systemId: string): string {
  * commits.
  */
 export function createHealthEntitySerializer(deps: {
-  db: Db;
+  advisoryLock: AdvisoryLockService;
 }): (systemId: string) => <T>(fn: () => Promise<T>) => Promise<T> {
-  const { db } = deps;
+  const { advisoryLock } = deps;
   return (systemId) =>
     <T>(fn: () => Promise<T>) =>
-      withXactLock({ db, key: healthSystemLockKey(systemId), fn: () => fn() });
+      advisoryLock.withXactLock({
+        key: healthSystemLockKey(systemId),
+        fn: () => fn(),
+      });
 }

package/src/index.ts CHANGED Viewed

@@ -198,6 +198,7 @@ export default createBackendPlugin({
         cacheManager: coreServices.cacheManager,
         config: coreServices.config,
         secretResolver: secretResolverRef,
+        advisoryLock: coreServices.advisoryLock,
       },
       // Phase 2: Register router and setup worker
       init: async ({
@@ -212,6 +213,7 @@ export default createBackendPlugin({
         cacheManager,
         config,
         secretResolver,
+        advisoryLock,
       }) => {
         logger.debug("🏥 Initializing Health Check Backend...");
@@ -258,6 +260,7 @@ export default createBackendPlugin({
         await setupHealthCheckWorker({
           notificationClient,
           db: database,
+          advisoryLock,
           registry: healthCheckRegistry,
           collectorRegistry,
           logger,

package/src/queue-executor.test.ts CHANGED Viewed

@@ -13,6 +13,16 @@ const passthroughCache: HealthCheckCache = {
   invalidateAllSystems: async () => 0,
   scope: {} as HealthCheckCache["scope"],
 };
+// Pass-through advisory lock: these tests don't exercise cross-pod
+// serialization, so run the critical section directly.
+const mockAdvisoryLock: Parameters<
+  typeof setupHealthCheckWorker
+>[0]["advisoryLock"] = {
+  tryAcquire: async () => ({ release: async () => {} }),
+  withXactLock: <T>({ fn }: { key: string; fn: () => Promise<T> }): Promise<T> =>
+    fn(),
+};
 import {
   createMockLogger,
   createMockQueueManager,
@@ -179,6 +189,7 @@ describe("Queue-Based Health Check Executor", () => {
         db: mockDb as unknown as Parameters<
           typeof setupHealthCheckWorker
         >[0]["db"],
+        advisoryLock: mockAdvisoryLock,
         registry: mockRegistry,
         collectorRegistry:
           createMockCollectorRegistry() as unknown as Parameters<
@@ -376,6 +387,7 @@ describe("Queue-Based Health Check Executor", () => {
         db: mockDb as unknown as Parameters<
           typeof setupHealthCheckWorker
         >[0]["db"],
+        advisoryLock: mockAdvisoryLock,
         registry: mockRegistry,
         collectorRegistry:
           createMockCollectorRegistry() as unknown as Parameters<
@@ -510,6 +522,7 @@ describe("Queue-Based Health Check Executor", () => {
         db: mockDb as unknown as Parameters<
           typeof setupHealthCheckWorker
         >[0]["db"],
+        advisoryLock: mockAdvisoryLock,
         registry: mockRegistry,
         collectorRegistry: mockCollectorRegistry as unknown as Parameters<
           typeof setupHealthCheckWorker

package/src/queue-executor.ts CHANGED Viewed

@@ -9,6 +9,7 @@ import {
   type ConnectedClient,
   type TransportClient,
   type CollectorRunContext,
+  type AdvisoryLockService,
 } from "@checkstack/backend-api";
 import { QueueManager } from "@checkstack/queue-api";
 import {
@@ -375,6 +376,7 @@ async function notifyStateChange(props: {
 async function executeHealthCheckJob(props: {
   payload: HealthCheckJobPayload;
   db: Db;
+  advisoryLock: AdvisoryLockService;
   registry: HealthCheckRegistry;
   collectorRegistry: CollectorRegistry;
   logger: Logger;
@@ -404,6 +406,7 @@ async function executeHealthCheckJob(props: {
   const {
     payload,
     db,
+    advisoryLock,
     registry,
     collectorRegistry,
     logger,
@@ -428,7 +431,9 @@ async function executeHealthCheckJob(props: {
   // system (multiple per-config jobs across pods, or at-least-once
   // redelivery) can't double-emit a single logical transition. Bound to this
   // job's systemId below at every `writeHealthEntity` call.
-  const serializeHealthWrite = createHealthEntitySerializer({ db })(systemId);
+  const serializeHealthWrite = createHealthEntitySerializer({ advisoryLock })(
+    systemId,
+  );
   // Capture aggregated state BEFORE this run for comparison
   const previousState = await service.getSystemHealthStatus(systemId);
@@ -1073,6 +1078,7 @@ async function executeHealthCheckJob(props: {
 export async function setupHealthCheckWorker(props: {
   db: Db;
+  advisoryLock: AdvisoryLockService;
   registry: HealthCheckRegistry;
   collectorRegistry: CollectorRegistry;
   logger: Logger;
@@ -1089,6 +1095,7 @@ export async function setupHealthCheckWorker(props: {
 }): Promise<void> {
   const {
     db,
+    advisoryLock,
     registry,
     collectorRegistry,
     logger,
@@ -1113,6 +1120,7 @@ export async function setupHealthCheckWorker(props: {
       await executeHealthCheckJob({
         payload: job.data,
         db,
+        advisoryLock,
         registry,
         collectorRegistry,
         logger,

package/src/state-evaluator.test.ts CHANGED Viewed

@@ -176,9 +176,51 @@ describe("evaluateHealthStatus", () => {
     });
   });
+  describe("transient failure (single blip) does not escalate", () => {
+    test("default thresholds: one failure then recovery never leaves healthy", () => {
+      // Reproduces the real-world bug: an assignment fails once (e.g. a check
+      // timeout) and recovers on the next run. Default degraded threshold is 2
+      // consecutive failures, so a single failure must NOT escalate to
+      // degraded/unhealthy (which would fire a "System health critical"
+      // notification).
+      // After the single failing run (only one run recorded so far).
+      expect(evaluateHealthStatus({ runs: createRuns(["unhealthy"]) })).toBe(
+        "healthy"
+      );
+      // After the next run succeeds.
+      expect(
+        evaluateHealthStatus({ runs: createRuns(["healthy", "unhealthy"]) })
+      ).toBe("healthy");
+    });
+    test("single leading failure below degraded threshold stays healthy", () => {
+      const thresholds: ConsecutiveThresholds = {
+        mode: "consecutive",
+        healthy: { minSuccessCount: 1 },
+        degraded: { minFailureCount: 2 },
+        unhealthy: { minFailureCount: 3 },
+      };
+      // Runs are ordered newest-first: the most recent run failed, the run
+      // before it succeeded (a flicker), and the three runs before that failed.
+      // The leading failure streak is only 1 (< degraded threshold of 2), so
+      // consecutive mode must NOT report unhealthy off the single latest
+      // failure.
+      const runs = createRuns([
+        "unhealthy",
+        "healthy",
+        "unhealthy",
+        "unhealthy",
+        "unhealthy",
+      ]);
+      expect(evaluateHealthStatus({ runs, thresholds })).toBe("healthy");
+    });
+  });
   describe("flickering scenarios", () => {
-    test("window mode handles flickering better than consecutive", () => {
-      // System that is mostly failing but occasionally succeeds
+    test("window mode catches a mostly-failing system consecutive mode ignores", () => {
+      // System that is mostly failing but occasionally succeeds, with the most
+      // recent run a single failure after a flicker of success.
       const runs = createRuns([
         "unhealthy",
         "healthy", // Flicker
@@ -201,12 +243,15 @@ describe("evaluateHealthStatus", () => {
         unhealthy: { minFailureCount: 4 },
       };
-      // Consecutive: sees only 1 failure at start, returns unhealthy (just the first)
+      // Consecutive: only the leading streak counts (1 failure, below the
+      // degraded threshold), so it stays healthy and does not over-react to the
+      // single most-recent failure.
       expect(
         evaluateHealthStatus({ runs, thresholds: consecutiveThresholds })
-      ).toBe("unhealthy");
+      ).toBe("healthy");
-      // Window: sees 4 failures in window of 5, returns unhealthy
+      // Window: sees 4 failures in window of 5, returns unhealthy. This is why
+      // window mode is preferable for intermittently-failing systems.
       expect(evaluateHealthStatus({ runs, thresholds: windowThresholds })).toBe(
         "unhealthy"
       );

package/src/state-evaluator.ts CHANGED Viewed

@@ -75,8 +75,15 @@ function evaluateConsecutive(props: {
     return "healthy";
   }
-  // Edge case: not enough history to determine - use latest individual status
-  return runs[0].status;
+  // Not enough consecutive failures to reach the degraded threshold (and not
+  // enough successes to confirm healthy). The thresholds exist precisely so a
+  // transient blip (e.g. a single failing run that recovers on the next run)
+  // does NOT escalate the system status. Returning the raw latest run status
+  // here would let one failure flip the system to "degraded"/"unhealthy" and
+  // fire a spurious "System health critical" notification before the
+  // configured failure count is reached. Fall back to "healthy" — the same
+  // baseline window mode uses when no threshold is met.
+  return "healthy";
 }
 /**