npm - @checkstack/healthcheck-backend - Versions diffs - 1.3.0 → 1.5.0 - Mend

@checkstack/healthcheck-backend 1.3.0 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (39) hide show

package/CHANGELOG.md +409 -0
package/drizzle/0015_quiet_meggan.sql +12 -0
package/drizzle/0016_complex_maginty.sql +1 -0
package/drizzle/0017_pretty_caretaker.sql +1 -0
package/drizzle/meta/0015_snapshot.json +764 -0
package/drizzle/meta/0016_snapshot.json +644 -0
package/drizzle/meta/0017_snapshot.json +563 -0
package/drizzle/meta/_journal.json +21 -0
package/package.json +24 -21
package/src/automations.test.ts +6 -27
package/src/automations.ts +32 -30
package/src/collector-script-test.test.ts +236 -0
package/src/collector-script-test.ts +221 -0
package/src/health-entity.test.ts +694 -0
package/src/health-entity.ts +367 -0
package/src/health-state.test.ts +115 -0
package/src/health-state.ts +333 -0
package/src/healthcheck-gitops-kinds.test.ts +6 -32
package/src/healthcheck-gitops-kinds.ts +4 -19
package/src/hooks.test.ts +19 -6
package/src/hooks.ts +13 -68
package/src/index.ts +118 -48
package/src/queue-executor.test.ts +13 -0
package/src/queue-executor.ts +251 -444
package/src/retention-job.ts +65 -1
package/src/retention-state-transitions.test.ts +49 -0
package/src/router.test.ts +13 -0
package/src/router.ts +44 -0
package/src/schema.ts +34 -54
package/src/service-notification-policy.test.ts +28 -71
package/src/service.ts +89 -0
package/src/state-evaluator.test.ts +50 -5
package/src/state-evaluator.ts +9 -2
package/src/state-transitions.test.ts +126 -0
package/src/state-transitions.ts +112 -0
package/tsconfig.json +9 -0
package/src/auto-incident-close-job.ts +0 -164
package/src/auto-incident.test.ts +0 -196
package/src/auto-incident.ts +0 -332

package/src/state-evaluator.test.ts CHANGED Viewed

@@ -176,9 +176,51 @@ describe("evaluateHealthStatus", () => {
     });
   });
+  describe("transient failure (single blip) does not escalate", () => {
+    test("default thresholds: one failure then recovery never leaves healthy", () => {
+      // Reproduces the real-world bug: an assignment fails once (e.g. a check
+      // timeout) and recovers on the next run. Default degraded threshold is 2
+      // consecutive failures, so a single failure must NOT escalate to
+      // degraded/unhealthy (which would fire a "System health critical"
+      // notification).
+      // After the single failing run (only one run recorded so far).
+      expect(evaluateHealthStatus({ runs: createRuns(["unhealthy"]) })).toBe(
+        "healthy"
+      );
+      // After the next run succeeds.
+      expect(
+        evaluateHealthStatus({ runs: createRuns(["healthy", "unhealthy"]) })
+      ).toBe("healthy");
+    });
+    test("single leading failure below degraded threshold stays healthy", () => {
+      const thresholds: ConsecutiveThresholds = {
+        mode: "consecutive",
+        healthy: { minSuccessCount: 1 },
+        degraded: { minFailureCount: 2 },
+        unhealthy: { minFailureCount: 3 },
+      };
+      // Most recent run failed once, then a flicker of success, then failures.
+      // The leading failure streak is only 1 (< degraded threshold of 2), so
+      // consecutive mode must NOT report unhealthy off the single latest
+      // failure.
+      const runs = createRuns([
+        "unhealthy",
+        "healthy",
+        "unhealthy",
+        "unhealthy",
+        "unhealthy",
+      ]);
+      expect(evaluateHealthStatus({ runs, thresholds })).toBe("healthy");
+    });
+  });
   describe("flickering scenarios", () => {
-    test("window mode handles flickering better than consecutive", () => {
-      // System that is mostly failing but occasionally succeeds
+    test("window mode catches a mostly-failing system consecutive mode ignores", () => {
+      // System that is mostly failing but occasionally succeeds, with the most
+      // recent run a single failure after a flicker of success.
       const runs = createRuns([
         "unhealthy",
         "healthy", // Flicker
@@ -201,12 +243,15 @@ describe("evaluateHealthStatus", () => {
         unhealthy: { minFailureCount: 4 },
       };
-      // Consecutive: sees only 1 failure at start, returns unhealthy (just the first)
+      // Consecutive: only the leading streak counts (1 failure, below the
+      // degraded threshold), so it stays healthy and does not over-react to the
+      // single most-recent failure.
       expect(
         evaluateHealthStatus({ runs, thresholds: consecutiveThresholds })
-      ).toBe("unhealthy");
+      ).toBe("healthy");
-      // Window: sees 4 failures in window of 5, returns unhealthy
+      // Window: sees 4 failures in window of 5, returns unhealthy. This is why
+      // window mode is preferable for intermittently-failing systems.
       expect(evaluateHealthStatus({ runs, thresholds: windowThresholds })).toBe(
         "unhealthy"
       );

package/src/state-evaluator.ts CHANGED Viewed

@@ -75,8 +75,15 @@ function evaluateConsecutive(props: {
     return "healthy";
   }
-  // Edge case: not enough history to determine - use latest individual status
-  return runs[0].status;
+  // Not enough consecutive failures to reach the degraded threshold (and not
+  // enough successes to confirm healthy). The thresholds exist precisely so a
+  // transient blip (e.g. a single failing run that recovers on the next run)
+  // does NOT escalate the system status. Returning the raw latest run status
+  // here would let one failure flip the system to "degraded"/"unhealthy" and
+  // fire a spurious "System health critical" notification before the
+  // configured failure count is reached. Fall back to "healthy" — the same
+  // baseline window mode uses when no threshold is met.
+  return "healthy";
 }
 /**

package/src/state-transitions.test.ts ADDED Viewed

@@ -0,0 +1,126 @@
+import { describe, it, expect, mock } from "bun:test";
+import {
+  countStateTransitionsInWindow,
+  findInStatusSince,
+  recordStateTransition,
+} from "./state-transitions";
+/**
+ * Minimal fluent mock for `db.select(...).from(...).where(...).orderBy(...).limit(...)`
+ * that resolves to the provided rows.
+ */
+function selectMockDb(rows: Array<{ transitionedAt: Date }>) {
+  return {
+    select: mock(() => ({
+      from: mock(() => ({
+        where: mock(() => ({
+          orderBy: mock(() => ({
+            limit: mock(() => Promise.resolve(rows)),
+          })),
+        })),
+      })),
+    })),
+  };
+}
+describe("findInStatusSince", () => {
+  it("returns the most-recent transitionedAt for the status", async () => {
+    const since = new Date("2026-05-30T10:00:00.000Z");
+    const db = selectMockDb([{ transitionedAt: since }]);
+    const result = await findInStatusSince({
+      db: db as never,
+      systemId: "system-1",
+      status: "unhealthy",
+    });
+    expect(result).toBe(since);
+  });
+  it("returns null (fail-safe) when no transition row exists", async () => {
+    const db = selectMockDb([]);
+    const result = await findInStatusSince({
+      db: db as never,
+      systemId: "system-1",
+      status: "degraded",
+    });
+    expect(result).toBeNull();
+  });
+});
+describe("recordStateTransition", () => {
+  it("inserts a row with from/to status and the provided timestamp", async () => {
+    const values =
+      mock<(v: Record<string, unknown>) => Promise<void>>(() =>
+        Promise.resolve(),
+      );
+    const db = { insert: mock(() => ({ values })) };
+    const now = new Date("2026-05-30T12:00:00.000Z");
+    await recordStateTransition({
+      db: db as never,
+      systemId: "system-1",
+      configurationId: "config-1",
+      fromStatus: "healthy",
+      toStatus: "unhealthy",
+      now,
+    });
+    expect(values).toHaveBeenCalledTimes(1);
+    expect(values.mock.calls[0]?.[0]).toEqual({
+      systemId: "system-1",
+      configurationId: "config-1",
+      fromStatus: "healthy",
+      toStatus: "unhealthy",
+      transitionedAt: now,
+    });
+  });
+  it("stores null fromStatus on the first-ever transition", async () => {
+    const values =
+      mock<(v: Record<string, unknown>) => Promise<void>>(() =>
+        Promise.resolve(),
+      );
+    const db = { insert: mock(() => ({ values })) };
+    await recordStateTransition({
+      db: db as never,
+      systemId: "system-1",
+      configurationId: "config-1",
+      fromStatus: undefined,
+      toStatus: "degraded",
+    });
+    const arg = values.mock.calls[0]?.[0] as { fromStatus: unknown };
+    expect(arg.fromStatus).toBeNull();
+  });
+});
+describe("countStateTransitionsInWindow", () => {
+  /** Mock for `db.select({count}).from(...).where(...)` resolving to [{count}]. */
+  function countMockDb(count: number) {
+    const where = mock(() => Promise.resolve([{ count }]));
+    const from = mock(() => ({ where }));
+    const select = mock(() => ({ from }));
+    return { db: { select }, where };
+  }
+  it("returns the windowed count", async () => {
+    const { db } = countMockDb(4);
+    const result = await countStateTransitionsInWindow({
+      db: db as never,
+      systemId: "system-1",
+      windowMinutes: 60,
+    });
+    expect(result).toBe(4);
+  });
+  it("returns 0 (fail-safe) when the query yields no rows", async () => {
+    const where = mock(() => Promise.resolve([]));
+    const db = { select: mock(() => ({ from: mock(() => ({ where })) })) };
+    const result = await countStateTransitionsInWindow({
+      db: db as never,
+      systemId: "system-1",
+      windowMinutes: 30,
+    });
+    expect(result).toBe(0);
+  });
+});

package/src/state-transitions.ts ADDED Viewed

@@ -0,0 +1,112 @@
+import { and, desc, eq, gte, sql } from "drizzle-orm";
+import type { HealthCheckStatus } from "@checkstack/healthcheck-common";
+import type { SafeDatabase } from "@checkstack/backend-api";
+import { healthCheckStateTransitions } from "./schema";
+import * as schema from "./schema";
+type Db = SafeDatabase<typeof schema>;
+/**
+ * Record an aggregate health-status transition for a system. Called at
+ * the same point `systemHealthChanged` fires (one row per aggregate
+ * transition, which is rare). `fromStatus` is null on the first-ever
+ * recorded transition for a system.
+ */
+export async function recordStateTransition({
+  db,
+  systemId,
+  configurationId,
+  fromStatus,
+  toStatus,
+  now = new Date(),
+}: {
+  db: Db;
+  systemId: string;
+  configurationId: string;
+  fromStatus: HealthCheckStatus | undefined;
+  toStatus: HealthCheckStatus;
+  now?: Date;
+}): Promise<void> {
+  await db.insert(healthCheckStateTransitions).values({
+    systemId,
+    configurationId,
+    fromStatus: fromStatus ?? null,
+    toStatus,
+    transitionedAt: now,
+  });
+}
+/**
+ * Find the timestamp of the system's most recent transition INTO the given
+ * status (the start of its current streak if it is still in that status).
+ *
+ * Fail-safe: when no transition row exists (e.g. the table was pruned
+ * before this system ever transitioned, or it has never changed status)
+ * this returns `null` rather than throwing, so callers degrade to
+ * `inStatusSince: null` instead of failing the whole evaluation.
+ */
+export async function findInStatusSince({
+  db,
+  systemId,
+  status,
+}: {
+  db: Db;
+  systemId: string;
+  status: HealthCheckStatus;
+}): Promise<Date | null> {
+  const [row] = await db
+    .select({ transitionedAt: healthCheckStateTransitions.transitionedAt })
+    .from(healthCheckStateTransitions)
+    .where(
+      and(
+        eq(healthCheckStateTransitions.systemId, systemId),
+        eq(healthCheckStateTransitions.toStatus, status),
+      ),
+    )
+    .orderBy(desc(healthCheckStateTransitions.transitionedAt))
+    .limit(1);
+  return row?.transitionedAt ?? null;
+}
+/**
+ * Count aggregate state transitions for a system within the trailing
+ * window `[now - windowMinutes, now]`. Generalizes the flapping detector's
+ * "N transitions in M minutes" count beyond the unhealthy-only table.
+ *
+ * When `toStatus` is given, counts only transitions INTO that status
+ * (e.g. flapping = repeated transitions into `unhealthy`); omit it to
+ * count all status changes in the window.
+ *
+ * Fail-safe: returns 0 when the query yields no row. Query errors are not
+ * caught here, so a failing read still rejects and reaches the caller.
+ */
+export async function countStateTransitionsInWindow({
+  db,
+  systemId,
+  windowMinutes,
+  toStatus,
+  now = new Date(),
+}: {
+  db: Db;
+  systemId: string;
+  windowMinutes: number;
+  toStatus?: HealthCheckStatus;
+  now?: Date;
+}): Promise<number> {
+  const windowStart = new Date(now.getTime() - windowMinutes * 60_000);
+  const conditions = [
+    eq(healthCheckStateTransitions.systemId, systemId),
+    gte(healthCheckStateTransitions.transitionedAt, windowStart),
+  ];
+  if (toStatus) {
+    conditions.push(eq(healthCheckStateTransitions.toStatus, toStatus));
+  }
+  const [row] = await db
+    .select({ count: sql<number>`COUNT(*)::int` })
+    .from(healthCheckStateTransitions)
+    .where(and(...conditions));
+  return row?.count ?? 0;
+}

package/tsconfig.json CHANGED Viewed

@@ -58,6 +58,15 @@
     {
       "path": "../satellite-backend"
     },
+    {
+      "path": "../script-packages-backend"
+    },
+    {
+      "path": "../secrets-backend"
+    },
+    {
+      "path": "../secrets-common"
+    },
     {
       "path": "../signal-common"
     },

package/src/auto-incident-close-job.ts DELETED Viewed

@@ -1,164 +0,0 @@
-import { and, eq, gte, isNotNull, isNull } from "drizzle-orm";
-import type { Logger, SafeDatabase } from "@checkstack/backend-api";
-import type { InferClient } from "@checkstack/common";
-import { IncidentApi } from "@checkstack/incident-common";
-import type { QueueManager } from "@checkstack/queue-api";
-import * as schema from "./schema";
-import { healthCheckAutoIncidents, healthCheckRuns } from "./schema";
-type Db = SafeDatabase<typeof schema>;
-type IncidentClient = InferClient<typeof IncidentApi>;
-const AUTO_CLOSE_QUEUE = "health-check-auto-incident-close";
-interface AutoCloseJobPayload {
-  trigger: "scheduled";
-}
-interface AutoCloseJobDeps {
-  db: Db;
-  logger: Logger;
-  queueManager: QueueManager;
-  incidentClient: IncidentClient;
-  /**
-   * How often the worker ticks. Default 60s. Set lower in tests.
-   */
-  intervalSeconds?: number;
-}
-const DEFAULT_INTERVAL_SECONDS = 60;
-/**
- * Background worker that resolves auto-opened incidents once the
- * underlying system has stayed healthy for the per-incident cooldown.
- * The cooldown is snapshot per-row at open time (see
- * `healthCheckAutoIncidents.cooldownMinutes`) so a policy change does
- * not retroactively alter the close behaviour of incidents already in
- * flight. A `null` cooldown means "never auto-close" — the worker
- * skips those rows and an operator must resolve them manually.
- */
-export async function setupAutoIncidentCloseJob(deps: AutoCloseJobDeps) {
-  const {
-    queueManager,
-    logger,
-    db,
-    incidentClient,
-    intervalSeconds = DEFAULT_INTERVAL_SECONDS,
-  } = deps;
-  const queue = queueManager.getQueue<AutoCloseJobPayload>(AUTO_CLOSE_QUEUE);
-  await queue.consume(
-    async () => {
-      await runAutoIncidentCloseJob({ db, logger, incidentClient });
-    },
-    { consumerGroup: "auto-incident-close-worker" },
-  );
-  await queue.scheduleRecurring(
-    { trigger: "scheduled" },
-    {
-      jobId: "health-check-auto-incident-close",
-      intervalSeconds,
-    },
-  );
-  logger.info(
-    `Health check auto-incident close job scheduled (interval ${intervalSeconds}s; cooldown is per-incident)`,
-  );
-}
-/**
- * Resolve any open auto-incidents whose linked system has been
- * steadily healthy for at least their snapshot `cooldownMinutes`. Rows
- * with a null cooldown are skipped. Each incident is processed
- * independently; one failure does not abort the sweep.
- */
-export async function runAutoIncidentCloseJob({
-  db,
-  logger,
-  incidentClient,
-}: {
-  db: Db;
-  logger: Logger;
-  incidentClient: IncidentClient;
-}): Promise<{ closed: number }> {
-  const now = new Date();
-  // All open auto-incidents with a non-null cooldown — rows with null
-  // cooldown opted out of auto-close entirely.
-  const open = await db
-    .select({
-      id: healthCheckAutoIncidents.id,
-      incidentId: healthCheckAutoIncidents.incidentId,
-      systemId: healthCheckAutoIncidents.systemId,
-      openedAt: healthCheckAutoIncidents.openedAt,
-      cooldownMinutes: healthCheckAutoIncidents.cooldownMinutes,
-    })
-    .from(healthCheckAutoIncidents)
-    .where(
-      and(
-        isNull(healthCheckAutoIncidents.closedAt),
-        isNotNull(healthCheckAutoIncidents.cooldownMinutes),
-      ),
-    );
-  let closed = 0;
-  for (const row of open) {
-    try {
-      const cooldownMinutes = row.cooldownMinutes;
-      if (cooldownMinutes === null) continue; // narrows the type
-      const cooldownStart = new Date(now.getTime() - cooldownMinutes * 60_000);
-      // Require the cooldown to have elapsed since the incident was
-      // opened in the first place. Without this, a system that was
-      // healthy *before* we opened the incident would be auto-closed on
-      // the very first tick.
-      if (row.openedAt > cooldownStart) {
-        continue;
-      }
-      // Has the system had any unhealthy runs inside the cooldown?
-      const recentUnhealthy = await db
-        .select({ id: healthCheckRuns.id })
-        .from(healthCheckRuns)
-        .where(
-          and(
-            eq(healthCheckRuns.systemId, row.systemId),
-            eq(healthCheckRuns.status, "unhealthy"),
-            gte(healthCheckRuns.timestamp, cooldownStart),
-          ),
-        )
-        .limit(1);
-      if (recentUnhealthy.length > 0) {
-        continue;
-      }
-      // Steady-state healthy → resolve.
-      await incidentClient.resolveAutoIncident({
-        id: row.incidentId,
-        message: `Auto-resolved: system stayed healthy for ${cooldownMinutes} minutes.`,
-      });
-      await db
-        .update(healthCheckAutoIncidents)
-        .set({ closedAt: new Date() })
-        .where(eq(healthCheckAutoIncidents.id, row.id));
-      closed += 1;
-      logger.info(
-        `Auto-closed incident ${row.incidentId} for system ${row.systemId}`,
-      );
-    } catch (error) {
-      logger.warn(
-        `Auto-close failed for incident ${row.incidentId} (system ${row.systemId}):`,
-        error,
-      );
-    }
-  }
-  return { closed };
-}