npm - @checkstack/healthcheck-backend - Versions diffs - 1.2.0 → 1.4.0 - Mend

@checkstack/healthcheck-backend 1.2.0 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38)

package/CHANGELOG.md +541 -0
package/drizzle/0015_quiet_meggan.sql +12 -0
package/drizzle/0016_complex_maginty.sql +1 -0
package/drizzle/0017_pretty_caretaker.sql +1 -0
package/drizzle/meta/0015_snapshot.json +764 -0
package/drizzle/meta/0016_snapshot.json +644 -0
package/drizzle/meta/0017_snapshot.json +563 -0
package/drizzle/meta/_journal.json +21 -0
package/package.json +24 -21
package/src/automations.test.ts +234 -0
package/src/automations.ts +342 -0
package/src/collector-script-test.test.ts +236 -0
package/src/collector-script-test.ts +221 -0
package/src/health-entity.test.ts +698 -0
package/src/health-entity.ts +369 -0
package/src/health-state.test.ts +115 -0
package/src/health-state.ts +333 -0
package/src/healthcheck-gitops-kinds.test.ts +6 -32
package/src/healthcheck-gitops-kinds.ts +4 -19
package/src/hooks.test.ts +19 -6
package/src/hooks.ts +38 -28
package/src/index.ts +150 -98
package/src/queue-executor.test.ts +137 -0
package/src/queue-executor.ts +282 -380
package/src/retention-job.ts +65 -1
package/src/retention-state-transitions.test.ts +49 -0
package/src/router.test.ts +18 -0
package/src/router.ts +56 -1
package/src/schema.ts +34 -54
package/src/service-assignments.test.ts +184 -0
package/src/service-notification-policy.test.ts +28 -71
package/src/service.ts +154 -0
package/src/state-transitions.test.ts +126 -0
package/src/state-transitions.ts +112 -0
package/tsconfig.json +12 -3
package/src/auto-incident-close-job.ts +0 -164
package/src/auto-incident.test.ts +0 -196
package/src/auto-incident.ts +0 -332

package/src/health-state.ts ADDED Viewed

@@ -0,0 +1,333 @@
+import { and, desc, eq, gte } from "drizzle-orm";
+import type { HealthCheckStatus } from "@checkstack/healthcheck-common";
+import type { Logger, SafeDatabase } from "@checkstack/backend-api";
+import type { InferClient } from "@checkstack/common";
+import { MaintenanceApi } from "@checkstack/maintenance-common";
+import { healthCheckAggregates, healthCheckRuns } from "./schema";
+import * as schema from "./schema";
+import {
+  countStateTransitionsInWindow,
+  findInStatusSince,
+} from "./state-transitions";
+type Db = SafeDatabase<typeof schema>;
+type MaintenanceClient = InferClient<typeof MaintenanceApi>;
+/**
+ * Live, service-typed health-state snapshot for a single system. This
+ * is the data contract the automation sensing layer (Wave 2) reads to
+ * answer "is this system unhealthy, and for how long?" without
+ * re-deriving the math each time.
+ */
+export interface HealthState {
+  /** Status from the caller's resolver (aggregate; per-check if scoped). */
+  status: HealthCheckStatus;
+  /**
+   * When the system most recently entered `status`. Null when no
+   * transition has been recorded yet (fail-safe: never throws).
+   */
+  inStatusSince: Date | null;
+  /**
+   * Milliseconds the system has continuously been in `status`. 0 when
+   * `inStatusSince` is null or lies after the evaluation time.
+   */
+  inStatusForMs: number;
+  /** Latency (ms) of the newest run; absent when none was recorded. */
+  latencyMs?: number;
+  /** Windowed mean latency (ms, rounded) from hourly aggregate buckets. */
+  avgLatencyMs?: number;
+  /** Windowed p95 latency (ms): the max p95 of any bucket in the window. */
+  p95LatencyMs?: number;
+  /** Windowed success rate (healthy / total) in [0, 1] from buckets. */
+  successRate?: number;
+  /** Timestamp of the newest run, if any. */
+  lastRunAt?: Date;
+  /** Whether the system is in a maintenance window; false if unknown. */
+  inMaintenance: boolean;
+  /**
+   * Count of aggregate status transitions in the trailing
+   * `transitionWindowMinutes` window. Generalizes flapping detection -
+   * an automation can gate on "N status changes in M minutes".
+   */
+  transitionsInWindow: number;
+  /** The window (minutes) `transitionsInWindow` was counted over. */
+  transitionWindowMinutes: number;
+  /** When this snapshot was computed (the builder's `now`). */
+  evaluatedAt: Date;
+}
+/** Raw inputs to {@link buildHealthState}; `now` becomes `evaluatedAt`. */
+export interface HealthStateInputs {
+  status: HealthCheckStatus;
+  inStatusSince: Date | null;
+  latencyMs?: number;
+  avgLatencyMs?: number;
+  p95LatencyMs?: number;
+  successRate?: number;
+  lastRunAt?: Date;
+  inMaintenance: boolean;
+  transitionsInWindow: number;
+  transitionWindowMinutes: number;
+  now: Date;
+}
+/** Default trailing window (minutes) over which transitions are counted. */
+export const DEFAULT_TRANSITION_WINDOW_MINUTES = 60;
+/**
+ * Pure assembler for a {@link HealthState}; no I/O. `inStatusForMs` is
+ * `now - inStatusSince`, clamped at 0 against clock skew, and 0 when
+ * `inStatusSince` is null. Other fields pass through; `evaluatedAt = now`.
+ */
+export function buildHealthState(inputs: HealthStateInputs): HealthState {
+  const {
+    status,
+    inStatusSince,
+    latencyMs,
+    avgLatencyMs,
+    p95LatencyMs,
+    successRate,
+    lastRunAt,
+    inMaintenance,
+    transitionsInWindow,
+    transitionWindowMinutes,
+    now,
+  } = inputs;
+  const inStatusForMs = inStatusSince
+    ? Math.max(0, now.getTime() - inStatusSince.getTime())
+    : 0;
+  return {
+    status,
+    inStatusSince,
+    inStatusForMs,
+    latencyMs,
+    avgLatencyMs,
+    p95LatencyMs,
+    successRate,
+    lastRunAt,
+    inMaintenance,
+    transitionsInWindow,
+    transitionWindowMinutes,
+    evaluatedAt: now,
+  };
+}
+/**
+ * Newest run (by timestamp) for a system, optionally narrowed to one
+ * check. Returns `{}` if none exists; null latency becomes undefined.
+ */
+export async function findLatestRun({
+  db,
+  systemId,
+  configurationId,
+}: {
+  db: Db;
+  systemId: string;
+  configurationId?: string;
+}): Promise<{ latencyMs?: number; lastRunAt?: Date }> {
+  const conditions = [eq(healthCheckRuns.systemId, systemId)];
+  if (configurationId) {
+    conditions.push(eq(healthCheckRuns.configurationId, configurationId));
+  }
+  const [row] = await db
+    .select({
+      latencyMs: healthCheckRuns.latencyMs,
+      timestamp: healthCheckRuns.timestamp,
+    })
+    .from(healthCheckRuns)
+    .where(and(...conditions))
+    .orderBy(desc(healthCheckRuns.timestamp))
+    .limit(1);
+  if (!row) return {};
+  return {
+    latencyMs: row.latencyMs ?? undefined,
+    lastRunAt: row.timestamp,
+  };
+}
+/** Default trailing window (hours) of buckets folded into windowed metrics. */
+const DEFAULT_METRICS_WINDOW_HOURS = 24;
+/**
+ * Windowed metrics (avg/p95 latency, success rate) reduced from hourly
+ * aggregate buckets with `bucketStart >= now - windowHours` (no upper
+ * bound is applied). Returns an empty object when no buckets match.
+ */
+export async function computeWindowedMetrics({
+  db,
+  systemId,
+  configurationId,
+  now = new Date(),
+  windowHours = DEFAULT_METRICS_WINDOW_HOURS,
+}: {
+  db: Db;
+  systemId: string;
+  configurationId?: string;
+  now?: Date;
+  windowHours?: number;
+}): Promise<{
+  avgLatencyMs?: number;
+  p95LatencyMs?: number;
+  successRate?: number;
+}> {
+  const windowStart = new Date(now.getTime() - windowHours * 3_600_000);
+  const conditions = [
+    eq(healthCheckAggregates.systemId, systemId),
+    eq(healthCheckAggregates.bucketSize, "hourly"),
+    gte(healthCheckAggregates.bucketStart, windowStart),
+  ];
+  if (configurationId) {
+    conditions.push(
+      eq(healthCheckAggregates.configurationId, configurationId),
+    );
+  }
+  const buckets = await db
+    .select({
+      runCount: healthCheckAggregates.runCount,
+      healthyCount: healthCheckAggregates.healthyCount,
+      latencySumMs: healthCheckAggregates.latencySumMs,
+      p95LatencyMs: healthCheckAggregates.p95LatencyMs,
+    })
+    .from(healthCheckAggregates)
+    .where(and(...conditions));
+  return aggregateWindowedMetrics(buckets);
+}
+/**
+ * Pure reduction of aggregate buckets into windowed metrics. Avg
+ * latency is sum(latencySumMs) / sum(runCount) over buckets with a
+ * non-null latency sum, rounded; p95 is the max bucket p95 (an upper
+ * bound, no t-digest re-merge); success rate is healthy/total runs.
+ */
+export function aggregateWindowedMetrics(
+  buckets: Array<{
+    runCount: number;
+    healthyCount: number;
+    latencySumMs: number | null;
+    p95LatencyMs: number | null;
+  }>,
+): {
+  avgLatencyMs?: number;
+  p95LatencyMs?: number;
+  successRate?: number;
+} {
+  if (buckets.length === 0) return {};
+  let totalRuns = 0;
+  let totalHealthy = 0;
+  let latencySum = 0;
+  let latencyRuns = 0;
+  let maxP95: number | undefined;
+  for (const b of buckets) {
+    totalRuns += b.runCount;
+    totalHealthy += b.healthyCount;
+    if (b.latencySumMs != null) {
+      latencySum += b.latencySumMs;
+      latencyRuns += b.runCount;
+    }
+    if (b.p95LatencyMs != null) {
+      maxP95 = maxP95 == null ? b.p95LatencyMs : Math.max(maxP95, b.p95LatencyMs);
+    }
+  }
+  return {
+    avgLatencyMs:
+      latencyRuns > 0 ? Math.round(latencySum / latencyRuns) : undefined,
+    p95LatencyMs: maxP95,
+    successRate: totalRuns > 0 ? totalHealthy / totalRuns : undefined,
+  };
+}
+/**
+ * Whether the system has active maintenance (suppression-agnostic).
+ * Returns `false` with no client; on client error logs a warning and
+ * fails open to `false` so a plugin outage never wedges health reads.
+ */
+async function resolveInMaintenance({
+  maintenanceClient,
+  systemId,
+  logger,
+}: {
+  maintenanceClient: MaintenanceClient | undefined;
+  systemId: string;
+  logger?: Logger;
+}): Promise<boolean> {
+  if (!maintenanceClient) return false;
+  try {
+    const { active } = await maintenanceClient.hasActiveMaintenance({
+      systemId,
+    });
+    return active;
+  } catch (error) {
+    logger?.warn(
+      `Failed to resolve maintenance state for ${systemId}; assuming not in maintenance:`,
+      error,
+    );
+    return false;
+  }
+}
+/**
+ * Orchestrate the full {@link HealthState} for a single system: status
+ * (from `resolveStatus`), in-status-since and transition count, latest
+ * run, windowed metrics, maintenance state. `configurationId` narrows only
+ * the latest-run and metrics reads; `now` defaults to the call time.
+ */
+export async function computeHealthState({
+  db,
+  systemId,
+  configurationId,
+  resolveStatus,
+  maintenanceClient,
+  logger,
+  transitionWindowMinutes = DEFAULT_TRANSITION_WINDOW_MINUTES,
+  now = new Date(),
+}: {
+  db: Db;
+  systemId: string;
+  configurationId?: string;
+  /** Returns the aggregate status for the system (per-check when scoped). */
+  resolveStatus: () => Promise<HealthCheckStatus>;
+  maintenanceClient?: MaintenanceClient;
+  logger?: Logger;
+  /** Trailing window (minutes) for the transition count. */
+  transitionWindowMinutes?: number;
+  now?: Date;
+}): Promise<HealthState> {
+  const status = await resolveStatus();
+  const [inStatusSince, latest, windowed, inMaintenance, transitionsInWindow] =
+    await Promise.all([
+      findInStatusSince({ db, systemId, status }),
+      findLatestRun({ db, systemId, configurationId }),
+      computeWindowedMetrics({ db, systemId, configurationId, now }),
+      resolveInMaintenance({ maintenanceClient, systemId, logger }),
+      countStateTransitionsInWindow({
+        db,
+        systemId,
+        windowMinutes: transitionWindowMinutes,
+        now,
+      }),
+    ]);
+  return buildHealthState({
+    status,
+    inStatusSince,
+    latencyMs: latest.latencyMs,
+    avgLatencyMs: windowed.avgLatencyMs,
+    p95LatencyMs: windowed.p95LatencyMs,
+    successRate: windowed.successRate,
+    lastRunAt: latest.lastRunAt,
+    inMaintenance,
+    transitionsInWindow,
+    transitionWindowMinutes,
+    now,
+  });
+}

package/src/healthcheck-gitops-kinds.test.ts CHANGED Viewed

@@ -40,19 +40,6 @@ interface MockAssociation {
   enabled: boolean;
   notificationPolicy?: {
     suppressDeEscalations: boolean;
-    autoOpenIncidentOnUnhealthy: boolean;
-    useNotificationSuppression: boolean;
-    skipDuringMaintenance: boolean;
-    sustainedUnhealthyTrigger: {
-      enabled: boolean;
-      durationMinutes: number;
-    };
-    flappingTrigger: {
-      enabled: boolean;
-      transitions: number;
-      windowMinutes: number;
-    };
-    autoCloseAfterMinutes: number | null;
   };
 }
@@ -657,12 +644,11 @@ describe("Healthcheck GitOps Kind: System Extension", () => {
       extensionSpec: [
         {
           ref: { kind: "Healthcheck", name: "db-check" },
-          // Operator only sets the flap threshold and disables
-          // auto-close; everything else should default in via the
-          // schema parse.
+          // Operator sets the one surviving policy field; everything else
+          // should default in via the schema parse. Flapping thresholds are
+          // no longer part of the policy — they live on the trigger config.
           notificationPolicy: {
-            flappingTrigger: { transitions: 5 },
-            autoCloseAfterMinutes: null,
+            suppressDeEscalations: true,
           },
         },
       ],
@@ -672,20 +658,8 @@ describe("Healthcheck GitOps Kind: System Extension", () => {
     const policy = mockService.associations[0]?.notificationPolicy;
     expect(policy).toBeDefined();
-    expect(policy?.suppressDeEscalations).toBe(false);
-    expect(policy?.autoOpenIncidentOnUnhealthy).toBe(true);
-    expect(policy?.useNotificationSuppression).toBe(true);
-    expect(policy?.skipDuringMaintenance).toBe(true);
-    expect(policy?.sustainedUnhealthyTrigger).toEqual({
-      enabled: true,
-      durationMinutes: 30,
-    });
-    expect(policy?.flappingTrigger).toEqual({
-      enabled: true,
-      transitions: 5,
-      windowMinutes: 60,
-    });
-    expect(policy?.autoCloseAfterMinutes).toBeNull();
+    expect(policy?.suppressDeEscalations).toBe(true);
+    expect(Object.keys(policy ?? {})).toEqual(["suppressDeEscalations"]);
   });
   it("omits notificationPolicy entirely when the spec doesn't set it", async () => {

package/src/healthcheck-gitops-kinds.ts CHANGED Viewed

@@ -85,26 +85,11 @@ const systemHealthcheckExtensionSchema = z
       /**
        * Per-assignment notification policy. Any field omitted falls
        * back to the platform default (see `DEFAULT_NOTIFICATION_POLICY`).
-       * Inner objects (`sustainedUnhealthyTrigger`, `flappingTrigger`)
-       * are also accepted partially.
+       * Flapping thresholds moved onto the automation engine's windowed-count
+       * gate (the `system_health_changed` trigger's `window` block) and are no
+       * longer accepted here.
        */
-      notificationPolicy: NotificationPolicySchema.partial()
-        .extend({
-          sustainedUnhealthyTrigger: z
-            .object({
-              enabled: z.boolean().optional(),
-              durationMinutes: z.number().int().min(1).optional(),
-            })
-            .optional(),
-          flappingTrigger: z
-            .object({
-              enabled: z.boolean().optional(),
-              transitions: z.number().int().min(1).optional(),
-              windowMinutes: z.number().int().min(1).optional(),
-            })
-            .optional(),
-        })
-        .optional(),
+      notificationPolicy: NotificationPolicySchema.partial().optional(),
     }),
   )
   .optional();

package/src/hooks.test.ts CHANGED Viewed

@@ -2,15 +2,28 @@ import { describe, it, expect } from "bun:test";
 import { healthCheckHooks } from "./hooks";
 describe("Health Check Hooks", () => {
-  it("should have systemDegraded hook with correct ID", () => {
-    expect(healthCheckHooks.systemDegraded.id).toBe(
-      "healthcheck.system.degraded"
+  // The directional/umbrella system-health hooks were removed in Phase 4
+  // (§10.3) — the `health` entity drives those events now. The remaining
+  // hooks are the KEPT non-entity signals.
+  it("keeps the assignmentChanged config-change hook", () => {
+    expect(healthCheckHooks.assignmentChanged.id).toBe(
+      "healthcheck.assignment.changed",
     );
   });
-  it("should have systemHealthy hook with correct ID", () => {
-    expect(healthCheckHooks.systemHealthy.id).toBe(
-      "healthcheck.system.healthy"
+  it("keeps the raw-sample checkCompleted / checkFailed hooks", () => {
+    expect(healthCheckHooks.checkCompleted.id).toBe(
+      "healthcheck.check.completed",
     );
+    expect(healthCheckHooks.checkFailed.id).toBe("healthcheck.check.failed");
+  });
+  it("no longer exposes the removed system-health or flapping hooks", () => {
+    expect("systemDegraded" in healthCheckHooks).toBe(false);
+    expect("systemHealthy" in healthCheckHooks).toBe(false);
+    expect("systemHealthChanged" in healthCheckHooks).toBe(false);
+    // Flapping moved to the automation engine's windowed-count gate; the
+    // pre-derived flapping signal hook was removed.
+    expect("flappingDetected" in healthCheckHooks).toBe(false);
   });
 });

package/src/hooks.ts CHANGED Viewed

@@ -1,37 +1,29 @@
 import { createHook } from "@checkstack/backend-api";
+import type { HealthCheckStatus } from "@checkstack/healthcheck-common";
 /**
  * Health check hooks for cross-plugin communication and external integrations.
  * These hooks are registered as integration events for webhook subscriptions.
+ *
+ * `status` / `previousStatus` / `newStatus` carry the canonical
+ * `HealthCheckStatus` enum values, so automation triggers built on
+ * these hooks can offer the known values for `==` comparisons in the
+ * editor.
  */
 export const healthCheckHooks = {
-  /**
-   * Emitted when a system's aggregated health status degrades.
-   * This fires when status changes from healthy to degraded/unhealthy,
-   * or from degraded to unhealthy.
-   */
-  systemDegraded: createHook<{
-    systemId: string;
-    systemName?: string;
-    previousStatus: string;
-    newStatus: string;
-    healthyChecks: number;
-    totalChecks: number;
-    timestamp: string;
-  }>("healthcheck.system.degraded"),
-  /**
-   * Emitted when a system's aggregated health status recovers to healthy.
-   * This fires when status changes from degraded/unhealthy to healthy.
-   */
-  systemHealthy: createHook<{
-    systemId: string;
-    systemName?: string;
-    previousStatus: string;
-    healthyChecks: number;
-    totalChecks: number;
-    timestamp: string;
-  }>("healthcheck.system.healthy"),
+  // The `healthcheck.system.degraded` / `.healthy` / `.health_changed` hooks
+  // were removed in Phase 4 (§10.3): the per-system aggregated health is now
+  // the reactive `health` entity, whose change deriver fires the
+  // `healthcheck.system_degraded` / `_healthy` / `_health_changed` trigger
+  // events through Stage-1 routing. The remaining hooks below are KEPT:
+  // `assignmentChanged` (config signal) and `checkCompleted` / `checkFailed`
+  // (high-frequency raw samples + numeric_state wake source).
+  //
+  // The `flappingDetected` hook was removed: flapping is now detected in the
+  // automation engine by the windowed-count gate on the
+  // `healthcheck.system_health_changed` trigger (base raw change event +
+  // `filter` + `window: { count, minutes, refire: "once" }`), so healthcheck
+  // no longer computes or emits a pre-derived flapping signal.
   /**
    * Emitted when a health check ↔ system association changes.
@@ -50,9 +42,27 @@ export const healthCheckHooks = {
   checkCompleted: createHook<{
     systemId: string;
     configurationId: string;
-    status: string;
+    status: HealthCheckStatus;
     latencyMs: number | undefined;
     result: Record<string, unknown> | undefined;
     timestamp: string;
   }>("healthcheck.check.completed"),
+  /**
+   * Narrow variant of `checkCompleted` — fires only when an individual
+   * check run completed with a non-`healthy` status. Carries the
+   * latency + raw result so subscribers can branch on collector-
+   * specific fields without re-querying. Operators usually prefer
+   * this over `checkCompleted` for incident-style automation because
+   * a "trigger on any completion, then filter" automation is harder
+   * to read at a glance than a typed `check_failed` entry point.
+   */
+  checkFailed: createHook<{
+    systemId: string;
+    configurationId: string;
+    status: HealthCheckStatus;
+    latencyMs: number | undefined;
+    result: Record<string, unknown> | undefined;
+    timestamp: string;
+  }>("healthcheck.check.failed"),
 } as const;