npm - @checkstack/healthcheck-backend - Versions diffs - 1.1.4 → 1.3.0 - Mend

@checkstack/healthcheck-backend 1.1.4 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30) hide show

package/CHANGELOG.md +256 -0
package/drizzle/0012_fair_boomer.sql +1 -0
package/drizzle/0013_clean_fabian_cortez.sql +20 -0
package/drizzle/0014_chilly_ultragirl.sql +2 -0
package/drizzle/meta/0012_snapshot.json +447 -0
package/drizzle/meta/0013_snapshot.json +615 -0
package/drizzle/meta/0014_snapshot.json +648 -0
package/drizzle/meta/_journal.json +21 -0
package/package.json +21 -20
package/src/auto-incident-close-job.ts +164 -0
package/src/auto-incident.test.ts +196 -0
package/src/auto-incident.ts +332 -0
package/src/automations.test.ts +255 -0
package/src/automations.ts +340 -0
package/src/healthcheck-gitops-kinds.test.ts +93 -0
package/src/healthcheck-gitops-kinds.ts +34 -0
package/src/hooks.ts +69 -4
package/src/index.ts +80 -52
package/src/notification-defaults-config.ts +10 -0
package/src/notification-policy.test.ts +104 -0
package/src/notification-policy.ts +56 -0
package/src/queue-executor.test.ts +137 -0
package/src/queue-executor.ts +434 -42
package/src/router.test.ts +12 -0
package/src/router.ts +30 -2
package/src/schema.ts +76 -0
package/src/service-assignments.test.ts +184 -0
package/src/service-notification-policy.test.ts +174 -0
package/src/service.ts +195 -1
package/tsconfig.json +5 -2

package/src/queue-executor.ts CHANGED Viewed

@@ -8,6 +8,7 @@ import {
   type BaseStrategyConfig,
   type ConnectedClient,
   type TransportClient,
+  type CollectorRunContext,
 } from "@checkstack/backend-api";
 import { QueueManager } from "@checkstack/queue-api";
 import {
@@ -39,6 +40,21 @@ import { HealthCheckService } from "./service";
 import { healthCheckHooks } from "./hooks";
 import { incrementHourlyAggregate } from "./realtime-aggregation";
 import type { HealthCheckCache } from "./cache";
+import {
+  classifyTransition,
+  shouldNotifyTransition,
+} from "./notification-policy";
+import {
+  findLastAutoIncidentClose,
+  findUnhealthySince,
+  hasHealthyRunSince,
+  isMaintenanceSuppressed,
+  isTransitionToUnhealthy,
+  openAutoIncident,
+  recordUnhealthyTransition,
+  shouldOpenForFlapping,
+  shouldOpenForSustainedUnhealthy,
+} from "./auto-incident";
 type Db = SafeDatabase<typeof schema>;
 type CatalogClient = InferClient<typeof CatalogApi>;
@@ -47,8 +63,13 @@ type IncidentClient = InferClient<typeof IncidentApi>;
 type NotificationClient = InferClient<typeof NotificationApi>;
 /**
- * Emit the checkCompleted hook if available.
- * Extracted to avoid duplicating the hook emission pattern across success/error paths.
+ * Emit the checkCompleted hook if available, plus the narrower
+ * `checkFailed` hook when the result wasn't `healthy` (so operators
+ * can wire a typed "trigger on failure" automation without having to
+ * filter `checkCompleted` themselves).
+ *
+ * Extracted to avoid duplicating the hook emission pattern across
+ * success/error paths.
  */
 async function emitCheckCompletedHook({
   getEmitHook,
@@ -66,14 +87,26 @@ async function emitCheckCompletedHook({
   result: Record<string, unknown> | undefined;
 }): Promise<void> {
   const emitHook = getEmitHook();
-  if (emitHook) {
-    await emitHook(healthCheckHooks.checkCompleted, {
+  if (!emitHook) return;
+  const timestamp = new Date().toISOString();
+  await emitHook(healthCheckHooks.checkCompleted, {
+    systemId,
+    configurationId,
+    status,
+    latencyMs,
+    result,
+    timestamp,
+  });
+  // Narrow follow-up — informational for automation triggers; the
+  // auto-incident pipeline still runs on its own thresholds.
+  if (status !== "healthy") {
+    await emitHook(healthCheckHooks.checkFailed, {
       systemId,
       configurationId,
       status,
       latencyMs,
       result,
-      timestamp: new Date().toISOString(),
+      timestamp,
     });
   }
 }
@@ -87,9 +120,11 @@ export interface HealthCheckJobPayload {
 }
 /**
- * Queue name for health check execution
+ * Queue name for health check execution. Exported so consumers like
+ * the `healthcheck.run_now` automation action can enqueue a one-off
+ * job without re-importing the recurring-job factory.
  */
-const HEALTH_CHECK_QUEUE = "health-checks";
+export const HEALTH_CHECK_QUEUE = "health-checks";
 /**
  * Worker group for health check execution (work-queue mode)
@@ -136,15 +171,245 @@ export async function scheduleHealthCheck(props: {
   });
 }
+/**
+ * After every check run, evaluate the per-check auto-incident
+ * triggers. Either trigger can independently open an incident:
+ *
+ * - **flapping**: this just-completed run was a transition to
+ *   unhealthy AND `N` such transitions have happened within the
+ *   configured window.
+ * - **sustained**: the check is currently unhealthy AND has been so
+ *   continuously for at least the configured duration.
+ *
+ * Both triggers honour the require-recovery rule: after the most
+ * recent auto-incident close (manual or auto), no new auto-incident
+ * opens until the check has logged at least one healthy run. This
+ * stops a manual close → still-unhealthy → re-open loop.
+ *
+ * Active maintenance with suppression skips both triggers when the
+ * policy opts in.
+ *
+ * Evaluation order (each step can end the evaluation early):
+ * 1. Look up `configurationId` in `newState.checkStatuses`; return
+ *    unless an entry exists and its status is `"unhealthy"`.
+ * 2. Load the assignment's notification policy. A load failure is
+ *    logged at warn level and ends the evaluation (no incident).
+ * 3. Return when `policy.autoOpenIncidentOnUnhealthy` is falsy.
+ * 4. When `policy.skipDuringMaintenance` is set, return if
+ *    `isMaintenanceSuppressed` reports suppression.
+ * 5. Apply require-recovery using `findLastAutoIncidentClose` and
+ *    `hasHealthyRunSince`.
+ * 6. On a transition to unhealthy, record it, evaluate the flapping
+ *    trigger and possibly emit the `flappingDetected` hook. Errors
+ *    in this step are logged at warn level and not rethrown.
+ * 7. When the sustained trigger is enabled, evaluate it from the
+ *    timestamp returned by `findUnhealthySince`.
+ * 8. If either trigger fired, call `openAutoIncident`. When both
+ *    fired, the flapping wording is used for `reason`.
+ *
+ * Only steps 2 and 6 have their own try/catch in this function;
+ * rejections from the other awaited helpers are not caught here and
+ * reach the caller.
+ *
+ * @param props - Dependencies (`db`, `service`, clients, `logger`),
+ *   the system/configuration identity and display names, the optional
+ *   `getEmitHook` getter, and the per-check statuses from before
+ *   (`previousState`) and after (`newState`) the run.
+ * @returns Resolves with no value once the evaluation has finished.
+ */
+async function maybeOpenAutoIncidentForCheck(props: {
+  db: Db;
+  service: HealthCheckService;
+  incidentClient: IncidentClient;
+  maintenanceClient: MaintenanceClient;
+  logger: Logger;
+  systemId: string;
+  systemName: string;
+  configurationId: string;
+  configurationName: string;
+  /**
+   * Same closure-based getter the queue executor uses elsewhere; lets
+   * us fire the `flapping_detected` automation hook from inside the
+   * flapping evaluator without re-threading `emitHook` through every
+   * intermediate caller. Optional — when absent, the hook simply
+   * doesn't fire (e.g. in unit tests that don't care about it).
+   */
+  getEmitHook?: () => EmitHookFn | undefined;
+  previousState: {
+    checkStatuses: Array<{
+      configurationId: string;
+      status: HealthCheckStatus;
+    }>;
+  };
+  newState: {
+    checkStatuses: Array<{
+      configurationId: string;
+      status: HealthCheckStatus;
+    }>;
+  };
+}): Promise<void> {
+  const {
+    db,
+    service,
+    incidentClient,
+    maintenanceClient,
+    logger,
+    systemId,
+    systemName,
+    configurationId,
+    configurationName,
+    getEmitHook,
+    previousState,
+    newState,
+  } = props;
+  const next = newState.checkStatuses.find(
+    (c) => c.configurationId === configurationId,
+  );
+  // Only auto-incident logic applies when the check is currently
+  // unhealthy — both triggers require it.
+  if (!next || next.status !== "unhealthy") return;
+  const prev = previousState.checkStatuses.find(
+    (c) => c.configurationId === configurationId,
+  );
+  const isTransition = isTransitionToUnhealthy(prev?.status, next.status);
+  let policy;
+  try {
+    policy = await service.getAssignmentNotificationPolicy({
+      systemId,
+      configurationId,
+    });
+  } catch (error) {
+    logger.warn(
+      `Failed to load policy for auto-incident decision (${systemId}/${configurationId}):`,
+      error,
+    );
+    return;
+  }
+  if (!policy.autoOpenIncidentOnUnhealthy) return;
+  // Honour active maintenance windows — operators have explicitly
+  // said the system is down on purpose.
+  if (policy.skipDuringMaintenance) {
+    const suppressed = await isMaintenanceSuppressed({
+      maintenanceClient,
+      systemId,
+      logger,
+    });
+    if (suppressed) {
+      logger.debug(
+        `Skipping auto-incident for ${systemId}/${configurationId}: active maintenance`,
+      );
+      return;
+    }
+  }
+  // Require-recovery: if there's a prior closed auto-incident for
+  // this assignment, the check must have logged at least one healthy
+  // run since the close before we can open another one. Without this,
+  // an operator's manual close on a still-broken system would loop.
+  const lastCloseAt = await findLastAutoIncidentClose({
+    db,
+    systemId,
+    configurationId,
+  });
+  if (lastCloseAt) {
+    const recovered = await hasHealthyRunSince({
+      db,
+      systemId,
+      configurationId,
+      since: lastCloseAt,
+    });
+    if (!recovered) {
+      return;
+    }
+  }
+  // Record the transition (if any) and evaluate the flapping trigger
+  // against transitions that happened after the last close window.
+  let flappingOpens = false;
+  if (isTransition) {
+    try {
+      const count = await recordUnhealthyTransition({
+        db,
+        configurationId,
+        systemId,
+        windowMinutes: policy.flappingTrigger.windowMinutes,
+        since: lastCloseAt,
+      });
+      flappingOpens = shouldOpenForFlapping({
+        policy,
+        recentTransitionCount: count,
+      });
+      // Fire the informational `flapping_detected` automation hook
+      // whether or not `shouldOpenForFlapping` decided to open.
+      // NOTE(review): the `autoOpenIncidentOnUnhealthy` early return
+      // above means it never fires with auto-incidents disabled.
+      if (
+        policy.flappingTrigger.enabled &&
+        count >= policy.flappingTrigger.transitions
+      ) {
+        const emit = getEmitHook?.();
+        if (emit) {
+          try {
+            await emit(healthCheckHooks.flappingDetected, {
+              systemId,
+              configurationId,
+              transitionCount: count,
+              windowMinutes: policy.flappingTrigger.windowMinutes,
+              timestamp: new Date().toISOString(),
+            });
+          } catch (error) {
+            logger.warn(
+              `Failed to emit healthcheck.flapping_detected hook for ${systemId}/${configurationId}:`,
+              error,
+            );
+          }
+        }
+      }
+    } catch (error) {
+      logger.warn(
+        `Failed to record unhealthy transition for ${systemId}/${configurationId}:`,
+        error,
+      );
+    }
+  }
+  // Evaluate the sustained-duration trigger on every run while the
+  // check is unhealthy (not just on transition).
+  let sustainedOpens = false;
+  if (policy.sustainedUnhealthyTrigger.enabled) {
+    const unhealthySince = await findUnhealthySince({
+      db,
+      configurationId,
+      systemId,
+      since: lastCloseAt,
+    });
+    if (unhealthySince) {
+      sustainedOpens = shouldOpenForSustainedUnhealthy({
+        policy,
+        unhealthyForMs: Date.now() - unhealthySince.getTime(),
+      });
+    }
+  }
+  if (!flappingOpens && !sustainedOpens) return;
+  const reason = flappingOpens
+    ? `flapping: ≥${policy.flappingTrigger.transitions} transitions in ${policy.flappingTrigger.windowMinutes} min`
+    : `unhealthy ≥${policy.sustainedUnhealthyTrigger.durationMinutes} min continuously`;
+  await openAutoIncident({
+    db,
+    incidentClient,
+    logger,
+    systemId,
+    systemName,
+    configurationId,
+    configurationName,
+    policy,
+    reason,
+  });
+}
 /**
  * Notify system subscribers about a health state change.
- * Skips notification if the system has active maintenance or incident with suppression enabled.
+ * Skips notification when:
+ * - the system has active maintenance/incident with suppression enabled, or
+ * - the policy of the check that just ran opts into de-escalation
+ *   suppression and this transition is a de-escalation (e.g.
+ *   `unhealthy → degraded`).
+ *
+ * For non-recovery transitions, the action CTA is deep-linked to the
+ * failing-checks filter so operators land directly on the problem.
+ *
+ * Policy is resolved per-assignment (per system+configuration) — the
+ * just-ran check is the one driving any aggregate transition in this
+ * execution, so its policy is the authoritative one.
  */
 async function notifyStateChange(props: {
   systemId: string;
   systemName: string;
+  configurationId: string;
   previousStatus: HealthCheckStatus;
   newStatus: HealthCheckStatus;
+  service: HealthCheckService;
   catalogClient: CatalogClient;
   notificationClient: NotificationClient;
   maintenanceClient: MaintenanceClient;
@@ -154,8 +419,10 @@ async function notifyStateChange(props: {
   const {
     systemId,
     systemName,
+    configurationId,
     previousStatus,
     newStatus,
+    service,
     catalogClient,
     notificationClient,
     maintenanceClient,
@@ -163,8 +430,31 @@ async function notifyStateChange(props: {
     logger,
   } = props;
-  // Only notify on actual state changes
-  if (newStatus === previousStatus) {
+  const transition = classifyTransition(previousStatus, newStatus);
+  if (transition === "none") {
+    return;
+  }
+  // Per-assignment notification policy. Failure to load defaults to
+  // "notify everything" rather than dropping the notification.
+  let suppressDeEscalations = false;
+  try {
+    const policy = await service.getAssignmentNotificationPolicy({
+      systemId,
+      configurationId,
+    });
+    suppressDeEscalations = policy.suppressDeEscalations;
+  } catch (error) {
+    logger.warn(
+      `Failed to load notification policy for ${systemId}/${configurationId}, applying defaults:`,
+      error,
+    );
+  }
+  if (!shouldNotifyTransition(transition, { suppressDeEscalations })) {
+    logger.debug(
+      `Skipping notification for ${systemId}: ${transition} suppressed by policy`,
+    );
     return;
   }
@@ -204,36 +494,38 @@ async function notifyStateChange(props: {
     );
   }
-  const isRecovery = newStatus === "healthy" && previousStatus !== "healthy";
-  const isDegraded = newStatus === "degraded";
-  const isUnhealthy = newStatus === "unhealthy";
   let title: string;
   let body: string;
   let importance: "info" | "warning" | "critical";
-  if (isRecovery) {
+  if (transition === "recovery") {
     title = `System health restored: ${systemName}`;
     body =
       `All health checks for **${systemName}** are now passing. The system has returned to normal operation.`;
     importance = "info";
-  } else if (isUnhealthy) {
+  } else if (newStatus === "unhealthy") {
     title = `System health critical: ${systemName}`;
     body = `Health checks indicate **${systemName}** is unhealthy and may be down.`;
     importance = "critical";
-  } else if (isDegraded) {
+  } else {
+    // degraded — either an escalation from healthy or a partial recovery
     title = `System health degraded: ${systemName}`;
     body =
       `Some health checks for **${systemName}** are failing. The system may be experiencing issues.`;
     importance = "warning";
-  } else {
-    // No notification for healthy → healthy (if somehow missed above)
-    return;
   }
   const systemDetailPath = resolveRoute(catalogRoutes.routes.systemDetail, {
     systemId,
   });
+  // Recovery lands on the default (all) view; failing transitions deep-link
+  // operators into the failing-checks filter so they can debug immediately.
+  const actionUrl =
+    transition === "recovery"
+      ? systemDetailPath
+      : `${systemDetailPath}?filter=failing`;
+  const actionLabel =
+    transition === "recovery" ? "View System" : "View failing checks";
   void catalogClient; // parents are resolved server-side via stored target edges
@@ -244,7 +536,7 @@ async function notifyStateChange(props: {
       title,
       body,
       importance,
-      action: { label: "View System", url: systemDetailPath },
+      action: { label: actionLabel, url: actionUrl },
       collapseKey: systemHealthCollapseKey(systemId),
       subjects: [
         createSystemSubject({
@@ -376,6 +668,17 @@ async function executeHealthCheckJob(props: {
       logger.debug(`Could not fetch system name for ${systemId}, using ID`);
     }
+    // Curated, read-only run-context metadata exposed to collectors.
+    // Metadata only - never secrets or config.
+    const runContext: CollectorRunContext = {
+      check: {
+        id: configId,
+        name: configRow.configName || configId,
+        intervalSeconds: configRow.interval,
+      },
+      system: { id: systemId, name: systemName },
+    };
     const strategy = registry.getStrategy(configRow.strategyId);
     if (!strategy) {
       logger.warn(
@@ -426,6 +729,7 @@ async function executeHealthCheckJob(props: {
                 config: collectorEntry.config,
                 client: connectedClient!.client,
                 pluginId: configRow.strategyId,
+                runContext,
               });
               // Check for collector-level error
@@ -598,11 +902,13 @@ async function executeHealthCheckJob(props: {
       const newState = await service.getSystemHealthStatus(systemId);
       if (newState.status !== previousStatus) {
         await notifyStateChange({
-        notificationClient,
+          notificationClient,
           systemId,
           systemName,
+          configurationId: configId,
           previousStatus,
           newStatus: newState.status,
+          service,
           catalogClient,
           maintenanceClient,
           incidentClient,
@@ -610,6 +916,24 @@ async function executeHealthCheckJob(props: {
         });
       }
+      // Per-check auto-incident: runs whether or not the aggregate
+      // changed (a check can transition to unhealthy without flipping
+      // the aggregate if another check is already unhealthy).
+      await maybeOpenAutoIncidentForCheck({
+        db,
+        service,
+        incidentClient,
+        maintenanceClient,
+        logger,
+        systemId,
+        systemName,
+        configurationId: configId,
+        configurationName: configRow.configName,
+        getEmitHook,
+        previousState,
+        newState,
+      });
       return;
     } finally {
       if (connectedClient) {
@@ -696,8 +1020,10 @@ async function executeHealthCheckJob(props: {
         notificationClient,
         systemId,
         systemName,
+        configurationId: configId,
         previousStatus,
         newStatus: newState.status,
+        service,
         catalogClient,
         maintenanceClient,
         incidentClient,
@@ -714,16 +1040,20 @@ async function executeHealthCheckJob(props: {
       // Emit integration hooks for external integrations
       const emitHook = getEmitHook();
       if (emitHook) {
+        const healthyChecks = newState.checkStatuses.filter(
+          (c) => c.status === "healthy",
+        ).length;
+        const totalChecks = newState.checkStatuses.length;
+        const timestamp = new Date().toISOString();
         if (newState.status === "healthy" && previousStatus !== "healthy") {
           // Recovery: system became healthy
           await emitHook(healthCheckHooks.systemHealthy, {
             systemId,
             previousStatus,
-            healthyChecks: newState.checkStatuses.filter(
-              (c) => c.status === "healthy",
-            ).length,
-            totalChecks: newState.checkStatuses.length,
-            timestamp: new Date().toISOString(),
+            healthyChecks,
+            totalChecks,
+            timestamp,
           });
           logger.debug(
             `Emitted systemHealthy hook: ${previousStatus} → ${newState.status}`,
@@ -737,19 +1067,47 @@ async function executeHealthCheckJob(props: {
             systemId,
             previousStatus,
             newStatus: newState.status,
-            healthyChecks: newState.checkStatuses.filter(
-              (c) => c.status === "healthy",
-            ).length,
-            totalChecks: newState.checkStatuses.length,
-            timestamp: new Date().toISOString(),
+            healthyChecks,
+            totalChecks,
+            timestamp,
           });
           logger.debug(
             `Emitted systemDegraded hook: ${previousStatus} → ${newState.status}`,
           );
         }
+        // Umbrella hook — fires on every transition. Emitted alongside
+        // the directional hooks so existing subscribers stay unchanged
+        // while new automation triggers can react to any change.
+        if (previousStatus !== newState.status) {
+          await emitHook(healthCheckHooks.systemHealthChanged, {
+            systemId,
+            previousStatus,
+            newStatus: newState.status,
+            healthyChecks,
+            totalChecks,
+            timestamp,
+          });
+        }
       }
     }
+    // Per-check auto-incident: see comment on the failed-execution path.
+    await maybeOpenAutoIncidentForCheck({
+      db,
+      service,
+      incidentClient,
+      maintenanceClient,
+      logger,
+      systemId,
+      systemName,
+      configurationId: configId,
+      configurationName: configRow.configName,
+      getEmitHook,
+      previousState,
+      newState,
+    });
     // Note: No manual rescheduling needed - recurring job handles it automatically
   } catch (error) {
     logger.error(
@@ -828,8 +1186,10 @@ async function executeHealthCheckJob(props: {
         notificationClient,
         systemId,
         systemName,
+        configurationId: configId,
         previousStatus,
         newStatus: newState.status,
+        service,
         catalogClient,
         maintenanceClient,
         incidentClient,
@@ -846,16 +1206,20 @@ async function executeHealthCheckJob(props: {
       // Emit integration hooks for external integrations
       const emitHook = getEmitHook();
       if (emitHook) {
+        const healthyChecks = newState.checkStatuses.filter(
+          (c) => c.status === "healthy",
+        ).length;
+        const totalChecks = newState.checkStatuses.length;
+        const timestamp = new Date().toISOString();
         if (newState.status === "healthy" && previousStatus !== "healthy") {
           // Recovery: system became healthy
           await emitHook(healthCheckHooks.systemHealthy, {
             systemId,
             previousStatus,
-            healthyChecks: newState.checkStatuses.filter(
-              (c) => c.status === "healthy",
-            ).length,
-            totalChecks: newState.checkStatuses.length,
-            timestamp: new Date().toISOString(),
+            healthyChecks,
+            totalChecks,
+            timestamp,
           });
           logger.debug(
             `Emitted systemHealthy hook: ${previousStatus} → ${newState.status}`,
@@ -869,19 +1233,47 @@ async function executeHealthCheckJob(props: {
             systemId,
             previousStatus,
             newStatus: newState.status,
-            healthyChecks: newState.checkStatuses.filter(
-              (c) => c.status === "healthy",
-            ).length,
-            totalChecks: newState.checkStatuses.length,
-            timestamp: new Date().toISOString(),
+            healthyChecks,
+            totalChecks,
+            timestamp,
           });
           logger.debug(
             `Emitted systemDegraded hook: ${previousStatus} → ${newState.status}`,
           );
         }
+        // Umbrella hook — fires on every transition. Emitted alongside
+        // the directional hooks so existing subscribers stay unchanged
+        // while new automation triggers can react to any change.
+        if (previousStatus !== newState.status) {
+          await emitHook(healthCheckHooks.systemHealthChanged, {
+            systemId,
+            previousStatus,
+            newStatus: newState.status,
+            healthyChecks,
+            totalChecks,
+            timestamp,
+          });
+        }
       }
     }
+    // Per-check auto-incident: see comment on the failed-execution path.
+    await maybeOpenAutoIncidentForCheck({
+      db,
+      service,
+      incidentClient,
+      maintenanceClient,
+      logger,
+      systemId,
+      systemName,
+      configurationId: configId,
+      configurationName: configName,
+      getEmitHook,
+      previousState,
+      newState,
+    });
     // Note: No manual rescheduling needed - recurring job handles it automatically
   }
 }

package/src/router.test.ts CHANGED Viewed

@@ -62,6 +62,16 @@ describe("HealthCheck Router", () => {
     getProvenance: mock<any>(() => Promise.resolve(null)),
   };
+  const mockConfigService = {
+    get: mock(async () => undefined),
+    set: mock(async () => {}),
+    getRedacted: mock(async () => undefined),
+  };
+  const mockCatalogClient = {
+    getSystem: mock(async () => null),
+  };
   const router = createHealthCheckRouter({
     database: mockDb as never,
     registry: mockRegistry,
@@ -69,6 +79,8 @@ describe("HealthCheck Router", () => {
     gitOpsClient: mockGitOpsClient as never,
     getEmitHook: () => undefined,
     cache: passthroughCache,
+    configService: mockConfigService as never,
+    catalogClient: mockCatalogClient as never,
   });
   it("getStrategies returns strategies from registry", async () => {