@checkstack/healthcheck-backend 1.2.0 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. package/CHANGELOG.md +541 -0
  2. package/drizzle/0015_quiet_meggan.sql +12 -0
  3. package/drizzle/0016_complex_maginty.sql +1 -0
  4. package/drizzle/0017_pretty_caretaker.sql +1 -0
  5. package/drizzle/meta/0015_snapshot.json +764 -0
  6. package/drizzle/meta/0016_snapshot.json +644 -0
  7. package/drizzle/meta/0017_snapshot.json +563 -0
  8. package/drizzle/meta/_journal.json +21 -0
  9. package/package.json +24 -21
  10. package/src/automations.test.ts +234 -0
  11. package/src/automations.ts +342 -0
  12. package/src/collector-script-test.test.ts +236 -0
  13. package/src/collector-script-test.ts +221 -0
  14. package/src/health-entity.test.ts +698 -0
  15. package/src/health-entity.ts +369 -0
  16. package/src/health-state.test.ts +115 -0
  17. package/src/health-state.ts +333 -0
  18. package/src/healthcheck-gitops-kinds.test.ts +6 -32
  19. package/src/healthcheck-gitops-kinds.ts +4 -19
  20. package/src/hooks.test.ts +19 -6
  21. package/src/hooks.ts +38 -28
  22. package/src/index.ts +150 -98
  23. package/src/queue-executor.test.ts +137 -0
  24. package/src/queue-executor.ts +282 -380
  25. package/src/retention-job.ts +65 -1
  26. package/src/retention-state-transitions.test.ts +49 -0
  27. package/src/router.test.ts +18 -0
  28. package/src/router.ts +56 -1
  29. package/src/schema.ts +34 -54
  30. package/src/service-assignments.test.ts +184 -0
  31. package/src/service-notification-policy.test.ts +28 -71
  32. package/src/service.ts +154 -0
  33. package/src/state-transitions.test.ts +126 -0
  34. package/src/state-transitions.ts +112 -0
  35. package/tsconfig.json +12 -3
  36. package/src/auto-incident-close-job.ts +0 -164
  37. package/src/auto-incident.test.ts +0 -196
  38. package/src/auto-incident.ts +0 -332
@@ -0,0 +1,333 @@
1
+ import { and, desc, eq, gte } from "drizzle-orm";
2
+ import type { HealthCheckStatus } from "@checkstack/healthcheck-common";
3
+ import type { Logger, SafeDatabase } from "@checkstack/backend-api";
4
+ import type { InferClient } from "@checkstack/common";
5
+ import { MaintenanceApi } from "@checkstack/maintenance-common";
6
+ import { healthCheckAggregates, healthCheckRuns } from "./schema";
7
+ import * as schema from "./schema";
8
+ import {
9
+ countStateTransitionsInWindow,
10
+ findInStatusSince,
11
+ } from "./state-transitions";
12
+
13
+ type Db = SafeDatabase<typeof schema>;
14
+ type MaintenanceClient = InferClient<typeof MaintenanceApi>;
15
+
/**
 * Point-in-time, service-typed view of one system's health. The
 * automation sensing layer (Wave 2) consumes this contract to decide
 * whether a system is unhealthy and for how long, instead of repeating
 * the underlying calculations on every read.
 */
export interface HealthState {
  /** Status aggregated over every enabled check. */
  status: HealthCheckStatus;
  /**
   * Moment the system most recently entered `status`, or null if no
   * transition has been recorded so far (fail-safe: never throws).
   */
  inStatusSince: Date | null;
  /**
   * How long, in milliseconds, the system has stayed in `status`
   * without interruption. 0 whenever `inStatusSince` is unknown.
   */
  inStatusForMs: number;
  /** Latency reported by the most recent run, when one exists. */
  latencyMs?: number;
  /** Average latency over the window, derived from recent aggregate buckets. */
  avgLatencyMs?: number;
  /** p95 latency over the window, derived from recent aggregate buckets. */
  p95LatencyMs?: number;
  /** Success rate over the window (healthy / total), within [0, 1]. */
  successRate?: number;
  /** When the most recent run happened, when one exists. */
  lastRunAt?: Date;
  /** True while the system sits inside a maintenance window. */
  inMaintenance: boolean;
  /**
   * Number of aggregate status transitions seen during the trailing
   * `transitionWindowMinutes` window. This generalizes flapping
   * detection: an automation can gate on "N status changes in M minutes".
   */
  transitionsInWindow: number;
  /** Length, in minutes, of the window `transitionsInWindow` covers. */
  transitionWindowMinutes: number;
  /** Timestamp at which this snapshot was evaluated. */
  evaluatedAt: Date;
}
58
+
/**
 * Plain inputs consumed by the pure builder, kept independent of the
 * DB layer.
 */
export interface HealthStateInputs {
  /** Aggregate status to report. */
  status: HealthCheckStatus;
  /** When the system entered `status`; null when unknown. */
  inStatusSince: Date | null;
  /** Latency of the newest run, if any. */
  latencyMs?: number;
  /** Windowed average latency, if available. */
  avgLatencyMs?: number;
  /** Windowed p95 latency, if available. */
  p95LatencyMs?: number;
  /** Windowed success rate, if available. */
  successRate?: number;
  /** Timestamp of the newest run, if any. */
  lastRunAt?: Date;
  /** Whether the system is currently in maintenance. */
  inMaintenance: boolean;
  /** Status transitions counted over the trailing window. */
  transitionsInWindow: number;
  /** Size of that trailing window, in minutes. */
  transitionWindowMinutes: number;
  /** Evaluation instant; durations are measured against it. */
  now: Date;
}
73
+
/**
 * Trailing window, in minutes, applied to the transition count when a
 * caller does not supply its own value.
 */
export const DEFAULT_TRANSITION_WINDOW_MINUTES = 60;
76
+
/**
 * Pure assembler for a {@link HealthState}. Performs no I/O.
 *
 * `inStatusForMs` is the distance from `inStatusSince` to `now`, with
 * two safety rails:
 * - it is clamped at 0, so clock skew (an `inStatusSince` later than
 *   `now`) never produces a negative duration;
 * - it falls back to 0 when the distance is not a finite number, which
 *   happens when either date is an Invalid Date (`getTime()` is NaN and
 *   `Math.max(0, NaN)` would otherwise leak NaN into the snapshot).
 *
 * @param inputs Already-resolved values; all fields other than `now`
 *   and `inStatusSince` are copied through unchanged.
 * @returns The snapshot, with `evaluatedAt` set to `inputs.now`.
 */
export function buildHealthState(inputs: HealthStateInputs): HealthState {
  const {
    status,
    inStatusSince,
    latencyMs,
    avgLatencyMs,
    p95LatencyMs,
    successRate,
    lastRunAt,
    inMaintenance,
    transitionsInWindow,
    transitionWindowMinutes,
    now,
  } = inputs;

  // NaN stands in for "unknown" so one finite check covers both the
  // missing-timestamp case and the invalid-date case.
  const elapsedMs = inStatusSince
    ? now.getTime() - inStatusSince.getTime()
    : Number.NaN;
  const inStatusForMs = Number.isFinite(elapsedMs)
    ? Math.max(0, elapsedMs)
    : 0;

  return {
    status,
    inStatusSince,
    inStatusForMs,
    latencyMs,
    avgLatencyMs,
    p95LatencyMs,
    successRate,
    lastRunAt,
    inMaintenance,
    transitionsInWindow,
    transitionWindowMinutes,
    evaluatedAt: now,
  };
}
116
+
/**
 * Look up the most recent run of a system and report its latency and
 * timestamp. Passing `configurationId` restricts the lookup to that one
 * check. Resolves to an empty object when no matching run is found.
 */
export async function findLatestRun({
  db,
  systemId,
  configurationId,
}: {
  db: Db;
  systemId: string;
  configurationId?: string;
}): Promise<{ latencyMs?: number; lastRunAt?: Date }> {
  const bySystem = eq(healthCheckRuns.systemId, systemId);
  // A falsy configurationId (absent or empty) leaves the query system-wide.
  const filters = configurationId
    ? [bySystem, eq(healthCheckRuns.configurationId, configurationId)]
    : [bySystem];

  const rows = await db
    .select({
      latencyMs: healthCheckRuns.latencyMs,
      timestamp: healthCheckRuns.timestamp,
    })
    .from(healthCheckRuns)
    .where(and(...filters))
    .orderBy(desc(healthCheckRuns.timestamp))
    .limit(1);

  const newest = rows[0];
  if (!newest) {
    return {};
  }

  // A null latency is reported as an absent value.
  const latencyMs = newest.latencyMs ?? undefined;
  return { latencyMs, lastRunAt: newest.timestamp };
}
151
+
/**
 * Default span, in hours, of aggregate buckets that are folded into the
 * windowed metrics.
 */
const DEFAULT_METRICS_WINDOW_HOURS = 24;
154
+
/**
 * Load the hourly aggregate buckets whose start falls inside the
 * trailing window and reduce them to windowed metrics (average latency,
 * p95 latency, success rate). All fields come back undefined when the
 * window contains no buckets.
 *
 * `now` anchors the window end (defaults to the current time) and
 * `windowHours` sets its length. Passing `configurationId` limits the
 * buckets to that single check.
 */
export async function computeWindowedMetrics({
  db,
  systemId,
  configurationId,
  now = new Date(),
  windowHours = DEFAULT_METRICS_WINDOW_HOURS,
}: {
  db: Db;
  systemId: string;
  configurationId?: string;
  now?: Date;
  windowHours?: number;
}): Promise<{
  avgLatencyMs?: number;
  p95LatencyMs?: number;
  successRate?: number;
}> {
  const windowMs = windowHours * 3_600_000;
  const windowStart = new Date(now.getTime() - windowMs);

  const filters = [
    eq(healthCheckAggregates.systemId, systemId),
    eq(healthCheckAggregates.bucketSize, "hourly"),
    gte(healthCheckAggregates.bucketStart, windowStart),
    // Narrow to a single check only when a configuration id was given.
    ...(configurationId
      ? [eq(healthCheckAggregates.configurationId, configurationId)]
      : []),
  ];

  const rows = await db
    .select({
      runCount: healthCheckAggregates.runCount,
      healthyCount: healthCheckAggregates.healthyCount,
      latencySumMs: healthCheckAggregates.latencySumMs,
      p95LatencyMs: healthCheckAggregates.p95LatencyMs,
    })
    .from(healthCheckAggregates)
    .where(and(...filters));

  return aggregateWindowedMetrics(rows);
}
201
+
/**
 * Pure reduction of aggregate buckets into windowed metrics. The input
 * is only read, never mutated, so readonly arrays are accepted.
 *
 * - `avgLatencyMs`: total latency sum divided by the run count of the
 *   buckets that carry a latency sum, rounded to the nearest integer.
 *   Buckets with a null `latencySumMs` are left out of both sides of
 *   the division. Undefined when no contributing bucket has any runs.
 * - `p95LatencyMs`: the largest bucket p95. This is a conservative
 *   upper bound that avoids re-merging t-digests. Undefined when every
 *   bucket has a null p95.
 * - `successRate`: healthy runs divided by total runs across all
 *   buckets. Undefined when the total run count is 0.
 *
 * @param buckets Aggregate rows for the window, in any order.
 * @returns An empty object when `buckets` is empty; otherwise an object
 *   holding the three metrics, each possibly undefined.
 */
export function aggregateWindowedMetrics(
  buckets: ReadonlyArray<{
    readonly runCount: number;
    readonly healthyCount: number;
    readonly latencySumMs: number | null;
    readonly p95LatencyMs: number | null;
  }>,
): {
  avgLatencyMs?: number;
  p95LatencyMs?: number;
  successRate?: number;
} {
  if (buckets.length === 0) return {};

  let totalRuns = 0;
  let totalHealthy = 0;
  let latencySum = 0;
  let latencyRuns = 0;
  let maxP95: number | undefined;

  for (const bucket of buckets) {
    totalRuns += bucket.runCount;
    totalHealthy += bucket.healthyCount;

    // Only buckets that recorded a latency sum count towards the average.
    if (bucket.latencySumMs != null) {
      latencySum += bucket.latencySumMs;
      latencyRuns += bucket.runCount;
    }

    if (bucket.p95LatencyMs != null) {
      maxP95 =
        maxP95 == null
          ? bucket.p95LatencyMs
          : Math.max(maxP95, bucket.p95LatencyMs);
    }
  }

  return {
    avgLatencyMs:
      latencyRuns > 0 ? Math.round(latencySum / latencyRuns) : undefined,
    p95LatencyMs: maxP95,
    successRate: totalRuns > 0 ? totalHealthy / totalRuns : undefined,
  };
}
247
+
/**
 * Ask the maintenance client whether the system currently has an active
 * maintenance window (suppression-agnostic). Resolves to `false` when
 * no client is supplied. A failing client call is logged as a warning
 * and also resolves to `false`, so an outage of the maintenance plugin
 * cannot block health-state reads.
 */
async function resolveInMaintenance({
  maintenanceClient,
  systemId,
  logger,
}: {
  maintenanceClient: MaintenanceClient | undefined;
  systemId: string;
  logger?: Logger;
}): Promise<boolean> {
  if (maintenanceClient) {
    try {
      const lookup = await maintenanceClient.hasActiveMaintenance({
        systemId,
      });
      return lookup.active;
    } catch (error) {
      // Fail open: report "not in maintenance" instead of propagating.
      logger?.warn(
        `Failed to resolve maintenance state for ${systemId}; assuming not in maintenance:`,
        error,
      );
    }
  }
  return false;
}
276
+
/**
 * Assemble the complete {@link HealthState} of one system.
 *
 * The status is resolved first via `resolveStatus`, because the
 * in-status-since lookup depends on it. The remaining pieces are then
 * fetched concurrently: in-status-since (transitions table), the latest
 * run, the windowed metrics, the maintenance state, and the transition
 * count for the trailing window. `now` is an explicit parameter so a
 * caller can keep one stable evaluation timestamp across several calls.
 *
 * NOTE(review): `configurationId` is forwarded to the latest-run and
 * windowed-metrics lookups only; the in-status-since and
 * transition-count helpers receive just `systemId`. Whether those
 * helpers could be scoped per check is not visible from this function.
 */
export async function computeHealthState({
  db,
  systemId,
  configurationId,
  resolveStatus,
  maintenanceClient,
  logger,
  transitionWindowMinutes = DEFAULT_TRANSITION_WINDOW_MINUTES,
  now = new Date(),
}: {
  db: Db;
  systemId: string;
  configurationId?: string;
  /** Returns the aggregate status for the system (per-check when scoped). */
  resolveStatus: () => Promise<HealthCheckStatus>;
  maintenanceClient?: MaintenanceClient;
  logger?: Logger;
  /** Trailing window (minutes) for the transition count. */
  transitionWindowMinutes?: number;
  now?: Date;
}): Promise<HealthState> {
  const status = await resolveStatus();

  // Kick off every independent lookup before awaiting any of them.
  const sinceLookup = findInStatusSince({ db, systemId, status });
  const latestRunLookup = findLatestRun({ db, systemId, configurationId });
  const metricsLookup = computeWindowedMetrics({
    db,
    systemId,
    configurationId,
    now,
  });
  const maintenanceLookup = resolveInMaintenance({
    maintenanceClient,
    systemId,
    logger,
  });
  const transitionLookup = countStateTransitionsInWindow({
    db,
    systemId,
    windowMinutes: transitionWindowMinutes,
    now,
  });

  const [inStatusSince, latestRun, metrics, inMaintenance, transitionsInWindow] =
    await Promise.all([
      sinceLookup,
      latestRunLookup,
      metricsLookup,
      maintenanceLookup,
      transitionLookup,
    ]);

  return buildHealthState({
    status,
    inStatusSince,
    latencyMs: latestRun.latencyMs,
    avgLatencyMs: metrics.avgLatencyMs,
    p95LatencyMs: metrics.p95LatencyMs,
    successRate: metrics.successRate,
    lastRunAt: latestRun.lastRunAt,
    inMaintenance,
    transitionsInWindow,
    transitionWindowMinutes,
    now,
  });
}
@@ -40,19 +40,6 @@ interface MockAssociation {
40
40
  enabled: boolean;
41
41
  notificationPolicy?: {
42
42
  suppressDeEscalations: boolean;
43
- autoOpenIncidentOnUnhealthy: boolean;
44
- useNotificationSuppression: boolean;
45
- skipDuringMaintenance: boolean;
46
- sustainedUnhealthyTrigger: {
47
- enabled: boolean;
48
- durationMinutes: number;
49
- };
50
- flappingTrigger: {
51
- enabled: boolean;
52
- transitions: number;
53
- windowMinutes: number;
54
- };
55
- autoCloseAfterMinutes: number | null;
56
43
  };
57
44
  }
58
45
 
@@ -657,12 +644,11 @@ describe("Healthcheck GitOps Kind: System Extension", () => {
657
644
  extensionSpec: [
658
645
  {
659
646
  ref: { kind: "Healthcheck", name: "db-check" },
660
- // Operator only sets the flap threshold and disables
661
- // auto-close; everything else should default in via the
662
- // schema parse.
647
+ // Operator sets the one surviving policy field; everything else
648
+ // should default in via the schema parse. Flapping thresholds are
649
+ // no longer part of the policy — they live on the trigger config.
663
650
  notificationPolicy: {
664
- flappingTrigger: { transitions: 5 },
665
- autoCloseAfterMinutes: null,
651
+ suppressDeEscalations: true,
666
652
  },
667
653
  },
668
654
  ],
@@ -672,20 +658,8 @@ describe("Healthcheck GitOps Kind: System Extension", () => {
672
658
 
673
659
  const policy = mockService.associations[0]?.notificationPolicy;
674
660
  expect(policy).toBeDefined();
675
- expect(policy?.suppressDeEscalations).toBe(false);
676
- expect(policy?.autoOpenIncidentOnUnhealthy).toBe(true);
677
- expect(policy?.useNotificationSuppression).toBe(true);
678
- expect(policy?.skipDuringMaintenance).toBe(true);
679
- expect(policy?.sustainedUnhealthyTrigger).toEqual({
680
- enabled: true,
681
- durationMinutes: 30,
682
- });
683
- expect(policy?.flappingTrigger).toEqual({
684
- enabled: true,
685
- transitions: 5,
686
- windowMinutes: 60,
687
- });
688
- expect(policy?.autoCloseAfterMinutes).toBeNull();
661
+ expect(policy?.suppressDeEscalations).toBe(true);
662
+ expect(Object.keys(policy ?? {})).toEqual(["suppressDeEscalations"]);
689
663
  });
690
664
 
691
665
  it("omits notificationPolicy entirely when the spec doesn't set it", async () => {
@@ -85,26 +85,11 @@ const systemHealthcheckExtensionSchema = z
85
85
  /**
86
86
  * Per-assignment notification policy. Any field omitted falls
87
87
  * back to the platform default (see `DEFAULT_NOTIFICATION_POLICY`).
88
- * Inner objects (`sustainedUnhealthyTrigger`, `flappingTrigger`)
89
- * are also accepted partially.
88
+ * Flapping thresholds moved onto the automation engine's windowed-count
89
+ * gate (the `system_health_changed` trigger's `window` block) and are no
90
+ * longer accepted here.
90
91
  */
91
- notificationPolicy: NotificationPolicySchema.partial()
92
- .extend({
93
- sustainedUnhealthyTrigger: z
94
- .object({
95
- enabled: z.boolean().optional(),
96
- durationMinutes: z.number().int().min(1).optional(),
97
- })
98
- .optional(),
99
- flappingTrigger: z
100
- .object({
101
- enabled: z.boolean().optional(),
102
- transitions: z.number().int().min(1).optional(),
103
- windowMinutes: z.number().int().min(1).optional(),
104
- })
105
- .optional(),
106
- })
107
- .optional(),
92
+ notificationPolicy: NotificationPolicySchema.partial().optional(),
108
93
  }),
109
94
  )
110
95
  .optional();
package/src/hooks.test.ts CHANGED
@@ -2,15 +2,28 @@ import { describe, it, expect } from "bun:test";
2
2
  import { healthCheckHooks } from "./hooks";
3
3
 
4
4
  describe("Health Check Hooks", () => {
5
- it("should have systemDegraded hook with correct ID", () => {
6
- expect(healthCheckHooks.systemDegraded.id).toBe(
7
- "healthcheck.system.degraded"
5
+ // The directional/umbrella system-health hooks were removed in Phase 4
6
+ // (§10.3) — the `health` entity drives those events now. The remaining
7
+ // hooks are the KEPT non-entity signals.
8
+ it("keeps the assignmentChanged config-change hook", () => {
9
+ expect(healthCheckHooks.assignmentChanged.id).toBe(
10
+ "healthcheck.assignment.changed",
8
11
  );
9
12
  });
10
13
 
11
- it("should have systemHealthy hook with correct ID", () => {
12
- expect(healthCheckHooks.systemHealthy.id).toBe(
13
- "healthcheck.system.healthy"
14
+ it("keeps the raw-sample checkCompleted / checkFailed hooks", () => {
15
+ expect(healthCheckHooks.checkCompleted.id).toBe(
16
+ "healthcheck.check.completed",
14
17
  );
18
+ expect(healthCheckHooks.checkFailed.id).toBe("healthcheck.check.failed");
19
+ });
20
+
21
+ it("no longer exposes the removed system-health or flapping hooks", () => {
22
+ expect("systemDegraded" in healthCheckHooks).toBe(false);
23
+ expect("systemHealthy" in healthCheckHooks).toBe(false);
24
+ expect("systemHealthChanged" in healthCheckHooks).toBe(false);
25
+ // Flapping moved to the automation engine's windowed-count gate; the
26
+ // pre-derived flapping signal hook was removed.
27
+ expect("flappingDetected" in healthCheckHooks).toBe(false);
15
28
  });
16
29
  });
package/src/hooks.ts CHANGED
@@ -1,37 +1,29 @@
1
1
  import { createHook } from "@checkstack/backend-api";
2
+ import type { HealthCheckStatus } from "@checkstack/healthcheck-common";
2
3
 
3
4
  /**
4
5
  * Health check hooks for cross-plugin communication and external integrations.
5
6
  * These hooks are registered as integration events for webhook subscriptions.
7
+ *
8
+ * `status` / `previousStatus` / `newStatus` carry the canonical
9
+ * `HealthCheckStatus` enum values, so automation triggers built on
10
+ * these hooks can offer the known values for `==` comparisons in the
11
+ * editor.
6
12
  */
7
13
  export const healthCheckHooks = {
8
- /**
9
- * Emitted when a system's aggregated health status degrades.
10
- * This fires when status changes from healthy to degraded/unhealthy,
11
- * or from degraded to unhealthy.
12
- */
13
- systemDegraded: createHook<{
14
- systemId: string;
15
- systemName?: string;
16
- previousStatus: string;
17
- newStatus: string;
18
- healthyChecks: number;
19
- totalChecks: number;
20
- timestamp: string;
21
- }>("healthcheck.system.degraded"),
22
-
23
- /**
24
- * Emitted when a system's aggregated health status recovers to healthy.
25
- * This fires when status changes from degraded/unhealthy to healthy.
26
- */
27
- systemHealthy: createHook<{
28
- systemId: string;
29
- systemName?: string;
30
- previousStatus: string;
31
- healthyChecks: number;
32
- totalChecks: number;
33
- timestamp: string;
34
- }>("healthcheck.system.healthy"),
14
+ // The `healthcheck.system.degraded` / `.healthy` / `.health_changed` hooks
15
+ // were removed in Phase 4 (§10.3): the per-system aggregated health is now
16
+ // the reactive `health` entity, whose change deriver fires the
17
+ // `healthcheck.system_degraded` / `_healthy` / `_health_changed` trigger
18
+ // events through Stage-1 routing. The remaining hooks below are KEPT:
19
+ // `assignmentChanged` (config signal) and `checkCompleted` / `checkFailed`
20
+ // (high-frequency raw samples + numeric_state wake source).
21
+ //
22
+ // The `flappingDetected` hook was removed: flapping is now detected in the
23
+ // automation engine by the windowed-count gate on the
24
+ // `healthcheck.system_health_changed` trigger (base raw change event +
25
+ // `filter` + `window: { count, minutes, refire: "once" }`), so healthcheck
26
+ // no longer computes or emits a pre-derived flapping signal.
35
27
 
36
28
  /**
37
29
  * Emitted when a health check ↔ system association changes.
@@ -50,9 +42,27 @@ export const healthCheckHooks = {
50
42
  checkCompleted: createHook<{
51
43
  systemId: string;
52
44
  configurationId: string;
53
- status: string;
45
+ status: HealthCheckStatus;
54
46
  latencyMs: number | undefined;
55
47
  result: Record<string, unknown> | undefined;
56
48
  timestamp: string;
57
49
  }>("healthcheck.check.completed"),
50
+
51
+ /**
52
+ * Narrow variant of `checkCompleted` — fires only when an individual
53
+ * check run completed with a non-`healthy` status. Carries the
54
+ * latency + raw result so subscribers can branch on collector-
55
+ * specific fields without re-querying. Operators usually prefer
56
+ * this over `checkCompleted` for incident-style automation because
57
+ * a "trigger on any completion, then filter" automation is harder
58
+ * to read at a glance than a typed `check_failed` entry point.
59
+ */
60
+ checkFailed: createHook<{
61
+ systemId: string;
62
+ configurationId: string;
63
+ status: HealthCheckStatus;
64
+ latencyMs: number | undefined;
65
+ result: Record<string, unknown> | undefined;
66
+ timestamp: string;
67
+ }>("healthcheck.check.failed"),
58
68
  } as const;