@checkstack/healthcheck-backend 1.3.0 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,369 @@
1
+ /**
2
+ * The reactive `health` entity (reactive automation engine §10.3).
3
+ *
4
+ * Model B PLUGIN-BACKED + COMPUTED entity. There is NO framework `entity_state`
5
+ * row for a system's aggregated health and NO domain table of its own — the
6
+ * reactive subset `{ status, healthyChecks, totalChecks }` is COMPUTED on demand
7
+ * by the `read` accessor from the SAME durable health data the rest of the
8
+ * plugin reads (`health_check_runs` via `service.getSystemHealthStatus`). Every
9
+ * evaluation-site write goes through `handle.mutate`, whose `apply` performs the
10
+ * REAL durable write (insert run + increment aggregate) and returns the
11
+ * freshly-computed view. The framework snapshots `prev` via `read` BEFORE
12
+ * `apply` runs (i.e. before the run is persisted), diffs prev → next, appends
13
+ * the transition log, and emits `ENTITY_CHANGED`.
14
+ *
15
+ * This module is the single source of truth for:
16
+ * - the `health` entity zod state schema + kind id,
17
+ * - the PLUGIN-BACKED + COMPUTED `read` accessor
18
+ * ({@link createHealthEntityRead}),
19
+ * - the change → trigger-event deriver (so the existing
20
+ * `healthcheck.system_degraded` / `_healthy` / `_health_changed`
21
+ * automations keep firing), and
22
+ * - the `writeHealthEntity` helper called at every evaluation-write site.
23
+ */
24
+ import { z } from "zod";
25
+ import { HealthCheckStatusSchema } from "@checkstack/healthcheck-common";
26
+ import { withXactLock, type SafeDatabase } from "@checkstack/backend-api";
27
+ import type {
28
+ EntityChangeDeriver,
29
+ EntityChangePayloadMapper,
30
+ EntityHandle,
31
+ EntityRead,
32
+ } from "@checkstack/automation-backend";
33
+ import type { HealthCheckService } from "./service";
34
+ import * as schema from "./schema";
35
+ // The entity change types are consumed via automation-backend's barrel (it
36
+ // re-exports them from automation-common) so this domain needs no extra dep.
37
+
38
+ type Db = SafeDatabase<typeof schema>; // `SafeDatabase` specialised to this plugin's `./schema` module
39
+
40
+ /** Entity kind id for the per-system aggregated health entity; entity ids of this kind are system ids. */
41
+ export const HEALTH_ENTITY_KIND = "health";
42
+
43
+ /**
44
+ * Reactive state subset surfaced as the entity view. The full aggregate
45
+ * (per-check breakdown, timestamps, etc.) stays in the domain tables; only
46
+ * the fields automations reason about live here. Computed on read from the
47
+ * same durable data `getSystemHealthStatus` reads — never materialized.
48
+ */
49
+ export const HealthEntityStateSchema = z.object({
50
+ status: HealthCheckStatusSchema, // aggregate status of the system
51
+ healthyChecks: z.number().int().nonnegative(), // number of checks whose own status is "healthy"
52
+ totalChecks: z.number().int().nonnegative(), // number of checks counted in the aggregate
53
+ });
54
+
55
+ export type HealthEntityState = z.infer<typeof HealthEntityStateSchema>; // static type inferred from the schema above
56
+
57
+ /**
58
+ * Qualified trigger event ids the health entity drives. These are the
59
+ * TRIGGER qualifiedIds (`${pluginId}.${trigger.id}`) that automations store
60
+ * in `trigger.event` and that Stage-1 routing matches on via
61
+ * `findEnabledByTriggerEvent` — NOT the underlying hook ids. The healthcheck
62
+ * triggers use underscore ids (`system_degraded`, …), so the deriver must
63
+ * emit `healthcheck.system_degraded`, not the dotted hook id
64
+ * `healthcheck.system.degraded`. (Verified against `automations.ts` trigger
65
+ * ids + `trigger-subscriber.ts` which fires on `t.event === qualifiedId`.)
66
+ */
67
+ export const HEALTH_TRIGGER_EVENTS = {
68
+ degraded: "healthcheck.system_degraded", // fired when status leaves "healthy"
69
+ healthy: "healthcheck.system_healthy", // fired when status becomes "healthy"
70
+ healthChanged: "healthcheck.system_health_changed", // fired on every status change
71
+ } as const;
72
+
73
+ /**
74
+ * Read `status` off a serialized entity-state record (the change payload's
75
+ * `prev` / `next` are plain JSON records, not the typed state). Returns `null` when the record is `null` or its `status` is not a string.
76
+ */
77
+ function readStatus(state: Record<string, unknown> | null): string | null {
78
+ if (state === null) return null; // no record on this side of the change
79
+ const status = state["status"];
80
+ return typeof status === "string" ? status : null; // missing / non-string status is treated like an absent record
81
+ }
82
+
83
+ /**
84
+ * Map a `health` entity change to the qualified trigger event id(s) the
85
+ * existing automations match on. Reproduces the directional + umbrella emit
86
+ * conditions that lived inline in `queue-executor.ts`:
87
+ * - recovery (→ healthy): next === "healthy" && prev !== "healthy"
88
+ * - degradation: prev === "healthy" && next !== "healthy"
89
+ * - umbrella (any change): prev !== next
90
+ *
91
+ * A create (`prev === null`) or tombstone (`next === null`) fires nothing —
92
+ * there is no prior aggregate transition to react to, matching the old
93
+ * behavior where the directional/umbrella hooks only emitted on a real
94
+ * status transition of an already-tracked system.
95
+ */
96
+ export const deriveHealthTriggerEvents: EntityChangeDeriver = (changed) => {
97
+ const prev = readStatus(changed.prev);
98
+ const next = readStatus(changed.next);
99
+ if (prev === null || next === null) return []; // create / tombstone (or unreadable status): nothing fires
100
+ if (prev === next) return []; // status unchanged: nothing fires
101
+
102
+ const events: string[] = [];
103
+ if (next === "healthy") { // recovery; prev !== "healthy" is guaranteed by the prev !== next guard above
104
+ events.push(HEALTH_TRIGGER_EVENTS.healthy);
105
+ } else if (prev === "healthy") { // degradation: left "healthy" for some other status
106
+ events.push(HEALTH_TRIGGER_EVENTS.degraded);
107
+ }
108
+ // Umbrella fires on every transition, alongside the directional event when there is one (a change between two non-healthy statuses emits only the umbrella).
109
+ events.push(HEALTH_TRIGGER_EVENTS.healthChanged);
110
+ return events;
111
+ };
112
+
113
+ function readNumber( // numeric counterpart of readStatus: read `field` off a serialized entity-state record
114
+ state: Record<string, unknown> | null,
115
+ field: string,
116
+ ): number | undefined {
117
+ if (state === null) return undefined; // no record on this side of the change
118
+ const value = state[field];
119
+ return typeof value === "number" ? value : undefined; // missing or non-numeric field ⇒ undefined
120
+ }
121
+
122
+ /**
123
+ * Map a `health` entity change to the domain-named `trigger.payload` the
124
+ * healthcheck triggers declare via `payloadSchema` (`systemId`,
125
+ * `previousStatus`, `newStatus`, `healthyChecks`, `totalChecks`, `timestamp`).
126
+ * Restores the keys operators read (`trigger.payload.systemId`,
127
+ * `.previousStatus`, …) that the generic change shape omits.
128
+ *
129
+ * `systemId` is the entity id; `previousStatus` is `prev.status` and `newStatus`
130
+ * is `next.status`; `healthyChecks` / `totalChecks` come from `next`;
131
+ * `timestamp` is the change's `occurredAt`. `systemName` is not derivable from a
132
+ * health change (it lives in the catalog) and is OPTIONAL on the schemas, so it
133
+ * is omitted.
134
+ */
135
+ export const healthChangeToPayload: EntityChangePayloadMapper = (changed) => {
136
+ return {
137
+ systemId: changed.id, // the entity id is the system id
138
+ previousStatus: readStatus(changed.prev) ?? undefined, // null (absent / unreadable) becomes undefined in the payload
139
+ newStatus: readStatus(changed.next) ?? undefined, // same null ⇒ undefined mapping
140
+ healthyChecks: readNumber(changed.next, "healthyChecks") ?? 0, // 0 when `next` is null or the field is not a number
141
+ totalChecks: readNumber(changed.next, "totalChecks") ?? 0, // 0 when `next` is null or the field is not a number
142
+ timestamp: changed.occurredAt,
143
+ };
144
+ };
145
+
146
+ /**
147
+ * Result of classifying a `health` entity change (see `classifyHealthChange`) for cross-plugin consumers (slo,
148
+ * dependency) that previously subscribed to the directional
149
+ * `systemDegraded` / `systemHealthy` hooks. Carries the systemId plus
150
+ * boolean transition flags, reproducing the exact emit conditions so a
151
+ * consumer can reproduce its old behavior via `onEntityChanged`.
152
+ *
153
+ * - `degraded`: prev === "healthy" && next !== "healthy" (and next exists)
154
+ * - `recovered`: next === "healthy" && prev !== "healthy" (and prev exists)
155
+ *
156
+ * Create / tombstone produce neither (no prior aggregate transition).
157
+ */
158
+ export interface HealthChangeClassification {
159
+ systemId: string; // id of the changed entity
160
+ previousStatus: string | null; // `prev.status`, or null when `prev` is absent or its status is not a string
161
+ newStatus: string | null; // `next.status`, or null when `next` is absent or its status is not a string
162
+ degraded: boolean;
163
+ recovered: boolean;
164
+ }
165
+
166
+ export function classifyHealthChange(changed: { // classify one `health` change into degraded / recovered flags
167
+ id: string;
168
+ prev: Record<string, unknown> | null;
169
+ next: Record<string, unknown> | null;
170
+ }): HealthChangeClassification {
171
+ const previousStatus = readStatus(changed.prev);
172
+ const newStatus = readStatus(changed.next);
173
+ const bothPresent = previousStatus !== null && newStatus !== null; // false for create (prev null) and tombstone (next null)
174
+ const degraded =
175
+ bothPresent && previousStatus === "healthy" && newStatus !== "healthy"; // `bothPresent` keeps a tombstone of a healthy system from counting as degraded
176
+ const recovered =
177
+ bothPresent && newStatus === "healthy" && previousStatus !== "healthy"; // `bothPresent` keeps a create that starts healthy from counting as recovered
178
+ return {
179
+ systemId: changed.id,
180
+ previousStatus,
181
+ newStatus,
182
+ degraded,
183
+ recovered,
184
+ };
185
+ }
186
+
187
+ /**
188
+ * Compute the reactive `health` view for a single system from durable data.
189
+ *
190
+ * Derives `{ status, healthyChecks, totalChecks }` from the SAME default-
191
+ * `healthy` baseline aggregate the executor reads via
192
+ * `getSystemHealthStatus`:
193
+ * - `status` = `getSystemHealthStatus(systemId).status` (the worst-
194
+ * wins aggregate across the system's ENABLED checks, computed from
195
+ * `health_check_runs` via `evaluateHealthStatus`; a check with no runs yet
196
+ * evaluates to `"healthy"`),
197
+ * - `healthyChecks` = count of per-check statuses that are `"healthy"`,
198
+ * - `totalChecks` = number of enabled checks (`checkStatuses.length`).
199
+ *
200
+ * EXISTENCE GATE: the entity resolves iff the system has at least one ENABLED
201
+ * check association (`checkStatuses.length > 0`). A system with no enabled
202
+ * checks has no `health` entity and is omitted from the batched `read` (its
203
+ * health is undefined, not a meaningful `healthy`).
204
+ *
205
+ * The gate is intentionally on ASSOCIATIONS, not on persisted runs: a system
206
+ * that has an enabled check but has never run yet resolves to the default-
207
+ * `healthy` baseline (the exact value `getSystemHealthStatus` returns for an
208
+ * empty run window). That makes a first-ever evaluation that comes up
209
+ * unhealthy a real `healthy → degraded` diff — firing `system_degraded` /
210
+ * `health_changed` and the `degraded` `onEntityChanged` for SLO/dependency
211
+ * consumers — instead of a suppressed create (`prev === null`). The entity and
212
+ * the executor therefore agree on the pre-run baseline.
213
+ */
214
+ export async function computeHealthEntityState(args: {
215
+ service: HealthCheckService;
216
+ systemId: string;
217
+ }): Promise<HealthEntityState | undefined> {
218
+ const { service, systemId } = args;
219
+ const overview = await service.getSystemHealthStatus(systemId);
220
+ // No enabled check associations ⇒ no health entity for this system.
221
+ if (overview.checkStatuses.length === 0) return undefined;
222
+ return {
223
+ status: overview.status, // aggregate status passed through exactly as the service reports it
224
+ healthyChecks: overview.checkStatuses.filter((c) => c.status === "healthy")
225
+ .length, // count of per-check statuses equal to "healthy"
226
+ totalChecks: overview.checkStatuses.length, // every entry in `checkStatuses`
227
+ };
228
+ }
229
+
230
+ /**
231
+ * Build the PLUGIN-BACKED + COMPUTED `read` accessor for the `health` entity.
232
+ * For each systemId, assembles the view via {@link computeHealthEntityState}
233
+ * (systems for which it resolves `undefined` — no enabled check associations — are omitted). This is the single source of truth that
234
+ * `handle.mutate` snapshots `prev` from and `get`/`getMany`/scope enrichment
235
+ * route through — no framework `entity_state` storage.
236
+ */
237
+ export function createHealthEntityRead(deps: {
238
+ service: HealthCheckService;
239
+ }): EntityRead<HealthEntityState> {
240
+ const { service } = deps;
241
+ return async (ids) => {
242
+ if (ids.length === 0) return {}; // nothing requested: skip the service entirely
243
+ const out: Record<string, HealthEntityState> = {};
244
+ await Promise.all( // per-system lookups run concurrently; one rejection rejects the whole read
245
+ ids.map(async (systemId) => {
246
+ const state = await computeHealthEntityState({ service, systemId });
247
+ if (state) out[systemId] = state; // systems without a health entity are left out of the result
248
+ }),
249
+ );
250
+ return out;
251
+ };
252
+ }
253
+
254
+ /**
255
+ * Drive an evaluation-site health write through `handle.mutate` (§10.3).
256
+ *
257
+ * `apply` performs the REAL durable write (insert the run + increment the
258
+ * hourly aggregate) and returns the freshly-computed `health` view. The
259
+ * framework snapshots `prev` via `read` BEFORE `apply` runs — i.e. BEFORE the
260
+ * run is persisted — so a real status change yields exactly one correct
261
+ * `ENTITY_CHANGED` with accurate prev → next, whose deriver fires the
262
+ * `healthcheck.system_degraded` / `_healthy` / `_health_changed` trigger
263
+ * events. An unchanged aggregate is a no-op (the handle diffs internally).
264
+ *
265
+ * Concurrency:
266
+ * - `serialize`, when provided, wraps the ENTIRE snapshot-prev + apply + diff
267
+ * + emit (the `handle.mutate` call) in a per-`systemId` critical section.
268
+ * Without it, concurrent evaluations of one system (multiple per-config jobs
269
+ * across pods, or at-least-once redelivery) interleave: both snapshot
270
+ * `prev = healthy`, both persist a failing run, both diff `healthy →
271
+ * degraded`, and both emit — yielding two `ENTITY_CHANGED` + two transition
272
+ * rows for one logical transition (inflating `transitionCount`/flapping and
273
+ * re-running dependency notify). The executor wires this to a transaction-
274
+ * scoped advisory lock keyed `health:<systemId>` (`withXactLock`), so two
275
+ * concurrent evals of one system serialize through prev-snapshot to emit.
276
+ * The durable `apply` write is the SAME whether serialized or not — only the
277
+ * snapshot/diff/emit window is protected.
278
+ *
279
+ * Failure handling:
280
+ * - When no `handle` is bound (version skew / tests), `apply` still runs —
281
+ * the durable write is never gated on entity reactivity. (The serialization
282
+ * lock is part of the reactive path, so an unbound handle skips it too; the
283
+ * durable insert keeps its own ordering guarantees.)
284
+ * - If `apply` throws BEFORE the durable write commits, the error propagates
285
+ * so the executor's own error path (fallback insert) runs. We detect this
286
+ * via `durableState`: it is only set once `apply` has produced its view, so
287
+ * if it is still unset when `mutate` throws, the durable write did not
288
+ * commit.
289
+ * - If the FRAMEWORK reactivity throws AFTER the durable write committed
290
+ * (transition append / emit — the documented Model B post-commit boundary),
291
+ * we route it to `onError` and DO NOT rethrow: a reactivity failure must
292
+ * never break health-check execution (the durable tables already hold the
293
+ * authoritative state).
294
+ *
295
+ * Returns the computed view. It never resolves to `undefined`: whenever
296
+ * `apply` fails to produce a view — with or without a bound `handle` — the
297
+ * error propagates to the caller instead.
298
+ */
299
+ export async function writeHealthEntity(args: {
300
+ handle: EntityHandle<HealthEntityState> | undefined; // undefined ⇒ no reactivity bound; `apply` runs directly
301
+ systemId: string;
302
+ apply: () => Promise<HealthEntityState>; // performs the durable write and resolves to the fresh view
303
+ onError?: (error: unknown) => void; // receives errors thrown after `apply` has already resolved
304
+ /**
305
+ * Optional per-`systemId` critical section wrapping the snapshot-prev +
306
+ * apply + diff + emit. The executor supplies a transaction-scoped advisory
307
+ * lock (`withXactLock`, key `health:<systemId>`) so concurrent evaluations
308
+ * of one system can't double-emit a single logical transition. When omitted,
309
+ * `handle.mutate` runs unserialized; when `handle` is unbound it is never invoked.
310
+ */
311
+ serialize?: <T>(fn: () => Promise<T>) => Promise<T>;
312
+ }): Promise<HealthEntityState> {
313
+ const { handle, systemId, apply, onError, serialize } = args;
314
+ if (!handle) {
315
+ // No reactivity bound — run the durable write directly.
316
+ return apply();
317
+ }
318
+ const run = serialize ?? (<T>(fn: () => Promise<T>) => fn()); // identity wrapper when no serializer is supplied
319
+ let durableState: HealthEntityState | undefined; // assigned only once `apply` has resolved
320
+ try {
321
+ // The lock scope MUST cover prev-snapshot through emit: `handle.mutate`
322
+ // snapshots `prev` via `read`, runs `apply`, diffs, and emits inside one
323
+ // call, and we wrap that whole call so two concurrent evals serialize.
324
+ return await run(() =>
325
+ handle.mutate({
326
+ id: systemId,
327
+ apply: async () => {
328
+ durableState = await apply(); // captured so the catch below can tell whether `apply` resolved
329
+ return durableState;
330
+ },
331
+ }),
332
+ );
333
+ } catch (error) {
334
+ // `durableState` unset ⇒ `apply` never resolved (it threw, or `serialize` /
335
+ // `mutate` failed before it finished); propagate so the executor's outer catch can run its fallback path.
336
+ if (durableState === undefined) throw error; // NOTE(review): assumes `apply` rejects only before its durable write commits — confirm against the `apply` implementations
337
+ // Durable write committed; only the framework reactivity failed. Fail-soft.
338
+ onError?.(error);
339
+ return durableState;
340
+ }
341
+ }
342
+
343
+ /** Build the advisory-lock key (`health:<systemId>`) for the per-system health critical section. */
344
+ export function healthSystemLockKey(systemId: string): string {
345
+ return `health:${systemId}`;
346
+ }
347
+
348
+ /**
349
+ * Build the per-`systemId` serializer for {@link writeHealthEntity} backed by
350
+ * a transaction-scoped advisory lock (`withXactLock`, key
351
+ * `health:<systemId>`). The returned function blocks until it holds the
352
+ * system's lock, runs `fn` (the whole snapshot-prev + apply + diff + emit), and
353
+ * auto-releases the lock at COMMIT/ROLLBACK. Two concurrent evaluations of one
354
+ * system therefore serialize — exactly one logical `healthy → degraded`
355
+ * transition emits exactly one `ENTITY_CHANGED` + one transition row.
356
+ *
357
+ * `fn` does its own durable writes on the outer pool; the lock only gates
358
+ * ENTRY to the critical section, so its connection affinity is irrelevant —
359
+ * the second caller cannot acquire the xact lock until the first transaction
360
+ * commits.
361
+ */
362
+ export function createHealthEntitySerializer(deps: {
363
+ db: Db;
364
+ }): (systemId: string) => <T>(fn: () => Promise<T>) => Promise<T> {
365
+ const { db } = deps;
366
+ return (systemId) => // curried: bind the system first, then pass the critical-section body
367
+ <T>(fn: () => Promise<T>) =>
368
+ withXactLock({ db, key: healthSystemLockKey(systemId), fn: () => fn() }); // the wrapper calls `fn` with no arguments, whatever `withXactLock` passes to its callback
369
+ }
@@ -0,0 +1,115 @@
1
+ import { describe, it, expect } from "bun:test";
2
+ import {
3
+ aggregateWindowedMetrics,
4
+ buildHealthState,
5
+ } from "./health-state";
6
+
7
+ describe("buildHealthState", () => {
8
+ const now = new Date("2026-05-30T12:00:00.000Z"); // fixed reference clock shared by every case below
9
+
10
+ it("computes inStatusForMs from inStatusSince relative to now", () => {
11
+ const since = new Date("2026-05-30T11:30:00.000Z"); // 30 min ago
12
+ const state = buildHealthState({
13
+ status: "unhealthy",
14
+ inStatusSince: since,
15
+ inMaintenance: false,
16
+ transitionsInWindow: 0,
17
+ transitionWindowMinutes: 60,
18
+ now,
19
+ });
20
+ expect(state.status).toBe("unhealthy");
21
+ expect(state.inStatusSince).toBe(since); // the same Date instance is passed through
22
+ expect(state.inStatusForMs).toBe(30 * 60_000);
23
+ expect(state.evaluatedAt).toBe(now); // evaluatedAt is the supplied `now`
24
+ });
25
+
26
+ it("returns inStatusForMs 0 when inStatusSince is null (fail-safe)", () => {
27
+ const state = buildHealthState({
28
+ status: "healthy",
29
+ inStatusSince: null,
30
+ inMaintenance: false,
31
+ transitionsInWindow: 0,
32
+ transitionWindowMinutes: 60,
33
+ now,
34
+ });
35
+ expect(state.inStatusSince).toBeNull();
36
+ expect(state.inStatusForMs).toBe(0);
37
+ });
38
+
39
+ it("clamps negative durations to 0 under clock skew", () => {
40
+ const future = new Date("2026-05-30T12:05:00.000Z"); // 5 min after `now`
41
+ const state = buildHealthState({
42
+ status: "degraded",
43
+ inStatusSince: future,
44
+ inMaintenance: false,
45
+ transitionsInWindow: 0,
46
+ transitionWindowMinutes: 60,
47
+ now,
48
+ });
49
+ expect(state.inStatusForMs).toBe(0); // now - future would be negative; expected to clamp to 0
50
+ });
51
+
52
+ it("passes through metrics and maintenance flag", () => {
53
+ const state = buildHealthState({
54
+ status: "healthy",
55
+ inStatusSince: null,
56
+ latencyMs: 42,
57
+ avgLatencyMs: 50,
58
+ p95LatencyMs: 120,
59
+ successRate: 0.99,
60
+ lastRunAt: now,
61
+ inMaintenance: true,
62
+ transitionsInWindow: 3,
63
+ transitionWindowMinutes: 60,
64
+ now,
65
+ });
66
+ expect(state.latencyMs).toBe(42);
67
+ expect(state.avgLatencyMs).toBe(50);
68
+ expect(state.p95LatencyMs).toBe(120);
69
+ expect(state.successRate).toBe(0.99);
70
+ expect(state.lastRunAt).toBe(now);
71
+ expect(state.inMaintenance).toBe(true);
72
+ expect(state.transitionsInWindow).toBe(3);
73
+ expect(state.transitionWindowMinutes).toBe(60);
74
+ });
75
+ });
76
+
77
+ describe("aggregateWindowedMetrics", () => { // pins: avg = Σ latencySumMs / Σ runCount over buckets that carry latency; p95 = max across buckets; successRate = Σ healthyCount / Σ runCount
78
+ it("returns empty object when there are no buckets", () => {
79
+ expect(aggregateWindowedMetrics([])).toEqual({});
80
+ });
81
+
82
+ it("computes latency-sum-weighted average, max p95, and success rate", () => {
83
+ const result = aggregateWindowedMetrics([
84
+ { runCount: 10, healthyCount: 9, latencySumMs: 1000, p95LatencyMs: 100 },
85
+ { runCount: 30, healthyCount: 30, latencySumMs: 6000, p95LatencyMs: 200 },
86
+ ]);
87
+ // (1000 + 6000) / (10 + 30) = 175
88
+ expect(result.avgLatencyMs).toBe(175);
89
+ // max p95 across buckets
90
+ expect(result.p95LatencyMs).toBe(200);
91
+ // (9 + 30) / (10 + 30) = 0.975
92
+ expect(result.successRate).toBeCloseTo(0.975, 5);
93
+ });
94
+
95
+ it("ignores buckets with null latency for the average but counts them for success rate", () => {
96
+ const result = aggregateWindowedMetrics([
97
+ { runCount: 5, healthyCount: 5, latencySumMs: null, p95LatencyMs: null },
98
+ { runCount: 5, healthyCount: 4, latencySumMs: 500, p95LatencyMs: 80 },
99
+ ]);
100
+ // only the second bucket contributes latency: 500 / 5 = 100
101
+ expect(result.avgLatencyMs).toBe(100);
102
+ expect(result.p95LatencyMs).toBe(80);
103
+ // success rate spans both: 9 / 10
104
+ expect(result.successRate).toBeCloseTo(0.9, 5);
105
+ });
106
+
107
+ it("omits avg/p95 when no bucket carries latency", () => {
108
+ const result = aggregateWindowedMetrics([
109
+ { runCount: 3, healthyCount: 2, latencySumMs: null, p95LatencyMs: null },
110
+ ]);
111
+ expect(result.avgLatencyMs).toBeUndefined();
112
+ expect(result.p95LatencyMs).toBeUndefined();
113
+ expect(result.successRate).toBeCloseTo(2 / 3, 5); // success rate is still reported without latency data
114
+ });
115
+ });