@checkstack/healthcheck-backend 1.3.0 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +409 -0
- package/drizzle/0015_quiet_meggan.sql +12 -0
- package/drizzle/0016_complex_maginty.sql +1 -0
- package/drizzle/0017_pretty_caretaker.sql +1 -0
- package/drizzle/meta/0015_snapshot.json +764 -0
- package/drizzle/meta/0016_snapshot.json +644 -0
- package/drizzle/meta/0017_snapshot.json +563 -0
- package/drizzle/meta/_journal.json +21 -0
- package/package.json +24 -21
- package/src/automations.test.ts +6 -27
- package/src/automations.ts +32 -30
- package/src/collector-script-test.test.ts +236 -0
- package/src/collector-script-test.ts +221 -0
- package/src/health-entity.test.ts +694 -0
- package/src/health-entity.ts +367 -0
- package/src/health-state.test.ts +115 -0
- package/src/health-state.ts +333 -0
- package/src/healthcheck-gitops-kinds.test.ts +6 -32
- package/src/healthcheck-gitops-kinds.ts +4 -19
- package/src/hooks.test.ts +19 -6
- package/src/hooks.ts +13 -68
- package/src/index.ts +118 -48
- package/src/queue-executor.test.ts +13 -0
- package/src/queue-executor.ts +251 -444
- package/src/retention-job.ts +65 -1
- package/src/retention-state-transitions.test.ts +49 -0
- package/src/router.test.ts +13 -0
- package/src/router.ts +44 -0
- package/src/schema.ts +34 -54
- package/src/service-notification-policy.test.ts +28 -71
- package/src/service.ts +89 -0
- package/src/state-evaluator.test.ts +50 -5
- package/src/state-evaluator.ts +9 -2
- package/src/state-transitions.test.ts +126 -0
- package/src/state-transitions.ts +112 -0
- package/tsconfig.json +9 -0
- package/src/auto-incident-close-job.ts +0 -164
- package/src/auto-incident.test.ts +0 -196
- package/src/auto-incident.ts +0 -332
|
@@ -0,0 +1,367 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* The reactive `health` entity (reactive automation engine §10.3).
|
|
3
|
+
*
|
|
4
|
+
* Model B PLUGIN-BACKED + COMPUTED entity. There is NO framework `entity_state`
|
|
5
|
+
* row for a system's aggregated health and NO domain table of its own — the
|
|
6
|
+
* reactive subset `{ status, healthyChecks, totalChecks }` is COMPUTED on demand
|
|
7
|
+
* by the `read` accessor from the SAME durable health data the rest of the
|
|
8
|
+
* plugin reads (`health_check_runs` via `service.getSystemHealthStatus`). Every
|
|
9
|
+
* evaluation-site write goes through `handle.mutate`, whose `apply` performs the
|
|
10
|
+
* REAL durable write (insert run + increment aggregate) and returns the
|
|
11
|
+
* freshly-computed view. The framework snapshots `prev` via `read` BEFORE
|
|
12
|
+
* `apply` runs (i.e. before the run is persisted), diffs prev → next, appends
|
|
13
|
+
* the transition log, and emits `ENTITY_CHANGED`.
|
|
14
|
+
*
|
|
15
|
+
* This module is the single source of truth for:
|
|
16
|
+
* - the `health` entity zod state schema + kind id,
|
|
17
|
+
* - the PLUGIN-BACKED + COMPUTED `read` accessor
|
|
18
|
+
* ({@link createHealthEntityRead}),
|
|
19
|
+
* - the change → trigger-event deriver (so the existing
|
|
20
|
+
*     `healthcheck.system_degraded` / `_healthy` / `_health_changed`
|
|
21
|
+
* automations keep firing), and
|
|
22
|
+
* - the `writeHealthEntity` helper called at every evaluation-write site.
|
|
23
|
+
*/
|
|
24
|
+
import { z } from "zod";
|
|
25
|
+
import { HealthCheckStatusSchema } from "@checkstack/healthcheck-common";
|
|
26
|
+
import type { AdvisoryLockService } from "@checkstack/backend-api";
|
|
27
|
+
import type {
|
|
28
|
+
EntityChangeDeriver,
|
|
29
|
+
EntityChangePayloadMapper,
|
|
30
|
+
EntityHandle,
|
|
31
|
+
EntityRead,
|
|
32
|
+
} from "@checkstack/automation-backend";
|
|
33
|
+
import type { HealthCheckService } from "./service";
|
|
34
|
+
|
|
35
|
+
/** Entity kind id for the per-system aggregated health. */
|
|
36
|
+
// Kind id string for the `health` entity. Within this file, entity ids of this
// kind are system ids (`writeHealthEntity` passes `id: systemId`).
export const HEALTH_ENTITY_KIND = "health";
|
|
37
|
+
|
|
38
|
+
/**
|
|
39
|
+
* Reactive state subset surfaced as the entity view. The full aggregate
|
|
40
|
+
* (per-check breakdown, timestamps, etc.) stays in the domain tables; only
|
|
41
|
+
* the fields automations reason about live here. Computed on read from the
|
|
42
|
+
* same durable data `getSystemHealthStatus` reads — never materialized.
|
|
43
|
+
*/
|
|
44
|
+
export const HealthEntityStateSchema = z.object({
|
|
45
|
+
// Aggregated status of the system, validated by the shared `HealthCheckStatusSchema`.
status: HealthCheckStatusSchema,
|
|
46
|
+
// Count of per-check statuses equal to "healthy" (see `computeHealthEntityState`).
healthyChecks: z.number().int().nonnegative(),
|
|
47
|
+
// Count of per-check statuses reported for the system (see `computeHealthEntityState`).
totalChecks: z.number().int().nonnegative(),
|
|
48
|
+
});
|
|
49
|
+
|
|
50
|
+
// Static type of the entity view, inferred from the schema so the two cannot drift.
export type HealthEntityState = z.infer<typeof HealthEntityStateSchema>;
|
|
51
|
+
|
|
52
|
+
/**
|
|
53
|
+
* Qualified trigger event ids the health entity drives. These are the
|
|
54
|
+
* TRIGGER qualifiedIds (`${pluginId}.${trigger.id}`) that automations store
|
|
55
|
+
* in `trigger.event` and that Stage-1 routing matches on via
|
|
56
|
+
* `findEnabledByTriggerEvent` — NOT the underlying hook ids. The healthcheck
|
|
57
|
+
* triggers use underscore ids (`system_degraded`, …), so the deriver must
|
|
58
|
+
* emit `healthcheck.system_degraded`, not the dotted hook id
|
|
59
|
+
* `healthcheck.system.degraded`. (Verified against `automations.ts` trigger
|
|
60
|
+
* ids + `trigger-subscriber.ts` which fires on `t.event === qualifiedId`.)
|
|
61
|
+
*/
|
|
62
|
+
export const HEALTH_TRIGGER_EVENTS = {
|
|
63
|
+
// Emitted by `deriveHealthTriggerEvents` when the status leaves "healthy".
degraded: "healthcheck.system_degraded",
|
|
64
|
+
// Emitted by `deriveHealthTriggerEvents` when the status becomes "healthy".
healthy: "healthcheck.system_healthy",
|
|
65
|
+
// Emitted by `deriveHealthTriggerEvents` on every status transition.
healthChanged: "healthcheck.system_health_changed",
|
|
66
|
+
// `as const` keeps each value as its string-literal type.
} as const;
|
|
67
|
+
|
|
68
|
+
/**
|
|
69
|
+
* Read `status` off a serialized entity-state record (the change payload's
|
|
70
|
+
* `prev` / `next` are plain JSON records, not the typed state).
|
|
71
|
+
*/
|
|
72
|
+
function readStatus(
  state: Record<string, unknown> | null | undefined,
): string | null {
  // `== null` deliberately matches both `null` and `undefined`: a change
  // payload that crossed a JSON boundary can lose the key entirely, and
  // indexing `undefined` would throw instead of reporting "no status".
  if (state == null) return null;
  const status = state["status"];
  // Anything that is not a string (missing key, number, object, ...) counts
  // as "no status" rather than being coerced.
  return typeof status === "string" ? status : null;
}
|
|
77
|
+
|
|
78
|
+
/**
|
|
79
|
+
* Map a `health` entity change to the qualified trigger event id(s) the
|
|
80
|
+
* existing automations match on. Reproduces the directional + umbrella emit
|
|
81
|
+
* conditions that lived inline in `queue-executor.ts`:
|
|
82
|
+
* - recovery (→ healthy): next === "healthy" && prev !== "healthy"
|
|
83
|
+
* - degradation: prev === "healthy" && next !== "healthy"
|
|
84
|
+
* - umbrella (any change): prev !== next
|
|
85
|
+
*
|
|
86
|
+
* A create (`prev === null`) or tombstone (`next === null`) fires nothing —
|
|
87
|
+
* there is no prior aggregate transition to react to, matching the old
|
|
88
|
+
* behavior where the directional/umbrella hooks only emitted on a real
|
|
89
|
+
* status transition of an already-tracked system.
|
|
90
|
+
*/
|
|
91
|
+
export const deriveHealthTriggerEvents: EntityChangeDeriver = (changed) => {
  const before = readStatus(changed.prev);
  const after = readStatus(changed.next);

  // Creates, tombstones and unchanged statuses are not transitions: emit nothing.
  const isTransition = before !== null && after !== null && before !== after;
  if (!isTransition) return [];

  // At most one directional event: recovery wins when the new status is
  // "healthy"; otherwise degradation applies only when leaving "healthy".
  let directional: string[] = [];
  if (after === "healthy") {
    directional = [HEALTH_TRIGGER_EVENTS.healthy];
  } else if (before === "healthy") {
    directional = [HEALTH_TRIGGER_EVENTS.degraded];
  }

  // The umbrella event always follows the directional one (if any).
  return [...directional, HEALTH_TRIGGER_EVENTS.healthChanged];
};
|
|
107
|
+
|
|
108
|
+
function readNumber(
  state: Record<string, unknown> | null | undefined,
  field: string,
): number | undefined {
  // `== null` deliberately matches both `null` and `undefined`, so a state
  // record that is missing altogether yields `undefined` instead of throwing
  // on the property access below.
  if (state == null) return undefined;
  const value = state[field];
  // Only genuine numbers are returned; numeric strings are not coerced.
  return typeof value === "number" ? value : undefined;
}
|
|
116
|
+
|
|
117
|
+
/**
|
|
118
|
+
* Map a `health` entity change to the domain-named `trigger.payload` the
|
|
119
|
+
* healthcheck triggers declare via `payloadSchema` (`systemId`,
|
|
120
|
+
* `previousStatus`, `newStatus`, `healthyChecks`, `totalChecks`, `timestamp`).
|
|
121
|
+
* Restores the keys operators read (`trigger.payload.systemId`,
|
|
122
|
+
* `.previousStatus`, …) that the generic change shape omits.
|
|
123
|
+
*
|
|
124
|
+
* `systemId` is the entity id; `previousStatus` is `prev.status` and `newStatus`
|
|
125
|
+
* is `next.status`; `healthyChecks` / `totalChecks` come from `next`;
|
|
126
|
+
* `timestamp` is the change's `occurredAt`. `systemName` is not derivable from a
|
|
127
|
+
* health change (it lives in the catalog) and is OPTIONAL on the schemas, so it
|
|
128
|
+
* is omitted.
|
|
129
|
+
*/
|
|
130
|
+
export const healthChangeToPayload: EntityChangePayloadMapper = (changed) => {
  const { id, prev, next, occurredAt } = changed;

  // Statuses are omitted (undefined) rather than sent as null when absent.
  const previousStatus = readStatus(prev) ?? undefined;
  const newStatus = readStatus(next) ?? undefined;

  // Counts always describe the NEW state and fall back to 0 when unreadable.
  const healthyChecks = readNumber(next, "healthyChecks") ?? 0;
  const totalChecks = readNumber(next, "totalChecks") ?? 0;

  return {
    systemId: id,
    previousStatus,
    newStatus,
    healthyChecks,
    totalChecks,
    timestamp: occurredAt,
  };
};
|
|
140
|
+
|
|
141
|
+
/**
|
|
142
|
+
* Classify a `health` entity change for cross-plugin consumers (slo,
|
|
143
|
+
* dependency) that previously subscribed to the directional
|
|
144
|
+
* `systemDegraded` / `systemHealthy` hooks. Returns the systemId plus
|
|
145
|
+
* boolean transition flags, reproducing the exact emit conditions so a
|
|
146
|
+
* consumer can reproduce its old behavior via `onEntityChanged`.
|
|
147
|
+
*
|
|
148
|
+
* - `degraded`: prev === "healthy" && next !== "healthy" (and next exists)
|
|
149
|
+
* - `recovered`: next === "healthy" && prev !== "healthy" (and prev exists)
|
|
150
|
+
*
|
|
151
|
+
* Create / tombstone produce neither (no prior aggregate transition).
|
|
152
|
+
*/
|
|
153
|
+
export interface HealthChangeClassification {
|
|
154
|
+
// Id of the changed entity (`changed.id` in `classifyHealthChange`).
systemId: string;
|
|
155
|
+
// `prev.status` when it is a string, otherwise null.
previousStatus: string | null;
|
|
156
|
+
// `next.status` when it is a string, otherwise null.
newStatus: string | null;
|
|
157
|
+
// True only when both statuses are present, previous is "healthy" and new is not.
degraded: boolean;
|
|
158
|
+
// True only when both statuses are present, new is "healthy" and previous is not.
recovered: boolean;
|
|
159
|
+
}
|
|
160
|
+
|
|
161
|
+
/**
 * Classify one `health` entity change into directional transition flags.
 *
 * @param changed - The entity id plus the serialized `prev` / `next` state
 *   records (either may be `null`).
 * @returns The system id, both statuses as read from the records, and the
 *   `degraded` / `recovered` flags. Both flags are `false` whenever either
 *   status is missing.
 */
export function classifyHealthChange(changed: {
  id: string;
  prev: Record<string, unknown> | null;
  next: Record<string, unknown> | null;
}): HealthChangeClassification {
  const previousStatus = readStatus(changed.prev);
  const newStatus = readStatus(changed.next);

  let degraded = false;
  let recovered = false;
  // A direction only exists when both sides of the change carry a status.
  if (previousStatus !== null && newStatus !== null) {
    const wasHealthy = previousStatus === "healthy";
    const isHealthy = newStatus === "healthy";
    degraded = wasHealthy && !isHealthy;
    recovered = isHealthy && !wasHealthy;
  }

  return {
    systemId: changed.id,
    previousStatus,
    newStatus,
    degraded,
    recovered,
  };
}
|
|
181
|
+
|
|
182
|
+
/**
|
|
183
|
+
* Compute the reactive `health` view for a single system from durable data.
|
|
184
|
+
*
|
|
185
|
+
* Derives `{ status, healthyChecks, totalChecks }` from the SAME default-
|
|
186
|
+
* `healthy` baseline aggregate the executor reads via
|
|
187
|
+
* `getSystemHealthStatus`:
|
|
188
|
+
* - `status` = `getSystemHealthStatus(systemId).status` (the worst-
|
|
189
|
+
* wins aggregate across the system's ENABLED checks, computed from
|
|
190
|
+
* `health_check_runs` via `evaluateHealthStatus`; a check with no runs yet
|
|
191
|
+
* evaluates to `"healthy"`),
|
|
192
|
+
* - `healthyChecks` = count of per-check statuses that are `"healthy"`,
|
|
193
|
+
* - `totalChecks` = number of enabled checks (`checkStatuses.length`).
|
|
194
|
+
*
|
|
195
|
+
* EXISTENCE GATE: the entity resolves iff the system has at least one ENABLED
|
|
196
|
+
* check association (`checkStatuses.length > 0`). A system with no enabled
|
|
197
|
+
* checks has no `health` entity and is omitted from the batched `read` (its
|
|
198
|
+
* health is undefined, not a meaningful `healthy`).
|
|
199
|
+
*
|
|
200
|
+
* The gate is intentionally on ASSOCIATIONS, not on persisted runs: a system
|
|
201
|
+
* that has an enabled check but has never run yet resolves to the default-
|
|
202
|
+
* `healthy` baseline (the exact value `getSystemHealthStatus` returns for an
|
|
203
|
+
* empty run window). That makes a first-ever evaluation that comes up
|
|
204
|
+
* unhealthy a real `healthy → degraded` diff — firing `system_degraded` /
|
|
205
|
+
* `health_changed` and the `degraded` `onEntityChanged` for SLO/dependency
|
|
206
|
+
* consumers — instead of a suppressed create (`prev === null`). The entity and
|
|
207
|
+
* the executor therefore agree on the pre-run baseline.
|
|
208
|
+
*/
|
|
209
|
+
export async function computeHealthEntityState(args: {
  service: HealthCheckService;
  systemId: string;
}): Promise<HealthEntityState | undefined> {
  const { status, checkStatuses } = await args.service.getSystemHealthStatus(
    args.systemId,
  );

  // Existence gate: no check statuses means there is no health entity.
  const totalChecks = checkStatuses.length;
  if (totalChecks === 0) return undefined;

  // Tally the per-check statuses that are exactly "healthy".
  let healthyChecks = 0;
  for (const check of checkStatuses) {
    if (check.status === "healthy") healthyChecks += 1;
  }

  return { status, healthyChecks, totalChecks };
}
|
|
224
|
+
|
|
225
|
+
/**
|
|
226
|
+
* Build the PLUGIN-BACKED + COMPUTED `read` accessor for the `health` entity.
|
|
227
|
+
* For each systemId, assembles the view via {@link computeHealthEntityState}
|
|
228
|
+
 * (systems with no enabled checks omitted). This is the single source of truth that
|
|
229
|
+
* `handle.mutate` snapshots `prev` from and `get`/`getMany`/scope enrichment
|
|
230
|
+
* route through — no framework `entity_state` storage.
|
|
231
|
+
*/
|
|
232
|
+
export function createHealthEntityRead(deps: {
  service: HealthCheckService;
}): EntityRead<HealthEntityState> {
  const { service } = deps;
  return async (ids) => {
    if (ids.length === 0) return {};

    // The result is keyed by id, so a repeated id can only ever produce one
    // entry; de-duplicate up front so each system is computed once.
    const uniqueIds = [...new Set(ids)];
    const states = await Promise.all(
      uniqueIds.map((systemId) =>
        computeHealthEntityState({ service, systemId }),
      ),
    );

    // Assemble after all lookups settle so key order follows the input order
    // rather than whichever lookup happened to finish first.
    const out: Record<string, HealthEntityState> = {};
    uniqueIds.forEach((systemId, index) => {
      const state = states[index];
      // Systems for which no state was computed are omitted from the result.
      if (state) out[systemId] = state;
    });
    return out;
  };
}
|
|
248
|
+
|
|
249
|
+
/**
|
|
250
|
+
* Drive an evaluation-site health write through `handle.mutate` (§10.3).
|
|
251
|
+
*
|
|
252
|
+
* `apply` performs the REAL durable write (insert the run + increment the
|
|
253
|
+
* hourly aggregate) and returns the freshly-computed `health` view. The
|
|
254
|
+
* framework snapshots `prev` via `read` BEFORE `apply` runs — i.e. BEFORE the
|
|
255
|
+
* run is persisted — so a real status change yields exactly one correct
|
|
256
|
+
* `ENTITY_CHANGED` with accurate prev → next, whose deriver fires the
|
|
257
|
+
* `healthcheck.system_degraded` / `_healthy` / `_health_changed` trigger
|
|
258
|
+
* events. An unchanged aggregate is a no-op (the handle diffs internally).
|
|
259
|
+
*
|
|
260
|
+
* Concurrency:
|
|
261
|
+
* - `serialize`, when provided, wraps the ENTIRE snapshot-prev + apply + diff
|
|
262
|
+
* + emit (the `handle.mutate` call) in a per-`systemId` critical section.
|
|
263
|
+
* Without it, concurrent evaluations of one system (multiple per-config jobs
|
|
264
|
+
* across pods, or at-least-once redelivery) interleave: both snapshot
|
|
265
|
+
* `prev = healthy`, both persist a failing run, both diff `healthy →
|
|
266
|
+
* degraded`, and both emit — yielding two `ENTITY_CHANGED` + two transition
|
|
267
|
+
* rows for one logical transition (inflating `transitionCount`/flapping and
|
|
268
|
+
* re-running dependency notify). The executor wires this to a transaction-
|
|
269
|
+
* scoped advisory lock keyed `health:<systemId>` (`withXactLock`), so two
|
|
270
|
+
* concurrent evals of one system serialize through prev-snapshot to emit.
|
|
271
|
+
* The durable `apply` write is the SAME whether serialized or not — only the
|
|
272
|
+
* snapshot/diff/emit window is protected.
|
|
273
|
+
*
|
|
274
|
+
* Failure handling:
|
|
275
|
+
* - When no `handle` is bound (version skew / tests), `apply` still runs —
|
|
276
|
+
* the durable write is never gated on entity reactivity. (The serialization
|
|
277
|
+
* lock is part of the reactive path, so an unbound handle skips it too; the
|
|
278
|
+
* durable insert keeps its own ordering guarantees.)
|
|
279
|
+
* - If `apply` throws BEFORE the durable write commits, the error propagates
|
|
280
|
+
* so the executor's own error path (fallback insert) runs. We detect this
|
|
281
|
+
* via `durableState`: it is only set once `apply` has produced its view, so
|
|
282
|
+
* if it is still unset when `mutate` throws, the durable write did not
|
|
283
|
+
* commit.
|
|
284
|
+
* - If the FRAMEWORK reactivity throws AFTER the durable write committed
|
|
285
|
+
* (transition append / emit — the documented Model B post-commit boundary),
|
|
286
|
+
* we route it to `onError` and DO NOT rethrow: a reactivity failure must
|
|
287
|
+
* never break health-check execution (the durable tables already hold the
|
|
288
|
+
* authoritative state).
|
|
289
|
+
*
|
|
290
|
+
* Returns the computed view (or `undefined` if `apply` never produced one,
|
|
291
|
+
* which only happens when it threw and `handle` was absent — in which case the
|
|
292
|
+
* throw already propagated).
|
|
293
|
+
*/
|
|
294
|
+
export async function writeHealthEntity(args: {
  handle: EntityHandle<HealthEntityState> | undefined;
  systemId: string;
  apply: () => Promise<HealthEntityState>;
  onError?: (error: unknown) => void;
  /**
   * Optional critical section, scoped to this `systemId`, wrapped around the
   * whole snapshot-prev + apply + diff + emit sequence. The executor passes a
   * transaction-scoped advisory lock (`withXactLock`, key `health:<systemId>`)
   * so concurrent evaluations of one system cannot emit the same logical
   * transition twice. When omitted, the work runs directly with no
   * serialization (unbound-handle / test paths).
   */
  serialize?: <T>(fn: () => Promise<T>) => Promise<T>;
}): Promise<HealthEntityState> {
  const { handle, systemId, apply, onError, serialize } = args;

  // Without a bound handle there is no reactivity: just do the durable write.
  if (!handle) {
    return apply();
  }

  // Fall back to a pass-through when the caller supplied no serializer.
  const withinCriticalSection =
    serialize ?? (<T>(fn: () => Promise<T>) => fn());

  // Stays undefined until `apply` has resolved; this is how the catch below
  // tells "durable write failed" apart from "reactivity failed afterwards".
  let committed: HealthEntityState | undefined;
  const trackedApply = async (): Promise<HealthEntityState> => {
    committed = await apply();
    return committed;
  };

  try {
    // The entire `handle.mutate` call runs inside the critical section, so
    // the serializer covers everything mutate does around `apply`.
    return await withinCriticalSection(() =>
      handle.mutate({ id: systemId, apply: trackedApply }),
    );
  } catch (error) {
    if (committed !== undefined) {
      // `apply` completed; only the surrounding machinery failed. Report it
      // and still hand back the view that `apply` produced.
      onError?.(error);
      return committed;
    }
    // `apply` never produced a view: propagate so the caller can react.
    throw error;
  }
}
|
|
337
|
+
|
|
338
|
+
/** Advisory-lock key namespace for the per-system health critical section. */
|
|
339
|
+
export function healthSystemLockKey(systemId: string): string {
  // Fixed "health:" namespace prefix followed by the system id, unchanged.
  return "health:".concat(systemId);
}
|
|
342
|
+
|
|
343
|
+
/**
|
|
344
|
+
* Build the per-`systemId` serializer for {@link writeHealthEntity} backed by
|
|
345
|
+
* a transaction-scoped advisory lock (`withXactLock`, key
|
|
346
|
+
* `health:<systemId>`). The returned function blocks until it holds the
|
|
347
|
+
* system's lock, runs `fn` (the whole snapshot-prev + apply + diff + emit), and
|
|
348
|
+
* auto-releases the lock at COMMIT/ROLLBACK. Two concurrent evaluations of one
|
|
349
|
+
* system therefore serialize — exactly one logical `healthy → degraded`
|
|
350
|
+
* transition emits exactly one `ENTITY_CHANGED` + one transition row.
|
|
351
|
+
*
|
|
352
|
+
* `fn` does its own durable writes on the outer pool; the lock only gates
|
|
353
|
+
* ENTRY to the critical section, so its connection affinity is irrelevant —
|
|
354
|
+
* the second caller cannot acquire the xact lock until the first transaction
|
|
355
|
+
* commits.
|
|
356
|
+
*/
|
|
357
|
+
export function createHealthEntitySerializer(deps: {
  advisoryLock: AdvisoryLockService;
}): (systemId: string) => <T>(fn: () => Promise<T>) => Promise<T> {
  const { advisoryLock } = deps;

  return function serializerFor(systemId) {
    return function serialize<T>(fn: () => Promise<T>): Promise<T> {
      return advisoryLock.withXactLock({
        // Key is derived per call from the system id via the shared helper.
        key: healthSystemLockKey(systemId),
        // Wrapped so `fn` is always invoked with no arguments.
        fn: () => fn(),
      });
    };
  };
}
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
import { describe, it, expect } from "bun:test";
|
|
2
|
+
import {
|
|
3
|
+
aggregateWindowedMetrics,
|
|
4
|
+
buildHealthState,
|
|
5
|
+
} from "./health-state";
|
|
6
|
+
|
|
7
|
+
describe("buildHealthState", () => {
  // Fixed evaluation instant so every duration assertion is deterministic.
  const now = new Date("2026-05-30T12:00:00.000Z");

  // Inputs shared by the duration-focused cases; each case supplies its own
  // `status` and `inStatusSince`.
  const quietWindow = {
    inMaintenance: false,
    transitionsInWindow: 0,
    transitionWindowMinutes: 60,
    now,
  };

  it("computes inStatusForMs from inStatusSince relative to now", () => {
    // 30 minutes before `now`.
    const thirtyMinutesAgo = new Date("2026-05-30T11:30:00.000Z");
    const { status, inStatusSince, inStatusForMs, evaluatedAt } =
      buildHealthState({
        ...quietWindow,
        status: "unhealthy",
        inStatusSince: thirtyMinutesAgo,
      });
    expect(status).toBe("unhealthy");
    expect(inStatusSince).toBe(thirtyMinutesAgo);
    expect(inStatusForMs).toBe(30 * 60_000);
    expect(evaluatedAt).toBe(now);
  });

  it("returns inStatusForMs 0 when inStatusSince is null (fail-safe)", () => {
    const { inStatusSince, inStatusForMs } = buildHealthState({
      ...quietWindow,
      status: "healthy",
      inStatusSince: null,
    });
    expect(inStatusSince).toBeNull();
    expect(inStatusForMs).toBe(0);
  });

  it("clamps negative durations to 0 under clock skew", () => {
    // 5 minutes AFTER `now`, i.e. the status "started" in the future.
    const fiveMinutesAhead = new Date("2026-05-30T12:05:00.000Z");
    const { inStatusForMs } = buildHealthState({
      ...quietWindow,
      status: "degraded",
      inStatusSince: fiveMinutesAhead,
    });
    expect(inStatusForMs).toBe(0);
  });

  it("passes through metrics and maintenance flag", () => {
    const {
      latencyMs,
      avgLatencyMs,
      p95LatencyMs,
      successRate,
      lastRunAt,
      inMaintenance,
      transitionsInWindow,
      transitionWindowMinutes,
    } = buildHealthState({
      status: "healthy",
      inStatusSince: null,
      latencyMs: 42,
      avgLatencyMs: 50,
      p95LatencyMs: 120,
      successRate: 0.99,
      lastRunAt: now,
      inMaintenance: true,
      transitionsInWindow: 3,
      transitionWindowMinutes: 60,
      now,
    });
    expect(latencyMs).toBe(42);
    expect(avgLatencyMs).toBe(50);
    expect(p95LatencyMs).toBe(120);
    expect(successRate).toBe(0.99);
    expect(lastRunAt).toBe(now);
    expect(inMaintenance).toBe(true);
    expect(transitionsInWindow).toBe(3);
    expect(transitionWindowMinutes).toBe(60);
  });
});
|
|
76
|
+
|
|
77
|
+
describe("aggregateWindowedMetrics", () => {
  it("returns empty object when there are no buckets", () => {
    expect(aggregateWindowedMetrics([])).toEqual({});
  });

  it("computes latency-sum-weighted average, max p95, and success rate", () => {
    const smallBucket = {
      runCount: 10,
      healthyCount: 9,
      latencySumMs: 1000,
      p95LatencyMs: 100,
    };
    const largeBucket = {
      runCount: 30,
      healthyCount: 30,
      latencySumMs: 6000,
      p95LatencyMs: 200,
    };
    const { avgLatencyMs, p95LatencyMs, successRate } =
      aggregateWindowedMetrics([smallBucket, largeBucket]);
    // Average: (1000 + 6000) / (10 + 30) = 175
    expect(avgLatencyMs).toBe(175);
    // p95: the maximum across buckets
    expect(p95LatencyMs).toBe(200);
    // Success rate: (9 + 30) / (10 + 30) = 0.975
    expect(successRate).toBeCloseTo(0.975, 5);
  });

  it("ignores buckets with null latency for the average but counts them for success rate", () => {
    const { avgLatencyMs, p95LatencyMs, successRate } =
      aggregateWindowedMetrics([
        { runCount: 5, healthyCount: 5, latencySumMs: null, p95LatencyMs: null },
        { runCount: 5, healthyCount: 4, latencySumMs: 500, p95LatencyMs: 80 },
      ]);
    // Only the second bucket carries latency: 500 / 5 = 100
    expect(avgLatencyMs).toBe(100);
    expect(p95LatencyMs).toBe(80);
    // Success rate covers both buckets: 9 / 10
    expect(successRate).toBeCloseTo(0.9, 5);
  });

  it("omits avg/p95 when no bucket carries latency", () => {
    const { avgLatencyMs, p95LatencyMs, successRate } =
      aggregateWindowedMetrics([
        { runCount: 3, healthyCount: 2, latencySumMs: null, p95LatencyMs: null },
      ]);
    expect(avgLatencyMs).toBeUndefined();
    expect(p95LatencyMs).toBeUndefined();
    expect(successRate).toBeCloseTo(2 / 3, 5);
  });
});
|