@checkstack/healthcheck-backend 1.3.0 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. package/CHANGELOG.md +409 -0
  2. package/drizzle/0015_quiet_meggan.sql +12 -0
  3. package/drizzle/0016_complex_maginty.sql +1 -0
  4. package/drizzle/0017_pretty_caretaker.sql +1 -0
  5. package/drizzle/meta/0015_snapshot.json +764 -0
  6. package/drizzle/meta/0016_snapshot.json +644 -0
  7. package/drizzle/meta/0017_snapshot.json +563 -0
  8. package/drizzle/meta/_journal.json +21 -0
  9. package/package.json +24 -21
  10. package/src/automations.test.ts +6 -27
  11. package/src/automations.ts +32 -30
  12. package/src/collector-script-test.test.ts +236 -0
  13. package/src/collector-script-test.ts +221 -0
  14. package/src/health-entity.test.ts +694 -0
  15. package/src/health-entity.ts +367 -0
  16. package/src/health-state.test.ts +115 -0
  17. package/src/health-state.ts +333 -0
  18. package/src/healthcheck-gitops-kinds.test.ts +6 -32
  19. package/src/healthcheck-gitops-kinds.ts +4 -19
  20. package/src/hooks.test.ts +19 -6
  21. package/src/hooks.ts +13 -68
  22. package/src/index.ts +118 -48
  23. package/src/queue-executor.test.ts +13 -0
  24. package/src/queue-executor.ts +251 -444
  25. package/src/retention-job.ts +65 -1
  26. package/src/retention-state-transitions.test.ts +49 -0
  27. package/src/router.test.ts +13 -0
  28. package/src/router.ts +44 -0
  29. package/src/schema.ts +34 -54
  30. package/src/service-notification-policy.test.ts +28 -71
  31. package/src/service.ts +89 -0
  32. package/src/state-evaluator.test.ts +50 -5
  33. package/src/state-evaluator.ts +9 -2
  34. package/src/state-transitions.test.ts +126 -0
  35. package/src/state-transitions.ts +112 -0
  36. package/tsconfig.json +9 -0
  37. package/src/auto-incident-close-job.ts +0 -164
  38. package/src/auto-incident.test.ts +0 -196
  39. package/src/auto-incident.ts +0 -332
@@ -0,0 +1,367 @@
1
+ /**
2
+ * The reactive `health` entity (reactive automation engine §10.3).
3
+ *
4
+ * Model B PLUGIN-BACKED + COMPUTED entity. There is NO framework `entity_state`
5
+ * row for a system's aggregated health and NO domain table of its own — the
6
+ * reactive subset `{ status, healthyChecks, totalChecks }` is COMPUTED on demand
7
+ * by the `read` accessor from the SAME durable health data the rest of the
8
+ * plugin reads (`health_check_runs` via `service.getSystemHealthStatus`). Every
9
+ * evaluation-site write goes through `handle.mutate`, whose `apply` performs the
10
+ * REAL durable write (insert run + increment aggregate) and returns the
11
+ * freshly-computed view. The framework snapshots `prev` via `read` BEFORE
12
+ * `apply` runs (i.e. before the run is persisted), diffs prev → next, appends
13
+ * the transition log, and emits `ENTITY_CHANGED`.
14
+ *
15
+ * This module is the single source of truth for:
16
+ * - the `health` entity zod state schema + kind id,
17
+ * - the PLUGIN-BACKED + COMPUTED `read` accessor
18
+ * ({@link createHealthEntityRead}),
19
+ * - the change → trigger-event deriver (so the existing
20
+ * `healthcheck.system_degraded` / `_healthy` / `_health_changed`
21
+ * automations keep firing), and
22
+ * - the `writeHealthEntity` helper called at every evaluation-write site.
23
+ */
24
+ import { z } from "zod";
25
+ import { HealthCheckStatusSchema } from "@checkstack/healthcheck-common";
26
+ import type { AdvisoryLockService } from "@checkstack/backend-api";
27
+ import type {
28
+ EntityChangeDeriver,
29
+ EntityChangePayloadMapper,
30
+ EntityHandle,
31
+ EntityRead,
32
+ } from "@checkstack/automation-backend";
33
+ import type { HealthCheckService } from "./service";
34
+
35
+ /** Entity kind id (`"health"`) for the per-system aggregated health entity. */
36
+ export const HEALTH_ENTITY_KIND = "health";
37
+
38
/** Non-negative integer count, shared by both check counters of the view. */
const HealthCheckCountSchema = z.number().int().nonnegative();

/**
 * Zod schema of the reactive `health` entity view.
 *
 * Only the fields automations reason about are exposed here; the richer
 * aggregate (per-check breakdown, timestamps, …) stays in the domain tables.
 * The view is computed on read from the same durable data
 * `getSystemHealthStatus` reads and is never materialized.
 */
export const HealthEntityStateSchema = z.object({
  status: HealthCheckStatusSchema,
  healthyChecks: HealthCheckCountSchema,
  totalChecks: HealthCheckCountSchema,
});

/** Typed shape of the `health` entity view, inferred from the schema above. */
export type HealthEntityState = z.infer<typeof HealthEntityStateSchema>;
51
+
52
+ /**
53
+ * Qualified trigger event ids the health entity drives. These are the
54
+ * TRIGGER qualifiedIds (`${pluginId}.${trigger.id}`) that automations store
55
+ * in `trigger.event` and that Stage-1 routing matches on via
56
+ * `findEnabledByTriggerEvent` — NOT the underlying hook ids. The healthcheck
57
+ * triggers use underscore ids (`system_degraded`, …), so the deriver must
58
+ * emit `healthcheck.system_degraded`, not the dotted hook id
59
+ * `healthcheck.system.degraded`. (Verified against `automations.ts` trigger
60
+ * ids + `trigger-subscriber.ts` which fires on `t.event === qualifiedId`.)
61
+ */
62
+ export const HEALTH_TRIGGER_EVENTS = {
63
+ degraded: "healthcheck.system_degraded", // derived when a system leaves "healthy" for any other status
64
+ healthy: "healthcheck.system_healthy", // derived when a system enters "healthy" from any other status
65
+ healthChanged: "healthcheck.system_health_changed", // umbrella: derived on every status transition
66
+ } as const;
67
+
68
/**
 * Extract the `status` field from a serialized entity-state record.
 *
 * The change payload's `prev` / `next` arrive as plain JSON records rather
 * than the typed state, so the value is checked at runtime.
 *
 * @param state - serialized entity state, or `null` when that side is absent.
 * @returns the status when it is a string; `null` when `state` is `null` or
 *   the field is missing / not a string.
 */
function readStatus(state: Record<string, unknown> | null): string | null {
  const candidate = state === null ? undefined : state["status"];
  if (typeof candidate !== "string") return null;
  return candidate;
}
77
+
78
/**
 * Translate a `health` entity change into the qualified trigger event id(s)
 * that existing automations match on. Mirrors the directional + umbrella emit
 * rules that used to live inline in `queue-executor.ts`:
 *   - recovery:    next === "healthy" && prev !== "healthy"
 *   - degradation: prev === "healthy" && next !== "healthy"
 *   - umbrella:    prev !== next (any transition)
 *
 * Nothing is emitted when either side has no readable status — a create
 * (`prev === null`) or a tombstone (`next === null`) — nor when the status is
 * unchanged, matching the old behavior where hooks only fired on a real
 * status transition of an already-tracked system.
 *
 * @returns the directional event (if any) followed by the umbrella event, or
 *   an empty array when there is no transition.
 */
export const deriveHealthTriggerEvents: EntityChangeDeriver = (changed) => {
  const before = readStatus(changed.prev);
  const after = readStatus(changed.next);

  const isTransition = before !== null && after !== null && before !== after;
  if (!isTransition) return [];

  // The umbrella event accompanies every transition and always comes last.
  const umbrella = HEALTH_TRIGGER_EVENTS.healthChanged;
  if (after === "healthy") return [HEALTH_TRIGGER_EVENTS.healthy, umbrella];
  if (before === "healthy") return [HEALTH_TRIGGER_EVENTS.degraded, umbrella];
  // Transition between two non-healthy statuses: umbrella only.
  return [umbrella];
};
107
+
108
/**
 * Extract a numeric field from a serialized entity-state record.
 *
 * @param state - serialized entity state, or `null` when that side is absent.
 * @param field - name of the property to read.
 * @returns the value when its runtime type is `number`; `undefined` when
 *   `state` is `null` or the field is missing / not a number.
 */
function readNumber(
  state: Record<string, unknown> | null,
  field: string,
): number | undefined {
  const candidate = state === null ? undefined : state[field];
  if (typeof candidate !== "number") return undefined;
  return candidate;
}
116
+
117
/**
 * Build the domain-named `trigger.payload` for a `health` entity change, as
 * declared by the healthcheck triggers' `payloadSchema`. This restores the
 * keys operators read (`trigger.payload.systemId`, `.previousStatus`, …) that
 * the generic change shape does not carry.
 *
 * Field sources:
 *   - `systemId`       — the entity id
 *   - `previousStatus` — `prev.status` (`undefined` when unreadable)
 *   - `newStatus`      — `next.status` (`undefined` when unreadable)
 *   - `healthyChecks` / `totalChecks` — taken from `next`, defaulting to 0
 *   - `timestamp`      — the change's `occurredAt`
 *
 * `systemName` is intentionally omitted: it cannot be derived from a health
 * change (it lives in the catalog) and is optional on the schemas.
 */
export const healthChangeToPayload: EntityChangePayloadMapper = (changed) => {
  const { id, prev, next, occurredAt } = changed;

  const previousStatus = readStatus(prev) ?? undefined;
  const newStatus = readStatus(next) ?? undefined;
  const healthyChecks = readNumber(next, "healthyChecks") ?? 0;
  const totalChecks = readNumber(next, "totalChecks") ?? 0;

  return {
    systemId: id,
    previousStatus,
    newStatus,
    healthyChecks,
    totalChecks,
    timestamp: occurredAt,
  };
};
140
+
141
/**
 * Result of {@link classifyHealthChange}: the system id, the raw statuses on
 * both sides of the change, and boolean transition flags.
 */
export interface HealthChangeClassification {
  /** Id of the system whose health changed (the entity id). */
  systemId: string;
  /** Status before the change, or `null` when it could not be read. */
  previousStatus: string | null;
  /** Status after the change, or `null` when it could not be read. */
  newStatus: string | null;
  /** `true` when the system left `"healthy"` for another status. */
  degraded: boolean;
  /** `true` when the system entered `"healthy"` from another status. */
  recovered: boolean;
}

/**
 * Classify a `health` entity change for cross-plugin consumers (slo,
 * dependency) that previously subscribed to the directional
 * `systemDegraded` / `systemHealthy` hooks, so they can reproduce their old
 * behavior via `onEntityChanged`.
 *
 *   - `degraded`:  prev === "healthy" && next !== "healthy" (next must exist)
 *   - `recovered`: next === "healthy" && prev !== "healthy" (prev must exist)
 *
 * A create or tombstone (one side without a readable status) sets neither
 * flag — there is no prior aggregate transition.
 *
 * @param changed - entity id plus the serialized `prev` / `next` states.
 */
export function classifyHealthChange(changed: {
  id: string;
  prev: Record<string, unknown> | null;
  next: Record<string, unknown> | null;
}): HealthChangeClassification {
  const previousStatus = readStatus(changed.prev);
  const newStatus = readStatus(changed.next);

  const wasHealthy = previousStatus === "healthy";
  const isHealthy = newStatus === "healthy";
  // A missing side (create / tombstone) never counts as a transition.
  const isComparable = previousStatus !== null && newStatus !== null;

  return {
    systemId: changed.id,
    previousStatus,
    newStatus,
    degraded: isComparable && wasHealthy && !isHealthy,
    recovered: isComparable && isHealthy && !wasHealthy,
  };
}
181
+
182
/**
 * Compute the reactive `health` view of one system from durable data.
 *
 * The view is derived from the overview returned by
 * `service.getSystemHealthStatus(systemId)` — the same default-`healthy`
 * baseline aggregate the executor reads:
 *   - `status`        = the overview's aggregate `status`,
 *   - `healthyChecks` = number of per-check statuses equal to `"healthy"`,
 *   - `totalChecks`   = `checkStatuses.length` (the enabled checks).
 *
 * EXISTENCE GATE: the entity exists iff the system has at least one enabled
 * check association (`checkStatuses.length > 0`); otherwise `undefined` is
 * returned, because the health of a system without checks is undefined rather
 * than a meaningful `healthy`.
 *
 * The gate is deliberately on ASSOCIATIONS, not on persisted runs. A system
 * with an enabled check that has never run resolves to the default-`healthy`
 * baseline, so a first-ever unhealthy evaluation is a real
 * `healthy → degraded` diff (firing `system_degraded` / `health_changed` and
 * the `degraded` `onEntityChanged`) instead of a suppressed create
 * (`prev === null`). Entity and executor thus agree on the pre-run baseline.
 *
 * @returns the computed view, or `undefined` when the system has no enabled
 *   check associations.
 */
export async function computeHealthEntityState(args: {
  service: HealthCheckService;
  systemId: string;
}): Promise<HealthEntityState | undefined> {
  const overview = await args.service.getSystemHealthStatus(args.systemId);
  const perCheck = overview.checkStatuses;

  // No enabled check associations ⇒ this system has no health entity.
  const totalChecks = perCheck.length;
  if (totalChecks === 0) return undefined;

  const healthyChecks = perCheck.reduce(
    (count, check) => (check.status === "healthy" ? count + 1 : count),
    0,
  );
  return { status: overview.status, healthyChecks, totalChecks };
}
224
+
225
/**
 * Build the PLUGIN-BACKED + COMPUTED `read` accessor for the `health` entity.
 *
 * For each requested systemId the view is assembled via
 * {@link computeHealthEntityState}. Systems for which that returns
 * `undefined` — i.e. systems with no enabled check associations — are omitted
 * from the result. This accessor is the single source of truth that
 * `handle.mutate` snapshots `prev` from and that `get` / `getMany` / scope
 * enrichment route through — there is no framework `entity_state` storage.
 *
 * @param deps.service - health-check service used to read the durable data.
 * @returns an accessor mapping each resolvable systemId to its computed view.
 *   Ids are computed concurrently; a repeated id is computed only once.
 */
export function createHealthEntityRead(deps: {
  service: HealthCheckService;
}): EntityRead<HealthEntityState> {
  const { service } = deps;
  return async (ids) => {
    const out: Record<string, HealthEntityState> = {};
    // De-duplicate so a repeated id does not trigger redundant service reads;
    // the result is keyed by id, so duplicates could never add entries anyway.
    const uniqueIds = [...new Set(ids)];
    if (uniqueIds.length === 0) return out;
    await Promise.all(
      uniqueIds.map(async (systemId) => {
        const state = await computeHealthEntityState({ service, systemId });
        // `undefined` means the system has no health entity: leave it out.
        if (state) out[systemId] = state;
      }),
    );
    return out;
  };
}
248
+
249
/**
 * Route an evaluation-site health write through `handle.mutate` (§10.3).
 *
 * `apply` performs the REAL durable write (insert the run + increment the
 * hourly aggregate) and resolves to the freshly-computed `health` view. The
 * framework snapshots `prev` via `read` BEFORE `apply` runs — i.e. before the
 * run is persisted — so a real status change yields exactly one
 * `ENTITY_CHANGED` with accurate prev → next, whose deriver fires the
 * `healthcheck.system_degraded` / `_healthy` / `_health_changed` trigger
 * events. An unchanged aggregate is a no-op (the handle diffs internally).
 *
 * Concurrency:
 *   `serialize`, when given, wraps the whole `handle.mutate` call
 *   (snapshot-prev + apply + diff + emit) in a per-`systemId` critical
 *   section. Without it, concurrent evaluations of one system (several
 *   per-config jobs across pods, or at-least-once redelivery) can interleave:
 *   both snapshot `prev = healthy`, both persist a failing run, both diff
 *   `healthy → degraded`, and both emit — two `ENTITY_CHANGED` + two
 *   transition rows for one logical transition (inflating
 *   `transitionCount`/flapping and re-running dependency notify). The
 *   executor wires this to a transaction-scoped advisory lock keyed
 *   `health:<systemId>` (`withXactLock`). The durable `apply` write itself is
 *   the same whether serialized or not — only the snapshot/diff/emit window
 *   is protected.
 *
 * Failure handling:
 *   - No `handle` bound (version skew / tests): `apply` runs directly; the
 *     durable write is never gated on entity reactivity. The serialization
 *     lock belongs to the reactive path, so it is skipped as well.
 *   - The call fails BEFORE `apply` has produced its view: the error
 *     propagates so the executor's own error path (fallback insert) runs.
 *   - The call fails AFTER `apply` has produced its view (framework
 *     transition append / emit — the documented Model B post-commit
 *     boundary): the error goes to `onError` and is NOT rethrown. A
 *     reactivity failure must never break health-check execution, since the
 *     durable tables already hold the authoritative state.
 *
 * @returns on success, whatever `handle.mutate` resolves to (or `apply`'s
 *   view when no handle is bound); after a post-commit reactivity failure,
 *   the view `apply` produced.
 * @throws whatever `apply` / `serialize` / `handle.mutate` threw when the
 *   failure happened before `apply` produced its view.
 */
export async function writeHealthEntity(args: {
  handle: EntityHandle<HealthEntityState> | undefined;
  systemId: string;
  apply: () => Promise<HealthEntityState>;
  onError?: (error: unknown) => void;
  /**
   * Optional per-`systemId` critical section wrapping the snapshot-prev +
   * apply + diff + emit. The executor supplies a transaction-scoped advisory
   * lock (`withXactLock`, key `health:<systemId>`) so concurrent evaluations
   * of one system can't double-emit a single logical transition. Identity by
   * default (no serialization) for the unbound-handle / test paths.
   */
  serialize?: <T>(fn: () => Promise<T>) => Promise<T>;
}): Promise<HealthEntityState> {
  const { handle, systemId, apply, onError, serialize } = args;

  // No reactivity bound — perform the durable write on its own.
  if (!handle) return apply();

  const withinCriticalSection =
    serialize ?? (<T>(fn: () => Promise<T>): Promise<T> => fn());

  // Filled in only once `apply` has resolved; while `state` is still unset,
  // the durable write has not committed.
  const committed: { state?: HealthEntityState } = {};
  const trackedApply = async (): Promise<HealthEntityState> => {
    const view = await apply();
    committed.state = view;
    return view;
  };

  try {
    // The critical section MUST span prev-snapshot through emit. All of that
    // happens inside the single `handle.mutate` call, so the whole call is
    // wrapped and two concurrent evaluations serialize.
    return await withinCriticalSection(() =>
      handle.mutate({ id: systemId, apply: trackedApply }),
    );
  } catch (error) {
    if (committed.state !== undefined) {
      // Durable write committed; only framework reactivity failed. Fail-soft.
      onError?.(error);
      return committed.state;
    }
    // `apply` never committed ⇒ the durable write failed; propagate so the
    // executor's outer catch can run its fallback path.
    throw error;
  }
}
337
+
338
/**
 * Advisory-lock key for the per-system health critical section.
 *
 * @param systemId - id of the system whose evaluations are serialized.
 * @returns the key `health:<systemId>`.
 */
export function healthSystemLockKey(systemId: string): string {
  return "health:".concat(systemId);
}
342
+
343
/**
 * Build the per-`systemId` serializer for {@link writeHealthEntity}, backed
 * by a transaction-scoped advisory lock (`withXactLock`, key
 * `health:<systemId>`).
 *
 * The function returned for a system runs `fn` (the whole snapshot-prev +
 * apply + diff + emit) through `advisoryLock.withXactLock`. By that lock's
 * contract the caller blocks until it holds the system's lock, and the lock
 * is released automatically at COMMIT/ROLLBACK. Two concurrent evaluations of
 * one system therefore serialize — one logical `healthy → degraded`
 * transition emits exactly one `ENTITY_CHANGED` + one transition row.
 *
 * `fn` does its own durable writes on the outer pool; the lock only gates
 * ENTRY to the critical section, so its connection affinity is irrelevant —
 * a second caller cannot acquire the xact lock until the first transaction
 * commits.
 *
 * @param deps.advisoryLock - lock service providing `withXactLock`.
 * @returns a factory mapping a systemId to its serializer function.
 */
export function createHealthEntitySerializer(deps: {
  advisoryLock: AdvisoryLockService;
}): (systemId: string) => <T>(fn: () => Promise<T>) => Promise<T> {
  const { advisoryLock } = deps;
  return function serializerFor(systemId) {
    return function runUnderSystemLock<T>(fn: () => Promise<T>): Promise<T> {
      return advisoryLock.withXactLock({
        key: healthSystemLockKey(systemId),
        fn: () => fn(),
      });
    };
  };
}
@@ -0,0 +1,115 @@
1
+ import { describe, it, expect } from "bun:test";
2
+ import {
3
+ aggregateWindowedMetrics,
4
+ buildHealthState,
5
+ } from "./health-state";
6
+
7
// Unit tests for `buildHealthState`: duration derivation and field pass-through.
describe("buildHealthState", () => {
  // Fixed "current time" shared by every case so durations are deterministic.
  const evaluationTime = new Date("2026-05-30T12:00:00.000Z");

  it("computes inStatusForMs from inStatusSince relative to now", () => {
    // The status was entered exactly 30 minutes before the evaluation time.
    const enteredAt = new Date("2026-05-30T11:30:00.000Z");
    const thirtyMinutesMs = 30 * 60_000;

    const built = buildHealthState({
      status: "unhealthy",
      inStatusSince: enteredAt,
      inMaintenance: false,
      transitionsInWindow: 0,
      transitionWindowMinutes: 60,
      now: evaluationTime,
    });

    expect(built.status).toBe("unhealthy");
    expect(built.inStatusSince).toBe(enteredAt);
    expect(built.inStatusForMs).toBe(thirtyMinutesMs);
    expect(built.evaluatedAt).toBe(evaluationTime);
  });

  it("returns inStatusForMs 0 when inStatusSince is null (fail-safe)", () => {
    const built = buildHealthState({
      status: "healthy",
      inStatusSince: null,
      inMaintenance: false,
      transitionsInWindow: 0,
      transitionWindowMinutes: 60,
      now: evaluationTime,
    });

    expect(built.inStatusSince).toBeNull();
    expect(built.inStatusForMs).toBe(0);
  });

  it("clamps negative durations to 0 under clock skew", () => {
    // `inStatusSince` lies 5 minutes AFTER the evaluation time.
    const enteredInFuture = new Date("2026-05-30T12:05:00.000Z");

    const built = buildHealthState({
      status: "degraded",
      inStatusSince: enteredInFuture,
      inMaintenance: false,
      transitionsInWindow: 0,
      transitionWindowMinutes: 60,
      now: evaluationTime,
    });

    expect(built.inStatusForMs).toBe(0);
  });

  it("passes through metrics and maintenance flag", () => {
    const built = buildHealthState({
      status: "healthy",
      inStatusSince: null,
      latencyMs: 42,
      avgLatencyMs: 50,
      p95LatencyMs: 120,
      successRate: 0.99,
      lastRunAt: evaluationTime,
      inMaintenance: true,
      transitionsInWindow: 3,
      transitionWindowMinutes: 60,
      now: evaluationTime,
    });

    // Latency / success metrics arrive unchanged.
    expect(built.latencyMs).toBe(42);
    expect(built.avgLatencyMs).toBe(50);
    expect(built.p95LatencyMs).toBe(120);
    expect(built.successRate).toBe(0.99);
    expect(built.lastRunAt).toBe(evaluationTime);
    // So do the maintenance flag and the transition-window fields.
    expect(built.inMaintenance).toBe(true);
    expect(built.transitionsInWindow).toBe(3);
    expect(built.transitionWindowMinutes).toBe(60);
  });
});
76
+
77
// Unit tests for `aggregateWindowedMetrics`: folding buckets into windowed
// latency and success-rate metrics.
describe("aggregateWindowedMetrics", () => {
  it("returns empty object when there are no buckets", () => {
    const metrics = aggregateWindowedMetrics([]);
    expect(metrics).toEqual({});
  });

  it("computes latency-sum-weighted average, max p95, and success rate", () => {
    const { avgLatencyMs, p95LatencyMs, successRate } =
      aggregateWindowedMetrics([
        { runCount: 10, healthyCount: 9, latencySumMs: 1000, p95LatencyMs: 100 },
        { runCount: 30, healthyCount: 30, latencySumMs: 6000, p95LatencyMs: 200 },
      ]);

    // Average: (1000 + 6000) / (10 + 30) = 175
    expect(avgLatencyMs).toBe(175);
    // p95: the largest value across buckets
    expect(p95LatencyMs).toBe(200);
    // Success rate: (9 + 30) / (10 + 30) = 0.975
    expect(successRate).toBeCloseTo(0.975, 5);
  });

  it("ignores buckets with null latency for the average but counts them for success rate", () => {
    const { avgLatencyMs, p95LatencyMs, successRate } =
      aggregateWindowedMetrics([
        { runCount: 5, healthyCount: 5, latencySumMs: null, p95LatencyMs: null },
        { runCount: 5, healthyCount: 4, latencySumMs: 500, p95LatencyMs: 80 },
      ]);

    // Only the second bucket carries latency: 500 / 5 = 100
    expect(avgLatencyMs).toBe(100);
    expect(p95LatencyMs).toBe(80);
    // Success rate covers both buckets: 9 / 10
    expect(successRate).toBeCloseTo(0.9, 5);
  });

  it("omits avg/p95 when no bucket carries latency", () => {
    const { avgLatencyMs, p95LatencyMs, successRate } =
      aggregateWindowedMetrics([
        { runCount: 3, healthyCount: 2, latencySumMs: null, p95LatencyMs: null },
      ]);

    expect(avgLatencyMs).toBeUndefined();
    expect(p95LatencyMs).toBeUndefined();
    expect(successRate).toBeCloseTo(2 / 3, 5);
  });
});