@checkstack/healthcheck-backend 1.3.0 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +329 -0
- package/drizzle/0015_quiet_meggan.sql +12 -0
- package/drizzle/0016_complex_maginty.sql +1 -0
- package/drizzle/0017_pretty_caretaker.sql +1 -0
- package/drizzle/meta/0015_snapshot.json +764 -0
- package/drizzle/meta/0016_snapshot.json +644 -0
- package/drizzle/meta/0017_snapshot.json +563 -0
- package/drizzle/meta/_journal.json +21 -0
- package/package.json +24 -21
- package/src/automations.test.ts +6 -27
- package/src/automations.ts +32 -30
- package/src/collector-script-test.test.ts +236 -0
- package/src/collector-script-test.ts +221 -0
- package/src/health-entity.test.ts +698 -0
- package/src/health-entity.ts +369 -0
- package/src/health-state.test.ts +115 -0
- package/src/health-state.ts +333 -0
- package/src/healthcheck-gitops-kinds.test.ts +6 -32
- package/src/healthcheck-gitops-kinds.ts +4 -19
- package/src/hooks.test.ts +19 -6
- package/src/hooks.ts +13 -68
- package/src/index.ts +115 -48
- package/src/queue-executor.ts +243 -444
- package/src/retention-job.ts +65 -1
- package/src/retention-state-transitions.test.ts +49 -0
- package/src/router.test.ts +13 -0
- package/src/router.ts +44 -0
- package/src/schema.ts +34 -54
- package/src/service-notification-policy.test.ts +28 -71
- package/src/service.ts +89 -0
- package/src/state-transitions.test.ts +126 -0
- package/src/state-transitions.ts +112 -0
- package/tsconfig.json +9 -0
- package/src/auto-incident-close-job.ts +0 -164
- package/src/auto-incident.test.ts +0 -196
- package/src/auto-incident.ts +0 -332
|
@@ -0,0 +1,698 @@
|
|
|
1
|
+
import { describe, it, expect } from "bun:test";
|
|
2
|
+
import type {
|
|
3
|
+
EntityChanged,
|
|
4
|
+
EntityHandle,
|
|
5
|
+
MutateInput,
|
|
6
|
+
} from "@checkstack/automation-backend";
|
|
7
|
+
import { SYSTEM_ACTOR } from "@checkstack/common";
|
|
8
|
+
|
|
9
|
+
import {
|
|
10
|
+
HEALTH_ENTITY_KIND,
|
|
11
|
+
HEALTH_TRIGGER_EVENTS,
|
|
12
|
+
HealthEntityStateSchema,
|
|
13
|
+
classifyHealthChange,
|
|
14
|
+
computeHealthEntityState,
|
|
15
|
+
createHealthEntityRead,
|
|
16
|
+
createHealthEntitySerializer,
|
|
17
|
+
deriveHealthTriggerEvents,
|
|
18
|
+
healthChangeToPayload,
|
|
19
|
+
writeHealthEntity,
|
|
20
|
+
type HealthEntityState,
|
|
21
|
+
} from "./health-entity";
|
|
22
|
+
import type { HealthCheckService } from "./service";
|
|
23
|
+
import {
|
|
24
|
+
systemDegradedTrigger,
|
|
25
|
+
systemHealthyTrigger,
|
|
26
|
+
systemHealthChangedTrigger,
|
|
27
|
+
} from "./automations";
|
|
28
|
+
|
|
29
|
+
// Plugin id used by the tests below to rebuild each trigger's qualifiedId
// (`${pluginId}.${trigger.id}`) for comparison against HEALTH_TRIGGER_EVENTS.
const HEALTHCHECK_PLUGIN_ID = "healthcheck";
|
|
30
|
+
|
|
31
|
+
/**
 * Builds an `EntityChanged` fixture for the health entity of system "sys-1".
 *
 * The baseline describes a healthy → unhealthy transition (2/2 healthy checks
 * dropping to 0/2). Any field present in `overrides` replaces the matching
 * baseline field.
 */
function change(overrides: Partial<EntityChanged> = {}): EntityChanged {
  const baseline: EntityChanged = {
    kind: HEALTH_ENTITY_KIND,
    id: "sys-1",
    prev: { status: "healthy", healthyChecks: 2, totalChecks: 2 },
    next: { status: "unhealthy", healthyChecks: 0, totalChecks: 2 },
    delta: { status: "unhealthy", healthyChecks: 0 },
    changedFields: ["status", "healthyChecks"],
    actor: SYSTEM_ACTOR,
    occurredAt: new Date().toISOString(),
  };
  return { ...baseline, ...overrides };
}
|
|
44
|
+
|
|
45
|
+
describe("HEALTH_TRIGGER_EVENTS (must equal the trigger qualifiedIds)", () => {
  it("emits the underscore trigger qualifiedIds, not the dotted hook ids", () => {
    // Stage-1 routing matches `t.event === trigger.qualifiedId`, i.e.
    // `${pluginId}.${trigger.id}`. The healthcheck trigger ids are
    // `system_degraded` / `system_healthy` / `system_health_changed`, so the
    // constants must carry exactly those — never the dotted hook ids.
    const { degraded, healthy, healthChanged } = HEALTH_TRIGGER_EVENTS;
    expect(degraded).toBe("healthcheck.system_degraded");
    expect(healthy).toBe("healthcheck.system_healthy");
    expect(healthChanged).toBe("healthcheck.system_health_changed");
  });

  it("matches the registered trigger qualifiedIds exactly", () => {
    // Rebuild each qualifiedId as a plain string (the constants themselves are
    // narrow literal types) and compare it with the matching constant.
    const qualifiedIdOf = (trigger: { id: string }): string =>
      `${HEALTHCHECK_PLUGIN_ID}.${trigger.id}`;

    expect(qualifiedIdOf(systemDegradedTrigger)).toBe(
      HEALTH_TRIGGER_EVENTS.degraded,
    );
    expect(qualifiedIdOf(systemHealthyTrigger)).toBe(
      HEALTH_TRIGGER_EVENTS.healthy,
    );
    expect(qualifiedIdOf(systemHealthChangedTrigger)).toBe(
      HEALTH_TRIGGER_EVENTS.healthChanged,
    );
  });
});
|
|
71
|
+
|
|
72
|
+
describe("deriveHealthTriggerEvents", () => {
  // Reusable entity snapshots for a system with two checks.
  const allHealthy: HealthEntityState = {
    status: "healthy",
    healthyChecks: 2,
    totalChecks: 2,
  };
  const partlyDegraded: HealthEntityState = {
    status: "degraded",
    healthyChecks: 1,
    totalChecks: 2,
  };
  const allUnhealthy: HealthEntityState = {
    status: "unhealthy",
    healthyChecks: 0,
    totalChecks: 2,
  };

  it("maps a healthy → unhealthy transition to degraded + umbrella", () => {
    // The default fixture already is a healthy → unhealthy change.
    const derived = deriveHealthTriggerEvents(change());
    expect(derived).toEqual([
      HEALTH_TRIGGER_EVENTS.degraded,
      HEALTH_TRIGGER_EVENTS.healthChanged,
    ]);
  });

  it("maps a degraded → healthy recovery to healthy + umbrella", () => {
    const recovery = change({ prev: partlyDegraded, next: allHealthy });
    expect(deriveHealthTriggerEvents(recovery)).toEqual([
      HEALTH_TRIGGER_EVENTS.healthy,
      HEALTH_TRIGGER_EVENTS.healthChanged,
    ]);
  });

  it("maps a degraded → unhealthy transition to umbrella only (no directional)", () => {
    const worsening = change({ prev: partlyDegraded, next: allUnhealthy });
    // Neither side of the transition is "healthy", so no directional event.
    expect(deriveHealthTriggerEvents(worsening)).toEqual([
      HEALTH_TRIGGER_EVENTS.healthChanged,
    ]);
  });

  it("fires nothing on create (prev === null)", () => {
    const created = change({ prev: null });
    expect(deriveHealthTriggerEvents(created)).toEqual([]);
  });

  it("fires nothing on tombstone (next === null)", () => {
    const removed = change({ next: null });
    expect(deriveHealthTriggerEvents(removed)).toEqual([]);
  });

  it("fires nothing when only non-status fields changed", () => {
    // Status stays "healthy"; only the healthy-check count moves.
    const countOnly = change({
      prev: allHealthy,
      next: { ...allHealthy, healthyChecks: 1 },
    });
    expect(deriveHealthTriggerEvents(countOnly)).toEqual([]);
  });
});
|
|
124
|
+
|
|
125
|
+
describe("healthChangeToPayload — payloadSchema parity", () => {
  it("a degradation payload validates against the degraded trigger's payloadSchema", () => {
    // Default fixture: healthy (2/2) → unhealthy (0/2) for "sys-1".
    const {
      systemId,
      previousStatus,
      newStatus,
      healthyChecks,
      totalChecks,
      timestamp,
    } = systemDegradedTrigger.payloadSchema.parse(
      healthChangeToPayload(change()),
    );

    expect(systemId).toBe("sys-1");
    expect(previousStatus).toBe("healthy");
    expect(newStatus).toBe("unhealthy");
    expect(healthyChecks).toBe(0);
    expect(totalChecks).toBe(2);
    expect(typeof timestamp).toBe("string");
  });

  it("a recovery payload validates against the healthy trigger's payloadSchema", () => {
    const recovery = change({
      prev: { status: "unhealthy", healthyChecks: 0, totalChecks: 2 },
      next: { status: "healthy", healthyChecks: 2, totalChecks: 2 },
    });
    const { systemId, previousStatus, healthyChecks, totalChecks } =
      systemHealthyTrigger.payloadSchema.parse(healthChangeToPayload(recovery));

    expect(systemId).toBe("sys-1");
    expect(previousStatus).toBe("unhealthy");
    expect(healthyChecks).toBe(2);
    expect(totalChecks).toBe(2);
  });

  it("any transition payload validates against the health_changed trigger's payloadSchema", () => {
    const { systemId, previousStatus, newStatus } =
      systemHealthChangedTrigger.payloadSchema.parse(
        healthChangeToPayload(change()),
      );

    expect(systemId).toBe("sys-1");
    expect(previousStatus).toBe("healthy");
    expect(newStatus).toBe("unhealthy");
  });
});
|
|
159
|
+
|
|
160
|
+
describe("classifyHealthChange (cross-plugin consumer predicate)", () => {
  it("flags degraded on healthy → unhealthy (the old systemDegraded condition)", () => {
    // Default fixture: "sys-1" going healthy → unhealthy.
    const c = classifyHealthChange(change());
    expect(c).toEqual({
      systemId: "sys-1",
      previousStatus: "healthy",
      newStatus: "unhealthy",
      degraded: true,
      recovered: false,
    });
  });

  it("flags recovered on degraded → healthy (the old systemHealthy condition)", () => {
    const c = classifyHealthChange(
      change({
        prev: { status: "degraded", healthyChecks: 1, totalChecks: 2 },
        next: { status: "healthy", healthyChecks: 2, totalChecks: 2 },
      }),
    );
    expect(c.recovered).toBe(true);
    expect(c.degraded).toBe(false);
  });

  it("flags neither on a non-healthy ↔ non-healthy transition", () => {
    const c = classifyHealthChange(
      change({
        prev: { status: "degraded", healthyChecks: 1, totalChecks: 2 },
        next: { status: "unhealthy", healthyChecks: 0, totalChecks: 2 },
      }),
    );
    expect(c.degraded).toBe(false);
    expect(c.recovered).toBe(false);
  });

  it("flags neither on create / tombstone", () => {
    // "Neither" means BOTH flags are false in BOTH cases, so assert all four.
    // The tombstone `degraded` check is the one that matters most: prev is
    // "healthy" and next is absent, so a predicate that only tests
    // "next is not healthy" would wrongly report a degradation.
    const created = classifyHealthChange(change({ prev: null }));
    expect(created.degraded).toBe(false);
    expect(created.recovered).toBe(false);

    const tombstoned = classifyHealthChange(change({ next: null }));
    expect(tombstoned.degraded).toBe(false);
    expect(tombstoned.recovered).toBe(false);
  });
});
|
|
199
|
+
|
|
200
|
+
describe("HealthEntityStateSchema", () => {
  it("accepts the reactive subset", () => {
    // Only the three reactive fields are supplied; parsing must succeed and
    // hand the status back unchanged.
    const candidate = { status: "degraded", healthyChecks: 1, totalChecks: 3 };
    const { status } = HealthEntityStateSchema.parse(candidate);
    expect(status).toBe("degraded");
  });
});
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
// ──────────────────────────────────────────────────────────────────────────
|
|
213
|
+
// COMPUTE-ON-READ: the `read` accessor derives the view from durable data.
|
|
214
|
+
// ──────────────────────────────────────────────────────────────────────────
|
|
215
|
+
|
|
216
|
+
// Status value type of the health entity state, reused by the fakes below for
// both the aggregate system status and the canned per-check statuses.
type CheckStatus = HealthEntityState["status"];
|
|
217
|
+
|
|
218
|
+
/**
 * Test double for `HealthCheckService` exposing only `getSystemHealthStatus`,
 * answered from the canned `statusBySystem` table.
 *
 * A system missing from the table reports status `healthy` with an empty
 * `checkStatuses` list. Each canned check is expanded into a full entry with
 * a synthetic `cfg-<index>` id, a `Check <index>` name and `runsConsidered: 1`.
 *
 * @param statusBySystem canned aggregate status and per-check statuses, keyed
 *   by system id
 * @returns the fake, cast to `HealthCheckService` (all other members absent)
 */
function fakeService(
  statusBySystem: Record<
    string,
    { status: CheckStatus; checkStatuses: Array<{ status: CheckStatus }> }
  >,
): HealthCheckService {
  const getSystemHealthStatus = async (systemId: string) => {
    const canned = statusBySystem[systemId];
    const cannedChecks = canned?.checkStatuses ?? [];
    const aggregate: CheckStatus = canned?.status ?? "healthy";

    return {
      status: aggregate,
      evaluatedAt: new Date(),
      checkStatuses: cannedChecks.map((check, index) => ({
        configurationId: `cfg-${index}`,
        configurationName: `Check ${index}`,
        status: check.status,
        runsConsidered: 1,
      })),
    };
  };

  return { getSystemHealthStatus } as unknown as HealthCheckService;
}
|
|
247
|
+
|
|
248
|
+
describe("computeHealthEntityState (compute-on-read from durable data)", () => {
  const SYSTEM_ID = "sys-1";

  /**
   * Computes the entity view of `SYSTEM_ID` against a fake service that
   * reports `aggregate` as the system status and one check per `checks` entry.
   */
  const viewFor = (aggregate: CheckStatus, checks: CheckStatus[]) =>
    computeHealthEntityState({
      service: fakeService({
        [SYSTEM_ID]: {
          status: aggregate,
          checkStatuses: checks.map((status) => ({ status })),
        },
      }),
      systemId: SYSTEM_ID,
    });

  it("omits a system with NO enabled check associations (existence gate)", async () => {
    // An empty `checkStatuses` list means no enabled associations, hence no
    // `health` entity for the system.
    expect(await viewFor("healthy", [])).toBeUndefined();
  });

  it("resolves the default-`healthy` baseline for a system with an enabled check but no runs yet", async () => {
    // With checks present the view is the healthy baseline rather than
    // undefined, so a first-ever unhealthy run shows up as a genuine
    // healthy → degraded diff (Defect 1 fix).
    const view = await viewFor("healthy", ["healthy", "healthy"]);
    expect(view).toEqual({
      status: "healthy",
      healthyChecks: 2,
      totalChecks: 2,
    });
  });

  it("derives { status, healthyChecks, totalChecks } from the worst-wins aggregate", async () => {
    const view = await viewFor("degraded", ["healthy", "degraded", "healthy"]);
    // status mirrors the aggregate; healthyChecks counts the "healthy"
    // entries; totalChecks is the number of checks reported.
    expect(view).toEqual({
      status: "degraded",
      healthyChecks: 2,
      totalChecks: 3,
    });
  });
});
|
|
298
|
+
|
|
299
|
+
describe("createHealthEntityRead (batched, omits checkless systems)", () => {
  it("returns a map keyed by systemId, omitting systems with no enabled checks", async () => {
    const service = fakeService({
      "sys-a": { status: "healthy", checkStatuses: [{ status: "healthy" }] },
      // sys-b has NO enabled associations ⇒ omitted from the read.
      "sys-b": { status: "healthy", checkStatuses: [] },
      "sys-c": {
        status: "unhealthy",
        checkStatuses: [{ status: "healthy" }, { status: "unhealthy" }],
      },
    });
    const read = createHealthEntityRead({ service });
    const out = await read(["sys-a", "sys-b", "sys-c"]);
    expect(out).toEqual({
      "sys-a": { status: "healthy", healthyChecks: 1, totalChecks: 1 },
      "sys-c": { status: "unhealthy", healthyChecks: 1, totalChecks: 2 },
    });
    expect(out["sys-b"]).toBeUndefined();
  });

  it("returns {} for an empty id list without touching the backing", async () => {
    // Use a backing that counts calls and fails loudly, so the "without
    // touching the backing" half of the claim is actually verified. A plain
    // fake would give `{}` for an empty list whether or not it was queried.
    let backingCalls = 0;
    const service = {
      getSystemHealthStatus: async () => {
        backingCalls += 1;
        throw new Error("backing must not be queried for an empty id list");
      },
    } as unknown as HealthCheckService;

    const read = createHealthEntityRead({ service });
    expect(await read([])).toEqual({});
    expect(backingCalls).toBe(0);
  });
});
|
|
324
|
+
|
|
325
|
+
describe("writeHealthEntity (durable write driven through handle.mutate)", () => {
  /** One recorded emission: the pre-write snapshot and the post-write state. */
  type Emission = {
    prev: HealthEntityState | undefined;
    next: HealthEntityState;
  };

  /**
   * Fake entity handle mimicking the observable order of the Model B
   * pipeline: take the `prev` snapshot through `read`, run `apply` (the REAL
   * durable write), then report the (prev, next) pair via `onEmit` when the
   * status differs or there was no previous state. With `failAfterApply` set,
   * it throws "emit failed" once `apply` has completed.
   */
  function fakeHandle(args: {
    read: () => Promise<HealthEntityState | undefined>;
    onEmit?: (change: Emission) => void;
    failAfterApply?: boolean;
  }): EntityHandle<HealthEntityState> {
    const mutate = async (input: MutateInput<HealthEntityState>) => {
      const before = await args.read(); // snapshot taken BEFORE apply
      const after = await input.apply(); // the REAL durable write
      if (args.failAfterApply) {
        throw new Error("emit failed");
      }
      // A status difference is a sufficient "real change" signal here.
      const changed = !before || before.status !== after.status;
      if (changed) {
        args.onEmit?.({ prev: before, next: after });
      }
      return after;
    };

    return {
      kind: HEALTH_ENTITY_KIND,
      mutate,
    } as unknown as EntityHandle<HealthEntityState>;
  }

  it("snapshots prev BEFORE apply runs the durable write (one correct change)", async () => {
    // `read` exposes whatever is persisted at call time and `apply` flips it,
    // so a correct pipeline must report prev=healthy → next=unhealthy.
    let persisted: HealthEntityState = {
      status: "healthy",
      healthyChecks: 2,
      totalChecks: 2,
    };
    const emissions: Emission[] = [];
    const handle = fakeHandle({
      read: async () => persisted,
      onEmit: (emission) => {
        emissions.push(emission);
      },
    });

    const written = await writeHealthEntity({
      handle,
      systemId: "sys-1",
      apply: async () => {
        persisted = { status: "unhealthy", healthyChecks: 0, totalChecks: 2 };
        return persisted;
      },
    });

    expect(written).toEqual({
      status: "unhealthy",
      healthyChecks: 0,
      totalChecks: 2,
    });
    // A single change carrying the pre-write prev and the post-write next.
    expect(emissions).toHaveLength(1);
    const [only] = emissions;
    expect(only.prev?.status).toBe("healthy");
    expect(only.next.status).toBe("unhealthy");
  });

  it("runs the durable write even when no handle is bound", async () => {
    let applyRan = false;
    const written = await writeHealthEntity({
      handle: undefined,
      systemId: "sys-1",
      apply: async () => {
        applyRan = true;
        return { status: "healthy", healthyChecks: 1, totalChecks: 1 };
      },
    });

    expect(applyRan).toBe(true);
    expect(written.status).toBe("healthy");
  });

  it("routes a post-commit framework failure to onError (fail-soft)", async () => {
    let reported: unknown;
    const handle = fakeHandle({
      read: async () => ({
        status: "healthy",
        healthyChecks: 1,
        totalChecks: 1,
      }),
      failAfterApply: true,
    });

    // `apply` commits first and only then does the handle throw; the failure
    // must surface through `onError` instead of being rethrown.
    const written = await writeHealthEntity({
      handle,
      systemId: "sys-1",
      apply: async () => ({
        status: "unhealthy",
        healthyChecks: 0,
        totalChecks: 1,
      }),
      onError: (error) => {
        reported = error;
      },
    });

    expect((reported as Error).message).toBe("emit failed");
    // Fail-soft: the committed state is still handed back to the caller.
    expect(written.status).toBe("unhealthy");
  });

  it("rethrows when the durable write itself fails (executor fallback runs)", async () => {
    const handle = fakeHandle({
      read: async () => ({
        status: "healthy",
        healthyChecks: 1,
        totalChecks: 1,
      }),
    });
    let onErrorInvoked = false;

    const attempt = writeHealthEntity({
      handle,
      systemId: "sys-1",
      apply: async () => {
        throw new Error("insert failed");
      },
      onError: () => {
        onErrorInvoked = true;
      },
    });

    await expect(attempt).rejects.toThrow("insert failed");
    // The durable-write failure propagates; `onError` must not absorb it.
    expect(onErrorInvoked).toBe(false);
  });
});
|
|
458
|
+
|
|
459
|
+
// ──────────────────────────────────────────────────────────────────────────
|
|
460
|
+
// DEFECT 1 (first-run degradation): a system whose very first run comes up
|
|
461
|
+
// unhealthy must produce a real healthy → degraded diff so `system_degraded` /
|
|
462
|
+
// `health_changed` fire and the `degraded` `onEntityChanged` opens SLO /
|
|
463
|
+
// dependency downtime — NOT a suppressed create (prev === null).
|
|
464
|
+
// ──────────────────────────────────────────────────────────────────────────
|
|
465
|
+
|
|
466
|
+
describe("first-run-unhealthy degradation (Defect 1 regression)", () => {
  /**
   * The handle takes its `prev` snapshot through `computeHealthEntityState`
   * against a service backed by a mutable "first run recorded" flag. Before
   * `apply` the single check reports healthy with zero runs considered;
   * `apply` sets the flag, after which the same service reports unhealthy.
   * The resulting (prev, next) pair is then fed to the trigger deriver and to
   * the consumer classification.
   */
  it("fires system_degraded + umbrella and a `degraded` onEntityChanged on the first-ever unhealthy run", async () => {
    const SYSTEM_ID = "sys-1";

    // Backing flag for the fake service: false = no runs yet (healthy),
    // true = the first (unhealthy) run has been recorded by `apply`.
    let firstRunRecorded = false;
    const service = {
      getSystemHealthStatus: async () => {
        const current: CheckStatus = firstRunRecorded ? "unhealthy" : "healthy";
        return {
          status: current,
          evaluatedAt: new Date(),
          // A single check is always listed, even before any run exists, so
          // the entity resolves to a state instead of being omitted.
          checkStatuses: [
            {
              configurationId: "cfg-0",
              configurationName: "Check 0",
              status: current,
              runsConsidered: firstRunRecorded ? 1 : 0,
            },
          ],
        };
      },
    } as unknown as HealthCheckService;

    const readView = () =>
      computeHealthEntityState({ service, systemId: SYSTEM_ID });

    const emissions: Array<{
      prev: HealthEntityState | undefined;
      next: HealthEntityState;
    }> = [];

    // Model B handle: snapshot prev with the real compute-on-read accessor
    // BEFORE apply, run apply, then record the pair when the status changed.
    const handle = {
      kind: HEALTH_ENTITY_KIND,
      mutate: async (input: MutateInput<HealthEntityState>) => {
        const before = await readView();
        const after = await input.apply();
        if (!before || before.status !== after.status) {
          emissions.push({ prev: before, next: after });
        }
        return after;
      },
    } as unknown as EntityHandle<HealthEntityState>;

    const written = await writeHealthEntity({
      handle,
      systemId: SYSTEM_ID,
      apply: async () => {
        // The durable first (unhealthy) run lands here.
        firstRunRecorded = true;
        const view = await readView();
        if (!view) throw new Error("expected a computed view");
        return view;
      },
    });

    // One emission, and it is a genuine healthy → unhealthy transition rather
    // than a create.
    expect(emissions).toHaveLength(1);
    const [transition] = emissions;
    expect(transition.prev).toEqual({
      status: "healthy",
      healthyChecks: 1,
      totalChecks: 1,
    });
    expect(transition.next.status).toBe("unhealthy");
    expect(written.status).toBe("unhealthy");

    // The deriver yields the directional event followed by the umbrella event.
    const derived = deriveHealthTriggerEvents({
      kind: HEALTH_ENTITY_KIND,
      id: SYSTEM_ID,
      prev: transition.prev ?? null,
      next: transition.next,
      delta: {},
      changedFields: ["status"],
      actor: SYSTEM_ACTOR,
      occurredAt: new Date().toISOString(),
    });
    expect(derived).toEqual([
      HEALTH_TRIGGER_EVENTS.degraded,
      HEALTH_TRIGGER_EVENTS.healthChanged,
    ]);

    // The cross-plugin consumer predicate classifies the change as `degraded`
    // (the signal that opens SLO / dependency downtime).
    const { degraded, recovered } = classifyHealthChange({
      id: SYSTEM_ID,
      prev: transition.prev ?? null,
      next: transition.next,
    });
    expect(degraded).toBe(true);
    expect(recovered).toBe(false);
  });
});
|
|
580
|
+
|
|
581
|
+
// ──────────────────────────────────────────────────────────────────────────
|
|
582
|
+
// DEFECT 2 (concurrent N-pod evaluation): two concurrent `writeHealthEntity`
|
|
583
|
+
// for ONE system must serialize through prev-snapshot → emit, producing
|
|
584
|
+
// exactly ONE transition + ONE emit (not two).
|
|
585
|
+
// ──────────────────────────────────────────────────────────────────────────
|
|
586
|
+
|
|
587
|
+
describe("per-system serialization (Defect 2 regression)", () => {
  /**
   * In-memory stand-in for the mutual exclusion of `withXactLock`: one promise
   * chain per key. Work submitted under the same key starts only after the
   * previously submitted work for that key has settled (fulfilled or
   * rejected); the caller receives the promise of its own work.
   */
  function makeKeyedSerializer() {
    const tails = new Map<string, Promise<unknown>>();
    const swallow = () => undefined;

    return (key: string) =>
      <T>(fn: () => Promise<T>): Promise<T> => {
        const tail = tails.get(key) ?? Promise.resolve();
        // Run `fn` once the previous holder settles, whatever its outcome.
        const run = tail.then(fn, fn);
        // Store a never-rejecting tail so a failure does not poison the queue.
        tails.set(key, run.then(swallow, swallow));
        return run;
      };
  }

  it("two concurrent evals of one system emit exactly ONE transition", async () => {
    // Durable state shared by both evaluations. The first write flips it to
    // unhealthy; the second then finds it already unhealthy, i.e. a no-op.
    let unhealthy = false;
    const currentView = (): HealthEntityState =>
      unhealthy
        ? { status: "unhealthy", healthyChecks: 0, totalChecks: 1 }
        : { status: "healthy", healthyChecks: 1, totalChecks: 1 };

    const emissions: Array<{
      prev: HealthEntityState | undefined;
      next: HealthEntityState;
    }> = [];

    // Model B handle: snapshot prev BEFORE apply, run apply, record the pair
    // on a status change. Unserialized, both callers would snapshot
    // prev=healthy and both would emit.
    const handle = {
      kind: HEALTH_ENTITY_KIND,
      mutate: async (input: MutateInput<HealthEntityState>) => {
        const before = currentView(); // snapshot BEFORE apply
        // Give up the microtask so an unserialized second caller would get
        // the chance to interleave at this exact point.
        await Promise.resolve();
        const after = await input.apply();
        if (before.status !== after.status) {
          emissions.push({ prev: before, next: after });
        }
        return after;
      },
    } as unknown as EntityHandle<HealthEntityState>;

    const serialize = makeKeyedSerializer()(`health:sys-1`);

    const evaluate = () =>
      writeHealthEntity({
        handle,
        systemId: "sys-1",
        serialize,
        apply: async () => {
          // Durable "insert failing run": the first writer flips the state.
          unhealthy = true;
          return currentView();
        },
      });

    // Launch both evaluations of the SAME system at once.
    await Promise.all([evaluate(), evaluate()]);

    // A single logical healthy → unhealthy transition, not two.
    expect(emissions).toHaveLength(1);
    const [transition] = emissions;
    expect(transition.prev?.status).toBe("healthy");
    expect(transition.next.status).toBe("unhealthy");
  });

  it("createHealthEntitySerializer keys the advisory lock `health:<systemId>` and runs work in a transaction", async () => {
    // The fake `db.transaction` invokes the callback inline with a `tx` whose
    // `execute` records every plain-string chunk of the query it receives.
    // The namespaced lock key is expected to be among those chunks.
    const recordedChunks: string[] = [];
    let transactionRan = false;

    type FakeTx = { execute: (q: unknown) => Promise<void> };

    const fakeDb = {
      transaction: async (cb: (tx: FakeTx) => Promise<unknown>) => {
        transactionRan = true;
        const tx: FakeTx = {
          execute: async (q) => {
            // The bound key is a plain string chunk in the drizzle template.
            const chunks = (q as { queryChunks?: unknown[] }).queryChunks ?? [];
            recordedChunks.push(
              ...chunks.filter(
                (chunk): chunk is string => typeof chunk === "string",
              ),
            );
          },
        };
        return cb(tx);
      },
    } as unknown as Parameters<typeof createHealthEntitySerializer>[0]["db"];

    const serializer = createHealthEntitySerializer({ db: fakeDb });
    const outcome = await serializer("sys-42")(async () => "ok");

    expect(outcome).toBe("ok");
    expect(transactionRan).toBe(true);
    // The lock was requested with the per-system namespaced key.
    expect(recordedChunks).toContain("health:sys-42");
  });
});
|