@checkstack/healthcheck-backend 1.2.0 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. package/CHANGELOG.md +541 -0
  2. package/drizzle/0015_quiet_meggan.sql +12 -0
  3. package/drizzle/0016_complex_maginty.sql +1 -0
  4. package/drizzle/0017_pretty_caretaker.sql +1 -0
  5. package/drizzle/meta/0015_snapshot.json +764 -0
  6. package/drizzle/meta/0016_snapshot.json +644 -0
  7. package/drizzle/meta/0017_snapshot.json +563 -0
  8. package/drizzle/meta/_journal.json +21 -0
  9. package/package.json +24 -21
  10. package/src/automations.test.ts +234 -0
  11. package/src/automations.ts +342 -0
  12. package/src/collector-script-test.test.ts +236 -0
  13. package/src/collector-script-test.ts +221 -0
  14. package/src/health-entity.test.ts +698 -0
  15. package/src/health-entity.ts +369 -0
  16. package/src/health-state.test.ts +115 -0
  17. package/src/health-state.ts +333 -0
  18. package/src/healthcheck-gitops-kinds.test.ts +6 -32
  19. package/src/healthcheck-gitops-kinds.ts +4 -19
  20. package/src/hooks.test.ts +19 -6
  21. package/src/hooks.ts +38 -28
  22. package/src/index.ts +150 -98
  23. package/src/queue-executor.test.ts +137 -0
  24. package/src/queue-executor.ts +282 -380
  25. package/src/retention-job.ts +65 -1
  26. package/src/retention-state-transitions.test.ts +49 -0
  27. package/src/router.test.ts +18 -0
  28. package/src/router.ts +56 -1
  29. package/src/schema.ts +34 -54
  30. package/src/service-assignments.test.ts +184 -0
  31. package/src/service-notification-policy.test.ts +28 -71
  32. package/src/service.ts +154 -0
  33. package/src/state-transitions.test.ts +126 -0
  34. package/src/state-transitions.ts +112 -0
  35. package/tsconfig.json +12 -3
  36. package/src/auto-incident-close-job.ts +0 -164
  37. package/src/auto-incident.test.ts +0 -196
  38. package/src/auto-incident.ts +0 -332
@@ -0,0 +1,698 @@
1
+ import { describe, it, expect } from "bun:test";
2
+ import type {
3
+ EntityChanged,
4
+ EntityHandle,
5
+ MutateInput,
6
+ } from "@checkstack/automation-backend";
7
+ import { SYSTEM_ACTOR } from "@checkstack/common";
8
+
9
+ import {
10
+ HEALTH_ENTITY_KIND,
11
+ HEALTH_TRIGGER_EVENTS,
12
+ HealthEntityStateSchema,
13
+ classifyHealthChange,
14
+ computeHealthEntityState,
15
+ createHealthEntityRead,
16
+ createHealthEntitySerializer,
17
+ deriveHealthTriggerEvents,
18
+ healthChangeToPayload,
19
+ writeHealthEntity,
20
+ type HealthEntityState,
21
+ } from "./health-entity";
22
+ import type { HealthCheckService } from "./service";
23
+ import {
24
+ systemDegradedTrigger,
25
+ systemHealthyTrigger,
26
+ systemHealthChangedTrigger,
27
+ } from "./automations";
28
+
29
+ const HEALTHCHECK_PLUGIN_ID = "healthcheck";
30
+
31
+ function change(overrides: Partial<EntityChanged> = {}): EntityChanged {
32
+ return {
33
+ kind: HEALTH_ENTITY_KIND,
34
+ id: "sys-1",
35
+ prev: { status: "healthy", healthyChecks: 2, totalChecks: 2 },
36
+ next: { status: "unhealthy", healthyChecks: 0, totalChecks: 2 },
37
+ delta: { status: "unhealthy", healthyChecks: 0 },
38
+ changedFields: ["status", "healthyChecks"],
39
+ actor: SYSTEM_ACTOR,
40
+ occurredAt: new Date().toISOString(),
41
+ ...overrides,
42
+ };
43
+ }
44
+
45
+ describe("HEALTH_TRIGGER_EVENTS (must equal the trigger qualifiedIds)", () => {
46
+ it("emits the underscore trigger qualifiedIds, not the dotted hook ids", () => {
47
+ // Stage-1 routing fires automations on `t.event === trigger.qualifiedId`
48
+ // (`${pluginId}.${trigger.id}`). The healthcheck triggers have ids
49
+ // `system_degraded` / `system_healthy` / `system_health_changed`, so the
50
+ // deriver MUST emit these — NOT the dotted hook ids.
51
+ expect(HEALTH_TRIGGER_EVENTS.degraded).toBe("healthcheck.system_degraded");
52
+ expect(HEALTH_TRIGGER_EVENTS.healthy).toBe("healthcheck.system_healthy");
53
+ expect(HEALTH_TRIGGER_EVENTS.healthChanged).toBe(
54
+ "healthcheck.system_health_changed",
55
+ );
56
+ });
57
+
58
+ it("matches the registered trigger qualifiedIds exactly", () => {
59
+ // Compare as plain strings (the constants are narrow literal types).
60
+ expect(`${HEALTHCHECK_PLUGIN_ID}.${systemDegradedTrigger.id}`).toBe(
61
+ HEALTH_TRIGGER_EVENTS.degraded,
62
+ );
63
+ expect(`${HEALTHCHECK_PLUGIN_ID}.${systemHealthyTrigger.id}`).toBe(
64
+ HEALTH_TRIGGER_EVENTS.healthy,
65
+ );
66
+ expect(`${HEALTHCHECK_PLUGIN_ID}.${systemHealthChangedTrigger.id}`).toBe(
67
+ HEALTH_TRIGGER_EVENTS.healthChanged,
68
+ );
69
+ });
70
+ });
71
+
72
+ describe("deriveHealthTriggerEvents", () => {
73
+ it("maps a healthy → unhealthy transition to degraded + umbrella", () => {
74
+ const events = deriveHealthTriggerEvents(change());
75
+ expect(events).toEqual([
76
+ HEALTH_TRIGGER_EVENTS.degraded,
77
+ HEALTH_TRIGGER_EVENTS.healthChanged,
78
+ ]);
79
+ });
80
+
81
+ it("maps a degraded → healthy recovery to healthy + umbrella", () => {
82
+ const events = deriveHealthTriggerEvents(
83
+ change({
84
+ prev: { status: "degraded", healthyChecks: 1, totalChecks: 2 },
85
+ next: { status: "healthy", healthyChecks: 2, totalChecks: 2 },
86
+ }),
87
+ );
88
+ expect(events).toEqual([
89
+ HEALTH_TRIGGER_EVENTS.healthy,
90
+ HEALTH_TRIGGER_EVENTS.healthChanged,
91
+ ]);
92
+ });
93
+
94
+ it("maps a degraded → unhealthy transition to umbrella only (no directional)", () => {
95
+ const events = deriveHealthTriggerEvents(
96
+ change({
97
+ prev: { status: "degraded", healthyChecks: 1, totalChecks: 2 },
98
+ next: { status: "unhealthy", healthyChecks: 0, totalChecks: 2 },
99
+ }),
100
+ );
101
+ // Neither side is "healthy", so only the umbrella fires.
102
+ expect(events).toEqual([HEALTH_TRIGGER_EVENTS.healthChanged]);
103
+ });
104
+
105
+ it("fires nothing on create (prev === null)", () => {
106
+ expect(deriveHealthTriggerEvents(change({ prev: null }))).toEqual([]);
107
+ });
108
+
109
+ it("fires nothing on tombstone (next === null)", () => {
110
+ expect(deriveHealthTriggerEvents(change({ next: null }))).toEqual([]);
111
+ });
112
+
113
+ it("fires nothing when only non-status fields changed", () => {
114
+ expect(
115
+ deriveHealthTriggerEvents(
116
+ change({
117
+ prev: { status: "healthy", healthyChecks: 2, totalChecks: 2 },
118
+ next: { status: "healthy", healthyChecks: 1, totalChecks: 2 },
119
+ }),
120
+ ),
121
+ ).toEqual([]);
122
+ });
123
+ });
124
+
125
+ describe("healthChangeToPayload — payloadSchema parity", () => {
126
+ it("a degradation payload validates against the degraded trigger's payloadSchema", () => {
127
+ const payload = healthChangeToPayload(change());
128
+ const parsed = systemDegradedTrigger.payloadSchema.parse(payload);
129
+ expect(parsed.systemId).toBe("sys-1");
130
+ expect(parsed.previousStatus).toBe("healthy");
131
+ expect(parsed.newStatus).toBe("unhealthy");
132
+ expect(parsed.healthyChecks).toBe(0);
133
+ expect(parsed.totalChecks).toBe(2);
134
+ expect(typeof parsed.timestamp).toBe("string");
135
+ });
136
+
137
+ it("a recovery payload validates against the healthy trigger's payloadSchema", () => {
138
+ const payload = healthChangeToPayload(
139
+ change({
140
+ prev: { status: "unhealthy", healthyChecks: 0, totalChecks: 2 },
141
+ next: { status: "healthy", healthyChecks: 2, totalChecks: 2 },
142
+ }),
143
+ );
144
+ const parsed = systemHealthyTrigger.payloadSchema.parse(payload);
145
+ expect(parsed.systemId).toBe("sys-1");
146
+ expect(parsed.previousStatus).toBe("unhealthy");
147
+ expect(parsed.healthyChecks).toBe(2);
148
+ expect(parsed.totalChecks).toBe(2);
149
+ });
150
+
151
+ it("any transition payload validates against the health_changed trigger's payloadSchema", () => {
152
+ const payload = healthChangeToPayload(change());
153
+ const parsed = systemHealthChangedTrigger.payloadSchema.parse(payload);
154
+ expect(parsed.systemId).toBe("sys-1");
155
+ expect(parsed.previousStatus).toBe("healthy");
156
+ expect(parsed.newStatus).toBe("unhealthy");
157
+ });
158
+ });
159
+
160
+ describe("classifyHealthChange (cross-plugin consumer predicate)", () => {
161
+ it("flags degraded on healthy → unhealthy (the old systemDegraded condition)", () => {
162
+ const c = classifyHealthChange(change());
163
+ expect(c).toEqual({
164
+ systemId: "sys-1",
165
+ previousStatus: "healthy",
166
+ newStatus: "unhealthy",
167
+ degraded: true,
168
+ recovered: false,
169
+ });
170
+ });
171
+
172
+ it("flags recovered on degraded → healthy (the old systemHealthy condition)", () => {
173
+ const c = classifyHealthChange(
174
+ change({
175
+ prev: { status: "degraded", healthyChecks: 1, totalChecks: 2 },
176
+ next: { status: "healthy", healthyChecks: 2, totalChecks: 2 },
177
+ }),
178
+ );
179
+ expect(c.recovered).toBe(true);
180
+ expect(c.degraded).toBe(false);
181
+ });
182
+
183
+ it("flags neither on a non-healthy ↔ non-healthy transition", () => {
184
+ const c = classifyHealthChange(
185
+ change({
186
+ prev: { status: "degraded", healthyChecks: 1, totalChecks: 2 },
187
+ next: { status: "unhealthy", healthyChecks: 0, totalChecks: 2 },
188
+ }),
189
+ );
190
+ expect(c.degraded).toBe(false);
191
+ expect(c.recovered).toBe(false);
192
+ });
193
+
194
+ it("flags neither on create / tombstone", () => {
195
+ expect(classifyHealthChange(change({ prev: null }))).toMatchObject({ degraded: false, recovered: false }); // create: no prior status, so there is no transition in either direction
196
+ expect(classifyHealthChange(change({ next: null }))).toMatchObject({ degraded: false, recovered: false }); // tombstone: prev is "healthy" here, so `degraded` is the flag that could wrongly fire
197
+ });
198
+ });
199
+
200
+ describe("HealthEntityStateSchema", () => {
201
+ it("accepts the reactive subset", () => {
202
+ const parsed = HealthEntityStateSchema.parse({
203
+ status: "degraded",
204
+ healthyChecks: 1,
205
+ totalChecks: 3,
206
+ });
207
+ expect(parsed.status).toBe("degraded");
208
+ });
209
+ });
210
+
211
+
212
+ // ──────────────────────────────────────────────────────────────────────────
213
+ // COMPUTE-ON-READ: the `read` accessor derives the view from durable data.
214
+ // ──────────────────────────────────────────────────────────────────────────
215
+
216
+ type CheckStatus = HealthEntityState["status"];
217
+
218
+ /**
219
+ * Fake service whose `getSystemHealthStatus` returns canned per-system state.
220
+ * A system absent from the map gets the SAME default-`healthy` baseline the
221
+ * real `getSystemHealthStatus` returns for an empty run window — and an empty
222
+ * `checkStatuses` for a system with no enabled associations (the existence
223
+ * gate), exactly like production.
224
+ */
225
+ function fakeService(
226
+ statusBySystem: Record<
227
+ string,
228
+ { status: CheckStatus; checkStatuses: Array<{ status: CheckStatus }> }
229
+ >,
230
+ ): HealthCheckService {
231
+ return {
232
+ getSystemHealthStatus: async (systemId: string) => {
233
+ const found = statusBySystem[systemId];
234
+ return {
235
+ status: found?.status ?? ("healthy" as CheckStatus),
236
+ evaluatedAt: new Date(),
237
+ checkStatuses: (found?.checkStatuses ?? []).map((c, i) => ({
238
+ configurationId: `cfg-${i}`,
239
+ configurationName: `Check ${i}`,
240
+ status: c.status,
241
+ runsConsidered: 1,
242
+ })),
243
+ };
244
+ },
245
+ } as unknown as HealthCheckService;
246
+ }
247
+
248
+ describe("computeHealthEntityState (compute-on-read from durable data)", () => {
249
+ it("omits a system with NO enabled check associations (existence gate)", async () => {
250
+ // No enabled associations ⇒ `getSystemHealthStatus` returns checkStatuses:
251
+ // [] ⇒ no `health` entity for this system.
252
+ const service = fakeService({
253
+ "sys-1": { status: "healthy", checkStatuses: [] },
254
+ });
255
+ const state = await computeHealthEntityState({ service, systemId: "sys-1" });
256
+ expect(state).toBeUndefined();
257
+ });
258
+
259
+ it("resolves the default-`healthy` baseline for a system with an enabled check but no runs yet", async () => {
260
+ // A run-less system with an enabled check evaluates to the default-healthy
261
+ // baseline (the executor's pre-run state), NOT undefined — so a first-ever
262
+ // unhealthy run is a real healthy → unhealthy diff (Defect 1 fix).
263
+ const service = fakeService({
264
+ "sys-1": {
265
+ status: "healthy",
266
+ checkStatuses: [{ status: "healthy" }, { status: "healthy" }],
267
+ },
268
+ });
269
+ const state = await computeHealthEntityState({ service, systemId: "sys-1" });
270
+ expect(state).toEqual({
271
+ status: "healthy",
272
+ healthyChecks: 2,
273
+ totalChecks: 2,
274
+ });
275
+ });
276
+
277
+ it("derives { status, healthyChecks, totalChecks } from the worst-wins aggregate", async () => {
278
+ const service = fakeService({
279
+ "sys-1": {
280
+ status: "degraded",
281
+ checkStatuses: [
282
+ { status: "healthy" },
283
+ { status: "degraded" },
284
+ { status: "healthy" },
285
+ ],
286
+ },
287
+ });
288
+ const state = await computeHealthEntityState({ service, systemId: "sys-1" });
289
+ // status = worst-wins aggregate; healthyChecks = count of "healthy";
290
+ // totalChecks = number of enabled checks.
291
+ expect(state).toEqual({
292
+ status: "degraded",
293
+ healthyChecks: 2,
294
+ totalChecks: 3,
295
+ });
296
+ });
297
+ });
298
+
299
+ describe("createHealthEntityRead (batched, omits checkless systems)", () => {
300
+ it("returns a map keyed by systemId, omitting systems with no enabled checks", async () => {
301
+ const service = fakeService({
302
+ "sys-a": { status: "healthy", checkStatuses: [{ status: "healthy" }] },
303
+ // sys-b has NO enabled associations ⇒ omitted from the read.
304
+ "sys-b": { status: "healthy", checkStatuses: [] },
305
+ "sys-c": {
306
+ status: "unhealthy",
307
+ checkStatuses: [{ status: "healthy" }, { status: "unhealthy" }],
308
+ },
309
+ });
310
+ const read = createHealthEntityRead({ service });
311
+ const out = await read(["sys-a", "sys-b", "sys-c"]);
312
+ expect(out).toEqual({
313
+ "sys-a": { status: "healthy", healthyChecks: 1, totalChecks: 1 },
314
+ "sys-c": { status: "unhealthy", healthyChecks: 1, totalChecks: 2 },
315
+ });
316
+ expect(out["sys-b"]).toBeUndefined();
317
+ });
318
+
319
+ it("returns {} for an empty id list without touching the backing", async () => {
320
+ const backing = { getSystemHealthStatus: async () => { throw new Error("backing must not be touched for an empty id list"); } } as unknown as HealthCheckService; // tripwire: any lookup rejects, so the test fails if the read touches the service
321
+ expect(await createHealthEntityRead({ service: backing })([])).toEqual({});
322
+ });
323
+ });
324
+
325
+ describe("writeHealthEntity (durable write driven through handle.mutate)", () => {
326
+ /**
327
+ * Fake handle reproducing the Model B pipeline's observable timing: snapshot
328
+ * `prev` via `read`, run `apply` (the REAL durable write), diff, and emit on
329
+ * a real change. Records the (prev, next) pair so the test can assert the
330
+ * framework snapshotted prev BEFORE the durable write committed.
331
+ */
332
+ function fakeHandle(args: {
333
+ read: () => Promise<HealthEntityState | undefined>;
334
+ onEmit?: (change: {
335
+ prev: HealthEntityState | undefined;
336
+ next: HealthEntityState;
337
+ }) => void;
338
+ failAfterApply?: boolean;
339
+ }): EntityHandle<HealthEntityState> {
340
+ const { read, onEmit, failAfterApply } = args;
341
+ return {
342
+ kind: HEALTH_ENTITY_KIND,
343
+ async mutate(input: MutateInput<HealthEntityState>) {
344
+ const prev = await read(); // snapshot BEFORE apply
345
+ const next = await input.apply(); // the REAL durable write
346
+ if (failAfterApply) throw new Error("emit failed");
347
+ // Only emit on a real change (status diff suffices for the test).
348
+ if (!prev || prev.status !== next.status) onEmit?.({ prev, next });
349
+ return next;
350
+ },
351
+ } as unknown as EntityHandle<HealthEntityState>;
352
+ }
353
+
354
+ it("snapshots prev BEFORE apply runs the durable write (one correct change)", async () => {
355
+ // `read` reflects state BEFORE apply; apply flips it. The framework must
356
+ // capture the pre-write prev, so the change is prev=healthy → next=unhealthy.
357
+ let persisted: HealthEntityState = {
358
+ status: "healthy",
359
+ healthyChecks: 2,
360
+ totalChecks: 2,
361
+ };
362
+ const emitted: Array<{
363
+ prev: HealthEntityState | undefined;
364
+ next: HealthEntityState;
365
+ }> = [];
366
+ const handle = fakeHandle({
367
+ read: async () => persisted,
368
+ onEmit: (c) => emitted.push(c),
369
+ });
370
+
371
+ const next = await writeHealthEntity({
372
+ handle,
373
+ systemId: "sys-1",
374
+ apply: async () => {
375
+ persisted = { status: "unhealthy", healthyChecks: 0, totalChecks: 2 };
376
+ return persisted;
377
+ },
378
+ });
379
+
380
+ expect(next).toEqual({
381
+ status: "unhealthy",
382
+ healthyChecks: 0,
383
+ totalChecks: 2,
384
+ });
385
+ // Exactly one change, with the pre-write prev and post-write next.
386
+ expect(emitted).toHaveLength(1);
387
+ expect(emitted[0].prev?.status).toBe("healthy");
388
+ expect(emitted[0].next.status).toBe("unhealthy");
389
+ });
390
+
391
+ it("runs the durable write even when no handle is bound", async () => {
392
+ let ran = false;
393
+ const next = await writeHealthEntity({
394
+ handle: undefined,
395
+ systemId: "sys-1",
396
+ apply: async () => {
397
+ ran = true;
398
+ return { status: "healthy", healthyChecks: 1, totalChecks: 1 };
399
+ },
400
+ });
401
+ expect(ran).toBe(true);
402
+ expect(next.status).toBe("healthy");
403
+ });
404
+
405
+ it("routes a post-commit framework failure to onError (fail-soft)", async () => {
406
+ let captured: unknown;
407
+ const handle = fakeHandle({
408
+ read: async () => ({
409
+ status: "healthy",
410
+ healthyChecks: 1,
411
+ totalChecks: 1,
412
+ }),
413
+ failAfterApply: true,
414
+ });
415
+ // apply commits, THEN the fake handle throws "emit failed". Must not rethrow.
416
+ const result = await writeHealthEntity({
417
+ handle,
418
+ systemId: "sys-1",
419
+ apply: async () => ({
420
+ status: "unhealthy",
421
+ healthyChecks: 0,
422
+ totalChecks: 1,
423
+ }),
424
+ onError: (e) => {
425
+ captured = e;
426
+ },
427
+ });
428
+ expect(captured instanceof Error ? captured.message : captured).toBe("emit failed"); // narrowed, not cast: if onError never ran this fails the assertion instead of throwing a TypeError
429
+ // The committed state is still returned (fail-soft, not lost).
430
+ expect(result.status).toBe("unhealthy");
431
+ });
432
+
433
+ it("rethrows when the durable write itself fails (executor fallback runs)", async () => {
434
+ const handle = fakeHandle({
435
+ read: async () => ({
436
+ status: "healthy",
437
+ healthyChecks: 1,
438
+ totalChecks: 1,
439
+ }),
440
+ });
441
+ let onErrorCalled = false;
442
+ await expect(
443
+ writeHealthEntity({
444
+ handle,
445
+ systemId: "sys-1",
446
+ apply: async () => {
447
+ throw new Error("insert failed");
448
+ },
449
+ onError: () => {
450
+ onErrorCalled = true;
451
+ },
452
+ }),
453
+ ).rejects.toThrow("insert failed");
454
+ // A durable-write failure must propagate, NOT be swallowed by onError.
455
+ expect(onErrorCalled).toBe(false);
456
+ });
457
+ });
458
+
459
+ // ──────────────────────────────────────────────────────────────────────────
460
+ // DEFECT 1 (first-run degradation): a system whose very first run comes up
461
+ // unhealthy must produce a real healthy → unhealthy diff so `system_degraded` /
462
+ // `system_health_changed` fire and the `degraded` `onEntityChanged` opens SLO /
463
+ // dependency downtime — NOT a suppressed create (prev === null).
464
+ // ──────────────────────────────────────────────────────────────────────────
465
+
466
+ describe("first-run-unhealthy degradation (Defect 1 regression)", () => {
467
+ /**
468
+ * A handle whose `read` snapshots `prev` from `computeHealthEntityState`
469
+ * (the SAME compute-on-read accessor production uses) against a service that
470
+ * reflects the durable state. Before `apply`, the system has an enabled
471
+ * check but no runs ⇒ default-`healthy` baseline; `apply` records the first
472
+ * (unhealthy) run, so the post-write read sees unhealthy. We assert the
473
+ * framework-style change (prev → next) drives the directional + umbrella
474
+ * trigger events and the `degraded` classification.
475
+ */
476
+ it("fires system_degraded + umbrella and a `degraded` onEntityChanged on the first-ever unhealthy run", async () => {
477
+ // Durable state the fake service reads. Starts run-less (default healthy),
478
+ // flips to unhealthy when the first run is recorded by `apply`.
479
+ let firstRunRecorded = false;
480
+ const service = {
481
+ getSystemHealthStatus: async () => ({
482
+ status: firstRunRecorded
483
+ ? ("unhealthy" as CheckStatus)
484
+ : ("healthy" as CheckStatus),
485
+ evaluatedAt: new Date(),
486
+ // One ENABLED check association exists from the start (run-less but
487
+ // configured), so the entity resolves to the healthy baseline.
488
+ checkStatuses: [
489
+ {
490
+ configurationId: "cfg-0",
491
+ configurationName: "Check 0",
492
+ status: firstRunRecorded
493
+ ? ("unhealthy" as CheckStatus)
494
+ : ("healthy" as CheckStatus),
495
+ runsConsidered: firstRunRecorded ? 1 : 0,
496
+ },
497
+ ],
498
+ }),
499
+ } as unknown as HealthCheckService;
500
+
501
+ const emitted: Array<{
502
+ prev: HealthEntityState | undefined;
503
+ next: HealthEntityState;
504
+ }> = [];
505
+
506
+ // Model B handle: snapshot prev via the REAL compute-on-read accessor
507
+ // BEFORE apply, run apply, diff, emit on a real change.
508
+ const handle = {
509
+ kind: HEALTH_ENTITY_KIND,
510
+ async mutate(input: MutateInput<HealthEntityState>) {
511
+ const prev = await computeHealthEntityState({
512
+ service,
513
+ systemId: "sys-1",
514
+ });
515
+ const next = await input.apply();
516
+ if (!prev || prev.status !== next.status) onEmitChange(prev, next);
517
+ return next;
518
+ },
519
+ } as unknown as EntityHandle<HealthEntityState>;
520
+
521
+ function onEmitChange(
522
+ prev: HealthEntityState | undefined,
523
+ next: HealthEntityState,
524
+ ) {
525
+ emitted.push({ prev, next });
526
+ }
527
+
528
+ const next = await writeHealthEntity({
529
+ handle,
530
+ systemId: "sys-1",
531
+ apply: async () => {
532
+ // The durable first run lands here (unhealthy).
533
+ firstRunRecorded = true;
534
+ const computed = await computeHealthEntityState({
535
+ service,
536
+ systemId: "sys-1",
537
+ });
538
+ if (!computed) throw new Error("expected a computed view");
539
+ return computed;
540
+ },
541
+ });
542
+
543
+ // Exactly one emit, a real healthy → unhealthy transition (NOT a create).
544
+ expect(emitted).toHaveLength(1);
545
+ expect(emitted[0].prev).toEqual({
546
+ status: "healthy",
547
+ healthyChecks: 1,
548
+ totalChecks: 1,
549
+ });
550
+ expect(emitted[0].next.status).toBe("unhealthy");
551
+ expect(next.status).toBe("unhealthy");
552
+
553
+ // The deriver fires the directional + umbrella trigger events.
554
+ const events = deriveHealthTriggerEvents({
555
+ kind: HEALTH_ENTITY_KIND,
556
+ id: "sys-1",
557
+ prev: emitted[0].prev ?? null,
558
+ next: emitted[0].next,
559
+ delta: {},
560
+ changedFields: ["status"],
561
+ actor: SYSTEM_ACTOR,
562
+ occurredAt: new Date().toISOString(),
563
+ });
564
+ expect(events).toEqual([
565
+ HEALTH_TRIGGER_EVENTS.degraded,
566
+ HEALTH_TRIGGER_EVENTS.healthChanged,
567
+ ]);
568
+
569
+ // The cross-plugin consumer predicate reports `degraded` (opens SLO /
570
+ // dependency downtime).
571
+ const classified = classifyHealthChange({
572
+ id: "sys-1",
573
+ prev: emitted[0].prev ?? null,
574
+ next: emitted[0].next,
575
+ });
576
+ expect(classified.degraded).toBe(true);
577
+ expect(classified.recovered).toBe(false);
578
+ });
579
+ });
580
+
581
+ // ──────────────────────────────────────────────────────────────────────────
582
+ // DEFECT 2 (concurrent N-pod evaluation): two concurrent `writeHealthEntity`
583
+ // calls for ONE system must serialize through prev-snapshot → emit, producing
584
+ // exactly ONE transition + ONE emit (not two).
585
+ // ──────────────────────────────────────────────────────────────────────────
586
+
587
+ describe("per-system serialization (Defect 2 regression)", () => {
588
+ /**
589
+ * A faithful in-memory stand-in for `withXactLock`'s mutual exclusion: a
590
+ * per-key promise chain. Two callers with the same key run strictly one
591
+ * after another; the second cannot enter until the first resolves — exactly
592
+ * the guarantee `pg_advisory_xact_lock` provides across pods.
593
+ */
594
+ function makeKeyedSerializer() {
595
+ const chains = new Map<string, Promise<unknown>>();
596
+ return (key: string) =>
597
+ <T>(fn: () => Promise<T>): Promise<T> => {
598
+ const prior = chains.get(key) ?? Promise.resolve();
599
+ const next = prior.then(fn, fn);
600
+ chains.set(
601
+ key,
602
+ next.then(
603
+ () => undefined,
604
+ () => undefined,
605
+ ),
606
+ );
607
+ return next;
608
+ };
609
+ }
610
+
611
+ it("two concurrent evals of one system emit exactly ONE transition", async () => {
612
+ // Shared durable state both evaluations write to. The first failing run
613
+ // flips it to unhealthy; the second sees it already unhealthy ⇒ no-op.
614
+ let unhealthy = false;
615
+ const compute = (): HealthEntityState => ({
616
+ status: unhealthy ? "unhealthy" : "healthy",
617
+ healthyChecks: unhealthy ? 0 : 1,
618
+ totalChecks: 1,
619
+ });
620
+
621
+ const emitted: Array<{
622
+ prev: HealthEntityState | undefined;
623
+ next: HealthEntityState;
624
+ }> = [];
625
+
626
+ // Model B handle: snapshot prev (current durable view) BEFORE apply, run
627
+ // apply, diff, emit on a real change. With NO lock, two concurrent calls
628
+ // both snapshot prev=healthy and both emit; the lock prevents that.
629
+ const handle = {
630
+ kind: HEALTH_ENTITY_KIND,
631
+ async mutate(input: MutateInput<HealthEntityState>) {
632
+ const prev = compute(); // snapshot BEFORE apply
633
+ // Yield so the second concurrent caller could interleave here if it
634
+ // were not serialized — the lock must prevent that.
635
+ await Promise.resolve();
636
+ const next = await input.apply();
637
+ if (prev.status !== next.status) emitted.push({ prev, next });
638
+ return next;
639
+ },
640
+ } as unknown as EntityHandle<HealthEntityState>;
641
+
642
+ const keyed = makeKeyedSerializer();
643
+ const serialize = keyed(`health:sys-1`);
644
+
645
+ const evalOnce = () =>
646
+ writeHealthEntity({
647
+ handle,
648
+ systemId: "sys-1",
649
+ serialize,
650
+ apply: async () => {
651
+ // The durable "insert failing run" — first writer flips the state.
652
+ unhealthy = true;
653
+ return compute();
654
+ },
655
+ });
656
+
657
+ // Fire both concurrently for the SAME system.
658
+ await Promise.all([evalOnce(), evalOnce()]);
659
+
660
+ // Exactly one logical transition emitted (healthy → unhealthy), not two.
661
+ expect(emitted).toHaveLength(1);
662
+ expect(emitted[0].prev?.status).toBe("healthy");
663
+ expect(emitted[0].next.status).toBe("unhealthy");
664
+ });
665
+
666
+ it("createHealthEntitySerializer keys the advisory lock `health:<systemId>` and runs work in a transaction", async () => {
667
+ // Intercept `db.transaction` + the advisory-lock SQL the serializer's
668
+ // `withXactLock` issues. The fake runs `fn(tx)` inline (single connection),
669
+ // mirroring `withXactLock`'s single-session contract. We assert the
670
+ // namespaced key flows into `pg_advisory_xact_lock(...)`.
671
+ const executedKeys: string[] = [];
672
+ let transactionRan = false;
673
+ const fakeDb = {
674
+ transaction: async (
675
+ cb: (tx: { execute: (q: unknown) => Promise<void> }) => Promise<unknown>,
676
+ ) => {
677
+ transactionRan = true;
678
+ return cb({
679
+ execute: async (q) => {
680
+ // The bound key is a plain string chunk in the drizzle template.
681
+ const chunks = (q as { queryChunks?: unknown[] }).queryChunks ?? [];
682
+ for (const c of chunks) {
683
+ if (typeof c === "string") executedKeys.push(c);
684
+ }
685
+ },
686
+ });
687
+ },
688
+ } as unknown as Parameters<typeof createHealthEntitySerializer>[0]["db"];
689
+
690
+ const serializer = createHealthEntitySerializer({ db: fakeDb });
691
+ const result = await serializer("sys-42")(async () => "ok");
692
+
693
+ expect(result).toBe("ok");
694
+ expect(transactionRan).toBe(true);
695
+ // The advisory lock was acquired with the per-system namespaced key.
696
+ expect(executedKeys).toContain("health:sys-42");
697
+ });
698
+ });