@checkstack/healthcheck-backend 1.3.0 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. package/CHANGELOG.md +409 -0
  2. package/drizzle/0015_quiet_meggan.sql +12 -0
  3. package/drizzle/0016_complex_maginty.sql +1 -0
  4. package/drizzle/0017_pretty_caretaker.sql +1 -0
  5. package/drizzle/meta/0015_snapshot.json +764 -0
  6. package/drizzle/meta/0016_snapshot.json +644 -0
  7. package/drizzle/meta/0017_snapshot.json +563 -0
  8. package/drizzle/meta/_journal.json +21 -0
  9. package/package.json +24 -21
  10. package/src/automations.test.ts +6 -27
  11. package/src/automations.ts +32 -30
  12. package/src/collector-script-test.test.ts +236 -0
  13. package/src/collector-script-test.ts +221 -0
  14. package/src/health-entity.test.ts +694 -0
  15. package/src/health-entity.ts +367 -0
  16. package/src/health-state.test.ts +115 -0
  17. package/src/health-state.ts +333 -0
  18. package/src/healthcheck-gitops-kinds.test.ts +6 -32
  19. package/src/healthcheck-gitops-kinds.ts +4 -19
  20. package/src/hooks.test.ts +19 -6
  21. package/src/hooks.ts +13 -68
  22. package/src/index.ts +118 -48
  23. package/src/queue-executor.test.ts +13 -0
  24. package/src/queue-executor.ts +251 -444
  25. package/src/retention-job.ts +65 -1
  26. package/src/retention-state-transitions.test.ts +49 -0
  27. package/src/router.test.ts +13 -0
  28. package/src/router.ts +44 -0
  29. package/src/schema.ts +34 -54
  30. package/src/service-notification-policy.test.ts +28 -71
  31. package/src/service.ts +89 -0
  32. package/src/state-evaluator.test.ts +50 -5
  33. package/src/state-evaluator.ts +9 -2
  34. package/src/state-transitions.test.ts +126 -0
  35. package/src/state-transitions.ts +112 -0
  36. package/tsconfig.json +9 -0
  37. package/src/auto-incident-close-job.ts +0 -164
  38. package/src/auto-incident.test.ts +0 -196
  39. package/src/auto-incident.ts +0 -332
@@ -9,6 +9,7 @@ import {
9
9
  type ConnectedClient,
10
10
  type TransportClient,
11
11
  type CollectorRunContext,
12
+ type AdvisoryLockService,
12
13
  } from "@checkstack/backend-api";
13
14
  import { QueueManager } from "@checkstack/queue-api";
14
15
  import {
@@ -36,6 +37,8 @@ import { IncidentApi } from "@checkstack/incident-common";
36
37
  import { NotificationApi } from "@checkstack/notification-common";
37
38
  import { healthcheckSystemSubscription } from "@checkstack/healthcheck-common";
38
39
  import { resolveRoute, type InferClient, extractErrorMessage} from "@checkstack/common";
40
+ import { secretEnvMappingSchema } from "@checkstack/secrets-common";
41
+ import type { SecretResolverService } from "@checkstack/secrets-backend";
39
42
  import { HealthCheckService } from "./service";
40
43
  import { healthCheckHooks } from "./hooks";
41
44
  import { incrementHourlyAggregate } from "./realtime-aggregation";
@@ -44,17 +47,13 @@ import {
44
47
  classifyTransition,
45
48
  shouldNotifyTransition,
46
49
  } from "./notification-policy";
50
+ import { recordStateTransition } from "./state-transitions";
47
51
  import {
48
- findLastAutoIncidentClose,
49
- findUnhealthySince,
50
- hasHealthyRunSince,
51
- isMaintenanceSuppressed,
52
- isTransitionToUnhealthy,
53
- openAutoIncident,
54
- recordUnhealthyTransition,
55
- shouldOpenForFlapping,
56
- shouldOpenForSustainedUnhealthy,
57
- } from "./auto-incident";
52
+ writeHealthEntity,
53
+ createHealthEntitySerializer,
54
+ type HealthEntityState,
55
+ } from "./health-entity";
56
+ import type { EntityHandle } from "@checkstack/automation-backend";
58
57
 
59
58
  type Db = SafeDatabase<typeof schema>;
60
59
  type CatalogClient = InferClient<typeof CatalogApi>;
@@ -62,6 +61,28 @@ type MaintenanceClient = InferClient<typeof MaintenanceApi>;
62
61
  type IncidentClient = InferClient<typeof IncidentApi>;
63
62
  type NotificationClient = InferClient<typeof NotificationApi>;
64
63
 
64
+ /** Shape of the aggregated state returned by `getSystemHealthStatus`. */
65
+ type AggregatedHealth = Awaited<
66
+ ReturnType<HealthCheckService["getSystemHealthStatus"]>
67
+ >;
68
+
69
+ /**
70
+ * Derive the reactive `health` entity view from the freshly-computed
71
+ * aggregated state. Mirrors `computeHealthEntityState` exactly: `status` is the
72
+ * worst-wins aggregate, `healthyChecks` counts per-check `"healthy"` statuses,
73
+ * and `totalChecks` is the number of enabled checks. Kept here so the
74
+ * `handle.mutate` write returns the SAME view the `read` accessor would have
75
+ * computed for the post-write state (the handle thus never re-reads).
76
+ */
77
+ function toHealthEntityView(state: AggregatedHealth): HealthEntityState {
78
+ return {
79
+ status: state.status,
80
+ healthyChecks: state.checkStatuses.filter((c) => c.status === "healthy")
81
+ .length,
82
+ totalChecks: state.checkStatuses.length,
83
+ };
84
+ }
85
+
65
86
  /**
66
87
  * Emit the checkCompleted hook if available, plus the narrower
67
88
  * `checkFailed` hook when the result wasn't `healthy` (so operators
@@ -171,222 +192,12 @@ export async function scheduleHealthCheck(props: {
171
192
  });
172
193
  }
173
194
 
174
- /**
175
- * After every check run, evaluate the per-check auto-incident
176
- * triggers. Either trigger can independently open an incident:
177
- *
178
- * - **flapping**: this just-completed run was a transition to
179
- * unhealthy AND `N` such transitions have happened within the
180
- * configured window.
181
- * - **sustained**: the check is currently unhealthy AND has been so
182
- * continuously for at least the configured duration.
183
- *
184
- * Both triggers honour the require-recovery rule: after the most
185
- * recent auto-incident close (manual or auto), no new auto-incident
186
- * opens until the check has logged at least one healthy run. This
187
- * stops a manual close → still-unhealthy → re-open loop.
188
- *
189
- * Active maintenance with suppression skips both triggers when the
190
- * policy opts in.
191
- */
192
- async function maybeOpenAutoIncidentForCheck(props: {
193
- db: Db;
194
- service: HealthCheckService;
195
- incidentClient: IncidentClient;
196
- maintenanceClient: MaintenanceClient;
197
- logger: Logger;
198
- systemId: string;
199
- systemName: string;
200
- configurationId: string;
201
- configurationName: string;
202
- /**
203
- * Same closure-based getter the queue executor uses elsewhere; let
204
- * us fire the `flapping_detected` automation hook from inside the
205
- * flapping evaluator without re-threading `emitHook` through every
206
- * intermediate caller. Optional — when absent, the hook simply
207
- * doesn't fire (e.g. in unit tests that don't care about it).
208
- */
209
- getEmitHook?: () => EmitHookFn | undefined;
210
- previousState: {
211
- checkStatuses: Array<{
212
- configurationId: string;
213
- status: HealthCheckStatus;
214
- }>;
215
- };
216
- newState: {
217
- checkStatuses: Array<{
218
- configurationId: string;
219
- status: HealthCheckStatus;
220
- }>;
221
- };
222
- }): Promise<void> {
223
- const {
224
- db,
225
- service,
226
- incidentClient,
227
- maintenanceClient,
228
- logger,
229
- systemId,
230
- systemName,
231
- configurationId,
232
- configurationName,
233
- getEmitHook,
234
- previousState,
235
- newState,
236
- } = props;
237
-
238
- const next = newState.checkStatuses.find(
239
- (c) => c.configurationId === configurationId,
240
- );
241
- // Only auto-incident logic applies when the check is currently
242
- // unhealthy — both triggers require it.
243
- if (!next || next.status !== "unhealthy") return;
244
-
245
- const prev = previousState.checkStatuses.find(
246
- (c) => c.configurationId === configurationId,
247
- );
248
- const isTransition = isTransitionToUnhealthy(prev?.status, next.status);
249
-
250
- let policy;
251
- try {
252
- policy = await service.getAssignmentNotificationPolicy({
253
- systemId,
254
- configurationId,
255
- });
256
- } catch (error) {
257
- logger.warn(
258
- `Failed to load policy for auto-incident decision (${systemId}/${configurationId}):`,
259
- error,
260
- );
261
- return;
262
- }
263
-
264
- if (!policy.autoOpenIncidentOnUnhealthy) return;
265
-
266
- // Honour active maintenance windows — operators have explicitly
267
- // said the system is down on purpose.
268
- if (policy.skipDuringMaintenance) {
269
- const suppressed = await isMaintenanceSuppressed({
270
- maintenanceClient,
271
- systemId,
272
- logger,
273
- });
274
- if (suppressed) {
275
- logger.debug(
276
- `Skipping auto-incident for ${systemId}/${configurationId}: active maintenance`,
277
- );
278
- return;
279
- }
280
- }
281
-
282
- // Require-recovery: if there's a prior closed auto-incident for
283
- // this assignment, the check must have logged at least one healthy
284
- // run since the close before we can open another one. Without this,
285
- // an operator's manual close on a still-broken system would loop.
286
- const lastCloseAt = await findLastAutoIncidentClose({
287
- db,
288
- systemId,
289
- configurationId,
290
- });
291
- if (lastCloseAt) {
292
- const recovered = await hasHealthyRunSince({
293
- db,
294
- systemId,
295
- configurationId,
296
- since: lastCloseAt,
297
- });
298
- if (!recovered) {
299
- return;
300
- }
301
- }
302
-
303
- // Record the transition (if any) and evaluate the flapping trigger
304
- // against transitions that happened after the last close window.
305
- let flappingOpens = false;
306
- if (isTransition) {
307
- try {
308
- const count = await recordUnhealthyTransition({
309
- db,
310
- configurationId,
311
- systemId,
312
- windowMinutes: policy.flappingTrigger.windowMinutes,
313
- since: lastCloseAt,
314
- });
315
- flappingOpens = shouldOpenForFlapping({
316
- policy,
317
- recentTransitionCount: count,
318
- });
319
-
320
- // Fire the informational `flapping_detected` automation hook
321
- // independently of the auto-incident decision: an operator may
322
- // care about flapping even with the auto-incident pipeline
323
- // turned off.
324
- if (
325
- policy.flappingTrigger.enabled &&
326
- count >= policy.flappingTrigger.transitions
327
- ) {
328
- const emit = getEmitHook?.();
329
- if (emit) {
330
- try {
331
- await emit(healthCheckHooks.flappingDetected, {
332
- systemId,
333
- configurationId,
334
- transitionCount: count,
335
- windowMinutes: policy.flappingTrigger.windowMinutes,
336
- timestamp: new Date().toISOString(),
337
- });
338
- } catch (error) {
339
- logger.warn(
340
- `Failed to emit healthcheck.flapping_detected hook for ${systemId}/${configurationId}:`,
341
- error,
342
- );
343
- }
344
- }
345
- }
346
- } catch (error) {
347
- logger.warn(
348
- `Failed to record unhealthy transition for ${systemId}/${configurationId}:`,
349
- error,
350
- );
351
- }
352
- }
353
-
354
- // Evaluate the sustained-duration trigger on every run while the
355
- // check is unhealthy (not just on transition).
356
- let sustainedOpens = false;
357
- if (policy.sustainedUnhealthyTrigger.enabled) {
358
- const unhealthySince = await findUnhealthySince({
359
- db,
360
- configurationId,
361
- systemId,
362
- since: lastCloseAt,
363
- });
364
- if (unhealthySince) {
365
- sustainedOpens = shouldOpenForSustainedUnhealthy({
366
- policy,
367
- unhealthyForMs: Date.now() - unhealthySince.getTime(),
368
- });
369
- }
370
- }
371
-
372
- if (!flappingOpens && !sustainedOpens) return;
373
-
374
- const reason = flappingOpens
375
- ? `flapping: ≥${policy.flappingTrigger.transitions} transitions in ${policy.flappingTrigger.windowMinutes} min`
376
- : `unhealthy ≥${policy.sustainedUnhealthyTrigger.durationMinutes} min continuously`;
377
-
378
- await openAutoIncident({
379
- db,
380
- incidentClient,
381
- logger,
382
- systemId,
383
- systemName,
384
- configurationId,
385
- configurationName,
386
- policy,
387
- reason,
388
- });
389
- }
195
+ // Flapping detection no longer lives here. It moved into the automation
196
+ // engine as a windowed-count gate on the `healthcheck.system_health_changed`
197
+ // trigger (raw aggregated-health change + `filter` +
198
+ // `window: { count, minutes, refire: "once" }`). The queue executor emits only
199
+ // the raw per-system health change (via the reactive `health` entity deriver,
200
+ // unchanged); the engine does the counting.
390
201
 
391
202
  /**
392
203
  * Notify system subscribers about a health state change.
@@ -565,6 +376,7 @@ async function notifyStateChange(props: {
565
376
  async function executeHealthCheckJob(props: {
566
377
  payload: HealthCheckJobPayload;
567
378
  db: Db;
379
+ advisoryLock: AdvisoryLockService;
568
380
  registry: HealthCheckRegistry;
569
381
  collectorRegistry: CollectorRegistry;
570
382
  logger: Logger;
@@ -575,10 +387,26 @@ async function executeHealthCheckJob(props: {
575
387
  incidentClient: IncidentClient;
576
388
  getEmitHook: () => EmitHookFn | undefined;
577
389
  cache: HealthCheckCache;
390
+ /**
391
+ * Resolver for the reactive `health` entity handle (§10.3). Returns the
392
+ * handle once automation-backend has bound the entity store; `undefined`
393
+ * during version skew / tests. Mirrors the `getEmitHook` closure pattern.
394
+ * The entity is PLUGIN-BACKED + COMPUTED — there is no keyed store; the
395
+ * durable run/aggregate write IS the entity write (see `writeHealthEntity`).
396
+ */
397
+ getHealthEntity?: () => EntityHandle<HealthEntityState> | undefined;
398
+ /**
399
+ * Central secret resolver. When set, a collector declaring a `secretEnv`
400
+ * has it resolved + injected for this centrally-executed run; the
401
+ * collector masks the values out of its output. Optional for version-skew
402
+ * / test isolation.
403
+ */
404
+ secretResolver?: SecretResolverService;
578
405
  }): Promise<void> {
579
406
  const {
580
407
  payload,
581
408
  db,
409
+ advisoryLock,
582
410
  registry,
583
411
  collectorRegistry,
584
412
  logger,
@@ -588,13 +416,25 @@ async function executeHealthCheckJob(props: {
588
416
  maintenanceClient,
589
417
  incidentClient,
590
418
  getEmitHook,
419
+ getHealthEntity,
591
420
  cache,
421
+ secretResolver,
592
422
  } = props;
593
423
  const { configId, systemId } = payload;
594
424
 
595
425
  // Create service for aggregated state evaluation
596
426
  const service = new HealthCheckService(db, registry, collectorRegistry);
597
427
 
428
+ // Per-system serializer for the reactive health mutate (§10.3): a
429
+ // transaction-scoped advisory lock keyed `health:<systemId>` wraps the
430
+ // snapshot-prev + apply + diff + emit so concurrent evaluations of one
431
+ // system (multiple per-config jobs across pods, or at-least-once
432
+ // redelivery) can't double-emit a single logical transition. Bound to this
433
+ // job's systemId below at every `writeHealthEntity` call.
434
+ const serializeHealthWrite = createHealthEntitySerializer({ advisoryLock })(
435
+ systemId,
436
+ );
437
+
598
438
  // Capture aggregated state BEFORE this run for comparison
599
439
  const previousState = await service.getSystemHealthStatus(systemId);
600
440
  const previousStatus = previousState.status;
@@ -725,11 +565,31 @@ async function executeHealthCheckJob(props: {
725
565
  const storageKey = collectorEntry.id;
726
566
 
727
567
  try {
568
+ // Resolve the collector's declared secretEnv for THIS run
569
+ // (central execution). The collector injects it and masks the
570
+ // values out of its output. A missing required secret throws
571
+ // and fails the collector clearly.
572
+ let secretEnv: Record<string, string> | undefined;
573
+ const declared = secretEnvMappingSchema.safeParse(
574
+ (collectorEntry.config as { secretEnv?: unknown }).secretEnv,
575
+ );
576
+ if (
577
+ secretResolver &&
578
+ declared.success &&
579
+ Object.keys(declared.data).length > 0
580
+ ) {
581
+ const resolved = await secretResolver.resolveForRun({
582
+ secretEnv: declared.data,
583
+ });
584
+ secretEnv = resolved.env;
585
+ }
586
+
728
587
  const collectorResult = await registered.collector.execute({
729
588
  config: collectorEntry.config,
730
589
  client: connectedClient!.client,
731
590
  pluginId: configRow.strategyId,
732
591
  runContext,
592
+ ...(secretEnv ? { secretEnv } : {}),
733
593
  });
734
594
 
735
595
  // Check for collector-level error
@@ -860,26 +720,44 @@ async function executeHealthCheckJob(props: {
860
720
  },
861
721
  };
862
722
 
863
- await db.insert(healthCheckRuns).values({
864
- configurationId: configId,
723
+ // Persist the run + aggregate THROUGH the reactive `health` entity:
724
+ // `apply` does the durable write and returns the freshly-computed view.
725
+ // The framework snapshots `prev` via `read` BEFORE this insert, so a real
726
+ // status change emits exactly one correct `ENTITY_CHANGED` (§10.3). The
727
+ // computed aggregated state is stashed for the transition/notify path.
728
+ let newState!: AggregatedHealth;
729
+ await writeHealthEntity({
730
+ handle: getHealthEntity?.(),
865
731
  systemId,
866
- status: result.status,
867
- latencyMs: result.latencyMs,
868
- result: { ...result } as Record<string, unknown>,
869
- sourceId: undefined,
870
- sourceLabel: "Local",
871
- });
732
+ apply: async () => {
733
+ await db.insert(healthCheckRuns).values({
734
+ configurationId: configId,
735
+ systemId,
736
+ status: result.status,
737
+ latencyMs: result.latencyMs,
738
+ result: { ...result } as Record<string, unknown>,
739
+ sourceId: undefined,
740
+ sourceLabel: "Local",
741
+ });
872
742
 
873
- await incrementHourlyAggregate({
874
- db,
875
- systemId,
876
- configurationId: configId,
877
- status: result.status,
878
- latencyMs: result.latencyMs,
879
- runTimestamp: new Date(),
880
- result: { ...result } as Record<string, unknown>,
881
- collectorRegistry,
882
- sourceLabel: "Local",
743
+ await incrementHourlyAggregate({
744
+ db,
745
+ systemId,
746
+ configurationId: configId,
747
+ status: result.status,
748
+ latencyMs: result.latencyMs,
749
+ runTimestamp: new Date(),
750
+ result: { ...result } as Record<string, unknown>,
751
+ collectorRegistry,
752
+ sourceLabel: "Local",
753
+ });
754
+
755
+ newState = await service.getSystemHealthStatus(systemId);
756
+ return toHealthEntityView(newState);
757
+ },
758
+ serialize: serializeHealthWrite,
759
+ onError: (error) =>
760
+ logger.warn(`Failed to mirror health entity for ${systemId}`, error),
883
761
  });
884
762
 
885
763
  logger.debug(
@@ -899,8 +777,17 @@ async function executeHealthCheckJob(props: {
899
777
  latencyMs: result.latencyMs,
900
778
  });
901
779
 
902
- const newState = await service.getSystemHealthStatus(systemId);
903
780
  if (newState.status !== previousStatus) {
781
+ // Record the aggregate transition so the sensing layer has a
782
+ // reliable "in status since" for every status (Wave 2).
783
+ await recordStateTransition({
784
+ db,
785
+ systemId,
786
+ configurationId: configId,
787
+ fromStatus: previousStatus,
788
+ toStatus: newState.status,
789
+ });
790
+
904
791
  await notifyStateChange({
905
792
  notificationClient,
906
793
  systemId,
@@ -916,24 +803,6 @@ async function executeHealthCheckJob(props: {
916
803
  });
917
804
  }
918
805
 
919
- // Per-check auto-incident: runs whether or not the aggregate
920
- // changed (a check can transition to unhealthy without flipping
921
- // the aggregate if another check is already unhealthy).
922
- await maybeOpenAutoIncidentForCheck({
923
- db,
924
- service,
925
- incidentClient,
926
- maintenanceClient,
927
- logger,
928
- systemId,
929
- systemName,
930
- configurationId: configId,
931
- configurationName: configRow.configName,
932
- getEmitHook,
933
- previousState,
934
- newState,
935
- });
936
-
937
806
  return;
938
807
  } finally {
939
808
  if (connectedClient) {
@@ -962,28 +831,48 @@ async function executeHealthCheckJob(props: {
962
831
  },
963
832
  };
964
833
 
965
- // Store result (spread to convert structured type to plain record for jsonb)
966
- await db.insert(healthCheckRuns).values({
967
- configurationId: configId,
834
+ // Persist the run + aggregate THROUGH the reactive `health` entity on
835
+ // every run (§10.3): `apply` does the durable write (insert + hourly
836
+ // aggregate) and returns the freshly-computed view. The framework
837
+ // snapshots `prev` via the COMPUTE-ON-READ accessor BEFORE this insert, so
838
+ // an unchanged aggregate is a no-op and a real status change drives the
839
+ // directional/umbrella trigger events via `deriveHealthTriggerEvents` —
840
+ // exactly one correct `ENTITY_CHANGED` with accurate prev → next.
841
+ let newState!: AggregatedHealth;
842
+ await writeHealthEntity({
843
+ handle: getHealthEntity?.(),
968
844
  systemId,
969
- status: result.status,
970
- latencyMs: result.latencyMs,
971
- result: { ...result } as Record<string, unknown>,
972
- sourceId: undefined,
973
- sourceLabel: "Local",
974
- });
845
+ apply: async () => {
846
+ // Store result (spread to convert structured type to plain record for jsonb)
847
+ await db.insert(healthCheckRuns).values({
848
+ configurationId: configId,
849
+ systemId,
850
+ status: result.status,
851
+ latencyMs: result.latencyMs,
852
+ result: { ...result } as Record<string, unknown>,
853
+ sourceId: undefined,
854
+ sourceLabel: "Local",
855
+ });
975
856
 
976
- // Trigger incremental hourly aggregation
977
- await incrementHourlyAggregate({
978
- db,
979
- systemId,
980
- configurationId: configId,
981
- status: result.status,
982
- latencyMs: result.latencyMs,
983
- runTimestamp: new Date(),
984
- result: { ...result } as Record<string, unknown>,
985
- collectorRegistry,
986
- sourceLabel: "Local",
857
+ // Trigger incremental hourly aggregation
858
+ await incrementHourlyAggregate({
859
+ db,
860
+ systemId,
861
+ configurationId: configId,
862
+ status: result.status,
863
+ latencyMs: result.latencyMs,
864
+ runTimestamp: new Date(),
865
+ result: { ...result } as Record<string, unknown>,
866
+ collectorRegistry,
867
+ sourceLabel: "Local",
868
+ });
869
+
870
+ newState = await service.getSystemHealthStatus(systemId);
871
+ return toHealthEntityView(newState);
872
+ },
873
+ serialize: serializeHealthWrite,
874
+ onError: (error) =>
875
+ logger.warn(`Failed to mirror health entity for ${systemId}`, error),
987
876
  });
988
877
 
989
878
  logger.debug(
@@ -1013,9 +902,17 @@ async function executeHealthCheckJob(props: {
1013
902
  result: (result.metadata?.collectors as Record<string, unknown>) ?? undefined,
1014
903
  });
1015
904
 
1016
- // Check if aggregated state changed and notify subscribers
1017
- const newState = await service.getSystemHealthStatus(systemId);
1018
905
  if (newState.status !== previousStatus) {
906
+ // Record the aggregate transition so the sensing layer has a
907
+ // reliable "in status since" for every status (Wave 2).
908
+ await recordStateTransition({
909
+ db,
910
+ systemId,
911
+ configurationId: configId,
912
+ fromStatus: previousStatus,
913
+ toStatus: newState.status,
914
+ });
915
+
1019
916
  await notifyStateChange({
1020
917
  notificationClient,
1021
918
  systemId,
@@ -1037,77 +934,13 @@ async function executeHealthCheckJob(props: {
1037
934
  newStatus: newState.status,
1038
935
  });
1039
936
 
1040
- // Emit integration hooks for external integrations
1041
- const emitHook = getEmitHook();
1042
- if (emitHook) {
1043
- const healthyChecks = newState.checkStatuses.filter(
1044
- (c) => c.status === "healthy",
1045
- ).length;
1046
- const totalChecks = newState.checkStatuses.length;
1047
- const timestamp = new Date().toISOString();
1048
-
1049
- if (newState.status === "healthy" && previousStatus !== "healthy") {
1050
- // Recovery: system became healthy
1051
- await emitHook(healthCheckHooks.systemHealthy, {
1052
- systemId,
1053
- previousStatus,
1054
- healthyChecks,
1055
- totalChecks,
1056
- timestamp,
1057
- });
1058
- logger.debug(
1059
- `Emitted systemHealthy hook: ${previousStatus} → ${newState.status}`,
1060
- );
1061
- } else if (
1062
- previousStatus === "healthy" &&
1063
- newState.status !== "healthy"
1064
- ) {
1065
- // Degradation: system went from healthy to unhealthy/degraded
1066
- await emitHook(healthCheckHooks.systemDegraded, {
1067
- systemId,
1068
- previousStatus,
1069
- newStatus: newState.status,
1070
- healthyChecks,
1071
- totalChecks,
1072
- timestamp,
1073
- });
1074
- logger.debug(
1075
- `Emitted systemDegraded hook: ${previousStatus} → ${newState.status}`,
1076
- );
1077
- }
1078
-
1079
- // Umbrella hook — fires on every transition. Emitted alongside
1080
- // the directional hooks so existing subscribers stay unchanged
1081
- // while new automation triggers can react to any change.
1082
- if (previousStatus !== newState.status) {
1083
- await emitHook(healthCheckHooks.systemHealthChanged, {
1084
- systemId,
1085
- previousStatus,
1086
- newStatus: newState.status,
1087
- healthyChecks,
1088
- totalChecks,
1089
- timestamp,
1090
- });
1091
- }
1092
- }
937
+ // The directional + umbrella system-health hooks were removed in
938
+ // Phase 4 (§10.3): the `health` entity mirror above is the single
939
+ // source of truth, and its change deriver fires the
940
+ // `healthcheck.system_degraded` / `_healthy` / `_health_changed`
941
+ // trigger events through Stage-1 routing. Nothing to emit here.
1093
942
  }
1094
943
 
1095
- // Per-check auto-incident: see comment on the failed-execution path.
1096
- await maybeOpenAutoIncidentForCheck({
1097
- db,
1098
- service,
1099
- incidentClient,
1100
- maintenanceClient,
1101
- logger,
1102
- systemId,
1103
- systemName,
1104
- configurationId: configId,
1105
- configurationName: configRow.configName,
1106
- getEmitHook,
1107
- previousState,
1108
- newState,
1109
- });
1110
-
1111
944
  // Note: No manual rescheduling needed - recurring job handles it automatically
1112
945
  } catch (error) {
1113
946
  logger.error(
@@ -1115,27 +948,48 @@ async function executeHealthCheckJob(props: {
1115
948
  error,
1116
949
  );
1117
950
 
1118
- // Store failure (no latencyMs for failures)
1119
- await db.insert(healthCheckRuns).values({
1120
- configurationId: configId,
951
+ // Persist the failure run + aggregate THROUGH the reactive `health`
952
+ // entity: `apply` does the durable write and returns the freshly-computed
953
+ // view. The framework snapshots `prev` via the compute-on-read accessor
954
+ // BEFORE this insert, so a real status change emits exactly one correct
955
+ // `ENTITY_CHANGED` (§10.3). See the success path for the full rationale.
956
+ let newState!: AggregatedHealth;
957
+ await writeHealthEntity({
958
+ handle: getHealthEntity?.(),
1121
959
  systemId,
1122
- status: "unhealthy",
1123
- result: { error: String(error) } as Record<string, unknown>,
1124
- sourceId: undefined,
1125
- sourceLabel: "Local",
1126
- });
960
+ apply: async () => {
961
+ // Store failure (no latencyMs for failures)
962
+ await db.insert(healthCheckRuns).values({
963
+ configurationId: configId,
964
+ systemId,
965
+ status: "unhealthy",
966
+ result: { error: String(error) } as Record<string, unknown>,
967
+ sourceId: undefined,
968
+ sourceLabel: "Local",
969
+ });
1127
970
 
1128
- // Trigger incremental hourly aggregation
1129
- await incrementHourlyAggregate({
1130
- db,
1131
- systemId,
1132
- configurationId: configId,
1133
- status: "unhealthy",
1134
- latencyMs: undefined,
1135
- runTimestamp: new Date(),
1136
- // No collector data for error cases
1137
- collectorRegistry,
1138
- sourceLabel: "Local",
971
+ // Trigger incremental hourly aggregation
972
+ await incrementHourlyAggregate({
973
+ db,
974
+ systemId,
975
+ configurationId: configId,
976
+ status: "unhealthy",
977
+ latencyMs: undefined,
978
+ runTimestamp: new Date(),
979
+ // No collector data for error cases
980
+ collectorRegistry,
981
+ sourceLabel: "Local",
982
+ });
983
+
984
+ newState = await service.getSystemHealthStatus(systemId);
985
+ return toHealthEntityView(newState);
986
+ },
987
+ serialize: serializeHealthWrite,
988
+ onError: (mirrorError) =>
989
+ logger.warn(
990
+ `Failed to mirror health entity for ${systemId}`,
991
+ mirrorError,
992
+ ),
1139
993
  });
1140
994
 
1141
995
  // Try to fetch names for the enriched signal (best-effort)
@@ -1179,9 +1033,17 @@ async function executeHealthCheckJob(props: {
1179
1033
  result: undefined,
1180
1034
  });
1181
1035
 
1182
- // Check if aggregated state changed and notify subscribers
1183
- const newState = await service.getSystemHealthStatus(systemId);
1184
1036
  if (newState.status !== previousStatus) {
1037
+ // Record the aggregate transition so the sensing layer has a
1038
+ // reliable "in status since" for every status (Wave 2).
1039
+ await recordStateTransition({
1040
+ db,
1041
+ systemId,
1042
+ configurationId: configId,
1043
+ fromStatus: previousStatus,
1044
+ toStatus: newState.status,
1045
+ });
1046
+
1185
1047
  await notifyStateChange({
1186
1048
  notificationClient,
1187
1049
  systemId,
@@ -1203,83 +1065,20 @@ async function executeHealthCheckJob(props: {
1203
1065
  newStatus: newState.status,
1204
1066
  });
1205
1067
 
1206
- // Emit integration hooks for external integrations
1207
- const emitHook = getEmitHook();
1208
- if (emitHook) {
1209
- const healthyChecks = newState.checkStatuses.filter(
1210
- (c) => c.status === "healthy",
1211
- ).length;
1212
- const totalChecks = newState.checkStatuses.length;
1213
- const timestamp = new Date().toISOString();
1214
-
1215
- if (newState.status === "healthy" && previousStatus !== "healthy") {
1216
- // Recovery: system became healthy
1217
- await emitHook(healthCheckHooks.systemHealthy, {
1218
- systemId,
1219
- previousStatus,
1220
- healthyChecks,
1221
- totalChecks,
1222
- timestamp,
1223
- });
1224
- logger.debug(
1225
- `Emitted systemHealthy hook: ${previousStatus} → ${newState.status}`,
1226
- );
1227
- } else if (
1228
- previousStatus === "healthy" &&
1229
- newState.status !== "healthy"
1230
- ) {
1231
- // Degradation: system went from healthy to unhealthy/degraded
1232
- await emitHook(healthCheckHooks.systemDegraded, {
1233
- systemId,
1234
- previousStatus,
1235
- newStatus: newState.status,
1236
- healthyChecks,
1237
- totalChecks,
1238
- timestamp,
1239
- });
1240
- logger.debug(
1241
- `Emitted systemDegraded hook: ${previousStatus} → ${newState.status}`,
1242
- );
1243
- }
1244
-
1245
- // Umbrella hook — fires on every transition. Emitted alongside
1246
- // the directional hooks so existing subscribers stay unchanged
1247
- // while new automation triggers can react to any change.
1248
- if (previousStatus !== newState.status) {
1249
- await emitHook(healthCheckHooks.systemHealthChanged, {
1250
- systemId,
1251
- previousStatus,
1252
- newStatus: newState.status,
1253
- healthyChecks,
1254
- totalChecks,
1255
- timestamp,
1256
- });
1257
- }
1258
- }
1068
+ // The directional + umbrella system-health hooks were removed in
1069
+ // Phase 4 (§10.3): the `health` entity mirror above is the single
1070
+ // source of truth, and its change deriver fires the
1071
+ // `healthcheck.system_degraded` / `_healthy` / `_health_changed`
1072
+ // trigger events through Stage-1 routing. Nothing to emit here.
1259
1073
  }
1260
1074
 
1261
- // Per-check auto-incident: see comment on the failed-execution path.
1262
- await maybeOpenAutoIncidentForCheck({
1263
- db,
1264
- service,
1265
- incidentClient,
1266
- maintenanceClient,
1267
- logger,
1268
- systemId,
1269
- systemName,
1270
- configurationId: configId,
1271
- configurationName: configName,
1272
- getEmitHook,
1273
- previousState,
1274
- newState,
1275
- });
1276
-
1277
1075
  // Note: No manual rescheduling needed - recurring job handles it automatically
1278
1076
  }
1279
1077
  }
1280
1078
 
1281
1079
  export async function setupHealthCheckWorker(props: {
1282
1080
  db: Db;
1081
+ advisoryLock: AdvisoryLockService;
1283
1082
  registry: HealthCheckRegistry;
1284
1083
  collectorRegistry: CollectorRegistry;
1285
1084
  logger: Logger;
@@ -1290,10 +1089,13 @@ export async function setupHealthCheckWorker(props: {
1290
1089
  maintenanceClient: MaintenanceClient;
1291
1090
  incidentClient: IncidentClient;
1292
1091
  getEmitHook: () => EmitHookFn | undefined;
1092
+ getHealthEntity?: () => EntityHandle<HealthEntityState> | undefined;
1293
1093
  cache: HealthCheckCache;
1094
+ secretResolver?: SecretResolverService;
1294
1095
  }): Promise<void> {
1295
1096
  const {
1296
1097
  db,
1098
+ advisoryLock,
1297
1099
  registry,
1298
1100
  collectorRegistry,
1299
1101
  logger,
@@ -1304,7 +1106,9 @@ export async function setupHealthCheckWorker(props: {
1304
1106
  maintenanceClient,
1305
1107
  incidentClient,
1306
1108
  getEmitHook,
1109
+ getHealthEntity,
1307
1110
  cache,
1111
+ secretResolver,
1308
1112
  } = props;
1309
1113
 
1310
1114
  const queue =
@@ -1316,6 +1120,7 @@ export async function setupHealthCheckWorker(props: {
1316
1120
  await executeHealthCheckJob({
1317
1121
  payload: job.data,
1318
1122
  db,
1123
+ advisoryLock,
1319
1124
  registry,
1320
1125
  collectorRegistry,
1321
1126
  logger,
@@ -1325,7 +1130,9 @@ export async function setupHealthCheckWorker(props: {
1325
1130
  maintenanceClient,
1326
1131
  incidentClient,
1327
1132
  getEmitHook,
1133
+ getHealthEntity,
1328
1134
  cache,
1135
+ secretResolver,
1329
1136
  });
1330
1137
  },
1331
1138
  {