@checkstack/healthcheck-backend 1.3.0 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -36,6 +36,8 @@ import { IncidentApi } from "@checkstack/incident-common";
36
36
  import { NotificationApi } from "@checkstack/notification-common";
37
37
  import { healthcheckSystemSubscription } from "@checkstack/healthcheck-common";
38
38
  import { resolveRoute, type InferClient, extractErrorMessage} from "@checkstack/common";
39
+ import { secretEnvMappingSchema } from "@checkstack/secrets-common";
40
+ import type { SecretResolverService } from "@checkstack/secrets-backend";
39
41
  import { HealthCheckService } from "./service";
40
42
  import { healthCheckHooks } from "./hooks";
41
43
  import { incrementHourlyAggregate } from "./realtime-aggregation";
@@ -44,17 +46,13 @@ import {
44
46
  classifyTransition,
45
47
  shouldNotifyTransition,
46
48
  } from "./notification-policy";
49
+ import { recordStateTransition } from "./state-transitions";
47
50
  import {
48
- findLastAutoIncidentClose,
49
- findUnhealthySince,
50
- hasHealthyRunSince,
51
- isMaintenanceSuppressed,
52
- isTransitionToUnhealthy,
53
- openAutoIncident,
54
- recordUnhealthyTransition,
55
- shouldOpenForFlapping,
56
- shouldOpenForSustainedUnhealthy,
57
- } from "./auto-incident";
51
+ writeHealthEntity,
52
+ createHealthEntitySerializer,
53
+ type HealthEntityState,
54
+ } from "./health-entity";
55
+ import type { EntityHandle } from "@checkstack/automation-backend";
58
56
 
59
57
  type Db = SafeDatabase<typeof schema>;
60
58
  type CatalogClient = InferClient<typeof CatalogApi>;
@@ -62,6 +60,28 @@ type MaintenanceClient = InferClient<typeof MaintenanceApi>;
62
60
  type IncidentClient = InferClient<typeof IncidentApi>;
63
61
  type NotificationClient = InferClient<typeof NotificationApi>;
64
62
 
63
+ /** Shape of the aggregated state returned by `getSystemHealthStatus`. */
64
+ type AggregatedHealth = Awaited<
65
+ ReturnType<HealthCheckService["getSystemHealthStatus"]>
66
+ >;
67
+
68
+ /**
69
+ * Derive the reactive `health` entity view from the freshly-computed
70
+ * aggregated state. Mirrors `computeHealthEntityState` exactly: `status` is the
71
+ * worst-wins aggregate, `healthyChecks` counts per-check `"healthy"` statuses,
72
+ * and `totalChecks` is the number of enabled checks. Kept here so the
73
+ * `handle.mutate` write returns the SAME view the `read` accessor would have
74
+ * computed for the post-write state (the handle thus never re-reads).
75
+ */
76
+ function toHealthEntityView(state: AggregatedHealth): HealthEntityState {
77
+ return {
78
+ status: state.status,
79
+ healthyChecks: state.checkStatuses.filter((c) => c.status === "healthy")
80
+ .length,
81
+ totalChecks: state.checkStatuses.length,
82
+ };
83
+ }
84
+
65
85
  /**
66
86
  * Emit the checkCompleted hook if available, plus the narrower
67
87
  * `checkFailed` hook when the result wasn't `healthy` (so operators
@@ -171,222 +191,12 @@ export async function scheduleHealthCheck(props: {
171
191
  });
172
192
  }
173
193
 
174
- /**
175
- * After every check run, evaluate the per-check auto-incident
176
- * triggers. Either trigger can independently open an incident:
177
- *
178
- * - **flapping**: this just-completed run was a transition to
179
- * unhealthy AND `N` such transitions have happened within the
180
- * configured window.
181
- * - **sustained**: the check is currently unhealthy AND has been so
182
- * continuously for at least the configured duration.
183
- *
184
- * Both triggers honour the require-recovery rule: after the most
185
- * recent auto-incident close (manual or auto), no new auto-incident
186
- * opens until the check has logged at least one healthy run. This
187
- * stops a manual close → still-unhealthy → re-open loop.
188
- *
189
- * Active maintenance with suppression skips both triggers when the
190
- * policy opts in.
191
- */
192
- async function maybeOpenAutoIncidentForCheck(props: {
193
- db: Db;
194
- service: HealthCheckService;
195
- incidentClient: IncidentClient;
196
- maintenanceClient: MaintenanceClient;
197
- logger: Logger;
198
- systemId: string;
199
- systemName: string;
200
- configurationId: string;
201
- configurationName: string;
202
- /**
203
- * Same closure-based getter the queue executor uses elsewhere; let
204
- * us fire the `flapping_detected` automation hook from inside the
205
- * flapping evaluator without re-threading `emitHook` through every
206
- * intermediate caller. Optional — when absent, the hook simply
207
- * doesn't fire (e.g. in unit tests that don't care about it).
208
- */
209
- getEmitHook?: () => EmitHookFn | undefined;
210
- previousState: {
211
- checkStatuses: Array<{
212
- configurationId: string;
213
- status: HealthCheckStatus;
214
- }>;
215
- };
216
- newState: {
217
- checkStatuses: Array<{
218
- configurationId: string;
219
- status: HealthCheckStatus;
220
- }>;
221
- };
222
- }): Promise<void> {
223
- const {
224
- db,
225
- service,
226
- incidentClient,
227
- maintenanceClient,
228
- logger,
229
- systemId,
230
- systemName,
231
- configurationId,
232
- configurationName,
233
- getEmitHook,
234
- previousState,
235
- newState,
236
- } = props;
237
-
238
- const next = newState.checkStatuses.find(
239
- (c) => c.configurationId === configurationId,
240
- );
241
- // Only auto-incident logic applies when the check is currently
242
- // unhealthy — both triggers require it.
243
- if (!next || next.status !== "unhealthy") return;
244
-
245
- const prev = previousState.checkStatuses.find(
246
- (c) => c.configurationId === configurationId,
247
- );
248
- const isTransition = isTransitionToUnhealthy(prev?.status, next.status);
249
-
250
- let policy;
251
- try {
252
- policy = await service.getAssignmentNotificationPolicy({
253
- systemId,
254
- configurationId,
255
- });
256
- } catch (error) {
257
- logger.warn(
258
- `Failed to load policy for auto-incident decision (${systemId}/${configurationId}):`,
259
- error,
260
- );
261
- return;
262
- }
263
-
264
- if (!policy.autoOpenIncidentOnUnhealthy) return;
265
-
266
- // Honour active maintenance windows — operators have explicitly
267
- // said the system is down on purpose.
268
- if (policy.skipDuringMaintenance) {
269
- const suppressed = await isMaintenanceSuppressed({
270
- maintenanceClient,
271
- systemId,
272
- logger,
273
- });
274
- if (suppressed) {
275
- logger.debug(
276
- `Skipping auto-incident for ${systemId}/${configurationId}: active maintenance`,
277
- );
278
- return;
279
- }
280
- }
281
-
282
- // Require-recovery: if there's a prior closed auto-incident for
283
- // this assignment, the check must have logged at least one healthy
284
- // run since the close before we can open another one. Without this,
285
- // an operator's manual close on a still-broken system would loop.
286
- const lastCloseAt = await findLastAutoIncidentClose({
287
- db,
288
- systemId,
289
- configurationId,
290
- });
291
- if (lastCloseAt) {
292
- const recovered = await hasHealthyRunSince({
293
- db,
294
- systemId,
295
- configurationId,
296
- since: lastCloseAt,
297
- });
298
- if (!recovered) {
299
- return;
300
- }
301
- }
302
-
303
- // Record the transition (if any) and evaluate the flapping trigger
304
- // against transitions that happened after the last close window.
305
- let flappingOpens = false;
306
- if (isTransition) {
307
- try {
308
- const count = await recordUnhealthyTransition({
309
- db,
310
- configurationId,
311
- systemId,
312
- windowMinutes: policy.flappingTrigger.windowMinutes,
313
- since: lastCloseAt,
314
- });
315
- flappingOpens = shouldOpenForFlapping({
316
- policy,
317
- recentTransitionCount: count,
318
- });
319
-
320
- // Fire the informational `flapping_detected` automation hook
321
- // independently of the auto-incident decision: an operator may
322
- // care about flapping even with the auto-incident pipeline
323
- // turned off.
324
- if (
325
- policy.flappingTrigger.enabled &&
326
- count >= policy.flappingTrigger.transitions
327
- ) {
328
- const emit = getEmitHook?.();
329
- if (emit) {
330
- try {
331
- await emit(healthCheckHooks.flappingDetected, {
332
- systemId,
333
- configurationId,
334
- transitionCount: count,
335
- windowMinutes: policy.flappingTrigger.windowMinutes,
336
- timestamp: new Date().toISOString(),
337
- });
338
- } catch (error) {
339
- logger.warn(
340
- `Failed to emit healthcheck.flapping_detected hook for ${systemId}/${configurationId}:`,
341
- error,
342
- );
343
- }
344
- }
345
- }
346
- } catch (error) {
347
- logger.warn(
348
- `Failed to record unhealthy transition for ${systemId}/${configurationId}:`,
349
- error,
350
- );
351
- }
352
- }
353
-
354
- // Evaluate the sustained-duration trigger on every run while the
355
- // check is unhealthy (not just on transition).
356
- let sustainedOpens = false;
357
- if (policy.sustainedUnhealthyTrigger.enabled) {
358
- const unhealthySince = await findUnhealthySince({
359
- db,
360
- configurationId,
361
- systemId,
362
- since: lastCloseAt,
363
- });
364
- if (unhealthySince) {
365
- sustainedOpens = shouldOpenForSustainedUnhealthy({
366
- policy,
367
- unhealthyForMs: Date.now() - unhealthySince.getTime(),
368
- });
369
- }
370
- }
371
-
372
- if (!flappingOpens && !sustainedOpens) return;
373
-
374
- const reason = flappingOpens
375
- ? `flapping: ≥${policy.flappingTrigger.transitions} transitions in ${policy.flappingTrigger.windowMinutes} min`
376
- : `unhealthy ≥${policy.sustainedUnhealthyTrigger.durationMinutes} min continuously`;
377
-
378
- await openAutoIncident({
379
- db,
380
- incidentClient,
381
- logger,
382
- systemId,
383
- systemName,
384
- configurationId,
385
- configurationName,
386
- policy,
387
- reason,
388
- });
389
- }
194
+ // Flapping detection no longer lives here. It moved into the automation
195
+ // engine as a windowed-count gate on the `healthcheck.system_health_changed`
196
+ // trigger (raw aggregated-health change + `filter` +
197
+ // `window: { count, minutes, refire: "once" }`). The queue executor emits only
198
+ // the raw per-system health change (via the reactive `health` entity deriver,
199
+ // unchanged); the engine does the counting.
390
200
 
391
201
  /**
392
202
  * Notify system subscribers about a health state change.
@@ -575,6 +385,21 @@ async function executeHealthCheckJob(props: {
575
385
  incidentClient: IncidentClient;
576
386
  getEmitHook: () => EmitHookFn | undefined;
577
387
  cache: HealthCheckCache;
388
+ /**
389
+ * Resolver for the reactive `health` entity handle (§10.3). Returns the
390
+ * handle once automation-backend has bound the entity store; `undefined`
391
+ * during version skew / tests. Mirrors the `getEmitHook` closure pattern.
392
+ * The entity is PLUGIN-BACKED + COMPUTED — there is no keyed store; the
393
+ * durable run/aggregate write IS the entity write (see `writeHealthEntity`).
394
+ */
395
+ getHealthEntity?: () => EntityHandle<HealthEntityState> | undefined;
396
+ /**
397
+ * Central secret resolver. When set, a collector declaring a `secretEnv`
398
+ * has it resolved + injected for this centrally-executed run; the
399
+ * collector masks the values out of its output. Optional for version-skew
400
+ * / test isolation.
401
+ */
402
+ secretResolver?: SecretResolverService;
578
403
  }): Promise<void> {
579
404
  const {
580
405
  payload,
@@ -588,13 +413,23 @@ async function executeHealthCheckJob(props: {
588
413
  maintenanceClient,
589
414
  incidentClient,
590
415
  getEmitHook,
416
+ getHealthEntity,
591
417
  cache,
418
+ secretResolver,
592
419
  } = props;
593
420
  const { configId, systemId } = payload;
594
421
 
595
422
  // Create service for aggregated state evaluation
596
423
  const service = new HealthCheckService(db, registry, collectorRegistry);
597
424
 
425
+ // Per-system serializer for the reactive health mutate (§10.3): a
426
+ // transaction-scoped advisory lock keyed `health:<systemId>` wraps the
427
+ // snapshot-prev + apply + diff + emit so concurrent evaluations of one
428
+ // system (multiple per-config jobs across pods, or at-least-once
429
+ // redelivery) can't double-emit a single logical transition. Bound to this
430
+ // job's systemId below at every `writeHealthEntity` call.
431
+ const serializeHealthWrite = createHealthEntitySerializer({ db })(systemId);
432
+
598
433
  // Capture aggregated state BEFORE this run for comparison
599
434
  const previousState = await service.getSystemHealthStatus(systemId);
600
435
  const previousStatus = previousState.status;
@@ -725,11 +560,31 @@ async function executeHealthCheckJob(props: {
725
560
  const storageKey = collectorEntry.id;
726
561
 
727
562
  try {
563
+ // Resolve the collector's declared secretEnv for THIS run
564
+ // (central execution). The collector injects it and masks the
565
+ // values out of its output. A missing required secret throws
566
+ // and fails the collector clearly.
567
+ let secretEnv: Record<string, string> | undefined;
568
+ const declared = secretEnvMappingSchema.safeParse(
569
+ (collectorEntry.config as { secretEnv?: unknown }).secretEnv,
570
+ );
571
+ if (
572
+ secretResolver &&
573
+ declared.success &&
574
+ Object.keys(declared.data).length > 0
575
+ ) {
576
+ const resolved = await secretResolver.resolveForRun({
577
+ secretEnv: declared.data,
578
+ });
579
+ secretEnv = resolved.env;
580
+ }
581
+
728
582
  const collectorResult = await registered.collector.execute({
729
583
  config: collectorEntry.config,
730
584
  client: connectedClient!.client,
731
585
  pluginId: configRow.strategyId,
732
586
  runContext,
587
+ ...(secretEnv ? { secretEnv } : {}),
733
588
  });
734
589
 
735
590
  // Check for collector-level error
@@ -860,26 +715,44 @@ async function executeHealthCheckJob(props: {
860
715
  },
861
716
  };
862
717
 
863
- await db.insert(healthCheckRuns).values({
864
- configurationId: configId,
718
+ // Persist the run + aggregate THROUGH the reactive `health` entity:
719
+ // `apply` does the durable write and returns the freshly-computed view.
720
+ // The framework snapshots `prev` via `read` BEFORE this insert, so a real
721
+ // status change emits exactly one correct `ENTITY_CHANGED` (§10.3). The
722
+ // computed aggregated state is stashed for the transition/notify path.
723
+ let newState!: AggregatedHealth;
724
+ await writeHealthEntity({
725
+ handle: getHealthEntity?.(),
865
726
  systemId,
866
- status: result.status,
867
- latencyMs: result.latencyMs,
868
- result: { ...result } as Record<string, unknown>,
869
- sourceId: undefined,
870
- sourceLabel: "Local",
871
- });
727
+ apply: async () => {
728
+ await db.insert(healthCheckRuns).values({
729
+ configurationId: configId,
730
+ systemId,
731
+ status: result.status,
732
+ latencyMs: result.latencyMs,
733
+ result: { ...result } as Record<string, unknown>,
734
+ sourceId: undefined,
735
+ sourceLabel: "Local",
736
+ });
872
737
 
873
- await incrementHourlyAggregate({
874
- db,
875
- systemId,
876
- configurationId: configId,
877
- status: result.status,
878
- latencyMs: result.latencyMs,
879
- runTimestamp: new Date(),
880
- result: { ...result } as Record<string, unknown>,
881
- collectorRegistry,
882
- sourceLabel: "Local",
738
+ await incrementHourlyAggregate({
739
+ db,
740
+ systemId,
741
+ configurationId: configId,
742
+ status: result.status,
743
+ latencyMs: result.latencyMs,
744
+ runTimestamp: new Date(),
745
+ result: { ...result } as Record<string, unknown>,
746
+ collectorRegistry,
747
+ sourceLabel: "Local",
748
+ });
749
+
750
+ newState = await service.getSystemHealthStatus(systemId);
751
+ return toHealthEntityView(newState);
752
+ },
753
+ serialize: serializeHealthWrite,
754
+ onError: (error) =>
755
+ logger.warn(`Failed to mirror health entity for ${systemId}`, error),
883
756
  });
884
757
 
885
758
  logger.debug(
@@ -899,8 +772,17 @@ async function executeHealthCheckJob(props: {
899
772
  latencyMs: result.latencyMs,
900
773
  });
901
774
 
902
- const newState = await service.getSystemHealthStatus(systemId);
903
775
  if (newState.status !== previousStatus) {
776
+ // Record the aggregate transition so the sensing layer has a
777
+ // reliable "in status since" for every status (Wave 2).
778
+ await recordStateTransition({
779
+ db,
780
+ systemId,
781
+ configurationId: configId,
782
+ fromStatus: previousStatus,
783
+ toStatus: newState.status,
784
+ });
785
+
904
786
  await notifyStateChange({
905
787
  notificationClient,
906
788
  systemId,
@@ -916,24 +798,6 @@ async function executeHealthCheckJob(props: {
916
798
  });
917
799
  }
918
800
 
919
- // Per-check auto-incident: runs whether or not the aggregate
920
- // changed (a check can transition to unhealthy without flipping
921
- // the aggregate if another check is already unhealthy).
922
- await maybeOpenAutoIncidentForCheck({
923
- db,
924
- service,
925
- incidentClient,
926
- maintenanceClient,
927
- logger,
928
- systemId,
929
- systemName,
930
- configurationId: configId,
931
- configurationName: configRow.configName,
932
- getEmitHook,
933
- previousState,
934
- newState,
935
- });
936
-
937
801
  return;
938
802
  } finally {
939
803
  if (connectedClient) {
@@ -962,28 +826,48 @@ async function executeHealthCheckJob(props: {
962
826
  },
963
827
  };
964
828
 
965
- // Store result (spread to convert structured type to plain record for jsonb)
966
- await db.insert(healthCheckRuns).values({
967
- configurationId: configId,
829
+ // Persist the run + aggregate THROUGH the reactive `health` entity on
830
+ // every run (§10.3): `apply` does the durable write (insert + hourly
831
+ // aggregate) and returns the freshly-computed view. The framework
832
+ // snapshots `prev` via the COMPUTE-ON-READ accessor BEFORE this insert, so
833
+ // an unchanged aggregate is a no-op and a real status change drives the
834
+ // directional/umbrella trigger events via `deriveHealthTriggerEvents` —
835
+ // exactly one correct `ENTITY_CHANGED` with accurate prev → next.
836
+ let newState!: AggregatedHealth;
837
+ await writeHealthEntity({
838
+ handle: getHealthEntity?.(),
968
839
  systemId,
969
- status: result.status,
970
- latencyMs: result.latencyMs,
971
- result: { ...result } as Record<string, unknown>,
972
- sourceId: undefined,
973
- sourceLabel: "Local",
974
- });
840
+ apply: async () => {
841
+ // Store result (spread to convert structured type to plain record for jsonb)
842
+ await db.insert(healthCheckRuns).values({
843
+ configurationId: configId,
844
+ systemId,
845
+ status: result.status,
846
+ latencyMs: result.latencyMs,
847
+ result: { ...result } as Record<string, unknown>,
848
+ sourceId: undefined,
849
+ sourceLabel: "Local",
850
+ });
975
851
 
976
- // Trigger incremental hourly aggregation
977
- await incrementHourlyAggregate({
978
- db,
979
- systemId,
980
- configurationId: configId,
981
- status: result.status,
982
- latencyMs: result.latencyMs,
983
- runTimestamp: new Date(),
984
- result: { ...result } as Record<string, unknown>,
985
- collectorRegistry,
986
- sourceLabel: "Local",
852
+ // Trigger incremental hourly aggregation
853
+ await incrementHourlyAggregate({
854
+ db,
855
+ systemId,
856
+ configurationId: configId,
857
+ status: result.status,
858
+ latencyMs: result.latencyMs,
859
+ runTimestamp: new Date(),
860
+ result: { ...result } as Record<string, unknown>,
861
+ collectorRegistry,
862
+ sourceLabel: "Local",
863
+ });
864
+
865
+ newState = await service.getSystemHealthStatus(systemId);
866
+ return toHealthEntityView(newState);
867
+ },
868
+ serialize: serializeHealthWrite,
869
+ onError: (error) =>
870
+ logger.warn(`Failed to mirror health entity for ${systemId}`, error),
987
871
  });
988
872
 
989
873
  logger.debug(
@@ -1013,9 +897,17 @@ async function executeHealthCheckJob(props: {
1013
897
  result: (result.metadata?.collectors as Record<string, unknown>) ?? undefined,
1014
898
  });
1015
899
 
1016
- // Check if aggregated state changed and notify subscribers
1017
- const newState = await service.getSystemHealthStatus(systemId);
1018
900
  if (newState.status !== previousStatus) {
901
+ // Record the aggregate transition so the sensing layer has a
902
+ // reliable "in status since" for every status (Wave 2).
903
+ await recordStateTransition({
904
+ db,
905
+ systemId,
906
+ configurationId: configId,
907
+ fromStatus: previousStatus,
908
+ toStatus: newState.status,
909
+ });
910
+
1019
911
  await notifyStateChange({
1020
912
  notificationClient,
1021
913
  systemId,
@@ -1037,77 +929,13 @@ async function executeHealthCheckJob(props: {
1037
929
  newStatus: newState.status,
1038
930
  });
1039
931
 
1040
- // Emit integration hooks for external integrations
1041
- const emitHook = getEmitHook();
1042
- if (emitHook) {
1043
- const healthyChecks = newState.checkStatuses.filter(
1044
- (c) => c.status === "healthy",
1045
- ).length;
1046
- const totalChecks = newState.checkStatuses.length;
1047
- const timestamp = new Date().toISOString();
1048
-
1049
- if (newState.status === "healthy" && previousStatus !== "healthy") {
1050
- // Recovery: system became healthy
1051
- await emitHook(healthCheckHooks.systemHealthy, {
1052
- systemId,
1053
- previousStatus,
1054
- healthyChecks,
1055
- totalChecks,
1056
- timestamp,
1057
- });
1058
- logger.debug(
1059
- `Emitted systemHealthy hook: ${previousStatus} → ${newState.status}`,
1060
- );
1061
- } else if (
1062
- previousStatus === "healthy" &&
1063
- newState.status !== "healthy"
1064
- ) {
1065
- // Degradation: system went from healthy to unhealthy/degraded
1066
- await emitHook(healthCheckHooks.systemDegraded, {
1067
- systemId,
1068
- previousStatus,
1069
- newStatus: newState.status,
1070
- healthyChecks,
1071
- totalChecks,
1072
- timestamp,
1073
- });
1074
- logger.debug(
1075
- `Emitted systemDegraded hook: ${previousStatus} → ${newState.status}`,
1076
- );
1077
- }
1078
-
1079
- // Umbrella hook — fires on every transition. Emitted alongside
1080
- // the directional hooks so existing subscribers stay unchanged
1081
- // while new automation triggers can react to any change.
1082
- if (previousStatus !== newState.status) {
1083
- await emitHook(healthCheckHooks.systemHealthChanged, {
1084
- systemId,
1085
- previousStatus,
1086
- newStatus: newState.status,
1087
- healthyChecks,
1088
- totalChecks,
1089
- timestamp,
1090
- });
1091
- }
1092
- }
932
+ // The directional + umbrella system-health hooks were removed in
933
+ // Phase 4 (§10.3): the `health` entity mirror above is the single
934
+ // source of truth, and its change deriver fires the
935
+ // `healthcheck.system_degraded` / `_healthy` / `_health_changed`
936
+ // trigger events through Stage-1 routing. Nothing to emit here.
1093
937
  }
1094
938
 
1095
- // Per-check auto-incident: see comment on the failed-execution path.
1096
- await maybeOpenAutoIncidentForCheck({
1097
- db,
1098
- service,
1099
- incidentClient,
1100
- maintenanceClient,
1101
- logger,
1102
- systemId,
1103
- systemName,
1104
- configurationId: configId,
1105
- configurationName: configRow.configName,
1106
- getEmitHook,
1107
- previousState,
1108
- newState,
1109
- });
1110
-
1111
939
  // Note: No manual rescheduling needed - recurring job handles it automatically
1112
940
  } catch (error) {
1113
941
  logger.error(
@@ -1115,27 +943,48 @@ async function executeHealthCheckJob(props: {
1115
943
  error,
1116
944
  );
1117
945
 
1118
- // Store failure (no latencyMs for failures)
1119
- await db.insert(healthCheckRuns).values({
1120
- configurationId: configId,
946
+ // Persist the failure run + aggregate THROUGH the reactive `health`
947
+ // entity: `apply` does the durable write and returns the freshly-computed
948
+ // view. The framework snapshots `prev` via the compute-on-read accessor
949
+ // BEFORE this insert, so a real status change emits exactly one correct
950
+ // `ENTITY_CHANGED` (§10.3). See the success path for the full rationale.
951
+ let newState!: AggregatedHealth;
952
+ await writeHealthEntity({
953
+ handle: getHealthEntity?.(),
1121
954
  systemId,
1122
- status: "unhealthy",
1123
- result: { error: String(error) } as Record<string, unknown>,
1124
- sourceId: undefined,
1125
- sourceLabel: "Local",
1126
- });
955
+ apply: async () => {
956
+ // Store failure (no latencyMs for failures)
957
+ await db.insert(healthCheckRuns).values({
958
+ configurationId: configId,
959
+ systemId,
960
+ status: "unhealthy",
961
+ result: { error: String(error) } as Record<string, unknown>,
962
+ sourceId: undefined,
963
+ sourceLabel: "Local",
964
+ });
1127
965
 
1128
- // Trigger incremental hourly aggregation
1129
- await incrementHourlyAggregate({
1130
- db,
1131
- systemId,
1132
- configurationId: configId,
1133
- status: "unhealthy",
1134
- latencyMs: undefined,
1135
- runTimestamp: new Date(),
1136
- // No collector data for error cases
1137
- collectorRegistry,
1138
- sourceLabel: "Local",
966
+ // Trigger incremental hourly aggregation
967
+ await incrementHourlyAggregate({
968
+ db,
969
+ systemId,
970
+ configurationId: configId,
971
+ status: "unhealthy",
972
+ latencyMs: undefined,
973
+ runTimestamp: new Date(),
974
+ // No collector data for error cases
975
+ collectorRegistry,
976
+ sourceLabel: "Local",
977
+ });
978
+
979
+ newState = await service.getSystemHealthStatus(systemId);
980
+ return toHealthEntityView(newState);
981
+ },
982
+ serialize: serializeHealthWrite,
983
+ onError: (mirrorError) =>
984
+ logger.warn(
985
+ `Failed to mirror health entity for ${systemId}`,
986
+ mirrorError,
987
+ ),
1139
988
  });
1140
989
 
1141
990
  // Try to fetch names for the enriched signal (best-effort)
@@ -1179,9 +1028,17 @@ async function executeHealthCheckJob(props: {
1179
1028
  result: undefined,
1180
1029
  });
1181
1030
 
1182
- // Check if aggregated state changed and notify subscribers
1183
- const newState = await service.getSystemHealthStatus(systemId);
1184
1031
  if (newState.status !== previousStatus) {
1032
+ // Record the aggregate transition so the sensing layer has a
1033
+ // reliable "in status since" for every status (Wave 2).
1034
+ await recordStateTransition({
1035
+ db,
1036
+ systemId,
1037
+ configurationId: configId,
1038
+ fromStatus: previousStatus,
1039
+ toStatus: newState.status,
1040
+ });
1041
+
1185
1042
  await notifyStateChange({
1186
1043
  notificationClient,
1187
1044
  systemId,
@@ -1203,77 +1060,13 @@ async function executeHealthCheckJob(props: {
1203
1060
  newStatus: newState.status,
1204
1061
  });
1205
1062
 
1206
- // Emit integration hooks for external integrations
1207
- const emitHook = getEmitHook();
1208
- if (emitHook) {
1209
- const healthyChecks = newState.checkStatuses.filter(
1210
- (c) => c.status === "healthy",
1211
- ).length;
1212
- const totalChecks = newState.checkStatuses.length;
1213
- const timestamp = new Date().toISOString();
1214
-
1215
- if (newState.status === "healthy" && previousStatus !== "healthy") {
1216
- // Recovery: system became healthy
1217
- await emitHook(healthCheckHooks.systemHealthy, {
1218
- systemId,
1219
- previousStatus,
1220
- healthyChecks,
1221
- totalChecks,
1222
- timestamp,
1223
- });
1224
- logger.debug(
1225
- `Emitted systemHealthy hook: ${previousStatus} → ${newState.status}`,
1226
- );
1227
- } else if (
1228
- previousStatus === "healthy" &&
1229
- newState.status !== "healthy"
1230
- ) {
1231
- // Degradation: system went from healthy to unhealthy/degraded
1232
- await emitHook(healthCheckHooks.systemDegraded, {
1233
- systemId,
1234
- previousStatus,
1235
- newStatus: newState.status,
1236
- healthyChecks,
1237
- totalChecks,
1238
- timestamp,
1239
- });
1240
- logger.debug(
1241
- `Emitted systemDegraded hook: ${previousStatus} → ${newState.status}`,
1242
- );
1243
- }
1244
-
1245
- // Umbrella hook — fires on every transition. Emitted alongside
1246
- // the directional hooks so existing subscribers stay unchanged
1247
- // while new automation triggers can react to any change.
1248
- if (previousStatus !== newState.status) {
1249
- await emitHook(healthCheckHooks.systemHealthChanged, {
1250
- systemId,
1251
- previousStatus,
1252
- newStatus: newState.status,
1253
- healthyChecks,
1254
- totalChecks,
1255
- timestamp,
1256
- });
1257
- }
1258
- }
1063
+ // The directional + umbrella system-health hooks were removed in
1064
+ // Phase 4 (§10.3): the `health` entity mirror above is the single
1065
+ // source of truth, and its change deriver fires the
1066
+ // `healthcheck.system_degraded` / `_healthy` / `_health_changed`
1067
+ // trigger events through Stage-1 routing. Nothing to emit here.
1259
1068
  }
1260
1069
 
1261
- // Per-check auto-incident: see comment on the failed-execution path.
1262
- await maybeOpenAutoIncidentForCheck({
1263
- db,
1264
- service,
1265
- incidentClient,
1266
- maintenanceClient,
1267
- logger,
1268
- systemId,
1269
- systemName,
1270
- configurationId: configId,
1271
- configurationName: configName,
1272
- getEmitHook,
1273
- previousState,
1274
- newState,
1275
- });
1276
-
1277
1070
  // Note: No manual rescheduling needed - recurring job handles it automatically
1278
1071
  }
1279
1072
  }
@@ -1290,7 +1083,9 @@ export async function setupHealthCheckWorker(props: {
1290
1083
  maintenanceClient: MaintenanceClient;
1291
1084
  incidentClient: IncidentClient;
1292
1085
  getEmitHook: () => EmitHookFn | undefined;
1086
+ getHealthEntity?: () => EntityHandle<HealthEntityState> | undefined;
1293
1087
  cache: HealthCheckCache;
1088
+ secretResolver?: SecretResolverService;
1294
1089
  }): Promise<void> {
1295
1090
  const {
1296
1091
  db,
@@ -1304,7 +1099,9 @@ export async function setupHealthCheckWorker(props: {
1304
1099
  maintenanceClient,
1305
1100
  incidentClient,
1306
1101
  getEmitHook,
1102
+ getHealthEntity,
1307
1103
  cache,
1104
+ secretResolver,
1308
1105
  } = props;
1309
1106
 
1310
1107
  const queue =
@@ -1325,7 +1122,9 @@ export async function setupHealthCheckWorker(props: {
1325
1122
  maintenanceClient,
1326
1123
  incidentClient,
1327
1124
  getEmitHook,
1125
+ getHealthEntity,
1328
1126
  cache,
1127
+ secretResolver,
1329
1128
  });
1330
1129
  },
1331
1130
  {