@checkstack/healthcheck-backend 1.2.0 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. package/CHANGELOG.md +541 -0
  2. package/drizzle/0015_quiet_meggan.sql +12 -0
  3. package/drizzle/0016_complex_maginty.sql +1 -0
  4. package/drizzle/0017_pretty_caretaker.sql +1 -0
  5. package/drizzle/meta/0015_snapshot.json +764 -0
  6. package/drizzle/meta/0016_snapshot.json +644 -0
  7. package/drizzle/meta/0017_snapshot.json +563 -0
  8. package/drizzle/meta/_journal.json +21 -0
  9. package/package.json +24 -21
  10. package/src/automations.test.ts +234 -0
  11. package/src/automations.ts +342 -0
  12. package/src/collector-script-test.test.ts +236 -0
  13. package/src/collector-script-test.ts +221 -0
  14. package/src/health-entity.test.ts +698 -0
  15. package/src/health-entity.ts +369 -0
  16. package/src/health-state.test.ts +115 -0
  17. package/src/health-state.ts +333 -0
  18. package/src/healthcheck-gitops-kinds.test.ts +6 -32
  19. package/src/healthcheck-gitops-kinds.ts +4 -19
  20. package/src/hooks.test.ts +19 -6
  21. package/src/hooks.ts +38 -28
  22. package/src/index.ts +150 -98
  23. package/src/queue-executor.test.ts +137 -0
  24. package/src/queue-executor.ts +282 -380
  25. package/src/retention-job.ts +65 -1
  26. package/src/retention-state-transitions.test.ts +49 -0
  27. package/src/router.test.ts +18 -0
  28. package/src/router.ts +56 -1
  29. package/src/schema.ts +34 -54
  30. package/src/service-assignments.test.ts +184 -0
  31. package/src/service-notification-policy.test.ts +28 -71
  32. package/src/service.ts +154 -0
  33. package/src/state-transitions.test.ts +126 -0
  34. package/src/state-transitions.ts +112 -0
  35. package/tsconfig.json +12 -3
  36. package/src/auto-incident-close-job.ts +0 -164
  37. package/src/auto-incident.test.ts +0 -196
  38. package/src/auto-incident.ts +0 -332
@@ -8,6 +8,7 @@ import {
8
8
  type BaseStrategyConfig,
9
9
  type ConnectedClient,
10
10
  type TransportClient,
11
+ type CollectorRunContext,
11
12
  } from "@checkstack/backend-api";
12
13
  import { QueueManager } from "@checkstack/queue-api";
13
14
  import {
@@ -35,6 +36,8 @@ import { IncidentApi } from "@checkstack/incident-common";
35
36
  import { NotificationApi } from "@checkstack/notification-common";
36
37
  import { healthcheckSystemSubscription } from "@checkstack/healthcheck-common";
37
38
  import { resolveRoute, type InferClient, extractErrorMessage} from "@checkstack/common";
39
+ import { secretEnvMappingSchema } from "@checkstack/secrets-common";
40
+ import type { SecretResolverService } from "@checkstack/secrets-backend";
38
41
  import { HealthCheckService } from "./service";
39
42
  import { healthCheckHooks } from "./hooks";
40
43
  import { incrementHourlyAggregate } from "./realtime-aggregation";
@@ -43,17 +46,13 @@ import {
43
46
  classifyTransition,
44
47
  shouldNotifyTransition,
45
48
  } from "./notification-policy";
49
+ import { recordStateTransition } from "./state-transitions";
46
50
  import {
47
- findLastAutoIncidentClose,
48
- findUnhealthySince,
49
- hasHealthyRunSince,
50
- isMaintenanceSuppressed,
51
- isTransitionToUnhealthy,
52
- openAutoIncident,
53
- recordUnhealthyTransition,
54
- shouldOpenForFlapping,
55
- shouldOpenForSustainedUnhealthy,
56
- } from "./auto-incident";
51
+ writeHealthEntity,
52
+ createHealthEntitySerializer,
53
+ type HealthEntityState,
54
+ } from "./health-entity";
55
+ import type { EntityHandle } from "@checkstack/automation-backend";
57
56
 
58
57
  type Db = SafeDatabase<typeof schema>;
59
58
  type CatalogClient = InferClient<typeof CatalogApi>;
@@ -61,9 +60,36 @@ type MaintenanceClient = InferClient<typeof MaintenanceApi>;
61
60
  type IncidentClient = InferClient<typeof IncidentApi>;
62
61
  type NotificationClient = InferClient<typeof NotificationApi>;
63
62
 
63
+ /** Shape of the aggregated state returned by `getSystemHealthStatus`. */
64
+ type AggregatedHealth = Awaited<
65
+ ReturnType<HealthCheckService["getSystemHealthStatus"]>
66
+ >;
67
+
64
68
  /**
65
- * Emit the checkCompleted hook if available.
66
- * Extracted to avoid duplicating the hook emission pattern across success/error paths.
69
+ * Derive the reactive `health` entity view from the freshly-computed
70
+ * aggregated state. Mirrors `computeHealthEntityState` exactly: `status` is the
71
+ * worst-wins aggregate, `healthyChecks` counts per-check `"healthy"` statuses,
72
+ * and `totalChecks` is the number of enabled checks. Kept here so the
73
+ * `handle.mutate` write returns the SAME view the `read` accessor would have
74
+ * computed for the post-write state (the handle thus never re-reads).
75
+ */
76
+ function toHealthEntityView(state: AggregatedHealth): HealthEntityState {
77
+ return {
78
+ status: state.status,
79
+ healthyChecks: state.checkStatuses.filter((c) => c.status === "healthy")
80
+ .length,
81
+ totalChecks: state.checkStatuses.length,
82
+ };
83
+ }
84
+
85
+ /**
86
+ * Emit the checkCompleted hook if available, plus the narrower
87
+ * `checkFailed` hook when the result wasn't `healthy` (so operators
88
+ * can wire a typed "trigger on failure" automation without having to
89
+ * filter `checkCompleted` themselves).
90
+ *
91
+ * Extracted to avoid duplicating the hook emission pattern across
92
+ * success/error paths.
67
93
  */
68
94
  async function emitCheckCompletedHook({
69
95
  getEmitHook,
@@ -81,14 +107,26 @@ async function emitCheckCompletedHook({
81
107
  result: Record<string, unknown> | undefined;
82
108
  }): Promise<void> {
83
109
  const emitHook = getEmitHook();
84
- if (emitHook) {
85
- await emitHook(healthCheckHooks.checkCompleted, {
110
+ if (!emitHook) return;
111
+ const timestamp = new Date().toISOString();
112
+ await emitHook(healthCheckHooks.checkCompleted, {
113
+ systemId,
114
+ configurationId,
115
+ status,
116
+ latencyMs,
117
+ result,
118
+ timestamp,
119
+ });
120
+ // Narrow follow-up — informational for automation triggers; any
121
+ // incident automation is evaluated separately, not in this function.
122
+ if (status !== "healthy") {
123
+ await emitHook(healthCheckHooks.checkFailed, {
86
124
  systemId,
87
125
  configurationId,
88
126
  status,
89
127
  latencyMs,
90
128
  result,
91
- timestamp: new Date().toISOString(),
129
+ timestamp,
92
130
  });
93
131
  }
94
132
  }
@@ -102,9 +140,11 @@ export interface HealthCheckJobPayload {
102
140
  }
103
141
 
104
142
  /**
105
- * Queue name for health check execution
143
+ * Queue name for health check execution. Exported so consumers like
144
+ * the `healthcheck.run_now` automation action can enqueue a one-off
145
+ * job without re-importing the recurring-job factory.
106
146
  */
107
- const HEALTH_CHECK_QUEUE = "health-checks";
147
+ export const HEALTH_CHECK_QUEUE = "health-checks";
108
148
 
109
149
  /**
110
150
  * Worker group for health check execution (work-queue mode)
@@ -151,186 +191,12 @@ export async function scheduleHealthCheck(props: {
151
191
  });
152
192
  }
153
193
 
154
- /**
155
- * After every check run, evaluate the per-check auto-incident
156
- * triggers. Either trigger can independently open an incident:
157
- *
158
- * - **flapping**: this just-completed run was a transition to
159
- * unhealthy AND `N` such transitions have happened within the
160
- * configured window.
161
- * - **sustained**: the check is currently unhealthy AND has been so
162
- * continuously for at least the configured duration.
163
- *
164
- * Both triggers honour the require-recovery rule: after the most
165
- * recent auto-incident close (manual or auto), no new auto-incident
166
- * opens until the check has logged at least one healthy run. This
167
- * stops a manual close → still-unhealthy → re-open loop.
168
- *
169
- * Active maintenance with suppression skips both triggers when the
170
- * policy opts in.
171
- */
172
- async function maybeOpenAutoIncidentForCheck(props: {
173
- db: Db;
174
- service: HealthCheckService;
175
- incidentClient: IncidentClient;
176
- maintenanceClient: MaintenanceClient;
177
- logger: Logger;
178
- systemId: string;
179
- systemName: string;
180
- configurationId: string;
181
- configurationName: string;
182
- previousState: {
183
- checkStatuses: Array<{
184
- configurationId: string;
185
- status: HealthCheckStatus;
186
- }>;
187
- };
188
- newState: {
189
- checkStatuses: Array<{
190
- configurationId: string;
191
- status: HealthCheckStatus;
192
- }>;
193
- };
194
- }): Promise<void> {
195
- const {
196
- db,
197
- service,
198
- incidentClient,
199
- maintenanceClient,
200
- logger,
201
- systemId,
202
- systemName,
203
- configurationId,
204
- configurationName,
205
- previousState,
206
- newState,
207
- } = props;
208
-
209
- const next = newState.checkStatuses.find(
210
- (c) => c.configurationId === configurationId,
211
- );
212
- // Only auto-incident logic applies when the check is currently
213
- // unhealthy — both triggers require it.
214
- if (!next || next.status !== "unhealthy") return;
215
-
216
- const prev = previousState.checkStatuses.find(
217
- (c) => c.configurationId === configurationId,
218
- );
219
- const isTransition = isTransitionToUnhealthy(prev?.status, next.status);
220
-
221
- let policy;
222
- try {
223
- policy = await service.getAssignmentNotificationPolicy({
224
- systemId,
225
- configurationId,
226
- });
227
- } catch (error) {
228
- logger.warn(
229
- `Failed to load policy for auto-incident decision (${systemId}/${configurationId}):`,
230
- error,
231
- );
232
- return;
233
- }
234
-
235
- if (!policy.autoOpenIncidentOnUnhealthy) return;
236
-
237
- // Honour active maintenance windows — operators have explicitly
238
- // said the system is down on purpose.
239
- if (policy.skipDuringMaintenance) {
240
- const suppressed = await isMaintenanceSuppressed({
241
- maintenanceClient,
242
- systemId,
243
- logger,
244
- });
245
- if (suppressed) {
246
- logger.debug(
247
- `Skipping auto-incident for ${systemId}/${configurationId}: active maintenance`,
248
- );
249
- return;
250
- }
251
- }
252
-
253
- // Require-recovery: if there's a prior closed auto-incident for
254
- // this assignment, the check must have logged at least one healthy
255
- // run since the close before we can open another one. Without this,
256
- // an operator's manual close on a still-broken system would loop.
257
- const lastCloseAt = await findLastAutoIncidentClose({
258
- db,
259
- systemId,
260
- configurationId,
261
- });
262
- if (lastCloseAt) {
263
- const recovered = await hasHealthyRunSince({
264
- db,
265
- systemId,
266
- configurationId,
267
- since: lastCloseAt,
268
- });
269
- if (!recovered) {
270
- return;
271
- }
272
- }
273
-
274
- // Record the transition (if any) and evaluate the flapping trigger
275
- // against transitions that happened after the last close window.
276
- let flappingOpens = false;
277
- if (isTransition) {
278
- try {
279
- const count = await recordUnhealthyTransition({
280
- db,
281
- configurationId,
282
- systemId,
283
- windowMinutes: policy.flappingTrigger.windowMinutes,
284
- since: lastCloseAt,
285
- });
286
- flappingOpens = shouldOpenForFlapping({
287
- policy,
288
- recentTransitionCount: count,
289
- });
290
- } catch (error) {
291
- logger.warn(
292
- `Failed to record unhealthy transition for ${systemId}/${configurationId}:`,
293
- error,
294
- );
295
- }
296
- }
297
-
298
- // Evaluate the sustained-duration trigger on every run while the
299
- // check is unhealthy (not just on transition).
300
- let sustainedOpens = false;
301
- if (policy.sustainedUnhealthyTrigger.enabled) {
302
- const unhealthySince = await findUnhealthySince({
303
- db,
304
- configurationId,
305
- systemId,
306
- since: lastCloseAt,
307
- });
308
- if (unhealthySince) {
309
- sustainedOpens = shouldOpenForSustainedUnhealthy({
310
- policy,
311
- unhealthyForMs: Date.now() - unhealthySince.getTime(),
312
- });
313
- }
314
- }
315
-
316
- if (!flappingOpens && !sustainedOpens) return;
317
-
318
- const reason = flappingOpens
319
- ? `flapping: ≥${policy.flappingTrigger.transitions} transitions in ${policy.flappingTrigger.windowMinutes} min`
320
- : `unhealthy ≥${policy.sustainedUnhealthyTrigger.durationMinutes} min continuously`;
321
-
322
- await openAutoIncident({
323
- db,
324
- incidentClient,
325
- logger,
326
- systemId,
327
- systemName,
328
- configurationId,
329
- configurationName,
330
- policy,
331
- reason,
332
- });
333
- }
194
+ // Flapping detection no longer lives here. It moved into the automation
195
+ // engine as a windowed-count gate on the `healthcheck.system_health_changed`
196
+ // trigger (raw aggregated-health change + `filter` +
197
+ // `window: { count, minutes, refire: "once" }`). The queue executor emits only
198
+ // the raw per-system health change (via the reactive `health` entity deriver,
199
+ // unchanged); the engine does the counting.
334
200
 
335
201
  /**
336
202
  * Notify system subscribers about a health state change.
@@ -519,6 +385,21 @@ async function executeHealthCheckJob(props: {
519
385
  incidentClient: IncidentClient;
520
386
  getEmitHook: () => EmitHookFn | undefined;
521
387
  cache: HealthCheckCache;
388
+ /**
389
+ * Resolver for the reactive `health` entity handle (§10.3). Returns the
390
+ * handle once automation-backend has bound the entity store; `undefined`
391
+ * during version skew / tests. Mirrors the `getEmitHook` closure pattern.
392
+ * The entity is PLUGIN-BACKED + COMPUTED — there is no keyed store; the
393
+ * durable run/aggregate write IS the entity write (see `writeHealthEntity`).
394
+ */
395
+ getHealthEntity?: () => EntityHandle<HealthEntityState> | undefined;
396
+ /**
397
+ * Central secret resolver. When set, a collector declaring a `secretEnv`
398
+ * has it resolved + injected for this centrally-executed run; the
399
+ * collector masks the values out of its output. Optional for version-skew
400
+ * / test isolation.
401
+ */
402
+ secretResolver?: SecretResolverService;
522
403
  }): Promise<void> {
523
404
  const {
524
405
  payload,
@@ -532,13 +413,23 @@ async function executeHealthCheckJob(props: {
532
413
  maintenanceClient,
533
414
  incidentClient,
534
415
  getEmitHook,
416
+ getHealthEntity,
535
417
  cache,
418
+ secretResolver,
536
419
  } = props;
537
420
  const { configId, systemId } = payload;
538
421
 
539
422
  // Create service for aggregated state evaluation
540
423
  const service = new HealthCheckService(db, registry, collectorRegistry);
541
424
 
425
+ // Per-system serializer for the reactive health mutate (§10.3): a
426
+ // transaction-scoped advisory lock keyed `health:<systemId>` wraps the
427
+ // snapshot-prev + apply + diff + emit so concurrent evaluations of one
428
+ // system (multiple per-config jobs across pods, or at-least-once
429
+ // redelivery) can't double-emit a single logical transition. Bound to this
430
+ // job's systemId here and passed to every `writeHealthEntity` call below.
431
+ const serializeHealthWrite = createHealthEntitySerializer({ db })(systemId);
432
+
542
433
  // Capture aggregated state BEFORE this run for comparison
543
434
  const previousState = await service.getSystemHealthStatus(systemId);
544
435
  const previousStatus = previousState.status;
@@ -612,6 +503,17 @@ async function executeHealthCheckJob(props: {
612
503
  logger.debug(`Could not fetch system name for ${systemId}, using ID`);
613
504
  }
614
505
 
506
+ // Curated, read-only run-context metadata exposed to collectors.
507
+ // Metadata only - never secrets or config.
508
+ const runContext: CollectorRunContext = {
509
+ check: {
510
+ id: configId,
511
+ name: configRow.configName || configId,
512
+ intervalSeconds: configRow.interval,
513
+ },
514
+ system: { id: systemId, name: systemName },
515
+ };
516
+
615
517
  const strategy = registry.getStrategy(configRow.strategyId);
616
518
  if (!strategy) {
617
519
  logger.warn(
@@ -658,10 +560,31 @@ async function executeHealthCheckJob(props: {
658
560
  const storageKey = collectorEntry.id;
659
561
 
660
562
  try {
563
+ // Resolve the collector's declared secretEnv for THIS run
564
+ // (central execution). The collector injects it and masks the
565
+ // values out of its output. A missing required secret throws
566
+ // and fails the collector clearly.
567
+ let secretEnv: Record<string, string> | undefined;
568
+ const declared = secretEnvMappingSchema.safeParse(
569
+ (collectorEntry.config as { secretEnv?: unknown }).secretEnv,
570
+ );
571
+ if (
572
+ secretResolver &&
573
+ declared.success &&
574
+ Object.keys(declared.data).length > 0
575
+ ) {
576
+ const resolved = await secretResolver.resolveForRun({
577
+ secretEnv: declared.data,
578
+ });
579
+ secretEnv = resolved.env;
580
+ }
581
+
661
582
  const collectorResult = await registered.collector.execute({
662
583
  config: collectorEntry.config,
663
584
  client: connectedClient!.client,
664
585
  pluginId: configRow.strategyId,
586
+ runContext,
587
+ ...(secretEnv ? { secretEnv } : {}),
665
588
  });
666
589
 
667
590
  // Check for collector-level error
@@ -792,26 +715,44 @@ async function executeHealthCheckJob(props: {
792
715
  },
793
716
  };
794
717
 
795
- await db.insert(healthCheckRuns).values({
796
- configurationId: configId,
718
+ // Persist the run + aggregate THROUGH the reactive `health` entity:
719
+ // `apply` does the durable write and returns the freshly-computed view.
720
+ // The framework snapshots `prev` via `read` BEFORE this insert, so a real
721
+ // status change emits exactly one correct `ENTITY_CHANGED` (§10.3). The
722
+ // computed aggregated state is stashed for the transition/notify path.
723
+ let newState!: AggregatedHealth;
724
+ await writeHealthEntity({
725
+ handle: getHealthEntity?.(),
797
726
  systemId,
798
- status: result.status,
799
- latencyMs: result.latencyMs,
800
- result: { ...result } as Record<string, unknown>,
801
- sourceId: undefined,
802
- sourceLabel: "Local",
803
- });
727
+ apply: async () => {
728
+ await db.insert(healthCheckRuns).values({
729
+ configurationId: configId,
730
+ systemId,
731
+ status: result.status,
732
+ latencyMs: result.latencyMs,
733
+ result: { ...result } as Record<string, unknown>,
734
+ sourceId: undefined,
735
+ sourceLabel: "Local",
736
+ });
804
737
 
805
- await incrementHourlyAggregate({
806
- db,
807
- systemId,
808
- configurationId: configId,
809
- status: result.status,
810
- latencyMs: result.latencyMs,
811
- runTimestamp: new Date(),
812
- result: { ...result } as Record<string, unknown>,
813
- collectorRegistry,
814
- sourceLabel: "Local",
738
+ await incrementHourlyAggregate({
739
+ db,
740
+ systemId,
741
+ configurationId: configId,
742
+ status: result.status,
743
+ latencyMs: result.latencyMs,
744
+ runTimestamp: new Date(),
745
+ result: { ...result } as Record<string, unknown>,
746
+ collectorRegistry,
747
+ sourceLabel: "Local",
748
+ });
749
+
750
+ newState = await service.getSystemHealthStatus(systemId);
751
+ return toHealthEntityView(newState);
752
+ },
753
+ serialize: serializeHealthWrite,
754
+ onError: (error) =>
755
+ logger.warn(`Failed to mirror health entity for ${systemId}`, error),
815
756
  });
816
757
 
817
758
  logger.debug(
@@ -831,8 +772,17 @@ async function executeHealthCheckJob(props: {
831
772
  latencyMs: result.latencyMs,
832
773
  });
833
774
 
834
- const newState = await service.getSystemHealthStatus(systemId);
835
775
  if (newState.status !== previousStatus) {
776
+ // Record the aggregate transition so the sensing layer has a
777
+ // reliable "in status since" for every status (Wave 2).
778
+ await recordStateTransition({
779
+ db,
780
+ systemId,
781
+ configurationId: configId,
782
+ fromStatus: previousStatus,
783
+ toStatus: newState.status,
784
+ });
785
+
836
786
  await notifyStateChange({
837
787
  notificationClient,
838
788
  systemId,
@@ -848,23 +798,6 @@ async function executeHealthCheckJob(props: {
848
798
  });
849
799
  }
850
800
 
851
- // Per-check auto-incident: runs whether or not the aggregate
852
- // changed (a check can transition to unhealthy without flipping
853
- // the aggregate if another check is already unhealthy).
854
- await maybeOpenAutoIncidentForCheck({
855
- db,
856
- service,
857
- incidentClient,
858
- maintenanceClient,
859
- logger,
860
- systemId,
861
- systemName,
862
- configurationId: configId,
863
- configurationName: configRow.configName,
864
- previousState,
865
- newState,
866
- });
867
-
868
801
  return;
869
802
  } finally {
870
803
  if (connectedClient) {
@@ -893,28 +826,48 @@ async function executeHealthCheckJob(props: {
893
826
  },
894
827
  };
895
828
 
896
- // Store result (spread to convert structured type to plain record for jsonb)
897
- await db.insert(healthCheckRuns).values({
898
- configurationId: configId,
829
+ // Persist the run + aggregate THROUGH the reactive `health` entity on
830
+ // every run (§10.3): `apply` does the durable write (insert + hourly
831
+ // aggregate) and returns the freshly-computed view. The framework
832
+ // snapshots `prev` via the COMPUTE-ON-READ accessor BEFORE this insert, so
833
+ // an unchanged aggregate is a no-op and a real status change drives the
834
+ // directional/umbrella trigger events via `deriveHealthTriggerEvents` —
835
+ // exactly one correct `ENTITY_CHANGED` with accurate prev → next.
836
+ let newState!: AggregatedHealth;
837
+ await writeHealthEntity({
838
+ handle: getHealthEntity?.(),
899
839
  systemId,
900
- status: result.status,
901
- latencyMs: result.latencyMs,
902
- result: { ...result } as Record<string, unknown>,
903
- sourceId: undefined,
904
- sourceLabel: "Local",
905
- });
840
+ apply: async () => {
841
+ // Store result (spread to convert structured type to plain record for jsonb)
842
+ await db.insert(healthCheckRuns).values({
843
+ configurationId: configId,
844
+ systemId,
845
+ status: result.status,
846
+ latencyMs: result.latencyMs,
847
+ result: { ...result } as Record<string, unknown>,
848
+ sourceId: undefined,
849
+ sourceLabel: "Local",
850
+ });
906
851
 
907
- // Trigger incremental hourly aggregation
908
- await incrementHourlyAggregate({
909
- db,
910
- systemId,
911
- configurationId: configId,
912
- status: result.status,
913
- latencyMs: result.latencyMs,
914
- runTimestamp: new Date(),
915
- result: { ...result } as Record<string, unknown>,
916
- collectorRegistry,
917
- sourceLabel: "Local",
852
+ // Trigger incremental hourly aggregation
853
+ await incrementHourlyAggregate({
854
+ db,
855
+ systemId,
856
+ configurationId: configId,
857
+ status: result.status,
858
+ latencyMs: result.latencyMs,
859
+ runTimestamp: new Date(),
860
+ result: { ...result } as Record<string, unknown>,
861
+ collectorRegistry,
862
+ sourceLabel: "Local",
863
+ });
864
+
865
+ newState = await service.getSystemHealthStatus(systemId);
866
+ return toHealthEntityView(newState);
867
+ },
868
+ serialize: serializeHealthWrite,
869
+ onError: (error) =>
870
+ logger.warn(`Failed to mirror health entity for ${systemId}`, error),
918
871
  });
919
872
 
920
873
  logger.debug(
@@ -944,9 +897,17 @@ async function executeHealthCheckJob(props: {
944
897
  result: (result.metadata?.collectors as Record<string, unknown>) ?? undefined,
945
898
  });
946
899
 
947
- // Check if aggregated state changed and notify subscribers
948
- const newState = await service.getSystemHealthStatus(systemId);
949
900
  if (newState.status !== previousStatus) {
901
+ // Record the aggregate transition so the sensing layer has a
902
+ // reliable "in status since" for every status (Wave 2).
903
+ await recordStateTransition({
904
+ db,
905
+ systemId,
906
+ configurationId: configId,
907
+ fromStatus: previousStatus,
908
+ toStatus: newState.status,
909
+ });
910
+
950
911
  await notifyStateChange({
951
912
  notificationClient,
952
913
  systemId,
@@ -968,60 +929,13 @@ async function executeHealthCheckJob(props: {
968
929
  newStatus: newState.status,
969
930
  });
970
931
 
971
- // Emit integration hooks for external integrations
972
- const emitHook = getEmitHook();
973
- if (emitHook) {
974
- if (newState.status === "healthy" && previousStatus !== "healthy") {
975
- // Recovery: system became healthy
976
- await emitHook(healthCheckHooks.systemHealthy, {
977
- systemId,
978
- previousStatus,
979
- healthyChecks: newState.checkStatuses.filter(
980
- (c) => c.status === "healthy",
981
- ).length,
982
- totalChecks: newState.checkStatuses.length,
983
- timestamp: new Date().toISOString(),
984
- });
985
- logger.debug(
986
- `Emitted systemHealthy hook: ${previousStatus} → ${newState.status}`,
987
- );
988
- } else if (
989
- previousStatus === "healthy" &&
990
- newState.status !== "healthy"
991
- ) {
992
- // Degradation: system went from healthy to unhealthy/degraded
993
- await emitHook(healthCheckHooks.systemDegraded, {
994
- systemId,
995
- previousStatus,
996
- newStatus: newState.status,
997
- healthyChecks: newState.checkStatuses.filter(
998
- (c) => c.status === "healthy",
999
- ).length,
1000
- totalChecks: newState.checkStatuses.length,
1001
- timestamp: new Date().toISOString(),
1002
- });
1003
- logger.debug(
1004
- `Emitted systemDegraded hook: ${previousStatus} → ${newState.status}`,
1005
- );
1006
- }
1007
- }
932
+ // The directional + umbrella system-health hooks were removed in
933
+ // Phase 4 (§10.3): the `health` entity mirror above is the single
934
+ // source of truth, and its change deriver fires the
935
+ // `healthcheck.system_degraded` / `_healthy` / `_health_changed`
936
+ // trigger events through Stage-1 routing. Nothing to emit here.
1008
937
  }
1009
938
 
1010
- // Per-check auto-incident: see comment on the failed-execution path.
1011
- await maybeOpenAutoIncidentForCheck({
1012
- db,
1013
- service,
1014
- incidentClient,
1015
- maintenanceClient,
1016
- logger,
1017
- systemId,
1018
- systemName,
1019
- configurationId: configId,
1020
- configurationName: configRow.configName,
1021
- previousState,
1022
- newState,
1023
- });
1024
-
1025
939
  // Note: No manual rescheduling needed - recurring job handles it automatically
1026
940
  } catch (error) {
1027
941
  logger.error(
@@ -1029,27 +943,48 @@ async function executeHealthCheckJob(props: {
1029
943
  error,
1030
944
  );
1031
945
 
1032
- // Store failure (no latencyMs for failures)
1033
- await db.insert(healthCheckRuns).values({
1034
- configurationId: configId,
946
+ // Persist the failure run + aggregate THROUGH the reactive `health`
947
+ // entity: `apply` does the durable write and returns the freshly-computed
948
+ // view. The framework snapshots `prev` via the compute-on-read accessor
949
+ // BEFORE this insert, so a real status change emits exactly one correct
950
+ // `ENTITY_CHANGED` (§10.3). See the success path for the full rationale.
951
+ let newState!: AggregatedHealth;
952
+ await writeHealthEntity({
953
+ handle: getHealthEntity?.(),
1035
954
  systemId,
1036
- status: "unhealthy",
1037
- result: { error: String(error) } as Record<string, unknown>,
1038
- sourceId: undefined,
1039
- sourceLabel: "Local",
1040
- });
955
+ apply: async () => {
956
+ // Store failure (no latencyMs for failures)
957
+ await db.insert(healthCheckRuns).values({
958
+ configurationId: configId,
959
+ systemId,
960
+ status: "unhealthy",
961
+ result: { error: String(error) } as Record<string, unknown>,
962
+ sourceId: undefined,
963
+ sourceLabel: "Local",
964
+ });
1041
965
 
1042
- // Trigger incremental hourly aggregation
1043
- await incrementHourlyAggregate({
1044
- db,
1045
- systemId,
1046
- configurationId: configId,
1047
- status: "unhealthy",
1048
- latencyMs: undefined,
1049
- runTimestamp: new Date(),
1050
- // No collector data for error cases
1051
- collectorRegistry,
1052
- sourceLabel: "Local",
966
+ // Trigger incremental hourly aggregation
967
+ await incrementHourlyAggregate({
968
+ db,
969
+ systemId,
970
+ configurationId: configId,
971
+ status: "unhealthy",
972
+ latencyMs: undefined,
973
+ runTimestamp: new Date(),
974
+ // No collector data for error cases
975
+ collectorRegistry,
976
+ sourceLabel: "Local",
977
+ });
978
+
979
+ newState = await service.getSystemHealthStatus(systemId);
980
+ return toHealthEntityView(newState);
981
+ },
982
+ serialize: serializeHealthWrite,
983
+ onError: (mirrorError) =>
984
+ logger.warn(
985
+ `Failed to mirror health entity for ${systemId}`,
986
+ mirrorError,
987
+ ),
1053
988
  });
1054
989
 
1055
990
  // Try to fetch names for the enriched signal (best-effort)
@@ -1093,9 +1028,17 @@ async function executeHealthCheckJob(props: {
1093
1028
  result: undefined,
1094
1029
  });
1095
1030
 
1096
- // Check if aggregated state changed and notify subscribers
1097
- const newState = await service.getSystemHealthStatus(systemId);
1098
1031
  if (newState.status !== previousStatus) {
1032
+ // Record the aggregate transition so the sensing layer has a
1033
+ // reliable "in status since" for every status (Wave 2).
1034
+ await recordStateTransition({
1035
+ db,
1036
+ systemId,
1037
+ configurationId: configId,
1038
+ fromStatus: previousStatus,
1039
+ toStatus: newState.status,
1040
+ });
1041
+
1099
1042
  await notifyStateChange({
1100
1043
  notificationClient,
1101
1044
  systemId,
@@ -1117,60 +1060,13 @@ async function executeHealthCheckJob(props: {
1117
1060
  newStatus: newState.status,
1118
1061
  });
1119
1062
 
1120
- // Emit integration hooks for external integrations
1121
- const emitHook = getEmitHook();
1122
- if (emitHook) {
1123
- if (newState.status === "healthy" && previousStatus !== "healthy") {
1124
- // Recovery: system became healthy
1125
- await emitHook(healthCheckHooks.systemHealthy, {
1126
- systemId,
1127
- previousStatus,
1128
- healthyChecks: newState.checkStatuses.filter(
1129
- (c) => c.status === "healthy",
1130
- ).length,
1131
- totalChecks: newState.checkStatuses.length,
1132
- timestamp: new Date().toISOString(),
1133
- });
1134
- logger.debug(
1135
- `Emitted systemHealthy hook: ${previousStatus} → ${newState.status}`,
1136
- );
1137
- } else if (
1138
- previousStatus === "healthy" &&
1139
- newState.status !== "healthy"
1140
- ) {
1141
- // Degradation: system went from healthy to unhealthy/degraded
1142
- await emitHook(healthCheckHooks.systemDegraded, {
1143
- systemId,
1144
- previousStatus,
1145
- newStatus: newState.status,
1146
- healthyChecks: newState.checkStatuses.filter(
1147
- (c) => c.status === "healthy",
1148
- ).length,
1149
- totalChecks: newState.checkStatuses.length,
1150
- timestamp: new Date().toISOString(),
1151
- });
1152
- logger.debug(
1153
- `Emitted systemDegraded hook: ${previousStatus} → ${newState.status}`,
1154
- );
1155
- }
1156
- }
1063
+ // The directional + umbrella system-health hooks were removed in
1064
+ // Phase 4 (§10.3): the `health` entity mirror above is the single
1065
+ // source of truth, and its change deriver fires the
1066
+ // `healthcheck.system_degraded` / `_healthy` / `_health_changed`
1067
+ // trigger events through Stage-1 routing. Nothing to emit here.
1157
1068
  }
1158
1069
 
1159
- // Per-check auto-incident: see comment on the failed-execution path.
1160
- await maybeOpenAutoIncidentForCheck({
1161
- db,
1162
- service,
1163
- incidentClient,
1164
- maintenanceClient,
1165
- logger,
1166
- systemId,
1167
- systemName,
1168
- configurationId: configId,
1169
- configurationName: configName,
1170
- previousState,
1171
- newState,
1172
- });
1173
-
1174
1070
  // Note: No manual rescheduling needed - recurring job handles it automatically
1175
1071
  }
1176
1072
  }
@@ -1187,7 +1083,9 @@ export async function setupHealthCheckWorker(props: {
1187
1083
  maintenanceClient: MaintenanceClient;
1188
1084
  incidentClient: IncidentClient;
1189
1085
  getEmitHook: () => EmitHookFn | undefined;
1086
+ getHealthEntity?: () => EntityHandle<HealthEntityState> | undefined;
1190
1087
  cache: HealthCheckCache;
1088
+ secretResolver?: SecretResolverService;
1191
1089
  }): Promise<void> {
1192
1090
  const {
1193
1091
  db,
@@ -1201,7 +1099,9 @@ export async function setupHealthCheckWorker(props: {
1201
1099
  maintenanceClient,
1202
1100
  incidentClient,
1203
1101
  getEmitHook,
1102
+ getHealthEntity,
1204
1103
  cache,
1104
+ secretResolver,
1205
1105
  } = props;
1206
1106
 
1207
1107
  const queue =
@@ -1222,7 +1122,9 @@ export async function setupHealthCheckWorker(props: {
1222
1122
  maintenanceClient,
1223
1123
  incidentClient,
1224
1124
  getEmitHook,
1125
+ getHealthEntity,
1225
1126
  cache,
1127
+ secretResolver,
1226
1128
  });
1227
1129
  },
1228
1130
  {