@checkstack/healthcheck-backend 1.1.4 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,6 +8,7 @@ import {
8
8
  type BaseStrategyConfig,
9
9
  type ConnectedClient,
10
10
  type TransportClient,
11
+ type CollectorRunContext,
11
12
  } from "@checkstack/backend-api";
12
13
  import { QueueManager } from "@checkstack/queue-api";
13
14
  import {
@@ -39,6 +40,21 @@ import { HealthCheckService } from "./service";
39
40
  import { healthCheckHooks } from "./hooks";
40
41
  import { incrementHourlyAggregate } from "./realtime-aggregation";
41
42
  import type { HealthCheckCache } from "./cache";
43
+ import {
44
+ classifyTransition,
45
+ shouldNotifyTransition,
46
+ } from "./notification-policy";
47
+ import {
48
+ findLastAutoIncidentClose,
49
+ findUnhealthySince,
50
+ hasHealthyRunSince,
51
+ isMaintenanceSuppressed,
52
+ isTransitionToUnhealthy,
53
+ openAutoIncident,
54
+ recordUnhealthyTransition,
55
+ shouldOpenForFlapping,
56
+ shouldOpenForSustainedUnhealthy,
57
+ } from "./auto-incident";
42
58
 
43
59
  type Db = SafeDatabase<typeof schema>;
44
60
  type CatalogClient = InferClient<typeof CatalogApi>;
@@ -47,8 +63,13 @@ type IncidentClient = InferClient<typeof IncidentApi>;
47
63
  type NotificationClient = InferClient<typeof NotificationApi>;
48
64
 
49
65
  /**
50
- * Emit the checkCompleted hook if available.
51
- * Extracted to avoid duplicating the hook emission pattern across success/error paths.
66
+ * Emit the checkCompleted hook if available, plus the narrower
67
+ * `checkFailed` hook when the result wasn't `healthy` (so operators
68
+ * can wire a typed "trigger on failure" automation without having to
69
+ * filter `checkCompleted` themselves).
70
+ *
71
+ * Extracted to avoid duplicating the hook emission pattern across
72
+ * success/error paths.
52
73
  */
53
74
  async function emitCheckCompletedHook({
54
75
  getEmitHook,
@@ -66,14 +87,26 @@ async function emitCheckCompletedHook({
66
87
  result: Record<string, unknown> | undefined;
67
88
  }): Promise<void> {
68
89
  const emitHook = getEmitHook();
69
- if (emitHook) {
70
- await emitHook(healthCheckHooks.checkCompleted, {
90
+ if (!emitHook) return;
91
+ const timestamp = new Date().toISOString();
92
+ await emitHook(healthCheckHooks.checkCompleted, {
93
+ systemId,
94
+ configurationId,
95
+ status,
96
+ latencyMs,
97
+ result,
98
+ timestamp,
99
+ });
100
+ // Narrow follow-up — informational for automation triggers; the
101
+ // auto-incident pipeline still runs on its own thresholds.
102
+ if (status !== "healthy") {
103
+ await emitHook(healthCheckHooks.checkFailed, {
71
104
  systemId,
72
105
  configurationId,
73
106
  status,
74
107
  latencyMs,
75
108
  result,
76
- timestamp: new Date().toISOString(),
109
+ timestamp,
77
110
  });
78
111
  }
79
112
  }
@@ -87,9 +120,11 @@ export interface HealthCheckJobPayload {
87
120
  }
88
121
 
89
122
  /**
90
- * Queue name for health check execution
123
+ * Queue name for health check execution. Exported so consumers like
124
+ * the `healthcheck.run_now` automation action can enqueue a one-off
125
+ * job without re-importing the recurring-job factory.
91
126
  */
92
- const HEALTH_CHECK_QUEUE = "health-checks";
127
+ export const HEALTH_CHECK_QUEUE = "health-checks";
93
128
 
94
129
  /**
95
130
  * Worker group for health check execution (work-queue mode)
@@ -136,15 +171,245 @@ export async function scheduleHealthCheck(props: {
136
171
  });
137
172
  }
138
173
 
174
+ /**
175
+ * After every check run, evaluate the per-check auto-incident
176
+ * triggers. Either trigger can independently open an incident:
177
+ *
178
+ * - **flapping**: this just-completed run was a transition to
179
+ * unhealthy AND `N` such transitions have happened within the
180
+ * configured window.
181
+ * - **sustained**: the check is currently unhealthy AND has been so
182
+ * continuously for at least the configured duration.
183
+ *
184
+ * Both triggers honour the require-recovery rule: after the most
185
+ * recent auto-incident close (manual or auto), no new auto-incident
186
+ * opens until the check has logged at least one healthy run. This
187
+ * stops a manual close → still-unhealthy → re-open loop.
188
+ *
189
+ * Active maintenance with suppression skips both triggers when the
190
+ * policy opts in.
191
+ */
192
+ async function maybeOpenAutoIncidentForCheck(props: {
193
+ db: Db;
194
+ service: HealthCheckService;
195
+ incidentClient: IncidentClient;
196
+ maintenanceClient: MaintenanceClient;
197
+ logger: Logger;
198
+ systemId: string;
199
+ systemName: string;
200
+ configurationId: string;
201
+ configurationName: string;
202
+ /**
203
+ * Same closure-based getter the queue executor uses elsewhere; lets
204
+ * us fire the `flapping_detected` automation hook from inside the
205
+ * flapping evaluator without re-threading `emitHook` through every
206
+ * intermediate caller. Optional — when absent, the hook simply
207
+ * doesn't fire (e.g. in unit tests that don't care about it).
208
+ */
209
+ getEmitHook?: () => EmitHookFn | undefined;
210
+ previousState: {
211
+ checkStatuses: Array<{
212
+ configurationId: string;
213
+ status: HealthCheckStatus;
214
+ }>;
215
+ };
216
+ newState: {
217
+ checkStatuses: Array<{
218
+ configurationId: string;
219
+ status: HealthCheckStatus;
220
+ }>;
221
+ };
222
+ }): Promise<void> {
223
+ const {
224
+ db,
225
+ service,
226
+ incidentClient,
227
+ maintenanceClient,
228
+ logger,
229
+ systemId,
230
+ systemName,
231
+ configurationId,
232
+ configurationName,
233
+ getEmitHook,
234
+ previousState,
235
+ newState,
236
+ } = props;
237
+
238
+ const next = newState.checkStatuses.find(
239
+ (c) => c.configurationId === configurationId,
240
+ );
241
+ // Only auto-incident logic applies when the check is currently
242
+ // unhealthy — both triggers require it.
243
+ if (!next || next.status !== "unhealthy") return;
244
+
245
+ const prev = previousState.checkStatuses.find(
246
+ (c) => c.configurationId === configurationId,
247
+ );
248
+ const isTransition = isTransitionToUnhealthy(prev?.status, next.status);
249
+
250
+ let policy;
251
+ try {
252
+ policy = await service.getAssignmentNotificationPolicy({
253
+ systemId,
254
+ configurationId,
255
+ });
256
+ } catch (error) {
257
+ logger.warn(
258
+ `Failed to load policy for auto-incident decision (${systemId}/${configurationId}):`,
259
+ error,
260
+ );
261
+ return;
262
+ }
263
+
264
+ if (!policy.autoOpenIncidentOnUnhealthy) return;
265
+
266
+ // Honour active maintenance windows — operators have explicitly
267
+ // said the system is down on purpose.
268
+ if (policy.skipDuringMaintenance) {
269
+ const suppressed = await isMaintenanceSuppressed({
270
+ maintenanceClient,
271
+ systemId,
272
+ logger,
273
+ });
274
+ if (suppressed) {
275
+ logger.debug(
276
+ `Skipping auto-incident for ${systemId}/${configurationId}: active maintenance`,
277
+ );
278
+ return;
279
+ }
280
+ }
281
+
282
+ // Require-recovery: if there's a prior closed auto-incident for
283
+ // this assignment, the check must have logged at least one healthy
284
+ // run since the close before we can open another one. Without this,
285
+ // an operator's manual close on a still-broken system would loop.
286
+ const lastCloseAt = await findLastAutoIncidentClose({
287
+ db,
288
+ systemId,
289
+ configurationId,
290
+ });
291
+ if (lastCloseAt) {
292
+ const recovered = await hasHealthyRunSince({
293
+ db,
294
+ systemId,
295
+ configurationId,
296
+ since: lastCloseAt,
297
+ });
298
+ if (!recovered) {
299
+ return;
300
+ }
301
+ }
302
+
303
+ // Record the transition (if any) and evaluate the flapping trigger
304
+ // against transitions that happened after the last close window.
305
+ let flappingOpens = false;
306
+ if (isTransition) {
307
+ try {
308
+ const count = await recordUnhealthyTransition({
309
+ db,
310
+ configurationId,
311
+ systemId,
312
+ windowMinutes: policy.flappingTrigger.windowMinutes,
313
+ since: lastCloseAt,
314
+ });
315
+ flappingOpens = shouldOpenForFlapping({
316
+ policy,
317
+ recentTransitionCount: count,
318
+ });
319
+
320
+ // Fire the informational `flapping_detected` automation hook
321
+ // whether or not `shouldOpenForFlapping` decided to open. Note
322
+ // that the early returns above (auto-open disabled, maintenance,
323
+ // no recovery since last close) still prevent it from firing.
324
+ if (
325
+ policy.flappingTrigger.enabled &&
326
+ count >= policy.flappingTrigger.transitions
327
+ ) {
328
+ const emit = getEmitHook?.();
329
+ if (emit) {
330
+ try {
331
+ await emit(healthCheckHooks.flappingDetected, {
332
+ systemId,
333
+ configurationId,
334
+ transitionCount: count,
335
+ windowMinutes: policy.flappingTrigger.windowMinutes,
336
+ timestamp: new Date().toISOString(),
337
+ });
338
+ } catch (error) {
339
+ logger.warn(
340
+ `Failed to emit healthcheck.flapping_detected hook for ${systemId}/${configurationId}:`,
341
+ error,
342
+ );
343
+ }
344
+ }
345
+ }
346
+ } catch (error) {
347
+ logger.warn(
348
+ `Failed to record unhealthy transition for ${systemId}/${configurationId}:`,
349
+ error,
350
+ );
351
+ }
352
+ }
353
+
354
+ // Evaluate the sustained-duration trigger on every run while the
355
+ // check is unhealthy (not just on transition).
356
+ let sustainedOpens = false;
357
+ if (policy.sustainedUnhealthyTrigger.enabled) {
358
+ const unhealthySince = await findUnhealthySince({
359
+ db,
360
+ configurationId,
361
+ systemId,
362
+ since: lastCloseAt,
363
+ });
364
+ if (unhealthySince) {
365
+ sustainedOpens = shouldOpenForSustainedUnhealthy({
366
+ policy,
367
+ unhealthyForMs: Date.now() - unhealthySince.getTime(),
368
+ });
369
+ }
370
+ }
371
+
372
+ if (!flappingOpens && !sustainedOpens) return;
373
+
374
+ const reason = flappingOpens
375
+ ? `flapping: ≥${policy.flappingTrigger.transitions} transitions in ${policy.flappingTrigger.windowMinutes} min`
376
+ : `unhealthy ≥${policy.sustainedUnhealthyTrigger.durationMinutes} min continuously`;
377
+
378
+ await openAutoIncident({
379
+ db,
380
+ incidentClient,
381
+ logger,
382
+ systemId,
383
+ systemName,
384
+ configurationId,
385
+ configurationName,
386
+ policy,
387
+ reason,
388
+ });
389
+ }
390
+
139
391
  /**
140
392
  * Notify system subscribers about a health state change.
141
- * Skips notification if the system has active maintenance or incident with suppression enabled.
393
+ * Skips notification when:
394
+ * - the system has active maintenance/incident with suppression enabled, or
395
+ * - the policy of the check that just ran opts into de-escalation
396
+ * suppression and this transition is a de-escalation (e.g.
397
+ * `unhealthy → degraded`).
398
+ *
399
+ * For non-recovery transitions, the action CTA is deep-linked to the
400
+ * failing-checks filter so operators land directly on the problem.
401
+ *
402
+ * Policy is resolved per-assignment (per system+configuration) — the
403
+ * just-ran check is the one driving any aggregate transition in this
404
+ * execution, so its policy is the authoritative one.
142
405
  */
143
406
  async function notifyStateChange(props: {
144
407
  systemId: string;
145
408
  systemName: string;
409
+ configurationId: string;
146
410
  previousStatus: HealthCheckStatus;
147
411
  newStatus: HealthCheckStatus;
412
+ service: HealthCheckService;
148
413
  catalogClient: CatalogClient;
149
414
  notificationClient: NotificationClient;
150
415
  maintenanceClient: MaintenanceClient;
@@ -154,8 +419,10 @@ async function notifyStateChange(props: {
154
419
  const {
155
420
  systemId,
156
421
  systemName,
422
+ configurationId,
157
423
  previousStatus,
158
424
  newStatus,
425
+ service,
159
426
  catalogClient,
160
427
  notificationClient,
161
428
  maintenanceClient,
@@ -163,8 +430,31 @@ async function notifyStateChange(props: {
163
430
  logger,
164
431
  } = props;
165
432
 
166
- // Only notify on actual state changes
167
- if (newStatus === previousStatus) {
433
+ const transition = classifyTransition(previousStatus, newStatus);
434
+ if (transition === "none") {
435
+ return;
436
+ }
437
+
438
+ // Per-assignment notification policy. Failure to load defaults to
439
+ // "notify everything" rather than dropping the notification.
440
+ let suppressDeEscalations = false;
441
+ try {
442
+ const policy = await service.getAssignmentNotificationPolicy({
443
+ systemId,
444
+ configurationId,
445
+ });
446
+ suppressDeEscalations = policy.suppressDeEscalations;
447
+ } catch (error) {
448
+ logger.warn(
449
+ `Failed to load notification policy for ${systemId}/${configurationId}, applying defaults:`,
450
+ error,
451
+ );
452
+ }
453
+
454
+ if (!shouldNotifyTransition(transition, { suppressDeEscalations })) {
455
+ logger.debug(
456
+ `Skipping notification for ${systemId}: ${transition} suppressed by policy`,
457
+ );
168
458
  return;
169
459
  }
170
460
 
@@ -204,36 +494,38 @@ async function notifyStateChange(props: {
204
494
  );
205
495
  }
206
496
 
207
- const isRecovery = newStatus === "healthy" && previousStatus !== "healthy";
208
- const isDegraded = newStatus === "degraded";
209
- const isUnhealthy = newStatus === "unhealthy";
210
-
211
497
  let title: string;
212
498
  let body: string;
213
499
  let importance: "info" | "warning" | "critical";
214
500
 
215
- if (isRecovery) {
501
+ if (transition === "recovery") {
216
502
  title = `System health restored: ${systemName}`;
217
503
  body =
218
504
  `All health checks for **${systemName}** are now passing. The system has returned to normal operation.`;
219
505
  importance = "info";
220
- } else if (isUnhealthy) {
506
+ } else if (newStatus === "unhealthy") {
221
507
  title = `System health critical: ${systemName}`;
222
508
  body = `Health checks indicate **${systemName}** is unhealthy and may be down.`;
223
509
  importance = "critical";
224
- } else if (isDegraded) {
510
+ } else {
511
+ // degraded — either an escalation from healthy or a partial recovery
225
512
  title = `System health degraded: ${systemName}`;
226
513
  body =
227
514
  `Some health checks for **${systemName}** are failing. The system may be experiencing issues.`;
228
515
  importance = "warning";
229
- } else {
230
- // No notification for healthy → healthy (if somehow missed above)
231
- return;
232
516
  }
233
517
 
234
518
  const systemDetailPath = resolveRoute(catalogRoutes.routes.systemDetail, {
235
519
  systemId,
236
520
  });
521
+ // Recovery lands on the default (all) view; failing transitions deep-link
522
+ // operators into the failing-checks filter so they can debug immediately.
523
+ const actionUrl =
524
+ transition === "recovery"
525
+ ? systemDetailPath
526
+ : `${systemDetailPath}?filter=failing`;
527
+ const actionLabel =
528
+ transition === "recovery" ? "View System" : "View failing checks";
237
529
 
238
530
  void catalogClient; // parents are resolved server-side via stored target edges
239
531
 
@@ -244,7 +536,7 @@ async function notifyStateChange(props: {
244
536
  title,
245
537
  body,
246
538
  importance,
247
- action: { label: "View System", url: systemDetailPath },
539
+ action: { label: actionLabel, url: actionUrl },
248
540
  collapseKey: systemHealthCollapseKey(systemId),
249
541
  subjects: [
250
542
  createSystemSubject({
@@ -376,6 +668,17 @@ async function executeHealthCheckJob(props: {
376
668
  logger.debug(`Could not fetch system name for ${systemId}, using ID`);
377
669
  }
378
670
 
671
+ // Curated, read-only run-context metadata exposed to collectors.
672
+ // Metadata only - never secrets or config.
673
+ const runContext: CollectorRunContext = {
674
+ check: {
675
+ id: configId,
676
+ name: configRow.configName || configId,
677
+ intervalSeconds: configRow.interval,
678
+ },
679
+ system: { id: systemId, name: systemName },
680
+ };
681
+
379
682
  const strategy = registry.getStrategy(configRow.strategyId);
380
683
  if (!strategy) {
381
684
  logger.warn(
@@ -426,6 +729,7 @@ async function executeHealthCheckJob(props: {
426
729
  config: collectorEntry.config,
427
730
  client: connectedClient!.client,
428
731
  pluginId: configRow.strategyId,
732
+ runContext,
429
733
  });
430
734
 
431
735
  // Check for collector-level error
@@ -598,11 +902,13 @@ async function executeHealthCheckJob(props: {
598
902
  const newState = await service.getSystemHealthStatus(systemId);
599
903
  if (newState.status !== previousStatus) {
600
904
  await notifyStateChange({
601
- notificationClient,
905
+ notificationClient,
602
906
  systemId,
603
907
  systemName,
908
+ configurationId: configId,
604
909
  previousStatus,
605
910
  newStatus: newState.status,
911
+ service,
606
912
  catalogClient,
607
913
  maintenanceClient,
608
914
  incidentClient,
@@ -610,6 +916,24 @@ async function executeHealthCheckJob(props: {
610
916
  });
611
917
  }
612
918
 
919
+ // Per-check auto-incident: runs whether or not the aggregate
920
+ // changed (a check can transition to unhealthy without flipping
921
+ // the aggregate if another check is already unhealthy).
922
+ await maybeOpenAutoIncidentForCheck({
923
+ db,
924
+ service,
925
+ incidentClient,
926
+ maintenanceClient,
927
+ logger,
928
+ systemId,
929
+ systemName,
930
+ configurationId: configId,
931
+ configurationName: configRow.configName,
932
+ getEmitHook,
933
+ previousState,
934
+ newState,
935
+ });
936
+
613
937
  return;
614
938
  } finally {
615
939
  if (connectedClient) {
@@ -696,8 +1020,10 @@ async function executeHealthCheckJob(props: {
696
1020
  notificationClient,
697
1021
  systemId,
698
1022
  systemName,
1023
+ configurationId: configId,
699
1024
  previousStatus,
700
1025
  newStatus: newState.status,
1026
+ service,
701
1027
  catalogClient,
702
1028
  maintenanceClient,
703
1029
  incidentClient,
@@ -714,16 +1040,20 @@ async function executeHealthCheckJob(props: {
714
1040
  // Emit integration hooks for external integrations
715
1041
  const emitHook = getEmitHook();
716
1042
  if (emitHook) {
1043
+ const healthyChecks = newState.checkStatuses.filter(
1044
+ (c) => c.status === "healthy",
1045
+ ).length;
1046
+ const totalChecks = newState.checkStatuses.length;
1047
+ const timestamp = new Date().toISOString();
1048
+
717
1049
  if (newState.status === "healthy" && previousStatus !== "healthy") {
718
1050
  // Recovery: system became healthy
719
1051
  await emitHook(healthCheckHooks.systemHealthy, {
720
1052
  systemId,
721
1053
  previousStatus,
722
- healthyChecks: newState.checkStatuses.filter(
723
- (c) => c.status === "healthy",
724
- ).length,
725
- totalChecks: newState.checkStatuses.length,
726
- timestamp: new Date().toISOString(),
1054
+ healthyChecks,
1055
+ totalChecks,
1056
+ timestamp,
727
1057
  });
728
1058
  logger.debug(
729
1059
  `Emitted systemHealthy hook: ${previousStatus} → ${newState.status}`,
@@ -737,19 +1067,47 @@ async function executeHealthCheckJob(props: {
737
1067
  systemId,
738
1068
  previousStatus,
739
1069
  newStatus: newState.status,
740
- healthyChecks: newState.checkStatuses.filter(
741
- (c) => c.status === "healthy",
742
- ).length,
743
- totalChecks: newState.checkStatuses.length,
744
- timestamp: new Date().toISOString(),
1070
+ healthyChecks,
1071
+ totalChecks,
1072
+ timestamp,
745
1073
  });
746
1074
  logger.debug(
747
1075
  `Emitted systemDegraded hook: ${previousStatus} → ${newState.status}`,
748
1076
  );
749
1077
  }
1078
+
1079
+ // Umbrella hook — fires on every transition. Emitted alongside
1080
+ // the directional hooks so existing subscribers stay unchanged
1081
+ // while new automation triggers can react to any change.
1082
+ if (previousStatus !== newState.status) {
1083
+ await emitHook(healthCheckHooks.systemHealthChanged, {
1084
+ systemId,
1085
+ previousStatus,
1086
+ newStatus: newState.status,
1087
+ healthyChecks,
1088
+ totalChecks,
1089
+ timestamp,
1090
+ });
1091
+ }
750
1092
  }
751
1093
  }
752
1094
 
1095
+ // Per-check auto-incident: see comment on the failed-execution path.
1096
+ await maybeOpenAutoIncidentForCheck({
1097
+ db,
1098
+ service,
1099
+ incidentClient,
1100
+ maintenanceClient,
1101
+ logger,
1102
+ systemId,
1103
+ systemName,
1104
+ configurationId: configId,
1105
+ configurationName: configRow.configName,
1106
+ getEmitHook,
1107
+ previousState,
1108
+ newState,
1109
+ });
1110
+
753
1111
  // Note: No manual rescheduling needed - recurring job handles it automatically
754
1112
  } catch (error) {
755
1113
  logger.error(
@@ -828,8 +1186,10 @@ async function executeHealthCheckJob(props: {
828
1186
  notificationClient,
829
1187
  systemId,
830
1188
  systemName,
1189
+ configurationId: configId,
831
1190
  previousStatus,
832
1191
  newStatus: newState.status,
1192
+ service,
833
1193
  catalogClient,
834
1194
  maintenanceClient,
835
1195
  incidentClient,
@@ -846,16 +1206,20 @@ async function executeHealthCheckJob(props: {
846
1206
  // Emit integration hooks for external integrations
847
1207
  const emitHook = getEmitHook();
848
1208
  if (emitHook) {
1209
+ const healthyChecks = newState.checkStatuses.filter(
1210
+ (c) => c.status === "healthy",
1211
+ ).length;
1212
+ const totalChecks = newState.checkStatuses.length;
1213
+ const timestamp = new Date().toISOString();
1214
+
849
1215
  if (newState.status === "healthy" && previousStatus !== "healthy") {
850
1216
  // Recovery: system became healthy
851
1217
  await emitHook(healthCheckHooks.systemHealthy, {
852
1218
  systemId,
853
1219
  previousStatus,
854
- healthyChecks: newState.checkStatuses.filter(
855
- (c) => c.status === "healthy",
856
- ).length,
857
- totalChecks: newState.checkStatuses.length,
858
- timestamp: new Date().toISOString(),
1220
+ healthyChecks,
1221
+ totalChecks,
1222
+ timestamp,
859
1223
  });
860
1224
  logger.debug(
861
1225
  `Emitted systemHealthy hook: ${previousStatus} → ${newState.status}`,
@@ -869,19 +1233,47 @@ async function executeHealthCheckJob(props: {
869
1233
  systemId,
870
1234
  previousStatus,
871
1235
  newStatus: newState.status,
872
- healthyChecks: newState.checkStatuses.filter(
873
- (c) => c.status === "healthy",
874
- ).length,
875
- totalChecks: newState.checkStatuses.length,
876
- timestamp: new Date().toISOString(),
1236
+ healthyChecks,
1237
+ totalChecks,
1238
+ timestamp,
877
1239
  });
878
1240
  logger.debug(
879
1241
  `Emitted systemDegraded hook: ${previousStatus} → ${newState.status}`,
880
1242
  );
881
1243
  }
1244
+
1245
+ // Umbrella hook — fires on every transition. Emitted alongside
1246
+ // the directional hooks so existing subscribers stay unchanged
1247
+ // while new automation triggers can react to any change.
1248
+ if (previousStatus !== newState.status) {
1249
+ await emitHook(healthCheckHooks.systemHealthChanged, {
1250
+ systemId,
1251
+ previousStatus,
1252
+ newStatus: newState.status,
1253
+ healthyChecks,
1254
+ totalChecks,
1255
+ timestamp,
1256
+ });
1257
+ }
882
1258
  }
883
1259
  }
884
1260
 
1261
+ // Per-check auto-incident: see comment on the failed-execution path.
1262
+ await maybeOpenAutoIncidentForCheck({
1263
+ db,
1264
+ service,
1265
+ incidentClient,
1266
+ maintenanceClient,
1267
+ logger,
1268
+ systemId,
1269
+ systemName,
1270
+ configurationId: configId,
1271
+ configurationName: configName,
1272
+ getEmitHook,
1273
+ previousState,
1274
+ newState,
1275
+ });
1276
+
885
1277
  // Note: No manual rescheduling needed - recurring job handles it automatically
886
1278
  }
887
1279
  }
@@ -62,6 +62,16 @@ describe("HealthCheck Router", () => {
62
62
  getProvenance: mock<any>(() => Promise.resolve(null)),
63
63
  };
64
64
 
65
+ const mockConfigService = {
66
+ get: mock(async () => undefined),
67
+ set: mock(async () => {}),
68
+ getRedacted: mock(async () => undefined),
69
+ };
70
+
71
+ const mockCatalogClient = {
72
+ getSystem: mock(async () => null),
73
+ };
74
+
65
75
  const router = createHealthCheckRouter({
66
76
  database: mockDb as never,
67
77
  registry: mockRegistry,
@@ -69,6 +79,8 @@ describe("HealthCheck Router", () => {
69
79
  gitOpsClient: mockGitOpsClient as never,
70
80
  getEmitHook: () => undefined,
71
81
  cache: passthroughCache,
82
+ configService: mockConfigService as never,
83
+ catalogClient: mockCatalogClient as never,
72
84
  });
73
85
 
74
86
  it("getStrategies returns strategies from registry", async () => {