@checkstack/healthcheck-backend 1.1.4 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +256 -0
- package/drizzle/0012_fair_boomer.sql +1 -0
- package/drizzle/0013_clean_fabian_cortez.sql +20 -0
- package/drizzle/0014_chilly_ultragirl.sql +2 -0
- package/drizzle/meta/0012_snapshot.json +447 -0
- package/drizzle/meta/0013_snapshot.json +615 -0
- package/drizzle/meta/0014_snapshot.json +648 -0
- package/drizzle/meta/_journal.json +21 -0
- package/package.json +21 -20
- package/src/auto-incident-close-job.ts +164 -0
- package/src/auto-incident.test.ts +196 -0
- package/src/auto-incident.ts +332 -0
- package/src/automations.test.ts +255 -0
- package/src/automations.ts +340 -0
- package/src/healthcheck-gitops-kinds.test.ts +93 -0
- package/src/healthcheck-gitops-kinds.ts +34 -0
- package/src/hooks.ts +69 -4
- package/src/index.ts +80 -52
- package/src/notification-defaults-config.ts +10 -0
- package/src/notification-policy.test.ts +104 -0
- package/src/notification-policy.ts +56 -0
- package/src/queue-executor.test.ts +137 -0
- package/src/queue-executor.ts +434 -42
- package/src/router.test.ts +12 -0
- package/src/router.ts +30 -2
- package/src/schema.ts +76 -0
- package/src/service-assignments.test.ts +184 -0
- package/src/service-notification-policy.test.ts +174 -0
- package/src/service.ts +195 -1
- package/tsconfig.json +5 -2
package/src/queue-executor.ts
CHANGED
|
@@ -8,6 +8,7 @@ import {
|
|
|
8
8
|
type BaseStrategyConfig,
|
|
9
9
|
type ConnectedClient,
|
|
10
10
|
type TransportClient,
|
|
11
|
+
type CollectorRunContext,
|
|
11
12
|
} from "@checkstack/backend-api";
|
|
12
13
|
import { QueueManager } from "@checkstack/queue-api";
|
|
13
14
|
import {
|
|
@@ -39,6 +40,21 @@ import { HealthCheckService } from "./service";
|
|
|
39
40
|
import { healthCheckHooks } from "./hooks";
|
|
40
41
|
import { incrementHourlyAggregate } from "./realtime-aggregation";
|
|
41
42
|
import type { HealthCheckCache } from "./cache";
|
|
43
|
+
import {
|
|
44
|
+
classifyTransition,
|
|
45
|
+
shouldNotifyTransition,
|
|
46
|
+
} from "./notification-policy";
|
|
47
|
+
import {
|
|
48
|
+
findLastAutoIncidentClose,
|
|
49
|
+
findUnhealthySince,
|
|
50
|
+
hasHealthyRunSince,
|
|
51
|
+
isMaintenanceSuppressed,
|
|
52
|
+
isTransitionToUnhealthy,
|
|
53
|
+
openAutoIncident,
|
|
54
|
+
recordUnhealthyTransition,
|
|
55
|
+
shouldOpenForFlapping,
|
|
56
|
+
shouldOpenForSustainedUnhealthy,
|
|
57
|
+
} from "./auto-incident";
|
|
42
58
|
|
|
43
59
|
type Db = SafeDatabase<typeof schema>;
|
|
44
60
|
type CatalogClient = InferClient<typeof CatalogApi>;
|
|
@@ -47,8 +63,13 @@ type IncidentClient = InferClient<typeof IncidentApi>;
|
|
|
47
63
|
type NotificationClient = InferClient<typeof NotificationApi>;
|
|
48
64
|
|
|
49
65
|
/**
|
|
50
|
-
* Emit the checkCompleted hook if available
|
|
51
|
-
*
|
|
66
|
+
* Emit the checkCompleted hook if available, plus the narrower
|
|
67
|
+
* `checkFailed` hook when the result wasn't `healthy` (so operators
|
|
68
|
+
* can wire a typed "trigger on failure" automation without having to
|
|
69
|
+
* filter `checkCompleted` themselves).
|
|
70
|
+
*
|
|
71
|
+
* Extracted to avoid duplicating the hook emission pattern across
|
|
72
|
+
* success/error paths.
|
|
52
73
|
*/
|
|
53
74
|
async function emitCheckCompletedHook({
|
|
54
75
|
getEmitHook,
|
|
@@ -66,14 +87,26 @@ async function emitCheckCompletedHook({
|
|
|
66
87
|
result: Record<string, unknown> | undefined;
|
|
67
88
|
}): Promise<void> {
|
|
68
89
|
const emitHook = getEmitHook();
|
|
69
|
-
if (emitHook)
|
|
70
|
-
|
|
90
|
+
if (!emitHook) return;
|
|
91
|
+
const timestamp = new Date().toISOString();
|
|
92
|
+
await emitHook(healthCheckHooks.checkCompleted, {
|
|
93
|
+
systemId,
|
|
94
|
+
configurationId,
|
|
95
|
+
status,
|
|
96
|
+
latencyMs,
|
|
97
|
+
result,
|
|
98
|
+
timestamp,
|
|
99
|
+
});
|
|
100
|
+
// Narrow follow-up — informational for automation triggers; the
|
|
101
|
+
// auto-incident pipeline still runs on its own thresholds.
|
|
102
|
+
if (status !== "healthy") {
|
|
103
|
+
await emitHook(healthCheckHooks.checkFailed, {
|
|
71
104
|
systemId,
|
|
72
105
|
configurationId,
|
|
73
106
|
status,
|
|
74
107
|
latencyMs,
|
|
75
108
|
result,
|
|
76
|
-
timestamp
|
|
109
|
+
timestamp,
|
|
77
110
|
});
|
|
78
111
|
}
|
|
79
112
|
}
|
|
@@ -87,9 +120,11 @@ export interface HealthCheckJobPayload {
|
|
|
87
120
|
}
|
|
88
121
|
|
|
89
122
|
/**
|
|
90
|
-
* Queue name for health check execution
|
|
123
|
+
* Queue name for health check execution. Exported so consumers like
|
|
124
|
+
* the `healthcheck.run_now` automation action can enqueue a one-off
|
|
125
|
+
* job without re-importing the recurring-job factory.
|
|
91
126
|
*/
|
|
92
|
-
const HEALTH_CHECK_QUEUE = "health-checks";
|
|
127
|
+
export const HEALTH_CHECK_QUEUE = "health-checks";
|
|
93
128
|
|
|
94
129
|
/**
|
|
95
130
|
* Worker group for health check execution (work-queue mode)
|
|
@@ -136,15 +171,245 @@ export async function scheduleHealthCheck(props: {
|
|
|
136
171
|
});
|
|
137
172
|
}
|
|
138
173
|
|
|
174
|
+
/**
|
|
175
|
+
* After every check run, evaluate the per-check auto-incident
|
|
176
|
+
* triggers. Either trigger can independently open an incident:
|
|
177
|
+
*
|
|
178
|
+
* - **flapping**: this just-completed run was a transition to
|
|
179
|
+
* unhealthy AND `N` such transitions have happened within the
|
|
180
|
+
* configured window.
|
|
181
|
+
* - **sustained**: the check is currently unhealthy AND has been so
|
|
182
|
+
* continuously for at least the configured duration.
|
|
183
|
+
*
|
|
184
|
+
* Both triggers honour the require-recovery rule: after the most
|
|
185
|
+
* recent auto-incident close (manual or auto), no new auto-incident
|
|
186
|
+
* opens until the check has logged at least one healthy run. This
|
|
187
|
+
* stops a manual close → still-unhealthy → re-open loop.
|
|
188
|
+
*
|
|
189
|
+
* Active maintenance with suppression skips both triggers when the
|
|
190
|
+
* policy opts in.
|
|
191
|
+
*/
|
|
192
|
+
async function maybeOpenAutoIncidentForCheck(props: {
|
|
193
|
+
db: Db;
|
|
194
|
+
service: HealthCheckService;
|
|
195
|
+
incidentClient: IncidentClient;
|
|
196
|
+
maintenanceClient: MaintenanceClient;
|
|
197
|
+
logger: Logger;
|
|
198
|
+
systemId: string;
|
|
199
|
+
systemName: string;
|
|
200
|
+
configurationId: string;
|
|
201
|
+
configurationName: string;
|
|
202
|
+
/**
|
|
203
|
+
* Same closure-based getter the queue executor uses elsewhere; lets
|
|
204
|
+
* us fire the `flapping_detected` automation hook from inside the
|
|
205
|
+
* flapping evaluator without re-threading `emitHook` through every
|
|
206
|
+
* intermediate caller. Optional — when absent, the hook simply
|
|
207
|
+
* doesn't fire (e.g. in unit tests that don't care about it).
|
|
208
|
+
*/
|
|
209
|
+
getEmitHook?: () => EmitHookFn | undefined;
|
|
210
|
+
previousState: {
|
|
211
|
+
checkStatuses: Array<{
|
|
212
|
+
configurationId: string;
|
|
213
|
+
status: HealthCheckStatus;
|
|
214
|
+
}>;
|
|
215
|
+
};
|
|
216
|
+
newState: {
|
|
217
|
+
checkStatuses: Array<{
|
|
218
|
+
configurationId: string;
|
|
219
|
+
status: HealthCheckStatus;
|
|
220
|
+
}>;
|
|
221
|
+
};
|
|
222
|
+
}): Promise<void> {
|
|
223
|
+
const {
|
|
224
|
+
db,
|
|
225
|
+
service,
|
|
226
|
+
incidentClient,
|
|
227
|
+
maintenanceClient,
|
|
228
|
+
logger,
|
|
229
|
+
systemId,
|
|
230
|
+
systemName,
|
|
231
|
+
configurationId,
|
|
232
|
+
configurationName,
|
|
233
|
+
getEmitHook,
|
|
234
|
+
previousState,
|
|
235
|
+
newState,
|
|
236
|
+
} = props;
|
|
237
|
+
|
|
238
|
+
const next = newState.checkStatuses.find(
|
|
239
|
+
(c) => c.configurationId === configurationId,
|
|
240
|
+
);
|
|
241
|
+
// Only auto-incident logic applies when the check is currently
|
|
242
|
+
// unhealthy — both triggers require it.
|
|
243
|
+
if (!next || next.status !== "unhealthy") return;
|
|
244
|
+
|
|
245
|
+
const prev = previousState.checkStatuses.find(
|
|
246
|
+
(c) => c.configurationId === configurationId,
|
|
247
|
+
);
|
|
248
|
+
const isTransition = isTransitionToUnhealthy(prev?.status, next.status);
|
|
249
|
+
|
|
250
|
+
let policy;
|
|
251
|
+
try {
|
|
252
|
+
policy = await service.getAssignmentNotificationPolicy({
|
|
253
|
+
systemId,
|
|
254
|
+
configurationId,
|
|
255
|
+
});
|
|
256
|
+
} catch (error) {
|
|
257
|
+
logger.warn(
|
|
258
|
+
`Failed to load policy for auto-incident decision (${systemId}/${configurationId}):`,
|
|
259
|
+
error,
|
|
260
|
+
);
|
|
261
|
+
return;
|
|
262
|
+
}
|
|
263
|
+
|
|
264
|
+
if (!policy.autoOpenIncidentOnUnhealthy) return;
|
|
265
|
+
|
|
266
|
+
// Honour active maintenance windows — operators have explicitly
|
|
267
|
+
// said the system is down on purpose.
|
|
268
|
+
if (policy.skipDuringMaintenance) {
|
|
269
|
+
const suppressed = await isMaintenanceSuppressed({
|
|
270
|
+
maintenanceClient,
|
|
271
|
+
systemId,
|
|
272
|
+
logger,
|
|
273
|
+
});
|
|
274
|
+
if (suppressed) {
|
|
275
|
+
logger.debug(
|
|
276
|
+
`Skipping auto-incident for ${systemId}/${configurationId}: active maintenance`,
|
|
277
|
+
);
|
|
278
|
+
return;
|
|
279
|
+
}
|
|
280
|
+
}
|
|
281
|
+
|
|
282
|
+
// Require-recovery: if there's a prior closed auto-incident for
|
|
283
|
+
// this assignment, the check must have logged at least one healthy
|
|
284
|
+
// run since the close before we can open another one. Without this,
|
|
285
|
+
// an operator's manual close on a still-broken system would loop.
|
|
286
|
+
const lastCloseAt = await findLastAutoIncidentClose({
|
|
287
|
+
db,
|
|
288
|
+
systemId,
|
|
289
|
+
configurationId,
|
|
290
|
+
});
|
|
291
|
+
if (lastCloseAt) {
|
|
292
|
+
const recovered = await hasHealthyRunSince({
|
|
293
|
+
db,
|
|
294
|
+
systemId,
|
|
295
|
+
configurationId,
|
|
296
|
+
since: lastCloseAt,
|
|
297
|
+
});
|
|
298
|
+
if (!recovered) {
|
|
299
|
+
return;
|
|
300
|
+
}
|
|
301
|
+
}
|
|
302
|
+
|
|
303
|
+
// Record the transition (if any) and evaluate the flapping trigger
|
|
304
|
+
// against transitions that happened after the last close window.
|
|
305
|
+
let flappingOpens = false;
|
|
306
|
+
if (isTransition) {
|
|
307
|
+
try {
|
|
308
|
+
const count = await recordUnhealthyTransition({
|
|
309
|
+
db,
|
|
310
|
+
configurationId,
|
|
311
|
+
systemId,
|
|
312
|
+
windowMinutes: policy.flappingTrigger.windowMinutes,
|
|
313
|
+
since: lastCloseAt,
|
|
314
|
+
});
|
|
315
|
+
flappingOpens = shouldOpenForFlapping({
|
|
316
|
+
policy,
|
|
317
|
+
recentTransitionCount: count,
|
|
318
|
+
});
|
|
319
|
+
|
|
320
|
+
// Fire the informational `flapping_detected` automation hook
|
|
321
|
+
// independently of the flapping auto-incident decision: an operator
|
|
322
|
+
// may care about flapping itself. NOTE(review): the early returns above
|
|
323
|
+
// (e.g. auto-incident turned off) still skip this hook — confirm intent.
|
|
324
|
+
if (
|
|
325
|
+
policy.flappingTrigger.enabled &&
|
|
326
|
+
count >= policy.flappingTrigger.transitions
|
|
327
|
+
) {
|
|
328
|
+
const emit = getEmitHook?.();
|
|
329
|
+
if (emit) {
|
|
330
|
+
try {
|
|
331
|
+
await emit(healthCheckHooks.flappingDetected, {
|
|
332
|
+
systemId,
|
|
333
|
+
configurationId,
|
|
334
|
+
transitionCount: count,
|
|
335
|
+
windowMinutes: policy.flappingTrigger.windowMinutes,
|
|
336
|
+
timestamp: new Date().toISOString(),
|
|
337
|
+
});
|
|
338
|
+
} catch (error) {
|
|
339
|
+
logger.warn(
|
|
340
|
+
`Failed to emit healthcheck.flapping_detected hook for ${systemId}/${configurationId}:`,
|
|
341
|
+
error,
|
|
342
|
+
);
|
|
343
|
+
}
|
|
344
|
+
}
|
|
345
|
+
}
|
|
346
|
+
} catch (error) {
|
|
347
|
+
logger.warn(
|
|
348
|
+
`Failed to record unhealthy transition for ${systemId}/${configurationId}:`,
|
|
349
|
+
error,
|
|
350
|
+
);
|
|
351
|
+
}
|
|
352
|
+
}
|
|
353
|
+
|
|
354
|
+
// Evaluate the sustained-duration trigger on every run while the
|
|
355
|
+
// check is unhealthy (not just on transition).
|
|
356
|
+
let sustainedOpens = false;
|
|
357
|
+
if (policy.sustainedUnhealthyTrigger.enabled) {
|
|
358
|
+
const unhealthySince = await findUnhealthySince({
|
|
359
|
+
db,
|
|
360
|
+
configurationId,
|
|
361
|
+
systemId,
|
|
362
|
+
since: lastCloseAt,
|
|
363
|
+
});
|
|
364
|
+
if (unhealthySince) {
|
|
365
|
+
sustainedOpens = shouldOpenForSustainedUnhealthy({
|
|
366
|
+
policy,
|
|
367
|
+
unhealthyForMs: Date.now() - unhealthySince.getTime(),
|
|
368
|
+
});
|
|
369
|
+
}
|
|
370
|
+
}
|
|
371
|
+
|
|
372
|
+
if (!flappingOpens && !sustainedOpens) return;
|
|
373
|
+
|
|
374
|
+
const reason = flappingOpens
|
|
375
|
+
? `flapping: ≥${policy.flappingTrigger.transitions} transitions in ${policy.flappingTrigger.windowMinutes} min`
|
|
376
|
+
: `unhealthy ≥${policy.sustainedUnhealthyTrigger.durationMinutes} min continuously`;
|
|
377
|
+
|
|
378
|
+
await openAutoIncident({
|
|
379
|
+
db,
|
|
380
|
+
incidentClient,
|
|
381
|
+
logger,
|
|
382
|
+
systemId,
|
|
383
|
+
systemName,
|
|
384
|
+
configurationId,
|
|
385
|
+
configurationName,
|
|
386
|
+
policy,
|
|
387
|
+
reason,
|
|
388
|
+
});
|
|
389
|
+
}
|
|
390
|
+
|
|
139
391
|
/**
|
|
140
392
|
* Notify system subscribers about a health state change.
|
|
141
|
-
* Skips notification
|
|
393
|
+
* Skips notification when:
|
|
394
|
+
* - the system has active maintenance/incident with suppression enabled, or
|
|
395
|
+
* - the policy of the check that just ran opts into de-escalation
|
|
396
|
+
* suppression and this transition is a de-escalation (e.g.
|
|
397
|
+
* `unhealthy → degraded`).
|
|
398
|
+
*
|
|
399
|
+
* For non-recovery transitions, the action CTA is deep-linked to the
|
|
400
|
+
* failing-checks filter so operators land directly on the problem.
|
|
401
|
+
*
|
|
402
|
+
* Policy is resolved per-assignment (per system+configuration) — the
|
|
403
|
+
* just-ran check is the one driving any aggregate transition in this
|
|
404
|
+
* execution, so its policy is the authoritative one.
|
|
142
405
|
*/
|
|
143
406
|
async function notifyStateChange(props: {
|
|
144
407
|
systemId: string;
|
|
145
408
|
systemName: string;
|
|
409
|
+
configurationId: string;
|
|
146
410
|
previousStatus: HealthCheckStatus;
|
|
147
411
|
newStatus: HealthCheckStatus;
|
|
412
|
+
service: HealthCheckService;
|
|
148
413
|
catalogClient: CatalogClient;
|
|
149
414
|
notificationClient: NotificationClient;
|
|
150
415
|
maintenanceClient: MaintenanceClient;
|
|
@@ -154,8 +419,10 @@ async function notifyStateChange(props: {
|
|
|
154
419
|
const {
|
|
155
420
|
systemId,
|
|
156
421
|
systemName,
|
|
422
|
+
configurationId,
|
|
157
423
|
previousStatus,
|
|
158
424
|
newStatus,
|
|
425
|
+
service,
|
|
159
426
|
catalogClient,
|
|
160
427
|
notificationClient,
|
|
161
428
|
maintenanceClient,
|
|
@@ -163,8 +430,31 @@ async function notifyStateChange(props: {
|
|
|
163
430
|
logger,
|
|
164
431
|
} = props;
|
|
165
432
|
|
|
166
|
-
|
|
167
|
-
if (
|
|
433
|
+
const transition = classifyTransition(previousStatus, newStatus);
|
|
434
|
+
if (transition === "none") {
|
|
435
|
+
return;
|
|
436
|
+
}
|
|
437
|
+
|
|
438
|
+
// Per-assignment notification policy. Failure to load defaults to
|
|
439
|
+
// "notify everything" rather than dropping the notification.
|
|
440
|
+
let suppressDeEscalations = false;
|
|
441
|
+
try {
|
|
442
|
+
const policy = await service.getAssignmentNotificationPolicy({
|
|
443
|
+
systemId,
|
|
444
|
+
configurationId,
|
|
445
|
+
});
|
|
446
|
+
suppressDeEscalations = policy.suppressDeEscalations;
|
|
447
|
+
} catch (error) {
|
|
448
|
+
logger.warn(
|
|
449
|
+
`Failed to load notification policy for ${systemId}/${configurationId}, applying defaults:`,
|
|
450
|
+
error,
|
|
451
|
+
);
|
|
452
|
+
}
|
|
453
|
+
|
|
454
|
+
if (!shouldNotifyTransition(transition, { suppressDeEscalations })) {
|
|
455
|
+
logger.debug(
|
|
456
|
+
`Skipping notification for ${systemId}: ${transition} suppressed by policy`,
|
|
457
|
+
);
|
|
168
458
|
return;
|
|
169
459
|
}
|
|
170
460
|
|
|
@@ -204,36 +494,38 @@ async function notifyStateChange(props: {
|
|
|
204
494
|
);
|
|
205
495
|
}
|
|
206
496
|
|
|
207
|
-
const isRecovery = newStatus === "healthy" && previousStatus !== "healthy";
|
|
208
|
-
const isDegraded = newStatus === "degraded";
|
|
209
|
-
const isUnhealthy = newStatus === "unhealthy";
|
|
210
|
-
|
|
211
497
|
let title: string;
|
|
212
498
|
let body: string;
|
|
213
499
|
let importance: "info" | "warning" | "critical";
|
|
214
500
|
|
|
215
|
-
if (
|
|
501
|
+
if (transition === "recovery") {
|
|
216
502
|
title = `System health restored: ${systemName}`;
|
|
217
503
|
body =
|
|
218
504
|
`All health checks for **${systemName}** are now passing. The system has returned to normal operation.`;
|
|
219
505
|
importance = "info";
|
|
220
|
-
} else if (
|
|
506
|
+
} else if (newStatus === "unhealthy") {
|
|
221
507
|
title = `System health critical: ${systemName}`;
|
|
222
508
|
body = `Health checks indicate **${systemName}** is unhealthy and may be down.`;
|
|
223
509
|
importance = "critical";
|
|
224
|
-
} else
|
|
510
|
+
} else {
|
|
511
|
+
// degraded — either an escalation from healthy or a partial recovery
|
|
225
512
|
title = `System health degraded: ${systemName}`;
|
|
226
513
|
body =
|
|
227
514
|
`Some health checks for **${systemName}** are failing. The system may be experiencing issues.`;
|
|
228
515
|
importance = "warning";
|
|
229
|
-
} else {
|
|
230
|
-
// No notification for healthy → healthy (if somehow missed above)
|
|
231
|
-
return;
|
|
232
516
|
}
|
|
233
517
|
|
|
234
518
|
const systemDetailPath = resolveRoute(catalogRoutes.routes.systemDetail, {
|
|
235
519
|
systemId,
|
|
236
520
|
});
|
|
521
|
+
// Recovery lands on the default (all) view; failing transitions deep-link
|
|
522
|
+
// operators into the failing-checks filter so they can debug immediately.
|
|
523
|
+
const actionUrl =
|
|
524
|
+
transition === "recovery"
|
|
525
|
+
? systemDetailPath
|
|
526
|
+
: `${systemDetailPath}?filter=failing`;
|
|
527
|
+
const actionLabel =
|
|
528
|
+
transition === "recovery" ? "View System" : "View failing checks";
|
|
237
529
|
|
|
238
530
|
void catalogClient; // parents are resolved server-side via stored target edges
|
|
239
531
|
|
|
@@ -244,7 +536,7 @@ async function notifyStateChange(props: {
|
|
|
244
536
|
title,
|
|
245
537
|
body,
|
|
246
538
|
importance,
|
|
247
|
-
action: { label:
|
|
539
|
+
action: { label: actionLabel, url: actionUrl },
|
|
248
540
|
collapseKey: systemHealthCollapseKey(systemId),
|
|
249
541
|
subjects: [
|
|
250
542
|
createSystemSubject({
|
|
@@ -376,6 +668,17 @@ async function executeHealthCheckJob(props: {
|
|
|
376
668
|
logger.debug(`Could not fetch system name for ${systemId}, using ID`);
|
|
377
669
|
}
|
|
378
670
|
|
|
671
|
+
// Curated, read-only run-context metadata exposed to collectors.
|
|
672
|
+
// Metadata only - never secrets or config.
|
|
673
|
+
const runContext: CollectorRunContext = {
|
|
674
|
+
check: {
|
|
675
|
+
id: configId,
|
|
676
|
+
name: configRow.configName || configId,
|
|
677
|
+
intervalSeconds: configRow.interval,
|
|
678
|
+
},
|
|
679
|
+
system: { id: systemId, name: systemName },
|
|
680
|
+
};
|
|
681
|
+
|
|
379
682
|
const strategy = registry.getStrategy(configRow.strategyId);
|
|
380
683
|
if (!strategy) {
|
|
381
684
|
logger.warn(
|
|
@@ -426,6 +729,7 @@ async function executeHealthCheckJob(props: {
|
|
|
426
729
|
config: collectorEntry.config,
|
|
427
730
|
client: connectedClient!.client,
|
|
428
731
|
pluginId: configRow.strategyId,
|
|
732
|
+
runContext,
|
|
429
733
|
});
|
|
430
734
|
|
|
431
735
|
// Check for collector-level error
|
|
@@ -598,11 +902,13 @@ async function executeHealthCheckJob(props: {
|
|
|
598
902
|
const newState = await service.getSystemHealthStatus(systemId);
|
|
599
903
|
if (newState.status !== previousStatus) {
|
|
600
904
|
await notifyStateChange({
|
|
601
|
-
|
|
905
|
+
notificationClient,
|
|
602
906
|
systemId,
|
|
603
907
|
systemName,
|
|
908
|
+
configurationId: configId,
|
|
604
909
|
previousStatus,
|
|
605
910
|
newStatus: newState.status,
|
|
911
|
+
service,
|
|
606
912
|
catalogClient,
|
|
607
913
|
maintenanceClient,
|
|
608
914
|
incidentClient,
|
|
@@ -610,6 +916,24 @@ async function executeHealthCheckJob(props: {
|
|
|
610
916
|
});
|
|
611
917
|
}
|
|
612
918
|
|
|
919
|
+
// Per-check auto-incident: runs whether or not the aggregate
|
|
920
|
+
// changed (a check can transition to unhealthy without flipping
|
|
921
|
+
// the aggregate if another check is already unhealthy).
|
|
922
|
+
await maybeOpenAutoIncidentForCheck({
|
|
923
|
+
db,
|
|
924
|
+
service,
|
|
925
|
+
incidentClient,
|
|
926
|
+
maintenanceClient,
|
|
927
|
+
logger,
|
|
928
|
+
systemId,
|
|
929
|
+
systemName,
|
|
930
|
+
configurationId: configId,
|
|
931
|
+
configurationName: configRow.configName,
|
|
932
|
+
getEmitHook,
|
|
933
|
+
previousState,
|
|
934
|
+
newState,
|
|
935
|
+
});
|
|
936
|
+
|
|
613
937
|
return;
|
|
614
938
|
} finally {
|
|
615
939
|
if (connectedClient) {
|
|
@@ -696,8 +1020,10 @@ async function executeHealthCheckJob(props: {
|
|
|
696
1020
|
notificationClient,
|
|
697
1021
|
systemId,
|
|
698
1022
|
systemName,
|
|
1023
|
+
configurationId: configId,
|
|
699
1024
|
previousStatus,
|
|
700
1025
|
newStatus: newState.status,
|
|
1026
|
+
service,
|
|
701
1027
|
catalogClient,
|
|
702
1028
|
maintenanceClient,
|
|
703
1029
|
incidentClient,
|
|
@@ -714,16 +1040,20 @@ async function executeHealthCheckJob(props: {
|
|
|
714
1040
|
// Emit integration hooks for external integrations
|
|
715
1041
|
const emitHook = getEmitHook();
|
|
716
1042
|
if (emitHook) {
|
|
1043
|
+
const healthyChecks = newState.checkStatuses.filter(
|
|
1044
|
+
(c) => c.status === "healthy",
|
|
1045
|
+
).length;
|
|
1046
|
+
const totalChecks = newState.checkStatuses.length;
|
|
1047
|
+
const timestamp = new Date().toISOString();
|
|
1048
|
+
|
|
717
1049
|
if (newState.status === "healthy" && previousStatus !== "healthy") {
|
|
718
1050
|
// Recovery: system became healthy
|
|
719
1051
|
await emitHook(healthCheckHooks.systemHealthy, {
|
|
720
1052
|
systemId,
|
|
721
1053
|
previousStatus,
|
|
722
|
-
healthyChecks
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
totalChecks: newState.checkStatuses.length,
|
|
726
|
-
timestamp: new Date().toISOString(),
|
|
1054
|
+
healthyChecks,
|
|
1055
|
+
totalChecks,
|
|
1056
|
+
timestamp,
|
|
727
1057
|
});
|
|
728
1058
|
logger.debug(
|
|
729
1059
|
`Emitted systemHealthy hook: ${previousStatus} → ${newState.status}`,
|
|
@@ -737,19 +1067,47 @@ async function executeHealthCheckJob(props: {
|
|
|
737
1067
|
systemId,
|
|
738
1068
|
previousStatus,
|
|
739
1069
|
newStatus: newState.status,
|
|
740
|
-
healthyChecks
|
|
741
|
-
|
|
742
|
-
|
|
743
|
-
totalChecks: newState.checkStatuses.length,
|
|
744
|
-
timestamp: new Date().toISOString(),
|
|
1070
|
+
healthyChecks,
|
|
1071
|
+
totalChecks,
|
|
1072
|
+
timestamp,
|
|
745
1073
|
});
|
|
746
1074
|
logger.debug(
|
|
747
1075
|
`Emitted systemDegraded hook: ${previousStatus} → ${newState.status}`,
|
|
748
1076
|
);
|
|
749
1077
|
}
|
|
1078
|
+
|
|
1079
|
+
// Umbrella hook — fires on every transition. Emitted alongside
|
|
1080
|
+
// the directional hooks so existing subscribers stay unchanged
|
|
1081
|
+
// while new automation triggers can react to any change.
|
|
1082
|
+
if (previousStatus !== newState.status) {
|
|
1083
|
+
await emitHook(healthCheckHooks.systemHealthChanged, {
|
|
1084
|
+
systemId,
|
|
1085
|
+
previousStatus,
|
|
1086
|
+
newStatus: newState.status,
|
|
1087
|
+
healthyChecks,
|
|
1088
|
+
totalChecks,
|
|
1089
|
+
timestamp,
|
|
1090
|
+
});
|
|
1091
|
+
}
|
|
750
1092
|
}
|
|
751
1093
|
}
|
|
752
1094
|
|
|
1095
|
+
// Per-check auto-incident: see comment on the failed-execution path.
|
|
1096
|
+
await maybeOpenAutoIncidentForCheck({
|
|
1097
|
+
db,
|
|
1098
|
+
service,
|
|
1099
|
+
incidentClient,
|
|
1100
|
+
maintenanceClient,
|
|
1101
|
+
logger,
|
|
1102
|
+
systemId,
|
|
1103
|
+
systemName,
|
|
1104
|
+
configurationId: configId,
|
|
1105
|
+
configurationName: configRow.configName,
|
|
1106
|
+
getEmitHook,
|
|
1107
|
+
previousState,
|
|
1108
|
+
newState,
|
|
1109
|
+
});
|
|
1110
|
+
|
|
753
1111
|
// Note: No manual rescheduling needed - recurring job handles it automatically
|
|
754
1112
|
} catch (error) {
|
|
755
1113
|
logger.error(
|
|
@@ -828,8 +1186,10 @@ async function executeHealthCheckJob(props: {
|
|
|
828
1186
|
notificationClient,
|
|
829
1187
|
systemId,
|
|
830
1188
|
systemName,
|
|
1189
|
+
configurationId: configId,
|
|
831
1190
|
previousStatus,
|
|
832
1191
|
newStatus: newState.status,
|
|
1192
|
+
service,
|
|
833
1193
|
catalogClient,
|
|
834
1194
|
maintenanceClient,
|
|
835
1195
|
incidentClient,
|
|
@@ -846,16 +1206,20 @@ async function executeHealthCheckJob(props: {
|
|
|
846
1206
|
// Emit integration hooks for external integrations
|
|
847
1207
|
const emitHook = getEmitHook();
|
|
848
1208
|
if (emitHook) {
|
|
1209
|
+
const healthyChecks = newState.checkStatuses.filter(
|
|
1210
|
+
(c) => c.status === "healthy",
|
|
1211
|
+
).length;
|
|
1212
|
+
const totalChecks = newState.checkStatuses.length;
|
|
1213
|
+
const timestamp = new Date().toISOString();
|
|
1214
|
+
|
|
849
1215
|
if (newState.status === "healthy" && previousStatus !== "healthy") {
|
|
850
1216
|
// Recovery: system became healthy
|
|
851
1217
|
await emitHook(healthCheckHooks.systemHealthy, {
|
|
852
1218
|
systemId,
|
|
853
1219
|
previousStatus,
|
|
854
|
-
healthyChecks
|
|
855
|
-
|
|
856
|
-
|
|
857
|
-
totalChecks: newState.checkStatuses.length,
|
|
858
|
-
timestamp: new Date().toISOString(),
|
|
1220
|
+
healthyChecks,
|
|
1221
|
+
totalChecks,
|
|
1222
|
+
timestamp,
|
|
859
1223
|
});
|
|
860
1224
|
logger.debug(
|
|
861
1225
|
`Emitted systemHealthy hook: ${previousStatus} → ${newState.status}`,
|
|
@@ -869,19 +1233,47 @@ async function executeHealthCheckJob(props: {
|
|
|
869
1233
|
systemId,
|
|
870
1234
|
previousStatus,
|
|
871
1235
|
newStatus: newState.status,
|
|
872
|
-
healthyChecks
|
|
873
|
-
|
|
874
|
-
|
|
875
|
-
totalChecks: newState.checkStatuses.length,
|
|
876
|
-
timestamp: new Date().toISOString(),
|
|
1236
|
+
healthyChecks,
|
|
1237
|
+
totalChecks,
|
|
1238
|
+
timestamp,
|
|
877
1239
|
});
|
|
878
1240
|
logger.debug(
|
|
879
1241
|
`Emitted systemDegraded hook: ${previousStatus} → ${newState.status}`,
|
|
880
1242
|
);
|
|
881
1243
|
}
|
|
1244
|
+
|
|
1245
|
+
// Umbrella hook — fires on every transition. Emitted alongside
|
|
1246
|
+
// the directional hooks so existing subscribers stay unchanged
|
|
1247
|
+
// while new automation triggers can react to any change.
|
|
1248
|
+
if (previousStatus !== newState.status) {
|
|
1249
|
+
await emitHook(healthCheckHooks.systemHealthChanged, {
|
|
1250
|
+
systemId,
|
|
1251
|
+
previousStatus,
|
|
1252
|
+
newStatus: newState.status,
|
|
1253
|
+
healthyChecks,
|
|
1254
|
+
totalChecks,
|
|
1255
|
+
timestamp,
|
|
1256
|
+
});
|
|
1257
|
+
}
|
|
882
1258
|
}
|
|
883
1259
|
}
|
|
884
1260
|
|
|
1261
|
+
// Per-check auto-incident: see comment on the failed-execution path.
|
|
1262
|
+
await maybeOpenAutoIncidentForCheck({
|
|
1263
|
+
db,
|
|
1264
|
+
service,
|
|
1265
|
+
incidentClient,
|
|
1266
|
+
maintenanceClient,
|
|
1267
|
+
logger,
|
|
1268
|
+
systemId,
|
|
1269
|
+
systemName,
|
|
1270
|
+
configurationId: configId,
|
|
1271
|
+
configurationName: configName,
|
|
1272
|
+
getEmitHook,
|
|
1273
|
+
previousState,
|
|
1274
|
+
newState,
|
|
1275
|
+
});
|
|
1276
|
+
|
|
885
1277
|
// Note: No manual rescheduling needed - recurring job handles it automatically
|
|
886
1278
|
}
|
|
887
1279
|
}
|
package/src/router.test.ts
CHANGED
|
@@ -62,6 +62,16 @@ describe("HealthCheck Router", () => {
|
|
|
62
62
|
getProvenance: mock<any>(() => Promise.resolve(null)),
|
|
63
63
|
};
|
|
64
64
|
|
|
65
|
+
const mockConfigService = {
|
|
66
|
+
get: mock(async () => undefined),
|
|
67
|
+
set: mock(async () => {}),
|
|
68
|
+
getRedacted: mock(async () => undefined),
|
|
69
|
+
};
|
|
70
|
+
|
|
71
|
+
const mockCatalogClient = {
|
|
72
|
+
getSystem: mock(async () => null),
|
|
73
|
+
};
|
|
74
|
+
|
|
65
75
|
const router = createHealthCheckRouter({
|
|
66
76
|
database: mockDb as never,
|
|
67
77
|
registry: mockRegistry,
|
|
@@ -69,6 +79,8 @@ describe("HealthCheck Router", () => {
|
|
|
69
79
|
gitOpsClient: mockGitOpsClient as never,
|
|
70
80
|
getEmitHook: () => undefined,
|
|
71
81
|
cache: passthroughCache,
|
|
82
|
+
configService: mockConfigService as never,
|
|
83
|
+
catalogClient: mockCatalogClient as never,
|
|
72
84
|
});
|
|
73
85
|
|
|
74
86
|
it("getStrategies returns strategies from registry", async () => {
|