@checkstack/healthcheck-backend 1.1.3 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +99 -0
- package/drizzle/0012_fair_boomer.sql +1 -0
- package/drizzle/0013_clean_fabian_cortez.sql +20 -0
- package/drizzle/0014_chilly_ultragirl.sql +2 -0
- package/drizzle/meta/0012_snapshot.json +447 -0
- package/drizzle/meta/0013_snapshot.json +615 -0
- package/drizzle/meta/0014_snapshot.json +648 -0
- package/drizzle/meta/_journal.json +21 -0
- package/package.json +21 -20
- package/src/auto-incident-close-job.ts +164 -0
- package/src/auto-incident.test.ts +196 -0
- package/src/auto-incident.ts +332 -0
- package/src/healthcheck-gitops-kinds.test.ts +93 -0
- package/src/healthcheck-gitops-kinds.ts +34 -0
- package/src/index.ts +43 -0
- package/src/notification-defaults-config.ts +10 -0
- package/src/notification-policy.test.ts +104 -0
- package/src/notification-policy.ts +56 -0
- package/src/queue-executor.ts +304 -15
- package/src/router.test.ts +7 -0
- package/src/router.ts +21 -2
- package/src/schema.ts +76 -0
- package/src/service-notification-policy.test.ts +174 -0
- package/src/service.ts +130 -1
- package/tsconfig.json +3 -0
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
import { describe, it, expect } from "bun:test";
|
|
2
|
+
import type {
|
|
3
|
+
HealthCheckStatus,
|
|
4
|
+
NotificationPolicy,
|
|
5
|
+
} from "@checkstack/healthcheck-common";
|
|
6
|
+
import {
|
|
7
|
+
classifyTransition,
|
|
8
|
+
shouldNotifyTransition,
|
|
9
|
+
type TransitionKind,
|
|
10
|
+
} from "./notification-policy";
|
|
11
|
+
|
|
12
|
+
// All three status values; the loops below visit every (previous, next) pair.
const STATES: HealthCheckStatus[] = ["healthy", "degraded", "unhealthy"];

describe("classifyTransition", () => {
  // Build the full 3×3 transition matrix so future severity edits stay
  // honest. Every cell here doubles as documentation: the outer key is the
  // previous status, the inner key is the next status.
  const matrix: Record<
    HealthCheckStatus,
    Record<HealthCheckStatus, TransitionKind>
  > = {
    healthy: {
      healthy: "none",
      degraded: "escalation",
      unhealthy: "escalation",
    },
    degraded: {
      healthy: "recovery",
      degraded: "none",
      unhealthy: "escalation",
    },
    unhealthy: {
      healthy: "recovery",
      degraded: "deescalation",
      unhealthy: "none",
    },
  };

  // One test case per matrix cell, named after the transition it pins.
  for (const prev of STATES) {
    for (const next of STATES) {
      it(`${prev} → ${next} = ${matrix[prev][next]}`, () => {
        expect(classifyTransition(prev, next)).toBe(matrix[prev][next]);
      });
    }
  }
});
|
|
46
|
+
|
|
47
|
+
describe("shouldNotifyTransition", () => {
  // The helper only reads `suppressDeEscalations`; narrow the fixture
  // type so the test doesn't need to keep up with unrelated policy
  // fields added over time.
  const off: Pick<NotificationPolicy, "suppressDeEscalations"> = {
    suppressDeEscalations: false,
  };
  const on: Pick<NotificationPolicy, "suppressDeEscalations"> = {
    suppressDeEscalations: true,
  };

  it("never notifies on `none` (no actual change)", () => {
    expect(shouldNotifyTransition("none", off)).toBe(false);
    expect(shouldNotifyTransition("none", on)).toBe(false);
  });

  it("always notifies on escalations regardless of policy", () => {
    expect(shouldNotifyTransition("escalation", off)).toBe(true);
    expect(shouldNotifyTransition("escalation", on)).toBe(true);
  });

  it("always notifies on recoveries regardless of policy", () => {
    expect(shouldNotifyTransition("recovery", off)).toBe(true);
    expect(shouldNotifyTransition("recovery", on)).toBe(true);
  });

  it("notifies on de-escalations by default", () => {
    expect(shouldNotifyTransition("deescalation", off)).toBe(true);
  });

  it("suppresses de-escalations when the policy opts in", () => {
    expect(shouldNotifyTransition("deescalation", on)).toBe(false);
  });
});
|
|
81
|
+
|
|
82
|
+
describe("flapping scenario from the bug report", () => {
  // healthy → degraded → unhealthy → degraded → healthy
  //
  // With suppression on, the intermediate `unhealthy → degraded`
  // notification (the one operators called out as spammy) must be
  // skipped, while escalation and recovery still fire.
  const policy: Pick<NotificationPolicy, "suppressDeEscalations"> = {
    suppressDeEscalations: true,
  };

  // Each entry is [previous status, next status, whether a notification is expected].
  const sequence: [HealthCheckStatus, HealthCheckStatus, boolean][] = [
    ["healthy", "degraded", true], // escalation
    ["degraded", "unhealthy", true], // escalation
    ["unhealthy", "degraded", false], // de-escalation — suppressed
    ["degraded", "healthy", true], // recovery
  ];

  // Exercise classification and the notify decision together, step by step.
  for (const [prev, next, expected] of sequence) {
    it(`${prev} → ${next} should notify: ${expected}`, () => {
      const kind = classifyTransition(prev, next);
      expect(shouldNotifyTransition(kind, policy)).toBe(expected);
    });
  }
});
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
import type {
|
|
2
|
+
HealthCheckStatus,
|
|
3
|
+
NotificationPolicy,
|
|
4
|
+
} from "@checkstack/healthcheck-common";
|
|
5
|
+
|
|
6
|
+
/**
 * The kind of transition a system health change represents. Used to
 * decide whether a notification should fire and how its CTA should
 * link back into the UI.
 */
export type TransitionKind =
  /** No actual change (e.g. healthy → healthy). */
  | "none"
  /** Severity increased (healthy → degraded, degraded → unhealthy, ...). */
  | "escalation"
  /** Severity decreased but did not reach healthy (unhealthy → degraded). */
  | "deescalation"
  /** Returned to healthy from any non-healthy state. */
  | "recovery";
|
|
20
|
+
|
|
21
|
+
/** Numeric severity rank for each status; a larger number is a worse state. */
const SEVERITY: Record<HealthCheckStatus, number> = {
  healthy: 0,
  degraded: 1,
  unhealthy: 2,
};

/**
 * Classify a transition between two health states. Pure and total over
 * the cartesian product of `HealthCheckStatus` values.
 *
 * @param previous - Status before the change.
 * @param next - Status after the change.
 * @returns `"none"` when both statuses are equal, `"recovery"` when the
 *   new status is healthy, otherwise `"escalation"` or `"deescalation"`
 *   depending on whether the severity rank went up or down.
 */
export function classifyTransition(
  previous: HealthCheckStatus,
  next: HealthCheckStatus,
): TransitionKind {
  if (previous === next) return "none";
  // Any move to healthy is a recovery, regardless of where it started.
  if (next === "healthy") return "recovery";
  // Statuses differ and `next` is not healthy, so the ranks are never equal here.
  return SEVERITY[next] > SEVERITY[previous] ? "escalation" : "deescalation";
}
|
|
39
|
+
|
|
40
|
+
/**
 * Decide whether a transition should produce a notification given the
 * effective per-system policy. Escalations and recoveries always notify;
 * de-escalations are suppressed when the policy opts in.
 *
 * Accepts the narrowed `Pick` because callers may only have the
 * suppression flag — full policy resolution requires per-check lookups
 * that aren't relevant to this decision.
 *
 * @param kind - Classified transition, e.g. the result of `classifyTransition`.
 * @param policy - Only `suppressDeEscalations` is read.
 * @returns `false` for `"none"` and for suppressed de-escalations, `true` otherwise.
 */
export function shouldNotifyTransition(
  kind: TransitionKind,
  policy: Pick<NotificationPolicy, "suppressDeEscalations">,
): boolean {
  // No actual change: never notify, whatever the policy says.
  if (kind === "none") return false;
  if (kind === "deescalation" && policy.suppressDeEscalations) return false;
  return true;
}
|
package/src/queue-executor.ts
CHANGED
|
@@ -39,6 +39,21 @@ import { HealthCheckService } from "./service";
|
|
|
39
39
|
import { healthCheckHooks } from "./hooks";
|
|
40
40
|
import { incrementHourlyAggregate } from "./realtime-aggregation";
|
|
41
41
|
import type { HealthCheckCache } from "./cache";
|
|
42
|
+
import {
|
|
43
|
+
classifyTransition,
|
|
44
|
+
shouldNotifyTransition,
|
|
45
|
+
} from "./notification-policy";
|
|
46
|
+
import {
|
|
47
|
+
findLastAutoIncidentClose,
|
|
48
|
+
findUnhealthySince,
|
|
49
|
+
hasHealthyRunSince,
|
|
50
|
+
isMaintenanceSuppressed,
|
|
51
|
+
isTransitionToUnhealthy,
|
|
52
|
+
openAutoIncident,
|
|
53
|
+
recordUnhealthyTransition,
|
|
54
|
+
shouldOpenForFlapping,
|
|
55
|
+
shouldOpenForSustainedUnhealthy,
|
|
56
|
+
} from "./auto-incident";
|
|
42
57
|
|
|
43
58
|
type Db = SafeDatabase<typeof schema>;
|
|
44
59
|
type CatalogClient = InferClient<typeof CatalogApi>;
|
|
@@ -136,15 +151,209 @@ export async function scheduleHealthCheck(props: {
|
|
|
136
151
|
});
|
|
137
152
|
}
|
|
138
153
|
|
|
154
|
+
/**
 * After every check run, evaluate the per-check auto-incident
 * triggers. Either trigger can independently open an incident:
 *
 *  - **flapping**: this just-completed run was a transition to
 *    unhealthy AND `N` such transitions have happened within the
 *    configured window.
 *  - **sustained**: the check is currently unhealthy AND has been so
 *    continuously for at least the configured duration.
 *
 * Both triggers honour the require-recovery rule: after the most
 * recent auto-incident close (manual or auto), no new auto-incident
 * opens until the check has logged at least one healthy run. This
 * stops a manual close → still-unhealthy → re-open loop.
 *
 * Active maintenance with suppression skips both triggers when the
 * policy opts in.
 *
 * The whole evaluation is best-effort: any failure while loading the
 * policy, querying history, or opening the incident is logged as a
 * warning and swallowed, so this step never rejects and cannot change
 * how the surrounding job treats the check run itself.
 *
 * @param props.previousState - Per-check statuses captured before this run.
 * @param props.newState - Per-check statuses after this run.
 * @returns Resolves once evaluation finishes; never rejects.
 */
async function maybeOpenAutoIncidentForCheck(props: {
  db: Db;
  service: HealthCheckService;
  incidentClient: IncidentClient;
  maintenanceClient: MaintenanceClient;
  logger: Logger;
  systemId: string;
  systemName: string;
  configurationId: string;
  configurationName: string;
  previousState: {
    checkStatuses: Array<{
      configurationId: string;
      status: HealthCheckStatus;
    }>;
  };
  newState: {
    checkStatuses: Array<{
      configurationId: string;
      status: HealthCheckStatus;
    }>;
  };
}): Promise<void> {
  const {
    db,
    service,
    incidentClient,
    maintenanceClient,
    logger,
    systemId,
    systemName,
    configurationId,
    configurationName,
    previousState,
    newState,
  } = props;

  const next = newState.checkStatuses.find(
    (c) => c.configurationId === configurationId,
  );
  // Only auto-incident logic applies when the check is currently
  // unhealthy — both triggers require it.
  if (!next || next.status !== "unhealthy") return;

  const prev = previousState.checkStatuses.find(
    (c) => c.configurationId === configurationId,
  );
  const isTransition = isTransitionToUnhealthy(prev?.status, next.status);

  let policy;
  try {
    policy = await service.getAssignmentNotificationPolicy({
      systemId,
      configurationId,
    });
  } catch (error) {
    logger.warn(
      `Failed to load policy for auto-incident decision (${systemId}/${configurationId}):`,
      error,
    );
    return;
  }

  if (!policy.autoOpenIncidentOnUnhealthy) return;

  // Everything below awaits lookups and the incident open call. Guard the
  // lot so a failure here is handled the same way as a failed policy load
  // above: warn and give up on this evaluation.
  try {
    // Honour active maintenance windows — operators have explicitly
    // said the system is down on purpose.
    if (policy.skipDuringMaintenance) {
      const suppressed = await isMaintenanceSuppressed({
        maintenanceClient,
        systemId,
        logger,
      });
      if (suppressed) {
        logger.debug(
          `Skipping auto-incident for ${systemId}/${configurationId}: active maintenance`,
        );
        return;
      }
    }

    // Require-recovery: if there's a prior closed auto-incident for
    // this assignment, the check must have logged at least one healthy
    // run since the close before we can open another one. Without this,
    // an operator's manual close on a still-broken system would loop.
    const lastCloseAt = await findLastAutoIncidentClose({
      db,
      systemId,
      configurationId,
    });
    if (lastCloseAt) {
      const recovered = await hasHealthyRunSince({
        db,
        systemId,
        configurationId,
        since: lastCloseAt,
      });
      if (!recovered) {
        return;
      }
    }

    // Record the transition (if any) and evaluate the flapping trigger
    // against transitions that happened after the last close window.
    let flappingOpens = false;
    if (isTransition) {
      try {
        const count = await recordUnhealthyTransition({
          db,
          configurationId,
          systemId,
          windowMinutes: policy.flappingTrigger.windowMinutes,
          since: lastCloseAt,
        });
        flappingOpens = shouldOpenForFlapping({
          policy,
          recentTransitionCount: count,
        });
      } catch (error) {
        // A failed record only disables the flapping trigger for this run;
        // the sustained trigger below is still evaluated.
        logger.warn(
          `Failed to record unhealthy transition for ${systemId}/${configurationId}:`,
          error,
        );
      }
    }

    // Evaluate the sustained-duration trigger on every run while the
    // check is unhealthy (not just on transition).
    let sustainedOpens = false;
    if (policy.sustainedUnhealthyTrigger.enabled) {
      const unhealthySince = await findUnhealthySince({
        db,
        configurationId,
        systemId,
        since: lastCloseAt,
      });
      if (unhealthySince) {
        sustainedOpens = shouldOpenForSustainedUnhealthy({
          policy,
          unhealthyForMs: Date.now() - unhealthySince.getTime(),
        });
      }
    }

    if (!flappingOpens && !sustainedOpens) return;

    // When both triggers fire, the flapping reason takes precedence.
    const reason = flappingOpens
      ? `flapping: ≥${policy.flappingTrigger.transitions} transitions in ${policy.flappingTrigger.windowMinutes} min`
      : `unhealthy ≥${policy.sustainedUnhealthyTrigger.durationMinutes} min continuously`;

    await openAutoIncident({
      db,
      incidentClient,
      logger,
      systemId,
      systemName,
      configurationId,
      configurationName,
      policy,
      reason,
    });
  } catch (error) {
    logger.warn(
      `Failed to evaluate auto-incident triggers for ${systemId}/${configurationId}:`,
      error,
    );
  }
}
|
|
334
|
+
|
|
139
335
|
/**
|
|
140
336
|
* Notify system subscribers about a health state change.
|
|
141
|
-
* Skips notification
|
|
337
|
+
* Skips notification when:
|
|
338
|
+
* - the system has active maintenance/incident with suppression enabled, or
|
|
339
|
+
* - the policy of the check that just ran opts into de-escalation
|
|
340
|
+
* suppression and this transition is a de-escalation (e.g.
|
|
341
|
+
* `unhealthy → degraded`).
|
|
342
|
+
*
|
|
343
|
+
* For non-recovery transitions, the action CTA is deep-linked to the
|
|
344
|
+
* failing-checks filter so operators land directly on the problem.
|
|
345
|
+
*
|
|
346
|
+
* Policy is resolved per-assignment (per system+configuration) — the
|
|
347
|
+
* just-ran check is the one driving any aggregate transition in this
|
|
348
|
+
* execution, so its policy is the authoritative one.
|
|
142
349
|
*/
|
|
143
350
|
async function notifyStateChange(props: {
|
|
144
351
|
systemId: string;
|
|
145
352
|
systemName: string;
|
|
353
|
+
configurationId: string;
|
|
146
354
|
previousStatus: HealthCheckStatus;
|
|
147
355
|
newStatus: HealthCheckStatus;
|
|
356
|
+
service: HealthCheckService;
|
|
148
357
|
catalogClient: CatalogClient;
|
|
149
358
|
notificationClient: NotificationClient;
|
|
150
359
|
maintenanceClient: MaintenanceClient;
|
|
@@ -154,8 +363,10 @@ async function notifyStateChange(props: {
|
|
|
154
363
|
const {
|
|
155
364
|
systemId,
|
|
156
365
|
systemName,
|
|
366
|
+
configurationId,
|
|
157
367
|
previousStatus,
|
|
158
368
|
newStatus,
|
|
369
|
+
service,
|
|
159
370
|
catalogClient,
|
|
160
371
|
notificationClient,
|
|
161
372
|
maintenanceClient,
|
|
@@ -163,8 +374,31 @@ async function notifyStateChange(props: {
|
|
|
163
374
|
logger,
|
|
164
375
|
} = props;
|
|
165
376
|
|
|
166
|
-
|
|
167
|
-
if (
|
|
377
|
+
const transition = classifyTransition(previousStatus, newStatus);
|
|
378
|
+
if (transition === "none") {
|
|
379
|
+
return;
|
|
380
|
+
}
|
|
381
|
+
|
|
382
|
+
// Per-assignment notification policy. Failure to load defaults to
|
|
383
|
+
// "notify everything" rather than dropping the notification.
|
|
384
|
+
let suppressDeEscalations = false;
|
|
385
|
+
try {
|
|
386
|
+
const policy = await service.getAssignmentNotificationPolicy({
|
|
387
|
+
systemId,
|
|
388
|
+
configurationId,
|
|
389
|
+
});
|
|
390
|
+
suppressDeEscalations = policy.suppressDeEscalations;
|
|
391
|
+
} catch (error) {
|
|
392
|
+
logger.warn(
|
|
393
|
+
`Failed to load notification policy for ${systemId}/${configurationId}, applying defaults:`,
|
|
394
|
+
error,
|
|
395
|
+
);
|
|
396
|
+
}
|
|
397
|
+
|
|
398
|
+
if (!shouldNotifyTransition(transition, { suppressDeEscalations })) {
|
|
399
|
+
logger.debug(
|
|
400
|
+
`Skipping notification for ${systemId}: ${transition} suppressed by policy`,
|
|
401
|
+
);
|
|
168
402
|
return;
|
|
169
403
|
}
|
|
170
404
|
|
|
@@ -204,36 +438,38 @@ async function notifyStateChange(props: {
|
|
|
204
438
|
);
|
|
205
439
|
}
|
|
206
440
|
|
|
207
|
-
const isRecovery = newStatus === "healthy" && previousStatus !== "healthy";
|
|
208
|
-
const isDegraded = newStatus === "degraded";
|
|
209
|
-
const isUnhealthy = newStatus === "unhealthy";
|
|
210
|
-
|
|
211
441
|
let title: string;
|
|
212
442
|
let body: string;
|
|
213
443
|
let importance: "info" | "warning" | "critical";
|
|
214
444
|
|
|
215
|
-
if (
|
|
445
|
+
if (transition === "recovery") {
|
|
216
446
|
title = `System health restored: ${systemName}`;
|
|
217
447
|
body =
|
|
218
448
|
`All health checks for **${systemName}** are now passing. The system has returned to normal operation.`;
|
|
219
449
|
importance = "info";
|
|
220
|
-
} else if (
|
|
450
|
+
} else if (newStatus === "unhealthy") {
|
|
221
451
|
title = `System health critical: ${systemName}`;
|
|
222
452
|
body = `Health checks indicate **${systemName}** is unhealthy and may be down.`;
|
|
223
453
|
importance = "critical";
|
|
224
|
-
} else
|
|
454
|
+
} else {
|
|
455
|
+
// degraded — either an escalation from healthy or a partial recovery
|
|
225
456
|
title = `System health degraded: ${systemName}`;
|
|
226
457
|
body =
|
|
227
458
|
`Some health checks for **${systemName}** are failing. The system may be experiencing issues.`;
|
|
228
459
|
importance = "warning";
|
|
229
|
-
} else {
|
|
230
|
-
// No notification for healthy → healthy (if somehow missed above)
|
|
231
|
-
return;
|
|
232
460
|
}
|
|
233
461
|
|
|
234
462
|
const systemDetailPath = resolveRoute(catalogRoutes.routes.systemDetail, {
|
|
235
463
|
systemId,
|
|
236
464
|
});
|
|
465
|
+
// Recovery lands on the default (all) view; failing transitions deep-link
|
|
466
|
+
// operators into the failing-checks filter so they can debug immediately.
|
|
467
|
+
const actionUrl =
|
|
468
|
+
transition === "recovery"
|
|
469
|
+
? systemDetailPath
|
|
470
|
+
: `${systemDetailPath}?filter=failing`;
|
|
471
|
+
const actionLabel =
|
|
472
|
+
transition === "recovery" ? "View System" : "View failing checks";
|
|
237
473
|
|
|
238
474
|
void catalogClient; // parents are resolved server-side via stored target edges
|
|
239
475
|
|
|
@@ -244,7 +480,7 @@ async function notifyStateChange(props: {
|
|
|
244
480
|
title,
|
|
245
481
|
body,
|
|
246
482
|
importance,
|
|
247
|
-
action: { label:
|
|
483
|
+
action: { label: actionLabel, url: actionUrl },
|
|
248
484
|
collapseKey: systemHealthCollapseKey(systemId),
|
|
249
485
|
subjects: [
|
|
250
486
|
createSystemSubject({
|
|
@@ -598,11 +834,13 @@ async function executeHealthCheckJob(props: {
|
|
|
598
834
|
const newState = await service.getSystemHealthStatus(systemId);
|
|
599
835
|
if (newState.status !== previousStatus) {
|
|
600
836
|
await notifyStateChange({
|
|
601
|
-
|
|
837
|
+
notificationClient,
|
|
602
838
|
systemId,
|
|
603
839
|
systemName,
|
|
840
|
+
configurationId: configId,
|
|
604
841
|
previousStatus,
|
|
605
842
|
newStatus: newState.status,
|
|
843
|
+
service,
|
|
606
844
|
catalogClient,
|
|
607
845
|
maintenanceClient,
|
|
608
846
|
incidentClient,
|
|
@@ -610,6 +848,23 @@ async function executeHealthCheckJob(props: {
|
|
|
610
848
|
});
|
|
611
849
|
}
|
|
612
850
|
|
|
851
|
+
// Per-check auto-incident: runs whether or not the aggregate
|
|
852
|
+
// changed (a check can transition to unhealthy without flipping
|
|
853
|
+
// the aggregate if another check is already unhealthy).
|
|
854
|
+
await maybeOpenAutoIncidentForCheck({
|
|
855
|
+
db,
|
|
856
|
+
service,
|
|
857
|
+
incidentClient,
|
|
858
|
+
maintenanceClient,
|
|
859
|
+
logger,
|
|
860
|
+
systemId,
|
|
861
|
+
systemName,
|
|
862
|
+
configurationId: configId,
|
|
863
|
+
configurationName: configRow.configName,
|
|
864
|
+
previousState,
|
|
865
|
+
newState,
|
|
866
|
+
});
|
|
867
|
+
|
|
613
868
|
return;
|
|
614
869
|
} finally {
|
|
615
870
|
if (connectedClient) {
|
|
@@ -696,8 +951,10 @@ async function executeHealthCheckJob(props: {
|
|
|
696
951
|
notificationClient,
|
|
697
952
|
systemId,
|
|
698
953
|
systemName,
|
|
954
|
+
configurationId: configId,
|
|
699
955
|
previousStatus,
|
|
700
956
|
newStatus: newState.status,
|
|
957
|
+
service,
|
|
701
958
|
catalogClient,
|
|
702
959
|
maintenanceClient,
|
|
703
960
|
incidentClient,
|
|
@@ -750,6 +1007,21 @@ async function executeHealthCheckJob(props: {
|
|
|
750
1007
|
}
|
|
751
1008
|
}
|
|
752
1009
|
|
|
1010
|
+
// Per-check auto-incident: see comment on the failed-execution path.
|
|
1011
|
+
await maybeOpenAutoIncidentForCheck({
|
|
1012
|
+
db,
|
|
1013
|
+
service,
|
|
1014
|
+
incidentClient,
|
|
1015
|
+
maintenanceClient,
|
|
1016
|
+
logger,
|
|
1017
|
+
systemId,
|
|
1018
|
+
systemName,
|
|
1019
|
+
configurationId: configId,
|
|
1020
|
+
configurationName: configRow.configName,
|
|
1021
|
+
previousState,
|
|
1022
|
+
newState,
|
|
1023
|
+
});
|
|
1024
|
+
|
|
753
1025
|
// Note: No manual rescheduling needed - recurring job handles it automatically
|
|
754
1026
|
} catch (error) {
|
|
755
1027
|
logger.error(
|
|
@@ -828,8 +1100,10 @@ async function executeHealthCheckJob(props: {
|
|
|
828
1100
|
notificationClient,
|
|
829
1101
|
systemId,
|
|
830
1102
|
systemName,
|
|
1103
|
+
configurationId: configId,
|
|
831
1104
|
previousStatus,
|
|
832
1105
|
newStatus: newState.status,
|
|
1106
|
+
service,
|
|
833
1107
|
catalogClient,
|
|
834
1108
|
maintenanceClient,
|
|
835
1109
|
incidentClient,
|
|
@@ -882,6 +1156,21 @@ async function executeHealthCheckJob(props: {
|
|
|
882
1156
|
}
|
|
883
1157
|
}
|
|
884
1158
|
|
|
1159
|
+
// Per-check auto-incident: see comment on the failed-execution path.
|
|
1160
|
+
await maybeOpenAutoIncidentForCheck({
|
|
1161
|
+
db,
|
|
1162
|
+
service,
|
|
1163
|
+
incidentClient,
|
|
1164
|
+
maintenanceClient,
|
|
1165
|
+
logger,
|
|
1166
|
+
systemId,
|
|
1167
|
+
systemName,
|
|
1168
|
+
configurationId: configId,
|
|
1169
|
+
configurationName: configName,
|
|
1170
|
+
previousState,
|
|
1171
|
+
newState,
|
|
1172
|
+
});
|
|
1173
|
+
|
|
885
1174
|
// Note: No manual rescheduling needed - recurring job handles it automatically
|
|
886
1175
|
}
|
|
887
1176
|
}
|
package/src/router.test.ts
CHANGED
|
@@ -62,6 +62,12 @@ describe("HealthCheck Router", () => {
|
|
|
62
62
|
getProvenance: mock<any>(() => Promise.resolve(null)),
|
|
63
63
|
};
|
|
64
64
|
|
|
65
|
+
const mockConfigService = {
|
|
66
|
+
get: mock(async () => undefined),
|
|
67
|
+
set: mock(async () => {}),
|
|
68
|
+
getRedacted: mock(async () => undefined),
|
|
69
|
+
};
|
|
70
|
+
|
|
65
71
|
const router = createHealthCheckRouter({
|
|
66
72
|
database: mockDb as never,
|
|
67
73
|
registry: mockRegistry,
|
|
@@ -69,6 +75,7 @@ describe("HealthCheck Router", () => {
|
|
|
69
75
|
gitOpsClient: mockGitOpsClient as never,
|
|
70
76
|
getEmitHook: () => undefined,
|
|
71
77
|
cache: passthroughCache,
|
|
78
|
+
configService: mockConfigService as never,
|
|
72
79
|
});
|
|
73
80
|
|
|
74
81
|
it("getStrategies returns strategies from registry", async () => {
|
package/src/router.ts
CHANGED
|
@@ -1,11 +1,13 @@
|
|
|
1
1
|
import { implement, ORPCError } from "@orpc/server";
|
|
2
2
|
import {
|
|
3
3
|
autoAuthMiddleware,
|
|
4
|
+
correlationMiddleware,
|
|
4
5
|
toJsonSchema,
|
|
5
6
|
type RpcContext,
|
|
6
7
|
type HealthCheckRegistry,
|
|
7
8
|
type SafeDatabase,
|
|
8
9
|
type CollectorRegistry,
|
|
10
|
+
type ConfigService,
|
|
9
11
|
} from "@checkstack/backend-api";
|
|
10
12
|
import { healthCheckContract } from "@checkstack/healthcheck-common";
|
|
11
13
|
import type { StrategyCategory } from "@checkstack/healthcheck-common";
|
|
@@ -30,14 +32,21 @@ export const createHealthCheckRouter = (opts: {
|
|
|
30
32
|
gitOpsClient: InferClient<typeof GitOpsApi>;
|
|
31
33
|
getEmitHook: () => ((hook: { id: string }, payload: Record<string, unknown>) => Promise<void>) | undefined;
|
|
32
34
|
cache: HealthCheckCache;
|
|
35
|
+
configService: ConfigService;
|
|
33
36
|
}) => {
|
|
34
|
-
const { database, registry, collectorRegistry, getEmitHook, cache } = opts;
|
|
37
|
+
const { database, registry, collectorRegistry, getEmitHook, cache, configService } = opts;
|
|
35
38
|
// Create service instance once - shared across all handlers
|
|
36
|
-
const service = new HealthCheckService(
|
|
39
|
+
const service = new HealthCheckService(
|
|
40
|
+
database,
|
|
41
|
+
registry,
|
|
42
|
+
collectorRegistry,
|
|
43
|
+
configService,
|
|
44
|
+
);
|
|
37
45
|
|
|
38
46
|
// Create contract implementer with context type AND auto auth middleware
|
|
39
47
|
const os = implement(healthCheckContract)
|
|
40
48
|
.$context<RpcContext>()
|
|
49
|
+
.use(correlationMiddleware)
|
|
41
50
|
.use(autoAuthMiddleware);
|
|
42
51
|
|
|
43
52
|
const enforceNotGitOpsLocked = async (kind: string, entityId: string) => {
|
|
@@ -220,6 +229,16 @@ export const createHealthCheckRouter = (opts: {
|
|
|
220
229
|
}
|
|
221
230
|
}),
|
|
222
231
|
|
|
232
|
+
getPlatformNotificationDefaults:
|
|
233
|
+
os.getPlatformNotificationDefaults.handler(async () => {
|
|
234
|
+
return service.getPlatformNotificationDefaults();
|
|
235
|
+
}),
|
|
236
|
+
|
|
237
|
+
setPlatformNotificationDefaults:
|
|
238
|
+
os.setPlatformNotificationDefaults.handler(async ({ input }) => {
|
|
239
|
+
await service.setPlatformNotificationDefaults(input);
|
|
240
|
+
}),
|
|
241
|
+
|
|
223
242
|
getRetentionConfig: os.getRetentionConfig.handler(async ({ input }) => {
|
|
224
243
|
return service.getRetentionConfig(input.systemId, input.configurationId);
|
|
225
244
|
}),
|