@checkstack/healthcheck-backend 1.1.4 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +44 -0
- package/drizzle/0012_fair_boomer.sql +1 -0
- package/drizzle/0013_clean_fabian_cortez.sql +20 -0
- package/drizzle/0014_chilly_ultragirl.sql +2 -0
- package/drizzle/meta/0012_snapshot.json +447 -0
- package/drizzle/meta/0013_snapshot.json +615 -0
- package/drizzle/meta/0014_snapshot.json +648 -0
- package/drizzle/meta/_journal.json +21 -0
- package/package.json +21 -20
- package/src/auto-incident-close-job.ts +164 -0
- package/src/auto-incident.test.ts +196 -0
- package/src/auto-incident.ts +332 -0
- package/src/healthcheck-gitops-kinds.test.ts +93 -0
- package/src/healthcheck-gitops-kinds.ts +34 -0
- package/src/index.ts +43 -0
- package/src/notification-defaults-config.ts +10 -0
- package/src/notification-policy.test.ts +104 -0
- package/src/notification-policy.ts +56 -0
- package/src/queue-executor.ts +304 -15
- package/src/router.test.ts +7 -0
- package/src/router.ts +19 -2
- package/src/schema.ts +76 -0
- package/src/service-notification-policy.test.ts +174 -0
- package/src/service.ts +130 -1
- package/tsconfig.json +3 -0
|
@@ -0,0 +1,332 @@
|
|
|
1
|
+
import { and, desc, eq, gte, isNotNull, isNull, sql } from "drizzle-orm";
|
|
2
|
+
import type {
|
|
3
|
+
HealthCheckStatus,
|
|
4
|
+
NotificationPolicy,
|
|
5
|
+
} from "@checkstack/healthcheck-common";
|
|
6
|
+
import type { Logger, SafeDatabase } from "@checkstack/backend-api";
|
|
7
|
+
import type { InferClient } from "@checkstack/common";
|
|
8
|
+
import { IncidentApi } from "@checkstack/incident-common";
|
|
9
|
+
import { MaintenanceApi } from "@checkstack/maintenance-common";
|
|
10
|
+
import {
|
|
11
|
+
healthCheckAutoIncidents,
|
|
12
|
+
healthCheckRuns,
|
|
13
|
+
healthCheckUnhealthyTransitions,
|
|
14
|
+
} from "./schema";
|
|
15
|
+
import * as schema from "./schema";
|
|
16
|
+
|
|
17
|
+
type Db = SafeDatabase<typeof schema>;
|
|
18
|
+
type IncidentClient = InferClient<typeof IncidentApi>;
|
|
19
|
+
type MaintenanceClient = InferClient<typeof MaintenanceApi>;
|
|
20
|
+
|
|
21
|
+
/**
 * Tells whether a check has just entered the `unhealthy` state.
 *
 * @param previous - Status from the prior evaluation, or `undefined`
 *   when no prior evaluation is known.
 * @param next - Status from the current evaluation.
 * @returns `true` only when `next` is `unhealthy` and `previous` was
 *   anything else (including `undefined`).
 */
export function isTransitionToUnhealthy(
  previous: HealthCheckStatus | undefined,
  next: HealthCheckStatus,
): boolean {
  if (next !== "unhealthy") {
    return false;
  }
  return previous !== "unhealthy";
}
|
|
31
|
+
|
|
32
|
+
/**
 * Inserts one transition-to-unhealthy row for the given check/system
 * pair, then counts the rows for that pair whose `transitionedAt` is at
 * or after the effective lower bound.
 *
 * The lower bound is `now - windowMinutes`. When `since` is provided and
 * is later than that window start, `since` is used instead. The
 * comparison is inclusive (`>=`), so a row stamped exactly at the bound
 * is counted. The row inserted by this call uses `now` as its timestamp.
 *
 * @param db - Database handle used for both the insert and the count.
 * @param configurationId - Health check configuration the transition belongs to.
 * @param systemId - System the transition belongs to.
 * @param windowMinutes - Length of the counting window, in minutes.
 * @param since - Optional timestamp that can raise the lower bound.
 * @param now - Reference time; defaults to the current time.
 * @returns The matching row count, or `0` when the count query yields no row.
 */
export async function recordUnhealthyTransition({
  db,
  configurationId,
  systemId,
  windowMinutes,
  since,
  now = new Date(),
}: {
  db: Db;
  configurationId: string;
  systemId: string;
  windowMinutes: number;
  since?: Date;
  now?: Date;
}): Promise<number> {
  const transitions = healthCheckUnhealthyTransitions;

  await db
    .insert(transitions)
    .values({ configurationId, systemId, transitionedAt: now });

  // Start from the rolling window and tighten it to `since` when that is later.
  const windowStart = new Date(now.getTime() - windowMinutes * 60_000);
  let lowerBound = windowStart;
  if (since && since > windowStart) {
    lowerBound = since;
  }

  const matchesCheck = and(
    eq(transitions.configurationId, configurationId),
    eq(transitions.systemId, systemId),
    gte(transitions.transitionedAt, lowerBound),
  );

  const [tally] = await db
    .select({ count: sql<number>`COUNT(*)::int` })
    .from(transitions)
    .where(matchesCheck);

  return tally?.count ?? 0;
}
|
|
78
|
+
|
|
79
|
+
/**
 * Evaluates the flapping trigger of a notification policy.
 *
 * @param policy - Policy to evaluate; both `autoOpenIncidentOnUnhealthy`
 *   and `flappingTrigger.enabled` must be truthy for the trigger to fire.
 * @param recentTransitionCount - Number of recent transitions to compare
 *   against `flappingTrigger.transitions`.
 * @returns `true` when the trigger is active and the count has reached
 *   (is greater than or equal to) the configured threshold.
 */
export function shouldOpenForFlapping({
  policy,
  recentTransitionCount,
}: {
  policy: NotificationPolicy;
  recentTransitionCount: number;
}): boolean {
  const flapping = policy.flappingTrigger;
  const triggerActive = policy.autoOpenIncidentOnUnhealthy && flapping.enabled;
  if (!triggerActive) {
    return false;
  }
  return recentTransitionCount >= flapping.transitions;
}
|
|
95
|
+
|
|
96
|
+
/**
 * Evaluates the sustained-unhealthy trigger of a notification policy.
 *
 * @param policy - Policy to evaluate; both `autoOpenIncidentOnUnhealthy`
 *   and `sustainedUnhealthyTrigger.enabled` must be truthy for the
 *   trigger to fire.
 * @param unhealthyForMs - How long the check has been continuously
 *   unhealthy, in milliseconds.
 * @returns `true` when the trigger is active and `unhealthyForMs` has
 *   reached `sustainedUnhealthyTrigger.durationMinutes` converted to
 *   milliseconds.
 */
export function shouldOpenForSustainedUnhealthy({
  policy,
  unhealthyForMs,
}: {
  policy: NotificationPolicy;
  /** How long the check has been continuously unhealthy. */
  unhealthyForMs: number;
}): boolean {
  const sustained = policy.sustainedUnhealthyTrigger;
  const triggerActive = policy.autoOpenIncidentOnUnhealthy && sustained.enabled;
  if (!triggerActive) {
    return false;
  }
  const MS_PER_MINUTE = 60_000;
  return unhealthyForMs >= sustained.durationMinutes * MS_PER_MINUTE;
}
|
|
114
|
+
|
|
115
|
+
/**
 * Looks up the latest recorded transition-to-unhealthy for a
 * check/system pair.
 *
 * When `since` is given, only rows whose `transitionedAt` is at or after
 * it are considered (inclusive `>=` comparison).
 *
 * @param db - Database handle to query.
 * @param configurationId - Health check configuration to match.
 * @param systemId - System to match.
 * @param since - Optional inclusive lower bound on `transitionedAt`.
 * @returns The newest matching `transitionedAt`, or `undefined` when no
 *   row matches.
 */
export async function findUnhealthySince({
  db,
  configurationId,
  systemId,
  since,
}: {
  db: Db;
  configurationId: string;
  systemId: string;
  since?: Date;
}): Promise<Date | undefined> {
  const transitions = healthCheckUnhealthyTransitions;

  // The time filter is appended only when a lower bound was supplied.
  const filters = [
    eq(transitions.configurationId, configurationId),
    eq(transitions.systemId, systemId),
    ...(since ? [gte(transitions.transitionedAt, since)] : []),
  ];

  const rows = await db
    .select({ transitionedAt: transitions.transitionedAt })
    .from(transitions)
    .where(and(...filters))
    .orderBy(desc(transitions.transitionedAt))
    .limit(1);

  return rows[0]?.transitionedAt;
}
|
|
150
|
+
|
|
151
|
+
/**
 * Fetches one auto-incident mapping for the system that has not been
 * closed yet (`closedAt IS NULL`). Callers use it to avoid opening a
 * second auto-incident while one is still open.
 *
 * @param db - Database handle to query.
 * @param systemId - System whose open auto-incident mapping is wanted.
 * @returns The mapping row's `id` and `incidentId`, or `undefined` when
 *   the system has no open mapping.
 */
export async function findActiveAutoIncident({
  db,
  systemId,
}: {
  db: Db;
  systemId: string;
}): Promise<{ id: string; incidentId: string } | undefined> {
  const stillOpenForSystem = and(
    eq(healthCheckAutoIncidents.systemId, systemId),
    isNull(healthCheckAutoIncidents.closedAt),
  );

  const [active] = await db
    .select({
      id: healthCheckAutoIncidents.id,
      incidentId: healthCheckAutoIncidents.incidentId,
    })
    .from(healthCheckAutoIncidents)
    .where(stillOpenForSystem)
    .limit(1);

  return active;
}
|
|
178
|
+
|
|
179
|
+
/**
 * Returns the latest `closedAt` among the closed auto-incident mappings
 * for a system/check pair. Callers use it to gate re-opening behind a
 * "must recover first" rule.
 *
 * @param db - Database handle to query.
 * @param systemId - System to match.
 * @param configurationId - Health check configuration to match.
 * @returns The most recent close time, or `undefined` when no closed
 *   mapping exists for the pair.
 */
export async function findLastAutoIncidentClose({
  db,
  systemId,
  configurationId,
}: {
  db: Db;
  systemId: string;
  configurationId: string;
}): Promise<Date | undefined> {
  const closedForAssignment = and(
    eq(healthCheckAutoIncidents.systemId, systemId),
    eq(healthCheckAutoIncidents.configurationId, configurationId),
    isNotNull(healthCheckAutoIncidents.closedAt),
  );

  const rows = await db
    .select({ closedAt: healthCheckAutoIncidents.closedAt })
    .from(healthCheckAutoIncidents)
    .where(closedForAssignment)
    .orderBy(desc(healthCheckAutoIncidents.closedAt))
    .limit(1);

  // Normalise both "no row" and a null column value to `undefined`.
  const latest = rows[0];
  return latest?.closedAt ?? undefined;
}
|
|
208
|
+
|
|
209
|
+
/**
 * Checks whether at least one run with status `healthy` exists for the
 * system/check pair with a timestamp at or after `since`. Callers use it
 * to confirm a recovery happened after the last auto-incident close
 * before allowing a new auto-incident.
 *
 * @param db - Database handle to query.
 * @param systemId - System to match.
 * @param configurationId - Health check configuration to match.
 * @param since - Inclusive lower bound on the run timestamp.
 * @returns `true` when a matching healthy run exists, otherwise `false`.
 */
export async function hasHealthyRunSince({
  db,
  systemId,
  configurationId,
  since,
}: {
  db: Db;
  systemId: string;
  configurationId: string;
  since: Date;
}): Promise<boolean> {
  const healthyRunAfterSince = and(
    eq(healthCheckRuns.systemId, systemId),
    eq(healthCheckRuns.configurationId, configurationId),
    eq(healthCheckRuns.status, "healthy"),
    gte(healthCheckRuns.timestamp, since),
  );

  // One row is enough to answer the question, hence the limit.
  const matches = await db
    .select({ id: healthCheckRuns.id })
    .from(healthCheckRuns)
    .where(healthyRunAfterSince)
    .limit(1);

  return matches.length > 0;
}
|
|
241
|
+
|
|
242
|
+
/**
 * Asks the maintenance client whether the system currently has an
 * active maintenance window with suppression.
 *
 * If the lookup throws, the error is logged as a warning and the
 * function answers `false`, so a failing maintenance lookup does not
 * block incidents from being opened.
 *
 * @param maintenanceClient - Client exposing `hasActiveMaintenanceWithSuppression`.
 * @param systemId - System to check.
 * @param logger - Receives a warning when the lookup fails.
 * @returns The `suppressed` flag reported by the client, or `false` on error.
 */
export async function isMaintenanceSuppressed({
  maintenanceClient,
  systemId,
  logger,
}: {
  maintenanceClient: MaintenanceClient;
  systemId: string;
  logger: Logger;
}): Promise<boolean> {
  try {
    const lookup = await maintenanceClient.hasActiveMaintenanceWithSuppression({
      systemId,
    });
    return lookup.suppressed;
  } catch (error) {
    logger.warn(
      `Failed to check maintenance for ${systemId} during auto-incident decision; assuming not suppressed:`,
      error,
    );
    return false;
  }
}
|
|
268
|
+
|
|
269
|
+
/**
 * Opens an auto-incident through the incident client and stores a
 * mapping row so the incident can later be found by system.
 *
 * Flow:
 * 1. If the system already has an open auto-incident mapping, no new
 *    incident is created and the existing incident id is returned.
 * 2. Otherwise a `critical` incident is created via
 *    `incidentClient.createAutoIncident`.
 * 3. The mapping (incident, system, check, cooldown) is inserted into
 *    `healthCheckAutoIncidents`.
 *
 * Creation is best-effort: failures in steps 2 and 3 are logged and
 * reported as `undefined` instead of being thrown. A failure in step 3
 * is logged separately and includes the incident id, because by then the
 * incident exists but has no mapping row, so code that relies on the
 * mapping cannot find it. Errors from the initial lookup in step 1 are
 * not caught here.
 *
 * @param db - Database handle for the mapping lookup and insert.
 * @param incidentClient - Client used to create the incident.
 * @param logger - Receives the success message and failure warnings.
 * @param systemId - System the incident is opened for.
 * @param systemName - Used in the incident title.
 * @param configurationId - Health check configuration that triggered the incident.
 * @param configurationName - Used in the incident description and initial message.
 * @param policy - Supplies `useNotificationSuppression` and `autoCloseAfterMinutes`.
 * @param reason - Short human-readable phrase for the incident description.
 * @returns The id of the existing or newly opened incident, or
 *   `undefined` when creating the incident or persisting its mapping failed.
 */
export async function openAutoIncident({
  db,
  incidentClient,
  logger,
  systemId,
  systemName,
  configurationId,
  configurationName,
  policy,
  reason,
}: {
  db: Db;
  incidentClient: IncidentClient;
  logger: Logger;
  systemId: string;
  systemName: string;
  configurationId: string;
  configurationName: string;
  policy: NotificationPolicy;
  /** Short human-readable phrase for the incident description. */
  reason: string;
}): Promise<{ incidentId: string } | undefined> {
  const existing = await findActiveAutoIncident({ db, systemId });
  if (existing) {
    return { incidentId: existing.incidentId };
  }

  try {
    const { id: incidentId } = await incidentClient.createAutoIncident({
      title: `${systemName} is critical`,
      description: `Auto-opened by health check **${configurationName}** (${reason}).`,
      severity: "critical",
      suppressNotifications: policy.useNotificationSuppression,
      systemIds: [systemId],
      initialMessage: `Health check \`${configurationName}\` triggered the auto-incident: ${reason}.`,
    });

    try {
      await db.insert(healthCheckAutoIncidents).values({
        incidentId,
        systemId,
        configurationId,
        cooldownMinutes: policy.autoCloseAfterMinutes,
      });
    } catch (persistError) {
      // The incident already exists at this point. Log its id so the
      // untracked incident can be found and handled by an operator,
      // rather than reporting a generic "failed to open".
      logger.warn(
        `Auto-incident ${incidentId} was created for system ${systemId} (check ${configurationId}) but its mapping could not be persisted; the incident is untracked:`,
        persistError,
      );
      return undefined;
    }

    logger.info(
      `Auto-opened incident ${incidentId} for system ${systemId} (check ${configurationId}; ${reason})`,
    );
    return { incidentId };
  } catch (error) {
    // Auto-incident creation is best-effort — failure here shouldn't
    // block the rest of the health-check flow.
    logger.warn(
      `Failed to open auto-incident for system ${systemId} (check ${configurationId}):`,
      error,
    );
    return undefined;
  }
}
|
|
@@ -38,6 +38,22 @@ interface MockAssociation {
|
|
|
38
38
|
systemId: string;
|
|
39
39
|
configurationId: string;
|
|
40
40
|
enabled: boolean;
|
|
41
|
+
notificationPolicy?: {
|
|
42
|
+
suppressDeEscalations: boolean;
|
|
43
|
+
autoOpenIncidentOnUnhealthy: boolean;
|
|
44
|
+
useNotificationSuppression: boolean;
|
|
45
|
+
skipDuringMaintenance: boolean;
|
|
46
|
+
sustainedUnhealthyTrigger: {
|
|
47
|
+
enabled: boolean;
|
|
48
|
+
durationMinutes: number;
|
|
49
|
+
};
|
|
50
|
+
flappingTrigger: {
|
|
51
|
+
enabled: boolean;
|
|
52
|
+
transitions: number;
|
|
53
|
+
windowMinutes: number;
|
|
54
|
+
};
|
|
55
|
+
autoCloseAfterMinutes: number | null;
|
|
56
|
+
};
|
|
41
57
|
}
|
|
42
58
|
|
|
43
59
|
function createMockService() {
|
|
@@ -80,6 +96,7 @@ function createMockService() {
|
|
|
80
96
|
systemId: string;
|
|
81
97
|
configurationId: string;
|
|
82
98
|
enabled?: boolean;
|
|
99
|
+
notificationPolicy?: MockAssociation["notificationPolicy"];
|
|
83
100
|
}) => {
|
|
84
101
|
const existing = associations.find(
|
|
85
102
|
(a) =>
|
|
@@ -88,11 +105,13 @@ function createMockService() {
|
|
|
88
105
|
);
|
|
89
106
|
if (existing) {
|
|
90
107
|
existing.enabled = props.enabled ?? true;
|
|
108
|
+
existing.notificationPolicy = props.notificationPolicy;
|
|
91
109
|
} else {
|
|
92
110
|
associations.push({
|
|
93
111
|
systemId: props.systemId,
|
|
94
112
|
configurationId: props.configurationId,
|
|
95
113
|
enabled: props.enabled ?? true,
|
|
114
|
+
notificationPolicy: props.notificationPolicy,
|
|
96
115
|
});
|
|
97
116
|
}
|
|
98
117
|
},
|
|
@@ -619,6 +638,80 @@ describe("Healthcheck GitOps Kind: System Extension", () => {
|
|
|
619
638
|
).rejects.toThrow(/Cannot resolve Healthcheck ref "nonexistent-check"/);
|
|
620
639
|
});
|
|
621
640
|
|
|
641
|
+
it("passes a fully-defaulted notificationPolicy through when partial fields are supplied", async () => {
|
|
642
|
+
const ext = buildExtension();
|
|
643
|
+
|
|
644
|
+
const contextWithRefs: ReconcileContext = {
|
|
645
|
+
...mockContext,
|
|
646
|
+
resolveEntityRef: async ({ kind, entityName }) =>
|
|
647
|
+
kind === "Healthcheck" && entityName === "db-check" ? "hc-1" : undefined,
|
|
648
|
+
};
|
|
649
|
+
|
|
650
|
+
await ext.reconcile({
|
|
651
|
+
entity: {
|
|
652
|
+
apiVersion: CHECKSTACK_API_VERSION,
|
|
653
|
+
kind: "System",
|
|
654
|
+
metadata: { name: "payment-service" },
|
|
655
|
+
spec: {},
|
|
656
|
+
},
|
|
657
|
+
extensionSpec: [
|
|
658
|
+
{
|
|
659
|
+
ref: { kind: "Healthcheck", name: "db-check" },
|
|
660
|
+
// Operator only sets the flap threshold and disables
|
|
661
|
+
// auto-close; everything else should default in via the
|
|
662
|
+
// schema parse.
|
|
663
|
+
notificationPolicy: {
|
|
664
|
+
flappingTrigger: { transitions: 5 },
|
|
665
|
+
autoCloseAfterMinutes: null,
|
|
666
|
+
},
|
|
667
|
+
},
|
|
668
|
+
],
|
|
669
|
+
entityId: "sys-123",
|
|
670
|
+
context: contextWithRefs,
|
|
671
|
+
});
|
|
672
|
+
|
|
673
|
+
const policy = mockService.associations[0]?.notificationPolicy;
|
|
674
|
+
expect(policy).toBeDefined();
|
|
675
|
+
expect(policy?.suppressDeEscalations).toBe(false);
|
|
676
|
+
expect(policy?.autoOpenIncidentOnUnhealthy).toBe(true);
|
|
677
|
+
expect(policy?.useNotificationSuppression).toBe(true);
|
|
678
|
+
expect(policy?.skipDuringMaintenance).toBe(true);
|
|
679
|
+
expect(policy?.sustainedUnhealthyTrigger).toEqual({
|
|
680
|
+
enabled: true,
|
|
681
|
+
durationMinutes: 30,
|
|
682
|
+
});
|
|
683
|
+
expect(policy?.flappingTrigger).toEqual({
|
|
684
|
+
enabled: true,
|
|
685
|
+
transitions: 5,
|
|
686
|
+
windowMinutes: 60,
|
|
687
|
+
});
|
|
688
|
+
expect(policy?.autoCloseAfterMinutes).toBeNull();
|
|
689
|
+
});
|
|
690
|
+
|
|
691
|
+
it("omits notificationPolicy entirely when the spec doesn't set it", async () => {
|
|
692
|
+
const ext = buildExtension();
|
|
693
|
+
|
|
694
|
+
const contextWithRefs: ReconcileContext = {
|
|
695
|
+
...mockContext,
|
|
696
|
+
resolveEntityRef: async ({ kind, entityName }) =>
|
|
697
|
+
kind === "Healthcheck" && entityName === "db-check" ? "hc-1" : undefined,
|
|
698
|
+
};
|
|
699
|
+
|
|
700
|
+
await ext.reconcile({
|
|
701
|
+
entity: {
|
|
702
|
+
apiVersion: CHECKSTACK_API_VERSION,
|
|
703
|
+
kind: "System",
|
|
704
|
+
metadata: { name: "payment-service" },
|
|
705
|
+
spec: {},
|
|
706
|
+
},
|
|
707
|
+
extensionSpec: [{ ref: { kind: "Healthcheck", name: "db-check" } }],
|
|
708
|
+
entityId: "sys-123",
|
|
709
|
+
context: contextWithRefs,
|
|
710
|
+
});
|
|
711
|
+
|
|
712
|
+
expect(mockService.associations[0]?.notificationPolicy).toBeUndefined();
|
|
713
|
+
});
|
|
714
|
+
|
|
622
715
|
it("skips when extensionSpec is empty", async () => {
|
|
623
716
|
const ext = buildExtension();
|
|
624
717
|
|
|
@@ -13,6 +13,7 @@ import type {
|
|
|
13
13
|
HealthCheckRegistry,
|
|
14
14
|
CollectorRegistry,
|
|
15
15
|
} from "@checkstack/backend-api";
|
|
16
|
+
import { NotificationPolicySchema } from "@checkstack/healthcheck-common";
|
|
16
17
|
import { HealthCheckService } from "./service";
|
|
17
18
|
import {
|
|
18
19
|
DynamicOperators,
|
|
@@ -81,6 +82,29 @@ const systemHealthcheckExtensionSchema = z
|
|
|
81
82
|
unhealthyThreshold: z.number().int().min(1).optional(),
|
|
82
83
|
satelliteIds: z.array(z.string()).optional(),
|
|
83
84
|
includeLocal: z.boolean().optional(),
|
|
85
|
+
/**
|
|
86
|
+
* Per-assignment notification policy. Any field omitted falls
|
|
87
|
+
* back to the platform default (see `DEFAULT_NOTIFICATION_POLICY`).
|
|
88
|
+
* Inner objects (`sustainedUnhealthyTrigger`, `flappingTrigger`)
|
|
89
|
+
* are also accepted partially.
|
|
90
|
+
*/
|
|
91
|
+
notificationPolicy: NotificationPolicySchema.partial()
|
|
92
|
+
.extend({
|
|
93
|
+
sustainedUnhealthyTrigger: z
|
|
94
|
+
.object({
|
|
95
|
+
enabled: z.boolean().optional(),
|
|
96
|
+
durationMinutes: z.number().int().min(1).optional(),
|
|
97
|
+
})
|
|
98
|
+
.optional(),
|
|
99
|
+
flappingTrigger: z
|
|
100
|
+
.object({
|
|
101
|
+
enabled: z.boolean().optional(),
|
|
102
|
+
transitions: z.number().int().min(1).optional(),
|
|
103
|
+
windowMinutes: z.number().int().min(1).optional(),
|
|
104
|
+
})
|
|
105
|
+
.optional(),
|
|
106
|
+
})
|
|
107
|
+
.optional(),
|
|
84
108
|
}),
|
|
85
109
|
)
|
|
86
110
|
.optional();
|
|
@@ -317,6 +341,15 @@ export function buildSystemHealthcheckExtension(
|
|
|
317
341
|
}
|
|
318
342
|
: undefined;
|
|
319
343
|
|
|
344
|
+
// Materialise the (possibly partial) policy through the full
|
|
345
|
+
// schema so DEFAULT_NOTIFICATION_POLICY fields fill in any
|
|
346
|
+
// keys the operator omitted. Omitting `notificationPolicy`
|
|
347
|
+
// entirely leaves the stored value as null (defaults applied
|
|
348
|
+
// at read time).
|
|
349
|
+
const notificationPolicy = entry.notificationPolicy
|
|
350
|
+
? NotificationPolicySchema.parse(entry.notificationPolicy)
|
|
351
|
+
: undefined;
|
|
352
|
+
|
|
320
353
|
await service.associateSystem({
|
|
321
354
|
systemId: systemEntityId,
|
|
322
355
|
configurationId: configId,
|
|
@@ -324,6 +357,7 @@ export function buildSystemHealthcheckExtension(
|
|
|
324
357
|
stateThresholds,
|
|
325
358
|
satelliteIds: entry.satelliteIds,
|
|
326
359
|
includeLocal: entry.includeLocal,
|
|
360
|
+
notificationPolicy,
|
|
327
361
|
});
|
|
328
362
|
|
|
329
363
|
// Retrieve config to get the interval for scheduling
|
package/src/index.ts
CHANGED
|
@@ -3,6 +3,7 @@ import {
|
|
|
3
3
|
bootstrapHealthChecks,
|
|
4
4
|
} from "./queue-executor";
|
|
5
5
|
import { setupRetentionJob } from "./retention-job";
|
|
6
|
+
import { setupAutoIncidentCloseJob } from "./auto-incident-close-job";
|
|
6
7
|
import * as schema from "./schema";
|
|
7
8
|
import {
|
|
8
9
|
healthCheckAccessRules,
|
|
@@ -34,6 +35,9 @@ import { HealthCheckService } from "./service";
|
|
|
34
35
|
import { registerHealthcheckGitOpsKinds, registerHealthcheckGitOpsDocumentation } from "./healthcheck-gitops-kinds";
|
|
35
36
|
import { catalogHooks } from "@checkstack/catalog-backend";
|
|
36
37
|
import { satelliteHooks } from "@checkstack/satellite-backend";
|
|
38
|
+
import { incidentHooks } from "@checkstack/incident-backend";
|
|
39
|
+
import { eq, and, isNull } from "drizzle-orm";
|
|
40
|
+
import { healthCheckAutoIncidents } from "./schema";
|
|
37
41
|
import { CatalogApi } from "@checkstack/catalog-common";
|
|
38
42
|
import { MaintenanceApi } from "@checkstack/maintenance-common";
|
|
39
43
|
import { IncidentApi } from "@checkstack/incident-common";
|
|
@@ -159,6 +163,7 @@ export default createBackendPlugin({
|
|
|
159
163
|
queueManager: coreServices.queueManager,
|
|
160
164
|
signalService: coreServices.signalService,
|
|
161
165
|
cacheManager: coreServices.cacheManager,
|
|
166
|
+
config: coreServices.config,
|
|
162
167
|
},
|
|
163
168
|
// Phase 2: Register router and setup worker
|
|
164
169
|
init: async ({
|
|
@@ -171,6 +176,7 @@ export default createBackendPlugin({
|
|
|
171
176
|
queueManager,
|
|
172
177
|
signalService,
|
|
173
178
|
cacheManager,
|
|
179
|
+
config,
|
|
174
180
|
}) => {
|
|
175
181
|
logger.debug("🏥 Initializing Health Check Backend...");
|
|
176
182
|
|
|
@@ -225,6 +231,16 @@ export default createBackendPlugin({
|
|
|
225
231
|
queueManager,
|
|
226
232
|
});
|
|
227
233
|
|
|
234
|
+
// Setup auto-incident close worker (ticks every 60s, closes
|
|
235
|
+
// auto-opened incidents whose systems have been steady-healthy
|
|
236
|
+
// for the cooldown).
|
|
237
|
+
await setupAutoIncidentCloseJob({
|
|
238
|
+
db: database,
|
|
239
|
+
logger,
|
|
240
|
+
queueManager,
|
|
241
|
+
incidentClient,
|
|
242
|
+
});
|
|
243
|
+
|
|
228
244
|
const healthCheckRouter = createHealthCheckRouter({
|
|
229
245
|
database: database as SafeDatabase<typeof schema>,
|
|
230
246
|
registry: healthCheckRegistry,
|
|
@@ -232,6 +248,7 @@ export default createBackendPlugin({
|
|
|
232
248
|
gitOpsClient,
|
|
233
249
|
getEmitHook: () => storedEmitHook,
|
|
234
250
|
cache,
|
|
251
|
+
configService: config,
|
|
235
252
|
});
|
|
236
253
|
rpc.registerRouter(healthCheckRouter, healthCheckContract);
|
|
237
254
|
|
|
@@ -335,6 +352,32 @@ export default createBackendPlugin({
|
|
|
335
352
|
{ mode: "work-queue", workerGroup: "satellite-cleanup" },
|
|
336
353
|
);
|
|
337
354
|
|
|
355
|
+
// Sync our auto-incident mapping when an incident is resolved.
|
|
356
|
+
// Without this, a manually-closed incident would still appear
|
|
357
|
+
// "active" in our mapping, blocking the require-recovery rule
|
|
358
|
+
// from re-evaluating fresh transitions.
|
|
359
|
+
onHook(
|
|
360
|
+
incidentHooks.incidentResolved,
|
|
361
|
+
async ({ incidentId }) => {
|
|
362
|
+
const updated = await database
|
|
363
|
+
.update(healthCheckAutoIncidents)
|
|
364
|
+
.set({ closedAt: new Date() })
|
|
365
|
+
.where(
|
|
366
|
+
and(
|
|
367
|
+
eq(healthCheckAutoIncidents.incidentId, incidentId),
|
|
368
|
+
isNull(healthCheckAutoIncidents.closedAt),
|
|
369
|
+
),
|
|
370
|
+
)
|
|
371
|
+
.returning({ id: healthCheckAutoIncidents.id });
|
|
372
|
+
if (updated.length > 0) {
|
|
373
|
+
logger.debug(
|
|
374
|
+
`Marked auto-incident mapping closed for resolved incident ${incidentId}`,
|
|
375
|
+
);
|
|
376
|
+
}
|
|
377
|
+
},
|
|
378
|
+
{ mode: "work-queue", workerGroup: "auto-incident-sync" },
|
|
379
|
+
);
|
|
380
|
+
|
|
338
381
|
logger.debug("✅ Health Check Backend afterPluginsReady complete.");
|
|
339
382
|
},
|
|
340
383
|
});
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Versioned schema used by `ConfigService` to persist platform-wide
|
|
3
|
+
* notification defaults. The shape is the runtime `NotificationPolicy`
|
|
4
|
+
* itself — each field has a built-in compile-time default so an empty
|
|
5
|
+
* stored value still parses to a valid policy.
|
|
6
|
+
*/
|
|
7
|
+
export { NotificationPolicySchema as notificationDefaultsConfigV1 } from "@checkstack/healthcheck-common";
|
|
8
|
+
|
|
9
|
+
export const NOTIFICATION_DEFAULTS_CONFIG_ID = "healthcheck.notification-defaults";
|
|
10
|
+
export const NOTIFICATION_DEFAULTS_CONFIG_VERSION = 1;
|