@checkstack/healthcheck-backend 1.1.4 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +256 -0
- package/drizzle/0012_fair_boomer.sql +1 -0
- package/drizzle/0013_clean_fabian_cortez.sql +20 -0
- package/drizzle/0014_chilly_ultragirl.sql +2 -0
- package/drizzle/meta/0012_snapshot.json +447 -0
- package/drizzle/meta/0013_snapshot.json +615 -0
- package/drizzle/meta/0014_snapshot.json +648 -0
- package/drizzle/meta/_journal.json +21 -0
- package/package.json +21 -20
- package/src/auto-incident-close-job.ts +164 -0
- package/src/auto-incident.test.ts +196 -0
- package/src/auto-incident.ts +332 -0
- package/src/automations.test.ts +255 -0
- package/src/automations.ts +340 -0
- package/src/healthcheck-gitops-kinds.test.ts +93 -0
- package/src/healthcheck-gitops-kinds.ts +34 -0
- package/src/hooks.ts +69 -4
- package/src/index.ts +80 -52
- package/src/notification-defaults-config.ts +10 -0
- package/src/notification-policy.test.ts +104 -0
- package/src/notification-policy.ts +56 -0
- package/src/queue-executor.test.ts +137 -0
- package/src/queue-executor.ts +434 -42
- package/src/router.test.ts +12 -0
- package/src/router.ts +30 -2
- package/src/schema.ts +76 -0
- package/src/service-assignments.test.ts +184 -0
- package/src/service-notification-policy.test.ts +174 -0
- package/src/service.ts +195 -1
- package/tsconfig.json +5 -2
|
@@ -0,0 +1,340 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Healthcheck triggers + actions registered with the Automation Platform.
|
|
3
|
+
*
|
|
4
|
+
* Triggers:
|
|
5
|
+
* - `healthcheck.system.degraded` — existing directional hook
|
|
6
|
+
* - `healthcheck.system.healthy` — existing directional hook
|
|
7
|
+
* - `healthcheck.system.health_changed` — new umbrella hook,
|
|
8
|
+
* fires on every aggregated-health transition. Carries both the
|
|
9
|
+
* previous and new statuses so subscribers don't have to listen
|
|
10
|
+
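* to two hooks and coalesce themselves.
* - `healthcheck.check.failed` — fires when an individual check run
* completes with a non-`healthy` status.
* - `healthcheck.flapping_detected` — fires when N unhealthy
* transitions are observed within the policy window.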
|
|
11
|
+
*
|
|
12
|
+
* Actions:
|
|
13
|
+
* - `healthcheck.run_now`: enqueue a one-off run of a specific
|
|
14
|
+
* `(systemId, configurationId)` assignment. The recurring
|
|
15
|
+
* schedule keeps ticking; this just nudges the queue.
|
|
16
|
+
* - `healthcheck.enable_assignment` /
|
|
17
|
+
* `healthcheck.disable_assignment`: flip the `enabled` flag on an
|
|
18
|
+
* existing assignment via `service.setAssignmentEnabled`. Emits
|
|
19
|
+
* the existing `assignmentChanged` hook so the satellite-config
|
|
20
|
+
* relay picks up the change.
|
|
21
|
+
*
|
|
22
|
+
* Mutation actions emit hooks themselves (via the `emitHook` factory
|
|
23
|
+
* dep) so downstream automations + caches react the same way as
|
|
24
|
+
* RPC-driven mutations.
|
|
25
|
+
*/
|
|
26
|
+
import { z } from "zod";
|
|
27
|
+
import { Versioned, type Hook } from "@checkstack/backend-api";
|
|
28
|
+
import type { QueueManager } from "@checkstack/queue-api";
|
|
29
|
+
import type {
|
|
30
|
+
ActionDefinition,
|
|
31
|
+
TriggerDefinition,
|
|
32
|
+
} from "@checkstack/automation-backend";
|
|
33
|
+
import { HealthCheckStatusSchema } from "@checkstack/healthcheck-common";
|
|
34
|
+
|
|
35
|
+
import { healthCheckHooks } from "./hooks";
|
|
36
|
+
import {
|
|
37
|
+
HEALTH_CHECK_QUEUE,
|
|
38
|
+
type HealthCheckJobPayload,
|
|
39
|
+
} from "./queue-executor";
|
|
40
|
+
import type { HealthCheckService } from "./service";
|
|
41
|
+
|
|
42
|
+
// ─── Payload schemas — match the hook payloads exactly ─────────────────
|
|
43
|
+
|
|
44
|
+
/**
 * Payload validated for the `system_degraded` trigger.
 * Carries both the previous and the new status plus the check counts.
 */
const systemDegradedPayloadSchema = z.object({
  systemId: z.string(),
  systemName: z.optional(z.string()),
  previousStatus: HealthCheckStatusSchema,
  newStatus: HealthCheckStatusSchema,
  healthyChecks: z.number(),
  totalChecks: z.number(),
  timestamp: z.string(),
});
|
|
53
|
+
|
|
54
|
+
/**
 * Payload validated for the `system_healthy` trigger.
 * Unlike the degraded payload it has no `newStatus` field.
 */
const systemHealthyPayloadSchema = z.object({
  systemId: z.string(),
  systemName: z.optional(z.string()),
  previousStatus: HealthCheckStatusSchema,
  healthyChecks: z.number(),
  totalChecks: z.number(),
  timestamp: z.string(),
});
|
|
62
|
+
|
|
63
|
+
/**
 * Payload validated for the `system_health_changed` trigger:
 * previous + new status together with the check counts.
 */
const systemHealthChangedPayloadSchema = z.object({
  systemId: z.string(),
  systemName: z.optional(z.string()),
  previousStatus: HealthCheckStatusSchema,
  newStatus: HealthCheckStatusSchema,
  healthyChecks: z.number(),
  totalChecks: z.number(),
  timestamp: z.string(),
});
|
|
72
|
+
|
|
73
|
+
/**
 * Payload validated for the `check_failed` trigger.
 * `latencyMs` and `result` may be absent.
 */
const checkFailedPayloadSchema = z.object({
  systemId: z.string(),
  configurationId: z.string(),
  status: HealthCheckStatusSchema,
  latencyMs: z.optional(z.number()),
  result: z.optional(z.record(z.string(), z.unknown())),
  timestamp: z.string(),
});
|
|
81
|
+
|
|
82
|
+
/**
 * Payload validated for the `flapping_detected` trigger:
 * the affected assignment, the transition count and the window length.
 */
const flappingDetectedPayloadSchema = z.object({
  systemId: z.string(),
  configurationId: z.string(),
  transitionCount: z.number(),
  windowMinutes: z.number(),
  timestamp: z.string(),
});
|
|
89
|
+
|
|
90
|
+
// ─── Triggers ──────────────────────────────────────────────────────────
|
|
91
|
+
|
|
92
|
+
/**
 * Automation trigger bound to `healthCheckHooks.systemDegraded`.
 * The payload's `systemId` is used as the context key.
 */
export const systemDegradedTrigger: TriggerDefinition<
  z.infer<typeof systemDegradedPayloadSchema>
> = {
  id: "system_degraded",
  displayName: "System Health Degraded",
  description:
    "Fires when a system's health transitions from healthy to degraded/unhealthy",
  category: "Health",
  icon: "HeartPulse",
  payloadSchema: systemDegradedPayloadSchema,
  hook: healthCheckHooks.systemDegraded,
  contextKey: ({ systemId }) => systemId,
};
|
|
105
|
+
|
|
106
|
+
/**
 * Automation trigger bound to `healthCheckHooks.systemHealthy`.
 * The payload's `systemId` is used as the context key.
 */
export const systemHealthyTrigger: TriggerDefinition<
  z.infer<typeof systemHealthyPayloadSchema>
> = {
  id: "system_healthy",
  displayName: "System Health Restored",
  description: "Fires when a system's health recovers to healthy",
  category: "Health",
  icon: "HeartPulse",
  payloadSchema: systemHealthyPayloadSchema,
  hook: healthCheckHooks.systemHealthy,
  contextKey: ({ systemId }) => systemId,
};
|
|
118
|
+
|
|
119
|
+
/**
 * Automation trigger bound to `healthCheckHooks.systemHealthChanged`,
 * the umbrella hook carrying both previous and new status.
 * The payload's `systemId` is used as the context key.
 */
export const systemHealthChangedTrigger: TriggerDefinition<
  z.infer<typeof systemHealthChangedPayloadSchema>
> = {
  id: "system_health_changed",
  displayName: "System Health Changed",
  description:
    "Fires on every aggregated-health transition — carries previous + new status",
  category: "Health",
  icon: "HeartPulse",
  payloadSchema: systemHealthChangedPayloadSchema,
  hook: healthCheckHooks.systemHealthChanged,
  contextKey: ({ systemId }) => systemId,
};
|
|
132
|
+
|
|
133
|
+
/**
 * Automation trigger bound to `healthCheckHooks.checkFailed`.
 * The context key is the payload's `systemId` only; `configurationId`
 * is not part of it.
 */
export const checkFailedTrigger: TriggerDefinition<
  z.infer<typeof checkFailedPayloadSchema>
> = {
  id: "check_failed",
  displayName: "Health Check Failed",
  description:
    "Fires when an individual check run completes with a non-`healthy` status",
  category: "Health",
  icon: "TriangleAlert",
  payloadSchema: checkFailedPayloadSchema,
  hook: healthCheckHooks.checkFailed,
  contextKey: ({ systemId }) => systemId,
};
|
|
146
|
+
|
|
147
|
+
/**
 * Automation trigger bound to `healthCheckHooks.flappingDetected`.
 * The context key is the payload's `systemId` only; `configurationId`
 * is not part of it.
 */
export const flappingDetectedTrigger: TriggerDefinition<
  z.infer<typeof flappingDetectedPayloadSchema>
> = {
  id: "flapping_detected",
  displayName: "Health Check Flapping",
  description:
    "Fires when N unhealthy transitions are observed within the policy window. Re-fires on every additional transition while flapping; debounce in the automation if needed.",
  category: "Health",
  icon: "Repeat",
  payloadSchema: flappingDetectedPayloadSchema,
  hook: healthCheckHooks.flappingDetected,
  contextKey: ({ systemId }) => systemId,
};
|
|
160
|
+
|
|
161
|
+
/**
 * All healthcheck triggers in one list. Each entry is widened to
 * `TriggerDefinition<unknown>` so the differently-typed payloads can
 * share a single array.
 */
export const healthCheckTriggers: TriggerDefinition<unknown>[] = [
  // System-level (aggregated health) triggers.
  systemDegradedTrigger as TriggerDefinition<unknown>,
  systemHealthyTrigger as TriggerDefinition<unknown>,
  systemHealthChangedTrigger as TriggerDefinition<unknown>,
  // Per-check triggers.
  checkFailedTrigger as TriggerDefinition<unknown>,
  flappingDetectedTrigger as TriggerDefinition<unknown>,
];
|
|
168
|
+
|
|
169
|
+
// ─── Action configs ────────────────────────────────────────────────────
|
|
170
|
+
|
|
171
|
+
/**
 * Config for the `run_now` action: the (system, configuration) pair
 * to enqueue. Both ids must be non-empty strings.
 */
const runNowConfigSchema = z.object({
  systemId: z.string().min(1).describe("Target system id"),
  configurationId: z
    .string()
    .min(1)
    .describe("Target health-check configuration id"),
});
|
|
178
|
+
|
|
179
|
+
/**
 * Config shared by the `enable_assignment` and `disable_assignment`
 * actions. Both ids must be non-empty strings.
 */
const assignmentToggleConfigSchema = z.object({
  systemId: z.string().min(1),
  configurationId: z.string().min(1),
});
|
|
183
|
+
|
|
184
|
+
// ─── Artifact ──────────────────────────────────────────────────────────
|
|
185
|
+
|
|
186
|
+
/**
 * Artifact returned by the healthcheck actions. `enabled` is set by
 * the enable/disable actions, `enqueued` by `run_now`; each action
 * leaves the other flag unset.
 */
const assignmentArtifactSchema = z.object({
  systemId: z.string(),
  configurationId: z.string(),
  enabled: z.optional(z.boolean()),
  enqueued: z.optional(z.boolean()),
});

/** Static type of the artifact, inferred from `assignmentArtifactSchema`. */
export type AssignmentArtifact = z.infer<typeof assignmentArtifactSchema>;
|
|
194
|
+
|
|
195
|
+
/**
 * Artifact type descriptor for the assignment artifact, pairing its
 * id and display metadata with `assignmentArtifactSchema`.
 */
export const assignmentArtifactType = {
  id: "assignment",
  displayName: "Healthcheck Assignment",
  description:
    "Identifies the system↔configuration assignment touched by an automation action",
  schema: assignmentArtifactSchema,
} as const;
|
|
202
|
+
|
|
203
|
+
// ─── Action factory ────────────────────────────────────────────────────
|
|
204
|
+
|
|
205
|
+
/** Dependencies closed over by the actions from `createHealthCheckActions`. */
export interface HealthCheckActionDeps {
  /** Used by the toggle actions to call `setAssignmentEnabled`. */
  service: HealthCheckService;
  /** Used by `run_now` to obtain the health-check queue. */
  queueManager: QueueManager;
  /** Emits a hook with its typed payload; the toggle actions emit `assignmentChanged`. */
  emitHook: <T>(hook: Hook<T>, payload: T) => Promise<void>;
}
|
|
210
|
+
|
|
211
|
+
/**
 * Builds the healthcheck actions registered with the Automation
 * Platform: `run_now`, `enable_assignment` and `disable_assignment`.
 *
 * The enable/disable actions share a single builder because they
 * differ only in their display metadata and the boolean passed to
 * `service.setAssignmentEnabled`.
 *
 * @param deps - Service, queue manager and hook emitter the actions close over.
 * @returns The three action definitions, widened to
 *   `ActionDefinition<unknown, unknown>` so they fit in one array.
 */
export function createHealthCheckActions(
  deps: HealthCheckActionDeps,
): ActionDefinition<unknown, unknown>[] {
  type AssignmentRef = { systemId: string; configurationId: string };
  type ToggleAction = ActionDefinition<
    z.infer<typeof assignmentToggleConfigSchema>,
    AssignmentArtifact
  >;

  // `systemId:configurationId` — used as the action's external id and in log lines.
  const assignmentKey = ({ systemId, configurationId }: AssignmentRef): string =>
    `${systemId}:${configurationId}`;

  const runNow: ActionDefinition<
    z.infer<typeof runNowConfigSchema>,
    AssignmentArtifact
  > = {
    id: "run_now",
    displayName: "Run Health Check Now",
    description:
      "Enqueue a one-off run of the given assignment. Doesn't disturb the recurring schedule.",
    category: "Health",
    icon: "Play",
    config: new Versioned({ version: 1, schema: runNowConfigSchema }),
    produces: "healthcheck.assignment",
    execute: async ({ config, logger }) => {
      const { systemId, configurationId } = config;
      const queue = deps.queueManager.getQueue<HealthCheckJobPayload>(
        HEALTH_CHECK_QUEUE,
      );
      // The job payload names the configuration `configId`, not `configurationId`.
      // NOTE(review): the assignment's existence is not checked before
      // enqueueing — confirm the queue consumer tolerates unknown pairs.
      await queue.enqueue({
        configId: configurationId,
        systemId,
      });
      logger.info(`Automation enqueued run for ${assignmentKey(config)}`);
      return {
        success: true,
        externalId: assignmentKey(config),
        artifact: { systemId, configurationId, enqueued: true },
      };
    },
  };

  /**
   * Creates an action that sets the assignment's `enabled` flag to
   * `enabled`, then emits `assignmentChanged`. Reports failure (without
   * emitting) when the service returns a falsy result.
   */
  const buildToggleAction = (
    meta: Pick<ToggleAction, "id" | "displayName" | "description" | "icon">,
    enabled: boolean,
  ): ToggleAction => ({
    ...meta,
    category: "Health",
    config: new Versioned({ version: 1, schema: assignmentToggleConfigSchema }),
    produces: "healthcheck.assignment",
    execute: async ({ config, logger }) => {
      const { systemId, configurationId } = config;
      const updated = await deps.service.setAssignmentEnabled(
        systemId,
        configurationId,
        enabled,
      );
      if (!updated) {
        return {
          success: false,
          error: `Assignment not found: ${systemId} ↔ ${configurationId}`,
        };
      }
      // Emit the hook ourselves so subscribers of `assignmentChanged`
      // see automation-driven toggles too.
      await deps.emitHook(healthCheckHooks.assignmentChanged, {
        systemId,
        configurationId,
      });
      logger.info(
        `Automation ${enabled ? "enabled" : "disabled"} assignment ${assignmentKey(config)}`,
      );
      return {
        success: true,
        externalId: assignmentKey(config),
        artifact: { systemId, configurationId, enabled },
      };
    },
  });

  const enableAssignment = buildToggleAction(
    {
      id: "enable_assignment",
      displayName: "Enable Health Check Assignment",
      description:
        "Flip the `enabled` flag on an existing system↔configuration assignment to true.",
      icon: "Power",
    },
    true,
  );

  const disableAssignment = buildToggleAction(
    {
      id: "disable_assignment",
      displayName: "Disable Health Check Assignment",
      description:
        "Flip the `enabled` flag on an existing system↔configuration assignment to false.",
      icon: "PowerOff",
    },
    false,
  );

  return [
    runNow as ActionDefinition<unknown, unknown>,
    enableAssignment as ActionDefinition<unknown, unknown>,
    disableAssignment as ActionDefinition<unknown, unknown>,
  ];
}
|
|
@@ -38,6 +38,22 @@ interface MockAssociation {
|
|
|
38
38
|
systemId: string;
|
|
39
39
|
configurationId: string;
|
|
40
40
|
enabled: boolean;
|
|
41
|
+
notificationPolicy?: {
|
|
42
|
+
suppressDeEscalations: boolean;
|
|
43
|
+
autoOpenIncidentOnUnhealthy: boolean;
|
|
44
|
+
useNotificationSuppression: boolean;
|
|
45
|
+
skipDuringMaintenance: boolean;
|
|
46
|
+
sustainedUnhealthyTrigger: {
|
|
47
|
+
enabled: boolean;
|
|
48
|
+
durationMinutes: number;
|
|
49
|
+
};
|
|
50
|
+
flappingTrigger: {
|
|
51
|
+
enabled: boolean;
|
|
52
|
+
transitions: number;
|
|
53
|
+
windowMinutes: number;
|
|
54
|
+
};
|
|
55
|
+
autoCloseAfterMinutes: number | null;
|
|
56
|
+
};
|
|
41
57
|
}
|
|
42
58
|
|
|
43
59
|
function createMockService() {
|
|
@@ -80,6 +96,7 @@ function createMockService() {
|
|
|
80
96
|
systemId: string;
|
|
81
97
|
configurationId: string;
|
|
82
98
|
enabled?: boolean;
|
|
99
|
+
notificationPolicy?: MockAssociation["notificationPolicy"];
|
|
83
100
|
}) => {
|
|
84
101
|
const existing = associations.find(
|
|
85
102
|
(a) =>
|
|
@@ -88,11 +105,13 @@ function createMockService() {
|
|
|
88
105
|
);
|
|
89
106
|
if (existing) {
|
|
90
107
|
existing.enabled = props.enabled ?? true;
|
|
108
|
+
existing.notificationPolicy = props.notificationPolicy;
|
|
91
109
|
} else {
|
|
92
110
|
associations.push({
|
|
93
111
|
systemId: props.systemId,
|
|
94
112
|
configurationId: props.configurationId,
|
|
95
113
|
enabled: props.enabled ?? true,
|
|
114
|
+
notificationPolicy: props.notificationPolicy,
|
|
96
115
|
});
|
|
97
116
|
}
|
|
98
117
|
},
|
|
@@ -619,6 +638,80 @@ describe("Healthcheck GitOps Kind: System Extension", () => {
|
|
|
619
638
|
).rejects.toThrow(/Cannot resolve Healthcheck ref "nonexistent-check"/);
|
|
620
639
|
});
|
|
621
640
|
|
|
641
|
+
it("passes a fully-defaulted notificationPolicy through when partial fields are supplied", async () => {
  const extension = buildExtension();

  // Only the "db-check" Healthcheck ref resolves (to "hc-1").
  const resolvingContext: ReconcileContext = {
    ...mockContext,
    resolveEntityRef: async ({ kind, entityName }) => {
      const isDbCheck = kind === "Healthcheck" && entityName === "db-check";
      return isDbCheck ? "hc-1" : undefined;
    },
  };

  // The spec sets just the flapping transition count and a null
  // auto-close; the remaining policy fields are left out on purpose.
  const partialPolicy = {
    flappingTrigger: { transitions: 5 },
    autoCloseAfterMinutes: null,
  };

  await extension.reconcile({
    entity: {
      apiVersion: CHECKSTACK_API_VERSION,
      kind: "System",
      metadata: { name: "payment-service" },
      spec: {},
    },
    extensionSpec: [
      {
        ref: { kind: "Healthcheck", name: "db-check" },
        notificationPolicy: partialPolicy,
      },
    ],
    entityId: "sys-123",
    context: resolvingContext,
  });

  const storedPolicy = mockService.associations[0]?.notificationPolicy;
  expect(storedPolicy).toBeDefined();

  // Fields the spec omitted arrive with these values.
  expect(storedPolicy?.suppressDeEscalations).toBe(false);
  expect(storedPolicy?.autoOpenIncidentOnUnhealthy).toBe(true);
  expect(storedPolicy?.useNotificationSuppression).toBe(true);
  expect(storedPolicy?.skipDuringMaintenance).toBe(true);
  expect(storedPolicy?.sustainedUnhealthyTrigger).toEqual({
    enabled: true,
    durationMinutes: 30,
  });

  // Fields the spec set explicitly are kept; sibling keys are filled in.
  expect(storedPolicy?.flappingTrigger).toEqual({
    enabled: true,
    transitions: 5,
    windowMinutes: 60,
  });
  expect(storedPolicy?.autoCloseAfterMinutes).toBeNull();
});
|
|
690
|
+
|
|
691
|
+
it("omits notificationPolicy entirely when the spec doesn't set it", async () => {
  const extension = buildExtension();

  // Only the "db-check" Healthcheck ref resolves (to "hc-1").
  const resolvingContext: ReconcileContext = {
    ...mockContext,
    resolveEntityRef: async ({ kind, entityName }) => {
      const isDbCheck = kind === "Healthcheck" && entityName === "db-check";
      return isDbCheck ? "hc-1" : undefined;
    },
  };

  await extension.reconcile({
    entity: {
      apiVersion: CHECKSTACK_API_VERSION,
      kind: "System",
      metadata: { name: "payment-service" },
      spec: {},
    },
    // No `notificationPolicy` key on the entry at all.
    extensionSpec: [{ ref: { kind: "Healthcheck", name: "db-check" } }],
    entityId: "sys-123",
    context: resolvingContext,
  });

  const storedPolicy = mockService.associations[0]?.notificationPolicy;
  expect(storedPolicy).toBeUndefined();
});
|
|
714
|
+
|
|
622
715
|
it("skips when extensionSpec is empty", async () => {
|
|
623
716
|
const ext = buildExtension();
|
|
624
717
|
|
|
@@ -13,6 +13,7 @@ import type {
|
|
|
13
13
|
HealthCheckRegistry,
|
|
14
14
|
CollectorRegistry,
|
|
15
15
|
} from "@checkstack/backend-api";
|
|
16
|
+
import { NotificationPolicySchema } from "@checkstack/healthcheck-common";
|
|
16
17
|
import { HealthCheckService } from "./service";
|
|
17
18
|
import {
|
|
18
19
|
DynamicOperators,
|
|
@@ -81,6 +82,29 @@ const systemHealthcheckExtensionSchema = z
|
|
|
81
82
|
unhealthyThreshold: z.number().int().min(1).optional(),
|
|
82
83
|
satelliteIds: z.array(z.string()).optional(),
|
|
83
84
|
includeLocal: z.boolean().optional(),
|
|
85
|
+
/**
|
|
86
|
+
* Per-assignment notification policy. Any field omitted falls
|
|
87
|
+
* back to the platform default (see `DEFAULT_NOTIFICATION_POLICY`).
|
|
88
|
+
* Inner objects (`sustainedUnhealthyTrigger`, `flappingTrigger`)
|
|
89
|
+
* are also accepted partially.
|
|
90
|
+
*/
|
|
91
|
+
notificationPolicy: NotificationPolicySchema.partial()
|
|
92
|
+
.extend({
|
|
93
|
+
sustainedUnhealthyTrigger: z
|
|
94
|
+
.object({
|
|
95
|
+
enabled: z.boolean().optional(),
|
|
96
|
+
durationMinutes: z.number().int().min(1).optional(),
|
|
97
|
+
})
|
|
98
|
+
.optional(),
|
|
99
|
+
flappingTrigger: z
|
|
100
|
+
.object({
|
|
101
|
+
enabled: z.boolean().optional(),
|
|
102
|
+
transitions: z.number().int().min(1).optional(),
|
|
103
|
+
windowMinutes: z.number().int().min(1).optional(),
|
|
104
|
+
})
|
|
105
|
+
.optional(),
|
|
106
|
+
})
|
|
107
|
+
.optional(),
|
|
84
108
|
}),
|
|
85
109
|
)
|
|
86
110
|
.optional();
|
|
@@ -317,6 +341,15 @@ export function buildSystemHealthcheckExtension(
|
|
|
317
341
|
}
|
|
318
342
|
: undefined;
|
|
319
343
|
|
|
344
|
+
// Materialise the (possibly partial) policy through the full
|
|
345
|
+
// schema so DEFAULT_NOTIFICATION_POLICY fields fill in any
|
|
346
|
+
// keys the operator omitted. Omitting `notificationPolicy`
|
|
347
|
+
// entirely leaves the stored value as null (defaults applied
|
|
348
|
+
// at read time).
|
|
349
|
+
const notificationPolicy = entry.notificationPolicy
|
|
350
|
+
? NotificationPolicySchema.parse(entry.notificationPolicy)
|
|
351
|
+
: undefined;
|
|
352
|
+
|
|
320
353
|
await service.associateSystem({
|
|
321
354
|
systemId: systemEntityId,
|
|
322
355
|
configurationId: configId,
|
|
@@ -324,6 +357,7 @@ export function buildSystemHealthcheckExtension(
|
|
|
324
357
|
stateThresholds,
|
|
325
358
|
satelliteIds: entry.satelliteIds,
|
|
326
359
|
includeLocal: entry.includeLocal,
|
|
360
|
+
notificationPolicy,
|
|
327
361
|
});
|
|
328
362
|
|
|
329
363
|
// Retrieve config to get the interval for scheduling
|
package/src/hooks.ts
CHANGED
|
@@ -1,8 +1,14 @@
|
|
|
1
1
|
import { createHook } from "@checkstack/backend-api";
|
|
2
|
+
import type { HealthCheckStatus } from "@checkstack/healthcheck-common";
|
|
2
3
|
|
|
3
4
|
/**
|
|
4
5
|
* Health check hooks for cross-plugin communication and external integrations.
|
|
5
6
|
* These hooks are registered as integration events for webhook subscriptions.
|
|
7
|
+
*
|
|
8
|
+
* `status` / `previousStatus` / `newStatus` carry the canonical
|
|
9
|
+
* `HealthCheckStatus` enum values, so automation triggers built on
|
|
10
|
+
* these hooks can offer the known values for `==` comparisons in the
|
|
11
|
+
* editor.
|
|
6
12
|
*/
|
|
7
13
|
export const healthCheckHooks = {
|
|
8
14
|
/**
|
|
@@ -13,8 +19,8 @@ export const healthCheckHooks = {
|
|
|
13
19
|
systemDegraded: createHook<{
|
|
14
20
|
systemId: string;
|
|
15
21
|
systemName?: string;
|
|
16
|
-
previousStatus:
|
|
17
|
-
newStatus:
|
|
22
|
+
previousStatus: HealthCheckStatus;
|
|
23
|
+
newStatus: HealthCheckStatus;
|
|
18
24
|
healthyChecks: number;
|
|
19
25
|
totalChecks: number;
|
|
20
26
|
timestamp: string;
|
|
@@ -27,7 +33,7 @@ export const healthCheckHooks = {
|
|
|
27
33
|
systemHealthy: createHook<{
|
|
28
34
|
systemId: string;
|
|
29
35
|
systemName?: string;
|
|
30
|
-
previousStatus:
|
|
36
|
+
previousStatus: HealthCheckStatus;
|
|
31
37
|
healthyChecks: number;
|
|
32
38
|
totalChecks: number;
|
|
33
39
|
timestamp: string;
|
|
@@ -50,9 +56,68 @@ export const healthCheckHooks = {
|
|
|
50
56
|
checkCompleted: createHook<{
|
|
51
57
|
systemId: string;
|
|
52
58
|
configurationId: string;
|
|
53
|
-
status:
|
|
59
|
+
status: HealthCheckStatus;
|
|
54
60
|
latencyMs: number | undefined;
|
|
55
61
|
result: Record<string, unknown> | undefined;
|
|
56
62
|
timestamp: string;
|
|
57
63
|
}>("healthcheck.check.completed"),
|
|
64
|
+
|
|
65
|
+
/**
|
|
66
|
+
* Umbrella variant of `systemDegraded` + `systemHealthy` — fires on
|
|
67
|
+
* **any** aggregated-health transition, carrying both the previous
|
|
68
|
+
* and new statuses. Subscribers (e.g. an automation that wants to
|
|
69
|
+
* react to every state change without subscribing to two hooks
|
|
70
|
+
* and coalescing themselves) prefer this one.
|
|
71
|
+
*
|
|
72
|
+
* Emitted alongside the directional hooks, never instead of them,
|
|
73
|
+
* so existing subscribers keep working unchanged.
|
|
74
|
+
*/
|
|
75
|
+
systemHealthChanged: createHook<{
|
|
76
|
+
systemId: string;
|
|
77
|
+
systemName?: string;
|
|
78
|
+
previousStatus: HealthCheckStatus;
|
|
79
|
+
newStatus: HealthCheckStatus;
|
|
80
|
+
healthyChecks: number;
|
|
81
|
+
totalChecks: number;
|
|
82
|
+
timestamp: string;
|
|
83
|
+
}>("healthcheck.system.health_changed"),
|
|
84
|
+
|
|
85
|
+
/**
|
|
86
|
+
* Narrow variant of `checkCompleted` — fires only when an individual
|
|
87
|
+
* check run completed with a non-`healthy` status. Carries the
|
|
88
|
+
* latency + raw result so subscribers can branch on collector-
|
|
89
|
+
* specific fields without re-querying. Operators usually prefer
|
|
90
|
+
* this over `checkCompleted` for incident-style automation because
|
|
91
|
+
* a "trigger on any completion, then filter" automation is harder
|
|
92
|
+
* to read at a glance than a typed `check_failed` entry point.
|
|
93
|
+
*/
|
|
94
|
+
checkFailed: createHook<{
|
|
95
|
+
systemId: string;
|
|
96
|
+
configurationId: string;
|
|
97
|
+
status: HealthCheckStatus;
|
|
98
|
+
latencyMs: number | undefined;
|
|
99
|
+
result: Record<string, unknown> | undefined;
|
|
100
|
+
timestamp: string;
|
|
101
|
+
}>("healthcheck.check.failed"),
|
|
102
|
+
|
|
103
|
+
/**
|
|
104
|
+
* Emitted when the flapping-detector observes ≥ N unhealthy
|
|
105
|
+
* transitions in the policy's configured window. Fires regardless
|
|
106
|
+
* of whether `autoOpenIncidentOnUnhealthy` is enabled — the hook is
|
|
107
|
+
* informational; the auto-incident pipeline still gates on the
|
|
108
|
+
* policy.
|
|
109
|
+
*
|
|
110
|
+
* Re-fires on every additional transition past the threshold while
|
|
111
|
+
* the check stays in a flapping pattern, so automations that want
|
|
112
|
+
* "page once and only once" should debounce on `(systemId,
|
|
113
|
+
* configurationId)`. Carrying the observed transition count + the
|
|
114
|
+
* window length lets subscribers reason about both.
|
|
115
|
+
*/
|
|
116
|
+
flappingDetected: createHook<{
|
|
117
|
+
systemId: string;
|
|
118
|
+
configurationId: string;
|
|
119
|
+
transitionCount: number;
|
|
120
|
+
windowMinutes: number;
|
|
121
|
+
timestamp: string;
|
|
122
|
+
}>("healthcheck.flapping_detected"),
|
|
58
123
|
} as const;
|