@checkstack/healthcheck-backend 1.1.4 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,332 @@
1
+ import { and, desc, eq, gte, isNotNull, isNull, sql } from "drizzle-orm";
2
+ import type {
3
+ HealthCheckStatus,
4
+ NotificationPolicy,
5
+ } from "@checkstack/healthcheck-common";
6
+ import type { Logger, SafeDatabase } from "@checkstack/backend-api";
7
+ import type { InferClient } from "@checkstack/common";
8
+ import { IncidentApi } from "@checkstack/incident-common";
9
+ import { MaintenanceApi } from "@checkstack/maintenance-common";
10
+ import {
11
+ healthCheckAutoIncidents,
12
+ healthCheckRuns,
13
+ healthCheckUnhealthyTransitions,
14
+ } from "./schema";
15
+ import * as schema from "./schema";
16
+
17
/** Database handle typed against the tables exported from `./schema`. */
type Db = SafeDatabase<typeof schema>;

/** Client shape inferred from the incident plugin's API definition. */
type IncidentClient = InferClient<typeof IncidentApi>;

/** Client shape inferred from the maintenance plugin's API definition. */
type MaintenanceClient = InferClient<typeof MaintenanceApi>;
20
+
21
/**
 * Detects the edge where a check enters the `unhealthy` state.
 *
 * @param previous - Status from the prior evaluation, or `undefined`
 *   when there was none.
 * @param next - Status from the current evaluation.
 * @returns `true` only when `next` is `unhealthy` and `previous` was
 *   anything else (including `undefined`).
 */
export function isTransitionToUnhealthy(
  previous: HealthCheckStatus | undefined,
  next: HealthCheckStatus,
): boolean {
  // Anything that does not end up unhealthy can never be the edge.
  if (next !== "unhealthy") {
    return false;
  }
  // Staying unhealthy is not a transition; entering it is.
  return previous !== "unhealthy";
}
31
+
32
/**
 * Inserts one transition-to-unhealthy row for the given check, then
 * counts that check's rows whose `transitionedAt` is at or after the
 * effective lower bound. The row just inserted takes part in the count.
 *
 * The lower bound is `now - windowMinutes`, unless `since` is supplied
 * and is later than that, in which case `since` is used. The comparison
 * is inclusive (`>=`), so a row stamped exactly at the bound is counted.
 * Callers pass `since` so transitions recorded before a prior
 * auto-incident was closed do not count towards a new one.
 *
 * @param db - Database handle used for both the insert and the count.
 * @param configurationId - Health check configuration the row belongs to.
 * @param systemId - System the row belongs to.
 * @param windowMinutes - Length of the look-back window, in minutes.
 * @param since - Optional floor for the counted range.
 * @param now - Timestamp written to the new row and used as the window
 *   end; defaults to the current time.
 * @returns The number of matching rows, or 0 when the count query
 *   yields no row.
 */
export async function recordUnhealthyTransition({
  db,
  configurationId,
  systemId,
  windowMinutes,
  since,
  now = new Date(),
}: {
  db: Db;
  configurationId: string;
  systemId: string;
  windowMinutes: number;
  since?: Date;
  now?: Date;
}): Promise<number> {
  // Write first so the new transition is visible to the count below.
  await db
    .insert(healthCheckUnhealthyTransitions)
    .values({ configurationId, systemId, transitionedAt: now });

  const windowOpensAt = new Date(now.getTime() - windowMinutes * 60_000);
  let countFrom = windowOpensAt;
  if (since && since > windowOpensAt) {
    // `since` is tighter than the window, so it wins.
    countFrom = since;
  }

  const inRangeForThisCheck = and(
    eq(healthCheckUnhealthyTransitions.configurationId, configurationId),
    eq(healthCheckUnhealthyTransitions.systemId, systemId),
    gte(healthCheckUnhealthyTransitions.transitionedAt, countFrom),
  );

  const [tally] = await db
    .select({ count: sql<number>`COUNT(*)::int` })
    .from(healthCheckUnhealthyTransitions)
    .where(inRangeForThisCheck);

  return tally?.count ?? 0;
}
78
+
79
/**
 * Evaluates the flapping trigger for an auto-incident.
 *
 * @param policy - Notification policy; reads
 *   `autoOpenIncidentOnUnhealthy`, `flappingTrigger.enabled` and
 *   `flappingTrigger.transitions`.
 * @param recentTransitionCount - Transitions counted in the window.
 * @returns `true` when auto-open and the flapping trigger are both
 *   switched on and the count has reached the configured number of
 *   transitions; `false` otherwise.
 */
export function shouldOpenForFlapping({
  policy,
  recentTransitionCount,
}: {
  policy: NotificationPolicy;
  recentTransitionCount: number;
}): boolean {
  // Short-circuits, so the trigger config is only read when auto-open is on.
  const triggerActive =
    policy.autoOpenIncidentOnUnhealthy && policy.flappingTrigger.enabled;

  return triggerActive
    ? recentTransitionCount >= policy.flappingTrigger.transitions
    : false;
}
95
+
96
/**
 * Evaluates the sustained-duration trigger for an auto-incident.
 *
 * @param policy - Notification policy; reads
 *   `autoOpenIncidentOnUnhealthy`, `sustainedUnhealthyTrigger.enabled`
 *   and `sustainedUnhealthyTrigger.durationMinutes`.
 * @param unhealthyForMs - Milliseconds the check has been continuously
 *   unhealthy.
 * @returns `true` when auto-open and the sustained trigger are both
 *   switched on and `unhealthyForMs` has reached the configured
 *   duration; `false` otherwise.
 */
export function shouldOpenForSustainedUnhealthy({
  policy,
  unhealthyForMs,
}: {
  policy: NotificationPolicy;
  /** How long the check has been continuously unhealthy. */
  unhealthyForMs: number;
}): boolean {
  if (
    !policy.autoOpenIncidentOnUnhealthy ||
    !policy.sustainedUnhealthyTrigger.enabled
  ) {
    return false;
  }

  // The policy stores minutes; the elapsed time arrives in milliseconds.
  const requiredMs = policy.sustainedUnhealthyTrigger.durationMinutes * 60_000;
  return unhealthyForMs >= requiredMs;
}
114
+
115
/**
 * Looks up the newest transition-to-unhealthy row for a check.
 *
 * When `since` is given, only rows with `transitionedAt` at or after it
 * are considered. The sustained-trigger evaluator uses the result to
 * work out how long the check has been unhealthy.
 *
 * @param db - Database handle to query.
 * @param configurationId - Health check configuration to match.
 * @param systemId - System to match.
 * @param since - Optional inclusive floor for `transitionedAt`.
 * @returns The newest matching `transitionedAt`, or `undefined` when no
 *   row matches.
 */
export async function findUnhealthySince({
  db,
  configurationId,
  systemId,
  since,
}: {
  db: Db;
  configurationId: string;
  systemId: string;
  since?: Date;
}): Promise<Date | undefined> {
  const filters = [
    eq(healthCheckUnhealthyTransitions.configurationId, configurationId),
    eq(healthCheckUnhealthyTransitions.systemId, systemId),
    // The time floor only applies when the caller provided one.
    ...(since
      ? [gte(healthCheckUnhealthyTransitions.transitionedAt, since)]
      : []),
  ];

  const newestFirst = await db
    .select({
      transitionedAt: healthCheckUnhealthyTransitions.transitionedAt,
    })
    .from(healthCheckUnhealthyTransitions)
    .where(and(...filters))
    .orderBy(desc(healthCheckUnhealthyTransitions.transitionedAt))
    .limit(1);

  return newestFirst[0]?.transitionedAt;
}
150
+
151
/**
 * Looks for an auto-incident mapping on the system that is still open,
 * i.e. whose `closedAt` is NULL. The lookup is keyed on `systemId` only.
 * Callers use it to avoid opening a second incident while one is active.
 *
 * @param db - Database handle to query.
 * @param systemId - System to match.
 * @returns The mapping row's `id` and `incidentId`, or `undefined` when
 *   no open mapping exists.
 */
export async function findActiveAutoIncident({
  db,
  systemId,
}: {
  db: Db;
  systemId: string;
}): Promise<{ id: string; incidentId: string } | undefined> {
  const stillOpenForSystem = and(
    eq(healthCheckAutoIncidents.systemId, systemId),
    isNull(healthCheckAutoIncidents.closedAt),
  );

  const [active] = await db
    .select({
      id: healthCheckAutoIncidents.id,
      incidentId: healthCheckAutoIncidents.incidentId,
    })
    .from(healthCheckAutoIncidents)
    .where(stillOpenForSystem)
    .limit(1);

  return active;
}
178
+
179
/**
 * Returns the latest `closedAt` among the closed auto-incident mappings
 * for one system + configuration pair. Re-opens are gated on this value
 * as part of a "must recover first" rule.
 *
 * @param db - Database handle to query.
 * @param systemId - System to match.
 * @param configurationId - Health check configuration to match.
 * @returns The most recent close time, or `undefined` when no closed
 *   mapping exists for the pair.
 */
export async function findLastAutoIncidentClose({
  db,
  systemId,
  configurationId,
}: {
  db: Db;
  systemId: string;
  configurationId: string;
}): Promise<Date | undefined> {
  const closedForAssignment = and(
    eq(healthCheckAutoIncidents.systemId, systemId),
    eq(healthCheckAutoIncidents.configurationId, configurationId),
    isNotNull(healthCheckAutoIncidents.closedAt),
  );

  const latestFirst = await db
    .select({ closedAt: healthCheckAutoIncidents.closedAt })
    .from(healthCheckAutoIncidents)
    .where(closedForAssignment)
    .orderBy(desc(healthCheckAutoIncidents.closedAt))
    .limit(1);

  // Collapse both "no row" and a null column value to undefined.
  const closedAt = latestFirst[0]?.closedAt;
  return closedAt == null ? undefined : closedAt;
}
208
+
209
/**
 * Reports whether at least one run with status `healthy` exists for the
 * check with a timestamp at or after `since`. Used to confirm a real
 * recovery happened after the last auto-incident close before another
 * auto-incident may open.
 *
 * @param db - Database handle to query.
 * @param systemId - System to match.
 * @param configurationId - Health check configuration to match.
 * @param since - Inclusive floor for the run timestamp.
 * @returns `true` when a matching run exists, `false` otherwise.
 */
export async function hasHealthyRunSince({
  db,
  systemId,
  configurationId,
  since,
}: {
  db: Db;
  systemId: string;
  configurationId: string;
  since: Date;
}): Promise<boolean> {
  const healthyRunInRange = and(
    eq(healthCheckRuns.systemId, systemId),
    eq(healthCheckRuns.configurationId, configurationId),
    eq(healthCheckRuns.status, "healthy"),
    gte(healthCheckRuns.timestamp, since),
  );

  // One row is enough to answer the question, hence the limit.
  const matches = await db
    .select({ id: healthCheckRuns.id })
    .from(healthCheckRuns)
    .where(healthyRunInRange)
    .limit(1);

  return Boolean(matches[0]);
}
241
+
242
/**
 * Asks the maintenance client whether the system is currently inside a
 * maintenance window that suppresses notifications.
 *
 * Any error from that call is logged as a warning and treated as "not
 * suppressed", so an outage of the maintenance service cannot block
 * legitimate incidents.
 *
 * @param maintenanceClient - Client exposing
 *   `hasActiveMaintenanceWithSuppression`.
 * @param systemId - System to check.
 * @param logger - Receives the warning when the lookup fails.
 * @returns The `suppressed` flag from the client, or `false` on error.
 */
export async function isMaintenanceSuppressed({
  maintenanceClient,
  systemId,
  logger,
}: {
  maintenanceClient: MaintenanceClient;
  systemId: string;
  logger: Logger;
}): Promise<boolean> {
  try {
    const response =
      await maintenanceClient.hasActiveMaintenanceWithSuppression({ systemId });
    return response.suppressed;
  } catch (error) {
    // Fail open: prefer a possibly redundant incident over a missed one.
    logger.warn(
      `Failed to check maintenance for ${systemId} during auto-incident decision; assuming not suppressed:`,
      error,
    );
    return false;
  }
}
268
+
269
/**
 * Open an auto-incident through the incident client and persist the
 * incident-to-check mapping so the auto-close worker can find and
 * resolve it later.
 *
 * The whole operation is best-effort: every failure, including the
 * initial lookup for an already-active auto-incident, is logged as a
 * warning and reported as `undefined` instead of being thrown, so the
 * surrounding health-check flow is never interrupted.
 *
 * @param db - Database handle used for the lookup and the mapping insert.
 * @param incidentClient - Client exposing `createAutoIncident`.
 * @param logger - Receives the success info line and failure warnings.
 * @param systemId - System the incident is opened for.
 * @param systemName - Used in the incident title.
 * @param configurationId - Health check configuration that triggered.
 * @param configurationName - Used in the description and first message.
 * @param policy - Supplies `useNotificationSuppression` and
 *   `autoCloseAfterMinutes` (stored as the mapping's `cooldownMinutes`).
 * @param reason - Short human-readable phrase explaining the trigger.
 * @returns The id of the already-active incident when one exists for
 *   the system (no new incident is created), the id of the newly opened
 *   incident on success, or `undefined` when any step failed.
 */
export async function openAutoIncident({
  db,
  incidentClient,
  logger,
  systemId,
  systemName,
  configurationId,
  configurationName,
  policy,
  reason,
}: {
  db: Db;
  incidentClient: IncidentClient;
  logger: Logger;
  systemId: string;
  systemName: string;
  configurationId: string;
  configurationName: string;
  policy: NotificationPolicy;
  /** Short human-readable phrase for the incident description. */
  reason: string;
}): Promise<{ incidentId: string } | undefined> {
  try {
    // The duplicate check lives inside the try so a failing lookup is
    // handled as best-effort like every other step.
    const existing = await findActiveAutoIncident({ db, systemId });
    if (existing) {
      return { incidentId: existing.incidentId };
    }

    const { id: incidentId } = await incidentClient.createAutoIncident({
      title: `${systemName} is critical`,
      description: `Auto-opened by health check **${configurationName}** (${reason}).`,
      severity: "critical",
      suppressNotifications: policy.useNotificationSuppression,
      systemIds: [systemId],
      initialMessage: `Health check \`${configurationName}\` triggered the auto-incident: ${reason}.`,
    });

    try {
      await db.insert(healthCheckAutoIncidents).values({
        incidentId,
        systemId,
        configurationId,
        cooldownMinutes: policy.autoCloseAfterMinutes,
      });
    } catch (error) {
      // The incident now exists without a mapping row. Name it in the
      // log so it can be reconciled by hand; a generic "failed to open"
      // message would hide that an incident was in fact created.
      logger.warn(
        `Auto-incident ${incidentId} was created for system ${systemId} (check ${configurationId}) but its mapping could not be persisted:`,
        error,
      );
      return undefined;
    }

    logger.info(
      `Auto-opened incident ${incidentId} for system ${systemId} (check ${configurationId}; ${reason})`,
    );
    return { incidentId };
  } catch (error) {
    // Auto-incident creation is best-effort — failure here shouldn't
    // block the rest of the health-check flow.
    logger.warn(
      `Failed to open auto-incident for system ${systemId} (check ${configurationId}):`,
      error,
    );
    return undefined;
  }
}
@@ -0,0 +1,255 @@
1
+ /**
2
+ * Behaviour tests for the healthcheck automation triggers + actions.
3
+ */
4
+ import { describe, expect, it, mock } from "bun:test";
5
+ import type { Logger } from "@checkstack/backend-api";
6
+ import type { QueueManager } from "@checkstack/queue-api";
7
+ import { createMockLogger } from "@checkstack/test-utils-backend";
8
+
9
+ import {
10
+ assignmentArtifactType,
11
+ checkFailedTrigger,
12
+ createHealthCheckActions,
13
+ flappingDetectedTrigger,
14
+ healthCheckTriggers,
15
+ systemDegradedTrigger,
16
+ systemHealthChangedTrigger,
17
+ systemHealthyTrigger,
18
+ } from "./automations";
19
+ import { healthCheckHooks } from "./hooks";
20
+ import type { HealthCheckService } from "./service";
21
+
22
// Shared mock logger handed to every action context in this file.
const logger = createMockLogger() as Logger;

// Context fields common to every `execute` call below; each test spreads
// this and adds its own `consumedArtifacts` and `config`.
const ctxBase = {
  runId: "run-1",
  automationId: "auto-1",
  contextKey: null,
  logger,
  // Throws so an unexpected service lookup fails the test loudly.
  async getService<T>(): Promise<T> {
    throw new Error("not used");
  },
};
33
+
34
describe("healthcheck triggers", () => {
  type RegisteredTrigger = (typeof healthCheckTriggers)[number];

  it("exposes five triggers in a stable order", () => {
    // Position in this list is the position expected in the registry.
    const expectedOrder = [
      systemDegradedTrigger as RegisteredTrigger,
      systemHealthyTrigger as RegisteredTrigger,
      systemHealthChangedTrigger as RegisteredTrigger,
      checkFailedTrigger as RegisteredTrigger,
      flappingDetectedTrigger as RegisteredTrigger,
    ];

    expect(healthCheckTriggers).toHaveLength(5);
    expectedOrder.forEach((trigger, index) => {
      expect(healthCheckTriggers[index]).toBe(trigger);
    });
  });

  it("validates checkFailed payload and extracts systemId", () => {
    const payload = {
      systemId: "sys-1",
      configurationId: "cfg-1",
      status: "unhealthy",
      timestamp: "2026-05-29T12:00:00Z",
    } as const;

    const parsed = checkFailedTrigger.payloadSchema.safeParse(payload);
    expect(parsed.success).toBe(true);

    const key = checkFailedTrigger.contextKey?.(payload);
    expect(key).toBe("sys-1");
  });

  it("validates flappingDetected payload and requires transitionCount + windowMinutes", () => {
    // Fields every flapping payload carries; the counters are added per case.
    const identity = {
      systemId: "sys-1",
      configurationId: "cfg-1",
      timestamp: "2026-05-29T12:00:00Z",
    };

    const complete = flappingDetectedTrigger.payloadSchema.safeParse({
      ...identity,
      transitionCount: 5,
      windowMinutes: 10,
    });
    expect(complete.success).toBe(true);

    const withoutCounters =
      flappingDetectedTrigger.payloadSchema.safeParse(identity);
    expect(withoutCounters.success).toBe(false);
  });

  it("extracts systemId as the contextKey on all three", () => {
    const healthyPayload = {
      systemId: "sys-1",
      previousStatus: "degraded",
      healthyChecks: 2,
      totalChecks: 2,
      timestamp: "2026-05-29T11:00:00Z",
    } as const;
    // The degraded and health-changed triggers are fed the same payload.
    const statusChangePayload = {
      systemId: "sys-1",
      previousStatus: "healthy",
      newStatus: "degraded",
      healthyChecks: 1,
      totalChecks: 2,
      timestamp: "2026-05-29T11:00:00Z",
    } as const;

    const degradedKey = systemDegradedTrigger.contextKey?.(statusChangePayload);
    expect(degradedKey).toBe("sys-1");

    const healthyKey = systemHealthyTrigger.contextKey?.(healthyPayload);
    expect(healthyKey).toBe("sys-1");

    const changedKey =
      systemHealthChangedTrigger.contextKey?.(statusChangePayload);
    expect(changedKey).toBe("sys-1");
  });
});
113
+
114
describe("assignmentArtifactType", () => {
  it("validates the canonical assignment artifact", () => {
    // Smallest artifact the schema is expected to accept.
    const canonicalArtifact = {
      systemId: "sys-1",
      configurationId: "cfg-1",
      enabled: true,
    };

    const { success } = assignmentArtifactType.schema.safeParse(canonicalArtifact);
    expect(success).toBe(true);
  });
});
124
+
125
/**
 * Builds a partial `HealthCheckService` double that implements only
 * `setAssignmentEnabled`. The underlying mock is also exposed as
 * `setMock` so tests can assert on its calls.
 *
 * @param args.setAssignmentEnabledReturn - Value the mocked
 *   `setAssignmentEnabled` resolves to; defaults to `true`.
 */
function makeService(args: {
  setAssignmentEnabledReturn?: boolean;
}): HealthCheckService & { setMock: ReturnType<typeof mock> } {
  const setAssignmentEnabledImpl = async (
    _sysId: string,
    _cfgId: string,
    _enabled: boolean,
  ) => args.setAssignmentEnabledReturn ?? true;

  const setMock = mock(setAssignmentEnabledImpl);

  // Only the members the actions call are provided, hence the cast.
  const double = { setAssignmentEnabled: setMock, setMock };
  return double as unknown as HealthCheckService & {
    setMock: ReturnType<typeof mock>;
  };
}
137
+
138
/** A queue-manager double paired with the mock that records enqueues. */
interface QueueEnqueueRecorder {
  queueManager: QueueManager;
  enqueueMock: ReturnType<typeof mock>;
}

/**
 * Builds a `QueueManager` double whose `getQueue()` always returns the
 * same queue object. That queue implements only `enqueue`, which
 * resolves to `"job-id"` and is exposed as `enqueueMock` for assertions.
 */
function makeQueueManager(): QueueEnqueueRecorder {
  const enqueueMock = mock(async (_payload: unknown) => "job-id");

  // Other queue methods aren't exercised by the action.
  const queue = { enqueue: enqueueMock };

  const queueManager = { getQueue: () => queue } as unknown as QueueManager;

  return { queueManager, enqueueMock };
}
154
+
155
describe("healthcheck.run_now", () => {
  it("enqueues a one-off job and emits an enqueued=true artifact", async () => {
    const { queueManager, enqueueMock } = makeQueueManager();
    const emitHook = mock(async (_hook: unknown, _payload: unknown) => {});
    const service = makeService({});

    // run_now is the first action returned by the factory.
    const actions = createHealthCheckActions({
      service,
      queueManager,
      emitHook: emitHook as never,
    });
    const runNow = actions[0];

    const target = { systemId: "sys-1", configurationId: "cfg-1" };
    const result = await runNow!.execute({
      ...ctxBase,
      consumedArtifacts: {},
      config: target as never,
    });

    expect(result.success).toBe(true);
    if (!result.success) return;

    expect(result.externalId).toBe("sys-1:cfg-1");

    expect(enqueueMock).toHaveBeenCalledTimes(1);
    const enqueuedPayload = enqueueMock.mock.calls[0]![0];
    expect(enqueuedPayload).toEqual({
      configId: "cfg-1",
      systemId: "sys-1",
    });

    // run_now doesn't mutate any DB row → no hook to emit.
    expect(emitHook).not.toHaveBeenCalled();
  });
});
184
+
185
describe("healthcheck.enable_assignment", () => {
  /**
   * Wires the enable action (second entry returned by the factory) to a
   * service double whose `setAssignmentEnabled` resolves to `updated`.
   */
  function setUpEnable(updated: boolean) {
    const service = makeService({ setAssignmentEnabledReturn: updated });
    const { queueManager } = makeQueueManager();
    const emitHook = mock(async (_hook: unknown, _payload: unknown) => {});
    const actions = createHealthCheckActions({
      service,
      queueManager,
      emitHook: emitHook as never,
    });
    return { service, emitHook, enable: actions[1] };
  }

  it("flips enabled=true on the existing row, fires assignmentChanged, and emits the artifact", async () => {
    const { service, emitHook, enable } = setUpEnable(true);

    const result = await enable!.execute({
      ...ctxBase,
      consumedArtifacts: {},
      config: { systemId: "sys-1", configurationId: "cfg-1" } as never,
    });

    expect(result.success).toBe(true);
    if (!result.success) return;

    const artifact = result.artifact as { enabled: boolean };
    expect(artifact.enabled).toBe(true);
    expect(service.setMock).toHaveBeenCalledWith("sys-1", "cfg-1", true);

    // Exactly one hook emission, and it is the assignmentChanged hook.
    expect(emitHook).toHaveBeenCalledTimes(1);
    const emittedHook = emitHook.mock.calls[0]![0];
    expect(emittedHook).toBe(healthCheckHooks.assignmentChanged);
  });

  it("returns failure when the assignment row does not exist", async () => {
    // A `false` return from the service is how a missing row is simulated.
    const { emitHook, enable } = setUpEnable(false);

    const result = await enable!.execute({
      ...ctxBase,
      consumedArtifacts: {},
      config: { systemId: "sys-1", configurationId: "missing" } as never,
    });

    expect(result.success).toBe(false);
    if (result.success) return;

    expect(result.error).toMatch(/Assignment not found/);
    expect(emitHook).not.toHaveBeenCalled();
  });
});
232
+
233
describe("healthcheck.disable_assignment", () => {
  it("flips enabled=false on the existing row and emits the artifact", async () => {
    const { queueManager } = makeQueueManager();
    const emitHook = mock(async (_hook: unknown, _payload: unknown) => {});
    const service = makeService({ setAssignmentEnabledReturn: true });

    // The disable action is the third entry returned by the factory.
    const actions = createHealthCheckActions({
      service,
      queueManager,
      emitHook: emitHook as never,
    });
    const disable = actions[2];

    const target = { systemId: "sys-1", configurationId: "cfg-1" };
    const result = await disable!.execute({
      ...ctxBase,
      consumedArtifacts: {},
      config: target as never,
    });

    expect(result.success).toBe(true);
    if (!result.success) return;

    const artifact = result.artifact as { enabled: boolean };
    expect(artifact.enabled).toBe(false);
    expect(service.setMock).toHaveBeenCalledWith("sys-1", "cfg-1", false);
  });
});