@checkstack/healthcheck-backend 1.3.0 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +409 -0
- package/drizzle/0015_quiet_meggan.sql +12 -0
- package/drizzle/0016_complex_maginty.sql +1 -0
- package/drizzle/0017_pretty_caretaker.sql +1 -0
- package/drizzle/meta/0015_snapshot.json +764 -0
- package/drizzle/meta/0016_snapshot.json +644 -0
- package/drizzle/meta/0017_snapshot.json +563 -0
- package/drizzle/meta/_journal.json +21 -0
- package/package.json +24 -21
- package/src/automations.test.ts +6 -27
- package/src/automations.ts +32 -30
- package/src/collector-script-test.test.ts +236 -0
- package/src/collector-script-test.ts +221 -0
- package/src/health-entity.test.ts +694 -0
- package/src/health-entity.ts +367 -0
- package/src/health-state.test.ts +115 -0
- package/src/health-state.ts +333 -0
- package/src/healthcheck-gitops-kinds.test.ts +6 -32
- package/src/healthcheck-gitops-kinds.ts +4 -19
- package/src/hooks.test.ts +19 -6
- package/src/hooks.ts +13 -68
- package/src/index.ts +118 -48
- package/src/queue-executor.test.ts +13 -0
- package/src/queue-executor.ts +251 -444
- package/src/retention-job.ts +65 -1
- package/src/retention-state-transitions.test.ts +49 -0
- package/src/router.test.ts +13 -0
- package/src/router.ts +44 -0
- package/src/schema.ts +34 -54
- package/src/service-notification-policy.test.ts +28 -71
- package/src/service.ts +89 -0
- package/src/state-evaluator.test.ts +50 -5
- package/src/state-evaluator.ts +9 -2
- package/src/state-transitions.test.ts +126 -0
- package/src/state-transitions.ts +112 -0
- package/tsconfig.json +9 -0
- package/src/auto-incident-close-job.ts +0 -164
- package/src/auto-incident.test.ts +0 -196
- package/src/auto-incident.ts +0 -332
|
@@ -0,0 +1,333 @@
|
|
|
1
|
+
import { and, desc, eq, gte } from "drizzle-orm";
|
|
2
|
+
import type { HealthCheckStatus } from "@checkstack/healthcheck-common";
|
|
3
|
+
import type { Logger, SafeDatabase } from "@checkstack/backend-api";
|
|
4
|
+
import type { InferClient } from "@checkstack/common";
|
|
5
|
+
import { MaintenanceApi } from "@checkstack/maintenance-common";
|
|
6
|
+
import { healthCheckAggregates, healthCheckRuns } from "./schema";
|
|
7
|
+
import * as schema from "./schema";
|
|
8
|
+
import {
|
|
9
|
+
countStateTransitionsInWindow,
|
|
10
|
+
findInStatusSince,
|
|
11
|
+
} from "./state-transitions";
|
|
12
|
+
|
|
13
|
+
/** Database handle typed against this package's `./schema` module. */
type Db = SafeDatabase<typeof schema>;

/** Client type inferred from the `MaintenanceApi` contract. */
type MaintenanceClient = InferClient<typeof MaintenanceApi>;
|
|
15
|
+
|
|
16
|
+
/**
 * Service-typed snapshot of a single system's live health. The automation
 * sensing layer (Wave 2) reads this contract to answer "is this system
 * unhealthy, and for how long?" instead of recomputing the math on each read.
 */
export interface HealthState {
  /** Status aggregated over every enabled check. */
  status: HealthCheckStatus;

  /**
   * Instant at which the system last entered `status`, or null if no
   * transition has been recorded so far (fail-safe: never throws).
   */
  inStatusSince: Date | null;

  /**
   * How long, in milliseconds, the system has stayed in `status` without
   * interruption. 0 whenever `inStatusSince` is unknown.
   */
  inStatusForMs: number;

  /** Latency of the most recent run, when one exists. */
  latencyMs?: number;

  /** Average latency over the window, derived from recent aggregate buckets. */
  avgLatencyMs?: number;

  /** p95 latency over the window, derived from recent aggregate buckets. */
  p95LatencyMs?: number;

  /** Success rate over the window (healthy / total), in [0, 1], from buckets. */
  successRate?: number;

  /** Timestamp of the most recent run, when one exists. */
  lastRunAt?: Date;

  /** True while the system is inside a maintenance window. */
  inMaintenance: boolean;

  /**
   * Number of aggregate status transitions within the trailing
   * `transitionWindowMinutes` window. Generalizes flapping detection: an
   * automation can gate on "N status changes in M minutes".
   */
  transitionsInWindow: number;

  /** Length, in minutes, of the window `transitionsInWindow` was counted over. */
  transitionWindowMinutes: number;

  /** Instant at which this snapshot was computed. */
  evaluatedAt: Date;
}
|
|
58
|
+
|
|
59
|
+
/**
 * Already-resolved values handed to the pure builder, so that assembling a
 * snapshot does not depend on the DB layer.
 */
export interface HealthStateInputs {
  /** Resolved status to report. */
  status: HealthCheckStatus;
  /** When that status was entered, or null when unknown. */
  inStatusSince: Date | null;
  /** Latency of the newest run, if any. */
  latencyMs?: number;
  /** Windowed average latency, if available. */
  avgLatencyMs?: number;
  /** Windowed p95 latency, if available. */
  p95LatencyMs?: number;
  /** Windowed success rate, if available. */
  successRate?: number;
  /** Timestamp of the newest run, if any. */
  lastRunAt?: Date;
  /** Whether the system is currently in a maintenance window. */
  inMaintenance: boolean;
  /** Transition count over the trailing window. */
  transitionsInWindow: number;
  /** Length (minutes) of that trailing window. */
  transitionWindowMinutes: number;
  /** Evaluation instant used as the reference point for durations. */
  now: Date;
}
|
|
73
|
+
|
|
74
|
+
/**
 * Trailing window, in minutes, over which status transitions are counted
 * when the caller does not specify one.
 */
export const DEFAULT_TRANSITION_WINDOW_MINUTES = 60;
|
|
76
|
+
|
|
77
|
+
/**
 * Pure assembler for a {@link HealthState}; performs no I/O.
 *
 * `inStatusForMs` is `now - inStatusSince`, clamped at 0 so clock skew (an
 * `inStatusSince` later than `now`) never yields a negative duration. A null
 * `inStatusSince`, or an invalid `Date` on either side of the subtraction,
 * also yields 0 rather than `NaN`.
 *
 * @param inputs - Pre-resolved status, metrics and the evaluation instant.
 * @returns The snapshot, with `evaluatedAt` set to `inputs.now`.
 */
export function buildHealthState(inputs: HealthStateInputs): HealthState {
  const { inStatusSince, now } = inputs;

  // `Math.max(0, NaN)` is `NaN`, so a non-finite difference (invalid Date)
  // has to be filtered out before clamping.
  const elapsedMs = inStatusSince
    ? now.getTime() - inStatusSince.getTime()
    : 0;
  const inStatusForMs = Number.isFinite(elapsedMs)
    ? Math.max(0, elapsedMs)
    : 0;

  return {
    status: inputs.status,
    inStatusSince,
    inStatusForMs,
    latencyMs: inputs.latencyMs,
    avgLatencyMs: inputs.avgLatencyMs,
    p95LatencyMs: inputs.p95LatencyMs,
    successRate: inputs.successRate,
    lastRunAt: inputs.lastRunAt,
    inMaintenance: inputs.inMaintenance,
    transitionsInWindow: inputs.transitionsInWindow,
    transitionWindowMinutes: inputs.transitionWindowMinutes,
    evaluatedAt: now,
  };
}
|
|
116
|
+
|
|
117
|
+
/**
 * Look up the most recent run recorded for a system and report its latency
 * and timestamp. When `configurationId` is given, only runs of that check are
 * considered. Resolves to an empty object when no matching run exists.
 */
export async function findLatestRun({
  db,
  systemId,
  configurationId,
}: {
  db: Db;
  systemId: string;
  configurationId?: string;
}): Promise<{ latencyMs?: number; lastRunAt?: Date }> {
  const bySystem = eq(healthCheckRuns.systemId, systemId);
  const filters = configurationId
    ? [bySystem, eq(healthCheckRuns.configurationId, configurationId)]
    : [bySystem];

  // Newest first, one row: the head of the result is the latest run.
  const [newest] = await db
    .select({
      latencyMs: healthCheckRuns.latencyMs,
      timestamp: healthCheckRuns.timestamp,
    })
    .from(healthCheckRuns)
    .where(and(...filters))
    .orderBy(desc(healthCheckRuns.timestamp))
    .limit(1);

  if (!newest) {
    return {};
  }

  // A null latency column is surfaced as an absent (undefined) field.
  const latencyMs = newest.latencyMs ?? undefined;
  return { latencyMs, lastRunAt: newest.timestamp };
}
|
|
151
|
+
|
|
152
|
+
/**
 * How many hours of aggregate buckets are folded into the windowed metrics
 * when the caller does not specify a window.
 */
const DEFAULT_METRICS_WINDOW_HOURS = 24;
|
|
154
|
+
|
|
155
|
+
/**
 * Load the hourly aggregate buckets whose `bucketStart` falls within the
 * trailing `windowHours` before `now` and reduce them to windowed metrics
 * (average latency, p95 latency, success rate). When `configurationId` is
 * given, only that check's buckets are read. All fields are undefined when
 * the window contains no buckets.
 */
export async function computeWindowedMetrics({
  db,
  systemId,
  configurationId,
  now = new Date(),
  windowHours = DEFAULT_METRICS_WINDOW_HOURS,
}: {
  db: Db;
  systemId: string;
  configurationId?: string;
  now?: Date;
  windowHours?: number;
}): Promise<{
  avgLatencyMs?: number;
  p95LatencyMs?: number;
  successRate?: number;
}> {
  const msPerHour = 3_600_000;
  const windowStart = new Date(now.getTime() - windowHours * msPerHour);

  const filters = [
    eq(healthCheckAggregates.systemId, systemId),
    eq(healthCheckAggregates.bucketSize, "hourly"),
    gte(healthCheckAggregates.bucketStart, windowStart),
  ];
  if (configurationId) {
    filters.push(eq(healthCheckAggregates.configurationId, configurationId));
  }

  const rows = await db
    .select({
      runCount: healthCheckAggregates.runCount,
      healthyCount: healthCheckAggregates.healthyCount,
      latencySumMs: healthCheckAggregates.latencySumMs,
      p95LatencyMs: healthCheckAggregates.p95LatencyMs,
    })
    .from(healthCheckAggregates)
    .where(and(...filters));

  return aggregateWindowedMetrics(rows);
}
|
|
201
|
+
|
|
202
|
+
/**
 * Pure reduction of aggregate buckets into windowed metrics.
 *
 * - `avgLatencyMs`: sum of `latencySumMs` divided by the summed `runCount` of
 *   the buckets that carry a latency sum, rounded to the nearest integer.
 * - `p95LatencyMs`: the largest per-bucket p95 (a conservative upper bound
 *   that avoids re-merging t-digests).
 * - `successRate`: total healthy runs / total runs across all buckets.
 *
 * The input is only read, never mutated, so readonly/frozen arrays are
 * accepted.
 *
 * @param buckets - Aggregate rows to fold; may be empty.
 * @returns An empty object for an empty input; otherwise an object whose
 *   fields are undefined wherever the buckets carry no data for that metric.
 */
export function aggregateWindowedMetrics(
  buckets: ReadonlyArray<{
    runCount: number;
    healthyCount: number;
    latencySumMs: number | null;
    p95LatencyMs: number | null;
  }>,
): {
  avgLatencyMs?: number;
  p95LatencyMs?: number;
  successRate?: number;
} {
  if (buckets.length === 0) return {};

  let totalRuns = 0;
  let totalHealthy = 0;
  let latencySum = 0;
  // Runs from buckets that actually report a latency sum; buckets without
  // one must not dilute the average.
  let latencyRuns = 0;
  let maxP95: number | undefined;

  for (const bucket of buckets) {
    totalRuns += bucket.runCount;
    totalHealthy += bucket.healthyCount;

    if (bucket.latencySumMs != null) {
      latencySum += bucket.latencySumMs;
      latencyRuns += bucket.runCount;
    }

    if (bucket.p95LatencyMs != null) {
      maxP95 =
        maxP95 == null
          ? bucket.p95LatencyMs
          : Math.max(maxP95, bucket.p95LatencyMs);
    }
  }

  return {
    avgLatencyMs:
      latencyRuns > 0 ? Math.round(latencySum / latencyRuns) : undefined,
    p95LatencyMs: maxP95,
    successRate: totalRuns > 0 ? totalHealthy / totalRuns : undefined,
  };
}
|
|
247
|
+
|
|
248
|
+
/**
 * Ask the maintenance client whether the system currently has an active
 * maintenance window (suppression-agnostic). Resolves to `false` when no
 * client is supplied. Fails open: a client error is logged as a warning and
 * treated as "not in maintenance", so a maintenance-plugin outage never
 * wedges health-state reads.
 */
async function resolveInMaintenance({
  maintenanceClient,
  systemId,
  logger,
}: {
  maintenanceClient: MaintenanceClient | undefined;
  systemId: string;
  logger?: Logger;
}): Promise<boolean> {
  if (!maintenanceClient) {
    return false;
  }

  try {
    const response = await maintenanceClient.hasActiveMaintenance({ systemId });
    return response.active;
  } catch (error) {
    const message = `Failed to resolve maintenance state for ${systemId}; assuming not in maintenance:`;
    logger?.warn(message, error);
    return false;
  }
}
|
|
276
|
+
|
|
277
|
+
/**
 * Build the complete {@link HealthState} for one system.
 *
 * The status is resolved first through the supplied `resolveStatus` callback.
 * The remaining inputs are then fetched concurrently: in-status-since (from
 * the transitions helper), the latest run, windowed metrics, maintenance
 * state, and the transition count over the trailing window. `now` is
 * threaded through explicitly so callers can keep a stable evaluation
 * timestamp.
 */
export async function computeHealthState({
  db,
  systemId,
  configurationId,
  resolveStatus,
  maintenanceClient,
  logger,
  transitionWindowMinutes = DEFAULT_TRANSITION_WINDOW_MINUTES,
  now = new Date(),
}: {
  db: Db;
  systemId: string;
  configurationId?: string;
  /** Returns the aggregate status for the system (per-check when scoped). */
  resolveStatus: () => Promise<HealthCheckStatus>;
  maintenanceClient?: MaintenanceClient;
  logger?: Logger;
  /** Trailing window (minutes) for the transition count. */
  transitionWindowMinutes?: number;
  now?: Date;
}): Promise<HealthState> {
  // The in-status-since lookup depends on the status, so resolve it up front.
  const status = await resolveStatus();

  // Start every independent lookup before awaiting any of them.
  const sincePromise = findInStatusSince({ db, systemId, status });
  const latestRunPromise = findLatestRun({ db, systemId, configurationId });
  const metricsPromise = computeWindowedMetrics({
    db,
    systemId,
    configurationId,
    now,
  });
  const maintenancePromise = resolveInMaintenance({
    maintenanceClient,
    systemId,
    logger,
  });
  const transitionsPromise = countStateTransitionsInWindow({
    db,
    systemId,
    windowMinutes: transitionWindowMinutes,
    now,
  });

  const [inStatusSince, latestRun, metrics, inMaintenance, transitionsInWindow] =
    await Promise.all([
      sincePromise,
      latestRunPromise,
      metricsPromise,
      maintenancePromise,
      transitionsPromise,
    ]);

  return buildHealthState({
    status,
    inStatusSince,
    latencyMs: latestRun.latencyMs,
    avgLatencyMs: metrics.avgLatencyMs,
    p95LatencyMs: metrics.p95LatencyMs,
    successRate: metrics.successRate,
    lastRunAt: latestRun.lastRunAt,
    inMaintenance,
    transitionsInWindow,
    transitionWindowMinutes,
    now,
  });
}
|
|
@@ -40,19 +40,6 @@ interface MockAssociation {
|
|
|
40
40
|
enabled: boolean;
|
|
41
41
|
notificationPolicy?: {
|
|
42
42
|
suppressDeEscalations: boolean;
|
|
43
|
-
autoOpenIncidentOnUnhealthy: boolean;
|
|
44
|
-
useNotificationSuppression: boolean;
|
|
45
|
-
skipDuringMaintenance: boolean;
|
|
46
|
-
sustainedUnhealthyTrigger: {
|
|
47
|
-
enabled: boolean;
|
|
48
|
-
durationMinutes: number;
|
|
49
|
-
};
|
|
50
|
-
flappingTrigger: {
|
|
51
|
-
enabled: boolean;
|
|
52
|
-
transitions: number;
|
|
53
|
-
windowMinutes: number;
|
|
54
|
-
};
|
|
55
|
-
autoCloseAfterMinutes: number | null;
|
|
56
43
|
};
|
|
57
44
|
}
|
|
58
45
|
|
|
@@ -657,12 +644,11 @@ describe("Healthcheck GitOps Kind: System Extension", () => {
|
|
|
657
644
|
extensionSpec: [
|
|
658
645
|
{
|
|
659
646
|
ref: { kind: "Healthcheck", name: "db-check" },
|
|
660
|
-
// Operator
|
|
661
|
-
//
|
|
662
|
-
//
|
|
647
|
+
// Operator sets the one surviving policy field; everything else
|
|
648
|
+
// should default in via the schema parse. Flapping thresholds are
|
|
649
|
+
// no longer part of the policy — they live on the trigger config.
|
|
663
650
|
notificationPolicy: {
|
|
664
|
-
|
|
665
|
-
autoCloseAfterMinutes: null,
|
|
651
|
+
suppressDeEscalations: true,
|
|
666
652
|
},
|
|
667
653
|
},
|
|
668
654
|
],
|
|
@@ -672,20 +658,8 @@ describe("Healthcheck GitOps Kind: System Extension", () => {
|
|
|
672
658
|
|
|
673
659
|
const policy = mockService.associations[0]?.notificationPolicy;
|
|
674
660
|
expect(policy).toBeDefined();
|
|
675
|
-
expect(policy?.suppressDeEscalations).toBe(
|
|
676
|
-
expect(policy
|
|
677
|
-
expect(policy?.useNotificationSuppression).toBe(true);
|
|
678
|
-
expect(policy?.skipDuringMaintenance).toBe(true);
|
|
679
|
-
expect(policy?.sustainedUnhealthyTrigger).toEqual({
|
|
680
|
-
enabled: true,
|
|
681
|
-
durationMinutes: 30,
|
|
682
|
-
});
|
|
683
|
-
expect(policy?.flappingTrigger).toEqual({
|
|
684
|
-
enabled: true,
|
|
685
|
-
transitions: 5,
|
|
686
|
-
windowMinutes: 60,
|
|
687
|
-
});
|
|
688
|
-
expect(policy?.autoCloseAfterMinutes).toBeNull();
|
|
661
|
+
expect(policy?.suppressDeEscalations).toBe(true);
|
|
662
|
+
expect(Object.keys(policy ?? {})).toEqual(["suppressDeEscalations"]);
|
|
689
663
|
});
|
|
690
664
|
|
|
691
665
|
it("omits notificationPolicy entirely when the spec doesn't set it", async () => {
|
|
@@ -85,26 +85,11 @@ const systemHealthcheckExtensionSchema = z
|
|
|
85
85
|
/**
|
|
86
86
|
* Per-assignment notification policy. Any field omitted falls
|
|
87
87
|
* back to the platform default (see `DEFAULT_NOTIFICATION_POLICY`).
|
|
88
|
-
*
|
|
89
|
-
*
|
|
88
|
+
* Flapping thresholds moved onto the automation engine's windowed-count
|
|
89
|
+
* gate (the `system_health_changed` trigger's `window` block) and are no
|
|
90
|
+
* longer accepted here.
|
|
90
91
|
*/
|
|
91
|
-
notificationPolicy: NotificationPolicySchema.partial()
|
|
92
|
-
.extend({
|
|
93
|
-
sustainedUnhealthyTrigger: z
|
|
94
|
-
.object({
|
|
95
|
-
enabled: z.boolean().optional(),
|
|
96
|
-
durationMinutes: z.number().int().min(1).optional(),
|
|
97
|
-
})
|
|
98
|
-
.optional(),
|
|
99
|
-
flappingTrigger: z
|
|
100
|
-
.object({
|
|
101
|
-
enabled: z.boolean().optional(),
|
|
102
|
-
transitions: z.number().int().min(1).optional(),
|
|
103
|
-
windowMinutes: z.number().int().min(1).optional(),
|
|
104
|
-
})
|
|
105
|
-
.optional(),
|
|
106
|
-
})
|
|
107
|
-
.optional(),
|
|
92
|
+
notificationPolicy: NotificationPolicySchema.partial().optional(),
|
|
108
93
|
}),
|
|
109
94
|
)
|
|
110
95
|
.optional();
|
package/src/hooks.test.ts
CHANGED
|
@@ -2,15 +2,28 @@ import { describe, it, expect } from "bun:test";
|
|
|
2
2
|
import { healthCheckHooks } from "./hooks";
|
|
3
3
|
|
|
4
4
|
describe("Health Check Hooks", () => {
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
5
|
+
// The directional/umbrella system-health hooks were removed in Phase 4
|
|
6
|
+
// (§10.3) — the `health` entity drives those events now. The remaining
|
|
7
|
+
// hooks are the KEPT non-entity signals.
|
|
8
|
+
it("keeps the assignmentChanged config-change hook", () => {
|
|
9
|
+
expect(healthCheckHooks.assignmentChanged.id).toBe(
|
|
10
|
+
"healthcheck.assignment.changed",
|
|
8
11
|
);
|
|
9
12
|
});
|
|
10
13
|
|
|
11
|
-
it("
|
|
12
|
-
expect(healthCheckHooks.
|
|
13
|
-
"healthcheck.
|
|
14
|
+
it("keeps the raw-sample checkCompleted / checkFailed hooks", () => {
|
|
15
|
+
expect(healthCheckHooks.checkCompleted.id).toBe(
|
|
16
|
+
"healthcheck.check.completed",
|
|
14
17
|
);
|
|
18
|
+
expect(healthCheckHooks.checkFailed.id).toBe("healthcheck.check.failed");
|
|
19
|
+
});
|
|
20
|
+
|
|
21
|
+
it("no longer exposes the removed system-health or flapping hooks", () => {
|
|
22
|
+
expect("systemDegraded" in healthCheckHooks).toBe(false);
|
|
23
|
+
expect("systemHealthy" in healthCheckHooks).toBe(false);
|
|
24
|
+
expect("systemHealthChanged" in healthCheckHooks).toBe(false);
|
|
25
|
+
// Flapping moved to the automation engine's windowed-count gate; the
|
|
26
|
+
// pre-derived flapping signal hook was removed.
|
|
27
|
+
expect("flappingDetected" in healthCheckHooks).toBe(false);
|
|
15
28
|
});
|
|
16
29
|
});
|
package/src/hooks.ts
CHANGED
|
@@ -11,33 +11,19 @@ import type { HealthCheckStatus } from "@checkstack/healthcheck-common";
|
|
|
11
11
|
* editor.
|
|
12
12
|
*/
|
|
13
13
|
export const healthCheckHooks = {
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
}>("healthcheck.system.degraded"),
|
|
28
|
-
|
|
29
|
-
/**
|
|
30
|
-
* Emitted when a system's aggregated health status recovers to healthy.
|
|
31
|
-
* This fires when status changes from degraded/unhealthy to healthy.
|
|
32
|
-
*/
|
|
33
|
-
systemHealthy: createHook<{
|
|
34
|
-
systemId: string;
|
|
35
|
-
systemName?: string;
|
|
36
|
-
previousStatus: HealthCheckStatus;
|
|
37
|
-
healthyChecks: number;
|
|
38
|
-
totalChecks: number;
|
|
39
|
-
timestamp: string;
|
|
40
|
-
}>("healthcheck.system.healthy"),
|
|
14
|
+
// The `healthcheck.system.degraded` / `.healthy` / `.health_changed` hooks
|
|
15
|
+
// were removed in Phase 4 (§10.3): the per-system aggregated health is now
|
|
16
|
+
// the reactive `health` entity, whose change deriver fires the
|
|
17
|
+
// `healthcheck.system_degraded` / `_healthy` / `_health_changed` trigger
|
|
18
|
+
// events through Stage-1 routing. The remaining hooks below are KEPT:
|
|
19
|
+
// `assignmentChanged` (config signal) and `checkCompleted` / `checkFailed`
|
|
20
|
+
// (high-frequency raw samples + numeric_state wake source).
|
|
21
|
+
//
|
|
22
|
+
// The `flappingDetected` hook was removed: flapping is now detected in the
|
|
23
|
+
// automation engine by the windowed-count gate on the
|
|
24
|
+
// `healthcheck.system_health_changed` trigger (base raw change event +
|
|
25
|
+
// `filter` + `window: { count, minutes, refire: "once" }`), so healthcheck
|
|
26
|
+
// no longer computes or emits a pre-derived flapping signal.
|
|
41
27
|
|
|
42
28
|
/**
|
|
43
29
|
* Emitted when a health check ↔ system association changes.
|
|
@@ -62,26 +48,6 @@ export const healthCheckHooks = {
|
|
|
62
48
|
timestamp: string;
|
|
63
49
|
}>("healthcheck.check.completed"),
|
|
64
50
|
|
|
65
|
-
/**
|
|
66
|
-
* Umbrella variant of `systemDegraded` + `systemHealthy` — fires on
|
|
67
|
-
* **any** aggregated-health transition, carrying both the previous
|
|
68
|
-
* and new statuses. Subscribers (e.g. an automation that wants to
|
|
69
|
-
* react to every state change without subscribing to two hooks
|
|
70
|
-
* and coalescing themselves) prefer this one.
|
|
71
|
-
*
|
|
72
|
-
* Emitted alongside the directional hooks, never instead of them,
|
|
73
|
-
* so existing subscribers keep working unchanged.
|
|
74
|
-
*/
|
|
75
|
-
systemHealthChanged: createHook<{
|
|
76
|
-
systemId: string;
|
|
77
|
-
systemName?: string;
|
|
78
|
-
previousStatus: HealthCheckStatus;
|
|
79
|
-
newStatus: HealthCheckStatus;
|
|
80
|
-
healthyChecks: number;
|
|
81
|
-
totalChecks: number;
|
|
82
|
-
timestamp: string;
|
|
83
|
-
}>("healthcheck.system.health_changed"),
|
|
84
|
-
|
|
85
51
|
/**
|
|
86
52
|
* Narrow variant of `checkCompleted` — fires only when an individual
|
|
87
53
|
* check run completed with a non-`healthy` status. Carries the
|
|
@@ -99,25 +65,4 @@ export const healthCheckHooks = {
|
|
|
99
65
|
result: Record<string, unknown> | undefined;
|
|
100
66
|
timestamp: string;
|
|
101
67
|
}>("healthcheck.check.failed"),
|
|
102
|
-
|
|
103
|
-
/**
|
|
104
|
-
* Emitted when the flapping-detector observes ≥ N unhealthy
|
|
105
|
-
* transitions in the policy's configured window. Fires regardless
|
|
106
|
-
* of whether `autoOpenIncidentOnUnhealthy` is enabled — the hook is
|
|
107
|
-
* informational; the auto-incident pipeline still gates on the
|
|
108
|
-
* policy.
|
|
109
|
-
*
|
|
110
|
-
* Re-fires on every additional transition past the threshold while
|
|
111
|
-
* the check stays in a flapping pattern, so automations that want
|
|
112
|
-
* "page once and only once" should debounce on `(systemId,
|
|
113
|
-
* configurationId)`. Carrying the observed transition count + the
|
|
114
|
-
* window length lets subscribers reason about both.
|
|
115
|
-
*/
|
|
116
|
-
flappingDetected: createHook<{
|
|
117
|
-
systemId: string;
|
|
118
|
-
configurationId: string;
|
|
119
|
-
transitionCount: number;
|
|
120
|
-
windowMinutes: number;
|
|
121
|
-
timestamp: string;
|
|
122
|
-
}>("healthcheck.flapping_detected"),
|
|
123
68
|
} as const;
|