@checkstack/healthcheck-backend 0.5.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +20 -0
- package/drizzle/0008_broad_black_tom.sql +1 -0
- package/drizzle/meta/0008_snapshot.json +420 -0
- package/drizzle/meta/_journal.json +7 -0
- package/package.json +2 -1
- package/src/index.ts +5 -0
- package/src/queue-executor.test.ts +133 -0
- package/src/queue-executor.ts +40 -1
- package/src/router.ts +8 -0
- package/src/schema.ts +2 -0
- package/src/service-pause.test.ts +50 -0
- package/src/service.ts +15 -0
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,25 @@
|
|
|
1
1
|
# @checkstack/healthcheck-backend
|
|
2
2
|
|
|
3
|
+
## 0.6.0
|
|
4
|
+
|
|
5
|
+
### Minor Changes
|
|
6
|
+
|
|
7
|
+
- 11d2679: Add ability to pause health check configurations globally. When paused, health checks continue to be scheduled but execution is skipped for all systems using that configuration. Users with manage access can pause/resume from the Health Checks config page.
|
|
8
|
+
- cce5453: Add notification suppression for incidents
|
|
9
|
+
|
|
10
|
+
- Added `suppressNotifications` field to incidents, allowing active incidents to optionally suppress health check notifications
|
|
11
|
+
- When enabled, health status change notifications will not be sent for affected systems while the incident is active (not resolved)
|
|
12
|
+
- Mirrors the existing maintenance notification suppression pattern
|
|
13
|
+
- Added toggle UI in the IncidentEditor dialog
|
|
14
|
+
- Added `hasActiveIncidentWithSuppression` RPC endpoint for service-to-service queries
|
|
15
|
+
|
|
16
|
+
### Patch Changes
|
|
17
|
+
|
|
18
|
+
- Updated dependencies [11d2679]
|
|
19
|
+
- Updated dependencies [cce5453]
|
|
20
|
+
- @checkstack/healthcheck-common@0.6.0
|
|
21
|
+
- @checkstack/incident-common@0.4.0
|
|
22
|
+
|
|
3
23
|
## 0.5.0
|
|
4
24
|
|
|
5
25
|
### Minor Changes
|
|
package/drizzle/0008_broad_black_tom.sql
@@ -0,0 +1 @@
|
|
|
1
|
+
ALTER TABLE "health_check_configurations" ADD COLUMN "paused" boolean DEFAULT false NOT NULL;
|
|
package/drizzle/meta/0008_snapshot.json
@@ -0,0 +1,420 @@
|
|
|
1
|
+
{
|
|
2
|
+
"id": "86171dc8-efcc-4246-a95a-665fdefb1a1f",
|
|
3
|
+
"prevId": "bb50b71f-3f81-4cb2-aac6-7e7564060fa1",
|
|
4
|
+
"version": "7",
|
|
5
|
+
"dialect": "postgresql",
|
|
6
|
+
"tables": {
|
|
7
|
+
"public.health_check_aggregates": {
|
|
8
|
+
"name": "health_check_aggregates",
|
|
9
|
+
"schema": "",
|
|
10
|
+
"columns": {
|
|
11
|
+
"id": {
|
|
12
|
+
"name": "id",
|
|
13
|
+
"type": "uuid",
|
|
14
|
+
"primaryKey": true,
|
|
15
|
+
"notNull": true,
|
|
16
|
+
"default": "gen_random_uuid()"
|
|
17
|
+
},
|
|
18
|
+
"configuration_id": {
|
|
19
|
+
"name": "configuration_id",
|
|
20
|
+
"type": "uuid",
|
|
21
|
+
"primaryKey": false,
|
|
22
|
+
"notNull": true
|
|
23
|
+
},
|
|
24
|
+
"system_id": {
|
|
25
|
+
"name": "system_id",
|
|
26
|
+
"type": "text",
|
|
27
|
+
"primaryKey": false,
|
|
28
|
+
"notNull": true
|
|
29
|
+
},
|
|
30
|
+
"bucket_start": {
|
|
31
|
+
"name": "bucket_start",
|
|
32
|
+
"type": "timestamp",
|
|
33
|
+
"primaryKey": false,
|
|
34
|
+
"notNull": true
|
|
35
|
+
},
|
|
36
|
+
"bucket_size": {
|
|
37
|
+
"name": "bucket_size",
|
|
38
|
+
"type": "bucket_size",
|
|
39
|
+
"typeSchema": "public",
|
|
40
|
+
"primaryKey": false,
|
|
41
|
+
"notNull": true
|
|
42
|
+
},
|
|
43
|
+
"run_count": {
|
|
44
|
+
"name": "run_count",
|
|
45
|
+
"type": "integer",
|
|
46
|
+
"primaryKey": false,
|
|
47
|
+
"notNull": true
|
|
48
|
+
},
|
|
49
|
+
"healthy_count": {
|
|
50
|
+
"name": "healthy_count",
|
|
51
|
+
"type": "integer",
|
|
52
|
+
"primaryKey": false,
|
|
53
|
+
"notNull": true
|
|
54
|
+
},
|
|
55
|
+
"degraded_count": {
|
|
56
|
+
"name": "degraded_count",
|
|
57
|
+
"type": "integer",
|
|
58
|
+
"primaryKey": false,
|
|
59
|
+
"notNull": true
|
|
60
|
+
},
|
|
61
|
+
"unhealthy_count": {
|
|
62
|
+
"name": "unhealthy_count",
|
|
63
|
+
"type": "integer",
|
|
64
|
+
"primaryKey": false,
|
|
65
|
+
"notNull": true
|
|
66
|
+
},
|
|
67
|
+
"latency_sum_ms": {
|
|
68
|
+
"name": "latency_sum_ms",
|
|
69
|
+
"type": "integer",
|
|
70
|
+
"primaryKey": false,
|
|
71
|
+
"notNull": false
|
|
72
|
+
},
|
|
73
|
+
"avg_latency_ms": {
|
|
74
|
+
"name": "avg_latency_ms",
|
|
75
|
+
"type": "integer",
|
|
76
|
+
"primaryKey": false,
|
|
77
|
+
"notNull": false
|
|
78
|
+
},
|
|
79
|
+
"min_latency_ms": {
|
|
80
|
+
"name": "min_latency_ms",
|
|
81
|
+
"type": "integer",
|
|
82
|
+
"primaryKey": false,
|
|
83
|
+
"notNull": false
|
|
84
|
+
},
|
|
85
|
+
"max_latency_ms": {
|
|
86
|
+
"name": "max_latency_ms",
|
|
87
|
+
"type": "integer",
|
|
88
|
+
"primaryKey": false,
|
|
89
|
+
"notNull": false
|
|
90
|
+
},
|
|
91
|
+
"p95_latency_ms": {
|
|
92
|
+
"name": "p95_latency_ms",
|
|
93
|
+
"type": "integer",
|
|
94
|
+
"primaryKey": false,
|
|
95
|
+
"notNull": false
|
|
96
|
+
},
|
|
97
|
+
"aggregated_result": {
|
|
98
|
+
"name": "aggregated_result",
|
|
99
|
+
"type": "jsonb",
|
|
100
|
+
"primaryKey": false,
|
|
101
|
+
"notNull": false
|
|
102
|
+
}
|
|
103
|
+
},
|
|
104
|
+
"indexes": {
|
|
105
|
+
"health_check_aggregates_bucket_unique": {
|
|
106
|
+
"name": "health_check_aggregates_bucket_unique",
|
|
107
|
+
"columns": [
|
|
108
|
+
{
|
|
109
|
+
"expression": "configuration_id",
|
|
110
|
+
"isExpression": false,
|
|
111
|
+
"asc": true,
|
|
112
|
+
"nulls": "last"
|
|
113
|
+
},
|
|
114
|
+
{
|
|
115
|
+
"expression": "system_id",
|
|
116
|
+
"isExpression": false,
|
|
117
|
+
"asc": true,
|
|
118
|
+
"nulls": "last"
|
|
119
|
+
},
|
|
120
|
+
{
|
|
121
|
+
"expression": "bucket_start",
|
|
122
|
+
"isExpression": false,
|
|
123
|
+
"asc": true,
|
|
124
|
+
"nulls": "last"
|
|
125
|
+
},
|
|
126
|
+
{
|
|
127
|
+
"expression": "bucket_size",
|
|
128
|
+
"isExpression": false,
|
|
129
|
+
"asc": true,
|
|
130
|
+
"nulls": "last"
|
|
131
|
+
}
|
|
132
|
+
],
|
|
133
|
+
"isUnique": true,
|
|
134
|
+
"concurrently": false,
|
|
135
|
+
"method": "btree",
|
|
136
|
+
"with": {}
|
|
137
|
+
}
|
|
138
|
+
},
|
|
139
|
+
"foreignKeys": {
|
|
140
|
+
"health_check_aggregates_configuration_id_health_check_configurations_id_fk": {
|
|
141
|
+
"name": "health_check_aggregates_configuration_id_health_check_configurations_id_fk",
|
|
142
|
+
"tableFrom": "health_check_aggregates",
|
|
143
|
+
"tableTo": "health_check_configurations",
|
|
144
|
+
"columnsFrom": [
|
|
145
|
+
"configuration_id"
|
|
146
|
+
],
|
|
147
|
+
"columnsTo": [
|
|
148
|
+
"id"
|
|
149
|
+
],
|
|
150
|
+
"onDelete": "cascade",
|
|
151
|
+
"onUpdate": "no action"
|
|
152
|
+
}
|
|
153
|
+
},
|
|
154
|
+
"compositePrimaryKeys": {},
|
|
155
|
+
"uniqueConstraints": {},
|
|
156
|
+
"policies": {},
|
|
157
|
+
"checkConstraints": {},
|
|
158
|
+
"isRLSEnabled": false
|
|
159
|
+
},
|
|
160
|
+
"public.health_check_configurations": {
|
|
161
|
+
"name": "health_check_configurations",
|
|
162
|
+
"schema": "",
|
|
163
|
+
"columns": {
|
|
164
|
+
"id": {
|
|
165
|
+
"name": "id",
|
|
166
|
+
"type": "uuid",
|
|
167
|
+
"primaryKey": true,
|
|
168
|
+
"notNull": true,
|
|
169
|
+
"default": "gen_random_uuid()"
|
|
170
|
+
},
|
|
171
|
+
"name": {
|
|
172
|
+
"name": "name",
|
|
173
|
+
"type": "text",
|
|
174
|
+
"primaryKey": false,
|
|
175
|
+
"notNull": true
|
|
176
|
+
},
|
|
177
|
+
"strategy_id": {
|
|
178
|
+
"name": "strategy_id",
|
|
179
|
+
"type": "text",
|
|
180
|
+
"primaryKey": false,
|
|
181
|
+
"notNull": true
|
|
182
|
+
},
|
|
183
|
+
"config": {
|
|
184
|
+
"name": "config",
|
|
185
|
+
"type": "jsonb",
|
|
186
|
+
"primaryKey": false,
|
|
187
|
+
"notNull": true
|
|
188
|
+
},
|
|
189
|
+
"collectors": {
|
|
190
|
+
"name": "collectors",
|
|
191
|
+
"type": "jsonb",
|
|
192
|
+
"primaryKey": false,
|
|
193
|
+
"notNull": false
|
|
194
|
+
},
|
|
195
|
+
"interval_seconds": {
|
|
196
|
+
"name": "interval_seconds",
|
|
197
|
+
"type": "integer",
|
|
198
|
+
"primaryKey": false,
|
|
199
|
+
"notNull": true
|
|
200
|
+
},
|
|
201
|
+
"is_template": {
|
|
202
|
+
"name": "is_template",
|
|
203
|
+
"type": "boolean",
|
|
204
|
+
"primaryKey": false,
|
|
205
|
+
"notNull": false,
|
|
206
|
+
"default": false
|
|
207
|
+
},
|
|
208
|
+
"paused": {
|
|
209
|
+
"name": "paused",
|
|
210
|
+
"type": "boolean",
|
|
211
|
+
"primaryKey": false,
|
|
212
|
+
"notNull": true,
|
|
213
|
+
"default": false
|
|
214
|
+
},
|
|
215
|
+
"created_at": {
|
|
216
|
+
"name": "created_at",
|
|
217
|
+
"type": "timestamp",
|
|
218
|
+
"primaryKey": false,
|
|
219
|
+
"notNull": true,
|
|
220
|
+
"default": "now()"
|
|
221
|
+
},
|
|
222
|
+
"updated_at": {
|
|
223
|
+
"name": "updated_at",
|
|
224
|
+
"type": "timestamp",
|
|
225
|
+
"primaryKey": false,
|
|
226
|
+
"notNull": true,
|
|
227
|
+
"default": "now()"
|
|
228
|
+
}
|
|
229
|
+
},
|
|
230
|
+
"indexes": {},
|
|
231
|
+
"foreignKeys": {},
|
|
232
|
+
"compositePrimaryKeys": {},
|
|
233
|
+
"uniqueConstraints": {},
|
|
234
|
+
"policies": {},
|
|
235
|
+
"checkConstraints": {},
|
|
236
|
+
"isRLSEnabled": false
|
|
237
|
+
},
|
|
238
|
+
"public.health_check_runs": {
|
|
239
|
+
"name": "health_check_runs",
|
|
240
|
+
"schema": "",
|
|
241
|
+
"columns": {
|
|
242
|
+
"id": {
|
|
243
|
+
"name": "id",
|
|
244
|
+
"type": "uuid",
|
|
245
|
+
"primaryKey": true,
|
|
246
|
+
"notNull": true,
|
|
247
|
+
"default": "gen_random_uuid()"
|
|
248
|
+
},
|
|
249
|
+
"configuration_id": {
|
|
250
|
+
"name": "configuration_id",
|
|
251
|
+
"type": "uuid",
|
|
252
|
+
"primaryKey": false,
|
|
253
|
+
"notNull": true
|
|
254
|
+
},
|
|
255
|
+
"system_id": {
|
|
256
|
+
"name": "system_id",
|
|
257
|
+
"type": "text",
|
|
258
|
+
"primaryKey": false,
|
|
259
|
+
"notNull": true
|
|
260
|
+
},
|
|
261
|
+
"status": {
|
|
262
|
+
"name": "status",
|
|
263
|
+
"type": "health_check_status",
|
|
264
|
+
"typeSchema": "public",
|
|
265
|
+
"primaryKey": false,
|
|
266
|
+
"notNull": true
|
|
267
|
+
},
|
|
268
|
+
"latency_ms": {
|
|
269
|
+
"name": "latency_ms",
|
|
270
|
+
"type": "integer",
|
|
271
|
+
"primaryKey": false,
|
|
272
|
+
"notNull": false
|
|
273
|
+
},
|
|
274
|
+
"result": {
|
|
275
|
+
"name": "result",
|
|
276
|
+
"type": "jsonb",
|
|
277
|
+
"primaryKey": false,
|
|
278
|
+
"notNull": false
|
|
279
|
+
},
|
|
280
|
+
"timestamp": {
|
|
281
|
+
"name": "timestamp",
|
|
282
|
+
"type": "timestamp",
|
|
283
|
+
"primaryKey": false,
|
|
284
|
+
"notNull": true,
|
|
285
|
+
"default": "now()"
|
|
286
|
+
}
|
|
287
|
+
},
|
|
288
|
+
"indexes": {},
|
|
289
|
+
"foreignKeys": {
|
|
290
|
+
"health_check_runs_configuration_id_health_check_configurations_id_fk": {
|
|
291
|
+
"name": "health_check_runs_configuration_id_health_check_configurations_id_fk",
|
|
292
|
+
"tableFrom": "health_check_runs",
|
|
293
|
+
"tableTo": "health_check_configurations",
|
|
294
|
+
"columnsFrom": [
|
|
295
|
+
"configuration_id"
|
|
296
|
+
],
|
|
297
|
+
"columnsTo": [
|
|
298
|
+
"id"
|
|
299
|
+
],
|
|
300
|
+
"onDelete": "cascade",
|
|
301
|
+
"onUpdate": "no action"
|
|
302
|
+
}
|
|
303
|
+
},
|
|
304
|
+
"compositePrimaryKeys": {},
|
|
305
|
+
"uniqueConstraints": {},
|
|
306
|
+
"policies": {},
|
|
307
|
+
"checkConstraints": {},
|
|
308
|
+
"isRLSEnabled": false
|
|
309
|
+
},
|
|
310
|
+
"public.system_health_checks": {
|
|
311
|
+
"name": "system_health_checks",
|
|
312
|
+
"schema": "",
|
|
313
|
+
"columns": {
|
|
314
|
+
"system_id": {
|
|
315
|
+
"name": "system_id",
|
|
316
|
+
"type": "text",
|
|
317
|
+
"primaryKey": false,
|
|
318
|
+
"notNull": true
|
|
319
|
+
},
|
|
320
|
+
"configuration_id": {
|
|
321
|
+
"name": "configuration_id",
|
|
322
|
+
"type": "uuid",
|
|
323
|
+
"primaryKey": false,
|
|
324
|
+
"notNull": true
|
|
325
|
+
},
|
|
326
|
+
"enabled": {
|
|
327
|
+
"name": "enabled",
|
|
328
|
+
"type": "boolean",
|
|
329
|
+
"primaryKey": false,
|
|
330
|
+
"notNull": true,
|
|
331
|
+
"default": true
|
|
332
|
+
},
|
|
333
|
+
"state_thresholds": {
|
|
334
|
+
"name": "state_thresholds",
|
|
335
|
+
"type": "jsonb",
|
|
336
|
+
"primaryKey": false,
|
|
337
|
+
"notNull": false
|
|
338
|
+
},
|
|
339
|
+
"retention_config": {
|
|
340
|
+
"name": "retention_config",
|
|
341
|
+
"type": "jsonb",
|
|
342
|
+
"primaryKey": false,
|
|
343
|
+
"notNull": false
|
|
344
|
+
},
|
|
345
|
+
"created_at": {
|
|
346
|
+
"name": "created_at",
|
|
347
|
+
"type": "timestamp",
|
|
348
|
+
"primaryKey": false,
|
|
349
|
+
"notNull": true,
|
|
350
|
+
"default": "now()"
|
|
351
|
+
},
|
|
352
|
+
"updated_at": {
|
|
353
|
+
"name": "updated_at",
|
|
354
|
+
"type": "timestamp",
|
|
355
|
+
"primaryKey": false,
|
|
356
|
+
"notNull": true,
|
|
357
|
+
"default": "now()"
|
|
358
|
+
}
|
|
359
|
+
},
|
|
360
|
+
"indexes": {},
|
|
361
|
+
"foreignKeys": {
|
|
362
|
+
"system_health_checks_configuration_id_health_check_configurations_id_fk": {
|
|
363
|
+
"name": "system_health_checks_configuration_id_health_check_configurations_id_fk",
|
|
364
|
+
"tableFrom": "system_health_checks",
|
|
365
|
+
"tableTo": "health_check_configurations",
|
|
366
|
+
"columnsFrom": [
|
|
367
|
+
"configuration_id"
|
|
368
|
+
],
|
|
369
|
+
"columnsTo": [
|
|
370
|
+
"id"
|
|
371
|
+
],
|
|
372
|
+
"onDelete": "cascade",
|
|
373
|
+
"onUpdate": "no action"
|
|
374
|
+
}
|
|
375
|
+
},
|
|
376
|
+
"compositePrimaryKeys": {
|
|
377
|
+
"system_health_checks_system_id_configuration_id_pk": {
|
|
378
|
+
"name": "system_health_checks_system_id_configuration_id_pk",
|
|
379
|
+
"columns": [
|
|
380
|
+
"system_id",
|
|
381
|
+
"configuration_id"
|
|
382
|
+
]
|
|
383
|
+
}
|
|
384
|
+
},
|
|
385
|
+
"uniqueConstraints": {},
|
|
386
|
+
"policies": {},
|
|
387
|
+
"checkConstraints": {},
|
|
388
|
+
"isRLSEnabled": false
|
|
389
|
+
}
|
|
390
|
+
},
|
|
391
|
+
"enums": {
|
|
392
|
+
"public.bucket_size": {
|
|
393
|
+
"name": "bucket_size",
|
|
394
|
+
"schema": "public",
|
|
395
|
+
"values": [
|
|
396
|
+
"hourly",
|
|
397
|
+
"daily"
|
|
398
|
+
]
|
|
399
|
+
},
|
|
400
|
+
"public.health_check_status": {
|
|
401
|
+
"name": "health_check_status",
|
|
402
|
+
"schema": "public",
|
|
403
|
+
"values": [
|
|
404
|
+
"healthy",
|
|
405
|
+
"unhealthy",
|
|
406
|
+
"degraded"
|
|
407
|
+
]
|
|
408
|
+
}
|
|
409
|
+
},
|
|
410
|
+
"schemas": {},
|
|
411
|
+
"sequences": {},
|
|
412
|
+
"roles": {},
|
|
413
|
+
"policies": {},
|
|
414
|
+
"views": {},
|
|
415
|
+
"_meta": {
|
|
416
|
+
"columns": {},
|
|
417
|
+
"schemas": {},
|
|
418
|
+
"tables": {}
|
|
419
|
+
}
|
|
420
|
+
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@checkstack/healthcheck-backend",
|
|
3
|
-
"version": "0.5.0",
|
|
3
|
+
"version": "0.6.0",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"main": "src/index.ts",
|
|
6
6
|
"scripts": {
|
|
@@ -16,6 +16,7 @@
|
|
|
16
16
|
"@checkstack/healthcheck-common": "workspace:*",
|
|
17
17
|
"@checkstack/integration-backend": "workspace:*",
|
|
18
18
|
"@checkstack/maintenance-common": "workspace:*",
|
|
19
|
+
"@checkstack/incident-common": "workspace:*",
|
|
19
20
|
"@checkstack/queue-api": "workspace:*",
|
|
20
21
|
"@checkstack/signal-common": "workspace:*",
|
|
21
22
|
"@checkstack/command-backend": "workspace:*",
|
package/src/index.ts
CHANGED
|
@@ -24,6 +24,7 @@ import { HealthCheckService } from "./service";
|
|
|
24
24
|
import { catalogHooks } from "@checkstack/catalog-backend";
|
|
25
25
|
import { CatalogApi } from "@checkstack/catalog-common";
|
|
26
26
|
import { MaintenanceApi } from "@checkstack/maintenance-common";
|
|
27
|
+
import { IncidentApi } from "@checkstack/incident-common";
|
|
27
28
|
import { healthCheckHooks } from "./hooks";
|
|
28
29
|
import { registerSearchProvider } from "@checkstack/command-backend";
|
|
29
30
|
import { resolveRoute } from "@checkstack/common";
|
|
@@ -117,6 +118,9 @@ export default createBackendPlugin({
|
|
|
117
118
|
// Create maintenance client for notification suppression checks
|
|
118
119
|
const maintenanceClient = rpcClient.forPlugin(MaintenanceApi);
|
|
119
120
|
|
|
121
|
+
// Create incident client for notification suppression checks
|
|
122
|
+
const incidentClient = rpcClient.forPlugin(IncidentApi);
|
|
123
|
+
|
|
120
124
|
// Setup queue-based health check worker
|
|
121
125
|
await setupHealthCheckWorker({
|
|
122
126
|
db: database,
|
|
@@ -127,6 +131,7 @@ export default createBackendPlugin({
|
|
|
127
131
|
signalService,
|
|
128
132
|
catalogClient,
|
|
129
133
|
maintenanceClient,
|
|
134
|
+
incidentClient,
|
|
130
135
|
getEmitHook: () => storedEmitHook,
|
|
131
136
|
});
|
|
132
137
|
|
|
package/src/queue-executor.test.ts
CHANGED
|
@@ -92,6 +92,23 @@ const createMockMaintenanceClient = () => ({
|
|
|
92
92
|
deleteMaintenance: mock(async () => ({ success: true })),
|
|
93
93
|
});
|
|
94
94
|
|
|
95
|
+
// Helper to create mock incident client for notification suppression checks
|
|
96
|
+
const createMockIncidentClient = () => ({
|
|
97
|
+
hasActiveIncidentWithSuppression: mock(async () => ({
|
|
98
|
+
suppressed: false,
|
|
99
|
+
})),
|
|
100
|
+
// Other methods not used in queue-executor
|
|
101
|
+
listIncidents: mock(async () => ({ incidents: [] })),
|
|
102
|
+
getIncident: mock(async () => null),
|
|
103
|
+
getIncidentsForSystem: mock(async () => []),
|
|
104
|
+
getBulkIncidentsForSystems: mock(async () => ({ incidents: {} })),
|
|
105
|
+
createIncident: mock(async () => ({})),
|
|
106
|
+
updateIncident: mock(async () => ({})),
|
|
107
|
+
addUpdate: mock(async () => ({})),
|
|
108
|
+
resolveIncident: mock(async () => ({})),
|
|
109
|
+
deleteIncident: mock(async () => ({ success: true })),
|
|
110
|
+
});
|
|
111
|
+
|
|
95
112
|
describe("Queue-Based Health Check Executor", () => {
|
|
96
113
|
describe("scheduleHealthCheck", () => {
|
|
97
114
|
it("should enqueue a health check with delay and deterministic jobId", async () => {
|
|
@@ -145,6 +162,7 @@ describe("Queue-Based Health Check Executor", () => {
|
|
|
145
162
|
const mockQueueManager = createMockQueueManager();
|
|
146
163
|
const mockCatalogClient = createMockCatalogClient();
|
|
147
164
|
const mockMaintenanceClient = createMockMaintenanceClient();
|
|
165
|
+
const mockIncidentClient = createMockIncidentClient();
|
|
148
166
|
|
|
149
167
|
await setupHealthCheckWorker({
|
|
150
168
|
db: mockDb as unknown as Parameters<
|
|
@@ -164,6 +182,9 @@ describe("Queue-Based Health Check Executor", () => {
|
|
|
164
182
|
maintenanceClient: mockMaintenanceClient as unknown as Parameters<
|
|
165
183
|
typeof setupHealthCheckWorker
|
|
166
184
|
>[0]["maintenanceClient"],
|
|
185
|
+
incidentClient: mockIncidentClient as unknown as Parameters<
|
|
186
|
+
typeof setupHealthCheckWorker
|
|
187
|
+
>[0]["incidentClient"],
|
|
167
188
|
getEmitHook: () => undefined,
|
|
168
189
|
});
|
|
169
190
|
|
|
@@ -264,4 +285,116 @@ describe("Queue-Based Health Check Executor", () => {
|
|
|
264
285
|
);
|
|
265
286
|
});
|
|
266
287
|
});
|
|
288
|
+
|
|
289
|
+
describe("executeHealthCheckJob - paused behavior", () => {
|
|
290
|
+
it("should skip execution when configuration is paused", async () => {
|
|
291
|
+
const mockDb = createMockDb();
|
|
292
|
+
const mockRegistry = createMockRegistry();
|
|
293
|
+
const mockLogger = createMockLogger();
|
|
294
|
+
const mockQueueManager = createMockQueueManager();
|
|
295
|
+
const mockCatalogClient = createMockCatalogClient();
|
|
296
|
+
const mockMaintenanceClient = createMockMaintenanceClient();
|
|
297
|
+
const mockIncidentClient = createMockIncidentClient();
|
|
298
|
+
const mockSignalService = createMockSignalService();
|
|
299
|
+
|
|
300
|
+
// Mock the database to return a paused configuration
|
|
301
|
+
let selectCallCount = 0;
|
|
302
|
+
(mockDb.select as any) = mock(() => {
|
|
303
|
+
selectCallCount++;
|
|
304
|
+
if (selectCallCount === 1) {
|
|
305
|
+
// First call: get previous system health status
|
|
306
|
+
return {
|
|
307
|
+
from: mock(() => ({
|
|
308
|
+
innerJoin: mock(() => ({
|
|
309
|
+
where: mock(() => Promise.resolve([])),
|
|
310
|
+
})),
|
|
311
|
+
})),
|
|
312
|
+
};
|
|
313
|
+
} else if (selectCallCount === 2) {
|
|
314
|
+
// Second call: fetch configuration (return paused config)
|
|
315
|
+
return {
|
|
316
|
+
from: mock(() => ({
|
|
317
|
+
innerJoin: mock(() => ({
|
|
318
|
+
where: mock(() =>
|
|
319
|
+
Promise.resolve([
|
|
320
|
+
{
|
|
321
|
+
configId: "config-1",
|
|
322
|
+
configName: "Test Check",
|
|
323
|
+
strategyId: "test-strategy",
|
|
324
|
+
config: {},
|
|
325
|
+
collectors: [],
|
|
326
|
+
interval: 30,
|
|
327
|
+
enabled: true,
|
|
328
|
+
paused: true, // Configuration is paused
|
|
329
|
+
},
|
|
330
|
+
]),
|
|
331
|
+
),
|
|
332
|
+
})),
|
|
333
|
+
})),
|
|
334
|
+
};
|
|
335
|
+
}
|
|
336
|
+
// Default
|
|
337
|
+
return {
|
|
338
|
+
from: mock(() => ({
|
|
339
|
+
innerJoin: mock(() => ({
|
|
340
|
+
where: mock(() => Promise.resolve([])),
|
|
341
|
+
})),
|
|
342
|
+
})),
|
|
343
|
+
};
|
|
344
|
+
});
|
|
345
|
+
|
|
346
|
+
// Setup worker and get handler
|
|
347
|
+
const queue =
|
|
348
|
+
mockQueueManager.getQueue<HealthCheckJobPayload>("health-checks");
|
|
349
|
+
let capturedHandler:
|
|
350
|
+
| ((job: { data: HealthCheckJobPayload }) => Promise<void>)
|
|
351
|
+
| undefined;
|
|
352
|
+
(queue.consume as any) = mock(
|
|
353
|
+
async (
|
|
354
|
+
handler: (job: { data: HealthCheckJobPayload }) => Promise<void>,
|
|
355
|
+
) => {
|
|
356
|
+
capturedHandler = handler;
|
|
357
|
+
},
|
|
358
|
+
);
|
|
359
|
+
|
|
360
|
+
await setupHealthCheckWorker({
|
|
361
|
+
db: mockDb as unknown as Parameters<
|
|
362
|
+
typeof setupHealthCheckWorker
|
|
363
|
+
>[0]["db"],
|
|
364
|
+
registry: mockRegistry,
|
|
365
|
+
collectorRegistry:
|
|
366
|
+
createMockCollectorRegistry() as unknown as Parameters<
|
|
367
|
+
typeof setupHealthCheckWorker
|
|
368
|
+
>[0]["collectorRegistry"],
|
|
369
|
+
logger: mockLogger,
|
|
370
|
+
queueManager: mockQueueManager,
|
|
371
|
+
signalService: mockSignalService,
|
|
372
|
+
catalogClient: mockCatalogClient as unknown as Parameters<
|
|
373
|
+
typeof setupHealthCheckWorker
|
|
374
|
+
>[0]["catalogClient"],
|
|
375
|
+
maintenanceClient: mockMaintenanceClient as unknown as Parameters<
|
|
376
|
+
typeof setupHealthCheckWorker
|
|
377
|
+
>[0]["maintenanceClient"],
|
|
378
|
+
incidentClient: mockIncidentClient as unknown as Parameters<
|
|
379
|
+
typeof setupHealthCheckWorker
|
|
380
|
+
>[0]["incidentClient"],
|
|
381
|
+
getEmitHook: () => undefined,
|
|
382
|
+
});
|
|
383
|
+
|
|
384
|
+
// Execute a paused health check
|
|
385
|
+
if (capturedHandler) {
|
|
386
|
+
await capturedHandler({
|
|
387
|
+
data: { configId: "config-1", systemId: "system-1" },
|
|
388
|
+
});
|
|
389
|
+
}
|
|
390
|
+
|
|
391
|
+
// Verify execution was skipped with appropriate log
|
|
392
|
+
expect(mockLogger.debug).toHaveBeenCalledWith(
|
|
393
|
+
expect.stringContaining("is paused, skipping execution"),
|
|
394
|
+
);
|
|
395
|
+
|
|
396
|
+
// Verify no signal was broadcast (since execution was skipped)
|
|
397
|
+
expect(mockSignalService.getRecordedSignals()).toHaveLength(0);
|
|
398
|
+
});
|
|
399
|
+
});
|
|
267
400
|
});
|
package/src/queue-executor.ts
CHANGED
|
@@ -22,6 +22,7 @@ import {
|
|
|
22
22
|
} from "@checkstack/healthcheck-common";
|
|
23
23
|
import { CatalogApi, catalogRoutes } from "@checkstack/catalog-common";
|
|
24
24
|
import { MaintenanceApi } from "@checkstack/maintenance-common";
|
|
25
|
+
import { IncidentApi } from "@checkstack/incident-common";
|
|
25
26
|
import { resolveRoute, type InferClient } from "@checkstack/common";
|
|
26
27
|
import { HealthCheckService } from "./service";
|
|
27
28
|
import { healthCheckHooks } from "./hooks";
|
|
@@ -29,6 +30,7 @@ import { healthCheckHooks } from "./hooks";
|
|
|
29
30
|
type Db = SafeDatabase<typeof schema>;
|
|
30
31
|
type CatalogClient = InferClient<typeof CatalogApi>;
|
|
31
32
|
type MaintenanceClient = InferClient<typeof MaintenanceApi>;
|
|
33
|
+
type IncidentClient = InferClient<typeof IncidentApi>;
|
|
32
34
|
|
|
33
35
|
/**
|
|
34
36
|
* Payload for health check queue jobs
|
|
@@ -90,7 +92,7 @@ export async function scheduleHealthCheck(props: {
|
|
|
90
92
|
|
|
91
93
|
/**
|
|
92
94
|
* Notify system subscribers about a health state change.
|
|
93
|
-
* Skips notification if the system has active maintenance with suppression enabled.
|
|
95
|
+
* Skips notification if the system has active maintenance or incident with suppression enabled.
|
|
94
96
|
*/
|
|
95
97
|
async function notifyStateChange(props: {
|
|
96
98
|
systemId: string;
|
|
@@ -98,6 +100,7 @@ async function notifyStateChange(props: {
|
|
|
98
100
|
newStatus: HealthCheckStatus;
|
|
99
101
|
catalogClient: CatalogClient;
|
|
100
102
|
maintenanceClient: MaintenanceClient;
|
|
103
|
+
incidentClient: IncidentClient;
|
|
101
104
|
logger: Logger;
|
|
102
105
|
}): Promise<void> {
|
|
103
106
|
const {
|
|
@@ -106,6 +109,7 @@ async function notifyStateChange(props: {
|
|
|
106
109
|
newStatus,
|
|
107
110
|
catalogClient,
|
|
108
111
|
maintenanceClient,
|
|
112
|
+
incidentClient,
|
|
109
113
|
logger,
|
|
110
114
|
} = props;
|
|
111
115
|
|
|
@@ -132,6 +136,24 @@ async function notifyStateChange(props: {
|
|
|
132
136
|
);
|
|
133
137
|
}
|
|
134
138
|
|
|
139
|
+
// Check if notifications should be suppressed due to active incident
|
|
140
|
+
try {
|
|
141
|
+
const { suppressed } =
|
|
142
|
+
await incidentClient.hasActiveIncidentWithSuppression({ systemId });
|
|
143
|
+
if (suppressed) {
|
|
144
|
+
logger.debug(
|
|
145
|
+
`Skipping notification for ${systemId}: active incident with suppression enabled`,
|
|
146
|
+
);
|
|
147
|
+
return;
|
|
148
|
+
}
|
|
149
|
+
} catch (error) {
|
|
150
|
+
// Log but continue with notification - suppression check failure shouldn't block notifications
|
|
151
|
+
logger.warn(
|
|
152
|
+
`Failed to check incident suppression for ${systemId}, proceeding with notification:`,
|
|
153
|
+
error,
|
|
154
|
+
);
|
|
155
|
+
}
|
|
156
|
+
|
|
135
157
|
const isRecovery = newStatus === "healthy" && previousStatus !== "healthy";
|
|
136
158
|
const isDegraded = newStatus === "degraded";
|
|
137
159
|
const isUnhealthy = newStatus === "unhealthy";
|
|
@@ -196,6 +218,7 @@ async function executeHealthCheckJob(props: {
|
|
|
196
218
|
signalService: SignalService;
|
|
197
219
|
catalogClient: CatalogClient;
|
|
198
220
|
maintenanceClient: MaintenanceClient;
|
|
221
|
+
incidentClient: IncidentClient;
|
|
199
222
|
getEmitHook: () => EmitHookFn | undefined;
|
|
200
223
|
}): Promise<void> {
|
|
201
224
|
const {
|
|
@@ -207,6 +230,7 @@ async function executeHealthCheckJob(props: {
|
|
|
207
230
|
signalService,
|
|
208
231
|
catalogClient,
|
|
209
232
|
maintenanceClient,
|
|
233
|
+
incidentClient,
|
|
210
234
|
getEmitHook,
|
|
211
235
|
} = props;
|
|
212
236
|
const { configId, systemId } = payload;
|
|
@@ -229,6 +253,7 @@ async function executeHealthCheckJob(props: {
|
|
|
229
253
|
collectors: healthCheckConfigurations.collectors,
|
|
230
254
|
interval: healthCheckConfigurations.intervalSeconds,
|
|
231
255
|
enabled: systemHealthChecks.enabled,
|
|
256
|
+
paused: healthCheckConfigurations.paused,
|
|
232
257
|
})
|
|
233
258
|
.from(systemHealthChecks)
|
|
234
259
|
.innerJoin(
|
|
@@ -251,6 +276,14 @@ async function executeHealthCheckJob(props: {
|
|
|
251
276
|
return;
|
|
252
277
|
}
|
|
253
278
|
|
|
279
|
+
// If configuration is paused, skip execution (job continues to be scheduled)
|
|
280
|
+
if (configRow.paused) {
|
|
281
|
+
logger.debug(
|
|
282
|
+
`Health check ${configId} is paused, skipping execution for system ${systemId}`,
|
|
283
|
+
);
|
|
284
|
+
return;
|
|
285
|
+
}
|
|
286
|
+
|
|
254
287
|
// Fetch system name for signal payload
|
|
255
288
|
let systemName = systemId;
|
|
256
289
|
try {
|
|
@@ -322,6 +355,7 @@ async function executeHealthCheckJob(props: {
|
|
|
322
355
|
newStatus: newState.status,
|
|
323
356
|
catalogClient,
|
|
324
357
|
maintenanceClient,
|
|
358
|
+
incidentClient,
|
|
325
359
|
logger,
|
|
326
360
|
});
|
|
327
361
|
}
|
|
@@ -470,6 +504,7 @@ async function executeHealthCheckJob(props: {
|
|
|
470
504
|
newStatus: newState.status,
|
|
471
505
|
catalogClient,
|
|
472
506
|
maintenanceClient,
|
|
507
|
+
incidentClient,
|
|
473
508
|
logger,
|
|
474
509
|
});
|
|
475
510
|
|
|
@@ -564,6 +599,7 @@ async function executeHealthCheckJob(props: {
|
|
|
564
599
|
newStatus: newState.status,
|
|
565
600
|
catalogClient,
|
|
566
601
|
maintenanceClient,
|
|
602
|
+
incidentClient,
|
|
567
603
|
logger,
|
|
568
604
|
});
|
|
569
605
|
|
|
@@ -619,6 +655,7 @@ export async function setupHealthCheckWorker(props: {
|
|
|
619
655
|
signalService: SignalService;
|
|
620
656
|
catalogClient: CatalogClient;
|
|
621
657
|
maintenanceClient: MaintenanceClient;
|
|
658
|
+
incidentClient: IncidentClient;
|
|
622
659
|
getEmitHook: () => EmitHookFn | undefined;
|
|
623
660
|
}): Promise<void> {
|
|
624
661
|
const {
|
|
@@ -630,6 +667,7 @@ export async function setupHealthCheckWorker(props: {
|
|
|
630
667
|
signalService,
|
|
631
668
|
catalogClient,
|
|
632
669
|
maintenanceClient,
|
|
670
|
+
incidentClient,
|
|
633
671
|
getEmitHook,
|
|
634
672
|
} = props;
|
|
635
673
|
|
|
@@ -648,6 +686,7 @@ export async function setupHealthCheckWorker(props: {
|
|
|
648
686
|
signalService,
|
|
649
687
|
catalogClient,
|
|
650
688
|
maintenanceClient,
|
|
689
|
+
incidentClient,
|
|
651
690
|
getEmitHook,
|
|
652
691
|
});
|
|
653
692
|
},
|
package/src/router.ts
CHANGED
|
@@ -105,6 +105,14 @@ export const createHealthCheckRouter = (
|
|
|
105
105
|
await service.deleteConfiguration(input);
|
|
106
106
|
}),
|
|
107
107
|
|
|
108
|
+
pauseConfiguration: os.pauseConfiguration.handler(async ({ input }) => {
|
|
109
|
+
await service.pauseConfiguration(input);
|
|
110
|
+
}),
|
|
111
|
+
|
|
112
|
+
resumeConfiguration: os.resumeConfiguration.handler(async ({ input }) => {
|
|
113
|
+
await service.resumeConfiguration(input);
|
|
114
|
+
}),
|
|
115
|
+
|
|
108
116
|
getSystemConfigurations: os.getSystemConfigurations.handler(
|
|
109
117
|
async ({ input }) => {
|
|
110
118
|
return service.getSystemConfigurations(input);
|
package/src/schema.ts
CHANGED
|
@@ -45,6 +45,8 @@ export const healthCheckConfigurations = pgTable(
|
|
|
45
45
|
collectors: jsonb("collectors").$type<CollectorConfigEntry[]>(),
|
|
46
46
|
intervalSeconds: integer("interval_seconds").notNull(),
|
|
47
47
|
isTemplate: boolean("is_template").default(false),
|
|
48
|
+
/** Whether this configuration is paused (execution skipped for all systems) */
|
|
49
|
+
paused: boolean("paused").default(false).notNull(),
|
|
48
50
|
createdAt: timestamp("created_at").defaultNow().notNull(),
|
|
49
51
|
updatedAt: timestamp("updated_at").defaultNow().notNull(),
|
|
50
52
|
},
|
|
package/src/service-pause.test.ts
@@ -0,0 +1,50 @@
|
|
|
1
|
+
import { describe, it, expect, mock, beforeEach } from "bun:test";
|
|
2
|
+
import { HealthCheckService } from "./service";
|
|
3
|
+
import { createMockDb } from "@checkstack/test-utils-backend";
|
|
4
|
+
|
|
5
|
+
describe("HealthCheckService - pause/resume", () => {
|
|
6
|
+
let mockDb: ReturnType<typeof createMockDb>;
|
|
7
|
+
let service: HealthCheckService;
|
|
8
|
+
let mockUpdate: ReturnType<typeof mock>;
|
|
9
|
+
let mockSet: ReturnType<typeof mock>;
|
|
10
|
+
let mockWhere: ReturnType<typeof mock>;
|
|
11
|
+
|
|
12
|
+
beforeEach(() => {
|
|
13
|
+
mockDb = createMockDb();
|
|
14
|
+
mockWhere = mock(() => Promise.resolve());
|
|
15
|
+
mockSet = mock(() => ({ where: mockWhere }));
|
|
16
|
+
mockUpdate = mock(() => ({ set: mockSet }));
|
|
17
|
+
(mockDb.update as any) = mockUpdate;
|
|
18
|
+
service = new HealthCheckService(mockDb as any);
|
|
19
|
+
});
|
|
20
|
+
|
|
21
|
+
describe("pauseConfiguration", () => {
|
|
22
|
+
it("should update paused to true and set updatedAt", async () => {
|
|
23
|
+
await service.pauseConfiguration("config-123");
|
|
24
|
+
|
|
25
|
+
expect(mockUpdate).toHaveBeenCalled();
|
|
26
|
+
expect(mockSet).toHaveBeenCalledWith(
|
|
27
|
+
expect.objectContaining({
|
|
28
|
+
paused: true,
|
|
29
|
+
updatedAt: expect.any(Date),
|
|
30
|
+
}),
|
|
31
|
+
);
|
|
32
|
+
expect(mockWhere).toHaveBeenCalled();
|
|
33
|
+
});
|
|
34
|
+
});
|
|
35
|
+
|
|
36
|
+
describe("resumeConfiguration", () => {
|
|
37
|
+
it("should update paused to false and set updatedAt", async () => {
|
|
38
|
+
await service.resumeConfiguration("config-456");
|
|
39
|
+
|
|
40
|
+
expect(mockUpdate).toHaveBeenCalled();
|
|
41
|
+
expect(mockSet).toHaveBeenCalledWith(
|
|
42
|
+
expect.objectContaining({
|
|
43
|
+
paused: false,
|
|
44
|
+
updatedAt: expect.any(Date),
|
|
45
|
+
}),
|
|
46
|
+
);
|
|
47
|
+
expect(mockWhere).toHaveBeenCalled();
|
|
48
|
+
});
|
|
49
|
+
});
|
|
50
|
+
});
|
package/src/service.ts
CHANGED
|
@@ -105,6 +105,20 @@ export class HealthCheckService {
|
|
|
105
105
|
.where(eq(healthCheckConfigurations.id, id));
|
|
106
106
|
}
|
|
107
107
|
|
|
108
|
+
async pauseConfiguration(id: string): Promise<void> {
|
|
109
|
+
await this.db
|
|
110
|
+
.update(healthCheckConfigurations)
|
|
111
|
+
.set({ paused: true, updatedAt: new Date() })
|
|
112
|
+
.where(eq(healthCheckConfigurations.id, id));
|
|
113
|
+
}
|
|
114
|
+
|
|
115
|
+
async resumeConfiguration(id: string): Promise<void> {
|
|
116
|
+
await this.db
|
|
117
|
+
.update(healthCheckConfigurations)
|
|
118
|
+
.set({ paused: false, updatedAt: new Date() })
|
|
119
|
+
.where(eq(healthCheckConfigurations.id, id));
|
|
120
|
+
}
|
|
121
|
+
|
|
108
122
|
async getConfigurations(): Promise<HealthCheckConfiguration[]> {
|
|
109
123
|
const configs = await this.db.select().from(healthCheckConfigurations);
|
|
110
124
|
return configs.map((c) => this.mapConfig(c));
|
|
@@ -884,6 +898,7 @@ export class HealthCheckService {
|
|
|
884
898
|
config: row.config,
|
|
885
899
|
collectors: row.collectors ?? undefined,
|
|
886
900
|
intervalSeconds: row.intervalSeconds,
|
|
901
|
+
paused: row.paused,
|
|
887
902
|
createdAt: row.createdAt,
|
|
888
903
|
updatedAt: row.updatedAt,
|
|
889
904
|
};
|