@checkstack/healthcheck-backend 0.4.2 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +116 -0
- package/drizzle/0007_tense_misty_knight.sql +1 -0
- package/drizzle/0008_broad_black_tom.sql +1 -0
- package/drizzle/meta/0007_snapshot.json +413 -0
- package/drizzle/meta/0008_snapshot.json +420 -0
- package/drizzle/meta/_journal.json +14 -0
- package/package.json +2 -1
- package/src/aggregation-utils.test.ts +644 -0
- package/src/aggregation-utils.ts +399 -0
- package/src/aggregation.test.ts +222 -79
- package/src/index.ts +16 -0
- package/src/queue-executor.test.ts +133 -0
- package/src/queue-executor.ts +48 -2
- package/src/retention-job.ts +72 -43
- package/src/router.test.ts +14 -3
- package/src/router.ts +14 -1
- package/src/schema.ts +8 -4
- package/src/service-pause.test.ts +50 -0
- package/src/service.ts +273 -101
package/src/queue-executor.ts
CHANGED
|
@@ -18,9 +18,11 @@ import { type SignalService } from "@checkstack/signal-common";
|
|
|
18
18
|
import {
|
|
19
19
|
HEALTH_CHECK_RUN_COMPLETED,
|
|
20
20
|
type HealthCheckStatus,
|
|
21
|
+
stripEphemeralFields,
|
|
21
22
|
} from "@checkstack/healthcheck-common";
|
|
22
23
|
import { CatalogApi, catalogRoutes } from "@checkstack/catalog-common";
|
|
23
24
|
import { MaintenanceApi } from "@checkstack/maintenance-common";
|
|
25
|
+
import { IncidentApi } from "@checkstack/incident-common";
|
|
24
26
|
import { resolveRoute, type InferClient } from "@checkstack/common";
|
|
25
27
|
import { HealthCheckService } from "./service";
|
|
26
28
|
import { healthCheckHooks } from "./hooks";
|
|
@@ -28,6 +30,7 @@ import { healthCheckHooks } from "./hooks";
|
|
|
28
30
|
type Db = SafeDatabase<typeof schema>;
|
|
29
31
|
type CatalogClient = InferClient<typeof CatalogApi>;
|
|
30
32
|
type MaintenanceClient = InferClient<typeof MaintenanceApi>;
|
|
33
|
+
type IncidentClient = InferClient<typeof IncidentApi>;
|
|
31
34
|
|
|
32
35
|
/**
|
|
33
36
|
* Payload for health check queue jobs
|
|
@@ -89,7 +92,7 @@ export async function scheduleHealthCheck(props: {
|
|
|
89
92
|
|
|
90
93
|
/**
|
|
91
94
|
* Notify system subscribers about a health state change.
|
|
92
|
-
* Skips notification if the system has active maintenance with suppression enabled.
|
|
95
|
+
* Skips notification if the system has active maintenance or incident with suppression enabled.
|
|
93
96
|
*/
|
|
94
97
|
async function notifyStateChange(props: {
|
|
95
98
|
systemId: string;
|
|
@@ -97,6 +100,7 @@ async function notifyStateChange(props: {
|
|
|
97
100
|
newStatus: HealthCheckStatus;
|
|
98
101
|
catalogClient: CatalogClient;
|
|
99
102
|
maintenanceClient: MaintenanceClient;
|
|
103
|
+
incidentClient: IncidentClient;
|
|
100
104
|
logger: Logger;
|
|
101
105
|
}): Promise<void> {
|
|
102
106
|
const {
|
|
@@ -105,6 +109,7 @@ async function notifyStateChange(props: {
|
|
|
105
109
|
newStatus,
|
|
106
110
|
catalogClient,
|
|
107
111
|
maintenanceClient,
|
|
112
|
+
incidentClient,
|
|
108
113
|
logger,
|
|
109
114
|
} = props;
|
|
110
115
|
|
|
@@ -131,6 +136,24 @@ async function notifyStateChange(props: {
|
|
|
131
136
|
);
|
|
132
137
|
}
|
|
133
138
|
|
|
139
|
+
// Check if notifications should be suppressed due to active incident
|
|
140
|
+
try {
|
|
141
|
+
const { suppressed } =
|
|
142
|
+
await incidentClient.hasActiveIncidentWithSuppression({ systemId });
|
|
143
|
+
if (suppressed) {
|
|
144
|
+
logger.debug(
|
|
145
|
+
`Skipping notification for ${systemId}: active incident with suppression enabled`,
|
|
146
|
+
);
|
|
147
|
+
return;
|
|
148
|
+
}
|
|
149
|
+
} catch (error) {
|
|
150
|
+
// Log but continue with notification - suppression check failure shouldn't block notifications
|
|
151
|
+
logger.warn(
|
|
152
|
+
`Failed to check incident suppression for ${systemId}, proceeding with notification:`,
|
|
153
|
+
error,
|
|
154
|
+
);
|
|
155
|
+
}
|
|
156
|
+
|
|
134
157
|
const isRecovery = newStatus === "healthy" && previousStatus !== "healthy";
|
|
135
158
|
const isDegraded = newStatus === "degraded";
|
|
136
159
|
const isUnhealthy = newStatus === "unhealthy";
|
|
@@ -195,6 +218,7 @@ async function executeHealthCheckJob(props: {
|
|
|
195
218
|
signalService: SignalService;
|
|
196
219
|
catalogClient: CatalogClient;
|
|
197
220
|
maintenanceClient: MaintenanceClient;
|
|
221
|
+
incidentClient: IncidentClient;
|
|
198
222
|
getEmitHook: () => EmitHookFn | undefined;
|
|
199
223
|
}): Promise<void> {
|
|
200
224
|
const {
|
|
@@ -206,6 +230,7 @@ async function executeHealthCheckJob(props: {
|
|
|
206
230
|
signalService,
|
|
207
231
|
catalogClient,
|
|
208
232
|
maintenanceClient,
|
|
233
|
+
incidentClient,
|
|
209
234
|
getEmitHook,
|
|
210
235
|
} = props;
|
|
211
236
|
const { configId, systemId } = payload;
|
|
@@ -228,6 +253,7 @@ async function executeHealthCheckJob(props: {
|
|
|
228
253
|
collectors: healthCheckConfigurations.collectors,
|
|
229
254
|
interval: healthCheckConfigurations.intervalSeconds,
|
|
230
255
|
enabled: systemHealthChecks.enabled,
|
|
256
|
+
paused: healthCheckConfigurations.paused,
|
|
231
257
|
})
|
|
232
258
|
.from(systemHealthChecks)
|
|
233
259
|
.innerJoin(
|
|
@@ -250,6 +276,14 @@ async function executeHealthCheckJob(props: {
|
|
|
250
276
|
return;
|
|
251
277
|
}
|
|
252
278
|
|
|
279
|
+
// If configuration is paused, skip execution (job continues to be scheduled)
|
|
280
|
+
if (configRow.paused) {
|
|
281
|
+
logger.debug(
|
|
282
|
+
`Health check ${configId} is paused, skipping execution for system ${systemId}`,
|
|
283
|
+
);
|
|
284
|
+
return;
|
|
285
|
+
}
|
|
286
|
+
|
|
253
287
|
// Fetch system name for signal payload
|
|
254
288
|
let systemName = systemId;
|
|
255
289
|
try {
|
|
@@ -321,6 +355,7 @@ async function executeHealthCheckJob(props: {
|
|
|
321
355
|
newStatus: newState.status,
|
|
322
356
|
catalogClient,
|
|
323
357
|
maintenanceClient,
|
|
358
|
+
incidentClient,
|
|
324
359
|
logger,
|
|
325
360
|
});
|
|
326
361
|
}
|
|
@@ -388,11 +423,17 @@ async function executeHealthCheckJob(props: {
|
|
|
388
423
|
}
|
|
389
424
|
}
|
|
390
425
|
|
|
426
|
+
// Strip ephemeral fields (like HTTP body) before storage to save space
|
|
427
|
+
const strippedResult = stripEphemeralFields(
|
|
428
|
+
collectorResult.result as Record<string, unknown>,
|
|
429
|
+
registered.collector.result.schema,
|
|
430
|
+
);
|
|
431
|
+
|
|
391
432
|
// Store result under the collector's UUID, with collector type and assertion metadata
|
|
392
433
|
collectorResults[storageKey] = {
|
|
393
434
|
_collectorId: collectorEntry.collectorId, // Store the type for frontend schema linking
|
|
394
435
|
_assertionFailed: assertionFailed, // null if no assertion failed
|
|
395
|
-
...collectorResult.result,
|
|
436
|
+
...strippedResult,
|
|
396
437
|
};
|
|
397
438
|
} catch (error) {
|
|
398
439
|
hasCollectorError = true;
|
|
@@ -463,6 +504,7 @@ async function executeHealthCheckJob(props: {
|
|
|
463
504
|
newStatus: newState.status,
|
|
464
505
|
catalogClient,
|
|
465
506
|
maintenanceClient,
|
|
507
|
+
incidentClient,
|
|
466
508
|
logger,
|
|
467
509
|
});
|
|
468
510
|
|
|
@@ -557,6 +599,7 @@ async function executeHealthCheckJob(props: {
|
|
|
557
599
|
newStatus: newState.status,
|
|
558
600
|
catalogClient,
|
|
559
601
|
maintenanceClient,
|
|
602
|
+
incidentClient,
|
|
560
603
|
logger,
|
|
561
604
|
});
|
|
562
605
|
|
|
@@ -612,6 +655,7 @@ export async function setupHealthCheckWorker(props: {
|
|
|
612
655
|
signalService: SignalService;
|
|
613
656
|
catalogClient: CatalogClient;
|
|
614
657
|
maintenanceClient: MaintenanceClient;
|
|
658
|
+
incidentClient: IncidentClient;
|
|
615
659
|
getEmitHook: () => EmitHookFn | undefined;
|
|
616
660
|
}): Promise<void> {
|
|
617
661
|
const {
|
|
@@ -623,6 +667,7 @@ export async function setupHealthCheckWorker(props: {
|
|
|
623
667
|
signalService,
|
|
624
668
|
catalogClient,
|
|
625
669
|
maintenanceClient,
|
|
670
|
+
incidentClient,
|
|
626
671
|
getEmitHook,
|
|
627
672
|
} = props;
|
|
628
673
|
|
|
@@ -641,6 +686,7 @@ export async function setupHealthCheckWorker(props: {
|
|
|
641
686
|
signalService,
|
|
642
687
|
catalogClient,
|
|
643
688
|
maintenanceClient,
|
|
689
|
+
incidentClient,
|
|
644
690
|
getEmitHook,
|
|
645
691
|
});
|
|
646
692
|
},
|
package/src/retention-job.ts
CHANGED
|
@@ -2,6 +2,7 @@ import type {
|
|
|
2
2
|
HealthCheckRegistry,
|
|
3
3
|
Logger,
|
|
4
4
|
SafeDatabase,
|
|
5
|
+
CollectorRegistry,
|
|
5
6
|
} from "@checkstack/backend-api";
|
|
6
7
|
import * as schema from "./schema";
|
|
7
8
|
import {
|
|
@@ -13,12 +14,19 @@ import {
|
|
|
13
14
|
} from "./schema";
|
|
14
15
|
import { eq, and, lt, sql } from "drizzle-orm";
|
|
15
16
|
import type { QueueManager } from "@checkstack/queue-api";
|
|
17
|
+
import {
|
|
18
|
+
aggregateCollectorData,
|
|
19
|
+
calculateLatencyStats,
|
|
20
|
+
countStatuses,
|
|
21
|
+
extractLatencies,
|
|
22
|
+
} from "./aggregation-utils";
|
|
16
23
|
|
|
17
24
|
type Db = SafeDatabase<typeof schema>;
|
|
18
25
|
|
|
19
26
|
interface RetentionJobDeps {
|
|
20
27
|
db: Db;
|
|
21
28
|
registry: HealthCheckRegistry;
|
|
29
|
+
collectorRegistry: CollectorRegistry;
|
|
22
30
|
logger: Logger;
|
|
23
31
|
queueManager: QueueManager;
|
|
24
32
|
}
|
|
@@ -36,7 +44,7 @@ interface RetentionJobPayload {
|
|
|
36
44
|
* 3. Deletes expired daily aggregates
|
|
37
45
|
*/
|
|
38
46
|
export async function setupRetentionJob(deps: RetentionJobDeps) {
|
|
39
|
-
const { queueManager, logger, db, registry } = deps;
|
|
47
|
+
const { queueManager, logger, db, registry, collectorRegistry } = deps;
|
|
40
48
|
|
|
41
49
|
const queue = queueManager.getQueue<RetentionJobPayload>(RETENTION_QUEUE);
|
|
42
50
|
|
|
@@ -44,7 +52,13 @@ export async function setupRetentionJob(deps: RetentionJobDeps) {
|
|
|
44
52
|
await queue.consume(
|
|
45
53
|
async () => {
|
|
46
54
|
logger.info("Starting health check retention job");
|
|
47
|
-
await runRetentionJob({ db, registry, logger, queueManager });
|
|
55
|
+
await runRetentionJob({
|
|
56
|
+
db,
|
|
57
|
+
registry,
|
|
58
|
+
collectorRegistry,
|
|
59
|
+
logger,
|
|
60
|
+
queueManager,
|
|
61
|
+
});
|
|
48
62
|
logger.info("Completed health check retention job");
|
|
49
63
|
},
|
|
50
64
|
{ consumerGroup: "retention-worker" },
|
|
@@ -66,7 +80,7 @@ export async function setupRetentionJob(deps: RetentionJobDeps) {
|
|
|
66
80
|
* Main retention job logic
|
|
67
81
|
*/
|
|
68
82
|
export async function runRetentionJob(deps: RetentionJobDeps) {
|
|
69
|
-
const { db, registry, logger } = deps;
|
|
83
|
+
const { db, registry, collectorRegistry, logger } = deps;
|
|
70
84
|
|
|
71
85
|
// Get all unique system-config assignments
|
|
72
86
|
const assignments = await db.select().from(systemHealthChecks);
|
|
@@ -80,6 +94,7 @@ export async function runRetentionJob(deps: RetentionJobDeps) {
|
|
|
80
94
|
await aggregateRawRuns({
|
|
81
95
|
db,
|
|
82
96
|
registry,
|
|
97
|
+
collectorRegistry,
|
|
83
98
|
systemId: assignment.systemId,
|
|
84
99
|
configurationId: assignment.configurationId,
|
|
85
100
|
rawRetentionDays: retentionConfig.rawRetentionDays,
|
|
@@ -112,6 +127,7 @@ export async function runRetentionJob(deps: RetentionJobDeps) {
|
|
|
112
127
|
interface AggregateRawRunsParams {
|
|
113
128
|
db: Db;
|
|
114
129
|
registry: HealthCheckRegistry;
|
|
130
|
+
collectorRegistry: CollectorRegistry;
|
|
115
131
|
systemId: string;
|
|
116
132
|
configurationId: string;
|
|
117
133
|
rawRetentionDays: number;
|
|
@@ -121,7 +137,14 @@ interface AggregateRawRunsParams {
|
|
|
121
137
|
* Aggregates raw runs older than retention period into hourly buckets
|
|
122
138
|
*/
|
|
123
139
|
async function aggregateRawRuns(params: AggregateRawRunsParams) {
|
|
124
|
-
const { db, registry, systemId, configurationId, rawRetentionDays } = params;
|
|
140
|
+
const {
|
|
141
|
+
db,
|
|
142
|
+
registry,
|
|
143
|
+
collectorRegistry,
|
|
144
|
+
systemId,
|
|
145
|
+
configurationId,
|
|
146
|
+
rawRetentionDays,
|
|
147
|
+
} = params;
|
|
125
148
|
|
|
126
149
|
const cutoffDate = new Date();
|
|
127
150
|
cutoffDate.setDate(cutoffDate.getDate() - rawRetentionDays);
|
|
@@ -183,42 +206,43 @@ async function aggregateRawRuns(params: AggregateRawRunsParams) {
|
|
|
183
206
|
|
|
184
207
|
// Create aggregates and delete raw runs
|
|
185
208
|
for (const [, bucket] of buckets) {
|
|
186
|
-
// Calculate metrics
|
|
187
209
|
const runCount = bucket.runs.length;
|
|
188
|
-
let healthyCount = 0;
|
|
189
|
-
let degradedCount = 0;
|
|
190
|
-
let unhealthyCount = 0;
|
|
191
|
-
for (const r of bucket.runs) {
|
|
192
|
-
if (r.status === "healthy") healthyCount++;
|
|
193
|
-
if (r.status === "degraded") degradedCount++;
|
|
194
|
-
if (r.status === "unhealthy") unhealthyCount++;
|
|
195
|
-
}
|
|
196
210
|
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
.
|
|
200
|
-
|
|
201
|
-
let avgLatencyMs: number | undefined;
|
|
202
|
-
let minLatencyMs: number | undefined;
|
|
203
|
-
let maxLatencyMs: number | undefined;
|
|
204
|
-
let p95LatencyMs: number | undefined;
|
|
205
|
-
|
|
206
|
-
if (latencies.length > 0) {
|
|
207
|
-
let sum = 0;
|
|
208
|
-
for (const l of latencies) sum += l;
|
|
209
|
-
avgLatencyMs = Math.round(sum / latencies.length);
|
|
210
|
-
minLatencyMs = Math.min(...latencies);
|
|
211
|
-
maxLatencyMs = Math.max(...latencies);
|
|
212
|
-
p95LatencyMs = calculatePercentile(latencies, 95);
|
|
213
|
-
}
|
|
211
|
+
// Calculate status counts
|
|
212
|
+
const { healthyCount, degradedCount, unhealthyCount } = countStatuses(
|
|
213
|
+
bucket.runs,
|
|
214
|
+
);
|
|
214
215
|
|
|
215
|
-
//
|
|
216
|
+
// Calculate latency stats
|
|
217
|
+
const latencies = extractLatencies(bucket.runs);
|
|
218
|
+
const {
|
|
219
|
+
latencySumMs,
|
|
220
|
+
avgLatencyMs,
|
|
221
|
+
minLatencyMs,
|
|
222
|
+
maxLatencyMs,
|
|
223
|
+
p95LatencyMs,
|
|
224
|
+
} = calculateLatencyStats(latencies);
|
|
225
|
+
|
|
226
|
+
// Aggregate strategy result
|
|
216
227
|
let aggregatedResult: Record<string, unknown> | undefined;
|
|
217
228
|
if (strategy) {
|
|
218
|
-
|
|
229
|
+
const strategyResult = strategy.aggregateResult(bucket.runs) as Record<
|
|
219
230
|
string,
|
|
220
231
|
unknown
|
|
221
232
|
>;
|
|
233
|
+
|
|
234
|
+
// Aggregate collector data
|
|
235
|
+
const collectorsAggregated = aggregateCollectorData(
|
|
236
|
+
bucket.runs,
|
|
237
|
+
collectorRegistry,
|
|
238
|
+
);
|
|
239
|
+
|
|
240
|
+
aggregatedResult = {
|
|
241
|
+
...strategyResult,
|
|
242
|
+
...(Object.keys(collectorsAggregated).length > 0
|
|
243
|
+
? { collectors: collectorsAggregated }
|
|
244
|
+
: {}),
|
|
245
|
+
};
|
|
222
246
|
}
|
|
223
247
|
|
|
224
248
|
// Insert or update aggregate
|
|
@@ -233,6 +257,7 @@ async function aggregateRawRuns(params: AggregateRawRunsParams) {
|
|
|
233
257
|
healthyCount,
|
|
234
258
|
degradedCount,
|
|
235
259
|
unhealthyCount,
|
|
260
|
+
latencySumMs,
|
|
236
261
|
avgLatencyMs,
|
|
237
262
|
minLatencyMs,
|
|
238
263
|
maxLatencyMs,
|
|
@@ -319,20 +344,23 @@ async function rollupHourlyAggregates(params: RollupParams) {
|
|
|
319
344
|
let healthyCount = 0;
|
|
320
345
|
let degradedCount = 0;
|
|
321
346
|
let unhealthyCount = 0;
|
|
322
|
-
let
|
|
347
|
+
let latencySumMs = 0;
|
|
323
348
|
|
|
324
349
|
for (const a of bucket.aggregates) {
|
|
325
350
|
runCount += a.runCount;
|
|
326
351
|
healthyCount += a.healthyCount;
|
|
327
352
|
degradedCount += a.degradedCount;
|
|
328
353
|
unhealthyCount += a.unhealthyCount;
|
|
329
|
-
if
|
|
330
|
-
|
|
354
|
+
// Use latencySumMs if available, fallback to avg*count approximation
|
|
355
|
+
if (a.latencySumMs !== null) {
|
|
356
|
+
latencySumMs += a.latencySumMs;
|
|
357
|
+
} else if (a.avgLatencyMs !== null) {
|
|
358
|
+
latencySumMs += a.avgLatencyMs * a.runCount;
|
|
331
359
|
}
|
|
332
360
|
}
|
|
333
361
|
|
|
334
362
|
const avgLatencyMs =
|
|
335
|
-
runCount > 0 ? Math.round(
|
|
363
|
+
runCount > 0 ? Math.round(latencySumMs / runCount) : undefined;
|
|
336
364
|
|
|
337
365
|
// Min/max across all hourly buckets
|
|
338
366
|
const minValues = bucket.aggregates
|
|
@@ -341,10 +369,16 @@ async function rollupHourlyAggregates(params: RollupParams) {
|
|
|
341
369
|
const maxValues = bucket.aggregates
|
|
342
370
|
.map((a) => a.maxLatencyMs)
|
|
343
371
|
.filter((v): v is number => v !== null);
|
|
372
|
+
const p95Values = bucket.aggregates
|
|
373
|
+
.map((a) => a.p95LatencyMs)
|
|
374
|
+
.filter((v): v is number => v !== null);
|
|
344
375
|
const minLatencyMs =
|
|
345
376
|
minValues.length > 0 ? Math.min(...minValues) : undefined;
|
|
346
377
|
const maxLatencyMs =
|
|
347
378
|
maxValues.length > 0 ? Math.max(...maxValues) : undefined;
|
|
379
|
+
// Use max of hourly p95s as upper bound approximation
|
|
380
|
+
const p95LatencyMs =
|
|
381
|
+
p95Values.length > 0 ? Math.max(...p95Values) : undefined;
|
|
348
382
|
|
|
349
383
|
// Insert daily aggregate
|
|
350
384
|
await db.insert(healthCheckAggregates).values({
|
|
@@ -356,10 +390,11 @@ async function rollupHourlyAggregates(params: RollupParams) {
|
|
|
356
390
|
healthyCount,
|
|
357
391
|
degradedCount,
|
|
358
392
|
unhealthyCount,
|
|
393
|
+
latencySumMs: latencySumMs > 0 ? latencySumMs : undefined,
|
|
359
394
|
avgLatencyMs,
|
|
360
395
|
minLatencyMs,
|
|
361
396
|
maxLatencyMs,
|
|
362
|
-
p95LatencyMs
|
|
397
|
+
p95LatencyMs,
|
|
363
398
|
aggregatedResult: undefined, // Cannot combine result across hours
|
|
364
399
|
});
|
|
365
400
|
|
|
@@ -399,9 +434,3 @@ async function deleteExpiredAggregates(params: DeleteExpiredParams) {
|
|
|
399
434
|
),
|
|
400
435
|
);
|
|
401
436
|
}
|
|
402
|
-
|
|
403
|
-
function calculatePercentile(values: number[], percentile: number): number {
|
|
404
|
-
const sorted = values.toSorted((a, b) => a - b);
|
|
405
|
-
const index = Math.ceil((percentile / 100) * sorted.length) - 1;
|
|
406
|
-
return sorted[Math.max(0, index)];
|
|
407
|
-
}
|
package/src/router.test.ts
CHANGED
|
@@ -43,7 +43,18 @@ describe("HealthCheck Router", () => {
|
|
|
43
43
|
getStrategiesWithMeta: mock(() => []),
|
|
44
44
|
};
|
|
45
45
|
|
|
46
|
-
const router = createHealthCheckRouter(mockDb as never, mockRegistry);
|
|
46
|
+
const mockCollectorRegistry = {
|
|
47
|
+
register: mock(),
|
|
48
|
+
getCollector: mock(),
|
|
49
|
+
getCollectors: mock(() => []),
|
|
50
|
+
getCollectorsForPlugin: mock(() => []),
|
|
51
|
+
};
|
|
52
|
+
|
|
53
|
+
const router = createHealthCheckRouter(
|
|
54
|
+
mockDb as never,
|
|
55
|
+
mockRegistry,
|
|
56
|
+
mockCollectorRegistry as never,
|
|
57
|
+
);
|
|
47
58
|
|
|
48
59
|
it("getStrategies returns strategies from registry", async () => {
|
|
49
60
|
const context = createMockRpcContext({
|
|
@@ -119,7 +130,7 @@ describe("HealthCheck Router", () => {
|
|
|
119
130
|
const result = await call(
|
|
120
131
|
router.getCollectors,
|
|
121
132
|
{ strategyId: "healthcheck-ssh" },
|
|
122
|
-
{ context }
|
|
133
|
+
{ context },
|
|
123
134
|
);
|
|
124
135
|
expect(result).toHaveLength(1);
|
|
125
136
|
expect(result[0].id).toBe("collector-hardware.cpu");
|
|
@@ -139,7 +150,7 @@ describe("HealthCheck Router", () => {
|
|
|
139
150
|
const result = await call(
|
|
140
151
|
router.getCollectors,
|
|
141
152
|
{ strategyId: "unknown" },
|
|
142
|
-
{ context }
|
|
153
|
+
{ context },
|
|
143
154
|
);
|
|
144
155
|
expect(result).toHaveLength(0);
|
|
145
156
|
});
|
package/src/router.ts
CHANGED
|
@@ -5,6 +5,7 @@ import {
|
|
|
5
5
|
type RpcContext,
|
|
6
6
|
type HealthCheckRegistry,
|
|
7
7
|
type SafeDatabase,
|
|
8
|
+
type CollectorRegistry,
|
|
8
9
|
} from "@checkstack/backend-api";
|
|
9
10
|
import { healthCheckContract } from "@checkstack/healthcheck-common";
|
|
10
11
|
import { HealthCheckService } from "./service";
|
|
@@ -20,9 +21,10 @@ import { toJsonSchemaWithChartMeta } from "./schema-utils";
|
|
|
20
21
|
export const createHealthCheckRouter = (
|
|
21
22
|
database: SafeDatabase<typeof schema>,
|
|
22
23
|
registry: HealthCheckRegistry,
|
|
24
|
+
collectorRegistry: CollectorRegistry,
|
|
23
25
|
) => {
|
|
24
26
|
// Create service instance once - shared across all handlers
|
|
25
|
-
const service = new HealthCheckService(database, registry);
|
|
27
|
+
const service = new HealthCheckService(database, registry, collectorRegistry);
|
|
26
28
|
|
|
27
29
|
// Create contract implementer with context type AND auto auth middleware
|
|
28
30
|
const os = implement(healthCheckContract)
|
|
@@ -74,6 +76,9 @@ export const createHealthCheckRouter = (
|
|
|
74
76
|
description: collector.description,
|
|
75
77
|
configSchema: toJsonSchema(collector.config.schema),
|
|
76
78
|
resultSchema: toJsonSchemaWithChartMeta(collector.result.schema),
|
|
79
|
+
aggregatedResultSchema: collector.aggregatedResult
|
|
80
|
+
? toJsonSchemaWithChartMeta(collector.aggregatedResult.schema)
|
|
81
|
+
: undefined,
|
|
77
82
|
allowMultiple: collector.allowMultiple ?? false,
|
|
78
83
|
}));
|
|
79
84
|
}),
|
|
@@ -100,6 +105,14 @@ export const createHealthCheckRouter = (
|
|
|
100
105
|
await service.deleteConfiguration(input);
|
|
101
106
|
}),
|
|
102
107
|
|
|
108
|
+
pauseConfiguration: os.pauseConfiguration.handler(async ({ input }) => {
|
|
109
|
+
await service.pauseConfiguration(input);
|
|
110
|
+
}),
|
|
111
|
+
|
|
112
|
+
resumeConfiguration: os.resumeConfiguration.handler(async ({ input }) => {
|
|
113
|
+
await service.resumeConfiguration(input);
|
|
114
|
+
}),
|
|
115
|
+
|
|
103
116
|
getSystemConfigurations: os.getSystemConfigurations.handler(
|
|
104
117
|
async ({ input }) => {
|
|
105
118
|
return service.getSystemConfigurations(input);
|
package/src/schema.ts
CHANGED
|
@@ -45,9 +45,11 @@ export const healthCheckConfigurations = pgTable(
|
|
|
45
45
|
collectors: jsonb("collectors").$type<CollectorConfigEntry[]>(),
|
|
46
46
|
intervalSeconds: integer("interval_seconds").notNull(),
|
|
47
47
|
isTemplate: boolean("is_template").default(false),
|
|
48
|
+
/** Whether this configuration is paused (execution skipped for all systems) */
|
|
49
|
+
paused: boolean("paused").default(false).notNull(),
|
|
48
50
|
createdAt: timestamp("created_at").defaultNow().notNull(),
|
|
49
51
|
updatedAt: timestamp("updated_at").defaultNow().notNull(),
|
|
50
|
-
}
|
|
52
|
+
},
|
|
51
53
|
);
|
|
52
54
|
|
|
53
55
|
/**
|
|
@@ -93,7 +95,7 @@ export const systemHealthChecks = pgTable(
|
|
|
93
95
|
},
|
|
94
96
|
(t) => ({
|
|
95
97
|
pk: primaryKey({ columns: [t.systemId, t.configurationId] }),
|
|
96
|
-
})
|
|
98
|
+
}),
|
|
97
99
|
);
|
|
98
100
|
|
|
99
101
|
export const healthCheckRuns = pgTable("health_check_runs", {
|
|
@@ -137,6 +139,8 @@ export const healthCheckAggregates = pgTable(
|
|
|
137
139
|
healthyCount: integer("healthy_count").notNull(),
|
|
138
140
|
degradedCount: integer("degraded_count").notNull(),
|
|
139
141
|
unhealthyCount: integer("unhealthy_count").notNull(),
|
|
142
|
+
/** Sum of all latencies in this bucket (for accurate averaging when combining) */
|
|
143
|
+
latencySumMs: integer("latency_sum_ms"),
|
|
140
144
|
avgLatencyMs: integer("avg_latency_ms"),
|
|
141
145
|
minLatencyMs: integer("min_latency_ms"),
|
|
142
146
|
maxLatencyMs: integer("max_latency_ms"),
|
|
@@ -152,7 +156,7 @@ export const healthCheckAggregates = pgTable(
|
|
|
152
156
|
t.configurationId,
|
|
153
157
|
t.systemId,
|
|
154
158
|
t.bucketStart,
|
|
155
|
-
t.bucketSize
|
|
159
|
+
t.bucketSize,
|
|
156
160
|
),
|
|
157
|
-
})
|
|
161
|
+
}),
|
|
158
162
|
);
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
import { describe, it, expect, mock, beforeEach } from "bun:test";
|
|
2
|
+
import { HealthCheckService } from "./service";
|
|
3
|
+
import { createMockDb } from "@checkstack/test-utils-backend";
|
|
4
|
+
|
|
5
|
+
describe("HealthCheckService - pause/resume", () => {
|
|
6
|
+
let mockDb: ReturnType<typeof createMockDb>;
|
|
7
|
+
let service: HealthCheckService;
|
|
8
|
+
let mockUpdate: ReturnType<typeof mock>;
|
|
9
|
+
let mockSet: ReturnType<typeof mock>;
|
|
10
|
+
let mockWhere: ReturnType<typeof mock>;
|
|
11
|
+
|
|
12
|
+
beforeEach(() => {
|
|
13
|
+
mockDb = createMockDb();
|
|
14
|
+
mockWhere = mock(() => Promise.resolve());
|
|
15
|
+
mockSet = mock(() => ({ where: mockWhere }));
|
|
16
|
+
mockUpdate = mock(() => ({ set: mockSet }));
|
|
17
|
+
(mockDb.update as any) = mockUpdate;
|
|
18
|
+
service = new HealthCheckService(mockDb as any);
|
|
19
|
+
});
|
|
20
|
+
|
|
21
|
+
describe("pauseConfiguration", () => {
|
|
22
|
+
it("should update paused to true and set updatedAt", async () => {
|
|
23
|
+
await service.pauseConfiguration("config-123");
|
|
24
|
+
|
|
25
|
+
expect(mockUpdate).toHaveBeenCalled();
|
|
26
|
+
expect(mockSet).toHaveBeenCalledWith(
|
|
27
|
+
expect.objectContaining({
|
|
28
|
+
paused: true,
|
|
29
|
+
updatedAt: expect.any(Date),
|
|
30
|
+
}),
|
|
31
|
+
);
|
|
32
|
+
expect(mockWhere).toHaveBeenCalled();
|
|
33
|
+
});
|
|
34
|
+
});
|
|
35
|
+
|
|
36
|
+
describe("resumeConfiguration", () => {
|
|
37
|
+
it("should update paused to false and set updatedAt", async () => {
|
|
38
|
+
await service.resumeConfiguration("config-456");
|
|
39
|
+
|
|
40
|
+
expect(mockUpdate).toHaveBeenCalled();
|
|
41
|
+
expect(mockSet).toHaveBeenCalledWith(
|
|
42
|
+
expect.objectContaining({
|
|
43
|
+
paused: false,
|
|
44
|
+
updatedAt: expect.any(Date),
|
|
45
|
+
}),
|
|
46
|
+
);
|
|
47
|
+
expect(mockWhere).toHaveBeenCalled();
|
|
48
|
+
});
|
|
49
|
+
});
|
|
50
|
+
});
|