@checkstack/healthcheck-backend 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +181 -0
- package/drizzle/0000_stormy_slayback.sql +33 -0
- package/drizzle/0001_thin_shotgun.sql +1 -0
- package/drizzle/0002_closed_lucky_pierre.sql +19 -0
- package/drizzle/0003_powerful_rage.sql +1 -0
- package/drizzle/0004_short_ezekiel.sql +1 -0
- package/drizzle/0005_glossy_longshot.sql +1 -0
- package/drizzle/meta/0000_snapshot.json +234 -0
- package/drizzle/meta/0001_snapshot.json +240 -0
- package/drizzle/meta/0002_snapshot.json +361 -0
- package/drizzle/meta/0003_snapshot.json +367 -0
- package/drizzle/meta/0004_snapshot.json +401 -0
- package/drizzle/meta/0005_snapshot.json +401 -0
- package/drizzle/meta/_journal.json +48 -0
- package/drizzle.config.ts +7 -0
- package/package.json +37 -0
- package/src/aggregation.test.ts +373 -0
- package/src/hooks.test.ts +16 -0
- package/src/hooks.ts +35 -0
- package/src/index.ts +195 -0
- package/src/queue-executor.test.ts +229 -0
- package/src/queue-executor.ts +569 -0
- package/src/retention-job.ts +404 -0
- package/src/router.test.ts +81 -0
- package/src/router.ts +157 -0
- package/src/schema.ts +153 -0
- package/src/service.ts +718 -0
- package/src/state-evaluator.test.ts +237 -0
- package/src/state-evaluator.ts +105 -0
- package/src/state-thresholds-migrations.ts +15 -0
- package/tsconfig.json +6 -0
package/src/queue-executor.ts
@@ -0,0 +1,569 @@
+import {
+  HealthCheckRegistry,
+  Logger,
+  type EmitHookFn,
+} from "@checkstack/backend-api";
+import { QueueManager } from "@checkstack/queue-api";
+import {
+  healthCheckConfigurations,
+  systemHealthChecks,
+  healthCheckRuns,
+} from "./schema";
+import * as schema from "./schema";
+import { eq, and, max } from "drizzle-orm";
+import { NodePgDatabase } from "drizzle-orm/node-postgres";
+import { type SignalService } from "@checkstack/signal-common";
+import {
+  HEALTH_CHECK_RUN_COMPLETED,
+  type HealthCheckStatus,
+} from "@checkstack/healthcheck-common";
+import { CatalogApi, catalogRoutes } from "@checkstack/catalog-common";
+import { resolveRoute, type InferClient } from "@checkstack/common";
+import { HealthCheckService } from "./service";
+import { healthCheckHooks } from "./hooks";
+
+type Db = NodePgDatabase<typeof schema>;
+type CatalogClient = InferClient<typeof CatalogApi>;
+
+/**
+ * Payload for health check queue jobs
+ */
+export interface HealthCheckJobPayload {
+  configId: string;
+  systemId: string;
+}
+
+/**
+ * Queue name for health check execution
+ */
+const HEALTH_CHECK_QUEUE = "health-checks";
+
+/**
+ * Worker group for health check execution (work-queue mode)
+ */
+const WORKER_GROUP = "health-check-executor";
+
+/**
+ * Schedule a health check for execution using recurring jobs
+ * @param queueManager - Queue manager service
+ * @param payload - Health check job payload
+ * @param intervalSeconds - Interval between executions
+ * @param startDelay - Optional delay before first execution (for delta-based scheduling)
+ * @param logger - Optional logger
+ */
+export async function scheduleHealthCheck(props: {
+  queueManager: QueueManager;
+  payload: HealthCheckJobPayload;
+  intervalSeconds: number;
+  startDelay?: number;
+  logger?: Logger;
+}): Promise<string> {
+  const {
+    queueManager,
+    payload,
+    intervalSeconds,
+    startDelay = 0,
+    logger,
+  } = props;
+
+  const queue =
+    queueManager.getQueue<HealthCheckJobPayload>(HEALTH_CHECK_QUEUE);
+
+  const jobId = `healthcheck:${payload.configId}:${payload.systemId}`;
+
+  logger?.debug(
+    `Scheduling recurring health check ${jobId} with interval ${intervalSeconds}s, startDelay ${startDelay}s`
+  );
+
+  return queue.scheduleRecurring(payload, {
+    jobId,
+    intervalSeconds,
+    startDelay,
+    priority: 0,
+  });
+}
+
+/**
+ * Notify system subscribers about a health state change.
+ */
+async function notifyStateChange(props: {
+  systemId: string;
+  previousStatus: HealthCheckStatus;
+  newStatus: HealthCheckStatus;
+  catalogClient: CatalogClient;
+  logger: Logger;
+}): Promise<void> {
+  const { systemId, previousStatus, newStatus, catalogClient, logger } = props;
+
+  // Only notify on actual state changes
+  if (newStatus === previousStatus) {
+    return;
+  }
+
+  const isRecovery = newStatus === "healthy" && previousStatus !== "healthy";
+  const isDegraded = newStatus === "degraded";
+  const isUnhealthy = newStatus === "unhealthy";
+
+  let title: string;
+  let body: string;
+  let importance: "info" | "warning" | "critical";
+
+  if (isRecovery) {
+    title = "System health restored";
+    body =
+      "All health checks are now passing. The system has returned to normal operation.";
+    importance = "info";
+  } else if (isUnhealthy) {
+    title = "System health critical";
+    body = "Health checks indicate the system is unhealthy and may be down.";
+    importance = "critical";
+  } else if (isDegraded) {
+    title = "System health degraded";
+    body =
+      "Some health checks are failing. The system may be experiencing issues.";
+    importance = "warning";
+  } else {
+    // No notification for healthy → healthy (if somehow missed above)
+    return;
+  }
+
+  const systemDetailPath = resolveRoute(catalogRoutes.routes.systemDetail, {
+    systemId,
+  });
+
+  try {
+    await catalogClient.notifySystemSubscribers({
+      systemId,
+      title,
+      body,
+      importance,
+      action: { label: "View System", url: systemDetailPath },
+      includeGroupSubscribers: true,
+    });
+    logger.debug(
+      `Notified subscribers: ${previousStatus} → ${newStatus} for system ${systemId}`
+    );
+  } catch (error) {
+    // Log but don't fail the operation - notifications are best-effort
+    logger.warn(
+      `Failed to notify subscribers for health state change on system ${systemId}:`,
+      error
+    );
+  }
+}
+
+/**
+ * Execute a health check job
+ */
+async function executeHealthCheckJob(props: {
+  payload: HealthCheckJobPayload;
+  db: Db;
+  registry: HealthCheckRegistry;
+  logger: Logger;
+  signalService: SignalService;
+  catalogClient: CatalogClient;
+  getEmitHook: () => EmitHookFn | undefined;
+}): Promise<void> {
+  const {
+    payload,
+    db,
+    registry,
+    logger,
+    signalService,
+    catalogClient,
+    getEmitHook,
+  } = props;
+  const { configId, systemId } = payload;
+
+  // Create service for aggregated state evaluation
+  const service = new HealthCheckService(db);
+
+  // Capture aggregated state BEFORE this run for comparison
+  const previousState = await service.getSystemHealthStatus(systemId);
+  const previousStatus = previousState.status;
+
+  try {
+    // Fetch configuration (including name for signals)
+    const [configRow] = await db
+      .select({
+        configId: healthCheckConfigurations.id,
+        configName: healthCheckConfigurations.name,
+        strategyId: healthCheckConfigurations.strategyId,
+        config: healthCheckConfigurations.config,
+        interval: healthCheckConfigurations.intervalSeconds,
+        enabled: systemHealthChecks.enabled,
+      })
+      .from(systemHealthChecks)
+      .innerJoin(
+        healthCheckConfigurations,
+        eq(systemHealthChecks.configurationId, healthCheckConfigurations.id)
+      )
+      .where(
+        and(
+          eq(systemHealthChecks.systemId, systemId),
+          eq(systemHealthChecks.configurationId, configId),
+          eq(systemHealthChecks.enabled, true)
+        )
+      );
+
+    // If configuration not found or disabled, exit without rescheduling
+    if (!configRow) {
+      logger.debug(
+        `Health check ${configId} for system ${systemId} not found or disabled, not rescheduling`
+      );
+      return;
+    }
+
+    // Fetch system name for signal payload
+    let systemName = systemId;
+    try {
+      const system = await catalogClient.getSystem({ systemId });
+      if (system) {
+        systemName = system.name;
+      }
+    } catch {
+      // Fall back to systemId if catalog lookup fails
+      logger.debug(`Could not fetch system name for ${systemId}, using ID`);
+    }
+
+    const strategy = registry.getStrategy(configRow.strategyId);
+    if (!strategy) {
+      logger.warn(
+        `Strategy ${configRow.strategyId} not found for config ${configId}`
+      );
+      return;
+    }
+
+    // Execute health check
+    const result = await strategy.execute(
+      configRow.config as Record<string, unknown>
+    );
+
+    // Store result (spread to convert structured type to plain record for jsonb)
+    await db.insert(healthCheckRuns).values({
+      configurationId: configId,
+      systemId,
+      status: result.status,
+      latencyMs: result.latencyMs,
+      result: { ...result } as Record<string, unknown>,
+    });
+
+    logger.debug(
+      `Ran health check ${configId} for system ${systemId}: ${result.status}`
+    );
+
+    // Broadcast enriched signal for realtime frontend updates (e.g., terminal feed)
+    await signalService.broadcast(HEALTH_CHECK_RUN_COMPLETED, {
+      systemId,
+      systemName,
+      configurationId: configId,
+      configurationName: configRow.configName,
+      status: result.status,
+      latencyMs: result.latencyMs,
+    });
+
+    // Check if aggregated state changed and notify subscribers
+    const newState = await service.getSystemHealthStatus(systemId);
+    if (newState.status !== previousStatus) {
+      await notifyStateChange({
+        systemId,
+        previousStatus,
+        newStatus: newState.status,
+        catalogClient,
+        logger,
+      });
+
+      // Emit integration hooks for external integrations
+      const emitHook = getEmitHook();
+      if (emitHook) {
+        if (newState.status === "healthy" && previousStatus !== "healthy") {
+          // Recovery: system became healthy
+          await emitHook(healthCheckHooks.systemHealthy, {
+            systemId,
+            previousStatus,
+            healthyChecks: newState.checkStatuses.filter(
+              (c) => c.status === "healthy"
+            ).length,
+            totalChecks: newState.checkStatuses.length,
+            timestamp: new Date().toISOString(),
+          });
+          logger.debug(
+            `Emitted systemHealthy hook: ${previousStatus} → ${newState.status}`
+          );
+        } else if (
+          previousStatus === "healthy" &&
+          newState.status !== "healthy"
+        ) {
+          // Degradation: system went from healthy to unhealthy/degraded
+          await emitHook(healthCheckHooks.systemDegraded, {
+            systemId,
+            previousStatus,
+            newStatus: newState.status,
+            healthyChecks: newState.checkStatuses.filter(
+              (c) => c.status === "healthy"
+            ).length,
+            totalChecks: newState.checkStatuses.length,
+            timestamp: new Date().toISOString(),
+          });
+          logger.debug(
+            `Emitted systemDegraded hook: ${previousStatus} → ${newState.status}`
+          );
+        }
+      }
+    }
+
+    // Note: No manual rescheduling needed - recurring job handles it automatically
+  } catch (error) {
+    logger.error(
+      `Failed to execute health check ${configId} for system ${systemId}`,
+      error
+    );
+
+    // Store failure (no latencyMs for failures)
+    await db.insert(healthCheckRuns).values({
+      configurationId: configId,
+      systemId,
+      status: "unhealthy",
+      result: { error: String(error) } as Record<string, unknown>,
+    });
+
+    // Try to fetch names for the enriched signal (best-effort)
+    let systemName = systemId;
+    let configName = configId;
+    try {
+      const system = await catalogClient.getSystem({ systemId });
+      if (system) {
+        systemName = system.name;
+      }
+      const [config] = await db
+        .select({ name: healthCheckConfigurations.name })
+        .from(healthCheckConfigurations)
+        .where(eq(healthCheckConfigurations.id, configId));
+      if (config) {
+        configName = config.name;
+      }
+    } catch {
+      // Use IDs as fallback
+    }
+
+    // Broadcast enriched failure signal for realtime frontend updates
+    await signalService.broadcast(HEALTH_CHECK_RUN_COMPLETED, {
+      systemId,
+      systemName,
+      configurationId: configId,
+      configurationName: configName,
+      status: "unhealthy",
+    });
+
+    // Check if aggregated state changed and notify subscribers
+    const newState = await service.getSystemHealthStatus(systemId);
+    if (newState.status !== previousStatus) {
+      await notifyStateChange({
+        systemId,
+        previousStatus,
+        newStatus: newState.status,
+        catalogClient,
+        logger,
+      });
+
+      // Emit integration hooks for external integrations
+      const emitHook = getEmitHook();
+      if (emitHook) {
+        if (newState.status === "healthy" && previousStatus !== "healthy") {
+          // Recovery: system became healthy
+          await emitHook(healthCheckHooks.systemHealthy, {
+            systemId,
+            previousStatus,
+            healthyChecks: newState.checkStatuses.filter(
+              (c) => c.status === "healthy"
+            ).length,
+            totalChecks: newState.checkStatuses.length,
+            timestamp: new Date().toISOString(),
+          });
+          logger.debug(
+            `Emitted systemHealthy hook: ${previousStatus} → ${newState.status}`
+          );
+        } else if (
+          previousStatus === "healthy" &&
+          newState.status !== "healthy"
+        ) {
+          // Degradation: system went from healthy to unhealthy/degraded
+          await emitHook(healthCheckHooks.systemDegraded, {
+            systemId,
+            previousStatus,
+            newStatus: newState.status,
+            healthyChecks: newState.checkStatuses.filter(
+              (c) => c.status === "healthy"
+            ).length,
+            totalChecks: newState.checkStatuses.length,
+            timestamp: new Date().toISOString(),
+          });
+          logger.debug(
+            `Emitted systemDegraded hook: ${previousStatus} → ${newState.status}`
+          );
+        }
+      }
+    }
+
+    // Note: No manual rescheduling needed - recurring job handles it automatically
+  }
+}
+
+export async function setupHealthCheckWorker(props: {
+  db: Db;
+  registry: HealthCheckRegistry;
+  logger: Logger;
+  queueManager: QueueManager;
+  signalService: SignalService;
+  catalogClient: CatalogClient;
+  getEmitHook: () => EmitHookFn | undefined;
+}): Promise<void> {
+  const {
+    db,
+    registry,
+    logger,
+    queueManager,
+    signalService,
+    catalogClient,
+    getEmitHook,
+  } = props;
+
+  const queue =
+    queueManager.getQueue<HealthCheckJobPayload>(HEALTH_CHECK_QUEUE);
+
+  // Subscribe to health check queue in work-queue mode
+  await queue.consume(
+    async (job) => {
+      await executeHealthCheckJob({
+        payload: job.data,
+        db,
+        registry,
+        logger,
+        signalService,
+        catalogClient,
+        getEmitHook,
+      });
+    },
+    {
+      consumerGroup: WORKER_GROUP,
+      maxRetries: 0, // Health checks should not retry on failure
+    }
+  );
+
+  logger.debug("🎯 Health Check Worker subscribed to queue");
+}
+
+/**
+ * Bootstrap health checks by enqueueing all enabled checks
+ */
+export async function bootstrapHealthChecks(props: {
+  db: Db;
+  queueManager: QueueManager;
+  logger: Logger;
+}): Promise<void> {
+  const { db, queueManager, logger } = props;
+
+  // Get all enabled health checks
+  const enabledChecks = await db
+    .select({
+      systemId: systemHealthChecks.systemId,
+      configId: healthCheckConfigurations.id,
+      interval: healthCheckConfigurations.intervalSeconds,
+    })
+    .from(systemHealthChecks)
+    .innerJoin(
+      healthCheckConfigurations,
+      eq(systemHealthChecks.configurationId, healthCheckConfigurations.id)
+    )
+    .where(eq(systemHealthChecks.enabled, true));
+
+  // Get latest run timestamp for each system+config pair
+  // Using Drizzle's max() function for proper timestamp handling (no raw SQL)
+  const latestRuns = await db
+    .select({
+      systemId: healthCheckRuns.systemId,
+      configurationId: healthCheckRuns.configurationId,
+      maxTimestamp: max(healthCheckRuns.timestamp),
+    })
+    .from(healthCheckRuns)
+    .groupBy(healthCheckRuns.systemId, healthCheckRuns.configurationId);
+
+  // Create a lookup map for fast access
+  const lastRunMap = new Map<string, Date>();
+  for (const run of latestRuns) {
+    if (run.maxTimestamp) {
+      const key = `${run.systemId}:${run.configurationId}`;
+      lastRunMap.set(key, run.maxTimestamp);
+    }
+  }
+
+  logger.debug(`Bootstrapping ${enabledChecks.length} health checks`);
+
+  for (const check of enabledChecks) {
+    // Look up the last run from the map
+    const lastRunKey = `${check.systemId}:${check.configId}`;
+    const lastRun = lastRunMap.get(lastRunKey);
+
+    // Calculate delay for first run based on time since last run
+    let startDelay = 0;
+    if (lastRun) {
+      const elapsedSeconds = Math.floor(
+        (Date.now() - lastRun.getTime()) / 1000
+      );
+      if (elapsedSeconds < check.interval) {
+        // Not overdue yet - schedule with remaining time
+        startDelay = check.interval - elapsedSeconds;
+      }
+      // Otherwise it's overdue - run immediately (startDelay = 0)
+      logger.debug(
+        `Health check ${check.configId}:${
+          check.systemId
+        } - lastRun: ${lastRun.toISOString()}, elapsed: ${elapsedSeconds}s, interval: ${
+          check.interval
+        }s, startDelay: ${startDelay}s`
+      );
+    } else {
+      logger.debug(
+        `Health check ${check.configId}:${check.systemId} - no lastRun found, running immediately`
+      );
+    }
+
+    await scheduleHealthCheck({
+      queueManager,
+      payload: {
+        configId: check.configId,
+        systemId: check.systemId,
+      },
+      intervalSeconds: check.interval,
+      startDelay,
+      logger,
+    });
+  }
+
+  logger.debug(`✅ Bootstrapped ${enabledChecks.length} health checks`);
+
+  // Clean up orphaned jobs
+  const queue =
+    queueManager.getQueue<HealthCheckJobPayload>(HEALTH_CHECK_QUEUE);
+  const allRecurringJobs = await queue.listRecurringJobs();
+  const expectedJobIds = new Set(
+    enabledChecks.map(
+      (check) => `healthcheck:${check.configId}:${check.systemId}`
+    )
+  );
+
+  const orphanedJobs = allRecurringJobs.filter(
+    (jobId) => jobId.startsWith("healthcheck:") && !expectedJobIds.has(jobId)
+  );
+
+  for (const jobId of orphanedJobs) {
+    await queue.cancelRecurring(jobId);
+    logger.debug(`Removed orphaned job scheduler: ${jobId}`);
+  }
+
+  if (orphanedJobs.length > 0) {
+    logger.info(
+      `🧹 Cleaned up ${orphanedJobs.length} orphaned health check jobs`
+    );
+  }
+}
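
For orientation, below is a minimal sketch of how a host plugin might wire these two exports together at startup. The actual wiring lives in package/src/index.ts, which is not shown in this excerpt; the startHealthChecking wrapper and the way deps is obtained are illustrative assumptions, while setupHealthCheckWorker, bootstrapHealthChecks, and their props come from the file above.

// Sketch (assumption): wiring the worker and the bootstrap pass at plugin startup.
import {
  setupHealthCheckWorker,
  bootstrapHealthChecks,
} from "./queue-executor";

// Reuse the props type of setupHealthCheckWorker instead of restating each dependency.
type WorkerDeps = Parameters<typeof setupHealthCheckWorker>[0];

export async function startHealthChecking(deps: WorkerDeps): Promise<void> {
  // Subscribe to the "health-checks" queue so scheduled jobs actually execute.
  await setupHealthCheckWorker(deps);

  // Create one recurring job per enabled check (with a delta-based startDelay)
  // and cancel recurring jobs that no longer match an enabled configuration.
  await bootstrapHealthChecks({
    db: deps.db,
    queueManager: deps.queueManager,
    logger: deps.logger,
  });
}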