@checkstack/healthcheck-backend 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,569 @@
+ import {
+   HealthCheckRegistry,
+   Logger,
+   type EmitHookFn,
+ } from "@checkstack/backend-api";
+ import { QueueManager } from "@checkstack/queue-api";
+ import {
+   healthCheckConfigurations,
+   systemHealthChecks,
+   healthCheckRuns,
+ } from "./schema";
+ import * as schema from "./schema";
+ import { eq, and, max } from "drizzle-orm";
+ import { NodePgDatabase } from "drizzle-orm/node-postgres";
+ import { type SignalService } from "@checkstack/signal-common";
+ import {
+   HEALTH_CHECK_RUN_COMPLETED,
+   type HealthCheckStatus,
+ } from "@checkstack/healthcheck-common";
+ import { CatalogApi, catalogRoutes } from "@checkstack/catalog-common";
+ import { resolveRoute, type InferClient } from "@checkstack/common";
+ import { HealthCheckService } from "./service";
+ import { healthCheckHooks } from "./hooks";
+
+ type Db = NodePgDatabase<typeof schema>;
+ type CatalogClient = InferClient<typeof CatalogApi>;
+
+ /**
+  * Payload for health check queue jobs
+  */
+ export interface HealthCheckJobPayload {
+   configId: string;
+   systemId: string;
+ }
+
+ /**
+  * Queue name for health check execution
+  */
+ const HEALTH_CHECK_QUEUE = "health-checks";
+
+ /**
+  * Worker group for health check execution (work-queue mode)
+  */
+ const WORKER_GROUP = "health-check-executor";
+
+ /**
+  * Schedule a health check for execution using recurring jobs
+  * @param props.queueManager - Queue manager service
+  * @param props.payload - Health check job payload
+  * @param props.intervalSeconds - Interval between executions, in seconds
+  * @param props.startDelay - Optional delay in seconds before the first execution (for delta-based scheduling)
+  * @param props.logger - Optional logger
+  */
+ export async function scheduleHealthCheck(props: {
+   queueManager: QueueManager;
+   payload: HealthCheckJobPayload;
+   intervalSeconds: number;
+   startDelay?: number;
+   logger?: Logger;
+ }): Promise<string> {
+   const {
+     queueManager,
+     payload,
+     intervalSeconds,
+     startDelay = 0,
+     logger,
+   } = props;
+
+   const queue =
+     queueManager.getQueue<HealthCheckJobPayload>(HEALTH_CHECK_QUEUE);
+
+   const jobId = `healthcheck:${payload.configId}:${payload.systemId}`;
+
+   logger?.debug(
+     `Scheduling recurring health check ${jobId} with interval ${intervalSeconds}s, startDelay ${startDelay}s`
+   );
+
+   return queue.scheduleRecurring(payload, {
+     jobId,
+     intervalSeconds,
+     startDelay,
+     priority: 0,
+   });
+ }
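
For context, a minimal sketch of how a caller might use scheduleHealthCheck when a check is first enabled. The import path, the IDs, and the surrounding function are illustrative assumptions, not part of this package.

// Illustrative usage sketch (assumed caller code, not shipped in this package).
import { scheduleHealthCheck } from "@checkstack/healthcheck-backend"; // assumed entry-point export
import type { QueueManager } from "@checkstack/queue-api";
import type { Logger } from "@checkstack/backend-api";

async function onCheckEnabled(queueManager: QueueManager, logger: Logger) {
  const jobId = await scheduleHealthCheck({
    queueManager,
    payload: { configId: "cfg-example", systemId: "sys-example" }, // hypothetical IDs
    intervalSeconds: 60, // run every minute
    startDelay: 0, // execute the first run immediately
    logger,
  });
  logger.debug(`Scheduled recurring health check job ${jobId}`);
}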
+
+ /**
+  * Notify system subscribers about a health state change.
+  */
+ async function notifyStateChange(props: {
+   systemId: string;
+   previousStatus: HealthCheckStatus;
+   newStatus: HealthCheckStatus;
+   catalogClient: CatalogClient;
+   logger: Logger;
+ }): Promise<void> {
+   const { systemId, previousStatus, newStatus, catalogClient, logger } = props;
+
+   // Only notify on actual state changes
+   if (newStatus === previousStatus) {
+     return;
+   }
+
+   const isRecovery = newStatus === "healthy" && previousStatus !== "healthy";
+   const isDegraded = newStatus === "degraded";
+   const isUnhealthy = newStatus === "unhealthy";
+
+   let title: string;
+   let body: string;
+   let importance: "info" | "warning" | "critical";
+
+   if (isRecovery) {
+     title = "System health restored";
+     body =
+       "All health checks are now passing. The system has returned to normal operation.";
+     importance = "info";
+   } else if (isUnhealthy) {
+     title = "System health critical";
+     body = "Health checks indicate the system is unhealthy and may be down.";
+     importance = "critical";
+   } else if (isDegraded) {
+     title = "System health degraded";
+     body =
+       "Some health checks are failing. The system may be experiencing issues.";
+     importance = "warning";
+   } else {
+     // No notification for healthy → healthy (if somehow missed above)
+     return;
+   }
+
+   const systemDetailPath = resolveRoute(catalogRoutes.routes.systemDetail, {
+     systemId,
+   });
+
+   try {
+     await catalogClient.notifySystemSubscribers({
+       systemId,
+       title,
+       body,
+       importance,
+       action: { label: "View System", url: systemDetailPath },
+       includeGroupSubscribers: true,
+     });
+     logger.debug(
+       `Notified subscribers: ${previousStatus} → ${newStatus} for system ${systemId}`
+     );
+   } catch (error) {
+     // Log but don't fail the operation - notifications are best-effort
+     logger.warn(
+       `Failed to notify subscribers for health state change on system ${systemId}:`,
+       error
+     );
+   }
+ }
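
As a reading aid, the branching above reduces to a small status-transition mapping. The helper below is a sketch of that mapping only; it is not an export of this package.

// Sketch summarizing the notification mapping implemented by notifyStateChange.
import type { HealthCheckStatus } from "@checkstack/healthcheck-common";

type Importance = "info" | "warning" | "critical";

function classifyTransition(
  previous: HealthCheckStatus,
  next: HealthCheckStatus
): Importance | undefined {
  if (next === previous) return undefined; // unchanged: no notification
  if (next === "healthy") return "info"; // recovery
  if (next === "unhealthy") return "critical"; // outage
  if (next === "degraded") return "warning"; // partial failure
  return undefined; // unknown status: stay silent
}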
+
+ /**
+  * Execute a health check job
+  */
+ async function executeHealthCheckJob(props: {
+   payload: HealthCheckJobPayload;
+   db: Db;
+   registry: HealthCheckRegistry;
+   logger: Logger;
+   signalService: SignalService;
+   catalogClient: CatalogClient;
+   getEmitHook: () => EmitHookFn | undefined;
+ }): Promise<void> {
+   const {
+     payload,
+     db,
+     registry,
+     logger,
+     signalService,
+     catalogClient,
+     getEmitHook,
+   } = props;
+   const { configId, systemId } = payload;
+
+   // Create service for aggregated state evaluation
+   const service = new HealthCheckService(db);
+
+   // Capture aggregated state BEFORE this run for comparison
+   const previousState = await service.getSystemHealthStatus(systemId);
+   const previousStatus = previousState.status;
+
+   try {
+     // Fetch configuration (including name for signals)
+     const [configRow] = await db
+       .select({
+         configId: healthCheckConfigurations.id,
+         configName: healthCheckConfigurations.name,
+         strategyId: healthCheckConfigurations.strategyId,
+         config: healthCheckConfigurations.config,
+         interval: healthCheckConfigurations.intervalSeconds,
+         enabled: systemHealthChecks.enabled,
+       })
+       .from(systemHealthChecks)
+       .innerJoin(
+         healthCheckConfigurations,
+         eq(systemHealthChecks.configurationId, healthCheckConfigurations.id)
+       )
+       .where(
+         and(
+           eq(systemHealthChecks.systemId, systemId),
+           eq(systemHealthChecks.configurationId, configId),
+           eq(systemHealthChecks.enabled, true)
+         )
+       );
+
+     // If configuration not found or disabled, exit without rescheduling
+     if (!configRow) {
+       logger.debug(
+         `Health check ${configId} for system ${systemId} not found or disabled, not rescheduling`
+       );
+       return;
+     }
+
+     // Fetch system name for signal payload
+     let systemName = systemId;
+     try {
+       const system = await catalogClient.getSystem({ systemId });
+       if (system) {
+         systemName = system.name;
+       }
+     } catch {
+       // Fall back to systemId if catalog lookup fails
+       logger.debug(`Could not fetch system name for ${systemId}, using ID`);
+     }
+
+     const strategy = registry.getStrategy(configRow.strategyId);
+     if (!strategy) {
+       logger.warn(
+         `Strategy ${configRow.strategyId} not found for config ${configId}`
+       );
+       return;
+     }
+
+     // Execute health check
+     const result = await strategy.execute(
+       configRow.config as Record<string, unknown>
+     );
+
+     // Store result (spread to convert structured type to plain record for jsonb)
+     await db.insert(healthCheckRuns).values({
+       configurationId: configId,
+       systemId,
+       status: result.status,
+       latencyMs: result.latencyMs,
+       result: { ...result } as Record<string, unknown>,
+     });
+
+     logger.debug(
+       `Ran health check ${configId} for system ${systemId}: ${result.status}`
+     );
+
+     // Broadcast enriched signal for realtime frontend updates (e.g., terminal feed)
+     await signalService.broadcast(HEALTH_CHECK_RUN_COMPLETED, {
+       systemId,
+       systemName,
+       configurationId: configId,
+       configurationName: configRow.configName,
+       status: result.status,
+       latencyMs: result.latencyMs,
+     });
+
+     // Check if aggregated state changed and notify subscribers
+     const newState = await service.getSystemHealthStatus(systemId);
+     if (newState.status !== previousStatus) {
+       await notifyStateChange({
+         systemId,
+         previousStatus,
+         newStatus: newState.status,
+         catalogClient,
+         logger,
+       });
+
+       // Emit integration hooks for external integrations
+       const emitHook = getEmitHook();
+       if (emitHook) {
+         if (newState.status === "healthy" && previousStatus !== "healthy") {
+           // Recovery: system became healthy
+           await emitHook(healthCheckHooks.systemHealthy, {
+             systemId,
+             previousStatus,
+             healthyChecks: newState.checkStatuses.filter(
+               (c) => c.status === "healthy"
+             ).length,
+             totalChecks: newState.checkStatuses.length,
+             timestamp: new Date().toISOString(),
+           });
+           logger.debug(
+             `Emitted systemHealthy hook: ${previousStatus} → ${newState.status}`
+           );
+         } else if (
+           previousStatus === "healthy" &&
+           newState.status !== "healthy"
+         ) {
+           // Degradation: system went from healthy to unhealthy/degraded
+           await emitHook(healthCheckHooks.systemDegraded, {
+             systemId,
+             previousStatus,
+             newStatus: newState.status,
+             healthyChecks: newState.checkStatuses.filter(
+               (c) => c.status === "healthy"
+             ).length,
+             totalChecks: newState.checkStatuses.length,
+             timestamp: new Date().toISOString(),
+           });
+           logger.debug(
+             `Emitted systemDegraded hook: ${previousStatus} → ${newState.status}`
+           );
+         }
+       }
+     }
+
+     // Note: No manual rescheduling needed - recurring job handles it automatically
+   } catch (error) {
+     logger.error(
+       `Failed to execute health check ${configId} for system ${systemId}`,
+       error
+     );
+
+     // Store failure (no latencyMs for failures)
+     await db.insert(healthCheckRuns).values({
+       configurationId: configId,
+       systemId,
+       status: "unhealthy",
+       result: { error: String(error) } as Record<string, unknown>,
+     });
+
+     // Try to fetch names for the enriched signal (best-effort)
+     let systemName = systemId;
+     let configName = configId;
+     try {
+       const system = await catalogClient.getSystem({ systemId });
+       if (system) {
+         systemName = system.name;
+       }
+       const [config] = await db
+         .select({ name: healthCheckConfigurations.name })
+         .from(healthCheckConfigurations)
+         .where(eq(healthCheckConfigurations.id, configId));
+       if (config) {
+         configName = config.name;
+       }
+     } catch {
+       // Use IDs as fallback
+     }
+
+     // Broadcast enriched failure signal for realtime frontend updates
+     await signalService.broadcast(HEALTH_CHECK_RUN_COMPLETED, {
+       systemId,
+       systemName,
+       configurationId: configId,
+       configurationName: configName,
+       status: "unhealthy",
+     });
+
+     // Check if aggregated state changed and notify subscribers
+     const newState = await service.getSystemHealthStatus(systemId);
+     if (newState.status !== previousStatus) {
+       await notifyStateChange({
+         systemId,
+         previousStatus,
+         newStatus: newState.status,
+         catalogClient,
+         logger,
+       });
+
+       // Emit integration hooks for external integrations
+       const emitHook = getEmitHook();
+       if (emitHook) {
+         if (newState.status === "healthy" && previousStatus !== "healthy") {
+           // Recovery: system became healthy
+           await emitHook(healthCheckHooks.systemHealthy, {
+             systemId,
+             previousStatus,
+             healthyChecks: newState.checkStatuses.filter(
+               (c) => c.status === "healthy"
+             ).length,
+             totalChecks: newState.checkStatuses.length,
+             timestamp: new Date().toISOString(),
+           });
+           logger.debug(
+             `Emitted systemHealthy hook: ${previousStatus} → ${newState.status}`
+           );
+         } else if (
+           previousStatus === "healthy" &&
+           newState.status !== "healthy"
+         ) {
+           // Degradation: system went from healthy to unhealthy/degraded
+           await emitHook(healthCheckHooks.systemDegraded, {
+             systemId,
+             previousStatus,
+             newStatus: newState.status,
+             healthyChecks: newState.checkStatuses.filter(
+               (c) => c.status === "healthy"
+             ).length,
+             totalChecks: newState.checkStatuses.length,
+             timestamp: new Date().toISOString(),
+           });
+           logger.debug(
+             `Emitted systemDegraded hook: ${previousStatus} → ${newState.status}`
+           );
+         }
+       }
+     }
+
+     // Note: No manual rescheduling needed - recurring job handles it automatically
+   }
+ }
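
The hook payloads emitted above can be read as the shapes below, inferred from the emitHook calls in this function. The authoritative definitions live in ./hooks, which is not shown in this diff, so treat these interfaces as a sketch.

// Payload shapes inferred from the emitHook(...) calls above (sketch only).
import type { HealthCheckStatus } from "@checkstack/healthcheck-common";

interface SystemHealthyHookPayload {
  systemId: string;
  previousStatus: HealthCheckStatus;
  healthyChecks: number; // checks currently reporting "healthy"
  totalChecks: number; // all checks attached to the system
  timestamp: string; // ISO-8601 time of the transition
}

interface SystemDegradedHookPayload extends SystemHealthyHookPayload {
  newStatus: HealthCheckStatus; // "degraded" or "unhealthy"
}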
+
+ export async function setupHealthCheckWorker(props: {
+   db: Db;
+   registry: HealthCheckRegistry;
+   logger: Logger;
+   queueManager: QueueManager;
+   signalService: SignalService;
+   catalogClient: CatalogClient;
+   getEmitHook: () => EmitHookFn | undefined;
+ }): Promise<void> {
+   const {
+     db,
+     registry,
+     logger,
+     queueManager,
+     signalService,
+     catalogClient,
+     getEmitHook,
+   } = props;
+
+   const queue =
+     queueManager.getQueue<HealthCheckJobPayload>(HEALTH_CHECK_QUEUE);
+
+   // Subscribe to health check queue in work-queue mode
+   await queue.consume(
+     async (job) => {
+       await executeHealthCheckJob({
+         payload: job.data,
+         db,
+         registry,
+         logger,
+         signalService,
+         catalogClient,
+         getEmitHook,
+       });
+     },
+     {
+       consumerGroup: WORKER_GROUP,
+       maxRetries: 0, // Health checks should not retry on failure
+     }
+   );
+
+   logger.debug("🎯 Health Check Worker subscribed to queue");
+ }
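
A plausible startup wiring for this worker is sketched below. Where the dependencies (db, registry, queueManager, signalService, catalogClient, getEmitHook) come from is an assumption about the host backend, not something defined in this file.

// Startup wiring sketch (assumed host-backend code, written as if inside this module).
async function initHealthChecks(deps: {
  db: Db;
  registry: HealthCheckRegistry;
  logger: Logger;
  queueManager: QueueManager;
  signalService: SignalService;
  catalogClient: CatalogClient;
  getEmitHook: () => EmitHookFn | undefined;
}): Promise<void> {
  // Start consuming first so jobs scheduled during bootstrap are picked up right away.
  await setupHealthCheckWorker(deps);
  // Then (re)schedule every enabled check, preserving intervals across restarts.
  await bootstrapHealthChecks({
    db: deps.db,
    queueManager: deps.queueManager,
    logger: deps.logger,
  });
}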
+
+ /**
+  * Bootstrap health checks by enqueueing all enabled checks
+  */
+ export async function bootstrapHealthChecks(props: {
+   db: Db;
+   queueManager: QueueManager;
+   logger: Logger;
+ }): Promise<void> {
+   const { db, queueManager, logger } = props;
+
+   // Get all enabled health checks
+   const enabledChecks = await db
+     .select({
+       systemId: systemHealthChecks.systemId,
+       configId: healthCheckConfigurations.id,
+       interval: healthCheckConfigurations.intervalSeconds,
+     })
+     .from(systemHealthChecks)
+     .innerJoin(
+       healthCheckConfigurations,
+       eq(systemHealthChecks.configurationId, healthCheckConfigurations.id)
+     )
+     .where(eq(systemHealthChecks.enabled, true));
+
+   // Get latest run timestamp for each system+config pair
+   // Using Drizzle's max() function for proper timestamp handling (no raw SQL)
+   const latestRuns = await db
+     .select({
+       systemId: healthCheckRuns.systemId,
+       configurationId: healthCheckRuns.configurationId,
+       maxTimestamp: max(healthCheckRuns.timestamp),
+     })
+     .from(healthCheckRuns)
+     .groupBy(healthCheckRuns.systemId, healthCheckRuns.configurationId);
+
+   // Create a lookup map for fast access
+   const lastRunMap = new Map<string, Date>();
+   for (const run of latestRuns) {
+     if (run.maxTimestamp) {
+       const key = `${run.systemId}:${run.configurationId}`;
+       lastRunMap.set(key, run.maxTimestamp);
+     }
+   }
+
+   logger.debug(`Bootstrapping ${enabledChecks.length} health checks`);
+
+   for (const check of enabledChecks) {
+     // Look up the last run from the map
+     const lastRunKey = `${check.systemId}:${check.configId}`;
+     const lastRun = lastRunMap.get(lastRunKey);
+
+     // Calculate delay for first run based on time since last run
+     let startDelay = 0;
+     if (lastRun) {
+       const elapsedSeconds = Math.floor(
+         (Date.now() - lastRun.getTime()) / 1000
+       );
+       if (elapsedSeconds < check.interval) {
+         // Not overdue yet - schedule with remaining time
+         startDelay = check.interval - elapsedSeconds;
+       }
+       // Otherwise it's overdue - run immediately (startDelay = 0)
+       logger.debug(
+         `Health check ${check.configId}:${
+           check.systemId
+         } - lastRun: ${lastRun.toISOString()}, elapsed: ${elapsedSeconds}s, interval: ${
+           check.interval
+         }s, startDelay: ${startDelay}s`
+       );
+     } else {
+       logger.debug(
+         `Health check ${check.configId}:${check.systemId} - no lastRun found, running immediately`
+       );
+     }
+
+     await scheduleHealthCheck({
+       queueManager,
+       payload: {
+         configId: check.configId,
+         systemId: check.systemId,
+       },
+       intervalSeconds: check.interval,
+       startDelay,
+       logger,
+     });
+   }
+
+   logger.debug(`✅ Bootstrapped ${enabledChecks.length} health checks`);
+
+   // Clean up orphaned jobs
+   const queue =
+     queueManager.getQueue<HealthCheckJobPayload>(HEALTH_CHECK_QUEUE);
+   const allRecurringJobs = await queue.listRecurringJobs();
+   const expectedJobIds = new Set(
+     enabledChecks.map(
+       (check) => `healthcheck:${check.configId}:${check.systemId}`
+     )
+   );
+
+   const orphanedJobs = allRecurringJobs.filter(
+     (jobId) => jobId.startsWith("healthcheck:") && !expectedJobIds.has(jobId)
+   );
+
+   for (const jobId of orphanedJobs) {
+     await queue.cancelRecurring(jobId);
+     logger.debug(`Removed orphaned job scheduler: ${jobId}`);
+   }
+
+   if (orphanedJobs.length > 0) {
+     logger.info(
+       `🧹 Cleaned up ${orphanedJobs.length} orphaned health check jobs`
+     );
+   }
+ }
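
The delta-based startDelay computed above amounts to "the remainder of the current interval, clamped at zero". A small sketch with concrete numbers (the helper name is illustrative, not an export of this package):

// Sketch of the delta-based scheduling math used in bootstrapHealthChecks.
function computeStartDelay(
  lastRun: Date,
  intervalSeconds: number,
  now: number = Date.now()
): number {
  const elapsedSeconds = Math.floor((now - lastRun.getTime()) / 1000);
  // Overdue checks run immediately; otherwise wait out the rest of the interval.
  return elapsedSeconds < intervalSeconds ? intervalSeconds - elapsedSeconds : 0;
}

// Example: a 300s check that last ran 80s ago waits another 220s;
// one that last ran 400s ago is overdue and runs immediately.
computeStartDelay(new Date(Date.now() - 80_000), 300); // => 220
computeStartDelay(new Date(Date.now() - 400_000), 300); // => 0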