@checkstack/healthcheck-backend 0.4.1 → 0.5.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -1,23 +1,32 @@
1
- import { NodePgDatabase } from "drizzle-orm/node-postgres";
1
+ import type {
2
+ HealthCheckRegistry,
3
+ Logger,
4
+ SafeDatabase,
5
+ CollectorRegistry,
6
+ } from "@checkstack/backend-api";
2
7
  import * as schema from "./schema";
3
8
  import {
4
9
  healthCheckRuns,
5
10
  systemHealthChecks,
6
11
  healthCheckAggregates,
12
+ healthCheckConfigurations,
7
13
  DEFAULT_RETENTION_CONFIG,
8
14
  } from "./schema";
9
15
  import { eq, and, lt, sql } from "drizzle-orm";
10
- import type {
11
- HealthCheckRegistry,
12
- Logger,
13
- } from "@checkstack/backend-api";
14
16
  import type { QueueManager } from "@checkstack/queue-api";
17
+ import {
18
+ aggregateCollectorData,
19
+ calculateLatencyStats,
20
+ countStatuses,
21
+ extractLatencies,
22
+ } from "./aggregation-utils";
15
23
 
16
- type Db = NodePgDatabase<typeof schema>;
24
+ type Db = SafeDatabase<typeof schema>;
17
25
 
18
26
  interface RetentionJobDeps {
19
27
  db: Db;
20
28
  registry: HealthCheckRegistry;
29
+ collectorRegistry: CollectorRegistry;
21
30
  logger: Logger;
22
31
  queueManager: QueueManager;
23
32
  }
@@ -35,7 +44,7 @@ interface RetentionJobPayload {
35
44
  * 3. Deletes expired daily aggregates
36
45
  */
37
46
  export async function setupRetentionJob(deps: RetentionJobDeps) {
38
- const { queueManager, logger, db, registry } = deps;
47
+ const { queueManager, logger, db, registry, collectorRegistry } = deps;
39
48
 
40
49
  const queue = queueManager.getQueue<RetentionJobPayload>(RETENTION_QUEUE);
41
50
 
@@ -43,10 +52,16 @@ export async function setupRetentionJob(deps: RetentionJobDeps) {
43
52
  await queue.consume(
44
53
  async () => {
45
54
  logger.info("Starting health check retention job");
46
- await runRetentionJob({ db, registry, logger, queueManager });
55
+ await runRetentionJob({
56
+ db,
57
+ registry,
58
+ collectorRegistry,
59
+ logger,
60
+ queueManager,
61
+ });
47
62
  logger.info("Completed health check retention job");
48
63
  },
49
- { consumerGroup: "retention-worker" }
64
+ { consumerGroup: "retention-worker" },
50
65
  );
51
66
 
52
67
  // Schedule daily retention run (86400 seconds = 24 hours)
@@ -55,7 +70,7 @@ export async function setupRetentionJob(deps: RetentionJobDeps) {
55
70
  {
56
71
  jobId: "health-check-retention-daily",
57
72
  intervalSeconds: 24 * 60 * 60, // Daily (24 hours)
58
- }
73
+ },
59
74
  );
60
75
 
61
76
  logger.info("Health check retention job scheduled (runs daily)");
@@ -65,7 +80,7 @@ export async function setupRetentionJob(deps: RetentionJobDeps) {
65
80
  * Main retention job logic
66
81
  */
67
82
  export async function runRetentionJob(deps: RetentionJobDeps) {
68
- const { db, registry, logger } = deps;
83
+ const { db, registry, collectorRegistry, logger } = deps;
69
84
 
70
85
  // Get all unique system-config assignments
71
86
  const assignments = await db.select().from(systemHealthChecks);
@@ -79,6 +94,7 @@ export async function runRetentionJob(deps: RetentionJobDeps) {
79
94
  await aggregateRawRuns({
80
95
  db,
81
96
  registry,
97
+ collectorRegistry,
82
98
  systemId: assignment.systemId,
83
99
  configurationId: assignment.configurationId,
84
100
  rawRetentionDays: retentionConfig.rawRetentionDays,
@@ -102,7 +118,7 @@ export async function runRetentionJob(deps: RetentionJobDeps) {
102
118
  } catch (error) {
103
119
  logger.error(
104
120
  `Retention job failed for ${assignment.systemId}/${assignment.configurationId}`,
105
- { error }
121
+ { error },
106
122
  );
107
123
  }
108
124
  }
@@ -111,6 +127,7 @@ export async function runRetentionJob(deps: RetentionJobDeps) {
111
127
  interface AggregateRawRunsParams {
112
128
  db: Db;
113
129
  registry: HealthCheckRegistry;
130
+ collectorRegistry: CollectorRegistry;
114
131
  systemId: string;
115
132
  configurationId: string;
116
133
  rawRetentionDays: number;
@@ -120,16 +137,25 @@ interface AggregateRawRunsParams {
120
137
  * Aggregates raw runs older than retention period into hourly buckets
121
138
  */
122
139
  async function aggregateRawRuns(params: AggregateRawRunsParams) {
123
- const { db, registry, systemId, configurationId, rawRetentionDays } = params;
140
+ const {
141
+ db,
142
+ registry,
143
+ collectorRegistry,
144
+ systemId,
145
+ configurationId,
146
+ rawRetentionDays,
147
+ } = params;
124
148
 
125
149
  const cutoffDate = new Date();
126
150
  cutoffDate.setDate(cutoffDate.getDate() - rawRetentionDays);
127
151
  cutoffDate.setHours(cutoffDate.getHours(), 0, 0, 0); // Round to hour
128
152
 
129
153
  // Get strategy for metadata aggregation
130
- const config = await db.query.healthCheckConfigurations.findFirst({
131
- where: eq(schema.healthCheckConfigurations.id, configurationId),
132
- });
154
+ const [config] = await db
155
+ .select()
156
+ .from(healthCheckConfigurations)
157
+ .where(eq(healthCheckConfigurations.id, configurationId))
158
+ .limit(1);
133
159
  const strategy = config ? registry.getStrategy(config.strategyId) : undefined;
134
160
 
135
161
  // Query raw runs older than cutoff, grouped by hour
@@ -140,8 +166,8 @@ async function aggregateRawRuns(params: AggregateRawRunsParams) {
140
166
  and(
141
167
  eq(healthCheckRuns.systemId, systemId),
142
168
  eq(healthCheckRuns.configurationId, configurationId),
143
- lt(healthCheckRuns.timestamp, cutoffDate)
144
- )
169
+ lt(healthCheckRuns.timestamp, cutoffDate),
170
+ ),
145
171
  )
146
172
  .orderBy(healthCheckRuns.timestamp);
147
173
 
@@ -180,42 +206,43 @@ async function aggregateRawRuns(params: AggregateRawRunsParams) {
180
206
 
181
207
  // Create aggregates and delete raw runs
182
208
  for (const [, bucket] of buckets) {
183
- // Calculate metrics
184
209
  const runCount = bucket.runs.length;
185
- let healthyCount = 0;
186
- let degradedCount = 0;
187
- let unhealthyCount = 0;
188
- for (const r of bucket.runs) {
189
- if (r.status === "healthy") healthyCount++;
190
- if (r.status === "degraded") degradedCount++;
191
- if (r.status === "unhealthy") unhealthyCount++;
192
- }
193
210
 
194
- const latencies = bucket.runs
195
- .map((r) => r.latencyMs)
196
- .filter((l): l is number => l !== undefined);
197
-
198
- let avgLatencyMs: number | undefined;
199
- let minLatencyMs: number | undefined;
200
- let maxLatencyMs: number | undefined;
201
- let p95LatencyMs: number | undefined;
202
-
203
- if (latencies.length > 0) {
204
- let sum = 0;
205
- for (const l of latencies) sum += l;
206
- avgLatencyMs = Math.round(sum / latencies.length);
207
- minLatencyMs = Math.min(...latencies);
208
- maxLatencyMs = Math.max(...latencies);
209
- p95LatencyMs = calculatePercentile(latencies, 95);
210
- }
211
+ // Calculate status counts
212
+ const { healthyCount, degradedCount, unhealthyCount } = countStatuses(
213
+ bucket.runs,
214
+ );
211
215
 
212
- // Aggregate result if strategy is available
216
+ // Calculate latency stats
217
+ const latencies = extractLatencies(bucket.runs);
218
+ const {
219
+ latencySumMs,
220
+ avgLatencyMs,
221
+ minLatencyMs,
222
+ maxLatencyMs,
223
+ p95LatencyMs,
224
+ } = calculateLatencyStats(latencies);
225
+
226
+ // Aggregate strategy result
213
227
  let aggregatedResult: Record<string, unknown> | undefined;
214
228
  if (strategy) {
215
- aggregatedResult = strategy.aggregateResult(bucket.runs) as Record<
229
+ const strategyResult = strategy.aggregateResult(bucket.runs) as Record<
216
230
  string,
217
231
  unknown
218
232
  >;
233
+
234
+ // Aggregate collector data
235
+ const collectorsAggregated = aggregateCollectorData(
236
+ bucket.runs,
237
+ collectorRegistry,
238
+ );
239
+
240
+ aggregatedResult = {
241
+ ...strategyResult,
242
+ ...(Object.keys(collectorsAggregated).length > 0
243
+ ? { collectors: collectorsAggregated }
244
+ : {}),
245
+ };
219
246
  }
220
247
 
221
248
  // Insert or update aggregate
@@ -230,6 +257,7 @@ async function aggregateRawRuns(params: AggregateRawRunsParams) {
230
257
  healthyCount,
231
258
  degradedCount,
232
259
  unhealthyCount,
260
+ latencySumMs,
233
261
  avgLatencyMs,
234
262
  minLatencyMs,
235
263
  maxLatencyMs,
@@ -284,8 +312,8 @@ async function rollupHourlyAggregates(params: RollupParams) {
284
312
  eq(healthCheckAggregates.systemId, systemId),
285
313
  eq(healthCheckAggregates.configurationId, configurationId),
286
314
  eq(healthCheckAggregates.bucketSize, "hourly"),
287
- lt(healthCheckAggregates.bucketStart, cutoffDate)
288
- )
315
+ lt(healthCheckAggregates.bucketStart, cutoffDate),
316
+ ),
289
317
  );
290
318
 
291
319
  if (oldHourly.length === 0) return;
@@ -316,20 +344,23 @@ async function rollupHourlyAggregates(params: RollupParams) {
316
344
  let healthyCount = 0;
317
345
  let degradedCount = 0;
318
346
  let unhealthyCount = 0;
319
- let totalWeightedLatency = 0;
347
+ let latencySumMs = 0;
320
348
 
321
349
  for (const a of bucket.aggregates) {
322
350
  runCount += a.runCount;
323
351
  healthyCount += a.healthyCount;
324
352
  degradedCount += a.degradedCount;
325
353
  unhealthyCount += a.unhealthyCount;
326
- if (a.avgLatencyMs !== null) {
327
- totalWeightedLatency += a.avgLatencyMs * a.runCount;
354
+ // Use latencySumMs if available, fallback to avg*count approximation
355
+ if (a.latencySumMs !== null) {
356
+ latencySumMs += a.latencySumMs;
357
+ } else if (a.avgLatencyMs !== null) {
358
+ latencySumMs += a.avgLatencyMs * a.runCount;
328
359
  }
329
360
  }
330
361
 
331
362
  const avgLatencyMs =
332
- runCount > 0 ? Math.round(totalWeightedLatency / runCount) : undefined;
363
+ runCount > 0 ? Math.round(latencySumMs / runCount) : undefined;
333
364
 
334
365
  // Min/max across all hourly buckets
335
366
  const minValues = bucket.aggregates
@@ -338,10 +369,16 @@ async function rollupHourlyAggregates(params: RollupParams) {
338
369
  const maxValues = bucket.aggregates
339
370
  .map((a) => a.maxLatencyMs)
340
371
  .filter((v): v is number => v !== null);
372
+ const p95Values = bucket.aggregates
373
+ .map((a) => a.p95LatencyMs)
374
+ .filter((v): v is number => v !== null);
341
375
  const minLatencyMs =
342
376
  minValues.length > 0 ? Math.min(...minValues) : undefined;
343
377
  const maxLatencyMs =
344
378
  maxValues.length > 0 ? Math.max(...maxValues) : undefined;
379
+ // Use max of hourly p95s as upper bound approximation
380
+ const p95LatencyMs =
381
+ p95Values.length > 0 ? Math.max(...p95Values) : undefined;
345
382
 
346
383
  // Insert daily aggregate
347
384
  await db.insert(healthCheckAggregates).values({
@@ -353,10 +390,11 @@ async function rollupHourlyAggregates(params: RollupParams) {
353
390
  healthyCount,
354
391
  degradedCount,
355
392
  unhealthyCount,
393
+ latencySumMs: latencySumMs > 0 ? latencySumMs : undefined,
356
394
  avgLatencyMs,
357
395
  minLatencyMs,
358
396
  maxLatencyMs,
359
- p95LatencyMs: undefined, // Cannot accurately combine p95s
397
+ p95LatencyMs,
360
398
  aggregatedResult: undefined, // Cannot combine result across hours
361
399
  });
362
400
 
@@ -392,13 +430,7 @@ async function deleteExpiredAggregates(params: DeleteExpiredParams) {
392
430
  eq(healthCheckAggregates.systemId, systemId),
393
431
  eq(healthCheckAggregates.configurationId, configurationId),
394
432
  eq(healthCheckAggregates.bucketSize, "daily"),
395
- lt(healthCheckAggregates.bucketStart, cutoffDate)
396
- )
433
+ lt(healthCheckAggregates.bucketStart, cutoffDate),
434
+ ),
397
435
  );
398
436
  }
399
-
400
- function calculatePercentile(values: number[], percentile: number): number {
401
- const sorted = values.toSorted((a, b) => a - b);
402
- const index = Math.ceil((percentile / 100) * sorted.length) - 1;
403
- return sorted[Math.max(0, index)];
404
- }
@@ -43,7 +43,18 @@ describe("HealthCheck Router", () => {
43
43
  getStrategiesWithMeta: mock(() => []),
44
44
  };
45
45
 
46
- const router = createHealthCheckRouter(mockDb as never, mockRegistry);
46
+ const mockCollectorRegistry = {
47
+ register: mock(),
48
+ getCollector: mock(),
49
+ getCollectors: mock(() => []),
50
+ getCollectorsForPlugin: mock(() => []),
51
+ };
52
+
53
+ const router = createHealthCheckRouter(
54
+ mockDb as never,
55
+ mockRegistry,
56
+ mockCollectorRegistry as never,
57
+ );
47
58
 
48
59
  it("getStrategies returns strategies from registry", async () => {
49
60
  const context = createMockRpcContext({
@@ -119,7 +130,7 @@ describe("HealthCheck Router", () => {
119
130
  const result = await call(
120
131
  router.getCollectors,
121
132
  { strategyId: "healthcheck-ssh" },
122
- { context }
133
+ { context },
123
134
  );
124
135
  expect(result).toHaveLength(1);
125
136
  expect(result[0].id).toBe("collector-hardware.cpu");
@@ -139,7 +150,7 @@ describe("HealthCheck Router", () => {
139
150
  const result = await call(
140
151
  router.getCollectors,
141
152
  { strategyId: "unknown" },
142
- { context }
153
+ { context },
143
154
  );
144
155
  expect(result).toHaveLength(0);
145
156
  });
package/src/router.ts CHANGED
@@ -4,10 +4,11 @@ import {
4
4
  toJsonSchema,
5
5
  type RpcContext,
6
6
  type HealthCheckRegistry,
7
+ type SafeDatabase,
8
+ type CollectorRegistry,
7
9
  } from "@checkstack/backend-api";
8
10
  import { healthCheckContract } from "@checkstack/healthcheck-common";
9
11
  import { HealthCheckService } from "./service";
10
- import { NodePgDatabase } from "drizzle-orm/node-postgres";
11
12
  import * as schema from "./schema";
12
13
  import { toJsonSchemaWithChartMeta } from "./schema-utils";
13
14
 
@@ -18,11 +19,12 @@ import { toJsonSchemaWithChartMeta } from "./schema-utils";
18
19
  * based on the contract's meta.userType and meta.access.
19
20
  */
20
21
  export const createHealthCheckRouter = (
21
- database: NodePgDatabase<typeof schema>,
22
- registry: HealthCheckRegistry
22
+ database: SafeDatabase<typeof schema>,
23
+ registry: HealthCheckRegistry,
24
+ collectorRegistry: CollectorRegistry,
23
25
  ) => {
24
26
  // Create service instance once - shared across all handlers
25
- const service = new HealthCheckService(database, registry);
27
+ const service = new HealthCheckService(database, registry, collectorRegistry);
26
28
 
27
29
  // Create contract implementer with context type AND auto auth middleware
28
30
  const os = implement(healthCheckContract)
@@ -40,7 +42,7 @@ export const createHealthCheckRouter = (
40
42
  ? toJsonSchemaWithChartMeta(r.strategy.result.schema)
41
43
  : undefined,
42
44
  aggregatedResultSchema: toJsonSchemaWithChartMeta(
43
- r.strategy.aggregatedResult.schema
45
+ r.strategy.aggregatedResult.schema,
44
46
  ),
45
47
  }));
46
48
  }),
@@ -48,7 +50,7 @@ export const createHealthCheckRouter = (
48
50
  getCollectors: os.getCollectors.handler(async ({ input, context }) => {
49
51
  // Get strategy to verify it exists
50
52
  const strategy = context.healthCheckRegistry.getStrategy(
51
- input.strategyId
53
+ input.strategyId,
52
54
  );
53
55
  if (!strategy) {
54
56
  return [];
@@ -74,6 +76,9 @@ export const createHealthCheckRouter = (
74
76
  description: collector.description,
75
77
  configSchema: toJsonSchema(collector.config.schema),
76
78
  resultSchema: toJsonSchemaWithChartMeta(collector.result.schema),
79
+ aggregatedResultSchema: collector.aggregatedResult
80
+ ? toJsonSchemaWithChartMeta(collector.aggregatedResult.schema)
81
+ : undefined,
77
82
  allowMultiple: collector.allowMultiple ?? false,
78
83
  }));
79
84
  }),
@@ -103,13 +108,13 @@ export const createHealthCheckRouter = (
103
108
  getSystemConfigurations: os.getSystemConfigurations.handler(
104
109
  async ({ input }) => {
105
110
  return service.getSystemConfigurations(input);
106
- }
111
+ },
107
112
  ),
108
113
 
109
114
  getSystemAssociations: os.getSystemAssociations.handler(
110
115
  async ({ input }) => {
111
116
  return service.getSystemAssociations(input.systemId);
112
- }
117
+ },
113
118
  ),
114
119
 
115
120
  associateSystem: os.associateSystem.handler(async ({ input, context }) => {
@@ -123,7 +128,7 @@ export const createHealthCheckRouter = (
123
128
  // If enabling the health check, schedule it immediately
124
129
  if (input.body.enabled) {
125
130
  const config = await service.getConfiguration(
126
- input.body.configurationId
131
+ input.body.configurationId,
127
132
  );
128
133
  if (config) {
129
134
  const { scheduleHealthCheck } = await import("./queue-executor");
@@ -152,9 +157,9 @@ export const createHealthCheckRouter = (
152
157
  await service.updateRetentionConfig(
153
158
  input.systemId,
154
159
  input.configurationId,
155
- input.retentionConfig
160
+ input.retentionConfig,
156
161
  );
157
- }
162
+ },
158
163
  ),
159
164
 
160
165
  getHistory: os.getHistory.handler(async ({ input }) => {
@@ -176,12 +181,12 @@ export const createHealthCheckRouter = (
176
181
  return service.getAggregatedHistory(input, {
177
182
  includeAggregatedResult: true,
178
183
  });
179
- }
184
+ },
180
185
  ),
181
186
  getSystemHealthStatus: os.getSystemHealthStatus.handler(
182
187
  async ({ input }) => {
183
188
  return service.getSystemHealthStatus(input.systemId);
184
- }
189
+ },
185
190
  ),
186
191
 
187
192
  getBulkSystemHealthStatus: os.getBulkSystemHealthStatus.handler(
@@ -195,17 +200,17 @@ export const createHealthCheckRouter = (
195
200
  await Promise.all(
196
201
  input.systemIds.map(async (systemId) => {
197
202
  statuses[systemId] = await service.getSystemHealthStatus(systemId);
198
- })
203
+ }),
199
204
  );
200
205
 
201
206
  return { statuses };
202
- }
207
+ },
203
208
  ),
204
209
 
205
210
  getSystemHealthOverview: os.getSystemHealthOverview.handler(
206
211
  async ({ input }) => {
207
212
  return service.getSystemHealthOverview(input.systemId);
208
- }
213
+ },
209
214
  ),
210
215
  });
211
216
  };
package/src/schema.ts CHANGED
@@ -47,7 +47,7 @@ export const healthCheckConfigurations = pgTable(
47
47
  isTemplate: boolean("is_template").default(false),
48
48
  createdAt: timestamp("created_at").defaultNow().notNull(),
49
49
  updatedAt: timestamp("updated_at").defaultNow().notNull(),
50
- }
50
+ },
51
51
  );
52
52
 
53
53
  /**
@@ -93,7 +93,7 @@ export const systemHealthChecks = pgTable(
93
93
  },
94
94
  (t) => ({
95
95
  pk: primaryKey({ columns: [t.systemId, t.configurationId] }),
96
- })
96
+ }),
97
97
  );
98
98
 
99
99
  export const healthCheckRuns = pgTable("health_check_runs", {
@@ -137,6 +137,8 @@ export const healthCheckAggregates = pgTable(
137
137
  healthyCount: integer("healthy_count").notNull(),
138
138
  degradedCount: integer("degraded_count").notNull(),
139
139
  unhealthyCount: integer("unhealthy_count").notNull(),
140
+ /** Sum of all latencies in this bucket (for accurate averaging when combining) */
141
+ latencySumMs: integer("latency_sum_ms"),
140
142
  avgLatencyMs: integer("avg_latency_ms"),
141
143
  minLatencyMs: integer("min_latency_ms"),
142
144
  maxLatencyMs: integer("max_latency_ms"),
@@ -152,7 +154,7 @@ export const healthCheckAggregates = pgTable(
152
154
  t.configurationId,
153
155
  t.systemId,
154
156
  t.bucketStart,
155
- t.bucketSize
157
+ t.bucketSize,
156
158
  ),
157
- })
159
+ }),
158
160
  );