@checkstack/healthcheck-backend 0.12.0 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/index.ts CHANGED
@@ -22,6 +22,7 @@ import { z } from "zod";
22
22
  import { createHealthCheckRouter } from "./router";
23
23
  import { HealthCheckService } from "./service";
24
24
  import { catalogHooks } from "@checkstack/catalog-backend";
25
+ import { satelliteHooks } from "@checkstack/satellite-backend";
25
26
  import { CatalogApi } from "@checkstack/catalog-common";
26
27
  import { MaintenanceApi } from "@checkstack/maintenance-common";
27
28
  import { IncidentApi } from "@checkstack/incident-common";
@@ -142,11 +143,12 @@ export default createBackendPlugin({
142
143
  queueManager,
143
144
  });
144
145
 
145
- const healthCheckRouter = createHealthCheckRouter(
146
- database as SafeDatabase<typeof schema>,
147
- healthCheckRegistry,
146
+ const healthCheckRouter = createHealthCheckRouter({
147
+ database: database as SafeDatabase<typeof schema>,
148
+ registry: healthCheckRegistry,
148
149
  collectorRegistry,
149
- );
150
+ getEmitHook: () => storedEmitHook,
151
+ });
150
152
  rpc.registerRouter(healthCheckRouter, healthCheckContract);
151
153
 
152
154
  // Register command palette commands
@@ -212,6 +214,18 @@ export default createBackendPlugin({
212
214
  { mode: "work-queue", workerGroup: "system-cleanup" },
213
215
  );
214
216
 
217
+ // Subscribe to satellite deletion to scrub satellite IDs from associations
218
+ onHook(
219
+ satelliteHooks.satelliteRemoved,
220
+ async (payload) => {
221
+ logger.debug(
222
+ `Scrubbing satellite ${payload.satelliteId} from health check associations`,
223
+ );
224
+ await service.scrubSatelliteFromAssociations(payload.satelliteId);
225
+ },
226
+ { mode: "work-queue", workerGroup: "satellite-cleanup" },
227
+ );
228
+
215
229
  logger.debug("✅ Health Check Backend afterPluginsReady complete.");
216
230
  },
217
231
  });
@@ -27,7 +27,7 @@ import {
27
27
  import { CatalogApi, catalogRoutes } from "@checkstack/catalog-common";
28
28
  import { MaintenanceApi } from "@checkstack/maintenance-common";
29
29
  import { IncidentApi } from "@checkstack/incident-common";
30
- import { resolveRoute, type InferClient } from "@checkstack/common";
30
+ import { resolveRoute, type InferClient, extractErrorMessage} from "@checkstack/common";
31
31
  import { HealthCheckService } from "./service";
32
32
  import { healthCheckHooks } from "./hooks";
33
33
  import { incrementHourlyAggregate } from "./realtime-aggregation";
@@ -259,6 +259,8 @@ async function executeHealthCheckJob(props: {
259
259
  interval: healthCheckConfigurations.intervalSeconds,
260
260
  enabled: systemHealthChecks.enabled,
261
261
  paused: healthCheckConfigurations.paused,
262
+ includeLocal: systemHealthChecks.includeLocal,
263
+ satelliteIds: systemHealthChecks.satelliteIds,
262
264
  })
263
265
  .from(systemHealthChecks)
264
266
  .innerJoin(
@@ -289,6 +291,19 @@ async function executeHealthCheckJob(props: {
289
291
  return;
290
292
  }
291
293
 
294
+ // If includeLocal is false and satellites are assigned, skip local execution
295
+ // (satellites handle execution, local core doesn't run this check)
296
+ if (
297
+ !configRow.includeLocal &&
298
+ configRow.satelliteIds &&
299
+ configRow.satelliteIds.length > 0
300
+ ) {
301
+ logger.debug(
302
+ `Health check ${configId} for system ${systemId} is satellite-only, skipping local execution`,
303
+ );
304
+ return;
305
+ }
306
+
292
307
  // Fetch system name for signal payload
293
308
  let systemName = systemId;
294
309
  try {
@@ -400,7 +415,7 @@ async function executeHealthCheckJob(props: {
400
415
  };
401
416
  } catch (error) {
402
417
  const errorStr =
403
- error instanceof Error ? error.message : String(error);
418
+ extractErrorMessage(error);
404
419
  logger.debug(`Collector ${storageKey} failed: ${errorStr}`);
405
420
  return {
406
421
  storageKey,
@@ -465,7 +480,7 @@ async function executeHealthCheckJob(props: {
465
480
  } catch (error) {
466
481
  const latencyMs = Math.round(performance.now() - start);
467
482
  const caughtError =
468
- error instanceof Error ? error.message : String(error);
483
+ extractErrorMessage(error);
469
484
 
470
485
  // Use a specific error message if available, otherwise use the caught error
471
486
  const finalError = errorMessage || caughtError;
@@ -486,6 +501,8 @@ async function executeHealthCheckJob(props: {
486
501
  status: result.status,
487
502
  latencyMs: result.latencyMs,
488
503
  result: { ...result } as Record<string, unknown>,
504
+ sourceId: undefined,
505
+ sourceLabel: "Local",
489
506
  });
490
507
 
491
508
  await incrementHourlyAggregate({
@@ -497,6 +514,7 @@ async function executeHealthCheckJob(props: {
497
514
  runTimestamp: new Date(),
498
515
  result: { ...result } as Record<string, unknown>,
499
516
  collectorRegistry,
517
+ sourceLabel: "Local",
500
518
  });
501
519
 
502
520
  logger.debug(
@@ -560,6 +578,8 @@ async function executeHealthCheckJob(props: {
560
578
  status: result.status,
561
579
  latencyMs: result.latencyMs,
562
580
  result: { ...result } as Record<string, unknown>,
581
+ sourceId: undefined,
582
+ sourceLabel: "Local",
563
583
  });
564
584
 
565
585
  // Trigger incremental hourly aggregation
@@ -572,6 +592,7 @@ async function executeHealthCheckJob(props: {
572
592
  runTimestamp: new Date(),
573
593
  result: { ...result } as Record<string, unknown>,
574
594
  collectorRegistry,
595
+ sourceLabel: "Local",
575
596
  });
576
597
 
577
598
  logger.debug(
@@ -660,6 +681,8 @@ async function executeHealthCheckJob(props: {
660
681
  systemId,
661
682
  status: "unhealthy",
662
683
  result: { error: String(error) } as Record<string, unknown>,
684
+ sourceId: undefined,
685
+ sourceLabel: "Local",
663
686
  });
664
687
 
665
688
  // Trigger incremental hourly aggregation
@@ -672,6 +695,7 @@ async function executeHealthCheckJob(props: {
672
695
  runTimestamp: new Date(),
673
696
  // No collector data for error cases
674
697
  collectorRegistry,
698
+ sourceLabel: "Local",
675
699
  });
676
700
 
677
701
  // Try to fetch names for the enriched signal (best-effort)
@@ -67,6 +67,10 @@ interface IncrementHourlyAggregateParams {
67
67
  result?: Record<string, unknown>;
68
68
  /** Collector registry for aggregating collector data via mergeResult */
69
69
  collectorRegistry?: CollectorRegistry;
70
+ /** Source identifier: undefined = local core, string = satellite ID */
71
+ sourceId?: string;
72
+ /** Human-readable source label for display */
73
+ sourceLabel?: string;
70
74
  }
71
75
 
72
76
  /**
@@ -88,6 +92,8 @@ export async function incrementHourlyAggregate(
88
92
  runTimestamp,
89
93
  result,
90
94
  collectorRegistry,
95
+ sourceId,
96
+ sourceLabel,
91
97
  } = params;
92
98
 
93
99
  const bucketStart = getHourBucketStart(runTimestamp);
@@ -107,6 +113,9 @@ export async function incrementHourlyAggregate(
107
113
  eq(healthCheckAggregates.configurationId, configurationId),
108
114
  eq(healthCheckAggregates.bucketStart, bucketStart),
109
115
  eq(healthCheckAggregates.bucketSize, "hourly"),
116
+ sourceId
117
+ ? eq(healthCheckAggregates.sourceId, sourceId)
118
+ : sql`${healthCheckAggregates.sourceId} IS NULL`,
110
119
  ),
111
120
  )
112
121
  .limit(1);
@@ -181,6 +190,8 @@ export async function incrementHourlyAggregate(
181
190
  p95LatencyMs: latencyUpdate?.p95,
182
191
  tdigestState: latencyUpdate?.tdigestState,
183
192
  aggregatedResult,
193
+ sourceId: sourceId ?? undefined,
194
+ sourceLabel: sourceLabel ?? undefined,
184
195
  })
185
196
  .onConflictDoUpdate({
186
197
  target: [
@@ -188,6 +199,7 @@ export async function incrementHourlyAggregate(
188
199
  healthCheckAggregates.systemId,
189
200
  healthCheckAggregates.bucketStart,
190
201
  healthCheckAggregates.bucketSize,
202
+ healthCheckAggregates.sourceId,
191
203
  ],
192
204
  set: {
193
205
  runCount: sql`${healthCheckAggregates.runCount} + 1`,
@@ -50,11 +50,12 @@ describe("HealthCheck Router", () => {
50
50
  getCollectorsForPlugin: mock(() => []),
51
51
  };
52
52
 
53
- const router = createHealthCheckRouter(
54
- mockDb as never,
55
- mockRegistry,
56
- mockCollectorRegistry as never,
57
- );
53
+ const router = createHealthCheckRouter({
54
+ database: mockDb as never,
55
+ registry: mockRegistry,
56
+ collectorRegistry: mockCollectorRegistry as never,
57
+ getEmitHook: () => undefined,
58
+ });
58
59
 
59
60
  it("getStrategies returns strategies from registry", async () => {
60
61
  const context = createMockRpcContext({
package/src/router.ts CHANGED
@@ -10,6 +10,7 @@ import {
10
10
  import { healthCheckContract } from "@checkstack/healthcheck-common";
11
11
  import type { StrategyCategory } from "@checkstack/healthcheck-common";
12
12
  import { HealthCheckService } from "./service";
13
+ import { healthCheckHooks } from "./hooks";
13
14
  import * as schema from "./schema";
14
15
  import { toJsonSchemaWithChartMeta } from "./schema-utils";
15
16
 
@@ -19,11 +20,13 @@ import { toJsonSchemaWithChartMeta } from "./schema-utils";
19
20
  * Auth and access rules are automatically enforced via autoAuthMiddleware
20
21
  * based on the contract's meta.userType and meta.access.
21
22
  */
22
- export const createHealthCheckRouter = (
23
- database: SafeDatabase<typeof schema>,
24
- registry: HealthCheckRegistry,
25
- collectorRegistry: CollectorRegistry,
26
- ) => {
23
+ export const createHealthCheckRouter = (opts: {
24
+ database: SafeDatabase<typeof schema>;
25
+ registry: HealthCheckRegistry;
26
+ collectorRegistry: CollectorRegistry;
27
+ getEmitHook: () => ((hook: { id: string }, payload: Record<string, unknown>) => Promise<void>) | undefined;
28
+ }) => {
29
+ const { database, registry, collectorRegistry, getEmitHook } = opts;
27
30
  // Create service instance once - shared across all handlers
28
31
  const service = new HealthCheckService(database, registry, collectorRegistry);
29
32
 
@@ -137,6 +140,8 @@ export const createHealthCheckRouter = (
137
140
  configurationId: input.body.configurationId,
138
141
  enabled: input.body.enabled,
139
142
  stateThresholds: input.body.stateThresholds,
143
+ satelliteIds: input.body.satelliteIds,
144
+ includeLocal: input.body.includeLocal,
140
145
  });
141
146
 
142
147
  // If enabling the health check, schedule it immediately
@@ -156,10 +161,28 @@ export const createHealthCheckRouter = (
156
161
  });
157
162
  }
158
163
  }
164
+
165
+ // Notify subscribers (e.g., satellite-backend) that assignments changed
166
+ const emitHook = getEmitHook();
167
+ if (emitHook) {
168
+ await emitHook(healthCheckHooks.assignmentChanged, {
169
+ systemId: input.systemId,
170
+ configurationId: input.body.configurationId,
171
+ });
172
+ }
159
173
  }),
160
174
 
161
175
  disassociateSystem: os.disassociateSystem.handler(async ({ input }) => {
162
176
  await service.disassociateSystem(input.systemId, input.configId);
177
+
178
+ // Notify subscribers that assignments changed
179
+ const emitHook = getEmitHook();
180
+ if (emitHook) {
181
+ await emitHook(healthCheckHooks.assignmentChanged, {
182
+ systemId: input.systemId,
183
+ configurationId: input.configId,
184
+ });
185
+ }
163
186
  }),
164
187
 
165
188
  getRetentionConfig: os.getRetentionConfig.handler(async ({ input }) => {
@@ -231,9 +254,21 @@ export const createHealthCheckRouter = (
231
254
  },
232
255
  ),
233
256
 
234
- getAvailabilityStats: os.getAvailabilityStats.handler(async ({ input }) => {
235
- return service.getAvailabilityStats(input);
236
- }),
257
+ // ========================================================================
258
+ // SERVICE INTERFACE (S2S — satellite-backend)
259
+ // ========================================================================
260
+
261
+ getAssignmentsForSatellite: os.getAssignmentsForSatellite.handler(
262
+ async ({ input }) => {
263
+ return service.getAssignmentsForSatellite(input.satelliteId);
264
+ },
265
+ ),
266
+
267
+ ingestSatelliteResult: os.ingestSatelliteResult.handler(
268
+ async ({ input }) => {
269
+ await service.ingestSatelliteResult(input);
270
+ },
271
+ ),
237
272
  });
238
273
  };
239
274
 
package/src/schema.ts CHANGED
@@ -90,6 +90,16 @@ export const systemHealthChecks = pgTable(
90
90
  * Null means use default retention settings.
91
91
  */
92
92
  retentionConfig: jsonb("retention_config").$type<RetentionConfig>(),
93
+ /**
94
+ * IDs of satellites assigned to execute this health check.
95
+ * When set, the check runs on these satellite nodes in addition to (or instead of) the core.
96
+ */
97
+ satelliteIds: jsonb("satellite_ids").$type<string[]>(),
98
+ /**
99
+ * Whether to also run this check locally on the core instance.
100
+ * Defaults to true. Only relevant when satelliteIds is set.
101
+ */
102
+ includeLocal: boolean("include_local").default(true).notNull(),
93
103
  createdAt: timestamp("created_at").defaultNow().notNull(),
94
104
  updatedAt: timestamp("updated_at").defaultNow().notNull(),
95
105
  },
@@ -108,6 +118,16 @@ export const healthCheckRuns = pgTable("health_check_runs", {
108
118
  /** Execution duration in milliseconds */
109
119
  latencyMs: integer("latency_ms"),
110
120
  result: jsonb("result").$type<Record<string, unknown>>(),
121
+ /**
122
+ * Source identifier for result attribution.
123
+ * null = local core execution, UUID = satellite ID.
124
+ */
125
+ sourceId: text("source_id"),
126
+ /**
127
+ * Human-readable source label for UI display.
128
+ * e.g. "Local" or "EU West (eu-west-1)".
129
+ */
130
+ sourceLabel: text("source_label"),
111
131
  timestamp: timestamp("timestamp").defaultNow().notNull(),
112
132
  });
113
133
 
@@ -151,14 +171,24 @@ export const healthCheckAggregates = pgTable(
151
171
  jsonb("aggregated_result").$type<Record<string, unknown>>(),
152
172
  /** Serialized t-digest state for incremental p95 calculation */
153
173
  tdigestState: jsonb("tdigest_state").$type<number[]>(),
174
+ /**
175
+ * Source identifier for per-region aggregation.
176
+ * null = local core execution, UUID = satellite ID.
177
+ */
178
+ sourceId: text("source_id"),
179
+ /**
180
+ * Human-readable source label for UI display.
181
+ */
182
+ sourceLabel: text("source_label"),
154
183
  },
155
184
  (t) => ({
156
- // Unique constraint for upsert operations
185
+ // Unique constraint includes sourceId for per-region aggregation
157
186
  bucketUnique: uniqueIndex("health_check_aggregates_bucket_unique").on(
158
187
  t.configurationId,
159
188
  t.systemId,
160
189
  t.bucketStart,
161
190
  t.bucketSize,
191
+ t.sourceId,
162
192
  ),
163
193
  }),
164
194
  );