@checkstack/healthcheck-backend 0.12.0 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +108 -0
- package/drizzle/0010_colorful_shinobi_shaw.sql +8 -0
- package/drizzle/meta/0010_snapshot.json +469 -0
- package/drizzle/meta/_journal.json +7 -0
- package/package.json +15 -14
- package/src/hooks.ts +10 -0
- package/src/index.ts +18 -4
- package/src/queue-executor.ts +27 -3
- package/src/realtime-aggregation.ts +12 -0
- package/src/router.test.ts +6 -5
- package/src/router.ts +43 -8
- package/src/schema.ts +31 -1
- package/src/service.ts +215 -144
- package/src/availability.test.ts +0 -236
package/src/index.ts
CHANGED
|
@@ -22,6 +22,7 @@ import { z } from "zod";
|
|
|
22
22
|
import { createHealthCheckRouter } from "./router";
|
|
23
23
|
import { HealthCheckService } from "./service";
|
|
24
24
|
import { catalogHooks } from "@checkstack/catalog-backend";
|
|
25
|
+
import { satelliteHooks } from "@checkstack/satellite-backend";
|
|
25
26
|
import { CatalogApi } from "@checkstack/catalog-common";
|
|
26
27
|
import { MaintenanceApi } from "@checkstack/maintenance-common";
|
|
27
28
|
import { IncidentApi } from "@checkstack/incident-common";
|
|
@@ -142,11 +143,12 @@ export default createBackendPlugin({
|
|
|
142
143
|
queueManager,
|
|
143
144
|
});
|
|
144
145
|
|
|
145
|
-
const healthCheckRouter = createHealthCheckRouter(
|
|
146
|
-
database as SafeDatabase<typeof schema>,
|
|
147
|
-
healthCheckRegistry,
|
|
146
|
+
const healthCheckRouter = createHealthCheckRouter({
|
|
147
|
+
database: database as SafeDatabase<typeof schema>,
|
|
148
|
+
registry: healthCheckRegistry,
|
|
148
149
|
collectorRegistry,
|
|
149
|
-
|
|
150
|
+
getEmitHook: () => storedEmitHook,
|
|
151
|
+
});
|
|
150
152
|
rpc.registerRouter(healthCheckRouter, healthCheckContract);
|
|
151
153
|
|
|
152
154
|
// Register command palette commands
|
|
@@ -212,6 +214,18 @@ export default createBackendPlugin({
|
|
|
212
214
|
{ mode: "work-queue", workerGroup: "system-cleanup" },
|
|
213
215
|
);
|
|
214
216
|
|
|
217
|
+
// Subscribe to satellite deletion to scrub satellite IDs from associations
|
|
218
|
+
onHook(
|
|
219
|
+
satelliteHooks.satelliteRemoved,
|
|
220
|
+
async (payload) => {
|
|
221
|
+
logger.debug(
|
|
222
|
+
`Scrubbing satellite ${payload.satelliteId} from health check associations`,
|
|
223
|
+
);
|
|
224
|
+
await service.scrubSatelliteFromAssociations(payload.satelliteId);
|
|
225
|
+
},
|
|
226
|
+
{ mode: "work-queue", workerGroup: "satellite-cleanup" },
|
|
227
|
+
);
|
|
228
|
+
|
|
215
229
|
logger.debug("✅ Health Check Backend afterPluginsReady complete.");
|
|
216
230
|
},
|
|
217
231
|
});
|
package/src/queue-executor.ts
CHANGED
|
@@ -27,7 +27,7 @@ import {
|
|
|
27
27
|
import { CatalogApi, catalogRoutes } from "@checkstack/catalog-common";
|
|
28
28
|
import { MaintenanceApi } from "@checkstack/maintenance-common";
|
|
29
29
|
import { IncidentApi } from "@checkstack/incident-common";
|
|
30
|
-
import { resolveRoute, type InferClient } from "@checkstack/common";
|
|
30
|
+
import { resolveRoute, type InferClient, extractErrorMessage} from "@checkstack/common";
|
|
31
31
|
import { HealthCheckService } from "./service";
|
|
32
32
|
import { healthCheckHooks } from "./hooks";
|
|
33
33
|
import { incrementHourlyAggregate } from "./realtime-aggregation";
|
|
@@ -259,6 +259,8 @@ async function executeHealthCheckJob(props: {
|
|
|
259
259
|
interval: healthCheckConfigurations.intervalSeconds,
|
|
260
260
|
enabled: systemHealthChecks.enabled,
|
|
261
261
|
paused: healthCheckConfigurations.paused,
|
|
262
|
+
includeLocal: systemHealthChecks.includeLocal,
|
|
263
|
+
satelliteIds: systemHealthChecks.satelliteIds,
|
|
262
264
|
})
|
|
263
265
|
.from(systemHealthChecks)
|
|
264
266
|
.innerJoin(
|
|
@@ -289,6 +291,19 @@ async function executeHealthCheckJob(props: {
|
|
|
289
291
|
return;
|
|
290
292
|
}
|
|
291
293
|
|
|
294
|
+
// If includeLocal is false and satellites are assigned, skip local execution
|
|
295
|
+
// (satellites handle execution, local core doesn't run this check)
|
|
296
|
+
if (
|
|
297
|
+
!configRow.includeLocal &&
|
|
298
|
+
configRow.satelliteIds &&
|
|
299
|
+
configRow.satelliteIds.length > 0
|
|
300
|
+
) {
|
|
301
|
+
logger.debug(
|
|
302
|
+
`Health check ${configId} for system ${systemId} is satellite-only, skipping local execution`,
|
|
303
|
+
);
|
|
304
|
+
return;
|
|
305
|
+
}
|
|
306
|
+
|
|
292
307
|
// Fetch system name for signal payload
|
|
293
308
|
let systemName = systemId;
|
|
294
309
|
try {
|
|
@@ -400,7 +415,7 @@ async function executeHealthCheckJob(props: {
|
|
|
400
415
|
};
|
|
401
416
|
} catch (error) {
|
|
402
417
|
const errorStr =
|
|
403
|
-
|
|
418
|
+
extractErrorMessage(error);
|
|
404
419
|
logger.debug(`Collector ${storageKey} failed: ${errorStr}`);
|
|
405
420
|
return {
|
|
406
421
|
storageKey,
|
|
@@ -465,7 +480,7 @@ async function executeHealthCheckJob(props: {
|
|
|
465
480
|
} catch (error) {
|
|
466
481
|
const latencyMs = Math.round(performance.now() - start);
|
|
467
482
|
const caughtError =
|
|
468
|
-
|
|
483
|
+
extractErrorMessage(error);
|
|
469
484
|
|
|
470
485
|
// Use a specific error message if available, otherwise use the caught error
|
|
471
486
|
const finalError = errorMessage || caughtError;
|
|
@@ -486,6 +501,8 @@ async function executeHealthCheckJob(props: {
|
|
|
486
501
|
status: result.status,
|
|
487
502
|
latencyMs: result.latencyMs,
|
|
488
503
|
result: { ...result } as Record<string, unknown>,
|
|
504
|
+
sourceId: undefined,
|
|
505
|
+
sourceLabel: "Local",
|
|
489
506
|
});
|
|
490
507
|
|
|
491
508
|
await incrementHourlyAggregate({
|
|
@@ -497,6 +514,7 @@ async function executeHealthCheckJob(props: {
|
|
|
497
514
|
runTimestamp: new Date(),
|
|
498
515
|
result: { ...result } as Record<string, unknown>,
|
|
499
516
|
collectorRegistry,
|
|
517
|
+
sourceLabel: "Local",
|
|
500
518
|
});
|
|
501
519
|
|
|
502
520
|
logger.debug(
|
|
@@ -560,6 +578,8 @@ async function executeHealthCheckJob(props: {
|
|
|
560
578
|
status: result.status,
|
|
561
579
|
latencyMs: result.latencyMs,
|
|
562
580
|
result: { ...result } as Record<string, unknown>,
|
|
581
|
+
sourceId: undefined,
|
|
582
|
+
sourceLabel: "Local",
|
|
563
583
|
});
|
|
564
584
|
|
|
565
585
|
// Trigger incremental hourly aggregation
|
|
@@ -572,6 +592,7 @@ async function executeHealthCheckJob(props: {
|
|
|
572
592
|
runTimestamp: new Date(),
|
|
573
593
|
result: { ...result } as Record<string, unknown>,
|
|
574
594
|
collectorRegistry,
|
|
595
|
+
sourceLabel: "Local",
|
|
575
596
|
});
|
|
576
597
|
|
|
577
598
|
logger.debug(
|
|
@@ -660,6 +681,8 @@ async function executeHealthCheckJob(props: {
|
|
|
660
681
|
systemId,
|
|
661
682
|
status: "unhealthy",
|
|
662
683
|
result: { error: String(error) } as Record<string, unknown>,
|
|
684
|
+
sourceId: undefined,
|
|
685
|
+
sourceLabel: "Local",
|
|
663
686
|
});
|
|
664
687
|
|
|
665
688
|
// Trigger incremental hourly aggregation
|
|
@@ -672,6 +695,7 @@ async function executeHealthCheckJob(props: {
|
|
|
672
695
|
runTimestamp: new Date(),
|
|
673
696
|
// No collector data for error cases
|
|
674
697
|
collectorRegistry,
|
|
698
|
+
sourceLabel: "Local",
|
|
675
699
|
});
|
|
676
700
|
|
|
677
701
|
// Try to fetch names for the enriched signal (best-effort)
|
|
package/src/realtime-aggregation.ts
CHANGED
|
@@ -67,6 +67,10 @@ interface IncrementHourlyAggregateParams {
|
|
|
67
67
|
result?: Record<string, unknown>;
|
|
68
68
|
/** Collector registry for aggregating collector data via mergeResult */
|
|
69
69
|
collectorRegistry?: CollectorRegistry;
|
|
70
|
+
/** Source identifier: undefined = local core, string = satellite ID */
|
|
71
|
+
sourceId?: string;
|
|
72
|
+
/** Human-readable source label for display */
|
|
73
|
+
sourceLabel?: string;
|
|
70
74
|
}
|
|
71
75
|
|
|
72
76
|
/**
|
|
@@ -88,6 +92,8 @@ export async function incrementHourlyAggregate(
|
|
|
88
92
|
runTimestamp,
|
|
89
93
|
result,
|
|
90
94
|
collectorRegistry,
|
|
95
|
+
sourceId,
|
|
96
|
+
sourceLabel,
|
|
91
97
|
} = params;
|
|
92
98
|
|
|
93
99
|
const bucketStart = getHourBucketStart(runTimestamp);
|
|
@@ -107,6 +113,9 @@ export async function incrementHourlyAggregate(
|
|
|
107
113
|
eq(healthCheckAggregates.configurationId, configurationId),
|
|
108
114
|
eq(healthCheckAggregates.bucketStart, bucketStart),
|
|
109
115
|
eq(healthCheckAggregates.bucketSize, "hourly"),
|
|
116
|
+
sourceId
|
|
117
|
+
? eq(healthCheckAggregates.sourceId, sourceId)
|
|
118
|
+
: sql`${healthCheckAggregates.sourceId} IS NULL`,
|
|
110
119
|
),
|
|
111
120
|
)
|
|
112
121
|
.limit(1);
|
|
@@ -181,6 +190,8 @@ export async function incrementHourlyAggregate(
|
|
|
181
190
|
p95LatencyMs: latencyUpdate?.p95,
|
|
182
191
|
tdigestState: latencyUpdate?.tdigestState,
|
|
183
192
|
aggregatedResult,
|
|
193
|
+
sourceId: sourceId ?? undefined,
|
|
194
|
+
sourceLabel: sourceLabel ?? undefined,
|
|
184
195
|
})
|
|
185
196
|
.onConflictDoUpdate({
|
|
186
197
|
target: [
|
|
@@ -188,6 +199,7 @@ export async function incrementHourlyAggregate(
|
|
|
188
199
|
healthCheckAggregates.systemId,
|
|
189
200
|
healthCheckAggregates.bucketStart,
|
|
190
201
|
healthCheckAggregates.bucketSize,
|
|
202
|
+
healthCheckAggregates.sourceId,
|
|
191
203
|
],
|
|
192
204
|
set: {
|
|
193
205
|
runCount: sql`${healthCheckAggregates.runCount} + 1`,
|
package/src/router.test.ts
CHANGED
|
@@ -50,11 +50,12 @@ describe("HealthCheck Router", () => {
|
|
|
50
50
|
getCollectorsForPlugin: mock(() => []),
|
|
51
51
|
};
|
|
52
52
|
|
|
53
|
-
const router = createHealthCheckRouter(
|
|
54
|
-
mockDb as never,
|
|
55
|
-
mockRegistry,
|
|
56
|
-
mockCollectorRegistry as never,
|
|
57
|
-
|
|
53
|
+
const router = createHealthCheckRouter({
|
|
54
|
+
database: mockDb as never,
|
|
55
|
+
registry: mockRegistry,
|
|
56
|
+
collectorRegistry: mockCollectorRegistry as never,
|
|
57
|
+
getEmitHook: () => undefined,
|
|
58
|
+
});
|
|
58
59
|
|
|
59
60
|
it("getStrategies returns strategies from registry", async () => {
|
|
60
61
|
const context = createMockRpcContext({
|
package/src/router.ts
CHANGED
|
@@ -10,6 +10,7 @@ import {
|
|
|
10
10
|
import { healthCheckContract } from "@checkstack/healthcheck-common";
|
|
11
11
|
import type { StrategyCategory } from "@checkstack/healthcheck-common";
|
|
12
12
|
import { HealthCheckService } from "./service";
|
|
13
|
+
import { healthCheckHooks } from "./hooks";
|
|
13
14
|
import * as schema from "./schema";
|
|
14
15
|
import { toJsonSchemaWithChartMeta } from "./schema-utils";
|
|
15
16
|
|
|
@@ -19,11 +20,13 @@ import { toJsonSchemaWithChartMeta } from "./schema-utils";
|
|
|
19
20
|
* Auth and access rules are automatically enforced via autoAuthMiddleware
|
|
20
21
|
* based on the contract's meta.userType and meta.access.
|
|
21
22
|
*/
|
|
22
|
-
export const createHealthCheckRouter = (
|
|
23
|
-
database: SafeDatabase<typeof schema>,
|
|
24
|
-
registry: HealthCheckRegistry,
|
|
25
|
-
collectorRegistry: CollectorRegistry,
|
|
26
|
-
) => {
|
|
23
|
+
export const createHealthCheckRouter = (opts: {
|
|
24
|
+
database: SafeDatabase<typeof schema>;
|
|
25
|
+
registry: HealthCheckRegistry;
|
|
26
|
+
collectorRegistry: CollectorRegistry;
|
|
27
|
+
getEmitHook: () => ((hook: { id: string }, payload: Record<string, unknown>) => Promise<void>) | undefined;
|
|
28
|
+
}) => {
|
|
29
|
+
const { database, registry, collectorRegistry, getEmitHook } = opts;
|
|
27
30
|
// Create service instance once - shared across all handlers
|
|
28
31
|
const service = new HealthCheckService(database, registry, collectorRegistry);
|
|
29
32
|
|
|
@@ -137,6 +140,8 @@ export const createHealthCheckRouter = (
|
|
|
137
140
|
configurationId: input.body.configurationId,
|
|
138
141
|
enabled: input.body.enabled,
|
|
139
142
|
stateThresholds: input.body.stateThresholds,
|
|
143
|
+
satelliteIds: input.body.satelliteIds,
|
|
144
|
+
includeLocal: input.body.includeLocal,
|
|
140
145
|
});
|
|
141
146
|
|
|
142
147
|
// If enabling the health check, schedule it immediately
|
|
@@ -156,10 +161,28 @@ export const createHealthCheckRouter = (
|
|
|
156
161
|
});
|
|
157
162
|
}
|
|
158
163
|
}
|
|
164
|
+
|
|
165
|
+
// Notify subscribers (e.g., satellite-backend) that assignments changed
|
|
166
|
+
const emitHook = getEmitHook();
|
|
167
|
+
if (emitHook) {
|
|
168
|
+
await emitHook(healthCheckHooks.assignmentChanged, {
|
|
169
|
+
systemId: input.systemId,
|
|
170
|
+
configurationId: input.body.configurationId,
|
|
171
|
+
});
|
|
172
|
+
}
|
|
159
173
|
}),
|
|
160
174
|
|
|
161
175
|
disassociateSystem: os.disassociateSystem.handler(async ({ input }) => {
|
|
162
176
|
await service.disassociateSystem(input.systemId, input.configId);
|
|
177
|
+
|
|
178
|
+
// Notify subscribers that assignments changed
|
|
179
|
+
const emitHook = getEmitHook();
|
|
180
|
+
if (emitHook) {
|
|
181
|
+
await emitHook(healthCheckHooks.assignmentChanged, {
|
|
182
|
+
systemId: input.systemId,
|
|
183
|
+
configurationId: input.configId,
|
|
184
|
+
});
|
|
185
|
+
}
|
|
163
186
|
}),
|
|
164
187
|
|
|
165
188
|
getRetentionConfig: os.getRetentionConfig.handler(async ({ input }) => {
|
|
@@ -231,9 +254,21 @@ export const createHealthCheckRouter = (
|
|
|
231
254
|
},
|
|
232
255
|
),
|
|
233
256
|
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
257
|
+
// ========================================================================
|
|
258
|
+
// SERVICE INTERFACE (S2S — satellite-backend)
|
|
259
|
+
// ========================================================================
|
|
260
|
+
|
|
261
|
+
getAssignmentsForSatellite: os.getAssignmentsForSatellite.handler(
|
|
262
|
+
async ({ input }) => {
|
|
263
|
+
return service.getAssignmentsForSatellite(input.satelliteId);
|
|
264
|
+
},
|
|
265
|
+
),
|
|
266
|
+
|
|
267
|
+
ingestSatelliteResult: os.ingestSatelliteResult.handler(
|
|
268
|
+
async ({ input }) => {
|
|
269
|
+
await service.ingestSatelliteResult(input);
|
|
270
|
+
},
|
|
271
|
+
),
|
|
237
272
|
});
|
|
238
273
|
};
|
|
239
274
|
|
package/src/schema.ts
CHANGED
|
@@ -90,6 +90,16 @@ export const systemHealthChecks = pgTable(
|
|
|
90
90
|
* Null means use default retention settings.
|
|
91
91
|
*/
|
|
92
92
|
retentionConfig: jsonb("retention_config").$type<RetentionConfig>(),
|
|
93
|
+
/**
|
|
94
|
+
* IDs of satellites assigned to execute this health check.
|
|
95
|
+
* When set, the check runs on these satellite nodes in addition to (or instead of) the core.
|
|
96
|
+
*/
|
|
97
|
+
satelliteIds: jsonb("satellite_ids").$type<string[]>(),
|
|
98
|
+
/**
|
|
99
|
+
* Whether to also run this check locally on the core instance.
|
|
100
|
+
* Defaults to true. Only relevant when satelliteIds is set.
|
|
101
|
+
*/
|
|
102
|
+
includeLocal: boolean("include_local").default(true).notNull(),
|
|
93
103
|
createdAt: timestamp("created_at").defaultNow().notNull(),
|
|
94
104
|
updatedAt: timestamp("updated_at").defaultNow().notNull(),
|
|
95
105
|
},
|
|
@@ -108,6 +118,16 @@ export const healthCheckRuns = pgTable("health_check_runs", {
|
|
|
108
118
|
/** Execution duration in milliseconds */
|
|
109
119
|
latencyMs: integer("latency_ms"),
|
|
110
120
|
result: jsonb("result").$type<Record<string, unknown>>(),
|
|
121
|
+
/**
|
|
122
|
+
* Source identifier for result attribution.
|
|
123
|
+
* null = local core execution, UUID = satellite ID.
|
|
124
|
+
*/
|
|
125
|
+
sourceId: text("source_id"),
|
|
126
|
+
/**
|
|
127
|
+
* Human-readable source label for UI display.
|
|
128
|
+
* e.g. "Local" or "EU West (eu-west-1)".
|
|
129
|
+
*/
|
|
130
|
+
sourceLabel: text("source_label"),
|
|
111
131
|
timestamp: timestamp("timestamp").defaultNow().notNull(),
|
|
112
132
|
});
|
|
113
133
|
|
|
@@ -151,14 +171,24 @@ export const healthCheckAggregates = pgTable(
|
|
|
151
171
|
jsonb("aggregated_result").$type<Record<string, unknown>>(),
|
|
152
172
|
/** Serialized t-digest state for incremental p95 calculation */
|
|
153
173
|
tdigestState: jsonb("tdigest_state").$type<number[]>(),
|
|
174
|
+
/**
|
|
175
|
+
* Source identifier for per-region aggregation.
|
|
176
|
+
* null = local core execution, UUID = satellite ID.
|
|
177
|
+
*/
|
|
178
|
+
sourceId: text("source_id"),
|
|
179
|
+
/**
|
|
180
|
+
* Human-readable source label for UI display.
|
|
181
|
+
*/
|
|
182
|
+
sourceLabel: text("source_label"),
|
|
154
183
|
},
|
|
155
184
|
(t) => ({
|
|
156
|
-
// Unique constraint for
|
|
185
|
+
// Unique constraint includes sourceId for per-region aggregation
|
|
157
186
|
bucketUnique: uniqueIndex("health_check_aggregates_bucket_unique").on(
|
|
158
187
|
t.configurationId,
|
|
159
188
|
t.systemId,
|
|
160
189
|
t.bucketStart,
|
|
161
190
|
t.bucketSize,
|
|
191
|
+
t.sourceId,
|
|
162
192
|
),
|
|
163
193
|
}),
|
|
164
194
|
);
|