@checkstack/healthcheck-backend 1.3.0 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +409 -0
- package/drizzle/0015_quiet_meggan.sql +12 -0
- package/drizzle/0016_complex_maginty.sql +1 -0
- package/drizzle/0017_pretty_caretaker.sql +1 -0
- package/drizzle/meta/0015_snapshot.json +764 -0
- package/drizzle/meta/0016_snapshot.json +644 -0
- package/drizzle/meta/0017_snapshot.json +563 -0
- package/drizzle/meta/_journal.json +21 -0
- package/package.json +24 -21
- package/src/automations.test.ts +6 -27
- package/src/automations.ts +32 -30
- package/src/collector-script-test.test.ts +236 -0
- package/src/collector-script-test.ts +221 -0
- package/src/health-entity.test.ts +694 -0
- package/src/health-entity.ts +367 -0
- package/src/health-state.test.ts +115 -0
- package/src/health-state.ts +333 -0
- package/src/healthcheck-gitops-kinds.test.ts +6 -32
- package/src/healthcheck-gitops-kinds.ts +4 -19
- package/src/hooks.test.ts +19 -6
- package/src/hooks.ts +13 -68
- package/src/index.ts +118 -48
- package/src/queue-executor.test.ts +13 -0
- package/src/queue-executor.ts +251 -444
- package/src/retention-job.ts +65 -1
- package/src/retention-state-transitions.test.ts +49 -0
- package/src/router.test.ts +13 -0
- package/src/router.ts +44 -0
- package/src/schema.ts +34 -54
- package/src/service-notification-policy.test.ts +28 -71
- package/src/service.ts +89 -0
- package/src/state-evaluator.test.ts +50 -5
- package/src/state-evaluator.ts +9 -2
- package/src/state-transitions.test.ts +126 -0
- package/src/state-transitions.ts +112 -0
- package/tsconfig.json +9 -0
- package/src/auto-incident-close-job.ts +0 -164
- package/src/auto-incident.test.ts +0 -196
- package/src/auto-incident.ts +0 -332
package/src/index.ts
CHANGED
|
@@ -3,7 +3,6 @@ import {
|
|
|
3
3
|
bootstrapHealthChecks,
|
|
4
4
|
} from "./queue-executor";
|
|
5
5
|
import { setupRetentionJob } from "./retention-job";
|
|
6
|
-
import { setupAutoIncidentCloseJob } from "./auto-incident-close-job";
|
|
7
6
|
import * as schema from "./schema";
|
|
8
7
|
import {
|
|
9
8
|
healthCheckAccessRules,
|
|
@@ -31,8 +30,19 @@ import {
|
|
|
31
30
|
automationActionExtensionPoint,
|
|
32
31
|
automationArtifactTypeExtensionPoint,
|
|
33
32
|
automationTriggerExtensionPoint,
|
|
33
|
+
entityExtensionPoint,
|
|
34
|
+
type EntityHandle,
|
|
34
35
|
} from "@checkstack/automation-backend";
|
|
36
|
+
import {
|
|
37
|
+
HEALTH_ENTITY_KIND,
|
|
38
|
+
HealthEntityStateSchema,
|
|
39
|
+
createHealthEntityRead,
|
|
40
|
+
deriveHealthTriggerEvents,
|
|
41
|
+
healthChangeToPayload,
|
|
42
|
+
type HealthEntityState,
|
|
43
|
+
} from "./health-entity";
|
|
35
44
|
import { entityKindExtensionPoint } from "@checkstack/gitops-backend";
|
|
45
|
+
import { secretResolverRef } from "@checkstack/secrets-backend";
|
|
36
46
|
import { createHealthCheckRouter } from "./router";
|
|
37
47
|
import { HealthCheckService } from "./service";
|
|
38
48
|
import {
|
|
@@ -41,11 +51,8 @@ import {
|
|
|
41
51
|
healthCheckTriggers,
|
|
42
52
|
} from "./automations";
|
|
43
53
|
import { registerHealthcheckGitOpsKinds, registerHealthcheckGitOpsDocumentation } from "./healthcheck-gitops-kinds";
|
|
44
|
-
import {
|
|
54
|
+
import { CATALOG_SYSTEM_ENTITY_KIND } from "@checkstack/catalog-backend";
|
|
45
55
|
import { satelliteHooks } from "@checkstack/satellite-backend";
|
|
46
|
-
import { incidentHooks } from "@checkstack/incident-backend";
|
|
47
|
-
import { eq, and, isNull } from "drizzle-orm";
|
|
48
|
-
import { healthCheckAutoIncidents } from "./schema";
|
|
49
56
|
import { CatalogApi } from "@checkstack/catalog-common";
|
|
50
57
|
import { MaintenanceApi } from "@checkstack/maintenance-common";
|
|
51
58
|
import { IncidentApi } from "@checkstack/incident-common";
|
|
@@ -57,6 +64,21 @@ import { createHealthCheckCache } from "./cache";
|
|
|
57
64
|
// Store emitHook reference for use during Phase 2 init
|
|
58
65
|
let storedEmitHook: EmitHookFn | undefined;
|
|
59
66
|
|
|
67
|
+
// The reactive `health` entity handle (§10.3). Defined in register() via
|
|
68
|
+
// the entity extension point (buffered until automation-backend registers
|
|
69
|
+
// the impl); mutations only fire from init() onward once the read accessor
|
|
70
|
+
// has its db + service.
|
|
71
|
+
let healthEntity: EntityHandle<HealthEntityState> | undefined;
|
|
72
|
+
|
|
73
|
+
// PLUGIN-BACKED + COMPUTED kind: the `health` aggregate has no domain table and
|
|
74
|
+
// no framework `entity_state` row — its current state is COMPUTED on read from
|
|
75
|
+
// the durable `health_check_runs` (via `getSystemHealthStatus`). The db +
|
|
76
|
+
// service are only available in init(), but the entity `read` accessor must be
|
|
77
|
+
// supplied at `defineEntity` time in register(). This holder bridges the two;
|
|
78
|
+
// init() sets it before any mutation runs (the queue worker — the only
|
|
79
|
+
// mutation site — is set up in init() after it is bound).
|
|
80
|
+
let healthEntityService: HealthCheckService | undefined;
|
|
81
|
+
|
|
60
82
|
export default createBackendPlugin({
|
|
61
83
|
metadata: pluginMetadata,
|
|
62
84
|
register(env) {
|
|
@@ -80,6 +102,47 @@ export default createBackendPlugin({
|
|
|
80
102
|
.getExtensionPoint(automationArtifactTypeExtensionPoint)
|
|
81
103
|
.registerArtifactType(assignmentArtifactType, pluginMetadata);
|
|
82
104
|
|
|
105
|
+
// ─── Reactive `health` entity (§10.3) ──────────────────────────────
|
|
106
|
+
// PLUGIN-BACKED + COMPUTED kind (Model B): the per-system aggregate has no
|
|
107
|
+
// domain table and NO framework `entity_state` row. `read` COMPUTES each
|
|
108
|
+
// system's `{ status, healthyChecks, totalChecks }` on demand from the same
|
|
109
|
+
// durable `health_check_runs` the rest of the plugin reads (via
|
|
110
|
+
// `getSystemHealthStatus`), gated on the system having at least one ENABLED
|
|
111
|
+
// check association — see `createHealthEntityRead`. A system with an enabled
|
|
112
|
+
// check but no runs yet resolves to the default-`healthy` baseline so a
|
|
113
|
+
// first-ever unhealthy run is a real `healthy → degraded` diff. The service
|
|
114
|
+
// is resolved in init() and bridged via the holder. The change →
|
|
115
|
+
// trigger-event deriver keeps the
|
|
116
|
+
// existing `healthcheck.system.degraded` / `.healthy` / `.health_changed`
|
|
117
|
+
// automations firing off the computed state.
|
|
118
|
+
const entityPoint = env.getExtensionPoint(entityExtensionPoint);
|
|
119
|
+
healthEntity = entityPoint.defineEntity<HealthEntityState>({
|
|
120
|
+
kind: HEALTH_ENTITY_KIND,
|
|
121
|
+
state: HealthEntityStateSchema,
|
|
122
|
+
read: (ids) => {
|
|
123
|
+
const service = healthEntityService;
|
|
124
|
+
if (!service) {
|
|
125
|
+
throw new Error(
|
|
126
|
+
"health entity read before init: service not yet resolved",
|
|
127
|
+
);
|
|
128
|
+
}
|
|
129
|
+
return createHealthEntityRead({ service })(ids);
|
|
130
|
+
},
|
|
131
|
+
});
|
|
132
|
+
entityPoint.registerChangeDeriver({
|
|
133
|
+
kind: HEALTH_ENTITY_KIND,
|
|
134
|
+
derive: deriveHealthTriggerEvents,
|
|
135
|
+
toPayload: healthChangeToPayload,
|
|
136
|
+
});
|
|
137
|
+
// Raw per-check samples + cursors are intentionally NON-reactive (§5):
|
|
138
|
+
// a firehose of individual runs would melt the wake-index; the
|
|
139
|
+
// aggregate is the entity.
|
|
140
|
+
entityPoint.declareNonReactiveState({
|
|
141
|
+
table: "health_check_runs",
|
|
142
|
+
reason: "raw-sample",
|
|
143
|
+
note: "High-frequency individual check executions. The per-system aggregate is the `health` entity; raw runs stay a numeric_state wake source only.",
|
|
144
|
+
});
|
|
145
|
+
|
|
83
146
|
// ─── GitOps Entity Kind Registration ───────────────────────────────
|
|
84
147
|
// Mutable refs — populated during init(), consumed by reconcile closures.
|
|
85
148
|
let gitopsDb: SafeDatabase<typeof schema> | undefined;
|
|
@@ -134,6 +197,8 @@ export default createBackendPlugin({
|
|
|
134
197
|
signalService: coreServices.signalService,
|
|
135
198
|
cacheManager: coreServices.cacheManager,
|
|
136
199
|
config: coreServices.config,
|
|
200
|
+
secretResolver: secretResolverRef,
|
|
201
|
+
advisoryLock: coreServices.advisoryLock,
|
|
137
202
|
},
|
|
138
203
|
// Phase 2: Register router and setup worker
|
|
139
204
|
init: async ({
|
|
@@ -147,6 +212,8 @@ export default createBackendPlugin({
|
|
|
147
212
|
signalService,
|
|
148
213
|
cacheManager,
|
|
149
214
|
config,
|
|
215
|
+
secretResolver,
|
|
216
|
+
advisoryLock,
|
|
150
217
|
}) => {
|
|
151
218
|
logger.debug("🏥 Initializing Health Check Backend...");
|
|
152
219
|
|
|
@@ -156,6 +223,17 @@ export default createBackendPlugin({
|
|
|
156
223
|
gitopsCollectorRegistry = collectorRegistry;
|
|
157
224
|
gitopsQueueManager = queueManager;
|
|
158
225
|
|
|
226
|
+
// Bind the COMPUTE-ON-READ accessor's service (built over the db) for the `health`
|
|
227
|
+
// entity (defined in register()). From here onward the entity `read`
|
|
228
|
+
// computes each system's aggregate from durable `health_check_runs`,
|
|
229
|
+
// and the queue worker (set up just below — the only mutation site)
|
|
230
|
+
// drives writes through `handle.mutate`.
|
|
231
|
+
healthEntityService = new HealthCheckService(
|
|
232
|
+
database,
|
|
233
|
+
healthCheckRegistry,
|
|
234
|
+
collectorRegistry,
|
|
235
|
+
);
|
|
236
|
+
|
|
159
237
|
// Create catalog client for notification delegation
|
|
160
238
|
const catalogClient = rpcClient.forPlugin(CatalogApi);
|
|
161
239
|
|
|
@@ -182,6 +260,7 @@ export default createBackendPlugin({
|
|
|
182
260
|
await setupHealthCheckWorker({
|
|
183
261
|
notificationClient,
|
|
184
262
|
db: database,
|
|
263
|
+
advisoryLock,
|
|
185
264
|
registry: healthCheckRegistry,
|
|
186
265
|
collectorRegistry,
|
|
187
266
|
logger,
|
|
@@ -191,7 +270,9 @@ export default createBackendPlugin({
|
|
|
191
270
|
maintenanceClient,
|
|
192
271
|
incidentClient,
|
|
193
272
|
getEmitHook: () => storedEmitHook,
|
|
273
|
+
getHealthEntity: () => healthEntity,
|
|
194
274
|
cache,
|
|
275
|
+
secretResolver,
|
|
195
276
|
});
|
|
196
277
|
|
|
197
278
|
// Setup retention job for tiered storage (daily aggregation)
|
|
@@ -201,15 +282,12 @@ export default createBackendPlugin({
|
|
|
201
282
|
queueManager,
|
|
202
283
|
});
|
|
203
284
|
|
|
204
|
-
//
|
|
205
|
-
//
|
|
206
|
-
// for
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
queueManager,
|
|
211
|
-
incidentClient,
|
|
212
|
-
});
|
|
285
|
+
// The hardcoded auto-incident open/close path was removed — auto-
|
|
286
|
+
// incident behaviour is now built entirely by user automations
|
|
287
|
+
// (e.g. `healthcheck.system_degraded` + `for:` → `incident.create`).
|
|
288
|
+
// Flapping is detected by the automation engine's windowed-count gate
|
|
289
|
+
// on the `system_health_changed` trigger — healthcheck emits only the
|
|
290
|
+
// raw aggregated-health change (via the reactive `health` entity).
|
|
213
291
|
|
|
214
292
|
const healthCheckRouter = createHealthCheckRouter({
|
|
215
293
|
database: database as SafeDatabase<typeof schema>,
|
|
@@ -220,6 +298,8 @@ export default createBackendPlugin({
|
|
|
220
298
|
cache,
|
|
221
299
|
configService: config,
|
|
222
300
|
catalogClient,
|
|
301
|
+
maintenanceClient,
|
|
302
|
+
logger,
|
|
223
303
|
});
|
|
224
304
|
rpc.registerRouter(healthCheckRouter, healthCheckContract);
|
|
225
305
|
|
|
@@ -310,17 +390,23 @@ export default createBackendPlugin({
|
|
|
310
390
|
automationActions.registerAction(action, pluginMetadata);
|
|
311
391
|
}
|
|
312
392
|
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
393
|
+
// React to catalog system deletion (tombstone) via the reactive
|
|
394
|
+
// `catalog-system` entity instead of the (removed) `system.deleted`
|
|
395
|
+
// hook (§10.4). `work-queue` delivery preserved: association cleanup
|
|
396
|
+
// must run once per cluster, not per-instance.
|
|
397
|
+
entityPoint.onEntityChanged({
|
|
398
|
+
kind: CATALOG_SYSTEM_ENTITY_KIND,
|
|
399
|
+
handler: async (change) => {
|
|
400
|
+
if (change.next !== null) return; // tombstone only
|
|
401
|
+
const systemId = change.id;
|
|
316
402
|
logger.debug(
|
|
317
|
-
`Cleaning up health check associations for deleted system: ${
|
|
403
|
+
`Cleaning up health check associations for deleted system: ${systemId}`,
|
|
318
404
|
);
|
|
319
|
-
await service.removeAllSystemAssociations(
|
|
320
|
-
await healthCheckCache?.invalidateSystem(
|
|
405
|
+
await service.removeAllSystemAssociations(systemId);
|
|
406
|
+
await healthCheckCache?.invalidateSystem(systemId);
|
|
321
407
|
},
|
|
322
|
-
{ mode: "work-queue", workerGroup: "system-cleanup" },
|
|
323
|
-
);
|
|
408
|
+
delivery: { mode: "work-queue", workerGroup: "system-cleanup" },
|
|
409
|
+
});
|
|
324
410
|
|
|
325
411
|
// Subscribe to satellite deletion to scrub satellite IDs from associations
|
|
326
412
|
onHook(
|
|
@@ -337,32 +423,6 @@ export default createBackendPlugin({
|
|
|
337
423
|
{ mode: "work-queue", workerGroup: "satellite-cleanup" },
|
|
338
424
|
);
|
|
339
425
|
|
|
340
|
-
// Sync our auto-incident mapping when an incident is resolved.
|
|
341
|
-
// Without this, a manually-closed incident would still appear
|
|
342
|
-
// "active" in our mapping, blocking the require-recovery rule
|
|
343
|
-
// from re-evaluating fresh transitions.
|
|
344
|
-
onHook(
|
|
345
|
-
incidentHooks.incidentResolved,
|
|
346
|
-
async ({ incidentId }) => {
|
|
347
|
-
const updated = await database
|
|
348
|
-
.update(healthCheckAutoIncidents)
|
|
349
|
-
.set({ closedAt: new Date() })
|
|
350
|
-
.where(
|
|
351
|
-
and(
|
|
352
|
-
eq(healthCheckAutoIncidents.incidentId, incidentId),
|
|
353
|
-
isNull(healthCheckAutoIncidents.closedAt),
|
|
354
|
-
),
|
|
355
|
-
)
|
|
356
|
-
.returning({ id: healthCheckAutoIncidents.id });
|
|
357
|
-
if (updated.length > 0) {
|
|
358
|
-
logger.debug(
|
|
359
|
-
`Marked auto-incident mapping closed for resolved incident ${incidentId}`,
|
|
360
|
-
);
|
|
361
|
-
}
|
|
362
|
-
},
|
|
363
|
-
{ mode: "work-queue", workerGroup: "auto-incident-sync" },
|
|
364
|
-
);
|
|
365
|
-
|
|
366
426
|
logger.debug("✅ Health Check Backend afterPluginsReady complete.");
|
|
367
427
|
},
|
|
368
428
|
});
|
|
@@ -371,3 +431,13 @@ export default createBackendPlugin({
|
|
|
371
431
|
|
|
372
432
|
// Re-export hooks for other plugins to use
|
|
373
433
|
export { healthCheckHooks } from "./hooks";
|
|
434
|
+
|
|
435
|
+
// Re-export the reactive `health` entity surface so cross-plugin consumers
|
|
436
|
+
// (slo, dependency) can subscribe via onEntityChanged + classify changes
|
|
437
|
+
// without duplicating the kind id / transition predicate (§10.3).
|
|
438
|
+
export {
|
|
439
|
+
HEALTH_ENTITY_KIND,
|
|
440
|
+
classifyHealthChange,
|
|
441
|
+
type HealthChangeClassification,
|
|
442
|
+
type HealthEntityState,
|
|
443
|
+
} from "./health-entity";
|
|
@@ -13,6 +13,16 @@ const passthroughCache: HealthCheckCache = {
|
|
|
13
13
|
invalidateAllSystems: async () => 0,
|
|
14
14
|
scope: {} as HealthCheckCache["scope"],
|
|
15
15
|
};
|
|
16
|
+
|
|
17
|
+
// Pass-through advisory lock: these tests don't exercise cross-pod
|
|
18
|
+
// serialization, so run the critical section directly.
|
|
19
|
+
const mockAdvisoryLock: Parameters<
|
|
20
|
+
typeof setupHealthCheckWorker
|
|
21
|
+
>[0]["advisoryLock"] = {
|
|
22
|
+
tryAcquire: async () => ({ release: async () => {} }),
|
|
23
|
+
withXactLock: <T>({ fn }: { key: string; fn: () => Promise<T> }): Promise<T> =>
|
|
24
|
+
fn(),
|
|
25
|
+
};
|
|
16
26
|
import {
|
|
17
27
|
createMockLogger,
|
|
18
28
|
createMockQueueManager,
|
|
@@ -179,6 +189,7 @@ describe("Queue-Based Health Check Executor", () => {
|
|
|
179
189
|
db: mockDb as unknown as Parameters<
|
|
180
190
|
typeof setupHealthCheckWorker
|
|
181
191
|
>[0]["db"],
|
|
192
|
+
advisoryLock: mockAdvisoryLock,
|
|
182
193
|
registry: mockRegistry,
|
|
183
194
|
collectorRegistry:
|
|
184
195
|
createMockCollectorRegistry() as unknown as Parameters<
|
|
@@ -376,6 +387,7 @@ describe("Queue-Based Health Check Executor", () => {
|
|
|
376
387
|
db: mockDb as unknown as Parameters<
|
|
377
388
|
typeof setupHealthCheckWorker
|
|
378
389
|
>[0]["db"],
|
|
390
|
+
advisoryLock: mockAdvisoryLock,
|
|
379
391
|
registry: mockRegistry,
|
|
380
392
|
collectorRegistry:
|
|
381
393
|
createMockCollectorRegistry() as unknown as Parameters<
|
|
@@ -510,6 +522,7 @@ describe("Queue-Based Health Check Executor", () => {
|
|
|
510
522
|
db: mockDb as unknown as Parameters<
|
|
511
523
|
typeof setupHealthCheckWorker
|
|
512
524
|
>[0]["db"],
|
|
525
|
+
advisoryLock: mockAdvisoryLock,
|
|
513
526
|
registry: mockRegistry,
|
|
514
527
|
collectorRegistry: mockCollectorRegistry as unknown as Parameters<
|
|
515
528
|
typeof setupHealthCheckWorker
|