@checkstack/healthcheck-backend 1.3.0 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +329 -0
- package/drizzle/0015_quiet_meggan.sql +12 -0
- package/drizzle/0016_complex_maginty.sql +1 -0
- package/drizzle/0017_pretty_caretaker.sql +1 -0
- package/drizzle/meta/0015_snapshot.json +764 -0
- package/drizzle/meta/0016_snapshot.json +644 -0
- package/drizzle/meta/0017_snapshot.json +563 -0
- package/drizzle/meta/_journal.json +21 -0
- package/package.json +24 -21
- package/src/automations.test.ts +6 -27
- package/src/automations.ts +32 -30
- package/src/collector-script-test.test.ts +236 -0
- package/src/collector-script-test.ts +221 -0
- package/src/health-entity.test.ts +698 -0
- package/src/health-entity.ts +369 -0
- package/src/health-state.test.ts +115 -0
- package/src/health-state.ts +333 -0
- package/src/healthcheck-gitops-kinds.test.ts +6 -32
- package/src/healthcheck-gitops-kinds.ts +4 -19
- package/src/hooks.test.ts +19 -6
- package/src/hooks.ts +13 -68
- package/src/index.ts +115 -48
- package/src/queue-executor.ts +243 -444
- package/src/retention-job.ts +65 -1
- package/src/retention-state-transitions.test.ts +49 -0
- package/src/router.test.ts +13 -0
- package/src/router.ts +44 -0
- package/src/schema.ts +34 -54
- package/src/service-notification-policy.test.ts +28 -71
- package/src/service.ts +89 -0
- package/src/state-transitions.test.ts +126 -0
- package/src/state-transitions.ts +112 -0
- package/tsconfig.json +9 -0
- package/src/auto-incident-close-job.ts +0 -164
- package/src/auto-incident.test.ts +0 -196
- package/src/auto-incident.ts +0 -332
package/src/index.ts
CHANGED
|
@@ -3,7 +3,6 @@ import {
|
|
|
3
3
|
bootstrapHealthChecks,
|
|
4
4
|
} from "./queue-executor";
|
|
5
5
|
import { setupRetentionJob } from "./retention-job";
|
|
6
|
-
import { setupAutoIncidentCloseJob } from "./auto-incident-close-job";
|
|
7
6
|
import * as schema from "./schema";
|
|
8
7
|
import {
|
|
9
8
|
healthCheckAccessRules,
|
|
@@ -31,8 +30,19 @@ import {
|
|
|
31
30
|
automationActionExtensionPoint,
|
|
32
31
|
automationArtifactTypeExtensionPoint,
|
|
33
32
|
automationTriggerExtensionPoint,
|
|
33
|
+
entityExtensionPoint,
|
|
34
|
+
type EntityHandle,
|
|
34
35
|
} from "@checkstack/automation-backend";
|
|
36
|
+
import {
|
|
37
|
+
HEALTH_ENTITY_KIND,
|
|
38
|
+
HealthEntityStateSchema,
|
|
39
|
+
createHealthEntityRead,
|
|
40
|
+
deriveHealthTriggerEvents,
|
|
41
|
+
healthChangeToPayload,
|
|
42
|
+
type HealthEntityState,
|
|
43
|
+
} from "./health-entity";
|
|
35
44
|
import { entityKindExtensionPoint } from "@checkstack/gitops-backend";
|
|
45
|
+
import { secretResolverRef } from "@checkstack/secrets-backend";
|
|
36
46
|
import { createHealthCheckRouter } from "./router";
|
|
37
47
|
import { HealthCheckService } from "./service";
|
|
38
48
|
import {
|
|
@@ -41,11 +51,8 @@ import {
|
|
|
41
51
|
healthCheckTriggers,
|
|
42
52
|
} from "./automations";
|
|
43
53
|
import { registerHealthcheckGitOpsKinds, registerHealthcheckGitOpsDocumentation } from "./healthcheck-gitops-kinds";
|
|
44
|
-
import {
|
|
54
|
+
import { CATALOG_SYSTEM_ENTITY_KIND } from "@checkstack/catalog-backend";
|
|
45
55
|
import { satelliteHooks } from "@checkstack/satellite-backend";
|
|
46
|
-
import { incidentHooks } from "@checkstack/incident-backend";
|
|
47
|
-
import { eq, and, isNull } from "drizzle-orm";
|
|
48
|
-
import { healthCheckAutoIncidents } from "./schema";
|
|
49
56
|
import { CatalogApi } from "@checkstack/catalog-common";
|
|
50
57
|
import { MaintenanceApi } from "@checkstack/maintenance-common";
|
|
51
58
|
import { IncidentApi } from "@checkstack/incident-common";
|
|
@@ -57,6 +64,21 @@ import { createHealthCheckCache } from "./cache";
|
|
|
57
64
|
// Store emitHook reference for use during Phase 2 init
|
|
58
65
|
let storedEmitHook: EmitHookFn | undefined;
|
|
59
66
|
|
|
67
|
+
// The reactive `health` entity handle (§10.3). Defined in register() via
|
|
68
|
+
// the entity extension point (buffered until automation-backend registers
|
|
69
|
+
// the impl); mutations only fire from init() onward once the read accessor
|
|
70
|
+
// has its db + service.
|
|
71
|
+
let healthEntity: EntityHandle<HealthEntityState> | undefined;
|
|
72
|
+
|
|
73
|
+
// PLUGIN-BACKED + COMPUTED kind: the `health` aggregate has no domain table and
|
|
74
|
+
// no framework `entity_state` row — its current state is COMPUTED on read from
|
|
75
|
+
// the durable `health_check_runs` (via `getSystemHealthStatus`). The db +
|
|
76
|
+
// service are only available in init(), but the entity `read` accessor must be
|
|
77
|
+
// supplied at `defineEntity` time in register(). This holder bridges the two;
|
|
78
|
+
// init() sets it before any mutation runs (the queue worker — the only
|
|
79
|
+
// mutation site — is set up in init() after it is bound).
|
|
80
|
+
let healthEntityService: HealthCheckService | undefined;
|
|
81
|
+
|
|
60
82
|
export default createBackendPlugin({
|
|
61
83
|
metadata: pluginMetadata,
|
|
62
84
|
register(env) {
|
|
@@ -80,6 +102,47 @@ export default createBackendPlugin({
|
|
|
80
102
|
.getExtensionPoint(automationArtifactTypeExtensionPoint)
|
|
81
103
|
.registerArtifactType(assignmentArtifactType, pluginMetadata);
|
|
82
104
|
|
|
105
|
+
// ─── Reactive `health` entity (§10.3) ──────────────────────────────
|
|
106
|
+
// PLUGIN-BACKED + COMPUTED kind (Model B): the per-system aggregate has no
|
|
107
|
+
// domain table and NO framework `entity_state` row. `read` COMPUTES each
|
|
108
|
+
// system's `{ status, healthyChecks, totalChecks }` on demand from the same
|
|
109
|
+
// durable `health_check_runs` the rest of the plugin reads (via
|
|
110
|
+
// `getSystemHealthStatus`), gated on the system having at least one ENABLED
|
|
111
|
+
// check association — see `createHealthEntityRead`. A system with an enabled
|
|
112
|
+
// check but no runs yet resolves to the default-`healthy` baseline so a
|
|
113
|
+
// first-ever unhealthy run is a real `healthy → degraded` diff. The service
|
|
114
|
+
// is resolved in init() and bridged via the holder. The change →
|
|
115
|
+
// trigger-event deriver keeps the
|
|
116
|
+
// existing `healthcheck.system.degraded` / `.healthy` / `.health_changed`
|
|
117
|
+
// automations firing off the computed state.
|
|
118
|
+
const entityPoint = env.getExtensionPoint(entityExtensionPoint);
|
|
119
|
+
healthEntity = entityPoint.defineEntity<HealthEntityState>({
|
|
120
|
+
kind: HEALTH_ENTITY_KIND,
|
|
121
|
+
state: HealthEntityStateSchema,
|
|
122
|
+
read: (ids) => {
|
|
123
|
+
const service = healthEntityService;
|
|
124
|
+
if (!service) {
|
|
125
|
+
throw new Error(
|
|
126
|
+
"health entity read before init: service not yet resolved",
|
|
127
|
+
);
|
|
128
|
+
}
|
|
129
|
+
return createHealthEntityRead({ service })(ids);
|
|
130
|
+
},
|
|
131
|
+
});
|
|
132
|
+
entityPoint.registerChangeDeriver({
|
|
133
|
+
kind: HEALTH_ENTITY_KIND,
|
|
134
|
+
derive: deriveHealthTriggerEvents,
|
|
135
|
+
toPayload: healthChangeToPayload,
|
|
136
|
+
});
|
|
137
|
+
// Raw per-check samples + cursors are intentionally NON-reactive (§5):
|
|
138
|
+
// a firehose of individual runs would melt the wake-index; the
|
|
139
|
+
// aggregate is the entity.
|
|
140
|
+
entityPoint.declareNonReactiveState({
|
|
141
|
+
table: "health_check_runs",
|
|
142
|
+
reason: "raw-sample",
|
|
143
|
+
note: "High-frequency individual check executions. The per-system aggregate is the `health` entity; raw runs stay a numeric_state wake source only.",
|
|
144
|
+
});
|
|
145
|
+
|
|
83
146
|
// ─── GitOps Entity Kind Registration ───────────────────────────────
|
|
84
147
|
// Mutable refs — populated during init(), consumed by reconcile closures.
|
|
85
148
|
let gitopsDb: SafeDatabase<typeof schema> | undefined;
|
|
@@ -134,6 +197,7 @@ export default createBackendPlugin({
|
|
|
134
197
|
signalService: coreServices.signalService,
|
|
135
198
|
cacheManager: coreServices.cacheManager,
|
|
136
199
|
config: coreServices.config,
|
|
200
|
+
secretResolver: secretResolverRef,
|
|
137
201
|
},
|
|
138
202
|
// Phase 2: Register router and setup worker
|
|
139
203
|
init: async ({
|
|
@@ -147,6 +211,7 @@ export default createBackendPlugin({
|
|
|
147
211
|
signalService,
|
|
148
212
|
cacheManager,
|
|
149
213
|
config,
|
|
214
|
+
secretResolver,
|
|
150
215
|
}) => {
|
|
151
216
|
logger.debug("🏥 Initializing Health Check Backend...");
|
|
152
217
|
|
|
@@ -156,6 +221,17 @@ export default createBackendPlugin({
|
|
|
156
221
|
gitopsCollectorRegistry = collectorRegistry;
|
|
157
222
|
gitopsQueueManager = queueManager;
|
|
158
223
|
|
|
224
|
+
// Bind the COMPUTE-ON-READ accessor's service (built over the db) for the `health`
|
|
225
|
+
// entity (defined in register()). From here onward the entity `read`
|
|
226
|
+
// computes each system's aggregate from durable `health_check_runs`,
|
|
227
|
+
// and the queue worker (set up just below — the only mutation site)
|
|
228
|
+
// drives writes through `handle.mutate`.
|
|
229
|
+
healthEntityService = new HealthCheckService(
|
|
230
|
+
database,
|
|
231
|
+
healthCheckRegistry,
|
|
232
|
+
collectorRegistry,
|
|
233
|
+
);
|
|
234
|
+
|
|
159
235
|
// Create catalog client for notification delegation
|
|
160
236
|
const catalogClient = rpcClient.forPlugin(CatalogApi);
|
|
161
237
|
|
|
@@ -191,7 +267,9 @@ export default createBackendPlugin({
|
|
|
191
267
|
maintenanceClient,
|
|
192
268
|
incidentClient,
|
|
193
269
|
getEmitHook: () => storedEmitHook,
|
|
270
|
+
getHealthEntity: () => healthEntity,
|
|
194
271
|
cache,
|
|
272
|
+
secretResolver,
|
|
195
273
|
});
|
|
196
274
|
|
|
197
275
|
// Setup retention job for tiered storage (daily aggregation)
|
|
@@ -201,15 +279,12 @@ export default createBackendPlugin({
|
|
|
201
279
|
queueManager,
|
|
202
280
|
});
|
|
203
281
|
|
|
204
|
-
//
|
|
205
|
-
//
|
|
206
|
-
// for
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
queueManager,
|
|
211
|
-
incidentClient,
|
|
212
|
-
});
|
|
282
|
+
// The hardcoded auto-incident open/close path was removed — auto-
|
|
283
|
+
// incident behaviour is now built entirely by user automations
|
|
284
|
+
// (e.g. `healthcheck.system_degraded` + `for:` → `incident.create`).
|
|
285
|
+
// Flapping is detected by the automation engine's windowed-count gate
|
|
286
|
+
// on the `system_health_changed` trigger — healthcheck emits only the
|
|
287
|
+
// raw aggregated-health change (via the reactive `health` entity).
|
|
213
288
|
|
|
214
289
|
const healthCheckRouter = createHealthCheckRouter({
|
|
215
290
|
database: database as SafeDatabase<typeof schema>,
|
|
@@ -220,6 +295,8 @@ export default createBackendPlugin({
|
|
|
220
295
|
cache,
|
|
221
296
|
configService: config,
|
|
222
297
|
catalogClient,
|
|
298
|
+
maintenanceClient,
|
|
299
|
+
logger,
|
|
223
300
|
});
|
|
224
301
|
rpc.registerRouter(healthCheckRouter, healthCheckContract);
|
|
225
302
|
|
|
@@ -310,17 +387,23 @@ export default createBackendPlugin({
|
|
|
310
387
|
automationActions.registerAction(action, pluginMetadata);
|
|
311
388
|
}
|
|
312
389
|
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
390
|
+
// React to catalog system deletion (tombstone) via the reactive
|
|
391
|
+
// `catalog-system` entity instead of the (removed) `system.deleted`
|
|
392
|
+
// hook (§10.4). `work-queue` delivery preserved: association cleanup
|
|
393
|
+
// must run once per cluster, not per-instance.
|
|
394
|
+
entityPoint.onEntityChanged({
|
|
395
|
+
kind: CATALOG_SYSTEM_ENTITY_KIND,
|
|
396
|
+
handler: async (change) => {
|
|
397
|
+
if (change.next !== null) return; // tombstone only
|
|
398
|
+
const systemId = change.id;
|
|
316
399
|
logger.debug(
|
|
317
|
-
`Cleaning up health check associations for deleted system: ${
|
|
400
|
+
`Cleaning up health check associations for deleted system: ${systemId}`,
|
|
318
401
|
);
|
|
319
|
-
await service.removeAllSystemAssociations(
|
|
320
|
-
await healthCheckCache?.invalidateSystem(
|
|
402
|
+
await service.removeAllSystemAssociations(systemId);
|
|
403
|
+
await healthCheckCache?.invalidateSystem(systemId);
|
|
321
404
|
},
|
|
322
|
-
{ mode: "work-queue", workerGroup: "system-cleanup" },
|
|
323
|
-
);
|
|
405
|
+
delivery: { mode: "work-queue", workerGroup: "system-cleanup" },
|
|
406
|
+
});
|
|
324
407
|
|
|
325
408
|
// Subscribe to satellite deletion to scrub satellite IDs from associations
|
|
326
409
|
onHook(
|
|
@@ -337,32 +420,6 @@ export default createBackendPlugin({
|
|
|
337
420
|
{ mode: "work-queue", workerGroup: "satellite-cleanup" },
|
|
338
421
|
);
|
|
339
422
|
|
|
340
|
-
// Sync our auto-incident mapping when an incident is resolved.
|
|
341
|
-
// Without this, a manually-closed incident would still appear
|
|
342
|
-
// "active" in our mapping, blocking the require-recovery rule
|
|
343
|
-
// from re-evaluating fresh transitions.
|
|
344
|
-
onHook(
|
|
345
|
-
incidentHooks.incidentResolved,
|
|
346
|
-
async ({ incidentId }) => {
|
|
347
|
-
const updated = await database
|
|
348
|
-
.update(healthCheckAutoIncidents)
|
|
349
|
-
.set({ closedAt: new Date() })
|
|
350
|
-
.where(
|
|
351
|
-
and(
|
|
352
|
-
eq(healthCheckAutoIncidents.incidentId, incidentId),
|
|
353
|
-
isNull(healthCheckAutoIncidents.closedAt),
|
|
354
|
-
),
|
|
355
|
-
)
|
|
356
|
-
.returning({ id: healthCheckAutoIncidents.id });
|
|
357
|
-
if (updated.length > 0) {
|
|
358
|
-
logger.debug(
|
|
359
|
-
`Marked auto-incident mapping closed for resolved incident ${incidentId}`,
|
|
360
|
-
);
|
|
361
|
-
}
|
|
362
|
-
},
|
|
363
|
-
{ mode: "work-queue", workerGroup: "auto-incident-sync" },
|
|
364
|
-
);
|
|
365
|
-
|
|
366
423
|
logger.debug("✅ Health Check Backend afterPluginsReady complete.");
|
|
367
424
|
},
|
|
368
425
|
});
|
|
@@ -371,3 +428,13 @@ export default createBackendPlugin({
|
|
|
371
428
|
|
|
372
429
|
// Re-export hooks for other plugins to use
|
|
373
430
|
export { healthCheckHooks } from "./hooks";
|
|
431
|
+
|
|
432
|
+
// Re-export the reactive `health` entity surface so cross-plugin consumers
|
|
433
|
+
// (slo, dependency) can subscribe via onEntityChanged + classify changes
|
|
434
|
+
// without duplicating the kind id / transition predicate (§10.3).
|
|
435
|
+
export {
|
|
436
|
+
HEALTH_ENTITY_KIND,
|
|
437
|
+
classifyHealthChange,
|
|
438
|
+
type HealthChangeClassification,
|
|
439
|
+
type HealthEntityState,
|
|
440
|
+
} from "./health-entity";
|