@checkstack/healthcheck-backend 1.3.0 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. package/CHANGELOG.md +409 -0
  2. package/drizzle/0015_quiet_meggan.sql +12 -0
  3. package/drizzle/0016_complex_maginty.sql +1 -0
  4. package/drizzle/0017_pretty_caretaker.sql +1 -0
  5. package/drizzle/meta/0015_snapshot.json +764 -0
  6. package/drizzle/meta/0016_snapshot.json +644 -0
  7. package/drizzle/meta/0017_snapshot.json +563 -0
  8. package/drizzle/meta/_journal.json +21 -0
  9. package/package.json +24 -21
  10. package/src/automations.test.ts +6 -27
  11. package/src/automations.ts +32 -30
  12. package/src/collector-script-test.test.ts +236 -0
  13. package/src/collector-script-test.ts +221 -0
  14. package/src/health-entity.test.ts +694 -0
  15. package/src/health-entity.ts +367 -0
  16. package/src/health-state.test.ts +115 -0
  17. package/src/health-state.ts +333 -0
  18. package/src/healthcheck-gitops-kinds.test.ts +6 -32
  19. package/src/healthcheck-gitops-kinds.ts +4 -19
  20. package/src/hooks.test.ts +19 -6
  21. package/src/hooks.ts +13 -68
  22. package/src/index.ts +118 -48
  23. package/src/queue-executor.test.ts +13 -0
  24. package/src/queue-executor.ts +251 -444
  25. package/src/retention-job.ts +65 -1
  26. package/src/retention-state-transitions.test.ts +49 -0
  27. package/src/router.test.ts +13 -0
  28. package/src/router.ts +44 -0
  29. package/src/schema.ts +34 -54
  30. package/src/service-notification-policy.test.ts +28 -71
  31. package/src/service.ts +89 -0
  32. package/src/state-evaluator.test.ts +50 -5
  33. package/src/state-evaluator.ts +9 -2
  34. package/src/state-transitions.test.ts +126 -0
  35. package/src/state-transitions.ts +112 -0
  36. package/tsconfig.json +9 -0
  37. package/src/auto-incident-close-job.ts +0 -164
  38. package/src/auto-incident.test.ts +0 -196
  39. package/src/auto-incident.ts +0 -332
package/src/index.ts CHANGED
@@ -3,7 +3,6 @@ import {
3
3
  bootstrapHealthChecks,
4
4
  } from "./queue-executor";
5
5
  import { setupRetentionJob } from "./retention-job";
6
- import { setupAutoIncidentCloseJob } from "./auto-incident-close-job";
7
6
  import * as schema from "./schema";
8
7
  import {
9
8
  healthCheckAccessRules,
@@ -31,8 +30,19 @@ import {
31
30
  automationActionExtensionPoint,
32
31
  automationArtifactTypeExtensionPoint,
33
32
  automationTriggerExtensionPoint,
33
+ entityExtensionPoint,
34
+ type EntityHandle,
34
35
  } from "@checkstack/automation-backend";
36
+ import {
37
+ HEALTH_ENTITY_KIND,
38
+ HealthEntityStateSchema,
39
+ createHealthEntityRead,
40
+ deriveHealthTriggerEvents,
41
+ healthChangeToPayload,
42
+ type HealthEntityState,
43
+ } from "./health-entity";
35
44
  import { entityKindExtensionPoint } from "@checkstack/gitops-backend";
45
+ import { secretResolverRef } from "@checkstack/secrets-backend";
36
46
  import { createHealthCheckRouter } from "./router";
37
47
  import { HealthCheckService } from "./service";
38
48
  import {
@@ -41,11 +51,8 @@ import {
41
51
  healthCheckTriggers,
42
52
  } from "./automations";
43
53
  import { registerHealthcheckGitOpsKinds, registerHealthcheckGitOpsDocumentation } from "./healthcheck-gitops-kinds";
44
- import { catalogHooks } from "@checkstack/catalog-backend";
54
+ import { CATALOG_SYSTEM_ENTITY_KIND } from "@checkstack/catalog-backend";
45
55
  import { satelliteHooks } from "@checkstack/satellite-backend";
46
- import { incidentHooks } from "@checkstack/incident-backend";
47
- import { eq, and, isNull } from "drizzle-orm";
48
- import { healthCheckAutoIncidents } from "./schema";
49
56
  import { CatalogApi } from "@checkstack/catalog-common";
50
57
  import { MaintenanceApi } from "@checkstack/maintenance-common";
51
58
  import { IncidentApi } from "@checkstack/incident-common";
@@ -57,6 +64,21 @@ import { createHealthCheckCache } from "./cache";
57
64
  // Store emitHook reference for use during Phase 2 init
58
65
  let storedEmitHook: EmitHookFn | undefined;
59
66
 
67
+ // The reactive `health` entity handle (§10.3). Defined in register() via
68
+ // the entity extension point (buffered until automation-backend registers
69
+ // the impl); mutations only fire from init() onward once the read accessor
70
+ // has its db + service.
71
+ let healthEntity: EntityHandle<HealthEntityState> | undefined;
72
+
73
+ // PLUGIN-BACKED + COMPUTED kind: the `health` aggregate has no domain table and
74
+ // no framework `entity_state` row — its current state is COMPUTED on read from
75
+ // the durable `health_check_runs` (via `getSystemHealthStatus`). The db +
76
+ // service are only available in init(), but the entity `read` accessor must be
77
+ // supplied at `defineEntity` time in register(). These holders bridge the two;
78
+ // init() sets them before any mutation runs (the queue worker — the only
79
+ // mutation site — is set up in init() after these are bound).
80
+ let healthEntityService: HealthCheckService | undefined;
81
+
60
82
  export default createBackendPlugin({
61
83
  metadata: pluginMetadata,
62
84
  register(env) {
@@ -80,6 +102,47 @@ export default createBackendPlugin({
80
102
  .getExtensionPoint(automationArtifactTypeExtensionPoint)
81
103
  .registerArtifactType(assignmentArtifactType, pluginMetadata);
82
104
 
105
+ // ─── Reactive `health` entity (§10.3) ──────────────────────────────
106
+ // PLUGIN-BACKED + COMPUTED kind (Model B): the per-system aggregate has no
107
+ // domain table and NO framework `entity_state` row. `read` COMPUTES each
108
+ // system's `{ status, healthyChecks, totalChecks }` on demand from the same
109
+ // durable `health_check_runs` the rest of the plugin reads (via
110
+ // `getSystemHealthStatus`), gated on the system having at least one ENABLED
111
+ // check association — see `createHealthEntityRead`. A system with an enabled
112
+ // check but no runs yet resolves to the default-`healthy` baseline so a
113
+ // first-ever unhealthy run is a real `healthy → degraded` diff. The service
114
+ // is resolved in init() and bridged via the holder. The change →
115
+ // trigger-event deriver keeps the
116
+ // existing `healthcheck.system.degraded` / `.healthy` / `.health_changed`
117
+ // automations firing off the computed state.
118
+ const entityPoint = env.getExtensionPoint(entityExtensionPoint);
119
+ healthEntity = entityPoint.defineEntity<HealthEntityState>({
120
+ kind: HEALTH_ENTITY_KIND,
121
+ state: HealthEntityStateSchema,
122
+ read: (ids) => {
123
+ const service = healthEntityService;
124
+ if (!service) {
125
+ throw new Error(
126
+ "health entity read before init: service not yet resolved",
127
+ );
128
+ }
129
+ return createHealthEntityRead({ service })(ids);
130
+ },
131
+ });
132
+ entityPoint.registerChangeDeriver({
133
+ kind: HEALTH_ENTITY_KIND,
134
+ derive: deriveHealthTriggerEvents,
135
+ toPayload: healthChangeToPayload,
136
+ });
137
+ // Raw per-check samples + cursors are intentionally NON-reactive (§5):
138
+ // a firehose of individual runs would melt the wake-index; the
139
+ // aggregate is the entity.
140
+ entityPoint.declareNonReactiveState({
141
+ table: "health_check_runs",
142
+ reason: "raw-sample",
143
+ note: "High-frequency individual check executions. The per-system aggregate is the `health` entity; raw runs stay a numeric_state wake source only.",
144
+ });
145
+
83
146
  // ─── GitOps Entity Kind Registration ───────────────────────────────
84
147
  // Mutable refs — populated during init(), consumed by reconcile closures.
85
148
  let gitopsDb: SafeDatabase<typeof schema> | undefined;
@@ -134,6 +197,8 @@ export default createBackendPlugin({
134
197
  signalService: coreServices.signalService,
135
198
  cacheManager: coreServices.cacheManager,
136
199
  config: coreServices.config,
200
+ secretResolver: secretResolverRef,
201
+ advisoryLock: coreServices.advisoryLock,
137
202
  },
138
203
  // Phase 2: Register router and setup worker
139
204
  init: async ({
@@ -147,6 +212,8 @@ export default createBackendPlugin({
147
212
  signalService,
148
213
  cacheManager,
149
214
  config,
215
+ secretResolver,
216
+ advisoryLock,
150
217
  }) => {
151
218
  logger.debug("🏥 Initializing Health Check Backend...");
152
219
 
@@ -156,6 +223,17 @@ export default createBackendPlugin({
156
223
  gitopsCollectorRegistry = collectorRegistry;
157
224
  gitopsQueueManager = queueManager;
158
225
 
226
+ // Bind the COMPUTE-ON-READ accessor's service (built over the db) for the `health`
227
+ // entity (defined in register()). From here onward the entity `read`
228
+ // computes each system's aggregate from durable `health_check_runs`,
229
+ // and the queue worker (set up just below — the only mutation site)
230
+ // drives writes through `handle.mutate`.
231
+ healthEntityService = new HealthCheckService(
232
+ database,
233
+ healthCheckRegistry,
234
+ collectorRegistry,
235
+ );
236
+
159
237
  // Create catalog client for notification delegation
160
238
  const catalogClient = rpcClient.forPlugin(CatalogApi);
161
239
 
@@ -182,6 +260,7 @@ export default createBackendPlugin({
182
260
  await setupHealthCheckWorker({
183
261
  notificationClient,
184
262
  db: database,
263
+ advisoryLock,
185
264
  registry: healthCheckRegistry,
186
265
  collectorRegistry,
187
266
  logger,
@@ -191,7 +270,9 @@ export default createBackendPlugin({
191
270
  maintenanceClient,
192
271
  incidentClient,
193
272
  getEmitHook: () => storedEmitHook,
273
+ getHealthEntity: () => healthEntity,
194
274
  cache,
275
+ secretResolver,
195
276
  });
196
277
 
197
278
  // Setup retention job for tiered storage (daily aggregation)
@@ -201,15 +282,12 @@ export default createBackendPlugin({
201
282
  queueManager,
202
283
  });
203
284
 
204
- // Setup auto-incident close worker (ticks every 60s, closes
205
- // auto-opened incidents whose systems have been steady-healthy
206
- // for the cooldown).
207
- await setupAutoIncidentCloseJob({
208
- db: database,
209
- logger,
210
- queueManager,
211
- incidentClient,
212
- });
285
+ // The hardcoded auto-incident open/close path was removed; auto-
286
+ // incident behaviour is now built entirely by user automations
287
+ // (e.g. `healthcheck.system_degraded` + `for:` `incident.create`).
288
+ // Flapping is detected by the automation engine's windowed-count gate
289
+ // on the `system_health_changed` trigger — healthcheck emits only the
290
+ // raw aggregated-health change (via the reactive `health` entity).
213
291
 
214
292
  const healthCheckRouter = createHealthCheckRouter({
215
293
  database: database as SafeDatabase<typeof schema>,
@@ -220,6 +298,8 @@ export default createBackendPlugin({
220
298
  cache,
221
299
  configService: config,
222
300
  catalogClient,
301
+ maintenanceClient,
302
+ logger,
223
303
  });
224
304
  rpc.registerRouter(healthCheckRouter, healthCheckContract);
225
305
 
@@ -310,17 +390,23 @@ export default createBackendPlugin({
310
390
  automationActions.registerAction(action, pluginMetadata);
311
391
  }
312
392
 
313
- onHook(
314
- catalogHooks.systemDeleted,
315
- async (payload) => {
393
+ // React to catalog system deletion (tombstone) via the reactive
394
+ // `catalog-system` entity instead of the (removed) `system.deleted`
395
+ // hook (§10.4). `work-queue` delivery preserved: association cleanup
396
+ // must run once per cluster, not per-instance.
397
+ entityPoint.onEntityChanged({
398
+ kind: CATALOG_SYSTEM_ENTITY_KIND,
399
+ handler: async (change) => {
400
+ if (change.next !== null) return; // tombstone only
401
+ const systemId = change.id;
316
402
  logger.debug(
317
- `Cleaning up health check associations for deleted system: ${payload.systemId}`,
403
+ `Cleaning up health check associations for deleted system: ${systemId}`,
318
404
  );
319
- await service.removeAllSystemAssociations(payload.systemId);
320
- await healthCheckCache?.invalidateSystem(payload.systemId);
405
+ await service.removeAllSystemAssociations(systemId);
406
+ await healthCheckCache?.invalidateSystem(systemId);
321
407
  },
322
- { mode: "work-queue", workerGroup: "system-cleanup" },
323
- );
408
+ delivery: { mode: "work-queue", workerGroup: "system-cleanup" },
409
+ });
324
410
 
325
411
  // Subscribe to satellite deletion to scrub satellite IDs from associations
326
412
  onHook(
@@ -337,32 +423,6 @@ export default createBackendPlugin({
337
423
  { mode: "work-queue", workerGroup: "satellite-cleanup" },
338
424
  );
339
425
 
340
- // Sync our auto-incident mapping when an incident is resolved.
341
- // Without this, a manually-closed incident would still appear
342
- // "active" in our mapping, blocking the require-recovery rule
343
- // from re-evaluating fresh transitions.
344
- onHook(
345
- incidentHooks.incidentResolved,
346
- async ({ incidentId }) => {
347
- const updated = await database
348
- .update(healthCheckAutoIncidents)
349
- .set({ closedAt: new Date() })
350
- .where(
351
- and(
352
- eq(healthCheckAutoIncidents.incidentId, incidentId),
353
- isNull(healthCheckAutoIncidents.closedAt),
354
- ),
355
- )
356
- .returning({ id: healthCheckAutoIncidents.id });
357
- if (updated.length > 0) {
358
- logger.debug(
359
- `Marked auto-incident mapping closed for resolved incident ${incidentId}`,
360
- );
361
- }
362
- },
363
- { mode: "work-queue", workerGroup: "auto-incident-sync" },
364
- );
365
-
366
426
  logger.debug("✅ Health Check Backend afterPluginsReady complete.");
367
427
  },
368
428
  });
@@ -371,3 +431,13 @@ export default createBackendPlugin({
371
431
 
372
432
  // Re-export hooks for other plugins to use
373
433
  export { healthCheckHooks } from "./hooks";
434
+
435
+ // Re-export the reactive `health` entity surface so cross-plugin consumers
436
+ // (slo, dependency) can subscribe via onEntityChanged + classify changes
437
+ // without duplicating the kind id / transition predicate (§10.3).
438
+ export {
439
+ HEALTH_ENTITY_KIND,
440
+ classifyHealthChange,
441
+ type HealthChangeClassification,
442
+ type HealthEntityState,
443
+ } from "./health-entity";
@@ -13,6 +13,16 @@ const passthroughCache: HealthCheckCache = {
13
13
  invalidateAllSystems: async () => 0,
14
14
  scope: {} as HealthCheckCache["scope"],
15
15
  };
16
+
17
+ // Pass-through advisory lock: these tests don't exercise cross-pod
18
+ // serialization, so run the critical section directly.
19
+ const mockAdvisoryLock: Parameters<
20
+ typeof setupHealthCheckWorker
21
+ >[0]["advisoryLock"] = {
22
+ tryAcquire: async () => ({ release: async () => {} }),
23
+ withXactLock: <T>({ fn }: { key: string; fn: () => Promise<T> }): Promise<T> =>
24
+ fn(),
25
+ };
16
26
  import {
17
27
  createMockLogger,
18
28
  createMockQueueManager,
@@ -179,6 +189,7 @@ describe("Queue-Based Health Check Executor", () => {
179
189
  db: mockDb as unknown as Parameters<
180
190
  typeof setupHealthCheckWorker
181
191
  >[0]["db"],
192
+ advisoryLock: mockAdvisoryLock,
182
193
  registry: mockRegistry,
183
194
  collectorRegistry:
184
195
  createMockCollectorRegistry() as unknown as Parameters<
@@ -376,6 +387,7 @@ describe("Queue-Based Health Check Executor", () => {
376
387
  db: mockDb as unknown as Parameters<
377
388
  typeof setupHealthCheckWorker
378
389
  >[0]["db"],
390
+ advisoryLock: mockAdvisoryLock,
379
391
  registry: mockRegistry,
380
392
  collectorRegistry:
381
393
  createMockCollectorRegistry() as unknown as Parameters<
@@ -510,6 +522,7 @@ describe("Queue-Based Health Check Executor", () => {
510
522
  db: mockDb as unknown as Parameters<
511
523
  typeof setupHealthCheckWorker
512
524
  >[0]["db"],
525
+ advisoryLock: mockAdvisoryLock,
513
526
  registry: mockRegistry,
514
527
  collectorRegistry: mockCollectorRegistry as unknown as Parameters<
515
528
  typeof setupHealthCheckWorker