@checkstack/healthcheck-backend 1.3.0 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/index.ts CHANGED
@@ -3,7 +3,6 @@ import {
3
3
  bootstrapHealthChecks,
4
4
  } from "./queue-executor";
5
5
  import { setupRetentionJob } from "./retention-job";
6
- import { setupAutoIncidentCloseJob } from "./auto-incident-close-job";
7
6
  import * as schema from "./schema";
8
7
  import {
9
8
  healthCheckAccessRules,
@@ -31,8 +30,19 @@ import {
31
30
  automationActionExtensionPoint,
32
31
  automationArtifactTypeExtensionPoint,
33
32
  automationTriggerExtensionPoint,
33
+ entityExtensionPoint,
34
+ type EntityHandle,
34
35
  } from "@checkstack/automation-backend";
36
+ import {
37
+ HEALTH_ENTITY_KIND,
38
+ HealthEntityStateSchema,
39
+ createHealthEntityRead,
40
+ deriveHealthTriggerEvents,
41
+ healthChangeToPayload,
42
+ type HealthEntityState,
43
+ } from "./health-entity";
35
44
  import { entityKindExtensionPoint } from "@checkstack/gitops-backend";
45
+ import { secretResolverRef } from "@checkstack/secrets-backend";
36
46
  import { createHealthCheckRouter } from "./router";
37
47
  import { HealthCheckService } from "./service";
38
48
  import {
@@ -41,11 +51,8 @@ import {
41
51
  healthCheckTriggers,
42
52
  } from "./automations";
43
53
  import { registerHealthcheckGitOpsKinds, registerHealthcheckGitOpsDocumentation } from "./healthcheck-gitops-kinds";
44
- import { catalogHooks } from "@checkstack/catalog-backend";
54
+ import { CATALOG_SYSTEM_ENTITY_KIND } from "@checkstack/catalog-backend";
45
55
  import { satelliteHooks } from "@checkstack/satellite-backend";
46
- import { incidentHooks } from "@checkstack/incident-backend";
47
- import { eq, and, isNull } from "drizzle-orm";
48
- import { healthCheckAutoIncidents } from "./schema";
49
56
  import { CatalogApi } from "@checkstack/catalog-common";
50
57
  import { MaintenanceApi } from "@checkstack/maintenance-common";
51
58
  import { IncidentApi } from "@checkstack/incident-common";
@@ -57,6 +64,21 @@ import { createHealthCheckCache } from "./cache";
57
64
  // Store emitHook reference for use during Phase 2 init
58
65
  let storedEmitHook: EmitHookFn | undefined;
59
66
 
67
+ // The reactive `health` entity handle (§10.3). Defined in register() via
68
+ // the entity extension point (buffered until automation-backend registers
69
+ // the impl); mutations only fire from init() onward once the read accessor
70
+ // has its service.
71
+ let healthEntity: EntityHandle<HealthEntityState> | undefined;
72
+
73
+ // PLUGIN-BACKED + COMPUTED kind: the `health` aggregate has no domain table and
74
+ // no framework `entity_state` row — its current state is COMPUTED on read from
75
+ // the durable `health_check_runs` (via `getSystemHealthStatus`). The db +
76
+ // service are only available in init(), but the entity `read` accessor must be
77
+ // supplied at `defineEntity` time in register(). This holder bridges the two;
78
+ // init() sets it before any mutation runs (the queue worker — the only
79
+ // mutation site — is set up in init() after these are bound).
80
+ let healthEntityService: HealthCheckService | undefined;
81
+
60
82
  export default createBackendPlugin({
61
83
  metadata: pluginMetadata,
62
84
  register(env) {
@@ -80,6 +102,47 @@ export default createBackendPlugin({
80
102
  .getExtensionPoint(automationArtifactTypeExtensionPoint)
81
103
  .registerArtifactType(assignmentArtifactType, pluginMetadata);
82
104
 
105
+ // ─── Reactive `health` entity (§10.3) ──────────────────────────────
106
+ // PLUGIN-BACKED + COMPUTED kind (Model B): the per-system aggregate has no
107
+ // domain table and NO framework `entity_state` row. `read` COMPUTES each
108
+ // system's `{ status, healthyChecks, totalChecks }` on demand from the same
109
+ // durable `health_check_runs` the rest of the plugin reads (via
110
+ // `getSystemHealthStatus`), gated on the system having at least one ENABLED
111
+ // check association — see `createHealthEntityRead`. A system with an enabled
112
+ // check but no runs yet resolves to the default-`healthy` baseline so a
113
+ // first-ever unhealthy run is a real `healthy → degraded` diff. The service
114
+ // is resolved in init() and bridged via the holder. The change →
115
+ // trigger-event deriver keeps the
116
+ // existing `healthcheck.system.degraded` / `.healthy` / `.health_changed`
117
+ // automations firing off the computed state.
118
+ const entityPoint = env.getExtensionPoint(entityExtensionPoint);
119
+ healthEntity = entityPoint.defineEntity<HealthEntityState>({
120
+ kind: HEALTH_ENTITY_KIND,
121
+ state: HealthEntityStateSchema,
122
+ read: (ids) => {
123
+ const service = healthEntityService;
124
+ if (!service) {
125
+ throw new Error(
126
+ "health entity read before init: service not yet resolved",
127
+ );
128
+ }
129
+ return createHealthEntityRead({ service })(ids);
130
+ },
131
+ });
132
+ entityPoint.registerChangeDeriver({
133
+ kind: HEALTH_ENTITY_KIND,
134
+ derive: deriveHealthTriggerEvents,
135
+ toPayload: healthChangeToPayload,
136
+ });
137
+ // Raw per-check samples + cursors are intentionally NON-reactive (§5):
138
+ // a firehose of individual runs would melt the wake-index; the
139
+ // aggregate is the entity.
140
+ entityPoint.declareNonReactiveState({
141
+ table: "health_check_runs",
142
+ reason: "raw-sample",
143
+ note: "High-frequency individual check executions. The per-system aggregate is the `health` entity; raw runs stay a numeric_state wake source only.",
144
+ });
145
+
83
146
  // ─── GitOps Entity Kind Registration ───────────────────────────────
84
147
  // Mutable refs — populated during init(), consumed by reconcile closures.
85
148
  let gitopsDb: SafeDatabase<typeof schema> | undefined;
@@ -134,6 +197,7 @@ export default createBackendPlugin({
134
197
  signalService: coreServices.signalService,
135
198
  cacheManager: coreServices.cacheManager,
136
199
  config: coreServices.config,
200
+ secretResolver: secretResolverRef,
137
201
  },
138
202
  // Phase 2: Register router and setup worker
139
203
  init: async ({
@@ -147,6 +211,7 @@ export default createBackendPlugin({
147
211
  signalService,
148
212
  cacheManager,
149
213
  config,
214
+ secretResolver,
150
215
  }) => {
151
216
  logger.debug("🏥 Initializing Health Check Backend...");
152
217
 
@@ -156,6 +221,17 @@ export default createBackendPlugin({
156
221
  gitopsCollectorRegistry = collectorRegistry;
157
222
  gitopsQueueManager = queueManager;
158
223
 
224
+ // Bind the COMPUTE-ON-READ accessor's service (built over the db) for the `health`
225
+ // entity (defined in register()). From here onward the entity `read`
226
+ // computes each system's aggregate from durable `health_check_runs`,
227
+ // and the queue worker (set up just below — the only mutation site)
228
+ // drives writes through `handle.mutate`.
229
+ healthEntityService = new HealthCheckService(
230
+ database,
231
+ healthCheckRegistry,
232
+ collectorRegistry,
233
+ );
234
+
159
235
  // Create catalog client for notification delegation
160
236
  const catalogClient = rpcClient.forPlugin(CatalogApi);
161
237
 
@@ -191,7 +267,9 @@ export default createBackendPlugin({
191
267
  maintenanceClient,
192
268
  incidentClient,
193
269
  getEmitHook: () => storedEmitHook,
270
+ getHealthEntity: () => healthEntity,
194
271
  cache,
272
+ secretResolver,
195
273
  });
196
274
 
197
275
  // Setup retention job for tiered storage (daily aggregation)
@@ -201,15 +279,12 @@ export default createBackendPlugin({
201
279
  queueManager,
202
280
  });
203
281
 
204
- // Setup auto-incident close worker (ticks every 60s, closes
205
- // auto-opened incidents whose systems have been steady-healthy
206
- // for the cooldown).
207
- await setupAutoIncidentCloseJob({
208
- db: database,
209
- logger,
210
- queueManager,
211
- incidentClient,
212
- });
282
+ // The hardcoded auto-incident open/close path was removed; auto-
283
+ // incident behaviour is now built entirely by user automations
284
+ // (e.g. `healthcheck.system_degraded` + `for:` `incident.create`).
285
+ // Flapping is detected by the automation engine's windowed-count gate
286
+ // on the `system_health_changed` trigger — healthcheck emits only the
287
+ // raw aggregated-health change (via the reactive `health` entity).
213
288
 
214
289
  const healthCheckRouter = createHealthCheckRouter({
215
290
  database: database as SafeDatabase<typeof schema>,
@@ -220,6 +295,8 @@ export default createBackendPlugin({
220
295
  cache,
221
296
  configService: config,
222
297
  catalogClient,
298
+ maintenanceClient,
299
+ logger,
223
300
  });
224
301
  rpc.registerRouter(healthCheckRouter, healthCheckContract);
225
302
 
@@ -310,17 +387,23 @@ export default createBackendPlugin({
310
387
  automationActions.registerAction(action, pluginMetadata);
311
388
  }
312
389
 
313
- onHook(
314
- catalogHooks.systemDeleted,
315
- async (payload) => {
390
+ // React to catalog system deletion (tombstone) via the reactive
391
+ // `catalog-system` entity instead of the (removed) `system.deleted`
392
+ // hook (§10.4). `work-queue` delivery preserved: association cleanup
393
+ // must run once per cluster, not per-instance.
394
+ entityPoint.onEntityChanged({
395
+ kind: CATALOG_SYSTEM_ENTITY_KIND,
396
+ handler: async (change) => {
397
+ if (change.next !== null) return; // tombstone only
398
+ const systemId = change.id;
316
399
  logger.debug(
317
- `Cleaning up health check associations for deleted system: ${payload.systemId}`,
400
+ `Cleaning up health check associations for deleted system: ${systemId}`,
318
401
  );
319
- await service.removeAllSystemAssociations(payload.systemId);
320
- await healthCheckCache?.invalidateSystem(payload.systemId);
402
+ await service.removeAllSystemAssociations(systemId);
403
+ await healthCheckCache?.invalidateSystem(systemId);
321
404
  },
322
- { mode: "work-queue", workerGroup: "system-cleanup" },
323
- );
405
+ delivery: { mode: "work-queue", workerGroup: "system-cleanup" },
406
+ });
324
407
 
325
408
  // Subscribe to satellite deletion to scrub satellite IDs from associations
326
409
  onHook(
@@ -337,32 +420,6 @@ export default createBackendPlugin({
337
420
  { mode: "work-queue", workerGroup: "satellite-cleanup" },
338
421
  );
339
422
 
340
- // Sync our auto-incident mapping when an incident is resolved.
341
- // Without this, a manually-closed incident would still appear
342
- // "active" in our mapping, blocking the require-recovery rule
343
- // from re-evaluating fresh transitions.
344
- onHook(
345
- incidentHooks.incidentResolved,
346
- async ({ incidentId }) => {
347
- const updated = await database
348
- .update(healthCheckAutoIncidents)
349
- .set({ closedAt: new Date() })
350
- .where(
351
- and(
352
- eq(healthCheckAutoIncidents.incidentId, incidentId),
353
- isNull(healthCheckAutoIncidents.closedAt),
354
- ),
355
- )
356
- .returning({ id: healthCheckAutoIncidents.id });
357
- if (updated.length > 0) {
358
- logger.debug(
359
- `Marked auto-incident mapping closed for resolved incident ${incidentId}`,
360
- );
361
- }
362
- },
363
- { mode: "work-queue", workerGroup: "auto-incident-sync" },
364
- );
365
-
366
423
  logger.debug("✅ Health Check Backend afterPluginsReady complete.");
367
424
  },
368
425
  });
@@ -371,3 +428,13 @@ export default createBackendPlugin({
371
428
 
372
429
  // Re-export hooks for other plugins to use
373
430
  export { healthCheckHooks } from "./hooks";
431
+
432
+ // Re-export the reactive `health` entity surface so cross-plugin consumers
433
+ // (slo, dependency) can subscribe via onEntityChanged + classify changes
434
+ // without duplicating the kind id / transition predicate (§10.3).
435
+ export {
436
+ HEALTH_ENTITY_KIND,
437
+ classifyHealthChange,
438
+ type HealthChangeClassification,
439
+ type HealthEntityState,
440
+ } from "./health-entity";