@checkstack/healthcheck-backend 1.2.0 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. package/CHANGELOG.md +541 -0
  2. package/drizzle/0015_quiet_meggan.sql +12 -0
  3. package/drizzle/0016_complex_maginty.sql +1 -0
  4. package/drizzle/0017_pretty_caretaker.sql +1 -0
  5. package/drizzle/meta/0015_snapshot.json +764 -0
  6. package/drizzle/meta/0016_snapshot.json +644 -0
  7. package/drizzle/meta/0017_snapshot.json +563 -0
  8. package/drizzle/meta/_journal.json +21 -0
  9. package/package.json +24 -21
  10. package/src/automations.test.ts +234 -0
  11. package/src/automations.ts +342 -0
  12. package/src/collector-script-test.test.ts +236 -0
  13. package/src/collector-script-test.ts +221 -0
  14. package/src/health-entity.test.ts +698 -0
  15. package/src/health-entity.ts +369 -0
  16. package/src/health-state.test.ts +115 -0
  17. package/src/health-state.ts +333 -0
  18. package/src/healthcheck-gitops-kinds.test.ts +6 -32
  19. package/src/healthcheck-gitops-kinds.ts +4 -19
  20. package/src/hooks.test.ts +19 -6
  21. package/src/hooks.ts +38 -28
  22. package/src/index.ts +150 -98
  23. package/src/queue-executor.test.ts +137 -0
  24. package/src/queue-executor.ts +282 -380
  25. package/src/retention-job.ts +65 -1
  26. package/src/retention-state-transitions.test.ts +49 -0
  27. package/src/router.test.ts +18 -0
  28. package/src/router.ts +56 -1
  29. package/src/schema.ts +34 -54
  30. package/src/service-assignments.test.ts +184 -0
  31. package/src/service-notification-policy.test.ts +28 -71
  32. package/src/service.ts +154 -0
  33. package/src/state-transitions.test.ts +126 -0
  34. package/src/state-transitions.ts +112 -0
  35. package/tsconfig.json +12 -3
  36. package/src/auto-incident-close-job.ts +0 -164
  37. package/src/auto-incident.test.ts +0 -196
  38. package/src/auto-incident.ts +0 -332
package/src/index.ts CHANGED
@@ -3,7 +3,6 @@ import {
3
3
  bootstrapHealthChecks,
4
4
  } from "./queue-executor";
5
5
  import { setupRetentionJob } from "./retention-job";
6
- import { setupAutoIncidentCloseJob } from "./auto-incident-close-job";
7
6
  import * as schema from "./schema";
8
7
  import {
9
8
  healthCheckAccessRules,
@@ -27,52 +26,59 @@ import {
27
26
  type CollectorRegistry,
28
27
  } from "@checkstack/backend-api";
29
28
  import type { QueueManager } from "@checkstack/queue-api";
30
- import { integrationEventExtensionPoint } from "@checkstack/integration-backend";
29
+ import {
30
+ automationActionExtensionPoint,
31
+ automationArtifactTypeExtensionPoint,
32
+ automationTriggerExtensionPoint,
33
+ entityExtensionPoint,
34
+ type EntityHandle,
35
+ } from "@checkstack/automation-backend";
36
+ import {
37
+ HEALTH_ENTITY_KIND,
38
+ HealthEntityStateSchema,
39
+ createHealthEntityRead,
40
+ deriveHealthTriggerEvents,
41
+ healthChangeToPayload,
42
+ type HealthEntityState,
43
+ } from "./health-entity";
31
44
  import { entityKindExtensionPoint } from "@checkstack/gitops-backend";
32
- import { z } from "zod";
45
+ import { secretResolverRef } from "@checkstack/secrets-backend";
33
46
  import { createHealthCheckRouter } from "./router";
34
47
  import { HealthCheckService } from "./service";
48
+ import {
49
+ assignmentArtifactType,
50
+ createHealthCheckActions,
51
+ healthCheckTriggers,
52
+ } from "./automations";
35
53
  import { registerHealthcheckGitOpsKinds, registerHealthcheckGitOpsDocumentation } from "./healthcheck-gitops-kinds";
36
- import { catalogHooks } from "@checkstack/catalog-backend";
54
+ import { CATALOG_SYSTEM_ENTITY_KIND } from "@checkstack/catalog-backend";
37
55
  import { satelliteHooks } from "@checkstack/satellite-backend";
38
- import { incidentHooks } from "@checkstack/incident-backend";
39
- import { eq, and, isNull } from "drizzle-orm";
40
- import { healthCheckAutoIncidents } from "./schema";
41
56
  import { CatalogApi } from "@checkstack/catalog-common";
42
57
  import { MaintenanceApi } from "@checkstack/maintenance-common";
43
58
  import { IncidentApi } from "@checkstack/incident-common";
44
59
  import { GitOpsApi } from "@checkstack/gitops-common";
45
- import { healthCheckHooks } from "./hooks";
46
60
  import { registerSearchProvider } from "@checkstack/command-backend";
47
61
  import { resolveRoute } from "@checkstack/common";
48
62
  import { createHealthCheckCache } from "./cache";
49
63
 
50
- // =============================================================================
51
- // Integration Event Payload Schemas
52
- // =============================================================================
53
-
54
- const systemDegradedPayloadSchema = z.object({
55
- systemId: z.string(),
56
- systemName: z.string().optional(),
57
- previousStatus: z.string(),
58
- newStatus: z.string(),
59
- healthyChecks: z.number(),
60
- totalChecks: z.number(),
61
- timestamp: z.string(),
62
- });
63
-
64
- const systemHealthyPayloadSchema = z.object({
65
- systemId: z.string(),
66
- systemName: z.string().optional(),
67
- previousStatus: z.string(),
68
- healthyChecks: z.number(),
69
- totalChecks: z.number(),
70
- timestamp: z.string(),
71
- });
72
-
73
64
  // Store emitHook reference for use during Phase 2 init
74
65
  let storedEmitHook: EmitHookFn | undefined;
75
66
 
67
+ // The reactive `health` entity handle (§10.3). Defined in register() via
68
+ // the entity extension point (buffered until automation-backend registers
69
+ // the impl); mutations only fire from init() onward once the read accessor
70
+ // has its db + service.
71
+ let healthEntity: EntityHandle<HealthEntityState> | undefined;
72
+
73
+ // PLUGIN-BACKED + COMPUTED kind: the `health` aggregate has no domain table and
74
+ // no framework `entity_state` row — its current state is COMPUTED on read from
75
+ // the durable `health_check_runs` (via `getSystemHealthStatus`). The db +
76
+ // service are only available in init(), but the entity `read` accessor must be
77
+ // supplied at `defineEntity` time in register(). This holder bridges the two;
78
+ // init() sets it before any mutation runs (the queue worker — the only
79
+ // mutation site — is set up in init() after it is bound).
80
+ let healthEntityService: HealthCheckService | undefined;
81
+
76
82
  export default createBackendPlugin({
77
83
  metadata: pluginMetadata,
78
84
  register(env) {
@@ -82,33 +88,60 @@ export default createBackendPlugin({
82
88
  healthcheckGroupSubscription,
83
89
  ]);
84
90
 
85
- // Register hooks as integration events
86
- const integrationEvents = env.getExtensionPoint(
87
- integrationEventExtensionPoint,
91
+ // ─── Automation Platform: triggers + artifact type ─────────────────
92
+ // Buffered behind the extension point until automation-backend's
93
+ // register() runs. Actions are wired in afterPluginsReady where
94
+ // `emitHook` becomes available.
95
+ const automationTriggers = env.getExtensionPoint(
96
+ automationTriggerExtensionPoint,
88
97
  );
98
+ for (const trigger of healthCheckTriggers) {
99
+ automationTriggers.registerTrigger(trigger, pluginMetadata);
100
+ }
101
+ env
102
+ .getExtensionPoint(automationArtifactTypeExtensionPoint)
103
+ .registerArtifactType(assignmentArtifactType, pluginMetadata);
89
104
 
90
- integrationEvents.registerEvent(
91
- {
92
- hook: healthCheckHooks.systemDegraded,
93
- displayName: "System Health Degraded",
94
- description:
95
- "Fired when a system's health status transitions from healthy to degraded/unhealthy",
96
- category: "Health",
97
- payloadSchema: systemDegradedPayloadSchema,
105
+ // ─── Reactive `health` entity (§10.3) ──────────────────────────────
106
+ // PLUGIN-BACKED + COMPUTED kind (Model B): the per-system aggregate has no
107
+ // domain table and NO framework `entity_state` row. `read` COMPUTES each
108
+ // system's `{ status, healthyChecks, totalChecks }` on demand from the same
109
+ // durable `health_check_runs` the rest of the plugin reads (via
110
+ // `getSystemHealthStatus`), gated on the system having at least one ENABLED
111
+ // check association — see `createHealthEntityRead`. A system with an enabled
112
+ // check but no runs yet resolves to the default-`healthy` baseline so a
113
+ // first-ever unhealthy run is a real `healthy → degraded` diff. The service
114
+ // is resolved in init() and bridged via the holder. The change →
115
+ // trigger-event deriver keeps the
116
+ // existing `healthcheck.system.degraded` / `.healthy` / `.health_changed`
117
+ // automations firing off the computed state.
118
+ const entityPoint = env.getExtensionPoint(entityExtensionPoint);
119
+ healthEntity = entityPoint.defineEntity<HealthEntityState>({
120
+ kind: HEALTH_ENTITY_KIND,
121
+ state: HealthEntityStateSchema,
122
+ read: (ids) => {
123
+ const service = healthEntityService;
124
+ if (!service) {
125
+ throw new Error(
126
+ "health entity read before init: service not yet resolved",
127
+ );
128
+ }
129
+ return createHealthEntityRead({ service })(ids);
98
130
  },
99
- pluginMetadata,
100
- );
101
-
102
- integrationEvents.registerEvent(
103
- {
104
- hook: healthCheckHooks.systemHealthy,
105
- displayName: "System Health Restored",
106
- description: "Fired when a system's health status recovers to healthy",
107
- category: "Health",
108
- payloadSchema: systemHealthyPayloadSchema,
109
- },
110
- pluginMetadata,
111
- );
131
+ });
132
+ entityPoint.registerChangeDeriver({
133
+ kind: HEALTH_ENTITY_KIND,
134
+ derive: deriveHealthTriggerEvents,
135
+ toPayload: healthChangeToPayload,
136
+ });
137
+ // Raw per-check samples + cursors are intentionally NON-reactive (§5):
138
+ // a firehose of individual runs would melt the wake-index; the
139
+ // aggregate is the entity.
140
+ entityPoint.declareNonReactiveState({
141
+ table: "health_check_runs",
142
+ reason: "raw-sample",
143
+ note: "High-frequency individual check executions. The per-system aggregate is the `health` entity; raw runs stay a numeric_state wake source only.",
144
+ });
112
145
 
113
146
  // ─── GitOps Entity Kind Registration ───────────────────────────────
114
147
  // Mutable refs — populated during init(), consumed by reconcile closures.
@@ -164,6 +197,7 @@ export default createBackendPlugin({
164
197
  signalService: coreServices.signalService,
165
198
  cacheManager: coreServices.cacheManager,
166
199
  config: coreServices.config,
200
+ secretResolver: secretResolverRef,
167
201
  },
168
202
  // Phase 2: Register router and setup worker
169
203
  init: async ({
@@ -177,6 +211,7 @@ export default createBackendPlugin({
177
211
  signalService,
178
212
  cacheManager,
179
213
  config,
214
+ secretResolver,
180
215
  }) => {
181
216
  logger.debug("🏥 Initializing Health Check Backend...");
182
217
 
@@ -186,6 +221,17 @@ export default createBackendPlugin({
186
221
  gitopsCollectorRegistry = collectorRegistry;
187
222
  gitopsQueueManager = queueManager;
188
223
 
224
+ // Bind the COMPUTE-ON-READ accessor's db + service for the `health`
225
+ // entity (defined in register()). From here onward the entity `read`
226
+ // computes each system's aggregate from durable `health_check_runs`,
227
+ // and the queue worker (set up just below — the only mutation site)
228
+ // drives writes through `handle.mutate`.
229
+ healthEntityService = new HealthCheckService(
230
+ database,
231
+ healthCheckRegistry,
232
+ collectorRegistry,
233
+ );
234
+
189
235
  // Create catalog client for notification delegation
190
236
  const catalogClient = rpcClient.forPlugin(CatalogApi);
191
237
 
@@ -221,7 +267,9 @@ export default createBackendPlugin({
221
267
  maintenanceClient,
222
268
  incidentClient,
223
269
  getEmitHook: () => storedEmitHook,
270
+ getHealthEntity: () => healthEntity,
224
271
  cache,
272
+ secretResolver,
225
273
  });
226
274
 
227
275
  // Setup retention job for tiered storage (daily aggregation)
@@ -231,15 +279,12 @@ export default createBackendPlugin({
231
279
  queueManager,
232
280
  });
233
281
 
234
- // Setup auto-incident close worker (ticks every 60s, closes
235
- // auto-opened incidents whose systems have been steady-healthy
236
- // for the cooldown).
237
- await setupAutoIncidentCloseJob({
238
- db: database,
239
- logger,
240
- queueManager,
241
- incidentClient,
242
- });
282
+ // The hardcoded auto-incident open/close path was removed; auto-
283
+ // incident behaviour is now built entirely by user automations
284
+ // (e.g. `healthcheck.system_degraded` + `for:` `incident.create`).
285
+ // Flapping is detected by the automation engine's windowed-count gate
286
+ // on the `system_health_changed` trigger — healthcheck emits only the
287
+ // raw aggregated-health change (via the reactive `health` entity).
243
288
 
244
289
  const healthCheckRouter = createHealthCheckRouter({
245
290
  database: database as SafeDatabase<typeof schema>,
@@ -249,6 +294,9 @@ export default createBackendPlugin({
249
294
  getEmitHook: () => storedEmitHook,
250
295
  cache,
251
296
  configService: config,
297
+ catalogClient,
298
+ maintenanceClient,
299
+ logger,
252
300
  });
253
301
  rpc.registerRouter(healthCheckRouter, healthCheckContract);
254
302
 
@@ -325,17 +373,37 @@ export default createBackendPlugin({
325
373
  healthCheckRegistry,
326
374
  collectorRegistry,
327
375
  );
328
- onHook(
329
- catalogHooks.systemDeleted,
330
- async (payload) => {
376
+
377
+ // Register automation actions now that `emitHook` + `queueManager`
378
+ // are both available.
379
+ const automationActions = env.getExtensionPoint(
380
+ automationActionExtensionPoint,
381
+ );
382
+ for (const action of createHealthCheckActions({
383
+ service,
384
+ queueManager,
385
+ emitHook,
386
+ })) {
387
+ automationActions.registerAction(action, pluginMetadata);
388
+ }
389
+
390
+ // React to catalog system deletion (tombstone) via the reactive
391
+ // `catalog-system` entity instead of the (removed) `system.deleted`
392
+ // hook (§10.4). `work-queue` delivery preserved: association cleanup
393
+ // must run once per cluster, not per-instance.
394
+ entityPoint.onEntityChanged({
395
+ kind: CATALOG_SYSTEM_ENTITY_KIND,
396
+ handler: async (change) => {
397
+ if (change.next !== null) return; // tombstone only
398
+ const systemId = change.id;
331
399
  logger.debug(
332
- `Cleaning up health check associations for deleted system: ${payload.systemId}`,
400
+ `Cleaning up health check associations for deleted system: ${systemId}`,
333
401
  );
334
- await service.removeAllSystemAssociations(payload.systemId);
335
- await healthCheckCache?.invalidateSystem(payload.systemId);
402
+ await service.removeAllSystemAssociations(systemId);
403
+ await healthCheckCache?.invalidateSystem(systemId);
336
404
  },
337
- { mode: "work-queue", workerGroup: "system-cleanup" },
338
- );
405
+ delivery: { mode: "work-queue", workerGroup: "system-cleanup" },
406
+ });
339
407
 
340
408
  // Subscribe to satellite deletion to scrub satellite IDs from associations
341
409
  onHook(
@@ -352,32 +420,6 @@ export default createBackendPlugin({
352
420
  { mode: "work-queue", workerGroup: "satellite-cleanup" },
353
421
  );
354
422
 
355
- // Sync our auto-incident mapping when an incident is resolved.
356
- // Without this, a manually-closed incident would still appear
357
- // "active" in our mapping, blocking the require-recovery rule
358
- // from re-evaluating fresh transitions.
359
- onHook(
360
- incidentHooks.incidentResolved,
361
- async ({ incidentId }) => {
362
- const updated = await database
363
- .update(healthCheckAutoIncidents)
364
- .set({ closedAt: new Date() })
365
- .where(
366
- and(
367
- eq(healthCheckAutoIncidents.incidentId, incidentId),
368
- isNull(healthCheckAutoIncidents.closedAt),
369
- ),
370
- )
371
- .returning({ id: healthCheckAutoIncidents.id });
372
- if (updated.length > 0) {
373
- logger.debug(
374
- `Marked auto-incident mapping closed for resolved incident ${incidentId}`,
375
- );
376
- }
377
- },
378
- { mode: "work-queue", workerGroup: "auto-incident-sync" },
379
- );
380
-
381
423
  logger.debug("✅ Health Check Backend afterPluginsReady complete.");
382
424
  },
383
425
  });
@@ -386,3 +428,13 @@ export default createBackendPlugin({
386
428
 
387
429
  // Re-export hooks for other plugins to use
388
430
  export { healthCheckHooks } from "./hooks";
431
+
432
+ // Re-export the reactive `health` entity surface so cross-plugin consumers
433
+ // (slo, dependency) can subscribe via onEntityChanged + classify changes
434
+ // without duplicating the kind id / transition predicate (§10.3).
435
+ export {
436
+ HEALTH_ENTITY_KIND,
437
+ classifyHealthChange,
438
+ type HealthChangeClassification,
439
+ type HealthEntityState,
440
+ } from "./health-entity";
@@ -72,6 +72,7 @@ const createMockCatalogClient = () => ({
72
72
  // Other methods not used in queue-executor
73
73
  getEntities: mock(async () => ({ systems: [], groups: [] })),
74
74
  getSystems: mock(async () => ({ systems: [] })),
75
+ getSystem: mock(async () => null),
75
76
  getGroups: mock(async () => []),
76
77
  createSystem: mock(async () => ({})),
77
78
  updateSystem: mock(async () => ({})),
@@ -415,4 +416,140 @@ describe("Queue-Based Health Check Executor", () => {
415
416
  expect(mockSignalService.getRecordedSignals()).toHaveLength(0);
416
417
  });
417
418
  });
419
+
420
+ describe("executeHealthCheckJob - collector run-context", () => {
421
+ it("passes curated run-context to the collector (name falls back to id when configName is null)", async () => {
422
+ const mockDb = createMockDb();
423
+ const mockRegistry = createMockRegistry();
424
+ const mockLogger = createMockLogger();
425
+ const mockQueueManager = createMockQueueManager();
426
+ const mockCatalogClient = createMockCatalogClient();
427
+ const mockMaintenanceClient = createMockMaintenanceClient();
428
+ const mockIncidentClient = createMockIncidentClient();
429
+ const mockSignalService = createMockSignalService();
430
+
431
+ // Catalog resolves the system name.
432
+ (mockCatalogClient.getSystem as any) = mock(async () => ({
433
+ id: "system-1",
434
+ name: "web-01",
435
+ }));
436
+
437
+ // configName is null -> run-context check.name must fall back to id.
438
+ let selectCallCount = 0;
439
+ (mockDb.select as any) = mock(() => {
440
+ selectCallCount++;
441
+ if (selectCallCount === 2) {
442
+ return {
443
+ from: mock(() => ({
444
+ innerJoin: mock(() => ({
445
+ where: mock(() =>
446
+ Promise.resolve([
447
+ {
448
+ configId: "config-1",
449
+ configName: null,
450
+ strategyId: "test-strategy",
451
+ config: { timeout: 5000 },
452
+ collectors: [
453
+ { id: "col-1", collectorId: "test-collector", config: {} },
454
+ ],
455
+ interval: 45,
456
+ enabled: true,
457
+ paused: false,
458
+ includeLocal: true,
459
+ satelliteIds: [],
460
+ },
461
+ ]),
462
+ ),
463
+ })),
464
+ })),
465
+ };
466
+ }
467
+ return {
468
+ from: mock(() => ({
469
+ innerJoin: mock(() => ({
470
+ where: mock(() => Promise.resolve([])),
471
+ })),
472
+ })),
473
+ };
474
+ });
475
+
476
+ // Capture the run-context the collector receives.
477
+ let capturedRunContext: unknown;
478
+ const collectorExecute = mock(
479
+ async (params: { runContext?: unknown }) => {
480
+ capturedRunContext = params.runContext;
481
+ return { result: {} };
482
+ },
483
+ );
484
+ const mockCollectorRegistry = {
485
+ register: mock(() => {}),
486
+ getCollector: mock(() => ({
487
+ collector: {
488
+ id: "test-collector",
489
+ execute: collectorExecute,
490
+ mergeResult: mock(() => ({})),
491
+ },
492
+ })),
493
+ getCollectors: mock(() => []),
494
+ };
495
+
496
+ const queue =
497
+ mockQueueManager.getQueue<HealthCheckJobPayload>("health-checks");
498
+ let capturedHandler:
499
+ | ((job: { data: HealthCheckJobPayload }) => Promise<void>)
500
+ | undefined;
501
+ (queue.consume as any) = mock(
502
+ async (
503
+ handler: (job: { data: HealthCheckJobPayload }) => Promise<void>,
504
+ ) => {
505
+ capturedHandler = handler;
506
+ },
507
+ );
508
+
509
+ await setupHealthCheckWorker({
510
+ db: mockDb as unknown as Parameters<
511
+ typeof setupHealthCheckWorker
512
+ >[0]["db"],
513
+ registry: mockRegistry,
514
+ collectorRegistry: mockCollectorRegistry as unknown as Parameters<
515
+ typeof setupHealthCheckWorker
516
+ >[0]["collectorRegistry"],
517
+ logger: mockLogger,
518
+ queueManager: mockQueueManager,
519
+ signalService: mockSignalService,
520
+ catalogClient: mockCatalogClient as unknown as Parameters<
521
+ typeof setupHealthCheckWorker
522
+ >[0]["catalogClient"],
523
+ notificationClient: {
524
+ notifyForSubscription: () => Promise.resolve({ notifiedCount: 0 }),
525
+ } as unknown as Parameters<
526
+ typeof setupHealthCheckWorker
527
+ >[0]["notificationClient"],
528
+ maintenanceClient: mockMaintenanceClient as unknown as Parameters<
529
+ typeof setupHealthCheckWorker
530
+ >[0]["maintenanceClient"],
531
+ incidentClient: mockIncidentClient as unknown as Parameters<
532
+ typeof setupHealthCheckWorker
533
+ >[0]["incidentClient"],
534
+ getEmitHook: () => undefined,
535
+ cache: passthroughCache,
536
+ });
537
+
538
+ if (capturedHandler) {
539
+ // The collector runs early in the execution sequence; downstream
540
+ // aggregation/persistence touches DB surfaces the lightweight mock
541
+ // doesn't model, so tolerate a later throw — the run-context we
542
+ // assert on is captured synchronously at collector-execute time.
543
+ await capturedHandler({
544
+ data: { configId: "config-1", systemId: "system-1" },
545
+ }).catch(() => {});
546
+ }
547
+
548
+ expect(collectorExecute).toHaveBeenCalled();
549
+ expect(capturedRunContext).toEqual({
550
+ check: { id: "config-1", name: "config-1", intervalSeconds: 45 },
551
+ system: { id: "system-1", name: "web-01" },
552
+ });
553
+ });
554
+ });
418
555
  });