@checkstack/healthcheck-backend 1.1.4 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/index.ts CHANGED
@@ -3,6 +3,7 @@ import {
3
3
  bootstrapHealthChecks,
4
4
  } from "./queue-executor";
5
5
  import { setupRetentionJob } from "./retention-job";
6
+ import { setupAutoIncidentCloseJob } from "./auto-incident-close-job";
6
7
  import * as schema from "./schema";
7
8
  import {
8
9
  healthCheckAccessRules,
@@ -26,46 +27,33 @@ import {
26
27
  type CollectorRegistry,
27
28
  } from "@checkstack/backend-api";
28
29
  import type { QueueManager } from "@checkstack/queue-api";
29
- import { integrationEventExtensionPoint } from "@checkstack/integration-backend";
30
+ import {
31
+ automationActionExtensionPoint,
32
+ automationArtifactTypeExtensionPoint,
33
+ automationTriggerExtensionPoint,
34
+ } from "@checkstack/automation-backend";
30
35
  import { entityKindExtensionPoint } from "@checkstack/gitops-backend";
31
- import { z } from "zod";
32
36
  import { createHealthCheckRouter } from "./router";
33
37
  import { HealthCheckService } from "./service";
38
+ import {
39
+ assignmentArtifactType,
40
+ createHealthCheckActions,
41
+ healthCheckTriggers,
42
+ } from "./automations";
34
43
  import { registerHealthcheckGitOpsKinds, registerHealthcheckGitOpsDocumentation } from "./healthcheck-gitops-kinds";
35
44
  import { catalogHooks } from "@checkstack/catalog-backend";
36
45
  import { satelliteHooks } from "@checkstack/satellite-backend";
46
+ import { incidentHooks } from "@checkstack/incident-backend";
47
+ import { eq, and, isNull } from "drizzle-orm";
48
+ import { healthCheckAutoIncidents } from "./schema";
37
49
  import { CatalogApi } from "@checkstack/catalog-common";
38
50
  import { MaintenanceApi } from "@checkstack/maintenance-common";
39
51
  import { IncidentApi } from "@checkstack/incident-common";
40
52
  import { GitOpsApi } from "@checkstack/gitops-common";
41
- import { healthCheckHooks } from "./hooks";
42
53
  import { registerSearchProvider } from "@checkstack/command-backend";
43
54
  import { resolveRoute } from "@checkstack/common";
44
55
  import { createHealthCheckCache } from "./cache";
45
56
 
46
- // =============================================================================
47
- // Integration Event Payload Schemas
48
- // =============================================================================
49
-
50
- const systemDegradedPayloadSchema = z.object({
51
- systemId: z.string(),
52
- systemName: z.string().optional(),
53
- previousStatus: z.string(),
54
- newStatus: z.string(),
55
- healthyChecks: z.number(),
56
- totalChecks: z.number(),
57
- timestamp: z.string(),
58
- });
59
-
60
- const systemHealthyPayloadSchema = z.object({
61
- systemId: z.string(),
62
- systemName: z.string().optional(),
63
- previousStatus: z.string(),
64
- healthyChecks: z.number(),
65
- totalChecks: z.number(),
66
- timestamp: z.string(),
67
- });
68
-
69
57
  // Store emitHook reference for use during Phase 2 init
70
58
  let storedEmitHook: EmitHookFn | undefined;
71
59
 
@@ -78,33 +66,19 @@ export default createBackendPlugin({
78
66
  healthcheckGroupSubscription,
79
67
  ]);
80
68
 
81
- // Register hooks as integration events
82
- const integrationEvents = env.getExtensionPoint(
83
- integrationEventExtensionPoint,
84
- );
85
-
86
- integrationEvents.registerEvent(
87
- {
88
- hook: healthCheckHooks.systemDegraded,
89
- displayName: "System Health Degraded",
90
- description:
91
- "Fired when a system's health status transitions from healthy to degraded/unhealthy",
92
- category: "Health",
93
- payloadSchema: systemDegradedPayloadSchema,
94
- },
95
- pluginMetadata,
96
- );
97
-
98
- integrationEvents.registerEvent(
99
- {
100
- hook: healthCheckHooks.systemHealthy,
101
- displayName: "System Health Restored",
102
- description: "Fired when a system's health status recovers to healthy",
103
- category: "Health",
104
- payloadSchema: systemHealthyPayloadSchema,
105
- },
106
- pluginMetadata,
69
+ // ─── Automation Platform: triggers + artifact type ─────────────────
70
+ // Buffered behind the extension point until automation-backend's
71
+ // register() runs. Actions are wired in afterPluginsReady where
72
+ // `emitHook` becomes available.
73
+ const automationTriggers = env.getExtensionPoint(
74
+ automationTriggerExtensionPoint,
107
75
  );
76
+ for (const trigger of healthCheckTriggers) {
77
+ automationTriggers.registerTrigger(trigger, pluginMetadata);
78
+ }
79
+ env
80
+ .getExtensionPoint(automationArtifactTypeExtensionPoint)
81
+ .registerArtifactType(assignmentArtifactType, pluginMetadata);
108
82
 
109
83
  // ─── GitOps Entity Kind Registration ───────────────────────────────
110
84
  // Mutable refs — populated during init(), consumed by reconcile closures.
@@ -159,6 +133,7 @@ export default createBackendPlugin({
159
133
  queueManager: coreServices.queueManager,
160
134
  signalService: coreServices.signalService,
161
135
  cacheManager: coreServices.cacheManager,
136
+ config: coreServices.config,
162
137
  },
163
138
  // Phase 2: Register router and setup worker
164
139
  init: async ({
@@ -171,6 +146,7 @@ export default createBackendPlugin({
171
146
  queueManager,
172
147
  signalService,
173
148
  cacheManager,
149
+ config,
174
150
  }) => {
175
151
  logger.debug("🏥 Initializing Health Check Backend...");
176
152
 
@@ -225,6 +201,16 @@ export default createBackendPlugin({
225
201
  queueManager,
226
202
  });
227
203
 
204
+ // Setup auto-incident close worker (ticks every 60s, closes
205
+ // auto-opened incidents whose systems have been steady-healthy
206
+ // for the cooldown).
207
+ await setupAutoIncidentCloseJob({
208
+ db: database,
209
+ logger,
210
+ queueManager,
211
+ incidentClient,
212
+ });
213
+
228
214
  const healthCheckRouter = createHealthCheckRouter({
229
215
  database: database as SafeDatabase<typeof schema>,
230
216
  registry: healthCheckRegistry,
@@ -232,6 +218,8 @@ export default createBackendPlugin({
232
218
  gitOpsClient,
233
219
  getEmitHook: () => storedEmitHook,
234
220
  cache,
221
+ configService: config,
222
+ catalogClient,
235
223
  });
236
224
  rpc.registerRouter(healthCheckRouter, healthCheckContract);
237
225
 
@@ -308,6 +296,20 @@ export default createBackendPlugin({
308
296
  healthCheckRegistry,
309
297
  collectorRegistry,
310
298
  );
299
+
300
+ // Register automation actions now that `emitHook` + `queueManager`
301
+ // are both available.
302
+ const automationActions = env.getExtensionPoint(
303
+ automationActionExtensionPoint,
304
+ );
305
+ for (const action of createHealthCheckActions({
306
+ service,
307
+ queueManager,
308
+ emitHook,
309
+ })) {
310
+ automationActions.registerAction(action, pluginMetadata);
311
+ }
312
+
311
313
  onHook(
312
314
  catalogHooks.systemDeleted,
313
315
  async (payload) => {
@@ -335,6 +337,32 @@ export default createBackendPlugin({
335
337
  { mode: "work-queue", workerGroup: "satellite-cleanup" },
336
338
  );
337
339
 
340
+ // Sync our auto-incident mapping when an incident is resolved.
341
+ // Without this, a manually-closed incident would still appear
342
+ // "active" in our mapping, blocking the require-recovery rule
343
+ // from re-evaluating fresh transitions.
344
+ onHook(
345
+ incidentHooks.incidentResolved,
346
+ async ({ incidentId }) => {
347
+ const updated = await database
348
+ .update(healthCheckAutoIncidents)
349
+ .set({ closedAt: new Date() })
350
+ .where(
351
+ and(
352
+ eq(healthCheckAutoIncidents.incidentId, incidentId),
353
+ isNull(healthCheckAutoIncidents.closedAt),
354
+ ),
355
+ )
356
+ .returning({ id: healthCheckAutoIncidents.id });
357
+ if (updated.length > 0) {
358
+ logger.debug(
359
+ `Marked auto-incident mapping closed for resolved incident ${incidentId}`,
360
+ );
361
+ }
362
+ },
363
+ { mode: "work-queue", workerGroup: "auto-incident-sync" },
364
+ );
365
+
338
366
  logger.debug("✅ Health Check Backend afterPluginsReady complete.");
339
367
  },
340
368
  });
@@ -0,0 +1,10 @@
1
+ /**
2
+ * Versioned schema used by `ConfigService` to persist platform-wide
3
+ * notification defaults. The shape is the runtime `NotificationPolicy`
4
+ * itself — each field has a built-in compile-time default so an empty
5
+ * stored value still parses to a valid policy.
6
+ */
7
+ export { NotificationPolicySchema as notificationDefaultsConfigV1 } from "@checkstack/healthcheck-common";
8
+
9
+ export const NOTIFICATION_DEFAULTS_CONFIG_ID = "healthcheck.notification-defaults"; // NOTE(review): presumably the key the defaults are stored under via ConfigService — confirm at the call site
10
+ export const NOTIFICATION_DEFAULTS_CONFIG_VERSION = 1; // NOTE(review): presumably pairs with the "V1" schema alias exported above — confirm before bumping
@@ -0,0 +1,104 @@
1
+ import { describe, it, expect } from "bun:test";
2
+ import type {
3
+ HealthCheckStatus,
4
+ NotificationPolicy,
5
+ } from "@checkstack/healthcheck-common";
6
+ import {
7
+ classifyTransition,
8
+ shouldNotifyTransition,
9
+ type TransitionKind,
10
+ } from "./notification-policy";
11
+
12
+ const STATES: HealthCheckStatus[] = ["healthy", "degraded", "unhealthy"];
13
+
14
+ describe("classifyTransition", () => {
15
+ // Build the full 3×3 transition matrix so future severity edits stay
16
+ // honest. Every cell here doubles as documentation.
17
+ const matrix: Record<
18
+ HealthCheckStatus,
19
+ Record<HealthCheckStatus, TransitionKind>
20
+ > = {
21
+ healthy: {
22
+ healthy: "none",
23
+ degraded: "escalation",
24
+ unhealthy: "escalation",
25
+ },
26
+ degraded: {
27
+ healthy: "recovery",
28
+ degraded: "none",
29
+ unhealthy: "escalation",
30
+ },
31
+ unhealthy: {
32
+ healthy: "recovery",
33
+ degraded: "deescalation",
34
+ unhealthy: "none",
35
+ },
36
+ };
37
+
38
+ for (const prev of STATES) {
39
+ for (const next of STATES) {
40
+ it(`${prev} → ${next} = ${matrix[prev][next]}`, () => {
41
+ expect(classifyTransition(prev, next)).toBe(matrix[prev][next]);
42
+ });
43
+ }
44
+ }
45
+ });
46
+
47
+ describe("shouldNotifyTransition", () => {
48
+ // The helper only reads `suppressDeEscalations`; narrow the fixture
49
+ // type so the test doesn't need to keep up with unrelated policy
50
+ // fields added over time.
51
+ const off: Pick<NotificationPolicy, "suppressDeEscalations"> = {
52
+ suppressDeEscalations: false,
53
+ };
54
+ const on: Pick<NotificationPolicy, "suppressDeEscalations"> = {
55
+ suppressDeEscalations: true,
56
+ };
57
+
58
+ it("never notifies on `none` (no actual change)", () => {
59
+ expect(shouldNotifyTransition("none", off)).toBe(false);
60
+ expect(shouldNotifyTransition("none", on)).toBe(false);
61
+ });
62
+
63
+ it("always notifies on escalations regardless of policy", () => {
64
+ expect(shouldNotifyTransition("escalation", off)).toBe(true);
65
+ expect(shouldNotifyTransition("escalation", on)).toBe(true);
66
+ });
67
+
68
+ it("always notifies on recoveries regardless of policy", () => {
69
+ expect(shouldNotifyTransition("recovery", off)).toBe(true);
70
+ expect(shouldNotifyTransition("recovery", on)).toBe(true);
71
+ });
72
+
73
+ it("notifies on de-escalations by default", () => {
74
+ expect(shouldNotifyTransition("deescalation", off)).toBe(true);
75
+ });
76
+
77
+ it("suppresses de-escalations when the policy opts in", () => {
78
+ expect(shouldNotifyTransition("deescalation", on)).toBe(false);
79
+ });
80
+ });
81
+
82
+ describe("flapping scenario from the bug report", () => {
83
+ // healthy → degraded → unhealthy → degraded → healthy
84
+ //
85
+ // With suppression on, the intermediate `unhealthy → degraded`
86
+ // notification (the one operators called out as spammy) must be
87
+ // skipped, while escalation and recovery still fire.
88
+ const policy: Pick<NotificationPolicy, "suppressDeEscalations"> = {
89
+ suppressDeEscalations: true,
90
+ };
91
+ const sequence: [HealthCheckStatus, HealthCheckStatus, boolean][] = [
92
+ ["healthy", "degraded", true], // escalation
93
+ ["degraded", "unhealthy", true], // escalation
94
+ ["unhealthy", "degraded", false], // de-escalation — suppressed
95
+ ["degraded", "healthy", true], // recovery
96
+ ];
97
+
98
+ for (const [prev, next, expected] of sequence) {
99
+ it(`${prev} → ${next} should notify: ${expected}`, () => {
100
+ const kind = classifyTransition(prev, next);
101
+ expect(shouldNotifyTransition(kind, policy)).toBe(expected);
102
+ });
103
+ }
104
+ });
@@ -0,0 +1,56 @@
1
+ import type {
2
+ HealthCheckStatus,
3
+ NotificationPolicy,
4
+ } from "@checkstack/healthcheck-common";
5
+
6
+ /**
7
+ * The kind of transition a system health change represents. Used to
8
+ * decide whether a notification should fire and how its CTA should
9
+ * link back into the UI.
10
+ */
11
+ export type TransitionKind =
12
+ /** No actual change (e.g. healthy → healthy). */
13
+ | "none"
14
+ /** Severity increased (healthy → degraded, degraded → unhealthy, ...). */
15
+ | "escalation"
16
+ /** Severity decreased but did not reach healthy (unhealthy → degraded). */
17
+ | "deescalation"
18
+ /** Returned to healthy from any non-healthy state; decided before the severity comparison, so it is never reported as "deescalation". */
19
+ | "recovery";
20
+
21
+ const SEVERITY: Record<HealthCheckStatus, number> = { // ordinal rank per status; only the relative order is used (compared in classifyTransition)
22
+ healthy: 0,
23
+ degraded: 1,
24
+ unhealthy: 2,
25
+ };
26
+
27
+ /**
28
+ * Classify a transition between two health states. Pure and total over
29
+ * the cartesian product of `HealthCheckStatus` values. The checks run in a fixed order: identical states first, then recovery, then the SEVERITY rank comparison.
30
+ */
31
+ export function classifyTransition(
32
+ previous: HealthCheckStatus,
33
+ next: HealthCheckStatus,
34
+ ): TransitionKind {
35
+ if (previous === next) return "none"; // same state on both sides — not a transition
36
+ if (next === "healthy") return "recovery"; // previous is non-healthy here (equal states already returned above)
37
+ return SEVERITY[next] > SEVERITY[previous] ? "escalation" : "deescalation"; // next is non-healthy and differs from previous: higher rank escalates, lower rank de-escalates
38
+ }
39
+
40
+ /**
41
+ * Decide whether a transition should produce a notification given the
42
+ * effective per-system policy. Escalations and recoveries always notify;
43
+ * de-escalations are suppressed when the policy opts in. A `"none"` transition never notifies, regardless of policy.
44
+ *
45
+ * Accepts the narrowed `Pick` because callers may only have the
46
+ * suppression flag — full policy resolution requires per-check lookups
47
+ * that aren't relevant to this decision.
48
+ */
49
+ export function shouldNotifyTransition(
50
+ kind: TransitionKind,
51
+ policy: Pick<NotificationPolicy, "suppressDeEscalations">,
52
+ ): boolean {
53
+ if (kind === "none") return false; // nothing changed, so there is nothing to announce
54
+ if (kind === "deescalation" && policy.suppressDeEscalations) return false; // the only branch that consults the policy
55
+ return true; // escalation, recovery, or a de-escalation that is not suppressed
56
+ }
@@ -72,6 +72,7 @@ const createMockCatalogClient = () => ({
72
72
  // Other methods not used in queue-executor
73
73
  getEntities: mock(async () => ({ systems: [], groups: [] })),
74
74
  getSystems: mock(async () => ({ systems: [] })),
75
+ getSystem: mock(async () => null),
75
76
  getGroups: mock(async () => []),
76
77
  createSystem: mock(async () => ({})),
77
78
  updateSystem: mock(async () => ({})),
@@ -415,4 +416,140 @@ describe("Queue-Based Health Check Executor", () => {
415
416
  expect(mockSignalService.getRecordedSignals()).toHaveLength(0);
416
417
  });
417
418
  });
419
+
420
+ describe("executeHealthCheckJob - collector run-context", () => {
421
+ it("passes curated run-context to the collector (name falls back to id when configName is null)", async () => {
422
+ const mockDb = createMockDb();
423
+ const mockRegistry = createMockRegistry();
424
+ const mockLogger = createMockLogger();
425
+ const mockQueueManager = createMockQueueManager();
426
+ const mockCatalogClient = createMockCatalogClient();
427
+ const mockMaintenanceClient = createMockMaintenanceClient();
428
+ const mockIncidentClient = createMockIncidentClient();
429
+ const mockSignalService = createMockSignalService();
430
+
431
+ // Catalog resolves the system name.
432
+ (mockCatalogClient.getSystem as any) = mock(async () => ({
433
+ id: "system-1",
434
+ name: "web-01",
435
+ }));
436
+
437
+ // configName is null -> run-context check.name must fall back to id.
438
+ let selectCallCount = 0;
439
+ (mockDb.select as any) = mock(() => {
440
+ selectCallCount++;
441
+ if (selectCallCount === 2) {
442
+ return {
443
+ from: mock(() => ({
444
+ innerJoin: mock(() => ({
445
+ where: mock(() =>
446
+ Promise.resolve([
447
+ {
448
+ configId: "config-1",
449
+ configName: null,
450
+ strategyId: "test-strategy",
451
+ config: { timeout: 5000 },
452
+ collectors: [
453
+ { id: "col-1", collectorId: "test-collector", config: {} },
454
+ ],
455
+ interval: 45,
456
+ enabled: true,
457
+ paused: false,
458
+ includeLocal: true,
459
+ satelliteIds: [],
460
+ },
461
+ ]),
462
+ ),
463
+ })),
464
+ })),
465
+ };
466
+ }
467
+ return {
468
+ from: mock(() => ({
469
+ innerJoin: mock(() => ({
470
+ where: mock(() => Promise.resolve([])),
471
+ })),
472
+ })),
473
+ };
474
+ });
475
+
476
+ // Capture the run-context the collector receives.
477
+ let capturedRunContext: unknown;
478
+ const collectorExecute = mock(
479
+ async (params: { runContext?: unknown }) => {
480
+ capturedRunContext = params.runContext;
481
+ return { result: {} };
482
+ },
483
+ );
484
+ const mockCollectorRegistry = {
485
+ register: mock(() => {}),
486
+ getCollector: mock(() => ({
487
+ collector: {
488
+ id: "test-collector",
489
+ execute: collectorExecute,
490
+ mergeResult: mock(() => ({})),
491
+ },
492
+ })),
493
+ getCollectors: mock(() => []),
494
+ };
495
+
496
+ const queue =
497
+ mockQueueManager.getQueue<HealthCheckJobPayload>("health-checks");
498
+ let capturedHandler:
499
+ | ((job: { data: HealthCheckJobPayload }) => Promise<void>)
500
+ | undefined;
501
+ (queue.consume as any) = mock(
502
+ async (
503
+ handler: (job: { data: HealthCheckJobPayload }) => Promise<void>,
504
+ ) => {
505
+ capturedHandler = handler;
506
+ },
507
+ );
508
+
509
+ await setupHealthCheckWorker({
510
+ db: mockDb as unknown as Parameters<
511
+ typeof setupHealthCheckWorker
512
+ >[0]["db"],
513
+ registry: mockRegistry,
514
+ collectorRegistry: mockCollectorRegistry as unknown as Parameters<
515
+ typeof setupHealthCheckWorker
516
+ >[0]["collectorRegistry"],
517
+ logger: mockLogger,
518
+ queueManager: mockQueueManager,
519
+ signalService: mockSignalService,
520
+ catalogClient: mockCatalogClient as unknown as Parameters<
521
+ typeof setupHealthCheckWorker
522
+ >[0]["catalogClient"],
523
+ notificationClient: {
524
+ notifyForSubscription: () => Promise.resolve({ notifiedCount: 0 }),
525
+ } as unknown as Parameters<
526
+ typeof setupHealthCheckWorker
527
+ >[0]["notificationClient"],
528
+ maintenanceClient: mockMaintenanceClient as unknown as Parameters<
529
+ typeof setupHealthCheckWorker
530
+ >[0]["maintenanceClient"],
531
+ incidentClient: mockIncidentClient as unknown as Parameters<
532
+ typeof setupHealthCheckWorker
533
+ >[0]["incidentClient"],
534
+ getEmitHook: () => undefined,
535
+ cache: passthroughCache,
536
+ });
537
+
538
+ if (capturedHandler) {
539
+ // The collector runs early in the execution sequence; downstream
540
+ // aggregation/persistence touches DB surfaces the lightweight mock
541
+ // doesn't model, so tolerate a later throw — the run-context we
542
+ // assert on is captured synchronously at collector-execute time.
543
+ await capturedHandler({
544
+ data: { configId: "config-1", systemId: "system-1" },
545
+ }).catch(() => {});
546
+ }
547
+
548
+ expect(collectorExecute).toHaveBeenCalled();
549
+ expect(capturedRunContext).toEqual({
550
+ check: { id: "config-1", name: "config-1", intervalSeconds: 45 },
551
+ system: { id: "system-1", name: "web-01" },
552
+ });
553
+ });
554
+ });
418
555
  });