@checkstack/healthcheck-backend 1.2.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,340 @@
/**
 * Healthcheck triggers + actions registered with the Automation Platform.
 *
 * Triggers:
 *  - `healthcheck.system.degraded` — existing directional hook
 *  - `healthcheck.system.healthy` — existing directional hook
 *  - `healthcheck.system.health_changed` — new umbrella hook,
 *    fires on every aggregated-health transition. Carries both the
 *    previous and new statuses so subscribers don't have to listen
 *    to two hooks and coalesce themselves.
 *  - `healthcheck.check.failed` — fires when an individual check run
 *    completes with a non-`healthy` status.
 *  - `healthcheck.flapping_detected` — fires when N unhealthy
 *    transitions are observed within the policy window; re-fires on
 *    every additional transition while flapping.
 *
 * Actions:
 *  - `healthcheck.run_now`: enqueue a one-off run of a specific
 *    `(systemId, configurationId)` assignment. The recurring
 *    schedule keeps ticking; this just nudges the queue.
 *  - `healthcheck.enable_assignment` /
 *    `healthcheck.disable_assignment`: flip the `enabled` flag on an
 *    existing assignment via `service.setAssignmentEnabled`. Emits
 *    the existing `assignmentChanged` hook so the satellite-config
 *    relay picks up the change.
 *
 * Mutation actions emit hooks themselves (via the `emitHook` factory
 * dep) so downstream automations + caches react the same way as
 * RPC-driven mutations.
 */
26
+ import { z } from "zod";
27
+ import { Versioned, type Hook } from "@checkstack/backend-api";
28
+ import type { QueueManager } from "@checkstack/queue-api";
29
+ import type {
30
+ ActionDefinition,
31
+ TriggerDefinition,
32
+ } from "@checkstack/automation-backend";
33
+ import { HealthCheckStatusSchema } from "@checkstack/healthcheck-common";
34
+
35
+ import { healthCheckHooks } from "./hooks";
36
+ import {
37
+ HEALTH_CHECK_QUEUE,
38
+ type HealthCheckJobPayload,
39
+ } from "./queue-executor";
40
+ import type { HealthCheckService } from "./service";
41
+
42
+ // ─── Payload schemas — match the hook payloads exactly ─────────────────
43
+
/**
 * Payload schema for the `systemDegraded` hook, consumed by
 * `systemDegradedTrigger`.
 *
 * Both status fields are validated against `HealthCheckStatusSchema`;
 * `systemName` is the only optional field.
 */
const systemDegradedPayloadSchema = z.object({
  systemId: z.string(),
  systemName: z.string().optional(),
  previousStatus: HealthCheckStatusSchema,
  newStatus: HealthCheckStatusSchema,
  healthyChecks: z.number(),
  totalChecks: z.number(),
  // Plain string; no particular timestamp format is enforced here.
  timestamp: z.string(),
});
53
+
/**
 * Payload schema for the `systemHealthy` hook, consumed by
 * `systemHealthyTrigger`.
 *
 * Unlike the degraded payload there is no `newStatus` field; only the
 * status the system recovered from is carried.
 */
const systemHealthyPayloadSchema = z.object({
  systemId: z.string(),
  systemName: z.string().optional(),
  previousStatus: HealthCheckStatusSchema,
  healthyChecks: z.number(),
  totalChecks: z.number(),
  // Plain string; no particular timestamp format is enforced here.
  timestamp: z.string(),
});
62
+
/**
 * Payload schema for the umbrella `systemHealthChanged` hook, consumed
 * by `systemHealthChangedTrigger`.
 *
 * Carries both the previous and the new aggregated status, each
 * validated against `HealthCheckStatusSchema`.
 */
const systemHealthChangedPayloadSchema = z.object({
  systemId: z.string(),
  systemName: z.string().optional(),
  previousStatus: HealthCheckStatusSchema,
  newStatus: HealthCheckStatusSchema,
  healthyChecks: z.number(),
  totalChecks: z.number(),
  // Plain string; no particular timestamp format is enforced here.
  timestamp: z.string(),
});
72
+
/**
 * Payload schema for the `checkFailed` hook, consumed by
 * `checkFailedTrigger`.
 *
 * Identifies a single check run by `(systemId, configurationId)` and
 * carries its status plus the optional latency and raw result record.
 */
const checkFailedPayloadSchema = z.object({
  systemId: z.string(),
  configurationId: z.string(),
  status: HealthCheckStatusSchema,
  // Optional: a run may report no latency.
  latencyMs: z.number().optional(),
  // Optional free-form result; values are left as `unknown`.
  result: z.record(z.string(), z.unknown()).optional(),
  // Plain string; no particular timestamp format is enforced here.
  timestamp: z.string(),
});
81
+
/**
 * Payload schema for the `flappingDetected` hook, consumed by
 * `flappingDetectedTrigger`.
 *
 * Reports how many transitions were observed (`transitionCount`) and
 * the length of the observation window in minutes (`windowMinutes`)
 * for one `(systemId, configurationId)` pair.
 */
const flappingDetectedPayloadSchema = z.object({
  systemId: z.string(),
  configurationId: z.string(),
  transitionCount: z.number(),
  windowMinutes: z.number(),
  // Plain string; no particular timestamp format is enforced here.
  timestamp: z.string(),
});
89
+
90
+ // ─── Triggers ──────────────────────────────────────────────────────────
91
+
/**
 * Automation trigger bound to `healthCheckHooks.systemDegraded`.
 *
 * The context key is the affected system's id.
 */
export const systemDegradedTrigger: TriggerDefinition<
  z.infer<typeof systemDegradedPayloadSchema>
> = {
  id: "system_degraded",
  displayName: "System Health Degraded",
  description:
    "Fires when a system's health transitions from healthy to degraded/unhealthy",
  category: "Health",
  icon: "HeartPulse",
  payloadSchema: systemDegradedPayloadSchema,
  hook: healthCheckHooks.systemDegraded,
  contextKey: ({ systemId }) => systemId,
};
105
+
/**
 * Automation trigger bound to `healthCheckHooks.systemHealthy`.
 *
 * The context key is the affected system's id.
 */
export const systemHealthyTrigger: TriggerDefinition<
  z.infer<typeof systemHealthyPayloadSchema>
> = {
  id: "system_healthy",
  displayName: "System Health Restored",
  description: "Fires when a system's health recovers to healthy",
  category: "Health",
  icon: "HeartPulse",
  payloadSchema: systemHealthyPayloadSchema,
  hook: healthCheckHooks.systemHealthy,
  contextKey: ({ systemId }) => systemId,
};
118
+
/**
 * Automation trigger bound to the umbrella
 * `healthCheckHooks.systemHealthChanged` hook.
 *
 * The context key is the affected system's id.
 */
export const systemHealthChangedTrigger: TriggerDefinition<
  z.infer<typeof systemHealthChangedPayloadSchema>
> = {
  id: "system_health_changed",
  displayName: "System Health Changed",
  description:
    "Fires on every aggregated-health transition — carries previous + new status",
  category: "Health",
  icon: "HeartPulse",
  payloadSchema: systemHealthChangedPayloadSchema,
  hook: healthCheckHooks.systemHealthChanged,
  contextKey: ({ systemId }) => systemId,
};
132
+
/**
 * Automation trigger bound to `healthCheckHooks.checkFailed`.
 *
 * NOTE(review): the payload identifies a single check by
 * `(systemId, configurationId)`, but the context key uses `systemId`
 * only — confirm that per-system keying is intended.
 */
export const checkFailedTrigger: TriggerDefinition<
  z.infer<typeof checkFailedPayloadSchema>
> = {
  id: "check_failed",
  displayName: "Health Check Failed",
  description:
    "Fires when an individual check run completes with a non-`healthy` status",
  category: "Health",
  icon: "TriangleAlert",
  payloadSchema: checkFailedPayloadSchema,
  hook: healthCheckHooks.checkFailed,
  contextKey: ({ systemId }) => systemId,
};
146
+
/**
 * Automation trigger bound to `healthCheckHooks.flappingDetected`.
 *
 * NOTE(review): the payload identifies a single check by
 * `(systemId, configurationId)`, but the context key uses `systemId`
 * only — confirm that per-system keying is intended.
 */
export const flappingDetectedTrigger: TriggerDefinition<
  z.infer<typeof flappingDetectedPayloadSchema>
> = {
  id: "flapping_detected",
  displayName: "Health Check Flapping",
  description:
    "Fires when N unhealthy transitions are observed within the policy window. Re-fires on every additional transition while flapping; debounce in the automation if needed.",
  category: "Health",
  icon: "Repeat",
  payloadSchema: flappingDetectedPayloadSchema,
  hook: healthCheckHooks.flappingDetected,
  contextKey: ({ systemId }) => systemId,
};
160
+
/**
 * Every healthcheck trigger, in registration order.
 *
 * Each entry is cast to `TriggerDefinition<unknown>` so the
 * differently-typed triggers can share one array.
 */
export const healthCheckTriggers: TriggerDefinition<unknown>[] = [
  // System-level (aggregated health) triggers.
  systemDegradedTrigger as TriggerDefinition<unknown>,
  systemHealthyTrigger as TriggerDefinition<unknown>,
  systemHealthChangedTrigger as TriggerDefinition<unknown>,
  // Per-check triggers.
  checkFailedTrigger as TriggerDefinition<unknown>,
  flappingDetectedTrigger as TriggerDefinition<unknown>,
];
168
+
169
+ // ─── Action configs ────────────────────────────────────────────────────
170
+
/**
 * Config schema for the `run_now` action: the
 * `(systemId, configurationId)` pair to enqueue a run for.
 * Both ids must be non-empty strings.
 */
const runNowConfigSchema = z.object({
  systemId: z.string().min(1).describe("Target system id"),
  configurationId: z
    .string()
    .min(1)
    .describe("Target health-check configuration id"),
});
178
+
/**
 * Config schema shared by the `enable_assignment` and
 * `disable_assignment` actions: the `(systemId, configurationId)` pair
 * whose `enabled` flag is flipped. Both ids must be non-empty strings.
 *
 * Field descriptions mirror `runNowConfigSchema` so both action
 * configs expose the same metadata for the same fields.
 */
const assignmentToggleConfigSchema = z.object({
  systemId: z.string().min(1).describe("Target system id"),
  configurationId: z
    .string()
    .min(1)
    .describe("Target health-check configuration id"),
});
183
+
184
+ // ─── Artifact ──────────────────────────────────────────────────────────
185
+
/**
 * Shape of the artifact returned by the healthcheck actions.
 *
 * The id pair is always present. `enabled` is set by the
 * enable/disable actions and `enqueued` by the run-now action; each
 * action fills in only the flag it is responsible for.
 */
const assignmentArtifactSchema = z.object({
  systemId: z.string(),
  configurationId: z.string(),
  enabled: z.boolean().optional(),
  enqueued: z.boolean().optional(),
});

/** Static type of an assignment artifact, inferred from its schema. */
export type AssignmentArtifact = z.infer<typeof assignmentArtifactSchema>;
194
+
/**
 * Artifact-type descriptor for healthcheck assignments.
 *
 * NOTE(review): the id here is `assignment`, while the actions declare
 * `produces: "healthcheck.assignment"` — presumably the platform
 * prefixes the plugin id on registration; confirm.
 */
export const assignmentArtifactType = {
  id: "assignment",
  displayName: "Healthcheck Assignment",
  description:
    "Identifies the system↔configuration assignment touched by an automation action",
  schema: assignmentArtifactSchema,
} as const;
202
+
203
+ // ─── Action factory ────────────────────────────────────────────────────
204
+
/** Dependencies injected into `createHealthCheckActions`. */
export interface HealthCheckActionDeps {
  /** Used by the enable/disable actions to call `setAssignmentEnabled`. */
  service: HealthCheckService;
  /** Used by the run-now action to obtain the health-check queue. */
  queueManager: QueueManager;
  /** Emits a typed hook; the toggle actions use it for `assignmentChanged`. */
  emitHook: <T>(hook: Hook<T>, payload: T) => Promise<void>;
}
210
+
/** Config type shared by the enable/disable assignment actions. */
type AssignmentToggleConfig = z.infer<typeof assignmentToggleConfigSchema>;

/**
 * Builds the `enable_assignment` (`enabled === true`) or
 * `disable_assignment` (`enabled === false`) action.
 *
 * Both variants share one implementation: set the flag through
 * `service.setAssignmentEnabled`, report failure when no assignment was
 * updated, otherwise emit `assignmentChanged` and return the artifact.
 */
function createAssignmentToggleAction(
  deps: HealthCheckActionDeps,
  enabled: boolean,
): ActionDefinition<AssignmentToggleConfig, AssignmentArtifact> {
  const action: ActionDefinition<AssignmentToggleConfig, AssignmentArtifact> = {
    id: enabled ? "enable_assignment" : "disable_assignment",
    displayName: enabled
      ? "Enable Health Check Assignment"
      : "Disable Health Check Assignment",
    description: enabled
      ? "Flip the `enabled` flag on an existing system↔configuration assignment to true."
      : "Flip the `enabled` flag on an existing system↔configuration assignment to false.",
    category: "Health",
    icon: enabled ? "Power" : "PowerOff",
    config: new Versioned({ version: 1, schema: assignmentToggleConfigSchema }),
    produces: "healthcheck.assignment",
    execute: async ({ config, logger }) => {
      const { systemId, configurationId } = config;

      const updated = await deps.service.setAssignmentEnabled(
        systemId,
        configurationId,
        enabled,
      );
      if (!updated) {
        // A falsy result is reported as "assignment not found"; no hook is emitted.
        return {
          success: false,
          error: `Assignment not found: ${systemId} ↔ ${configurationId}`,
        };
      }

      // Emit the same hook the header comment describes for this mutation
      // so downstream subscribers see the change.
      await deps.emitHook(healthCheckHooks.assignmentChanged, {
        systemId,
        configurationId,
      });
      logger.info(
        `Automation ${enabled ? "enabled" : "disabled"} assignment ${systemId}:${configurationId}`,
      );

      return {
        success: true,
        externalId: `${systemId}:${configurationId}`,
        artifact: { systemId, configurationId, enabled },
      };
    },
  };
  return action;
}

/**
 * Creates the healthcheck automation actions.
 *
 * @param deps - Service, queue manager and hook emitter the actions close over.
 * @returns The actions in this order: `run_now`, `enable_assignment`,
 *   `disable_assignment`, each cast to `ActionDefinition<unknown, unknown>`
 *   so they can share one array.
 */
export function createHealthCheckActions(
  deps: HealthCheckActionDeps,
): ActionDefinition<unknown, unknown>[] {
  const runNow: ActionDefinition<
    z.infer<typeof runNowConfigSchema>,
    AssignmentArtifact
  > = {
    id: "run_now",
    displayName: "Run Health Check Now",
    description:
      "Enqueue a one-off run of the given assignment. Doesn't disturb the recurring schedule.",
    category: "Health",
    icon: "Play",
    config: new Versioned({ version: 1, schema: runNowConfigSchema }),
    produces: "healthcheck.assignment",
    execute: async ({ config, logger }) => {
      const { systemId, configurationId } = config;

      const queue = deps.queueManager.getQueue<HealthCheckJobPayload>(
        HEALTH_CHECK_QUEUE,
      );
      // The job payload names the configuration `configId`, not `configurationId`.
      await queue.enqueue({ configId: configurationId, systemId });
      logger.info(`Automation enqueued run for ${systemId}:${configurationId}`);

      return {
        success: true,
        externalId: `${systemId}:${configurationId}`,
        artifact: { systemId, configurationId, enqueued: true },
      };
    },
  };

  const enableAssignment = createAssignmentToggleAction(deps, true);
  const disableAssignment = createAssignmentToggleAction(deps, false);

  return [
    runNow as ActionDefinition<unknown, unknown>,
    enableAssignment as ActionDefinition<unknown, unknown>,
    disableAssignment as ActionDefinition<unknown, unknown>,
  ];
}
package/src/hooks.ts CHANGED
@@ -1,8 +1,14 @@
1
1
  import { createHook } from "@checkstack/backend-api";
2
+ import type { HealthCheckStatus } from "@checkstack/healthcheck-common";
2
3
 
3
4
  /**
4
5
  * Health check hooks for cross-plugin communication and external integrations.
5
6
  * These hooks are registered as integration events for webhook subscriptions.
7
+ *
8
+ * `status` / `previousStatus` / `newStatus` carry the canonical
9
+ * `HealthCheckStatus` enum values, so automation triggers built on
10
+ * these hooks can offer the known values for `==` comparisons in the
11
+ * editor.
6
12
  */
7
13
  export const healthCheckHooks = {
8
14
  /**
@@ -13,8 +19,8 @@ export const healthCheckHooks = {
13
19
  systemDegraded: createHook<{
14
20
  systemId: string;
15
21
  systemName?: string;
16
- previousStatus: string;
17
- newStatus: string;
22
+ previousStatus: HealthCheckStatus;
23
+ newStatus: HealthCheckStatus;
18
24
  healthyChecks: number;
19
25
  totalChecks: number;
20
26
  timestamp: string;
@@ -27,7 +33,7 @@ export const healthCheckHooks = {
27
33
  systemHealthy: createHook<{
28
34
  systemId: string;
29
35
  systemName?: string;
30
- previousStatus: string;
36
+ previousStatus: HealthCheckStatus;
31
37
  healthyChecks: number;
32
38
  totalChecks: number;
33
39
  timestamp: string;
@@ -50,9 +56,68 @@ export const healthCheckHooks = {
50
56
  checkCompleted: createHook<{
51
57
  systemId: string;
52
58
  configurationId: string;
53
- status: string;
59
+ status: HealthCheckStatus;
54
60
  latencyMs: number | undefined;
55
61
  result: Record<string, unknown> | undefined;
56
62
  timestamp: string;
57
63
  }>("healthcheck.check.completed"),
64
+
65
+ /**
66
+ * Umbrella variant of `systemDegraded` + `systemHealthy` — fires on
67
+ * **any** aggregated-health transition, carrying both the previous
68
+ * and new statuses. Subscribers (e.g. an automation that wants to
69
+ * react to every state change without subscribing to two hooks
70
+ * and coalescing themselves) prefer this one.
71
+ *
72
+ * Emitted alongside the directional hooks, never instead of them,
73
+ * so existing subscribers keep working unchanged.
74
+ */
75
+ systemHealthChanged: createHook<{
76
+ systemId: string;
77
+ systemName?: string;
78
+ previousStatus: HealthCheckStatus;
79
+ newStatus: HealthCheckStatus;
80
+ healthyChecks: number;
81
+ totalChecks: number;
82
+ timestamp: string;
83
+ }>("healthcheck.system.health_changed"),
84
+
85
+ /**
86
+ * Narrow variant of `checkCompleted` — fires only when an individual
87
+ * check run completed with a non-`healthy` status. Carries the
88
+ * latency + raw result so subscribers can branch on collector-
89
+ * specific fields without re-querying. Operators usually prefer
90
+ * this over `checkCompleted` for incident-style automation because
91
+ * a "trigger on any completion, then filter" automation is harder
92
+ * to read at a glance than a typed `check_failed` entry point.
93
+ */
94
+ checkFailed: createHook<{
95
+ systemId: string;
96
+ configurationId: string;
97
+ status: HealthCheckStatus;
98
+ latencyMs: number | undefined;
99
+ result: Record<string, unknown> | undefined;
100
+ timestamp: string;
101
+ }>("healthcheck.check.failed"),
102
+
103
+ /**
104
+ * Emitted when the flapping-detector observes ≥ N unhealthy
105
+ * transitions in the policy's configured window. Fires regardless
106
+ * of whether `autoOpenIncidentOnUnhealthy` is enabled — the hook is
107
+ * informational; the auto-incident pipeline still gates on the
108
+ * policy.
109
+ *
110
+ * Re-fires on every additional transition past the threshold while
111
+ * the check stays in a flapping pattern, so automations that want
112
+ * "page once and only once" should debounce on `(systemId,
113
+ * configurationId)`. Carrying the observed transition count + the
114
+ * window length lets subscribers reason about both.
115
+ */
116
+ flappingDetected: createHook<{
117
+ systemId: string;
118
+ configurationId: string;
119
+ transitionCount: number;
120
+ windowMinutes: number;
121
+ timestamp: string;
122
+ }>("healthcheck.flapping_detected"),
58
123
  } as const;
package/src/index.ts CHANGED
@@ -27,11 +27,19 @@ import {
27
27
  type CollectorRegistry,
28
28
  } from "@checkstack/backend-api";
29
29
  import type { QueueManager } from "@checkstack/queue-api";
30
- import { integrationEventExtensionPoint } from "@checkstack/integration-backend";
30
+ import {
31
+ automationActionExtensionPoint,
32
+ automationArtifactTypeExtensionPoint,
33
+ automationTriggerExtensionPoint,
34
+ } from "@checkstack/automation-backend";
31
35
  import { entityKindExtensionPoint } from "@checkstack/gitops-backend";
32
- import { z } from "zod";
33
36
  import { createHealthCheckRouter } from "./router";
34
37
  import { HealthCheckService } from "./service";
38
+ import {
39
+ assignmentArtifactType,
40
+ createHealthCheckActions,
41
+ healthCheckTriggers,
42
+ } from "./automations";
35
43
  import { registerHealthcheckGitOpsKinds, registerHealthcheckGitOpsDocumentation } from "./healthcheck-gitops-kinds";
36
44
  import { catalogHooks } from "@checkstack/catalog-backend";
37
45
  import { satelliteHooks } from "@checkstack/satellite-backend";
@@ -42,34 +50,10 @@ import { CatalogApi } from "@checkstack/catalog-common";
42
50
  import { MaintenanceApi } from "@checkstack/maintenance-common";
43
51
  import { IncidentApi } from "@checkstack/incident-common";
44
52
  import { GitOpsApi } from "@checkstack/gitops-common";
45
- import { healthCheckHooks } from "./hooks";
46
53
  import { registerSearchProvider } from "@checkstack/command-backend";
47
54
  import { resolveRoute } from "@checkstack/common";
48
55
  import { createHealthCheckCache } from "./cache";
49
56
 
50
- // =============================================================================
51
- // Integration Event Payload Schemas
52
- // =============================================================================
53
-
54
- const systemDegradedPayloadSchema = z.object({
55
- systemId: z.string(),
56
- systemName: z.string().optional(),
57
- previousStatus: z.string(),
58
- newStatus: z.string(),
59
- healthyChecks: z.number(),
60
- totalChecks: z.number(),
61
- timestamp: z.string(),
62
- });
63
-
64
- const systemHealthyPayloadSchema = z.object({
65
- systemId: z.string(),
66
- systemName: z.string().optional(),
67
- previousStatus: z.string(),
68
- healthyChecks: z.number(),
69
- totalChecks: z.number(),
70
- timestamp: z.string(),
71
- });
72
-
73
57
  // Store emitHook reference for use during Phase 2 init
74
58
  let storedEmitHook: EmitHookFn | undefined;
75
59
 
@@ -82,33 +66,19 @@ export default createBackendPlugin({
82
66
  healthcheckGroupSubscription,
83
67
  ]);
84
68
 
85
- // Register hooks as integration events
86
- const integrationEvents = env.getExtensionPoint(
87
- integrationEventExtensionPoint,
88
- );
89
-
90
- integrationEvents.registerEvent(
91
- {
92
- hook: healthCheckHooks.systemDegraded,
93
- displayName: "System Health Degraded",
94
- description:
95
- "Fired when a system's health status transitions from healthy to degraded/unhealthy",
96
- category: "Health",
97
- payloadSchema: systemDegradedPayloadSchema,
98
- },
99
- pluginMetadata,
100
- );
101
-
102
- integrationEvents.registerEvent(
103
- {
104
- hook: healthCheckHooks.systemHealthy,
105
- displayName: "System Health Restored",
106
- description: "Fired when a system's health status recovers to healthy",
107
- category: "Health",
108
- payloadSchema: systemHealthyPayloadSchema,
109
- },
110
- pluginMetadata,
69
+ // ─── Automation Platform: triggers + artifact type ─────────────────
70
+ // Buffered behind the extension point until automation-backend's
71
+ // register() runs. Actions are wired in afterPluginsReady where
72
+ // `emitHook` becomes available.
73
+ const automationTriggers = env.getExtensionPoint(
74
+ automationTriggerExtensionPoint,
111
75
  );
76
+ for (const trigger of healthCheckTriggers) {
77
+ automationTriggers.registerTrigger(trigger, pluginMetadata);
78
+ }
79
+ env
80
+ .getExtensionPoint(automationArtifactTypeExtensionPoint)
81
+ .registerArtifactType(assignmentArtifactType, pluginMetadata);
112
82
 
113
83
  // ─── GitOps Entity Kind Registration ───────────────────────────────
114
84
  // Mutable refs — populated during init(), consumed by reconcile closures.
@@ -249,6 +219,7 @@ export default createBackendPlugin({
249
219
  getEmitHook: () => storedEmitHook,
250
220
  cache,
251
221
  configService: config,
222
+ catalogClient,
252
223
  });
253
224
  rpc.registerRouter(healthCheckRouter, healthCheckContract);
254
225
 
@@ -325,6 +296,20 @@ export default createBackendPlugin({
325
296
  healthCheckRegistry,
326
297
  collectorRegistry,
327
298
  );
299
+
300
+ // Register automation actions now that `emitHook` + `queueManager`
301
+ // are both available.
302
+ const automationActions = env.getExtensionPoint(
303
+ automationActionExtensionPoint,
304
+ );
305
+ for (const action of createHealthCheckActions({
306
+ service,
307
+ queueManager,
308
+ emitHook,
309
+ })) {
310
+ automationActions.registerAction(action, pluginMetadata);
311
+ }
312
+
328
313
  onHook(
329
314
  catalogHooks.systemDeleted,
330
315
  async (payload) => {