@checkstack/healthcheck-backend 1.1.4 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/router.ts CHANGED
@@ -7,6 +7,7 @@ import {
7
7
  type HealthCheckRegistry,
8
8
  type SafeDatabase,
9
9
  type CollectorRegistry,
10
+ type ConfigService,
10
11
  } from "@checkstack/backend-api";
11
12
  import { healthCheckContract } from "@checkstack/healthcheck-common";
12
13
  import type { StrategyCategory } from "@checkstack/healthcheck-common";
@@ -16,6 +17,7 @@ import * as schema from "./schema";
16
17
  import { toJsonSchemaWithChartMeta } from "./schema-utils";
17
18
  import type { InferClient } from "@checkstack/common";
18
19
  import { GitOpsApi } from "@checkstack/gitops-common";
20
+ import { CatalogApi } from "@checkstack/catalog-common";
19
21
  import type { HealthCheckCache } from "./cache";
20
22
 
21
23
  /**
@@ -31,10 +33,26 @@ export const createHealthCheckRouter = (opts: {
31
33
  gitOpsClient: InferClient<typeof GitOpsApi>;
32
34
  getEmitHook: () => ((hook: { id: string }, payload: Record<string, unknown>) => Promise<void>) | undefined;
33
35
  cache: HealthCheckCache;
36
+ configService: ConfigService;
37
+ catalogClient: InferClient<typeof CatalogApi>;
34
38
  }) => {
35
- const { database, registry, collectorRegistry, getEmitHook, cache } = opts;
39
+ const {
40
+ database,
41
+ registry,
42
+ collectorRegistry,
43
+ getEmitHook,
44
+ cache,
45
+ configService,
46
+ catalogClient,
47
+ } = opts;
36
48
  // Create service instance once - shared across all handlers
37
- const service = new HealthCheckService(database, registry, collectorRegistry);
49
+ const service = new HealthCheckService(
50
+ database,
51
+ registry,
52
+ collectorRegistry,
53
+ configService,
54
+ catalogClient,
55
+ );
38
56
 
39
57
  // Create contract implementer with context type AND auto auth middleware
40
58
  const os = implement(healthCheckContract)
@@ -222,6 +240,16 @@ export const createHealthCheckRouter = (opts: {
222
240
  }
223
241
  }),
224
242
 
243
+ getPlatformNotificationDefaults:
244
+ os.getPlatformNotificationDefaults.handler(async () => {
245
+ return service.getPlatformNotificationDefaults();
246
+ }),
247
+
248
+ setPlatformNotificationDefaults:
249
+ os.setPlatformNotificationDefaults.handler(async ({ input }) => {
250
+ await service.setPlatformNotificationDefaults(input);
251
+ }),
252
+
225
253
  getRetentionConfig: os.getRetentionConfig.handler(async ({ input }) => {
226
254
  return service.getRetentionConfig(input.systemId, input.configurationId);
227
255
  }),
package/src/schema.ts CHANGED
@@ -9,10 +9,12 @@ import {
9
9
  timestamp,
10
10
  primaryKey,
11
11
  unique,
12
+ index,
12
13
  } from "drizzle-orm/pg-core";
13
14
  import type {
14
15
  StateThresholds,
15
16
  CollectorConfigEntry,
17
+ NotificationPolicy,
16
18
  } from "@checkstack/healthcheck-common";
17
19
  import type { VersionedRecord } from "@checkstack/backend-api";
18
20
 
@@ -100,6 +102,12 @@ export const systemHealthChecks = pgTable(
100
102
  * Defaults to true. Only relevant when satelliteIds is set.
101
103
  */
102
104
  includeLocal: boolean("include_local").default(true).notNull(),
105
+ /**
106
+ * Per-association notification policy. Null falls back to platform
107
+ * defaults (no suppression).
108
+ */
109
+ notificationPolicy:
110
+ jsonb("notification_policy").$type<NotificationPolicy>(),
103
111
  createdAt: timestamp("created_at").defaultNow().notNull(),
104
112
  updatedAt: timestamp("updated_at").defaultNow().notNull(),
105
113
  },
@@ -108,6 +116,74 @@ export const systemHealthChecks = pgTable(
108
116
  }),
109
117
  );
110
118
 
119
+ /**
120
+ * Records each time a check's *evaluated* state transitions from
121
+ * non-unhealthy to unhealthy. Used to decide whether the per-check
122
+ * incident threshold (N transitions in M minutes) has been met.
123
+ * Pruned by the retention job alongside raw runs.
124
+ */
125
+ export const healthCheckUnhealthyTransitions = pgTable(
126
+ "health_check_unhealthy_transitions",
127
+ {
128
+ id: uuid("id").primaryKey().defaultRandom(),
129
+ configurationId: uuid("configuration_id")
130
+ .notNull()
131
+ .references(() => healthCheckConfigurations.id, { onDelete: "cascade" }),
132
+ systemId: text("system_id").notNull(),
133
+ transitionedAt: timestamp("transitioned_at").defaultNow().notNull(),
134
+ },
135
+ (t) => ({
136
+ // Powers the threshold count query
137
+ // (WHERE configuration_id = ? AND system_id = ? AND transitioned_at > ?).
138
+ lookupIdx: index(
139
+ "health_check_unhealthy_transitions_lookup_idx",
140
+ ).on(t.configurationId, t.systemId, t.transitionedAt),
141
+ }),
142
+ );
143
+
144
+ /**
145
+ * Mapping of auto-opened incidents back to the system + check that
146
+ * triggered them. `closedAt` stays null while the incident is active;
147
+ * the auto-close worker sets it once the linked system has been
148
+ * steadily healthy for the cooldown.
149
+ *
150
+ * No FK to the incident table — that lives in another plugin's schema
151
+ * and we treat it as a soft reference (incident deletes are handled
152
+ * by the auto-close worker, which tolerates missing rows).
153
+ */
154
+ export const healthCheckAutoIncidents = pgTable(
155
+ "health_check_auto_incidents",
156
+ {
157
+ id: uuid("id").primaryKey().defaultRandom(),
158
+ incidentId: uuid("incident_id").notNull(),
159
+ systemId: text("system_id").notNull(),
160
+ configurationId: uuid("configuration_id")
161
+ .notNull()
162
+ .references(() => healthCheckConfigurations.id, { onDelete: "cascade" }),
163
+ openedAt: timestamp("opened_at").defaultNow().notNull(),
164
+ closedAt: timestamp("closed_at"),
165
+ /**
166
+ * Auto-close cooldown snapshot taken when the incident was opened.
167
+ * `null` means "never auto-close" — the worker leaves this
168
+ * incident alone and an operator must resolve it manually. Stored
169
+ * per-row so a later policy change doesn't retroactively alter
170
+ * the close behaviour of incidents already in flight.
171
+ */
172
+ cooldownMinutes: integer("cooldown_minutes"),
173
+ },
174
+ (t) => ({
175
+ // Powers "is there an active auto-incident for this system?" check.
176
+ activeBySystemIdx: index(
177
+ "health_check_auto_incidents_active_by_system_idx",
178
+ ).on(t.systemId, t.closedAt),
179
+ // Powers "find the most recent close for this assignment" lookup
180
+ // used by the require-recovery-before-reopen check.
181
+ lastCloseByAssignmentIdx: index(
182
+ "health_check_auto_incidents_last_close_idx",
183
+ ).on(t.configurationId, t.systemId, t.closedAt),
184
+ }),
185
+ );
186
+
111
187
  export const healthCheckRuns = pgTable("health_check_runs", {
112
188
  id: uuid("id").primaryKey().defaultRandom(),
113
189
  configurationId: uuid("configuration_id")
@@ -0,0 +1,184 @@
1
+ import { describe, it, expect, mock, beforeEach } from "bun:test";
2
+ import { HealthCheckService } from "./service";
3
+
4
+ /**
5
+ * Tests for getAssignmentsForSatellite run-context population:
6
+ * - assignments carry configName (from the config row's name)
7
+ * - systemName resolves via the optional catalog client, falling back to
8
+ * systemId when no client is wired or the lookup fails.
9
+ */
10
+ describe("HealthCheckService.getAssignmentsForSatellite", () => {
11
+ const SATELLITE_ID = "sat-1";
12
+
13
+ type Association = {
14
+ systemId: string;
15
+ configurationId: string;
16
+ satelliteIds: string[] | null;
17
+ enabled: boolean;
18
+ };
19
+
20
+ type Config = {
21
+ id: string;
22
+ name: string;
23
+ strategyId: string;
24
+ config: Record<string, unknown>;
25
+ collectors: unknown[] | null;
26
+ intervalSeconds: number;
27
+ paused: boolean;
28
+ };
29
+
30
+ let associations: Association[] = [];
31
+ let configs: Config[] = [];
32
+
33
+ /**
34
+ * Mock db: the method issues two distinct select shapes:
35
+ * - associations: .select({...}).from(systemHealthChecks) -> awaited array
36
+ * - config: .select().from(...).where(...) -> awaited array
37
+ * We disambiguate by call order: the first select() resolves associations,
38
+ * subsequent select().from().where() resolve a single matching config.
39
+ */
40
+ function createMockDb() {
41
+ let firstSelect = true;
42
+ return {
43
+ select: mock(() => {
44
+ if (firstSelect) {
45
+ firstSelect = false;
46
+ return {
47
+ from: mock(() => Promise.resolve([...associations])),
48
+ };
49
+ }
50
+ return {
51
+ from: mock(() => ({
52
+ where: mock(() => {
53
+ // Always return the first seeded config (or no rows). The loop fetches
54
+ // one config per matching association, so every lookup gets the same row.
55
+ return Promise.resolve(configs.length > 0 ? [configs[0]] : []);
56
+ }),
57
+ })),
58
+ };
59
+ }),
60
+ };
61
+ }
62
+
63
+ beforeEach(() => {
64
+ associations = [];
65
+ configs = [];
66
+ });
67
+
68
+ it("populates configName and resolves systemName via the catalog client", async () => {
69
+ associations = [
70
+ {
71
+ systemId: "system-1",
72
+ configurationId: "config-1",
73
+ satelliteIds: [SATELLITE_ID],
74
+ enabled: true,
75
+ },
76
+ ];
77
+ configs = [
78
+ {
79
+ id: "config-1",
80
+ name: "API health",
81
+ strategyId: "http",
82
+ config: { url: "https://example.com" },
83
+ collectors: null,
84
+ intervalSeconds: 60,
85
+ paused: false,
86
+ },
87
+ ];
88
+
89
+ const getSystem = mock(() =>
90
+ Promise.resolve({ id: "system-1", name: "Production API" }),
91
+ );
92
+ const catalogClient = { getSystem } as never;
93
+
94
+ const mockDb = createMockDb();
95
+ const service = new HealthCheckService(
96
+ mockDb as never,
97
+ {} as never,
98
+ {} as never,
99
+ undefined,
100
+ catalogClient,
101
+ );
102
+
103
+ const result = await service.getAssignmentsForSatellite(SATELLITE_ID);
104
+
105
+ expect(result).toHaveLength(1);
106
+ expect(result[0].configName).toBe("API health");
107
+ expect(result[0].systemName).toBe("Production API");
108
+ expect(getSystem).toHaveBeenCalledWith({ systemId: "system-1" });
109
+ });
110
+
111
+ it("falls back to systemId when no catalog client is provided", async () => {
112
+ associations = [
113
+ {
114
+ systemId: "system-1",
115
+ configurationId: "config-1",
116
+ satelliteIds: [SATELLITE_ID],
117
+ enabled: true,
118
+ },
119
+ ];
120
+ configs = [
121
+ {
122
+ id: "config-1",
123
+ name: "API health",
124
+ strategyId: "http",
125
+ config: {},
126
+ collectors: null,
127
+ intervalSeconds: 30,
128
+ paused: false,
129
+ },
130
+ ];
131
+
132
+ const mockDb = createMockDb();
133
+ const service = new HealthCheckService(
134
+ mockDb as never,
135
+ {} as never,
136
+ {} as never,
137
+ );
138
+
139
+ const result = await service.getAssignmentsForSatellite(SATELLITE_ID);
140
+
141
+ expect(result).toHaveLength(1);
142
+ expect(result[0].configName).toBe("API health");
143
+ expect(result[0].systemName).toBe("system-1");
144
+ });
145
+
146
+ it("falls back to systemId when the catalog lookup throws", async () => {
147
+ associations = [
148
+ {
149
+ systemId: "system-1",
150
+ configurationId: "config-1",
151
+ satelliteIds: [SATELLITE_ID],
152
+ enabled: true,
153
+ },
154
+ ];
155
+ configs = [
156
+ {
157
+ id: "config-1",
158
+ name: "API health",
159
+ strategyId: "http",
160
+ config: {},
161
+ collectors: null,
162
+ intervalSeconds: 60,
163
+ paused: false,
164
+ },
165
+ ];
166
+
167
+ const getSystem = mock(() => Promise.reject(new Error("catalog down")));
168
+ const catalogClient = { getSystem } as never;
169
+
170
+ const mockDb = createMockDb();
171
+ const service = new HealthCheckService(
172
+ mockDb as never,
173
+ {} as never,
174
+ {} as never,
175
+ undefined,
176
+ catalogClient,
177
+ );
178
+
179
+ const result = await service.getAssignmentsForSatellite(SATELLITE_ID);
180
+
181
+ expect(result).toHaveLength(1);
182
+ expect(result[0].systemName).toBe("system-1");
183
+ });
184
+ });
@@ -0,0 +1,174 @@
1
+ import { describe, it, expect, mock } from "bun:test";
2
+ import { HealthCheckService } from "./service";
3
+ import { createMockDb } from "@checkstack/test-utils-backend";
4
+ import {
5
+ DEFAULT_NOTIFICATION_POLICY,
6
+ type NotificationPolicy,
7
+ } from "@checkstack/healthcheck-common";
8
+
9
+ /**
10
+ * Build a service whose only DB interaction is the chain used by
11
+ * `getAssignmentNotificationPolicy`. The chain ends in `.limit(1)` and
12
+ * returns the supplied rows verbatim. An optional in-memory platform
13
+ * default stands in for the ConfigService.
14
+ */
15
+ function buildServiceWithRows(
16
+ rows: unknown[],
17
+ platformDefault?: NotificationPolicy,
18
+ ): HealthCheckService {
19
+ const mockDb = createMockDb();
20
+ const limitChain = mock(async () => rows);
21
+ const whereChain = mock(() => ({ limit: limitChain }));
22
+ const fromChain = mock(() => ({ where: whereChain }));
23
+ const selectChain = mock(() => ({ from: fromChain }));
24
+ (mockDb as { select: unknown }).select = selectChain;
25
+
26
+ const configService =
27
+ platformDefault === undefined
28
+ ? undefined
29
+ : ({
30
+ get: mock(async () => platformDefault),
31
+ set: mock(async () => {}),
32
+ } as never);
33
+
34
+ return new HealthCheckService(
35
+ mockDb as never,
36
+ {} as never,
37
+ {} as never,
38
+ configService,
39
+ );
40
+ }
41
+
42
+ describe("HealthCheckService.getAssignmentNotificationPolicy", () => {
43
+ it("falls back to compile-time defaults when no association and no platform defaults", async () => {
44
+ const service = buildServiceWithRows([]);
45
+ const policy = await service.getAssignmentNotificationPolicy({
46
+ systemId: "sys-1",
47
+ configurationId: "cfg-1",
48
+ });
49
+ expect(policy).toEqual(DEFAULT_NOTIFICATION_POLICY);
50
+ });
51
+
52
+ it("falls back to platform defaults when association exists but notificationPolicy is null", async () => {
53
+ const customPlatformDefault: NotificationPolicy = {
54
+ ...DEFAULT_NOTIFICATION_POLICY,
55
+ autoCloseAfterMinutes: 120,
56
+ sustainedUnhealthyTrigger: { enabled: true, durationMinutes: 15 },
57
+ };
58
+ const service = buildServiceWithRows(
59
+ [{ notificationPolicy: null }],
60
+ customPlatformDefault,
61
+ );
62
+ const policy = await service.getAssignmentNotificationPolicy({
63
+ systemId: "sys-1",
64
+ configurationId: "cfg-1",
65
+ });
66
+ expect(policy.autoCloseAfterMinutes).toBe(120);
67
+ expect(policy.sustainedUnhealthyTrigger.durationMinutes).toBe(15);
68
+ });
69
+
70
+ it("falls back to platform defaults when no association exists", async () => {
71
+ const customPlatformDefault: NotificationPolicy = {
72
+ ...DEFAULT_NOTIFICATION_POLICY,
73
+ flappingTrigger: { enabled: true, transitions: 10, windowMinutes: 30 },
74
+ };
75
+ const service = buildServiceWithRows([], customPlatformDefault);
76
+ const policy = await service.getAssignmentNotificationPolicy({
77
+ systemId: "sys-1",
78
+ configurationId: "cfg-1",
79
+ });
80
+ expect(policy.flappingTrigger).toEqual({
81
+ enabled: true,
82
+ transitions: 10,
83
+ windowMinutes: 30,
84
+ });
85
+ });
86
+
87
+ it("prefers per-assignment override over platform defaults", async () => {
88
+ const platformDefault: NotificationPolicy = {
89
+ ...DEFAULT_NOTIFICATION_POLICY,
90
+ autoOpenIncidentOnUnhealthy: false,
91
+ };
92
+ const assignmentOverride = {
93
+ suppressDeEscalations: true,
94
+ autoOpenIncidentOnUnhealthy: true, // overrides platform default
95
+ useNotificationSuppression: true,
96
+ skipDuringMaintenance: true,
97
+ sustainedUnhealthyTrigger: { enabled: true, durationMinutes: 30 },
98
+ flappingTrigger: { enabled: true, transitions: 3, windowMinutes: 60 },
99
+ autoCloseAfterMinutes: 30,
100
+ };
101
+ const service = buildServiceWithRows(
102
+ [{ notificationPolicy: assignmentOverride }],
103
+ platformDefault,
104
+ );
105
+ const policy = await service.getAssignmentNotificationPolicy({
106
+ systemId: "sys-1",
107
+ configurationId: "cfg-1",
108
+ });
109
+ expect(policy.autoOpenIncidentOnUnhealthy).toBe(true);
110
+ expect(policy.suppressDeEscalations).toBe(true);
111
+ });
112
+
113
+ it("fills in defaults for partial stored policies", async () => {
114
+ // Older rows may have only `suppressDeEscalations` set from the
115
+ // first migration. All other fields must default in.
116
+ const service = buildServiceWithRows([
117
+ { notificationPolicy: { suppressDeEscalations: true } },
118
+ ]);
119
+ const policy = await service.getAssignmentNotificationPolicy({
120
+ systemId: "sys-1",
121
+ configurationId: "cfg-1",
122
+ });
123
+ expect(policy.suppressDeEscalations).toBe(true);
124
+ expect(policy.autoOpenIncidentOnUnhealthy).toBe(true);
125
+ expect(policy.useNotificationSuppression).toBe(true);
126
+ expect(policy.skipDuringMaintenance).toBe(true);
127
+ expect(policy.sustainedUnhealthyTrigger).toEqual({
128
+ enabled: true,
129
+ durationMinutes: 30,
130
+ });
131
+ expect(policy.flappingTrigger).toEqual({
132
+ enabled: true,
133
+ transitions: 3,
134
+ windowMinutes: 60,
135
+ });
136
+ expect(policy.autoCloseAfterMinutes).toBe(30);
137
+ });
138
+
139
+ it("returns explicit values exactly when fully specified", async () => {
140
+ const service = buildServiceWithRows([
141
+ {
142
+ notificationPolicy: {
143
+ suppressDeEscalations: false,
144
+ autoOpenIncidentOnUnhealthy: false,
145
+ useNotificationSuppression: false,
146
+ skipDuringMaintenance: false,
147
+ sustainedUnhealthyTrigger: { enabled: false, durationMinutes: 15 },
148
+ flappingTrigger: {
149
+ enabled: true,
150
+ transitions: 5,
151
+ windowMinutes: 30,
152
+ },
153
+ autoCloseAfterMinutes: null,
154
+ },
155
+ },
156
+ ]);
157
+ const policy = await service.getAssignmentNotificationPolicy({
158
+ systemId: "sys-1",
159
+ configurationId: "cfg-1",
160
+ });
161
+ expect(policy.autoOpenIncidentOnUnhealthy).toBe(false);
162
+ expect(policy.skipDuringMaintenance).toBe(false);
163
+ expect(policy.sustainedUnhealthyTrigger).toEqual({
164
+ enabled: false,
165
+ durationMinutes: 15,
166
+ });
167
+ expect(policy.flappingTrigger).toEqual({
168
+ enabled: true,
169
+ transitions: 5,
170
+ windowMinutes: 30,
171
+ });
172
+ expect(policy.autoCloseAfterMinutes).toBeNull();
173
+ });
174
+ });