@checkstack/healthcheck-backend 1.1.3 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/schema.ts CHANGED
@@ -9,10 +9,12 @@ import {
9
9
  timestamp,
10
10
  primaryKey,
11
11
  unique,
12
+ index,
12
13
  } from "drizzle-orm/pg-core";
13
14
  import type {
14
15
  StateThresholds,
15
16
  CollectorConfigEntry,
17
+ NotificationPolicy,
16
18
  } from "@checkstack/healthcheck-common";
17
19
  import type { VersionedRecord } from "@checkstack/backend-api";
18
20
 
@@ -100,6 +102,12 @@ export const systemHealthChecks = pgTable(
100
102
  * Defaults to true. Only relevant when satelliteIds is set.
101
103
  */
102
104
  includeLocal: boolean("include_local").default(true).notNull(),
105
+ /**
106
+ * Per-association notification policy. Null falls back to platform
107
+ * defaults (no suppression).
108
+ */
109
+ notificationPolicy:
110
+ jsonb("notification_policy").$type<NotificationPolicy>(),
103
111
  createdAt: timestamp("created_at").defaultNow().notNull(),
104
112
  updatedAt: timestamp("updated_at").defaultNow().notNull(),
105
113
  },
@@ -108,6 +116,74 @@ export const systemHealthChecks = pgTable(
108
116
  }),
109
117
  );
110
118
 
119
/**
 * Log of unhealthy transitions: one row for every moment a check's
 * *evaluated* state moved from non-unhealthy to unhealthy for a system.
 * The rows are counted to decide whether the per-check incident
 * threshold (N transitions in M minutes) has been reached, and they are
 * pruned by the retention job alongside raw runs.
 *
 * Rows disappear together with their owning configuration
 * (`onDelete: "cascade"` on `configuration_id`).
 */
export const healthCheckUnhealthyTransitions = pgTable(
  "health_check_unhealthy_transitions",
  {
    id: uuid("id").primaryKey().defaultRandom(),
    configurationId: uuid("configuration_id")
      .notNull()
      .references(() => healthCheckConfigurations.id, {
        onDelete: "cascade",
      }),
    systemId: text("system_id").notNull(),
    transitionedAt: timestamp("transitioned_at").defaultNow().notNull(),
  },
  (table) => {
    // Composite index backing the threshold count query:
    //   WHERE configuration_id = ? AND system_id = ? AND transitioned_at > ?
    const lookupIdx = index(
      "health_check_unhealthy_transitions_lookup_idx",
    ).on(table.configurationId, table.systemId, table.transitionedAt);

    return { lookupIdx };
  },
);
143
+
144
/**
 * Links every auto-opened incident to the (system, check configuration)
 * pair that triggered it. While the incident is active `closedAt` is
 * null; the auto-close worker fills it in once the linked system has
 * been steadily healthy for the cooldown.
 *
 * `incidentId` deliberately carries no foreign key: the incident table
 * belongs to another plugin's schema, so it is treated as a soft
 * reference. Incident deletes are handled by the auto-close worker,
 * which tolerates missing rows.
 */
export const healthCheckAutoIncidents = pgTable(
  "health_check_auto_incidents",
  {
    id: uuid("id").primaryKey().defaultRandom(),
    incidentId: uuid("incident_id").notNull(),
    systemId: text("system_id").notNull(),
    configurationId: uuid("configuration_id")
      .notNull()
      .references(() => healthCheckConfigurations.id, {
        onDelete: "cascade",
      }),
    openedAt: timestamp("opened_at").defaultNow().notNull(),
    closedAt: timestamp("closed_at"),
    /**
     * Snapshot of the auto-close cooldown, captured when the incident
     * was opened. `null` means "never auto-close": the worker leaves
     * the incident alone and an operator must resolve it manually.
     * Keeping the value on the row means a later policy change cannot
     * retroactively alter how in-flight incidents are closed.
     */
    cooldownMinutes: integer("cooldown_minutes"),
  },
  (table) => {
    // Answers "is there an active auto-incident for this system?".
    const activeBySystemIdx = index(
      "health_check_auto_incidents_active_by_system_idx",
    ).on(table.systemId, table.closedAt);

    // Answers "what was the most recent close for this assignment?",
    // which the require-recovery-before-reopen check relies on.
    const lastCloseByAssignmentIdx = index(
      "health_check_auto_incidents_last_close_idx",
    ).on(table.configurationId, table.systemId, table.closedAt);

    return { activeBySystemIdx, lastCloseByAssignmentIdx };
  },
);
186
+
111
187
  export const healthCheckRuns = pgTable("health_check_runs", {
112
188
  id: uuid("id").primaryKey().defaultRandom(),
113
189
  configurationId: uuid("configuration_id")
@@ -0,0 +1,174 @@
1
+ import { describe, it, expect, mock } from "bun:test";
2
+ import { HealthCheckService } from "./service";
3
+ import { createMockDb } from "@checkstack/test-utils-backend";
4
+ import {
5
+ DEFAULT_NOTIFICATION_POLICY,
6
+ type NotificationPolicy,
7
+ } from "@checkstack/healthcheck-common";
8
+
9
/**
 * Test helper that constructs a `HealthCheckService` backed by a mock
 * database supporting exactly one query shape:
 * `select().from().where().limit()`, which is the chain used by
 * `getAssignmentNotificationPolicy`. The final `.limit()` resolves to
 * `rows` unchanged.
 *
 * @param rows - Rows the mocked query resolves to, returned verbatim.
 * @param platformDefault - When given, an in-memory stand-in for the
 *   ConfigService is injected whose `get` resolves to this policy.
 *   When omitted, the service is built without a config service.
 * @returns A service instance wired to the mocks above.
 */
function buildServiceWithRows(
  rows: unknown[],
  platformDefault?: NotificationPolicy,
): HealthCheckService {
  const db = createMockDb();

  // Assemble the query chain from the innermost call outwards; only the
  // terminal `limit` is async.
  const limit = mock(async () => rows);
  const where = mock(() => ({ limit }));
  const from = mock(() => ({ where }));
  const select = mock(() => ({ from }));
  (db as { select: unknown }).select = select;

  // The fake config service only exists when a platform default was given.
  const configService =
    platformDefault !== undefined
      ? ({
          get: mock(async () => platformDefault),
          set: mock(async () => {}),
        } as never)
      : undefined;

  // The two registries are never touched by the method under test.
  return new HealthCheckService(
    db as never,
    {} as never,
    {} as never,
    configService,
  );
}
41
+
42
describe("HealthCheckService.getAssignmentNotificationPolicy", () => {
  // Every case resolves the policy for the same (system, configuration) pair.
  const assignment = { systemId: "sys-1", configurationId: "cfg-1" };

  it("falls back to compile-time defaults when no association and no platform defaults", async () => {
    const service = buildServiceWithRows([]);

    const resolved = await service.getAssignmentNotificationPolicy(assignment);

    expect(resolved).toEqual(DEFAULT_NOTIFICATION_POLICY);
  });

  it("falls back to platform defaults when association exists but notificationPolicy is null", async () => {
    const platformPolicy: NotificationPolicy = {
      ...DEFAULT_NOTIFICATION_POLICY,
      autoCloseAfterMinutes: 120,
      sustainedUnhealthyTrigger: { enabled: true, durationMinutes: 15 },
    };
    const storedRows = [{ notificationPolicy: null }];
    const service = buildServiceWithRows(storedRows, platformPolicy);

    const resolved = await service.getAssignmentNotificationPolicy(assignment);

    expect(resolved.autoCloseAfterMinutes).toBe(120);
    expect(resolved.sustainedUnhealthyTrigger.durationMinutes).toBe(15);
  });

  it("falls back to platform defaults when no association exists", async () => {
    const platformPolicy: NotificationPolicy = {
      ...DEFAULT_NOTIFICATION_POLICY,
      flappingTrigger: { enabled: true, transitions: 10, windowMinutes: 30 },
    };
    const service = buildServiceWithRows([], platformPolicy);

    const resolved = await service.getAssignmentNotificationPolicy(assignment);

    expect(resolved.flappingTrigger).toEqual({
      enabled: true,
      transitions: 10,
      windowMinutes: 30,
    });
  });

  it("prefers per-assignment override over platform defaults", async () => {
    const platformPolicy: NotificationPolicy = {
      ...DEFAULT_NOTIFICATION_POLICY,
      autoOpenIncidentOnUnhealthy: false,
    };
    // `autoOpenIncidentOnUnhealthy` is the field that contradicts the
    // platform default above.
    const storedOverride = {
      suppressDeEscalations: true,
      autoOpenIncidentOnUnhealthy: true,
      useNotificationSuppression: true,
      skipDuringMaintenance: true,
      sustainedUnhealthyTrigger: { enabled: true, durationMinutes: 30 },
      flappingTrigger: { enabled: true, transitions: 3, windowMinutes: 60 },
      autoCloseAfterMinutes: 30,
    };
    const service = buildServiceWithRows(
      [{ notificationPolicy: storedOverride }],
      platformPolicy,
    );

    const resolved = await service.getAssignmentNotificationPolicy(assignment);

    expect(resolved.autoOpenIncidentOnUnhealthy).toBe(true);
    expect(resolved.suppressDeEscalations).toBe(true);
  });

  it("fills in defaults for partial stored policies", async () => {
    // Rows written by the first migration may carry nothing but
    // `suppressDeEscalations`; every other field has to default in.
    const legacyRow = { notificationPolicy: { suppressDeEscalations: true } };
    const service = buildServiceWithRows([legacyRow]);

    const resolved = await service.getAssignmentNotificationPolicy(assignment);

    expect(resolved.suppressDeEscalations).toBe(true);
    expect(resolved.autoOpenIncidentOnUnhealthy).toBe(true);
    expect(resolved.useNotificationSuppression).toBe(true);
    expect(resolved.skipDuringMaintenance).toBe(true);
    expect(resolved.sustainedUnhealthyTrigger).toEqual({
      enabled: true,
      durationMinutes: 30,
    });
    expect(resolved.flappingTrigger).toEqual({
      enabled: true,
      transitions: 3,
      windowMinutes: 60,
    });
    expect(resolved.autoCloseAfterMinutes).toBe(30);
  });

  it("returns explicit values exactly when fully specified", async () => {
    const fullySpecified = {
      suppressDeEscalations: false,
      autoOpenIncidentOnUnhealthy: false,
      useNotificationSuppression: false,
      skipDuringMaintenance: false,
      sustainedUnhealthyTrigger: { enabled: false, durationMinutes: 15 },
      flappingTrigger: {
        enabled: true,
        transitions: 5,
        windowMinutes: 30,
      },
      autoCloseAfterMinutes: null,
    };
    const service = buildServiceWithRows([
      { notificationPolicy: fullySpecified },
    ]);

    const resolved = await service.getAssignmentNotificationPolicy(assignment);

    expect(resolved.autoOpenIncidentOnUnhealthy).toBe(false);
    expect(resolved.skipDuringMaintenance).toBe(false);
    expect(resolved.sustainedUnhealthyTrigger).toEqual({
      enabled: false,
      durationMinutes: 15,
    });
    expect(resolved.flappingTrigger).toEqual({
      enabled: true,
      transitions: 5,
      windowMinutes: 30,
    });
    expect(resolved.autoCloseAfterMinutes).toBeNull();
  });
});
package/src/service.ts CHANGED
@@ -6,7 +6,16 @@ import {
6
6
  HealthCheckStatus,
7
7
  RetentionConfig,
8
8
  type HealthCheckRunResult,
9
+ type NotificationPolicy,
10
+ NotificationPolicySchema,
11
+ DEFAULT_NOTIFICATION_POLICY,
9
12
  } from "@checkstack/healthcheck-common";
13
+ import type { ConfigService } from "@checkstack/backend-api";
14
+ import {
15
+ notificationDefaultsConfigV1,
16
+ NOTIFICATION_DEFAULTS_CONFIG_ID,
17
+ NOTIFICATION_DEFAULTS_CONFIG_VERSION,
18
+ } from "./notification-defaults-config";
10
19
  import {
11
20
  healthCheckConfigurations,
12
21
  systemHealthChecks,
@@ -15,7 +24,16 @@ import {
15
24
  VersionedStateThresholds,
16
25
  } from "./schema";
17
26
  import * as schema from "./schema";
18
- import { eq, and, InferSelectModel, desc, gte, lte, isNull } from "drizzle-orm";
27
+ import {
28
+ eq,
29
+ and,
30
+ InferSelectModel,
31
+ desc,
32
+ gte,
33
+ lte,
34
+ isNull,
35
+ inArray,
36
+ } from "drizzle-orm";
19
37
  import { ORPCError } from "@orpc/server";
20
38
  import { evaluateHealthStatus } from "./state-evaluator";
21
39
  import { stateThresholds } from "./state-thresholds-migrations";
@@ -57,8 +75,56 @@ export class HealthCheckService {
57
75
  private db: Db,
58
76
  private registry: HealthCheckRegistry,
59
77
  private collectorRegistry: CollectorRegistry,
78
+ /**
79
+ * Optional — only required by code paths that resolve platform
80
+ * defaults (notification policy fallback). When absent, callers
81
+ * fall back to the compile-time `DEFAULT_NOTIFICATION_POLICY`.
82
+ * Kept optional so existing GitOps-only / test constructions don't
83
+ * have to plumb it through.
84
+ */
85
+ private configService?: ConfigService,
60
86
  ) {}
61
87
 
88
/**
 * Look up the platform-wide notification policy defaults.
 *
 * Falls back to the compile-time `DEFAULT_NOTIFICATION_POLICY` in two
 * situations: the service was constructed without a `configService`,
 * or the config service yields a nullish value (nothing persisted yet).
 * The versioned config schema is handed to `configService.get`;
 * presumably that is where stored values get missing fields defaulted
 * in — confirm against the ConfigService implementation.
 *
 * @returns The stored platform defaults, or the compile-time defaults.
 */
async getPlatformNotificationDefaults(): Promise<NotificationPolicy> {
  if (this.configService) {
    const persisted = await this.configService.get(
      NOTIFICATION_DEFAULTS_CONFIG_ID,
      notificationDefaultsConfigV1,
      NOTIFICATION_DEFAULTS_CONFIG_VERSION,
    );
    return persisted ?? DEFAULT_NOTIFICATION_POLICY;
  }

  return DEFAULT_NOTIFICATION_POLICY;
}
105
+
106
/**
 * Store new platform-wide notification policy defaults.
 *
 * Assignment rows whose `notificationPolicy` is null pick the new
 * defaults up on their next evaluation. Auto-incidents that are already
 * open are not affected, because their cooldown was snapshotted on the
 * row when they were opened.
 *
 * @param policy - The policy to persist as the platform default.
 * @throws Error when the service was constructed without a
 *   `configService`, since there is nowhere to persist to.
 */
async setPlatformNotificationDefaults(
  policy: NotificationPolicy,
): Promise<void> {
  const configService = this.configService;
  if (!configService) {
    throw new Error(
      "ConfigService not configured; cannot persist platform notification defaults",
    );
  }

  await configService.set(
    NOTIFICATION_DEFAULTS_CONFIG_ID,
    notificationDefaultsConfigV1,
    NOTIFICATION_DEFAULTS_CONFIG_VERSION,
    policy,
  );
}
127
+
62
128
  async createConfiguration(
63
129
  data: CreateHealthCheckConfiguration,
64
130
  ): Promise<HealthCheckConfiguration> {
@@ -133,6 +199,7 @@ export class HealthCheckService {
133
199
  stateThresholds?: StateThresholds;
134
200
  satelliteIds?: string[];
135
201
  includeLocal?: boolean;
202
+ notificationPolicy?: NotificationPolicy;
136
203
  }) {
137
204
  const {
138
205
  systemId,
@@ -141,6 +208,7 @@ export class HealthCheckService {
141
208
  stateThresholds: stateThresholds_,
142
209
  satelliteIds,
143
210
  includeLocal = true,
211
+ notificationPolicy,
144
212
  } = props;
145
213
 
146
214
  // Wrap thresholds in versioned config if provided
@@ -156,6 +224,7 @@ export class HealthCheckService {
156
224
  stateThresholds: versionedThresholds,
157
225
  satelliteIds: satelliteIds ?? undefined,
158
226
  includeLocal,
227
+ notificationPolicy: notificationPolicy ?? undefined,
159
228
  })
160
229
  .onConflictDoUpdate({
161
230
  target: [
@@ -167,6 +236,7 @@ export class HealthCheckService {
167
236
  stateThresholds: versionedThresholds,
168
237
  satelliteIds: satelliteIds ?? undefined,
169
238
  includeLocal,
239
+ notificationPolicy: notificationPolicy ?? undefined,
170
240
  updatedAt: new Date(),
171
241
  },
172
242
  });
@@ -282,6 +352,7 @@ export class HealthCheckService {
282
352
  stateThresholds: systemHealthChecks.stateThresholds,
283
353
  satelliteIds: systemHealthChecks.satelliteIds,
284
354
  includeLocal: systemHealthChecks.includeLocal,
355
+ notificationPolicy: systemHealthChecks.notificationPolicy,
285
356
  })
286
357
  .from(systemHealthChecks)
287
358
  .innerJoin(
@@ -304,11 +375,55 @@ export class HealthCheckService {
304
375
  stateThresholds: thresholds,
305
376
  satelliteIds: row.satelliteIds ?? undefined,
306
377
  includeLocal: row.includeLocal,
378
+ notificationPolicy: row.notificationPolicy ?? undefined,
307
379
  });
308
380
  }
309
381
  return results;
310
382
  }
311
383
 
384
/**
 * Produce the fully-defaulted notification policy for one
 * (system, configuration) association. Sources are consulted in this
 * order:
 *
 *   1. The per-assignment override stored in
 *      `systemHealthChecks.notificationPolicy`, when the row exists and
 *      the column is non-null. The stored value is run through
 *      `NotificationPolicySchema.parse`, so keys missing from older
 *      rows are filled in by the schema.
 *   2. The platform-wide defaults held by the `ConfigService`.
 *   3. The compile-time `DEFAULT_NOTIFICATION_POLICY`.
 *
 * Overrides are intentionally all-or-nothing: an assignment is either
 * fully overridden or fully inherits the platform policy. Setting the
 * row's policy back to `null` reverts the override; this is the
 * "Use platform defaults" action in the UI.
 *
 * @param systemId - System half of the association key.
 * @param configurationId - Health check configuration half of the key.
 * @returns The resolved policy for the association.
 */
async getAssignmentNotificationPolicy({
  systemId,
  configurationId,
}: {
  systemId: string;
  configurationId: string;
}): Promise<NotificationPolicy> {
  const matchesAssignment = and(
    eq(systemHealthChecks.systemId, systemId),
    eq(systemHealthChecks.configurationId, configurationId),
  );

  const rows = await this.db
    .select({ notificationPolicy: systemHealthChecks.notificationPolicy })
    .from(systemHealthChecks)
    .where(matchesAssignment)
    .limit(1);
  const association = rows[0];

  // An explicit override wins and is normalised through the schema.
  if (association && association.notificationPolicy !== null) {
    return NotificationPolicySchema.parse(association.notificationPolicy);
  }

  // Either no assignment row exists or it has no override: inherit the
  // platform defaults.
  return this.getPlatformNotificationDefaults();
}
426
+
312
427
  /**
313
428
  * Get the evaluated health status for a system based on configured thresholds.
314
429
  * Aggregates status from all health check configurations for this system.
@@ -489,6 +604,7 @@ export class HealthCheckService {
489
604
  startDate?: Date;
490
605
  endDate?: Date;
491
606
  sourceFilter?: string;
607
+ statusFilter?: HealthCheckStatus[];
492
608
  limit?: number;
493
609
  offset?: number;
494
610
  sortOrder: "asc" | "desc";
@@ -499,6 +615,7 @@ export class HealthCheckService {
499
615
  startDate,
500
616
  endDate,
501
617
  sourceFilter,
618
+ statusFilter,
502
619
  limit = 10,
503
620
  offset = 0,
504
621
  sortOrder,
@@ -518,6 +635,11 @@ export class HealthCheckService {
518
635
  conditions.push(eq(healthCheckRuns.sourceId, sourceFilter));
519
636
  }
520
637
 
638
+ // Status filtering (e.g. only failing runs)
639
+ if (statusFilter && statusFilter.length > 0) {
640
+ conditions.push(inArray(healthCheckRuns.status, statusFilter));
641
+ }
642
+
521
643
  // Build where clause
522
644
  const whereClause = conditions.length > 0 ? and(...conditions) : undefined;
523
645
 
@@ -563,6 +685,7 @@ export class HealthCheckService {
563
685
  startDate?: Date;
564
686
  endDate?: Date;
565
687
  sourceFilter?: string;
688
+ statusFilter?: HealthCheckStatus[];
566
689
  limit?: number;
567
690
  offset?: number;
568
691
  sortOrder: "asc" | "desc";
@@ -573,6 +696,7 @@ export class HealthCheckService {
573
696
  startDate,
574
697
  endDate,
575
698
  sourceFilter,
699
+ statusFilter,
576
700
  limit = 10,
577
701
  offset = 0,
578
702
  sortOrder,
@@ -592,6 +716,11 @@ export class HealthCheckService {
592
716
  conditions.push(eq(healthCheckRuns.sourceId, sourceFilter));
593
717
  }
594
718
 
719
+ // Status filtering (e.g. only failing runs)
720
+ if (statusFilter && statusFilter.length > 0) {
721
+ conditions.push(inArray(healthCheckRuns.status, statusFilter));
722
+ }
723
+
595
724
  const whereClause = conditions.length > 0 ? and(...conditions) : undefined;
596
725
  const total = await this.db.$count(healthCheckRuns, whereClause);
597
726
 
package/tsconfig.json CHANGED
@@ -37,6 +37,9 @@
37
37
  {
38
38
  "path": "../healthcheck-common"
39
39
  },
40
+ {
41
+ "path": "../incident-backend"
42
+ },
40
43
  {
41
44
  "path": "../incident-common"
42
45
  },