@checkstack/healthcheck-common 1.1.2 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,88 @@
1
1
  # @checkstack/healthcheck-common
2
2
 
3
+ ## 1.3.0
4
+
5
+ ### Minor Changes
6
+
7
+ - 35bc682: feat(healthcheck): expose check + system run-context to script collectors
8
+
9
+ Script health checks can now read which check and system a run is for.
10
+ Previously shell scripts got only a curated env whitelist and inline
11
+ scripts only `context.config`, so a script had no built-in way to know
12
+ its own check name or the system it was checking.
13
+
14
+ - `@checkstack/backend-api`: new `CollectorRunContext` type
15
+ (`{ check: { id, name, intervalSeconds }, system: { id, name } }`) and
16
+ an optional `runContext` param on `CollectorStrategy.execute`. Optional,
17
+ so existing collector implementations are unaffected.
18
+ - Shell-script collector: injects reserved `CHECKSTACK_CHECK_ID`,
19
+ `CHECKSTACK_CHECK_NAME`, `CHECKSTACK_CHECK_INTERVAL_SECONDS`,
20
+ `CHECKSTACK_SYSTEM_ID`, `CHECKSTACK_SYSTEM_NAME` env vars (user-supplied
21
+ `env` still wins on collision).
22
+ - Inline-script collector: exposes `context.check` and `context.system`
23
+ alongside `context.config`; the inline-script editor now types them for
24
+ autocomplete.
25
+ - Shell editors (health-check collectors and automation shell actions) now
26
+ also suggest the user's own `env` (JSON) keys as `$NAME` completions, via
27
+ the new exported `customShellEnvVars` helper. Keys that aren't valid shell
28
+ identifiers are omitted.
29
+ - Fix: the Typefox `CodeEditor` captured a stale `onChange` at editor start,
30
+ so editing one `DynamicForm` field reverted sibling fields changed since
31
+ mount (e.g. typing in a shell `script` field wiped an unsaved `env` value,
32
+ or deleted a sibling automation action added after mount). The change
33
+ handler now routes through a ref to the current `onChange`.
34
+ - Fix: focusing a JSON editor threw "LanguageStatusService.addStatus is not
35
+ supported" because the standalone service set omitted `ILanguageStatusService`.
36
+ That one service is now registered via `serviceOverrides`.
37
+ - Fix: the automation trigger card nested a `<Badge>` (a `<div>`) inside a
38
+ `<p>`, producing a `validateDOMNesting` warning. Switched the wrapper to a
39
+ `<div>`.
40
+ - Local runs (`queue-executor`) and satellite runs both populate the
41
+ context. `SatelliteAssignment` (and the `getAssignmentsForSatellite`
42
+ RPC output) gained optional `configName` / `systemName` so the metadata
43
+ reaches satellite-side execution; `HealthCheckService` resolves the
44
+ system name via the catalog client.
45
+
46
+ BREAKING CHANGE: `createHealthCheckRouter` now requires a `catalogClient`
47
+ option (used to resolve system names for satellite assignments). Update
48
+ call sites to pass the catalog RPC client.
49
+
50
+ ### Patch Changes
51
+
52
+ - Updated dependencies [6d52276]
53
+ - @checkstack/common@0.12.0
54
+ - @checkstack/catalog-common@2.2.3
55
+ - @checkstack/notification-common@1.2.1
56
+ - @checkstack/signal-common@0.2.5
57
+
58
+ ## 1.2.0
59
+
60
+ ### Minor Changes
61
+
62
+ - ba07ae2: Quiet down notification spam on flapping systems, auto-open incidents when a check goes critical, and let operators land directly on the broken checks.
63
+
64
+ Notification policy lives **per healthcheck assignment** (one row per `system × configuration`). Different checks on the same system are fully independent — disabling a setting on one check does not affect the others. Defaults preserve existing behaviour for `suppressDeEscalations`; **auto-incident defaults to on** for new and existing assignments.
65
+
66
+ - **`suppressDeEscalations`** (off by default). When on, transitions from a worse state to a better-but-still-failing state (e.g. `unhealthy → degraded`) no longer fire a notification. Escalations and full recoveries to `healthy` are unaffected. Resolved per assignment (the check that just ran is the one driving any aggregate transition).
67
+ - **`autoOpenIncidentOnUnhealthy`** (on by default). Either of two independent triggers can open the auto-incident:
68
+ - **`sustainedUnhealthyTrigger`** (default 30 min) — opens when the check stays continuously unhealthy for the configured duration. Catches real outages.
69
+ - **`flappingTrigger`** (default 3 transitions in 60 min) — opens when the check flips to unhealthy that many times in the window. Catches persistent flapping where each unhealthy phase is too brief for the sustained trigger.
70
+ Each trigger can be individually disabled. One incident per system: triggering checks attach to an existing active auto-incident.
71
+ - **`useNotificationSuppression`** (on by default, only meaningful when auto-open is on). Controls whether the auto-opened incident is created with `suppressNotifications: true` — leaving this off opens the incident but still pings operators on each transition.
72
+ - **`skipDuringMaintenance`** (on by default). No auto-incident is opened while the system has an active maintenance window with notification suppression. The system is intentionally down and shouldn't trip the on-call.
73
+ - **`autoCloseAfterMinutes`** (default 30). Auto-close cooldown is now per-assignment and snapshotted per-incident at open time — later policy edits don't alter in-flight incidents. Setting `null` ("Never auto-close") leaves the incident for manual resolution.
74
+ - **Require-recovery rule.** After any auto-incident closes (manual or auto), no new auto-incident can open until the check has logged at least one healthy run. Prevents an "operator dismissed but it's still broken" loop.
75
+ - **Auto-close worker** ticks every 60s and resolves auto-opened incidents whose systems have been healthy for their per-row `cooldownMinutes`. Rows with `null` cooldown are skipped entirely. Failures are isolated per incident: a failed close attempt is logged but never aborts the sweep.
76
+ - **`incidentResolved` hook subscriber** syncs the auto-incident mapping when an operator manually resolves the incident, so the require-recovery rule sees the close immediately.
77
+ - **Platform-wide defaults.** New admin RPCs `getPlatformNotificationDefaults` / `setPlatformNotificationDefaults` (under the existing `healthcheck.configuration.{read,manage}` access rules) let operators set notification policy once for the whole instance. Per-assignment rows with `notificationPolicy: null` inherit the platform defaults at read time. UI: a "Notification defaults" button in the Assignment IDE opens a modal editor. The per-assignment Notifications panel shows an inheritance banner — "Using platform defaults" (read-only) with an "Override" button, or "Custom override" with a "Use platform defaults" button to revert. The all-or-nothing model keeps the mental model simple: each assignment is either fully inherited or fully overridden.
78
+ - **New service-level RPCs** on the incident plugin (`createAutoIncident`, `resolveAutoIncident`) let other plugins open/close incidents without a user context. Reused by the healthcheck auto-incident flow.
79
+ - **Health-state notification CTA** now deep-links to `?filter=failing` on the system detail page for non-recovery transitions (label changes to "View failing checks"). The system overview gains an `All / Failing / Healthy` segmented filter wired to the same `?filter=…` param.
80
+ - **Notification bell badge** now counts collapse groups instead of raw rows, so the number matches what the user sees in the notifications list. Built on `COUNT(DISTINCT COALESCE(collapse_key, id))` — notifications without a collapse key still each count as one.
81
+ - **`statusFilter` on `getHistory` / `getDetailedHistory`** lets the run-history page and the drawer's Recent Runs panel filter to `All / Healthy / Failing` via shared pills, with the page resetting to the first page on filter change.
82
+ - **Pagination defaults aligned with selector options.** Several pages defaulted to a page size (5 or 20) that wasn't in the dropdown's options (`[10, 25, 50, 100]`), so the page-size `<Select>` rendered empty. The drawer's Recent Runs now defaults to 10; the Run History, History List, and Delivery Logs pages now default to 25.
83
+
84
+ Includes Drizzle migrations adding the `notification_policy` jsonb column to `system_health_checks`, plus two new tables: `health_check_unhealthy_transitions` (for threshold counting) and `health_check_auto_incidents` (for mapping back to incident ids during auto-close).
85
+
3
86
  ## 1.1.2
4
87
 
5
88
  ### Patch Changes
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@checkstack/healthcheck-common",
3
- "version": "1.1.2",
3
+ "version": "1.3.0",
4
4
  "license": "Elastic-2.0",
5
5
  "type": "module",
6
6
  "exports": {
@@ -9,16 +9,16 @@
9
9
  }
10
10
  },
11
11
  "dependencies": {
12
- "@checkstack/common": "0.10.0",
13
- "@checkstack/catalog-common": "2.2.1",
14
- "@checkstack/notification-common": "1.1.1",
15
- "@checkstack/signal-common": "0.2.3",
12
+ "@checkstack/common": "0.11.0",
13
+ "@checkstack/catalog-common": "2.2.2",
14
+ "@checkstack/notification-common": "1.2.0",
15
+ "@checkstack/signal-common": "0.2.4",
16
16
  "zod": "^4.2.1"
17
17
  },
18
18
  "devDependencies": {
19
19
  "typescript": "^5.7.2",
20
20
  "@checkstack/tsconfig": "0.0.7",
21
- "@checkstack/scripts": "0.3.2"
21
+ "@checkstack/scripts": "0.3.3"
22
22
  },
23
23
  "scripts": {
24
24
  "typecheck": "tsgo -b",
@@ -17,6 +17,7 @@ import {
17
17
  RetentionConfigSchema,
18
18
  AggregatedBucketBaseSchema,
19
19
  AggregatedBucketSchema,
20
+ NotificationPolicySchema,
20
21
  } from "./schemas";
21
22
 
22
23
  // --- Response Schemas for Evaluated Status ---
@@ -155,6 +156,8 @@ export const healthCheckContract = {
155
156
  satelliteIds: z.array(z.string()).optional(),
156
157
  /** Whether to also run this check locally on the core (default: true) */
157
158
  includeLocal: z.boolean(),
159
+ /** Per-association notification policy (omitted = platform defaults) */
160
+ notificationPolicy: NotificationPolicySchema.optional(),
158
161
  }),
159
162
  ),
160
163
  ),
@@ -172,6 +175,31 @@ export const healthCheckContract = {
172
175
  )
173
176
  .output(z.void()),
174
177
 
178
+ /**
179
+ * Read the platform-wide notification policy defaults. Per-assignment
180
+ * rows with no override inherit these values; admin tooling reads
181
+ * them to populate the defaults editor. Compile-time defaults fill
182
+ * in any unset fields.
183
+ */
184
+ getPlatformNotificationDefaults: proc({
185
+ operationType: "query",
186
+ userType: "authenticated",
187
+ access: [healthCheckAccess.configuration.read],
188
+ }).output(NotificationPolicySchema),
189
+
190
+ /**
191
+ * Update the platform-wide notification policy defaults. Per-
192
+ * assignment rows that inherit (notificationPolicy = null) will pick
193
+ * up the new values on the next read.
194
+ */
195
+ setPlatformNotificationDefaults: proc({
196
+ operationType: "mutation",
197
+ userType: "authenticated",
198
+ access: [healthCheckAccess.configuration.manage],
199
+ })
200
+ .input(NotificationPolicySchema)
201
+ .output(z.void()),
202
+
175
203
  disassociateSystem: proc({
176
204
  operationType: "mutation",
177
205
  userType: "authenticated",
@@ -238,6 +266,8 @@ export const healthCheckContract = {
238
266
  endDate: z.date().optional(),
239
267
  /** Filter by source: "local" = core only, satellite UUID = specific satellite, undefined = all */
240
268
  sourceFilter: z.string().optional(),
269
+ /** Restrict runs to the listed statuses. Omitted/empty = no filter. */
270
+ statusFilter: z.array(HealthCheckStatusSchema).optional(),
241
271
  limit: z.number().optional().default(10),
242
272
  offset: z.number().optional().default(0),
243
273
  sortOrder: z.enum(["asc", "desc"]),
@@ -263,6 +293,8 @@ export const healthCheckContract = {
263
293
  endDate: z.date().optional(),
264
294
  /** Filter by source: "local" = core only, satellite UUID = specific satellite, undefined = all */
265
295
  sourceFilter: z.string().optional(),
296
+ /** Restrict runs to the listed statuses. Omitted/empty = no filter. */
297
+ statusFilter: z.array(HealthCheckStatusSchema).optional(),
266
298
  limit: z.number().optional().default(10),
267
299
  offset: z.number().optional().default(0),
268
300
  sortOrder: z.enum(["asc", "desc"]),
@@ -425,6 +457,8 @@ export const healthCheckContract = {
425
457
  )
426
458
  .optional(),
427
459
  intervalSeconds: z.number(),
460
+ configName: z.string().optional(),
461
+ systemName: z.string().optional(),
428
462
  }),
429
463
  ),
430
464
  ),
package/src/schemas.ts CHANGED
@@ -207,6 +207,121 @@ export const DEFAULT_STATE_THRESHOLDS: StateThresholds = {
207
207
  unhealthy: { minFailureCount: 5 },
208
208
  };
209
209
 
210
+ // --- Notification Policy ---
211
+
212
+ /**
213
+ * Trigger that opens an auto-incident after a check has been
214
+ * continuously `unhealthy` for at least `durationMinutes`. Resets if
215
+ * the check recovers to non-unhealthy in between.
216
+ */
217
+ export const SustainedUnhealthyTriggerSchema = z.object({
218
+ /** When false, this trigger is fully disabled. */
219
+ enabled: z.boolean().default(true),
220
+ /** Minimum continuous-unhealthy time before opening. */
221
+ durationMinutes: z.number().int().min(1).default(30),
222
+ });
223
+
224
+ export type SustainedUnhealthyTrigger = z.infer<
225
+ typeof SustainedUnhealthyTriggerSchema
226
+ >;
227
+
228
+ /**
229
+ * Trigger that opens an auto-incident when a check has transitioned
230
+ * to `unhealthy` at least `transitions` times within `windowMinutes`.
231
+ * Catches flapping that the sustained-duration trigger would miss
232
+ * because each unhealthy phase is too short.
233
+ */
234
+ export const FlappingTriggerSchema = z.object({
235
+ /** When false, this trigger is fully disabled. */
236
+ enabled: z.boolean().default(true),
237
+ /** Minimum number of transitions-to-unhealthy needed in the window. */
238
+ transitions: z.number().int().min(1).default(3),
239
+ /** Sliding window in minutes the transitions are counted over. */
240
+ windowMinutes: z.number().int().min(1).default(60),
241
+ });
242
+
243
+ export type FlappingTrigger = z.infer<typeof FlappingTriggerSchema>;
244
+
245
+ export const DEFAULT_SUSTAINED_TRIGGER: SustainedUnhealthyTrigger = {
246
+ enabled: true,
247
+ durationMinutes: 30,
248
+ };
249
+
250
+ export const DEFAULT_FLAPPING_TRIGGER: FlappingTrigger = {
251
+ enabled: true,
252
+ transitions: 3,
253
+ windowMinutes: 60,
254
+ };
255
+
256
+ /**
257
+ * Per-association notification preferences. All fields are evaluated
258
+ * per (system, configuration) — different checks on the same system
259
+ * are fully independent.
260
+ */
261
+ export const NotificationPolicySchema = z.object({
262
+ /**
263
+ * When true, do not emit notifications for de-escalations (e.g.
264
+ * `unhealthy → degraded`). Escalations and recoveries to `healthy`
265
+ * still notify.
266
+ */
267
+ suppressDeEscalations: z.boolean().default(false),
268
+ /**
269
+ * When true, the configured triggers can open auto-managed incidents
270
+ * on the system. Setting this to false disables both triggers
271
+ * regardless of their individual `enabled` flags.
272
+ */
273
+ autoOpenIncidentOnUnhealthy: z.boolean().default(true),
274
+ /**
275
+ * When true, the auto-opened incident is created with
276
+ * `suppressNotifications` enabled so further health-state
277
+ * notifications for the system are silenced until the incident is
278
+ * resolved. Only meaningful when `autoOpenIncidentOnUnhealthy` is on.
279
+ */
280
+ useNotificationSuppression: z.boolean().default(true),
281
+ /**
282
+ * When true, no auto-incident is opened while the system has an
283
+ * active maintenance window with notification suppression. The
284
+ * system is intentionally down and shouldn't trip the on-call.
285
+ */
286
+ skipDuringMaintenance: z.boolean().default(true),
287
+ /**
288
+ * Trigger A: "this check has been unhealthy for X minutes
289
+ * continuously." Catches real outages.
290
+ */
291
+ sustainedUnhealthyTrigger: SustainedUnhealthyTriggerSchema.default(
292
+ DEFAULT_SUSTAINED_TRIGGER,
293
+ ),
294
+ /**
295
+ * Trigger B: "this check transitioned to unhealthy N times in M
296
+ * minutes." Catches persistent flapping where no individual
297
+ * unhealthy phase is long enough for the sustained trigger.
298
+ */
299
+ flappingTrigger: FlappingTriggerSchema.default(DEFAULT_FLAPPING_TRIGGER),
300
+ /**
301
+ * Minutes of sustained healthy state required before an auto-opened
302
+ * incident is auto-closed. `null` disables auto-close — the
303
+ * incident stays open until an operator resolves it manually.
304
+ */
305
+ autoCloseAfterMinutes: z
306
+ .number()
307
+ .int()
308
+ .min(1)
309
+ .nullable()
310
+ .default(30),
311
+ });
312
+
313
+ export type NotificationPolicy = z.infer<typeof NotificationPolicySchema>;
314
+
315
+ export const DEFAULT_NOTIFICATION_POLICY: NotificationPolicy = {
316
+ suppressDeEscalations: false,
317
+ autoOpenIncidentOnUnhealthy: true,
318
+ useNotificationSuppression: true,
319
+ skipDuringMaintenance: true,
320
+ sustainedUnhealthyTrigger: DEFAULT_SUSTAINED_TRIGGER,
321
+ flappingTrigger: DEFAULT_FLAPPING_TRIGGER,
322
+ autoCloseAfterMinutes: 30,
323
+ };
324
+
210
325
  export const AssociateHealthCheckSchema = z.object({
211
326
  configurationId: z.string().uuid(),
212
327
  enabled: z.boolean().default(true),
@@ -215,6 +330,8 @@ export const AssociateHealthCheckSchema = z.object({
215
330
  satelliteIds: z.array(z.string()).optional(),
216
331
  /** Whether to also run this check locally on the core instance (default: true) */
217
332
  includeLocal: z.boolean().default(true),
333
+ /** Per-association notification policy. Defaults applied when omitted. */
334
+ notificationPolicy: NotificationPolicySchema.optional(),
218
335
  });
219
336
 
220
337
  export type AssociateHealthCheck = z.infer<typeof AssociateHealthCheckSchema>;