@checkstack/healthcheck-backend 0.17.0 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,142 @@
1
1
  # @checkstack/healthcheck-backend
2
2
 
3
+ ## 0.18.0
4
+
5
+ ### Minor Changes
6
+
7
+ - 8d1ef12: ## Anomaly Detection & UI Improvements
8
+
9
+ ### Anomaly Detection Enhancements (Phase 2)
10
+
11
+ - **`@checkstack/anomaly-backend`**: Implemented background baseline analyzer jobs and anomaly trend deviation detection mechanics.
12
+ - **`@checkstack/anomaly-common`**: Added new baseline statistical logic and inference rules.
13
+ - **`@checkstack/anomaly-frontend`**: Added new Anomaly Widget and refactored system detail rendering to be more human-readable.
14
+ - **`@checkstack/dashboard-frontend`**: Refined the global anomaly widget and fixed hardcoded access gating to render appropriately.
15
+ - **`@checkstack/healthcheck-backend`**: Connected executor telemetry to the anomaly pipeline.
16
+ - **`@checkstack/healthcheck-frontend`**: Reconciled baseline display consistency in Drawer and charts.
17
+
18
+ ### Notification Identifiers
19
+
20
+ - **`@checkstack/incident-backend`**: Resolved system IDs to human-readable System Names within Incident notifications to eliminate ID-only alert content.
21
+ - **`@checkstack/maintenance-backend`**: Adopted the same resolution strategy for Maintenance notifications to keep parity.
22
+
23
+ ### UI Experience
24
+
25
+ - **`@checkstack/incident-frontend`**: Fixed the "Back to X" BackLink to navigate via the `react-router` `useNavigate` hook instead of triggering a full application reload.
26
+ - **`@checkstack/healthcheck-frontend`**: Implemented `useNavigate` for seamless SPA back-linking.
27
+ - **`@checkstack/integration-frontend`**: Updated connections and delivery logs links to navigate without hard reloads.
28
+
29
+ - 8d1ef12: ## Per-entity caching with single-flight + safe invalidation across the dashboard hot paths
30
+
31
+ ### `@checkstack/cache-api`
32
+
33
+ - **Breaking** for backend implementors: `CacheProvider` now requires `deleteByPrefix(prefix: string): Promise<number>` for family-level invalidation. The in-memory provider implements it; downstream providers (Redis, etc.) must add it before upgrading.
34
+ - `createScopedCache` forwards `deleteByPrefix` and keeps prefixes scoped to the calling plugin.
35
+
36
+ ### `@checkstack/cache-utils` (new package)
37
+
38
+ High-level read-through caching helpers built on `CacheProvider`:
39
+
40
+ - `createCachedScope({ cacheManager, pluginId })` returns a scope with `wrap`, `wrapMany`, `invalidate`, and `invalidatePrefix`.
41
+ - **Single-flight**: concurrent cache misses for the same key share one loader.
42
+ - **Per-entity bulk caching** via `wrapMany` so list/bulk RPCs cache by id rather than by the full input shape — overlapping callers share entries and invalidation stays exact.
43
+ - **Race-safe invalidation** via per-key epoch counters: a loader started before a mutation cannot repopulate the cache with stale data after the mutation invalidates it. The mutation invariant is `db.write → cache.invalidate (await) → signals.emit`.
44
+ - Cache failures fall through to the loader so a cache outage cannot break reads.
45
+
46
+ ### `@checkstack/backend`
47
+
48
+ - The internal null `CacheProvider` (used when no cache backend is configured) now implements the new `deleteByPrefix` method as a no-op. Patch bump only — no behavior change for existing callers.
49
+
50
+ ### `@checkstack/healthcheck-backend`
51
+
52
+ - `getSystemHealthStatus` and `getBulkSystemHealthStatus` now read through a per-system cache (`healthcheck:status:<systemId>`), eliminating N database queries per dashboard refresh for unchanged systems.
53
+ - Mutation paths (configuration CRUD, system associations, satellite ingest, queue-driven check runs, system/satellite removal hooks) invalidate affected keys before broadcasting their signals so frontend refetches always observe fresh data.
54
+
55
+ ### `@checkstack/incident-backend`
56
+
57
+ - `listIncidents`, `getIncident`, `getIncidentsForSystem`, and `getBulkIncidentsForSystems` now read through a scoped cache:
58
+ - per-incident at `incident:<id>`
59
+ - per-system at `system:<systemId>`
60
+ - per-filter-shape at `list:<stable-stringify(filters)>` for the few list shapes the dashboard polls
61
+ - Mutations (`createIncident`, `updateIncident`, `addUpdate`, `resolveIncident`, `deleteIncident`) invalidate the incident, every affected system, and every cached list before broadcasting `INCIDENT_UPDATED`.
62
+ - The catalog `systemDeleted` cleanup hook drops that system's cached entries.
63
+
64
+ ### `@checkstack/maintenance-backend`
65
+
66
+ - `listMaintenances`, `getMaintenance`, `getMaintenancesForSystem`, and `getBulkMaintenancesForSystems` use the same per-entity / per-system / per-filter-shape pattern as incidents.
67
+ - Mutations (`createMaintenance`, `updateMaintenance`, `addUpdate`, `closeMaintenance`, `deleteMaintenance`) invalidate before broadcasting `MAINTENANCE_UPDATED`.
68
+
69
+ ### `@checkstack/catalog-backend`
70
+
71
+ - Topology reads (`getEntities`, `getSystems`, `getSystem`, `getGroups`, `getSystemGroupIds`) cache under the `entity:` family (25s TTL).
72
+ - Views (`getViews`) and per-system contacts (`getSystemContacts`) cache in their own families.
73
+ - System / group / membership mutations drop the entire `entity:` family (every reader joins the same tables); view and contact mutations drop only their respective scopes.
74
+
75
+ ### `@checkstack/slo-backend`
76
+
77
+ - `listObjectives`, `getObjective`, `getObjectivesForSystem`, and `getBulkObjectivesForSystems` cache results including the expensive `engine.computeStatus` output.
78
+ - Per-entity caching for the bulk handler so dashboards with overlapping system sets share entries.
79
+ - Mutations (`createObjective`, `updateObjective`, `deleteObjective`) invalidate before broadcasting `SLO_STATUS_CHANGED`.
80
+
81
+ ### `@checkstack/anomaly-backend`
82
+
83
+ - New `router-cache.ts` adds a cache scope distinct from the existing detector baseline cache, keyed by stable filter hash.
84
+ - `getAnomalies` and `getAnomalyBaselines` cache through this scope (15s TTL).
85
+ - The detector invalidates the router cache before broadcasting `ANOMALY_STATE_CHANGED` on every state transition (suspicious/anomaly/recovered).
86
+ - Config mutations also invalidate.
87
+
88
+ ### `@checkstack/notification-backend`
89
+
90
+ - `getUnreadCount`, `getNotifications`, and `getSubscriptions` cache per-user.
91
+ - `markAsRead`, `deleteNotification`, `notifyUsers`, and `notifyGroups` invalidate every affected user's cache before sending realtime signals to that user.
92
+ - `subscribe` and `unsubscribe` invalidate the user's subscription cache.
93
+
94
+ ### `@checkstack/announcement-backend`
95
+
96
+ - `getActiveAnnouncements` caches per-user (or anonymous) and per-`includeDismissed` flag (45s TTL — admin-driven, slowly changing).
97
+ - `listAllAnnouncements` caches under a single key.
98
+ - `dismissAnnouncement` only drops that user's cache; `createAnnouncement`, `updateAnnouncement`, `deleteAnnouncement` drop every user's cache before broadcasting `ANNOUNCEMENT_UPDATED`.
99
+ - The auth `userDeleted` cleanup hook drops that user's cached entries.
100
+
101
+ ### Patch Changes
102
+
103
+ - Updated dependencies [8d1ef12]
104
+ - Updated dependencies [8d1ef12]
105
+ - Updated dependencies [8d1ef12]
106
+ - Updated dependencies [8d1ef12]
107
+ - @checkstack/healthcheck-common@0.12.0
108
+ - @checkstack/common@0.7.0
109
+ - @checkstack/cache-api@0.2.0
110
+ - @checkstack/cache-utils@0.2.0
111
+ - @checkstack/catalog-backend@0.7.0
112
+ - @checkstack/backend-api@0.13.0
113
+ - @checkstack/satellite-backend@0.2.16
114
+ - @checkstack/catalog-common@1.5.2
115
+ - @checkstack/command-backend@0.1.20
116
+ - @checkstack/gitops-backend@0.2.4
117
+ - @checkstack/gitops-common@0.2.1
118
+ - @checkstack/incident-common@0.4.9
119
+ - @checkstack/integration-backend@0.1.20
120
+ - @checkstack/maintenance-common@0.4.11
121
+ - @checkstack/signal-common@0.1.10
122
+ - @checkstack/queue-api@0.2.14
123
+
124
+ ## 0.17.1
125
+
126
+ ### Patch Changes
127
+
128
+ - c4e7560: Fix data integrity, cache invalidation, and mobile UI issues
129
+
130
+ - **Centralized mutation cache invalidation**: Every mutation now automatically invalidates its plugin's query cache on success via the shared `createProcedureHook` in `orpc-query.tsx`. This ensures all views stay in sync without requiring individual components to remember manual `invalidateQueries` calls.
131
+ - **Fixed oRPC query key matching**: Query keys use nested arrays (`[["pluginId"]]`) to correctly match oRPC's `[pathArray, options]` key structure. Fixed the broken flat-string pattern in `SystemBadgeDataProvider`.
132
+ - **Fixed hourly aggregation duplication**: Added `NULLS NOT DISTINCT` to the `health_check_aggregates` unique constraint so local runs (`source_id = NULL`) correctly conflict-match instead of creating duplicate hourly buckets. Includes a migration to clean up existing duplicates.
133
+ - **Fixed modal scrolling on mobile**: Added `max-height` + `overflow-y-auto` to `ConfirmationModal`, and refactored `Dialog` from translate-centering to flex-centering with `dvh` units for reliable mobile scroll containment.
134
+ - @checkstack/catalog-common@1.5.1
135
+ - @checkstack/incident-common@0.4.8
136
+ - @checkstack/maintenance-common@0.4.10
137
+ - @checkstack/satellite-backend@0.2.15
138
+ - @checkstack/catalog-backend@0.6.1
139
+
3
140
  ## 0.17.0
4
141
 
5
142
  ### Minor Changes
@@ -0,0 +1,16 @@
1
+ DROP INDEX "health_check_aggregates_bucket_unique";--> statement-breakpoint
2
+
3
+ -- Clean up duplicate local (source_id IS NULL) hourly buckets that were
4
+ -- created due to the missing NULLS NOT DISTINCT clause. Keep the row with
5
+ -- the highest id (most recent insert) for each bucket group.
6
+ DELETE FROM "health_check_aggregates" a
7
+ USING "health_check_aggregates" b
8
+ WHERE a.source_id IS NULL
9
+ AND b.source_id IS NULL
10
+ AND a.configuration_id = b.configuration_id
11
+ AND a.system_id = b.system_id
12
+ AND a.bucket_start = b.bucket_start
13
+ AND a.bucket_size = b.bucket_size
14
+ AND a.id < b.id;--> statement-breakpoint
15
+
16
+ ALTER TABLE "health_check_aggregates" ADD CONSTRAINT "health_check_aggregates_bucket_unique" UNIQUE NULLS NOT DISTINCT("configuration_id","system_id","bucket_start","bucket_size","source_id");
@@ -0,0 +1,441 @@
1
+ {
2
+ "id": "446eaec3-7894-4425-b7db-00c7761ca83f",
3
+ "prevId": "743f0b98-66a5-462e-8c35-4b2eb3093010",
4
+ "version": "7",
5
+ "dialect": "postgresql",
6
+ "tables": {
7
+ "public.health_check_aggregates": {
8
+ "name": "health_check_aggregates",
9
+ "schema": "",
10
+ "columns": {
11
+ "id": {
12
+ "name": "id",
13
+ "type": "uuid",
14
+ "primaryKey": true,
15
+ "notNull": true,
16
+ "default": "gen_random_uuid()"
17
+ },
18
+ "configuration_id": {
19
+ "name": "configuration_id",
20
+ "type": "uuid",
21
+ "primaryKey": false,
22
+ "notNull": true
23
+ },
24
+ "system_id": {
25
+ "name": "system_id",
26
+ "type": "text",
27
+ "primaryKey": false,
28
+ "notNull": true
29
+ },
30
+ "bucket_start": {
31
+ "name": "bucket_start",
32
+ "type": "timestamp",
33
+ "primaryKey": false,
34
+ "notNull": true
35
+ },
36
+ "bucket_size": {
37
+ "name": "bucket_size",
38
+ "type": "bucket_size",
39
+ "typeSchema": "public",
40
+ "primaryKey": false,
41
+ "notNull": true
42
+ },
43
+ "run_count": {
44
+ "name": "run_count",
45
+ "type": "integer",
46
+ "primaryKey": false,
47
+ "notNull": true
48
+ },
49
+ "healthy_count": {
50
+ "name": "healthy_count",
51
+ "type": "integer",
52
+ "primaryKey": false,
53
+ "notNull": true
54
+ },
55
+ "degraded_count": {
56
+ "name": "degraded_count",
57
+ "type": "integer",
58
+ "primaryKey": false,
59
+ "notNull": true
60
+ },
61
+ "unhealthy_count": {
62
+ "name": "unhealthy_count",
63
+ "type": "integer",
64
+ "primaryKey": false,
65
+ "notNull": true
66
+ },
67
+ "latency_sum_ms": {
68
+ "name": "latency_sum_ms",
69
+ "type": "integer",
70
+ "primaryKey": false,
71
+ "notNull": false
72
+ },
73
+ "avg_latency_ms": {
74
+ "name": "avg_latency_ms",
75
+ "type": "integer",
76
+ "primaryKey": false,
77
+ "notNull": false
78
+ },
79
+ "min_latency_ms": {
80
+ "name": "min_latency_ms",
81
+ "type": "integer",
82
+ "primaryKey": false,
83
+ "notNull": false
84
+ },
85
+ "max_latency_ms": {
86
+ "name": "max_latency_ms",
87
+ "type": "integer",
88
+ "primaryKey": false,
89
+ "notNull": false
90
+ },
91
+ "p95_latency_ms": {
92
+ "name": "p95_latency_ms",
93
+ "type": "integer",
94
+ "primaryKey": false,
95
+ "notNull": false
96
+ },
97
+ "aggregated_result": {
98
+ "name": "aggregated_result",
99
+ "type": "jsonb",
100
+ "primaryKey": false,
101
+ "notNull": false
102
+ },
103
+ "tdigest_state": {
104
+ "name": "tdigest_state",
105
+ "type": "jsonb",
106
+ "primaryKey": false,
107
+ "notNull": false
108
+ },
109
+ "source_id": {
110
+ "name": "source_id",
111
+ "type": "text",
112
+ "primaryKey": false,
113
+ "notNull": false
114
+ },
115
+ "source_label": {
116
+ "name": "source_label",
117
+ "type": "text",
118
+ "primaryKey": false,
119
+ "notNull": false
120
+ }
121
+ },
122
+ "indexes": {},
123
+ "foreignKeys": {
124
+ "health_check_aggregates_configuration_id_health_check_configurations_id_fk": {
125
+ "name": "health_check_aggregates_configuration_id_health_check_configurations_id_fk",
126
+ "tableFrom": "health_check_aggregates",
127
+ "tableTo": "health_check_configurations",
128
+ "columnsFrom": [
129
+ "configuration_id"
130
+ ],
131
+ "columnsTo": [
132
+ "id"
133
+ ],
134
+ "onDelete": "cascade",
135
+ "onUpdate": "no action"
136
+ }
137
+ },
138
+ "compositePrimaryKeys": {},
139
+ "uniqueConstraints": {
140
+ "health_check_aggregates_bucket_unique": {
141
+ "name": "health_check_aggregates_bucket_unique",
142
+ "nullsNotDistinct": true,
143
+ "columns": [
144
+ "configuration_id",
145
+ "system_id",
146
+ "bucket_start",
147
+ "bucket_size",
148
+ "source_id"
149
+ ]
150
+ }
151
+ },
152
+ "policies": {},
153
+ "checkConstraints": {},
154
+ "isRLSEnabled": false
155
+ },
156
+ "public.health_check_configurations": {
157
+ "name": "health_check_configurations",
158
+ "schema": "",
159
+ "columns": {
160
+ "id": {
161
+ "name": "id",
162
+ "type": "uuid",
163
+ "primaryKey": true,
164
+ "notNull": true,
165
+ "default": "gen_random_uuid()"
166
+ },
167
+ "name": {
168
+ "name": "name",
169
+ "type": "text",
170
+ "primaryKey": false,
171
+ "notNull": true
172
+ },
173
+ "strategy_id": {
174
+ "name": "strategy_id",
175
+ "type": "text",
176
+ "primaryKey": false,
177
+ "notNull": true
178
+ },
179
+ "config": {
180
+ "name": "config",
181
+ "type": "jsonb",
182
+ "primaryKey": false,
183
+ "notNull": true
184
+ },
185
+ "collectors": {
186
+ "name": "collectors",
187
+ "type": "jsonb",
188
+ "primaryKey": false,
189
+ "notNull": false
190
+ },
191
+ "interval_seconds": {
192
+ "name": "interval_seconds",
193
+ "type": "integer",
194
+ "primaryKey": false,
195
+ "notNull": true
196
+ },
197
+ "is_template": {
198
+ "name": "is_template",
199
+ "type": "boolean",
200
+ "primaryKey": false,
201
+ "notNull": false,
202
+ "default": false
203
+ },
204
+ "paused": {
205
+ "name": "paused",
206
+ "type": "boolean",
207
+ "primaryKey": false,
208
+ "notNull": true,
209
+ "default": false
210
+ },
211
+ "created_at": {
212
+ "name": "created_at",
213
+ "type": "timestamp",
214
+ "primaryKey": false,
215
+ "notNull": true,
216
+ "default": "now()"
217
+ },
218
+ "updated_at": {
219
+ "name": "updated_at",
220
+ "type": "timestamp",
221
+ "primaryKey": false,
222
+ "notNull": true,
223
+ "default": "now()"
224
+ }
225
+ },
226
+ "indexes": {},
227
+ "foreignKeys": {},
228
+ "compositePrimaryKeys": {},
229
+ "uniqueConstraints": {},
230
+ "policies": {},
231
+ "checkConstraints": {},
232
+ "isRLSEnabled": false
233
+ },
234
+ "public.health_check_runs": {
235
+ "name": "health_check_runs",
236
+ "schema": "",
237
+ "columns": {
238
+ "id": {
239
+ "name": "id",
240
+ "type": "uuid",
241
+ "primaryKey": true,
242
+ "notNull": true,
243
+ "default": "gen_random_uuid()"
244
+ },
245
+ "configuration_id": {
246
+ "name": "configuration_id",
247
+ "type": "uuid",
248
+ "primaryKey": false,
249
+ "notNull": true
250
+ },
251
+ "system_id": {
252
+ "name": "system_id",
253
+ "type": "text",
254
+ "primaryKey": false,
255
+ "notNull": true
256
+ },
257
+ "status": {
258
+ "name": "status",
259
+ "type": "health_check_status",
260
+ "typeSchema": "public",
261
+ "primaryKey": false,
262
+ "notNull": true
263
+ },
264
+ "latency_ms": {
265
+ "name": "latency_ms",
266
+ "type": "integer",
267
+ "primaryKey": false,
268
+ "notNull": false
269
+ },
270
+ "result": {
271
+ "name": "result",
272
+ "type": "jsonb",
273
+ "primaryKey": false,
274
+ "notNull": false
275
+ },
276
+ "source_id": {
277
+ "name": "source_id",
278
+ "type": "text",
279
+ "primaryKey": false,
280
+ "notNull": false
281
+ },
282
+ "source_label": {
283
+ "name": "source_label",
284
+ "type": "text",
285
+ "primaryKey": false,
286
+ "notNull": false
287
+ },
288
+ "timestamp": {
289
+ "name": "timestamp",
290
+ "type": "timestamp",
291
+ "primaryKey": false,
292
+ "notNull": true,
293
+ "default": "now()"
294
+ }
295
+ },
296
+ "indexes": {},
297
+ "foreignKeys": {
298
+ "health_check_runs_configuration_id_health_check_configurations_id_fk": {
299
+ "name": "health_check_runs_configuration_id_health_check_configurations_id_fk",
300
+ "tableFrom": "health_check_runs",
301
+ "tableTo": "health_check_configurations",
302
+ "columnsFrom": [
303
+ "configuration_id"
304
+ ],
305
+ "columnsTo": [
306
+ "id"
307
+ ],
308
+ "onDelete": "cascade",
309
+ "onUpdate": "no action"
310
+ }
311
+ },
312
+ "compositePrimaryKeys": {},
313
+ "uniqueConstraints": {},
314
+ "policies": {},
315
+ "checkConstraints": {},
316
+ "isRLSEnabled": false
317
+ },
318
+ "public.system_health_checks": {
319
+ "name": "system_health_checks",
320
+ "schema": "",
321
+ "columns": {
322
+ "system_id": {
323
+ "name": "system_id",
324
+ "type": "text",
325
+ "primaryKey": false,
326
+ "notNull": true
327
+ },
328
+ "configuration_id": {
329
+ "name": "configuration_id",
330
+ "type": "uuid",
331
+ "primaryKey": false,
332
+ "notNull": true
333
+ },
334
+ "enabled": {
335
+ "name": "enabled",
336
+ "type": "boolean",
337
+ "primaryKey": false,
338
+ "notNull": true,
339
+ "default": true
340
+ },
341
+ "state_thresholds": {
342
+ "name": "state_thresholds",
343
+ "type": "jsonb",
344
+ "primaryKey": false,
345
+ "notNull": false
346
+ },
347
+ "retention_config": {
348
+ "name": "retention_config",
349
+ "type": "jsonb",
350
+ "primaryKey": false,
351
+ "notNull": false
352
+ },
353
+ "satellite_ids": {
354
+ "name": "satellite_ids",
355
+ "type": "jsonb",
356
+ "primaryKey": false,
357
+ "notNull": false
358
+ },
359
+ "include_local": {
360
+ "name": "include_local",
361
+ "type": "boolean",
362
+ "primaryKey": false,
363
+ "notNull": true,
364
+ "default": true
365
+ },
366
+ "created_at": {
367
+ "name": "created_at",
368
+ "type": "timestamp",
369
+ "primaryKey": false,
370
+ "notNull": true,
371
+ "default": "now()"
372
+ },
373
+ "updated_at": {
374
+ "name": "updated_at",
375
+ "type": "timestamp",
376
+ "primaryKey": false,
377
+ "notNull": true,
378
+ "default": "now()"
379
+ }
380
+ },
381
+ "indexes": {},
382
+ "foreignKeys": {
383
+ "system_health_checks_configuration_id_health_check_configurations_id_fk": {
384
+ "name": "system_health_checks_configuration_id_health_check_configurations_id_fk",
385
+ "tableFrom": "system_health_checks",
386
+ "tableTo": "health_check_configurations",
387
+ "columnsFrom": [
388
+ "configuration_id"
389
+ ],
390
+ "columnsTo": [
391
+ "id"
392
+ ],
393
+ "onDelete": "cascade",
394
+ "onUpdate": "no action"
395
+ }
396
+ },
397
+ "compositePrimaryKeys": {
398
+ "system_health_checks_system_id_configuration_id_pk": {
399
+ "name": "system_health_checks_system_id_configuration_id_pk",
400
+ "columns": [
401
+ "system_id",
402
+ "configuration_id"
403
+ ]
404
+ }
405
+ },
406
+ "uniqueConstraints": {},
407
+ "policies": {},
408
+ "checkConstraints": {},
409
+ "isRLSEnabled": false
410
+ }
411
+ },
412
+ "enums": {
413
+ "public.bucket_size": {
414
+ "name": "bucket_size",
415
+ "schema": "public",
416
+ "values": [
417
+ "hourly",
418
+ "daily"
419
+ ]
420
+ },
421
+ "public.health_check_status": {
422
+ "name": "health_check_status",
423
+ "schema": "public",
424
+ "values": [
425
+ "healthy",
426
+ "unhealthy",
427
+ "degraded"
428
+ ]
429
+ }
430
+ },
431
+ "schemas": {},
432
+ "sequences": {},
433
+ "roles": {},
434
+ "policies": {},
435
+ "views": {},
436
+ "_meta": {
437
+ "columns": {},
438
+ "schemas": {},
439
+ "tables": {}
440
+ }
441
+ }
@@ -78,6 +78,13 @@
78
78
  "when": 1776599270689,
79
79
  "tag": "0010_colorful_shinobi_shaw",
80
80
  "breakpoints": true
81
+ },
82
+ {
83
+ "idx": 11,
84
+ "version": "7",
85
+ "when": 1777354405576,
86
+ "tag": "0011_fluffy_sphinx",
87
+ "breakpoints": true
81
88
  }
82
89
  ]
83
90
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@checkstack/healthcheck-backend",
3
- "version": "0.17.0",
3
+ "version": "0.18.0",
4
4
  "type": "module",
5
5
  "main": "src/index.ts",
6
6
  "checkstack": {
@@ -14,18 +14,20 @@
14
14
  },
15
15
  "dependencies": {
16
16
  "@checkstack/backend-api": "0.12.0",
17
- "@checkstack/catalog-backend": "0.5.4",
18
- "@checkstack/catalog-common": "1.4.1",
17
+ "@checkstack/cache-api": "0.1.0",
18
+ "@checkstack/cache-utils": "0.1.0",
19
+ "@checkstack/catalog-backend": "0.6.1",
20
+ "@checkstack/catalog-common": "1.5.1",
19
21
  "@checkstack/command-backend": "0.1.19",
20
22
  "@checkstack/common": "0.6.5",
21
23
  "@checkstack/gitops-backend": "0.2.3",
22
24
  "@checkstack/gitops-common": "0.2.0",
23
25
  "@checkstack/healthcheck-common": "0.11.0",
24
- "@checkstack/incident-common": "0.4.7",
26
+ "@checkstack/incident-common": "0.4.8",
25
27
  "@checkstack/integration-backend": "0.1.19",
26
- "@checkstack/maintenance-common": "0.4.9",
28
+ "@checkstack/maintenance-common": "0.4.10",
27
29
  "@checkstack/queue-api": "0.2.13",
28
- "@checkstack/satellite-backend": "0.2.13",
30
+ "@checkstack/satellite-backend": "0.2.15",
29
31
  "@checkstack/signal-common": "0.1.9",
30
32
  "@hono/zod-validator": "^0.7.6",
31
33
  "drizzle-orm": "^0.45.0",
package/src/cache.ts ADDED
@@ -0,0 +1,70 @@
1
+ import type { CacheManager } from "@checkstack/cache-api";
2
+ import {
3
+ createCachedScope,
4
+ type CachedScope,
5
+ } from "@checkstack/cache-utils";
6
+ import type { Logger } from "@checkstack/backend-api";
7
+ import type { HealthCheckService } from "./service";
8
+
9
+ /**
10
+ * TTL (15s) chosen to be shorter than the dashboard's 30s `staleTime` so
11
+ * that signal-driven invalidation almost always wins, and TTL only acts as
12
+ * a safety net for paths that forget to invalidate.
13
+ */
14
+ const STATUS_TTL_MS = 15_000;
15
+
16
+ /**
17
+ * Per-entity cache helpers for the healthcheck plugin. Cached reads
18
+ * go through {@link wrapSystemHealthStatus}; mutations should call
19
+ * {@link invalidateSystem} after the DB write but before emitting any
20
+ * signal so that frontend refetches see fresh data.
21
+ */
22
+ export interface HealthCheckCache {
23
+ /** Read-through cache for one system's health status. */
24
+ wrapSystemHealthStatus: (
25
+ systemId: string,
26
+ loader: () => ReturnType<HealthCheckService["getSystemHealthStatus"]>,
27
+ ) => ReturnType<HealthCheckService["getSystemHealthStatus"]>;
28
+
29
+ /** Invalidate a single system's cached status. */
30
+ invalidateSystem: (systemId: string) => Promise<void>;
31
+
32
+ /**
33
+ * Invalidate every system's cached status. Used when the change can
34
+ * affect many systems at once (e.g. a configuration update with
35
+ * cross-system fan-out, or a plugin reload).
36
+ */
37
+ invalidateAllSystems: () => Promise<number>;
38
+
39
+ /** Underlying scope, exposed for advanced callers. */
40
+ scope: CachedScope;
41
+ }
42
+
43
+ const STATUS_KEY_PREFIX = "status:";
44
+ const statusKey = (systemId: string): string =>
45
+ `${STATUS_KEY_PREFIX}${systemId}`;
46
+
47
+ export function createHealthCheckCache({
48
+ cacheManager,
49
+ logger,
50
+ }: {
51
+ cacheManager: CacheManager;
52
+ logger: Logger;
53
+ }): HealthCheckCache {
54
+ const scope = createCachedScope({
55
+ cacheManager,
56
+ pluginId: "healthcheck",
57
+ defaultTtlMs: STATUS_TTL_MS,
58
+ onError: (op, error) => {
59
+ logger.warn(`healthcheck cache ${op} failed: ${String(error)}`);
60
+ },
61
+ });
62
+
63
+ return {
64
+ wrapSystemHealthStatus: (systemId, loader) =>
65
+ scope.wrap(statusKey(systemId), loader),
66
+ invalidateSystem: (systemId) => scope.invalidate(statusKey(systemId)),
67
+ invalidateAllSystems: () => scope.invalidatePrefix(STATUS_KEY_PREFIX),
68
+ scope,
69
+ };
70
+ }
package/src/hooks.ts CHANGED
@@ -42,4 +42,17 @@ export const healthCheckHooks = {
42
42
  systemId: string;
43
43
  configurationId: string;
44
44
  }>("healthcheck.assignment.changed"),
45
+
46
+ /**
47
+ * Emitted when a single health check execution finishes.
48
+ * This is used by the anomaly detection engine to run the inline fast detector.
49
+ */
50
+ checkCompleted: createHook<{
51
+ systemId: string;
52
+ configurationId: string;
53
+ status: string;
54
+ latencyMs: number | undefined;
55
+ result: Record<string, unknown> | undefined;
56
+ timestamp: string;
57
+ }>("healthcheck.check.completed"),
45
58
  } as const;
package/src/index.ts CHANGED
@@ -35,6 +35,7 @@ import { GitOpsApi } from "@checkstack/gitops-common";
35
35
  import { healthCheckHooks } from "./hooks";
36
36
  import { registerSearchProvider } from "@checkstack/command-backend";
37
37
  import { resolveRoute } from "@checkstack/common";
38
+ import { createHealthCheckCache } from "./cache";
38
39
 
39
40
  // =============================================================================
40
41
  // Integration Event Payload Schemas
@@ -101,6 +102,9 @@ export default createBackendPlugin({
101
102
  let gitopsHealthCheckRegistry: HealthCheckRegistry | undefined;
102
103
  let gitopsCollectorRegistry: CollectorRegistry | undefined;
103
104
  let gitopsQueueManager: QueueManager | undefined;
105
+ let healthCheckCache:
106
+ | ReturnType<typeof createHealthCheckCache>
107
+ | undefined;
104
108
 
105
109
  const kindRegistry = env.getExtensionPoint(entityKindExtensionPoint);
106
110
  registerHealthcheckGitOpsKinds({
@@ -144,6 +148,7 @@ export default createBackendPlugin({
144
148
  rpcClient: coreServices.rpcClient,
145
149
  queueManager: coreServices.queueManager,
146
150
  signalService: coreServices.signalService,
151
+ cacheManager: coreServices.cacheManager,
147
152
  },
148
153
  // Phase 2: Register router and setup worker
149
154
  init: async ({
@@ -155,6 +160,7 @@ export default createBackendPlugin({
155
160
  rpcClient,
156
161
  queueManager,
157
162
  signalService,
163
+ cacheManager,
158
164
  }) => {
159
165
  logger.debug("🏥 Initializing Health Check Backend...");
160
166
 
@@ -176,6 +182,13 @@ export default createBackendPlugin({
176
182
  // Create gitops client for provenance lock checks
177
183
  const gitOpsClient = rpcClient.forPlugin(GitOpsApi);
178
184
 
185
+ // Per-entity status cache shared between the router, queue executor,
186
+ // and afterPluginsReady cleanup hooks. Mutations / new check results
187
+ // invalidate by systemId BEFORE emitting signals so frontend
188
+ // refetches see fresh data.
189
+ const cache = createHealthCheckCache({ cacheManager, logger });
190
+ healthCheckCache = cache;
191
+
179
192
  // Setup queue-based health check worker
180
193
  await setupHealthCheckWorker({
181
194
  db: database,
@@ -188,6 +201,7 @@ export default createBackendPlugin({
188
201
  maintenanceClient,
189
202
  incidentClient,
190
203
  getEmitHook: () => storedEmitHook,
204
+ cache,
191
205
  });
192
206
 
193
207
  // Setup retention job for tiered storage (daily aggregation)
@@ -203,6 +217,7 @@ export default createBackendPlugin({
203
217
  collectorRegistry,
204
218
  gitOpsClient,
205
219
  getEmitHook: () => storedEmitHook,
220
+ cache,
206
221
  });
207
222
  rpc.registerRouter(healthCheckRouter, healthCheckContract);
208
223
 
@@ -272,6 +287,7 @@ export default createBackendPlugin({
272
287
  `Cleaning up health check associations for deleted system: ${payload.systemId}`,
273
288
  );
274
289
  await service.removeAllSystemAssociations(payload.systemId);
290
+ await healthCheckCache?.invalidateSystem(payload.systemId);
275
291
  },
276
292
  { mode: "work-queue", workerGroup: "system-cleanup" },
277
293
  );
@@ -284,6 +300,9 @@ export default createBackendPlugin({
284
300
  `Scrubbing satellite ${payload.satelliteId} from health check associations`,
285
301
  );
286
302
  await service.scrubSatelliteFromAssociations(payload.satelliteId);
303
+ // Satellite removal can change which checks are included for many
304
+ // systems; invalidate everything since we don't know which.
305
+ await healthCheckCache?.invalidateAllSystems();
287
306
  },
288
307
  { mode: "work-queue", workerGroup: "satellite-cleanup" },
289
308
  );
@@ -5,6 +5,14 @@ import {
5
5
  bootstrapHealthChecks,
6
6
  type HealthCheckJobPayload,
7
7
  } from "./queue-executor";
8
+ import type { HealthCheckCache } from "./cache";
9
+
10
+ const passthroughCache: HealthCheckCache = {
11
+ wrapSystemHealthStatus: (_systemId, loader) => loader(),
12
+ invalidateSystem: async () => {},
13
+ invalidateAllSystems: async () => 0,
14
+ scope: {} as HealthCheckCache["scope"],
15
+ };
8
16
  import {
9
17
  createMockLogger,
10
18
  createMockQueueManager,
@@ -188,6 +196,7 @@ describe("Queue-Based Health Check Executor", () => {
188
196
  typeof setupHealthCheckWorker
189
197
  >[0]["incidentClient"],
190
198
  getEmitHook: () => undefined,
199
+ cache: passthroughCache,
191
200
  });
192
201
 
193
202
  expect(mockLogger.debug).toHaveBeenCalledWith(
@@ -381,6 +390,7 @@ describe("Queue-Based Health Check Executor", () => {
381
390
  typeof setupHealthCheckWorker
382
391
  >[0]["incidentClient"],
383
392
  getEmitHook: () => undefined,
393
+ cache: passthroughCache,
384
394
  });
385
395
 
386
396
  // Execute a paused health check
@@ -31,12 +31,45 @@ import { resolveRoute, type InferClient, extractErrorMessage} from "@checkstack/
31
31
  import { HealthCheckService } from "./service";
32
32
  import { healthCheckHooks } from "./hooks";
33
33
  import { incrementHourlyAggregate } from "./realtime-aggregation";
34
+ import type { HealthCheckCache } from "./cache";
34
35
 
35
36
  type Db = SafeDatabase<typeof schema>;
36
37
  type CatalogClient = InferClient<typeof CatalogApi>;
37
38
  type MaintenanceClient = InferClient<typeof MaintenanceApi>;
38
39
  type IncidentClient = InferClient<typeof IncidentApi>;
39
40
 
41
+ /**
42
+ * Emit the checkCompleted hook if available.
43
+ * Extracted to avoid duplicating the hook emission pattern across success/error paths.
44
+ */
45
+ async function emitCheckCompletedHook({
46
+ getEmitHook,
47
+ systemId,
48
+ configurationId,
49
+ status,
50
+ latencyMs,
51
+ result,
52
+ }: {
53
+ getEmitHook: () => EmitHookFn | undefined;
54
+ systemId: string;
55
+ configurationId: string;
56
+ status: string;
57
+ latencyMs: number | undefined;
58
+ result: Record<string, unknown> | undefined;
59
+ }): Promise<void> {
60
+ const emitHook = getEmitHook();
61
+ if (emitHook) {
62
+ await emitHook(healthCheckHooks.checkCompleted, {
63
+ systemId,
64
+ configurationId,
65
+ status,
66
+ latencyMs,
67
+ result,
68
+ timestamp: new Date().toISOString(),
69
+ });
70
+ }
71
+ }
72
+
40
73
  /**
41
74
  * Payload for health check queue jobs
42
75
  */
@@ -227,6 +260,7 @@ async function executeHealthCheckJob(props: {
227
260
  maintenanceClient: MaintenanceClient;
228
261
  incidentClient: IncidentClient;
229
262
  getEmitHook: () => EmitHookFn | undefined;
263
+ cache: HealthCheckCache;
230
264
  }): Promise<void> {
231
265
  const {
232
266
  payload,
@@ -239,6 +273,7 @@ async function executeHealthCheckJob(props: {
239
273
  maintenanceClient,
240
274
  incidentClient,
241
275
  getEmitHook,
276
+ cache,
242
277
  } = props;
243
278
  const { configId, systemId } = payload;
244
279
 
@@ -524,6 +559,10 @@ async function executeHealthCheckJob(props: {
524
559
  `Health check ${configId} for system ${systemId} failed: ${finalError}`,
525
560
  );
526
561
 
562
+ // Invalidate the per-system status cache before broadcasting so any
563
+ // frontend that refetches in response to the signal gets fresh data.
564
+ await cache.invalidateSystem(systemId);
565
+
527
566
  await signalService.broadcast(HEALTH_CHECK_RUN_COMPLETED, {
528
567
  systemId,
529
568
  systemName,
@@ -603,6 +642,10 @@ async function executeHealthCheckJob(props: {
603
642
  `Ran health check ${configId} for system ${systemId}: ${result.status}`,
604
643
  );
605
644
 
645
+ // Invalidate the per-system status cache before broadcasting so any
646
+ // frontend that refetches in response to the signal gets fresh data.
647
+ await cache.invalidateSystem(systemId);
648
+
606
649
  // Broadcast enriched signal for realtime frontend updates (e.g., terminal feed)
607
650
  await signalService.broadcast(HEALTH_CHECK_RUN_COMPLETED, {
608
651
  systemId,
@@ -613,6 +656,15 @@ async function executeHealthCheckJob(props: {
613
656
  latencyMs: result.latencyMs,
614
657
  });
615
658
 
659
+ await emitCheckCompletedHook({
660
+ getEmitHook,
661
+ systemId,
662
+ configurationId: configId,
663
+ status: result.status,
664
+ latencyMs: result.latencyMs,
665
+ result: (result.metadata?.collectors as Record<string, unknown>) ?? undefined,
666
+ });
667
+
616
668
  // Check if aggregated state changed and notify subscribers
617
669
  const newState = await service.getSystemHealthStatus(systemId);
618
670
  if (newState.status !== previousStatus) {
@@ -722,6 +774,10 @@ async function executeHealthCheckJob(props: {
722
774
  // Use IDs as fallback
723
775
  }
724
776
 
777
+ // Invalidate the per-system status cache before broadcasting so any
778
+ // frontend that refetches in response to the signal gets fresh data.
779
+ await cache.invalidateSystem(systemId);
780
+
725
781
  // Broadcast enriched failure signal for realtime frontend updates
726
782
  await signalService.broadcast(HEALTH_CHECK_RUN_COMPLETED, {
727
783
  systemId,
@@ -731,6 +787,15 @@ async function executeHealthCheckJob(props: {
731
787
  status: "unhealthy",
732
788
  });
733
789
 
790
+ await emitCheckCompletedHook({
791
+ getEmitHook,
792
+ systemId,
793
+ configurationId: configId,
794
+ status: "unhealthy",
795
+ latencyMs: undefined,
796
+ result: undefined,
797
+ });
798
+
734
799
  // Check if aggregated state changed and notify subscribers
735
800
  const newState = await service.getSystemHealthStatus(systemId);
736
801
  if (newState.status !== previousStatus) {
@@ -806,6 +871,7 @@ export async function setupHealthCheckWorker(props: {
806
871
  maintenanceClient: MaintenanceClient;
807
872
  incidentClient: IncidentClient;
808
873
  getEmitHook: () => EmitHookFn | undefined;
874
+ cache: HealthCheckCache;
809
875
  }): Promise<void> {
810
876
  const {
811
877
  db,
@@ -818,6 +884,7 @@ export async function setupHealthCheckWorker(props: {
818
884
  maintenanceClient,
819
885
  incidentClient,
820
886
  getEmitHook,
887
+ cache,
821
888
  } = props;
822
889
 
823
890
  const queue =
@@ -837,6 +904,7 @@ export async function setupHealthCheckWorker(props: {
837
904
  maintenanceClient,
838
905
  incidentClient,
839
906
  getEmitHook,
907
+ cache,
840
908
  });
841
909
  },
842
910
  {
@@ -3,6 +3,14 @@ import { createHealthCheckRouter } from "./router";
3
3
  import { createMockRpcContext } from "@checkstack/backend-api";
4
4
  import { call } from "@orpc/server";
5
5
  import { z } from "zod";
6
+ import type { HealthCheckCache } from "./cache";
7
+
8
/**
 * Cache stand-in for tests: loaders are always invoked directly and every
 * invalidation is a no-op, so the router behaves as if no cache existed.
 */
const passthroughCache: HealthCheckCache = {
  // Deliberately not `async`: the loader's own promise is handed back as-is.
  wrapSystemHealthStatus(_systemId, loader) {
    return loader();
  },
  async invalidateSystem() {},
  async invalidateAllSystems() {
    return 0;
  },
  scope: {} as HealthCheckCache["scope"],
};
6
14
 
7
15
  describe("HealthCheck Router", () => {
8
16
  const mockUser = {
@@ -60,6 +68,7 @@ describe("HealthCheck Router", () => {
60
68
  collectorRegistry: mockCollectorRegistry as never,
61
69
  gitOpsClient: mockGitOpsClient as never,
62
70
  getEmitHook: () => undefined,
71
+ cache: passthroughCache,
63
72
  });
64
73
 
65
74
  it("getStrategies returns strategies from registry", async () => {
package/src/router.ts CHANGED
@@ -15,6 +15,7 @@ import * as schema from "./schema";
15
15
  import { toJsonSchemaWithChartMeta } from "./schema-utils";
16
16
  import type { InferClient } from "@checkstack/common";
17
17
  import { GitOpsApi } from "@checkstack/gitops-common";
18
+ import type { HealthCheckCache } from "./cache";
18
19
 
19
20
  /**
20
21
  * Creates the healthcheck router using contract-based implementation.
@@ -28,8 +29,9 @@ export const createHealthCheckRouter = (opts: {
28
29
  collectorRegistry: CollectorRegistry;
29
30
  gitOpsClient: InferClient<typeof GitOpsApi>;
30
31
  getEmitHook: () => ((hook: { id: string }, payload: Record<string, unknown>) => Promise<void>) | undefined;
32
+ cache: HealthCheckCache;
31
33
  }) => {
32
- const { database, registry, collectorRegistry, getEmitHook } = opts;
34
+ const { database, registry, collectorRegistry, getEmitHook, cache } = opts;
33
35
  // Create service instance once - shared across all handlers
34
36
  const service = new HealthCheckService(database, registry, collectorRegistry);
35
37
 
@@ -112,7 +114,12 @@ export const createHealthCheckRouter = (opts: {
112
114
  }),
113
115
 
114
116
  createConfiguration: os.createConfiguration.handler(async ({ input }) => {
115
- return service.createConfiguration(input);
117
+ const created = await service.createConfiguration(input);
118
+ // A new configuration could be associated with any system later; the
119
+ // safe move is to drop every per-system status cache so the next read
120
+ // recomputes from fresh DB state.
121
+ await cache.invalidateAllSystems();
122
+ return created;
116
123
  }),
117
124
 
118
125
  updateConfiguration: os.updateConfiguration.handler(async ({ input }) => {
@@ -123,22 +130,27 @@ export const createHealthCheckRouter = (opts: {
123
130
  message: "Configuration not found",
124
131
  });
125
132
  }
133
+ // Configuration update affects every system that has it associated.
134
+ await cache.invalidateAllSystems();
126
135
  return config;
127
136
  }),
128
137
 
129
138
  deleteConfiguration: os.deleteConfiguration.handler(async ({ input }) => {
130
139
  await enforceNotGitOpsLocked("Healthcheck", input);
131
140
  await service.deleteConfiguration(input);
141
+ await cache.invalidateAllSystems();
132
142
  }),
133
143
 
134
144
  pauseConfiguration: os.pauseConfiguration.handler(async ({ input }) => {
135
145
  await enforceNotGitOpsLocked("Healthcheck", input);
136
146
  await service.pauseConfiguration(input);
147
+ await cache.invalidateAllSystems();
137
148
  }),
138
149
 
139
150
  resumeConfiguration: os.resumeConfiguration.handler(async ({ input }) => {
140
151
  await enforceNotGitOpsLocked("Healthcheck", input);
141
152
  await service.resumeConfiguration(input);
153
+ await cache.invalidateAllSystems();
142
154
  }),
143
155
 
144
156
  getSystemConfigurations: os.getSystemConfigurations.handler(
@@ -163,6 +175,7 @@ export const createHealthCheckRouter = (opts: {
163
175
  satelliteIds: input.body.satelliteIds,
164
176
  includeLocal: input.body.includeLocal,
165
177
  });
178
+ await cache.invalidateSystem(input.systemId);
166
179
 
167
180
  // If enabling the health check, schedule it immediately
168
181
  if (input.body.enabled) {
@@ -195,6 +208,7 @@ export const createHealthCheckRouter = (opts: {
195
208
  disassociateSystem: os.disassociateSystem.handler(async ({ input }) => {
196
209
  await enforceNotGitOpsLocked("System", input.systemId);
197
210
  await service.disassociateSystem(input.systemId, input.configId);
211
+ await cache.invalidateSystem(input.systemId);
198
212
 
199
213
  // Notify subscribers that assignments changed
200
214
  const emitHook = getEmitHook();
@@ -248,24 +262,30 @@ export const createHealthCheckRouter = (opts: {
248
262
  ),
249
263
  getSystemHealthStatus: os.getSystemHealthStatus.handler(
250
264
  async ({ input }) => {
251
- return service.getSystemHealthStatus(input.systemId);
265
+ return cache.wrapSystemHealthStatus(input.systemId, () =>
266
+ service.getSystemHealthStatus(input.systemId),
267
+ );
252
268
  },
253
269
  ),
254
270
 
255
271
  getBulkSystemHealthStatus: os.getBulkSystemHealthStatus.handler(
256
272
  async ({ input }) => {
273
+ // Per-entity caching: each system's status is cached individually
274
+ // and invalidated by id on mutations, so dashboards with overlapping
275
+ // (but non-identical) system sets share cache entries. See
276
+ // ./cache.ts for the key/TTL/invalidation contract.
257
277
  const statuses: Record<
258
278
  string,
259
279
  Awaited<ReturnType<typeof service.getSystemHealthStatus>>
260
280
  > = {};
261
-
262
- // Fetch health status for each system in parallel
263
281
  await Promise.all(
264
282
  input.systemIds.map(async (systemId) => {
265
- statuses[systemId] = await service.getSystemHealthStatus(systemId);
283
+ statuses[systemId] = await cache.wrapSystemHealthStatus(
284
+ systemId,
285
+ () => service.getSystemHealthStatus(systemId),
286
+ );
266
287
  }),
267
288
  );
268
-
269
289
  return { statuses };
270
290
  },
271
291
  ),
@@ -289,6 +309,15 @@ export const createHealthCheckRouter = (opts: {
289
309
  ingestSatelliteResult: os.ingestSatelliteResult.handler(
290
310
  async ({ input }) => {
291
311
  await service.ingestSatelliteResult(input);
312
+ // A satellite result writes a new run for this system, so the
313
+ // cached aggregate status is now stale.
314
+ await cache.invalidateSystem(input.systemId);
315
+ },
316
+ ),
317
+
318
+ getRunsForAnalysis: os.getRunsForAnalysis.handler(
319
+ async ({ input }) => {
320
+ return service.getRunsForAnalysis(input);
292
321
  },
293
322
  ),
294
323
  });
@@ -60,6 +60,14 @@ function addHealthResultMeta(
60
60
  jsonField["x-chart-unit"] = healthMeta["x-chart-unit"];
61
61
  if (healthMeta["x-jsonpath"])
62
62
  jsonField["x-jsonpath"] = healthMeta["x-jsonpath"];
63
+ if (healthMeta["x-anomaly-enabled"] !== undefined)
64
+ jsonField["x-anomaly-enabled"] = healthMeta["x-anomaly-enabled"];
65
+ if (healthMeta["x-anomaly-direction"])
66
+ jsonField["x-anomaly-direction"] = healthMeta["x-anomaly-direction"];
67
+ if (healthMeta["x-anomaly-sensitivity"] !== undefined)
68
+ jsonField["x-anomaly-sensitivity"] = healthMeta["x-anomaly-sensitivity"];
69
+ if (healthMeta["x-anomaly-confirmation-window"] !== undefined)
70
+ jsonField["x-anomaly-confirmation-window"] = healthMeta["x-anomaly-confirmation-window"];
63
71
  }
64
72
 
65
73
  // Recurse into nested objects and arrays
package/src/schema.ts CHANGED
@@ -8,7 +8,7 @@ import {
8
8
  uuid,
9
9
  timestamp,
10
10
  primaryKey,
11
- uniqueIndex,
11
+ unique,
12
12
  } from "drizzle-orm/pg-core";
13
13
  import type {
14
14
  StateThresholds,
@@ -182,13 +182,15 @@ export const healthCheckAggregates = pgTable(
182
182
  sourceLabel: text("source_label"),
183
183
  },
184
184
  (t) => ({
185
- // Unique constraint includes sourceId for per-region aggregation
186
- bucketUnique: uniqueIndex("health_check_aggregates_bucket_unique").on(
185
+ // Unique constraint includes sourceId for per-region aggregation.
186
+ // NULLS NOT DISTINCT ensures local runs (sourceId=NULL) correctly
187
+ // conflict-match instead of creating duplicate rows per hour.
188
+ bucketUnique: unique("health_check_aggregates_bucket_unique").on(
187
189
  t.configurationId,
188
190
  t.systemId,
189
191
  t.bucketStart,
190
192
  t.bucketSize,
191
193
  t.sourceId,
192
- ),
194
+ ).nullsNotDistinct(),
193
195
  }),
194
196
  );
package/src/service.ts CHANGED
@@ -1091,6 +1091,53 @@ export class HealthCheckService {
1091
1091
  return assignments;
1092
1092
  }
1093
1093
 
1094
/**
 * Collects recent run results for every enabled system/configuration
 * association, grouped per association.
 *
 * Intended for cross-plugin background jobs. One query is issued per enabled
 * association, one after another.
 *
 * @param props.startDate - Only runs with a timestamp at or after this date are returned.
 * @param props.limitPerAssignment - Maximum number of runs per association (default 200).
 * @returns One entry per enabled association, carrying its runs newest first;
 *   each run exposes only its `result`.
 */
async getRunsForAnalysis(props: {
  startDate: Date;
  limitPerAssignment?: number;
}) {
  const { startDate: since, limitPerAssignment: runLimit = 200 } = props;

  // Every association that is currently enabled.
  const enabledAssignments = await this.db
    .select({
      systemId: systemHealthChecks.systemId,
      configurationId: systemHealthChecks.configurationId,
    })
    .from(systemHealthChecks)
    .where(eq(systemHealthChecks.enabled, true));

  const perAssignment = [];

  for (const { systemId, configurationId } of enabledAssignments) {
    // Runs belonging to this exact association, inside the requested window.
    const inWindowForAssignment = and(
      eq(healthCheckRuns.systemId, systemId),
      eq(healthCheckRuns.configurationId, configurationId),
      gte(healthCheckRuns.timestamp, since),
    );

    const recentRuns = await this.db
      .select({ result: healthCheckRuns.result })
      .from(healthCheckRuns)
      .where(inWindowForAssignment)
      .orderBy(desc(healthCheckRuns.timestamp))
      .limit(runLimit);

    perAssignment.push({
      systemId,
      configurationId,
      runs: recentRuns.map(({ result }) => ({ result })),
    });
  }

  return perAssignment;
}
+
1094
1141
  /**
1095
1142
  * Ingest a health check result from a satellite.
1096
1143
  * Stores the run with source attribution (sourceId + sourceLabel)
@@ -1116,7 +1163,9 @@ export class HealthCheckService {
1116
1163
  sourceLabel,
1117
1164
  } = props;
1118
1165
 
1119
- const resultRecord = result ? { ...result } as Record<string, unknown> : {};
1166
+ const resultRecord = result
1167
+ ? ({ ...result } as Record<string, unknown>)
1168
+ : {};
1120
1169
 
1121
1170
  await this.db.insert(healthCheckRuns).values({
1122
1171
  configurationId: configId,