@checkstack/healthcheck-backend 0.18.1 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +171 -0
- package/package.json +18 -16
- package/src/index.ts +28 -0
- package/src/queue-executor.test.ts +6 -0
- package/src/queue-executor.ts +33 -4
- package/src/retention-job.ts +54 -18
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,176 @@
|
|
|
1
1
|
# @checkstack/healthcheck-backend
|
|
2
2
|
|
|
3
|
+
## 1.0.1
|
|
4
|
+
|
|
5
|
+
### Patch Changes
|
|
6
|
+
|
|
7
|
+
- 2a749d3: fix: run afterPluginsReady in topological order; merge daily rollups on conflict
|
|
8
|
+
|
|
9
|
+
Two resilience fixes for the dependency chain:
|
|
10
|
+
|
|
11
|
+
1. **Plugin loader**: Phase 3 (`afterPluginsReady`) now iterates plugins
|
|
12
|
+
in the same topologically-sorted order as Phase 2 (`init`). Previously
|
|
13
|
+
it iterated `pendingInits` in registration order, which raced
|
|
14
|
+
subscription-spec dependencies — catalog's afterPluginsReady registers
|
|
15
|
+
`catalog.system` and `catalog.group` notification targets, and emitting
|
|
16
|
+
plugins (incident, maintenance, …) call `registerSubscriptionSpec`
|
|
17
|
+
against those targets in their own afterPluginsReady. With registration
|
|
18
|
+
order, an emitter could run before catalog and hit
|
|
19
|
+
`Target type catalog.group is not registered`. Sorted order encodes
|
|
20
|
+
the dependency via `spec.target.ownerPlugin`, so the emitter now
|
|
21
|
+
always runs after the target owner.
|
|
22
|
+
|
|
23
|
+
2. **Healthcheck retention job**: the daily rollup now upserts
|
|
24
|
+
`health_check_aggregates` with `ON CONFLICT DO UPDATE` instead of a
|
|
25
|
+
plain insert. Previously, late-arriving hourly aggregates (e.g. from
|
|
26
|
+
a satellite that was offline when the prior rollup ran) would crash
|
|
27
|
+
the rollup with a unique-constraint violation on
|
|
28
|
+
`(configuration_id, system_id, bucket_start, bucket_size, source_id)`.
|
|
29
|
+
The merge sums counts and folds min/max/p95 into the existing daily
|
|
30
|
+
row.
|
|
31
|
+
|
|
32
|
+
- @checkstack/satellite-backend@0.2.19
|
|
33
|
+
|
|
34
|
+
## 1.0.0
|
|
35
|
+
|
|
36
|
+
### Major Changes
|
|
37
|
+
|
|
38
|
+
- 32d52c6: feat: notification target pattern + per-spec subscriptions
|
|
39
|
+
|
|
40
|
+
Replaces the all-or-nothing catalog system/group notification model with a
|
|
41
|
+
platform-level target pattern. Each notification-emitting plugin declares
|
|
42
|
+
_subscription specs_ against typed _target_ objects exported from the
|
|
43
|
+
target's owning plugin (catalog ships `catalogSystemTarget` and
|
|
44
|
+
`catalogGroupTarget`). Notification-backend handles every per-resource
|
|
45
|
+
group lifecycle, parent-edge inheritance, and legacy-subscription seeding
|
|
46
|
+
— plugins never author groupId helpers, lifecycle hooks, or migration
|
|
47
|
+
code again.
|
|
48
|
+
|
|
49
|
+
**Plugin-author surface area is now ~12 lines per emitter:**
|
|
50
|
+
|
|
51
|
+
```ts
|
|
52
|
+
// <plugin>-common
|
|
53
|
+
const { defineSubscription } = createSubscriptionFactory(pluginMetadata);
|
|
54
|
+
export const fooSystemSubscription = defineSubscription({
|
|
55
|
+
localId: "system",
|
|
56
|
+
target: catalogSystemTarget,
|
|
57
|
+
display: { title: "Foo Alerts", description: "...", iconName: "Bell" },
|
|
58
|
+
});
|
|
59
|
+
|
|
60
|
+
// <plugin>-backend register()
|
|
61
|
+
env.registerSubscriptionSpecs([fooSystemSubscription]);
|
|
62
|
+
// ^ feeds the plugin loader's dependency sorter — each spec's
|
|
63
|
+
// target.ownerPlugin becomes an implicit init-order dep, so this
|
|
64
|
+
// plugin automatically waits for catalog (the target owner) to
|
|
65
|
+
// finish init + afterPluginsReady before its own runs.
|
|
66
|
+
|
|
67
|
+
// <plugin>-backend afterPluginsReady
|
|
68
|
+
await notificationClient.registerSubscriptionSpec(
|
|
69
|
+
specToRegistration(fooSystemSubscription)
|
|
70
|
+
);
|
|
71
|
+
// dispatch
|
|
72
|
+
await notificationClient.notifyForSubscription({
|
|
73
|
+
specId: fooSystemSubscription.specId,
|
|
74
|
+
resourceKeys: [systemId],
|
|
75
|
+
title,
|
|
76
|
+
body,
|
|
77
|
+
importance,
|
|
78
|
+
action,
|
|
79
|
+
collapseKey,
|
|
80
|
+
subjects,
|
|
81
|
+
});
|
|
82
|
+
|
|
83
|
+
// <plugin>-frontend
|
|
84
|
+
createNotificationSubscriptionExtension({ spec: fooSystemSubscription });
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
**Migrated plugins**: anomaly, incident, maintenance, healthcheck,
|
|
88
|
+
dependency. Each lost its bespoke `notification-groups.ts`,
|
|
89
|
+
`bootstrap*NotificationGroups`, `ensure*Group`, and inheritance walk —
|
|
90
|
+
all of that is now centralized in notification-backend's
|
|
91
|
+
`subscription-engine`.
|
|
92
|
+
|
|
93
|
+
**Plugin loader change** (`@checkstack/backend-api`,
|
|
94
|
+
`@checkstack/backend`): the register-time API gains
|
|
95
|
+
`env.registerSubscriptionSpecs([...specs])`. The dependency sorter
|
|
96
|
+
walks `spec.target.ownerPlugin` for every declared spec and adds the
|
|
97
|
+
target owner as an init-order dependency of the emitting plugin. This
|
|
98
|
+
guarantees that catalog (the owner of the platform's `system` and
|
|
99
|
+
`group` targets) completes init + afterPluginsReady before any
|
|
100
|
+
emitting plugin tries to register its specs against the notification
|
|
101
|
+
service — no string-prefix heuristics, no manual `dependsOnPlugins`
|
|
102
|
+
list, no stub rows. Plugins that fail to declare their specs at
|
|
103
|
+
register time get a clear `Target type X is not registered. Did the
|
|
104
|
+
emitting plugin declare this spec via env.registerSubscriptionSpecs?`
|
|
105
|
+
error from the dispatcher.
|
|
106
|
+
|
|
107
|
+
**Removed** (no backwards compat):
|
|
108
|
+
|
|
109
|
+
- `catalogClient.notifySystemSubscribers` and
|
|
110
|
+
`catalogClient.notifyManySystemSubscribers`
|
|
111
|
+
- `notificationClient.notifyUsers` and `notificationClient.notifyGroups`
|
|
112
|
+
as direct dispatch primitives — replaced by spec-bound
|
|
113
|
+
`notifyForSubscription`
|
|
114
|
+
- catalog's `bootstrapNotificationGroups` (replaced by
|
|
115
|
+
`bootstrapNotificationTargets`)
|
|
116
|
+
|
|
117
|
+
**Enforcement**: the dispatcher rejects calls referencing unregistered
|
|
118
|
+
specIds, specs owned by other plugins, or resourceKeys that haven't been
|
|
119
|
+
pushed via `upsertNotificationResource`. Display metadata for any
|
|
120
|
+
groupId is recoverable via the spec registry, so audit lists render
|
|
121
|
+
correct labels even when an emitter's frontend isn't loaded.
|
|
122
|
+
|
|
123
|
+
**Per-field anomaly mute** keeps working — it now lives inside the
|
|
124
|
+
generic SubscriptionRow's optional `SubControls` panel
|
|
125
|
+
(`AnomalyFieldMuteList`), exposed through the catalog system detail
|
|
126
|
+
page's notifications card.
|
|
127
|
+
|
|
128
|
+
The catalog system detail page renders a "Notifications" card hosting
|
|
129
|
+
`SystemNotificationSubscriptionsSlot`. The matching group surface is
|
|
130
|
+
not yet rendered — group-level subscriptions are wired end-to-end on
|
|
131
|
+
the backend; a follow-up will add the host UI.
|
|
132
|
+
|
|
133
|
+
**Migration of existing subscribers**: target types declare a
|
|
134
|
+
`legacyGroupIdTemplate`; on first registration of each spec,
|
|
135
|
+
notification-backend reads subscribers from the legacy
|
|
136
|
+
`catalog.system.<id>` / `catalog.group.<id>` groups and seeds the new
|
|
137
|
+
spec groups exactly once per (spec × resource) pair, tracked in
|
|
138
|
+
`subscription_migrations`. Anomaly stays opt-in (its target also
|
|
139
|
+
declares the template, but the user-explicit nature of the original
|
|
140
|
+
opt-in flow means the seeding produces the same set of subscribers
|
|
141
|
+
they already had).
|
|
142
|
+
|
|
143
|
+
### Patch Changes
|
|
144
|
+
|
|
145
|
+
- 32d52c6: Batch notifications that affect multiple systems, and collapse lifecycle events into a single card.
|
|
146
|
+
|
|
147
|
+
Notifications now carry an optional `subjects` array (the entities they affect) and an optional `collapseKey` (so related notifications collapse into one row per recipient). Incidents, maintenances, anomalies, healthchecks, and dependency-impact events route through these new fields, so an incident affecting three systems produces one in-app notification + one external send per subscriber instead of three. Lifecycle updates for the same entity (created → updated → resolved) also collapse, with an expandable "+N updates" timeline.
|
|
148
|
+
|
|
149
|
+
Subject kinds are namespaced as `<pluginId>.<localKind>` and built via type-safe helpers exported from each domain's common package (`createSystemSubject`, `incidentCollapseKey`, etc.). The frontend kind registry (`registerSubjectKind`) lets plugins bind icon + label for their kinds; unknown kinds fall back to a generic chip.
|
|
150
|
+
|
|
151
|
+
All notification strategies (SMTP, Slack, Discord, Teams, Telegram, Pushover, Gotify, Webex, Backstage) render the affected subjects natively in their format (HTML cards, Slack blocks, Discord embed fields, adaptive cards, markdown lists, etc.).
|
|
152
|
+
|
|
153
|
+
- Updated dependencies [32d52c6]
|
|
154
|
+
- Updated dependencies [32d52c6]
|
|
155
|
+
- Updated dependencies [32d52c6]
|
|
156
|
+
- Updated dependencies [32d52c6]
|
|
157
|
+
- Updated dependencies [32d52c6]
|
|
158
|
+
- Updated dependencies [32d52c6]
|
|
159
|
+
- @checkstack/gitops-backend@0.2.6
|
|
160
|
+
- @checkstack/integration-backend@0.1.22
|
|
161
|
+
- @checkstack/satellite-backend@0.2.18
|
|
162
|
+
- @checkstack/notification-common@1.0.0
|
|
163
|
+
- @checkstack/catalog-backend@1.0.0
|
|
164
|
+
- @checkstack/catalog-common@2.0.0
|
|
165
|
+
- @checkstack/incident-common@1.0.0
|
|
166
|
+
- @checkstack/maintenance-common@1.0.0
|
|
167
|
+
- @checkstack/healthcheck-common@1.0.0
|
|
168
|
+
- @checkstack/backend-api@0.14.0
|
|
169
|
+
- @checkstack/cache-api@0.2.2
|
|
170
|
+
- @checkstack/command-backend@0.1.22
|
|
171
|
+
- @checkstack/queue-api@0.2.16
|
|
172
|
+
- @checkstack/cache-utils@0.2.2
|
|
173
|
+
|
|
3
174
|
## 0.18.1
|
|
4
175
|
|
|
5
176
|
### Patch Changes
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@checkstack/healthcheck-backend",
|
|
3
|
-
"version": "0.18.1",
|
|
3
|
+
"version": "1.0.1",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"main": "src/index.ts",
|
|
6
6
|
"checkstack": {
|
|
@@ -13,22 +13,23 @@
|
|
|
13
13
|
"lint:code": "eslint . --max-warnings 0"
|
|
14
14
|
},
|
|
15
15
|
"dependencies": {
|
|
16
|
-
"@checkstack/backend-api": "0.13.
|
|
17
|
-
"@checkstack/cache-api": "0.2.
|
|
18
|
-
"@checkstack/cache-utils": "0.2.
|
|
19
|
-
"@checkstack/catalog-backend": "0.7.
|
|
20
|
-
"@checkstack/catalog-common": "1.5.
|
|
21
|
-
"@checkstack/command-backend": "0.1.
|
|
16
|
+
"@checkstack/backend-api": "0.13.1",
|
|
17
|
+
"@checkstack/cache-api": "0.2.1",
|
|
18
|
+
"@checkstack/cache-utils": "0.2.1",
|
|
19
|
+
"@checkstack/catalog-backend": "0.7.1",
|
|
20
|
+
"@checkstack/catalog-common": "1.5.3",
|
|
21
|
+
"@checkstack/command-backend": "0.1.21",
|
|
22
22
|
"@checkstack/common": "0.7.0",
|
|
23
|
-
"@checkstack/gitops-backend": "0.2.
|
|
23
|
+
"@checkstack/gitops-backend": "0.2.5",
|
|
24
24
|
"@checkstack/gitops-common": "0.2.1",
|
|
25
|
-
"@checkstack/healthcheck-common": "0.
|
|
26
|
-
"@checkstack/incident-common": "0.
|
|
27
|
-
"@checkstack/integration-backend": "0.1.
|
|
28
|
-
"@checkstack/maintenance-common": "0.
|
|
29
|
-
"@checkstack/
|
|
30
|
-
"@checkstack/
|
|
31
|
-
"@checkstack/
|
|
25
|
+
"@checkstack/healthcheck-common": "0.13.0",
|
|
26
|
+
"@checkstack/incident-common": "0.5.0",
|
|
27
|
+
"@checkstack/integration-backend": "0.1.21",
|
|
28
|
+
"@checkstack/maintenance-common": "0.5.0",
|
|
29
|
+
"@checkstack/notification-common": "0.3.0",
|
|
30
|
+
"@checkstack/queue-api": "0.2.15",
|
|
31
|
+
"@checkstack/satellite-backend": "0.2.17",
|
|
32
|
+
"@checkstack/signal-common": "0.2.0",
|
|
32
33
|
"@hono/zod-validator": "^0.7.6",
|
|
33
34
|
"drizzle-orm": "^0.45.0",
|
|
34
35
|
"hono": "^4.12.14",
|
|
@@ -39,11 +40,12 @@
|
|
|
39
40
|
"devDependencies": {
|
|
40
41
|
"@checkstack/drizzle-helper": "0.0.4",
|
|
41
42
|
"@checkstack/scripts": "0.1.2",
|
|
42
|
-
"@checkstack/test-utils-backend": "0.1.
|
|
43
|
+
"@checkstack/test-utils-backend": "0.1.21",
|
|
43
44
|
"@checkstack/tsconfig": "0.0.5",
|
|
44
45
|
"@types/bun": "^1.0.0",
|
|
45
46
|
"@types/tdigest": "^0.1.5",
|
|
46
47
|
"date-fns": "^4.1.0",
|
|
48
|
+
"drizzle-kit": "^0.31.10",
|
|
47
49
|
"typescript": "^5.0.0"
|
|
48
50
|
}
|
|
49
51
|
}
|
package/src/index.ts
CHANGED
|
@@ -10,7 +10,13 @@ import {
|
|
|
10
10
|
pluginMetadata,
|
|
11
11
|
healthCheckContract,
|
|
12
12
|
healthcheckRoutes,
|
|
13
|
+
healthcheckSystemSubscription,
|
|
14
|
+
healthcheckGroupSubscription,
|
|
13
15
|
} from "@checkstack/healthcheck-common";
|
|
16
|
+
import {
|
|
17
|
+
NotificationApi,
|
|
18
|
+
specToRegistration,
|
|
19
|
+
} from "@checkstack/notification-common";
|
|
14
20
|
import {
|
|
15
21
|
createBackendPlugin,
|
|
16
22
|
coreServices,
|
|
@@ -67,6 +73,10 @@ export default createBackendPlugin({
|
|
|
67
73
|
metadata: pluginMetadata,
|
|
68
74
|
register(env) {
|
|
69
75
|
env.registerAccessRules(healthCheckAccessRules);
|
|
76
|
+
env.registerSubscriptionSpecs([
|
|
77
|
+
healthcheckSystemSubscription,
|
|
78
|
+
healthcheckGroupSubscription,
|
|
79
|
+
]);
|
|
70
80
|
|
|
71
81
|
// Register hooks as integration events
|
|
72
82
|
const integrationEvents = env.getExtensionPoint(
|
|
@@ -179,6 +189,9 @@ export default createBackendPlugin({
|
|
|
179
189
|
// Create incident client for notification suppression checks
|
|
180
190
|
const incidentClient = rpcClient.forPlugin(IncidentApi);
|
|
181
191
|
|
|
192
|
+
// Notification client for spec-bound dispatch
|
|
193
|
+
const notificationClient = rpcClient.forPlugin(NotificationApi);
|
|
194
|
+
|
|
182
195
|
// Create gitops client for provenance lock checks
|
|
183
196
|
const gitOpsClient = rpcClient.forPlugin(GitOpsApi);
|
|
184
197
|
|
|
@@ -191,6 +204,7 @@ export default createBackendPlugin({
|
|
|
191
204
|
|
|
192
205
|
// Setup queue-based health check worker
|
|
193
206
|
await setupHealthCheckWorker({
|
|
207
|
+
notificationClient,
|
|
194
208
|
db: database,
|
|
195
209
|
registry: healthCheckRegistry,
|
|
196
210
|
collectorRegistry,
|
|
@@ -255,6 +269,7 @@ export default createBackendPlugin({
|
|
|
255
269
|
logger,
|
|
256
270
|
onHook,
|
|
257
271
|
emitHook,
|
|
272
|
+
rpcClient,
|
|
258
273
|
healthCheckRegistry,
|
|
259
274
|
collectorRegistry,
|
|
260
275
|
}) => {
|
|
@@ -267,6 +282,19 @@ export default createBackendPlugin({
|
|
|
267
282
|
logger,
|
|
268
283
|
});
|
|
269
284
|
|
|
285
|
+
// Notification subscription specs. Per-resource group lifecycle
|
|
286
|
+
// is owned by notification-backend now — healthcheck just
|
|
287
|
+
// declares the specs it dispatches under.
|
|
288
|
+
const afterNotificationClient = rpcClient.forPlugin(NotificationApi);
|
|
289
|
+
await Promise.all([
|
|
290
|
+
afterNotificationClient.registerSubscriptionSpec(
|
|
291
|
+
specToRegistration(healthcheckSystemSubscription),
|
|
292
|
+
),
|
|
293
|
+
afterNotificationClient.registerSubscriptionSpec(
|
|
294
|
+
specToRegistration(healthcheckGroupSubscription),
|
|
295
|
+
),
|
|
296
|
+
]);
|
|
297
|
+
|
|
270
298
|
// Register GitOps documentation now that registries are populated
|
|
271
299
|
registerHealthcheckGitOpsDocumentation({
|
|
272
300
|
kindRegistry,
|
|
package/src/queue-executor.test.ts
CHANGED
|
@@ -189,6 +189,9 @@ describe("Queue-Based Health Check Executor", () => {
|
|
|
189
189
|
catalogClient: mockCatalogClient as unknown as Parameters<
|
|
190
190
|
typeof setupHealthCheckWorker
|
|
191
191
|
>[0]["catalogClient"],
|
|
192
|
+
notificationClient: { notifyForSubscription: () => Promise.resolve({ notifiedCount: 0 }) } as unknown as Parameters<
|
|
193
|
+
typeof setupHealthCheckWorker
|
|
194
|
+
>[0]["notificationClient"],
|
|
192
195
|
maintenanceClient: mockMaintenanceClient as unknown as Parameters<
|
|
193
196
|
typeof setupHealthCheckWorker
|
|
194
197
|
>[0]["maintenanceClient"],
|
|
@@ -383,6 +386,9 @@ describe("Queue-Based Health Check Executor", () => {
|
|
|
383
386
|
catalogClient: mockCatalogClient as unknown as Parameters<
|
|
384
387
|
typeof setupHealthCheckWorker
|
|
385
388
|
>[0]["catalogClient"],
|
|
389
|
+
notificationClient: { notifyForSubscription: () => Promise.resolve({ notifiedCount: 0 }) } as unknown as Parameters<
|
|
390
|
+
typeof setupHealthCheckWorker
|
|
391
|
+
>[0]["notificationClient"],
|
|
386
392
|
maintenanceClient: mockMaintenanceClient as unknown as Parameters<
|
|
387
393
|
typeof setupHealthCheckWorker
|
|
388
394
|
>[0]["maintenanceClient"],
|
package/src/queue-executor.ts
CHANGED
|
@@ -24,9 +24,16 @@ import {
|
|
|
24
24
|
type HealthCheckStatus,
|
|
25
25
|
stripEphemeralFields,
|
|
26
26
|
} from "@checkstack/healthcheck-common";
|
|
27
|
-
import {
|
|
27
|
+
import {
|
|
28
|
+
CatalogApi,
|
|
29
|
+
catalogRoutes,
|
|
30
|
+
createSystemSubject,
|
|
31
|
+
} from "@checkstack/catalog-common";
|
|
32
|
+
import { systemHealthCollapseKey } from "@checkstack/healthcheck-common";
|
|
28
33
|
import { MaintenanceApi } from "@checkstack/maintenance-common";
|
|
29
34
|
import { IncidentApi } from "@checkstack/incident-common";
|
|
35
|
+
import { NotificationApi } from "@checkstack/notification-common";
|
|
36
|
+
import { healthcheckSystemSubscription } from "@checkstack/healthcheck-common";
|
|
30
37
|
import { resolveRoute, type InferClient, extractErrorMessage} from "@checkstack/common";
|
|
31
38
|
import { HealthCheckService } from "./service";
|
|
32
39
|
import { healthCheckHooks } from "./hooks";
|
|
@@ -37,6 +44,7 @@ type Db = SafeDatabase<typeof schema>;
|
|
|
37
44
|
type CatalogClient = InferClient<typeof CatalogApi>;
|
|
38
45
|
type MaintenanceClient = InferClient<typeof MaintenanceApi>;
|
|
39
46
|
type IncidentClient = InferClient<typeof IncidentApi>;
|
|
47
|
+
type NotificationClient = InferClient<typeof NotificationApi>;
|
|
40
48
|
|
|
41
49
|
/**
|
|
42
50
|
* Emit the checkCompleted hook if available.
|
|
@@ -138,6 +146,7 @@ async function notifyStateChange(props: {
|
|
|
138
146
|
previousStatus: HealthCheckStatus;
|
|
139
147
|
newStatus: HealthCheckStatus;
|
|
140
148
|
catalogClient: CatalogClient;
|
|
149
|
+
notificationClient: NotificationClient;
|
|
141
150
|
maintenanceClient: MaintenanceClient;
|
|
142
151
|
incidentClient: IncidentClient;
|
|
143
152
|
logger: Logger;
|
|
@@ -148,6 +157,7 @@ async function notifyStateChange(props: {
|
|
|
148
157
|
previousStatus,
|
|
149
158
|
newStatus,
|
|
150
159
|
catalogClient,
|
|
160
|
+
notificationClient,
|
|
151
161
|
maintenanceClient,
|
|
152
162
|
incidentClient,
|
|
153
163
|
logger,
|
|
@@ -225,14 +235,25 @@ async function notifyStateChange(props: {
|
|
|
225
235
|
systemId,
|
|
226
236
|
});
|
|
227
237
|
|
|
238
|
+
void catalogClient; // parents are resolved server-side via stored target edges
|
|
239
|
+
|
|
228
240
|
try {
|
|
229
|
-
await
|
|
230
|
-
|
|
241
|
+
await notificationClient.notifyForSubscription({
|
|
242
|
+
specId: healthcheckSystemSubscription.specId,
|
|
243
|
+
resourceKeys: [systemId],
|
|
231
244
|
title,
|
|
232
245
|
body,
|
|
233
246
|
importance,
|
|
234
247
|
action: { label: "View System", url: systemDetailPath },
|
|
235
|
-
|
|
248
|
+
collapseKey: systemHealthCollapseKey(systemId),
|
|
249
|
+
subjects: [
|
|
250
|
+
createSystemSubject({
|
|
251
|
+
id: systemId,
|
|
252
|
+
name: systemName,
|
|
253
|
+
url: systemDetailPath,
|
|
254
|
+
status: newStatus,
|
|
255
|
+
}),
|
|
256
|
+
],
|
|
236
257
|
});
|
|
237
258
|
logger.debug(
|
|
238
259
|
`Notified subscribers: ${previousStatus} → ${newStatus} for system ${systemId}`,
|
|
@@ -257,6 +278,7 @@ async function executeHealthCheckJob(props: {
|
|
|
257
278
|
logger: Logger;
|
|
258
279
|
signalService: SignalService;
|
|
259
280
|
catalogClient: CatalogClient;
|
|
281
|
+
notificationClient: NotificationClient;
|
|
260
282
|
maintenanceClient: MaintenanceClient;
|
|
261
283
|
incidentClient: IncidentClient;
|
|
262
284
|
getEmitHook: () => EmitHookFn | undefined;
|
|
@@ -270,6 +292,7 @@ async function executeHealthCheckJob(props: {
|
|
|
270
292
|
logger,
|
|
271
293
|
signalService,
|
|
272
294
|
catalogClient,
|
|
295
|
+
notificationClient,
|
|
273
296
|
maintenanceClient,
|
|
274
297
|
incidentClient,
|
|
275
298
|
getEmitHook,
|
|
@@ -575,6 +598,7 @@ async function executeHealthCheckJob(props: {
|
|
|
575
598
|
const newState = await service.getSystemHealthStatus(systemId);
|
|
576
599
|
if (newState.status !== previousStatus) {
|
|
577
600
|
await notifyStateChange({
|
|
601
|
+
notificationClient,
|
|
578
602
|
systemId,
|
|
579
603
|
systemName,
|
|
580
604
|
previousStatus,
|
|
@@ -669,6 +693,7 @@ async function executeHealthCheckJob(props: {
|
|
|
669
693
|
const newState = await service.getSystemHealthStatus(systemId);
|
|
670
694
|
if (newState.status !== previousStatus) {
|
|
671
695
|
await notifyStateChange({
|
|
696
|
+
notificationClient,
|
|
672
697
|
systemId,
|
|
673
698
|
systemName,
|
|
674
699
|
previousStatus,
|
|
@@ -800,6 +825,7 @@ async function executeHealthCheckJob(props: {
|
|
|
800
825
|
const newState = await service.getSystemHealthStatus(systemId);
|
|
801
826
|
if (newState.status !== previousStatus) {
|
|
802
827
|
await notifyStateChange({
|
|
828
|
+
notificationClient,
|
|
803
829
|
systemId,
|
|
804
830
|
systemName,
|
|
805
831
|
previousStatus,
|
|
@@ -868,6 +894,7 @@ export async function setupHealthCheckWorker(props: {
|
|
|
868
894
|
queueManager: QueueManager;
|
|
869
895
|
signalService: SignalService;
|
|
870
896
|
catalogClient: CatalogClient;
|
|
897
|
+
notificationClient: NotificationClient;
|
|
871
898
|
maintenanceClient: MaintenanceClient;
|
|
872
899
|
incidentClient: IncidentClient;
|
|
873
900
|
getEmitHook: () => EmitHookFn | undefined;
|
|
@@ -881,6 +908,7 @@ export async function setupHealthCheckWorker(props: {
|
|
|
881
908
|
queueManager,
|
|
882
909
|
signalService,
|
|
883
910
|
catalogClient,
|
|
911
|
+
notificationClient,
|
|
884
912
|
maintenanceClient,
|
|
885
913
|
incidentClient,
|
|
886
914
|
getEmitHook,
|
|
@@ -901,6 +929,7 @@ export async function setupHealthCheckWorker(props: {
|
|
|
901
929
|
logger,
|
|
902
930
|
signalService,
|
|
903
931
|
catalogClient,
|
|
932
|
+
notificationClient,
|
|
904
933
|
maintenanceClient,
|
|
905
934
|
incidentClient,
|
|
906
935
|
getEmitHook,
|
package/src/retention-job.ts
CHANGED
|
@@ -6,7 +6,7 @@ import {
|
|
|
6
6
|
healthCheckAggregates,
|
|
7
7
|
DEFAULT_RETENTION_CONFIG,
|
|
8
8
|
} from "./schema";
|
|
9
|
-
import { eq, and, lt } from "drizzle-orm";
|
|
9
|
+
import { eq, and, lt, sql } from "drizzle-orm";
|
|
10
10
|
import type { QueueManager } from "@checkstack/queue-api";
|
|
11
11
|
|
|
12
12
|
type Db = SafeDatabase<typeof schema>;
|
|
@@ -228,23 +228,59 @@ async function rollupHourlyAggregates(params: RollupParams) {
|
|
|
228
228
|
const p95LatencyMs =
|
|
229
229
|
p95Values.length > 0 ? Math.max(...p95Values) : undefined;
|
|
230
230
|
|
|
231
|
-
//
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
231
|
+
// Upsert the daily aggregate. A row may already exist for this
|
|
232
|
+
// (configurationId, systemId, day, daily, sourceId=null) tuple if a
|
|
233
|
+
// prior rollup ran and then late-arriving hourly buckets (e.g. from
|
|
234
|
+
// a satellite that was offline) were rolled up afterwards. Merge in
|
|
235
|
+
// that case rather than crashing — sums add, min/max/p95 fold.
|
|
236
|
+
const newLatencySum = latencySumMs > 0 ? latencySumMs : undefined;
|
|
237
|
+
await db
|
|
238
|
+
.insert(healthCheckAggregates)
|
|
239
|
+
.values({
|
|
240
|
+
configurationId,
|
|
241
|
+
systemId,
|
|
242
|
+
bucketStart: bucket.bucketStart,
|
|
243
|
+
bucketSize: "daily",
|
|
244
|
+
runCount,
|
|
245
|
+
healthyCount,
|
|
246
|
+
degradedCount,
|
|
247
|
+
unhealthyCount,
|
|
248
|
+
latencySumMs: newLatencySum,
|
|
249
|
+
avgLatencyMs,
|
|
250
|
+
minLatencyMs,
|
|
251
|
+
maxLatencyMs,
|
|
252
|
+
p95LatencyMs,
|
|
253
|
+
aggregatedResult: undefined, // Cannot combine result across hours
|
|
254
|
+
})
|
|
255
|
+
.onConflictDoUpdate({
|
|
256
|
+
target: [
|
|
257
|
+
healthCheckAggregates.configurationId,
|
|
258
|
+
healthCheckAggregates.systemId,
|
|
259
|
+
healthCheckAggregates.bucketStart,
|
|
260
|
+
healthCheckAggregates.bucketSize,
|
|
261
|
+
healthCheckAggregates.sourceId,
|
|
262
|
+
],
|
|
263
|
+
set: {
|
|
264
|
+
runCount: sql`${healthCheckAggregates.runCount} + ${runCount}`,
|
|
265
|
+
healthyCount: sql`${healthCheckAggregates.healthyCount} + ${healthyCount}`,
|
|
266
|
+
degradedCount: sql`${healthCheckAggregates.degradedCount} + ${degradedCount}`,
|
|
267
|
+
unhealthyCount: sql`${healthCheckAggregates.unhealthyCount} + ${unhealthyCount}`,
|
|
268
|
+
latencySumMs: sql`COALESCE(${healthCheckAggregates.latencySumMs}, 0) + ${newLatencySum ?? 0}`,
|
|
269
|
+
avgLatencyMs: sql`CASE WHEN (${healthCheckAggregates.runCount} + ${runCount}) > 0 THEN (COALESCE(${healthCheckAggregates.latencySumMs}, 0) + ${newLatencySum ?? 0}) / (${healthCheckAggregates.runCount} + ${runCount}) ELSE ${healthCheckAggregates.avgLatencyMs} END`,
|
|
270
|
+
minLatencyMs:
|
|
271
|
+
minLatencyMs === undefined
|
|
272
|
+
? sql`${healthCheckAggregates.minLatencyMs}`
|
|
273
|
+
: sql`LEAST(COALESCE(${healthCheckAggregates.minLatencyMs}, ${minLatencyMs}), ${minLatencyMs})`,
|
|
274
|
+
maxLatencyMs:
|
|
275
|
+
maxLatencyMs === undefined
|
|
276
|
+
? sql`${healthCheckAggregates.maxLatencyMs}`
|
|
277
|
+
: sql`GREATEST(COALESCE(${healthCheckAggregates.maxLatencyMs}, ${maxLatencyMs}), ${maxLatencyMs})`,
|
|
278
|
+
p95LatencyMs:
|
|
279
|
+
p95LatencyMs === undefined
|
|
280
|
+
? sql`${healthCheckAggregates.p95LatencyMs}`
|
|
281
|
+
: sql`GREATEST(COALESCE(${healthCheckAggregates.p95LatencyMs}, ${p95LatencyMs}), ${p95LatencyMs})`,
|
|
282
|
+
},
|
|
283
|
+
});
|
|
248
284
|
|
|
249
285
|
// Delete processed hourly aggregates
|
|
250
286
|
for (const hourly of bucket.aggregates) {
|