@checkstack/healthcheck-backend 1.0.0 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +31 -0
- package/package.json +1 -1
- package/src/retention-job.ts +54 -18
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,36 @@
|
|
|
1
1
|
# @checkstack/healthcheck-backend
|
|
2
2
|
|
|
3
|
+
## 1.0.1
|
|
4
|
+
|
|
5
|
+
### Patch Changes
|
|
6
|
+
|
|
7
|
+
- 2a749d3: fix: run afterPluginsReady in topological order; merge daily rollups on conflict
|
|
8
|
+
|
|
9
|
+
Two resilience fixes for the dependency chain:
|
|
10
|
+
|
|
11
|
+
1. **Plugin loader**: Phase 3 (`afterPluginsReady`) now iterates plugins
|
|
12
|
+
in the same topologically-sorted order as Phase 2 (`init`). Previously
|
|
13
|
+
it iterated `pendingInits` in registration order, which could break
|
|
14
|
+
subscription-spec dependencies — catalog's afterPluginsReady registers
|
|
15
|
+
`catalog.system` and `catalog.group` notification targets, and emitting
|
|
16
|
+
plugins (incident, maintenance, …) call `registerSubscriptionSpec`
|
|
17
|
+
against those targets in their own afterPluginsReady. With registration
|
|
18
|
+
order, an emitter could run before catalog and hit
|
|
19
|
+
`Target type catalog.group is not registered`. Sorted order encodes
|
|
20
|
+
the dependency via `spec.target.ownerPlugin`, so the emitter now
|
|
21
|
+
always runs after the target owner.
|
|
22
|
+
|
|
23
|
+
2. **Healthcheck retention job**: the daily rollup now upserts
|
|
24
|
+
`health_check_aggregates` with `ON CONFLICT DO UPDATE` instead of a
|
|
25
|
+
plain insert. Previously, late-arriving hourly aggregates (e.g. from
|
|
26
|
+
a satellite that was offline when the prior rollup ran) would crash
|
|
27
|
+
the rollup with a unique-constraint violation on
|
|
28
|
+
`(configuration_id, system_id, bucket_start, bucket_size, source_id)`.
|
|
29
|
+
The merge sums counts, recomputes the average latency, and folds min/max/p95 into the existing daily
|
|
30
|
+
row.
|
|
31
|
+
|
|
32
|
+
- @checkstack/satellite-backend@0.2.19
|
|
33
|
+
|
|
3
34
|
## 1.0.0
|
|
4
35
|
|
|
5
36
|
### Major Changes
|
package/package.json
CHANGED
package/src/retention-job.ts
CHANGED
|
@@ -6,7 +6,7 @@ import {
|
|
|
6
6
|
healthCheckAggregates,
|
|
7
7
|
DEFAULT_RETENTION_CONFIG,
|
|
8
8
|
} from "./schema";
|
|
9
|
-
import { eq, and, lt } from "drizzle-orm";
|
|
9
|
+
import { eq, and, lt, sql } from "drizzle-orm";
|
|
10
10
|
import type { QueueManager } from "@checkstack/queue-api";
|
|
11
11
|
|
|
12
12
|
type Db = SafeDatabase<typeof schema>;
|
|
@@ -228,23 +228,59 @@ async function rollupHourlyAggregates(params: RollupParams) {
|
|
|
228
228
|
const p95LatencyMs =
|
|
229
229
|
p95Values.length > 0 ? Math.max(...p95Values) : undefined;
|
|
230
230
|
|
|
231
|
-
//
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
231
|
+
// Upsert the daily aggregate. A row may already exist for this
|
|
232
|
+
// (configurationId, systemId, day, daily, sourceId=null) tuple if a
|
|
233
|
+
// prior rollup ran and then late-arriving hourly buckets (e.g. from
|
|
234
|
+
// a satellite that was offline) were rolled up afterwards. Merge in
|
|
235
|
+
// that case rather than crashing — sums add, min/max/p95 fold.
|
|
236
|
+
const newLatencySum = latencySumMs > 0 ? latencySumMs : undefined;
|
|
237
|
+
await db
|
|
238
|
+
.insert(healthCheckAggregates)
|
|
239
|
+
.values({
|
|
240
|
+
configurationId,
|
|
241
|
+
systemId,
|
|
242
|
+
bucketStart: bucket.bucketStart,
|
|
243
|
+
bucketSize: "daily",
|
|
244
|
+
runCount,
|
|
245
|
+
healthyCount,
|
|
246
|
+
degradedCount,
|
|
247
|
+
unhealthyCount,
|
|
248
|
+
latencySumMs: newLatencySum,
|
|
249
|
+
avgLatencyMs,
|
|
250
|
+
minLatencyMs,
|
|
251
|
+
maxLatencyMs,
|
|
252
|
+
p95LatencyMs,
|
|
253
|
+
aggregatedResult: undefined, // Cannot combine result across hours
|
|
254
|
+
})
|
|
255
|
+
.onConflictDoUpdate({
|
|
256
|
+
target: [
|
|
257
|
+
healthCheckAggregates.configurationId,
|
|
258
|
+
healthCheckAggregates.systemId,
|
|
259
|
+
healthCheckAggregates.bucketStart,
|
|
260
|
+
healthCheckAggregates.bucketSize,
|
|
261
|
+
healthCheckAggregates.sourceId,
|
|
262
|
+
],
|
|
263
|
+
set: {
|
|
264
|
+
runCount: sql`${healthCheckAggregates.runCount} + ${runCount}`,
|
|
265
|
+
healthyCount: sql`${healthCheckAggregates.healthyCount} + ${healthyCount}`,
|
|
266
|
+
degradedCount: sql`${healthCheckAggregates.degradedCount} + ${degradedCount}`,
|
|
267
|
+
unhealthyCount: sql`${healthCheckAggregates.unhealthyCount} + ${unhealthyCount}`,
|
|
268
|
+
latencySumMs: sql`COALESCE(${healthCheckAggregates.latencySumMs}, 0) + ${newLatencySum ?? 0}`,
|
|
269
|
+
avgLatencyMs: sql`CASE WHEN (${healthCheckAggregates.runCount} + ${runCount}) > 0 THEN (COALESCE(${healthCheckAggregates.latencySumMs}, 0) + ${newLatencySum ?? 0}) / (${healthCheckAggregates.runCount} + ${runCount}) ELSE ${healthCheckAggregates.avgLatencyMs} END`,
|
|
270
|
+
minLatencyMs:
|
|
271
|
+
minLatencyMs === undefined
|
|
272
|
+
? sql`${healthCheckAggregates.minLatencyMs}`
|
|
273
|
+
: sql`LEAST(COALESCE(${healthCheckAggregates.minLatencyMs}, ${minLatencyMs}), ${minLatencyMs})`,
|
|
274
|
+
maxLatencyMs:
|
|
275
|
+
maxLatencyMs === undefined
|
|
276
|
+
? sql`${healthCheckAggregates.maxLatencyMs}`
|
|
277
|
+
: sql`GREATEST(COALESCE(${healthCheckAggregates.maxLatencyMs}, ${maxLatencyMs}), ${maxLatencyMs})`,
|
|
278
|
+
p95LatencyMs:
|
|
279
|
+
p95LatencyMs === undefined
|
|
280
|
+
? sql`${healthCheckAggregates.p95LatencyMs}`
|
|
281
|
+
: sql`GREATEST(COALESCE(${healthCheckAggregates.p95LatencyMs}, ${p95LatencyMs}), ${p95LatencyMs})`,
|
|
282
|
+
},
|
|
283
|
+
});
|
|
248
284
|
|
|
249
285
|
// Delete processed hourly aggregates
|
|
250
286
|
for (const hourly of bucket.aggregates) {
|