npm - @checkstack/healthcheck-backend - Versions diffs - 1.0.0 → 1.0.1 - Mend

@checkstack/healthcheck-backend 1.0.0 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3)

package/CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,36 @@
 # @checkstack/healthcheck-backend
+## 1.0.1
+### Patch Changes
+- 2a749d3: fix: run afterPluginsReady in topological order; merge daily rollups on conflict
+  Two resilience fixes for the dependency chain:
+  1. **Plugin loader**: Phase 3 (`afterPluginsReady`) now iterates plugins
+     in the same topologically-sorted order as Phase 2 (`init`). Previously
+     it iterated `pendingInits` in registration order, which raced
+     subscription-spec dependencies — catalog's afterPluginsReady registers
+     `catalog.system` and `catalog.group` notification targets, and emitting
+     plugins (incident, maintenance, …) call `registerSubscriptionSpec`
+     against those targets in their own afterPluginsReady. With registration
+     order, an emitter could run before catalog and hit
+     `Target type catalog.group is not registered`. Sorted order encodes
+     the dependency via `spec.target.ownerPlugin`, so the emitter now
+     always runs after the target owner.
+  2. **Healthcheck retention job**: the daily rollup now upserts
+     `health_check_aggregates` with `ON CONFLICT DO UPDATE` instead of a
+     plain insert. Previously, late-arriving hourly aggregates (e.g. from
+     a satellite that was offline when the prior rollup ran) would crash
+     the rollup with a unique-constraint violation on
+     `(configuration_id, system_id, bucket_start, bucket_size, source_id)`.
+     The merge sums counts and folds min/max/p95 into the existing daily
+     row.
+  - @checkstack/satellite-backend@0.2.19
 ## 1.0.0
 ### Major Changes

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@checkstack/healthcheck-backend",
-  "version": "1.0.0",
+  "version": "1.0.1",
   "type": "module",
   "main": "src/index.ts",
   "checkstack": {

package/src/retention-job.ts CHANGED Viewed

@@ -6,7 +6,7 @@ import {
   healthCheckAggregates,
   DEFAULT_RETENTION_CONFIG,
 } from "./schema";
-import { eq, and, lt } from "drizzle-orm";
+import { eq, and, lt, sql } from "drizzle-orm";
 import type { QueueManager } from "@checkstack/queue-api";
 type Db = SafeDatabase<typeof schema>;
@@ -228,23 +228,59 @@ async function rollupHourlyAggregates(params: RollupParams) {
     const p95LatencyMs =
       p95Values.length > 0 ? Math.max(...p95Values) : undefined;
-    // Insert daily aggregate
-    await db.insert(healthCheckAggregates).values({
-      configurationId,
-      systemId,
-      bucketStart: bucket.bucketStart,
-      bucketSize: "daily",
-      runCount,
-      healthyCount,
-      degradedCount,
-      unhealthyCount,
-      latencySumMs: latencySumMs > 0 ? latencySumMs : undefined,
-      avgLatencyMs,
-      minLatencyMs,
-      maxLatencyMs,
-      p95LatencyMs,
-      aggregatedResult: undefined, // Cannot combine result across hours
-    });
+    // Upsert the daily aggregate. A row may already exist for this
+    // (configurationId, systemId, day, daily, sourceId=null) tuple if a
+    // prior rollup ran and then late-arriving hourly buckets (e.g. from
+    // a satellite that was offline) were rolled up afterwards. Merge in
+    // that case rather than crashing — sums add, min/max/p95 fold.
+    const newLatencySum = latencySumMs > 0 ? latencySumMs : undefined;
+    await db
+      .insert(healthCheckAggregates)
+      .values({
+        configurationId,
+        systemId,
+        bucketStart: bucket.bucketStart,
+        bucketSize: "daily",
+        runCount,
+        healthyCount,
+        degradedCount,
+        unhealthyCount,
+        latencySumMs: newLatencySum,
+        avgLatencyMs,
+        minLatencyMs,
+        maxLatencyMs,
+        p95LatencyMs,
+        aggregatedResult: undefined, // Cannot combine result across hours
+      })
+      .onConflictDoUpdate({
+        target: [
+          healthCheckAggregates.configurationId,
+          healthCheckAggregates.systemId,
+          healthCheckAggregates.bucketStart,
+          healthCheckAggregates.bucketSize,
+          healthCheckAggregates.sourceId,
+        ],
+        set: {
+          runCount: sql`${healthCheckAggregates.runCount} + ${runCount}`,
+          healthyCount: sql`${healthCheckAggregates.healthyCount} + ${healthyCount}`,
+          degradedCount: sql`${healthCheckAggregates.degradedCount} + ${degradedCount}`,
+          unhealthyCount: sql`${healthCheckAggregates.unhealthyCount} + ${unhealthyCount}`,
+          latencySumMs: sql`COALESCE(${healthCheckAggregates.latencySumMs}, 0) + ${newLatencySum ?? 0}`,
+          avgLatencyMs: sql`CASE WHEN (${healthCheckAggregates.runCount} + ${runCount}) > 0 THEN (COALESCE(${healthCheckAggregates.latencySumMs}, 0) + ${newLatencySum ?? 0}) / (${healthCheckAggregates.runCount} + ${runCount}) ELSE ${healthCheckAggregates.avgLatencyMs} END`,
+          minLatencyMs:
+            minLatencyMs === undefined
+              ? sql`${healthCheckAggregates.minLatencyMs}`
+              : sql`LEAST(COALESCE(${healthCheckAggregates.minLatencyMs}, ${minLatencyMs}), ${minLatencyMs})`,
+          maxLatencyMs:
+            maxLatencyMs === undefined
+              ? sql`${healthCheckAggregates.maxLatencyMs}`
+              : sql`GREATEST(COALESCE(${healthCheckAggregates.maxLatencyMs}, ${maxLatencyMs}), ${maxLatencyMs})`,
+          p95LatencyMs:
+            p95LatencyMs === undefined
+              ? sql`${healthCheckAggregates.p95LatencyMs}`
+              : sql`GREATEST(COALESCE(${healthCheckAggregates.p95LatencyMs}, ${p95LatencyMs}), ${p95LatencyMs})`,
+        },
+      });
     // Delete processed hourly aggregates
     for (const hourly of bucket.aggregates) {