npm - @checkstack/healthcheck-backend - Versions diffs - 0.4.2 → 0.6.0 - Mend

@checkstack/healthcheck-backend 0.4.2 → 0.6.0

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (19)

package/CHANGELOG.md +116 -0
package/drizzle/0007_tense_misty_knight.sql +1 -0
package/drizzle/0008_broad_black_tom.sql +1 -0
package/drizzle/meta/0007_snapshot.json +413 -0
package/drizzle/meta/0008_snapshot.json +420 -0
package/drizzle/meta/_journal.json +14 -0
package/package.json +2 -1
package/src/aggregation-utils.test.ts +644 -0
package/src/aggregation-utils.ts +399 -0
package/src/aggregation.test.ts +222 -79
package/src/index.ts +16 -0
package/src/queue-executor.test.ts +133 -0
package/src/queue-executor.ts +48 -2
package/src/retention-job.ts +72 -43
package/src/router.test.ts +14 -3
package/src/router.ts +14 -1
package/src/schema.ts +8 -4
package/src/service-pause.test.ts +50 -0
package/src/service.ts +273 -101

package/src/service.ts CHANGED

@@ -10,6 +10,7 @@ import {
   healthCheckConfigurations,
   systemHealthChecks,
   healthCheckRuns,
+  healthCheckAggregates,
   VersionedStateThresholds,
 } from "./schema";
 import * as schema from "./schema";
@@ -20,7 +21,17 @@ import { stateThresholds } from "./state-thresholds-migrations";
 import type {
   HealthCheckRegistry,
   SafeDatabase,
+  CollectorRegistry,
 } from "@checkstack/backend-api";
+import {
+  aggregateCollectorData,
+  extractLatencies,
+  mergeTieredBuckets,
+  reaggregateBuckets,
+  countStatuses,
+  calculateLatencyStats,
+  type NormalizedBucket,
+} from "./aggregation-utils";
 // Drizzle type helper - uses SafeDatabase to prevent relational query API usage
 type Db = SafeDatabase<typeof schema>;
@@ -43,6 +54,7 @@ export class HealthCheckService {
   constructor(
     private db: Db,
     private registry?: HealthCheckRegistry,
+    private collectorRegistry?: CollectorRegistry,
   ) {}
   async createConfiguration(
@@ -93,6 +105,20 @@ export class HealthCheckService {
       .where(eq(healthCheckConfigurations.id, id));
   }
+  async pauseConfiguration(id: string): Promise<void> {
+    await this.db
+      .update(healthCheckConfigurations)
+      .set({ paused: true, updatedAt: new Date() })
+      .where(eq(healthCheckConfigurations.id, id));
+  }
+  async resumeConfiguration(id: string): Promise<void> {
+    await this.db
+      .update(healthCheckConfigurations)
+      .set({ paused: false, updatedAt: new Date() })
+      .where(eq(healthCheckConfigurations.id, id));
+  }
   async getConfigurations(): Promise<HealthCheckConfiguration[]> {
     const configs = await this.db.select().from(healthCheckConfigurations);
     return configs.map((c) => this.mapConfig(c));
@@ -551,9 +577,9 @@ export class HealthCheckService {
   }
   /**
-   * Get aggregated health check history with bucketed metrics.
-   * Currently aggregates raw data on-the-fly. Will merge with stored aggregates
-   * once the retention job populates historical data.
+   * Get aggregated health check history with dynamically-sized buckets.
+   * Queries all three tiers (raw, hourly, daily) and merges with priority.
+   * Bucket interval is calculated as (endDate - startDate) / targetPoints.
    */
   async getAggregatedHistory(
     props: {
@@ -561,23 +587,25 @@ export class HealthCheckService {
       configurationId: string;
       startDate: Date;
       endDate: Date;
-      bucketSize: "hourly" | "daily" | "auto";
+      targetPoints?: number;
     },
     options: { includeAggregatedResult: boolean },
   ) {
-    const { systemId, configurationId, startDate, endDate } = props;
-    let bucketSize = props.bucketSize;
-    // Auto-select bucket size based on range
-    if (bucketSize === "auto") {
-      const diffDays =
-        (endDate.getTime() - startDate.getTime()) / (1000 * 60 * 60 * 24);
-      bucketSize = diffDays > 7 ? "daily" : "hourly";
-    }
+    const {
+      systemId,
+      configurationId,
+      startDate,
+      endDate,
+      targetPoints = 500,
+    } = props;
+    // Calculate dynamic bucket interval
+    const rangeMs = endDate.getTime() - startDate.getTime();
+    const MIN_INTERVAL_MS = 1000; // 1 second minimum
+    const bucketIntervalMs = Math.max(rangeMs / targetPoints, MIN_INTERVAL_MS);
+    const bucketIntervalSeconds = Math.round(bucketIntervalMs / 1000);
     // Get the configuration to find the strategy
-    // Note: Using standard select instead of relational query API
-    // as the relational API is blocked by the scoped database proxy
     const [config] = await this.db
       .select()
       .from(healthCheckConfigurations)
@@ -590,21 +618,175 @@ export class HealthCheckService {
         ? this.registry.getStrategy(config.strategyId)
         : undefined;
-    // Query raw runs within the date range (including result for metadata)
-    const runs = await this.db
-      .select()
-      .from(healthCheckRuns)
-      .where(
-        and(
-          eq(healthCheckRuns.systemId, systemId),
-          eq(healthCheckRuns.configurationId, configurationId),
-          gte(healthCheckRuns.timestamp, startDate),
-          lte(healthCheckRuns.timestamp, endDate),
-        ),
-      )
-      .orderBy(healthCheckRuns.timestamp);
+    // Query all three tiers in parallel
+    const [rawRuns, hourlyAggregates, dailyAggregates] = await Promise.all([
+      // Raw runs
+      this.db
+        .select()
+        .from(healthCheckRuns)
+        .where(
+          and(
+            eq(healthCheckRuns.systemId, systemId),
+            eq(healthCheckRuns.configurationId, configurationId),
+            gte(healthCheckRuns.timestamp, startDate),
+            lte(healthCheckRuns.timestamp, endDate),
+          ),
+        )
+        .orderBy(healthCheckRuns.timestamp),
+      // Hourly aggregates
+      this.db
+        .select()
+        .from(healthCheckAggregates)
+        .where(
+          and(
+            eq(healthCheckAggregates.systemId, systemId),
+            eq(healthCheckAggregates.configurationId, configurationId),
+            eq(healthCheckAggregates.bucketSize, "hourly"),
+            gte(healthCheckAggregates.bucketStart, startDate),
+            lte(healthCheckAggregates.bucketStart, endDate),
+          ),
+        )
+        .orderBy(healthCheckAggregates.bucketStart),
+      // Daily aggregates
+      this.db
+        .select()
+        .from(healthCheckAggregates)
+        .where(
+          and(
+            eq(healthCheckAggregates.systemId, systemId),
+            eq(healthCheckAggregates.configurationId, configurationId),
+            eq(healthCheckAggregates.bucketSize, "daily"),
+            gte(healthCheckAggregates.bucketStart, startDate),
+            lte(healthCheckAggregates.bucketStart, endDate),
+          ),
+        )
+        .orderBy(healthCheckAggregates.bucketStart),
+    ]);
+    // Normalize raw runs to buckets using target interval for proper aggregation
+    // This ensures aggregatedResult is computed per target bucket, not per sub-bucket
+    const rawBuckets = this.normalizeRawRunsToBuckets({
+      runs: rawRuns,
+      bucketIntervalMs: bucketIntervalMs,
+      rangeStart: startDate,
+      strategy,
+    });
+    // Normalize hourly and daily aggregates to NormalizedBucket format
+    const HOURLY_MS = 60 * 60 * 1000;
+    const DAILY_MS = 24 * 60 * 60 * 1000;
+    const hourlyBuckets: NormalizedBucket[] = hourlyAggregates.map((agg) => ({
+      bucketStart: agg.bucketStart,
+      bucketEndMs: agg.bucketStart.getTime() + HOURLY_MS,
+      runCount: agg.runCount,
+      healthyCount: agg.healthyCount,
+      degradedCount: agg.degradedCount,
+      unhealthyCount: agg.unhealthyCount,
+      latencySumMs: agg.latencySumMs ?? undefined,
+      minLatencyMs: agg.minLatencyMs ?? undefined,
+      maxLatencyMs: agg.maxLatencyMs ?? undefined,
+      p95LatencyMs: agg.p95LatencyMs ?? undefined,
+      aggregatedResult: agg.aggregatedResult ?? undefined,
+      sourceTier: "hourly" as const,
+    }));
+    const dailyBuckets: NormalizedBucket[] = dailyAggregates.map((agg) => ({
+      bucketStart: agg.bucketStart,
+      bucketEndMs: agg.bucketStart.getTime() + DAILY_MS,
+      runCount: agg.runCount,
+      healthyCount: agg.healthyCount,
+      degradedCount: agg.degradedCount,
+      unhealthyCount: agg.unhealthyCount,
+      latencySumMs: agg.latencySumMs ?? undefined,
+      minLatencyMs: agg.minLatencyMs ?? undefined,
+      maxLatencyMs: agg.maxLatencyMs ?? undefined,
+      p95LatencyMs: agg.p95LatencyMs ?? undefined,
+      aggregatedResult: agg.aggregatedResult ?? undefined,
+      sourceTier: "daily" as const,
+    }));
+    // Merge all tiers with priority (raw > hourly > daily)
+    const mergedBuckets = mergeTieredBuckets({
+      rawBuckets,
+      hourlyBuckets,
+      dailyBuckets,
+    });
+    // Re-aggregate to target bucket interval
+    const targetBuckets = reaggregateBuckets({
+      sourceBuckets: mergedBuckets,
+      targetIntervalMs: bucketIntervalMs,
+      rangeStart: startDate,
+    });
+    // Convert to output format
+    const buckets = targetBuckets.map((bucket) => {
+      const successRate =
+        bucket.runCount > 0 ? bucket.healthyCount / bucket.runCount : 0;
+      const avgLatencyMs =
+        bucket.latencySumMs !== undefined && bucket.runCount > 0
+          ? Math.round(bucket.latencySumMs / bucket.runCount)
+          : undefined;
-    // Group runs into buckets (with full result for metadata aggregation)
+      const baseBucket = {
+        bucketStart: bucket.bucketStart,
+        bucketIntervalSeconds,
+        runCount: bucket.runCount,
+        healthyCount: bucket.healthyCount,
+        degradedCount: bucket.degradedCount,
+        unhealthyCount: bucket.unhealthyCount,
+        successRate,
+        avgLatencyMs,
+        minLatencyMs: bucket.minLatencyMs,
+        maxLatencyMs: bucket.maxLatencyMs,
+        p95LatencyMs: bucket.p95LatencyMs,
+      };
+      // Include aggregatedResult if available (only from raw data)
+      if (options.includeAggregatedResult && bucket.aggregatedResult) {
+        return {
+          ...baseBucket,
+          aggregatedResult: bucket.aggregatedResult,
+        };
+      }
+      return baseBucket;
+    });
+    return { buckets, bucketIntervalSeconds };
+  }
+  /**
+   * Normalize raw runs into buckets for merging with aggregate tiers.
+   */
+  private normalizeRawRunsToBuckets(params: {
+    runs: Array<{
+      id: string;
+      status: "healthy" | "unhealthy" | "degraded";
+      timestamp: Date;
+      latencyMs: number | null;
+      result: Record<string, unknown> | null;
+    }>;
+    bucketIntervalMs: number;
+    rangeStart: Date;
+    strategy?: {
+      aggregateResult: (
+        runs: Array<{
+          status: "healthy" | "unhealthy" | "degraded";
+          latencyMs?: number;
+          metadata?: unknown;
+        }>,
+      ) => unknown;
+    };
+  }): NormalizedBucket[] {
+    const { runs, bucketIntervalMs, rangeStart, strategy } = params;
+    if (runs.length === 0) {
+      return [];
+    }
+    // Group runs by bucket
     const bucketMap = new Map<
       string,
       {
@@ -618,17 +800,21 @@ export class HealthCheckService {
     >();
     for (const run of runs) {
-      const bucketStart = this.getBucketStart(run.timestamp, bucketSize);
+      const bucketStart = this.getBucketStartDynamic(
+        run.timestamp,
+        rangeStart,
+        bucketIntervalMs,
+      );
       const key = bucketStart.toISOString();
       if (!bucketMap.has(key)) {
         bucketMap.set(key, { bucketStart, runs: [] });
       }
-      // run.result is StoredHealthCheckResult: { status, latencyMs, message, metadata }
-      // Strategy's aggregateResult expects metadata to be the strategy-specific fields
       const storedResult = run.result as {
         metadata?: Record<string, unknown>;
       } | null;
       bucketMap.get(key)!.runs.push({
         status: run.status,
         latencyMs: run.latencyMs ?? undefined,
@@ -636,85 +822,70 @@ export class HealthCheckService {
       });
     }
-    // Calculate metrics for each bucket
-    const buckets = [...bucketMap.values()].map((bucket) => {
-      const runCount = bucket.runs.length;
-      const healthyCount = bucket.runs.filter(
-        (r) => r.status === "healthy",
-      ).length;
-      const degradedCount = bucket.runs.filter(
-        (r) => r.status === "degraded",
-      ).length;
-      const unhealthyCount = bucket.runs.filter(
-        (r) => r.status === "unhealthy",
-      ).length;
-      const successRate = runCount > 0 ? healthyCount / runCount : 0;
-      const latencies = bucket.runs
-        .map((r) => r.latencyMs)
-        .filter((l): l is number => typeof l === "number");
-      const avgLatencyMs =
-        latencies.length > 0
-          ? Math.round(latencies.reduce((a, b) => a + b, 0) / latencies.length)
-          : undefined;
-      const minLatencyMs =
-        latencies.length > 0 ? Math.min(...latencies) : undefined;
-      const maxLatencyMs =
-        latencies.length > 0 ? Math.max(...latencies) : undefined;
-      const p95LatencyMs =
-        latencies.length > 0
-          ? this.calculatePercentile(latencies, 95)
-          : undefined;
+    // Convert to NormalizedBucket format
+    const result: NormalizedBucket[] = [];
-      // Build base bucket (always included)
-      const baseBucket = {
+    for (const [, bucket] of bucketMap) {
+      const { healthyCount, degradedCount, unhealthyCount } = countStatuses(
+        bucket.runs,
+      );
+      const latencies = extractLatencies(bucket.runs);
+      const latencyStats = calculateLatencyStats(latencies);
+      // Compute aggregatedResult if strategy is available
+      let aggregatedResult: Record<string, unknown> | undefined;
+      if (strategy) {
+        const strategyResult = strategy.aggregateResult(bucket.runs) as Record<
+          string,
+          unknown
+        >;
+        // Aggregate collector data if collector registry is available
+        let collectorsAggregated: Record<string, unknown> | undefined;
+        if (this.collectorRegistry) {
+          collectorsAggregated = aggregateCollectorData(
+            bucket.runs,
+            this.collectorRegistry,
+          );
+        }
+        aggregatedResult = {
+          ...strategyResult,
+          ...(collectorsAggregated ? { collectors: collectorsAggregated } : {}),
+        };
+      }
+      result.push({
         bucketStart: bucket.bucketStart,
-        bucketSize: bucketSize as "hourly" | "daily",
-        runCount,
+        bucketEndMs: bucket.bucketStart.getTime() + bucketIntervalMs,
+        runCount: bucket.runs.length,
         healthyCount,
         degradedCount,
         unhealthyCount,
-        successRate,
-        avgLatencyMs,
-        minLatencyMs,
-        maxLatencyMs,
-        p95LatencyMs,
-      };
-      // Only include aggregatedResult if requested and strategy is available
-      if (options.includeAggregatedResult && strategy) {
-        return {
-          ...baseBucket,
-          aggregatedResult: strategy.aggregateResult(bucket.runs) as Record<
-            string,
-            unknown
-          >,
-        };
-      }
-      return baseBucket;
-    });
+        latencySumMs: latencyStats.latencySumMs,
+        minLatencyMs: latencyStats.minLatencyMs,
+        maxLatencyMs: latencyStats.maxLatencyMs,
+        p95LatencyMs: latencyStats.p95LatencyMs,
+        aggregatedResult,
+        sourceTier: "raw",
+      });
+    }
-    return { buckets };
+    return result;
   }
-  private getBucketStart(
+  /**
+   * Calculate bucket start time for dynamic interval sizing.
+   * Aligns buckets to the query start time.
+   */
+  private getBucketStartDynamic(
     timestamp: Date,
-    bucketSize: "hourly" | "daily",
+    rangeStart: Date,
+    intervalMs: number,
   ): Date {
-    const date = new Date(timestamp);
-    if (bucketSize === "daily") {
-      date.setHours(0, 0, 0, 0);
-    } else {
-      date.setMinutes(0, 0, 0);
-    }
-    return date;
-  }
-  private calculatePercentile(values: number[], percentile: number): number {
-    const sorted = values.toSorted((a, b) => a - b);
-    const index = Math.ceil((percentile / 100) * sorted.length) - 1;
-    return sorted[Math.max(0, index)];
+    const offsetMs = timestamp.getTime() - rangeStart.getTime();
+    const bucketIndex = Math.floor(offsetMs / intervalMs);
+    return new Date(rangeStart.getTime() + bucketIndex * intervalMs);
   }
   private mapConfig(
@@ -727,6 +898,7 @@ export class HealthCheckService {
       config: row.config,
       collectors: row.collectors ?? undefined,
       intervalSeconds: row.intervalSeconds,
+      paused: row.paused,
       createdAt: row.createdAt,
       updatedAt: row.updatedAt,
     };