@checkstack/healthcheck-backend 1.2.0 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. package/CHANGELOG.md +541 -0
  2. package/drizzle/0015_quiet_meggan.sql +12 -0
  3. package/drizzle/0016_complex_maginty.sql +1 -0
  4. package/drizzle/0017_pretty_caretaker.sql +1 -0
  5. package/drizzle/meta/0015_snapshot.json +764 -0
  6. package/drizzle/meta/0016_snapshot.json +644 -0
  7. package/drizzle/meta/0017_snapshot.json +563 -0
  8. package/drizzle/meta/_journal.json +21 -0
  9. package/package.json +24 -21
  10. package/src/automations.test.ts +234 -0
  11. package/src/automations.ts +342 -0
  12. package/src/collector-script-test.test.ts +236 -0
  13. package/src/collector-script-test.ts +221 -0
  14. package/src/health-entity.test.ts +698 -0
  15. package/src/health-entity.ts +369 -0
  16. package/src/health-state.test.ts +115 -0
  17. package/src/health-state.ts +333 -0
  18. package/src/healthcheck-gitops-kinds.test.ts +6 -32
  19. package/src/healthcheck-gitops-kinds.ts +4 -19
  20. package/src/hooks.test.ts +19 -6
  21. package/src/hooks.ts +38 -28
  22. package/src/index.ts +150 -98
  23. package/src/queue-executor.test.ts +137 -0
  24. package/src/queue-executor.ts +282 -380
  25. package/src/retention-job.ts +65 -1
  26. package/src/retention-state-transitions.test.ts +49 -0
  27. package/src/router.test.ts +18 -0
  28. package/src/router.ts +56 -1
  29. package/src/schema.ts +34 -54
  30. package/src/service-assignments.test.ts +184 -0
  31. package/src/service-notification-policy.test.ts +28 -71
  32. package/src/service.ts +154 -0
  33. package/src/state-transitions.test.ts +126 -0
  34. package/src/state-transitions.ts +112 -0
  35. package/tsconfig.json +12 -3
  36. package/src/auto-incident-close-job.ts +0 -164
  37. package/src/auto-incident.test.ts +0 -196
  38. package/src/auto-incident.ts +0 -332
@@ -4,9 +4,10 @@ import {
4
4
  healthCheckRuns,
5
5
  systemHealthChecks,
6
6
  healthCheckAggregates,
7
+ healthCheckStateTransitions,
7
8
  DEFAULT_RETENTION_CONFIG,
8
9
  } from "./schema";
9
- import { eq, and, lt, sql } from "drizzle-orm";
10
+ import { eq, and, lt, sql, desc } from "drizzle-orm";
10
11
  import type { QueueManager } from "@checkstack/queue-api";
11
12
 
12
13
  type Db = SafeDatabase<typeof schema>;
@@ -69,6 +70,29 @@ export async function runRetentionJob(deps: RetentionJobDeps) {
69
70
  // Get all unique system-config assignments
70
71
  const assignments = await db.select().from(systemHealthChecks);
71
72
 
73
+ // State transitions are system-level (not per-config), so prune them
74
+ // once per unique system rather than once per assignment. Use the
75
+ // longest rawRetentionDays among the system's assignments so a single
76
+ // short-retention check can't drop history another check still wants.
77
+ const rawRetentionBySystem = new Map<string, number>();
78
+ for (const assignment of assignments) {
79
+ const days = (
80
+ assignment.retentionConfig ?? DEFAULT_RETENTION_CONFIG
81
+ ).rawRetentionDays;
82
+ const existing = rawRetentionBySystem.get(assignment.systemId);
83
+ if (existing === undefined || days > existing) {
84
+ rawRetentionBySystem.set(assignment.systemId, days);
85
+ }
86
+ }
87
+
88
+ for (const [systemId, rawRetentionDays] of rawRetentionBySystem) {
89
+ try {
90
+ await pruneStateTransitions({ db, systemId, rawRetentionDays });
91
+ } catch (error) {
92
+ logger.error(`State-transition prune failed for ${systemId}`, { error });
93
+ }
94
+ }
95
+
72
96
  for (const assignment of assignments) {
73
97
  const retentionConfig =
74
98
  assignment.retentionConfig ?? DEFAULT_RETENTION_CONFIG;
@@ -134,6 +158,46 @@ async function deleteExpiredRawRuns(params: DeleteExpiredRawRunsParams) {
134
158
  );
135
159
  }
136
160
 
161
+ interface PruneStateTransitionsParams {
162
+ db: Db;
163
+ systemId: string;
164
+ rawRetentionDays: number;
165
+ }
166
+
167
+ /**
168
+ * Prune aggregate state-transition rows older than the raw-run retention
169
+ * window, but ALWAYS keep the single most-recent transition for the
170
+ * system so "in current status since" never blanks for an active streak
171
+ * (decision D3). Deletes rows that are both older than the cutoff AND
172
+ * not the newest row.
173
+ */
174
+ export async function pruneStateTransitions(
175
+ params: PruneStateTransitionsParams,
176
+ ): Promise<void> {
177
+ const { db, systemId, rawRetentionDays } = params;
178
+
179
+ const cutoffDate = new Date();
180
+ cutoffDate.setDate(cutoffDate.getDate() - rawRetentionDays);
181
+
182
+ const [newest] = await db
183
+ .select({ id: healthCheckStateTransitions.id })
184
+ .from(healthCheckStateTransitions)
185
+ .where(eq(healthCheckStateTransitions.systemId, systemId))
186
+ .orderBy(desc(healthCheckStateTransitions.transitionedAt))
187
+ .limit(1);
188
+
189
+ const conditions = [
190
+ eq(healthCheckStateTransitions.systemId, systemId),
191
+ lt(healthCheckStateTransitions.transitionedAt, cutoffDate),
192
+ ];
193
+ if (newest) {
194
+ // Never delete the most-recent row, even if it predates the cutoff.
195
+ conditions.push(sql`${healthCheckStateTransitions.id} <> ${newest.id}`);
196
+ }
197
+
198
+ await db.delete(healthCheckStateTransitions).where(and(...conditions));
199
+ }
200
+
137
201
  interface RollupParams {
138
202
  db: Db;
139
203
  systemId: string;
@@ -0,0 +1,49 @@
1
+ import { describe, it, expect, mock } from "bun:test";
2
+ import { pruneStateTransitions } from "./retention-job";
3
+
4
+ /**
5
+ * Mock db exposing the two shapes pruneStateTransitions uses:
6
+ * - select(...).from(...).where(...).orderBy(...).limit(...) -> newest row
7
+ * - delete(...).where(...) -> prune
8
+ */
9
+ function makeMockDb(newestRows: Array<{ id: string }>) {
10
+ const where = mock(() => Promise.resolve());
11
+ const db = {
12
+ select: mock(() => ({
13
+ from: mock(() => ({
14
+ where: mock(() => ({
15
+ orderBy: mock(() => ({
16
+ limit: mock(() => Promise.resolve(newestRows)),
17
+ })),
18
+ })),
19
+ })),
20
+ })),
21
+ delete: mock(() => ({ where })),
22
+ };
23
+ return { db, deleteWhere: where };
24
+ }
25
+
26
+ describe("pruneStateTransitions", () => {
27
+ it("issues a delete that preserves the newest row when one exists", async () => {
28
+ const { db, deleteWhere } = makeMockDb([{ id: "newest-id" }]);
29
+ await pruneStateTransitions({
30
+ db: db as never,
31
+ systemId: "system-1",
32
+ rawRetentionDays: 7,
33
+ });
34
+ expect(db.select).toHaveBeenCalledTimes(1);
35
+ expect(db.delete).toHaveBeenCalledTimes(1);
36
+ expect(deleteWhere).toHaveBeenCalledTimes(1);
37
+ });
38
+
39
+ it("still issues a delete (cutoff-only) when the table is empty for the system", async () => {
40
+ const { db, deleteWhere } = makeMockDb([]);
41
+ await pruneStateTransitions({
42
+ db: db as never,
43
+ systemId: "system-1",
44
+ rawRetentionDays: 7,
45
+ });
46
+ expect(db.delete).toHaveBeenCalledTimes(1);
47
+ expect(deleteWhere).toHaveBeenCalledTimes(1);
48
+ });
49
+ });
@@ -68,6 +68,21 @@ describe("HealthCheck Router", () => {
68
68
  getRedacted: mock(async () => undefined),
69
69
  };
70
70
 
71
+ const mockCatalogClient = {
72
+ getSystem: mock(async () => null),
73
+ };
74
+
75
+ const mockMaintenanceClient = {
76
+ hasActiveMaintenance: mock(async () => ({ active: false })),
77
+ };
78
+
79
+ const mockLogger = {
80
+ debug: mock(() => {}),
81
+ info: mock(() => {}),
82
+ warn: mock(() => {}),
83
+ error: mock(() => {}),
84
+ };
85
+
71
86
  const router = createHealthCheckRouter({
72
87
  database: mockDb as never,
73
88
  registry: mockRegistry,
@@ -76,6 +91,9 @@ describe("HealthCheck Router", () => {
76
91
  getEmitHook: () => undefined,
77
92
  cache: passthroughCache,
78
93
  configService: mockConfigService as never,
94
+ catalogClient: mockCatalogClient as never,
95
+ maintenanceClient: mockMaintenanceClient as never,
96
+ logger: mockLogger as never,
79
97
  });
80
98
 
81
99
  it("getStrategies returns strategies from registry", async () => {
package/src/router.ts CHANGED
@@ -11,12 +11,20 @@ import {
11
11
  } from "@checkstack/backend-api";
12
12
  import { healthCheckContract } from "@checkstack/healthcheck-common";
13
13
  import type { StrategyCategory } from "@checkstack/healthcheck-common";
14
+ import {
15
+ resolveResolutionRootFromStore,
16
+ resolveScriptPackagesDir,
17
+ } from "@checkstack/script-packages-backend";
14
18
  import { HealthCheckService } from "./service";
19
+ import { runCollectorScriptTest } from "./collector-script-test";
15
20
  import { healthCheckHooks } from "./hooks";
16
21
  import * as schema from "./schema";
17
22
  import { toJsonSchemaWithChartMeta } from "./schema-utils";
18
23
  import type { InferClient } from "@checkstack/common";
19
24
  import { GitOpsApi } from "@checkstack/gitops-common";
25
+ import { CatalogApi } from "@checkstack/catalog-common";
26
+ import { MaintenanceApi } from "@checkstack/maintenance-common";
27
+ import type { Logger } from "@checkstack/backend-api";
20
28
  import type { HealthCheckCache } from "./cache";
21
29
 
22
30
  /**
@@ -33,14 +41,28 @@ export const createHealthCheckRouter = (opts: {
33
41
  getEmitHook: () => ((hook: { id: string }, payload: Record<string, unknown>) => Promise<void>) | undefined;
34
42
  cache: HealthCheckCache;
35
43
  configService: ConfigService;
44
+ catalogClient: InferClient<typeof CatalogApi>;
45
+ maintenanceClient: InferClient<typeof MaintenanceApi>;
46
+ logger: Logger;
36
47
  }) => {
37
- const { database, registry, collectorRegistry, getEmitHook, cache, configService } = opts;
48
+ const {
49
+ database,
50
+ registry,
51
+ collectorRegistry,
52
+ getEmitHook,
53
+ cache,
54
+ configService,
55
+ catalogClient,
56
+ maintenanceClient,
57
+ logger,
58
+ } = opts;
38
59
  // Create service instance once - shared across all handlers
39
60
  const service = new HealthCheckService(
40
61
  database,
41
62
  registry,
42
63
  collectorRegistry,
43
64
  configService,
65
+ catalogClient,
44
66
  );
45
67
 
46
68
  // Create contract implementer with context type AND auto auth middleware
@@ -114,6 +136,19 @@ export const createHealthCheckRouter = (opts: {
114
136
  }));
115
137
  }),
116
138
 
139
+ testCollectorScript: os.testCollectorScript.handler(async ({ input }) => {
140
+ // Resolve the managed npm-package root from the local store so a
141
+ // collector test resolves the same allowlisted packages the real
142
+ // collector would (plan §4.1). Filesystem-only lookup; safety is the
143
+ // runner's responsibility (auto-install disabled).
144
+ const status = await resolveResolutionRootFromStore(
145
+ resolveScriptPackagesDir(),
146
+ );
147
+ const resolutionRoot =
148
+ status.mode === "ready" ? status.root : undefined;
149
+ return runCollectorScriptTest({ input, deps: { resolutionRoot } });
150
+ }),
151
+
117
152
  getConfigurations: os.getConfigurations.handler(async () => {
118
153
  return { configurations: await service.getConfigurations() };
119
154
  }),
@@ -315,6 +350,26 @@ export const createHealthCheckRouter = (opts: {
315
350
  },
316
351
  ),
317
352
 
353
+ getHealthState: os.getHealthState.handler(async ({ input }) => {
354
+ return service.getHealthState({
355
+ systemId: input.systemId,
356
+ configurationId: input.configurationId,
357
+ transitionWindowMinutes: input.transitionWindowMinutes,
358
+ maintenanceClient,
359
+ logger,
360
+ });
361
+ }),
362
+
363
+ getBulkHealthState: os.getBulkHealthState.handler(async ({ input }) => {
364
+ const states = await service.getBulkHealthState({
365
+ systemIds: input.systemIds,
366
+ transitionWindowMinutes: input.transitionWindowMinutes,
367
+ maintenanceClient,
368
+ logger,
369
+ });
370
+ return { states };
371
+ }),
372
+
318
373
  // ========================================================================
319
374
  // SERVICE INTERFACE (S2S — satellite-backend)
320
375
  // ========================================================================
package/src/schema.ts CHANGED
@@ -117,70 +117,50 @@ export const systemHealthChecks = pgTable(
117
117
  );
118
118
 
119
119
  /**
120
- * Records each time a check's *evaluated* state transitions from
121
- * non-unhealthy to unhealthy. Used to decide whether the per-check
122
- * incident threshold (N transitions in M minutes) has been met.
123
- * Pruned by the retention job alongside raw runs.
124
- */
125
- export const healthCheckUnhealthyTransitions = pgTable(
126
- "health_check_unhealthy_transitions",
127
- {
128
- id: uuid("id").primaryKey().defaultRandom(),
129
- configurationId: uuid("configuration_id")
130
- .notNull()
131
- .references(() => healthCheckConfigurations.id, { onDelete: "cascade" }),
132
- systemId: text("system_id").notNull(),
133
- transitionedAt: timestamp("transitioned_at").defaultNow().notNull(),
134
- },
135
- (t) => ({
136
- // Powers the threshold count query
137
- // (WHERE config_id = ? AND system_id = ? AND transitioned_at > ?).
138
- lookupIdx: index(
139
- "health_check_unhealthy_transitions_lookup_idx",
140
- ).on(t.configurationId, t.systemId, t.transitionedAt),
141
- }),
142
- );
143
-
144
- /**
145
- * Mapping of auto-opened incidents back to the system + check that
146
- * triggered them. `closedAt` stays null while the incident is active;
147
- * the auto-close worker sets it once the linked system has been
148
- * steadily healthy for the cooldown.
120
+ * Records every *aggregate* health-status transition for a system
121
+ * (e.g. healthy -> degraded -> unhealthy -> healthy). This table is
122
+ * unconditional and covers ALL statuses, giving a reliable
123
+ * "in current status since" timestamp for arbitrary statuses.
124
+ *
125
+ * The former `health_check_unhealthy_transitions` table (per-check,
126
+ * unhealthy-only) was dropped: flapping is now counted in the automation
127
+ * engine's `automation_window_events` log via the windowed-count gate on the
128
+ * `system_health_changed` trigger, so healthcheck no longer keeps a separate
129
+ * transition audit for flapping.
149
130
  *
150
- * No FK to the incident table that lives in another plugin's schema
151
- * and we treat it as a soft reference (incident deletes are handled
152
- * by the auto-close worker, which tolerates missing rows).
131
+ * One row is written per aggregate transition at the same point the
132
+ * `systemHealthChanged` hook fires. `configurationId` is the check that
133
+ * drove the transition (the just-ran check).
134
+ *
135
+ * Retention: pruned alongside raw runs, EXCEPT the single most-recent
136
+ * row per system is always kept so "in status since" never blanks for
137
+ * an active streak.
153
138
  */
154
- export const healthCheckAutoIncidents = pgTable(
155
- "health_check_auto_incidents",
139
+ export const healthCheckStateTransitions = pgTable(
140
+ "health_check_state_transitions",
156
141
  {
157
142
  id: uuid("id").primaryKey().defaultRandom(),
158
- incidentId: uuid("incident_id").notNull(),
159
143
  systemId: text("system_id").notNull(),
144
+ /** The check whose run drove this aggregate transition. */
160
145
  configurationId: uuid("configuration_id")
161
146
  .notNull()
162
147
  .references(() => healthCheckConfigurations.id, { onDelete: "cascade" }),
163
- openedAt: timestamp("opened_at").defaultNow().notNull(),
164
- closedAt: timestamp("closed_at"),
165
- /**
166
- * Auto-close cooldown snapshot taken when the incident was opened.
167
- * `null` means "never auto-close" — the worker leaves this
168
- * incident alone and an operator must resolve it manually. Stored
169
- * per-row so a later policy change doesn't retroactively alter
170
- * the close behaviour of incidents already in flight.
171
- */
172
- cooldownMinutes: integer("cooldown_minutes"),
148
+ fromStatus: healthCheckStatusEnum("from_status"),
149
+ toStatus: healthCheckStatusEnum("to_status").notNull(),
150
+ transitionedAt: timestamp("transitioned_at").defaultNow().notNull(),
173
151
  },
174
152
  (t) => ({
175
- // Powers "is there an active auto-incident for this system?" check.
176
- activeBySystemIdx: index(
177
- "health_check_auto_incidents_active_by_system_idx",
178
- ).on(t.systemId, t.closedAt),
179
- // Powers "find the most recent close for this assignment" lookup
180
- // used by the require-recovery-before-reopen check.
181
- lastCloseByAssignmentIdx: index(
182
- "health_check_auto_incidents_last_close_idx",
183
- ).on(t.configurationId, t.systemId, t.closedAt),
153
+ // Powers "most recent transition into status X for this system"
154
+ // (WHERE system_id = ? AND to_status = ? ORDER BY transitioned_at DESC).
155
+ lookupIdx: index("health_check_state_transitions_lookup_idx").on(
156
+ t.systemId,
157
+ t.toStatus,
158
+ t.transitionedAt,
159
+ ),
160
+ // Powers the retention "keep newest per system" sweep.
161
+ systemRecentIdx: index(
162
+ "health_check_state_transitions_system_recent_idx",
163
+ ).on(t.systemId, t.transitionedAt),
184
164
  }),
185
165
  );
186
166
 
@@ -0,0 +1,184 @@
1
+ import { describe, it, expect, mock, beforeEach } from "bun:test";
2
+ import { HealthCheckService } from "./service";
3
+
4
+ /**
5
+ * Tests for getAssignmentsForSatellite run-context population:
6
+ * - assignments carry configName (from the config row's name)
7
+ * - systemName resolves via the optional catalog client, falling back to
8
+ * systemId when no client is wired or the lookup fails.
9
+ */
10
+ describe("HealthCheckService.getAssignmentsForSatellite", () => {
11
+ const SATELLITE_ID = "sat-1";
12
+
13
+ type Association = {
14
+ systemId: string;
15
+ configurationId: string;
16
+ satelliteIds: string[] | null;
17
+ enabled: boolean;
18
+ };
19
+
20
+ type Config = {
21
+ id: string;
22
+ name: string;
23
+ strategyId: string;
24
+ config: Record<string, unknown>;
25
+ collectors: unknown[] | null;
26
+ intervalSeconds: number;
27
+ paused: boolean;
28
+ };
29
+
30
+ let associations: Association[] = [];
31
+ let configs: Config[] = [];
32
+
33
+ /**
34
+ * Mock db: the method issues two distinct select shapes:
35
+ * - associations: .select({...}).from(systemHealthChecks) -> awaited array
36
+ * - config: .select().from(...).where(...) -> awaited array
37
+ * We disambiguate by call order: the first select() resolves associations,
38
+ * subsequent select().from().where() resolve a single matching config.
39
+ */
40
+ function createMockDb() {
41
+ let firstSelect = true;
42
+ return {
43
+ select: mock(() => {
44
+ if (firstSelect) {
45
+ firstSelect = false;
46
+ return {
47
+ from: mock(() => Promise.resolve([...associations])),
48
+ };
49
+ }
50
+ return {
51
+ from: mock(() => ({
52
+ where: mock(() => {
53
+ // Always resolve to the first config (if any); each test seeds a
54
+ // single association, so one config row is all that is needed.
55
+ return Promise.resolve(configs.length > 0 ? [configs[0]] : []);
56
+ }),
57
+ })),
58
+ };
59
+ }),
60
+ };
61
+ }
62
+
63
+ beforeEach(() => {
64
+ associations = [];
65
+ configs = [];
66
+ });
67
+
68
+ it("populates configName and resolves systemName via the catalog client", async () => {
69
+ associations = [
70
+ {
71
+ systemId: "system-1",
72
+ configurationId: "config-1",
73
+ satelliteIds: [SATELLITE_ID],
74
+ enabled: true,
75
+ },
76
+ ];
77
+ configs = [
78
+ {
79
+ id: "config-1",
80
+ name: "API health",
81
+ strategyId: "http",
82
+ config: { url: "https://example.com" },
83
+ collectors: null,
84
+ intervalSeconds: 60,
85
+ paused: false,
86
+ },
87
+ ];
88
+
89
+ const getSystem = mock(() =>
90
+ Promise.resolve({ id: "system-1", name: "Production API" }),
91
+ );
92
+ const catalogClient = { getSystem } as never;
93
+
94
+ const mockDb = createMockDb();
95
+ const service = new HealthCheckService(
96
+ mockDb as never,
97
+ {} as never,
98
+ {} as never,
99
+ undefined,
100
+ catalogClient,
101
+ );
102
+
103
+ const result = await service.getAssignmentsForSatellite(SATELLITE_ID);
104
+
105
+ expect(result).toHaveLength(1);
106
+ expect(result[0].configName).toBe("API health");
107
+ expect(result[0].systemName).toBe("Production API");
108
+ expect(getSystem).toHaveBeenCalledWith({ systemId: "system-1" });
109
+ });
110
+
111
+ it("falls back to systemId when no catalog client is provided", async () => {
112
+ associations = [
113
+ {
114
+ systemId: "system-1",
115
+ configurationId: "config-1",
116
+ satelliteIds: [SATELLITE_ID],
117
+ enabled: true,
118
+ },
119
+ ];
120
+ configs = [
121
+ {
122
+ id: "config-1",
123
+ name: "API health",
124
+ strategyId: "http",
125
+ config: {},
126
+ collectors: null,
127
+ intervalSeconds: 30,
128
+ paused: false,
129
+ },
130
+ ];
131
+
132
+ const mockDb = createMockDb();
133
+ const service = new HealthCheckService(
134
+ mockDb as never,
135
+ {} as never,
136
+ {} as never,
137
+ );
138
+
139
+ const result = await service.getAssignmentsForSatellite(SATELLITE_ID);
140
+
141
+ expect(result).toHaveLength(1);
142
+ expect(result[0].configName).toBe("API health");
143
+ expect(result[0].systemName).toBe("system-1");
144
+ });
145
+
146
+ it("falls back to systemId when the catalog lookup throws", async () => {
147
+ associations = [
148
+ {
149
+ systemId: "system-1",
150
+ configurationId: "config-1",
151
+ satelliteIds: [SATELLITE_ID],
152
+ enabled: true,
153
+ },
154
+ ];
155
+ configs = [
156
+ {
157
+ id: "config-1",
158
+ name: "API health",
159
+ strategyId: "http",
160
+ config: {},
161
+ collectors: null,
162
+ intervalSeconds: 60,
163
+ paused: false,
164
+ },
165
+ ];
166
+
167
+ const getSystem = mock(() => Promise.reject(new Error("catalog down")));
168
+ const catalogClient = { getSystem } as never;
169
+
170
+ const mockDb = createMockDb();
171
+ const service = new HealthCheckService(
172
+ mockDb as never,
173
+ {} as never,
174
+ {} as never,
175
+ undefined,
176
+ catalogClient,
177
+ );
178
+
179
+ const result = await service.getAssignmentsForSatellite(SATELLITE_ID);
180
+
181
+ expect(result).toHaveLength(1);
182
+ expect(result[0].systemName).toBe("system-1");
183
+ });
184
+ });