@checkstack/healthcheck-backend 1.3.0 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +329 -0
- package/drizzle/0015_quiet_meggan.sql +12 -0
- package/drizzle/0016_complex_maginty.sql +1 -0
- package/drizzle/0017_pretty_caretaker.sql +1 -0
- package/drizzle/meta/0015_snapshot.json +764 -0
- package/drizzle/meta/0016_snapshot.json +644 -0
- package/drizzle/meta/0017_snapshot.json +563 -0
- package/drizzle/meta/_journal.json +21 -0
- package/package.json +24 -21
- package/src/automations.test.ts +6 -27
- package/src/automations.ts +32 -30
- package/src/collector-script-test.test.ts +236 -0
- package/src/collector-script-test.ts +221 -0
- package/src/health-entity.test.ts +698 -0
- package/src/health-entity.ts +369 -0
- package/src/health-state.test.ts +115 -0
- package/src/health-state.ts +333 -0
- package/src/healthcheck-gitops-kinds.test.ts +6 -32
- package/src/healthcheck-gitops-kinds.ts +4 -19
- package/src/hooks.test.ts +19 -6
- package/src/hooks.ts +13 -68
- package/src/index.ts +115 -48
- package/src/queue-executor.ts +243 -444
- package/src/retention-job.ts +65 -1
- package/src/retention-state-transitions.test.ts +49 -0
- package/src/router.test.ts +13 -0
- package/src/router.ts +44 -0
- package/src/schema.ts +34 -54
- package/src/service-notification-policy.test.ts +28 -71
- package/src/service.ts +89 -0
- package/src/state-transitions.test.ts +126 -0
- package/src/state-transitions.ts +112 -0
- package/tsconfig.json +9 -0
- package/src/auto-incident-close-job.ts +0 -164
- package/src/auto-incident.test.ts +0 -196
- package/src/auto-incident.ts +0 -332
package/src/retention-job.ts
CHANGED
|
@@ -4,9 +4,10 @@ import {
|
|
|
4
4
|
healthCheckRuns,
|
|
5
5
|
systemHealthChecks,
|
|
6
6
|
healthCheckAggregates,
|
|
7
|
+
healthCheckStateTransitions,
|
|
7
8
|
DEFAULT_RETENTION_CONFIG,
|
|
8
9
|
} from "./schema";
|
|
9
|
-
import { eq, and, lt, sql } from "drizzle-orm";
|
|
10
|
+
import { eq, and, lt, sql, desc } from "drizzle-orm";
|
|
10
11
|
import type { QueueManager } from "@checkstack/queue-api";
|
|
11
12
|
|
|
12
13
|
type Db = SafeDatabase<typeof schema>;
|
|
@@ -69,6 +70,29 @@ export async function runRetentionJob(deps: RetentionJobDeps) {
|
|
|
69
70
|
// Get all unique system-config assignments
|
|
70
71
|
const assignments = await db.select().from(systemHealthChecks);
|
|
71
72
|
|
|
73
|
+
// State transitions are system-level (not per-config), so prune them
|
|
74
|
+
// once per unique system rather than once per assignment. Use the
|
|
75
|
+
// longest rawRetentionDays among the system's assignments so a single
|
|
76
|
+
// short-retention check can't drop history another check still wants.
|
|
77
|
+
const rawRetentionBySystem = new Map<string, number>();
|
|
78
|
+
for (const assignment of assignments) {
|
|
79
|
+
const days = (
|
|
80
|
+
assignment.retentionConfig ?? DEFAULT_RETENTION_CONFIG
|
|
81
|
+
).rawRetentionDays;
|
|
82
|
+
const existing = rawRetentionBySystem.get(assignment.systemId);
|
|
83
|
+
if (existing === undefined || days > existing) {
|
|
84
|
+
rawRetentionBySystem.set(assignment.systemId, days);
|
|
85
|
+
}
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
for (const [systemId, rawRetentionDays] of rawRetentionBySystem) {
|
|
89
|
+
try {
|
|
90
|
+
await pruneStateTransitions({ db, systemId, rawRetentionDays });
|
|
91
|
+
} catch (error) {
|
|
92
|
+
logger.error(`State-transition prune failed for ${systemId}`, { error });
|
|
93
|
+
}
|
|
94
|
+
}
|
|
95
|
+
|
|
72
96
|
for (const assignment of assignments) {
|
|
73
97
|
const retentionConfig =
|
|
74
98
|
assignment.retentionConfig ?? DEFAULT_RETENTION_CONFIG;
|
|
@@ -134,6 +158,46 @@ async function deleteExpiredRawRuns(params: DeleteExpiredRawRunsParams) {
|
|
|
134
158
|
);
|
|
135
159
|
}
|
|
136
160
|
|
|
161
|
+
/** Arguments accepted by {@link pruneStateTransitions}. */
interface PruneStateTransitionsParams {
  /** Database handle used for both the newest-row lookup and the delete. */
  db: Db;
  /** System whose state-transition rows are being pruned. */
  systemId: string;
  /** Number of days subtracted from "now" to obtain the deletion cutoff. */
  rawRetentionDays: number;
}
|
|
166
|
+
|
|
167
|
+
/**
 * Removes a system's aggregate state-transition rows that have aged out of
 * the raw-run retention window.
 *
 * The row with the latest `transitionedAt` for the system is never deleted,
 * even when it is older than the cutoff, so "in current status since" stays
 * populated for an active streak (decision D3).
 *
 * The cutoff is "now" minus `rawRetentionDays`, computed with `Date#setDate`
 * (local-calendar day arithmetic).
 *
 * @param params - Database handle, target system id and retention in days.
 * @returns Resolves once the delete statement has completed.
 */
export async function pruneStateTransitions(
  params: PruneStateTransitionsParams,
): Promise<void> {
  const transitions = healthCheckStateTransitions;

  const cutoff = new Date();
  cutoff.setDate(cutoff.getDate() - params.rawRetentionDays);

  // Look up the id of the system's most recent transition, if it has any.
  const latestRows = await params.db
    .select({ id: transitions.id })
    .from(transitions)
    .where(eq(transitions.systemId, params.systemId))
    .orderBy(desc(transitions.transitionedAt))
    .limit(1);
  const latest = latestRows[0];

  // Only exclude a row when one exists; with no rows the delete is
  // cutoff-only for this system.
  const keepLatest = latest ? [sql`${transitions.id} <> ${latest.id}`] : [];

  await params.db
    .delete(transitions)
    .where(
      and(
        eq(transitions.systemId, params.systemId),
        lt(transitions.transitionedAt, cutoff),
        ...keepLatest,
      ),
    );
}
|
|
200
|
+
|
|
137
201
|
interface RollupParams {
|
|
138
202
|
db: Db;
|
|
139
203
|
systemId: string;
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
import { describe, it, expect, mock } from "bun:test";
|
|
2
|
+
import { pruneStateTransitions } from "./retention-job";
|
|
3
|
+
|
|
4
|
+
/**
 * Builds a minimal stand-in for the database covering only the two call
 * chains pruneStateTransitions performs:
 *   - select(...).from(...).where(...).orderBy(...).limit(...) resolves to
 *     `newestRows`
 *   - delete(...).where(...) resolves to nothing
 *
 * Returns the fake db together with the `where` mock of the delete chain so
 * tests can assert on it directly.
 */
function makeMockDb(newestRows: Array<{ id: string }>) {
  const deleteWhere = mock(() => Promise.resolve());

  // Each factory produces a fresh link of the fluent select chain when the
  // preceding link is invoked.
  const limitLink = () => ({ limit: mock(() => Promise.resolve(newestRows)) });
  const orderByLink = () => ({ orderBy: mock(() => limitLink()) });
  const whereLink = () => ({ where: mock(() => orderByLink()) });
  const fromLink = () => ({ from: mock(() => whereLink()) });

  const db = {
    select: mock(() => fromLink()),
    delete: mock(() => ({ where: deleteWhere })),
  };

  return { db, deleteWhere };
}
|
|
25
|
+
|
|
26
|
+
describe("pruneStateTransitions", () => {
  // Both cases prune the same system with the same retention window; only
  // the rows returned by the newest-row lookup differ.
  const target = { systemId: "system-1", rawRetentionDays: 7 };

  it("issues a delete that preserves the newest row when one exists", async () => {
    const mocks = makeMockDb([{ id: "newest-id" }]);

    await pruneStateTransitions({ db: mocks.db as never, ...target });

    expect(mocks.db.select).toHaveBeenCalledTimes(1);
    expect(mocks.db.delete).toHaveBeenCalledTimes(1);
    expect(mocks.deleteWhere).toHaveBeenCalledTimes(1);
  });

  it("still issues a delete (cutoff-only) when the table is empty for the system", async () => {
    const mocks = makeMockDb([]);

    await pruneStateTransitions({ db: mocks.db as never, ...target });

    expect(mocks.db.delete).toHaveBeenCalledTimes(1);
    expect(mocks.deleteWhere).toHaveBeenCalledTimes(1);
  });
});
|
package/src/router.test.ts
CHANGED
|
@@ -72,6 +72,17 @@ describe("HealthCheck Router", () => {
|
|
|
72
72
|
getSystem: mock(async () => null),
|
|
73
73
|
};
|
|
74
74
|
|
|
75
|
+
const mockMaintenanceClient = {
|
|
76
|
+
hasActiveMaintenance: mock(async () => ({ active: false })),
|
|
77
|
+
};
|
|
78
|
+
|
|
79
|
+
const mockLogger = {
|
|
80
|
+
debug: mock(() => {}),
|
|
81
|
+
info: mock(() => {}),
|
|
82
|
+
warn: mock(() => {}),
|
|
83
|
+
error: mock(() => {}),
|
|
84
|
+
};
|
|
85
|
+
|
|
75
86
|
const router = createHealthCheckRouter({
|
|
76
87
|
database: mockDb as never,
|
|
77
88
|
registry: mockRegistry,
|
|
@@ -81,6 +92,8 @@ describe("HealthCheck Router", () => {
|
|
|
81
92
|
cache: passthroughCache,
|
|
82
93
|
configService: mockConfigService as never,
|
|
83
94
|
catalogClient: mockCatalogClient as never,
|
|
95
|
+
maintenanceClient: mockMaintenanceClient as never,
|
|
96
|
+
logger: mockLogger as never,
|
|
84
97
|
});
|
|
85
98
|
|
|
86
99
|
it("getStrategies returns strategies from registry", async () => {
|
package/src/router.ts
CHANGED
|
@@ -11,13 +11,20 @@ import {
|
|
|
11
11
|
} from "@checkstack/backend-api";
|
|
12
12
|
import { healthCheckContract } from "@checkstack/healthcheck-common";
|
|
13
13
|
import type { StrategyCategory } from "@checkstack/healthcheck-common";
|
|
14
|
+
import {
|
|
15
|
+
resolveResolutionRootFromStore,
|
|
16
|
+
resolveScriptPackagesDir,
|
|
17
|
+
} from "@checkstack/script-packages-backend";
|
|
14
18
|
import { HealthCheckService } from "./service";
|
|
19
|
+
import { runCollectorScriptTest } from "./collector-script-test";
|
|
15
20
|
import { healthCheckHooks } from "./hooks";
|
|
16
21
|
import * as schema from "./schema";
|
|
17
22
|
import { toJsonSchemaWithChartMeta } from "./schema-utils";
|
|
18
23
|
import type { InferClient } from "@checkstack/common";
|
|
19
24
|
import { GitOpsApi } from "@checkstack/gitops-common";
|
|
20
25
|
import { CatalogApi } from "@checkstack/catalog-common";
|
|
26
|
+
import { MaintenanceApi } from "@checkstack/maintenance-common";
|
|
27
|
+
import type { Logger } from "@checkstack/backend-api";
|
|
21
28
|
import type { HealthCheckCache } from "./cache";
|
|
22
29
|
|
|
23
30
|
/**
|
|
@@ -35,6 +42,8 @@ export const createHealthCheckRouter = (opts: {
|
|
|
35
42
|
cache: HealthCheckCache;
|
|
36
43
|
configService: ConfigService;
|
|
37
44
|
catalogClient: InferClient<typeof CatalogApi>;
|
|
45
|
+
maintenanceClient: InferClient<typeof MaintenanceApi>;
|
|
46
|
+
logger: Logger;
|
|
38
47
|
}) => {
|
|
39
48
|
const {
|
|
40
49
|
database,
|
|
@@ -44,6 +53,8 @@ export const createHealthCheckRouter = (opts: {
|
|
|
44
53
|
cache,
|
|
45
54
|
configService,
|
|
46
55
|
catalogClient,
|
|
56
|
+
maintenanceClient,
|
|
57
|
+
logger,
|
|
47
58
|
} = opts;
|
|
48
59
|
// Create service instance once - shared across all handlers
|
|
49
60
|
const service = new HealthCheckService(
|
|
@@ -125,6 +136,19 @@ export const createHealthCheckRouter = (opts: {
|
|
|
125
136
|
}));
|
|
126
137
|
}),
|
|
127
138
|
|
|
139
|
+
testCollectorScript: os.testCollectorScript.handler(async ({ input }) => {
  // Look up the managed npm-package root in the local store so that a
  // collector test resolves the same allowlisted packages the real
  // collector would (plan §4.1). This step only reads the filesystem;
  // safety is left to the runner (auto-install disabled).
  const packagesDir = resolveScriptPackagesDir();
  const storeStatus = await resolveResolutionRootFromStore(packagesDir);

  // A root is only handed over when the store reports "ready"; in every
  // other mode the test runs with no resolution root.
  const deps = {
    resolutionRoot:
      storeStatus.mode === "ready" ? storeStatus.root : undefined,
  };
  return runCollectorScriptTest({ input, deps });
}),
|
|
151
|
+
|
|
128
152
|
getConfigurations: os.getConfigurations.handler(async () => {
|
|
129
153
|
return { configurations: await service.getConfigurations() };
|
|
130
154
|
}),
|
|
@@ -326,6 +350,26 @@ export const createHealthCheckRouter = (opts: {
|
|
|
326
350
|
},
|
|
327
351
|
),
|
|
328
352
|
|
|
353
|
+
// Single-system health-state read: forwards the request fields plus the
// router-level maintenance client and logger to the service.
getHealthState: os.getHealthState.handler(async ({ input }) => {
  const { systemId, configurationId, transitionWindowMinutes } = input;
  return service.getHealthState({
    systemId,
    configurationId,
    transitionWindowMinutes,
    maintenanceClient,
    logger,
  });
}),
|
|
362
|
+
|
|
363
|
+
// Multi-system health-state read: the service result is wrapped under the
// `states` key of the response.
getBulkHealthState: os.getBulkHealthState.handler(async ({ input }) => {
  const { systemIds, transitionWindowMinutes } = input;
  const states = await service.getBulkHealthState({
    systemIds,
    transitionWindowMinutes,
    maintenanceClient,
    logger,
  });
  return { states };
}),
|
|
372
|
+
|
|
329
373
|
// ========================================================================
|
|
330
374
|
// SERVICE INTERFACE (S2S — satellite-backend)
|
|
331
375
|
// ========================================================================
|
package/src/schema.ts
CHANGED
|
@@ -117,70 +117,50 @@ export const systemHealthChecks = pgTable(
|
|
|
117
117
|
);
|
|
118
118
|
|
|
119
119
|
/**
|
|
120
|
-
* Records
|
|
121
|
-
*
|
|
122
|
-
*
|
|
123
|
-
*
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
.notNull()
|
|
131
|
-
.references(() => healthCheckConfigurations.id, { onDelete: "cascade" }),
|
|
132
|
-
systemId: text("system_id").notNull(),
|
|
133
|
-
transitionedAt: timestamp("transitioned_at").defaultNow().notNull(),
|
|
134
|
-
},
|
|
135
|
-
(t) => ({
|
|
136
|
-
// Powers the threshold count query
|
|
137
|
-
// (WHERE config_id = ? AND system_id = ? AND transitioned_at > ?).
|
|
138
|
-
lookupIdx: index(
|
|
139
|
-
"health_check_unhealthy_transitions_lookup_idx",
|
|
140
|
-
).on(t.configurationId, t.systemId, t.transitionedAt),
|
|
141
|
-
}),
|
|
142
|
-
);
|
|
143
|
-
|
|
144
|
-
/**
|
|
145
|
-
* Mapping of auto-opened incidents back to the system + check that
|
|
146
|
-
* triggered them. `closedAt` stays null while the incident is active;
|
|
147
|
-
* the auto-close worker sets it once the linked system has been
|
|
148
|
-
* steadily healthy for the cooldown.
|
|
120
|
+
* Records every *aggregate* health-status transition for a system
|
|
121
|
+
* (e.g. healthy -> degraded -> unhealthy -> healthy). This table is
|
|
122
|
+
* unconditional and covers ALL statuses, giving a reliable
|
|
123
|
+
* "in current status since" timestamp for arbitrary statuses.
|
|
124
|
+
*
|
|
125
|
+
* The former `health_check_unhealthy_transitions` table (per-check,
|
|
126
|
+
* unhealthy-only) was dropped: flapping is now counted in the automation
|
|
127
|
+
* engine's `automation_window_events` log via the windowed-count gate on the
|
|
128
|
+
* `system_health_changed` trigger, so healthcheck no longer keeps a separate
|
|
129
|
+
* transition audit for flapping.
|
|
149
130
|
*
|
|
150
|
-
*
|
|
151
|
-
*
|
|
152
|
-
*
|
|
131
|
+
* One row is written per aggregate transition at the same point the
|
|
132
|
+
* `systemHealthChanged` hook fires. `configurationId` is the check that
|
|
133
|
+
* drove the transition (the just-ran check).
|
|
134
|
+
*
|
|
135
|
+
* Retention: pruned alongside raw runs, EXCEPT the single most-recent
|
|
136
|
+
* row per system is always kept so "in status since" never blanks for
|
|
137
|
+
* an active streak.
|
|
153
138
|
*/
|
|
154
|
-
export const
|
|
155
|
-
"
|
|
139
|
+
export const healthCheckStateTransitions = pgTable(
|
|
140
|
+
"health_check_state_transitions",
|
|
156
141
|
{
|
|
157
142
|
id: uuid("id").primaryKey().defaultRandom(),
|
|
158
|
-
incidentId: uuid("incident_id").notNull(),
|
|
159
143
|
systemId: text("system_id").notNull(),
|
|
144
|
+
/** The check whose run drove this aggregate transition. */
|
|
160
145
|
configurationId: uuid("configuration_id")
|
|
161
146
|
.notNull()
|
|
162
147
|
.references(() => healthCheckConfigurations.id, { onDelete: "cascade" }),
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
* Auto-close cooldown snapshot taken when the incident was opened.
|
|
167
|
-
* `null` means "never auto-close" — the worker leaves this
|
|
168
|
-
* incident alone and an operator must resolve it manually. Stored
|
|
169
|
-
* per-row so a later policy change doesn't retroactively alter
|
|
170
|
-
* the close behaviour of incidents already in flight.
|
|
171
|
-
*/
|
|
172
|
-
cooldownMinutes: integer("cooldown_minutes"),
|
|
148
|
+
fromStatus: healthCheckStatusEnum("from_status"),
|
|
149
|
+
toStatus: healthCheckStatusEnum("to_status").notNull(),
|
|
150
|
+
transitionedAt: timestamp("transitioned_at").defaultNow().notNull(),
|
|
173
151
|
},
|
|
174
152
|
(t) => ({
|
|
175
|
-
// Powers "
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
153
|
+
// Powers "most recent transition into status X for this system"
|
|
154
|
+
// (WHERE system_id = ? AND to_status = ? ORDER BY transitioned_at DESC).
|
|
155
|
+
lookupIdx: index("health_check_state_transitions_lookup_idx").on(
|
|
156
|
+
t.systemId,
|
|
157
|
+
t.toStatus,
|
|
158
|
+
t.transitionedAt,
|
|
159
|
+
),
|
|
160
|
+
// Powers the retention "keep newest per system" sweep.
|
|
161
|
+
systemRecentIdx: index(
|
|
162
|
+
"health_check_state_transitions_system_recent_idx",
|
|
163
|
+
).on(t.systemId, t.transitionedAt),
|
|
184
164
|
}),
|
|
185
165
|
);
|
|
186
166
|
|
|
@@ -51,9 +51,7 @@ describe("HealthCheckService.getAssignmentNotificationPolicy", () => {
|
|
|
51
51
|
|
|
52
52
|
it("falls back to platform defaults when association exists but notificationPolicy is null", async () => {
|
|
53
53
|
const customPlatformDefault: NotificationPolicy = {
|
|
54
|
-
|
|
55
|
-
autoCloseAfterMinutes: 120,
|
|
56
|
-
sustainedUnhealthyTrigger: { enabled: true, durationMinutes: 15 },
|
|
54
|
+
suppressDeEscalations: true,
|
|
57
55
|
};
|
|
58
56
|
const service = buildServiceWithRows(
|
|
59
57
|
[{ notificationPolicy: null }],
|
|
@@ -63,40 +61,27 @@ describe("HealthCheckService.getAssignmentNotificationPolicy", () => {
|
|
|
63
61
|
systemId: "sys-1",
|
|
64
62
|
configurationId: "cfg-1",
|
|
65
63
|
});
|
|
66
|
-
expect(policy.
|
|
67
|
-
expect(policy.sustainedUnhealthyTrigger.durationMinutes).toBe(15);
|
|
64
|
+
expect(policy.suppressDeEscalations).toBe(true);
|
|
68
65
|
});
|
|
69
66
|
|
|
70
67
|
it("falls back to platform defaults when no association exists", async () => {
|
|
71
68
|
const customPlatformDefault: NotificationPolicy = {
|
|
72
|
-
|
|
73
|
-
flappingTrigger: { enabled: true, transitions: 10, windowMinutes: 30 },
|
|
69
|
+
suppressDeEscalations: true,
|
|
74
70
|
};
|
|
75
71
|
const service = buildServiceWithRows([], customPlatformDefault);
|
|
76
72
|
const policy = await service.getAssignmentNotificationPolicy({
|
|
77
73
|
systemId: "sys-1",
|
|
78
74
|
configurationId: "cfg-1",
|
|
79
75
|
});
|
|
80
|
-
expect(policy
|
|
81
|
-
enabled: true,
|
|
82
|
-
transitions: 10,
|
|
83
|
-
windowMinutes: 30,
|
|
84
|
-
});
|
|
76
|
+
expect(policy).toEqual({ suppressDeEscalations: true });
|
|
85
77
|
});
|
|
86
78
|
|
|
87
79
|
it("prefers per-assignment override over platform defaults", async () => {
|
|
88
80
|
const platformDefault: NotificationPolicy = {
|
|
89
|
-
|
|
90
|
-
autoOpenIncidentOnUnhealthy: false,
|
|
81
|
+
suppressDeEscalations: false,
|
|
91
82
|
};
|
|
92
83
|
const assignmentOverride = {
|
|
93
|
-
suppressDeEscalations: true,
|
|
94
|
-
autoOpenIncidentOnUnhealthy: true, // overrides platform default
|
|
95
|
-
useNotificationSuppression: true,
|
|
96
|
-
skipDuringMaintenance: true,
|
|
97
|
-
sustainedUnhealthyTrigger: { enabled: true, durationMinutes: 30 },
|
|
98
|
-
flappingTrigger: { enabled: true, transitions: 3, windowMinutes: 60 },
|
|
99
|
-
autoCloseAfterMinutes: 30,
|
|
84
|
+
suppressDeEscalations: true, // overrides platform default
|
|
100
85
|
};
|
|
101
86
|
const service = buildServiceWithRows(
|
|
102
87
|
[{ notificationPolicy: assignmentOverride }],
|
|
@@ -106,69 +91,41 @@ describe("HealthCheckService.getAssignmentNotificationPolicy", () => {
|
|
|
106
91
|
systemId: "sys-1",
|
|
107
92
|
configurationId: "cfg-1",
|
|
108
93
|
});
|
|
109
|
-
expect(policy.autoOpenIncidentOnUnhealthy).toBe(true);
|
|
110
94
|
expect(policy.suppressDeEscalations).toBe(true);
|
|
111
95
|
});
|
|
112
96
|
|
|
113
|
-
it("fills in defaults for
|
|
114
|
-
|
|
115
|
-
// first migration. All other fields must default in.
|
|
116
|
-
const service = buildServiceWithRows([
|
|
117
|
-
{ notificationPolicy: { suppressDeEscalations: true } },
|
|
118
|
-
]);
|
|
97
|
+
it("fills in defaults for an empty stored policy", async () => {
|
|
98
|
+
const service = buildServiceWithRows([{ notificationPolicy: {} }]);
|
|
119
99
|
const policy = await service.getAssignmentNotificationPolicy({
|
|
120
100
|
systemId: "sys-1",
|
|
121
101
|
configurationId: "cfg-1",
|
|
122
102
|
});
|
|
123
|
-
expect(policy
|
|
124
|
-
expect(policy.autoOpenIncidentOnUnhealthy).toBe(true);
|
|
125
|
-
expect(policy.useNotificationSuppression).toBe(true);
|
|
126
|
-
expect(policy.skipDuringMaintenance).toBe(true);
|
|
127
|
-
expect(policy.sustainedUnhealthyTrigger).toEqual({
|
|
128
|
-
enabled: true,
|
|
129
|
-
durationMinutes: 30,
|
|
130
|
-
});
|
|
131
|
-
expect(policy.flappingTrigger).toEqual({
|
|
132
|
-
enabled: true,
|
|
133
|
-
transitions: 3,
|
|
134
|
-
windowMinutes: 60,
|
|
135
|
-
});
|
|
136
|
-
expect(policy.autoCloseAfterMinutes).toBe(30);
|
|
103
|
+
expect(policy).toEqual({ suppressDeEscalations: false });
|
|
137
104
|
});
|
|
138
105
|
|
|
139
|
-
it("
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
},
|
|
106
|
+
it("strips removed legacy keys (auto-incident AND flapping) from stored rows without throwing", async () => {
|
|
107
|
+
// A row persisted before the legacy auto-incident fields and the flapping
|
|
108
|
+
// thresholds were removed still carries the larger object. The schema
|
|
109
|
+
// strips the dead keys and keeps the one surviving field.
|
|
110
|
+
const legacyOversizedRow = {
|
|
111
|
+
notificationPolicy: {
|
|
112
|
+
suppressDeEscalations: true,
|
|
113
|
+
// Removed flapping thresholds — moved onto the automation trigger.
|
|
114
|
+
flappingTrigger: { enabled: true, transitions: 7, windowMinutes: 45 },
|
|
115
|
+
// Removed legacy auto-incident keys — must be dropped, not rejected.
|
|
116
|
+
autoOpenIncidentOnUnhealthy: true,
|
|
117
|
+
useNotificationSuppression: true,
|
|
118
|
+
skipDuringMaintenance: true,
|
|
119
|
+
sustainedUnhealthyTrigger: { enabled: true, durationMinutes: 15 },
|
|
120
|
+
autoCloseAfterMinutes: 120,
|
|
155
121
|
},
|
|
156
|
-
|
|
122
|
+
};
|
|
123
|
+
const service = buildServiceWithRows([legacyOversizedRow]);
|
|
157
124
|
const policy = await service.getAssignmentNotificationPolicy({
|
|
158
125
|
systemId: "sys-1",
|
|
159
126
|
configurationId: "cfg-1",
|
|
160
127
|
});
|
|
161
|
-
expect(policy
|
|
162
|
-
expect(policy
|
|
163
|
-
expect(policy.sustainedUnhealthyTrigger).toEqual({
|
|
164
|
-
enabled: false,
|
|
165
|
-
durationMinutes: 15,
|
|
166
|
-
});
|
|
167
|
-
expect(policy.flappingTrigger).toEqual({
|
|
168
|
-
enabled: true,
|
|
169
|
-
transitions: 5,
|
|
170
|
-
windowMinutes: 30,
|
|
171
|
-
});
|
|
172
|
-
expect(policy.autoCloseAfterMinutes).toBeNull();
|
|
128
|
+
expect(policy).toEqual({ suppressDeEscalations: true });
|
|
129
|
+
expect(Object.keys(policy)).toEqual(["suppressDeEscalations"]);
|
|
173
130
|
});
|
|
174
131
|
});
|
package/src/service.ts
CHANGED
|
@@ -38,7 +38,10 @@ import {
|
|
|
38
38
|
} from "drizzle-orm";
|
|
39
39
|
import { ORPCError } from "@orpc/server";
|
|
40
40
|
import { evaluateHealthStatus } from "./state-evaluator";
|
|
41
|
+
import { computeHealthState, type HealthState } from "./health-state";
|
|
41
42
|
import { stateThresholds } from "./state-thresholds-migrations";
|
|
43
|
+
import type { MaintenanceApi } from "@checkstack/maintenance-common";
|
|
44
|
+
import type { Logger } from "@checkstack/backend-api";
|
|
42
45
|
import { incrementHourlyAggregate } from "./realtime-aggregation";
|
|
43
46
|
import type {
|
|
44
47
|
HealthCheckRegistry,
|
|
@@ -62,6 +65,10 @@ type Db = SafeDatabase<typeof schema>;
|
|
|
62
65
|
// satellite assignment run-context. Optional on the service.
|
|
63
66
|
type CatalogClient = InferClient<typeof CatalogApi>;
|
|
64
67
|
|
|
68
|
+
// Maintenance client type used to fold suppression-agnostic maintenance
|
|
69
|
+
// state into the health-state snapshot. Optional on the read path.
|
|
70
|
+
type MaintenanceClient = InferClient<typeof MaintenanceApi>;
|
|
71
|
+
|
|
65
72
|
interface SystemCheckStatus {
|
|
66
73
|
configurationId: string;
|
|
67
74
|
configurationName: string;
|
|
@@ -558,6 +565,88 @@ export class HealthCheckService {
|
|
|
558
565
|
};
|
|
559
566
|
}
|
|
560
567
|
|
|
568
|
+
/**
 * Produces the live health-state snapshot for one system (Wave-2 sensing
 * contract) by delegating to `computeHealthState`.
 *
 * Status resolution:
 *   - without `configurationId`, the system's aggregate status is used;
 *   - with `configurationId`, the status of that single check is used, and
 *     an id that matches no check resolves to "healthy" (no signal), the
 *     same default as a system with no checks configured.
 *
 * Per the snapshot contract, `inStatusSince` / `inStatusForMs` are derived
 * from the state-transitions table, latency from the newest run, windowed
 * metrics from hourly aggregates, and `inMaintenance` from the maintenance
 * plugin (suppression-agnostic, fail-open). `now` is accepted so that bulk
 * reads can share a single timestamp.
 */
async getHealthState({
  systemId,
  configurationId,
  maintenanceClient,
  logger,
  transitionWindowMinutes,
  now,
}: {
  systemId: string;
  configurationId?: string;
  maintenanceClient?: MaintenanceClient;
  logger?: Logger;
  transitionWindowMinutes?: number;
  now?: Date;
}): Promise<HealthState> {
  return computeHealthState({
    db: this.db,
    systemId,
    configurationId,
    maintenanceClient,
    logger,
    transitionWindowMinutes,
    now,
    resolveStatus: async () => {
      const overview = await this.getSystemHealthStatus(systemId);
      if (configurationId) {
        const matchingCheck = overview.checkStatuses.find(
          (entry) => entry.configurationId === configurationId,
        );
        // No check with this id: report "healthy" rather than failing.
        return matchingCheck?.status ?? "healthy";
      }
      return overview.status;
    },
  });
}
|
|
612
|
+
|
|
613
|
+
/**
 * Bulk variant of {@link getHealthState}. Resolves every distinct system
 * id in parallel against a single shared `now`, so durations are
 * consistent across the batch.
 *
 * This lets dashboards and multi-system automation rules make one call
 * instead of one per system. Internally each distinct system is still
 * resolved by its own `getHealthState` call.
 *
 * @param systemIds - Systems to resolve. Repeated ids are resolved once.
 * @param maintenanceClient - Forwarded unchanged to `getHealthState`.
 * @param logger - Forwarded unchanged to `getHealthState`.
 * @param transitionWindowMinutes - Forwarded unchanged to `getHealthState`.
 * @param now - Timestamp shared by all snapshots; defaults to the time of
 *   the call.
 * @returns A record keyed by system id with that system's snapshot.
 */
async getBulkHealthState({
  systemIds,
  maintenanceClient,
  logger,
  transitionWindowMinutes,
  now = new Date(),
}: {
  systemIds: string[];
  maintenanceClient?: MaintenanceClient;
  logger?: Logger;
  transitionWindowMinutes?: number;
  now?: Date;
}): Promise<Record<string, HealthState>> {
  // A repeated id would trigger a second full snapshot computation whose
  // result only overwrites the same key below, so resolve each id once.
  const uniqueSystemIds = Array.from(new Set(systemIds));

  const entries = await Promise.all(
    uniqueSystemIds.map(
      async (systemId) =>
        [
          systemId,
          await this.getHealthState({
            systemId,
            maintenanceClient,
            logger,
            transitionWindowMinutes,
            now,
          }),
        ] as const,
    ),
  );
  return Object.fromEntries(entries);
}
|
|
649
|
+
|
|
561
650
|
/**
|
|
562
651
|
* Get comprehensive health overview for a system.
|
|
563
652
|
* Returns all health checks with their last 25 runs for sparkline visualization.
|