@checkstack/healthcheck-backend 1.2.0 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +541 -0
- package/drizzle/0015_quiet_meggan.sql +12 -0
- package/drizzle/0016_complex_maginty.sql +1 -0
- package/drizzle/0017_pretty_caretaker.sql +1 -0
- package/drizzle/meta/0015_snapshot.json +764 -0
- package/drizzle/meta/0016_snapshot.json +644 -0
- package/drizzle/meta/0017_snapshot.json +563 -0
- package/drizzle/meta/_journal.json +21 -0
- package/package.json +24 -21
- package/src/automations.test.ts +234 -0
- package/src/automations.ts +342 -0
- package/src/collector-script-test.test.ts +236 -0
- package/src/collector-script-test.ts +221 -0
- package/src/health-entity.test.ts +698 -0
- package/src/health-entity.ts +369 -0
- package/src/health-state.test.ts +115 -0
- package/src/health-state.ts +333 -0
- package/src/healthcheck-gitops-kinds.test.ts +6 -32
- package/src/healthcheck-gitops-kinds.ts +4 -19
- package/src/hooks.test.ts +19 -6
- package/src/hooks.ts +38 -28
- package/src/index.ts +150 -98
- package/src/queue-executor.test.ts +137 -0
- package/src/queue-executor.ts +282 -380
- package/src/retention-job.ts +65 -1
- package/src/retention-state-transitions.test.ts +49 -0
- package/src/router.test.ts +18 -0
- package/src/router.ts +56 -1
- package/src/schema.ts +34 -54
- package/src/service-assignments.test.ts +184 -0
- package/src/service-notification-policy.test.ts +28 -71
- package/src/service.ts +154 -0
- package/src/state-transitions.test.ts +126 -0
- package/src/state-transitions.ts +112 -0
- package/tsconfig.json +12 -3
- package/src/auto-incident-close-job.ts +0 -164
- package/src/auto-incident.test.ts +0 -196
- package/src/auto-incident.ts +0 -332
|
@@ -51,9 +51,7 @@ describe("HealthCheckService.getAssignmentNotificationPolicy", () => {
|
|
|
51
51
|
|
|
52
52
|
it("falls back to platform defaults when association exists but notificationPolicy is null", async () => {
|
|
53
53
|
const customPlatformDefault: NotificationPolicy = {
|
|
54
|
-
|
|
55
|
-
autoCloseAfterMinutes: 120,
|
|
56
|
-
sustainedUnhealthyTrigger: { enabled: true, durationMinutes: 15 },
|
|
54
|
+
suppressDeEscalations: true,
|
|
57
55
|
};
|
|
58
56
|
const service = buildServiceWithRows(
|
|
59
57
|
[{ notificationPolicy: null }],
|
|
@@ -63,40 +61,27 @@ describe("HealthCheckService.getAssignmentNotificationPolicy", () => {
|
|
|
63
61
|
systemId: "sys-1",
|
|
64
62
|
configurationId: "cfg-1",
|
|
65
63
|
});
|
|
66
|
-
expect(policy.
|
|
67
|
-
expect(policy.sustainedUnhealthyTrigger.durationMinutes).toBe(15);
|
|
64
|
+
expect(policy.suppressDeEscalations).toBe(true);
|
|
68
65
|
});
|
|
69
66
|
|
|
70
67
|
it("falls back to platform defaults when no association exists", async () => {
|
|
71
68
|
const customPlatformDefault: NotificationPolicy = {
|
|
72
|
-
|
|
73
|
-
flappingTrigger: { enabled: true, transitions: 10, windowMinutes: 30 },
|
|
69
|
+
suppressDeEscalations: true,
|
|
74
70
|
};
|
|
75
71
|
const service = buildServiceWithRows([], customPlatformDefault);
|
|
76
72
|
const policy = await service.getAssignmentNotificationPolicy({
|
|
77
73
|
systemId: "sys-1",
|
|
78
74
|
configurationId: "cfg-1",
|
|
79
75
|
});
|
|
80
|
-
expect(policy
|
|
81
|
-
enabled: true,
|
|
82
|
-
transitions: 10,
|
|
83
|
-
windowMinutes: 30,
|
|
84
|
-
});
|
|
76
|
+
expect(policy).toEqual({ suppressDeEscalations: true });
|
|
85
77
|
});
|
|
86
78
|
|
|
87
79
|
it("prefers per-assignment override over platform defaults", async () => {
|
|
88
80
|
const platformDefault: NotificationPolicy = {
|
|
89
|
-
|
|
90
|
-
autoOpenIncidentOnUnhealthy: false,
|
|
81
|
+
suppressDeEscalations: false,
|
|
91
82
|
};
|
|
92
83
|
const assignmentOverride = {
|
|
93
|
-
suppressDeEscalations: true,
|
|
94
|
-
autoOpenIncidentOnUnhealthy: true, // overrides platform default
|
|
95
|
-
useNotificationSuppression: true,
|
|
96
|
-
skipDuringMaintenance: true,
|
|
97
|
-
sustainedUnhealthyTrigger: { enabled: true, durationMinutes: 30 },
|
|
98
|
-
flappingTrigger: { enabled: true, transitions: 3, windowMinutes: 60 },
|
|
99
|
-
autoCloseAfterMinutes: 30,
|
|
84
|
+
suppressDeEscalations: true, // overrides platform default
|
|
100
85
|
};
|
|
101
86
|
const service = buildServiceWithRows(
|
|
102
87
|
[{ notificationPolicy: assignmentOverride }],
|
|
@@ -106,69 +91,41 @@ describe("HealthCheckService.getAssignmentNotificationPolicy", () => {
|
|
|
106
91
|
systemId: "sys-1",
|
|
107
92
|
configurationId: "cfg-1",
|
|
108
93
|
});
|
|
109
|
-
expect(policy.autoOpenIncidentOnUnhealthy).toBe(true);
|
|
110
94
|
expect(policy.suppressDeEscalations).toBe(true);
|
|
111
95
|
});
|
|
112
96
|
|
|
113
|
-
it("fills in defaults for
|
|
114
|
-
|
|
115
|
-
// first migration. All other fields must default in.
|
|
116
|
-
const service = buildServiceWithRows([
|
|
117
|
-
{ notificationPolicy: { suppressDeEscalations: true } },
|
|
118
|
-
]);
|
|
97
|
+
it("fills in defaults for an empty stored policy", async () => {
|
|
98
|
+
const service = buildServiceWithRows([{ notificationPolicy: {} }]);
|
|
119
99
|
const policy = await service.getAssignmentNotificationPolicy({
|
|
120
100
|
systemId: "sys-1",
|
|
121
101
|
configurationId: "cfg-1",
|
|
122
102
|
});
|
|
123
|
-
expect(policy
|
|
124
|
-
expect(policy.autoOpenIncidentOnUnhealthy).toBe(true);
|
|
125
|
-
expect(policy.useNotificationSuppression).toBe(true);
|
|
126
|
-
expect(policy.skipDuringMaintenance).toBe(true);
|
|
127
|
-
expect(policy.sustainedUnhealthyTrigger).toEqual({
|
|
128
|
-
enabled: true,
|
|
129
|
-
durationMinutes: 30,
|
|
130
|
-
});
|
|
131
|
-
expect(policy.flappingTrigger).toEqual({
|
|
132
|
-
enabled: true,
|
|
133
|
-
transitions: 3,
|
|
134
|
-
windowMinutes: 60,
|
|
135
|
-
});
|
|
136
|
-
expect(policy.autoCloseAfterMinutes).toBe(30);
|
|
103
|
+
expect(policy).toEqual({ suppressDeEscalations: false });
|
|
137
104
|
});
|
|
138
105
|
|
|
139
|
-
it("
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
},
|
|
106
|
+
it("strips removed legacy keys (auto-incident AND flapping) from stored rows without throwing", async () => {
|
|
107
|
+
// A row persisted before the legacy auto-incident fields and the flapping
|
|
108
|
+
// thresholds were removed still carries the larger object. The schema
|
|
109
|
+
// strips the dead keys and keeps the one surviving field.
|
|
110
|
+
const legacyOversizedRow = {
|
|
111
|
+
notificationPolicy: {
|
|
112
|
+
suppressDeEscalations: true,
|
|
113
|
+
// Removed flapping thresholds — moved onto the automation trigger.
|
|
114
|
+
flappingTrigger: { enabled: true, transitions: 7, windowMinutes: 45 },
|
|
115
|
+
// Removed legacy auto-incident keys — must be dropped, not rejected.
|
|
116
|
+
autoOpenIncidentOnUnhealthy: true,
|
|
117
|
+
useNotificationSuppression: true,
|
|
118
|
+
skipDuringMaintenance: true,
|
|
119
|
+
sustainedUnhealthyTrigger: { enabled: true, durationMinutes: 15 },
|
|
120
|
+
autoCloseAfterMinutes: 120,
|
|
155
121
|
},
|
|
156
|
-
|
|
122
|
+
};
|
|
123
|
+
const service = buildServiceWithRows([legacyOversizedRow]);
|
|
157
124
|
const policy = await service.getAssignmentNotificationPolicy({
|
|
158
125
|
systemId: "sys-1",
|
|
159
126
|
configurationId: "cfg-1",
|
|
160
127
|
});
|
|
161
|
-
expect(policy
|
|
162
|
-
expect(policy
|
|
163
|
-
expect(policy.sustainedUnhealthyTrigger).toEqual({
|
|
164
|
-
enabled: false,
|
|
165
|
-
durationMinutes: 15,
|
|
166
|
-
});
|
|
167
|
-
expect(policy.flappingTrigger).toEqual({
|
|
168
|
-
enabled: true,
|
|
169
|
-
transitions: 5,
|
|
170
|
-
windowMinutes: 30,
|
|
171
|
-
});
|
|
172
|
-
expect(policy.autoCloseAfterMinutes).toBeNull();
|
|
128
|
+
expect(policy).toEqual({ suppressDeEscalations: true });
|
|
129
|
+
expect(Object.keys(policy)).toEqual(["suppressDeEscalations"]);
|
|
173
130
|
});
|
|
174
131
|
});
|
package/src/service.ts
CHANGED
|
@@ -11,6 +11,8 @@ import {
|
|
|
11
11
|
DEFAULT_NOTIFICATION_POLICY,
|
|
12
12
|
} from "@checkstack/healthcheck-common";
|
|
13
13
|
import type { ConfigService } from "@checkstack/backend-api";
|
|
14
|
+
import type { InferClient } from "@checkstack/common";
|
|
15
|
+
import type { CatalogApi } from "@checkstack/catalog-common";
|
|
14
16
|
import {
|
|
15
17
|
notificationDefaultsConfigV1,
|
|
16
18
|
NOTIFICATION_DEFAULTS_CONFIG_ID,
|
|
@@ -36,7 +38,10 @@ import {
|
|
|
36
38
|
} from "drizzle-orm";
|
|
37
39
|
import { ORPCError } from "@orpc/server";
|
|
38
40
|
import { evaluateHealthStatus } from "./state-evaluator";
|
|
41
|
+
import { computeHealthState, type HealthState } from "./health-state";
|
|
39
42
|
import { stateThresholds } from "./state-thresholds-migrations";
|
|
43
|
+
import type { MaintenanceApi } from "@checkstack/maintenance-common";
|
|
44
|
+
import type { Logger } from "@checkstack/backend-api";
|
|
40
45
|
import { incrementHourlyAggregate } from "./realtime-aggregation";
|
|
41
46
|
import type {
|
|
42
47
|
HealthCheckRegistry,
|
|
@@ -56,6 +61,14 @@ import {
|
|
|
56
61
|
// Drizzle type helper - uses SafeDatabase to prevent relational query API usage
|
|
57
62
|
type Db = SafeDatabase<typeof schema>;
|
|
58
63
|
|
|
64
|
+
// Catalog client type used to resolve human-readable system names for
|
|
65
|
+
// satellite assignment run-context. Optional on the service.
|
|
66
|
+
type CatalogClient = InferClient<typeof CatalogApi>;
|
|
67
|
+
|
|
68
|
+
// Maintenance client type used to fold suppression-agnostic maintenance
|
|
69
|
+
// state into the health-state snapshot. Optional on the read path.
|
|
70
|
+
type MaintenanceClient = InferClient<typeof MaintenanceApi>;
|
|
71
|
+
|
|
59
72
|
interface SystemCheckStatus {
|
|
60
73
|
configurationId: string;
|
|
61
74
|
configurationName: string;
|
|
@@ -83,6 +96,12 @@ export class HealthCheckService {
|
|
|
83
96
|
* have to plumb it through.
|
|
84
97
|
*/
|
|
85
98
|
private configService?: ConfigService,
|
|
99
|
+
/**
|
|
100
|
+
* Optional — used to resolve human-readable system names when building
|
|
101
|
+
* satellite assignment run-context. When absent (e.g. GitOps-only /
|
|
102
|
+
* test constructions), `systemName` falls back to the `systemId`.
|
|
103
|
+
*/
|
|
104
|
+
private catalogClient?: CatalogClient,
|
|
86
105
|
) {}
|
|
87
106
|
|
|
88
107
|
/**
|
|
@@ -242,6 +261,35 @@ export class HealthCheckService {
|
|
|
242
261
|
});
|
|
243
262
|
}
|
|
244
263
|
|
|
264
|
+
/**
|
|
265
|
+
* Flip the `enabled` flag on an existing `systemHealthChecks` row
|
|
266
|
+
* without touching any of the other configuration (thresholds,
|
|
267
|
+
* satellite assignment, notification policy). Returns `true` when a
|
|
268
|
+
* row was updated, `false` when the assignment doesn't exist.
|
|
269
|
+
*
|
|
270
|
+
* Carved out so the automation actions `enable_assignment` /
|
|
271
|
+
* `disable_assignment` don't have to round-trip through
|
|
272
|
+
* `associateSystem` (which would otherwise wipe operator-managed
|
|
273
|
+
* fields when invoked with a sparse partial).
|
|
274
|
+
*/
|
|
275
|
+
async setAssignmentEnabled(
|
|
276
|
+
systemId: string,
|
|
277
|
+
configurationId: string,
|
|
278
|
+
enabled: boolean,
|
|
279
|
+
): Promise<boolean> {
|
|
280
|
+
const result = await this.db
|
|
281
|
+
.update(systemHealthChecks)
|
|
282
|
+
.set({ enabled, updatedAt: new Date() })
|
|
283
|
+
.where(
|
|
284
|
+
and(
|
|
285
|
+
eq(systemHealthChecks.systemId, systemId),
|
|
286
|
+
eq(systemHealthChecks.configurationId, configurationId),
|
|
287
|
+
),
|
|
288
|
+
)
|
|
289
|
+
.returning({ systemId: systemHealthChecks.systemId });
|
|
290
|
+
return result.length > 0;
|
|
291
|
+
}
|
|
292
|
+
|
|
245
293
|
async disassociateSystem(systemId: string, configurationId: string) {
|
|
246
294
|
await this.db
|
|
247
295
|
.delete(systemHealthChecks)
|
|
@@ -517,6 +565,88 @@ export class HealthCheckService {
|
|
|
517
565
|
};
|
|
518
566
|
}
|
|
519
567
|
|
|
568
|
+
/**
|
|
569
|
+
* Live health-state snapshot for a single system (Wave-2 sensing
|
|
570
|
+
* contract). When `configurationId` is given, status reflects that
|
|
571
|
+
* one check; otherwise it is the aggregate. `inStatusSince` /
|
|
572
|
+
* `inStatusForMs` come from the state-transitions table, latency from
|
|
573
|
+
* the newest run, windowed metrics from hourly aggregates, and
|
|
574
|
+
* `inMaintenance` from the maintenance plugin (suppression-agnostic,
|
|
575
|
+
* fail-open). `now` is threaded so bulk reads share one timestamp.
|
|
576
|
+
*/
|
|
577
|
+
async getHealthState({
|
|
578
|
+
systemId,
|
|
579
|
+
configurationId,
|
|
580
|
+
maintenanceClient,
|
|
581
|
+
logger,
|
|
582
|
+
transitionWindowMinutes,
|
|
583
|
+
now,
|
|
584
|
+
}: {
|
|
585
|
+
systemId: string;
|
|
586
|
+
configurationId?: string;
|
|
587
|
+
maintenanceClient?: MaintenanceClient;
|
|
588
|
+
logger?: Logger;
|
|
589
|
+
transitionWindowMinutes?: number;
|
|
590
|
+
now?: Date;
|
|
591
|
+
}): Promise<HealthState> {
|
|
592
|
+
return computeHealthState({
|
|
593
|
+
db: this.db,
|
|
594
|
+
systemId,
|
|
595
|
+
configurationId,
|
|
596
|
+
maintenanceClient,
|
|
597
|
+
logger,
|
|
598
|
+
transitionWindowMinutes,
|
|
599
|
+
now,
|
|
600
|
+
resolveStatus: async () => {
|
|
601
|
+
const overview = await this.getSystemHealthStatus(systemId);
|
|
602
|
+
if (!configurationId) return overview.status;
|
|
603
|
+
const check = overview.checkStatuses.find(
|
|
604
|
+
(c) => c.configurationId === configurationId,
|
|
605
|
+
);
|
|
606
|
+
// Unknown check id -> treat as healthy (no signal), mirroring
|
|
607
|
+
// the "no checks configured" default.
|
|
608
|
+
return check?.status ?? "healthy";
|
|
609
|
+
},
|
|
610
|
+
});
|
|
611
|
+
}
|
|
612
|
+
|
|
613
|
+
/**
|
|
614
|
+
* Bulk variant of {@link getHealthState}. Resolves every system in
|
|
615
|
+
* parallel against a single shared `now` so durations are consistent
|
|
616
|
+
* across the batch. Spares dashboards and multi-system automation rules
|
|
617
|
+
* one call per system; each system is still resolved by its own `getHealthState` call.
|
|
618
|
+
*/
|
|
619
|
+
async getBulkHealthState({
|
|
620
|
+
systemIds,
|
|
621
|
+
maintenanceClient,
|
|
622
|
+
logger,
|
|
623
|
+
transitionWindowMinutes,
|
|
624
|
+
now = new Date(),
|
|
625
|
+
}: {
|
|
626
|
+
systemIds: string[];
|
|
627
|
+
maintenanceClient?: MaintenanceClient;
|
|
628
|
+
logger?: Logger;
|
|
629
|
+
transitionWindowMinutes?: number;
|
|
630
|
+
now?: Date;
|
|
631
|
+
}): Promise<Record<string, HealthState>> {
|
|
632
|
+
const entries = await Promise.all(
|
|
633
|
+
systemIds.map(
|
|
634
|
+
async (systemId) =>
|
|
635
|
+
[
|
|
636
|
+
systemId,
|
|
637
|
+
await this.getHealthState({
|
|
638
|
+
systemId,
|
|
639
|
+
maintenanceClient,
|
|
640
|
+
logger,
|
|
641
|
+
transitionWindowMinutes,
|
|
642
|
+
now,
|
|
643
|
+
}),
|
|
644
|
+
] as const,
|
|
645
|
+
),
|
|
646
|
+
);
|
|
647
|
+
return Object.fromEntries(entries);
|
|
648
|
+
}
|
|
649
|
+
|
|
520
650
|
/**
|
|
521
651
|
* Get comprehensive health overview for a system.
|
|
522
652
|
* Returns all health checks with their last 25 runs for sparkline visualization.
|
|
@@ -1197,6 +1327,27 @@ export class HealthCheckService {
|
|
|
1197
1327
|
|
|
1198
1328
|
if (matchingAssociations.length === 0) return [];
|
|
1199
1329
|
|
|
1330
|
+
// Resolve human-readable system names once per distinct systemId.
|
|
1331
|
+
// Falls back to the systemId when no catalog client is wired or the
|
|
1332
|
+
// lookup fails, mirroring the queue-executor's resolution behaviour.
|
|
1333
|
+
const systemNameCache = new Map<string, string>();
|
|
1334
|
+
const resolveSystemName = async (systemId: string): Promise<string> => {
|
|
1335
|
+
const cached = systemNameCache.get(systemId);
|
|
1336
|
+
if (cached !== undefined) return cached;
|
|
1337
|
+
|
|
1338
|
+
let systemName = systemId;
|
|
1339
|
+
if (this.catalogClient) {
|
|
1340
|
+
try {
|
|
1341
|
+
const system = await this.catalogClient.getSystem({ systemId });
|
|
1342
|
+
if (system) systemName = system.name;
|
|
1343
|
+
} catch {
|
|
1344
|
+
// Fall back to systemId if catalog lookup fails.
|
|
1345
|
+
}
|
|
1346
|
+
}
|
|
1347
|
+
systemNameCache.set(systemId, systemName);
|
|
1348
|
+
return systemName;
|
|
1349
|
+
};
|
|
1350
|
+
|
|
1200
1351
|
// Get configurations for each matching association
|
|
1201
1352
|
const assignments = [];
|
|
1202
1353
|
for (const assoc of matchingAssociations) {
|
|
@@ -1214,6 +1365,9 @@ export class HealthCheckService {
|
|
|
1214
1365
|
config: config.config,
|
|
1215
1366
|
collectors: config.collectors ?? undefined,
|
|
1216
1367
|
intervalSeconds: config.intervalSeconds,
|
|
1368
|
+
// Curated run-context metadata exposed to satellite collectors.
|
|
1369
|
+
configName: config.name,
|
|
1370
|
+
systemName: await resolveSystemName(assoc.systemId),
|
|
1217
1371
|
});
|
|
1218
1372
|
}
|
|
1219
1373
|
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
import { describe, it, expect, mock } from "bun:test";
|
|
2
|
+
import {
|
|
3
|
+
countStateTransitionsInWindow,
|
|
4
|
+
findInStatusSince,
|
|
5
|
+
recordStateTransition,
|
|
6
|
+
} from "./state-transitions";
|
|
7
|
+
|
|
8
|
+
/**
|
|
9
|
+
* Minimal fluent mock for `db.select(...).from(...).where(...).orderBy(...).limit(...)`
|
|
10
|
+
* that resolves to the provided rows.
|
|
11
|
+
*/
|
|
12
|
+
function selectMockDb(rows: Array<{ transitionedAt: Date }>) {
|
|
13
|
+
return {
|
|
14
|
+
select: mock(() => ({
|
|
15
|
+
from: mock(() => ({
|
|
16
|
+
where: mock(() => ({
|
|
17
|
+
orderBy: mock(() => ({
|
|
18
|
+
limit: mock(() => Promise.resolve(rows)),
|
|
19
|
+
})),
|
|
20
|
+
})),
|
|
21
|
+
})),
|
|
22
|
+
})),
|
|
23
|
+
};
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
describe("findInStatusSince", () => {
|
|
27
|
+
it("returns the most-recent transitionedAt for the status", async () => {
|
|
28
|
+
const since = new Date("2026-05-30T10:00:00.000Z");
|
|
29
|
+
const db = selectMockDb([{ transitionedAt: since }]);
|
|
30
|
+
const result = await findInStatusSince({
|
|
31
|
+
db: db as never,
|
|
32
|
+
systemId: "system-1",
|
|
33
|
+
status: "unhealthy",
|
|
34
|
+
});
|
|
35
|
+
expect(result).toBe(since);
|
|
36
|
+
});
|
|
37
|
+
|
|
38
|
+
it("returns null (fail-safe) when no transition row exists", async () => {
|
|
39
|
+
const db = selectMockDb([]);
|
|
40
|
+
const result = await findInStatusSince({
|
|
41
|
+
db: db as never,
|
|
42
|
+
systemId: "system-1",
|
|
43
|
+
status: "degraded",
|
|
44
|
+
});
|
|
45
|
+
expect(result).toBeNull();
|
|
46
|
+
});
|
|
47
|
+
});
|
|
48
|
+
|
|
49
|
+
describe("recordStateTransition", () => {
|
|
50
|
+
it("inserts a row with from/to status and the provided timestamp", async () => {
|
|
51
|
+
const values =
|
|
52
|
+
mock<(v: Record<string, unknown>) => Promise<void>>(() =>
|
|
53
|
+
Promise.resolve(),
|
|
54
|
+
);
|
|
55
|
+
const db = { insert: mock(() => ({ values })) };
|
|
56
|
+
const now = new Date("2026-05-30T12:00:00.000Z");
|
|
57
|
+
|
|
58
|
+
await recordStateTransition({
|
|
59
|
+
db: db as never,
|
|
60
|
+
systemId: "system-1",
|
|
61
|
+
configurationId: "config-1",
|
|
62
|
+
fromStatus: "healthy",
|
|
63
|
+
toStatus: "unhealthy",
|
|
64
|
+
now,
|
|
65
|
+
});
|
|
66
|
+
|
|
67
|
+
expect(values).toHaveBeenCalledTimes(1);
|
|
68
|
+
expect(values.mock.calls[0]?.[0]).toEqual({
|
|
69
|
+
systemId: "system-1",
|
|
70
|
+
configurationId: "config-1",
|
|
71
|
+
fromStatus: "healthy",
|
|
72
|
+
toStatus: "unhealthy",
|
|
73
|
+
transitionedAt: now,
|
|
74
|
+
});
|
|
75
|
+
});
|
|
76
|
+
|
|
77
|
+
it("stores null fromStatus on the first-ever transition", async () => {
|
|
78
|
+
const values =
|
|
79
|
+
mock<(v: Record<string, unknown>) => Promise<void>>(() =>
|
|
80
|
+
Promise.resolve(),
|
|
81
|
+
);
|
|
82
|
+
const db = { insert: mock(() => ({ values })) };
|
|
83
|
+
|
|
84
|
+
await recordStateTransition({
|
|
85
|
+
db: db as never,
|
|
86
|
+
systemId: "system-1",
|
|
87
|
+
configurationId: "config-1",
|
|
88
|
+
fromStatus: undefined,
|
|
89
|
+
toStatus: "degraded",
|
|
90
|
+
});
|
|
91
|
+
|
|
92
|
+
const arg = values.mock.calls[0]?.[0] as { fromStatus: unknown };
|
|
93
|
+
expect(arg.fromStatus).toBeNull();
|
|
94
|
+
});
|
|
95
|
+
});
|
|
96
|
+
|
|
97
|
+
describe("countStateTransitionsInWindow", () => {
|
|
98
|
+
/** Mock for `db.select({count}).from(...).where(...)` resolving to [{count}]. */
|
|
99
|
+
function countMockDb(count: number) {
|
|
100
|
+
const where = mock(() => Promise.resolve([{ count }]));
|
|
101
|
+
const from = mock(() => ({ where }));
|
|
102
|
+
const select = mock(() => ({ from }));
|
|
103
|
+
return { db: { select }, where };
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
it("returns the windowed count", async () => {
|
|
107
|
+
const { db } = countMockDb(4);
|
|
108
|
+
const result = await countStateTransitionsInWindow({
|
|
109
|
+
db: db as never,
|
|
110
|
+
systemId: "system-1",
|
|
111
|
+
windowMinutes: 60,
|
|
112
|
+
});
|
|
113
|
+
expect(result).toBe(4);
|
|
114
|
+
});
|
|
115
|
+
|
|
116
|
+
it("returns 0 (fail-safe) when the query yields no rows", async () => {
|
|
117
|
+
const where = mock(() => Promise.resolve([]));
|
|
118
|
+
const db = { select: mock(() => ({ from: mock(() => ({ where })) })) };
|
|
119
|
+
const result = await countStateTransitionsInWindow({
|
|
120
|
+
db: db as never,
|
|
121
|
+
systemId: "system-1",
|
|
122
|
+
windowMinutes: 30,
|
|
123
|
+
});
|
|
124
|
+
expect(result).toBe(0);
|
|
125
|
+
});
|
|
126
|
+
});
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
import { and, desc, eq, gte, sql } from "drizzle-orm";
|
|
2
|
+
import type { HealthCheckStatus } from "@checkstack/healthcheck-common";
|
|
3
|
+
import type { SafeDatabase } from "@checkstack/backend-api";
|
|
4
|
+
import { healthCheckStateTransitions } from "./schema";
|
|
5
|
+
import * as schema from "./schema";
|
|
6
|
+
|
|
7
|
+
type Db = SafeDatabase<typeof schema>;
|
|
8
|
+
|
|
9
|
+
/**
|
|
10
|
+
* Record an aggregate health-status transition for a system. Called at
|
|
11
|
+
* the same point `systemHealthChanged` fires (one row per aggregate
|
|
12
|
+
* transition, which is rare). `fromStatus` is null on the first-ever
|
|
13
|
+
* recorded transition for a system.
|
|
14
|
+
*/
|
|
15
|
+
export async function recordStateTransition({
|
|
16
|
+
db,
|
|
17
|
+
systemId,
|
|
18
|
+
configurationId,
|
|
19
|
+
fromStatus,
|
|
20
|
+
toStatus,
|
|
21
|
+
now = new Date(),
|
|
22
|
+
}: {
|
|
23
|
+
db: Db;
|
|
24
|
+
systemId: string;
|
|
25
|
+
configurationId: string;
|
|
26
|
+
fromStatus: HealthCheckStatus | undefined;
|
|
27
|
+
toStatus: HealthCheckStatus;
|
|
28
|
+
now?: Date;
|
|
29
|
+
}): Promise<void> {
|
|
30
|
+
await db.insert(healthCheckStateTransitions).values({
|
|
31
|
+
systemId,
|
|
32
|
+
configurationId,
|
|
33
|
+
fromStatus: fromStatus ?? null,
|
|
34
|
+
toStatus,
|
|
35
|
+
transitionedAt: now,
|
|
36
|
+
});
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
/**
|
|
40
|
+
* Find the timestamp at which the system most recently entered the
|
|
41
|
+
* given status (the start of its current streak, if it is still in that status).
|
|
42
|
+
*
|
|
43
|
+
* Fail-safe: when no transition row exists (e.g. the table was pruned
|
|
44
|
+
* before this system ever transitioned, or it has never changed status)
|
|
45
|
+
* this returns `null` rather than throwing, so callers degrade to
|
|
46
|
+
* `inStatusSince: null` instead of failing the whole evaluation.
|
|
47
|
+
*/
|
|
48
|
+
export async function findInStatusSince({
|
|
49
|
+
db,
|
|
50
|
+
systemId,
|
|
51
|
+
status,
|
|
52
|
+
}: {
|
|
53
|
+
db: Db;
|
|
54
|
+
systemId: string;
|
|
55
|
+
status: HealthCheckStatus;
|
|
56
|
+
}): Promise<Date | null> {
|
|
57
|
+
const [row] = await db
|
|
58
|
+
.select({ transitionedAt: healthCheckStateTransitions.transitionedAt })
|
|
59
|
+
.from(healthCheckStateTransitions)
|
|
60
|
+
.where(
|
|
61
|
+
and(
|
|
62
|
+
eq(healthCheckStateTransitions.systemId, systemId),
|
|
63
|
+
eq(healthCheckStateTransitions.toStatus, status),
|
|
64
|
+
),
|
|
65
|
+
)
|
|
66
|
+
.orderBy(desc(healthCheckStateTransitions.transitionedAt))
|
|
67
|
+
.limit(1);
|
|
68
|
+
|
|
69
|
+
return row?.transitionedAt ?? null;
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
/**
|
|
73
|
+
* Count aggregate state transitions for a system within the trailing
|
|
74
|
+
* window starting at `now - windowMinutes` (only the lower bound is filtered). Generalizes the flapping detector's
|
|
75
|
+
* "N transitions in M minutes" count beyond the unhealthy-only table.
|
|
76
|
+
*
|
|
77
|
+
* When `toStatus` is given, counts only transitions INTO that status
|
|
78
|
+
* (e.g. flapping = repeated transitions into `unhealthy`); omit it to
|
|
79
|
+
* count all status changes in the window.
|
|
80
|
+
*
|
|
81
|
+
* Fail-safe: returns 0 when the query yields no row. Query errors are not
|
|
82
|
+
* caught here and still propagate to the caller.
|
|
83
|
+
*/
|
|
84
|
+
export async function countStateTransitionsInWindow({
|
|
85
|
+
db,
|
|
86
|
+
systemId,
|
|
87
|
+
windowMinutes,
|
|
88
|
+
toStatus,
|
|
89
|
+
now = new Date(),
|
|
90
|
+
}: {
|
|
91
|
+
db: Db;
|
|
92
|
+
systemId: string;
|
|
93
|
+
windowMinutes: number;
|
|
94
|
+
toStatus?: HealthCheckStatus;
|
|
95
|
+
now?: Date;
|
|
96
|
+
}): Promise<number> {
|
|
97
|
+
const windowStart = new Date(now.getTime() - windowMinutes * 60_000);
|
|
98
|
+
const conditions = [
|
|
99
|
+
eq(healthCheckStateTransitions.systemId, systemId),
|
|
100
|
+
gte(healthCheckStateTransitions.transitionedAt, windowStart),
|
|
101
|
+
];
|
|
102
|
+
if (toStatus) {
|
|
103
|
+
conditions.push(eq(healthCheckStateTransitions.toStatus, toStatus));
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
const [row] = await db
|
|
107
|
+
.select({ count: sql<number>`COUNT(*)::int` })
|
|
108
|
+
.from(healthCheckStateTransitions)
|
|
109
|
+
.where(and(...conditions));
|
|
110
|
+
|
|
111
|
+
return row?.count ?? 0;
|
|
112
|
+
}
|
package/tsconfig.json
CHANGED
|
@@ -4,6 +4,9 @@
|
|
|
4
4
|
"src"
|
|
5
5
|
],
|
|
6
6
|
"references": [
|
|
7
|
+
{
|
|
8
|
+
"path": "../automation-backend"
|
|
9
|
+
},
|
|
7
10
|
{
|
|
8
11
|
"path": "../backend-api"
|
|
9
12
|
},
|
|
@@ -43,9 +46,6 @@
|
|
|
43
46
|
{
|
|
44
47
|
"path": "../incident-common"
|
|
45
48
|
},
|
|
46
|
-
{
|
|
47
|
-
"path": "../integration-backend"
|
|
48
|
-
},
|
|
49
49
|
{
|
|
50
50
|
"path": "../maintenance-common"
|
|
51
51
|
},
|
|
@@ -58,6 +58,15 @@
|
|
|
58
58
|
{
|
|
59
59
|
"path": "../satellite-backend"
|
|
60
60
|
},
|
|
61
|
+
{
|
|
62
|
+
"path": "../script-packages-backend"
|
|
63
|
+
},
|
|
64
|
+
{
|
|
65
|
+
"path": "../secrets-backend"
|
|
66
|
+
},
|
|
67
|
+
{
|
|
68
|
+
"path": "../secrets-common"
|
|
69
|
+
},
|
|
61
70
|
{
|
|
62
71
|
"path": "../signal-common"
|
|
63
72
|
},
|