@checkstack/healthcheck-backend 1.2.0 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +541 -0
- package/drizzle/0015_quiet_meggan.sql +12 -0
- package/drizzle/0016_complex_maginty.sql +1 -0
- package/drizzle/0017_pretty_caretaker.sql +1 -0
- package/drizzle/meta/0015_snapshot.json +764 -0
- package/drizzle/meta/0016_snapshot.json +644 -0
- package/drizzle/meta/0017_snapshot.json +563 -0
- package/drizzle/meta/_journal.json +21 -0
- package/package.json +24 -21
- package/src/automations.test.ts +234 -0
- package/src/automations.ts +342 -0
- package/src/collector-script-test.test.ts +236 -0
- package/src/collector-script-test.ts +221 -0
- package/src/health-entity.test.ts +698 -0
- package/src/health-entity.ts +369 -0
- package/src/health-state.test.ts +115 -0
- package/src/health-state.ts +333 -0
- package/src/healthcheck-gitops-kinds.test.ts +6 -32
- package/src/healthcheck-gitops-kinds.ts +4 -19
- package/src/hooks.test.ts +19 -6
- package/src/hooks.ts +38 -28
- package/src/index.ts +150 -98
- package/src/queue-executor.test.ts +137 -0
- package/src/queue-executor.ts +282 -380
- package/src/retention-job.ts +65 -1
- package/src/retention-state-transitions.test.ts +49 -0
- package/src/router.test.ts +18 -0
- package/src/router.ts +56 -1
- package/src/schema.ts +34 -54
- package/src/service-assignments.test.ts +184 -0
- package/src/service-notification-policy.test.ts +28 -71
- package/src/service.ts +154 -0
- package/src/state-transitions.test.ts +126 -0
- package/src/state-transitions.ts +112 -0
- package/tsconfig.json +12 -3
- package/src/auto-incident-close-job.ts +0 -164
- package/src/auto-incident.test.ts +0 -196
- package/src/auto-incident.ts +0 -332
|
@@ -1,164 +0,0 @@
|
|
|
1
|
-
import { and, eq, gte, isNotNull, isNull } from "drizzle-orm";
|
|
2
|
-
import type { Logger, SafeDatabase } from "@checkstack/backend-api";
|
|
3
|
-
import type { InferClient } from "@checkstack/common";
|
|
4
|
-
import { IncidentApi } from "@checkstack/incident-common";
|
|
5
|
-
import type { QueueManager } from "@checkstack/queue-api";
|
|
6
|
-
import * as schema from "./schema";
|
|
7
|
-
import { healthCheckAutoIncidents, healthCheckRuns } from "./schema";
|
|
8
|
-
|
|
9
|
-
/** Database handle typed against this plugin's schema module. */
type Db = SafeDatabase<typeof schema>;

/** Client type inferred from the `IncidentApi` contract. */
type IncidentClient = InferClient<typeof IncidentApi>;

/** Name of the queue that carries the recurring auto-close tick. */
const AUTO_CLOSE_QUEUE = "health-check-auto-incident-close";

/** Tick interval (seconds) used when the caller does not supply one. */
const DEFAULT_INTERVAL_SECONDS = 60;

/** Payload enqueued for every tick; the only trigger is the schedule. */
interface AutoCloseJobPayload {
  trigger: "scheduled";
}

/** Everything `setupAutoIncidentCloseJob` needs to wire up the worker. */
interface AutoCloseJobDeps {
  db: Db;
  logger: Logger;
  queueManager: QueueManager;
  incidentClient: IncidentClient;
  /**
   * How often the worker ticks. Default 60s. Set lower in tests.
   */
  intervalSeconds?: number;
}
|
|
30
|
-
|
|
31
|
-
/**
 * Wire up the background worker that resolves auto-opened incidents.
 *
 * Registers a consumer on the auto-close queue that runs
 * `runAutoIncidentCloseJob` for every message, then schedules a
 * recurring "scheduled" message so the sweep fires every
 * `intervalSeconds` (falling back to `DEFAULT_INTERVAL_SECONDS` when
 * the dependency is left undefined).
 *
 * The cooldown itself is not configured here: the sweep reads it from
 * each `healthCheckAutoIncidents` row, and rows whose cooldown is
 * `null` are never auto-closed by the sweep.
 *
 * @param deps Database, logger, queue manager, incident client and an
 *   optional tick interval in seconds.
 */
export async function setupAutoIncidentCloseJob(deps: AutoCloseJobDeps) {
  const { db, logger, queueManager, incidentClient } = deps;
  const intervalSeconds =
    deps.intervalSeconds === undefined
      ? DEFAULT_INTERVAL_SECONDS
      : deps.intervalSeconds;

  const autoCloseQueue =
    queueManager.getQueue<AutoCloseJobPayload>(AUTO_CLOSE_QUEUE);

  // The payload carries no information the sweep needs, so the handler
  // ignores it and simply runs one full sweep per message.
  const handleTick = async () => {
    await runAutoIncidentCloseJob({ db, logger, incidentClient });
  };

  await autoCloseQueue.consume(handleTick, {
    consumerGroup: "auto-incident-close-worker",
  });

  await autoCloseQueue.scheduleRecurring(
    { trigger: "scheduled" },
    { jobId: "health-check-auto-incident-close", intervalSeconds },
  );

  logger.info(
    `Health check auto-incident close job scheduled (interval ${intervalSeconds}s; cooldown is per-incident)`,
  );
}
|
|
70
|
-
|
|
71
|
-
/**
 * Run one auto-close sweep.
 *
 * Loads every auto-incident mapping that is still open (`closedAt` is
 * null) and has a non-null `cooldownMinutes`, then for each one:
 *
 * 1. skips it while less than the cooldown has passed since it opened;
 * 2. skips it if the system has any `unhealthy` run at or after the
 *    start of the cooldown window;
 * 3. otherwise resolves the incident through the incident client and
 *    stamps `closedAt` on the mapping row.
 *
 * Rows are handled one at a time and each is wrapped in its own
 * try/catch, so a failure is logged as a warning and the sweep moves
 * on to the next row.
 *
 * @returns The number of incidents closed by this sweep.
 */
export async function runAutoIncidentCloseJob({
  db,
  logger,
  incidentClient,
}: {
  db: Db;
  logger: Logger;
  incidentClient: IncidentClient;
}): Promise<{ closed: number }> {
  // One reference time for the whole sweep so every row is judged
  // against the same "now".
  const sweepStartedAt = new Date();

  // Rows with a null cooldown opted out of auto-close, so they are
  // filtered out in the query itself.
  const candidates = await db
    .select({
      id: healthCheckAutoIncidents.id,
      incidentId: healthCheckAutoIncidents.incidentId,
      systemId: healthCheckAutoIncidents.systemId,
      openedAt: healthCheckAutoIncidents.openedAt,
      cooldownMinutes: healthCheckAutoIncidents.cooldownMinutes,
    })
    .from(healthCheckAutoIncidents)
    .where(
      and(
        isNull(healthCheckAutoIncidents.closedAt),
        isNotNull(healthCheckAutoIncidents.cooldownMinutes),
      ),
    );

  let closed = 0;

  for (const candidate of candidates) {
    try {
      const { cooldownMinutes } = candidate;
      // Already excluded by the query; kept so the type narrows to number.
      if (cooldownMinutes === null) continue;

      const windowStart = new Date(
        sweepStartedAt.getTime() - cooldownMinutes * 60_000,
      );

      // The incident itself must be at least one cooldown old. Otherwise
      // a system that was healthy right up until the incident opened
      // would qualify for closing on the very first tick.
      if (candidate.openedAt > windowStart) {
        continue;
      }

      // A single unhealthy run inside the window is enough to keep the
      // incident open, so one row is all we need to fetch.
      const [unhealthyHit] = await db
        .select({ id: healthCheckRuns.id })
        .from(healthCheckRuns)
        .where(
          and(
            eq(healthCheckRuns.systemId, candidate.systemId),
            eq(healthCheckRuns.status, "unhealthy"),
            gte(healthCheckRuns.timestamp, windowStart),
          ),
        )
        .limit(1);

      if (unhealthyHit) {
        continue;
      }

      // No unhealthy run in the window: resolve remotely first, then
      // record the close locally.
      await incidentClient.resolveAutoIncident({
        id: candidate.incidentId,
        message: `Auto-resolved: system stayed healthy for ${cooldownMinutes} minutes.`,
      });

      await db
        .update(healthCheckAutoIncidents)
        .set({ closedAt: new Date() })
        .where(eq(healthCheckAutoIncidents.id, candidate.id));

      closed += 1;
      logger.info(
        `Auto-closed incident ${candidate.incidentId} for system ${candidate.systemId}`,
      );
    } catch (error) {
      logger.warn(
        `Auto-close failed for incident ${candidate.incidentId} (system ${candidate.systemId}):`,
        error,
      );
    }
  }

  return { closed };
}
|
|
@@ -1,196 +0,0 @@
|
|
|
1
|
-
import { describe, it, expect } from "bun:test";
|
|
2
|
-
import type {
|
|
3
|
-
HealthCheckStatus,
|
|
4
|
-
NotificationPolicy,
|
|
5
|
-
} from "@checkstack/healthcheck-common";
|
|
6
|
-
import {
|
|
7
|
-
isTransitionToUnhealthy,
|
|
8
|
-
shouldOpenForFlapping,
|
|
9
|
-
shouldOpenForSustainedUnhealthy,
|
|
10
|
-
} from "./auto-incident";
|
|
11
|
-
|
|
12
|
-
/** Every status value the tests iterate over. */
const ALL_STATES: HealthCheckStatus[] = ["healthy", "degraded", "unhealthy"];

/**
 * Build a notification policy for a test: a fixed set of defaults with
 * both triggers enabled, overlaid with whatever the test overrides.
 */
function policy(overrides: Partial<NotificationPolicy> = {}): NotificationPolicy {
  const defaults: NotificationPolicy = {
    suppressDeEscalations: false,
    autoOpenIncidentOnUnhealthy: true,
    useNotificationSuppression: true,
    skipDuringMaintenance: true,
    sustainedUnhealthyTrigger: { enabled: true, durationMinutes: 30 },
    flappingTrigger: { enabled: true, transitions: 3, windowMinutes: 60 },
    autoCloseAfterMinutes: 30,
  };
  return { ...defaults, ...overrides };
}
|
|
26
|
-
|
|
27
|
-
describe("isTransitionToUnhealthy", () => {
  // Cases whose next state is `unhealthy`: only the previous state varies.
  const intoUnhealthy: Array<{
    title: string;
    previous: HealthCheckStatus | undefined;
    expected: boolean;
  }> = [
    {
      title: "returns true on healthy → unhealthy",
      previous: "healthy",
      expected: true,
    },
    {
      title: "returns true on degraded → unhealthy",
      previous: "degraded",
      expected: true,
    },
    {
      title: "returns true on undefined → unhealthy (first-ever evaluation)",
      previous: undefined,
      expected: true,
    },
    {
      title: "returns false when staying unhealthy",
      previous: "unhealthy",
      expected: false,
    },
  ];

  for (const { title, previous, expected } of intoUnhealthy) {
    it(title, () => {
      expect(isTransitionToUnhealthy(previous, "unhealthy")).toBe(expected);
    });
  }

  // Any other target state can never count as a transition to unhealthy,
  // whatever the previous state was (including "no previous state").
  const otherTargets = ALL_STATES.filter((state) => state !== "unhealthy");
  for (const target of otherTargets) {
    it(`returns false when transitioning to ${target}`, () => {
      const everyPrevious = [...ALL_STATES, undefined];
      for (const previous of everyPrevious) {
        expect(isTransitionToUnhealthy(previous, target)).toBe(false);
      }
    });
  }
});
|
|
53
|
-
|
|
54
|
-
describe("shouldOpenForFlapping", () => {
  // Shorthand: evaluate the flapping trigger for a policy and a count.
  const flapping = (p: NotificationPolicy, recentTransitionCount: number) =>
    shouldOpenForFlapping({ policy: p, recentTransitionCount });

  it("never opens when auto-open is disabled at the top level", () => {
    const autoOpenOff = policy({ autoOpenIncidentOnUnhealthy: false });
    expect(flapping(autoOpenOff, 999)).toBe(false);
  });

  it("never opens when the flapping trigger itself is disabled", () => {
    const triggerOff = policy({
      flappingTrigger: { enabled: false, transitions: 1, windowMinutes: 60 },
    });
    expect(flapping(triggerOff, 999)).toBe(false);
  });

  it("does not open below the configured transition count", () => {
    const defaults = policy(); // default transitions: 3
    for (const count of [1, 2]) {
      expect(flapping(defaults, count)).toBe(false);
    }
  });

  it("opens once the count reaches the threshold", () => {
    expect(flapping(policy(), 3)).toBe(true);
  });

  it("stays open above the threshold (no upper bound)", () => {
    expect(flapping(policy(), 99)).toBe(true);
  });
});
|
|
95
|
-
|
|
96
|
-
describe("shouldOpenForSustainedUnhealthy", () => {
  const MINUTE_MS = 60_000;

  // Shorthand: evaluate the sustained trigger after `minutes` of
  // continuous unhealthiness.
  const sustainedAfter = (p: NotificationPolicy, minutes: number) =>
    shouldOpenForSustainedUnhealthy({
      policy: p,
      unhealthyForMs: minutes * MINUTE_MS,
    });

  it("never opens when auto-open is disabled at the top level", () => {
    const autoOpenOff = policy({ autoOpenIncidentOnUnhealthy: false });
    // Ten hours unhealthy, far beyond any threshold.
    expect(sustainedAfter(autoOpenOff, 10 * 60)).toBe(false);
  });

  it("never opens when the sustained trigger itself is disabled", () => {
    const triggerOff = policy({
      sustainedUnhealthyTrigger: { enabled: false, durationMinutes: 30 },
    });
    expect(sustainedAfter(triggerOff, 10 * 60)).toBe(false);
  });

  it("does not open below the configured duration", () => {
    // 29 minutes < 30 minute threshold
    expect(sustainedAfter(policy(), 29)).toBe(false);
  });

  it("opens exactly at the threshold", () => {
    expect(sustainedAfter(policy(), 30)).toBe(true);
  });

  it("opens beyond the threshold", () => {
    expect(sustainedAfter(policy(), 60)).toBe(true);
  });

  it("respects a custom duration", () => {
    const fiveMinutePolicy = policy({
      sustainedUnhealthyTrigger: { enabled: true, durationMinutes: 5 },
    });
    expect(sustainedAfter(fiveMinutePolicy, 4)).toBe(false);
    expect(sustainedAfter(fiveMinutePolicy, 5)).toBe(true);
  });
});
|
|
165
|
-
|
|
166
|
-
describe("flapping vs sustained", () => {
  // Each trigger targets its own failure mode and should fire only on
  // its own kind of input; either one firing is enough to open.
  const evaluate = (recentTransitionCount: number, unhealthyMinutes: number) => ({
    flapping: shouldOpenForFlapping({
      policy: policy(),
      recentTransitionCount,
    }),
    sustained: shouldOpenForSustainedUnhealthy({
      policy: policy(),
      unhealthyForMs: unhealthyMinutes * 60_000,
    }),
  });

  it("flapping fires on persistent flapping where each phase is brief", () => {
    // Three flaps within the hour, but each unhealthy phase lasted only
    // 5 minutes, so the sustained trigger never qualifies.
    const outcome = evaluate(3, 5);
    expect(outcome.flapping).toBe(true);
    expect(outcome.sustained).toBe(false);
  });

  it("sustained fires on a real outage that hasn't flapped yet", () => {
    // A single transition (the original break) followed by 45 minutes
    // of uninterrupted unhealthiness.
    const outcome = evaluate(1, 45);
    expect(outcome.flapping).toBe(false);
    expect(outcome.sustained).toBe(true);
  });
});
|
package/src/auto-incident.ts
DELETED
|
@@ -1,332 +0,0 @@
|
|
|
1
|
-
import { and, desc, eq, gte, isNotNull, isNull, sql } from "drizzle-orm";
|
|
2
|
-
import type {
|
|
3
|
-
HealthCheckStatus,
|
|
4
|
-
NotificationPolicy,
|
|
5
|
-
} from "@checkstack/healthcheck-common";
|
|
6
|
-
import type { Logger, SafeDatabase } from "@checkstack/backend-api";
|
|
7
|
-
import type { InferClient } from "@checkstack/common";
|
|
8
|
-
import { IncidentApi } from "@checkstack/incident-common";
|
|
9
|
-
import { MaintenanceApi } from "@checkstack/maintenance-common";
|
|
10
|
-
import {
|
|
11
|
-
healthCheckAutoIncidents,
|
|
12
|
-
healthCheckRuns,
|
|
13
|
-
healthCheckUnhealthyTransitions,
|
|
14
|
-
} from "./schema";
|
|
15
|
-
import * as schema from "./schema";
|
|
16
|
-
|
|
17
|
-
/** Database handle typed against this plugin's schema module. */
type Db = SafeDatabase<typeof schema>;

/** Client type inferred from the `IncidentApi` contract. */
type IncidentClient = InferClient<typeof IncidentApi>;

/** Client type inferred from the `MaintenanceApi` contract. */
type MaintenanceClient = InferClient<typeof MaintenanceApi>;
|
|
20
|
-
|
|
21
|
-
/**
 * Tell whether an evaluation moved a check *into* `unhealthy`.
 *
 * @param previous State from the prior evaluation, or `undefined` when
 *   there was none.
 * @param next State from the current evaluation.
 * @returns `true` only when `next` is `unhealthy` and `previous` was
 *   anything else (including `undefined`).
 */
export function isTransitionToUnhealthy(
  previous: HealthCheckStatus | undefined,
  next: HealthCheckStatus,
): boolean {
  if (next !== "unhealthy") {
    return false;
  }
  // Staying unhealthy is not a transition.
  return previous !== "unhealthy";
}
|
|
31
|
-
|
|
32
|
-
/**
 * Append a transition-to-unhealthy row to the audit table, then count
 * the rows for the same check and system that fall inside the window.
 *
 * The window starts `windowMinutes` before `now`. When `since` is given
 * and is later than that start, `since` becomes the lower bound
 * instead, so transitions older than it (for example from before a
 * previous auto-incident closed) are left out. The bound is inclusive:
 * rows stamped exactly at the bound are counted. The row inserted by
 * this call is stamped `now`, so it is part of the count.
 *
 * @param windowMinutes Length of the counting window in minutes.
 * @param since Optional extra lower bound for the count.
 * @param now Timestamp used for the new row and the window; defaults to
 *   the current time.
 * @returns Number of matching transitions, or 0 if the count query
 *   returns no row.
 */
export async function recordUnhealthyTransition({
  db,
  configurationId,
  systemId,
  windowMinutes,
  since,
  now = new Date(),
}: {
  db: Db;
  configurationId: string;
  systemId: string;
  windowMinutes: number;
  since?: Date;
  now?: Date;
}): Promise<number> {
  await db
    .insert(healthCheckUnhealthyTransitions)
    .values({ configurationId, systemId, transitionedAt: now });

  // Use whichever bound is more recent: the rolling window start or
  // the caller-supplied `since`.
  const windowStart = new Date(now.getTime() - windowMinutes * 60_000);
  let lowerBound = windowStart;
  if (since && since > windowStart) {
    lowerBound = since;
  }

  const sameCheckInWindow = and(
    eq(healthCheckUnhealthyTransitions.configurationId, configurationId),
    eq(healthCheckUnhealthyTransitions.systemId, systemId),
    gte(healthCheckUnhealthyTransitions.transitionedAt, lowerBound),
  );

  const [aggregate] = await db
    .select({ count: sql<number>`COUNT(*)::int` })
    .from(healthCheckUnhealthyTransitions)
    .where(sameCheckInWindow);

  return aggregate?.count ?? 0;
}
|
|
78
|
-
|
|
79
|
-
/**
 * Decide whether the flapping trigger calls for an auto-incident.
 *
 * @param policy Notification policy; both `autoOpenIncidentOnUnhealthy`
 *   and `flappingTrigger.enabled` must be on for the trigger to fire.
 * @param recentTransitionCount Transitions to unhealthy counted inside
 *   the flapping window.
 * @returns `true` when the trigger is active and the count has reached
 *   `flappingTrigger.transitions`; `false` otherwise.
 */
export function shouldOpenForFlapping({
  policy,
  recentTransitionCount,
}: {
  policy: NotificationPolicy;
  recentTransitionCount: number;
}): boolean {
  const { autoOpenIncidentOnUnhealthy, flappingTrigger } = policy;

  // The top-level switch is checked first; the trigger's own flag is
  // only consulted when auto-open is on.
  if (!(autoOpenIncidentOnUnhealthy && flappingTrigger.enabled)) {
    return false;
  }

  return recentTransitionCount >= flappingTrigger.transitions;
}
|
|
95
|
-
|
|
96
|
-
/**
 * Decide whether the sustained-duration trigger calls for an
 * auto-incident.
 *
 * @param policy Notification policy; both `autoOpenIncidentOnUnhealthy`
 *   and `sustainedUnhealthyTrigger.enabled` must be on for the trigger
 *   to fire.
 * @param unhealthyForMs How long, in milliseconds, the check has been
 *   continuously unhealthy.
 * @returns `true` when the trigger is active and the elapsed time has
 *   reached `sustainedUnhealthyTrigger.durationMinutes`; `false`
 *   otherwise.
 */
export function shouldOpenForSustainedUnhealthy({
  policy,
  unhealthyForMs,
}: {
  policy: NotificationPolicy;
  /** How long the check has been continuously unhealthy. */
  unhealthyForMs: number;
}): boolean {
  const { autoOpenIncidentOnUnhealthy, sustainedUnhealthyTrigger } = policy;

  // The top-level switch is checked first; the trigger's own flag is
  // only consulted when auto-open is on.
  if (!(autoOpenIncidentOnUnhealthy && sustainedUnhealthyTrigger.enabled)) {
    return false;
  }

  // The policy stores minutes; the caller measures milliseconds.
  const requiredMs = sustainedUnhealthyTrigger.durationMinutes * 60_000;
  return unhealthyForMs >= requiredMs;
}
|
|
114
|
-
|
|
115
|
-
/**
 * Look up the latest recorded transition to `unhealthy` for a check on
 * a system, optionally restricted to transitions at or after `since`.
 * The sustained-trigger evaluation uses the result to work out how
 * long the check has been unhealthy.
 *
 * @param since Optional inclusive lower bound on the transition time.
 * @returns The newest matching transition timestamp, or `undefined`
 *   when no row matches.
 */
export async function findUnhealthySince({
  db,
  configurationId,
  systemId,
  since,
}: {
  db: Db;
  configurationId: string;
  systemId: string;
  since?: Date;
}): Promise<Date | undefined> {
  const sameCheck = [
    eq(healthCheckUnhealthyTransitions.configurationId, configurationId),
    eq(healthCheckUnhealthyTransitions.systemId, systemId),
  ];
  // The time filter is only added when a lower bound was supplied.
  const filters = since
    ? [...sameCheck, gte(healthCheckUnhealthyTransitions.transitionedAt, since)]
    : sameCheck;

  // Newest first, so the single row fetched is the latest transition.
  const latest = await db
    .select({
      transitionedAt: healthCheckUnhealthyTransitions.transitionedAt,
    })
    .from(healthCheckUnhealthyTransitions)
    .where(and(...filters))
    .orderBy(desc(healthCheckUnhealthyTransitions.transitionedAt))
    .limit(1);

  return latest[0]?.transitionedAt;
}
|
|
150
|
-
|
|
151
|
-
/**
 * Fetch an auto-incident mapping for the system that is still open
 * (its `closedAt` is null), if there is one. Callers use it to avoid
 * opening a second auto-incident while one is already active.
 *
 * @returns The mapping row id and the linked incident id, or
 *   `undefined` when the system has no open auto-incident.
 */
export async function findActiveAutoIncident({
  db,
  systemId,
}: {
  db: Db;
  systemId: string;
}): Promise<{ id: string; incidentId: string } | undefined> {
  const stillOpenForSystem = and(
    eq(healthCheckAutoIncidents.systemId, systemId),
    isNull(healthCheckAutoIncidents.closedAt),
  );

  const [active] = await db
    .select({
      id: healthCheckAutoIncidents.id,
      incidentId: healthCheckAutoIncidents.incidentId,
    })
    .from(healthCheckAutoIncidents)
    .where(stillOpenForSystem)
    .limit(1);

  return active;
}
|
|
178
|
-
|
|
179
|
-
/**
 * Find when an auto-incident for this system and check was last
 * closed. Used to gate re-opening behind a "must recover first" rule.
 *
 * @returns The most recent `closedAt` among closed auto-incidents for
 *   the pair, or `undefined` when none has been closed yet.
 */
export async function findLastAutoIncidentClose({
  db,
  systemId,
  configurationId,
}: {
  db: Db;
  systemId: string;
  configurationId: string;
}): Promise<Date | undefined> {
  const closedForAssignment = and(
    eq(healthCheckAutoIncidents.systemId, systemId),
    eq(healthCheckAutoIncidents.configurationId, configurationId),
    isNotNull(healthCheckAutoIncidents.closedAt),
  );

  // Newest close first; only that one row is needed.
  const rows = await db
    .select({ closedAt: healthCheckAutoIncidents.closedAt })
    .from(healthCheckAutoIncidents)
    .where(closedForAssignment)
    .orderBy(desc(healthCheckAutoIncidents.closedAt))
    .limit(1);

  const latest = rows[0];
  // The column is nullable in the type, so null is folded to undefined.
  if (latest == null || latest.closedAt == null) {
    return undefined;
  }
  return latest.closedAt;
}
|
|
208
|
-
|
|
209
|
-
/**
 * Check whether this check has recorded at least one `healthy` run on
 * the system at or after `since`. It confirms that the system really
 * recovered after the last auto-incident closed, before a new
 * auto-incident is allowed to open.
 *
 * @param since Inclusive lower bound on the run timestamp.
 * @returns `true` if at least one matching healthy run exists.
 */
export async function hasHealthyRunSince({
  db,
  systemId,
  configurationId,
  since,
}: {
  db: Db;
  systemId: string;
  configurationId: string;
  since: Date;
}): Promise<boolean> {
  const healthySince = and(
    eq(healthCheckRuns.systemId, systemId),
    eq(healthCheckRuns.configurationId, configurationId),
    eq(healthCheckRuns.status, "healthy"),
    gte(healthCheckRuns.timestamp, since),
  );

  // Existence check only: one row is enough to answer.
  const matches = await db
    .select({ id: healthCheckRuns.id })
    .from(healthCheckRuns)
    .where(healthySince)
    .limit(1);

  return matches.length > 0;
}
|
|
241
|
-
|
|
242
|
-
/**
 * Ask the maintenance client whether the system currently has an
 * active maintenance window with suppression.
 *
 * If the lookup throws, the error is logged as a warning and the
 * function answers `false` ("not suppressed"), so an outage of the
 * maintenance service cannot block legitimate incidents.
 *
 * @returns The `suppressed` flag reported by the maintenance client, or
 *   `false` when the call fails.
 */
export async function isMaintenanceSuppressed({
  maintenanceClient,
  systemId,
  logger,
}: {
  maintenanceClient: MaintenanceClient;
  systemId: string;
  logger: Logger;
}): Promise<boolean> {
  try {
    const verdict =
      await maintenanceClient.hasActiveMaintenanceWithSuppression({ systemId });
    return verdict.suppressed;
  } catch (cause) {
    logger.warn(
      `Failed to check maintenance for ${systemId} during auto-incident decision; assuming not suppressed:`,
      cause,
    );
    return false;
  }
}
|
|
268
|
-
|
|
269
|
-
/**
 * Open an auto-incident through the incident client and persist the
 * mapping row so the auto-close worker can find and resolve it later.
 *
 * When the system already has an open auto-incident, nothing is
 * created and the existing incident id is returned.
 *
 * The whole operation is best-effort: any failure (the lookup for an
 * existing incident, the incident creation, or the mapping insert) is
 * logged as a warning and reported as `undefined`, so it never blocks
 * the rest of the health-check flow. If the incident was created but
 * the mapping could not be stored, the warning names the incident id,
 * because that incident is no longer tracked for auto-close.
 *
 * @param policy Supplies `useNotificationSuppression` for the incident
 *   and `autoCloseAfterMinutes`, which is stored on the mapping row as
 *   its `cooldownMinutes`.
 * @param reason Short human-readable phrase for the incident text.
 * @returns The id of the new or already-open incident, or `undefined`
 *   when opening failed.
 */
export async function openAutoIncident({
  db,
  incidentClient,
  logger,
  systemId,
  systemName,
  configurationId,
  configurationName,
  policy,
  reason,
}: {
  db: Db;
  incidentClient: IncidentClient;
  logger: Logger;
  systemId: string;
  systemName: string;
  configurationId: string;
  configurationName: string;
  policy: NotificationPolicy;
  /** Short human-readable phrase for the incident description. */
  reason: string;
}): Promise<{ incidentId: string } | undefined> {
  // Remembered so the catch block can tell "nothing was created" apart
  // from "created, but the mapping insert failed".
  let createdIncidentId: string | undefined;

  try {
    // Inside the try so a failing lookup is handled as best-effort too.
    const existing = await findActiveAutoIncident({ db, systemId });
    if (existing) {
      return { incidentId: existing.incidentId };
    }

    const { id: incidentId } = await incidentClient.createAutoIncident({
      title: `${systemName} is critical`,
      description: `Auto-opened by health check **${configurationName}** (${reason}).`,
      severity: "critical",
      suppressNotifications: policy.useNotificationSuppression,
      systemIds: [systemId],
      initialMessage: `Health check \`${configurationName}\` triggered the auto-incident: ${reason}.`,
    });
    createdIncidentId = incidentId;

    // Snapshot the cooldown on the row at open time.
    await db.insert(healthCheckAutoIncidents).values({
      incidentId,
      systemId,
      configurationId,
      cooldownMinutes: policy.autoCloseAfterMinutes,
    });

    logger.info(
      `Auto-opened incident ${incidentId} for system ${systemId} (check ${configurationId}; ${reason})`,
    );
    return { incidentId };
  } catch (error) {
    const untrackedNote =
      createdIncidentId === undefined
        ? ""
        : ` after creating incident ${createdIncidentId} (mapping not persisted; it will not be auto-closed)`;
    logger.warn(
      `Failed to open auto-incident for system ${systemId} (check ${configurationId})${untrackedNote}:`,
      error,
    );
    return undefined;
  }
}
|