@checkstack/healthcheck-backend 1.3.0 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,126 @@
1
+ import { describe, it, expect, mock } from "bun:test";
2
+ import {
3
+ countStateTransitionsInWindow,
4
+ findInStatusSince,
5
+ recordStateTransition,
6
+ } from "./state-transitions";
7
+
8
/**
 * Builds a stand-in for the drizzle query chain
 * `db.select(...).from(...).where(...).orderBy(...).limit(...)`.
 *
 * Every link is a `bun:test` mock that ignores its arguments and hands
 * back the next link; the final `limit()` resolves to `rows`.
 */
function selectMockDb(rows: Array<{ transitionedAt: Date }>) {
  // Assembled innermost-first so each link can close over the next one.
  const limit = mock(() => Promise.resolve(rows));
  const orderBy = mock(() => ({ limit }));
  const where = mock(() => ({ orderBy }));
  const from = mock(() => ({ where }));
  const select = mock(() => ({ from }));
  return { select };
}
25
+
26
describe("findInStatusSince", () => {
  const systemId = "system-1";

  it("returns the most-recent transitionedAt for the status", async () => {
    const enteredAt = new Date("2026-05-30T10:00:00.000Z");

    const found = await findInStatusSince({
      db: selectMockDb([{ transitionedAt: enteredAt }]) as never,
      systemId,
      status: "unhealthy",
    });

    // `toBe` checks identity: the Date from the row is passed through as-is.
    expect(found).toBe(enteredAt);
  });

  it("returns null (fail-safe) when no transition row exists", async () => {
    const found = await findInStatusSince({
      db: selectMockDb([]) as never,
      systemId,
      status: "degraded",
    });

    expect(found).toBeNull();
  });
});
48
+
49
describe("recordStateTransition", () => {
  /**
   * Mock for `db.insert(...).values(...)`. `values` is returned next to
   * the db so a test can inspect the row that was handed to it.
   */
  function insertMockDb() {
    const values = mock<(v: Record<string, unknown>) => Promise<void>>(() =>
      Promise.resolve(),
    );
    const insert = mock(() => ({ values }));
    return { db: { insert }, values };
  }

  it("inserts a row with from/to status and the provided timestamp", async () => {
    const { db, values } = insertMockDb();
    const transitionedAt = new Date("2026-05-30T12:00:00.000Z");
    const transition = {
      systemId: "system-1",
      configurationId: "config-1",
      fromStatus: "healthy",
      toStatus: "unhealthy",
    } as const;

    await recordStateTransition({
      db: db as never,
      ...transition,
      now: transitionedAt,
    });

    expect(values).toHaveBeenCalledTimes(1);
    const insertedRow = values.mock.calls[0]?.[0];
    // The caller-supplied `now` must land in the row as `transitionedAt`.
    expect(insertedRow).toEqual({ ...transition, transitionedAt });
  });

  it("stores null fromStatus on the first-ever transition", async () => {
    const { db, values } = insertMockDb();

    await recordStateTransition({
      db: db as never,
      systemId: "system-1",
      configurationId: "config-1",
      fromStatus: undefined,
      toStatus: "degraded",
    });

    const insertedRow = values.mock.calls[0]?.[0] as { fromStatus: unknown };
    expect(insertedRow.fromStatus).toBeNull();
  });
});
96
+
97
describe("countStateTransitionsInWindow", () => {
  /**
   * Mock for `db.select({count}).from(...).where(...)` whose `where()`
   * resolves to `rows` — normally a single `{ count }` row, or no row at
   * all to exercise the empty-result path.
   */
  function countMockDb(rows: Array<{ count: number }>) {
    const where = mock(() => Promise.resolve(rows));
    const from = mock(() => ({ where }));
    const select = mock(() => ({ from }));
    return { select };
  }

  it("returns the windowed count", async () => {
    const counted = await countStateTransitionsInWindow({
      db: countMockDb([{ count: 4 }]) as never,
      systemId: "system-1",
      windowMinutes: 60,
    });

    expect(counted).toBe(4);
  });

  it("returns 0 (fail-safe) when the query yields no rows", async () => {
    const counted = await countStateTransitionsInWindow({
      db: countMockDb([]) as never,
      systemId: "system-1",
      windowMinutes: 30,
    });

    expect(counted).toBe(0);
  });
});
@@ -0,0 +1,112 @@
1
+ import { and, desc, eq, gte, sql } from "drizzle-orm";
2
+ import type { HealthCheckStatus } from "@checkstack/healthcheck-common";
3
+ import type { SafeDatabase } from "@checkstack/backend-api";
4
+ import { healthCheckStateTransitions } from "./schema";
5
+ import * as schema from "./schema";
6
+
7
+ type Db = SafeDatabase<typeof schema>;
8
+
9
/**
 * Persist one aggregate health-status transition row for a system.
 *
 * Intended call site (not visible from this module — confirm against the
 * caller): wherever `systemHealthChanged` fires, i.e. once per aggregate
 * transition.
 *
 * @param fromStatus Status the system is leaving. `undefined` (no earlier
 *   status known, e.g. the first recorded transition) is stored as `null`.
 * @param toStatus Status the system is entering.
 * @param now Timestamp written to `transitionedAt`; defaults to the
 *   current time and is injectable so tests can pin it.
 */
export async function recordStateTransition({
  db,
  systemId,
  configurationId,
  fromStatus,
  toStatus,
  now = new Date(),
}: {
  db: Db;
  systemId: string;
  configurationId: string;
  fromStatus: HealthCheckStatus | undefined;
  toStatus: HealthCheckStatus;
  now?: Date;
}): Promise<void> {
  const transitionRow = {
    systemId,
    configurationId,
    // The column holds null, not undefined, when there is no prior status.
    fromStatus: fromStatus === undefined ? null : fromStatus,
    toStatus,
    transitionedAt: now,
  };

  await db.insert(healthCheckStateTransitions).values(transitionRow);
}
38
+
39
/**
 * Look up when the system most recently transitioned INTO `status`.
 *
 * The query only considers rows whose `toStatus` equals `status`; it does
 * not check what the system's status is right now. The result is the
 * start of the current streak only when the system is still in `status`
 * — callers are expected to pass the current status.
 *
 * Fail-safe: when no matching transition row exists this resolves to
 * `null` instead of throwing, so a caller can fall back to
 * `inStatusSince: null`.
 *
 * @returns The `transitionedAt` of the newest matching row, or `null`.
 */
export async function findInStatusSince({
  db,
  systemId,
  status,
}: {
  db: Db;
  systemId: string;
  status: HealthCheckStatus;
}): Promise<Date | null> {
  const transitions = healthCheckStateTransitions;
  const enteredStatusForSystem = and(
    eq(transitions.systemId, systemId),
    eq(transitions.toStatus, status),
  );

  // Newest first, one row: the latest entry into `status`.
  const newestFirst = await db
    .select({ transitionedAt: transitions.transitionedAt })
    .from(transitions)
    .where(enteredStatusForSystem)
    .orderBy(desc(transitions.transitionedAt))
    .limit(1);

  const latest = newestFirst[0];
  return latest?.transitionedAt ?? null;
}
71
+
72
/**
 * Count aggregate state transitions for a system in the trailing window
 * that starts at `now - windowMinutes`. Meant as a general
 * "N transitions in M minutes" count (e.g. for flapping detection).
 *
 * Only the lower bound is applied in the query
 * (`transitionedAt >= now - windowMinutes`); rows timestamped after `now`
 * are NOT filtered out. With the default `now` (the current time) this is
 * the window `[now - windowMinutes, now]` as long as no row carries a
 * future timestamp. A caller passing a historical `now` will also count
 * later transitions.
 *
 * When `toStatus` is given, only transitions INTO that status are counted
 * (e.g. flapping = repeated transitions into `unhealthy`); omit it to
 * count every status change in the window.
 *
 * Fail-safe for the empty case only: if the query yields no row the
 * result is 0. Errors raised by the query itself are not caught here and
 * reject the returned promise, so a caller that must never fail needs its
 * own error handling.
 *
 * @param windowMinutes Length of the trailing window, in minutes.
 * @param toStatus Optional target status to restrict the count to.
 * @param now Reference time the window is measured back from; defaults
 *   to the current time.
 * @returns Number of matching transition rows.
 */
export async function countStateTransitionsInWindow({
  db,
  systemId,
  windowMinutes,
  toStatus,
  now = new Date(),
}: {
  db: Db;
  systemId: string;
  windowMinutes: number;
  toStatus?: HealthCheckStatus;
  now?: Date;
}): Promise<number> {
  // 60_000 ms per minute.
  const windowStart = new Date(now.getTime() - windowMinutes * 60_000);
  const conditions = [
    eq(healthCheckStateTransitions.systemId, systemId),
    gte(healthCheckStateTransitions.transitionedAt, windowStart),
  ];
  if (toStatus) {
    conditions.push(eq(healthCheckStateTransitions.toStatus, toStatus));
  }

  // `::int` makes the count come back as a number rather than a bigint string.
  const [row] = await db
    .select({ count: sql<number>`COUNT(*)::int` })
    .from(healthCheckStateTransitions)
    .where(and(...conditions));

  return row?.count ?? 0;
}
package/tsconfig.json CHANGED
@@ -58,6 +58,15 @@
58
58
  {
59
59
  "path": "../satellite-backend"
60
60
  },
61
+ {
62
+ "path": "../script-packages-backend"
63
+ },
64
+ {
65
+ "path": "../secrets-backend"
66
+ },
67
+ {
68
+ "path": "../secrets-common"
69
+ },
61
70
  {
62
71
  "path": "../signal-common"
63
72
  },
@@ -1,164 +0,0 @@
1
- import { and, eq, gte, isNotNull, isNull } from "drizzle-orm";
2
- import type { Logger, SafeDatabase } from "@checkstack/backend-api";
3
- import type { InferClient } from "@checkstack/common";
4
- import { IncidentApi } from "@checkstack/incident-common";
5
- import type { QueueManager } from "@checkstack/queue-api";
6
- import * as schema from "./schema";
7
- import { healthCheckAutoIncidents, healthCheckRuns } from "./schema";
8
-
9
- type Db = SafeDatabase<typeof schema>;
10
- type IncidentClient = InferClient<typeof IncidentApi>;
11
-
12
/** Name of the queue that carries the recurring auto-close tick. */
const AUTO_CLOSE_QUEUE = "health-check-auto-incident-close";

/** Tick interval, in seconds, used when `intervalSeconds` is not supplied. */
const DEFAULT_INTERVAL_SECONDS = 60;

/** Payload of a tick job; `"scheduled"` is the only trigger this file emits. */
type AutoCloseJobPayload = {
  trigger: "scheduled";
};

/** Everything `setupAutoIncidentCloseJob` needs to wire up the worker. */
type AutoCloseJobDeps = {
  db: Db;
  logger: Logger;
  queueManager: QueueManager;
  incidentClient: IncidentClient;
  /** Seconds between worker ticks; 60 when omitted. Set lower in tests. */
  intervalSeconds?: number;
};
30
-
31
/**
 * Registers the background worker that resolves auto-opened incidents
 * once their system has stayed healthy for the incident's cooldown.
 *
 * Two things are wired onto the auto-close queue: a consumer that runs
 * one `runAutoIncidentCloseJob` sweep per job, and a recurring job that
 * fires every `intervalSeconds` (60 by default).
 *
 * The cooldown itself is not configured here: it is read per row from
 * `healthCheckAutoIncidents.cooldownMinutes`, so each incident keeps the
 * value stored on its own row. Rows with a `null` cooldown are never
 * auto-closed and have to be resolved manually.
 */
export async function setupAutoIncidentCloseJob(deps: AutoCloseJobDeps) {
  const { queueManager, logger, db, incidentClient } = deps;
  const intervalSeconds =
    deps.intervalSeconds === undefined
      ? DEFAULT_INTERVAL_SECONDS
      : deps.intervalSeconds;

  const queue = queueManager.getQueue<AutoCloseJobPayload>(AUTO_CLOSE_QUEUE);

  // The job payload is ignored: every tick performs a full sweep.
  const handleTick = async (): Promise<void> => {
    await runAutoIncidentCloseJob({ db, logger, incidentClient });
  };
  await queue.consume(handleTick, {
    consumerGroup: "auto-incident-close-worker",
  });

  const tick: AutoCloseJobPayload = { trigger: "scheduled" };
  await queue.scheduleRecurring(tick, {
    jobId: "health-check-auto-incident-close",
    intervalSeconds,
  });

  logger.info(
    `Health check auto-incident close job scheduled (interval ${intervalSeconds}s; cooldown is per-incident)`,
  );
}
70
-
71
/**
 * One sweep over the open auto-incidents: resolves each incident whose
 * system has had no "unhealthy" run for at least the row's
 * `cooldownMinutes`, and whose cooldown has also elapsed since the
 * incident was opened.
 *
 * Rows with a null cooldown are excluded by the query. Each row is
 * handled inside its own try/catch: a failure is logged as a warning and
 * the sweep moves on to the next row.
 *
 * @returns `closed` — how many incidents were resolved and marked closed
 *   during this sweep.
 */
export async function runAutoIncidentCloseJob({
  db,
  logger,
  incidentClient,
}: {
  db: Db;
  logger: Logger;
  incidentClient: IncidentClient;
}): Promise<{ closed: number }> {
  // Single reference time, so every row is judged against the same "now".
  const now = new Date();
  const autoIncidents = healthCheckAutoIncidents;

  // Still open AND opted in to auto-close (non-null cooldown).
  const candidates = await db
    .select({
      id: autoIncidents.id,
      incidentId: autoIncidents.incidentId,
      systemId: autoIncidents.systemId,
      openedAt: autoIncidents.openedAt,
      cooldownMinutes: autoIncidents.cooldownMinutes,
    })
    .from(autoIncidents)
    .where(
      and(
        isNull(autoIncidents.closedAt),
        isNotNull(autoIncidents.cooldownMinutes),
      ),
    );

  let closed = 0;

  for (const candidate of candidates) {
    const { id, incidentId, systemId, openedAt, cooldownMinutes } = candidate;

    try {
      // Already filtered in SQL; this check only narrows the type.
      if (cooldownMinutes === null) continue;

      const quietSince = new Date(now.getTime() - cooldownMinutes * 60_000);

      // The incident itself must be older than the cooldown. Otherwise a
      // system that was healthy right before the incident opened would be
      // auto-closed on the very first tick.
      if (openedAt > quietSince) {
        continue;
      }

      // A single unhealthy run inside the cooldown keeps the incident open.
      const unhealthyRuns = await db
        .select({ id: healthCheckRuns.id })
        .from(healthCheckRuns)
        .where(
          and(
            eq(healthCheckRuns.systemId, systemId),
            eq(healthCheckRuns.status, "unhealthy"),
            gte(healthCheckRuns.timestamp, quietSince),
          ),
        )
        .limit(1);

      if (unhealthyRuns.length > 0) {
        continue;
      }

      // Resolve first, then mark the row closed. If the update below
      // throws, the row stays open and is picked up again by a later sweep.
      await incidentClient.resolveAutoIncident({
        id: incidentId,
        message: `Auto-resolved: system stayed healthy for ${cooldownMinutes} minutes.`,
      });

      await db
        .update(autoIncidents)
        .set({ closedAt: new Date() })
        .where(eq(autoIncidents.id, id));

      closed += 1;
      logger.info(`Auto-closed incident ${incidentId} for system ${systemId}`);
    } catch (error) {
      logger.warn(
        `Auto-close failed for incident ${incidentId} (system ${systemId}):`,
        error,
      );
    }
  }

  return { closed };
}
@@ -1,196 +0,0 @@
1
- import { describe, it, expect } from "bun:test";
2
- import type {
3
- HealthCheckStatus,
4
- NotificationPolicy,
5
- } from "@checkstack/healthcheck-common";
6
- import {
7
- isTransitionToUnhealthy,
8
- shouldOpenForFlapping,
9
- shouldOpenForSustainedUnhealthy,
10
- } from "./auto-incident";
11
-
12
/** Every health status, used to enumerate transitions exhaustively. */
const ALL_STATES: HealthCheckStatus[] = ["healthy", "degraded", "unhealthy"];

/**
 * Test fixture: a notification policy with every auto-incident feature
 * switched on (30 min sustained trigger, 3 transitions / 60 min flapping
 * trigger, 30 min auto-close). `overrides` replaces top-level fields
 * wholesale — nested trigger objects are not merged.
 */
function policy(overrides: Partial<NotificationPolicy> = {}): NotificationPolicy {
  // Built per call so no test can leak a mutation into another.
  const defaults: NotificationPolicy = {
    suppressDeEscalations: false,
    autoOpenIncidentOnUnhealthy: true,
    useNotificationSuppression: true,
    skipDuringMaintenance: true,
    sustainedUnhealthyTrigger: { enabled: true, durationMinutes: 30 },
    flappingTrigger: { enabled: true, transitions: 3, windowMinutes: 60 },
    autoCloseAfterMinutes: 30,
  };
  return { ...defaults, ...overrides };
}
26
-
27
describe("isTransitionToUnhealthy", () => {
  it("returns true on healthy → unhealthy", () => {
    const entered = isTransitionToUnhealthy("healthy", "unhealthy");
    expect(entered).toBe(true);
  });

  it("returns true on degraded → unhealthy", () => {
    const entered = isTransitionToUnhealthy("degraded", "unhealthy");
    expect(entered).toBe(true);
  });

  it("returns true on undefined → unhealthy (first-ever evaluation)", () => {
    const entered = isTransitionToUnhealthy(undefined, "unhealthy");
    expect(entered).toBe(true);
  });

  it("returns false when staying unhealthy", () => {
    const entered = isTransitionToUnhealthy("unhealthy", "unhealthy");
    expect(entered).toBe(false);
  });

  // One generated test per non-unhealthy target, each covering every
  // possible previous state including "no previous state".
  const otherTargets = ALL_STATES.filter((state) => state !== "unhealthy");
  otherTargets.forEach((next) => {
    it(`returns false when transitioning to ${next}`, () => {
      const previousStates = [...ALL_STATES, undefined];
      previousStates.forEach((prev) => {
        expect(isTransitionToUnhealthy(prev, next)).toBe(false);
      });
    });
  });
});
53
-
54
describe("shouldOpenForFlapping", () => {
  /** Shorthand: would `p` open an incident after `recentTransitionCount` transitions? */
  const opensAt = (p: NotificationPolicy, recentTransitionCount: number) =>
    shouldOpenForFlapping({ policy: p, recentTransitionCount });

  it("never opens when auto-open is disabled at the top level", () => {
    const disabled = policy({ autoOpenIncidentOnUnhealthy: false });
    expect(opensAt(disabled, 999)).toBe(false);
  });

  it("never opens when the flapping trigger itself is disabled", () => {
    const triggerOff = policy({
      flappingTrigger: { enabled: false, transitions: 1, windowMinutes: 60 },
    });
    expect(opensAt(triggerOff, 999)).toBe(false);
  });

  it("does not open below the configured transition count", () => {
    // Fixture default threshold is 3 transitions.
    const defaults = policy();
    expect(opensAt(defaults, 1)).toBe(false);
    expect(opensAt(defaults, 2)).toBe(false);
  });

  it("opens once the count reaches the threshold", () => {
    expect(opensAt(policy(), 3)).toBe(true);
  });

  it("stays open above the threshold (no upper bound)", () => {
    expect(opensAt(policy(), 99)).toBe(true);
  });
});
95
-
96
describe("shouldOpenForSustainedUnhealthy", () => {
  /** Shorthand: would `p` open an incident after `minutes` of continuous unhealthiness? */
  const opensAfter = (p: NotificationPolicy, minutes: number) =>
    shouldOpenForSustainedUnhealthy({
      policy: p,
      unhealthyForMs: minutes * 60_000,
    });

  // Ten hours, far beyond any threshold used in this suite.
  const TEN_HOURS_IN_MINUTES = 10 * 60;

  it("never opens when auto-open is disabled at the top level", () => {
    const disabled = policy({ autoOpenIncidentOnUnhealthy: false });
    expect(opensAfter(disabled, TEN_HOURS_IN_MINUTES)).toBe(false);
  });

  it("never opens when the sustained trigger itself is disabled", () => {
    const triggerOff = policy({
      sustainedUnhealthyTrigger: { enabled: false, durationMinutes: 30 },
    });
    expect(opensAfter(triggerOff, TEN_HOURS_IN_MINUTES)).toBe(false);
  });

  it("does not open below the configured duration", () => {
    // One minute short of the fixture's 30 minute threshold.
    expect(opensAfter(policy(), 29)).toBe(false);
  });

  it("opens exactly at the threshold", () => {
    expect(opensAfter(policy(), 30)).toBe(true);
  });

  it("opens beyond the threshold", () => {
    expect(opensAfter(policy(), 60)).toBe(true);
  });

  it("respects a custom duration", () => {
    const fiveMinutePolicy = policy({
      sustainedUnhealthyTrigger: { enabled: true, durationMinutes: 5 },
    });
    expect(opensAfter(fiveMinutePolicy, 4)).toBe(false);
    expect(opensAfter(fiveMinutePolicy, 5)).toBe(true);
  });
});
165
-
166
describe("flapping vs sustained", () => {
  // The two triggers target different failure modes; each must fire on its
  // own kind of input, and either one alone is enough to open an incident.

  it("flapping fires on persistent flapping where each phase is brief", () => {
    // Three flaps within the hour, but every unhealthy phase lasted only
    // 5 minutes — too short for the sustained trigger.
    const defaults = policy();

    const flapping = shouldOpenForFlapping({
      policy: defaults,
      recentTransitionCount: 3,
    });
    const sustained = shouldOpenForSustainedUnhealthy({
      policy: defaults,
      unhealthyForMs: 5 * 60_000,
    });

    expect(flapping).toBe(true);
    expect(sustained).toBe(false);
  });

  it("sustained fires on a real outage that hasn't flapped yet", () => {
    // A single transition (the original break), unhealthy for 45 minutes
    // straight.
    const defaults = policy();

    const flapping = shouldOpenForFlapping({
      policy: defaults,
      recentTransitionCount: 1,
    });
    const sustained = shouldOpenForSustainedUnhealthy({
      policy: defaults,
      unhealthyForMs: 45 * 60_000,
    });

    expect(flapping).toBe(false);
    expect(sustained).toBe(true);
  });
});