@checkstack/healthcheck-backend 1.1.4 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +256 -0
- package/drizzle/0012_fair_boomer.sql +1 -0
- package/drizzle/0013_clean_fabian_cortez.sql +20 -0
- package/drizzle/0014_chilly_ultragirl.sql +2 -0
- package/drizzle/meta/0012_snapshot.json +447 -0
- package/drizzle/meta/0013_snapshot.json +615 -0
- package/drizzle/meta/0014_snapshot.json +648 -0
- package/drizzle/meta/_journal.json +21 -0
- package/package.json +21 -20
- package/src/auto-incident-close-job.ts +164 -0
- package/src/auto-incident.test.ts +196 -0
- package/src/auto-incident.ts +332 -0
- package/src/automations.test.ts +255 -0
- package/src/automations.ts +340 -0
- package/src/healthcheck-gitops-kinds.test.ts +93 -0
- package/src/healthcheck-gitops-kinds.ts +34 -0
- package/src/hooks.ts +69 -4
- package/src/index.ts +80 -52
- package/src/notification-defaults-config.ts +10 -0
- package/src/notification-policy.test.ts +104 -0
- package/src/notification-policy.ts +56 -0
- package/src/queue-executor.test.ts +137 -0
- package/src/queue-executor.ts +434 -42
- package/src/router.test.ts +12 -0
- package/src/router.ts +30 -2
- package/src/schema.ts +76 -0
- package/src/service-assignments.test.ts +184 -0
- package/src/service-notification-policy.test.ts +174 -0
- package/src/service.ts +195 -1
- package/tsconfig.json +5 -2
package/src/service.ts
CHANGED
|
@@ -6,7 +6,18 @@ import {
|
|
|
6
6
|
HealthCheckStatus,
|
|
7
7
|
RetentionConfig,
|
|
8
8
|
type HealthCheckRunResult,
|
|
9
|
+
type NotificationPolicy,
|
|
10
|
+
NotificationPolicySchema,
|
|
11
|
+
DEFAULT_NOTIFICATION_POLICY,
|
|
9
12
|
} from "@checkstack/healthcheck-common";
|
|
13
|
+
import type { ConfigService } from "@checkstack/backend-api";
|
|
14
|
+
import type { InferClient } from "@checkstack/common";
|
|
15
|
+
import type { CatalogApi } from "@checkstack/catalog-common";
|
|
16
|
+
import {
|
|
17
|
+
notificationDefaultsConfigV1,
|
|
18
|
+
NOTIFICATION_DEFAULTS_CONFIG_ID,
|
|
19
|
+
NOTIFICATION_DEFAULTS_CONFIG_VERSION,
|
|
20
|
+
} from "./notification-defaults-config";
|
|
10
21
|
import {
|
|
11
22
|
healthCheckConfigurations,
|
|
12
23
|
systemHealthChecks,
|
|
@@ -15,7 +26,16 @@ import {
|
|
|
15
26
|
VersionedStateThresholds,
|
|
16
27
|
} from "./schema";
|
|
17
28
|
import * as schema from "./schema";
|
|
18
|
-
import {
|
|
29
|
+
import {
|
|
30
|
+
eq,
|
|
31
|
+
and,
|
|
32
|
+
InferSelectModel,
|
|
33
|
+
desc,
|
|
34
|
+
gte,
|
|
35
|
+
lte,
|
|
36
|
+
isNull,
|
|
37
|
+
inArray,
|
|
38
|
+
} from "drizzle-orm";
|
|
19
39
|
import { ORPCError } from "@orpc/server";
|
|
20
40
|
import { evaluateHealthStatus } from "./state-evaluator";
|
|
21
41
|
import { stateThresholds } from "./state-thresholds-migrations";
|
|
@@ -38,6 +58,10 @@ import {
|
|
|
38
58
|
// Drizzle type helper - uses SafeDatabase to prevent relational query API usage
|
|
39
59
|
type Db = SafeDatabase<typeof schema>;
|
|
40
60
|
|
|
61
|
+
// Catalog client type used to resolve human-readable system names for
|
|
62
|
+
// satellite assignment run-context. Optional on the service.
|
|
63
|
+
type CatalogClient = InferClient<typeof CatalogApi>;
|
|
64
|
+
|
|
41
65
|
interface SystemCheckStatus {
|
|
42
66
|
configurationId: string;
|
|
43
67
|
configurationName: string;
|
|
@@ -57,8 +81,62 @@ export class HealthCheckService {
|
|
|
57
81
|
private db: Db,
|
|
58
82
|
private registry: HealthCheckRegistry,
|
|
59
83
|
private collectorRegistry: CollectorRegistry,
|
|
84
|
+
/**
|
|
85
|
+
* Optional — only required by code paths that resolve platform
|
|
86
|
+
* defaults (notification policy fallback). When absent, callers
|
|
87
|
+
* fall back to the compile-time `DEFAULT_NOTIFICATION_POLICY`.
|
|
88
|
+
* Kept optional so existing GitOps-only / test constructions don't
|
|
89
|
+
* have to plumb it through.
|
|
90
|
+
*/
|
|
91
|
+
private configService?: ConfigService,
|
|
92
|
+
/**
|
|
93
|
+
* Optional — used to resolve human-readable system names when building
|
|
94
|
+
* satellite assignment run-context. When absent (e.g. GitOps-only /
|
|
95
|
+
* test constructions), `systemName` falls back to the `systemId`.
|
|
96
|
+
*/
|
|
97
|
+
private catalogClient?: CatalogClient,
|
|
60
98
|
) {}
|
|
61
99
|
|
|
100
|
+
/**
 * Look up the platform-wide notification policy defaults.
 *
 * When this service was constructed without a `configService`, the
 * compile-time `DEFAULT_NOTIFICATION_POLICY` is returned immediately.
 * Otherwise the value stored under `NOTIFICATION_DEFAULTS_CONFIG_ID` is
 * fetched; a nullish result also falls back to the compile-time default.
 *
 * NOTE(review): the V1 schema and its version are handed to
 * `ConfigService.get`, presumably so stored values are validated and
 * missing fields are defaulted in — confirm against the ConfigService
 * contract, which is not visible from this file.
 *
 * @returns The persisted platform policy, or `DEFAULT_NOTIFICATION_POLICY`.
 */
async getPlatformNotificationDefaults(): Promise<NotificationPolicy> {
  const configService = this.configService;
  if (!configService) return DEFAULT_NOTIFICATION_POLICY;

  const persisted = await configService.get(
    NOTIFICATION_DEFAULTS_CONFIG_ID,
    notificationDefaultsConfigV1,
    NOTIFICATION_DEFAULTS_CONFIG_VERSION,
  );

  // Nothing persisted yet → compile-time defaults.
  return persisted ?? DEFAULT_NOTIFICATION_POLICY;
}
|
|
117
|
+
|
|
118
|
+
/**
 * Store a new platform-wide notification policy through `ConfigService`.
 *
 * Assignments whose `notificationPolicy` column is `null` resolve their
 * policy from these defaults (see `getAssignmentNotificationPolicy`), so
 * they pick up the new values the next time their policy is resolved.
 *
 * NOTE(review): the original comment states that in-flight auto-incidents
 * are unaffected because their cooldown is snapshotted per-row at open
 * time; that logic lives outside this file — confirm before relying on it.
 *
 * @param policy - The full policy to persist as the platform default.
 * @throws Error when this service was constructed without a `configService`.
 */
async setPlatformNotificationDefaults(
  policy: NotificationPolicy,
): Promise<void> {
  const configService = this.configService;
  if (configService) {
    await configService.set(
      NOTIFICATION_DEFAULTS_CONFIG_ID,
      notificationDefaultsConfigV1,
      NOTIFICATION_DEFAULTS_CONFIG_VERSION,
      policy,
    );
    return;
  }

  // There is no store to write to; fail loudly rather than drop the update.
  throw new Error(
    "ConfigService not configured; cannot persist platform notification defaults",
  );
}
|
|
139
|
+
|
|
62
140
|
async createConfiguration(
|
|
63
141
|
data: CreateHealthCheckConfiguration,
|
|
64
142
|
): Promise<HealthCheckConfiguration> {
|
|
@@ -133,6 +211,7 @@ export class HealthCheckService {
|
|
|
133
211
|
stateThresholds?: StateThresholds;
|
|
134
212
|
satelliteIds?: string[];
|
|
135
213
|
includeLocal?: boolean;
|
|
214
|
+
notificationPolicy?: NotificationPolicy;
|
|
136
215
|
}) {
|
|
137
216
|
const {
|
|
138
217
|
systemId,
|
|
@@ -141,6 +220,7 @@ export class HealthCheckService {
|
|
|
141
220
|
stateThresholds: stateThresholds_,
|
|
142
221
|
satelliteIds,
|
|
143
222
|
includeLocal = true,
|
|
223
|
+
notificationPolicy,
|
|
144
224
|
} = props;
|
|
145
225
|
|
|
146
226
|
// Wrap thresholds in versioned config if provided
|
|
@@ -156,6 +236,7 @@ export class HealthCheckService {
|
|
|
156
236
|
stateThresholds: versionedThresholds,
|
|
157
237
|
satelliteIds: satelliteIds ?? undefined,
|
|
158
238
|
includeLocal,
|
|
239
|
+
notificationPolicy: notificationPolicy ?? undefined,
|
|
159
240
|
})
|
|
160
241
|
.onConflictDoUpdate({
|
|
161
242
|
target: [
|
|
@@ -167,11 +248,41 @@ export class HealthCheckService {
|
|
|
167
248
|
stateThresholds: versionedThresholds,
|
|
168
249
|
satelliteIds: satelliteIds ?? undefined,
|
|
169
250
|
includeLocal,
|
|
251
|
+
notificationPolicy: notificationPolicy ?? undefined,
|
|
170
252
|
updatedAt: new Date(),
|
|
171
253
|
},
|
|
172
254
|
});
|
|
173
255
|
}
|
|
174
256
|
|
|
257
|
+
/**
 * Toggle only the `enabled` column (plus `updatedAt`) of the
 * `systemHealthChecks` row identified by `(systemId, configurationId)`.
 *
 * Every other column of the assignment is left untouched. This gives the
 * `enable_assignment` / `disable_assignment` automation actions a narrow
 * write path, so they do not have to go through `associateSystem`, whose
 * upsert rewrites the remaining assignment fields as well.
 *
 * @param systemId - System half of the assignment key.
 * @param configurationId - Health check configuration half of the key.
 * @param enabled - New value for the `enabled` flag.
 * @returns `true` if a matching row was updated, `false` if no such
 *   assignment exists.
 */
async setAssignmentEnabled(
  systemId: string,
  configurationId: string,
  enabled: boolean,
): Promise<boolean> {
  const matchesAssignment = and(
    eq(systemHealthChecks.systemId, systemId),
    eq(systemHealthChecks.configurationId, configurationId),
  );

  // `returning` yields one entry per updated row, which tells us whether
  // the assignment existed at all.
  const updatedRows = await this.db
    .update(systemHealthChecks)
    .set({ enabled, updatedAt: new Date() })
    .where(matchesAssignment)
    .returning({ systemId: systemHealthChecks.systemId });

  return updatedRows.length !== 0;
}
|
|
285
|
+
|
|
175
286
|
async disassociateSystem(systemId: string, configurationId: string) {
|
|
176
287
|
await this.db
|
|
177
288
|
.delete(systemHealthChecks)
|
|
@@ -282,6 +393,7 @@ export class HealthCheckService {
|
|
|
282
393
|
stateThresholds: systemHealthChecks.stateThresholds,
|
|
283
394
|
satelliteIds: systemHealthChecks.satelliteIds,
|
|
284
395
|
includeLocal: systemHealthChecks.includeLocal,
|
|
396
|
+
notificationPolicy: systemHealthChecks.notificationPolicy,
|
|
285
397
|
})
|
|
286
398
|
.from(systemHealthChecks)
|
|
287
399
|
.innerJoin(
|
|
@@ -304,11 +416,55 @@ export class HealthCheckService {
|
|
|
304
416
|
stateThresholds: thresholds,
|
|
305
417
|
satelliteIds: row.satelliteIds ?? undefined,
|
|
306
418
|
includeLocal: row.includeLocal,
|
|
419
|
+
notificationPolicy: row.notificationPolicy ?? undefined,
|
|
307
420
|
});
|
|
308
421
|
}
|
|
309
422
|
return results;
|
|
310
423
|
}
|
|
311
424
|
|
|
425
|
+
/**
 * Produce the effective notification policy for one
 * (system, configuration) assignment.
 *
 * Lookup order:
 *
 *  1. The assignment's own `systemHealthChecks.notificationPolicy`, when
 *     the row exists and the column is non-null. The stored value is run
 *     through `NotificationPolicySchema.parse` before being returned.
 *  2. Otherwise whatever `getPlatformNotificationDefaults()` yields —
 *     the persisted platform defaults, or the compile-time
 *     `DEFAULT_NOTIFICATION_POLICY`.
 *
 * Overrides are all-or-nothing on purpose: a row is either fully
 * overridden or fully inherited. Writing `null` to the row's policy puts
 * it back on the platform defaults (the original comment identifies this
 * as the "Use platform defaults" action in the UI).
 *
 * @returns The resolved policy for the assignment.
 */
async getAssignmentNotificationPolicy({
  systemId,
  configurationId,
}: {
  systemId: string;
  configurationId: string;
}): Promise<NotificationPolicy> {
  const matches = await this.db
    .select({
      notificationPolicy: systemHealthChecks.notificationPolicy,
    })
    .from(systemHealthChecks)
    .where(
      and(
        eq(systemHealthChecks.systemId, systemId),
        eq(systemHealthChecks.configurationId, configurationId),
      ),
    )
    .limit(1);
  const assignment = matches[0];

  // Explicit per-assignment override wins.
  if (assignment && assignment.notificationPolicy !== null) {
    return NotificationPolicySchema.parse(assignment.notificationPolicy);
  }

  // Missing assignment or no override stored → inherit platform defaults.
  return this.getPlatformNotificationDefaults();
}
|
|
467
|
+
|
|
312
468
|
/**
|
|
313
469
|
* Get the evaluated health status for a system based on configured thresholds.
|
|
314
470
|
* Aggregates status from all health check configurations for this system.
|
|
@@ -489,6 +645,7 @@ export class HealthCheckService {
|
|
|
489
645
|
startDate?: Date;
|
|
490
646
|
endDate?: Date;
|
|
491
647
|
sourceFilter?: string;
|
|
648
|
+
statusFilter?: HealthCheckStatus[];
|
|
492
649
|
limit?: number;
|
|
493
650
|
offset?: number;
|
|
494
651
|
sortOrder: "asc" | "desc";
|
|
@@ -499,6 +656,7 @@ export class HealthCheckService {
|
|
|
499
656
|
startDate,
|
|
500
657
|
endDate,
|
|
501
658
|
sourceFilter,
|
|
659
|
+
statusFilter,
|
|
502
660
|
limit = 10,
|
|
503
661
|
offset = 0,
|
|
504
662
|
sortOrder,
|
|
@@ -518,6 +676,11 @@ export class HealthCheckService {
|
|
|
518
676
|
conditions.push(eq(healthCheckRuns.sourceId, sourceFilter));
|
|
519
677
|
}
|
|
520
678
|
|
|
679
|
+
// Status filtering (e.g. only failing runs)
|
|
680
|
+
if (statusFilter && statusFilter.length > 0) {
|
|
681
|
+
conditions.push(inArray(healthCheckRuns.status, statusFilter));
|
|
682
|
+
}
|
|
683
|
+
|
|
521
684
|
// Build where clause
|
|
522
685
|
const whereClause = conditions.length > 0 ? and(...conditions) : undefined;
|
|
523
686
|
|
|
@@ -563,6 +726,7 @@ export class HealthCheckService {
|
|
|
563
726
|
startDate?: Date;
|
|
564
727
|
endDate?: Date;
|
|
565
728
|
sourceFilter?: string;
|
|
729
|
+
statusFilter?: HealthCheckStatus[];
|
|
566
730
|
limit?: number;
|
|
567
731
|
offset?: number;
|
|
568
732
|
sortOrder: "asc" | "desc";
|
|
@@ -573,6 +737,7 @@ export class HealthCheckService {
|
|
|
573
737
|
startDate,
|
|
574
738
|
endDate,
|
|
575
739
|
sourceFilter,
|
|
740
|
+
statusFilter,
|
|
576
741
|
limit = 10,
|
|
577
742
|
offset = 0,
|
|
578
743
|
sortOrder,
|
|
@@ -592,6 +757,11 @@ export class HealthCheckService {
|
|
|
592
757
|
conditions.push(eq(healthCheckRuns.sourceId, sourceFilter));
|
|
593
758
|
}
|
|
594
759
|
|
|
760
|
+
// Status filtering (e.g. only failing runs)
|
|
761
|
+
if (statusFilter && statusFilter.length > 0) {
|
|
762
|
+
conditions.push(inArray(healthCheckRuns.status, statusFilter));
|
|
763
|
+
}
|
|
764
|
+
|
|
595
765
|
const whereClause = conditions.length > 0 ? and(...conditions) : undefined;
|
|
596
766
|
const total = await this.db.$count(healthCheckRuns, whereClause);
|
|
597
767
|
|
|
@@ -1068,6 +1238,27 @@ export class HealthCheckService {
|
|
|
1068
1238
|
|
|
1069
1239
|
if (matchingAssociations.length === 0) return [];
|
|
1070
1240
|
|
|
1241
|
+
// Memoised lookup of display names, so each distinct systemId costs at
// most one catalog call within this invocation. The systemId itself is
// used as the name when no catalog client is wired or the lookup throws;
// per the original comment this mirrors the queue-executor's behaviour.
const systemNameCache = new Map<string, string>();
const resolveSystemName = async (systemId: string): Promise<string> => {
  const remembered = systemNameCache.get(systemId);
  if (remembered !== undefined) return remembered;

  const catalog = this.catalogClient;
  let displayName = systemId;
  if (catalog) {
    try {
      const record = await catalog.getSystem({ systemId });
      if (record) displayName = record.name;
    } catch {
      // Best-effort: keep the systemId as the name when the catalog
      // lookup fails.
    }
  }

  systemNameCache.set(systemId, displayName);
  return displayName;
};
|
|
1261
|
+
|
|
1071
1262
|
// Get configurations for each matching association
|
|
1072
1263
|
const assignments = [];
|
|
1073
1264
|
for (const assoc of matchingAssociations) {
|
|
@@ -1085,6 +1276,9 @@ export class HealthCheckService {
|
|
|
1085
1276
|
config: config.config,
|
|
1086
1277
|
collectors: config.collectors ?? undefined,
|
|
1087
1278
|
intervalSeconds: config.intervalSeconds,
|
|
1279
|
+
// Curated run-context metadata exposed to satellite collectors.
|
|
1280
|
+
configName: config.name,
|
|
1281
|
+
systemName: await resolveSystemName(assoc.systemId),
|
|
1088
1282
|
});
|
|
1089
1283
|
}
|
|
1090
1284
|
|
package/tsconfig.json
CHANGED
|
@@ -4,6 +4,9 @@
|
|
|
4
4
|
"src"
|
|
5
5
|
],
|
|
6
6
|
"references": [
|
|
7
|
+
{
|
|
8
|
+
"path": "../automation-backend"
|
|
9
|
+
},
|
|
7
10
|
{
|
|
8
11
|
"path": "../backend-api"
|
|
9
12
|
},
|
|
@@ -38,10 +41,10 @@
|
|
|
38
41
|
"path": "../healthcheck-common"
|
|
39
42
|
},
|
|
40
43
|
{
|
|
41
|
-
"path": "../incident-
|
|
44
|
+
"path": "../incident-backend"
|
|
42
45
|
},
|
|
43
46
|
{
|
|
44
|
-
"path": "../
|
|
47
|
+
"path": "../incident-common"
|
|
45
48
|
},
|
|
46
49
|
{
|
|
47
50
|
"path": "../maintenance-common"
|