@checkstack/healthcheck-backend 1.4.0 → 1.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +303 -0
- package/drizzle/0018_abnormal_preak.sql +10 -0
- package/drizzle/meta/0018_snapshot.json +600 -0
- package/drizzle/meta/_journal.json +7 -0
- package/package.json +26 -21
- package/src/ai/assertion-validation.test.ts +117 -0
- package/src/ai/assertion-validation.ts +147 -0
- package/src/ai/healthcheck-capabilities.test.ts +158 -0
- package/src/ai/healthcheck-capabilities.ts +217 -0
- package/src/ai/healthcheck-delete.test.ts +81 -0
- package/src/ai/healthcheck-delete.ts +81 -0
- package/src/ai/healthcheck-projection.test.ts +36 -0
- package/src/ai/healthcheck-propose.test.ts +268 -0
- package/src/ai/healthcheck-propose.ts +290 -0
- package/src/ai/healthcheck-script-tools.test.ts +93 -0
- package/src/ai/healthcheck-script-tools.ts +179 -0
- package/src/ai/healthcheck-update.test.ts +123 -0
- package/src/ai/healthcheck-update.ts +123 -0
- package/src/ai/notify-subscribers.test.ts +109 -0
- package/src/ai/notify-subscribers.ts +176 -0
- package/src/ai/register-ai-tools.test.ts +41 -0
- package/src/ai/register-ai-tools.ts +53 -0
- package/src/ai/shell-env-table.test.ts +47 -0
- package/src/automations.test.ts +2 -1
- package/src/automations.ts +9 -1
- package/src/collector-script-test.test.ts +53 -1
- package/src/collector-script-test.ts +59 -7
- package/src/effective-environments.test.ts +93 -0
- package/src/effective-environments.ts +64 -0
- package/src/health-entity-id.ts +57 -0
- package/src/health-entity.test.ts +405 -31
- package/src/health-entity.ts +99 -43
- package/src/health-state.ts +41 -4
- package/src/healthcheck-gitops-kinds.test.ts +95 -0
- package/src/healthcheck-gitops-kinds.ts +56 -13
- package/src/index.ts +33 -0
- package/src/migration-chain-contract.test.ts +57 -0
- package/src/queue-executor.test.ts +814 -0
- package/src/queue-executor.ts +342 -50
- package/src/realtime-aggregation.test.ts +30 -0
- package/src/realtime-aggregation.ts +16 -0
- package/src/retention-job.ts +167 -93
- package/src/retention-rollup.test.ts +118 -0
- package/src/router.test.ts +120 -1
- package/src/router.ts +20 -0
- package/src/schema.ts +44 -6
- package/src/service.ts +199 -43
- package/src/state-evaluator.test.ts +50 -5
- package/src/state-evaluator.ts +9 -2
- package/src/state-transitions.test.ts +104 -0
- package/src/state-transitions.ts +39 -1
- package/src/validate-configuration.test.ts +205 -0
- package/src/validate-configuration.ts +159 -0
- package/tsconfig.json +9 -0
package/src/schema.ts
CHANGED
|
@@ -97,6 +97,20 @@ export const systemHealthChecks = pgTable(
|
|
|
97
97
|
* When set, the check runs on these satellite nodes in addition to (or instead of) the core.
|
|
98
98
|
*/
|
|
99
99
|
satelliteIds: jsonb("satellite_ids").$type<string[]>(),
|
|
100
|
+
/**
|
|
101
|
+
* Per-assignment environment selector for per-environment fan-out.
|
|
102
|
+
*
|
|
103
|
+
* Semantics (null vs [] are SEMANTICALLY DISTINCT here, unlike most
|
|
104
|
+
* nullable jsonb in this schema):
|
|
105
|
+
* - `null` => all environments the system currently belongs to.
|
|
106
|
+
* - `[]` => opt out: run ONCE with no environment in context.
|
|
107
|
+
* - non-empty => exactly those environment ids, intersected with the
|
|
108
|
+
* system's current membership (a removed env silently drops out).
|
|
109
|
+
*
|
|
110
|
+
* The service distinguishes `row.environmentIds === null` from
|
|
111
|
+
* `length === 0`; jsonb stores both faithfully.
|
|
112
|
+
*/
|
|
113
|
+
environmentIds: jsonb("environment_ids").$type<string[]>(),
|
|
100
114
|
/**
|
|
101
115
|
* Whether to also run this check locally on the core instance.
|
|
102
116
|
* Defaults to true. Only relevant when satelliteIds is set.
|
|
@@ -145,22 +159,30 @@ export const healthCheckStateTransitions = pgTable(
|
|
|
145
159
|
configurationId: uuid("configuration_id")
|
|
146
160
|
.notNull()
|
|
147
161
|
.references(() => healthCheckConfigurations.id, { onDelete: "cascade" }),
|
|
162
|
+
/**
|
|
163
|
+
* Environment this transition belongs to. null = a transition recorded
|
|
164
|
+
* for an env-less run (the opt-out / no-membership case). Per-environment
|
|
165
|
+
* transitions stay distinct so "in status since" is env-scoped.
|
|
166
|
+
*/
|
|
167
|
+
environmentId: text("environment_id"),
|
|
148
168
|
fromStatus: healthCheckStatusEnum("from_status"),
|
|
149
169
|
toStatus: healthCheckStatusEnum("to_status").notNull(),
|
|
150
170
|
transitionedAt: timestamp("transitioned_at").defaultNow().notNull(),
|
|
151
171
|
},
|
|
152
172
|
(t) => ({
|
|
153
|
-
// Powers "most recent transition into status X for this system"
|
|
154
|
-
// (WHERE system_id = ? AND to_status = ? ORDER BY transitioned_at DESC).
|
|
173
|
+
// Powers "most recent transition into status X for this system+env"
|
|
174
|
+
// (WHERE system_id = ? AND environment_id = ? AND to_status = ?
|
|
175
|
+
// ORDER BY transitioned_at DESC).
|
|
155
176
|
lookupIdx: index("health_check_state_transitions_lookup_idx").on(
|
|
156
177
|
t.systemId,
|
|
178
|
+
t.environmentId,
|
|
157
179
|
t.toStatus,
|
|
158
180
|
t.transitionedAt,
|
|
159
181
|
),
|
|
160
|
-
// Powers the retention "keep newest per system" sweep.
|
|
182
|
+
// Powers the retention "keep newest per system+env" sweep.
|
|
161
183
|
systemRecentIdx: index(
|
|
162
184
|
"health_check_state_transitions_system_recent_idx",
|
|
163
|
-
).on(t.systemId, t.transitionedAt),
|
|
185
|
+
).on(t.systemId, t.environmentId, t.transitionedAt),
|
|
164
186
|
}),
|
|
165
187
|
);
|
|
166
188
|
|
|
@@ -170,6 +192,15 @@ export const healthCheckRuns = pgTable("health_check_runs", {
|
|
|
170
192
|
.notNull()
|
|
171
193
|
.references(() => healthCheckConfigurations.id, { onDelete: "cascade" }),
|
|
172
194
|
systemId: text("system_id").notNull(),
|
|
195
|
+
/**
|
|
196
|
+
* Environment this run was executed for (per-environment fan-out).
|
|
197
|
+
* null = ran with no environment (the opt-out / no-membership case,
|
|
198
|
+
* which is exactly the pre-feature behavior). Nullable text, NOT a FK
|
|
199
|
+
* to the catalog `environments` table (healthcheck and catalog are
|
|
200
|
+
* separate plugins with separate Postgres schemas, mirroring how
|
|
201
|
+
* `systemId` is a bare text with no FK to `systems`).
|
|
202
|
+
*/
|
|
203
|
+
environmentId: text("environment_id"),
|
|
173
204
|
status: healthCheckStatusEnum("status").notNull(),
|
|
174
205
|
/** Execution duration in milliseconds */
|
|
175
206
|
latencyMs: integer("latency_ms"),
|
|
@@ -207,6 +238,11 @@ export const healthCheckAggregates = pgTable(
|
|
|
207
238
|
.notNull()
|
|
208
239
|
.references(() => healthCheckConfigurations.id, { onDelete: "cascade" }),
|
|
209
240
|
systemId: text("system_id").notNull(),
|
|
241
|
+
/**
|
|
242
|
+
* Environment this bucket aggregates. null = env-less runs. Part of the
|
|
243
|
+
* unique key so per-environment buckets stay separate.
|
|
244
|
+
*/
|
|
245
|
+
environmentId: text("environment_id"),
|
|
210
246
|
bucketStart: timestamp("bucket_start").notNull(),
|
|
211
247
|
bucketSize: bucketSizeEnum("bucket_size").notNull(),
|
|
212
248
|
|
|
@@ -238,12 +274,14 @@ export const healthCheckAggregates = pgTable(
|
|
|
238
274
|
sourceLabel: text("source_label"),
|
|
239
275
|
},
|
|
240
276
|
(t) => ({
|
|
241
|
-
// Unique constraint includes
|
|
242
|
-
// NULLS NOT DISTINCT ensures
|
|
277
|
+
// Unique constraint includes environmentId (per-environment fan-out)
|
|
278
|
+
// and sourceId (per-region aggregation). NULLS NOT DISTINCT ensures
|
|
279
|
+
// env-less / local runs (environmentId / sourceId = NULL) correctly
|
|
243
280
|
// conflict-match instead of creating duplicate rows per hour.
|
|
244
281
|
bucketUnique: unique("health_check_aggregates_bucket_unique").on(
|
|
245
282
|
t.configurationId,
|
|
246
283
|
t.systemId,
|
|
284
|
+
t.environmentId,
|
|
247
285
|
t.bucketStart,
|
|
248
286
|
t.bucketSize,
|
|
249
287
|
t.sourceId,
|
package/src/service.ts
CHANGED
|
@@ -9,6 +9,7 @@ import {
|
|
|
9
9
|
type NotificationPolicy,
|
|
10
10
|
NotificationPolicySchema,
|
|
11
11
|
DEFAULT_NOTIFICATION_POLICY,
|
|
12
|
+
type CollectorConfigEntry,
|
|
12
13
|
} from "@checkstack/healthcheck-common";
|
|
13
14
|
import type { ConfigService } from "@checkstack/backend-api";
|
|
14
15
|
import type { InferClient } from "@checkstack/common";
|
|
@@ -39,6 +40,7 @@ import {
|
|
|
39
40
|
import { ORPCError } from "@orpc/server";
|
|
40
41
|
import { evaluateHealthStatus } from "./state-evaluator";
|
|
41
42
|
import { computeHealthState, type HealthState } from "./health-state";
|
|
43
|
+
import { parseHealthEntityId } from "./health-entity-id";
|
|
42
44
|
import { stateThresholds } from "./state-thresholds-migrations";
|
|
43
45
|
import type { MaintenanceApi } from "@checkstack/maintenance-common";
|
|
44
46
|
import type { Logger } from "@checkstack/backend-api";
|
|
@@ -61,6 +63,16 @@ import {
|
|
|
61
63
|
// Drizzle type helper - uses SafeDatabase to prevent relational query API usage
|
|
62
64
|
type Db = SafeDatabase<typeof schema>;
|
|
63
65
|
|
|
66
|
+
/**
|
|
67
|
+
* Narrow a migrated config (typed `unknown` by the versioning chain) to a
|
|
68
|
+
* spreadable record. Every registered strategy/collector config schema is an
|
|
69
|
+
* object, so a successfully validated value is always object-shaped at
|
|
70
|
+
* runtime; this guard keeps the type-level handling cast-free.
|
|
71
|
+
*/
|
|
72
|
+
function isConfigRecord(value: unknown): value is Record<string, unknown> {
|
|
73
|
+
return typeof value === "object" && value !== null;
|
|
74
|
+
}
|
|
75
|
+
|
|
64
76
|
// Catalog client type used to resolve human-readable system names for
|
|
65
77
|
// satellite assignment run-context. Optional on the service.
|
|
66
78
|
type CatalogClient = InferClient<typeof CatalogApi>;
|
|
@@ -208,7 +220,7 @@ export class HealthCheckService {
|
|
|
208
220
|
|
|
209
221
|
async getConfigurations(): Promise<HealthCheckConfiguration[]> {
|
|
210
222
|
const configs = await this.db.select().from(healthCheckConfigurations);
|
|
211
|
-
return configs.map((c) => this.mapConfig(c));
|
|
223
|
+
return Promise.all(configs.map((c) => this.mapConfig(c)));
|
|
212
224
|
}
|
|
213
225
|
|
|
214
226
|
async associateSystem(props: {
|
|
@@ -217,6 +229,13 @@ export class HealthCheckService {
|
|
|
217
229
|
enabled?: boolean;
|
|
218
230
|
stateThresholds?: StateThresholds;
|
|
219
231
|
satelliteIds?: string[];
|
|
232
|
+
/**
|
|
233
|
+
* Per-assignment environment selector. `null` (or `undefined`) = all
|
|
234
|
+
* current environments; `[]` = opt out (env-less); non-empty = those
|
|
235
|
+
* ids. `null` and `[]` are stored distinctly so the run-time resolver
|
|
236
|
+
* can tell "all" from "opt out". `undefined` is normalized to `null`.
|
|
237
|
+
*/
|
|
238
|
+
environmentIds?: string[] | null;
|
|
220
239
|
includeLocal?: boolean;
|
|
221
240
|
notificationPolicy?: NotificationPolicy;
|
|
222
241
|
}) {
|
|
@@ -226,10 +245,16 @@ export class HealthCheckService {
|
|
|
226
245
|
enabled = true,
|
|
227
246
|
stateThresholds: stateThresholds_,
|
|
228
247
|
satelliteIds,
|
|
248
|
+
environmentIds,
|
|
229
249
|
includeLocal = true,
|
|
230
250
|
notificationPolicy,
|
|
231
251
|
} = props;
|
|
232
252
|
|
|
253
|
+
// Preserve the null/[]/list distinction faithfully. `undefined` props
|
|
254
|
+
// mean "not provided" -> treat as `null` ("all current environments"),
|
|
255
|
+
// the default fan-out behavior. `[]` is kept verbatim (opt-out).
|
|
256
|
+
const environmentIdsValue: string[] | null = environmentIds ?? null;
|
|
257
|
+
|
|
233
258
|
// Wrap thresholds in versioned config if provided
|
|
234
259
|
const versionedThresholds: VersionedStateThresholds | undefined =
|
|
235
260
|
stateThresholds_ ? stateThresholds.create(stateThresholds_) : undefined;
|
|
@@ -242,6 +267,7 @@ export class HealthCheckService {
|
|
|
242
267
|
enabled,
|
|
243
268
|
stateThresholds: versionedThresholds,
|
|
244
269
|
satelliteIds: satelliteIds ?? undefined,
|
|
270
|
+
environmentIds: environmentIdsValue,
|
|
245
271
|
includeLocal,
|
|
246
272
|
notificationPolicy: notificationPolicy ?? undefined,
|
|
247
273
|
})
|
|
@@ -254,6 +280,7 @@ export class HealthCheckService {
|
|
|
254
280
|
enabled,
|
|
255
281
|
stateThresholds: versionedThresholds,
|
|
256
282
|
satelliteIds: satelliteIds ?? undefined,
|
|
283
|
+
environmentIds: environmentIdsValue,
|
|
257
284
|
includeLocal,
|
|
258
285
|
notificationPolicy: notificationPolicy ?? undefined,
|
|
259
286
|
updatedAt: new Date(),
|
|
@@ -385,7 +412,7 @@ export class HealthCheckService {
|
|
|
385
412
|
)
|
|
386
413
|
.where(eq(systemHealthChecks.systemId, systemId));
|
|
387
414
|
|
|
388
|
-
return rows.map((r) => this.mapConfig(r.config));
|
|
415
|
+
return Promise.all(rows.map((r) => this.mapConfig(r.config)));
|
|
389
416
|
}
|
|
390
417
|
|
|
391
418
|
/**
|
|
@@ -399,6 +426,7 @@ export class HealthCheckService {
|
|
|
399
426
|
enabled: systemHealthChecks.enabled,
|
|
400
427
|
stateThresholds: systemHealthChecks.stateThresholds,
|
|
401
428
|
satelliteIds: systemHealthChecks.satelliteIds,
|
|
429
|
+
environmentIds: systemHealthChecks.environmentIds,
|
|
402
430
|
includeLocal: systemHealthChecks.includeLocal,
|
|
403
431
|
notificationPolicy: systemHealthChecks.notificationPolicy,
|
|
404
432
|
})
|
|
@@ -422,6 +450,9 @@ export class HealthCheckService {
|
|
|
422
450
|
enabled: row.enabled,
|
|
423
451
|
stateThresholds: thresholds,
|
|
424
452
|
satelliteIds: row.satelliteIds ?? undefined,
|
|
453
|
+
// Preserve the null/[]/list distinction (null = all envs, [] = opt
|
|
454
|
+
// out). Do NOT collapse null to undefined via `??`.
|
|
455
|
+
environmentIds: row.environmentIds,
|
|
425
456
|
includeLocal: row.includeLocal,
|
|
426
457
|
notificationPolicy: row.notificationPolicy ?? undefined,
|
|
427
458
|
});
|
|
@@ -475,9 +506,25 @@ export class HealthCheckService {
|
|
|
475
506
|
/**
|
|
476
507
|
* Get the evaluated health status for a system based on configured thresholds.
|
|
477
508
|
* Aggregates status from all health check configurations for this system.
|
|
509
|
+
*
|
|
510
|
+
* Environment dimension (Phase 3b, §7.4.2):
|
|
511
|
+
* - `environmentId` OMITTED (or `undefined`) ⇒ the **system rollup**: all
|
|
512
|
+
* runs for the system regardless of environment. "Any env unhealthy ⇒ at
|
|
513
|
+
* least one unhealthy run in the window" already yields worst-status
|
|
514
|
+
* semantics for the window-based evaluator, and it exactly matches the
|
|
515
|
+
* pre-3b behavior when no environments exist (no extra catalog read).
|
|
516
|
+
* - `environmentId` a STRING ⇒ the per-environment slice: only runs whose
|
|
517
|
+
* `environment_id` equals that id.
|
|
518
|
+
* - `environmentId` `null` ⇒ the ENV-LESS slice: only runs with
|
|
519
|
+
* `environment_id IS NULL` (the opt-out / no-membership case).
|
|
520
|
+
*
|
|
521
|
+
* The env filter narrows ONLY the per-check run window; the set of enabled
|
|
522
|
+
* associations (and thus `checkStatuses.length`, the existence gate) is the
|
|
523
|
+
* same across views, so a per-env view and the rollup agree on totalChecks.
|
|
478
524
|
*/
|
|
479
525
|
async getSystemHealthStatus(
|
|
480
526
|
systemId: string,
|
|
527
|
+
environmentId?: string | null,
|
|
481
528
|
): Promise<SystemHealthStatusResponse> {
|
|
482
529
|
// Get all associations for this system with their thresholds and config names
|
|
483
530
|
const associations = await this.db
|
|
@@ -512,6 +559,17 @@ export class HealthCheckService {
|
|
|
512
559
|
const checkStatuses: SystemCheckStatus[] = [];
|
|
513
560
|
const maxWindowSize = 100; // Max configurable window size
|
|
514
561
|
|
|
562
|
+
// Environment filter for the per-check run window. `undefined` (rollup)
|
|
563
|
+
// adds no predicate; `null` filters to the env-less slice; a string
|
|
564
|
+
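// filters to that environment. NOTE(review): the only index shown leading with
|
|
565
|
+
// (system_id, environment_id, …) is on health_check_state_transitions, not on health_check_runs (the table this query reads) — confirm the runs table has matching index coverage.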
|
|
566
|
+
const envFilter =
|
|
567
|
+
environmentId === undefined
|
|
568
|
+
? undefined
|
|
569
|
+
: environmentId === null
|
|
570
|
+
? isNull(healthCheckRuns.environmentId)
|
|
571
|
+
: eq(healthCheckRuns.environmentId, environmentId);
|
|
572
|
+
|
|
515
573
|
for (const assoc of associations) {
|
|
516
574
|
const runs = await this.db
|
|
517
575
|
.select({
|
|
@@ -523,6 +581,7 @@ export class HealthCheckService {
|
|
|
523
581
|
and(
|
|
524
582
|
eq(healthCheckRuns.systemId, systemId),
|
|
525
583
|
eq(healthCheckRuns.configurationId, assoc.configurationId),
|
|
584
|
+
...(envFilter ? [envFilter] : []),
|
|
526
585
|
),
|
|
527
586
|
)
|
|
528
587
|
.orderBy(desc(healthCheckRuns.timestamp))
|
|
@@ -577,6 +636,7 @@ export class HealthCheckService {
|
|
|
577
636
|
async getHealthState({
|
|
578
637
|
systemId,
|
|
579
638
|
configurationId,
|
|
639
|
+
environmentId,
|
|
580
640
|
maintenanceClient,
|
|
581
641
|
logger,
|
|
582
642
|
transitionWindowMinutes,
|
|
@@ -584,6 +644,13 @@ export class HealthCheckService {
|
|
|
584
644
|
}: {
|
|
585
645
|
systemId: string;
|
|
586
646
|
configurationId?: string;
|
|
647
|
+
/**
|
|
648
|
+
* Environment to scope the snapshot to (Phase 3b). `undefined` = the
|
|
649
|
+
* system rollup; `null` = the env-less slice; a string = that env. Threads
|
|
650
|
+
* into both the status resolver and every durable read in
|
|
651
|
+
* `computeHealthState`.
|
|
652
|
+
*/
|
|
653
|
+
environmentId?: string | null;
|
|
587
654
|
maintenanceClient?: MaintenanceClient;
|
|
588
655
|
logger?: Logger;
|
|
589
656
|
transitionWindowMinutes?: number;
|
|
@@ -593,12 +660,16 @@ export class HealthCheckService {
|
|
|
593
660
|
db: this.db,
|
|
594
661
|
systemId,
|
|
595
662
|
configurationId,
|
|
663
|
+
environmentId,
|
|
596
664
|
maintenanceClient,
|
|
597
665
|
logger,
|
|
598
666
|
transitionWindowMinutes,
|
|
599
667
|
now,
|
|
600
668
|
resolveStatus: async () => {
|
|
601
|
-
const overview = await this.getSystemHealthStatus(
|
|
669
|
+
const overview = await this.getSystemHealthStatus(
|
|
670
|
+
systemId,
|
|
671
|
+
environmentId,
|
|
672
|
+
);
|
|
602
673
|
if (!configurationId) return overview.status;
|
|
603
674
|
const check = overview.checkStatuses.find(
|
|
604
675
|
(c) => c.configurationId === configurationId,
|
|
@@ -611,10 +682,17 @@ export class HealthCheckService {
|
|
|
611
682
|
}
|
|
612
683
|
|
|
613
684
|
/**
|
|
614
|
-
* Bulk variant of {@link getHealthState}. Resolves every
|
|
615
|
-
*
|
|
616
|
-
*
|
|
617
|
-
*
|
|
685
|
+
* Bulk variant of {@link getHealthState}. Resolves every id in parallel
|
|
686
|
+
* against a single shared `now` so durations are consistent across the
|
|
687
|
+
* batch. Avoids N+1 from dashboards and multi-system automation rules.
|
|
688
|
+
*
|
|
689
|
+
* Environment-aware (Phase 3b, §7.4.4): an id may be the bare `"<systemId>"`
|
|
690
|
+
* (the system rollup) OR the env-qualified `"<systemId>::<environmentId>"`
|
|
691
|
+
* (a per-environment view). Each id is parsed via {@link parseHealthEntityId}
|
|
692
|
+
* and resolved against the right env slice, and the result is keyed by the
|
|
693
|
+
* ORIGINAL id string. So scope enrichment that reads
|
|
694
|
+
* `health.systems["<systemId>::<environmentId>"]` gets the per-env snapshot
|
|
695
|
+
* and `health.systems["<systemId>"]` gets the rollup, with no caller change.
|
|
618
696
|
*/
|
|
619
697
|
async getBulkHealthState({
|
|
620
698
|
systemIds,
|
|
@@ -623,6 +701,7 @@ export class HealthCheckService {
|
|
|
623
701
|
transitionWindowMinutes,
|
|
624
702
|
now = new Date(),
|
|
625
703
|
}: {
|
|
704
|
+
/** Health entity ids — bare systemId (rollup) or `systemId::environmentId`. */
|
|
626
705
|
systemIds: string[];
|
|
627
706
|
maintenanceClient?: MaintenanceClient;
|
|
628
707
|
logger?: Logger;
|
|
@@ -630,19 +709,25 @@ export class HealthCheckService {
|
|
|
630
709
|
now?: Date;
|
|
631
710
|
}): Promise<Record<string, HealthState>> {
|
|
632
711
|
const entries = await Promise.all(
|
|
633
|
-
systemIds.map(
|
|
634
|
-
|
|
635
|
-
|
|
712
|
+
systemIds.map(async (id) => {
|
|
713
|
+
const { systemId, environmentId } = parseHealthEntityId(id);
|
|
714
|
+
return [
|
|
715
|
+
id,
|
|
716
|
+
await this.getHealthState({
|
|
636
717
|
systemId,
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
718
|
+
// A bare `<systemId>` id is the ROLLUP and must read ALL runs
|
|
719
|
+
// (`undefined`), NOT the env-less slice (`null`, i.e.
|
|
720
|
+
// `env_id IS NULL`). `parseHealthEntityId` returns `null` for a
|
|
721
|
+
// bare id; map it to `undefined` here. `null` stays reserved for
|
|
722
|
+
// an explicit env-less read.
|
|
723
|
+
environmentId: environmentId === null ? undefined : environmentId,
|
|
724
|
+
maintenanceClient,
|
|
725
|
+
logger,
|
|
726
|
+
transitionWindowMinutes,
|
|
727
|
+
now,
|
|
728
|
+
}),
|
|
729
|
+
] as const;
|
|
730
|
+
}),
|
|
646
731
|
);
|
|
647
732
|
return Object.fromEntries(entries);
|
|
648
733
|
}
|
|
@@ -797,6 +882,7 @@ export class HealthCheckService {
|
|
|
797
882
|
status: run.status,
|
|
798
883
|
timestamp: run.timestamp,
|
|
799
884
|
latencyMs: run.latencyMs ?? undefined,
|
|
885
|
+
environmentId: run.environmentId ?? undefined,
|
|
800
886
|
sourceId: run.sourceId ?? undefined,
|
|
801
887
|
sourceLabel: run.sourceLabel ?? undefined,
|
|
802
888
|
})),
|
|
@@ -875,6 +961,7 @@ export class HealthCheckService {
|
|
|
875
961
|
result: run.result ?? {},
|
|
876
962
|
timestamp: run.timestamp,
|
|
877
963
|
latencyMs: run.latencyMs ?? undefined,
|
|
964
|
+
environmentId: run.environmentId ?? undefined,
|
|
878
965
|
sourceId: run.sourceId ?? undefined,
|
|
879
966
|
sourceLabel: run.sourceLabel ?? undefined,
|
|
880
967
|
})),
|
|
@@ -905,6 +992,7 @@ export class HealthCheckService {
|
|
|
905
992
|
result: r.result ?? {},
|
|
906
993
|
timestamp: r.timestamp,
|
|
907
994
|
latencyMs: r.latencyMs ?? undefined,
|
|
995
|
+
environmentId: r.environmentId ?? undefined,
|
|
908
996
|
sourceId: r.sourceId ?? undefined,
|
|
909
997
|
sourceLabel: r.sourceLabel ?? undefined,
|
|
910
998
|
};
|
|
@@ -1255,15 +1343,23 @@ export class HealthCheckService {
|
|
|
1255
1343
|
return new Date(rangeStart.getTime() + bucketIndex * intervalMs);
|
|
1256
1344
|
}
|
|
1257
1345
|
|
|
1258
|
-
|
|
1346
|
+
/**
|
|
1347
|
+
* Map a stored configuration row to the public DTO, migrating the
|
|
1348
|
+
* (UNVERSIONED) strategy + collector configs via assume-v1-on-read so the
|
|
1349
|
+
* read API (router / frontend / gitops `getConfiguration`) returns migrated
|
|
1350
|
+
* shapes. Migrations are idempotent, so an already-current config is a
|
|
1351
|
+
* no-op. An unregistered strategy/collector or a failed migrate falls back
|
|
1352
|
+
* to the raw stored blob rather than dropping the configuration.
|
|
1353
|
+
*/
|
|
1354
|
+
private async mapConfig(
|
|
1259
1355
|
row: InferSelectModel<typeof healthCheckConfigurations>,
|
|
1260
|
-
): HealthCheckConfiguration {
|
|
1356
|
+
): Promise<HealthCheckConfiguration> {
|
|
1261
1357
|
return {
|
|
1262
1358
|
id: row.id,
|
|
1263
1359
|
name: row.name,
|
|
1264
1360
|
strategyId: row.strategyId,
|
|
1265
|
-
config: row.config,
|
|
1266
|
-
collectors: row.collectors
|
|
1361
|
+
config: await this.migrateStrategyConfig(row.strategyId, row.config),
|
|
1362
|
+
collectors: await this.migrateCollectorEntries(row.collectors),
|
|
1267
1363
|
intervalSeconds: row.intervalSeconds,
|
|
1268
1364
|
paused: row.paused,
|
|
1269
1365
|
createdAt: row.createdAt,
|
|
@@ -1271,6 +1367,56 @@ export class HealthCheckService {
|
|
|
1271
1367
|
};
|
|
1272
1368
|
}
|
|
1273
1369
|
|
|
1370
|
+
/**
|
|
1371
|
+
* Migrate a stored strategy config via assume-v1-on-read. Falls back to the
|
|
1372
|
+
* raw blob when the strategy is not registered or the migrate/validate
|
|
1373
|
+
* throws, so a read never drops a configuration on a transient mismatch.
|
|
1374
|
+
*/
|
|
1375
|
+
private async migrateStrategyConfig(
|
|
1376
|
+
strategyId: string,
|
|
1377
|
+
rawConfig: Record<string, unknown>,
|
|
1378
|
+
): Promise<Record<string, unknown>> {
|
|
1379
|
+
const strategy = this.registry?.getStrategy(strategyId);
|
|
1380
|
+
if (!strategy) return rawConfig;
|
|
1381
|
+
try {
|
|
1382
|
+
const migrated = await strategy.config.parseAssumingV1(rawConfig);
|
|
1383
|
+
return { ...migrated };
|
|
1384
|
+
} catch {
|
|
1385
|
+
return rawConfig;
|
|
1386
|
+
}
|
|
1387
|
+
}
|
|
1388
|
+
|
|
1389
|
+
/**
|
|
1390
|
+
* Migrate each collector entry's stored config via assume-v1-on-read,
|
|
1391
|
+
* preserving id/collectorId/assertions. Falls back to the raw entry config
|
|
1392
|
+
* when the collector is not registered or migrate/validate throws.
|
|
1393
|
+
*/
|
|
1394
|
+
private async migrateCollectorEntries(
|
|
1395
|
+
collectors: CollectorConfigEntry[] | null,
|
|
1396
|
+
): Promise<CollectorConfigEntry[] | undefined> {
|
|
1397
|
+
if (!collectors || collectors.length === 0) return undefined;
|
|
1398
|
+
return Promise.all(
|
|
1399
|
+
collectors.map(async (entry) => {
|
|
1400
|
+
const registered = this.collectorRegistry?.getCollector(
|
|
1401
|
+
entry.collectorId,
|
|
1402
|
+
);
|
|
1403
|
+
if (!registered) return entry;
|
|
1404
|
+
try {
|
|
1405
|
+
const migrated = await registered.collector.config.parseAssumingV1(
|
|
1406
|
+
entry.config,
|
|
1407
|
+
);
|
|
1408
|
+
// A registered collector's config schema is always an object, so a
|
|
1409
|
+
// successful migrate yields a record; fall back to the raw entry if
|
|
1410
|
+
// the validated value is somehow not object-shaped.
|
|
1411
|
+
if (!isConfigRecord(migrated)) return entry;
|
|
1412
|
+
return { ...entry, config: { ...migrated } };
|
|
1413
|
+
} catch {
|
|
1414
|
+
return entry;
|
|
1415
|
+
}
|
|
1416
|
+
}),
|
|
1417
|
+
);
|
|
1418
|
+
}
|
|
1419
|
+
|
|
1274
1420
|
/**
|
|
1275
1421
|
* Remove a satellite ID from all systemHealthChecks.satelliteIds arrays.
|
|
1276
1422
|
* Called when a satellite is deleted via the satellite.removed hook.
|
|
@@ -1450,27 +1596,37 @@ export class HealthCheckService {
|
|
|
1450
1596
|
? ({ ...result } as Record<string, unknown>)
|
|
1451
1597
|
: {};
|
|
1452
1598
|
|
|
1453
|
-
|
|
1454
|
-
|
|
1455
|
-
|
|
1456
|
-
|
|
1457
|
-
|
|
1458
|
-
|
|
1459
|
-
|
|
1460
|
-
|
|
1461
|
-
|
|
1599
|
+
// Atomic: the run row and the hourly-aggregate increment it feeds must
|
|
1600
|
+
// commit together. Without the transaction a failure on the (non-idempotent
|
|
1601
|
+
// `runCount + 1`) aggregate left a committed run that the aggregate never
|
|
1602
|
+
// counted - or, on the reverse ordering, an aggregate with no backing run.
|
|
1603
|
+
// NOTE: this guarantees run/aggregate consistency, but does NOT make a
|
|
1604
|
+
// *duplicate satellite delivery* (a re-POST after a committed write)
|
|
1605
|
+
// idempotent - that requires a dedupe key on the high-volume runs table and
|
|
1606
|
+
// is tracked as a separate follow-up.
|
|
1607
|
+
await this.db.transaction(async (tx) => {
|
|
1608
|
+
await tx.insert(healthCheckRuns).values({
|
|
1609
|
+
configurationId: configId,
|
|
1610
|
+
systemId,
|
|
1611
|
+
status,
|
|
1612
|
+
latencyMs,
|
|
1613
|
+
result: resultRecord,
|
|
1614
|
+
sourceId,
|
|
1615
|
+
sourceLabel,
|
|
1616
|
+
});
|
|
1462
1617
|
|
|
1463
|
-
|
|
1464
|
-
|
|
1465
|
-
|
|
1466
|
-
|
|
1467
|
-
|
|
1468
|
-
|
|
1469
|
-
|
|
1470
|
-
|
|
1471
|
-
|
|
1472
|
-
|
|
1473
|
-
|
|
1618
|
+
// Trigger incremental hourly aggregation — same as local executor
|
|
1619
|
+
await incrementHourlyAggregate({
|
|
1620
|
+
db: tx,
|
|
1621
|
+
systemId,
|
|
1622
|
+
configurationId: configId,
|
|
1623
|
+
status,
|
|
1624
|
+
latencyMs,
|
|
1625
|
+
runTimestamp: new Date(props.executedAt),
|
|
1626
|
+
result: resultRecord,
|
|
1627
|
+
collectorRegistry: this.collectorRegistry,
|
|
1628
|
+
sourceLabel,
|
|
1629
|
+
});
|
|
1474
1630
|
});
|
|
1475
1631
|
}
|
|
1476
1632
|
}
|
package/src/state-evaluator.test.ts
CHANGED
|
@@ -176,9 +176,51 @@ describe("evaluateHealthStatus", () => {
|
|
|
176
176
|
});
|
|
177
177
|
});
|
|
178
178
|
|
|
179
|
+
describe("transient failure (single blip) does not escalate", () => {
|
|
180
|
+
test("default thresholds: one failure then recovery never leaves healthy", () => {
|
|
181
|
+
// Reproduces the real-world bug: an assignment fails once (e.g. a check
|
|
182
|
+
// timeout) and recovers on the next run. Default degraded threshold is 2
|
|
183
|
+
// consecutive failures, so a single failure must NOT escalate to
|
|
184
|
+
// degraded/unhealthy (which would fire a "System health critical"
|
|
185
|
+
// notification).
|
|
186
|
+
|
|
187
|
+
// After the single failing run (only one run recorded so far).
|
|
188
|
+
expect(evaluateHealthStatus({ runs: createRuns(["unhealthy"]) })).toBe(
|
|
189
|
+
"healthy"
|
|
190
|
+
);
|
|
191
|
+
|
|
192
|
+
// After the next run succeeds.
|
|
193
|
+
expect(
|
|
194
|
+
evaluateHealthStatus({ runs: createRuns(["healthy", "unhealthy"]) })
|
|
195
|
+
).toBe("healthy");
|
|
196
|
+
});
|
|
197
|
+
|
|
198
|
+
test("single leading failure below degraded threshold stays healthy", () => {
|
|
199
|
+
const thresholds: ConsecutiveThresholds = {
|
|
200
|
+
mode: "consecutive",
|
|
201
|
+
healthy: { minSuccessCount: 1 },
|
|
202
|
+
degraded: { minFailureCount: 2 },
|
|
203
|
+
unhealthy: { minFailureCount: 3 },
|
|
204
|
+
};
|
|
205
|
+
// Most recent run failed once, then a flicker of success, then failures.
|
|
206
|
+
// The leading failure streak is only 1 (< degraded threshold of 2), so
|
|
207
|
+
// consecutive mode must NOT report unhealthy off the single latest
|
|
208
|
+
// failure.
|
|
209
|
+
const runs = createRuns([
|
|
210
|
+
"unhealthy",
|
|
211
|
+
"healthy",
|
|
212
|
+
"unhealthy",
|
|
213
|
+
"unhealthy",
|
|
214
|
+
"unhealthy",
|
|
215
|
+
]);
|
|
216
|
+
expect(evaluateHealthStatus({ runs, thresholds })).toBe("healthy");
|
|
217
|
+
});
|
|
218
|
+
});
|
|
219
|
+
|
|
179
220
|
describe("flickering scenarios", () => {
|
|
180
|
-
test("window mode
|
|
181
|
-
// System that is mostly failing but occasionally succeeds
|
|
221
|
+
test("window mode catches a mostly-failing system consecutive mode ignores", () => {
|
|
222
|
+
// System that is mostly failing but occasionally succeeds, with the most
|
|
223
|
+
// recent run a single failure after a flicker of success.
|
|
182
224
|
const runs = createRuns([
|
|
183
225
|
"unhealthy",
|
|
184
226
|
"healthy", // Flicker
|
|
@@ -201,12 +243,15 @@ describe("evaluateHealthStatus", () => {
|
|
|
201
243
|
unhealthy: { minFailureCount: 4 },
|
|
202
244
|
};
|
|
203
245
|
|
|
204
|
-
// Consecutive:
|
|
246
|
+
// Consecutive: only the leading streak counts (1 failure, below the
|
|
247
|
+
// degraded threshold), so it stays healthy and does not over-react to the
|
|
248
|
+
// single most-recent failure.
|
|
205
249
|
expect(
|
|
206
250
|
evaluateHealthStatus({ runs, thresholds: consecutiveThresholds })
|
|
207
|
-
).toBe("
|
|
251
|
+
).toBe("healthy");
|
|
208
252
|
|
|
209
|
-
// Window: sees 4 failures in window of 5, returns unhealthy
|
|
253
|
+
// Window: sees 4 failures in window of 5, returns unhealthy. This is why
|
|
254
|
+
// window mode is preferable for intermittently-failing systems.
|
|
210
255
|
expect(evaluateHealthStatus({ runs, thresholds: windowThresholds })).toBe(
|
|
211
256
|
"unhealthy"
|
|
212
257
|
);
|
package/src/state-evaluator.ts
CHANGED
|
@@ -75,8 +75,15 @@ function evaluateConsecutive(props: {
|
|
|
75
75
|
return "healthy";
|
|
76
76
|
}
|
|
77
77
|
|
|
78
|
-
//
|
|
79
|
-
|
|
78
|
+
// Not enough consecutive failures to reach the degraded threshold (and not
|
|
79
|
+
// enough successes to confirm healthy). The thresholds exist precisely so a
|
|
80
|
+
// transient blip (e.g. a single failing run that recovers on the next run)
|
|
81
|
+
// does NOT escalate the system status. Returning the raw latest run status
|
|
82
|
+
// here would let one failure flip the system to "degraded"/"unhealthy" and
|
|
83
|
+
// fire a spurious "System health critical" notification before the
|
|
84
|
+
// configured failure count is reached. Fall back to "healthy" — the same
|
|
85
|
+
// baseline window mode uses when no threshold is met.
|
|
86
|
+
return "healthy";
|
|
80
87
|
}
|
|
81
88
|
|
|
82
89
|
/**
|