@checkstack/healthcheck-backend 1.2.0 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +541 -0
- package/drizzle/0015_quiet_meggan.sql +12 -0
- package/drizzle/0016_complex_maginty.sql +1 -0
- package/drizzle/0017_pretty_caretaker.sql +1 -0
- package/drizzle/meta/0015_snapshot.json +764 -0
- package/drizzle/meta/0016_snapshot.json +644 -0
- package/drizzle/meta/0017_snapshot.json +563 -0
- package/drizzle/meta/_journal.json +21 -0
- package/package.json +24 -21
- package/src/automations.test.ts +234 -0
- package/src/automations.ts +342 -0
- package/src/collector-script-test.test.ts +236 -0
- package/src/collector-script-test.ts +221 -0
- package/src/health-entity.test.ts +698 -0
- package/src/health-entity.ts +369 -0
- package/src/health-state.test.ts +115 -0
- package/src/health-state.ts +333 -0
- package/src/healthcheck-gitops-kinds.test.ts +6 -32
- package/src/healthcheck-gitops-kinds.ts +4 -19
- package/src/hooks.test.ts +19 -6
- package/src/hooks.ts +38 -28
- package/src/index.ts +150 -98
- package/src/queue-executor.test.ts +137 -0
- package/src/queue-executor.ts +282 -380
- package/src/retention-job.ts +65 -1
- package/src/retention-state-transitions.test.ts +49 -0
- package/src/router.test.ts +18 -0
- package/src/router.ts +56 -1
- package/src/schema.ts +34 -54
- package/src/service-assignments.test.ts +184 -0
- package/src/service-notification-policy.test.ts +28 -71
- package/src/service.ts +154 -0
- package/src/state-transitions.test.ts +126 -0
- package/src/state-transitions.ts +112 -0
- package/tsconfig.json +12 -3
- package/src/auto-incident-close-job.ts +0 -164
- package/src/auto-incident.test.ts +0 -196
- package/src/auto-incident.ts +0 -332
package/src/queue-executor.ts
CHANGED
|
@@ -8,6 +8,7 @@ import {
|
|
|
8
8
|
type BaseStrategyConfig,
|
|
9
9
|
type ConnectedClient,
|
|
10
10
|
type TransportClient,
|
|
11
|
+
type CollectorRunContext,
|
|
11
12
|
} from "@checkstack/backend-api";
|
|
12
13
|
import { QueueManager } from "@checkstack/queue-api";
|
|
13
14
|
import {
|
|
@@ -35,6 +36,8 @@ import { IncidentApi } from "@checkstack/incident-common";
|
|
|
35
36
|
import { NotificationApi } from "@checkstack/notification-common";
|
|
36
37
|
import { healthcheckSystemSubscription } from "@checkstack/healthcheck-common";
|
|
37
38
|
import { resolveRoute, type InferClient, extractErrorMessage} from "@checkstack/common";
|
|
39
|
+
import { secretEnvMappingSchema } from "@checkstack/secrets-common";
|
|
40
|
+
import type { SecretResolverService } from "@checkstack/secrets-backend";
|
|
38
41
|
import { HealthCheckService } from "./service";
|
|
39
42
|
import { healthCheckHooks } from "./hooks";
|
|
40
43
|
import { incrementHourlyAggregate } from "./realtime-aggregation";
|
|
@@ -43,17 +46,13 @@ import {
|
|
|
43
46
|
classifyTransition,
|
|
44
47
|
shouldNotifyTransition,
|
|
45
48
|
} from "./notification-policy";
|
|
49
|
+
import { recordStateTransition } from "./state-transitions";
|
|
46
50
|
import {
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
openAutoIncident,
|
|
53
|
-
recordUnhealthyTransition,
|
|
54
|
-
shouldOpenForFlapping,
|
|
55
|
-
shouldOpenForSustainedUnhealthy,
|
|
56
|
-
} from "./auto-incident";
|
|
51
|
+
writeHealthEntity,
|
|
52
|
+
createHealthEntitySerializer,
|
|
53
|
+
type HealthEntityState,
|
|
54
|
+
} from "./health-entity";
|
|
55
|
+
import type { EntityHandle } from "@checkstack/automation-backend";
|
|
57
56
|
|
|
58
57
|
type Db = SafeDatabase<typeof schema>;
|
|
59
58
|
type CatalogClient = InferClient<typeof CatalogApi>;
|
|
@@ -61,9 +60,36 @@ type MaintenanceClient = InferClient<typeof MaintenanceApi>;
|
|
|
61
60
|
type IncidentClient = InferClient<typeof IncidentApi>;
|
|
62
61
|
type NotificationClient = InferClient<typeof NotificationApi>;
|
|
63
62
|
|
|
63
|
+
/** Shape of the aggregated state returned by `getSystemHealthStatus`. */
|
|
64
|
+
type AggregatedHealth = Awaited<
|
|
65
|
+
ReturnType<HealthCheckService["getSystemHealthStatus"]>
|
|
66
|
+
>;
|
|
67
|
+
|
|
64
68
|
/**
|
|
65
|
-
*
|
|
66
|
-
*
|
|
69
|
+
* Derive the reactive `health` entity view from the freshly-computed
|
|
70
|
+
* aggregated state. Mirrors `computeHealthEntityState` exactly: `status` is the
|
|
71
|
+
* worst-wins aggregate, `healthyChecks` counts per-check `"healthy"` statuses,
|
|
72
|
+
* and `totalChecks` is the number of enabled checks. Kept here so the
|
|
73
|
+
* `handle.mutate` write returns the SAME view the `read` accessor would have
|
|
74
|
+
* computed for the post-write state (the handle thus never re-reads).
|
|
75
|
+
*/
|
|
76
|
+
function toHealthEntityView(state: AggregatedHealth): HealthEntityState {
|
|
77
|
+
return {
|
|
78
|
+
status: state.status,
|
|
79
|
+
healthyChecks: state.checkStatuses.filter((c) => c.status === "healthy")
|
|
80
|
+
.length,
|
|
81
|
+
totalChecks: state.checkStatuses.length,
|
|
82
|
+
};
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
/**
|
|
86
|
+
* Emit the checkCompleted hook if available, plus the narrower
|
|
87
|
+
* `checkFailed` hook when the result wasn't `healthy` (so operators
|
|
88
|
+
* can wire a typed "trigger on failure" automation without having to
|
|
89
|
+
* filter `checkCompleted` themselves).
|
|
90
|
+
*
|
|
91
|
+
* Extracted to avoid duplicating the hook emission pattern across
|
|
92
|
+
* success/error paths.
|
|
67
93
|
*/
|
|
68
94
|
async function emitCheckCompletedHook({
|
|
69
95
|
getEmitHook,
|
|
@@ -81,14 +107,26 @@ async function emitCheckCompletedHook({
|
|
|
81
107
|
result: Record<string, unknown> | undefined;
|
|
82
108
|
}): Promise<void> {
|
|
83
109
|
const emitHook = getEmitHook();
|
|
84
|
-
if (emitHook)
|
|
85
|
-
|
|
110
|
+
if (!emitHook) return;
|
|
111
|
+
const timestamp = new Date().toISOString();
|
|
112
|
+
await emitHook(healthCheckHooks.checkCompleted, {
|
|
113
|
+
systemId,
|
|
114
|
+
configurationId,
|
|
115
|
+
status,
|
|
116
|
+
latencyMs,
|
|
117
|
+
result,
|
|
118
|
+
timestamp,
|
|
119
|
+
});
|
|
120
|
+
// Narrow follow-up — informational for automation triggers; the
|
|
121
|
+
// threshold-based incident automation runs separately, not in this executor.
|
|
122
|
+
if (status !== "healthy") {
|
|
123
|
+
await emitHook(healthCheckHooks.checkFailed, {
|
|
86
124
|
systemId,
|
|
87
125
|
configurationId,
|
|
88
126
|
status,
|
|
89
127
|
latencyMs,
|
|
90
128
|
result,
|
|
91
|
-
timestamp
|
|
129
|
+
timestamp,
|
|
92
130
|
});
|
|
93
131
|
}
|
|
94
132
|
}
|
|
@@ -102,9 +140,11 @@ export interface HealthCheckJobPayload {
|
|
|
102
140
|
}
|
|
103
141
|
|
|
104
142
|
/**
|
|
105
|
-
* Queue name for health check execution
|
|
143
|
+
* Queue name for health check execution. Exported so consumers like
|
|
144
|
+
* the `healthcheck.run_now` automation action can enqueue a one-off
|
|
145
|
+
* job without re-importing the recurring-job factory.
|
|
106
146
|
*/
|
|
107
|
-
const HEALTH_CHECK_QUEUE = "health-checks";
|
|
147
|
+
export const HEALTH_CHECK_QUEUE = "health-checks";
|
|
108
148
|
|
|
109
149
|
/**
|
|
110
150
|
* Worker group for health check execution (work-queue mode)
|
|
@@ -151,186 +191,12 @@ export async function scheduleHealthCheck(props: {
|
|
|
151
191
|
});
|
|
152
192
|
}
|
|
153
193
|
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
* configured window.
|
|
161
|
-
* - **sustained**: the check is currently unhealthy AND has been so
|
|
162
|
-
* continuously for at least the configured duration.
|
|
163
|
-
*
|
|
164
|
-
* Both triggers honour the require-recovery rule: after the most
|
|
165
|
-
* recent auto-incident close (manual or auto), no new auto-incident
|
|
166
|
-
* opens until the check has logged at least one healthy run. This
|
|
167
|
-
* stops a manual close → still-unhealthy → re-open loop.
|
|
168
|
-
*
|
|
169
|
-
* Active maintenance with suppression skips both triggers when the
|
|
170
|
-
* policy opts in.
|
|
171
|
-
*/
|
|
172
|
-
async function maybeOpenAutoIncidentForCheck(props: {
|
|
173
|
-
db: Db;
|
|
174
|
-
service: HealthCheckService;
|
|
175
|
-
incidentClient: IncidentClient;
|
|
176
|
-
maintenanceClient: MaintenanceClient;
|
|
177
|
-
logger: Logger;
|
|
178
|
-
systemId: string;
|
|
179
|
-
systemName: string;
|
|
180
|
-
configurationId: string;
|
|
181
|
-
configurationName: string;
|
|
182
|
-
previousState: {
|
|
183
|
-
checkStatuses: Array<{
|
|
184
|
-
configurationId: string;
|
|
185
|
-
status: HealthCheckStatus;
|
|
186
|
-
}>;
|
|
187
|
-
};
|
|
188
|
-
newState: {
|
|
189
|
-
checkStatuses: Array<{
|
|
190
|
-
configurationId: string;
|
|
191
|
-
status: HealthCheckStatus;
|
|
192
|
-
}>;
|
|
193
|
-
};
|
|
194
|
-
}): Promise<void> {
|
|
195
|
-
const {
|
|
196
|
-
db,
|
|
197
|
-
service,
|
|
198
|
-
incidentClient,
|
|
199
|
-
maintenanceClient,
|
|
200
|
-
logger,
|
|
201
|
-
systemId,
|
|
202
|
-
systemName,
|
|
203
|
-
configurationId,
|
|
204
|
-
configurationName,
|
|
205
|
-
previousState,
|
|
206
|
-
newState,
|
|
207
|
-
} = props;
|
|
208
|
-
|
|
209
|
-
const next = newState.checkStatuses.find(
|
|
210
|
-
(c) => c.configurationId === configurationId,
|
|
211
|
-
);
|
|
212
|
-
// Only auto-incident logic applies when the check is currently
|
|
213
|
-
// unhealthy — both triggers require it.
|
|
214
|
-
if (!next || next.status !== "unhealthy") return;
|
|
215
|
-
|
|
216
|
-
const prev = previousState.checkStatuses.find(
|
|
217
|
-
(c) => c.configurationId === configurationId,
|
|
218
|
-
);
|
|
219
|
-
const isTransition = isTransitionToUnhealthy(prev?.status, next.status);
|
|
220
|
-
|
|
221
|
-
let policy;
|
|
222
|
-
try {
|
|
223
|
-
policy = await service.getAssignmentNotificationPolicy({
|
|
224
|
-
systemId,
|
|
225
|
-
configurationId,
|
|
226
|
-
});
|
|
227
|
-
} catch (error) {
|
|
228
|
-
logger.warn(
|
|
229
|
-
`Failed to load policy for auto-incident decision (${systemId}/${configurationId}):`,
|
|
230
|
-
error,
|
|
231
|
-
);
|
|
232
|
-
return;
|
|
233
|
-
}
|
|
234
|
-
|
|
235
|
-
if (!policy.autoOpenIncidentOnUnhealthy) return;
|
|
236
|
-
|
|
237
|
-
// Honour active maintenance windows — operators have explicitly
|
|
238
|
-
// said the system is down on purpose.
|
|
239
|
-
if (policy.skipDuringMaintenance) {
|
|
240
|
-
const suppressed = await isMaintenanceSuppressed({
|
|
241
|
-
maintenanceClient,
|
|
242
|
-
systemId,
|
|
243
|
-
logger,
|
|
244
|
-
});
|
|
245
|
-
if (suppressed) {
|
|
246
|
-
logger.debug(
|
|
247
|
-
`Skipping auto-incident for ${systemId}/${configurationId}: active maintenance`,
|
|
248
|
-
);
|
|
249
|
-
return;
|
|
250
|
-
}
|
|
251
|
-
}
|
|
252
|
-
|
|
253
|
-
// Require-recovery: if there's a prior closed auto-incident for
|
|
254
|
-
// this assignment, the check must have logged at least one healthy
|
|
255
|
-
// run since the close before we can open another one. Without this,
|
|
256
|
-
// an operator's manual close on a still-broken system would loop.
|
|
257
|
-
const lastCloseAt = await findLastAutoIncidentClose({
|
|
258
|
-
db,
|
|
259
|
-
systemId,
|
|
260
|
-
configurationId,
|
|
261
|
-
});
|
|
262
|
-
if (lastCloseAt) {
|
|
263
|
-
const recovered = await hasHealthyRunSince({
|
|
264
|
-
db,
|
|
265
|
-
systemId,
|
|
266
|
-
configurationId,
|
|
267
|
-
since: lastCloseAt,
|
|
268
|
-
});
|
|
269
|
-
if (!recovered) {
|
|
270
|
-
return;
|
|
271
|
-
}
|
|
272
|
-
}
|
|
273
|
-
|
|
274
|
-
// Record the transition (if any) and evaluate the flapping trigger
|
|
275
|
-
// against transitions that happened after the last close window.
|
|
276
|
-
let flappingOpens = false;
|
|
277
|
-
if (isTransition) {
|
|
278
|
-
try {
|
|
279
|
-
const count = await recordUnhealthyTransition({
|
|
280
|
-
db,
|
|
281
|
-
configurationId,
|
|
282
|
-
systemId,
|
|
283
|
-
windowMinutes: policy.flappingTrigger.windowMinutes,
|
|
284
|
-
since: lastCloseAt,
|
|
285
|
-
});
|
|
286
|
-
flappingOpens = shouldOpenForFlapping({
|
|
287
|
-
policy,
|
|
288
|
-
recentTransitionCount: count,
|
|
289
|
-
});
|
|
290
|
-
} catch (error) {
|
|
291
|
-
logger.warn(
|
|
292
|
-
`Failed to record unhealthy transition for ${systemId}/${configurationId}:`,
|
|
293
|
-
error,
|
|
294
|
-
);
|
|
295
|
-
}
|
|
296
|
-
}
|
|
297
|
-
|
|
298
|
-
// Evaluate the sustained-duration trigger on every run while the
|
|
299
|
-
// check is unhealthy (not just on transition).
|
|
300
|
-
let sustainedOpens = false;
|
|
301
|
-
if (policy.sustainedUnhealthyTrigger.enabled) {
|
|
302
|
-
const unhealthySince = await findUnhealthySince({
|
|
303
|
-
db,
|
|
304
|
-
configurationId,
|
|
305
|
-
systemId,
|
|
306
|
-
since: lastCloseAt,
|
|
307
|
-
});
|
|
308
|
-
if (unhealthySince) {
|
|
309
|
-
sustainedOpens = shouldOpenForSustainedUnhealthy({
|
|
310
|
-
policy,
|
|
311
|
-
unhealthyForMs: Date.now() - unhealthySince.getTime(),
|
|
312
|
-
});
|
|
313
|
-
}
|
|
314
|
-
}
|
|
315
|
-
|
|
316
|
-
if (!flappingOpens && !sustainedOpens) return;
|
|
317
|
-
|
|
318
|
-
const reason = flappingOpens
|
|
319
|
-
? `flapping: ≥${policy.flappingTrigger.transitions} transitions in ${policy.flappingTrigger.windowMinutes} min`
|
|
320
|
-
: `unhealthy ≥${policy.sustainedUnhealthyTrigger.durationMinutes} min continuously`;
|
|
321
|
-
|
|
322
|
-
await openAutoIncident({
|
|
323
|
-
db,
|
|
324
|
-
incidentClient,
|
|
325
|
-
logger,
|
|
326
|
-
systemId,
|
|
327
|
-
systemName,
|
|
328
|
-
configurationId,
|
|
329
|
-
configurationName,
|
|
330
|
-
policy,
|
|
331
|
-
reason,
|
|
332
|
-
});
|
|
333
|
-
}
|
|
194
|
+
// Flapping detection no longer lives here. It moved into the automation
|
|
195
|
+
// engine as a windowed-count gate on the `healthcheck.system_health_changed`
|
|
196
|
+
// trigger (raw aggregated-health change + `filter` +
|
|
197
|
+
// `window: { count, minutes, refire: "once" }`). The queue executor emits only
|
|
198
|
+
// the raw per-system health change (via the reactive `health` entity deriver,
|
|
199
|
+
// unchanged); the engine does the counting.
|
|
334
200
|
|
|
335
201
|
/**
|
|
336
202
|
* Notify system subscribers about a health state change.
|
|
@@ -519,6 +385,21 @@ async function executeHealthCheckJob(props: {
|
|
|
519
385
|
incidentClient: IncidentClient;
|
|
520
386
|
getEmitHook: () => EmitHookFn | undefined;
|
|
521
387
|
cache: HealthCheckCache;
|
|
388
|
+
/**
|
|
389
|
+
* Resolver for the reactive `health` entity handle (§10.3). Returns the
|
|
390
|
+
* handle once automation-backend has bound the entity store; `undefined`
|
|
391
|
+
* during version skew / tests. Mirrors the `getEmitHook` closure pattern.
|
|
392
|
+
* The entity is PLUGIN-BACKED + COMPUTED — there is no keyed store; the
|
|
393
|
+
* durable run/aggregate write IS the entity write (see `writeHealthEntity`).
|
|
394
|
+
*/
|
|
395
|
+
getHealthEntity?: () => EntityHandle<HealthEntityState> | undefined;
|
|
396
|
+
/**
|
|
397
|
+
* Central secret resolver. When set, a collector declaring a `secretEnv`
|
|
398
|
+
* has it resolved + injected for this centrally-executed run; the
|
|
399
|
+
* collector masks the values out of its output. Optional for version-skew
|
|
400
|
+
* / test isolation.
|
|
401
|
+
*/
|
|
402
|
+
secretResolver?: SecretResolverService;
|
|
522
403
|
}): Promise<void> {
|
|
523
404
|
const {
|
|
524
405
|
payload,
|
|
@@ -532,13 +413,23 @@ async function executeHealthCheckJob(props: {
|
|
|
532
413
|
maintenanceClient,
|
|
533
414
|
incidentClient,
|
|
534
415
|
getEmitHook,
|
|
416
|
+
getHealthEntity,
|
|
535
417
|
cache,
|
|
418
|
+
secretResolver,
|
|
536
419
|
} = props;
|
|
537
420
|
const { configId, systemId } = payload;
|
|
538
421
|
|
|
539
422
|
// Create service for aggregated state evaluation
|
|
540
423
|
const service = new HealthCheckService(db, registry, collectorRegistry);
|
|
541
424
|
|
|
425
|
+
// Per-system serializer for the reactive health mutate (§10.3): a
|
|
426
|
+
// transaction-scoped advisory lock keyed `health:<systemId>` wraps the
|
|
427
|
+
// snapshot-prev + apply + diff + emit so concurrent evaluations of one
|
|
428
|
+
// system (multiple per-config jobs across pods, or at-least-once
|
|
429
|
+
// redelivery) can't double-emit a single logical transition. Bound to this
|
|
430
|
+
// job's systemId here and passed as `serialize` to every `writeHealthEntity` call.
|
|
431
|
+
const serializeHealthWrite = createHealthEntitySerializer({ db })(systemId);
|
|
432
|
+
|
|
542
433
|
// Capture aggregated state BEFORE this run for comparison
|
|
543
434
|
const previousState = await service.getSystemHealthStatus(systemId);
|
|
544
435
|
const previousStatus = previousState.status;
|
|
@@ -612,6 +503,17 @@ async function executeHealthCheckJob(props: {
|
|
|
612
503
|
logger.debug(`Could not fetch system name for ${systemId}, using ID`);
|
|
613
504
|
}
|
|
614
505
|
|
|
506
|
+
// Curated, read-only run-context metadata exposed to collectors.
|
|
507
|
+
// Metadata only - never secrets or config.
|
|
508
|
+
const runContext: CollectorRunContext = {
|
|
509
|
+
check: {
|
|
510
|
+
id: configId,
|
|
511
|
+
name: configRow.configName || configId,
|
|
512
|
+
intervalSeconds: configRow.interval,
|
|
513
|
+
},
|
|
514
|
+
system: { id: systemId, name: systemName },
|
|
515
|
+
};
|
|
516
|
+
|
|
615
517
|
const strategy = registry.getStrategy(configRow.strategyId);
|
|
616
518
|
if (!strategy) {
|
|
617
519
|
logger.warn(
|
|
@@ -658,10 +560,31 @@ async function executeHealthCheckJob(props: {
|
|
|
658
560
|
const storageKey = collectorEntry.id;
|
|
659
561
|
|
|
660
562
|
try {
|
|
563
|
+
// Resolve the collector's declared secretEnv for THIS run
|
|
564
|
+
// (central execution). The collector injects it and masks the
|
|
565
|
+
// values out of its output. A missing required secret throws
|
|
566
|
+
// and fails the collector clearly.
|
|
567
|
+
let secretEnv: Record<string, string> | undefined;
|
|
568
|
+
const declared = secretEnvMappingSchema.safeParse(
|
|
569
|
+
(collectorEntry.config as { secretEnv?: unknown }).secretEnv,
|
|
570
|
+
);
|
|
571
|
+
if (
|
|
572
|
+
secretResolver &&
|
|
573
|
+
declared.success &&
|
|
574
|
+
Object.keys(declared.data).length > 0
|
|
575
|
+
) {
|
|
576
|
+
const resolved = await secretResolver.resolveForRun({
|
|
577
|
+
secretEnv: declared.data,
|
|
578
|
+
});
|
|
579
|
+
secretEnv = resolved.env;
|
|
580
|
+
}
|
|
581
|
+
|
|
661
582
|
const collectorResult = await registered.collector.execute({
|
|
662
583
|
config: collectorEntry.config,
|
|
663
584
|
client: connectedClient!.client,
|
|
664
585
|
pluginId: configRow.strategyId,
|
|
586
|
+
runContext,
|
|
587
|
+
...(secretEnv ? { secretEnv } : {}),
|
|
665
588
|
});
|
|
666
589
|
|
|
667
590
|
// Check for collector-level error
|
|
@@ -792,26 +715,44 @@ async function executeHealthCheckJob(props: {
|
|
|
792
715
|
},
|
|
793
716
|
};
|
|
794
717
|
|
|
795
|
-
|
|
796
|
-
|
|
718
|
+
// Persist the run + aggregate THROUGH the reactive `health` entity:
|
|
719
|
+
// `apply` does the durable write and returns the freshly-computed view.
|
|
720
|
+
// The framework snapshots `prev` via `read` BEFORE this insert, so a real
|
|
721
|
+
// status change emits exactly one correct `ENTITY_CHANGED` (§10.3). The
|
|
722
|
+
// computed aggregated state is stashed for the transition/notify path.
|
|
723
|
+
let newState!: AggregatedHealth;
|
|
724
|
+
await writeHealthEntity({
|
|
725
|
+
handle: getHealthEntity?.(),
|
|
797
726
|
systemId,
|
|
798
|
-
|
|
799
|
-
|
|
800
|
-
|
|
801
|
-
|
|
802
|
-
|
|
803
|
-
|
|
727
|
+
apply: async () => {
|
|
728
|
+
await db.insert(healthCheckRuns).values({
|
|
729
|
+
configurationId: configId,
|
|
730
|
+
systemId,
|
|
731
|
+
status: result.status,
|
|
732
|
+
latencyMs: result.latencyMs,
|
|
733
|
+
result: { ...result } as Record<string, unknown>,
|
|
734
|
+
sourceId: undefined,
|
|
735
|
+
sourceLabel: "Local",
|
|
736
|
+
});
|
|
804
737
|
|
|
805
|
-
|
|
806
|
-
|
|
807
|
-
|
|
808
|
-
|
|
809
|
-
|
|
810
|
-
|
|
811
|
-
|
|
812
|
-
|
|
813
|
-
|
|
814
|
-
|
|
738
|
+
await incrementHourlyAggregate({
|
|
739
|
+
db,
|
|
740
|
+
systemId,
|
|
741
|
+
configurationId: configId,
|
|
742
|
+
status: result.status,
|
|
743
|
+
latencyMs: result.latencyMs,
|
|
744
|
+
runTimestamp: new Date(),
|
|
745
|
+
result: { ...result } as Record<string, unknown>,
|
|
746
|
+
collectorRegistry,
|
|
747
|
+
sourceLabel: "Local",
|
|
748
|
+
});
|
|
749
|
+
|
|
750
|
+
newState = await service.getSystemHealthStatus(systemId);
|
|
751
|
+
return toHealthEntityView(newState);
|
|
752
|
+
},
|
|
753
|
+
serialize: serializeHealthWrite,
|
|
754
|
+
onError: (error) =>
|
|
755
|
+
logger.warn(`Failed to mirror health entity for ${systemId}`, error),
|
|
815
756
|
});
|
|
816
757
|
|
|
817
758
|
logger.debug(
|
|
@@ -831,8 +772,17 @@ async function executeHealthCheckJob(props: {
|
|
|
831
772
|
latencyMs: result.latencyMs,
|
|
832
773
|
});
|
|
833
774
|
|
|
834
|
-
const newState = await service.getSystemHealthStatus(systemId);
|
|
835
775
|
if (newState.status !== previousStatus) {
|
|
776
|
+
// Record the aggregate transition so the sensing layer has a
|
|
777
|
+
// reliable "in status since" for every status (Wave 2).
|
|
778
|
+
await recordStateTransition({
|
|
779
|
+
db,
|
|
780
|
+
systemId,
|
|
781
|
+
configurationId: configId,
|
|
782
|
+
fromStatus: previousStatus,
|
|
783
|
+
toStatus: newState.status,
|
|
784
|
+
});
|
|
785
|
+
|
|
836
786
|
await notifyStateChange({
|
|
837
787
|
notificationClient,
|
|
838
788
|
systemId,
|
|
@@ -848,23 +798,6 @@ async function executeHealthCheckJob(props: {
|
|
|
848
798
|
});
|
|
849
799
|
}
|
|
850
800
|
|
|
851
|
-
// Per-check auto-incident: runs whether or not the aggregate
|
|
852
|
-
// changed (a check can transition to unhealthy without flipping
|
|
853
|
-
// the aggregate if another check is already unhealthy).
|
|
854
|
-
await maybeOpenAutoIncidentForCheck({
|
|
855
|
-
db,
|
|
856
|
-
service,
|
|
857
|
-
incidentClient,
|
|
858
|
-
maintenanceClient,
|
|
859
|
-
logger,
|
|
860
|
-
systemId,
|
|
861
|
-
systemName,
|
|
862
|
-
configurationId: configId,
|
|
863
|
-
configurationName: configRow.configName,
|
|
864
|
-
previousState,
|
|
865
|
-
newState,
|
|
866
|
-
});
|
|
867
|
-
|
|
868
801
|
return;
|
|
869
802
|
} finally {
|
|
870
803
|
if (connectedClient) {
|
|
@@ -893,28 +826,48 @@ async function executeHealthCheckJob(props: {
|
|
|
893
826
|
},
|
|
894
827
|
};
|
|
895
828
|
|
|
896
|
-
//
|
|
897
|
-
|
|
898
|
-
|
|
829
|
+
// Persist the run + aggregate THROUGH the reactive `health` entity on
|
|
830
|
+
// every run (§10.3): `apply` does the durable write (insert + hourly
|
|
831
|
+
// aggregate) and returns the freshly-computed view. The framework
|
|
832
|
+
// snapshots `prev` via the COMPUTE-ON-READ accessor BEFORE this insert, so
|
|
833
|
+
// an unchanged aggregate is a no-op and a real status change drives the
|
|
834
|
+
// directional/umbrella trigger events via `deriveHealthTriggerEvents` —
|
|
835
|
+
// exactly one correct `ENTITY_CHANGED` with accurate prev → next.
|
|
836
|
+
let newState!: AggregatedHealth;
|
|
837
|
+
await writeHealthEntity({
|
|
838
|
+
handle: getHealthEntity?.(),
|
|
899
839
|
systemId,
|
|
900
|
-
|
|
901
|
-
|
|
902
|
-
|
|
903
|
-
|
|
904
|
-
|
|
905
|
-
|
|
840
|
+
apply: async () => {
|
|
841
|
+
// Store result (spread to convert structured type to plain record for jsonb)
|
|
842
|
+
await db.insert(healthCheckRuns).values({
|
|
843
|
+
configurationId: configId,
|
|
844
|
+
systemId,
|
|
845
|
+
status: result.status,
|
|
846
|
+
latencyMs: result.latencyMs,
|
|
847
|
+
result: { ...result } as Record<string, unknown>,
|
|
848
|
+
sourceId: undefined,
|
|
849
|
+
sourceLabel: "Local",
|
|
850
|
+
});
|
|
906
851
|
|
|
907
|
-
|
|
908
|
-
|
|
909
|
-
|
|
910
|
-
|
|
911
|
-
|
|
912
|
-
|
|
913
|
-
|
|
914
|
-
|
|
915
|
-
|
|
916
|
-
|
|
917
|
-
|
|
852
|
+
// Trigger incremental hourly aggregation
|
|
853
|
+
await incrementHourlyAggregate({
|
|
854
|
+
db,
|
|
855
|
+
systemId,
|
|
856
|
+
configurationId: configId,
|
|
857
|
+
status: result.status,
|
|
858
|
+
latencyMs: result.latencyMs,
|
|
859
|
+
runTimestamp: new Date(),
|
|
860
|
+
result: { ...result } as Record<string, unknown>,
|
|
861
|
+
collectorRegistry,
|
|
862
|
+
sourceLabel: "Local",
|
|
863
|
+
});
|
|
864
|
+
|
|
865
|
+
newState = await service.getSystemHealthStatus(systemId);
|
|
866
|
+
return toHealthEntityView(newState);
|
|
867
|
+
},
|
|
868
|
+
serialize: serializeHealthWrite,
|
|
869
|
+
onError: (error) =>
|
|
870
|
+
logger.warn(`Failed to mirror health entity for ${systemId}`, error),
|
|
918
871
|
});
|
|
919
872
|
|
|
920
873
|
logger.debug(
|
|
@@ -944,9 +897,17 @@ async function executeHealthCheckJob(props: {
|
|
|
944
897
|
result: (result.metadata?.collectors as Record<string, unknown>) ?? undefined,
|
|
945
898
|
});
|
|
946
899
|
|
|
947
|
-
// Check if aggregated state changed and notify subscribers
|
|
948
|
-
const newState = await service.getSystemHealthStatus(systemId);
|
|
949
900
|
if (newState.status !== previousStatus) {
|
|
901
|
+
// Record the aggregate transition so the sensing layer has a
|
|
902
|
+
// reliable "in status since" for every status (Wave 2).
|
|
903
|
+
await recordStateTransition({
|
|
904
|
+
db,
|
|
905
|
+
systemId,
|
|
906
|
+
configurationId: configId,
|
|
907
|
+
fromStatus: previousStatus,
|
|
908
|
+
toStatus: newState.status,
|
|
909
|
+
});
|
|
910
|
+
|
|
950
911
|
await notifyStateChange({
|
|
951
912
|
notificationClient,
|
|
952
913
|
systemId,
|
|
@@ -968,60 +929,13 @@ async function executeHealthCheckJob(props: {
|
|
|
968
929
|
newStatus: newState.status,
|
|
969
930
|
});
|
|
970
931
|
|
|
971
|
-
//
|
|
972
|
-
|
|
973
|
-
|
|
974
|
-
|
|
975
|
-
|
|
976
|
-
await emitHook(healthCheckHooks.systemHealthy, {
|
|
977
|
-
systemId,
|
|
978
|
-
previousStatus,
|
|
979
|
-
healthyChecks: newState.checkStatuses.filter(
|
|
980
|
-
(c) => c.status === "healthy",
|
|
981
|
-
).length,
|
|
982
|
-
totalChecks: newState.checkStatuses.length,
|
|
983
|
-
timestamp: new Date().toISOString(),
|
|
984
|
-
});
|
|
985
|
-
logger.debug(
|
|
986
|
-
`Emitted systemHealthy hook: ${previousStatus} → ${newState.status}`,
|
|
987
|
-
);
|
|
988
|
-
} else if (
|
|
989
|
-
previousStatus === "healthy" &&
|
|
990
|
-
newState.status !== "healthy"
|
|
991
|
-
) {
|
|
992
|
-
// Degradation: system went from healthy to unhealthy/degraded
|
|
993
|
-
await emitHook(healthCheckHooks.systemDegraded, {
|
|
994
|
-
systemId,
|
|
995
|
-
previousStatus,
|
|
996
|
-
newStatus: newState.status,
|
|
997
|
-
healthyChecks: newState.checkStatuses.filter(
|
|
998
|
-
(c) => c.status === "healthy",
|
|
999
|
-
).length,
|
|
1000
|
-
totalChecks: newState.checkStatuses.length,
|
|
1001
|
-
timestamp: new Date().toISOString(),
|
|
1002
|
-
});
|
|
1003
|
-
logger.debug(
|
|
1004
|
-
`Emitted systemDegraded hook: ${previousStatus} → ${newState.status}`,
|
|
1005
|
-
);
|
|
1006
|
-
}
|
|
1007
|
-
}
|
|
932
|
+
// The directional + umbrella system-health hooks were removed in
|
|
933
|
+
// Phase 4 (§10.3): the `health` entity mirror above is the single
|
|
934
|
+
// source of truth, and its change deriver fires the
|
|
935
|
+
// `healthcheck.system_degraded` / `_healthy` / `_health_changed`
|
|
936
|
+
// trigger events through Stage-1 routing. Nothing to emit here.
|
|
1008
937
|
}
|
|
1009
938
|
|
|
1010
|
-
// Per-check auto-incident: see comment on the failed-execution path.
|
|
1011
|
-
await maybeOpenAutoIncidentForCheck({
|
|
1012
|
-
db,
|
|
1013
|
-
service,
|
|
1014
|
-
incidentClient,
|
|
1015
|
-
maintenanceClient,
|
|
1016
|
-
logger,
|
|
1017
|
-
systemId,
|
|
1018
|
-
systemName,
|
|
1019
|
-
configurationId: configId,
|
|
1020
|
-
configurationName: configRow.configName,
|
|
1021
|
-
previousState,
|
|
1022
|
-
newState,
|
|
1023
|
-
});
|
|
1024
|
-
|
|
1025
939
|
// Note: No manual rescheduling needed - recurring job handles it automatically
|
|
1026
940
|
} catch (error) {
|
|
1027
941
|
logger.error(
|
|
@@ -1029,27 +943,48 @@ async function executeHealthCheckJob(props: {
|
|
|
1029
943
|
error,
|
|
1030
944
|
);
|
|
1031
945
|
|
|
1032
|
-
//
|
|
1033
|
-
|
|
1034
|
-
|
|
946
|
+
// Persist the failure run + aggregate THROUGH the reactive `health`
|
|
947
|
+
// entity: `apply` does the durable write and returns the freshly-computed
|
|
948
|
+
// view. The framework snapshots `prev` via the compute-on-read accessor
|
|
949
|
+
// BEFORE this insert, so a real status change emits exactly one correct
|
|
950
|
+
// `ENTITY_CHANGED` (§10.3). See the success path for the full rationale.
|
|
951
|
+
let newState!: AggregatedHealth;
|
|
952
|
+
await writeHealthEntity({
|
|
953
|
+
handle: getHealthEntity?.(),
|
|
1035
954
|
systemId,
|
|
1036
|
-
|
|
1037
|
-
|
|
1038
|
-
|
|
1039
|
-
|
|
1040
|
-
|
|
955
|
+
apply: async () => {
|
|
956
|
+
// Store failure (no latencyMs for failures)
|
|
957
|
+
await db.insert(healthCheckRuns).values({
|
|
958
|
+
configurationId: configId,
|
|
959
|
+
systemId,
|
|
960
|
+
status: "unhealthy",
|
|
961
|
+
result: { error: String(error) } as Record<string, unknown>,
|
|
962
|
+
sourceId: undefined,
|
|
963
|
+
sourceLabel: "Local",
|
|
964
|
+
});
|
|
1041
965
|
|
|
1042
|
-
|
|
1043
|
-
|
|
1044
|
-
|
|
1045
|
-
|
|
1046
|
-
|
|
1047
|
-
|
|
1048
|
-
|
|
1049
|
-
|
|
1050
|
-
|
|
1051
|
-
|
|
1052
|
-
|
|
966
|
+
// Trigger incremental hourly aggregation
|
|
967
|
+
await incrementHourlyAggregate({
|
|
968
|
+
db,
|
|
969
|
+
systemId,
|
|
970
|
+
configurationId: configId,
|
|
971
|
+
status: "unhealthy",
|
|
972
|
+
latencyMs: undefined,
|
|
973
|
+
runTimestamp: new Date(),
|
|
974
|
+
// No collector data for error cases
|
|
975
|
+
collectorRegistry,
|
|
976
|
+
sourceLabel: "Local",
|
|
977
|
+
});
|
|
978
|
+
|
|
979
|
+
newState = await service.getSystemHealthStatus(systemId);
|
|
980
|
+
return toHealthEntityView(newState);
|
|
981
|
+
},
|
|
982
|
+
serialize: serializeHealthWrite,
|
|
983
|
+
onError: (mirrorError) =>
|
|
984
|
+
logger.warn(
|
|
985
|
+
`Failed to mirror health entity for ${systemId}`,
|
|
986
|
+
mirrorError,
|
|
987
|
+
),
|
|
1053
988
|
});
|
|
1054
989
|
|
|
1055
990
|
// Try to fetch names for the enriched signal (best-effort)
|
|
@@ -1093,9 +1028,17 @@ async function executeHealthCheckJob(props: {
|
|
|
1093
1028
|
result: undefined,
|
|
1094
1029
|
});
|
|
1095
1030
|
|
|
1096
|
-
// Check if aggregated state changed and notify subscribers
|
|
1097
|
-
const newState = await service.getSystemHealthStatus(systemId);
|
|
1098
1031
|
if (newState.status !== previousStatus) {
|
|
1032
|
+
// Record the aggregate transition so the sensing layer has a
|
|
1033
|
+
// reliable "in status since" for every status (Wave 2).
|
|
1034
|
+
await recordStateTransition({
|
|
1035
|
+
db,
|
|
1036
|
+
systemId,
|
|
1037
|
+
configurationId: configId,
|
|
1038
|
+
fromStatus: previousStatus,
|
|
1039
|
+
toStatus: newState.status,
|
|
1040
|
+
});
|
|
1041
|
+
|
|
1099
1042
|
await notifyStateChange({
|
|
1100
1043
|
notificationClient,
|
|
1101
1044
|
systemId,
|
|
@@ -1117,60 +1060,13 @@ async function executeHealthCheckJob(props: {
|
|
|
1117
1060
|
newStatus: newState.status,
|
|
1118
1061
|
});
|
|
1119
1062
|
|
|
1120
|
-
//
|
|
1121
|
-
|
|
1122
|
-
|
|
1123
|
-
|
|
1124
|
-
|
|
1125
|
-
await emitHook(healthCheckHooks.systemHealthy, {
|
|
1126
|
-
systemId,
|
|
1127
|
-
previousStatus,
|
|
1128
|
-
healthyChecks: newState.checkStatuses.filter(
|
|
1129
|
-
(c) => c.status === "healthy",
|
|
1130
|
-
).length,
|
|
1131
|
-
totalChecks: newState.checkStatuses.length,
|
|
1132
|
-
timestamp: new Date().toISOString(),
|
|
1133
|
-
});
|
|
1134
|
-
logger.debug(
|
|
1135
|
-
`Emitted systemHealthy hook: ${previousStatus} → ${newState.status}`,
|
|
1136
|
-
);
|
|
1137
|
-
} else if (
|
|
1138
|
-
previousStatus === "healthy" &&
|
|
1139
|
-
newState.status !== "healthy"
|
|
1140
|
-
) {
|
|
1141
|
-
// Degradation: system went from healthy to unhealthy/degraded
|
|
1142
|
-
await emitHook(healthCheckHooks.systemDegraded, {
|
|
1143
|
-
systemId,
|
|
1144
|
-
previousStatus,
|
|
1145
|
-
newStatus: newState.status,
|
|
1146
|
-
healthyChecks: newState.checkStatuses.filter(
|
|
1147
|
-
(c) => c.status === "healthy",
|
|
1148
|
-
).length,
|
|
1149
|
-
totalChecks: newState.checkStatuses.length,
|
|
1150
|
-
timestamp: new Date().toISOString(),
|
|
1151
|
-
});
|
|
1152
|
-
logger.debug(
|
|
1153
|
-
`Emitted systemDegraded hook: ${previousStatus} → ${newState.status}`,
|
|
1154
|
-
);
|
|
1155
|
-
}
|
|
1156
|
-
}
|
|
1063
|
+
// The directional + umbrella system-health hooks were removed in
|
|
1064
|
+
// Phase 4 (§10.3): the `health` entity mirror above is the single
|
|
1065
|
+
// source of truth, and its change deriver fires the
|
|
1066
|
+
// `healthcheck.system_degraded` / `_healthy` / `_health_changed`
|
|
1067
|
+
// trigger events through Stage-1 routing. Nothing to emit here.
|
|
1157
1068
|
}
|
|
1158
1069
|
|
|
1159
|
-
// Per-check auto-incident: see comment on the failed-execution path.
|
|
1160
|
-
await maybeOpenAutoIncidentForCheck({
|
|
1161
|
-
db,
|
|
1162
|
-
service,
|
|
1163
|
-
incidentClient,
|
|
1164
|
-
maintenanceClient,
|
|
1165
|
-
logger,
|
|
1166
|
-
systemId,
|
|
1167
|
-
systemName,
|
|
1168
|
-
configurationId: configId,
|
|
1169
|
-
configurationName: configName,
|
|
1170
|
-
previousState,
|
|
1171
|
-
newState,
|
|
1172
|
-
});
|
|
1173
|
-
|
|
1174
1070
|
// Note: No manual rescheduling needed - recurring job handles it automatically
|
|
1175
1071
|
}
|
|
1176
1072
|
}
|
|
@@ -1187,7 +1083,9 @@ export async function setupHealthCheckWorker(props: {
|
|
|
1187
1083
|
maintenanceClient: MaintenanceClient;
|
|
1188
1084
|
incidentClient: IncidentClient;
|
|
1189
1085
|
getEmitHook: () => EmitHookFn | undefined;
|
|
1086
|
+
getHealthEntity?: () => EntityHandle<HealthEntityState> | undefined;
|
|
1190
1087
|
cache: HealthCheckCache;
|
|
1088
|
+
secretResolver?: SecretResolverService;
|
|
1191
1089
|
}): Promise<void> {
|
|
1192
1090
|
const {
|
|
1193
1091
|
db,
|
|
@@ -1201,7 +1099,9 @@ export async function setupHealthCheckWorker(props: {
|
|
|
1201
1099
|
maintenanceClient,
|
|
1202
1100
|
incidentClient,
|
|
1203
1101
|
getEmitHook,
|
|
1102
|
+
getHealthEntity,
|
|
1204
1103
|
cache,
|
|
1104
|
+
secretResolver,
|
|
1205
1105
|
} = props;
|
|
1206
1106
|
|
|
1207
1107
|
const queue =
|
|
@@ -1222,7 +1122,9 @@ export async function setupHealthCheckWorker(props: {
|
|
|
1222
1122
|
maintenanceClient,
|
|
1223
1123
|
incidentClient,
|
|
1224
1124
|
getEmitHook,
|
|
1125
|
+
getHealthEntity,
|
|
1225
1126
|
cache,
|
|
1127
|
+
secretResolver,
|
|
1226
1128
|
});
|
|
1227
1129
|
},
|
|
1228
1130
|
{
|