@checkstack/healthcheck-backend 1.3.0 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +409 -0
- package/drizzle/0015_quiet_meggan.sql +12 -0
- package/drizzle/0016_complex_maginty.sql +1 -0
- package/drizzle/0017_pretty_caretaker.sql +1 -0
- package/drizzle/meta/0015_snapshot.json +764 -0
- package/drizzle/meta/0016_snapshot.json +644 -0
- package/drizzle/meta/0017_snapshot.json +563 -0
- package/drizzle/meta/_journal.json +21 -0
- package/package.json +24 -21
- package/src/automations.test.ts +6 -27
- package/src/automations.ts +32 -30
- package/src/collector-script-test.test.ts +236 -0
- package/src/collector-script-test.ts +221 -0
- package/src/health-entity.test.ts +694 -0
- package/src/health-entity.ts +367 -0
- package/src/health-state.test.ts +115 -0
- package/src/health-state.ts +333 -0
- package/src/healthcheck-gitops-kinds.test.ts +6 -32
- package/src/healthcheck-gitops-kinds.ts +4 -19
- package/src/hooks.test.ts +19 -6
- package/src/hooks.ts +13 -68
- package/src/index.ts +118 -48
- package/src/queue-executor.test.ts +13 -0
- package/src/queue-executor.ts +251 -444
- package/src/retention-job.ts +65 -1
- package/src/retention-state-transitions.test.ts +49 -0
- package/src/router.test.ts +13 -0
- package/src/router.ts +44 -0
- package/src/schema.ts +34 -54
- package/src/service-notification-policy.test.ts +28 -71
- package/src/service.ts +89 -0
- package/src/state-evaluator.test.ts +50 -5
- package/src/state-evaluator.ts +9 -2
- package/src/state-transitions.test.ts +126 -0
- package/src/state-transitions.ts +112 -0
- package/tsconfig.json +9 -0
- package/src/auto-incident-close-job.ts +0 -164
- package/src/auto-incident.test.ts +0 -196
- package/src/auto-incident.ts +0 -332
package/src/queue-executor.ts
CHANGED
|
@@ -9,6 +9,7 @@ import {
|
|
|
9
9
|
type ConnectedClient,
|
|
10
10
|
type TransportClient,
|
|
11
11
|
type CollectorRunContext,
|
|
12
|
+
type AdvisoryLockService,
|
|
12
13
|
} from "@checkstack/backend-api";
|
|
13
14
|
import { QueueManager } from "@checkstack/queue-api";
|
|
14
15
|
import {
|
|
@@ -36,6 +37,8 @@ import { IncidentApi } from "@checkstack/incident-common";
|
|
|
36
37
|
import { NotificationApi } from "@checkstack/notification-common";
|
|
37
38
|
import { healthcheckSystemSubscription } from "@checkstack/healthcheck-common";
|
|
38
39
|
import { resolveRoute, type InferClient, extractErrorMessage} from "@checkstack/common";
|
|
40
|
+
import { secretEnvMappingSchema } from "@checkstack/secrets-common";
|
|
41
|
+
import type { SecretResolverService } from "@checkstack/secrets-backend";
|
|
39
42
|
import { HealthCheckService } from "./service";
|
|
40
43
|
import { healthCheckHooks } from "./hooks";
|
|
41
44
|
import { incrementHourlyAggregate } from "./realtime-aggregation";
|
|
@@ -44,17 +47,13 @@ import {
|
|
|
44
47
|
classifyTransition,
|
|
45
48
|
shouldNotifyTransition,
|
|
46
49
|
} from "./notification-policy";
|
|
50
|
+
import { recordStateTransition } from "./state-transitions";
|
|
47
51
|
import {
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
openAutoIncident,
|
|
54
|
-
recordUnhealthyTransition,
|
|
55
|
-
shouldOpenForFlapping,
|
|
56
|
-
shouldOpenForSustainedUnhealthy,
|
|
57
|
-
} from "./auto-incident";
|
|
52
|
+
writeHealthEntity,
|
|
53
|
+
createHealthEntitySerializer,
|
|
54
|
+
type HealthEntityState,
|
|
55
|
+
} from "./health-entity";
|
|
56
|
+
import type { EntityHandle } from "@checkstack/automation-backend";
|
|
58
57
|
|
|
59
58
|
type Db = SafeDatabase<typeof schema>;
|
|
60
59
|
type CatalogClient = InferClient<typeof CatalogApi>;
|
|
@@ -62,6 +61,28 @@ type MaintenanceClient = InferClient<typeof MaintenanceApi>;
|
|
|
62
61
|
type IncidentClient = InferClient<typeof IncidentApi>;
|
|
63
62
|
type NotificationClient = InferClient<typeof NotificationApi>;
|
|
64
63
|
|
|
64
|
+
/** Shape of the aggregated state returned by `getSystemHealthStatus`. */
|
|
65
|
+
type AggregatedHealth = Awaited<
|
|
66
|
+
ReturnType<HealthCheckService["getSystemHealthStatus"]>
|
|
67
|
+
>;
|
|
68
|
+
|
|
69
|
+
/**
 * Project a freshly computed aggregated health state onto the reactive
 * `health` entity view.
 *
 * The view carries three fields:
 * - `status`: the aggregate status, copied verbatim from `state.status`
 *   (described by the original note as the worst-wins aggregate).
 * - `healthyChecks`: how many entries of `state.checkStatuses` have the
 *   per-check status `"healthy"`.
 * - `totalChecks`: the number of entries in `state.checkStatuses`
 *   (presumably the enabled checks only — confirm against
 *   `getSystemHealthStatus`).
 *
 * This is intended to mirror `computeHealthEntityState` exactly, so that a
 * `handle.mutate` write returns the same view the `read` accessor would have
 * computed for the post-write state and the handle never needs to re-read.
 * NOTE(review): `computeHealthEntityState` is not visible from here — keep the
 * two in sync when either changes.
 *
 * @param state - Aggregated state as returned by `getSystemHealthStatus`.
 * @returns The `health` entity view derived from `state`.
 */
function toHealthEntityView(state: AggregatedHealth): HealthEntityState {
  const { status, checkStatuses } = state;

  // Single pass over the per-check statuses; only exact "healthy" counts.
  let healthyChecks = 0;
  for (const check of checkStatuses) {
    if (check.status === "healthy") {
      healthyChecks += 1;
    }
  }

  return {
    status,
    healthyChecks,
    totalChecks: checkStatuses.length,
  };
}
|
|
85
|
+
|
|
65
86
|
/**
|
|
66
87
|
* Emit the checkCompleted hook if available, plus the narrower
|
|
67
88
|
* `checkFailed` hook when the result wasn't `healthy` (so operators
|
|
@@ -171,222 +192,12 @@ export async function scheduleHealthCheck(props: {
|
|
|
171
192
|
});
|
|
172
193
|
}
|
|
173
194
|
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
* configured window.
|
|
181
|
-
* - **sustained**: the check is currently unhealthy AND has been so
|
|
182
|
-
* continuously for at least the configured duration.
|
|
183
|
-
*
|
|
184
|
-
* Both triggers honour the require-recovery rule: after the most
|
|
185
|
-
* recent auto-incident close (manual or auto), no new auto-incident
|
|
186
|
-
* opens until the check has logged at least one healthy run. This
|
|
187
|
-
* stops a manual close → still-unhealthy → re-open loop.
|
|
188
|
-
*
|
|
189
|
-
* Active maintenance with suppression skips both triggers when the
|
|
190
|
-
* policy opts in.
|
|
191
|
-
*/
|
|
192
|
-
async function maybeOpenAutoIncidentForCheck(props: {
|
|
193
|
-
db: Db;
|
|
194
|
-
service: HealthCheckService;
|
|
195
|
-
incidentClient: IncidentClient;
|
|
196
|
-
maintenanceClient: MaintenanceClient;
|
|
197
|
-
logger: Logger;
|
|
198
|
-
systemId: string;
|
|
199
|
-
systemName: string;
|
|
200
|
-
configurationId: string;
|
|
201
|
-
configurationName: string;
|
|
202
|
-
/**
|
|
203
|
-
* Same closure-based getter the queue executor uses elsewhere; let
|
|
204
|
-
* us fire the `flapping_detected` automation hook from inside the
|
|
205
|
-
* flapping evaluator without re-threading `emitHook` through every
|
|
206
|
-
* intermediate caller. Optional — when absent, the hook simply
|
|
207
|
-
* doesn't fire (e.g. in unit tests that don't care about it).
|
|
208
|
-
*/
|
|
209
|
-
getEmitHook?: () => EmitHookFn | undefined;
|
|
210
|
-
previousState: {
|
|
211
|
-
checkStatuses: Array<{
|
|
212
|
-
configurationId: string;
|
|
213
|
-
status: HealthCheckStatus;
|
|
214
|
-
}>;
|
|
215
|
-
};
|
|
216
|
-
newState: {
|
|
217
|
-
checkStatuses: Array<{
|
|
218
|
-
configurationId: string;
|
|
219
|
-
status: HealthCheckStatus;
|
|
220
|
-
}>;
|
|
221
|
-
};
|
|
222
|
-
}): Promise<void> {
|
|
223
|
-
const {
|
|
224
|
-
db,
|
|
225
|
-
service,
|
|
226
|
-
incidentClient,
|
|
227
|
-
maintenanceClient,
|
|
228
|
-
logger,
|
|
229
|
-
systemId,
|
|
230
|
-
systemName,
|
|
231
|
-
configurationId,
|
|
232
|
-
configurationName,
|
|
233
|
-
getEmitHook,
|
|
234
|
-
previousState,
|
|
235
|
-
newState,
|
|
236
|
-
} = props;
|
|
237
|
-
|
|
238
|
-
const next = newState.checkStatuses.find(
|
|
239
|
-
(c) => c.configurationId === configurationId,
|
|
240
|
-
);
|
|
241
|
-
// Only auto-incident logic applies when the check is currently
|
|
242
|
-
// unhealthy — both triggers require it.
|
|
243
|
-
if (!next || next.status !== "unhealthy") return;
|
|
244
|
-
|
|
245
|
-
const prev = previousState.checkStatuses.find(
|
|
246
|
-
(c) => c.configurationId === configurationId,
|
|
247
|
-
);
|
|
248
|
-
const isTransition = isTransitionToUnhealthy(prev?.status, next.status);
|
|
249
|
-
|
|
250
|
-
let policy;
|
|
251
|
-
try {
|
|
252
|
-
policy = await service.getAssignmentNotificationPolicy({
|
|
253
|
-
systemId,
|
|
254
|
-
configurationId,
|
|
255
|
-
});
|
|
256
|
-
} catch (error) {
|
|
257
|
-
logger.warn(
|
|
258
|
-
`Failed to load policy for auto-incident decision (${systemId}/${configurationId}):`,
|
|
259
|
-
error,
|
|
260
|
-
);
|
|
261
|
-
return;
|
|
262
|
-
}
|
|
263
|
-
|
|
264
|
-
if (!policy.autoOpenIncidentOnUnhealthy) return;
|
|
265
|
-
|
|
266
|
-
// Honour active maintenance windows — operators have explicitly
|
|
267
|
-
// said the system is down on purpose.
|
|
268
|
-
if (policy.skipDuringMaintenance) {
|
|
269
|
-
const suppressed = await isMaintenanceSuppressed({
|
|
270
|
-
maintenanceClient,
|
|
271
|
-
systemId,
|
|
272
|
-
logger,
|
|
273
|
-
});
|
|
274
|
-
if (suppressed) {
|
|
275
|
-
logger.debug(
|
|
276
|
-
`Skipping auto-incident for ${systemId}/${configurationId}: active maintenance`,
|
|
277
|
-
);
|
|
278
|
-
return;
|
|
279
|
-
}
|
|
280
|
-
}
|
|
281
|
-
|
|
282
|
-
// Require-recovery: if there's a prior closed auto-incident for
|
|
283
|
-
// this assignment, the check must have logged at least one healthy
|
|
284
|
-
// run since the close before we can open another one. Without this,
|
|
285
|
-
// an operator's manual close on a still-broken system would loop.
|
|
286
|
-
const lastCloseAt = await findLastAutoIncidentClose({
|
|
287
|
-
db,
|
|
288
|
-
systemId,
|
|
289
|
-
configurationId,
|
|
290
|
-
});
|
|
291
|
-
if (lastCloseAt) {
|
|
292
|
-
const recovered = await hasHealthyRunSince({
|
|
293
|
-
db,
|
|
294
|
-
systemId,
|
|
295
|
-
configurationId,
|
|
296
|
-
since: lastCloseAt,
|
|
297
|
-
});
|
|
298
|
-
if (!recovered) {
|
|
299
|
-
return;
|
|
300
|
-
}
|
|
301
|
-
}
|
|
302
|
-
|
|
303
|
-
// Record the transition (if any) and evaluate the flapping trigger
|
|
304
|
-
// against transitions that happened after the last close window.
|
|
305
|
-
let flappingOpens = false;
|
|
306
|
-
if (isTransition) {
|
|
307
|
-
try {
|
|
308
|
-
const count = await recordUnhealthyTransition({
|
|
309
|
-
db,
|
|
310
|
-
configurationId,
|
|
311
|
-
systemId,
|
|
312
|
-
windowMinutes: policy.flappingTrigger.windowMinutes,
|
|
313
|
-
since: lastCloseAt,
|
|
314
|
-
});
|
|
315
|
-
flappingOpens = shouldOpenForFlapping({
|
|
316
|
-
policy,
|
|
317
|
-
recentTransitionCount: count,
|
|
318
|
-
});
|
|
319
|
-
|
|
320
|
-
// Fire the informational `flapping_detected` automation hook
|
|
321
|
-
// independently of the auto-incident decision: an operator may
|
|
322
|
-
// care about flapping even with the auto-incident pipeline
|
|
323
|
-
// turned off.
|
|
324
|
-
if (
|
|
325
|
-
policy.flappingTrigger.enabled &&
|
|
326
|
-
count >= policy.flappingTrigger.transitions
|
|
327
|
-
) {
|
|
328
|
-
const emit = getEmitHook?.();
|
|
329
|
-
if (emit) {
|
|
330
|
-
try {
|
|
331
|
-
await emit(healthCheckHooks.flappingDetected, {
|
|
332
|
-
systemId,
|
|
333
|
-
configurationId,
|
|
334
|
-
transitionCount: count,
|
|
335
|
-
windowMinutes: policy.flappingTrigger.windowMinutes,
|
|
336
|
-
timestamp: new Date().toISOString(),
|
|
337
|
-
});
|
|
338
|
-
} catch (error) {
|
|
339
|
-
logger.warn(
|
|
340
|
-
`Failed to emit healthcheck.flapping_detected hook for ${systemId}/${configurationId}:`,
|
|
341
|
-
error,
|
|
342
|
-
);
|
|
343
|
-
}
|
|
344
|
-
}
|
|
345
|
-
}
|
|
346
|
-
} catch (error) {
|
|
347
|
-
logger.warn(
|
|
348
|
-
`Failed to record unhealthy transition for ${systemId}/${configurationId}:`,
|
|
349
|
-
error,
|
|
350
|
-
);
|
|
351
|
-
}
|
|
352
|
-
}
|
|
353
|
-
|
|
354
|
-
// Evaluate the sustained-duration trigger on every run while the
|
|
355
|
-
// check is unhealthy (not just on transition).
|
|
356
|
-
let sustainedOpens = false;
|
|
357
|
-
if (policy.sustainedUnhealthyTrigger.enabled) {
|
|
358
|
-
const unhealthySince = await findUnhealthySince({
|
|
359
|
-
db,
|
|
360
|
-
configurationId,
|
|
361
|
-
systemId,
|
|
362
|
-
since: lastCloseAt,
|
|
363
|
-
});
|
|
364
|
-
if (unhealthySince) {
|
|
365
|
-
sustainedOpens = shouldOpenForSustainedUnhealthy({
|
|
366
|
-
policy,
|
|
367
|
-
unhealthyForMs: Date.now() - unhealthySince.getTime(),
|
|
368
|
-
});
|
|
369
|
-
}
|
|
370
|
-
}
|
|
371
|
-
|
|
372
|
-
if (!flappingOpens && !sustainedOpens) return;
|
|
373
|
-
|
|
374
|
-
const reason = flappingOpens
|
|
375
|
-
? `flapping: ≥${policy.flappingTrigger.transitions} transitions in ${policy.flappingTrigger.windowMinutes} min`
|
|
376
|
-
: `unhealthy ≥${policy.sustainedUnhealthyTrigger.durationMinutes} min continuously`;
|
|
377
|
-
|
|
378
|
-
await openAutoIncident({
|
|
379
|
-
db,
|
|
380
|
-
incidentClient,
|
|
381
|
-
logger,
|
|
382
|
-
systemId,
|
|
383
|
-
systemName,
|
|
384
|
-
configurationId,
|
|
385
|
-
configurationName,
|
|
386
|
-
policy,
|
|
387
|
-
reason,
|
|
388
|
-
});
|
|
389
|
-
}
|
|
195
|
+
// Flapping detection no longer lives here. It moved into the automation
|
|
196
|
+
// engine as a windowed-count gate on the `healthcheck.system_health_changed`
|
|
197
|
+
// trigger (raw aggregated-health change + `filter` +
|
|
198
|
+
// `window: { count, minutes, refire: "once" }`). The queue executor emits only
|
|
199
|
+
// the raw per-system health change (via the reactive `health` entity deriver,
|
|
200
|
+
// unchanged); the engine does the counting.
|
|
390
201
|
|
|
391
202
|
/**
|
|
392
203
|
* Notify system subscribers about a health state change.
|
|
@@ -565,6 +376,7 @@ async function notifyStateChange(props: {
|
|
|
565
376
|
async function executeHealthCheckJob(props: {
|
|
566
377
|
payload: HealthCheckJobPayload;
|
|
567
378
|
db: Db;
|
|
379
|
+
advisoryLock: AdvisoryLockService;
|
|
568
380
|
registry: HealthCheckRegistry;
|
|
569
381
|
collectorRegistry: CollectorRegistry;
|
|
570
382
|
logger: Logger;
|
|
@@ -575,10 +387,26 @@ async function executeHealthCheckJob(props: {
|
|
|
575
387
|
incidentClient: IncidentClient;
|
|
576
388
|
getEmitHook: () => EmitHookFn | undefined;
|
|
577
389
|
cache: HealthCheckCache;
|
|
390
|
+
/**
|
|
391
|
+
* Resolver for the reactive `health` entity handle (§10.3). Returns the
|
|
392
|
+
* handle once automation-backend has bound the entity store; `undefined`
|
|
393
|
+
* during version skew / tests. Mirrors the `getEmitHook` closure pattern.
|
|
394
|
+
* The entity is PLUGIN-BACKED + COMPUTED — there is no keyed store; the
|
|
395
|
+
* durable run/aggregate write IS the entity write (see `writeHealthEntity`).
|
|
396
|
+
*/
|
|
397
|
+
getHealthEntity?: () => EntityHandle<HealthEntityState> | undefined;
|
|
398
|
+
/**
|
|
399
|
+
* Central secret resolver. When set, a collector declaring a `secretEnv`
|
|
400
|
+
* has it resolved + injected for this centrally-executed run; the
|
|
401
|
+
* collector masks the values out of its output. Optional for version-skew
|
|
402
|
+
* / test isolation.
|
|
403
|
+
*/
|
|
404
|
+
secretResolver?: SecretResolverService;
|
|
578
405
|
}): Promise<void> {
|
|
579
406
|
const {
|
|
580
407
|
payload,
|
|
581
408
|
db,
|
|
409
|
+
advisoryLock,
|
|
582
410
|
registry,
|
|
583
411
|
collectorRegistry,
|
|
584
412
|
logger,
|
|
@@ -588,13 +416,25 @@ async function executeHealthCheckJob(props: {
|
|
|
588
416
|
maintenanceClient,
|
|
589
417
|
incidentClient,
|
|
590
418
|
getEmitHook,
|
|
419
|
+
getHealthEntity,
|
|
591
420
|
cache,
|
|
421
|
+
secretResolver,
|
|
592
422
|
} = props;
|
|
593
423
|
const { configId, systemId } = payload;
|
|
594
424
|
|
|
595
425
|
// Create service for aggregated state evaluation
|
|
596
426
|
const service = new HealthCheckService(db, registry, collectorRegistry);
|
|
597
427
|
|
|
428
|
+
// Per-system serializer for the reactive health mutate (§10.3): a
|
|
429
|
+
// transaction-scoped advisory lock keyed `health:<systemId>` wraps the
|
|
430
|
+
// snapshot-prev + apply + diff + emit so concurrent evaluations of one
|
|
431
|
+
// system (multiple per-config jobs across pods, or at-least-once
|
|
432
|
+
// redelivery) can't double-emit a single logical transition. Bound to this
|
|
433
|
+
// job's systemId right here; the bound function is passed as `serialize` to every `writeHealthEntity` call.
|
|
434
|
+
const serializeHealthWrite = createHealthEntitySerializer({ advisoryLock })(
|
|
435
|
+
systemId,
|
|
436
|
+
);
|
|
437
|
+
|
|
598
438
|
// Capture aggregated state BEFORE this run for comparison
|
|
599
439
|
const previousState = await service.getSystemHealthStatus(systemId);
|
|
600
440
|
const previousStatus = previousState.status;
|
|
@@ -725,11 +565,31 @@ async function executeHealthCheckJob(props: {
|
|
|
725
565
|
const storageKey = collectorEntry.id;
|
|
726
566
|
|
|
727
567
|
try {
|
|
568
|
+
// Resolve the collector's declared secretEnv for THIS run
|
|
569
|
+
// (central execution). The collector injects it and masks the
|
|
570
|
+
// values out of its output. A missing required secret throws
|
|
571
|
+
// and fails the collector clearly.
|
|
572
|
+
let secretEnv: Record<string, string> | undefined;
|
|
573
|
+
const declared = secretEnvMappingSchema.safeParse(
|
|
574
|
+
(collectorEntry.config as { secretEnv?: unknown }).secretEnv,
|
|
575
|
+
);
|
|
576
|
+
if (
|
|
577
|
+
secretResolver &&
|
|
578
|
+
declared.success &&
|
|
579
|
+
Object.keys(declared.data).length > 0
|
|
580
|
+
) {
|
|
581
|
+
const resolved = await secretResolver.resolveForRun({
|
|
582
|
+
secretEnv: declared.data,
|
|
583
|
+
});
|
|
584
|
+
secretEnv = resolved.env;
|
|
585
|
+
}
|
|
586
|
+
|
|
728
587
|
const collectorResult = await registered.collector.execute({
|
|
729
588
|
config: collectorEntry.config,
|
|
730
589
|
client: connectedClient!.client,
|
|
731
590
|
pluginId: configRow.strategyId,
|
|
732
591
|
runContext,
|
|
592
|
+
...(secretEnv ? { secretEnv } : {}),
|
|
733
593
|
});
|
|
734
594
|
|
|
735
595
|
// Check for collector-level error
|
|
@@ -860,26 +720,44 @@ async function executeHealthCheckJob(props: {
|
|
|
860
720
|
},
|
|
861
721
|
};
|
|
862
722
|
|
|
863
|
-
|
|
864
|
-
|
|
723
|
+
// Persist the run + aggregate THROUGH the reactive `health` entity:
|
|
724
|
+
// `apply` does the durable write and returns the freshly-computed view.
|
|
725
|
+
// The framework snapshots `prev` via `read` BEFORE this insert, so a real
|
|
726
|
+
// status change emits exactly one correct `ENTITY_CHANGED` (§10.3). The
|
|
727
|
+
// computed aggregated state is stashed for the transition/notify path.
|
|
728
|
+
let newState!: AggregatedHealth;
|
|
729
|
+
await writeHealthEntity({
|
|
730
|
+
handle: getHealthEntity?.(),
|
|
865
731
|
systemId,
|
|
866
|
-
|
|
867
|
-
|
|
868
|
-
|
|
869
|
-
|
|
870
|
-
|
|
871
|
-
|
|
732
|
+
apply: async () => {
|
|
733
|
+
await db.insert(healthCheckRuns).values({
|
|
734
|
+
configurationId: configId,
|
|
735
|
+
systemId,
|
|
736
|
+
status: result.status,
|
|
737
|
+
latencyMs: result.latencyMs,
|
|
738
|
+
result: { ...result } as Record<string, unknown>,
|
|
739
|
+
sourceId: undefined,
|
|
740
|
+
sourceLabel: "Local",
|
|
741
|
+
});
|
|
872
742
|
|
|
873
|
-
|
|
874
|
-
|
|
875
|
-
|
|
876
|
-
|
|
877
|
-
|
|
878
|
-
|
|
879
|
-
|
|
880
|
-
|
|
881
|
-
|
|
882
|
-
|
|
743
|
+
await incrementHourlyAggregate({
|
|
744
|
+
db,
|
|
745
|
+
systemId,
|
|
746
|
+
configurationId: configId,
|
|
747
|
+
status: result.status,
|
|
748
|
+
latencyMs: result.latencyMs,
|
|
749
|
+
runTimestamp: new Date(),
|
|
750
|
+
result: { ...result } as Record<string, unknown>,
|
|
751
|
+
collectorRegistry,
|
|
752
|
+
sourceLabel: "Local",
|
|
753
|
+
});
|
|
754
|
+
|
|
755
|
+
newState = await service.getSystemHealthStatus(systemId);
|
|
756
|
+
return toHealthEntityView(newState);
|
|
757
|
+
},
|
|
758
|
+
serialize: serializeHealthWrite,
|
|
759
|
+
onError: (error) =>
|
|
760
|
+
logger.warn(`Failed to mirror health entity for ${systemId}`, error),
|
|
883
761
|
});
|
|
884
762
|
|
|
885
763
|
logger.debug(
|
|
@@ -899,8 +777,17 @@ async function executeHealthCheckJob(props: {
|
|
|
899
777
|
latencyMs: result.latencyMs,
|
|
900
778
|
});
|
|
901
779
|
|
|
902
|
-
const newState = await service.getSystemHealthStatus(systemId);
|
|
903
780
|
if (newState.status !== previousStatus) {
|
|
781
|
+
// Record the aggregate transition so the sensing layer has a
|
|
782
|
+
// reliable "in status since" for every status (Wave 2).
|
|
783
|
+
await recordStateTransition({
|
|
784
|
+
db,
|
|
785
|
+
systemId,
|
|
786
|
+
configurationId: configId,
|
|
787
|
+
fromStatus: previousStatus,
|
|
788
|
+
toStatus: newState.status,
|
|
789
|
+
});
|
|
790
|
+
|
|
904
791
|
await notifyStateChange({
|
|
905
792
|
notificationClient,
|
|
906
793
|
systemId,
|
|
@@ -916,24 +803,6 @@ async function executeHealthCheckJob(props: {
|
|
|
916
803
|
});
|
|
917
804
|
}
|
|
918
805
|
|
|
919
|
-
// Per-check auto-incident: runs whether or not the aggregate
|
|
920
|
-
// changed (a check can transition to unhealthy without flipping
|
|
921
|
-
// the aggregate if another check is already unhealthy).
|
|
922
|
-
await maybeOpenAutoIncidentForCheck({
|
|
923
|
-
db,
|
|
924
|
-
service,
|
|
925
|
-
incidentClient,
|
|
926
|
-
maintenanceClient,
|
|
927
|
-
logger,
|
|
928
|
-
systemId,
|
|
929
|
-
systemName,
|
|
930
|
-
configurationId: configId,
|
|
931
|
-
configurationName: configRow.configName,
|
|
932
|
-
getEmitHook,
|
|
933
|
-
previousState,
|
|
934
|
-
newState,
|
|
935
|
-
});
|
|
936
|
-
|
|
937
806
|
return;
|
|
938
807
|
} finally {
|
|
939
808
|
if (connectedClient) {
|
|
@@ -962,28 +831,48 @@ async function executeHealthCheckJob(props: {
|
|
|
962
831
|
},
|
|
963
832
|
};
|
|
964
833
|
|
|
965
|
-
//
|
|
966
|
-
|
|
967
|
-
|
|
834
|
+
// Persist the run + aggregate THROUGH the reactive `health` entity on
|
|
835
|
+
// every run (§10.3): `apply` does the durable write (insert + hourly
|
|
836
|
+
// aggregate) and returns the freshly-computed view. The framework
|
|
837
|
+
// snapshots `prev` via the COMPUTE-ON-READ accessor BEFORE this insert, so
|
|
838
|
+
// an unchanged aggregate is a no-op and a real status change drives the
|
|
839
|
+
// directional/umbrella trigger events via `deriveHealthTriggerEvents` —
|
|
840
|
+
// exactly one correct `ENTITY_CHANGED` with accurate prev → next.
|
|
841
|
+
let newState!: AggregatedHealth;
|
|
842
|
+
await writeHealthEntity({
|
|
843
|
+
handle: getHealthEntity?.(),
|
|
968
844
|
systemId,
|
|
969
|
-
|
|
970
|
-
|
|
971
|
-
|
|
972
|
-
|
|
973
|
-
|
|
974
|
-
|
|
845
|
+
apply: async () => {
|
|
846
|
+
// Store result (spread to convert structured type to plain record for jsonb)
|
|
847
|
+
await db.insert(healthCheckRuns).values({
|
|
848
|
+
configurationId: configId,
|
|
849
|
+
systemId,
|
|
850
|
+
status: result.status,
|
|
851
|
+
latencyMs: result.latencyMs,
|
|
852
|
+
result: { ...result } as Record<string, unknown>,
|
|
853
|
+
sourceId: undefined,
|
|
854
|
+
sourceLabel: "Local",
|
|
855
|
+
});
|
|
975
856
|
|
|
976
|
-
|
|
977
|
-
|
|
978
|
-
|
|
979
|
-
|
|
980
|
-
|
|
981
|
-
|
|
982
|
-
|
|
983
|
-
|
|
984
|
-
|
|
985
|
-
|
|
986
|
-
|
|
857
|
+
// Trigger incremental hourly aggregation
|
|
858
|
+
await incrementHourlyAggregate({
|
|
859
|
+
db,
|
|
860
|
+
systemId,
|
|
861
|
+
configurationId: configId,
|
|
862
|
+
status: result.status,
|
|
863
|
+
latencyMs: result.latencyMs,
|
|
864
|
+
runTimestamp: new Date(),
|
|
865
|
+
result: { ...result } as Record<string, unknown>,
|
|
866
|
+
collectorRegistry,
|
|
867
|
+
sourceLabel: "Local",
|
|
868
|
+
});
|
|
869
|
+
|
|
870
|
+
newState = await service.getSystemHealthStatus(systemId);
|
|
871
|
+
return toHealthEntityView(newState);
|
|
872
|
+
},
|
|
873
|
+
serialize: serializeHealthWrite,
|
|
874
|
+
onError: (error) =>
|
|
875
|
+
logger.warn(`Failed to mirror health entity for ${systemId}`, error),
|
|
987
876
|
});
|
|
988
877
|
|
|
989
878
|
logger.debug(
|
|
@@ -1013,9 +902,17 @@ async function executeHealthCheckJob(props: {
|
|
|
1013
902
|
result: (result.metadata?.collectors as Record<string, unknown>) ?? undefined,
|
|
1014
903
|
});
|
|
1015
904
|
|
|
1016
|
-
// Check if aggregated state changed and notify subscribers
|
|
1017
|
-
const newState = await service.getSystemHealthStatus(systemId);
|
|
1018
905
|
if (newState.status !== previousStatus) {
|
|
906
|
+
// Record the aggregate transition so the sensing layer has a
|
|
907
|
+
// reliable "in status since" for every status (Wave 2).
|
|
908
|
+
await recordStateTransition({
|
|
909
|
+
db,
|
|
910
|
+
systemId,
|
|
911
|
+
configurationId: configId,
|
|
912
|
+
fromStatus: previousStatus,
|
|
913
|
+
toStatus: newState.status,
|
|
914
|
+
});
|
|
915
|
+
|
|
1019
916
|
await notifyStateChange({
|
|
1020
917
|
notificationClient,
|
|
1021
918
|
systemId,
|
|
@@ -1037,77 +934,13 @@ async function executeHealthCheckJob(props: {
|
|
|
1037
934
|
newStatus: newState.status,
|
|
1038
935
|
});
|
|
1039
936
|
|
|
1040
|
-
//
|
|
1041
|
-
|
|
1042
|
-
|
|
1043
|
-
|
|
1044
|
-
|
|
1045
|
-
).length;
|
|
1046
|
-
const totalChecks = newState.checkStatuses.length;
|
|
1047
|
-
const timestamp = new Date().toISOString();
|
|
1048
|
-
|
|
1049
|
-
if (newState.status === "healthy" && previousStatus !== "healthy") {
|
|
1050
|
-
// Recovery: system became healthy
|
|
1051
|
-
await emitHook(healthCheckHooks.systemHealthy, {
|
|
1052
|
-
systemId,
|
|
1053
|
-
previousStatus,
|
|
1054
|
-
healthyChecks,
|
|
1055
|
-
totalChecks,
|
|
1056
|
-
timestamp,
|
|
1057
|
-
});
|
|
1058
|
-
logger.debug(
|
|
1059
|
-
`Emitted systemHealthy hook: ${previousStatus} → ${newState.status}`,
|
|
1060
|
-
);
|
|
1061
|
-
} else if (
|
|
1062
|
-
previousStatus === "healthy" &&
|
|
1063
|
-
newState.status !== "healthy"
|
|
1064
|
-
) {
|
|
1065
|
-
// Degradation: system went from healthy to unhealthy/degraded
|
|
1066
|
-
await emitHook(healthCheckHooks.systemDegraded, {
|
|
1067
|
-
systemId,
|
|
1068
|
-
previousStatus,
|
|
1069
|
-
newStatus: newState.status,
|
|
1070
|
-
healthyChecks,
|
|
1071
|
-
totalChecks,
|
|
1072
|
-
timestamp,
|
|
1073
|
-
});
|
|
1074
|
-
logger.debug(
|
|
1075
|
-
`Emitted systemDegraded hook: ${previousStatus} → ${newState.status}`,
|
|
1076
|
-
);
|
|
1077
|
-
}
|
|
1078
|
-
|
|
1079
|
-
// Umbrella hook — fires on every transition. Emitted alongside
|
|
1080
|
-
// the directional hooks so existing subscribers stay unchanged
|
|
1081
|
-
// while new automation triggers can react to any change.
|
|
1082
|
-
if (previousStatus !== newState.status) {
|
|
1083
|
-
await emitHook(healthCheckHooks.systemHealthChanged, {
|
|
1084
|
-
systemId,
|
|
1085
|
-
previousStatus,
|
|
1086
|
-
newStatus: newState.status,
|
|
1087
|
-
healthyChecks,
|
|
1088
|
-
totalChecks,
|
|
1089
|
-
timestamp,
|
|
1090
|
-
});
|
|
1091
|
-
}
|
|
1092
|
-
}
|
|
937
|
+
// The directional + umbrella system-health hooks were removed in
|
|
938
|
+
// Phase 4 (§10.3): the `health` entity mirror above is the single
|
|
939
|
+
// source of truth, and its change deriver fires the
|
|
940
|
+
// `healthcheck.system_degraded` / `_healthy` / `_health_changed`
|
|
941
|
+
// trigger events through Stage-1 routing. Nothing to emit here.
|
|
1093
942
|
}
|
|
1094
943
|
|
|
1095
|
-
// Per-check auto-incident: see comment on the failed-execution path.
|
|
1096
|
-
await maybeOpenAutoIncidentForCheck({
|
|
1097
|
-
db,
|
|
1098
|
-
service,
|
|
1099
|
-
incidentClient,
|
|
1100
|
-
maintenanceClient,
|
|
1101
|
-
logger,
|
|
1102
|
-
systemId,
|
|
1103
|
-
systemName,
|
|
1104
|
-
configurationId: configId,
|
|
1105
|
-
configurationName: configRow.configName,
|
|
1106
|
-
getEmitHook,
|
|
1107
|
-
previousState,
|
|
1108
|
-
newState,
|
|
1109
|
-
});
|
|
1110
|
-
|
|
1111
944
|
// Note: No manual rescheduling needed - recurring job handles it automatically
|
|
1112
945
|
} catch (error) {
|
|
1113
946
|
logger.error(
|
|
@@ -1115,27 +948,48 @@ async function executeHealthCheckJob(props: {
|
|
|
1115
948
|
error,
|
|
1116
949
|
);
|
|
1117
950
|
|
|
1118
|
-
//
|
|
1119
|
-
|
|
1120
|
-
|
|
951
|
+
// Persist the failure run + aggregate THROUGH the reactive `health`
|
|
952
|
+
// entity: `apply` does the durable write and returns the freshly-computed
|
|
953
|
+
// view. The framework snapshots `prev` via the compute-on-read accessor
|
|
954
|
+
// BEFORE this insert, so a real status change emits exactly one correct
|
|
955
|
+
// `ENTITY_CHANGED` (§10.3). See the success path for the full rationale.
|
|
956
|
+
let newState!: AggregatedHealth;
|
|
957
|
+
await writeHealthEntity({
|
|
958
|
+
handle: getHealthEntity?.(),
|
|
1121
959
|
systemId,
|
|
1122
|
-
|
|
1123
|
-
|
|
1124
|
-
|
|
1125
|
-
|
|
1126
|
-
|
|
960
|
+
apply: async () => {
|
|
961
|
+
// Store failure (no latencyMs for failures)
|
|
962
|
+
await db.insert(healthCheckRuns).values({
|
|
963
|
+
configurationId: configId,
|
|
964
|
+
systemId,
|
|
965
|
+
status: "unhealthy",
|
|
966
|
+
result: { error: String(error) } as Record<string, unknown>,
|
|
967
|
+
sourceId: undefined,
|
|
968
|
+
sourceLabel: "Local",
|
|
969
|
+
});
|
|
1127
970
|
|
|
1128
|
-
|
|
1129
|
-
|
|
1130
|
-
|
|
1131
|
-
|
|
1132
|
-
|
|
1133
|
-
|
|
1134
|
-
|
|
1135
|
-
|
|
1136
|
-
|
|
1137
|
-
|
|
1138
|
-
|
|
971
|
+
// Trigger incremental hourly aggregation
|
|
972
|
+
await incrementHourlyAggregate({
|
|
973
|
+
db,
|
|
974
|
+
systemId,
|
|
975
|
+
configurationId: configId,
|
|
976
|
+
status: "unhealthy",
|
|
977
|
+
latencyMs: undefined,
|
|
978
|
+
runTimestamp: new Date(),
|
|
979
|
+
// No collector data for error cases
|
|
980
|
+
collectorRegistry,
|
|
981
|
+
sourceLabel: "Local",
|
|
982
|
+
});
|
|
983
|
+
|
|
984
|
+
newState = await service.getSystemHealthStatus(systemId);
|
|
985
|
+
return toHealthEntityView(newState);
|
|
986
|
+
},
|
|
987
|
+
serialize: serializeHealthWrite,
|
|
988
|
+
onError: (mirrorError) =>
|
|
989
|
+
logger.warn(
|
|
990
|
+
`Failed to mirror health entity for ${systemId}`,
|
|
991
|
+
mirrorError,
|
|
992
|
+
),
|
|
1139
993
|
});
|
|
1140
994
|
|
|
1141
995
|
// Try to fetch names for the enriched signal (best-effort)
|
|
@@ -1179,9 +1033,17 @@ async function executeHealthCheckJob(props: {
|
|
|
1179
1033
|
result: undefined,
|
|
1180
1034
|
});
|
|
1181
1035
|
|
|
1182
|
-
// Check if aggregated state changed and notify subscribers
|
|
1183
|
-
const newState = await service.getSystemHealthStatus(systemId);
|
|
1184
1036
|
if (newState.status !== previousStatus) {
|
|
1037
|
+
// Record the aggregate transition so the sensing layer has a
|
|
1038
|
+
// reliable "in status since" for every status (Wave 2).
|
|
1039
|
+
await recordStateTransition({
|
|
1040
|
+
db,
|
|
1041
|
+
systemId,
|
|
1042
|
+
configurationId: configId,
|
|
1043
|
+
fromStatus: previousStatus,
|
|
1044
|
+
toStatus: newState.status,
|
|
1045
|
+
});
|
|
1046
|
+
|
|
1185
1047
|
await notifyStateChange({
|
|
1186
1048
|
notificationClient,
|
|
1187
1049
|
systemId,
|
|
@@ -1203,83 +1065,20 @@ async function executeHealthCheckJob(props: {
|
|
|
1203
1065
|
newStatus: newState.status,
|
|
1204
1066
|
});
|
|
1205
1067
|
|
|
1206
|
-
//
|
|
1207
|
-
|
|
1208
|
-
|
|
1209
|
-
|
|
1210
|
-
|
|
1211
|
-
).length;
|
|
1212
|
-
const totalChecks = newState.checkStatuses.length;
|
|
1213
|
-
const timestamp = new Date().toISOString();
|
|
1214
|
-
|
|
1215
|
-
if (newState.status === "healthy" && previousStatus !== "healthy") {
|
|
1216
|
-
// Recovery: system became healthy
|
|
1217
|
-
await emitHook(healthCheckHooks.systemHealthy, {
|
|
1218
|
-
systemId,
|
|
1219
|
-
previousStatus,
|
|
1220
|
-
healthyChecks,
|
|
1221
|
-
totalChecks,
|
|
1222
|
-
timestamp,
|
|
1223
|
-
});
|
|
1224
|
-
logger.debug(
|
|
1225
|
-
`Emitted systemHealthy hook: ${previousStatus} → ${newState.status}`,
|
|
1226
|
-
);
|
|
1227
|
-
} else if (
|
|
1228
|
-
previousStatus === "healthy" &&
|
|
1229
|
-
newState.status !== "healthy"
|
|
1230
|
-
) {
|
|
1231
|
-
// Degradation: system went from healthy to unhealthy/degraded
|
|
1232
|
-
await emitHook(healthCheckHooks.systemDegraded, {
|
|
1233
|
-
systemId,
|
|
1234
|
-
previousStatus,
|
|
1235
|
-
newStatus: newState.status,
|
|
1236
|
-
healthyChecks,
|
|
1237
|
-
totalChecks,
|
|
1238
|
-
timestamp,
|
|
1239
|
-
});
|
|
1240
|
-
logger.debug(
|
|
1241
|
-
`Emitted systemDegraded hook: ${previousStatus} → ${newState.status}`,
|
|
1242
|
-
);
|
|
1243
|
-
}
|
|
1244
|
-
|
|
1245
|
-
// Umbrella hook — fires on every transition. Emitted alongside
|
|
1246
|
-
// the directional hooks so existing subscribers stay unchanged
|
|
1247
|
-
// while new automation triggers can react to any change.
|
|
1248
|
-
if (previousStatus !== newState.status) {
|
|
1249
|
-
await emitHook(healthCheckHooks.systemHealthChanged, {
|
|
1250
|
-
systemId,
|
|
1251
|
-
previousStatus,
|
|
1252
|
-
newStatus: newState.status,
|
|
1253
|
-
healthyChecks,
|
|
1254
|
-
totalChecks,
|
|
1255
|
-
timestamp,
|
|
1256
|
-
});
|
|
1257
|
-
}
|
|
1258
|
-
}
|
|
1068
|
+
// The directional + umbrella system-health hooks were removed in
|
|
1069
|
+
// Phase 4 (§10.3): the `health` entity mirror above is the single
|
|
1070
|
+
// source of truth, and its change deriver fires the
|
|
1071
|
+
// `healthcheck.system_degraded` / `_healthy` / `_health_changed`
|
|
1072
|
+
// trigger events through Stage-1 routing. Nothing to emit here.
|
|
1259
1073
|
}
|
|
1260
1074
|
|
|
1261
|
-
// Per-check auto-incident: see comment on the failed-execution path.
|
|
1262
|
-
await maybeOpenAutoIncidentForCheck({
|
|
1263
|
-
db,
|
|
1264
|
-
service,
|
|
1265
|
-
incidentClient,
|
|
1266
|
-
maintenanceClient,
|
|
1267
|
-
logger,
|
|
1268
|
-
systemId,
|
|
1269
|
-
systemName,
|
|
1270
|
-
configurationId: configId,
|
|
1271
|
-
configurationName: configName,
|
|
1272
|
-
getEmitHook,
|
|
1273
|
-
previousState,
|
|
1274
|
-
newState,
|
|
1275
|
-
});
|
|
1276
|
-
|
|
1277
1075
|
// Note: No manual rescheduling needed - recurring job handles it automatically
|
|
1278
1076
|
}
|
|
1279
1077
|
}
|
|
1280
1078
|
|
|
1281
1079
|
export async function setupHealthCheckWorker(props: {
|
|
1282
1080
|
db: Db;
|
|
1081
|
+
advisoryLock: AdvisoryLockService;
|
|
1283
1082
|
registry: HealthCheckRegistry;
|
|
1284
1083
|
collectorRegistry: CollectorRegistry;
|
|
1285
1084
|
logger: Logger;
|
|
@@ -1290,10 +1089,13 @@ export async function setupHealthCheckWorker(props: {
|
|
|
1290
1089
|
maintenanceClient: MaintenanceClient;
|
|
1291
1090
|
incidentClient: IncidentClient;
|
|
1292
1091
|
getEmitHook: () => EmitHookFn | undefined;
|
|
1092
|
+
getHealthEntity?: () => EntityHandle<HealthEntityState> | undefined;
|
|
1293
1093
|
cache: HealthCheckCache;
|
|
1094
|
+
secretResolver?: SecretResolverService;
|
|
1294
1095
|
}): Promise<void> {
|
|
1295
1096
|
const {
|
|
1296
1097
|
db,
|
|
1098
|
+
advisoryLock,
|
|
1297
1099
|
registry,
|
|
1298
1100
|
collectorRegistry,
|
|
1299
1101
|
logger,
|
|
@@ -1304,7 +1106,9 @@ export async function setupHealthCheckWorker(props: {
|
|
|
1304
1106
|
maintenanceClient,
|
|
1305
1107
|
incidentClient,
|
|
1306
1108
|
getEmitHook,
|
|
1109
|
+
getHealthEntity,
|
|
1307
1110
|
cache,
|
|
1111
|
+
secretResolver,
|
|
1308
1112
|
} = props;
|
|
1309
1113
|
|
|
1310
1114
|
const queue =
|
|
@@ -1316,6 +1120,7 @@ export async function setupHealthCheckWorker(props: {
|
|
|
1316
1120
|
await executeHealthCheckJob({
|
|
1317
1121
|
payload: job.data,
|
|
1318
1122
|
db,
|
|
1123
|
+
advisoryLock,
|
|
1319
1124
|
registry,
|
|
1320
1125
|
collectorRegistry,
|
|
1321
1126
|
logger,
|
|
@@ -1325,7 +1130,9 @@ export async function setupHealthCheckWorker(props: {
|
|
|
1325
1130
|
maintenanceClient,
|
|
1326
1131
|
incidentClient,
|
|
1327
1132
|
getEmitHook,
|
|
1133
|
+
getHealthEntity,
|
|
1328
1134
|
cache,
|
|
1135
|
+
secretResolver,
|
|
1329
1136
|
});
|
|
1330
1137
|
},
|
|
1331
1138
|
{
|