@checkstack/healthcheck-backend 1.3.0 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +329 -0
- package/drizzle/0015_quiet_meggan.sql +12 -0
- package/drizzle/0016_complex_maginty.sql +1 -0
- package/drizzle/0017_pretty_caretaker.sql +1 -0
- package/drizzle/meta/0015_snapshot.json +764 -0
- package/drizzle/meta/0016_snapshot.json +644 -0
- package/drizzle/meta/0017_snapshot.json +563 -0
- package/drizzle/meta/_journal.json +21 -0
- package/package.json +24 -21
- package/src/automations.test.ts +6 -27
- package/src/automations.ts +32 -30
- package/src/collector-script-test.test.ts +236 -0
- package/src/collector-script-test.ts +221 -0
- package/src/health-entity.test.ts +698 -0
- package/src/health-entity.ts +369 -0
- package/src/health-state.test.ts +115 -0
- package/src/health-state.ts +333 -0
- package/src/healthcheck-gitops-kinds.test.ts +6 -32
- package/src/healthcheck-gitops-kinds.ts +4 -19
- package/src/hooks.test.ts +19 -6
- package/src/hooks.ts +13 -68
- package/src/index.ts +115 -48
- package/src/queue-executor.ts +243 -444
- package/src/retention-job.ts +65 -1
- package/src/retention-state-transitions.test.ts +49 -0
- package/src/router.test.ts +13 -0
- package/src/router.ts +44 -0
- package/src/schema.ts +34 -54
- package/src/service-notification-policy.test.ts +28 -71
- package/src/service.ts +89 -0
- package/src/state-transitions.test.ts +126 -0
- package/src/state-transitions.ts +112 -0
- package/tsconfig.json +9 -0
- package/src/auto-incident-close-job.ts +0 -164
- package/src/auto-incident.test.ts +0 -196
- package/src/auto-incident.ts +0 -332
package/src/queue-executor.ts
CHANGED
|
@@ -36,6 +36,8 @@ import { IncidentApi } from "@checkstack/incident-common";
|
|
|
36
36
|
import { NotificationApi } from "@checkstack/notification-common";
|
|
37
37
|
import { healthcheckSystemSubscription } from "@checkstack/healthcheck-common";
|
|
38
38
|
import { resolveRoute, type InferClient, extractErrorMessage} from "@checkstack/common";
|
|
39
|
+
import { secretEnvMappingSchema } from "@checkstack/secrets-common";
|
|
40
|
+
import type { SecretResolverService } from "@checkstack/secrets-backend";
|
|
39
41
|
import { HealthCheckService } from "./service";
|
|
40
42
|
import { healthCheckHooks } from "./hooks";
|
|
41
43
|
import { incrementHourlyAggregate } from "./realtime-aggregation";
|
|
@@ -44,17 +46,13 @@ import {
|
|
|
44
46
|
classifyTransition,
|
|
45
47
|
shouldNotifyTransition,
|
|
46
48
|
} from "./notification-policy";
|
|
49
|
+
import { recordStateTransition } from "./state-transitions";
|
|
47
50
|
import {
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
openAutoIncident,
|
|
54
|
-
recordUnhealthyTransition,
|
|
55
|
-
shouldOpenForFlapping,
|
|
56
|
-
shouldOpenForSustainedUnhealthy,
|
|
57
|
-
} from "./auto-incident";
|
|
51
|
+
writeHealthEntity,
|
|
52
|
+
createHealthEntitySerializer,
|
|
53
|
+
type HealthEntityState,
|
|
54
|
+
} from "./health-entity";
|
|
55
|
+
import type { EntityHandle } from "@checkstack/automation-backend";
|
|
58
56
|
|
|
59
57
|
type Db = SafeDatabase<typeof schema>;
|
|
60
58
|
type CatalogClient = InferClient<typeof CatalogApi>;
|
|
@@ -62,6 +60,28 @@ type MaintenanceClient = InferClient<typeof MaintenanceApi>;
|
|
|
62
60
|
type IncidentClient = InferClient<typeof IncidentApi>;
|
|
63
61
|
type NotificationClient = InferClient<typeof NotificationApi>;
|
|
64
62
|
|
|
63
|
+
/** Shape of the aggregated state returned by `getSystemHealthStatus`. */
|
|
64
|
+
type AggregatedHealth = Awaited<
|
|
65
|
+
ReturnType<HealthCheckService["getSystemHealthStatus"]>
|
|
66
|
+
>;
|
|
67
|
+
|
|
68
|
+
/**
 * Project the aggregated system health onto the reactive `health` entity view.
 *
 * - `status` is copied verbatim from the aggregate.
 * - `healthyChecks` is the number of entries in `checkStatuses` whose
 *   per-check status is exactly `"healthy"`.
 * - `totalChecks` is the length of `checkStatuses` (presumably only enabled
 *   checks are listed there — confirm against `getSystemHealthStatus`).
 *
 * NOTE(review): per the original author's note this is meant to mirror
 * `computeHealthEntityState`, so that the value returned from the entity
 * write matches what the `read` accessor would compute for the post-write
 * state. That helper is not visible here — keep the two in sync when either
 * one changes.
 *
 * @param state Aggregated health as returned by `getSystemHealthStatus`.
 * @returns The `{ status, healthyChecks, totalChecks }` entity view.
 */
function toHealthEntityView(state: AggregatedHealth): HealthEntityState {
  const { status, checkStatuses } = state;

  // Count in a single pass instead of materialising a filtered copy of the
  // array only to read its length.
  let healthyChecks = 0;
  for (const check of checkStatuses) {
    if (check.status === "healthy") {
      healthyChecks += 1;
    }
  }

  return {
    status,
    healthyChecks,
    totalChecks: checkStatuses.length,
  };
}
|
|
84
|
+
|
|
65
85
|
/**
|
|
66
86
|
* Emit the checkCompleted hook if available, plus the narrower
|
|
67
87
|
* `checkFailed` hook when the result wasn't `healthy` (so operators
|
|
@@ -171,222 +191,12 @@ export async function scheduleHealthCheck(props: {
|
|
|
171
191
|
});
|
|
172
192
|
}
|
|
173
193
|
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
* configured window.
|
|
181
|
-
* - **sustained**: the check is currently unhealthy AND has been so
|
|
182
|
-
* continuously for at least the configured duration.
|
|
183
|
-
*
|
|
184
|
-
* Both triggers honour the require-recovery rule: after the most
|
|
185
|
-
* recent auto-incident close (manual or auto), no new auto-incident
|
|
186
|
-
* opens until the check has logged at least one healthy run. This
|
|
187
|
-
* stops a manual close → still-unhealthy → re-open loop.
|
|
188
|
-
*
|
|
189
|
-
* Active maintenance with suppression skips both triggers when the
|
|
190
|
-
* policy opts in.
|
|
191
|
-
*/
|
|
192
|
-
async function maybeOpenAutoIncidentForCheck(props: {
|
|
193
|
-
db: Db;
|
|
194
|
-
service: HealthCheckService;
|
|
195
|
-
incidentClient: IncidentClient;
|
|
196
|
-
maintenanceClient: MaintenanceClient;
|
|
197
|
-
logger: Logger;
|
|
198
|
-
systemId: string;
|
|
199
|
-
systemName: string;
|
|
200
|
-
configurationId: string;
|
|
201
|
-
configurationName: string;
|
|
202
|
-
/**
|
|
203
|
-
* Same closure-based getter the queue executor uses elsewhere; let
|
|
204
|
-
* us fire the `flapping_detected` automation hook from inside the
|
|
205
|
-
* flapping evaluator without re-threading `emitHook` through every
|
|
206
|
-
* intermediate caller. Optional — when absent, the hook simply
|
|
207
|
-
* doesn't fire (e.g. in unit tests that don't care about it).
|
|
208
|
-
*/
|
|
209
|
-
getEmitHook?: () => EmitHookFn | undefined;
|
|
210
|
-
previousState: {
|
|
211
|
-
checkStatuses: Array<{
|
|
212
|
-
configurationId: string;
|
|
213
|
-
status: HealthCheckStatus;
|
|
214
|
-
}>;
|
|
215
|
-
};
|
|
216
|
-
newState: {
|
|
217
|
-
checkStatuses: Array<{
|
|
218
|
-
configurationId: string;
|
|
219
|
-
status: HealthCheckStatus;
|
|
220
|
-
}>;
|
|
221
|
-
};
|
|
222
|
-
}): Promise<void> {
|
|
223
|
-
const {
|
|
224
|
-
db,
|
|
225
|
-
service,
|
|
226
|
-
incidentClient,
|
|
227
|
-
maintenanceClient,
|
|
228
|
-
logger,
|
|
229
|
-
systemId,
|
|
230
|
-
systemName,
|
|
231
|
-
configurationId,
|
|
232
|
-
configurationName,
|
|
233
|
-
getEmitHook,
|
|
234
|
-
previousState,
|
|
235
|
-
newState,
|
|
236
|
-
} = props;
|
|
237
|
-
|
|
238
|
-
const next = newState.checkStatuses.find(
|
|
239
|
-
(c) => c.configurationId === configurationId,
|
|
240
|
-
);
|
|
241
|
-
// Only auto-incident logic applies when the check is currently
|
|
242
|
-
// unhealthy — both triggers require it.
|
|
243
|
-
if (!next || next.status !== "unhealthy") return;
|
|
244
|
-
|
|
245
|
-
const prev = previousState.checkStatuses.find(
|
|
246
|
-
(c) => c.configurationId === configurationId,
|
|
247
|
-
);
|
|
248
|
-
const isTransition = isTransitionToUnhealthy(prev?.status, next.status);
|
|
249
|
-
|
|
250
|
-
let policy;
|
|
251
|
-
try {
|
|
252
|
-
policy = await service.getAssignmentNotificationPolicy({
|
|
253
|
-
systemId,
|
|
254
|
-
configurationId,
|
|
255
|
-
});
|
|
256
|
-
} catch (error) {
|
|
257
|
-
logger.warn(
|
|
258
|
-
`Failed to load policy for auto-incident decision (${systemId}/${configurationId}):`,
|
|
259
|
-
error,
|
|
260
|
-
);
|
|
261
|
-
return;
|
|
262
|
-
}
|
|
263
|
-
|
|
264
|
-
if (!policy.autoOpenIncidentOnUnhealthy) return;
|
|
265
|
-
|
|
266
|
-
// Honour active maintenance windows — operators have explicitly
|
|
267
|
-
// said the system is down on purpose.
|
|
268
|
-
if (policy.skipDuringMaintenance) {
|
|
269
|
-
const suppressed = await isMaintenanceSuppressed({
|
|
270
|
-
maintenanceClient,
|
|
271
|
-
systemId,
|
|
272
|
-
logger,
|
|
273
|
-
});
|
|
274
|
-
if (suppressed) {
|
|
275
|
-
logger.debug(
|
|
276
|
-
`Skipping auto-incident for ${systemId}/${configurationId}: active maintenance`,
|
|
277
|
-
);
|
|
278
|
-
return;
|
|
279
|
-
}
|
|
280
|
-
}
|
|
281
|
-
|
|
282
|
-
// Require-recovery: if there's a prior closed auto-incident for
|
|
283
|
-
// this assignment, the check must have logged at least one healthy
|
|
284
|
-
// run since the close before we can open another one. Without this,
|
|
285
|
-
// an operator's manual close on a still-broken system would loop.
|
|
286
|
-
const lastCloseAt = await findLastAutoIncidentClose({
|
|
287
|
-
db,
|
|
288
|
-
systemId,
|
|
289
|
-
configurationId,
|
|
290
|
-
});
|
|
291
|
-
if (lastCloseAt) {
|
|
292
|
-
const recovered = await hasHealthyRunSince({
|
|
293
|
-
db,
|
|
294
|
-
systemId,
|
|
295
|
-
configurationId,
|
|
296
|
-
since: lastCloseAt,
|
|
297
|
-
});
|
|
298
|
-
if (!recovered) {
|
|
299
|
-
return;
|
|
300
|
-
}
|
|
301
|
-
}
|
|
302
|
-
|
|
303
|
-
// Record the transition (if any) and evaluate the flapping trigger
|
|
304
|
-
// against transitions that happened after the last close window.
|
|
305
|
-
let flappingOpens = false;
|
|
306
|
-
if (isTransition) {
|
|
307
|
-
try {
|
|
308
|
-
const count = await recordUnhealthyTransition({
|
|
309
|
-
db,
|
|
310
|
-
configurationId,
|
|
311
|
-
systemId,
|
|
312
|
-
windowMinutes: policy.flappingTrigger.windowMinutes,
|
|
313
|
-
since: lastCloseAt,
|
|
314
|
-
});
|
|
315
|
-
flappingOpens = shouldOpenForFlapping({
|
|
316
|
-
policy,
|
|
317
|
-
recentTransitionCount: count,
|
|
318
|
-
});
|
|
319
|
-
|
|
320
|
-
// Fire the informational `flapping_detected` automation hook
|
|
321
|
-
// independently of the auto-incident decision: an operator may
|
|
322
|
-
// care about flapping even with the auto-incident pipeline
|
|
323
|
-
// turned off.
|
|
324
|
-
if (
|
|
325
|
-
policy.flappingTrigger.enabled &&
|
|
326
|
-
count >= policy.flappingTrigger.transitions
|
|
327
|
-
) {
|
|
328
|
-
const emit = getEmitHook?.();
|
|
329
|
-
if (emit) {
|
|
330
|
-
try {
|
|
331
|
-
await emit(healthCheckHooks.flappingDetected, {
|
|
332
|
-
systemId,
|
|
333
|
-
configurationId,
|
|
334
|
-
transitionCount: count,
|
|
335
|
-
windowMinutes: policy.flappingTrigger.windowMinutes,
|
|
336
|
-
timestamp: new Date().toISOString(),
|
|
337
|
-
});
|
|
338
|
-
} catch (error) {
|
|
339
|
-
logger.warn(
|
|
340
|
-
`Failed to emit healthcheck.flapping_detected hook for ${systemId}/${configurationId}:`,
|
|
341
|
-
error,
|
|
342
|
-
);
|
|
343
|
-
}
|
|
344
|
-
}
|
|
345
|
-
}
|
|
346
|
-
} catch (error) {
|
|
347
|
-
logger.warn(
|
|
348
|
-
`Failed to record unhealthy transition for ${systemId}/${configurationId}:`,
|
|
349
|
-
error,
|
|
350
|
-
);
|
|
351
|
-
}
|
|
352
|
-
}
|
|
353
|
-
|
|
354
|
-
// Evaluate the sustained-duration trigger on every run while the
|
|
355
|
-
// check is unhealthy (not just on transition).
|
|
356
|
-
let sustainedOpens = false;
|
|
357
|
-
if (policy.sustainedUnhealthyTrigger.enabled) {
|
|
358
|
-
const unhealthySince = await findUnhealthySince({
|
|
359
|
-
db,
|
|
360
|
-
configurationId,
|
|
361
|
-
systemId,
|
|
362
|
-
since: lastCloseAt,
|
|
363
|
-
});
|
|
364
|
-
if (unhealthySince) {
|
|
365
|
-
sustainedOpens = shouldOpenForSustainedUnhealthy({
|
|
366
|
-
policy,
|
|
367
|
-
unhealthyForMs: Date.now() - unhealthySince.getTime(),
|
|
368
|
-
});
|
|
369
|
-
}
|
|
370
|
-
}
|
|
371
|
-
|
|
372
|
-
if (!flappingOpens && !sustainedOpens) return;
|
|
373
|
-
|
|
374
|
-
const reason = flappingOpens
|
|
375
|
-
? `flapping: ≥${policy.flappingTrigger.transitions} transitions in ${policy.flappingTrigger.windowMinutes} min`
|
|
376
|
-
: `unhealthy ≥${policy.sustainedUnhealthyTrigger.durationMinutes} min continuously`;
|
|
377
|
-
|
|
378
|
-
await openAutoIncident({
|
|
379
|
-
db,
|
|
380
|
-
incidentClient,
|
|
381
|
-
logger,
|
|
382
|
-
systemId,
|
|
383
|
-
systemName,
|
|
384
|
-
configurationId,
|
|
385
|
-
configurationName,
|
|
386
|
-
policy,
|
|
387
|
-
reason,
|
|
388
|
-
});
|
|
389
|
-
}
|
|
194
|
+
// Flapping detection no longer lives here. It moved into the automation
|
|
195
|
+
// engine as a windowed-count gate on the `healthcheck.system_health_changed`
|
|
196
|
+
// trigger (raw aggregated-health change + `filter` +
|
|
197
|
+
// `window: { count, minutes, refire: "once" }`). The queue executor emits only
|
|
198
|
+
// the raw per-system health change (via the reactive `health` entity deriver,
|
|
199
|
+
// unchanged); the engine does the counting.
|
|
390
200
|
|
|
391
201
|
/**
|
|
392
202
|
* Notify system subscribers about a health state change.
|
|
@@ -575,6 +385,21 @@ async function executeHealthCheckJob(props: {
|
|
|
575
385
|
incidentClient: IncidentClient;
|
|
576
386
|
getEmitHook: () => EmitHookFn | undefined;
|
|
577
387
|
cache: HealthCheckCache;
|
|
388
|
+
/**
|
|
389
|
+
* Resolver for the reactive `health` entity handle (§10.3). Returns the
|
|
390
|
+
* handle once automation-backend has bound the entity store; `undefined`
|
|
391
|
+
* during version skew / tests. Mirrors the `getEmitHook` closure pattern.
|
|
392
|
+
* The entity is PLUGIN-BACKED + COMPUTED — there is no keyed store; the
|
|
393
|
+
* durable run/aggregate write IS the entity write (see `writeHealthEntity`).
|
|
394
|
+
*/
|
|
395
|
+
getHealthEntity?: () => EntityHandle<HealthEntityState> | undefined;
|
|
396
|
+
/**
|
|
397
|
+
* Central secret resolver. When set, a collector declaring a `secretEnv`
|
|
398
|
+
* has it resolved + injected for this centrally-executed run; the
|
|
399
|
+
* collector masks the values out of its output. Optional for version-skew
|
|
400
|
+
* / test isolation.
|
|
401
|
+
*/
|
|
402
|
+
secretResolver?: SecretResolverService;
|
|
578
403
|
}): Promise<void> {
|
|
579
404
|
const {
|
|
580
405
|
payload,
|
|
@@ -588,13 +413,23 @@ async function executeHealthCheckJob(props: {
|
|
|
588
413
|
maintenanceClient,
|
|
589
414
|
incidentClient,
|
|
590
415
|
getEmitHook,
|
|
416
|
+
getHealthEntity,
|
|
591
417
|
cache,
|
|
418
|
+
secretResolver,
|
|
592
419
|
} = props;
|
|
593
420
|
const { configId, systemId } = payload;
|
|
594
421
|
|
|
595
422
|
// Create service for aggregated state evaluation
|
|
596
423
|
const service = new HealthCheckService(db, registry, collectorRegistry);
|
|
597
424
|
|
|
425
|
+
// Per-system serializer for the reactive health mutate (§10.3): a
|
|
426
|
+
// transaction-scoped advisory lock keyed `health:<systemId>` wraps the
|
|
427
|
+
// snapshot-prev + apply + diff + emit so concurrent evaluations of one
|
|
428
|
+
// system (multiple per-config jobs across pods, or at-least-once
|
|
429
|
+
// redelivery) can't double-emit a single logical transition. Bound to this
|
|
430
|
+
// job's systemId right here; the bound serializer is passed as `serialize` to every `writeHealthEntity` call.
|
|
431
|
+
const serializeHealthWrite = createHealthEntitySerializer({ db })(systemId);
|
|
432
|
+
|
|
598
433
|
// Capture aggregated state BEFORE this run for comparison
|
|
599
434
|
const previousState = await service.getSystemHealthStatus(systemId);
|
|
600
435
|
const previousStatus = previousState.status;
|
|
@@ -725,11 +560,31 @@ async function executeHealthCheckJob(props: {
|
|
|
725
560
|
const storageKey = collectorEntry.id;
|
|
726
561
|
|
|
727
562
|
try {
|
|
563
|
+
// Resolve the collector's declared secretEnv for THIS run
|
|
564
|
+
// (central execution). The collector injects it and masks the
|
|
565
|
+
// values out of its output. A missing required secret throws
|
|
566
|
+
// and fails the collector clearly.
|
|
567
|
+
let secretEnv: Record<string, string> | undefined;
|
|
568
|
+
const declared = secretEnvMappingSchema.safeParse(
|
|
569
|
+
(collectorEntry.config as { secretEnv?: unknown }).secretEnv,
|
|
570
|
+
);
|
|
571
|
+
if (
|
|
572
|
+
secretResolver &&
|
|
573
|
+
declared.success &&
|
|
574
|
+
Object.keys(declared.data).length > 0
|
|
575
|
+
) {
|
|
576
|
+
const resolved = await secretResolver.resolveForRun({
|
|
577
|
+
secretEnv: declared.data,
|
|
578
|
+
});
|
|
579
|
+
secretEnv = resolved.env;
|
|
580
|
+
}
|
|
581
|
+
|
|
728
582
|
const collectorResult = await registered.collector.execute({
|
|
729
583
|
config: collectorEntry.config,
|
|
730
584
|
client: connectedClient!.client,
|
|
731
585
|
pluginId: configRow.strategyId,
|
|
732
586
|
runContext,
|
|
587
|
+
...(secretEnv ? { secretEnv } : {}),
|
|
733
588
|
});
|
|
734
589
|
|
|
735
590
|
// Check for collector-level error
|
|
@@ -860,26 +715,44 @@ async function executeHealthCheckJob(props: {
|
|
|
860
715
|
},
|
|
861
716
|
};
|
|
862
717
|
|
|
863
|
-
|
|
864
|
-
|
|
718
|
+
// Persist the run + aggregate THROUGH the reactive `health` entity:
|
|
719
|
+
// `apply` does the durable write and returns the freshly-computed view.
|
|
720
|
+
// The framework snapshots `prev` via `read` BEFORE this insert, so a real
|
|
721
|
+
// status change emits exactly one correct `ENTITY_CHANGED` (§10.3). The
|
|
722
|
+
// computed aggregated state is stashed for the transition/notify path.
|
|
723
|
+
let newState!: AggregatedHealth;
|
|
724
|
+
await writeHealthEntity({
|
|
725
|
+
handle: getHealthEntity?.(),
|
|
865
726
|
systemId,
|
|
866
|
-
|
|
867
|
-
|
|
868
|
-
|
|
869
|
-
|
|
870
|
-
|
|
871
|
-
|
|
727
|
+
apply: async () => {
|
|
728
|
+
await db.insert(healthCheckRuns).values({
|
|
729
|
+
configurationId: configId,
|
|
730
|
+
systemId,
|
|
731
|
+
status: result.status,
|
|
732
|
+
latencyMs: result.latencyMs,
|
|
733
|
+
result: { ...result } as Record<string, unknown>,
|
|
734
|
+
sourceId: undefined,
|
|
735
|
+
sourceLabel: "Local",
|
|
736
|
+
});
|
|
872
737
|
|
|
873
|
-
|
|
874
|
-
|
|
875
|
-
|
|
876
|
-
|
|
877
|
-
|
|
878
|
-
|
|
879
|
-
|
|
880
|
-
|
|
881
|
-
|
|
882
|
-
|
|
738
|
+
await incrementHourlyAggregate({
|
|
739
|
+
db,
|
|
740
|
+
systemId,
|
|
741
|
+
configurationId: configId,
|
|
742
|
+
status: result.status,
|
|
743
|
+
latencyMs: result.latencyMs,
|
|
744
|
+
runTimestamp: new Date(),
|
|
745
|
+
result: { ...result } as Record<string, unknown>,
|
|
746
|
+
collectorRegistry,
|
|
747
|
+
sourceLabel: "Local",
|
|
748
|
+
});
|
|
749
|
+
|
|
750
|
+
newState = await service.getSystemHealthStatus(systemId);
|
|
751
|
+
return toHealthEntityView(newState);
|
|
752
|
+
},
|
|
753
|
+
serialize: serializeHealthWrite,
|
|
754
|
+
onError: (error) =>
|
|
755
|
+
logger.warn(`Failed to mirror health entity for ${systemId}`, error),
|
|
883
756
|
});
|
|
884
757
|
|
|
885
758
|
logger.debug(
|
|
@@ -899,8 +772,17 @@ async function executeHealthCheckJob(props: {
|
|
|
899
772
|
latencyMs: result.latencyMs,
|
|
900
773
|
});
|
|
901
774
|
|
|
902
|
-
const newState = await service.getSystemHealthStatus(systemId);
|
|
903
775
|
if (newState.status !== previousStatus) {
|
|
776
|
+
// Record the aggregate transition so the sensing layer has a
|
|
777
|
+
// reliable "in status since" for every status (Wave 2).
|
|
778
|
+
await recordStateTransition({
|
|
779
|
+
db,
|
|
780
|
+
systemId,
|
|
781
|
+
configurationId: configId,
|
|
782
|
+
fromStatus: previousStatus,
|
|
783
|
+
toStatus: newState.status,
|
|
784
|
+
});
|
|
785
|
+
|
|
904
786
|
await notifyStateChange({
|
|
905
787
|
notificationClient,
|
|
906
788
|
systemId,
|
|
@@ -916,24 +798,6 @@ async function executeHealthCheckJob(props: {
|
|
|
916
798
|
});
|
|
917
799
|
}
|
|
918
800
|
|
|
919
|
-
// Per-check auto-incident: runs whether or not the aggregate
|
|
920
|
-
// changed (a check can transition to unhealthy without flipping
|
|
921
|
-
// the aggregate if another check is already unhealthy).
|
|
922
|
-
await maybeOpenAutoIncidentForCheck({
|
|
923
|
-
db,
|
|
924
|
-
service,
|
|
925
|
-
incidentClient,
|
|
926
|
-
maintenanceClient,
|
|
927
|
-
logger,
|
|
928
|
-
systemId,
|
|
929
|
-
systemName,
|
|
930
|
-
configurationId: configId,
|
|
931
|
-
configurationName: configRow.configName,
|
|
932
|
-
getEmitHook,
|
|
933
|
-
previousState,
|
|
934
|
-
newState,
|
|
935
|
-
});
|
|
936
|
-
|
|
937
801
|
return;
|
|
938
802
|
} finally {
|
|
939
803
|
if (connectedClient) {
|
|
@@ -962,28 +826,48 @@ async function executeHealthCheckJob(props: {
|
|
|
962
826
|
},
|
|
963
827
|
};
|
|
964
828
|
|
|
965
|
-
//
|
|
966
|
-
|
|
967
|
-
|
|
829
|
+
// Persist the run + aggregate THROUGH the reactive `health` entity on
|
|
830
|
+
// every run (§10.3): `apply` does the durable write (insert + hourly
|
|
831
|
+
// aggregate) and returns the freshly-computed view. The framework
|
|
832
|
+
// snapshots `prev` via the COMPUTE-ON-READ accessor BEFORE this insert, so
|
|
833
|
+
// an unchanged aggregate is a no-op and a real status change drives the
|
|
834
|
+
// directional/umbrella trigger events via `deriveHealthTriggerEvents` —
|
|
835
|
+
// exactly one correct `ENTITY_CHANGED` with accurate prev → next.
|
|
836
|
+
let newState!: AggregatedHealth;
|
|
837
|
+
await writeHealthEntity({
|
|
838
|
+
handle: getHealthEntity?.(),
|
|
968
839
|
systemId,
|
|
969
|
-
|
|
970
|
-
|
|
971
|
-
|
|
972
|
-
|
|
973
|
-
|
|
974
|
-
|
|
840
|
+
apply: async () => {
|
|
841
|
+
// Store result (spread to convert structured type to plain record for jsonb)
|
|
842
|
+
await db.insert(healthCheckRuns).values({
|
|
843
|
+
configurationId: configId,
|
|
844
|
+
systemId,
|
|
845
|
+
status: result.status,
|
|
846
|
+
latencyMs: result.latencyMs,
|
|
847
|
+
result: { ...result } as Record<string, unknown>,
|
|
848
|
+
sourceId: undefined,
|
|
849
|
+
sourceLabel: "Local",
|
|
850
|
+
});
|
|
975
851
|
|
|
976
|
-
|
|
977
|
-
|
|
978
|
-
|
|
979
|
-
|
|
980
|
-
|
|
981
|
-
|
|
982
|
-
|
|
983
|
-
|
|
984
|
-
|
|
985
|
-
|
|
986
|
-
|
|
852
|
+
// Trigger incremental hourly aggregation
|
|
853
|
+
await incrementHourlyAggregate({
|
|
854
|
+
db,
|
|
855
|
+
systemId,
|
|
856
|
+
configurationId: configId,
|
|
857
|
+
status: result.status,
|
|
858
|
+
latencyMs: result.latencyMs,
|
|
859
|
+
runTimestamp: new Date(),
|
|
860
|
+
result: { ...result } as Record<string, unknown>,
|
|
861
|
+
collectorRegistry,
|
|
862
|
+
sourceLabel: "Local",
|
|
863
|
+
});
|
|
864
|
+
|
|
865
|
+
newState = await service.getSystemHealthStatus(systemId);
|
|
866
|
+
return toHealthEntityView(newState);
|
|
867
|
+
},
|
|
868
|
+
serialize: serializeHealthWrite,
|
|
869
|
+
onError: (error) =>
|
|
870
|
+
logger.warn(`Failed to mirror health entity for ${systemId}`, error),
|
|
987
871
|
});
|
|
988
872
|
|
|
989
873
|
logger.debug(
|
|
@@ -1013,9 +897,17 @@ async function executeHealthCheckJob(props: {
|
|
|
1013
897
|
result: (result.metadata?.collectors as Record<string, unknown>) ?? undefined,
|
|
1014
898
|
});
|
|
1015
899
|
|
|
1016
|
-
// Check if aggregated state changed and notify subscribers
|
|
1017
|
-
const newState = await service.getSystemHealthStatus(systemId);
|
|
1018
900
|
if (newState.status !== previousStatus) {
|
|
901
|
+
// Record the aggregate transition so the sensing layer has a
|
|
902
|
+
// reliable "in status since" for every status (Wave 2).
|
|
903
|
+
await recordStateTransition({
|
|
904
|
+
db,
|
|
905
|
+
systemId,
|
|
906
|
+
configurationId: configId,
|
|
907
|
+
fromStatus: previousStatus,
|
|
908
|
+
toStatus: newState.status,
|
|
909
|
+
});
|
|
910
|
+
|
|
1019
911
|
await notifyStateChange({
|
|
1020
912
|
notificationClient,
|
|
1021
913
|
systemId,
|
|
@@ -1037,77 +929,13 @@ async function executeHealthCheckJob(props: {
|
|
|
1037
929
|
newStatus: newState.status,
|
|
1038
930
|
});
|
|
1039
931
|
|
|
1040
|
-
//
|
|
1041
|
-
|
|
1042
|
-
|
|
1043
|
-
|
|
1044
|
-
|
|
1045
|
-
).length;
|
|
1046
|
-
const totalChecks = newState.checkStatuses.length;
|
|
1047
|
-
const timestamp = new Date().toISOString();
|
|
1048
|
-
|
|
1049
|
-
if (newState.status === "healthy" && previousStatus !== "healthy") {
|
|
1050
|
-
// Recovery: system became healthy
|
|
1051
|
-
await emitHook(healthCheckHooks.systemHealthy, {
|
|
1052
|
-
systemId,
|
|
1053
|
-
previousStatus,
|
|
1054
|
-
healthyChecks,
|
|
1055
|
-
totalChecks,
|
|
1056
|
-
timestamp,
|
|
1057
|
-
});
|
|
1058
|
-
logger.debug(
|
|
1059
|
-
`Emitted systemHealthy hook: ${previousStatus} → ${newState.status}`,
|
|
1060
|
-
);
|
|
1061
|
-
} else if (
|
|
1062
|
-
previousStatus === "healthy" &&
|
|
1063
|
-
newState.status !== "healthy"
|
|
1064
|
-
) {
|
|
1065
|
-
// Degradation: system went from healthy to unhealthy/degraded
|
|
1066
|
-
await emitHook(healthCheckHooks.systemDegraded, {
|
|
1067
|
-
systemId,
|
|
1068
|
-
previousStatus,
|
|
1069
|
-
newStatus: newState.status,
|
|
1070
|
-
healthyChecks,
|
|
1071
|
-
totalChecks,
|
|
1072
|
-
timestamp,
|
|
1073
|
-
});
|
|
1074
|
-
logger.debug(
|
|
1075
|
-
`Emitted systemDegraded hook: ${previousStatus} → ${newState.status}`,
|
|
1076
|
-
);
|
|
1077
|
-
}
|
|
1078
|
-
|
|
1079
|
-
// Umbrella hook — fires on every transition. Emitted alongside
|
|
1080
|
-
// the directional hooks so existing subscribers stay unchanged
|
|
1081
|
-
// while new automation triggers can react to any change.
|
|
1082
|
-
if (previousStatus !== newState.status) {
|
|
1083
|
-
await emitHook(healthCheckHooks.systemHealthChanged, {
|
|
1084
|
-
systemId,
|
|
1085
|
-
previousStatus,
|
|
1086
|
-
newStatus: newState.status,
|
|
1087
|
-
healthyChecks,
|
|
1088
|
-
totalChecks,
|
|
1089
|
-
timestamp,
|
|
1090
|
-
});
|
|
1091
|
-
}
|
|
1092
|
-
}
|
|
932
|
+
// The directional + umbrella system-health hooks were removed in
|
|
933
|
+
// Phase 4 (§10.3): the `health` entity mirror above is the single
|
|
934
|
+
// source of truth, and its change deriver fires the
|
|
935
|
+
// `healthcheck.system_degraded` / `_healthy` / `_health_changed`
|
|
936
|
+
// trigger events through Stage-1 routing. Nothing to emit here.
|
|
1093
937
|
}
|
|
1094
938
|
|
|
1095
|
-
// Per-check auto-incident: see comment on the failed-execution path.
|
|
1096
|
-
await maybeOpenAutoIncidentForCheck({
|
|
1097
|
-
db,
|
|
1098
|
-
service,
|
|
1099
|
-
incidentClient,
|
|
1100
|
-
maintenanceClient,
|
|
1101
|
-
logger,
|
|
1102
|
-
systemId,
|
|
1103
|
-
systemName,
|
|
1104
|
-
configurationId: configId,
|
|
1105
|
-
configurationName: configRow.configName,
|
|
1106
|
-
getEmitHook,
|
|
1107
|
-
previousState,
|
|
1108
|
-
newState,
|
|
1109
|
-
});
|
|
1110
|
-
|
|
1111
939
|
// Note: No manual rescheduling needed - recurring job handles it automatically
|
|
1112
940
|
} catch (error) {
|
|
1113
941
|
logger.error(
|
|
@@ -1115,27 +943,48 @@ async function executeHealthCheckJob(props: {
|
|
|
1115
943
|
error,
|
|
1116
944
|
);
|
|
1117
945
|
|
|
1118
|
-
//
|
|
1119
|
-
|
|
1120
|
-
|
|
946
|
+
// Persist the failure run + aggregate THROUGH the reactive `health`
|
|
947
|
+
// entity: `apply` does the durable write and returns the freshly-computed
|
|
948
|
+
// view. The framework snapshots `prev` via the compute-on-read accessor
|
|
949
|
+
// BEFORE this insert, so a real status change emits exactly one correct
|
|
950
|
+
// `ENTITY_CHANGED` (§10.3). See the success path for the full rationale.
|
|
951
|
+
let newState!: AggregatedHealth;
|
|
952
|
+
await writeHealthEntity({
|
|
953
|
+
handle: getHealthEntity?.(),
|
|
1121
954
|
systemId,
|
|
1122
|
-
|
|
1123
|
-
|
|
1124
|
-
|
|
1125
|
-
|
|
1126
|
-
|
|
955
|
+
apply: async () => {
|
|
956
|
+
// Store failure (no latencyMs for failures)
|
|
957
|
+
await db.insert(healthCheckRuns).values({
|
|
958
|
+
configurationId: configId,
|
|
959
|
+
systemId,
|
|
960
|
+
status: "unhealthy",
|
|
961
|
+
result: { error: String(error) } as Record<string, unknown>,
|
|
962
|
+
sourceId: undefined,
|
|
963
|
+
sourceLabel: "Local",
|
|
964
|
+
});
|
|
1127
965
|
|
|
1128
|
-
|
|
1129
|
-
|
|
1130
|
-
|
|
1131
|
-
|
|
1132
|
-
|
|
1133
|
-
|
|
1134
|
-
|
|
1135
|
-
|
|
1136
|
-
|
|
1137
|
-
|
|
1138
|
-
|
|
966
|
+
// Trigger incremental hourly aggregation
|
|
967
|
+
await incrementHourlyAggregate({
|
|
968
|
+
db,
|
|
969
|
+
systemId,
|
|
970
|
+
configurationId: configId,
|
|
971
|
+
status: "unhealthy",
|
|
972
|
+
latencyMs: undefined,
|
|
973
|
+
runTimestamp: new Date(),
|
|
974
|
+
// No collector data for error cases
|
|
975
|
+
collectorRegistry,
|
|
976
|
+
sourceLabel: "Local",
|
|
977
|
+
});
|
|
978
|
+
|
|
979
|
+
newState = await service.getSystemHealthStatus(systemId);
|
|
980
|
+
return toHealthEntityView(newState);
|
|
981
|
+
},
|
|
982
|
+
serialize: serializeHealthWrite,
|
|
983
|
+
onError: (mirrorError) =>
|
|
984
|
+
logger.warn(
|
|
985
|
+
`Failed to mirror health entity for ${systemId}`,
|
|
986
|
+
mirrorError,
|
|
987
|
+
),
|
|
1139
988
|
});
|
|
1140
989
|
|
|
1141
990
|
// Try to fetch names for the enriched signal (best-effort)
|
|
@@ -1179,9 +1028,17 @@ async function executeHealthCheckJob(props: {
|
|
|
1179
1028
|
result: undefined,
|
|
1180
1029
|
});
|
|
1181
1030
|
|
|
1182
|
-
// Check if aggregated state changed and notify subscribers
|
|
1183
|
-
const newState = await service.getSystemHealthStatus(systemId);
|
|
1184
1031
|
if (newState.status !== previousStatus) {
|
|
1032
|
+
// Record the aggregate transition so the sensing layer has a
|
|
1033
|
+
// reliable "in status since" for every status (Wave 2).
|
|
1034
|
+
await recordStateTransition({
|
|
1035
|
+
db,
|
|
1036
|
+
systemId,
|
|
1037
|
+
configurationId: configId,
|
|
1038
|
+
fromStatus: previousStatus,
|
|
1039
|
+
toStatus: newState.status,
|
|
1040
|
+
});
|
|
1041
|
+
|
|
1185
1042
|
await notifyStateChange({
|
|
1186
1043
|
notificationClient,
|
|
1187
1044
|
systemId,
|
|
@@ -1203,77 +1060,13 @@ async function executeHealthCheckJob(props: {
|
|
|
1203
1060
|
newStatus: newState.status,
|
|
1204
1061
|
});
|
|
1205
1062
|
|
|
1206
|
-
//
|
|
1207
|
-
|
|
1208
|
-
|
|
1209
|
-
|
|
1210
|
-
|
|
1211
|
-
).length;
|
|
1212
|
-
const totalChecks = newState.checkStatuses.length;
|
|
1213
|
-
const timestamp = new Date().toISOString();
|
|
1214
|
-
|
|
1215
|
-
if (newState.status === "healthy" && previousStatus !== "healthy") {
|
|
1216
|
-
// Recovery: system became healthy
|
|
1217
|
-
await emitHook(healthCheckHooks.systemHealthy, {
|
|
1218
|
-
systemId,
|
|
1219
|
-
previousStatus,
|
|
1220
|
-
healthyChecks,
|
|
1221
|
-
totalChecks,
|
|
1222
|
-
timestamp,
|
|
1223
|
-
});
|
|
1224
|
-
logger.debug(
|
|
1225
|
-
`Emitted systemHealthy hook: ${previousStatus} → ${newState.status}`,
|
|
1226
|
-
);
|
|
1227
|
-
} else if (
|
|
1228
|
-
previousStatus === "healthy" &&
|
|
1229
|
-
newState.status !== "healthy"
|
|
1230
|
-
) {
|
|
1231
|
-
// Degradation: system went from healthy to unhealthy/degraded
|
|
1232
|
-
await emitHook(healthCheckHooks.systemDegraded, {
|
|
1233
|
-
systemId,
|
|
1234
|
-
previousStatus,
|
|
1235
|
-
newStatus: newState.status,
|
|
1236
|
-
healthyChecks,
|
|
1237
|
-
totalChecks,
|
|
1238
|
-
timestamp,
|
|
1239
|
-
});
|
|
1240
|
-
logger.debug(
|
|
1241
|
-
`Emitted systemDegraded hook: ${previousStatus} → ${newState.status}`,
|
|
1242
|
-
);
|
|
1243
|
-
}
|
|
1244
|
-
|
|
1245
|
-
// Umbrella hook — fires on every transition. Emitted alongside
|
|
1246
|
-
// the directional hooks so existing subscribers stay unchanged
|
|
1247
|
-
// while new automation triggers can react to any change.
|
|
1248
|
-
if (previousStatus !== newState.status) {
|
|
1249
|
-
await emitHook(healthCheckHooks.systemHealthChanged, {
|
|
1250
|
-
systemId,
|
|
1251
|
-
previousStatus,
|
|
1252
|
-
newStatus: newState.status,
|
|
1253
|
-
healthyChecks,
|
|
1254
|
-
totalChecks,
|
|
1255
|
-
timestamp,
|
|
1256
|
-
});
|
|
1257
|
-
}
|
|
1258
|
-
}
|
|
1063
|
+
// The directional + umbrella system-health hooks were removed in
|
|
1064
|
+
// Phase 4 (§10.3): the `health` entity mirror above is the single
|
|
1065
|
+
// source of truth, and its change deriver fires the
|
|
1066
|
+
// `healthcheck.system_degraded` / `_healthy` / `_health_changed`
|
|
1067
|
+
// trigger events through Stage-1 routing. Nothing to emit here.
|
|
1259
1068
|
}
|
|
1260
1069
|
|
|
1261
|
-
// Per-check auto-incident: see comment on the failed-execution path.
|
|
1262
|
-
await maybeOpenAutoIncidentForCheck({
|
|
1263
|
-
db,
|
|
1264
|
-
service,
|
|
1265
|
-
incidentClient,
|
|
1266
|
-
maintenanceClient,
|
|
1267
|
-
logger,
|
|
1268
|
-
systemId,
|
|
1269
|
-
systemName,
|
|
1270
|
-
configurationId: configId,
|
|
1271
|
-
configurationName: configName,
|
|
1272
|
-
getEmitHook,
|
|
1273
|
-
previousState,
|
|
1274
|
-
newState,
|
|
1275
|
-
});
|
|
1276
|
-
|
|
1277
1070
|
// Note: No manual rescheduling needed - recurring job handles it automatically
|
|
1278
1071
|
}
|
|
1279
1072
|
}
|
|
@@ -1290,7 +1083,9 @@ export async function setupHealthCheckWorker(props: {
|
|
|
1290
1083
|
maintenanceClient: MaintenanceClient;
|
|
1291
1084
|
incidentClient: IncidentClient;
|
|
1292
1085
|
getEmitHook: () => EmitHookFn | undefined;
|
|
1086
|
+
getHealthEntity?: () => EntityHandle<HealthEntityState> | undefined;
|
|
1293
1087
|
cache: HealthCheckCache;
|
|
1088
|
+
secretResolver?: SecretResolverService;
|
|
1294
1089
|
}): Promise<void> {
|
|
1295
1090
|
const {
|
|
1296
1091
|
db,
|
|
@@ -1304,7 +1099,9 @@ export async function setupHealthCheckWorker(props: {
|
|
|
1304
1099
|
maintenanceClient,
|
|
1305
1100
|
incidentClient,
|
|
1306
1101
|
getEmitHook,
|
|
1102
|
+
getHealthEntity,
|
|
1307
1103
|
cache,
|
|
1104
|
+
secretResolver,
|
|
1308
1105
|
} = props;
|
|
1309
1106
|
|
|
1310
1107
|
const queue =
|
|
@@ -1325,7 +1122,9 @@ export async function setupHealthCheckWorker(props: {
|
|
|
1325
1122
|
maintenanceClient,
|
|
1326
1123
|
incidentClient,
|
|
1327
1124
|
getEmitHook,
|
|
1125
|
+
getHealthEntity,
|
|
1328
1126
|
cache,
|
|
1127
|
+
secretResolver,
|
|
1329
1128
|
});
|
|
1330
1129
|
},
|
|
1331
1130
|
{
|