@checkstack/healthcheck-backend 1.4.0 → 1.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +303 -0
- package/drizzle/0018_abnormal_preak.sql +10 -0
- package/drizzle/meta/0018_snapshot.json +600 -0
- package/drizzle/meta/_journal.json +7 -0
- package/package.json +26 -21
- package/src/ai/assertion-validation.test.ts +117 -0
- package/src/ai/assertion-validation.ts +147 -0
- package/src/ai/healthcheck-capabilities.test.ts +158 -0
- package/src/ai/healthcheck-capabilities.ts +217 -0
- package/src/ai/healthcheck-delete.test.ts +81 -0
- package/src/ai/healthcheck-delete.ts +81 -0
- package/src/ai/healthcheck-projection.test.ts +36 -0
- package/src/ai/healthcheck-propose.test.ts +268 -0
- package/src/ai/healthcheck-propose.ts +290 -0
- package/src/ai/healthcheck-script-tools.test.ts +93 -0
- package/src/ai/healthcheck-script-tools.ts +179 -0
- package/src/ai/healthcheck-update.test.ts +123 -0
- package/src/ai/healthcheck-update.ts +123 -0
- package/src/ai/notify-subscribers.test.ts +109 -0
- package/src/ai/notify-subscribers.ts +176 -0
- package/src/ai/register-ai-tools.test.ts +41 -0
- package/src/ai/register-ai-tools.ts +53 -0
- package/src/ai/shell-env-table.test.ts +47 -0
- package/src/automations.test.ts +2 -1
- package/src/automations.ts +9 -1
- package/src/collector-script-test.test.ts +53 -1
- package/src/collector-script-test.ts +59 -7
- package/src/effective-environments.test.ts +93 -0
- package/src/effective-environments.ts +64 -0
- package/src/health-entity-id.ts +57 -0
- package/src/health-entity.test.ts +405 -31
- package/src/health-entity.ts +99 -43
- package/src/health-state.ts +41 -4
- package/src/healthcheck-gitops-kinds.test.ts +95 -0
- package/src/healthcheck-gitops-kinds.ts +56 -13
- package/src/index.ts +33 -0
- package/src/migration-chain-contract.test.ts +57 -0
- package/src/queue-executor.test.ts +814 -0
- package/src/queue-executor.ts +342 -50
- package/src/realtime-aggregation.test.ts +30 -0
- package/src/realtime-aggregation.ts +16 -0
- package/src/retention-job.ts +167 -93
- package/src/retention-rollup.test.ts +118 -0
- package/src/router.test.ts +120 -1
- package/src/router.ts +20 -0
- package/src/schema.ts +44 -6
- package/src/service.ts +199 -43
- package/src/state-evaluator.test.ts +50 -5
- package/src/state-evaluator.ts +9 -2
- package/src/state-transitions.test.ts +104 -0
- package/src/state-transitions.ts +39 -1
- package/src/validate-configuration.test.ts +205 -0
- package/src/validate-configuration.ts +159 -0
- package/tsconfig.json +9 -0
package/src/queue-executor.ts
CHANGED
|
@@ -9,6 +9,8 @@ import {
|
|
|
9
9
|
type ConnectedClient,
|
|
10
10
|
type TransportClient,
|
|
11
11
|
type CollectorRunContext,
|
|
12
|
+
type AdvisoryLockService,
|
|
13
|
+
renderTemplatableConfig,
|
|
12
14
|
} from "@checkstack/backend-api";
|
|
13
15
|
import { QueueManager } from "@checkstack/queue-api";
|
|
14
16
|
import {
|
|
@@ -22,6 +24,7 @@ import { type SignalService } from "@checkstack/signal-common";
|
|
|
22
24
|
import {
|
|
23
25
|
HEALTH_CHECK_RUN_COMPLETED,
|
|
24
26
|
SYSTEM_STATUS_CHANGED,
|
|
27
|
+
ENVIRONMENT_RESOLUTION_FAILED,
|
|
25
28
|
type HealthCheckStatus,
|
|
26
29
|
stripEphemeralFields,
|
|
27
30
|
} from "@checkstack/healthcheck-common";
|
|
@@ -29,7 +32,12 @@ import {
|
|
|
29
32
|
CatalogApi,
|
|
30
33
|
catalogRoutes,
|
|
31
34
|
createSystemSubject,
|
|
35
|
+
type Environment,
|
|
32
36
|
} from "@checkstack/catalog-common";
|
|
37
|
+
import {
|
|
38
|
+
resolveEffectiveEnvironments,
|
|
39
|
+
type EffectiveEnvironment,
|
|
40
|
+
} from "./effective-environments";
|
|
33
41
|
import { systemHealthCollapseKey } from "@checkstack/healthcheck-common";
|
|
34
42
|
import { MaintenanceApi } from "@checkstack/maintenance-common";
|
|
35
43
|
import { IncidentApi } from "@checkstack/incident-common";
|
|
@@ -52,6 +60,7 @@ import {
|
|
|
52
60
|
createHealthEntitySerializer,
|
|
53
61
|
type HealthEntityState,
|
|
54
62
|
} from "./health-entity";
|
|
63
|
+
import { encodeHealthEntityId } from "./health-entity-id";
|
|
55
64
|
import type { EntityHandle } from "@checkstack/automation-backend";
|
|
56
65
|
|
|
57
66
|
type Db = SafeDatabase<typeof schema>;
|
|
@@ -375,6 +384,7 @@ async function notifyStateChange(props: {
|
|
|
375
384
|
async function executeHealthCheckJob(props: {
|
|
376
385
|
payload: HealthCheckJobPayload;
|
|
377
386
|
db: Db;
|
|
387
|
+
advisoryLock: AdvisoryLockService;
|
|
378
388
|
registry: HealthCheckRegistry;
|
|
379
389
|
collectorRegistry: CollectorRegistry;
|
|
380
390
|
logger: Logger;
|
|
@@ -404,6 +414,7 @@ async function executeHealthCheckJob(props: {
|
|
|
404
414
|
const {
|
|
405
415
|
payload,
|
|
406
416
|
db,
|
|
417
|
+
advisoryLock,
|
|
407
418
|
registry,
|
|
408
419
|
collectorRegistry,
|
|
409
420
|
logger,
|
|
@@ -422,17 +433,22 @@ async function executeHealthCheckJob(props: {
|
|
|
422
433
|
// Create service for aggregated state evaluation
|
|
423
434
|
const service = new HealthCheckService(db, registry, collectorRegistry);
|
|
424
435
|
|
|
425
|
-
// Per-
|
|
426
|
-
// transaction-scoped advisory lock keyed `health:<
|
|
427
|
-
// snapshot-prev + apply + diff + emit so concurrent evaluations
|
|
428
|
-
//
|
|
429
|
-
//
|
|
430
|
-
//
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
+
// Per-ENTITY serializer factory for the reactive health mutate (§10.3,
|
|
437
|
+
// Phase 3b): a transaction-scoped advisory lock keyed `health:<entityId>`
|
|
438
|
+
// wraps the snapshot-prev + apply + diff + emit so concurrent evaluations
|
|
439
|
+
// of one (system, environment) — or of the system rollup — can't double-emit
|
|
440
|
+
// a single logical transition. Bound to the qualified entity id at each
|
|
441
|
+
// `writeHealthEntity` call so distinct envs / the rollup don't block each
|
|
442
|
+
// other.
|
|
443
|
+
const makeHealthSerializer = createHealthEntitySerializer({ advisoryLock });
|
|
444
|
+
|
|
445
|
+
// The system-rollup status BEFORE this tick (all environments + env-less).
|
|
446
|
+
// Captured once so the post-loop rollup write (§7.4.3) — and the
|
|
447
|
+
// catastrophic-failure path — can record a correct prev → next rollup
|
|
448
|
+
// transition (environmentId = null). This is the system-wide aggregate read
|
|
449
|
+
// the executor has always taken first.
|
|
450
|
+
const rollupPreviousState = await service.getSystemHealthStatus(systemId);
|
|
451
|
+
const rollupPreviousStatus = rollupPreviousState.status;
|
|
436
452
|
|
|
437
453
|
try {
|
|
438
454
|
// Fetch configuration (including name for signals)
|
|
@@ -448,6 +464,7 @@ async function executeHealthCheckJob(props: {
|
|
|
448
464
|
paused: healthCheckConfigurations.paused,
|
|
449
465
|
includeLocal: systemHealthChecks.includeLocal,
|
|
450
466
|
satelliteIds: systemHealthChecks.satelliteIds,
|
|
467
|
+
environmentIds: systemHealthChecks.environmentIds,
|
|
451
468
|
})
|
|
452
469
|
.from(systemHealthChecks)
|
|
453
470
|
.innerJoin(
|
|
@@ -503,17 +520,6 @@ async function executeHealthCheckJob(props: {
|
|
|
503
520
|
logger.debug(`Could not fetch system name for ${systemId}, using ID`);
|
|
504
521
|
}
|
|
505
522
|
|
|
506
|
-
// Curated, read-only run-context metadata exposed to collectors.
|
|
507
|
-
// Metadata only - never secrets or config.
|
|
508
|
-
const runContext: CollectorRunContext = {
|
|
509
|
-
check: {
|
|
510
|
-
id: configId,
|
|
511
|
-
name: configRow.configName || configId,
|
|
512
|
-
intervalSeconds: configRow.interval,
|
|
513
|
-
},
|
|
514
|
-
system: { id: systemId, name: systemName },
|
|
515
|
-
};
|
|
516
|
-
|
|
517
523
|
const strategy = registry.getStrategy(configRow.strategyId);
|
|
518
524
|
if (!strategy) {
|
|
519
525
|
logger.warn(
|
|
@@ -522,10 +528,158 @@ async function executeHealthCheckJob(props: {
|
|
|
522
528
|
return;
|
|
523
529
|
}
|
|
524
530
|
|
|
525
|
-
//
|
|
526
|
-
|
|
531
|
+
// Migrate the stored (UNVERSIONED) strategy config ONCE, before the
|
|
532
|
+
// per-environment render loop, so every env renders from the same
|
|
533
|
+
// migrated shape. Stored configs predate explicit versioning and may be
|
|
534
|
+
// genuinely v1 (e.g. an HTTP config still carrying url/method); assume-v1
|
|
535
|
+
// -on-read runs the declared migration chain, then validates. The
|
|
536
|
+
// migrations are idempotent, so an already-current config is a no-op.
|
|
537
|
+
const strategyConfig: BaseStrategyConfig =
|
|
538
|
+
await strategy.config.parseAssumingV1(configRow.config);
|
|
527
539
|
const executionTimeout = strategyConfig.timeout ?? 60_000;
|
|
528
540
|
|
|
541
|
+
// ── Per-environment fan-out (§7) ────────────────────────────────────────
|
|
542
|
+
// Resolve the effective environment set from the assignment + the
|
|
543
|
+
// system's current catalog membership, then run ONCE PER environment.
|
|
544
|
+
// An empty effective set (opt-out `[]`, or `null` with no membership)
|
|
545
|
+
// collapses to a single env-less run with `environment` unset — exactly
|
|
546
|
+
// the pre-feature behavior. Membership lives ONLY in the catalog Postgres
|
|
547
|
+
// tables and is re-read every tick via the cross-plugin RPC, so every pod
|
|
548
|
+
// resolves the same set (state-and-scale: no pod-local env state).
|
|
549
|
+
let membership: Environment[] = [];
|
|
550
|
+
try {
|
|
551
|
+
membership = await catalogClient.resolveSystemEnvironments({ systemId });
|
|
552
|
+
} catch (error) {
|
|
553
|
+
// Fail-open: a catalog read failure must not wedge the check. Degrade
|
|
554
|
+
// to an env-less run (today's behavior) rather than skipping the tick.
|
|
555
|
+
logger.warn(
|
|
556
|
+
`Could not resolve environments for system ${systemId}, running env-less`,
|
|
557
|
+
error,
|
|
558
|
+
);
|
|
559
|
+
// Observability: a `logger.warn` alone is easy to miss when a durable
|
|
560
|
+
// catalog misconfig (or outage) silently strips per-environment fan-out.
|
|
561
|
+
// Broadcast a counter-style signal so the degradation is observable.
|
|
562
|
+
// Best-effort — never let the signal break the (still-running) check.
|
|
563
|
+
try {
|
|
564
|
+
await signalService.broadcast(ENVIRONMENT_RESOLUTION_FAILED, {
|
|
565
|
+
systemId,
|
|
566
|
+
configurationId: configId,
|
|
567
|
+
error: extractErrorMessage(error),
|
|
568
|
+
});
|
|
569
|
+
} catch (signalError) {
|
|
570
|
+
logger.warn(
|
|
571
|
+
`Failed to broadcast environment-resolution-failed signal for ${systemId}`,
|
|
572
|
+
signalError,
|
|
573
|
+
);
|
|
574
|
+
}
|
|
575
|
+
}
|
|
576
|
+
const effectiveEnvs = resolveEffectiveEnvironments({
|
|
577
|
+
environmentIds: configRow.environmentIds,
|
|
578
|
+
membership,
|
|
579
|
+
});
|
|
580
|
+
// `null` env => the single env-less run. Each entry => one run per env.
|
|
581
|
+
const runEnvironments: (EffectiveEnvironment | null)[] =
|
|
582
|
+
effectiveEnvs.length > 0 ? effectiveEnvs : [null];
|
|
583
|
+
|
|
584
|
+
// Execute one run per effective environment. Runs are independent (own
|
|
585
|
+
// status / latency / result) and persisted with their own
|
|
586
|
+
// `environmentId`. Phase 3b: each env-run mutates its OWN env-qualified
|
|
587
|
+
// `health` entity (`<systemId>::<environmentId>`, or the bare `<systemId>`
|
|
588
|
+
// for the env-less run) through a per-entity serializer; after the loop a
|
|
589
|
+
// single ROLLUP write for the bare `<systemId>` recomputes the worst-status
|
|
590
|
+
// rollup so system-level consumers keep firing off the unchanged id.
|
|
591
|
+
//
|
|
592
|
+
// Track whether ANY per-env run persisted (so the rollup write only runs
|
|
593
|
+
// when there is something to roll up — an all-failed loop still leaves the
|
|
594
|
+
// durable runs the per-env apply already wrote).
|
|
595
|
+
let anyEnvRunPersisted = false;
|
|
596
|
+
// Whether this tick fans out into REAL environments (vs. the single
|
|
597
|
+
// env-less run). When env-less, the loop's lone write already targets the
|
|
598
|
+
// bare `<systemId>` entity — which IS the rollup — so no separate rollup
|
|
599
|
+
// write is needed. With real envs, the loop writes `<systemId>::<env>`
|
|
600
|
+
// entities and we recompute the bare-`<systemId>` rollup after the loop.
|
|
601
|
+
const isFannedOut = effectiveEnvs.length > 0;
|
|
602
|
+
for (const environment of runEnvironments) {
|
|
603
|
+
const environmentId = environment?.id ?? null;
|
|
604
|
+
// The env-qualified entity id this run mutates. For the env-less run
|
|
605
|
+
// (environmentId === null) this is the bare systemId — which is also the
|
|
606
|
+
// rollup id, so the env-less run IS the rollup (no separate rollup write
|
|
607
|
+
// is needed when the system has no environments — see below).
|
|
608
|
+
const envEntityId = encodeHealthEntityId({ systemId, environmentId });
|
|
609
|
+
const serializeEnvWrite = makeHealthSerializer(envEntityId);
|
|
610
|
+
|
|
611
|
+
// Per-env baseline status for the transition log: the env-scoped
|
|
612
|
+
// aggregate BEFORE this run. Computed per env so a transition row is
|
|
613
|
+
// recorded against the right (system, environment) streak.
|
|
614
|
+
const previousState = await service.getSystemHealthStatus(
|
|
615
|
+
systemId,
|
|
616
|
+
environmentId,
|
|
617
|
+
);
|
|
618
|
+
const previousStatus = previousState.status;
|
|
619
|
+
|
|
620
|
+
// Curated, read-only run-context metadata exposed to collectors.
|
|
621
|
+
// Metadata only - never secrets or config. `environment` carries the
|
|
622
|
+
// resolved env's verbatim custom fields for this run (Phase 2 surfaces
|
|
623
|
+
// consume it); absent for the env-less run.
|
|
624
|
+
const runContext: CollectorRunContext = {
|
|
625
|
+
check: {
|
|
626
|
+
id: configId,
|
|
627
|
+
name: configRow.configName || configId,
|
|
628
|
+
intervalSeconds: configRow.interval,
|
|
629
|
+
},
|
|
630
|
+
system: { id: systemId, name: systemName },
|
|
631
|
+
...(environment
|
|
632
|
+
? {
|
|
633
|
+
environment: {
|
|
634
|
+
id: environment.id,
|
|
635
|
+
name: environment.name,
|
|
636
|
+
fields: environment.fields,
|
|
637
|
+
},
|
|
638
|
+
}
|
|
639
|
+
: {}),
|
|
640
|
+
};
|
|
641
|
+
|
|
642
|
+
// Templating context for the per-env config render pass (§6.3.3).
|
|
643
|
+
// Carries only environment custom fields + curated check/system
|
|
644
|
+
// metadata - never secrets. `{{ environment.baseUrl }}` resolves from
|
|
645
|
+
// the resolved env's verbatim fields; an env-less run gets `{}` so a
|
|
646
|
+
// reference renders to empty string (strict: false); see the debug log
|
|
647
|
+
// below.
|
|
648
|
+
const templateContext = {
|
|
649
|
+
environment: runContext.environment?.fields ?? {},
|
|
650
|
+
check: runContext.check,
|
|
651
|
+
system: runContext.system,
|
|
652
|
+
};
|
|
653
|
+
if (!runContext.environment) {
|
|
654
|
+
// §11.6: render-empty when a run has no environment. An env-less run is
|
|
655
|
+
// a legitimate, documented configuration (the None assignment mode, or
|
|
656
|
+
// All-environments with no membership), and it recurs every interval -
|
|
657
|
+
// so this is `debug`, not `warn`, to avoid spamming the log. When an
|
|
658
|
+
// empty `{{ environment.* }}` render actually matters, the HTTP
|
|
659
|
+
// post-render `.url()` check already fails the run with a concrete
|
|
660
|
+
// "Rendered URL is invalid" error; we do not inspect every field here.
|
|
661
|
+
logger.debug(
|
|
662
|
+
`Health check ${configId} for system ${systemId} ran with no environment; ` +
|
|
663
|
+
`any {{ environment.* }} references render to empty string`,
|
|
664
|
+
);
|
|
665
|
+
}
|
|
666
|
+
|
|
667
|
+
// (2) Environment/templating pass (NEW) - renders `{{ environment.* }}`
|
|
668
|
+
// etc. in `x-templatable` fields. Runs PER ENVIRONMENT, AFTER the secret
|
|
669
|
+
// resolution (secrets first, templating second - §6.3.4) and BEFORE the
|
|
670
|
+
// strategy client build, so each env gets its own rendered strategy
|
|
671
|
+
// config + client. The collector configs are rendered just before each
|
|
672
|
+
// collector executes (below) so the secretEnv resolution stays first.
|
|
673
|
+
const renderedStrategyConfig = renderTemplatableConfig({
|
|
674
|
+
config: strategyConfig,
|
|
675
|
+
schema: strategy.config.schema,
|
|
676
|
+
context: templateContext,
|
|
677
|
+
}) as BaseStrategyConfig;
|
|
678
|
+
|
|
679
|
+
// Per-environment isolation: an unexpected failure persisting ONE
|
|
680
|
+
// environment's run must not abort the sibling environments' runs.
|
|
681
|
+
// Each iteration's run is independent (§7.2), so we log and continue.
|
|
682
|
+
try {
|
|
529
683
|
// Execute health check using createClient pattern with unified hard timeout
|
|
530
684
|
const start = performance.now();
|
|
531
685
|
let connectionTimeMs: number | undefined;
|
|
@@ -541,8 +695,11 @@ async function executeHealthCheckJob(props: {
|
|
|
541
695
|
// Platform-level hard timeout wrapping the entire execution sequence
|
|
542
696
|
await Promise.race([
|
|
543
697
|
(async () => {
|
|
544
|
-
// 1. Establish connection
|
|
545
|
-
|
|
698
|
+
// 1. Establish connection. The strategy client build moves INSIDE
|
|
699
|
+
// the per-env loop (§6.3.3): each env gets its own rendered config +
|
|
700
|
+
// client, so a single job no longer bakes in one env's rendered
|
|
701
|
+
// strategy config.
|
|
702
|
+
connectedClient = await strategy.createClient(renderedStrategyConfig);
|
|
546
703
|
connectionTimeMs = Math.round(performance.now() - start);
|
|
547
704
|
|
|
548
705
|
// 2. Execute collectors in parallel
|
|
@@ -579,8 +736,31 @@ async function executeHealthCheckJob(props: {
|
|
|
579
736
|
secretEnv = resolved.env;
|
|
580
737
|
}
|
|
581
738
|
|
|
739
|
+
// Migrate the stored (UNVERSIONED) collector config via
|
|
740
|
+
// assume-v1-on-read: runs the declared migration chain, then
|
|
741
|
+
// validates. Migrations are idempotent, so an already-current
|
|
742
|
+
// config is a no-op. This runs BEFORE templating so the render
|
|
743
|
+
// pass sees the migrated shape; the secretEnv resolution above
|
|
744
|
+
// reads the raw `secretEnv` mapping (a constant string field
|
|
745
|
+
// unaffected by the strategy/collector reshapes), keeping the
|
|
746
|
+
// migrate -> secret resolve -> render -> execute order intact.
|
|
747
|
+
const migratedCollectorConfig =
|
|
748
|
+
await registered.collector.config.parseAssumingV1(
|
|
749
|
+
collectorEntry.config,
|
|
750
|
+
);
|
|
751
|
+
|
|
752
|
+
// (2) Environment/templating pass for the collector config -
|
|
753
|
+
// runs AFTER the secretEnv resolution above (secrets first,
|
|
754
|
+
// templating second) and renders `{{ environment.* }}` in this
|
|
755
|
+
// collector's `x-templatable` fields against the per-env context.
|
|
756
|
+
const renderedCollectorConfig = renderTemplatableConfig({
|
|
757
|
+
config: migratedCollectorConfig,
|
|
758
|
+
schema: registered.collector.config.schema,
|
|
759
|
+
context: templateContext,
|
|
760
|
+
});
|
|
761
|
+
|
|
582
762
|
const collectorResult = await registered.collector.execute({
|
|
583
|
-
config:
|
|
763
|
+
config: renderedCollectorConfig,
|
|
584
764
|
client: connectedClient!.client,
|
|
585
765
|
pluginId: configRow.strategyId,
|
|
586
766
|
runContext,
|
|
@@ -723,11 +903,12 @@ async function executeHealthCheckJob(props: {
|
|
|
723
903
|
let newState!: AggregatedHealth;
|
|
724
904
|
await writeHealthEntity({
|
|
725
905
|
handle: getHealthEntity?.(),
|
|
726
|
-
|
|
906
|
+
entityId: envEntityId,
|
|
727
907
|
apply: async () => {
|
|
728
908
|
await db.insert(healthCheckRuns).values({
|
|
729
909
|
configurationId: configId,
|
|
730
910
|
systemId,
|
|
911
|
+
environmentId,
|
|
731
912
|
status: result.status,
|
|
732
913
|
latencyMs: result.latencyMs,
|
|
733
914
|
result: { ...result } as Record<string, unknown>,
|
|
@@ -739,6 +920,7 @@ async function executeHealthCheckJob(props: {
|
|
|
739
920
|
db,
|
|
740
921
|
systemId,
|
|
741
922
|
configurationId: configId,
|
|
923
|
+
environmentId,
|
|
742
924
|
status: result.status,
|
|
743
925
|
latencyMs: result.latencyMs,
|
|
744
926
|
runTimestamp: new Date(),
|
|
@@ -747,13 +929,18 @@ async function executeHealthCheckJob(props: {
|
|
|
747
929
|
sourceLabel: "Local",
|
|
748
930
|
});
|
|
749
931
|
|
|
750
|
-
|
|
932
|
+
// Env-scoped view: the per-env entity reflects only this env's runs.
|
|
933
|
+
newState = await service.getSystemHealthStatus(systemId, environmentId);
|
|
751
934
|
return toHealthEntityView(newState);
|
|
752
935
|
},
|
|
753
|
-
serialize:
|
|
936
|
+
serialize: serializeEnvWrite,
|
|
754
937
|
onError: (error) =>
|
|
755
|
-
logger.warn(
|
|
938
|
+
logger.warn(
|
|
939
|
+
`Failed to mirror health entity for ${envEntityId}`,
|
|
940
|
+
error,
|
|
941
|
+
),
|
|
756
942
|
});
|
|
943
|
+
anyEnvRunPersisted = true;
|
|
757
944
|
|
|
758
945
|
logger.debug(
|
|
759
946
|
`Health check ${configId} for system ${systemId} failed: ${finalError}`,
|
|
@@ -779,6 +966,7 @@ async function executeHealthCheckJob(props: {
|
|
|
779
966
|
db,
|
|
780
967
|
systemId,
|
|
781
968
|
configurationId: configId,
|
|
969
|
+
environmentId,
|
|
782
970
|
fromStatus: previousStatus,
|
|
783
971
|
toStatus: newState.status,
|
|
784
972
|
});
|
|
@@ -798,7 +986,9 @@ async function executeHealthCheckJob(props: {
|
|
|
798
986
|
});
|
|
799
987
|
}
|
|
800
988
|
|
|
801
|
-
|
|
989
|
+
// This environment's run is done (failed). Continue to the next
|
|
990
|
+
// effective environment rather than ending the whole job.
|
|
991
|
+
continue;
|
|
802
992
|
} finally {
|
|
803
993
|
if (connectedClient) {
|
|
804
994
|
try {
|
|
@@ -836,12 +1026,13 @@ async function executeHealthCheckJob(props: {
|
|
|
836
1026
|
let newState!: AggregatedHealth;
|
|
837
1027
|
await writeHealthEntity({
|
|
838
1028
|
handle: getHealthEntity?.(),
|
|
839
|
-
|
|
1029
|
+
entityId: envEntityId,
|
|
840
1030
|
apply: async () => {
|
|
841
1031
|
// Store result (spread to convert structured type to plain record for jsonb)
|
|
842
1032
|
await db.insert(healthCheckRuns).values({
|
|
843
1033
|
configurationId: configId,
|
|
844
1034
|
systemId,
|
|
1035
|
+
environmentId,
|
|
845
1036
|
status: result.status,
|
|
846
1037
|
latencyMs: result.latencyMs,
|
|
847
1038
|
result: { ...result } as Record<string, unknown>,
|
|
@@ -854,6 +1045,7 @@ async function executeHealthCheckJob(props: {
|
|
|
854
1045
|
db,
|
|
855
1046
|
systemId,
|
|
856
1047
|
configurationId: configId,
|
|
1048
|
+
environmentId,
|
|
857
1049
|
status: result.status,
|
|
858
1050
|
latencyMs: result.latencyMs,
|
|
859
1051
|
runTimestamp: new Date(),
|
|
@@ -862,13 +1054,15 @@ async function executeHealthCheckJob(props: {
|
|
|
862
1054
|
sourceLabel: "Local",
|
|
863
1055
|
});
|
|
864
1056
|
|
|
865
|
-
|
|
1057
|
+
// Env-scoped view: the per-env entity reflects only this env's runs.
|
|
1058
|
+
newState = await service.getSystemHealthStatus(systemId, environmentId);
|
|
866
1059
|
return toHealthEntityView(newState);
|
|
867
1060
|
},
|
|
868
|
-
serialize:
|
|
1061
|
+
serialize: serializeEnvWrite,
|
|
869
1062
|
onError: (error) =>
|
|
870
|
-
logger.warn(`Failed to mirror health entity for ${
|
|
1063
|
+
logger.warn(`Failed to mirror health entity for ${envEntityId}`, error),
|
|
871
1064
|
});
|
|
1065
|
+
anyEnvRunPersisted = true;
|
|
872
1066
|
|
|
873
1067
|
logger.debug(
|
|
874
1068
|
`Ran health check ${configId} for system ${systemId}: ${result.status}`,
|
|
@@ -904,6 +1098,7 @@ async function executeHealthCheckJob(props: {
|
|
|
904
1098
|
db,
|
|
905
1099
|
systemId,
|
|
906
1100
|
configurationId: configId,
|
|
1101
|
+
environmentId,
|
|
907
1102
|
fromStatus: previousStatus,
|
|
908
1103
|
toStatus: newState.status,
|
|
909
1104
|
});
|
|
@@ -922,12 +1117,19 @@ async function executeHealthCheckJob(props: {
|
|
|
922
1117
|
logger,
|
|
923
1118
|
});
|
|
924
1119
|
|
|
925
|
-
//
|
|
926
|
-
|
|
927
|
-
|
|
928
|
-
|
|
929
|
-
|
|
930
|
-
|
|
1120
|
+
// The system-level `SYSTEM_STATUS_CHANGED` signal must carry the ROLLUP
|
|
1121
|
+
// status, not a per-env status. When fanned out, the post-loop rollup
|
|
1122
|
+
// write broadcasts it once with the worst-status rollup; emitting it here
|
|
1123
|
+
// per env would send up to N system-level signals/tick carrying per-env
|
|
1124
|
+
// status. Only the env-less run (which IS the rollup — `!isFannedOut`)
|
|
1125
|
+
// broadcasts the system-level signal from inside the loop.
|
|
1126
|
+
if (!isFannedOut) {
|
|
1127
|
+
await signalService.broadcast(SYSTEM_STATUS_CHANGED, {
|
|
1128
|
+
systemId,
|
|
1129
|
+
previousStatus: previousStatus as HealthCheckStatus,
|
|
1130
|
+
newStatus: newState.status,
|
|
1131
|
+
});
|
|
1132
|
+
}
|
|
931
1133
|
|
|
932
1134
|
// The directional + umbrella system-health hooks were removed in
|
|
933
1135
|
// Phase 4 (§10.3): the `health` entity mirror above is the single
|
|
@@ -935,6 +1137,91 @@ async function executeHealthCheckJob(props: {
|
|
|
935
1137
|
// `healthcheck.system_degraded` / `_healthy` / `_health_changed`
|
|
936
1138
|
// trigger events through Stage-1 routing. Nothing to emit here.
|
|
937
1139
|
}
|
|
1140
|
+
} catch (envError) {
|
|
1141
|
+
// Isolate this environment's failure; continue with the next env.
|
|
1142
|
+
logger.error(
|
|
1143
|
+
`Failed to run health check ${configId} for system ${systemId}` +
|
|
1144
|
+
(environmentId ? ` (environment ${environmentId})` : " (env-less)"),
|
|
1145
|
+
envError,
|
|
1146
|
+
);
|
|
1147
|
+
}
|
|
1148
|
+
} // end per-environment fan-out loop (for ... of runEnvironments)
|
|
1149
|
+
|
|
1150
|
+
// ── System rollup write (§7.4.3) ───────────────────────────────────────
|
|
1151
|
+
// With real environments, the per-env writes mutated `<systemId>::<env>`
|
|
1152
|
+
// entities; the bare `<systemId>` ROLLUP entity (the worst-status view
|
|
1153
|
+
// every existing system-level consumer references) must now recompute so
|
|
1154
|
+
// it diffs/emits its OWN `ENTITY_CHANGED`. The rollup `apply` does NO new
|
|
1155
|
+
// durable insert (the runs are already persisted by the per-env writes) —
|
|
1156
|
+
// it just recomputes + returns the all-runs rollup view so the framework
|
|
1157
|
+
// diffs prev → next. Keyed on the bare `health:<systemId>` lock so it
|
|
1158
|
+
// serializes against itself, independent of the per-env locks.
|
|
1159
|
+
//
|
|
1160
|
+
// Skipped when env-less (the loop's lone write already targeted the bare
|
|
1161
|
+
// `<systemId>` entity = the rollup) or when nothing persisted (a fully
|
|
1162
|
+
// isolated-failure loop left no new runs to roll up).
|
|
1163
|
+
if (isFannedOut && anyEnvRunPersisted) {
|
|
1164
|
+
const rollupEntityId = encodeHealthEntityId({ systemId });
|
|
1165
|
+
let rollupState!: AggregatedHealth;
|
|
1166
|
+
try {
|
|
1167
|
+
await writeHealthEntity({
|
|
1168
|
+
handle: getHealthEntity?.(),
|
|
1169
|
+
entityId: rollupEntityId,
|
|
1170
|
+
apply: async () => {
|
|
1171
|
+
// No durable insert — recompute the all-runs (rollup) view.
|
|
1172
|
+
rollupState = await service.getSystemHealthStatus(systemId);
|
|
1173
|
+
return toHealthEntityView(rollupState);
|
|
1174
|
+
},
|
|
1175
|
+
serialize: makeHealthSerializer(rollupEntityId),
|
|
1176
|
+
onError: (error) =>
|
|
1177
|
+
logger.warn(
|
|
1178
|
+
`Failed to mirror rollup health entity for ${systemId}`,
|
|
1179
|
+
error,
|
|
1180
|
+
),
|
|
1181
|
+
});
|
|
1182
|
+
|
|
1183
|
+
// Record the ROLLUP transition (environmentId = null) so system-level
|
|
1184
|
+
// "in status since" reflects the aggregate, and notify on a real
|
|
1185
|
+
// rollup status change so existing system-level notifications fire.
|
|
1186
|
+
if (rollupState.status !== rollupPreviousStatus) {
|
|
1187
|
+
await recordStateTransition({
|
|
1188
|
+
db,
|
|
1189
|
+
systemId,
|
|
1190
|
+
configurationId: configId,
|
|
1191
|
+
environmentId: null,
|
|
1192
|
+
fromStatus: rollupPreviousStatus,
|
|
1193
|
+
toStatus: rollupState.status,
|
|
1194
|
+
});
|
|
1195
|
+
|
|
1196
|
+
await notifyStateChange({
|
|
1197
|
+
notificationClient,
|
|
1198
|
+
systemId,
|
|
1199
|
+
systemName,
|
|
1200
|
+
configurationId: configId,
|
|
1201
|
+
previousStatus: rollupPreviousStatus,
|
|
1202
|
+
newStatus: rollupState.status,
|
|
1203
|
+
service,
|
|
1204
|
+
catalogClient,
|
|
1205
|
+
maintenanceClient,
|
|
1206
|
+
incidentClient,
|
|
1207
|
+
logger,
|
|
1208
|
+
});
|
|
1209
|
+
|
|
1210
|
+
await signalService.broadcast(SYSTEM_STATUS_CHANGED, {
|
|
1211
|
+
systemId,
|
|
1212
|
+
previousStatus: rollupPreviousStatus as HealthCheckStatus,
|
|
1213
|
+
newStatus: rollupState.status,
|
|
1214
|
+
});
|
|
1215
|
+
}
|
|
1216
|
+
} catch (rollupError) {
|
|
1217
|
+
// The rollup is best-effort reactivity over already-durable runs; a
|
|
1218
|
+
// failure must not wedge the (completed) per-env runs.
|
|
1219
|
+
logger.error(
|
|
1220
|
+
`Failed to write system rollup health for ${systemId}`,
|
|
1221
|
+
rollupError,
|
|
1222
|
+
);
|
|
1223
|
+
}
|
|
1224
|
+
}
|
|
938
1225
|
|
|
939
1226
|
// Note: No manual rescheduling needed - recurring job handles it automatically
|
|
940
1227
|
} catch (error) {
|
|
@@ -943,15 +1230,17 @@ async function executeHealthCheckJob(props: {
|
|
|
943
1230
|
error,
|
|
944
1231
|
);
|
|
945
1232
|
|
|
946
|
-
//
|
|
947
|
-
//
|
|
948
|
-
//
|
|
949
|
-
//
|
|
950
|
-
//
|
|
1233
|
+
// Catastrophic job-level failure (e.g. the config fetch / env resolution
|
|
1234
|
+
// threw before the fan-out loop). Persist a single env-less failure run
|
|
1235
|
+
// against the bare `<systemId>` entity — which IS the system rollup — so
|
|
1236
|
+
// the system-level health change still emits. Reuses the pre-tick
|
|
1237
|
+
// rollup status captured before the try block.
|
|
1238
|
+
const rollupEntityId = encodeHealthEntityId({ systemId });
|
|
1239
|
+
const previousStatus = rollupPreviousStatus;
|
|
951
1240
|
let newState!: AggregatedHealth;
|
|
952
1241
|
await writeHealthEntity({
|
|
953
1242
|
handle: getHealthEntity?.(),
|
|
954
|
-
|
|
1243
|
+
entityId: rollupEntityId,
|
|
955
1244
|
apply: async () => {
|
|
956
1245
|
// Store failure (no latencyMs for failures)
|
|
957
1246
|
await db.insert(healthCheckRuns).values({
|
|
@@ -979,10 +1268,10 @@ async function executeHealthCheckJob(props: {
|
|
|
979
1268
|
newState = await service.getSystemHealthStatus(systemId);
|
|
980
1269
|
return toHealthEntityView(newState);
|
|
981
1270
|
},
|
|
982
|
-
serialize:
|
|
1271
|
+
serialize: makeHealthSerializer(rollupEntityId),
|
|
983
1272
|
onError: (mirrorError) =>
|
|
984
1273
|
logger.warn(
|
|
985
|
-
`Failed to mirror health entity for ${
|
|
1274
|
+
`Failed to mirror health entity for ${rollupEntityId}`,
|
|
986
1275
|
mirrorError,
|
|
987
1276
|
),
|
|
988
1277
|
});
|
|
@@ -1073,6 +1362,7 @@ async function executeHealthCheckJob(props: {
|
|
|
1073
1362
|
|
|
1074
1363
|
export async function setupHealthCheckWorker(props: {
|
|
1075
1364
|
db: Db;
|
|
1365
|
+
advisoryLock: AdvisoryLockService;
|
|
1076
1366
|
registry: HealthCheckRegistry;
|
|
1077
1367
|
collectorRegistry: CollectorRegistry;
|
|
1078
1368
|
logger: Logger;
|
|
@@ -1089,6 +1379,7 @@ export async function setupHealthCheckWorker(props: {
|
|
|
1089
1379
|
}): Promise<void> {
|
|
1090
1380
|
const {
|
|
1091
1381
|
db,
|
|
1382
|
+
advisoryLock,
|
|
1092
1383
|
registry,
|
|
1093
1384
|
collectorRegistry,
|
|
1094
1385
|
logger,
|
|
@@ -1113,6 +1404,7 @@ export async function setupHealthCheckWorker(props: {
|
|
|
1113
1404
|
await executeHealthCheckJob({
|
|
1114
1405
|
payload: job.data,
|
|
1115
1406
|
db,
|
|
1407
|
+
advisoryLock,
|
|
1116
1408
|
registry,
|
|
1117
1409
|
collectorRegistry,
|
|
1118
1410
|
logger,
|
|
@@ -171,6 +171,36 @@ describe("incrementHourlyAggregate", () => {
|
|
|
171
171
|
expect(inserted.maxLatencyMs).toBe(150);
|
|
172
172
|
});
|
|
173
173
|
|
|
174
|
+
it("writes the environmentId into the aggregate (per-environment fan-out)", async () => {
|
|
175
|
+
await incrementHourlyAggregate({
|
|
176
|
+
db: mockDb as never,
|
|
177
|
+
systemId: "sys-1",
|
|
178
|
+
configurationId: "config-1",
|
|
179
|
+
environmentId: "prod",
|
|
180
|
+
status: "healthy",
|
|
181
|
+
latencyMs: 150,
|
|
182
|
+
runTimestamp: new Date("2024-01-15T10:35:00Z"),
|
|
183
|
+
});
|
|
184
|
+
|
|
185
|
+
const inserted = insertedValues[0] as Record<string, unknown>;
|
|
186
|
+
expect(inserted.environmentId).toBe("prod");
|
|
187
|
+
});
|
|
188
|
+
|
|
189
|
+
it("normalizes an env-less run to environmentId = null", async () => {
|
|
190
|
+
await incrementHourlyAggregate({
|
|
191
|
+
db: mockDb as never,
|
|
192
|
+
systemId: "sys-1",
|
|
193
|
+
configurationId: "config-1",
|
|
194
|
+
// environmentId omitted -> env-less run
|
|
195
|
+
status: "healthy",
|
|
196
|
+
latencyMs: 150,
|
|
197
|
+
runTimestamp: new Date("2024-01-15T10:35:00Z"),
|
|
198
|
+
});
|
|
199
|
+
|
|
200
|
+
const inserted = insertedValues[0] as Record<string, unknown>;
|
|
201
|
+
expect(inserted.environmentId).toBeNull();
|
|
202
|
+
});
|
|
203
|
+
|
|
174
204
|
it("increments counts for unhealthy status", async () => {
|
|
175
205
|
await incrementHourlyAggregate({
|
|
176
206
|
db: mockDb as never,
|