@checkstack/healthcheck-backend 1.5.0 → 1.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +253 -0
- package/drizzle/0018_abnormal_preak.sql +10 -0
- package/drizzle/meta/0018_snapshot.json +600 -0
- package/drizzle/meta/_journal.json +7 -0
- package/package.json +32 -27
- package/src/ai/assertion-validation.test.ts +117 -0
- package/src/ai/assertion-validation.ts +147 -0
- package/src/ai/healthcheck-capabilities.test.ts +158 -0
- package/src/ai/healthcheck-capabilities.ts +217 -0
- package/src/ai/healthcheck-delete.test.ts +81 -0
- package/src/ai/healthcheck-delete.ts +81 -0
- package/src/ai/healthcheck-projection.test.ts +36 -0
- package/src/ai/healthcheck-propose.test.ts +268 -0
- package/src/ai/healthcheck-propose.ts +290 -0
- package/src/ai/healthcheck-script-tools.test.ts +93 -0
- package/src/ai/healthcheck-script-tools.ts +179 -0
- package/src/ai/healthcheck-update.test.ts +123 -0
- package/src/ai/healthcheck-update.ts +123 -0
- package/src/ai/notify-subscribers.test.ts +109 -0
- package/src/ai/notify-subscribers.ts +176 -0
- package/src/ai/register-ai-tools.test.ts +41 -0
- package/src/ai/register-ai-tools.ts +53 -0
- package/src/ai/shell-env-table.test.ts +47 -0
- package/src/automations.test.ts +2 -1
- package/src/automations.ts +9 -1
- package/src/collector-script-test.test.ts +53 -1
- package/src/collector-script-test.ts +59 -7
- package/src/effective-environments.test.ts +93 -0
- package/src/effective-environments.ts +64 -0
- package/src/health-entity-id.ts +57 -0
- package/src/health-entity.test.ts +384 -6
- package/src/health-entity.ts +93 -35
- package/src/health-state.ts +41 -4
- package/src/healthcheck-gitops-kinds.test.ts +95 -0
- package/src/healthcheck-gitops-kinds.ts +56 -13
- package/src/index.ts +30 -0
- package/src/migration-chain-contract.test.ts +57 -0
- package/src/queue-executor.test.ts +801 -0
- package/src/queue-executor.ts +336 -52
- package/src/realtime-aggregation.test.ts +30 -0
- package/src/realtime-aggregation.ts +16 -0
- package/src/retention-job.ts +167 -93
- package/src/retention-rollup.test.ts +118 -0
- package/src/router.test.ts +120 -1
- package/src/router.ts +20 -0
- package/src/schema.ts +44 -6
- package/src/service.ts +199 -43
- package/src/state-transitions.test.ts +104 -0
- package/src/state-transitions.ts +39 -1
- package/src/validate-configuration.test.ts +205 -0
- package/src/validate-configuration.ts +159 -0
- package/tsconfig.json +9 -0
package/src/queue-executor.ts
CHANGED
|
@@ -10,6 +10,7 @@ import {
|
|
|
10
10
|
type TransportClient,
|
|
11
11
|
type CollectorRunContext,
|
|
12
12
|
type AdvisoryLockService,
|
|
13
|
+
renderTemplatableConfig,
|
|
13
14
|
} from "@checkstack/backend-api";
|
|
14
15
|
import { QueueManager } from "@checkstack/queue-api";
|
|
15
16
|
import {
|
|
@@ -23,6 +24,7 @@ import { type SignalService } from "@checkstack/signal-common";
|
|
|
23
24
|
import {
|
|
24
25
|
HEALTH_CHECK_RUN_COMPLETED,
|
|
25
26
|
SYSTEM_STATUS_CHANGED,
|
|
27
|
+
ENVIRONMENT_RESOLUTION_FAILED,
|
|
26
28
|
type HealthCheckStatus,
|
|
27
29
|
stripEphemeralFields,
|
|
28
30
|
} from "@checkstack/healthcheck-common";
|
|
@@ -30,7 +32,12 @@ import {
|
|
|
30
32
|
CatalogApi,
|
|
31
33
|
catalogRoutes,
|
|
32
34
|
createSystemSubject,
|
|
35
|
+
type Environment,
|
|
33
36
|
} from "@checkstack/catalog-common";
|
|
37
|
+
import {
|
|
38
|
+
resolveEffectiveEnvironments,
|
|
39
|
+
type EffectiveEnvironment,
|
|
40
|
+
} from "./effective-environments";
|
|
34
41
|
import { systemHealthCollapseKey } from "@checkstack/healthcheck-common";
|
|
35
42
|
import { MaintenanceApi } from "@checkstack/maintenance-common";
|
|
36
43
|
import { IncidentApi } from "@checkstack/incident-common";
|
|
@@ -53,6 +60,7 @@ import {
|
|
|
53
60
|
createHealthEntitySerializer,
|
|
54
61
|
type HealthEntityState,
|
|
55
62
|
} from "./health-entity";
|
|
63
|
+
import { encodeHealthEntityId } from "./health-entity-id";
|
|
56
64
|
import type { EntityHandle } from "@checkstack/automation-backend";
|
|
57
65
|
|
|
58
66
|
type Db = SafeDatabase<typeof schema>;
|
|
@@ -425,19 +433,22 @@ async function executeHealthCheckJob(props: {
|
|
|
425
433
|
// Create service for aggregated state evaluation
|
|
426
434
|
const service = new HealthCheckService(db, registry, collectorRegistry);
|
|
427
435
|
|
|
428
|
-
// Per-
|
|
429
|
-
// transaction-scoped advisory lock keyed `health:<
|
|
430
|
-
// snapshot-prev + apply + diff + emit so concurrent evaluations
|
|
431
|
-
//
|
|
432
|
-
//
|
|
433
|
-
//
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
//
|
|
439
|
-
|
|
440
|
-
|
|
436
|
+
// Per-ENTITY serializer factory for the reactive health mutate (§10.3,
|
|
437
|
+
// Phase 3b): a transaction-scoped advisory lock keyed `health:<entityId>`
|
|
438
|
+
// wraps the snapshot-prev + apply + diff + emit so concurrent evaluations
|
|
439
|
+
// of one (system, environment) — or of the system rollup — can't double-emit
|
|
440
|
+
// a single logical transition. Bound to the qualified entity id at each
|
|
441
|
+
// `writeHealthEntity` call so distinct envs / the rollup don't block each
|
|
442
|
+
// other.
|
|
443
|
+
const makeHealthSerializer = createHealthEntitySerializer({ advisoryLock });
|
|
444
|
+
|
|
445
|
+
// The system-rollup status BEFORE this tick (all environments + env-less).
|
|
446
|
+
// Captured once so the post-loop rollup write (§7.4.3) — and the
|
|
447
|
+
// catastrophic-failure path — can record a correct prev → next rollup
|
|
448
|
+
// transition (environmentId = null). This is the system-wide aggregate read
|
|
449
|
+
// the executor has always taken first.
|
|
450
|
+
const rollupPreviousState = await service.getSystemHealthStatus(systemId);
|
|
451
|
+
const rollupPreviousStatus = rollupPreviousState.status;
|
|
441
452
|
|
|
442
453
|
try {
|
|
443
454
|
// Fetch configuration (including name for signals)
|
|
@@ -453,6 +464,7 @@ async function executeHealthCheckJob(props: {
|
|
|
453
464
|
paused: healthCheckConfigurations.paused,
|
|
454
465
|
includeLocal: systemHealthChecks.includeLocal,
|
|
455
466
|
satelliteIds: systemHealthChecks.satelliteIds,
|
|
467
|
+
environmentIds: systemHealthChecks.environmentIds,
|
|
456
468
|
})
|
|
457
469
|
.from(systemHealthChecks)
|
|
458
470
|
.innerJoin(
|
|
@@ -508,17 +520,6 @@ async function executeHealthCheckJob(props: {
|
|
|
508
520
|
logger.debug(`Could not fetch system name for ${systemId}, using ID`);
|
|
509
521
|
}
|
|
510
522
|
|
|
511
|
-
// Curated, read-only run-context metadata exposed to collectors.
|
|
512
|
-
// Metadata only - never secrets or config.
|
|
513
|
-
const runContext: CollectorRunContext = {
|
|
514
|
-
check: {
|
|
515
|
-
id: configId,
|
|
516
|
-
name: configRow.configName || configId,
|
|
517
|
-
intervalSeconds: configRow.interval,
|
|
518
|
-
},
|
|
519
|
-
system: { id: systemId, name: systemName },
|
|
520
|
-
};
|
|
521
|
-
|
|
522
523
|
const strategy = registry.getStrategy(configRow.strategyId);
|
|
523
524
|
if (!strategy) {
|
|
524
525
|
logger.warn(
|
|
@@ -527,10 +528,158 @@ async function executeHealthCheckJob(props: {
|
|
|
527
528
|
return;
|
|
528
529
|
}
|
|
529
530
|
|
|
530
|
-
//
|
|
531
|
-
|
|
531
|
+
// Migrate the stored (UNVERSIONED) strategy config ONCE, before the
|
|
532
|
+
// per-environment render loop, so every env renders from the same
|
|
533
|
+
// migrated shape. Stored configs predate explicit versioning and may be
|
|
534
|
+
// genuinely v1 (e.g. an HTTP config still carrying url/method);
|
|
535
|
+
// assume-v1-on-read runs the declared migration chain, then validates. The
|
|
536
|
+
// migrations are idempotent, so an already-current config is a no-op.
|
|
537
|
+
const strategyConfig: BaseStrategyConfig =
|
|
538
|
+
await strategy.config.parseAssumingV1(configRow.config);
|
|
532
539
|
const executionTimeout = strategyConfig.timeout ?? 60_000;
|
|
533
540
|
|
|
541
|
+
// ── Per-environment fan-out (§7) ────────────────────────────────────────
|
|
542
|
+
// Resolve the effective environment set from the assignment + the
|
|
543
|
+
// system's current catalog membership, then run ONCE PER environment.
|
|
544
|
+
// An empty effective set (opt-out `[]`, or `null` with no membership)
|
|
545
|
+
// collapses to a single env-less run with `environment` unset — exactly
|
|
546
|
+
// the pre-feature behavior. Membership lives ONLY in the catalog Postgres
|
|
547
|
+
// tables and is re-read every tick via the cross-plugin RPC, so every pod
|
|
548
|
+
// resolves the same set (state-and-scale: no pod-local env state).
|
|
549
|
+
let membership: Environment[] = [];
|
|
550
|
+
try {
|
|
551
|
+
membership = await catalogClient.resolveSystemEnvironments({ systemId });
|
|
552
|
+
} catch (error) {
|
|
553
|
+
// Fail-open: a catalog read failure must not wedge the check. Degrade
|
|
554
|
+
// to an env-less run (today's behavior) rather than skipping the tick.
|
|
555
|
+
logger.warn(
|
|
556
|
+
`Could not resolve environments for system ${systemId}, running env-less`,
|
|
557
|
+
error,
|
|
558
|
+
);
|
|
559
|
+
// Observability: a `logger.warn` alone is easy to miss when a durable
|
|
560
|
+
// catalog misconfig (or outage) silently strips per-environment fan-out.
|
|
561
|
+
// Broadcast a counter-style signal so the degradation is observable.
|
|
562
|
+
// Best-effort — never let the signal break the (still-running) check.
|
|
563
|
+
try {
|
|
564
|
+
await signalService.broadcast(ENVIRONMENT_RESOLUTION_FAILED, {
|
|
565
|
+
systemId,
|
|
566
|
+
configurationId: configId,
|
|
567
|
+
error: extractErrorMessage(error),
|
|
568
|
+
});
|
|
569
|
+
} catch (signalError) {
|
|
570
|
+
logger.warn(
|
|
571
|
+
`Failed to broadcast environment-resolution-failed signal for ${systemId}`,
|
|
572
|
+
signalError,
|
|
573
|
+
);
|
|
574
|
+
}
|
|
575
|
+
}
|
|
576
|
+
const effectiveEnvs = resolveEffectiveEnvironments({
|
|
577
|
+
environmentIds: configRow.environmentIds,
|
|
578
|
+
membership,
|
|
579
|
+
});
|
|
580
|
+
// `null` env => the single env-less run. Each entry => one run per env.
|
|
581
|
+
const runEnvironments: (EffectiveEnvironment | null)[] =
|
|
582
|
+
effectiveEnvs.length > 0 ? effectiveEnvs : [null];
|
|
583
|
+
|
|
584
|
+
// Execute one run per effective environment. Runs are independent (own
|
|
585
|
+
// status / latency / result) and persisted with their own
|
|
586
|
+
// `environmentId`. Phase 3b: each env-run mutates its OWN env-qualified
|
|
587
|
+
// `health` entity (`<systemId>::<environmentId>`, or the bare `<systemId>`
|
|
588
|
+
// for the env-less run) through a per-entity serializer; after the loop a
|
|
589
|
+
// single ROLLUP write for the bare `<systemId>` recomputes the worst-status
|
|
590
|
+
// rollup so system-level consumers keep firing off the unchanged id.
|
|
591
|
+
//
|
|
592
|
+
// Track whether ANY per-env run persisted (so the rollup write only runs
|
|
593
|
+
// when there is something to roll up — an all-failed loop still leaves the
|
|
594
|
+
// durable runs the per-env apply already wrote).
|
|
595
|
+
let anyEnvRunPersisted = false;
|
|
596
|
+
// Whether this tick fans out into REAL environments (vs. the single
|
|
597
|
+
// env-less run). When env-less, the loop's lone write already targets the
|
|
598
|
+
// bare `<systemId>` entity — which IS the rollup — so no separate rollup
|
|
599
|
+
// write is needed. With real envs, the loop writes `<systemId>::<env>`
|
|
600
|
+
// entities and we recompute the bare-`<systemId>` rollup after the loop.
|
|
601
|
+
const isFannedOut = effectiveEnvs.length > 0;
|
|
602
|
+
for (const environment of runEnvironments) {
|
|
603
|
+
const environmentId = environment?.id ?? null;
|
|
604
|
+
// The env-qualified entity id this run mutates. For the env-less run
|
|
605
|
+
// (environmentId === null) this is the bare systemId — which is also the
|
|
606
|
+
// rollup id, so the env-less run IS the rollup (no separate rollup write
|
|
607
|
+
// is needed when the system has no environments — see below).
|
|
608
|
+
const envEntityId = encodeHealthEntityId({ systemId, environmentId });
|
|
609
|
+
const serializeEnvWrite = makeHealthSerializer(envEntityId);
|
|
610
|
+
|
|
611
|
+
// Per-env baseline status for the transition log: the env-scoped
|
|
612
|
+
// aggregate BEFORE this run. Computed per env so a transition row is
|
|
613
|
+
// recorded against the right (system, environment) streak.
|
|
614
|
+
const previousState = await service.getSystemHealthStatus(
|
|
615
|
+
systemId,
|
|
616
|
+
environmentId,
|
|
617
|
+
);
|
|
618
|
+
const previousStatus = previousState.status;
|
|
619
|
+
|
|
620
|
+
// Curated, read-only run-context metadata exposed to collectors.
|
|
621
|
+
// Metadata only - never secrets or config. `environment` carries the
|
|
622
|
+
// resolved env's verbatim custom fields for this run (Phase 2 surfaces
|
|
623
|
+
// consume it); absent for the env-less run.
|
|
624
|
+
const runContext: CollectorRunContext = {
|
|
625
|
+
check: {
|
|
626
|
+
id: configId,
|
|
627
|
+
name: configRow.configName || configId,
|
|
628
|
+
intervalSeconds: configRow.interval,
|
|
629
|
+
},
|
|
630
|
+
system: { id: systemId, name: systemName },
|
|
631
|
+
...(environment
|
|
632
|
+
? {
|
|
633
|
+
environment: {
|
|
634
|
+
id: environment.id,
|
|
635
|
+
name: environment.name,
|
|
636
|
+
fields: environment.fields,
|
|
637
|
+
},
|
|
638
|
+
}
|
|
639
|
+
: {}),
|
|
640
|
+
};
|
|
641
|
+
|
|
642
|
+
// Templating context for the per-env config render pass (§6.3.3).
|
|
643
|
+
// Carries only environment custom fields + curated check/system
|
|
644
|
+
// metadata - never secrets. `{{ environment.baseUrl }}` resolves from
|
|
645
|
+
// the resolved env's verbatim fields; an env-less run gets `{}` so a
|
|
646
|
+
// reference renders to empty string (strict: false); see the debug log
|
|
647
|
+
// below.
|
|
648
|
+
const templateContext = {
|
|
649
|
+
environment: runContext.environment?.fields ?? {},
|
|
650
|
+
check: runContext.check,
|
|
651
|
+
system: runContext.system,
|
|
652
|
+
};
|
|
653
|
+
if (!runContext.environment) {
|
|
654
|
+
// §11.6: render-empty when a run has no environment. An env-less run is
|
|
655
|
+
// a legitimate, documented configuration (the None assignment mode, or
|
|
656
|
+
// All-environments with no membership), and it recurs every interval -
|
|
657
|
+
// so this is `debug`, not `warn`, to avoid spamming the log. When an
|
|
658
|
+
// empty `{{ environment.* }}` render actually matters, the HTTP
|
|
659
|
+
// post-render `.url()` check already fails the run with a concrete
|
|
660
|
+
// "Rendered URL is invalid" error; we do not inspect every field here.
|
|
661
|
+
logger.debug(
|
|
662
|
+
`Health check ${configId} for system ${systemId} ran with no environment; ` +
|
|
663
|
+
`any {{ environment.* }} references render to empty string`,
|
|
664
|
+
);
|
|
665
|
+
}
|
|
666
|
+
|
|
667
|
+
// (2) Environment/templating pass (NEW) - renders `{{ environment.* }}`
|
|
668
|
+
// etc. in `x-templatable` fields. Runs PER ENVIRONMENT, AFTER the secret
|
|
669
|
+
// resolution (secrets first, templating second - §6.3.4) and BEFORE the
|
|
670
|
+
// strategy client build, so each env gets its own rendered strategy
|
|
671
|
+
// config + client. The collector configs are rendered just before each
|
|
672
|
+
// collector executes (below) so the secretEnv resolution stays first.
|
|
673
|
+
const renderedStrategyConfig = renderTemplatableConfig({
|
|
674
|
+
config: strategyConfig,
|
|
675
|
+
schema: strategy.config.schema,
|
|
676
|
+
context: templateContext,
|
|
677
|
+
}) as BaseStrategyConfig;
|
|
678
|
+
|
|
679
|
+
// Per-environment isolation: an unexpected failure persisting ONE
|
|
680
|
+
// environment's run must not abort the sibling environments' runs.
|
|
681
|
+
// Each iteration's run is independent (§7.2), so we log and continue.
|
|
682
|
+
try {
|
|
534
683
|
// Execute health check using createClient pattern with unified hard timeout
|
|
535
684
|
const start = performance.now();
|
|
536
685
|
let connectionTimeMs: number | undefined;
|
|
@@ -546,8 +695,11 @@ async function executeHealthCheckJob(props: {
|
|
|
546
695
|
// Platform-level hard timeout wrapping the entire execution sequence
|
|
547
696
|
await Promise.race([
|
|
548
697
|
(async () => {
|
|
549
|
-
// 1. Establish connection
|
|
550
|
-
|
|
698
|
+
// 1. Establish connection. The strategy client build moves INSIDE
|
|
699
|
+
// the per-env loop (§6.3.3): each env gets its own rendered config +
|
|
700
|
+
// client, so a single job no longer bakes in one env's rendered
|
|
701
|
+
// strategy config.
|
|
702
|
+
connectedClient = await strategy.createClient(renderedStrategyConfig);
|
|
551
703
|
connectionTimeMs = Math.round(performance.now() - start);
|
|
552
704
|
|
|
553
705
|
// 2. Execute collectors in parallel
|
|
@@ -584,8 +736,31 @@ async function executeHealthCheckJob(props: {
|
|
|
584
736
|
secretEnv = resolved.env;
|
|
585
737
|
}
|
|
586
738
|
|
|
739
|
+
// Migrate the stored (UNVERSIONED) collector config via
|
|
740
|
+
// assume-v1-on-read: runs the declared migration chain, then
|
|
741
|
+
// validates. Migrations are idempotent, so an already-current
|
|
742
|
+
// config is a no-op. This runs BEFORE templating so the render
|
|
743
|
+
// pass sees the migrated shape; the secretEnv resolution above
|
|
744
|
+
// reads the raw `secretEnv` mapping (a constant string field
|
|
745
|
+
// unaffected by the strategy/collector reshapes), keeping the
|
|
746
|
+
// migrate -> secret resolve -> render -> execute order intact.
|
|
747
|
+
const migratedCollectorConfig =
|
|
748
|
+
await registered.collector.config.parseAssumingV1(
|
|
749
|
+
collectorEntry.config,
|
|
750
|
+
);
|
|
751
|
+
|
|
752
|
+
// (2) Environment/templating pass for the collector config -
|
|
753
|
+
// runs AFTER the secretEnv resolution above (secrets first,
|
|
754
|
+
// templating second) and renders `{{ environment.* }}` in this
|
|
755
|
+
// collector's `x-templatable` fields against the per-env context.
|
|
756
|
+
const renderedCollectorConfig = renderTemplatableConfig({
|
|
757
|
+
config: migratedCollectorConfig,
|
|
758
|
+
schema: registered.collector.config.schema,
|
|
759
|
+
context: templateContext,
|
|
760
|
+
});
|
|
761
|
+
|
|
587
762
|
const collectorResult = await registered.collector.execute({
|
|
588
|
-
config:
|
|
763
|
+
config: renderedCollectorConfig,
|
|
589
764
|
client: connectedClient!.client,
|
|
590
765
|
pluginId: configRow.strategyId,
|
|
591
766
|
runContext,
|
|
@@ -728,11 +903,12 @@ async function executeHealthCheckJob(props: {
|
|
|
728
903
|
let newState!: AggregatedHealth;
|
|
729
904
|
await writeHealthEntity({
|
|
730
905
|
handle: getHealthEntity?.(),
|
|
731
|
-
|
|
906
|
+
entityId: envEntityId,
|
|
732
907
|
apply: async () => {
|
|
733
908
|
await db.insert(healthCheckRuns).values({
|
|
734
909
|
configurationId: configId,
|
|
735
910
|
systemId,
|
|
911
|
+
environmentId,
|
|
736
912
|
status: result.status,
|
|
737
913
|
latencyMs: result.latencyMs,
|
|
738
914
|
result: { ...result } as Record<string, unknown>,
|
|
@@ -744,6 +920,7 @@ async function executeHealthCheckJob(props: {
|
|
|
744
920
|
db,
|
|
745
921
|
systemId,
|
|
746
922
|
configurationId: configId,
|
|
923
|
+
environmentId,
|
|
747
924
|
status: result.status,
|
|
748
925
|
latencyMs: result.latencyMs,
|
|
749
926
|
runTimestamp: new Date(),
|
|
@@ -752,13 +929,18 @@ async function executeHealthCheckJob(props: {
|
|
|
752
929
|
sourceLabel: "Local",
|
|
753
930
|
});
|
|
754
931
|
|
|
755
|
-
|
|
932
|
+
// Env-scoped view: the per-env entity reflects only this env's runs.
|
|
933
|
+
newState = await service.getSystemHealthStatus(systemId, environmentId);
|
|
756
934
|
return toHealthEntityView(newState);
|
|
757
935
|
},
|
|
758
|
-
serialize:
|
|
936
|
+
serialize: serializeEnvWrite,
|
|
759
937
|
onError: (error) =>
|
|
760
|
-
logger.warn(
|
|
938
|
+
logger.warn(
|
|
939
|
+
`Failed to mirror health entity for ${envEntityId}`,
|
|
940
|
+
error,
|
|
941
|
+
),
|
|
761
942
|
});
|
|
943
|
+
anyEnvRunPersisted = true;
|
|
762
944
|
|
|
763
945
|
logger.debug(
|
|
764
946
|
`Health check ${configId} for system ${systemId} failed: ${finalError}`,
|
|
@@ -784,6 +966,7 @@ async function executeHealthCheckJob(props: {
|
|
|
784
966
|
db,
|
|
785
967
|
systemId,
|
|
786
968
|
configurationId: configId,
|
|
969
|
+
environmentId,
|
|
787
970
|
fromStatus: previousStatus,
|
|
788
971
|
toStatus: newState.status,
|
|
789
972
|
});
|
|
@@ -803,7 +986,9 @@ async function executeHealthCheckJob(props: {
|
|
|
803
986
|
});
|
|
804
987
|
}
|
|
805
988
|
|
|
806
|
-
|
|
989
|
+
// This environment's run is done (failed). Continue to the next
|
|
990
|
+
// effective environment rather than ending the whole job.
|
|
991
|
+
continue;
|
|
807
992
|
} finally {
|
|
808
993
|
if (connectedClient) {
|
|
809
994
|
try {
|
|
@@ -841,12 +1026,13 @@ async function executeHealthCheckJob(props: {
|
|
|
841
1026
|
let newState!: AggregatedHealth;
|
|
842
1027
|
await writeHealthEntity({
|
|
843
1028
|
handle: getHealthEntity?.(),
|
|
844
|
-
|
|
1029
|
+
entityId: envEntityId,
|
|
845
1030
|
apply: async () => {
|
|
846
1031
|
// Store result (spread to convert structured type to plain record for jsonb)
|
|
847
1032
|
await db.insert(healthCheckRuns).values({
|
|
848
1033
|
configurationId: configId,
|
|
849
1034
|
systemId,
|
|
1035
|
+
environmentId,
|
|
850
1036
|
status: result.status,
|
|
851
1037
|
latencyMs: result.latencyMs,
|
|
852
1038
|
result: { ...result } as Record<string, unknown>,
|
|
@@ -859,6 +1045,7 @@ async function executeHealthCheckJob(props: {
|
|
|
859
1045
|
db,
|
|
860
1046
|
systemId,
|
|
861
1047
|
configurationId: configId,
|
|
1048
|
+
environmentId,
|
|
862
1049
|
status: result.status,
|
|
863
1050
|
latencyMs: result.latencyMs,
|
|
864
1051
|
runTimestamp: new Date(),
|
|
@@ -867,13 +1054,15 @@ async function executeHealthCheckJob(props: {
|
|
|
867
1054
|
sourceLabel: "Local",
|
|
868
1055
|
});
|
|
869
1056
|
|
|
870
|
-
|
|
1057
|
+
// Env-scoped view: the per-env entity reflects only this env's runs.
|
|
1058
|
+
newState = await service.getSystemHealthStatus(systemId, environmentId);
|
|
871
1059
|
return toHealthEntityView(newState);
|
|
872
1060
|
},
|
|
873
|
-
serialize:
|
|
1061
|
+
serialize: serializeEnvWrite,
|
|
874
1062
|
onError: (error) =>
|
|
875
|
-
logger.warn(`Failed to mirror health entity for ${
|
|
1063
|
+
logger.warn(`Failed to mirror health entity for ${envEntityId}`, error),
|
|
876
1064
|
});
|
|
1065
|
+
anyEnvRunPersisted = true;
|
|
877
1066
|
|
|
878
1067
|
logger.debug(
|
|
879
1068
|
`Ran health check ${configId} for system ${systemId}: ${result.status}`,
|
|
@@ -909,6 +1098,7 @@ async function executeHealthCheckJob(props: {
|
|
|
909
1098
|
db,
|
|
910
1099
|
systemId,
|
|
911
1100
|
configurationId: configId,
|
|
1101
|
+
environmentId,
|
|
912
1102
|
fromStatus: previousStatus,
|
|
913
1103
|
toStatus: newState.status,
|
|
914
1104
|
});
|
|
@@ -927,12 +1117,19 @@ async function executeHealthCheckJob(props: {
|
|
|
927
1117
|
logger,
|
|
928
1118
|
});
|
|
929
1119
|
|
|
930
|
-
//
|
|
931
|
-
|
|
932
|
-
|
|
933
|
-
|
|
934
|
-
|
|
935
|
-
|
|
1120
|
+
// The system-level `SYSTEM_STATUS_CHANGED` signal must carry the ROLLUP
|
|
1121
|
+
// status, not a per-env status. When fanned out, the post-loop rollup
|
|
1122
|
+
// write broadcasts it once with the worst-status rollup; emitting it here
|
|
1123
|
+
// per env would send up to N system-level signals/tick carrying per-env
|
|
1124
|
+
// status. Only the env-less run (which IS the rollup — `!isFannedOut`)
|
|
1125
|
+
// broadcasts the system-level signal from inside the loop.
|
|
1126
|
+
if (!isFannedOut) {
|
|
1127
|
+
await signalService.broadcast(SYSTEM_STATUS_CHANGED, {
|
|
1128
|
+
systemId,
|
|
1129
|
+
previousStatus: previousStatus as HealthCheckStatus,
|
|
1130
|
+
newStatus: newState.status,
|
|
1131
|
+
});
|
|
1132
|
+
}
|
|
936
1133
|
|
|
937
1134
|
// The directional + umbrella system-health hooks were removed in
|
|
938
1135
|
// Phase 4 (§10.3): the `health` entity mirror above is the single
|
|
@@ -940,6 +1137,91 @@ async function executeHealthCheckJob(props: {
|
|
|
940
1137
|
// `healthcheck.system_degraded` / `_healthy` / `_health_changed`
|
|
941
1138
|
// trigger events through Stage-1 routing. Nothing to emit here.
|
|
942
1139
|
}
|
|
1140
|
+
} catch (envError) {
|
|
1141
|
+
// Isolate this environment's failure; continue with the next env.
|
|
1142
|
+
logger.error(
|
|
1143
|
+
`Failed to run health check ${configId} for system ${systemId}` +
|
|
1144
|
+
(environmentId ? ` (environment ${environmentId})` : " (env-less)"),
|
|
1145
|
+
envError,
|
|
1146
|
+
);
|
|
1147
|
+
}
|
|
1148
|
+
} // end per-environment fan-out loop (for ... of runEnvironments)
|
|
1149
|
+
|
|
1150
|
+
// ── System rollup write (§7.4.3) ───────────────────────────────────────
|
|
1151
|
+
// With real environments, the per-env writes mutated `<systemId>::<env>`
|
|
1152
|
+
// entities; the bare `<systemId>` ROLLUP entity (the worst-status view
|
|
1153
|
+
// every existing system-level consumer references) must now recompute so
|
|
1154
|
+
// it diffs/emits its OWN `ENTITY_CHANGED`. The rollup `apply` does NO new
|
|
1155
|
+
// durable insert (the runs are already persisted by the per-env writes) —
|
|
1156
|
+
// it just recomputes + returns the all-runs rollup view so the framework
|
|
1157
|
+
// diffs prev → next. Keyed on the bare `health:<systemId>` lock so it
|
|
1158
|
+
// serializes against itself, independent of the per-env locks.
|
|
1159
|
+
//
|
|
1160
|
+
// Skipped when env-less (the loop's lone write already targeted the bare
|
|
1161
|
+
// `<systemId>` entity = the rollup) or when nothing persisted (a fully
|
|
1162
|
+
// isolated-failure loop left no new runs to roll up).
|
|
1163
|
+
if (isFannedOut && anyEnvRunPersisted) {
|
|
1164
|
+
const rollupEntityId = encodeHealthEntityId({ systemId });
|
|
1165
|
+
let rollupState!: AggregatedHealth;
|
|
1166
|
+
try {
|
|
1167
|
+
await writeHealthEntity({
|
|
1168
|
+
handle: getHealthEntity?.(),
|
|
1169
|
+
entityId: rollupEntityId,
|
|
1170
|
+
apply: async () => {
|
|
1171
|
+
// No durable insert — recompute the all-runs (rollup) view.
|
|
1172
|
+
rollupState = await service.getSystemHealthStatus(systemId);
|
|
1173
|
+
return toHealthEntityView(rollupState);
|
|
1174
|
+
},
|
|
1175
|
+
serialize: makeHealthSerializer(rollupEntityId),
|
|
1176
|
+
onError: (error) =>
|
|
1177
|
+
logger.warn(
|
|
1178
|
+
`Failed to mirror rollup health entity for ${systemId}`,
|
|
1179
|
+
error,
|
|
1180
|
+
),
|
|
1181
|
+
});
|
|
1182
|
+
|
|
1183
|
+
// Record the ROLLUP transition (environmentId = null) so system-level
|
|
1184
|
+
// "in status since" reflects the aggregate, and notify on a real
|
|
1185
|
+
// rollup status change so existing system-level notifications fire.
|
|
1186
|
+
if (rollupState.status !== rollupPreviousStatus) {
|
|
1187
|
+
await recordStateTransition({
|
|
1188
|
+
db,
|
|
1189
|
+
systemId,
|
|
1190
|
+
configurationId: configId,
|
|
1191
|
+
environmentId: null,
|
|
1192
|
+
fromStatus: rollupPreviousStatus,
|
|
1193
|
+
toStatus: rollupState.status,
|
|
1194
|
+
});
|
|
1195
|
+
|
|
1196
|
+
await notifyStateChange({
|
|
1197
|
+
notificationClient,
|
|
1198
|
+
systemId,
|
|
1199
|
+
systemName,
|
|
1200
|
+
configurationId: configId,
|
|
1201
|
+
previousStatus: rollupPreviousStatus,
|
|
1202
|
+
newStatus: rollupState.status,
|
|
1203
|
+
service,
|
|
1204
|
+
catalogClient,
|
|
1205
|
+
maintenanceClient,
|
|
1206
|
+
incidentClient,
|
|
1207
|
+
logger,
|
|
1208
|
+
});
|
|
1209
|
+
|
|
1210
|
+
await signalService.broadcast(SYSTEM_STATUS_CHANGED, {
|
|
1211
|
+
systemId,
|
|
1212
|
+
previousStatus: rollupPreviousStatus as HealthCheckStatus,
|
|
1213
|
+
newStatus: rollupState.status,
|
|
1214
|
+
});
|
|
1215
|
+
}
|
|
1216
|
+
} catch (rollupError) {
|
|
1217
|
+
// The rollup is best-effort reactivity over already-durable runs; a
|
|
1218
|
+
// failure must not wedge the (completed) per-env runs.
|
|
1219
|
+
logger.error(
|
|
1220
|
+
`Failed to write system rollup health for ${systemId}`,
|
|
1221
|
+
rollupError,
|
|
1222
|
+
);
|
|
1223
|
+
}
|
|
1224
|
+
}
|
|
943
1225
|
|
|
944
1226
|
// Note: No manual rescheduling needed - recurring job handles it automatically
|
|
945
1227
|
} catch (error) {
|
|
@@ -948,15 +1230,17 @@ async function executeHealthCheckJob(props: {
|
|
|
948
1230
|
error,
|
|
949
1231
|
);
|
|
950
1232
|
|
|
951
|
-
//
|
|
952
|
-
//
|
|
953
|
-
//
|
|
954
|
-
//
|
|
955
|
-
//
|
|
1233
|
+
// Catastrophic job-level failure (e.g. the config fetch / env resolution
|
|
1234
|
+
// threw before the fan-out loop). Persist a single env-less failure run
|
|
1235
|
+
// against the bare `<systemId>` entity — which IS the system rollup — so
|
|
1236
|
+
// the system-level health change still emits. Reuses the pre-tick
|
|
1237
|
+
// rollup status captured before the try block.
|
|
1238
|
+
const rollupEntityId = encodeHealthEntityId({ systemId });
|
|
1239
|
+
const previousStatus = rollupPreviousStatus;
|
|
956
1240
|
let newState!: AggregatedHealth;
|
|
957
1241
|
await writeHealthEntity({
|
|
958
1242
|
handle: getHealthEntity?.(),
|
|
959
|
-
|
|
1243
|
+
entityId: rollupEntityId,
|
|
960
1244
|
apply: async () => {
|
|
961
1245
|
// Store failure (no latencyMs for failures)
|
|
962
1246
|
await db.insert(healthCheckRuns).values({
|
|
@@ -984,10 +1268,10 @@ async function executeHealthCheckJob(props: {
|
|
|
984
1268
|
newState = await service.getSystemHealthStatus(systemId);
|
|
985
1269
|
return toHealthEntityView(newState);
|
|
986
1270
|
},
|
|
987
|
-
serialize:
|
|
1271
|
+
serialize: makeHealthSerializer(rollupEntityId),
|
|
988
1272
|
onError: (mirrorError) =>
|
|
989
1273
|
logger.warn(
|
|
990
|
-
`Failed to mirror health entity for ${
|
|
1274
|
+
`Failed to mirror health entity for ${rollupEntityId}`,
|
|
991
1275
|
mirrorError,
|
|
992
1276
|
),
|
|
993
1277
|
});
|
|
@@ -171,6 +171,36 @@ describe("incrementHourlyAggregate", () => {
|
|
|
171
171
|
expect(inserted.maxLatencyMs).toBe(150);
|
|
172
172
|
});
|
|
173
173
|
|
|
174
|
+
it("writes the environmentId into the aggregate (per-environment fan-out)", async () => {
|
|
175
|
+
await incrementHourlyAggregate({
|
|
176
|
+
db: mockDb as never,
|
|
177
|
+
systemId: "sys-1",
|
|
178
|
+
configurationId: "config-1",
|
|
179
|
+
environmentId: "prod",
|
|
180
|
+
status: "healthy",
|
|
181
|
+
latencyMs: 150,
|
|
182
|
+
runTimestamp: new Date("2024-01-15T10:35:00Z"),
|
|
183
|
+
});
|
|
184
|
+
|
|
185
|
+
const inserted = insertedValues[0] as Record<string, unknown>;
|
|
186
|
+
expect(inserted.environmentId).toBe("prod");
|
|
187
|
+
});
|
|
188
|
+
|
|
189
|
+
it("normalizes an env-less run to environmentId = null", async () => {
|
|
190
|
+
await incrementHourlyAggregate({
|
|
191
|
+
db: mockDb as never,
|
|
192
|
+
systemId: "sys-1",
|
|
193
|
+
configurationId: "config-1",
|
|
194
|
+
// environmentId omitted -> env-less run
|
|
195
|
+
status: "healthy",
|
|
196
|
+
latencyMs: 150,
|
|
197
|
+
runTimestamp: new Date("2024-01-15T10:35:00Z"),
|
|
198
|
+
});
|
|
199
|
+
|
|
200
|
+
const inserted = insertedValues[0] as Record<string, unknown>;
|
|
201
|
+
expect(inserted.environmentId).toBeNull();
|
|
202
|
+
});
|
|
203
|
+
|
|
174
204
|
it("increments counts for unhealthy status", async () => {
|
|
175
205
|
await incrementHourlyAggregate({
|
|
176
206
|
db: mockDb as never,
|
|
@@ -60,6 +60,12 @@ interface IncrementHourlyAggregateParams {
|
|
|
60
60
|
db: Db;
|
|
61
61
|
systemId: string;
|
|
62
62
|
configurationId: string;
|
|
63
|
+
/**
|
|
64
|
+
* Environment this run was for (per-environment fan-out). null/undefined =
|
|
65
|
+
* env-less run. Part of the aggregate uniqueness key so per-environment
|
|
66
|
+
* buckets stay separate.
|
|
67
|
+
*/
|
|
68
|
+
environmentId?: string | null;
|
|
63
69
|
status: "healthy" | "unhealthy" | "degraded";
|
|
64
70
|
latencyMs: number | undefined;
|
|
65
71
|
runTimestamp: Date;
|
|
@@ -87,6 +93,7 @@ export async function incrementHourlyAggregate(
|
|
|
87
93
|
db,
|
|
88
94
|
systemId,
|
|
89
95
|
configurationId,
|
|
96
|
+
environmentId,
|
|
90
97
|
status,
|
|
91
98
|
latencyMs,
|
|
92
99
|
runTimestamp,
|
|
@@ -96,6 +103,10 @@ export async function incrementHourlyAggregate(
|
|
|
96
103
|
sourceLabel,
|
|
97
104
|
} = params;
|
|
98
105
|
|
|
106
|
+
// Normalize undefined -> null so the env-less slice is one stable key
|
|
107
|
+
// (NULLS NOT DISTINCT matches on it via the IS NULL predicate below).
|
|
108
|
+
const envId = environmentId ?? null;
|
|
109
|
+
|
|
99
110
|
const bucketStart = getHourBucketStart(runTimestamp);
|
|
100
111
|
|
|
101
112
|
// First, try to fetch existing aggregate to merge t-digest state and collector data
|
|
@@ -111,6 +122,9 @@ export async function incrementHourlyAggregate(
|
|
|
111
122
|
and(
|
|
112
123
|
eq(healthCheckAggregates.systemId, systemId),
|
|
113
124
|
eq(healthCheckAggregates.configurationId, configurationId),
|
|
125
|
+
envId
|
|
126
|
+
? eq(healthCheckAggregates.environmentId, envId)
|
|
127
|
+
: sql`${healthCheckAggregates.environmentId} IS NULL`,
|
|
114
128
|
eq(healthCheckAggregates.bucketStart, bucketStart),
|
|
115
129
|
eq(healthCheckAggregates.bucketSize, "hourly"),
|
|
116
130
|
sourceId
|
|
@@ -177,6 +191,7 @@ export async function incrementHourlyAggregate(
|
|
|
177
191
|
.values({
|
|
178
192
|
configurationId,
|
|
179
193
|
systemId,
|
|
194
|
+
environmentId: envId,
|
|
180
195
|
bucketStart,
|
|
181
196
|
bucketSize: "hourly",
|
|
182
197
|
runCount: 1,
|
|
@@ -197,6 +212,7 @@ export async function incrementHourlyAggregate(
|
|
|
197
212
|
target: [
|
|
198
213
|
healthCheckAggregates.configurationId,
|
|
199
214
|
healthCheckAggregates.systemId,
|
|
215
|
+
healthCheckAggregates.environmentId,
|
|
200
216
|
healthCheckAggregates.bucketStart,
|
|
201
217
|
healthCheckAggregates.bucketSize,
|
|
202
218
|
healthCheckAggregates.sourceId,
|