@checkstack/healthcheck-backend 1.4.0 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. package/CHANGELOG.md +303 -0
  2. package/drizzle/0018_abnormal_preak.sql +10 -0
  3. package/drizzle/meta/0018_snapshot.json +600 -0
  4. package/drizzle/meta/_journal.json +7 -0
  5. package/package.json +26 -21
  6. package/src/ai/assertion-validation.test.ts +117 -0
  7. package/src/ai/assertion-validation.ts +147 -0
  8. package/src/ai/healthcheck-capabilities.test.ts +158 -0
  9. package/src/ai/healthcheck-capabilities.ts +217 -0
  10. package/src/ai/healthcheck-delete.test.ts +81 -0
  11. package/src/ai/healthcheck-delete.ts +81 -0
  12. package/src/ai/healthcheck-projection.test.ts +36 -0
  13. package/src/ai/healthcheck-propose.test.ts +268 -0
  14. package/src/ai/healthcheck-propose.ts +290 -0
  15. package/src/ai/healthcheck-script-tools.test.ts +93 -0
  16. package/src/ai/healthcheck-script-tools.ts +179 -0
  17. package/src/ai/healthcheck-update.test.ts +123 -0
  18. package/src/ai/healthcheck-update.ts +123 -0
  19. package/src/ai/notify-subscribers.test.ts +109 -0
  20. package/src/ai/notify-subscribers.ts +176 -0
  21. package/src/ai/register-ai-tools.test.ts +41 -0
  22. package/src/ai/register-ai-tools.ts +53 -0
  23. package/src/ai/shell-env-table.test.ts +47 -0
  24. package/src/automations.test.ts +2 -1
  25. package/src/automations.ts +9 -1
  26. package/src/collector-script-test.test.ts +53 -1
  27. package/src/collector-script-test.ts +59 -7
  28. package/src/effective-environments.test.ts +93 -0
  29. package/src/effective-environments.ts +64 -0
  30. package/src/health-entity-id.ts +57 -0
  31. package/src/health-entity.test.ts +405 -31
  32. package/src/health-entity.ts +99 -43
  33. package/src/health-state.ts +41 -4
  34. package/src/healthcheck-gitops-kinds.test.ts +95 -0
  35. package/src/healthcheck-gitops-kinds.ts +56 -13
  36. package/src/index.ts +33 -0
  37. package/src/migration-chain-contract.test.ts +57 -0
  38. package/src/queue-executor.test.ts +814 -0
  39. package/src/queue-executor.ts +342 -50
  40. package/src/realtime-aggregation.test.ts +30 -0
  41. package/src/realtime-aggregation.ts +16 -0
  42. package/src/retention-job.ts +167 -93
  43. package/src/retention-rollup.test.ts +118 -0
  44. package/src/router.test.ts +120 -1
  45. package/src/router.ts +20 -0
  46. package/src/schema.ts +44 -6
  47. package/src/service.ts +199 -43
  48. package/src/state-evaluator.test.ts +50 -5
  49. package/src/state-evaluator.ts +9 -2
  50. package/src/state-transitions.test.ts +104 -0
  51. package/src/state-transitions.ts +39 -1
  52. package/src/validate-configuration.test.ts +205 -0
  53. package/src/validate-configuration.ts +159 -0
  54. package/tsconfig.json +9 -0
@@ -9,6 +9,8 @@ import {
9
9
  type ConnectedClient,
10
10
  type TransportClient,
11
11
  type CollectorRunContext,
12
+ type AdvisoryLockService,
13
+ renderTemplatableConfig,
12
14
  } from "@checkstack/backend-api";
13
15
  import { QueueManager } from "@checkstack/queue-api";
14
16
  import {
@@ -22,6 +24,7 @@ import { type SignalService } from "@checkstack/signal-common";
22
24
  import {
23
25
  HEALTH_CHECK_RUN_COMPLETED,
24
26
  SYSTEM_STATUS_CHANGED,
27
+ ENVIRONMENT_RESOLUTION_FAILED,
25
28
  type HealthCheckStatus,
26
29
  stripEphemeralFields,
27
30
  } from "@checkstack/healthcheck-common";
@@ -29,7 +32,12 @@ import {
29
32
  CatalogApi,
30
33
  catalogRoutes,
31
34
  createSystemSubject,
35
+ type Environment,
32
36
  } from "@checkstack/catalog-common";
37
+ import {
38
+ resolveEffectiveEnvironments,
39
+ type EffectiveEnvironment,
40
+ } from "./effective-environments";
33
41
  import { systemHealthCollapseKey } from "@checkstack/healthcheck-common";
34
42
  import { MaintenanceApi } from "@checkstack/maintenance-common";
35
43
  import { IncidentApi } from "@checkstack/incident-common";
@@ -52,6 +60,7 @@ import {
52
60
  createHealthEntitySerializer,
53
61
  type HealthEntityState,
54
62
  } from "./health-entity";
63
+ import { encodeHealthEntityId } from "./health-entity-id";
55
64
  import type { EntityHandle } from "@checkstack/automation-backend";
56
65
 
57
66
  type Db = SafeDatabase<typeof schema>;
@@ -375,6 +384,7 @@ async function notifyStateChange(props: {
375
384
  async function executeHealthCheckJob(props: {
376
385
  payload: HealthCheckJobPayload;
377
386
  db: Db;
387
+ advisoryLock: AdvisoryLockService;
378
388
  registry: HealthCheckRegistry;
379
389
  collectorRegistry: CollectorRegistry;
380
390
  logger: Logger;
@@ -404,6 +414,7 @@ async function executeHealthCheckJob(props: {
404
414
  const {
405
415
  payload,
406
416
  db,
417
+ advisoryLock,
407
418
  registry,
408
419
  collectorRegistry,
409
420
  logger,
@@ -422,17 +433,22 @@ async function executeHealthCheckJob(props: {
422
433
  // Create service for aggregated state evaluation
423
434
  const service = new HealthCheckService(db, registry, collectorRegistry);
424
435
 
425
- // Per-system serializer for the reactive health mutate (§10.3): a
426
- // transaction-scoped advisory lock keyed `health:<systemId>` wraps the
427
- // snapshot-prev + apply + diff + emit so concurrent evaluations of one
428
- // system (multiple per-config jobs across pods, or at-least-once
429
- // redelivery) can't double-emit a single logical transition. Bound to this
430
- // job's systemId below at every `writeHealthEntity` call.
431
- const serializeHealthWrite = createHealthEntitySerializer({ db })(systemId);
432
-
433
- // Capture aggregated state BEFORE this run for comparison
434
- const previousState = await service.getSystemHealthStatus(systemId);
435
- const previousStatus = previousState.status;
436
+ // Per-ENTITY serializer factory for the reactive health mutate (§10.3,
437
+ // Phase 3b): a transaction-scoped advisory lock keyed `health:<entityId>`
438
+ // wraps the snapshot-prev + apply + diff + emit so concurrent evaluations
439
+ // of one (system, environment) — or of the system rollup — can't double-emit
440
+ // a single logical transition. Bound to the qualified entity id at each
441
+ // `writeHealthEntity` call so distinct envs / the rollup don't block each
442
+ // other.
443
+ const makeHealthSerializer = createHealthEntitySerializer({ advisoryLock });
444
+
445
+ // The system-rollup status BEFORE this tick (all environments + env-less).
446
+ // Captured once so the post-loop rollup write (§7.4.3) — and the
447
+ // catastrophic-failure path — can record a correct prev → next rollup
448
+ // transition (environmentId = null). This is the system-wide aggregate read
449
+ // the executor has always taken first.
450
+ const rollupPreviousState = await service.getSystemHealthStatus(systemId);
451
+ const rollupPreviousStatus = rollupPreviousState.status;
436
452
 
437
453
  try {
438
454
  // Fetch configuration (including name for signals)
@@ -448,6 +464,7 @@ async function executeHealthCheckJob(props: {
448
464
  paused: healthCheckConfigurations.paused,
449
465
  includeLocal: systemHealthChecks.includeLocal,
450
466
  satelliteIds: systemHealthChecks.satelliteIds,
467
+ environmentIds: systemHealthChecks.environmentIds,
451
468
  })
452
469
  .from(systemHealthChecks)
453
470
  .innerJoin(
@@ -503,17 +520,6 @@ async function executeHealthCheckJob(props: {
503
520
  logger.debug(`Could not fetch system name for ${systemId}, using ID`);
504
521
  }
505
522
 
506
- // Curated, read-only run-context metadata exposed to collectors.
507
- // Metadata only - never secrets or config.
508
- const runContext: CollectorRunContext = {
509
- check: {
510
- id: configId,
511
- name: configRow.configName || configId,
512
- intervalSeconds: configRow.interval,
513
- },
514
- system: { id: systemId, name: systemName },
515
- };
516
-
517
523
  const strategy = registry.getStrategy(configRow.strategyId);
518
524
  if (!strategy) {
519
525
  logger.warn(
@@ -522,10 +528,158 @@ async function executeHealthCheckJob(props: {
522
528
  return;
523
529
  }
524
530
 
525
- // Extract timeout from strategy config for platform-level enforcement
526
- const strategyConfig = configRow.config as unknown as BaseStrategyConfig;
531
+ // Migrate the stored (UNVERSIONED) strategy config ONCE, before the
532
+ // per-environment render loop, so every env renders from the same
533
+ // migrated shape. Stored configs predate explicit versioning and may be
534
+ // genuinely v1 (e.g. an HTTP config still carrying url/method); assume-v1
535
+ // -on-read runs the declared migration chain, then validates. The
536
+ // migrations are idempotent, so an already-current config is a no-op.
537
+ const strategyConfig: BaseStrategyConfig =
538
+ await strategy.config.parseAssumingV1(configRow.config);
527
539
  const executionTimeout = strategyConfig.timeout ?? 60_000;
528
540
 
541
+ // ── Per-environment fan-out (§7) ────────────────────────────────────────
542
+ // Resolve the effective environment set from the assignment + the
543
+ // system's current catalog membership, then run ONCE PER environment.
544
+ // An empty effective set (opt-out `[]`, or `null` with no membership)
545
+ // collapses to a single env-less run with `environment` unset — exactly
546
+ // the pre-feature behavior. Membership lives ONLY in the catalog Postgres
547
+ // tables and is re-read every tick via the cross-plugin RPC, so every pod
548
+ // resolves the same set (state-and-scale: no pod-local env state).
549
+ let membership: Environment[] = [];
550
+ try {
551
+ membership = await catalogClient.resolveSystemEnvironments({ systemId });
552
+ } catch (error) {
553
+ // Fail-open: a catalog read failure must not wedge the check. Degrade
554
+ // to an env-less run (today's behavior) rather than skipping the tick.
555
+ logger.warn(
556
+ `Could not resolve environments for system ${systemId}, running env-less`,
557
+ error,
558
+ );
559
+ // Observability: a `logger.warn` alone is easy to miss when a durable
560
+ // catalog misconfig (or outage) silently strips per-environment fan-out.
561
+ // Broadcast a counter-style signal so the degradation is observable.
562
+ // Best-effort — never let the signal break the (still-running) check.
563
+ try {
564
+ await signalService.broadcast(ENVIRONMENT_RESOLUTION_FAILED, {
565
+ systemId,
566
+ configurationId: configId,
567
+ error: extractErrorMessage(error),
568
+ });
569
+ } catch (signalError) {
570
+ logger.warn(
571
+ `Failed to broadcast environment-resolution-failed signal for ${systemId}`,
572
+ signalError,
573
+ );
574
+ }
575
+ }
576
+ const effectiveEnvs = resolveEffectiveEnvironments({
577
+ environmentIds: configRow.environmentIds,
578
+ membership,
579
+ });
580
+ // `null` env => the single env-less run. Each entry => one run per env.
581
+ const runEnvironments: (EffectiveEnvironment | null)[] =
582
+ effectiveEnvs.length > 0 ? effectiveEnvs : [null];
583
+
584
+ // Execute one run per effective environment. Runs are independent (own
585
+ // status / latency / result) and persisted with their own
586
+ // `environmentId`. Phase 3b: each env-run mutates its OWN env-qualified
587
+ // `health` entity (`<systemId>::<environmentId>`, or the bare `<systemId>`
588
+ // for the env-less run) through a per-entity serializer; after the loop a
589
+ // single ROLLUP write for the bare `<systemId>` recomputes the worst-status
590
+ // rollup so system-level consumers keep firing off the unchanged id.
591
+ //
592
+ // Track whether ANY per-env run persisted (so the rollup write only runs
593
+ // when there is something to roll up — an all-failed loop still leaves the
594
+ // durable runs the per-env apply already wrote).
595
+ let anyEnvRunPersisted = false;
596
+ // Whether this tick fans out into REAL environments (vs. the single
597
+ // env-less run). When env-less, the loop's lone write already targets the
598
+ // bare `<systemId>` entity — which IS the rollup — so no separate rollup
599
+ // write is needed. With real envs, the loop writes `<systemId>::<env>`
600
+ // entities and we recompute the bare-`<systemId>` rollup after the loop.
601
+ const isFannedOut = effectiveEnvs.length > 0;
602
+ for (const environment of runEnvironments) {
603
+ const environmentId = environment?.id ?? null;
604
+ // The env-qualified entity id this run mutates. For the env-less run
605
+ // (environmentId === null) this is the bare systemId — which is also the
606
+ // rollup id, so the env-less run IS the rollup (no separate rollup write
607
+ // is needed when the system has no environments — see below).
608
+ const envEntityId = encodeHealthEntityId({ systemId, environmentId });
609
+ const serializeEnvWrite = makeHealthSerializer(envEntityId);
610
+
611
+ // Per-env baseline status for the transition log: the env-scoped
612
+ // aggregate BEFORE this run. Computed per env so a transition row is
613
+ // recorded against the right (system, environment) streak.
614
+ const previousState = await service.getSystemHealthStatus(
615
+ systemId,
616
+ environmentId,
617
+ );
618
+ const previousStatus = previousState.status;
619
+
620
+ // Curated, read-only run-context metadata exposed to collectors.
621
+ // Metadata only - never secrets or config. `environment` carries the
622
+ // resolved env's verbatim custom fields for this run (Phase 2 surfaces
623
+ // consume it); absent for the env-less run.
624
+ const runContext: CollectorRunContext = {
625
+ check: {
626
+ id: configId,
627
+ name: configRow.configName || configId,
628
+ intervalSeconds: configRow.interval,
629
+ },
630
+ system: { id: systemId, name: systemName },
631
+ ...(environment
632
+ ? {
633
+ environment: {
634
+ id: environment.id,
635
+ name: environment.name,
636
+ fields: environment.fields,
637
+ },
638
+ }
639
+ : {}),
640
+ };
641
+
642
+ // Templating context for the per-env config render pass (§6.3.3).
643
+ // Carries only environment custom fields + curated check/system
644
+ // metadata - never secrets. `{{ environment.baseUrl }}` resolves from
645
+ // the resolved env's verbatim fields; an env-less run gets `{}` so a
646
+ // reference renders to empty string (strict: false); see the debug log
647
+ // below.
648
+ const templateContext = {
649
+ environment: runContext.environment?.fields ?? {},
650
+ check: runContext.check,
651
+ system: runContext.system,
652
+ };
653
+ if (!runContext.environment) {
654
+ // §11.6: render-empty when a run has no environment. An env-less run is
655
+ // a legitimate, documented configuration (the None assignment mode, or
656
+ // All-environments with no membership), and it recurs every interval -
657
+ // so this is `debug`, not `warn`, to avoid spamming the log. When an
658
+ // empty `{{ environment.* }}` render actually matters, the HTTP
659
+ // post-render `.url()` check already fails the run with a concrete
660
+ // "Rendered URL is invalid" error; we do not inspect every field here.
661
+ logger.debug(
662
+ `Health check ${configId} for system ${systemId} ran with no environment; ` +
663
+ `any {{ environment.* }} references render to empty string`,
664
+ );
665
+ }
666
+
667
+ // (2) Environment/templating pass (NEW) - renders `{{ environment.* }}`
668
+ // etc. in `x-templatable` fields. Runs PER ENVIRONMENT, AFTER the secret
669
+ // resolution (secrets first, templating second - §6.3.4) and BEFORE the
670
+ // strategy client build, so each env gets its own rendered strategy
671
+ // config + client. The collector configs are rendered just before each
672
+ // collector executes (below) so the secretEnv resolution stays first.
673
+ const renderedStrategyConfig = renderTemplatableConfig({
674
+ config: strategyConfig,
675
+ schema: strategy.config.schema,
676
+ context: templateContext,
677
+ }) as BaseStrategyConfig;
678
+
679
+ // Per-environment isolation: an unexpected failure persisting ONE
680
+ // environment's run must not abort the sibling environments' runs.
681
+ // Each iteration's run is independent (§7.2), so we log and continue.
682
+ try {
529
683
  // Execute health check using createClient pattern with unified hard timeout
530
684
  const start = performance.now();
531
685
  let connectionTimeMs: number | undefined;
@@ -541,8 +695,11 @@ async function executeHealthCheckJob(props: {
541
695
  // Platform-level hard timeout wrapping the entire execution sequence
542
696
  await Promise.race([
543
697
  (async () => {
544
- // 1. Establish connection
545
- connectedClient = await strategy.createClient(strategyConfig);
698
+ // 1. Establish connection. The strategy client build moves INSIDE
699
+ // the per-env loop (§6.3.3): each env gets its own rendered config +
700
+ // client, so a single job no longer bakes in one env's rendered
701
+ // strategy config.
702
+ connectedClient = await strategy.createClient(renderedStrategyConfig);
546
703
  connectionTimeMs = Math.round(performance.now() - start);
547
704
 
548
705
  // 2. Execute collectors in parallel
@@ -579,8 +736,31 @@ async function executeHealthCheckJob(props: {
579
736
  secretEnv = resolved.env;
580
737
  }
581
738
 
739
+ // Migrate the stored (UNVERSIONED) collector config via
740
+ // assume-v1-on-read: runs the declared migration chain, then
741
+ // validates. Migrations are idempotent, so an already-current
742
+ // config is a no-op. This runs BEFORE templating so the render
743
+ // pass sees the migrated shape; the secretEnv resolution above
744
+ // reads the raw `secretEnv` mapping (a constant string field
745
+ // unaffected by the strategy/collector reshapes), keeping the
746
+ // migrate -> secret resolve -> render -> execute order intact.
747
+ const migratedCollectorConfig =
748
+ await registered.collector.config.parseAssumingV1(
749
+ collectorEntry.config,
750
+ );
751
+
752
+ // (2) Environment/templating pass for the collector config -
753
+ // runs AFTER the secretEnv resolution above (secrets first,
754
+ // templating second) and renders `{{ environment.* }}` in this
755
+ // collector's `x-templatable` fields against the per-env context.
756
+ const renderedCollectorConfig = renderTemplatableConfig({
757
+ config: migratedCollectorConfig,
758
+ schema: registered.collector.config.schema,
759
+ context: templateContext,
760
+ });
761
+
582
762
  const collectorResult = await registered.collector.execute({
583
- config: collectorEntry.config,
763
+ config: renderedCollectorConfig,
584
764
  client: connectedClient!.client,
585
765
  pluginId: configRow.strategyId,
586
766
  runContext,
@@ -723,11 +903,12 @@ async function executeHealthCheckJob(props: {
723
903
  let newState!: AggregatedHealth;
724
904
  await writeHealthEntity({
725
905
  handle: getHealthEntity?.(),
726
- systemId,
906
+ entityId: envEntityId,
727
907
  apply: async () => {
728
908
  await db.insert(healthCheckRuns).values({
729
909
  configurationId: configId,
730
910
  systemId,
911
+ environmentId,
731
912
  status: result.status,
732
913
  latencyMs: result.latencyMs,
733
914
  result: { ...result } as Record<string, unknown>,
@@ -739,6 +920,7 @@ async function executeHealthCheckJob(props: {
739
920
  db,
740
921
  systemId,
741
922
  configurationId: configId,
923
+ environmentId,
742
924
  status: result.status,
743
925
  latencyMs: result.latencyMs,
744
926
  runTimestamp: new Date(),
@@ -747,13 +929,18 @@ async function executeHealthCheckJob(props: {
747
929
  sourceLabel: "Local",
748
930
  });
749
931
 
750
- newState = await service.getSystemHealthStatus(systemId);
932
+ // Env-scoped view: the per-env entity reflects only this env's runs.
933
+ newState = await service.getSystemHealthStatus(systemId, environmentId);
751
934
  return toHealthEntityView(newState);
752
935
  },
753
- serialize: serializeHealthWrite,
936
+ serialize: serializeEnvWrite,
754
937
  onError: (error) =>
755
- logger.warn(`Failed to mirror health entity for ${systemId}`, error),
938
+ logger.warn(
939
+ `Failed to mirror health entity for ${envEntityId}`,
940
+ error,
941
+ ),
756
942
  });
943
+ anyEnvRunPersisted = true;
757
944
 
758
945
  logger.debug(
759
946
  `Health check ${configId} for system ${systemId} failed: ${finalError}`,
@@ -779,6 +966,7 @@ async function executeHealthCheckJob(props: {
779
966
  db,
780
967
  systemId,
781
968
  configurationId: configId,
969
+ environmentId,
782
970
  fromStatus: previousStatus,
783
971
  toStatus: newState.status,
784
972
  });
@@ -798,7 +986,9 @@ async function executeHealthCheckJob(props: {
798
986
  });
799
987
  }
800
988
 
801
- return;
989
+ // This environment's run is done (failed). Continue to the next
990
+ // effective environment rather than ending the whole job.
991
+ continue;
802
992
  } finally {
803
993
  if (connectedClient) {
804
994
  try {
@@ -836,12 +1026,13 @@ async function executeHealthCheckJob(props: {
836
1026
  let newState!: AggregatedHealth;
837
1027
  await writeHealthEntity({
838
1028
  handle: getHealthEntity?.(),
839
- systemId,
1029
+ entityId: envEntityId,
840
1030
  apply: async () => {
841
1031
  // Store result (spread to convert structured type to plain record for jsonb)
842
1032
  await db.insert(healthCheckRuns).values({
843
1033
  configurationId: configId,
844
1034
  systemId,
1035
+ environmentId,
845
1036
  status: result.status,
846
1037
  latencyMs: result.latencyMs,
847
1038
  result: { ...result } as Record<string, unknown>,
@@ -854,6 +1045,7 @@ async function executeHealthCheckJob(props: {
854
1045
  db,
855
1046
  systemId,
856
1047
  configurationId: configId,
1048
+ environmentId,
857
1049
  status: result.status,
858
1050
  latencyMs: result.latencyMs,
859
1051
  runTimestamp: new Date(),
@@ -862,13 +1054,15 @@ async function executeHealthCheckJob(props: {
862
1054
  sourceLabel: "Local",
863
1055
  });
864
1056
 
865
- newState = await service.getSystemHealthStatus(systemId);
1057
+ // Env-scoped view: the per-env entity reflects only this env's runs.
1058
+ newState = await service.getSystemHealthStatus(systemId, environmentId);
866
1059
  return toHealthEntityView(newState);
867
1060
  },
868
- serialize: serializeHealthWrite,
1061
+ serialize: serializeEnvWrite,
869
1062
  onError: (error) =>
870
- logger.warn(`Failed to mirror health entity for ${systemId}`, error),
1063
+ logger.warn(`Failed to mirror health entity for ${envEntityId}`, error),
871
1064
  });
1065
+ anyEnvRunPersisted = true;
872
1066
 
873
1067
  logger.debug(
874
1068
  `Ran health check ${configId} for system ${systemId}: ${result.status}`,
@@ -904,6 +1098,7 @@ async function executeHealthCheckJob(props: {
904
1098
  db,
905
1099
  systemId,
906
1100
  configurationId: configId,
1101
+ environmentId,
907
1102
  fromStatus: previousStatus,
908
1103
  toStatus: newState.status,
909
1104
  });
@@ -922,12 +1117,19 @@ async function executeHealthCheckJob(props: {
922
1117
  logger,
923
1118
  });
924
1119
 
925
- // Broadcast system-level status change signal for frontend reactivity
926
- await signalService.broadcast(SYSTEM_STATUS_CHANGED, {
927
- systemId,
928
- previousStatus: previousStatus as HealthCheckStatus,
929
- newStatus: newState.status,
930
- });
1120
+ // The system-level `SYSTEM_STATUS_CHANGED` signal must carry the ROLLUP
1121
+ // status, not a per-env status. When fanned out, the post-loop rollup
1122
+ // write broadcasts it once with the worst-status rollup; emitting it here
1123
+ // per env would send up to N system-level signals/tick carrying per-env
1124
+ // status. Only the env-less run (which IS the rollup — `!isFannedOut`)
1125
+ // broadcasts the system-level signal from inside the loop.
1126
+ if (!isFannedOut) {
1127
+ await signalService.broadcast(SYSTEM_STATUS_CHANGED, {
1128
+ systemId,
1129
+ previousStatus: previousStatus as HealthCheckStatus,
1130
+ newStatus: newState.status,
1131
+ });
1132
+ }
931
1133
 
932
1134
  // The directional + umbrella system-health hooks were removed in
933
1135
  // Phase 4 (§10.3): the `health` entity mirror above is the single
@@ -935,6 +1137,91 @@ async function executeHealthCheckJob(props: {
935
1137
  // `healthcheck.system_degraded` / `_healthy` / `_health_changed`
936
1138
  // trigger events through Stage-1 routing. Nothing to emit here.
937
1139
  }
1140
+ } catch (envError) {
1141
+ // Isolate this environment's failure; continue with the next env.
1142
+ logger.error(
1143
+ `Failed to run health check ${configId} for system ${systemId}` +
1144
+ (environmentId ? ` (environment ${environmentId})` : " (env-less)"),
1145
+ envError,
1146
+ );
1147
+ }
1148
+ } // end per-environment fan-out loop (for ... of runEnvironments)
1149
+
1150
+ // ── System rollup write (§7.4.3) ───────────────────────────────────────
1151
+ // With real environments, the per-env writes mutated `<systemId>::<env>`
1152
+ // entities; the bare `<systemId>` ROLLUP entity (the worst-status view
1153
+ // every existing system-level consumer references) must now recompute so
1154
+ // it diffs/emits its OWN `ENTITY_CHANGED`. The rollup `apply` does NO new
1155
+ // durable insert (the runs are already persisted by the per-env writes) —
1156
+ // it just recomputes + returns the all-runs rollup view so the framework
1157
+ // diffs prev → next. Keyed on the bare `health:<systemId>` lock so it
1158
+ // serializes against itself, independent of the per-env locks.
1159
+ //
1160
+ // Skipped when env-less (the loop's lone write already targeted the bare
1161
+ // `<systemId>` entity = the rollup) or when nothing persisted (a fully
1162
+ // isolated-failure loop left no new runs to roll up).
1163
+ if (isFannedOut && anyEnvRunPersisted) {
1164
+ const rollupEntityId = encodeHealthEntityId({ systemId });
1165
+ let rollupState!: AggregatedHealth;
1166
+ try {
1167
+ await writeHealthEntity({
1168
+ handle: getHealthEntity?.(),
1169
+ entityId: rollupEntityId,
1170
+ apply: async () => {
1171
+ // No durable insert — recompute the all-runs (rollup) view.
1172
+ rollupState = await service.getSystemHealthStatus(systemId);
1173
+ return toHealthEntityView(rollupState);
1174
+ },
1175
+ serialize: makeHealthSerializer(rollupEntityId),
1176
+ onError: (error) =>
1177
+ logger.warn(
1178
+ `Failed to mirror rollup health entity for ${systemId}`,
1179
+ error,
1180
+ ),
1181
+ });
1182
+
1183
+ // Record the ROLLUP transition (environmentId = null) so system-level
1184
+ // "in status since" reflects the aggregate, and notify on a real
1185
+ // rollup status change so existing system-level notifications fire.
1186
+ if (rollupState.status !== rollupPreviousStatus) {
1187
+ await recordStateTransition({
1188
+ db,
1189
+ systemId,
1190
+ configurationId: configId,
1191
+ environmentId: null,
1192
+ fromStatus: rollupPreviousStatus,
1193
+ toStatus: rollupState.status,
1194
+ });
1195
+
1196
+ await notifyStateChange({
1197
+ notificationClient,
1198
+ systemId,
1199
+ systemName,
1200
+ configurationId: configId,
1201
+ previousStatus: rollupPreviousStatus,
1202
+ newStatus: rollupState.status,
1203
+ service,
1204
+ catalogClient,
1205
+ maintenanceClient,
1206
+ incidentClient,
1207
+ logger,
1208
+ });
1209
+
1210
+ await signalService.broadcast(SYSTEM_STATUS_CHANGED, {
1211
+ systemId,
1212
+ previousStatus: rollupPreviousStatus as HealthCheckStatus,
1213
+ newStatus: rollupState.status,
1214
+ });
1215
+ }
1216
+ } catch (rollupError) {
1217
+ // The rollup is best-effort reactivity over already-durable runs; a
1218
+ // failure must not wedge the (completed) per-env runs.
1219
+ logger.error(
1220
+ `Failed to write system rollup health for ${systemId}`,
1221
+ rollupError,
1222
+ );
1223
+ }
1224
+ }
938
1225
 
939
1226
  // Note: No manual rescheduling needed - recurring job handles it automatically
940
1227
  } catch (error) {
@@ -943,15 +1230,17 @@ async function executeHealthCheckJob(props: {
943
1230
  error,
944
1231
  );
945
1232
 
946
- // Persist the failure run + aggregate THROUGH the reactive `health`
947
- // entity: `apply` does the durable write and returns the freshly-computed
948
- // view. The framework snapshots `prev` via the compute-on-read accessor
949
- // BEFORE this insert, so a real status change emits exactly one correct
950
- // `ENTITY_CHANGED` (§10.3). See the success path for the full rationale.
1233
+ // Catastrophic job-level failure (e.g. the config fetch / env resolution
1234
+ // threw before the fan-out loop). Persist a single env-less failure run
1235
+ // against the bare `<systemId>` entity — which IS the system rollup — so
1236
+ // the system-level health change still emits. Reuses the pre-tick
1237
+ // rollup status captured before the try block.
1238
+ const rollupEntityId = encodeHealthEntityId({ systemId });
1239
+ const previousStatus = rollupPreviousStatus;
951
1240
  let newState!: AggregatedHealth;
952
1241
  await writeHealthEntity({
953
1242
  handle: getHealthEntity?.(),
954
- systemId,
1243
+ entityId: rollupEntityId,
955
1244
  apply: async () => {
956
1245
  // Store failure (no latencyMs for failures)
957
1246
  await db.insert(healthCheckRuns).values({
@@ -979,10 +1268,10 @@ async function executeHealthCheckJob(props: {
979
1268
  newState = await service.getSystemHealthStatus(systemId);
980
1269
  return toHealthEntityView(newState);
981
1270
  },
982
- serialize: serializeHealthWrite,
1271
+ serialize: makeHealthSerializer(rollupEntityId),
983
1272
  onError: (mirrorError) =>
984
1273
  logger.warn(
985
- `Failed to mirror health entity for ${systemId}`,
1274
+ `Failed to mirror health entity for ${rollupEntityId}`,
986
1275
  mirrorError,
987
1276
  ),
988
1277
  });
@@ -1073,6 +1362,7 @@ async function executeHealthCheckJob(props: {
1073
1362
 
1074
1363
  export async function setupHealthCheckWorker(props: {
1075
1364
  db: Db;
1365
+ advisoryLock: AdvisoryLockService;
1076
1366
  registry: HealthCheckRegistry;
1077
1367
  collectorRegistry: CollectorRegistry;
1078
1368
  logger: Logger;
@@ -1089,6 +1379,7 @@ export async function setupHealthCheckWorker(props: {
1089
1379
  }): Promise<void> {
1090
1380
  const {
1091
1381
  db,
1382
+ advisoryLock,
1092
1383
  registry,
1093
1384
  collectorRegistry,
1094
1385
  logger,
@@ -1113,6 +1404,7 @@ export async function setupHealthCheckWorker(props: {
1113
1404
  await executeHealthCheckJob({
1114
1405
  payload: job.data,
1115
1406
  db,
1407
+ advisoryLock,
1116
1408
  registry,
1117
1409
  collectorRegistry,
1118
1410
  logger,
@@ -171,6 +171,36 @@ describe("incrementHourlyAggregate", () => {
171
171
  expect(inserted.maxLatencyMs).toBe(150);
172
172
  });
173
173
 
174
+ it("writes the environmentId into the aggregate (per-environment fan-out)", async () => {
175
+ await incrementHourlyAggregate({
176
+ db: mockDb as never,
177
+ systemId: "sys-1",
178
+ configurationId: "config-1",
179
+ environmentId: "prod",
180
+ status: "healthy",
181
+ latencyMs: 150,
182
+ runTimestamp: new Date("2024-01-15T10:35:00Z"),
183
+ });
184
+
185
+ const inserted = insertedValues[0] as Record<string, unknown>;
186
+ expect(inserted.environmentId).toBe("prod");
187
+ });
188
+
189
+ it("normalizes an env-less run to environmentId = null", async () => {
190
+ await incrementHourlyAggregate({
191
+ db: mockDb as never,
192
+ systemId: "sys-1",
193
+ configurationId: "config-1",
194
+ // environmentId omitted -> env-less run
195
+ status: "healthy",
196
+ latencyMs: 150,
197
+ runTimestamp: new Date("2024-01-15T10:35:00Z"),
198
+ });
199
+
200
+ const inserted = insertedValues[0] as Record<string, unknown>;
201
+ expect(inserted.environmentId).toBeNull();
202
+ });
203
+
174
204
  it("increments counts for unhealthy status", async () => {
175
205
  await incrementHourlyAggregate({
176
206
  db: mockDb as never,