@checkstack/healthcheck-backend 1.5.0 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. package/CHANGELOG.md +223 -0
  2. package/drizzle/0018_abnormal_preak.sql +10 -0
  3. package/drizzle/meta/0018_snapshot.json +600 -0
  4. package/drizzle/meta/_journal.json +7 -0
  5. package/package.json +26 -21
  6. package/src/ai/assertion-validation.test.ts +117 -0
  7. package/src/ai/assertion-validation.ts +147 -0
  8. package/src/ai/healthcheck-capabilities.test.ts +158 -0
  9. package/src/ai/healthcheck-capabilities.ts +217 -0
  10. package/src/ai/healthcheck-delete.test.ts +81 -0
  11. package/src/ai/healthcheck-delete.ts +81 -0
  12. package/src/ai/healthcheck-projection.test.ts +36 -0
  13. package/src/ai/healthcheck-propose.test.ts +268 -0
  14. package/src/ai/healthcheck-propose.ts +290 -0
  15. package/src/ai/healthcheck-script-tools.test.ts +93 -0
  16. package/src/ai/healthcheck-script-tools.ts +179 -0
  17. package/src/ai/healthcheck-update.test.ts +123 -0
  18. package/src/ai/healthcheck-update.ts +123 -0
  19. package/src/ai/notify-subscribers.test.ts +109 -0
  20. package/src/ai/notify-subscribers.ts +176 -0
  21. package/src/ai/register-ai-tools.test.ts +41 -0
  22. package/src/ai/register-ai-tools.ts +53 -0
  23. package/src/ai/shell-env-table.test.ts +47 -0
  24. package/src/automations.test.ts +2 -1
  25. package/src/automations.ts +9 -1
  26. package/src/collector-script-test.test.ts +53 -1
  27. package/src/collector-script-test.ts +59 -7
  28. package/src/effective-environments.test.ts +93 -0
  29. package/src/effective-environments.ts +64 -0
  30. package/src/health-entity-id.ts +57 -0
  31. package/src/health-entity.test.ts +384 -6
  32. package/src/health-entity.ts +93 -35
  33. package/src/health-state.ts +41 -4
  34. package/src/healthcheck-gitops-kinds.test.ts +95 -0
  35. package/src/healthcheck-gitops-kinds.ts +56 -13
  36. package/src/index.ts +30 -0
  37. package/src/migration-chain-contract.test.ts +57 -0
  38. package/src/queue-executor.test.ts +801 -0
  39. package/src/queue-executor.ts +336 -52
  40. package/src/realtime-aggregation.test.ts +30 -0
  41. package/src/realtime-aggregation.ts +16 -0
  42. package/src/retention-job.ts +167 -93
  43. package/src/retention-rollup.test.ts +118 -0
  44. package/src/router.test.ts +120 -1
  45. package/src/router.ts +20 -0
  46. package/src/schema.ts +44 -6
  47. package/src/service.ts +199 -43
  48. package/src/state-transitions.test.ts +104 -0
  49. package/src/state-transitions.ts +39 -1
  50. package/src/validate-configuration.test.ts +205 -0
  51. package/src/validate-configuration.ts +159 -0
  52. package/tsconfig.json +9 -0
@@ -10,6 +10,7 @@ import {
10
10
  type TransportClient,
11
11
  type CollectorRunContext,
12
12
  type AdvisoryLockService,
13
+ renderTemplatableConfig,
13
14
  } from "@checkstack/backend-api";
14
15
  import { QueueManager } from "@checkstack/queue-api";
15
16
  import {
@@ -23,6 +24,7 @@ import { type SignalService } from "@checkstack/signal-common";
23
24
  import {
24
25
  HEALTH_CHECK_RUN_COMPLETED,
25
26
  SYSTEM_STATUS_CHANGED,
27
+ ENVIRONMENT_RESOLUTION_FAILED,
26
28
  type HealthCheckStatus,
27
29
  stripEphemeralFields,
28
30
  } from "@checkstack/healthcheck-common";
@@ -30,7 +32,12 @@ import {
30
32
  CatalogApi,
31
33
  catalogRoutes,
32
34
  createSystemSubject,
35
+ type Environment,
33
36
  } from "@checkstack/catalog-common";
37
+ import {
38
+ resolveEffectiveEnvironments,
39
+ type EffectiveEnvironment,
40
+ } from "./effective-environments";
34
41
  import { systemHealthCollapseKey } from "@checkstack/healthcheck-common";
35
42
  import { MaintenanceApi } from "@checkstack/maintenance-common";
36
43
  import { IncidentApi } from "@checkstack/incident-common";
@@ -53,6 +60,7 @@ import {
53
60
  createHealthEntitySerializer,
54
61
  type HealthEntityState,
55
62
  } from "./health-entity";
63
+ import { encodeHealthEntityId } from "./health-entity-id";
56
64
  import type { EntityHandle } from "@checkstack/automation-backend";
57
65
 
58
66
  type Db = SafeDatabase<typeof schema>;
@@ -425,19 +433,22 @@ async function executeHealthCheckJob(props: {
425
433
  // Create service for aggregated state evaluation
426
434
  const service = new HealthCheckService(db, registry, collectorRegistry);
427
435
 
428
- // Per-system serializer for the reactive health mutate (§10.3): a
429
- // transaction-scoped advisory lock keyed `health:<systemId>` wraps the
430
- // snapshot-prev + apply + diff + emit so concurrent evaluations of one
431
- // system (multiple per-config jobs across pods, or at-least-once
432
- // redelivery) can't double-emit a single logical transition. Bound to this
433
- // job's systemId below at every `writeHealthEntity` call.
434
- const serializeHealthWrite = createHealthEntitySerializer({ advisoryLock })(
435
- systemId,
436
- );
437
-
438
- // Capture aggregated state BEFORE this run for comparison
439
- const previousState = await service.getSystemHealthStatus(systemId);
440
- const previousStatus = previousState.status;
436
+ // Per-ENTITY serializer factory for the reactive health mutate (§10.3,
437
+ // Phase 3b): a transaction-scoped advisory lock keyed `health:<entityId>`
438
+ // wraps the snapshot-prev + apply + diff + emit so concurrent evaluations
439
+ // of one (system, environment) — or of the system rollup — can't double-emit
440
+ // a single logical transition. Bound to the qualified entity id at each
441
+ // `writeHealthEntity` call so distinct envs / the rollup don't block each
442
+ // other.
443
+ const makeHealthSerializer = createHealthEntitySerializer({ advisoryLock });
444
+
445
+ // The system-rollup status BEFORE this tick (all environments + env-less).
446
+ // Captured once so the post-loop rollup write (§7.4.3) — and the
447
+ // catastrophic-failure path — can record a correct prev → next rollup
448
+ // transition (environmentId = null). This is the system-wide aggregate read
449
+ // the executor has always taken first.
450
+ const rollupPreviousState = await service.getSystemHealthStatus(systemId);
451
+ const rollupPreviousStatus = rollupPreviousState.status;
441
452
 
442
453
  try {
443
454
  // Fetch configuration (including name for signals)
@@ -453,6 +464,7 @@ async function executeHealthCheckJob(props: {
453
464
  paused: healthCheckConfigurations.paused,
454
465
  includeLocal: systemHealthChecks.includeLocal,
455
466
  satelliteIds: systemHealthChecks.satelliteIds,
467
+ environmentIds: systemHealthChecks.environmentIds,
456
468
  })
457
469
  .from(systemHealthChecks)
458
470
  .innerJoin(
@@ -508,17 +520,6 @@ async function executeHealthCheckJob(props: {
508
520
  logger.debug(`Could not fetch system name for ${systemId}, using ID`);
509
521
  }
510
522
 
511
- // Curated, read-only run-context metadata exposed to collectors.
512
- // Metadata only - never secrets or config.
513
- const runContext: CollectorRunContext = {
514
- check: {
515
- id: configId,
516
- name: configRow.configName || configId,
517
- intervalSeconds: configRow.interval,
518
- },
519
- system: { id: systemId, name: systemName },
520
- };
521
-
522
523
  const strategy = registry.getStrategy(configRow.strategyId);
523
524
  if (!strategy) {
524
525
  logger.warn(
@@ -527,10 +528,158 @@ async function executeHealthCheckJob(props: {
527
528
  return;
528
529
  }
529
530
 
530
- // Extract timeout from strategy config for platform-level enforcement
531
- const strategyConfig = configRow.config as unknown as BaseStrategyConfig;
531
+ // Migrate the stored (UNVERSIONED) strategy config ONCE, before the
532
+ // per-environment render loop, so every env renders from the same
533
+ // migrated shape. Stored configs predate explicit versioning and may be
534
+ // genuinely v1 (e.g. an HTTP config still carrying url/method); assume-v1
535
+ // -on-read runs the declared migration chain, then validates. The
536
+ // migrations are idempotent, so an already-current config is a no-op.
537
+ const strategyConfig: BaseStrategyConfig =
538
+ await strategy.config.parseAssumingV1(configRow.config);
532
539
  const executionTimeout = strategyConfig.timeout ?? 60_000;
533
540
 
541
+ // ── Per-environment fan-out (§7) ────────────────────────────────────────
542
+ // Resolve the effective environment set from the assignment + the
543
+ // system's current catalog membership, then run ONCE PER environment.
544
+ // An empty effective set (opt-out `[]`, or `null` with no membership)
545
+ // collapses to a single env-less run with `environment` unset — exactly
546
+ // the pre-feature behavior. Membership lives ONLY in the catalog Postgres
547
+ // tables and is re-read every tick via the cross-plugin RPC, so every pod
548
+ // resolves the same set (state-and-scale: no pod-local env state).
549
+ let membership: Environment[] = [];
550
+ try {
551
+ membership = await catalogClient.resolveSystemEnvironments({ systemId });
552
+ } catch (error) {
553
+ // Fail-open: a catalog read failure must not wedge the check. Degrade
554
+ // to an env-less run (today's behavior) rather than skipping the tick.
555
+ logger.warn(
556
+ `Could not resolve environments for system ${systemId}, running env-less`,
557
+ error,
558
+ );
559
+ // Observability: a `logger.warn` alone is easy to miss when a durable
560
+ // catalog misconfig (or outage) silently strips per-environment fan-out.
561
+ // Broadcast a counter-style signal so the degradation is observable.
562
+ // Best-effort — never let the signal break the (still-running) check.
563
+ try {
564
+ await signalService.broadcast(ENVIRONMENT_RESOLUTION_FAILED, {
565
+ systemId,
566
+ configurationId: configId,
567
+ error: extractErrorMessage(error),
568
+ });
569
+ } catch (signalError) {
570
+ logger.warn(
571
+ `Failed to broadcast environment-resolution-failed signal for ${systemId}`,
572
+ signalError,
573
+ );
574
+ }
575
+ }
576
+ const effectiveEnvs = resolveEffectiveEnvironments({
577
+ environmentIds: configRow.environmentIds,
578
+ membership,
579
+ });
580
+ // `null` env => the single env-less run. Each entry => one run per env.
581
+ const runEnvironments: (EffectiveEnvironment | null)[] =
582
+ effectiveEnvs.length > 0 ? effectiveEnvs : [null];
583
+
584
+ // Execute one run per effective environment. Runs are independent (own
585
+ // status / latency / result) and persisted with their own
586
+ // `environmentId`. Phase 3b: each env-run mutates its OWN env-qualified
587
+ // `health` entity (`<systemId>::<environmentId>`, or the bare `<systemId>`
588
+ // for the env-less run) through a per-entity serializer; after the loop a
589
+ // single ROLLUP write for the bare `<systemId>` recomputes the worst-status
590
+ // rollup so system-level consumers keep firing off the unchanged id.
591
+ //
592
+ // Track whether ANY per-env run persisted (so the rollup write only runs
593
+ // when there is something to roll up — an all-failed loop still leaves the
594
+ // durable runs the per-env apply already wrote).
595
+ let anyEnvRunPersisted = false;
596
+ // Whether this tick fans out into REAL environments (vs. the single
597
+ // env-less run). When env-less, the loop's lone write already targets the
598
+ // bare `<systemId>` entity — which IS the rollup — so no separate rollup
599
+ // write is needed. With real envs, the loop writes `<systemId>::<env>`
600
+ // entities and we recompute the bare-`<systemId>` rollup after the loop.
601
+ const isFannedOut = effectiveEnvs.length > 0;
602
+ for (const environment of runEnvironments) {
603
+ const environmentId = environment?.id ?? null;
604
+ // The env-qualified entity id this run mutates. For the env-less run
605
+ // (environmentId === null) this is the bare systemId — which is also the
606
+ // rollup id, so the env-less run IS the rollup (no separate rollup write
607
+ // is needed when the system has no environments — see below).
608
+ const envEntityId = encodeHealthEntityId({ systemId, environmentId });
609
+ const serializeEnvWrite = makeHealthSerializer(envEntityId);
610
+
611
+ // Per-env baseline status for the transition log: the env-scoped
612
+ // aggregate BEFORE this run. Computed per env so a transition row is
613
+ // recorded against the right (system, environment) streak.
614
+ const previousState = await service.getSystemHealthStatus(
615
+ systemId,
616
+ environmentId,
617
+ );
618
+ const previousStatus = previousState.status;
619
+
620
+ // Curated, read-only run-context metadata exposed to collectors.
621
+ // Metadata only - never secrets or config. `environment` carries the
622
+ // resolved env's verbatim custom fields for this run (Phase 2 surfaces
623
+ // consume it); absent for the env-less run.
624
+ const runContext: CollectorRunContext = {
625
+ check: {
626
+ id: configId,
627
+ name: configRow.configName || configId,
628
+ intervalSeconds: configRow.interval,
629
+ },
630
+ system: { id: systemId, name: systemName },
631
+ ...(environment
632
+ ? {
633
+ environment: {
634
+ id: environment.id,
635
+ name: environment.name,
636
+ fields: environment.fields,
637
+ },
638
+ }
639
+ : {}),
640
+ };
641
+
642
+ // Templating context for the per-env config render pass (§6.3.3).
643
+ // Carries only environment custom fields + curated check/system
644
+ // metadata - never secrets. `{{ environment.baseUrl }}` resolves from
645
+ // the resolved env's verbatim fields; an env-less run gets `{}` so a
646
+ // reference renders to empty string (strict: false); see the debug log
647
+ // below.
648
+ const templateContext = {
649
+ environment: runContext.environment?.fields ?? {},
650
+ check: runContext.check,
651
+ system: runContext.system,
652
+ };
653
+ if (!runContext.environment) {
654
+ // §11.6: render-empty when a run has no environment. An env-less run is
655
+ // a legitimate, documented configuration (the None assignment mode, or
656
+ // All-environments with no membership), and it recurs every interval -
657
+ // so this is `debug`, not `warn`, to avoid spamming the log. When an
658
+ // empty `{{ environment.* }}` render actually matters, the HTTP
659
+ // post-render `.url()` check already fails the run with a concrete
660
+ // "Rendered URL is invalid" error; we do not inspect every field here.
661
+ logger.debug(
662
+ `Health check ${configId} for system ${systemId} ran with no environment; ` +
663
+ `any {{ environment.* }} references render to empty string`,
664
+ );
665
+ }
666
+
667
+ // (2) Environment/templating pass (NEW) - renders `{{ environment.* }}`
668
+ // etc. in `x-templatable` fields. Runs PER ENVIRONMENT, AFTER the secret
669
+ // resolution (secrets first, templating second - §6.3.4) and BEFORE the
670
+ // strategy client build, so each env gets its own rendered strategy
671
+ // config + client. The collector configs are rendered just before each
672
+ // collector executes (below) so the secretEnv resolution stays first.
673
+ const renderedStrategyConfig = renderTemplatableConfig({
674
+ config: strategyConfig,
675
+ schema: strategy.config.schema,
676
+ context: templateContext,
677
+ }) as BaseStrategyConfig;
678
+
679
+ // Per-environment isolation: an unexpected failure persisting ONE
680
+ // environment's run must not abort the sibling environments' runs.
681
+ // Each iteration's run is independent (§7.2), so we log and continue.
682
+ try {
534
683
  // Execute health check using createClient pattern with unified hard timeout
535
684
  const start = performance.now();
536
685
  let connectionTimeMs: number | undefined;
@@ -546,8 +695,11 @@ async function executeHealthCheckJob(props: {
546
695
  // Platform-level hard timeout wrapping the entire execution sequence
547
696
  await Promise.race([
548
697
  (async () => {
549
- // 1. Establish connection
550
- connectedClient = await strategy.createClient(strategyConfig);
698
+ // 1. Establish connection. The strategy client build moves INSIDE
699
+ // the per-env loop (§6.3.3): each env gets its own rendered config +
700
+ // client, so a single job no longer bakes in one env's rendered
701
+ // strategy config.
702
+ connectedClient = await strategy.createClient(renderedStrategyConfig);
551
703
  connectionTimeMs = Math.round(performance.now() - start);
552
704
 
553
705
  // 2. Execute collectors in parallel
@@ -584,8 +736,31 @@ async function executeHealthCheckJob(props: {
584
736
  secretEnv = resolved.env;
585
737
  }
586
738
 
739
+ // Migrate the stored (UNVERSIONED) collector config via
740
+ // assume-v1-on-read: runs the declared migration chain, then
741
+ // validates. Migrations are idempotent, so an already-current
742
+ // config is a no-op. This runs BEFORE templating so the render
743
+ // pass sees the migrated shape; the secretEnv resolution above
744
+ // reads the raw `secretEnv` mapping (a constant string field
745
+ // unaffected by the strategy/collector reshapes), keeping the
746
+ // migrate -> secret resolve -> render -> execute order intact.
747
+ const migratedCollectorConfig =
748
+ await registered.collector.config.parseAssumingV1(
749
+ collectorEntry.config,
750
+ );
751
+
752
+ // (2) Environment/templating pass for the collector config -
753
+ // runs AFTER the secretEnv resolution above (secrets first,
754
+ // templating second) and renders `{{ environment.* }}` in this
755
+ // collector's `x-templatable` fields against the per-env context.
756
+ const renderedCollectorConfig = renderTemplatableConfig({
757
+ config: migratedCollectorConfig,
758
+ schema: registered.collector.config.schema,
759
+ context: templateContext,
760
+ });
761
+
587
762
  const collectorResult = await registered.collector.execute({
588
- config: collectorEntry.config,
763
+ config: renderedCollectorConfig,
589
764
  client: connectedClient!.client,
590
765
  pluginId: configRow.strategyId,
591
766
  runContext,
@@ -728,11 +903,12 @@ async function executeHealthCheckJob(props: {
728
903
  let newState!: AggregatedHealth;
729
904
  await writeHealthEntity({
730
905
  handle: getHealthEntity?.(),
731
- systemId,
906
+ entityId: envEntityId,
732
907
  apply: async () => {
733
908
  await db.insert(healthCheckRuns).values({
734
909
  configurationId: configId,
735
910
  systemId,
911
+ environmentId,
736
912
  status: result.status,
737
913
  latencyMs: result.latencyMs,
738
914
  result: { ...result } as Record<string, unknown>,
@@ -744,6 +920,7 @@ async function executeHealthCheckJob(props: {
744
920
  db,
745
921
  systemId,
746
922
  configurationId: configId,
923
+ environmentId,
747
924
  status: result.status,
748
925
  latencyMs: result.latencyMs,
749
926
  runTimestamp: new Date(),
@@ -752,13 +929,18 @@ async function executeHealthCheckJob(props: {
752
929
  sourceLabel: "Local",
753
930
  });
754
931
 
755
- newState = await service.getSystemHealthStatus(systemId);
932
+ // Env-scoped view: the per-env entity reflects only this env's runs.
933
+ newState = await service.getSystemHealthStatus(systemId, environmentId);
756
934
  return toHealthEntityView(newState);
757
935
  },
758
- serialize: serializeHealthWrite,
936
+ serialize: serializeEnvWrite,
759
937
  onError: (error) =>
760
- logger.warn(`Failed to mirror health entity for ${systemId}`, error),
938
+ logger.warn(
939
+ `Failed to mirror health entity for ${envEntityId}`,
940
+ error,
941
+ ),
761
942
  });
943
+ anyEnvRunPersisted = true;
762
944
 
763
945
  logger.debug(
764
946
  `Health check ${configId} for system ${systemId} failed: ${finalError}`,
@@ -784,6 +966,7 @@ async function executeHealthCheckJob(props: {
784
966
  db,
785
967
  systemId,
786
968
  configurationId: configId,
969
+ environmentId,
787
970
  fromStatus: previousStatus,
788
971
  toStatus: newState.status,
789
972
  });
@@ -803,7 +986,9 @@ async function executeHealthCheckJob(props: {
803
986
  });
804
987
  }
805
988
 
806
- return;
989
+ // This environment's run is done (failed). Continue to the next
990
+ // effective environment rather than ending the whole job.
991
+ continue;
807
992
  } finally {
808
993
  if (connectedClient) {
809
994
  try {
@@ -841,12 +1026,13 @@ async function executeHealthCheckJob(props: {
841
1026
  let newState!: AggregatedHealth;
842
1027
  await writeHealthEntity({
843
1028
  handle: getHealthEntity?.(),
844
- systemId,
1029
+ entityId: envEntityId,
845
1030
  apply: async () => {
846
1031
  // Store result (spread to convert structured type to plain record for jsonb)
847
1032
  await db.insert(healthCheckRuns).values({
848
1033
  configurationId: configId,
849
1034
  systemId,
1035
+ environmentId,
850
1036
  status: result.status,
851
1037
  latencyMs: result.latencyMs,
852
1038
  result: { ...result } as Record<string, unknown>,
@@ -859,6 +1045,7 @@ async function executeHealthCheckJob(props: {
859
1045
  db,
860
1046
  systemId,
861
1047
  configurationId: configId,
1048
+ environmentId,
862
1049
  status: result.status,
863
1050
  latencyMs: result.latencyMs,
864
1051
  runTimestamp: new Date(),
@@ -867,13 +1054,15 @@ async function executeHealthCheckJob(props: {
867
1054
  sourceLabel: "Local",
868
1055
  });
869
1056
 
870
- newState = await service.getSystemHealthStatus(systemId);
1057
+ // Env-scoped view: the per-env entity reflects only this env's runs.
1058
+ newState = await service.getSystemHealthStatus(systemId, environmentId);
871
1059
  return toHealthEntityView(newState);
872
1060
  },
873
- serialize: serializeHealthWrite,
1061
+ serialize: serializeEnvWrite,
874
1062
  onError: (error) =>
875
- logger.warn(`Failed to mirror health entity for ${systemId}`, error),
1063
+ logger.warn(`Failed to mirror health entity for ${envEntityId}`, error),
876
1064
  });
1065
+ anyEnvRunPersisted = true;
877
1066
 
878
1067
  logger.debug(
879
1068
  `Ran health check ${configId} for system ${systemId}: ${result.status}`,
@@ -909,6 +1098,7 @@ async function executeHealthCheckJob(props: {
909
1098
  db,
910
1099
  systemId,
911
1100
  configurationId: configId,
1101
+ environmentId,
912
1102
  fromStatus: previousStatus,
913
1103
  toStatus: newState.status,
914
1104
  });
@@ -927,12 +1117,19 @@ async function executeHealthCheckJob(props: {
927
1117
  logger,
928
1118
  });
929
1119
 
930
- // Broadcast system-level status change signal for frontend reactivity
931
- await signalService.broadcast(SYSTEM_STATUS_CHANGED, {
932
- systemId,
933
- previousStatus: previousStatus as HealthCheckStatus,
934
- newStatus: newState.status,
935
- });
1120
+ // The system-level `SYSTEM_STATUS_CHANGED` signal must carry the ROLLUP
1121
+ // status, not a per-env status. When fanned out, the post-loop rollup
1122
+ // write broadcasts it once with the worst-status rollup; emitting it here
1123
+ // per env would send up to N system-level signals/tick carrying per-env
1124
+ // status. Only the env-less run (which IS the rollup — `!isFannedOut`)
1125
+ // broadcasts the system-level signal from inside the loop.
1126
+ if (!isFannedOut) {
1127
+ await signalService.broadcast(SYSTEM_STATUS_CHANGED, {
1128
+ systemId,
1129
+ previousStatus: previousStatus as HealthCheckStatus,
1130
+ newStatus: newState.status,
1131
+ });
1132
+ }
936
1133
 
937
1134
  // The directional + umbrella system-health hooks were removed in
938
1135
  // Phase 4 (§10.3): the `health` entity mirror above is the single
@@ -940,6 +1137,91 @@ async function executeHealthCheckJob(props: {
940
1137
  // `healthcheck.system_degraded` / `_healthy` / `_health_changed`
941
1138
  // trigger events through Stage-1 routing. Nothing to emit here.
942
1139
  }
1140
+ } catch (envError) {
1141
+ // Isolate this environment's failure; continue with the next env.
1142
+ logger.error(
1143
+ `Failed to run health check ${configId} for system ${systemId}` +
1144
+ (environmentId ? ` (environment ${environmentId})` : " (env-less)"),
1145
+ envError,
1146
+ );
1147
+ }
1148
+ } // end per-environment fan-out loop (for ... of runEnvironments)
1149
+
1150
+ // ── System rollup write (§7.4.3) ───────────────────────────────────────
1151
+ // With real environments, the per-env writes mutated `<systemId>::<env>`
1152
+ // entities; the bare `<systemId>` ROLLUP entity (the worst-status view
1153
+ // every existing system-level consumer references) must now recompute so
1154
+ // it diffs/emits its OWN `ENTITY_CHANGED`. The rollup `apply` does NO new
1155
+ // durable insert (the runs are already persisted by the per-env writes) —
1156
+ // it just recomputes + returns the all-runs rollup view so the framework
1157
+ // diffs prev → next. Keyed on the bare `health:<systemId>` lock so it
1158
+ // serializes against itself, independent of the per-env locks.
1159
+ //
1160
+ // Skipped when env-less (the loop's lone write already targeted the bare
1161
+ // `<systemId>` entity = the rollup) or when nothing persisted (a fully
1162
+ // isolated-failure loop left no new runs to roll up).
1163
+ if (isFannedOut && anyEnvRunPersisted) {
1164
+ const rollupEntityId = encodeHealthEntityId({ systemId });
1165
+ let rollupState!: AggregatedHealth;
1166
+ try {
1167
+ await writeHealthEntity({
1168
+ handle: getHealthEntity?.(),
1169
+ entityId: rollupEntityId,
1170
+ apply: async () => {
1171
+ // No durable insert — recompute the all-runs (rollup) view.
1172
+ rollupState = await service.getSystemHealthStatus(systemId);
1173
+ return toHealthEntityView(rollupState);
1174
+ },
1175
+ serialize: makeHealthSerializer(rollupEntityId),
1176
+ onError: (error) =>
1177
+ logger.warn(
1178
+ `Failed to mirror rollup health entity for ${systemId}`,
1179
+ error,
1180
+ ),
1181
+ });
1182
+
1183
+ // Record the ROLLUP transition (environmentId = null) so system-level
1184
+ // "in status since" reflects the aggregate, and notify on a real
1185
+ // rollup status change so existing system-level notifications fire.
1186
+ if (rollupState.status !== rollupPreviousStatus) {
1187
+ await recordStateTransition({
1188
+ db,
1189
+ systemId,
1190
+ configurationId: configId,
1191
+ environmentId: null,
1192
+ fromStatus: rollupPreviousStatus,
1193
+ toStatus: rollupState.status,
1194
+ });
1195
+
1196
+ await notifyStateChange({
1197
+ notificationClient,
1198
+ systemId,
1199
+ systemName,
1200
+ configurationId: configId,
1201
+ previousStatus: rollupPreviousStatus,
1202
+ newStatus: rollupState.status,
1203
+ service,
1204
+ catalogClient,
1205
+ maintenanceClient,
1206
+ incidentClient,
1207
+ logger,
1208
+ });
1209
+
1210
+ await signalService.broadcast(SYSTEM_STATUS_CHANGED, {
1211
+ systemId,
1212
+ previousStatus: rollupPreviousStatus as HealthCheckStatus,
1213
+ newStatus: rollupState.status,
1214
+ });
1215
+ }
1216
+ } catch (rollupError) {
1217
+ // The rollup is best-effort reactivity over already-durable runs; a
1218
+ // failure must not wedge the (completed) per-env runs.
1219
+ logger.error(
1220
+ `Failed to write system rollup health for ${systemId}`,
1221
+ rollupError,
1222
+ );
1223
+ }
1224
+ }
943
1225
 
944
1226
  // Note: No manual rescheduling needed - recurring job handles it automatically
945
1227
  } catch (error) {
@@ -948,15 +1230,17 @@ async function executeHealthCheckJob(props: {
948
1230
  error,
949
1231
  );
950
1232
 
951
- // Persist the failure run + aggregate THROUGH the reactive `health`
952
- // entity: `apply` does the durable write and returns the freshly-computed
953
- // view. The framework snapshots `prev` via the compute-on-read accessor
954
- // BEFORE this insert, so a real status change emits exactly one correct
955
- // `ENTITY_CHANGED` (§10.3). See the success path for the full rationale.
1233
+ // Catastrophic job-level failure (e.g. the config fetch / env resolution
1234
+ // threw before the fan-out loop). Persist a single env-less failure run
1235
+ // against the bare `<systemId>` entity — which IS the system rollup — so
1236
+ // the system-level health change still emits. Reuses the pre-tick
1237
+ // rollup status captured before the try block.
1238
+ const rollupEntityId = encodeHealthEntityId({ systemId });
1239
+ const previousStatus = rollupPreviousStatus;
956
1240
  let newState!: AggregatedHealth;
957
1241
  await writeHealthEntity({
958
1242
  handle: getHealthEntity?.(),
959
- systemId,
1243
+ entityId: rollupEntityId,
960
1244
  apply: async () => {
961
1245
  // Store failure (no latencyMs for failures)
962
1246
  await db.insert(healthCheckRuns).values({
@@ -984,10 +1268,10 @@ async function executeHealthCheckJob(props: {
984
1268
  newState = await service.getSystemHealthStatus(systemId);
985
1269
  return toHealthEntityView(newState);
986
1270
  },
987
- serialize: serializeHealthWrite,
1271
+ serialize: makeHealthSerializer(rollupEntityId),
988
1272
  onError: (mirrorError) =>
989
1273
  logger.warn(
990
- `Failed to mirror health entity for ${systemId}`,
1274
+ `Failed to mirror health entity for ${rollupEntityId}`,
991
1275
  mirrorError,
992
1276
  ),
993
1277
  });
@@ -171,6 +171,36 @@ describe("incrementHourlyAggregate", () => {
171
171
  expect(inserted.maxLatencyMs).toBe(150);
172
172
  });
173
173
 
174
+ it("writes the environmentId into the aggregate (per-environment fan-out)", async () => {
175
+ await incrementHourlyAggregate({
176
+ db: mockDb as never,
177
+ systemId: "sys-1",
178
+ configurationId: "config-1",
179
+ environmentId: "prod",
180
+ status: "healthy",
181
+ latencyMs: 150,
182
+ runTimestamp: new Date("2024-01-15T10:35:00Z"),
183
+ });
184
+
185
+ const inserted = insertedValues[0] as Record<string, unknown>;
186
+ expect(inserted.environmentId).toBe("prod");
187
+ });
188
+
189
+ it("normalizes an env-less run to environmentId = null", async () => {
190
+ await incrementHourlyAggregate({
191
+ db: mockDb as never,
192
+ systemId: "sys-1",
193
+ configurationId: "config-1",
194
+ // environmentId omitted -> env-less run
195
+ status: "healthy",
196
+ latencyMs: 150,
197
+ runTimestamp: new Date("2024-01-15T10:35:00Z"),
198
+ });
199
+
200
+ const inserted = insertedValues[0] as Record<string, unknown>;
201
+ expect(inserted.environmentId).toBeNull();
202
+ });
203
+
174
204
  it("increments counts for unhealthy status", async () => {
175
205
  await incrementHourlyAggregate({
176
206
  db: mockDb as never,
@@ -60,6 +60,12 @@ interface IncrementHourlyAggregateParams {
60
60
  db: Db;
61
61
  systemId: string;
62
62
  configurationId: string;
63
+ /**
64
+ * Environment this run was for (per-environment fan-out). null/undefined =
65
+ * env-less run. Part of the aggregate uniqueness key so per-environment
66
+ * buckets stay separate.
67
+ */
68
+ environmentId?: string | null;
63
69
  status: "healthy" | "unhealthy" | "degraded";
64
70
  latencyMs: number | undefined;
65
71
  runTimestamp: Date;
@@ -87,6 +93,7 @@ export async function incrementHourlyAggregate(
87
93
  db,
88
94
  systemId,
89
95
  configurationId,
96
+ environmentId,
90
97
  status,
91
98
  latencyMs,
92
99
  runTimestamp,
@@ -96,6 +103,10 @@ export async function incrementHourlyAggregate(
96
103
  sourceLabel,
97
104
  } = params;
98
105
 
106
+ // Normalize undefined -> null so the env-less slice is one stable key
107
+ // (NULLS NOT DISTINCT matches on it via the IS NULL predicate below).
108
+ const envId = environmentId ?? null;
109
+
99
110
  const bucketStart = getHourBucketStart(runTimestamp);
100
111
 
101
112
  // First, try to fetch existing aggregate to merge t-digest state and collector data
@@ -111,6 +122,9 @@ export async function incrementHourlyAggregate(
111
122
  and(
112
123
  eq(healthCheckAggregates.systemId, systemId),
113
124
  eq(healthCheckAggregates.configurationId, configurationId),
125
+ envId
126
+ ? eq(healthCheckAggregates.environmentId, envId)
127
+ : sql`${healthCheckAggregates.environmentId} IS NULL`,
114
128
  eq(healthCheckAggregates.bucketStart, bucketStart),
115
129
  eq(healthCheckAggregates.bucketSize, "hourly"),
116
130
  sourceId
@@ -177,6 +191,7 @@ export async function incrementHourlyAggregate(
177
191
  .values({
178
192
  configurationId,
179
193
  systemId,
194
+ environmentId: envId,
180
195
  bucketStart,
181
196
  bucketSize: "hourly",
182
197
  runCount: 1,
@@ -197,6 +212,7 @@ export async function incrementHourlyAggregate(
197
212
  target: [
198
213
  healthCheckAggregates.configurationId,
199
214
  healthCheckAggregates.systemId,
215
+ healthCheckAggregates.environmentId,
200
216
  healthCheckAggregates.bucketStart,
201
217
  healthCheckAggregates.bucketSize,
202
218
  healthCheckAggregates.sourceId,