@checkstack/healthcheck-backend 1.5.0 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. package/CHANGELOG.md +223 -0
  2. package/drizzle/0018_abnormal_preak.sql +10 -0
  3. package/drizzle/meta/0018_snapshot.json +600 -0
  4. package/drizzle/meta/_journal.json +7 -0
  5. package/package.json +26 -21
  6. package/src/ai/assertion-validation.test.ts +117 -0
  7. package/src/ai/assertion-validation.ts +147 -0
  8. package/src/ai/healthcheck-capabilities.test.ts +158 -0
  9. package/src/ai/healthcheck-capabilities.ts +217 -0
  10. package/src/ai/healthcheck-delete.test.ts +81 -0
  11. package/src/ai/healthcheck-delete.ts +81 -0
  12. package/src/ai/healthcheck-projection.test.ts +36 -0
  13. package/src/ai/healthcheck-propose.test.ts +268 -0
  14. package/src/ai/healthcheck-propose.ts +290 -0
  15. package/src/ai/healthcheck-script-tools.test.ts +93 -0
  16. package/src/ai/healthcheck-script-tools.ts +179 -0
  17. package/src/ai/healthcheck-update.test.ts +123 -0
  18. package/src/ai/healthcheck-update.ts +123 -0
  19. package/src/ai/notify-subscribers.test.ts +109 -0
  20. package/src/ai/notify-subscribers.ts +176 -0
  21. package/src/ai/register-ai-tools.test.ts +41 -0
  22. package/src/ai/register-ai-tools.ts +53 -0
  23. package/src/ai/shell-env-table.test.ts +47 -0
  24. package/src/automations.test.ts +2 -1
  25. package/src/automations.ts +9 -1
  26. package/src/collector-script-test.test.ts +53 -1
  27. package/src/collector-script-test.ts +59 -7
  28. package/src/effective-environments.test.ts +93 -0
  29. package/src/effective-environments.ts +64 -0
  30. package/src/health-entity-id.ts +57 -0
  31. package/src/health-entity.test.ts +384 -6
  32. package/src/health-entity.ts +93 -35
  33. package/src/health-state.ts +41 -4
  34. package/src/healthcheck-gitops-kinds.test.ts +95 -0
  35. package/src/healthcheck-gitops-kinds.ts +56 -13
  36. package/src/index.ts +30 -0
  37. package/src/migration-chain-contract.test.ts +57 -0
  38. package/src/queue-executor.test.ts +801 -0
  39. package/src/queue-executor.ts +336 -52
  40. package/src/realtime-aggregation.test.ts +30 -0
  41. package/src/realtime-aggregation.ts +16 -0
  42. package/src/retention-job.ts +167 -93
  43. package/src/retention-rollup.test.ts +118 -0
  44. package/src/router.test.ts +120 -1
  45. package/src/router.ts +20 -0
  46. package/src/schema.ts +44 -6
  47. package/src/service.ts +199 -43
  48. package/src/state-transitions.test.ts +104 -0
  49. package/src/state-transitions.ts +39 -1
  50. package/src/validate-configuration.test.ts +205 -0
  51. package/src/validate-configuration.ts +159 -0
  52. package/tsconfig.json +9 -0
@@ -20,6 +20,10 @@ import {
20
20
  type HealthEntityState,
21
21
  } from "./health-entity";
22
22
  import type { HealthCheckService } from "./service";
23
+ import {
24
+ encodeHealthEntityId,
25
+ parseHealthEntityId,
26
+ } from "./health-entity-id";
23
27
  import {
24
28
  systemDegradedTrigger,
25
29
  systemHealthyTrigger,
@@ -162,6 +166,7 @@ describe("classifyHealthChange (cross-plugin consumer predicate)", () => {
162
166
  const c = classifyHealthChange(change());
163
167
  expect(c).toEqual({
164
168
  systemId: "sys-1",
169
+ environmentId: null,
165
170
  previousStatus: "healthy",
166
171
  newStatus: "unhealthy",
167
172
  degraded: true,
@@ -370,7 +375,7 @@ describe("writeHealthEntity (durable write driven through handle.mutate)", () =>
370
375
 
371
376
  const next = await writeHealthEntity({
372
377
  handle,
373
- systemId: "sys-1",
378
+ entityId: "sys-1",
374
379
  apply: async () => {
375
380
  persisted = { status: "unhealthy", healthyChecks: 0, totalChecks: 2 };
376
381
  return persisted;
@@ -392,7 +397,7 @@ describe("writeHealthEntity (durable write driven through handle.mutate)", () =>
392
397
  let ran = false;
393
398
  const next = await writeHealthEntity({
394
399
  handle: undefined,
395
- systemId: "sys-1",
400
+ entityId: "sys-1",
396
401
  apply: async () => {
397
402
  ran = true;
398
403
  return { status: "healthy", healthyChecks: 1, totalChecks: 1 };
@@ -415,7 +420,7 @@ describe("writeHealthEntity (durable write driven through handle.mutate)", () =>
415
420
  // apply commits, THEN the handle throws (emit failure). Must not rethrow.
416
421
  const result = await writeHealthEntity({
417
422
  handle,
418
- systemId: "sys-1",
423
+ entityId: "sys-1",
419
424
  apply: async () => ({
420
425
  status: "unhealthy",
421
426
  healthyChecks: 0,
@@ -442,7 +447,7 @@ describe("writeHealthEntity (durable write driven through handle.mutate)", () =>
442
447
  await expect(
443
448
  writeHealthEntity({
444
449
  handle,
445
- systemId: "sys-1",
450
+ entityId: "sys-1",
446
451
  apply: async () => {
447
452
  throw new Error("insert failed");
448
453
  },
@@ -527,7 +532,7 @@ describe("first-run-unhealthy degradation (Defect 1 regression)", () => {
527
532
 
528
533
  const next = await writeHealthEntity({
529
534
  handle,
530
- systemId: "sys-1",
535
+ entityId: "sys-1",
531
536
  apply: async () => {
532
537
  // The durable first run lands here (unhealthy).
533
538
  firstRunRecorded = true;
@@ -645,7 +650,7 @@ describe("per-system serialization (Defect 2 regression)", () => {
645
650
  const evalOnce = () =>
646
651
  writeHealthEntity({
647
652
  handle,
648
- systemId: "sys-1",
653
+ entityId: "sys-1",
649
654
  serialize,
650
655
  apply: async () => {
651
656
  // The durable "insert failing run" — first writer flips the state.
@@ -691,4 +696,377 @@ describe("per-system serialization (Defect 2 regression)", () => {
691
696
  // The advisory lock was acquired with the per-system namespaced key.
692
697
  expect(keys).toContain("health:sys-42");
693
698
  });
699
+
700
+ it("serializes per ENV-QUALIFIED id so distinct envs / the rollup use distinct lock keys", async () => {
701
+ const keys: string[] = [];
702
+ const advisoryLock = {
703
+ tryAcquire: async () => ({ release: async () => {} }),
704
+ withXactLock<T>({
705
+ key,
706
+ fn,
707
+ }: {
708
+ key: string;
709
+ fn: () => Promise<T>;
710
+ }): Promise<T> {
711
+ keys.push(key);
712
+ return fn();
713
+ },
714
+ } satisfies Parameters<
715
+ typeof createHealthEntitySerializer
716
+ >[0]["advisoryLock"];
717
+
718
+ const make = createHealthEntitySerializer({ advisoryLock });
719
+ await make(encodeHealthEntityId({ systemId: "sys-1", environmentId: "prod" }))(
720
+ async () => "ok",
721
+ );
722
+ await make(encodeHealthEntityId({ systemId: "sys-1", environmentId: "staging" }))(
723
+ async () => "ok",
724
+ );
725
+ await make(encodeHealthEntityId({ systemId: "sys-1" }))(async () => "ok");
726
+
727
+ // Per-env keys are env-qualified; the rollup uses the bare systemId. All
728
+ // three are DISTINCT, so they never block each other.
729
+ expect(keys).toEqual([
730
+ "health:sys-1::prod",
731
+ "health:sys-1::staging",
732
+ "health:sys-1",
733
+ ]);
734
+ });
735
+ });
736
+
737
+ // ──────────────────────────────────────────────────────────────────────────
738
+ // PHASE 3b: the `health` entity is env-keyed — `<systemId>` (rollup) and
739
+ // `<systemId>::<environmentId>` (per-env) views share one kind, distinguished
740
+ // only by id-shape. The rollup MUST preserve the pre-3b system-level contract.
741
+ // ──────────────────────────────────────────────────────────────────────────
742
+
743
+ describe("health-entity-id encode/parse round-trip", () => {
744
+ it("encodes the bare systemId for the rollup (no environment)", () => {
745
+ expect(encodeHealthEntityId({ systemId: "sys-1" })).toBe("sys-1");
746
+ expect(encodeHealthEntityId({ systemId: "sys-1", environmentId: null })).toBe(
747
+ "sys-1",
748
+ );
749
+ });
750
+
751
+ it("encodes `<systemId>::<environmentId>` for a per-env id", () => {
752
+ expect(
753
+ encodeHealthEntityId({ systemId: "sys-1", environmentId: "prod" }),
754
+ ).toBe("sys-1::prod");
755
+ });
756
+
757
+ it("parses a bare id as the rollup (environmentId null)", () => {
758
+ expect(parseHealthEntityId("sys-1")).toEqual({
759
+ systemId: "sys-1",
760
+ environmentId: null,
761
+ });
762
+ });
763
+
764
+ it("parses a per-env id into (systemId, environmentId)", () => {
765
+ expect(parseHealthEntityId("sys-1::prod")).toEqual({
766
+ systemId: "sys-1",
767
+ environmentId: "prod",
768
+ });
769
+ });
770
+ });
771
+
772
+ /** Sentinel key for the env-less slice (`environmentId === null`) in the fake.
773
+ * Kept DISTINCT from the rollup key (`"<systemId>"`, selected by `undefined`)
774
+ * so the fake faithfully models production's `IS NULL` filter — collapsing
775
+ * them is what masked the rollup BLOCKER. */
776
+ const ENVLESS_KEY = "::__envless__";
777
+
778
+ /**
779
+ * Env-aware fake service: `getSystemHealthStatus(systemId, environmentId)`
780
+ * returns canned per-(system, env) state, distinguishing all THREE arg modes
781
+ * exactly as production's SQL does:
782
+ * - `environmentId === undefined` ⇒ ROLLUP (all runs) — key `"<systemId>"`.
783
+ * - `environmentId === null` ⇒ ENV-LESS slice (`env_id IS NULL`) — key
784
+ * `"<systemId>::__envless__"` (DISTINCT from the rollup key).
785
+ * - a string ⇒ per-env slice — key `"<systemId>::<env>"`.
786
+ */
787
+ function fakeEnvService(
788
+ byEntityId: Record<
789
+ string,
790
+ { status: CheckStatus; checkStatuses: Array<{ status: CheckStatus }> }
791
+ >,
792
+ ): HealthCheckService {
793
+ return {
794
+ getSystemHealthStatus: async (
795
+ systemId: string,
796
+ environmentId?: string | null,
797
+ ) => {
798
+ const key =
799
+ environmentId === undefined
800
+ ? systemId
801
+ : environmentId === null
802
+ ? `${systemId}${ENVLESS_KEY}`
803
+ : `${systemId}::${environmentId}`;
804
+ const found = byEntityId[key];
805
+ return {
806
+ status: found?.status ?? ("healthy" as CheckStatus),
807
+ evaluatedAt: new Date(),
808
+ checkStatuses: (found?.checkStatuses ?? []).map((c, i) => ({
809
+ configurationId: `cfg-${i}`,
810
+ configurationName: `Check ${i}`,
811
+ status: c.status,
812
+ runsConsidered: 1,
813
+ })),
814
+ };
815
+ },
816
+ } as unknown as HealthCheckService;
817
+ }
818
+
819
+ describe("createHealthEntityRead — env-keyed (rollup vs per-env)", () => {
820
+ it("resolves the per-env view for a `<systemId>::<env>` id and the rollup for a bare id", async () => {
821
+ const service = fakeEnvService({
822
+ // Rollup: worst across envs (unhealthy because prod is unhealthy).
823
+ "sys-1": {
824
+ status: "unhealthy",
825
+ checkStatuses: [{ status: "unhealthy" }],
826
+ },
827
+ "sys-1::prod": {
828
+ status: "unhealthy",
829
+ checkStatuses: [{ status: "unhealthy" }],
830
+ },
831
+ "sys-1::staging": {
832
+ status: "healthy",
833
+ checkStatuses: [{ status: "healthy" }],
834
+ },
835
+ });
836
+ const read = createHealthEntityRead({ service });
837
+ const out = await read(["sys-1", "sys-1::prod", "sys-1::staging"]);
838
+
839
+ // Keyed by the ORIGINAL (env-qualified) id, each resolving the right view.
840
+ expect(out["sys-1"]?.status).toBe("unhealthy"); // rollup
841
+ expect(out["sys-1::prod"]?.status).toBe("unhealthy"); // per-env
842
+ expect(out["sys-1::staging"]?.status).toBe("healthy"); // per-env
843
+ });
844
+
845
+ it("rollup of a system WITH environments reads ALL runs (worst status), NOT the env-less slice (BLOCKER regression)", async () => {
846
+ // A system whose runs ALL carry a non-null env_id: there is NO env-less
847
+ // slice entry. The bare-`<systemId>` ROLLUP must read ALL runs (worst
848
+ // status across envs), NOT `env_id IS NULL` (which would find zero rows and
849
+ // report default healthy). The fake omits the ENV-LESS key entirely, so a
850
+ // bug that resolved the rollup via `null` would return default healthy here.
851
+ const service = fakeEnvService({
852
+ // Rollup (all-runs / `undefined`): worst across envs = unhealthy.
853
+ "sys-1": {
854
+ status: "unhealthy",
855
+ checkStatuses: [{ status: "unhealthy" }],
856
+ },
857
+ "sys-1::prod": {
858
+ status: "unhealthy",
859
+ checkStatuses: [{ status: "unhealthy" }],
860
+ },
861
+ "sys-1::staging": {
862
+ status: "healthy",
863
+ checkStatuses: [{ status: "healthy" }],
864
+ },
865
+ // NOTE: deliberately NO "sys-1::__envless__" entry — every run has an env.
866
+ });
867
+ const read = createHealthEntityRead({ service });
868
+ const out = await read(["sys-1"]);
869
+ // Worst status across environments — NOT the (empty) env-less slice's
870
+ // default healthy.
871
+ expect(out["sys-1"]?.status).toBe("unhealthy");
872
+ });
873
+
874
+ it("rollup preserves the pre-3b contract: a bare-systemId read equals today's status when no envs exist", async () => {
875
+ // A system with no environments has only the bare-systemId (rollup =
876
+ // env-less) entry — exactly the pre-3b shape.
877
+ const service = fakeEnvService({
878
+ "sys-1": {
879
+ status: "degraded",
880
+ checkStatuses: [{ status: "healthy" }, { status: "degraded" }],
881
+ },
882
+ });
883
+ const read = createHealthEntityRead({ service });
884
+ const out = await read(["sys-1"]);
885
+ expect(out["sys-1"]).toEqual({
886
+ status: "degraded",
887
+ healthyChecks: 1,
888
+ totalChecks: 2,
889
+ });
890
+ });
891
+
892
+ it("omits a per-env id whose system has no enabled checks (existence gate holds per id)", async () => {
893
+ const service = fakeEnvService({
894
+ "sys-1::prod": { status: "healthy", checkStatuses: [] },
895
+ });
896
+ const read = createHealthEntityRead({ service });
897
+ const out = await read(["sys-1::prod"]);
898
+ expect(out["sys-1::prod"]).toBeUndefined();
899
+ });
900
+ });
901
+
902
+ describe("computeHealthEntityState — environment-aware", () => {
903
+ it("computes the env-scoped view for a concrete environment", async () => {
904
+ const service = fakeEnvService({
905
+ "sys-1::prod": {
906
+ status: "unhealthy",
907
+ checkStatuses: [{ status: "unhealthy" }, { status: "healthy" }],
908
+ },
909
+ });
910
+ const state = await computeHealthEntityState({
911
+ service,
912
+ systemId: "sys-1",
913
+ environmentId: "prod",
914
+ });
915
+ expect(state).toEqual({
916
+ status: "unhealthy",
917
+ healthyChecks: 1,
918
+ totalChecks: 2,
919
+ });
920
+ });
921
+
922
+ it("computes the rollup view when environmentId is omitted", async () => {
923
+ const service = fakeEnvService({
924
+ "sys-1": {
925
+ status: "degraded",
926
+ checkStatuses: [{ status: "degraded" }],
927
+ },
928
+ });
929
+ const state = await computeHealthEntityState({ service, systemId: "sys-1" });
930
+ expect(state?.status).toBe("degraded");
931
+ });
932
+ });
933
+
934
+ describe("healthChangeToPayload — env-qualified id", () => {
935
+ it("sets payload.environmentId for a PER-ENV change and validates against the schema", () => {
936
+ const payload = healthChangeToPayload(
937
+ change({ id: encodeHealthEntityId({ systemId: "sys-1", environmentId: "prod" }) }),
938
+ );
939
+ const parsed = systemDegradedTrigger.payloadSchema.parse(payload);
940
+ // systemId is the bare systemId portion; environmentId is the env.
941
+ expect(parsed.systemId).toBe("sys-1");
942
+ expect(parsed.environmentId).toBe("prod");
943
+ });
944
+
945
+ it("OMITS environmentId for the system ROLLUP change (back-compat: bare systemId)", () => {
946
+ const payload = healthChangeToPayload(change({ id: "sys-1" }));
947
+ const parsed = systemHealthChangedTrigger.payloadSchema.parse(payload);
948
+ expect(parsed.systemId).toBe("sys-1");
949
+ // Absent for the rollup — existing system-level automations are unaffected.
950
+ expect(parsed.environmentId).toBeUndefined();
951
+ });
952
+ });
953
+
954
+ describe("classifyHealthChange — env-qualified id", () => {
955
+ it("reports the systemId portion + environmentId for a per-env change", () => {
956
+ const c = classifyHealthChange(
957
+ change({ id: encodeHealthEntityId({ systemId: "sys-1", environmentId: "prod" }) }),
958
+ );
959
+ expect(c.systemId).toBe("sys-1");
960
+ expect(c.environmentId).toBe("prod");
961
+ expect(c.degraded).toBe(true);
962
+ });
963
+
964
+ it("reports environmentId null for the rollup change", () => {
965
+ const c = classifyHealthChange(change({ id: "sys-1" }));
966
+ expect(c.systemId).toBe("sys-1");
967
+ expect(c.environmentId).toBeNull();
968
+ });
969
+ });
970
+
971
+ describe("per-env + rollup serialization under concurrent writes", () => {
972
+ /** Same keyed-serializer stand-in as the Defect-2 test, reused here. */
973
+ function makeKeyedSerializer() {
974
+ const chains = new Map<string, Promise<unknown>>();
975
+ return (key: string) =>
976
+ <T>(fn: () => Promise<T>): Promise<T> => {
977
+ const prior = chains.get(key) ?? Promise.resolve();
978
+ const next = prior.then(fn, fn);
979
+ chains.set(
980
+ key,
981
+ next.then(
982
+ () => undefined,
983
+ () => undefined,
984
+ ),
985
+ );
986
+ return next;
987
+ };
988
+ }
989
+
990
+ it("two concurrent evals of the SAME (system, env) emit exactly one transition", async () => {
991
+ let unhealthy = false;
992
+ const compute = (): HealthEntityState => ({
993
+ status: unhealthy ? "unhealthy" : "healthy",
994
+ healthyChecks: unhealthy ? 0 : 1,
995
+ totalChecks: 1,
996
+ });
997
+ const emitted: Array<{
998
+ prev: HealthEntityState | undefined;
999
+ next: HealthEntityState;
1000
+ }> = [];
1001
+ const handle = {
1002
+ kind: HEALTH_ENTITY_KIND,
1003
+ async mutate(input: MutateInput<HealthEntityState>) {
1004
+ const prev = compute();
1005
+ await Promise.resolve();
1006
+ const next = await input.apply();
1007
+ if (prev.status !== next.status) emitted.push({ prev, next });
1008
+ return next;
1009
+ },
1010
+ } as unknown as EntityHandle<HealthEntityState>;
1011
+
1012
+ const keyed = makeKeyedSerializer();
1013
+ const envId = encodeHealthEntityId({ systemId: "sys-1", environmentId: "prod" });
1014
+ const serialize = keyed(`health:${envId}`);
1015
+ const evalOnce = () =>
1016
+ writeHealthEntity({
1017
+ handle,
1018
+ entityId: envId,
1019
+ serialize,
1020
+ apply: async () => {
1021
+ unhealthy = true;
1022
+ return compute();
1023
+ },
1024
+ });
1025
+
1026
+ await Promise.all([evalOnce(), evalOnce()]);
1027
+ expect(emitted).toHaveLength(1);
1028
+ });
1029
+
1030
+ it("a per-env write and the rollup write run in PARALLEL (distinct keys, no mutual block)", async () => {
1031
+ const keyed = makeKeyedSerializer();
1032
+ const order: string[] = [];
1033
+ const envId = encodeHealthEntityId({ systemId: "sys-1", environmentId: "prod" });
1034
+ const rollupId = encodeHealthEntityId({ systemId: "sys-1" });
1035
+
1036
+ const handle = {
1037
+ kind: HEALTH_ENTITY_KIND,
1038
+ async mutate(input: MutateInput<HealthEntityState>) {
1039
+ return input.apply();
1040
+ },
1041
+ } as unknown as EntityHandle<HealthEntityState>;
1042
+
1043
+ // The env write holds its critical section across a microtask; if the
1044
+ // rollup were on the SAME key it would be forced to wait. Distinct keys
1045
+ // let them interleave.
1046
+ const envWrite = writeHealthEntity({
1047
+ handle,
1048
+ entityId: envId,
1049
+ serialize: keyed(`health:${envId}`),
1050
+ apply: async () => {
1051
+ order.push("env-start");
1052
+ await Promise.resolve();
1053
+ order.push("env-end");
1054
+ return { status: "healthy", healthyChecks: 1, totalChecks: 1 };
1055
+ },
1056
+ });
1057
+ const rollupWrite = writeHealthEntity({
1058
+ handle,
1059
+ entityId: rollupId,
1060
+ serialize: keyed(`health:${rollupId}`),
1061
+ apply: async () => {
1062
+ order.push("rollup-start");
1063
+ return { status: "healthy", healthyChecks: 1, totalChecks: 1 };
1064
+ },
1065
+ });
1066
+
1067
+ await Promise.all([envWrite, rollupWrite]);
1068
+
1069
+ // Interleaved: rollup-start ran before env-end (they did not serialize).
1070
+ expect(order.indexOf("rollup-start")).toBeLessThan(order.indexOf("env-end"));
1071
+ });
694
1072
  });
@@ -31,8 +31,9 @@ import type {
31
31
  EntityRead,
32
32
  } from "@checkstack/automation-backend";
33
33
  import type { HealthCheckService } from "./service";
34
+ import { parseHealthEntityId } from "./health-entity-id";
34
35
 
35
- /** Entity kind id for the per-system aggregated health. */
36
+ /** Entity kind id for the aggregated health (system rollup + per-environment). */
36
37
  export const HEALTH_ENTITY_KIND = "health";
37
38
 
38
39
  /**
@@ -121,15 +122,23 @@ function readNumber(
121
122
  * Restores the keys operators read (`trigger.payload.systemId`,
122
123
  * `.previousStatus`, …) that the generic change shape omits.
123
124
  *
124
- * `systemId` is the entity id; `previousStatus` is `prev.status` and `newStatus`
125
- * is `next.status`; `healthyChecks` / `totalChecks` come from `next`;
126
- * `timestamp` is the change's `occurredAt`. `systemName` is not derivable from a
127
- * health change (it lives in the catalog) and is OPTIONAL on the schemas, so it
128
- * is omitted.
125
+ * The entity id is now env-qualified (Phase 3b): `payload.systemId` is ALWAYS
126
+ * the systemId portion (so existing automations reading `trigger.payload.systemId`
127
+ * are unaffected — the rollup carries the bare systemId), and the NEW optional
128
+ * `payload.environmentId` is the env portion — present only for a per-environment
129
+ * change, absent (undefined) for the system rollup. `previousStatus` is
130
+ * `prev.status` and `newStatus` is `next.status`; `healthyChecks` / `totalChecks`
131
+ * come from `next`; `timestamp` is the change's `occurredAt`. `systemName` is not
132
+ * derivable from a health change (it lives in the catalog) and is OPTIONAL on the
133
+ * schemas, so it is omitted.
129
134
  */
130
135
  export const healthChangeToPayload: EntityChangePayloadMapper = (changed) => {
136
+ const { systemId, environmentId } = parseHealthEntityId(changed.id);
131
137
  return {
132
- systemId: changed.id,
138
+ systemId,
139
+ // Present only for a per-env change; omitted for the rollup so the field
140
+ // is `undefined` (the optional schema accepts both).
141
+ ...(environmentId === null ? {} : { environmentId }),
133
142
  previousStatus: readStatus(changed.prev) ?? undefined,
134
143
  newStatus: readStatus(changed.next) ?? undefined,
135
144
  healthyChecks: readNumber(changed.next, "healthyChecks") ?? 0,
@@ -152,6 +161,12 @@ export const healthChangeToPayload: EntityChangePayloadMapper = (changed) => {
152
161
  */
153
162
  export interface HealthChangeClassification {
154
163
  systemId: string;
164
+ /**
165
+ * The environment portion of the entity id (Phase 3b). `null` for the
166
+ * system rollup change; the env id for a per-environment change. Cross-plugin
167
+ * consumers that only care about the system (SLO / dependency) can ignore it.
168
+ */
169
+ environmentId: string | null;
155
170
  previousStatus: string | null;
156
171
  newStatus: string | null;
157
172
  degraded: boolean;
@@ -163,6 +178,7 @@ export function classifyHealthChange(changed: {
163
178
  prev: Record<string, unknown> | null;
164
179
  next: Record<string, unknown> | null;
165
180
  }): HealthChangeClassification {
181
+ const { systemId, environmentId } = parseHealthEntityId(changed.id);
166
182
  const previousStatus = readStatus(changed.prev);
167
183
  const newStatus = readStatus(changed.next);
168
184
  const bothPresent = previousStatus !== null && newStatus !== null;
@@ -171,7 +187,8 @@ export function classifyHealthChange(changed: {
171
187
  const recovered =
172
188
  bothPresent && newStatus === "healthy" && previousStatus !== "healthy";
173
189
  return {
174
- systemId: changed.id,
190
+ systemId,
191
+ environmentId,
175
192
  previousStatus,
176
193
  newStatus,
177
194
  degraded,
@@ -209,9 +226,17 @@ export function classifyHealthChange(changed: {
209
226
  export async function computeHealthEntityState(args: {
210
227
  service: HealthCheckService;
211
228
  systemId: string;
229
+ /**
230
+ * Environment to compute the view for (Phase 3b). `undefined` = the SYSTEM
231
+ * ROLLUP (worst status across all environments + env-less runs — the
232
+ * all-runs aggregate, §7.4.2). `null` = the env-less slice. A string = that
233
+ * environment's per-env view. The existence gate (`checkStatuses.length`) is
234
+ * env-independent, so a per-env view and the rollup agree on totalChecks.
235
+ */
236
+ environmentId?: string | null;
212
237
  }): Promise<HealthEntityState | undefined> {
213
- const { service, systemId } = args;
214
- const overview = await service.getSystemHealthStatus(systemId);
238
+ const { service, systemId, environmentId } = args;
239
+ const overview = await service.getSystemHealthStatus(systemId, environmentId);
215
240
  // No enabled check associations ⇒ no health entity for this system.
216
241
  if (overview.checkStatuses.length === 0) return undefined;
217
242
  return {
@@ -224,10 +249,16 @@ export async function computeHealthEntityState(args: {
224
249
 
225
250
  /**
226
251
  * Build the PLUGIN-BACKED + COMPUTED `read` accessor for the `health` entity.
227
- * For each systemId, assembles the view via {@link computeHealthEntityState}
228
- * (systems with no runs omitted). This is the single source of truth that
229
- * `handle.mutate` snapshots `prev` from and `get`/`getMany`/scope enrichment
230
- * route through — no framework `entity_state` storage.
252
+ *
253
+ * Env-aware id parsing (Phase 3b, §7.4.2): each incoming id is parsed via
254
+ * {@link parseHealthEntityId}. A BARE `"<systemId>"` resolves the SYSTEM
255
+ * ROLLUP; a `"<systemId>::<environmentId>"` resolves that environment's
256
+ * per-env view. The result is keyed by the ORIGINAL id, so the reactive
257
+ * engine, `getMany`, and scope enrichment all see the right view for the id
258
+ * they asked for. Systems with no enabled check associations are omitted
259
+ * (existence gate). No framework `entity_state` storage — compute-on-read from
260
+ * the durable, env-keyed `health_check_runs`, so a read returns the same answer
261
+ * on every pod (state-and-scale).
231
262
  */
232
263
  export function createHealthEntityRead(deps: {
233
264
  service: HealthCheckService;
@@ -237,9 +268,20 @@ export function createHealthEntityRead(deps: {
237
268
  if (ids.length === 0) return {};
238
269
  const out: Record<string, HealthEntityState> = {};
239
270
  await Promise.all(
240
- ids.map(async (systemId) => {
241
- const state = await computeHealthEntityState({ service, systemId });
242
- if (state) out[systemId] = state;
271
+ ids.map(async (id) => {
272
+ const { systemId, environmentId } = parseHealthEntityId(id);
273
+ const state = await computeHealthEntityState({
274
+ service,
275
+ systemId,
276
+ // A bare `<systemId>` id is the ROLLUP: `parseHealthEntityId`
277
+ // returns `environmentId: null` for it (so the payload mapper can
278
+ // tell "rollup → omit environmentId"), but the rollup must read ALL
279
+ // runs — `undefined` — NOT the env-less slice (`null`, which filters
280
+ // to `env_id IS NULL`). Reserve `null` for an explicit env-less
281
+ // read; map the rollup's null to undefined here.
282
+ environmentId: environmentId === null ? undefined : environmentId,
283
+ });
284
+ if (state) out[id] = state;
243
285
  }),
244
286
  );
245
287
  return out;
@@ -293,19 +335,28 @@ export function createHealthEntityRead(deps: {
293
335
  */
294
336
  export async function writeHealthEntity(args: {
295
337
  handle: EntityHandle<HealthEntityState> | undefined;
296
- systemId: string;
338
+ /**
339
+ * The `health` entity id to mutate (Phase 3b): the env-qualified
340
+ * `"<systemId>::<environmentId>"` for a per-env write, or the bare
341
+ * `"<systemId>"` for the env-less / system-rollup write. This is the id the
342
+ * framework diffs/emits, so it drives both the per-env and rollup
343
+ * `ENTITY_CHANGED`.
344
+ */
345
+ entityId: string;
297
346
  apply: () => Promise<HealthEntityState>;
298
347
  onError?: (error: unknown) => void;
299
348
  /**
300
- * Optional per-`systemId` critical section wrapping the snapshot-prev +
349
+ * Optional per-`entityId` critical section wrapping the snapshot-prev +
301
350
  * apply + diff + emit. The executor supplies a transaction-scoped advisory
302
- * lock (`withXactLock`, key `health:<systemId>`) so concurrent evaluations
303
- * of one system can't double-emit a single logical transition. Identity by
304
- * default (no serialization) for the unbound-handle / test paths.
351
+ * lock (`withXactLock`, key `health:<entityId>`) so concurrent evaluations
352
+ * of one (system, environment) — or of the rollup — can't double-emit a
353
+ * single logical transition, and per-env + rollup writes serialize against
354
+ * their OWN keys (distinct envs / the rollup don't block each other).
355
+ * Identity by default (no serialization) for the unbound-handle / test paths.
305
356
  */
306
357
  serialize?: <T>(fn: () => Promise<T>) => Promise<T>;
307
358
  }): Promise<HealthEntityState> {
308
- const { handle, systemId, apply, onError, serialize } = args;
359
+ const { handle, entityId, apply, onError, serialize } = args;
309
360
  if (!handle) {
310
361
  // No reactivity bound — run the durable write directly.
311
362
  return apply();
@@ -318,7 +369,7 @@ export async function writeHealthEntity(args: {
318
369
  // call, and we wrap that whole call so two concurrent evals serialize.
319
370
  return await run(() =>
320
371
  handle.mutate({
321
- id: systemId,
372
+ id: entityId,
322
373
  apply: async () => {
323
374
  durableState = await apply();
324
375
  return durableState;
@@ -335,19 +386,26 @@ export async function writeHealthEntity(args: {
335
386
  }
336
387
  }
337
388
 
338
- /** Advisory-lock key namespace for the per-system health critical section. */
339
- export function healthSystemLockKey(systemId: string): string {
340
- return `health:${systemId}`;
389
+ /**
390
+ * Advisory-lock key namespace for the per-entity health critical section. The
391
+ * argument is the FULL `health` entity id (Phase 3b): the bare `"<systemId>"`
392
+ * for the rollup or `"<systemId>::<environmentId>"` for a per-env write. Two
393
+ * different envs (or an env vs the rollup) get DIFFERENT keys, so they
394
+ * serialize independently and never block each other.
395
+ */
396
+ export function healthEntityLockKey(entityId: string): string {
397
+ return `health:${entityId}`;
341
398
  }
342
399
 
343
400
  /**
344
- * Build the per-`systemId` serializer for {@link writeHealthEntity} backed by
401
+ * Build the per-`entityId` serializer for {@link writeHealthEntity} backed by
345
402
  * a transaction-scoped advisory lock (`withXactLock`, key
346
- * `health:<systemId>`). The returned function blocks until it holds the
347
- * system's lock, runs `fn` (the whole snapshot-prev + apply + diff + emit), and
403
+ * `health:<entityId>`). The returned function blocks until it holds the
404
+ * entity's lock, runs `fn` (the whole snapshot-prev + apply + diff + emit), and
348
405
  * auto-releases the lock at COMMIT/ROLLBACK. Two concurrent evaluations of one
349
- * system therefore serialize — exactly one logical `healthy → degraded`
350
- * transition emits exactly one `ENTITY_CHANGED` + one transition row.
406
+ * (system, environment) — or of the rollup — therefore serialize, while
407
+ * distinct envs proceed in parallel. Exactly one logical transition per entity
408
+ * emits exactly one `ENTITY_CHANGED` + one transition row.
351
409
  *
352
410
  * `fn` does its own durable writes on the outer pool; the lock only gates
353
411
  * ENTRY to the critical section, so its connection affinity is irrelevant —
@@ -356,12 +414,12 @@ export function healthSystemLockKey(systemId: string): string {
356
414
  */
357
415
  export function createHealthEntitySerializer(deps: {
358
416
  advisoryLock: AdvisoryLockService;
359
- }): (systemId: string) => <T>(fn: () => Promise<T>) => Promise<T> {
417
+ }): (entityId: string) => <T>(fn: () => Promise<T>) => Promise<T> {
360
418
  const { advisoryLock } = deps;
361
- return (systemId) =>
419
+ return (entityId) =>
362
420
  <T>(fn: () => Promise<T>) =>
363
421
  advisoryLock.withXactLock({
364
- key: healthSystemLockKey(systemId),
422
+ key: healthEntityLockKey(entityId),
365
423
  fn: () => fn(),
366
424
  });
367
425
  }