@checkstack/healthcheck-backend 1.5.0 → 1.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +253 -0
- package/drizzle/0018_abnormal_preak.sql +10 -0
- package/drizzle/meta/0018_snapshot.json +600 -0
- package/drizzle/meta/_journal.json +7 -0
- package/package.json +32 -27
- package/src/ai/assertion-validation.test.ts +117 -0
- package/src/ai/assertion-validation.ts +147 -0
- package/src/ai/healthcheck-capabilities.test.ts +158 -0
- package/src/ai/healthcheck-capabilities.ts +217 -0
- package/src/ai/healthcheck-delete.test.ts +81 -0
- package/src/ai/healthcheck-delete.ts +81 -0
- package/src/ai/healthcheck-projection.test.ts +36 -0
- package/src/ai/healthcheck-propose.test.ts +268 -0
- package/src/ai/healthcheck-propose.ts +290 -0
- package/src/ai/healthcheck-script-tools.test.ts +93 -0
- package/src/ai/healthcheck-script-tools.ts +179 -0
- package/src/ai/healthcheck-update.test.ts +123 -0
- package/src/ai/healthcheck-update.ts +123 -0
- package/src/ai/notify-subscribers.test.ts +109 -0
- package/src/ai/notify-subscribers.ts +176 -0
- package/src/ai/register-ai-tools.test.ts +41 -0
- package/src/ai/register-ai-tools.ts +53 -0
- package/src/ai/shell-env-table.test.ts +47 -0
- package/src/automations.test.ts +2 -1
- package/src/automations.ts +9 -1
- package/src/collector-script-test.test.ts +53 -1
- package/src/collector-script-test.ts +59 -7
- package/src/effective-environments.test.ts +93 -0
- package/src/effective-environments.ts +64 -0
- package/src/health-entity-id.ts +57 -0
- package/src/health-entity.test.ts +384 -6
- package/src/health-entity.ts +93 -35
- package/src/health-state.ts +41 -4
- package/src/healthcheck-gitops-kinds.test.ts +95 -0
- package/src/healthcheck-gitops-kinds.ts +56 -13
- package/src/index.ts +30 -0
- package/src/migration-chain-contract.test.ts +57 -0
- package/src/queue-executor.test.ts +801 -0
- package/src/queue-executor.ts +336 -52
- package/src/realtime-aggregation.test.ts +30 -0
- package/src/realtime-aggregation.ts +16 -0
- package/src/retention-job.ts +167 -93
- package/src/retention-rollup.test.ts +118 -0
- package/src/router.test.ts +120 -1
- package/src/router.ts +20 -0
- package/src/schema.ts +44 -6
- package/src/service.ts +199 -43
- package/src/state-transitions.test.ts +104 -0
- package/src/state-transitions.ts +39 -1
- package/src/validate-configuration.test.ts +205 -0
- package/src/validate-configuration.ts +159 -0
- package/tsconfig.json +9 -0
|
@@ -20,6 +20,10 @@ import {
|
|
|
20
20
|
type HealthEntityState,
|
|
21
21
|
} from "./health-entity";
|
|
22
22
|
import type { HealthCheckService } from "./service";
|
|
23
|
+
import {
|
|
24
|
+
encodeHealthEntityId,
|
|
25
|
+
parseHealthEntityId,
|
|
26
|
+
} from "./health-entity-id";
|
|
23
27
|
import {
|
|
24
28
|
systemDegradedTrigger,
|
|
25
29
|
systemHealthyTrigger,
|
|
@@ -162,6 +166,7 @@ describe("classifyHealthChange (cross-plugin consumer predicate)", () => {
|
|
|
162
166
|
const c = classifyHealthChange(change());
|
|
163
167
|
expect(c).toEqual({
|
|
164
168
|
systemId: "sys-1",
|
|
169
|
+
environmentId: null,
|
|
165
170
|
previousStatus: "healthy",
|
|
166
171
|
newStatus: "unhealthy",
|
|
167
172
|
degraded: true,
|
|
@@ -370,7 +375,7 @@ describe("writeHealthEntity (durable write driven through handle.mutate)", () =>
|
|
|
370
375
|
|
|
371
376
|
const next = await writeHealthEntity({
|
|
372
377
|
handle,
|
|
373
|
-
|
|
378
|
+
entityId: "sys-1",
|
|
374
379
|
apply: async () => {
|
|
375
380
|
persisted = { status: "unhealthy", healthyChecks: 0, totalChecks: 2 };
|
|
376
381
|
return persisted;
|
|
@@ -392,7 +397,7 @@ describe("writeHealthEntity (durable write driven through handle.mutate)", () =>
|
|
|
392
397
|
let ran = false;
|
|
393
398
|
const next = await writeHealthEntity({
|
|
394
399
|
handle: undefined,
|
|
395
|
-
|
|
400
|
+
entityId: "sys-1",
|
|
396
401
|
apply: async () => {
|
|
397
402
|
ran = true;
|
|
398
403
|
return { status: "healthy", healthyChecks: 1, totalChecks: 1 };
|
|
@@ -415,7 +420,7 @@ describe("writeHealthEntity (durable write driven through handle.mutate)", () =>
|
|
|
415
420
|
// apply commits, THEN the handle throws (emit failure). Must not rethrow.
|
|
416
421
|
const result = await writeHealthEntity({
|
|
417
422
|
handle,
|
|
418
|
-
|
|
423
|
+
entityId: "sys-1",
|
|
419
424
|
apply: async () => ({
|
|
420
425
|
status: "unhealthy",
|
|
421
426
|
healthyChecks: 0,
|
|
@@ -442,7 +447,7 @@ describe("writeHealthEntity (durable write driven through handle.mutate)", () =>
|
|
|
442
447
|
await expect(
|
|
443
448
|
writeHealthEntity({
|
|
444
449
|
handle,
|
|
445
|
-
|
|
450
|
+
entityId: "sys-1",
|
|
446
451
|
apply: async () => {
|
|
447
452
|
throw new Error("insert failed");
|
|
448
453
|
},
|
|
@@ -527,7 +532,7 @@ describe("first-run-unhealthy degradation (Defect 1 regression)", () => {
|
|
|
527
532
|
|
|
528
533
|
const next = await writeHealthEntity({
|
|
529
534
|
handle,
|
|
530
|
-
|
|
535
|
+
entityId: "sys-1",
|
|
531
536
|
apply: async () => {
|
|
532
537
|
// The durable first run lands here (unhealthy).
|
|
533
538
|
firstRunRecorded = true;
|
|
@@ -645,7 +650,7 @@ describe("per-system serialization (Defect 2 regression)", () => {
|
|
|
645
650
|
const evalOnce = () =>
|
|
646
651
|
writeHealthEntity({
|
|
647
652
|
handle,
|
|
648
|
-
|
|
653
|
+
entityId: "sys-1",
|
|
649
654
|
serialize,
|
|
650
655
|
apply: async () => {
|
|
651
656
|
// The durable "insert failing run" — first writer flips the state.
|
|
@@ -691,4 +696,377 @@ describe("per-system serialization (Defect 2 regression)", () => {
|
|
|
691
696
|
// The advisory lock was acquired with the per-system namespaced key.
|
|
692
697
|
expect(keys).toContain("health:sys-42");
|
|
693
698
|
});
|
|
699
|
+
|
|
700
|
+
it("serializes per ENV-QUALIFIED id so distinct envs / the rollup use distinct lock keys", async () => {
|
|
701
|
+
const keys: string[] = [];
|
|
702
|
+
const advisoryLock = {
|
|
703
|
+
tryAcquire: async () => ({ release: async () => {} }),
|
|
704
|
+
withXactLock<T>({
|
|
705
|
+
key,
|
|
706
|
+
fn,
|
|
707
|
+
}: {
|
|
708
|
+
key: string;
|
|
709
|
+
fn: () => Promise<T>;
|
|
710
|
+
}): Promise<T> {
|
|
711
|
+
keys.push(key);
|
|
712
|
+
return fn();
|
|
713
|
+
},
|
|
714
|
+
} satisfies Parameters<
|
|
715
|
+
typeof createHealthEntitySerializer
|
|
716
|
+
>[0]["advisoryLock"];
|
|
717
|
+
|
|
718
|
+
const make = createHealthEntitySerializer({ advisoryLock });
|
|
719
|
+
await make(encodeHealthEntityId({ systemId: "sys-1", environmentId: "prod" }))(
|
|
720
|
+
async () => "ok",
|
|
721
|
+
);
|
|
722
|
+
await make(encodeHealthEntityId({ systemId: "sys-1", environmentId: "staging" }))(
|
|
723
|
+
async () => "ok",
|
|
724
|
+
);
|
|
725
|
+
await make(encodeHealthEntityId({ systemId: "sys-1" }))(async () => "ok");
|
|
726
|
+
|
|
727
|
+
// Per-env keys are env-qualified; the rollup uses the bare systemId. All
|
|
728
|
+
// three are DISTINCT, so they never block each other.
|
|
729
|
+
expect(keys).toEqual([
|
|
730
|
+
"health:sys-1::prod",
|
|
731
|
+
"health:sys-1::staging",
|
|
732
|
+
"health:sys-1",
|
|
733
|
+
]);
|
|
734
|
+
});
|
|
735
|
+
});
|
|
736
|
+
|
|
737
|
+
// ──────────────────────────────────────────────────────────────────────────
|
|
738
|
+
// PHASE 3b: the `health` entity is env-keyed — `<systemId>` (rollup) and
|
|
739
|
+
// `<systemId>::<environmentId>` (per-env) views share one kind, distinguished
|
|
740
|
+
// only by id-shape. The rollup MUST preserve the pre-3b system-level contract.
|
|
741
|
+
// ──────────────────────────────────────────────────────────────────────────
|
|
742
|
+
|
|
743
|
+
describe("health-entity-id encode/parse round-trip", () => {
|
|
744
|
+
it("encodes the bare systemId for the rollup (no environment)", () => {
|
|
745
|
+
expect(encodeHealthEntityId({ systemId: "sys-1" })).toBe("sys-1");
|
|
746
|
+
expect(encodeHealthEntityId({ systemId: "sys-1", environmentId: null })).toBe(
|
|
747
|
+
"sys-1",
|
|
748
|
+
);
|
|
749
|
+
});
|
|
750
|
+
|
|
751
|
+
it("encodes `<systemId>::<environmentId>` for a per-env id", () => {
|
|
752
|
+
expect(
|
|
753
|
+
encodeHealthEntityId({ systemId: "sys-1", environmentId: "prod" }),
|
|
754
|
+
).toBe("sys-1::prod");
|
|
755
|
+
});
|
|
756
|
+
|
|
757
|
+
it("parses a bare id as the rollup (environmentId null)", () => {
|
|
758
|
+
expect(parseHealthEntityId("sys-1")).toEqual({
|
|
759
|
+
systemId: "sys-1",
|
|
760
|
+
environmentId: null,
|
|
761
|
+
});
|
|
762
|
+
});
|
|
763
|
+
|
|
764
|
+
it("parses a per-env id into (systemId, environmentId)", () => {
|
|
765
|
+
expect(parseHealthEntityId("sys-1::prod")).toEqual({
|
|
766
|
+
systemId: "sys-1",
|
|
767
|
+
environmentId: "prod",
|
|
768
|
+
});
|
|
769
|
+
});
|
|
770
|
+
});
|
|
771
|
+
|
|
772
|
+
/** Sentinel key for the env-less slice (`environmentId === null`) in the fake.
|
|
773
|
+
* Kept DISTINCT from the rollup key (`"<systemId>"`, selected by `undefined`)
|
|
774
|
+
* so the fake faithfully models production's `IS NULL` filter — collapsing
|
|
775
|
+
* them is what masked the rollup BLOCKER. */
|
|
776
|
+
const ENVLESS_KEY = "::__envless__";
|
|
777
|
+
|
|
778
|
+
/**
|
|
779
|
+
* Env-aware fake service: `getSystemHealthStatus(systemId, environmentId)`
|
|
780
|
+
* returns canned per-(system, env) state, distinguishing all THREE arg modes
|
|
781
|
+
* exactly as production's SQL does:
|
|
782
|
+
* - `environmentId === undefined` ⇒ ROLLUP (all runs) — key `"<systemId>"`.
|
|
783
|
+
* - `environmentId === null` ⇒ ENV-LESS slice (`env_id IS NULL`) — key
|
|
784
|
+
* `"<systemId>::__envless__"` (DISTINCT from the rollup key).
|
|
785
|
+
* - a string ⇒ per-env slice — key `"<systemId>::<env>"`.
|
|
786
|
+
*/
|
|
787
|
+
function fakeEnvService(
|
|
788
|
+
byEntityId: Record<
|
|
789
|
+
string,
|
|
790
|
+
{ status: CheckStatus; checkStatuses: Array<{ status: CheckStatus }> }
|
|
791
|
+
>,
|
|
792
|
+
): HealthCheckService {
|
|
793
|
+
return {
|
|
794
|
+
getSystemHealthStatus: async (
|
|
795
|
+
systemId: string,
|
|
796
|
+
environmentId?: string | null,
|
|
797
|
+
) => {
|
|
798
|
+
const key =
|
|
799
|
+
environmentId === undefined
|
|
800
|
+
? systemId
|
|
801
|
+
: environmentId === null
|
|
802
|
+
? `${systemId}${ENVLESS_KEY}`
|
|
803
|
+
: `${systemId}::${environmentId}`;
|
|
804
|
+
const found = byEntityId[key];
|
|
805
|
+
return {
|
|
806
|
+
status: found?.status ?? ("healthy" as CheckStatus),
|
|
807
|
+
evaluatedAt: new Date(),
|
|
808
|
+
checkStatuses: (found?.checkStatuses ?? []).map((c, i) => ({
|
|
809
|
+
configurationId: `cfg-${i}`,
|
|
810
|
+
configurationName: `Check ${i}`,
|
|
811
|
+
status: c.status,
|
|
812
|
+
runsConsidered: 1,
|
|
813
|
+
})),
|
|
814
|
+
};
|
|
815
|
+
},
|
|
816
|
+
} as unknown as HealthCheckService;
|
|
817
|
+
}
|
|
818
|
+
|
|
819
|
+
describe("createHealthEntityRead — env-keyed (rollup vs per-env)", () => {
|
|
820
|
+
it("resolves the per-env view for a `<systemId>::<env>` id and the rollup for a bare id", async () => {
|
|
821
|
+
const service = fakeEnvService({
|
|
822
|
+
// Rollup: worst across envs (unhealthy because prod is unhealthy).
|
|
823
|
+
"sys-1": {
|
|
824
|
+
status: "unhealthy",
|
|
825
|
+
checkStatuses: [{ status: "unhealthy" }],
|
|
826
|
+
},
|
|
827
|
+
"sys-1::prod": {
|
|
828
|
+
status: "unhealthy",
|
|
829
|
+
checkStatuses: [{ status: "unhealthy" }],
|
|
830
|
+
},
|
|
831
|
+
"sys-1::staging": {
|
|
832
|
+
status: "healthy",
|
|
833
|
+
checkStatuses: [{ status: "healthy" }],
|
|
834
|
+
},
|
|
835
|
+
});
|
|
836
|
+
const read = createHealthEntityRead({ service });
|
|
837
|
+
const out = await read(["sys-1", "sys-1::prod", "sys-1::staging"]);
|
|
838
|
+
|
|
839
|
+
// Keyed by the ORIGINAL (env-qualified) id, each resolving the right view.
|
|
840
|
+
expect(out["sys-1"]?.status).toBe("unhealthy"); // rollup
|
|
841
|
+
expect(out["sys-1::prod"]?.status).toBe("unhealthy"); // per-env
|
|
842
|
+
expect(out["sys-1::staging"]?.status).toBe("healthy"); // per-env
|
|
843
|
+
});
|
|
844
|
+
|
|
845
|
+
it("rollup of a system WITH environments reads ALL runs (worst status), NOT the env-less slice (BLOCKER regression)", async () => {
|
|
846
|
+
// A system whose runs ALL carry a non-null env_id: there is NO env-less
|
|
847
|
+
// slice entry. The bare-`<systemId>` ROLLUP must read ALL runs (worst
|
|
848
|
+
// status across envs), NOT `env_id IS NULL` (which would find zero rows and
|
|
849
|
+
// report default healthy). The fake omits the ENV-LESS key entirely, so a
|
|
850
|
+
// bug that resolved the rollup via `null` would return default healthy here.
|
|
851
|
+
const service = fakeEnvService({
|
|
852
|
+
// Rollup (all-runs / `undefined`): worst across envs = unhealthy.
|
|
853
|
+
"sys-1": {
|
|
854
|
+
status: "unhealthy",
|
|
855
|
+
checkStatuses: [{ status: "unhealthy" }],
|
|
856
|
+
},
|
|
857
|
+
"sys-1::prod": {
|
|
858
|
+
status: "unhealthy",
|
|
859
|
+
checkStatuses: [{ status: "unhealthy" }],
|
|
860
|
+
},
|
|
861
|
+
"sys-1::staging": {
|
|
862
|
+
status: "healthy",
|
|
863
|
+
checkStatuses: [{ status: "healthy" }],
|
|
864
|
+
},
|
|
865
|
+
// NOTE: deliberately NO "sys-1::__envless__" entry — every run has an env.
|
|
866
|
+
});
|
|
867
|
+
const read = createHealthEntityRead({ service });
|
|
868
|
+
const out = await read(["sys-1"]);
|
|
869
|
+
// Worst status across environments — NOT the (empty) env-less slice's
|
|
870
|
+
// default healthy.
|
|
871
|
+
expect(out["sys-1"]?.status).toBe("unhealthy");
|
|
872
|
+
});
|
|
873
|
+
|
|
874
|
+
it("rollup preserves the pre-3b contract: a bare-systemId read equals today's status when no envs exist", async () => {
|
|
875
|
+
// A system with no environments has only the bare-systemId (rollup =
|
|
876
|
+
// env-less) entry — exactly the pre-3b shape.
|
|
877
|
+
const service = fakeEnvService({
|
|
878
|
+
"sys-1": {
|
|
879
|
+
status: "degraded",
|
|
880
|
+
checkStatuses: [{ status: "healthy" }, { status: "degraded" }],
|
|
881
|
+
},
|
|
882
|
+
});
|
|
883
|
+
const read = createHealthEntityRead({ service });
|
|
884
|
+
const out = await read(["sys-1"]);
|
|
885
|
+
expect(out["sys-1"]).toEqual({
|
|
886
|
+
status: "degraded",
|
|
887
|
+
healthyChecks: 1,
|
|
888
|
+
totalChecks: 2,
|
|
889
|
+
});
|
|
890
|
+
});
|
|
891
|
+
|
|
892
|
+
it("omits a per-env id whose system has no enabled checks (existence gate holds per id)", async () => {
|
|
893
|
+
const service = fakeEnvService({
|
|
894
|
+
"sys-1::prod": { status: "healthy", checkStatuses: [] },
|
|
895
|
+
});
|
|
896
|
+
const read = createHealthEntityRead({ service });
|
|
897
|
+
const out = await read(["sys-1::prod"]);
|
|
898
|
+
expect(out["sys-1::prod"]).toBeUndefined();
|
|
899
|
+
});
|
|
900
|
+
});
|
|
901
|
+
|
|
902
|
+
describe("computeHealthEntityState — environment-aware", () => {
|
|
903
|
+
it("computes the env-scoped view for a concrete environment", async () => {
|
|
904
|
+
const service = fakeEnvService({
|
|
905
|
+
"sys-1::prod": {
|
|
906
|
+
status: "unhealthy",
|
|
907
|
+
checkStatuses: [{ status: "unhealthy" }, { status: "healthy" }],
|
|
908
|
+
},
|
|
909
|
+
});
|
|
910
|
+
const state = await computeHealthEntityState({
|
|
911
|
+
service,
|
|
912
|
+
systemId: "sys-1",
|
|
913
|
+
environmentId: "prod",
|
|
914
|
+
});
|
|
915
|
+
expect(state).toEqual({
|
|
916
|
+
status: "unhealthy",
|
|
917
|
+
healthyChecks: 1,
|
|
918
|
+
totalChecks: 2,
|
|
919
|
+
});
|
|
920
|
+
});
|
|
921
|
+
|
|
922
|
+
it("computes the rollup view when environmentId is omitted", async () => {
|
|
923
|
+
const service = fakeEnvService({
|
|
924
|
+
"sys-1": {
|
|
925
|
+
status: "degraded",
|
|
926
|
+
checkStatuses: [{ status: "degraded" }],
|
|
927
|
+
},
|
|
928
|
+
});
|
|
929
|
+
const state = await computeHealthEntityState({ service, systemId: "sys-1" });
|
|
930
|
+
expect(state?.status).toBe("degraded");
|
|
931
|
+
});
|
|
932
|
+
});
|
|
933
|
+
|
|
934
|
+
describe("healthChangeToPayload — env-qualified id", () => {
|
|
935
|
+
it("sets payload.environmentId for a PER-ENV change and validates against the schema", () => {
|
|
936
|
+
const payload = healthChangeToPayload(
|
|
937
|
+
change({ id: encodeHealthEntityId({ systemId: "sys-1", environmentId: "prod" }) }),
|
|
938
|
+
);
|
|
939
|
+
const parsed = systemDegradedTrigger.payloadSchema.parse(payload);
|
|
940
|
+
// systemId is the bare systemId portion; environmentId is the env.
|
|
941
|
+
expect(parsed.systemId).toBe("sys-1");
|
|
942
|
+
expect(parsed.environmentId).toBe("prod");
|
|
943
|
+
});
|
|
944
|
+
|
|
945
|
+
it("OMITS environmentId for the system ROLLUP change (back-compat: bare systemId)", () => {
|
|
946
|
+
const payload = healthChangeToPayload(change({ id: "sys-1" }));
|
|
947
|
+
const parsed = systemHealthChangedTrigger.payloadSchema.parse(payload);
|
|
948
|
+
expect(parsed.systemId).toBe("sys-1");
|
|
949
|
+
// Absent for the rollup — existing system-level automations are unaffected.
|
|
950
|
+
expect(parsed.environmentId).toBeUndefined();
|
|
951
|
+
});
|
|
952
|
+
});
|
|
953
|
+
|
|
954
|
+
describe("classifyHealthChange — env-qualified id", () => {
|
|
955
|
+
it("reports the systemId portion + environmentId for a per-env change", () => {
|
|
956
|
+
const c = classifyHealthChange(
|
|
957
|
+
change({ id: encodeHealthEntityId({ systemId: "sys-1", environmentId: "prod" }) }),
|
|
958
|
+
);
|
|
959
|
+
expect(c.systemId).toBe("sys-1");
|
|
960
|
+
expect(c.environmentId).toBe("prod");
|
|
961
|
+
expect(c.degraded).toBe(true);
|
|
962
|
+
});
|
|
963
|
+
|
|
964
|
+
it("reports environmentId null for the rollup change", () => {
|
|
965
|
+
const c = classifyHealthChange(change({ id: "sys-1" }));
|
|
966
|
+
expect(c.systemId).toBe("sys-1");
|
|
967
|
+
expect(c.environmentId).toBeNull();
|
|
968
|
+
});
|
|
969
|
+
});
|
|
970
|
+
|
|
971
|
+
describe("per-env + rollup serialization under concurrent writes", () => {
|
|
972
|
+
/** Same keyed-serializer stand-in as the Defect-2 test, reused here. */
|
|
973
|
+
function makeKeyedSerializer() {
|
|
974
|
+
const chains = new Map<string, Promise<unknown>>();
|
|
975
|
+
return (key: string) =>
|
|
976
|
+
<T>(fn: () => Promise<T>): Promise<T> => {
|
|
977
|
+
const prior = chains.get(key) ?? Promise.resolve();
|
|
978
|
+
const next = prior.then(fn, fn);
|
|
979
|
+
chains.set(
|
|
980
|
+
key,
|
|
981
|
+
next.then(
|
|
982
|
+
() => undefined,
|
|
983
|
+
() => undefined,
|
|
984
|
+
),
|
|
985
|
+
);
|
|
986
|
+
return next;
|
|
987
|
+
};
|
|
988
|
+
}
|
|
989
|
+
|
|
990
|
+
it("two concurrent evals of the SAME (system, env) emit exactly one transition", async () => {
|
|
991
|
+
let unhealthy = false;
|
|
992
|
+
const compute = (): HealthEntityState => ({
|
|
993
|
+
status: unhealthy ? "unhealthy" : "healthy",
|
|
994
|
+
healthyChecks: unhealthy ? 0 : 1,
|
|
995
|
+
totalChecks: 1,
|
|
996
|
+
});
|
|
997
|
+
const emitted: Array<{
|
|
998
|
+
prev: HealthEntityState | undefined;
|
|
999
|
+
next: HealthEntityState;
|
|
1000
|
+
}> = [];
|
|
1001
|
+
const handle = {
|
|
1002
|
+
kind: HEALTH_ENTITY_KIND,
|
|
1003
|
+
async mutate(input: MutateInput<HealthEntityState>) {
|
|
1004
|
+
const prev = compute();
|
|
1005
|
+
await Promise.resolve();
|
|
1006
|
+
const next = await input.apply();
|
|
1007
|
+
if (prev.status !== next.status) emitted.push({ prev, next });
|
|
1008
|
+
return next;
|
|
1009
|
+
},
|
|
1010
|
+
} as unknown as EntityHandle<HealthEntityState>;
|
|
1011
|
+
|
|
1012
|
+
const keyed = makeKeyedSerializer();
|
|
1013
|
+
const envId = encodeHealthEntityId({ systemId: "sys-1", environmentId: "prod" });
|
|
1014
|
+
const serialize = keyed(`health:${envId}`);
|
|
1015
|
+
const evalOnce = () =>
|
|
1016
|
+
writeHealthEntity({
|
|
1017
|
+
handle,
|
|
1018
|
+
entityId: envId,
|
|
1019
|
+
serialize,
|
|
1020
|
+
apply: async () => {
|
|
1021
|
+
unhealthy = true;
|
|
1022
|
+
return compute();
|
|
1023
|
+
},
|
|
1024
|
+
});
|
|
1025
|
+
|
|
1026
|
+
await Promise.all([evalOnce(), evalOnce()]);
|
|
1027
|
+
expect(emitted).toHaveLength(1);
|
|
1028
|
+
});
|
|
1029
|
+
|
|
1030
|
+
it("a per-env write and the rollup write run in PARALLEL (distinct keys, no mutual block)", async () => {
|
|
1031
|
+
const keyed = makeKeyedSerializer();
|
|
1032
|
+
const order: string[] = [];
|
|
1033
|
+
const envId = encodeHealthEntityId({ systemId: "sys-1", environmentId: "prod" });
|
|
1034
|
+
const rollupId = encodeHealthEntityId({ systemId: "sys-1" });
|
|
1035
|
+
|
|
1036
|
+
const handle = {
|
|
1037
|
+
kind: HEALTH_ENTITY_KIND,
|
|
1038
|
+
async mutate(input: MutateInput<HealthEntityState>) {
|
|
1039
|
+
return input.apply();
|
|
1040
|
+
},
|
|
1041
|
+
} as unknown as EntityHandle<HealthEntityState>;
|
|
1042
|
+
|
|
1043
|
+
// The env write holds its critical section across a microtask; if the
|
|
1044
|
+
// rollup were on the SAME key it would be forced to wait. Distinct keys
|
|
1045
|
+
// let them interleave.
|
|
1046
|
+
const envWrite = writeHealthEntity({
|
|
1047
|
+
handle,
|
|
1048
|
+
entityId: envId,
|
|
1049
|
+
serialize: keyed(`health:${envId}`),
|
|
1050
|
+
apply: async () => {
|
|
1051
|
+
order.push("env-start");
|
|
1052
|
+
await Promise.resolve();
|
|
1053
|
+
order.push("env-end");
|
|
1054
|
+
return { status: "healthy", healthyChecks: 1, totalChecks: 1 };
|
|
1055
|
+
},
|
|
1056
|
+
});
|
|
1057
|
+
const rollupWrite = writeHealthEntity({
|
|
1058
|
+
handle,
|
|
1059
|
+
entityId: rollupId,
|
|
1060
|
+
serialize: keyed(`health:${rollupId}`),
|
|
1061
|
+
apply: async () => {
|
|
1062
|
+
order.push("rollup-start");
|
|
1063
|
+
return { status: "healthy", healthyChecks: 1, totalChecks: 1 };
|
|
1064
|
+
},
|
|
1065
|
+
});
|
|
1066
|
+
|
|
1067
|
+
await Promise.all([envWrite, rollupWrite]);
|
|
1068
|
+
|
|
1069
|
+
// Interleaved: rollup-start ran before env-end (they did not serialize).
|
|
1070
|
+
expect(order.indexOf("rollup-start")).toBeLessThan(order.indexOf("env-end"));
|
|
1071
|
+
});
|
|
694
1072
|
});
|
package/src/health-entity.ts
CHANGED
|
@@ -31,8 +31,9 @@ import type {
|
|
|
31
31
|
EntityRead,
|
|
32
32
|
} from "@checkstack/automation-backend";
|
|
33
33
|
import type { HealthCheckService } from "./service";
|
|
34
|
+
import { parseHealthEntityId } from "./health-entity-id";
|
|
34
35
|
|
|
35
|
-
/** Entity kind id for the
|
|
36
|
+
/** Entity kind id for the aggregated health (system rollup + per-environment). */
|
|
36
37
|
export const HEALTH_ENTITY_KIND = "health";
|
|
37
38
|
|
|
38
39
|
/**
|
|
@@ -121,15 +122,23 @@ function readNumber(
|
|
|
121
122
|
* Restores the keys operators read (`trigger.payload.systemId`,
|
|
122
123
|
* `.previousStatus`, …) that the generic change shape omits.
|
|
123
124
|
*
|
|
124
|
-
*
|
|
125
|
-
*
|
|
126
|
-
*
|
|
127
|
-
*
|
|
128
|
-
*
|
|
125
|
+
* The entity id is now env-qualified (Phase 3b): `payload.systemId` is ALWAYS
|
|
126
|
+
* the systemId portion (so existing automations reading `trigger.payload.systemId`
|
|
127
|
+
* are unaffected — the rollup carries the bare systemId), and the NEW optional
|
|
128
|
+
* `payload.environmentId` is the env portion — present only for a per-environment
|
|
129
|
+
* change, absent (undefined) for the system rollup. `previousStatus` is
|
|
130
|
+
* `prev.status` and `newStatus` is `next.status`; `healthyChecks` / `totalChecks`
|
|
131
|
+
* come from `next`; `timestamp` is the change's `occurredAt`. `systemName` is not
|
|
132
|
+
* derivable from a health change (it lives in the catalog) and is OPTIONAL on the
|
|
133
|
+
* schemas, so it is omitted.
|
|
129
134
|
*/
|
|
130
135
|
export const healthChangeToPayload: EntityChangePayloadMapper = (changed) => {
|
|
136
|
+
const { systemId, environmentId } = parseHealthEntityId(changed.id);
|
|
131
137
|
return {
|
|
132
|
-
systemId
|
|
138
|
+
systemId,
|
|
139
|
+
// Present only for a per-env change; omitted for the rollup so the field
|
|
140
|
+
// is `undefined` (the optional schema accepts both).
|
|
141
|
+
...(environmentId === null ? {} : { environmentId }),
|
|
133
142
|
previousStatus: readStatus(changed.prev) ?? undefined,
|
|
134
143
|
newStatus: readStatus(changed.next) ?? undefined,
|
|
135
144
|
healthyChecks: readNumber(changed.next, "healthyChecks") ?? 0,
|
|
@@ -152,6 +161,12 @@ export const healthChangeToPayload: EntityChangePayloadMapper = (changed) => {
|
|
|
152
161
|
*/
|
|
153
162
|
export interface HealthChangeClassification {
|
|
154
163
|
systemId: string;
|
|
164
|
+
/**
|
|
165
|
+
* The environment portion of the entity id (Phase 3b). `null` for the
|
|
166
|
+
* system rollup change; the env id for a per-environment change. Cross-plugin
|
|
167
|
+
* consumers that only care about the system (SLO / dependency) can ignore it.
|
|
168
|
+
*/
|
|
169
|
+
environmentId: string | null;
|
|
155
170
|
previousStatus: string | null;
|
|
156
171
|
newStatus: string | null;
|
|
157
172
|
degraded: boolean;
|
|
@@ -163,6 +178,7 @@ export function classifyHealthChange(changed: {
|
|
|
163
178
|
prev: Record<string, unknown> | null;
|
|
164
179
|
next: Record<string, unknown> | null;
|
|
165
180
|
}): HealthChangeClassification {
|
|
181
|
+
const { systemId, environmentId } = parseHealthEntityId(changed.id);
|
|
166
182
|
const previousStatus = readStatus(changed.prev);
|
|
167
183
|
const newStatus = readStatus(changed.next);
|
|
168
184
|
const bothPresent = previousStatus !== null && newStatus !== null;
|
|
@@ -171,7 +187,8 @@ export function classifyHealthChange(changed: {
|
|
|
171
187
|
const recovered =
|
|
172
188
|
bothPresent && newStatus === "healthy" && previousStatus !== "healthy";
|
|
173
189
|
return {
|
|
174
|
-
systemId
|
|
190
|
+
systemId,
|
|
191
|
+
environmentId,
|
|
175
192
|
previousStatus,
|
|
176
193
|
newStatus,
|
|
177
194
|
degraded,
|
|
@@ -209,9 +226,17 @@ export function classifyHealthChange(changed: {
|
|
|
209
226
|
export async function computeHealthEntityState(args: {
|
|
210
227
|
service: HealthCheckService;
|
|
211
228
|
systemId: string;
|
|
229
|
+
/**
|
|
230
|
+
* Environment to compute the view for (Phase 3b). `undefined` = the SYSTEM
|
|
231
|
+
* ROLLUP (worst status across all environments + env-less runs — the
|
|
232
|
+
* all-runs aggregate, §7.4.2). `null` = the env-less slice. A string = that
|
|
233
|
+
* environment's per-env view. The existence gate (`checkStatuses.length`) is
|
|
234
|
+
* env-independent, so a per-env view and the rollup agree on totalChecks.
|
|
235
|
+
*/
|
|
236
|
+
environmentId?: string | null;
|
|
212
237
|
}): Promise<HealthEntityState | undefined> {
|
|
213
|
-
const { service, systemId } = args;
|
|
214
|
-
const overview = await service.getSystemHealthStatus(systemId);
|
|
238
|
+
const { service, systemId, environmentId } = args;
|
|
239
|
+
const overview = await service.getSystemHealthStatus(systemId, environmentId);
|
|
215
240
|
// No enabled check associations ⇒ no health entity for this system.
|
|
216
241
|
if (overview.checkStatuses.length === 0) return undefined;
|
|
217
242
|
return {
|
|
@@ -224,10 +249,16 @@ export async function computeHealthEntityState(args: {
|
|
|
224
249
|
|
|
225
250
|
/**
|
|
226
251
|
* Build the PLUGIN-BACKED + COMPUTED `read` accessor for the `health` entity.
|
|
227
|
-
*
|
|
228
|
-
*
|
|
229
|
-
*
|
|
230
|
-
*
|
|
252
|
+
*
|
|
253
|
+
* Env-aware id parsing (Phase 3b, §7.4.2): each incoming id is parsed via
|
|
254
|
+
* {@link parseHealthEntityId}. A BARE `"<systemId>"` resolves the SYSTEM
|
|
255
|
+
* ROLLUP; a `"<systemId>::<environmentId>"` resolves that environment's
|
|
256
|
+
* per-env view. The result is keyed by the ORIGINAL id, so the reactive
|
|
257
|
+
* engine, `getMany`, and scope enrichment all see the right view for the id
|
|
258
|
+
* they asked for. Systems with no enabled check associations are omitted
|
|
259
|
+
* (existence gate). No framework `entity_state` storage — compute-on-read from
|
|
260
|
+
* the durable, env-keyed `health_check_runs`, so a read returns the same answer
|
|
261
|
+
* on every pod (state-and-scale).
|
|
231
262
|
*/
|
|
232
263
|
export function createHealthEntityRead(deps: {
|
|
233
264
|
service: HealthCheckService;
|
|
@@ -237,9 +268,20 @@ export function createHealthEntityRead(deps: {
|
|
|
237
268
|
if (ids.length === 0) return {};
|
|
238
269
|
const out: Record<string, HealthEntityState> = {};
|
|
239
270
|
await Promise.all(
|
|
240
|
-
ids.map(async (
|
|
241
|
-
const
|
|
242
|
-
|
|
271
|
+
ids.map(async (id) => {
|
|
272
|
+
const { systemId, environmentId } = parseHealthEntityId(id);
|
|
273
|
+
const state = await computeHealthEntityState({
|
|
274
|
+
service,
|
|
275
|
+
systemId,
|
|
276
|
+
// A bare `<systemId>` id is the ROLLUP: `parseHealthEntityId`
|
|
277
|
+
// returns `environmentId: null` for it (so the payload mapper can
|
|
278
|
+
// tell "rollup → omit environmentId"), but the rollup must read ALL
|
|
279
|
+
// runs — `undefined` — NOT the env-less slice (`null`, which filters
|
|
280
|
+
// to `env_id IS NULL`). Reserve `null` for an explicit env-less
|
|
281
|
+
// read; map the rollup's null to undefined here.
|
|
282
|
+
environmentId: environmentId === null ? undefined : environmentId,
|
|
283
|
+
});
|
|
284
|
+
if (state) out[id] = state;
|
|
243
285
|
}),
|
|
244
286
|
);
|
|
245
287
|
return out;
|
|
@@ -293,19 +335,28 @@ export function createHealthEntityRead(deps: {
|
|
|
293
335
|
*/
|
|
294
336
|
export async function writeHealthEntity(args: {
|
|
295
337
|
handle: EntityHandle<HealthEntityState> | undefined;
|
|
296
|
-
|
|
338
|
+
/**
|
|
339
|
+
* The `health` entity id to mutate (Phase 3b): the env-qualified
|
|
340
|
+
* `"<systemId>::<environmentId>"` for a per-env write, or the bare
|
|
341
|
+
* `"<systemId>"` for the env-less / system-rollup write. This is the id the
|
|
342
|
+
* framework diffs/emits, so it drives both the per-env and rollup
|
|
343
|
+
* `ENTITY_CHANGED`.
|
|
344
|
+
*/
|
|
345
|
+
entityId: string;
|
|
297
346
|
apply: () => Promise<HealthEntityState>;
|
|
298
347
|
onError?: (error: unknown) => void;
|
|
299
348
|
/**
|
|
300
|
-
* Optional per-`
|
|
349
|
+
* Optional per-`entityId` critical section wrapping the snapshot-prev +
|
|
301
350
|
* apply + diff + emit. The executor supplies a transaction-scoped advisory
|
|
302
|
-
* lock (`withXactLock`, key `health:<
|
|
303
|
-
* of one system can't double-emit a
|
|
304
|
-
*
|
|
351
|
+
* lock (`withXactLock`, key `health:<entityId>`) so concurrent evaluations
|
|
352
|
+
* of one (system, environment) — or of the rollup — can't double-emit a
|
|
353
|
+
* single logical transition, and per-env + rollup writes serialize against
|
|
354
|
+
* their OWN keys (distinct envs / the rollup don't block each other).
|
|
355
|
+
* Identity by default (no serialization) for the unbound-handle / test paths.
|
|
305
356
|
*/
|
|
306
357
|
serialize?: <T>(fn: () => Promise<T>) => Promise<T>;
|
|
307
358
|
}): Promise<HealthEntityState> {
|
|
308
|
-
const { handle,
|
|
359
|
+
const { handle, entityId, apply, onError, serialize } = args;
|
|
309
360
|
if (!handle) {
|
|
310
361
|
// No reactivity bound — run the durable write directly.
|
|
311
362
|
return apply();
|
|
@@ -318,7 +369,7 @@ export async function writeHealthEntity(args: {
|
|
|
318
369
|
// call, and we wrap that whole call so two concurrent evals serialize.
|
|
319
370
|
return await run(() =>
|
|
320
371
|
handle.mutate({
|
|
321
|
-
id:
|
|
372
|
+
id: entityId,
|
|
322
373
|
apply: async () => {
|
|
323
374
|
durableState = await apply();
|
|
324
375
|
return durableState;
|
|
@@ -335,19 +386,26 @@ export async function writeHealthEntity(args: {
|
|
|
335
386
|
}
|
|
336
387
|
}
|
|
337
388
|
|
|
338
|
-
/**
|
|
339
|
-
|
|
340
|
-
|
|
389
|
+
/**
|
|
390
|
+
* Advisory-lock key namespace for the per-entity health critical section. The
|
|
391
|
+
* argument is the FULL `health` entity id (Phase 3b): the bare `"<systemId>"`
|
|
392
|
+
* for the rollup or `"<systemId>::<environmentId>"` for a per-env write. Two
|
|
393
|
+
* different envs (or an env vs the rollup) get DIFFERENT keys, so they
|
|
394
|
+
* serialize independently and never block each other.
|
|
395
|
+
*/
|
|
396
|
+
export function healthEntityLockKey(entityId: string): string {
|
|
397
|
+
return `health:${entityId}`;
|
|
341
398
|
}
|
|
342
399
|
|
|
343
400
|
/**
|
|
344
|
-
* Build the per-`
|
|
401
|
+
* Build the per-`entityId` serializer for {@link writeHealthEntity} backed by
|
|
345
402
|
* a transaction-scoped advisory lock (`withXactLock`, key
|
|
346
|
-
* `health:<
|
|
347
|
-
*
|
|
403
|
+
* `health:<entityId>`). The returned function blocks until it holds the
|
|
404
|
+
* entity's lock, runs `fn` (the whole snapshot-prev + apply + diff + emit), and
|
|
348
405
|
* auto-releases the lock at COMMIT/ROLLBACK. Two concurrent evaluations of one
|
|
349
|
-
* system
|
|
350
|
-
*
|
|
406
|
+
* (system, environment) — or of the rollup — therefore serialize, while
|
|
407
|
+
* distinct envs proceed in parallel. Exactly one logical transition per entity
|
|
408
|
+
* emits exactly one `ENTITY_CHANGED` + one transition row.
|
|
351
409
|
*
|
|
352
410
|
* `fn` does its own durable writes on the outer pool; the lock only gates
|
|
353
411
|
* ENTRY to the critical section, so its connection affinity is irrelevant —
|
|
@@ -356,12 +414,12 @@ export function healthSystemLockKey(systemId: string): string {
|
|
|
356
414
|
*/
|
|
357
415
|
export function createHealthEntitySerializer(deps: {
|
|
358
416
|
advisoryLock: AdvisoryLockService;
|
|
359
|
-
}): (
|
|
417
|
+
}): (entityId: string) => <T>(fn: () => Promise<T>) => Promise<T> {
|
|
360
418
|
const { advisoryLock } = deps;
|
|
361
|
-
return (
|
|
419
|
+
return (entityId) =>
|
|
362
420
|
<T>(fn: () => Promise<T>) =>
|
|
363
421
|
advisoryLock.withXactLock({
|
|
364
|
-
key:
|
|
422
|
+
key: healthEntityLockKey(entityId),
|
|
365
423
|
fn: () => fn(),
|
|
366
424
|
});
|
|
367
425
|
}
|