@checkstack/healthcheck-backend 1.5.0 → 1.6.1

This diff shows the changes between two publicly available versions of this package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (52)
  1. package/CHANGELOG.md +253 -0
  2. package/drizzle/0018_abnormal_preak.sql +10 -0
  3. package/drizzle/meta/0018_snapshot.json +600 -0
  4. package/drizzle/meta/_journal.json +7 -0
  5. package/package.json +32 -27
  6. package/src/ai/assertion-validation.test.ts +117 -0
  7. package/src/ai/assertion-validation.ts +147 -0
  8. package/src/ai/healthcheck-capabilities.test.ts +158 -0
  9. package/src/ai/healthcheck-capabilities.ts +217 -0
  10. package/src/ai/healthcheck-delete.test.ts +81 -0
  11. package/src/ai/healthcheck-delete.ts +81 -0
  12. package/src/ai/healthcheck-projection.test.ts +36 -0
  13. package/src/ai/healthcheck-propose.test.ts +268 -0
  14. package/src/ai/healthcheck-propose.ts +290 -0
  15. package/src/ai/healthcheck-script-tools.test.ts +93 -0
  16. package/src/ai/healthcheck-script-tools.ts +179 -0
  17. package/src/ai/healthcheck-update.test.ts +123 -0
  18. package/src/ai/healthcheck-update.ts +123 -0
  19. package/src/ai/notify-subscribers.test.ts +109 -0
  20. package/src/ai/notify-subscribers.ts +176 -0
  21. package/src/ai/register-ai-tools.test.ts +41 -0
  22. package/src/ai/register-ai-tools.ts +53 -0
  23. package/src/ai/shell-env-table.test.ts +47 -0
  24. package/src/automations.test.ts +2 -1
  25. package/src/automations.ts +9 -1
  26. package/src/collector-script-test.test.ts +53 -1
  27. package/src/collector-script-test.ts +59 -7
  28. package/src/effective-environments.test.ts +93 -0
  29. package/src/effective-environments.ts +64 -0
  30. package/src/health-entity-id.ts +57 -0
  31. package/src/health-entity.test.ts +384 -6
  32. package/src/health-entity.ts +93 -35
  33. package/src/health-state.ts +41 -4
  34. package/src/healthcheck-gitops-kinds.test.ts +95 -0
  35. package/src/healthcheck-gitops-kinds.ts +56 -13
  36. package/src/index.ts +30 -0
  37. package/src/migration-chain-contract.test.ts +57 -0
  38. package/src/queue-executor.test.ts +801 -0
  39. package/src/queue-executor.ts +336 -52
  40. package/src/realtime-aggregation.test.ts +30 -0
  41. package/src/realtime-aggregation.ts +16 -0
  42. package/src/retention-job.ts +167 -93
  43. package/src/retention-rollup.test.ts +118 -0
  44. package/src/router.test.ts +120 -1
  45. package/src/router.ts +20 -0
  46. package/src/schema.ts +44 -6
  47. package/src/service.ts +199 -43
  48. package/src/state-transitions.test.ts +104 -0
  49. package/src/state-transitions.ts +39 -1
  50. package/src/validate-configuration.test.ts +205 -0
  51. package/src/validate-configuration.ts +159 -0
  52. package/tsconfig.json +9 -0
@@ -34,6 +34,7 @@ import {
34
34
  Versioned,
35
35
  VersionedAggregated,
36
36
  aggregatedCounter,
37
+ configString,
37
38
  z,
38
39
  } from "@checkstack/backend-api";
39
40
  import { mock } from "bun:test";
@@ -499,6 +500,7 @@ describe("Queue-Based Health Check Executor", () => {
499
500
  collector: {
500
501
  id: "test-collector",
501
502
  execute: collectorExecute,
503
+ config: new Versioned({ version: 1, schema: z.object({}) }),
502
504
  mergeResult: mock(() => ({})),
503
505
  },
504
506
  })),
@@ -564,5 +566,804 @@ describe("Queue-Based Health Check Executor", () => {
564
566
  system: { id: "system-1", name: "web-01" },
565
567
  });
566
568
  });
569
+
570
+ it("migrates a stored v1 strategy + collector config on the execution path", async () => {
571
+ const mockDb = createMockDb();
572
+ const mockLogger = createMockLogger();
573
+ const mockQueueManager = createMockQueueManager();
574
+ const mockCatalogClient = createMockCatalogClient();
575
+ const mockMaintenanceClient = createMockMaintenanceClient();
576
+ const mockIncidentClient = createMockIncidentClient();
577
+ const mockSignalService = createMockSignalService();
578
+
579
+ // A strategy whose config migrates v1 -> v2 by STRIPPING a moved field
580
+ // (`endpoint`), mirroring the real health-check reshapers. The stored
581
+ // config is genuinely v1 (carries `endpoint`); the executor must run the
582
+ // migration before handing the config to createClient.
583
+ let capturedStrategyConfig: unknown;
584
+ const strategyMigratingRegistry: HealthCheckRegistry = {
585
+ getStrategy: mock(() => ({
586
+ id: "migrating-strategy",
587
+ displayName: "Migrating",
588
+ config: new Versioned({
589
+ version: 2,
590
+ schema: z.object({ timeout: z.number() }),
591
+ migrations: [
592
+ {
593
+ fromVersion: 1,
594
+ toVersion: 2,
595
+ description: "strip endpoint",
596
+ migrate: (data: unknown): unknown => {
597
+ if (
598
+ typeof data === "object" &&
599
+ data !== null &&
600
+ "endpoint" in data
601
+ ) {
602
+ const timeout = (data as { timeout?: unknown }).timeout;
603
+ return { timeout: typeof timeout === "number" ? timeout : 0 };
604
+ }
605
+ return data;
606
+ },
607
+ },
608
+ ],
609
+ }),
610
+ result: new Versioned({ version: 1, schema: z.object({}) }),
611
+ aggregatedResult: new VersionedAggregated({
612
+ version: 1,
613
+ fields: { count: aggregatedCounter({}) },
614
+ }),
615
+ createClient: mock(async (config: unknown) => {
616
+ capturedStrategyConfig = config;
617
+ return {
618
+ client: { exec: mock(async () => ({})) },
619
+ close: mock(() => {}),
620
+ };
621
+ }),
622
+ mergeResult: mock(() => ({})),
623
+ })),
624
+ register: mock(() => {}),
625
+ getStrategies: mock(() => []),
626
+ getStrategiesWithMeta: mock(() => []),
627
+ };
628
+
629
+ // A collector whose config migrates v1 -> v2 by renaming `cmd` -> `value`.
630
+ let capturedCollectorConfig: unknown;
631
+ const collectorExecute = mock(async (params: { config?: unknown }) => {
632
+ capturedCollectorConfig = params.config;
633
+ return { result: {} };
634
+ });
635
+ const migratingCollectorRegistry = {
636
+ register: mock(() => {}),
637
+ getCollector: mock(() => ({
638
+ collector: {
639
+ id: "migrating-collector",
640
+ execute: collectorExecute,
641
+ config: new Versioned({
642
+ version: 2,
643
+ schema: z.object({ value: z.string() }),
644
+ migrations: [
645
+ {
646
+ fromVersion: 1,
647
+ toVersion: 2,
648
+ description: "rename cmd -> value",
649
+ migrate: (data: unknown): unknown => {
650
+ if (
651
+ typeof data === "object" &&
652
+ data !== null &&
653
+ "cmd" in data &&
654
+ !("value" in data)
655
+ ) {
656
+ const cmd = (data as { cmd?: unknown }).cmd;
657
+ return { value: typeof cmd === "string" ? cmd : "" };
658
+ }
659
+ return data;
660
+ },
661
+ },
662
+ ],
663
+ }),
664
+ result: new Versioned({ version: 1, schema: z.object({}) }),
665
+ mergeResult: mock(() => ({})),
666
+ },
667
+ })),
668
+ getCollectors: mock(() => []),
669
+ };
670
+
671
+ let selectCallCount = 0;
672
+ (mockDb.select as any) = mock(() => {
673
+ selectCallCount++;
674
+ if (selectCallCount === 2) {
675
+ return {
676
+ from: mock(() => ({
677
+ innerJoin: mock(() => ({
678
+ where: mock(() =>
679
+ Promise.resolve([
680
+ {
681
+ configId: "config-1",
682
+ configName: "v1 config",
683
+ strategyId: "migrating-strategy",
684
+ // Stored RAW + genuinely v1 (carries the moved field).
685
+ config: { endpoint: "tcp://old", timeout: 1234 },
686
+ collectors: [
687
+ {
688
+ id: "col-1",
689
+ collectorId: "migrating-collector",
690
+ config: { cmd: "legacy-value" },
691
+ },
692
+ ],
693
+ interval: 30,
694
+ enabled: true,
695
+ paused: false,
696
+ includeLocal: true,
697
+ satelliteIds: [],
698
+ },
699
+ ]),
700
+ ),
701
+ })),
702
+ })),
703
+ };
704
+ }
705
+ return {
706
+ from: mock(() => ({
707
+ innerJoin: mock(() => ({
708
+ where: mock(() => Promise.resolve([])),
709
+ })),
710
+ })),
711
+ };
712
+ });
713
+
714
+ const queue =
715
+ mockQueueManager.getQueue<HealthCheckJobPayload>("health-checks");
716
+ let capturedHandler:
717
+ | ((job: { data: HealthCheckJobPayload }) => Promise<void>)
718
+ | undefined;
719
+ (queue.consume as any) = mock(
720
+ async (
721
+ handler: (job: { data: HealthCheckJobPayload }) => Promise<void>,
722
+ ) => {
723
+ capturedHandler = handler;
724
+ },
725
+ );
726
+
727
+ await setupHealthCheckWorker({
728
+ db: mockDb as unknown as Parameters<
729
+ typeof setupHealthCheckWorker
730
+ >[0]["db"],
731
+ advisoryLock: mockAdvisoryLock,
732
+ registry: strategyMigratingRegistry,
733
+ collectorRegistry: migratingCollectorRegistry as unknown as Parameters<
734
+ typeof setupHealthCheckWorker
735
+ >[0]["collectorRegistry"],
736
+ logger: mockLogger,
737
+ queueManager: mockQueueManager,
738
+ signalService: mockSignalService,
739
+ catalogClient: mockCatalogClient as unknown as Parameters<
740
+ typeof setupHealthCheckWorker
741
+ >[0]["catalogClient"],
742
+ notificationClient: {
743
+ notifyForSubscription: () => Promise.resolve({ notifiedCount: 0 }),
744
+ } as unknown as Parameters<
745
+ typeof setupHealthCheckWorker
746
+ >[0]["notificationClient"],
747
+ maintenanceClient: mockMaintenanceClient as unknown as Parameters<
748
+ typeof setupHealthCheckWorker
749
+ >[0]["maintenanceClient"],
750
+ incidentClient: mockIncidentClient as unknown as Parameters<
751
+ typeof setupHealthCheckWorker
752
+ >[0]["incidentClient"],
753
+ getEmitHook: () => undefined,
754
+ cache: passthroughCache,
755
+ });
756
+
757
+ if (capturedHandler) {
758
+ await capturedHandler({
759
+ data: { configId: "config-1", systemId: "system-1" },
760
+ }).catch(() => {});
761
+ }
762
+
763
+ // Strategy config reached createClient MIGRATED (endpoint stripped,
764
+ // timeout preserved) and VALIDATED against the v2 schema.
765
+ expect(capturedStrategyConfig).toEqual({ timeout: 1234 });
766
+ // Collector config reached execute MIGRATED (cmd renamed to value).
767
+ expect(collectorExecute).toHaveBeenCalled();
768
+ expect(capturedCollectorConfig).toEqual({ value: "legacy-value" });
769
+ });
770
+ });
771
+
772
+ describe("executeHealthCheckJob - per-environment fan-out", () => {
773
+ /**
774
+ * Drive one job with a configurable assignment `environmentIds` + catalog
775
+ * membership, capturing the run-context handed to the collector on EACH
776
+ * run. The collector executes once per fanned-out run, so the captured
777
+ * list is a faithful witness of "one run per effective environment".
778
+ */
779
+ async function runFanOut({
780
+ environmentIds,
781
+ membership,
782
+ collectorConfig = {},
783
+ collectorConfigSchema = z.object({}),
784
+ }: {
785
+ environmentIds: string[] | null;
786
+ membership: Array<{
787
+ id: string;
788
+ name: string;
789
+ metadata: Record<string, unknown> | null;
790
+ }>;
791
+ /** Stored (pre-render) collector config for the single collector. */
792
+ collectorConfig?: Record<string, unknown>;
793
+ /** Schema used to detect `x-templatable` fields for the render pass. */
794
+ collectorConfigSchema?: z.ZodType<unknown>;
795
+ }): Promise<Array<{ environment?: unknown; config?: unknown }>> {
796
+ const mockDb = createMockDb();
797
+ const mockRegistry = createMockRegistry();
798
+ const mockLogger = createMockLogger();
799
+ const mockQueueManager = createMockQueueManager();
800
+ const mockCatalogClient = createMockCatalogClient();
801
+ const mockMaintenanceClient = createMockMaintenanceClient();
802
+ const mockIncidentClient = createMockIncidentClient();
803
+ const mockSignalService = createMockSignalService();
804
+
805
+ (mockCatalogClient.getSystem as any) = mock(async () => ({
806
+ id: "system-1",
807
+ name: "web-01",
808
+ }));
809
+ (mockCatalogClient as any).resolveSystemEnvironments = mock(async () =>
810
+ membership.map((m) => ({
811
+ ...m,
812
+ description: null,
813
+ systemIds: [],
814
+ createdAt: new Date(),
815
+ updatedAt: new Date(),
816
+ })),
817
+ );
818
+
819
+ let selectCallCount = 0;
820
+ (mockDb.select as any) = mock(() => {
821
+ selectCallCount++;
822
+ if (selectCallCount === 2) {
823
+ return {
824
+ from: mock(() => ({
825
+ innerJoin: mock(() => ({
826
+ where: mock(() =>
827
+ Promise.resolve([
828
+ {
829
+ configId: "config-1",
830
+ configName: "Check",
831
+ strategyId: "test-strategy",
832
+ config: { timeout: 5000 },
833
+ collectors: [
834
+ {
835
+ id: "col-1",
836
+ collectorId: "test-collector",
837
+ config: collectorConfig,
838
+ },
839
+ ],
840
+ interval: 45,
841
+ enabled: true,
842
+ paused: false,
843
+ includeLocal: true,
844
+ satelliteIds: [],
845
+ environmentIds,
846
+ },
847
+ ]),
848
+ ),
849
+ })),
850
+ })),
851
+ };
852
+ }
853
+ return {
854
+ from: mock(() => ({
855
+ innerJoin: mock(() => ({
856
+ where: mock(() => Promise.resolve([])),
857
+ })),
858
+ })),
859
+ };
860
+ });
861
+
862
+ const captured: Array<{ environment?: unknown; config?: unknown }> = [];
863
+ const collectorExecute = mock(
864
+ async (params: {
865
+ runContext?: { environment?: unknown };
866
+ config?: unknown;
867
+ }) => {
868
+ captured.push({
869
+ environment: params.runContext?.environment,
870
+ config: params.config,
871
+ });
872
+ return { result: {} };
873
+ },
874
+ );
875
+ const mockCollectorRegistry = {
876
+ register: mock(() => {}),
877
+ getCollector: mock(() => ({
878
+ collector: {
879
+ id: "test-collector",
880
+ execute: collectorExecute,
881
+ config: new Versioned({
882
+ version: 1,
883
+ schema: collectorConfigSchema,
884
+ }),
885
+ mergeResult: mock(() => ({})),
886
+ },
887
+ })),
888
+ getCollectors: mock(() => []),
889
+ };
890
+
891
+ const queue =
892
+ mockQueueManager.getQueue<HealthCheckJobPayload>("health-checks");
893
+ let capturedHandler:
894
+ | ((job: { data: HealthCheckJobPayload }) => Promise<void>)
895
+ | undefined;
896
+ (queue.consume as any) = mock(
897
+ async (
898
+ handler: (job: { data: HealthCheckJobPayload }) => Promise<void>,
899
+ ) => {
900
+ capturedHandler = handler;
901
+ },
902
+ );
903
+
904
+ await setupHealthCheckWorker({
905
+ db: mockDb as unknown as Parameters<
906
+ typeof setupHealthCheckWorker
907
+ >[0]["db"],
908
+ advisoryLock: mockAdvisoryLock,
909
+ registry: mockRegistry,
910
+ collectorRegistry: mockCollectorRegistry as unknown as Parameters<
911
+ typeof setupHealthCheckWorker
912
+ >[0]["collectorRegistry"],
913
+ logger: mockLogger,
914
+ queueManager: mockQueueManager,
915
+ signalService: mockSignalService,
916
+ catalogClient: mockCatalogClient as unknown as Parameters<
917
+ typeof setupHealthCheckWorker
918
+ >[0]["catalogClient"],
919
+ notificationClient: {
920
+ notifyForSubscription: () => Promise.resolve({ notifiedCount: 0 }),
921
+ } as unknown as Parameters<
922
+ typeof setupHealthCheckWorker
923
+ >[0]["notificationClient"],
924
+ maintenanceClient: mockMaintenanceClient as unknown as Parameters<
925
+ typeof setupHealthCheckWorker
926
+ >[0]["maintenanceClient"],
927
+ incidentClient: mockIncidentClient as unknown as Parameters<
928
+ typeof setupHealthCheckWorker
929
+ >[0]["incidentClient"],
930
+ getEmitHook: () => undefined,
931
+ cache: passthroughCache,
932
+ });
933
+
934
+ if (capturedHandler) {
935
+ // Downstream persistence touches DB surfaces the lightweight mock
936
+ // doesn't fully model; tolerate a later throw — run-contexts are
937
+ // captured synchronously at collector-execute time, one per run.
938
+ await capturedHandler({
939
+ data: { configId: "config-1", systemId: "system-1" },
940
+ }).catch(() => {});
941
+ }
942
+
943
+ return captured;
944
+ }
945
+
946
+ it("runs once per effective environment with that env in run-context (null selector = all)", async () => {
947
+ const captured = await runFanOut({
948
+ environmentIds: null,
949
+ membership: [
950
+ { id: "prod", name: "Production", metadata: { baseUrl: "p" } },
951
+ { id: "staging", name: "Staging", metadata: { baseUrl: "s" } },
952
+ ],
953
+ });
954
+
955
+ expect(captured).toHaveLength(2);
956
+ expect(captured[0]?.environment).toEqual({
957
+ id: "prod",
958
+ name: "Production",
959
+ fields: { baseUrl: "p" },
960
+ });
961
+ expect(captured[1]?.environment).toEqual({
962
+ id: "staging",
963
+ name: "Staging",
964
+ fields: { baseUrl: "s" },
965
+ });
966
+ });
967
+
968
+ it("renders x-templatable config fields per environment against environment.*", async () => {
969
+ const captured = await runFanOut({
970
+ environmentIds: null,
971
+ membership: [
972
+ {
973
+ id: "prod",
974
+ name: "Production",
975
+ metadata: { baseUrl: "https://prod.example.com" },
976
+ },
977
+ {
978
+ id: "staging",
979
+ name: "Staging",
980
+ metadata: { baseUrl: "https://staging.example.com" },
981
+ },
982
+ ],
983
+ collectorConfig: { url: "{{ environment.baseUrl }}/healthz" },
984
+ collectorConfigSchema: z.object({
985
+ url: configString({ "x-templatable": true }),
986
+ }),
987
+ });
988
+
989
+ expect(captured).toHaveLength(2);
990
+ // Each env gets its own rendered config (per-env render pass, §6.3.3).
991
+ expect((captured[0]?.config as { url: string }).url).toBe(
992
+ "https://prod.example.com/healthz",
993
+ );
994
+ expect((captured[1]?.config as { url: string }).url).toBe(
995
+ "https://staging.example.com/healthz",
996
+ );
997
+ });
998
+
999
+ it("renders environment.* to empty string for an env-less run (render-empty, §11.6)", async () => {
1000
+ const captured = await runFanOut({
1001
+ environmentIds: [],
1002
+ membership: [
1003
+ { id: "prod", name: "Production", metadata: { baseUrl: "x" } },
1004
+ ],
1005
+ collectorConfig: { url: "{{ environment.baseUrl }}/healthz" },
1006
+ collectorConfigSchema: z.object({
1007
+ url: configString({ "x-templatable": true }),
1008
+ }),
1009
+ });
1010
+
1011
+ expect(captured).toHaveLength(1);
1012
+ expect(captured[0]?.environment).toBeUndefined();
1013
+ // Missing path renders empty (strict: false) — the HTTP collector's
1014
+ // post-render .url() check turns this into a clear config error.
1015
+ expect((captured[0]?.config as { url: string }).url).toBe("/healthz");
1016
+ });
1017
+
1018
+ it("runs only the explicit subset, intersected with membership", async () => {
1019
+ const captured = await runFanOut({
1020
+ environmentIds: ["staging"],
1021
+ membership: [
1022
+ { id: "prod", name: "Production", metadata: {} },
1023
+ { id: "staging", name: "Staging", metadata: {} },
1024
+ ],
1025
+ });
1026
+
1027
+ expect(captured).toHaveLength(1);
1028
+ expect((captured[0]?.environment as { id: string }).id).toBe("staging");
1029
+ });
1030
+
1031
+ it("runs exactly once with no environment when opting out ([] selector)", async () => {
1032
+ const captured = await runFanOut({
1033
+ environmentIds: [],
1034
+ membership: [{ id: "prod", name: "Production", metadata: {} }],
1035
+ });
1036
+
1037
+ expect(captured).toHaveLength(1);
1038
+ expect(captured[0]?.environment).toBeUndefined();
1039
+ });
1040
+
1041
+ it("runs exactly once env-less when the system has no environments (null selector, empty membership)", async () => {
1042
+ const captured = await runFanOut({
1043
+ environmentIds: null,
1044
+ membership: [],
1045
+ });
1046
+
1047
+ expect(captured).toHaveLength(1);
1048
+ expect(captured[0]?.environment).toBeUndefined();
1049
+ });
1050
+
1051
+ /**
1052
+ * Per-environment ISOLATION regression (§7.2). When the FIRST
1053
+ * environment's run throws (here: its durable persist rejects, which —
1054
+ * with no health-entity handle bound — propagates out of
1055
+ * `writeHealthEntity` to the per-env catch), the loop MUST log and
1056
+ * continue so the SECOND environment still produces a run. One env's
1057
+ * failure must never abort its siblings.
1058
+ */
1059
+ it("continues to the next environment when the first environment's run throws", async () => {
1060
+ const mockDb = createMockDb();
1061
+ const mockRegistry = createMockRegistry();
1062
+ const mockLogger = createMockLogger();
1063
+ const mockQueueManager = createMockQueueManager();
1064
+ const mockCatalogClient = createMockCatalogClient();
1065
+ const mockMaintenanceClient = createMockMaintenanceClient();
1066
+ const mockIncidentClient = createMockIncidentClient();
1067
+ const mockSignalService = createMockSignalService();
1068
+
1069
+ (mockCatalogClient.getSystem as any) = mock(async () => ({
1070
+ id: "system-1",
1071
+ name: "web-01",
1072
+ }));
1073
+ const membership = [
1074
+ { id: "prod", name: "Production", metadata: {} },
1075
+ { id: "staging", name: "Staging", metadata: {} },
1076
+ ];
1077
+ (mockCatalogClient as any).resolveSystemEnvironments = mock(async () =>
1078
+ membership.map((m) => ({
1079
+ ...m,
1080
+ description: null,
1081
+ systemIds: [],
1082
+ createdAt: new Date(),
1083
+ updatedAt: new Date(),
1084
+ })),
1085
+ );
1086
+
1087
+ let selectCallCount = 0;
1088
+ (mockDb.select as any) = mock(() => {
1089
+ selectCallCount++;
1090
+ if (selectCallCount === 2) {
1091
+ return {
1092
+ from: mock(() => ({
1093
+ innerJoin: mock(() => ({
1094
+ where: mock(() =>
1095
+ Promise.resolve([
1096
+ {
1097
+ configId: "config-1",
1098
+ configName: "Check",
1099
+ strategyId: "test-strategy",
1100
+ config: { timeout: 5000 },
1101
+ collectors: [
1102
+ {
1103
+ id: "col-1",
1104
+ collectorId: "test-collector",
1105
+ config: {},
1106
+ },
1107
+ ],
1108
+ interval: 45,
1109
+ enabled: true,
1110
+ paused: false,
1111
+ includeLocal: true,
1112
+ satelliteIds: [],
1113
+ environmentIds: null,
1114
+ },
1115
+ ]),
1116
+ ),
1117
+ })),
1118
+ })),
1119
+ };
1120
+ }
1121
+ return {
1122
+ from: mock(() => ({
1123
+ innerJoin: mock(() => ({
1124
+ where: mock(() => Promise.resolve([])),
1125
+ })),
1126
+ })),
1127
+ };
1128
+ });
1129
+
1130
+ // The first environment's run insert REJECTS; the second succeeds.
1131
+ // With no health-entity handle bound, a failed `apply` propagates out
1132
+ // of `writeHealthEntity`, so this throw reaches the per-env catch.
1133
+ let insertCalls = 0;
1134
+ (mockDb.insert as any) = mock(() => ({
1135
+ values: mock(() => {
1136
+ insertCalls++;
1137
+ if (insertCalls === 1) {
1138
+ return Promise.reject(new Error("env-1 persist failed"));
1139
+ }
1140
+ return Promise.resolve();
1141
+ }),
1142
+ }));
1143
+
1144
+ const envSeen: Array<string | undefined> = [];
1145
+ const collectorExecute = mock(
1146
+ async (params: { runContext?: { environment?: { id?: string } } }) => {
1147
+ envSeen.push(params.runContext?.environment?.id);
1148
+ return { result: {} };
1149
+ },
1150
+ );
1151
+ const mockCollectorRegistry = {
1152
+ register: mock(() => {}),
1153
+ getCollector: mock(() => ({
1154
+ collector: {
1155
+ id: "test-collector",
1156
+ execute: collectorExecute,
1157
+ config: new Versioned({ version: 1, schema: z.object({}) }),
1158
+ mergeResult: mock(() => ({})),
1159
+ },
1160
+ })),
1161
+ getCollectors: mock(() => []),
1162
+ };
1163
+
1164
+ const queue =
1165
+ mockQueueManager.getQueue<HealthCheckJobPayload>("health-checks");
1166
+ let capturedHandler:
1167
+ | ((job: { data: HealthCheckJobPayload }) => Promise<void>)
1168
+ | undefined;
1169
+ (queue.consume as any) = mock(
1170
+ async (
1171
+ handler: (job: { data: HealthCheckJobPayload }) => Promise<void>,
1172
+ ) => {
1173
+ capturedHandler = handler;
1174
+ },
1175
+ );
1176
+
1177
+ await setupHealthCheckWorker({
1178
+ db: mockDb as unknown as Parameters<
1179
+ typeof setupHealthCheckWorker
1180
+ >[0]["db"],
1181
+ advisoryLock: mockAdvisoryLock,
1182
+ registry: mockRegistry,
1183
+ collectorRegistry: mockCollectorRegistry as unknown as Parameters<
1184
+ typeof setupHealthCheckWorker
1185
+ >[0]["collectorRegistry"],
1186
+ logger: mockLogger,
1187
+ queueManager: mockQueueManager,
1188
+ signalService: mockSignalService,
1189
+ catalogClient: mockCatalogClient as unknown as Parameters<
1190
+ typeof setupHealthCheckWorker
1191
+ >[0]["catalogClient"],
1192
+ notificationClient: {
1193
+ notifyForSubscription: () => Promise.resolve({ notifiedCount: 0 }),
1194
+ } as unknown as Parameters<
1195
+ typeof setupHealthCheckWorker
1196
+ >[0]["notificationClient"],
1197
+ maintenanceClient: mockMaintenanceClient as unknown as Parameters<
1198
+ typeof setupHealthCheckWorker
1199
+ >[0]["maintenanceClient"],
1200
+ incidentClient: mockIncidentClient as unknown as Parameters<
1201
+ typeof setupHealthCheckWorker
1202
+ >[0]["incidentClient"],
1203
+ getEmitHook: () => undefined,
1204
+ cache: passthroughCache,
1205
+ });
1206
+
1207
+ if (capturedHandler) {
1208
+ await capturedHandler({
1209
+ data: { configId: "config-1", systemId: "system-1" },
1210
+ });
1211
+ }
1212
+
1213
+ // BOTH environments' collectors ran — the first env's persist failure
1214
+ // did not abort the loop.
1215
+ expect(envSeen).toEqual(["prod", "staging"]);
1216
+ // The failure was logged (isolated), not propagated.
1217
+ expect(mockLogger.error).toHaveBeenCalled();
1218
+ });
1219
+
1220
+ /**
1221
+ * Fail-open OBSERVABILITY (P3 review item 2). When the catalog
1222
+ * `resolveSystemEnvironments` read fails and the executor degrades to a
1223
+ * single env-less run, it MUST emit a counter-style signal (not just a
1224
+ * `logger.warn`) so durable catalog misconfig / outage is observable.
1225
+ */
1226
+ it("broadcasts ENVIRONMENT_RESOLUTION_FAILED and degrades to one env-less run when the catalog read fails", async () => {
1227
+ const mockDb = createMockDb();
1228
+ const mockRegistry = createMockRegistry();
1229
+ const mockLogger = createMockLogger();
1230
+ const mockQueueManager = createMockQueueManager();
1231
+ const mockCatalogClient = createMockCatalogClient();
1232
+ const mockMaintenanceClient = createMockMaintenanceClient();
1233
+ const mockIncidentClient = createMockIncidentClient();
1234
+ const mockSignalService = createMockSignalService();
1235
+
1236
+ (mockCatalogClient.getSystem as any) = mock(async () => ({
1237
+ id: "system-1",
1238
+ name: "web-01",
1239
+ }));
1240
+ // The catalog read REJECTS — the executor must fail open.
1241
+ (mockCatalogClient as any).resolveSystemEnvironments = mock(async () => {
1242
+ throw new Error("catalog unavailable");
1243
+ });
1244
+
1245
+ let selectCallCount = 0;
1246
+ (mockDb.select as any) = mock(() => {
1247
+ selectCallCount++;
1248
+ if (selectCallCount === 2) {
1249
+ return {
1250
+ from: mock(() => ({
1251
+ innerJoin: mock(() => ({
1252
+ where: mock(() =>
1253
+ Promise.resolve([
1254
+ {
1255
+ configId: "config-1",
1256
+ configName: "Check",
1257
+ strategyId: "test-strategy",
1258
+ config: { timeout: 5000 },
1259
+ collectors: [
1260
+ {
1261
+ id: "col-1",
1262
+ collectorId: "test-collector",
1263
+ config: {},
1264
+ },
1265
+ ],
1266
+ interval: 45,
1267
+ enabled: true,
1268
+ paused: false,
1269
+ includeLocal: true,
1270
+ satelliteIds: [],
1271
+ environmentIds: null,
1272
+ },
1273
+ ]),
1274
+ ),
1275
+ })),
1276
+ })),
1277
+ };
1278
+ }
1279
+ return {
1280
+ from: mock(() => ({
1281
+ innerJoin: mock(() => ({
1282
+ where: mock(() => Promise.resolve([])),
1283
+ })),
1284
+ })),
1285
+ };
1286
+ });
1287
+
1288
+ const envSeen: Array<string | undefined> = [];
1289
+ const collectorExecute = mock(
1290
+ async (params: { runContext?: { environment?: { id?: string } } }) => {
1291
+ envSeen.push(params.runContext?.environment?.id);
1292
+ return { result: {} };
1293
+ },
1294
+ );
1295
+ const mockCollectorRegistry = {
1296
+ register: mock(() => {}),
1297
+ getCollector: mock(() => ({
1298
+ collector: {
1299
+ id: "test-collector",
1300
+ execute: collectorExecute,
1301
+ config: new Versioned({ version: 1, schema: z.object({}) }),
1302
+ mergeResult: mock(() => ({})),
1303
+ },
1304
+ })),
1305
+ getCollectors: mock(() => []),
1306
+ };
1307
+
1308
+ const queue =
1309
+ mockQueueManager.getQueue<HealthCheckJobPayload>("health-checks");
1310
+ let capturedHandler:
1311
+ | ((job: { data: HealthCheckJobPayload }) => Promise<void>)
1312
+ | undefined;
1313
+ (queue.consume as any) = mock(
1314
+ async (
1315
+ handler: (job: { data: HealthCheckJobPayload }) => Promise<void>,
1316
+ ) => {
1317
+ capturedHandler = handler;
1318
+ },
1319
+ );
1320
+
1321
+ await setupHealthCheckWorker({
1322
+ db: mockDb as unknown as Parameters<
1323
+ typeof setupHealthCheckWorker
1324
+ >[0]["db"],
1325
+ advisoryLock: mockAdvisoryLock,
1326
+ registry: mockRegistry,
1327
+ collectorRegistry: mockCollectorRegistry as unknown as Parameters<
1328
+ typeof setupHealthCheckWorker
1329
+ >[0]["collectorRegistry"],
1330
+ logger: mockLogger,
1331
+ queueManager: mockQueueManager,
1332
+ signalService: mockSignalService,
1333
+ catalogClient: mockCatalogClient as unknown as Parameters<
1334
+ typeof setupHealthCheckWorker
1335
+ >[0]["catalogClient"],
1336
+ notificationClient: {
1337
+ notifyForSubscription: () => Promise.resolve({ notifiedCount: 0 }),
1338
+ } as unknown as Parameters<
1339
+ typeof setupHealthCheckWorker
1340
+ >[0]["notificationClient"],
1341
+ maintenanceClient: mockMaintenanceClient as unknown as Parameters<
1342
+ typeof setupHealthCheckWorker
1343
+ >[0]["maintenanceClient"],
1344
+ incidentClient: mockIncidentClient as unknown as Parameters<
1345
+ typeof setupHealthCheckWorker
1346
+ >[0]["incidentClient"],
1347
+ getEmitHook: () => undefined,
1348
+ cache: passthroughCache,
1349
+ });
1350
+
1351
+ if (capturedHandler) {
1352
+ await capturedHandler({
1353
+ data: { configId: "config-1", systemId: "system-1" },
1354
+ }).catch(() => {});
1355
+ }
1356
+
1357
+ // Degraded to exactly one env-less run.
1358
+ expect(envSeen).toEqual([undefined]);
1359
+ // The observability signal was broadcast with the failure detail.
1360
+ const resolutionFailed = mockSignalService.getRecordedSignalsById(
1361
+ "healthcheck.environment.resolution_failed",
1362
+ );
1363
+ expect(resolutionFailed).toHaveLength(1);
1364
+ expect(
1365
+ (resolutionFailed[0]?.payload as { systemId?: string }).systemId,
1366
+ ).toBe("system-1");
1367
+ });
567
1368
  });
568
1369
  });