@oneuptime/common 10.5.1 → 10.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. package/Models/DatabaseModels/TelemetryException.ts +10 -0
  2. package/Server/API/TelemetryAPI.ts +406 -0
  3. package/Server/Infrastructure/Postgres/SchemaMigrations/1779879993421-MigrationName.ts +20 -0
  4. package/Server/Infrastructure/Postgres/SchemaMigrations/1779900000000-DedupeTelemetryExceptionsAndAddUniqueIndex.ts +115 -0
  5. package/Server/Infrastructure/Postgres/SchemaMigrations/Index.ts +4 -0
  6. package/Server/Services/ExceptionAggregationService.ts +51 -3
  7. package/Server/Services/LogAggregationService.ts +1 -0
  8. package/Server/Services/MetricAggregationService.ts +227 -0
  9. package/Server/Services/OpenTelemetryIngestService.ts +101 -1
  10. package/Server/Services/TraceAggregationService.ts +1 -0
  11. package/Server/Utils/Monitor/MonitorLogUtil.ts +146 -6
  12. package/Server/Utils/Telemetry/ResourceFacetResolver.ts +299 -0
  13. package/UI/Components/LogsViewer/LogsViewer.tsx +10 -0
  14. package/UI/Components/LogsViewer/components/FacetSection.tsx +40 -3
  15. package/UI/Components/LogsViewer/components/LogsFacetSidebar.tsx +23 -0
  16. package/UI/Components/LogsViewer/types.ts +2 -0
  17. package/UI/Components/TelemetryViewer/TelemetryViewer.tsx +8 -0
  18. package/UI/Components/TelemetryViewer/components/TelemetryFacetSection.tsx +49 -3
  19. package/UI/Components/TelemetryViewer/components/TelemetryFacetSidebar.tsx +16 -0
  20. package/UI/Components/TelemetryViewer/types.ts +12 -0
  21. package/build/dist/Models/DatabaseModels/TelemetryException.js +11 -0
  22. package/build/dist/Models/DatabaseModels/TelemetryException.js.map +1 -1
  23. package/build/dist/Server/API/TelemetryAPI.js +285 -0
  24. package/build/dist/Server/API/TelemetryAPI.js.map +1 -1
  25. package/build/dist/Server/Infrastructure/Postgres/SchemaMigrations/1779879993421-MigrationName.js +18 -0
  26. package/build/dist/Server/Infrastructure/Postgres/SchemaMigrations/1779879993421-MigrationName.js.map +1 -0
  27. package/build/dist/Server/Infrastructure/Postgres/SchemaMigrations/1779900000000-DedupeTelemetryExceptionsAndAddUniqueIndex.js +106 -0
  28. package/build/dist/Server/Infrastructure/Postgres/SchemaMigrations/1779900000000-DedupeTelemetryExceptionsAndAddUniqueIndex.js.map +1 -0
  29. package/build/dist/Server/Infrastructure/Postgres/SchemaMigrations/Index.js +4 -0
  30. package/build/dist/Server/Infrastructure/Postgres/SchemaMigrations/Index.js.map +1 -1
  31. package/build/dist/Server/Services/ExceptionAggregationService.js +44 -4
  32. package/build/dist/Server/Services/ExceptionAggregationService.js.map +1 -1
  33. package/build/dist/Server/Services/LogAggregationService.js.map +1 -1
  34. package/build/dist/Server/Services/MetricAggregationService.js +159 -0
  35. package/build/dist/Server/Services/MetricAggregationService.js.map +1 -0
  36. package/build/dist/Server/Services/OpenTelemetryIngestService.js +60 -3
  37. package/build/dist/Server/Services/OpenTelemetryIngestService.js.map +1 -1
  38. package/build/dist/Server/Services/TraceAggregationService.js.map +1 -1
  39. package/build/dist/Server/Utils/Monitor/MonitorLogUtil.js +127 -4
  40. package/build/dist/Server/Utils/Monitor/MonitorLogUtil.js.map +1 -1
  41. package/build/dist/Server/Utils/Telemetry/ResourceFacetResolver.js +204 -0
  42. package/build/dist/Server/Utils/Telemetry/ResourceFacetResolver.js.map +1 -0
  43. package/build/dist/UI/Components/LogsViewer/LogsViewer.js +1 -1
  44. package/build/dist/UI/Components/LogsViewer/LogsViewer.js.map +1 -1
  45. package/build/dist/UI/Components/LogsViewer/components/FacetSection.js +26 -6
  46. package/build/dist/UI/Components/LogsViewer/components/FacetSection.js.map +1 -1
  47. package/build/dist/UI/Components/LogsViewer/components/LogsFacetSidebar.js +12 -1
  48. package/build/dist/UI/Components/LogsViewer/components/LogsFacetSidebar.js.map +1 -1
  49. package/build/dist/UI/Components/LogsViewer/types.js.map +1 -1
  50. package/build/dist/UI/Components/TelemetryViewer/TelemetryViewer.js +1 -1
  51. package/build/dist/UI/Components/TelemetryViewer/TelemetryViewer.js.map +1 -1
  52. package/build/dist/UI/Components/TelemetryViewer/components/TelemetryFacetSection.js +32 -6
  53. package/build/dist/UI/Components/TelemetryViewer/components/TelemetryFacetSection.js.map +1 -1
  54. package/build/dist/UI/Components/TelemetryViewer/components/TelemetryFacetSidebar.js +6 -1
  55. package/build/dist/UI/Components/TelemetryViewer/components/TelemetryFacetSidebar.js.map +1 -1
  56. package/package.json +1 -1
@@ -65,6 +65,16 @@ import Service from "./Service";
65
65
  name: "TelemetryException",
66
66
  })
67
67
  @Index(["projectId", "isResolved", "isArchived"]) // Exceptions dashboard counts/filters
68
+ /*
69
+ * Composite uniqueness on the dedup key used by the OTel traces ingest
70
+ * batched upsert. The ingest path collapses every exception event in a
71
+ * worker batch into a single INSERT … ON CONFLICT (projectId,
72
+ * serviceId, fingerprint) DO UPDATE statement; this index is what makes
73
+ * that conflict target resolvable and stops two concurrent workers from
74
+ * racing the old findOneBy + update path into duplicate rows or lost
75
+ * occuranceCount increments.
76
+ */
77
+ @Index(["projectId", "serviceId", "fingerprint"], { unique: true })
68
78
  export default class TelemetryException extends DatabaseBaseModel {
69
79
  @ColumnAccessControl({
70
80
  create: [
@@ -32,7 +32,13 @@ import TraceAggregationService, {
32
32
  import ExceptionAggregationService, {
33
33
  HistogramBucket as ExceptionHistogramBucket,
34
34
  HistogramRequest as ExceptionHistogramRequest,
35
+ FacetValue as ExceptionFacetValue,
36
+ FacetRequest as ExceptionFacetRequest,
35
37
  } from "../Services/ExceptionAggregationService";
38
+ import MetricAggregationService, {
39
+ FacetValue as MetricFacetValue,
40
+ FacetRequest as MetricFacetRequest,
41
+ } from "../Services/MetricAggregationService";
36
42
  import ProfileAggregationService, {
37
43
  FlamegraphRequest,
38
44
  FunctionListRequest,
@@ -55,6 +61,10 @@ import SortOrder from "../../Types/BaseDatabase/SortOrder";
55
61
  import ObjectID from "../../Types/ObjectID";
56
62
  import OneUptimeDate from "../../Types/Date";
57
63
  import { JSONObject } from "../../Types/JSON";
64
+ import ResourceFacetResolver, {
65
+ ResolvedFacetValue,
66
+ ResourceFacetSpec,
67
+ } from "../Utils/Telemetry/ResourceFacetResolver";
58
68
 
59
69
  const router: ExpressRouter = Express.getRouter();
60
70
 
@@ -393,6 +403,18 @@ router.post(
393
403
  ? (body["attributes"] as Record<string, string>)
394
404
  : undefined;
395
405
 
406
+ /*
407
+ * Per-facet partial-match filter applied at the Postgres source-of-truth
408
+ * lookup stage. Only consulted for resource facets (serviceId / hostId /
409
+ * dockerHostId / kubernetesClusterId) — other facets continue to filter
410
+ * client-side over the loaded value list.
411
+ */
412
+ const facetSearchText: Record<string, string> | undefined = body[
413
+ "facetSearchText"
414
+ ]
415
+ ? (body["facetSearchText"] as Record<string, string>)
416
+ : undefined;
417
+
396
418
  /*
397
419
  * Capture tenantId locally so TypeScript narrowing survives the
398
420
  * async closure below (narrowing is lost across closure boundaries).
@@ -437,6 +459,40 @@ router.post(
437
459
  facetResults,
438
460
  );
439
461
 
462
+ /*
463
+ * Replace resource-facet results with the Postgres source-of-truth list
464
+ * (filtered by facetSearchText and enriched with displayName). See the
465
+ * trace facets handler above for the rationale — same pattern, same
466
+ * benefit: low-volume resources stay visible and search can reach
467
+ * resources outside the ClickHouse sample window.
468
+ */
469
+ const resourceSpecs: Array<ResourceFacetSpec> = facetKeys
470
+ .filter((key: string): boolean => {
471
+ return ResourceFacetResolver.isResourceFacet(key);
472
+ })
473
+ .map((key: string): ResourceFacetSpec => {
474
+ const counts: Map<string, number> = new Map();
475
+ for (const fv of facets[key] || []) {
476
+ counts.set(fv.value, fv.count);
477
+ }
478
+ return {
479
+ facetKey: key,
480
+ counts,
481
+ searchText: facetSearchText?.[key],
482
+ limit,
483
+ };
484
+ });
485
+
486
+ if (resourceSpecs.length > 0) {
487
+ const resolved: Record<
488
+ string,
489
+ Array<ResolvedFacetValue>
490
+ > = await ResourceFacetResolver.resolve(projectId, resourceSpecs);
491
+ for (const key of Object.keys(resolved)) {
492
+ facets[key] = resolved[key] as Array<FacetValue>;
493
+ }
494
+ }
495
+
440
496
  return Response.sendJsonObjectResponse(req, res, {
441
497
  facets: facets as unknown as JSONObject,
442
498
  });
@@ -613,6 +669,18 @@ router.post(
613
669
  ? (body["attributes"] as Record<string, string>)
614
670
  : undefined;
615
671
 
672
+ /*
673
+ * Per-facet partial-match filter applied at the Postgres source-of-truth
674
+ * lookup stage. Only consulted for resource facets (serviceId / hostId /
675
+ * dockerHostId / kubernetesClusterId) — other facets continue to filter
676
+ * client-side over the loaded value list.
677
+ */
678
+ const facetSearchText: Record<string, string> | undefined = body[
679
+ "facetSearchText"
680
+ ]
681
+ ? (body["facetSearchText"] as Record<string, string>)
682
+ : undefined;
683
+
616
684
  /*
617
685
  * Compute all facets from a single sort-key-aligned sample query
618
686
  * (ORDER BY startTime DESC LIMIT N) and count top-K in Node. This
@@ -648,6 +716,44 @@ router.post(
648
716
  );
649
717
  }
650
718
 
719
+ /*
720
+ * Replace resource-facet results with the Postgres source-of-truth list
721
+ * (filtered by facetSearchText and enriched with displayName). Counts
722
+ * come from the ClickHouse sample above — entities with no recent
723
+ * telemetry surface with count 0 instead of being hidden entirely. This
724
+ * means low-volume services / hosts still appear in the sidebar and the
725
+ * search box can find resources beyond the loaded subset.
726
+ */
727
+ const resourceSpecs: Array<ResourceFacetSpec> = facetKeys
728
+ .filter((key: string): boolean => {
729
+ return ResourceFacetResolver.isResourceFacet(key);
730
+ })
731
+ .map((key: string): ResourceFacetSpec => {
732
+ const counts: Map<string, number> = new Map();
733
+ for (const fv of facets[key] || []) {
734
+ counts.set(fv.value, fv.count);
735
+ }
736
+ return {
737
+ facetKey: key,
738
+ counts,
739
+ searchText: facetSearchText?.[key],
740
+ limit,
741
+ };
742
+ });
743
+
744
+ if (resourceSpecs.length > 0) {
745
+ const resolved: Record<
746
+ string,
747
+ Array<ResolvedFacetValue>
748
+ > = await ResourceFacetResolver.resolve(
749
+ databaseProps.tenantId,
750
+ resourceSpecs,
751
+ );
752
+ for (const key of Object.keys(resolved)) {
753
+ facets[key] = resolved[key] as Array<TraceFacetValue>;
754
+ }
755
+ }
756
+
651
757
  return Response.sendJsonObjectResponse(req, res, {
652
758
  facets: facets as unknown as JSONObject,
653
759
  });
@@ -748,6 +854,306 @@ router.post(
748
854
  },
749
855
  );
750
856
 
857
+ // --- Exception Facets Endpoint ---
858
+
859
/*
 * POST /telemetry/exceptions/facets
 *
 * Returns `{ facets: { [facetKey]: Array<value/count> } }` for the
 * authenticated project. Request body fields (all optional):
 *   - facetKeys: facet names to compute; defaults to the four resource
 *     facets plus "exceptionType" and "environment".
 *   - startTime / endTime: window bounds; default to the last 24 hours.
 *   - limit: max values per facet; defaults to 500. Non-numeric, non-finite
 *     or < 1 values fall back to 500; fractions are floored.
 *   - serviceIds, exceptionTypes, environments, fingerprints, traceIds,
 *     escaped, messageSearchText: filters forwarded to the facet query.
 *   - facetSearchText: per-facet partial-match text, only consulted for
 *     resource facets.
 *
 * Unexpected errors are forwarded to the Express error handler via next().
 */
router.post(
  "/telemetry/exceptions/facets",
  UserMiddleware.getUserMiddleware,
  async (
    req: ExpressRequest,
    res: ExpressResponse,
    next: NextFunction,
  ): Promise<void> => {
    try {
      const databaseProps: DatabaseCommonInteractionProps =
        await CommonAPI.getDatabaseCommonInteractionProps(req);

      if (!databaseProps?.tenantId) {
        return Response.sendErrorResponse(
          req,
          res,
          new BadDataException("Invalid Project ID"),
        );
      }

      const body: JSONObject = req.body as JSONObject;

      const facetKeys: Array<string> = body["facetKeys"]
        ? (body["facetKeys"] as Array<string>)
        : [
            "serviceId",
            "hostId",
            "dockerHostId",
            "kubernetesClusterId",
            "exceptionType",
            "environment",
          ];

      const startTime: Date = body["startTime"]
        ? OneUptimeDate.fromString(body["startTime"] as string)
        : OneUptimeDate.addRemoveHours(OneUptimeDate.getCurrentDate(), -24);

      const endTime: Date = body["endTime"]
        ? OneUptimeDate.fromString(body["endTime"] as string)
        : OneUptimeDate.getCurrentDate();

      /*
       * Sanitise the limit before it reaches the facet queries: anything
       * that is not a finite number >= 1 (missing, null, 0, negative,
       * non-numeric) falls back to the default of 500.
       */
      const requestedLimit: number = Number(body["limit"]);
      const limit: number =
        Number.isFinite(requestedLimit) && requestedLimit >= 1
          ? Math.floor(requestedLimit)
          : 500;

      const serviceIds: Array<ObjectID> | undefined = body["serviceIds"]
        ? (body["serviceIds"] as Array<string>).map((id: string) => {
            return new ObjectID(id);
          })
        : undefined;

      const exceptionTypes: Array<string> | undefined = body["exceptionTypes"]
        ? (body["exceptionTypes"] as Array<string>)
        : undefined;

      const environments: Array<string> | undefined = body["environments"]
        ? (body["environments"] as Array<string>)
        : undefined;

      const fingerprints: Array<string> | undefined = body["fingerprints"]
        ? (body["fingerprints"] as Array<string>)
        : undefined;

      const traceIds: Array<string> | undefined = body["traceIds"]
        ? (body["traceIds"] as Array<string>)
        : undefined;

      /*
       * Tri-state filter: undefined means "do not filter on escaped".
       * A JSON null is treated as absent (it used to coerce to false and
       * silently filter), and the strings "false" / "0" / "" mean false
       * (a plain Boolean() coercion would turn "false" into true).
       */
      const rawEscaped: unknown = body["escaped"];
      let escaped: boolean | undefined = undefined;
      if (typeof rawEscaped === "string") {
        escaped = !["", "false", "0"].includes(
          rawEscaped.trim().toLowerCase(),
        );
      } else if (rawEscaped !== undefined && rawEscaped !== null) {
        escaped = Boolean(rawEscaped);
      }

      const messageSearchText: string | undefined = body["messageSearchText"]
        ? (body["messageSearchText"] as string)
        : undefined;

      /*
       * Per-facet partial-match filter applied at the Postgres source-of-truth
       * lookup stage. Only consulted for resource facets — other facets
       * continue to filter client-side over the loaded value list.
       */
      const facetSearchText: Record<string, string> | undefined = body[
        "facetSearchText"
      ]
        ? (body["facetSearchText"] as Record<string, string>)
        : undefined;

      const projectId: ObjectID = databaseProps.tenantId;

      /*
       * Per-facet ClickHouse query in parallel. Per-facet errors degrade
       * gracefully to [] so a slow / failing facet can't block the others.
       */
      const facetResults: Array<readonly [string, Array<ExceptionFacetValue>]> =
        await Promise.all(
          facetKeys.map(
            async (
              facetKey: string,
            ): Promise<readonly [string, Array<ExceptionFacetValue>]> => {
              try {
                const request: ExceptionFacetRequest = {
                  projectId,
                  startTime,
                  endTime,
                  facetKey,
                  limit,
                  serviceIds,
                  exceptionTypes,
                  environments,
                  fingerprints,
                  traceIds,
                  escaped,
                  messageSearchText,
                };
                const values: Array<ExceptionFacetValue> =
                  await ExceptionAggregationService.getFacetValues(request);
                return [facetKey, values] as const;
              } catch {
                // Deliberate best-effort: a failed facet is reported as empty.
                return [facetKey, [] as Array<ExceptionFacetValue>] as const;
              }
            },
          ),
        );

      const facets: Record<
        string,
        Array<ExceptionFacetValue>
      > = Object.fromEntries(facetResults);

      /*
       * Replace resource-facet results with the Postgres source-of-truth list
       * (filtered by facetSearchText and enriched with displayName). The
       * counts gathered above are handed to the resolver keyed by value.
       */
      const resourceSpecs: Array<ResourceFacetSpec> = facetKeys
        .filter((key: string): boolean => {
          return ResourceFacetResolver.isResourceFacet(key);
        })
        .map((key: string): ResourceFacetSpec => {
          const counts: Map<string, number> = new Map();
          for (const fv of facets[key] || []) {
            counts.set(fv.value, fv.count);
          }
          return {
            facetKey: key,
            counts,
            searchText: facetSearchText?.[key],
            limit,
          };
        });

      if (resourceSpecs.length > 0) {
        const resolved: Record<
          string,
          Array<ResolvedFacetValue>
        > = await ResourceFacetResolver.resolve(projectId, resourceSpecs);
        for (const key of Object.keys(resolved)) {
          facets[key] = resolved[key] as Array<ExceptionFacetValue>;
        }
      }

      return Response.sendJsonObjectResponse(req, res, {
        facets: facets as unknown as JSONObject,
      });
    } catch (err: unknown) {
      next(err);
    }
  },
);
1024
+
1025
+ // --- Metric Facets Endpoint ---
1026
+
1027
/*
 * POST /telemetry/metrics/facets
 *
 * Returns `{ facets: { [facetKey]: Array<value/count> } }` for the
 * authenticated project. Request body fields (all optional):
 *   - facetKeys: facet names to compute; defaults to the four resource
 *     facets (serviceId, hostId, dockerHostId, kubernetesClusterId).
 *   - startTime / endTime: window bounds; default to the last hour.
 *   - limit: max values per facet; defaults to 500. Non-numeric, non-finite
 *     or < 1 values fall back to 500; fractions are floored.
 *   - serviceIds, metricNames: filters forwarded to the facet query.
 *   - facetSearchText: per-facet partial-match text, only consulted for
 *     resource facets.
 *
 * Unexpected errors are forwarded to the Express error handler via next().
 */
router.post(
  "/telemetry/metrics/facets",
  UserMiddleware.getUserMiddleware,
  async (
    req: ExpressRequest,
    res: ExpressResponse,
    next: NextFunction,
  ): Promise<void> => {
    try {
      const databaseProps: DatabaseCommonInteractionProps =
        await CommonAPI.getDatabaseCommonInteractionProps(req);

      if (!databaseProps?.tenantId) {
        return Response.sendErrorResponse(
          req,
          res,
          new BadDataException("Invalid Project ID"),
        );
      }

      const body: JSONObject = req.body as JSONObject;

      const facetKeys: Array<string> = body["facetKeys"]
        ? (body["facetKeys"] as Array<string>)
        : ["serviceId", "hostId", "dockerHostId", "kubernetesClusterId"];

      const startTime: Date = body["startTime"]
        ? OneUptimeDate.fromString(body["startTime"] as string)
        : OneUptimeDate.addRemoveHours(OneUptimeDate.getCurrentDate(), -1);

      const endTime: Date = body["endTime"]
        ? OneUptimeDate.fromString(body["endTime"] as string)
        : OneUptimeDate.getCurrentDate();

      /*
       * Sanitise the limit before it reaches the facet queries: anything
       * that is not a finite number >= 1 (missing, null, 0, negative,
       * non-numeric) falls back to the default of 500.
       */
      const requestedLimit: number = Number(body["limit"]);
      const limit: number =
        Number.isFinite(requestedLimit) && requestedLimit >= 1
          ? Math.floor(requestedLimit)
          : 500;

      const serviceIds: Array<ObjectID> | undefined = body["serviceIds"]
        ? (body["serviceIds"] as Array<string>).map((id: string) => {
            return new ObjectID(id);
          })
        : undefined;

      const metricNames: Array<string> | undefined = body["metricNames"]
        ? (body["metricNames"] as Array<string>)
        : undefined;

      // Per-facet partial-match text; only used for resource facets below.
      const facetSearchText: Record<string, string> | undefined = body[
        "facetSearchText"
      ]
        ? (body["facetSearchText"] as Record<string, string>)
        : undefined;

      const projectId: ObjectID = databaseProps.tenantId;

      /*
       * Per-facet ClickHouse GROUP BY in parallel. Per-facet errors degrade
       * to [] so a slow facet doesn't block the rest.
       */
      const facetResults: Array<readonly [string, Array<MetricFacetValue>]> =
        await Promise.all(
          facetKeys.map(
            async (
              facetKey: string,
            ): Promise<readonly [string, Array<MetricFacetValue>]> => {
              try {
                const request: MetricFacetRequest = {
                  projectId,
                  startTime,
                  endTime,
                  facetKey,
                  limit,
                  serviceIds,
                  metricNames,
                };
                const values: Array<MetricFacetValue> =
                  await MetricAggregationService.getFacetValues(request);
                return [facetKey, values] as const;
              } catch {
                // Deliberate best-effort: a failed facet is reported as empty.
                return [facetKey, [] as Array<MetricFacetValue>] as const;
              }
            },
          ),
        );

      const facets: Record<
        string,
        Array<MetricFacetValue>
      > = Object.fromEntries(facetResults);

      /*
       * Replace resource-facet results with the Postgres source-of-truth list
       * (filtered by facetSearchText and enriched with displayName). The
       * counts gathered above are handed to the resolver keyed by value.
       */
      const resourceSpecs: Array<ResourceFacetSpec> = facetKeys
        .filter((key: string): boolean => {
          return ResourceFacetResolver.isResourceFacet(key);
        })
        .map((key: string): ResourceFacetSpec => {
          const counts: Map<string, number> = new Map();
          for (const fv of facets[key] || []) {
            counts.set(fv.value, fv.count);
          }
          return {
            facetKey: key,
            counts,
            searchText: facetSearchText?.[key],
            limit,
          };
        });

      if (resourceSpecs.length > 0) {
        const resolved: Record<
          string,
          Array<ResolvedFacetValue>
        > = await ResourceFacetResolver.resolve(projectId, resourceSpecs);
        for (const key of Object.keys(resolved)) {
          facets[key] = resolved[key] as Array<MetricFacetValue>;
        }
      }

      return Response.sendJsonObjectResponse(req, res, {
        facets: facets as unknown as JSONObject,
      });
    } catch (err: unknown) {
      next(err);
    }
  },
);
1156
+
751
1157
  // --- Log Analytics Endpoint ---
752
1158
 
753
1159
  router.post(
@@ -0,0 +1,20 @@
1
+ import { MigrationInterface, QueryRunner } from "typeorm";
2
+
3
/*
 * Swaps the hand-named unique index on TelemetryException
 * ("projectId", "serviceId", "fingerprint") for the hash-named
 * "IDX_1f55d43a0b73e883bb226158c7", and rewrites the JSON column defaults
 * on OnCallDutyPolicyScheduleLayer.
 *
 * Index statements are idempotent (IF EXISTS / IF NOT EXISTS): the
 * hand-named index is created by a different migration, so it may not be
 * present when this one runs, and an unconditional DROP INDEX would abort
 * the whole migration run.
 */
export class MigrationName1779879993421 implements MigrationInterface {
  public name: string = "MigrationName1779879993421";

  /*
   * Applies the change: drop the hand-named index if present, update the
   * two column defaults, then create the hash-named unique index.
   */
  public async up(queryRunner: QueryRunner): Promise<void> {
    await queryRunner.query(
      `DROP INDEX IF EXISTS "public"."IDX_telemetry_exception_project_service_fingerprint"`,
    );
    await queryRunner.query(
      `ALTER TABLE "OnCallDutyPolicyScheduleLayer" ALTER COLUMN "rotation" SET DEFAULT '{"_type":"Recurring","value":{"intervalType":"Day","intervalCount":{"_type":"PositiveNumber","value":1}}}'`,
    );
    await queryRunner.query(
      `ALTER TABLE "OnCallDutyPolicyScheduleLayer" ALTER COLUMN "restrictionTimes" SET DEFAULT '{"_type":"RestrictionTimes","value":{"restictionType":"None","dayRestrictionTimes":null,"weeklyRestrictionTimes":[]}}'`,
    );
    await queryRunner.query(
      `CREATE UNIQUE INDEX IF NOT EXISTS "IDX_1f55d43a0b73e883bb226158c7" ON "TelemetryException" ("projectId", "serviceId", "fingerprint") `,
    );
  }

  /*
   * Reverts up() in the opposite order: drop the hash-named index, restore
   * the previous column defaults, then recreate the hand-named index.
   */
  public async down(queryRunner: QueryRunner): Promise<void> {
    await queryRunner.query(
      `DROP INDEX IF EXISTS "public"."IDX_1f55d43a0b73e883bb226158c7"`,
    );
    await queryRunner.query(
      `ALTER TABLE "OnCallDutyPolicyScheduleLayer" ALTER COLUMN "restrictionTimes" SET DEFAULT '{"_type": "RestrictionTimes", "value": {"restictionType": "None", "dayRestrictionTimes": null, "weeklyRestrictionTimes": []}}'`,
    );
    await queryRunner.query(
      `ALTER TABLE "OnCallDutyPolicyScheduleLayer" ALTER COLUMN "rotation" SET DEFAULT '{"_type": "Recurring", "value": {"intervalType": "Day", "intervalCount": {"_type": "PositiveNumber", "value": 1}}}'`,
    );
    await queryRunner.query(
      `CREATE UNIQUE INDEX IF NOT EXISTS "IDX_telemetry_exception_project_service_fingerprint" ON "TelemetryException" ("projectId", "serviceId", "fingerprint") `,
    );
  }
}
@@ -0,0 +1,115 @@
1
+ import { MigrationInterface, QueryRunner } from "typeorm";
2
+
3
+ /*
4
+ * The OTel traces ingest path used to call
5
+ * ExceptionUtil.saveOrUpdateTelemetryException once per exception
6
+ * event with a findOneBy + updateOneBy/create pair, fire-and-forget,
7
+ * from inside the span loop. That has three problems we are fixing
8
+ * in tandem with this schema change:
9
+ *
10
+ * 1. Cost: each event is a Postgres round-trip. A worker batch
11
+ * with thousands of exception events drives thousands of
12
+ * parallel SELECT/UPDATE statements and starves the pool.
13
+ * 2. Lost increments: `occuranceCount = existing.occuranceCount + 1`
14
+ * is read-modify-write at the JS layer, so two workers
15
+ * seeing the same row at the same instant collapse to a
16
+ * single +1 instead of +2.
17
+ * 3. Duplicate rows: two workers both missing the row at the
18
+ * same time both INSERT, with no DB-level guard, producing
19
+ * two TelemetryException rows for the same fingerprint.
20
+ *
21
+ * The ingest path is moving to a single batched
22
+ * INSERT … ON CONFLICT ("projectId", "serviceId", "fingerprint")
23
+ * DO UPDATE SET "occuranceCount" =
24
+ * "TelemetryException"."occuranceCount" + EXCLUDED."occuranceCount",
25
+ * ...
26
+ * statement per worker batch, which needs the composite unique
27
+ * index this migration creates. Before we can create the index we
28
+ * have to clear out the duplicate rows produced by the legacy race
29
+ * (problem 3 above) — otherwise the CREATE UNIQUE INDEX would fail
30
+ * on production data.
31
+ *
32
+ * Strategy: pick one survivor per (projectId, serviceId, fingerprint)
33
+ * group and hard-delete the rest. We do NOT try to merge
34
+ * occuranceCount / firstSeenAt / lastSeenAt from the losers into the
35
+ * survivor — the simpler delete-only approach trades a small,
36
+ * one-time count discrepancy on duplicated fingerprints for a much
37
+ * simpler migration that is easy to reason about and roll back. The
38
+ * next exception occurrence for that fingerprint will re-increment
39
+ * the survivor via the new ON CONFLICT upsert, and the dashboard
40
+ * recovers within seconds.
41
+ *
42
+ * Survivor selection prefers the row that was carrying the most
43
+ * traffic before the unique index landed, because in the legacy
44
+ * code path `findOneBy` returned an implementation-defined row and
45
+ * all subsequent UPDATEs piled into that one — discarding it would
46
+ * be the most lossy choice. Order is:
47
+ * 1. Highest occuranceCount (the "real" row absorbing updates).
48
+ * 2. Most recent lastSeenAt (in case counts are tied).
49
+ * 3. Non-deleted before deleted (live data beats soft-deleted).
50
+ * 4. Smallest _id as a deterministic tiebreaker so re-runs pick
51
+ * the same survivor.
52
+ *
53
+ * TelemetryException is a leaf table — no other table holds an FK
54
+ * referencing it — so we do not need to reparent anything before
55
+ * deleting loser rows. NULL-fingerprint rows are left alone; the
56
+ * composite unique index treats NULLs as distinct, and the new
57
+ * ingest path never produces a NULL fingerprint anyway.
58
+ */
59
export class DedupeTelemetryExceptionsAndAddUniqueIndex1779900000000
  implements MigrationInterface
{
  public name: string =
    "DedupeTelemetryExceptionsAndAddUniqueIndex1779900000000";

  /*
   * Removes duplicate TelemetryException rows, then adds the composite
   * unique index on ("projectId", "serviceId", "fingerprint").
   */
  public async up(queryRunner: QueryRunner): Promise<void> {
    /*
     * Step 1: keep one survivor per (projectId, serviceId, fingerprint)
     * group and delete the rest. DISTINCT ON keeps the first row of each
     * group under the ORDER BY: highest occuranceCount, then latest
     * lastSeenAt, then non-deleted, then smallest _id. The EXISTS clause
     * restricts the delete to rows that have a sibling matching on all
     * three columns by "=" — a comparison that never matches NULL.
     */
    const deleteDuplicatesSql: string = `
      WITH survivors AS (
        SELECT DISTINCT ON ("projectId", "serviceId", "fingerprint")
          _id AS survivor_id
        FROM "TelemetryException"
        WHERE "fingerprint" IS NOT NULL
        ORDER BY
          "projectId",
          "serviceId",
          "fingerprint",
          COALESCE("occuranceCount", 0) DESC,
          "lastSeenAt" DESC NULLS LAST,
          CASE WHEN "deletedAt" IS NULL THEN 0 ELSE 1 END,
          _id ASC
      )
      DELETE FROM "TelemetryException" te
      WHERE te."fingerprint" IS NOT NULL
        AND te._id NOT IN (SELECT survivor_id FROM survivors)
        AND EXISTS (
          SELECT 1
          FROM "TelemetryException" t2
          WHERE t2."projectId" = te."projectId"
            AND t2."serviceId" = te."serviceId"
            AND t2."fingerprint" = te."fingerprint"
            AND t2._id <> te._id
        );
    `;

    /*
     * Step 2: the DB-level composite unique index, created only after the
     * duplicates are gone so the CREATE cannot fail on existing data.
     */
    const createUniqueIndexSql: string = `CREATE UNIQUE INDEX "IDX_telemetry_exception_project_service_fingerprint" ON "TelemetryException" ("projectId", "serviceId", "fingerprint") `;

    await queryRunner.query(deleteDuplicatesSql);
    await queryRunner.query(createUniqueIndexSql);
  }

  /*
   * Drops the unique index only. Rows deleted by up() are intentionally
   * not restored.
   */
  public async down(queryRunner: QueryRunner): Promise<void> {
    const dropUniqueIndexSql: string = `DROP INDEX "public"."IDX_telemetry_exception_project_service_fingerprint"`;
    await queryRunner.query(dropUniqueIndexSql);
  }
}
@@ -354,6 +354,8 @@ import { AttachServiceToScheduledMaintenanceTemplatesAndLabelRules1779742211961
354
354
  import { MigrationName1779790539196 } from "./1779790539196-MigrationName";
355
355
  import { ExpandOwnerRuleInheritFlags1779823516881 } from "./1779823516881-ExpandOwnerRuleInheritFlags";
356
356
  import { RenameStatusPageZhToZhCN1779827700000 } from "./1779827700000-RenameStatusPageZhToZhCN";
357
+ import { MigrationName1779879993421 } from "./1779879993421-MigrationName";
358
+ import { DedupeTelemetryExceptionsAndAddUniqueIndex1779900000000 } from "./1779900000000-DedupeTelemetryExceptionsAndAddUniqueIndex";
357
359
  export default [
358
360
  InitialMigration,
359
361
  MigrationName1717678334852,
@@ -711,4 +713,6 @@ export default [
711
713
  MigrationName1779790539196,
712
714
  ExpandOwnerRuleInheritFlags1779823516881,
713
715
  RenameStatusPageZhToZhCN1779827700000,
716
+ DedupeTelemetryExceptionsAndAddUniqueIndex1779900000000,
717
+ MigrationName1779879993421
714
718
  ];