@oneuptime/common 10.5.1 → 10.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Models/DatabaseModels/TelemetryException.ts +10 -0
- package/Server/API/TelemetryAPI.ts +406 -0
- package/Server/Infrastructure/Postgres/SchemaMigrations/1779879993421-MigrationName.ts +20 -0
- package/Server/Infrastructure/Postgres/SchemaMigrations/1779900000000-DedupeTelemetryExceptionsAndAddUniqueIndex.ts +115 -0
- package/Server/Infrastructure/Postgres/SchemaMigrations/Index.ts +4 -0
- package/Server/Services/ExceptionAggregationService.ts +51 -3
- package/Server/Services/LogAggregationService.ts +1 -0
- package/Server/Services/MetricAggregationService.ts +227 -0
- package/Server/Services/OpenTelemetryIngestService.ts +101 -1
- package/Server/Services/TraceAggregationService.ts +1 -0
- package/Server/Utils/Monitor/MonitorLogUtil.ts +146 -6
- package/Server/Utils/Telemetry/ResourceFacetResolver.ts +299 -0
- package/UI/Components/LogsViewer/LogsViewer.tsx +10 -0
- package/UI/Components/LogsViewer/components/FacetSection.tsx +40 -3
- package/UI/Components/LogsViewer/components/LogsFacetSidebar.tsx +23 -0
- package/UI/Components/LogsViewer/types.ts +2 -0
- package/UI/Components/TelemetryViewer/TelemetryViewer.tsx +8 -0
- package/UI/Components/TelemetryViewer/components/TelemetryFacetSection.tsx +49 -3
- package/UI/Components/TelemetryViewer/components/TelemetryFacetSidebar.tsx +16 -0
- package/UI/Components/TelemetryViewer/types.ts +12 -0
- package/build/dist/Models/DatabaseModels/TelemetryException.js +11 -0
- package/build/dist/Models/DatabaseModels/TelemetryException.js.map +1 -1
- package/build/dist/Server/API/TelemetryAPI.js +285 -0
- package/build/dist/Server/API/TelemetryAPI.js.map +1 -1
- package/build/dist/Server/Infrastructure/Postgres/SchemaMigrations/1779879993421-MigrationName.js +18 -0
- package/build/dist/Server/Infrastructure/Postgres/SchemaMigrations/1779879993421-MigrationName.js.map +1 -0
- package/build/dist/Server/Infrastructure/Postgres/SchemaMigrations/1779900000000-DedupeTelemetryExceptionsAndAddUniqueIndex.js +106 -0
- package/build/dist/Server/Infrastructure/Postgres/SchemaMigrations/1779900000000-DedupeTelemetryExceptionsAndAddUniqueIndex.js.map +1 -0
- package/build/dist/Server/Infrastructure/Postgres/SchemaMigrations/Index.js +4 -0
- package/build/dist/Server/Infrastructure/Postgres/SchemaMigrations/Index.js.map +1 -1
- package/build/dist/Server/Services/ExceptionAggregationService.js +44 -4
- package/build/dist/Server/Services/ExceptionAggregationService.js.map +1 -1
- package/build/dist/Server/Services/LogAggregationService.js.map +1 -1
- package/build/dist/Server/Services/MetricAggregationService.js +159 -0
- package/build/dist/Server/Services/MetricAggregationService.js.map +1 -0
- package/build/dist/Server/Services/OpenTelemetryIngestService.js +60 -3
- package/build/dist/Server/Services/OpenTelemetryIngestService.js.map +1 -1
- package/build/dist/Server/Services/TraceAggregationService.js.map +1 -1
- package/build/dist/Server/Utils/Monitor/MonitorLogUtil.js +127 -4
- package/build/dist/Server/Utils/Monitor/MonitorLogUtil.js.map +1 -1
- package/build/dist/Server/Utils/Telemetry/ResourceFacetResolver.js +204 -0
- package/build/dist/Server/Utils/Telemetry/ResourceFacetResolver.js.map +1 -0
- package/build/dist/UI/Components/LogsViewer/LogsViewer.js +1 -1
- package/build/dist/UI/Components/LogsViewer/LogsViewer.js.map +1 -1
- package/build/dist/UI/Components/LogsViewer/components/FacetSection.js +26 -6
- package/build/dist/UI/Components/LogsViewer/components/FacetSection.js.map +1 -1
- package/build/dist/UI/Components/LogsViewer/components/LogsFacetSidebar.js +12 -1
- package/build/dist/UI/Components/LogsViewer/components/LogsFacetSidebar.js.map +1 -1
- package/build/dist/UI/Components/LogsViewer/types.js.map +1 -1
- package/build/dist/UI/Components/TelemetryViewer/TelemetryViewer.js +1 -1
- package/build/dist/UI/Components/TelemetryViewer/TelemetryViewer.js.map +1 -1
- package/build/dist/UI/Components/TelemetryViewer/components/TelemetryFacetSection.js +32 -6
- package/build/dist/UI/Components/TelemetryViewer/components/TelemetryFacetSection.js.map +1 -1
- package/build/dist/UI/Components/TelemetryViewer/components/TelemetryFacetSidebar.js +6 -1
- package/build/dist/UI/Components/TelemetryViewer/components/TelemetryFacetSidebar.js.map +1 -1
- package/package.json +1 -1
|
@@ -65,6 +65,16 @@ import Service from "./Service";
|
|
|
65
65
|
name: "TelemetryException",
|
|
66
66
|
})
|
|
67
67
|
@Index(["projectId", "isResolved", "isArchived"]) // Exceptions dashboard counts/filters
|
|
68
|
+
/*
|
|
69
|
+
* Composite uniqueness on the dedup key used by the OTel traces ingest
|
|
70
|
+
* batched upsert. The ingest path collapses every exception event in a
|
|
71
|
+
* worker batch into a single INSERT … ON CONFLICT (projectId,
|
|
72
|
+
* serviceId, fingerprint) DO UPDATE statement; this index is what makes
|
|
73
|
+
* that conflict target resolvable and stops two concurrent workers from
|
|
74
|
+
* racing the old findOneBy + update path into duplicate rows or lost
|
|
75
|
+
* occuranceCount increments.
|
|
76
|
+
*/
|
|
77
|
+
@Index(["projectId", "serviceId", "fingerprint"], { unique: true })
|
|
68
78
|
export default class TelemetryException extends DatabaseBaseModel {
|
|
69
79
|
@ColumnAccessControl({
|
|
70
80
|
create: [
|
|
@@ -32,7 +32,13 @@ import TraceAggregationService, {
|
|
|
32
32
|
import ExceptionAggregationService, {
|
|
33
33
|
HistogramBucket as ExceptionHistogramBucket,
|
|
34
34
|
HistogramRequest as ExceptionHistogramRequest,
|
|
35
|
+
FacetValue as ExceptionFacetValue,
|
|
36
|
+
FacetRequest as ExceptionFacetRequest,
|
|
35
37
|
} from "../Services/ExceptionAggregationService";
|
|
38
|
+
import MetricAggregationService, {
|
|
39
|
+
FacetValue as MetricFacetValue,
|
|
40
|
+
FacetRequest as MetricFacetRequest,
|
|
41
|
+
} from "../Services/MetricAggregationService";
|
|
36
42
|
import ProfileAggregationService, {
|
|
37
43
|
FlamegraphRequest,
|
|
38
44
|
FunctionListRequest,
|
|
@@ -55,6 +61,10 @@ import SortOrder from "../../Types/BaseDatabase/SortOrder";
|
|
|
55
61
|
import ObjectID from "../../Types/ObjectID";
|
|
56
62
|
import OneUptimeDate from "../../Types/Date";
|
|
57
63
|
import { JSONObject } from "../../Types/JSON";
|
|
64
|
+
import ResourceFacetResolver, {
|
|
65
|
+
ResolvedFacetValue,
|
|
66
|
+
ResourceFacetSpec,
|
|
67
|
+
} from "../Utils/Telemetry/ResourceFacetResolver";
|
|
58
68
|
|
|
59
69
|
const router: ExpressRouter = Express.getRouter();
|
|
60
70
|
|
|
@@ -393,6 +403,18 @@ router.post(
|
|
|
393
403
|
? (body["attributes"] as Record<string, string>)
|
|
394
404
|
: undefined;
|
|
395
405
|
|
|
406
|
+
/*
|
|
407
|
+
* Per-facet partial-match filter applied at the Postgres source-of-truth
|
|
408
|
+
* lookup stage. Only consulted for resource facets (serviceId / hostId /
|
|
409
|
+
* dockerHostId / kubernetesClusterId) — other facets continue to filter
|
|
410
|
+
* client-side over the loaded value list.
|
|
411
|
+
*/
|
|
412
|
+
const facetSearchText: Record<string, string> | undefined = body[
|
|
413
|
+
"facetSearchText"
|
|
414
|
+
]
|
|
415
|
+
? (body["facetSearchText"] as Record<string, string>)
|
|
416
|
+
: undefined;
|
|
417
|
+
|
|
396
418
|
/*
|
|
397
419
|
* Capture tenantId locally so TypeScript narrowing survives the
|
|
398
420
|
* async closure below (narrowing is lost across closure boundaries).
|
|
@@ -437,6 +459,40 @@ router.post(
|
|
|
437
459
|
facetResults,
|
|
438
460
|
);
|
|
439
461
|
|
|
462
|
+
/*
|
|
463
|
+
* Replace resource-facet results with the Postgres source-of-truth list
|
|
464
|
+
* (filtered by facetSearchText and enriched with displayName). See the
|
|
465
|
+
* trace facets handler below for the rationale — same pattern, same
|
|
466
|
+
* benefit: low-volume resources stay visible and search can reach
|
|
467
|
+
* resources outside the ClickHouse sample window.
|
|
468
|
+
*/
|
|
469
|
+
const resourceSpecs: Array<ResourceFacetSpec> = facetKeys
|
|
470
|
+
.filter((key: string): boolean => {
|
|
471
|
+
return ResourceFacetResolver.isResourceFacet(key);
|
|
472
|
+
})
|
|
473
|
+
.map((key: string): ResourceFacetSpec => {
|
|
474
|
+
const counts: Map<string, number> = new Map();
|
|
475
|
+
for (const fv of facets[key] || []) {
|
|
476
|
+
counts.set(fv.value, fv.count);
|
|
477
|
+
}
|
|
478
|
+
return {
|
|
479
|
+
facetKey: key,
|
|
480
|
+
counts,
|
|
481
|
+
searchText: facetSearchText?.[key],
|
|
482
|
+
limit,
|
|
483
|
+
};
|
|
484
|
+
});
|
|
485
|
+
|
|
486
|
+
if (resourceSpecs.length > 0) {
|
|
487
|
+
const resolved: Record<
|
|
488
|
+
string,
|
|
489
|
+
Array<ResolvedFacetValue>
|
|
490
|
+
> = await ResourceFacetResolver.resolve(projectId, resourceSpecs);
|
|
491
|
+
for (const key of Object.keys(resolved)) {
|
|
492
|
+
facets[key] = resolved[key] as Array<FacetValue>;
|
|
493
|
+
}
|
|
494
|
+
}
|
|
495
|
+
|
|
440
496
|
return Response.sendJsonObjectResponse(req, res, {
|
|
441
497
|
facets: facets as unknown as JSONObject,
|
|
442
498
|
});
|
|
@@ -613,6 +669,18 @@ router.post(
|
|
|
613
669
|
? (body["attributes"] as Record<string, string>)
|
|
614
670
|
: undefined;
|
|
615
671
|
|
|
672
|
+
/*
|
|
673
|
+
* Per-facet partial-match filter applied at the Postgres source-of-truth
|
|
674
|
+
* lookup stage. Only consulted for resource facets (serviceId / hostId /
|
|
675
|
+
* dockerHostId / kubernetesClusterId) — other facets continue to filter
|
|
676
|
+
* client-side over the loaded value list.
|
|
677
|
+
*/
|
|
678
|
+
const facetSearchText: Record<string, string> | undefined = body[
|
|
679
|
+
"facetSearchText"
|
|
680
|
+
]
|
|
681
|
+
? (body["facetSearchText"] as Record<string, string>)
|
|
682
|
+
: undefined;
|
|
683
|
+
|
|
616
684
|
/*
|
|
617
685
|
* Compute all facets from a single sort-key-aligned sample query
|
|
618
686
|
* (ORDER BY startTime DESC LIMIT N) and count top-K in Node. This
|
|
@@ -648,6 +716,44 @@ router.post(
|
|
|
648
716
|
);
|
|
649
717
|
}
|
|
650
718
|
|
|
719
|
+
/*
|
|
720
|
+
* Replace resource-facet results with the Postgres source-of-truth list
|
|
721
|
+
* (filtered by facetSearchText and enriched with displayName). Counts
|
|
722
|
+
* come from the ClickHouse sample above — entities with no recent
|
|
723
|
+
* telemetry surface with count 0 instead of being hidden entirely. This
|
|
724
|
+
* means low-volume services / hosts still appear in the sidebar and the
|
|
725
|
+
* search box can find resources beyond the loaded subset.
|
|
726
|
+
*/
|
|
727
|
+
const resourceSpecs: Array<ResourceFacetSpec> = facetKeys
|
|
728
|
+
.filter((key: string): boolean => {
|
|
729
|
+
return ResourceFacetResolver.isResourceFacet(key);
|
|
730
|
+
})
|
|
731
|
+
.map((key: string): ResourceFacetSpec => {
|
|
732
|
+
const counts: Map<string, number> = new Map();
|
|
733
|
+
for (const fv of facets[key] || []) {
|
|
734
|
+
counts.set(fv.value, fv.count);
|
|
735
|
+
}
|
|
736
|
+
return {
|
|
737
|
+
facetKey: key,
|
|
738
|
+
counts,
|
|
739
|
+
searchText: facetSearchText?.[key],
|
|
740
|
+
limit,
|
|
741
|
+
};
|
|
742
|
+
});
|
|
743
|
+
|
|
744
|
+
if (resourceSpecs.length > 0) {
|
|
745
|
+
const resolved: Record<
|
|
746
|
+
string,
|
|
747
|
+
Array<ResolvedFacetValue>
|
|
748
|
+
> = await ResourceFacetResolver.resolve(
|
|
749
|
+
databaseProps.tenantId,
|
|
750
|
+
resourceSpecs,
|
|
751
|
+
);
|
|
752
|
+
for (const key of Object.keys(resolved)) {
|
|
753
|
+
facets[key] = resolved[key] as Array<TraceFacetValue>;
|
|
754
|
+
}
|
|
755
|
+
}
|
|
756
|
+
|
|
651
757
|
return Response.sendJsonObjectResponse(req, res, {
|
|
652
758
|
facets: facets as unknown as JSONObject,
|
|
653
759
|
});
|
|
@@ -748,6 +854,306 @@ router.post(
|
|
|
748
854
|
},
|
|
749
855
|
);
|
|
750
856
|
|
|
857
|
+
// --- Exception Facets Endpoint ---
|
|
858
|
+
|
|
859
|
+
/*
 * POST /telemetry/exceptions/facets
 *
 * Returns, for every requested facet key, the list of facet values for the
 * caller's project inside the [startTime, endTime] window. Values come from
 * ExceptionAggregationService.getFacetValues (one call per facet key, run in
 * parallel); resource facets are then replaced by whatever
 * ResourceFacetResolver.resolve returns for them.
 *
 * Request body (all optional):
 *   facetKeys, serviceIds, exceptionTypes, environments, fingerprints,
 *   traceIds  - arrays
 *   startTime, endTime - date strings (default: last 24 hours)
 *   limit              - positive number (default: 500)
 *   escaped            - boolean filter
 *   messageSearchText  - string filter
 *   facetSearchText    - map of facetKey -> search text, forwarded to the
 *                        resource facet resolver only
 *
 * Responds with { facets: Record<facetKey, Array<facet value>> }. A missing
 * tenant or a malformed list field produces a BadDataException response;
 * any other failure is forwarded to next().
 */
router.post(
  "/telemetry/exceptions/facets",
  UserMiddleware.getUserMiddleware,
  async (
    req: ExpressRequest,
    res: ExpressResponse,
    next: NextFunction,
  ): Promise<void> => {
    try {
      const databaseProps: DatabaseCommonInteractionProps =
        await CommonAPI.getDatabaseCommonInteractionProps(req);

      if (!databaseProps?.tenantId) {
        return Response.sendErrorResponse(
          req,
          res,
          new BadDataException("Invalid Project ID"),
        );
      }

      const body: JSONObject = req.body as JSONObject;

      /*
       * Every field below is cast to an array further down. Reject truthy
       * non-array values up front so a malformed request gets a clear
       * BadDataException instead of a TypeError from `.map` / `.filter`.
       */
      const listFields: Array<string> = [
        "facetKeys",
        "serviceIds",
        "exceptionTypes",
        "environments",
        "fingerprints",
        "traceIds",
      ];

      const malformedFields: Array<string> = listFields.filter(
        (field: string): boolean => {
          return Boolean(body[field]) && !Array.isArray(body[field]);
        },
      );

      if (malformedFields.length > 0) {
        return Response.sendErrorResponse(
          req,
          res,
          new BadDataException(
            `Invalid request body: ${malformedFields.join(", ")} must be an array`,
          ),
        );
      }

      const facetKeys: Array<string> = body["facetKeys"]
        ? (body["facetKeys"] as Array<string>)
        : [
            "serviceId",
            "hostId",
            "dockerHostId",
            "kubernetesClusterId",
            "exceptionType",
            "environment",
          ];

      // Default window: the last 24 hours.
      const startTime: Date = body["startTime"]
        ? OneUptimeDate.fromString(body["startTime"] as string)
        : OneUptimeDate.addRemoveHours(OneUptimeDate.getCurrentDate(), -24);

      const endTime: Date = body["endTime"]
        ? OneUptimeDate.fromString(body["endTime"] as string)
        : OneUptimeDate.getCurrentDate();

      /*
       * Only a finite value >= 1 is accepted as the limit (floored to an
       * integer); anything else — missing, zero, negative, non-numeric —
       * falls back to 500.
       */
      const requestedLimit: number = Number(body["limit"]);
      const limit: number =
        Number.isFinite(requestedLimit) && requestedLimit >= 1
          ? Math.floor(requestedLimit)
          : 500;

      const serviceIds: Array<ObjectID> | undefined = body["serviceIds"]
        ? (body["serviceIds"] as Array<string>).map((id: string) => {
            return new ObjectID(id);
          })
        : undefined;

      const exceptionTypes: Array<string> | undefined = body["exceptionTypes"]
        ? (body["exceptionTypes"] as Array<string>)
        : undefined;

      const environments: Array<string> | undefined = body["environments"]
        ? (body["environments"] as Array<string>)
        : undefined;

      const fingerprints: Array<string> | undefined = body["fingerprints"]
        ? (body["fingerprints"] as Array<string>)
        : undefined;

      const traceIds: Array<string> | undefined = body["traceIds"]
        ? (body["traceIds"] as Array<string>)
        : undefined;

      /*
       * `escaped` stays undefined when the key is absent. A plain
       * Boolean(...) coercion would turn the strings "false" and "0" into
       * true, so string inputs are interpreted explicitly.
       */
      const rawEscaped: unknown = body["escaped"];
      let escaped: boolean | undefined = undefined;

      if (typeof rawEscaped === "string") {
        escaped = !["", "false", "0"].includes(
          rawEscaped.trim().toLowerCase(),
        );
      } else if (rawEscaped !== undefined) {
        escaped = Boolean(rawEscaped);
      }

      const messageSearchText: string | undefined = body["messageSearchText"]
        ? (body["messageSearchText"] as string)
        : undefined;

      /*
       * Per-facet partial-match filter applied at the Postgres source-of-truth
       * lookup stage. Only consulted for resource facets — other facets
       * continue to filter client-side over the loaded value list.
       */
      const facetSearchText: Record<string, string> | undefined = body[
        "facetSearchText"
      ]
        ? (body["facetSearchText"] as Record<string, string>)
        : undefined;

      const projectId: ObjectID = databaseProps.tenantId;

      /*
       * Per-facet ClickHouse query in parallel. Per-facet errors degrade
       * gracefully to [] so a slow / failing facet can't block the others.
       * NOTE(review): the failure is dropped without being logged — consider
       * logging it if a logger is available in this module.
       */
      const facetResults: Array<readonly [string, Array<ExceptionFacetValue>]> =
        await Promise.all(
          facetKeys.map(
            async (
              facetKey: string,
            ): Promise<readonly [string, Array<ExceptionFacetValue>]> => {
              try {
                const request: ExceptionFacetRequest = {
                  projectId,
                  startTime,
                  endTime,
                  facetKey,
                  limit,
                  serviceIds,
                  exceptionTypes,
                  environments,
                  fingerprints,
                  traceIds,
                  escaped,
                  messageSearchText,
                };
                const values: Array<ExceptionFacetValue> =
                  await ExceptionAggregationService.getFacetValues(request);
                return [facetKey, values] as const;
              } catch {
                return [facetKey, [] as Array<ExceptionFacetValue>] as const;
              }
            },
          ),
        );

      const facets: Record<
        string,
        Array<ExceptionFacetValue>
      > = Object.fromEntries(facetResults);

      /*
       * Replace resource-facet results with the Postgres source-of-truth list
       * (filtered by facetSearchText and enriched with displayName). Same
       * pattern as the trace/log facets endpoints. The counts gathered above
       * are handed to the resolver keyed by facet value.
       */
      const resourceSpecs: Array<ResourceFacetSpec> = facetKeys
        .filter((key: string): boolean => {
          return ResourceFacetResolver.isResourceFacet(key);
        })
        .map((key: string): ResourceFacetSpec => {
          const counts: Map<string, number> = new Map();
          for (const fv of facets[key] || []) {
            counts.set(fv.value, fv.count);
          }
          return {
            facetKey: key,
            counts,
            searchText: facetSearchText?.[key],
            limit,
          };
        });

      if (resourceSpecs.length > 0) {
        const resolved: Record<
          string,
          Array<ResolvedFacetValue>
        > = await ResourceFacetResolver.resolve(projectId, resourceSpecs);
        for (const key of Object.keys(resolved)) {
          facets[key] = resolved[key] as Array<ExceptionFacetValue>;
        }
      }

      return Response.sendJsonObjectResponse(req, res, {
        facets: facets as unknown as JSONObject,
      });
    } catch (err: unknown) {
      next(err);
    }
  },
);
|
|
1024
|
+
|
|
1025
|
+
// --- Metric Facets Endpoint ---
|
|
1026
|
+
|
|
1027
|
+
/*
 * POST /telemetry/metrics/facets
 *
 * Returns, for every requested facet key, the list of facet values for the
 * caller's project inside the [startTime, endTime] window. Values come from
 * MetricAggregationService.getFacetValues (one call per facet key, run in
 * parallel); resource facets are then replaced by whatever
 * ResourceFacetResolver.resolve returns for them.
 *
 * Request body (all optional):
 *   facetKeys, serviceIds, metricNames - arrays
 *   startTime, endTime - date strings (default: last 1 hour)
 *   limit              - positive number (default: 500)
 *   facetSearchText    - map of facetKey -> search text, forwarded to the
 *                        resource facet resolver only
 *
 * Responds with { facets: Record<facetKey, Array<facet value>> }. A missing
 * tenant or a malformed list field produces a BadDataException response;
 * any other failure is forwarded to next().
 */
router.post(
  "/telemetry/metrics/facets",
  UserMiddleware.getUserMiddleware,
  async (
    req: ExpressRequest,
    res: ExpressResponse,
    next: NextFunction,
  ): Promise<void> => {
    try {
      const databaseProps: DatabaseCommonInteractionProps =
        await CommonAPI.getDatabaseCommonInteractionProps(req);

      if (!databaseProps?.tenantId) {
        return Response.sendErrorResponse(
          req,
          res,
          new BadDataException("Invalid Project ID"),
        );
      }

      const body: JSONObject = req.body as JSONObject;

      /*
       * Every field below is cast to an array further down. Reject truthy
       * non-array values up front so a malformed request gets a clear
       * BadDataException instead of a TypeError from `.map` / `.filter`.
       */
      const listFields: Array<string> = [
        "facetKeys",
        "serviceIds",
        "metricNames",
      ];

      const malformedFields: Array<string> = listFields.filter(
        (field: string): boolean => {
          return Boolean(body[field]) && !Array.isArray(body[field]);
        },
      );

      if (malformedFields.length > 0) {
        return Response.sendErrorResponse(
          req,
          res,
          new BadDataException(
            `Invalid request body: ${malformedFields.join(", ")} must be an array`,
          ),
        );
      }

      // Without explicit keys only the four resource facets are computed.
      const facetKeys: Array<string> = body["facetKeys"]
        ? (body["facetKeys"] as Array<string>)
        : ["serviceId", "hostId", "dockerHostId", "kubernetesClusterId"];

      // Default window: the last hour.
      const startTime: Date = body["startTime"]
        ? OneUptimeDate.fromString(body["startTime"] as string)
        : OneUptimeDate.addRemoveHours(OneUptimeDate.getCurrentDate(), -1);

      const endTime: Date = body["endTime"]
        ? OneUptimeDate.fromString(body["endTime"] as string)
        : OneUptimeDate.getCurrentDate();

      /*
       * Only a finite value >= 1 is accepted as the limit (floored to an
       * integer); anything else — missing, zero, negative, non-numeric —
       * falls back to 500.
       */
      const requestedLimit: number = Number(body["limit"]);
      const limit: number =
        Number.isFinite(requestedLimit) && requestedLimit >= 1
          ? Math.floor(requestedLimit)
          : 500;

      const serviceIds: Array<ObjectID> | undefined = body["serviceIds"]
        ? (body["serviceIds"] as Array<string>).map((id: string) => {
            return new ObjectID(id);
          })
        : undefined;

      const metricNames: Array<string> | undefined = body["metricNames"]
        ? (body["metricNames"] as Array<string>)
        : undefined;

      // Per-facet search text; only handed to the resource facet resolver.
      const facetSearchText: Record<string, string> | undefined = body[
        "facetSearchText"
      ]
        ? (body["facetSearchText"] as Record<string, string>)
        : undefined;

      const projectId: ObjectID = databaseProps.tenantId;

      /*
       * Per-facet ClickHouse GROUP BY in parallel. Per-facet errors degrade
       * to [] so a slow facet doesn't block the rest.
       * NOTE(review): the failure is dropped without being logged — consider
       * logging it if a logger is available in this module.
       */
      const facetResults: Array<readonly [string, Array<MetricFacetValue>]> =
        await Promise.all(
          facetKeys.map(
            async (
              facetKey: string,
            ): Promise<readonly [string, Array<MetricFacetValue>]> => {
              try {
                const request: MetricFacetRequest = {
                  projectId,
                  startTime,
                  endTime,
                  facetKey,
                  limit,
                  serviceIds,
                  metricNames,
                };
                const values: Array<MetricFacetValue> =
                  await MetricAggregationService.getFacetValues(request);
                return [facetKey, values] as const;
              } catch {
                return [facetKey, [] as Array<MetricFacetValue>] as const;
              }
            },
          ),
        );

      const facets: Record<
        string,
        Array<MetricFacetValue>
      > = Object.fromEntries(facetResults);

      /*
       * Replace resource-facet results with the Postgres source-of-truth list
       * (filtered by facetSearchText and enriched with displayName). Same
       * pattern as the trace / log / exception facets endpoints. The counts
       * gathered above are handed to the resolver keyed by facet value.
       */
      const resourceSpecs: Array<ResourceFacetSpec> = facetKeys
        .filter((key: string): boolean => {
          return ResourceFacetResolver.isResourceFacet(key);
        })
        .map((key: string): ResourceFacetSpec => {
          const counts: Map<string, number> = new Map();
          for (const fv of facets[key] || []) {
            counts.set(fv.value, fv.count);
          }
          return {
            facetKey: key,
            counts,
            searchText: facetSearchText?.[key],
            limit,
          };
        });

      if (resourceSpecs.length > 0) {
        const resolved: Record<
          string,
          Array<ResolvedFacetValue>
        > = await ResourceFacetResolver.resolve(projectId, resourceSpecs);
        for (const key of Object.keys(resolved)) {
          facets[key] = resolved[key] as Array<MetricFacetValue>;
        }
      }

      return Response.sendJsonObjectResponse(req, res, {
        facets: facets as unknown as JSONObject,
      });
    } catch (err: unknown) {
      next(err);
    }
  },
);
|
|
1156
|
+
|
|
751
1157
|
// --- Log Analytics Endpoint ---
|
|
752
1158
|
|
|
753
1159
|
router.post(
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
import { MigrationInterface, QueryRunner } from "typeorm";
|
|
2
|
+
|
|
3
|
+
/**
 * Renames the composite unique index on
 * "TelemetryException" ("projectId", "serviceId", "fingerprint") from
 * "IDX_telemetry_exception_project_service_fingerprint" to
 * "IDX_1f55d43a0b73e883bb226158c7", and rewrites the column defaults of
 * "OnCallDutyPolicyScheduleLayer"."rotation" / "restrictionTimes" to the
 * compact JSON spelling.
 *
 * NOTE(review): the index dropped in up() is created by
 * DedupeTelemetryExceptionsAndAddUniqueIndex1779900000000, whose timestamp
 * suffix is HIGHER than this migration's. If the migration runner orders
 * pending migrations by that timestamp (TypeORM appears to — confirm), this
 * migration runs first and the index does not exist yet. The index statements
 * therefore use IF EXISTS / IF NOT EXISTS so they never abort the run on a
 * missing or already-present index.
 *
 * NOTE(review): if this does run before the dedupe migration, the
 * CREATE UNIQUE INDEX in up() can still fail on databases that hold duplicate
 * (projectId, serviceId, fingerprint) rows — verify the intended ordering.
 */
export class MigrationName1779879993421 implements MigrationInterface {
    public name: string = 'MigrationName1779879993421';

    /** Applies the index rename and the default-value rewrites. */
    public async up(queryRunner: QueryRunner): Promise<void> {
        // Tolerate the hand-named index being absent (see class note).
        await queryRunner.query(`DROP INDEX IF EXISTS "public"."IDX_telemetry_exception_project_service_fingerprint"`);
        await queryRunner.query(`ALTER TABLE "OnCallDutyPolicyScheduleLayer" ALTER COLUMN "rotation" SET DEFAULT '{"_type":"Recurring","value":{"intervalType":"Day","intervalCount":{"_type":"PositiveNumber","value":1}}}'`);
        await queryRunner.query(`ALTER TABLE "OnCallDutyPolicyScheduleLayer" ALTER COLUMN "restrictionTimes" SET DEFAULT '{"_type":"RestrictionTimes","value":{"restictionType":"None","dayRestrictionTimes":null,"weeklyRestrictionTimes":[]}}'`);
        await queryRunner.query(`CREATE UNIQUE INDEX IF NOT EXISTS "IDX_1f55d43a0b73e883bb226158c7" ON "TelemetryException" ("projectId", "serviceId", "fingerprint") `);
    }

    /** Reverses up(): same statements in the opposite order, restoring the previous index name and defaults. */
    public async down(queryRunner: QueryRunner): Promise<void> {
        await queryRunner.query(`DROP INDEX IF EXISTS "public"."IDX_1f55d43a0b73e883bb226158c7"`);
        await queryRunner.query(`ALTER TABLE "OnCallDutyPolicyScheduleLayer" ALTER COLUMN "restrictionTimes" SET DEFAULT '{"_type": "RestrictionTimes", "value": {"restictionType": "None", "dayRestrictionTimes": null, "weeklyRestrictionTimes": []}}'`);
        await queryRunner.query(`ALTER TABLE "OnCallDutyPolicyScheduleLayer" ALTER COLUMN "rotation" SET DEFAULT '{"_type": "Recurring", "value": {"intervalType": "Day", "intervalCount": {"_type": "PositiveNumber", "value": 1}}}'`);
        await queryRunner.query(`CREATE UNIQUE INDEX IF NOT EXISTS "IDX_telemetry_exception_project_service_fingerprint" ON "TelemetryException" ("projectId", "serviceId", "fingerprint") `);
    }
}
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
import { MigrationInterface, QueryRunner } from "typeorm";
|
|
2
|
+
|
|
3
|
+
/*
|
|
4
|
+
* The OTel traces ingest path used to call
|
|
5
|
+
* ExceptionUtil.saveOrUpdateTelemetryException once per exception
|
|
6
|
+
* event with a findOneBy + updateOneBy/create pair, fire-and-forget,
|
|
7
|
+
* from inside the span loop. That has three problems we are fixing
|
|
8
|
+
* in tandem with this schema change:
|
|
9
|
+
*
|
|
10
|
+
* 1. Cost: each event is a Postgres round-trip. A worker batch
|
|
11
|
+
* with thousands of exception events drives thousands of
|
|
12
|
+
* parallel SELECT/UPDATE statements and starves the pool.
|
|
13
|
+
* 2. Lost increments: `occuranceCount = existing.occuranceCount + 1`
|
|
14
|
+
* is read-modify-write at the JS layer, so two workers
|
|
15
|
+
* seeing the same row at the same instant collapse to a
|
|
16
|
+
* single +1 instead of +2.
|
|
17
|
+
* 3. Duplicate rows: two workers both missing the row at the
|
|
18
|
+
* same time both INSERT, with no DB-level guard, producing
|
|
19
|
+
* two TelemetryException rows for the same fingerprint.
|
|
20
|
+
*
|
|
21
|
+
* The ingest path is moving to a single batched
|
|
22
|
+
* INSERT … ON CONFLICT ("projectId", "serviceId", "fingerprint")
|
|
23
|
+
* DO UPDATE SET "occuranceCount" =
|
|
24
|
+
* "TelemetryException"."occuranceCount" + EXCLUDED."occuranceCount",
|
|
25
|
+
* ...
|
|
26
|
+
* statement per worker batch, which needs the composite unique
|
|
27
|
+
* index this migration creates. Before we can create the index we
|
|
28
|
+
* have to clear out the duplicate rows produced by the legacy race
|
|
29
|
+
* (problem 3 above) — otherwise the CREATE UNIQUE INDEX would fail
|
|
30
|
+
* on production data.
|
|
31
|
+
*
|
|
32
|
+
* Strategy: pick one survivor per (projectId, serviceId, fingerprint)
|
|
33
|
+
* group and hard-delete the rest. We do NOT try to merge
|
|
34
|
+
* occuranceCount / firstSeenAt / lastSeenAt from the losers into the
|
|
35
|
+
* survivor — the simpler delete-only approach trades a small,
|
|
36
|
+
* one-time count discrepancy on duplicated fingerprints for a much
|
|
37
|
+
* simpler migration that is easy to reason about and roll back. The
|
|
38
|
+
* next exception occurrence for that fingerprint will re-increment
|
|
39
|
+
* the survivor via the new ON CONFLICT upsert, and the dashboard
|
|
40
|
+
* recovers within seconds.
|
|
41
|
+
*
|
|
42
|
+
* Survivor selection prefers the row that was carrying the most
|
|
43
|
+
* traffic before the unique index landed, because in the legacy
|
|
44
|
+
* code path `findOneBy` returned an implementation-defined row and
|
|
45
|
+
* all subsequent UPDATEs piled into that one — discarding it would
|
|
46
|
+
* be the most lossy choice. Order is:
|
|
47
|
+
* 1. Highest occuranceCount (the "real" row absorbing updates).
|
|
48
|
+
* 2. Most recent lastSeenAt (in case counts are tied).
|
|
49
|
+
* 3. Non-deleted before deleted (live data beats soft-deleted).
|
|
50
|
+
* 4. Smallest _id as a deterministic tiebreaker so re-runs pick
|
|
51
|
+
* the same survivor.
|
|
52
|
+
*
|
|
53
|
+
* TelemetryException is a leaf table — no other table holds an FK
|
|
54
|
+
* referencing it — so we do not need to reparent anything before
|
|
55
|
+
* deleting loser rows. NULL-fingerprint rows are left alone; the
|
|
56
|
+
* composite unique index treats NULLs as distinct, and the new
|
|
57
|
+
* ingest path never produces a NULL fingerprint anyway.
|
|
58
|
+
*/
|
|
59
|
+
/**
 * Removes duplicate "TelemetryException" rows per
 * (projectId, serviceId, fingerprint) and then creates the composite unique
 * index "IDX_telemetry_exception_project_service_fingerprint" on those three
 * columns.
 *
 * Both index statements are idempotent (IF NOT EXISTS / IF EXISTS): the
 * sibling migration MigrationName1779879993421 drops and recreates an index
 * under this same name, so either statement may find the index already
 * present or already gone depending on which migrations have run.
 */
export class DedupeTelemetryExceptionsAndAddUniqueIndex1779900000000
  implements MigrationInterface
{
  public name: string =
    "DedupeTelemetryExceptionsAndAddUniqueIndex1779900000000";

  /** Deletes non-survivor duplicates, then creates the unique index. */
  public async up(queryRunner: QueryRunner): Promise<void> {
    /*
     * 1. Delete every row that is not the chosen survivor for its group.
     *
     * `survivors` keeps one _id per (projectId, serviceId, fingerprint),
     * ordered by highest occuranceCount, then latest lastSeenAt, then
     * non-deleted first, then smallest _id.
     *
     * The EXISTS clause is not redundant: DISTINCT ON groups NULL keys
     * together, but the `=` comparisons in the sub-select never match a
     * NULL "projectId" / "serviceId", so rows carrying a NULL in either
     * column are never deleted. Rows with a NULL "fingerprint" are excluded
     * by the WHERE clauses.
     */
    await queryRunner.query(`
      WITH survivors AS (
        SELECT DISTINCT ON ("projectId", "serviceId", "fingerprint")
          _id AS survivor_id
        FROM "TelemetryException"
        WHERE "fingerprint" IS NOT NULL
        ORDER BY
          "projectId",
          "serviceId",
          "fingerprint",
          COALESCE("occuranceCount", 0) DESC,
          "lastSeenAt" DESC NULLS LAST,
          CASE WHEN "deletedAt" IS NULL THEN 0 ELSE 1 END,
          _id ASC
      )
      DELETE FROM "TelemetryException" te
      WHERE te."fingerprint" IS NOT NULL
        AND te._id NOT IN (SELECT survivor_id FROM survivors)
        AND EXISTS (
          SELECT 1
          FROM "TelemetryException" t2
          WHERE t2."projectId" = te."projectId"
            AND t2."serviceId" = te."serviceId"
            AND t2."fingerprint" = te."fingerprint"
            AND t2._id <> te._id
        );
    `);

    /*
     * 2. Create the DB-level composite unique index. Matches the
     *    @Index decorator on TelemetryException and is the conflict
     *    target for the batched upsert in ExceptionUtil.
     *    IF NOT EXISTS keeps a re-run from failing on an existing index.
     */
    await queryRunner.query(
      `CREATE UNIQUE INDEX IF NOT EXISTS "IDX_telemetry_exception_project_service_fingerprint" ON "TelemetryException" ("projectId", "serviceId", "fingerprint") `,
    );
  }

  /** Drops the unique index; deleted duplicate rows are not restored. */
  public async down(queryRunner: QueryRunner): Promise<void> {
    // IF EXISTS: the index may already have been dropped by the sibling migration.
    await queryRunner.query(
      `DROP INDEX IF EXISTS "public"."IDX_telemetry_exception_project_service_fingerprint"`,
    );
    /*
     * The duplicate rows deleted in up() are not resurrectable from
     * a down-migration, and recreating them is not desirable — they
     * only existed because of a race the unique index now prevents.
     */
  }
}
|
|
@@ -354,6 +354,8 @@ import { AttachServiceToScheduledMaintenanceTemplatesAndLabelRules1779742211961
|
|
|
354
354
|
import { MigrationName1779790539196 } from "./1779790539196-MigrationName";
|
|
355
355
|
import { ExpandOwnerRuleInheritFlags1779823516881 } from "./1779823516881-ExpandOwnerRuleInheritFlags";
|
|
356
356
|
import { RenameStatusPageZhToZhCN1779827700000 } from "./1779827700000-RenameStatusPageZhToZhCN";
|
|
357
|
+
import { MigrationName1779879993421 } from "./1779879993421-MigrationName";
|
|
358
|
+
import { DedupeTelemetryExceptionsAndAddUniqueIndex1779900000000 } from "./1779900000000-DedupeTelemetryExceptionsAndAddUniqueIndex";
|
|
357
359
|
export default [
|
|
358
360
|
InitialMigration,
|
|
359
361
|
MigrationName1717678334852,
|
|
@@ -711,4 +713,6 @@ export default [
|
|
|
711
713
|
MigrationName1779790539196,
|
|
712
714
|
ExpandOwnerRuleInheritFlags1779823516881,
|
|
713
715
|
RenameStatusPageZhToZhCN1779827700000,
|
|
716
|
+
DedupeTelemetryExceptionsAndAddUniqueIndex1779900000000,
|
|
717
|
+
MigrationName1779879993421
|
|
714
718
|
];
|