@oneuptime/common 10.5.0 → 10.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Models/DatabaseModels/TelemetryException.ts +10 -0
- package/Server/API/TelemetryAPI.ts +406 -0
- package/Server/Infrastructure/Postgres/SchemaMigrations/1779879993421-MigrationName.ts +20 -0
- package/Server/Infrastructure/Postgres/SchemaMigrations/1779900000000-DedupeTelemetryExceptionsAndAddUniqueIndex.ts +115 -0
- package/Server/Infrastructure/Postgres/SchemaMigrations/Index.ts +4 -0
- package/Server/Services/ExceptionAggregationService.ts +51 -3
- package/Server/Services/LogAggregationService.ts +1 -0
- package/Server/Services/MetricAggregationService.ts +227 -0
- package/Server/Services/OpenTelemetryIngestService.ts +101 -1
- package/Server/Services/TraceAggregationService.ts +1 -0
- package/Server/Utils/Monitor/MonitorLogUtil.ts +146 -6
- package/Server/Utils/Telemetry/ResourceFacetResolver.ts +299 -0
- package/UI/Components/LogsViewer/LogsViewer.tsx +10 -0
- package/UI/Components/LogsViewer/components/FacetSection.tsx +40 -3
- package/UI/Components/LogsViewer/components/LogsFacetSidebar.tsx +23 -0
- package/UI/Components/LogsViewer/types.ts +2 -0
- package/UI/Components/TelemetryViewer/TelemetryViewer.tsx +8 -0
- package/UI/Components/TelemetryViewer/components/TelemetryFacetSection.tsx +49 -3
- package/UI/Components/TelemetryViewer/components/TelemetryFacetSidebar.tsx +16 -0
- package/UI/Components/TelemetryViewer/types.ts +12 -0
- package/build/dist/Models/DatabaseModels/TelemetryException.js +11 -0
- package/build/dist/Models/DatabaseModels/TelemetryException.js.map +1 -1
- package/build/dist/Server/API/TelemetryAPI.js +285 -0
- package/build/dist/Server/API/TelemetryAPI.js.map +1 -1
- package/build/dist/Server/Infrastructure/Postgres/SchemaMigrations/1779879993421-MigrationName.js +18 -0
- package/build/dist/Server/Infrastructure/Postgres/SchemaMigrations/1779879993421-MigrationName.js.map +1 -0
- package/build/dist/Server/Infrastructure/Postgres/SchemaMigrations/1779900000000-DedupeTelemetryExceptionsAndAddUniqueIndex.js +106 -0
- package/build/dist/Server/Infrastructure/Postgres/SchemaMigrations/1779900000000-DedupeTelemetryExceptionsAndAddUniqueIndex.js.map +1 -0
- package/build/dist/Server/Infrastructure/Postgres/SchemaMigrations/Index.js +4 -0
- package/build/dist/Server/Infrastructure/Postgres/SchemaMigrations/Index.js.map +1 -1
- package/build/dist/Server/Services/ExceptionAggregationService.js +44 -4
- package/build/dist/Server/Services/ExceptionAggregationService.js.map +1 -1
- package/build/dist/Server/Services/LogAggregationService.js.map +1 -1
- package/build/dist/Server/Services/MetricAggregationService.js +159 -0
- package/build/dist/Server/Services/MetricAggregationService.js.map +1 -0
- package/build/dist/Server/Services/OpenTelemetryIngestService.js +60 -3
- package/build/dist/Server/Services/OpenTelemetryIngestService.js.map +1 -1
- package/build/dist/Server/Services/TraceAggregationService.js.map +1 -1
- package/build/dist/Server/Utils/Monitor/MonitorLogUtil.js +127 -4
- package/build/dist/Server/Utils/Monitor/MonitorLogUtil.js.map +1 -1
- package/build/dist/Server/Utils/Telemetry/ResourceFacetResolver.js +204 -0
- package/build/dist/Server/Utils/Telemetry/ResourceFacetResolver.js.map +1 -0
- package/build/dist/UI/Components/LogsViewer/LogsViewer.js +1 -1
- package/build/dist/UI/Components/LogsViewer/LogsViewer.js.map +1 -1
- package/build/dist/UI/Components/LogsViewer/components/FacetSection.js +26 -6
- package/build/dist/UI/Components/LogsViewer/components/FacetSection.js.map +1 -1
- package/build/dist/UI/Components/LogsViewer/components/LogsFacetSidebar.js +12 -1
- package/build/dist/UI/Components/LogsViewer/components/LogsFacetSidebar.js.map +1 -1
- package/build/dist/UI/Components/LogsViewer/types.js.map +1 -1
- package/build/dist/UI/Components/TelemetryViewer/TelemetryViewer.js +1 -1
- package/build/dist/UI/Components/TelemetryViewer/TelemetryViewer.js.map +1 -1
- package/build/dist/UI/Components/TelemetryViewer/components/TelemetryFacetSection.js +32 -6
- package/build/dist/UI/Components/TelemetryViewer/components/TelemetryFacetSection.js.map +1 -1
- package/build/dist/UI/Components/TelemetryViewer/components/TelemetryFacetSidebar.js +6 -1
- package/build/dist/UI/Components/TelemetryViewer/components/TelemetryFacetSidebar.js.map +1 -1
- package/package.json +1 -1
|
@@ -8,6 +8,7 @@ import Includes from "../../Types/BaseDatabase/Includes";
|
|
|
8
8
|
import AnalyticsTableName from "../../Types/AnalyticsDatabase/AnalyticsTableName";
|
|
9
9
|
import CaptureSpan from "../Utils/Telemetry/CaptureSpan";
|
|
10
10
|
import { DbJSONResponse, Results } from "./AnalyticsDatabaseService";
|
|
11
|
+
import ServiceType from "../../Types/Telemetry/ServiceType";
|
|
11
12
|
|
|
12
13
|
export interface HistogramBucket {
|
|
13
14
|
time: string;
|
|
@@ -35,6 +36,7 @@ export interface HistogramRequest extends ExceptionFilters {
|
|
|
35
36
|
export interface FacetValue {
|
|
36
37
|
value: string;
|
|
37
38
|
count: number;
|
|
39
|
+
displayName?: string | undefined;
|
|
38
40
|
}
|
|
39
41
|
|
|
40
42
|
export interface FacetRequest extends ExceptionFilters {
|
|
@@ -59,6 +61,18 @@ export class ExceptionAggregationService {
|
|
|
59
61
|
"escaped",
|
|
60
62
|
"release",
|
|
61
63
|
]);
|
|
64
|
+
/*
|
|
65
|
+
* Virtual facet keys — same scheme as TraceAggregationService /
|
|
66
|
+
* LogAggregationService. The `serviceId` slot is reused for host /
|
|
67
|
+
* docker host / k8s cluster ids, disambiguated by the `serviceType`
|
|
68
|
+
* discriminator column on each ExceptionInstance row.
|
|
69
|
+
*/
|
|
70
|
+
private static readonly RESOURCE_FACET_KEYS: Map<string, ServiceType> =
|
|
71
|
+
new Map([
|
|
72
|
+
["hostId", ServiceType.Host],
|
|
73
|
+
["dockerHostId", ServiceType.DockerHost],
|
|
74
|
+
["kubernetesClusterId", ServiceType.KubernetesCluster],
|
|
75
|
+
]);
|
|
62
76
|
private static readonly ATTRIBUTE_KEY_PATTERN: RegExp = /^[a-zA-Z0-9._:/-]+$/;
|
|
63
77
|
private static readonly MAX_FACET_KEY_LENGTH: number = 256;
|
|
64
78
|
|
|
@@ -169,12 +183,24 @@ export class ExceptionAggregationService {
|
|
|
169
183
|
|
|
170
184
|
ExceptionAggregationService.validateFacetKey(request.facetKey);
|
|
171
185
|
|
|
186
|
+
const resourceServiceType: ServiceType | undefined =
|
|
187
|
+
ExceptionAggregationService.RESOURCE_FACET_KEYS.get(request.facetKey);
|
|
188
|
+
const isResourceFacet: boolean = resourceServiceType !== undefined;
|
|
172
189
|
const isTopLevelColumn: boolean =
|
|
190
|
+
isResourceFacet ||
|
|
173
191
|
ExceptionAggregationService.isTopLevelColumn(request.facetKey);
|
|
174
192
|
|
|
175
193
|
const statement: Statement = new Statement();
|
|
176
194
|
|
|
177
|
-
if (
|
|
195
|
+
if (isResourceFacet) {
|
|
196
|
+
/*
|
|
197
|
+
* Virtual facet — group serviceId values whose row carries the matching
|
|
198
|
+
* ServiceType discriminator (Host / DockerHost / KubernetesCluster).
|
|
199
|
+
*/
|
|
200
|
+
statement.append(
|
|
201
|
+
SQL`SELECT toString(serviceId) AS val, count() AS cnt FROM ${ExceptionAggregationService.TABLE_NAME}`,
|
|
202
|
+
);
|
|
203
|
+
} else if (isTopLevelColumn) {
|
|
178
204
|
statement.append(
|
|
179
205
|
SQL`SELECT toString(${request.facetKey}) AS val, count() AS cnt FROM ${ExceptionAggregationService.TABLE_NAME}`,
|
|
180
206
|
);
|
|
@@ -200,7 +226,26 @@ export class ExceptionAggregationService {
|
|
|
200
226
|
}}`,
|
|
201
227
|
);
|
|
202
228
|
|
|
203
|
-
if (
|
|
229
|
+
if (isResourceFacet) {
|
|
230
|
+
statement.append(
|
|
231
|
+
SQL` AND serviceType = ${{
|
|
232
|
+
type: TableColumnType.Text,
|
|
233
|
+
value: resourceServiceType as string,
|
|
234
|
+
}}`,
|
|
235
|
+
);
|
|
236
|
+
} else if (request.facetKey === "serviceId") {
|
|
237
|
+
/*
|
|
238
|
+
* Constrain the canonical Services facet to rows that actually
|
|
239
|
+
* belong to a Service. An empty ('') serviceType covers legacy rows
|
|
240
|
+
* ingested before the discriminator existed.
|
|
241
|
+
*/
|
|
242
|
+
statement.append(
|
|
243
|
+
SQL` AND (serviceType = '' OR serviceType = ${{
|
|
244
|
+
type: TableColumnType.Text,
|
|
245
|
+
value: ServiceType.OpenTelemetry as string,
|
|
246
|
+
}})`,
|
|
247
|
+
);
|
|
248
|
+
} else if (!isTopLevelColumn) {
|
|
204
249
|
statement.append(
|
|
205
250
|
SQL` AND JSONHas(attributes, ${{
|
|
206
251
|
type: TableColumnType.Text,
|
|
@@ -322,7 +367,10 @@ export class ExceptionAggregationService {
|
|
|
322
367
|
throw new BadDataException("Invalid facetKey");
|
|
323
368
|
}
|
|
324
369
|
|
|
325
|
-
if (
|
|
370
|
+
if (
|
|
371
|
+
ExceptionAggregationService.isTopLevelColumn(facetKey) ||
|
|
372
|
+
ExceptionAggregationService.RESOURCE_FACET_KEYS.has(facetKey)
|
|
373
|
+
) {
|
|
326
374
|
return;
|
|
327
375
|
}
|
|
328
376
|
|
|
@@ -0,0 +1,227 @@
|
|
|
1
|
+
import { SQL, Statement } from "../Utils/AnalyticsDatabase/Statement";
|
|
2
|
+
import MetricService from "./MetricService";
|
|
3
|
+
import TableColumnType from "../../Types/AnalyticsDatabase/TableColumnType";
|
|
4
|
+
import { JSONObject } from "../../Types/JSON";
|
|
5
|
+
import ObjectID from "../../Types/ObjectID";
|
|
6
|
+
import BadDataException from "../../Types/Exception/BadDataException";
|
|
7
|
+
import Includes from "../../Types/BaseDatabase/Includes";
|
|
8
|
+
import AnalyticsTableName from "../../Types/AnalyticsDatabase/AnalyticsTableName";
|
|
9
|
+
import CaptureSpan from "../Utils/Telemetry/CaptureSpan";
|
|
10
|
+
import { DbJSONResponse, Results } from "./AnalyticsDatabaseService";
|
|
11
|
+
import ServiceType from "../../Types/Telemetry/ServiceType";
|
|
12
|
+
|
|
13
|
+
/*
 * One bucket of a facet aggregation: a distinct facet value and the
 * number of metric rows that carried it.
 */
export interface FacetValue {
  // Stringified facet value (the `val` column of the facet query).
  value: string;
  // Number of matching rows (the `cnt` column of the facet query).
  count: number;
  // Optional human-readable label; not populated by getFacetValues itself.
  displayName?: string | undefined;
}
|
|
18
|
+
|
|
19
|
+
/*
 * Optional narrowing filters shared by metric aggregation queries.
 * A filter that is undefined or an empty array is not applied.
 */
export interface MetricFilters {
  // Restrict to rows whose serviceId is one of these ids.
  serviceIds?: Array<ObjectID> | undefined;
  // Restrict to rows whose metric name is one of these names.
  metricNames?: Array<string> | undefined;
}
|
|
23
|
+
|
|
24
|
+
/*
 * Input for MetricAggregationService.getFacetValues: the project and
 * time window to scan, the facet to group by, and an optional row cap.
 */
export interface FacetRequest extends MetricFilters {
  // Project whose metric rows are aggregated.
  projectId: ObjectID;
  // Inclusive lower bound on the row's `time` column.
  startTime: Date;
  // Inclusive upper bound on the row's `time` column.
  endTime: Date;
  // Top-level column, virtual resource facet key, or attribute key.
  facetKey: string;
  // Maximum number of facet values returned; the service default applies when omitted.
  limit?: number | undefined;
}
|
|
31
|
+
|
|
32
|
+
/*
 * Facet aggregation for the Metrics page sidebar. Each request runs a
 * single GROUP BY over the Metric analytics table and returns the
 * distinct values of one facet with their row counts.
 *
 * A facet key is one of:
 *  - a top-level column (`serviceId`, `name`);
 *  - a virtual resource facet (`hostId`, `dockerHostId`,
 *    `kubernetesClusterId`), which groups the `serviceId` column but
 *    only over rows whose `serviceType` matches the mapped ServiceType;
 *  - any other key, which is read out of the `attributes` JSON column.
 */
export class MetricAggregationService {
  private static readonly DEFAULT_FACET_LIMIT: number = 500;
  private static readonly TABLE_NAME: string = AnalyticsTableName.Metric;
  private static readonly TOP_LEVEL_COLUMNS: Set<string> = new Set([
    "serviceId",
    "name",
  ]);
  private static readonly RESOURCE_FACET_KEYS: Map<string, ServiceType> =
    new Map([
      ["hostId", ServiceType.Host],
      ["dockerHostId", ServiceType.DockerHost],
      ["kubernetesClusterId", ServiceType.KubernetesCluster],
    ]);
  private static readonly ATTRIBUTE_KEY_PATTERN: RegExp = /^[a-zA-Z0-9._:/-]+$/;
  private static readonly MAX_FACET_KEY_LENGTH: number = 256;

  /*
   * Runs the facet query and maps each result row to a FacetValue.
   * Rows whose value stringifies to "" are dropped. Throws
   * BadDataException when request.facetKey fails validation.
   */
  @CaptureSpan()
  public static async getFacetValues(
    request: FacetRequest,
  ): Promise<Array<FacetValue>> {
    const dbResult: Results = await MetricService.executeQuery(
      MetricAggregationService.buildFacetStatement(request),
    );
    const response: DbJSONResponse = await dbResult.json<{
      data?: Array<JSONObject>;
    }>();

    const facets: Array<FacetValue> = [];

    for (const row of response.data || []) {
      const facetValue: string = String(row["val"] || "");

      if (facetValue.length === 0) {
        continue;
      }

      facets.push({
        value: facetValue,
        count: Number(row["cnt"] || 0),
      });
    }

    return facets;
  }

  /*
   * Assembles the full facet query: SELECT, project/time window,
   * facet-specific scope, optional filters, GROUP BY / LIMIT and the
   * execution-time settings.
   */
  private static buildFacetStatement(request: FacetRequest): Statement {
    MetricAggregationService.validateFacetKey(request.facetKey);

    const facetKey: string = request.facetKey;
    const rowLimit: number =
      request.limit ?? MetricAggregationService.DEFAULT_FACET_LIMIT;
    const resourceServiceType: ServiceType | undefined =
      MetricAggregationService.RESOURCE_FACET_KEYS.get(facetKey);

    const statement: Statement = new Statement();

    MetricAggregationService.appendSelectClause(
      statement,
      facetKey,
      resourceServiceType !== undefined,
    );

    statement.append(
      SQL` WHERE projectId = ${{
        type: TableColumnType.ObjectID,
        value: request.projectId,
      }} AND time >= ${{
        type: TableColumnType.Date,
        value: request.startTime,
      }} AND time <= ${{
        type: TableColumnType.Date,
        value: request.endTime,
      }}`,
    );

    MetricAggregationService.appendFacetScope(
      statement,
      facetKey,
      resourceServiceType,
    );

    MetricAggregationService.appendCommonFilters(statement, request);

    statement.append(
      SQL` GROUP BY val ORDER BY cnt DESC LIMIT ${{
        type: TableColumnType.Number,
        value: rowLimit,
      }}`,
    );

    /*
     * Defense in depth: keep the query's runtime under nginx's 60s
     * proxy_read_timeout so a slow facet never starves the endpoint.
     */
    statement.append(
      " SETTINGS max_execution_time = 45, timeout_overflow_mode = 'break'",
    );

    return statement;
  }

  /*
   * Appends the SELECT ... FROM part. Resource facets always group the
   * serviceId column; top-level columns group themselves; everything
   * else is extracted from the attributes JSON.
   */
  private static appendSelectClause(
    statement: Statement,
    facetKey: string,
    isResourceFacet: boolean,
  ): void {
    if (isResourceFacet) {
      statement.append(
        SQL`SELECT toString(serviceId) AS val, count() AS cnt FROM ${MetricAggregationService.TABLE_NAME}`,
      );
      return;
    }

    if (MetricAggregationService.isTopLevelColumn(facetKey)) {
      statement.append(
        SQL`SELECT toString(${facetKey}) AS val, count() AS cnt FROM ${MetricAggregationService.TABLE_NAME}`,
      );
      return;
    }

    statement.append(
      SQL`SELECT JSONExtractRaw(attributes, ${{
        type: TableColumnType.Text,
        value: facetKey,
      }}) AS val, count() AS cnt FROM ${MetricAggregationService.TABLE_NAME}`,
    );
  }

  /*
   * Appends the facet-specific AND condition that follows the
   * project/time window:
   *  - resource facet: serviceType must equal the mapped ServiceType;
   *  - serviceId: serviceType must be '' or OpenTelemetry;
   *  - attribute key: the key must be present in attributes;
   *  - any other top-level column: nothing is appended.
   */
  private static appendFacetScope(
    statement: Statement,
    facetKey: string,
    resourceServiceType: ServiceType | undefined,
  ): void {
    if (resourceServiceType !== undefined) {
      statement.append(
        SQL` AND serviceType = ${{
          type: TableColumnType.Text,
          value: resourceServiceType as string,
        }}`,
      );
      return;
    }

    if (facetKey === "serviceId") {
      statement.append(
        SQL` AND (serviceType = '' OR serviceType = ${{
          type: TableColumnType.Text,
          value: ServiceType.OpenTelemetry as string,
        }})`,
      );
      return;
    }

    if (MetricAggregationService.isTopLevelColumn(facetKey)) {
      return;
    }

    statement.append(
      SQL` AND JSONHas(attributes, ${{
        type: TableColumnType.Text,
        value: facetKey,
      }}) = 1`,
    );
  }

  /*
   * Appends the optional serviceId / metric name IN (...) filters.
   * A filter that is missing or empty contributes nothing.
   */
  private static appendCommonFilters(
    statement: Statement,
    request: MetricFilters,
  ): void {
    const serviceIds: Array<ObjectID> = request.serviceIds || [];

    if (serviceIds.length > 0) {
      const serviceIdStrings: Array<string> = serviceIds.map(
        (serviceId: ObjectID) => {
          return serviceId.toString();
        },
      );

      statement.append(
        SQL` AND serviceId IN (${{
          type: TableColumnType.ObjectID,
          value: new Includes(serviceIdStrings),
        }})`,
      );
    }

    const metricNames: Array<string> = request.metricNames || [];

    if (metricNames.length > 0) {
      statement.append(
        SQL` AND name IN (${{
          type: TableColumnType.Text,
          value: new Includes(metricNames),
        }})`,
      );
    }
  }

  // True when the key names a real column of the Metric table.
  private static isTopLevelColumn(key: string): boolean {
    return MetricAggregationService.TOP_LEVEL_COLUMNS.has(key);
  }

  /*
   * Asserts that facetKey is a non-empty string of at most
   * MAX_FACET_KEY_LENGTH characters and is either a known column, a
   * virtual resource facet key, or matches ATTRIBUTE_KEY_PATTERN.
   * Throws BadDataException("Invalid facetKey") otherwise.
   */
  private static validateFacetKey(
    facetKey: unknown,
  ): asserts facetKey is string {
    if (
      typeof facetKey !== "string" ||
      facetKey.length === 0 ||
      facetKey.length > MetricAggregationService.MAX_FACET_KEY_LENGTH
    ) {
      throw new BadDataException("Invalid facetKey");
    }

    const isKnownKey: boolean =
      MetricAggregationService.isTopLevelColumn(facetKey) ||
      MetricAggregationService.RESOURCE_FACET_KEYS.has(facetKey);

    if (
      !isKnownKey &&
      !MetricAggregationService.ATTRIBUTE_KEY_PATTERN.test(facetKey)
    ) {
      throw new BadDataException("Invalid facetKey");
    }
  }
}
|
|
226
|
+
|
|
227
|
+
export default MetricAggregationService;
|
|
@@ -23,6 +23,7 @@ import KubernetesCluster from "../../Models/DatabaseModels/KubernetesCluster";
|
|
|
23
23
|
import HostService from "./HostService";
|
|
24
24
|
import DockerHostService from "./DockerHostService";
|
|
25
25
|
import KubernetesClusterService from "./KubernetesClusterService";
|
|
26
|
+
import GlobalCache from "../Infrastructure/GlobalCache";
|
|
26
27
|
|
|
27
28
|
export enum OtelAggregationTemporality {
|
|
28
29
|
Cumulative = "AGGREGATION_TEMPORALITY_CUMULATIVE",
|
|
@@ -52,11 +53,83 @@ interface ProjectRetentionContext {
|
|
|
52
53
|
projectRetentionInDays: number;
|
|
53
54
|
}
|
|
54
55
|
|
|
56
|
+
/*
|
|
57
|
+
* Per-process memo holding the project's telemetry retention
|
|
58
|
+
* context for the duration of the cache TTL. The L2 GlobalCache
|
|
59
|
+
* (Redis, shared across workers) is the source of truth for
|
|
60
|
+
* cross-process freshness; this L1 in-process Map exists so the
|
|
61
|
+
* dozens of getProjectRetentionContext calls per worker batch
|
|
62
|
+
* (one per resource span -> Service/Host/DockerHost/Kubernetes
|
|
63
|
+
* resolution) collapse to zero network round-trips after the
|
|
64
|
+
* first one warms it. Each layer applies the same TTL independently.
|
|
65
|
+
*/
|
|
66
|
+
// Entry stored in the in-process retention memo.
interface CachedRetentionContext {
  // The retention context handed back to callers on a memo hit.
  context: ProjectRetentionContext;
  // Epoch milliseconds after which this entry is treated as stale.
  expiresAtMs: number;
}
|
|
70
|
+
|
|
71
|
+
const PROJECT_RETENTION_CACHE_NAMESPACE: string = "project-retention-context";
|
|
72
|
+
/*
|
|
73
|
+
* 5-minute TTL is long enough that steady-state ingest sees ~100%
|
|
74
|
+
* cache hits and short enough that an admin retention change in
|
|
75
|
+
* the UI propagates without us having to wire up cross-process
|
|
76
|
+
* invalidation (which would need pub/sub — GlobalCache has no
|
|
77
|
+
* delete primitive today). Admins changing retention should
|
|
78
|
+
* expect up to ~10 minutes of lag (an in-process entry refilled from a nearly expired Redis entry gets a fresh 5-minute TTL) before new rows pick up the new
|
|
79
|
+
* config; existing rows keep whatever retentionDate they were
|
|
80
|
+
* stamped with at ingest time and aren't affected either way.
|
|
81
|
+
*/
|
|
82
|
+
const PROJECT_RETENTION_CACHE_TTL_SECONDS: number = 5 * 60;
|
|
83
|
+
const projectRetentionInProcessCache: Map<string, CachedRetentionContext> =
|
|
84
|
+
new Map();
|
|
85
|
+
|
|
55
86
|
export default class OTelIngestService {
|
|
56
87
|
@CaptureSpan()
|
|
57
88
|
private static async getProjectRetentionContext(
|
|
58
89
|
projectId: ObjectID,
|
|
59
90
|
): Promise<ProjectRetentionContext> {
|
|
91
|
+
const projectIdStr: string = projectId.toString();
|
|
92
|
+
const now: number = Date.now();
|
|
93
|
+
|
|
94
|
+
// L1: in-process memo. Zero network cost.
|
|
95
|
+
const memoed: CachedRetentionContext | undefined =
|
|
96
|
+
projectRetentionInProcessCache.get(projectIdStr);
|
|
97
|
+
if (memoed && memoed.expiresAtMs > now) {
|
|
98
|
+
return memoed.context;
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
// L2: Redis. Single round-trip; shared across workers.
|
|
102
|
+
try {
|
|
103
|
+
const cached: JSONObject | null = await GlobalCache.getJSONObject(
|
|
104
|
+
PROJECT_RETENTION_CACHE_NAMESPACE,
|
|
105
|
+
projectIdStr,
|
|
106
|
+
);
|
|
107
|
+
if (cached) {
|
|
108
|
+
const context: ProjectRetentionContext = {
|
|
109
|
+
projectRetentionConfig:
|
|
110
|
+
(cached[
|
|
111
|
+
"projectRetentionConfig"
|
|
112
|
+
] as TelemetryRetentionConfig | null) ?? null,
|
|
113
|
+
projectRetentionInDays:
|
|
114
|
+
(cached["projectRetentionInDays"] as number) ||
|
|
115
|
+
DEFAULT_RETENTION_IN_DAYS,
|
|
116
|
+
};
|
|
117
|
+
projectRetentionInProcessCache.set(projectIdStr, {
|
|
118
|
+
context,
|
|
119
|
+
expiresAtMs: now + PROJECT_RETENTION_CACHE_TTL_SECONDS * 1000,
|
|
120
|
+
});
|
|
121
|
+
return context;
|
|
122
|
+
}
|
|
123
|
+
} catch (err) {
|
|
124
|
+
// Cache outage must never fail ingest. Fall through to Postgres.
|
|
125
|
+
logger.warn(
|
|
126
|
+
`Project retention cache read failed for project ${projectIdStr}; falling back to Postgres: ${
|
|
127
|
+
err instanceof Error ? err.message : String(err)
|
|
128
|
+
}`,
|
|
129
|
+
);
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
// Cold path: hit Postgres and warm both caches.
|
|
60
133
|
const project: Project | null = await ProjectService.findOneById({
|
|
61
134
|
id: projectId,
|
|
62
135
|
select: {
|
|
@@ -68,11 +141,38 @@ export default class OTelIngestService {
|
|
|
68
141
|
},
|
|
69
142
|
});
|
|
70
143
|
|
|
71
|
-
|
|
144
|
+
const context: ProjectRetentionContext = {
|
|
72
145
|
projectRetentionConfig: project?.telemetryRetentionConfig ?? null,
|
|
73
146
|
projectRetentionInDays:
|
|
74
147
|
project?.defaultTelemetryRetentionInDays || DEFAULT_RETENTION_IN_DAYS,
|
|
75
148
|
};
|
|
149
|
+
|
|
150
|
+
projectRetentionInProcessCache.set(projectIdStr, {
|
|
151
|
+
context,
|
|
152
|
+
expiresAtMs: now + PROJECT_RETENTION_CACHE_TTL_SECONDS * 1000,
|
|
153
|
+
});
|
|
154
|
+
|
|
155
|
+
try {
|
|
156
|
+
await GlobalCache.setJSON(
|
|
157
|
+
PROJECT_RETENTION_CACHE_NAMESPACE,
|
|
158
|
+
projectIdStr,
|
|
159
|
+
{
|
|
160
|
+
projectRetentionConfig: (context.projectRetentionConfig ??
|
|
161
|
+
null) as unknown as JSONObject,
|
|
162
|
+
projectRetentionInDays: context.projectRetentionInDays,
|
|
163
|
+
},
|
|
164
|
+
{ expiresInSeconds: PROJECT_RETENTION_CACHE_TTL_SECONDS },
|
|
165
|
+
);
|
|
166
|
+
} catch (err) {
|
|
167
|
+
// Best-effort warm. Don't fail the request.
|
|
168
|
+
logger.warn(
|
|
169
|
+
`Project retention cache write failed for project ${projectIdStr}: ${
|
|
170
|
+
err instanceof Error ? err.message : String(err)
|
|
171
|
+
}`,
|
|
172
|
+
);
|
|
173
|
+
}
|
|
174
|
+
|
|
175
|
+
return context;
|
|
76
176
|
}
|
|
77
177
|
|
|
78
178
|
@CaptureSpan()
|
|
@@ -7,6 +7,29 @@ import ObjectID from "../../../Types/ObjectID";
|
|
|
7
7
|
import { JSONObject } from "../../../Types/JSON";
|
|
8
8
|
import DataToProcess from "./DataToProcess";
|
|
9
9
|
|
|
10
|
+
/*
|
|
11
|
+
* Maximum rows held in memory before we force a flush, and the
|
|
12
|
+
* maximum time we hold a row in the buffer before flushing.
|
|
13
|
+
*
|
|
14
|
+
* Sizing rationale: 10,000 rows at a few KB each is ~tens of MB
|
|
15
|
+
* of peak heap per process, which is fine for the API/worker
|
|
16
|
+
* processes that run monitor probes. 5 seconds is a tight enough
|
|
17
|
+
* worst-case latency that the dashboard "last log" view feels
|
|
18
|
+
* live, and loose enough that even a single very chatty monitor
|
|
19
|
+
* coalesces dozens to thousands of inserts into one.
|
|
20
|
+
*
|
|
21
|
+
* The legacy implementation called MonitorLogService.insertJsonRows
|
|
22
|
+
* with a one-element array per monitor probe, fire-and-forget.
|
|
23
|
+
* ClickHouse's async_insert coalesces these into part files,
|
|
24
|
+
* but every call still pays an HTTP round-trip and the
|
|
25
|
+
* @clickhouse/client pool can saturate when probe traffic spikes
|
|
26
|
+
* (e.g. a 10k-monitor project running concurrent probes). Batching
|
|
27
|
+
* collapses N round-trips into N / 10_000 (size-bounded) or
|
|
28
|
+
* N / (5s × per-second-rate) (time-bounded), whichever comes first.
|
|
29
|
+
*/
|
|
30
|
+
const MONITOR_LOG_FLUSH_BATCH_SIZE: number = 10_000;
|
|
31
|
+
const MONITOR_LOG_FLUSH_INTERVAL_MS: number = 5_000;
|
|
32
|
+
|
|
10
33
|
export default class MonitorLogUtil {
|
|
11
34
|
// Default retention in days if GlobalConfig is not set
|
|
12
35
|
private static readonly DEFAULT_RETENTION_DAYS: number = 1;
|
|
@@ -16,6 +39,18 @@ export default class MonitorLogUtil {
|
|
|
16
39
|
private static lastCacheRefresh: Date | null = null;
|
|
17
40
|
private static readonly CACHE_TTL_MS: number = 5 * 60 * 1000; // 5 minutes
|
|
18
41
|
|
|
42
|
+
/*
|
|
43
|
+
* In-process write buffer for MonitorLog rows. Rows accumulate
|
|
44
|
+
* here until either MONITOR_LOG_FLUSH_BATCH_SIZE rows arrive
|
|
45
|
+
* (size trigger) or MONITOR_LOG_FLUSH_INTERVAL_MS elapses since
|
|
46
|
+
* the first row entered an empty buffer (time trigger),
|
|
47
|
+
* whichever comes first. On graceful shutdown the SIGTERM /
|
|
48
|
+
* SIGINT hook below drains the buffer before the process exits.
|
|
49
|
+
*/
|
|
50
|
+
private static buffer: Array<JSONObject> = [];
|
|
51
|
+
private static flushTimer: NodeJS.Timeout | null = null;
|
|
52
|
+
private static shutdownHooksRegistered: boolean = false;
|
|
53
|
+
|
|
19
54
|
private static async getRetentionDays(): Promise<number> {
|
|
20
55
|
const now: Date = OneUptimeDate.getCurrentDate();
|
|
21
56
|
|
|
@@ -83,7 +118,7 @@ export default class MonitorLogUtil {
|
|
|
83
118
|
return;
|
|
84
119
|
}
|
|
85
120
|
|
|
86
|
-
// Fire-and-forget: fetch retention config then
|
|
121
|
+
// Fire-and-forget: fetch retention config then enqueue
|
|
87
122
|
this.getRetentionDays()
|
|
88
123
|
.then((retentionDays: number) => {
|
|
89
124
|
const logIngestionDate: Date = OneUptimeDate.getCurrentDate();
|
|
@@ -106,14 +141,119 @@ export default class MonitorLogUtil {
|
|
|
106
141
|
retentionDate: OneUptimeDate.toClickhouseDateTime(retentionDate),
|
|
107
142
|
};
|
|
108
143
|
|
|
109
|
-
|
|
110
|
-
(err: Error) => {
|
|
111
|
-
logger.error(err);
|
|
112
|
-
},
|
|
113
|
-
);
|
|
144
|
+
this.enqueueRow(monitorLogRow);
|
|
114
145
|
})
|
|
115
146
|
.catch((err: Error) => {
|
|
116
147
|
logger.error(err);
|
|
117
148
|
});
|
|
118
149
|
}
|
|
150
|
+
|
|
151
|
+
/*
 * Buffers one MonitorLog row and decides how it will be flushed:
 * immediately once the buffer reaches MONITOR_LOG_FLUSH_BATCH_SIZE,
 * otherwise via a timer started by the first row of a new batch.
 * The timer is deliberately not restarted by later rows, so a row
 * waits at most MONITOR_LOG_FLUSH_INTERVAL_MS even under steady
 * ingest.
 */
private static enqueueRow(row: JSONObject): void {
  this.ensureShutdownHooks();

  // Array.prototype.push returns the new length of the buffer.
  const bufferedCount: number = this.buffer.push(row);

  if (bufferedCount >= MONITOR_LOG_FLUSH_BATCH_SIZE) {
    // Size trigger: flush right away.
    this.triggerFlush();
    return;
  }

  const isFirstRowOfBatch: boolean = bufferedCount === 1;

  if (isFirstRowOfBatch && !this.flushTimer) {
    // Time trigger: armed once per batch, by its first row.
    this.flushTimer = setTimeout(() => {
      this.triggerFlush();
    }, MONITOR_LOG_FLUSH_INTERVAL_MS);
  }
}
|
|
177
|
+
|
|
178
|
+
/*
 * Non-blocking flush for the hot path. Cancels any pending timer,
 * detaches the current buffer so rows arriving during the insert go
 * into a fresh array, and starts the insert without awaiting it.
 * A failed insert is logged and the batch is dropped.
 */
private static triggerFlush(): void {
  const pendingTimer: NodeJS.Timeout | null = this.flushTimer;

  if (pendingTimer) {
    clearTimeout(pendingTimer);
    this.flushTimer = null;
  }

  const batch: Array<JSONObject> = this.buffer;

  if (batch.length === 0) {
    return;
  }

  // Swap before the insert starts so the batch is never flushed twice.
  this.buffer = [];

  MonitorLogService.insertJsonRows(batch).catch((err: Error) => {
    logger.error(
      `MonitorLog batch insert failed for ${batch.length} rows; batch dropped.`,
    );
    logger.error(err);
  });
}
|
|
204
|
+
|
|
205
|
+
/*
 * Awaitable flush used by the SIGTERM / SIGINT hook. Cancels any
 * pending timer, detaches the buffer and waits for the insert to
 * finish. Insert errors are logged and swallowed so a failed flush
 * never rejects into the shutdown path.
 */
private static async flushAndWait(): Promise<void> {
  const pendingTimer: NodeJS.Timeout | null = this.flushTimer;

  if (pendingTimer) {
    clearTimeout(pendingTimer);
    this.flushTimer = null;
  }

  const batch: Array<JSONObject> = this.buffer;

  if (batch.length === 0) {
    return;
  }

  // Detach first so rows enqueued while we await are not lost or re-sent.
  this.buffer = [];

  try {
    await MonitorLogService.insertJsonRows(batch);
  } catch (err) {
    logger.error(
      `MonitorLog shutdown flush failed for ${batch.length} rows; batch dropped.`,
    );
    logger.error(err);
  }
}
|
|
234
|
+
|
|
235
|
+
/*
 * Installs the SIGTERM / SIGINT listeners that drain the buffer,
 * at most once per process. Registration happens lazily on the
 * first enqueued row rather than at module load, so code that only
 * imports this file does not pick up process listeners.
 *
 * NOTE(review): adding a listener for SIGTERM / SIGINT removes
 * Node's default "exit on signal" behaviour, and this handler never
 * exits the process itself. Confirm another handler in the host
 * process is responsible for exiting.
 */
private static ensureShutdownHooks(): void {
  if (!this.shutdownHooksRegistered) {
    this.shutdownHooksRegistered = true;

    const drainBuffer: () => Promise<void> = async (): Promise<void> => {
      try {
        await this.flushAndWait();
      } catch (err) {
        logger.error("Error flushing MonitorLog buffer on shutdown:");
        logger.error(err);
      }
    };

    for (const signal of ["SIGTERM", "SIGINT"] as const) {
      process.on(signal, drainBuffer);
    }
  }
}
|
|
119
259
|
}
|