@oneuptime/common 10.5.1 → 10.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. package/Models/DatabaseModels/TelemetryException.ts +10 -0
  2. package/Server/API/TelemetryAPI.ts +406 -0
  3. package/Server/Infrastructure/Postgres/SchemaMigrations/1779879993421-MigrationName.ts +20 -0
  4. package/Server/Infrastructure/Postgres/SchemaMigrations/1779900000000-DedupeTelemetryExceptionsAndAddUniqueIndex.ts +115 -0
  5. package/Server/Infrastructure/Postgres/SchemaMigrations/Index.ts +4 -0
  6. package/Server/Services/ExceptionAggregationService.ts +51 -3
  7. package/Server/Services/LogAggregationService.ts +1 -0
  8. package/Server/Services/MetricAggregationService.ts +227 -0
  9. package/Server/Services/OpenTelemetryIngestService.ts +101 -1
  10. package/Server/Services/TraceAggregationService.ts +1 -0
  11. package/Server/Utils/Monitor/MonitorLogUtil.ts +146 -6
  12. package/Server/Utils/Telemetry/ResourceFacetResolver.ts +299 -0
  13. package/UI/Components/LogsViewer/LogsViewer.tsx +10 -0
  14. package/UI/Components/LogsViewer/components/FacetSection.tsx +40 -3
  15. package/UI/Components/LogsViewer/components/LogsFacetSidebar.tsx +23 -0
  16. package/UI/Components/LogsViewer/types.ts +2 -0
  17. package/UI/Components/TelemetryViewer/TelemetryViewer.tsx +8 -0
  18. package/UI/Components/TelemetryViewer/components/TelemetryFacetSection.tsx +49 -3
  19. package/UI/Components/TelemetryViewer/components/TelemetryFacetSidebar.tsx +16 -0
  20. package/UI/Components/TelemetryViewer/types.ts +12 -0
  21. package/build/dist/Models/DatabaseModels/TelemetryException.js +11 -0
  22. package/build/dist/Models/DatabaseModels/TelemetryException.js.map +1 -1
  23. package/build/dist/Server/API/TelemetryAPI.js +285 -0
  24. package/build/dist/Server/API/TelemetryAPI.js.map +1 -1
  25. package/build/dist/Server/Infrastructure/Postgres/SchemaMigrations/1779879993421-MigrationName.js +18 -0
  26. package/build/dist/Server/Infrastructure/Postgres/SchemaMigrations/1779879993421-MigrationName.js.map +1 -0
  27. package/build/dist/Server/Infrastructure/Postgres/SchemaMigrations/1779900000000-DedupeTelemetryExceptionsAndAddUniqueIndex.js +106 -0
  28. package/build/dist/Server/Infrastructure/Postgres/SchemaMigrations/1779900000000-DedupeTelemetryExceptionsAndAddUniqueIndex.js.map +1 -0
  29. package/build/dist/Server/Infrastructure/Postgres/SchemaMigrations/Index.js +4 -0
  30. package/build/dist/Server/Infrastructure/Postgres/SchemaMigrations/Index.js.map +1 -1
  31. package/build/dist/Server/Services/ExceptionAggregationService.js +44 -4
  32. package/build/dist/Server/Services/ExceptionAggregationService.js.map +1 -1
  33. package/build/dist/Server/Services/LogAggregationService.js.map +1 -1
  34. package/build/dist/Server/Services/MetricAggregationService.js +159 -0
  35. package/build/dist/Server/Services/MetricAggregationService.js.map +1 -0
  36. package/build/dist/Server/Services/OpenTelemetryIngestService.js +60 -3
  37. package/build/dist/Server/Services/OpenTelemetryIngestService.js.map +1 -1
  38. package/build/dist/Server/Services/TraceAggregationService.js.map +1 -1
  39. package/build/dist/Server/Utils/Monitor/MonitorLogUtil.js +127 -4
  40. package/build/dist/Server/Utils/Monitor/MonitorLogUtil.js.map +1 -1
  41. package/build/dist/Server/Utils/Telemetry/ResourceFacetResolver.js +204 -0
  42. package/build/dist/Server/Utils/Telemetry/ResourceFacetResolver.js.map +1 -0
  43. package/build/dist/UI/Components/LogsViewer/LogsViewer.js +1 -1
  44. package/build/dist/UI/Components/LogsViewer/LogsViewer.js.map +1 -1
  45. package/build/dist/UI/Components/LogsViewer/components/FacetSection.js +26 -6
  46. package/build/dist/UI/Components/LogsViewer/components/FacetSection.js.map +1 -1
  47. package/build/dist/UI/Components/LogsViewer/components/LogsFacetSidebar.js +12 -1
  48. package/build/dist/UI/Components/LogsViewer/components/LogsFacetSidebar.js.map +1 -1
  49. package/build/dist/UI/Components/LogsViewer/types.js.map +1 -1
  50. package/build/dist/UI/Components/TelemetryViewer/TelemetryViewer.js +1 -1
  51. package/build/dist/UI/Components/TelemetryViewer/TelemetryViewer.js.map +1 -1
  52. package/build/dist/UI/Components/TelemetryViewer/components/TelemetryFacetSection.js +32 -6
  53. package/build/dist/UI/Components/TelemetryViewer/components/TelemetryFacetSection.js.map +1 -1
  54. package/build/dist/UI/Components/TelemetryViewer/components/TelemetryFacetSidebar.js +6 -1
  55. package/build/dist/UI/Components/TelemetryViewer/components/TelemetryFacetSidebar.js.map +1 -1
  56. package/package.json +1 -1
@@ -8,6 +8,7 @@ import Includes from "../../Types/BaseDatabase/Includes";
8
8
  import AnalyticsTableName from "../../Types/AnalyticsDatabase/AnalyticsTableName";
9
9
  import CaptureSpan from "../Utils/Telemetry/CaptureSpan";
10
10
  import { DbJSONResponse, Results } from "./AnalyticsDatabaseService";
11
+ import ServiceType from "../../Types/Telemetry/ServiceType";
11
12
 
12
13
  export interface HistogramBucket {
13
14
  time: string;
@@ -35,6 +36,7 @@ export interface HistogramRequest extends ExceptionFilters {
35
36
  export interface FacetValue {
36
37
  value: string;
37
38
  count: number;
39
+ displayName?: string | undefined;
38
40
  }
39
41
 
40
42
  export interface FacetRequest extends ExceptionFilters {
@@ -59,6 +61,18 @@ export class ExceptionAggregationService {
59
61
  "escaped",
60
62
  "release",
61
63
  ]);
64
+ /*
65
+ * Virtual facet keys — same scheme as TraceAggregationService /
66
+ * LogAggregationService. The `serviceId` slot is reused for host /
67
+ * docker host / k8s cluster ids, disambiguated by the `serviceType`
68
+ * discriminator column on each ExceptionInstance row.
69
+ */
70
+ private static readonly RESOURCE_FACET_KEYS: Map<string, ServiceType> =
71
+ new Map([
72
+ ["hostId", ServiceType.Host],
73
+ ["dockerHostId", ServiceType.DockerHost],
74
+ ["kubernetesClusterId", ServiceType.KubernetesCluster],
75
+ ]);
62
76
  private static readonly ATTRIBUTE_KEY_PATTERN: RegExp = /^[a-zA-Z0-9._:/-]+$/;
63
77
  private static readonly MAX_FACET_KEY_LENGTH: number = 256;
64
78
 
@@ -169,12 +183,24 @@ export class ExceptionAggregationService {
169
183
 
170
184
  ExceptionAggregationService.validateFacetKey(request.facetKey);
171
185
 
186
+ const resourceServiceType: ServiceType | undefined =
187
+ ExceptionAggregationService.RESOURCE_FACET_KEYS.get(request.facetKey);
188
+ const isResourceFacet: boolean = resourceServiceType !== undefined;
172
189
  const isTopLevelColumn: boolean =
190
+ isResourceFacet ||
173
191
  ExceptionAggregationService.isTopLevelColumn(request.facetKey);
174
192
 
175
193
  const statement: Statement = new Statement();
176
194
 
177
- if (isTopLevelColumn) {
195
+ if (isResourceFacet) {
196
+ /*
197
+ * Virtual facet — group serviceId values whose row carries the matching
198
+ * ServiceType discriminator (Host / DockerHost / KubernetesCluster).
199
+ */
200
+ statement.append(
201
+ SQL`SELECT toString(serviceId) AS val, count() AS cnt FROM ${ExceptionAggregationService.TABLE_NAME}`,
202
+ );
203
+ } else if (isTopLevelColumn) {
178
204
  statement.append(
179
205
  SQL`SELECT toString(${request.facetKey}) AS val, count() AS cnt FROM ${ExceptionAggregationService.TABLE_NAME}`,
180
206
  );
@@ -200,7 +226,26 @@ export class ExceptionAggregationService {
200
226
  }}`,
201
227
  );
202
228
 
203
- if (!isTopLevelColumn) {
229
+ if (isResourceFacet) {
230
+ statement.append(
231
+ SQL` AND serviceType = ${{
232
+ type: TableColumnType.Text,
233
+ value: resourceServiceType as string,
234
+ }}`,
235
+ );
236
+ } else if (request.facetKey === "serviceId") {
237
+ /*
238
+ * Constrain the canonical Services facet to rows that actually
239
+ * belong to a Service. NULL / empty serviceType covers legacy rows
240
+ * ingested before the discriminator existed.
241
+ */
242
+ statement.append(
243
+ SQL` AND (serviceType = '' OR serviceType = ${{
244
+ type: TableColumnType.Text,
245
+ value: ServiceType.OpenTelemetry as string,
246
+ }})`,
247
+ );
248
+ } else if (!isTopLevelColumn) {
204
249
  statement.append(
205
250
  SQL` AND JSONHas(attributes, ${{
206
251
  type: TableColumnType.Text,
@@ -322,7 +367,10 @@ export class ExceptionAggregationService {
322
367
  throw new BadDataException("Invalid facetKey");
323
368
  }
324
369
 
325
- if (ExceptionAggregationService.isTopLevelColumn(facetKey)) {
370
+ if (
371
+ ExceptionAggregationService.isTopLevelColumn(facetKey) ||
372
+ ExceptionAggregationService.RESOURCE_FACET_KEYS.has(facetKey)
373
+ ) {
326
374
  return;
327
375
  }
328
376
 
@@ -32,6 +32,7 @@ export interface HistogramRequest {
32
32
  export interface FacetValue {
33
33
  value: string;
34
34
  count: number;
35
+ displayName?: string | undefined;
35
36
  }
36
37
 
37
38
  export interface FacetRequest {
@@ -0,0 +1,227 @@
1
+ import { SQL, Statement } from "../Utils/AnalyticsDatabase/Statement";
2
+ import MetricService from "./MetricService";
3
+ import TableColumnType from "../../Types/AnalyticsDatabase/TableColumnType";
4
+ import { JSONObject } from "../../Types/JSON";
5
+ import ObjectID from "../../Types/ObjectID";
6
+ import BadDataException from "../../Types/Exception/BadDataException";
7
+ import Includes from "../../Types/BaseDatabase/Includes";
8
+ import AnalyticsTableName from "../../Types/AnalyticsDatabase/AnalyticsTableName";
9
+ import CaptureSpan from "../Utils/Telemetry/CaptureSpan";
10
+ import { DbJSONResponse, Results } from "./AnalyticsDatabaseService";
11
+ import ServiceType from "../../Types/Telemetry/ServiceType";
12
+
13
/*
 * One bucket of a facet aggregation: a distinct value of the requested
 * facet key and how many metric rows carried it. `displayName` is an
 * optional human-readable label; getFacetValues in this file never sets it.
 */
export interface FacetValue {
  value: string;
  count: number;
  displayName?: string | undefined;
}
18
+
19
/*
 * Optional narrowing filters shared by metric aggregation requests.
 * An absent or empty array means "no restriction" for that dimension
 * (see MetricAggregationService.appendCommonFilters).
 */
export interface MetricFilters {
  // Restrict to rows whose serviceId column is one of these ids.
  serviceIds?: Array<ObjectID> | undefined;
  // Restrict to rows whose metric name is one of these names.
  metricNames?: Array<string> | undefined;
}
23
+
24
/*
 * Input for MetricAggregationService.getFacetValues: which project and
 * time window to scan, which facet key to group by, and an optional cap
 * on the number of distinct values returned.
 */
export interface FacetRequest extends MetricFilters {
  projectId: ObjectID;
  // Inclusive lower bound on the row's `time` column.
  startTime: Date;
  // Inclusive upper bound on the row's `time` column.
  endTime: Date;
  /*
   * A top-level column name, one of the virtual resource facet keys
   * (hostId / dockerHostId / kubernetesClusterId), or an attribute key.
   */
  facetKey: string;
  // Maximum number of facet values to return; a default applies when omitted.
  limit?: number | undefined;
}
31
+
32
/*
 * Facet aggregation for the Metrics page sidebar. Same shape as
 * TraceAggregationService / LogAggregationService — per-facet GROUP BY on
 * the analytics table, with a `serviceType` discriminator that lets the
 * `serviceId` column carry Host / DockerHost / KubernetesCluster ids for
 * the corresponding virtual facets.
 */
export class MetricAggregationService {
  // Row cap applied when the request carries no usable `limit`.
  private static readonly DEFAULT_FACET_LIMIT: number = 500;
  private static readonly TABLE_NAME: string = AnalyticsTableName.Metric;
  // Facet keys that map directly onto a column of the metric table.
  private static readonly TOP_LEVEL_COLUMNS: Set<string> = new Set([
    "serviceId",
    "name",
  ]);
  /*
   * Virtual facet keys. Each one is answered by grouping the `serviceId`
   * column over rows whose `serviceType` equals the mapped value.
   */
  private static readonly RESOURCE_FACET_KEYS: Map<string, ServiceType> =
    new Map([
      ["hostId", ServiceType.Host],
      ["dockerHostId", ServiceType.DockerHost],
      ["kubernetesClusterId", ServiceType.KubernetesCluster],
    ]);
  // Allowed characters for attribute facet keys.
  private static readonly ATTRIBUTE_KEY_PATTERN: RegExp = /^[a-zA-Z0-9._:/-]+$/;
  private static readonly MAX_FACET_KEY_LENGTH: number = 256;

  /*
   * Runs the facet query for `request` and returns one entry per distinct
   * non-empty value, in the order the database returned them
   * (the query orders by count, descending).
   *
   * Throws BadDataException when `request.facetKey` fails validation.
   */
  @CaptureSpan()
  public static async getFacetValues(
    request: FacetRequest,
  ): Promise<Array<FacetValue>> {
    const statement: Statement =
      MetricAggregationService.buildFacetStatement(request);

    const dbResult: Results = await MetricService.executeQuery(statement);
    const response: DbJSONResponse = await dbResult.json<{
      data?: Array<JSONObject>;
    }>();

    const rows: Array<JSONObject> = response.data || [];

    return rows
      .map((row: JSONObject): FacetValue => {
        return {
          value: String(row["val"] || ""),
          count: Number(row["cnt"] || 0),
        };
      })
      .filter((facet: FacetValue): boolean => {
        // Rows with an empty facet value carry no information for the sidebar.
        return facet.value.length > 0;
      });
  }

  /*
   * Builds the SELECT ... GROUP BY statement for one facet key.
   *
   * Three query shapes exist:
   *  - resource facet (hostId / dockerHostId / kubernetesClusterId):
   *    group `serviceId`, restricted to the matching `serviceType`;
   *  - top-level column (`serviceId`, `name`): group the column itself;
   *  - anything else: treated as an attribute key and read out of the
   *    `attributes` JSON column.
   */
  private static buildFacetStatement(request: FacetRequest): Statement {
    const limit: number = MetricAggregationService.resolveFacetLimit(
      request.limit,
    );

    MetricAggregationService.validateFacetKey(request.facetKey);

    const resourceServiceType: ServiceType | undefined =
      MetricAggregationService.RESOURCE_FACET_KEYS.get(request.facetKey);
    const isResourceFacet: boolean = resourceServiceType !== undefined;
    const isTopLevelColumn: boolean =
      isResourceFacet ||
      MetricAggregationService.isTopLevelColumn(request.facetKey);

    const statement: Statement = new Statement();

    if (isResourceFacet) {
      statement.append(
        SQL`SELECT toString(serviceId) AS val, count() AS cnt FROM ${MetricAggregationService.TABLE_NAME}`,
      );
    } else if (isTopLevelColumn) {
      statement.append(
        SQL`SELECT toString(${request.facetKey}) AS val, count() AS cnt FROM ${MetricAggregationService.TABLE_NAME}`,
      );
    } else {
      statement.append(
        SQL`SELECT JSONExtractRaw(attributes, ${{
          type: TableColumnType.Text,
          value: request.facetKey,
        }}) AS val, count() AS cnt FROM ${MetricAggregationService.TABLE_NAME}`,
      );
    }

    statement.append(
      SQL` WHERE projectId = ${{
        type: TableColumnType.ObjectID,
        value: request.projectId,
      }} AND time >= ${{
        type: TableColumnType.Date,
        value: request.startTime,
      }} AND time <= ${{
        type: TableColumnType.Date,
        value: request.endTime,
      }}`,
    );

    if (isResourceFacet) {
      statement.append(
        SQL` AND serviceType = ${{
          type: TableColumnType.Text,
          value: resourceServiceType as string,
        }}`,
      );
    } else if (request.facetKey === "serviceId") {
      /*
       * Keep the plain Services facet to rows that belong to a service:
       * either an empty serviceType or the OpenTelemetry service type.
       */
      statement.append(
        SQL` AND (serviceType = '' OR serviceType = ${{
          type: TableColumnType.Text,
          value: ServiceType.OpenTelemetry as string,
        }})`,
      );
    } else if (!isTopLevelColumn) {
      // Attribute facet: only count rows that actually carry the key.
      statement.append(
        SQL` AND JSONHas(attributes, ${{
          type: TableColumnType.Text,
          value: request.facetKey,
        }}) = 1`,
      );
    }

    MetricAggregationService.appendCommonFilters(statement, request);

    statement.append(
      SQL` GROUP BY val ORDER BY cnt DESC LIMIT ${{
        type: TableColumnType.Number,
        value: limit,
      }}`,
    );

    /*
     * Defense in depth: cap runtime below nginx's 60s proxy_read_timeout
     * so a slow facet never starves the endpoint.
     */
    statement.append(
      " SETTINGS max_execution_time = 45, timeout_overflow_mode = 'break'",
    );

    return statement;
  }

  /*
   * Normalizes the caller-supplied limit before it reaches the LIMIT
   * clause. Non-negative integers pass through unchanged. A missing,
   * non-numeric, non-finite or negative value falls back to
   * DEFAULT_FACET_LIMIT, and a fractional value is floored, so a bad
   * input cannot turn into a database error.
   */
  private static resolveFacetLimit(limit: number | undefined): number {
    const requested: number = Number(
      limit ?? MetricAggregationService.DEFAULT_FACET_LIMIT,
    );

    if (!Number.isFinite(requested) || requested < 0) {
      return MetricAggregationService.DEFAULT_FACET_LIMIT;
    }

    return Math.floor(requested);
  }

  /*
   * Appends the optional serviceId / metric-name IN (...) filters.
   * Absent or empty filter arrays add nothing to the statement.
   */
  private static appendCommonFilters(
    statement: Statement,
    request: MetricFilters,
  ): void {
    if (request.serviceIds && request.serviceIds.length > 0) {
      statement.append(
        SQL` AND serviceId IN (${{
          type: TableColumnType.ObjectID,
          value: new Includes(
            request.serviceIds.map((id: ObjectID) => {
              return id.toString();
            }),
          ),
        }})`,
      );
    }

    if (request.metricNames && request.metricNames.length > 0) {
      statement.append(
        SQL` AND name IN (${{
          type: TableColumnType.Text,
          value: new Includes(request.metricNames),
        }})`,
      );
    }
  }

  // True when `key` names a real column listed in TOP_LEVEL_COLUMNS.
  private static isTopLevelColumn(key: string): boolean {
    return MetricAggregationService.TOP_LEVEL_COLUMNS.has(key);
  }

  /*
   * Rejects facet keys that are not strings, are empty, exceed
   * MAX_FACET_KEY_LENGTH, or (for attribute keys) contain characters
   * outside ATTRIBUTE_KEY_PATTERN. Known column names and resource facet
   * keys are accepted without the pattern check.
   *
   * Throws BadDataException("Invalid facetKey") on any violation.
   */
  private static validateFacetKey(
    facetKey: unknown,
  ): asserts facetKey is string {
    if (typeof facetKey !== "string") {
      throw new BadDataException("Invalid facetKey");
    }

    if (
      facetKey.length === 0 ||
      facetKey.length > MetricAggregationService.MAX_FACET_KEY_LENGTH
    ) {
      throw new BadDataException("Invalid facetKey");
    }

    if (
      MetricAggregationService.isTopLevelColumn(facetKey) ||
      MetricAggregationService.RESOURCE_FACET_KEYS.has(facetKey)
    ) {
      return;
    }

    if (!MetricAggregationService.ATTRIBUTE_KEY_PATTERN.test(facetKey)) {
      throw new BadDataException("Invalid facetKey");
    }
  }
}
226
+
227
+ export default MetricAggregationService;
@@ -23,6 +23,7 @@ import KubernetesCluster from "../../Models/DatabaseModels/KubernetesCluster";
23
23
  import HostService from "./HostService";
24
24
  import DockerHostService from "./DockerHostService";
25
25
  import KubernetesClusterService from "./KubernetesClusterService";
26
+ import GlobalCache from "../Infrastructure/GlobalCache";
26
27
 
27
28
  export enum OtelAggregationTemporality {
28
29
  Cumulative = "AGGREGATION_TEMPORALITY_CUMULATIVE",
@@ -52,11 +53,83 @@ interface ProjectRetentionContext {
52
53
  projectRetentionInDays: number;
53
54
  }
54
55
 
56
+ /*
57
+ * Per-process memo holding the project's telemetry retention
58
+ * context for the duration of the cache TTL. The L2 GlobalCache
59
+ * (Redis, shared across workers) is the source of truth for
60
+ * cross-process freshness; this L1 in-process Map exists so the
61
+ * dozens of getProjectRetentionContext calls per worker batch
62
+ * (one per resource span -> Service/Host/DockerHost/Kubernetes
63
+ * resolution) collapse to zero network round-trips after the
64
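+ first one warms it. Each layer uses the same 5-minute TTL but stamps it independently: an L1 entry warmed from L2 gets a fresh TTL and can outlive the L2 entry.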
65
+ */
66
+ interface CachedRetentionContext {
67
+ context: ProjectRetentionContext;
68
+ expiresAtMs: number;
69
+ }
70
+
71
+ const PROJECT_RETENTION_CACHE_NAMESPACE: string = "project-retention-context";
72
+ /*
73
+ * 5-minute TTL is long enough that steady-state ingest sees ~100%
74
+ * cache hits and short enough that an admin retention change in
75
+ * the UI propagates without us having to wire up cross-process
76
+ * invalidation (which would need pub/sub — GlobalCache has no
77
+ * delete primitive today). Admins changing retention should
78
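+ expect up to ~10 minutes of lag in the worst case (an L1 entry warmed from a nearly expired L2 entry gets its own fresh 5-minute TTL) before new rows pick up the new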
79
+ * config; existing rows keep whatever retentionDate they were
80
+ * stamped with at ingest time and aren't affected either way.
81
+ */
82
+ const PROJECT_RETENTION_CACHE_TTL_SECONDS: number = 5 * 60;
83
+ const projectRetentionInProcessCache: Map<string, CachedRetentionContext> =
84
+ new Map();
85
+
55
86
  export default class OTelIngestService {
56
87
  @CaptureSpan()
57
88
  private static async getProjectRetentionContext(
58
89
  projectId: ObjectID,
59
90
  ): Promise<ProjectRetentionContext> {
91
+ const projectIdStr: string = projectId.toString();
92
+ const now: number = Date.now();
93
+
94
+ // L1: in-process memo. Zero network cost.
95
+ const memoed: CachedRetentionContext | undefined =
96
+ projectRetentionInProcessCache.get(projectIdStr);
97
+ if (memoed && memoed.expiresAtMs > now) {
98
+ return memoed.context;
99
+ }
100
+
101
+ // L2: Redis. Single round-trip; shared across workers.
102
+ try {
103
+ const cached: JSONObject | null = await GlobalCache.getJSONObject(
104
+ PROJECT_RETENTION_CACHE_NAMESPACE,
105
+ projectIdStr,
106
+ );
107
+ if (cached) {
108
+ const context: ProjectRetentionContext = {
109
+ projectRetentionConfig:
110
+ (cached[
111
+ "projectRetentionConfig"
112
+ ] as TelemetryRetentionConfig | null) ?? null,
113
+ projectRetentionInDays:
114
+ (cached["projectRetentionInDays"] as number) ||
115
+ DEFAULT_RETENTION_IN_DAYS,
116
+ };
117
+ projectRetentionInProcessCache.set(projectIdStr, {
118
+ context,
119
+ expiresAtMs: now + PROJECT_RETENTION_CACHE_TTL_SECONDS * 1000,
120
+ });
121
+ return context;
122
+ }
123
+ } catch (err) {
124
+ // Cache outage must never fail ingest. Fall through to Postgres.
125
+ logger.warn(
126
+ `Project retention cache read failed for project ${projectIdStr}; falling back to Postgres: ${
127
+ err instanceof Error ? err.message : String(err)
128
+ }`,
129
+ );
130
+ }
131
+
132
+ // Cold path: hit Postgres and warm both caches.
60
133
  const project: Project | null = await ProjectService.findOneById({
61
134
  id: projectId,
62
135
  select: {
@@ -68,11 +141,38 @@ export default class OTelIngestService {
68
141
  },
69
142
  });
70
143
 
71
- return {
144
+ const context: ProjectRetentionContext = {
72
145
  projectRetentionConfig: project?.telemetryRetentionConfig ?? null,
73
146
  projectRetentionInDays:
74
147
  project?.defaultTelemetryRetentionInDays || DEFAULT_RETENTION_IN_DAYS,
75
148
  };
149
+
150
+ projectRetentionInProcessCache.set(projectIdStr, {
151
+ context,
152
+ expiresAtMs: now + PROJECT_RETENTION_CACHE_TTL_SECONDS * 1000,
153
+ });
154
+
155
+ try {
156
+ await GlobalCache.setJSON(
157
+ PROJECT_RETENTION_CACHE_NAMESPACE,
158
+ projectIdStr,
159
+ {
160
+ projectRetentionConfig: (context.projectRetentionConfig ??
161
+ null) as unknown as JSONObject,
162
+ projectRetentionInDays: context.projectRetentionInDays,
163
+ },
164
+ { expiresInSeconds: PROJECT_RETENTION_CACHE_TTL_SECONDS },
165
+ );
166
+ } catch (err) {
167
+ // Best-effort warm. Don't fail the request.
168
+ logger.warn(
169
+ `Project retention cache write failed for project ${projectIdStr}: ${
170
+ err instanceof Error ? err.message : String(err)
171
+ }`,
172
+ );
173
+ }
174
+
175
+ return context;
76
176
  }
77
177
 
78
178
  @CaptureSpan()
@@ -38,6 +38,7 @@ export interface HistogramRequest extends TraceFilters {
38
38
  export interface FacetValue {
39
39
  value: string;
40
40
  count: number;
41
+ displayName?: string | undefined;
41
42
  }
42
43
 
43
44
  export interface FacetRequest extends TraceFilters {
@@ -7,6 +7,29 @@ import ObjectID from "../../../Types/ObjectID";
7
7
  import { JSONObject } from "../../../Types/JSON";
8
8
  import DataToProcess from "./DataToProcess";
9
9
 
10
+ /*
11
+ * Maximum rows held in memory before we force a flush, and the
12
+ * maximum time we hold a row in the buffer before flushing.
13
+ *
14
+ * Sizing rationale: 10,000 rows at a few KB each is ~tens of MB
15
+ * of peak heap per process, which is fine for the API/worker
16
+ * processes that run monitor probes. 5 seconds is a tight enough
17
+ * worst-case latency that the dashboard "last log" view feels
18
+ * live, and loose enough that even a single very chatty monitor
19
+ * coalesces dozens to thousands of inserts into one.
20
+ *
21
+ * The legacy implementation called MonitorLogService.insertJsonRows
22
+ * with a one-element array per monitor probe, fire-and-forget.
23
+ ClickHouse's async_insert coalesces these into part files,
24
+ * but every call still pays an HTTP round-trip and the
25
+ * @clickhouse/client pool can saturate when probe traffic spikes
26
+ * (e.g. a 10k-monitor project running concurrent probes). Batching
27
+ * collapses N round-trips into N / 10_000 (size-bounded) or
28
+ * N / (5s × per-second-rate) (time-bounded), whichever comes first.
29
+ */
30
+ const MONITOR_LOG_FLUSH_BATCH_SIZE: number = 10_000;
31
+ const MONITOR_LOG_FLUSH_INTERVAL_MS: number = 5_000;
32
+
10
33
  export default class MonitorLogUtil {
11
34
  // Default retention in days if GlobalConfig is not set
12
35
  private static readonly DEFAULT_RETENTION_DAYS: number = 1;
@@ -16,6 +39,18 @@ export default class MonitorLogUtil {
16
39
  private static lastCacheRefresh: Date | null = null;
17
40
  private static readonly CACHE_TTL_MS: number = 5 * 60 * 1000; // 5 minutes
18
41
 
42
+ /*
43
+ * In-process write buffer for MonitorLog rows. Rows accumulate
44
+ * here until either MONITOR_LOG_FLUSH_BATCH_SIZE rows arrive
45
+ * (size trigger) or MONITOR_LOG_FLUSH_INTERVAL_MS elapses since
46
+ * the first row entered an empty buffer (time trigger),
47
+ * whichever comes first. On graceful shutdown the SIGTERM /
48
+ * SIGINT hook below drains the buffer before the process exits.
49
+ */
50
+ private static buffer: Array<JSONObject> = [];
51
+ private static flushTimer: NodeJS.Timeout | null = null;
52
+ private static shutdownHooksRegistered: boolean = false;
53
+
19
54
  private static async getRetentionDays(): Promise<number> {
20
55
  const now: Date = OneUptimeDate.getCurrentDate();
21
56
 
@@ -83,7 +118,7 @@ export default class MonitorLogUtil {
83
118
  return;
84
119
  }
85
120
 
86
- // Fire-and-forget: fetch retention config then insert
121
+ // Fire-and-forget: fetch retention config then enqueue
87
122
  this.getRetentionDays()
88
123
  .then((retentionDays: number) => {
89
124
  const logIngestionDate: Date = OneUptimeDate.getCurrentDate();
@@ -106,14 +141,119 @@ export default class MonitorLogUtil {
106
141
  retentionDate: OneUptimeDate.toClickhouseDateTime(retentionDate),
107
142
  };
108
143
 
109
- MonitorLogService.insertJsonRows([monitorLogRow]).catch(
110
- (err: Error) => {
111
- logger.error(err);
112
- },
113
- );
144
+ this.enqueueRow(monitorLogRow);
114
145
  })
115
146
  .catch((err: Error) => {
116
147
  logger.error(err);
117
148
  });
118
149
  }
150
+
151
/*
 * Adds one row to the in-memory buffer and picks the flush strategy:
 * flush right away once the batch-size threshold is reached, or arm
 * the one-shot timer when this row is the first in an empty buffer.
 * The timer is deliberately never re-armed by later rows, so a row
 * waits at most MONITOR_LOG_FLUSH_INTERVAL_MS even under steady
 * ingest.
 */
private static enqueueRow(row: JSONObject): void {
  this.ensureShutdownHooks();

  // Array.push returns the new length, i.e. the buffered row count.
  const bufferedCount: number = this.buffer.push(row);

  const batchIsFull: boolean = bufferedCount >= MONITOR_LOG_FLUSH_BATCH_SIZE;
  if (batchIsFull) {
    // Size trigger — flush immediately.
    this.triggerFlush();
    return;
  }

  const startsNewBatch: boolean = bufferedCount === 1;
  if (startsNewBatch && !this.flushTimer) {
    // Time trigger — bound how long this batch may sit in memory.
    this.flushTimer = setTimeout(() => {
      this.triggerFlush();
    }, MONITOR_LOG_FLUSH_INTERVAL_MS);
  }
}
177
+
178
/*
 * Non-blocking flush for the hot path: the insert is started but not
 * awaited, so callers never wait on ClickHouse. The buffer is swapped
 * for a fresh array before the insert starts, which means rows that
 * arrive during the round-trip join the next batch instead of being
 * sent twice. A failed insert is logged and the batch is dropped.
 */
private static triggerFlush(): void {
  const pendingTimer: NodeJS.Timeout | null = this.flushTimer;
  if (pendingTimer) {
    clearTimeout(pendingTimer);
    this.flushTimer = null;
  }

  const batch: Array<JSONObject> = this.buffer;
  if (batch.length === 0) {
    return;
  }

  this.buffer = [];

  MonitorLogService.insertJsonRows(batch).catch((err: Error) => {
    logger.error(
      `MonitorLog batch insert failed for ${batch.length} rows; batch dropped.`,
    );
    logger.error(err);
  });
}
204
+
205
/*
 * Awaitable flush used by the SIGTERM / SIGINT shutdown hook so that
 * rows still sitting in the buffer are written before the process
 * goes away. Insert errors are logged and swallowed: a failed flush
 * must not block the rest of the shutdown chain.
 *
 * NOTE(review): only rows still in the buffer are covered. An insert
 * already started by triggerFlush is not awaited here.
 */
private static async flushAndWait(): Promise<void> {
  const pendingTimer: NodeJS.Timeout | null = this.flushTimer;
  if (pendingTimer) {
    clearTimeout(pendingTimer);
    this.flushTimer = null;
  }

  const batch: Array<JSONObject> = this.buffer;
  if (batch.length === 0) {
    return;
  }

  // Detach the batch first so concurrent enqueues start a new buffer.
  this.buffer = [];

  try {
    await MonitorLogService.insertJsonRows(batch);
  } catch (err) {
    logger.error(
      `MonitorLog shutdown flush failed for ${batch.length} rows; batch dropped.`,
    );
    logger.error(err);
  }
}
234
+
235
/*
 * Registers SIGTERM / SIGINT handlers exactly once, lazily on first
 * ingest. Registration is not done at module-load time so tooling
 * that merely imports this file (e.g. migration runners, CLI scripts)
 * does not end up with stray process listeners.
 *
 * Installing a listener for SIGTERM / SIGINT removes Node's default
 * "exit on signal" behaviour. If this hook turns out to be the only
 * listener for the signal, nothing else would terminate the process,
 * so after the flush the hook removes itself and re-raises the signal
 * to restore the default exit. When other listeners exist they remain
 * responsible for shutting the process down, exactly as before.
 */
private static ensureShutdownHooks(): void {
  if (this.shutdownHooksRegistered) {
    return;
  }
  this.shutdownHooksRegistered = true;

  const registerFor: (signal: NodeJS.Signals) => void = (
    signal: NodeJS.Signals,
  ): void => {
    const flushOnShutdown: () => Promise<void> = async (): Promise<void> => {
      try {
        await this.flushAndWait();
      } catch (err) {
        logger.error("Error flushing MonitorLog buffer on shutdown:");
        logger.error(err);
      }

      // Sole listener: hand the signal back so the process still exits.
      if (process.listenerCount(signal) === 1) {
        process.removeListener(signal, flushOnShutdown);
        process.kill(process.pid, signal);
      }
    };

    process.on(signal, flushOnShutdown);
  };

  registerFor("SIGTERM");
  registerFor("SIGINT");
}
119
259
  }