@oneuptime/common 10.0.58 → 10.0.59
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Models/AnalyticsModels/Span.ts +45 -0
- package/Server/API/BaseAnalyticsAPI.ts +16 -15
- package/Server/Middleware/TelemetryIngest.ts +7 -0
- package/Server/Services/AnalyticsDatabaseService.ts +50 -11
- package/Server/Services/SpanService.ts +176 -0
- package/Server/Services/TraceAggregationService.ts +55 -22
- package/Tests/Server/Services/AnalyticsDatabaseService.test.ts +8 -4
- package/UI/Components/TelemetryViewer/TelemetryViewer.tsx +1 -1
- package/build/dist/Models/AnalyticsModels/Span.js +40 -0
- package/build/dist/Models/AnalyticsModels/Span.js.map +1 -1
- package/build/dist/Server/API/BaseAnalyticsAPI.js +16 -14
- package/build/dist/Server/API/BaseAnalyticsAPI.js.map +1 -1
- package/build/dist/Server/Middleware/TelemetryIngest.js +4 -0
- package/build/dist/Server/Middleware/TelemetryIngest.js.map +1 -1
- package/build/dist/Server/Services/AnalyticsDatabaseService.js +35 -6
- package/build/dist/Server/Services/AnalyticsDatabaseService.js.map +1 -1
- package/build/dist/Server/Services/SpanService.js +142 -0
- package/build/dist/Server/Services/SpanService.js.map +1 -1
- package/build/dist/Server/Services/TraceAggregationService.js +42 -12
- package/build/dist/Server/Services/TraceAggregationService.js.map +1 -1
- package/build/dist/Tests/Server/Services/AnalyticsDatabaseService.test.js +8 -4
- package/build/dist/Tests/Server/Services/AnalyticsDatabaseService.test.js.map +1 -1
- package/build/dist/UI/Components/TelemetryViewer/TelemetryViewer.js +1 -1
- package/build/dist/UI/Components/TelemetryViewer/TelemetryViewer.js.map +1 -1
- package/package.json +1 -1
|
@@ -565,6 +565,37 @@ export default class Span extends AnalyticsBaseModel {
|
|
|
565
565
|
},
|
|
566
566
|
});
|
|
567
567
|
|
|
568
|
+
const isRootSpanColumn: AnalyticsTableColumn = new AnalyticsTableColumn({
|
|
569
|
+
key: "isRootSpan",
|
|
570
|
+
title: "Is Root Span",
|
|
571
|
+
description:
|
|
572
|
+
"Whether this span is a root span (has no parent), populated at ingest time for fast root-only filtering",
|
|
573
|
+
required: true,
|
|
574
|
+
defaultValue: false,
|
|
575
|
+
type: TableColumnType.Boolean,
|
|
576
|
+
skipIndex: {
|
|
577
|
+
name: "idx_is_root_span",
|
|
578
|
+
type: SkipIndexType.Set,
|
|
579
|
+
params: [2],
|
|
580
|
+
granularity: 4,
|
|
581
|
+
},
|
|
582
|
+
accessControl: {
|
|
583
|
+
read: [
|
|
584
|
+
Permission.ProjectOwner,
|
|
585
|
+
Permission.ProjectAdmin,
|
|
586
|
+
Permission.ProjectMember,
|
|
587
|
+
Permission.ReadTelemetryServiceTraces,
|
|
588
|
+
],
|
|
589
|
+
create: [
|
|
590
|
+
Permission.ProjectOwner,
|
|
591
|
+
Permission.ProjectAdmin,
|
|
592
|
+
Permission.ProjectMember,
|
|
593
|
+
Permission.CreateTelemetryServiceTraces,
|
|
594
|
+
],
|
|
595
|
+
update: [],
|
|
596
|
+
},
|
|
597
|
+
});
|
|
598
|
+
|
|
568
599
|
const retentionDateColumn: AnalyticsTableColumn = new AnalyticsTableColumn({
|
|
569
600
|
key: "retentionDate",
|
|
570
601
|
title: "Retention Date",
|
|
@@ -628,6 +659,7 @@ export default class Span extends AnalyticsBaseModel {
|
|
|
628
659
|
nameColumn,
|
|
629
660
|
kindColumn,
|
|
630
661
|
hasExceptionColumn,
|
|
662
|
+
isRootSpanColumn,
|
|
631
663
|
retentionDateColumn,
|
|
632
664
|
],
|
|
633
665
|
projections: [
|
|
@@ -641,6 +673,11 @@ export default class Span extends AnalyticsBaseModel {
|
|
|
641
673
|
query:
|
|
642
674
|
"SELECT projectId, traceId, startTime, serviceId, spanId, parentSpanId, name, durationUnixNano, statusCode, hasException ORDER BY (projectId, traceId, startTime)",
|
|
643
675
|
},
|
|
676
|
+
{
|
|
677
|
+
name: "proj_hist_by_minute",
|
|
678
|
+
query:
|
|
679
|
+
"SELECT projectId, toStartOfMinute(startTime) AS minute, serviceId, statusCode, isRootSpan, count() AS cnt GROUP BY projectId, minute, serviceId, statusCode, isRootSpan",
|
|
680
|
+
},
|
|
644
681
|
],
|
|
645
682
|
sortKeys: ["projectId", "startTime", "serviceId", "traceId"],
|
|
646
683
|
primaryKeys: ["projectId", "startTime", "serviceId", "traceId"],
|
|
@@ -809,6 +846,14 @@ export default class Span extends AnalyticsBaseModel {
|
|
|
809
846
|
this.setColumnValue("hasException", v);
|
|
810
847
|
}
|
|
811
848
|
|
|
849
|
+
/**
 * Reads the value stored under the "isRootSpan" column.
 *
 * @returns The stored flag, typed as `boolean | undefined`.
 */
public get isRootSpan(): boolean | undefined {
  const stored: boolean | undefined = this.getColumnValue("isRootSpan") as
    | boolean
    | undefined;
  return stored;
}
|
|
852
|
+
|
|
853
|
+
/**
 * Writes the given flag into the "isRootSpan" column.
 *
 * @param v - Value to store; `undefined` is passed through unchanged.
 */
public set isRootSpan(v: boolean | undefined) {
  const columnKey: string = "isRootSpan";
  this.setColumnValue(columnKey, v);
}
|
|
856
|
+
|
|
812
857
|
public get retentionDate(): Date | undefined {
|
|
813
858
|
return this.getColumnValue("retentionDate") as Date | undefined;
|
|
814
859
|
}
|
|
@@ -255,21 +255,22 @@ export default class BaseAnalyticsAPI<
|
|
|
255
255
|
const databaseProps: DatabaseCommonInteractionProps =
|
|
256
256
|
await CommonAPI.getDatabaseCommonInteractionProps(req);
|
|
257
257
|
|
|
258
|
-
const list
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
258
|
+
const [list, count] = await Promise.all([
|
|
259
|
+
this.service.findBy({
|
|
260
|
+
query,
|
|
261
|
+
select,
|
|
262
|
+
skip: skip,
|
|
263
|
+
limit: limit,
|
|
264
|
+
sort: sort,
|
|
265
|
+
groupBy: groupBy,
|
|
266
|
+
props: databaseProps,
|
|
267
|
+
}),
|
|
268
|
+
this.service.countBy({
|
|
269
|
+
query,
|
|
270
|
+
groupBy: groupBy,
|
|
271
|
+
props: databaseProps,
|
|
272
|
+
}),
|
|
273
|
+
]);
|
|
273
274
|
|
|
274
275
|
return Response.sendEntityArrayResponse(
|
|
275
276
|
req,
|
|
@@ -41,6 +41,13 @@ export default class TelemetryIngest {
|
|
|
41
41
|
| undefined;
|
|
42
42
|
}
|
|
43
43
|
|
|
44
|
+
// if x-oneuptime-ingestion-key header is present then use that as token.
|
|
45
|
+
if (!oneuptimeToken) {
|
|
46
|
+
oneuptimeToken = req.headers["x-oneuptime-ingestion-key"] as
|
|
47
|
+
| string
|
|
48
|
+
| undefined;
|
|
49
|
+
}
|
|
50
|
+
|
|
44
51
|
if (!oneuptimeToken) {
|
|
45
52
|
logger.error(
|
|
46
53
|
"Missing header: x-oneuptime-token",
|
|
@@ -211,18 +211,33 @@ export default class AnalyticsDatabaseService<
|
|
|
211
211
|
tableName: this.model.tableName,
|
|
212
212
|
} as LogAttributes);
|
|
213
213
|
|
|
214
|
-
const resultInJSON: ResponseJSON<JSONObject> =
|
|
215
|
-
await dbResult.json<JSONObject>();
|
|
216
|
-
|
|
217
214
|
let countPositive: PositiveNumber = new PositiveNumber(0);
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
resultInJSON
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
resultInJSON.data[0]
|
|
215
|
+
|
|
216
|
+
try {
|
|
217
|
+
const resultInJSON: ResponseJSON<JSONObject> =
|
|
218
|
+
await dbResult.json<JSONObject>();
|
|
219
|
+
|
|
220
|
+
if (
|
|
221
|
+
resultInJSON.data &&
|
|
222
|
+
resultInJSON.data[0] &&
|
|
223
|
+
resultInJSON.data[0]["count"] &&
|
|
224
|
+
typeof resultInJSON.data[0]["count"] === "string"
|
|
225
|
+
) {
|
|
226
|
+
countPositive = new PositiveNumber(
|
|
227
|
+
resultInJSON.data[0]["count"] as string,
|
|
228
|
+
);
|
|
229
|
+
}
|
|
230
|
+
} catch {
|
|
231
|
+
/*
|
|
232
|
+
* When max_execution_time fires with timeout_overflow_mode='break',
|
|
233
|
+
* ClickHouse may return a truncated response for count() queries
|
|
234
|
+
* (the aggregation has no partial row to emit). Treat this as
|
|
235
|
+
* "count unavailable" rather than a fatal error — the list query
|
|
236
|
+
* itself still succeeds.
|
|
237
|
+
*/
|
|
238
|
+
logger.warn(
|
|
239
|
+
`${this.model.tableName} count query returned unparseable response, defaulting to 0`,
|
|
240
|
+
{ tableName: this.model.tableName } as LogAttributes,
|
|
226
241
|
);
|
|
227
242
|
}
|
|
228
243
|
|
|
@@ -689,6 +704,19 @@ export default class AnalyticsDatabaseService<
|
|
|
689
704
|
}}
|
|
690
705
|
`);
|
|
691
706
|
}
|
|
707
|
+
|
|
708
|
+
/*
|
|
709
|
+
* Cap count query runtime below the ClickHouse client's 58s
|
|
710
|
+
* request_timeout. Wide time-range queries on large tables (e.g. Span)
|
|
711
|
+
* can scan billions of rows; without a cap the query runs until the
|
|
712
|
+
* HTTP client disconnects, wasting ClickHouse resources. With 'break'
|
|
713
|
+
* mode ClickHouse returns a partial (lower-bound) count rather than
|
|
714
|
+
* throwing, which is acceptable for pagination display.
|
|
715
|
+
*/
|
|
716
|
+
statement.append(
|
|
717
|
+
" SETTINGS max_execution_time = 45, timeout_overflow_mode = 'break'",
|
|
718
|
+
);
|
|
719
|
+
|
|
692
720
|
logger.debug(`${this.model.tableName} Count Statement`, { tableName: this.model.tableName } as LogAttributes);
|
|
693
721
|
logger.debug(statement, { tableName: this.model.tableName } as LogAttributes);
|
|
694
722
|
|
|
@@ -813,6 +841,17 @@ export default class AnalyticsDatabaseService<
|
|
|
813
841
|
}}
|
|
814
842
|
`);
|
|
815
843
|
|
|
844
|
+
/*
|
|
845
|
+
* Defense in depth: cap find-query runtime below the ClickHouse
|
|
846
|
+
* client's 58s request_timeout. The LIMIT clause keeps most queries
|
|
847
|
+
* fast, but complex WHERE filters (e.g. parentSpanId IS NULL) on
|
|
848
|
+
* wide time ranges can still cause long scans. 'break' mode returns
|
|
849
|
+
* partial results rather than throwing.
|
|
850
|
+
*/
|
|
851
|
+
statement.append(
|
|
852
|
+
" SETTINGS max_execution_time = 45, timeout_overflow_mode = 'break'",
|
|
853
|
+
);
|
|
854
|
+
|
|
816
855
|
logger.debug(`${this.model.tableName} Find Statement`, { tableName: this.model.tableName } as LogAttributes);
|
|
817
856
|
logger.debug(statement, { tableName: this.model.tableName } as LogAttributes);
|
|
818
857
|
|
|
@@ -1,11 +1,187 @@
|
|
|
1
1
|
import ClickhouseDatabase from "../Infrastructure/ClickhouseDatabase";
|
|
2
2
|
import AnalyticsDatabaseService from "./AnalyticsDatabaseService";
|
|
3
3
|
import Span from "../../Models/AnalyticsModels/Span";
|
|
4
|
+
import CountBy from "../Types/AnalyticsDatabase/CountBy";
|
|
5
|
+
import { SQL, Statement } from "../Utils/AnalyticsDatabase/Statement";
|
|
6
|
+
import TableColumnType from "../../Types/AnalyticsDatabase/TableColumnType";
|
|
7
|
+
import InBetween from "../../Types/BaseDatabase/InBetween";
|
|
8
|
+
import Includes from "../../Types/BaseDatabase/Includes";
|
|
9
|
+
import ObjectID from "../../Types/ObjectID";
|
|
10
|
+
|
|
11
|
+
/**
 * Query keys the proj_hist_by_minute projection can answer with. If a count
 * query touches only these columns, it can be routed through the projection
 * instead of scanning the raw span table. Any other column (kind, name,
 * traceId, attributes, …) forces a fallback to the generic table scan because
 * the projection doesn't store those values.
 *
 * Typed as ReadonlySet so the shared, module-level lookup cannot be mutated
 * by accident (which would silently change which queries get routed).
 */
const PROJECTION_ELIGIBLE_KEYS: ReadonlySet<string> = new Set<string>([
  "projectId",
  "startTime",
  "isRootSpan",
  "serviceId",
  "statusCode",
]);
|
|
4
25
|
|
|
5
26
|
export class SpanService extends AnalyticsDatabaseService<Span> {
  /**
   * @param clickhouseDatabase - Optional ClickHouse handle, forwarded to the
   * base service as its `database` option.
   */
  public constructor(clickhouseDatabase?: ClickhouseDatabase | undefined) {
    super({ modelType: Span, database: clickhouseDatabase });
  }

  /**
   * Normalize a JSON-deserialized date value to a Date instance. When a query
   * crosses the API boundary, InBetween's startValue/endValue come back as ISO
   * strings (or numeric epoch ms) rather than Date objects.
   *
   * @param value - A Date, a string, or a number; anything else is rejected.
   * @returns A valid Date, or null if the value is unusable (wrong type or
   * parses to an invalid date).
   */
  private static coerceToDate(value: unknown): Date | null {
    if (value instanceof Date) {
      return isNaN(value.getTime()) ? null : value;
    }
    if (typeof value === "string" || typeof value === "number") {
      const parsed: Date = new Date(value);
      return isNaN(parsed.getTime()) ? null : parsed;
    }
    return null;
  }

  /**
   * Override the count statement to route eligible queries through the
   * proj_hist_by_minute projection. The projection is keyed on
   * (projectId, toStartOfMinute(startTime), serviceId, statusCode, isRootSpan)
   * so its WHERE clause must reference the projection's exact expressions —
   * filtering on raw `startTime` won't trigger projection use.
   *
   * Trade-off: time bounds get rounded to the minute, so the count can be
   * inflated by spans that started in the same minute as the boundary. For
   * pagination this is acceptable.
   *
   * @param countBy - The count request (query, optional groupBy, props).
   * @returns The projection-friendly statement when the query is eligible,
   * otherwise whatever the base class builds.
   */
  public override toCountStatement(countBy: CountBy<Span>): Statement {
    const projectionStatement: Statement | null =
      this.tryBuildProjectionCountStatement(countBy);
    if (projectionStatement) {
      return projectionStatement;
    }
    return super.toCountStatement(countBy);
  }

  /**
   * Builds a count statement shaped to match the proj_hist_by_minute
   * projection, or returns null when the request cannot be expressed that way
   * so the caller falls back to the generic statement.
   *
   * Returns null when:
   *  - a non-empty groupBy is present,
   *  - the query references a key outside PROJECTION_ELIGIBLE_KEYS,
   *  - projectId is missing or startTime is not an InBetween with two usable
   *    date bounds,
   *  - isRootSpan, serviceId or statusCode is given in a form this method does
   *    not recognize.
   *
   * NOTE(review): this statement does not add LIMIT/OFFSET; confirm no caller
   * relies on those for Span counts.
   */
  private tryBuildProjectionCountStatement(
    countBy: CountBy<Span>,
  ): Statement | null {
    if (countBy.groupBy && Object.keys(countBy.groupBy).length > 0) {
      // GROUP BY count needs the raw table; projection can't help.
      return null;
    }

    const query: Record<string, unknown> = (countBy.query ||
      {}) as unknown as Record<string, unknown>;

    // Bail out if the query references any column the projection doesn't store.
    for (const key of Object.keys(query)) {
      if (!PROJECTION_ELIGIBLE_KEYS.has(key)) {
        return null;
      }
    }

    const projectId: ObjectID | undefined = query["projectId"] as
      | ObjectID
      | undefined;
    const startTimeFilter: unknown = query["startTime"];

    /*
     * Projection only helps when both projectId and a time range are bound —
     * these are the partition pruning / primary key conditions the optimizer
     * needs to see in projection-form.
     */
    if (!projectId || !(startTimeFilter instanceof InBetween)) {
      return null;
    }

    if (!this.database) {
      this.useDefaultDatabase();
    }
    const databaseName: string = this.database.getDatasourceOptions().database!;

    const startValue: Date | null = SpanService.coerceToDate(
      startTimeFilter.startValue,
    );
    const endValue: Date | null = SpanService.coerceToDate(
      startTimeFilter.endValue,
    );
    if (!startValue || !endValue) {
      return null;
    }

    const statement: Statement = SQL`SELECT count() AS count FROM ${databaseName}.${this.model.tableName} WHERE projectId = ${{
      type: TableColumnType.ObjectID,
      value: projectId,
    }} AND toStartOfMinute(startTime) >= toStartOfMinute(${{
      type: TableColumnType.Date,
      value: startValue,
    }}) AND toStartOfMinute(startTime) <= toStartOfMinute(${{
      type: TableColumnType.Date,
      value: endValue,
    }})`;

    /*
     * Only a real boolean is translated into the projection filter. Coercing
     * with Boolean() would turn any other form (the string "false", an
     * operator object, …) into `true` and silently count the wrong rows, so
     * those forms are handed to the generic path instead — the same policy
     * used for serviceId and statusCode below.
     */
    const isRootSpanValue: unknown = query["isRootSpan"];
    if (typeof isRootSpanValue === "boolean") {
      statement.append(
        SQL` AND isRootSpan = ${{
          type: TableColumnType.Boolean,
          value: isRootSpanValue,
        }}`,
      );
    } else if (isRootSpanValue !== undefined) {
      return null;
    }

    const serviceIdValue: unknown = query["serviceId"];
    if (serviceIdValue instanceof ObjectID) {
      statement.append(
        SQL` AND serviceId = ${{
          type: TableColumnType.ObjectID,
          value: serviceIdValue,
        }}`,
      );
    } else if (serviceIdValue instanceof Includes) {
      statement.append(
        SQL` AND serviceId IN (${{
          type: TableColumnType.ObjectID,
          value: serviceIdValue,
        }})`,
      );
    } else if (serviceIdValue !== undefined) {
      // Unrecognized serviceId form — let the generic path handle it.
      return null;
    }

    const statusCodeValue: unknown = query["statusCode"];
    if (typeof statusCodeValue === "number") {
      statement.append(
        SQL` AND statusCode = ${{
          type: TableColumnType.Number,
          value: statusCodeValue,
        }}`,
      );
    } else if (statusCodeValue instanceof Includes) {
      statement.append(
        SQL` AND statusCode IN (${{
          type: TableColumnType.Number,
          value: statusCodeValue,
        }})`,
      );
    } else if (statusCodeValue !== undefined) {
      // Unrecognized statusCode form — let the generic path handle it.
      return null;
    }

    /*
     * optimize_use_projections is on by default in modern ClickHouse but we
     * set it explicitly to make the intent obvious. The 45s cap is defense
     * in depth — projection scans should complete in <1s.
     */
    statement.append(
      " SETTINGS optimize_use_projections = 1, max_execution_time = 45, timeout_overflow_mode = 'break'",
    );

    return statement;
  }
}
|
|
10
186
|
|
|
11
187
|
export default new SpanService();
|
|
@@ -8,6 +8,7 @@ import Includes from "../../Types/BaseDatabase/Includes";
|
|
|
8
8
|
import AnalyticsTableName from "../../Types/AnalyticsDatabase/AnalyticsTableName";
|
|
9
9
|
import CaptureSpan from "../Utils/Telemetry/CaptureSpan";
|
|
10
10
|
import { DbJSONResponse, Results } from "./AnalyticsDatabaseService";
|
|
11
|
+
import logger from "../Utils/Logger";
|
|
11
12
|
|
|
12
13
|
export interface HistogramBucket {
|
|
13
14
|
time: string;
|
|
@@ -66,6 +67,7 @@ export class TraceAggregationService {
|
|
|
66
67
|
"name",
|
|
67
68
|
"kind",
|
|
68
69
|
"statusCode",
|
|
70
|
+
"isRootSpan",
|
|
69
71
|
]);
|
|
70
72
|
private static readonly ATTRIBUTE_KEY_PATTERN: RegExp = /^[a-zA-Z0-9._:/-]+$/;
|
|
71
73
|
private static readonly MAX_FACET_KEY_LENGTH: number = 256;
|
|
@@ -78,11 +80,23 @@ export class TraceAggregationService {
|
|
|
78
80
|
TraceAggregationService.buildHistogramStatement(request);
|
|
79
81
|
|
|
80
82
|
const dbResult: Results = await SpanService.executeQuery(statement);
|
|
81
|
-
const response: DbJSONResponse = await dbResult.json<{
|
|
82
|
-
data?: Array<JSONObject>;
|
|
83
|
-
}>();
|
|
84
83
|
|
|
85
|
-
|
|
84
|
+
let rows: Array<JSONObject> = [];
|
|
85
|
+
try {
|
|
86
|
+
const response: DbJSONResponse = await dbResult.json<{
|
|
87
|
+
data?: Array<JSONObject>;
|
|
88
|
+
}>();
|
|
89
|
+
rows = response.data || [];
|
|
90
|
+
} catch {
|
|
91
|
+
/*
|
|
92
|
+
* When max_execution_time fires with timeout_overflow_mode='break',
|
|
93
|
+
* ClickHouse may return a truncated JSON response. Return an empty
|
|
94
|
+
* histogram rather than failing — the user still sees the span list.
|
|
95
|
+
*/
|
|
96
|
+
logger.warn(
|
|
97
|
+
"Histogram query returned unparseable response, returning empty result",
|
|
98
|
+
);
|
|
99
|
+
}
|
|
86
100
|
|
|
87
101
|
return rows.map((row: JSONObject): HistogramBucket => {
|
|
88
102
|
return {
|
|
@@ -289,41 +303,60 @@ export class TraceAggregationService {
|
|
|
289
303
|
private static buildHistogramStatement(request: HistogramRequest): Statement {
|
|
290
304
|
const intervalSeconds: number = request.bucketSizeInMinutes * 60;
|
|
291
305
|
|
|
306
|
+
/*
|
|
307
|
+
* Two-stage aggregation. The inner query groups by minute + statusCode,
|
|
308
|
+
* which exactly matches the proj_hist_by_minute projection. With
|
|
309
|
+
* optimize_use_projections=1 (default in modern ClickHouse), the inner
|
|
310
|
+
* scan reads pre-aggregated rows instead of the 1.8B-row span table —
|
|
311
|
+
* even for multi-week ranges. The outer query then re-buckets the tiny
|
|
312
|
+
* minute-level result to the requested bucket size.
|
|
313
|
+
*
|
|
314
|
+
* If any non-projection filter (kind, name, traceId, nameSearchText,
|
|
315
|
+
* attributes) is active, ClickHouse transparently falls back to
|
|
316
|
+
* scanning the main table for the inner query — same cost as before.
|
|
317
|
+
*/
|
|
292
318
|
const statement: Statement = SQL`
|
|
293
319
|
SELECT
|
|
294
|
-
toStartOfInterval(
|
|
320
|
+
toStartOfInterval(minute, INTERVAL ${{
|
|
295
321
|
type: TableColumnType.Number,
|
|
296
322
|
value: intervalSeconds,
|
|
297
323
|
}} SECOND) AS bucket,
|
|
298
324
|
statusCode,
|
|
299
|
-
|
|
300
|
-
FROM
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
AND startTime <= ${{
|
|
310
|
-
type: TableColumnType.Date,
|
|
311
|
-
value: request.endTime,
|
|
325
|
+
sum(cnt_minute) AS cnt
|
|
326
|
+
FROM (
|
|
327
|
+
SELECT
|
|
328
|
+
toStartOfMinute(startTime) AS minute,
|
|
329
|
+
statusCode,
|
|
330
|
+
count() AS cnt_minute
|
|
331
|
+
FROM ${TraceAggregationService.TABLE_NAME}
|
|
332
|
+
WHERE projectId = ${{
|
|
333
|
+
type: TableColumnType.ObjectID,
|
|
334
|
+
value: request.projectId,
|
|
312
335
|
}}
|
|
336
|
+
AND startTime >= ${{
|
|
337
|
+
type: TableColumnType.Date,
|
|
338
|
+
value: request.startTime,
|
|
339
|
+
}}
|
|
340
|
+
AND startTime <= ${{
|
|
341
|
+
type: TableColumnType.Date,
|
|
342
|
+
value: request.endTime,
|
|
343
|
+
}}
|
|
313
344
|
`;
|
|
314
345
|
|
|
315
346
|
TraceAggregationService.appendCommonFilters(statement, request);
|
|
316
347
|
|
|
317
|
-
statement.append(
|
|
348
|
+
statement.append(
|
|
349
|
+
" GROUP BY minute, statusCode ) GROUP BY bucket, statusCode ORDER BY bucket ASC",
|
|
350
|
+
);
|
|
318
351
|
|
|
319
352
|
/*
|
|
320
353
|
* Defense in depth: cap histogram runtime below nginx's 60s
|
|
321
354
|
* proxy_read_timeout. ClickHouse returns partial aggregated results
|
|
322
355
|
* with 'break' mode rather than throwing, which is acceptable for
|
|
323
|
-
* a density visualization.
|
|
356
|
+
* a density visualization. Explicitly enable projection use.
|
|
324
357
|
*/
|
|
325
358
|
statement.append(
|
|
326
|
-
" SETTINGS max_execution_time = 45, timeout_overflow_mode = 'break'",
|
|
359
|
+
" SETTINGS max_execution_time = 45, timeout_overflow_mode = 'break', optimize_use_projections = 1",
|
|
327
360
|
);
|
|
328
361
|
|
|
329
362
|
return statement;
|
|
@@ -401,7 +434,7 @@ export class TraceAggregationService {
|
|
|
401
434
|
request: TraceFilters,
|
|
402
435
|
): void {
|
|
403
436
|
if (request.rootOnly) {
|
|
404
|
-
statement.append(" AND
|
|
437
|
+
statement.append(" AND isRootSpan = 1");
|
|
405
438
|
}
|
|
406
439
|
|
|
407
440
|
if (request.serviceIds && request.serviceIds.length > 0) {
|
|
@@ -106,7 +106,8 @@ describe("AnalyticsDatabaseService", () => {
|
|
|
106
106
|
"SELECT\n" +
|
|
107
107
|
" count() as count\n" +
|
|
108
108
|
"FROM {p0:Identifier}.{p1:Identifier}\n" +
|
|
109
|
-
"WHERE TRUE <where-statement>"
|
|
109
|
+
"WHERE TRUE <where-statement>" +
|
|
110
|
+
" SETTINGS max_execution_time = 45, timeout_overflow_mode = 'break'",
|
|
110
111
|
);
|
|
111
112
|
expect(statement.query_params).toStrictEqual({
|
|
112
113
|
p0: "oneuptime",
|
|
@@ -126,7 +127,8 @@ describe("AnalyticsDatabaseService", () => {
|
|
|
126
127
|
" count() as count\n" +
|
|
127
128
|
"FROM {p0:Identifier}.{p1:Identifier}\n" +
|
|
128
129
|
"WHERE TRUE <where-statement>\n" +
|
|
129
|
-
"LIMIT {p2:Int32}"
|
|
130
|
+
"LIMIT {p2:Int32}\n" +
|
|
131
|
+
" SETTINGS max_execution_time = 45, timeout_overflow_mode = 'break'",
|
|
130
132
|
);
|
|
131
133
|
expect(statement.query_params).toStrictEqual({
|
|
132
134
|
p0: "oneuptime",
|
|
@@ -147,7 +149,8 @@ describe("AnalyticsDatabaseService", () => {
|
|
|
147
149
|
" count() as count\n" +
|
|
148
150
|
"FROM {p0:Identifier}.{p1:Identifier}\n" +
|
|
149
151
|
"WHERE TRUE <where-statement>\n" +
|
|
150
|
-
"OFFSET {p2:Int32}"
|
|
152
|
+
"OFFSET {p2:Int32}\n" +
|
|
153
|
+
" SETTINGS max_execution_time = 45, timeout_overflow_mode = 'break'",
|
|
151
154
|
);
|
|
152
155
|
expect(statement.query_params).toStrictEqual({
|
|
153
156
|
p0: "oneuptime",
|
|
@@ -211,7 +214,8 @@ describe("AnalyticsDatabaseService", () => {
|
|
|
211
214
|
});
|
|
212
215
|
|
|
213
216
|
expect(statement.query).toBe(
|
|
214
|
-
"SELECT <select-statement> FROM {p0:Identifier}.{p1:Identifier} WHERE TRUE <where-statement> ORDER BY <sort-statement> LIMIT {p2:Int32} OFFSET {p3:Int32}"
|
|
217
|
+
"SELECT <select-statement> FROM {p0:Identifier}.{p1:Identifier} WHERE TRUE <where-statement> ORDER BY <sort-statement> LIMIT {p2:Int32} OFFSET {p3:Int32}\n" +
|
|
218
|
+
"SETTINGS max_execution_time = 45, timeout_overflow_mode = 'break'",
|
|
215
219
|
);
|
|
216
220
|
expect(statement.query_params).toStrictEqual({
|
|
217
221
|
p0: "oneuptime",
|
|
@@ -107,7 +107,7 @@ function TelemetryViewerInner<T>(props: TelemetryViewerProps<T>): ReactElement {
|
|
|
107
107
|
const showHistogram: boolean = props.showHistogram ?? true;
|
|
108
108
|
|
|
109
109
|
return (
|
|
110
|
-
<div className="flex h-
|
|
110
|
+
<div className="flex min-h-0 w-full flex-1 flex-col gap-3">
|
|
111
111
|
{/* Toolbar */}
|
|
112
112
|
<div className="flex flex-wrap items-center gap-2">
|
|
113
113
|
<div className="min-w-0 flex-1">
|
|
@@ -523,6 +523,35 @@ export default class Span extends AnalyticsBaseModel {
|
|
|
523
523
|
update: [],
|
|
524
524
|
},
|
|
525
525
|
});
|
|
526
|
+
const isRootSpanColumn = new AnalyticsTableColumn({
|
|
527
|
+
key: "isRootSpan",
|
|
528
|
+
title: "Is Root Span",
|
|
529
|
+
description: "Whether this span is a root span (has no parent), populated at ingest time for fast root-only filtering",
|
|
530
|
+
required: true,
|
|
531
|
+
defaultValue: false,
|
|
532
|
+
type: TableColumnType.Boolean,
|
|
533
|
+
skipIndex: {
|
|
534
|
+
name: "idx_is_root_span",
|
|
535
|
+
type: SkipIndexType.Set,
|
|
536
|
+
params: [2],
|
|
537
|
+
granularity: 4,
|
|
538
|
+
},
|
|
539
|
+
accessControl: {
|
|
540
|
+
read: [
|
|
541
|
+
Permission.ProjectOwner,
|
|
542
|
+
Permission.ProjectAdmin,
|
|
543
|
+
Permission.ProjectMember,
|
|
544
|
+
Permission.ReadTelemetryServiceTraces,
|
|
545
|
+
],
|
|
546
|
+
create: [
|
|
547
|
+
Permission.ProjectOwner,
|
|
548
|
+
Permission.ProjectAdmin,
|
|
549
|
+
Permission.ProjectMember,
|
|
550
|
+
Permission.CreateTelemetryServiceTraces,
|
|
551
|
+
],
|
|
552
|
+
update: [],
|
|
553
|
+
},
|
|
554
|
+
});
|
|
526
555
|
const retentionDateColumn = new AnalyticsTableColumn({
|
|
527
556
|
key: "retentionDate",
|
|
528
557
|
title: "Retention Date",
|
|
@@ -584,6 +613,7 @@ export default class Span extends AnalyticsBaseModel {
|
|
|
584
613
|
nameColumn,
|
|
585
614
|
kindColumn,
|
|
586
615
|
hasExceptionColumn,
|
|
616
|
+
isRootSpanColumn,
|
|
587
617
|
retentionDateColumn,
|
|
588
618
|
],
|
|
589
619
|
projections: [
|
|
@@ -595,6 +625,10 @@ export default class Span extends AnalyticsBaseModel {
|
|
|
595
625
|
name: "proj_trace_by_id",
|
|
596
626
|
query: "SELECT projectId, traceId, startTime, serviceId, spanId, parentSpanId, name, durationUnixNano, statusCode, hasException ORDER BY (projectId, traceId, startTime)",
|
|
597
627
|
},
|
|
628
|
+
{
|
|
629
|
+
name: "proj_hist_by_minute",
|
|
630
|
+
query: "SELECT projectId, toStartOfMinute(startTime) AS minute, serviceId, statusCode, isRootSpan, count() AS cnt GROUP BY projectId, minute, serviceId, statusCode, isRootSpan",
|
|
631
|
+
},
|
|
598
632
|
],
|
|
599
633
|
sortKeys: ["projectId", "startTime", "serviceId", "traceId"],
|
|
600
634
|
primaryKeys: ["projectId", "startTime", "serviceId", "traceId"],
|
|
@@ -722,6 +756,12 @@ export default class Span extends AnalyticsBaseModel {
|
|
|
722
756
|
set hasException(v) {
|
|
723
757
|
this.setColumnValue("hasException", v);
|
|
724
758
|
}
|
|
759
|
+
get isRootSpan() {
|
|
760
|
+
return this.getColumnValue("isRootSpan");
|
|
761
|
+
}
|
|
762
|
+
set isRootSpan(v) {
|
|
763
|
+
this.setColumnValue("isRootSpan", v);
|
|
764
|
+
}
|
|
725
765
|
get retentionDate() {
|
|
726
766
|
return this.getColumnValue("retentionDate");
|
|
727
767
|
}
|