@oneuptime/common 10.5.17 → 10.5.19
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Server/API/TelemetryAPI.ts +6 -0
- package/Server/EnvironmentConfig.ts +27 -0
- package/Server/Infrastructure/ClickhouseDatabase.ts +21 -1
- package/Server/Infrastructure/Postgres/DataSourceOptions.ts +19 -0
- package/Server/Infrastructure/PostgresDatabase.ts +27 -1
- package/Server/Infrastructure/QueueWorker.ts +14 -3
- package/Server/Infrastructure/Redis.ts +11 -0
- package/Server/Services/TelemetryAttributeService.ts +38 -2
- package/Server/Utils/Express.ts +32 -0
- package/Server/Utils/GracefulShutdown.ts +194 -0
- package/Server/Utils/Monitor/MonitorLogUtil.ts +22 -17
- package/Server/Utils/Profiling.ts +14 -6
- package/Server/Utils/Telemetry/LogExceptionExtractor.ts +289 -0
- package/Server/Utils/Telemetry/StackTraceParser.ts +423 -0
- package/Server/Utils/Telemetry.ts +15 -5
- package/Tests/Server/Services/TelemetryAttributeService.test.ts +83 -0
- package/Tests/Server/Utils/Telemetry/LogExceptionExtractor.test.ts +0 -0
- package/UI/Components/AutocompleteTextInput/AutocompleteTextInput.tsx +7 -1
- package/UI/Components/Dictionary/Dictionary.tsx +19 -0
- package/UI/Components/Filters/FiltersForm.tsx +1 -0
- package/UI/Components/Filters/JSONFilter.tsx +2 -0
- package/UI/Components/Filters/Types/Filter.ts +1 -0
- package/build/dist/Server/API/TelemetryAPI.js +4 -0
- package/build/dist/Server/API/TelemetryAPI.js.map +1 -1
- package/build/dist/Server/EnvironmentConfig.js +19 -0
- package/build/dist/Server/EnvironmentConfig.js.map +1 -1
- package/build/dist/Server/Infrastructure/ClickhouseDatabase.js +16 -2
- package/build/dist/Server/Infrastructure/ClickhouseDatabase.js.map +1 -1
- package/build/dist/Server/Infrastructure/Postgres/DataSourceOptions.js +10 -9
- package/build/dist/Server/Infrastructure/Postgres/DataSourceOptions.js.map +1 -1
- package/build/dist/Server/Infrastructure/PostgresDatabase.js +20 -1
- package/build/dist/Server/Infrastructure/PostgresDatabase.js.map +1 -1
- package/build/dist/Server/Infrastructure/QueueWorker.js +9 -2
- package/build/dist/Server/Infrastructure/QueueWorker.js.map +1 -1
- package/build/dist/Server/Infrastructure/Redis.js +5 -0
- package/build/dist/Server/Infrastructure/Redis.js.map +1 -1
- package/build/dist/Server/Services/TelemetryAttributeService.js +23 -1
- package/build/dist/Server/Services/TelemetryAttributeService.js.map +1 -1
- package/build/dist/Server/Utils/Express.js +23 -0
- package/build/dist/Server/Utils/Express.js.map +1 -1
- package/build/dist/Server/Utils/GracefulShutdown.js +145 -0
- package/build/dist/Server/Utils/GracefulShutdown.js.map +1 -0
- package/build/dist/Server/Utils/Monitor/MonitorLogUtil.js +12 -10
- package/build/dist/Server/Utils/Monitor/MonitorLogUtil.js.map +1 -1
- package/build/dist/Server/Utils/Profiling.js +8 -3
- package/build/dist/Server/Utils/Profiling.js.map +1 -1
- package/build/dist/Server/Utils/Telemetry/LogExceptionExtractor.js +214 -0
- package/build/dist/Server/Utils/Telemetry/LogExceptionExtractor.js.map +1 -0
- package/build/dist/Server/Utils/Telemetry/StackTraceParser.js +365 -0
- package/build/dist/Server/Utils/Telemetry/StackTraceParser.js.map +1 -0
- package/build/dist/Server/Utils/Telemetry.js +10 -4
- package/build/dist/Server/Utils/Telemetry.js.map +1 -1
- package/build/dist/Tests/Server/Services/TelemetryAttributeService.test.js +50 -0
- package/build/dist/Tests/Server/Services/TelemetryAttributeService.test.js.map +1 -0
- package/build/dist/Tests/Server/Utils/Telemetry/LogExceptionExtractor.test.js +0 -0
- package/build/dist/Tests/Server/Utils/Telemetry/LogExceptionExtractor.test.js.map +1 -0
- package/build/dist/UI/Components/AutocompleteTextInput/AutocompleteTextInput.js +7 -1
- package/build/dist/UI/Components/AutocompleteTextInput/AutocompleteTextInput.js.map +1 -1
- package/build/dist/UI/Components/Dictionary/Dictionary.js +10 -0
- package/build/dist/UI/Components/Dictionary/Dictionary.js.map +1 -1
- package/build/dist/UI/Components/Filters/FiltersForm.js +1 -1
- package/build/dist/UI/Components/Filters/FiltersForm.js.map +1 -1
- package/build/dist/UI/Components/Filters/JSONFilter.js +1 -1
- package/build/dist/UI/Components/Filters/JSONFilter.js.map +1 -1
- package/package.json +1 -1
|
@@ -236,12 +236,18 @@ const getAttributeValues: GetAttributeValuesFunction = async (
|
|
|
236
236
|
? (req.body["metricName"] as string)
|
|
237
237
|
: undefined;
|
|
238
238
|
|
|
239
|
+
const searchText: string | undefined =
|
|
240
|
+
req.body["searchText"] && typeof req.body["searchText"] === "string"
|
|
241
|
+
? (req.body["searchText"] as string)
|
|
242
|
+
: undefined;
|
|
243
|
+
|
|
239
244
|
const values: string[] =
|
|
240
245
|
await TelemetryAttributeService.fetchAttributeValues({
|
|
241
246
|
projectId: databaseProps.tenantId,
|
|
242
247
|
telemetryType,
|
|
243
248
|
metricName,
|
|
244
249
|
attributeKey,
|
|
250
|
+
searchText,
|
|
245
251
|
});
|
|
246
252
|
|
|
247
253
|
return Response.sendJsonObjectResponse(req, res, {
|
|
@@ -204,6 +204,33 @@ export const PostgresIdleTimeoutMs: number = parseInt(
|
|
|
204
204
|
10,
|
|
205
205
|
);
|
|
206
206
|
|
|
207
|
+
/*
|
|
208
|
+
* TCP keepalive initial delay (ms) for Postgres sockets. When the client
|
|
209
|
+
* process dies ungracefully (SIGKILL, OOM, crash) or a network partition cuts
|
|
210
|
+
* the link, Postgres has no way to know the client is gone and the backend
|
|
211
|
+
* lingers as an orphaned connection — by default up to the OS
|
|
212
|
+
* tcp_keepalive_time (~2h on Linux). Enabling socket keepalive makes
|
|
213
|
+
* node-postgres probe the peer so dead connections are detected and torn down
|
|
214
|
+
* promptly.
|
|
215
|
+
*/
|
|
216
|
+
export const PostgresKeepAliveInitialDelayMs: number = parseInt(
|
|
217
|
+
process.env["DATABASE_KEEPALIVE_INITIAL_DELAY_MS"] || "10000",
|
|
218
|
+
10,
|
|
219
|
+
);
|
|
220
|
+
|
|
221
|
+
/*
|
|
222
|
+
* Postgres-side idle-session timeout (ms). Server-side backstop for orphaned
|
|
223
|
+
* connections: the server terminates any session that sits idle (outside a
|
|
224
|
+
* transaction) longer than this. MUST be larger than the pool's
|
|
225
|
+
* idleTimeoutMillis (PostgresIdleTimeoutMs) so the pool reaps its own healthy
|
|
226
|
+
* idle connections first and only truly-orphaned sessions (client gone) ever
|
|
227
|
+
* hit this. Set to 0 to disable. Requires Postgres 14+.
|
|
228
|
+
*/
|
|
229
|
+
export const PostgresIdleSessionTimeoutMs: number = parseInt(
|
|
230
|
+
process.env["DATABASE_IDLE_SESSION_TIMEOUT_MS"] || "300000",
|
|
231
|
+
10,
|
|
232
|
+
);
|
|
233
|
+
|
|
207
234
|
/*
|
|
208
235
|
* TypeORM slow-query log threshold (ms). Any query exceeding this is
|
|
209
236
|
* logged so we can find offenders in production without per-query
|
|
@@ -14,6 +14,7 @@ import HTTPErrorResponse from "../../Types/API/HTTPErrorResponse";
|
|
|
14
14
|
import HTTPResponse from "../../Types/API/HTTPResponse";
|
|
15
15
|
import { JSONObject } from "../../Types/JSON";
|
|
16
16
|
import CaptureSpan from "../Utils/Telemetry/CaptureSpan";
|
|
17
|
+
import GracefulShutdown, { ShutdownPriority } from "../Utils/GracefulShutdown";
|
|
17
18
|
|
|
18
19
|
export type ClickhouseClient = ClickHouseClient;
|
|
19
20
|
|
|
@@ -21,6 +22,14 @@ export default class ClickhouseDatabase {
|
|
|
21
22
|
private dataSource!: ClickhouseClient | null;
|
|
22
23
|
private options: ClickHouseClientConfigOptions;
|
|
23
24
|
|
|
25
|
+
/*
|
|
26
|
+
* Each instance owns its own pool (App vs. Ingest), so each needs a
|
|
27
|
+
* distinct shutdown-handler name. The two instances share a database name,
|
|
28
|
+
* so a per-instance counter is what makes the names unique.
|
|
29
|
+
*/
|
|
30
|
+
private static instanceCounter: number = 0;
|
|
31
|
+
private readonly instanceId: number = ++ClickhouseDatabase.instanceCounter;
|
|
32
|
+
|
|
24
33
|
public constructor(
|
|
25
34
|
options: ClickHouseClientConfigOptions = dataSourceOptions,
|
|
26
35
|
) {
|
|
@@ -97,7 +106,18 @@ export default class ClickhouseDatabase {
|
|
|
97
106
|
}
|
|
98
107
|
};
|
|
99
108
|
|
|
100
|
-
|
|
109
|
+
const client: ClickhouseClient = await connectToDatabase();
|
|
110
|
+
|
|
111
|
+
// Close this Clickhouse pool on shutdown.
|
|
112
|
+
GracefulShutdown.registerHandler(
|
|
113
|
+
`ClickhouseDatabase#${this.instanceId}`,
|
|
114
|
+
ShutdownPriority.DataStores,
|
|
115
|
+
() => {
|
|
116
|
+
return this.disconnect();
|
|
117
|
+
},
|
|
118
|
+
);
|
|
119
|
+
|
|
120
|
+
return client;
|
|
101
121
|
} catch (err) {
|
|
102
122
|
logger.error("Clickhouse Database Connection Failed");
|
|
103
123
|
logger.error(err);
|
|
@@ -11,7 +11,9 @@ import {
|
|
|
11
11
|
MaxPostgresConnections,
|
|
12
12
|
PostgresConnectionAcquireTimeoutMs,
|
|
13
13
|
PostgresIdleInTransactionTimeoutMs,
|
|
14
|
+
PostgresIdleSessionTimeoutMs,
|
|
14
15
|
PostgresIdleTimeoutMs,
|
|
16
|
+
PostgresKeepAliveInitialDelayMs,
|
|
15
17
|
PostgresQueryTimeoutMs,
|
|
16
18
|
PostgresSlowQueryLogThresholdMs,
|
|
17
19
|
PostgresStatementTimeoutMs,
|
|
@@ -54,6 +56,23 @@ const dataSourceOptions: DataSourceOptions = {
|
|
|
54
56
|
statement_timeout: PostgresStatementTimeoutMs,
|
|
55
57
|
query_timeout: PostgresQueryTimeoutMs,
|
|
56
58
|
idle_in_transaction_session_timeout: PostgresIdleInTransactionTimeoutMs,
|
|
59
|
+
/*
|
|
60
|
+
* Detect dead TCP peers (ungraceful client exit / network partition) so
|
|
61
|
+
* orphaned server-side connections get torn down instead of lingering
|
|
62
|
+
* until the OS keepalive default (~2h).
|
|
63
|
+
*/
|
|
64
|
+
keepAlive: true,
|
|
65
|
+
keepAliveInitialDelayMillis: PostgresKeepAliveInitialDelayMs,
|
|
66
|
+
/*
|
|
67
|
+
* Server-side backstop for orphaned idle sessions. node-postgres has no
|
|
68
|
+
* first-class option for this GUC, so pass it via the libpq `options`
|
|
69
|
+
* startup parameter. Unitless values are milliseconds. Only applied when
|
|
70
|
+
* > 0, and must exceed idleTimeoutMillis (see EnvironmentConfig) so the
|
|
71
|
+
* pool reaps healthy idle connections before the server force-closes them.
|
|
72
|
+
*/
|
|
73
|
+
...(PostgresIdleSessionTimeoutMs > 0
|
|
74
|
+
? { options: `-c idle_session_timeout=${PostgresIdleSessionTimeoutMs}` }
|
|
75
|
+
: {}),
|
|
57
76
|
},
|
|
58
77
|
/*
|
|
59
78
|
* Log any query slower than the configured threshold so we can find
|
|
@@ -4,6 +4,7 @@ import Sleep from "../../Types/Sleep";
|
|
|
4
4
|
import { DataSource, DataSourceOptions } from "typeorm";
|
|
5
5
|
import { createDatabase, dropDatabase } from "typeorm-extension";
|
|
6
6
|
import CaptureSpan from "../Utils/Telemetry/CaptureSpan";
|
|
7
|
+
import GracefulShutdown, { ShutdownPriority } from "../Utils/GracefulShutdown";
|
|
7
8
|
|
|
8
9
|
export type DatabaseSourceOptions = DataSourceOptions;
|
|
9
10
|
export type DatabaseSource = DataSource;
|
|
@@ -30,6 +31,15 @@ export default class Database {
|
|
|
30
31
|
|
|
31
32
|
@CaptureSpan()
|
|
32
33
|
public static async connect(): Promise<DataSource> {
|
|
34
|
+
/*
|
|
35
|
+
* Idempotent: a second connect() must not overwrite (and thereby orphan)
|
|
36
|
+
* the existing pool. Return the live DataSource instead of building a new
|
|
37
|
+
* one.
|
|
38
|
+
*/
|
|
39
|
+
if (this.dataSource) {
|
|
40
|
+
return this.dataSource;
|
|
41
|
+
}
|
|
42
|
+
|
|
33
43
|
let retry: number = 0;
|
|
34
44
|
|
|
35
45
|
const dataSourceOptions: DataSourceOptions = this.getDatasourceOptions();
|
|
@@ -64,7 +74,23 @@ export default class Database {
|
|
|
64
74
|
}
|
|
65
75
|
};
|
|
66
76
|
|
|
67
|
-
|
|
77
|
+
const dataSource: DataSource = await connectToDatabase();
|
|
78
|
+
|
|
79
|
+
/*
|
|
80
|
+
* Drain the pool on shutdown. Registered here (after a successful
|
|
81
|
+
* connect) so we never register cleanup for a pool that was never
|
|
82
|
+
* created, and — thanks to GracefulShutdown deduping by name — exactly
|
|
83
|
+
* once even if connect() is somehow reached twice.
|
|
84
|
+
*/
|
|
85
|
+
GracefulShutdown.registerHandler(
|
|
86
|
+
"PostgresDatabase",
|
|
87
|
+
ShutdownPriority.DataStores,
|
|
88
|
+
() => {
|
|
89
|
+
return this.disconnect();
|
|
90
|
+
},
|
|
91
|
+
);
|
|
92
|
+
|
|
93
|
+
return dataSource;
|
|
68
94
|
} catch (err) {
|
|
69
95
|
logger.error("Postgres Database Connection Failed");
|
|
70
96
|
logger.error(err);
|
|
@@ -15,6 +15,7 @@ import Telemetry, {
|
|
|
15
15
|
SpanStatusCode,
|
|
16
16
|
} from "../Utils/Telemetry";
|
|
17
17
|
import Redis from "./Redis";
|
|
18
|
+
import GracefulShutdown, { ShutdownPriority } from "../Utils/GracefulShutdown";
|
|
18
19
|
|
|
19
20
|
export default class QueueWorker {
|
|
20
21
|
@CaptureSpan()
|
|
@@ -116,9 +117,19 @@ export default class QueueWorker {
|
|
|
116
117
|
: {}),
|
|
117
118
|
});
|
|
118
119
|
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
120
|
+
/*
|
|
121
|
+
* Stop pulling new jobs and let in-flight ones finish on shutdown. Runs in
|
|
122
|
+
* the Workers tier — before datastores are drained — so jobs mid-flight can
|
|
123
|
+
* still reach Postgres / Redis. Replaces a SIGINT-only handler that never
|
|
124
|
+
* fired in containers (Kubernetes / docker stop send SIGTERM).
|
|
125
|
+
*/
|
|
126
|
+
GracefulShutdown.registerHandler(
|
|
127
|
+
`QueueWorker:${queueName}`,
|
|
128
|
+
ShutdownPriority.Workers,
|
|
129
|
+
() => {
|
|
130
|
+
return worker.close();
|
|
131
|
+
},
|
|
132
|
+
);
|
|
122
133
|
|
|
123
134
|
return worker;
|
|
124
135
|
}
|
|
@@ -15,6 +15,7 @@ import logger from "../Utils/Logger";
|
|
|
15
15
|
import Sleep from "../../Types/Sleep";
|
|
16
16
|
import { Redis as RedisClient, RedisOptions } from "ioredis";
|
|
17
17
|
import CaptureSpan from "../Utils/Telemetry/CaptureSpan";
|
|
18
|
+
import GracefulShutdown, { ShutdownPriority } from "../Utils/GracefulShutdown";
|
|
18
19
|
|
|
19
20
|
export type ClientType = RedisClient;
|
|
20
21
|
export type RedisOptionsType = RedisOptions;
|
|
@@ -122,6 +123,16 @@ export default abstract class Redis {
|
|
|
122
123
|
logger.debug(
|
|
123
124
|
`Redis connected on ${RedisHostname}:${RedisPort.toNumber()}`,
|
|
124
125
|
);
|
|
126
|
+
|
|
127
|
+
// Close the Redis connection on shutdown.
|
|
128
|
+
GracefulShutdown.registerHandler(
|
|
129
|
+
"Redis",
|
|
130
|
+
ShutdownPriority.DataStores,
|
|
131
|
+
() => {
|
|
132
|
+
return this.disconnect();
|
|
133
|
+
},
|
|
134
|
+
);
|
|
135
|
+
|
|
125
136
|
return this.client;
|
|
126
137
|
} catch (err) {
|
|
127
138
|
logger.error("Redis Connection Failed");
|
|
@@ -364,6 +364,7 @@ export class TelemetryAttributeService {
|
|
|
364
364
|
telemetryType: TelemetryType;
|
|
365
365
|
metricName?: string | undefined;
|
|
366
366
|
attributeKey: string;
|
|
367
|
+
searchText?: string | undefined;
|
|
367
368
|
}): Promise<string[]> {
|
|
368
369
|
const source: TelemetrySource | null = this.getTelemetrySource(
|
|
369
370
|
data.telemetryType,
|
|
@@ -378,15 +379,17 @@ export class TelemetryAttributeService {
|
|
|
378
379
|
source,
|
|
379
380
|
metricName: data.metricName,
|
|
380
381
|
attributeKey: data.attributeKey,
|
|
382
|
+
searchText: data.searchText,
|
|
381
383
|
});
|
|
382
384
|
}
|
|
383
385
|
|
|
384
|
-
private static
|
|
386
|
+
private static buildAttributeValuesStatement(data: {
|
|
385
387
|
projectId: ObjectID;
|
|
386
388
|
source: TelemetrySource;
|
|
387
389
|
metricName?: string | undefined;
|
|
388
390
|
attributeKey: string;
|
|
389
|
-
|
|
391
|
+
searchText?: string | undefined;
|
|
392
|
+
}): Statement {
|
|
390
393
|
const lookbackStartDate: Date =
|
|
391
394
|
TelemetryAttributeService.getLookbackStartDate();
|
|
392
395
|
|
|
@@ -419,6 +422,26 @@ export class TelemetryAttributeService {
|
|
|
419
422
|
);
|
|
420
423
|
}
|
|
421
424
|
|
|
425
|
+
/*
|
|
426
|
+
* Case-insensitive substring filter so the value autocomplete keeps
|
|
427
|
+
* narrowing server-side as the user types. Without it only the first
|
|
428
|
+
* ATTRIBUTE_VALUES_LIMIT values (alphabetically) are ever reachable,
|
|
429
|
+
* which hides matches on high-cardinality keys (host.name, url, ...).
|
|
430
|
+
* Mirrors the ILIKE idiom used for bodySearchText / nameSearchText.
|
|
431
|
+
*/
|
|
432
|
+
if (data.searchText && data.searchText.trim().length > 0) {
|
|
433
|
+
statement.append(
|
|
434
|
+
SQL`
|
|
435
|
+
AND ${data.source.attributesColumn}[${{
|
|
436
|
+
type: TableColumnType.Text,
|
|
437
|
+
value: data.attributeKey,
|
|
438
|
+
}}] ILIKE ${{
|
|
439
|
+
type: TableColumnType.Text,
|
|
440
|
+
value: `%${data.searchText.trim()}%`,
|
|
441
|
+
}}`,
|
|
442
|
+
);
|
|
443
|
+
}
|
|
444
|
+
|
|
422
445
|
statement.append(
|
|
423
446
|
SQL`
|
|
424
447
|
ORDER BY attributeValue ASC
|
|
@@ -428,6 +451,19 @@ export class TelemetryAttributeService {
|
|
|
428
451
|
}}`,
|
|
429
452
|
);
|
|
430
453
|
|
|
454
|
+
return statement;
|
|
455
|
+
}
|
|
456
|
+
|
|
457
|
+
private static async fetchAttributeValuesFromDatabase(data: {
|
|
458
|
+
projectId: ObjectID;
|
|
459
|
+
source: TelemetrySource;
|
|
460
|
+
metricName?: string | undefined;
|
|
461
|
+
attributeKey: string;
|
|
462
|
+
searchText?: string | undefined;
|
|
463
|
+
}): Promise<Array<string>> {
|
|
464
|
+
const statement: Statement =
|
|
465
|
+
TelemetryAttributeService.buildAttributeValuesStatement(data);
|
|
466
|
+
|
|
431
467
|
const dbResult: Results = await data.source.service.executeQuery(statement);
|
|
432
468
|
const response: DbJSONResponse = await dbResult.json<{
|
|
433
469
|
data?: Array<JSONObject>;
|
package/Server/Utils/Express.ts
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import logger from "./Logger";
|
|
2
|
+
import GracefulShutdown, { ShutdownPriority } from "./GracefulShutdown";
|
|
2
3
|
import Dictionary from "../../Types/Dictionary";
|
|
3
4
|
import GenericFunction from "../../Types/GenericFunction";
|
|
4
5
|
import { JSONObject, JSONObjectOrArray } from "../../Types/JSON";
|
|
@@ -104,6 +105,37 @@ class Express {
|
|
|
104
105
|
this.httpServer = createServer(this.app);
|
|
105
106
|
}
|
|
106
107
|
|
|
108
|
+
/*
|
|
109
|
+
* On shutdown, stop accepting new connections first (before datastores are
|
|
110
|
+
* drained) so in-flight requests can finish but new ones don't acquire
|
|
111
|
+
* resources we're about to tear down. closeIdleConnections() drops idle
|
|
112
|
+
* keep-alive sockets so server.close() doesn't block waiting on them; the
|
|
113
|
+
* GracefulShutdown per-handler timeout bounds anything still in flight.
|
|
114
|
+
*/
|
|
115
|
+
GracefulShutdown.registerHandler(
|
|
116
|
+
"HttpServer",
|
|
117
|
+
ShutdownPriority.HttpServer,
|
|
118
|
+
() => {
|
|
119
|
+
return new Promise<void>((resolve: () => void) => {
|
|
120
|
+
if (!this.httpServer || !this.httpServer.listening) {
|
|
121
|
+
resolve();
|
|
122
|
+
return;
|
|
123
|
+
}
|
|
124
|
+
|
|
125
|
+
const server: Server & { closeIdleConnections?: () => void } =
|
|
126
|
+
this.httpServer;
|
|
127
|
+
|
|
128
|
+
if (typeof server.closeIdleConnections === "function") {
|
|
129
|
+
server.closeIdleConnections();
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
server.close(() => {
|
|
133
|
+
resolve();
|
|
134
|
+
});
|
|
135
|
+
});
|
|
136
|
+
},
|
|
137
|
+
);
|
|
138
|
+
|
|
107
139
|
type ResolveFunction = (app: express.Application) => void;
|
|
108
140
|
|
|
109
141
|
return new Promise<express.Application>((resolve: ResolveFunction) => {
|
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
import logger from "./Logger";
|
|
2
|
+
|
|
3
|
+
/*
|
|
4
|
+
* Centralized graceful-shutdown coordinator.
|
|
5
|
+
*
|
|
6
|
+
* Before this existed, each subsystem registered its own
|
|
7
|
+
* process.on("SIGTERM" | "SIGINT") handler independently. The telemetry
|
|
8
|
+
* handler in particular called process.exit(0) as soon as the OTEL SDK
|
|
9
|
+
* flushed, which raced every other handler and — crucially — meant the
|
|
10
|
+
* Postgres / Redis / Clickhouse pools were never drained. Connections were
|
|
11
|
+
* left for the OS socket teardown to reap (and leaked outright on SIGKILL or
|
|
12
|
+
* a network partition).
|
|
13
|
+
*
|
|
14
|
+
* Now every subsystem registers an async cleanup callback here, and this class
|
|
15
|
+
* is the single owner of the signal handlers and of process.exit. Handlers run
|
|
16
|
+
* in ascending priority order (lower first) so we stop accepting new work
|
|
17
|
+
* before tearing down the resources that work depends on:
|
|
18
|
+
*
|
|
19
|
+
* HttpServer (10) -> stop accepting new HTTP requests
|
|
20
|
+
* Workers (20) -> stop pulling new queue jobs, finish in-flight jobs
|
|
21
|
+
* Buffers (30) -> flush in-memory write buffers to their datastore
|
|
22
|
+
* DataStores (40) -> drain Postgres / Redis / Clickhouse pools
|
|
23
|
+
* Telemetry (50) -> flush traces / metrics / logs / profiles last
|
|
24
|
+
*
|
|
25
|
+
* Handlers in the same tier run concurrently. Each handler is bounded by a
|
|
26
|
+
* per-handler timeout, and the whole sequence by an overall deadline, so a
|
|
27
|
+
* single hung handler can never wedge the shutdown.
|
|
28
|
+
*/
|
|
29
|
+
export enum ShutdownPriority {
|
|
30
|
+
HttpServer = 10,
|
|
31
|
+
Workers = 20,
|
|
32
|
+
Buffers = 30,
|
|
33
|
+
DataStores = 40,
|
|
34
|
+
Telemetry = 50,
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
export type ShutdownCallback = () => Promise<void> | void;
|
|
38
|
+
|
|
39
|
+
interface RegisteredShutdownHandler {
|
|
40
|
+
name: string;
|
|
41
|
+
priority: ShutdownPriority;
|
|
42
|
+
callback: ShutdownCallback;
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
export default class GracefulShutdown {
|
|
46
|
+
private static handlers: Array<RegisteredShutdownHandler> = [];
|
|
47
|
+
private static signalListenersInstalled: boolean = false;
|
|
48
|
+
private static isShuttingDown: boolean = false;
|
|
49
|
+
|
|
50
|
+
/*
|
|
51
|
+
* How long a single handler may run before we give up on it and move on.
|
|
52
|
+
* Kept comfortably under the orchestrator (Kubernetes) default
|
|
53
|
+
* terminationGracePeriodSeconds of 30s.
|
|
54
|
+
*/
|
|
55
|
+
private static readonly perHandlerTimeoutMs: number = 10_000;
|
|
56
|
+
|
|
57
|
+
/*
|
|
58
|
+
* Hard ceiling for the entire shutdown. If we blow past this we force-exit
|
|
59
|
+
* rather than risk being SIGKILLed mid-cleanup.
|
|
60
|
+
*/
|
|
61
|
+
private static readonly overallTimeoutMs: number = 25_000;
|
|
62
|
+
|
|
63
|
+
/*
|
|
64
|
+
* Register a cleanup callback to run on SIGTERM / SIGINT. Registering by a
|
|
65
|
+
* stable name is idempotent: a repeat registration (e.g. a second connect())
|
|
66
|
+
* replaces the previous callback instead of stacking a duplicate. Callers
|
|
67
|
+
* that own multiple independent resources (e.g. two Clickhouse pools) must
|
|
68
|
+
* therefore pass distinct names.
|
|
69
|
+
*/
|
|
70
|
+
public static registerHandler(
|
|
71
|
+
name: string,
|
|
72
|
+
priority: ShutdownPriority,
|
|
73
|
+
callback: ShutdownCallback,
|
|
74
|
+
): void {
|
|
75
|
+
const existingIndex: number = this.handlers.findIndex(
|
|
76
|
+
(handler: RegisteredShutdownHandler) => {
|
|
77
|
+
return handler.name === name;
|
|
78
|
+
},
|
|
79
|
+
);
|
|
80
|
+
|
|
81
|
+
if (existingIndex >= 0) {
|
|
82
|
+
this.handlers[existingIndex] = { name, priority, callback };
|
|
83
|
+
} else {
|
|
84
|
+
this.handlers.push({ name, priority, callback });
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
this.installSignalListeners();
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
private static installSignalListeners(): void {
|
|
91
|
+
if (this.signalListenersInstalled) {
|
|
92
|
+
return;
|
|
93
|
+
}
|
|
94
|
+
this.signalListenersInstalled = true;
|
|
95
|
+
|
|
96
|
+
process.on("SIGTERM", () => {
|
|
97
|
+
void this.shutdown("SIGTERM");
|
|
98
|
+
});
|
|
99
|
+
process.on("SIGINT", () => {
|
|
100
|
+
void this.shutdown("SIGINT");
|
|
101
|
+
});
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
public static async shutdown(signal: string): Promise<void> {
|
|
105
|
+
if (this.isShuttingDown) {
|
|
106
|
+
/*
|
|
107
|
+
* A second signal while we're already draining means the operator (or
|
|
108
|
+
* orchestrator) is impatient. Bail out immediately.
|
|
109
|
+
*/
|
|
110
|
+
logger.warn(
|
|
111
|
+
`GracefulShutdown: received ${signal} while already shutting down. Forcing exit.`,
|
|
112
|
+
);
|
|
113
|
+
return process.exit(1);
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
this.isShuttingDown = true;
|
|
117
|
+
logger.info(
|
|
118
|
+
`GracefulShutdown: received ${signal}. Draining ${this.handlers.length} handler(s)...`,
|
|
119
|
+
);
|
|
120
|
+
|
|
121
|
+
const forceExitTimer: ReturnType<typeof setTimeout> = setTimeout(() => {
|
|
122
|
+
logger.error(
|
|
123
|
+
`GracefulShutdown: exceeded ${this.overallTimeoutMs}ms overall deadline. Forcing exit.`,
|
|
124
|
+
);
|
|
125
|
+
return process.exit(1);
|
|
126
|
+
}, this.overallTimeoutMs);
|
|
127
|
+
|
|
128
|
+
// Don't let this timer keep the event loop alive on its own.
|
|
129
|
+
forceExitTimer.unref();
|
|
130
|
+
|
|
131
|
+
// Run handlers tier by tier; lower priority tiers complete before the next.
|
|
132
|
+
const tiers: Array<number> = Array.from(
|
|
133
|
+
new Set(
|
|
134
|
+
this.handlers.map((handler: RegisteredShutdownHandler) => {
|
|
135
|
+
return handler.priority;
|
|
136
|
+
}),
|
|
137
|
+
),
|
|
138
|
+
).sort((a: number, b: number) => {
|
|
139
|
+
return a - b;
|
|
140
|
+
});
|
|
141
|
+
|
|
142
|
+
for (const tier of tiers) {
|
|
143
|
+
const handlersInTier: Array<RegisteredShutdownHandler> =
|
|
144
|
+
this.handlers.filter((handler: RegisteredShutdownHandler) => {
|
|
145
|
+
return handler.priority === tier;
|
|
146
|
+
});
|
|
147
|
+
|
|
148
|
+
// Handlers within a tier are independent, so run them concurrently.
|
|
149
|
+
await Promise.all(
|
|
150
|
+
handlersInTier.map((handler: RegisteredShutdownHandler) => {
|
|
151
|
+
return this.runHandlerWithTimeout(handler);
|
|
152
|
+
}),
|
|
153
|
+
);
|
|
154
|
+
}
|
|
155
|
+
|
|
156
|
+
clearTimeout(forceExitTimer);
|
|
157
|
+
logger.info("GracefulShutdown: all handlers complete. Exiting cleanly.");
|
|
158
|
+
return process.exit(0);
|
|
159
|
+
}
|
|
160
|
+
|
|
161
|
+
private static async runHandlerWithTimeout(
|
|
162
|
+
handler: RegisteredShutdownHandler,
|
|
163
|
+
): Promise<void> {
|
|
164
|
+
logger.debug(`GracefulShutdown: running handler "${handler.name}"...`);
|
|
165
|
+
|
|
166
|
+
let timer: ReturnType<typeof setTimeout> | null = null;
|
|
167
|
+
|
|
168
|
+
const timeout: Promise<void> = new Promise<void>((resolve: () => void) => {
|
|
169
|
+
timer = setTimeout(() => {
|
|
170
|
+
logger.warn(
|
|
171
|
+
`GracefulShutdown: handler "${handler.name}" exceeded ${this.perHandlerTimeoutMs}ms. Moving on.`,
|
|
172
|
+
);
|
|
173
|
+
return resolve();
|
|
174
|
+
}, this.perHandlerTimeoutMs);
|
|
175
|
+
timer.unref();
|
|
176
|
+
});
|
|
177
|
+
|
|
178
|
+
const run: Promise<void> = (async (): Promise<void> => {
|
|
179
|
+
try {
|
|
180
|
+
await handler.callback();
|
|
181
|
+
logger.debug(`GracefulShutdown: handler "${handler.name}" done.`);
|
|
182
|
+
} catch (err) {
|
|
183
|
+
logger.error(`GracefulShutdown: handler "${handler.name}" failed:`);
|
|
184
|
+
logger.error(err);
|
|
185
|
+
}
|
|
186
|
+
})();
|
|
187
|
+
|
|
188
|
+
await Promise.race([run, timeout]);
|
|
189
|
+
|
|
190
|
+
if (timer) {
|
|
191
|
+
clearTimeout(timer);
|
|
192
|
+
}
|
|
193
|
+
}
|
|
194
|
+
}
|
|
@@ -2,6 +2,7 @@ import MonitorLogService from "../../Services/MonitorLogService";
|
|
|
2
2
|
import GlobalConfigService from "../../Services/GlobalConfigService";
|
|
3
3
|
import GlobalConfig from "../../../Models/DatabaseModels/GlobalConfig";
|
|
4
4
|
import logger from "../Logger";
|
|
5
|
+
import GracefulShutdown, { ShutdownPriority } from "../GracefulShutdown";
|
|
5
6
|
import OneUptimeDate from "../../../Types/Date";
|
|
6
7
|
import ObjectID from "../../../Types/ObjectID";
|
|
7
8
|
import { JSONObject } from "../../../Types/JSON";
|
|
@@ -44,8 +45,9 @@ export default class MonitorLogUtil {
|
|
|
44
45
|
* here until either MONITOR_LOG_FLUSH_BATCH_SIZE rows arrive
|
|
45
46
|
* (size trigger) or MONITOR_LOG_FLUSH_INTERVAL_MS elapses since
|
|
46
47
|
* the first row entered an empty buffer (time trigger),
|
|
47
|
-
* whichever comes first. On graceful shutdown the
|
|
48
|
-
*
|
|
48
|
+
* whichever comes first. On graceful shutdown the registered
|
|
49
|
+
* GracefulShutdown handler below drains the buffer (in the
|
|
50
|
+
* Buffers tier, before the datastores are torn down).
|
|
49
51
|
*/
|
|
50
52
|
private static buffer: Array<JSONObject> = [];
|
|
51
53
|
private static flushTimer: NodeJS.Timeout | null = null;
|
|
@@ -233,10 +235,12 @@ export default class MonitorLogUtil {
|
|
|
233
235
|
}
|
|
234
236
|
|
|
235
237
|
/*
|
|
236
|
-
* Register
|
|
237
|
-
*
|
|
238
|
-
*
|
|
239
|
-
*
|
|
238
|
+
* Register the shutdown flush exactly once, lazily on first ingest. We avoid
|
|
239
|
+
* registering at module-load time so tooling that imports this file (e.g.
|
|
240
|
+
* migration runners, CLI scripts) doesn't end up holding a stray handler.
|
|
241
|
+
*
|
|
242
|
+
* Runs in the Buffers tier — ahead of the DataStores tier — so the buffer is
|
|
243
|
+
* drained to Clickhouse before the datastore pools are torn down.
|
|
240
244
|
*/
|
|
241
245
|
private static ensureShutdownHooks(): void {
|
|
242
246
|
if (this.shutdownHooksRegistered) {
|
|
@@ -244,16 +248,17 @@ export default class MonitorLogUtil {
|
|
|
244
248
|
}
|
|
245
249
|
this.shutdownHooksRegistered = true;
|
|
246
250
|
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
251
|
+
GracefulShutdown.registerHandler(
|
|
252
|
+
"MonitorLogUtil",
|
|
253
|
+
ShutdownPriority.Buffers,
|
|
254
|
+
async (): Promise<void> => {
|
|
255
|
+
try {
|
|
256
|
+
await this.flushAndWait();
|
|
257
|
+
} catch (err) {
|
|
258
|
+
logger.error("Error flushing MonitorLog buffer on shutdown:");
|
|
259
|
+
logger.error(err);
|
|
260
|
+
}
|
|
261
|
+
},
|
|
262
|
+
);
|
|
258
263
|
}
|
|
259
264
|
}
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import Pyroscope from "@pyroscope/nodejs";
|
|
2
2
|
import { EnableProfiling } from "../EnvironmentConfig";
|
|
3
3
|
import logger, { LogAttributes } from "./Logger";
|
|
4
|
+
import GracefulShutdown, { ShutdownPriority } from "./GracefulShutdown";
|
|
4
5
|
|
|
5
6
|
export default class Profiling {
|
|
6
7
|
public static init(data: { serviceName: string }): void {
|
|
@@ -44,12 +45,19 @@ export default class Profiling {
|
|
|
44
45
|
logger.error(err, profilingLogAttributes);
|
|
45
46
|
}
|
|
46
47
|
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
48
|
+
// Stop the profiler last (Telemetry tier), alongside the OTEL flush.
|
|
49
|
+
GracefulShutdown.registerHandler(
|
|
50
|
+
"Profiling",
|
|
51
|
+
ShutdownPriority.Telemetry,
|
|
52
|
+
async (): Promise<void> => {
|
|
53
|
+
try {
|
|
54
|
+
await Pyroscope.stop();
|
|
55
|
+
} catch (err) {
|
|
56
|
+
logger.error("Error stopping profiler:", profilingLogAttributes);
|
|
57
|
+
logger.error(err, profilingLogAttributes);
|
|
58
|
+
}
|
|
59
|
+
},
|
|
60
|
+
);
|
|
53
61
|
}
|
|
54
62
|
|
|
55
63
|
private static getServerAddress(): string | undefined {
|