@oneuptime/common 10.5.17 → 10.5.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65):
  1. package/Server/API/TelemetryAPI.ts +6 -0
  2. package/Server/EnvironmentConfig.ts +27 -0
  3. package/Server/Infrastructure/ClickhouseDatabase.ts +21 -1
  4. package/Server/Infrastructure/Postgres/DataSourceOptions.ts +19 -0
  5. package/Server/Infrastructure/PostgresDatabase.ts +27 -1
  6. package/Server/Infrastructure/QueueWorker.ts +14 -3
  7. package/Server/Infrastructure/Redis.ts +11 -0
  8. package/Server/Services/TelemetryAttributeService.ts +38 -2
  9. package/Server/Utils/Express.ts +32 -0
  10. package/Server/Utils/GracefulShutdown.ts +194 -0
  11. package/Server/Utils/Monitor/MonitorLogUtil.ts +22 -17
  12. package/Server/Utils/Profiling.ts +14 -6
  13. package/Server/Utils/Telemetry/LogExceptionExtractor.ts +289 -0
  14. package/Server/Utils/Telemetry/StackTraceParser.ts +423 -0
  15. package/Server/Utils/Telemetry.ts +15 -5
  16. package/Tests/Server/Services/TelemetryAttributeService.test.ts +83 -0
  17. package/Tests/Server/Utils/Telemetry/LogExceptionExtractor.test.ts +0 -0
  18. package/UI/Components/AutocompleteTextInput/AutocompleteTextInput.tsx +7 -1
  19. package/UI/Components/Dictionary/Dictionary.tsx +19 -0
  20. package/UI/Components/Filters/FiltersForm.tsx +1 -0
  21. package/UI/Components/Filters/JSONFilter.tsx +2 -0
  22. package/UI/Components/Filters/Types/Filter.ts +1 -0
  23. package/build/dist/Server/API/TelemetryAPI.js +4 -0
  24. package/build/dist/Server/API/TelemetryAPI.js.map +1 -1
  25. package/build/dist/Server/EnvironmentConfig.js +19 -0
  26. package/build/dist/Server/EnvironmentConfig.js.map +1 -1
  27. package/build/dist/Server/Infrastructure/ClickhouseDatabase.js +16 -2
  28. package/build/dist/Server/Infrastructure/ClickhouseDatabase.js.map +1 -1
  29. package/build/dist/Server/Infrastructure/Postgres/DataSourceOptions.js +10 -9
  30. package/build/dist/Server/Infrastructure/Postgres/DataSourceOptions.js.map +1 -1
  31. package/build/dist/Server/Infrastructure/PostgresDatabase.js +20 -1
  32. package/build/dist/Server/Infrastructure/PostgresDatabase.js.map +1 -1
  33. package/build/dist/Server/Infrastructure/QueueWorker.js +9 -2
  34. package/build/dist/Server/Infrastructure/QueueWorker.js.map +1 -1
  35. package/build/dist/Server/Infrastructure/Redis.js +5 -0
  36. package/build/dist/Server/Infrastructure/Redis.js.map +1 -1
  37. package/build/dist/Server/Services/TelemetryAttributeService.js +23 -1
  38. package/build/dist/Server/Services/TelemetryAttributeService.js.map +1 -1
  39. package/build/dist/Server/Utils/Express.js +23 -0
  40. package/build/dist/Server/Utils/Express.js.map +1 -1
  41. package/build/dist/Server/Utils/GracefulShutdown.js +145 -0
  42. package/build/dist/Server/Utils/GracefulShutdown.js.map +1 -0
  43. package/build/dist/Server/Utils/Monitor/MonitorLogUtil.js +12 -10
  44. package/build/dist/Server/Utils/Monitor/MonitorLogUtil.js.map +1 -1
  45. package/build/dist/Server/Utils/Profiling.js +8 -3
  46. package/build/dist/Server/Utils/Profiling.js.map +1 -1
  47. package/build/dist/Server/Utils/Telemetry/LogExceptionExtractor.js +214 -0
  48. package/build/dist/Server/Utils/Telemetry/LogExceptionExtractor.js.map +1 -0
  49. package/build/dist/Server/Utils/Telemetry/StackTraceParser.js +365 -0
  50. package/build/dist/Server/Utils/Telemetry/StackTraceParser.js.map +1 -0
  51. package/build/dist/Server/Utils/Telemetry.js +10 -4
  52. package/build/dist/Server/Utils/Telemetry.js.map +1 -1
  53. package/build/dist/Tests/Server/Services/TelemetryAttributeService.test.js +50 -0
  54. package/build/dist/Tests/Server/Services/TelemetryAttributeService.test.js.map +1 -0
  55. package/build/dist/Tests/Server/Utils/Telemetry/LogExceptionExtractor.test.js +0 -0
  56. package/build/dist/Tests/Server/Utils/Telemetry/LogExceptionExtractor.test.js.map +1 -0
  57. package/build/dist/UI/Components/AutocompleteTextInput/AutocompleteTextInput.js +7 -1
  58. package/build/dist/UI/Components/AutocompleteTextInput/AutocompleteTextInput.js.map +1 -1
  59. package/build/dist/UI/Components/Dictionary/Dictionary.js +10 -0
  60. package/build/dist/UI/Components/Dictionary/Dictionary.js.map +1 -1
  61. package/build/dist/UI/Components/Filters/FiltersForm.js +1 -1
  62. package/build/dist/UI/Components/Filters/FiltersForm.js.map +1 -1
  63. package/build/dist/UI/Components/Filters/JSONFilter.js +1 -1
  64. package/build/dist/UI/Components/Filters/JSONFilter.js.map +1 -1
  65. package/package.json +1 -1
@@ -236,12 +236,18 @@ const getAttributeValues: GetAttributeValuesFunction = async (
236
236
  ? (req.body["metricName"] as string)
237
237
  : undefined;
238
238
 
239
+ const searchText: string | undefined =
240
+ req.body["searchText"] && typeof req.body["searchText"] === "string"
241
+ ? (req.body["searchText"] as string)
242
+ : undefined;
243
+
239
244
  const values: string[] =
240
245
  await TelemetryAttributeService.fetchAttributeValues({
241
246
  projectId: databaseProps.tenantId,
242
247
  telemetryType,
243
248
  metricName,
244
249
  attributeKey,
250
+ searchText,
245
251
  });
246
252
 
247
253
  return Response.sendJsonObjectResponse(req, res, {
@@ -204,6 +204,33 @@ export const PostgresIdleTimeoutMs: number = parseInt(
204
204
  10,
205
205
  );
206
206
 
207
+ /*
208
+ * TCP keepalive initial delay (ms) for Postgres sockets. When the client
209
+ * process dies ungracefully (SIGKILL, OOM, crash) or a network partition cuts
210
+ * the link, Postgres has no way to know the client is gone and the backend
211
+ * lingers as an orphaned connection — by default up to the OS
212
+ * tcp_keepalive_time (~2h on Linux). Enabling socket keepalive makes
213
+ * node-postgres probe the peer so dead connections are detected and torn down
214
+ * promptly.
215
+ */
216
+ export const PostgresKeepAliveInitialDelayMs: number = parseInt(
217
+ process.env["DATABASE_KEEPALIVE_INITIAL_DELAY_MS"] || "10000",
218
+ 10,
219
+ );
220
+
221
+ /*
222
+ * Postgres-side idle-session timeout (ms). Server-side backstop for orphaned
223
+ * connections: the server terminates any session that sits idle (outside a
224
+ * transaction) longer than this. MUST be larger than the pool's
225
+ * idleTimeoutMillis (PostgresIdleTimeoutMs) so the pool reaps its own healthy
226
+ * idle connections first and only truly-orphaned sessions (client gone) ever
227
+ * hit this. Set to 0 to disable. Requires Postgres 14+.
228
+ */
229
+ export const PostgresIdleSessionTimeoutMs: number = parseInt(
230
+ process.env["DATABASE_IDLE_SESSION_TIMEOUT_MS"] || "300000",
231
+ 10,
232
+ );
233
+
207
234
  /*
208
235
  * TypeORM slow-query log threshold (ms). Any query exceeding this is
209
236
  * logged so we can find offenders in production without per-query
@@ -14,6 +14,7 @@ import HTTPErrorResponse from "../../Types/API/HTTPErrorResponse";
14
14
  import HTTPResponse from "../../Types/API/HTTPResponse";
15
15
  import { JSONObject } from "../../Types/JSON";
16
16
  import CaptureSpan from "../Utils/Telemetry/CaptureSpan";
17
+ import GracefulShutdown, { ShutdownPriority } from "../Utils/GracefulShutdown";
17
18
 
18
19
  export type ClickhouseClient = ClickHouseClient;
19
20
 
@@ -21,6 +22,14 @@ export default class ClickhouseDatabase {
21
22
  private dataSource!: ClickhouseClient | null;
22
23
  private options: ClickHouseClientConfigOptions;
23
24
 
25
+ /*
26
+ * Each instance owns its own pool (App vs. Ingest), so each needs a
27
+ * distinct shutdown-handler name. The two instances share a database name,
28
+ * so a per-instance counter is what makes the names unique.
29
+ */
30
+ private static instanceCounter: number = 0;
31
+ private readonly instanceId: number = ++ClickhouseDatabase.instanceCounter;
32
+
24
33
  public constructor(
25
34
  options: ClickHouseClientConfigOptions = dataSourceOptions,
26
35
  ) {
@@ -97,7 +106,18 @@ export default class ClickhouseDatabase {
97
106
  }
98
107
  };
99
108
 
100
- return await connectToDatabase();
109
+ const client: ClickhouseClient = await connectToDatabase();
110
+
111
+ // Close this Clickhouse pool on shutdown.
112
+ GracefulShutdown.registerHandler(
113
+ `ClickhouseDatabase#${this.instanceId}`,
114
+ ShutdownPriority.DataStores,
115
+ () => {
116
+ return this.disconnect();
117
+ },
118
+ );
119
+
120
+ return client;
101
121
  } catch (err) {
102
122
  logger.error("Clickhouse Database Connection Failed");
103
123
  logger.error(err);
@@ -11,7 +11,9 @@ import {
11
11
  MaxPostgresConnections,
12
12
  PostgresConnectionAcquireTimeoutMs,
13
13
  PostgresIdleInTransactionTimeoutMs,
14
+ PostgresIdleSessionTimeoutMs,
14
15
  PostgresIdleTimeoutMs,
16
+ PostgresKeepAliveInitialDelayMs,
15
17
  PostgresQueryTimeoutMs,
16
18
  PostgresSlowQueryLogThresholdMs,
17
19
  PostgresStatementTimeoutMs,
@@ -54,6 +56,23 @@ const dataSourceOptions: DataSourceOptions = {
54
56
  statement_timeout: PostgresStatementTimeoutMs,
55
57
  query_timeout: PostgresQueryTimeoutMs,
56
58
  idle_in_transaction_session_timeout: PostgresIdleInTransactionTimeoutMs,
59
+ /*
60
+ * Detect dead TCP peers (ungraceful client exit / network partition) so
61
+ * orphaned server-side connections get torn down instead of lingering
62
+ * until the OS keepalive default (~2h).
63
+ */
64
+ keepAlive: true,
65
+ keepAliveInitialDelayMillis: PostgresKeepAliveInitialDelayMs,
66
+ /*
67
+ * Server-side backstop for orphaned idle sessions. node-postgres has no
68
+ * first-class option for this GUC, so pass it via the libpq `options`
69
+ * startup parameter. Unitless values are milliseconds. Only applied when
70
+ * > 0, and must exceed idleTimeoutMillis (see EnvironmentConfig) so the
71
+ * pool reaps healthy idle connections before the server force-closes them.
72
+ */
73
+ ...(PostgresIdleSessionTimeoutMs > 0
74
+ ? { options: `-c idle_session_timeout=${PostgresIdleSessionTimeoutMs}` }
75
+ : {}),
57
76
  },
58
77
  /*
59
78
  * Log any query slower than the configured threshold so we can find
@@ -4,6 +4,7 @@ import Sleep from "../../Types/Sleep";
4
4
  import { DataSource, DataSourceOptions } from "typeorm";
5
5
  import { createDatabase, dropDatabase } from "typeorm-extension";
6
6
  import CaptureSpan from "../Utils/Telemetry/CaptureSpan";
7
+ import GracefulShutdown, { ShutdownPriority } from "../Utils/GracefulShutdown";
7
8
 
8
9
  export type DatabaseSourceOptions = DataSourceOptions;
9
10
  export type DatabaseSource = DataSource;
@@ -30,6 +31,15 @@ export default class Database {
30
31
 
31
32
  @CaptureSpan()
32
33
  public static async connect(): Promise<DataSource> {
34
+ /*
35
+ * Idempotent: a second connect() must not overwrite (and thereby orphan)
36
+ * the existing pool. Return the live DataSource instead of building a new
37
+ * one.
38
+ */
39
+ if (this.dataSource) {
40
+ return this.dataSource;
41
+ }
42
+
33
43
  let retry: number = 0;
34
44
 
35
45
  const dataSourceOptions: DataSourceOptions = this.getDatasourceOptions();
@@ -64,7 +74,23 @@ export default class Database {
64
74
  }
65
75
  };
66
76
 
67
- return await connectToDatabase();
77
+ const dataSource: DataSource = await connectToDatabase();
78
+
79
+ /*
80
+ * Drain the pool on shutdown. Registered here (after a successful
81
+ * connect) so we never register cleanup for a pool that was never
82
+ * created, and — thanks to GracefulShutdown deduping by name — exactly
83
+ * once even if connect() is somehow reached twice.
84
+ */
85
+ GracefulShutdown.registerHandler(
86
+ "PostgresDatabase",
87
+ ShutdownPriority.DataStores,
88
+ () => {
89
+ return this.disconnect();
90
+ },
91
+ );
92
+
93
+ return dataSource;
68
94
  } catch (err) {
69
95
  logger.error("Postgres Database Connection Failed");
70
96
  logger.error(err);
@@ -15,6 +15,7 @@ import Telemetry, {
15
15
  SpanStatusCode,
16
16
  } from "../Utils/Telemetry";
17
17
  import Redis from "./Redis";
18
+ import GracefulShutdown, { ShutdownPriority } from "../Utils/GracefulShutdown";
18
19
 
19
20
  export default class QueueWorker {
20
21
  @CaptureSpan()
@@ -116,9 +117,19 @@ export default class QueueWorker {
116
117
  : {}),
117
118
  });
118
119
 
119
- process.on("SIGINT", async () => {
120
- await worker.close();
121
- });
120
+ /*
121
+ * Stop pulling new jobs and let in-flight ones finish on shutdown. Runs in
122
+ * the Workers tier — before datastores are drained — so jobs mid-flight can
123
+ * still reach Postgres / Redis. Replaces a SIGINT-only handler that never
124
+ * fired in containers (Kubernetes / docker stop send SIGTERM).
125
+ */
126
+ GracefulShutdown.registerHandler(
127
+ `QueueWorker:${queueName}`,
128
+ ShutdownPriority.Workers,
129
+ () => {
130
+ return worker.close();
131
+ },
132
+ );
122
133
 
123
134
  return worker;
124
135
  }
@@ -15,6 +15,7 @@ import logger from "../Utils/Logger";
15
15
  import Sleep from "../../Types/Sleep";
16
16
  import { Redis as RedisClient, RedisOptions } from "ioredis";
17
17
  import CaptureSpan from "../Utils/Telemetry/CaptureSpan";
18
+ import GracefulShutdown, { ShutdownPriority } from "../Utils/GracefulShutdown";
18
19
 
19
20
  export type ClientType = RedisClient;
20
21
  export type RedisOptionsType = RedisOptions;
@@ -122,6 +123,16 @@ export default abstract class Redis {
122
123
  logger.debug(
123
124
  `Redis connected on ${RedisHostname}:${RedisPort.toNumber()}`,
124
125
  );
126
+
127
+ // Close the Redis connection on shutdown.
128
+ GracefulShutdown.registerHandler(
129
+ "Redis",
130
+ ShutdownPriority.DataStores,
131
+ () => {
132
+ return this.disconnect();
133
+ },
134
+ );
135
+
125
136
  return this.client;
126
137
  } catch (err) {
127
138
  logger.error("Redis Connection Failed");
@@ -364,6 +364,7 @@ export class TelemetryAttributeService {
364
364
  telemetryType: TelemetryType;
365
365
  metricName?: string | undefined;
366
366
  attributeKey: string;
367
+ searchText?: string | undefined;
367
368
  }): Promise<string[]> {
368
369
  const source: TelemetrySource | null = this.getTelemetrySource(
369
370
  data.telemetryType,
@@ -378,15 +379,17 @@ export class TelemetryAttributeService {
378
379
  source,
379
380
  metricName: data.metricName,
380
381
  attributeKey: data.attributeKey,
382
+ searchText: data.searchText,
381
383
  });
382
384
  }
383
385
 
384
- private static async fetchAttributeValuesFromDatabase(data: {
386
+ private static buildAttributeValuesStatement(data: {
385
387
  projectId: ObjectID;
386
388
  source: TelemetrySource;
387
389
  metricName?: string | undefined;
388
390
  attributeKey: string;
389
- }): Promise<Array<string>> {
391
+ searchText?: string | undefined;
392
+ }): Statement {
390
393
  const lookbackStartDate: Date =
391
394
  TelemetryAttributeService.getLookbackStartDate();
392
395
 
@@ -419,6 +422,26 @@ export class TelemetryAttributeService {
419
422
  );
420
423
  }
421
424
 
425
+ /*
426
+ * Case-insensitive substring filter so the value autocomplete keeps
427
+ * narrowing server-side as the user types. Without it only the first
428
+ * ATTRIBUTE_VALUES_LIMIT values (alphabetically) are ever reachable,
429
+ * which hides matches on high-cardinality keys (host.name, url, ...).
430
+ * Mirrors the ILIKE idiom used for bodySearchText / nameSearchText.
431
+ */
432
+ if (data.searchText && data.searchText.trim().length > 0) {
433
+ statement.append(
434
+ SQL`
435
+ AND ${data.source.attributesColumn}[${{
436
+ type: TableColumnType.Text,
437
+ value: data.attributeKey,
438
+ }}] ILIKE ${{
439
+ type: TableColumnType.Text,
440
+ value: `%${data.searchText.trim()}%`,
441
+ }}`,
442
+ );
443
+ }
444
+
422
445
  statement.append(
423
446
  SQL`
424
447
  ORDER BY attributeValue ASC
@@ -428,6 +451,19 @@ export class TelemetryAttributeService {
428
451
  }}`,
429
452
  );
430
453
 
454
+ return statement;
455
+ }
456
+
457
+ private static async fetchAttributeValuesFromDatabase(data: {
458
+ projectId: ObjectID;
459
+ source: TelemetrySource;
460
+ metricName?: string | undefined;
461
+ attributeKey: string;
462
+ searchText?: string | undefined;
463
+ }): Promise<Array<string>> {
464
+ const statement: Statement =
465
+ TelemetryAttributeService.buildAttributeValuesStatement(data);
466
+
431
467
  const dbResult: Results = await data.source.service.executeQuery(statement);
432
468
  const response: DbJSONResponse = await dbResult.json<{
433
469
  data?: Array<JSONObject>;
@@ -1,4 +1,5 @@
1
1
  import logger from "./Logger";
2
+ import GracefulShutdown, { ShutdownPriority } from "./GracefulShutdown";
2
3
  import Dictionary from "../../Types/Dictionary";
3
4
  import GenericFunction from "../../Types/GenericFunction";
4
5
  import { JSONObject, JSONObjectOrArray } from "../../Types/JSON";
@@ -104,6 +105,37 @@ class Express {
104
105
  this.httpServer = createServer(this.app);
105
106
  }
106
107
 
108
+ /*
109
+ * On shutdown, stop accepting new connections first (before datastores are
110
+ * drained) so in-flight requests can finish but new ones don't acquire
111
+ * resources we're about to tear down. closeIdleConnections() drops idle
112
+ * keep-alive sockets so server.close() doesn't block waiting on them; the
113
+ * GracefulShutdown per-handler timeout bounds anything still in flight.
114
+ */
115
+ GracefulShutdown.registerHandler(
116
+ "HttpServer",
117
+ ShutdownPriority.HttpServer,
118
+ () => {
119
+ return new Promise<void>((resolve: () => void) => {
120
+ if (!this.httpServer || !this.httpServer.listening) {
121
+ resolve();
122
+ return;
123
+ }
124
+
125
+ const server: Server & { closeIdleConnections?: () => void } =
126
+ this.httpServer;
127
+
128
+ if (typeof server.closeIdleConnections === "function") {
129
+ server.closeIdleConnections();
130
+ }
131
+
132
+ server.close(() => {
133
+ resolve();
134
+ });
135
+ });
136
+ },
137
+ );
138
+
107
139
  type ResolveFunction = (app: express.Application) => void;
108
140
 
109
141
  return new Promise<express.Application>((resolve: ResolveFunction) => {
@@ -0,0 +1,194 @@
1
+ import logger from "./Logger";
2
+
3
+ /*
4
+ * Centralized graceful-shutdown coordinator.
5
+ *
6
+ * Before this existed, each subsystem registered its own
7
+ * process.on("SIGTERM" | "SIGINT") handler independently. The telemetry
8
+ * handler in particular called process.exit(0) as soon as the OTEL SDK
9
+ * flushed, which raced every other handler and — crucially — meant the
10
+ * Postgres / Redis / Clickhouse pools were never drained. Connections were
11
+ * left for the OS socket teardown to reap (and leaked outright on SIGKILL or
12
+ * a network partition).
13
+ *
14
+ * Now every subsystem registers an async cleanup callback here, and this class
15
+ * is the single owner of the signal handlers and of process.exit. Handlers run
16
+ * in ascending priority order (lower first) so we stop accepting new work
17
+ * before tearing down the resources that work depends on:
18
+ *
19
+ * HttpServer (10) -> stop accepting new HTTP requests
20
+ * Workers (20) -> stop pulling new queue jobs, finish in-flight jobs
21
+ * Buffers (30) -> flush in-memory write buffers to their datastore
22
+ * DataStores (40) -> drain Postgres / Redis / Clickhouse pools
23
+ * Telemetry (50) -> flush traces / metrics / logs / profiles last
24
+ *
25
+ * Handlers in the same tier run concurrently. Each handler is bounded by a
26
+ * per-handler timeout, and the whole sequence by an overall deadline, so a
27
+ * single hung handler can never wedge the shutdown.
28
+ */
/*
 * Shutdown tiers. Handlers run in ascending numeric order, so a lower value
 * is torn down earlier. The gaps between values leave room for new tiers.
 */
export enum ShutdownPriority {
  HttpServer = 10,
  Workers = 20,
  Buffers = 30,
  DataStores = 40,
  Telemetry = 50,
}

// A cleanup callback. It may be synchronous or return a promise.
export type ShutdownCallback = () => Promise<void> | void;

// One registered cleanup callback, keyed by its unique name.
interface RegisteredShutdownHandler {
  name: string;
  priority: ShutdownPriority;
  callback: ShutdownCallback;
}

export default class GracefulShutdown {
  private static handlers: Array<RegisteredShutdownHandler> = [];
  private static signalListenersInstalled: boolean = false;
  private static isShuttingDown: boolean = false;

  /*
   * Highest tier whose handlers have already been started by shutdown().
   * Used to pick the next tier from the live handler list and to detect
   * registrations that arrive too late to be run.
   */
  private static highestStartedTier: number = Number.NEGATIVE_INFINITY;

  /*
   * How long a single handler may run before we give up on it and move on.
   * Kept comfortably under the orchestrator (Kubernetes) default
   * terminationGracePeriodSeconds of 30s.
   */
  private static readonly perHandlerTimeoutMs: number = 10_000;

  /*
   * Hard ceiling for the entire shutdown. If we blow past this we force-exit
   * rather than risk being SIGKILLed mid-cleanup.
   */
  private static readonly overallTimeoutMs: number = 25_000;

  /*
   * Register a cleanup callback to run on SIGTERM / SIGINT. Registering by a
   * stable name is idempotent: a repeat registration (e.g. a second connect())
   * replaces the previous callback instead of stacking a duplicate. Callers
   * that own multiple independent resources (e.g. two Clickhouse pools) must
   * therefore pass distinct names.
   *
   * A handler registered while shutdown is already running is still executed
   * as long as its tier has not started yet. If the tier has already started,
   * the handler cannot be run any more and a warning is logged.
   */
  public static registerHandler(
    name: string,
    priority: ShutdownPriority,
    callback: ShutdownCallback,
  ): void {
    const entry: RegisteredShutdownHandler = { name, priority, callback };

    const existingIndex: number = this.handlers.findIndex(
      (handler: RegisteredShutdownHandler) => {
        return handler.name === name;
      },
    );

    if (existingIndex >= 0) {
      this.handlers[existingIndex] = entry;
    } else {
      this.handlers.push(entry);
    }

    if (this.isShuttingDown && priority <= this.highestStartedTier) {
      logger.warn(
        `GracefulShutdown: handler "${name}" was registered after its tier (${priority}) had already started shutting down. It will not be run.`,
      );
    }

    this.installSignalListeners();
  }

  /*
   * Install the SIGTERM / SIGINT listeners at most once per process. Called
   * lazily from registerHandler so that merely importing this module never
   * attaches process listeners.
   */
  private static installSignalListeners(): void {
    if (this.signalListenersInstalled) {
      return;
    }
    this.signalListenersInstalled = true;

    process.on("SIGTERM", () => {
      void this.shutdown("SIGTERM");
    });
    process.on("SIGINT", () => {
      void this.shutdown("SIGINT");
    });
  }

  /*
   * Run every registered handler tier by tier (ascending priority), then exit
   * the process with code 0. Exits with code 1 when called a second time
   * while a shutdown is in progress, or when the overall deadline elapses.
   */
  public static async shutdown(signal: string): Promise<void> {
    if (this.isShuttingDown) {
      /*
       * A second signal while we're already draining means the operator (or
       * orchestrator) is impatient. Bail out immediately.
       */
      logger.warn(
        `GracefulShutdown: received ${signal} while already shutting down. Forcing exit.`,
      );
      return process.exit(1);
    }

    this.isShuttingDown = true;
    logger.info(
      `GracefulShutdown: received ${signal}. Draining ${this.handlers.length} handler(s)...`,
    );

    const forceExitTimer: ReturnType<typeof setTimeout> = setTimeout(() => {
      logger.error(
        `GracefulShutdown: exceeded ${this.overallTimeoutMs}ms overall deadline. Forcing exit.`,
      );
      return process.exit(1);
    }, this.overallTimeoutMs);

    // Don't let this timer keep the event loop alive on its own.
    forceExitTimer.unref();

    /*
     * Pick the next tier from the live handler list after every tier instead
     * of snapshotting the tiers up front. A handler running in an early tier
     * may register cleanup for a later tier (e.g. a draining worker that
     * lazily opens a datastore); with a snapshot that tier would be skipped
     * whenever it had no handlers at the moment shutdown began.
     */
    let tier: number | null = this.getNextTier();

    while (tier !== null) {
      const currentTier: number = tier;
      this.highestStartedTier = currentTier;

      const handlersInTier: Array<RegisteredShutdownHandler> =
        this.handlers.filter((handler: RegisteredShutdownHandler) => {
          return handler.priority === currentTier;
        });

      // Handlers within a tier are independent, so run them concurrently.
      await Promise.all(
        handlersInTier.map((handler: RegisteredShutdownHandler) => {
          return this.runHandlerWithTimeout(handler);
        }),
      );

      tier = this.getNextTier();
    }

    clearTimeout(forceExitTimer);
    logger.info("GracefulShutdown: all handlers complete. Exiting cleanly.");
    return process.exit(0);
  }

  /*
   * Lowest registered priority that is strictly greater than the highest tier
   * already started, or null when no tier is left to run.
   */
  private static getNextTier(): number | null {
    let next: number | null = null;

    for (const handler of this.handlers) {
      if (handler.priority <= this.highestStartedTier) {
        continue;
      }

      if (next === null || handler.priority < next) {
        next = handler.priority;
      }
    }

    return next;
  }

  /*
   * Run a single handler, resolving when it finishes, fails, or exceeds the
   * per-handler timeout, whichever happens first. Never rejects: failures
   * are logged so one bad handler cannot abort the rest of the tier.
   */
  private static async runHandlerWithTimeout(
    handler: RegisteredShutdownHandler,
  ): Promise<void> {
    logger.debug(`GracefulShutdown: running handler "${handler.name}"...`);

    let timer: ReturnType<typeof setTimeout> | undefined = undefined;

    const timeout: Promise<void> = new Promise<void>((resolve: () => void) => {
      const handle: ReturnType<typeof setTimeout> = setTimeout(() => {
        logger.warn(
          `GracefulShutdown: handler "${handler.name}" exceeded ${this.perHandlerTimeoutMs}ms. Moving on.`,
        );
        return resolve();
      }, this.perHandlerTimeoutMs);

      // The timeout alone must not keep the process alive.
      handle.unref();
      timer = handle;
    });

    /*
     * Invoke the callback inside an async wrapper so that a synchronous throw
     * and a rejected promise are both caught by the same try/catch.
     */
    const run: Promise<void> = (async (): Promise<void> => {
      try {
        await handler.callback();
        logger.debug(`GracefulShutdown: handler "${handler.name}" done.`);
      } catch (err) {
        logger.error(`GracefulShutdown: handler "${handler.name}" failed:`);
        logger.error(err);
      }
    })();

    try {
      await Promise.race([run, timeout]);
    } finally {
      // Cancel the pending timeout so it cannot log a spurious warning later.
      clearTimeout(timer);
    }
  }
}
@@ -2,6 +2,7 @@ import MonitorLogService from "../../Services/MonitorLogService";
2
2
  import GlobalConfigService from "../../Services/GlobalConfigService";
3
3
  import GlobalConfig from "../../../Models/DatabaseModels/GlobalConfig";
4
4
  import logger from "../Logger";
5
+ import GracefulShutdown, { ShutdownPriority } from "../GracefulShutdown";
5
6
  import OneUptimeDate from "../../../Types/Date";
6
7
  import ObjectID from "../../../Types/ObjectID";
7
8
  import { JSONObject } from "../../../Types/JSON";
@@ -44,8 +45,9 @@ export default class MonitorLogUtil {
44
45
  * here until either MONITOR_LOG_FLUSH_BATCH_SIZE rows arrive
45
46
  * (size trigger) or MONITOR_LOG_FLUSH_INTERVAL_MS elapses since
46
47
  * the first row entered an empty buffer (time trigger),
47
- * whichever comes first. On graceful shutdown the SIGTERM /
48
- * SIGINT hook below drains the buffer before the process exits.
48
+ * whichever comes first. On graceful shutdown the registered
49
+ * GracefulShutdown handler below drains the buffer (in the
50
+ * Buffers tier, before the datastores are torn down).
49
51
  */
50
52
  private static buffer: Array<JSONObject> = [];
51
53
  private static flushTimer: NodeJS.Timeout | null = null;
@@ -233,10 +235,12 @@ export default class MonitorLogUtil {
233
235
  }
234
236
 
235
237
  /*
236
- * Register SIGTERM / SIGINT handlers exactly once, lazily on
237
- * first ingest. We avoid registering at module-load time so
238
- * tooling that imports this file (e.g. migration runners,
239
- * CLI scripts) doesn't end up with stray process listeners.
238
+ * Register the shutdown flush exactly once, lazily on first ingest. We avoid
239
+ * registering at module-load time so tooling that imports this file (e.g.
240
+ * migration runners, CLI scripts) doesn't end up holding a stray handler.
241
+ *
242
+ * Runs in the Buffers tier — ahead of the DataStores tier — so the buffer is
243
+ * drained to Clickhouse before the datastore pools are torn down.
240
244
  */
241
245
  private static ensureShutdownHooks(): void {
242
246
  if (this.shutdownHooksRegistered) {
@@ -244,16 +248,17 @@ export default class MonitorLogUtil {
244
248
  }
245
249
  this.shutdownHooksRegistered = true;
246
250
 
247
- const flushOnShutdown: () => Promise<void> = async (): Promise<void> => {
248
- try {
249
- await this.flushAndWait();
250
- } catch (err) {
251
- logger.error("Error flushing MonitorLog buffer on shutdown:");
252
- logger.error(err);
253
- }
254
- };
255
-
256
- process.on("SIGTERM", flushOnShutdown);
257
- process.on("SIGINT", flushOnShutdown);
251
+ GracefulShutdown.registerHandler(
252
+ "MonitorLogUtil",
253
+ ShutdownPriority.Buffers,
254
+ async (): Promise<void> => {
255
+ try {
256
+ await this.flushAndWait();
257
+ } catch (err) {
258
+ logger.error("Error flushing MonitorLog buffer on shutdown:");
259
+ logger.error(err);
260
+ }
261
+ },
262
+ );
258
263
  }
259
264
  }
@@ -1,6 +1,7 @@
1
1
  import Pyroscope from "@pyroscope/nodejs";
2
2
  import { EnableProfiling } from "../EnvironmentConfig";
3
3
  import logger, { LogAttributes } from "./Logger";
4
+ import GracefulShutdown, { ShutdownPriority } from "./GracefulShutdown";
4
5
 
5
6
  export default class Profiling {
6
7
  public static init(data: { serviceName: string }): void {
@@ -44,12 +45,19 @@ export default class Profiling {
44
45
  logger.error(err, profilingLogAttributes);
45
46
  }
46
47
 
47
- process.on("SIGTERM", () => {
48
- Pyroscope.stop().catch((err: unknown) => {
49
- logger.error("Error stopping profiler:", profilingLogAttributes);
50
- logger.error(err, profilingLogAttributes);
51
- });
52
- });
48
+ // Stop the profiler last (Telemetry tier), alongside the OTEL flush.
49
+ GracefulShutdown.registerHandler(
50
+ "Profiling",
51
+ ShutdownPriority.Telemetry,
52
+ async (): Promise<void> => {
53
+ try {
54
+ await Pyroscope.stop();
55
+ } catch (err) {
56
+ logger.error("Error stopping profiler:", profilingLogAttributes);
57
+ logger.error(err, profilingLogAttributes);
58
+ }
59
+ },
60
+ );
53
61
  }
54
62
 
55
63
  private static getServerAddress(): string | undefined {