@oneuptime/common 10.0.83 → 10.0.85

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100)
  1. package/Models/DatabaseModels/Index.ts +2 -0
  2. package/Models/DatabaseModels/KubernetesContainer.ts +552 -0
  3. package/Models/DatabaseModels/KubernetesResource.ts +130 -0
  4. package/Models/DatabaseModels/LlmLog.ts +2 -1
  5. package/Models/DatabaseModels/LlmProvider.ts +5 -4
  6. package/Models/DatabaseModels/Project.ts +40 -0
  7. package/Server/API/KubernetesResourceAPI.ts +144 -12
  8. package/Server/Infrastructure/Postgres/SchemaMigrations/1777550162848-MigrationName.ts +29 -0
  9. package/Server/Infrastructure/Postgres/SchemaMigrations/1777571961028-MigrationName.ts +99 -0
  10. package/Server/Infrastructure/Postgres/SchemaMigrations/Index.ts +4 -0
  11. package/Server/Infrastructure/Queue.ts +60 -0
  12. package/Server/Infrastructure/QueueWorker.ts +39 -1
  13. package/Server/Middleware/HttpMetricsMiddleware.ts +92 -0
  14. package/Server/Services/AuditLogService.ts +19 -1
  15. package/Server/Services/KubernetesContainerService.ts +264 -0
  16. package/Server/Services/KubernetesResourceService.ts +233 -0
  17. package/Server/Services/StatusPageSubscriberService.ts +4 -4
  18. package/Server/Types/Database/Permissions/AccessControlPermission.ts +3 -3
  19. package/Server/Utils/LLM/LLMService.ts +132 -11
  20. package/Server/Utils/Monitor/MonitorAlert.ts +1 -1
  21. package/Server/Utils/Monitor/MonitorIncident.ts +1 -1
  22. package/Server/Utils/StartServer.ts +2 -0
  23. package/Server/Utils/Telemetry/AppMetrics.ts +211 -0
  24. package/Server/Utils/Telemetry/RuntimeMetrics.ts +169 -0
  25. package/Server/Utils/Telemetry.ts +98 -0
  26. package/Server/Utils/Workspace/Slack/Actions/Alert.ts +2 -2
  27. package/Server/Utils/Workspace/Slack/Actions/Incident.ts +2 -2
  28. package/Server/Utils/Workspace/Slack/Actions/ScheduledMaintenance.ts +2 -2
  29. package/Tests/jest.setup.ts +18 -0
  30. package/Types/Kubernetes/KubernetesInventoryExtractor.ts +171 -5
  31. package/Types/LLM/LlmType.ts +3 -0
  32. package/UI/Components/Forms/ModelForm.tsx +3 -3
  33. package/UI/Components/Label/Labels.tsx +10 -2
  34. package/UI/Components/LogsViewer/components/LogsAnalyticsView.tsx +2 -2
  35. package/Utils/UUID.ts +1 -3
  36. package/build/dist/Models/DatabaseModels/Index.js +2 -0
  37. package/build/dist/Models/DatabaseModels/Index.js.map +1 -1
  38. package/build/dist/Models/DatabaseModels/KubernetesContainer.js +581 -0
  39. package/build/dist/Models/DatabaseModels/KubernetesContainer.js.map +1 -0
  40. package/build/dist/Models/DatabaseModels/KubernetesResource.js +135 -0
  41. package/build/dist/Models/DatabaseModels/KubernetesResource.js.map +1 -1
  42. package/build/dist/Models/DatabaseModels/LlmLog.js +1 -1
  43. package/build/dist/Models/DatabaseModels/LlmLog.js.map +1 -1
  44. package/build/dist/Models/DatabaseModels/LlmProvider.js +4 -4
  45. package/build/dist/Models/DatabaseModels/LlmProvider.js.map +1 -1
  46. package/build/dist/Models/DatabaseModels/Project.js +41 -0
  47. package/build/dist/Models/DatabaseModels/Project.js.map +1 -1
  48. package/build/dist/Server/API/KubernetesResourceAPI.js +106 -9
  49. package/build/dist/Server/API/KubernetesResourceAPI.js.map +1 -1
  50. package/build/dist/Server/Infrastructure/Postgres/SchemaMigrations/1777550162848-MigrationName.js +16 -0
  51. package/build/dist/Server/Infrastructure/Postgres/SchemaMigrations/1777550162848-MigrationName.js.map +1 -0
  52. package/build/dist/Server/Infrastructure/Postgres/SchemaMigrations/1777571961028-MigrationName.js +40 -0
  53. package/build/dist/Server/Infrastructure/Postgres/SchemaMigrations/1777571961028-MigrationName.js.map +1 -0
  54. package/build/dist/Server/Infrastructure/Postgres/SchemaMigrations/Index.js +4 -0
  55. package/build/dist/Server/Infrastructure/Postgres/SchemaMigrations/Index.js.map +1 -1
  56. package/build/dist/Server/Infrastructure/Queue.js +44 -0
  57. package/build/dist/Server/Infrastructure/Queue.js.map +1 -1
  58. package/build/dist/Server/Infrastructure/QueueWorker.js +31 -1
  59. package/build/dist/Server/Infrastructure/QueueWorker.js.map +1 -1
  60. package/build/dist/Server/Middleware/HttpMetricsMiddleware.js +61 -0
  61. package/build/dist/Server/Middleware/HttpMetricsMiddleware.js.map +1 -0
  62. package/build/dist/Server/Services/AuditLogService.js +14 -1
  63. package/build/dist/Server/Services/AuditLogService.js.map +1 -1
  64. package/build/dist/Server/Services/KubernetesContainerService.js +179 -0
  65. package/build/dist/Server/Services/KubernetesContainerService.js.map +1 -0
  66. package/build/dist/Server/Services/KubernetesResourceService.js +175 -0
  67. package/build/dist/Server/Services/KubernetesResourceService.js.map +1 -1
  68. package/build/dist/Server/Services/StatusPageSubscriberService.js +4 -4
  69. package/build/dist/Server/Services/StatusPageSubscriberService.js.map +1 -1
  70. package/build/dist/Server/Types/Database/Permissions/AccessControlPermission.js +3 -3
  71. package/build/dist/Server/Utils/LLM/LLMService.js +111 -13
  72. package/build/dist/Server/Utils/LLM/LLMService.js.map +1 -1
  73. package/build/dist/Server/Utils/Monitor/MonitorAlert.js +1 -1
  74. package/build/dist/Server/Utils/Monitor/MonitorAlert.js.map +1 -1
  75. package/build/dist/Server/Utils/Monitor/MonitorIncident.js +1 -1
  76. package/build/dist/Server/Utils/Monitor/MonitorIncident.js.map +1 -1
  77. package/build/dist/Server/Utils/StartServer.js +2 -0
  78. package/build/dist/Server/Utils/StartServer.js.map +1 -1
  79. package/build/dist/Server/Utils/Telemetry/AppMetrics.js +167 -0
  80. package/build/dist/Server/Utils/Telemetry/AppMetrics.js.map +1 -0
  81. package/build/dist/Server/Utils/Telemetry/RuntimeMetrics.js +141 -0
  82. package/build/dist/Server/Utils/Telemetry/RuntimeMetrics.js.map +1 -0
  83. package/build/dist/Server/Utils/Telemetry.js +47 -0
  84. package/build/dist/Server/Utils/Telemetry.js.map +1 -1
  85. package/build/dist/Server/Utils/Workspace/Slack/Actions/Alert.js +2 -2
  86. package/build/dist/Server/Utils/Workspace/Slack/Actions/Incident.js +2 -2
  87. package/build/dist/Server/Utils/Workspace/Slack/Actions/ScheduledMaintenance.js +2 -2
  88. package/build/dist/Tests/jest.setup.js +17 -0
  89. package/build/dist/Tests/jest.setup.js.map +1 -1
  90. package/build/dist/Types/Kubernetes/KubernetesInventoryExtractor.js +116 -4
  91. package/build/dist/Types/Kubernetes/KubernetesInventoryExtractor.js.map +1 -1
  92. package/build/dist/Types/LLM/LlmType.js +3 -0
  93. package/build/dist/Types/LLM/LlmType.js.map +1 -1
  94. package/build/dist/UI/Components/Forms/ModelForm.js +3 -3
  95. package/build/dist/UI/Components/Label/Labels.js +8 -2
  96. package/build/dist/UI/Components/Label/Labels.js.map +1 -1
  97. package/build/dist/UI/Components/LogsViewer/components/LogsAnalyticsView.js.map +1 -1
  98. package/build/dist/Utils/UUID.js +1 -2
  99. package/build/dist/Utils/UUID.js.map +1 -1
  100. package/package.json +6 -8
package/Server/Utils/LLM/LLMService.ts

@@ -46,7 +46,11 @@ export default class LLMService {
 
     switch (config.llmType) {
       case LlmType.OpenAI:
-        return await this.getOpenAICompletion(config, request);
+      case LlmType.Groq:
+      case LlmType.Mistral:
+        return await this.getOpenAICompatibleCompletion(config, request);
+      case LlmType.AzureOpenAI:
+        return await this.getAzureOpenAICompletion(config, request);
       case LlmType.Anthropic:
         return await this.getAnthropicCompletion(config, request);
       case LlmType.Ollama:
@@ -57,17 +61,32 @@ export default class LLMService {
   }
 
   @CaptureSpan()
-  private static async getOpenAICompletion(
+  private static async getOpenAICompatibleCompletion(
     config: LLMProviderConfig,
     request: LLMCompletionRequest,
   ): Promise<LLMCompletionResponse> {
     if (!config.apiKey) {
-      throw new BadDataException("OpenAI API key is required");
+      throw new BadDataException(`${config.llmType} API key is required`);
     }
 
-    const baseUrl: string = config.baseUrl || "https://api.openai.com/v1";
-    const modelName: string = config.modelName || "gpt-4o";
+    const defaultBaseUrls: Record<string, string> = {
+      [LlmType.OpenAI]: "https://api.openai.com/v1",
+      [LlmType.Groq]: "https://api.groq.com/openai/v1",
+      [LlmType.Mistral]: "https://api.mistral.ai/v1",
+    };
 
+    const defaultModels: Record<string, string> = {
+      [LlmType.OpenAI]: "gpt-4o",
+      [LlmType.Groq]: "llama-3.3-70b-versatile",
+      [LlmType.Mistral]: "mistral-large-latest",
+    };
+
+    const baseUrl: string =
+      config.baseUrl ||
+      defaultBaseUrls[config.llmType] ||
+      "https://api.openai.com/v1";
+    const modelName: string =
+      config.modelName || defaultModels[config.llmType] || "gpt-4o";
     const response: HTTPErrorResponse | HTTPResponse<JSONObject> =
       await API.post<JSONObject>({
         url: URL.fromString(`${baseUrl}/chat/completions`),
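To make the fallback chain above concrete: an explicit `baseUrl` or `modelName` on the config always wins, then the per-provider default, then the hard-coded OpenAI values. Here is a standalone sketch of the same resolution order, with plain strings standing in for the `LlmType` enum members (whose string values are not shown in this diff):

type SketchConfig = { llmType: string; baseUrl?: string; modelName?: string };

const DEFAULT_BASE_URLS: Record<string, string> = {
  OpenAI: "https://api.openai.com/v1",
  Groq: "https://api.groq.com/openai/v1",
  Mistral: "https://api.mistral.ai/v1",
};

// Explicit config > per-provider default > OpenAI fallback.
function resolveBaseUrl(config: SketchConfig): string {
  return (
    config.baseUrl ||
    DEFAULT_BASE_URLS[config.llmType] ||
    "https://api.openai.com/v1"
  );
}

resolveBaseUrl({ llmType: "Groq" }); // "https://api.groq.com/openai/v1"
resolveBaseUrl({ llmType: "Groq", baseUrl: "http://proxy:8080" }); // "http://proxy:8080"
resolveBaseUrl({ llmType: "SomethingElse" }); // "https://api.openai.com/v1"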
@@ -88,20 +107,122 @@ export default class LLMService {
         options: {
           retries: 2,
           exponentialBackoff: true,
-          timeout: 120000, // 2 minutes timeout for LLM calls
+          timeout: 120000,
+        },
+      });
+
+    const logAttributes: LogAttributes = {
+      llmType: config.llmType,
+      modelName: modelName,
+    };
+
+    if (response instanceof HTTPErrorResponse) {
+      logger.error(`Error from ${config.llmType} API:`, logAttributes);
+      logger.error(response, logAttributes);
+      throw new BadDataException(
+        `${config.llmType} API error: ${JSON.stringify(response.jsonData)}`,
+      );
+    }
+
+    const jsonData: JSONObject = response.jsonData as JSONObject;
+    const choices: Array<JSONObject> = jsonData["choices"] as Array<JSONObject>;
+
+    if (!choices || choices.length === 0) {
+      throw new BadDataException(`No response from ${config.llmType}`);
+    }
+
+    const message: JSONObject = choices[0]!["message"] as JSONObject;
+    const usage: JSONObject = jsonData["usage"] as JSONObject;
+
+    return {
+      content: message["content"] as string,
+      usage: usage
+        ? {
+            promptTokens: usage["prompt_tokens"] as number,
+            completionTokens: usage["completion_tokens"] as number,
+            totalTokens: usage["total_tokens"] as number,
+          }
+        : undefined,
+    };
+  }
+
+  /*
+   * Default Azure OpenAI API version. Users can override by including
+   * ?api-version=... in their configured base URL.
+   */
+  private static readonly AZURE_OPENAI_DEFAULT_API_VERSION: string =
+    "2024-10-21";
+
+  private static buildAzureOpenAIChatCompletionsUrl(baseUrl: string): string {
+    const trimmed: string = baseUrl.replace(/\/+$/, "");
+    const queryIndex: number = trimmed.indexOf("?");
+    const pathPart: string =
+      queryIndex >= 0 ? trimmed.substring(0, queryIndex) : trimmed;
+    const queryPart: string =
+      queryIndex >= 0 ? trimmed.substring(queryIndex + 1) : "";
+
+    const params: URLSearchParams = new URLSearchParams(queryPart);
+    if (!params.has("api-version")) {
+      params.set("api-version", LLMService.AZURE_OPENAI_DEFAULT_API_VERSION);
+    }
+
+    return `${pathPart}/chat/completions?${params.toString()}`;
+  }
+
+  @CaptureSpan()
+  private static async getAzureOpenAICompletion(
+    config: LLMProviderConfig,
+    request: LLMCompletionRequest,
+  ): Promise<LLMCompletionResponse> {
+    if (!config.apiKey) {
+      throw new BadDataException("Azure OpenAI API key is required");
+    }
+
+    if (!config.baseUrl) {
+      throw new BadDataException(
+        "Azure OpenAI Base URL is required (e.g. https://<resource>.openai.azure.com/openai/deployments/<deployment>)",
+      );
+    }
+
+    const modelName: string = config.modelName || "gpt-4o";
+    const requestUrl: string = LLMService.buildAzureOpenAIChatCompletionsUrl(
+      config.baseUrl,
+    );
+
+    const response: HTTPErrorResponse | HTTPResponse<JSONObject> =
+      await API.post<JSONObject>({
+        url: URL.fromString(requestUrl),
+        data: {
+          model: modelName,
+          messages: request.messages.map((msg: LLMMessage) => {
+            return {
+              role: msg.role,
+              content: msg.content,
+            };
+          }),
+          temperature: request.temperature ?? 0.7,
+        },
+        headers: {
+          "api-key": config.apiKey,
+          "Content-Type": "application/json",
+        },
+        options: {
+          retries: 2,
+          exponentialBackoff: true,
+          timeout: 120000,
         },
       });
 
-    const openAILogAttributes: LogAttributes = {
+    const logAttributes: LogAttributes = {
       llmType: config.llmType,
       modelName: modelName,
     };
 
     if (response instanceof HTTPErrorResponse) {
-      logger.error("Error from OpenAI API:", openAILogAttributes);
-      logger.error(response, openAILogAttributes);
+      logger.error("Error from Azure OpenAI API:", logAttributes);
+      logger.error(response, logAttributes);
       throw new BadDataException(
-        `OpenAI API error: ${JSON.stringify(response.jsonData)}`,
+        `Azure OpenAI API error: ${JSON.stringify(response.jsonData)}`,
       );
     }
 
@@ -109,7 +230,7 @@ export default class LLMService {
     const choices: Array<JSONObject> = jsonData["choices"] as Array<JSONObject>;
 
     if (!choices || choices.length === 0) {
-      throw new BadDataException("No response from OpenAI");
+      throw new BadDataException("No response from Azure OpenAI");
     }
 
     const message: JSONObject = choices[0]!["message"] as JSONObject;
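Since `buildAzureOpenAIChatCompletionsUrl` above is pure string manipulation, its behavior is easy to pin down with sample inputs. A self-contained sketch that inlines the same logic (the real method lives on `LLMService`; the URLs and api-version values here are illustrative):

function buildUrl(baseUrl: string, defaultApiVersion: string = "2024-10-21"): string {
  const trimmed: string = baseUrl.replace(/\/+$/, ""); // drop trailing slashes
  const queryIndex: number = trimmed.indexOf("?");
  const pathPart: string =
    queryIndex >= 0 ? trimmed.substring(0, queryIndex) : trimmed;
  const queryPart: string =
    queryIndex >= 0 ? trimmed.substring(queryIndex + 1) : "";

  const params: URLSearchParams = new URLSearchParams(queryPart);
  if (!params.has("api-version")) {
    params.set("api-version", defaultApiVersion);
  }

  return `${pathPart}/chat/completions?${params.toString()}`;
}

// No version supplied: the default is appended.
buildUrl("https://res.openai.azure.com/openai/deployments/gpt4o/");
// -> "https://res.openai.azure.com/openai/deployments/gpt4o/chat/completions?api-version=2024-10-21"

// A version already present in the base URL is preserved.
buildUrl("https://res.openai.azure.com/openai/deployments/gpt4o?api-version=2024-06-01");
// -> ".../deployments/gpt4o/chat/completions?api-version=2024-06-01"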
package/Server/Utils/Monitor/MonitorAlert.ts

@@ -37,7 +37,7 @@ export default class MonitorAlert {
     evaluationSummary?: MonitorEvaluationSummary | undefined;
     breachingSeriesFingerprints?: Set<string> | undefined;
   }): Promise<Array<Alert>> {
-    // check active alerts and if there are open alerts, do not cretae anothr alert.
+    // check active alerts and if there are open alerts, do not create another alert.
     const openAlerts: Array<Alert> = await AlertService.findBy({
       query: {
         monitor: input.monitorId!,
package/Server/Utils/Monitor/MonitorIncident.ts

@@ -50,7 +50,7 @@ export default class MonitorIncident {
      */
     breachingSeriesFingerprints?: Set<string> | undefined;
   }): Promise<Array<Incident>> {
-    // check active incidents and if there are open incidents, do not cretae anothr incident.
+    // check active incidents and if there are open incidents, do not create another incident.
     const openIncidents: Array<Incident> = await IncidentService.findBy({
       query: {
         monitors: [input.monitorId],
package/Server/Utils/StartServer.ts

@@ -7,6 +7,7 @@ import {
   getFrontendEnvVars,
 } from "../EnvironmentConfig";
 import LocalCache from "../Infrastructure/LocalCache";
+import HttpMetricsMiddleware from "../Middleware/HttpMetricsMiddleware";
 import "./Environment";
 import Express, {
   ExpressApplication,
@@ -106,6 +107,7 @@ const setDefaultHeaders: RequestHandler = (
 };
 
 app.use(cors());
+app.use(HttpMetricsMiddleware);
 app.use(setDefaultHeaders);
 
 // Set the view engine to ejs
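The middleware itself (package/Server/Middleware/HttpMetricsMiddleware.ts, +92 lines in this release) is not shown in this excerpt. As an illustrative sketch only, assuming the Telemetry* wrappers expose the standard OpenTelemetry add()/record() methods, such a middleware typically pairs the in-flight gauge with a count/duration pair recorded on response finish, keeping attributes low-cardinality per the AppMetrics rule below:

import { NextFunction, Request, Response } from "express";
import AppMetrics from "../Utils/Telemetry/AppMetrics";

// Sketch of a typical HTTP metrics middleware; not the actual file contents.
export default function HttpMetricsMiddleware(
  req: Request,
  res: Response,
  next: NextFunction,
): void {
  const startMs: number = Date.now();
  AppMetrics.getHttpRequestsInFlight().add(1);

  res.on("finish", () => {
    // Bounded attributes only: method, route pattern, status class.
    const attributes = {
      method: req.method,
      route: req.route?.path ?? "unmatched",
      status_class: `${Math.floor(res.statusCode / 100)}xx`,
    };
    AppMetrics.getHttpRequestCounter().add(1, attributes);
    AppMetrics.getHttpRequestDuration().record(Date.now() - startMs, attributes);
    AppMetrics.getHttpRequestsInFlight().add(-1);
  });

  next();
}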
package/Server/Utils/Telemetry/AppMetrics.ts

@@ -0,0 +1,211 @@
+import Telemetry, {
+  TelemetryCounter,
+  TelemetryHistogram,
+  TelemetryUpDownCounter,
+} from "../Telemetry";
+
+/**
+ * Central catalog of metric instruments emitted by OneUptime services about
+ * themselves (server-side observability of the OneUptime platform). All
+ * instruments are lazy-created on first access and cached, so importing this
+ * module from multiple call sites is safe.
+ *
+ * Naming follows OpenTelemetry semantic conventions where applicable, with a
+ * `oneuptime.` prefix for application-specific signals.
+ *
+ * Cardinality rule: never attach high-cardinality identifiers (userId,
+ * projectId, monitorId, requestId, raw URLs) to metric attributes — those
+ * belong on traces and logs. Stick to bounded enums (method, status_code,
+ * monitor type, queue name, channel, outcome).
+ */
+export default class AppMetrics {
+  // -- HTTP server -------------------------------------------------------
+
+  private static httpRequestCounter: TelemetryCounter | null = null;
+  private static httpRequestDuration: TelemetryHistogram | null = null;
+  private static httpRequestsInFlight: TelemetryUpDownCounter | null = null;
+
+  public static getHttpRequestCounter(): TelemetryCounter {
+    if (!this.httpRequestCounter) {
+      this.httpRequestCounter = Telemetry.getCounter({
+        name: "http.server.request.count",
+        description:
+          "Number of HTTP requests handled by the server, partitioned by method, route and status class.",
+        unit: "1",
+      });
+    }
+
+    return this.httpRequestCounter;
+  }
+
+  public static getHttpRequestDuration(): TelemetryHistogram {
+    if (!this.httpRequestDuration) {
+      this.httpRequestDuration = Telemetry.getHistogram({
+        name: "http.server.request.duration",
+        description: "Duration of HTTP server requests.",
+        unit: "ms",
+      });
+    }
+
+    return this.httpRequestDuration;
+  }
+
+  public static getHttpRequestsInFlight(): TelemetryUpDownCounter {
+    if (!this.httpRequestsInFlight) {
+      this.httpRequestsInFlight = Telemetry.getGauge({
+        name: "http.server.active_requests",
+        description: "Number of HTTP requests currently being processed.",
+        unit: "1",
+      });
+    }
+
+    return this.httpRequestsInFlight;
+  }
+
+  // -- Worker / background jobs -----------------------------------------
+
+  private static workerJobCounter: TelemetryCounter | null = null;
+  private static workerJobDuration: TelemetryHistogram | null = null;
+  private static workerJobsInFlight: TelemetryUpDownCounter | null = null;
+
+  public static getWorkerJobCounter(): TelemetryCounter {
+    if (!this.workerJobCounter) {
+      this.workerJobCounter = Telemetry.getCounter({
+        name: "worker.job.count",
+        description:
+          "Number of background worker jobs processed, partitioned by queue, job name and outcome.",
+        unit: "1",
+      });
+    }
+
+    return this.workerJobCounter;
+  }
+
+  public static getWorkerJobDuration(): TelemetryHistogram {
+    if (!this.workerJobDuration) {
+      this.workerJobDuration = Telemetry.getHistogram({
+        name: "worker.job.duration",
+        description: "Duration of background worker job execution.",
+        unit: "ms",
+      });
+    }
+
+    return this.workerJobDuration;
+  }
+
+  public static getWorkerJobsInFlight(): TelemetryUpDownCounter {
+    if (!this.workerJobsInFlight) {
+      this.workerJobsInFlight = Telemetry.getGauge({
+        name: "worker.job.active",
+        description: "Number of worker jobs currently executing.",
+        unit: "1",
+      });
+    }
+
+    return this.workerJobsInFlight;
+  }
+
+  // -- Probe monitor checks ---------------------------------------------
+
+  private static probeCheckCounter: TelemetryCounter | null = null;
+  private static probeCheckDuration: TelemetryHistogram | null = null;
+
+  public static getProbeCheckCounter(): TelemetryCounter {
+    if (!this.probeCheckCounter) {
+      this.probeCheckCounter = Telemetry.getCounter({
+        name: "probe.monitor.check.count",
+        description:
+          "Number of monitor checks executed by the probe, partitioned by monitor type and outcome.",
+        unit: "1",
+      });
+    }
+
+    return this.probeCheckCounter;
+  }
+
+  public static getProbeCheckDuration(): TelemetryHistogram {
+    if (!this.probeCheckDuration) {
+      this.probeCheckDuration = Telemetry.getHistogram({
+        name: "probe.monitor.check.duration",
+        description: "Duration of probe monitor checks.",
+        unit: "ms",
+      });
+    }
+
+    return this.probeCheckDuration;
+  }
+
+  // -- Notification dispatch (Mail/SMS/Call/Push) -----------------------
+
+  private static notificationCounter: TelemetryCounter | null = null;
+  private static notificationDuration: TelemetryHistogram | null = null;
+
+  public static getNotificationCounter(): TelemetryCounter {
+    if (!this.notificationCounter) {
+      this.notificationCounter = Telemetry.getCounter({
+        name: "notification.send.count",
+        description:
+          "Number of notifications dispatched, partitioned by channel and outcome.",
+        unit: "1",
+      });
+    }
+
+    return this.notificationCounter;
+  }
+
+  public static getNotificationDuration(): TelemetryHistogram {
+    if (!this.notificationDuration) {
+      this.notificationDuration = Telemetry.getHistogram({
+        name: "notification.send.duration",
+        description: "Duration of notification dispatch calls.",
+        unit: "ms",
+      });
+    }
+
+    return this.notificationDuration;
+  }
+
+  // -- OTLP / telemetry ingestion ---------------------------------------
+
+  private static ingestCounter: TelemetryCounter | null = null;
+  private static ingestDuration: TelemetryHistogram | null = null;
+  private static ingestPayloadBytes: TelemetryHistogram | null = null;
+
+  public static getIngestCounter(): TelemetryCounter {
+    if (!this.ingestCounter) {
+      this.ingestCounter = Telemetry.getCounter({
+        name: "telemetry.ingest.request.count",
+        description:
+          "Number of telemetry ingestion requests received, partitioned by signal and outcome.",
+        unit: "1",
+      });
+    }
+
+    return this.ingestCounter;
+  }
+
+  public static getIngestDuration(): TelemetryHistogram {
+    if (!this.ingestDuration) {
+      this.ingestDuration = Telemetry.getHistogram({
+        name: "telemetry.ingest.request.duration",
+        description: "Duration of telemetry ingestion request handling.",
+        unit: "ms",
+      });
+    }
+
+    return this.ingestDuration;
+  }
+
+  public static getIngestPayloadBytes(): TelemetryHistogram {
+    if (!this.ingestPayloadBytes) {
+      this.ingestPayloadBytes = Telemetry.getHistogram({
+        name: "telemetry.ingest.request.payload.size",
+        description:
+          "Size of telemetry ingestion request payloads, after decompression.",
+        unit: "By",
+      });
+    }
+
+    return this.ingestPayloadBytes;
+  }
+}
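A usage sketch for the worker instruments (again assuming the Telemetry* wrappers expose OpenTelemetry's standard add()/record() methods; the queue and outcome values are illustrative bounded enums, per the cardinality rule in the header comment):

import AppMetrics from "./AppMetrics";

// Record one background job execution with bounded attributes only.
async function runJob(queue: string, work: () => Promise<void>): Promise<void> {
  const startMs: number = Date.now();
  AppMetrics.getWorkerJobsInFlight().add(1);

  try {
    await work();
    AppMetrics.getWorkerJobCounter().add(1, { queue, outcome: "success" });
  } catch (err) {
    AppMetrics.getWorkerJobCounter().add(1, { queue, outcome: "error" });
    throw err;
  } finally {
    AppMetrics.getWorkerJobDuration().record(Date.now() - startMs, { queue });
    AppMetrics.getWorkerJobsInFlight().add(-1);
  }
}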
package/Server/Utils/Telemetry/RuntimeMetrics.ts

@@ -0,0 +1,169 @@
+import { monitorEventLoopDelay, IntervalHistogram } from "perf_hooks";
+import type { Attributes, ObservableResult } from "@opentelemetry/api";
+import Telemetry from "../Telemetry";
+import logger from "../Logger";
+
+/**
+ * Process-level runtime metrics (memory, CPU, event-loop lag).
+ *
+ * Implemented as observable gauges so they're sampled at export time rather
+ * than continuously. Registered once per process from `Telemetry.init()`.
+ *
+ * Metric names follow the OpenTelemetry semantic conventions for Node.js
+ * process and runtime metrics where they exist.
+ */
+export default class RuntimeMetrics {
+  private static initialized: boolean = false;
+
+  private static lastCpuUsage: NodeJS.CpuUsage | null = null;
+  private static lastCpuSampleTimestampNs: bigint | null = null;
+
+  private static eventLoopHistogram: IntervalHistogram | null = null;
+
+  public static init(): void {
+    if (this.initialized) {
+      return;
+    }
+
+    if (!Telemetry.isMetricsEnabled()) {
+      return;
+    }
+
+    try {
+      this.startEventLoopMonitor();
+
+      Telemetry.getObservableGauge({
+        name: "process.runtime.nodejs.memory.heap.used",
+        description: "V8 heap memory currently in use.",
+        unit: "By",
+        callback: (result: ObservableResult<Attributes>) => {
+          result.observe(process.memoryUsage().heapUsed);
+        },
+      });
+
+      Telemetry.getObservableGauge({
+        name: "process.runtime.nodejs.memory.heap.total",
+        description: "Total size of allocated V8 heap.",
+        unit: "By",
+        callback: (result: ObservableResult<Attributes>) => {
+          result.observe(process.memoryUsage().heapTotal);
+        },
+      });
+
+      Telemetry.getObservableGauge({
+        name: "process.runtime.nodejs.memory.rss",
+        description:
+          "Resident set size — total memory allocated to the Node.js process.",
+        unit: "By",
+        callback: (result: ObservableResult<Attributes>) => {
+          result.observe(process.memoryUsage().rss);
+        },
+      });
+
+      Telemetry.getObservableGauge({
+        name: "process.runtime.nodejs.memory.external",
+        description:
+          "Memory used by C++ objects bound to JavaScript objects managed by V8.",
+        unit: "By",
+        callback: (result: ObservableResult<Attributes>) => {
+          result.observe(process.memoryUsage().external);
+        },
+      });
+
+      Telemetry.getObservableGauge({
+        name: "process.runtime.nodejs.cpu.utilization",
+        description:
+          "Fraction of a single CPU core used by this Node.js process since the last sample (0-1, may exceed 1 on multi-core).",
+        unit: "1",
+        callback: (result: ObservableResult<Attributes>) => {
+          const utilization: number = this.sampleCpuUtilization();
+          if (Number.isFinite(utilization)) {
+            result.observe(utilization);
+          }
+        },
+      });
+
+      Telemetry.getObservableGauge({
+        name: "process.runtime.nodejs.eventloop.lag",
+        description:
+          "Event loop scheduling delay (mean and p99 over the sampling interval).",
+        unit: "ms",
+        callback: (result: ObservableResult<Attributes>) => {
+          if (!this.eventLoopHistogram) {
+            return;
+          }
+
+          const meanMs: number = this.eventLoopHistogram.mean / 1e6;
+          const p99Ms: number = this.eventLoopHistogram.percentile(99) / 1e6;
+          const maxMs: number = this.eventLoopHistogram.max / 1e6;
+
+          if (Number.isFinite(meanMs)) {
+            result.observe(meanMs, { quantile: "mean" });
+          }
+          if (Number.isFinite(p99Ms)) {
+            result.observe(p99Ms, { quantile: "p99" });
+          }
+          if (Number.isFinite(maxMs)) {
+            result.observe(maxMs, { quantile: "max" });
+          }
+
+          this.eventLoopHistogram.reset();
+        },
+      });
+
+      Telemetry.getObservableGauge({
+        name: "process.runtime.nodejs.uptime",
+        description: "Time elapsed since the Node.js process started.",
+        unit: "s",
+        callback: (result: ObservableResult<Attributes>) => {
+          result.observe(process.uptime());
+        },
+      });
+
+      this.initialized = true;
+    } catch (err) {
+      logger.error("Failed to initialize Node.js runtime metrics");
+      logger.error(err);
+    }
+  }
+
+  private static startEventLoopMonitor(): void {
+    if (this.eventLoopHistogram) {
+      return;
+    }
+
+    /*
+     * Sampling resolution in milliseconds: `monitorEventLoopDelay` takes a
+     * millisecond resolution option, while the histogram it returns reports
+     * delays in nanoseconds (hence the /1e6 conversions above).
+     */
+    this.eventLoopHistogram = monitorEventLoopDelay({ resolution: 20 });
+    this.eventLoopHistogram.enable();
+  }
+
+  private static sampleCpuUtilization(): number {
+    const nowNs: bigint = process.hrtime.bigint();
+    const usage: NodeJS.CpuUsage = process.cpuUsage();
+
+    if (!this.lastCpuUsage || !this.lastCpuSampleTimestampNs) {
+      this.lastCpuUsage = usage;
+      this.lastCpuSampleTimestampNs = nowNs;
+      return 0;
+    }
+
+    const elapsedNs: bigint = nowNs - this.lastCpuSampleTimestampNs;
+    const elapsedMicros: number = Number(elapsedNs / BigInt(1000));
+
+    if (elapsedMicros <= 0) {
+      return 0;
+    }
+
+    const userDelta: number = usage.user - this.lastCpuUsage.user;
+    const systemDelta: number = usage.system - this.lastCpuUsage.system;
+    const utilization: number = (userDelta + systemDelta) / elapsedMicros;
+
+    this.lastCpuUsage = usage;
+    this.lastCpuSampleTimestampNs = nowNs;
+
+    return utilization;
+  }
+}
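A quick check of sampleCpuUtilization()'s arithmetic: process.cpuUsage() reports cumulative user and system CPU time in microseconds, and the elapsed wall-clock time is reduced to microseconds as well, so the ratio is the fraction of one core used between samples (illustrative numbers):

// Two samples taken 1 s apart: elapsed wall time = 1_000_000 µs.
const elapsedMicros: number = 1_000_000;

// Deltas from process.cpuUsage(), also in microseconds:
const userDelta: number = 250_000; // 250 ms of user CPU
const systemDelta: number = 50_000; // 50 ms of system CPU

// (250_000 + 50_000) / 1_000_000 = 0.3, i.e. 30% of one core. Two busy
// threads could push this past 1.0, matching the gauge description's
// "may exceed 1 on multi-core" caveat.
const utilization: number = (userDelta + systemDelta) / elapsedMicros;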