@oneuptime/common 10.0.84 → 10.0.85

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97) hide show
  1. package/Models/DatabaseModels/Index.ts +2 -0
  2. package/Models/DatabaseModels/KubernetesContainer.ts +552 -0
  3. package/Models/DatabaseModels/KubernetesResource.ts +130 -0
  4. package/Models/DatabaseModels/LlmLog.ts +2 -1
  5. package/Models/DatabaseModels/LlmProvider.ts +5 -4
  6. package/Models/DatabaseModels/Project.ts +40 -0
  7. package/Server/API/KubernetesResourceAPI.ts +144 -12
  8. package/Server/Infrastructure/Postgres/SchemaMigrations/1777550162848-MigrationName.ts +29 -0
  9. package/Server/Infrastructure/Postgres/SchemaMigrations/1777571961028-MigrationName.ts +99 -0
  10. package/Server/Infrastructure/Postgres/SchemaMigrations/Index.ts +4 -0
  11. package/Server/Infrastructure/Queue.ts +60 -0
  12. package/Server/Infrastructure/QueueWorker.ts +39 -1
  13. package/Server/Middleware/HttpMetricsMiddleware.ts +92 -0
  14. package/Server/Services/AuditLogService.ts +19 -1
  15. package/Server/Services/KubernetesContainerService.ts +264 -0
  16. package/Server/Services/KubernetesResourceService.ts +233 -0
  17. package/Server/Services/StatusPageSubscriberService.ts +4 -4
  18. package/Server/Types/Database/Permissions/AccessControlPermission.ts +3 -3
  19. package/Server/Utils/LLM/LLMService.ts +132 -11
  20. package/Server/Utils/Monitor/MonitorAlert.ts +1 -1
  21. package/Server/Utils/Monitor/MonitorIncident.ts +1 -1
  22. package/Server/Utils/StartServer.ts +2 -0
  23. package/Server/Utils/Telemetry/AppMetrics.ts +211 -0
  24. package/Server/Utils/Telemetry/RuntimeMetrics.ts +169 -0
  25. package/Server/Utils/Telemetry.ts +98 -0
  26. package/Server/Utils/Workspace/Slack/Actions/Alert.ts +2 -2
  27. package/Server/Utils/Workspace/Slack/Actions/Incident.ts +2 -2
  28. package/Server/Utils/Workspace/Slack/Actions/ScheduledMaintenance.ts +2 -2
  29. package/Tests/jest.setup.ts +18 -0
  30. package/Types/Kubernetes/KubernetesInventoryExtractor.ts +171 -5
  31. package/Types/LLM/LlmType.ts +3 -0
  32. package/UI/Components/Forms/ModelForm.tsx +3 -3
  33. package/UI/Components/LogsViewer/components/LogsAnalyticsView.tsx +2 -2
  34. package/Utils/UUID.ts +1 -3
  35. package/build/dist/Models/DatabaseModels/Index.js +2 -0
  36. package/build/dist/Models/DatabaseModels/Index.js.map +1 -1
  37. package/build/dist/Models/DatabaseModels/KubernetesContainer.js +581 -0
  38. package/build/dist/Models/DatabaseModels/KubernetesContainer.js.map +1 -0
  39. package/build/dist/Models/DatabaseModels/KubernetesResource.js +135 -0
  40. package/build/dist/Models/DatabaseModels/KubernetesResource.js.map +1 -1
  41. package/build/dist/Models/DatabaseModels/LlmLog.js +1 -1
  42. package/build/dist/Models/DatabaseModels/LlmLog.js.map +1 -1
  43. package/build/dist/Models/DatabaseModels/LlmProvider.js +4 -4
  44. package/build/dist/Models/DatabaseModels/LlmProvider.js.map +1 -1
  45. package/build/dist/Models/DatabaseModels/Project.js +41 -0
  46. package/build/dist/Models/DatabaseModels/Project.js.map +1 -1
  47. package/build/dist/Server/API/KubernetesResourceAPI.js +106 -9
  48. package/build/dist/Server/API/KubernetesResourceAPI.js.map +1 -1
  49. package/build/dist/Server/Infrastructure/Postgres/SchemaMigrations/1777550162848-MigrationName.js +16 -0
  50. package/build/dist/Server/Infrastructure/Postgres/SchemaMigrations/1777550162848-MigrationName.js.map +1 -0
  51. package/build/dist/Server/Infrastructure/Postgres/SchemaMigrations/1777571961028-MigrationName.js +40 -0
  52. package/build/dist/Server/Infrastructure/Postgres/SchemaMigrations/1777571961028-MigrationName.js.map +1 -0
  53. package/build/dist/Server/Infrastructure/Postgres/SchemaMigrations/Index.js +4 -0
  54. package/build/dist/Server/Infrastructure/Postgres/SchemaMigrations/Index.js.map +1 -1
  55. package/build/dist/Server/Infrastructure/Queue.js +44 -0
  56. package/build/dist/Server/Infrastructure/Queue.js.map +1 -1
  57. package/build/dist/Server/Infrastructure/QueueWorker.js +31 -1
  58. package/build/dist/Server/Infrastructure/QueueWorker.js.map +1 -1
  59. package/build/dist/Server/Middleware/HttpMetricsMiddleware.js +61 -0
  60. package/build/dist/Server/Middleware/HttpMetricsMiddleware.js.map +1 -0
  61. package/build/dist/Server/Services/AuditLogService.js +14 -1
  62. package/build/dist/Server/Services/AuditLogService.js.map +1 -1
  63. package/build/dist/Server/Services/KubernetesContainerService.js +179 -0
  64. package/build/dist/Server/Services/KubernetesContainerService.js.map +1 -0
  65. package/build/dist/Server/Services/KubernetesResourceService.js +175 -0
  66. package/build/dist/Server/Services/KubernetesResourceService.js.map +1 -1
  67. package/build/dist/Server/Services/StatusPageSubscriberService.js +4 -4
  68. package/build/dist/Server/Services/StatusPageSubscriberService.js.map +1 -1
  69. package/build/dist/Server/Types/Database/Permissions/AccessControlPermission.js +3 -3
  70. package/build/dist/Server/Utils/LLM/LLMService.js +111 -13
  71. package/build/dist/Server/Utils/LLM/LLMService.js.map +1 -1
  72. package/build/dist/Server/Utils/Monitor/MonitorAlert.js +1 -1
  73. package/build/dist/Server/Utils/Monitor/MonitorAlert.js.map +1 -1
  74. package/build/dist/Server/Utils/Monitor/MonitorIncident.js +1 -1
  75. package/build/dist/Server/Utils/Monitor/MonitorIncident.js.map +1 -1
  76. package/build/dist/Server/Utils/StartServer.js +2 -0
  77. package/build/dist/Server/Utils/StartServer.js.map +1 -1
  78. package/build/dist/Server/Utils/Telemetry/AppMetrics.js +167 -0
  79. package/build/dist/Server/Utils/Telemetry/AppMetrics.js.map +1 -0
  80. package/build/dist/Server/Utils/Telemetry/RuntimeMetrics.js +141 -0
  81. package/build/dist/Server/Utils/Telemetry/RuntimeMetrics.js.map +1 -0
  82. package/build/dist/Server/Utils/Telemetry.js +47 -0
  83. package/build/dist/Server/Utils/Telemetry.js.map +1 -1
  84. package/build/dist/Server/Utils/Workspace/Slack/Actions/Alert.js +2 -2
  85. package/build/dist/Server/Utils/Workspace/Slack/Actions/Incident.js +2 -2
  86. package/build/dist/Server/Utils/Workspace/Slack/Actions/ScheduledMaintenance.js +2 -2
  87. package/build/dist/Tests/jest.setup.js +17 -0
  88. package/build/dist/Tests/jest.setup.js.map +1 -1
  89. package/build/dist/Types/Kubernetes/KubernetesInventoryExtractor.js +116 -4
  90. package/build/dist/Types/Kubernetes/KubernetesInventoryExtractor.js.map +1 -1
  91. package/build/dist/Types/LLM/LlmType.js +3 -0
  92. package/build/dist/Types/LLM/LlmType.js.map +1 -1
  93. package/build/dist/UI/Components/Forms/ModelForm.js +3 -3
  94. package/build/dist/UI/Components/LogsViewer/components/LogsAnalyticsView.js.map +1 -1
  95. package/build/dist/Utils/UUID.js +1 -2
  96. package/build/dist/Utils/UUID.js.map +1 -1
  97. package/package.json +6 -8
@@ -42,7 +42,7 @@ import LlmType from "../../Types/LLM/LlmType";
42
42
  pluralName: "LLM Providers",
43
43
  icon: IconProp.Bolt,
44
44
  tableDescription:
45
- "Manage LLM Provider configurations. Connect to OpenAI, Anthropic, Ollama, or other LLM providers to enable AI features.",
45
+ "Manage LLM Provider configurations. Connect to OpenAI, Azure OpenAI, Anthropic, Groq, Mistral, Ollama, or other LLM providers to enable AI features.",
46
46
  })
47
47
  @TableAccessControl({
48
48
  create: [
@@ -179,7 +179,8 @@ export default class LlmProvider extends BaseModel {
179
179
  required: true,
180
180
  type: TableColumnType.ShortText,
181
181
  title: "LLM Type",
182
- description: "The type of LLM provider (OpenAI, Anthropic, Ollama, etc.)",
182
+ description:
183
+ "The type of LLM provider (OpenAI, Azure OpenAI, Anthropic, Groq, Mistral, Ollama, etc.)",
183
184
  })
184
185
  @Column({
185
186
  nullable: false,
@@ -214,7 +215,7 @@ export default class LlmProvider extends BaseModel {
214
215
  type: TableColumnType.LongText,
215
216
  title: "API Key",
216
217
  description:
217
- "The API key for the LLM provider. Required for OpenAI and Anthropic.",
218
+ "The API key for the LLM provider. Required for OpenAI, Azure OpenAI, Anthropic, Groq, and Mistral.",
218
219
  encrypted: true,
219
220
  })
220
221
  @Column({
@@ -276,7 +277,7 @@ export default class LlmProvider extends BaseModel {
276
277
  type: TableColumnType.ShortURL,
277
278
  title: "Base URL",
278
279
  description:
279
- "The base URL for the LLM API. Required for Ollama, optional for others.",
280
+ "The base URL for the LLM API. Required for Azure OpenAI and Ollama, optional for others.",
280
281
  })
281
282
  @Column({
282
283
  nullable: true,
@@ -2180,4 +2180,44 @@ export default class Project extends TenantModel {
2180
2180
  create: PlanType.Free,
2181
2181
  })
2182
2182
  public auditLogsRetentionInDays?: number = undefined;
2183
+
2184
+ @ColumnAccessControl({
2185
+ create: [Permission.User],
2186
+ read: [
2187
+ Permission.ProjectOwner,
2188
+ Permission.ProjectAdmin,
2189
+ Permission.ProjectMember,
2190
+ Permission.Viewer,
2191
+ Permission.ReadProject,
2192
+ Permission.UnAuthorizedSsoUser,
2193
+ Permission.ProjectUser,
2194
+ Permission.ReadAllProjectResources,
2195
+ ],
2196
+ update: [
2197
+ Permission.ProjectOwner,
2198
+ Permission.ProjectAdmin,
2199
+ Permission.EditProject,
2200
+ ],
2201
+ })
2202
+ @TableColumn({
2203
+ required: true,
2204
+ type: TableColumnType.Boolean,
2205
+ isDefaultValueColumn: true,
2206
+ defaultValue: false,
2207
+ title: "Store System Events",
2208
+ description:
2209
+ "When enabled, audit logs will also include events triggered by the system. By default, only events triggered by users are recorded.",
2210
+ })
2211
+ @Column({
2212
+ type: ColumnType.Boolean,
2213
+ nullable: false,
2214
+ unique: false,
2215
+ default: false,
2216
+ })
2217
+ @ColumnBillingAccessControl({
2218
+ read: PlanType.Free,
2219
+ update: PlanType.Enterprise,
2220
+ create: PlanType.Free,
2221
+ })
2222
+ public storeSystemEventsInAuditLogs?: boolean = undefined;
2183
2223
  }
@@ -56,14 +56,57 @@ export default class KubernetesResourceAPI extends BaseAPI<
56
56
  }
57
57
  },
58
58
  );
59
+
60
+ /*
61
+ * Latest CPU+memory aggregated by Pod namespace. Powers the
62
+ * Namespaces list view without a ClickHouse round-trip.
63
+ */
64
+ this.router.post(
65
+ `${new this.entityType()
66
+ .getCrudApiPath()
67
+ ?.toString()}/latest-pod-metrics-by-namespace/:clusterId`,
68
+ UserMiddleware.getUserMiddleware,
69
+ async (req: ExpressRequest, res: ExpressResponse, next: NextFunction) => {
70
+ try {
71
+ await this.getLatestPodMetricsByNamespace(req, res);
72
+ } catch (err) {
73
+ next(err);
74
+ }
75
+ },
76
+ );
77
+
78
+ /*
79
+ * Latest CPU+memory aggregated by Pod ownerReferences[].name for
80
+ * a given owner kind. Powers Deployments/StatefulSets/DaemonSets/
81
+ * Jobs/CronJobs list views.
82
+ */
83
+ this.router.post(
84
+ `${new this.entityType()
85
+ .getCrudApiPath()
86
+ ?.toString()}/latest-pod-metrics-by-owner/:clusterId/:ownerKind`,
87
+ UserMiddleware.getUserMiddleware,
88
+ async (req: ExpressRequest, res: ExpressResponse, next: NextFunction) => {
89
+ try {
90
+ await this.getLatestPodMetricsByOwner(req, res);
91
+ } catch (err) {
92
+ next(err);
93
+ }
94
+ },
95
+ );
59
96
  }
60
97
 
61
- private async getInventorySummary(
62
- req: ExpressRequest,
63
- res: ExpressResponse,
64
- ): Promise<void> {
98
+ /*
99
+ * Cluster + auth resolution shared by the cluster-scoped sub-routes.
100
+ * Returns the (projectId, kubernetesClusterId) tuple after enforcing
101
+ * the standard ACL chain. Throws NotFound when the cluster is
102
+ * missing or the caller lacks read access (indistinguishable on
103
+ * purpose).
104
+ */
105
+ private async resolveClusterForRequest(req: ExpressRequest): Promise<{
106
+ projectId: ObjectID;
107
+ kubernetesClusterId: ObjectID;
108
+ }> {
65
109
  const clusterIdParam: string | undefined = req.params["clusterId"];
66
-
67
110
  if (!clusterIdParam) {
68
111
  throw new BadDataException("Cluster ID is required");
69
112
  }
@@ -78,12 +121,6 @@ export default class KubernetesResourceAPI extends BaseAPI<
78
121
  const props: DatabaseCommonInteractionProps =
79
122
  await CommonAPI.getDatabaseCommonInteractionProps(req);
80
123
 
81
- /*
82
- * Authorize: the caller must be able to read the parent cluster.
83
- * findOneById applies the full ACL chain; a null return means 404
84
- * (either the cluster doesn't exist or the caller cannot see it —
85
- * indistinguishable on purpose).
86
- */
87
124
  const cluster: KubernetesCluster | null =
88
125
  await KubernetesClusterService.findOneById({
89
126
  id: kubernetesClusterId,
@@ -98,9 +135,104 @@ export default class KubernetesResourceAPI extends BaseAPI<
98
135
  throw new NotFoundException("Kubernetes Cluster not found");
99
136
  }
100
137
 
101
- const summary: InventorySummary = await this.service.getInventorySummary({
138
+ return {
102
139
  projectId: cluster.projectId,
103
140
  kubernetesClusterId,
141
+ };
142
+ }
143
+
144
+ /*
145
+ * Translate a service-layer Map of aggregates into a JSON dict
146
+ * { name: { cpuPercent, memoryBytes } } suitable for the wire.
147
+ * memoryBytes is stringified so values past 2 GiB don't overflow
148
+ * client-side number parsing in the JSON path; the UI parses it
149
+ * back to a number for rendering.
150
+ */
151
+ private mapAggregatesToJson(
152
+ aggregates: Map<string, { cpuPercent: number; memoryBytes: number }>,
153
+ ): JSONObject {
154
+ const out: JSONObject = {};
155
+ for (const [name, value] of aggregates.entries()) {
156
+ out[name] = {
157
+ cpuPercent: value.cpuPercent,
158
+ memoryBytes: value.memoryBytes.toString(),
159
+ };
160
+ }
161
+ return out;
162
+ }
163
+
164
+ private async getLatestPodMetricsByNamespace(
165
+ req: ExpressRequest,
166
+ res: ExpressResponse,
167
+ ): Promise<void> {
168
+ const { projectId, kubernetesClusterId } =
169
+ await this.resolveClusterForRequest(req);
170
+
171
+ const staleAfter: Date = new Date(Date.now() - 15 * 60 * 1000);
172
+ const aggregates: Map<string, { cpuPercent: number; memoryBytes: number }> =
173
+ await this.service.getLatestMetricsByNamespace({
174
+ projectId,
175
+ kubernetesClusterId,
176
+ staleAfter,
177
+ });
178
+
179
+ return Response.sendJsonObjectResponse(req, res, {
180
+ aggregates: this.mapAggregatesToJson(aggregates),
181
+ });
182
+ }
183
+
184
+ private async getLatestPodMetricsByOwner(
185
+ req: ExpressRequest,
186
+ res: ExpressResponse,
187
+ ): Promise<void> {
188
+ const ownerKind: string | undefined = req.params["ownerKind"];
189
+ if (!ownerKind) {
190
+ throw new BadDataException("Owner kind is required");
191
+ }
192
+ /*
193
+ * Only a small allow-list of owner kinds makes sense here; reject
194
+ * anything else so the endpoint can't be used to probe arbitrary
195
+ * jsonb_array_elements paths.
196
+ */
197
+ const allowed: Set<string> = new Set([
198
+ "Deployment",
199
+ "StatefulSet",
200
+ "DaemonSet",
201
+ "Job",
202
+ "CronJob",
203
+ "ReplicaSet",
204
+ ]);
205
+ if (!allowed.has(ownerKind)) {
206
+ throw new BadDataException(`Unsupported owner kind: ${ownerKind}`);
207
+ }
208
+
209
+ const { projectId, kubernetesClusterId } =
210
+ await this.resolveClusterForRequest(req);
211
+
212
+ const staleAfter: Date = new Date(Date.now() - 15 * 60 * 1000);
213
+ const aggregates: Map<string, { cpuPercent: number; memoryBytes: number }> =
214
+ await this.service.getLatestMetricsByOwner({
215
+ projectId,
216
+ kubernetesClusterId,
217
+ ownerKind,
218
+ staleAfter,
219
+ });
220
+
221
+ return Response.sendJsonObjectResponse(req, res, {
222
+ aggregates: this.mapAggregatesToJson(aggregates),
223
+ });
224
+ }
225
+
226
+ private async getInventorySummary(
227
+ req: ExpressRequest,
228
+ res: ExpressResponse,
229
+ ): Promise<void> {
230
+ const { projectId, kubernetesClusterId } =
231
+ await this.resolveClusterForRequest(req);
232
+
233
+ const summary: InventorySummary = await this.service.getInventorySummary({
234
+ projectId,
235
+ kubernetesClusterId,
104
236
  });
105
237
 
106
238
  const responseBody: JSONObject = {
@@ -0,0 +1,29 @@
1
+ import { MigrationInterface, QueryRunner } from "typeorm";
2
+
3
+ export class MigrationName1777550162848 implements MigrationInterface {
4
+ public name: string = "MigrationName1777550162848";
5
+
6
+ public async up(queryRunner: QueryRunner): Promise<void> {
7
+ await queryRunner.query(
8
+ `ALTER TABLE "Project" ADD "storeSystemEventsInAuditLogs" boolean NOT NULL DEFAULT false`,
9
+ );
10
+ await queryRunner.query(
11
+ `ALTER TABLE "OnCallDutyPolicyScheduleLayer" ALTER COLUMN "rotation" SET DEFAULT '{"_type":"Recurring","value":{"intervalType":"Day","intervalCount":{"_type":"PositiveNumber","value":1}}}'`,
12
+ );
13
+ await queryRunner.query(
14
+ `ALTER TABLE "OnCallDutyPolicyScheduleLayer" ALTER COLUMN "restrictionTimes" SET DEFAULT '{"_type":"RestrictionTimes","value":{"restictionType":"None","dayRestrictionTimes":null,"weeklyRestrictionTimes":[]}}'`,
15
+ );
16
+ }
17
+
18
+ public async down(queryRunner: QueryRunner): Promise<void> {
19
+ await queryRunner.query(
20
+ `ALTER TABLE "OnCallDutyPolicyScheduleLayer" ALTER COLUMN "restrictionTimes" SET DEFAULT '{"_type": "RestrictionTimes", "value": {"restictionType": "None", "dayRestrictionTimes": null, "weeklyRestrictionTimes": []}}'`,
21
+ );
22
+ await queryRunner.query(
23
+ `ALTER TABLE "OnCallDutyPolicyScheduleLayer" ALTER COLUMN "rotation" SET DEFAULT '{"_type": "Recurring", "value": {"intervalType": "Day", "intervalCount": {"_type": "PositiveNumber", "value": 1}}}'`,
24
+ );
25
+ await queryRunner.query(
26
+ `ALTER TABLE "Project" DROP COLUMN "storeSystemEventsInAuditLogs"`,
27
+ );
28
+ }
29
+ }
@@ -0,0 +1,99 @@
1
+ import { MigrationInterface, QueryRunner } from "typeorm";
2
+
3
+ export class MigrationName1777571961028 implements MigrationInterface {
4
+ public name: string = "MigrationName1777571961028";
5
+
6
+ public async up(queryRunner: QueryRunner): Promise<void> {
7
+ await queryRunner.query(
8
+ `CREATE TABLE "KubernetesContainer" ("_id" uuid NOT NULL DEFAULT uuid_generate_v4(), "createdAt" TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT now(), "updatedAt" TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT now(), "deletedAt" TIMESTAMP WITH TIME ZONE, "version" integer NOT NULL, "projectId" uuid NOT NULL, "kubernetesClusterId" uuid NOT NULL, "podNamespaceKey" character varying(100) NOT NULL DEFAULT '', "podName" character varying(100) NOT NULL, "name" character varying(100) NOT NULL, "image" character varying(500), "state" character varying(100), "reason" character varying(100), "isReady" boolean, "restartCount" integer, "memoryLimitBytes" bigint, "latestCpuPercent" numeric, "latestMemoryBytes" bigint, "metricsUpdatedAt" TIMESTAMP WITH TIME ZONE, "lastSeenAt" TIMESTAMP WITH TIME ZONE NOT NULL, "createdByUserId" uuid, "deletedByUserId" uuid, CONSTRAINT "PK_7e19b5140bc3005a6ea2f8f7aee" PRIMARY KEY ("_id"))`,
9
+ );
10
+ await queryRunner.query(
11
+ `CREATE INDEX "IDX_fcc7f4bc83564a8c7885233f6e" ON "KubernetesContainer" ("projectId") `,
12
+ );
13
+ await queryRunner.query(
14
+ `CREATE INDEX "IDX_5303bcae1a72f9830bd7d15e2c" ON "KubernetesContainer" ("kubernetesClusterId") `,
15
+ );
16
+ await queryRunner.query(
17
+ `CREATE UNIQUE INDEX "IDX_1dcb8fed322a9bddfabb60cbc7" ON "KubernetesContainer" ("projectId", "kubernetesClusterId", "podNamespaceKey", "podName", "name") `,
18
+ );
19
+ await queryRunner.query(
20
+ `ALTER TABLE "KubernetesResource" ADD "controllerDeploymentName" character varying(100)`,
21
+ );
22
+ await queryRunner.query(
23
+ `ALTER TABLE "KubernetesResource" ADD "controllerCronJobName" character varying(100)`,
24
+ );
25
+ await queryRunner.query(
26
+ `ALTER TABLE "KubernetesResource" ADD "latestCpuPercent" numeric`,
27
+ );
28
+ await queryRunner.query(
29
+ `ALTER TABLE "KubernetesResource" ADD "latestMemoryBytes" bigint`,
30
+ );
31
+ await queryRunner.query(
32
+ `ALTER TABLE "KubernetesResource" ADD "metricsUpdatedAt" TIMESTAMP WITH TIME ZONE`,
33
+ );
34
+ await queryRunner.query(
35
+ `ALTER TABLE "OnCallDutyPolicyScheduleLayer" ALTER COLUMN "rotation" SET DEFAULT '{"_type":"Recurring","value":{"intervalType":"Day","intervalCount":{"_type":"PositiveNumber","value":1}}}'`,
36
+ );
37
+ await queryRunner.query(
38
+ `ALTER TABLE "OnCallDutyPolicyScheduleLayer" ALTER COLUMN "restrictionTimes" SET DEFAULT '{"_type":"RestrictionTimes","value":{"restictionType":"None","dayRestrictionTimes":null,"weeklyRestrictionTimes":[]}}'`,
39
+ );
40
+ await queryRunner.query(
41
+ `ALTER TABLE "KubernetesContainer" ADD CONSTRAINT "FK_fcc7f4bc83564a8c7885233f6e3" FOREIGN KEY ("projectId") REFERENCES "Project"("_id") ON DELETE CASCADE ON UPDATE NO ACTION`,
42
+ );
43
+ await queryRunner.query(
44
+ `ALTER TABLE "KubernetesContainer" ADD CONSTRAINT "FK_5303bcae1a72f9830bd7d15e2cd" FOREIGN KEY ("kubernetesClusterId") REFERENCES "KubernetesCluster"("_id") ON DELETE CASCADE ON UPDATE NO ACTION`,
45
+ );
46
+ await queryRunner.query(
47
+ `ALTER TABLE "KubernetesContainer" ADD CONSTRAINT "FK_d0f740eb8fc87c2426d78babf6b" FOREIGN KEY ("createdByUserId") REFERENCES "User"("_id") ON DELETE SET NULL ON UPDATE NO ACTION`,
48
+ );
49
+ await queryRunner.query(
50
+ `ALTER TABLE "KubernetesContainer" ADD CONSTRAINT "FK_eadbc98e53bc5788d8313e52c67" FOREIGN KEY ("deletedByUserId") REFERENCES "User"("_id") ON DELETE SET NULL ON UPDATE NO ACTION`,
51
+ );
52
+ }
53
+
54
+ public async down(queryRunner: QueryRunner): Promise<void> {
55
+ await queryRunner.query(
56
+ `ALTER TABLE "KubernetesContainer" DROP CONSTRAINT "FK_eadbc98e53bc5788d8313e52c67"`,
57
+ );
58
+ await queryRunner.query(
59
+ `ALTER TABLE "KubernetesContainer" DROP CONSTRAINT "FK_d0f740eb8fc87c2426d78babf6b"`,
60
+ );
61
+ await queryRunner.query(
62
+ `ALTER TABLE "KubernetesContainer" DROP CONSTRAINT "FK_5303bcae1a72f9830bd7d15e2cd"`,
63
+ );
64
+ await queryRunner.query(
65
+ `ALTER TABLE "KubernetesContainer" DROP CONSTRAINT "FK_fcc7f4bc83564a8c7885233f6e3"`,
66
+ );
67
+ await queryRunner.query(
68
+ `ALTER TABLE "OnCallDutyPolicyScheduleLayer" ALTER COLUMN "restrictionTimes" SET DEFAULT '{"_type": "RestrictionTimes", "value": {"restictionType": "None", "dayRestrictionTimes": null, "weeklyRestrictionTimes": []}}'`,
69
+ );
70
+ await queryRunner.query(
71
+ `ALTER TABLE "OnCallDutyPolicyScheduleLayer" ALTER COLUMN "rotation" SET DEFAULT '{"_type": "Recurring", "value": {"intervalType": "Day", "intervalCount": {"_type": "PositiveNumber", "value": 1}}}'`,
72
+ );
73
+ await queryRunner.query(
74
+ `ALTER TABLE "KubernetesResource" DROP COLUMN "metricsUpdatedAt"`,
75
+ );
76
+ await queryRunner.query(
77
+ `ALTER TABLE "KubernetesResource" DROP COLUMN "latestMemoryBytes"`,
78
+ );
79
+ await queryRunner.query(
80
+ `ALTER TABLE "KubernetesResource" DROP COLUMN "latestCpuPercent"`,
81
+ );
82
+ await queryRunner.query(
83
+ `ALTER TABLE "KubernetesResource" DROP COLUMN "controllerCronJobName"`,
84
+ );
85
+ await queryRunner.query(
86
+ `ALTER TABLE "KubernetesResource" DROP COLUMN "controllerDeploymentName"`,
87
+ );
88
+ await queryRunner.query(
89
+ `DROP INDEX "public"."IDX_1dcb8fed322a9bddfabb60cbc7"`,
90
+ );
91
+ await queryRunner.query(
92
+ `DROP INDEX "public"."IDX_5303bcae1a72f9830bd7d15e2c"`,
93
+ );
94
+ await queryRunner.query(
95
+ `DROP INDEX "public"."IDX_fcc7f4bc83564a8c7885233f6e"`,
96
+ );
97
+ await queryRunner.query(`DROP TABLE "KubernetesContainer"`);
98
+ }
99
+ }
@@ -293,6 +293,8 @@ import { MigrationName1776940714709 } from "./1776940714709-MigrationName";
293
293
  import { AddStatusPageLanguageSettings1776971364783 } from "./1776971364783-AddStatusPageLanguageSettings";
294
294
  import { AddTelemetryRetentionSettings1777018175127 } from "./1777018175127-AddTelemetryRetentionSettings";
295
295
  import { AddMonitorTemplate1777201966799 } from "./1777201966799-AddMonitorTemplate";
296
+ import { MigrationName1777550162848 } from "./1777550162848-MigrationName";
297
+ import { MigrationName1777571961028 } from "./1777571961028-MigrationName";
296
298
  export default [
297
299
  InitialMigration,
298
300
  MigrationName1717678334852,
@@ -589,4 +591,6 @@ export default [
589
591
  AddStatusPageLanguageSettings1776971364783,
590
592
  AddTelemetryRetentionSettings1777018175127,
591
593
  AddMonitorTemplate1777201966799,
594
+ MigrationName1777550162848,
595
+ MigrationName1777571961028,
592
596
  ];
@@ -8,6 +8,8 @@ import { BullMQAdapter } from "@bull-board/api/bullMQAdapter";
8
8
  import { ExpressRouter } from "../Utils/Express";
9
9
  import CaptureSpan from "../Utils/Telemetry/CaptureSpan";
10
10
  import logger from "../Utils/Logger";
11
+ import Telemetry from "../Utils/Telemetry";
12
+ import type { Attributes, ObservableResult } from "@opentelemetry/api";
11
13
  import Redis from "./Redis";
12
14
 
13
15
  export enum QueueName {
@@ -23,6 +25,7 @@ export default class Queue {
23
25
  private static queueDict: Dictionary<BullQueue> = {};
24
26
  // track queues we have already run initial cleanup on
25
27
  private static cleanedQueueNames: Set<string> = new Set<string>();
28
+ private static queueSizeMetricRegistered: boolean = false;
26
29
  // store repeatable jobs to re-add on reconnect
27
30
  private static repeatableJobs: Dictionary<
28
31
  Dictionary<{
@@ -99,6 +102,9 @@ export default class Queue {
99
102
  // save it to the dictionary
100
103
  this.queueDict[queueName] = queue;
101
104
 
105
+ // Register the observable gauge once any queue exists in this process.
106
+ this.registerQueueSizeMetric();
107
+
102
108
  // Add event listener to re-add repeatable jobs on reconnect
103
109
  this.setupReconnectListener(queue, queueName).catch((err: unknown) => {
104
110
  logger.error("Error setting up reconnect listener for queue");
@@ -243,6 +249,60 @@ export default class Queue {
243
249
  return jobAdded;
244
250
  }
245
251
 
252
+ private static registerQueueSizeMetric(): void {
253
+ if (this.queueSizeMetricRegistered) {
254
+ return;
255
+ }
256
+
257
+ if (!Telemetry.isMetricsEnabled()) {
258
+ return;
259
+ }
260
+
261
+ try {
262
+ Telemetry.getObservableGauge({
263
+ name: "queue.size",
264
+ description:
265
+ "Number of BullMQ jobs in each queue, partitioned by job state.",
266
+ unit: "1",
267
+ callback: async (
268
+ result: ObservableResult<Attributes>,
269
+ ): Promise<void> => {
270
+ for (const queueName of Object.keys(this.queueDict)) {
271
+ try {
272
+ const stats: {
273
+ waiting: number;
274
+ active: number;
275
+ completed: number;
276
+ failed: number;
277
+ delayed: number;
278
+ total: number;
279
+ } = await this.getQueueStats(queueName as QueueName);
280
+
281
+ const baseAttrs: Attributes = {
282
+ "messaging.system": "bullmq",
283
+ "messaging.destination.name": queueName,
284
+ };
285
+
286
+ result.observe(stats.waiting, { ...baseAttrs, state: "waiting" });
287
+ result.observe(stats.active, { ...baseAttrs, state: "active" });
288
+ result.observe(stats.delayed, { ...baseAttrs, state: "delayed" });
289
+ result.observe(stats.failed, { ...baseAttrs, state: "failed" });
290
+ } catch (err) {
291
+ // Don't let one queue's stat failure break others.
292
+ logger.debug("Failed to read queue stats");
293
+ logger.debug(err);
294
+ }
295
+ }
296
+ },
297
+ });
298
+
299
+ this.queueSizeMetricRegistered = true;
300
+ } catch (err) {
301
+ logger.error("Failed to register queue.size metric");
302
+ logger.error(err);
303
+ }
304
+ }
305
+
246
306
  @CaptureSpan()
247
307
  public static async getQueueSize(queueName: QueueName): Promise<number> {
248
308
  const queue: BullQueue = this.getQueue(queueName);
@@ -7,6 +7,7 @@ import {
7
7
  } from "../../Types/FunctionTypes";
8
8
  import { Worker } from "bullmq";
9
9
  import CaptureSpan from "../Utils/Telemetry/CaptureSpan";
10
+ import AppMetrics from "../Utils/Telemetry/AppMetrics";
10
11
  import Redis from "./Redis";
11
12
 
12
13
  export default class QueueWorker {
@@ -29,7 +30,44 @@ export default class QueueWorker {
29
30
  maxStalledCount?: number;
30
31
  },
31
32
  ): Worker {
32
- const worker: Worker = new Worker(queueName, onJobInQueue, {
33
+ const instrumentedJobHandler: (job: QueueJob) => Promise<void> = async (
34
+ job: QueueJob,
35
+ ): Promise<void> => {
36
+ const startNs: bigint = process.hrtime.bigint();
37
+ const baseAttributes: Record<string, string> = {
38
+ "messaging.system": "bullmq",
39
+ "messaging.destination.name": queueName,
40
+ "messaging.operation.name": job.name || "unknown",
41
+ };
42
+
43
+ AppMetrics.getWorkerJobsInFlight().add(1, baseAttributes);
44
+
45
+ let outcome: "success" | "failure" | "timeout" = "success";
46
+
47
+ try {
48
+ await onJobInQueue(job);
49
+ } catch (err) {
50
+ outcome =
51
+ err instanceof TimeoutException ||
52
+ (err as { name?: string })?.name === "TimeoutException"
53
+ ? "timeout"
54
+ : "failure";
55
+ throw err;
56
+ } finally {
57
+ const elapsedNs: bigint = process.hrtime.bigint() - startNs;
58
+ const durationMs: number = Number(elapsedNs) / 1e6;
59
+ const attributes: Record<string, string> = {
60
+ ...baseAttributes,
61
+ outcome,
62
+ };
63
+
64
+ AppMetrics.getWorkerJobCounter().add(1, attributes);
65
+ AppMetrics.getWorkerJobDuration().record(durationMs, attributes);
66
+ AppMetrics.getWorkerJobsInFlight().add(-1, baseAttributes);
67
+ }
68
+ };
69
+
70
+ const worker: Worker = new Worker(queueName, instrumentedJobHandler, {
33
71
  connection: Redis.getRedisOptions(),
34
72
  concurrency: options.concurrency,
35
73
  // Only set these values if provided so we do not override BullMQ defaults
@@ -0,0 +1,92 @@
1
+ import {
2
+ ExpressRequest,
3
+ ExpressResponse,
4
+ NextFunction,
5
+ } from "../Utils/Express";
6
+ import AppMetrics from "../Utils/Telemetry/AppMetrics";
7
+
8
+ /**
9
+ * Express middleware that records HTTP server metrics (request count,
10
+ * duration, and in-flight gauge) for every request.
11
+ *
12
+ * Attributes are kept low-cardinality on purpose:
13
+ * - http.request.method: GET / POST / ...
14
+ * - http.route: Express route template (e.g. /api/users/:id)
15
+ * or "unmatched" when nothing matched the request.
16
+ * - http.response.status_code: full status code (bounded set).
17
+ * - status_class: 2xx / 3xx / 4xx / 5xx — handy for fast queries.
18
+ *
19
+ * High-cardinality identifiers (raw URL, query string, userId, projectId,
20
+ * requestId) intentionally stay on traces and logs.
21
+ */
22
+ const HttpMetricsMiddleware: (
23
+ req: ExpressRequest,
24
+ res: ExpressResponse,
25
+ next: NextFunction,
26
+ ) => void = (
27
+ req: ExpressRequest,
28
+ res: ExpressResponse,
29
+ next: NextFunction,
30
+ ): void => {
31
+ const startNs: bigint = process.hrtime.bigint();
32
+ const method: string = (req.method || "UNKNOWN").toUpperCase();
33
+
34
+ const inFlight: ReturnType<typeof AppMetrics.getHttpRequestsInFlight> =
35
+ AppMetrics.getHttpRequestsInFlight();
36
+
37
+ inFlight.add(1, { "http.request.method": method });
38
+
39
+ let recorded: boolean = false;
40
+
41
+ const recordOnce: () => void = (): void => {
42
+ if (recorded) {
43
+ return;
44
+ }
45
+ recorded = true;
46
+
47
+ const elapsedNs: bigint = process.hrtime.bigint() - startNs;
48
+ const durationMs: number = Number(elapsedNs) / 1e6;
49
+ const statusCode: number = res.statusCode || 0;
50
+ const statusClass: string =
51
+ statusCode >= 100 && statusCode < 600
52
+ ? `${Math.floor(statusCode / 100)}xx`
53
+ : "unknown";
54
+
55
+ /*
56
+ * Express populates req.route once the request has matched a route
57
+ * handler. For 404s (no match), record the request under a stable
58
+ * "unmatched" label rather than the raw URL to avoid cardinality blowup.
59
+ */
60
+ const routeWithMethod: { path?: string } | undefined = (
61
+ req as ExpressRequest & { route?: { path?: string } }
62
+ ).route;
63
+
64
+ const baseUrl: string = (req as ExpressRequest & { baseUrl?: string })
65
+ .baseUrl
66
+ ? (req as ExpressRequest & { baseUrl: string }).baseUrl
67
+ : "";
68
+
69
+ const routeTemplate: string =
70
+ routeWithMethod && typeof routeWithMethod.path === "string"
71
+ ? `${baseUrl}${routeWithMethod.path}`
72
+ : "unmatched";
73
+
74
+ const attributes: Record<string, string | number> = {
75
+ "http.request.method": method,
76
+ "http.route": routeTemplate,
77
+ "http.response.status_code": statusCode,
78
+ status_class: statusClass,
79
+ };
80
+
81
+ AppMetrics.getHttpRequestCounter().add(1, attributes);
82
+ AppMetrics.getHttpRequestDuration().record(durationMs, attributes);
83
+ inFlight.add(-1, { "http.request.method": method });
84
+ };
85
+
86
+ res.on("finish", recordOnce);
87
+ res.on("close", recordOnce);
88
+
89
+ next();
90
+ };
91
+
92
+ export default HttpMetricsMiddleware;