@oneuptime/common 10.0.35 → 10.0.36

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. package/Server/Infrastructure/Postgres/SchemaMigrations/{1773761409952-MigrationName.ts → 1774000000001-MigrationName.ts} +2 -2
  2. package/Server/Infrastructure/Postgres/SchemaMigrations/Index.ts +2 -2
  3. package/Server/Types/Markdown.ts +11 -3
  4. package/Server/Utils/Monitor/MonitorCriteriaEvaluator.ts +4 -1
  5. package/Types/Code/CodeType.ts +1 -1
  6. package/Types/Metrics/MetricQueryConfigData.ts +1 -0
  7. package/Types/Monitor/CriteriaFilter.ts +19 -0
  8. package/Types/Monitor/KubernetesAlertTemplates.ts +703 -0
  9. package/Types/Monitor/KubernetesMetricCatalog.ts +347 -0
  10. package/Types/Monitor/MonitorCriteriaInstance.ts +86 -0
  11. package/Types/Monitor/MonitorStep.ts +36 -1
  12. package/Types/Monitor/MonitorStepKubernetesMonitor.ts +50 -0
  13. package/Types/Monitor/MonitorType.ts +14 -10
  14. package/UI/Components/AlertBanner/AlertBanner.tsx +69 -0
  15. package/UI/Components/ConditionsTable/ConditionsTable.tsx +149 -0
  16. package/UI/Components/Dictionary/DictionaryOfStingsViewer.tsx +35 -15
  17. package/UI/Components/ExpandableText/ExpandableText.tsx +42 -0
  18. package/UI/Components/FilterButtons/FilterButtons.tsx +60 -0
  19. package/UI/Components/Markdown.tsx/MarkdownEditor.tsx +4 -1
  20. package/UI/Components/ResourceUsageBar/ResourceUsageBar.tsx +58 -0
  21. package/UI/Components/StackedProgressBar/StackedProgressBar.tsx +81 -0
  22. package/UI/Components/StatusBadge/StatusBadge.tsx +44 -0
  23. package/UI/Components/Tabs/Tabs.tsx +36 -8
  24. package/UI/Utils/Dropdown.ts +2 -1
  25. package/build/dist/Server/Infrastructure/Postgres/SchemaMigrations/{1773761409952-MigrationName.js → 1774000000001-MigrationName.js} +3 -3
  26. package/build/dist/Server/Infrastructure/Postgres/SchemaMigrations/{1773761409952-MigrationName.js.map → 1774000000001-MigrationName.js.map} +1 -1
  27. package/build/dist/Server/Infrastructure/Postgres/SchemaMigrations/Index.js +2 -2
  28. package/build/dist/Server/Types/Markdown.js +10 -2
  29. package/build/dist/Server/Types/Markdown.js.map +1 -1
  30. package/build/dist/Server/Utils/Monitor/MonitorCriteriaEvaluator.js +2 -1
  31. package/build/dist/Server/Utils/Monitor/MonitorCriteriaEvaluator.js.map +1 -1
  32. package/build/dist/Types/Code/CodeType.js +1 -1
  33. package/build/dist/Types/Code/CodeType.js.map +1 -1
  34. package/build/dist/Types/Monitor/CriteriaFilter.js +18 -0
  35. package/build/dist/Types/Monitor/CriteriaFilter.js.map +1 -1
  36. package/build/dist/Types/Monitor/KubernetesAlertTemplates.js +594 -0
  37. package/build/dist/Types/Monitor/KubernetesAlertTemplates.js.map +1 -0
  38. package/build/dist/Types/Monitor/KubernetesMetricCatalog.js +311 -0
  39. package/build/dist/Types/Monitor/KubernetesMetricCatalog.js.map +1 -0
  40. package/build/dist/Types/Monitor/MonitorCriteriaInstance.js +78 -0
  41. package/build/dist/Types/Monitor/MonitorCriteriaInstance.js.map +1 -1
  42. package/build/dist/Types/Monitor/MonitorStep.js +24 -1
  43. package/build/dist/Types/Monitor/MonitorStep.js.map +1 -1
  44. package/build/dist/Types/Monitor/MonitorStepKubernetesMonitor.js +30 -0
  45. package/build/dist/Types/Monitor/MonitorStepKubernetesMonitor.js.map +1 -0
  46. package/build/dist/Types/Monitor/MonitorType.js +13 -10
  47. package/build/dist/Types/Monitor/MonitorType.js.map +1 -1
  48. package/build/dist/UI/Components/AlertBanner/AlertBanner.js +42 -0
  49. package/build/dist/UI/Components/AlertBanner/AlertBanner.js.map +1 -0
  50. package/build/dist/UI/Components/ConditionsTable/ConditionsTable.js +83 -0
  51. package/build/dist/UI/Components/ConditionsTable/ConditionsTable.js.map +1 -0
  52. package/build/dist/UI/Components/Dictionary/DictionaryOfStingsViewer.js +14 -8
  53. package/build/dist/UI/Components/Dictionary/DictionaryOfStingsViewer.js.map +1 -1
  54. package/build/dist/UI/Components/ExpandableText/ExpandableText.js +19 -0
  55. package/build/dist/UI/Components/ExpandableText/ExpandableText.js.map +1 -0
  56. package/build/dist/UI/Components/FilterButtons/FilterButtons.js +17 -0
  57. package/build/dist/UI/Components/FilterButtons/FilterButtons.js.map +1 -0
  58. package/build/dist/UI/Components/Markdown.tsx/MarkdownEditor.js +3 -1
  59. package/build/dist/UI/Components/Markdown.tsx/MarkdownEditor.js.map +1 -1
  60. package/build/dist/UI/Components/ResourceUsageBar/ResourceUsageBar.js +23 -0
  61. package/build/dist/UI/Components/ResourceUsageBar/ResourceUsageBar.js.map +1 -0
  62. package/build/dist/UI/Components/StackedProgressBar/StackedProgressBar.js +34 -0
  63. package/build/dist/UI/Components/StackedProgressBar/StackedProgressBar.js.map +1 -0
  64. package/build/dist/UI/Components/StatusBadge/StatusBadge.js +22 -0
  65. package/build/dist/UI/Components/StatusBadge/StatusBadge.js.map +1 -0
  66. package/build/dist/UI/Components/Tabs/Tabs.js +32 -9
  67. package/build/dist/UI/Components/Tabs/Tabs.js.map +1 -1
  68. package/build/dist/UI/Utils/Dropdown.js +2 -1
  69. package/build/dist/UI/Utils/Dropdown.js.map +1 -1
  70. package/package.json +1 -1
@@ -0,0 +1,703 @@
1
+ import ObjectID from "../ObjectID";
2
+ import MonitorStep from "./MonitorStep";
3
+ import MonitorCriteria from "./MonitorCriteria";
4
+ import MonitorCriteriaInstance from "./MonitorCriteriaInstance";
5
+ import FilterCondition from "../Filter/FilterCondition";
6
+ import { CheckOn, FilterType, EvaluateOverTimeType } from "./CriteriaFilter";
7
+ import MonitorStepKubernetesMonitor, {
8
+ KubernetesResourceScope,
9
+ } from "./MonitorStepKubernetesMonitor";
10
+ import RollingTime from "../RollingTime/RollingTime";
11
+ import MetricsAggregationType from "../Metrics/MetricsAggregationType";
12
+
13
// Grouping buckets used to organize the built-in alert templates.
export type KubernetesAlertTemplateCategory =
  | "Workload"
  | "Node"
  | "ControlPlane"
  | "Storage"
  | "Scheduling";

// Display severity of a template; independent of the project's
// incident/alert severity ObjectIDs supplied at instantiation time.
export type KubernetesAlertTemplateSeverity = "Critical" | "Warning";

/**
 * Caller-supplied, project-specific inputs needed to materialize a
 * template into a concrete MonitorStep. The status/severity ObjectIDs
 * come from the caller's project configuration.
 */
export interface KubernetesAlertTemplateArgs {
  clusterIdentifier: string;
  onlineMonitorStatusId: ObjectID;
  offlineMonitorStatusId: ObjectID;
  defaultIncidentSeverityId: ObjectID;
  defaultAlertSeverityId: ObjectID;
  monitorName: string;
}

/**
 * A predefined Kubernetes alert recipe. `getMonitorStep` builds a fully
 * configured MonitorStep (metric query plus unhealthy/healthy criteria)
 * from the supplied args.
 */
export interface KubernetesAlertTemplate {
  id: string;
  name: string;
  description: string;
  category: KubernetesAlertTemplateCategory;
  severity: KubernetesAlertTemplateSeverity;
  getMonitorStep: (args: KubernetesAlertTemplateArgs) => MonitorStep;
}
39
+
40
+ export function buildKubernetesMonitorStep(args: {
41
+ kubernetesMonitor: MonitorStepKubernetesMonitor;
42
+ offlineCriteriaInstance: MonitorCriteriaInstance;
43
+ onlineCriteriaInstance: MonitorCriteriaInstance;
44
+ }): MonitorStep {
45
+ const monitorStep: MonitorStep = new MonitorStep();
46
+
47
+ const monitorCriteria: MonitorCriteria = new MonitorCriteria();
48
+
49
+ monitorCriteria.data = {
50
+ monitorCriteriaInstanceArray: [
51
+ args.offlineCriteriaInstance,
52
+ args.onlineCriteriaInstance,
53
+ ],
54
+ };
55
+
56
+ monitorStep.data = {
57
+ id: ObjectID.generate().toString(),
58
+ monitorDestination: undefined,
59
+ doNotFollowRedirects: undefined,
60
+ monitorDestinationPort: undefined,
61
+ monitorCriteria: monitorCriteria,
62
+ requestType: "GET" as any,
63
+ requestHeaders: undefined,
64
+ requestBody: undefined,
65
+ customCode: undefined,
66
+ screenSizeTypes: undefined,
67
+ browserTypes: undefined,
68
+ retryCountOnError: undefined,
69
+ logMonitor: undefined,
70
+ traceMonitor: undefined,
71
+ metricMonitor: undefined,
72
+ exceptionMonitor: undefined,
73
+ snmpMonitor: undefined,
74
+ dnsMonitor: undefined,
75
+ domainMonitor: undefined,
76
+ externalStatusPageMonitor: undefined,
77
+ kubernetesMonitor: args.kubernetesMonitor,
78
+ };
79
+
80
+ return monitorStep;
81
+ }
82
+
83
+ export function buildOfflineCriteriaInstance(args: {
84
+ offlineMonitorStatusId: ObjectID;
85
+ incidentSeverityId: ObjectID;
86
+ alertSeverityId: ObjectID;
87
+ monitorName: string;
88
+ metricAlias: string;
89
+ filterType: FilterType;
90
+ value: number;
91
+ }): MonitorCriteriaInstance {
92
+ const instance: MonitorCriteriaInstance = new MonitorCriteriaInstance();
93
+
94
+ instance.data = {
95
+ id: ObjectID.generate().toString(),
96
+ monitorStatusId: args.offlineMonitorStatusId,
97
+ filterCondition: FilterCondition.Any,
98
+ filters: [
99
+ {
100
+ checkOn: CheckOn.MetricValue,
101
+ filterType: args.filterType,
102
+ metricMonitorOptions: {
103
+ metricAggregationType: EvaluateOverTimeType.AnyValue,
104
+ metricAlias: args.metricAlias,
105
+ },
106
+ value: args.value,
107
+ },
108
+ ],
109
+ incidents: [
110
+ {
111
+ title: `${args.monitorName} - Alert Triggered`,
112
+ description: `${args.monitorName} has triggered an alert condition.`,
113
+ incidentSeverityId: args.incidentSeverityId,
114
+ autoResolveIncident: true,
115
+ id: ObjectID.generate().toString(),
116
+ onCallPolicyIds: [],
117
+ },
118
+ ],
119
+ alerts: [
120
+ {
121
+ title: `${args.monitorName} - Alert`,
122
+ description: `${args.monitorName} has triggered an alert condition.`,
123
+ alertSeverityId: args.alertSeverityId,
124
+ autoResolveAlert: true,
125
+ id: ObjectID.generate().toString(),
126
+ onCallPolicyIds: [],
127
+ },
128
+ ],
129
+ changeMonitorStatus: true,
130
+ createIncidents: true,
131
+ createAlerts: true,
132
+ name: `${args.monitorName} - Unhealthy`,
133
+ description: `Criteria for detecting unhealthy state.`,
134
+ };
135
+
136
+ return instance;
137
+ }
138
+
139
+ export function buildOnlineCriteriaInstance(args: {
140
+ onlineMonitorStatusId: ObjectID;
141
+ metricAlias: string;
142
+ filterType: FilterType;
143
+ value: number;
144
+ }): MonitorCriteriaInstance {
145
+ const instance: MonitorCriteriaInstance = new MonitorCriteriaInstance();
146
+
147
+ instance.data = {
148
+ id: ObjectID.generate().toString(),
149
+ monitorStatusId: args.onlineMonitorStatusId,
150
+ filterCondition: FilterCondition.Any,
151
+ filters: [
152
+ {
153
+ checkOn: CheckOn.MetricValue,
154
+ filterType: args.filterType,
155
+ metricMonitorOptions: {
156
+ metricAggregationType: EvaluateOverTimeType.AnyValue,
157
+ metricAlias: args.metricAlias,
158
+ },
159
+ value: args.value,
160
+ },
161
+ ],
162
+ incidents: [],
163
+ alerts: [],
164
+ changeMonitorStatus: true,
165
+ createIncidents: false,
166
+ createAlerts: false,
167
+ name: "Healthy",
168
+ description: "Criteria for healthy state.",
169
+ };
170
+
171
+ return instance;
172
+ }
173
+
174
+ export function buildKubernetesMonitorConfig(args: {
175
+ clusterIdentifier: string;
176
+ metricName: string;
177
+ metricAlias: string;
178
+ resourceScope: KubernetesResourceScope;
179
+ rollingTime: RollingTime;
180
+ aggregationType: MetricsAggregationType;
181
+ attributes?: Record<string, string>;
182
+ }): MonitorStepKubernetesMonitor {
183
+ return {
184
+ clusterIdentifier: args.clusterIdentifier,
185
+ resourceScope: args.resourceScope,
186
+ resourceFilters: {},
187
+ metricViewConfig: {
188
+ queryConfigs: [
189
+ {
190
+ metricAliasData: {
191
+ metricVariable: args.metricAlias,
192
+ title: args.metricAlias,
193
+ description: args.metricAlias,
194
+ legend: args.metricAlias,
195
+ legendUnit: undefined,
196
+ },
197
+ metricQueryData: {
198
+ filterData: {
199
+ metricName: args.metricName,
200
+ attributes: args.attributes || {},
201
+ aggegationType: args.aggregationType,
202
+ aggregateBy: {},
203
+ },
204
+ },
205
+ },
206
+ ],
207
+ formulaConfigs: [],
208
+ },
209
+ rollingTime: args.rollingTime,
210
+ };
211
+ }
212
+
213
+ // --- Template Definitions ---
214
+
215
+ const crashLoopBackOffTemplate: KubernetesAlertTemplate = {
216
+ id: "k8s-crashloopbackoff",
217
+ name: "CrashLoopBackOff Detection",
218
+ description:
219
+ "Alert when container restart count exceeds threshold, indicating a CrashLoopBackOff condition.",
220
+ category: "Workload",
221
+ severity: "Critical",
222
+ getMonitorStep: (args: KubernetesAlertTemplateArgs): MonitorStep => {
223
+ const metricAlias: string = "container_restarts";
224
+
225
+ return buildKubernetesMonitorStep({
226
+ kubernetesMonitor: buildKubernetesMonitorConfig({
227
+ clusterIdentifier: args.clusterIdentifier,
228
+ metricName: "k8s.container.restarts",
229
+ metricAlias,
230
+ resourceScope: KubernetesResourceScope.Cluster,
231
+ rollingTime: RollingTime.Past5Minutes,
232
+ aggregationType: MetricsAggregationType.Max,
233
+ }),
234
+ offlineCriteriaInstance: buildOfflineCriteriaInstance({
235
+ offlineMonitorStatusId: args.offlineMonitorStatusId,
236
+ incidentSeverityId: args.defaultIncidentSeverityId,
237
+ alertSeverityId: args.defaultAlertSeverityId,
238
+ monitorName: args.monitorName,
239
+ metricAlias,
240
+ filterType: FilterType.GreaterThan,
241
+ value: 5,
242
+ }),
243
+ onlineCriteriaInstance: buildOnlineCriteriaInstance({
244
+ onlineMonitorStatusId: args.onlineMonitorStatusId,
245
+ metricAlias,
246
+ filterType: FilterType.LessThanOrEqualTo,
247
+ value: 5,
248
+ }),
249
+ });
250
+ },
251
+ };
252
+
253
+ const podPendingTemplate: KubernetesAlertTemplate = {
254
+ id: "k8s-pod-pending",
255
+ name: "Pod Stuck in Pending",
256
+ description:
257
+ "Alert when pods remain in Pending phase, indicating scheduling or resource issues.",
258
+ category: "Scheduling",
259
+ severity: "Warning",
260
+ getMonitorStep: (args: KubernetesAlertTemplateArgs): MonitorStep => {
261
+ const metricAlias: string = "pending_pods";
262
+
263
+ return buildKubernetesMonitorStep({
264
+ kubernetesMonitor: buildKubernetesMonitorConfig({
265
+ clusterIdentifier: args.clusterIdentifier,
266
+ metricName: "k8s.pod.phase",
267
+ metricAlias,
268
+ resourceScope: KubernetesResourceScope.Cluster,
269
+ rollingTime: RollingTime.Past5Minutes,
270
+ aggregationType: MetricsAggregationType.Sum,
271
+ attributes: { "k8s.pod.phase": "Pending" },
272
+ }),
273
+ offlineCriteriaInstance: buildOfflineCriteriaInstance({
274
+ offlineMonitorStatusId: args.offlineMonitorStatusId,
275
+ incidentSeverityId: args.defaultIncidentSeverityId,
276
+ alertSeverityId: args.defaultAlertSeverityId,
277
+ monitorName: args.monitorName,
278
+ metricAlias,
279
+ filterType: FilterType.GreaterThan,
280
+ value: 0,
281
+ }),
282
+ onlineCriteriaInstance: buildOnlineCriteriaInstance({
283
+ onlineMonitorStatusId: args.onlineMonitorStatusId,
284
+ metricAlias,
285
+ filterType: FilterType.EqualTo,
286
+ value: 0,
287
+ }),
288
+ });
289
+ },
290
+ };
291
+
292
+ const nodeNotReadyTemplate: KubernetesAlertTemplate = {
293
+ id: "k8s-node-not-ready",
294
+ name: "Node Not Ready",
295
+ description:
296
+ "Alert when a node condition transitions to NotReady, indicating node health issues.",
297
+ category: "Node",
298
+ severity: "Critical",
299
+ getMonitorStep: (args: KubernetesAlertTemplateArgs): MonitorStep => {
300
+ const metricAlias: string = "node_ready";
301
+
302
+ return buildKubernetesMonitorStep({
303
+ kubernetesMonitor: buildKubernetesMonitorConfig({
304
+ clusterIdentifier: args.clusterIdentifier,
305
+ metricName: "k8s.node.condition_ready",
306
+ metricAlias,
307
+ resourceScope: KubernetesResourceScope.Node,
308
+ rollingTime: RollingTime.Past5Minutes,
309
+ aggregationType: MetricsAggregationType.Min,
310
+ }),
311
+ offlineCriteriaInstance: buildOfflineCriteriaInstance({
312
+ offlineMonitorStatusId: args.offlineMonitorStatusId,
313
+ incidentSeverityId: args.defaultIncidentSeverityId,
314
+ alertSeverityId: args.defaultAlertSeverityId,
315
+ monitorName: args.monitorName,
316
+ metricAlias,
317
+ filterType: FilterType.EqualTo,
318
+ value: 0,
319
+ }),
320
+ onlineCriteriaInstance: buildOnlineCriteriaInstance({
321
+ onlineMonitorStatusId: args.onlineMonitorStatusId,
322
+ metricAlias,
323
+ filterType: FilterType.GreaterThan,
324
+ value: 0,
325
+ }),
326
+ });
327
+ },
328
+ };
329
+
330
+ const highCpuTemplate: KubernetesAlertTemplate = {
331
+ id: "k8s-high-cpu",
332
+ name: "High Node CPU Utilization",
333
+ description: "Alert when node CPU utilization exceeds 90% sustained.",
334
+ category: "Node",
335
+ severity: "Warning",
336
+ getMonitorStep: (args: KubernetesAlertTemplateArgs): MonitorStep => {
337
+ const metricAlias: string = "node_cpu";
338
+
339
+ return buildKubernetesMonitorStep({
340
+ kubernetesMonitor: buildKubernetesMonitorConfig({
341
+ clusterIdentifier: args.clusterIdentifier,
342
+ metricName: "k8s.node.cpu.utilization",
343
+ metricAlias,
344
+ resourceScope: KubernetesResourceScope.Node,
345
+ rollingTime: RollingTime.Past5Minutes,
346
+ aggregationType: MetricsAggregationType.Avg,
347
+ }),
348
+ offlineCriteriaInstance: buildOfflineCriteriaInstance({
349
+ offlineMonitorStatusId: args.offlineMonitorStatusId,
350
+ incidentSeverityId: args.defaultIncidentSeverityId,
351
+ alertSeverityId: args.defaultAlertSeverityId,
352
+ monitorName: args.monitorName,
353
+ metricAlias,
354
+ filterType: FilterType.GreaterThan,
355
+ value: 90,
356
+ }),
357
+ onlineCriteriaInstance: buildOnlineCriteriaInstance({
358
+ onlineMonitorStatusId: args.onlineMonitorStatusId,
359
+ metricAlias,
360
+ filterType: FilterType.LessThanOrEqualTo,
361
+ value: 90,
362
+ }),
363
+ });
364
+ },
365
+ };
366
+
367
+ const highMemoryTemplate: KubernetesAlertTemplate = {
368
+ id: "k8s-high-memory",
369
+ name: "High Node Memory Utilization",
370
+ description: "Alert when node memory utilization exceeds 85% sustained.",
371
+ category: "Node",
372
+ severity: "Warning",
373
+ getMonitorStep: (args: KubernetesAlertTemplateArgs): MonitorStep => {
374
+ const metricAlias: string = "node_memory";
375
+
376
+ return buildKubernetesMonitorStep({
377
+ kubernetesMonitor: buildKubernetesMonitorConfig({
378
+ clusterIdentifier: args.clusterIdentifier,
379
+ metricName: "k8s.node.memory.usage",
380
+ metricAlias,
381
+ resourceScope: KubernetesResourceScope.Node,
382
+ rollingTime: RollingTime.Past5Minutes,
383
+ aggregationType: MetricsAggregationType.Avg,
384
+ }),
385
+ offlineCriteriaInstance: buildOfflineCriteriaInstance({
386
+ offlineMonitorStatusId: args.offlineMonitorStatusId,
387
+ incidentSeverityId: args.defaultIncidentSeverityId,
388
+ alertSeverityId: args.defaultAlertSeverityId,
389
+ monitorName: args.monitorName,
390
+ metricAlias,
391
+ filterType: FilterType.GreaterThan,
392
+ value: 85,
393
+ }),
394
+ onlineCriteriaInstance: buildOnlineCriteriaInstance({
395
+ onlineMonitorStatusId: args.onlineMonitorStatusId,
396
+ metricAlias,
397
+ filterType: FilterType.LessThanOrEqualTo,
398
+ value: 85,
399
+ }),
400
+ });
401
+ },
402
+ };
403
+
404
+ const deploymentReplicaMismatchTemplate: KubernetesAlertTemplate = {
405
+ id: "k8s-deployment-replica-mismatch",
406
+ name: "Deployment Replica Mismatch",
407
+ description:
408
+ "Alert when available replicas are less than desired replicas for a deployment.",
409
+ category: "Workload",
410
+ severity: "Warning",
411
+ getMonitorStep: (args: KubernetesAlertTemplateArgs): MonitorStep => {
412
+ const metricAlias: string = "unavailable_replicas";
413
+
414
+ return buildKubernetesMonitorStep({
415
+ kubernetesMonitor: buildKubernetesMonitorConfig({
416
+ clusterIdentifier: args.clusterIdentifier,
417
+ metricName: "k8s.deployment.unavailable_replicas",
418
+ metricAlias,
419
+ resourceScope: KubernetesResourceScope.Workload,
420
+ rollingTime: RollingTime.Past5Minutes,
421
+ aggregationType: MetricsAggregationType.Max,
422
+ }),
423
+ offlineCriteriaInstance: buildOfflineCriteriaInstance({
424
+ offlineMonitorStatusId: args.offlineMonitorStatusId,
425
+ incidentSeverityId: args.defaultIncidentSeverityId,
426
+ alertSeverityId: args.defaultAlertSeverityId,
427
+ monitorName: args.monitorName,
428
+ metricAlias,
429
+ filterType: FilterType.GreaterThan,
430
+ value: 0,
431
+ }),
432
+ onlineCriteriaInstance: buildOnlineCriteriaInstance({
433
+ onlineMonitorStatusId: args.onlineMonitorStatusId,
434
+ metricAlias,
435
+ filterType: FilterType.EqualTo,
436
+ value: 0,
437
+ }),
438
+ });
439
+ },
440
+ };
441
+
442
+ const jobFailuresTemplate: KubernetesAlertTemplate = {
443
+ id: "k8s-job-failures",
444
+ name: "Job Failures",
445
+ description: "Alert when Kubernetes jobs fail.",
446
+ category: "Workload",
447
+ severity: "Warning",
448
+ getMonitorStep: (args: KubernetesAlertTemplateArgs): MonitorStep => {
449
+ const metricAlias: string = "failed_pods";
450
+
451
+ return buildKubernetesMonitorStep({
452
+ kubernetesMonitor: buildKubernetesMonitorConfig({
453
+ clusterIdentifier: args.clusterIdentifier,
454
+ metricName: "k8s.job.failed_pods",
455
+ metricAlias,
456
+ resourceScope: KubernetesResourceScope.Workload,
457
+ rollingTime: RollingTime.Past5Minutes,
458
+ aggregationType: MetricsAggregationType.Max,
459
+ }),
460
+ offlineCriteriaInstance: buildOfflineCriteriaInstance({
461
+ offlineMonitorStatusId: args.offlineMonitorStatusId,
462
+ incidentSeverityId: args.defaultIncidentSeverityId,
463
+ alertSeverityId: args.defaultAlertSeverityId,
464
+ monitorName: args.monitorName,
465
+ metricAlias,
466
+ filterType: FilterType.GreaterThan,
467
+ value: 0,
468
+ }),
469
+ onlineCriteriaInstance: buildOnlineCriteriaInstance({
470
+ onlineMonitorStatusId: args.onlineMonitorStatusId,
471
+ metricAlias,
472
+ filterType: FilterType.EqualTo,
473
+ value: 0,
474
+ }),
475
+ });
476
+ },
477
+ };
478
+
479
+ const etcdNoLeaderTemplate: KubernetesAlertTemplate = {
480
+ id: "k8s-etcd-no-leader",
481
+ name: "etcd No Leader",
482
+ description:
483
+ "Alert immediately when etcd has no leader elected. This is a critical cluster health issue.",
484
+ category: "ControlPlane",
485
+ severity: "Critical",
486
+ getMonitorStep: (args: KubernetesAlertTemplateArgs): MonitorStep => {
487
+ const metricAlias: string = "etcd_has_leader";
488
+
489
+ return buildKubernetesMonitorStep({
490
+ kubernetesMonitor: buildKubernetesMonitorConfig({
491
+ clusterIdentifier: args.clusterIdentifier,
492
+ metricName: "etcd_server_has_leader",
493
+ metricAlias,
494
+ resourceScope: KubernetesResourceScope.Cluster,
495
+ rollingTime: RollingTime.Past1Minute,
496
+ aggregationType: MetricsAggregationType.Min,
497
+ }),
498
+ offlineCriteriaInstance: buildOfflineCriteriaInstance({
499
+ offlineMonitorStatusId: args.offlineMonitorStatusId,
500
+ incidentSeverityId: args.defaultIncidentSeverityId,
501
+ alertSeverityId: args.defaultAlertSeverityId,
502
+ monitorName: args.monitorName,
503
+ metricAlias,
504
+ filterType: FilterType.EqualTo,
505
+ value: 0,
506
+ }),
507
+ onlineCriteriaInstance: buildOnlineCriteriaInstance({
508
+ onlineMonitorStatusId: args.onlineMonitorStatusId,
509
+ metricAlias,
510
+ filterType: FilterType.GreaterThan,
511
+ value: 0,
512
+ }),
513
+ });
514
+ },
515
+ };
516
+
517
+ const apiServerThrottlingTemplate: KubernetesAlertTemplate = {
518
+ id: "k8s-apiserver-throttling",
519
+ name: "API Server Throttling",
520
+ description:
521
+ "Alert when the Kubernetes API server is dropping requests due to throttling.",
522
+ category: "ControlPlane",
523
+ severity: "Critical",
524
+ getMonitorStep: (args: KubernetesAlertTemplateArgs): MonitorStep => {
525
+ const metricAlias: string = "dropped_requests";
526
+
527
+ return buildKubernetesMonitorStep({
528
+ kubernetesMonitor: buildKubernetesMonitorConfig({
529
+ clusterIdentifier: args.clusterIdentifier,
530
+ metricName: "apiserver_dropped_requests_total",
531
+ metricAlias,
532
+ resourceScope: KubernetesResourceScope.Cluster,
533
+ rollingTime: RollingTime.Past5Minutes,
534
+ aggregationType: MetricsAggregationType.Sum,
535
+ }),
536
+ offlineCriteriaInstance: buildOfflineCriteriaInstance({
537
+ offlineMonitorStatusId: args.offlineMonitorStatusId,
538
+ incidentSeverityId: args.defaultIncidentSeverityId,
539
+ alertSeverityId: args.defaultAlertSeverityId,
540
+ monitorName: args.monitorName,
541
+ metricAlias,
542
+ filterType: FilterType.GreaterThan,
543
+ value: 0,
544
+ }),
545
+ onlineCriteriaInstance: buildOnlineCriteriaInstance({
546
+ onlineMonitorStatusId: args.onlineMonitorStatusId,
547
+ metricAlias,
548
+ filterType: FilterType.EqualTo,
549
+ value: 0,
550
+ }),
551
+ });
552
+ },
553
+ };
554
+
555
+ const schedulerBacklogTemplate: KubernetesAlertTemplate = {
556
+ id: "k8s-scheduler-backlog",
557
+ name: "Scheduler Backlog",
558
+ description:
559
+ "Alert when there are pods waiting to be scheduled for more than 5 minutes.",
560
+ category: "Scheduling",
561
+ severity: "Warning",
562
+ getMonitorStep: (args: KubernetesAlertTemplateArgs): MonitorStep => {
563
+ const metricAlias: string = "pending_pods";
564
+
565
+ return buildKubernetesMonitorStep({
566
+ kubernetesMonitor: buildKubernetesMonitorConfig({
567
+ clusterIdentifier: args.clusterIdentifier,
568
+ metricName: "scheduler_pending_pods",
569
+ metricAlias,
570
+ resourceScope: KubernetesResourceScope.Cluster,
571
+ rollingTime: RollingTime.Past5Minutes,
572
+ aggregationType: MetricsAggregationType.Avg,
573
+ }),
574
+ offlineCriteriaInstance: buildOfflineCriteriaInstance({
575
+ offlineMonitorStatusId: args.offlineMonitorStatusId,
576
+ incidentSeverityId: args.defaultIncidentSeverityId,
577
+ alertSeverityId: args.defaultAlertSeverityId,
578
+ monitorName: args.monitorName,
579
+ metricAlias,
580
+ filterType: FilterType.GreaterThan,
581
+ value: 0,
582
+ }),
583
+ onlineCriteriaInstance: buildOnlineCriteriaInstance({
584
+ onlineMonitorStatusId: args.onlineMonitorStatusId,
585
+ metricAlias,
586
+ filterType: FilterType.EqualTo,
587
+ value: 0,
588
+ }),
589
+ });
590
+ },
591
+ };
592
+
593
+ const highDiskUsageTemplate: KubernetesAlertTemplate = {
594
+ id: "k8s-high-disk-usage",
595
+ name: "High Node Disk Usage",
596
+ description: "Alert when node filesystem usage exceeds 90% capacity.",
597
+ category: "Storage",
598
+ severity: "Warning",
599
+ getMonitorStep: (args: KubernetesAlertTemplateArgs): MonitorStep => {
600
+ const metricAlias: string = "disk_usage";
601
+
602
+ return buildKubernetesMonitorStep({
603
+ kubernetesMonitor: buildKubernetesMonitorConfig({
604
+ clusterIdentifier: args.clusterIdentifier,
605
+ metricName: "k8s.node.filesystem.usage",
606
+ metricAlias,
607
+ resourceScope: KubernetesResourceScope.Node,
608
+ rollingTime: RollingTime.Past5Minutes,
609
+ aggregationType: MetricsAggregationType.Avg,
610
+ }),
611
+ offlineCriteriaInstance: buildOfflineCriteriaInstance({
612
+ offlineMonitorStatusId: args.offlineMonitorStatusId,
613
+ incidentSeverityId: args.defaultIncidentSeverityId,
614
+ alertSeverityId: args.defaultAlertSeverityId,
615
+ monitorName: args.monitorName,
616
+ metricAlias,
617
+ filterType: FilterType.GreaterThan,
618
+ value: 90,
619
+ }),
620
+ onlineCriteriaInstance: buildOnlineCriteriaInstance({
621
+ onlineMonitorStatusId: args.onlineMonitorStatusId,
622
+ metricAlias,
623
+ filterType: FilterType.LessThanOrEqualTo,
624
+ value: 90,
625
+ }),
626
+ });
627
+ },
628
+ };
629
+
630
+ const daemonSetUnavailableTemplate: KubernetesAlertTemplate = {
631
+ id: "k8s-daemonset-unavailable",
632
+ name: "DaemonSet Unavailable Nodes",
633
+ description:
634
+ "Alert when a DaemonSet has unavailable nodes where the daemon pod should be running.",
635
+ category: "Workload",
636
+ severity: "Warning",
637
+ getMonitorStep: (args: KubernetesAlertTemplateArgs): MonitorStep => {
638
+ const metricAlias: string = "unavailable_nodes";
639
+
640
+ return buildKubernetesMonitorStep({
641
+ kubernetesMonitor: buildKubernetesMonitorConfig({
642
+ clusterIdentifier: args.clusterIdentifier,
643
+ metricName: "k8s.daemonset.misscheduled_nodes",
644
+ metricAlias,
645
+ resourceScope: KubernetesResourceScope.Workload,
646
+ rollingTime: RollingTime.Past5Minutes,
647
+ aggregationType: MetricsAggregationType.Max,
648
+ }),
649
+ offlineCriteriaInstance: buildOfflineCriteriaInstance({
650
+ offlineMonitorStatusId: args.offlineMonitorStatusId,
651
+ incidentSeverityId: args.defaultIncidentSeverityId,
652
+ alertSeverityId: args.defaultAlertSeverityId,
653
+ monitorName: args.monitorName,
654
+ metricAlias,
655
+ filterType: FilterType.GreaterThan,
656
+ value: 0,
657
+ }),
658
+ onlineCriteriaInstance: buildOnlineCriteriaInstance({
659
+ onlineMonitorStatusId: args.onlineMonitorStatusId,
660
+ metricAlias,
661
+ filterType: FilterType.EqualTo,
662
+ value: 0,
663
+ }),
664
+ });
665
+ },
666
+ };
667
+
668
+ export function getAllKubernetesAlertTemplates(): Array<KubernetesAlertTemplate> {
669
+ return [
670
+ crashLoopBackOffTemplate,
671
+ podPendingTemplate,
672
+ nodeNotReadyTemplate,
673
+ highCpuTemplate,
674
+ highMemoryTemplate,
675
+ deploymentReplicaMismatchTemplate,
676
+ jobFailuresTemplate,
677
+ etcdNoLeaderTemplate,
678
+ apiServerThrottlingTemplate,
679
+ schedulerBacklogTemplate,
680
+ highDiskUsageTemplate,
681
+ daemonSetUnavailableTemplate,
682
+ ];
683
+ }
684
+
685
+ export function getKubernetesAlertTemplatesByCategory(
686
+ category: KubernetesAlertTemplateCategory,
687
+ ): Array<KubernetesAlertTemplate> {
688
+ return getAllKubernetesAlertTemplates().filter(
689
+ (template: KubernetesAlertTemplate) => {
690
+ return template.category === category;
691
+ },
692
+ );
693
+ }
694
+
695
+ export function getKubernetesAlertTemplateById(
696
+ id: string,
697
+ ): KubernetesAlertTemplate | undefined {
698
+ return getAllKubernetesAlertTemplates().find(
699
+ (template: KubernetesAlertTemplate) => {
700
+ return template.id === id;
701
+ },
702
+ );
703
+ }