@oneuptime/common 10.5.32 → 10.5.34
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Models/DatabaseModels/KubernetesResource.ts +37 -0
- package/Server/API/KubernetesResourceAPI.ts +27 -18
- package/Server/Infrastructure/Postgres/SchemaMigrations/1780651429467-AddKubernetesLatestMemoryPercent.ts +19 -0
- package/Server/Infrastructure/Postgres/SchemaMigrations/Index.ts +2 -0
- package/Server/Services/KubernetesResourceService.ts +37 -11
- package/Server/Utils/Monitor/MonitorAlert.ts +34 -0
- package/Server/Utils/Monitor/MonitorIncident.ts +60 -93
- package/Server/Utils/Monitor/MonitorMaintenanceSuppression.ts +229 -0
- package/Server/Utils/Monitor/MonitorResource.ts +18 -0
- package/Server/Utils/Monitor/SeriesResourceLabels.ts +156 -0
- package/Tests/Server/Utils/Monitor/MonitorMaintenanceSuppression.test.ts +211 -0
- package/build/dist/Models/DatabaseModels/KubernetesResource.js +38 -0
- package/build/dist/Models/DatabaseModels/KubernetesResource.js.map +1 -1
- package/build/dist/Server/API/KubernetesResourceAPI.js +6 -4
- package/build/dist/Server/API/KubernetesResourceAPI.js.map +1 -1
- package/build/dist/Server/Infrastructure/Postgres/SchemaMigrations/1780651429467-AddKubernetesLatestMemoryPercent.js +12 -0
- package/build/dist/Server/Infrastructure/Postgres/SchemaMigrations/1780651429467-AddKubernetesLatestMemoryPercent.js.map +1 -0
- package/build/dist/Server/Infrastructure/Postgres/SchemaMigrations/Index.js +2 -0
- package/build/dist/Server/Infrastructure/Postgres/SchemaMigrations/Index.js.map +1 -1
- package/build/dist/Server/Services/KubernetesResourceService.js +13 -5
- package/build/dist/Server/Services/KubernetesResourceService.js.map +1 -1
- package/build/dist/Server/Utils/Monitor/MonitorAlert.js +36 -17
- package/build/dist/Server/Utils/Monitor/MonitorAlert.js.map +1 -1
- package/build/dist/Server/Utils/Monitor/MonitorIncident.js +60 -107
- package/build/dist/Server/Utils/Monitor/MonitorIncident.js.map +1 -1
- package/build/dist/Server/Utils/Monitor/MonitorMaintenanceSuppression.js +165 -0
- package/build/dist/Server/Utils/Monitor/MonitorMaintenanceSuppression.js.map +1 -0
- package/build/dist/Server/Utils/Monitor/MonitorResource.js +16 -0
- package/build/dist/Server/Utils/Monitor/MonitorResource.js.map +1 -1
- package/build/dist/Server/Utils/Monitor/SeriesResourceLabels.js +106 -0
- package/build/dist/Server/Utils/Monitor/SeriesResourceLabels.js.map +1 -0
- package/build/dist/Tests/Server/Utils/Monitor/MonitorMaintenanceSuppression.test.js +142 -0
- package/build/dist/Tests/Server/Utils/Monitor/MonitorMaintenanceSuppression.test.js.map +1 -0
- package/package.json +1 -1
|
@@ -553,6 +553,43 @@ export default class KubernetesResource extends BaseModel {
|
|
|
553
553
|
})
|
|
554
554
|
public latestMemoryBytes?: number = undefined;
|
|
555
555
|
|
|
556
|
+
@ColumnAccessControl({
|
|
557
|
+
create: [],
|
|
558
|
+
read: READ_PERMISSIONS,
|
|
559
|
+
update: [],
|
|
560
|
+
})
|
|
561
|
+
@TableColumn({
|
|
562
|
+
required: false,
|
|
563
|
+
type: TableColumnType.Number,
|
|
564
|
+
canReadOnRelationQuery: true,
|
|
565
|
+
title: "Latest Memory Percent",
|
|
566
|
+
description:
|
|
567
|
+
"Most recent memory usage as a percent of the resource's node allocatable memory (Pod or Node). Stored as decimal — mirrors latestCpuPercent — so the workload/namespace list views can SUM a per-pod percentage. Null until the first metric arrives or while the node's allocatable memory is still unknown.",
|
|
568
|
+
})
|
|
569
|
+
@Column({
|
|
570
|
+
nullable: true,
|
|
571
|
+
type: ColumnType.Decimal,
|
|
572
|
+
transformer: {
|
|
573
|
+
to: (value: number | null | undefined): number | null => {
|
|
574
|
+
if (value === null || value === undefined) {
|
|
575
|
+
return null;
|
|
576
|
+
}
|
|
577
|
+
return value;
|
|
578
|
+
},
|
|
579
|
+
from: (value: string | number | null | undefined): number | null => {
|
|
580
|
+
if (value === null || value === undefined) {
|
|
581
|
+
return null;
|
|
582
|
+
}
|
|
583
|
+
if (typeof value === "number") {
|
|
584
|
+
return value;
|
|
585
|
+
}
|
|
586
|
+
const parsed: number = parseFloat(value);
|
|
587
|
+
return isNaN(parsed) ? null : parsed;
|
|
588
|
+
},
|
|
589
|
+
},
|
|
590
|
+
})
|
|
591
|
+
public latestMemoryPercent?: number = undefined;
|
|
592
|
+
|
|
556
593
|
@ColumnAccessControl({
|
|
557
594
|
create: [],
|
|
558
595
|
read: READ_PERMISSIONS,
|
|
@@ -143,19 +143,24 @@ export default class KubernetesResourceAPI extends BaseAPI<
|
|
|
143
143
|
|
|
144
144
|
/*
|
|
145
145
|
* Translate a service-layer Map of aggregates into a JSON dict
|
|
146
|
-
* { name: { cpuPercent, memoryBytes } } suitable for
|
|
147
|
-
* memoryBytes is stringified so values past 2 GiB don't
|
|
148
|
-
* client-side number parsing in the JSON path; the UI parses
|
|
149
|
-
* back to a number for rendering.
|
|
146
|
+
* { name: { cpuPercent, memoryBytes, memoryPercent } } suitable for
|
|
147
|
+
* the wire. memoryBytes is stringified so values past 2 GiB don't
|
|
148
|
+
* overflow client-side number parsing in the JSON path; the UI parses
|
|
149
|
+
* it back to a number for rendering. memoryPercent is the summed
|
|
150
|
+
* per-pod "% of node allocatable memory" (parallel to cpuPercent).
|
|
150
151
|
*/
|
|
151
152
|
private mapAggregatesToJson(
|
|
152
|
-
aggregates: Map<
|
|
153
|
+
aggregates: Map<
|
|
154
|
+
string,
|
|
155
|
+
{ cpuPercent: number; memoryBytes: number; memoryPercent: number }
|
|
156
|
+
>,
|
|
153
157
|
): JSONObject {
|
|
154
158
|
const out: JSONObject = {};
|
|
155
159
|
for (const [name, value] of aggregates.entries()) {
|
|
156
160
|
out[name] = {
|
|
157
161
|
cpuPercent: value.cpuPercent,
|
|
158
162
|
memoryBytes: value.memoryBytes.toString(),
|
|
163
|
+
memoryPercent: value.memoryPercent,
|
|
159
164
|
};
|
|
160
165
|
}
|
|
161
166
|
return out;
|
|
@@ -169,12 +174,14 @@ export default class KubernetesResourceAPI extends BaseAPI<
|
|
|
169
174
|
await this.resolveClusterForRequest(req);
|
|
170
175
|
|
|
171
176
|
const staleAfter: Date = new Date(Date.now() - 15 * 60 * 1000);
|
|
172
|
-
const aggregates: Map<
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
177
|
+
const aggregates: Map<
|
|
178
|
+
string,
|
|
179
|
+
{ cpuPercent: number; memoryBytes: number; memoryPercent: number }
|
|
180
|
+
> = await this.service.getLatestMetricsByNamespace({
|
|
181
|
+
projectId,
|
|
182
|
+
kubernetesClusterId,
|
|
183
|
+
staleAfter,
|
|
184
|
+
});
|
|
178
185
|
|
|
179
186
|
return Response.sendJsonObjectResponse(req, res, {
|
|
180
187
|
aggregates: this.mapAggregatesToJson(aggregates),
|
|
@@ -210,13 +217,15 @@ export default class KubernetesResourceAPI extends BaseAPI<
|
|
|
210
217
|
await this.resolveClusterForRequest(req);
|
|
211
218
|
|
|
212
219
|
const staleAfter: Date = new Date(Date.now() - 15 * 60 * 1000);
|
|
213
|
-
const aggregates: Map<
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
+
const aggregates: Map<
|
|
221
|
+
string,
|
|
222
|
+
{ cpuPercent: number; memoryBytes: number; memoryPercent: number }
|
|
223
|
+
> = await this.service.getLatestMetricsByOwner({
|
|
224
|
+
projectId,
|
|
225
|
+
kubernetesClusterId,
|
|
226
|
+
ownerKind,
|
|
227
|
+
staleAfter,
|
|
228
|
+
});
|
|
220
229
|
|
|
221
230
|
return Response.sendJsonObjectResponse(req, res, {
|
|
222
231
|
aggregates: this.mapAggregatesToJson(aggregates),
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import { MigrationInterface, QueryRunner } from "typeorm";
|
|
2
|
+
|
|
3
|
+
export class AddKubernetesLatestMemoryPercent1780651429467
|
|
4
|
+
implements MigrationInterface
|
|
5
|
+
{
|
|
6
|
+
public name = "AddKubernetesLatestMemoryPercent1780651429467";
|
|
7
|
+
|
|
8
|
+
public async up(queryRunner: QueryRunner): Promise<void> {
|
|
9
|
+
await queryRunner.query(
|
|
10
|
+
`ALTER TABLE "KubernetesResource" ADD "latestMemoryPercent" numeric`,
|
|
11
|
+
);
|
|
12
|
+
}
|
|
13
|
+
|
|
14
|
+
public async down(queryRunner: QueryRunner): Promise<void> {
|
|
15
|
+
await queryRunner.query(
|
|
16
|
+
`ALTER TABLE "KubernetesResource" DROP COLUMN "latestMemoryPercent"`,
|
|
17
|
+
);
|
|
18
|
+
}
|
|
19
|
+
}
|
|
@@ -366,6 +366,7 @@ import { MigrationName1780382837019 } from "./1780382837019-MigrationName";
|
|
|
366
366
|
import { MigrationName1780387560604 } from "./1780387560604-MigrationName";
|
|
367
367
|
import { MigrationName1780388219225 } from "./1780388219225-MigrationName";
|
|
368
368
|
import { AddMetricAndTraceSavedView1780645560183 } from "./1780645560183-AddMetricAndTraceSavedView";
|
|
369
|
+
import { AddKubernetesLatestMemoryPercent1780651429467 } from "./1780651429467-AddKubernetesLatestMemoryPercent";
|
|
369
370
|
|
|
370
371
|
export default [
|
|
371
372
|
InitialMigration,
|
|
@@ -736,4 +737,5 @@ export default [
|
|
|
736
737
|
MigrationName1780387560604,
|
|
737
738
|
MigrationName1780388219225,
|
|
738
739
|
AddMetricAndTraceSavedView1780645560183,
|
|
740
|
+
AddKubernetesLatestMemoryPercent1780651429467,
|
|
739
741
|
];
|
|
@@ -46,6 +46,7 @@ export interface ResourceLatestMetric {
|
|
|
46
46
|
name: string;
|
|
47
47
|
cpuPercent: number | null;
|
|
48
48
|
memoryBytes: number | null;
|
|
49
|
+
memoryPercent: number | null;
|
|
49
50
|
observedAt: Date;
|
|
50
51
|
/*
|
|
51
52
|
* Optional Pod controller lineage. Read from
|
|
@@ -447,7 +448,7 @@ export class Service extends DatabaseService<Model> {
|
|
|
447
448
|
|
|
448
449
|
for (const m of chunk) {
|
|
449
450
|
valueFragments.push(
|
|
450
|
-
`($${paramIndex++}, $${paramIndex++}, $${paramIndex++}, $${paramIndex++}::numeric, $${paramIndex++}::bigint, $${paramIndex++}::timestamptz, $${paramIndex++}, $${paramIndex++})`,
|
|
451
|
+
`($${paramIndex++}, $${paramIndex++}, $${paramIndex++}, $${paramIndex++}::numeric, $${paramIndex++}::bigint, $${paramIndex++}::numeric, $${paramIndex++}::timestamptz, $${paramIndex++}, $${paramIndex++})`,
|
|
451
452
|
);
|
|
452
453
|
params.push(
|
|
453
454
|
m.kind,
|
|
@@ -459,6 +460,9 @@ export class Service extends DatabaseService<Model> {
|
|
|
459
460
|
m.memoryBytes !== null && m.memoryBytes !== undefined
|
|
460
461
|
? Math.trunc(m.memoryBytes).toString()
|
|
461
462
|
: null,
|
|
463
|
+
m.memoryPercent !== null && m.memoryPercent !== undefined
|
|
464
|
+
? m.memoryPercent
|
|
465
|
+
: null,
|
|
462
466
|
m.observedAt,
|
|
463
467
|
m.controllerDeploymentName ?? null,
|
|
464
468
|
m.controllerCronJobName ?? null,
|
|
@@ -470,12 +474,13 @@ export class Service extends DatabaseService<Model> {
|
|
|
470
474
|
SET
|
|
471
475
|
"latestCpuPercent" = COALESCE(v."cpu", k."latestCpuPercent"),
|
|
472
476
|
"latestMemoryBytes" = COALESCE(v."mem", k."latestMemoryBytes"),
|
|
477
|
+
"latestMemoryPercent" = COALESCE(v."memPct", k."latestMemoryPercent"),
|
|
473
478
|
"metricsUpdatedAt" = v."observedAt",
|
|
474
479
|
"controllerDeploymentName" = COALESCE(v."deployName", k."controllerDeploymentName"),
|
|
475
480
|
"controllerCronJobName" = COALESCE(v."cronName", k."controllerCronJobName"),
|
|
476
481
|
"updatedAt" = now()
|
|
477
482
|
FROM (VALUES ${valueFragments.join(", ")})
|
|
478
|
-
AS v("kind", "ns", "name", "cpu", "mem", "observedAt", "deployName", "cronName")
|
|
483
|
+
AS v("kind", "ns", "name", "cpu", "mem", "memPct", "observedAt", "deployName", "cronName")
|
|
479
484
|
WHERE
|
|
480
485
|
k."projectId" = $1
|
|
481
486
|
AND k."kubernetesClusterId" = $2
|
|
@@ -744,15 +749,22 @@ export class Service extends DatabaseService<Model> {
|
|
|
744
749
|
projectId: ObjectID;
|
|
745
750
|
kubernetesClusterId: ObjectID;
|
|
746
751
|
staleAfter: Date;
|
|
747
|
-
}): Promise<
|
|
752
|
+
}): Promise<
|
|
753
|
+
Map<
|
|
754
|
+
string,
|
|
755
|
+
{ cpuPercent: number; memoryBytes: number; memoryPercent: number }
|
|
756
|
+
>
|
|
757
|
+
> {
|
|
748
758
|
const rows: Array<{
|
|
749
759
|
namespaceKey: string;
|
|
750
760
|
cpu: string | null;
|
|
751
761
|
mem: string | null;
|
|
762
|
+
memPct: string | null;
|
|
752
763
|
}> = await this.getRepository().manager.query(
|
|
753
764
|
`SELECT "namespaceKey",
|
|
754
765
|
SUM("latestCpuPercent")::text AS cpu,
|
|
755
|
-
SUM("latestMemoryBytes")::text AS mem
|
|
766
|
+
SUM("latestMemoryBytes")::text AS mem,
|
|
767
|
+
SUM("latestMemoryPercent")::text AS "memPct"
|
|
756
768
|
FROM "KubernetesResource"
|
|
757
769
|
WHERE "projectId" = $1
|
|
758
770
|
AND "kubernetesClusterId" = $2
|
|
@@ -768,12 +780,15 @@ export class Service extends DatabaseService<Model> {
|
|
|
768
780
|
],
|
|
769
781
|
);
|
|
770
782
|
|
|
771
|
-
const out: Map<
|
|
772
|
-
|
|
783
|
+
const out: Map<
|
|
784
|
+
string,
|
|
785
|
+
{ cpuPercent: number; memoryBytes: number; memoryPercent: number }
|
|
786
|
+
> = new Map();
|
|
773
787
|
for (const row of rows) {
|
|
774
788
|
out.set(row.namespaceKey || "", {
|
|
775
789
|
cpuPercent: row.cpu ? parseFloat(row.cpu) || 0 : 0,
|
|
776
790
|
memoryBytes: row.mem ? parseInt(row.mem, 10) || 0 : 0,
|
|
791
|
+
memoryPercent: row.memPct ? parseFloat(row.memPct) || 0 : 0,
|
|
777
792
|
});
|
|
778
793
|
}
|
|
779
794
|
return out;
|
|
@@ -800,11 +815,17 @@ export class Service extends DatabaseService<Model> {
|
|
|
800
815
|
kubernetesClusterId: ObjectID;
|
|
801
816
|
ownerKind: string;
|
|
802
817
|
staleAfter: Date;
|
|
803
|
-
}): Promise<
|
|
818
|
+
}): Promise<
|
|
819
|
+
Map<
|
|
820
|
+
string,
|
|
821
|
+
{ cpuPercent: number; memoryBytes: number; memoryPercent: number }
|
|
822
|
+
>
|
|
823
|
+
> {
|
|
804
824
|
let rows: Array<{
|
|
805
825
|
ownerName: string;
|
|
806
826
|
cpu: string | null;
|
|
807
827
|
mem: string | null;
|
|
828
|
+
memPct: string | null;
|
|
808
829
|
}>;
|
|
809
830
|
|
|
810
831
|
if (data.ownerKind === "Deployment" || data.ownerKind === "CronJob") {
|
|
@@ -816,7 +837,8 @@ export class Service extends DatabaseService<Model> {
|
|
|
816
837
|
`SELECT
|
|
817
838
|
"${column}" AS "ownerName",
|
|
818
839
|
SUM("latestCpuPercent")::text AS cpu,
|
|
819
|
-
SUM("latestMemoryBytes")::text AS mem
|
|
840
|
+
SUM("latestMemoryBytes")::text AS mem,
|
|
841
|
+
SUM("latestMemoryPercent")::text AS "memPct"
|
|
820
842
|
FROM "KubernetesResource"
|
|
821
843
|
WHERE "projectId" = $1
|
|
822
844
|
AND "kubernetesClusterId" = $2
|
|
@@ -837,7 +859,8 @@ export class Service extends DatabaseService<Model> {
|
|
|
837
859
|
`SELECT
|
|
838
860
|
(owner->>'name') AS "ownerName",
|
|
839
861
|
SUM("latestCpuPercent")::text AS cpu,
|
|
840
|
-
SUM("latestMemoryBytes")::text AS mem
|
|
862
|
+
SUM("latestMemoryBytes")::text AS mem,
|
|
863
|
+
SUM("latestMemoryPercent")::text AS "memPct"
|
|
841
864
|
FROM "KubernetesResource",
|
|
842
865
|
jsonb_array_elements("ownerReferences"->'items') AS owner
|
|
843
866
|
WHERE "projectId" = $1
|
|
@@ -858,8 +881,10 @@ export class Service extends DatabaseService<Model> {
|
|
|
858
881
|
);
|
|
859
882
|
}
|
|
860
883
|
|
|
861
|
-
const out: Map<
|
|
862
|
-
|
|
884
|
+
const out: Map<
|
|
885
|
+
string,
|
|
886
|
+
{ cpuPercent: number; memoryBytes: number; memoryPercent: number }
|
|
887
|
+
> = new Map();
|
|
863
888
|
for (const row of rows) {
|
|
864
889
|
if (!row.ownerName) {
|
|
865
890
|
continue;
|
|
@@ -867,6 +892,7 @@ export class Service extends DatabaseService<Model> {
|
|
|
867
892
|
out.set(row.ownerName, {
|
|
868
893
|
cpuPercent: row.cpu ? parseFloat(row.cpu) || 0 : 0,
|
|
869
894
|
memoryBytes: row.mem ? parseInt(row.mem, 10) || 0 : 0,
|
|
895
|
+
memoryPercent: row.memPct ? parseFloat(row.memPct) || 0 : 0,
|
|
870
896
|
});
|
|
871
897
|
}
|
|
872
898
|
return out;
|
|
@@ -115,6 +115,13 @@ export default class MonitorAlert {
|
|
|
115
115
|
telemetryQuery?: TelemetryQuery | undefined;
|
|
116
116
|
};
|
|
117
117
|
matchesPerSeries?: Array<PerSeriesCriteriaMatch> | undefined;
|
|
118
|
+
/**
|
|
119
|
+
* Series fingerprints whose underlying resource is inside an
|
|
120
|
+
* ongoing scheduled maintenance window. Alerts for these series are
|
|
121
|
+
* suppressed at creation time even though the monitor keeps
|
|
122
|
+
* evaluating. See MonitorMaintenanceSuppression.
|
|
123
|
+
*/
|
|
124
|
+
suppressedSeriesFingerprints?: Set<string> | undefined;
|
|
118
125
|
}): Promise<void> {
|
|
119
126
|
const alertLogAttributes: LogAttributes = {
|
|
120
127
|
projectId: input.monitor.projectId?.toString(),
|
|
@@ -164,6 +171,33 @@ export default class MonitorAlert {
|
|
|
164
171
|
const seriesRootCause: string =
|
|
165
172
|
seriesMatch?.rootCause || input.rootCause;
|
|
166
173
|
|
|
174
|
+
/*
|
|
175
|
+
* Per-series scheduled-maintenance suppression: skip creating an
|
|
176
|
+
* alert for a series whose resource is inside an ongoing
|
|
177
|
+
* maintenance window. Other series on the same monitor are
|
|
178
|
+
* unaffected. Only *new* creation is suppressed — existing open
|
|
179
|
+
* alerts follow the normal resolve path.
|
|
180
|
+
*/
|
|
181
|
+
if (
|
|
182
|
+
seriesFingerprint &&
|
|
183
|
+
input.suppressedSeriesFingerprints?.has(seriesFingerprint)
|
|
184
|
+
) {
|
|
185
|
+
logger.debug(
|
|
186
|
+
`${input.monitor.id?.toString()} - Skipping alert for series ${seriesFingerprint}: its resource is under an active scheduled maintenance window.`,
|
|
187
|
+
alertLogAttributes,
|
|
188
|
+
);
|
|
189
|
+
|
|
190
|
+
input.evaluationSummary?.events.push({
|
|
191
|
+
type: "alert-skipped",
|
|
192
|
+
title: "Alert suppressed by scheduled maintenance",
|
|
193
|
+
message:
|
|
194
|
+
"Skipped creating an alert because the resource for this series is under an active scheduled maintenance window.",
|
|
195
|
+
relatedCriteriaId: input.criteriaInstance.data?.id,
|
|
196
|
+
at: OneUptimeDate.getCurrentDate(),
|
|
197
|
+
});
|
|
198
|
+
continue;
|
|
199
|
+
}
|
|
200
|
+
|
|
167
201
|
const alreadyOpenAlert: Alert | undefined = openAlerts.find(
|
|
168
202
|
(alert: Alert) => {
|
|
169
203
|
return (
|
|
@@ -37,6 +37,9 @@ import OneUptimeDate from "../../../Types/Date";
|
|
|
37
37
|
import MonitorEvaluationSummary from "../../../Types/Monitor/MonitorEvaluationSummary";
|
|
38
38
|
import { IncidentMemberRoleAssignment } from "../../../Types/Monitor/CriteriaIncident";
|
|
39
39
|
import { PerSeriesCriteriaMatch } from "../../../Types/Probe/ProbeApiIngestResponse";
|
|
40
|
+
import SeriesResourceLabels, {
|
|
41
|
+
SeriesResourceRefs,
|
|
42
|
+
} from "./SeriesResourceLabels";
|
|
40
43
|
|
|
41
44
|
export default class MonitorIncident {
|
|
42
45
|
@CaptureSpan()
|
|
@@ -143,6 +146,14 @@ export default class MonitorIncident {
|
|
|
143
146
|
* reference `{{host.name}}` etc. via the template engine.
|
|
144
147
|
*/
|
|
145
148
|
matchesPerSeries?: Array<PerSeriesCriteriaMatch> | undefined;
|
|
149
|
+
/**
|
|
150
|
+
* Series fingerprints whose underlying resource (host, docker host,
|
|
151
|
+
* kubernetes cluster, or service) is inside an ongoing scheduled
|
|
152
|
+
* maintenance window. The monitor itself keeps evaluating — it is
|
|
153
|
+
* not attached to the maintenance — but incidents for these series
|
|
154
|
+
* are suppressed at creation time. See MonitorMaintenanceSuppression.
|
|
155
|
+
*/
|
|
156
|
+
suppressedSeriesFingerprints?: Set<string> | undefined;
|
|
146
157
|
}): Promise<void> {
|
|
147
158
|
const incidentLogAttributes: LogAttributes = {
|
|
148
159
|
projectId: input.monitor.projectId?.toString(),
|
|
@@ -202,6 +213,37 @@ export default class MonitorIncident {
|
|
|
202
213
|
const seriesRootCause: string =
|
|
203
214
|
seriesMatch?.rootCause || input.rootCause;
|
|
204
215
|
|
|
216
|
+
/*
|
|
217
|
+
* Per-series scheduled-maintenance suppression: this series'
|
|
218
|
+
* resource is inside an ongoing maintenance window, so skip
|
|
219
|
+
* creating an incident for it. Other series on the same monitor
|
|
220
|
+
* whose resources are not under maintenance still get incidents.
|
|
221
|
+
* Note: we only suppress *new* creation — any incident already
|
|
222
|
+
* open for this series is left to the normal resolve path
|
|
223
|
+
* (checkOpenIncidentsAndCloseIfResolved still sees the full
|
|
224
|
+
* breaching set), so a real incident raised before maintenance
|
|
225
|
+
* is not silently closed.
|
|
226
|
+
*/
|
|
227
|
+
if (
|
|
228
|
+
seriesFingerprint &&
|
|
229
|
+
input.suppressedSeriesFingerprints?.has(seriesFingerprint)
|
|
230
|
+
) {
|
|
231
|
+
logger.debug(
|
|
232
|
+
`${input.monitor.id?.toString()} - Skipping incident for series ${seriesFingerprint}: its resource is under an active scheduled maintenance window.`,
|
|
233
|
+
incidentLogAttributes,
|
|
234
|
+
);
|
|
235
|
+
|
|
236
|
+
input.evaluationSummary?.events.push({
|
|
237
|
+
type: "incident-skipped",
|
|
238
|
+
title: "Incident suppressed by scheduled maintenance",
|
|
239
|
+
message:
|
|
240
|
+
"Skipped creating an incident because the resource for this series is under an active scheduled maintenance window.",
|
|
241
|
+
relatedCriteriaId: input.criteriaInstance.data?.id,
|
|
242
|
+
at: OneUptimeDate.getCurrentDate(),
|
|
243
|
+
});
|
|
244
|
+
continue;
|
|
245
|
+
}
|
|
246
|
+
|
|
205
247
|
const alreadyOpenIncident: Incident | undefined = openIncidents.find(
|
|
206
248
|
(incident: Incident) => {
|
|
207
249
|
return (
|
|
@@ -485,97 +527,22 @@ export default class MonitorIncident {
|
|
|
485
527
|
}
|
|
486
528
|
|
|
487
529
|
/*
|
|
488
|
-
* Pull every host / docker-host / k8s-cluster
|
|
489
|
-
* series labels and attach the matching project-scoped
|
|
490
|
-
* the incident.
|
|
491
|
-
*
|
|
492
|
-
*
|
|
493
|
-
*
|
|
494
|
-
*
|
|
495
|
-
* the `resource.` prefix in ClickHouse, so prefixed and unprefixed
|
|
496
|
-
* forms are both accepted — whichever the group-by query surfaced,
|
|
497
|
-
* we'll find it. Multi-value labels are flattened, so a series that
|
|
498
|
-
* groups by a multi-valued attribute attaches every matching
|
|
499
|
-
* record. Lookups are always project-scoped so a stale or hostile
|
|
500
|
-
* stamp can't pull in a record from another tenant.
|
|
501
|
-
*
|
|
502
|
-
* For Docker hosts we deliberately ignore raw `host.name`/
|
|
503
|
-
* `oneuptime.host.name`: those are the Host's territory, and the
|
|
504
|
-
* ingest pipeline stamps `oneuptime.docker.host.*` independently
|
|
505
|
-
* when the source is a docker host.
|
|
530
|
+
* Pull every host / docker-host / k8s-cluster / service identifier
|
|
531
|
+
* out of the series labels and attach the matching project-scoped
|
|
532
|
+
* records to the incident. The label-key → resource-type mapping
|
|
533
|
+
* lives in SeriesResourceLabels (shared with the scheduled-maintenance
|
|
534
|
+
* suppression path so the two never disagree about which labels
|
|
535
|
+
* identify which resource). Lookups are always project-scoped so a
|
|
536
|
+
* stale or hostile stamp can't pull in a record from another tenant.
|
|
506
537
|
*/
|
|
507
538
|
private static async linkResourceContextFromSeries(input: {
|
|
508
539
|
incident: Incident;
|
|
509
540
|
seriesLabels: JSONObject;
|
|
510
541
|
projectId: ObjectID;
|
|
511
542
|
}): Promise<void> {
|
|
512
|
-
const
|
|
513
|
-
|
|
514
|
-
)
|
|
515
|
-
const found: Set<string> = new Set<string>();
|
|
516
|
-
for (const key of keys) {
|
|
517
|
-
const value: unknown = input.seriesLabels[key];
|
|
518
|
-
if (typeof value === "string" && value.length > 0) {
|
|
519
|
-
found.add(value);
|
|
520
|
-
continue;
|
|
521
|
-
}
|
|
522
|
-
if (Array.isArray(value)) {
|
|
523
|
-
for (const item of value) {
|
|
524
|
-
if (typeof item === "string" && item.length > 0) {
|
|
525
|
-
found.add(item);
|
|
526
|
-
}
|
|
527
|
-
}
|
|
528
|
-
}
|
|
529
|
-
}
|
|
530
|
-
return Array.from(found);
|
|
531
|
-
};
|
|
532
|
-
|
|
533
|
-
const hostIds: Array<string> = collect([
|
|
534
|
-
"resource.oneuptime.host.id",
|
|
535
|
-
"oneuptime.host.id",
|
|
536
|
-
]);
|
|
537
|
-
const hostNames: Array<string> = collect([
|
|
538
|
-
"resource.oneuptime.host.name",
|
|
539
|
-
"oneuptime.host.name",
|
|
540
|
-
"resource.host.name",
|
|
541
|
-
"host.name",
|
|
542
|
-
]);
|
|
543
|
-
|
|
544
|
-
const dockerHostIds: Array<string> = collect([
|
|
545
|
-
"resource.oneuptime.docker.host.id",
|
|
546
|
-
"oneuptime.docker.host.id",
|
|
547
|
-
]);
|
|
548
|
-
const dockerHostNames: Array<string> = collect([
|
|
549
|
-
"resource.oneuptime.docker.host.name",
|
|
550
|
-
"oneuptime.docker.host.name",
|
|
551
|
-
]);
|
|
552
|
-
|
|
553
|
-
const clusterIds: Array<string> = collect([
|
|
554
|
-
"resource.oneuptime.kubernetes.cluster.id",
|
|
555
|
-
"oneuptime.kubernetes.cluster.id",
|
|
556
|
-
]);
|
|
557
|
-
const clusterNames: Array<string> = collect([
|
|
558
|
-
"resource.oneuptime.kubernetes.cluster.name",
|
|
559
|
-
"oneuptime.kubernetes.cluster.name",
|
|
560
|
-
"resource.k8s.cluster.name",
|
|
561
|
-
"k8s.cluster.name",
|
|
562
|
-
]);
|
|
563
|
-
|
|
564
|
-
/*
|
|
565
|
-
* Services come from OTel-ingested telemetry. The ingest pipeline
|
|
566
|
-
* auto-creates a Service row keyed by `service.name`, so any series
|
|
567
|
-
* label that carries that attribute (raw or prefixed) tells us the
|
|
568
|
-
* emitting service. We also accept the `oneuptime.service.id`
|
|
569
|
-
* stamp for callers that already resolved the ID upstream.
|
|
570
|
-
*/
|
|
571
|
-
const serviceIds: Array<string> = collect([
|
|
572
|
-
"resource.oneuptime.service.id",
|
|
573
|
-
"oneuptime.service.id",
|
|
574
|
-
]);
|
|
575
|
-
const serviceNames: Array<string> = collect([
|
|
576
|
-
"resource.service.name",
|
|
577
|
-
"service.name",
|
|
578
|
-
]);
|
|
543
|
+
const refs: SeriesResourceRefs = SeriesResourceLabels.extractResourceRefs(
|
|
544
|
+
input.seriesLabels,
|
|
545
|
+
);
|
|
579
546
|
|
|
580
547
|
const [
|
|
581
548
|
resolvedHosts,
|
|
@@ -584,29 +551,29 @@ export default class MonitorIncident {
|
|
|
584
551
|
resolvedServices,
|
|
585
552
|
] = await Promise.all([
|
|
586
553
|
this.resolveResourceIds({
|
|
587
|
-
ids: hostIds,
|
|
588
|
-
names: hostNames,
|
|
554
|
+
ids: refs.hostIds,
|
|
555
|
+
names: refs.hostNames,
|
|
589
556
|
nameColumn: "hostIdentifier",
|
|
590
557
|
projectId: input.projectId,
|
|
591
558
|
findBy: HostService.findBy.bind(HostService),
|
|
592
559
|
}),
|
|
593
560
|
this.resolveResourceIds({
|
|
594
|
-
ids: dockerHostIds,
|
|
595
|
-
names: dockerHostNames,
|
|
561
|
+
ids: refs.dockerHostIds,
|
|
562
|
+
names: refs.dockerHostNames,
|
|
596
563
|
nameColumn: "hostIdentifier",
|
|
597
564
|
projectId: input.projectId,
|
|
598
565
|
findBy: DockerHostService.findBy.bind(DockerHostService),
|
|
599
566
|
}),
|
|
600
567
|
this.resolveResourceIds({
|
|
601
|
-
ids:
|
|
602
|
-
names:
|
|
568
|
+
ids: refs.kubernetesClusterIds,
|
|
569
|
+
names: refs.kubernetesClusterNames,
|
|
603
570
|
nameColumn: "clusterIdentifier",
|
|
604
571
|
projectId: input.projectId,
|
|
605
572
|
findBy: KubernetesClusterService.findBy.bind(KubernetesClusterService),
|
|
606
573
|
}),
|
|
607
574
|
this.resolveResourceIds({
|
|
608
|
-
ids: serviceIds,
|
|
609
|
-
names: serviceNames,
|
|
575
|
+
ids: refs.serviceIds,
|
|
576
|
+
names: refs.serviceNames,
|
|
610
577
|
nameColumn: "name",
|
|
611
578
|
projectId: input.projectId,
|
|
612
579
|
findBy: ServiceService.findBy.bind(ServiceService),
|