@l.x/datadog-cloud 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/INSTRUMENTATION_REPORT.md +217 -0
- package/LICENSE +122 -0
- package/Pulumi.dev-portal-prod.yaml +10 -0
- package/Pulumi.yaml +6 -0
- package/config.ts +243 -0
- package/dashboards/Pulumi.prod.yaml +6 -0
- package/dashboards/Pulumi.yaml +6 -0
- package/dashboards/config.ts +30 -0
- package/dashboards/dashboard-factory.ts +187 -0
- package/dashboards/dashboard-types.ts +127 -0
- package/dashboards/definitions/dev-portal/index.ts +1 -0
- package/dashboards/definitions/dev-portal/service-dashboard.ts +169 -0
- package/dashboards/definitions/index.ts +1 -0
- package/dashboards/esc/shared.yaml +7 -0
- package/dashboards/index.ts +40 -0
- package/dashboards/package.json +20 -0
- package/esc/dev-portal.yaml +13 -0
- package/factory.ts +79 -0
- package/index.ts +79 -0
- package/monitors/dev-portal/auth.ts +44 -0
- package/monitors/dev-portal/availability.ts +44 -0
- package/monitors/dev-portal/errors.ts +47 -0
- package/monitors/dev-portal/gateway.ts +47 -0
- package/monitors/dev-portal/index.ts +6 -0
- package/monitors/dev-portal/latency.ts +44 -0
- package/monitors/dev-portal/logs.ts +26 -0
- package/monitors/index.ts +8 -0
- package/package.json +20 -0
- package/tsconfig.json +16 -0
- package/types.ts +129 -0
package/index.ts
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
import * as pulumi from '@pulumi/pulumi';
|
|
2
|
+
import {createMonitors} from './factory';
|
|
3
|
+
import {
|
|
4
|
+
devPortalLatencyMonitors,
|
|
5
|
+
devPortalErrorMonitors,
|
|
6
|
+
devPortalAvailabilityMonitors,
|
|
7
|
+
devPortalAuthMonitors,
|
|
8
|
+
devPortalGatewayMonitors,
|
|
9
|
+
devPortalLogMonitors,
|
|
10
|
+
} from './monitors';
|
|
11
|
+
import {settings} from './config';
|
|
12
|
+
import {MonitorDefinition} from './types';
|
|
13
|
+
|
|
14
|
+
// Echo the resolved stack configuration into the Pulumi engine log so every
// preview/update shows which environment and team it is operating on.
const configSummary: [string, unknown][] = [
  ['Environment', settings.environment],
  ['Team', settings.defaultTeam],
  ['Tag Filter', settings.tagFilter],
];
for (const [label, value] of configSummary) {
  pulumi.log.info(`${label}: ${value}`);
}

// Each stack deploys monitors for exactly one team, taken from config.
const team = settings.defaultTeam;

// Monitors flagged `prodOnly` are deployed only when this is true.
const isProd = settings.environment === 'prod';
|
|
24
|
+
|
|
25
|
+
// A batch of monitor definitions plus the category name under which their
// created IDs are grouped in the stack outputs.
type MonitorGroup = {monitors: MonitorDefinition[]; category: string};

// Monitor groups owned by the dev-portal team, in deployment order.
const devPortalMonitorGroups: MonitorGroup[] = [
  {category: 'latency', monitors: devPortalLatencyMonitors},
  {category: 'errors', monitors: devPortalErrorMonitors},
  {category: 'availability', monitors: devPortalAvailabilityMonitors},
  {category: 'auth', monitors: devPortalAuthMonitors},
  {category: 'gateway', monitors: devPortalGatewayMonitors},
  {category: 'logs', monitors: devPortalLogMonitors},
];

// Registry of monitor groups keyed by team name (the value of
// `settings.defaultTeam` selects one entry).
const teamMonitors: Record<string, MonitorGroup[]> = {
  'dev-portal': devPortalMonitorGroups,
};
|
|
39
|
+
|
|
40
|
+
// Resolve the monitor groups for the configured team.
//
// The guard is an own-property check rather than a truthiness test on the
// lookup result: a team value such as "constructor" or "toString" would
// otherwise resolve to an inherited Object.prototype member, pass a truthiness
// guard, and only fail later with an unrelated "not iterable" error instead of
// the "Unknown team" message below.
if (!Object.prototype.hasOwnProperty.call(teamMonitors, team)) {
  throw new Error(
    `Unknown team: ${team}. Valid teams: ${Object.keys(teamMonitors).join(', ')}`
  );
}
const currentTeamMonitors = teamMonitors[team];
|
|
47
|
+
|
|
48
|
+
// Monitor resource IDs, keyed first by category and then by the key that
// `createMonitors` assigned to each created monitor.
const createdMonitors: Record<
  string,
  Record<string, pulumi.Output<string>>
> = {};

// Running count of monitors created across all categories.
let totalCount = 0;

currentTeamMonitors.forEach(group => {
  // Non-prod stacks skip definitions flagged `prodOnly`; prod deploys all.
  let deployable = group.monitors;
  if (!isProd) {
    deployable = group.monitors.filter(definition => !definition.prodOnly);
  }

  const resources = createMonitors(deployable);

  // Keep only each resource's `id` output for export.
  const idEntries = Object.entries(resources).map(([key, resource]) => [
    key,
    resource.id,
  ]);
  createdMonitors[group.category] = Object.fromEntries(idEntries);

  totalCount += Object.keys(resources).length;
});
|
|
64
|
+
|
|
65
|
+
// Stack output: created monitor IDs grouped by category.
export const monitorIds = createdMonitors;

// [category, number of monitors created in it] pairs for the summary output.
const categoryCounts = Object.entries(createdMonitors).map(
  ([categoryName, ids]): [string, number] => [
    categoryName,
    Object.keys(ids).length,
  ]
);

// Stack output: which team/environment this stack deployed and how many
// monitors it created, in total and per category.
export const summary = {
  team,
  environment: settings.environment,
  totalMonitors: totalCount,
  byCategory: Object.fromEntries(categoryCounts),
};
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
import {MonitorDefinition} from '../../types';
|
|
2
|
+
|
|
3
|
+
/**
 * Builds a dev-portal auth log alert from its monitor-specific fields.
 *
 * Every auth monitor shares the same ownership, links and delivery settings:
 * priority 3, paging disabled, prod-only, and `show_no_data` when the log
 * query returns nothing. A new `dashboards` array is created per call so the
 * definitions never share a mutable array.
 */
const authLogAlert = (
  spec: Pick<
    MonitorDefinition,
    'id' | 'name' | 'query' | 'alertBody' | 'recoveryBody' | 'thresholds'
  >
): MonitorDefinition => ({
  ...spec,
  type: 'log alert',
  team: 'dev-portal',
  priority: 3,
  logQuery: 'service:dev-portal',
  runbookUrl: 'https://www.notion.so/lx/dev-portal-runbook',
  readmeUrl: 'https://github.com/Lx/universe/tree/main/apps/dev-portal',
  dashboards: [],
  enablePaging: false,
  onMissingData: 'show_no_data',
  prodOnly: true,
});

/** Log-based monitors for authentication and session problems on dev-portal. */
export const devPortalAuthMonitors: MonitorDefinition[] = [
  // More than 20 auth/session failure log lines in 15 minutes.
  authLogAlert({
    id: 'dev-portal_auth_failure_rate',
    name: 'Auth Failure Rate on dev-portal',
    query:
      'logs("service:dev-portal (message:*auth*failed OR message:*session*failed)").index("*").rollup("count").last("15m") > 20',
    alertBody:
      'Auth failure log count for dev-portal has exceeded 20 in the last 15 minutes.',
    recoveryBody: 'Auth failure rate for dev-portal has recovered.',
    thresholds: {critical: 20},
  }),
  // More than 10 log lines ending in "conflict" in 15 minutes.
  authLogAlert({
    id: 'dev-portal_session_conflict_spike',
    name: 'Session Conflict Spike on dev-portal',
    query:
      'logs("service:dev-portal message:*conflict").index("*").rollup("count").last("15m") > 10',
    alertBody:
      'Session conflict log count for dev-portal has exceeded 10 in the last 15 minutes.',
    recoveryBody: 'Session conflict rate for dev-portal has recovered.',
    thresholds: {critical: 10},
  }),
];
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
import {MonitorDefinition} from '../../types';
|
|
2
|
+
import {settings} from '../../config';
|
|
3
|
+
|
|
4
|
+
// ALB metrics use name:dev-portal-lb tag, scoped by environment from Pulumi config
const albTagFilter = `name:dev-portal-lb,unienv:${settings.environment}`;

/**
 * Availability monitors for dev-portal, evaluated on AWS Application Load
 * Balancer metrics for the load balancer selected by `albTagFilter`.
 *
 * Both monitors also notify when the query has returned no data for 15
 * minutes (`notifyNoData` / `noDataTimeframe`).
 */
export const devPortalAvailabilityMonitors: MonitorDefinition[] = [
  {
    // Fires when the ALB request count summed over the last 10 minutes is 0.
    id: 'dev-portal_availability_zero_traffic',
    name: 'Zero Traffic on dev-portal',
    type: 'query alert',
    query: `sum(last_10m):sum:aws.applicationelb.request_count{${albTagFilter}}.as_count() == 0`,
    alertBody: 'dev-portal is receiving zero traffic. Service may be down.',
    recoveryBody: 'dev-portal is receiving traffic again.',
    team: 'dev-portal',
    priority: 1,
    thresholds: {critical: 0},
    noDataTimeframe: 15,
    notifyNoData: true,
    logQuery: 'service:dev-portal',
    runbookUrl: 'https://www.notion.so/lx/dev-portal-runbook',
    readmeUrl:
      'https://github.com/Lx/universe/tree/main/apps/dev-portal',
    dashboards: [],
  },
  {
    // Success rate = (1 - target 5xx / requests) * 100 over the last 5
    // minutes; critical below 99%, warning below 99.5%.
    id: 'dev-portal_availability_success_rate',
    name: 'Success Rate on dev-portal',
    type: 'query alert',
    // The 5xx series is wrapped in default_zero(): CloudWatch only reports
    // HTTPCode_Target_5XX_Count when it is non-zero, so without the wrapper a
    // healthy window has no numerator, the whole expression evaluates to
    // "no data", and notifyNoData would alert precisely when nothing is wrong.
    // The request_count denominator is left unwrapped so that a genuine
    // absence of traffic still surfaces as missing data.
    query: `sum(last_5m):(1 - default_zero(sum:aws.applicationelb.httpcode_target_5xx{${albTagFilter}}.as_count()) / sum:aws.applicationelb.request_count{${albTagFilter}}.as_count()) * 100 < 99`,
    alertBody: 'Success rate for dev-portal has dropped below 99%.',
    recoveryBody: 'Success rate for dev-portal has recovered above 99%.',
    team: 'dev-portal',
    priority: 2,
    thresholds: {critical: 99, warning: 99.5},
    logQuery: 'service:dev-portal status:error',
    runbookUrl: 'https://www.notion.so/lx/dev-portal-runbook',
    readmeUrl:
      'https://github.com/Lx/universe/tree/main/apps/dev-portal',
    dashboards: [],
    notifyNoData: true,
    noDataTimeframe: 15,
  },
];
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
import {MonitorDefinition} from '../../types';
|
|
2
|
+
import {settings} from '../../config';
|
|
3
|
+
|
|
4
|
+
// ALB metrics use name:dev-portal-lb tag, scoped by environment from Pulumi config
const albTagFilter = `name:dev-portal-lb,unienv:${settings.environment}`;

/**
 * Error monitors for dev-portal, evaluated on AWS Application Load Balancer
 * target 5xx metrics for the load balancer selected by `albTagFilter`.
 */
export const devPortalErrorMonitors: MonitorDefinition[] = [
  {
    // 5xx rate = target 5xx / requests * 100 over the last 5 minutes;
    // critical above 5%, warning above 2%.
    id: 'dev-portal_error_5xx',
    name: '5xx Error Rate on dev-portal',
    type: 'query alert',
    // The 5xx series is wrapped in default_zero(): CloudWatch only reports
    // HTTPCode_Target_5XX_Count when it is non-zero, so without the wrapper a
    // window with no errors has no numerator, the ratio evaluates to
    // "no data", and notifyNoData would alert while the service is healthy.
    // The request_count denominator is left unwrapped so that a genuine
    // absence of traffic still surfaces as missing data.
    query: `sum(last_5m):default_zero(sum:aws.applicationelb.httpcode_target_5xx{${albTagFilter}}.as_count()) / sum:aws.applicationelb.request_count{${albTagFilter}}.as_count() * 100 > 5`,
    alertBody: '5xx error rate for dev-portal has exceeded 5%.',
    recoveryBody: '5xx error rate for dev-portal has recovered.',
    team: 'dev-portal',
    priority: 2,
    thresholds: {critical: 5, warning: 2},
    logQuery: 'service:dev-portal status:error',
    runbookUrl: 'https://www.notion.so/lx/dev-portal-runbook',
    readmeUrl:
      'https://github.com/Lx/universe/tree/main/apps/dev-portal',
    dashboards: [],
    notifyNoData: true,
    noDataTimeframe: 15,
  },
  {
    // Anomaly detection on the 5xx count ('agile' algorithm, 3 deviations,
    // upward only). Missing points are already counted as zero through
    // count_default_zero, and paging is disabled for this monitor.
    id: 'dev-portal_error_anomaly',
    name: 'Error Count Anomaly on dev-portal',
    type: 'query alert',
    query: `avg(last_4h):anomalies(sum:aws.applicationelb.httpcode_target_5xx{${albTagFilter}}.as_count(), 'agile', 3, direction='above', interval=300, alert_window='last_30m', count_default_zero='true') >= 1`,
    alertBody: 'Anomalous spike in error count detected for dev-portal.',
    recoveryBody: 'Error count anomaly for dev-portal has recovered.',
    team: 'dev-portal',
    priority: 3,
    thresholds: {critical: 1, warning: 0.8},
    // Trigger window matches the alert_window used in the query above.
    thresholdWindows: {
      triggerWindow: 'last_30m',
      recoveryWindow: 'last_30m',
    },
    logQuery: 'service:dev-portal status:error',
    runbookUrl: 'https://www.notion.so/lx/dev-portal-runbook',
    readmeUrl:
      'https://github.com/Lx/universe/tree/main/apps/dev-portal',
    dashboards: [],
    enablePaging: false,
  },
];
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
import {MonitorDefinition} from '../../types';
|
|
2
|
+
|
|
3
|
+
/**
 * Builds a dev-portal gateway log alert from its monitor-specific fields.
 *
 * Shared settings (ownership, links, prod-only, `show_no_data` on missing
 * data) are filled in here. `enablePaging` is copied only when the caller
 * supplies it, so a monitor that omits it carries no `enablePaging` key at
 * all. A new `dashboards` array is created per call.
 */
const gatewayLogAlert = (
  spec: Pick<
    MonitorDefinition,
    | 'id'
    | 'name'
    | 'query'
    | 'alertBody'
    | 'recoveryBody'
    | 'priority'
    | 'thresholds'
    | 'enablePaging'
  >
): MonitorDefinition => ({
  type: 'log alert',
  team: 'dev-portal',
  logQuery: 'service:dev-portal',
  runbookUrl: 'https://www.notion.so/lx/dev-portal-runbook',
  readmeUrl: 'https://github.com/Lx/universe/tree/main/apps/dev-portal',
  dashboards: [],
  onMissingData: 'show_no_data',
  prodOnly: true,
  ...spec,
});

/** Log-based monitors for the dev-portal gateway proxy. */
export const devPortalGatewayMonitors: MonitorDefinition[] = [
  // More than 5 gateway.proxy.error log lines in 10 minutes.
  gatewayLogAlert({
    id: 'dev-portal_gateway_proxy_errors',
    name: 'Gateway Proxy Errors on dev-portal',
    query:
      'logs("service:dev-portal message:gateway.proxy.error").index("*").rollup("count").last("10m") > 5',
    alertBody:
      'Gateway proxy errors on dev-portal have exceeded threshold. The app is logging gateway.proxy.error at error level — check url, method, and duration_ms fields for details.',
    recoveryBody: 'Gateway proxy errors on dev-portal have recovered.',
    priority: 2,
    thresholds: {critical: 5},
  }),
  // More than 10 completed proxy requests slower than 3000 ms in 10 minutes.
  gatewayLogAlert({
    id: 'dev-portal_gateway_proxy_high_latency',
    name: 'Gateway Proxy High Latency on dev-portal',
    query:
      'logs("service:dev-portal message:gateway.proxy.complete @duration_ms:>3000").index("*").rollup("count").last("10m") > 10',
    alertBody:
      'Gateway proxy latency on dev-portal is consistently high. More than 10 requests exceeded 3000ms duration in the last 10 minutes.',
    recoveryBody: 'Gateway proxy latency on dev-portal has recovered.',
    priority: 3,
    thresholds: {critical: 10},
    enablePaging: false,
  }),
];
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
export {devPortalLatencyMonitors} from './latency';
|
|
2
|
+
export {devPortalErrorMonitors} from './errors';
|
|
3
|
+
export {devPortalAvailabilityMonitors} from './availability';
|
|
4
|
+
export {devPortalAuthMonitors} from './auth';
|
|
5
|
+
export {devPortalGatewayMonitors} from './gateway';
|
|
6
|
+
export {devPortalLogMonitors} from './logs';
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
import {MonitorDefinition} from '../../types';
|
|
2
|
+
import {settings} from '../../config';
|
|
3
|
+
|
|
4
|
+
// Tag filter selecting the dev-portal load balancer's ALB metrics for the
// environment named in the Pulumi config.
const albTagFilter = `name:dev-portal-lb,unienv:${settings.environment}`;

/**
 * Builds the latency monitor for one ALB target-response-time percentile.
 *
 * The monitor averages the percentile metric over the last 5 minutes and
 * compares it with `critical` (seconds), which is also embedded in the query
 * and in the alert text. It additionally notifies after 15 minutes without
 * data.
 */
const latencyMonitor = (spec: {
  percentile: 'p95' | 'p99';
  priority: MonitorDefinition['priority'];
  critical: number;
  warning: number;
}): MonitorDefinition => {
  const {percentile, priority, critical, warning} = spec;
  const label = percentile.toUpperCase();
  return {
    id: `dev-portal_latency_${percentile}`,
    name: `${label} Latency on dev-portal`,
    type: 'query alert',
    query: `avg(last_5m):avg:aws.applicationelb.target_response_time.${percentile}{${albTagFilter}} > ${critical}`,
    alertBody: `${label} latency for dev-portal has exceeded ${critical} seconds.`,
    recoveryBody: `${label} latency for dev-portal has recovered.`,
    team: 'dev-portal',
    priority,
    thresholds: {critical, warning},
    logQuery: 'service:dev-portal',
    runbookUrl: 'https://www.notion.so/lx/dev-portal-runbook',
    readmeUrl: 'https://github.com/Lx/universe/tree/main/apps/dev-portal',
    dashboards: [],
    notifyNoData: true,
    noDataTimeframe: 15,
  };
};

/** Latency monitors for dev-portal: P95 above 2 s and P99 above 5 s. */
export const devPortalLatencyMonitors: MonitorDefinition[] = [
  latencyMonitor({percentile: 'p95', priority: 3, critical: 2, warning: 1.5}),
  latencyMonitor({percentile: 'p99', priority: 2, critical: 5, warning: 3}),
];
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import {MonitorDefinition} from '../../types';
|
|
2
|
+
|
|
3
|
+
// Log search that selects dev-portal's error-level lines; used both in the
// monitor query and as the log link filter.
const errorLogFilter = 'service:dev-portal level:error';
// Alert when more than this many matching lines arrive within the window.
const errorSpikeThreshold = 50;
const errorSpikeWindowMinutes = 15;

/** Log-volume monitors for dev-portal. */
export const devPortalLogMonitors: MonitorDefinition[] = [
  {
    id: 'dev-portal_log_error_spike',
    name: 'Error Log Spike on dev-portal',
    type: 'log alert',
    query: `logs("${errorLogFilter}").index("*").rollup("count").last("${errorSpikeWindowMinutes}m") > ${errorSpikeThreshold}`,
    alertBody: `Error-level log volume for dev-portal has exceeded ${errorSpikeThreshold} in the last ${errorSpikeWindowMinutes} minutes.`,
    recoveryBody:
      'Error-level log volume for dev-portal has recovered to normal levels.',
    team: 'dev-portal',
    priority: 2,
    thresholds: {critical: errorSpikeThreshold},
    logQuery: errorLogFilter,
    runbookUrl: 'https://www.notion.so/lx/dev-portal-runbook',
    readmeUrl: 'https://github.com/Lx/universe/tree/main/apps/dev-portal',
    dashboards: [],
    // Prod-only, no paging, and shown as "no data" when the query is empty.
    enablePaging: false,
    onMissingData: 'show_no_data',
    prodOnly: true,
  },
];
|
package/package.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@l.x/datadog-cloud",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Datadog monitors and dashboards for Universe services",
|
|
5
|
+
"main": "index.ts",
|
|
6
|
+
"dependencies": {
|
|
7
|
+
"@pulumi/datadog": "4.60.0",
|
|
8
|
+
"@pulumi/pulumi": "3.207.0"
|
|
9
|
+
},
|
|
10
|
+
"devDependencies": {
|
|
11
|
+
"@types/node": "22.13.1",
|
|
12
|
+
"typescript": "5.8.3"
|
|
13
|
+
},
|
|
14
|
+
"scripts": {
|
|
15
|
+
"preview": "pulumi preview",
|
|
16
|
+
"up": "pulumi up",
|
|
17
|
+
"destroy": "pulumi destroy",
|
|
18
|
+
"refresh": "pulumi refresh"
|
|
19
|
+
}
|
|
20
|
+
}
|
package/tsconfig.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
{
|
|
2
|
+
"compilerOptions": {
|
|
3
|
+
"strict": true,
|
|
4
|
+
"target": "ES2020",
|
|
5
|
+
"module": "commonjs",
|
|
6
|
+
"lib": ["ES2020"],
|
|
7
|
+
"moduleResolution": "node",
|
|
8
|
+
"outDir": "./bin",
|
|
9
|
+
"rootDir": ".",
|
|
10
|
+
"esModuleInterop": true,
|
|
11
|
+
"skipLibCheck": true,
|
|
12
|
+
"forceConsistentCasingInFileNames": true
|
|
13
|
+
},
|
|
14
|
+
"include": ["./**/*.ts"],
|
|
15
|
+
"exclude": ["node_modules", "bin", "dashboards"]
|
|
16
|
+
}
|
package/types.ts
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
import * as datadog from '@pulumi/datadog';
|
|
2
|
+
|
|
3
|
+
/**
 * A named pointer to a Datadog dashboard, attached to a monitor definition
 * as investigation material.
 */
export interface DashboardLink {
  /** Human-readable label for the dashboard. */
  name: string;

  /** Dashboard URL path within Datadog, e.g. `/dashboard/abc-123`. */
  url: string;
}
|
|
12
|
+
|
|
13
|
+
/**
 * Declarative description of a single Datadog monitor.
 *
 * This is the simplified input shape used by the per-team monitor definition
 * files; everything except the fields marked optional must be supplied.
 */
export interface MonitorDefinition {
  /** Unique monitor identifier; used in the resource name and in tags. */
  id: string;

  /** Display name. May contain template variables such as {{service.name}}. */
  name: string;

  /** Datadog monitor type. */
  type:
    | 'query alert'
    | 'event-v2 alert'
    | 'composite'
    | 'log alert'
    | 'metric alert'
    | 'ci-pipelines alert';

  /** The Datadog monitor query, including its comparison and threshold. */
  query: string;

  /** Body of the alert message; it is wrapped with a standardized links section. */
  alertBody: string;

  /** Body of the recovery message, if one should be sent. */
  recoveryBody?: string;

  /** Owning team; used to look up the EP from ESC. */
  team: string;

  /** Monitor priority from 1 (highest) to 5 (lowest). */
  priority: 1 | 2 | 3 | 4 | 5;

  /**
   * Log search filter for the affected resource, used to build the log links
   * in the alert body.
   * Examples:
   * - "service:{{service.name}}"
   * - "host:{{host.name}}"
   */
  logQuery: string;

  /**
   * URL of the Notion runbook for this monitor/service, which should hold the
   * troubleshooting steps and escalation procedures.
   */
  runbookUrl: string;

  /** URL of the service's README in the GitHub repository. */
  readmeUrl: string;

  /** Datadog dashboards that help investigate this alert (may be empty). */
  dashboards: DashboardLink[];

  /** Alerting thresholds and their optional recovery counterparts. */
  thresholds?: {
    critical?: number;
    warning?: number;
    criticalRecovery?: number;
    warningRecovery?: number;
  };

  /** Trigger and recovery windows, for anomaly monitors. */
  thresholdWindows?: {
    triggerWindow?: string;
    recoveryWindow?: string;
  };

  /** Minutes without data before a no-data notification (0 = disabled). */
  noDataTimeframe?: number;

  /** Whether to notify when data stops reporting. */
  notifyNoData?: boolean;

  /** Minutes between repeat notifications (0 = disabled). */
  renotifyInterval?: number;

  /** Seconds to wait before a newly seen group is evaluated. */
  newGroupDelay?: number;

  /** Seconds by which metric evaluation is delayed. */
  evaluationDelay?: number;

  /** Extra tags to add on top of the standard ones. */
  additionalTags?: string[];

  /** Whether to include the incident.io webhook (default: true). */
  includeIncidentWebhook?: boolean;

  /** Whether to page via the escalation policy tag (default: true). */
  enablePaging?: boolean;

  /**
   * Deploy this monitor on prod stacks only. Intended for queries that are not
   * scoped to an environment and would otherwise be duplicated per stack.
   */
  prodOnly?: boolean;

  /** How the monitor behaves when an evaluation returns no data. */
  onMissingData?:
    | 'default'
    | 'show_no_data'
    | 'resolve'
    | 'show_and_notify_no_data';
}
|
|
121
|
+
|
|
122
|
+
/**
 * Baseline Datadog monitor arguments shared by all monitors.
 *
 * NOTE(review): the Datadog provider documents `onMissingData` as mutually
 * exclusive with `notifyNoData` / `noDataTimeframe`, and several definitions
 * set the latter two — confirm the monitor factory does not apply this
 * default to those monitors.
 */
export const defaultMonitorOptions: Partial<datadog.MonitorArgs> = {
  // No notification when the monitor itself is modified.
  notifyAudit: false,
  // Triggering tags are not inserted into the notification title.
  includeTags: false,
  // Leave missing-data handling at Datadog's default unless overridden.
  onMissingData: 'default',
};
|