@l.x/datadog-cloud 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,217 @@
1
+ # Dev Portal Instrumentation Report
2
+
3
+ > Updated: 2026-02-25
4
+
5
+ ## Executive Summary
6
+
7
+ The dev-portal has **strong application-level logging** in code but **zero logs flowing to Datadog** and **no APM traces**. The structured JSON logging infrastructure exists in the app (wide events, DD correlation fields, PII scrubbing) but the DD Agent sidecar is not configured to forward container logs. ALB metrics (request count, latency percentiles, HTTP status codes) are available via the AWS integration and are now used for the 6 infrastructure monitors.
8
+
9
+ ## Key Finding: Log Collection Gap
10
+
11
+ **Container stdout logs are NOT reaching Datadog.** Investigation via the Datadog API confirmed:
12
+
13
+ - `service:dev-portal` returns **zero logs** across all indexes (searched 7 days)
14
+ - `ecs_cluster:dev-portal-ecs` returns **zero logs**
15
+ - Host tags confirm **DD Agent is running** on ECS instances (apps: agent, docker, container, ntp)
16
+ - The DD Agent trace agent is running but receiving **zero spans**
17
+ - Container/docker metrics are reported for the hosts
18
+
19
+ **Probable root cause:** The DD Agent sidecar likely needs log collection enabled for the dev-portal container. The app writes structured JSON to stdout, but the agent isn't configured to collect container logs (no `DD_LOGS_ENABLED=true` or `com.datadoghq.ad.logs` Docker labels found).
20
+
21
+ ## Available Data Sources
22
+
23
+ ### What EXISTS in Datadog
24
+
25
+ | Source | Data Available | Tag Filter |
26
+ |---|---|---|
27
+ | ALB metrics | request_count, target_response_time (p50/p90/p95/p99), httpcode_target_2xx/3xx/4xx/5xx, healthy_host_count | `name:dev-portal-lb,unienv:prod` |
28
+ | EC2/ECS host metrics | cpu, memory, network, disk | `ecs_cluster:dev-portal-ecs` |
29
+ | Container metrics | cpu, memory, network, uptime, thread count | `host:i-*` (dev-portal hosts) |
30
+ | DD Agent telemetry | trace_agent.heartbeat, stats_writer.*, trace_writer.* | `host:i-*` |
31
+
32
+ ### What DOES NOT exist in Datadog
33
+
34
+ | Source | Why |
35
+ |---|---|
36
+ | APM traces (`trace.web.request`) | `dd-trace` incompatible with Bun runtime, no OTEL configured |
37
+ | Application logs (`service:dev-portal`) | DD Agent log collection not enabled for container |
38
+ | RUM data | No browser SDK |
39
+ | Synthetic checks | Not configured |
40
+
41
+ ## Current Instrumentation Inventory
42
+
43
+ ### Server-Side (Hono + Bun)
44
+
45
+ | Signal | Instrumented? | Where | Reaching DD? |
46
+ |---|---|---|---|
47
+ | Request lifecycle (method, path, status, duration) | **Yes** | `server.ts` wide event → `request.complete` log | **No** — logs not collected |
48
+ | tRPC procedure spans (name, duration, outcome) | **Yes** | `wideEvent.addProcedure()` in tRPC middleware | **No** — logs not collected |
49
+ | Entry Gateway proxy (url, status, duration) | **Yes** | `proxy.ts` → `gateway.proxy.complete` / `gateway.proxy.error` | **No** — logs not collected |
50
+ | Auth flow events (initiate, verify, conflict, mismatch) | **Yes** | `server.ts` tRPC routers → structured log messages | **No** — logs not collected |
51
+ | Error logging with stack traces | **Yes** | `structuredJsonLogger.ts` → `level:error` | **No** — logs not collected |
52
+ | PII scrubbing | **Yes** | `scrub.ts` — redacts tokens, emails, secrets | N/A |
53
+ | DD trace/span correlation IDs | **Yes** | `structuredJsonLogger.ts` → `dd.trace_id`, `dd.span_id`, `dd.service` | **No** — logs not collected |
54
+ | Traffic classification (human/crawler/ai-tool) | **Yes** | `ai-traffic.ts` → `traffic_type`, `traffic_agent` on wide event | **No** — logs not collected |
55
+ | Amplitude analytics (server-side) | **Yes** | `service.ts` → Amplitude Node SDK | N/A (separate system) |
56
+
57
+ ### Client-Side (React Router v7 SSR)
58
+
59
+ | Signal | Instrumented? | Where | Reaching DD? |
60
+ |---|---|---|---|
61
+ | Chunk load error recovery | **Yes** | `entry.client.tsx` → auto-reload on chunk failure | No |
62
+ | Hydration errors | **Yes** | `entry.client.tsx` → `consoleLoggerFactory` | No |
63
+ | Route error tracking | **Yes** | `root.tsx` ErrorBoundary → `trackError()` via tRPC | No |
64
+ | Page views | **Yes** | `usePageView()` hook → Amplitude | N/A |
65
+ | Core Web Vitals | **No** | — | — |
66
+ | JS error tracking | **No** | — | — |
67
+
68
+ ### Infrastructure
69
+
70
+ | Signal | Available? | Where | Reaching DD? |
71
+ |---|---|---|---|
72
+ | ALB request metrics | **Yes** | AWS integration → `aws.applicationelb.*` | **Yes** |
73
+ | ALB latency percentiles | **Yes** | AWS integration → `target_response_time.p95/p99` | **Yes** |
74
+ | ALB HTTP status codes | **Yes** | AWS integration → `httpcode_target_2xx/5xx` | **Yes** |
75
+ | ECS host health | **Yes** | DD Agent → `system.*` | **Yes** |
76
+ | Container metrics | **Yes** | DD Agent → `container.*`, `docker.*` | **Yes** |
77
+ | ECS task health | **Yes** | Docker HEALTHCHECK → `/health` every 30s | **Yes** |
78
+ | Container stdout logs | **Yes** in container | Structured JSON → stdout | **No** — not forwarded to DD |
79
+ | Datadog APM traces | **No** | `dd-trace` not compatible with Bun runtime | No |
80
+ | Datadog RUM | **No** | No browser SDK | No |
81
+
82
+ ## Monitor Coverage Matrix
83
+
84
+ ### Infrastructure Monitors (6 — ALB-based)
85
+
86
+ | Monitor | Type | Signal Source | Query | Status |
87
+ |---|---|---|---|---|
88
+ | P95 Latency | query alert | `aws.applicationelb.target_response_time.p95` | `avg(last_5m) > 2s` | **Active** |
89
+ | P99 Latency | query alert | `aws.applicationelb.target_response_time.p99` | `avg(last_5m) > 5s` | **Active** |
90
+ | 5xx Error Rate | query alert | `httpcode_target_5xx / request_count` | `sum(last_5m) > 5%` | **Active** |
91
+ | Error Count Anomaly | query alert | `httpcode_target_5xx` anomaly | agile, 3 deviations | **Active** |
92
+ | Zero Traffic | query alert | `aws.applicationelb.request_count` | `sum(last_10m) == 0` | **Active** |
93
+ | Success Rate | query alert | `1 - (5xx / request_count)` | `< 99%` | **Active** |
94
+
95
+ ### Application Monitors (5 — Log-based, BLOCKED)
96
+
97
+ | Monitor | Type | Signal Source | Query | Status |
98
+ |---|---|---|---|---|
99
+ | Auth Failure Rate | log alert | `service:dev-portal` logs | `auth*failed OR session*failed > 20/15m` | **No Data** — logs not collected |
100
+ | Session Conflict Spike | log alert | `service:dev-portal` logs | `message:*conflict > 10/15m` | **No Data** — logs not collected |
101
+ | Gateway Proxy Errors | log alert | `service:dev-portal` logs | `gateway.proxy.error > 5/10m` | **No Data** — logs not collected |
102
+ | Gateway Proxy High Latency | log alert | `service:dev-portal` logs | `duration_ms > 3000, count > 10/10m` | **No Data** — logs not collected |
103
+ | Error Log Spike | log alert | `service:dev-portal` logs | `level:error > 50/15m` | **No Data** — logs not collected |
104
+
105
+ All 5 log monitors have `onMissingData: show_no_data` so they correctly report "No Data" instead of false "OK".
106
+
107
+ ## Gap Analysis
108
+
109
+ ### Critical Gaps (P0)
110
+
111
+ #### 1. Container Log Collection Not Enabled
112
+ **Problem:** The DD Agent sidecar runs on dev-portal ECS hosts but is not collecting container logs. All structured JSON logging in the app (wide events, auth events, gateway events, errors) writes to stdout but never reaches Datadog.
113
+
114
+ **Impact:** 5 log-based monitors are non-functional. No application-level visibility in Datadog.
115
+
116
+ **Recommendation:** Enable log collection on the DD Agent:
117
+ - Set `DD_LOGS_ENABLED=true` on the DD Agent sidecar container
118
+ - Add `DD_LOGS_CONFIG_CONTAINER_COLLECT_ALL=true` or configure Docker labels (`com.datadoghq.ad.logs`) on the dev-portal container
119
+ - Ensure the DD Agent has `logs_enabled: true` in its config
120
+ - The app already writes DD-compatible structured JSON with correlation fields — no app changes needed
121
+
122
+ #### 2. Dev-portal Team EP is Placeholder
123
+ **Problem:** The `dev-portal` entry in `shared-infra/incident` ESC uses SRE's EP (`01K4XB5BT0SZY16A4J6Z20WXD8`) and Slack (`@slack-sre-alerts`) as placeholders.
124
+
125
+ **Recommendation:** Create a dedicated dev-portal EP in incident.io and Slack channel, then update the ESC.
126
+
127
+ ### High Gaps (P1)
128
+
129
+ #### 3. Runtime Uncertainty: Bun vs Node.js
130
+ **Problem:** The Dockerfile shows `CMD ["bun", "run", ...]` and `--target=bun`, but the actual ECS task definition may differ. If the production runtime is Node.js, `dd-trace` APM could be enabled.
131
+
132
+ **Recommendation:** Verify the actual production runtime in the ECS task definition. If Node.js, enable `dd-trace` for full APM. If Bun, investigate OTEL support.
133
+
134
+ #### 4. No Client-Side Error Visibility
135
+ **Problem:** JS errors, hydration failures, and chunk load errors are only logged to browser console.
136
+
137
+ **Recommendation (Phase 2):** Add Datadog RUM SDK to `entry.client.tsx`.
138
+
139
+ #### 5. No Synthetic Monitoring
140
+ **Problem:** No automated uptime checks beyond Docker HEALTHCHECK (container-local only).
141
+
142
+ **Recommendation (Phase 2):** Create Datadog Synthetics for `/health`, `/docs`, `/dashboard`.
143
+
144
+ ### Medium Gaps (P2)
145
+
146
+ #### 6. API Key Scope Limits Debugging
147
+ **Problem:** The DD app key in `shared-infra/datadog-2026` ESC is scoped and cannot query the metrics API (returns 404). This limits ability to validate monitor queries programmatically.
148
+
149
+ **Recommendation:** Either request broader app key scope, or validate queries through the Datadog UI instead of API.
150
+
151
+ #### 7. tRPC Procedure-Level Metrics Not Extractable
152
+ **Problem:** The wide event `procedures` array with per-procedure timing is nested JSON — hard to create per-procedure metrics.
153
+
154
+ **Recommendation:** Log individual procedure spans as separate log lines (once log collection is enabled).
155
+
156
+ #### 8. AI Traffic Not in Datadog
157
+ **Problem:** AI traffic classification tracked in Amplitude but not visible in DD dashboards.
158
+
159
+ **Recommendation:** Once logs flow, the `traffic_type` and `traffic_agent` fields are queryable in DD Log Analytics.
160
+
161
+ ### Low Gaps (P3)
162
+
163
+ #### 9. Amplitude Parallel Universe
164
+ **Problem:** Full Amplitude analytics integration (auth, page views, API keys, AI traffic, feedback) creates split-brain observability.
165
+
166
+ **Recommendation:** Long-term, decide if Amplitude events should mirror to DD custom metrics.
167
+
168
+ #### 10. No Deployment Tracking
169
+ **Problem:** No deployment events sent to Datadog.
170
+
171
+ **Recommendation:** Add `datadog.ServiceDefinition` and emit deployment events from CI/CD.
172
+
173
+ ## Recommended Phased Roadmap
174
+
175
+ ### Phase 1: Log Collection (CRITICAL — Next Sprint)
176
+ - [ ] Enable DD Agent log collection for dev-portal ECS containers
177
+ - [ ] Verify logs appear in Datadog with `service:dev-portal`
178
+ - [ ] Confirm 5 log-based monitors transition from "No Data" to active
179
+ - [x] Register `dev-portal` team in `shared-infra/incident` ESC (placeholder)
180
+ - [ ] Replace placeholder EP/Slack with dev-portal team's actual values
181
+
182
+ ### Phase 2: APM Resolution
183
+ - [ ] Verify Bun vs Node.js runtime in production ECS task definition
184
+ - [ ] If Node.js: enable `dd-trace` for full APM
185
+ - [ ] If Bun: investigate OTEL/OpenTelemetry support in Bun
186
+ - [ ] Add dashboard rows for auth, gateway, infra, AI traffic
187
+
188
+ ### Phase 3: Client-Side (Future)
189
+ - [ ] Add Datadog RUM SDK
190
+ - [ ] Create synthetic monitors
191
+ - [ ] Set up deployment tracking
192
+
193
+ ### Phase 4: Advanced (Future)
194
+ - [ ] SLOs based on error budget
195
+ - [ ] Anomaly detection on tRPC procedures
196
+ - [ ] Composite monitors for cascading failures
197
+ - [ ] Integration with Amplitude for unified dashboards
198
+
199
+ ## Monitor Inventory (Current State)
200
+
201
+ | # | Monitor | Category | Type | Signal | Priority | Status |
202
+ |---|---|---|---|---|---|---|
203
+ | 1 | P95 Latency | latency | query alert | ALB `target_response_time.p95` | P3 | Active |
204
+ | 2 | P99 Latency | latency | query alert | ALB `target_response_time.p99` | P2 | Active |
205
+ | 3 | 5xx Error Rate | errors | query alert | ALB `httpcode_target_5xx` | P2 | Active |
206
+ | 4 | Error Count Anomaly | errors | query alert | ALB `httpcode_target_5xx` | P3 | Active |
207
+ | 5 | Zero Traffic | availability | query alert | ALB `request_count` | P1 | Active |
208
+ | 6 | Success Rate | availability | query alert | ALB `httpcode_target_5xx/request_count` | P2 | Active |
209
+ | 7 | Auth Failure Rate | auth | log alert | `service:dev-portal` logs | P3 | No Data |
210
+ | 8 | Session Conflict Spike | auth | log alert | `service:dev-portal` logs | P3 | No Data |
211
+ | 9 | Gateway Proxy Errors | gateway | log alert | `service:dev-portal` logs | P2 | No Data |
212
+ | 10 | Gateway Proxy High Latency | gateway | log alert | `service:dev-portal` logs | P3 | No Data |
213
+ | 11 | Error Log Spike | logs | log alert | `service:dev-portal` logs | P2 | No Data |
214
+
215
+ **Total: 11 monitors** (6 ALB-based active, 5 log-based awaiting log collection)
216
+
217
+ **Paging/Slack: disabled** for initial rollout (`disablePaging: true`, `disableSlack: true` in ESC and stack config).
package/LICENSE ADDED
@@ -0,0 +1,122 @@
1
+ Lux Ecosystem License
2
+ Version 1.2, December 2025
3
+
4
+ Copyright (c) 2020-2025 Lux Industries Inc.
5
+ All rights reserved.
6
+
7
+ TECHNOLOGY PORTFOLIO - PATENT APPLICATIONS PLANNED
8
+ Contact: licensing@lux.network
9
+
10
+ ================================================================================
11
+ TERMS AND CONDITIONS
12
+ ================================================================================
13
+
14
+ 1. DEFINITIONS
15
+
16
+ "Lux Primary Network" means the official Lux blockchain with Network ID=1
17
+ and EVM Chain ID=96369.
18
+
19
+ "Authorized Network" means the Lux Primary Network, official testnets/devnets,
20
+ and any L1/L2/L3 chain descending from the Lux Primary Network.
21
+
22
+ "Descending Chain" means an L1/L2/L3 chain built on, anchored to, or deriving
23
+ security from the Lux Primary Network or its authorized testnets.
24
+
25
+ "Research Use" means non-commercial academic research, education, personal
26
+ study, or evaluation purposes.
27
+
28
+ "Commercial Use" means any use in connection with a product or service
29
+ offered for sale or fee, internal use by a for-profit entity, or any use
30
+ to generate revenue.
31
+
32
+ 2. GRANT OF LICENSE
33
+
34
+ Subject to these terms, Lux Industries Inc. grants you a non-exclusive,
35
+ royalty-free license to:
36
+
37
+ (a) Use for Research Use without restriction;
38
+
39
+ (b) Operate on the Lux Primary Network (Network ID=1, EVM Chain ID=96369);
40
+
41
+ (c) Operate on official Lux testnets and devnets;
42
+
43
+ (d) Operate L1/L2/L3 chains descending from the Lux Primary Network;
44
+
45
+ (e) Build applications within the Lux ecosystem;
46
+
47
+ (f) Contribute improvements back to the original repositories.
48
+
49
+ 3. RESTRICTIONS
50
+
51
+ Without a commercial license from Lux Industries Inc, you may NOT:
52
+
53
+ (a) Fork the Lux Network or any Lux software;
54
+
55
+ (b) Create competing networks not descending from Lux Primary Network;
56
+
57
+ (c) Use for Commercial Use outside the Lux ecosystem;
58
+
59
+ (d) Sublicense or transfer rights outside the Lux ecosystem;
60
+
61
+ (e) Use to create competing blockchain networks, exchanges, custody
62
+ services, or cryptographic systems outside the Lux ecosystem.
63
+
64
+ 4. NO FORKS POLICY
65
+
66
+ Lux Industries Inc maintains ZERO TOLERANCE for unauthorized forks.
67
+ Any fork or deployment on an unauthorized network constitutes:
68
+
69
+ (a) Breach of this license;
70
+ (b) Grounds for immediate legal action.
71
+
72
+ 5. RIGHTS RESERVATION
73
+
74
+ All rights not explicitly granted are reserved by Lux Industries Inc.
75
+
76
+ We plan to apply for patent protection for the technology in this
77
+ repository. Any implementation outside the Lux ecosystem may require
78
+ a separate commercial license.
79
+
80
+ 6. DISCLAIMER OF WARRANTY
81
+
82
+ THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
83
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
84
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
85
+
86
+ 7. LIMITATION OF LIABILITY
87
+
88
+ IN NO EVENT SHALL LUX INDUSTRIES INC BE LIABLE FOR ANY CLAIM, DAMAGES
89
+ OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
90
+ ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE.
91
+
92
+ 8. TERMINATION
93
+
94
+ This license terminates immediately upon any breach, including but not
95
+ limited to deployment on unauthorized networks or creation of forks.
96
+
97
+ 9. GOVERNING LAW
98
+
99
+ This License shall be governed by the laws of the State of Delaware.
100
+
101
+ 10. COMMERCIAL LICENSING
102
+
103
+ For commercial use outside the Lux ecosystem:
104
+
105
+ Lux Industries Inc.
106
+ Email: licensing@lux.network
107
+ Subject: Commercial License Request
108
+
109
+ ================================================================================
110
+ TL;DR
111
+ ================================================================================
112
+
113
+ - Research/academic use = OK
114
+ - Lux Primary Network (Network ID=1, Chain ID=96369) = OK
115
+ - L1/L2/L3 chains descending from Lux Primary Network = OK
116
+ - Commercial products outside Lux ecosystem = Contact licensing@lux.network
117
+ - Forks = Absolutely not
118
+
119
+ ================================================================================
120
+
121
+ See LP-0012 for full licensing documentation:
122
+ https://github.com/luxfi/lps/blob/main/LPs/lp-0012-ecosystem-licensing.md
@@ -0,0 +1,10 @@
1
+ # Note: disablePaging/disableSlack are set in both ESC (esc/dev-portal.yaml) and here.
2
+ # Stack config overrides ESC. To enable paging, remove from both places.
3
+ environment:
4
+ - datadog-cloud-universe/dev-portal
5
+ config:
6
+ datadog-cloud-universe:environment: prod
7
+ datadog-cloud-universe:tagFilter: "(unienv:prod OR env:prod)"
8
+ datadog-cloud-universe:incidentWebhook: "@webhook-Incident-io"
9
+ datadog-cloud-universe:disablePaging: "true"
10
+ datadog-cloud-universe:disableSlack: "true"
package/Pulumi.yaml ADDED
@@ -0,0 +1,6 @@
1
+ name: datadog-cloud-universe
2
+ runtime:
3
+ name: nodejs
4
+ options:
5
+ packagemanager: bun
6
+ description: Datadog monitors for Universe services
package/config.ts ADDED
@@ -0,0 +1,243 @@
1
+ import * as pulumi from '@pulumi/pulumi';
2
+ import {DashboardLink} from './types';
3
+
4
+ const config = new pulumi.Config();
5
+
6
+ /**
7
+ * Team configuration from ESC
8
+ * Each team has an escalation policy ID and slack channel
9
+ */
10
+ export interface TeamConfig {
11
+ ep: string;
12
+ slack: string;
13
+ }
14
+
15
+ /**
16
+ * Monitor configuration loaded from Pulumi config / ESC
17
+ */
18
+ export interface MonitorSettings {
19
+ environment: string;
20
+ tagFilter: string;
21
+ incidentWebhook: string;
22
+ defaultTeam: string;
23
+ teams: Record<string, TeamConfig>;
24
+ }
25
+
26
+ // Load team configurations from ESC
27
+ // Expected format in ESC: { "sre": { "ep": "01K4XB5BT0...", "slack": "@slack-sre-alerts" }, ... }
28
+ const teamsRaw = config.getObject<Record<string, TeamConfig>>('teams') || {};
29
+
30
+ // Build tagFilter: base filter from stack config, optionally ANDed with team-specific extra filter
31
+ const baseTagFilter = config.get('tagFilter') || '(unienv:prod OR env:prod)';
32
+ const tagFilterExtra = config.get('tagFilterExtra');
33
+ const tagFilter = tagFilterExtra
34
+ ? `${baseTagFilter} AND ${tagFilterExtra}`
35
+ : baseTagFilter;
36
+
37
+ export const settings: MonitorSettings & {
38
+ disablePaging: boolean;
39
+ disableSlack: boolean;
40
+ } = {
41
+ environment: config.get('environment') || 'prod',
42
+ tagFilter,
43
+ incidentWebhook: config.get('incidentWebhook') || '@webhook-Incident-io',
44
+ defaultTeam: config.get('defaultTeam') || 'dev-portal',
45
+ teams: teamsRaw,
46
+ // Set to true to disable all paging (EP tags + incident webhooks) for testing
47
+ disablePaging: config.getBoolean('disablePaging') || false,
48
+ // Set to true to disable all Slack notifications for testing
49
+ disableSlack: config.getBoolean('disableSlack') || false,
50
+ };
51
+
52
+ /**
53
+ * Get the team configuration, falling back to default team if not found.
54
+ * Throws error if neither team nor default has configuration.
55
+ */
56
+ export function getTeamConfig(team: string): TeamConfig {
57
+ const teamConfig = settings.teams[team];
58
+ if (teamConfig) {
59
+ return teamConfig;
60
+ }
61
+
62
+ const defaultConfig = settings.teams[settings.defaultTeam];
63
+ if (!defaultConfig) {
64
+ throw new Error(
65
+ `No configuration found for team '${team}' or default team '${settings.defaultTeam}'. ` +
66
+ 'Configure teams in Pulumi ESC (shared-infra/incident).'
67
+ );
68
+ }
69
+
70
+ pulumi.log.warn(
71
+ `No configuration found for team '${team}', using default team '${settings.defaultTeam}'`
72
+ );
73
+ return defaultConfig;
74
+ }
75
+
76
+ /**
77
+ * Get the escalation policy tag for a team.
78
+ * Falls back to default team if specified team not found.
79
+ */
80
+ export function getEscalationPolicyTag(team: string): string {
81
+ return getTeamConfig(team).ep;
82
+ }
83
+
84
+ /**
85
+ * Get the slack channel for a team.
86
+ * Falls back to default team if specified team not found.
87
+ */
88
+ export function getSlackChannel(team: string): string {
89
+ return getTeamConfig(team).slack;
90
+ }
91
+
92
+ /**
93
+ * Build standard tags for a monitor
94
+ */
95
+ export function buildTags(opts: {
96
+ signalId: string;
97
+ team: string;
98
+ enablePaging?: boolean;
99
+ additionalTags?: string[];
100
+ }): string[] {
101
+ const tags = [
102
+ `serverless_id:${opts.signalId}`,
103
+ `env:${settings.environment}`,
104
+ `unienv:${settings.environment}`,
105
+ `team:${opts.team}`,
106
+ 'managed-by:pulumi',
107
+ ];
108
+
109
+ // Only include EP tag if paging is enabled (default: true) and not globally disabled
110
+ if (opts.enablePaging !== false && !settings.disablePaging) {
111
+ const ep = getEscalationPolicyTag(opts.team);
112
+ tags.push(`ep:${ep}`);
113
+ }
114
+
115
+ if (opts.additionalTags) {
116
+ tags.push(...opts.additionalTags);
117
+ }
118
+
119
+ return tags;
120
+ }
121
+
122
+ /**
123
+ * Build standardized links section for alert body.
124
+ *
125
+ * Includes:
126
+ * - Logs: Links to Datadog logs filtered by resource, time-bounded to alert window
127
+ * - Runbook: Notion runbook for troubleshooting
128
+ * - README: GitHub README for related documentation
129
+ * - Dashboards: Relevant Datadog dashboards
130
+ *
131
+ * Uses Datadog template variables:
132
+ * - {{last_triggered_at_epoch}} - when alert triggered (milliseconds)
133
+ */
134
+ export function buildLinksSection(opts: {
135
+ logQuery: string;
136
+ runbookUrl: string;
137
+ readmeUrl: string;
138
+ dashboards: DashboardLink[];
139
+ }): string {
140
+ // URL-encode the log query for use in URLs
141
+ const encodedLogQuery = opts.logQuery
142
+ .replace(/:/g, '%3A')
143
+ .replace(/ /g, '%20');
144
+
145
+ let links = `---
146
+
147
+ **Links:**
148
+
149
+ **Logs** (from alert start):
150
+ * [View Logs](/logs?query=${encodedLogQuery}&from_ts={{last_triggered_at_epoch}})
151
+
152
+ **Runbook:**
153
+ * [Troubleshooting Guide](${opts.runbookUrl})
154
+
155
+ **Codebase:**
156
+ * [README](${opts.readmeUrl})
157
+
158
+ **Dashboards:**`;
159
+
160
+ for (const dashboard of opts.dashboards) {
161
+ links += `\n* [${dashboard.name}](${dashboard.url})`;
162
+ }
163
+
164
+ return links;
165
+ }
166
+
167
+ /**
168
+ * Build recovery links section with time-bounded log link.
169
+ *
170
+ * Uses Datadog template variables:
171
+ * - {{last_triggered_at_epoch}} - when alert triggered
172
+ * - {{last_resolved_at_epoch}} - when alert resolved
173
+ */
174
+ export function buildRecoveryLinksSection(opts: {logQuery: string}): string {
175
+ const encodedLogQuery = opts.logQuery
176
+ .replace(/:/g, '%3A')
177
+ .replace(/ /g, '%20');
178
+
179
+ return `
180
+ **Logs** (alert window):
181
+ * [View Logs](/logs?query=${encodedLogQuery}&from_ts={{last_triggered_at_epoch}}&to_ts={{last_resolved_at_epoch}})`;
182
+ }
183
+
184
+ /**
185
+ * Build alert message with incident.io webhook integration and standardized links
186
+ */
187
+ export function buildMessage(opts: {
188
+ alertBody: string;
189
+ recoveryBody?: string;
190
+ team: string;
191
+ logQuery: string;
192
+ runbookUrl: string;
193
+ readmeUrl: string;
194
+ dashboards: DashboardLink[];
195
+ includeIncidentWebhook?: boolean;
196
+ }): string {
197
+ // Disable webhook if globally disabled or explicitly set to false
198
+ const includeWebhook =
199
+ opts.includeIncidentWebhook !== false && !settings.disablePaging;
200
+ const slackChannel = getSlackChannel(opts.team);
201
+
202
+ // Build the links section
203
+ const linksSection = buildLinksSection({
204
+ logQuery: opts.logQuery,
205
+ runbookUrl: opts.runbookUrl,
206
+ readmeUrl: opts.readmeUrl,
207
+ dashboards: opts.dashboards,
208
+ });
209
+
210
+ // Build recovery links
211
+ const recoveryLinks = buildRecoveryLinksSection({logQuery: opts.logQuery});
212
+
213
+ const tagContext = `aws_account: {{aws_account.name}}, service: {{service.name}}
214
+ uniapp: {{uniapp.name}}, unistk: {{unistk.name}}, unienv: {{unienv.name}}, unigrp: {{unigrp.name}}, uniprj: {{uniprj.name}}, unisha: {{unisha.name}}, uniown: {{uniown.name}}
215
+
216
+ `;
217
+
218
+ let message = `{{#is_alert}}\n${tagContext}${opts.alertBody}\n\n${linksSection}`;
219
+
220
+ if (includeWebhook) {
221
+ message += `\n\n${settings.incidentWebhook}`;
222
+ }
223
+
224
+ message += '\n{{/is_alert}}\n\n';
225
+
226
+ // Warning block with same tag context
227
+ message += `{{#is_warning}}\n${tagContext}${opts.alertBody}\n\n${linksSection}\n{{/is_warning}}\n\n`;
228
+
229
+ if (includeWebhook) {
230
+ message += `{{#is_alert_recovery}} ${settings.incidentWebhook} {{/is_alert_recovery}}\n\n`;
231
+ }
232
+
233
+ // Only include Slack channel if not globally disabled
234
+ if (!settings.disableSlack) {
235
+ message += `${slackChannel}\n`;
236
+ }
237
+
238
+ if (opts.recoveryBody) {
239
+ message += `\n{{#is_recovery}}\n${tagContext}${opts.recoveryBody}\n${recoveryLinks}\n{{/is_recovery}}`;
240
+ }
241
+
242
+ return message;
243
+ }
@@ -0,0 +1,6 @@
1
+ environment:
2
+ - datadog-dashboards-universe/shared
3
+ config:
4
+ datadog-dashboards-universe:environment: prod
5
+ datadog-dashboards-universe:tagFilter: "(unienv:prod OR env:prod)"
6
+ datadog-dashboards-universe:defaultTeam: dev-portal
@@ -0,0 +1,6 @@
1
+ name: datadog-dashboards-universe
2
+ runtime:
3
+ name: nodejs
4
+ options:
5
+ packagemanager: bun
6
+ description: Datadog dashboards for Universe services
@@ -0,0 +1,30 @@
1
+ import * as pulumi from '@pulumi/pulumi';
2
+
3
+ const config = new pulumi.Config();
4
+
5
+ /**
6
+ * Dashboard configuration loaded from Pulumi config / ESC
7
+ *
8
+ * Simplified compared to monitors - dashboards don't need:
9
+ * - Team EP (escalation policy) - dashboards don't trigger incidents
10
+ * - Slack channels - dashboards don't send notifications
11
+ * - Incident webhooks - dashboards don't create incidents
12
+ */
13
+ export interface DashboardSettings {
14
+ environment: string;
15
+ tagFilter: string;
16
+ defaultTeam: string;
17
+ }
18
+
19
+ // Build tagFilter: base filter from stack config, optionally ANDed with team-specific extra filter
20
+ const baseTagFilter = config.get('tagFilter') || '(unienv:prod OR env:prod)';
21
+ const tagFilterExtra = config.get('tagFilterExtra');
22
+ const tagFilter = tagFilterExtra
23
+ ? `${baseTagFilter} AND ${tagFilterExtra}`
24
+ : baseTagFilter;
25
+
26
+ export const settings: DashboardSettings = {
27
+ environment: config.get('environment') || 'prod',
28
+ tagFilter,
29
+ defaultTeam: config.get('defaultTeam') || 'dev-portal',
30
+ };