@l.x/datadog-cloud 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,217 @@
1
+ # Dev Portal Instrumentation Report
2
+
3
+ > Updated: 2026-02-25
4
+
5
+ ## Executive Summary
6
+
7
+ The dev-portal has **strong application-level logging** in code but **zero logs flowing to Datadog** and **no APM traces**. The structured JSON logging infrastructure exists in the app (wide events, DD correlation fields, PII scrubbing) but the DD Agent sidecar is not configured to forward container logs. ALB metrics (request count, latency percentiles, HTTP status codes) are available via the AWS integration and are now used for the 6 infrastructure monitors.
8
+
9
+ ## Key Finding: Log Collection Gap
10
+
11
+ **Container stdout logs are NOT reaching Datadog.** Investigation via the Datadog API confirmed:
12
+
13
+ - `service:dev-portal` returns **zero logs** across all indexes (searched 7 days)
14
+ - `ecs_cluster:dev-portal-ecs` returns **zero logs**
15
+ - Host tags confirm **DD Agent is running** on ECS instances (apps: agent, docker, container, ntp)
16
+ - The DD Agent trace agent is running but receiving **zero spans**
17
+ - Container/docker metrics are reported for the hosts
18
+
19
+ **Probable root cause:** The DD Agent sidecar likely needs log collection enabled for the dev-portal container. The app writes structured JSON to stdout, but the agent isn't configured to collect container logs (no `DD_LOGS_ENABLED=true` or `com.datadoghq.ad.logs` Docker labels found).
20
+
21
+ ## Available Data Sources
22
+
23
+ ### What EXISTS in Datadog
24
+
25
+ | Source | Data Available | Tag Filter |
26
+ |---|---|---|
27
+ | ALB metrics | request_count, target_response_time (p50/p90/p95/p99), httpcode_target_2xx/3xx/4xx/5xx, healthy_host_count | `name:dev-portal-lb,unienv:prod` |
28
+ | EC2/ECS host metrics | cpu, memory, network, disk | `ecs_cluster:dev-portal-ecs` |
29
+ | Container metrics | cpu, memory, network, uptime, thread count | `host:i-*` (dev-portal hosts) |
30
+ | DD Agent telemetry | trace_agent.heartbeat, stats_writer.*, trace_writer.* | `host:i-*` |
31
+
32
+ ### What DOES NOT exist in Datadog
33
+
34
+ | Source | Why |
35
+ |---|---|
36
+ | APM traces (`trace.web.request`) | `dd-trace` incompatible with Bun runtime, no OTEL configured |
37
+ | Application logs (`service:dev-portal`) | DD Agent log collection not enabled for container |
38
+ | RUM data | No browser SDK |
39
+ | Synthetic checks | Not configured |
40
+
41
+ ## Current Instrumentation Inventory
42
+
43
+ ### Server-Side (Hono + Bun)
44
+
45
+ | Signal | Instrumented? | Where | Reaching DD? |
46
+ |---|---|---|---|
47
+ | Request lifecycle (method, path, status, duration) | **Yes** | `server.ts` wide event → `request.complete` log | **No** — logs not collected |
48
+ | tRPC procedure spans (name, duration, outcome) | **Yes** | `wideEvent.addProcedure()` in tRPC middleware | **No** — logs not collected |
49
+ | Entry Gateway proxy (url, status, duration) | **Yes** | `proxy.ts` → `gateway.proxy.complete` / `gateway.proxy.error` | **No** — logs not collected |
50
+ | Auth flow events (initiate, verify, conflict, mismatch) | **Yes** | `server.ts` tRPC routers → structured log messages | **No** — logs not collected |
51
+ | Error logging with stack traces | **Yes** | `structuredJsonLogger.ts` → `level:error` | **No** — logs not collected |
52
+ | PII scrubbing | **Yes** | `scrub.ts` — redacts tokens, emails, secrets | N/A |
53
+ | DD trace/span correlation IDs | **Yes** | `structuredJsonLogger.ts` → `dd.trace_id`, `dd.span_id`, `dd.service` | **No** — logs not collected |
54
+ | Traffic classification (human/crawler/ai-tool) | **Yes** | `ai-traffic.ts` → `traffic_type`, `traffic_agent` on wide event | **No** — logs not collected |
55
+ | Amplitude analytics (server-side) | **Yes** | `service.ts` → Amplitude Node SDK | N/A (separate system) |
56
+
57
+ ### Client-Side (React Router v7 SSR)
58
+
59
+ | Signal | Instrumented? | Where | Reaching DD? |
60
+ |---|---|---|---|
61
+ | Chunk load error recovery | **Yes** | `entry.client.tsx` → auto-reload on chunk failure | No |
62
+ | Hydration errors | **Yes** | `entry.client.tsx` → `consoleLoggerFactory` | No |
63
+ | Route error tracking | **Yes** | `root.tsx` ErrorBoundary → `trackError()` via tRPC | No |
64
+ | Page views | **Yes** | `usePageView()` hook → Amplitude | N/A |
65
+ | Core Web Vitals | **No** | — | — |
66
+ | JS error tracking | **No** | — | — |
67
+
68
+ ### Infrastructure
69
+
70
+ | Signal | Available? | Where | Reaching DD? |
71
+ |---|---|---|---|
72
+ | ALB request metrics | **Yes** | AWS integration → `aws.applicationelb.*` | **Yes** |
73
+ | ALB latency percentiles | **Yes** | AWS integration → `target_response_time.p95/p99` | **Yes** |
74
+ | ALB HTTP status codes | **Yes** | AWS integration → `httpcode_target_2xx/5xx` | **Yes** |
75
+ | ECS host health | **Yes** | DD Agent → `system.*` | **Yes** |
76
+ | Container metrics | **Yes** | DD Agent → `container.*`, `docker.*` | **Yes** |
77
+ | ECS task health | **Yes** | Docker HEALTHCHECK → `/health` every 30s | **Yes** |
78
+ | Container stdout logs | **Yes** in container | Structured JSON → stdout | **No** — not forwarded to DD |
79
+ | Datadog APM traces | **No** | `dd-trace` not compatible with Bun runtime | No |
80
+ | Datadog RUM | **No** | No browser SDK | No |
81
+
82
+ ## Monitor Coverage Matrix
83
+
84
+ ### Infrastructure Monitors (6 — ALB-based)
85
+
86
+ | Monitor | Type | Signal Source | Query | Status |
87
+ |---|---|---|---|---|
88
+ | P95 Latency | query alert | `aws.applicationelb.target_response_time.p95` | `avg(last_5m) > 2s` | **Active** |
89
+ | P99 Latency | query alert | `aws.applicationelb.target_response_time.p99` | `avg(last_5m) > 5s` | **Active** |
90
+ | 5xx Error Rate | query alert | `httpcode_target_5xx / request_count` | `sum(last_5m) > 5%` | **Active** |
91
+ | Error Count Anomaly | query alert | `httpcode_target_5xx` anomaly | agile, 3 deviations | **Active** |
92
+ | Zero Traffic | query alert | `aws.applicationelb.request_count` | `sum(last_10m) == 0` | **Active** |
93
+ | Success Rate | query alert | `1 - (5xx / request_count)` | `< 99%` | **Active** |
94
+
95
+ ### Application Monitors (5 — Log-based, BLOCKED)
96
+
97
+ | Monitor | Type | Signal Source | Query | Status |
98
+ |---|---|---|---|---|
99
+ | Auth Failure Rate | log alert | `service:dev-portal` logs | `auth*failed OR session*failed > 20/15m` | **No Data** — logs not collected |
100
+ | Session Conflict Spike | log alert | `service:dev-portal` logs | `message:*conflict > 10/15m` | **No Data** — logs not collected |
101
+ | Gateway Proxy Errors | log alert | `service:dev-portal` logs | `gateway.proxy.error > 5/10m` | **No Data** — logs not collected |
102
+ | Gateway Proxy High Latency | log alert | `service:dev-portal` logs | `duration_ms > 3000, count > 10/10m` | **No Data** — logs not collected |
103
+ | Error Log Spike | log alert | `service:dev-portal` logs | `level:error > 50/15m` | **No Data** — logs not collected |
104
+
105
+ All 5 log monitors have `onMissingData: show_no_data` so they correctly report "No Data" instead of false "OK".
106
+
107
+ ## Gap Analysis
108
+
109
+ ### Critical Gaps (P0)
110
+
111
+ #### 1. Container Log Collection Not Enabled
112
+ **Problem:** The DD Agent sidecar runs on dev-portal ECS hosts but is not collecting container logs. All structured JSON logging in the app (wide events, auth events, gateway events, errors) writes to stdout but never reaches Datadog.
113
+
114
+ **Impact:** 5 log-based monitors are non-functional. No application-level visibility in Datadog.
115
+
116
+ **Recommendation:** Enable log collection on the DD Agent:
117
+ - Set `DD_LOGS_ENABLED=true` on the DD Agent sidecar container
118
+ - Add `DD_LOGS_CONFIG_CONTAINER_COLLECT_ALL=true` or configure Docker labels (`com.datadoghq.ad.logs`) on the dev-portal container
119
+ - Ensure the DD Agent has `logs_enabled: true` in its config
120
+ - The app already writes DD-compatible structured JSON with correlation fields — no app changes needed
121
+
122
+ #### 2. Dev-portal Team EP is Placeholder
123
+ **Problem:** The `dev-portal` entry in `shared-infra/incident` ESC uses SRE's EP (`01K4XB5BT0SZY16A4J6Z20WXD8`) and Slack (`@slack-sre-alerts`) as placeholders.
124
+
125
+ **Recommendation:** Create a dedicated dev-portal EP in incident.io and Slack channel, then update the ESC.
126
+
127
+ ### High Gaps (P1)
128
+
129
+ #### 3. Runtime Uncertainty: Bun vs Node.js
130
+ **Problem:** The Dockerfile shows `CMD ["bun", "run", ...]` and `--target=bun`, but the actual ECS task definition may differ. If the production runtime is Node.js, `dd-trace` APM could be enabled.
131
+
132
+ **Recommendation:** Verify the actual production runtime in the ECS task definition. If Node.js, enable `dd-trace` for full APM. If Bun, investigate OTEL support.
133
+
134
+ #### 4. No Client-Side Error Visibility
135
+ **Problem:** JS errors, hydration failures, and chunk load errors are only logged to browser console.
136
+
137
+ **Recommendation (Phase 2):** Add Datadog RUM SDK to `entry.client.tsx`.
138
+
139
+ #### 5. No Synthetic Monitoring
140
+ **Problem:** No automated uptime checks beyond Docker HEALTHCHECK (container-local only).
141
+
142
+ **Recommendation (Phase 2):** Create Datadog Synthetics for `/health`, `/docs`, `/dashboard`.
143
+
144
+ ### Medium Gaps (P2)
145
+
146
+ #### 6. API Key Scope Limits Debugging
147
+ **Problem:** The DD app key in `shared-infra/datadog-2026` ESC is scoped and cannot query the metrics API (returns 404). This limits ability to validate monitor queries programmatically.
148
+
149
+ **Recommendation:** Either request broader app key scope, or validate queries through the Datadog UI instead of API.
150
+
151
+ #### 7. tRPC Procedure-Level Metrics Not Extractable
152
+ **Problem:** The wide event `procedures` array with per-procedure timing is nested JSON — hard to create per-procedure metrics.
153
+
154
+ **Recommendation:** Log individual procedure spans as separate log lines (once log collection is enabled).
155
+
156
+ #### 8. AI Traffic Not in Datadog
157
+ **Problem:** AI traffic classification tracked in Amplitude but not visible in DD dashboards.
158
+
159
+ **Recommendation:** Once logs flow, the `traffic_type` and `traffic_agent` fields are queryable in DD Log Analytics.
160
+
161
+ ### Low Gaps (P3)
162
+
163
+ #### 9. Amplitude Parallel Universe
164
+ **Problem:** Full Amplitude analytics integration (auth, page views, API keys, AI traffic, feedback) creates split-brain observability.
165
+
166
+ **Recommendation:** Long-term, decide if Amplitude events should mirror to DD custom metrics.
167
+
168
+ #### 10. No Deployment Tracking
169
+ **Problem:** No deployment events sent to Datadog.
170
+
171
+ **Recommendation:** Add `datadog.ServiceDefinition` and emit deployment events from CI/CD.
172
+
173
+ ## Recommended Phased Roadmap
174
+
175
+ ### Phase 1: Log Collection (CRITICAL — Next Sprint)
176
+ - [ ] Enable DD Agent log collection for dev-portal ECS containers
177
+ - [ ] Verify logs appear in Datadog with `service:dev-portal`
178
+ - [ ] Confirm 5 log-based monitors transition from "No Data" to active
179
+ - [x] Register `dev-portal` team in `shared-infra/incident` ESC (placeholder)
180
+ - [ ] Replace placeholder EP/Slack with dev-portal team's actual values
181
+
182
+ ### Phase 2: APM Resolution
183
+ - [ ] Verify Bun vs Node.js runtime in production ECS task definition
184
+ - [ ] If Node.js: enable `dd-trace` for full APM
185
+ - [ ] If Bun: investigate OTEL/OpenTelemetry support in Bun
186
+ - [ ] Add dashboard rows for auth, gateway, infra, AI traffic
187
+
188
+ ### Phase 3: Client-Side (Future)
189
+ - [ ] Add Datadog RUM SDK
190
+ - [ ] Create synthetic monitors
191
+ - [ ] Set up deployment tracking
192
+
193
+ ### Phase 4: Advanced (Future)
194
+ - [ ] SLOs based on error budget
195
+ - [ ] Anomaly detection on tRPC procedures
196
+ - [ ] Composite monitors for cascading failures
197
+ - [ ] Integration with Amplitude for unified dashboards
198
+
199
+ ## Monitor Inventory (Current State)
200
+
201
+ | # | Monitor | Category | Type | Signal | Priority | Status |
202
+ |---|---|---|---|---|---|---|
203
+ | 1 | P95 Latency | latency | query alert | ALB `target_response_time.p95` | P3 | Active |
204
+ | 2 | P99 Latency | latency | query alert | ALB `target_response_time.p99` | P2 | Active |
205
+ | 3 | 5xx Error Rate | errors | query alert | ALB `httpcode_target_5xx` | P2 | Active |
206
+ | 4 | Error Count Anomaly | errors | query alert | ALB `httpcode_target_5xx` | P3 | Active |
207
+ | 5 | Zero Traffic | availability | query alert | ALB `request_count` | P1 | Active |
208
+ | 6 | Success Rate | availability | query alert | ALB `httpcode_target_5xx/request_count` | P2 | Active |
209
+ | 7 | Auth Failure Rate | auth | log alert | `service:dev-portal` logs | P3 | No Data |
210
+ | 8 | Session Conflict Spike | auth | log alert | `service:dev-portal` logs | P3 | No Data |
211
+ | 9 | Gateway Proxy Errors | gateway | log alert | `service:dev-portal` logs | P2 | No Data |
212
+ | 10 | Gateway Proxy High Latency | gateway | log alert | `service:dev-portal` logs | P3 | No Data |
213
+ | 11 | Error Log Spike | logs | log alert | `service:dev-portal` logs | P2 | No Data |
214
+
215
+ **Total: 11 monitors** (6 ALB-based active, 5 log-based awaiting log collection)
216
+
217
+ **Paging/Slack: disabled** for initial rollout (`disablePaging: true`, `disableSlack: true` in ESC and stack config).
package/LICENSE ADDED
@@ -0,0 +1,122 @@
1
+ Lux Ecosystem License
2
+ Version 1.2, December 2025
3
+
4
+ Copyright (c) 2020-2025 Lux Industries Inc.
5
+ All rights reserved.
6
+
7
+ TECHNOLOGY PORTFOLIO - PATENT APPLICATIONS PLANNED
8
+ Contact: licensing@lux.network
9
+
10
+ ================================================================================
11
+ TERMS AND CONDITIONS
12
+ ================================================================================
13
+
14
+ 1. DEFINITIONS
15
+
16
+ "Lux Primary Network" means the official Lux blockchain with Network ID=1
17
+ and EVM Chain ID=96369.
18
+
19
+ "Authorized Network" means the Lux Primary Network, official testnets/devnets,
20
+ and any L1/L2/L3 chain descending from the Lux Primary Network.
21
+
22
+ "Descending Chain" means an L1/L2/L3 chain built on, anchored to, or deriving
23
+ security from the Lux Primary Network or its authorized testnets.
24
+
25
+ "Research Use" means non-commercial academic research, education, personal
26
+ study, or evaluation purposes.
27
+
28
+ "Commercial Use" means any use in connection with a product or service
29
+ offered for sale or fee, internal use by a for-profit entity, or any use
30
+ to generate revenue.
31
+
32
+ 2. GRANT OF LICENSE
33
+
34
+ Subject to these terms, Lux Industries Inc. grants you a non-exclusive,
35
+ royalty-free license to:
36
+
37
+ (a) Use for Research Use without restriction;
38
+
39
+ (b) Operate on the Lux Primary Network (Network ID=1, EVM Chain ID=96369);
40
+
41
+ (c) Operate on official Lux testnets and devnets;
42
+
43
+ (d) Operate L1/L2/L3 chains descending from the Lux Primary Network;
44
+
45
+ (e) Build applications within the Lux ecosystem;
46
+
47
+ (f) Contribute improvements back to the original repositories.
48
+
49
+ 3. RESTRICTIONS
50
+
51
+ Without a commercial license from Lux Industries Inc, you may NOT:
52
+
53
+ (a) Fork the Lux Network or any Lux software;
54
+
55
+ (b) Create competing networks not descending from Lux Primary Network;
56
+
57
+ (c) Use for Commercial Use outside the Lux ecosystem;
58
+
59
+ (d) Sublicense or transfer rights outside the Lux ecosystem;
60
+
61
+ (e) Use to create competing blockchain networks, exchanges, custody
62
+ services, or cryptographic systems outside the Lux ecosystem.
63
+
64
+ 4. NO FORKS POLICY
65
+
66
+ Lux Industries Inc maintains ZERO TOLERANCE for unauthorized forks.
67
+ Any fork or deployment on an unauthorized network constitutes:
68
+
69
+ (a) Breach of this license;
70
+ (b) Grounds for immediate legal action.
71
+
72
+ 5. RIGHTS RESERVATION
73
+
74
+ All rights not explicitly granted are reserved by Lux Industries Inc.
75
+
76
+ We plan to apply for patent protection for the technology in this
77
+ repository. Any implementation outside the Lux ecosystem may require
78
+ a separate commercial license.
79
+
80
+ 6. DISCLAIMER OF WARRANTY
81
+
82
+ THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
83
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
84
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
85
+
86
+ 7. LIMITATION OF LIABILITY
87
+
88
+ IN NO EVENT SHALL LUX INDUSTRIES INC BE LIABLE FOR ANY CLAIM, DAMAGES
89
+ OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
90
+ ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE.
91
+
92
+ 8. TERMINATION
93
+
94
+ This license terminates immediately upon any breach, including but not
95
+ limited to deployment on unauthorized networks or creation of forks.
96
+
97
+ 9. GOVERNING LAW
98
+
99
+ This License shall be governed by the laws of the State of Delaware.
100
+
101
+ 10. COMMERCIAL LICENSING
102
+
103
+ For commercial use outside the Lux ecosystem:
104
+
105
+ Lux Industries Inc.
106
+ Email: licensing@lux.network
107
+ Subject: Commercial License Request
108
+
109
+ ================================================================================
110
+ TL;DR
111
+ ================================================================================
112
+
113
+ - Research/academic use = OK
114
+ - Lux Primary Network (Network ID=1, Chain ID=96369) = OK
115
+ - L1/L2/L3 chains descending from Lux Primary Network = OK
116
+ - Commercial products outside Lux ecosystem = Contact licensing@lux.network
117
+ - Forks = Absolutely not
118
+
119
+ ================================================================================
120
+
121
+ See LP-0012 for full licensing documentation:
122
+ https://github.com/luxfi/lps/blob/main/LPs/lp-0012-ecosystem-licensing.md
@@ -0,0 +1,10 @@
1
+ # Note: disablePaging/disableSlack are set in both ESC (esc/dev-portal.yaml) and here.
2
+ # Stack config overrides ESC. To enable paging, remove from both places.
3
+ environment:
4
+ - datadog-cloud-universe/dev-portal
5
+ config:
6
+ datadog-cloud-universe:environment: prod
7
+ datadog-cloud-universe:tagFilter: "(unienv:prod OR env:prod)"
8
+ datadog-cloud-universe:incidentWebhook: "@webhook-Incident-io"
9
+ datadog-cloud-universe:disablePaging: "true"
10
+ datadog-cloud-universe:disableSlack: "true"
package/Pulumi.yaml ADDED
@@ -0,0 +1,6 @@
1
+ name: datadog-cloud-universe
2
+ runtime:
3
+ name: nodejs
4
+ options:
5
+ packagemanager: bun
6
+ description: Datadog monitors for Universe services
package/config.ts ADDED
@@ -0,0 +1,243 @@
1
+ import * as pulumi from '@pulumi/pulumi';
2
+ import {DashboardLink} from './types';
3
+
4
+ const config = new pulumi.Config();
5
+
6
+ /**
7
+ * Team configuration from ESC
8
+ * Each team has an escalation policy ID and slack channel
9
+ */
10
+ export interface TeamConfig {
11
+ ep: string;
12
+ slack: string;
13
+ }
14
+
15
+ /**
16
+ * Monitor configuration loaded from Pulumi config / ESC
17
+ */
18
+ export interface MonitorSettings {
19
+ environment: string;
20
+ tagFilter: string;
21
+ incidentWebhook: string;
22
+ defaultTeam: string;
23
+ teams: Record<string, TeamConfig>;
24
+ }
25
+
26
+ // Load team configurations from ESC
27
+ // Expected format in ESC: { "sre": { "ep": "01K4XB5BT0...", "slack": "@slack-sre-alerts" }, ... }
28
+ const teamsRaw = config.getObject<Record<string, TeamConfig>>('teams') || {};
29
+
30
+ // Build tagFilter: base filter from stack config, optionally ANDed with team-specific extra filter
31
+ const baseTagFilter = config.get('tagFilter') || '(unienv:prod OR env:prod)';
32
+ const tagFilterExtra = config.get('tagFilterExtra');
33
+ const tagFilter = tagFilterExtra
34
+ ? `${baseTagFilter} AND ${tagFilterExtra}`
35
+ : baseTagFilter;
36
+
37
+ export const settings: MonitorSettings & {
38
+ disablePaging: boolean;
39
+ disableSlack: boolean;
40
+ } = {
41
+ environment: config.get('environment') || 'prod',
42
+ tagFilter,
43
+ incidentWebhook: config.get('incidentWebhook') || '@webhook-Incident-io',
44
+ defaultTeam: config.get('defaultTeam') || 'dev-portal',
45
+ teams: teamsRaw,
46
+ // Set to true to disable all paging (EP tags + incident webhooks) for testing
47
+ disablePaging: config.getBoolean('disablePaging') || false,
48
+ // Set to true to disable all Slack notifications for testing
49
+ disableSlack: config.getBoolean('disableSlack') || false,
50
+ };
51
+
52
+ /**
53
+ * Get the team configuration, falling back to default team if not found.
54
+ * Throws error if neither team nor default has configuration.
55
+ */
56
+ export function getTeamConfig(team: string): TeamConfig {
57
+ const teamConfig = settings.teams[team];
58
+ if (teamConfig) {
59
+ return teamConfig;
60
+ }
61
+
62
+ const defaultConfig = settings.teams[settings.defaultTeam];
63
+ if (!defaultConfig) {
64
+ throw new Error(
65
+ `No configuration found for team '${team}' or default team '${settings.defaultTeam}'. ` +
66
+ 'Configure teams in Pulumi ESC (shared-infra/incident).'
67
+ );
68
+ }
69
+
70
+ pulumi.log.warn(
71
+ `No configuration found for team '${team}', using default team '${settings.defaultTeam}'`
72
+ );
73
+ return defaultConfig;
74
+ }
75
+
76
+ /**
77
+ * Get the escalation policy tag for a team.
78
+ * Falls back to default team if specified team not found.
79
+ */
80
+ export function getEscalationPolicyTag(team: string): string {
81
+ return getTeamConfig(team).ep;
82
+ }
83
+
84
+ /**
85
+ * Get the slack channel for a team.
86
+ * Falls back to default team if specified team not found.
87
+ */
88
+ export function getSlackChannel(team: string): string {
89
+ return getTeamConfig(team).slack;
90
+ }
91
+
92
+ /**
93
+ * Build standard tags for a monitor
94
+ */
95
+ export function buildTags(opts: {
96
+ signalId: string;
97
+ team: string;
98
+ enablePaging?: boolean;
99
+ additionalTags?: string[];
100
+ }): string[] {
101
+ const tags = [
102
+ `serverless_id:${opts.signalId}`,
103
+ `env:${settings.environment}`,
104
+ `unienv:${settings.environment}`,
105
+ `team:${opts.team}`,
106
+ 'managed-by:pulumi',
107
+ ];
108
+
109
+ // Only include EP tag if paging is enabled (default: true) and not globally disabled
110
+ if (opts.enablePaging !== false && !settings.disablePaging) {
111
+ const ep = getEscalationPolicyTag(opts.team);
112
+ tags.push(`ep:${ep}`);
113
+ }
114
+
115
+ if (opts.additionalTags) {
116
+ tags.push(...opts.additionalTags);
117
+ }
118
+
119
+ return tags;
120
+ }
121
+
122
+ /**
123
+ * Build standardized links section for alert body.
124
+ *
125
+ * Includes:
126
+ * - Logs: Links to Datadog logs filtered by resource, time-bounded to alert window
127
+ * - Runbook: Notion runbook for troubleshooting
128
+ * - README: GitHub README for related documentation
129
+ * - Dashboards: Relevant Datadog dashboards
130
+ *
131
+ * Uses Datadog template variables:
132
+ * - {{last_triggered_at_epoch}} - when alert triggered (milliseconds)
133
+ */
134
+ export function buildLinksSection(opts: {
135
+ logQuery: string;
136
+ runbookUrl: string;
137
+ readmeUrl: string;
138
+ dashboards: DashboardLink[];
139
+ }): string {
140
+ // URL-encode the log query for use in URLs
141
+ const encodedLogQuery = opts.logQuery
142
+ .replace(/:/g, '%3A')
143
+ .replace(/ /g, '%20');
144
+
145
+ let links = `---
146
+
147
+ **Links:**
148
+
149
+ **Logs** (from alert start):
150
+ * [View Logs](/logs?query=${encodedLogQuery}&from_ts={{last_triggered_at_epoch}})
151
+
152
+ **Runbook:**
153
+ * [Troubleshooting Guide](${opts.runbookUrl})
154
+
155
+ **Codebase:**
156
+ * [README](${opts.readmeUrl})
157
+
158
+ **Dashboards:**`;
159
+
160
+ for (const dashboard of opts.dashboards) {
161
+ links += `\n* [${dashboard.name}](${dashboard.url})`;
162
+ }
163
+
164
+ return links;
165
+ }
166
+
167
+ /**
168
+ * Build recovery links section with time-bounded log link.
169
+ *
170
+ * Uses Datadog template variables:
171
+ * - {{last_triggered_at_epoch}} - when alert triggered
172
+ * - {{last_resolved_at_epoch}} - when alert resolved
173
+ */
174
+ export function buildRecoveryLinksSection(opts: {logQuery: string}): string {
175
+ const encodedLogQuery = opts.logQuery
176
+ .replace(/:/g, '%3A')
177
+ .replace(/ /g, '%20');
178
+
179
+ return `
180
+ **Logs** (alert window):
181
+ * [View Logs](/logs?query=${encodedLogQuery}&from_ts={{last_triggered_at_epoch}}&to_ts={{last_resolved_at_epoch}})`;
182
+ }
183
+
184
+ /**
185
+ * Build alert message with incident.io webhook integration and standardized links
186
+ */
187
+ export function buildMessage(opts: {
188
+ alertBody: string;
189
+ recoveryBody?: string;
190
+ team: string;
191
+ logQuery: string;
192
+ runbookUrl: string;
193
+ readmeUrl: string;
194
+ dashboards: DashboardLink[];
195
+ includeIncidentWebhook?: boolean;
196
+ }): string {
197
+ // Disable webhook if globally disabled or explicitly set to false
198
+ const includeWebhook =
199
+ opts.includeIncidentWebhook !== false && !settings.disablePaging;
200
+ const slackChannel = getSlackChannel(opts.team);
201
+
202
+ // Build the links section
203
+ const linksSection = buildLinksSection({
204
+ logQuery: opts.logQuery,
205
+ runbookUrl: opts.runbookUrl,
206
+ readmeUrl: opts.readmeUrl,
207
+ dashboards: opts.dashboards,
208
+ });
209
+
210
+ // Build recovery links
211
+ const recoveryLinks = buildRecoveryLinksSection({logQuery: opts.logQuery});
212
+
213
+ const tagContext = `aws_account: {{aws_account.name}}, service: {{service.name}}
214
+ uniapp: {{uniapp.name}}, unistk: {{unistk.name}}, unienv: {{unienv.name}}, unigrp: {{unigrp.name}}, uniprj: {{uniprj.name}}, unisha: {{unisha.name}}, uniown: {{uniown.name}}
215
+
216
+ `;
217
+
218
+ let message = `{{#is_alert}}\n${tagContext}${opts.alertBody}\n\n${linksSection}`;
219
+
220
+ if (includeWebhook) {
221
+ message += `\n\n${settings.incidentWebhook}`;
222
+ }
223
+
224
+ message += '\n{{/is_alert}}\n\n';
225
+
226
+ // Warning block with same tag context
227
+ message += `{{#is_warning}}\n${tagContext}${opts.alertBody}\n\n${linksSection}\n{{/is_warning}}\n\n`;
228
+
229
+ if (includeWebhook) {
230
+ message += `{{#is_alert_recovery}} ${settings.incidentWebhook} {{/is_alert_recovery}}\n\n`;
231
+ }
232
+
233
+ // Only include Slack channel if not globally disabled
234
+ if (!settings.disableSlack) {
235
+ message += `${slackChannel}\n`;
236
+ }
237
+
238
+ if (opts.recoveryBody) {
239
+ message += `\n{{#is_recovery}}\n${tagContext}${opts.recoveryBody}\n${recoveryLinks}\n{{/is_recovery}}`;
240
+ }
241
+
242
+ return message;
243
+ }
@@ -0,0 +1,6 @@
1
+ environment:
2
+ - datadog-dashboards-universe/shared
3
+ config:
4
+ datadog-dashboards-universe:environment: prod
5
+ datadog-dashboards-universe:tagFilter: "(unienv:prod OR env:prod)"
6
+ datadog-dashboards-universe:defaultTeam: dev-portal
@@ -0,0 +1,6 @@
1
+ name: datadog-dashboards-universe
2
+ runtime:
3
+ name: nodejs
4
+ options:
5
+ packagemanager: bun
6
+ description: Datadog dashboards for Universe services
@@ -0,0 +1,30 @@
1
+ import * as pulumi from '@pulumi/pulumi';
2
+
3
+ const config = new pulumi.Config();
4
+
5
+ /**
6
+ * Dashboard configuration loaded from Pulumi config / ESC
7
+ *
8
+ * Simplified compared to monitors - dashboards don't need:
9
+ * - Team EP (escalation policy) - dashboards don't trigger incidents
10
+ * - Slack channels - dashboards don't send notifications
11
+ * - Incident webhooks - dashboards don't create incidents
12
+ */
13
+ export interface DashboardSettings {
14
+ environment: string;
15
+ tagFilter: string;
16
+ defaultTeam: string;
17
+ }
18
+
19
+ // Build tagFilter: base filter from stack config, optionally ANDed with team-specific extra filter
20
+ const baseTagFilter = config.get('tagFilter') || '(unienv:prod OR env:prod)';
21
+ const tagFilterExtra = config.get('tagFilterExtra');
22
+ const tagFilter = tagFilterExtra
23
+ ? `${baseTagFilter} AND ${tagFilterExtra}`
24
+ : baseTagFilter;
25
+
26
+ export const settings: DashboardSettings = {
27
+ environment: config.get('environment') || 'prod',
28
+ tagFilter,
29
+ defaultTeam: config.get('defaultTeam') || 'dev-portal',
30
+ };