ecip-observability-stack 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +48 -0
- package/README.md +75 -0
- package/alerts/analysis-backlog.yaml +39 -0
- package/alerts/cache-degradation.yaml +44 -0
- package/alerts/dlq-depth.yaml +56 -0
- package/alerts/lsp-daemon.yaml +43 -0
- package/alerts/mcp-latency.yaml +46 -0
- package/alerts/security-anomaly.yaml +59 -0
- package/alerts/sla-latency.yaml +61 -0
- package/chaos/kafka-broker-restart.sh +168 -0
- package/chaos/kill-lsp-daemon.sh +148 -0
- package/chaos/redis-node-failure.sh +318 -0
- package/ci/check-observability-contract.js +285 -0
- package/ci/eslint-plugin-ecip/index.js +209 -0
- package/ci/eslint-plugin-ecip/package.json +12 -0
- package/ci/github-actions-observability-gate.yaml +180 -0
- package/ci/ruff-shared.toml +41 -0
- package/collector/otel-collector-config.yaml +226 -0
- package/collector/otel-collector-daemonset.yaml +168 -0
- package/collector/sampling-config.yaml +83 -0
- package/dashboards/_provisioning/grafana-dashboards.yaml +16 -0
- package/dashboards/analysis-throughput.json +166 -0
- package/dashboards/cache-performance.json +129 -0
- package/dashboards/cross-repo-fanout.json +93 -0
- package/dashboards/event-bus-dlq.json +129 -0
- package/dashboards/lsp-daemon-health.json +104 -0
- package/dashboards/mcp-call-graph.json +114 -0
- package/dashboards/query-latency.json +160 -0
- package/dashboards/security-events.json +131 -0
- package/docs/M08-Observability-Design.md +639 -0
- package/docs/PROGRESS.md +375 -0
- package/docs/module-documentation.md +64 -0
- package/elasticsearch/ilm-policy.json +57 -0
- package/elasticsearch/index-template.json +62 -0
- package/elasticsearch/kibana-space.yaml +53 -0
- package/helm/Chart.yaml +30 -0
- package/helm/templates/configmaps.yaml +25 -0
- package/helm/templates/elasticsearch.yaml +68 -0
- package/helm/templates/grafana-secret.yaml +22 -0
- package/helm/templates/grafana.yaml +19 -0
- package/helm/templates/loki.yaml +33 -0
- package/helm/templates/otel-collector.yaml +119 -0
- package/helm/templates/prometheus.yaml +43 -0
- package/helm/templates/tempo.yaml +16 -0
- package/helm/values.prod.yaml +159 -0
- package/helm/values.yaml +146 -0
- package/logging-lib/nodejs/package.json +57 -0
- package/logging-lib/nodejs/pnpm-lock.yaml +4576 -0
- package/logging-lib/python/pyproject.toml +45 -0
- package/logging-lib/python/src/__init__.py +19 -0
- package/logging-lib/python/src/logger.py +131 -0
- package/logging-lib/python/src/security_events.py +150 -0
- package/logging-lib/python/src/tracer.py +185 -0
- package/logging-lib/python/tests/test_logger.py +113 -0
- package/package.json +21 -0
- package/prometheus/prometheus-values.yaml +170 -0
- package/prometheus/recording-rules.yaml +97 -0
- package/prometheus/scrape-configs.yaml +122 -0
- package/runbooks/SDK-INTEGRATION.md +239 -0
- package/runbooks/alert-response/ANALYSIS_BACKLOG.md +128 -0
- package/runbooks/alert-response/DLQ_DEPTH_EXCEEDED.md +150 -0
- package/runbooks/alert-response/HIGH_QUERY_LATENCY.md +134 -0
- package/runbooks/alert-response/LSP_DAEMON_RESTART.md +118 -0
- package/runbooks/alert-response/SECURITY_ANOMALY.md +160 -0
- package/runbooks/dashboard-guide.md +169 -0
- package/scripts/lint-dashboards.js +184 -0
- package/tempo/tempo-datasource.yaml +46 -0
- package/tempo/tempo-values.yaml +94 -0
- package/tests/alert-threshold-config.test.ts +283 -0
- package/tests/log-schema-validation.test.ts +246 -0
- package/tests/metric-label-validation.test.ts +292 -0
- package/tests/otel-pipeline-integration.test.ts +420 -0
- package/tests/security-events.test.ts +417 -0
- package/tsconfig.json +17 -0
- package/vitest.config.ts +21 -0
- package/vitest.integration.config.ts +9 -0
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# ECIP M08 — Prometheus Helm Values (kube-prometheus-stack)
|
|
3
|
+
# =============================================================================
|
|
4
|
+
# Base values for dev/staging. Production overrides in values.prod.yaml.
|
|
5
|
+
# =============================================================================
|
|
6
|
+
|
|
7
|
+
prometheus:
|
|
8
|
+
prometheusSpec:
|
|
9
|
+
retention: 30d
|
|
10
|
+
retentionSize: 50GB
|
|
11
|
+
|
|
12
|
+
resources:
|
|
13
|
+
requests:
|
|
14
|
+
cpu: 500m
|
|
15
|
+
memory: 2Gi
|
|
16
|
+
limits:
|
|
17
|
+
cpu: 2000m
|
|
18
|
+
memory: 8Gi
|
|
19
|
+
|
|
20
|
+
storageSpec:
|
|
21
|
+
volumeClaimTemplate:
|
|
22
|
+
spec:
|
|
23
|
+
storageClassName: standard
|
|
24
|
+
accessModes: ["ReadWriteOnce"]
|
|
25
|
+
resources:
|
|
26
|
+
requests:
|
|
27
|
+
storage: 100Gi
|
|
28
|
+
|
|
29
|
+
# Scrape interval: 15s for application metrics, 30s for infrastructure
|
|
30
|
+
scrapeInterval: 15s
|
|
31
|
+
evaluationInterval: 15s
|
|
32
|
+
|
|
33
|
+
# Rule files from ConfigMap
|
|
34
|
+
ruleSelector:
|
|
35
|
+
matchLabels:
|
|
36
|
+
role: alert-rules
|
|
37
|
+
app: ecip
|
|
38
|
+
|
|
39
|
+
# Additional scrape configs loaded from a Secret (see name/key below)
|
|
40
|
+
additionalScrapeConfigsSecret:
|
|
41
|
+
enabled: true
|
|
42
|
+
name: ecip-scrape-configs
|
|
43
|
+
key: scrape-configs.yaml
|
|
44
|
+
|
|
45
|
+
# Extra Alertmanager configs (none — routing is defined under alertmanager.config below)
|
|
46
|
+
additionalAlertManagerConfigs: []
|
|
47
|
+
|
|
48
|
+
# Enable admin API for snapshot/compaction
|
|
49
|
+
enableAdminAPI: true
|
|
50
|
+
|
|
51
|
+
alertmanager:
|
|
52
|
+
enabled: true
|
|
53
|
+
alertmanagerSpec:
|
|
54
|
+
resources:
|
|
55
|
+
requests:
|
|
56
|
+
cpu: 100m
|
|
57
|
+
memory: 128Mi
|
|
58
|
+
limits:
|
|
59
|
+
cpu: 200m
|
|
60
|
+
memory: 256Mi
|
|
61
|
+
|
|
62
|
+
config:
|
|
63
|
+
global:
|
|
64
|
+
resolve_timeout: 5m
|
|
65
|
+
|
|
66
|
+
route:
|
|
67
|
+
group_by: ['alertname', 'module', 'severity']
|
|
68
|
+
group_wait: 30s
|
|
69
|
+
group_interval: 5m
|
|
70
|
+
repeat_interval: 4h
|
|
71
|
+
receiver: 'slack-warnings'
|
|
72
|
+
|
|
73
|
+
routes:
|
|
74
|
+
# Critical alerts → PagerDuty + Slack
|
|
75
|
+
- match:
|
|
76
|
+
severity: critical
|
|
77
|
+
receiver: 'pagerduty-critical'
|
|
78
|
+
continue: true
|
|
79
|
+
|
|
80
|
+
- match:
|
|
81
|
+
severity: critical
|
|
82
|
+
receiver: 'slack-critical'
|
|
83
|
+
|
|
84
|
+
# Security alerts → dedicated Slack channel
|
|
85
|
+
- match:
|
|
86
|
+
team: security
|
|
87
|
+
receiver: 'slack-security'
|
|
88
|
+
|
|
89
|
+
# Warning alerts → Slack only
|
|
90
|
+
- match:
|
|
91
|
+
severity: warning
|
|
92
|
+
receiver: 'slack-warnings'
|
|
93
|
+
|
|
94
|
+
receivers:
|
|
95
|
+
- name: 'pagerduty-critical'
|
|
96
|
+
pagerduty_configs:
|
|
97
|
+
- routing_key: '<PAGERDUTY_ROUTING_KEY>'
|
|
98
|
+
severity: critical
|
|
99
|
+
description: '{{ .GroupLabels.alertname }}: {{ .CommonAnnotations.summary }}'
|
|
100
|
+
|
|
101
|
+
- name: 'slack-critical'
|
|
102
|
+
slack_configs:
|
|
103
|
+
- api_url: '<SLACK_WEBHOOK_ALERTS>'
|
|
104
|
+
channel: '#ecip-alerts'
|
|
105
|
+
title: '🚨 CRITICAL: {{ .GroupLabels.alertname }}'
|
|
106
|
+
text: '{{ .CommonAnnotations.description }}'
|
|
107
|
+
send_resolved: true
|
|
108
|
+
|
|
109
|
+
- name: 'slack-warnings'
|
|
110
|
+
slack_configs:
|
|
111
|
+
- api_url: '<SLACK_WEBHOOK_ALERTS_WARN>'
|
|
112
|
+
channel: '#ecip-alerts-warn'
|
|
113
|
+
title: '⚠️ WARNING: {{ .GroupLabels.alertname }}'
|
|
114
|
+
text: '{{ .CommonAnnotations.description }}'
|
|
115
|
+
send_resolved: true
|
|
116
|
+
|
|
117
|
+
- name: 'slack-security'
|
|
118
|
+
slack_configs:
|
|
119
|
+
- api_url: '<SLACK_WEBHOOK_SECURITY>'
|
|
120
|
+
channel: '#ecip-security'
|
|
121
|
+
title: '🔒 SECURITY: {{ .GroupLabels.alertname }}'
|
|
122
|
+
text: '{{ .CommonAnnotations.description }}'
|
|
123
|
+
send_resolved: true
|
|
124
|
+
|
|
125
|
+
grafana:
|
|
126
|
+
enabled: true
|
|
127
|
+
adminPassword: admin  # dev/staging only — override via existingSecret or values.prod.yaml in production
|
|
128
|
+
persistence:
|
|
129
|
+
enabled: true
|
|
130
|
+
size: 10Gi
|
|
131
|
+
|
|
132
|
+
sidecar:
|
|
133
|
+
dashboards:
|
|
134
|
+
enabled: true
|
|
135
|
+
label: grafana_dashboard
|
|
136
|
+
folder: /var/lib/grafana/dashboards/ecip
|
|
137
|
+
provider:
|
|
138
|
+
foldersFromFilesStructure: false
|
|
139
|
+
folder: ECIP
|
|
140
|
+
|
|
141
|
+
datasources:
|
|
142
|
+
datasources.yaml:
|
|
143
|
+
apiVersion: 1
|
|
144
|
+
datasources:
|
|
145
|
+
- name: Prometheus
|
|
146
|
+
type: prometheus
|
|
147
|
+
url: http://prometheus-operated:9090
|
|
148
|
+
access: proxy
|
|
149
|
+
isDefault: true
|
|
150
|
+
|
|
151
|
+
- name: Tempo
|
|
152
|
+
type: tempo
|
|
153
|
+
url: http://tempo:3200
|
|
154
|
+
access: proxy
|
|
155
|
+
jsonData:
|
|
156
|
+
tracesToMetrics:
|
|
157
|
+
datasourceUid: prometheus
|
|
158
|
+
nodeGraph:
|
|
159
|
+
enabled: true
|
|
160
|
+
serviceMap:
|
|
161
|
+
datasourceUid: prometheus
|
|
162
|
+
|
|
163
|
+
- name: Elasticsearch
|
|
164
|
+
type: elasticsearch
|
|
165
|
+
url: http://elasticsearch:9200
|
|
166
|
+
access: proxy
|
|
167
|
+
jsonData:
|
|
168
|
+
index: ecip-security-events-*
|
|
169
|
+
timeField: "@timestamp"
|
|
170
|
+
esVersion: "8.0.0"
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# ECIP M08 — Prometheus Recording Rules
|
|
3
|
+
# =============================================================================
|
|
4
|
+
# Pre-computed rate/ratio metrics for dashboard performance.
|
|
5
|
+
# Without these, every Grafana query recalculates from raw histograms.
|
|
6
|
+
# =============================================================================
|
|
7
|
+
groups:
|
|
8
|
+
- name: ecip.recording.latency
|
|
9
|
+
interval: 30s
|
|
10
|
+
rules:
|
|
11
|
+
# Query latency quantiles (pre-computed)
|
|
12
|
+
- record: ecip:query_duration_ms:p50
|
|
13
|
+
expr: >
|
|
14
|
+
histogram_quantile(0.50,
|
|
15
|
+
sum(rate(query_duration_ms_bucket{job="ecip-query-service"}[5m])) by (le)
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
- record: ecip:query_duration_ms:p95
|
|
19
|
+
expr: >
|
|
20
|
+
histogram_quantile(0.95,
|
|
21
|
+
sum(rate(query_duration_ms_bucket{job="ecip-query-service"}[5m])) by (le)
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
- record: ecip:query_duration_ms:p99
|
|
25
|
+
expr: >
|
|
26
|
+
histogram_quantile(0.99,
|
|
27
|
+
sum(rate(query_duration_ms_bucket{job="ecip-query-service"}[5m])) by (le)
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
# Analysis duration quantiles
|
|
31
|
+
- record: ecip:analysis_duration_ms:p50
|
|
32
|
+
expr: >
|
|
33
|
+
histogram_quantile(0.50,
|
|
34
|
+
sum(rate(analysis_duration_ms_bucket{job="ecip-analysis-engine"}[5m])) by (le)
|
|
35
|
+
)
|
|
36
|
+
|
|
37
|
+
- record: ecip:analysis_duration_ms:p95
|
|
38
|
+
expr: >
|
|
39
|
+
histogram_quantile(0.95,
|
|
40
|
+
sum(rate(analysis_duration_ms_bucket{job="ecip-analysis-engine"}[5m])) by (le)
|
|
41
|
+
)
|
|
42
|
+
|
|
43
|
+
# MCP call latency
|
|
44
|
+
- record: ecip:mcp_call_duration_ms:p95
|
|
45
|
+
expr: >
|
|
46
|
+
histogram_quantile(0.95,
|
|
47
|
+
sum(rate(mcp_call_duration_ms_bucket[5m])) by (le, tool_name)
|
|
48
|
+
)
|
|
49
|
+
|
|
50
|
+
# gRPC request latency
|
|
51
|
+
- record: ecip:grpc_request_duration_ms:p95
|
|
52
|
+
expr: >
|
|
53
|
+
histogram_quantile(0.95,
|
|
54
|
+
sum(rate(grpc_request_duration_ms_bucket[5m])) by (le, service, method)
|
|
55
|
+
)
|
|
56
|
+
|
|
57
|
+
# Knowledge store write latency
|
|
58
|
+
- record: ecip:knowledge_store_write_duration_ms:p95
|
|
59
|
+
expr: >
|
|
60
|
+
histogram_quantile(0.95,
|
|
61
|
+
sum(rate(knowledge_store_write_duration_ms_bucket[5m])) by (le, store_type)
|
|
62
|
+
)
|
|
63
|
+
|
|
64
|
+
- name: ecip.recording.rates
|
|
65
|
+
interval: 30s
|
|
66
|
+
rules:
|
|
67
|
+
# Query throughput (requests/sec)
|
|
68
|
+
- record: ecip:query_rate:rps
|
|
69
|
+
expr: >
|
|
70
|
+
sum(rate(query_duration_ms_count{job="ecip-query-service"}[5m])) by (mode)
|
|
71
|
+
|
|
72
|
+
# Analysis throughput (events/sec)
|
|
73
|
+
- record: ecip:analysis_rate:rps
|
|
74
|
+
expr: >
|
|
75
|
+
sum(rate(analysis_duration_ms_count{job="ecip-analysis-engine"}[5m]))
|
|
76
|
+
|
|
77
|
+
# Error rates
|
|
78
|
+
- record: ecip:query_error_rate:ratio
|
|
79
|
+
expr: >
|
|
80
|
+
sum(rate(query_duration_ms_count{job="ecip-query-service",status_code=~"5.."}[5m]))
|
|
81
|
+
/
|
|
82
|
+
sum(rate(query_duration_ms_count{job="ecip-query-service"}[5m]))
|
|
83
|
+
|
|
84
|
+
# LSP daemon restart rate (per hour)
|
|
85
|
+
- record: ecip:lsp_daemon_restart_rate:per_hour
|
|
86
|
+
expr: >
|
|
87
|
+
sum(increase(lsp_daemon_restarts_total[1h])) by (repo, language)
|
|
88
|
+
|
|
89
|
+
# Auth failure rate
|
|
90
|
+
- record: ecip:auth_failure_rate:5m
|
|
91
|
+
expr: >
|
|
92
|
+
sum(increase(auth_failure_total[5m])) by (reason)
|
|
93
|
+
|
|
94
|
+
# RBAC denial rate
|
|
95
|
+
- record: ecip:rbac_denial_rate:5m
|
|
96
|
+
expr: >
|
|
97
|
+
sum(increase(rbac_denial_total[5m])) by (resource, action)
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# ECIP M08 — Prometheus Scrape Configs
|
|
3
|
+
# =============================================================================
|
|
4
|
+
# Per-service scrape targets. Updated as modules come online.
|
|
5
|
+
# Loaded into Prometheus via Secret/ConfigMap reference.
|
|
6
|
+
# =============================================================================
|
|
7
|
+
|
|
8
|
+
# --- M01: API Gateway ---
|
|
9
|
+
- job_name: ecip-api-gateway
|
|
10
|
+
scrape_interval: 15s
|
|
11
|
+
kubernetes_sd_configs:
|
|
12
|
+
- role: pod
|
|
13
|
+
namespaces:
|
|
14
|
+
names: [ecip]
|
|
15
|
+
relabel_configs:
|
|
16
|
+
- source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
|
|
17
|
+
regex: ecip-api-gateway
|
|
18
|
+
action: keep
|
|
19
|
+
- source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
|
|
20
|
+
target_label: __address__
|
|
21
|
+
regex: ([^:]+)(?::\d+)?;(\d+)
|
|
22
|
+
replacement: $1:$2
|
|
23
|
+
- target_label: module
|
|
24
|
+
replacement: M01
|
|
25
|
+
|
|
26
|
+
# --- M02: Analysis Engine ---
|
|
27
|
+
- job_name: ecip-analysis-engine
|
|
28
|
+
scrape_interval: 15s
|
|
29
|
+
kubernetes_sd_configs:
|
|
30
|
+
- role: pod
|
|
31
|
+
namespaces:
|
|
32
|
+
names: [ecip]
|
|
33
|
+
relabel_configs:
|
|
34
|
+
- source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
|
|
35
|
+
regex: ecip-analysis-engine
|
|
36
|
+
action: keep
|
|
37
|
+
- target_label: module
|
|
38
|
+
replacement: M02
|
|
39
|
+
|
|
40
|
+
# --- M03: Knowledge Store ---
|
|
41
|
+
- job_name: ecip-knowledge-store
|
|
42
|
+
scrape_interval: 15s
|
|
43
|
+
kubernetes_sd_configs:
|
|
44
|
+
- role: pod
|
|
45
|
+
namespaces:
|
|
46
|
+
names: [ecip]
|
|
47
|
+
relabel_configs:
|
|
48
|
+
- source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
|
|
49
|
+
regex: ecip-knowledge-store
|
|
50
|
+
action: keep
|
|
51
|
+
- target_label: module
|
|
52
|
+
replacement: M03
|
|
53
|
+
|
|
54
|
+
# --- M04: Query Service ---
|
|
55
|
+
- job_name: ecip-query-service
|
|
56
|
+
scrape_interval: 15s
|
|
57
|
+
kubernetes_sd_configs:
|
|
58
|
+
- role: pod
|
|
59
|
+
namespaces:
|
|
60
|
+
names: [ecip]
|
|
61
|
+
relabel_configs:
|
|
62
|
+
- source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
|
|
63
|
+
regex: ecip-query-service
|
|
64
|
+
action: keep
|
|
65
|
+
- target_label: module
|
|
66
|
+
replacement: M04
|
|
67
|
+
|
|
68
|
+
# --- M05: MCP Server ---
|
|
69
|
+
- job_name: ecip-mcp-server
|
|
70
|
+
scrape_interval: 15s
|
|
71
|
+
kubernetes_sd_configs:
|
|
72
|
+
- role: pod
|
|
73
|
+
namespaces:
|
|
74
|
+
names: [ecip]
|
|
75
|
+
relabel_configs:
|
|
76
|
+
- source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
|
|
77
|
+
regex: ecip-mcp-server
|
|
78
|
+
action: keep
|
|
79
|
+
- target_label: module
|
|
80
|
+
replacement: M05
|
|
81
|
+
|
|
82
|
+
# --- M06: Registry Service ---
|
|
83
|
+
- job_name: ecip-registry-service
|
|
84
|
+
scrape_interval: 15s
|
|
85
|
+
kubernetes_sd_configs:
|
|
86
|
+
- role: pod
|
|
87
|
+
namespaces:
|
|
88
|
+
names: [ecip]
|
|
89
|
+
relabel_configs:
|
|
90
|
+
- source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
|
|
91
|
+
regex: ecip-registry-service
|
|
92
|
+
action: keep
|
|
93
|
+
- target_label: module
|
|
94
|
+
replacement: M06
|
|
95
|
+
|
|
96
|
+
# --- M07: Event Bus ---
|
|
97
|
+
- job_name: ecip-event-bus
|
|
98
|
+
scrape_interval: 15s
|
|
99
|
+
kubernetes_sd_configs:
|
|
100
|
+
- role: pod
|
|
101
|
+
namespaces:
|
|
102
|
+
names: [ecip]
|
|
103
|
+
relabel_configs:
|
|
104
|
+
- source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
|
|
105
|
+
regex: ecip-event-bus
|
|
106
|
+
action: keep
|
|
107
|
+
- target_label: module
|
|
108
|
+
replacement: M07
|
|
109
|
+
|
|
110
|
+
# --- OTel Collector self-metrics ---
|
|
111
|
+
- job_name: otel-collector
|
|
112
|
+
scrape_interval: 30s
|
|
113
|
+
kubernetes_sd_configs:
|
|
114
|
+
- role: pod
|
|
115
|
+
namespaces:
|
|
116
|
+
names: [monitoring]
|
|
117
|
+
relabel_configs:
|
|
118
|
+
- source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
|
|
119
|
+
regex: otel-collector
|
|
120
|
+
action: keep
|
|
121
|
+
- target_label: module
|
|
122
|
+
replacement: M08
|
|
@@ -0,0 +1,239 @@
|
|
|
1
|
+
# SDK Integration Guide — `@ecip/observability`
|
|
2
|
+
|
|
3
|
+
> **Priority:** P0 — Must be completed in Week 1
|
|
4
|
+
> **Audience:** All ECIP module teams
|
|
5
|
+
> **Time to integrate:** ≤ 30 minutes per service
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## Overview
|
|
10
|
+
|
|
11
|
+
Every ECIP service **must** emit structured logs, traces, and metrics through the `@ecip/observability` SDK (Node.js/TypeScript) or the `ecip-observability` package (Python). Direct use of `console.log`, `print()`, raw Pino, or raw OpenTelemetry SDK is prohibited in production code paths.
|
|
12
|
+
|
|
13
|
+
The CI gate (M08-T10) blocks PRs that do not satisfy integration requirements.
|
|
14
|
+
|
|
15
|
+
---
|
|
16
|
+
|
|
17
|
+
## Node.js / TypeScript (M01, M03, M04, M05, M06, M07)
|
|
18
|
+
|
|
19
|
+
### Step 1 — Install
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
npm install @ecip/observability @opentelemetry/sdk-node
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
### Step 2 — Initialize Tracer (BEFORE all other imports)
|
|
26
|
+
|
|
27
|
+
Create `src/instrument.ts` as the **very first imported file** in your entry point:
|
|
28
|
+
|
|
29
|
+
```typescript
|
|
30
|
+
// src/instrument.ts
|
|
31
|
+
import { initTracer } from '@ecip/observability';
|
|
32
|
+
|
|
33
|
+
initTracer({
|
|
34
|
+
serviceName: 'ecip-api-gateway', // use your module name
|
|
35
|
+
otlpEndpoint: process.env.OTEL_EXPORTER_OTLP_ENDPOINT,
|
|
36
|
+
});
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
In your entry `src/server.ts` or `src/index.ts`:
|
|
40
|
+
|
|
41
|
+
```typescript
|
|
42
|
+
import './instrument'; // MUST be the first import
|
|
43
|
+
import express from 'express';
|
|
44
|
+
// ... rest of your imports
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
### Step 3 — Create Logger with Mandatory Fields
|
|
48
|
+
|
|
49
|
+
```typescript
|
|
50
|
+
import { createLogger } from '@ecip/observability';
|
|
51
|
+
|
|
52
|
+
// All four fields are REQUIRED at compile time
|
|
53
|
+
const log = createLogger({
|
|
54
|
+
repo: 'acme-corp/auth-service',
|
|
55
|
+
branch: 'main',
|
|
56
|
+
user_id: ctx.userId,
|
|
57
|
+
module: 'M01',
|
|
58
|
+
});
|
|
59
|
+
|
|
60
|
+
log.info({ duration_ms: 43, cached: false }, 'Request handled');
|
|
61
|
+
log.warn({ queue_depth: 150 }, 'Backlog growing');
|
|
62
|
+
log.error({ err }, 'Unhandled exception');
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
> **TypeScript will fail compilation** if any of `repo`, `branch`, `user_id`, or `module` is missing.
|
|
66
|
+
|
|
67
|
+
### Step 4 — Add Trace Middleware
|
|
68
|
+
|
|
69
|
+
```typescript
|
|
70
|
+
import { traceMiddleware } from '@ecip/observability';
|
|
71
|
+
|
|
72
|
+
const app = express();
|
|
73
|
+
app.use(traceMiddleware());
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
This automatically:
|
|
77
|
+
- Creates a span for every incoming request
|
|
78
|
+
- Propagates W3C `traceparent` headers
|
|
79
|
+
- Injects `trace_id` and `span_id` into all log lines
|
|
80
|
+
|
|
81
|
+
### Step 5 — Emit Security Events
|
|
82
|
+
|
|
83
|
+
```typescript
|
|
84
|
+
import { emitAuthFailure, emitRbacDenial } from '@ecip/observability';
|
|
85
|
+
|
|
86
|
+
// Authentication failure (e.g., invalid JWT)
|
|
87
|
+
emitAuthFailure({
|
|
88
|
+
userId: ctx.userId,
|
|
89
|
+
sourceIp: req.ip,
|
|
90
|
+
method: req.method,
|
|
91
|
+
path: req.path,
|
|
92
|
+
reason: 'jwt_expired',
|
|
93
|
+
});
|
|
94
|
+
|
|
95
|
+
// RBAC denial (valid auth, insufficient permissions)
|
|
96
|
+
emitRbacDenial({
|
|
97
|
+
userId: ctx.userId,
|
|
98
|
+
resource: repoId,
|
|
99
|
+
action: 'write',
|
|
100
|
+
reason: 'rbac_insufficient_role',
|
|
101
|
+
});
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
> Security events route to a **dedicated** Elasticsearch index via a separate OTel pipeline. Never log security events through the general logger.
|
|
105
|
+
|
|
106
|
+
### Step 6 — Custom Spans (Optional)
|
|
107
|
+
|
|
108
|
+
```typescript
|
|
109
|
+
import { withSpan } from '@ecip/observability';
|
|
110
|
+
|
|
111
|
+
const result = await withSpan('knowledge-store.lookup', async (span) => {
|
|
112
|
+
span.setAttribute('repo', repoId);
|
|
113
|
+
const data = await ksClient.lookup(repoId, query);
|
|
114
|
+
span.setAttribute('result_count', data.length);
|
|
115
|
+
return data;
|
|
116
|
+
});
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
---
|
|
120
|
+
|
|
121
|
+
## Python (M02 — Analysis Engine)
|
|
122
|
+
|
|
123
|
+
### Step 1 — Install
|
|
124
|
+
|
|
125
|
+
```bash
|
|
126
|
+
pip install ecip-observability opentelemetry-sdk
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
### Step 2 — Initialize Tracer
|
|
130
|
+
|
|
131
|
+
```python
|
|
132
|
+
# At the very top of your __main__ or entry point
|
|
133
|
+
from ecip_observability import init_tracer, get_logger
|
|
134
|
+
|
|
135
|
+
init_tracer(service_name="ecip-analysis-engine")
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
### Step 3 — Create Logger
|
|
139
|
+
|
|
140
|
+
```python
|
|
141
|
+
log = get_logger(
|
|
142
|
+
repo="acme-corp/auth-service",
|
|
143
|
+
branch="main",
|
|
144
|
+
user_id=user_id,
|
|
145
|
+
module="M02",
|
|
146
|
+
)
|
|
147
|
+
|
|
148
|
+
log.info("Analysis complete", duration_ms=14200, files_indexed=47)
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
> `get_logger()` raises `MissingObservabilityContext` if any required field is missing.
|
|
152
|
+
|
|
153
|
+
### Step 4 — Auto-Span Decorator
|
|
154
|
+
|
|
155
|
+
```python
|
|
156
|
+
from ecip_observability import traced
|
|
157
|
+
|
|
158
|
+
@traced(name="lsp.symbol_extraction")
|
|
159
|
+
def extract_symbols(file_path: str) -> list:
|
|
160
|
+
# Span automatically started/ended
|
|
161
|
+
# Exceptions auto-captured as span errors
|
|
162
|
+
...
|
|
163
|
+
|
|
164
|
+
# Also works with async functions
|
|
165
|
+
@traced(name="lsp.hover_info")
|
|
166
|
+
async def get_hover_info(file_path: str, line: int) -> dict:
|
|
167
|
+
...
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
---
|
|
171
|
+
|
|
172
|
+
## Helm Value Injection
|
|
173
|
+
|
|
174
|
+
All OTel configuration is injected by the Helm chart. **Module teams do not hardcode endpoints.**
|
|
175
|
+
|
|
176
|
+
Add this to your module's Helm `values.yaml`:
|
|
177
|
+
|
|
178
|
+
```yaml
|
|
179
|
+
env:
|
|
180
|
+
- name: OTEL_EXPORTER_OTLP_ENDPOINT
|
|
181
|
+
value: "http://otel-collector.monitoring:4318"
|
|
182
|
+
- name: OTEL_SERVICE_NAME
|
|
183
|
+
valueFrom:
|
|
184
|
+
fieldRef:
|
|
185
|
+
fieldPath: metadata.labels['app.kubernetes.io/name']
|
|
186
|
+
- name: OTEL_RESOURCE_ATTRIBUTES
|
|
187
|
+
value: "ecip.module=M04,deployment.environment=production"
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
---
|
|
191
|
+
|
|
192
|
+
## CI Gate Requirements
|
|
193
|
+
|
|
194
|
+
A module PR is **blocked from merging** unless all of the following are true:
|
|
195
|
+
|
|
196
|
+
| # | Requirement | Enforcement |
|
|
197
|
+
|---|---|---|
|
|
198
|
+
| 1 | `@ecip/observability` or `ecip-observability` in dependency list | Package manifest check |
|
|
199
|
+
| 2 | `initTracer()` called before the service starts accepting requests | Startup integration test |
|
|
200
|
+
| 3 | At least one log line per request handler uses `createLogger()` with all mandatory fields | ESLint rule (Node.js), Ruff rule (Python) |
|
|
201
|
+
| 4 | No `console.log` or `print()` in production code paths | ESLint `no-console` / Ruff `T201` |
|
|
202
|
+
|
|
203
|
+
---
|
|
204
|
+
|
|
205
|
+
## Troubleshooting
|
|
206
|
+
|
|
207
|
+
### "Traces not showing up in Tempo"
|
|
208
|
+
|
|
209
|
+
1. Check collector health: `curl http://otel-collector.monitoring:13133/healthz`
|
|
210
|
+
2. Check zpages: `http://otel-collector.monitoring:55679/debug/tracez`
|
|
211
|
+
3. Verify `OTEL_EXPORTER_OTLP_ENDPOINT` is set in your pod
|
|
212
|
+
4. Remember: only 5% of healthy traces are sampled. Error traces are always sampled.
|
|
213
|
+
|
|
214
|
+
### "Logger TypeScript compilation error"
|
|
215
|
+
|
|
216
|
+
You're missing a mandatory field. All four fields are required:
|
|
217
|
+
```typescript
|
|
218
|
+
createLogger({ repo, branch, user_id, module })
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
### "Security events not in Elasticsearch"
|
|
222
|
+
|
|
223
|
+
Security events use a separate OTel log provider. Ensure you're using `emitAuthFailure()` / `emitRbacDenial()`, not the general logger.
|
|
224
|
+
|
|
225
|
+
---
|
|
226
|
+
|
|
227
|
+
## Collector Endpoints
|
|
228
|
+
|
|
229
|
+
| Protocol | Endpoint | Use Case |
|
|
230
|
+
|---|---|---|
|
|
231
|
+
| gRPC | `otel-collector.monitoring:4317` | Default for SDK auto-instrumentation |
|
|
232
|
+
| HTTP | `otel-collector.monitoring:4318` | SDKs that don't support gRPC |
|
|
233
|
+
| Health | `otel-collector.monitoring:13133` | Liveness/readiness probes |
|
|
234
|
+
| Metrics | `otel-collector.monitoring:8888` | Collector self-monitoring |
|
|
235
|
+
| zPages | `otel-collector.monitoring:55679` | Debug trace/span pipelines |
|
|
236
|
+
|
|
237
|
+
---
|
|
238
|
+
|
|
239
|
+
*Last updated: March 2026 · Platform Team*
|