ecip-observability-stack 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. package/CLAUDE.md +48 -0
  2. package/README.md +75 -0
  3. package/alerts/analysis-backlog.yaml +39 -0
  4. package/alerts/cache-degradation.yaml +44 -0
  5. package/alerts/dlq-depth.yaml +56 -0
  6. package/alerts/lsp-daemon.yaml +43 -0
  7. package/alerts/mcp-latency.yaml +46 -0
  8. package/alerts/security-anomaly.yaml +59 -0
  9. package/alerts/sla-latency.yaml +61 -0
  10. package/chaos/kafka-broker-restart.sh +168 -0
  11. package/chaos/kill-lsp-daemon.sh +148 -0
  12. package/chaos/redis-node-failure.sh +318 -0
  13. package/ci/check-observability-contract.js +285 -0
  14. package/ci/eslint-plugin-ecip/index.js +209 -0
  15. package/ci/eslint-plugin-ecip/package.json +12 -0
  16. package/ci/github-actions-observability-gate.yaml +180 -0
  17. package/ci/ruff-shared.toml +41 -0
  18. package/collector/otel-collector-config.yaml +226 -0
  19. package/collector/otel-collector-daemonset.yaml +168 -0
  20. package/collector/sampling-config.yaml +83 -0
  21. package/dashboards/_provisioning/grafana-dashboards.yaml +16 -0
  22. package/dashboards/analysis-throughput.json +166 -0
  23. package/dashboards/cache-performance.json +129 -0
  24. package/dashboards/cross-repo-fanout.json +93 -0
  25. package/dashboards/event-bus-dlq.json +129 -0
  26. package/dashboards/lsp-daemon-health.json +104 -0
  27. package/dashboards/mcp-call-graph.json +114 -0
  28. package/dashboards/query-latency.json +160 -0
  29. package/dashboards/security-events.json +131 -0
  30. package/docs/M08-Observability-Design.md +639 -0
  31. package/docs/PROGRESS.md +375 -0
  32. package/docs/module-documentation.md +64 -0
  33. package/elasticsearch/ilm-policy.json +57 -0
  34. package/elasticsearch/index-template.json +62 -0
  35. package/elasticsearch/kibana-space.yaml +53 -0
  36. package/helm/Chart.yaml +30 -0
  37. package/helm/templates/configmaps.yaml +25 -0
  38. package/helm/templates/elasticsearch.yaml +68 -0
  39. package/helm/templates/grafana-secret.yaml +22 -0
  40. package/helm/templates/grafana.yaml +19 -0
  41. package/helm/templates/loki.yaml +33 -0
  42. package/helm/templates/otel-collector.yaml +119 -0
  43. package/helm/templates/prometheus.yaml +43 -0
  44. package/helm/templates/tempo.yaml +16 -0
  45. package/helm/values.prod.yaml +159 -0
  46. package/helm/values.yaml +146 -0
  47. package/logging-lib/nodejs/package.json +57 -0
  48. package/logging-lib/nodejs/pnpm-lock.yaml +4576 -0
  49. package/logging-lib/python/pyproject.toml +45 -0
  50. package/logging-lib/python/src/__init__.py +19 -0
  51. package/logging-lib/python/src/logger.py +131 -0
  52. package/logging-lib/python/src/security_events.py +150 -0
  53. package/logging-lib/python/src/tracer.py +185 -0
  54. package/logging-lib/python/tests/test_logger.py +113 -0
  55. package/package.json +21 -0
  56. package/prometheus/prometheus-values.yaml +170 -0
  57. package/prometheus/recording-rules.yaml +97 -0
  58. package/prometheus/scrape-configs.yaml +122 -0
  59. package/runbooks/SDK-INTEGRATION.md +239 -0
  60. package/runbooks/alert-response/ANALYSIS_BACKLOG.md +128 -0
  61. package/runbooks/alert-response/DLQ_DEPTH_EXCEEDED.md +150 -0
  62. package/runbooks/alert-response/HIGH_QUERY_LATENCY.md +134 -0
  63. package/runbooks/alert-response/LSP_DAEMON_RESTART.md +118 -0
  64. package/runbooks/alert-response/SECURITY_ANOMALY.md +160 -0
  65. package/runbooks/dashboard-guide.md +169 -0
  66. package/scripts/lint-dashboards.js +184 -0
  67. package/tempo/tempo-datasource.yaml +46 -0
  68. package/tempo/tempo-values.yaml +94 -0
  69. package/tests/alert-threshold-config.test.ts +283 -0
  70. package/tests/log-schema-validation.test.ts +246 -0
  71. package/tests/metric-label-validation.test.ts +292 -0
  72. package/tests/otel-pipeline-integration.test.ts +420 -0
  73. package/tests/security-events.test.ts +417 -0
  74. package/tsconfig.json +17 -0
  75. package/vitest.config.ts +21 -0
  76. package/vitest.integration.config.ts +9 -0
@@ -0,0 +1,41 @@
# ===========================================================================
# ECIP Shared Ruff Configuration
# ===========================================================================
# Enforces the M08 observability contract for Python modules (M02).
#
# Install in the consuming module's pyproject.toml:
#   [tool.ruff]
#   extend = "../ecip-observability-stack/ci/ruff-shared.toml"
# ===========================================================================

target-version = "py311"
line-length = 120

[lint]
select = [
    "E",   # pycodestyle errors
    "F",   # pyflakes
    "W",   # pycodestyle warnings
    "I",   # isort
    "N",   # pep8-naming
    "UP",  # pyupgrade
    "B",   # flake8-bugbear
    "A",   # flake8-builtins
    "C4",  # flake8-comprehensions
    "SIM", # flake8-simplify
    "T20", # flake8-print — BANS print() in production code
]

# T20 = flake8-print: flags any use of print() or pprint().
# This is the Python equivalent of the ESLint console.log ban.
# Modules must use ecip_observability.get_logger() instead.

[lint.per-file-ignores]
# Allow print() in test files and scripts.
"**/tests/**" = ["T20"]
"**/test/**" = ["T20"]
"**/scripts/**" = ["T20"]
"conftest.py" = ["T20"]

[lint.isort]
known-first-party = ["ecip_observability"]
@@ -0,0 +1,226 @@
# =============================================================================
# ECIP M08 — OpenTelemetry Collector Configuration
# =============================================================================
# CRITICAL FILE: An error here silently drops ALL observability data.
# All changes must be reviewed by the Platform team.
#
# Four separate pipelines:
#   traces        → Grafana Tempo
#   metrics       → Prometheus (remote write)
#   logs/security → Elasticsearch (security events ONLY — NFR-SEC-007)
#   logs          → Grafana Loki (general application logs — OD-01 resolved)
# =============================================================================

receivers:
  otlp:
    protocols:
      grpc:
        endpoint: 0.0.0.0:4317
      http:
        endpoint: 0.0.0.0:4318

  # Collector self-metrics. The telemetry endpoint (below) binds 0.0.0.0:8888;
  # scrape it over loopback — 0.0.0.0 is a bind address, not a dial target.
  prometheus:
    config:
      scrape_configs:
        - job_name: otel-collector
          scrape_interval: 15s
          static_configs:
            - targets: ["127.0.0.1:8888"]

processors:
  # Memory limiter — prevents OOM on the DaemonSet collector pod.
  # NOTE(review): limit_mib (512) currently equals the container memory limit
  # in otel-collector-daemonset.yaml (512Mi); the soft limit should sit below
  # the hard limit so it triggers before the kubelet OOM-kills — confirm sizing.
  memory_limiter:
    check_interval: 5s
    limit_mib: 512
    spike_limit_mib: 128

  # Batch processor — reduces network overhead
  batch:
    send_batch_size: 8192
    send_batch_max_size: 16384
    timeout: 5s

  # Enforce required ECIP span attributes.
  # ecip.module is derived from service.name. org/repo IDs are inserted with
  # an explicit "unknown" fallback ONLY when absent ("insert" never
  # overwrites), so missing attribution becomes visible instead of silent.
  # (The previous upsert of ecip.org_id from ecip.org_id was a no-op.)
  attributes/enforce:
    actions:
      - key: ecip.module
        action: upsert
        from_attribute: service.name
      - key: ecip.org_id
        action: insert
        value: "unknown"
      - key: ecip.repo_id
        action: insert
        value: "unknown"

  # Resource detection for Kubernetes metadata
  resourcedetection:
    detectors: [env, system, docker, gcp, ecs, ec2]
    timeout: 5s
    override: false

  k8sattributes:
    auth_type: "serviceAccount"
    passthrough: false
    extract:
      metadata:
        - k8s.pod.name
        - k8s.pod.uid
        - k8s.namespace.name
        - k8s.node.name
        - k8s.deployment.name

  # Tail-based sampling — decisions made after the full trace is assembled.
  # Keep in sync with sampling-config.yaml (the documented, authoritative copy
  # merged via a second --config flag; its policies list supersedes this one).
  tail_sampling:
    decision_wait: 10s
    num_traces: 100000
    expected_new_traces_per_sec: 10000
    policies:
      # Rule 1: Always sample error traces (100%)
      - name: errors-always-sample
        type: status_code
        status_code:
          status_codes:
            - ERROR

      # Rule 2: Always sample slow traces > 1000ms (100%)
      - name: slow-queries-sample
        type: latency
        latency:
          threshold_ms: 1000

      # Rule 3: Default sampling — 5% of healthy traces
      - name: default-sample
        type: probabilistic
        probabilistic:
          sampling_percentage: 5

  # Filter: only security events reach the Elasticsearch logs pipeline.
  # A strict include listing the SAME key twice can never match (all listed
  # attributes must match simultaneously), which silently dropped every
  # security event — match one key against a regexp instead.
  filter/security_events:
    logs:
      include:
        match_type: regexp
        record_attributes:
          - key: event.category
            value: ^(authentication|authorization)$

  # Promote resource attributes to Loki labels via the exporter hint
  # (the loki exporter's inline `labels:` config was removed upstream).
  resource/loki:
    attributes:
      - key: loki.resource.labels
        action: insert
        value: service.name, ecip.module, k8s.namespace.name

exporters:
  # Traces → Grafana Tempo
  otlp/tempo:
    endpoint: tempo.monitoring:4317
    tls:
      insecure: false
      ca_file: /etc/ssl/certs/ca-certificates.crt
    retry_on_failure:
      enabled: true
      initial_interval: 5s
      max_interval: 30s
      max_elapsed_time: 300s

  # Metrics → Prometheus (via remote write)
  prometheusremotewrite:
    endpoint: http://prometheus.monitoring:9090/api/v1/write
    tls:
      insecure: true
    resource_to_telemetry_conversion:
      enabled: true

  # Security logs → Elasticsearch
  elasticsearch/security:
    endpoints:
      - https://elasticsearch.monitoring:9200
    logs_index: ecip-security-events
    tls:
      ca_file: /etc/ssl/certs/ca-certificates.crt
    retry:
      enabled: true
      initial_interval: 5s
      max_interval: 60s
    flush:
      bytes: 5000000
      interval: 10s

  # General application logs → Grafana Loki (OD-01 resolution).
  # Labels are selected by the loki.resource.labels hint set in the
  # resource/loki processor; tenant is sent via the X-Scope-OrgID header.
  loki:
    endpoint: http://loki-gateway.monitoring:3100/loki/api/v1/push
    headers:
      X-Scope-OrgID: ecip
    retry_on_failure:
      enabled: true
      initial_interval: 5s
      max_interval: 30s

  # Debug exporter — wire into a pipeline only in dev/staging.
  # (Replaces the deprecated `logging` exporter and its `loglevel` option.)
  debug:
    verbosity: basic

extensions:
  health_check:
    endpoint: 0.0.0.0:13133

  zpages:
    endpoint: 0.0.0.0:55679

  pprof:
    endpoint: 0.0.0.0:1777

service:
  extensions: [health_check, zpages, pprof]

  pipelines:
    # Traces pipeline: OTLP → process → Tempo
    traces:
      receivers: [otlp]
      processors:
        - memory_limiter
        - k8sattributes
        - resourcedetection
        - attributes/enforce
        - tail_sampling
        - batch
      exporters: [otlp/tempo]

    # Metrics pipeline: OTLP → process → Prometheus
    metrics:
      receivers: [otlp, prometheus]
      processors:
        - memory_limiter
        - k8sattributes
        - resourcedetection
        - batch
      exporters: [prometheusremotewrite]

    # Logs pipeline (security): OTLP → filter security events → Elasticsearch
    logs/security:
      receivers: [otlp]
      processors:
        - memory_limiter
        - filter/security_events
        - batch
      exporters: [elasticsearch/security]

    # Logs pipeline (general): OTLP → all application logs → Loki
    logs:
      receivers: [otlp]
      processors:
        - memory_limiter
        - k8sattributes
        - resourcedetection
        - resource/loki
        - batch
      exporters: [loki]

  telemetry:
    logs:
      level: info
    metrics:
      address: 0.0.0.0:8888
@@ -0,0 +1,168 @@
# =============================================================================
# ECIP M08 — OTel Collector DaemonSet
# =============================================================================
# DaemonSet topology: one collector pod per node.
# Pods communicate via localhost — no cross-node span transit on the hot path.
# =============================================================================
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: otel-collector
  namespace: monitoring
  labels:
    app.kubernetes.io/name: otel-collector
    app.kubernetes.io/component: observability
    app.kubernetes.io/part-of: ecip
    ecip.module: M08
spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: otel-collector
  updateStrategy:
    type: RollingUpdate
    rollingUpdate:
      maxUnavailable: 1
  template:
    metadata:
      labels:
        app.kubernetes.io/name: otel-collector
        ecip.module: M08
      annotations:
        prometheus.io/scrape: "true"
        prometheus.io/port: "8888"
        prometheus.io/path: "/metrics"
    spec:
      serviceAccountName: otel-collector
      containers:
        - name: otel-collector
          image: otel/opentelemetry-collector-contrib:0.96.0
          args:
            - --config=/etc/otel/otel-collector-config.yaml
          # hostPort exposes OTLP on every node so workloads can send
          # telemetry to the node-local collector.
          ports:
            - name: otlp-grpc
              containerPort: 4317
              hostPort: 4317
              protocol: TCP
            - name: otlp-http
              containerPort: 4318
              hostPort: 4318
              protocol: TCP
            - name: health
              containerPort: 13133
              protocol: TCP
            - name: metrics
              containerPort: 8888
              protocol: TCP
            - name: zpages
              containerPort: 55679
              protocol: TCP
          # NOTE(review): the collector config's memory_limiter limit_mib (512)
          # equals this hard limit (512Mi); the soft limit should normally sit
          # below the container limit — confirm sizing after load testing.
          resources:
            requests:
              cpu: 200m
              memory: 256Mi
            limits:
              cpu: 1000m
              memory: 512Mi
          volumeMounts:
            - name: collector-config
              mountPath: /etc/otel
              readOnly: true
            - name: tls-certs
              mountPath: /etc/ssl/certs
              readOnly: true
          # Both probes hit the health_check extension on port 13133.
          livenessProbe:
            httpGet:
              path: /
              port: 13133
            initialDelaySeconds: 10
            periodSeconds: 15
            timeoutSeconds: 5
          readinessProbe:
            httpGet:
              path: /
              port: 13133
            initialDelaySeconds: 5
            periodSeconds: 10
            timeoutSeconds: 3
          env:
            - name: K8S_NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
            - name: K8S_POD_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
            - name: K8S_NAMESPACE
              valueFrom:
                fieldRef:
                  fieldPath: metadata.namespace
      volumes:
        - name: collector-config
          configMap:
            name: otel-collector-config
        - name: tls-certs
          secret:
            secretName: otel-collector-tls
      # Schedule onto every node, including tainted ones.
      tolerations:
        - effect: NoSchedule
          operator: Exists
      terminationGracePeriodSeconds: 30
---
# Stable ClusterIP for consumers that cannot use the node-local hostPort.
apiVersion: v1
kind: Service
metadata:
  name: otel-collector
  namespace: monitoring
  labels:
    app.kubernetes.io/name: otel-collector
spec:
  type: ClusterIP
  ports:
    - name: otlp-grpc
      port: 4317
      targetPort: 4317
      protocol: TCP
    - name: otlp-http
      port: 4318
      targetPort: 4318
      protocol: TCP
    - name: metrics
      port: 8888
      targetPort: 8888
      protocol: TCP
  selector:
    app.kubernetes.io/name: otel-collector
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: otel-collector
  namespace: monitoring
  labels:
    app.kubernetes.io/name: otel-collector
---
# Read-only cluster access required by the k8sattributes processor.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: otel-collector
rules:
  - apiGroups: [""]
    resources: ["pods", "namespaces", "nodes"]
    verbs: ["get", "list", "watch"]
  - apiGroups: ["apps"]
    resources: ["deployments", "replicasets", "daemonsets", "statefulsets"]
    verbs: ["get", "list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: otel-collector
subjects:
  - kind: ServiceAccount
    name: otel-collector
    namespace: monitoring
roleRef:
  kind: ClusterRole
  name: otel-collector
  apiGroup: rbac.authorization.k8s.io
@@ -0,0 +1,83 @@
# =============================================================================
# ECIP M08 — Tail-Based Sampling Configuration
# =============================================================================
# Sampling decisions are made AFTER the full trace is assembled.
# This ensures error traces are always captured regardless of sample rate.
#
# The tail_sampling processor evaluates ALL policies for every trace; a trace
# is kept if ANY policy votes to sample it. The order below is documentation
# priority only — it is NOT first-match-wins evaluation.
#
# This file is merged on top of otel-collector-config.yaml via a second
# --config flag, so everything must live under processors.tail_sampling
# (bare top-level keys like `decision_wait:` would not merge into the
# processor). NOTE: confmap replaces lists wholesale, so this `policies`
# list supersedes the inline one in the main config — keep it complete.
# =============================================================================

processors:
  tail_sampling:
    # How long to wait for all spans of a trace to arrive before making a
    # sampling decision. 10s is conservative — most ECIP traces complete in
    # < 2s even with MCP fan-out.
    decision_wait: 10s

    # Maximum number of traces held in memory pending a decision.
    # At 10K new traces/sec and 10s wait, this is ~100K traces in flight.
    num_traces: 100000

    # Expected trace arrival rate — used for memory pre-allocation.
    expected_new_traces_per_sec: 10000

    policies:
      # --- Priority 1: Always sample errors ---
      # Any trace containing a span with status ERROR is sampled at 100%.
      # This is the most critical rule — production debugging depends on it.
      - name: errors-always-sample
        type: status_code
        status_code:
          status_codes:
            - ERROR

      # --- Priority 2: Always sample slow traces ---
      # Any trace with end-to-end latency > 1000ms is sampled at 100%.
      # This catches SLA-breaching queries before alerts fire.
      - name: slow-queries-sample
        type: latency
        latency:
          threshold_ms: 1000

      # --- Priority 3: Always sample security events ---
      # Traces containing auth/RBAC-related spans are always captured.
      - name: security-events-sample
        type: string_attribute
        string_attribute:
          key: event.category
          values:
            - authentication
            - authorization

      # --- Priority 4: Sample 20% of LSP daemon operations ---
      # LSP operations are high-value for debugging but high-volume.
      # A plain string_attribute policy samples 100% of its matches, so the
      # 20% rate must be expressed as an AND composite of (module match,
      # probabilistic 20%). The previous config's dangling `probabilistic:`
      # block under a string_attribute policy was silently ignored.
      - name: lsp-operations-sample
        type: and
        and:
          and_sub_policy:
            - name: lsp-module-match
              type: string_attribute
              string_attribute:
                key: ecip.module
                values:
                  - M02
            - name: lsp-rate
              type: probabilistic
              probabilistic:
                sampling_percentage: 20

      # --- Default: 5% probabilistic sampling ---
      # All remaining traces are sampled at 5%.
      # At 10K traces/sec, this yields ~500 traces/sec to Tempo storage.
      - name: default-sample
        type: probabilistic
        probabilistic:
          sampling_percentage: 5

# =============================================================================
# Tuning notes (to be updated after Week 8 load testing):
#
# If Tempo storage grows faster than budget:
#   1. Reduce default from 5% → 2%
#   2. Reduce lsp-operations from 20% → 10%
#   3. NEVER reduce errors-always-sample below 100%
#
# If Collector memory exceeds limit_mib (512):
#   1. Reduce num_traces from 100K → 50K
#   2. Reduce decision_wait from 10s → 5s
#   3. Increase DaemonSet memory limit (last resort)
# =============================================================================
@@ -0,0 +1,16 @@
# Grafana Dashboard Provisioning — ECIP Observability
# Auto-provisions dashboards from disk via the Grafana sidecar.
apiVersion: 1

providers:
  - name: ecip-dashboards
    orgId: 1
    folder: ECIP
    type: file
    disableDeletion: false
    editable: true
    updateIntervalSeconds: 30
    allowUiUpdates: true
    options:
      path: /var/lib/grafana/dashboards/ecip
      foldersFromFilesStructure: false
@@ -0,0 +1,166 @@
1
+ {
2
+ "description": "ECIP Analysis Engine (M02) — Events processed, backlog, Kafka consumer lag",
3
+ "editable": true,
4
+ "panels": [
5
+ {
6
+ "title": "Analysis Throughput",
7
+ "type": "row",
8
+ "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
9
+ "collapsed": false
10
+ },
11
+ {
12
+ "title": "Analysis Duration p50 / p95",
13
+ "type": "timeseries",
14
+ "datasource": "Prometheus",
15
+ "gridPos": { "h": 8, "w": 12, "x": 0, "y": 1 },
16
+ "fieldConfig": {
17
+ "defaults": {
18
+ "unit": "ms",
19
+ "thresholds": {
20
+ "mode": "absolute",
21
+ "steps": [
22
+ { "color": "green", "value": null },
23
+ { "color": "yellow", "value": 60000 },
24
+ { "color": "red", "value": 120000 }
25
+ ]
26
+ }
27
+ }
28
+ },
29
+ "targets": [
30
+ {
31
+ "expr": "histogram_quantile(0.50, sum(rate(analysis_duration_ms_bucket{job=\"ecip-analysis-engine\"}[5m])) by (le))",
32
+ "legendFormat": "p50"
33
+ },
34
+ {
35
+ "expr": "histogram_quantile(0.95, sum(rate(analysis_duration_ms_bucket{job=\"ecip-analysis-engine\"}[5m])) by (le))",
36
+ "legendFormat": "p95"
37
+ }
38
+ ]
39
+ },
40
+ {
41
+ "title": "Analysis Duration by Branch Type",
42
+ "type": "timeseries",
43
+ "datasource": "Prometheus",
44
+ "gridPos": { "h": 8, "w": 12, "x": 12, "y": 1 },
45
+ "fieldConfig": {
46
+ "defaults": { "unit": "ms" }
47
+ },
48
+ "targets": [
49
+ {
50
+ "expr": "histogram_quantile(0.95, sum(rate(analysis_duration_ms_bucket{job=\"ecip-analysis-engine\"}[5m])) by (le, branch_type))",
51
+ "legendFormat": "p95 — {{branch_type}}"
52
+ }
53
+ ]
54
+ },
55
+ {
56
+ "title": "Events Processed / sec",
57
+ "type": "timeseries",
58
+ "datasource": "Prometheus",
59
+ "gridPos": { "h": 8, "w": 12, "x": 0, "y": 9 },
60
+ "fieldConfig": {
61
+ "defaults": { "unit": "ops" }
62
+ },
63
+ "targets": [
64
+ {
65
+ "expr": "sum(rate(analysis_duration_ms_count{job=\"ecip-analysis-engine\"}[5m]))",
66
+ "legendFormat": "Events/s"
67
+ }
68
+ ]
69
+ },
70
+ {
71
+ "title": "Kafka Consumer Lag (Analysis Topics)",
72
+ "type": "timeseries",
73
+ "datasource": "Prometheus",
74
+ "gridPos": { "h": 8, "w": 12, "x": 12, "y": 9 },
75
+ "fieldConfig": {
76
+ "defaults": { "unit": "short" }
77
+ },
78
+ "targets": [
79
+ {
80
+ "expr": "sum(kafka_consumergroup_lag{group=~\"ecip-analysis.*\"}) by (topic)",
81
+ "legendFormat": "{{topic}}"
82
+ }
83
+ ]
84
+ },
85
+ {
86
+ "title": "Analysis Duration by Language (p95)",
87
+ "type": "bargauge",
88
+ "datasource": "Prometheus",
89
+ "gridPos": { "h": 8, "w": 12, "x": 0, "y": 17 },
90
+ "fieldConfig": {
91
+ "defaults": { "unit": "ms" }
92
+ },
93
+ "targets": [
94
+ {
95
+ "expr": "histogram_quantile(0.95, sum(rate(analysis_duration_ms_bucket{job=\"ecip-analysis-engine\"}[5m])) by (le, language))",
96
+ "legendFormat": "{{language}}",
97
+ "instant": true
98
+ }
99
+ ]
100
+ },
101
+ {
102
+ "title": "Event Backlog",
103
+ "type": "stat",
104
+ "datasource": "Prometheus",
105
+ "gridPos": { "h": 8, "w": 12, "x": 12, "y": 17 },
106
+ "fieldConfig": {
107
+ "defaults": {
108
+ "unit": "short",
109
+ "thresholds": {
110
+ "mode": "absolute",
111
+ "steps": [
112
+ { "color": "green", "value": null },
113
+ { "color": "yellow", "value": 500 },
114
+ { "color": "red", "value": 1000 }
115
+ ]
116
+ }
117
+ }
118
+ },
119
+ "targets": [
120
+ {
121
+ "expr": "sum(kafka_consumergroup_lag{group=~\"ecip-analysis.*\"})",
122
+ "legendFormat": "Total Backlog"
123
+ }
124
+ ]
125
+ },
126
+ {
127
+ "title": "Embedding Migration Progress",
128
+ "type": "gauge",
129
+ "datasource": "Prometheus",
130
+ "gridPos": { "h": 8, "w": 24, "x": 0, "y": 25 },
131
+ "fieldConfig": {
132
+ "defaults": {
133
+ "unit": "percentunit",
134
+ "min": 0,
135
+ "max": 1
136
+ }
137
+ },
138
+ "targets": [
139
+ {
140
+ "expr": "embedding_migration_progress{phase=\"cutover\"}",
141
+ "legendFormat": "{{repo}} — {{phase}}"
142
+ }
143
+ ]
144
+ }
145
+ ],
146
+ "refresh": "30s",
147
+ "schemaVersion": 39,
148
+ "tags": ["ecip", "m02", "analysis-engine", "throughput"],
149
+ "templating": {
150
+ "list": [
151
+ {
152
+ "name": "repo",
153
+ "type": "query",
154
+ "datasource": "Prometheus",
155
+ "query": "label_values(analysis_duration_ms_bucket, repo)",
156
+ "refresh": 2,
157
+ "includeAll": true,
158
+ "multi": true
159
+ }
160
+ ]
161
+ },
162
+ "time": { "from": "now-1h", "to": "now" },
163
+ "title": "ECIP — Analysis Throughput",
164
+ "uid": "ecip-analysis-throughput",
165
+ "version": 1
166
+ }