ecip-observability-stack 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +48 -0
- package/README.md +75 -0
- package/alerts/analysis-backlog.yaml +39 -0
- package/alerts/cache-degradation.yaml +44 -0
- package/alerts/dlq-depth.yaml +56 -0
- package/alerts/lsp-daemon.yaml +43 -0
- package/alerts/mcp-latency.yaml +46 -0
- package/alerts/security-anomaly.yaml +59 -0
- package/alerts/sla-latency.yaml +61 -0
- package/chaos/kafka-broker-restart.sh +168 -0
- package/chaos/kill-lsp-daemon.sh +148 -0
- package/chaos/redis-node-failure.sh +318 -0
- package/ci/check-observability-contract.js +285 -0
- package/ci/eslint-plugin-ecip/index.js +209 -0
- package/ci/eslint-plugin-ecip/package.json +12 -0
- package/ci/github-actions-observability-gate.yaml +180 -0
- package/ci/ruff-shared.toml +41 -0
- package/collector/otel-collector-config.yaml +226 -0
- package/collector/otel-collector-daemonset.yaml +168 -0
- package/collector/sampling-config.yaml +83 -0
- package/dashboards/_provisioning/grafana-dashboards.yaml +16 -0
- package/dashboards/analysis-throughput.json +166 -0
- package/dashboards/cache-performance.json +129 -0
- package/dashboards/cross-repo-fanout.json +93 -0
- package/dashboards/event-bus-dlq.json +129 -0
- package/dashboards/lsp-daemon-health.json +104 -0
- package/dashboards/mcp-call-graph.json +114 -0
- package/dashboards/query-latency.json +160 -0
- package/dashboards/security-events.json +131 -0
- package/docs/M08-Observability-Design.md +639 -0
- package/docs/PROGRESS.md +375 -0
- package/docs/module-documentation.md +64 -0
- package/elasticsearch/ilm-policy.json +57 -0
- package/elasticsearch/index-template.json +62 -0
- package/elasticsearch/kibana-space.yaml +53 -0
- package/helm/Chart.yaml +30 -0
- package/helm/templates/configmaps.yaml +25 -0
- package/helm/templates/elasticsearch.yaml +68 -0
- package/helm/templates/grafana-secret.yaml +22 -0
- package/helm/templates/grafana.yaml +19 -0
- package/helm/templates/loki.yaml +33 -0
- package/helm/templates/otel-collector.yaml +119 -0
- package/helm/templates/prometheus.yaml +43 -0
- package/helm/templates/tempo.yaml +16 -0
- package/helm/values.prod.yaml +159 -0
- package/helm/values.yaml +146 -0
- package/logging-lib/nodejs/package.json +57 -0
- package/logging-lib/nodejs/pnpm-lock.yaml +4576 -0
- package/logging-lib/python/pyproject.toml +45 -0
- package/logging-lib/python/src/__init__.py +19 -0
- package/logging-lib/python/src/logger.py +131 -0
- package/logging-lib/python/src/security_events.py +150 -0
- package/logging-lib/python/src/tracer.py +185 -0
- package/logging-lib/python/tests/test_logger.py +113 -0
- package/package.json +21 -0
- package/prometheus/prometheus-values.yaml +170 -0
- package/prometheus/recording-rules.yaml +97 -0
- package/prometheus/scrape-configs.yaml +122 -0
- package/runbooks/SDK-INTEGRATION.md +239 -0
- package/runbooks/alert-response/ANALYSIS_BACKLOG.md +128 -0
- package/runbooks/alert-response/DLQ_DEPTH_EXCEEDED.md +150 -0
- package/runbooks/alert-response/HIGH_QUERY_LATENCY.md +134 -0
- package/runbooks/alert-response/LSP_DAEMON_RESTART.md +118 -0
- package/runbooks/alert-response/SECURITY_ANOMALY.md +160 -0
- package/runbooks/dashboard-guide.md +169 -0
- package/scripts/lint-dashboards.js +184 -0
- package/tempo/tempo-datasource.yaml +46 -0
- package/tempo/tempo-values.yaml +94 -0
- package/tests/alert-threshold-config.test.ts +283 -0
- package/tests/log-schema-validation.test.ts +246 -0
- package/tests/metric-label-validation.test.ts +292 -0
- package/tests/otel-pipeline-integration.test.ts +420 -0
- package/tests/security-events.test.ts +417 -0
- package/tsconfig.json +17 -0
- package/vitest.config.ts +21 -0
- package/vitest.integration.config.ts +9 -0
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
# ===========================================================================
|
|
2
|
+
# ECIP Shared Ruff Configuration
|
|
3
|
+
# ===========================================================================
|
|
4
|
+
# Enforces M08 observability contract for Python modules (M02).
|
|
5
|
+
#
|
|
6
|
+
# Install in consuming module's pyproject.toml:
|
|
7
|
+
# [tool.ruff]
|
|
8
|
+
# extend = "../ecip-observability-stack/ci/ruff-shared.toml"
|
|
9
|
+
# ===========================================================================
|
|
10
|
+
|
|
11
|
+
target-version = "py311"
|
|
12
|
+
line-length = 120
|
|
13
|
+
|
|
14
|
+
[lint]
|
|
15
|
+
select = [
|
|
16
|
+
"E", # pycodestyle errors
|
|
17
|
+
"F", # pyflakes
|
|
18
|
+
"W", # pycodestyle warnings
|
|
19
|
+
"I", # isort
|
|
20
|
+
"N", # pep8-naming
|
|
21
|
+
"UP", # pyupgrade
|
|
22
|
+
"B", # flake8-bugbear
|
|
23
|
+
"A", # flake8-builtins
|
|
24
|
+
"C4", # flake8-comprehensions
|
|
25
|
+
"SIM", # flake8-simplify
|
|
26
|
+
"T20", # flake8-print — BANS print() in production code
|
|
27
|
+
]
|
|
28
|
+
|
|
29
|
+
# T20 = flake8-print: flags any use of print() or pprint()
|
|
30
|
+
# This is the Python equivalent of the ESLint console.log ban.
|
|
31
|
+
# Modules must use ecip_observability.get_logger() instead.
|
|
32
|
+
|
|
33
|
+
[lint.per-file-ignores]
|
|
34
|
+
# Allow print() in test files and scripts
|
|
35
|
+
"**/tests/**" = ["T20"]
|
|
36
|
+
"**/test/**" = ["T20"]
|
|
37
|
+
"**/scripts/**" = ["T20"]
|
|
38
|
+
"conftest.py" = ["T20"]
|
|
39
|
+
|
|
40
|
+
[lint.isort]
|
|
41
|
+
known-first-party = ["ecip_observability"]
|
|
@@ -0,0 +1,226 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# ECIP M08 — OpenTelemetry Collector Configuration
|
|
3
|
+
# =============================================================================
|
|
4
|
+
# CRITICAL FILE: An error here silently drops ALL observability data.
|
|
5
|
+
# All changes must be reviewed by the Platform team.
|
|
6
|
+
#
|
|
7
|
+
# Four separate pipelines:
|
|
8
|
+
# traces → Grafana Tempo
|
|
9
|
+
# metrics → Prometheus (remote write)
|
|
10
|
+
# logs/security → Elasticsearch (security events ONLY — NFR-SEC-007)
|
|
11
|
+
# logs → Grafana Loki (general application logs — OD-01 resolved)
|
|
12
|
+
# =============================================================================
|
|
13
|
+
|
|
14
|
+
receivers:
|
|
15
|
+
otlp:
|
|
16
|
+
protocols:
|
|
17
|
+
grpc:
|
|
18
|
+
endpoint: 0.0.0.0:4317
|
|
19
|
+
http:
|
|
20
|
+
endpoint: 0.0.0.0:4318
|
|
21
|
+
|
|
22
|
+
# Collector self-metrics
|
|
23
|
+
prometheus:
|
|
24
|
+
config:
|
|
25
|
+
scrape_configs:
|
|
26
|
+
- job_name: otel-collector
|
|
27
|
+
scrape_interval: 15s
|
|
28
|
+
static_configs:
|
|
29
|
+
- targets: ["0.0.0.0:8888"]
|
|
30
|
+
|
|
31
|
+
processors:
|
|
32
|
+
# Memory limiter — limit_mib must stay below the pod's 512Mi memory limit, otherwise the kernel OOM-kills the collector before the limiter can refuse data
|
|
33
|
+
memory_limiter:
|
|
34
|
+
check_interval: 5s
|
|
35
|
+
limit_mib: 400
|
|
36
|
+
spike_limit_mib: 100
|
|
37
|
+
|
|
38
|
+
# Batch processor — reduces network overhead
|
|
39
|
+
batch:
|
|
40
|
+
send_batch_size: 8192
|
|
41
|
+
send_batch_max_size: 16384
|
|
42
|
+
timeout: 5s
|
|
43
|
+
|
|
44
|
+
# Populate ECIP span attributes (upsert only copies values; spans that lack the source attribute pass through unchanged)
|
|
45
|
+
attributes/enforce:
|
|
46
|
+
actions:
|
|
47
|
+
- key: ecip.module
|
|
48
|
+
action: upsert
|
|
49
|
+
from_attribute: service.name
|
|
50
|
+
- key: ecip.org_id
|
|
51
|
+
action: upsert
|
|
52
|
+
from_attribute: ecip.org_id
|
|
53
|
+
- key: ecip.repo_id
|
|
54
|
+
action: upsert
|
|
55
|
+
from_attribute: ecip.repo_id
|
|
56
|
+
|
|
57
|
+
# Resource detection for host/cloud metadata (Kubernetes metadata is added by k8sattributes below)
|
|
58
|
+
resourcedetection:
|
|
59
|
+
detectors: [env, system, docker, gcp, ecs, ec2]
|
|
60
|
+
timeout: 5s
|
|
61
|
+
override: false
|
|
62
|
+
|
|
63
|
+
k8sattributes:
|
|
64
|
+
auth_type: "serviceAccount"
|
|
65
|
+
passthrough: false
|
|
66
|
+
extract:
|
|
67
|
+
metadata:
|
|
68
|
+
- k8s.pod.name
|
|
69
|
+
- k8s.pod.uid
|
|
70
|
+
- k8s.namespace.name
|
|
71
|
+
- k8s.node.name
|
|
72
|
+
- k8s.deployment.name
|
|
73
|
+
|
|
74
|
+
# Tail-based sampling — decisions made after full trace assembled
|
|
75
|
+
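# Policies are inlined here. NOTE(review): the DaemonSet passes only this file via --config, so sampling-config.yaml is not merged automatically — keep the two policy lists in sync.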
|
|
76
|
+
tail_sampling:
|
|
77
|
+
decision_wait: 10s
|
|
78
|
+
num_traces: 100000
|
|
79
|
+
expected_new_traces_per_sec: 10000
|
|
80
|
+
policies:
|
|
81
|
+
# Rule 1: Always sample error traces (100%)
|
|
82
|
+
- name: errors-always-sample
|
|
83
|
+
type: status_code
|
|
84
|
+
status_code:
|
|
85
|
+
status_codes:
|
|
86
|
+
- ERROR
|
|
87
|
+
|
|
88
|
+
# Rule 2: Always sample slow traces > 1000ms (100%)
|
|
89
|
+
- name: slow-queries-sample
|
|
90
|
+
type: latency
|
|
91
|
+
latency:
|
|
92
|
+
threshold_ms: 1000
|
|
93
|
+
|
|
94
|
+
# Rule 3: Default sampling — 5% of healthy traces
|
|
95
|
+
- name: default-sample
|
|
96
|
+
type: probabilistic
|
|
97
|
+
probabilistic:
|
|
98
|
+
sampling_percentage: 5
|
|
99
|
+
|
|
100
|
+
# Filter: keep only security events (NFR-SEC-007) in the logs/security pipeline
|
|
101
|
+
filter/security_events:
|
|
102
|
+
logs:
|
|
103
|
+
include:
|
|
104
|
+
# record_attributes entries are ANDed: two strict values for the same key can
|
|
105
|
+
# never both match, so a single anchored regexp accepts either category.
|
|
106
|
+
match_type: regexp
|
|
107
|
+
record_attributes:
|
|
108
|
+
- key: event.category
|
|
109
|
+
value: '^(authentication|authorization)$'
|
|
110
|
+
|
|
111
|
+
exporters:
|
|
112
|
+
# Traces → Grafana Tempo
|
|
113
|
+
otlp/tempo:
|
|
114
|
+
endpoint: tempo.monitoring:4317
|
|
115
|
+
tls:
|
|
116
|
+
insecure: false
|
|
117
|
+
ca_file: /etc/ssl/certs/ca-certificates.crt
|
|
118
|
+
retry_on_failure:
|
|
119
|
+
enabled: true
|
|
120
|
+
initial_interval: 5s
|
|
121
|
+
max_interval: 30s
|
|
122
|
+
max_elapsed_time: 300s
|
|
123
|
+
|
|
124
|
+
# Metrics → Prometheus (via remote write)
|
|
125
|
+
prometheusremotewrite:
|
|
126
|
+
endpoint: http://prometheus.monitoring:9090/api/v1/write
|
|
127
|
+
tls:
|
|
128
|
+
insecure: true
|
|
129
|
+
resource_to_telemetry_conversion:
|
|
130
|
+
enabled: true
|
|
131
|
+
|
|
132
|
+
# Security logs → Elasticsearch
|
|
133
|
+
elasticsearch/security:
|
|
134
|
+
endpoints:
|
|
135
|
+
- https://elasticsearch.monitoring:9200
|
|
136
|
+
logs_index: ecip-security-events
|
|
137
|
+
tls:
|
|
138
|
+
ca_file: /etc/ssl/certs/ca-certificates.crt
|
|
139
|
+
retry:
|
|
140
|
+
enabled: true
|
|
141
|
+
initial_interval: 5s
|
|
142
|
+
max_interval: 60s
|
|
143
|
+
flush:
|
|
144
|
+
bytes: 5000000
|
|
145
|
+
interval: 10s
|
|
146
|
+
|
|
147
|
+
# General application logs → Grafana Loki (OD-01 resolution)
|
|
148
|
+
loki:
|
|
149
|
+
endpoint: http://loki-gateway.monitoring:3100/loki/api/v1/push
|
|
150
|
+
labels:
|
|
151
|
+
resource:
|
|
152
|
+
service.name: "service_name"
|
|
153
|
+
ecip.module: "module"
|
|
154
|
+
k8s.namespace.name: "namespace"
|
|
155
|
+
attributes:
|
|
156
|
+
level: ""
|
|
157
|
+
tenant_id: ecip
|
|
158
|
+
retry_on_failure:
|
|
159
|
+
enabled: true
|
|
160
|
+
initial_interval: 5s
|
|
161
|
+
max_interval: 30s
|
|
162
|
+
|
|
163
|
+
# Debug exporter — enabled only in dev/staging
|
|
164
|
+
logging:
|
|
165
|
+
loglevel: warn
|
|
166
|
+
|
|
167
|
+
extensions:
|
|
168
|
+
health_check:
|
|
169
|
+
endpoint: 0.0.0.0:13133
|
|
170
|
+
|
|
171
|
+
zpages:
|
|
172
|
+
endpoint: 0.0.0.0:55679
|
|
173
|
+
|
|
174
|
+
pprof:
|
|
175
|
+
endpoint: 0.0.0.0:1777
|
|
176
|
+
|
|
177
|
+
service:
|
|
178
|
+
extensions: [health_check, zpages, pprof]
|
|
179
|
+
|
|
180
|
+
pipelines:
|
|
181
|
+
# Traces pipeline: OTLP → process → Tempo
|
|
182
|
+
traces:
|
|
183
|
+
receivers: [otlp]
|
|
184
|
+
processors:
|
|
185
|
+
- memory_limiter
|
|
186
|
+
- k8sattributes
|
|
187
|
+
- resourcedetection
|
|
188
|
+
- attributes/enforce
|
|
189
|
+
- tail_sampling
|
|
190
|
+
- batch
|
|
191
|
+
exporters: [otlp/tempo]
|
|
192
|
+
|
|
193
|
+
# Metrics pipeline: OTLP → process → Prometheus
|
|
194
|
+
metrics:
|
|
195
|
+
receivers: [otlp, prometheus]
|
|
196
|
+
processors:
|
|
197
|
+
- memory_limiter
|
|
198
|
+
- k8sattributes
|
|
199
|
+
- resourcedetection
|
|
200
|
+
- batch
|
|
201
|
+
exporters: [prometheusremotewrite]
|
|
202
|
+
|
|
203
|
+
# Logs pipeline (security): OTLP → filter security events → Elasticsearch
|
|
204
|
+
logs/security:
|
|
205
|
+
receivers: [otlp]
|
|
206
|
+
processors:
|
|
207
|
+
- memory_limiter
|
|
208
|
+
- filter/security_events
|
|
209
|
+
- batch
|
|
210
|
+
exporters: [elasticsearch/security]
|
|
211
|
+
|
|
212
|
+
# Logs pipeline (general): OTLP → all logs (no filter here, so security events are included) → Loki
|
|
213
|
+
logs:
|
|
214
|
+
receivers: [otlp]
|
|
215
|
+
processors:
|
|
216
|
+
- memory_limiter
|
|
217
|
+
- k8sattributes
|
|
218
|
+
- resourcedetection
|
|
219
|
+
- batch
|
|
220
|
+
exporters: [loki]
|
|
221
|
+
|
|
222
|
+
telemetry:
|
|
223
|
+
logs:
|
|
224
|
+
level: info
|
|
225
|
+
metrics:
|
|
226
|
+
address: 0.0.0.0:8888
|
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# ECIP M08 — OTel Collector DaemonSet
|
|
3
|
+
# =============================================================================
|
|
4
|
+
# DaemonSet topology: one collector pod per node.
|
|
5
|
+
# Application pods send to the collector on their own node via hostPort — no cross-node span transit on hot path.
|
|
6
|
+
# =============================================================================
|
|
7
|
+
apiVersion: apps/v1
|
|
8
|
+
kind: DaemonSet
|
|
9
|
+
metadata:
|
|
10
|
+
name: otel-collector
|
|
11
|
+
namespace: monitoring
|
|
12
|
+
labels:
|
|
13
|
+
app.kubernetes.io/name: otel-collector
|
|
14
|
+
app.kubernetes.io/component: observability
|
|
15
|
+
app.kubernetes.io/part-of: ecip
|
|
16
|
+
ecip.module: M08
|
|
17
|
+
spec:
|
|
18
|
+
selector:
|
|
19
|
+
matchLabels:
|
|
20
|
+
app.kubernetes.io/name: otel-collector
|
|
21
|
+
updateStrategy:
|
|
22
|
+
type: RollingUpdate
|
|
23
|
+
rollingUpdate:
|
|
24
|
+
maxUnavailable: 1
|
|
25
|
+
template:
|
|
26
|
+
metadata:
|
|
27
|
+
labels:
|
|
28
|
+
app.kubernetes.io/name: otel-collector
|
|
29
|
+
ecip.module: M08
|
|
30
|
+
annotations:
|
|
31
|
+
prometheus.io/scrape: "true"
|
|
32
|
+
prometheus.io/port: "8888"
|
|
33
|
+
prometheus.io/path: "/metrics"
|
|
34
|
+
spec:
|
|
35
|
+
serviceAccountName: otel-collector
|
|
36
|
+
containers:
|
|
37
|
+
- name: otel-collector
|
|
38
|
+
image: otel/opentelemetry-collector-contrib:0.96.0
|
|
39
|
+
args:
|
|
40
|
+
- --config=/etc/otel/otel-collector-config.yaml
|
|
41
|
+
ports:
|
|
42
|
+
- name: otlp-grpc
|
|
43
|
+
containerPort: 4317
|
|
44
|
+
hostPort: 4317
|
|
45
|
+
protocol: TCP
|
|
46
|
+
- name: otlp-http
|
|
47
|
+
containerPort: 4318
|
|
48
|
+
hostPort: 4318
|
|
49
|
+
protocol: TCP
|
|
50
|
+
- name: health
|
|
51
|
+
containerPort: 13133
|
|
52
|
+
protocol: TCP
|
|
53
|
+
- name: metrics
|
|
54
|
+
containerPort: 8888
|
|
55
|
+
protocol: TCP
|
|
56
|
+
- name: zpages
|
|
57
|
+
containerPort: 55679
|
|
58
|
+
protocol: TCP
|
|
59
|
+
resources:
|
|
60
|
+
requests:
|
|
61
|
+
cpu: 200m
|
|
62
|
+
memory: 256Mi
|
|
63
|
+
limits:
|
|
64
|
+
cpu: 1000m
|
|
65
|
+
memory: 512Mi
|
|
66
|
+
volumeMounts:
|
|
67
|
+
- name: collector-config
|
|
68
|
+
mountPath: /etc/otel
|
|
69
|
+
readOnly: true
|
|
70
|
+
- name: tls-certs
|
|
71
|
+
mountPath: /etc/ssl/certs
|
|
72
|
+
readOnly: true
|
|
73
|
+
livenessProbe:
|
|
74
|
+
httpGet:
|
|
75
|
+
path: /
|
|
76
|
+
port: 13133
|
|
77
|
+
initialDelaySeconds: 10
|
|
78
|
+
periodSeconds: 15
|
|
79
|
+
timeoutSeconds: 5
|
|
80
|
+
readinessProbe:
|
|
81
|
+
httpGet:
|
|
82
|
+
path: /
|
|
83
|
+
port: 13133
|
|
84
|
+
initialDelaySeconds: 5
|
|
85
|
+
periodSeconds: 10
|
|
86
|
+
timeoutSeconds: 3
|
|
87
|
+
env:
|
|
88
|
+
- name: K8S_NODE_NAME
|
|
89
|
+
valueFrom:
|
|
90
|
+
fieldRef:
|
|
91
|
+
fieldPath: spec.nodeName
|
|
92
|
+
- name: K8S_POD_NAME
|
|
93
|
+
valueFrom:
|
|
94
|
+
fieldRef:
|
|
95
|
+
fieldPath: metadata.name
|
|
96
|
+
- name: K8S_NAMESPACE
|
|
97
|
+
valueFrom:
|
|
98
|
+
fieldRef:
|
|
99
|
+
fieldPath: metadata.namespace
|
|
100
|
+
volumes:
|
|
101
|
+
- name: collector-config
|
|
102
|
+
configMap:
|
|
103
|
+
name: otel-collector-config
|
|
104
|
+
- name: tls-certs
|
|
105
|
+
secret:
|
|
106
|
+
secretName: otel-collector-tls
|
|
107
|
+
tolerations:
|
|
108
|
+
- effect: NoSchedule
|
|
109
|
+
operator: Exists
|
|
110
|
+
terminationGracePeriodSeconds: 30
|
|
111
|
+
---
|
|
112
|
+
apiVersion: v1
|
|
113
|
+
kind: Service
|
|
114
|
+
metadata:
|
|
115
|
+
name: otel-collector
|
|
116
|
+
namespace: monitoring
|
|
117
|
+
labels:
|
|
118
|
+
app.kubernetes.io/name: otel-collector
|
|
119
|
+
spec:
|
|
120
|
+
type: ClusterIP
|
|
121
|
+
ports:
|
|
122
|
+
- name: otlp-grpc
|
|
123
|
+
port: 4317
|
|
124
|
+
targetPort: 4317
|
|
125
|
+
protocol: TCP
|
|
126
|
+
- name: otlp-http
|
|
127
|
+
port: 4318
|
|
128
|
+
targetPort: 4318
|
|
129
|
+
protocol: TCP
|
|
130
|
+
- name: metrics
|
|
131
|
+
port: 8888
|
|
132
|
+
targetPort: 8888
|
|
133
|
+
protocol: TCP
|
|
134
|
+
selector:
|
|
135
|
+
app.kubernetes.io/name: otel-collector
|
|
136
|
+
---
|
|
137
|
+
apiVersion: v1
|
|
138
|
+
kind: ServiceAccount
|
|
139
|
+
metadata:
|
|
140
|
+
name: otel-collector
|
|
141
|
+
namespace: monitoring
|
|
142
|
+
labels:
|
|
143
|
+
app.kubernetes.io/name: otel-collector
|
|
144
|
+
---
|
|
145
|
+
apiVersion: rbac.authorization.k8s.io/v1
|
|
146
|
+
kind: ClusterRole
|
|
147
|
+
metadata:
|
|
148
|
+
name: otel-collector
|
|
149
|
+
rules:
|
|
150
|
+
- apiGroups: [""]
|
|
151
|
+
resources: ["pods", "namespaces", "nodes"]
|
|
152
|
+
verbs: ["get", "list", "watch"]
|
|
153
|
+
- apiGroups: ["apps"]
|
|
154
|
+
resources: ["deployments", "replicasets", "daemonsets", "statefulsets"]
|
|
155
|
+
verbs: ["get", "list", "watch"]
|
|
156
|
+
---
|
|
157
|
+
apiVersion: rbac.authorization.k8s.io/v1
|
|
158
|
+
kind: ClusterRoleBinding
|
|
159
|
+
metadata:
|
|
160
|
+
name: otel-collector
|
|
161
|
+
subjects:
|
|
162
|
+
- kind: ServiceAccount
|
|
163
|
+
name: otel-collector
|
|
164
|
+
namespace: monitoring
|
|
165
|
+
roleRef:
|
|
166
|
+
kind: ClusterRole
|
|
167
|
+
name: otel-collector
|
|
168
|
+
apiGroup: rbac.authorization.k8s.io
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# ECIP M08 — Tail-Based Sampling Configuration
|
|
3
|
+
# =============================================================================
|
|
4
|
+
# Sampling decisions are made AFTER the full trace is assembled.
|
|
5
|
+
# This ensures error traces are always captured regardless of sample rate.
|
|
6
|
+
#
|
|
7
|
+
# All policies are evaluated for every trace; the trace is kept if ANY policy decides to sample it (order does not matter).
|
|
8
|
+
# Default: 5% for healthy traces, 100% for errors and slow traces.
|
|
9
|
+
# =============================================================================
|
|
10
|
+
|
|
11
|
+
# Decision wait time: how long to wait for all spans of a trace to arrive
|
|
12
|
+
# before making a sampling decision. 10s is conservative — most ECIP
|
|
13
|
+
# traces complete in < 2s even with MCP fan-out.
|
|
14
|
+
decision_wait: 10s
|
|
15
|
+
|
|
16
|
+
# Maximum number of traces held in memory pending a decision.
|
|
17
|
+
# At 10K new traces/sec and 10s wait, this is ~100K traces in flight.
|
|
18
|
+
num_traces: 100000
|
|
19
|
+
|
|
20
|
+
# Expected trace arrival rate — used for memory pre-allocation.
|
|
21
|
+
expected_new_traces_per_sec: 10000
|
|
22
|
+
|
|
23
|
+
policies:
|
|
24
|
+
# --- Priority 1: Always sample errors ---
|
|
25
|
+
# Any trace containing a span with status ERROR is sampled at 100%.
|
|
26
|
+
# This is the most critical rule — production debugging depends on it.
|
|
27
|
+
- name: errors-always-sample
|
|
28
|
+
type: status_code
|
|
29
|
+
status_code:
|
|
30
|
+
status_codes:
|
|
31
|
+
- ERROR
|
|
32
|
+
|
|
33
|
+
# --- Priority 2: Always sample slow traces ---
|
|
34
|
+
# Any trace with end-to-end latency > 1000ms is sampled at 100%.
|
|
35
|
+
# This catches SLA-breaching queries before alerts fire.
|
|
36
|
+
- name: slow-queries-sample
|
|
37
|
+
type: latency
|
|
38
|
+
latency:
|
|
39
|
+
threshold_ms: 1000
|
|
40
|
+
|
|
41
|
+
# --- Priority 3: Always sample security events ---
|
|
42
|
+
# Traces containing auth/RBAC-related spans are always captured.
|
|
43
|
+
- name: security-events-sample
|
|
44
|
+
type: string_attribute
|
|
45
|
+
string_attribute:
|
|
46
|
+
key: event.category
|
|
47
|
+
values:
|
|
48
|
+
- authentication
|
|
49
|
+
- authorization
|
|
50
|
+
|
|
51
|
+
# --- Priority 4: Sample LSP daemon operations ---
|
|
52
|
+
# LSP operations are high-value for debugging but high-volume.
|
|
53
|
+
# Sample at 20% (higher than default) for better coverage.
|
|
54
|
+
- name: lsp-operations-sample
|
|
55
|
+
# A string_attribute policy keeps 100% of matching traces and does not apply a
|
|
56
|
+
# sibling `probabilistic` key; `and` combines the module match with a 20% sampler.
|
|
57
|
+
type: and
|
|
58
|
+
and:
|
|
59
|
+
and_sub_policy:
|
|
60
|
+
- { name: lsp-module-match, type: string_attribute, string_attribute: { key: ecip.module, values: [M02] } }
|
|
61
|
+
- { name: lsp-sample-20pct, type: probabilistic, probabilistic: { sampling_percentage: 20 } }
|
|
62
|
+
|
|
63
|
+
# --- Default: 5% probabilistic sampling ---
|
|
64
|
+
# All remaining traces are sampled at 5%.
|
|
65
|
+
# At 10K traces/sec, this policy alone yields up to ~500 traces/sec to Tempo storage, in addition to traces kept by the policies above.
|
|
66
|
+
- name: default-sample
|
|
67
|
+
type: probabilistic
|
|
68
|
+
probabilistic:
|
|
69
|
+
sampling_percentage: 5
|
|
70
|
+
|
|
71
|
+
# =============================================================================
|
|
72
|
+
# Tuning notes (to be updated after Week 8 load testing):
|
|
73
|
+
#
|
|
74
|
+
# If Tempo storage grows faster than budget:
|
|
75
|
+
# 1. Reduce default from 5% → 2%
|
|
76
|
+
# 2. Reduce lsp-operations from 20% → 10%
|
|
77
|
+
# 3. NEVER reduce errors-always-sample below 100%
|
|
78
|
+
#
|
|
79
|
+
# If Collector memory exceeds limit_mib (512):
|
|
80
|
+
# 1. Reduce num_traces from 100K → 50K
|
|
81
|
+
# 2. Reduce decision_wait from 10s → 5s
|
|
82
|
+
# 3. Increase DaemonSet memory limit (last resort)
|
|
83
|
+
# =============================================================================
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# Grafana Dashboard Provisioning — ECIP Observability
|
|
2
|
+
# Auto-provisions dashboards via Grafana sidecar
|
|
3
|
+
apiVersion: 1
|
|
4
|
+
|
|
5
|
+
providers:
|
|
6
|
+
- name: ecip-dashboards
|
|
7
|
+
orgId: 1
|
|
8
|
+
folder: ECIP
|
|
9
|
+
type: file
|
|
10
|
+
disableDeletion: false
|
|
11
|
+
editable: true
|
|
12
|
+
updateIntervalSeconds: 30
|
|
13
|
+
allowUiUpdates: true
|
|
14
|
+
options:
|
|
15
|
+
path: /var/lib/grafana/dashboards/ecip
|
|
16
|
+
foldersFromFilesStructure: false
|
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
{
|
|
2
|
+
"description": "ECIP Analysis Engine (M02) — Events processed, backlog, Kafka consumer lag",
|
|
3
|
+
"editable": true,
|
|
4
|
+
"panels": [
|
|
5
|
+
{
|
|
6
|
+
"title": "Analysis Throughput",
|
|
7
|
+
"type": "row",
|
|
8
|
+
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
|
|
9
|
+
"collapsed": false
|
|
10
|
+
},
|
|
11
|
+
{
|
|
12
|
+
"title": "Analysis Duration p50 / p95",
|
|
13
|
+
"type": "timeseries",
|
|
14
|
+
"datasource": "Prometheus",
|
|
15
|
+
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 1 },
|
|
16
|
+
"fieldConfig": {
|
|
17
|
+
"defaults": {
|
|
18
|
+
"unit": "ms",
|
|
19
|
+
"thresholds": {
|
|
20
|
+
"mode": "absolute",
|
|
21
|
+
"steps": [
|
|
22
|
+
{ "color": "green", "value": null },
|
|
23
|
+
{ "color": "yellow", "value": 60000 },
|
|
24
|
+
{ "color": "red", "value": 120000 }
|
|
25
|
+
]
|
|
26
|
+
}
|
|
27
|
+
}
|
|
28
|
+
},
|
|
29
|
+
"targets": [
|
|
30
|
+
{
|
|
31
|
+
"expr": "histogram_quantile(0.50, sum(rate(analysis_duration_ms_bucket{job=\"ecip-analysis-engine\"}[5m])) by (le))",
|
|
32
|
+
"legendFormat": "p50"
|
|
33
|
+
},
|
|
34
|
+
{
|
|
35
|
+
"expr": "histogram_quantile(0.95, sum(rate(analysis_duration_ms_bucket{job=\"ecip-analysis-engine\"}[5m])) by (le))",
|
|
36
|
+
"legendFormat": "p95"
|
|
37
|
+
}
|
|
38
|
+
]
|
|
39
|
+
},
|
|
40
|
+
{
|
|
41
|
+
"title": "Analysis Duration by Branch Type",
|
|
42
|
+
"type": "timeseries",
|
|
43
|
+
"datasource": "Prometheus",
|
|
44
|
+
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 1 },
|
|
45
|
+
"fieldConfig": {
|
|
46
|
+
"defaults": { "unit": "ms" }
|
|
47
|
+
},
|
|
48
|
+
"targets": [
|
|
49
|
+
{
|
|
50
|
+
"expr": "histogram_quantile(0.95, sum(rate(analysis_duration_ms_bucket{job=\"ecip-analysis-engine\"}[5m])) by (le, branch_type))",
|
|
51
|
+
"legendFormat": "p95 — {{branch_type}}"
|
|
52
|
+
}
|
|
53
|
+
]
|
|
54
|
+
},
|
|
55
|
+
{
|
|
56
|
+
"title": "Events Processed / sec",
|
|
57
|
+
"type": "timeseries",
|
|
58
|
+
"datasource": "Prometheus",
|
|
59
|
+
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 9 },
|
|
60
|
+
"fieldConfig": {
|
|
61
|
+
"defaults": { "unit": "ops" }
|
|
62
|
+
},
|
|
63
|
+
"targets": [
|
|
64
|
+
{
|
|
65
|
+
"expr": "sum(rate(analysis_duration_ms_count{job=\"ecip-analysis-engine\"}[5m]))",
|
|
66
|
+
"legendFormat": "Events/s"
|
|
67
|
+
}
|
|
68
|
+
]
|
|
69
|
+
},
|
|
70
|
+
{
|
|
71
|
+
"title": "Kafka Consumer Lag (Analysis Topics)",
|
|
72
|
+
"type": "timeseries",
|
|
73
|
+
"datasource": "Prometheus",
|
|
74
|
+
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 9 },
|
|
75
|
+
"fieldConfig": {
|
|
76
|
+
"defaults": { "unit": "short" }
|
|
77
|
+
},
|
|
78
|
+
"targets": [
|
|
79
|
+
{
|
|
80
|
+
"expr": "sum(kafka_consumergroup_lag{group=~\"ecip-analysis.*\"}) by (topic)",
|
|
81
|
+
"legendFormat": "{{topic}}"
|
|
82
|
+
}
|
|
83
|
+
]
|
|
84
|
+
},
|
|
85
|
+
{
|
|
86
|
+
"title": "Analysis Duration by Language (p95)",
|
|
87
|
+
"type": "bargauge",
|
|
88
|
+
"datasource": "Prometheus",
|
|
89
|
+
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 17 },
|
|
90
|
+
"fieldConfig": {
|
|
91
|
+
"defaults": { "unit": "ms" }
|
|
92
|
+
},
|
|
93
|
+
"targets": [
|
|
94
|
+
{
|
|
95
|
+
"expr": "histogram_quantile(0.95, sum(rate(analysis_duration_ms_bucket{job=\"ecip-analysis-engine\"}[5m])) by (le, language))",
|
|
96
|
+
"legendFormat": "{{language}}",
|
|
97
|
+
"instant": true
|
|
98
|
+
}
|
|
99
|
+
]
|
|
100
|
+
},
|
|
101
|
+
{
|
|
102
|
+
"title": "Event Backlog",
|
|
103
|
+
"type": "stat",
|
|
104
|
+
"datasource": "Prometheus",
|
|
105
|
+
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 17 },
|
|
106
|
+
"fieldConfig": {
|
|
107
|
+
"defaults": {
|
|
108
|
+
"unit": "short",
|
|
109
|
+
"thresholds": {
|
|
110
|
+
"mode": "absolute",
|
|
111
|
+
"steps": [
|
|
112
|
+
{ "color": "green", "value": null },
|
|
113
|
+
{ "color": "yellow", "value": 500 },
|
|
114
|
+
{ "color": "red", "value": 1000 }
|
|
115
|
+
]
|
|
116
|
+
}
|
|
117
|
+
}
|
|
118
|
+
},
|
|
119
|
+
"targets": [
|
|
120
|
+
{
|
|
121
|
+
"expr": "sum(kafka_consumergroup_lag{group=~\"ecip-analysis.*\"})",
|
|
122
|
+
"legendFormat": "Total Backlog"
|
|
123
|
+
}
|
|
124
|
+
]
|
|
125
|
+
},
|
|
126
|
+
{
|
|
127
|
+
"title": "Embedding Migration Progress",
|
|
128
|
+
"type": "gauge",
|
|
129
|
+
"datasource": "Prometheus",
|
|
130
|
+
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 25 },
|
|
131
|
+
"fieldConfig": {
|
|
132
|
+
"defaults": {
|
|
133
|
+
"unit": "percentunit",
|
|
134
|
+
"min": 0,
|
|
135
|
+
"max": 1
|
|
136
|
+
}
|
|
137
|
+
},
|
|
138
|
+
"targets": [
|
|
139
|
+
{
|
|
140
|
+
"expr": "embedding_migration_progress{phase=\"cutover\"}",
|
|
141
|
+
"legendFormat": "{{repo}} — {{phase}}"
|
|
142
|
+
}
|
|
143
|
+
]
|
|
144
|
+
}
|
|
145
|
+
],
|
|
146
|
+
"refresh": "30s",
|
|
147
|
+
"schemaVersion": 39,
|
|
148
|
+
"tags": ["ecip", "m02", "analysis-engine", "throughput"],
|
|
149
|
+
"templating": {
|
|
150
|
+
"list": [
|
|
151
|
+
{
|
|
152
|
+
"name": "repo",
|
|
153
|
+
"type": "query",
|
|
154
|
+
"datasource": "Prometheus",
|
|
155
|
+
"query": "label_values(analysis_duration_ms_bucket, repo)",
|
|
156
|
+
"refresh": 2,
|
|
157
|
+
"includeAll": true,
|
|
158
|
+
"multi": true
|
|
159
|
+
}
|
|
160
|
+
]
|
|
161
|
+
},
|
|
162
|
+
"time": { "from": "now-1h", "to": "now" },
|
|
163
|
+
"title": "ECIP — Analysis Throughput",
|
|
164
|
+
"uid": "ecip-analysis-throughput",
|
|
165
|
+
"version": 1
|
|
166
|
+
}
|