e11y 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (157) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +4 -0
  3. data/.rubocop.yml +69 -0
  4. data/CHANGELOG.md +26 -0
  5. data/CODE_OF_CONDUCT.md +64 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +179 -0
  8. data/Rakefile +37 -0
  9. data/benchmarks/run_all.rb +33 -0
  10. data/config/README.md +83 -0
  11. data/config/loki-local-config.yaml +35 -0
  12. data/config/prometheus.yml +15 -0
  13. data/docker-compose.yml +78 -0
  14. data/docs/00-ICP-AND-TIMELINE.md +483 -0
  15. data/docs/01-SCALE-REQUIREMENTS.md +858 -0
  16. data/docs/ADR-001-architecture.md +2617 -0
  17. data/docs/ADR-002-metrics-yabeda.md +1395 -0
  18. data/docs/ADR-003-slo-observability.md +3337 -0
  19. data/docs/ADR-004-adapter-architecture.md +2385 -0
  20. data/docs/ADR-005-tracing-context.md +1372 -0
  21. data/docs/ADR-006-security-compliance.md +4143 -0
  22. data/docs/ADR-007-opentelemetry-integration.md +1385 -0
  23. data/docs/ADR-008-rails-integration.md +1911 -0
  24. data/docs/ADR-009-cost-optimization.md +2993 -0
  25. data/docs/ADR-010-developer-experience.md +2166 -0
  26. data/docs/ADR-011-testing-strategy.md +1836 -0
  27. data/docs/ADR-012-event-evolution.md +958 -0
  28. data/docs/ADR-013-reliability-error-handling.md +2750 -0
  29. data/docs/ADR-014-event-driven-slo.md +1533 -0
  30. data/docs/ADR-015-middleware-order.md +1061 -0
  31. data/docs/ADR-016-self-monitoring-slo.md +1234 -0
  32. data/docs/API-REFERENCE-L28.md +914 -0
  33. data/docs/COMPREHENSIVE-CONFIGURATION.md +2366 -0
  34. data/docs/IMPLEMENTATION_NOTES.md +2804 -0
  35. data/docs/IMPLEMENTATION_PLAN.md +1971 -0
  36. data/docs/IMPLEMENTATION_PLAN_ARCHITECTURE.md +586 -0
  37. data/docs/PLAN.md +148 -0
  38. data/docs/QUICK-START.md +934 -0
  39. data/docs/README.md +296 -0
  40. data/docs/design/00-memory-optimization.md +593 -0
  41. data/docs/guides/MIGRATION-L27-L28.md +692 -0
  42. data/docs/guides/PERFORMANCE-BENCHMARKS.md +434 -0
  43. data/docs/guides/README.md +44 -0
  44. data/docs/prd/01-overview-vision.md +440 -0
  45. data/docs/use_cases/README.md +119 -0
  46. data/docs/use_cases/UC-001-request-scoped-debug-buffering.md +813 -0
  47. data/docs/use_cases/UC-002-business-event-tracking.md +1953 -0
  48. data/docs/use_cases/UC-003-pattern-based-metrics.md +1627 -0
  49. data/docs/use_cases/UC-004-zero-config-slo-tracking.md +728 -0
  50. data/docs/use_cases/UC-005-sentry-integration.md +759 -0
  51. data/docs/use_cases/UC-006-trace-context-management.md +905 -0
  52. data/docs/use_cases/UC-007-pii-filtering.md +2648 -0
  53. data/docs/use_cases/UC-008-opentelemetry-integration.md +1153 -0
  54. data/docs/use_cases/UC-009-multi-service-tracing.md +1043 -0
  55. data/docs/use_cases/UC-010-background-job-tracking.md +1018 -0
  56. data/docs/use_cases/UC-011-rate-limiting.md +1906 -0
  57. data/docs/use_cases/UC-012-audit-trail.md +2301 -0
  58. data/docs/use_cases/UC-013-high-cardinality-protection.md +2127 -0
  59. data/docs/use_cases/UC-014-adaptive-sampling.md +1940 -0
  60. data/docs/use_cases/UC-015-cost-optimization.md +735 -0
  61. data/docs/use_cases/UC-016-rails-logger-migration.md +785 -0
  62. data/docs/use_cases/UC-017-local-development.md +867 -0
  63. data/docs/use_cases/UC-018-testing-events.md +1081 -0
  64. data/docs/use_cases/UC-019-tiered-storage-migration.md +562 -0
  65. data/docs/use_cases/UC-020-event-versioning.md +708 -0
  66. data/docs/use_cases/UC-021-error-handling-retry-dlq.md +956 -0
  67. data/docs/use_cases/UC-022-event-registry.md +648 -0
  68. data/docs/use_cases/backlog.md +226 -0
  69. data/e11y.gemspec +76 -0
  70. data/lib/e11y/adapters/adaptive_batcher.rb +207 -0
  71. data/lib/e11y/adapters/audit_encrypted.rb +239 -0
  72. data/lib/e11y/adapters/base.rb +580 -0
  73. data/lib/e11y/adapters/file.rb +224 -0
  74. data/lib/e11y/adapters/in_memory.rb +216 -0
  75. data/lib/e11y/adapters/loki.rb +333 -0
  76. data/lib/e11y/adapters/otel_logs.rb +203 -0
  77. data/lib/e11y/adapters/registry.rb +141 -0
  78. data/lib/e11y/adapters/sentry.rb +230 -0
  79. data/lib/e11y/adapters/stdout.rb +108 -0
  80. data/lib/e11y/adapters/yabeda.rb +370 -0
  81. data/lib/e11y/buffers/adaptive_buffer.rb +339 -0
  82. data/lib/e11y/buffers/base_buffer.rb +40 -0
  83. data/lib/e11y/buffers/request_scoped_buffer.rb +246 -0
  84. data/lib/e11y/buffers/ring_buffer.rb +267 -0
  85. data/lib/e11y/buffers.rb +14 -0
  86. data/lib/e11y/console.rb +122 -0
  87. data/lib/e11y/current.rb +48 -0
  88. data/lib/e11y/event/base.rb +894 -0
  89. data/lib/e11y/event/value_sampling_config.rb +84 -0
  90. data/lib/e11y/events/base_audit_event.rb +43 -0
  91. data/lib/e11y/events/base_payment_event.rb +33 -0
  92. data/lib/e11y/events/rails/cache/delete.rb +21 -0
  93. data/lib/e11y/events/rails/cache/read.rb +23 -0
  94. data/lib/e11y/events/rails/cache/write.rb +22 -0
  95. data/lib/e11y/events/rails/database/query.rb +45 -0
  96. data/lib/e11y/events/rails/http/redirect.rb +21 -0
  97. data/lib/e11y/events/rails/http/request.rb +26 -0
  98. data/lib/e11y/events/rails/http/send_file.rb +21 -0
  99. data/lib/e11y/events/rails/http/start_processing.rb +26 -0
  100. data/lib/e11y/events/rails/job/completed.rb +22 -0
  101. data/lib/e11y/events/rails/job/enqueued.rb +22 -0
  102. data/lib/e11y/events/rails/job/failed.rb +22 -0
  103. data/lib/e11y/events/rails/job/scheduled.rb +23 -0
  104. data/lib/e11y/events/rails/job/started.rb +22 -0
  105. data/lib/e11y/events/rails/log.rb +56 -0
  106. data/lib/e11y/events/rails/view/render.rb +23 -0
  107. data/lib/e11y/events.rb +18 -0
  108. data/lib/e11y/instruments/active_job.rb +201 -0
  109. data/lib/e11y/instruments/rails_instrumentation.rb +141 -0
  110. data/lib/e11y/instruments/sidekiq.rb +175 -0
  111. data/lib/e11y/logger/bridge.rb +205 -0
  112. data/lib/e11y/metrics/cardinality_protection.rb +172 -0
  113. data/lib/e11y/metrics/cardinality_tracker.rb +134 -0
  114. data/lib/e11y/metrics/registry.rb +234 -0
  115. data/lib/e11y/metrics/relabeling.rb +226 -0
  116. data/lib/e11y/metrics.rb +102 -0
  117. data/lib/e11y/middleware/audit_signing.rb +174 -0
  118. data/lib/e11y/middleware/base.rb +140 -0
  119. data/lib/e11y/middleware/event_slo.rb +167 -0
  120. data/lib/e11y/middleware/pii_filter.rb +266 -0
  121. data/lib/e11y/middleware/pii_filtering.rb +280 -0
  122. data/lib/e11y/middleware/rate_limiting.rb +214 -0
  123. data/lib/e11y/middleware/request.rb +163 -0
  124. data/lib/e11y/middleware/routing.rb +157 -0
  125. data/lib/e11y/middleware/sampling.rb +254 -0
  126. data/lib/e11y/middleware/slo.rb +168 -0
  127. data/lib/e11y/middleware/trace_context.rb +131 -0
  128. data/lib/e11y/middleware/validation.rb +118 -0
  129. data/lib/e11y/middleware/versioning.rb +132 -0
  130. data/lib/e11y/middleware.rb +12 -0
  131. data/lib/e11y/pii/patterns.rb +90 -0
  132. data/lib/e11y/pii.rb +13 -0
  133. data/lib/e11y/pipeline/builder.rb +155 -0
  134. data/lib/e11y/pipeline/zone_validator.rb +110 -0
  135. data/lib/e11y/pipeline.rb +12 -0
  136. data/lib/e11y/presets/audit_event.rb +65 -0
  137. data/lib/e11y/presets/debug_event.rb +34 -0
  138. data/lib/e11y/presets/high_value_event.rb +51 -0
  139. data/lib/e11y/presets.rb +19 -0
  140. data/lib/e11y/railtie.rb +138 -0
  141. data/lib/e11y/reliability/circuit_breaker.rb +216 -0
  142. data/lib/e11y/reliability/dlq/file_storage.rb +277 -0
  143. data/lib/e11y/reliability/dlq/filter.rb +117 -0
  144. data/lib/e11y/reliability/retry_handler.rb +207 -0
  145. data/lib/e11y/reliability/retry_rate_limiter.rb +117 -0
  146. data/lib/e11y/sampling/error_spike_detector.rb +225 -0
  147. data/lib/e11y/sampling/load_monitor.rb +161 -0
  148. data/lib/e11y/sampling/stratified_tracker.rb +92 -0
  149. data/lib/e11y/sampling/value_extractor.rb +82 -0
  150. data/lib/e11y/self_monitoring/buffer_monitor.rb +79 -0
  151. data/lib/e11y/self_monitoring/performance_monitor.rb +97 -0
  152. data/lib/e11y/self_monitoring/reliability_monitor.rb +146 -0
  153. data/lib/e11y/slo/event_driven.rb +150 -0
  154. data/lib/e11y/slo/tracker.rb +119 -0
  155. data/lib/e11y/version.rb +9 -0
  156. data/lib/e11y.rb +283 -0
  157. metadata +452 -0
@@ -0,0 +1,1234 @@
1
+ # ADR-016: Self-Monitoring & SLO for E11y Gem
2
+
3
+ **Status:** Draft
4
+ **Date:** January 15, 2026
5
+ **Covers:** Internal observability and reliability of E11y gem itself
6
+ **Depends On:** ADR-001 (Core), ADR-002 (Metrics), ADR-003 (SLO)
7
+
8
+ ---
9
+
10
+ ## 📋 Table of Contents
11
+
12
+ 1. [Context & Problem](#1-context--problem)
13
+ 2. [Architecture Overview](#2-architecture-overview)
14
+ 3. [Self-Monitoring Metrics](#3-self-monitoring-metrics)
15
+ 4. [Internal SLO Tracking](#4-internal-slo-tracking)
16
+ 5. [Performance Budget](#5-performance-budget)
17
+ 6. [Health Checks](#6-health-checks)
18
+ 7. [Alerting Strategy](#7-alerting-strategy)
19
+ 8. [Trade-offs](#8-trade-offs)
20
+
21
+ ---
22
+
23
+ ## 1. Context & Problem
24
+
25
+ ### 1.1. Problem Statement
26
+
27
+ **E11y is critical infrastructure** - if it fails, the entire observability stack is blind.
28
+
29
+ **Current Pain Points:**
30
+
31
+ ```ruby
32
+ # ❌ PROBLEM 1: No visibility into E11y itself
33
+ # E11y tracks app events, but who tracks E11y?
34
+ # If E11y buffer is full → events dropped → no alert!
35
+ # If E11y adapter fails → silent failure → no alert!
36
+ ```
37
+
38
+ ```ruby
39
+ # ❌ PROBLEM 2: No performance guarantees
40
+ # E11y.track() should be <1ms p99
41
+ # But how do we know if it's slow?
42
+ # How do we detect regressions?
43
+ ```
44
+
45
+ ```ruby
46
+ # ❌ PROBLEM 3: No reliability SLO
47
+ # E11y should deliver 99.9% of events
48
+ # But we don't measure this!
49
+ # How many events are dropped?
50
+ ```
51
+
52
+ ```ruby
53
+ # ❌ PROBLEM 4: No cost visibility
54
+ # E11y processes millions of events
55
+ # How much CPU/memory does it use?
56
+ # Is it worth the overhead?
57
+ ```
58
+
59
+ ### 1.2. Design Principles
60
+
61
+ **1. Self-Monitoring Must Be Lightweight**
62
+ ```ruby
63
+ # E11y self-monitoring should use <1% of E11y's own overhead
64
+ # Otherwise, we create a monitoring-of-monitoring spiral
65
+ ```
66
+
67
+ **2. Self-Monitoring Must Be Reliable**
68
+ ```ruby
69
+ # If E11y fails, self-monitoring must still work
70
+ # Use separate, independent metrics path
71
+ ```
72
+
73
+ **3. Self-Monitoring Must Be Actionable**
74
+ ```ruby
75
+ # Every metric must have:
76
+ # - Clear threshold
77
+ # - Runbook link
78
+ # - Automatic alert
79
+ ```
80
+
81
+ ### 1.3. Goals
82
+
83
+ **Primary Goals:**
84
+ - ✅ **Track E11y performance** (latency, throughput)
85
+ - ✅ **Track E11y reliability** (success rate, dropped events)
86
+ - ✅ **Track E11y resource usage** (CPU, memory, buffer)
87
+ - ✅ **Define E11y SLO** (99.9% delivery, <1ms p99)
88
+ - ✅ **Alert on E11y degradation** (before it impacts app)
89
+
90
+ **Non-Goals:**
91
+ - ❌ Monitoring application events (that's ADR-003)
92
+ - ❌ Complex ML-based anomaly detection
93
+ - ❌ Full distributed tracing of E11y internals
94
+
95
+ ### 1.4. Success Metrics
96
+
97
+ | Metric | Target | Critical? |
98
+ |--------|--------|-----------|
99
+ | **E11y latency p99** | <1ms | ✅ Yes |
100
+ | **E11y success rate** | >99.9% | ✅ Yes |
101
+ | **E11y overhead** | <2% CPU | ✅ Yes |
102
+ | **E11y memory** | <100MB | ✅ Yes |
103
+ | **Buffer utilization** | <80% | ✅ Yes |
104
+
105
+ ---
106
+
107
+ ## 2. Architecture Overview
108
+
109
+ ### 2.1. System Context
110
+
111
+ ```mermaid
112
+ C4Context
113
+ title E11y Self-Monitoring Context
114
+
115
+ Person(sre, "SRE", "Monitors E11y health")
116
+
117
+ System(app, "Rails App", "Tracks business events")
118
+ System(e11y, "E11y Gem", "Event processing")
119
+ System(self_monitor, "E11y Self-Monitor", "Internal metrics")
120
+
121
+ System_Ext(prometheus, "Prometheus", "Metrics storage")
122
+ System_Ext(alertmanager, "Alertmanager", "E11y health alerts")
123
+ System_Ext(grafana, "Grafana", "E11y dashboard")
124
+
125
+ Rel(app, e11y, "Tracks events", "E11y API")
126
+ Rel(e11y, self_monitor, "Emits metrics", "Internal")
127
+ Rel(self_monitor, prometheus, "Exports", "Prometheus format")
128
+ Rel(prometheus, alertmanager, "Evaluates", "E11y SLO")
129
+ Rel(alertmanager, sre, "Alerts", "E11y degraded")
130
+ Rel(sre, grafana, "Views", "E11y health")
131
+
132
+ UpdateLayoutConfig($c4ShapeInRow="3", $c4BoundaryInRow="1")
133
+ ```
134
+
135
+ ### 2.2. Component Architecture
136
+
137
+ ```mermaid
138
+ graph TB
139
+ subgraph "E11y Gem"
140
+ Track[Events::OrderCreated.track] --> Pipeline[E11y Pipeline]
141
+
142
+ Pipeline --> Middleware1[Middleware 1]
143
+ Pipeline --> Middleware2[Middleware 2]
144
+ Pipeline --> Middleware3[Middleware N]
145
+
146
+ Middleware3 --> Buffer[Ring Buffer]
147
+ Buffer --> Adapters[Adapters]
148
+
149
+ subgraph "Self-Monitoring (Instrumented)"
150
+ Pipeline -.->|measure latency| PerfMonitor[Performance Monitor]
151
+ Buffer -.->|measure size| BufferMonitor[Buffer Monitor]
152
+ Adapters -.->|measure success| ReliabilityMonitor[Reliability Monitor]
153
+
154
+ PerfMonitor --> SelfMetrics[Self-Monitoring Metrics]
155
+ BufferMonitor --> SelfMetrics
156
+ ReliabilityMonitor --> SelfMetrics
157
+ end
158
+ end
159
+
160
+ subgraph "Metrics Export (Separate Path)"
161
+ SelfMetrics --> PrometheusExporter[Prometheus Exporter<br/>SEPARATE from app metrics]
162
+ PrometheusExporter --> Prometheus[Prometheus]
163
+ end
164
+
165
+ subgraph "SLO Tracking"
166
+ Prometheus --> SLOCalc[E11y SLO Calculator]
167
+ SLOCalc --> ErrorBudget[E11y Error Budget]
168
+ ErrorBudget --> Alerts[Alertmanager]
169
+ end
170
+
171
+ style SelfMetrics fill:#fff3cd
172
+ style PrometheusExporter fill:#d1ecf1
173
+ style Alerts fill:#f8d7da
174
+ ```
175
+
176
+ ### 2.3. Self-Monitoring Flow
177
+
178
+ ```mermaid
179
+ sequenceDiagram
180
+ participant App as Rails App
181
+ participant E11y as E11y Pipeline
182
+ participant Monitor as Self-Monitor
183
+ participant Prom as Prometheus
184
+ participant Alert as Alertmanager
185
+
186
+ Note over App,Alert: Normal Operation
187
+
188
+ App->>E11y: Events::OrderCreated.track(...)
189
+
190
+ activate E11y
191
+ Note over E11y: Start timer
192
+
193
+ E11y->>E11y: Validate, Filter, Sample
194
+ E11y->>E11y: Buffer event
195
+
196
+ Note over E11y: End timer (0.5ms)
197
+
198
+ E11y->>Monitor: track_latency(0.5ms)
199
+ E11y->>Monitor: track_success()
200
+ E11y->>Monitor: track_buffer_size(42)
201
+
202
+ deactivate E11y
203
+
204
+ E11y-->>App: ✓ tracked
205
+
206
+ Note over Monitor,Prom: Every 15s
207
+
208
+ Monitor->>Prom: Export metrics
209
+ Prom->>Prom: Calculate E11y SLO
210
+
211
+ Note over App,Alert: Degradation Detected
212
+
213
+ App->>E11y: Events::OrderCreated.track(...)
214
+
215
+ activate E11y
216
+ Note over E11y: Slow! (5ms)
217
+
218
+ E11y->>Monitor: track_latency(5ms)
219
+ deactivate E11y
220
+
221
+ Prom->>Prom: E11y p99 = 5ms > 1ms threshold
222
+ Prom->>Alert: Fire: E11yLatencyHigh
223
+ Alert->>Alert: Page SRE: E11y degraded
224
+ ```
225
+
226
+ ---
227
+
228
+ ## 3. Self-Monitoring Metrics
229
+
230
+ ### 3.1. Performance Metrics
231
+
232
+ ```ruby
233
+ # lib/e11y/self_monitoring/performance_monitor.rb
234
+ module E11y
235
+ module SelfMonitoring
236
+ class PerformanceMonitor
237
+ # Track E11y.track() latency
238
+ def self.track_latency(duration_ms, event_class:, severity:)
239
+ E11y::Metrics.histogram(
240
+ :e11y_track_duration_seconds,
241
+ duration_ms / 1000.0,
242
+ labels: {
243
+ event_class: event_class.name,
244
+ severity: severity
245
+ },
246
+ buckets: [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1] # 0.1ms to 100ms
247
+ )
248
+ end
249
+
250
+ # Track middleware execution time
251
+ def self.track_middleware_latency(middleware_name, duration_ms)
252
+ E11y::Metrics.histogram(
253
+ :e11y_middleware_duration_seconds,
254
+ duration_ms / 1000.0,
255
+ labels: { middleware: middleware_name },
256
+ buckets: [0.00001, 0.0001, 0.0005, 0.001, 0.005] # 0.01ms to 5ms
257
+ )
258
+ end
259
+
260
+ # Track adapter send latency
261
+ def self.track_adapter_latency(adapter_name, duration_ms)
262
+ E11y::Metrics.histogram(
263
+ :e11y_adapter_send_duration_seconds,
264
+ duration_ms / 1000.0,
265
+ labels: { adapter: adapter_name },
266
+ buckets: [0.001, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0] # 1ms to 5s
267
+ )
268
+ end
269
+
270
+ # Track buffer flush latency
271
+ def self.track_flush_latency(duration_ms, event_count)
272
+ E11y::Metrics.histogram(
273
+ :e11y_buffer_flush_duration_seconds,
274
+ duration_ms / 1000.0,
275
+ labels: { event_count_bucket: bucket_event_count(event_count) },
276
+ buckets: [0.001, 0.01, 0.05, 0.1, 0.5, 1.0]
277
+ )
278
+ end
279
+
280
+ private
281
+
282
+ def self.bucket_event_count(count)
283
+ case count
284
+ when 0..10 then '1-10'
285
+ when 11..50 then '11-50'
286
+ when 51..100 then '51-100'
287
+ when 101..500 then '101-500'
288
+ else '500+'
289
+ end
290
+ end
291
+ end
292
+ end
293
+ end
294
+ ```
295
+
296
+ ### 3.2. Reliability Metrics
297
+
298
+ ```ruby
299
+ # lib/e11y/self_monitoring/reliability_monitor.rb
300
+ module E11y
301
+ module SelfMonitoring
302
+ class ReliabilityMonitor
303
+ # Track successful event tracking
304
+ def self.track_success(event_class:, severity:, adapters:)
305
+ E11y::Metrics.counter(
306
+ :e11y_events_tracked_total,
307
+ labels: {
308
+ event_class: event_class.name,
309
+ severity: severity,
310
+ result: 'success'
311
+ }
312
+ )
313
+
314
+ # Per-adapter success
315
+ adapters.each do |adapter|
316
+ E11y::Metrics.counter(
317
+ :e11y_adapter_events_total,
318
+ labels: {
319
+ adapter: adapter,
320
+ result: 'success'
321
+ }
322
+ )
323
+ end
324
+ end
325
+
326
+ # Track dropped events (sampling, rate limiting, buffer overflow, validation failure)
327
+ def self.track_dropped(event_class:, reason:)
328
+ E11y::Metrics.counter(
329
+ :e11y_events_dropped_total,
330
+ labels: {
331
+ event_class: event_class.name,
332
+ reason: reason # 'sampled', 'rate_limited', 'buffer_full', 'validation_failed'
333
+ }
334
+ )
335
+ end
336
+
337
+ # Track adapter failures
338
+ def self.track_adapter_failure(adapter_name, error_class)
339
+ E11y::Metrics.counter(
340
+ :e11y_adapter_errors_total,
341
+ labels: {
342
+ adapter: adapter_name,
343
+ error_class: error_class.name
344
+ }
345
+ )
346
+ end
347
+
348
+ # Track circuit breaker state
349
+ def self.track_circuit_breaker_state(adapter_name, state)
350
+ E11y::Metrics.gauge(
351
+ :e11y_circuit_breaker_state,
352
+ state == :open ? 1 : 0,
353
+ labels: { adapter: adapter_name }
354
+ )
355
+ end
356
+
357
+ # Track retry attempts
358
+ def self.track_retry(adapter_name, attempt_number)
359
+ E11y::Metrics.counter(
360
+ :e11y_adapter_retries_total,
361
+ labels: {
362
+ adapter: adapter_name,
363
+ attempt: attempt_number
364
+ }
365
+ )
366
+ end
367
+ end
368
+ end
369
+ end
370
+ ```
371
+
372
+ ### 3.3. Resource Usage Metrics
373
+
374
+ ```ruby
375
+ # lib/e11y/self_monitoring/resource_monitor.rb
376
+ module E11y
377
+ module SelfMonitoring
378
+ class ResourceMonitor
379
+ # Track buffer utilization
380
+ def self.track_buffer_size(current_size, max_size)
381
+ E11y::Metrics.gauge(
382
+ :e11y_buffer_size,
383
+ current_size,
384
+ labels: { type: 'main' }
385
+ )
386
+
387
+ utilization = (current_size.to_f / max_size * 100).round(2)
388
+ E11y::Metrics.gauge(
389
+ :e11y_buffer_utilization_percent,
390
+ utilization,
391
+ labels: { type: 'main' }
392
+ )
393
+ end
394
+
395
+ # Track request-scoped debug buffer
396
+ def self.track_debug_buffer_size(current_size, max_size)
397
+ E11y::Metrics.gauge(
398
+ :e11y_buffer_size,
399
+ current_size,
400
+ labels: { type: 'debug' }
401
+ )
402
+ end
403
+
404
+ # Track memory usage (approximate)
405
+ def self.track_memory_usage
406
+ # Use ObjectSpace to estimate E11y memory
407
+ e11y_objects = ObjectSpace.each_object.select do |obj|
408
+ obj.class.name&.start_with?('E11y::')
409
+ end
410
+
411
+ # Rough estimate: 100 bytes per object
412
+ memory_mb = (e11y_objects.size * 100.0 / 1024 / 1024).round(2)
413
+
414
+ E11y::Metrics.gauge(
415
+ :e11y_memory_usage_mb,
416
+ memory_mb
417
+ )
418
+ end
419
+
420
+ # Track GC pressure (objects allocated)
421
+ def self.track_gc_pressure(objects_allocated)
422
+ E11y::Metrics.counter(
423
+ :e11y_gc_objects_allocated_total,
424
+ objects_allocated
425
+ )
426
+ end
427
+
428
+ # Track CPU time (thread-level)
429
+ def self.track_cpu_time(cpu_seconds)
430
+ E11y::Metrics.counter(
431
+ :e11y_cpu_seconds_total,
432
+ cpu_seconds
433
+ )
434
+ end
435
+ end
436
+ end
437
+ end
438
+ ```
439
+
440
+ ### 3.4. Cardinality Metrics (from ADR-002)
441
+
442
+ ```ruby
443
+ # lib/e11y/self_monitoring/cardinality_monitor.rb
444
+ module E11y
445
+ module SelfMonitoring
446
+ class CardinalityMonitor
447
+ # Track unique label values per metric
448
+ def self.track_metric_cardinality(metric_name, label_name, unique_values_count)
449
+ E11y::Metrics.gauge(
450
+ :e11y_metric_cardinality,
451
+ unique_values_count,
452
+ labels: {
453
+ metric: metric_name,
454
+ label: label_name
455
+ }
456
+ )
457
+ end
458
+
459
+ # Track cardinality protection actions
460
+ def self.track_cardinality_action(action, metric_name, label_name)
461
+ E11y::Metrics.counter(
462
+ :e11y_cardinality_actions_total,
463
+ labels: {
464
+ action: action, # 'dropped', 'relabeled', 'aggregated', 'alerted'
465
+ metric: metric_name,
466
+ label: label_name
467
+ }
468
+ )
469
+ end
470
+
471
+ # Track total metrics count
472
+ def self.track_total_metrics(count)
473
+ E11y::Metrics.gauge(
474
+ :e11y_total_metrics,
475
+ count
476
+ )
477
+ end
478
+ end
479
+ end
480
+ end
481
+ ```
482
+
483
+ ---
484
+
485
+ ## 4. Internal SLO Tracking
486
+
487
+ ### 4.1. E11y SLO Definition
488
+
489
+ **E11y has its own SLO** (separate from application SLO):
490
+
491
+ ```yaml
492
+ # config/e11y_slo.yml
493
+ #
494
+ # E11y Gem Internal SLO
495
+ #
496
+ # This defines reliability targets for E11y itself.
497
+ # If E11y violates its SLO → alert SRE immediately!
498
+
499
+ version: 1
500
+
501
+ e11y_slo:
502
+ # === LATENCY SLO ===
503
+ # E11y.track() must be fast (<1ms p99)
504
+ latency:
505
+ enabled: true
506
+ p99_target: 0.001 # 1ms
507
+ p95_target: 0.0005 # 0.5ms
508
+ p50_target: 0.0001 # 0.1ms
509
+ window: 30d
510
+
511
+ # Multi-window burn rate alerts
512
+ burn_rate_alerts:
513
+ fast:
514
+ enabled: true
515
+ window: 1h
516
+ threshold: 14.4
517
+ alert_after: 5m
518
+ severity: critical
519
+ medium:
520
+ enabled: true
521
+ window: 6h
522
+ threshold: 6.0
523
+ alert_after: 30m
524
+ severity: warning
525
+
526
+ # === RELIABILITY SLO ===
527
+ # E11y must deliver 99.9% of events
528
+ reliability:
529
+ enabled: true
530
+ success_rate_target: 0.999 # 99.9%
531
+ window: 30d
532
+
533
+ # What counts as "success"?
534
+ success_criteria:
535
+ - event_tracked: true
536
+ - not_dropped: true
537
+ - adapter_delivered: true # At least 1 adapter succeeded
538
+
539
+ # What counts as "failure"?
540
+ failure_criteria:
541
+ - validation_failed: true
542
+ - all_adapters_failed: true
543
+ - buffer_overflow: true
544
+
545
+ burn_rate_alerts:
546
+ fast:
547
+ enabled: true
548
+ window: 1h
549
+ threshold: 14.4
550
+ alert_after: 5m
551
+
552
+ # === RESOURCE SLO ===
553
+ # E11y must use <2% CPU, <100MB memory
554
+ resources:
555
+ enabled: true
556
+
557
+ cpu_percent_target: 2.0 # <2% CPU
558
+ memory_mb_target: 100 # <100MB
559
+
560
+ buffer_utilization_target: 80 # <80% full
561
+
562
+ alerts:
563
+ cpu_high:
564
+ threshold: 5.0 # Alert if >5% CPU
565
+ duration: 5m
566
+ memory_high:
567
+ threshold: 200 # Alert if >200MB
568
+ duration: 5m
569
+ buffer_high:
570
+ threshold: 90 # Alert if >90% full
571
+ duration: 1m
572
+
573
+ # === ERROR BUDGET ===
574
+ error_budget:
575
+ enabled: true
576
+
577
+ # Latency budget: fraction of requests allowed to exceed 1ms (0.001 = 0.1%; note that the p99 target above by itself permits 1%)
578
+ latency_budget: 0.001
579
+
580
+ # Reliability budget: 0.1% of events can be dropped
581
+ reliability_budget: 0.001
582
+
583
+ # Alert thresholds
584
+ alert_at_percent_consumed: [50, 80, 90, 100]
585
+ ```
586
+
587
+ ### 4.2. SLO Calculator
588
+
589
+ ```ruby
590
+ # lib/e11y/self_monitoring/slo_calculator.rb
591
+ module E11y
592
+ module SelfMonitoring
593
+ class SLOCalculator
594
+ def self.calculate_latency_slo(window: 30.days)
595
+ # Query Prometheus for E11y latency p99
596
+ query = <<~PROMQL
597
+ histogram_quantile(0.99,
598
+ sum(rate(e11y_track_duration_seconds_bucket[#{window}])) by (le)
599
+ )
600
+ PROMQL
601
+
602
+ p99_latency = E11y::Metrics.query_prometheus(query)
603
+ target = 0.001 # 1ms
604
+
605
+ {
606
+ current_p99: p99_latency,
607
+ target_p99: target,
608
+ slo_met: p99_latency <= target,
609
+ error_budget_consumed: calculate_latency_budget_consumed(p99_latency, target, window)
610
+ }
611
+ end
612
+
613
+ def self.calculate_reliability_slo(window: 30.days)
614
+ # Query Prometheus for E11y success rate
615
+ query = <<~PROMQL
616
+ sum(rate(e11y_events_tracked_total{result="success"}[#{window}]))
617
+ /
618
+ sum(rate(e11y_events_tracked_total[#{window}]))
619
+ PROMQL
620
+
621
+ success_rate = E11y::Metrics.query_prometheus(query)
622
+ target = 0.999 # 99.9%
623
+
624
+ {
625
+ current_success_rate: success_rate,
626
+ target_success_rate: target,
627
+ slo_met: success_rate >= target,
628
+ error_budget_consumed: calculate_reliability_budget_consumed(success_rate, target)
629
+ }
630
+ end
631
+
632
+ def self.calculate_resource_slo
633
+ # Query Prometheus for E11y resource usage
634
+ cpu_query = 'avg(rate(e11y_cpu_seconds_total[5m])) * 100'
635
+ memory_query = 'e11y_memory_usage_mb'
636
+ buffer_query = 'e11y_buffer_utilization_percent'
637
+
638
+ cpu_percent = E11y::Metrics.query_prometheus(cpu_query)
639
+ memory_mb = E11y::Metrics.query_prometheus(memory_query)
640
+ buffer_percent = E11y::Metrics.query_prometheus(buffer_query)
641
+
642
+ {
643
+ cpu: {
644
+ current: cpu_percent,
645
+ target: 2.0,
646
+ slo_met: cpu_percent <= 2.0
647
+ },
648
+ memory: {
649
+ current: memory_mb,
650
+ target: 100,
651
+ slo_met: memory_mb <= 100
652
+ },
653
+ buffer: {
654
+ current: buffer_percent,
655
+ target: 80,
656
+ slo_met: buffer_percent <= 80
657
+ }
658
+ }
659
+ end
660
+
661
+ private
662
+
663
+ def self.calculate_latency_budget_consumed(current, target, window)
664
+ # Simplified: excess of the measured p99 over the target, as a % of a budget equal to 0.1% of the target
665
+ # In reality, use Prometheus query for exact calculation
666
+ return 0.0 if current <= target
667
+
668
+ excess = current - target
669
+ budget = target * 0.001 # 0.1% budget
670
+
671
+ (excess / budget * 100).round(2)
672
+ end
673
+
674
+ def self.calculate_reliability_budget_consumed(current, target)
675
+ error_rate = 1.0 - current
676
+ error_budget = 1.0 - target
677
+
678
+ return 0.0 if error_rate <= error_budget
679
+
680
+ (error_rate / error_budget * 100).round(2)
681
+ end
682
+ end
683
+ end
684
+ end
685
+ ```
686
+
687
+ ---
688
+
689
+ ## 5. Performance Budget
690
+
691
+ ### 5.1. Performance Targets
692
+
693
+ **E11y Performance Budget:**
694
+
695
+ | Operation | p50 | p95 | p99 | p99.9 | Critical? |
696
+ |-----------|-----|-----|-----|-------|-----------|
697
+ | **E11y.track()** | <0.1ms | <0.5ms | <1ms | <5ms | ✅ Yes |
698
+ | **Middleware (each)** | <0.01ms | <0.05ms | <0.1ms | <0.5ms | ✅ Yes |
699
+ | **Validation** | <0.01ms | <0.05ms | <0.1ms | <0.5ms | ✅ Yes |
700
+ | **PII Filtering** | <0.05ms | <0.1ms | <0.5ms | <1ms | ✅ Yes |
701
+ | **Buffer write** | <0.001ms | <0.01ms | <0.05ms | <0.1ms | ✅ Yes |
702
+ | **Buffer flush** | <10ms | <50ms | <100ms | <500ms | ✅ Yes |
703
+ | **Adapter send** | <10ms | <50ms | <100ms | <1s | ⚠️ Async |
704
+
705
+ ### 5.2. Performance Instrumentation
706
+
707
+ ```ruby
708
+ # lib/e11y/instrumentation/performance.rb
709
+ module E11y
710
+ module Instrumentation
711
+ module Performance
712
+ # Instrument E11y.track()
713
+ def self.instrument_track(event_class, &block)
714
+ start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC, :float_millisecond)
715
+
716
+ result = block.call
717
+
718
+ duration_ms = Process.clock_gettime(Process::CLOCK_MONOTONIC, :float_millisecond) - start_time
719
+
720
+ # Track latency
721
+ E11y::SelfMonitoring::PerformanceMonitor.track_latency(
722
+ duration_ms,
723
+ event_class: event_class,
724
+ severity: event_class.severity
725
+ )
726
+
727
+ # Log a warning if >1ms (exceeds the p99 latency target)
728
+ if duration_ms > 1.0
729
+ E11y.logger.warn("E11y.track() slow: #{duration_ms.round(2)}ms for #{event_class.name}")
730
+ end
731
+
732
+ result
733
+ end
734
+
735
+ # Instrument middleware
736
+ def self.instrument_middleware(middleware_name, &block)
737
+ start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC, :float_millisecond)
738
+
739
+ result = block.call
740
+
741
+ duration_ms = Process.clock_gettime(Process::CLOCK_MONOTONIC, :float_millisecond) - start_time
742
+
743
+ E11y::SelfMonitoring::PerformanceMonitor.track_middleware_latency(
744
+ middleware_name,
745
+ duration_ms
746
+ )
747
+
748
+ result
749
+ end
750
+
751
+ # Instrument adapter send
752
+ def self.instrument_adapter_send(adapter_name, event_count, &block)
753
+ start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC, :float_millisecond)
754
+
755
+ result = block.call
756
+
757
+ duration_ms = Process.clock_gettime(Process::CLOCK_MONOTONIC, :float_millisecond) - start_time
758
+
759
+ E11y::SelfMonitoring::PerformanceMonitor.track_adapter_latency(
760
+ adapter_name,
761
+ duration_ms
762
+ )
763
+
764
+ # Compute per-event latency (used only for the slow-adapter warning below)
765
+ per_event_ms = duration_ms / event_count
766
+
767
+ # Log a warning if adapter is slow (>100ms per event)
768
+ if per_event_ms > 100
769
+ E11y.logger.warn("Adapter #{adapter_name} slow: #{per_event_ms.round(2)}ms per event")
770
+ end
771
+
772
+ result
773
+ end
774
+ end
775
+ end
776
+ end
777
+ ```
778
+
779
+ ---
780
+
781
+ ## 6. Health Checks
782
+
783
+ ### 6.1. Health Check API
784
+
785
+ ```ruby
786
+ # lib/e11y/health_check.rb
787
+ module E11y
788
+ class HealthCheck
789
+ def self.status
790
+ {
791
+ status: overall_status,
792
+ timestamp: Time.now.iso8601,
793
+ checks: {
794
+ latency: check_latency,
795
+ reliability: check_reliability,
796
+ resources: check_resources,
797
+ adapters: check_adapters,
798
+ buffer: check_buffer
799
+ },
800
+ slo: {
801
+ latency: SelfMonitoring::SLOCalculator.calculate_latency_slo,
802
+ reliability: SelfMonitoring::SLOCalculator.calculate_reliability_slo,
803
+ resources: SelfMonitoring::SLOCalculator.calculate_resource_slo
804
+ }
805
+ }
806
+ end
807
+
808
+ def self.healthy?
809
+ status[:status] == :healthy
810
+ end
811
+
812
+ private
813
+
814
+ def self.overall_status
815
+ checks = [
816
+ check_latency[:status],
817
+ check_reliability[:status],
818
+ check_resources[:status],
819
+ check_adapters[:status],
820
+ check_buffer[:status]
821
+ ]
822
+
823
+ return :unhealthy if checks.include?(:unhealthy)
824
+ return :degraded if checks.include?(:degraded)
825
+ :healthy
826
+ end
827
+
828
+ def self.check_latency
829
+ slo = SelfMonitoring::SLOCalculator.calculate_latency_slo(window: 5.minutes)
830
+
831
+ {
832
+ status: slo[:slo_met] ? :healthy : :degraded,
833
+ current_p99: slo[:current_p99],
834
+ target_p99: slo[:target_p99],
835
+ message: slo[:slo_met] ? 'Latency within SLO' : 'Latency exceeds SLO'
836
+ }
837
+ end
838
+
839
+ def self.check_reliability
840
+ slo = SelfMonitoring::SLOCalculator.calculate_reliability_slo(window: 5.minutes)
841
+
842
+ {
843
+ status: slo[:slo_met] ? :healthy : :unhealthy,
844
+ current_success_rate: slo[:current_success_rate],
845
+ target_success_rate: slo[:target_success_rate],
846
+ message: slo[:slo_met] ? 'Reliability within SLO' : 'Reliability below SLO'
847
+ }
848
+ end
849
+
850
+ def self.check_resources
851
+ slo = SelfMonitoring::SLOCalculator.calculate_resource_slo
852
+
853
+ cpu_ok = slo[:cpu][:slo_met]
854
+ memory_ok = slo[:memory][:slo_met]
855
+ buffer_ok = slo[:buffer][:slo_met]
856
+
857
+ status = (cpu_ok && memory_ok && buffer_ok) ? :healthy : :degraded
858
+
859
+ {
860
+ status: status,
861
+ cpu_percent: slo[:cpu][:current],
862
+ memory_mb: slo[:memory][:current],
863
+ buffer_percent: slo[:buffer][:current],
864
+ message: status == :healthy ? 'Resources within limits' : 'Resource usage high'
865
+ }
866
+ end
867
+
868
+ def self.check_adapters
869
+ # Check circuit breaker states
870
+ adapters = E11y.config.adapters.all
871
+
872
+ failed_adapters = adapters.select do |name, adapter|
873
+ adapter.circuit_breaker&.open?
874
+ end
875
+
876
+ {
877
+ status: failed_adapters.empty? ? :healthy : :degraded,
878
+ total_adapters: adapters.size,
879
+ failed_adapters: failed_adapters.keys,
880
+ message: failed_adapters.empty? ? 'All adapters healthy' : "#{failed_adapters.size} adapters failed"
881
+ }
882
+ end
883
+
884
+ def self.check_buffer
885
+ buffer = E11y::Buffer.instance
886
+ utilization = (buffer.size.to_f / buffer.max_size * 100).round(2)
887
+
888
+ status = case utilization
889
+ when 0..80 then :healthy
890
+ when 80..90 then :degraded
891
+ else :unhealthy
892
+ end
893
+
894
+ {
895
+ status: status,
896
+ current_size: buffer.size,
897
+ max_size: buffer.max_size,
898
+ utilization_percent: utilization,
899
+ message: "Buffer #{utilization}% full"
900
+ }
901
+ end
902
+ end
903
+ end
904
+ ```
905
+
906
+ ### 6.2. Health Check Endpoint
907
+
908
+ ```ruby
909
+ # config/routes.rb (for Web UI)
910
+ E11y::WebUI::Engine.routes.draw do
911
+ # ... existing routes ...
912
+
913
+ get '/health', to: 'health#show'
914
+ get '/health/detailed', to: 'health#detailed'
915
+ end
916
+
917
+ # app/controllers/e11y/web_ui/health_controller.rb
918
+ module E11y
919
+ module WebUI
920
+ class HealthController < ApplicationController
921
+ def show
922
+ status = E11y::HealthCheck.status
923
+
924
+ render json: {
925
+ status: status[:status],
926
+ timestamp: status[:timestamp]
927
+ }, status: status[:status] == :healthy ? 200 : 503
928
+ end
929
+
930
+ def detailed
931
+ status = E11y::HealthCheck.status
932
+
933
+ render json: status, status: status[:status] == :healthy ? 200 : 503
934
+ end
935
+ end
936
+ end
937
+ end
938
+ ```
939
+
940
+ ---
941
+
942
+ ## 7. Alerting Strategy
943
+
944
+ ### 7.1. Prometheus Alert Rules
945
+
946
+ ```yaml
947
+ # prometheus/alerts/e11y_self_monitoring.yml
948
+ groups:
949
+ - name: e11y_self_monitoring
950
+ interval: 30s
951
+ rules:
952
+ # ===================================================================
953
+ # LATENCY ALERTS
954
+ # ===================================================================
955
+
956
+ - alert: E11yLatencyHigh
957
+ expr: |
958
+ histogram_quantile(0.99,
959
+ sum(rate(e11y_track_duration_seconds_bucket[5m])) by (le)
960
+ ) > 0.001 # >1ms p99
961
+ for: 5m
962
+ labels:
963
+ severity: critical
964
+ component: e11y
965
+ annotations:
966
+ summary: "E11y latency exceeds SLO"
967
+ description: |
968
+ E11y.track() p99 latency is {{ $value | humanize }}s (target: 1ms).
969
+ This will slow down the entire application!
970
+
971
+ Runbook: https://wiki/runbooks/e11y-latency-high
972
+
973
+ - alert: E11yMiddlewareSlow
974
+ expr: |
975
+ histogram_quantile(0.99,
976
+ sum(rate(e11y_middleware_duration_seconds_bucket[5m])) by (le, middleware)
977
+ ) > 0.0005 # >0.5ms p99
978
+ for: 5m
979
+ labels:
980
+ severity: warning
981
+ component: e11y
982
+ annotations:
983
+ summary: "E11y middleware {{ $labels.middleware }} is slow"
984
+ description: "Middleware latency: {{ $value | humanize }}s (target: 0.5ms)"
985
+
986
+ # ===================================================================
987
+ # RELIABILITY ALERTS
988
+ # ===================================================================
989
+
990
+ - alert: E11yReliabilityLow
991
+ expr: |
992
+ sum(rate(e11y_events_tracked_total{result="success"}[5m]))
993
+ /
994
+ sum(rate(e11y_events_tracked_total[5m]))
995
+ < 0.999 # <99.9%
996
+ for: 5m
997
+ labels:
998
+ severity: critical
999
+ component: e11y
1000
+ annotations:
1001
+ summary: "E11y reliability below SLO"
1002
+ description: |
1003
+ E11y success rate is {{ $value | humanizePercentage }} (target: 99.9%).
1004
+ Events are being dropped!
1005
+
1006
+ Runbook: https://wiki/runbooks/e11y-reliability-low
1007
+
1008
+ - alert: E11yEventsDropped
1009
+ expr: |
1010
+ rate(e11y_events_dropped_total[5m]) > 10
1011
+ for: 1m
1012
+ labels:
1013
+ severity: warning
1014
+ component: e11y
1015
+ annotations:
1016
+ summary: "E11y is dropping events"
1017
+ description: |
1018
+ Dropping {{ $value }} events/sec.
1019
+ Reason: {{ $labels.reason }}
1020
+
1021
+ - alert: E11yAdapterFailing
1022
+ expr: |
1023
+ rate(e11y_adapter_errors_total[5m]) > 1
1024
+ for: 5m
1025
+ labels:
1026
+ severity: warning
1027
+ component: e11y
1028
+ annotations:
1029
+ summary: "E11y adapter {{ $labels.adapter }} is failing"
1030
+ description: "Error rate: {{ $value }} errors/sec"
1031
+
1032
+ - alert: E11yCircuitBreakerOpen
1033
+ expr: |
1034
+ e11y_circuit_breaker_state == 1
1035
+ for: 1m
1036
+ labels:
1037
+ severity: critical
1038
+ component: e11y
1039
+ annotations:
1040
+ summary: "E11y circuit breaker open for {{ $labels.adapter }}"
1041
+ description: "Adapter {{ $labels.adapter }} is unavailable"
1042
+
1043
+ # ===================================================================
1044
+ # RESOURCE ALERTS
1045
+ # ===================================================================
1046
+
1047
+ - alert: E11yCPUHigh
1048
+ expr: |
1049
+ avg(rate(e11y_cpu_seconds_total[5m])) * 100 > 5.0
1050
+ for: 5m
1051
+ labels:
1052
+ severity: warning
1053
+ component: e11y
1054
+ annotations:
1055
+ summary: "E11y CPU usage high"
1056
+ description: "CPU usage: {{ $value }}% (target: <2%)"
1057
+
1058
+ - alert: E11yMemoryHigh
1059
+ expr: |
1060
+ e11y_memory_usage_mb > 200
1061
+ for: 5m
1062
+ labels:
1063
+ severity: warning
1064
+ component: e11y
1065
+ annotations:
1066
+ summary: "E11y memory usage high"
1067
+ description: "Memory usage: {{ $value }}MB (target: <100MB)"
1068
+
1069
+ - alert: E11yBufferFull
1070
+ expr: |
1071
+ e11y_buffer_utilization_percent > 90
1072
+ for: 1m
1073
+ labels:
1074
+ severity: critical
1075
+ component: e11y
1076
+ annotations:
1077
+ summary: "E11y buffer nearly full"
1078
+ description: |
1079
+ Buffer utilization: {{ $value }}% (target: <80%).
1080
+ Events will be dropped soon!
1081
+
1082
+ Runbook: https://wiki/runbooks/e11y-buffer-full
1083
+
1084
+ # ===================================================================
1085
+ # CARDINALITY ALERTS
1086
+ # ===================================================================
1087
+
1088
+ - alert: E11yCardinalityHigh
1089
+ expr: |
1090
+ e11y_metric_cardinality > 1000
1091
+ for: 5m
1092
+ labels:
1093
+ severity: warning
1094
+ component: e11y
1095
+ annotations:
1096
+ summary: "E11y metric cardinality high"
1097
+ description: |
1098
+ Metric {{ $labels.metric }} label {{ $labels.label }} has {{ $value }} unique values.
1099
+ This may cause Prometheus performance issues.
1100
+ ```
1101
+
1102
+ ### 7.2. Grafana Dashboard
1103
+
1104
+ ```json
1105
+ {
1106
+ "dashboard": {
1107
+ "title": "E11y Self-Monitoring Dashboard",
1108
+ "panels": [
1109
+ {
1110
+ "title": "E11y Latency (p99)",
1111
+ "targets": [
1112
+ {
1113
+ "expr": "histogram_quantile(0.99, sum(rate(e11y_track_duration_seconds_bucket[5m])) by (le))",
1114
+ "legendFormat": "p99"
1115
+ },
1116
+ {
1117
+ "expr": "0.001",
1118
+ "legendFormat": "SLO Target (1ms)"
1119
+ }
1120
+ ],
1121
+ "yaxis": {
1122
+ "format": "s",
1123
+ "max": 0.005
1124
+ }
1125
+ },
1126
+ {
1127
+ "title": "E11y Success Rate",
1128
+ "targets": [
1129
+ {
1130
+ "expr": "sum(rate(e11y_events_tracked_total{result=\"success\"}[5m])) / sum(rate(e11y_events_tracked_total[5m]))",
1131
+ "legendFormat": "Success Rate"
1132
+ },
1133
+ {
1134
+ "expr": "0.999",
1135
+ "legendFormat": "SLO Target (99.9%)"
1136
+ }
1137
+ ],
1138
+ "yaxis": {
1139
+ "format": "percentunit",
1140
+ "min": 0.99,
1141
+ "max": 1.0
1142
+ }
1143
+ },
1144
+ {
1145
+ "title": "E11y Buffer Utilization",
1146
+ "targets": [
1147
+ {
1148
+ "expr": "e11y_buffer_utilization_percent",
1149
+ "legendFormat": "Buffer %"
1150
+ }
1151
+ ],
1152
+ "thresholds": [
1153
+ { "value": 80, "color": "yellow" },
1154
+ { "value": 90, "color": "red" }
1155
+ ]
1156
+ },
1157
+ {
1158
+ "title": "E11y Resource Usage",
1159
+ "targets": [
1160
+ {
1161
+ "expr": "avg(rate(e11y_cpu_seconds_total[5m])) * 100",
1162
+ "legendFormat": "CPU %"
1163
+ },
1164
+ {
1165
+ "expr": "e11y_memory_usage_mb",
1166
+ "legendFormat": "Memory MB"
1167
+ }
1168
+ ]
1169
+ },
1170
+ {
1171
+ "title": "E11y Events Dropped",
1172
+ "targets": [
1173
+ {
1174
+ "expr": "sum(rate(e11y_events_dropped_total[5m])) by (reason)",
1175
+ "legendFormat": "{{ reason }}"
1176
+ }
1177
+ ]
1178
+ },
1179
+ {
1180
+ "title": "E11y Adapter Health",
1181
+ "targets": [
1182
+ {
1183
+ "expr": "sum(rate(e11y_adapter_events_total{result=\"success\"}[5m])) by (adapter)",
1184
+ "legendFormat": "{{ adapter }} success"
1185
+ },
1186
+ {
1187
+ "expr": "sum(rate(e11y_adapter_errors_total[5m])) by (adapter)",
1188
+ "legendFormat": "{{ adapter }} errors"
1189
+ }
1190
+ ]
1191
+ }
1192
+ ]
1193
+ }
1194
+ }
1195
+ ```
1196
+
1197
+ ---
1198
+
1199
+ ## 8. Trade-offs
1200
+
1201
+ ### 8.1. Key Decisions
1202
+
1203
+ | Decision | Pro | Con | Rationale |
1204
+ |----------|-----|-----|-----------|
1205
+ | **Separate metrics path** | Reliable | Complexity | E11y metrics must survive E11y failure |
1206
+ | **<1% overhead** | Minimal impact | Limited detail | Self-monitoring shouldn't slow E11y |
1207
+ | **99.9% SLO** | High reliability | Strict | E11y is critical infrastructure |
1208
+ | **Multi-window alerts** | Fast detection | More alerts | Same as app SLO (ADR-003) |
1209
+ | **Health check API** | Easy monitoring | Extra endpoint | K8s liveness/readiness |
1210
+ | **Performance budget** | Clear targets | Hard to meet | Forces optimization |
1211
+
1212
+ ### 8.2. Alternatives Considered
1213
+
1214
+ **A) No self-monitoring**
1215
+ - ❌ Rejected: Blind to E11y failures
1216
+
1217
+ **B) Log-based monitoring**
1218
+ - ❌ Rejected: Too slow, not actionable
1219
+
1220
+ **C) Self-monitoring via E11y itself**
1221
+ - ❌ Rejected: Circular dependency (if E11y fails, self-monitoring fails)
1222
+
1223
+ **D) Separate metrics path (CHOSEN) ✅**
1224
+ - ✅ Reliable (independent of E11y)
1225
+ - ✅ Low overhead (<1%)
1226
+ - ✅ Actionable (Prometheus alerts)
1227
+
1228
+ ---
1229
+
1230
+ **Status:** ✅ Complete
1231
+ **Next:** Implementation
1232
+ **Estimated Implementation:** 1 week
1233
+
1234
+ **Key Takeaway:** E11y must monitor itself with the same rigor as it monitors the application. A separate metrics path ensures reliability even during E11y failures.