e11y 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +4 -0
- data/.rubocop.yml +69 -0
- data/CHANGELOG.md +26 -0
- data/CODE_OF_CONDUCT.md +64 -0
- data/LICENSE.txt +21 -0
- data/README.md +179 -0
- data/Rakefile +37 -0
- data/benchmarks/run_all.rb +33 -0
- data/config/README.md +83 -0
- data/config/loki-local-config.yaml +35 -0
- data/config/prometheus.yml +15 -0
- data/docker-compose.yml +78 -0
- data/docs/00-ICP-AND-TIMELINE.md +483 -0
- data/docs/01-SCALE-REQUIREMENTS.md +858 -0
- data/docs/ADR-001-architecture.md +2617 -0
- data/docs/ADR-002-metrics-yabeda.md +1395 -0
- data/docs/ADR-003-slo-observability.md +3337 -0
- data/docs/ADR-004-adapter-architecture.md +2385 -0
- data/docs/ADR-005-tracing-context.md +1372 -0
- data/docs/ADR-006-security-compliance.md +4143 -0
- data/docs/ADR-007-opentelemetry-integration.md +1385 -0
- data/docs/ADR-008-rails-integration.md +1911 -0
- data/docs/ADR-009-cost-optimization.md +2993 -0
- data/docs/ADR-010-developer-experience.md +2166 -0
- data/docs/ADR-011-testing-strategy.md +1836 -0
- data/docs/ADR-012-event-evolution.md +958 -0
- data/docs/ADR-013-reliability-error-handling.md +2750 -0
- data/docs/ADR-014-event-driven-slo.md +1533 -0
- data/docs/ADR-015-middleware-order.md +1061 -0
- data/docs/ADR-016-self-monitoring-slo.md +1234 -0
- data/docs/API-REFERENCE-L28.md +914 -0
- data/docs/COMPREHENSIVE-CONFIGURATION.md +2366 -0
- data/docs/IMPLEMENTATION_NOTES.md +2804 -0
- data/docs/IMPLEMENTATION_PLAN.md +1971 -0
- data/docs/IMPLEMENTATION_PLAN_ARCHITECTURE.md +586 -0
- data/docs/PLAN.md +148 -0
- data/docs/QUICK-START.md +934 -0
- data/docs/README.md +296 -0
- data/docs/design/00-memory-optimization.md +593 -0
- data/docs/guides/MIGRATION-L27-L28.md +692 -0
- data/docs/guides/PERFORMANCE-BENCHMARKS.md +434 -0
- data/docs/guides/README.md +44 -0
- data/docs/prd/01-overview-vision.md +440 -0
- data/docs/use_cases/README.md +119 -0
- data/docs/use_cases/UC-001-request-scoped-debug-buffering.md +813 -0
- data/docs/use_cases/UC-002-business-event-tracking.md +1953 -0
- data/docs/use_cases/UC-003-pattern-based-metrics.md +1627 -0
- data/docs/use_cases/UC-004-zero-config-slo-tracking.md +728 -0
- data/docs/use_cases/UC-005-sentry-integration.md +759 -0
- data/docs/use_cases/UC-006-trace-context-management.md +905 -0
- data/docs/use_cases/UC-007-pii-filtering.md +2648 -0
- data/docs/use_cases/UC-008-opentelemetry-integration.md +1153 -0
- data/docs/use_cases/UC-009-multi-service-tracing.md +1043 -0
- data/docs/use_cases/UC-010-background-job-tracking.md +1018 -0
- data/docs/use_cases/UC-011-rate-limiting.md +1906 -0
- data/docs/use_cases/UC-012-audit-trail.md +2301 -0
- data/docs/use_cases/UC-013-high-cardinality-protection.md +2127 -0
- data/docs/use_cases/UC-014-adaptive-sampling.md +1940 -0
- data/docs/use_cases/UC-015-cost-optimization.md +735 -0
- data/docs/use_cases/UC-016-rails-logger-migration.md +785 -0
- data/docs/use_cases/UC-017-local-development.md +867 -0
- data/docs/use_cases/UC-018-testing-events.md +1081 -0
- data/docs/use_cases/UC-019-tiered-storage-migration.md +562 -0
- data/docs/use_cases/UC-020-event-versioning.md +708 -0
- data/docs/use_cases/UC-021-error-handling-retry-dlq.md +956 -0
- data/docs/use_cases/UC-022-event-registry.md +648 -0
- data/docs/use_cases/backlog.md +226 -0
- data/e11y.gemspec +76 -0
- data/lib/e11y/adapters/adaptive_batcher.rb +207 -0
- data/lib/e11y/adapters/audit_encrypted.rb +239 -0
- data/lib/e11y/adapters/base.rb +580 -0
- data/lib/e11y/adapters/file.rb +224 -0
- data/lib/e11y/adapters/in_memory.rb +216 -0
- data/lib/e11y/adapters/loki.rb +333 -0
- data/lib/e11y/adapters/otel_logs.rb +203 -0
- data/lib/e11y/adapters/registry.rb +141 -0
- data/lib/e11y/adapters/sentry.rb +230 -0
- data/lib/e11y/adapters/stdout.rb +108 -0
- data/lib/e11y/adapters/yabeda.rb +370 -0
- data/lib/e11y/buffers/adaptive_buffer.rb +339 -0
- data/lib/e11y/buffers/base_buffer.rb +40 -0
- data/lib/e11y/buffers/request_scoped_buffer.rb +246 -0
- data/lib/e11y/buffers/ring_buffer.rb +267 -0
- data/lib/e11y/buffers.rb +14 -0
- data/lib/e11y/console.rb +122 -0
- data/lib/e11y/current.rb +48 -0
- data/lib/e11y/event/base.rb +894 -0
- data/lib/e11y/event/value_sampling_config.rb +84 -0
- data/lib/e11y/events/base_audit_event.rb +43 -0
- data/lib/e11y/events/base_payment_event.rb +33 -0
- data/lib/e11y/events/rails/cache/delete.rb +21 -0
- data/lib/e11y/events/rails/cache/read.rb +23 -0
- data/lib/e11y/events/rails/cache/write.rb +22 -0
- data/lib/e11y/events/rails/database/query.rb +45 -0
- data/lib/e11y/events/rails/http/redirect.rb +21 -0
- data/lib/e11y/events/rails/http/request.rb +26 -0
- data/lib/e11y/events/rails/http/send_file.rb +21 -0
- data/lib/e11y/events/rails/http/start_processing.rb +26 -0
- data/lib/e11y/events/rails/job/completed.rb +22 -0
- data/lib/e11y/events/rails/job/enqueued.rb +22 -0
- data/lib/e11y/events/rails/job/failed.rb +22 -0
- data/lib/e11y/events/rails/job/scheduled.rb +23 -0
- data/lib/e11y/events/rails/job/started.rb +22 -0
- data/lib/e11y/events/rails/log.rb +56 -0
- data/lib/e11y/events/rails/view/render.rb +23 -0
- data/lib/e11y/events.rb +18 -0
- data/lib/e11y/instruments/active_job.rb +201 -0
- data/lib/e11y/instruments/rails_instrumentation.rb +141 -0
- data/lib/e11y/instruments/sidekiq.rb +175 -0
- data/lib/e11y/logger/bridge.rb +205 -0
- data/lib/e11y/metrics/cardinality_protection.rb +172 -0
- data/lib/e11y/metrics/cardinality_tracker.rb +134 -0
- data/lib/e11y/metrics/registry.rb +234 -0
- data/lib/e11y/metrics/relabeling.rb +226 -0
- data/lib/e11y/metrics.rb +102 -0
- data/lib/e11y/middleware/audit_signing.rb +174 -0
- data/lib/e11y/middleware/base.rb +140 -0
- data/lib/e11y/middleware/event_slo.rb +167 -0
- data/lib/e11y/middleware/pii_filter.rb +266 -0
- data/lib/e11y/middleware/pii_filtering.rb +280 -0
- data/lib/e11y/middleware/rate_limiting.rb +214 -0
- data/lib/e11y/middleware/request.rb +163 -0
- data/lib/e11y/middleware/routing.rb +157 -0
- data/lib/e11y/middleware/sampling.rb +254 -0
- data/lib/e11y/middleware/slo.rb +168 -0
- data/lib/e11y/middleware/trace_context.rb +131 -0
- data/lib/e11y/middleware/validation.rb +118 -0
- data/lib/e11y/middleware/versioning.rb +132 -0
- data/lib/e11y/middleware.rb +12 -0
- data/lib/e11y/pii/patterns.rb +90 -0
- data/lib/e11y/pii.rb +13 -0
- data/lib/e11y/pipeline/builder.rb +155 -0
- data/lib/e11y/pipeline/zone_validator.rb +110 -0
- data/lib/e11y/pipeline.rb +12 -0
- data/lib/e11y/presets/audit_event.rb +65 -0
- data/lib/e11y/presets/debug_event.rb +34 -0
- data/lib/e11y/presets/high_value_event.rb +51 -0
- data/lib/e11y/presets.rb +19 -0
- data/lib/e11y/railtie.rb +138 -0
- data/lib/e11y/reliability/circuit_breaker.rb +216 -0
- data/lib/e11y/reliability/dlq/file_storage.rb +277 -0
- data/lib/e11y/reliability/dlq/filter.rb +117 -0
- data/lib/e11y/reliability/retry_handler.rb +207 -0
- data/lib/e11y/reliability/retry_rate_limiter.rb +117 -0
- data/lib/e11y/sampling/error_spike_detector.rb +225 -0
- data/lib/e11y/sampling/load_monitor.rb +161 -0
- data/lib/e11y/sampling/stratified_tracker.rb +92 -0
- data/lib/e11y/sampling/value_extractor.rb +82 -0
- data/lib/e11y/self_monitoring/buffer_monitor.rb +79 -0
- data/lib/e11y/self_monitoring/performance_monitor.rb +97 -0
- data/lib/e11y/self_monitoring/reliability_monitor.rb +146 -0
- data/lib/e11y/slo/event_driven.rb +150 -0
- data/lib/e11y/slo/tracker.rb +119 -0
- data/lib/e11y/version.rb +9 -0
- data/lib/e11y.rb +283 -0
- metadata +452 -0
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module E11y
  module Sampling
    # Load Monitor for Adaptive Sampling (FEAT-4842.1)
    #
    # Monitors event volume to enable load-based adaptive sampling.
    # Implements load-based sampling strategy from ADR-009 §3.3.
    #
    # Features:
    # - Event volume tracking (events/second)
    # - Tiered load levels (normal, high, very_high, overload)
    # - Sliding window for rate calculation
    # - Mutex-guarded access to the recorded timestamps
    #
    # The rate is the number of events recorded during the last +window+
    # seconds divided by +window+, i.e. an average over the whole window.
    #
    # Thresholds are lower bounds (events/second) and are checked from the
    # highest tier down:
    # - rate >= :overload                  -> :overload  (sample rate 0.01)
    # - rate >= :very_high                 -> :very_high (sample rate 0.1)
    # - rate >= :high or rate >= :normal   -> :high      (sample rate 0.5)
    # - anything lower                     -> :normal    (sample rate 1.0)
    #
    # @example Configuration
    #   monitor = E11y::Sampling::LoadMonitor.new(
    #     window: 60, # 60 seconds sliding window
    #     thresholds: {
    #       normal: 1_000,     # below 1k events/sec the level is :normal
    #       high: 10_000,
    #       very_high: 50_000, # from 50k events/sec the level is :very_high
    #       overload: 100_000  # from 100k events/sec the level is :overload
    #     }
    #   )
    #
    # @example Usage
    #   monitor.record_event
    #
    #   sample_rate = case monitor.load_level
    #                 when :normal then 1.0
    #                 when :high then 0.5
    #                 when :very_high then 0.1
    #                 when :overload then 0.01
    #                 end
    class LoadMonitor
      # Default sliding window, in seconds
      DEFAULT_WINDOW = 60

      # Default lower bounds (events/sec) for each load level
      DEFAULT_THRESHOLDS = {
        normal: 1_000,     # below this the level is :normal (100% sampling)
        high: 10_000,
        very_high: 50_000, # from here the level is :very_high (10% sampling)
        overload: 100_000  # from here the level is :overload (1% sampling)
      }.freeze

      # Recommended sample rate for each load level
      SAMPLE_RATES = {
        normal: 1.0,
        high: 0.5,
        very_high: 0.1,
        overload: 0.01
      }.freeze

      attr_reader :window, :thresholds

      # Initialize load monitor
      #
      # @param config [Hash] Configuration options
      # @option config [Integer] :window (60) Sliding window in seconds
      # @option config [Hash] :thresholds ({}) Load thresholds (events/sec),
      #   merged over DEFAULT_THRESHOLDS
      def initialize(config = {})
        @window = config.fetch(:window, DEFAULT_WINDOW)
        @thresholds = DEFAULT_THRESHOLDS.merge(config.fetch(:thresholds, {}))

        @events = [] # Timestamps of tracked events, in insertion order
        @mutex = Mutex.new
      end

      # Record an event for load tracking
      #
      # @return [void]
      def record_event
        @mutex.synchronize do
          now = Time.now
          @events << now
          cleanup_old_events(now)
        end
      end

      # Get current event rate (events per second)
      #
      # @return [Float] Events per second, averaged over the window
      def current_rate
        @mutex.synchronize { rate_at(Time.now) }
      end

      # Get current load level
      #
      # @return [Symbol] Load level (:normal, :high, :very_high, :overload)
      def load_level
        level_for(current_rate)
      end

      # Get recommended sample rate for current load
      #
      # @return [Float] Sample rate (0.0-1.0)
      def recommended_sample_rate
        SAMPLE_RATES[load_level]
      end

      # Check if system is overloaded
      #
      # @return [Boolean] true if overload level reached
      def overloaded?
        load_level == :overload
      end

      # Reset monitor state (useful for testing)
      #
      # @return [void]
      def reset!
        @mutex.synchronize { @events.clear }
      end

      # Get load statistics
      #
      # Rate, level, sample rate and event count are all derived from a
      # single reading taken under one lock, so the returned values always
      # agree with each other.
      #
      # @return [Hash] Statistics (rate, level, sample_rate, event_count, window)
      def stats
        rate, event_count = @mutex.synchronize { [rate_at(Time.now), @events.size] }
        level = level_for(rate)

        {
          rate: rate,
          level: level,
          sample_rate: SAMPLE_RATES[level],
          event_count: event_count,
          window: @window
        }
      end

      private

      # Prune expired timestamps and compute the rate. Caller must hold @mutex.
      #
      # @param now [Time] Current timestamp
      # @return [Float] Events per second
      def rate_at(now)
        cleanup_old_events(now)
        @events.size.to_f / @window
      end

      # Map a rate onto a load level using the configured thresholds.
      #
      # @param rate [Float] Events per second
      # @return [Symbol] Load level
      def level_for(rate)
        if rate >= @thresholds[:overload]
          :overload
        elsif rate >= @thresholds[:very_high]
          :very_high
        elsif rate >= @thresholds[:high] || rate >= @thresholds[:normal]
          :high
        else
          :normal
        end
      end

      # Cleanup events outside the sliding window
      #
      # Timestamps are appended in insertion order, so expired entries are
      # dropped from the front instead of rescanning the whole list on every
      # call. An entry exactly +window+ seconds old is still kept. If the wall
      # clock steps backwards between events, an out-of-order entry is only
      # dropped once the entries ahead of it have expired.
      #
      # @param now [Time] Current timestamp
      # @return [void]
      def cleanup_old_events(now)
        cutoff = now - @window
        @events.shift while @events.first && @events.first < cutoff
      end
    end
  end
end
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module E11y
  module Sampling
    # Stratified Sampling Tracker for SLO accuracy (FEAT-4851, C11 Resolution)
    #
    # Tracks sampling statistics per severity stratum to enable sampling correction
    # in SLO calculations. Ensures accurate SLO metrics even with aggressive sampling.
    #
    # @example Usage in sampling middleware
    #   tracker = StratifiedTracker.new
    #   tracker.record_sample(severity: :success, sample_rate: 0.1, sampled: true)
    #   tracker.record_sample(severity: :error, sample_rate: 1.0, sampled: true)
    #
    #   correction = tracker.sampling_correction(:success) # => 10.0 (1/0.1)
    #
    # @see ADR-009 §3.7 Stratified Sampling for SLO Accuracy
    # @see UC-014 Adaptive Sampling (C11 Resolution)
    class StratifiedTracker
      # Counters a stratum starts with before any sample is recorded
      EMPTY_STRATUM = { sampled_count: 0, total_count: 0, sample_rate_sum: 0.0 }.freeze

      # @return [Hash{Symbol => Hash}] Stratum statistics (live, not a copy)
      attr_reader :strata

      def initialize
        @strata = Hash.new { |hash, severity| hash[severity] = EMPTY_STRATUM.dup }
        @mutex = Mutex.new
      end

      # Record a sampling decision for a severity stratum
      #
      # @param severity [Symbol] Event severity (:debug, :info, :success, :warn, :error, :fatal)
      # @param sample_rate [Float] Sample rate used (0.0-1.0)
      # @param sampled [Boolean] Whether event was sampled
      # @return [void]
      def record_sample(severity:, sample_rate:, sampled:)
        @mutex.synchronize do
          stratum = @strata[severity]
          stratum[:total_count] += 1
          stratum[:sampled_count] += 1 if sampled
          stratum[:sample_rate_sum] += sample_rate
        end
      end

      # Get sampling correction factor for a severity
      #
      # Correction factor = 1 / (average sample rate recorded for the stratum).
      # Multiply observed counts by this to estimate true counts.
      #
      # Querying a severity that was never recorded does not create a stratum.
      #
      # @param severity [Symbol] Event severity
      # @return [Float] Correction factor (1.0 if nothing was sampled for the
      #   stratum or its average sample rate is zero)
      def sampling_correction(severity)
        @mutex.synchronize do
          # fetch bypasses the default proc, so this read leaves @strata untouched
          stratum = @strata.fetch(severity, EMPTY_STRATUM)
          next 1.0 if stratum[:sampled_count].zero?

          avg_sample_rate = stratum[:sample_rate_sum] / stratum[:total_count]
          avg_sample_rate.zero? ? 1.0 : 1.0 / avg_sample_rate
        end
      end

      # Get statistics for a severity stratum
      #
      # Querying a severity that was never recorded returns zeroed counters
      # without creating a stratum.
      #
      # @param severity [Symbol] Event severity
      # @return [Hash] Copy of the stratum statistics
      def stratum_stats(severity)
        @mutex.synchronize do
          @strata.fetch(severity, EMPTY_STRATUM).dup
        end
      end

      # Get statistics for all strata
      #
      # @return [Hash{Symbol => Hash}] Copies of all stratum statistics
      def all_strata_stats
        @mutex.synchronize do
          @strata.transform_values(&:dup)
        end
      end

      # Reset all statistics
      #
      # @return [void]
      def reset!
        @mutex.synchronize do
          @strata.clear
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module E11y
  module Sampling
    # ValueExtractor pulls a numeric value out of an event payload (FEAT-4847)
    #
    # Supports:
    # - Nested field access (dot notation: "user.balance")
    # - Type coercion (numeric strings to numbers)
    # - Nil handling (returns 0.0 for missing fields)
    #
    # Used by value-based sampling to prioritize high-value events.
    #
    # @example Basic usage
    #   extractor = ValueExtractor.new
    #   extractor.extract({ amount: 1500, currency: "USD" }, :amount) # => 1500.0
    #
    # @example Nested fields
    #   extractor.extract({ user: { balance: 5000 } }, "user.balance") # => 5000.0
    #
    # @example Type coercion
    #   extractor.extract({ amount: "1234.56" }, :amount) # => 1234.56
    #
    # @example Nil handling
    #   extractor.extract({}, :missing) # => 0.0
    class ValueExtractor
      # Extract numeric value from event data
      #
      # @param event_data [Hash] Event payload
      # @param field [String, Symbol] Field path (supports dot notation for nested fields)
      # @return [Float] Extracted value (0.0 if field is missing or non-numeric)
      def extract(event_data, field)
        coerce_to_number(navigate_to_field(event_data, field))
      end

      private

      # Walk the payload along a dot-separated path.
      #
      # Each path segment is looked up as a Symbol key first and as a String
      # key when the Symbol lookup yields nil or false.
      #
      # @param data [Hash] Payload to walk
      # @param field [String, Symbol] Field path
      # @return [Object, nil] Field value or nil if not found
      def navigate_to_field(data, field)
        node = data
        return nil unless node.is_a?(Hash)

        field.to_s.split(".").each do |segment|
          # A non-Hash in the middle of the path means the field does not exist
          return nil unless node.is_a?(Hash)

          node = node[segment.to_sym] || node[segment]
        end

        node
      end

      # Coerce value to Float
      #
      # @param value [Object] Value to coerce
      # @return [Float] Numeric value (0.0 for nil or non-coercible)
      def coerce_to_number(value)
        case value
        when Numeric then value.to_f
        when String then Float(value) # raises on non-numeric text, handled below
        else 0.0
        end
      rescue ArgumentError, TypeError
        0.0
      end
    end
  end
end
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "e11y/metrics"
|
|
4
|
+
|
|
5
|
+
module E11y
  module SelfMonitoring
    # Buffer monitoring for E11y internal operations.
    #
    # Emits buffer metrics through E11y::Metrics:
    # - Buffer size and utilization (gauges)
    # - Buffer overflows (counter)
    # - Buffer flushes (counter plus flushed-event histogram)
    #
    # @see ADR-016 §3.3 (Buffer Metrics)
    # @example
    #   E11y::SelfMonitoring::BufferMonitor.track_buffer_size(42, buffer_type: 'ring')
    module BufferMonitor
      class << self
        # Report the current buffer size as a gauge.
        #
        # @param size [Integer] Current number of events in buffer
        # @param buffer_type [String] Buffer type (e.g., 'ring', 'request_scoped')
        # @return [void]
        def track_buffer_size(size, buffer_type:)
          labels = { buffer_type: buffer_type }
          E11y::Metrics.gauge(:e11y_buffer_size, size, labels)
        end

        # Count a buffer overflow (event dropped due to full buffer).
        #
        # @param buffer_type [String] Buffer type
        # @return [void]
        def track_buffer_overflow(buffer_type:)
          labels = { buffer_type: buffer_type }
          E11y::Metrics.increment(:e11y_buffer_overflows_total, labels)
        end

        # Count a buffer flush and record how many events it carried.
        #
        # @param buffer_type [String] Buffer type
        # @param event_count [Integer] Number of events flushed
        # @param trigger [String] Flush trigger (e.g., 'size', 'timeout', 'explicit')
        # @return [void]
        def track_buffer_flush(buffer_type:, event_count:, trigger:)
          flush_labels = { buffer_type: buffer_type, trigger: trigger }
          E11y::Metrics.increment(:e11y_buffer_flushes_total, flush_labels)

          # The histogram is labelled by buffer type only, not by trigger
          size_labels = { buffer_type: buffer_type }
          E11y::Metrics.histogram(
            :e11y_buffer_flush_events_count,
            event_count,
            size_labels,
            buckets: [1, 10, 50, 100, 500, 1000, 5000]
          )
        end

        # Report buffer utilization (percentage) as a gauge.
        #
        # @param utilization_percent [Numeric] Buffer utilization percentage (0-100)
        # @param buffer_type [String] Buffer type
        # @return [void]
        def track_buffer_utilization(utilization_percent, buffer_type:)
          labels = { buffer_type: buffer_type }
          E11y::Metrics.gauge(:e11y_buffer_utilization_percent, utilization_percent, labels)
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "e11y/metrics"
|
|
4
|
+
|
|
5
|
+
module E11y
  module SelfMonitoring
    # Performance monitoring for E11y internal operations.
    #
    # Records latency histograms (in seconds) through E11y::Metrics for:
    # - Event tracking (E11y.track)
    # - Middleware execution
    # - Adapter writes
    # - Buffer flushes
    #
    # All public methods take durations in milliseconds and convert them to
    # seconds before recording.
    #
    # @see ADR-016 §3.1 (Performance Metrics)
    # @example
    #   E11y::SelfMonitoring::PerformanceMonitor.track_latency(0.5, event_class: 'OrderCreated', severity: :info)
    module PerformanceMonitor
      # Inclusive upper bounds and labels for the flush event-count buckets,
      # in ascending order. Counts above the last bound use
      # OVERFLOW_EVENT_COUNT_BUCKET.
      EVENT_COUNT_BUCKETS = [
        [10, "1-10"],
        [50, "11-50"],
        [100, "51-100"],
        [500, "101-500"]
      ].freeze

      # Label for counts above the largest bucket bound (and non-numeric input)
      OVERFLOW_EVENT_COUNT_BUCKET = "500+"

      private_constant :EVENT_COUNT_BUCKETS, :OVERFLOW_EVENT_COUNT_BUCKET

      # Track E11y.track() latency.
      #
      # @param duration_ms [Numeric] Duration in milliseconds
      # @param event_class [String] Event class name
      # @param severity [Symbol] Event severity
      # @return [void]
      def self.track_latency(duration_ms, event_class:, severity:)
        E11y::Metrics.histogram(
          :e11y_track_duration_seconds,
          duration_ms / 1000.0,
          {
            event_class: event_class,
            severity: severity
          },
          buckets: [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1] # 0.1ms to 100ms
        )
      end

      # Track middleware execution time.
      #
      # @param middleware_name [String] Middleware class name
      # @param duration_ms [Numeric] Duration in milliseconds
      # @return [void]
      def self.track_middleware_latency(middleware_name, duration_ms)
        E11y::Metrics.histogram(
          :e11y_middleware_duration_seconds,
          duration_ms / 1000.0,
          { middleware: middleware_name },
          buckets: [0.00001, 0.0001, 0.0005, 0.001, 0.005] # 0.01ms to 5ms
        )
      end

      # Track adapter send latency.
      #
      # @param adapter_name [String] Adapter class name
      # @param duration_ms [Numeric] Duration in milliseconds
      # @return [void]
      def self.track_adapter_latency(adapter_name, duration_ms)
        E11y::Metrics.histogram(
          :e11y_adapter_send_duration_seconds,
          duration_ms / 1000.0,
          { adapter: adapter_name },
          buckets: [0.001, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0] # 1ms to 5s
        )
      end

      # Track buffer flush latency.
      #
      # The event count is reduced to a coarse bucket label so the metric's
      # label cardinality stays bounded.
      #
      # @param duration_ms [Numeric] Duration in milliseconds
      # @param event_count [Integer] Number of events flushed
      # @return [void]
      def self.track_flush_latency(duration_ms, event_count)
        E11y::Metrics.histogram(
          :e11y_buffer_flush_duration_seconds,
          duration_ms / 1000.0,
          { event_count_bucket: bucket_event_count(event_count) },
          buckets: [0.001, 0.01, 0.05, 0.1, 0.5, 1.0] # 1ms to 1s
        )
      end

      # Convert event count to a low-cardinality bucket label.
      #
      # Buckets are matched by upper bound, so zero, negative and fractional
      # counts land in the nearest bucket at or above them rather than falling
      # through to the overflow label. Non-numeric input yields the overflow
      # label.
      #
      # @param count [Numeric] Event count
      # @return [String] Bucket label ("1-10", "11-50", "51-100", "101-500", "500+")
      # @api private
      def self.bucket_event_count(count)
        # Non-real or non-numeric values cannot be compared against the bounds
        return OVERFLOW_EVENT_COUNT_BUCKET unless count.is_a?(Numeric) && count.real?

        EVENT_COUNT_BUCKETS.each do |upper_bound, label|
          return label if count <= upper_bound
        end

        OVERFLOW_EVENT_COUNT_BUCKET
      end

      private_class_method :bucket_event_count
    end
  end
end
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "e11y/metrics"
|
|
4
|
+
|
|
5
|
+
module E11y
  module SelfMonitoring
    # Reliability monitoring for E11y internal operations.
    #
    # Emits success/failure counters and a circuit-state gauge through
    # E11y::Metrics for:
    # - Event tracking
    # - Adapter writes
    # - DLQ saves and replays
    # - Circuit breaker state
    #
    # NOTE(review): success and failure increments share a metric name but the
    # failure variants carry an extra label (`reason` / `error_class`) —
    # confirm the metrics backend accepts differing label sets per metric.
    #
    # @see ADR-016 §3.2 (Reliability Metrics)
    # @example
    #   E11y::SelfMonitoring::ReliabilityMonitor.track_event_success(event_type: 'order.created')
    module ReliabilityMonitor
      # Gauge value reported for each circuit breaker state
      CIRCUIT_STATE_VALUES = {
        "closed" => 0,
        "half_open" => 1,
        "open" => 2
      }.freeze

      private_constant :CIRCUIT_STATE_VALUES

      # Track successful event tracking.
      #
      # @param event_type [String] Event type/name
      # @return [void]
      def self.track_event_success(event_type:)
        E11y::Metrics.increment(
          :e11y_events_tracked_total,
          {
            event_type: event_type,
            status: "success"
          }
        )
      end

      # Track failed event tracking.
      #
      # @param event_type [String] Event type/name
      # @param reason [String] Failure reason (e.g., 'validation_error', 'adapter_error')
      # @return [void]
      def self.track_event_failure(event_type:, reason:)
        E11y::Metrics.increment(
          :e11y_events_tracked_total,
          {
            event_type: event_type,
            status: "failure",
            reason: reason
          }
        )
      end

      # Track dropped event (rate limited, sampled out, etc).
      #
      # @param event_type [String] Event type/name
      # @param reason [String] Drop reason (e.g., 'rate_limited', 'sampled_out')
      # @return [void]
      def self.track_event_dropped(event_type:, reason:)
        E11y::Metrics.increment(
          :e11y_events_dropped_total,
          {
            event_type: event_type,
            reason: reason
          }
        )
      end

      # Track adapter write success.
      #
      # @param adapter_name [String] Adapter class name
      # @return [void]
      def self.track_adapter_success(adapter_name:)
        E11y::Metrics.increment(
          :e11y_adapter_writes_total,
          {
            adapter: adapter_name,
            status: "success"
          }
        )
      end

      # Track adapter write failure.
      #
      # @param adapter_name [String] Adapter class name
      # @param error_class [String] Error class name
      # @return [void]
      def self.track_adapter_failure(adapter_name:, error_class:)
        E11y::Metrics.increment(
          :e11y_adapter_writes_total,
          {
            adapter: adapter_name,
            status: "failure",
            error_class: error_class
          }
        )
      end

      # Track DLQ save operation.
      #
      # @param reason [String] Reason for DLQ save (e.g., 'adapter_error', 'rate_limited')
      # @return [void]
      def self.track_dlq_save(reason:)
        E11y::Metrics.increment(
          :e11y_dlq_saves_total,
          { reason: reason }
        )
      end

      # Track DLQ replay operation.
      #
      # @param status [String] Replay status ('success' or 'failure')
      # @return [void]
      def self.track_dlq_replay(status:)
        E11y::Metrics.increment(
          :e11y_dlq_replays_total,
          { status: status }
        )
      end

      # Track circuit breaker state change.
      #
      # @param adapter_name [String] Adapter class name
      # @param state [String, Symbol] New circuit state ('open', 'half_open', 'closed')
      # @return [void]
      def self.track_circuit_state(adapter_name:, state:)
        E11y::Metrics.gauge(
          :e11y_circuit_breaker_state,
          state_to_value(state),
          { adapter: adapter_name }
        )
      end

      # Convert circuit state to numeric value for gauge.
      #
      # The state is normalised with +to_s+ so Symbol states (:open) map the
      # same way as their String form instead of silently reporting 0.
      # Unrecognised states are reported as 0.
      #
      # @param state [String, Symbol] Circuit state
      # @return [Integer] Numeric representation (0=closed, 1=half_open, 2=open)
      # @api private
      def self.state_to_value(state)
        CIRCUIT_STATE_VALUES.fetch(state.to_s, 0)
      end

      private_class_method :state_to_value
    end
  end
end
|