e11y 0.2.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +130 -10
- data/CHANGELOG.md +80 -1
- data/CLAUDE.md +168 -0
- data/CONTRIBUTING.md +640 -0
- data/README.md +165 -701
- data/RELEASE.md +41 -12
- data/Rakefile +249 -57
- data/config/README.md +1 -1
- data/config/loki-local-config.yaml +12 -0
- data/config/otel-collector-config.yaml +44 -0
- data/cucumber.yml +1 -0
- data/docker-compose.yml +18 -2
- data/docs/ADAPTERS.md +76 -0
- data/docs/ADAPTIVE_SAMPLING.md +59 -0
- data/docs/COMPARISON.md +104 -0
- data/docs/CONFIGURATION.md +52 -0
- data/docs/DISTRIBUTED_TRACING.md +44 -0
- data/docs/LIMITATIONS.md +13 -0
- data/docs/METRICS_DSL.md +84 -0
- data/docs/PERFORMANCE.md +60 -0
- data/docs/PII_FILTERING.md +40 -0
- data/docs/PRESETS.md +65 -0
- data/docs/QUICK-START.md +546 -587
- data/docs/RAILS_INTEGRATION.md +79 -0
- data/docs/SCHEMA_VALIDATION.md +63 -0
- data/docs/SLO-PROMQL-ALERTS.md +161 -0
- data/docs/TESTING.md +69 -0
- data/docs/{ADR-001-architecture.md → architecture/ADR-001-architecture.md} +36 -65
- data/docs/{ADR-002-metrics-yabeda.md → architecture/ADR-002-metrics-yabeda.md} +62 -236
- data/docs/architecture/ADR-003-slo-observability.md +1402 -0
- data/docs/{ADR-004-adapter-architecture.md → architecture/ADR-004-adapter-architecture.md} +163 -146
- data/docs/{ADR-005-tracing-context.md → architecture/ADR-005-tracing-context.md} +10 -9
- data/docs/{ADR-006-security-compliance.md → architecture/ADR-006-security-compliance.md} +184 -191
- data/docs/{ADR-007-opentelemetry-integration.md → architecture/ADR-007-opentelemetry-integration.md} +3 -21
- data/docs/{ADR-008-rails-integration.md → architecture/ADR-008-rails-integration.md} +182 -743
- data/docs/{ADR-009-cost-optimization.md → architecture/ADR-009-cost-optimization.md} +45 -54
- data/docs/architecture/ADR-010-developer-experience.md +522 -0
- data/docs/{ADR-011-testing-strategy.md → architecture/ADR-011-testing-strategy.md} +44 -86
- data/docs/{ADR-012-event-evolution.md → architecture/ADR-012-event-evolution.md} +11 -11
- data/docs/{ADR-013-reliability-error-handling.md → architecture/ADR-013-reliability-error-handling.md} +37 -12
- data/docs/{ADR-014-event-driven-slo.md → architecture/ADR-014-event-driven-slo.md} +12 -24
- data/docs/{ADR-015-middleware-order.md → architecture/ADR-015-middleware-order.md} +43 -59
- data/docs/{ADR-016-self-monitoring-slo.md → architecture/ADR-016-self-monitoring-slo.md} +58 -355
- data/docs/{ADR-017-multi-rails-compatibility.md → architecture/ADR-017-multi-rails-compatibility.md} +4 -11
- data/docs/architecture/ADR-018-memory-optimization.md +366 -0
- data/docs/{ADR-INDEX.md → architecture/ADR-INDEX.md} +11 -6
- data/docs/plans/2026-03-20-browser-overlay-svelte.md +281 -0
- data/docs/{00-ICP-AND-TIMELINE.md → prd/00-ICP-AND-TIMELINE.md} +6 -6
- data/docs/{01-SCALE-REQUIREMENTS.md → prd/01-SCALE-REQUIREMENTS.md} +6 -6
- data/docs/prd/01-overview-vision.md +19 -14
- data/docs/use_cases/README.md +22 -23
- data/docs/use_cases/UC-001-request-scoped-debug-buffering.md +50 -44
- data/docs/use_cases/UC-002-business-event-tracking.md +26 -95
- data/docs/use_cases/UC-003-event-metrics.md +66 -0
- data/docs/use_cases/UC-004-zero-config-slo-tracking.md +33 -684
- data/docs/use_cases/UC-005-sentry-integration.md +13 -15
- data/docs/use_cases/UC-006-trace-context-management.md +30 -28
- data/docs/use_cases/UC-007-pii-filtering.md +35 -87
- data/docs/use_cases/UC-008-opentelemetry-integration.md +51 -89
- data/docs/use_cases/UC-009-multi-service-tracing.md +30 -178
- data/docs/use_cases/UC-010-background-job-tracking.md +24 -91
- data/docs/use_cases/UC-011-rate-limiting.md +95 -168
- data/docs/use_cases/UC-012-audit-trail.md +21 -46
- data/docs/use_cases/UC-013-high-cardinality-protection.md +29 -167
- data/docs/use_cases/UC-014-adaptive-sampling.md +2 -2
- data/docs/use_cases/UC-015-cost-optimization.md +46 -99
- data/docs/use_cases/UC-016-rails-logger-migration.md +39 -213
- data/docs/use_cases/UC-017-local-development.md +203 -777
- data/docs/use_cases/UC-018-testing-events.md +3 -3
- data/docs/use_cases/UC-019-retention-based-routing.md +53 -106
- data/docs/use_cases/UC-020-event-versioning.md +8 -9
- data/docs/use_cases/UC-021-error-handling-retry-dlq.md +18 -22
- data/docs/use_cases/UC-022-event-registry.md +15 -21
- data/docs/use_cases/backlog.md +119 -87
- data/e11y.gemspec +2 -2
- data/gems/e11y-devtools/README.md +158 -0
- data/gems/e11y-devtools/config/routes.rb +15 -0
- data/gems/e11y-devtools/e11y-devtools.gemspec +25 -0
- data/gems/e11y-devtools/exe/e11y +34 -0
- data/gems/e11y-devtools/frontend/.gitignore +24 -0
- data/gems/e11y-devtools/frontend/README.md +51 -0
- data/gems/e11y-devtools/frontend/index.html +14 -0
- data/gems/e11y-devtools/frontend/package-lock.json +3707 -0
- data/gems/e11y-devtools/frontend/package.json +28 -0
- data/gems/e11y-devtools/frontend/public/mocks/v1/events/recent.json +4205 -0
- data/gems/e11y-devtools/frontend/public/mocks/v1/interactions.json +194 -0
- data/gems/e11y-devtools/frontend/public/mocks/v1/traces/0a2e04027cfa22d014bc22e8b27cd913/events.json +86 -0
- data/gems/e11y-devtools/frontend/public/mocks/v1/traces/0e1543af6a630fb3af6b52283154b3e0/events.json +169 -0
- data/gems/e11y-devtools/frontend/public/mocks/v1/traces/1838b691faa49564f97db8592ff3978d/events.json +78 -0
- data/gems/e11y-devtools/frontend/public/mocks/v1/traces/29f198f6588dacffb687777eb5f8f118/events.json +197 -0
- data/gems/e11y-devtools/frontend/public/mocks/v1/traces/34bc3c9c0097de28a7a6f99b90a8e7bc/events.json +194 -0
- data/gems/e11y-devtools/frontend/public/mocks/v1/traces/3ba6c20d068ab9cee00e51b180e66444/events.json +184 -0
- data/gems/e11y-devtools/frontend/public/mocks/v1/traces/435bfd8f17b9009146a79812d7c3726d/events.json +144 -0
- data/gems/e11y-devtools/frontend/public/mocks/v1/traces/4c7676e3fe668e99edb2b94d7d5678a9/events.json +222 -0
- data/gems/e11y-devtools/frontend/public/mocks/v1/traces/6daf0d47974bedfc55d5de7004a3ea9f/events.json +194 -0
- data/gems/e11y-devtools/frontend/public/mocks/v1/traces/8a81ada42834d15f287bb40010043605/events.json +194 -0
- data/gems/e11y-devtools/frontend/public/mocks/v1/traces/8c0a98900edaae105469df8daedccf02/events.json +198 -0
- data/gems/e11y-devtools/frontend/public/mocks/v1/traces/8e4f645180f8a7d1dce426b07380466b/events.json +222 -0
- data/gems/e11y-devtools/frontend/public/mocks/v1/traces/93db346fa5d44a032605a13b627f4b80/events.json +128 -0
- data/gems/e11y-devtools/frontend/public/mocks/v1/traces/98ff6146faf7bd9be8bd03a8275817ba/events.json +223 -0
- data/gems/e11y-devtools/frontend/public/mocks/v1/traces/9997ddd0247bc7e25f2ca7a5c415c93d/events.json +197 -0
- data/gems/e11y-devtools/frontend/public/mocks/v1/traces/99e35f8ef3baedd798cc4fd085980ad9/events.json +194 -0
- data/gems/e11y-devtools/frontend/public/mocks/v1/traces/b4f3095c1909924cbc98889a86c83d6d/events.json +131 -0
- data/gems/e11y-devtools/frontend/public/mocks/v1/traces/b54b7fc32b7575a7110de809d11ccda0/events.json +128 -0
- data/gems/e11y-devtools/frontend/public/mocks/v1/traces/c0b48033fa06746bcc5886745e053cff/events.json +169 -0
- data/gems/e11y-devtools/frontend/public/mocks/v1/traces/c44649ac76701b4558927cd2305ab535/events.json +169 -0
- data/gems/e11y-devtools/frontend/public/mocks/v1/traces/d601ae3320057580a39dbdac2edfdf4a/events.json +248 -0
- data/gems/e11y-devtools/frontend/public/mocks/v1/traces/e67e724bab422d2b52eeb49635e512e1/events.json +194 -0
- data/gems/e11y-devtools/frontend/public/mocks/v1/traces/e6c72765a28f158a8485b35fa63f73da/events.json +194 -0
- data/gems/e11y-devtools/frontend/public/mocks/v1/traces/f541b87405c9a54819b18ebe529f6419/events.json +194 -0
- data/gems/e11y-devtools/frontend/scripts/generate_mocks.rb +397 -0
- data/gems/e11y-devtools/frontend/src/App.svelte +827 -0
- data/gems/e11y-devtools/frontend/src/components/Fab.svelte +19 -0
- data/gems/e11y-devtools/frontend/src/components/FilterBar.svelte +38 -0
- data/gems/e11y-devtools/frontend/src/components/FullscreenPanel.svelte +82 -0
- data/gems/e11y-devtools/frontend/src/components/InteractionsTimeline.svelte +264 -0
- data/gems/e11y-devtools/frontend/src/components/RecentHistogram.svelte +354 -0
- data/gems/e11y-devtools/frontend/src/lib/api.ts +37 -0
- data/gems/e11y-devtools/frontend/src/lib/eventIdentity.ts +12 -0
- data/gems/e11y-devtools/frontend/src/lib/format.ts +37 -0
- data/gems/e11y-devtools/frontend/src/lib/listFilter.ts +43 -0
- data/gems/e11y-devtools/frontend/src/lib/recentVolume.ts +80 -0
- data/gems/e11y-devtools/frontend/src/lib/router.ts +12 -0
- data/gems/e11y-devtools/frontend/src/lib/transitions.ts +34 -0
- data/gems/e11y-devtools/frontend/src/lib/viewportOrigin.ts +25 -0
- data/gems/e11y-devtools/frontend/src/main.ts +8 -0
- data/gems/e11y-devtools/frontend/src/overlay-entry.ts +24 -0
- data/gems/e11y-devtools/frontend/src/overlay.css +1080 -0
- data/gems/e11y-devtools/frontend/svelte.config.js +2 -0
- data/gems/e11y-devtools/frontend/test_puppeteer.js +41 -0
- data/gems/e11y-devtools/frontend/test_scale.js +3 -0
- data/gems/e11y-devtools/frontend/tsconfig.app.json +21 -0
- data/gems/e11y-devtools/frontend/tsconfig.json +7 -0
- data/gems/e11y-devtools/frontend/tsconfig.node.json +26 -0
- data/gems/e11y-devtools/frontend/vite.config.ts +36 -0
- data/gems/e11y-devtools/lib/e11y/devtools/mcp/server.rb +96 -0
- data/gems/e11y-devtools/lib/e11y/devtools/mcp/tool_base.rb +25 -0
- data/gems/e11y-devtools/lib/e11y/devtools/mcp/tools/clear.rb +31 -0
- data/gems/e11y-devtools/lib/e11y/devtools/mcp/tools/errors.rb +35 -0
- data/gems/e11y-devtools/lib/e11y/devtools/mcp/tools/event_detail.rb +33 -0
- data/gems/e11y-devtools/lib/e11y/devtools/mcp/tools/events_by_trace.rb +33 -0
- data/gems/e11y-devtools/lib/e11y/devtools/mcp/tools/interactions.rb +40 -0
- data/gems/e11y-devtools/lib/e11y/devtools/mcp/tools/recent_events.rb +34 -0
- data/gems/e11y-devtools/lib/e11y/devtools/mcp/tools/search.rb +34 -0
- data/gems/e11y-devtools/lib/e11y/devtools/mcp/tools/stats.rb +30 -0
- data/gems/e11y-devtools/lib/e11y/devtools/overlay/assets/overlay.js +20 -0
- data/gems/e11y-devtools/lib/e11y/devtools/overlay/controller.rb +94 -0
- data/gems/e11y-devtools/lib/e11y/devtools/overlay/engine.rb +26 -0
- data/gems/e11y-devtools/lib/e11y/devtools/overlay/middleware.rb +80 -0
- data/gems/e11y-devtools/lib/e11y/devtools/overlay/rails_controller.rb +67 -0
- data/gems/e11y-devtools/lib/e11y/devtools/tui/app.rb +262 -0
- data/gems/e11y-devtools/lib/e11y/devtools/tui/grouping.rb +66 -0
- data/gems/e11y-devtools/lib/e11y/devtools/tui/widgets/event_detail.rb +62 -0
- data/gems/e11y-devtools/lib/e11y/devtools/tui/widgets/event_list.rb +70 -0
- data/gems/e11y-devtools/lib/e11y/devtools/tui/widgets/interaction_list.rb +47 -0
- data/gems/e11y-devtools/lib/e11y/devtools/version.rb +8 -0
- data/gems/e11y-devtools/lib/e11y/devtools.rb +13 -0
- data/gems/e11y-devtools/spec/e11y/devtools/mcp/tools_spec.rb +107 -0
- data/gems/e11y-devtools/spec/e11y/devtools/overlay/controller_spec.rb +91 -0
- data/gems/e11y-devtools/spec/e11y/devtools/overlay/middleware_spec.rb +46 -0
- data/gems/e11y-devtools/spec/e11y/devtools/tui/app_spec.rb +85 -0
- data/gems/e11y-devtools/spec/e11y/devtools/tui/grouping_spec.rb +64 -0
- data/gems/e11y-devtools/spec/spec_helper.rb +5 -0
- data/gems/e11y-devtools/spec/tui/widgets/event_list_spec.rb +44 -0
- data/gems/e11y-devtools/spec/tui/widgets/interaction_list_spec.rb +62 -0
- data/lib/e11y/adapters/audit_encrypted.rb +53 -11
- data/lib/e11y/adapters/base.rb +33 -34
- data/lib/e11y/adapters/dev_log/file_store.rb +143 -0
- data/lib/e11y/adapters/dev_log/query.rb +219 -0
- data/lib/e11y/adapters/dev_log.rb +118 -0
- data/lib/e11y/adapters/file.rb +3 -6
- data/lib/e11y/adapters/in_memory.rb +52 -5
- data/lib/e11y/adapters/in_memory_test.rb +29 -0
- data/lib/e11y/adapters/loki.rb +58 -23
- data/lib/e11y/adapters/null.rb +82 -0
- data/lib/e11y/adapters/opentelemetry_collector.rb +183 -0
- data/lib/e11y/adapters/otel_logs.rb +136 -23
- data/lib/e11y/adapters/sentry.rb +4 -7
- data/lib/e11y/adapters/stdout.rb +73 -7
- data/lib/e11y/adapters/yabeda.rb +153 -29
- data/lib/e11y/buffers/adaptive_buffer.rb +3 -17
- data/lib/e11y/buffers/{request_scoped_buffer.rb → ephemeral_buffer.rb} +72 -58
- data/lib/e11y/buffers/ring_buffer.rb +3 -16
- data/lib/e11y/configuration.rb +272 -0
- data/lib/e11y/console.rb +10 -17
- data/lib/e11y/current.rb +53 -1
- data/lib/e11y/debug/pipeline_inspector.rb +96 -0
- data/lib/e11y/documentation/generator.rb +48 -0
- data/lib/e11y/event/base.rb +176 -82
- data/lib/e11y/event/value_sampling_config.rb +1 -5
- data/lib/e11y/events/rails/database/query.rb +1 -4
- data/lib/e11y/events/rails/job/failed.rb +2 -0
- data/lib/e11y/instruments/active_job.rb +44 -12
- data/lib/e11y/instruments/rails_instrumentation.rb +49 -24
- data/lib/e11y/instruments/sidekiq.rb +135 -31
- data/lib/e11y/linters/base.rb +11 -0
- data/lib/e11y/linters/pii/pii_declaration_linter.rb +120 -0
- data/lib/e11y/linters/slo/config_consistency_linter.rb +76 -0
- data/lib/e11y/linters/slo/explicit_declaration_linter.rb +36 -0
- data/lib/e11y/linters/slo/slo_status_from_linter.rb +41 -0
- data/lib/e11y/logger/bridge.rb +26 -7
- data/lib/e11y/metrics/cardinality_protection.rb +10 -15
- data/lib/e11y/metrics/cardinality_tracker.rb +16 -6
- data/lib/e11y/metrics/registry.rb +3 -5
- data/lib/e11y/metrics/test_backend.rb +62 -0
- data/lib/e11y/metrics.rb +56 -10
- data/lib/e11y/middleware/adapter_resolver.rb +40 -0
- data/lib/e11y/middleware/audit_signing.rb +43 -6
- data/lib/e11y/middleware/baggage_protection.rb +75 -0
- data/lib/e11y/middleware/dev_log_source.rb +24 -0
- data/lib/e11y/middleware/event_slo.rb +23 -9
- data/lib/e11y/middleware/otel_span.rb +23 -0
- data/lib/e11y/middleware/pii_filter.rb +104 -75
- data/lib/e11y/middleware/rate_limiting.rb +54 -27
- data/lib/e11y/middleware/request.rb +70 -23
- data/lib/e11y/middleware/routing.rb +78 -21
- data/lib/e11y/middleware/sampling.rb +66 -17
- data/lib/e11y/middleware/self_monitoring_emit.rb +39 -0
- data/lib/e11y/middleware/trace_context.rb +45 -10
- data/lib/e11y/middleware/track_latency.rb +34 -0
- data/lib/e11y/middleware/validation.rb +7 -16
- data/lib/e11y/middleware/versioning.rb +26 -22
- data/lib/e11y/opentelemetry/semantic_conventions.rb +109 -0
- data/lib/e11y/opentelemetry/span_creator.rb +142 -0
- data/lib/e11y/pii/patterns.rb +12 -1
- data/lib/e11y/pipeline/builder.rb +4 -4
- data/lib/e11y/presets/audit_event.rb +13 -2
- data/lib/e11y/railtie.rb +52 -14
- data/lib/e11y/registry.rb +306 -0
- data/lib/e11y/reliability/circuit_breaker.rb +19 -21
- data/lib/e11y/reliability/dlq/base.rb +71 -0
- data/lib/e11y/reliability/dlq/file_adapter.rb +301 -0
- data/lib/e11y/reliability/dlq/file_storage.rb +63 -34
- data/lib/e11y/reliability/dlq/filter.rb +37 -54
- data/lib/e11y/reliability/retry_handler.rb +26 -29
- data/lib/e11y/reliability/retry_rate_limiter.rb +3 -11
- data/lib/e11y/sampling/error_spike_detector.rb +0 -2
- data/lib/e11y/sampling/load_monitor.rb +5 -9
- data/lib/e11y/sampling/stratified_tracker.rb +18 -0
- data/lib/e11y/self_monitoring/buffer_monitor.rb +2 -0
- data/lib/e11y/self_monitoring/performance_monitor.rb +19 -61
- data/lib/e11y/self_monitoring/reliability_monitor.rb +4 -74
- data/lib/e11y/slo/config_loader.rb +40 -0
- data/lib/e11y/slo/config_validator.rb +58 -0
- data/lib/e11y/slo/dashboard_generator.rb +122 -0
- data/lib/e11y/slo/event_driven.rb +8 -0
- data/lib/e11y/slo/tracker.rb +31 -4
- data/lib/e11y/testing/have_tracked_event_matcher.rb +190 -0
- data/lib/e11y/testing/rspec_matchers.rb +21 -0
- data/lib/e11y/testing/snapshot_matcher.rb +86 -0
- data/lib/e11y/trace_context/sampler.rb +35 -0
- data/lib/e11y/tracing/faraday_middleware.rb +31 -0
- data/lib/e11y/tracing/net_http_patch.rb +33 -0
- data/lib/e11y/tracing/propagator.rb +144 -0
- data/lib/e11y/tracing.rb +47 -0
- data/lib/e11y/version.rb +1 -1
- data/lib/e11y/versioning/version_extractor.rb +32 -0
- data/lib/e11y.rb +123 -266
- data/lib/generators/e11y/event/event_generator.rb +22 -0
- data/lib/generators/e11y/event/templates/event.rb.tt +16 -0
- data/lib/generators/e11y/grafana_dashboard/grafana_dashboard_generator.rb +30 -0
- data/lib/generators/e11y/grafana_dashboard/templates/e11y_dashboard.json +81 -0
- data/lib/generators/e11y/install/install_generator.rb +34 -0
- data/lib/generators/e11y/install/templates/e11y.rb +239 -0
- data/lib/generators/e11y/prometheus_alerts/prometheus_alerts_generator.rb +29 -0
- data/lib/generators/e11y/prometheus_alerts/templates/e11y_alerts.yml +28 -0
- data/lib/tasks/e11y_docs.rake +30 -0
- data/lib/tasks/e11y_events.rake +71 -0
- data/lib/tasks/e11y_lint.rake +91 -0
- data/lib/tasks/e11y_slo.rake +29 -0
- metadata +186 -39
- data/docs/ADR-003-slo-observability.md +0 -3337
- data/docs/ADR-010-developer-experience.md +0 -2166
- data/docs/API-REFERENCE-L28.md +0 -914
- data/docs/COMPREHENSIVE-CONFIGURATION.md +0 -2366
- data/docs/CONTRIBUTING.md +0 -312
- data/docs/IMPLEMENTATION_NOTES.md +0 -2804
- data/docs/IMPLEMENTATION_PLAN.md +0 -1971
- data/docs/IMPLEMENTATION_PLAN_ARCHITECTURE.md +0 -586
- data/docs/PLAN.md +0 -148
- data/docs/README.md +0 -296
- data/docs/design/00-memory-optimization.md +0 -593
- data/docs/guides/MIGRATION-L27-L28.md +0 -692
- data/docs/guides/PERFORMANCE-BENCHMARKS.md +0 -434
- data/docs/guides/README.md +0 -44
- data/docs/use_cases/UC-003-pattern-based-metrics.md +0 -1627
- data/lib/e11y/adapters/registry.rb +0 -141
|
@@ -13,12 +13,10 @@
|
|
|
13
13
|
|
|
14
14
|
**The $68,000/month mistake:**
|
|
15
15
|
```ruby
|
|
16
|
-
# ❌ CATASTROPHIC: Using user_id as metric label
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
name: 'user_actions_total',
|
|
21
|
-
tags: [:user_id, :action_type] # ← 💸💸💸
|
|
16
|
+
# ❌ CATASTROPHIC: Using user_id as metric label (event-level example - avoid!)
|
|
17
|
+
class Events::UserAction < E11y::Event::Base
|
|
18
|
+
metrics do
|
|
19
|
+
counter :user_actions_total, tags: [:user_id, :action_type] # ← 💸💸💸 DON'T
|
|
22
20
|
end
|
|
23
21
|
end
|
|
24
22
|
|
|
@@ -35,42 +33,24 @@ end
|
|
|
35
33
|
- **Query timeouts** (PromQL queries take 30+ seconds)
|
|
36
34
|
- **Incident during Black Friday** (metrics system collapsed)
|
|
37
35
|
|
|
38
|
-
### E11y Solution
|
|
36
|
+
### E11y Solution (Event-Level)
|
|
39
37
|
|
|
40
|
-
**
|
|
38
|
+
**Use low-cardinality tags in event-level metrics:**
|
|
41
39
|
```ruby
|
|
42
|
-
# ✅ SAFE:
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
tag_extractors: {
|
|
53
|
-
user_segment: ->(event) {
|
|
54
|
-
user = User.find(event.payload[:user_id])
|
|
55
|
-
user.segment # 'free', 'paid', 'enterprise'
|
|
56
|
-
}
|
|
57
|
-
}
|
|
58
|
-
|
|
59
|
-
# Layer 3: Per-metric limits
|
|
60
|
-
cardinality_limit_for 'user_actions_total', max: 100
|
|
61
|
-
|
|
62
|
-
# Layer 4: Dynamic monitoring
|
|
63
|
-
cardinality_monitoring do
|
|
64
|
-
warn_threshold 0.7 # Alert at 70%
|
|
65
|
-
auto_aggregate true # Auto-fix if exceeded
|
|
66
|
-
end
|
|
40
|
+
# ✅ SAFE: Use user_segment, not user_id
|
|
41
|
+
class Events::UserAction < E11y::Event::Base
|
|
42
|
+
schema do
|
|
43
|
+
required(:user_id).filled(:string)
|
|
44
|
+
required(:action_type).filled(:string)
|
|
45
|
+
required(:user_segment).filled(:string) # pre-aggregated: 'free', 'paid', 'enterprise'
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
metrics do
|
|
49
|
+
counter :user_actions_total, tags: [:user_segment, :action_type] # 3 × 10 = 30 series
|
|
67
50
|
end
|
|
68
51
|
end
|
|
69
52
|
|
|
70
|
-
# Result:
|
|
71
|
-
# - 200 services × 10 segments × 5 dimensions = 10,000 series
|
|
72
|
-
# - Datadog cost: $680/month
|
|
73
|
-
# - Savings: $67,320/month (99% reduction) ✅
|
|
53
|
+
# Result: low cardinality, manageable cost
|
|
74
54
|
```
|
|
75
55
|
|
|
76
56
|
---
|
|
@@ -283,7 +263,7 @@ end
|
|
|
283
263
|
|
|
284
264
|
### Layer Processing Flow
|
|
285
265
|
|
|
286
|
-
> **Implementation:** See [ADR-002 Section 4.1: Four-Layer Defense](../ADR-002-metrics-yabeda.md#41-four-layer-defense) for detailed architecture.
|
|
266
|
+
> **Implementation:** See [ADR-002 Section 4.1: Four-Layer Defense](../architecture/ADR-002-metrics-yabeda.md#41-four-layer-defense) for detailed architecture.
|
|
287
267
|
|
|
288
268
|
**🔑 Critical: Layers execute SEQUENTIALLY (not simultaneously).**
|
|
289
269
|
|
|
@@ -399,14 +379,14 @@ Events::OrderPlaced.track(
|
|
|
399
379
|
### Layer 1: Denylist (Hard Block)
|
|
400
380
|
|
|
401
381
|
> **⚠️ CRITICAL: Adapter-Specific Filtering**
|
|
402
|
-
> **Implementation:** See [ADR-002 Section 4.2: Layer 1 - Universal Denylist](../ADR-002-metrics-yabeda.md#42-layer-1-universal-denylist) for detailed architecture.
|
|
382
|
+
> **Implementation:** See [ADR-002 Section 4.2: Layer 1 - Universal Denylist](../architecture/ADR-002-metrics-yabeda.md#42-layer-1-universal-denylist) for detailed architecture.
|
|
403
383
|
>
|
|
404
384
|
> **Cardinality protection (denylist/allowlist) applies ONLY to metrics adapters (Yabeda/Prometheus), NOT to other adapters:**
|
|
405
385
|
>
|
|
406
386
|
> | Adapter Type | Denylist Applied? | Why? |
|
|
407
387
|
> |---|---|---|
|
|
408
388
|
> | **Metrics (Yabeda/Prometheus)** | ✅ YES | High-cardinality labels cause memory explosion in time-series databases (1M labels = 1GB RAM). |
|
|
409
|
-
> | **Logs (Loki)** |
|
|
389
|
+
> | **Logs (Loki)** | Optional | Loki labels = event_name + severity (low cardinality). Payload (user_id, etc.) in log line. Optional `enable_cardinality_protection` for labels. |
|
|
410
390
|
> | **Errors (Sentry)** | ❌ NO | Sentry needs full context for debugging. High cardinality is acceptable for error tracking. |
|
|
411
391
|
> | **Audit (File/PostgreSQL)** | ❌ NO | Audit trails require complete, unfiltered data for compliance. |
|
|
412
392
|
>
|
|
@@ -428,85 +408,11 @@ Events::OrderPlaced.track(
|
|
|
428
408
|
> - ✅ **Compliance stays intact:** Audit logs remain complete and unfiltered
|
|
429
409
|
> - ✅ **Best of both worlds:** Safety for metrics + completeness for logs/errors
|
|
430
410
|
|
|
431
|
-
**
|
|
432
|
-
|
|
433
|
-
```ruby
|
|
434
|
-
E11y.configure do |config|
|
|
435
|
-
config.metrics do
|
|
436
|
-
# === UNBOUNDED IDENTIFIERS (FORBIDDEN) ===
|
|
437
|
-
forbidden_labels :user_id, :customer_id, :account_id,
|
|
438
|
-
:order_id, :transaction_id, :invoice_id,
|
|
439
|
-
:session_id, :request_id, :trace_id, :span_id
|
|
440
|
-
|
|
441
|
-
# === INFRASTRUCTURE (FORBIDDEN) ===
|
|
442
|
-
forbidden_labels :pod_uid, :container_id, :instance_id,
|
|
443
|
-
:node_name # If dynamic
|
|
444
|
-
|
|
445
|
-
# === NETWORK/HTTP (FORBIDDEN) ===
|
|
446
|
-
forbidden_labels :url, # With query strings
|
|
447
|
-
:ip_address,
|
|
448
|
-
:user_agent,
|
|
449
|
-
:hostname # If ephemeral
|
|
450
|
-
|
|
451
|
-
# === TIME-BASED (FORBIDDEN) ===
|
|
452
|
-
forbidden_labels :timestamp, :created_at,
|
|
453
|
-
:version # Patch-level: 2.5.7234
|
|
454
|
-
|
|
455
|
-
# === ENFORCEMENT ===
|
|
456
|
-
enforcement :strict # ERROR on forbidden label usage
|
|
457
|
-
# OR
|
|
458
|
-
enforcement :warn # Log warning but allow
|
|
459
|
-
# OR
|
|
460
|
-
enforcement :aggregate # Auto-aggregate to "_other"
|
|
461
|
-
end
|
|
462
|
-
end
|
|
463
|
-
|
|
464
|
-
# Usage:
|
|
465
|
-
counter_for pattern: 'user.action',
|
|
466
|
-
tags: [:user_id] # ← ERROR: "user_id is forbidden!"
|
|
467
|
-
|
|
468
|
-
# Development warning:
|
|
469
|
-
# [E11y ERROR] Metric 'user.action_total' uses forbidden label 'user_id'
|
|
470
|
-
# Cardinality explosion risk! Use 'user_segment' instead.
|
|
471
|
-
```
|
|
411
|
+
**Avoid these as metric tags:** user_id, customer_id, order_id, session_id, trace_id, url, ip_address, timestamp.
|
|
472
412
|
|
|
473
413
|
---
|
|
474
414
|
|
|
475
|
-
### Layer 2:
|
|
476
|
-
|
|
477
|
-
**Only allow explicitly safe labels:**
|
|
478
|
-
|
|
479
|
-
```ruby
|
|
480
|
-
E11y.configure do |config|
|
|
481
|
-
config.metrics do
|
|
482
|
-
# Strict mode: ONLY these labels allowed
|
|
483
|
-
allowed_labels_only true
|
|
484
|
-
|
|
485
|
-
# === BUSINESS DIMENSIONS (< 50 values) ===
|
|
486
|
-
allowed_labels :status, # pending, paid, failed (4-10 values)
|
|
487
|
-
:payment_method, # card, paypal (5-20 values)
|
|
488
|
-
:plan_tier # free, pro, enterprise (3-5 values)
|
|
489
|
-
|
|
490
|
-
# === INFRASTRUCTURE (< 20 values) ===
|
|
491
|
-
allowed_labels :env, # production, staging, dev (3 values)
|
|
492
|
-
:region, # us-east, eu-west (5-20 values)
|
|
493
|
-
:cluster, # main, backup (2-5 values)
|
|
494
|
-
:availability_zone
|
|
495
|
-
|
|
496
|
-
# === HTTP/SERVICE (< 100 values) ===
|
|
497
|
-
allowed_labels :http_method, # GET, POST, PUT, DELETE (10 values)
|
|
498
|
-
:http_status_code, # 200, 404, 500 (50 values)
|
|
499
|
-
:controller_action # UsersController#show (20-100 values)
|
|
500
|
-
end
|
|
501
|
-
end
|
|
502
|
-
|
|
503
|
-
# Usage:
|
|
504
|
-
counter_for pattern: 'order.paid',
|
|
505
|
-
tags: [:currency] # ← ERROR: "currency not in allowlist!"
|
|
506
|
-
|
|
507
|
-
# Must explicitly allow:
|
|
508
|
-
allowed_labels :currency # USD, EUR, GBP (3-20 values)
|
|
509
|
-
```
|
|
415
|
+
### Layer 2: Safe Labels
|
|
510
416
|
|
|
511
417
|
**Rule of thumb:**
|
|
512
418
|
- ✅ **< 10 values** - Always safe
|
|
@@ -517,55 +423,11 @@ allowed_labels :currency # USD, EUR, GBP (3-20 values)
|
|
|
517
423
|
|
|
518
424
|
### Layer 3: Per-Metric Limits
|
|
519
425
|
|
|
520
|
-
**
|
|
521
|
-
|
|
522
|
-
```ruby
|
|
523
|
-
E11y.configure do |config|
|
|
524
|
-
config.metrics do
|
|
525
|
-
# === GLOBAL DEFAULT ===
|
|
526
|
-
default_cardinality_limit 1_000
|
|
527
|
-
|
|
528
|
-
# === PER-METRIC LIMITS ===
|
|
529
|
-
cardinality_limit_for 'http.requests' do
|
|
530
|
-
max_cardinality 2_000 # Higher limit for this metric
|
|
531
|
-
overflow_strategy :drop # → Drop overflow events
|
|
532
|
-
overflow_sample_rate 0.1 # Sample 10% of overflow events
|
|
533
|
-
end
|
|
534
|
-
|
|
535
|
-
cardinality_limit_for 'user.actions' do
|
|
536
|
-
max_cardinality 500 # Lower limit
|
|
537
|
-
overflow_strategy :drop # Drop overflow events
|
|
538
|
-
overflow_alert true # Alert on overflow
|
|
539
|
-
end
|
|
540
|
-
|
|
541
|
-
cardinality_limit_for 'orders.paid' do
|
|
542
|
-
max_cardinality 100
|
|
543
|
-
overflow_strategy :alert # Alert ops team + drop
|
|
544
|
-
end
|
|
545
|
-
end
|
|
546
|
-
end
|
|
547
|
-
|
|
548
|
-
# How it works:
|
|
549
|
-
# 1. Track unique label combinations per metric
|
|
550
|
-
# 2. If exceeds limit:
|
|
551
|
-
# - :drop → Discard overflow events (increment drop counter)
|
|
552
|
-
# - :alert → Alert ops team + drop
|
|
553
|
-
#
|
|
554
|
-
# NOTE: For aggregation/relabeling (e.g., user_id → user_segment),
|
|
555
|
-
# use tag_extractors (see "Aggregation" section below),
|
|
556
|
-
# NOT overflow_strategy.
|
|
557
|
-
```
|
|
558
|
-
|
|
559
|
-
**Overflow strategies:**
|
|
560
|
-
|
|
561
|
-
| Strategy | Behavior | Use Case |
|
|
562
|
-
|----------|----------|----------|
|
|
563
|
-
| `:drop` | Discard overflow events | Default, simplest |
|
|
564
|
-
| `:alert` | Alert ops team + drop | Critical metrics |
|
|
426
|
+
**Yabeda adapter supports cardinality limits** via its config. Use low-cardinality tags in event-level metrics.
|
|
565
427
|
|
|
566
428
|
#### Thread Safety
|
|
567
429
|
|
|
568
|
-
> **Implementation:** See [ADR-002 Section 4.4: Layer 3 - Per-Metric Cardinality Limits](../ADR-002-metrics-yabeda.md#44-layer-3-per-metric-cardinality-limits) for detailed architecture.
|
|
430
|
+
> **Implementation:** See [ADR-002 Section 4.4: Layer 3 - Per-Metric Cardinality Limits](../architecture/ADR-002-metrics-yabeda.md#44-layer-3-per-metric-cardinality-limits) for detailed architecture.
|
|
569
431
|
>
|
|
570
432
|
> **Sources:**
|
|
571
433
|
> - [Ruby Hash thread safety - Stack Overflow](https://stackoverflow.com/questions/22674498/thread-safety-for-hashes-in-ruby)
|
|
@@ -716,7 +578,7 @@ end
|
|
|
716
578
|
|
|
717
579
|
#### Action Selection Guide
|
|
718
580
|
|
|
719
|
-
> **Implementation:** See [ADR-002 Section 4.5: Layer 4 - Dynamic Actions](../ADR-002-metrics-yabeda.md#45-layer-4-dynamic-actions) for detailed architecture.
|
|
581
|
+
> **Implementation:** See [ADR-002 Section 4.5: Layer 4 - Dynamic Actions](../architecture/ADR-002-metrics-yabeda.md#45-layer-4-dynamic-actions) for detailed architecture.
|
|
720
582
|
|
|
721
583
|
**🎯 When cardinality limit is exceeded, which action should you choose?**
|
|
722
584
|
|
|
@@ -862,7 +724,7 @@ rate(e11y_cardinality_actions_total{action="alert"}[5m])
|
|
|
862
724
|
|
|
863
725
|
### 1. Aggregation (Best ROI - 99% Reduction)
|
|
864
726
|
|
|
865
|
-
> **Note:** This section describes **relabeling/normalization** (e.g., `user_id` → `user_segment`) via `tag_extractors`, which is different from `overflow_strategy`. Aggregation reduces cardinality **before** metrics are created, while overflow handling (`drop`/`alert`) deals with exceeding limits **after** creation. See [ADR-002 Section 4.5](../ADR-002-metrics-yabeda.md#45-cardinality-protection) for implementation details.
|
|
727
|
+
> **Note:** This section describes **relabeling/normalization** (e.g., `user_id` → `user_segment`) via `tag_extractors`, which is different from `overflow_strategy`. Aggregation reduces cardinality **before** metrics are created, while overflow handling (`drop`/`alert`) deals with exceeding limits **after** creation. See [ADR-002 Section 4.5](../architecture/ADR-002-metrics-yabeda.md#45-cardinality-protection) for implementation details.
|
|
866
728
|
|
|
867
729
|
**Problem:** 1M users = 1M metric series
|
|
868
730
|
|
|
@@ -1039,7 +901,7 @@ end
|
|
|
1039
901
|
### 6. Universal Cardinality Protection (C04 Resolution) ⚠️ CRITICAL
|
|
1040
902
|
|
|
1041
903
|
> **⚠️ CRITICAL: C04 Conflict Resolution - Cardinality Protection for ALL Backends**
|
|
1042
|
-
> **See:** [ADR-009 Section 8](../ADR-009-cost-optimization.md#8-cardinality-protection-c04-resolution--critical) for detailed architecture and cost impact analysis.
|
|
904
|
+
> **See:** [ADR-009 Section 8](../architecture/ADR-009-cost-optimization.md#8-cardinality-protection-c04-resolution--critical) for detailed architecture and cost impact analysis.
|
|
1043
905
|
> **Problem:** Original UC-013 cardinality protection applied ONLY to Yabeda/Prometheus metrics, but NOT to OpenTelemetry span attributes or Loki log labels. High-cardinality values (`user_id`, `order_id`) bypassed protection and caused cost explosions in OTLP backends (Datadog, Honeycomb).
|
|
1044
906
|
> **Solution:** Universal `CardinalityFilter` middleware applies protection to **ALL backends** (Yabeda, OpenTelemetry, Loki) with optional per-backend overrides.
|
|
1045
907
|
|
|
@@ -1623,7 +1485,7 @@ after = calculate_cardinality_cost(
|
|
|
1623
1485
|
|
|
1624
1486
|
## ❓ Frequently Asked Questions
|
|
1625
1487
|
|
|
1626
|
-
> **Technical Details:** See [ADR-002 Section 11: FAQ & Critical Clarifications](../ADR-002-metrics-yabeda.md#11-faq--critical-clarifications) for architectural rationale.
|
|
1488
|
+
> **Technical Details:** See [ADR-002 Section 11: FAQ & Critical Clarifications](../architecture/ADR-002-metrics-yabeda.md#11-faq--critical-clarifications) for architectural rationale.
|
|
1627
1489
|
|
|
1628
1490
|
### Q1: Does cardinality protection apply to all my logs and metrics?
|
|
1629
1491
|
|
|
@@ -2097,7 +1959,7 @@ end
|
|
|
2097
1959
|
|
|
2098
1960
|
## 📚 Related Use Cases
|
|
2099
1961
|
|
|
2100
|
-
- **[UC-003:
|
|
1962
|
+
- **[UC-003: Event Metrics](./UC-003-event-metrics.md)** - Metrics in event classes
|
|
2101
1963
|
- **[UC-008: OpenTelemetry Integration](./UC-008-opentelemetry-integration.md)** - OTLP cardinality protection (C04)
|
|
2102
1964
|
- **[UC-015: Cost Optimization](./UC-015-cost-optimization.md)** - Reduce observability costs
|
|
2103
1965
|
|
|
@@ -1026,7 +1026,7 @@ end
|
|
|
1026
1026
|
|
|
1027
1027
|
### Strategy 8: Stratified Sampling for Accurate SLO (C11 Resolution) ⚠️
|
|
1028
1028
|
|
|
1029
|
-
> **Reference:** See [ADR-009 §3.7: Stratified Sampling for SLO Accuracy](../ADR-009-cost-optimization.md#37-stratified-sampling-for-slo-accuracy-c11-resolution) for full architecture and [UC-004: SLO Tracking with Sampling Correction](./UC-004-zero-config-slo-tracking.md#sampling-correction-for-accurate-slo-c11-resolution) for SLO calculation details.
|
|
1029
|
+
> **Reference:** See [ADR-009 §3.7: Stratified Sampling for SLO Accuracy](../architecture/ADR-009-cost-optimization.md#37-stratified-sampling-for-slo-accuracy-c11-resolution) for full architecture and [UC-004: SLO Tracking with Sampling Correction](./UC-004-zero-config-slo-tracking.md#sampling-correction-for-accurate-slo-c11-resolution) for SLO calculation details.
|
|
1030
1030
|
|
|
1031
1031
|
**Problem with Random Sampling:** Breaks SLO metrics! Errors are rare (5%) → random 10% sampling drops 90% of errors → SLO appears better than reality.
|
|
1032
1032
|
|
|
@@ -1046,7 +1046,7 @@ E11y.configure do |config|
|
|
|
1046
1046
|
stratified_rates do
|
|
1047
1047
|
error 1.0 # 100% - Keep ALL errors (critical for SLO!)
|
|
1048
1048
|
warn 0.5 # 50% - Medium priority
|
|
1049
|
-
info 0.1 # 10% - Low priority (
|
|
1049
|
+
info 0.1 # 10% - Low priority (successful requests)
|
|
1050
1050
|
debug 0.05 # 5% - Very low priority
|
|
1051
1051
|
end
|
|
1052
1052
|
end
|
|
@@ -56,12 +56,7 @@ E11y.configure do |config|
|
|
|
56
56
|
drop_empty_strings: true,
|
|
57
57
|
truncate_strings: 1000 # chars
|
|
58
58
|
|
|
59
|
-
# 5.
|
|
60
|
-
retention_tiers do
|
|
61
|
-
hot 7.days, storage: :loki # Fast queries
|
|
62
|
-
warm 30.days, storage: :s3 # Slower, cheaper
|
|
63
|
-
cold 1.year, storage: :s3_glacier # Archive
|
|
64
|
-
end
|
|
59
|
+
# 5. Routing by retention_until — config.routing_rules (see Strategy 4)
|
|
65
60
|
|
|
66
61
|
# 6. Smart routing (send only what's needed)
|
|
67
62
|
routing do
|
|
@@ -79,13 +74,13 @@ end
|
|
|
79
74
|
# Result:
|
|
80
75
|
# - 100k events/sec → 10k events/sec (adaptive sampling)
|
|
81
76
|
# - 2KB/event → 0.6KB/event (compression + minimization)
|
|
82
|
-
# -
|
|
77
|
+
# - Short retention → stdout, long → Loki (routing by retention_until)
|
|
83
78
|
# - Datadog: Only errors (3k/sec instead of 100k/sec)
|
|
84
79
|
#
|
|
85
80
|
# New monthly cost:
|
|
86
81
|
# - Datadog: $3,000 → $500 (only errors)
|
|
87
82
|
# - Loki: $10,368 → $1,200 (10% volume, 70% smaller, short-retention events routed to stdout)
|
|
88
|
-
# -
|
|
83
|
+
# - Archival job (separate): exports by retention_until to cold storage
|
|
89
84
|
# - Total: $1,900/month = $22,800/year
|
|
90
85
|
#
|
|
91
86
|
# SAVINGS: $160,416 - $22,800 = $137,616/year (86% reduction!)
|
|
@@ -95,7 +90,7 @@ end
|
|
|
95
90
|
|
|
96
91
|
## 🎯 Cost Optimization Strategies
|
|
97
92
|
|
|
98
|
-
> **Note:** This UC focuses on proven, low-overhead optimizations. **Deduplication is intentionally NOT included** as a strategy. While it may seem like an obvious cost optimization, [ADR-009 Section 9.2.D](../ADR-009-cost-optimization.md#alternatives-considered) explains why it was rejected: high computational overhead (hash + Redis lookup per event), large memory cost (3.6GB for 1000 events/sec), false positives on legitimate retries, and debug confusion. Better alternatives (sampling + compression) achieve the same cost goals without these drawbacks.
|
|
93
|
+
> **Note:** This UC focuses on proven, low-overhead optimizations. **Deduplication is intentionally NOT included** as a strategy. While it may seem like an obvious cost optimization, [ADR-009 Section 9.2.D](../architecture/ADR-009-cost-optimization.md#alternatives-considered) explains why it was rejected: high computational overhead (hash + Redis lookup per event), large memory cost (3.6GB for 1000 events/sec), false positives on legitimate retries, and debug confusion. Better alternatives (sampling + compression) achieve the same cost goals without these drawbacks.
|
|
99
94
|
|
|
100
95
|
### Strategy 1: Intelligent Sampling by Value
|
|
101
96
|
|
|
@@ -254,57 +249,34 @@ end
|
|
|
254
249
|
|
|
255
250
|
---
|
|
256
251
|
|
|
257
|
-
### Strategy 4:
|
|
252
|
+
### Strategy 4: Routing by retention_until
|
|
258
253
|
|
|
259
|
-
**
|
|
254
|
+
**Route events to adapters based on retention (at collection):**
|
|
260
255
|
```ruby
|
|
256
|
+
# Events declare retention_period; retention_until is auto-calculated in payload.
|
|
257
|
+
# Routing rules use it to choose adapter — short retention → cheap storage.
|
|
261
258
|
E11y.configure do |config|
|
|
262
|
-
config.
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
# WARM: Slower queries, cheaper ($0.05/GB/month)
|
|
272
|
-
warm_tier do
|
|
273
|
-
duration 30.days
|
|
274
|
-
storage :s3
|
|
275
|
-
query_performance :medium
|
|
276
|
-
compression :zstd # Compress when moving to warm
|
|
277
|
-
end
|
|
278
|
-
|
|
279
|
-
# COLD: Archive, very cheap ($0.004/GB/month)
|
|
280
|
-
cold_tier do
|
|
281
|
-
duration 1.year
|
|
282
|
-
storage :s3_glacier
|
|
283
|
-
query_performance :slow # Minutes to hours
|
|
284
|
-
compression :zstd
|
|
285
|
-
end
|
|
286
|
-
|
|
287
|
-
# Auto-archival
|
|
288
|
-
auto_archive enabled: true,
|
|
289
|
-
schedule: '0 2 * * *' # 2 AM daily
|
|
290
|
-
end
|
|
291
|
-
end
|
|
259
|
+
config.routing_rules = [
|
|
260
|
+
->(event) { :audit_encrypted if event[:audit_event] },
|
|
261
|
+
->(event) {
|
|
262
|
+
return :loki unless event[:retention_until]
|
|
263
|
+
days = (Time.parse(event[:retention_until]) - Time.now) / 86400
|
|
264
|
+
days <= 7 ? :stdout : :loki # Short → free, long → Loki
|
|
265
|
+
}
|
|
266
|
+
]
|
|
267
|
+
config.fallback_adapters = [:loki]
|
|
292
268
|
end
|
|
293
269
|
|
|
294
|
-
#
|
|
295
|
-
#
|
|
296
|
-
#
|
|
297
|
-
#
|
|
298
|
-
#
|
|
299
|
-
#
|
|
300
|
-
#
|
|
301
|
-
|
|
302
|
-
#
|
|
303
|
-
#
|
|
304
|
-
# Cost for 30 days of data:
|
|
305
|
-
# Before: 30 days × $200 = $6,000/month
|
|
306
|
-
# After: (7 × $200) + (23 × $50) + (0 × $4) = $1,400 + $1,150 = $2,550/month
|
|
307
|
-
# Savings: $3,450/month (58% reduction!)
|
|
270
|
+
# Event classes declare retention:
|
|
271
|
+
# class DebugEvent < E11y::Event::Base
|
|
272
|
+
# retention_period 7.days # → stdout
|
|
273
|
+
# end
|
|
274
|
+
# class AuditEvent < E11y::Event::Base
|
|
275
|
+
# retention_period 7.years # → Loki (archival job exports later)
|
|
276
|
+
# end
|
|
277
|
+
|
|
278
|
+
# Cost: Short retention events never hit Loki. Archival job (separate) filters by
|
|
279
|
+
# retention_until for cold storage. Savings: ~58% vs all-events-to-Loki.
|
|
308
280
|
```
|
|
309
281
|
|
|
310
282
|
---
|
|
@@ -324,11 +296,11 @@ E11y.configure do |config|
|
|
|
324
296
|
# High-value transactions → All (audit + analytics)
|
|
325
297
|
route event_patterns: ['payment.*', 'order.*'],
|
|
326
298
|
when: ->(e) { e.payload[:amount].to_i > 1000 },
|
|
327
|
-
to: [:datadog, :loki
|
|
299
|
+
to: [:datadog, :loki]
|
|
328
300
|
|
|
329
301
|
# Security events → Specific SIEM
|
|
330
302
|
route event_patterns: ['security.*', 'audit.*'],
|
|
331
|
-
to: [:splunk
|
|
303
|
+
to: [:splunk]
|
|
332
304
|
|
|
333
305
|
# Debug events → Only Loki (no expensive Datadog)
|
|
334
306
|
route severities: [:debug],
|
|
@@ -356,50 +328,25 @@ end
|
|
|
356
328
|
|
|
357
329
|
---
|
|
358
330
|
|
|
359
|
-
### Strategy 6:
|
|
331
|
+
### Strategy 6: retention_period DSL
|
|
360
332
|
|
|
361
|
-
**
|
|
333
|
+
**Declare retention per event; routing uses retention_until:**
|
|
362
334
|
```ruby
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
# Auto-tag events with retention hints
|
|
367
|
-
tag_with_retention do
|
|
368
|
-
# Compliance events: Long retention
|
|
369
|
-
when_pattern 'audit.*', 'gdpr.*', retention: 7.years
|
|
370
|
-
|
|
371
|
-
# Financial: Long retention
|
|
372
|
-
when_pattern 'payment.*', 'transaction.*', retention: 7.years
|
|
373
|
-
|
|
374
|
-
# Errors: Medium retention
|
|
375
|
-
when_severity :error, :fatal, retention: 90.days
|
|
376
|
-
|
|
377
|
-
# Debug: Short retention
|
|
378
|
-
when_severity :debug, retention: 7.days
|
|
379
|
-
|
|
380
|
-
# Default
|
|
381
|
-
default_retention 30.days
|
|
382
|
-
end
|
|
383
|
-
|
|
384
|
-
# Backend respects retention tags
|
|
385
|
-
backends do
|
|
386
|
-
loki retention_based: true,
|
|
387
|
-
max_retention: 30.days
|
|
388
|
-
|
|
389
|
-
s3_archive retention_based: true,
|
|
390
|
-
max_retention: 7.years
|
|
391
|
-
end
|
|
392
|
-
end
|
|
393
|
-
end
|
|
335
|
+
# Event-level retention (used by routing_rules + archival job)
|
|
336
|
+
class DebugEvent < E11y::Event::Base
|
|
337
|
+
retention_period 7.days
|
|
394
338
|
end
|
|
395
339
|
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
#
|
|
402
|
-
|
|
340
|
+
class PaymentEvent < E11y::Event::Base
|
|
341
|
+
retention_period 7.years
|
|
342
|
+
end
|
|
343
|
+
|
|
344
|
+
class OrderEvent < E11y::Event::Base
|
|
345
|
+
# Uses config.default_retention_period (30 days)
|
|
346
|
+
end
|
|
347
|
+
|
|
348
|
+
# retention_until is auto-calculated in payload. Routing (Strategy 4) and
|
|
349
|
+
# archival job both use it. No separate tagging — one field, two consumers.
|
|
403
350
|
```
|
|
404
351
|
|
|
405
352
|
---
|
|
@@ -641,7 +588,7 @@ end
|
|
|
641
588
|
config.cost_optimization do
|
|
642
589
|
intelligent_sampling { ... } # 90% reduction
|
|
643
590
|
compression { ... } # 70% smaller payloads
|
|
644
|
-
|
|
591
|
+
routing_rules (by retention_until) # Short → stdout, long → Loki
|
|
645
592
|
smart_routing { ... } # 50% fewer expensive destinations
|
|
646
593
|
end
|
|
647
594
|
# Combined: ~95% cost reduction!
|
|
@@ -653,7 +600,7 @@ end
|
|
|
653
600
|
# Dashboard: "Cost Optimization Savings"
|
|
654
601
|
# - Monthly savings: $X
|
|
655
602
|
# - YTD savings: $Y
|
|
656
|
-
# - Optimization breakdown (sampling, compression,
|
|
603
|
+
# - Optimization breakdown (sampling, compression, routing)
|
|
657
604
|
```
|
|
658
605
|
|
|
659
606
|
**3. Test in staging first**
|