e11y 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +4 -0
- data/.rubocop.yml +69 -0
- data/CHANGELOG.md +26 -0
- data/CODE_OF_CONDUCT.md +64 -0
- data/LICENSE.txt +21 -0
- data/README.md +179 -0
- data/Rakefile +37 -0
- data/benchmarks/run_all.rb +33 -0
- data/config/README.md +83 -0
- data/config/loki-local-config.yaml +35 -0
- data/config/prometheus.yml +15 -0
- data/docker-compose.yml +78 -0
- data/docs/00-ICP-AND-TIMELINE.md +483 -0
- data/docs/01-SCALE-REQUIREMENTS.md +858 -0
- data/docs/ADR-001-architecture.md +2617 -0
- data/docs/ADR-002-metrics-yabeda.md +1395 -0
- data/docs/ADR-003-slo-observability.md +3337 -0
- data/docs/ADR-004-adapter-architecture.md +2385 -0
- data/docs/ADR-005-tracing-context.md +1372 -0
- data/docs/ADR-006-security-compliance.md +4143 -0
- data/docs/ADR-007-opentelemetry-integration.md +1385 -0
- data/docs/ADR-008-rails-integration.md +1911 -0
- data/docs/ADR-009-cost-optimization.md +2993 -0
- data/docs/ADR-010-developer-experience.md +2166 -0
- data/docs/ADR-011-testing-strategy.md +1836 -0
- data/docs/ADR-012-event-evolution.md +958 -0
- data/docs/ADR-013-reliability-error-handling.md +2750 -0
- data/docs/ADR-014-event-driven-slo.md +1533 -0
- data/docs/ADR-015-middleware-order.md +1061 -0
- data/docs/ADR-016-self-monitoring-slo.md +1234 -0
- data/docs/API-REFERENCE-L28.md +914 -0
- data/docs/COMPREHENSIVE-CONFIGURATION.md +2366 -0
- data/docs/IMPLEMENTATION_NOTES.md +2804 -0
- data/docs/IMPLEMENTATION_PLAN.md +1971 -0
- data/docs/IMPLEMENTATION_PLAN_ARCHITECTURE.md +586 -0
- data/docs/PLAN.md +148 -0
- data/docs/QUICK-START.md +934 -0
- data/docs/README.md +296 -0
- data/docs/design/00-memory-optimization.md +593 -0
- data/docs/guides/MIGRATION-L27-L28.md +692 -0
- data/docs/guides/PERFORMANCE-BENCHMARKS.md +434 -0
- data/docs/guides/README.md +44 -0
- data/docs/prd/01-overview-vision.md +440 -0
- data/docs/use_cases/README.md +119 -0
- data/docs/use_cases/UC-001-request-scoped-debug-buffering.md +813 -0
- data/docs/use_cases/UC-002-business-event-tracking.md +1953 -0
- data/docs/use_cases/UC-003-pattern-based-metrics.md +1627 -0
- data/docs/use_cases/UC-004-zero-config-slo-tracking.md +728 -0
- data/docs/use_cases/UC-005-sentry-integration.md +759 -0
- data/docs/use_cases/UC-006-trace-context-management.md +905 -0
- data/docs/use_cases/UC-007-pii-filtering.md +2648 -0
- data/docs/use_cases/UC-008-opentelemetry-integration.md +1153 -0
- data/docs/use_cases/UC-009-multi-service-tracing.md +1043 -0
- data/docs/use_cases/UC-010-background-job-tracking.md +1018 -0
- data/docs/use_cases/UC-011-rate-limiting.md +1906 -0
- data/docs/use_cases/UC-012-audit-trail.md +2301 -0
- data/docs/use_cases/UC-013-high-cardinality-protection.md +2127 -0
- data/docs/use_cases/UC-014-adaptive-sampling.md +1940 -0
- data/docs/use_cases/UC-015-cost-optimization.md +735 -0
- data/docs/use_cases/UC-016-rails-logger-migration.md +785 -0
- data/docs/use_cases/UC-017-local-development.md +867 -0
- data/docs/use_cases/UC-018-testing-events.md +1081 -0
- data/docs/use_cases/UC-019-tiered-storage-migration.md +562 -0
- data/docs/use_cases/UC-020-event-versioning.md +708 -0
- data/docs/use_cases/UC-021-error-handling-retry-dlq.md +956 -0
- data/docs/use_cases/UC-022-event-registry.md +648 -0
- data/docs/use_cases/backlog.md +226 -0
- data/e11y.gemspec +76 -0
- data/lib/e11y/adapters/adaptive_batcher.rb +207 -0
- data/lib/e11y/adapters/audit_encrypted.rb +239 -0
- data/lib/e11y/adapters/base.rb +580 -0
- data/lib/e11y/adapters/file.rb +224 -0
- data/lib/e11y/adapters/in_memory.rb +216 -0
- data/lib/e11y/adapters/loki.rb +333 -0
- data/lib/e11y/adapters/otel_logs.rb +203 -0
- data/lib/e11y/adapters/registry.rb +141 -0
- data/lib/e11y/adapters/sentry.rb +230 -0
- data/lib/e11y/adapters/stdout.rb +108 -0
- data/lib/e11y/adapters/yabeda.rb +370 -0
- data/lib/e11y/buffers/adaptive_buffer.rb +339 -0
- data/lib/e11y/buffers/base_buffer.rb +40 -0
- data/lib/e11y/buffers/request_scoped_buffer.rb +246 -0
- data/lib/e11y/buffers/ring_buffer.rb +267 -0
- data/lib/e11y/buffers.rb +14 -0
- data/lib/e11y/console.rb +122 -0
- data/lib/e11y/current.rb +48 -0
- data/lib/e11y/event/base.rb +894 -0
- data/lib/e11y/event/value_sampling_config.rb +84 -0
- data/lib/e11y/events/base_audit_event.rb +43 -0
- data/lib/e11y/events/base_payment_event.rb +33 -0
- data/lib/e11y/events/rails/cache/delete.rb +21 -0
- data/lib/e11y/events/rails/cache/read.rb +23 -0
- data/lib/e11y/events/rails/cache/write.rb +22 -0
- data/lib/e11y/events/rails/database/query.rb +45 -0
- data/lib/e11y/events/rails/http/redirect.rb +21 -0
- data/lib/e11y/events/rails/http/request.rb +26 -0
- data/lib/e11y/events/rails/http/send_file.rb +21 -0
- data/lib/e11y/events/rails/http/start_processing.rb +26 -0
- data/lib/e11y/events/rails/job/completed.rb +22 -0
- data/lib/e11y/events/rails/job/enqueued.rb +22 -0
- data/lib/e11y/events/rails/job/failed.rb +22 -0
- data/lib/e11y/events/rails/job/scheduled.rb +23 -0
- data/lib/e11y/events/rails/job/started.rb +22 -0
- data/lib/e11y/events/rails/log.rb +56 -0
- data/lib/e11y/events/rails/view/render.rb +23 -0
- data/lib/e11y/events.rb +18 -0
- data/lib/e11y/instruments/active_job.rb +201 -0
- data/lib/e11y/instruments/rails_instrumentation.rb +141 -0
- data/lib/e11y/instruments/sidekiq.rb +175 -0
- data/lib/e11y/logger/bridge.rb +205 -0
- data/lib/e11y/metrics/cardinality_protection.rb +172 -0
- data/lib/e11y/metrics/cardinality_tracker.rb +134 -0
- data/lib/e11y/metrics/registry.rb +234 -0
- data/lib/e11y/metrics/relabeling.rb +226 -0
- data/lib/e11y/metrics.rb +102 -0
- data/lib/e11y/middleware/audit_signing.rb +174 -0
- data/lib/e11y/middleware/base.rb +140 -0
- data/lib/e11y/middleware/event_slo.rb +167 -0
- data/lib/e11y/middleware/pii_filter.rb +266 -0
- data/lib/e11y/middleware/pii_filtering.rb +280 -0
- data/lib/e11y/middleware/rate_limiting.rb +214 -0
- data/lib/e11y/middleware/request.rb +163 -0
- data/lib/e11y/middleware/routing.rb +157 -0
- data/lib/e11y/middleware/sampling.rb +254 -0
- data/lib/e11y/middleware/slo.rb +168 -0
- data/lib/e11y/middleware/trace_context.rb +131 -0
- data/lib/e11y/middleware/validation.rb +118 -0
- data/lib/e11y/middleware/versioning.rb +132 -0
- data/lib/e11y/middleware.rb +12 -0
- data/lib/e11y/pii/patterns.rb +90 -0
- data/lib/e11y/pii.rb +13 -0
- data/lib/e11y/pipeline/builder.rb +155 -0
- data/lib/e11y/pipeline/zone_validator.rb +110 -0
- data/lib/e11y/pipeline.rb +12 -0
- data/lib/e11y/presets/audit_event.rb +65 -0
- data/lib/e11y/presets/debug_event.rb +34 -0
- data/lib/e11y/presets/high_value_event.rb +51 -0
- data/lib/e11y/presets.rb +19 -0
- data/lib/e11y/railtie.rb +138 -0
- data/lib/e11y/reliability/circuit_breaker.rb +216 -0
- data/lib/e11y/reliability/dlq/file_storage.rb +277 -0
- data/lib/e11y/reliability/dlq/filter.rb +117 -0
- data/lib/e11y/reliability/retry_handler.rb +207 -0
- data/lib/e11y/reliability/retry_rate_limiter.rb +117 -0
- data/lib/e11y/sampling/error_spike_detector.rb +225 -0
- data/lib/e11y/sampling/load_monitor.rb +161 -0
- data/lib/e11y/sampling/stratified_tracker.rb +92 -0
- data/lib/e11y/sampling/value_extractor.rb +82 -0
- data/lib/e11y/self_monitoring/buffer_monitor.rb +79 -0
- data/lib/e11y/self_monitoring/performance_monitor.rb +97 -0
- data/lib/e11y/self_monitoring/reliability_monitor.rb +146 -0
- data/lib/e11y/slo/event_driven.rb +150 -0
- data/lib/e11y/slo/tracker.rb +119 -0
- data/lib/e11y/version.rb +9 -0
- data/lib/e11y.rb +283 -0
- metadata +452 -0
|
@@ -0,0 +1,1018 @@
|
|
|
1
|
+
# UC-010: Background Job Tracking
|
|
2
|
+
|
|
3
|
+
**Status:** MVP Feature
|
|
4
|
+
**Complexity:** Intermediate
|
|
5
|
+
**Setup Time:** 20-30 minutes
|
|
6
|
+
**Target Users:** Backend Developers, SRE, DevOps
|
|
7
|
+
|
|
8
|
+
---
|
|
9
|
+
|
|
10
|
+
## 📋 Overview
|
|
11
|
+
|
|
12
|
+
### Problem Statement
|
|
13
|
+
|
|
14
|
+
**The invisible failure:**
|
|
15
|
+
```ruby
|
|
16
|
+
# ❌ NO VISIBILITY: Background jobs failing silently
|
|
17
|
+
class SendEmailJob < ApplicationJob
|
|
18
|
+
def perform(user_id)
|
|
19
|
+
user = User.find(user_id)
|
|
20
|
+
UserMailer.welcome(user).deliver_now
|
|
21
|
+
# What if this fails?
|
|
22
|
+
# - No log (unless you remember to add it)
|
|
23
|
+
# - No metric
|
|
24
|
+
# - No trace correlation with request that enqueued it
|
|
25
|
+
# - Can't see job duration/performance trends
|
|
26
|
+
end
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
# Problems:
|
|
30
|
+
# - Enqueued from request A (trace_id: abc-123)
|
|
31
|
+
# - Executes later (NEW trace_id: xyz-789)
|
|
32
|
+
# - Lost correlation between request and job!
|
|
33
|
+
# - Silent failures (retries happen but you don't know why)
|
|
34
|
+
# - No SLO tracking (how many jobs succeed? How fast?)
|
|
35
|
+
# - No visibility into job queue health
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
### E11y Solution
|
|
39
|
+
|
|
40
|
+
**Automatic job instrumentation with full traceability:**
|
|
41
|
+
```ruby
|
|
42
|
+
# ✅ AUTOMATIC TRACKING: Zero-config job observability
|
|
43
|
+
class SendEmailJob < ApplicationJob
|
|
44
|
+
def perform(user_id)
|
|
45
|
+
# E11y automatically tracks:
|
|
46
|
+
# - Job started
|
|
47
|
+
# - Job succeeded/failed
|
|
48
|
+
# - Duration
|
|
49
|
+
# - Trace ID (from enqueuing request!)
|
|
50
|
+
# - Retry attempts
|
|
51
|
+
# - Queue metrics
|
|
52
|
+
|
|
53
|
+
user = User.find(user_id)
|
|
54
|
+
UserMailer.welcome(user).deliver_now
|
|
55
|
+
|
|
56
|
+
# No explicit tracking needed! ✨
|
|
57
|
+
end
|
|
58
|
+
end
|
|
59
|
+
|
|
60
|
+
# Result (automatic events):
|
|
61
|
+
# 1. job.enqueued (trace_id: abc-123)
|
|
62
|
+
# 2. job.started (trace_id: abc-123 - preserved!)
|
|
63
|
+
# 3. job.succeeded (trace_id: abc-123, duration_ms: 1250)
|
|
64
|
+
#
|
|
65
|
+
# Metrics (automatic):
|
|
66
|
+
# - jobs_total{job="SendEmailJob",status="success"} = 1
|
|
67
|
+
# - jobs_duration_ms{job="SendEmailJob"} = 1250
|
|
68
|
+
# - jobs_queue_size{queue="default"} = 45
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
---
|
|
72
|
+
|
|
73
|
+
## 🎯 Features
|
|
74
|
+
|
|
75
|
+
> **Implementation:** See [ADR-008: Rails Integration](../ADR-008-rails-integration.md) for complete architecture, including [Section 5: Sidekiq Integration](../ADR-008-rails-integration.md#5-sidekiq-integration), [Section 6: ActiveJob Integration](../ADR-008-rails-integration.md#6-activejob-integration), and [Section 5.3: Job-Scoped Buffer](../ADR-008-rails-integration.md#53-job-scoped-buffer).
|
|
76
|
+
|
|
77
|
+
### 1. Automatic Instrumentation
|
|
78
|
+
|
|
79
|
+
**Zero-config for Sidekiq & ActiveJob:**
|
|
80
|
+
```ruby
|
|
81
|
+
# config/initializers/e11y.rb
|
|
82
|
+
E11y.configure do |config|
|
|
83
|
+
config.background_jobs do
|
|
84
|
+
# Auto-instrument Sidekiq
|
|
85
|
+
sidekiq enabled: true,
|
|
86
|
+
track_enqueue: true,
|
|
87
|
+
track_execution: true,
|
|
88
|
+
track_retries: true
|
|
89
|
+
|
|
90
|
+
# Auto-instrument ActiveJob
|
|
91
|
+
active_job enabled: true,
|
|
92
|
+
track_enqueue: true,
|
|
93
|
+
track_execution: true,
|
|
94
|
+
track_retries: true
|
|
95
|
+
|
|
96
|
+
# Metrics
|
|
97
|
+
metrics enabled: true,
|
|
98
|
+
include_queue_size: true,
|
|
99
|
+
include_latency: true
|
|
100
|
+
end
|
|
101
|
+
end
|
|
102
|
+
|
|
103
|
+
# That's it! All jobs automatically tracked ✨
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
**What gets tracked automatically:**
|
|
107
|
+
- ✅ Job enqueued (when `.perform_later` called)
|
|
108
|
+
- ✅ Job started (when worker picks it up)
|
|
109
|
+
- ✅ Job succeeded (on success)
|
|
110
|
+
- ✅ Job failed (on error)
|
|
111
|
+
- ✅ Job retried (after failure)
|
|
112
|
+
- ✅ Duration (total execution time)
|
|
113
|
+
- ✅ Latency (time in queue before execution)
|
|
114
|
+
- ✅ Trace ID (preserved from enqueuing request!)
|
|
115
|
+
|
|
116
|
+
---
|
|
117
|
+
|
|
118
|
+
### 2. Trace Correlation
|
|
119
|
+
|
|
120
|
+
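**Automatic trace_id propagation** (conceptual view; Section 6.1 below describes the Sidekiq trace middleware, which gives the job its own trace_id and keeps the enqueuing request's ID as `parent_trace_id`):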
|
|
121
|
+
```ruby
|
|
122
|
+
# Controller (trace_id: abc-123)
|
|
123
|
+
class OrdersController < ApplicationController
|
|
124
|
+
def create
|
|
125
|
+
order = Order.create!(order_params)
|
|
126
|
+
|
|
127
|
+
# Enqueue job (trace_id automatically passed!)
|
|
128
|
+
SendOrderConfirmationJob.perform_later(order.id)
|
|
129
|
+
|
|
130
|
+
render json: order
|
|
131
|
+
end
|
|
132
|
+
end
|
|
133
|
+
|
|
134
|
+
# Job (trace_id: abc-123 - preserved!)
|
|
135
|
+
class SendOrderConfirmationJob < ApplicationJob
|
|
136
|
+
def perform(order_id)
|
|
137
|
+
# E11y::TraceId.current == 'abc-123' ✅
|
|
138
|
+
|
|
139
|
+
order = Order.find(order_id)
|
|
140
|
+
OrderMailer.confirmation(order).deliver_now
|
|
141
|
+
end
|
|
142
|
+
end
|
|
143
|
+
|
|
144
|
+
# Timeline in Grafana: {trace_id="abc-123"}
|
|
145
|
+
# 10:00:00.000 [controller] order.created
|
|
146
|
+
# 10:00:00.050 [controller] job.enqueued (job: SendOrderConfirmation)
|
|
147
|
+
# 10:00:02.000 [job] job.started (job: SendOrderConfirmation, latency: 1950ms)
|
|
148
|
+
# 10:00:03.200 [job] email.sent
|
|
149
|
+
# 10:00:03.250 [job] job.succeeded (duration: 1250ms)
|
|
150
|
+
# → Complete trace across request + background job!
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
---
|
|
154
|
+
|
|
155
|
+
### 3. Retry Tracking
|
|
156
|
+
|
|
157
|
+
**Visibility into retry behavior:**
|
|
158
|
+
```ruby
|
|
159
|
+
class UnreliableApiJob < ApplicationJob
|
|
160
|
+
retry_on ApiError, wait: :exponentially_longer, attempts: 5
|
|
161
|
+
|
|
162
|
+
def perform(data)
|
|
163
|
+
# Might fail, will retry...
|
|
164
|
+
UnreliableApi.call(data)
|
|
165
|
+
end
|
|
166
|
+
end
|
|
167
|
+
|
|
168
|
+
# E11y automatically tracks retries:
|
|
169
|
+
# Attempt 1: job.started → job.failed (error: ApiTimeout)
|
|
170
|
+
# Attempt 2 (after 3s): job.retried (attempt: 2) → job.failed
|
|
171
|
+
# Attempt 3 (after 18s): job.retried (attempt: 3) → job.failed
|
|
172
|
+
# Attempt 4 (after 83s): job.retried (attempt: 4) → job.succeeded ✅
|
|
173
|
+
|
|
174
|
+
# Metrics:
|
|
175
|
+
# jobs_retried_total{job="UnreliableApiJob",attempt="2"} = 1
|
|
176
|
+
# jobs_retried_total{job="UnreliableApiJob",attempt="3"} = 1
|
|
177
|
+
# jobs_retried_total{job="UnreliableApiJob",attempt="4"} = 1
|
|
178
|
+
# jobs_retry_exhausted_total{job="UnreliableApiJob"} = 0 (succeeded on attempt 4)
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
---
|
|
182
|
+
|
|
183
|
+
### 4. Queue Health Metrics
|
|
184
|
+
|
|
185
|
+
**Monitor queue depth and processing:**
|
|
186
|
+
```ruby
|
|
187
|
+
# Automatic metrics (updated every 10s):
|
|
188
|
+
# jobs_queue_size{queue="default"} = 145
|
|
189
|
+
# jobs_queue_size{queue="mailers"} = 23
|
|
190
|
+
# jobs_queue_size{queue="critical"} = 2
|
|
191
|
+
#
|
|
192
|
+
# jobs_queue_latency_seconds{queue="default"} = 5.2 # Oldest job waiting 5.2s
|
|
193
|
+
# jobs_enqueued_total{queue="default"} = 1234
|
|
194
|
+
# jobs_processed_total{queue="default",status="success"} = 1200
|
|
195
|
+
# jobs_processed_total{queue="default",status="failed"} = 34
|
|
196
|
+
|
|
197
|
+
# Prometheus alerts:
|
|
198
|
+
# - Queue backlog: jobs_queue_size > 1000
|
|
199
|
+
# - High latency: jobs_queue_latency_seconds > 60
|
|
200
|
+
# - High failure rate: rate(jobs_processed_total{status="failed"}[5m]) > 10
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
---
|
|
204
|
+
|
|
205
|
+
### 5. Job-Specific Events
|
|
206
|
+
|
|
207
|
+
> **Note:** E11y supports **job-scoped buffering** similar to [UC-001: Request-Scoped Debug Buffering](./UC-001-request-scoped-debug-buffering.md). Debug events within a job are buffered and only flushed if the job fails. See [ADR-001 Section 3.4: Request-Scoped Buffer](../ADR-001-architecture.md#34-request-scoped-buffer) for implementation details (same architecture applies to jobs).
|
|
208
|
+
|
|
209
|
+
**Emit custom events within jobs:**
|
|
210
|
+
```ruby
|
|
211
|
+
class ProcessOrderJob < ApplicationJob
|
|
212
|
+
def perform(order_id)
|
|
213
|
+
order = Order.find(order_id)
|
|
214
|
+
|
|
215
|
+
# Custom events (inherit trace_id automatically!)
|
|
216
|
+
Events::OrderProcessingStarted.track(order_id: order.id)
|
|
217
|
+
|
|
218
|
+
# Step 1
|
|
219
|
+
Events::InventoryChecked.track(
|
|
220
|
+
order_id: order.id,
|
|
221
|
+
items_available: check_inventory(order)
|
|
222
|
+
)
|
|
223
|
+
|
|
224
|
+
# Step 2
|
|
225
|
+
Events::PaymentCaptured.track(
|
|
226
|
+
order_id: order.id,
|
|
227
|
+
amount: capture_payment(order)
|
|
228
|
+
)
|
|
229
|
+
|
|
230
|
+
# Step 3
|
|
231
|
+
Events::ShipmentScheduled.track(
|
|
232
|
+
order_id: order.id,
|
|
233
|
+
estimated_delivery: schedule_shipment(order)
|
|
234
|
+
)
|
|
235
|
+
|
|
236
|
+
Events::OrderProcessingCompleted.track(
|
|
237
|
+
order_id: order.id,
|
|
238
|
+
severity: :success
|
|
239
|
+
)
|
|
240
|
+
end
|
|
241
|
+
end
|
|
242
|
+
|
|
243
|
+
# Timeline: {trace_id="abc-123"}
|
|
244
|
+
# 10:00:00 [controller] order.created
|
|
245
|
+
# 10:00:01 [controller] job.enqueued
|
|
246
|
+
# 10:00:05 [job] job.started
|
|
247
|
+
# 10:00:05 [job] order.processing.started
|
|
248
|
+
# 10:00:06 [job] inventory.checked
|
|
249
|
+
# 10:00:08 [job] payment.captured
|
|
250
|
+
# 10:00:10 [job] shipment.scheduled
|
|
251
|
+
# 10:00:10 [job] order.processing.completed
|
|
252
|
+
# 10:00:10 [job] job.succeeded
|
|
253
|
+
# → Complete observability into job execution!
|
|
254
|
+
```
|
|
255
|
+
|
|
256
|
+
---
|
|
257
|
+
|
|
258
|
+
## 💻 Implementation Examples
|
|
259
|
+
|
|
260
|
+
### Example 1: Email Job with Retry
|
|
261
|
+
|
|
262
|
+
```ruby
|
|
263
|
+
class SendWelcomeEmailJob < ApplicationJob
|
|
264
|
+
queue_as :mailers
|
|
265
|
+
|
|
266
|
+
retry_on Net::SMTPServerBusy, wait: 5.seconds, attempts: 3
|
|
267
|
+
discard_on Net::SMTPFatalError # Don't retry permanent failures
|
|
268
|
+
|
|
269
|
+
def perform(user_id)
|
|
270
|
+
user = User.find(user_id)
|
|
271
|
+
|
|
272
|
+
# Track email sending (custom event)
|
|
273
|
+
Events::EmailSending.track(
|
|
274
|
+
user_id: user.id,
|
|
275
|
+
email: user.email,
|
|
276
|
+
template: 'welcome'
|
|
277
|
+
)
|
|
278
|
+
|
|
279
|
+
UserMailer.welcome(user).deliver_now
|
|
280
|
+
|
|
281
|
+
# Track success
|
|
282
|
+
Events::EmailSent.track(
|
|
283
|
+
user_id: user.id,
|
|
284
|
+
template: 'welcome',
|
|
285
|
+
severity: :success
|
|
286
|
+
)
|
|
287
|
+
end
|
|
288
|
+
end
|
|
289
|
+
|
|
290
|
+
# Automatic tracking (by E11y):
|
|
291
|
+
# 1. job.enqueued (when perform_later called)
|
|
292
|
+
# 2. job.started (when worker picks up)
|
|
293
|
+
# 3. email.sending (custom event)
|
|
294
|
+
# 4. email.sent (custom event)
|
|
295
|
+
# 5. job.succeeded
|
|
296
|
+
#
|
|
297
|
+
# If SMTP error:
|
|
298
|
+
# 1. job.enqueued
|
|
299
|
+
# 2. job.started
|
|
300
|
+
# 3. email.sending
|
|
301
|
+
# 4. job.failed (error: Net::SMTPServerBusy)
|
|
302
|
+
# 5. job.retried (attempt: 2, after 5s)
|
|
303
|
+
# 6. job.started (attempt 2)
|
|
304
|
+
# ... repeat until success or exhausted
|
|
305
|
+
```
|
|
306
|
+
|
|
307
|
+
---
|
|
308
|
+
|
|
309
|
+
### Example 2: Batch Processing Job
|
|
310
|
+
|
|
311
|
+
```ruby
|
|
312
|
+
class ProcessBatchJob < ApplicationJob
|
|
313
|
+
queue_as :batch_processing
|
|
314
|
+
|
|
315
|
+
def perform(batch_id)
|
|
316
|
+
batch = Batch.find(batch_id)
|
|
317
|
+
|
|
318
|
+
# Track batch processing
|
|
319
|
+
Events::BatchProcessingStarted.track(
|
|
320
|
+
batch_id: batch.id,
|
|
321
|
+
total_items: batch.items.count
|
|
322
|
+
)
|
|
323
|
+
|
|
324
|
+
processed = 0
|
|
325
|
+
failed = 0
|
|
326
|
+
|
|
327
|
+
batch.items.find_each do |item|
|
|
328
|
+
begin
|
|
329
|
+
process_item(item)
|
|
330
|
+
processed += 1
|
|
331
|
+
|
|
332
|
+
# Progress update every 100 items
|
|
333
|
+
if processed % 100 == 0
|
|
334
|
+
Events::BatchProgress.track(
|
|
335
|
+
batch_id: batch.id,
|
|
336
|
+
processed: processed,
|
|
337
|
+
total: batch.items.count,
|
|
338
|
+
progress_pct: (processed.to_f / batch.items.count * 100).round(2)
|
|
339
|
+
)
|
|
340
|
+
end
|
|
341
|
+
rescue => e
|
|
342
|
+
failed += 1
|
|
343
|
+
Events::BatchItemFailed.track(
|
|
344
|
+
batch_id: batch.id,
|
|
345
|
+
item_id: item.id,
|
|
346
|
+
error: e.message,
|
|
347
|
+
severity: :error
|
|
348
|
+
)
|
|
349
|
+
end
|
|
350
|
+
end
|
|
351
|
+
|
|
352
|
+
# Summary
|
|
353
|
+
Events::BatchProcessingCompleted.track(
|
|
354
|
+
batch_id: batch.id,
|
|
355
|
+
total_items: batch.items.count,
|
|
356
|
+
processed: processed,
|
|
357
|
+
failed: failed,
|
|
358
|
+
success_rate: (processed.to_f / batch.items.count * 100).round(2),
|
|
359
|
+
severity: failed == 0 ? :success : :warn
|
|
360
|
+
)
|
|
361
|
+
end
|
|
362
|
+
end
|
|
363
|
+
|
|
364
|
+
# Metrics (automatic from events):
|
|
365
|
+
# batch_processing_items_total{batch_id="123"} = 1000
|
|
366
|
+
# batch_processing_items_processed{batch_id="123"} = 980
|
|
367
|
+
# batch_processing_items_failed{batch_id="123"} = 20
|
|
368
|
+
# batch_processing_success_rate{batch_id="123"} = 98.0
|
|
369
|
+
```
|
|
370
|
+
|
|
371
|
+
---
|
|
372
|
+
|
|
373
|
+
### Example 3: Scheduled Job (Cron)
|
|
374
|
+
|
|
375
|
+
```ruby
|
|
376
|
+
class DailyReportJob < ApplicationJob
|
|
377
|
+
queue_as :reports
|
|
378
|
+
|
|
379
|
+
# Scheduled via whenever gem or Sidekiq-cron
|
|
380
|
+
def perform
|
|
381
|
+
# Track report generation
|
|
382
|
+
Events::ReportGenerationStarted.track(
|
|
383
|
+
report_type: 'daily_summary',
|
|
384
|
+
date: Date.today
|
|
385
|
+
)
|
|
386
|
+
|
|
387
|
+
begin
|
|
388
|
+
# Generate report
|
|
389
|
+
report = generate_daily_report
|
|
390
|
+
|
|
391
|
+
# Track success
|
|
392
|
+
Events::ReportGenerated.track(
|
|
393
|
+
report_type: 'daily_summary',
|
|
394
|
+
date: Date.today,
|
|
395
|
+
record_count: report.records.count,
|
|
396
|
+
file_size_bytes: report.file.size,
|
|
397
|
+
severity: :success
|
|
398
|
+
)
|
|
399
|
+
|
|
400
|
+
# Send to stakeholders
|
|
401
|
+
ReportMailer.daily_summary(report).deliver_now
|
|
402
|
+
|
|
403
|
+
rescue => e
|
|
404
|
+
# Track failure
|
|
405
|
+
Events::ReportGenerationFailed.track(
|
|
406
|
+
report_type: 'daily_summary',
|
|
407
|
+
date: Date.today,
|
|
408
|
+
error_class: e.class.name,
|
|
409
|
+
error_message: e.message,
|
|
410
|
+
severity: :error
|
|
411
|
+
)
|
|
412
|
+
|
|
413
|
+
# Alert ops team
|
|
414
|
+
raise # Will trigger Sidekiq retry
|
|
415
|
+
end
|
|
416
|
+
end
|
|
417
|
+
end
|
|
418
|
+
|
|
419
|
+
# Monitoring:
|
|
420
|
+
# - Daily at 6 AM: job.started
|
|
421
|
+
# - Success rate: jobs_processed_total{job="DailyReportJob",status="success"} / total
|
|
422
|
+
# - Alert if failed: jobs_processed_total{job="DailyReportJob",status="failed"} > 0
|
|
423
|
+
```
|
|
424
|
+
|
|
425
|
+
---
|
|
426
|
+
|
|
427
|
+
### Example 4: Chain of Jobs
|
|
428
|
+
|
|
429
|
+
```ruby
|
|
430
|
+
class OrderFulfillmentWorkflow
|
|
431
|
+
def self.start(order_id)
|
|
432
|
+
# Step 1: Validate order
|
|
433
|
+
ValidateOrderJob.perform_later(order_id)
|
|
434
|
+
end
|
|
435
|
+
end
|
|
436
|
+
|
|
437
|
+
class ValidateOrderJob < ApplicationJob
|
|
438
|
+
def perform(order_id)
|
|
439
|
+
order = Order.find(order_id)
|
|
440
|
+
|
|
441
|
+
Events::OrderValidationStarted.track(order_id: order.id)
|
|
442
|
+
|
|
443
|
+
if order.valid_for_fulfillment?
|
|
444
|
+
Events::OrderValidated.track(order_id: order.id, severity: :success)
|
|
445
|
+
|
|
446
|
+
# Chain to next step
|
|
447
|
+
ChargePaymentJob.perform_later(order_id)
|
|
448
|
+
else
|
|
449
|
+
Events::OrderValidationFailed.track(
|
|
450
|
+
order_id: order.id,
|
|
451
|
+
errors: order.validation_errors,
|
|
452
|
+
severity: :error
|
|
453
|
+
)
|
|
454
|
+
end
|
|
455
|
+
end
|
|
456
|
+
end
|
|
457
|
+
|
|
458
|
+
class ChargePaymentJob < ApplicationJob
|
|
459
|
+
def perform(order_id)
|
|
460
|
+
order = Order.find(order_id)
|
|
461
|
+
|
|
462
|
+
Events::PaymentCharging.track(order_id: order.id, amount: order.total)
|
|
463
|
+
|
|
464
|
+
payment = PaymentGateway.charge(order)
|
|
465
|
+
|
|
466
|
+
Events::PaymentCharged.track(
|
|
467
|
+
order_id: order.id,
|
|
468
|
+
transaction_id: payment.id,
|
|
469
|
+
severity: :success
|
|
470
|
+
)
|
|
471
|
+
|
|
472
|
+
# Chain to next step
|
|
473
|
+
FulfillOrderJob.perform_later(order_id)
|
|
474
|
+
end
|
|
475
|
+
end
|
|
476
|
+
|
|
477
|
+
class FulfillOrderJob < ApplicationJob
|
|
478
|
+
def perform(order_id)
|
|
479
|
+
order = Order.find(order_id)
|
|
480
|
+
|
|
481
|
+
Events::OrderFulfillmentStarted.track(order_id: order.id)
|
|
482
|
+
|
|
483
|
+
shipment = create_shipment(order)
|
|
484
|
+
|
|
485
|
+
Events::OrderFulfilled.track(
|
|
486
|
+
order_id: order.id,
|
|
487
|
+
shipment_id: shipment.id,
|
|
488
|
+
tracking_number: shipment.tracking_number,
|
|
489
|
+
severity: :success
|
|
490
|
+
)
|
|
491
|
+
end
|
|
492
|
+
end
|
|
493
|
+
|
|
494
|
+
# Timeline (same trace_id across all jobs!):
|
|
495
|
+
# 10:00:00 [controller] order.created (trace_id: abc-123)
|
|
496
|
+
# 10:00:01 [job] order.validation.started (trace_id: abc-123)
|
|
497
|
+
# 10:00:02 [job] order.validated (trace_id: abc-123)
|
|
498
|
+
# 10:00:03 [job] payment.charging (trace_id: abc-123)
|
|
499
|
+
# 10:00:05 [job] payment.charged (trace_id: abc-123)
|
|
500
|
+
# 10:00:06 [job] order.fulfillment.started (trace_id: abc-123)
|
|
501
|
+
# 10:00:10 [job] order.fulfilled (trace_id: abc-123)
|
|
502
|
+
# → Complete workflow trace!
|
|
503
|
+
```
|
|
504
|
+
|
|
505
|
+
---
|
|
506
|
+
|
|
507
|
+
### 6. Sidekiq Middleware Implementation (C17, C18 Resolutions) ⚠️
|
|
508
|
+
|
|
509
|
+
> **Reference:** See [ADR-005 §8.3 (C17)](../ADR-005-tracing-context.md#83-background-job-tracing-strategy-c17-resolution) and [ADR-013 §3.6 (C18)](../ADR-013-reliability-error-handling.md#36-event-tracking-in-background-jobs-c18-resolution) for full architecture.
|
|
510
|
+
|
|
511
|
+
E11y provides two critical Sidekiq middlewares:
|
|
512
|
+
|
|
513
|
+
#### 6.1. Trace Middleware (C17: New Trace + Parent Link)
|
|
514
|
+
|
|
515
|
+
**Problem:** Jobs need a NEW trace_id (so that each trace has a bounded duration) but must still link back to the parent request.
|
|
516
|
+
|
|
517
|
+
**Solution:** `E11y::Sidekiq::TraceMiddleware` creates a new trace and stores a link to the parent trace:
|
|
518
|
+
|
|
519
|
+
```ruby
|
|
520
|
+
# lib/e11y/sidekiq/trace_middleware.rb
|
|
521
|
+
module E11y
|
|
522
|
+
module Sidekiq
|
|
523
|
+
class TraceMiddleware
|
|
524
|
+
def call(worker, job, queue)
|
|
525
|
+
# Extract parent trace from job metadata
|
|
526
|
+
parent_trace_id = job['e11y_parent_trace_id']
|
|
527
|
+
|
|
528
|
+
# Start NEW trace for this job
|
|
529
|
+
new_trace_id = E11y::TraceContext.generate_trace_id
|
|
530
|
+
|
|
531
|
+
# Set trace context for job execution
|
|
532
|
+
E11y::TraceContext.with_trace(
|
|
533
|
+
trace_id: new_trace_id,
|
|
534
|
+
parent_trace_id: parent_trace_id # ✅ Link to parent!
|
|
535
|
+
) do
|
|
536
|
+
yield # Execute job
|
|
537
|
+
end
|
|
538
|
+
end
|
|
539
|
+
end
|
|
540
|
+
end
|
|
541
|
+
end
|
|
542
|
+
|
|
543
|
+
# Configuration (automatic in E11y):
|
|
544
|
+
::Sidekiq.configure_server do |config|
|
|
545
|
+
config.server_middleware do |chain|
|
|
546
|
+
chain.add E11y::Sidekiq::TraceMiddleware
|
|
547
|
+
end
|
|
548
|
+
end
|
|
549
|
+
|
|
550
|
+
# Usage (automatic):
|
|
551
|
+
class ProcessOrderJob < ApplicationJob
|
|
552
|
+
def perform(order_id)
|
|
553
|
+
# Current context:
|
|
554
|
+
# - trace_id: xyz-789 (NEW trace for job)
|
|
555
|
+
# - parent_trace_id: abc-123 (link to parent request)
|
|
556
|
+
|
|
557
|
+
Events::OrderProcessing.track(order_id: order_id)
|
|
558
|
+
# Event metadata automatically includes:
|
|
559
|
+
# - trace_id: xyz-789
|
|
560
|
+
# - parent_trace_id: abc-123
|
|
561
|
+
end
|
|
562
|
+
end
|
|
563
|
+
|
|
564
|
+
# When enqueuing from request:
|
|
565
|
+
E11y::TraceContext.with_trace(trace_id: 'abc-123') do
|
|
566
|
+
ProcessOrderJob.perform_later(order.id)
|
|
567
|
+
# Job metadata: { e11y_parent_trace_id: 'abc-123' }
|
|
568
|
+
end
|
|
569
|
+
```
|
|
570
|
+
|
|
571
|
+
**Benefits:**
|
|
572
|
+
- ✅ **Bounded traces:** Job traces don't inflate request SLO metrics
|
|
573
|
+
- ✅ **Full visibility:** Query `{trace_id="abc-123"} OR {parent_trace_id="abc-123"}` sees request + jobs
|
|
574
|
+
- ✅ **SLO accuracy:** Request P99 (200ms) ≠ Job P99 (5 minutes)
|
|
575
|
+
|
|
576
|
+
---
|
|
577
|
+
|
|
578
|
+
#### 6.2. Error Handling Middleware (C18: Non-Failing Event Tracking)
|
|
579
|
+
|
|
580
|
+
**Problem:** If E11y event tracking fails, the background job should NOT fail (business logic > observability).
|
|
581
|
+
|
|
582
|
+
**Solution:** `E11y::Sidekiq::ErrorHandlingMiddleware` rescues E11y failures:
|
|
583
|
+
|
|
584
|
+
```ruby
|
|
585
|
+
# lib/e11y/sidekiq/error_handling_middleware.rb
|
|
586
|
+
module E11y
|
|
587
|
+
module Sidekiq
|
|
588
|
+
class ErrorHandlingMiddleware
|
|
589
|
+
def call(worker, job, queue)
|
|
590
|
+
yield # Execute job
|
|
591
|
+
rescue => error
|
|
592
|
+
# Job business logic failed → let Sidekiq handle it
|
|
593
|
+
raise
|
|
594
|
+
ensure
|
|
595
|
+
# ✅ Wrap E11y tracking in rescue block
|
|
596
|
+
begin
|
|
597
|
+
# Track job completion (success or failure)
|
|
598
|
+
track_job_completion(worker, job, error)
|
|
599
|
+
rescue => e11y_error
|
|
600
|
+
# ⚠️ E11y tracking failed, but DON'T fail the job!
|
|
601
|
+
E11y.logger.error "E11y tracking failed: #{e11y_error.message}"
|
|
602
|
+
|
|
603
|
+
# Send to DLQ for later analysis (optional)
|
|
604
|
+
E11y::DeadLetterQueue.save({
|
|
605
|
+
event_name: 'e11y.tracking_failed',
|
|
606
|
+
job_class: worker.class.name,
|
|
607
|
+
job_id: job['jid'],
|
|
608
|
+
error: e11y_error.message
|
|
609
|
+
})
|
|
610
|
+
end
|
|
611
|
+
end
|
|
612
|
+
|
|
613
|
+
private
|
|
614
|
+
|
|
615
|
+
def track_job_completion(worker, job, error)
|
|
616
|
+
if error
|
|
617
|
+
Events::JobFailed.track(
|
|
618
|
+
job_class: worker.class.name,
|
|
619
|
+
job_id: job['jid'],
|
|
620
|
+
error_class: error.class.name,
|
|
621
|
+
error_message: error.message
|
|
622
|
+
)
|
|
623
|
+
else
|
|
624
|
+
Events::JobSucceeded.track(
|
|
625
|
+
job_class: worker.class.name,
|
|
626
|
+
job_id: job['jid'],
|
|
627
|
+
duration_ms: job_duration(job)
|
|
628
|
+
)
|
|
629
|
+
end
|
|
630
|
+
end
|
|
631
|
+
end
|
|
632
|
+
end
|
|
633
|
+
end
|
|
634
|
+
|
|
635
|
+
# Configuration (automatic in E11y):
|
|
636
|
+
::Sidekiq.configure_server do |config|
|
|
637
|
+
config.server_middleware do |chain|
|
|
638
|
+
chain.add E11y::Sidekiq::ErrorHandlingMiddleware
|
|
639
|
+
end
|
|
640
|
+
end
|
|
641
|
+
|
|
642
|
+
# Configuration (explicit control):
|
|
643
|
+
E11y.configure do |config|
|
|
644
|
+
config.error_handling do
|
|
645
|
+
# ✅ Don't fail jobs on E11y errors
|
|
646
|
+
fail_on_error_in_jobs false # Default: false
|
|
647
|
+
|
|
648
|
+
# Send failed tracking to DLQ
|
|
649
|
+
send_tracking_failures_to_dlq true
|
|
650
|
+
end
|
|
651
|
+
end
|
|
652
|
+
```
|
|
653
|
+
|
|
654
|
+
**Example Scenario:**
|
|
655
|
+
|
|
656
|
+
```ruby
|
|
657
|
+
class ProcessPaymentJob < ApplicationJob
|
|
658
|
+
def perform(order_id)
|
|
659
|
+
# 1. Business logic (MUST succeed!)
|
|
660
|
+
payment = Payment.create!(order_id: order_id, amount: 99.99)
|
|
661
|
+
Stripe.charge(payment)
|
|
662
|
+
|
|
663
|
+
# 2. E11y tracking (NICE to have, but not critical)
|
|
664
|
+
Events::PaymentProcessed.track(order_id: order_id, amount: 99.99)
|
|
665
|
+
# ⚠️ If this fails (Loki down, network timeout):
|
|
666
|
+
# - Error caught by ErrorHandlingMiddleware
|
|
667
|
+
# - Logged to E11y.logger
|
|
668
|
+
# - Saved to DLQ (for replay later)
|
|
669
|
+
# - Job STILL SUCCEEDS! ✅
|
|
670
|
+
# - Payment was created and charged successfully
|
|
671
|
+
end
|
|
672
|
+
end
|
|
673
|
+
|
|
674
|
+
# Timeline (when E11y tracking fails):
|
|
675
|
+
# 10:00:00 Job started
|
|
676
|
+
# 10:00:01 Payment created (SUCCESS ✅)
|
|
677
|
+
# 10:00:02 Stripe charged (SUCCESS ✅)
|
|
678
|
+
# 10:00:03 E11y tracking failed (Loki timeout ❌)
|
|
679
|
+
# → Error caught by middleware
|
|
680
|
+
# → Event saved to DLQ
|
|
681
|
+
# → Job marked as SUCCESS ✅ (business logic succeeded!)
|
|
682
|
+
# 10:00:04 Job completed successfully
|
|
683
|
+
|
|
684
|
+
# Later (when Loki is back online):
|
|
685
|
+
# Replay DLQ → the event that failed to send is tracked retroactively
|
|
686
|
+
```
|
|
687
|
+
|
|
688
|
+
**Trade-offs:**
|
|
689
|
+
|
|
690
|
+
| Aspect | Pro | Con | Decision |
|
|
691
|
+
|--------|-----|-----|----------|
|
|
692
|
+
| **fail_on_error_in_jobs: false** | Business logic always succeeds | Silent E11y failures | Business logic > observability |
|
|
693
|
+
| **DLQ for failed tracking** | Can replay events later | DLQ overhead | Worth it for critical events |
|
|
694
|
+
| **Error logging** | Visibility into E11y issues | Log noise if Loki down | Logged at ERROR level (not spam) |
|
|
695
|
+
|
|
696
|
+
**Why this matters:**
|
|
697
|
+
|
|
698
|
+
```ruby
|
|
699
|
+
# ❌ BAD: E11y failure fails the job
|
|
700
|
+
config.error_handling.fail_on_error_in_jobs = true
|
|
701
|
+
|
|
702
|
+
# Job fails if Loki is down:
|
|
703
|
+
# - Payment was created successfully
|
|
704
|
+
# - Stripe was charged successfully
|
|
705
|
+
# - BUT: Job retried because E11y tracking failed!
|
|
706
|
+
# - Result: Duplicate payments! 💸💸💸
|
|
707
|
+
|
|
708
|
+
# ✅ GOOD: E11y failure doesn't fail the job
|
|
709
|
+
config.error_handling.fail_on_error_in_jobs = false
|
|
710
|
+
|
|
711
|
+
# Job succeeds even if Loki is down:
|
|
712
|
+
# - Payment created ✅
|
|
713
|
+
# - Stripe charged ✅
|
|
714
|
+
# - E11y tracking saved to DLQ (replay later)
|
|
715
|
+
# - Job marked as successful ✅
|
|
716
|
+
# - No duplicate payments!
|
|
717
|
+
```
|
|
718
|
+
|
|
719
|
+
---
|
|
720
|
+
|
|
721
|
+
## 🔧 Configuration
|
|
722
|
+
|
|
723
|
+
### Full Configuration
|
|
724
|
+
|
|
725
|
+
```ruby
|
|
726
|
+
# config/initializers/e11y.rb
|
|
727
|
+
E11y.configure do |config|
|
|
728
|
+
config.background_jobs do
|
|
729
|
+
# === SIDEKIQ ===
|
|
730
|
+
sidekiq do
|
|
731
|
+
enabled true
|
|
732
|
+
|
|
733
|
+
# What to track
|
|
734
|
+
track_enqueue true # When job added to queue
|
|
735
|
+
track_start true # When worker picks up job
|
|
736
|
+
track_success true # On successful completion
|
|
737
|
+
track_failure true # On error
|
|
738
|
+
track_retry true # On retry attempts
|
|
739
|
+
|
|
740
|
+
# Trace propagation
|
|
741
|
+
propagate_trace_id true # Preserve trace_id from request
|
|
742
|
+
|
|
743
|
+
# Metrics
|
|
744
|
+
metrics do
|
|
745
|
+
enabled true
|
|
746
|
+
include_queue_size true
|
|
747
|
+
include_latency true
|
|
748
|
+
include_processing_time true
|
|
749
|
+
end
|
|
750
|
+
|
|
751
|
+
# Custom job metadata
|
|
752
|
+
include_metadata [:queue, :retry_count, :scheduled_at, :enqueued_at]
|
|
753
|
+
end
|
|
754
|
+
|
|
755
|
+
# === ACTIVEJOB ===
|
|
756
|
+
active_job do
|
|
757
|
+
enabled true
|
|
758
|
+
track_enqueue true
|
|
759
|
+
track_start true
|
|
760
|
+
track_success true
|
|
761
|
+
track_failure true
|
|
762
|
+
track_retry true
|
|
763
|
+
propagate_trace_id true
|
|
764
|
+
|
|
765
|
+
# ActiveJob-specific
|
|
766
|
+
include_adapter_name true # Sidekiq, Resque, etc.
|
|
767
|
+
include_executions true # Execution count
|
|
768
|
+
end
|
|
769
|
+
|
|
770
|
+
# === FILTERING ===
|
|
771
|
+
# Don't track these jobs (too noisy)
|
|
772
|
+
ignore_jobs ['HeartbeatJob', 'HealthCheckJob']
|
|
773
|
+
|
|
774
|
+
# Sample high-volume jobs
|
|
775
|
+
sample_jobs 'FrequentJob' => 0.1 # 10% sampling
|
|
776
|
+
|
|
777
|
+
# === ALERTING ===
|
|
778
|
+
alert_on do
|
|
779
|
+
# Alert on job failures
|
|
780
|
+
failure_rate threshold: 0.05, # >5% failure rate
|
|
781
|
+
window: 5.minutes
|
|
782
|
+
|
|
783
|
+
# Alert on long queue
|
|
784
|
+
queue_size threshold: 1000,
|
|
785
|
+
queue: 'critical'
|
|
786
|
+
|
|
787
|
+
# Alert on high latency
|
|
788
|
+
queue_latency threshold: 60.seconds,
|
|
789
|
+
queue: 'default'
|
|
790
|
+
end
|
|
791
|
+
end
|
|
792
|
+
end
|
|
793
|
+
```
|
|
794
|
+
|
|
795
|
+
---
|
|
796
|
+
|
|
797
|
+
## 📊 Metrics
|
|
798
|
+
|
|
799
|
+
**Automatic metrics from job tracking:**
|
|
800
|
+
```ruby
|
|
801
|
+
# === JOB EXECUTION ===
|
|
802
|
+
jobs_enqueued_total{job,queue} # Jobs added to queue
|
|
803
|
+
jobs_started_total{job,queue} # Jobs picked up by worker
|
|
804
|
+
jobs_processed_total{job,queue,status} # Jobs completed (status: success/failed)
|
|
805
|
+
jobs_duration_seconds{job,queue} # Job execution time (histogram)
|
|
806
|
+
jobs_latency_seconds{job,queue} # Time in queue before execution
|
|
807
|
+
|
|
808
|
+
# === RETRIES ===
|
|
809
|
+
jobs_retried_total{job,queue,attempt} # Retry attempts
|
|
810
|
+
jobs_retry_exhausted_total{job,queue} # Jobs that exhausted retries
|
|
811
|
+
|
|
812
|
+
# === QUEUE HEALTH ===
|
|
813
|
+
jobs_queue_size{queue} # Current queue depth
|
|
814
|
+
jobs_queue_latency_seconds{queue} # Oldest job waiting time
|
|
815
|
+
jobs_working{queue} # Jobs currently processing
|
|
816
|
+
|
|
817
|
+
# === SUCCESS RATES ===
|
|
818
|
+
jobs_success_rate{job,queue} # Success / (Success + Failed)
|
|
819
|
+
|
|
820
|
+
# Prometheus queries:
|
|
821
|
+
# - Job success rate:
|
|
822
|
+
# sum(rate(jobs_processed_total{status="success"}[5m])) / sum(rate(jobs_processed_total[5m]))
|
|
823
|
+
#
|
|
824
|
+
# - p95 job duration:
|
|
825
|
+
# histogram_quantile(0.95, sum by (le) (rate(jobs_duration_seconds_bucket[5m])))
|
|
826
|
+
#
|
|
827
|
+
# - Queue backlog:
|
|
828
|
+
# jobs_queue_size > 1000
|
|
829
|
+
```
|
|
830
|
+
|
|
831
|
+
---
|
|
832
|
+
|
|
833
|
+
## 🧪 Testing
|
|
834
|
+
|
|
835
|
+
```ruby
|
|
836
|
+
# spec/jobs/send_email_job_spec.rb
|
|
837
|
+
RSpec.describe SendEmailJob do
|
|
838
|
+
include ActiveJob::TestHelper
|
|
839
|
+
|
|
840
|
+
it 'tracks job execution' do
|
|
841
|
+
user = create(:user)
|
|
842
|
+
|
|
843
|
+
# Track enqueue event
|
|
844
|
+
expect {
|
|
845
|
+
SendEmailJob.perform_later(user.id)
|
|
846
|
+
}.to track_event('job.enqueued').with(
|
|
847
|
+
job_class: 'SendEmailJob',
|
|
848
|
+
queue: 'mailers'
|
|
849
|
+
)
|
|
850
|
+
|
|
851
|
+
# Execute job
|
|
852
|
+
perform_enqueued_jobs
|
|
853
|
+
|
|
854
|
+
# Verify events tracked
|
|
855
|
+
events = E11y::Buffer.flush
|
|
856
|
+
expect(events.map(&:event_name)).to include(
|
|
857
|
+
'job.started',
|
|
858
|
+
'email.sent',
|
|
859
|
+
'job.succeeded'
|
|
860
|
+
)
|
|
861
|
+
|
|
862
|
+
# Verify all events share same trace_id
|
|
863
|
+
trace_ids = events.map { |e| e[:trace_id] }.uniq
|
|
864
|
+
expect(trace_ids.size).to eq(1)
|
|
865
|
+
end
|
|
866
|
+
|
|
867
|
+
it 'tracks job retries' do
|
|
868
|
+
user = create(:user)
|
|
869
|
+
|
|
870
|
+
# Simulate failure
|
|
871
|
+
allow(UserMailer).to receive(:welcome).and_raise(Net::SMTPServerBusy)
|
|
872
|
+
|
|
873
|
+
# Perform (will fail and retry)
|
|
874
|
+
perform_enqueued_jobs
|
|
875
|
+
|
|
876
|
+
# Verify retry event
|
|
877
|
+
events = E11y::Buffer.flush
|
|
878
|
+
expect(events.map(&:event_name)).to include('job.failed', 'job.retried')
|
|
879
|
+
|
|
880
|
+
retry_event = events.find { |e| e[:event_name] == 'job.retried' }
|
|
881
|
+
expect(retry_event[:payload][:attempt]).to eq(2)
|
|
882
|
+
end
|
|
883
|
+
end
|
|
884
|
+
|
|
885
|
+
# RSpec matcher (custom)
|
|
886
|
+
RSpec::Matchers.define :track_event do |event_name|
|
|
887
|
+
match do |block|
|
|
888
|
+
before_count = E11y::Buffer.size
|
|
889
|
+
block.call
|
|
890
|
+
after_count = E11y::Buffer.size
|
|
891
|
+
|
|
892
|
+
new_events = E11y::Buffer.pop(after_count - before_count)
|
|
893
|
+
@tracked_event = new_events.find { |e| e[:event_name] == event_name }
|
|
894
|
+
|
|
895
|
+
@tracked_event.present?
|
|
896
|
+
end
|
|
897
|
+
|
|
898
|
+
chain :with do |expected_payload|
|
|
899
|
+
@expected_payload = expected_payload
|
|
900
|
+
end
|
|
901
|
+
|
|
902
|
+
match_when_negated do |block|
|
|
903
|
+
!@tracked_event
|
|
904
|
+
end
|
|
905
|
+
end
|
|
906
|
+
```
|
|
907
|
+
|
|
908
|
+
---
|
|
909
|
+
|
|
910
|
+
## 💡 Best Practices
|
|
911
|
+
|
|
912
|
+
### ✅ DO
|
|
913
|
+
|
|
914
|
+
**1. Let E11y auto-track job lifecycle**
|
|
915
|
+
```ruby
|
|
916
|
+
# ✅ GOOD: Auto-tracking handles basics
|
|
917
|
+
class MyJob < ApplicationJob
|
|
918
|
+
def perform(data)
|
|
919
|
+
# Job lifecycle tracked automatically
|
|
920
|
+
process(data)
|
|
921
|
+
end
|
|
922
|
+
end
|
|
923
|
+
```
|
|
924
|
+
|
|
925
|
+
**2. Add custom events for business logic**
|
|
926
|
+
```ruby
|
|
927
|
+
# ✅ GOOD: Track important business steps
|
|
928
|
+
class ProcessOrderJob < ApplicationJob
|
|
929
|
+
def perform(order_id)
|
|
930
|
+
Events::InventoryChecked.track(...)
|
|
931
|
+
Events::PaymentCaptured.track(...)
|
|
932
|
+
Events::ShipmentScheduled.track(...)
|
|
933
|
+
end
|
|
934
|
+
end
|
|
935
|
+
```
|
|
936
|
+
|
|
937
|
+
**3. Monitor queue health**
|
|
938
|
+
```ruby
|
|
939
|
+
# ✅ GOOD: Alert on queue issues
|
|
940
|
+
# Alert: jobs_queue_size{queue="critical"} > 100
|
|
941
|
+
# Alert: jobs_queue_latency_seconds > 60
|
|
942
|
+
```
|
|
943
|
+
|
|
944
|
+
---
|
|
945
|
+
|
|
946
|
+
### ❌ DON'T
|
|
947
|
+
|
|
948
|
+
**1. Don't manually track job start/end**
|
|
949
|
+
```ruby
|
|
950
|
+
# ❌ BAD: Redundant (auto-tracked!)
|
|
951
|
+
class MyJob < ApplicationJob
|
|
952
|
+
def perform(data)
|
|
953
|
+
Events::JobStarted.track(...) # ← E11y does this!
|
|
954
|
+
process(data)
|
|
955
|
+
Events::JobEnded.track(...) # ← E11y does this!
|
|
956
|
+
end
|
|
957
|
+
end
|
|
958
|
+
|
|
959
|
+
# ✅ GOOD: Let E11y handle it
|
|
960
|
+
class MyJob < ApplicationJob
|
|
961
|
+
def perform(data)
|
|
962
|
+
process(data) # That's it!
|
|
963
|
+
end
|
|
964
|
+
end
|
|
965
|
+
```
|
|
966
|
+
|
|
967
|
+
**2. Don't ignore retry signals**
|
|
968
|
+
```ruby
|
|
969
|
+
# ❌ BAD: Swallowed error (job is never retried)
|
|
970
|
+
class MyJob < ApplicationJob
|
|
971
|
+
def perform(data)
|
|
972
|
+
process(data)
|
|
973
|
+
rescue => e
|
|
974
|
+
# Swallowing error = no retry!
|
|
975
|
+
end
|
|
976
|
+
end
|
|
977
|
+
|
|
978
|
+
# ✅ GOOD: Let errors bubble up
|
|
979
|
+
class MyJob < ApplicationJob
|
|
980
|
+
retry_on ApiError, wait: 5.seconds
|
|
981
|
+
|
|
982
|
+
def perform(data)
|
|
983
|
+
process(data) # Error bubbles up → auto retry
|
|
984
|
+
end
|
|
985
|
+
end
|
|
986
|
+
```
|
|
987
|
+
|
|
988
|
+
---
|
|
989
|
+
|
|
990
|
+
## 📚 Related Use Cases
|
|
991
|
+
|
|
992
|
+
- **[UC-006: Trace Context Management](./UC-006-trace-context-management.md)** - Trace propagation
|
|
993
|
+
- **[UC-004: Zero-Config SLO Tracking](./UC-004-zero-config-slo-tracking.md)** - Job SLOs
|
|
994
|
+
|
|
995
|
+
---
|
|
996
|
+
|
|
997
|
+
## 🎯 Summary
|
|
998
|
+
|
|
999
|
+
### Zero-Config Benefits
|
|
1000
|
+
|
|
1001
|
+
| Feature | Manual Approach | E11y Auto-Tracking |
|
|
1002
|
+
|---------|----------------|-------------------|
|
|
1003
|
+
| Job lifecycle | 50 lines/job | 0 lines (automatic!) |
|
|
1004
|
+
| Trace correlation | Complex middleware | Automatic |
|
|
1005
|
+
| Retry tracking | Manual counters | Automatic |
|
|
1006
|
+
| Queue metrics | External gem | Built-in |
|
|
1007
|
+
| SLO tracking | Custom code | Automatic |
|
|
1008
|
+
|
|
1009
|
+
**Developer Experience:**
|
|
1010
|
+
- ❌ Before: 50+ lines of tracking code per job
|
|
1011
|
+
- ✅ After: 0 lines (fully automatic!)
|
|
1012
|
+
- **Time saved:** 30 min per job → 0 min
|
|
1013
|
+
|
|
1014
|
+
---
|
|
1015
|
+
|
|
1016
|
+
**Document Version:** 1.0
|
|
1017
|
+
**Last Updated:** January 12, 2026
|
|
1018
|
+
**Status:** ✅ Complete
|