opentrace 0.3.0 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +472 -7
- data/lib/opentrace/circuit_breaker.rb +61 -0
- data/lib/opentrace/client.rb +284 -16
- data/lib/opentrace/config.rb +57 -2
- data/lib/opentrace/http_tracker.rb +82 -0
- data/lib/opentrace/log_forwarder.rb +6 -1
- data/lib/opentrace/middleware.rb +87 -0
- data/lib/opentrace/pool_monitor.rb +59 -0
- data/lib/opentrace/queue_monitor.rb +110 -0
- data/lib/opentrace/rails.rb +257 -12
- data/lib/opentrace/request_collector.rb +141 -0
- data/lib/opentrace/stats.rb +47 -0
- data/lib/opentrace/trace_context.rb +57 -0
- data/lib/opentrace/version.rb +1 -1
- data/lib/opentrace.rb +125 -30
- metadata +8 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: fbe3de5afdb5f92afcef49d368b8f8b3714ca17fe453b168066f53a5b0e5e1ba
|
|
4
|
+
data.tar.gz: 8f749e943951c939e7daa8f93a454bb0c89646bc295c0550c680064ebf38db16
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: e3ecb7c4b649951f64e1090bb7ea06da69cb603dcd44534c9a5e71d76eb697239fddf2267cd2a5e0cff2aa1bf86dc78be6733be754df8e5c6a435967d90809a8
|
|
7
|
+
data.tar.gz: 3625d9de1cd2dda011f69283b34a17964450cda3c44c0f19431fb450e2a24328f1c260a12ea9929331a54866ac90afd07addd65e16cb2eee9e42a939807f3a69
|
data/README.md
CHANGED
|
@@ -19,17 +19,28 @@ A thin, safe Ruby client that forwards structured application logs to an [OpenTr
|
|
|
19
19
|
- **Works with any server** -- Puma (threads), Unicorn (forks), Passenger, and Falcon (fibers)
|
|
20
20
|
- **Fork safe** -- detects forked worker processes and re-initializes cleanly
|
|
21
21
|
- **Fiber safe** -- uses `Fiber[]` storage for correct request isolation in fiber-based servers
|
|
22
|
-
- **Rails integration** -- auto-instruments controllers, SQL queries,
|
|
22
|
+
- **Rails integration** -- auto-instruments controllers, SQL queries, ActiveJob, views, cache, and more
|
|
23
23
|
- **Rack middleware** -- propagates `request_id` via fiber-local storage
|
|
24
24
|
- **Logger wrapper** -- drop-in replacement that forwards to OpenTrace while keeping your original logger
|
|
25
25
|
- **Rails 7.1+ BroadcastLogger** -- native support via `broadcast_to`
|
|
26
26
|
- **TaggedLogging** -- preserves `ActiveSupport::TaggedLogging` tags in metadata
|
|
27
27
|
- **Context support** -- attach global metadata to every log via Hash or Proc
|
|
28
|
-
- **
|
|
28
|
+
- **Business events** -- `OpenTrace.event` sends typed events (e.g. `payment.completed`) that bypass level filtering
|
|
29
|
+
- **Level filtering** -- `min_level` threshold or `allowed_levels` list to control which severities are forwarded
|
|
29
30
|
- **Auto-enrichment** -- every log includes `hostname`, `pid`, and `git_sha` automatically
|
|
30
|
-
- **Exception helper** -- `OpenTrace.error` captures class, message, and
|
|
31
|
+
- **Exception helper** -- `OpenTrace.error` captures class, message, cleaned backtrace, and error fingerprint
|
|
31
32
|
- **Runtime controls** -- enable/disable logging at runtime without restarting
|
|
32
33
|
- **Graceful shutdown** -- pending logs are flushed automatically on process exit
|
|
34
|
+
- **N+1 query detection** -- warns when a request exceeds 20 SQL queries
|
|
35
|
+
- **Per-request summary** -- one rich log per request with SQL, view, cache breakdown and timeline
|
|
36
|
+
- **Error fingerprinting** -- stable fingerprint for grouping identical errors across requests
|
|
37
|
+
- **Deprecation tracking** -- captures Rails deprecation warnings with callsite
|
|
38
|
+
- **DB pool monitoring** -- background thread reports connection pool saturation (opt-in)
|
|
39
|
+
- **Job queue depth** -- monitors Sidekiq, GoodJob, or SolidQueue queue sizes (opt-in)
|
|
40
|
+
- **Memory delta tracking** -- snapshots process RSS before/after each request (opt-in)
|
|
41
|
+
- **External HTTP tracking** -- captures outbound Net::HTTP calls with timing (opt-in)
|
|
42
|
+
- **Version negotiation** -- startup compatibility check with capability-based feature detection
|
|
43
|
+
- **Distributed tracing** -- W3C Trace Context (`traceparent`) propagation across services with span IDs
|
|
33
44
|
|
|
34
45
|
## Installation
|
|
35
46
|
|
|
@@ -79,6 +90,7 @@ OpenTrace.configure do |c|
|
|
|
79
90
|
c.timeout = 1.0 # HTTP timeout in seconds (default: 1.0)
|
|
80
91
|
c.enabled = true # default: true
|
|
81
92
|
c.min_level = :info # minimum level to forward (default: :debug)
|
|
93
|
+
c.allowed_levels = [:warn, :error] # explicit level list (overrides min_level, default: nil)
|
|
82
94
|
c.batch_size = 50 # logs per batch (default: 50)
|
|
83
95
|
c.flush_interval = 5.0 # seconds between flushes (default: 5.0)
|
|
84
96
|
|
|
@@ -95,6 +107,24 @@ OpenTrace.configure do |c|
|
|
|
95
107
|
# SQL logging (Rails only)
|
|
96
108
|
c.sql_logging = true # default: true
|
|
97
109
|
c.sql_duration_threshold_ms = 100.0 # only log queries slower than this (default: 0.0 = all)
|
|
110
|
+
|
|
111
|
+
# Path filtering
|
|
112
|
+
c.ignore_paths = ["/health", %r{\A/assets/}] # skip noisy paths (default: [])
|
|
113
|
+
|
|
114
|
+
# Per-request summary (Rails only)
|
|
115
|
+
c.request_summary = true # accumulate events into one rich log (default: true)
|
|
116
|
+
c.timeline = true # include event timeline in summary (default: true)
|
|
117
|
+
c.timeline_max_events = 200 # cap timeline entries (default: 200)
|
|
118
|
+
|
|
119
|
+
# Background monitors (opt-in)
|
|
120
|
+
c.pool_monitoring = false # DB connection pool stats (default: false)
|
|
121
|
+
c.pool_monitoring_interval = 30 # seconds between checks (default: 30)
|
|
122
|
+
c.queue_monitoring = false # job queue depth monitoring (default: false)
|
|
123
|
+
c.queue_monitoring_interval = 60 # seconds between checks (default: 60)
|
|
124
|
+
|
|
125
|
+
# Advanced opt-in features
|
|
126
|
+
c.memory_tracking = false # RSS delta per request (default: false)
|
|
127
|
+
c.http_tracking = false # external HTTP call tracking (default: false)
|
|
98
128
|
end
|
|
99
129
|
```
|
|
100
130
|
|
|
@@ -102,15 +132,21 @@ If any required field (`endpoint`, `api_key`, `service`) is missing or empty, th
|
|
|
102
132
|
|
|
103
133
|
### Level Filtering
|
|
104
134
|
|
|
105
|
-
Control which log levels are forwarded with `min_level
|
|
135
|
+
Control which log levels are forwarded with `min_level` (threshold) or `allowed_levels` (explicit list):
|
|
106
136
|
|
|
107
137
|
```ruby
|
|
108
138
|
OpenTrace.configure do |c|
|
|
109
139
|
# ...
|
|
140
|
+
# Option A: Threshold — forward this level and above
|
|
110
141
|
c.min_level = :warn # only forward WARN, ERROR, and FATAL
|
|
142
|
+
|
|
143
|
+
# Option B: Explicit list — forward only these levels (overrides min_level)
|
|
144
|
+
c.allowed_levels = [:warn, :error] # only forward WARN and ERROR
|
|
111
145
|
end
|
|
112
146
|
```
|
|
113
147
|
|
|
148
|
+
When `allowed_levels` is set, it takes precedence over `min_level`. When `allowed_levels` is `nil` (the default), `min_level` is used.
|
|
149
|
+
|
|
114
150
|
Available levels: `:debug`, `:info`, `:warn`, `:error`, `:fatal`
|
|
115
151
|
|
|
116
152
|
## Usage
|
|
@@ -134,7 +170,7 @@ Pass `trace_id` inside metadata and it will be promoted to a top-level field aut
|
|
|
134
170
|
|
|
135
171
|
### Exception Logging
|
|
136
172
|
|
|
137
|
-
Use `OpenTrace.error` to log exceptions with automatic class, message, and
|
|
173
|
+
Use `OpenTrace.error` to log exceptions with automatic class, message, backtrace, and fingerprint extraction:
|
|
138
174
|
|
|
139
175
|
```ruby
|
|
140
176
|
begin
|
|
@@ -148,6 +184,19 @@ This captures:
|
|
|
148
184
|
- `exception_class` -- the exception class name
|
|
149
185
|
- `exception_message` -- truncated to 500 characters
|
|
150
186
|
- `backtrace` -- cleaned (Rails backtrace cleaner or gem-filtered), limited to 15 frames
|
|
187
|
+
- `error_fingerprint` -- 12-char hash for grouping identical errors (stable across line number changes)
|
|
188
|
+
|
|
189
|
+
### Business Events
|
|
190
|
+
|
|
191
|
+
Use `OpenTrace.event` to send typed business events. Events are always sent at `INFO` level and **bypass level filtering** — they are never suppressed by `min_level` or `allowed_levels`:
|
|
192
|
+
|
|
193
|
+
```ruby
|
|
194
|
+
OpenTrace.event("payment.completed", "User paid $49.99", { user_id: 42, amount: 49.99 })
|
|
195
|
+
OpenTrace.event("auth.login", "Google OAuth login", { provider: "google", user_id: 7 })
|
|
196
|
+
OpenTrace.event("order.shipped", "Order dispatched", { order_id: "ORD-123" })
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
Events include an `event_type` field in the payload, making them filterable on the server. They inherit context, `request_id`, and static context just like normal logs.
|
|
151
200
|
|
|
152
201
|
### Logger Wrapper
|
|
153
202
|
|
|
@@ -220,6 +269,63 @@ Request IDs are stored using `Fiber[]` (fiber-local storage), which works correc
|
|
|
220
269
|
|
|
221
270
|
All your existing `Rails.logger.info(...)` calls automatically get forwarded to OpenTrace.
|
|
222
271
|
|
|
272
|
+
### Per-Request Summary
|
|
273
|
+
|
|
274
|
+
When `request_summary` is enabled (the default), the gem accumulates all events during a request -- SQL queries, view renders, cache operations, HTTP calls -- into a single rich log entry emitted at request end. This avoids flooding the queue with hundreds of individual events.
|
|
275
|
+
|
|
276
|
+
Example payload:
|
|
277
|
+
|
|
278
|
+
```json
|
|
279
|
+
{
|
|
280
|
+
"level": "INFO",
|
|
281
|
+
"message": "GET /dashboard 200 2847ms",
|
|
282
|
+
"metadata": {
|
|
283
|
+
"request_id": "req-abc123",
|
|
284
|
+
"controller": "DashboardController",
|
|
285
|
+
"action": "index",
|
|
286
|
+
"method": "GET",
|
|
287
|
+
"path": "/dashboard",
|
|
288
|
+
"status": 200,
|
|
289
|
+
"duration_ms": 2847.3,
|
|
290
|
+
|
|
291
|
+
"request_user_agent": "Mozilla/5.0...",
|
|
292
|
+
"request_accept": "text/html",
|
|
293
|
+
|
|
294
|
+
"sql_query_count": 34,
|
|
295
|
+
"sql_total_ms": 423.1,
|
|
296
|
+
"sql_slowest_ms": 312.0,
|
|
297
|
+
"sql_slowest_name": "Order Count",
|
|
298
|
+
"n_plus_one_warning": true,
|
|
299
|
+
|
|
300
|
+
"view_render_count": 48,
|
|
301
|
+
"view_total_ms": 890.2,
|
|
302
|
+
"view_slowest_ms": 245.0,
|
|
303
|
+
"view_slowest_template": "dashboard/_activity_feed.html.erb",
|
|
304
|
+
|
|
305
|
+
"cache_reads": 8,
|
|
306
|
+
"cache_hits": 5,
|
|
307
|
+
"cache_writes": 3,
|
|
308
|
+
"cache_hit_ratio": 0.63,
|
|
309
|
+
|
|
310
|
+
"time_breakdown": {
|
|
311
|
+
"sql_pct": 14.9,
|
|
312
|
+
"view_pct": 31.3,
|
|
313
|
+
"http_pct": 0.0,
|
|
314
|
+
"other_pct": 53.8
|
|
315
|
+
},
|
|
316
|
+
|
|
317
|
+
"timeline": [
|
|
318
|
+
{ "t": "sql", "n": "User Load", "ms": 1.2, "at": 0.0 },
|
|
319
|
+
{ "t": "cache", "a": "read", "hit": true, "ms": 0.1, "at": 6.0 },
|
|
320
|
+
{ "t": "sql", "n": "Order Count", "ms": 312.0, "at": 10.0 },
|
|
321
|
+
{ "t": "view", "n": "dashboard/index.html.erb", "ms": 890.2, "at": 350.0 }
|
|
322
|
+
]
|
|
323
|
+
}
|
|
324
|
+
}
|
|
325
|
+
```
|
|
326
|
+
|
|
327
|
+
The timeline shows a waterfall of events in chronological order. Timeline keys are kept short to minimize payload size: `t` = type, `n` = name, `ms` = duration, `at` = offset from request start, `s` = status, `a` = action.
|
|
328
|
+
|
|
223
329
|
### Controller Subscriber
|
|
224
330
|
|
|
225
331
|
Subscribes to `process_action.action_controller` and captures:
|
|
@@ -238,12 +344,26 @@ Subscribes to `process_action.action_controller` and captures:
|
|
|
238
344
|
| `exception_class` | Exception class (if raised) |
|
|
239
345
|
| `exception_message` | Exception message (if raised) |
|
|
240
346
|
| `backtrace` | Cleaned backtrace (if exception raised) |
|
|
347
|
+
| `error_fingerprint` | 12-char fingerprint for error grouping |
|
|
348
|
+
| `request_content_type` | Request Content-Type header |
|
|
349
|
+
| `request_accept` | Request Accept header |
|
|
350
|
+
| `request_user_agent` | Request User-Agent (truncated to 200 chars) |
|
|
351
|
+
| `request_referer` | Request Referer header |
|
|
352
|
+
| `sql_query_count` | Total SQL queries in this request |
|
|
353
|
+
| `sql_total_ms` | Total SQL time in this request |
|
|
354
|
+
| `n_plus_one_warning` | `true` when query count exceeds 20 |
|
|
355
|
+
|
|
356
|
+
When request summary is enabled, the log also includes view render stats, cache stats, time breakdown, and timeline (see above).
|
|
241
357
|
|
|
242
358
|
Log levels are set automatically:
|
|
243
359
|
- **ERROR** -- exceptions or 5xx status
|
|
244
360
|
- **WARN** -- 4xx status
|
|
245
361
|
- **INFO** -- everything else
|
|
246
362
|
|
|
363
|
+
### N+1 Query Detection
|
|
364
|
+
|
|
365
|
+
Every request tracks the number of SQL queries via a Fiber-local counter. When a request exceeds 20 queries, the log entry includes `n_plus_one_warning: true`. This makes it easy to query OpenTrace for requests with potential N+1 issues.
|
|
366
|
+
|
|
247
367
|
### SQL Query Subscriber
|
|
248
368
|
|
|
249
369
|
Subscribes to `sql.active_record` and logs every query with:
|
|
@@ -280,12 +400,54 @@ Subscribes to `perform.active_job` and logs every job execution with:
|
|
|
280
400
|
| `executions` | Attempt number |
|
|
281
401
|
| `duration_ms` | Execution duration |
|
|
282
402
|
| `job_arguments` | Serialized arguments (truncated to 512 bytes) |
|
|
403
|
+
| `queue_latency_ms` | Time spent waiting in queue before execution |
|
|
404
|
+
| `enqueued_at` | When the job was enqueued |
|
|
283
405
|
| `exception_class` | Exception class (if failed) |
|
|
284
406
|
| `exception_message` | Exception message (if failed) |
|
|
285
407
|
| `backtrace` | Cleaned backtrace (if failed) |
|
|
408
|
+
| `error_fingerprint` | Fingerprint for error grouping (if failed) |
|
|
286
409
|
|
|
287
410
|
Failed jobs are logged as `ERROR`, successful jobs as `INFO`.
|
|
288
411
|
|
|
412
|
+
### Deprecation Warning Subscriber
|
|
413
|
+
|
|
414
|
+
Subscribes to `deprecation.rails` and logs all Rails deprecation warnings as `WARN`:
|
|
415
|
+
|
|
416
|
+
| Field | Description |
|
|
417
|
+
|---|---|
|
|
418
|
+
| `deprecation_message` | The deprecation message (truncated to 500 chars) |
|
|
419
|
+
| `deprecation_callsite` | File and line where the deprecated API was called |
|
|
420
|
+
| `request_id` | Current request ID (if in web context) |
|
|
421
|
+
|
|
422
|
+
### View Render Tracking
|
|
423
|
+
|
|
424
|
+
When request summary is enabled, subscribes to `render_template.action_view` and `render_partial.action_view`. View render events are accumulated in the RequestCollector and included in the per-request summary -- **no individual log entries are emitted** for views.
|
|
425
|
+
|
|
426
|
+
The summary includes:
|
|
427
|
+
- `view_render_count` -- total number of templates/partials rendered
|
|
428
|
+
- `view_total_ms` -- total rendering time
|
|
429
|
+
- `view_slowest_ms` / `view_slowest_template` -- the bottleneck template
|
|
430
|
+
|
|
431
|
+
Template paths are automatically shortened (e.g., `/Users/deploy/app/views/orders/show.html.erb` becomes `orders/show.html.erb`).
|
|
432
|
+
|
|
433
|
+
### Cache Operation Tracking
|
|
434
|
+
|
|
435
|
+
When request summary is enabled, subscribes to `cache_read.active_support`, `cache_write.active_support`, and `cache_delete.active_support`. Like views, cache events are accumulated -- no individual logs.
|
|
436
|
+
|
|
437
|
+
The summary includes:
|
|
438
|
+
- `cache_reads` / `cache_hits` / `cache_writes`
|
|
439
|
+
- `cache_hit_ratio` -- hit rate (0.0 to 1.0)
|
|
440
|
+
|
|
441
|
+
### Error Fingerprinting
|
|
442
|
+
|
|
443
|
+
Every error (in controller requests, job failures, and `OpenTrace.error` calls) includes an `error_fingerprint` -- a 12-character hash derived from the exception class and the first application frame in the backtrace. The fingerprint is:
|
|
444
|
+
|
|
445
|
+
- **Stable across deploys** -- line number changes don't affect it
|
|
446
|
+
- **Same error, same fingerprint** -- different error messages at the same location produce the same fingerprint
|
|
447
|
+
- **Different error, different fingerprint** -- different exception classes or different code locations produce different fingerprints
|
|
448
|
+
|
|
449
|
+
Use it to group and count errors in OpenTrace.
|
|
450
|
+
|
|
289
451
|
### TaggedLogging
|
|
290
452
|
|
|
291
453
|
If your wrapped logger uses `ActiveSupport::TaggedLogging`, tags are preserved and injected into the metadata:
|
|
@@ -297,6 +459,90 @@ Rails.logger.tagged("RequestID-123", "UserID-42") do
|
|
|
297
459
|
end
|
|
298
460
|
```
|
|
299
461
|
|
|
462
|
+
## Background Monitors
|
|
463
|
+
|
|
464
|
+
### DB Connection Pool Monitoring
|
|
465
|
+
|
|
466
|
+
Opt-in background thread that periodically reports ActiveRecord connection pool stats:
|
|
467
|
+
|
|
468
|
+
```ruby
|
|
469
|
+
OpenTrace.configure do |c|
|
|
470
|
+
# ...
|
|
471
|
+
c.pool_monitoring = true
|
|
472
|
+
c.pool_monitoring_interval = 30 # seconds (default: 30)
|
|
473
|
+
end
|
|
474
|
+
```
|
|
475
|
+
|
|
476
|
+
Reports `pool_size`, `connections_busy`, `connections_idle`, `threads_waiting`, and `checkout_timeout`. Logs at `WARN` when threads are waiting for a connection, `DEBUG` otherwise.
|
|
477
|
+
|
|
478
|
+
### Job Queue Depth Monitoring
|
|
479
|
+
|
|
480
|
+
Opt-in background thread that reports job queue sizes. Supports Sidekiq, GoodJob, and SolidQueue (auto-detected):
|
|
481
|
+
|
|
482
|
+
```ruby
|
|
483
|
+
OpenTrace.configure do |c|
|
|
484
|
+
# ...
|
|
485
|
+
c.queue_monitoring = true
|
|
486
|
+
c.queue_monitoring_interval = 60 # seconds (default: 60)
|
|
487
|
+
end
|
|
488
|
+
```
|
|
489
|
+
|
|
490
|
+
Reports per-queue sizes and total enqueued count. Logs at `WARN` when total exceeds 1,000.
|
|
491
|
+
|
|
492
|
+
## Advanced Opt-In Features
|
|
493
|
+
|
|
494
|
+
These features have measurable overhead or implementation risks. **Disabled by default.** Enable them after testing in staging.
|
|
495
|
+
|
|
496
|
+
### Memory Delta Tracking
|
|
497
|
+
|
|
498
|
+
Snapshots process memory (RSS) before and after each request:
|
|
499
|
+
|
|
500
|
+
```ruby
|
|
501
|
+
OpenTrace.configure do |c|
|
|
502
|
+
# ...
|
|
503
|
+
c.memory_tracking = true
|
|
504
|
+
end
|
|
505
|
+
```
|
|
506
|
+
|
|
507
|
+
Adds to the request summary:
|
|
508
|
+
- `memory_before_mb` -- RSS before request
|
|
509
|
+
- `memory_after_mb` -- RSS after request
|
|
510
|
+
- `memory_delta_mb` -- difference (positive = memory grew)
|
|
511
|
+
|
|
512
|
+
Uses `/proc/self/statm` on Linux (~10us) or `GC.stat` approximation on macOS (~5us). The delta is process-level, so concurrent requests will affect accuracy. Most accurate on single-threaded servers (Unicorn).
|
|
513
|
+
|
|
514
|
+
### External HTTP Tracking
|
|
515
|
+
|
|
516
|
+
Instruments outbound `Net::HTTP` calls to capture third-party API performance:
|
|
517
|
+
|
|
518
|
+
```ruby
|
|
519
|
+
OpenTrace.configure do |c|
|
|
520
|
+
# ...
|
|
521
|
+
c.http_tracking = true
|
|
522
|
+
end
|
|
523
|
+
```
|
|
524
|
+
|
|
525
|
+
Adds to the request summary:
|
|
526
|
+
- `http_external_count` -- number of outbound HTTP calls
|
|
527
|
+
- `http_external_total_ms` -- total time in external calls
|
|
528
|
+
- `http_slowest_ms` / `http_slowest_host` -- the bottleneck
|
|
529
|
+
|
|
530
|
+
Each HTTP call appears in the timeline:
|
|
531
|
+
|
|
532
|
+
```json
|
|
533
|
+
{ "t": "http", "n": "POST api.stripe.com", "ms": 184.0, "s": 200, "at": 55.0 }
|
|
534
|
+
```
|
|
535
|
+
|
|
536
|
+
Failed calls include an error type:
|
|
537
|
+
|
|
538
|
+
```json
|
|
539
|
+
{ "t": "http", "n": "POST api.stripe.com", "ms": 5200.0, "s": 0, "err": "Net::ReadTimeout", "at": 55.0 }
|
|
540
|
+
```
|
|
541
|
+
|
|
542
|
+
A recursion guard prevents OpenTrace's own HTTP calls to the server from being tracked. The `time_breakdown` in the request summary includes `http_pct` alongside `sql_pct` and `view_pct`.
|
|
543
|
+
|
|
544
|
+
**Note**: This works by prepending a module to `Net::HTTP`. Libraries that use `Net::HTTP` internally (Faraday, HTTParty, RestClient) are automatically captured.
|
|
545
|
+
|
|
300
546
|
## Runtime Controls
|
|
301
547
|
|
|
302
548
|
```ruby
|
|
@@ -343,12 +589,214 @@ Your App --log()--> [In-Memory Queue] --background thread--> POST /api/logs -->
|
|
|
343
589
|
- `enqueue` is non-blocking -- it uses `try_lock` so it never waits on a mutex
|
|
344
590
|
- The thread is started lazily on the first log call -- no threads are created at boot
|
|
345
591
|
- If the queue exceeds 1,000 items, new logs are dropped (oldest are preserved)
|
|
346
|
-
- Payloads exceeding
|
|
592
|
+
- Payloads exceeding 256 KB (configurable via `max_payload_bytes`) are intelligently truncated (backtrace, params, SQL removed first)
|
|
347
593
|
- If still too large after truncation, the payload is split and retried in smaller batches
|
|
348
|
-
-
|
|
594
|
+
- Failed requests are retried with exponential backoff (up to 3 attempts by default)
|
|
595
|
+
- A circuit breaker stops sending when the server is unreachable, resuming after a cooldown
|
|
596
|
+
- Rate-limited responses (429) trigger a backoff delay, respecting the server's `Retry-After` header
|
|
597
|
+
- Authentication failures (401) suspend sending and print a one-time warning to STDERR
|
|
349
598
|
- The HTTP timeout defaults to 1 second
|
|
350
599
|
- Pending logs are flushed on process exit via an `at_exit` hook
|
|
351
600
|
|
|
601
|
+
### Retry & Circuit Breaker
|
|
602
|
+
|
|
603
|
+
Failed HTTP requests are retried with exponential backoff and jitter. Only server errors (5xx) and network failures are retried -- client errors (4xx) are not.
|
|
604
|
+
|
|
605
|
+
```ruby
|
|
606
|
+
OpenTrace.configure do |c|
|
|
607
|
+
# ...
|
|
608
|
+
c.max_retries = 2 # up to 3 total attempts (default: 2)
|
|
609
|
+
c.retry_base_delay = 0.1 # 100ms initial backoff (default: 0.1)
|
|
610
|
+
c.retry_max_delay = 2.0 # cap backoff at 2 seconds (default: 2.0)
|
|
611
|
+
end
|
|
612
|
+
```
|
|
613
|
+
|
|
614
|
+
A circuit breaker prevents wasting resources when the server is down. After a threshold of consecutive failures, the circuit **opens** and all sends are skipped. After a cooldown, a single **probe** request is sent. If it succeeds, the circuit closes and normal operation resumes.
|
|
615
|
+
|
|
616
|
+
```ruby
|
|
617
|
+
OpenTrace.configure do |c|
|
|
618
|
+
# ...
|
|
619
|
+
c.circuit_breaker_threshold = 5 # failures before opening (default: 5)
|
|
620
|
+
c.circuit_breaker_timeout = 30 # seconds before probe (default: 30)
|
|
621
|
+
end
|
|
622
|
+
```
|
|
623
|
+
|
|
624
|
+
### Backpressure Handling
|
|
625
|
+
|
|
626
|
+
The client responds intelligently to HTTP status codes:
|
|
627
|
+
|
|
628
|
+
| Status | Behavior |
|
|
629
|
+
|---|---|
|
|
630
|
+
| **2xx** | Success -- circuit breaker resets |
|
|
631
|
+
| **429** | Rate limited -- pauses for `Retry-After` seconds (or `rate_limit_backoff`), re-enqueues the batch |
|
|
632
|
+
| **401** | Auth failed -- suspends sending, prints one-time STDERR warning. Resumes after `OpenTrace.configure` |
|
|
633
|
+
| **5xx** | Server error -- retried with backoff, counts toward circuit breaker |
|
|
634
|
+
| **Other 4xx** | Client error -- batch dropped silently |
|
|
635
|
+
|
|
636
|
+
```ruby
|
|
637
|
+
OpenTrace.configure do |c|
|
|
638
|
+
# ...
|
|
639
|
+
c.rate_limit_backoff = 5.0 # fallback when Retry-After header is missing (default: 5.0)
|
|
640
|
+
end
|
|
641
|
+
```
|
|
642
|
+
|
|
643
|
+
### Delivery Observability
|
|
644
|
+
|
|
645
|
+
The client exposes internal delivery statistics so you can monitor the health of the log pipeline:
|
|
646
|
+
|
|
647
|
+
```ruby
|
|
648
|
+
OpenTrace.stats
|
|
649
|
+
# => {
|
|
650
|
+
# enqueued: 15234,
|
|
651
|
+
# delivered: 15100,
|
|
652
|
+
# dropped_queue_full: 34,
|
|
653
|
+
# dropped_circuit_open: 100,
|
|
654
|
+
# dropped_auth_suspended: 0,
|
|
655
|
+
# dropped_error: 0,
|
|
656
|
+
# retries: 12,
|
|
657
|
+
# rate_limited: 2,
|
|
658
|
+
# auth_failures: 0,
|
|
659
|
+
# payload_splits: 1,
|
|
660
|
+
# batches_sent: 302,
|
|
661
|
+
# bytes_sent: 4812300,
|
|
662
|
+
# queue_size: 23,
|
|
663
|
+
# circuit_state: :closed,
|
|
664
|
+
# auth_suspended: false,
|
|
665
|
+
# uptime_seconds: 3600
|
|
666
|
+
# }
|
|
667
|
+
|
|
668
|
+
OpenTrace.healthy? # true when circuit is closed and auth is not suspended
|
|
669
|
+
OpenTrace.reset_stats! # reset counters (useful after reading/reporting)
|
|
670
|
+
```
|
|
671
|
+
|
|
672
|
+
#### Drop Callback
|
|
673
|
+
|
|
674
|
+
Register a callback to be notified when logs are dropped. The callback receives the count of dropped items and the reason:
|
|
675
|
+
|
|
676
|
+
```ruby
|
|
677
|
+
OpenTrace.configure do |c|
|
|
678
|
+
# ...
|
|
679
|
+
c.on_drop = ->(count, reason) {
|
|
680
|
+
StatsD.increment("opentrace.dropped", count, tags: ["reason:#{reason}"])
|
|
681
|
+
}
|
|
682
|
+
end
|
|
683
|
+
```
|
|
684
|
+
|
|
685
|
+
Reasons: `:queue_full`, `:circuit_open`, `:auth_suspended`, `:error`
|
|
686
|
+
|
|
687
|
+
The callback is called synchronously but **exceptions are always swallowed** -- a broken callback will never affect the client.
|
|
688
|
+
|
|
689
|
+
### Gzip Compression
|
|
690
|
+
|
|
691
|
+
Outgoing batches are automatically gzip-compressed when they exceed the compression threshold (default: 1KB). This typically achieves 70-85% bandwidth reduction for log payloads with repetitive keys and values.
|
|
692
|
+
|
|
693
|
+
```ruby
|
|
694
|
+
OpenTrace.configure do |c|
|
|
695
|
+
# ...
|
|
696
|
+
c.compression = true # enable gzip compression (default: true)
|
|
697
|
+
c.compression_threshold = 1024 # only compress payloads > 1KB (default: 1024)
|
|
698
|
+
c.max_payload_bytes = 262_144 # max batch size before splitting (default: 256KB)
|
|
699
|
+
end
|
|
700
|
+
```
|
|
701
|
+
|
|
702
|
+
Compression uses `Zlib::BEST_SPEED` (level 1) for minimal CPU overhead (~0.14ms per batch). The server must support `Content-Encoding: gzip` on request bodies. OpenTrace server v0.6+ includes transparent decompression middleware.
|
|
703
|
+
|
|
704
|
+
### Version Negotiation
|
|
705
|
+
|
|
706
|
+
On the first dispatch cycle, the client makes a lightweight `GET /api/version` call to discover the server's API version and capabilities. This runs once per process (or after fork) and never blocks `enqueue`.
|
|
707
|
+
|
|
708
|
+
```ruby
|
|
709
|
+
# Check server capabilities programmatically
|
|
710
|
+
client = OpenTrace.send(:client)
|
|
711
|
+
client.supports?(:request_summaries) # true if server advertises it
|
|
712
|
+
client.supports?(:gzip_request) # true if server supports gzip
|
|
713
|
+
```
|
|
714
|
+
|
|
715
|
+
If the server requires a newer client API version, a warning is printed to STDERR:
|
|
716
|
+
|
|
717
|
+
```
|
|
718
|
+
[OpenTrace] Server requires API version >= 2, but this client supports version 1.
|
|
719
|
+
Please upgrade the opentrace gem. Log forwarding may not work correctly.
|
|
720
|
+
```
|
|
721
|
+
|
|
722
|
+
Every request includes an `X-API-Version: 1` header so the server can reject incompatible clients with a clear error. Old servers without `/api/version` are handled gracefully — the check silently skips and all features remain enabled.
|
|
723
|
+
|
|
724
|
+
### Distributed Tracing
|
|
725
|
+
|
|
726
|
+
When `trace_propagation` is enabled (the default), the middleware extracts or generates a W3C-compatible trace context for each request:
|
|
727
|
+
|
|
728
|
+
- **Incoming**: Reads `traceparent` header (W3C standard), falls back to `X-Trace-ID`, then `X-Request-ID`
|
|
729
|
+
- **Outgoing**: When `http_tracking` is enabled, injects `traceparent`, `X-Trace-ID`, and `X-Request-ID` into outbound HTTP requests
|
|
730
|
+
|
|
731
|
+
This enables cross-service correlation — all logs from a distributed request chain share the same `trace_id`.
|
|
732
|
+
|
|
733
|
+
```ruby
|
|
734
|
+
OpenTrace.configure do |c|
|
|
735
|
+
# ...
|
|
736
|
+
c.trace_propagation = true # extract/propagate trace context (default: true)
|
|
737
|
+
c.http_tracking = true # also inject into outgoing HTTP calls (opt-in)
|
|
738
|
+
end
|
|
739
|
+
```
|
|
740
|
+
|
|
741
|
+
Each log entry includes `trace_id`, `span_id`, and `parent_span_id` (when available) as top-level fields. The server indexes these for fast trace lookups.
|
|
742
|
+
|
|
743
|
+
### Request Summary Architecture
|
|
744
|
+
|
|
745
|
+
When `request_summary` is enabled, events within a request are **accumulated** in a Fiber-local `RequestCollector` instead of being pushed to the queue individually:
|
|
746
|
+
|
|
747
|
+
```
|
|
748
|
+
Request Start
|
|
749
|
+
Middleware creates RequestCollector in Fiber[]
|
|
750
|
+
SQL events ──► collector.record_sql() (no queue push)
|
|
751
|
+
View events ──► collector.record_view() (no queue push)
|
|
752
|
+
Cache events ──► collector.record_cache() (no queue push)
|
|
753
|
+
HTTP events ──► collector.record_http() (no queue push)
|
|
754
|
+
Request End
|
|
755
|
+
Controller subscriber builds request_summary from collector
|
|
756
|
+
One queue push: metadata (user/request context) + request_summary (perf data)
|
|
757
|
+
Middleware cleans up RequestCollector
|
|
758
|
+
```
|
|
759
|
+
|
|
760
|
+
This means a request with 30 SQL queries, 50 view renders, and 10 cache operations produces **one log entry** instead of 91.
|
|
761
|
+
|
|
762
|
+
### Structured Request Metrics
|
|
763
|
+
|
|
764
|
+
When a `RequestCollector` is active, performance data is sent as a **separate `request_summary` field** instead of being merged into metadata. This allows the server to store it in a dedicated `request_summaries` table with indexed columns for fast analytical queries.
|
|
765
|
+
|
|
766
|
+
```ruby
|
|
767
|
+
# Sent automatically by the Rails subscriber — no code changes needed.
|
|
768
|
+
# The payload looks like:
|
|
769
|
+
{
|
|
770
|
+
"metadata": { "request_id": "req-abc", "user_id": 42 },
|
|
771
|
+
"request_summary": {
|
|
772
|
+
"controller": "InvoicesController",
|
|
773
|
+
"action": "index",
|
|
774
|
+
"method": "GET",
|
|
775
|
+
"path": "/invoices",
|
|
776
|
+
"status": 200,
|
|
777
|
+
"duration_ms": 45.2,
|
|
778
|
+
"sql_count": 3,
|
|
779
|
+
"sql_total_ms": 12.1,
|
|
780
|
+
"n_plus_one": false,
|
|
781
|
+
"view_count": 2,
|
|
782
|
+
"view_total_ms": 28.3,
|
|
783
|
+
"cache_reads": 1,
|
|
784
|
+
"cache_hits": 1,
|
|
785
|
+
"cache_hit_ratio": 1.0,
|
|
786
|
+
"timeline": [{"t": "sql", "n": "Invoice Load", "ms": 8.2, "at": 2.0}]
|
|
787
|
+
}
|
|
788
|
+
}
|
|
789
|
+
```
|
|
790
|
+
|
|
791
|
+
You can also pass `request_summary:` manually:
|
|
792
|
+
|
|
793
|
+
```ruby
|
|
794
|
+
OpenTrace.log("INFO", "Custom request", { user_id: 42 },
|
|
795
|
+
request_summary: { controller: "Custom", action: "run", sql_count: 5 })
|
|
796
|
+
```
|
|
797
|
+
|
|
798
|
+
**Backward compatibility**: Old servers ignore the `request_summary` field. When no collector is active (background jobs, non-Rails), data falls back to metadata as before.
|
|
799
|
+
|
|
352
800
|
## Log Payload Format
|
|
353
801
|
|
|
354
802
|
Each log is sent as a JSON object to `POST /api/logs`:
|
|
@@ -367,6 +815,19 @@ Each log is sent as a JSON object to `POST /api/logs`:
|
|
|
367
815
|
"hostname": "web-01",
|
|
368
816
|
"pid": 12345,
|
|
369
817
|
"git_sha": "a1b2c3d"
|
|
818
|
+
},
|
|
819
|
+
"request_summary": {
|
|
820
|
+
"controller": "InvoicesController",
|
|
821
|
+
"action": "index",
|
|
822
|
+
"method": "GET",
|
|
823
|
+
"path": "/invoices",
|
|
824
|
+
"status": 200,
|
|
825
|
+
"duration_ms": 45.2,
|
|
826
|
+
"sql_count": 3,
|
|
827
|
+
"sql_total_ms": 12.1,
|
|
828
|
+
"view_count": 2,
|
|
829
|
+
"view_total_ms": 28.3,
|
|
830
|
+
"timeline": [...]
|
|
370
831
|
}
|
|
371
832
|
}
|
|
372
833
|
```
|
|
@@ -379,7 +840,11 @@ Each log is sent as a JSON object to `POST /api/logs`:
|
|
|
379
840
|
| `service` | string | no |
|
|
380
841
|
| `environment` | string | no |
|
|
381
842
|
| `trace_id` | string | no |
|
|
843
|
+
| `span_id` | string | no |
|
|
844
|
+
| `parent_span_id` | string | no |
|
|
845
|
+
| `event_type` | string | no |
|
|
382
846
|
| `metadata` | object | no |
|
|
847
|
+
| `request_summary` | object | no |
|
|
383
848
|
|
|
384
849
|
The server accepts a single JSON object or an array of objects.
|
|
385
850
|
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module OpenTrace
  # Three-state circuit breaker (closed / open / half-open).
  #
  # * CLOSED    -- requests are allowed; failures are counted.
  # * OPEN      -- requests are rejected until +recovery_timeout+ seconds have
  #                elapsed since the most recent recorded failure.
  # * HALF_OPEN -- exactly one probe request has been let through; every other
  #                request is rejected until that probe's outcome is recorded.
  #
  # All state transitions happen under an internal Mutex.
  #
  # Elapsed time is measured with the monotonic clock rather than Time.now, so
  # wall-clock adjustments (NTP steps, manual changes) can neither end the
  # cooldown early nor extend it.
  #
  # NOTE(review): the breaker leaves HALF_OPEN only through #record_success,
  # #record_failure or #reset!. If the caller never reports the probe's
  # outcome, #allow_request? keeps returning false -- confirm every send path
  # records a result.
  class CircuitBreaker
    CLOSED = :closed
    OPEN = :open
    HALF_OPEN = :half_open

    # @return [Symbol] current state: CLOSED, OPEN or HALF_OPEN
    attr_reader :state

    # @param failure_threshold [Integer] failure count at which the circuit opens
    # @param recovery_timeout [Numeric] seconds to wait after the last failure
    #   before a probe request is allowed
    def initialize(failure_threshold:, recovery_timeout:)
      @failure_threshold = failure_threshold
      @recovery_timeout = recovery_timeout
      @state = CLOSED
      @failure_count = 0
      # Monotonic timestamp (seconds, Float) of the most recent failure.
      @last_failure_at = nil
      @mutex = Mutex.new
    end

    # Decides whether the caller may attempt a request right now.
    #
    # When the circuit is OPEN and the cooldown has elapsed, this call moves
    # the breaker to HALF_OPEN and returns true for that single probe.
    #
    # @return [Boolean] true when the request may proceed
    def allow_request?
      @mutex.synchronize do
        case @state
        when CLOSED
          true
        when OPEN
          if monotonic_now - @last_failure_at >= @recovery_timeout
            @state = HALF_OPEN
            true
          else
            false
          end
        when HALF_OPEN
          # A probe is already in flight; hold everything else back.
          false
        end
      end
    end

    # Records a successful request: clears the failure count and closes the
    # circuit, whatever state it was in.
    def record_success
      @mutex.synchronize do
        @failure_count = 0
        @state = CLOSED
      end
    end

    # Records a failed request and opens the circuit once the failure count
    # has reached the threshold. The count is not reset when the circuit
    # opens, so a failed probe in HALF_OPEN re-opens it immediately.
    def record_failure
      @mutex.synchronize do
        @failure_count += 1
        @last_failure_at = monotonic_now
        @state = OPEN if @failure_count >= @failure_threshold
      end
    end

    # Returns the breaker to its initial closed state and forgets all
    # recorded failures.
    def reset!
      @mutex.synchronize do
        @state = CLOSED
        @failure_count = 0
        @last_failure_at = nil
      end
    end

    private

    # Seconds on the monotonic clock; only differences between two readings
    # are meaningful.
    def monotonic_now
      Process.clock_gettime(Process::CLOCK_MONOTONIC)
    end
  end
end
|