opentrace 0.6.0 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +228 -6
- data/lib/opentrace/circuit_breaker.rb +61 -0
- data/lib/opentrace/client.rb +281 -18
- data/lib/opentrace/config.rb +44 -2
- data/lib/opentrace/http_tracker.rb +28 -0
- data/lib/opentrace/log_forwarder.rb +6 -1
- data/lib/opentrace/middleware.rb +47 -1
- data/lib/opentrace/rails.rb +70 -18
- data/lib/opentrace/stats.rb +47 -0
- data/lib/opentrace/trace_context.rb +57 -0
- data/lib/opentrace/version.rb +1 -1
- data/lib/opentrace.rb +113 -30
- metadata +4 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: fbe3de5afdb5f92afcef49d368b8f8b3714ca17fe453b168066f53a5b0e5e1ba
|
|
4
|
+
data.tar.gz: 8f749e943951c939e7daa8f93a454bb0c89646bc295c0550c680064ebf38db16
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: e3ecb7c4b649951f64e1090bb7ea06da69cb603dcd44534c9a5e71d76eb697239fddf2267cd2a5e0cff2aa1bf86dc78be6733be754df8e5c6a435967d90809a8
|
|
7
|
+
data.tar.gz: 3625d9de1cd2dda011f69283b34a17964450cda3c44c0f19431fb450e2a24328f1c260a12ea9929331a54866ac90afd07addd65e16cb2eee9e42a939807f3a69
|
data/README.md
CHANGED
|
@@ -25,7 +25,8 @@ A thin, safe Ruby client that forwards structured application logs to an [OpenTr
|
|
|
25
25
|
- **Rails 7.1+ BroadcastLogger** -- native support via `broadcast_to`
|
|
26
26
|
- **TaggedLogging** -- preserves `ActiveSupport::TaggedLogging` tags in metadata
|
|
27
27
|
- **Context support** -- attach global metadata to every log via Hash or Proc
|
|
28
|
-
- **
|
|
28
|
+
- **Business events** -- `OpenTrace.event` sends typed events (e.g. `payment.completed`) that bypass level filtering
|
|
29
|
+
- **Level filtering** -- `min_level` threshold or `allowed_levels` list to control which severities are forwarded
|
|
29
30
|
- **Auto-enrichment** -- every log includes `hostname`, `pid`, and `git_sha` automatically
|
|
30
31
|
- **Exception helper** -- `OpenTrace.error` captures class, message, cleaned backtrace, and error fingerprint
|
|
31
32
|
- **Runtime controls** -- enable/disable logging at runtime without restarting
|
|
@@ -38,6 +39,8 @@ A thin, safe Ruby client that forwards structured application logs to an [OpenTr
|
|
|
38
39
|
- **Job queue depth** -- monitors Sidekiq, GoodJob, or SolidQueue queue sizes (opt-in)
|
|
39
40
|
- **Memory delta tracking** -- snapshots process RSS before/after each request (opt-in)
|
|
40
41
|
- **External HTTP tracking** -- captures outbound Net::HTTP calls with timing (opt-in)
|
|
42
|
+
- **Version negotiation** -- startup compatibility check with capability-based feature detection
|
|
43
|
+
- **Distributed tracing** -- W3C Trace Context (`traceparent`) propagation across services with span IDs
|
|
41
44
|
|
|
42
45
|
## Installation
|
|
43
46
|
|
|
@@ -87,6 +90,7 @@ OpenTrace.configure do |c|
|
|
|
87
90
|
c.timeout = 1.0 # HTTP timeout in seconds (default: 1.0)
|
|
88
91
|
c.enabled = true # default: true
|
|
89
92
|
c.min_level = :info # minimum level to forward (default: :debug)
|
|
93
|
+
c.allowed_levels = [:warn, :error] # explicit level list (overrides min_level, default: nil)
|
|
90
94
|
c.batch_size = 50 # logs per batch (default: 50)
|
|
91
95
|
c.flush_interval = 5.0 # seconds between flushes (default: 5.0)
|
|
92
96
|
|
|
@@ -128,15 +132,21 @@ If any required field (`endpoint`, `api_key`, `service`) is missing or empty, th
|
|
|
128
132
|
|
|
129
133
|
### Level Filtering
|
|
130
134
|
|
|
131
|
-
Control which log levels are forwarded with `min_level
|
|
135
|
+
Control which log levels are forwarded with `min_level` (threshold) or `allowed_levels` (explicit list):
|
|
132
136
|
|
|
133
137
|
```ruby
|
|
134
138
|
OpenTrace.configure do |c|
|
|
135
139
|
# ...
|
|
140
|
+
# Option A: Threshold — forward this level and above
|
|
136
141
|
c.min_level = :warn # only forward WARN, ERROR, and FATAL
|
|
142
|
+
|
|
143
|
+
# Option B: Explicit list — forward only these levels (overrides min_level)
|
|
144
|
+
c.allowed_levels = [:warn, :error] # only forward WARN and ERROR
|
|
137
145
|
end
|
|
138
146
|
```
|
|
139
147
|
|
|
148
|
+
When `allowed_levels` is set, it takes precedence over `min_level`. When `allowed_levels` is `nil` (the default), `min_level` is used.
|
|
149
|
+
|
|
140
150
|
Available levels: `:debug`, `:info`, `:warn`, `:error`, `:fatal`
|
|
141
151
|
|
|
142
152
|
## Usage
|
|
@@ -176,6 +186,18 @@ This captures:
|
|
|
176
186
|
- `backtrace` -- cleaned (Rails backtrace cleaner or gem-filtered), limited to 15 frames
|
|
177
187
|
- `error_fingerprint` -- 12-char hash for grouping identical errors (stable across line number changes)
|
|
178
188
|
|
|
189
|
+
### Business Events
|
|
190
|
+
|
|
191
|
+
Use `OpenTrace.event` to send typed business events. Events are always sent at `INFO` level and **bypass level filtering** — they are never suppressed by `min_level` or `allowed_levels`:
|
|
192
|
+
|
|
193
|
+
```ruby
|
|
194
|
+
OpenTrace.event("payment.completed", "User paid $49.99", { user_id: 42, amount: 49.99 })
|
|
195
|
+
OpenTrace.event("auth.login", "Google OAuth login", { provider: "google", user_id: 7 })
|
|
196
|
+
OpenTrace.event("order.shipped", "Order dispatched", { order_id: "ORD-123" })
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
Events include an `event_type` field in the payload, making them filterable on the server. They inherit context, `request_id`, and static context just like normal logs.
|
|
200
|
+
|
|
179
201
|
### Logger Wrapper
|
|
180
202
|
|
|
181
203
|
Wrap any Ruby `Logger` to forward all log output to OpenTrace while keeping the original logger working exactly as before:
|
|
@@ -567,12 +589,157 @@ Your App --log()--> [In-Memory Queue] --background thread--> POST /api/logs -->
|
|
|
567
589
|
- `enqueue` is non-blocking -- it uses `try_lock` so it never waits on a mutex
|
|
568
590
|
- The thread is started lazily on the first log call -- no threads are created at boot
|
|
569
591
|
- If the queue exceeds 1,000 items, new logs are dropped (oldest are preserved)
|
|
570
|
-
- Payloads exceeding
|
|
592
|
+
- Payloads exceeding 256 KB (configurable via `max_payload_bytes`) are intelligently truncated (backtrace, params, SQL removed first)
|
|
571
593
|
- If still too large after truncation, the payload is split and retried in smaller batches
|
|
572
|
-
-
|
|
594
|
+
- Failed requests are retried with exponential backoff (up to 3 attempts by default)
|
|
595
|
+
- A circuit breaker stops sending when the server is unreachable, resuming after a cooldown
|
|
596
|
+
- Rate-limited responses (429) trigger a backoff delay, respecting the server's `Retry-After` header
|
|
597
|
+
- Authentication failures (401) suspend sending and print a one-time warning to STDERR
|
|
573
598
|
- The HTTP timeout defaults to 1 second
|
|
574
599
|
- Pending logs are flushed on process exit via an `at_exit` hook
|
|
575
600
|
|
|
601
|
+
### Retry & Circuit Breaker
|
|
602
|
+
|
|
603
|
+
Failed HTTP requests are retried with exponential backoff and jitter. Only server errors (5xx) and network failures are retried -- client errors (4xx) are not.
|
|
604
|
+
|
|
605
|
+
```ruby
|
|
606
|
+
OpenTrace.configure do |c|
|
|
607
|
+
# ...
|
|
608
|
+
c.max_retries = 2 # up to 3 total attempts (default: 2)
|
|
609
|
+
c.retry_base_delay = 0.1 # 100ms initial backoff (default: 0.1)
|
|
610
|
+
c.retry_max_delay = 2.0 # cap backoff at 2 seconds (default: 2.0)
|
|
611
|
+
end
|
|
612
|
+
```
|
|
613
|
+
|
|
614
|
+
A circuit breaker prevents wasting resources when the server is down. After a threshold of consecutive failures, the circuit **opens** and all sends are skipped. After a cooldown, a single **probe** request is sent. If it succeeds, the circuit closes and normal operation resumes.
|
|
615
|
+
|
|
616
|
+
```ruby
|
|
617
|
+
OpenTrace.configure do |c|
|
|
618
|
+
# ...
|
|
619
|
+
c.circuit_breaker_threshold = 5 # failures before opening (default: 5)
|
|
620
|
+
c.circuit_breaker_timeout = 30 # seconds before probe (default: 30)
|
|
621
|
+
end
|
|
622
|
+
```
|
|
623
|
+
|
|
624
|
+
### Backpressure Handling
|
|
625
|
+
|
|
626
|
+
The client responds intelligently to HTTP status codes:
|
|
627
|
+
|
|
628
|
+
| Status | Behavior |
|
|
629
|
+
|---|---|
|
|
630
|
+
| **2xx** | Success -- circuit breaker resets |
|
|
631
|
+
| **429** | Rate limited -- pauses for `Retry-After` seconds (or `rate_limit_backoff`), re-enqueues the batch |
|
|
632
|
+
| **401** | Auth failed -- suspends sending, prints one-time STDERR warning. Resumes after `OpenTrace.configure` |
|
|
633
|
+
| **5xx** | Server error -- retried with backoff, counts toward circuit breaker |
|
|
634
|
+
| **Other 4xx** | Client error -- batch dropped silently |
|
|
635
|
+
|
|
636
|
+
```ruby
|
|
637
|
+
OpenTrace.configure do |c|
|
|
638
|
+
# ...
|
|
639
|
+
c.rate_limit_backoff = 5.0 # fallback when Retry-After header is missing (default: 5.0)
|
|
640
|
+
end
|
|
641
|
+
```
|
|
642
|
+
|
|
643
|
+
### Delivery Observability
|
|
644
|
+
|
|
645
|
+
The client exposes internal delivery statistics so you can monitor the health of the log pipeline:
|
|
646
|
+
|
|
647
|
+
```ruby
|
|
648
|
+
OpenTrace.stats
|
|
649
|
+
# => {
|
|
650
|
+
# enqueued: 15234,
|
|
651
|
+
# delivered: 15100,
|
|
652
|
+
# dropped_queue_full: 34,
|
|
653
|
+
# dropped_circuit_open: 100,
|
|
654
|
+
# dropped_auth_suspended: 0,
|
|
655
|
+
# dropped_error: 0,
|
|
656
|
+
# retries: 12,
|
|
657
|
+
# rate_limited: 2,
|
|
658
|
+
# auth_failures: 0,
|
|
659
|
+
# payload_splits: 1,
|
|
660
|
+
# batches_sent: 302,
|
|
661
|
+
# bytes_sent: 4812300,
|
|
662
|
+
# queue_size: 23,
|
|
663
|
+
# circuit_state: :closed,
|
|
664
|
+
# auth_suspended: false,
|
|
665
|
+
# uptime_seconds: 3600
|
|
666
|
+
# }
|
|
667
|
+
|
|
668
|
+
OpenTrace.healthy? # true when circuit is closed and auth is not suspended
|
|
669
|
+
OpenTrace.reset_stats! # reset counters (useful after reading/reporting)
|
|
670
|
+
```
|
|
671
|
+
|
|
672
|
+
#### Drop Callback
|
|
673
|
+
|
|
674
|
+
Register a callback to be notified when logs are dropped. The callback receives the count of dropped items and the reason:
|
|
675
|
+
|
|
676
|
+
```ruby
|
|
677
|
+
OpenTrace.configure do |c|
|
|
678
|
+
# ...
|
|
679
|
+
c.on_drop = ->(count, reason) {
|
|
680
|
+
StatsD.increment("opentrace.dropped", count, tags: ["reason:#{reason}"])
|
|
681
|
+
}
|
|
682
|
+
end
|
|
683
|
+
```
|
|
684
|
+
|
|
685
|
+
Reasons: `:queue_full`, `:circuit_open`, `:auth_suspended`, `:error`
|
|
686
|
+
|
|
687
|
+
The callback is invoked synchronously, but **any exception it raises is always swallowed** -- a broken callback will never affect the client.
|
|
688
|
+
|
|
689
|
+
### Gzip Compression
|
|
690
|
+
|
|
691
|
+
Outgoing batches are automatically gzip-compressed when they exceed the compression threshold (default: 1KB). This typically achieves 70-85% bandwidth reduction for log payloads with repetitive keys and values.
|
|
692
|
+
|
|
693
|
+
```ruby
|
|
694
|
+
OpenTrace.configure do |c|
|
|
695
|
+
# ...
|
|
696
|
+
c.compression = true # enable gzip compression (default: true)
|
|
697
|
+
c.compression_threshold = 1024 # only compress payloads > 1KB (default: 1024)
|
|
698
|
+
c.max_payload_bytes = 262_144 # max batch size before splitting (default: 256KB)
|
|
699
|
+
end
|
|
700
|
+
```
|
|
701
|
+
|
|
702
|
+
Compression uses `Zlib::BEST_SPEED` (level 1) for minimal CPU overhead (~0.14ms per batch). The server must support `Content-Encoding: gzip` on request bodies. OpenTrace server v0.6+ includes transparent decompression middleware.
|
|
703
|
+
|
|
704
|
+
### Version Negotiation
|
|
705
|
+
|
|
706
|
+
On the first dispatch cycle, the client makes a lightweight `GET /api/version` call to discover the server's API version and capabilities. This runs once per process (or after fork) and never blocks `enqueue`.
|
|
707
|
+
|
|
708
|
+
```ruby
|
|
709
|
+
# Check server capabilities programmatically
|
|
710
|
+
client = OpenTrace.send(:client)
|
|
711
|
+
client.supports?(:request_summaries) # true if server advertises it
|
|
712
|
+
client.supports?(:gzip_request) # true if server supports gzip
|
|
713
|
+
```
|
|
714
|
+
|
|
715
|
+
If the server requires a newer client API version, a warning is printed to STDERR:
|
|
716
|
+
|
|
717
|
+
```
|
|
718
|
+
[OpenTrace] Server requires API version >= 2, but this client supports version 1.
|
|
719
|
+
Please upgrade the opentrace gem. Log forwarding may not work correctly.
|
|
720
|
+
```
|
|
721
|
+
|
|
722
|
+
Every request includes an `X-API-Version: 1` header so the server can reject incompatible clients with a clear error. Old servers without `/api/version` are handled gracefully — the check is silently skipped and all features remain enabled.
|
|
723
|
+
|
|
724
|
+
### Distributed Tracing
|
|
725
|
+
|
|
726
|
+
When `trace_propagation` is enabled (the default), the middleware extracts or generates a W3C-compatible trace context for each request:
|
|
727
|
+
|
|
728
|
+
- **Incoming**: Reads `traceparent` header (W3C standard), falls back to `X-Trace-ID`, then `X-Request-ID`
|
|
729
|
+
- **Outgoing**: When `http_tracking` is enabled, injects `traceparent`, `X-Trace-ID`, and `X-Request-ID` into outbound HTTP requests
|
|
730
|
+
|
|
731
|
+
This enables cross-service correlation — all logs from a distributed request chain share the same `trace_id`.
|
|
732
|
+
|
|
733
|
+
```ruby
|
|
734
|
+
OpenTrace.configure do |c|
|
|
735
|
+
# ...
|
|
736
|
+
c.trace_propagation = true # extract/propagate trace context (default: true)
|
|
737
|
+
c.http_tracking = true # also inject into outgoing HTTP calls (opt-in)
|
|
738
|
+
end
|
|
739
|
+
```
|
|
740
|
+
|
|
741
|
+
Each log entry includes `trace_id`, `span_id`, and `parent_span_id` (when available) as top-level fields. The server indexes these for fast trace lookups.
|
|
742
|
+
|
|
576
743
|
### Request Summary Architecture
|
|
577
744
|
|
|
578
745
|
When `request_summary` is enabled, events within a request are **accumulated** in a Fiber-local `RequestCollector` instead of being pushed to the queue individually:
|
|
@@ -585,13 +752,51 @@ Request Start
|
|
|
585
752
|
Cache events ──► collector.record_cache() (no queue push)
|
|
586
753
|
HTTP events ──► collector.record_http() (no queue push)
|
|
587
754
|
Request End
|
|
588
|
-
Controller subscriber
|
|
589
|
-
One queue push
|
|
755
|
+
Controller subscriber builds request_summary from collector
|
|
756
|
+
One queue push: metadata (user/request context) + request_summary (perf data)
|
|
590
757
|
Middleware cleans up RequestCollector
|
|
591
758
|
```
|
|
592
759
|
|
|
593
760
|
This means a request with 30 SQL queries, 50 view renders, and 10 cache operations produces **one log entry** instead of 91 (the 90 individual events plus the request log itself).
|
|
594
761
|
|
|
762
|
+
### Structured Request Metrics
|
|
763
|
+
|
|
764
|
+
When a `RequestCollector` is active, performance data is sent as a **separate `request_summary` field** instead of being merged into metadata. This allows the server to store it in a dedicated `request_summaries` table with indexed columns for fast analytical queries.
|
|
765
|
+
|
|
766
|
+
```ruby
|
|
767
|
+
# Sent automatically by the Rails subscriber — no code changes needed.
|
|
768
|
+
# The payload looks like:
|
|
769
|
+
{
|
|
770
|
+
"metadata": { "request_id": "req-abc", "user_id": 42 },
|
|
771
|
+
"request_summary": {
|
|
772
|
+
"controller": "InvoicesController",
|
|
773
|
+
"action": "index",
|
|
774
|
+
"method": "GET",
|
|
775
|
+
"path": "/invoices",
|
|
776
|
+
"status": 200,
|
|
777
|
+
"duration_ms": 45.2,
|
|
778
|
+
"sql_count": 3,
|
|
779
|
+
"sql_total_ms": 12.1,
|
|
780
|
+
"n_plus_one": false,
|
|
781
|
+
"view_count": 2,
|
|
782
|
+
"view_total_ms": 28.3,
|
|
783
|
+
"cache_reads": 1,
|
|
784
|
+
"cache_hits": 1,
|
|
785
|
+
"cache_hit_ratio": 1.0,
|
|
786
|
+
"timeline": [{"t": "sql", "n": "Invoice Load", "ms": 8.2, "at": 2.0}]
|
|
787
|
+
}
|
|
788
|
+
}
|
|
789
|
+
```
|
|
790
|
+
|
|
791
|
+
You can also pass `request_summary:` manually:
|
|
792
|
+
|
|
793
|
+
```ruby
|
|
794
|
+
OpenTrace.log("INFO", "Custom request", { user_id: 42 },
|
|
795
|
+
request_summary: { controller: "Custom", action: "run", sql_count: 5 })
|
|
796
|
+
```
|
|
797
|
+
|
|
798
|
+
**Backward compatibility**: Old servers ignore the `request_summary` field. When no collector is active (background jobs, non-Rails), data falls back to metadata as before.
|
|
799
|
+
|
|
595
800
|
## Log Payload Format
|
|
596
801
|
|
|
597
802
|
Each log is sent as a JSON object to `POST /api/logs`:
|
|
@@ -610,6 +815,19 @@ Each log is sent as a JSON object to `POST /api/logs`:
|
|
|
610
815
|
"hostname": "web-01",
|
|
611
816
|
"pid": 12345,
|
|
612
817
|
"git_sha": "a1b2c3d"
|
|
818
|
+
},
|
|
819
|
+
"request_summary": {
|
|
820
|
+
"controller": "InvoicesController",
|
|
821
|
+
"action": "index",
|
|
822
|
+
"method": "GET",
|
|
823
|
+
"path": "/invoices",
|
|
824
|
+
"status": 200,
|
|
825
|
+
"duration_ms": 45.2,
|
|
826
|
+
"sql_count": 3,
|
|
827
|
+
"sql_total_ms": 12.1,
|
|
828
|
+
"view_count": 2,
|
|
829
|
+
"view_total_ms": 28.3,
|
|
830
|
+
"timeline": [...]
|
|
613
831
|
}
|
|
614
832
|
}
|
|
615
833
|
```
|
|
@@ -622,7 +840,11 @@ Each log is sent as a JSON object to `POST /api/logs`:
|
|
|
622
840
|
| `service` | string | no |
|
|
623
841
|
| `environment` | string | no |
|
|
624
842
|
| `trace_id` | string | no |
|
|
843
|
+
| `span_id` | string | no |
|
|
844
|
+
| `parent_span_id` | string | no |
|
|
845
|
+
| `event_type` | string | no |
|
|
625
846
|
| `metadata` | object | no |
|
|
847
|
+
| `request_summary` | object | no |
|
|
626
848
|
|
|
627
849
|
The server accepts a single JSON object or an array of objects.
|
|
628
850
|
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module OpenTrace
  # Three-state circuit breaker guarding an unreliable operation.
  #
  # States:
  # * CLOSED    -- requests are allowed; failures are counted.
  # * OPEN      -- requests are rejected until +recovery_timeout+ seconds have
  #                elapsed since the most recent recorded failure.
  # * HALF_OPEN -- exactly one probe request has been let through; further
  #                requests are rejected until the probe outcome is reported
  #                via #record_success or #record_failure (or #reset! is
  #                called).
  #
  # All state transitions happen while holding an internal Mutex.
  class CircuitBreaker
    CLOSED = :closed
    OPEN = :open
    HALF_OPEN = :half_open

    # @return [Symbol] the current state (CLOSED, OPEN or HALF_OPEN).
    #   Read without taking the internal lock.
    attr_reader :state

    # @param failure_threshold [Integer] number of recorded failures (since
    #   the last success/reset) at which the circuit opens
    # @param recovery_timeout [Numeric] seconds to wait after the most recent
    #   failure before a probe request is allowed
    def initialize(failure_threshold:, recovery_timeout:)
      @failure_threshold = failure_threshold
      @recovery_timeout = recovery_timeout
      @state = CLOSED
      @failure_count = 0
      @last_failure_at = nil # monotonic timestamp (seconds) of the last failure
      @mutex = Mutex.new
    end

    # Decides whether a request may be attempted right now.
    #
    # When the circuit is OPEN and the recovery timeout has elapsed, this call
    # moves the circuit to HALF_OPEN and returns true for that single probe.
    #
    # @return [Boolean] true if the caller may send a request
    def allow_request?
      @mutex.synchronize do
        case @state
        when CLOSED
          true
        when OPEN
          if monotonic_now - @last_failure_at >= @recovery_timeout
            @state = HALF_OPEN
            true
          else
            false
          end
        when HALF_OPEN
          # A probe is already in flight; hold everything else back.
          false
        end
      end
    end

    # Reports a successful request: clears the failure count and closes the
    # circuit.
    def record_success
      @mutex.synchronize do
        @failure_count = 0
        @state = CLOSED
      end
    end

    # Reports a failed request. Opens the circuit once the failure count
    # reaches the threshold. A failed probe in HALF_OPEN re-opens the circuit
    # because the count is still at or above the threshold at that point.
    def record_failure
      @mutex.synchronize do
        @failure_count += 1
        @last_failure_at = monotonic_now
        @state = OPEN if @failure_count >= @failure_threshold
      end
    end

    # Forces the breaker back to its initial CLOSED state.
    def reset!
      @mutex.synchronize do
        @state = CLOSED
        @failure_count = 0
        @last_failure_at = nil
      end
    end

    private

    # Monotonic clock reading in seconds. Used instead of Time.now so that
    # wall-clock adjustments (NTP steps, manual changes) can neither cut the
    # cooldown short nor stretch it.
    def monotonic_now
      Process.clock_gettime(Process::CLOCK_MONOTONIC)
    end
  end
end
|