tracestax 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +21 -0
- data/README.md +107 -0
- data/lib/tracestax/client.rb +440 -0
- data/lib/tracestax/configuration.rb +21 -0
- data/lib/tracestax/delayed_job.rb +66 -0
- data/lib/tracestax/good_job.rb +61 -0
- data/lib/tracestax/resque.rb +79 -0
- data/lib/tracestax/sidekiq/client.rb +92 -0
- data/lib/tracestax/sidekiq/configuration.rb +21 -0
- data/lib/tracestax/sidekiq/heartbeat_poller.rb +91 -0
- data/lib/tracestax/sidekiq/server_middleware.rb +88 -0
- data/lib/tracestax/sidekiq/version.rb +7 -0
- data/lib/tracestax/sidekiq.rb +46 -0
- data/lib/tracestax/solid_queue.rb +59 -0
- data/lib/tracestax/version.rb +5 -0
- data/lib/tracestax.rb +32 -0
- metadata +145 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: 01eb19bd866f919893443462b9205828dd0b31b609dec8944c26058bb88aaee4
|
|
4
|
+
data.tar.gz: 2ea91dc059c9f14d4ffc7d0a8f78c2647fea7a7d602be7b9a415a8cbdf9312be
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: 248f52531780835c53d520ec7b813fab861082e85f62f42f96804546d96309fdc293cf0722845b234300538c7b1341d80d328634969ab11175618209f0b092f2
|
|
7
|
+
data.tar.gz: 3587e3672805f9da3bf9d447c5d89ba854f5056ded4ec6d7d59e02dbd7788480b8b764da44b988332b21a8fa308b17badf643df7c1325aa4abe6ddf0fb58aac3
|
data/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 TraceStax
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
data/README.md
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
# tracestax
|
|
2
|
+
|
|
3
|
+
TraceStax SDK for Ruby background jobs. Automatically captures task lifecycle events, worker heartbeats, and queue depth snapshots.
|
|
4
|
+
|
|
5
|
+
Supports **Sidekiq** (6.x / 7.x), **Solid Queue** (Rails 8), **Resque**, **Delayed::Job**, and **Good Job**.
|
|
6
|
+
|
|
7
|
+
## Installation
|
|
8
|
+
|
|
9
|
+
Add to your `Gemfile`:
|
|
10
|
+
|
|
11
|
+
```ruby
|
|
12
|
+
gem "tracestax"
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
Then run:
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
bundle install
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
## Quickstart - Sidekiq
|
|
22
|
+
|
|
23
|
+
```ruby
|
|
24
|
+
# config/initializers/tracestax.rb
|
|
25
|
+
TraceStax.configure do |config|
|
|
26
|
+
config.api_key = ENV["TRACESTAX_API_KEY"]
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
TraceStax::Sidekiq.configure
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
The SDK installs a server middleware that automatically instruments all Sidekiq jobs - no changes to individual workers needed.
|
|
33
|
+
|
|
34
|
+
## Quickstart - Solid Queue (Rails 8)
|
|
35
|
+
|
|
36
|
+
```ruby
|
|
37
|
+
# config/initializers/tracestax.rb
|
|
38
|
+
TraceStax.configure do |config|
|
|
39
|
+
config.api_key = ENV["TRACESTAX_API_KEY"]
|
|
40
|
+
end
|
|
41
|
+
|
|
42
|
+
TraceStax::SolidQueueSubscriber.attach
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
## Quickstart - Resque
|
|
46
|
+
|
|
47
|
+
```ruby
|
|
48
|
+
TraceStax.configure do |config|
|
|
49
|
+
config.api_key = ENV["TRACESTAX_API_KEY"]
|
|
50
|
+
end
|
|
51
|
+
|
|
52
|
+
require "tracestax/resque"
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
## Quickstart - Delayed::Job
|
|
56
|
+
|
|
57
|
+
```ruby
|
|
58
|
+
TraceStax.configure do |config|
|
|
59
|
+
config.api_key = ENV["TRACESTAX_API_KEY"]
|
|
60
|
+
end
|
|
61
|
+
|
|
62
|
+
require "tracestax/delayed_job"
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
## Quickstart - Good Job
|
|
66
|
+
|
|
67
|
+
```ruby
|
|
68
|
+
TraceStax.configure do |config|
|
|
69
|
+
config.api_key = ENV["TRACESTAX_API_KEY"]
|
|
70
|
+
end
|
|
71
|
+
|
|
72
|
+
require "tracestax/good_job"
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
## Configuration
|
|
76
|
+
|
|
77
|
+
```ruby
|
|
78
|
+
TraceStax.configure do |config|
|
|
79
|
+
config.api_key = ENV["TRACESTAX_API_KEY"] # Required
|
|
80
|
+
config.endpoint = "https://ingest.tracestax.com" # Override ingest endpoint
|
|
81
|
+
config.flush_interval = 5.0 # Seconds between batch flushes
|
|
82
|
+
config.max_batch_size = 100 # Max events per HTTP request
|
|
83
|
+
config.heartbeat_interval = 60 # Seconds between worker heartbeats
|
|
84
|
+
end
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
## What's Monitored
|
|
88
|
+
|
|
89
|
+
- Job lifecycle (start, success, failure, retry)
|
|
90
|
+
- Worker fleet health (heartbeat, concurrency)
|
|
91
|
+
- Queue depth and throughput
|
|
92
|
+
- Error fingerprinting and grouping
|
|
93
|
+
- Anomaly detection (duration spikes, silence detection)
|
|
94
|
+
|
|
95
|
+
## Authentication
|
|
96
|
+
|
|
97
|
+
All requests use your API key as a Bearer token:
|
|
98
|
+
|
|
99
|
+
```
|
|
100
|
+
Authorization: Bearer ts_live_xxx
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
Get your project API key from the TraceStax dashboard under **Project → API Key**.
|
|
104
|
+
|
|
105
|
+
## License
|
|
106
|
+
|
|
107
|
+
MIT
|
|
@@ -0,0 +1,440 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "faraday"
|
|
4
|
+
require "json"
|
|
5
|
+
require "singleton"
|
|
6
|
+
require "socket"
|
|
7
|
+
|
|
8
|
+
module TraceStax
  # Resilience constants
  CIRCUIT_OPEN_THRESHOLD = 3  # consecutive failures before the circuit opens
  CIRCUIT_COOLDOWN_SECS = 30  # seconds the circuit stays open before a half-open probe
  MAX_FLUSH_INTERVAL = 60     # upper bound for the backoff-adjusted flush interval (seconds)

  # Background event shipper shared by every framework integration.
  #
  # Events are queued in-process and flushed in batches by a daemon thread.
  # The client is fork-safe (prefork servers such as Unicorn/Puma cluster
  # mode), applies a circuit breaker with exponential backoff when the ingest
  # endpoint is unreachable, and honours server-issued backpressure
  # ("pause") directives.
  class Client
    include Singleton

    def initialize
      @queue = Queue.new
      @mutex = Mutex.new
      @running = false
      @dropped_events = 0

      # Used for cooperative shutdown: signals flush_loop to wake and exit cleanly.
      @shutdown_mutex = Mutex.new
      @shutdown_cv = ConditionVariable.new

      # Resilience state
      @consecutive_failures = 0
      @circuit_state = :closed # :closed | :open | :half_open
      @circuit_opened_at = nil
      @current_flush_interval = nil # lazy-initialised from config
      @pause_until = nil # Time or nil

      # Fork safety: track the PID so we can detect when we've been forked.
      # Checked on every public call; if the PID has changed we discard the
      # inherited (potentially poisoned) mutexes and respawn the daemon thread.
      @pid = Process.pid
    end

    # Starts the background flush thread. Idempotent: a second call while
    # running is a no-op. Registers an at_exit hook so pending events get a
    # final flush on process exit.
    def start
      @mutex.synchronize do
        return if @running

        @running = true
        @current_flush_interval = config&.flush_interval || 5
        @thread = Thread.new { flush_loop }
        @thread.abort_on_exception = false

        at_exit { shutdown }
      end
    end

    # Enqueues +payload+ for asynchronous batched delivery.
    #
    # Oversized (> 512 KB serialized) or non-serializable payloads are dropped
    # with a warning so they cannot poison a whole batch at flush time. In
    # dry-run mode the serialized payload is printed to stdout instead of
    # being queued. The queue is capped at ~10k entries; on overflow the
    # oldest half is discarded and counted in +stats[:dropped_events]+.
    def send_event(payload)
      reinitialize_if_forked!
      return unless config&.enabled

      # Guard against huge or non-serializable payloads that would OOM or raise
      # at flush time, potentially poisoning an entire batch of valid events.
      begin
        serialized = payload.to_json
      rescue StandardError
        $stderr.puts("[tracestax] send_event: payload not serializable, dropping")
        return
      end
      if serialized.bytesize > 512 * 1024
        $stderr.puts("[tracestax] send_event: payload exceeds 512 KB, dropping")
        return
      end

      if config&.dry_run
        begin
          $stdout.puts("[tracestax dry-run] #{serialized}")
        rescue StandardError
          # $stdout.puts may raise IOError in daemonized processes
        end
        return
      end

      # Prevent unbounded memory growth: drop oldest events when queue is full
      if @queue.size > 10_000
        trimmed = @queue.size - 5_000
        trimmed.times { @queue.pop(true) rescue nil }
        @mutex.synchronize { @dropped_events += trimmed }
      end

      @queue.push(payload)
    end

    # Returns a snapshot of the client's internal health metrics.
    # Keys: :queue_size, :dropped_events, :circuit_state, :consecutive_failures.
    def stats
      @mutex.synchronize do
        {
          queue_size: @queue.size,
          dropped_events: @dropped_events,
          circuit_state: @circuit_state,
          consecutive_failures: @consecutive_failures,
        }
      end
    end

    # Sends a heartbeat synchronously and returns the directives hash (or nil on error).
    def send_heartbeat_sync(payload)
      return nil unless config&.enabled
      return nil if config&.dry_run

      result = post_for_json("/v1/heartbeat", payload)
      result&.dig("directives")
    end

    # Pauses ingest flushing until +epoch_ms+ (epoch milliseconds).
    def set_pause_until(epoch_ms)
      @mutex.synchronize { @pause_until = Time.at(epoch_ms / 1000.0) }
    end

    # Executes a server-issued command hash. Currently supports "thread_dump".
    def execute_command(cmd)
      return unless cmd["type"] == "thread_dump"

      wk = worker_key_for_dump
      dump = capture_thread_dump
      post_for_json("/v1/dump", {
        cmd_id: cmd["id"],
        worker_key: wk,
        dump_text: dump,
        language: "ruby",
        sdk_version: TraceStax::VERSION,
        captured_at: Time.now.utc.iso8601,
      })
    rescue StandardError
      # Dump capture must never raise into the host process
    end

    # Starts a background thread that POSTs a heartbeat to /v1/heartbeat every
    # +interval+ seconds. The thread is daemonised and swallows all errors so it
    # can never affect the host process.
    #
    # @param worker_key [String] unique identifier for this worker process
    # @param queues [Array<String>] queue names this worker consumes
    # @param concurrency [Integer] number of concurrent threads/fibers
    # @param interval [Integer] seconds between heartbeats (default: 30)
    # @return [Thread] the heartbeat thread (callers may keep it for joining)
    def start_heartbeat_thread(worker_key:, queues:, concurrency:, interval: 30)
      thread = Thread.new do
        loop do
          sleep(interval)
          begin
            payload = {
              type: "heartbeat",
              framework: "solid_queue",
              language: "ruby",
              sdk_version: TraceStax::VERSION,
              timestamp: Time.now.utc.iso8601,
              worker: {
                key: worker_key,
                hostname: Socket.gethostname,
                pid: Process.pid,
                queues: queues,
                concurrency: concurrency
              }
            }
            directives = send_heartbeat_sync(payload)
            if directives
              if directives["pause_ingest"]
                pum = directives["pause_until_ms"]
                set_pause_until(pum) if pum
              end
              (directives["commands"] || []).each do |cmd|
                execute_command(cmd) rescue nil
              end
            end
          rescue StandardError
            # Swallow — heartbeat must never raise into the host process
          end
        end
      end
      thread.abort_on_exception = false
      thread
    end

    # Asynchronously POSTs a queue-depth snapshot to /v1/snapshot.
    # Optional counts default to +nil+ and are reported as 0 when absent.
    #
    # @param queue_name [String] name of the queue being reported
    # @param depth [Integer] number of jobs currently enqueued
    # @param active_count [Integer, nil] number of actively-running jobs
    # @param failed_count [Integer, nil] number of failed jobs
    def send_snapshot(queue_name:, depth:, active_count: nil, failed_count: nil)
      payload = {
        type: "snapshot",
        framework: "solid_queue",
        worker_key: "#{Socket.gethostname}:#{Process.pid}",
        queues: [{
          name: queue_name,
          depth: depth,
          active: active_count || 0,
          failed: failed_count || 0,
          throughput_per_min: 0,
        }],
        timestamp: Time.now.utc.iso8601,
      }
      send_event(payload)
    end

    # Drains up to max_batch_size events and POSTs them as one batched request
    # to /v1/ingest. On any failure the batch is pushed back onto the queue so
    # transient outages do not lose events.
    def flush
      reinitialize_if_forked!
      # Check circuit breaker (also honours server-issued pause directives)
      return unless circuit_allow?

      batch = []
      begin
        batch.push(@queue.pop(true)) while !@queue.empty? && batch.size < (config&.max_batch_size || 100)
      rescue ThreadError
        # Another thread drained the queue between empty? and pop(true). The
        # partial batch collected so far is still valid — send it below.
        # FIX: this rescue previously appeared AFTER `rescue StandardError`
        # (ThreadError < StandardError), making it unreachable; the race was
        # mis-recorded as a send failure and the partial batch was lost.
      end
      return if batch.empty?

      conn = build_connection

      # All SDKs send a single batched POST — one HTTP request for up to
      # max_batch_size events. This avoids N×HTTP overhead per flush cycle.
      resp = conn.post("/v1/ingest") do |req|
        req.headers["Authorization"] = "Bearer #{config.api_key}"
        req.headers["Content-Type"] = "application/json"
        req.body = { events: batch }.to_json
      end

      if resp.status == 401
        # Auth failure is permanent misconfiguration, not a transient network issue.
        # Log prominently without penalising the circuit breaker — a bad API key
        # must not open the circuit and suppress all future sends once corrected.
        $stderr.puts("[tracestax] Auth failed (401) — check your API key. #{resp.body.to_s[0, 200]}")
        # Re-queue the batch so events are not silently discarded
        batch.reverse_each { |e| @queue.push(e) rescue nil }
      elsif resp.status >= 400
        record_failure
        # Restore the batch so events aren't permanently lost on transient failures
        batch.reverse_each { |e| @queue.push(e) rescue nil }
      else
        record_success
      end
    rescue StandardError
      record_failure
      # Best-effort restore on unexpected errors (batch is nil if the raise
      # happened before the drain loop assigned it)
      batch.reverse_each { |e| @queue.push(e) rescue nil } if batch
    end

    # Stops the flush thread, waiting up to +timeout+ seconds for a final flush.
    def shutdown(timeout: 5)
      @mutex.synchronize { @running = false }
      # Wake flush_loop immediately so it performs a final flush and exits.
      # Using a ConditionVariable avoids Thread#kill, which is unsafe when the
      # thread holds @mutex (can permanently poison the lock).
      @shutdown_mutex.synchronize { @shutdown_cv.signal }
      # FIX: only warn about a timed-out join when a flush thread actually
      # exists — previously a nil @thread produced a spurious warning.
      if @thread && !@thread.join(timeout)
        $stderr.puts("[tracestax] shutdown flush timed out after #{timeout}s, dropping remaining events")
        # Do NOT kill the thread — it will exit naturally once the HTTP timeout fires.
      end
    end

    private

    # ── Fork safety ────────────────────────────────────────────────
    #
    # Prefork servers (Unicorn, Puma cluster mode) call fork(2) after the app
    # has booted. Ruby does NOT copy threads into the child — only the calling
    # thread survives. Any mutex that was locked by a now-dead thread is
    # permanently poisoned in the child; acquiring it will deadlock forever.
    #
    # We detect the fork by comparing Process.pid against the PID recorded at
    # initialisation (and after each reinitialisation). On mismatch we replace
    # all mutexes and CVs wholesale — never unlock or acquire the inherited ones
    # — and respawn the daemon thread.
    #
    # This pattern is used by Sidekiq, Puma, and the official redis-client gem.
    # The PID read/write is safe under MRI's GIL: Fixnum ivar assignment is
    # atomic, and this method is only called from the main thread (or from the
    # new daemon thread after reinit, which holds a fresh mutex).
    def reinitialize_if_forked!
      return if Process.pid == @pid

      # Replace every primitive that could be locked by the dead daemon thread.
      # Do NOT call @mutex.unlock or @mutex.synchronize — the lock state is
      # unknown and touching it risks deadlock.
      @mutex = Mutex.new
      @shutdown_mutex = Mutex.new
      @shutdown_cv = ConditionVariable.new

      # Pre-fork events belong to the parent worker; discard them.
      # (Attempting to drain the inherited Queue is unsafe because its internal
      # mutex may also be poisoned.)
      @queue = Queue.new

      # Reset all mutable state so the child starts clean.
      @running = false
      @dropped_events = 0
      @consecutive_failures = 0
      @circuit_state = :closed
      @circuit_opened_at = nil
      @current_flush_interval = nil
      @pause_until = nil
      @pid = Process.pid

      start
    rescue StandardError
      # Reinitialization must never crash the child process.
    end

    def config
      TraceStax.configuration
    end

    # Daemon loop: waits the (backoff-adjusted) flush interval, flushes, and
    # exits after a final flush once shutdown clears @running.
    def flush_loop
      loop do
        interval = @mutex.synchronize { @current_flush_interval || config&.flush_interval || 5 }
        # Wait for the interval or an early wakeup from shutdown. Using a
        # ConditionVariable instead of sleep(interval) means shutdown can
        # interrupt the wait immediately without needing Thread#kill.
        @shutdown_mutex.synchronize { @shutdown_cv.wait(@shutdown_mutex, interval) }
        flush rescue nil # daemon thread must never die — swallow all errors
        break unless @running
      end
    end

    # ── Circuit breaker ────────────────────────────────────────────

    # Returns true when a flush attempt is currently permitted. Transitions
    # an :open circuit to :half_open after the cooldown expires so a single
    # probe request can test recovery.
    def circuit_allow?
      @mutex.synchronize do
        # Honour backpressure pause
        if @pause_until && Time.now < @pause_until
          return false
        end
        @pause_until = nil

        if @circuit_state == :open
          elapsed = [0, Time.now - @circuit_opened_at].max
          if elapsed < CIRCUIT_COOLDOWN_SECS
            return false
          end
          @circuit_state = :half_open
        end
        true
      end
    end

    # Closes the circuit and halves the flush interval back toward its
    # configured base after a successful send.
    def record_success
      @mutex.synchronize do
        @consecutive_failures = 0
        @circuit_state = :closed
        @circuit_opened_at = nil
        base = config&.flush_interval || 5
        @current_flush_interval = [base, (@current_flush_interval || base) / 2.0].max
      end
    end

    # Doubles the flush interval (capped at MAX_FLUSH_INTERVAL) and opens the
    # circuit after CIRCUIT_OPEN_THRESHOLD consecutive failures, or re-opens
    # it immediately when a half-open probe fails.
    def record_failure
      @mutex.synchronize do
        @consecutive_failures += 1
        base = config&.flush_interval || 5
        @current_flush_interval = [MAX_FLUSH_INTERVAL, (@current_flush_interval || base) * 2].min

        if @consecutive_failures >= CIRCUIT_OPEN_THRESHOLD && @circuit_state == :closed
          @circuit_state = :open
          @circuit_opened_at = Time.now
          $stderr.puts("[tracestax] TraceStax unreachable, circuit open, events dropped")
        elsif @circuit_state == :half_open
          @circuit_state = :open
          @circuit_opened_at = Time.now
        end
      end
    end

    # ── HTTP ───────────────────────────────────────────────────────

    # Builds a Faraday connection against the configured endpoint with
    # conservative timeouts (5s connect / 10s total).
    def build_connection
      Faraday.new(url: config.endpoint) do |f|
        f.options.open_timeout = 5
        f.options.timeout = 10
        f.request :json
        f.adapter Faraday.default_adapter
      end
    end

    # POSTs +body+ as JSON to +path+ and returns the parsed response hash, or
    # nil on any HTTP/parse failure. Honours the X-Retry-After backpressure
    # header and feeds success/failure into the circuit breaker.
    def post_for_json(path, body)
      conn = build_connection
      resp = conn.post(path) do |req|
        req.headers["Authorization"] = "Bearer #{config.api_key}"
        req.headers["Content-Type"] = "application/json"
        req.body = body.to_json
      end

      if resp.headers["X-Retry-After"]
        secs = resp.headers["X-Retry-After"].to_f
        set_pause_until((Time.now.to_f + secs) * 1000) if secs > 0
      end

      if resp.status == 401
        # Auth failure is permanent misconfiguration, not a transient network issue.
        # Log prominently without penalising the circuit breaker.
        $stderr.puts("[tracestax] Auth failed (401) — check your API key. #{resp.body.to_s[0, 200]}")
        return nil
      end

      if resp.status >= 400
        record_failure
        return nil
      end

      # Cap response body at 1 MB to prevent a runaway or misconfigured server
      # from buffering unbounded memory inside the SDK.
      body_text = resp.body.to_s
      if body_text.bytesize > 1024 * 1024
        $stderr.puts("[tracestax] Ingest response truncated — exceeded 1 MB size limit")
        body_text = "{}"
      end

      record_success
      JSON.parse(body_text)
    rescue StandardError
      record_failure
      nil
    end

    # ── Thread dump ────────────────────────────────────────────────

    def worker_key_for_dump
      "#{Socket.gethostname}:#{Process.pid}"
    end

    # Renders a backtrace of every live thread, capped at 500 KB of text.
    def capture_thread_dump
      lines = ["=== TraceStax Ruby Thread Dump ===",
               "PID: #{Process.pid}",
               "Timestamp: #{Time.now.utc.iso8601}",
               ""]
      Thread.list.each do |t|
        lines << "Thread: #{t.object_id} (status=#{t.status})"
        bt = t.backtrace || []
        lines.concat(bt.map { |l| "  #{l}" })
        lines << ""
      end
      lines.join("\n")[0, 500_000]
    end
  end
end
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module TraceStax
  # Runtime settings for the TraceStax SDK.
  #
  # Every attribute is a plain accessor so values can be assigned inside a
  # TraceStax.configure block. Two environment variables act as kill
  # switches: TRACESTAX_ENABLED=false disables the SDK entirely, and
  # TRACESTAX_DRY_RUN=true prints events instead of sending them.
  class Configuration
    attr_accessor :api_key, :endpoint, :flush_interval, :max_batch_size, :enabled, :dry_run

    def initialize
      @endpoint       = "https://ingest.tracestax.com"
      @flush_interval = 5.0
      @max_batch_size = 100
      @enabled        = ENV["TRACESTAX_ENABLED"] != "false"
      @dry_run        = ENV["TRACESTAX_DRY_RUN"] == "true"
    end

    # Raises TraceStax::Error when the SDK would actually send data but no
    # API key has been configured. Disabled and dry-run configurations are
    # always considered valid, since they never reach the network.
    def validate!
      return if !@enabled || @dry_run

      raise Error, "api_key is required" if api_key.nil? || api_key.empty?
    end
  end
end
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "socket"
|
|
4
|
+
|
|
5
|
+
module TraceStax
  # Delayed::Job integration.
  #
  # Wraps every job's perform in started/succeeded/failed task events sent
  # through the shared TraceStax::Client.
  module DelayedJob
    # Delayed::Job lifecycle plugin.
    #
    # Usage:
    #   Delayed::Worker.plugins << TraceStax::DelayedJob::Plugin
    class Plugin < Delayed::Plugin
      callbacks do |lifecycle|
        lifecycle.around(:perform) do |worker, job, &block|
          TraceStax::DelayedJob.around_perform(worker, job, &block)
        end
      end
    end

    # Instruments a single job execution: emits a "started" event, runs the
    # job, then emits "succeeded" or "failed". The original exception is
    # re-raised so Delayed::Job's own retry handling still applies.
    #
    # @param worker [Delayed::Worker] the worker running the job
    # @param job [Delayed::Job] the job record being performed
    def self.around_perform(worker, job, &block)
      client = TraceStax::Client.instance
      # Best-effort extraction — a malformed payload must not break eventing.
      task_name = job.payload_object.class.name rescue "unknown"
      job_id = job.id.to_s
      queue = job.queue || "default"
      # attempts counts completed tries, so the current attempt is +1.
      attempt = (job.attempts || 0) + 1

      task = { name: task_name, id: job_id, queue: queue, attempt: attempt }
      worker_info = {
        key: "#{Socket.gethostname}:#{Process.pid}",
        hostname: Socket.gethostname,
        pid: Process.pid,
        queues: [queue],
      }

      start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
      client.send_event(task_event("started", worker_info, task))

      begin
        block.call(worker, job)
        client.send_event(
          task_event("succeeded", worker_info, task,
                     metrics: { duration_ms: elapsed_ms(start_time) })
        )
      rescue => e
        client.send_event(
          task_event("failed", worker_info, task,
                     metrics: { duration_ms: elapsed_ms(start_time) },
                     error: { type: e.class.name, message: e.message,
                              stack_trace: e.backtrace&.first(20)&.join("\n") })
        )
        raise
      end
    end

    # Builds the task_event envelope shared by all three lifecycle events.
    # The :error key is only present when an error hash is supplied.
    def self.task_event(status, worker_info, task, metrics: {}, error: nil)
      event = {
        type: "task_event", framework: "delayed_job", language: "ruby",
        sdk_version: TraceStax::VERSION, status: status,
        worker: worker_info,
        task: task,
        metrics: metrics,
      }
      event[:error] = error if error
      event
    end
    private_class_method :task_event

    # Milliseconds elapsed since +start_time+ (a monotonic-clock reading).
    def self.elapsed_ms(start_time)
      ((Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time) * 1000).round
    end
    private_class_method :elapsed_ms
  end
end
|