allstak 0.1.1 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +135 -0
- data/README.md +72 -240
- data/allstak.gemspec +10 -9
- data/lib/allstak/client.rb +58 -2
- data/lib/allstak/config.rb +246 -3
- data/lib/allstak/global_handler.rb +100 -0
- data/lib/allstak/integrations/net_http.rb +9 -1
- data/lib/allstak/integrations/rack.rb +54 -10
- data/lib/allstak/integrations/rails.rb +59 -0
- data/lib/allstak/integrations/sidekiq.rb +183 -0
- data/lib/allstak/modules/database.rb +4 -1
- data/lib/allstak/modules/errors.rb +84 -3
- data/lib/allstak/modules/http_monitor.rb +7 -2
- data/lib/allstak/modules/logs.rb +5 -2
- data/lib/allstak/modules/tracing.rb +33 -2
- data/lib/allstak/propagation.rb +48 -0
- data/lib/allstak/sampling.rb +38 -0
- data/lib/allstak/sanitizer.rb +322 -0
- data/lib/allstak/session_tracker.rb +216 -0
- data/lib/allstak/transport/event_spool.rb +228 -0
- data/lib/allstak/transport/http_transport.rb +168 -5
- data/lib/allstak/version.rb +1 -1
- data/lib/allstak.rb +77 -1
- metadata +23 -29
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
require_relative "../sanitizer"
|
|
2
|
+
|
|
3
|
+
module AllStak
  module Integrations
    # Sidekiq integration.
    #
    # Provides a Sidekiq *server* middleware that:
    #   1. Starts a fresh trace per job (so spans/telemetry produced inside the
    #      job link together and don't bleed across jobs on a reused thread).
    #   2. Wraps job execution in a "queue.process" span + breadcrumb.
    #   3. Auto-captures the exception when a job raises, attaching worker
    #      class, jid, queue, and (sanitized) args as metadata, then re-raises
    #      so Sidekiq's own retry machinery still runs.
    #
    # It also registers a death handler so jobs that exhaust their retries
    # are captured once more with `mechanism=sidekiq.death` for visibility.
    #
    # Installation is guarded: `install!` is a graceful no-op when Sidekiq is
    # not loaded in the host process, and is idempotent.
    module Sidekiq
      # Registers the server middleware and the death handler.
      # Does nothing when already installed or when ::Sidekiq is not defined.
      def self.install!
        return if @installed
        return unless defined?(::Sidekiq)

        ::Sidekiq.configure_server do |sidekiq_config|
          sidekiq_config.server_middleware do |chain|
            chain.add(AllStak::Integrations::Sidekiq::Middleware) unless chain.exists?(AllStak::Integrations::Sidekiq::Middleware)
          end

          # Retries-exhausted handler. Only registered when the running
          # Sidekiq exposes `death_handlers`; it accepts a (job, exception)
          # callable.
          if sidekiq_config.respond_to?(:death_handlers)
            sidekiq_config.death_handlers << lambda do |job, exception|
              AllStak::Integrations::Sidekiq.capture_death(job, exception)
            end
          end
        end

        @installed = true
      end

      # @return [Boolean] true once `install!` has completed.
      def self.installed?
        @installed == true
      end

      # Capture a job that has exhausted all retries (Sidekiq "death").
      # Best-effort; never raises into Sidekiq's death-handler loop.
      #
      # @param job [Hash, nil] Sidekiq job payload
      # @param exception [Exception, nil] the final failure
      def self.capture_death(job, exception)
        return unless AllStak.initialized?
        return if exception.nil?

        client = AllStak.client
        return unless client.config.capture_unhandled_exceptions

        meta = job_metadata(job).merge(
          "mechanism" => "sidekiq.death",
          "handled" => false
        )
        client.errors.capture_exception(exception, metadata: meta)
      rescue => e
        begin
          AllStak.client.config.debug && warn("[AllStak] sidekiq death capture failed: #{e.message}")
        rescue
          nil
        end
      end

      # Build sanitized job metadata from a Sidekiq job hash.
      # Args are routed through the Sanitizer (KEY-NAME redaction only) so
      # secrets in argument hashes are redacted here; value-pattern PII
      # scrubbing (email/IP/CC/SSN, gated by send_default_pii) is applied
      # authoritatively on the wire path, so we don't double-scrub free text
      # at this layer.
      #
      # The reported class prefers "wrapped" over "class", matching
      # Middleware#worker_class_name, so wrapped jobs show the real job class
      # rather than the adapter wrapper.
      #
      # @param job [Hash, nil] Sidekiq job payload
      # @return [Hash{String=>Object}] metadata with nil entries removed
      def self.job_metadata(job)
        job ||= {}
        args = job["args"]
        sanitized_args =
          begin
            AllStak::Sanitizer.scrub(args, values: false) if args
          rescue
            # Sanitizer failure: omit args entirely rather than send them raw.
            nil
          end
        {
          "sidekiq.class" => (job["wrapped"] || job["class"])&.to_s,
          "sidekiq.jid" => job["jid"],
          "sidekiq.queue" => job["queue"],
          "sidekiq.retry_count" => job["retry_count"],
          "sidekiq.args" => sanitized_args
        }.reject { |_, v| v.nil? }
      end

      # Sidekiq server middleware. Sidekiq calls `#call(worker, job, queue)`
      # and expects the middleware to `yield` to run the job.
      class Middleware
        # Runs the job inside a trace/span and reports failures.
        # Instrumentation is fail-open: an error raised by the telemetry code
        # itself never prevents the job from running and never replaces the
        # job's own exception.
        #
        # @param worker [Object] the job instance Sidekiq is about to run
        # @param job [Hash] raw Sidekiq job payload
        # @param queue [String] queue the job was fetched from
        # @yield executes the job
        # @return [Object] whatever the job block returns
        # @raise [Exception] re-raises anything the job raises, unchanged
        def call(worker, job, queue)
          return yield unless AllStak.initialized?

          context = start_job_telemetry(worker, job, queue)
          failure = nil
          begin
            yield
          rescue Exception => e # rubocop:disable Lint/RescueException
            failure = e
            raise
          ensure
            finish_job_telemetry(context, job, failure)
          end
        end

        private

        # Sets up the trace, breadcrumb and span for one job. Any failure is
        # swallowed (logged in debug mode); the returned context then simply
        # carries a nil span.
        def start_job_telemetry(worker, job, queue)
          payload = job.is_a?(Hash) ? job : {}
          context = {
            client: AllStak.client,
            worker_class: worker_class_name(worker, job),
            queue: payload["queue"] || queue,
            jid: payload["jid"],
            span: nil
          }

          begin
            tracing = context[:client].tracing

            # Each job is its own trace root unless an upstream producer
            # propagated a trace id into the job payload.
            incoming_trace = payload["allstak_trace_id"] || payload["trace_id"]
            if incoming_trace && !incoming_trace.to_s.empty?
              tracing.set_trace_id(incoming_trace.to_s)
            else
              tracing.reset_trace
            end

            context[:client].errors.add_breadcrumb(
              type: "sidekiq",
              message: "process #{context[:worker_class]}",
              data: { "queue" => context[:queue], "jid" => context[:jid] }.reject { |_, v| v.nil? }
            )

            context[:span] = tracing.start_span(
              "queue.process",
              description: context[:worker_class],
              tags: {
                "messaging.system" => "sidekiq",
                "messaging.destination" => context[:queue].to_s,
                "messaging.message_id" => context[:jid].to_s
              }.reject { |_, v| v.to_s.empty? }
            )
          rescue => e
            debug_warn(context[:client], "sidekiq instrumentation setup failed: #{e.message}")
          end

          context
        end

        # Closes the span and reports the failure, if any. Runs inside
        # `ensure`, so it must not raise.
        def finish_job_telemetry(context, job, failure)
          span = context[:span]
          begin
            span.finish(failure ? "error" : "ok") if span && !span.finished?
          rescue => e
            debug_warn(context[:client], "sidekiq span finish failed: #{e.message}")
          end

          report_failure(context, job, failure) if reportable?(failure)
        end

        # Process-control exceptions are not job failures: SignalException
        # covers Interrupt (and therefore Sidekiq::Shutdown, raised into job
        # threads on hard shutdown), and SystemExit is a deliberate exit.
        def reportable?(failure)
          return false if failure.nil?

          !(failure.is_a?(SystemExit) || failure.is_a?(SignalException))
        end

        # Sends the job's exception with job metadata attached.
        def report_failure(context, job, failure)
          client = context[:client]
          return unless client.config.capture_unhandled_exceptions

          trace_id = client.tracing.current_trace_id
          meta = AllStak::Integrations::Sidekiq.job_metadata(
            job_hash(job, context[:worker_class], context[:queue], context[:jid])
          ).merge(
            "mechanism" => "sidekiq",
            "handled" => false,
            "traceId" => trace_id
          )
          client.errors.capture_exception(failure, trace_id: trace_id, metadata: meta)
        rescue => err
          debug_warn(client, "sidekiq exception capture failed: #{err.message}")
        end

        # Emits a warning only when the SDK is configured with debug on.
        def debug_warn(client, message)
          warn("[AllStak] #{message}") if client.config.debug
        rescue
          nil
        end

        # Job class name for display: the "wrapped" entry wins over "class",
        # falling back to the worker object's own class name.
        def worker_class_name(worker, job)
          if job.is_a?(Hash) && (job["wrapped"] || job["class"])
            return (job["wrapped"] || job["class"]).to_s
          end
          worker.respond_to?(:class) ? worker.class.name : worker.to_s
        end

        # Normalize the job into a Hash for metadata extraction. Sidekiq always
        # passes a Hash, but we tolerate non-Hash defensively.
        def job_hash(job, worker_class, queue, jid)
          return job if job.is_a?(Hash)
          {
            "class" => worker_class,
            "queue" => queue,
            "jid" => jid
          }
        end
      end
    end
  end
end
|
|
@@ -78,7 +78,10 @@ module AllStak
|
|
|
78
78
|
rescue Transport::AllStakAuthError
|
|
79
79
|
return
|
|
80
80
|
rescue Transport::AllStakTransportError => e
|
|
81
|
-
|
|
81
|
+
# Retries exhausted / outage: persist the (scrubbed) batch for
|
|
82
|
+
# replay on the next init instead of dropping.
|
|
83
|
+
@transport.persist_failed(PATH, { queries: chunk })
|
|
84
|
+
@logger.debug("[AllStak] db batch transport error (spooled): #{e.message}")
|
|
82
85
|
rescue => e
|
|
83
86
|
@logger.debug("[AllStak] db batch unexpected error: #{e.message}")
|
|
84
87
|
end
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
require "json"
|
|
2
|
+
require_relative "../sampling"
|
|
2
3
|
|
|
3
4
|
module AllStak
|
|
4
5
|
module Modules
|
|
@@ -7,13 +8,16 @@ module AllStak
|
|
|
7
8
|
PATH = "/ingest/v1/errors".freeze
|
|
8
9
|
MAX_BREADCRUMBS = 50
|
|
9
10
|
|
|
10
|
-
def initialize(transport, config, logger)
|
|
11
|
+
def initialize(transport, config, logger, session_id_provider: nil)
|
|
11
12
|
@transport = transport
|
|
12
13
|
@config = config
|
|
13
14
|
@logger = logger
|
|
14
15
|
@current_user = nil
|
|
15
16
|
@breadcrumbs = []
|
|
16
17
|
@breadcrumb_mutex = Mutex.new
|
|
18
|
+
# Optional callable returning the active release-health session id, so
|
|
19
|
+
# the backend's error consumer can mark the session errored/crashed.
|
|
20
|
+
@session_id_provider = session_id_provider
|
|
17
21
|
end
|
|
18
22
|
|
|
19
23
|
def set_user(id: nil, email: nil, ip: nil)
|
|
@@ -54,21 +58,42 @@ module AllStak
|
|
|
54
58
|
level: level,
|
|
55
59
|
environment: @config.environment,
|
|
56
60
|
release: @config.release,
|
|
61
|
+
# Phase 3 — v2 ingest contract: top-level identity + frames.
|
|
62
|
+
sdkName: @config.sdk_name,
|
|
63
|
+
sdkVersion: @config.sdk_version,
|
|
64
|
+
platform: @config.platform,
|
|
65
|
+
dist: @config.dist,
|
|
66
|
+
frames: extract_structured_frames(exc),
|
|
57
67
|
traceId: trace_id,
|
|
68
|
+
# Release-health: top-level session id so the backend error
|
|
69
|
+
# consumer can mark the session errored/crashed server-side.
|
|
70
|
+
sessionId: current_session_id,
|
|
58
71
|
user: (user || @current_user)&.to_h,
|
|
59
72
|
requestContext: request_context&.to_h,
|
|
60
|
-
metadata: metadata,
|
|
73
|
+
metadata: @config.release_tags.merge(metadata || {}),
|
|
61
74
|
breadcrumbs: crumbs
|
|
62
75
|
}.compact
|
|
63
76
|
payload.delete(:user) if payload[:user]&.empty?
|
|
64
77
|
payload.delete(:requestContext) if payload[:requestContext]&.empty?
|
|
65
78
|
|
|
79
|
+
# Sampling first, then before_send, then transport (which scrubs).
|
|
80
|
+
return nil unless Sampling.sampled?(@config.sample_rate)
|
|
81
|
+
payload = apply_before_send(payload)
|
|
82
|
+
return nil if payload.nil?
|
|
83
|
+
|
|
66
84
|
status, body = @transport.post(PATH, payload)
|
|
67
85
|
return nil unless status == 202
|
|
68
86
|
parsed = JSON.parse(body) rescue nil
|
|
69
87
|
parsed&.dig("data", "id")
|
|
70
88
|
rescue Transport::AllStakAuthError
|
|
71
89
|
nil
|
|
90
|
+
rescue Transport::AllStakTransportError => e
|
|
91
|
+
# Retries exhausted / network outage: persist the (scrubbed) error for
|
|
92
|
+
# replay on the next init instead of dropping. `payload` is in scope
|
|
93
|
+
# only after it is built; guard so a pre-build failure still no-ops.
|
|
94
|
+
@transport.persist_failed(PATH, payload) if defined?(payload) && payload
|
|
95
|
+
@logger.debug("[AllStak] capture_exception transport error (spooled): #{e.message}")
|
|
96
|
+
nil
|
|
72
97
|
rescue => e
|
|
73
98
|
@logger.debug("[AllStak] capture_exception swallowed: #{e.class}: #{e.message}")
|
|
74
99
|
nil
|
|
@@ -86,14 +111,27 @@ module AllStak
|
|
|
86
111
|
environment: @config.environment,
|
|
87
112
|
release: @config.release,
|
|
88
113
|
traceId: trace_id,
|
|
114
|
+
sessionId: current_session_id,
|
|
89
115
|
user: (user || @current_user)&.to_h,
|
|
90
116
|
requestContext: request_context&.to_h,
|
|
91
|
-
metadata: metadata
|
|
117
|
+
metadata: @config.release_tags.merge(metadata || {})
|
|
92
118
|
}.compact
|
|
93
119
|
payload.delete(:user) if payload[:user]&.empty?
|
|
94
120
|
payload.delete(:requestContext) if payload[:requestContext]&.empty?
|
|
121
|
+
|
|
122
|
+
# Sampling first, then before_send, then transport (which scrubs).
|
|
123
|
+
return nil unless Sampling.sampled?(@config.sample_rate)
|
|
124
|
+
payload = apply_before_send(payload)
|
|
125
|
+
return nil if payload.nil?
|
|
126
|
+
|
|
95
127
|
status, _ = @transport.post(PATH, payload)
|
|
96
128
|
status == 202 ? exception_class : nil
|
|
129
|
+
rescue Transport::AllStakAuthError
|
|
130
|
+
nil
|
|
131
|
+
rescue Transport::AllStakTransportError => e
|
|
132
|
+
@transport.persist_failed(PATH, payload) if defined?(payload) && payload
|
|
133
|
+
@logger.debug("[AllStak] capture_error transport error (spooled): #{e.message}")
|
|
134
|
+
nil
|
|
97
135
|
rescue => e
|
|
98
136
|
@logger.debug("[AllStak] capture_error swallowed: #{e.class}: #{e.message}")
|
|
99
137
|
nil
|
|
@@ -102,10 +140,53 @@ module AllStak
|
|
|
102
140
|
|
|
103
141
|
private
|
|
104
142
|
|
|
143
|
+
# Looks up the active release-health session id through the injected
# provider. Fail-open: a missing provider, a blank id, or a provider that
# raises all yield nil so capture is never blocked.
def current_session_id
  provider = @session_id_provider
  return nil unless provider.respond_to?(:call)

  session_id = provider.call
  return nil if session_id.to_s.empty?

  session_id
rescue StandardError
  nil
end
|
|
152
|
+
|
|
153
|
+
# Passes the event through the user-supplied before_send hook, when one is
# configured. The hook's return value is used as-is: the (possibly modified)
# event, or nil to drop it. Fail-open: if the hook raises, a warning is
# logged and the ORIGINAL event is returned so telemetry is never lost to a
# buggy hook.
def apply_before_send(payload)
  callback = @config.before_send
  if callback.respond_to?(:call)
    begin
      callback.call(payload)
    rescue => error
      @logger.warn("[AllStak] before_send raised; sending original event: #{error.class}: #{error.message}")
      payload
    end
  else
    payload
  end
end
|
|
166
|
+
|
|
105
167
|
# Raw backtrace lines for the exception, capped at the first 50.
# Returns an empty array when the exception carries no backtrace.
def extract_frames(exc)
  trace = exc.backtrace
  trace.is_a?(Array) ? trace.first(50) : []
end
|
|
171
|
+
|
|
172
|
+
# Phase 3 — v2 structured frames. Ruby's backtrace is "<file>:<line>:in `<fn>'"
# (or 'fn' with straight quotes) — split it into the wire shape so the
# dashboard can render real source paths.
#
# Only the first 50 backtrace lines are considered; lines that do not match
# the expected shape are skipped. A frame is flagged inApp unless its path
# contains "/gems/", starts with "<internal:", or lives under one of the
# interpreter's own library directories reported by RbConfig.
#
# @param exc [Exception]
# @return [Array<Hash>, nil] frames, or nil when there is no backtrace or
#   no line could be parsed
def extract_structured_frames(exc)
  trace = exc.backtrace
  return nil unless trace.is_a?(Array)

  # Stdlib/site/vendor library roots; frames below these are runtime code.
  runtime_roots =
    if defined?(::RbConfig)
      ::RbConfig::CONFIG.values_at("rubylibdir", "sitelibdir", "vendorlibdir")
                        .compact
                        .reject(&:empty?)
                        .map { |dir| dir.end_with?("/") ? dir : "#{dir}/" }
    else
      []
    end

  frames = trace.first(50).each_with_object([]) do |line, acc|
    parsed = /\A(?<path>.*):(?<lineno>\d+):in [`'](?<label>.+?)'/.match(line.to_s)
    next unless parsed

    path = parsed[:path]
    in_app = !path.include?('/gems/') &&
             !path.start_with?('<internal:') &&
             runtime_roots.none? { |root| path.start_with?(root) }

    acc << {
      filename: path, absPath: path,
      # Explicit base 10: a zero-padded number must not be read as octal.
      function: parsed[:label], lineno: Integer(parsed[:lineno], 10),
      inApp: in_app,
      platform: 'ruby'
    }
  end

  frames.empty? ? nil : frames
end
|
|
109
190
|
end
|
|
110
191
|
end
|
|
111
192
|
end
|
|
@@ -23,7 +23,8 @@ module AllStak
|
|
|
23
23
|
|
|
24
24
|
def record(direction:, method:, host:, path:, status_code:, duration_ms:,
|
|
25
25
|
request_size: 0, response_size: 0, trace_id: nil, user_id: nil,
|
|
26
|
-
error_fingerprint: nil, span_id: nil, parent_span_id: nil
|
|
26
|
+
error_fingerprint: nil, span_id: nil, parent_span_id: nil,
|
|
27
|
+
request_id: nil)
|
|
27
28
|
return if @transport.disabled?
|
|
28
29
|
item = {
|
|
29
30
|
direction: direction,
|
|
@@ -36,6 +37,7 @@ module AllStak
|
|
|
36
37
|
responseSize: response_size.to_i,
|
|
37
38
|
timestamp: Time.now.utc.iso8601(3),
|
|
38
39
|
traceId: trace_id || SecureRandom.hex(16),
|
|
40
|
+
requestId: request_id,
|
|
39
41
|
userId: user_id,
|
|
40
42
|
errorFingerprint: error_fingerprint,
|
|
41
43
|
spanId: span_id,
|
|
@@ -68,7 +70,10 @@ module AllStak
|
|
|
68
70
|
rescue Transport::AllStakAuthError
|
|
69
71
|
return
|
|
70
72
|
rescue Transport::AllStakTransportError => e
|
|
71
|
-
|
|
73
|
+
# Retries exhausted / outage: persist the (scrubbed) batch for
|
|
74
|
+
# replay on the next init instead of dropping.
|
|
75
|
+
@transport.persist_failed(PATH, { requests: chunk })
|
|
76
|
+
@logger.debug("[AllStak] http batch transport error (spooled): #{e.message}")
|
|
72
77
|
rescue => e
|
|
73
78
|
@logger.debug("[AllStak] http batch unexpected error: #{e.message}")
|
|
74
79
|
end
|
data/lib/allstak/modules/logs.rb
CHANGED
|
@@ -34,7 +34,7 @@ module AllStak
|
|
|
34
34
|
requestId: request_id,
|
|
35
35
|
userId: user_id,
|
|
36
36
|
errorId: error_id,
|
|
37
|
-
metadata: metadata
|
|
37
|
+
metadata: @config.release_tags.merge(metadata || {})
|
|
38
38
|
}.compact
|
|
39
39
|
@buffer.push(payload)
|
|
40
40
|
end
|
|
@@ -68,7 +68,10 @@ module AllStak
|
|
|
68
68
|
rescue Transport::AllStakAuthError
|
|
69
69
|
return
|
|
70
70
|
rescue Transport::AllStakTransportError => e
|
|
71
|
-
|
|
71
|
+
# Retries exhausted / network outage: persist (scrubbed) for replay
|
|
72
|
+
# on the next init instead of dropping. Fail-open inside transport.
|
|
73
|
+
@transport.persist_failed(PATH, item)
|
|
74
|
+
@logger.debug("[AllStak] log transport error (spooled): #{e.message}")
|
|
72
75
|
rescue => e
|
|
73
76
|
@logger.debug("[AllStak] unexpected log error: #{e.message}")
|
|
74
77
|
end
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
require "securerandom"
|
|
2
|
+
require_relative "../sampling"
|
|
2
3
|
|
|
3
4
|
module AllStak
|
|
4
5
|
module Modules
|
|
@@ -36,10 +37,26 @@ module AllStak
|
|
|
36
37
|
# Clears every thread-local piece of trace state (trace id, span stack and
# the cached sampling decision) for the current thread.
def reset_trace
  %i[allstak_trace_id allstak_span_stack allstak_trace_sampled].each do |key|
    Thread.current[key] = nil
  end
  nil
end
|
|
42
|
+
|
|
43
|
+
# Sampling decision for the CURRENT trace. The decision is made on first use
# and cached thread-locally, so later calls on the same thread return the
# same answer until the cache is cleared. When `traces_sample_rate` is nil,
# the trace is always kept; otherwise the decision is delegated to
# Sampling.sampled?.
def current_trace_sampled?
  cached = Thread.current[:allstak_trace_sampled]
  return cached unless cached.nil?

  rate = @config.traces_sample_rate
  Thread.current[:allstak_trace_sampled] = rate.nil? || Sampling.sampled?(rate)
end
|
|
40
56
|
|
|
41
57
|
def start_span(operation, description: "", tags: nil)
|
|
42
58
|
trace_id = current_trace_id
|
|
59
|
+
sampled = current_trace_sampled?
|
|
43
60
|
span_id = SecureRandom.hex(8)
|
|
44
61
|
parent = current_span_id || ""
|
|
45
62
|
Thread.current[:allstak_span_stack] ||= []
|
|
@@ -56,6 +73,7 @@ module AllStak
|
|
|
56
73
|
release: (@config.respond_to?(:release) ? @config.release : nil) || "",
|
|
57
74
|
tags: tags || {},
|
|
58
75
|
start_time_millis: (Time.now.to_f * 1000).to_i,
|
|
76
|
+
sampled: sampled,
|
|
59
77
|
on_finish: method(:on_span_finish)
|
|
60
78
|
)
|
|
61
79
|
end
|
|
@@ -88,6 +106,10 @@ module AllStak
|
|
|
88
106
|
# Finish callback for a span: removes it from the thread's span stack and
# buffers it for sending. Unsampled spans are dropped here rather than
# buffered; the span has still been removed from the stack, so nesting
# bookkeeping is unaffected.
def on_span_finish(span)
  Thread.current[:allstak_span_stack]&.delete(span.span_id)
  @buffer.push(span.to_h) if span.sampled?
end
|
|
93
115
|
|
|
@@ -97,7 +119,10 @@ module AllStak
|
|
|
97
119
|
rescue Transport::AllStakAuthError
|
|
98
120
|
return
|
|
99
121
|
rescue Transport::AllStakTransportError => e
|
|
100
|
-
|
|
122
|
+
# Retries exhausted / outage: persist the (scrubbed) spans for replay
|
|
123
|
+
# on the next init instead of dropping.
|
|
124
|
+
@transport.persist_failed(PATH, { spans: items })
|
|
125
|
+
@logger.debug("[AllStak] span transport error (spooled): #{e.message}")
|
|
101
126
|
rescue => e
|
|
102
127
|
@logger.debug("[AllStak] span unexpected error: #{e.message}")
|
|
103
128
|
end
|
|
@@ -108,7 +133,8 @@ module AllStak
|
|
|
108
133
|
attr_reader :trace_id, :span_id
|
|
109
134
|
|
|
110
135
|
def initialize(trace_id:, span_id:, parent_span_id:, operation:, description:,
|
|
111
|
-
service:, environment:, tags:, start_time_millis:, on_finish:,
|
|
136
|
+
service:, environment:, tags:, start_time_millis:, on_finish:,
|
|
137
|
+
release: "", sampled: true)
|
|
112
138
|
@trace_id = trace_id
|
|
113
139
|
@span_id = span_id
|
|
114
140
|
@parent_span_id = parent_span_id
|
|
@@ -122,9 +148,14 @@ module AllStak
|
|
|
122
148
|
@end_time_millis = nil
|
|
123
149
|
@status = "ok"
|
|
124
150
|
@finished = false
|
|
151
|
+
@sampled = sampled
|
|
125
152
|
@on_finish = on_finish
|
|
126
153
|
end
|
|
127
154
|
|
|
155
|
+
# Sampling flag this span was constructed with.
def sampled?; @sampled; end
|
|
158
|
+
|
|
128
159
|
def set_tag(key, value)
|
|
129
160
|
@tags[key.to_s] = value.to_s
|
|
130
161
|
self
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
module AllStak
  # Builds the outbound correlation headers (X-AllStak-*, traceparent,
  # baggage) that carry the current trace/request/span ids to downstream
  # services.
  module Propagation
    # Bytes that cannot appear raw in a baggage value: everything outside the
    # W3C `baggage-octet` set (so space, double quote, comma, semicolon,
    # backslash, controls and non-ASCII), plus "%" itself so encoded values
    # stay unambiguous.
    BAGGAGE_UNSAFE_BYTE = /[^\x21\x23\x24\x26-\x2B\x2D-\x3A\x3C-\x5B\x5D-\x7E]/.freeze

    module_function

    # Serializes the AllStak ids as baggage list members. The trace id is
    # always emitted; request and span ids only when non-blank. Values are
    # percent-encoded so an id containing a delimiter cannot split or
    # corrupt the header.
    #
    # @return [String] comma-separated "allstak-<name>=<value>" members
    def baggage(trace_id:, request_id: nil, span_id: nil)
      members = ["allstak-trace_id=#{encode_baggage_value(trace_id)}"]
      members << "allstak-request_id=#{encode_baggage_value(request_id)}" if request_id && !request_id.to_s.empty?
      members << "allstak-span_id=#{encode_baggage_value(span_id)}" if span_id && !span_id.to_s.empty?
      members.join(",")
    end

    # Merges fresh AllStak members into an existing baggage header value.
    # Third-party members are preserved in order; stale "allstak-" members
    # (case-insensitive) are replaced.
    #
    # @param existing [String, nil] current baggage header value
    # @return [String] merged baggage header value
    def merge_baggage(existing, trace_id:, request_id: nil, span_id: nil)
      preserved = existing.to_s.split(",").map(&:strip).reject do |part|
        part.empty? || part.downcase.start_with?("allstak-")
      end
      (preserved + baggage(trace_id: trace_id, request_id: request_id, span_id: span_id).split(",")).join(",")
    end

    # W3C traceparent trace-flags: "01" = sampled, "00" = not sampled.
    # Only an explicit `false` yields "00"; anything else (including nil)
    # is treated as sampled to preserve historical behavior for callers
    # that do not pass an explicit sampling decision.
    def trace_flags(sampled)
      sampled == false ? "00" : "01"
    end

    # Writes the propagation headers into a Hash-like `headers`, overwriting
    # any existing X-AllStak-* / traceparent values. traceparent is only set
    # when a span id is available.
    def apply_headers(headers, trace_id:, request_id: nil, span_id: nil, sampled: true)
      headers["X-AllStak-Trace-Id"] = trace_id
      headers["X-AllStak-Request-Id"] = request_id if request_id && !request_id.to_s.empty?
      if span_id && !span_id.to_s.empty?
        headers["X-AllStak-Span-Id"] = span_id
        headers["traceparent"] = traceparent(trace_id, span_id, sampled)
      end
      headers["baggage"] = merge_baggage(headers["baggage"], trace_id: trace_id, request_id: request_id, span_id: span_id)
      headers["AllStak-Baggage"] = baggage(trace_id: trace_id, request_id: request_id, span_id: span_id)
    end

    # Same as apply_headers, but for a request object: X-AllStak-* and
    # traceparent headers already present on `req` are left untouched
    # (`||=`), while both baggage headers are always rewritten.
    def apply_request_headers(req, trace_id:, request_id: nil, span_id: nil, sampled: true)
      req["X-AllStak-Trace-Id"] ||= trace_id
      req["X-AllStak-Request-Id"] ||= request_id if request_id && !request_id.to_s.empty?
      if span_id && !span_id.to_s.empty?
        req["X-AllStak-Span-Id"] ||= span_id
        req["traceparent"] ||= traceparent(trace_id, span_id, sampled)
      end
      req["baggage"] = merge_baggage(req["baggage"], trace_id: trace_id, request_id: request_id, span_id: span_id)
      req["AllStak-Baggage"] = baggage(trace_id: trace_id, request_id: request_id, span_id: span_id)
    end

    # Percent-encodes every byte of `value` that is not safe in a baggage
    # value. Works on raw bytes so invalid or non-UTF-8 input cannot raise.
    def encode_baggage_value(value)
      value.to_s.b
           .gsub(BAGGAGE_UNSAFE_BYTE) { |byte| format("%%%02X", byte.ord) }
           .force_encoding(Encoding::UTF_8)
    end

    # Builds the traceparent value. The span id is stringified before
    # slicing so a non-String id is truncated as text, not bit-sliced.
    def traceparent(trace_id, span_id, sampled)
      "00-#{trace_id}-#{span_id.to_s[0, 16]}-#{trace_flags(sampled)}"
    end

    private_class_method :encode_baggage_value, :traceparent
  end
end
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
module AllStak
  # Deterministic head-sampling helper.
  #
  # Sampling is "deterministic" in the sense that a rate of 1.0 always keeps
  # an event and a rate of 0.0 always drops it — no RNG is consulted at the
  # boundaries. For intermediate rates a single random draw in [0.0, 1.0) is
  # compared against the rate: kept when `draw < rate`.
  #
  # The RNG is a seam: tests inject a deterministic value via {rng=} so the
  # keep/drop decision is fully controllable without monkeypatching Kernel.
  module Sampling
    module_function

    # Override the random source used by {sampled?}. Pass a callable returning
    # a Float in [0.0, 1.0). Pass nil to restore the default (Kernel#rand).
    def rng=(callable)
      @rng = callable
    end

    # @return [#call] the injected random source, or a lambda around
    #   Kernel#rand when none is set
    def rng
      @rng || ->(*) { rand }
    end

    # Returns true when an event should be KEPT under the given rate.
    #
    # nil rate is treated as "no sampling configured" → keep.
    # Booleans are honored literally (true keeps, false drops), and any other
    # value that cannot be converted with `to_f` is treated like nil, so a
    # mis-typed rate never raises into the caller.
    # Rates at or above 1.0 always keep; rates at or below 0.0 always drop.
    #
    # @param rate [Numeric, String, Boolean, nil]
    # @return [Boolean]
    def sampled?(rate)
      case rate
      when nil, true then return true
      when false then return false
      end
      return true unless rate.respond_to?(:to_f)

      threshold = rate.to_f
      return true if threshold >= 1.0
      return false if threshold <= 0.0

      rng.call < threshold
    end
  end
end
|