flare 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +2 -17
- data/lib/flare/configuration.rb +14 -0
- data/lib/flare/engine.rb +4 -2
- data/lib/flare/filtering_span_processor.rb +279 -0
- data/lib/flare/http_transport.rb +62 -0
- data/lib/flare/marker.rb +106 -0
- data/lib/flare/metric_counter.rb +6 -0
- data/lib/flare/metric_flusher.rb +18 -5
- data/lib/flare/metric_storage.rb +5 -0
- data/lib/flare/rule_manager.rb +140 -0
- data/lib/flare/sampler.rb +130 -0
- data/lib/flare/trace_blob.rb +116 -0
- data/lib/flare/trace_exporter.rb +143 -0
- data/lib/flare/trace_health_reporter.rb +74 -0
- data/lib/flare/upload_url_pool.rb +108 -0
- data/lib/flare/version.rb +1 -1
- data/lib/flare/web_marker_subscriber.rb +76 -0
- data/lib/flare.rb +127 -7
- metadata +11 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: dcfa9283973a11f13fd17397781c76cc03d3e39c9bd08a92025417397eb74eb9
|
|
4
|
+
data.tar.gz: 30e2f84a92f9689aabdd64b7c6839598dc0c3c2bb1dd32a31e2ab9b3a6f5f85a
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 581e6e9ed512b82ac086ab76644ff0d618408649a110ef3b623a3bf4a1648e3695dfcedcefad01bd54b7a11687a1c5ed154ed01e74f2cf14c27fa400bf97f73a
|
|
7
|
+
data.tar.gz: 4a407b09a229f512d6cb1f35ab04dfb8de7ecaa982837fd356c41435c7758d3c4e05ad33812670b2e570830bef5a21e9bdffe297234cec9e33f9ab5250c42af1
|
data/CHANGELOG.md
CHANGED
|
@@ -1,18 +1,3 @@
|
|
|
1
|
-
|
|
1
|
+
# Changelog
|
|
2
2
|
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
- Auto-detect OTel instrumentation gems via `use_all`
|
|
6
|
-
- Name Sidekiq job spans by worker class (via `job_class` attribute)
|
|
7
|
-
- Support both old and new OTel semantic convention property keys in dashboard queries
|
|
8
|
-
- Guard dashboard routes when sqlite3 is missing
|
|
9
|
-
|
|
10
|
-
## [0.1.1] - 2025-12-17
|
|
11
|
-
|
|
12
|
-
- Rename gem from caboose to flare
|
|
13
|
-
- Warn instead of raising when sqlite3 gem is missing
|
|
14
|
-
- Make sqlite3 an optional dependency for production compatibility
|
|
15
|
-
|
|
16
|
-
## [0.1.0] - 2025-12-17
|
|
17
|
-
|
|
18
|
-
- Initial release
|
|
3
|
+
See [GitHub Releases](https://github.com/verygoodsoftwarellc/flare-gem/releases) for the changelog.
|
data/lib/flare/configuration.rb
CHANGED
|
@@ -12,9 +12,15 @@ module Flare
|
|
|
12
12
|
|
|
13
13
|
# Spans: detailed trace data stored in SQLite (default: development only)
|
|
14
14
|
# Metrics: aggregated counters in memory, flushed periodically (default: production only)
|
|
15
|
+
# Tracing: server-controlled per-route trace sampling. Polls /api/rules
|
|
16
|
+
# for which routes/jobs to capture, ships matched traces direct to R2
|
|
17
|
+
# via presigned URLs, self-notifies POST /api/traces.
|
|
15
18
|
attr_accessor :spans_enabled
|
|
16
19
|
attr_accessor :metrics_enabled
|
|
17
20
|
attr_accessor :metrics_flush_interval # seconds between flushes (default: 60)
|
|
21
|
+
attr_accessor :tracing_enabled
|
|
22
|
+
attr_accessor :tracing_poll_interval # seconds between /api/rules polls (default: 30)
|
|
23
|
+
attr_accessor :tracing_max_queue # max traced spans buffered per process
|
|
18
24
|
|
|
19
25
|
# Metrics HTTP submission settings
|
|
20
26
|
attr_accessor :url # URL of the Flare metrics service
|
|
@@ -53,6 +59,9 @@ module Flare
|
|
|
53
59
|
@spans_enabled = rails_development?
|
|
54
60
|
@metrics_enabled = !rails_test?
|
|
55
61
|
@metrics_flush_interval = 60 # seconds
|
|
62
|
+
@tracing_enabled = !rails_test?
|
|
63
|
+
@tracing_poll_interval = 30 # seconds
|
|
64
|
+
@tracing_max_queue = 5_000
|
|
56
65
|
|
|
57
66
|
# Metrics HTTP submission defaults
|
|
58
67
|
@url = ENV.fetch("FLARE_URL", credentials_url || "https://flare.am")
|
|
@@ -67,6 +76,11 @@ module Flare
|
|
|
67
76
|
!@key.nil? && !@key.empty?
|
|
68
77
|
end
|
|
69
78
|
|
|
79
|
+
# Tracing ships through the same endpoint and key as metrics, so it is only
# usable when tracing is switched on AND metrics submission is configured.
#
# @return [Object] the falsy value of @tracing_enabled when tracing is off,
#   otherwise the result of metrics_submission_configured?
def tracing_submission_configured?
  return @tracing_enabled unless @tracing_enabled

  metrics_submission_configured?
end
|
|
83
|
+
|
|
70
84
|
def database_path
|
|
71
85
|
@database_path || default_database_path
|
|
72
86
|
end
|
data/lib/flare/engine.rb
CHANGED
|
@@ -25,9 +25,11 @@ module Flare
|
|
|
25
25
|
Flare.configure_opentelemetry
|
|
26
26
|
end
|
|
27
27
|
|
|
28
|
-
# Phase 2: Start
|
|
29
|
-
# so user config (metrics_enabled, flush_interval,
|
|
28
|
+
# Phase 2: Start background threads after all initializers have run
|
|
29
|
+
# so user config (metrics_enabled, tracing_enabled, flush_interval,
|
|
30
|
+
# tracing_poll_interval, etc.) is applied.
|
|
30
31
|
config.after_initialize do
|
|
32
|
+
Flare.start_rule_manager
|
|
31
33
|
Flare.start_metrics_flusher
|
|
32
34
|
end
|
|
33
35
|
|
|
@@ -0,0 +1,279 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "concurrent/atomic/atomic_fixnum"
|
|
4
|
+
require "logger"
|
|
5
|
+
require "opentelemetry/sdk"
|
|
6
|
+
|
|
7
|
+
module Flare
|
|
8
|
+
# BSP-shaped span processor whose filter is `sampled OR marked` instead
|
|
9
|
+
# of BSP's `sampled` (BSP early-returns on RECORD_ONLY spans -- our
|
|
10
|
+
# Path 2 spans have sampled=false so they'd be dropped). Forwards
|
|
11
|
+
# matching spans to a trace exporter on a background worker thread so
|
|
12
|
+
# the exporter never runs on the request/job thread (CAF-3).
|
|
13
|
+
#
|
|
14
|
+
# On every on_finish we also check marker.owner?(trace_id, span_id) and
|
|
15
|
+
# unmark when the owning rack span finishes (CAF-2). Cleanup runs even
|
|
16
|
+
# for spans we don't export.
|
|
17
|
+
class FilteringSpanProcessor
|
|
18
|
+
SUCCESS = OpenTelemetry::SDK::Trace::Export::SUCCESS
|
|
19
|
+
FAILURE = OpenTelemetry::SDK::Trace::Export::FAILURE
|
|
20
|
+
|
|
21
|
+
DEFAULT_MAX_QUEUE = 5_000
|
|
22
|
+
DEFAULT_FLUSH_INTERVAL = 5 # seconds
|
|
23
|
+
DEFAULT_EXPORT_TIMEOUT = 30 # seconds
|
|
24
|
+
DEFAULT_MARKED_TRACE_GRACE_PERIOD = 1.0 # seconds
|
|
25
|
+
|
|
26
|
+
attr_reader :dropped_count, :failed_export_count, :exception_count, :buffer_high_watermark, :max_queue
|
|
27
|
+
|
|
28
|
+
# Set up the buffers, counters and synchronization primitives, then start
# the background worker thread.
#
# @param exporter [#export] receives batches of finished spans
# @param marker [#marked?, #owner?, #unmark] registry of marked trace ids
# @param max_queue [Integer] cap on buffered spans (pending + ready)
# @param flush_interval [Numeric] seconds the worker waits when no delayed
#   trace is scheduled
# @param export_timeout [Numeric] timeout passed to exporter.export
# @param marked_trace_grace_period [#to_f] seconds to hold a marked trace
#   after its owner span finishes
# @param logger [Logger, nil] defaults to a WARN-level logger on $stderr
def initialize(exporter:, marker:,
               max_queue: DEFAULT_MAX_QUEUE,
               flush_interval: DEFAULT_FLUSH_INTERVAL,
               export_timeout: DEFAULT_EXPORT_TIMEOUT,
               marked_trace_grace_period: DEFAULT_MARKED_TRACE_GRACE_PERIOD,
               logger: nil)
  # Collaborators and tuning knobs.
  @exporter, @marker = exporter, marker
  @max_queue = max_queue
  @flush_interval = flush_interval
  @export_timeout = export_timeout
  @marked_trace_grace_period = marked_trace_grace_period.to_f
  @logger = logger || Logger.new($stderr, level: Logger::WARN)

  # Span buffers: per-trace pending spans, their arrival order, the spans
  # ready for export, and deadlines for traces waiting out the grace period.
  @pending_by_trace = {}
  @trace_order = []
  @pending_count = 0
  @ready_queue = []
  @delayed_ready_by_trace = {}

  # Coordination between producers and the worker thread.
  @mutex = Mutex.new
  @cond = ConditionVariable.new
  @stopped = false
  @pid = Process.pid # compared in detect_forking to notice a forked child

  # Health counters exposed through attr_reader.
  @dropped_count, @failed_export_count, @exception_count, @buffer_high_watermark =
    Array.new(4) { Concurrent::AtomicFixnum.new(0) }

  start_worker
end
|
|
59
|
+
|
|
60
|
+
def on_start(_span, _parent_context); end
|
|
61
|
+
|
|
62
|
+
# Buffer a finished span when its trace is sampled or marked; ignore it
# otherwise.
#
# A span that is the marker's owner for its trace completes the trace after
# the grace period; a root/entry span completes it immediately.
#
# @param span [Object] finished span; converted with to_span_data when the
#   object supports it
# @return [Object, nil] nil when the span is skipped
def on_finish(span)
  detect_forking

  context = span.context
  is_sampled = context&.trace_flags&.sampled?
  is_marked = context && @marker.marked?(context.trace_id)
  return unless is_sampled || is_marked

  # Only a marked trace can have an owner span.
  owner_done = is_marked && @marker.owner?(context.trace_id, context.span_id)
  payload = span.respond_to?(:to_span_data) ? span.to_span_data : span

  finished = owner_done || sampled_completion_span?(payload)
  grace = owner_done ? @marked_trace_grace_period : 0
  enqueue(payload, complete: finished, delay: grace)
end
|
|
79
|
+
|
|
80
|
+
def force_flush(timeout: nil)
|
|
81
|
+
drain_and_export(include_pending: true)
|
|
82
|
+
SUCCESS
|
|
83
|
+
end
|
|
84
|
+
|
|
85
|
+
# Stop the worker, export everything still buffered, and shut the exporter
# down when it supports that.
#
# @param timeout [Numeric, nil] join limit for the worker (5 seconds when
#   nil); also forwarded to exporter.shutdown
# @return [Object] SUCCESS, unconditionally
def shutdown(timeout: nil)
  # Flip the stop flag under the lock and wake the worker so it sees it.
  @mutex.synchronize do
    @stopped = true
    @cond.broadcast
  end

  join_limit = timeout || 5
  @worker.join(join_limit)

  # Final flush includes traces that never completed.
  drain_and_export(include_pending: true)

  if @exporter.respond_to?(:shutdown)
    @exporter.shutdown(timeout: timeout)
  end

  SUCCESS
end
|
|
95
|
+
|
|
96
|
+
def buffer_size
|
|
97
|
+
@mutex.synchronize { queued_span_count }
|
|
98
|
+
end
|
|
99
|
+
|
|
100
|
+
def reset_buffer_high_watermark
|
|
101
|
+
@buffer_high_watermark.value = buffer_size
|
|
102
|
+
end
|
|
103
|
+
|
|
104
|
+
private
|
|
105
|
+
|
|
106
|
+
# Append a span to its trace's pending list and, when the trace is complete,
# release it for export (immediately, or after +delay+ seconds).
#
# The queue limit is enforced both before and after the release step, and
# the buffer high-watermark is refreshed last. Everything runs under @mutex.
#
# @param span_data [#trace_id] the finished span
# @param complete [Boolean] whether this span completes its trace
# @param delay [Numeric] grace period before release; 0 releases right away
def enqueue(span_data, complete:, delay: 0)
  @mutex.synchronize do
    tid = span_data.trace_id

    # First span of a trace: remember arrival order for eviction.
    unless @pending_by_trace.key?(tid)
      @trace_order << tid
    end
    (@pending_by_trace[tid] ||= []) << span_data
    @pending_count += 1
    evict_oldest_spans

    if complete && delay.positive?
      delay_trace_ready(tid, delay)
    elsif complete
      mark_trace_ready(tid)
    end

    evict_oldest_spans
    update_buffer_high_watermark
  end
end
|
|
122
|
+
|
|
123
|
+
# Body of the background thread: until stopped, wait for ready spans (or
# for the next wake-up deadline) and then export whatever is ready.
def worker_loop
  until stopped?
    @mutex.synchronize do
      nap = next_wait_timeout
      # Skip the wait when there is work queued or a stop was requested.
      next unless @ready_queue.empty?
      next if @stopped

      @cond.wait(@mutex, nap)
    end

    drain_and_export
  end
end
|
|
132
|
+
|
|
133
|
+
def stopped?
|
|
134
|
+
@mutex.synchronize { @stopped }
|
|
135
|
+
end
|
|
136
|
+
|
|
137
|
+
# Take every span that is ready for export out of the buffer and hand the
# batch to the exporter. The exporter is called outside the lock.
#
# Non-SUCCESS results bump failed_export_count; any StandardError is counted
# in exception_count and logged instead of propagating.
#
# @param include_pending [Boolean] when true, also flush spans of traces
#   that have not completed and unmark/clear all delayed traces
def drain_and_export(include_pending: false)
  spans = @mutex.synchronize do
    promote_due_delayed_traces

    if include_pending
      @ready_queue.concat(@pending_by_trace.values.flatten)
      @pending_by_trace.clear
      @trace_order.clear
      @pending_count = 0
      unmark_delayed_traces
      @delayed_ready_by_trace.clear
    end

    next if @ready_queue.empty?

    # Swap in a fresh queue so producers can keep appending while we export.
    taken = @ready_queue
    @ready_queue = []
    taken
  end
  return unless spans

  outcome = @exporter.export(spans, timeout: @export_timeout)
  @failed_export_count.increment unless outcome == SUCCESS
rescue StandardError => e
  @exception_count.increment
  @logger.warn("[Flare::FilteringSpanProcessor] export failed: #{e.class}: #{e.message}")
end
|
|
162
|
+
|
|
163
|
+
# Move all pending spans of +trace_id+ to the ready queue and wake the
# worker. Callers in this class invoke it while holding @mutex.
#
# The delayed-ready deadline is removed unconditionally, even when the trace
# has no pending spans left (e.g. they were evicted). Leaving it behind
# would make the deadline look permanently due, so the worker's wait timeout
# would stay at zero and the worker would spin.
#
# @param trace_id [Object] trace whose spans should be released
# @return [Object, nil] nil when the trace had nothing pending
def mark_trace_ready(trace_id)
  @delayed_ready_by_trace.delete(trace_id)

  batch = @pending_by_trace.delete(trace_id)
  return unless batch

  @trace_order.delete(trace_id)
  @pending_count -= batch.length
  @ready_queue.concat(batch)
  @cond.signal
end
|
|
173
|
+
|
|
174
|
+
# Schedule +trace_id+ to become ready +delay+ seconds from now (monotonic
# clock) and wake the worker so it can recompute its wait timeout.
#
# @param trace_id [Object] trace to release later
# @param delay [Numeric] seconds to wait before release
def delay_trace_ready(trace_id, delay)
  deadline = monotonic_now + delay
  @delayed_ready_by_trace.store(trace_id, deadline)
  @cond.signal
end
|
|
178
|
+
|
|
179
|
+
# Release every delayed trace whose deadline has passed and drop its marker
# entry. The due ids are collected first because releasing a trace mutates
# @delayed_ready_by_trace.
#
# @return [Array] ids of the traces that were due
def promote_due_delayed_traces
  cutoff = monotonic_now
  due_ids = @delayed_ready_by_trace.each_with_object([]) do |(trace_id, ready_at), ids|
    ids << trace_id if ready_at <= cutoff
  end

  due_ids.each do |trace_id|
    mark_trace_ready(trace_id)
    @marker.unmark(trace_id)
  end
end
|
|
187
|
+
|
|
188
|
+
def unmark_delayed_traces
|
|
189
|
+
@delayed_ready_by_trace.each_key { |trace_id| @marker.unmark(trace_id) }
|
|
190
|
+
end
|
|
191
|
+
|
|
192
|
+
# How long the worker should wait before its next pass.
#
# @return [Numeric] the flush interval when no trace is delayed; otherwise
#   the time until the earliest delayed deadline, never below zero
def next_wait_timeout
  earliest = @delayed_ready_by_trace.each_value.min

  if earliest
    remaining = earliest - monotonic_now
    remaining.negative? ? 0 : remaining
  else
    @flush_interval
  end
end
|
|
198
|
+
|
|
199
|
+
# Drop spans until the buffered total fits within @max_queue.
#
# Spans are taken from the oldest pending trace first; once nothing is
# pending, from the front of the ready queue. Every dropped span increments
# dropped_count.
def evict_oldest_spans
  while queued_span_count > @max_queue
    trace_id = @trace_order.first

    if trace_id.nil?
      # Nothing pending and nothing ready: there is nothing left to drop.
      # Without this exit a limit that can never be met (for example a
      # negative max_queue) would loop forever and inflate dropped_count.
      break if @ready_queue.empty?

      @ready_queue.shift
      @dropped_count.increment
      next
    end

    spans = @pending_by_trace[trace_id]
    if spans.nil? || spans.empty?
      # Stale order entry with no spans behind it.
      @trace_order.shift
      next
    end

    spans.shift
    @pending_count -= 1
    @dropped_count.increment
    next unless spans.empty?

    # The trace lost its last span; forget it entirely.
    @pending_by_trace.delete(trace_id)
    @trace_order.shift
  end
end
|
|
224
|
+
|
|
225
|
+
def queued_span_count
|
|
226
|
+
@pending_count + @ready_queue.length
|
|
227
|
+
end
|
|
228
|
+
|
|
229
|
+
# Raise the recorded high-watermark to the current buffered span count when
# the current count is larger; otherwise leave it unchanged.
def update_buffer_high_watermark
  depth = queued_span_count
  @buffer_high_watermark.update { |seen| [seen, depth].max }
end
|
|
233
|
+
|
|
234
|
+
def monotonic_now
|
|
235
|
+
Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
|
236
|
+
end
|
|
237
|
+
|
|
238
|
+
def sampled_completion_span?(span_data)
|
|
239
|
+
root_span?(span_data) || entry_span?(span_data)
|
|
240
|
+
end
|
|
241
|
+
|
|
242
|
+
# Whether the span has no parent: the object exposes no parent_span_id, or
# the id is nil, empty, or the OpenTelemetry invalid span id.
#
# @param span_data [Object] span or span data
# @return [Boolean]
def root_span?(span_data)
  return true unless span_data.respond_to?(:parent_span_id)

  parent = span_data.parent_span_id
  return true if parent.nil?
  return true if parent.respond_to?(:empty?) && parent.empty?

  parent == OpenTelemetry::Trace::INVALID_SPAN_ID
end
|
|
248
|
+
|
|
249
|
+
# Whether the span's kind is :server or :consumer. Objects that do not
# expose a kind are never entry spans.
#
# @param span_data [Object] span or span data
# @return [Boolean]
def entry_span?(span_data)
  return false unless span_data.respond_to?(:kind)

  case span_data.kind
  when :server, :consumer then true
  else false
  end
end
|
|
254
|
+
|
|
255
|
+
# Notice that the current process id differs from the one recorded earlier
# (i.e. we are running in a forked child). In that case reset all buffers,
# clear the stop flag and start a worker thread for this process.
#
# The pid is checked once without the lock as a fast path and again under
# @mutex before resetting.
def detect_forking
  current_pid = Process.pid
  return if @pid == current_pid

  @mutex.synchronize do
    next if @pid == current_pid

    @pid = current_pid
    [@pending_by_trace, @trace_order, @ready_queue, @delayed_ready_by_trace].each(&:clear)
    @pending_count = 0
    @stopped = false
    start_worker
  end
end
|
|
271
|
+
|
|
272
|
+
# Spawn the named background thread that runs worker_loop, unless a worker
# thread already exists and is still alive.
def start_worker
  if @worker.nil? || !@worker.alive?
    @worker = Thread.new { worker_loop }
    @worker.name = "flare-filtering-span-processor"
  end
end
|
|
278
|
+
end
|
|
279
|
+
end
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "net/http"
|
|
4
|
+
require "uri"
|
|
5
|
+
|
|
6
|
+
module Flare
|
|
7
|
+
# Tiny HTTP wrapper used by TraceExporter (and anything else that wants
|
|
8
|
+
# to PUT/POST without pulling in a heavy client). Designed for injection
|
|
9
|
+
# at the boundary so tests can swap in a recording fake; no other moving
|
|
10
|
+
# parts.
|
|
11
|
+
class HttpTransport
|
|
12
|
+
DEFAULT_OPEN_TIMEOUT = 2
|
|
13
|
+
DEFAULT_READ_TIMEOUT = 5
|
|
14
|
+
DEFAULT_WRITE_TIMEOUT = 5
|
|
15
|
+
|
|
16
|
+
# Minimal response value: status code, body and a header hash.
Response = Struct.new(:code, :body, :headers, keyword_init: true) do
  # Look up a response header by name, ignoring case.
  #
  # The exact, all-lowercase and all-uppercase spellings are tried first;
  # if none is present, the keys are scanned case-insensitively so that
  # mixed-case keys (e.g. "Content-Type") are found from any spelling.
  #
  # @param name [String] header name
  # @return [Object, nil] the header value, or nil when absent or when no
  #   headers were recorded
  def header(name)
    return nil unless headers

    direct = headers[name] || headers[name.downcase] || headers[name.upcase]
    return direct unless direct.nil?

    wanted = name.to_s.downcase
    match = headers.find { |key, _value| key.to_s.downcase == wanted }
    match && match[1]
  end
end
|
|
22
|
+
|
|
23
|
+
def initialize(open_timeout: DEFAULT_OPEN_TIMEOUT,
|
|
24
|
+
read_timeout: DEFAULT_READ_TIMEOUT,
|
|
25
|
+
write_timeout: DEFAULT_WRITE_TIMEOUT)
|
|
26
|
+
@open_timeout = open_timeout
|
|
27
|
+
@read_timeout = read_timeout
|
|
28
|
+
@write_timeout = write_timeout
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
def get(url, headers = {})
|
|
32
|
+
request(url, nil, headers, Net::HTTP::Get)
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
def put(url, body, headers = {})
|
|
36
|
+
request(url, body, headers, Net::HTTP::Put)
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
def post(url, body, headers = {})
|
|
40
|
+
request(url, body, headers, Net::HTTP::Post)
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
private
|
|
44
|
+
|
|
45
|
+
# Perform one HTTP request and wrap the reply in a Response.
#
# @param url [String, URI] target URL; https URLs use TLS
# @param body [String, nil] request body, omitted when nil
# @param headers [Hash] request headers to set
# @param klass [Class] Net::HTTP request class (Get, Put, Post, ...)
# @return [Response] status code (as a String), body and header hash
def request(url, body, headers, klass)
  target = URI(url)

  connection = Net::HTTP.new(target.host, target.port)
  connection.use_ssl = target.scheme == "https"
  connection.open_timeout = @open_timeout
  connection.read_timeout = @read_timeout
  # Guarded because not every Net::HTTP version has a write timeout setter.
  connection.write_timeout = @write_timeout if connection.respond_to?(:write_timeout=)

  path = target.request_uri
  path = "/" if path == ""

  outgoing = klass.new(path)
  headers.each { |key, value| outgoing[key] = value }
  outgoing.body = body if body

  reply = connection.request(outgoing)
  reply_headers = reply.each_header.to_h
  Response.new(code: reply.code.to_s, body: reply.body, headers: reply_headers)
end
|
|
61
|
+
end
|
|
62
|
+
end
|
data/lib/flare/marker.rb
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "concurrent/map"
|
|
4
|
+
require "concurrent/atomic/atomic_fixnum"
|
|
5
|
+
|
|
6
|
+
module Flare
|
|
7
|
+
# Thread-safe registry of trace_ids that Path 2 (the WebMarkerSubscriber)
|
|
8
|
+
# has marked for export. FilteringSpanProcessor checks marked? on every
|
|
9
|
+
# on_finish; matching spans get forwarded to the trace exporter, the rest
|
|
10
|
+
# are dropped.
|
|
11
|
+
#
|
|
12
|
+
# Each entry records the OWNER span_id (the local rack server span the
|
|
13
|
+
# subscriber was inside when it marked the trace). Cleanup is keyed on
|
|
14
|
+
# the owner finishing, not the trace root finishing -- remote-parented
|
|
15
|
+
# rack spans aren't trace roots, and child spans can outlive their parent
|
|
16
|
+
# in OTel, so root-driven cleanup would leak on the dominant production
|
|
17
|
+
# case (web app behind a load balancer or service mesh).
|
|
18
|
+
#
|
|
19
|
+
# Bounded by:
|
|
20
|
+
# - sweep(): drops entries older than max_age (default 5 min) so a rack
|
|
21
|
+
# span that never finishes (process killed mid-request, exception path
|
|
22
|
+
# that skips ensure) doesn't leak forever.
|
|
23
|
+
# - hard ceiling at max_entries (default 10k): on overflow, drop oldest
|
|
24
|
+
# 10% by marked_at.
|
|
25
|
+
class Marker
|
|
26
|
+
Entry = Struct.new(:owner_span_id, :rule_id, :marked_at, keyword_init: true)
|
|
27
|
+
|
|
28
|
+
DEFAULT_MAX_ENTRIES = 10_000
|
|
29
|
+
DEFAULT_MAX_AGE = 5 * 60 # seconds
|
|
30
|
+
|
|
31
|
+
attr_reader :evicted_count
|
|
32
|
+
|
|
33
|
+
def initialize(max_entries: DEFAULT_MAX_ENTRIES, max_age: DEFAULT_MAX_AGE)
|
|
34
|
+
@entries = Concurrent::Map.new
|
|
35
|
+
@max_entries = max_entries
|
|
36
|
+
@max_age = max_age
|
|
37
|
+
@evicted_count = Concurrent::AtomicFixnum.new(0)
|
|
38
|
+
end
|
|
39
|
+
|
|
40
|
+
# Record (or overwrite) the entry for +trace_id+, stamped with the current
# monotonic time, then enforce the size ceiling.
#
# @param trace_id [Object] trace to mark
# @param owner_span_id [Object] span id stored as the entry's owner
# @param rule_id [Object] rule id stored with the entry
def mark(trace_id, owner_span_id:, rule_id:)
  entry = Entry.new(owner_span_id: owner_span_id, rule_id: rule_id, marked_at: monotonic_now)
  @entries[trace_id] = entry
  maybe_evict_oldest
end
|
|
48
|
+
|
|
49
|
+
def marked?(trace_id)
|
|
50
|
+
@entries.key?(trace_id)
|
|
51
|
+
end
|
|
52
|
+
|
|
53
|
+
# True only when +span_id+ is the owner recorded for +trace_id+ -- the span
# that originally marked the trace. FilteringSpanProcessor uses this to
# decide when to unmark: only when that exact span finishes, not on every
# span that shares the trace id.
#
# @return [Boolean] false when the trace is not marked
def owner?(trace_id, span_id)
  entry = @entries[trace_id]
  return false if entry.nil?

  entry.owner_span_id == span_id
end
|
|
61
|
+
|
|
62
|
+
# Rule id stored with the entry for +trace_id+.
#
# @return [Object, nil] nil when the trace is not marked
def rule_id(trace_id)
  entry = @entries[trace_id]
  entry.rule_id unless entry.nil?
end
|
|
66
|
+
|
|
67
|
+
def unmark(trace_id)
|
|
68
|
+
@entries.delete(trace_id)
|
|
69
|
+
end
|
|
70
|
+
|
|
71
|
+
def size
|
|
72
|
+
@entries.size
|
|
73
|
+
end
|
|
74
|
+
|
|
75
|
+
# Drop entries older than max_age. Meant to be called periodically (the
# RuleManager's scheduler is the natural place) so that an owner span that
# never finishes does not leave its entry behind forever (CAF-7).
#
# @return [Integer] number of entries removed by this call
def sweep
  cutoff = monotonic_now - @max_age
  removed = 0

  @entries.each_pair do |trace_id, entry|
    next unless entry.marked_at < cutoff

    @entries.delete(trace_id)
    removed += 1
  end

  @evicted_count.increment(removed) if removed.positive?
  removed
end
|
|
90
|
+
|
|
91
|
+
private
|
|
92
|
+
|
|
93
|
+
# Enforce the max_entries ceiling: once the registry grows past it, remove
# the oldest 10% of max_entries (by marked_at).
#
# Only the needed number of oldest entries is selected (no full sort), and
# evicted_count grows by the number of entries this call actually removed,
# so an entry already deleted elsewhere is not counted.
def maybe_evict_oldest
  return if @entries.size <= @max_entries

  to_drop = (@max_entries * 0.1).ceil
  oldest = @entries.each_pair.to_a.min_by(to_drop) { |_, entry| entry.marked_at }
  removed = oldest.count { |trace_id, _| !@entries.delete(trace_id).nil? }
  @evicted_count.increment(removed) if removed.positive?
end
|
|
101
|
+
|
|
102
|
+
def monotonic_now
|
|
103
|
+
Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
|
104
|
+
end
|
|
105
|
+
end
|
|
106
|
+
end
|
data/lib/flare/metric_counter.rb
CHANGED
|
@@ -22,6 +22,12 @@ module Flare
|
|
|
22
22
|
@error_count.increment if error
|
|
23
23
|
end
|
|
24
24
|
|
|
25
|
+
# Add pre-aggregated totals to this counter in one call. Each amount is
# converted with to_i before being added.
#
# @param count [#to_i] amount added to the count
# @param sum_ms [#to_i] amount added to the duration sum
# @param error_count [#to_i] amount added to the error count
def add(count:, sum_ms:, error_count: 0)
  increments = [[@count, count], [@sum_ms, sum_ms], [@error_count, error_count]]
  increments.map { |counter, amount| counter.increment(amount.to_i) }.last
end
|
|
30
|
+
|
|
25
31
|
def count
|
|
26
32
|
@count.value
|
|
27
33
|
end
|
data/lib/flare/metric_flusher.rb
CHANGED
|
@@ -15,11 +15,12 @@ module Flare
|
|
|
15
15
|
|
|
16
16
|
attr_reader :interval, :shutdown_timeout
|
|
17
17
|
|
|
18
|
-
def initialize(storage:, submitter:, interval: DEFAULT_INTERVAL, shutdown_timeout: DEFAULT_SHUTDOWN_TIMEOUT)
|
|
18
|
+
def initialize(storage:, submitter:, interval: DEFAULT_INTERVAL, shutdown_timeout: DEFAULT_SHUTDOWN_TIMEOUT, health_reporters: [])
|
|
19
19
|
@storage = storage
|
|
20
20
|
@submitter = submitter
|
|
21
21
|
@interval = interval
|
|
22
22
|
@shutdown_timeout = shutdown_timeout
|
|
23
|
+
@health_reporters = Array(health_reporters)
|
|
23
24
|
@pid = $$
|
|
24
25
|
@stopped = false
|
|
25
26
|
end
|
|
@@ -44,7 +45,7 @@ module Flare
|
|
|
44
45
|
|
|
45
46
|
@stopped = true
|
|
46
47
|
|
|
47
|
-
|
|
48
|
+
log "Shutting down metrics flusher, draining remaining metrics..."
|
|
48
49
|
|
|
49
50
|
if @timer
|
|
50
51
|
@timer.shutdown
|
|
@@ -59,7 +60,7 @@ module Flare
|
|
|
59
60
|
@pool.kill unless pool_terminated
|
|
60
61
|
end
|
|
61
62
|
|
|
62
|
-
|
|
63
|
+
log "Metrics flusher stopped"
|
|
63
64
|
end
|
|
64
65
|
|
|
65
66
|
def restart
|
|
@@ -72,6 +73,7 @@ module Flare
|
|
|
72
73
|
def flush_now
|
|
73
74
|
return 0 unless @storage && @submitter
|
|
74
75
|
|
|
76
|
+
record_health_metrics
|
|
75
77
|
drained = @storage.drain
|
|
76
78
|
return 0 if drained.empty?
|
|
77
79
|
|
|
@@ -100,13 +102,14 @@ module Flare
|
|
|
100
102
|
private
|
|
101
103
|
|
|
102
104
|
def post_to_pool
|
|
105
|
+
record_health_metrics
|
|
103
106
|
drained = @storage.drain
|
|
104
107
|
if drained.empty?
|
|
105
|
-
|
|
108
|
+
log "No metrics to flush"
|
|
106
109
|
return
|
|
107
110
|
end
|
|
108
111
|
|
|
109
|
-
|
|
112
|
+
log "Drained #{drained.size} metric keys for submission"
|
|
110
113
|
@pool.post { submit_to_cloud(drained) }
|
|
111
114
|
rescue => e
|
|
112
115
|
warn "[Flare] Metric drain error: #{e.message}"
|
|
@@ -120,5 +123,15 @@ module Flare
|
|
|
120
123
|
rescue => e
|
|
121
124
|
warn "[Flare] Metric submission error: #{e.message}"
|
|
122
125
|
end
|
|
126
|
+
|
|
127
|
+
# Ask every health reporter to record its metrics into the storage.
#
# Failures are isolated per reporter: an error in one reporter is warned
# about and the remaining reporters still run. The method-level rescue is
# kept so this method never raises into the flush path.
def record_health_metrics
  @health_reporters.each do |reporter|
    begin
      reporter.record(@storage)
    rescue => e
      warn "[Flare] Health metric recording error: #{e.message}"
    end
  end
rescue => e
  warn "[Flare] Health metric recording error: #{e.message}"
end
|
|
132
|
+
|
|
133
|
+
# Forward +message+ to Flare.log when the Flare module exposes it; do
# nothing otherwise.
def log(message)
  return unless Flare.respond_to?(:log)

  Flare.log(message)
end
|
|
123
136
|
end
|
|
124
137
|
end
|
data/lib/flare/metric_storage.rb
CHANGED
|
@@ -16,6 +16,11 @@ module Flare
|
|
|
16
16
|
counter.increment(duration_ms: duration_ms, error: error)
|
|
17
17
|
end
|
|
18
18
|
|
|
19
|
+
# Add pre-aggregated totals to the counter stored under +key+, creating the
# counter first when the key is new.
#
# @param key [Object] metric key
# @param count [Object] forwarded to the counter's add
# @param sum_ms [Object] forwarded to the counter's add
# @param error_count [Object] forwarded to the counter's add
def add(key, count:, sum_ms:, error_count: 0)
  totals = { count: count, sum_ms: sum_ms, error_count: error_count }
  @storage.compute_if_absent(key) { MetricCounter.new }.add(**totals)
end
|
|
23
|
+
|
|
19
24
|
# Atomically retrieves and clears all metrics.
|
|
20
25
|
# Returns a frozen hash of MetricKey => counter data.
|
|
21
26
|
def drain
|