flare 0.1.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +2 -4
- data/app/controllers/flare/application_controller.rb +8 -0
- data/app/helpers/flare/application_helper.rb +4 -4
- data/app/views/flare/jobs/show.html.erb +2 -2
- data/app/views/flare/requests/show.html.erb +8 -8
- data/lib/flare/configuration.rb +14 -0
- data/lib/flare/engine.rb +4 -2
- data/lib/flare/filtering_span_processor.rb +279 -0
- data/lib/flare/http_transport.rb +62 -0
- data/lib/flare/marker.rb +106 -0
- data/lib/flare/metric_counter.rb +6 -0
- data/lib/flare/metric_flusher.rb +18 -5
- data/lib/flare/metric_span_processor.rb +1 -1
- data/lib/flare/metric_storage.rb +5 -0
- data/lib/flare/rule_manager.rb +140 -0
- data/lib/flare/sampler.rb +130 -0
- data/lib/flare/storage/sqlite.rb +27 -22
- data/lib/flare/trace_blob.rb +116 -0
- data/lib/flare/trace_exporter.rb +143 -0
- data/lib/flare/trace_health_reporter.rb +74 -0
- data/lib/flare/upload_url_pool.rb +108 -0
- data/lib/flare/version.rb +1 -1
- data/lib/flare/web_marker_subscriber.rb +76 -0
- data/lib/flare.rb +146 -20
- metadata +11 -1
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "json"
|
|
4
|
+
require "zlib"
|
|
5
|
+
require "stringio"
|
|
6
|
+
require "logger"
|
|
7
|
+
require "uri"
|
|
8
|
+
require "concurrent/atomic/atomic_fixnum"
|
|
9
|
+
require "opentelemetry/sdk"
|
|
10
|
+
|
|
11
|
+
require_relative "sampler"
|
|
12
|
+
require_relative "trace_blob"
|
|
13
|
+
require_relative "http_transport"
|
|
14
|
+
|
|
15
|
+
module Flare
  # Custom OTel exporter. For each batch FilteringSpanProcessor hands over:
  #
  #   1. Group spans by trace_id.
  #   2. For each trace, build a Flare::TraceBlob and gzip-JSON-encode it.
  #   3. Check out a presigned R2 PUT URL from UploadUrlPool.
  #   4. PUT the gzipped body straight to R2 -- Flare's server is NOT in
  #      the trace-bytes path.
  #   5. After R2 returns 200, POST /api/traces { key } using the
  #      customer's push token + Flare-Project / Flare-Environment headers.
  #      That's the self-notify hop the design swapped in for the CF Worker.
  #
  # 403 from R2 means the presigned URL expired between issue and use;
  # discard, check out the next URL, retry once. Pool empty -> FAILURE.
  # Notify-POST failure is logged + counted but doesn't fail the export
  # (the blob is in R2, just won't be processed; incoming/* lifecycle
  # cleans it up in 1hr).
  #
  # Traces are shipped independently: an exception raised while shipping one
  # trace is counted and logged, and the remaining traces in the batch are
  # still attempted.
  class TraceExporter
    SUCCESS = OpenTelemetry::SDK::Trace::Export::SUCCESS
    FAILURE = OpenTelemetry::SDK::Trace::Export::FAILURE

    PUT_HEADERS = {
      "Content-Type" => "application/json",
      "Content-Encoding" => "gzip"
    }.freeze

    # Atomic counters (each responds to #value) read by TraceHealthReporter.
    attr_reader :put_failure_count, :notify_failure_count, :pool_empty_count, :exception_count

    # @param pool [#checkout] source of upload entries; each entry is read
    #   via entry[:put_url] and entry[:key]
    # @param notify_url [#to_s] endpoint that receives the notify POST
    # @param api_key [String] sent as a Bearer token on the notify POST
    # @param project [String] value of the Flare-Project header
    # @param environment [String] value of the Flare-Environment header
    # @param transport [#put, #post, nil] defaults to HttpTransport.new
    # @param logger [Logger, nil] defaults to a WARN-level logger on $stderr
    def initialize(pool:, notify_url:, api_key:, project:, environment:,
                   transport: nil, logger: nil)
      @pool = pool
      @notify_url = notify_url.to_s
      @api_key = api_key
      @project = project
      @environment = environment
      @transport = transport || HttpTransport.new
      @logger = logger || Logger.new($stderr, level: Logger::WARN)

      @put_failure_count = Concurrent::AtomicFixnum.new(0)
      @notify_failure_count = Concurrent::AtomicFixnum.new(0)
      @pool_empty_count = Concurrent::AtomicFixnum.new(0)
      @exception_count = Concurrent::AtomicFixnum.new(0)
    end

    # Ships every trace contained in +spans+.
    #
    # @param spans [Enumerable] span objects responding to #trace_id
    # @param timeout [Numeric, nil] accepted for exporter-interface
    #   compatibility; not used
    # @return [Integer] SUCCESS when every trace shipped (or the batch was
    #   empty), FAILURE when at least one trace failed. Never raises
    #   StandardError.
    def export(spans, timeout: nil)
      grouped = spans.group_by(&:trace_id)
      return SUCCESS if grouped.empty?

      # map (not any?) so every trace is attempted even after a failure.
      results = grouped.map { |trace_id, group| ship_trace(trace_id, group) }
      results.include?(FAILURE) ? FAILURE : SUCCESS
    rescue StandardError => e
      record_exception(e)
      FAILURE
    end

    # Nothing is buffered inside the exporter, so there is nothing to flush.
    def force_flush(timeout: nil)
      SUCCESS
    end

    # The exporter holds no resources of its own to release.
    def shutdown(timeout: nil)
      SUCCESS
    end

    private

    # Builds and ships one trace, converting any StandardError into a
    # counted, logged FAILURE so sibling traces in the batch still ship.
    def ship_trace(trace_id, group)
      ship(TraceBlob.build(trace_id: trace_id, spans: group))
    rescue StandardError => e
      record_exception(e)
      FAILURE
    end

    # Uploads +blob+ to a checked-out presigned URL and notifies on success.
    # +body+ carries the already-encoded payload into the single 403 retry so
    # it is not serialised and compressed twice.
    def ship(blob, retried: false, body: nil)
      return FAILURE if blob.nil?

      entry = @pool.checkout
      if entry.nil?
        @pool_empty_count.increment
        return FAILURE
      end

      body ||= gzip(JSON.generate(blob.to_h))
      response = @transport.put(entry[:put_url], body, PUT_HEADERS)

      # to_s tolerates transports that report the status as an Integer.
      case response.code.to_s
      when "200", "204"
        notify(entry[:key])
        SUCCESS
      when "403"
        # Presigned URL probably expired; try once more with the next one.
        retried ? record_put_failure(response) : ship(blob, retried: true, body: body)
      else
        record_put_failure(response)
      end
    end

    # Tells the notify endpoint that +key+ has been uploaded. Anything other
    # than HTTP 202 (including a raised error) is counted and logged; it
    # never fails the export.
    def notify(key)
      response = @transport.post(@notify_url, JSON.generate(key: key), notify_headers)
      return if response.code.to_s == "202"

      @notify_failure_count.increment
      @logger.warn("[Flare::TraceExporter] notify failed: HTTP #{response.code}")
    rescue StandardError => e
      @notify_failure_count.increment
      @logger.warn("[Flare::TraceExporter] notify exception: #{e.class}: #{e.message}")
    end

    def notify_headers
      {
        "Content-Type" => "application/json",
        "Authorization" => "Bearer #{@api_key}",
        "Flare-Project" => @project,
        "Flare-Environment" => @environment
      }
    end

    def record_put_failure(response)
      @put_failure_count.increment
      @logger.warn("[Flare::TraceExporter] PUT failed: HTTP #{response.code}")
      FAILURE
    end

    def record_exception(error)
      @exception_count.increment
      @logger.warn("[Flare::TraceExporter] export raised: #{error.class}: #{error.message}")
    end

    # Zlib.gzip compresses in one call and returns a binary String, so no
    # writer object can be left unclosed if compression raises.
    def gzip(body)
      Zlib.gzip(body)
    end
  end
end
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "metric_key"
|
|
4
|
+
|
|
5
|
+
module Flare
  # Records client-side tracing health into MetricStorage so flare-web can
  # warn when local buffering, URL exhaustion, or export errors reduce trace
  # fidelity.
  #
  # Counters are written as the increase since the previous #record call;
  # gauges are written as one sample (count: 1) with the value in sum_ms.
  class TraceHealthReporter
    NAMESPACE = "sdk"
    SERVICE = "flare-ruby"
    TARGET = "tracing"

    # @param processor [Object] read for dropped_count, failed_export_count,
    #   exception_count, buffer_size, buffer_high_watermark and max_queue
    # @param pool [Object] read for empty_count and expired_count
    # @param exporter [Object] read for put_failure_count,
    #   notify_failure_count, pool_empty_count and exception_count
    def initialize(processor:, pool:, exporter:)
      @processor = processor
      @pool = pool
      @exporter = exporter
      @last = {} # operation name => counter value seen on the previous record
      @mutex = Mutex.new
    end

    # Writes one round of health metrics into +storage+.
    #
    # @param storage [#add] receives (MetricKey, count:, sum_ms:, error_count:)
    # @param bucket [Time] instant to bucket under; truncated to the minute
    #   in UTC regardless of the zone it is expressed in
    def record(storage, bucket: Time.now.utc)
      @mutex.synchronize do
        record_counter(storage, bucket, "dropped_spans", @processor.dropped_count.value)
        record_counter(storage, bucket, "export_failures", @processor.failed_export_count.value)
        record_counter(storage, bucket, "processor_exceptions", @processor.exception_count.value)

        record_counter(storage, bucket, "upload_url_pool_empty", @pool.empty_count.value)
        record_counter(storage, bucket, "upload_url_expired", @pool.expired_count.value)

        record_counter(storage, bucket, "r2_put_failures", @exporter.put_failure_count.value)
        record_counter(storage, bucket, "notify_failures", @exporter.notify_failure_count.value)
        record_counter(storage, bucket, "trace_pool_empty", @exporter.pool_empty_count.value)
        record_counter(storage, bucket, "trace_export_exceptions", @exporter.exception_count.value)

        buffer_size = @processor.buffer_size
        buffer_high_watermark = @processor.buffer_high_watermark.value
        record_gauge(storage, bucket, "buffer_size", buffer_size)
        record_gauge(storage, bucket, "buffer_high_watermark", buffer_high_watermark)
        # The limit is only emitted when the buffer was actually used.
        if buffer_size.positive? || buffer_high_watermark.positive?
          record_gauge(storage, bucket, "buffer_limit", @processor.max_queue)
        end
        @processor.reset_buffer_high_watermark
      end
    end

    private

    # Emits the increase of a monotonically growing counter. The new value is
    # remembered even when nothing is emitted, so a counter that went
    # backwards simply restarts from its new baseline.
    def record_counter(storage, bucket, operation, current)
      previous = @last.fetch(operation, 0)
      @last[operation] = current
      delta = current - previous
      return unless delta.positive?

      storage.add(key(bucket, operation), count: delta, sum_ms: 0, error_count: 0)
    end

    def record_gauge(storage, bucket, operation, value)
      storage.add(key(bucket, operation), count: 1, sum_ms: value, error_count: 0)
    end

    def key(bucket, operation)
      MetricKey.new(
        bucket: bucket_time(bucket),
        namespace: NAMESPACE,
        service: SERVICE,
        target: TARGET,
        operation: operation
      )
    end

    # Truncates +time+ to the start of its minute, in UTC. getutc returns a
    # converted copy (Time#utc would mutate the caller's object), so a Time
    # given in a local zone maps to the correct UTC bucket instead of having
    # its local wall-clock fields reinterpreted as UTC.
    def bucket_time(time)
      utc = time.getutc
      Time.utc(utc.year, utc.month, utc.day, utc.hour, utc.min, 0)
    end
  end
end
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "time"
require "concurrent/atomic/atomic_fixnum"
require "concurrent/atomic/atomic_reference"
|
|
4
|
+
|
|
5
|
+
module Flare
  # Thread-safe pool of presigned R2 PUT URLs the RuleManager fills from
  # the /api/rules response. TraceExporter checks one out before each
  # upload; if the pool is empty (no active rules, no fresh URLs) it
  # returns nil and the exporter gives up on that batch -- caller decides
  # what to do.
  #
  # Each entry is a Hash: { upload_id:, key:, put_url:, expires_at: }.
  # expires_at is a Time; entries past their expiry are skipped on checkout.
  #
  # The entry list is an immutable (frozen) Array held in an atomic
  # reference; checkout and sweep both publish their change with
  # compare-and-set, so neither can undo the other's update.
  #
  # Fork-safe: after_fork clears the pool so child processes don't reuse
  # parent URLs (each child polls its own copy from /api/rules anyway).
  class UploadUrlPool
    # Atomic counters (each responds to #value): successful checkouts,
    # checkouts that found the pool empty, and entries skipped as expired.
    attr_reader :checkouts, :empty_count, :expired_count

    def initialize
      @entries_ref = Concurrent::AtomicReference.new([].freeze)
      @checkouts = Concurrent::AtomicFixnum.new(0)
      @empty_count = Concurrent::AtomicFixnum.new(0)
      @expired_count = Concurrent::AtomicFixnum.new(0)
    end

    # Replaces the whole pool. Entries that are not Hashes, lack upload_id /
    # key / put_url, or fail to normalize are dropped. nil clears the pool.
    def replace(entries)
      normalized = (entries || []).filter_map { |raw| normalize(raw) }
      @entries_ref.set(normalized.freeze)
    end

    # Removes and returns the first unexpired entry, or nil when none is left.
    #
    # @return [Hash, nil]
    def checkout
      now = Time.now
      loop do
        current = @entries_ref.get
        if current.empty?
          @empty_count.increment
          return nil
        end

        candidate, *rest = current
        # Lost a race with another writer: re-read and try again.
        next unless @entries_ref.compare_and_set(current, rest.freeze)

        if expired?(candidate, now)
          @expired_count.increment
          next # try the next one
        end

        @checkouts.increment
        return candidate
      end
    end

    def size
      @entries_ref.get.length
    end

    def empty?
      size.zero?
    end

    def clear
      @entries_ref.set([].freeze)
    end

    # Drop URLs that have already passed their expires_at. Cheap; safe to
    # call from RuleManager's scheduler in between polls.
    #
    # The filtered list is published with compare-and-set and recomputed if
    # the pool changed in the meantime, so a sweep can neither put back an
    # entry a concurrent checkout just handed out nor overwrite a concurrent
    # replace with stale entries.
    #
    # @return [Integer] number of entries removed by this call
    def sweep
      now = Time.now
      loop do
        current = @entries_ref.get
        live = current.reject { |entry| expired?(entry, now) }
        removed = current.length - live.length
        return 0 if removed.zero?
        return removed if @entries_ref.compare_and_set(current, live.freeze)
      end
    end

    # Call from Flare.after_fork. Parent's URLs aren't usable from the
    # child's point of view (each child should get its own from a fresh
    # /api/rules poll), so just drop them.
    def after_fork
      clear
    end

    private

    # Converts one raw entry (symbol or string keys) into the canonical
    # symbol-keyed Hash, parsing a String expires_at as ISO 8601. Returns nil
    # for anything unusable, including an unparseable timestamp.
    def normalize(raw)
      return nil unless raw.is_a?(Hash)

      upload_id = raw[:upload_id] || raw["upload_id"]
      key = raw[:key] || raw["key"]
      put_url = raw[:put_url] || raw["put_url"]
      expires_at = raw[:expires_at] || raw["expires_at"]
      return nil if upload_id.nil? || key.nil? || put_url.nil?

      # Time.iso8601 comes from the "time" standard library.
      expires_at = Time.iso8601(expires_at) if expires_at.is_a?(String)
      { upload_id: upload_id, key: key, put_url: put_url, expires_at: expires_at }
    rescue StandardError
      nil
    end

    # An entry without expires_at never expires.
    def expired?(entry, now)
      entry[:expires_at] && entry[:expires_at] <= now
    end
  end
end
|
data/lib/flare/version.rb
CHANGED
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "opentelemetry/sdk"
|
|
4
|
+
|
|
5
|
+
module Flare
  # Path 2: ActiveSupport::Notifications subscriber that fires on
  # `start_processing.action_controller`, after Rails has routed to a
  # controller#action. At that point the rack server span's start
  # attributes don't yet carry code.namespace/code.function -- only the
  # ActionPack instrumentation adds them, and Flare::Sampler's start-time
  # decision (RECORD_ONLY) was already locked in.
  #
  # The subscriber consults the same sampler's rule set, finds any whose
  # match_attributes match the now-known controller/action, applies the
  # deterministic trace_id_ratio gate (CAF-1: no rate bypass on Path 2),
  # and on pass calls marker.mark(trace_id, owner_span_id:, rule_id:).
  # FilteringSpanProcessor then forwards every span in the trace to the
  # exporter and unmarks when the owner (this rack span) finishes.
  class WebMarkerSubscriber
    NOTIFICATION = "start_processing.action_controller"

    # @param sampler [#rules, #trace_id_ratio] rule source and ratio gate
    # @param marker [#mark] receives the trace to capture
    def initialize(sampler:, marker:)
      @sampler = sampler
      @marker = marker
    end

    # Subscribes to NOTIFICATION. Calling it again while already subscribed
    # is a no-op, so repeated starts never stack up duplicate subscriptions.
    #
    # @return [self]
    def start
      return self if @subscriber

      @subscriber = ActiveSupport::Notifications.subscribe(NOTIFICATION) do |*, payload|
        safely_handle(payload)
      end
      self
    end

    # Removes the subscription, if any.
    #
    # @return [self]
    def stop
      ActiveSupport::Notifications.unsubscribe(@subscriber) if @subscriber
      @subscriber = nil
      self
    end

    # Public for tests so they don't have to drive ActiveSupport::Notifications.
    # current_span lets tests inject a context; in production it's the
    # rack server span on the current thread.
    #
    # Marks the trace for the first rule that both matches the payload's
    # controller/action and passes the ratio gate; later rules are ignored.
    def handle(payload, current_span: OpenTelemetry::Trace.current_span)
      return unless current_span
      ctx = current_span.context
      return unless ctx && ctx.valid?

      attrs = candidate_attributes(payload)
      return if attrs.empty?

      @sampler.rules.each do |rule|
        next unless matches?(rule, attrs)
        next unless @sampler.trace_id_ratio(ctx.trace_id) < rule.rate

        current_span.set_attribute(Flare::Sampler::RULE_ID_ATTRIBUTE, rule.id) if current_span.respond_to?(:set_attribute)
        @marker.mark(ctx.trace_id, owner_span_id: ctx.span_id, rule_id: rule.id)
        break
      end
    end

    private

    # Notification-callback wrapper: trace marking is best-effort, so an
    # error here is routed to OpenTelemetry's error handler instead of
    # propagating into the request that published the notification.
    def safely_handle(payload)
      handle(payload)
    rescue StandardError => e
      OpenTelemetry.handle_error(exception: e, message: "[Flare::WebMarkerSubscriber] handle raised")
    end

    # Maps the payload's controller/action (symbol or string keys) onto the
    # attribute names rules match against, omitting whichever is absent.
    def candidate_attributes(payload)
      controller = payload[:controller] || payload["controller"]
      action = payload[:action] || payload["action"]
      {
        "code.namespace" => controller,
        "code.function" => action
      }.compact
    end

    def matches?(rule, attrs)
      rule.match_attributes.all? { |k, v| attrs[k] == v }
    end
  end
end
|
data/lib/flare.rb
CHANGED
|
@@ -13,6 +13,15 @@ require_relative "flare/metric_flusher"
|
|
|
13
13
|
require_relative "flare/backoff_policy"
|
|
14
14
|
require_relative "flare/metric_submitter"
|
|
15
15
|
|
|
16
|
+
require_relative "flare/sampler"
|
|
17
|
+
require_relative "flare/marker"
|
|
18
|
+
require_relative "flare/web_marker_subscriber"
|
|
19
|
+
require_relative "flare/filtering_span_processor"
|
|
20
|
+
require_relative "flare/upload_url_pool"
|
|
21
|
+
require_relative "flare/trace_exporter"
|
|
22
|
+
require_relative "flare/rule_manager"
|
|
23
|
+
require_relative "flare/trace_health_reporter"
|
|
24
|
+
|
|
16
25
|
module Flare
|
|
17
26
|
class Error < StandardError; end
|
|
18
27
|
|
|
@@ -114,15 +123,43 @@ module Flare
|
|
|
114
123
|
@metric_flusher = flusher
|
|
115
124
|
end
|
|
116
125
|
|
|
126
|
+
# Trace-sampling components, exposed for tests + manual after_fork wiring.
|
|
127
|
+
# Each reader returns the component built by setup_tracing_components /
# start_rule_manager, or nil when tracing has not been set up.
def sampler
  @sampler
end

def marker
  @marker
end

def upload_url_pool
  @upload_url_pool
end

def rule_manager
  @rule_manager
end

def trace_span_processor
  @trace_span_processor
end

def trace_health_reporter
  @trace_health_reporter
end
|
|
133
|
+
|
|
117
134
|
# Manually flush metrics (useful for testing or forced flushes).
|
|
118
135
|
def flush_metrics
|
|
119
136
|
@metric_flusher&.flush_now || 0
|
|
120
137
|
end
|
|
121
138
|
|
|
122
|
-
#
|
|
139
|
+
# Default project key, derived from the host Rails app's module name.
|
|
140
|
+
# Customers can override by configuring something else once we expose
|
|
141
|
+
# configuration.project; for v0.3 this matches MetricSubmitter's behavior.
|
|
142
|
+
# Returns the project key for this process: the underscored parent module
# name of the Rails application class, "rails_app" when that name cannot be
# derived, or "app" when no Rails application is present.
def service_name_for_app
  return "app" unless defined?(Rails) && Rails.respond_to?(:application) && Rails.application

  begin
    Rails.application.class.module_parent_name.underscore
  rescue StandardError
    # e.g. module_parent_name is nil or String#underscore is unavailable.
    "rails_app"
  end
end
|
|
149
|
+
|
|
150
|
+
# Returns the environment name as a String: Rails.env when Rails exposes
# it, otherwise the RACK_ENV environment variable, defaulting to
# "development".
def rails_env_name
  return Rails.env.to_s if defined?(Rails) && Rails.respond_to?(:env)

  ENV.fetch("RACK_ENV", "development")
end
|
|
157
|
+
|
|
158
|
+
# Re-initialize background threads after fork.
|
|
123
159
|
# Call this from Puma/Unicorn after_fork hooks.
|
|
124
160
|
# Resets fork-sensitive state in the child process. Every component is
# optional; ones that were never started are skipped.
def after_fork
  @metric_flusher&.after_fork
  # Drop upload URLs inherited from the parent before the rule manager is
  # restarted, so the child only uses URLs from its own poll.
  @upload_url_pool&.after_fork
  @rule_manager&.after_fork
end
|
|
127
164
|
|
|
128
165
|
# Configure OpenTelemetry SDK and instrumentations. Must run before the
|
|
@@ -135,13 +172,9 @@ module Flare
|
|
|
135
172
|
# Suppress noisy OTel INFO logs
|
|
136
173
|
OpenTelemetry.logger = Logger.new(STDOUT, level: Logger::WARN)
|
|
137
174
|
|
|
138
|
-
service_name =
|
|
139
|
-
Rails.application.class.module_parent_name.underscore rescue "rails_app"
|
|
140
|
-
else
|
|
141
|
-
"app"
|
|
142
|
-
end
|
|
175
|
+
service_name = service_name_for_app
|
|
143
176
|
|
|
144
|
-
# Require
|
|
177
|
+
# Require flare's bundled instrumentations
|
|
145
178
|
require "opentelemetry-instrumentation-rack"
|
|
146
179
|
require "opentelemetry-instrumentation-net_http"
|
|
147
180
|
require "opentelemetry-instrumentation-active_support"
|
|
@@ -165,19 +198,25 @@ module Flare
|
|
|
165
198
|
log "Spans enabled (database=#{configuration.database_path})"
|
|
166
199
|
end
|
|
167
200
|
|
|
168
|
-
#
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
201
|
+
# Auto-detect and install all OTel instrumentation gems in the bundle.
|
|
202
|
+
# Apps can add gems like opentelemetry-instrumentation-sidekiq to their
|
|
203
|
+
# Gemfile and they'll be picked up automatically.
|
|
204
|
+
c.use_all(
|
|
205
|
+
"OpenTelemetry::Instrumentation::Rack" => {
|
|
206
|
+
untraced_requests: ->(env) {
|
|
207
|
+
request = Rack::Request.new(env)
|
|
208
|
+
return true if request.path.start_with?("/flare")
|
|
209
|
+
|
|
210
|
+
configuration.ignore_request.call(request)
|
|
211
|
+
}
|
|
212
|
+
},
|
|
213
|
+
# Name Sidekiq job spans after the worker class (e.g. "MyWorker
|
|
214
|
+
# process") instead of the upstream default of the queue name
|
|
215
|
+
# ("default process"), matching how ActiveJob spans are named.
|
|
216
|
+
"OpenTelemetry::Instrumentation::Sidekiq" => {
|
|
217
|
+
span_naming: :job_class,
|
|
175
218
|
}
|
|
176
|
-
|
|
177
|
-
c.use "OpenTelemetry::Instrumentation::ActiveSupport"
|
|
178
|
-
c.use "OpenTelemetry::Instrumentation::ActionPack" if defined?(ActionController)
|
|
179
|
-
c.use "OpenTelemetry::Instrumentation::ActionView" if defined?(ActionView)
|
|
180
|
-
c.use "OpenTelemetry::Instrumentation::ActiveJob" if defined?(ActiveJob)
|
|
219
|
+
)
|
|
181
220
|
end
|
|
182
221
|
|
|
183
222
|
# Subscribe to common ActiveSupport notification patterns
|
|
@@ -195,12 +234,98 @@ module Flare
|
|
|
195
234
|
span_processor.shutdown
|
|
196
235
|
log "Span processor flushed and stopped"
|
|
197
236
|
end
|
|
237
|
+
if @trace_span_processor
|
|
238
|
+
@trace_span_processor.force_flush
|
|
239
|
+
@trace_span_processor.shutdown
|
|
240
|
+
log "Trace span processor flushed and stopped"
|
|
241
|
+
end
|
|
198
242
|
log "Shutdown complete"
|
|
199
243
|
end
|
|
200
244
|
|
|
201
245
|
@otel_configured = true
|
|
202
246
|
end
|
|
203
247
|
|
|
248
|
+
# Start the trace-rules poller. Polls GET /api/rules every
|
|
249
|
+
# tracing_poll_interval (default 30s) so the in-process sampler + URL
|
|
250
|
+
# pool stay current. Called from config.after_initialize -- after the
|
|
251
|
+
# user's configure block has run -- so configuration.url / .key /
|
|
252
|
+
# .tracing_enabled are settled.
|
|
253
|
+
# Builds and starts the RuleManager once. Does nothing when a rule manager
# already exists (so repeated calls cannot start a second poller or stack
# up at_exit hooks), when trace submission is not configured, or when the
# tracing components could not be set up.
def start_rule_manager
  return if @rule_manager
  return unless configuration.tracing_submission_configured?

  setup_tracing_components
  return unless @sampler && @marker && @upload_url_pool

  @rule_manager = RuleManager.new(
    sampler: @sampler,
    marker: @marker,
    pool: @upload_url_pool,
    base_url: configuration.url,
    api_key: configuration.key,
    project: service_name_for_app,
    environment: rails_env_name,
    interval: configuration.tracing_poll_interval
  )
  @rule_manager.start
  log "Rule manager started (poll=#{configuration.tracing_poll_interval}s)"

  # Stop the poller when the process exits.
  at_exit { @rule_manager&.stop }
end
|
|
274
|
+
|
|
275
|
+
# Builds the trace-sampling pipeline once: sampler, marker and URL pool,
# then the exporter, span processor, health reporter and (when
# ActiveSupport::Notifications is loaded) the web marker subscriber.
# Later calls return immediately once the span processor exists.
def setup_tracing_components
  return if @trace_span_processor

  @sampler = Sampler.new
  @marker = Marker.new
  @upload_url_pool = UploadUrlPool.new

  # Trace sampling: server-controlled per-route capture. The sampler runs
  # at span start; for routes it can't decide there (Rails web spans get
  # their controller#action attributes set post-routing) the marker +
  # WebMarkerSubscriber handle it. The RECORD_ONLY delegates keep children
  # of unsampled local and remote parents recording so processors still see
  # web requests that arrive with an unsampled traceparent header.
  #
  # Sampler is set on the tracer_provider AFTER SDK.configure -- the SDK's
  # Configurator block doesn't expose a `sampler=`; the provider does.
  provider = OpenTelemetry.tracer_provider
  provider.sampler = OpenTelemetry::SDK::Trace::Samplers.parent_based(
    root: @sampler,
    remote_parent_sampled: ALWAYS_RECORD_ONLY,
    remote_parent_not_sampled: ALWAYS_RECORD_ONLY,
    local_parent_not_sampled: ALWAYS_RECORD_ONLY
  )

  notify_url = "#{configuration.url.to_s.chomp('/')}/api/traces"
  @trace_exporter = TraceExporter.new(
    pool: @upload_url_pool,
    notify_url: notify_url,
    api_key: configuration.key,
    project: service_name_for_app,
    environment: rails_env_name
  )

  processor = FilteringSpanProcessor.new(
    exporter: @trace_exporter,
    marker: @marker,
    max_queue: configuration.tracing_max_queue
  )
  @trace_span_processor = processor
  OpenTelemetry.tracer_provider.add_span_processor(processor)

  @trace_health_reporter = TraceHealthReporter.new(
    processor: processor,
    pool: @upload_url_pool,
    exporter: @trace_exporter
  )

  # Path 2 trace marking. Rails-only -- in non-Rails contexts the
  # subscriber would never fire but creating it is harmless.
  if defined?(ActiveSupport::Notifications)
    subscriber = WebMarkerSubscriber.new(sampler: @sampler, marker: @marker)
    @web_marker_subscriber = subscriber.start
  end

  log "Tracing enabled (poll=#{configuration.tracing_poll_interval}s)"
end
|
|
328
|
+
|
|
204
329
|
# Start the metrics flusher. Called from config.after_initialize so
|
|
205
330
|
# user configuration (metrics_enabled, flush_interval, etc.) is applied.
|
|
206
331
|
def start_metrics_flusher
|
|
@@ -223,7 +348,8 @@ module Flare
|
|
|
223
348
|
@metric_flusher = MetricFlusher.new(
|
|
224
349
|
storage: @metric_storage,
|
|
225
350
|
submitter: submitter,
|
|
226
|
-
interval: configuration.metrics_flush_interval
|
|
351
|
+
interval: configuration.metrics_flush_interval,
|
|
352
|
+
health_reporters: @trace_health_reporter ? [@trace_health_reporter] : []
|
|
227
353
|
)
|
|
228
354
|
@metric_flusher.start
|
|
229
355
|
log "Metrics flusher started (interval=#{configuration.metrics_flush_interval}s)"
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: flare
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.3.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- John Nunemaker
|
|
@@ -198,18 +198,28 @@ files:
|
|
|
198
198
|
- lib/flare/cli/status_command.rb
|
|
199
199
|
- lib/flare/configuration.rb
|
|
200
200
|
- lib/flare/engine.rb
|
|
201
|
+
- lib/flare/filtering_span_processor.rb
|
|
201
202
|
- lib/flare/http_metrics_config.rb
|
|
203
|
+
- lib/flare/http_transport.rb
|
|
204
|
+
- lib/flare/marker.rb
|
|
202
205
|
- lib/flare/metric_counter.rb
|
|
203
206
|
- lib/flare/metric_flusher.rb
|
|
204
207
|
- lib/flare/metric_key.rb
|
|
205
208
|
- lib/flare/metric_span_processor.rb
|
|
206
209
|
- lib/flare/metric_storage.rb
|
|
207
210
|
- lib/flare/metric_submitter.rb
|
|
211
|
+
- lib/flare/rule_manager.rb
|
|
212
|
+
- lib/flare/sampler.rb
|
|
208
213
|
- lib/flare/source_location.rb
|
|
209
214
|
- lib/flare/sqlite_exporter.rb
|
|
210
215
|
- lib/flare/storage.rb
|
|
211
216
|
- lib/flare/storage/sqlite.rb
|
|
217
|
+
- lib/flare/trace_blob.rb
|
|
218
|
+
- lib/flare/trace_exporter.rb
|
|
219
|
+
- lib/flare/trace_health_reporter.rb
|
|
220
|
+
- lib/flare/upload_url_pool.rb
|
|
212
221
|
- lib/flare/version.rb
|
|
222
|
+
- lib/flare/web_marker_subscriber.rb
|
|
213
223
|
- public/flare-assets/flare.css
|
|
214
224
|
- public/flare-assets/images/flipper.png
|
|
215
225
|
homepage: https://github.com/jnunemaker/flare
|