flare 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,140 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "json"
4
+ require "logger"
5
+ require "concurrent/timer_task"
6
+ require "concurrent/atomic/atomic_fixnum"
7
+
8
+ require_relative "http_transport"
9
+
10
module Flare
  # The SDK's only poll. Every +interval+ seconds (default 30) it issues
  # GET /api/rules. A 200 response carries the active trace rules plus the
  # presigned upload URLs nested under each rule; the rules are handed to
  # Sampler#update_rules, the URLs to UploadUrlPool#replace, and the Marker
  # is swept so stuck entries don't linger.
  #
  # Response handling:
  # * 200     -> apply payload, remember the ETag, sweep the marker.
  # * 304     -> sweep the marker only (polls send If-None-Match once an
  #              ETag is known).
  # * 401/403 -> flag the manager as stopped and shut the timer down, so a
  #              misconfigured token doesn't keep hitting the server.
  # * other   -> logged and counted; the timer tries again next tick.
  #
  # Exceptions raised while polling are logged and counted, never re-raised.
  #
  # Fork handling: after_fork resets the pool and restarts the timer so the
  # child process polls on its own.
  class RuleManager
    DEFAULT_INTERVAL = 30

    attr_reader :poll_count, :etag, :stopped_due_to_auth, :last_error_count

    # sampler:     receives the rule list via #update_rules.
    # marker:      receives #sweep after every 200/304.
    # pool:        receives the upload URL entries via #replace (and
    #              #after_fork on fork).
    # base_url:    server root; a trailing "/" is stripped before
    #              "/api/rules" is appended.
    # api_key:     sent as a Bearer token.
    # project / environment: sent as Flare-Project / Flare-Environment.
    # interval:    seconds between polls.
    # transport:   object responding to #get(url, headers); defaults to a
    #              new HttpTransport.
    # logger:      defaults to a WARN-level logger on $stderr.
    def initialize(sampler:, marker:, pool:, base_url:, api_key:, project:, environment:,
                   interval: DEFAULT_INTERVAL, transport: nil, logger: nil)
      @sampler = sampler
      @marker = marker
      @pool = pool
      @rules_url = "#{base_url.to_s.chomp('/')}/api/rules"
      @api_key = api_key
      @project = project
      @environment = environment
      @interval = interval
      @transport = transport || HttpTransport.new
      @logger = logger || Logger.new($stderr, level: Logger::WARN)

      @etag = nil
      @poll_count = Concurrent::AtomicFixnum.new(0)
      @last_error_count = Concurrent::AtomicFixnum.new(0)
      @stopped_due_to_auth = false
      @pid = $$
    end

    # Starts the polling timer (first poll runs immediately). No-op when a
    # timer already exists or polling was stopped by an auth failure.
    # Returns self.
    def start
      return self if @timer || @stopped_due_to_auth

      @timer = Concurrent::TimerTask.execute(
        execution_interval: @interval,
        run_now: true,
        name: "flare-rule-manager-timer"
      ) { poll_safely }
      self
    end

    # Shuts the timer down, waiting up to one second before killing it.
    # Returns self.
    def stop
      timer = @timer
      return self unless timer

      timer.shutdown
      timer.wait_for_termination(1)
      timer.kill unless timer.shutdown?
      @timer = nil
      self
    end

    # True while a timer exists and reports itself as running.
    def running?
      return false unless @timer

      @timer.running?
    end

    # Call in a forked child: records the new pid, lets the pool reset
    # itself, and replaces the timer.
    def after_fork
      @pid = $$
      @pool.after_fork
      stop
      start
    end

    # Public so callers can force a poll (tests + integration tests).
    def poll_now
      poll_safely
    end

    private

    # Runs one poll, converting any StandardError into a counter bump and a
    # warning so the timer keeps ticking.
    def poll_safely
      poll
    rescue StandardError => e
      @last_error_count.increment
      @logger.warn("[Flare::RuleManager] poll exception: #{e.class}: #{e.message}")
    end

    # Performs a single GET and dispatches on the status code.
    def poll
      return if @stopped_due_to_auth

      response = @transport.get(@rules_url, request_headers)
      @poll_count.increment

      case response.code
      when "304"
        @marker.sweep
      when "200"
        apply(JSON.parse(response.body))
        # Remember the ETag only after the payload parsed and applied. If it
        # were stored first, a malformed 200 would leave us sending
        # If-None-Match for rules we never installed, and every later poll
        # would come back 304.
        @etag = response.header("ETag")
        @marker.sweep
      when "401", "403"
        @stopped_due_to_auth = true
        @logger.warn("[Flare::RuleManager] auth failed (#{response.code}); stopping poll")
        stop
      else
        @last_error_count.increment
        @logger.warn("[Flare::RuleManager] unexpected #{response.code}")
      end
    end

    # Auth + scoping headers, plus If-None-Match once an ETag is known.
    def request_headers
      headers = {
        "Authorization" => "Bearer #{@api_key}",
        "Flare-Project" => @project,
        "Flare-Environment" => @environment
      }
      headers["If-None-Match"] = @etag if @etag
      headers
    end

    # Server payload shape (see tirana-v2 Api::RulesController):
    #   { "trace_rules": [{ "id", "match_attributes", "rate", ..., "urls": [...] }] }
    # Rules go to the sampler; every rule's "urls" are flattened into one
    # list for the pool.
    def apply(payload)
      rules = payload["trace_rules"] || []
      @sampler.update_rules(rules)

      url_entries = rules.flat_map { |rule| Array(rule["urls"]) }
      @pool.replace(url_entries)
    end
  end
end
@@ -0,0 +1,130 @@
1
+ # frozen_string_literal: true
2
+
3
require "concurrent/atomic/atomic_fixnum"
require "concurrent/atomic/atomic_reference"
require "opentelemetry/sdk"
5
+
6
module Flare
  # Path 1 trace sampler. At span start, iterates active rules; returns
  # RECORD_AND_SAMPLE when one matches and the deterministic trace_id_ratio
  # falls under the rule's rate. Otherwise RECORD_ONLY -- the span still
  # records so MetricSpanProcessor sees it; the trace export decision for
  # web spans is deferred to Path 2 via Flare::Marker.
  #
  # Used as the `root` sampler inside an OTel ParentBased sampler so root
  # spans go through this logic but child spans inherit upstream decisions.
  # The `local_parent_not_sampled` slot of the ParentBased should point at
  # Flare::ALWAYS_RECORD_ONLY -- the default ALWAYS_OFF would drop children
  # of an unsampled local parent, making them NoOp spans the processors
  # never see.
  #
  # Rules are pushed in via update_rules from Flare::RuleManager; the swap
  # is atomic, and malformed rule entries are dropped with a counter so a
  # bad server payload can't crash the tracing path.
  class Sampler
    Decision = OpenTelemetry::SDK::Trace::Samplers::Decision
    Result = OpenTelemetry::SDK::Trace::Samplers::Result

    RULE_ID_ATTRIBUTE = "flare.rule_id"

    Rule = Struct.new(:id, :match_attributes, :rate, keyword_init: true)

    attr_reader :dropped_rule_count

    def initialize
      @rules_ref = Concurrent::AtomicReference.new([].freeze)
      @dropped_rule_count = Concurrent::AtomicFixnum.new(0)
    end

    # new_rules: an array of rule hashes from GET /api/rules, e.g.
    #   [{ "id" => 1, "match_attributes" => {...}, "rate" => 0.5 }, ...]
    # nil is treated as an empty list. Entries that don't validate are
    # skipped (counted in dropped_rule_count). The validated list is frozen
    # and swapped in as a whole.
    def update_rules(new_rules)
      candidates = new_rules || []
      validated = candidates.filter_map { |raw| validate(raw) }
      @dropped_rule_count.increment(candidates.length - validated.length)
      @rules_ref.set(validated.freeze)
    end

    # The currently active, frozen list of Rule structs.
    def rules
      @rules_ref.get
    end

    # OTel sampler entry point. The first rule whose match_attributes all
    # equal the span's attributes AND whose rate exceeds the trace's ratio
    # wins: the span is sampled and tagged with RULE_ID_ATTRIBUTE. With no
    # winning rule the span is RECORD_ONLY. The parent's tracestate is
    # passed through in both cases.
    def should_sample?(trace_id:, parent_context:, links:, name:, kind:, attributes:)
      tracestate = tracestate_from(parent_context)
      ratio = nil

      rules.each do |rule|
        next unless matches?(rule, attributes)

        # The ratio depends only on trace_id, so compute it at most once per
        # call, and only if some rule actually matches.
        ratio ||= trace_id_ratio(trace_id)
        next unless ratio < rule.rate

        merged = (attributes || {}).merge(RULE_ID_ATTRIBUTE => rule.id)
        return Result.new(decision: Decision::RECORD_AND_SAMPLE, attributes: merged, tracestate: tracestate)
      end

      Result.new(decision: Decision::RECORD_ONLY, tracestate: tracestate)
    end

    def description
      "Flare::Sampler"
    end

    # Cross-language formula: last 8 bytes of the 16-byte raw trace_id as
    # uint64-big-endian, divided by 2^64. Same in every Flare SDK so the
    # server can reproduce the decision if it ever needs to.
    # Accepts a String (its bytes are used) or anything Array() can turn
    # into a list of byte values. Returns a Float in [0, 1).
    def trace_id_ratio(trace_id)
      bytes = trace_id.is_a?(String) ? trace_id.bytes : Array(trace_id)
      value = bytes.last(8).reduce(0) { |acc, byte| (acc << 8) | byte }
      value.to_f / (1 << 64)
    end

    private

    # Tracestate of the span in parent_context, or the default tracestate
    # when that span has none.
    def tracestate_from(parent_context)
      OpenTelemetry::Trace.current_span(parent_context).context.tracestate ||
        OpenTelemetry::Trace::Tracestate::DEFAULT
    end

    # A rule matches when every one of its key/value pairs is present,
    # with an equal value, in the span attributes. nil attributes never
    # match.
    def matches?(rule, attributes)
      return false if attributes.nil?

      rule.match_attributes.all? { |key, expected| attributes[key] == expected }
    end

    # Turns one raw rule hash (string or symbol keys) into a Rule, or
    # returns nil when it is unusable: not a Hash, missing id, empty or
    # non-string match_attributes, or a rate outside (0, 1]. Any error
    # raised while checking also yields nil.
    def validate(raw)
      return nil unless raw.is_a?(Hash)

      id = raw["id"] || raw[:id]
      match = raw["match_attributes"] || raw[:match_attributes]
      rate = raw["rate"] || raw[:rate]

      return nil if id.nil?
      return nil unless match.is_a?(Hash) && match.any?
      return nil unless match.all? { |k, v| k.is_a?(String) && v.is_a?(String) && !v.empty? }
      return nil unless rate.is_a?(Numeric) && rate > 0.0 && rate <= 1.0

      Rule.new(id: id, match_attributes: match, rate: rate.to_f)
    rescue StandardError
      nil
    end
  end

  # Tiny sampler whose should_sample? returns RECORD_ONLY for every span.
  # Slot this into the ParentBased local_parent_not_sampled position so
  # children of an unsampled local parent stay recording (the default
  # ALWAYS_OFF turns them into NoOp spans no processor ever sees).
  class AlwaysRecordOnly
    Decision = OpenTelemetry::SDK::Trace::Samplers::Decision
    Result = OpenTelemetry::SDK::Trace::Samplers::Result

    # Ignores every argument except parent_context, whose span's tracestate
    # (or the default tracestate) is carried into the result.
    def should_sample?(parent_context: nil, **)
      tracestate = OpenTelemetry::Trace.current_span(parent_context).context.tracestate ||
                   OpenTelemetry::Trace::Tracestate::DEFAULT
      Result.new(decision: Decision::RECORD_ONLY, tracestate: tracestate)
    end

    def description
      "Flare::AlwaysRecordOnly"
    end
  end

  # Shared instance for wiring into ParentBased.
  ALWAYS_RECORD_ONLY = AlwaysRecordOnly.new
end
@@ -0,0 +1,116 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "time"
4
+
5
module Flare
  # Value object that turns a group of OTel span_data for a single trace
  # into the Flare-JSON wire format the server expects:
  #
  #   {
  #     "trace_id": "<hex>",
  #     "trace_rule_id": <int|nil>,
  #     "root_name": "<string>",
  #     "started_at": "<iso8601>",
  #     "duration_ms": <int>,
  #     "spans": [
  #       { "id", "parent_id", "name", "started_at",
  #         "duration_ms", "attributes" }
  #     ]
  #   }
  #
  # The trace_rule_id is read from any span carrying the
  # `flare.rule_id` attribute (Path 1 sets it on the sampled root, Path 2
  # sets it on the rack owner span via WebMarkerSubscriber).
  class TraceBlob
    ZERO_SPAN_ID = ("\x00".b * 8).freeze
    ROOT_NAME_LIMIT = 255

    # Returns a TraceBlob, or nil when there are no spans to describe.
    def self.build(trace_id:, spans:)
      return nil if spans.nil? || spans.empty?

      new(trace_id: trace_id, spans: spans)
    end

    # trace_id: raw trace id bytes (hex-encoded on output).
    # spans:    span_data-like objects responding to span_id,
    #           parent_span_id, name, start_timestamp, end_timestamp and
    #           attributes (kind is optional).
    def initialize(trace_id:, spans:)
      @trace_id = trace_id
      @spans = spans
    end

    # Builds the wire-format Hash. Top-level name/start/duration come from
    # the span picked by find_root.
    def to_h
      root = find_root
      {
        "trace_id" => hexify(@trace_id),
        "trace_rule_id" => rule_id_from_spans,
        "root_name" => root_name(root),
        "started_at" => iso(root&.start_timestamp),
        "duration_ms" => duration_ms(root),
        "spans" => @spans.map { |span| span_to_h(span) }
      }
    end

    private

    # Picks the span that represents the trace, in order of preference:
    # a server/consumer span carrying a rule id, a parentless span, any
    # server/consumer span, any span carrying a rule id, the first span.
    def find_root
      @spans.find { |s| entry_span?(s) && rule_id_attribute(s) } ||
        @spans.find { |s| root?(s) } ||
        @spans.find { |s| entry_span?(s) } ||
        @spans.find { |s| rule_id_attribute(s) } ||
        @spans.first
    end

    # A span is parentless when its parent id is nil, empty or all zeroes.
    def root?(span)
      parent_id = span.parent_span_id
      parent_id.nil? || parent_id.empty? || parent_id == ZERO_SPAN_ID
    end

    # True for :server and :consumer spans; false when the object has no
    # #kind at all.
    def entry_span?(span)
      return false unless span.respond_to?(:kind)

      %i[server consumer].include?(span.kind)
    end

    # First truthy rule id found on any span, else nil.
    def rule_id_from_spans
      @spans.each do |span|
        value = rule_id_attribute(span)
        return value if value
      end
      nil
    end

    # Reads the rule id attribute under its string key, then its symbol key.
    def rule_id_attribute(span)
      attrs = span.attributes
      return nil unless attrs

      attrs[Sampler::RULE_ID_ATTRIBUTE] || attrs[Sampler::RULE_ID_ATTRIBUTE.to_sym]
    end

    # Root span name as a String, truncated to ROOT_NAME_LIMIT characters.
    def root_name(root)
      root&.name&.to_s&.slice(0, ROOT_NAME_LIMIT)
    end

    # Wire representation of one span; parentless spans get a nil parent_id.
    def span_to_h(span)
      {
        "id" => hexify(span.span_id),
        "parent_id" => root?(span) ? nil : hexify(span.parent_span_id),
        "name" => span.name,
        "started_at" => iso(span.start_timestamp),
        "duration_ms" => duration_ms(span),
        "attributes" => span.attributes || {}
      }
    end

    # Lower-case hex encoding of a byte string; nil passes through.
    def hexify(bytes)
      return nil if bytes.nil?

      bytes.unpack1("H*")
    end

    # Nanoseconds since the epoch -> UTC ISO-8601 string with microsecond
    # precision; nil passes through.
    #
    # The timestamp is handed to Time.at as an exact nanosecond count.
    # Dividing by 1e9 as a Float first would round present-day timestamps
    # to roughly 238 ns steps, which can push the truncated microsecond
    # digit one too low.
    def iso(nanos)
      return nil if nanos.nil?

      Time.at(0, nanos, :nsec).utc.iso8601(6)
    end

    # Whole milliseconds between start and end; nil when the span or either
    # timestamp is missing.
    def duration_ms(span)
      return nil unless span && span.start_timestamp && span.end_timestamp

      ((span.end_timestamp - span.start_timestamp) / 1_000_000).to_i
    end
  end
end
@@ -0,0 +1,143 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "json"
4
+ require "zlib"
5
+ require "stringio"
6
+ require "logger"
7
+ require "uri"
8
+ require "concurrent/atomic/atomic_fixnum"
9
+ require "opentelemetry/sdk"
10
+
11
+ require_relative "sampler"
12
+ require_relative "trace_blob"
13
+ require_relative "http_transport"
14
+
15
module Flare
  # Custom OTel exporter. For each batch FilteringSpanProcessor hands over:
  #
  #   1. Group spans by trace_id.
  #   2. For each trace, build a Flare::TraceBlob and gzip-JSON-encode it.
  #   3. Check out a presigned R2 PUT URL from UploadUrlPool.
  #   4. PUT the gzipped body straight to R2 -- Flare's server is NOT in
  #      the trace-bytes path.
  #   5. After R2 returns 200, POST /api/traces { key } using the
  #      customer's push token + Flare-Project / Flare-Environment headers.
  #      That's the self-notify hop the design swapped in for the CF Worker.
  #
  # 403 from R2 means the presigned URL expired between issue and use;
  # discard, check out the next URL, retry once. Pool empty -> FAILURE.
  # Notify-POST failure is logged + counted but doesn't fail the export
  # (the blob is in R2, just won't be processed; incoming/* lifecycle
  # cleans it up in 1hr).
  class TraceExporter
    SUCCESS = OpenTelemetry::SDK::Trace::Export::SUCCESS
    FAILURE = OpenTelemetry::SDK::Trace::Export::FAILURE

    PUT_HEADERS = {
      "Content-Type" => "application/json",
      "Content-Encoding" => "gzip"
    }.freeze

    attr_reader :put_failure_count, :notify_failure_count, :pool_empty_count, :exception_count

    # pool:       responds to #checkout, returning an entry with :put_url
    #             and :key, or nil when empty.
    # notify_url: endpoint POSTed to after a successful PUT.
    # api_key / project / environment: sent on the notify POST.
    # transport:  responds to #put and #post; defaults to a new
    #             HttpTransport.
    # logger:     defaults to a WARN-level logger on $stderr.
    def initialize(pool:, notify_url:, api_key:, project:, environment:,
                   transport: nil, logger: nil)
      @pool = pool
      @notify_url = notify_url.to_s
      @api_key = api_key
      @project = project
      @environment = environment
      @transport = transport || HttpTransport.new
      @logger = logger || Logger.new($stderr, level: Logger::WARN)

      @put_failure_count = Concurrent::AtomicFixnum.new(0)
      @notify_failure_count = Concurrent::AtomicFixnum.new(0)
      @pool_empty_count = Concurrent::AtomicFixnum.new(0)
      @exception_count = Concurrent::AtomicFixnum.new(0)
    end

    # Ships every trace in the batch. Returns SUCCESS only when every trace
    # shipped; any failed trace, or any exception (logged + counted, which
    # also abandons the remaining traces), yields FAILURE.
    def export(spans, timeout: nil)
      grouped = spans.group_by(&:trace_id)
      return SUCCESS if grouped.empty?

      overall = SUCCESS
      grouped.each do |trace_id, group|
        outcome = ship(TraceBlob.build(trace_id: trace_id, spans: group))
        overall = FAILURE if outcome == FAILURE
      end
      overall
    rescue StandardError => e
      @exception_count.increment
      @logger.warn("[Flare::TraceExporter] export raised: #{e.class}: #{e.message}")
      FAILURE
    end

    # Nothing is buffered here, so there is nothing to flush.
    def force_flush(timeout: nil)
      SUCCESS
    end

    def shutdown(timeout: nil)
      SUCCESS
    end

    private

    # Uploads one blob. `body` carries the already-encoded payload into the
    # single 403 retry so the blob is serialised and compressed only once.
    def ship(blob, retried: false, body: nil)
      return FAILURE if blob.nil?

      entry = @pool.checkout
      if entry.nil?
        @pool_empty_count.increment
        return FAILURE
      end

      body ||= gzip(JSON.generate(blob.to_h))
      response = @transport.put(entry[:put_url], body, PUT_HEADERS)

      case response.code
      when "200", "204"
        notify(entry[:key])
        SUCCESS
      when "403"
        # Presigned URL probably expired; try once more with the next one.
        retried ? record_put_failure(response) : ship(blob, retried: true, body: body)
      else
        record_put_failure(response)
      end
    end

    # Tells the server which key was uploaded. Anything other than a 202,
    # or any exception, is counted and logged but never raised.
    def notify(key)
      response = @transport.post(@notify_url, JSON.generate(key: key), notify_headers)
      return if response.code == "202"

      @notify_failure_count.increment
      @logger.warn("[Flare::TraceExporter] notify failed: HTTP #{response.code}")
    rescue StandardError => e
      @notify_failure_count.increment
      @logger.warn("[Flare::TraceExporter] notify exception: #{e.class}: #{e.message}")
    end

    def notify_headers
      {
        "Content-Type" => "application/json",
        "Authorization" => "Bearer #{@api_key}",
        "Flare-Project" => @project,
        "Flare-Environment" => @environment
      }
    end

    # Counts and logs a failed PUT; returns FAILURE so callers can use it
    # as the branch value.
    def record_put_failure(response)
      @put_failure_count.increment
      @logger.warn("[Flare::TraceExporter] PUT failed: HTTP #{response.code}")
      FAILURE
    end

    # Gzip-compresses a string. Zlib.gzip finishes its internal stream
    # itself, so no writer is left open if compression raises.
    def gzip(body)
      Zlib.gzip(body)
    end
  end
end
@@ -0,0 +1,74 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "metric_key"
4
+
5
module Flare
  # Records client-side tracing health into MetricStorage so flare-web can
  # warn when local buffering, URL exhaustion, or export errors reduce trace
  # fidelity.
  #
  # Counters are reported as deltas since the previous #record call; gauges
  # are reported as their current value in sum_ms with count 1.
  class TraceHealthReporter
    NAMESPACE = "sdk"
    SERVICE = "flare-ruby"
    TARGET = "tracing"

    # processor, pool and exporter are the objects whose counters are read
    # on every #record.
    def initialize(processor:, pool:, exporter:)
      @processor = processor
      @pool = pool
      @exporter = exporter
      @last = {}
      @mutex = Mutex.new
    end

    # Writes one round of health metrics into +storage+ (anything
    # responding to #add(key, count:, sum_ms:, error_count:)).
    #
    # bucket: the Time the metrics belong to; it is converted to UTC and
    #         truncated to the minute. Defaults to now.
    #
    # buffer_limit is only emitted when the buffer has been used (current
    # size or high watermark above zero). The processor's high watermark is
    # reset at the end of every call. The whole call runs under a mutex.
    def record(storage, bucket: Time.now.utc)
      @mutex.synchronize do
        record_counter(storage, bucket, "dropped_spans", @processor.dropped_count.value)
        record_counter(storage, bucket, "export_failures", @processor.failed_export_count.value)
        record_counter(storage, bucket, "processor_exceptions", @processor.exception_count.value)

        record_counter(storage, bucket, "upload_url_pool_empty", @pool.empty_count.value)
        record_counter(storage, bucket, "upload_url_expired", @pool.expired_count.value)

        record_counter(storage, bucket, "r2_put_failures", @exporter.put_failure_count.value)
        record_counter(storage, bucket, "notify_failures", @exporter.notify_failure_count.value)
        record_counter(storage, bucket, "trace_pool_empty", @exporter.pool_empty_count.value)
        record_counter(storage, bucket, "trace_export_exceptions", @exporter.exception_count.value)

        buffer_size = @processor.buffer_size
        buffer_high_watermark = @processor.buffer_high_watermark.value
        record_gauge(storage, bucket, "buffer_size", buffer_size)
        record_gauge(storage, bucket, "buffer_high_watermark", buffer_high_watermark)
        if buffer_size.positive? || buffer_high_watermark.positive?
          record_gauge(storage, bucket, "buffer_limit", @processor.max_queue)
        end
        @processor.reset_buffer_high_watermark
      end
    end

    private

    # Stores the increase of a monotonically growing counter since the last
    # call. Nothing is stored when the value did not grow.
    def record_counter(storage, bucket, operation, current)
      previous = @last.fetch(operation, 0)
      @last[operation] = current
      delta = current - previous
      return unless delta.positive?

      storage.add(key(bucket, operation), count: delta, sum_ms: 0, error_count: 0)
    end

    # Stores a point-in-time value: one sample whose sum_ms is the value.
    def record_gauge(storage, bucket, operation, value)
      storage.add(key(bucket, operation), count: 1, sum_ms: value, error_count: 0)
    end

    def key(bucket, operation)
      MetricKey.new(
        bucket: bucket_time(bucket),
        namespace: NAMESPACE,
        service: SERVICE,
        target: TARGET,
        operation: operation
      )
    end

    # Start of the UTC minute containing +time+.
    # getutc returns a converted copy, so a caller passing a local-zone Time
    # gets the right bucket and its object is left untouched. Reading
    # year/hour/... straight off a non-UTC Time would shift the bucket by
    # the zone offset.
    def bucket_time(time)
      utc = time.getutc
      Time.utc(utc.year, utc.month, utc.day, utc.hour, utc.min, 0)
    end
  end
end