flare 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +2 -17
- data/lib/flare/configuration.rb +14 -0
- data/lib/flare/engine.rb +4 -2
- data/lib/flare/filtering_span_processor.rb +279 -0
- data/lib/flare/http_transport.rb +62 -0
- data/lib/flare/marker.rb +106 -0
- data/lib/flare/metric_counter.rb +6 -0
- data/lib/flare/metric_flusher.rb +18 -5
- data/lib/flare/metric_storage.rb +5 -0
- data/lib/flare/rule_manager.rb +140 -0
- data/lib/flare/sampler.rb +130 -0
- data/lib/flare/trace_blob.rb +116 -0
- data/lib/flare/trace_exporter.rb +143 -0
- data/lib/flare/trace_health_reporter.rb +74 -0
- data/lib/flare/upload_url_pool.rb +108 -0
- data/lib/flare/version.rb +1 -1
- data/lib/flare/web_marker_subscriber.rb +76 -0
- data/lib/flare.rb +127 -7
- metadata +11 -1
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "json"
|
|
4
|
+
require "logger"
|
|
5
|
+
require "concurrent/timer_task"
|
|
6
|
+
require "concurrent/atomic/atomic_fixnum"
|
|
7
|
+
|
|
8
|
+
require_relative "http_transport"
|
|
9
|
+
|
|
10
|
+
module Flare
|
|
11
|
+
# The SDK's only poll. Every interval seconds (default 30) it does a
|
|
12
|
+
# GET /api/rules; the 200 response carries the active TraceRules (with
|
|
13
|
+
# server-computed sample_rate) plus a bag of presigned R2 PUT URLs.
|
|
14
|
+
# We hand the rules to Sampler#update_rules and the URLs to
|
|
15
|
+
# UploadUrlPool#replace, and sweep the Marker so stuck rack-span
|
|
16
|
+
# entries don't linger.
|
|
17
|
+
#
|
|
18
|
+
# ETag-guarded: subsequent polls send If-None-Match. A 304 still gets
|
|
19
|
+
# us a Marker.sweep but doesn't touch sampler or pool. 401/403 stops
|
|
20
|
+
# the poller (misconfigured token shouldn't beat down the server).
|
|
21
|
+
# 5xx and exceptions are logged + counted; the timer just tries again
|
|
22
|
+
# on the next tick.
|
|
23
|
+
#
|
|
24
|
+
# Fork-safe: after_fork clears the pool and restarts the timer in the
|
|
25
|
+
# child process so each child polls independently.
|
|
26
|
+
class RuleManager
  # Poll period in seconds used when the caller does not pass `interval:`.
  DEFAULT_INTERVAL = 30

  # poll_count and last_error_count are Concurrent::AtomicFixnum counters
  # (read them with #value). etag is the ETag remembered from the last 200
  # whose payload was applied. stopped_due_to_auth flips to true on 401/403.
  attr_reader :poll_count, :etag, :stopped_due_to_auth, :last_error_count

  # sampler     - receives #update_rules(rules) after each 200.
  # marker      - receives #sweep after each 200 and each 304.
  # pool        - receives #replace(url_entries) and #after_fork.
  # base_url    - server base; one trailing "/" is trimmed and "/api/rules"
  #               is appended.
  # api_key     - sent as "Authorization: Bearer <api_key>".
  # project     - sent as the Flare-Project header.
  # environment - sent as the Flare-Environment header.
  # interval    - seconds between timer ticks (DEFAULT_INTERVAL by default).
  # transport   - object responding to #get(url, headers); defaults to
  #               HttpTransport.new.
  # logger      - defaults to a WARN-level Logger writing to $stderr.
  def initialize(sampler:, marker:, pool:, base_url:, api_key:, project:, environment:,
                 interval: DEFAULT_INTERVAL, transport: nil, logger: nil)
    @sampler = sampler
    @marker = marker
    @pool = pool
    @rules_url = "#{base_url.to_s.chomp('/')}/api/rules"
    @api_key = api_key
    @project = project
    @environment = environment
    @interval = interval
    @transport = transport || HttpTransport.new
    @logger = logger || Logger.new($stderr, level: Logger::WARN)

    @etag = nil
    @poll_count = Concurrent::AtomicFixnum.new(0)
    @last_error_count = Concurrent::AtomicFixnum.new(0)
    @stopped_due_to_auth = false
    @pid = $$
  end

  # Starts the poll timer; `run_now: true` asks for the first poll without
  # waiting a full interval. No-op when a timer already exists or polling
  # was stopped by an auth failure. Returns self.
  def start
    return self if @timer || @stopped_due_to_auth

    @timer = Concurrent::TimerTask.execute(
      execution_interval: @interval,
      run_now: true,
      name: "flare-rule-manager-timer"
    ) { poll_safely }
    self
  end

  # Shuts the timer down, waits up to one second for it to terminate, kills
  # it if it still has not shut down, then forgets it. Returns self.
  def stop
    if @timer
      @timer.shutdown
      @timer.wait_for_termination(1)
      @timer.kill unless @timer.shutdown?
      @timer = nil
    end
    self
  end

  # True only while a timer exists and reports itself as running.
  def running?
    @timer ? @timer.running? : false
  end

  # Re-arms the poller in a forked child: records the new pid, lets the pool
  # run its own after_fork hook, then replaces the timer with a fresh one.
  def after_fork
    @pid = $$
    @pool.after_fork
    stop
    start
  end

  # Public so callers can force a poll (tests + integration tests).
  def poll_now
    poll_safely
  end

  private

  # Runs one poll; any StandardError is counted and logged instead of
  # propagating to the caller (the timer or poll_now).
  def poll_safely
    poll
  rescue StandardError => e
    @last_error_count.increment
    @logger.warn("[Flare::RuleManager] poll exception: #{e.class}: #{e.message}")
  end

  # One GET against the rules endpoint, dispatched on the response code.
  def poll
    return if @stopped_due_to_auth

    response = @transport.get(@rules_url, request_headers)
    @poll_count.increment

    case response.code
    when "304"
      @marker.sweep
    when "200"
      # Apply first, remember the ETag second. If the body is malformed or a
      # collaborator raises, the ETag must stay unchanged so the next poll
      # re-fetches the payload rather than receiving a 304 for rules that
      # were never applied.
      apply(JSON.parse(response.body))
      @etag = response.header("ETag")
      @marker.sweep
    when "401", "403"
      @stopped_due_to_auth = true
      @logger.warn("[Flare::RuleManager] auth failed (#{response.code}); stopping poll")
      stop
    else
      @last_error_count.increment
      @logger.warn("[Flare::RuleManager] unexpected #{response.code}")
    end
  end

  # Auth + scoping headers; If-None-Match is added once an ETag is known.
  def request_headers
    headers = {
      "Authorization" => "Bearer #{@api_key}",
      "Flare-Project" => @project,
      "Flare-Environment" => @environment
    }
    headers["If-None-Match"] = @etag if @etag
    headers
  end

  # Server payload shape (see tirana-v2 Api::RulesController):
  #   { "trace_rules": [{ "id", "match_attributes", "rate", ..., "urls": [...] }] }
  #
  # Hands the rules to the sampler and the concatenation of every rule's
  # "urls" list to the pool. A missing "trace_rules" key is treated as [].
  def apply(payload)
    rules = payload["trace_rules"] || []
    @sampler.update_rules(rules)

    url_entries = rules.flat_map { |r| Array(r["urls"]) }
    @pool.replace(url_entries)
  end
end
|
|
140
|
+
end
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "concurrent/atomic/atomic_fixnum"
require "concurrent/atomic/atomic_reference"
|
|
4
|
+
require "opentelemetry/sdk"
|
|
5
|
+
|
|
6
|
+
module Flare
|
|
7
|
+
# Path 1 trace sampler. At span start, iterates active rules; returns
|
|
8
|
+
# RECORD_AND_SAMPLE when one matches and the deterministic trace_id_ratio
|
|
9
|
+
# falls under the rule's rate. Otherwise RECORD_ONLY -- the span still
|
|
10
|
+
# records so MetricSpanProcessor sees it; the trace export decision for
|
|
11
|
+
# web spans is deferred to Path 2 via Flare::Marker.
|
|
12
|
+
#
|
|
13
|
+
# Used as the `root` sampler inside an OTel ParentBased sampler so root
|
|
14
|
+
# spans go through this logic but child spans inherit upstream decisions.
|
|
15
|
+
# The `local_parent_not_sampled` slot of the ParentBased should point at
|
|
16
|
+
# Flare::ALWAYS_RECORD_ONLY -- the default ALWAYS_OFF would drop children
|
|
17
|
+
# of an unsampled local parent, making them NoOp spans the processors
|
|
18
|
+
# never see.
|
|
19
|
+
#
|
|
20
|
+
# Rules are pushed in via update_rules from Flare::RuleManager; the swap
|
|
21
|
+
# is atomic, and malformed rule entries are dropped with a counter so a
|
|
22
|
+
# bad server payload can't crash the tracing path.
|
|
23
|
+
class Sampler
  Decision = OpenTelemetry::SDK::Trace::Samplers::Decision
  Result = OpenTelemetry::SDK::Trace::Samplers::Result

  # Attribute key stamped on spans sampled by a rule; its value is the rule id.
  RULE_ID_ATTRIBUTE = "flare.rule_id"

  # A validated rule: id, a non-empty String=>String match hash, and a Float
  # rate in (0.0, 1.0].
  Rule = Struct.new(:id, :match_attributes, :rate, keyword_init: true)

  # Concurrent::AtomicFixnum counting rule entries rejected by validation.
  attr_reader :dropped_rule_count

  def initialize
    @rules_ref = Concurrent::AtomicReference.new([].freeze)
    @dropped_rule_count = Concurrent::AtomicFixnum.new(0)
  end

  # new_rules: an array of rule hashes from GET /api/rules, e.g.
  #   [{ "id" => 1, "match_attributes" => {...}, "rate" => 0.5 }, ...]
  # Entries that don't validate are skipped (counted in dropped_rule_count).
  # nil is treated as an empty list. The validated list is frozen and swapped
  # in with a single AtomicReference#set.
  def update_rules(new_rules)
    incoming = new_rules || []
    validated = incoming.filter_map { |r| validate(r) }
    @dropped_rule_count.increment(incoming.length - validated.length)
    @rules_ref.set(validated.freeze)
  end

  # The currently active (frozen) list of Rule structs.
  def rules
    @rules_ref.get
  end

  # Returns RECORD_AND_SAMPLE, with RULE_ID_ATTRIBUTE merged into the
  # attributes, for the first rule whose match_attributes all equal the span
  # attributes and whose rate is above the trace's ratio. Otherwise returns
  # RECORD_ONLY. The parent's tracestate is passed through in both cases.
  def should_sample?(trace_id:, parent_context:, links:, name:, kind:, attributes:)
    tracestate = tracestate_from(parent_context)
    ratio = nil

    rules.each do |rule|
      next unless matches?(rule, attributes)

      # The ratio depends only on trace_id, so compute it at most once, and
      # only when some rule actually matches.
      ratio ||= trace_id_ratio(trace_id)
      next unless ratio < rule.rate

      merged = (attributes || {}).merge(RULE_ID_ATTRIBUTE => rule.id)
      return Result.new(decision: Decision::RECORD_AND_SAMPLE, attributes: merged, tracestate: tracestate)
    end

    Result.new(decision: Decision::RECORD_ONLY, tracestate: tracestate)
  end

  def description
    "Flare::Sampler"
  end

  # Cross-language formula: last 8 bytes of the 16-byte raw trace_id as
  # uint64-big-endian, divided by 2^64. Same in every Flare SDK so the
  # server can reproduce the decision if it ever needs to.
  #
  # Accepts a String (its bytes are used) or anything Array() can turn into a
  # list of byte values.
  def trace_id_ratio(trace_id)
    bytes = trace_id.is_a?(String) ? trace_id.bytes : Array(trace_id)
    n = bytes.last(8).inject(0) { |acc, b| (acc << 8) | b }
    n.to_f / (1 << 64)
  end

  private

  # Tracestate of the span found in parent_context, or the OTel default when
  # that span context has none.
  def tracestate_from(parent_context)
    OpenTelemetry::Trace.current_span(parent_context).context.tracestate ||
      OpenTelemetry::Trace::Tracestate::DEFAULT
  end

  # True when every key/value in the rule equals the span attribute under
  # the same key. Spans without attributes never match.
  def matches?(rule, attributes)
    return false if attributes.nil?

    rule.match_attributes.all? { |k, v| attributes[k] == v }
  end

  # Turns one raw rule hash (string or symbol keys) into a Rule, or returns
  # nil when it is malformed. Any error raised while inspecting the entry is
  # also treated as "malformed" so a bad payload entry only drops that entry.
  def validate(raw)
    return nil unless raw.is_a?(Hash)

    id = raw["id"] || raw[:id]
    match = raw["match_attributes"] || raw[:match_attributes]
    rate = raw["rate"] || raw[:rate]

    return nil if id.nil?
    return nil unless match.is_a?(Hash) && match.any?
    return nil unless match.all? { |k, v| k.is_a?(String) && v.is_a?(String) && !v.empty? }
    return nil unless rate.is_a?(Numeric) && rate > 0.0 && rate <= 1.0

    Rule.new(id: id, match_attributes: match, rate: rate.to_f)
  rescue StandardError
    nil
  end
end
|
|
109
|
+
|
|
110
|
+
# Tiny sampler whose should_sample? returns RECORD_ONLY for every span.
|
|
111
|
+
# Slot this into the ParentBased local_parent_not_sampled position so
|
|
112
|
+
# children of an unsampled local parent stay recording (the default
|
|
113
|
+
# ALWAYS_OFF turns them into NoOp spans no processor ever sees).
|
|
114
|
+
class AlwaysRecordOnly
  Decision = OpenTelemetry::SDK::Trace::Samplers::Decision
  Result = OpenTelemetry::SDK::Trace::Samplers::Result

  # Ignores every sampling input except parent_context, which is used only
  # to carry the parent's tracestate (or the OTel default) into the result.
  # The decision is always RECORD_ONLY.
  def should_sample?(parent_context: nil, **)
    parent_span_context = OpenTelemetry::Trace.current_span(parent_context).context
    inherited = parent_span_context.tracestate || OpenTelemetry::Trace::Tracestate::DEFAULT

    Result.new(decision: Decision::RECORD_ONLY, tracestate: inherited)
  end

  # Human-readable sampler name.
  def description
    "Flare::AlwaysRecordOnly"
  end
end
|
|
128
|
+
|
|
129
|
+
ALWAYS_RECORD_ONLY = AlwaysRecordOnly.new
|
|
130
|
+
end
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "time"
|
|
4
|
+
|
|
5
|
+
module Flare
|
|
6
|
+
# Value object that turns a group of OTel span_data for a single trace
|
|
7
|
+
# into the Flare-JSON wire format the server expects:
|
|
8
|
+
#
|
|
9
|
+
# {
|
|
10
|
+
# "trace_id": "<hex>",
|
|
11
|
+
# "trace_rule_id": <int|nil>,
|
|
12
|
+
# "root_name": "<string>",
|
|
13
|
+
# "started_at": "<iso8601>",
|
|
14
|
+
# "duration_ms": <int>,
|
|
15
|
+
# "spans": [
|
|
16
|
+
# { "id", "parent_id", "name", "started_at",
|
|
17
|
+
# "duration_ms", "attributes" }
|
|
18
|
+
# ]
|
|
19
|
+
# }
|
|
20
|
+
#
|
|
21
|
+
# The trace_rule_id is read from any span carrying the
|
|
22
|
+
# `flare.rule_id` attribute (Path 1 sets it on the sampled root, Path 2
|
|
23
|
+
# sets it on the rack owner span via WebMarkerSubscriber).
|
|
24
|
+
class TraceBlob
  # An all-zero 8-byte parent span id is treated the same as "no parent".
  ZERO_SPAN_ID = ("\x00".b * 8).freeze
  # root_name is cut to this many characters.
  ROOT_NAME_LIMIT = 255

  # Returns a TraceBlob, or nil when there are no spans to serialize.
  def self.build(trace_id:, spans:)
    return nil if spans.nil? || spans.empty?

    new(trace_id: trace_id, spans: spans)
  end

  # trace_id - raw trace id bytes (hex-encoded on output).
  # spans    - span_data objects belonging to that trace.
  def initialize(trace_id:, spans:)
    @trace_id = trace_id
    @spans = spans
  end

  # The wire-format Hash (string keys) for this trace. Trace-level name,
  # start and duration are taken from the span picked by find_root.
  def to_h
    root = find_root
    {
      "trace_id" => hexify(@trace_id),
      "trace_rule_id" => rule_id_from_spans,
      "root_name" => root_name(root),
      "started_at" => iso(root&.start_timestamp),
      "duration_ms" => duration_ms(root),
      "spans" => @spans.map { |s| span_to_h(s) }
    }
  end

  private

  # Picks the span that represents the trace, in order of preference:
  # an entry span carrying a rule id, a parentless span, any entry span,
  # any span carrying a rule id, and finally the first span.
  def find_root
    @spans.find { |s| entry_span?(s) && rule_id_attribute(s) } ||
      @spans.find { |s| root?(s) } ||
      @spans.find { |s| entry_span?(s) } ||
      @spans.find { |s| rule_id_attribute(s) } ||
      @spans.first
  end

  # True when the span has no parent (nil, empty or all-zero parent id).
  def root?(span)
    pid = span.parent_span_id
    pid.nil? || pid.empty? || pid == ZERO_SPAN_ID
  end

  # True for :server / :consumer spans; false when the span has no #kind.
  def entry_span?(span)
    return false unless span.respond_to?(:kind)

    span.kind == :server || span.kind == :consumer
  end

  # First truthy rule id found on any span, or nil.
  def rule_id_from_spans
    @spans.lazy.filter_map { |s| rule_id_attribute(s) }.first
  end

  # The span's rule-id attribute, looked up by string key and then by
  # symbol key; nil when the span has no attributes.
  def rule_id_attribute(span)
    attrs = span.attributes
    return nil unless attrs

    attrs[Sampler::RULE_ID_ATTRIBUTE] || attrs[Sampler::RULE_ID_ATTRIBUTE.to_sym]
  end

  def root_name(root)
    root&.name&.to_s&.slice(0, ROOT_NAME_LIMIT)
  end

  # Wire-format Hash for a single span; parent_id is nil for parentless spans
  # and attributes default to {}.
  def span_to_h(span)
    {
      "id" => hexify(span.span_id),
      "parent_id" => root?(span) ? nil : hexify(span.parent_span_id),
      "name" => span.name,
      "started_at" => iso(span.start_timestamp),
      "duration_ms" => duration_ms(span),
      "attributes" => span.attributes || {}
    }
  end

  # Lower-case hex of a raw byte string; nil passes through.
  def hexify(bytes)
    return nil if bytes.nil?

    bytes.unpack1("H*")
  end

  # Epoch nanoseconds -> UTC ISO-8601 string with microsecond precision.
  #
  # The split into whole seconds and leftover nanoseconds is done with
  # divmod so Integer input is converted exactly. Going through a Float
  # (nanos / 1e9) cannot represent every microsecond at current epoch
  # magnitudes, and because the 6-digit fraction is truncated, some
  # timestamps would come out one microsecond too low.
  def iso(nanos)
    return nil if nanos.nil?

    seconds, remainder = nanos.divmod(1_000_000_000)
    Time.at(seconds, remainder, :nsec).utc.iso8601(6)
  end

  # Whole milliseconds between the span's start and end timestamps, or nil
  # when the span or either timestamp is missing.
  def duration_ms(span)
    return nil unless span && span.start_timestamp && span.end_timestamp

    ((span.end_timestamp - span.start_timestamp) / 1_000_000).to_i
  end
end
|
|
116
|
+
end
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "json"
|
|
4
|
+
require "zlib"
|
|
5
|
+
require "stringio"
|
|
6
|
+
require "logger"
|
|
7
|
+
require "uri"
|
|
8
|
+
require "concurrent/atomic/atomic_fixnum"
|
|
9
|
+
require "opentelemetry/sdk"
|
|
10
|
+
|
|
11
|
+
require_relative "sampler"
|
|
12
|
+
require_relative "trace_blob"
|
|
13
|
+
require_relative "http_transport"
|
|
14
|
+
|
|
15
|
+
module Flare
|
|
16
|
+
# Custom OTel exporter. For each batch FilteringSpanProcessor hands over:
|
|
17
|
+
#
|
|
18
|
+
# 1. Group spans by trace_id.
|
|
19
|
+
# 2. For each trace, build a Flare::TraceBlob and gzip-JSON-encode it.
|
|
20
|
+
# 3. Check out a presigned R2 PUT URL from UploadUrlPool.
|
|
21
|
+
# 4. PUT the gzipped body straight to R2 -- Flare's server is NOT in
|
|
22
|
+
# the trace-bytes path.
|
|
23
|
+
# 5. After R2 returns 200, POST /api/traces { key } using the
|
|
24
|
+
# customer's push token + Flare-Project / Flare-Environment headers.
|
|
25
|
+
# That's the self-notify hop the design swapped in for the CF Worker.
|
|
26
|
+
#
|
|
27
|
+
# 403 from R2 means the presigned URL expired between issue and use;
|
|
28
|
+
# discard, check out the next URL, retry once. Pool empty -> FAILURE.
|
|
29
|
+
# Notify-POST failure is logged + counted but doesn't fail the export
|
|
30
|
+
# (the blob is in R2, just won't be processed; incoming/* lifecycle
|
|
31
|
+
# cleans it up in 1hr).
|
|
32
|
+
class TraceExporter
  SUCCESS = OpenTelemetry::SDK::Trace::Export::SUCCESS
  FAILURE = OpenTelemetry::SDK::Trace::Export::FAILURE

  # Headers sent with every blob PUT; the body is gzip-compressed JSON.
  PUT_HEADERS = {
    "Content-Type" => "application/json",
    "Content-Encoding" => "gzip"
  }.freeze

  # Concurrent::AtomicFixnum health counters (read with #value).
  attr_reader :put_failure_count, :notify_failure_count, :pool_empty_count, :exception_count

  # pool        - responds to #checkout, returning nil or an entry read via
  #               entry[:put_url] and entry[:key].
  # notify_url  - URL that receives the POST { key } after a successful PUT.
  # api_key     - sent as "Authorization: Bearer <api_key>" on the notify POST.
  # project     - sent as the Flare-Project header.
  # environment - sent as the Flare-Environment header.
  # transport   - responds to #put(url, body, headers) and
  #               #post(url, body, headers); defaults to HttpTransport.new.
  # logger      - defaults to a WARN-level Logger writing to $stderr.
  def initialize(pool:, notify_url:, api_key:, project:, environment:,
                 transport: nil, logger: nil)
    @pool = pool
    @notify_url = notify_url.to_s
    @api_key = api_key
    @project = project
    @environment = environment
    @transport = transport || HttpTransport.new
    @logger = logger || Logger.new($stderr, level: Logger::WARN)

    @put_failure_count = Concurrent::AtomicFixnum.new(0)
    @notify_failure_count = Concurrent::AtomicFixnum.new(0)
    @pool_empty_count = Concurrent::AtomicFixnum.new(0)
    @exception_count = Concurrent::AtomicFixnum.new(0)
  end

  # Ships every trace in the batch. Returns SUCCESS only when every trace
  # shipped; any raised StandardError is counted, logged and reported as
  # FAILURE. `timeout` is accepted for the exporter interface but not used.
  def export(spans, timeout: nil)
    grouped = spans.group_by(&:trace_id)
    return SUCCESS if grouped.empty?

    overall = SUCCESS
    grouped.each do |trace_id, group|
      result = ship(TraceBlob.build(trace_id: trace_id, spans: group))
      overall = FAILURE if result == FAILURE
    end
    overall
  rescue StandardError => e
    @exception_count.increment
    @logger.warn("[Flare::TraceExporter] export raised: #{e.class}: #{e.message}")
    FAILURE
  end

  # Nothing is buffered here, so flushing is always a success.
  def force_flush(timeout: nil)
    SUCCESS
  end

  # No resources to release.
  def shutdown(timeout: nil)
    SUCCESS
  end

  private

  # Uploads one trace blob and notifies the server.
  #
  # blob    - TraceBlob (nil yields FAILURE).
  # retried - true on the second attempt after a 403.
  # body    - already-encoded payload carried over from the first attempt, so
  #           a retry does not serialize and compress the same blob again.
  def ship(blob, retried: false, body: nil)
    return FAILURE if blob.nil?

    entry = @pool.checkout
    if entry.nil?
      @pool_empty_count.increment
      return FAILURE
    end

    body ||= gzip(JSON.generate(blob.to_h))
    response = @transport.put(entry[:put_url], body, PUT_HEADERS)

    case response.code
    when "200", "204"
      notify(entry[:key])
      SUCCESS
    when "403"
      # Presigned URL probably expired; try once more with the next one.
      retried ? record_put_failure(response) : ship(blob, retried: true, body: body)
    else
      record_put_failure(response)
    end
  end

  # Tells the server which key was uploaded. Anything but a 202, including
  # a raised StandardError, is counted and logged but never propagated.
  def notify(key)
    response = @transport.post(@notify_url, JSON.generate(key: key), notify_headers)
    return if response.code == "202"

    @notify_failure_count.increment
    @logger.warn("[Flare::TraceExporter] notify failed: HTTP #{response.code}")
  rescue StandardError => e
    @notify_failure_count.increment
    @logger.warn("[Flare::TraceExporter] notify exception: #{e.class}: #{e.message}")
  end

  def notify_headers
    {
      "Content-Type" => "application/json",
      "Authorization" => "Bearer #{@api_key}",
      "Flare-Project" => @project,
      "Flare-Environment" => @environment
    }
  end

  # Counts and logs a failed PUT; returns FAILURE so callers can use it as
  # the branch value.
  def record_put_failure(response)
    @put_failure_count.increment
    @logger.warn("[Flare::TraceExporter] PUT failed: HTTP #{response.code}")
    FAILURE
  end

  # Gzip-compresses a string using the standard library one-shot helper.
  def gzip(body)
    Zlib.gzip(body)
  end
end
|
|
143
|
+
end
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "metric_key"
|
|
4
|
+
|
|
5
|
+
module Flare
|
|
6
|
+
# Records client-side tracing health into MetricStorage so flare-web can
|
|
7
|
+
# warn when local buffering, URL exhaustion, or export errors reduce trace
|
|
8
|
+
# fidelity.
|
|
9
|
+
class TraceHealthReporter
  # Fixed MetricKey dimensions shared by every health metric.
  NAMESPACE = "sdk"
  SERVICE = "flare-ruby"
  TARGET = "tracing"

  # processor - supplies dropped/failed-export/exception counters plus the
  #             buffer gauges.
  # pool      - supplies the empty/expired counters.
  # exporter  - supplies PUT/notify/pool-empty/exception counters.
  # Counters are read through #value.
  def initialize(processor:, pool:, exporter:)
    @processor = processor
    @pool = pool
    @exporter = exporter
    @last = {}            # operation name => counter value seen last time
    @mutex = Mutex.new
  end

  # Writes one snapshot into `storage` under the minute bucket of `bucket`.
  # Counters are reported as the increase since the previous call; gauges
  # are reported as-is. The processor's high watermark is reset afterwards.
  def record(storage, bucket: Time.now.utc)
    @mutex.synchronize do
      record_counter(storage, bucket, "dropped_spans", @processor.dropped_count.value)
      record_counter(storage, bucket, "export_failures", @processor.failed_export_count.value)
      record_counter(storage, bucket, "processor_exceptions", @processor.exception_count.value)

      record_counter(storage, bucket, "upload_url_pool_empty", @pool.empty_count.value)
      record_counter(storage, bucket, "upload_url_expired", @pool.expired_count.value)

      record_counter(storage, bucket, "r2_put_failures", @exporter.put_failure_count.value)
      record_counter(storage, bucket, "notify_failures", @exporter.notify_failure_count.value)
      record_counter(storage, bucket, "trace_pool_empty", @exporter.pool_empty_count.value)
      record_counter(storage, bucket, "trace_export_exceptions", @exporter.exception_count.value)

      buffer_size = @processor.buffer_size
      buffer_high_watermark = @processor.buffer_high_watermark.value
      record_gauge(storage, bucket, "buffer_size", buffer_size)
      record_gauge(storage, bucket, "buffer_high_watermark", buffer_high_watermark)
      # The limit is only reported when the buffer saw any use.
      if buffer_size.positive? || buffer_high_watermark.positive?
        record_gauge(storage, bucket, "buffer_limit", @processor.max_queue)
      end
      @processor.reset_buffer_high_watermark
    end
  end

  private

  # Stores the growth of a monotonically increasing counter since the last
  # call. Nothing is stored when the counter did not grow.
  def record_counter(storage, bucket, operation, current)
    previous = @last.fetch(operation, 0)
    @last[operation] = current
    delta = current - previous
    return unless delta.positive?

    storage.add(key(bucket, operation), count: delta, sum_ms: 0, error_count: 0)
  end

  # Stores a point-in-time value: one sample whose sum_ms carries the value.
  def record_gauge(storage, bucket, operation, value)
    storage.add(key(bucket, operation), count: 1, sum_ms: value, error_count: 0)
  end

  def key(bucket, operation)
    MetricKey.new(
      bucket: bucket_time(bucket),
      namespace: NAMESPACE,
      service: SERVICE,
      target: TARGET,
      operation: operation
    )
  end

  # Truncates a time to the start of its minute, in UTC.
  #
  # The input is converted to UTC first (without mutating it), so a caller
  # passing a zoned time gets the bucket of that instant rather than a
  # bucket built from its local wall-clock fields.
  def bucket_time(time)
    time = time.getutc if time.respond_to?(:getutc)
    Time.utc(time.year, time.month, time.day, time.hour, time.min, 0)
  end
end
|
|
74
|
+
end
|