catpm 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/MIT-LICENSE +20 -0
- data/README.md +222 -0
- data/Rakefile +6 -0
- data/app/assets/stylesheets/catpm/application.css +15 -0
- data/app/controllers/catpm/application_controller.rb +6 -0
- data/app/controllers/catpm/endpoints_controller.rb +63 -0
- data/app/controllers/catpm/errors_controller.rb +63 -0
- data/app/controllers/catpm/events_controller.rb +89 -0
- data/app/controllers/catpm/samples_controller.rb +13 -0
- data/app/controllers/catpm/status_controller.rb +79 -0
- data/app/controllers/catpm/system_controller.rb +17 -0
- data/app/helpers/catpm/application_helper.rb +264 -0
- data/app/jobs/catpm/application_job.rb +6 -0
- data/app/mailers/catpm/application_mailer.rb +8 -0
- data/app/models/catpm/application_record.rb +7 -0
- data/app/models/catpm/bucket.rb +45 -0
- data/app/models/catpm/error_record.rb +37 -0
- data/app/models/catpm/event_bucket.rb +12 -0
- data/app/models/catpm/event_sample.rb +22 -0
- data/app/models/catpm/sample.rb +26 -0
- data/app/views/catpm/endpoints/_sample_table.html.erb +36 -0
- data/app/views/catpm/endpoints/show.html.erb +124 -0
- data/app/views/catpm/errors/index.html.erb +66 -0
- data/app/views/catpm/errors/show.html.erb +107 -0
- data/app/views/catpm/events/index.html.erb +73 -0
- data/app/views/catpm/events/show.html.erb +86 -0
- data/app/views/catpm/samples/show.html.erb +113 -0
- data/app/views/catpm/shared/_page_nav.html.erb +6 -0
- data/app/views/catpm/shared/_segments_waterfall.html.erb +147 -0
- data/app/views/catpm/status/index.html.erb +124 -0
- data/app/views/catpm/system/index.html.erb +454 -0
- data/app/views/layouts/catpm/application.html.erb +381 -0
- data/config/routes.rb +19 -0
- data/db/migrate/20250601000001_create_catpm_tables.rb +104 -0
- data/lib/catpm/adapter/base.rb +85 -0
- data/lib/catpm/adapter/postgresql.rb +186 -0
- data/lib/catpm/adapter/sqlite.rb +159 -0
- data/lib/catpm/adapter.rb +28 -0
- data/lib/catpm/auto_instrument.rb +145 -0
- data/lib/catpm/buffer.rb +59 -0
- data/lib/catpm/circuit_breaker.rb +60 -0
- data/lib/catpm/collector.rb +320 -0
- data/lib/catpm/configuration.rb +103 -0
- data/lib/catpm/custom_event.rb +37 -0
- data/lib/catpm/engine.rb +39 -0
- data/lib/catpm/errors.rb +6 -0
- data/lib/catpm/event.rb +75 -0
- data/lib/catpm/fingerprint.rb +52 -0
- data/lib/catpm/flusher.rb +462 -0
- data/lib/catpm/lifecycle.rb +76 -0
- data/lib/catpm/middleware.rb +75 -0
- data/lib/catpm/middleware_probe.rb +28 -0
- data/lib/catpm/patches/httpclient.rb +44 -0
- data/lib/catpm/patches/net_http.rb +39 -0
- data/lib/catpm/request_segments.rb +101 -0
- data/lib/catpm/segment_subscribers.rb +242 -0
- data/lib/catpm/span_helpers.rb +51 -0
- data/lib/catpm/stack_sampler.rb +226 -0
- data/lib/catpm/subscribers.rb +47 -0
- data/lib/catpm/tdigest.rb +174 -0
- data/lib/catpm/trace.rb +165 -0
- data/lib/catpm/version.rb +5 -0
- data/lib/catpm.rb +66 -0
- data/lib/generators/catpm/install_generator.rb +36 -0
- data/lib/generators/catpm/templates/initializer.rb.tt +77 -0
- data/lib/tasks/catpm_seed.rake +79 -0
- data/lib/tasks/catpm_tasks.rake +6 -0
- metadata +123 -0
|
@@ -0,0 +1,462 @@
|
|
|
1
|
+
# frozen_string_literal: true

require 'concurrent'

module Catpm
  # Background flusher. On a jittered timer it drains the in-memory event
  # buffer, aggregates performance events into time buckets (with TDigest
  # percentile sketches), collects error groups and samples, and persists
  # everything through the configured database adapter. It also triggers
  # periodic bucket downsampling and retention cleanup. A CircuitBreaker
  # suppresses flushing after repeated persistence failures.
  #
  # NOTE(review): when a flush cycle raises, the events drained from the
  # buffer at the top of the cycle are NOT re-queued — they are dropped.
  class Flusher
    # True while the periodic timer is active.
    attr_reader :running

    # buffer:   the Catpm::Buffer drained on each cycle (required).
    # interval: seconds between flushes; defaults to config.flush_interval.
    # jitter:   +/- seconds of randomness added to the interval; defaults
    #           to config.flush_jitter.
    def initialize(buffer:, interval: nil, jitter: nil)
      @buffer = buffer
      @interval = interval || Catpm.config.flush_interval
      @jitter = jitter || Catpm.config.flush_jitter
      @circuit = CircuitBreaker.new
      @last_cleanup_at = Time.now
      @running = false
      @timer = nil
    end

    # Start the periodic flush timer. Idempotent — calling while already
    # running is a no-op. run_now: false means the first flush happens one
    # interval after start, not immediately.
    def start
      return if @running

      @running = true
      @timer = Concurrent::TimerTask.new(
        execution_interval: effective_interval,
        run_now: false
      ) { flush_cycle }
      @timer.execute
    end

    # Stop the timer, waiting up to +timeout+ seconds for an in-flight
    # cycle to finish, then run one last synchronous flush so events still
    # buffered at shutdown are persisted.
    def stop(timeout: Catpm.config.shutdown_timeout)
      return unless @running

      @running = false
      @timer&.shutdown
      @timer&.wait_for_termination(timeout)
      flush_cycle # Final flush
    end

    # Public for testing and emergency flush
    #
    # One full flush pass: drain the buffer, split events into performance
    # events (Catpm::Event) vs custom events, aggregate each kind, and
    # persist inside a checked-out AR connection. Any StandardError trips
    # the circuit breaker and is handed to the configured error handler
    # (it is not re-raised, so the timer keeps running).
    def flush_cycle
      return if @circuit.open?

      events = @buffer.drain
      return if events.empty?

      perf_events, custom_events = events.partition { |e| e.is_a?(Catpm::Event) }

      if perf_events.any?
        buckets, samples, errors = aggregate(perf_events)

        ActiveRecord::Base.connection_pool.with_connection do
          adapter = Catpm::Adapter.current
          # Buckets must exist before samples can reference them.
          adapter.persist_buckets(buckets)

          bucket_map = build_bucket_map(buckets)
          # rotate_samples may destroy old DB rows and drop weak samples.
          samples = rotate_samples(samples)
          adapter.persist_samples(samples, bucket_map)
          adapter.persist_errors(errors)
        end
      end

      if custom_events.any?
        event_buckets, event_samples = aggregate_custom_events(custom_events)

        ActiveRecord::Base.connection_pool.with_connection do
          adapter = Catpm::Adapter.current
          adapter.persist_event_buckets(event_buckets)
          adapter.persist_event_samples(event_samples)
        end
      end

      @circuit.record_success
      Catpm.stats[:flushes] += 1

      maybe_cleanup
    rescue => e
      @circuit.record_failure
      Catpm.config.error_handler.call(e)
    end

    # Reset circuit-breaker state and the cleanup clock (used by tests).
    def reset!
      @circuit.reset!
      @last_cleanup_at = Time.now
    end

    private

    # Flush interval with +/- jitter so many processes don't all hit the
    # database at the same instant.
    def effective_interval
      @interval + rand(-@jitter..@jitter)
    end

    # Aggregate performance events into:
    #   buckets — per (kind, target, operation, bucket_start) counters,
    #             min/max/sum durations, summed metadata, and a serialized
    #             TDigest under :p95_digest,
    #   samples — individual events selected by determine_sample_type,
    #   errors  — error groups keyed by fingerprint.
    # Returns [buckets, samples, errors].
    def aggregate(events)
      bucket_groups = {}
      samples = []
      error_groups = {}

      # Pre-load existing random sample counts per endpoint for filling phase
      # (stored in an instance variable because determine_sample_type reads
      # and updates it while iterating the same batch).
      @random_sample_counts = {}
      Catpm::Sample.where(sample_type: 'random')
                   .joins(:bucket)
                   .group('catpm_buckets.kind', 'catpm_buckets.target', 'catpm_buckets.operation')
                   .count
                   .each { |(kind, target, op), cnt| @random_sample_counts[[ kind, target, op ]] = cnt }

      events.each do |event|
        # Bucket aggregation
        key = [ event.kind, event.target, event.operation, event.bucket_start ]
        bucket = bucket_groups[key] ||= new_bucket_hash(event)

        bucket[:count] += 1
        if event.success?
          bucket[:success_count] += 1
        else
          bucket[:failure_count] += 1
        end
        bucket[:duration_sum] += event.duration
        bucket[:duration_max] = [ bucket[:duration_max], event.duration ].max
        bucket[:duration_min] = [ bucket[:duration_min], event.duration ].min

        # Merge metadata (numeric sums, keys normalized to strings)
        event.metadata.each do |k, v|
          str_key = k.to_s
          bucket[:metadata_sum][str_key] = (bucket[:metadata_sum][str_key] || 0).to_f + v.to_f
        end

        # TDigest — feed every duration into the percentile sketch
        bucket[:tdigest].add(event.duration)

        # Collect samples (error / slow / random, or nil for "don't keep")
        sample_type = determine_sample_type(event)
        if sample_type
          samples << {
            bucket_key: key,
            kind: event.kind,
            sample_type: sample_type,
            recorded_at: event.started_at,
            duration: event.duration,
            context: event.context
          }
        end

        # Error grouping — one group per fingerprint, capped contexts
        if event.error?
          fp = Catpm::Fingerprint.generate(
            kind: event.kind,
            error_class: event.error_class,
            backtrace: event.backtrace
          )

          error = error_groups[fp] ||= {
            fingerprint: fp,
            kind: event.kind,
            error_class: event.error_class,
            message: event.error_message,
            occurrences_count: 0,
            first_occurred_at: event.started_at,
            last_occurred_at: event.started_at,
            new_contexts: []
          }

          error[:occurrences_count] += 1
          error[:last_occurred_at] = [ error[:last_occurred_at], event.started_at ].max

          if error[:new_contexts].size < Catpm.config.max_error_contexts
            error[:new_contexts] << build_error_context(event)
          end
        end
      end

      # Serialize TDigest blobs (the in-memory sketch is replaced by a
      # :p95_digest byte string; nil when the digest is empty)
      buckets = bucket_groups.values.map do |b|
        b[:p95_digest] = b[:tdigest].empty? ? nil : b[:tdigest].serialize
        b.delete(:tdigest)
        b
      end

      [ buckets, samples, error_groups.values ]
    end

    # Fresh accumulator hash for a bucket group; duration_min starts at
    # Infinity so the first event's duration always wins the min.
    def new_bucket_hash(event)
      {
        kind: event.kind,
        target: event.target,
        operation: event.operation,
        bucket_start: event.bucket_start,
        count: 0,
        success_count: 0,
        failure_count: 0,
        duration_sum: 0.0,
        duration_max: 0.0,
        duration_min: Float::INFINITY,
        metadata_sum: {},
        tdigest: TDigest.new
      }
    end

    # Decide whether (and how) to keep this event as a sample:
    #   'error'  — always kept for failed events,
    #   'slow'   — duration at/over the per-kind slow threshold,
    #   'random' — kept while the endpoint is below its random-sample quota
    #              ("filling phase"), otherwise kept with probability
    #              1 / random_sample_rate,
    #   nil      — not sampled.
    def determine_sample_type(event)
      return 'error' if event.error?

      threshold = Catpm.config.slow_threshold_for(event.kind.to_sym)
      return 'slow' if event.duration >= threshold

      # Always sample if endpoint has few random samples (filling phase)
      endpoint_key = [ event.kind, event.target, event.operation ]
      existing_random = @random_sample_counts[endpoint_key] || 0
      if existing_random < Catpm.config.max_random_samples_per_endpoint
        @random_sample_counts[endpoint_key] = existing_random + 1
        return 'random'
      end

      return 'random' if rand(Catpm.config.random_sample_rate) == 0

      nil
    end

    # Enforce per-endpoint sample caps before persisting new samples:
    # random samples evict the oldest; slow samples evict the fastest
    # existing sample only if the new one is slower, otherwise the new
    # sample is marked :_skip and dropped. Returns the surviving samples
    # (with the :_skip marker removed).
    #
    # NOTE(review): this issues count/order queries per sample (N+1 under
    # large flushes) — presumably acceptable at the configured caps;
    # verify under load.
    def rotate_samples(samples)
      samples.each do |sample|
        kind, target, operation = sample[:bucket_key][0], sample[:bucket_key][1], sample[:bucket_key][2]
        endpoint_samples = Catpm::Sample
                           .joins(:bucket)
                           .where(catpm_buckets: { kind: kind, target: target, operation: operation })

        case sample[:sample_type]
        when 'random'
          existing = endpoint_samples.where(sample_type: 'random')
          if existing.count >= Catpm.config.max_random_samples_per_endpoint
            existing.order(recorded_at: :asc).first.destroy
          end
        when 'slow'
          existing = endpoint_samples.where(sample_type: 'slow')
          if existing.count >= Catpm.config.max_slow_samples_per_endpoint
            weakest = existing.order(duration: :asc).first
            if sample[:duration] > weakest.duration
              weakest.destroy
            else
              sample[:_skip] = true
            end
          end
        end
      end

      samples.reject { |s| s.delete(:_skip) }
    end

    # Build the per-occurrence context hash stored with an error group:
    # timestamp, request/job details sliced from the event context, a
    # truncated backtrace, duration/status, and (when present) segment
    # data captured during the request.
    def build_error_context(event)
      ctx = {
        occurred_at: event.started_at.iso8601,
        kind: event.kind,
        operation: event.context.slice(:method, :path, :params, :job_class, :job_id, :queue, :target, :metadata),
        backtrace: (event.backtrace || []).first(Catpm.config.backtrace_lines),
        duration: event.duration,
        status: event.status
      }

      ctx[:target] = event.target if event.target.present?

      if event.context[:segments]
        ctx[:segments] = event.context[:segments]
        ctx[:segments_capped] = event.context[:segments_capped]
      end

      if event.context[:segment_summary]
        ctx[:segment_summary] = event.context[:segment_summary]
      end

      ctx
    end

    # Map aggregation keys to the persisted Bucket rows so samples can be
    # attached to the right bucket ids. Must run AFTER persist_buckets.
    def build_bucket_map(aggregated_buckets)
      map = {}
      aggregated_buckets.each do |b|
        key = [ b[:kind], b[:target], b[:operation], b[:bucket_start] ]
        map[key] = Catpm::Bucket.find_by(
          kind: b[:kind], target: b[:target],
          operation: b[:operation], bucket_start: b[:bucket_start]
        )
      end
      map
    end

    # Aggregate custom (named) events into per-(name, bucket_start) counts
    # and collect payload samples: up to events_max_samples_per_name
    # payloads per name per cycle, then a 1/random_sample_rate chance for
    # extras. Events with empty payloads are counted but never sampled.
    def aggregate_custom_events(events)
      bucket_groups = {}
      samples = []
      sample_counts = Hash.new(0)

      events.each do |event|
        key = [event.name, event.bucket_start]
        bucket_groups[key] ||= { name: event.name, bucket_start: event.bucket_start, count: 0 }
        bucket_groups[key][:count] += 1

        max = Catpm.config.events_max_samples_per_name
        if event.payload.any?
          if sample_counts[event.name] < max
            samples << { name: event.name, payload: event.payload, recorded_at: event.recorded_at }
            sample_counts[event.name] += 1
          elsif rand(Catpm.config.random_sample_rate) == 0
            samples << { name: event.name, payload: event.payload, recorded_at: event.recorded_at }
          end
        end
      end

      [bucket_groups.values, samples]
    end

    # Run downsampling/retention at most once per cleanup_interval.
    def maybe_cleanup
      return if Time.now - @last_cleanup_at < Catpm.config.cleanup_interval

      @last_cleanup_at = Time.now
      downsample_buckets
      cleanup_expired_data if Catpm.config.retention_period
    end

    # Roll older fine-grained buckets up into coarser tiers, for both
    # performance buckets and custom-event buckets.
    def downsample_buckets
      bucket_sizes = Catpm.config.bucket_sizes
      adapter = Catpm::Adapter.current

      # Phase 1: Merge 1-minute buckets older than 1 hour into 5-minute buckets
      downsample_tier(
        target_interval: bucket_sizes[:medium],
        age_threshold: 1.hour,
        adapter: adapter
      )

      # Phase 2: Merge 5-minute buckets older than 24 hours into 1-hour buckets
      downsample_tier(
        target_interval: bucket_sizes[:hourly],
        age_threshold: 24.hours,
        adapter: adapter
      )

      # Phase 3: Merge 1-hour buckets older than 1 week into 1-day buckets
      downsample_tier(
        target_interval: bucket_sizes[:daily],
        age_threshold: 1.week,
        adapter: adapter
      )

      # Phase 4: Merge 1-day buckets older than 3 months into 1-week buckets
      downsample_tier(
        target_interval: bucket_sizes[:weekly],
        age_threshold: 90.days,
        adapter: adapter
      )

      # Event buckets: same downsampling tiers
      downsample_event_tier(target_interval: bucket_sizes[:medium], age_threshold: 1.hour, adapter: adapter)
      downsample_event_tier(target_interval: bucket_sizes[:hourly], age_threshold: 24.hours, adapter: adapter)
      downsample_event_tier(target_interval: bucket_sizes[:daily], age_threshold: 1.week, adapter: adapter)
      downsample_event_tier(target_interval: bucket_sizes[:weekly], age_threshold: 90.days, adapter: adapter)
    end

    # Merge all performance buckets older than +age_threshold+ into
    # buckets aligned to +target_interval+. Counters are summed, min/max
    # recomputed, metadata merged via the adapter, and TDigests combined.
    # Samples attached to the merged-away source buckets are deleted.
    def downsample_tier(target_interval:, age_threshold:, adapter:)
      cutoff = age_threshold.ago
      target_seconds = target_interval.to_i

      # Find all buckets older than cutoff
      source_buckets = Catpm::Bucket.where(bucket_start: ...cutoff).to_a
      return if source_buckets.empty?

      # Group by (kind, target, operation) + target-aligned bucket_start
      groups = source_buckets.group_by do |bucket|
        epoch = bucket.bucket_start.to_i
        aligned_epoch = epoch - (epoch % target_seconds)
        aligned_start = Time.at(aligned_epoch).utc

        [bucket.kind, bucket.target, bucket.operation, aligned_start]
      end

      groups.each do |(kind, target, operation, aligned_start), buckets|
        # Skip if only one bucket already at the target alignment
        next if buckets.size == 1 && buckets.first.bucket_start.to_i % target_seconds == 0

        merged = {
          kind: kind,
          target: target,
          operation: operation,
          bucket_start: aligned_start,
          count: buckets.sum(&:count),
          success_count: buckets.sum(&:success_count),
          failure_count: buckets.sum(&:failure_count),
          duration_sum: buckets.sum(&:duration_sum),
          duration_max: buckets.map(&:duration_max).max,
          duration_min: buckets.map(&:duration_min).min,
          metadata_sum: merge_bucket_metadata(buckets, adapter),
          p95_digest: merge_bucket_digests(buckets)
        }

        source_ids = buckets.map(&:id)

        # Delete source buckets first (to avoid unique constraint conflict
        # if one source bucket has the same bucket_start as the target)
        Catpm::Sample.where(bucket_id: source_ids).delete_all
        Catpm::Bucket.where(id: source_ids).delete_all

        # Create the merged bucket
        adapter.persist_buckets([merged])
      end
    end

    # Same tiered roll-up for custom-event buckets; only the count is
    # carried across (event buckets have no duration statistics).
    def downsample_event_tier(target_interval:, age_threshold:, adapter:)
      cutoff = age_threshold.ago
      target_seconds = target_interval.to_i

      source_buckets = Catpm::EventBucket.where(bucket_start: ...cutoff).to_a
      return if source_buckets.empty?

      groups = source_buckets.group_by do |bucket|
        epoch = bucket.bucket_start.to_i
        aligned_epoch = epoch - (epoch % target_seconds)
        aligned_start = Time.at(aligned_epoch).utc
        [bucket.name, aligned_start]
      end

      groups.each do |(name, aligned_start), buckets|
        next if buckets.size == 1 && buckets.first.bucket_start.to_i % target_seconds == 0

        merged = { name: name, bucket_start: aligned_start, count: buckets.sum(&:count) }
        Catpm::EventBucket.where(id: buckets.map(&:id)).delete_all
        adapter.persist_event_buckets([merged])
      end
    end

    # Fold every bucket's metadata_sum together; the adapter knows how to
    # merge its own storage format (JSON/hstore/etc.).
    def merge_bucket_metadata(buckets, adapter)
      buckets.reduce({}) do |acc, b|
        adapter.merge_metadata_sum(acc, b.metadata_sum)
      end
    end

    # Combine the serialized TDigest sketches from several buckets into
    # one; nil when no source bucket had a digest.
    def merge_bucket_digests(buckets)
      combined = TDigest.new
      buckets.each do |b|
        next unless b.p95_digest
        digest = TDigest.deserialize(b.p95_digest)
        combined.merge(digest)
      end
      combined.empty? ? nil : combined.serialize
    end

    # Delete rows older than the retention period, in bounded batches so a
    # single cleanup never holds the database for too long.
    #
    # NOTE(review): ErrorRecord cleanup runs a single batch (no loop), so
    # at most 1_000 expired error records are removed per cleanup cycle —
    # confirm this backlog-draining behavior is intentional.
    def cleanup_expired_data
      cutoff = Catpm.config.retention_period.ago
      batch_size = 1_000

      [ Catpm::Bucket, Catpm::Sample ].each do |model|
        time_column = model == Catpm::Sample ? :recorded_at : :bucket_start
        loop do
          deleted = model.where(time_column => ...cutoff).limit(batch_size).delete_all
          break if deleted < batch_size
        end
      end

      Catpm::ErrorRecord.where(last_occurred_at: ...cutoff).limit(batch_size).delete_all

      [Catpm::EventBucket, Catpm::EventSample].each do |model|
        time_column = model == Catpm::EventSample ? :recorded_at : :bucket_start
        loop do
          deleted = model.where(time_column => ...cutoff).limit(batch_size).delete_all
          break if deleted < batch_size
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
# frozen_string_literal: true

module Catpm
  # Process lifecycle wiring: builds the shared buffer and flusher,
  # applies optional HTTP-client patches, starts the flusher, and
  # registers fork-aware restart hooks plus an at_exit shutdown hook.
  module Lifecycle
    # Boot entry point. Does nothing when Catpm is disabled.
    def self.register_hooks
      return unless Catpm.enabled?

      initialize_buffer
      initialize_flusher
      apply_patches

      # Always start the flusher in the current process. For forking
      # servers we additionally register post-fork hooks so each worker
      # restarts its own flusher.
      Catpm.flusher&.start

      if defined?(::PhusionPassenger)
        register_passenger_hook
      elsif defined?(::Pitchfork)
        register_pitchfork_hook
      end

      register_shutdown_hooks
    end

    # Flush any buffered events when the process exits.
    def self.register_shutdown_hooks
      at_exit { Catpm.flusher&.stop }
    end

    # Prepend the outbound-HTTP patches when instrumentation is enabled
    # and the corresponding client library is loaded.
    def self.apply_patches
      return unless Catpm.config.instrument_net_http

      if defined?(::Net::HTTP)
        require 'catpm/patches/net_http'
        ::Net::HTTP.prepend(Catpm::Patches::NetHttp)
      end

      if defined?(::HTTPClient)
        require 'catpm/patches/httpclient'
        ::HTTPClient.prepend(Catpm::Patches::Httpclient)
      end
    end
    private_class_method :apply_patches

    # Lazily create the shared in-memory event buffer.
    def self.initialize_buffer
      Catpm.buffer ||= Buffer.new(max_bytes: Catpm.config.max_buffer_memory)
    end
    private_class_method :initialize_buffer

    # Lazily create the flusher bound to the shared buffer.
    def self.initialize_flusher
      return unless Catpm.buffer

      Catpm.flusher ||= Flusher.new(
        buffer: Catpm.buffer,
        interval: Catpm.config.flush_interval,
        jitter: Catpm.config.flush_jitter
      )
    end
    private_class_method :initialize_flusher

    # Passenger: restart the flusher timer inside each forked worker.
    def self.register_passenger_hook
      flusher = Catpm.flusher
      ::PhusionPassenger.on_event(:starting_worker_process) do |forked|
        flusher&.start if forked
      end
    end
    private_class_method :register_passenger_hook

    # Pitchfork: restart the flusher timer after each worker fork.
    def self.register_pitchfork_hook
      flusher = Catpm.flusher
      ::Pitchfork.configure do |server|
        server.after_worker_fork { flusher&.start }
      end
    end
    private_class_method :register_pitchfork_hook
  end
end
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
# frozen_string_literal: true

module Catpm
  # Rack middleware that stamps each request with a monotonic start time
  # and, when segment instrumentation is enabled, installs a per-request
  # RequestSegments collector (shared with subscribers via a
  # thread-local). Exceptions escaping the app are recorded as error
  # events and re-raised.
  class Middleware
    def initialize(app)
      @app = app
    end

    # Rack entry point.
    def call(env)
      # Fast path when disabled. Note: the ensure block below still runs
      # on this early return, but req_segments is nil there, so it is
      # effectively a no-op apart from clearing the thread-local.
      return @app.call(env) unless Catpm.enabled?

      env['catpm.request_start'] = Process.clock_gettime(Process::CLOCK_MONOTONIC)

      if Catpm.config.instrument_segments
        req_segments = RequestSegments.new(
          max_segments: Catpm.config.max_segments_per_request,
          request_start: env['catpm.request_start'],
          stack_sample: Catpm.config.instrument_stack_sampler
        )
        env['catpm.segments'] = req_segments
        Thread.current[:catpm_request_segments] = req_segments
      end

      @app.call(env)
    rescue Exception => e
      # Deliberate `rescue Exception`: the error is recorded and then
      # ALWAYS re-raised, so nothing (including SystemExit/signals) is
      # swallowed by this middleware.
      record_exception(env, e)
      raise
    ensure
      # Tear down the thread-local so a pooled thread never carries one
      # request's segment collector into the next request.
      if Catpm.config.instrument_segments
        req_segments&.stop_sampler
        Thread.current[:catpm_request_segments] = nil
      end
    end

    private

    # Build a synthetic :http error Event for an exception that escaped
    # the app and push it onto the shared buffer (no-op when the buffer
    # has not been initialized).
    def record_exception(env, exception)
      return unless Catpm.buffer

      ev = Event.new(
        kind: :http,
        target: target_from_env(env),
        operation: env['REQUEST_METHOD'] || 'GET',
        duration: elapsed_ms(env),
        started_at: Time.current,
        status: 500,
        error_class: exception.class.name,
        error_message: exception.message,
        backtrace: exception.backtrace,
        context: {
          method: env['REQUEST_METHOD'],
          path: env['PATH_INFO']
        }
      )

      Catpm.buffer.push(ev)
    end

    # Prefer "SomeController#action" when Rails routing has already
    # resolved path parameters; otherwise fall back to the raw path.
    def target_from_env(env)
      if env['action_dispatch.request.path_parameters']
        params = env['action_dispatch.request.path_parameters']
        "#{params[:controller]&.camelize}Controller##{params[:action]}"
      else
        env['PATH_INFO'] || 'unknown'
      end
    end

    # Milliseconds since the start marker written in #call; 0.0 when the
    # marker is absent (e.g. the exception fired before it was set).
    def elapsed_ms(env)
      start = env['catpm.request_start']
      return 0.0 unless start

      (Process.clock_gettime(Process::CLOCK_MONOTONIC) - start) * 1000.0
    end
  end
end
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# frozen_string_literal: true

module Catpm
  # Lightweight Rack middleware probe inserted before each real middleware
  # when `instrument_middleware_stack` is enabled. Uses push_span/pop_span
  # to create nested spans that capture inclusive time per middleware.
  class MiddlewareProbe
    def initialize(app, middleware_name)
      @app = app
      @middleware_name = middleware_name
    end

    # Open a :middleware span around the wrapped middleware's #call and
    # guarantee the span is closed even if the call raises. When no
    # segment collector is present in env, pass the request straight
    # through with zero overhead.
    def call(env)
      segments = env['catpm.segments']
      return @app.call(env) unless segments

      opened_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
      span_index = segments.push_span(
        type: :middleware,
        detail: @middleware_name,
        started_at: opened_at
      )
      begin
        @app.call(env)
      ensure
        segments.pop_span(span_index)
      end
    end
  end
end
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
# frozen_string_literal: true

module Catpm
  module Patches
    # Prepended into HTTPClient so every outbound request made while a
    # request-segment collector is active on the current thread is
    # recorded as an :http segment (method, host+path, status, duration).
    module Httpclient
      # Wraps HTTPClient#do_get_block (the common request path). Falls
      # through to the unpatched implementation when no segment collector
      # is installed for this thread.
      def do_get_block(req, proxy, conn, &block)
        segments = Thread.current[:catpm_request_segments]
        return super unless segments

        uri = req.header.request_uri
        http_method = req.header.request_method

        start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
        response = super
        duration = (Process.clock_gettime(Process::CLOCK_MONOTONIC) - start) * 1000.0

        # Defensive status read: the previous `response.status rescue nil`
        # silently swallowed ANY StandardError raised here; an explicit
        # respond_to? check keeps the nil fallback without hiding errors.
        status = response.respond_to?(:status) ? response.status : nil
        detail = "#{http_method} #{uri.host}#{uri.path}"
        detail += " (#{status})" if status
        # Resolving the app-level call site walks the stack, so only pay
        # that cost for calls over the configured threshold.
        source = duration >= Catpm.config.segment_source_threshold ? extract_catpm_source : nil

        segments.add(
          type: :http, duration: duration, detail: detail,
          source: source, started_at: start
        )

        response
      end

      private

      # Return "path:lineno" for the first application frame on the call
      # stack, or nil when none is found. Depth 2 skips this helper and
      # the patched do_get_block frame above it.
      def extract_catpm_source
        locations = caller_locations(2, 30)
        locations&.each do |loc|
          path = loc.path.to_s
          if Catpm::Fingerprint.app_frame?(path)
            return "#{path}:#{loc.lineno}"
          end
        end
        nil
      end
    end
  end
end
|