catpm 0.2.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/controllers/catpm/application_controller.rb +8 -0
- data/app/controllers/catpm/endpoints_controller.rb +44 -7
- data/app/controllers/catpm/errors_controller.rb +31 -7
- data/app/controllers/catpm/events_controller.rb +30 -6
- data/app/controllers/catpm/status_controller.rb +15 -3
- data/app/controllers/catpm/system_controller.rb +0 -3
- data/app/helpers/catpm/application_helper.rb +19 -7
- data/app/views/catpm/endpoints/show.html.erb +42 -20
- data/app/views/catpm/errors/show.html.erb +12 -7
- data/app/views/catpm/events/index.html.erb +2 -2
- data/app/views/catpm/events/show.html.erb +12 -8
- data/app/views/catpm/samples/show.html.erb +20 -34
- data/app/views/catpm/shared/_page_nav.html.erb +3 -1
- data/app/views/catpm/status/index.html.erb +2 -2
- data/app/views/catpm/system/index.html.erb +2 -2
- data/app/views/layouts/catpm/application.html.erb +62 -16
- data/config/routes.rb +1 -0
- data/lib/catpm/collector.rb +215 -150
- data/lib/catpm/configuration.rb +6 -2
- data/lib/catpm/event.rb +13 -7
- data/lib/catpm/flusher.rb +104 -95
- data/lib/catpm/segment_subscribers.rb +2 -0
- data/lib/catpm/stack_sampler.rb +63 -19
- data/lib/catpm/tdigest.rb +2 -1
- data/lib/catpm/version.rb +1 -1
- metadata +1 -1
data/lib/catpm/flusher.rb
CHANGED
|
@@ -123,14 +123,6 @@ module Catpm
|
|
|
123
123
|
samples = []
|
|
124
124
|
error_groups = {}
|
|
125
125
|
|
|
126
|
-
# Pre-load existing random sample counts per endpoint for filling phase
|
|
127
|
-
@random_sample_counts = {}
|
|
128
|
-
Catpm::Sample.where(sample_type: 'random')
|
|
129
|
-
.joins(:bucket)
|
|
130
|
-
.group('catpm_buckets.kind', 'catpm_buckets.target', 'catpm_buckets.operation')
|
|
131
|
-
.count
|
|
132
|
-
.each { |(kind, target, op), cnt| @random_sample_counts[[ kind, target, op ]] = cnt }
|
|
133
|
-
|
|
134
126
|
events.each do |event|
|
|
135
127
|
# Bucket aggregation
|
|
136
128
|
key = [ event.kind, event.target, event.operation, event.bucket_start ]
|
|
@@ -165,8 +157,8 @@ module Catpm
|
|
|
165
157
|
)
|
|
166
158
|
end
|
|
167
159
|
|
|
168
|
-
# Collect samples
|
|
169
|
-
sample_type =
|
|
160
|
+
# Collect samples (pre-determined by collector — only these events carry full context)
|
|
161
|
+
sample_type = event.sample_type
|
|
170
162
|
if sample_type
|
|
171
163
|
sample_hash = {
|
|
172
164
|
bucket_key: key,
|
|
@@ -174,7 +166,7 @@ module Catpm
|
|
|
174
166
|
sample_type: sample_type,
|
|
175
167
|
recorded_at: event.started_at,
|
|
176
168
|
duration: event.duration,
|
|
177
|
-
context: event.context
|
|
169
|
+
context: event.context || {}
|
|
178
170
|
}
|
|
179
171
|
sample_hash[:error_fingerprint] = error_fp if error_fp
|
|
180
172
|
samples << sample_hash
|
|
@@ -231,43 +223,51 @@ module Catpm
|
|
|
231
223
|
}
|
|
232
224
|
end
|
|
233
225
|
|
|
234
|
-
def determine_sample_type(event)
|
|
235
|
-
return 'error' if event.error?
|
|
236
|
-
|
|
237
|
-
threshold = Catpm.config.slow_threshold_for(event.kind.to_sym)
|
|
238
|
-
return 'slow' if event.duration >= threshold
|
|
239
226
|
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
227
|
+
def rotate_samples(samples)
|
|
228
|
+
return samples if samples.empty?
|
|
229
|
+
|
|
230
|
+
# Pre-fetch counts for all endpoints and types in bulk
|
|
231
|
+
endpoint_keys = samples.map { |s| s[:bucket_key][0..2] }.uniq
|
|
232
|
+
error_fps = samples.filter_map { |s| s[:error_fingerprint] }.uniq
|
|
233
|
+
|
|
234
|
+
# Build counts cache: { [kind, target, op, type] => count }
|
|
235
|
+
counts_cache = {}
|
|
236
|
+
if endpoint_keys.any?
|
|
237
|
+
Catpm::Sample.joins(:bucket)
|
|
238
|
+
.where(catpm_buckets: { kind: endpoint_keys.map(&:first), target: endpoint_keys.map { |k| k[1] }, operation: endpoint_keys.map { |k| k[2] } })
|
|
239
|
+
.where(sample_type: %w[random slow])
|
|
240
|
+
.group('catpm_buckets.kind', 'catpm_buckets.target', 'catpm_buckets.operation', 'catpm_samples.sample_type')
|
|
241
|
+
.count
|
|
242
|
+
.each { |(kind, target, op, type), cnt| counts_cache[[kind, target, op, type]] = cnt }
|
|
246
243
|
end
|
|
247
244
|
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
245
|
+
error_counts = {}
|
|
246
|
+
if error_fps.any?
|
|
247
|
+
Catpm::Sample.where(sample_type: 'error', error_fingerprint: error_fps)
|
|
248
|
+
.group(:error_fingerprint).count
|
|
249
|
+
.each { |fp, cnt| error_counts[fp] = cnt }
|
|
250
|
+
end
|
|
252
251
|
|
|
253
|
-
def rotate_samples(samples)
|
|
254
252
|
samples.each do |sample|
|
|
255
|
-
kind, target, operation = sample[:bucket_key][0
|
|
256
|
-
endpoint_samples = Catpm::Sample
|
|
257
|
-
.joins(:bucket)
|
|
258
|
-
.where(catpm_buckets: { kind: kind, target: target, operation: operation })
|
|
253
|
+
kind, target, operation = sample[:bucket_key][0..2]
|
|
259
254
|
|
|
260
255
|
case sample[:sample_type]
|
|
261
256
|
when 'random'
|
|
262
|
-
|
|
263
|
-
if
|
|
264
|
-
|
|
257
|
+
cache_key = [kind, target, operation, 'random']
|
|
258
|
+
if (counts_cache[cache_key] || 0) >= Catpm.config.max_random_samples_per_endpoint
|
|
259
|
+
oldest = Catpm::Sample.joins(:bucket)
|
|
260
|
+
.where(catpm_buckets: { kind: kind, target: target, operation: operation })
|
|
261
|
+
.where(sample_type: 'random').order(recorded_at: :asc).first
|
|
262
|
+
oldest&.destroy
|
|
265
263
|
end
|
|
266
264
|
when 'slow'
|
|
267
|
-
|
|
268
|
-
if
|
|
269
|
-
weakest =
|
|
270
|
-
|
|
265
|
+
cache_key = [kind, target, operation, 'slow']
|
|
266
|
+
if (counts_cache[cache_key] || 0) >= Catpm.config.max_slow_samples_per_endpoint
|
|
267
|
+
weakest = Catpm::Sample.joins(:bucket)
|
|
268
|
+
.where(catpm_buckets: { kind: kind, target: target, operation: operation })
|
|
269
|
+
.where(sample_type: 'slow').order(duration: :asc).first
|
|
270
|
+
if weakest && sample[:duration] > weakest.duration
|
|
271
271
|
weakest.destroy
|
|
272
272
|
else
|
|
273
273
|
sample[:_skip] = true
|
|
@@ -275,11 +275,10 @@ module Catpm
|
|
|
275
275
|
end
|
|
276
276
|
when 'error'
|
|
277
277
|
fp = sample[:error_fingerprint]
|
|
278
|
-
if fp
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
end
|
|
278
|
+
if fp && (error_counts[fp] || 0) >= Catpm.config.max_error_samples_per_fingerprint
|
|
279
|
+
oldest = Catpm::Sample.where(sample_type: 'error', error_fingerprint: fp)
|
|
280
|
+
.order(recorded_at: :asc).first
|
|
281
|
+
oldest&.destroy
|
|
283
282
|
end
|
|
284
283
|
end
|
|
285
284
|
end
|
|
@@ -288,28 +287,25 @@ module Catpm
|
|
|
288
287
|
end
|
|
289
288
|
|
|
290
289
|
def build_error_context(event)
|
|
290
|
+
event_context = event.context || {}
|
|
291
291
|
ctx = {
|
|
292
292
|
occurred_at: event.started_at.iso8601,
|
|
293
293
|
kind: event.kind,
|
|
294
|
-
operation:
|
|
295
|
-
backtrace:
|
|
296
|
-
bt = event.backtrace || []
|
|
297
|
-
limit = Catpm.config.backtrace_lines
|
|
298
|
-
limit ? bt.first(limit) : bt
|
|
299
|
-
end,
|
|
294
|
+
operation: event_context.slice(:method, :path, :params, :job_class, :job_id, :queue, :target, :metadata),
|
|
295
|
+
backtrace: event.backtrace || [],
|
|
300
296
|
duration: event.duration,
|
|
301
297
|
status: event.status
|
|
302
298
|
}
|
|
303
299
|
|
|
304
300
|
ctx[:target] = event.target if event.target.present?
|
|
305
301
|
|
|
306
|
-
if
|
|
307
|
-
ctx[:segments] =
|
|
308
|
-
ctx[:segments_capped] =
|
|
302
|
+
if event_context[:segments]
|
|
303
|
+
ctx[:segments] = event_context[:segments]
|
|
304
|
+
ctx[:segments_capped] = event_context[:segments_capped]
|
|
309
305
|
end
|
|
310
306
|
|
|
311
|
-
if
|
|
312
|
-
ctx[:segment_summary] =
|
|
307
|
+
if event_context[:segment_summary]
|
|
308
|
+
ctx[:segment_summary] = event_context[:segment_summary]
|
|
313
309
|
end
|
|
314
310
|
|
|
315
311
|
ctx
|
|
@@ -402,48 +398,61 @@ module Catpm
|
|
|
402
398
|
cutoff = age_threshold.ago
|
|
403
399
|
target_seconds = target_interval.to_i
|
|
404
400
|
|
|
405
|
-
#
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
[bucket.kind, bucket.target, bucket.operation, aligned_start]
|
|
416
|
-
end
|
|
417
|
-
|
|
418
|
-
groups.each do |(kind, target, operation, aligned_start), buckets|
|
|
419
|
-
# Skip if only one bucket already at the target alignment
|
|
420
|
-
next if buckets.size == 1 && buckets.first.bucket_start.to_i % target_seconds == 0
|
|
401
|
+
# Process in batches to avoid loading all old buckets into memory
|
|
402
|
+
Catpm::Bucket.where(bucket_start: ...cutoff)
|
|
403
|
+
.select(:id, :kind, :target, :operation, :bucket_start)
|
|
404
|
+
.group_by { |b| [b.kind, b.target, b.operation] }
|
|
405
|
+
.each do |(_kind, _target, _operation), endpoint_buckets|
|
|
406
|
+
groups = endpoint_buckets.group_by do |bucket|
|
|
407
|
+
epoch = bucket.bucket_start.to_i
|
|
408
|
+
aligned_epoch = epoch - (epoch % target_seconds)
|
|
409
|
+
Time.at(aligned_epoch).utc
|
|
410
|
+
end
|
|
421
411
|
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
412
|
+
groups.each do |aligned_start, stub_buckets|
|
|
413
|
+
next if stub_buckets.size == 1 && stub_buckets.first.bucket_start.to_i % target_seconds == 0
|
|
414
|
+
|
|
415
|
+
# Load full records only for groups that need merging
|
|
416
|
+
bucket_ids = stub_buckets.map(&:id)
|
|
417
|
+
buckets = Catpm::Bucket.where(id: bucket_ids).to_a
|
|
418
|
+
|
|
419
|
+
merged = {
|
|
420
|
+
kind: buckets.first.kind,
|
|
421
|
+
target: buckets.first.target,
|
|
422
|
+
operation: buckets.first.operation,
|
|
423
|
+
bucket_start: aligned_start,
|
|
424
|
+
count: buckets.sum(&:count),
|
|
425
|
+
success_count: buckets.sum(&:success_count),
|
|
426
|
+
failure_count: buckets.sum(&:failure_count),
|
|
427
|
+
duration_sum: buckets.sum(&:duration_sum),
|
|
428
|
+
duration_max: buckets.map(&:duration_max).max,
|
|
429
|
+
duration_min: buckets.map(&:duration_min).min,
|
|
430
|
+
metadata_sum: merge_bucket_metadata(buckets, adapter),
|
|
431
|
+
p95_digest: merge_bucket_digests(buckets)
|
|
432
|
+
}
|
|
433
|
+
|
|
434
|
+
survivor = buckets.first
|
|
435
|
+
|
|
436
|
+
# Reassign all samples to the survivor bucket
|
|
437
|
+
Catpm::Sample.where(bucket_id: bucket_ids).update_all(bucket_id: survivor.id)
|
|
438
|
+
|
|
439
|
+
# Delete non-survivor source buckets (now sample-free)
|
|
440
|
+
Catpm::Bucket.where(id: bucket_ids - [survivor.id]).delete_all
|
|
441
|
+
|
|
442
|
+
# Overwrite survivor with merged data
|
|
443
|
+
survivor.update!(
|
|
444
|
+
bucket_start: aligned_start,
|
|
445
|
+
count: merged[:count],
|
|
446
|
+
success_count: merged[:success_count],
|
|
447
|
+
failure_count: merged[:failure_count],
|
|
448
|
+
duration_sum: merged[:duration_sum],
|
|
449
|
+
duration_max: merged[:duration_max],
|
|
450
|
+
duration_min: merged[:duration_min],
|
|
451
|
+
metadata_sum: merged[:metadata_sum],
|
|
452
|
+
p95_digest: merged[:p95_digest]
|
|
453
|
+
)
|
|
454
|
+
end
|
|
455
|
+
end
|
|
447
456
|
end
|
|
448
457
|
|
|
449
458
|
def downsample_event_tier(target_interval:, age_threshold:, adapter:)
|
|
@@ -171,6 +171,8 @@ module Catpm
|
|
|
171
171
|
|
|
172
172
|
duration = event.duration
|
|
173
173
|
sql = payload[:sql].to_s
|
|
174
|
+
max_len = Catpm.config.max_sql_length
|
|
175
|
+
sql = sql.truncate(max_len) if max_len && sql.length > max_len
|
|
174
176
|
source = duration >= Catpm.config.segment_source_threshold ? extract_source_location : nil
|
|
175
177
|
|
|
176
178
|
req_segments.add(
|
data/lib/catpm/stack_sampler.rb
CHANGED
|
@@ -2,30 +2,74 @@
|
|
|
2
2
|
|
|
3
3
|
module Catpm
|
|
4
4
|
class StackSampler
|
|
5
|
-
|
|
5
|
+
MS_PER_SECOND = 1000.0
|
|
6
|
+
|
|
7
|
+
# Single global thread that samples all active requests.
|
|
8
|
+
# Avoids creating a thread per request.
|
|
9
|
+
class SamplingLoop
|
|
10
|
+
def initialize
|
|
11
|
+
@mutex = Mutex.new
|
|
12
|
+
@samplers = []
|
|
13
|
+
@thread = nil
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
def register(sampler)
|
|
17
|
+
@mutex.synchronize do
|
|
18
|
+
@samplers << sampler
|
|
19
|
+
start_thread unless @thread&.alive?
|
|
20
|
+
end
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
def unregister(sampler)
|
|
24
|
+
@mutex.synchronize { @samplers.delete(sampler) }
|
|
25
|
+
end
|
|
26
|
+
|
|
27
|
+
private
|
|
28
|
+
|
|
29
|
+
def start_thread
|
|
30
|
+
@thread = Thread.new do
|
|
31
|
+
loop do
|
|
32
|
+
sleep(Catpm.config.stack_sample_interval)
|
|
33
|
+
sample_all
|
|
34
|
+
end
|
|
35
|
+
end
|
|
36
|
+
@thread.priority = -1
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
def sample_all
|
|
40
|
+
now = Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
|
41
|
+
targets = @mutex.synchronize { @samplers.dup }
|
|
42
|
+
targets.each { |s| s.capture(now) }
|
|
43
|
+
end
|
|
44
|
+
end
|
|
45
|
+
|
|
46
|
+
@loop = SamplingLoop.new
|
|
47
|
+
|
|
48
|
+
class << self
|
|
49
|
+
attr_reader :loop
|
|
50
|
+
end
|
|
6
51
|
|
|
7
52
|
def initialize(target_thread:, request_start:)
|
|
8
53
|
@target = target_thread
|
|
9
54
|
@request_start = request_start
|
|
10
55
|
@samples = []
|
|
11
|
-
@running = false
|
|
12
56
|
end
|
|
13
57
|
|
|
14
58
|
def start
|
|
15
|
-
|
|
16
|
-
@thread = Thread.new do
|
|
17
|
-
while @running
|
|
18
|
-
locs = @target.backtrace_locations
|
|
19
|
-
@samples << [Process.clock_gettime(Process::CLOCK_MONOTONIC), locs] if locs
|
|
20
|
-
sleep(SAMPLE_INTERVAL)
|
|
21
|
-
end
|
|
22
|
-
end
|
|
23
|
-
@thread.priority = -1
|
|
59
|
+
self.class.loop.register(self)
|
|
24
60
|
end
|
|
25
61
|
|
|
26
62
|
def stop
|
|
27
|
-
|
|
28
|
-
|
|
63
|
+
self.class.loop.unregister(self)
|
|
64
|
+
end
|
|
65
|
+
|
|
66
|
+
# Called by SamplingLoop from the global thread
|
|
67
|
+
def capture(now)
|
|
68
|
+
max = Catpm.config.max_stack_samples_per_request
|
|
69
|
+
return if max && @samples.size >= max
|
|
70
|
+
|
|
71
|
+
locs = @target.backtrace_locations
|
|
72
|
+
@samples << [now, locs] if locs
|
|
29
73
|
end
|
|
30
74
|
|
|
31
75
|
# Returns array of { parent: {segment}, children: [{segment}, ...] }
|
|
@@ -76,7 +120,7 @@ module Catpm
|
|
|
76
120
|
duration = estimate_duration(group)
|
|
77
121
|
next if duration < 1.0
|
|
78
122
|
|
|
79
|
-
offset = ((group[:start_time] - @request_start) *
|
|
123
|
+
offset = ((group[:start_time] - @request_start) * MS_PER_SECOND).round(2)
|
|
80
124
|
app_frame = group[:app_frame]
|
|
81
125
|
leaf = group[:leaves].first&.last
|
|
82
126
|
|
|
@@ -158,8 +202,8 @@ module Catpm
|
|
|
158
202
|
|
|
159
203
|
spans.filter_map do |span|
|
|
160
204
|
duration = [
|
|
161
|
-
(span[:end_time] - span[:start_time]) *
|
|
162
|
-
span[:count] *
|
|
205
|
+
(span[:end_time] - span[:start_time]) * MS_PER_SECOND,
|
|
206
|
+
span[:count] * Catpm.config.stack_sample_interval * MS_PER_SECOND
|
|
163
207
|
].max
|
|
164
208
|
next if duration < 1.0
|
|
165
209
|
|
|
@@ -170,7 +214,7 @@ module Catpm
|
|
|
170
214
|
type: classify_path(path),
|
|
171
215
|
detail: build_gem_detail(frame),
|
|
172
216
|
duration: duration.round(2),
|
|
173
|
-
offset: ((span[:start_time] - @request_start) *
|
|
217
|
+
offset: ((span[:start_time] - @request_start) * MS_PER_SECOND).round(2),
|
|
174
218
|
started_at: span[:start_time]
|
|
175
219
|
}
|
|
176
220
|
end
|
|
@@ -178,8 +222,8 @@ module Catpm
|
|
|
178
222
|
|
|
179
223
|
def estimate_duration(group)
|
|
180
224
|
[
|
|
181
|
-
(group[:end_time] - group[:start_time]) *
|
|
182
|
-
group[:count] *
|
|
225
|
+
(group[:end_time] - group[:start_time]) * MS_PER_SECOND,
|
|
226
|
+
group[:count] * Catpm.config.stack_sample_interval * MS_PER_SECOND
|
|
183
227
|
].max
|
|
184
228
|
end
|
|
185
229
|
|
data/lib/catpm/tdigest.rb
CHANGED
|
@@ -12,6 +12,7 @@ module Catpm
|
|
|
12
12
|
Centroid = Struct.new(:mean, :weight)
|
|
13
13
|
|
|
14
14
|
COMPRESSION = 100 # Controls accuracy vs. memory trade-off
|
|
15
|
+
BUFFER_FLUSH_FACTOR = 2 # Lower = more frequent flushes (better accuracy), higher = fewer flushes (better performance)
|
|
15
16
|
|
|
16
17
|
attr_reader :count
|
|
17
18
|
|
|
@@ -22,7 +23,7 @@ module Catpm
|
|
|
22
23
|
@min = Float::INFINITY
|
|
23
24
|
@max = -Float::INFINITY
|
|
24
25
|
@buffer = []
|
|
25
|
-
@buffer_limit = @compression *
|
|
26
|
+
@buffer_limit = @compression * BUFFER_FLUSH_FACTOR
|
|
26
27
|
end
|
|
27
28
|
|
|
28
29
|
def add(value, weight = 1)
|
data/lib/catpm/version.rb
CHANGED