catpm 0.9.5 → 0.9.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 18b84e4c767fa3872bbf196d21c3d197e6d4f6517269357e7b24bf0514da3708
4
- data.tar.gz: 6cc282d9f10f13546939f8e9c39b12b784687ef4c0d25e42a2a7e34a39cd3124
3
+ metadata.gz: 54937b58ef7d18fa437e232b7a660ac014737a6e716daed6e57ab7463dc38e27
4
+ data.tar.gz: 76cfd9389ecb1f37794806353c2c56f1d7f799a9bf6f9e8c0c975c93b8423c53
5
5
  SHA512:
6
- metadata.gz: 1968d1e8c7ed1257d2f0bfc3e28f9e34b4e38057ec89e103c10f02bd42d99daa78cf7ba0a9fc31ff5c07a8dc9e0895d9f6c541c557345469a20e2c09da88dd9c
7
- data.tar.gz: d9e5c1b664605c6e66c92e9e6520a1c606f979f10e5e0deb8883307b965f4b4c3d361ef2dcf7ddfe3fb840d99e107c4c666a1ef4487165543f5904285f2a5a07
6
+ metadata.gz: a948c19294ca90dc60215f58e3d8f6fbdd377f4b62f468eba76678b223af37610d549d4a52ea7f42d7c6fec4ab93952bceeb6e19857ec9c67ecf601a4a1a9b51
7
+ data.tar.gz: 298c9964d29d3fc9b2570720a7813b30adc5f21c9b7f75e49c288c9fc4f4cd65c4196c798dc05caeb7824ebefb2553cd82b6778a3f1e1a8eeaac4f3d893a703b
data/README.md CHANGED
@@ -1,5 +1,5 @@
1
1
  gem build catpm.gemspec
2
- gem push catpm-0.9.4.gem
2
+ gem push catpm-0.9.6.gem
3
3
 
4
4
  # Catpm
5
5
 
@@ -94,7 +94,7 @@ module Catpm
94
94
  events_max_samples_per_name: { group: 'Events', label: 'Max Samples / Name', desc: 'Event samples retained per event name', fmt: :nullable_int },
95
95
 
96
96
  # ── Buffer & Flush ──
97
- max_memory_per_thread: { group: 'Buffer & Flush', label: 'Memory per Thread', desc: 'Memory budget per application thread (split between request segments and event buffer)', fmt: :bytes },
97
+ max_memory: { group: 'Buffer & Flush', label: 'Max Memory (MB)', desc: 'Global memory budget for catpm gem in megabytes', fmt: :int },
98
98
  flush_interval: { group: 'Buffer & Flush', label: 'Flush Interval', desc: 'How often the background thread drains the buffer to the database', fmt: :seconds },
99
99
  flush_jitter: { group: 'Buffer & Flush', label: 'Flush Jitter', desc: 'Random jitter added to flush interval to avoid thundering herd', fmt: :pm_seconds },
100
100
  persistence_batch_size: { group: 'Buffer & Flush', label: 'Batch Size', desc: 'Number of events written per database transaction', fmt: :int },
@@ -14,10 +14,6 @@ module Catpm
14
14
  scope :recent, ->(period = 1.hour) { where(recorded_at: period.ago..) }
15
15
  scope :for_error, ->(fingerprint) { where(error_fingerprint: fingerprint) }
16
16
 
17
- def self.request_id_supported?
18
- column_names.include?('request_id')
19
- end
20
-
21
17
  def parsed_context
22
18
  case context
23
19
  when Hash then context
@@ -38,7 +38,6 @@ module Catpm
38
38
  context: sample_data[:context],
39
39
  error_fingerprint: sample_data[:error_fingerprint]
40
40
  }
41
- record[:request_id] = sample_data[:request_id] if Catpm::Sample.request_id_supported?
42
41
  record
43
42
  end
44
43
 
data/lib/catpm/buffer.rb CHANGED
@@ -22,17 +22,19 @@ module Catpm
22
22
  # Called from request threads. Returns :accepted or :dropped.
23
23
  # Never blocks — monitoring must not slow down the application.
24
24
  #
25
+ OVERFLOW_FACTOR = 1.5 # hard cap multiplier — drops events beyond this to prevent OOM
26
+
25
27
  # When buffer reaches max_bytes, signals the flusher for immediate drain
26
- # and continues accepting events. Only drops as a last resort at 3x capacity
27
- # (flusher stuck or DB down).
28
+ # and continues accepting events. Only drops as a last resort at OVERFLOW_FACTOR ×
29
+ # capacity (flusher stuck or DB down).
28
30
  def push(event)
29
31
  signal_flush = false
30
32
 
31
33
  @monitor.synchronize do
32
34
  bytes = event.estimated_bytes
33
35
 
34
- # Hard safety cap: 3x configured limit prevents OOM if flusher is stuck
35
- if @current_bytes + bytes > @max_bytes * 3
36
+ # Hard safety cap: prevents OOM if flusher is stuck
37
+ if @current_bytes + bytes > @max_bytes * OVERFLOW_FACTOR
36
38
  @dropped_count += 1
37
39
  Catpm.stats[:dropped_events] += 1
38
40
  return :dropped
@@ -6,11 +6,6 @@ module Catpm
6
6
  MIN_GAP_MS = 1.0
7
7
  DEFAULT_ERROR_STATUS = 500
8
8
  DEFAULT_SUCCESS_STATUS = 200
9
- # Cap global force-instrument counter to avoid cascade when many requests
10
- # are slow. Without this cap, apps with 30% slow requests would see ~23%
11
- # instrumentation instead of the configured 1/random_sample_rate.
12
- MAX_FORCE_INSTRUMENT_COUNT = 3
13
-
14
9
  class << self
15
10
  def process_action_controller(event)
16
11
  return unless Catpm.enabled?
@@ -62,21 +57,6 @@ module Catpm
62
57
  instrumented: instrumented
63
58
  )
64
59
 
65
- # Force the NEXT HTTP request to be fully instrumented when this one
66
- # wasn't. Covers slow/error spikes and filling phase (new endpoints that
67
- # haven't collected enough instrumented samples yet).
68
- if !instrumented
69
- if payload[:exception] || duration >= Catpm.config.slow_threshold_for(:http)
70
- trigger_force_instrument
71
- else
72
- max = Catpm.config.max_random_samples_per_endpoint
73
- if max
74
- endpoint_key = ['http', target, operation]
75
- trigger_force_instrument if instrumented_sample_counts[endpoint_key] < max
76
- end
77
- end
78
- end
79
-
80
60
  if sample_type
81
61
  context = build_http_context(payload)
82
62
 
@@ -281,11 +261,6 @@ module Catpm
281
261
  instrumented: instrumented
282
262
  )
283
263
 
284
- # Slow spike detection: force instrument next request for this endpoint
285
- if !instrumented && (error || duration >= Catpm.config.slow_threshold_for(kind.to_sym))
286
- trigger_force_instrument(kind: kind, target: target, operation: operation)
287
- end
288
-
289
264
  if sample_type
290
265
  context = (context || {}).dup
291
266
 
@@ -372,8 +347,6 @@ module Catpm
372
347
  context = nil
373
348
  end
374
349
 
375
- request_id = req_segments&.request_id
376
-
377
350
  ev = Event.new(
378
351
  kind: kind,
379
352
  target: target,
@@ -386,56 +359,7 @@ module Catpm
386
359
  metadata: metadata,
387
360
  error_class: error&.class&.name,
388
361
  error_message: error&.message,
389
- backtrace: error&.backtrace,
390
- request_id: request_id
391
- )
392
-
393
- Catpm.buffer&.push(ev)
394
- end
395
-
396
- def process_checkpoint(kind:, target:, operation:, context:, metadata:, checkpoint_data:, request_start:, request_id: nil)
397
- return unless Catpm.enabled?
398
-
399
- segments = checkpoint_data[:segments].dup
400
- collapse_code_wrappers(segments)
401
-
402
- duration_so_far = (Process.clock_gettime(Process::CLOCK_MONOTONIC) - request_start) * 1000.0
403
-
404
- # Inject root request segment
405
- root_segment = {
406
- type: 'request',
407
- detail: "#{operation.presence || kind} #{target}",
408
- duration: duration_so_far.round(2),
409
- offset: 0.0
410
- }
411
- segments.each do |seg|
412
- if seg.key?(:parent_index)
413
- seg[:parent_index] += 1
414
- else
415
- seg[:parent_index] = 0
416
- end
417
- end
418
- segments.unshift(root_segment)
419
-
420
- checkpoint_context = (context || {}).dup
421
- checkpoint_context[:segments] = segments
422
- checkpoint_context[:segment_summary] = checkpoint_data[:summary]
423
- checkpoint_context[:segments_capped] = checkpoint_data[:overflow]
424
- checkpoint_context[:partial] = true
425
- checkpoint_context[:checkpoint_number] = checkpoint_data[:checkpoint_number]
426
- checkpoint_context = scrub(checkpoint_context)
427
-
428
- ev = Event.new(
429
- kind: kind,
430
- target: target,
431
- operation: operation.to_s,
432
- duration: duration_so_far,
433
- started_at: Time.current,
434
- status: DEFAULT_SUCCESS_STATUS,
435
- context: checkpoint_context,
436
- sample_type: 'random',
437
- metadata: (metadata || {}).dup.merge(checkpoint_data[:summary] || {}),
438
- request_id: request_id
362
+ backtrace: error&.backtrace
439
363
  )
440
364
 
441
365
  Catpm.buffer&.push(ev)
@@ -464,102 +388,17 @@ module Catpm
464
388
  # --- Pre-sampling: decide BEFORE request whether to instrument ---
465
389
 
466
390
  # For HTTP middleware where endpoint is unknown at start.
467
- # Returns true if this request should get full instrumentation.
468
391
  def should_instrument_request?
469
- # Force after slow spike detection
470
- if (@force_instrument_count || 0) > 0
471
- @force_instrument_count -= 1
472
- return true
473
- end
474
-
475
392
  rand(Catpm.config.random_sample_rate) == 0
476
393
  end
477
394
 
478
395
  # For track_request where endpoint is known at start.
479
- # Filling phase ensures new endpoints get instrumented samples quickly.
480
- def should_instrument?(kind, target, operation)
481
- endpoint_key = [kind.to_s, target.to_s, (operation || '').to_s]
482
-
483
- # Force after slow spike
484
- if force_instrument_endpoints.delete(endpoint_key)
485
- return true
486
- end
487
-
488
- # Filling phase — endpoint hasn't collected enough instrumented samples yet
489
- max = Catpm.config.max_random_samples_per_endpoint
490
- if max.nil? || instrumented_sample_counts[endpoint_key] < max
491
- return true
492
- end
493
-
396
+ def should_instrument?(_kind, _target, _operation)
494
397
  rand(Catpm.config.random_sample_rate) == 0
495
398
  end
496
399
 
497
- # Called when a slow/error request had no instrumentation —
498
- # forces the NEXT request(s) to be fully instrumented.
499
- #
500
- # Two modes (mutually exclusive to avoid double-instrumentation):
501
- # - With endpoint: sets per-endpoint flag consumed by should_instrument?
502
- # (for track_request paths where endpoint is known)
503
- # - Without endpoint: increments global counter consumed by
504
- # should_instrument_request? (for middleware path where endpoint is unknown)
505
- def trigger_force_instrument(kind: nil, target: nil, operation: nil)
506
- if kind && target
507
- endpoint_key = [kind.to_s, target.to_s, (operation || '').to_s]
508
- force_instrument_endpoints[endpoint_key] = true
509
- else
510
- @force_instrument_count = [(@force_instrument_count || 0) + 1, MAX_FORCE_INSTRUMENT_COUNT].min
511
- end
512
- end
513
-
514
- def reset_sample_counts!
515
- @instrumented_sample_counts = nil
516
- @instrumented_sample_counts_loaded = false
517
- @force_instrument_endpoints = nil
518
- @force_instrument_count = nil
519
- end
520
-
521
400
  private
522
401
 
523
- def force_instrument_endpoints
524
- @force_instrument_endpoints ||= {}
525
- end
526
-
527
- def instrumented_sample_counts
528
- return @instrumented_sample_counts if @instrumented_sample_counts_loaded
529
-
530
- @instrumented_sample_counts = load_sample_counts_from_db
531
- @instrumented_sample_counts_loaded = true
532
- @instrumented_sample_counts
533
- end
534
-
535
- # Pre-populate filling counters from DB so old endpoints don't
536
- # re-enter filling phase on every process restart.
537
- # Temporarily clears thread-local to prevent our query from being
538
- # captured as a segment in any active request.
539
- def load_sample_counts_from_db
540
- counts = Hash.new(0)
541
- return counts unless defined?(Catpm::Sample) && Catpm::Bucket.table_exists?
542
-
543
- saved_rs = Thread.current[:catpm_request_segments]
544
- Thread.current[:catpm_request_segments] = nil
545
- begin
546
- Catpm::Sample.joins(:bucket)
547
- .where(sample_type: 'random')
548
- .group('catpm_buckets.kind', 'catpm_buckets.target', 'catpm_buckets.operation')
549
- .count
550
- .each do |(kind, target, operation), count|
551
- counts[[kind.to_s, target.to_s, operation.to_s]] = count
552
- end
553
- ensure
554
- Thread.current[:catpm_request_segments] = saved_rs
555
- end
556
-
557
- counts
558
- rescue => e
559
- Catpm.config.error_handler&.call(e)
560
- Hash.new(0)
561
- end
562
-
563
402
  # Remove near-zero-duration "code" spans that merely wrap a "controller" span.
564
403
  # This happens when CallTracer (TracePoint) captures a thin dispatch method
565
404
  # (e.g. Telegram::WebhookController#process) whose :return fires before the
@@ -615,39 +454,11 @@ module Catpm
615
454
 
616
455
  # Determine sample type at event creation time so only sampled events
617
456
  # carry full context in the buffer.
618
- #
619
- # Non-instrumented requests never get a sample (they have no segments).
620
- # Filling phase is handled by the caller via trigger_force_instrument,
621
- # so the NEXT request gets full instrumentation with segments.
622
- #
623
- # Post-filling: non-instrumented requests just contribute duration/count
624
- # to the bucket, no sample created.
457
+ # Non-instrumented requests have no segments — skip sample creation.
625
458
  def early_sample_type(error:, duration:, kind:, target:, operation:, instrumented: true)
626
- # Errors: only create sample for instrumented requests (with segments).
627
- # Non-instrumented errors are still tracked in error_groups via
628
- # event.error? — occurrence counts, contexts, and backtrace are preserved.
629
- # trigger_force_instrument ensures the next occurrence gets full segments.
630
459
  return 'error' if error && instrumented
631
-
632
- is_slow = duration >= Catpm.config.slow_threshold_for(kind.to_sym)
633
-
634
- # Non-instrumented requests have no segments — skip sample creation.
635
- # Slow/error spikes are handled by the caller via trigger_force_instrument
636
- # so the NEXT request gets full instrumentation with useful segments.
637
460
  return nil unless instrumented
638
-
639
- # Count this instrumented request towards filling phase completion.
640
- # Both slow and random requests count — without this, endpoints where
641
- # most requests exceed slow_threshold would never exit the filling phase,
642
- # causing 100% instrumentation regardless of random_sample_rate.
643
- endpoint_key = [kind.to_s, target, operation.to_s]
644
- count = instrumented_sample_counts[endpoint_key]
645
- max_random = Catpm.config.max_random_samples_per_endpoint
646
- if max_random.nil? || count < max_random
647
- instrumented_sample_counts[endpoint_key] = count + 1
648
- end
649
-
650
- return 'slow' if is_slow
461
+ return 'slow' if duration >= Catpm.config.slow_threshold_for(kind.to_sym)
651
462
 
652
463
  'random'
653
464
  end
@@ -2,12 +2,13 @@
2
2
 
3
3
  module Catpm
4
4
  class Configuration
5
- # Memory budget shares — how max_memory_per_thread is split
6
- BUFFER_MEMORY_SHARE = 0.5 # half per-thread budget goes to buffer pool
7
- REQUEST_MEMORY_SHARE = 0.5 # half per-thread budget for request segments
8
- MIN_REQUEST_MEMORY = 1_024 # 1 KB — floor for per-request (checkpoint viability, ~5 minimal segments)
9
5
  MIN_BUFFER_MEMORY = 1_048_576 # 1 MB — floor for buffer (meaningful buffering)
10
- DEFAULT_ASSUMED_THREADS = 5 # fallback when thread detection fails
6
+ DEFAULT_ASSUMED_THREADS = 5 # fallback when thread detection fails
7
+
8
+ # Global memory budget distribution shares
9
+ BUFFER_MEMORY_SHARE = 0.5 # 50% of max_memory for event buffer
10
+ CACHE_ENTRIES_PER_MB = 10_000 # ~100 bytes/entry in path_cache
11
+ PATH_CACHE_BUDGET_SHARE = 0.05 # 5% of max_memory for path_cache
11
12
 
12
13
  # Boolean / non-numeric settings — plain attr_accessor
13
14
  attr_accessor :enabled,
@@ -37,7 +38,7 @@ module Catpm
37
38
  random_sample_rate cleanup_interval
38
39
  circuit_breaker_failure_threshold circuit_breaker_recovery_timeout
39
40
  sqlite_busy_timeout persistence_batch_size shutdown_timeout
40
- stack_sample_interval min_segment_duration
41
+ stack_sample_interval min_segment_duration max_memory
41
42
  ].freeze
42
43
 
43
44
  # Numeric settings where nil means "no limit" / "disabled"
@@ -48,7 +49,6 @@ module Catpm
48
49
  events_max_samples_per_name max_stack_samples_per_request
49
50
  max_error_detail_length max_fingerprint_app_frames
50
51
  max_fingerprint_gem_frames cleanup_batch_size caller_scan_depth
51
- max_memory_per_thread
52
52
  ].freeze
53
53
 
54
54
  (REQUIRED_NUMERIC + OPTIONAL_NUMERIC).each do |attr|
@@ -83,7 +83,7 @@ module Catpm
83
83
  @slow_threshold_per_kind = {}
84
84
  @ignored_targets = []
85
85
  @retention_period = nil # nil = keep forever (data is downsampled, not deleted)
86
- @max_memory_per_thread = 2.megabytes
86
+ @max_memory = 20 # MB — global memory budget (2% of 1GB server)
87
87
  @flush_interval = 30 # seconds
88
88
  @flush_jitter = 5 # ±seconds
89
89
  @max_error_contexts = 5
@@ -124,17 +124,15 @@ module Catpm
124
124
  @show_untracked_segments = false
125
125
  end
126
126
 
127
- def derived_request_memory_limit
128
- return nil unless max_memory_per_thread
129
-
130
- [max_memory_per_thread * REQUEST_MEMORY_SHARE, MIN_REQUEST_MEMORY].max
127
+ # Buffer gets BUFFER_MEMORY_SHARE of max_memory (given in MB), never less than MIN_BUFFER_MEMORY
128
+ def effective_max_buffer_memory
129
+ bytes = (max_memory * 1_048_576 * BUFFER_MEMORY_SHARE).to_i
130
+ [bytes, MIN_BUFFER_MEMORY].max
131
131
  end
132
132
 
133
- def derived_buffer_memory_limit(detected_threads = nil)
134
- return MIN_BUFFER_MEMORY unless max_memory_per_thread
135
-
136
- threads = detected_threads || DEFAULT_ASSUMED_THREADS
137
- [max_memory_per_thread * BUFFER_MEMORY_SHARE * threads, MIN_BUFFER_MEMORY].max
133
+ # Path cache limit derived from max_memory
134
+ def effective_path_cache_max
135
+ (max_memory * CACHE_ENTRIES_PER_MB * PATH_CACHE_BUDGET_SHARE).to_i
138
136
  end
139
137
 
140
138
  def slow_threshold_for(kind)
data/lib/catpm/event.rb CHANGED
@@ -9,14 +9,14 @@ module Catpm
9
9
 
10
10
  attr_accessor :kind, :target, :operation, :duration, :started_at,
11
11
  :metadata, :error_class, :error_message, :backtrace,
12
- :sample_type, :context, :status, :request_id
12
+ :sample_type, :context, :status
13
13
 
14
14
  EMPTY_HASH = {}.freeze
15
15
  private_constant :EMPTY_HASH
16
16
 
17
17
  def initialize(kind:, target:, operation: '', duration: 0.0, started_at: nil,
18
18
  metadata: nil, error_class: nil, error_message: nil, backtrace: nil,
19
- sample_type: nil, context: nil, status: nil, request_id: nil)
19
+ sample_type: nil, context: nil, status: nil)
20
20
  @kind = kind.to_s
21
21
  @target = target.to_s
22
22
  @operation = (operation || '').to_s
@@ -32,7 +32,6 @@ module Catpm
32
32
  @sample_type = sample_type
33
33
  @context = context
34
34
  @status = status
35
- @request_id = request_id
36
35
  end
37
36
 
38
37
  def estimated_bytes
@@ -40,7 +40,7 @@ module Catpm
40
40
 
41
41
  result = _app_frame?(line)
42
42
  @path_cache_mutex.synchronize do
43
- @path_cache.clear if @path_cache.size > 4000
43
+ @path_cache.clear if @path_cache.size > Catpm.config.effective_path_cache_max
44
44
  @path_cache[line] = result
45
45
  end
46
46
  result
data/lib/catpm/flusher.rb CHANGED
@@ -3,7 +3,6 @@
3
3
  module Catpm
4
4
  class Flusher
5
5
  ERROR_LOG_BACKTRACE_LINES = 5
6
- PARTIAL_STALE_TIMEOUT = 600 # seconds — orphaned partial samples cleaned after 10 minutes
7
6
 
8
7
  attr_reader :running
9
8
 
@@ -182,8 +181,7 @@ module Catpm
182
181
  sample_type: sample_type,
183
182
  recorded_at: event.started_at,
184
183
  duration: event.duration,
185
- context: event.context || {},
186
- request_id: event.request_id
184
+ context: event.context || {}
187
185
  }
188
186
  sample_hash[:error_fingerprint] = error_fp if error_fp
189
187
  samples << sample_hash
@@ -221,8 +219,6 @@ module Catpm
221
219
  b
222
220
  end
223
221
 
224
- samples = merge_request_samples(samples)
225
-
226
222
  [ buckets, samples, error_groups.values ]
227
223
  end
228
224
 
@@ -350,8 +346,6 @@ module Catpm
350
346
  @last_cleanup_at = Time.now
351
347
  downsample_buckets
352
348
  cleanup_expired_data if Catpm.config.retention_period
353
- cleanup_orphaned_partials
354
- Collector.reset_sample_counts!
355
349
  end
356
350
 
357
351
  def downsample_buckets
@@ -398,83 +392,93 @@ module Catpm
398
392
  cutoff = age_threshold.ago
399
393
  target_seconds = target_interval.to_i
400
394
 
401
- # Process in batches to avoid loading all old buckets into memory
402
- Catpm::Bucket.where(bucket_start: ...cutoff)
403
- .select(:id, :kind, :target, :operation, :bucket_start)
404
- .group_by { |b| [b.kind, b.target, b.operation] }
405
- .each do |(_kind, _target, _operation), endpoint_buckets|
406
- groups = endpoint_buckets.group_by do |bucket|
407
- epoch = bucket.bucket_start.to_i
408
- aligned_epoch = epoch - (epoch % target_seconds)
409
- Time.at(aligned_epoch).utc
410
- end
395
+ # Get unique endpoint keys first (small set), then process per-endpoint
396
+ # to avoid loading all old buckets into memory at once
397
+ endpoint_keys = Catpm::Bucket.where(bucket_start: ...cutoff)
398
+ .distinct.pluck(:kind, :target, :operation)
411
399
 
412
- groups.each do |aligned_start, stub_buckets|
413
- next if stub_buckets.size == 1 && stub_buckets.first.bucket_start.to_i % target_seconds == 0
414
-
415
- # Load full records only for groups that need merging
416
- bucket_ids = stub_buckets.map(&:id)
417
- buckets = Catpm::Bucket.where(id: bucket_ids).to_a
418
-
419
- merged = {
420
- kind: buckets.first.kind,
421
- target: buckets.first.target,
422
- operation: buckets.first.operation,
423
- bucket_start: aligned_start,
424
- count: buckets.sum(&:count),
425
- success_count: buckets.sum(&:success_count),
426
- failure_count: buckets.sum(&:failure_count),
427
- duration_sum: buckets.sum(&:duration_sum),
428
- duration_max: buckets.map(&:duration_max).max,
429
- duration_min: buckets.map(&:duration_min).min,
430
- metadata_sum: merge_bucket_metadata(buckets, adapter),
431
- p95_digest: merge_bucket_digests(buckets)
432
- }
433
-
434
- survivor = buckets.first
435
-
436
- # Reassign all samples to the survivor bucket
437
- Catpm::Sample.where(bucket_id: bucket_ids).update_all(bucket_id: survivor.id)
438
-
439
- # Delete non-survivor source buckets (now sample-free)
440
- Catpm::Bucket.where(id: bucket_ids - [survivor.id]).delete_all
441
-
442
- # Overwrite survivor with merged data
443
- survivor.update!(
444
- bucket_start: aligned_start,
445
- count: merged[:count],
446
- success_count: merged[:success_count],
447
- failure_count: merged[:failure_count],
448
- duration_sum: merged[:duration_sum],
449
- duration_max: merged[:duration_max],
450
- duration_min: merged[:duration_min],
451
- metadata_sum: merged[:metadata_sum],
452
- p95_digest: merged[:p95_digest]
453
- )
454
- end
400
+ endpoint_keys.each do |kind, target, operation|
401
+ endpoint_buckets = Catpm::Bucket
402
+ .where(kind: kind, target: target, operation: operation, bucket_start: ...cutoff)
403
+ .select(:id, :bucket_start).to_a
404
+
405
+ groups = endpoint_buckets.group_by do |bucket|
406
+ epoch = bucket.bucket_start.to_i
407
+ aligned_epoch = epoch - (epoch % target_seconds)
408
+ Time.at(aligned_epoch).utc
409
+ end
410
+
411
+ groups.each do |aligned_start, stub_buckets|
412
+ next if stub_buckets.size == 1 && stub_buckets.first.bucket_start.to_i % target_seconds == 0
413
+
414
+ # Load full records only for groups that need merging
415
+ bucket_ids = stub_buckets.map(&:id)
416
+ buckets = Catpm::Bucket.where(id: bucket_ids).to_a
417
+
418
+ merged = {
419
+ kind: buckets.first.kind,
420
+ target: buckets.first.target,
421
+ operation: buckets.first.operation,
422
+ bucket_start: aligned_start,
423
+ count: buckets.sum(&:count),
424
+ success_count: buckets.sum(&:success_count),
425
+ failure_count: buckets.sum(&:failure_count),
426
+ duration_sum: buckets.sum(&:duration_sum),
427
+ duration_max: buckets.map(&:duration_max).max,
428
+ duration_min: buckets.map(&:duration_min).min,
429
+ metadata_sum: merge_bucket_metadata(buckets, adapter),
430
+ p95_digest: merge_bucket_digests(buckets)
431
+ }
432
+
433
+ survivor = buckets.first
434
+
435
+ # Reassign all samples to the survivor bucket
436
+ Catpm::Sample.where(bucket_id: bucket_ids).update_all(bucket_id: survivor.id)
437
+
438
+ # Delete non-survivor source buckets (now sample-free)
439
+ Catpm::Bucket.where(id: bucket_ids - [survivor.id]).delete_all
440
+
441
+ # Overwrite survivor with merged data
442
+ survivor.update!(
443
+ bucket_start: aligned_start,
444
+ count: merged[:count],
445
+ success_count: merged[:success_count],
446
+ failure_count: merged[:failure_count],
447
+ duration_sum: merged[:duration_sum],
448
+ duration_max: merged[:duration_max],
449
+ duration_min: merged[:duration_min],
450
+ metadata_sum: merged[:metadata_sum],
451
+ p95_digest: merged[:p95_digest]
452
+ )
455
453
  end
454
+ end
456
455
  end
457
456
 
458
457
  def downsample_event_tier(target_interval:, age_threshold:, adapter:)
459
458
  cutoff = age_threshold.ago
460
459
  target_seconds = target_interval.to_i
461
460
 
462
- source_buckets = Catpm::EventBucket.where(bucket_start: ...cutoff).to_a
463
- return if source_buckets.empty?
461
+ # Process per-name to avoid loading all event buckets into memory
462
+ names = Catpm::EventBucket.where(bucket_start: ...cutoff).distinct.pluck(:name)
463
+ return if names.empty?
464
464
 
465
- groups = source_buckets.group_by do |bucket|
466
- epoch = bucket.bucket_start.to_i
467
- aligned_epoch = epoch - (epoch % target_seconds)
468
- aligned_start = Time.at(aligned_epoch).utc
469
- [bucket.name, aligned_start]
470
- end
465
+ names.each do |name|
466
+ buckets = Catpm::EventBucket.where(name: name, bucket_start: ...cutoff).to_a
467
+ next if buckets.empty?
471
468
 
472
- groups.each do |(name, aligned_start), buckets|
473
- next if buckets.size == 1 && buckets.first.bucket_start.to_i % target_seconds == 0
469
+ groups = buckets.group_by do |bucket|
470
+ epoch = bucket.bucket_start.to_i
471
+ aligned_epoch = epoch - (epoch % target_seconds)
472
+ Time.at(aligned_epoch).utc
473
+ end
474
+
475
+ groups.each do |aligned_start, group_buckets|
476
+ next if group_buckets.size == 1 && group_buckets.first.bucket_start.to_i % target_seconds == 0
474
477
 
475
- merged = { name: name, bucket_start: aligned_start, count: buckets.sum(&:count) }
476
- Catpm::EventBucket.where(id: buckets.map(&:id)).delete_all
477
- adapter.persist_event_buckets([merged])
478
+ merged = { name: name, bucket_start: aligned_start, count: group_buckets.sum(&:count) }
479
+ Catpm::EventBucket.where(id: group_buckets.map(&:id)).delete_all
480
+ adapter.persist_event_buckets([merged])
481
+ end
478
482
  end
479
483
  end
480
484
 
@@ -494,136 +498,6 @@ module Catpm
494
498
  combined.empty? ? nil : combined.serialize
495
499
  end
496
500
 
497
- def merge_request_samples(samples)
498
- return samples unless Catpm::Sample.request_id_supported?
499
-
500
- by_request = {} # request_id => { partials: [], final: nil }
501
- regular = []
502
-
503
- samples.each do |s|
504
- rid = s[:request_id]
505
- if rid
506
- entry = (by_request[rid] ||= { partials: [], final: nil })
507
- if s[:context].is_a?(Hash) && s[:context][:partial]
508
- entry[:partials] << s
509
- else
510
- entry[:final] = s
511
- end
512
- else
513
- regular << s
514
- end
515
- end
516
-
517
- merged = []
518
- by_request.each do |rid, entry|
519
- if entry[:final]
520
- # Merge in-batch partials
521
- if entry[:partials].any?
522
- merge_checkpoint_contexts(
523
- entry[:final][:context],
524
- entry[:partials].map { |p| p[:context] }
525
- )
526
- end
527
-
528
- # Merge cross-batch partials from DB
529
- db_partials = Catpm::Sample.where(request_id: rid)
530
- if db_partials.exists?
531
- merge_checkpoint_contexts(
532
- entry[:final][:context],
533
- db_partials.map(&:parsed_context)
534
- )
535
- db_partials.delete_all
536
- end
537
-
538
- # Clear request_id so persisted final sample won't be treated as orphan
539
- entry[:final][:request_id] = nil
540
- merged << entry[:final]
541
- else
542
- # Only partials, no final yet — persist as-is
543
- merged.concat(entry[:partials])
544
- end
545
- end
546
-
547
- regular + merged
548
- end
549
-
550
- def merge_checkpoint_contexts(final_ctx, checkpoint_ctxs)
551
- final_segments = final_ctx[:segments] || final_ctx['segments']
552
- return unless final_segments
553
-
554
- final_ctrl_idx = final_segments.index { |s|
555
- (s[:type] || s['type']) == 'controller'
556
- }
557
-
558
- sorted = checkpoint_ctxs.sort_by { |c|
559
- c[:checkpoint_number] || c['checkpoint_number'] || 0
560
- }
561
-
562
- sorted.each do |cp_ctx|
563
- cp_segments = cp_ctx[:segments] || cp_ctx['segments'] || []
564
-
565
- old_to_new = {}
566
- kept = []
567
-
568
- cp_segments.each_with_index do |seg, i|
569
- seg_type = seg[:type] || seg['type']
570
- next if seg_type == 'request'
571
- next if seg_type == 'controller'
572
- old_to_new[i] = final_segments.size + kept.size
573
- kept << seg.dup
574
- end
575
-
576
- kept.each do |seg|
577
- pi_key = seg.key?(:parent_index) ? :parent_index : 'parent_index'
578
- pi = seg[pi_key]
579
- next unless pi
580
-
581
- if old_to_new.key?(pi)
582
- seg[pi_key] = old_to_new[pi]
583
- else
584
- seg[pi_key] = final_ctrl_idx || 0
585
- end
586
- end
587
-
588
- final_segments.concat(kept)
589
-
590
- # Merge summary
591
- cp_summary = cp_ctx[:segment_summary] || cp_ctx['segment_summary']
592
- if cp_summary
593
- use_symbols = final_ctx.key?(:segment_summary)
594
- summary_key = use_symbols ? :segment_summary : 'segment_summary'
595
- final_summary = final_ctx[summary_key] ||= {}
596
- cp_summary.each do |k, v|
597
- nk = use_symbols ? k.to_sym : k.to_s
598
- final_summary[nk] = (final_summary[nk] || 0) + v.to_f
599
- end
600
- end
601
-
602
- # Merge capped flag
603
- capped_key = final_ctx.key?(:segments_capped) ? :segments_capped : 'segments_capped'
604
- cp_capped = cp_ctx[:segments_capped] || cp_ctx['segments_capped']
605
- final_ctx[capped_key] = true if cp_capped
606
- end
607
-
608
- # Clean up checkpoint markers
609
- final_ctx.delete(:partial)
610
- final_ctx.delete('partial')
611
- final_ctx.delete(:request_id)
612
- final_ctx.delete('request_id')
613
- final_ctx.delete(:checkpoint_number)
614
- final_ctx.delete('checkpoint_number')
615
- end
616
-
617
- def cleanup_orphaned_partials
618
- return unless Catpm::Sample.request_id_supported?
619
-
620
- Catpm::Sample.where.not(request_id: nil)
621
- .where(recorded_at: ..PARTIAL_STALE_TIMEOUT.seconds.ago)
622
- .delete_all
623
- rescue => e
624
- Catpm.config.error_handler&.call(e)
625
- end
626
-
627
501
  def cleanup_expired_data
628
502
  cutoff = Catpm.config.retention_period.ago
629
503
  batch_size = Catpm.config.cleanup_batch_size
@@ -39,15 +39,7 @@ module Catpm
39
39
  end
40
40
 
41
41
  def initialize_buffer
42
- max_bytes = Catpm.config.derived_buffer_memory_limit(detect_threads)
43
- Catpm.buffer ||= Buffer.new(max_bytes: max_bytes)
44
- end
45
-
46
- def detect_threads
47
- return Puma.cli_config.options[:max_threads] if defined?(Puma::Server) && Puma.respond_to?(:cli_config)
48
- return ENV['RAILS_MAX_THREADS'].to_i if ENV['RAILS_MAX_THREADS'].present?
49
- return Sidekiq[:concurrency] if defined?(Sidekiq) && Sidekiq.respond_to?(:[])
50
- nil
42
+ Catpm.buffer ||= Buffer.new(max_bytes: Catpm.config.effective_max_buffer_memory)
51
43
  end
52
44
 
53
45
  def initialize_flusher
@@ -20,8 +20,7 @@ module Catpm
20
20
  max_segments: Catpm.config.max_segments_per_request,
21
21
  request_start: env['catpm.request_start'],
22
22
  stack_sample: use_sampler,
23
- call_tree: Catpm.config.instrument_call_tree,
24
- memory_limit: Catpm.config.derived_request_memory_limit
23
+ call_tree: Catpm.config.instrument_call_tree
25
24
  )
26
25
  env['catpm.segments'] = req_segments
27
26
  Thread.current[:catpm_request_segments] = req_segments
@@ -9,9 +9,9 @@ module Catpm
9
9
  SEGMENT_BASE_BYTES = Event::OBJECT_OVERHEAD + (6 * Event::HASH_ENTRY_SIZE)
10
10
  SEGMENT_STRING_OVERHEAD = Event::OBJECT_OVERHEAD # per-string overhead in segment values
11
11
 
12
- attr_reader :segments, :summary, :request_start, :estimated_bytes, :checkpoint_count, :request_id, :segments_filtered
12
+ attr_reader :segments, :summary, :request_start, :estimated_bytes, :segments_filtered
13
13
 
14
- def initialize(max_segments:, request_start: nil, stack_sample: false, call_tree: false, memory_limit: nil)
14
+ def initialize(max_segments:, request_start: nil, stack_sample: false, call_tree: false)
15
15
  @max_segments = max_segments
16
16
  @request_start = request_start || Process.clock_gettime(Process::CLOCK_MONOTONIC)
17
17
  @segments = []
@@ -20,11 +20,7 @@ module Catpm
20
20
  @span_stack = []
21
21
  @tracked_ranges = []
22
22
  @call_tree = call_tree
23
- @memory_limit = memory_limit
24
23
  @estimated_bytes = 0
25
- @checkpoint_callback = nil
26
- @checkpoint_count = 0
27
- @request_id = memory_limit ? SecureRandom.hex(8) : nil
28
24
  @segments_filtered = 0
29
25
 
30
26
  if stack_sample
@@ -33,10 +29,6 @@ module Catpm
33
29
  end
34
30
  end
35
31
 
36
- def on_checkpoint(&block)
37
- @checkpoint_callback = block
38
- end
39
-
40
32
  def add(type:, duration:, detail:, source: nil, started_at: nil)
41
33
  type_key = type.to_sym
42
34
  count_key, dur_key = SUMMARY_KEYS[type_key]
@@ -73,7 +65,6 @@ module Catpm
73
65
  end
74
66
 
75
67
  @estimated_bytes += estimate_segment_bytes(segment)
76
- maybe_checkpoint
77
68
  end
78
69
 
79
70
  def push_span(type:, detail:, started_at: nil)
@@ -166,68 +157,5 @@ module Catpm
166
157
  bytes
167
158
  end
168
159
 
169
- def maybe_checkpoint
170
- return unless @memory_limit && @estimated_bytes > @memory_limit && @checkpoint_callback
171
-
172
- checkpoint_data = {
173
- segments: @segments,
174
- summary: @summary,
175
- overflow: @overflow,
176
- sampler_segments: @sampler ? sampler_segments_for_checkpoint : [],
177
- checkpoint_number: @checkpoint_count
178
- }
179
-
180
- @checkpoint_count += 1
181
- rebuild_after_checkpoint
182
- @checkpoint_callback.call(checkpoint_data)
183
- end
184
-
185
- def sampler_segments_for_checkpoint
186
- if @call_tree
187
- result = @sampler&.to_call_tree(tracked_ranges: @tracked_ranges) || []
188
- else
189
- result = @sampler&.to_segments(tracked_ranges: @tracked_ranges) || []
190
- end
191
- @sampler&.clear_samples!
192
- result
193
- end
194
-
195
- # After checkpoint: keep only active spans from @span_stack, reset everything else.
196
- def rebuild_after_checkpoint
197
- if @span_stack.any?
198
- # Clone active spans with corrected indices
199
- new_segments = []
200
- old_to_new = {}
201
-
202
- @span_stack.each do |old_idx|
203
- seg = @segments[old_idx]
204
- next unless seg
205
-
206
- new_idx = new_segments.size
207
- old_to_new[old_idx] = new_idx
208
- new_segments << seg.dup
209
- end
210
-
211
- # Fix parent_index references in cloned spans
212
- new_segments.each do |seg|
213
- if seg.key?(:parent_index) && old_to_new.key?(seg[:parent_index])
214
- seg[:parent_index] = old_to_new[seg[:parent_index]]
215
- else
216
- seg.delete(:parent_index)
217
- end
218
- end
219
-
220
- @span_stack = @span_stack.filter_map { |old_idx| old_to_new[old_idx] }
221
- @segments = new_segments
222
- else
223
- @segments = []
224
- end
225
-
226
- @summary = Hash.new(0)
227
- @tracked_ranges = []
228
- @overflow = false
229
- @estimated_bytes = 0
230
- @segments_filtered = 0
231
- end
232
160
  end
233
161
  end
data/lib/catpm/trace.rb CHANGED
@@ -89,23 +89,10 @@ module Catpm
89
89
  max_segments: config.max_segments_per_request,
90
90
  request_start: start_time,
91
91
  stack_sample: use_sampler,
92
- call_tree: config.instrument_call_tree,
93
- memory_limit: config.derived_request_memory_limit
92
+ call_tree: config.instrument_call_tree
94
93
  )
95
94
  Thread.current[:catpm_request_segments] = req_segments
96
95
  owns_segments = true
97
-
98
- if config.derived_request_memory_limit
99
- req_segments.on_checkpoint do |checkpoint_data|
100
- Collector.process_checkpoint(
101
- kind: kind, target: target, operation: operation,
102
- context: context, metadata: metadata,
103
- checkpoint_data: checkpoint_data,
104
- request_start: start_time,
105
- request_id: req_segments.request_id
106
- )
107
- end
108
- end
109
96
  end
110
97
  end
111
98
 
@@ -138,8 +125,7 @@ module Catpm
138
125
  Thread.current[:catpm_request_segments] = nil
139
126
  # Mark that this request was already instrumented and processed by
140
127
  # track_request. Without this, process_action_controller would see
141
- # nil req_segments and falsely trigger force_instrument for slow
142
- # requests — even though they were fully instrumented here.
128
+ # nil req_segments and think the request was not instrumented.
143
129
  Thread.current[:catpm_tracked_instrumented] = true
144
130
  end
145
131
  end
data/lib/catpm/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Catpm
4
- VERSION = '0.9.5'
4
+ VERSION = '0.9.7'
5
5
  end
@@ -50,7 +50,7 @@ Catpm.configure do |config|
50
50
  # config.events_max_samples_per_name = 20 # nil = unlimited
51
51
 
52
52
  # === Memory ===
53
- # config.max_memory_per_thread = 2.megabytes # memory budget per thread (buffer + request segments)
53
+ # config.max_memory = 20 # MB — global memory budget (2% of 1GB server)
54
54
 
55
55
  # === Buffering & Flushing ===
56
56
  # config.flush_interval = 30 # seconds
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: catpm
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.9.5
4
+ version: 0.9.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - ''