findbug 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +8 -0
- data/LICENSE.txt +21 -0
- data/README.md +375 -0
- data/Rakefile +12 -0
- data/app/controllers/findbug/application_controller.rb +105 -0
- data/app/controllers/findbug/dashboard_controller.rb +93 -0
- data/app/controllers/findbug/errors_controller.rb +129 -0
- data/app/controllers/findbug/performance_controller.rb +80 -0
- data/app/jobs/findbug/alert_job.rb +40 -0
- data/app/jobs/findbug/cleanup_job.rb +132 -0
- data/app/jobs/findbug/persist_job.rb +158 -0
- data/app/models/findbug/error_event.rb +197 -0
- data/app/models/findbug/performance_event.rb +237 -0
- data/app/views/findbug/dashboard/index.html.erb +199 -0
- data/app/views/findbug/errors/index.html.erb +137 -0
- data/app/views/findbug/errors/show.html.erb +185 -0
- data/app/views/findbug/performance/index.html.erb +168 -0
- data/app/views/findbug/performance/show.html.erb +203 -0
- data/app/views/layouts/findbug/application.html.erb +601 -0
- data/lib/findbug/alerts/channels/base.rb +75 -0
- data/lib/findbug/alerts/channels/discord.rb +155 -0
- data/lib/findbug/alerts/channels/email.rb +179 -0
- data/lib/findbug/alerts/channels/slack.rb +149 -0
- data/lib/findbug/alerts/channels/webhook.rb +143 -0
- data/lib/findbug/alerts/dispatcher.rb +126 -0
- data/lib/findbug/alerts/throttler.rb +110 -0
- data/lib/findbug/background_persister.rb +142 -0
- data/lib/findbug/capture/context.rb +301 -0
- data/lib/findbug/capture/exception_handler.rb +141 -0
- data/lib/findbug/capture/exception_subscriber.rb +228 -0
- data/lib/findbug/capture/message_handler.rb +104 -0
- data/lib/findbug/capture/middleware.rb +247 -0
- data/lib/findbug/configuration.rb +381 -0
- data/lib/findbug/engine.rb +109 -0
- data/lib/findbug/performance/instrumentation.rb +336 -0
- data/lib/findbug/performance/transaction.rb +193 -0
- data/lib/findbug/processing/data_scrubber.rb +163 -0
- data/lib/findbug/rails/controller_methods.rb +152 -0
- data/lib/findbug/railtie.rb +222 -0
- data/lib/findbug/storage/circuit_breaker.rb +223 -0
- data/lib/findbug/storage/connection_pool.rb +134 -0
- data/lib/findbug/storage/redis_buffer.rb +285 -0
- data/lib/findbug/tasks/findbug.rake +167 -0
- data/lib/findbug/version.rb +5 -0
- data/lib/findbug.rb +216 -0
- data/lib/generators/findbug/install_generator.rb +67 -0
- data/lib/generators/findbug/templates/POST_INSTALL +41 -0
- data/lib/generators/findbug/templates/create_findbug_error_events.rb +44 -0
- data/lib/generators/findbug/templates/create_findbug_performance_events.rb +47 -0
- data/lib/generators/findbug/templates/initializer.rb +157 -0
- data/sig/findbug.rbs +4 -0
- metadata +251 -0
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "monitor"
|
|
4
|
+
|
|
5
|
+
module Findbug
|
|
6
|
+
module Storage
|
|
7
|
+
# CircuitBreaker prevents cascading failures when Redis is down.
|
|
8
|
+
#
|
|
9
|
+
# THE PROBLEM IT SOLVES
|
|
10
|
+
# =====================
|
|
11
|
+
#
|
|
12
|
+
# Imagine Redis goes down during peak traffic:
|
|
13
|
+
#
|
|
14
|
+
# Without circuit breaker:
|
|
15
|
+
# - 1000 requests/second
|
|
16
|
+
# - Each tries to write to Redis
|
|
17
|
+
# - Each waits 1 second for timeout
|
|
18
|
+
# - Your app becomes unusable
|
|
19
|
+
#
|
|
20
|
+
# With circuit breaker:
|
|
21
|
+
# - After 5 failures, circuit "opens"
|
|
22
|
+
# - Next 1000 requests skip Redis immediately
|
|
23
|
+
# - Your app stays fast
|
|
24
|
+
# - After 30 seconds, we try again
|
|
25
|
+
#
|
|
26
|
+
# THE THREE STATES
|
|
27
|
+
# ================
|
|
28
|
+
#
|
|
29
|
+
# ┌─────────────────────────────────────────────────────────┐
|
|
30
|
+
# │ │
|
|
31
|
+
# │ ┌──────────┐ failures >= 5 ┌──────────┐ │
|
|
32
|
+
# │ │ CLOSED │ ─────────────────── │ OPEN │ │
|
|
33
|
+
# │ │ (normal) │ │ (tripped)│ │
|
|
34
|
+
# │ └──────────┘ └──────────┘ │
|
|
35
|
+
# │ ▲ │ │
|
|
36
|
+
# │ │ success │ 30 seconds │
|
|
37
|
+
# │ │ ▼ │
|
|
38
|
+
# │ │ ┌───────────┐ │
|
|
39
|
+
# │ └───────────────────────── │ HALF-OPEN │ │
|
|
40
|
+
# │ │ (testing) │ │
|
|
41
|
+
# │ └───────────┘ │
|
|
42
|
+
# │ │ │
|
|
43
|
+
# │ │ failure │
|
|
44
|
+
# │ ▼ │
|
|
45
|
+
# │ ┌──────────┐ │
|
|
46
|
+
# │ │ OPEN │ │
|
|
47
|
+
# │ └──────────┘ │
|
|
48
|
+
# └─────────────────────────────────────────────────────────┘
|
|
49
|
+
#
|
|
50
|
+
# THREAD SAFETY
|
|
51
|
+
# =============
|
|
52
|
+
#
|
|
53
|
+
# This class uses Monitor (a reentrant mutex) to ensure thread safety.
|
|
54
|
+
# Multiple threads can check/update the circuit state without races.
|
|
55
|
+
#
|
|
56
|
+
class CircuitBreaker
  # Number of recorded failures that trips the circuit.
  FAILURE_THRESHOLD = 5

  # Seconds the circuit stays open before requests are let through again.
  RECOVERY_TIME = 30

  # Single reentrant lock guarding @state, @failures and @opened_at.
  #
  # It is created eagerly, while the class body is evaluated. A lazy
  # `@monitor ||= Monitor.new` is itself a race: two threads hitting the
  # breaker for the first time could each build their own monitor and then
  # "synchronize" on different locks.
  LOCK = Monitor.new
  private_constant :LOCK

  class << self
    # Check if requests are allowed through.
    #
    # Side effect: when the circuit is open and RECOVERY_TIME has elapsed,
    # this call moves the circuit to :half_open.
    #
    # @return [Boolean] true if the caller should attempt the operation
    #
    # @example
    #   if CircuitBreaker.allow?
    #     # try Redis operation
    #   else
    #     # skip and log
    #   end
    #
    def allow?
      synchronize do
        case state
        when :closed
          # Normal operation - allow all requests
          true
        when :open
          if recovery_period_elapsed?
            # Time to test if Redis is back
            transition_to(:half_open)
            true
          else
            # Still in cooldown - reject immediately
            false
          end
        when :half_open
          # Testing phase. Note: every caller is let through until a
          # success or a failure is recorded, not just a single probe.
          true
        end
      end
    end

    # Record a successful operation.
    #
    # Call this after a successful Redis operation.
    # Resets the failure count and closes the circuit.
    #
    def record_success
      synchronize do
        @failures = 0
        transition_to(:closed)
      end
    end

    # Record a failed operation.
    #
    # Call this when a Redis operation fails. The failure is always counted.
    # The circuit opens when:
    # - it is half-open (the probe failed), or
    # - it is closed and the count has reached FAILURE_THRESHOLD.
    #
    # Failures reported while the circuit is ALREADY open (typically
    # operations that were in flight when it tripped) are counted but do not
    # re-trip it. Re-tripping would push the recovery deadline back and emit
    # the "circuit opened" warning once per late failure.
    #
    def record_failure
      synchronize do
        @failures = (@failures || 0) + 1

        if state == :half_open
          # Failed during testing - back to open
          transition_to(:open)
        elsif state == :closed && @failures >= FAILURE_THRESHOLD
          # Too many failures - trip the circuit
          transition_to(:open)
          log_circuit_opened
        end
      end
    end

    # Get current state (for monitoring/debugging).
    #
    # @return [Symbol] :closed, :open, or :half_open (:closed before any
    #   transition has happened)
    #
    def state
      @state || :closed
    end

    # Get current failure count (for monitoring).
    #
    # @return [Integer] failures recorded since the last success or reset
    #
    def failure_count
      @failures || 0
    end

    # Reset the circuit breaker to its initial closed state (for testing).
    def reset!
      synchronize do
        @state = :closed
        @failures = 0
        @opened_at = nil
      end
    end

    # Execute a block with circuit breaker protection.
    #
    # Convenience wrapper combining allow?/record_success/record_failure.
    #
    # @yield the operation to protect
    # @return [Object, nil] the block's return value, or nil if rejected
    # @raise [StandardError] whatever the block raised, after the failure
    #   has been recorded
    #
    # @example
    #   result = CircuitBreaker.execute do
    #     redis.lpush("key", "value")
    #   end
    #
    def execute
      return nil unless allow?

      begin
        result = yield
        record_success
        result
      rescue StandardError
        record_failure
        raise # re-raise the original exception untouched
      end
    end

    private

    # Run the block while holding the breaker's lock. The lock is a Monitor,
    # so synchronized methods may call each other on the same thread.
    def synchronize(&block)
      LOCK.synchronize(&block)
    end

    # Switch to +new_state+, stamping the time whenever the circuit opens.
    # Must be called with the lock held.
    def transition_to(new_state)
      old_state = @state
      @state = new_state

      @opened_at = Time.now if new_state == :open

      log_state_change(old_state, new_state) if old_state != new_state
    end

    # @return [Boolean] true once RECOVERY_TIME seconds have passed since the
    #   circuit opened (or if no open time was ever recorded)
    def recovery_period_elapsed?
      return true unless @opened_at

      Time.now - @opened_at >= RECOVERY_TIME
    end

    def log_circuit_opened
      Findbug.logger.warn(
        "[Findbug] Circuit breaker opened after #{FAILURE_THRESHOLD} failures. " \
        "Redis operations will be skipped for #{RECOVERY_TIME} seconds."
      )
    end

    # Log transitions into :closed and :half_open. Transitions into :open are
    # reported by log_circuit_opened instead.
    def log_state_change(old_state, new_state)
      return if old_state.nil? # Initial state

      case new_state
      when :closed
        Findbug.logger.info("[Findbug] Circuit breaker closed. Redis operations resumed.")
      when :half_open
        Findbug.logger.info("[Findbug] Circuit breaker half-open. Testing Redis connection...")
      end
    end
  end
end
|
|
222
|
+
end
|
|
223
|
+
end
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "redis"
|
|
4
|
+
require "connection_pool"
|
|
5
|
+
|
|
6
|
+
module Findbug
|
|
7
|
+
module Storage
|
|
8
|
+
# ConnectionPool manages Redis connections for Findbug.
|
|
9
|
+
#
|
|
10
|
+
# WHY A SEPARATE POOL?
|
|
11
|
+
# ====================
|
|
12
|
+
#
|
|
13
|
+
# Your Rails app likely already uses Redis for:
|
|
14
|
+
# - Sidekiq (job queue)
|
|
15
|
+
# - Caching (Rails.cache)
|
|
16
|
+
# - Action Cable (websockets)
|
|
17
|
+
#
|
|
18
|
+
# If Findbug shared these connections, we could:
|
|
19
|
+
# 1. Starve your app of connections during high error rates
|
|
20
|
+
# 2. Cause Sidekiq jobs to timeout waiting for connections
|
|
21
|
+
# 3. Create unpredictable latency spikes
|
|
22
|
+
#
|
|
23
|
+
# By maintaining our OWN pool, Findbug is isolated.
|
|
24
|
+
# If our pool is exhausted, only Findbug suffers - your app keeps running.
|
|
25
|
+
#
|
|
26
|
+
# HOW CONNECTION POOLING WORKS
|
|
27
|
+
# ============================
|
|
28
|
+
#
|
|
29
|
+
# Without pooling:
|
|
30
|
+
# Thread 1 → create connection → use → close
|
|
31
|
+
# Thread 2 → create connection → use → close (expensive!)
|
|
32
|
+
# Thread 3 → create connection → use → close
|
|
33
|
+
#
|
|
34
|
+
# With pooling:
|
|
35
|
+
# Thread 1 → borrow connection → use → return to pool
|
|
36
|
+
# Thread 2 → borrow connection → use → return to pool
|
|
37
|
+
# Thread 3 → borrow connection → use → return to pool
|
|
38
|
+
# ↓
|
|
39
|
+
# [Pool of 5 connections]
|
|
40
|
+
#
|
|
41
|
+
# The `connection_pool` gem handles:
|
|
42
|
+
# - Creating connections lazily (only when needed)
|
|
43
|
+
# - Returning connections automatically (via block)
|
|
44
|
+
# - Waiting for available connections (with timeout)
|
|
45
|
+
# - Thread-safety (multiple threads can't corrupt state)
|
|
46
|
+
#
|
|
47
|
+
class ConnectionPool
  # Guards creation and teardown of the shared pool.
  #
  # Callers reach `pool` from many threads at once (RedisBuffer spawns a
  # thread per event), and a bare `@pool ||= create_pool` lets racing first
  # callers each build a pool, leaking all but the last one assigned.
  POOL_LOCK = Mutex.new
  private_constant :POOL_LOCK

  class << self
    # Borrow a connection from the pool for the duration of the block.
    #
    # The borrowing and returning is delegated to the underlying pool's
    # `with`.
    #
    # @yield [Redis] a Redis connection
    # @return [Object] the return value of the block
    #
    # @example
    #   ConnectionPool.with do |redis|
    #     redis.lpush("findbug:errors", data.to_json)
    #   end
    #
    def with(&block)
      pool.with(&block)
    end

    # Get the raw pool (for advanced usage). Built on first use, exactly
    # once, even when several threads ask for it concurrently.
    #
    # @return [::ConnectionPool] the underlying connection pool
    #
    def pool
      # Fast path avoids the lock once the pool exists.
      @pool || POOL_LOCK.synchronize { @pool ||= create_pool }
    end

    # Shutdown the pool (for cleanup/testing).
    #
    # Detaches the current pool, then closes its connections. The next call
    # to `pool`/`with` builds a fresh pool.
    # Call this when shutting down your app or between tests.
    #
    # @return [nil]
    #
    def shutdown!
      stale = POOL_LOCK.synchronize do
        current = @pool
        @pool = nil
        current
      end

      # Close outside the lock so a slow close never blocks pool creation.
      stale&.shutdown(&:close)
      nil
    end

    # Check if a connection can be established.
    #
    # Sends PING over a pooled connection. Any StandardError (pool timeout,
    # connection refused, ...) is treated as "not healthy".
    #
    # @return [Boolean] true if Redis answered PONG
    #
    def healthy?
      with { |redis| redis.ping == "PONG" }
    rescue StandardError
      false
    end

    private

    # Build the underlying pool from Findbug.config (redis_pool_size,
    # redis_pool_timeout, redis_url). The connection block is run by the
    # pool itself whenever it needs a new connection.
    def create_pool
      config = Findbug.config

      ::ConnectionPool.new(
        size: config.redis_pool_size,
        timeout: config.redis_pool_timeout
      ) do
        create_redis_connection(config.redis_url)
      end
    end

    # Create one Redis client for +url+.
    #
    # WHY NOT JUST `Redis.new(url: url)`?
    # -----------------------------------
    # We add some defensive options:
    # - connect_timeout: Don't hang if Redis is unreachable
    # - read_timeout: Don't hang if Redis is slow
    # - write_timeout: Don't hang on slow writes
    # - reconnect_attempts: Retry on temporary failures
    #
    def create_redis_connection(url)
      Redis.new(
        url: url,
        connect_timeout: 1.0, # 1 second to establish connection
        read_timeout: 1.0, # 1 second to read response
        write_timeout: 1.0, # 1 second to write command
        reconnect_attempts: 1 # Retry once on connection failure
      )
    end
  end
end
|
|
133
|
+
end
|
|
134
|
+
end
|
|
@@ -0,0 +1,285 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "json"
|
|
4
|
+
require_relative "circuit_breaker"
|
|
5
|
+
require_relative "connection_pool"
|
|
6
|
+
|
|
7
|
+
module Findbug
|
|
8
|
+
module Storage
|
|
9
|
+
# RedisBuffer provides fast, non-blocking writes to Redis.
|
|
10
|
+
#
|
|
11
|
+
# THIS IS THE KEY TO ZERO PERFORMANCE IMPACT
|
|
12
|
+
# ==========================================
|
|
13
|
+
#
|
|
14
|
+
# Traditional error tracking (synchronous):
|
|
15
|
+
#
|
|
16
|
+
# Request starts
|
|
17
|
+
# ↓
|
|
18
|
+
# Exception occurs
|
|
19
|
+
# ↓
|
|
20
|
+
# BLOCKING: Write to database (50-100ms) ← Your user waits!
|
|
21
|
+
# ↓
|
|
22
|
+
# Request ends
|
|
23
|
+
#
|
|
24
|
+
# Findbug (asynchronous):
|
|
25
|
+
#
|
|
26
|
+
# Request starts
|
|
27
|
+
# ↓
|
|
28
|
+
# Exception occurs
|
|
29
|
+
# ↓
|
|
30
|
+
# NON-BLOCKING: Spawn thread to write to Redis (0ms)
|
|
31
|
+
# ↓ ↓
|
|
32
|
+
# Request ends Background: Redis write (1-2ms)
|
|
33
|
+
# ↓
|
|
34
|
+
# User gets response immediately
|
|
35
|
+
#
|
|
36
|
+
# WHY REDIS INSTEAD OF DATABASE?
|
|
37
|
+
# ==============================
|
|
38
|
+
#
|
|
39
|
+
# Redis write: ~1-2ms
|
|
40
|
+
# Database write: ~50-100ms (with indexes, constraints, etc.)
|
|
41
|
+
#
|
|
42
|
+
# Even if we made DB writes async, Redis is still better for buffering because:
|
|
43
|
+
# 1. It's faster (in-memory)
|
|
44
|
+
# 2. It handles high write loads gracefully
|
|
45
|
+
# 3. It has built-in expiration (TTL)
|
|
46
|
+
# 4. It supports atomic list operations
|
|
47
|
+
#
|
|
48
|
+
# The database is for long-term storage. Redis is for the fast buffer.
|
|
49
|
+
#
|
|
50
|
+
# WHY Thread.new INSTEAD OF SIDEKIQ?
|
|
51
|
+
# ==================================
|
|
52
|
+
#
|
|
53
|
+
# Sidekiq itself writes to Redis. If we used Sidekiq to buffer our errors:
|
|
54
|
+
# 1. We'd add Sidekiq job overhead (~5ms)
|
|
55
|
+
# 2. We'd share Redis connections with Sidekiq
|
|
56
|
+
# 3. We'd depend on Sidekiq being healthy
|
|
57
|
+
#
|
|
58
|
+
# A simple Thread.new is:
|
|
59
|
+
# 1. Instant (no queue overhead)
|
|
60
|
+
# 2. Independent of your job system
|
|
61
|
+
# 3. Simpler (no job serialization)
|
|
62
|
+
#
|
|
63
|
+
# We use Sidekiq/ActiveJob later for PERSISTING to DB, not for buffering.
|
|
64
|
+
#
|
|
65
|
+
class RedisBuffer
  # Redis list holding buffered error events
  ERRORS_KEY = "findbug:errors"

  # Redis list holding buffered performance events
  PERFORMANCE_KEY = "findbug:performance"

  # Key for tracking stats
  STATS_KEY = "findbug:stats"

  # UTC ISO-8601 timestamp with millisecond precision, e.g.
  # "2024-01-31T12:34:56.789Z". Built with core Time#strftime so this file
  # does not depend on the `time` stdlib extension having been required
  # elsewhere (Time#iso8601 lives there on Rubies before 3.4).
  TIMESTAMP_FORMAT = "%Y-%m-%dT%H:%M:%S.%LZ"
  private_constant :TIMESTAMP_FORMAT

  class << self
    # Push an error event to the buffer (async, non-blocking).
    #
    # @param event_data [Hash] the error event data (not modified)
    # @return [nil]
    #
    # @example
    #   RedisBuffer.push_error({
    #     exception_class: "RuntimeError",
    #     message: "Something went wrong",
    #     backtrace: [...],
    #     context: {...}
    #   })
    #
    # IMPORTANT: This returns IMMEDIATELY. The actual write happens
    # in a background thread. This is what makes us non-blocking.
    #
    def push_error(event_data)
      push_async(ERRORS_KEY, event_data)
    end

    # Push a performance event to the buffer (async, non-blocking).
    #
    # @param event_data [Hash] the performance event data (not modified)
    # @return [nil]
    #
    def push_performance(event_data)
      push_async(PERFORMANCE_KEY, event_data)
    end

    # Pop a batch of error events from the buffer, oldest first.
    #
    # Uses RPOP in a loop; each pop is atomic and REMOVES the event from
    # Redis.
    #
    # @param batch_size [Integer] maximum number of events to retrieve
    # @return [Array<Hash>] array of error events (symbol keys)
    #
    def pop_errors(batch_size = 100)
      pop_batch(ERRORS_KEY, batch_size)
    end

    # Pop a batch of performance events from the buffer, oldest first.
    #
    # @param batch_size [Integer] maximum number of events to retrieve
    # @return [Array<Hash>] array of performance events (symbol keys)
    #
    def pop_performance(batch_size = 100)
      pop_batch(PERFORMANCE_KEY, batch_size)
    end

    # Get buffer statistics (for monitoring).
    #
    # Never raises StandardError. If Redis cannot be reached the queue
    # lengths are reported as 0 and an :error entry describes the failure.
    # The circuit breaker figures are included either way.
    #
    # @return [Hash] buffer stats including queue lengths
    #
    def stats
      ConnectionPool.with do |redis|
        {
          error_queue_length: redis.llen(ERRORS_KEY),
          performance_queue_length: redis.llen(PERFORMANCE_KEY),
          circuit_breaker_state: CircuitBreaker.state,
          circuit_breaker_failures: CircuitBreaker.failure_count
        }
      end
    rescue StandardError => e
      # Always return circuit breaker state even if Redis is down
      {
        error_queue_length: 0,
        performance_queue_length: 0,
        circuit_breaker_state: CircuitBreaker.state,
        circuit_breaker_failures: CircuitBreaker.failure_count,
        error: "Redis connection failed: #{e.message}"
      }
    end

    # Clear all buffers (for testing). Errors are deliberately ignored.
    def clear!
      ConnectionPool.with do |redis|
        redis.del(ERRORS_KEY, PERFORMANCE_KEY)
      end
    rescue StandardError
      # Ignore errors during cleanup
    end

    private

    # The core async push operation.
    #
    # 1. Return early if Findbug is disabled.
    # 2. Check the circuit breaker BEFORE spawning a thread, so no threads
    #    are created while Redis is considered down.
    # 3. Spawn a thread that performs the write; the caller gets nil back
    #    immediately.
    # 4. Any StandardError raised in the thread is routed to
    #    handle_push_error (logged and reported to the circuit breaker)
    #    instead of escaping the thread.
    #
    # @param key [String] Redis list to push onto
    # @param event_data [Hash] event payload; it is copied before being
    #   stamped, so the caller's hash is never modified from the thread
    # @return [nil]
    #
    def push_async(key, event_data)
      # Early exit if Findbug is disabled
      return unless Findbug.enabled?

      # Early exit if circuit breaker is open
      unless CircuitBreaker.allow?
        increment_dropped_count
        return
      end

      Thread.new do
        perform_push(key, event_data)
      rescue StandardError => e
        # Catch everything so a failed write never surfaces as an
        # unhandled thread exception.
        handle_push_error(e)
      end

      nil # Return immediately
    end

    # The actual Redis push (runs in background thread).
    #
    # Works on a shallow copy of +event_data+: the caller's hash may be
    # frozen or still in use on the request thread, so it must not be
    # written to here.
    def perform_push(key, event_data)
      payload = event_data.dup
      # Add timestamp if not present
      payload[:captured_at] ||= Time.now.utc.strftime(TIMESTAMP_FORMAT)

      ConnectionPool.with do |redis|
        # LPUSH adds to the LEFT of the list (newest first)
        # We use JSON encoding for storage
        redis.lpush(key, payload.to_json)

        # LTRIM keeps only the first N elements, so once the list exceeds
        # max_buffer_size the OLDEST events are dropped. This bounds the
        # memory Redis uses for the buffer.
        max_size = Findbug.config.max_buffer_size
        redis.ltrim(key, 0, max_size - 1)

        # Record success for circuit breaker
        CircuitBreaker.record_success
      end
    end

    # Handle errors during push: count the failure and log it.
    def handle_push_error(error)
      CircuitBreaker.record_failure

      # Log at debug level to avoid log spam during outages
      Findbug.logger.debug(
        "[Findbug] Failed to push event to Redis: #{error.message}"
      )
    end

    # Pop up to +batch_size+ events from +key+, oldest first.
    #
    # WHY NOT LRANGE + LTRIM?
    # -----------------------
    # That's not atomic. Between LRANGE and LTRIM, new events could arrive.
    # We use RPOP in a loop, which is atomic per operation.
    #
    # NOTE: RPOP removes each event from Redis as it is read and there is no
    # backup queue, so events returned here exist only in the returned
    # array. If the caller fails before persisting them, they are gone.
    #
    # Entries that are not valid JSON are logged and skipped.
    #
    def pop_batch(key, batch_size)
      events = []

      ConnectionPool.with do |redis|
        batch_size.times do
          # RPOP gets from the RIGHT (oldest first - FIFO order)
          json = redis.rpop(key)
          break unless json

          begin
            events << JSON.parse(json, symbolize_names: true)
          rescue JSON::ParserError => e
            Findbug.logger.error("[Findbug] Failed to parse event: #{e.message}")
          end
        end
      end

      events
    end

    # Track dropped events (when circuit breaker is open)
    def increment_dropped_count
      # We could track this in Redis too, but that defeats the purpose
      # when Redis is down. Just log it.
      Findbug.logger.debug("[Findbug] Event dropped (circuit breaker open)")
    end
  end
end
|
|
284
|
+
end
|
|
285
|
+
end
|