brainzlab 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +8 -0
- data/lib/brainzlab/beacon/client.rb +209 -0
- data/lib/brainzlab/beacon/provisioner.rb +44 -0
- data/lib/brainzlab/beacon.rb +215 -0
- data/lib/brainzlab/configuration.rb +341 -3
- data/lib/brainzlab/cortex/cache.rb +59 -0
- data/lib/brainzlab/cortex/client.rb +141 -0
- data/lib/brainzlab/cortex/provisioner.rb +49 -0
- data/lib/brainzlab/cortex.rb +227 -0
- data/lib/brainzlab/dendrite/client.rb +232 -0
- data/lib/brainzlab/dendrite/provisioner.rb +44 -0
- data/lib/brainzlab/dendrite.rb +195 -0
- data/lib/brainzlab/devtools/assets/devtools.css +1106 -0
- data/lib/brainzlab/devtools/assets/devtools.js +322 -0
- data/lib/brainzlab/devtools/assets/logo.svg +6 -0
- data/lib/brainzlab/devtools/assets/templates/debug_panel.html.erb +500 -0
- data/lib/brainzlab/devtools/assets/templates/error_page.html.erb +1086 -0
- data/lib/brainzlab/devtools/data/collector.rb +248 -0
- data/lib/brainzlab/devtools/middleware/asset_server.rb +63 -0
- data/lib/brainzlab/devtools/middleware/database_handler.rb +180 -0
- data/lib/brainzlab/devtools/middleware/debug_panel.rb +126 -0
- data/lib/brainzlab/devtools/middleware/error_page.rb +376 -0
- data/lib/brainzlab/devtools/renderers/debug_panel_renderer.rb +155 -0
- data/lib/brainzlab/devtools/renderers/error_page_renderer.rb +94 -0
- data/lib/brainzlab/devtools.rb +75 -0
- data/lib/brainzlab/flux/buffer.rb +96 -0
- data/lib/brainzlab/flux/client.rb +70 -0
- data/lib/brainzlab/flux/provisioner.rb +57 -0
- data/lib/brainzlab/flux.rb +174 -0
- data/lib/brainzlab/instrumentation/active_record.rb +18 -1
- data/lib/brainzlab/instrumentation/aws.rb +179 -0
- data/lib/brainzlab/instrumentation/dalli.rb +108 -0
- data/lib/brainzlab/instrumentation/excon.rb +152 -0
- data/lib/brainzlab/instrumentation/good_job.rb +102 -0
- data/lib/brainzlab/instrumentation/resque.rb +115 -0
- data/lib/brainzlab/instrumentation/solid_queue.rb +198 -0
- data/lib/brainzlab/instrumentation/stripe.rb +164 -0
- data/lib/brainzlab/instrumentation/typhoeus.rb +104 -0
- data/lib/brainzlab/instrumentation.rb +72 -0
- data/lib/brainzlab/nerve/client.rb +217 -0
- data/lib/brainzlab/nerve/provisioner.rb +44 -0
- data/lib/brainzlab/nerve.rb +219 -0
- data/lib/brainzlab/pulse/instrumentation.rb +35 -2
- data/lib/brainzlab/pulse/propagation.rb +1 -1
- data/lib/brainzlab/pulse/tracer.rb +1 -1
- data/lib/brainzlab/pulse.rb +1 -1
- data/lib/brainzlab/rails/log_subscriber.rb +1 -2
- data/lib/brainzlab/rails/railtie.rb +36 -3
- data/lib/brainzlab/recall/provisioner.rb +17 -0
- data/lib/brainzlab/recall.rb +6 -1
- data/lib/brainzlab/reflex.rb +2 -2
- data/lib/brainzlab/sentinel/client.rb +218 -0
- data/lib/brainzlab/sentinel/provisioner.rb +44 -0
- data/lib/brainzlab/sentinel.rb +165 -0
- data/lib/brainzlab/signal/client.rb +62 -0
- data/lib/brainzlab/signal/provisioner.rb +55 -0
- data/lib/brainzlab/signal.rb +136 -0
- data/lib/brainzlab/synapse/client.rb +290 -0
- data/lib/brainzlab/synapse/provisioner.rb +44 -0
- data/lib/brainzlab/synapse.rb +270 -0
- data/lib/brainzlab/utilities/circuit_breaker.rb +265 -0
- data/lib/brainzlab/utilities/health_check.rb +296 -0
- data/lib/brainzlab/utilities/log_formatter.rb +256 -0
- data/lib/brainzlab/utilities/rate_limiter.rb +230 -0
- data/lib/brainzlab/utilities.rb +17 -0
- data/lib/brainzlab/vault/cache.rb +80 -0
- data/lib/brainzlab/vault/client.rb +198 -0
- data/lib/brainzlab/vault/provisioner.rb +49 -0
- data/lib/brainzlab/vault.rb +268 -0
- data/lib/brainzlab/version.rb +1 -1
- data/lib/brainzlab/vision/client.rb +128 -0
- data/lib/brainzlab/vision/provisioner.rb +136 -0
- data/lib/brainzlab/vision.rb +157 -0
- data/lib/brainzlab.rb +101 -0
- metadata +60 -1
|
@@ -0,0 +1,265 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module BrainzLab
|
|
4
|
+
module Utilities
|
|
5
|
+
# Circuit breaker pattern implementation for resilient external calls
|
|
6
|
+
# Integrates with Flux for metrics and Reflex for error tracking
|
|
7
|
+
#
|
|
8
|
+
# States:
|
|
9
|
+
# - :closed - Normal operation, requests pass through
|
|
10
|
+
# - :open - Failing, requests are rejected immediately
|
|
11
|
+
# - :half_open - Testing, limited requests allowed to check recovery
|
|
12
|
+
#
|
|
13
|
+
# @example Basic usage
|
|
14
|
+
# breaker = BrainzLab::Utilities::CircuitBreaker.new(
|
|
15
|
+
# name: "external_api",
|
|
16
|
+
# failure_threshold: 5,
|
|
17
|
+
# recovery_timeout: 30
|
|
18
|
+
# )
|
|
19
|
+
#
|
|
20
|
+
# breaker.call do
|
|
21
|
+
# external_api.request
|
|
22
|
+
# end
|
|
23
|
+
#
|
|
24
|
+
# @example With fallback
|
|
25
|
+
# breaker.call(fallback: -> { cached_value }) do
|
|
26
|
+
# external_api.request
|
|
27
|
+
# end
|
|
28
|
+
#
|
|
29
|
+
class CircuitBreaker
  # `Timeout` lives in the stdlib "timeout" library. Without this require the
  # optional per-call timeout would raise NameError, which is a StandardError
  # and would therefore be counted as a circuit failure.
  require "timeout"

  STATES = %i[closed open half_open].freeze

  attr_reader :name, :state, :failure_count, :success_count, :last_failure_at

  # @param name [String, Symbol] identifier used in metrics and error messages
  # @param failure_threshold [Integer] failures (while closed) that open the circuit
  # @param success_threshold [Integer] successes (while half-open) that close it again
  # @param recovery_timeout [Numeric] seconds after the last failure before an
  #   open circuit is moved to half-open
  # @param timeout [Numeric, nil] optional per-call timeout in seconds
  # @param exclude_exceptions [Array<Class>, Class] exceptions that are re-raised
  #   without being counted as failures
  def initialize(name:, failure_threshold: 5, success_threshold: 2, recovery_timeout: 30, timeout: nil, exclude_exceptions: [])
    @name = name
    @failure_threshold = failure_threshold
    @success_threshold = success_threshold
    @recovery_timeout = recovery_timeout
    @timeout = timeout
    # Array() lets callers pass a single class (or nil) as well as a list.
    @exclude_exceptions = Array(exclude_exceptions)

    @state = :closed
    @failure_count = 0
    @success_count = 0
    @last_failure_at = nil
    @mutex = Mutex.new
  end

  # Execute a block with circuit breaker protection.
  #
  # @param fallback [#call, Object, nil] value (or callable producing a value)
  #   returned instead of raising when the circuit is open or the block fails
  # @return [Object] the block's result, or the fallback value
  # @raise [CircuitOpenError] when the circuit is open and no fallback is given
  def call(fallback: nil, &block)
    check_state_transition!

    if @state == :open
      track_rejected
      return resolve_fallback(fallback) if fallback

      raise CircuitOpenError, "Circuit '#{@name}' is open"
    end

    execute_with_protection(fallback, &block)
  end

  # Force the circuit to a specific state.
  #
  # Note that a circuit forced open while `last_failure_at` is nil is never
  # moved to half-open automatically (see #check_state_transition!).
  #
  # @param new_state [Symbol] one of STATES
  # @raise [ArgumentError] when new_state is not one of STATES
  def force_state!(new_state)
    raise ArgumentError, "Invalid state: #{new_state}" unless STATES.include?(new_state)

    old_state = nil
    @mutex.synchronize do
      old_state = @state
      @state = new_state
      @failure_count = 0 if new_state == :closed
      @success_count = 0 if new_state == :half_open
    end

    # Tracking happens outside the lock; the previous state is now reported too.
    track_state_change(new_state, old_state)
  end

  # Reset the circuit breaker: close it and forget the last failure time.
  def reset!
    force_state!(:closed)
    @mutex.synchronize { @last_failure_at = nil }
  end

  # Get circuit status
  #
  # @return [Hash] snapshot of the current state, counters and configuration
  def status
    {
      name: @name,
      state: @state,
      failure_count: @failure_count,
      success_count: @success_count,
      failure_threshold: @failure_threshold,
      success_threshold: @success_threshold,
      last_failure_at: @last_failure_at,
      recovery_timeout: @recovery_timeout
    }
  end

  # Check if circuit is allowing requests
  #
  # @return [Boolean] false only while the circuit is open
  def available?
    check_state_transition!
    @state != :open
  end

  # Class-level registry of circuit breakers
  class << self
    # @return [Hash{String => CircuitBreaker}] breakers keyed by stringified name
    def registry
      @registry ||= {}
    end

    # @return [CircuitBreaker, nil] the registered breaker, if any
    def get(name)
      registry[name.to_s]
    end

    # Create a breaker with the given constructor options and store it
    # (replacing any breaker already registered under that name).
    def register(name, **options)
      registry[name.to_s] = new(name: name, **options)
    end

    # Run a block through the named breaker, creating it on first use.
    #
    # `fallback:` is a per-call option and is not a constructor keyword, so it
    # is stripped before registering; every other option is only used when
    # the breaker does not exist yet.
    def call(name, **options, &block)
      call_options = options.slice(:fallback)
      constructor_options = options.reject { |key, _value| key == :fallback }

      breaker = get(name) || register(name, **constructor_options)
      breaker.call(**call_options, &block)
    end

    def reset_all!
      registry.each_value(&:reset!)
    end

    def status_all
      registry.transform_values(&:status)
    end
  end

  private

  # Runs the block (optionally under Timeout), recording the outcome.
  # Excluded exceptions are re-raised untouched; any other StandardError is
  # counted as a failure and then either replaced by the fallback or re-raised.
  def execute_with_protection(fallback)
    result = @timeout ? Timeout.timeout(@timeout) { yield } : yield

    record_success
    result
  rescue *excluded_exceptions
    # Don't count excluded exceptions as failures
    raise
  rescue StandardError => e
    record_failure(e)
    raise unless fallback

    resolve_fallback(fallback)
  end

  # A fallback may be a callable (invoked lazily) or a plain value.
  def resolve_fallback(fallback)
    fallback.respond_to?(:call) ? fallback.call : fallback
  end

  def record_success
    @mutex.synchronize do
      if @state == :half_open
        @success_count += 1
        transition_to(:closed) if @success_count >= @success_threshold
      else
        @failure_count = 0
      end
    end

    track_success
  end

  def record_failure(error)
    @mutex.synchronize do
      @failure_count += 1
      @last_failure_at = Time.now

      # Any failure while half-open reopens the circuit immediately.
      transition_to(:open) if @state == :half_open || @failure_count >= @failure_threshold
    end

    track_failure(error)
  end

  # Moves an open circuit to half-open once recovery_timeout has elapsed
  # since the last recorded failure.
  def check_state_transition!
    # Local copy: reset! may clear @last_failure_at concurrently.
    last_failure = @last_failure_at
    return unless @state == :open && last_failure
    return unless Time.now - last_failure >= @recovery_timeout

    @mutex.synchronize do
      # Re-check under the lock; another thread may already have transitioned.
      transition_to(:half_open) if @state == :open
    end
  end

  # Callers are expected to hold @mutex.
  def transition_to(new_state)
    old_state = @state
    @state = new_state

    case new_state
    when :closed
      @failure_count = 0
      @success_count = 0
    when :half_open
      @success_count = 0
    when :open
      # Keep failure count for debugging
    end

    track_state_change(new_state, old_state)
  end

  def excluded_exceptions
    @exclude_exceptions
  end

  # Metrics tracking

  def track_success
    return unless BrainzLab.configuration.flux_effectively_enabled?

    BrainzLab::Flux.increment("circuit_breaker.success", tags: { name: @name, state: @state.to_s })
  end

  def track_failure(error)
    return unless BrainzLab.configuration.flux_effectively_enabled?

    BrainzLab::Flux.increment("circuit_breaker.failure", tags: {
      name: @name,
      state: @state.to_s,
      error_class: error.class.name
    })
  end

  def track_rejected
    return unless BrainzLab.configuration.flux_effectively_enabled?

    BrainzLab::Flux.increment("circuit_breaker.rejected", tags: { name: @name })
  end

  # NOTE(review): the Reflex breadcrumb below is only added when Flux is
  # enabled because of the early return — confirm that coupling is intended.
  def track_state_change(new_state, old_state = nil)
    return unless BrainzLab.configuration.flux_effectively_enabled?

    BrainzLab::Flux.track("circuit_breaker.state_change", {
      name: @name,
      new_state: new_state.to_s,
      old_state: old_state&.to_s,
      failure_count: @failure_count
    })

    # Also add breadcrumb for debugging
    BrainzLab::Reflex.add_breadcrumb(
      "Circuit '#{@name}' transitioned to #{new_state}",
      category: "circuit_breaker",
      level: new_state == :open ? :warning : :info,
      data: { name: @name, old_state: old_state, new_state: new_state }
    )
  end

  # Error raised when circuit is open
  class CircuitOpenError < StandardError; end
end
|
|
264
|
+
end
|
|
265
|
+
end
|
|
@@ -0,0 +1,296 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module BrainzLab
|
|
4
|
+
module Utilities
|
|
5
|
+
# Health check utility for application health endpoints
|
|
6
|
+
# Provides checks for database, cache, queues, and external services
|
|
7
|
+
#
|
|
8
|
+
# @example Basic usage in Rails routes
|
|
9
|
+
# # config/routes.rb
|
|
10
|
+
# mount BrainzLab::Utilities::HealthCheck::Engine => "/health"
|
|
11
|
+
#
|
|
12
|
+
# @example Manual usage
|
|
13
|
+
# result = BrainzLab::Utilities::HealthCheck.run
|
|
14
|
+
# result[:status] # => "healthy" or "unhealthy"
|
|
15
|
+
# result[:checks] # => { database: { status: "ok", latency_ms: 5 }, ... }
|
|
16
|
+
#
|
|
17
|
+
class HealthCheck
  # Time#iso8601 and SecureRandom come from stdlib libraries that are not
  # loaded by default outside Rails.
  require "time"
  require "securerandom"

  CHECKS = %i[database redis cache queue memory disk].freeze

  # Statuses that do not make the overall result unhealthy. A skipped check
  # means the dependency is simply not present in this application.
  HEALTHY_STATUSES = %w[ok skip].freeze

  # Usage percentages above which memory/disk checks report warning/error.
  WARNING_THRESHOLD_PERCENT = 90
  ERROR_THRESHOLD_PERCENT = 95

  # Memory limit assumed when MEMORY_LIMIT_MB is unset or not a positive number.
  DEFAULT_MEMORY_LIMIT_MB = 512

  class << self
    # Run health checks.
    #
    # @param checks [Array<Symbol, String>, nil] names of the checks to run;
    #   defaults to the built-in CHECKS plus every check added via .register
    # @return [Hash] `{ status: "healthy"|"unhealthy", timestamp: String, checks: Hash }`
    def run(checks: nil)
      names = checks || (CHECKS + custom_checks.keys).uniq
      results = {}
      names.each { |check| results[check] = perform_check(check) }

      healthy = results.each_value.all? { |result| HEALTHY_STATUSES.include?(result[:status]) }

      {
        status: healthy ? "healthy" : "unhealthy",
        timestamp: Time.now.utc.iso8601,
        checks: results
      }
    end

    # Quick check - just returns status
    #
    # @return [Boolean]
    def healthy?
      run[:status] == "healthy"
    end

    # Database connectivity check (skipped when ActiveRecord is not loaded)
    def check_database
      return { status: "skip", message: "ActiveRecord not loaded" } unless defined?(ActiveRecord::Base)

      latency = measure_ms { ActiveRecord::Base.connection.execute("SELECT 1") }

      { status: "ok", latency_ms: latency }
    rescue StandardError => e
      { status: "error", message: e.message }
    end

    # Redis connectivity check (skipped when no Redis connection can be found)
    def check_redis
      return { status: "skip", message: "Redis not configured" } unless defined?(Redis)

      redis = find_redis_connection
      return { status: "skip", message: "No Redis connection found" } unless redis

      latency = measure_ms { redis.ping }

      { status: "ok", latency_ms: latency }
    rescue StandardError => e
      { status: "error", message: e.message }
    end

    # Rails cache check: writes, reads back and deletes a throwaway key
    def check_cache
      return { status: "skip", message: "Rails not loaded" } unless defined?(Rails)

      value = nil
      latency = measure_ms do
        key = "brainzlab_health_check_#{SecureRandom.hex(4)}"
        Rails.cache.write(key, "ok", expires_in: 10.seconds)
        value = Rails.cache.read(key)
        Rails.cache.delete(key)
      end

      if value == "ok"
        { status: "ok", latency_ms: latency }
      else
        { status: "error", message: "Cache read/write failed" }
      end
    rescue StandardError => e
      { status: "error", message: e.message }
    end

    # Queue system check: uses the first of SolidQueue, Sidekiq, GoodJob that is loaded
    def check_queue
      if defined?(SolidQueue)
        check_solid_queue
      elsif defined?(Sidekiq)
        check_sidekiq
      elsif defined?(GoodJob)
        check_good_job
      else
        { status: "skip", message: "No queue system detected" }
      end
    end

    # Memory usage check
    def check_memory
      mem_info = memory_usage

      {
        status: usage_status(mem_info[:percentage]),
        used_mb: mem_info[:used_mb],
        percentage: mem_info[:percentage]
      }
    rescue StandardError => e
      { status: "error", message: e.message }
    end

    # Disk space check
    def check_disk
      disk_info = disk_usage

      {
        status: usage_status(disk_info[:percentage]),
        used_gb: disk_info[:used_gb],
        available_gb: disk_info[:available_gb],
        percentage: disk_info[:percentage]
      }
    rescue StandardError => e
      { status: "error", message: e.message }
    end

    # Register a custom health check.
    #
    # The block is called by .run. It may return a Hash (a missing :status
    # defaults to "ok") or any other value, where truthy means "ok" and
    # false/nil means "error". A custom check registered under the name of
    # a built-in check takes precedence over it.
    def register(name, &block)
      custom_checks[name.to_sym] = block
    end

    # @return [Hash{Symbol => Proc}] registered custom checks
    def custom_checks
      @custom_checks ||= {}
    end

    private

    # Runs one check by name and always returns a result Hash; any error
    # raised by the check becomes a `status: "error"` result.
    def perform_check(check)
      custom = custom_checks[check.to_sym]
      return normalize_custom_result(custom.call) if custom

      method_name = "check_#{check}"
      unless respond_to?(method_name, true)
        return { status: "error", message: "Unknown health check: #{check}" }
      end

      send(method_name)
    rescue StandardError => e
      { status: "error", message: e.message }
    end

    # Coerces whatever a custom check block returned into a result Hash.
    def normalize_custom_result(result)
      return { status: "ok" }.merge(result) if result.is_a?(Hash)

      { status: result ? "ok" : "error" }
    end

    # Maps a usage percentage to a status. The stricter threshold must be
    # tested first, otherwise "error" can never be reached.
    def usage_status(percentage)
      if percentage > ERROR_THRESHOLD_PERCENT
        "error"
      elsif percentage > WARNING_THRESHOLD_PERCENT
        "warning"
      else
        "ok"
      end
    end

    # Runs the block and returns its wall time in milliseconds (monotonic clock).
    def measure_ms
      started = Process.clock_gettime(Process::CLOCK_MONOTONIC)
      yield
      ((Process.clock_gettime(Process::CLOCK_MONOTONIC) - started) * 1000).round(2)
    end

    # NOTE(review): the Sidekiq branch returns the connection from inside the
    # `Sidekiq.redis` block, so it is used after that block has exited —
    # confirm this is safe with the Sidekiq connection pool in use.
    def find_redis_connection
      # Try common Redis connection sources
      if defined?(Redis.current) && Redis.current
        Redis.current
      elsif defined?(Sidekiq) && Sidekiq.respond_to?(:redis)
        Sidekiq.redis { |conn| return conn }
      elsif defined?(Rails) && Rails.application.config.respond_to?(:redis)
        Rails.application.config.redis
      end
    rescue StandardError
      nil
    end

    def check_solid_queue
      return { status: "skip", message: "SolidQueue not loaded" } unless defined?(SolidQueue)

      # Check if processes are running
      if defined?(SolidQueue::Process)
        process_count = SolidQueue::Process.where("last_heartbeat_at > ?", 5.minutes.ago).count
        {
          status: process_count > 0 ? "ok" : "warning",
          processes: process_count
        }
      else
        { status: "ok", message: "SolidQueue configured" }
      end
    rescue StandardError => e
      { status: "error", message: e.message }
    end

    def check_sidekiq
      return { status: "skip", message: "Sidekiq not loaded" } unless defined?(Sidekiq)

      stats = Sidekiq::Stats.new
      {
        status: "ok",
        processed: stats.processed,
        failed: stats.failed,
        queues: stats.queues,
        workers: stats.workers_size
      }
    rescue StandardError => e
      { status: "error", message: e.message }
    end

    def check_good_job
      return { status: "skip", message: "GoodJob not loaded" } unless defined?(GoodJob)

      {
        status: "ok",
        pending: GoodJob::Job.where(performed_at: nil).count,
        running: GoodJob::Job.running.count
      }
    rescue StandardError => e
      { status: "error", message: e.message }
    end

    # Resident memory of this process and its share of MEMORY_LIMIT_MB.
    def memory_usage
      # Use /proc/self/status on Linux, ps on macOS
      rss_kb = if File.exist?("/proc/self/status")
                 File.read("/proc/self/status")[/VmRSS:\s+(\d+)\s+kB/, 1].to_i
               else
                 # macOS fallback
                 `ps -o rss= -p #{Process.pid}`.strip.to_i
               end
      used_mb = (rss_kb / 1024.0).round(2)

      # Estimate percentage (based on typical container memory). A zero or
      # non-numeric limit would otherwise produce an infinite percentage.
      max_mb = ENV.fetch("MEMORY_LIMIT_MB", DEFAULT_MEMORY_LIMIT_MB).to_i
      max_mb = DEFAULT_MEMORY_LIMIT_MB unless max_mb.positive?
      percentage = ((used_mb / max_mb) * 100).round(2)

      { used_mb: used_mb, percentage: percentage }
    end

    # Usage of the root filesystem as reported by `df`.
    def disk_usage
      # -P forces the POSIX single-line-per-filesystem format so long device
      # names cannot wrap the numbers onto a second line.
      fields = `df -kP /`.split("\n").last.split
      total_kb, used_kb, available_kb = fields[1..3].map(&:to_i)

      # Convert with float arithmetic throughout; integer division first
      # would silently truncate to whole megabytes.
      kb_per_gb = 1024.0 * 1024
      percentage = total_kb.positive? ? ((used_kb.to_f / total_kb) * 100).round(2) : 0.0

      {
        used_gb: (used_kb / kb_per_gb).round(2),
        available_gb: (available_kb / kb_per_gb).round(2),
        percentage: percentage
      }
    end
  end

  # Rails Engine for mounting health endpoints
  if defined?(::Rails::Engine)
    class Engine < ::Rails::Engine
      isolate_namespace BrainzLab::Utilities::HealthCheck

      routes.draw do
        get "/", to: "health#show"
        get "/live", to: "health#live"
        get "/ready", to: "health#ready"
      end
    end
  end

  # Controller for health endpoints
  if defined?(ActionController::API)
    class HealthController < ActionController::API
      # Full report; responds 503 when any check is unhealthy
      def show
        result = HealthCheck.run
        status = result[:status] == "healthy" ? :ok : :service_unavailable
        render json: result, status: status
      end

      def live
        # Liveness probe - just check if the app is running
        render json: { status: "ok", timestamp: Time.now.utc.iso8601 }
      end

      def ready
        # Readiness probe - check critical dependencies
        result = HealthCheck.run(checks: %i[database redis])
        status = result[:status] == "healthy" ? :ok : :service_unavailable
        render json: result, status: status
      end
    end
  end
end
|
|
295
|
+
end
|
|
296
|
+
end
|