natswork-server 0.0.1
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +0 -0
- data/LICENSE +21 -0
- data/README.md +286 -0
- data/lib/natswork/cli.rb +420 -0
- data/lib/natswork/error_tracker.rb +338 -0
- data/lib/natswork/health_check.rb +252 -0
- data/lib/natswork/instrumentation.rb +141 -0
- data/lib/natswork/job_executor.rb +271 -0
- data/lib/natswork/job_hooks.rb +63 -0
- data/lib/natswork/logger.rb +183 -0
- data/lib/natswork/metrics.rb +241 -0
- data/lib/natswork/middleware.rb +142 -0
- data/lib/natswork/middleware_chain.rb +40 -0
- data/lib/natswork/monitoring.rb +397 -0
- data/lib/natswork/protocol.rb +454 -0
- data/lib/natswork/queue_manager.rb +164 -0
- data/lib/natswork/retry_handler.rb +125 -0
- data/lib/natswork/server/version.rb +7 -0
- data/lib/natswork/server.rb +47 -0
- data/lib/natswork/simple_worker.rb +101 -0
- data/lib/natswork/thread_pool.rb +192 -0
- data/lib/natswork/worker.rb +217 -0
- data/lib/natswork/worker_manager.rb +62 -0
- data/lib/natswork-server.rb +5 -0
- metadata +151 -0
data/lib/natswork/error_tracker.rb
@@ -0,0 +1,338 @@
# frozen_string_literal: true

require 'digest'
require 'concurrent'
require 'socket'

module NatsWork
  class ErrorTracker
    attr_reader :errors, :error_rates

    def initialize(options = {})
      @errors = Concurrent::Hash.new
      @error_counts = Concurrent::Hash.new(0)
      @error_rates = Concurrent::Hash.new { |h, k| h[k] = RateTracker.new }
      @max_errors = options.fetch(:max_errors, 1000)
      @retention_period = options.fetch(:retention_hours, 24) * 3600
      @notifiers = []

      setup_cleanup_timer
    end

    def track_error(error, context = {})
      error_info = extract_error_info(error, context)
      error_key = generate_error_key(error_info)

      # Store error details
      @errors[error_key] = error_info.merge(
        first_seen: @errors[error_key]&.dig(:first_seen) || Time.now,
        last_seen: Time.now,
        count: @error_counts[error_key] + 1
      )

      @error_counts[error_key] += 1
      @error_rates[error_key].increment

      # Notify error handlers
      notify_error(error_info)

      # Cleanup old errors if we're over the limit
      cleanup_old_errors if @errors.size > @max_errors

      error_info
    end

    def get_error(error_key)
      @errors[error_key]
    end

    def list_errors(limit = 50, filter = {})
      filtered_errors = @errors.select do |_key, error|
        matches_filter?(error, filter)
      end

      sorted_errors = filtered_errors.sort_by { |_key, error| -error[:last_seen].to_f }
      sorted_errors.first(limit).to_h
    end

    def error_stats(error_key = nil)
      if error_key
        error = @errors[error_key]
        return nil unless error

        {
          key: error_key,
          count: error[:count],
          first_seen: error[:first_seen],
          last_seen: error[:last_seen],
          rate_1h: @error_rates[error_key].rate(3600),
          rate_24h: @error_rates[error_key].rate(86_400)
        }
      else
        {
          total_errors: @errors.size,
          total_occurrences: @error_counts.values.sum,
          most_frequent: most_frequent_errors(5),
          recent_errors: recent_errors(10)
        }
      end
    end

    def add_notifier(&block)
      @notifiers << block if block_given?
    end

    def clear_errors!
      @errors.clear
      @error_counts.clear
      @error_rates.clear
    end

    def categorize_errors
      categories = {
        timeout: [],
        connection: [],
        permission: [],
        validation: [],
        system: [],
        unknown: []
      }

      @errors.each do |key, error|
        category = determine_category(error)
        categories[category] << { key: key, error: error }
      end

      categories
    end

    private

    def extract_error_info(error, context)
      {
        type: error.class.name,
        message: error.message,
        backtrace: error.backtrace&.first(20),
        fingerprint: generate_fingerprint(error),
        context: sanitize_context(context),
        timestamp: Time.now,
        hostname: Socket.gethostname,
        pid: Process.pid,
        thread_id: Thread.current.object_id
      }
    end

    def generate_error_key(error_info)
      "#{error_info[:type]}:#{error_info[:fingerprint]}"
    end

    def generate_fingerprint(error)
      # Create a fingerprint based on error type, message, and location
      location = error.backtrace&.first&.split(':')&.first(2)&.join(':') || 'unknown'
      fingerprint_data = "#{error.class.name}:#{error.message}:#{location}"
      Digest::SHA256.hexdigest(fingerprint_data)[0, 16]
    end

    def sanitize_context(context)
      # Remove sensitive data from context
      sanitized = context.dup

      sensitive_keys = %w[password token secret key credential auth]
      sensitive_keys.each do |sensitive_key|
        sanitized.keys.each do |key|
          sanitized[key] = '[FILTERED]' if key.to_s.downcase.include?(sensitive_key)
        end
      end

      sanitized
    end

    def notify_error(error_info)
      @notifiers.each do |notifier|
        notifier.call(error_info)
      rescue StandardError => e
        # Don't let notifier errors break error tracking
        warn "Error notifier failed: #{e.message}"
      end
    end

    def matches_filter?(error, filter)
      return true if filter.empty?

      filter.all? do |key, value|
        case key.to_sym
        when :type
          error[:type] == value
        when :since
          error[:last_seen] >= value
        when :context
          value.all? { |k, v| error[:context][k] == v }
        else
          true
        end
      end
    end

    def most_frequent_errors(limit)
      @error_counts.sort_by { |_key, count| -count }
                   .first(limit)
                   .map { |key, count| { key: key, count: count, error: @errors[key] } }
    end

    def recent_errors(limit)
      @errors.values
             .sort_by { |error| -error[:last_seen].to_f }
             .first(limit)
    end

    def cleanup_old_errors
      cutoff_time = Time.now - @retention_period

      @errors.delete_if { |_key, error| error[:last_seen] < cutoff_time }
      @error_counts.delete_if { |key, _count| !@errors.key?(key) }
      @error_rates.delete_if { |key, _rate| !@errors.key?(key) }
    end

    def setup_cleanup_timer
      Thread.new do
        loop do
          sleep 3600 # Run cleanup every hour
          cleanup_old_errors
        end
      end
    end

    def determine_category(error)
      message = error[:message].downcase
      type = error[:type].downcase

      return :timeout if message.include?('timeout') || type.include?('timeout')
      return :connection if message.include?('connection') || message.include?('network')
      return :permission if message.include?('permission') || message.include?('unauthorized')
      return :validation if message.include?('invalid') || message.include?('validation')
      return :system if type.include?('system') || type.include?('runtime')

      :unknown
    end

    class << self
      def global
        @global ||= new
      end

      def track(error, context = {})
        global.track_error(error, context)
      end

      def stats
        global.error_stats
      end

      def errors(limit = 50, filter = {})
        global.list_errors(limit, filter)
      end
    end
  end

  # Rate tracker for monitoring error frequencies
  class RateTracker
    def initialize(window_size = 3600)
      @timestamps = Concurrent::Array.new
      @window_size = window_size
      @mutex = Mutex.new
    end

    def increment
      @mutex.synchronize do
        @timestamps << Time.now.to_f
        cleanup_old_timestamps
      end
    end

    def rate(period_seconds = @window_size)
      @mutex.synchronize do
        cleanup_old_timestamps
        cutoff_time = Time.now.to_f - period_seconds
        recent_count = @timestamps.count { |ts| ts >= cutoff_time }
        recent_count.to_f / period_seconds
      end
    end

    def count(period_seconds = @window_size)
      @mutex.synchronize do
        cleanup_old_timestamps
        cutoff_time = Time.now.to_f - period_seconds
        @timestamps.count { |ts| ts >= cutoff_time }
      end
    end

    private

    def cleanup_old_timestamps
      cutoff_time = Time.now.to_f - @window_size
      @timestamps.delete_if { |ts| ts < cutoff_time }
    end
  end

  # Built-in error notifiers
  module ErrorNotifiers
    class LogNotifier
      def initialize(logger = nil)
        @logger = logger || NatsWork::Logger.global
      end

      def call(error_info)
        @logger.error('Error tracked',
                      error_key: "#{error_info[:type]}:#{error_info[:fingerprint]}",
                      error_type: error_info[:type],
                      error_message: error_info[:message],
                      context: error_info[:context])
      end
    end

    class MetricsNotifier
      def initialize(metrics = nil)
        @metrics = metrics || NatsWork::Metrics.global
      end

      def call(error_info)
        @metrics.increment('errors.tracked', 1,
                           error_type: error_info[:type],
                           hostname: error_info[:hostname])
      end
    end

    class WebhookNotifier
      def initialize(webhook_url, options = {})
        @webhook_url = webhook_url
        @http_timeout = options.fetch(:timeout, 5)
        @retry_count = options.fetch(:retries, 2)
      end

      def call(error_info)
        payload = {
          error: {
            type: error_info[:type],
            message: error_info[:message],
            fingerprint: error_info[:fingerprint],
            timestamp: error_info[:timestamp].iso8601,
            context: error_info[:context]
          }
        }

        send_webhook(payload)
      end

      private

      def send_webhook(payload)
        # This would integrate with an HTTP client
        # For now, just log the webhook call
        require_relative 'logger'
        NatsWork::Logger.debug('Webhook notification',
                               url: @webhook_url,
                               payload_size: payload.to_s.length)
      end
    end
  end
end
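
A minimal usage sketch of the ErrorTracker API added above. The require path, the context keys (queue, job_id), and the raised error are illustrative assumptions, not something this diff documents:

# Illustrative sketch only; names marked as assumed are not from the gem's docs.
require 'natswork-server'  # assumed entry point that loads error_tracker.rb

tracker = NatsWork::ErrorTracker.new(max_errors: 500, retention_hours: 12)

# Notifiers receive the extracted error_info hash for every tracked error.
tracker.add_notifier do |error_info|
  puts "#{error_info[:type]}: #{error_info[:message]}"
end

begin
  raise StandardError, 'connection timeout while publishing'
rescue StandardError => e
  # Context is sanitized (password/token/secret/... keys are filtered) before storage.
  tracker.track_error(e, queue: 'default', job_id: 'abc123')
end

tracker.error_stats                        # aggregate counts, rates, recent errors
tracker.categorize_errors[:timeout].size   # => 1 (the message contains 'timeout')

# The class-level singleton exposes the same behaviour process-wide.
NatsWork::ErrorTracker.track(StandardError.new('boom'), worker: 'w1')
NatsWork::ErrorTracker.stats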
data/lib/natswork/health_check.rb
@@ -0,0 +1,252 @@
# frozen_string_literal: true

require 'concurrent'

module NatsWork
  class HealthCheck
    attr_reader :name, :status, :last_checked, :message, :metadata

    STATUSES = %i[healthy degraded unhealthy unknown].freeze

    def initialize(name, &check_block)
      @name = name.to_s
      @check_block = check_block
      @status = :unknown
      @message = ''
      @metadata = {}
      @last_checked = nil
      @mutex = Mutex.new
    end

    def check!
      @mutex.synchronize do
        @last_checked = Time.now

        begin
          if @check_block
            result = @check_block.call

            if result.is_a?(Hash)
              @status = result.fetch(:status, :healthy)
              @message = result.fetch(:message, '')
              @metadata = result.fetch(:metadata, {})
            elsif result
              @status = :healthy
              @message = 'Check passed'
            else
              @status = :unhealthy
              @message = 'Check failed'
            end
          else
            @status = :unknown
            @message = 'No check defined'
          end
        rescue StandardError => e
          @status = :unhealthy
          @message = "Check error: #{e.message}"
          @metadata = { error: e.class.name, backtrace: e.backtrace&.first(5) }
        end

        @status
      end
    end

    def healthy?
      @status == :healthy
    end

    def unhealthy?
      @status == :unhealthy
    end

    def degraded?
      @status == :degraded
    end

    def to_h
      {
        name: @name,
        status: @status,
        message: @message,
        last_checked: @last_checked,
        metadata: @metadata
      }
    end
  end

  class HealthChecker
    def initialize
      @checks = Concurrent::Hash.new
      @thresholds = {
        memory_mb: 1000,
        cpu_percent: 90.0,
        queue_depth: 1000,
        error_rate: 0.05
      }
    end

    def add_check(name, &block)
      @checks[name.to_s] = HealthCheck.new(name, &block)
    end

    def remove_check(name)
      @checks.delete(name.to_s)
    end

    def check_all!
      results = {}

      @checks.each do |name, check|
        results[name] = check.check!
      end

      results
    end

    def check(name)
      check = @checks[name.to_s]
      check&.check!
    end

    def status(name = nil)
      if name
        @checks[name.to_s]&.status || :unknown
      else
        overall_status
      end
    end

    def healthy?(name = nil)
      if name
        @checks[name.to_s]&.healthy? || false
      else
        @checks.values.all?(&:healthy?)
      end
    end

    def report
      {
        status: overall_status,
        timestamp: Time.now.iso8601,
        checks: @checks.transform_values(&:to_h)
      }
    end

    def set_threshold(metric, value)
      @thresholds[metric.to_sym] = value
    end

    private

    def overall_status
      statuses = @checks.values.map(&:status)

      return :unhealthy if statuses.include?(:unhealthy)
      return :degraded if statuses.include?(:degraded)
      return :healthy if statuses.all? { |s| s == :healthy }

      :unknown
    end

    class << self
      def global
        @global ||= new.tap { |checker| setup_default_checks(checker) }
      end

      private

      def setup_default_checks(checker)
        # NATS connection health
        checker.add_check('nats_connection') do
          # This will be implemented when we have connection reference
          { status: :healthy, message: 'NATS connection active' }
        end

        # Memory usage
        checker.add_check('memory_usage') do
          if defined?(GC.stat)
            memory_mb = GC.stat(:heap_allocated_pages) * GC::INTERNAL_CONSTANTS[:HEAP_PAGE_SIZE] / (1024 * 1024)
            threshold = checker.instance_variable_get(:@thresholds)[:memory_mb]

            if memory_mb > threshold
              { status: :unhealthy, message: "Memory usage #{memory_mb}MB exceeds threshold #{threshold}MB" }
            elsif memory_mb > threshold * 0.8
              { status: :degraded, message: "Memory usage #{memory_mb}MB approaching threshold" }
            else
              { status: :healthy, message: "Memory usage #{memory_mb}MB within limits" }
            end
          else
            { status: :unknown, message: 'Memory stats not available' }
          end
        end

        # Process health
        checker.add_check('process') do
          {
            status: :healthy,
            message: 'Process running normally',
            metadata: {
              pid: Process.pid,
              uptime: Time.now - $PROGRAM_START_TIME
            }
          }
        end
      end
    end
  end

  # Worker-specific health checks
  module WorkerHealth
    def self.setup_worker_checks(worker, health_checker = HealthChecker.global)
      # Worker status check
      health_checker.add_check('worker_status') do
        if worker.running?
          if worker.paused?
            { status: :degraded, message: 'Worker is paused' }
          elsif worker.stopping?
            { status: :degraded, message: 'Worker is stopping' }
          else
            { status: :healthy, message: 'Worker running normally' }
          end
        else
          { status: :unhealthy, message: 'Worker is stopped' }
        end
      end

      # Active jobs check
      health_checker.add_check('active_jobs') do
        stats = worker.stats
        active = stats[:active_jobs]
        concurrency = stats[:concurrency]

        if active >= concurrency
          { status: :degraded, message: "All #{concurrency} job slots occupied" }
        elsif active > concurrency * 0.8
          { status: :degraded, message: "#{active}/#{concurrency} job slots occupied" }
        else
          { status: :healthy, message: "#{active}/#{concurrency} job slots occupied" }
        end
      end

      # Job processing rate check
      health_checker.add_check('job_processing') do
        stats = worker.stats
        processed = stats[:jobs_processed]
        failed = stats[:jobs_failed]

        if processed.zero?
          { status: :unknown, message: 'No jobs processed yet' }
        else
          error_rate = failed.to_f / processed
          if error_rate > 0.5
            { status: :unhealthy, message: "High error rate: #{(error_rate * 100).round(1)}%" }
          elsif error_rate > 0.1
            { status: :degraded, message: "Elevated error rate: #{(error_rate * 100).round(1)}%" }
          else
            { status: :healthy, message: "Error rate: #{(error_rate * 100).round(1)}%" }
          end
        end
      end
    end
  end
end
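
A hedged sketch of how the HealthChecker added above might be exercised. The 'disk_space' check, its threshold, and the stand-in disk probe are illustrative assumptions; only the method names come from this file:

# Illustrative sketch only.
checker = NatsWork::HealthChecker.new
checker.set_threshold(:queue_depth, 5000)

# A block returning a Hash sets status/message/metadata explicitly;
# a plain truthy or falsey return maps to :healthy or :unhealthy.
checker.add_check('disk_space') do
  free_mb = 2048  # stand-in for a real disk probe
  if free_mb < 512
    { status: :unhealthy, message: "Only #{free_mb}MB free" }
  else
    { status: :healthy, message: "#{free_mb}MB free" }
  end
end

checker.check_all!   # runs every registered check, returns { name => status }
checker.healthy?     # true only when all checks report :healthy
checker.report       # { status: ..., timestamp: ..., checks: { 'disk_space' => {...} } }

# HealthChecker.global ships with nats_connection, memory_usage and process checks,
# and WorkerHealth.setup_worker_checks(worker) registers worker_status, active_jobs
# and job_processing checks for a running worker instance.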