natswork-server 0.0.1

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,338 @@
+ # frozen_string_literal: true
+
+ require 'digest'
+ require 'concurrent'
+ require 'socket'
+ require 'time' # Time#iso8601 is used by ErrorNotifiers::WebhookNotifier below
+
+ module NatsWork
+   class ErrorTracker
+     attr_reader :errors, :error_rates
+
+     def initialize(options = {})
+       @errors = Concurrent::Hash.new
+       @error_counts = Concurrent::Hash.new(0)
+       # Keep a 24h window so rate_24h in #error_stats has data to draw on
+       @error_rates = Concurrent::Hash.new { |h, k| h[k] = RateTracker.new(86_400) }
+       @max_errors = options.fetch(:max_errors, 1000)
+       @retention_period = options.fetch(:retention_hours, 24) * 3600
+       @notifiers = []
+
+       setup_cleanup_timer
+     end
+
+     def track_error(error, context = {})
+       error_info = extract_error_info(error, context)
+       error_key = generate_error_key(error_info)
+
+       # Store error details
+       @errors[error_key] = error_info.merge(
+         first_seen: @errors[error_key]&.dig(:first_seen) || Time.now,
+         last_seen: Time.now,
+         count: @error_counts[error_key] + 1
+       )
+
+       @error_counts[error_key] += 1
+       @error_rates[error_key].increment
+
+       # Notify error handlers
+       notify_error(error_info)
+
+       # Cleanup old errors if we're over the limit
+       cleanup_old_errors if @errors.size > @max_errors
+
+       error_info
+     end
+
+     def get_error(error_key)
+       @errors[error_key]
+     end
+
+     def list_errors(limit = 50, filter = {})
+       filtered_errors = @errors.select do |_key, error|
+         matches_filter?(error, filter)
+       end
+
+       sorted_errors = filtered_errors.sort_by { |_key, error| -error[:last_seen].to_f }
+       sorted_errors.first(limit).to_h
+     end
+
+     def error_stats(error_key = nil)
+       if error_key
+         error = @errors[error_key]
+         return nil unless error
+
+         {
+           key: error_key,
+           count: error[:count],
+           first_seen: error[:first_seen],
+           last_seen: error[:last_seen],
+           rate_1h: @error_rates[error_key].rate(3600),
+           rate_24h: @error_rates[error_key].rate(86_400)
+         }
+       else
+         {
+           total_errors: @errors.size,
+           total_occurrences: @error_counts.values.sum,
+           most_frequent: most_frequent_errors(5),
+           recent_errors: recent_errors(10)
+         }
+       end
+     end
+
+     def add_notifier(&block)
+       @notifiers << block if block_given?
+     end
+
+     def clear_errors!
+       @errors.clear
+       @error_counts.clear
+       @error_rates.clear
+     end
+
+     def categorize_errors
+       categories = {
+         timeout: [],
+         connection: [],
+         permission: [],
+         validation: [],
+         system: [],
+         unknown: []
+       }
+
+       @errors.each do |key, error|
+         category = determine_category(error)
+         categories[category] << { key: key, error: error }
+       end
+
+       categories
+     end
+
+     private
+
+     def extract_error_info(error, context)
+       {
+         type: error.class.name,
+         message: error.message,
+         backtrace: error.backtrace&.first(20),
+         fingerprint: generate_fingerprint(error),
+         context: sanitize_context(context),
+         timestamp: Time.now,
+         hostname: Socket.gethostname,
+         pid: Process.pid,
+         thread_id: Thread.current.object_id
+       }
+     end
+
+     def generate_error_key(error_info)
+       "#{error_info[:type]}:#{error_info[:fingerprint]}"
+     end
+
+     def generate_fingerprint(error)
+       # Create a fingerprint based on error type, message, and location
+       location = error.backtrace&.first&.split(':')&.first(2)&.join(':') || 'unknown'
+       fingerprint_data = "#{error.class.name}:#{error.message}:#{location}"
+       Digest::SHA256.hexdigest(fingerprint_data)[0, 16]
+     end
+
+     def sanitize_context(context)
+       # Remove sensitive data from context
+       sanitized = context.dup
+
+       sensitive_keys = %w[password token secret key credential auth]
+       sensitive_keys.each do |sensitive_key|
+         sanitized.keys.each do |key|
+           sanitized[key] = '[FILTERED]' if key.to_s.downcase.include?(sensitive_key)
+         end
+       end
+
+       sanitized
+     end
+
+     def notify_error(error_info)
+       @notifiers.each do |notifier|
+         notifier.call(error_info)
+       rescue StandardError => e
+         # Don't let notifier errors break error tracking
+         warn "Error notifier failed: #{e.message}"
+       end
+     end
+
+     def matches_filter?(error, filter)
+       return true if filter.empty?
+
+       filter.all? do |key, value|
+         case key.to_sym
+         when :type
+           error[:type] == value
+         when :since
+           error[:last_seen] >= value
+         when :context
+           value.all? { |k, v| error[:context][k] == v }
+         else
+           true
+         end
+       end
+     end
+
+     def most_frequent_errors(limit)
+       @error_counts.sort_by { |_key, count| -count }
+                    .first(limit)
+                    .map { |key, count| { key: key, count: count, error: @errors[key] } }
+     end
+
+     def recent_errors(limit)
+       @errors.values
+              .sort_by { |error| -error[:last_seen].to_f }
+              .first(limit)
+     end
+
+     def cleanup_old_errors
+       cutoff_time = Time.now - @retention_period
+
+       @errors.delete_if { |_key, error| error[:last_seen] < cutoff_time }
+       @error_counts.delete_if { |key, _count| !@errors.key?(key) }
+       @error_rates.delete_if { |key, _rate| !@errors.key?(key) }
+     end
+
+     def setup_cleanup_timer
+       Thread.new do
+         loop do
+           sleep 3600 # Run cleanup every hour
+           cleanup_old_errors
+         end
+       end
+     end
+
+     def determine_category(error)
+       message = error[:message].downcase
+       type = error[:type].downcase
+
+       return :timeout if message.include?('timeout') || type.include?('timeout')
+       return :connection if message.include?('connection') || message.include?('network')
+       return :permission if message.include?('permission') || message.include?('unauthorized')
+       return :validation if message.include?('invalid') || message.include?('validation')
+       return :system if type.include?('system') || type.include?('runtime')
+
+       :unknown
+     end
+
+     class << self
+       def global
+         @global ||= new
+       end
+
+       def track(error, context = {})
+         global.track_error(error, context)
+       end
+
+       def stats
+         global.error_stats
+       end
+
+       def errors(limit = 50, filter = {})
+         global.list_errors(limit, filter)
+       end
+     end
+   end
+
+   # Rate tracker for monitoring error frequencies
+   class RateTracker
+     def initialize(window_size = 3600)
+       @timestamps = Concurrent::Array.new
+       @window_size = window_size
+       @mutex = Mutex.new
+     end
+
+     def increment
+       @mutex.synchronize do
+         @timestamps << Time.now.to_f
+         cleanup_old_timestamps
+       end
+     end
+
+     def rate(period_seconds = @window_size)
+       @mutex.synchronize do
+         cleanup_old_timestamps
+         cutoff_time = Time.now.to_f - period_seconds
+         recent_count = @timestamps.count { |ts| ts >= cutoff_time }
+         recent_count.to_f / period_seconds
+       end
+     end
+
+     def count(period_seconds = @window_size)
+       @mutex.synchronize do
+         cleanup_old_timestamps
+         cutoff_time = Time.now.to_f - period_seconds
+         @timestamps.count { |ts| ts >= cutoff_time }
+       end
+     end
+
+     private
+
+     def cleanup_old_timestamps
+       cutoff_time = Time.now.to_f - @window_size
+       @timestamps.delete_if { |ts| ts < cutoff_time }
+     end
+   end
+
+   # Built-in error notifiers
+   module ErrorNotifiers
+     class LogNotifier
+       def initialize(logger = nil)
+         @logger = logger || NatsWork::Logger.global
+       end
+
+       def call(error_info)
+         @logger.error('Error tracked',
+                       error_key: "#{error_info[:type]}:#{error_info[:fingerprint]}",
+                       error_type: error_info[:type],
+                       error_message: error_info[:message],
+                       context: error_info[:context])
+       end
+     end
+
+     class MetricsNotifier
+       def initialize(metrics = nil)
+         @metrics = metrics || NatsWork::Metrics.global
+       end
+
+       def call(error_info)
+         @metrics.increment('errors.tracked', 1,
+                            error_type: error_info[:type],
+                            hostname: error_info[:hostname])
+       end
+     end
+
+     class WebhookNotifier
+       def initialize(webhook_url, options = {})
+         @webhook_url = webhook_url
+         @http_timeout = options.fetch(:timeout, 5)
+         @retry_count = options.fetch(:retries, 2)
+       end
+
+       def call(error_info)
+         payload = {
+           error: {
+             type: error_info[:type],
+             message: error_info[:message],
+             fingerprint: error_info[:fingerprint],
+             timestamp: error_info[:timestamp].iso8601,
+             context: error_info[:context]
+           }
+         }
+
+         send_webhook(payload)
+       end
+
+       private
+
+       def send_webhook(payload)
+         # This would integrate with an HTTP client
+         # For now, just log the webhook call
+         require_relative 'logger'
+         NatsWork::Logger.debug('Webhook notification',
+                                url: @webhook_url,
+                                payload_size: payload.to_s.length)
+       end
+     end
+   end
+ end
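
Taken together, this file gives the gem a process-wide error store: ErrorTracker.global deduplicates errors by fingerprint, keeps per-error rates via RateTracker, and fans each occurrence out to registered notifiers. A minimal usage sketch based only on the code above (the context keys and the raised ArgumentError are illustrative, not part of the gem's API):

    tracker = NatsWork::ErrorTracker.new(max_errors: 500, retention_hours: 12)

    # Any callable can be a notifier; the built-ins live under NatsWork::ErrorNotifiers.
    tracker.add_notifier do |error_info|
      puts "tracked #{error_info[:type]}: #{error_info[:message]}"
    end

    begin
      raise ArgumentError, 'invalid job payload'
    rescue StandardError => e
      tracker.track_error(e, job_class: 'ExampleJob', queue: 'default')
    end

    tracker.error_stats                     # aggregate counts, most frequent, recent errors
    tracker.categorize_errors[:validation]  # the error above lands here ('invalid' in message)

    # Or go through the process-wide singleton:
    NatsWork::ErrorTracker.track(StandardError.new('boom'), source: 'example')
    NatsWork::ErrorTracker.stats
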
@@ -0,0 +1,252 @@
+ # frozen_string_literal: true
+
+ require 'concurrent'
+ require 'time' # Time#iso8601 is used by HealthChecker#report
+
+ module NatsWork
+   class HealthCheck
+     attr_reader :name, :status, :last_checked, :message, :metadata
+
+     STATUSES = %i[healthy degraded unhealthy unknown].freeze
+
+     def initialize(name, &check_block)
+       @name = name.to_s
+       @check_block = check_block
+       @status = :unknown
+       @message = ''
+       @metadata = {}
+       @last_checked = nil
+       @mutex = Mutex.new
+     end
+
+     def check!
+       @mutex.synchronize do
+         @last_checked = Time.now
+
+         begin
+           if @check_block
+             result = @check_block.call
+
+             if result.is_a?(Hash)
+               @status = result.fetch(:status, :healthy)
+               @message = result.fetch(:message, '')
+               @metadata = result.fetch(:metadata, {})
+             elsif result
+               @status = :healthy
+               @message = 'Check passed'
+             else
+               @status = :unhealthy
+               @message = 'Check failed'
+             end
+           else
+             @status = :unknown
+             @message = 'No check defined'
+           end
+         rescue StandardError => e
+           @status = :unhealthy
+           @message = "Check error: #{e.message}"
+           @metadata = { error: e.class.name, backtrace: e.backtrace&.first(5) }
+         end
+
+         @status
+       end
+     end
+
+     def healthy?
+       @status == :healthy
+     end
+
+     def unhealthy?
+       @status == :unhealthy
+     end
+
+     def degraded?
+       @status == :degraded
+     end
+
+     def to_h
+       {
+         name: @name,
+         status: @status,
+         message: @message,
+         last_checked: @last_checked,
+         metadata: @metadata
+       }
+     end
+   end
+
+   class HealthChecker
+     def initialize
+       @checks = Concurrent::Hash.new
+       @thresholds = {
+         memory_mb: 1000,
+         cpu_percent: 90.0,
+         queue_depth: 1000,
+         error_rate: 0.05
+       }
+     end
+
+     def add_check(name, &block)
+       @checks[name.to_s] = HealthCheck.new(name, &block)
+     end
+
+     def remove_check(name)
+       @checks.delete(name.to_s)
+     end
+
+     def check_all!
+       results = {}
+
+       @checks.each do |name, check|
+         results[name] = check.check!
+       end
+
+       results
+     end
+
+     def check(name)
+       check = @checks[name.to_s]
+       check&.check!
+     end
+
+     def status(name = nil)
+       if name
+         @checks[name.to_s]&.status || :unknown
+       else
+         overall_status
+       end
+     end
+
+     def healthy?(name = nil)
+       if name
+         @checks[name.to_s]&.healthy? || false
+       else
+         @checks.values.all?(&:healthy?)
+       end
+     end
+
+     def report
+       {
+         status: overall_status,
+         timestamp: Time.now.iso8601,
+         checks: @checks.transform_values(&:to_h)
+       }
+     end
+
+     def set_threshold(metric, value)
+       @thresholds[metric.to_sym] = value
+     end
+
+     private
+
+     def overall_status
+       statuses = @checks.values.map(&:status)
+
+       return :unhealthy if statuses.include?(:unhealthy)
+       return :degraded if statuses.include?(:degraded)
+       return :healthy if statuses.all? { |s| s == :healthy }
+
+       :unknown
+     end
+
+     class << self
+       def global
+         @global ||= new.tap { |checker| setup_default_checks(checker) }
+       end
+
+       private
+
+       def setup_default_checks(checker)
+         # NATS connection health
+         checker.add_check('nats_connection') do
+           # This will be implemented when we have connection reference
+           { status: :healthy, message: 'NATS connection active' }
+         end
+
+         # Memory usage
+         checker.add_check('memory_usage') do
+           if defined?(GC.stat)
+             memory_mb = GC.stat(:heap_allocated_pages) * GC::INTERNAL_CONSTANTS[:HEAP_PAGE_SIZE] / (1024 * 1024)
+             threshold = checker.instance_variable_get(:@thresholds)[:memory_mb]
+
+             if memory_mb > threshold
+               { status: :unhealthy, message: "Memory usage #{memory_mb}MB exceeds threshold #{threshold}MB" }
+             elsif memory_mb > threshold * 0.8
+               { status: :degraded, message: "Memory usage #{memory_mb}MB approaching threshold" }
+             else
+               { status: :healthy, message: "Memory usage #{memory_mb}MB within limits" }
+             end
+           else
+             { status: :unknown, message: 'Memory stats not available' }
+           end
+         end
+
+         # Process health
+         checker.add_check('process') do
+           # $PROGRAM_START_TIME is expected to be set at boot; report a nil uptime if it isn't
+           started_at = defined?($PROGRAM_START_TIME) ? $PROGRAM_START_TIME : nil
+           {
+             status: :healthy,
+             message: 'Process running normally',
+             metadata: {
+               pid: Process.pid,
+               uptime: started_at && (Time.now - started_at)
+             }
+           }
+         end
+       end
+     end
+   end
+
+   # Worker-specific health checks
+   module WorkerHealth
+     def self.setup_worker_checks(worker, health_checker = HealthChecker.global)
+       # Worker status check
+       health_checker.add_check('worker_status') do
+         if worker.running?
+           if worker.paused?
+             { status: :degraded, message: 'Worker is paused' }
+           elsif worker.stopping?
+             { status: :degraded, message: 'Worker is stopping' }
+           else
+             { status: :healthy, message: 'Worker running normally' }
+           end
+         else
+           { status: :unhealthy, message: 'Worker is stopped' }
+         end
+       end
+
+       # Active jobs check
+       health_checker.add_check('active_jobs') do
+         stats = worker.stats
+         active = stats[:active_jobs]
+         concurrency = stats[:concurrency]
+
+         if active >= concurrency
+           { status: :degraded, message: "All #{concurrency} job slots occupied" }
+         elsif active > concurrency * 0.8
+           { status: :degraded, message: "#{active}/#{concurrency} job slots occupied" }
+         else
+           { status: :healthy, message: "#{active}/#{concurrency} job slots occupied" }
+         end
+       end
+
+       # Job processing rate check
+       health_checker.add_check('job_processing') do
+         stats = worker.stats
+         processed = stats[:jobs_processed]
+         failed = stats[:jobs_failed]
+
+         if processed.zero?
+           { status: :unknown, message: 'No jobs processed yet' }
+         else
+           error_rate = failed.to_f / processed
+           if error_rate > 0.5
+             { status: :unhealthy, message: "High error rate: #{(error_rate * 100).round(1)}%" }
+           elsif error_rate > 0.1
+             { status: :degraded, message: "Elevated error rate: #{(error_rate * 100).round(1)}%" }
+           else
+             { status: :healthy, message: "Error rate: #{(error_rate * 100).round(1)}%" }
+           end
+         end
+       end
+     end
+   end
+ end
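
The health-check file follows the same shape: HealthChecker.global comes pre-populated with the nats_connection, memory_usage and process checks, and WorkerHealth.setup_worker_checks layers worker-specific checks on top of it. A minimal sketch based only on the code above; the 'redis' check and the FakeWorker stand-in are hypothetical, used just to show the block contract and the stats keys the worker checks expect:

    checker = NatsWork::HealthChecker.new
    checker.set_threshold(:memory_mb, 2048)

    # A check block may return true/false, or a hash with :status, :message, :metadata.
    checker.add_check('redis') do
      { status: :healthy, message: 'PING ok', metadata: { latency_ms: 1.2 } }
    end

    checker.check_all!          # runs every registered check
    checker.healthy?('redis')   # => true
    checker.report              # => { status: :healthy, timestamp: "...", checks: {...} }

    # Worker checks hang off the shared global checker (hypothetical worker stand-in,
    # showing only the running?/paused?/stopping?/stats contract the checks rely on):
    class FakeWorker
      def running?  = true
      def paused?   = false
      def stopping? = false

      def stats
        { active_jobs: 2, concurrency: 10, jobs_processed: 100, jobs_failed: 3 }
      end
    end

    NatsWork::WorkerHealth.setup_worker_checks(FakeWorker.new)
    NatsWork::HealthChecker.global.check('job_processing')  # => :healthy (3% error rate)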