sensu 0.9.9.beta.2 → 0.9.9.beta.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/sensu/server.rb CHANGED
@@ -4,7 +4,9 @@ require File.join(File.dirname(__FILE__), 'socket')
4
4
 
5
5
  module Sensu
6
6
  class Server
7
- attr_reader :redis, :amq, :is_master
7
+ include Utilities
8
+
9
+ attr_reader :is_master
8
10
 
9
11
  def self.run(options={})
10
12
  server = self.new(options)
@@ -15,9 +17,11 @@ module Sensu
15
17
  end
16
18
 
17
19
  def initialize(options={})
18
- @logger = Sensu::Logger.get
19
- base = Sensu::Base.new(options)
20
+ base = Base.new(options)
21
+ @logger = base.logger
20
22
  @settings = base.settings
23
+ @extensions = base.extensions
24
+ base.setup_process
21
25
  @timers = Array.new
22
26
  @master_timers = Array.new
23
27
  @handlers_in_progress_count = 0
@@ -28,65 +32,45 @@ module Sensu
28
32
  @logger.debug('connecting to redis', {
29
33
  :settings => @settings[:redis]
30
34
  })
31
- connection_failure = Proc.new do
32
- @logger.fatal('cannot connect to redis', {
33
- :settings => @settings[:redis]
35
+ @redis = Redis.connect(@settings[:redis])
36
+ @redis.on_error do |error|
37
+ @logger.fatal('redis connection error', {
38
+ :error => error.to_s
34
39
  })
35
- @logger.fatal('SENSU NOT RUNNING!')
36
- if @rabbitmq
37
- @rabbitmq.close
38
- end
39
- exit 2
40
+ stop
40
41
  end
41
- @redis = Sensu::Redis.connect(@settings[:redis], :on_tcp_connection_failure => connection_failure)
42
- @redis.on_tcp_connection_loss do
42
+ @redis.before_reconnect do
43
+ @logger.warn('reconnecting to redis')
43
44
  unless testing?
44
- @logger.fatal('redis connection closed')
45
- stop
45
+ pause
46
46
  end
47
47
  end
48
+ @redis.after_reconnect do
49
+ @logger.info('reconnected to redis')
50
+ resume
51
+ end
48
52
  end
49
53
 
50
54
  def setup_rabbitmq
51
55
  @logger.debug('connecting to rabbitmq', {
52
56
  :settings => @settings[:rabbitmq]
53
57
  })
54
- connection_failure = Proc.new do
55
- @logger.fatal('cannot connect to rabbitmq', {
56
- :settings => @settings[:rabbitmq]
58
+ @rabbitmq = RabbitMQ.connect(@settings[:rabbitmq])
59
+ @rabbitmq.on_error do |error|
60
+ @logger.fatal('rabbitmq connection error', {
61
+ :error => error.to_s
57
62
  })
58
- @logger.fatal('SENSU NOT RUNNING!')
59
- @redis.close
60
- exit 2
63
+ stop
61
64
  end
62
- @rabbitmq = AMQP.connect(@settings[:rabbitmq], {
63
- :on_tcp_connection_failure => connection_failure,
64
- :on_possible_authentication_failure => connection_failure
65
- })
66
- @rabbitmq.logger = Sensu::NullLogger.get
67
- @rabbitmq.on_tcp_connection_loss do |connection, settings|
68
- unless connection.reconnecting?
69
- @logger.warn('reconnecting to rabbitmq')
70
- resign_as_master do
71
- connection.periodically_reconnect(5)
72
- end
73
- end
65
+ @rabbitmq.before_reconnect do
66
+ @logger.warn('reconnecting to rabbitmq')
67
+ resign_as_master
74
68
  end
75
- @rabbitmq.on_skipped_heartbeats do
76
- @logger.warn('skipped rabbitmq heartbeat')
69
+ @rabbitmq.after_reconnect do
70
+ @logger.info('reconnected to rabbitmq')
77
71
  end
78
- @amq = AMQP::Channel.new(@rabbitmq)
79
- @amq.auto_recovery = true
72
+ @amq = @rabbitmq.channel
80
73
  @amq.prefetch(1)
81
- @amq.on_error do |channel, channel_close|
82
- @logger.fatal('rabbitmq channel closed', {
83
- :error => {
84
- :reply_code => channel_close.reply_code,
85
- :reply_text => channel_close.reply_text
86
- }
87
- })
88
- stop
89
- end
90
74
  end
91
75
 
92
76
  def setup_keepalives
@@ -97,8 +81,8 @@ module Sensu
97
81
  @logger.debug('received keepalive', {
98
82
  :client => client
99
83
  })
100
- @redis.set('client:' + client[:name], client.to_json).callback do
101
- @redis.sadd('clients', client[:name]).callback do
84
+ @redis.set('client:' + client[:name], client.to_json) do
85
+ @redis.sadd('clients', client[:name]) do
102
86
  header.ack
103
87
  end
104
88
  end
@@ -155,7 +139,7 @@ module Sensu
155
139
  def derive_handlers(handler_list, nested=false)
156
140
  handler_list.inject(Array.new) do |handlers, handler_name|
157
141
  if @settings.handler_exists?(handler_name)
158
- handler = @settings[:handlers][handler_name]
142
+ handler = @settings[:handlers][handler_name].merge(:name => handler_name)
159
143
  if handler[:type] == 'set'
160
144
  unless nested
161
145
  handlers = handlers + derive_handlers(handler[:handlers], true)
@@ -167,6 +151,9 @@ module Sensu
167
151
  else
168
152
  handlers.push(handler)
169
153
  end
154
+ elsif @extensions.handler_exists?(handler_name)
155
+ handler = @extensions[:handlers][handler_name]
156
+ handlers.push(handler)
170
157
  else
171
158
  @logger.error('unknown handler', {
172
159
  :handler => {
@@ -181,7 +168,7 @@ module Sensu
181
168
  def event_handlers(event)
182
169
  handler_list = Array((event[:check][:handlers] || event[:check][:handler]) || 'default')
183
170
  handlers = derive_handlers(handler_list)
184
- event_severity = Sensu::SEVERITIES[event[:check][:status]] || 'unknown'
171
+ event_severity = SEVERITIES[event[:check][:status]] || 'unknown'
185
172
  handlers.select do |handler|
186
173
  if event[:action] == :flapping && !handler[:handle_flapping]
187
174
  @logger.info('handler does not handle flapping events', {
@@ -233,7 +220,7 @@ module Sensu
233
220
  end
234
221
  execute = Proc.new do
235
222
  begin
236
- output, status = Sensu::IO.popen(command, 'r+') do |child|
223
+ output, status = IO.popen(command, 'r+') do |child|
237
224
  unless data.nil?
238
225
  child.write(data.to_s)
239
226
  end
@@ -254,41 +241,39 @@ module Sensu
254
241
  end
255
242
 
256
243
  def mutate_event_data(mutator_name, event, &block)
257
- case mutator_name
258
- when nil
244
+ on_error = Proc.new do |error|
245
+ @logger.error('mutator error', {
246
+ :event => event,
247
+ :mutator => mutator,
248
+ :error => error.to_s
249
+ })
250
+ end
251
+ case
252
+ when mutator_name.nil?
259
253
  block.call(event.to_json)
260
- when /^only_check_output/
261
- mutated = case mutator_name
262
- when /split$/
263
- event[:check][:output].split(/\n+/)
264
- else
265
- event[:check][:output]
266
- end
267
- block.call(mutated)
268
- else
269
- if @settings.mutator_exists?(mutator_name)
270
- mutator = @settings[:mutators][mutator_name]
271
- on_error = Proc.new do |error|
272
- @logger.error('mutator error', {
273
- :event => event,
274
- :mutator => mutator,
275
- :error => error.to_s
276
- })
254
+ when @settings.mutator_exists?(mutator_name)
255
+ mutator = @settings[:mutators][mutator_name]
256
+ execute_command(mutator[:command], event.to_json, on_error) do |output, status|
257
+ if status == 0
258
+ block.call(output)
259
+ else
260
+ on_error.call('non-zero exit status (' + status + '): ' + output)
277
261
  end
278
- execute_command(mutator[:command], event.to_json, on_error) do |output, status|
279
- if status == 0
280
- block.call(output)
281
- else
282
- on_error.call('non-zero exit status (' + status + '): ' + output)
283
- end
262
+ end
263
+ when @extensions.mutator_exists?(mutator_name)
264
+ @extensions[:mutators][mutator_name].run(event) do |output, status|
265
+ if status == 0
266
+ block.call(output)
267
+ else
268
+ on_error.call('non-zero exit status (' + status + '): ' + output)
284
269
  end
285
- else
286
- @logger.error('unknown mutator', {
287
- :mutator => {
288
- :name => mutator_name
289
- }
290
- })
291
270
  end
271
+ else
272
+ @logger.error('unknown mutator', {
273
+ :mutator => {
274
+ :name => mutator_name
275
+ }
276
+ })
292
277
  end
293
278
  end
294
279
 
@@ -320,7 +305,7 @@ module Sensu
320
305
  end
321
306
  when 'tcp'
322
307
  begin
323
- EM::connect(handler[:socket][:host], handler[:socket][:port], Sensu::SocketHandler) do |socket|
308
+ EM::connect(handler[:socket][:host], handler[:socket][:port], SocketHandler) do |socket|
324
309
  socket.on_success = Proc.new do
325
310
  @handlers_in_progress_count -= 1
326
311
  end
@@ -350,13 +335,17 @@ module Sensu
350
335
  exchange_options = handler[:exchange].reject do |key, value|
351
336
  [:name, :type].include?(key)
352
337
  end
353
- payloads = Array(event_data)
354
- payloads.each do |payload|
355
- unless payload.empty?
356
- @amq.method(exchange_type).call(exchange_name, exchange_options).publish(payload)
357
- end
338
+ unless event_data.empty?
339
+ @amq.method(exchange_type).call(exchange_name, exchange_options).publish(event_data)
358
340
  end
359
341
  @handlers_in_progress_count -= 1
342
+ when 'extension'
343
+ handler.run(event_data) do |output, status|
344
+ output.split(/\n+/).each do |line|
345
+ @logger.info(line)
346
+ end
347
+ @handlers_in_progress_count -= 1
348
+ end
360
349
  end
361
350
  end
362
351
  end
@@ -371,15 +360,14 @@ module Sensu
371
360
  @redis.hset('aggregation:' + result_set, result[:client], {
372
361
  :output => check[:output],
373
362
  :status => check[:status]
374
- }.to_json).callback do
375
- statuses = Sensu::SEVERITIES
376
- statuses.each do |status|
377
- @redis.hsetnx('aggregate:' + result_set, status, 0)
378
- end
379
- status = (statuses[check[:status]] || 'unknown')
380
- @redis.hincrby('aggregate:' + result_set, status, 1).callback do
381
- @redis.hincrby('aggregate:' + result_set, 'total', 1).callback do
382
- @redis.sadd('aggregates:' + check[:name], check[:issued]).callback do
363
+ }.to_json) do
364
+ SEVERITIES.each do |severity|
365
+ @redis.hsetnx('aggregate:' + result_set, severity, 0)
366
+ end
367
+ severity = (SEVERITIES[check[:status]] || 'unknown')
368
+ @redis.hincrby('aggregate:' + result_set, severity, 1) do
369
+ @redis.hincrby('aggregate:' + result_set, 'total', 1) do
370
+ @redis.sadd('aggregates:' + check[:name], check[:issued]) do
383
371
  @redis.sadd('aggregates', check[:name])
384
372
  end
385
373
  end
@@ -391,7 +379,7 @@ module Sensu
391
379
  @logger.debug('processing result', {
392
380
  :result => result
393
381
  })
394
- @redis.get('client:' + result[:client]).callback do |client_json|
382
+ @redis.get('client:' + result[:client]) do |client_json|
395
383
  unless client_json.nil?
396
384
  client = JSON.parse(client_json, :symbolize_names => true)
397
385
  check = case
@@ -405,11 +393,11 @@ module Sensu
405
393
  end
406
394
  @redis.sadd('history:' + client[:name], check[:name])
407
395
  history_key = 'history:' + client[:name] + ':' + check[:name]
408
- @redis.rpush(history_key, check[:status]).callback do
409
- @redis.lrange(history_key, -21, -1).callback do |history|
396
+ @redis.rpush(history_key, check[:status]) do
397
+ @redis.lrange(history_key, -21, -1) do |history|
410
398
  check[:history] = history
411
399
  total_state_change = 0
412
- unless history.count < 21
400
+ unless history.size < 21
413
401
  state_changes = 0
414
402
  change_weight = 0.8
415
403
  previous_status = history.first
@@ -423,7 +411,7 @@ module Sensu
423
411
  total_state_change = (state_changes.fdiv(20) * 100).to_i
424
412
  @redis.ltrim(history_key, -21, -1)
425
413
  end
426
- @redis.hget('events:' + client[:name], check[:name]).callback do |event_json|
414
+ @redis.hget('events:' + client[:name], check[:name]) do |event_json|
427
415
  previous_occurrence = event_json ? JSON.parse(event_json, :symbolize_names => true) : false
428
416
  is_flapping = false
429
417
  if check.has_key?(:low_flap_threshold) && check.has_key?(:high_flap_threshold)
@@ -453,7 +441,7 @@ module Sensu
453
441
  :handlers => Array((check[:handlers] || check[:handler]) || 'default'),
454
442
  :flapping => is_flapping,
455
443
  :occurrences => event[:occurrences]
456
- }.to_json).callback do
444
+ }.to_json) do
457
445
  unless check[:handle] == false
458
446
  event[:action] = is_flapping ? :flapping : :create
459
447
  handle_event(event)
@@ -461,7 +449,7 @@ module Sensu
461
449
  end
462
450
  elsif previous_occurrence
463
451
  unless check[:auto_resolve] == false && !check[:force_resolve]
464
- @redis.hdel('events:' + client[:name], check[:name]).callback do
452
+ @redis.hdel('events:' + client[:name], check[:name]) do
465
453
  unless check[:handle] == false
466
454
  event[:occurrences] = previous_occurrence[:occurrences]
467
455
  event[:action] = :resolve
@@ -515,8 +503,9 @@ module Sensu
515
503
  stagger = testing? ? 0 : 2
516
504
  @settings.checks.each do |check|
517
505
  unless check[:publish] == false || check[:standalone]
518
- check_count = (check_count + 1) % 30
519
- @master_timers << EM::Timer.new(stagger * check_count) do
506
+ check_count += 1
507
+ scheduling_delay = stagger * check_count % 30
508
+ @master_timers << EM::Timer.new(scheduling_delay) do
520
509
  interval = testing? ? 0.5 : check[:interval]
521
510
  @master_timers << EM::PeriodicTimer.new(interval) do
522
511
  unless check_subdued?(check, :publisher)
@@ -539,34 +528,32 @@ module Sensu
539
528
  @amq.queue('results').publish(payload.to_json)
540
529
  end
541
530
 
542
- def setup_keepalive_monitor
543
- @logger.debug('monitoring client keepalives')
544
- @master_timers << EM::PeriodicTimer.new(30) do
545
- @redis.smembers('clients').callback do |clients|
546
- clients.each do |client_name|
547
- @redis.get('client:' + client_name).callback do |client_json|
548
- client = JSON.parse(client_json, :symbolize_names => true)
549
- check = {
550
- :name => 'keepalive',
551
- :issued => Time.now.to_i
552
- }
553
- time_since_last_keepalive = Time.now.to_i - client[:timestamp]
554
- case
555
- when time_since_last_keepalive >= 180
556
- check[:output] = 'No keep-alive sent from client in over 180 seconds'
557
- check[:status] = 2
558
- publish_result(client, check)
559
- when time_since_last_keepalive >= 120
560
- check[:output] = 'No keep-alive sent from client in over 120 seconds'
561
- check[:status] = 1
562
- publish_result(client, check)
563
- else
564
- @redis.hexists('events:' + client[:name], 'keepalive').callback do |exists|
565
- if exists
566
- check[:output] = 'Keep-alive sent from client'
567
- check[:status] = 0
568
- publish_result(client, check)
569
- end
531
+ def determine_stale_clients
532
+ @logger.info('determining stale clients')
533
+ @redis.smembers('clients') do |clients|
534
+ clients.each do |client_name|
535
+ @redis.get('client:' + client_name) do |client_json|
536
+ client = JSON.parse(client_json, :symbolize_names => true)
537
+ check = {
538
+ :name => 'keepalive',
539
+ :issued => Time.now.to_i
540
+ }
541
+ time_since_last_keepalive = Time.now.to_i - client[:timestamp]
542
+ case
543
+ when time_since_last_keepalive >= 180
544
+ check[:output] = 'No keep-alive sent from client in over 180 seconds'
545
+ check[:status] = 2
546
+ publish_result(client, check)
547
+ when time_since_last_keepalive >= 120
548
+ check[:output] = 'No keep-alive sent from client in over 120 seconds'
549
+ check[:status] = 1
550
+ publish_result(client, check)
551
+ else
552
+ @redis.hexists('events:' + client[:name], 'keepalive') do |exists|
553
+ if exists
554
+ check[:output] = 'Keep-alive sent from client'
555
+ check[:status] = 0
556
+ publish_result(client, check)
570
557
  end
571
558
  end
572
559
  end
@@ -575,19 +562,25 @@ module Sensu
575
562
  end
576
563
  end
577
564
 
578
- def setup_aggregation_pruner
579
- @logger.debug('pruning aggregations')
580
- @master_timers << EM::PeriodicTimer.new(20) do
581
- @redis.smembers('aggregates').callback do |checks|
582
- checks.each do |check_name|
583
- @redis.smembers('aggregates:' + check_name).callback do |aggregates|
565
+ def setup_client_monitor
566
+ @logger.debug('monitoring clients')
567
+ @master_timers << EM::PeriodicTimer.new(30) do
568
+ determine_stale_clients
569
+ end
570
+ end
571
+
572
+ def prune_aggregations
573
+ @logger.info('pruning aggregations')
574
+ @redis.smembers('aggregates') do |checks|
575
+ checks.each do |check_name|
576
+ @redis.smembers('aggregates:' + check_name) do |aggregates|
577
+ if aggregates.size > 20
584
578
  aggregates.sort!
585
- until aggregates.size <= 20
586
- check_issued = aggregates.shift
587
- @redis.srem('aggregates:' + check_name, check_issued).callback do
579
+ aggregates.take(aggregates.size - 20).each do |check_issued|
580
+ @redis.srem('aggregates:' + check_name, check_issued) do
588
581
  result_set = check_name + ':' + check_issued.to_s
589
- @redis.del('aggregate:' + result_set).callback do
590
- @redis.del('aggregation:' + result_set).callback do
582
+ @redis.del('aggregate:' + result_set) do
583
+ @redis.del('aggregation:' + result_set) do
591
584
  @logger.debug('pruned aggregation', {
592
585
  :check => {
593
586
  :name => check_name,
@@ -604,22 +597,29 @@ module Sensu
604
597
  end
605
598
  end
606
599
 
600
+ def setup_aggregation_pruner
601
+ @logger.debug('pruning aggregations')
602
+ @master_timers << EM::PeriodicTimer.new(20) do
603
+ prune_aggregations
604
+ end
605
+ end
606
+
607
607
  def master_duties
608
608
  setup_publisher
609
- setup_keepalive_monitor
609
+ setup_client_monitor
610
610
  setup_aggregation_pruner
611
611
  end
612
612
 
613
613
  def request_master_election
614
- @redis.setnx('lock:master', Time.now.to_i).callback do |created|
614
+ @redis.setnx('lock:master', Time.now.to_i) do |created|
615
615
  if created
616
616
  @is_master = true
617
617
  @logger.info('i am the master')
618
618
  master_duties
619
619
  else
620
- @redis.get('lock:master').callback do |timestamp|
620
+ @redis.get('lock:master') do |timestamp|
621
621
  if Time.now.to_i - timestamp.to_i >= 60
622
- @redis.getset('lock:master', Time.now.to_i).callback do |previous|
622
+ @redis.getset('lock:master', Time.now.to_i) do |previous|
623
623
  if previous == timestamp
624
624
  @is_master = true
625
625
  @logger.info('i am now the master')
@@ -636,7 +636,7 @@ module Sensu
636
636
  request_master_election
637
637
  @timers << EM::PeriodicTimer.new(20) do
638
638
  if @is_master
639
- @redis.set('lock:master', Time.now.to_i).callback do
639
+ @redis.set('lock:master', Time.now.to_i) do
640
640
  @logger.debug('updated master lock timestamp')
641
641
  end
642
642
  elsif @rabbitmq.connected?
@@ -646,22 +646,25 @@ module Sensu
646
646
  end
647
647
 
648
648
  def resign_as_master(&block)
649
+ block ||= Proc.new {}
649
650
  if @is_master
650
651
  @logger.warn('resigning as master')
651
652
  @master_timers.each do |timer|
652
653
  timer.cancel
653
654
  end
654
655
  @master_timers = Array.new
655
- @redis.del('lock:master').callback do
656
- @logger.info('removed master lock')
657
- @is_master = false
656
+ if @redis.connected?
657
+ @redis.del('lock:master') do
658
+ @logger.info('removed master lock')
659
+ @is_master = false
660
+ end
658
661
  end
659
662
  timestamp = Time.now.to_i
660
663
  retry_until_true do
661
664
  if !@is_master
662
665
  block.call
663
666
  true
664
- elsif !@redis.connected? || Time.now.to_i - timestamp >= 5
667
+ elsif Time.now.to_i - timestamp >= 3
665
668
  @logger.warn('failed to remove master lock')
666
669
  @is_master = false
667
670
  block.call
@@ -707,66 +710,68 @@ module Sensu
707
710
  end
708
711
  end
709
712
 
710
- def start
711
- setup_redis
712
- setup_rabbitmq
713
+ def bootstrap
713
714
  setup_keepalives
714
715
  setup_results
715
716
  setup_master_monitor
717
+ @state = :running
716
718
  end
717
719
 
718
- def stop
719
- @logger.warn('stopping')
720
- @timers.each do |timer|
721
- timer.cancel
722
- end
723
- unsubscribe do
724
- resign_as_master do
725
- complete_handlers_in_progress do
726
- @rabbitmq.close
727
- @redis.close
728
- @logger.warn('stopping reactor')
729
- EM::stop_event_loop
720
+ def start
721
+ setup_redis
722
+ setup_rabbitmq
723
+ bootstrap
724
+ end
725
+
726
+ def pause(&block)
727
+ unless @state == :pausing || @state == :paused
728
+ @state = :pausing
729
+ @timers.each do |timer|
730
+ timer.cancel
731
+ end
732
+ @timers = Array.new
733
+ unsubscribe do
734
+ resign_as_master do
735
+ @state = :paused
736
+ if block
737
+ block.call
738
+ end
730
739
  end
731
740
  end
732
741
  end
733
742
  end
734
743
 
735
- def trap_signals
736
- %w[INT TERM].each do |signal|
737
- Signal.trap(signal) do
738
- @logger.warn('received signal', {
739
- :signal => signal
740
- })
741
- stop
744
+ def resume
745
+ retry_until_true(1) do
746
+ if @state == :paused
747
+ if @redis.connected? && @rabbitmq.connected?
748
+ bootstrap
749
+ true
750
+ end
742
751
  end
743
752
  end
744
753
  end
745
754
 
746
- private
747
-
748
- def testing?
749
- File.basename($0) == 'rake'
750
- end
751
-
752
- def retry_until_true(wait=0.5, &block)
753
- EM::Timer.new(wait) do
754
- unless block.call
755
- retry_until_true(wait, &block)
755
+ def stop
756
+ @logger.warn('stopping')
757
+ @state = :stopping
758
+ pause do
759
+ complete_handlers_in_progress do
760
+ @redis.close
761
+ @rabbitmq.close
762
+ @logger.warn('stopping reactor')
763
+ EM::stop_event_loop
756
764
  end
757
765
  end
758
766
  end
759
767
 
760
- def hash_values_equal?(hash_one, hash_two)
761
- hash_one.keys.all? do |key|
762
- if hash_one[key] == hash_two[key]
763
- true
764
- else
765
- if hash_one[key].is_a?(Hash) && hash_two[key].is_a?(Hash)
766
- hash_values_equal?(hash_one[key], hash_two[key])
767
- else
768
- false
769
- end
768
+ def trap_signals
769
+ %w[INT TERM].each do |signal|
770
+ Signal.trap(signal) do
771
+ @logger.warn('received signal', {
772
+ :signal => signal
773
+ })
774
+ stop
770
775
  end
771
776
  end
772
777
  end