sensu 0.9.9.beta.2 → 0.9.9.beta.3

Sign up to get free protection for your applications and to get access to all the features.
data/lib/sensu/server.rb CHANGED
@@ -4,7 +4,9 @@ require File.join(File.dirname(__FILE__), 'socket')
4
4
 
5
5
  module Sensu
6
6
  class Server
7
- attr_reader :redis, :amq, :is_master
7
+ include Utilities
8
+
9
+ attr_reader :is_master
8
10
 
9
11
  def self.run(options={})
10
12
  server = self.new(options)
@@ -15,9 +17,11 @@ module Sensu
15
17
  end
16
18
 
17
19
  def initialize(options={})
18
- @logger = Sensu::Logger.get
19
- base = Sensu::Base.new(options)
20
+ base = Base.new(options)
21
+ @logger = base.logger
20
22
  @settings = base.settings
23
+ @extensions = base.extensions
24
+ base.setup_process
21
25
  @timers = Array.new
22
26
  @master_timers = Array.new
23
27
  @handlers_in_progress_count = 0
@@ -28,65 +32,45 @@ module Sensu
28
32
  @logger.debug('connecting to redis', {
29
33
  :settings => @settings[:redis]
30
34
  })
31
- connection_failure = Proc.new do
32
- @logger.fatal('cannot connect to redis', {
33
- :settings => @settings[:redis]
35
+ @redis = Redis.connect(@settings[:redis])
36
+ @redis.on_error do |error|
37
+ @logger.fatal('redis connection error', {
38
+ :error => error.to_s
34
39
  })
35
- @logger.fatal('SENSU NOT RUNNING!')
36
- if @rabbitmq
37
- @rabbitmq.close
38
- end
39
- exit 2
40
+ stop
40
41
  end
41
- @redis = Sensu::Redis.connect(@settings[:redis], :on_tcp_connection_failure => connection_failure)
42
- @redis.on_tcp_connection_loss do
42
+ @redis.before_reconnect do
43
+ @logger.warn('reconnecting to redis')
43
44
  unless testing?
44
- @logger.fatal('redis connection closed')
45
- stop
45
+ pause
46
46
  end
47
47
  end
48
+ @redis.after_reconnect do
49
+ @logger.info('reconnected to redis')
50
+ resume
51
+ end
48
52
  end
49
53
 
50
54
  def setup_rabbitmq
51
55
  @logger.debug('connecting to rabbitmq', {
52
56
  :settings => @settings[:rabbitmq]
53
57
  })
54
- connection_failure = Proc.new do
55
- @logger.fatal('cannot connect to rabbitmq', {
56
- :settings => @settings[:rabbitmq]
58
+ @rabbitmq = RabbitMQ.connect(@settings[:rabbitmq])
59
+ @rabbitmq.on_error do |error|
60
+ @logger.fatal('rabbitmq connection error', {
61
+ :error => error.to_s
57
62
  })
58
- @logger.fatal('SENSU NOT RUNNING!')
59
- @redis.close
60
- exit 2
63
+ stop
61
64
  end
62
- @rabbitmq = AMQP.connect(@settings[:rabbitmq], {
63
- :on_tcp_connection_failure => connection_failure,
64
- :on_possible_authentication_failure => connection_failure
65
- })
66
- @rabbitmq.logger = Sensu::NullLogger.get
67
- @rabbitmq.on_tcp_connection_loss do |connection, settings|
68
- unless connection.reconnecting?
69
- @logger.warn('reconnecting to rabbitmq')
70
- resign_as_master do
71
- connection.periodically_reconnect(5)
72
- end
73
- end
65
+ @rabbitmq.before_reconnect do
66
+ @logger.warn('reconnecting to rabbitmq')
67
+ resign_as_master
74
68
  end
75
- @rabbitmq.on_skipped_heartbeats do
76
- @logger.warn('skipped rabbitmq heartbeat')
69
+ @rabbitmq.after_reconnect do
70
+ @logger.info('reconnected to rabbitmq')
77
71
  end
78
- @amq = AMQP::Channel.new(@rabbitmq)
79
- @amq.auto_recovery = true
72
+ @amq = @rabbitmq.channel
80
73
  @amq.prefetch(1)
81
- @amq.on_error do |channel, channel_close|
82
- @logger.fatal('rabbitmq channel closed', {
83
- :error => {
84
- :reply_code => channel_close.reply_code,
85
- :reply_text => channel_close.reply_text
86
- }
87
- })
88
- stop
89
- end
90
74
  end
91
75
 
92
76
  def setup_keepalives
@@ -97,8 +81,8 @@ module Sensu
97
81
  @logger.debug('received keepalive', {
98
82
  :client => client
99
83
  })
100
- @redis.set('client:' + client[:name], client.to_json).callback do
101
- @redis.sadd('clients', client[:name]).callback do
84
+ @redis.set('client:' + client[:name], client.to_json) do
85
+ @redis.sadd('clients', client[:name]) do
102
86
  header.ack
103
87
  end
104
88
  end
@@ -155,7 +139,7 @@ module Sensu
155
139
  def derive_handlers(handler_list, nested=false)
156
140
  handler_list.inject(Array.new) do |handlers, handler_name|
157
141
  if @settings.handler_exists?(handler_name)
158
- handler = @settings[:handlers][handler_name]
142
+ handler = @settings[:handlers][handler_name].merge(:name => handler_name)
159
143
  if handler[:type] == 'set'
160
144
  unless nested
161
145
  handlers = handlers + derive_handlers(handler[:handlers], true)
@@ -167,6 +151,9 @@ module Sensu
167
151
  else
168
152
  handlers.push(handler)
169
153
  end
154
+ elsif @extensions.handler_exists?(handler_name)
155
+ handler = @extensions[:handlers][handler_name]
156
+ handlers.push(handler)
170
157
  else
171
158
  @logger.error('unknown handler', {
172
159
  :handler => {
@@ -181,7 +168,7 @@ module Sensu
181
168
  def event_handlers(event)
182
169
  handler_list = Array((event[:check][:handlers] || event[:check][:handler]) || 'default')
183
170
  handlers = derive_handlers(handler_list)
184
- event_severity = Sensu::SEVERITIES[event[:check][:status]] || 'unknown'
171
+ event_severity = SEVERITIES[event[:check][:status]] || 'unknown'
185
172
  handlers.select do |handler|
186
173
  if event[:action] == :flapping && !handler[:handle_flapping]
187
174
  @logger.info('handler does not handle flapping events', {
@@ -233,7 +220,7 @@ module Sensu
233
220
  end
234
221
  execute = Proc.new do
235
222
  begin
236
- output, status = Sensu::IO.popen(command, 'r+') do |child|
223
+ output, status = IO.popen(command, 'r+') do |child|
237
224
  unless data.nil?
238
225
  child.write(data.to_s)
239
226
  end
@@ -254,41 +241,39 @@ module Sensu
254
241
  end
255
242
 
256
243
  def mutate_event_data(mutator_name, event, &block)
257
- case mutator_name
258
- when nil
244
+ on_error = Proc.new do |error|
245
+ @logger.error('mutator error', {
246
+ :event => event,
247
+ :mutator => mutator,
248
+ :error => error.to_s
249
+ })
250
+ end
251
+ case
252
+ when mutator_name.nil?
259
253
  block.call(event.to_json)
260
- when /^only_check_output/
261
- mutated = case mutator_name
262
- when /split$/
263
- event[:check][:output].split(/\n+/)
264
- else
265
- event[:check][:output]
266
- end
267
- block.call(mutated)
268
- else
269
- if @settings.mutator_exists?(mutator_name)
270
- mutator = @settings[:mutators][mutator_name]
271
- on_error = Proc.new do |error|
272
- @logger.error('mutator error', {
273
- :event => event,
274
- :mutator => mutator,
275
- :error => error.to_s
276
- })
254
+ when @settings.mutator_exists?(mutator_name)
255
+ mutator = @settings[:mutators][mutator_name]
256
+ execute_command(mutator[:command], event.to_json, on_error) do |output, status|
257
+ if status == 0
258
+ block.call(output)
259
+ else
260
+ on_error.call('non-zero exit status (' + status + '): ' + output)
277
261
  end
278
- execute_command(mutator[:command], event.to_json, on_error) do |output, status|
279
- if status == 0
280
- block.call(output)
281
- else
282
- on_error.call('non-zero exit status (' + status + '): ' + output)
283
- end
262
+ end
263
+ when @extensions.mutator_exists?(mutator_name)
264
+ @extensions[:mutators][mutator_name].run(event) do |output, status|
265
+ if status == 0
266
+ block.call(output)
267
+ else
268
+ on_error.call('non-zero exit status (' + status + '): ' + output)
284
269
  end
285
- else
286
- @logger.error('unknown mutator', {
287
- :mutator => {
288
- :name => mutator_name
289
- }
290
- })
291
270
  end
271
+ else
272
+ @logger.error('unknown mutator', {
273
+ :mutator => {
274
+ :name => mutator_name
275
+ }
276
+ })
292
277
  end
293
278
  end
294
279
 
@@ -320,7 +305,7 @@ module Sensu
320
305
  end
321
306
  when 'tcp'
322
307
  begin
323
- EM::connect(handler[:socket][:host], handler[:socket][:port], Sensu::SocketHandler) do |socket|
308
+ EM::connect(handler[:socket][:host], handler[:socket][:port], SocketHandler) do |socket|
324
309
  socket.on_success = Proc.new do
325
310
  @handlers_in_progress_count -= 1
326
311
  end
@@ -350,13 +335,17 @@ module Sensu
350
335
  exchange_options = handler[:exchange].reject do |key, value|
351
336
  [:name, :type].include?(key)
352
337
  end
353
- payloads = Array(event_data)
354
- payloads.each do |payload|
355
- unless payload.empty?
356
- @amq.method(exchange_type).call(exchange_name, exchange_options).publish(payload)
357
- end
338
+ unless event_data.empty?
339
+ @amq.method(exchange_type).call(exchange_name, exchange_options).publish(event_data)
358
340
  end
359
341
  @handlers_in_progress_count -= 1
342
+ when 'extension'
343
+ handler.run(event_data) do |output, status|
344
+ output.split(/\n+/).each do |line|
345
+ @logger.info(line)
346
+ end
347
+ @handlers_in_progress_count -= 1
348
+ end
360
349
  end
361
350
  end
362
351
  end
@@ -371,15 +360,14 @@ module Sensu
371
360
  @redis.hset('aggregation:' + result_set, result[:client], {
372
361
  :output => check[:output],
373
362
  :status => check[:status]
374
- }.to_json).callback do
375
- statuses = Sensu::SEVERITIES
376
- statuses.each do |status|
377
- @redis.hsetnx('aggregate:' + result_set, status, 0)
378
- end
379
- status = (statuses[check[:status]] || 'unknown')
380
- @redis.hincrby('aggregate:' + result_set, status, 1).callback do
381
- @redis.hincrby('aggregate:' + result_set, 'total', 1).callback do
382
- @redis.sadd('aggregates:' + check[:name], check[:issued]).callback do
363
+ }.to_json) do
364
+ SEVERITIES.each do |severity|
365
+ @redis.hsetnx('aggregate:' + result_set, severity, 0)
366
+ end
367
+ severity = (SEVERITIES[check[:status]] || 'unknown')
368
+ @redis.hincrby('aggregate:' + result_set, severity, 1) do
369
+ @redis.hincrby('aggregate:' + result_set, 'total', 1) do
370
+ @redis.sadd('aggregates:' + check[:name], check[:issued]) do
383
371
  @redis.sadd('aggregates', check[:name])
384
372
  end
385
373
  end
@@ -391,7 +379,7 @@ module Sensu
391
379
  @logger.debug('processing result', {
392
380
  :result => result
393
381
  })
394
- @redis.get('client:' + result[:client]).callback do |client_json|
382
+ @redis.get('client:' + result[:client]) do |client_json|
395
383
  unless client_json.nil?
396
384
  client = JSON.parse(client_json, :symbolize_names => true)
397
385
  check = case
@@ -405,11 +393,11 @@ module Sensu
405
393
  end
406
394
  @redis.sadd('history:' + client[:name], check[:name])
407
395
  history_key = 'history:' + client[:name] + ':' + check[:name]
408
- @redis.rpush(history_key, check[:status]).callback do
409
- @redis.lrange(history_key, -21, -1).callback do |history|
396
+ @redis.rpush(history_key, check[:status]) do
397
+ @redis.lrange(history_key, -21, -1) do |history|
410
398
  check[:history] = history
411
399
  total_state_change = 0
412
- unless history.count < 21
400
+ unless history.size < 21
413
401
  state_changes = 0
414
402
  change_weight = 0.8
415
403
  previous_status = history.first
@@ -423,7 +411,7 @@ module Sensu
423
411
  total_state_change = (state_changes.fdiv(20) * 100).to_i
424
412
  @redis.ltrim(history_key, -21, -1)
425
413
  end
426
- @redis.hget('events:' + client[:name], check[:name]).callback do |event_json|
414
+ @redis.hget('events:' + client[:name], check[:name]) do |event_json|
427
415
  previous_occurrence = event_json ? JSON.parse(event_json, :symbolize_names => true) : false
428
416
  is_flapping = false
429
417
  if check.has_key?(:low_flap_threshold) && check.has_key?(:high_flap_threshold)
@@ -453,7 +441,7 @@ module Sensu
453
441
  :handlers => Array((check[:handlers] || check[:handler]) || 'default'),
454
442
  :flapping => is_flapping,
455
443
  :occurrences => event[:occurrences]
456
- }.to_json).callback do
444
+ }.to_json) do
457
445
  unless check[:handle] == false
458
446
  event[:action] = is_flapping ? :flapping : :create
459
447
  handle_event(event)
@@ -461,7 +449,7 @@ module Sensu
461
449
  end
462
450
  elsif previous_occurrence
463
451
  unless check[:auto_resolve] == false && !check[:force_resolve]
464
- @redis.hdel('events:' + client[:name], check[:name]).callback do
452
+ @redis.hdel('events:' + client[:name], check[:name]) do
465
453
  unless check[:handle] == false
466
454
  event[:occurrences] = previous_occurrence[:occurrences]
467
455
  event[:action] = :resolve
@@ -515,8 +503,9 @@ module Sensu
515
503
  stagger = testing? ? 0 : 2
516
504
  @settings.checks.each do |check|
517
505
  unless check[:publish] == false || check[:standalone]
518
- check_count = (check_count + 1) % 30
519
- @master_timers << EM::Timer.new(stagger * check_count) do
506
+ check_count += 1
507
+ scheduling_delay = stagger * check_count % 30
508
+ @master_timers << EM::Timer.new(scheduling_delay) do
520
509
  interval = testing? ? 0.5 : check[:interval]
521
510
  @master_timers << EM::PeriodicTimer.new(interval) do
522
511
  unless check_subdued?(check, :publisher)
@@ -539,34 +528,32 @@ module Sensu
539
528
  @amq.queue('results').publish(payload.to_json)
540
529
  end
541
530
 
542
- def setup_keepalive_monitor
543
- @logger.debug('monitoring client keepalives')
544
- @master_timers << EM::PeriodicTimer.new(30) do
545
- @redis.smembers('clients').callback do |clients|
546
- clients.each do |client_name|
547
- @redis.get('client:' + client_name).callback do |client_json|
548
- client = JSON.parse(client_json, :symbolize_names => true)
549
- check = {
550
- :name => 'keepalive',
551
- :issued => Time.now.to_i
552
- }
553
- time_since_last_keepalive = Time.now.to_i - client[:timestamp]
554
- case
555
- when time_since_last_keepalive >= 180
556
- check[:output] = 'No keep-alive sent from client in over 180 seconds'
557
- check[:status] = 2
558
- publish_result(client, check)
559
- when time_since_last_keepalive >= 120
560
- check[:output] = 'No keep-alive sent from client in over 120 seconds'
561
- check[:status] = 1
562
- publish_result(client, check)
563
- else
564
- @redis.hexists('events:' + client[:name], 'keepalive').callback do |exists|
565
- if exists
566
- check[:output] = 'Keep-alive sent from client'
567
- check[:status] = 0
568
- publish_result(client, check)
569
- end
531
+ def determine_stale_clients
532
+ @logger.info('determining stale clients')
533
+ @redis.smembers('clients') do |clients|
534
+ clients.each do |client_name|
535
+ @redis.get('client:' + client_name) do |client_json|
536
+ client = JSON.parse(client_json, :symbolize_names => true)
537
+ check = {
538
+ :name => 'keepalive',
539
+ :issued => Time.now.to_i
540
+ }
541
+ time_since_last_keepalive = Time.now.to_i - client[:timestamp]
542
+ case
543
+ when time_since_last_keepalive >= 180
544
+ check[:output] = 'No keep-alive sent from client in over 180 seconds'
545
+ check[:status] = 2
546
+ publish_result(client, check)
547
+ when time_since_last_keepalive >= 120
548
+ check[:output] = 'No keep-alive sent from client in over 120 seconds'
549
+ check[:status] = 1
550
+ publish_result(client, check)
551
+ else
552
+ @redis.hexists('events:' + client[:name], 'keepalive') do |exists|
553
+ if exists
554
+ check[:output] = 'Keep-alive sent from client'
555
+ check[:status] = 0
556
+ publish_result(client, check)
570
557
  end
571
558
  end
572
559
  end
@@ -575,19 +562,25 @@ module Sensu
575
562
  end
576
563
  end
577
564
 
578
- def setup_aggregation_pruner
579
- @logger.debug('pruning aggregations')
580
- @master_timers << EM::PeriodicTimer.new(20) do
581
- @redis.smembers('aggregates').callback do |checks|
582
- checks.each do |check_name|
583
- @redis.smembers('aggregates:' + check_name).callback do |aggregates|
565
+ def setup_client_monitor
566
+ @logger.debug('monitoring clients')
567
+ @master_timers << EM::PeriodicTimer.new(30) do
568
+ determine_stale_clients
569
+ end
570
+ end
571
+
572
+ def prune_aggregations
573
+ @logger.info('pruning aggregations')
574
+ @redis.smembers('aggregates') do |checks|
575
+ checks.each do |check_name|
576
+ @redis.smembers('aggregates:' + check_name) do |aggregates|
577
+ if aggregates.size > 20
584
578
  aggregates.sort!
585
- until aggregates.size <= 20
586
- check_issued = aggregates.shift
587
- @redis.srem('aggregates:' + check_name, check_issued).callback do
579
+ aggregates.take(aggregates.size - 20).each do |check_issued|
580
+ @redis.srem('aggregates:' + check_name, check_issued) do
588
581
  result_set = check_name + ':' + check_issued.to_s
589
- @redis.del('aggregate:' + result_set).callback do
590
- @redis.del('aggregation:' + result_set).callback do
582
+ @redis.del('aggregate:' + result_set) do
583
+ @redis.del('aggregation:' + result_set) do
591
584
  @logger.debug('pruned aggregation', {
592
585
  :check => {
593
586
  :name => check_name,
@@ -604,22 +597,29 @@ module Sensu
604
597
  end
605
598
  end
606
599
 
600
+ def setup_aggregation_pruner
601
+ @logger.debug('pruning aggregations')
602
+ @master_timers << EM::PeriodicTimer.new(20) do
603
+ prune_aggregations
604
+ end
605
+ end
606
+
607
607
  def master_duties
608
608
  setup_publisher
609
- setup_keepalive_monitor
609
+ setup_client_monitor
610
610
  setup_aggregation_pruner
611
611
  end
612
612
 
613
613
  def request_master_election
614
- @redis.setnx('lock:master', Time.now.to_i).callback do |created|
614
+ @redis.setnx('lock:master', Time.now.to_i) do |created|
615
615
  if created
616
616
  @is_master = true
617
617
  @logger.info('i am the master')
618
618
  master_duties
619
619
  else
620
- @redis.get('lock:master').callback do |timestamp|
620
+ @redis.get('lock:master') do |timestamp|
621
621
  if Time.now.to_i - timestamp.to_i >= 60
622
- @redis.getset('lock:master', Time.now.to_i).callback do |previous|
622
+ @redis.getset('lock:master', Time.now.to_i) do |previous|
623
623
  if previous == timestamp
624
624
  @is_master = true
625
625
  @logger.info('i am now the master')
@@ -636,7 +636,7 @@ module Sensu
636
636
  request_master_election
637
637
  @timers << EM::PeriodicTimer.new(20) do
638
638
  if @is_master
639
- @redis.set('lock:master', Time.now.to_i).callback do
639
+ @redis.set('lock:master', Time.now.to_i) do
640
640
  @logger.debug('updated master lock timestamp')
641
641
  end
642
642
  elsif @rabbitmq.connected?
@@ -646,22 +646,25 @@ module Sensu
646
646
  end
647
647
 
648
648
  def resign_as_master(&block)
649
+ block ||= Proc.new {}
649
650
  if @is_master
650
651
  @logger.warn('resigning as master')
651
652
  @master_timers.each do |timer|
652
653
  timer.cancel
653
654
  end
654
655
  @master_timers = Array.new
655
- @redis.del('lock:master').callback do
656
- @logger.info('removed master lock')
657
- @is_master = false
656
+ if @redis.connected?
657
+ @redis.del('lock:master') do
658
+ @logger.info('removed master lock')
659
+ @is_master = false
660
+ end
658
661
  end
659
662
  timestamp = Time.now.to_i
660
663
  retry_until_true do
661
664
  if !@is_master
662
665
  block.call
663
666
  true
664
- elsif !@redis.connected? || Time.now.to_i - timestamp >= 5
667
+ elsif Time.now.to_i - timestamp >= 3
665
668
  @logger.warn('failed to remove master lock')
666
669
  @is_master = false
667
670
  block.call
@@ -707,66 +710,68 @@ module Sensu
707
710
  end
708
711
  end
709
712
 
710
- def start
711
- setup_redis
712
- setup_rabbitmq
713
+ def bootstrap
713
714
  setup_keepalives
714
715
  setup_results
715
716
  setup_master_monitor
717
+ @state = :running
716
718
  end
717
719
 
718
- def stop
719
- @logger.warn('stopping')
720
- @timers.each do |timer|
721
- timer.cancel
722
- end
723
- unsubscribe do
724
- resign_as_master do
725
- complete_handlers_in_progress do
726
- @rabbitmq.close
727
- @redis.close
728
- @logger.warn('stopping reactor')
729
- EM::stop_event_loop
720
+ def start
721
+ setup_redis
722
+ setup_rabbitmq
723
+ bootstrap
724
+ end
725
+
726
+ def pause(&block)
727
+ unless @state == :pausing || @state == :paused
728
+ @state = :pausing
729
+ @timers.each do |timer|
730
+ timer.cancel
731
+ end
732
+ @timers = Array.new
733
+ unsubscribe do
734
+ resign_as_master do
735
+ @state = :paused
736
+ if block
737
+ block.call
738
+ end
730
739
  end
731
740
  end
732
741
  end
733
742
  end
734
743
 
735
- def trap_signals
736
- %w[INT TERM].each do |signal|
737
- Signal.trap(signal) do
738
- @logger.warn('received signal', {
739
- :signal => signal
740
- })
741
- stop
744
+ def resume
745
+ retry_until_true(1) do
746
+ if @state == :paused
747
+ if @redis.connected? && @rabbitmq.connected?
748
+ bootstrap
749
+ true
750
+ end
742
751
  end
743
752
  end
744
753
  end
745
754
 
746
- private
747
-
748
- def testing?
749
- File.basename($0) == 'rake'
750
- end
751
-
752
- def retry_until_true(wait=0.5, &block)
753
- EM::Timer.new(wait) do
754
- unless block.call
755
- retry_until_true(wait, &block)
755
+ def stop
756
+ @logger.warn('stopping')
757
+ @state = :stopping
758
+ pause do
759
+ complete_handlers_in_progress do
760
+ @redis.close
761
+ @rabbitmq.close
762
+ @logger.warn('stopping reactor')
763
+ EM::stop_event_loop
756
764
  end
757
765
  end
758
766
  end
759
767
 
760
- def hash_values_equal?(hash_one, hash_two)
761
- hash_one.keys.all? do |key|
762
- if hash_one[key] == hash_two[key]
763
- true
764
- else
765
- if hash_one[key].is_a?(Hash) && hash_two[key].is_a?(Hash)
766
- hash_values_equal?(hash_one[key], hash_two[key])
767
- else
768
- false
769
- end
768
+ def trap_signals
769
+ %w[INT TERM].each do |signal|
770
+ Signal.trap(signal) do
771
+ @logger.warn('received signal', {
772
+ :signal => signal
773
+ })
774
+ stop
770
775
  end
771
776
  end
772
777
  end