sensu 0.17.0.beta → 0.17.0.beta.1

Sign up to get free protection for your applications and to get access to all the features.
data/lib/sensu/server.rb DELETED
@@ -1,772 +0,0 @@
1
- require 'sensu/daemon'
2
- require 'sensu/socket'
3
- require 'sensu/sandbox'
4
-
5
- module Sensu
6
- class Server
7
- include Daemon
8
-
9
- attr_reader :is_master
10
-
11
# Build a server from +options+ and drive it inside an EventMachine
# reactor: the server is started first, then its signal traps are set up.
#
# @param options [Hash] handed unchanged to the constructor.
def self.run(options={})
  instance = new(options)
  EM.run do
    instance.start
    instance.setup_signal_traps
  end
end
18
-
19
# Initialise server state on top of whatever the parent initialiser sets
# up: the server starts out as a non-master with an empty list of master
# timers and no handlers in progress.
#
# @param options [Hash] forwarded implicitly to +super+.
def initialize(options={})
  super
  # NOTE(review): @timers is expected to have been created by +super+ —
  # it is not assigned anywhere in this method.
  @handlers_in_progress_count = 0
  @timers[:master] = []
  @is_master = false
end
25
-
26
# Subscribe to the 'keepalives' pipe and persist every client keepalive.
#
# For each message the JSON payload is parsed, stored under
# 'client:<name>' in Redis, the client name is added to the 'clients'
# set, and only then is the message acknowledged.
#
# Payloads that cannot be parsed, or that parse to something other than
# a Hash with a String :name, are logged and acknowledged without being
# stored. (Previously a payload without a usable name raised an
# unrescued error from the string concatenation and the message was
# never acknowledged.)
def setup_keepalives
  @logger.debug('subscribing to keepalives')
  @transport.subscribe(:direct, 'keepalives', 'keepalives', :ack => true) do |message_info, message|
    begin
      client = MultiJson.load(message)
      @logger.debug('received keepalive', {
        :client => client
      })
      if client.is_a?(Hash) && client[:name].is_a?(String)
        @redis.set('client:' + client[:name], MultiJson.dump(client)) do
          @redis.sadd('clients', client[:name]) do
            # Acknowledge only once both Redis writes have completed.
            @transport.ack(message_info)
          end
        end
      else
        @logger.error('invalid keepalive payload', {
          :message => message,
          :error => 'client must be a hash with a string name'
        })
        # Acknowledge so the unusable message is not left unacked.
        @transport.ack(message_info)
      end
    rescue MultiJson::ParseError => error
      @logger.error('failed to parse keepalive payload', {
        :message => message,
        :error => error.to_s
      })
      @transport.ack(message_info)
    end
  end
end
48
-
49
# Decide whether an action is currently subdued by +condition+.
#
# The condition hash may contain:
# * :begin / :end  - time-of-day strings; the action is subdued while
#   +now+ falls inside the window. When :end is earlier than :begin the
#   window is treated as wrapping around midnight.
# * :days          - weekday names (any case); the action is subdued on
#   those days.
# * :exceptions    - an array of {:begin, :end} hashes; if +now+ falls
#   inside any of them an otherwise subdued action is not subdued.
#
# A single +now+ snapshot is used for every comparison and as the
# reference date for parsing, so all checks see the same instant.
# (Previously Time.now was sampled separately for each comparison.)
#
# @param condition [Hash] subdue definition.
# @param now [Time] the instant to evaluate; defaults to the current time.
# @return [Boolean]
def action_subdued?(condition, now=Time.now)
  subdued = false
  if condition.has_key?(:begin) && condition.has_key?(:end)
    begin_time = Time.parse(condition[:begin], now)
    end_time = Time.parse(condition[:end], now)
    if end_time < begin_time
      # Window wraps midnight: evaluate only the part that lies on the
      # current day.
      if now < end_time
        begin_time = Time.parse('12:00:00 AM', now)
      else
        end_time = Time.parse('11:59:59 PM', now)
      end
    end
    subdued = true if now >= begin_time && now <= end_time
  end
  if condition.has_key?(:days)
    days = condition[:days].map(&:downcase)
    subdued = true if days.include?(now.strftime('%A').downcase)
  end
  if subdued && condition.has_key?(:exceptions)
    subdued = condition[:exceptions].none? do |exception|
      now >= Time.parse(exception[:begin], now) && now <= Time.parse(exception[:end], now)
    end
  end
  subdued
end
78
-
79
# Decide whether +handler+ is subdued for +check+.
#
# The handler's own :subdue definition is consulted when present. The
# check's :subdue definition is consulted as well unless its :at value is
# 'publisher'. Both are evaluated; either one being subdued is enough.
#
# @return [Boolean]
def handler_subdued?(handler, check)
  by_handler = handler[:subdue] ? action_subdued?(handler[:subdue]) : false
  by_check = false
  check_subdue = check[:subdue]
  if check_subdue && check_subdue[:at] != 'publisher'
    by_check = action_subdued?(check_subdue)
  end
  [by_handler, by_check].any?
end
89
-
90
# Check that every attribute in +hash_one+ is matched by +hash_two+.
#
# For each key of +hash_one+ the first applicable rule wins:
# 1. the two values are equal;
# 2. both values are hashes and match recursively;
# 3. the values have equal string representations;
# 4. the expected value is a String starting with 'eval:' - the rest of
#    the string is evaluated through Sandbox.eval with the actual value,
#    and the truthiness of the result decides. An evaluation error is
#    logged and counts as no match.
# Anything else is a mismatch.
#
# The 'eval:' prefix is removed once, anchored to the very start of the
# string, so a multi-line expression keeps any later line that happens to
# begin with 'eval:'. (The previous line-anchored, global substitution
# stripped those as well.)
#
# @param hash_one [Hash] expected attributes.
# @param hash_two [Hash] data to compare against.
# @return [Boolean]
def filter_attributes_match?(hash_one, hash_two)
  hash_one.keys.all? do |key|
    expected = hash_one[key]
    actual = hash_two[key]
    if expected == actual
      true
    elsif expected.is_a?(Hash) && actual.is_a?(Hash)
      filter_attributes_match?(expected, actual)
    elsif expected.to_s == actual.to_s
      true
    elsif expected.is_a?(String) && expected.start_with?('eval:')
      begin
        expression = expected.sub(/\Aeval:\s*/, '')
        !!Sandbox.eval(expression, actual)
      rescue => error
        @logger.error('filter eval error', {
          :attributes => [hash_one, hash_two],
          :error => error.to_s
        })
        false
      end
    else
      false
    end
  end
end
115
-
116
# Decide whether +event+ is filtered out by the named filter.
#
# An unknown filter is logged and never filters. For a known filter the
# event is filtered when its attributes do NOT match the filter
# attributes; a filter with :negate set inverts that, filtering events
# that DO match.
#
# @return [Boolean] true when the event should be dropped.
def event_filtered?(filter_name, event)
  unless @settings.filter_exists?(filter_name)
    @logger.error('unknown filter', {
      :filter_name => filter_name
    })
    return false
  end
  definition = @settings[:filters][filter_name]
  attributes_match = filter_attributes_match?(definition[:attributes], event)
  if definition[:negate]
    attributes_match
  else
    !attributes_match
  end
end
128
-
129
# Expand a list of handler names into handler definitions.
#
# nil entries are skipped. A name defined in the settings yields its
# definition merged with :name; a handler of type 'set' is expanded
# recursively into its member handlers, but only while +depth+ is below
# 2 - deeper sets are logged and dropped. A name that is only known as a
# handler extension yields the extension object. Unknown names are
# logged. Duplicates are removed, keeping first occurrences.
#
# @param handler_list [Array] handler names.
# @param depth [Integer] current handler-set nesting level.
# @return [Array] handler definitions and/or handler extensions.
def derive_handlers(handler_list, depth=0)
  derived = []
  handler_list.compact.each do |handler_name|
    if @settings.handler_exists?(handler_name)
      definition = @settings[:handlers][handler_name].merge(:name => handler_name)
      if definition[:type] != 'set'
        derived << definition
      elsif depth < 2
        derived.concat(derive_handlers(definition[:handlers], depth + 1))
      else
        @logger.error('handler sets cannot be deeply nested', {
          :handler => definition
        })
      end
    elsif @extensions.handler_exists?(handler_name)
      derived << @extensions[:handlers][handler_name]
    else
      @logger.error('unknown handler', {
        :handler_name => handler_name
      })
    end
  end
  derived.uniq
end
154
-
155
# Select the handlers that should run for +event+.
#
# Handler names come from the check's :handlers, else its :handler, else
# 'default'. A derived handler is dropped (with a log entry) when:
# * the event action is :flapping and the handler does not set
#   :handle_flapping;
# * the handler is subdued for the check;
# * the handler lists :severities and the event severity is not among
#   them. For :resolve events the history before the latest entry is
#   walked backwards up to the first OK (0) status, and any status in
#   that run having a listed severity is enough;
# * any of the handler's filters filters the event.
#
# @return [Array] the handlers that passed every test.
def event_handlers(event)
  requested = Array((event[:check][:handlers] || event[:check][:handler]) || 'default')
  derive_handlers(requested).select do |handler|
    if event[:action] == :flapping && !handler[:handle_flapping]
      @logger.info('handler does not handle flapping events', {
        :event => event,
        :handler => handler
      })
      next false
    end
    if handler_subdued?(handler, event[:check])
      @logger.info('handler is subdued', {
        :event => event,
        :handler => handler
      })
      next false
    end
    if handler.has_key?(:severities)
      handled = if event[:action] == :resolve
        # Statuses preceding the latest one, newest first, up to (not
        # including) the first OK status.
        prior = event[:check][:history].reverse[1..-1]
        prior.take_while { |status| status.to_i != 0 }.any? do |status|
          handler[:severities].include?(SEVERITIES[status.to_i] || 'unknown')
        end
      else
        handler[:severities].include?(SEVERITIES[event[:check][:status]] || 'unknown')
      end
      unless handled
        @logger.debug('handler does not handle event severity', {
          :event => event,
          :handler => handler
        })
        next false
      end
    end
    if handler.has_key?(:filters) || handler.has_key?(:filter)
      filter_names = Array(handler[:filters] || handler[:filter])
      if filter_names.any? { |filter_name| event_filtered?(filter_name, event) }
        @logger.info('event filtered for handler', {
          :event => event,
          :handler => handler
        })
        next false
      end
    end
    true
  end
end
211
-
212
# Run +event+ through a mutator and hand the result to +block+.
#
# +mutator_name+ falls back to 'json' when nil. A mutator defined in the
# settings is spawned as a command that receives the JSON-encoded event
# as data; otherwise a mutator extension of that name is run. The block
# is called with the mutator output only when the reported status is 0.
# On any other status, or when the mutator is unknown, the problem is
# logged and @handlers_in_progress_count is decremented instead.
def mutate_event_data(mutator_name, event, &block)
  mutator_name ||= 'json'
  on_complete = Proc.new do |output, status|
    if status != 0
      @logger.error('mutator error', {
        :event => event,
        :output => output,
        :status => status
      })
      @handlers_in_progress_count -= 1
    else
      block.dup.call(output)
    end
  end
  @logger.debug('mutating event data', {
    :event => event,
    :mutator_name => mutator_name
  })
  if @settings.mutator_exists?(mutator_name)
    definition = @settings[:mutators][mutator_name]
    spawn_options = {:data => MultiJson.dump(event), :timeout => definition[:timeout]}
    Spawn.process(definition[:command], spawn_options, &on_complete)
  elsif @extensions.mutator_exists?(mutator_name)
    @extensions[:mutators][mutator_name].safe_run(event, &on_complete)
  else
    @logger.error('unknown mutator', {
      :mutator_name => mutator_name
    })
    @handlers_in_progress_count -= 1
  end
end
245
-
246
# Dispatch +event+ to every handler selected for it.
#
# For each handler the event is logged (at debug level for checks of
# type 'metric', info otherwise), @handlers_in_progress_count is
# incremented, the event data is mutated with the handler's mutator and
# the result is delivered according to the handler :type:
# * 'pipe'      - spawned command fed with the event data;
# * 'tcp'       - written to the handler socket via EM::connect;
# * 'udp'       - sent as a datagram to the handler socket;
# * 'transport' - published to the handler pipe (skipped when empty);
# * 'extension' - passed to the extension's safe_run.
# Each of these paths decrements the counter when it finishes or fails.
# NOTE(review): a :type not listed above is not dispatched and the
# counter is not decremented for it.
def handle_event(event)
  event_handlers(event).each do |handler|
    level = event[:check][:type] == 'metric' ? :debug : :info
    described = handler.respond_to?(:definition) ? handler.definition : handler
    @logger.send(level, 'handling event', {
      :event => event,
      :handler => described
    })
    @handlers_in_progress_count += 1
    on_error = Proc.new do |error|
      @logger.error('handler error', {
        :event => event,
        :handler => handler,
        :error => error.to_s
      })
      @handlers_in_progress_count -= 1
    end
    mutate_event_data(handler[:mutator], event) do |event_data|
      case handler[:type]
      when 'pipe'
        spawn_options = {:data => event_data, :timeout => handler[:timeout]}
        Spawn.process(handler[:command], spawn_options) do |output, status|
          @logger.info('handler output', {
            :handler => handler,
            :output => output.lines,
            :event_id => event[:id]
          })
          @handlers_in_progress_count -= 1
        end
      when 'tcp'
        begin
          EM::connect(handler[:socket][:host], handler[:socket][:port], SocketHandler) do |connection|
            connection.on_success = Proc.new do
              @handlers_in_progress_count -= 1
            end
            connection.on_error = on_error
            limit = handler[:timeout] || 10
            connection.pending_connect_timeout = limit
            connection.comm_inactivity_timeout = limit
            connection.send_data(event_data.to_s)
            connection.close_connection_after_writing
          end
        rescue => error
          on_error.call(error)
        end
      when 'udp'
        begin
          EM::open_datagram_socket('0.0.0.0', 0, nil) do |datagram_socket|
            datagram_socket.send_datagram(event_data.to_s, handler[:socket][:host], handler[:socket][:port])
            datagram_socket.close_connection_after_writing
            @handlers_in_progress_count -= 1
          end
        rescue => error
          on_error.call(error)
        end
      when 'transport'
        unless event_data.empty?
          pipe = handler[:pipe]
          pipe_options = pipe[:options] || Hash.new
          @transport.publish(pipe[:type].to_sym, pipe[:name], event_data, pipe_options) do |info|
            if info[:error]
              @logger.fatal('failed to publish event data to the transport', {
                :pipe => pipe,
                :payload => event_data,
                :error => info[:error].to_s
              })
            end
          end
        end
        @handlers_in_progress_count -= 1
      when 'extension'
        handler.safe_run(event_data) do |output, status|
          @logger.info('handler extension output', {
            :extension => handler.definition,
            :output => output,
            :event_id => event[:id]
          })
          @handlers_in_progress_count -= 1
        end
      end
    end
  end
end
328
-
329
# Record a check result in its Redis aggregate.
#
# The result set is identified by '<check name>:<issued>'. The client's
# output and status are stored in the 'aggregation:<set>' hash; then the
# 'aggregate:<set>' hash gets a zero counter for every severity that has
# none yet, the counter for this result's severity ('unknown' when the
# status has no severity) and the 'total' counter are incremented, and
# the set is registered under 'aggregates:<check name>' and 'aggregates'.
def aggregate_result(result)
  @logger.debug('adding result to aggregate', {
    :result => result
  })
  check = result[:check]
  set_id = check[:name] + ':' + check[:issued].to_s
  aggregate_key = 'aggregate:' + set_id
  summary = {:output => check[:output], :status => check[:status]}
  @redis.hset('aggregation:' + set_id, result[:client], MultiJson.dump(summary)) do
    SEVERITIES.each { |name| @redis.hsetnx(aggregate_key, name, 0) }
    counted = SEVERITIES[check[:status]] || 'unknown'
    @redis.hincrby(aggregate_key, counted, 1) do
      @redis.hincrby(aggregate_key, 'total', 1) do
        @redis.sadd('aggregates:' + check[:name], check[:issued]) do
          @redis.sadd('aggregates', check[:name])
        end
      end
    end
  end
end
352
-
353
# Pass +event+ to every bridge extension and log each line of whatever
# output a bridge reports, at debug level.
def event_bridges(event)
  @extensions[:bridges].each do |_name, bridge|
    bridge.safe_run(event) do |output, _status|
      output.each_line do |line|
        @logger.debug('bridge extension output', {:extension => bridge.definition, :output => line})
      end
    end
  end
end
365
-
366
# Process one check result: update history, detect flapping and create,
# update or resolve the corresponding event.
#
# Nothing happens unless the client is known in Redis. The check is the
# result's check, merged over the configured definition when one exists
# and the result is not standalone. Steps, in order:
# * aggregate the result when the check has :aggregate;
# * append the status to the client/check history and store the
#   execution time;
# * read the last 21 history entries; once 21 are available compute a
#   weighted state-change percentage (later changes weigh more) and trim
#   the history to 21 entries;
# * with both flap thresholds set, decide flapping from the percentage
#   and whether the previous event was flapping;
# * non-zero status or flapping: store the event (:create or :flapping),
#   counting occurrences when the status repeats, and handle it unless
#   :handle is false;
# * otherwise, with a previous event: mark :resolve and, unless
#   :auto_resolve is false without :force_resolve, delete the stored
#   event and handle it unless :handle is false;
# * otherwise, for checks of type 'metric': handle the event;
# * finally pass the event to the bridge extensions.
def process_result(result)
  @logger.debug('processing result', {
    :result => result
  })
  @redis.get('client:' + result[:client]) do |client_json|
    next if client_json.nil?
    client = MultiJson.load(client_json)
    reported = result[:check]
    check = if @settings.check_exists?(reported[:name]) && !reported[:standalone]
      @settings[:checks][reported[:name]].merge(reported)
    else
      reported
    end
    aggregate_result(result) if check[:aggregate]
    @redis.sadd('history:' + client[:name], check[:name])
    history_key = 'history:' + client[:name] + ':' + check[:name]
    @redis.rpush(history_key, check[:status]) do
      @redis.set('execution:' + client[:name] + ':' + check[:name], check[:executed])
      @redis.lrange(history_key, -21, -1) do |history|
        check[:history] = history
        total_state_change = 0
        if history.size >= 21
          weighted_changes = 0
          weight = 0.8
          last_status = history.first
          history.each do |status|
            weighted_changes += weight if status != last_status
            weight += 0.02
            last_status = status
          end
          total_state_change = (weighted_changes.fdiv(20) * 100).to_i
          @redis.ltrim(history_key, -21, -1)
        end
        events_key = 'events:' + client[:name]
        @redis.hget(events_key, check[:name]) do |event_json|
          previous_occurrence = event_json ? MultiJson.load(event_json) : false
          is_flapping = false
          if check.has_key?(:low_flap_threshold) && check.has_key?(:high_flap_threshold)
            was_flapping = previous_occurrence && previous_occurrence[:action] == 'flapping'
            is_flapping = if total_state_change >= check[:high_flap_threshold]
              true
            elsif was_flapping && total_state_change <= check[:low_flap_threshold]
              false
            else
              was_flapping
            end
          end
          event = {
            :id => random_uuid,
            :client => client,
            :check => check,
            :occurrences => 1
          }
          if check[:status] != 0 || is_flapping
            if previous_occurrence && check[:status] == previous_occurrence[:check][:status]
              event[:occurrences] = previous_occurrence[:occurrences] + 1
            end
            event[:action] = is_flapping ? :flapping : :create
            @redis.hset(events_key, check[:name], MultiJson.dump(event)) do
              handle_event(event) unless check[:handle] == false
            end
          elsif previous_occurrence
            event[:occurrences] = previous_occurrence[:occurrences]
            event[:action] = :resolve
            if check[:auto_resolve] != false || check[:force_resolve]
              @redis.hdel(events_key, check[:name]) do
                handle_event(event) unless check[:handle] == false
              end
            end
          elsif check[:type] == 'metric'
            handle_event(event)
          end
          event_bridges(event)
        end
      end
    end
  end
end
454
-
455
# Subscribe to the 'results' pipe and process every check result.
#
# A payload that cannot be parsed is logged and skipped. In either case
# the message is acknowledged on the next reactor tick.
def setup_results
  @logger.debug('subscribing to results')
  @transport.subscribe(:direct, 'results', 'results', :ack => true) do |message_info, message|
    begin
      result = MultiJson.load(message)
      @logger.debug('received result', {:result => result})
      process_result(result)
    rescue MultiJson::ParseError => error
      @logger.error('failed to parse result payload', {
        :message => message,
        :error => error.to_s
      })
    end
    EM.next_tick { @transport.ack(message_info) }
  end
end
475
-
476
# Decide whether publishing a request for +check+ is subdued. Only a
# :subdue definition whose :at value is 'publisher' is considered.
#
# @return [Boolean]
def check_request_subdued?(check)
  subdue = check[:subdue]
  return false unless subdue && subdue[:at] == 'publisher'
  action_subdued?(subdue)
end
483
-
484
# Publish a check request to every subscription of +check+.
#
# The payload carries the check name, the current time as :issued and,
# when the check defines one, its :command. It is published as JSON on a
# fanout pipe per subscription; publish failures are logged.
def publish_check_request(check)
  payload = {:name => check[:name], :issued => Time.now.to_i}
  payload[:command] = check[:command] if check.has_key?(:command)
  @logger.info('publishing check request', {
    :payload => payload,
    :subscribers => check[:subscribers]
  })
  check[:subscribers].each do |subscription|
    @transport.publish(:fanout, subscription, MultiJson.dump(payload)) do |info|
      next unless info[:error]
      @logger.error('failed to publish check request', {
        :subscription => subscription,
        :payload => payload,
        :error => info[:error].to_s
      })
    end
  end
end
508
-
509
# Compute how many seconds to wait before the first execution of +check+.
#
# The first 8 bytes of the MD5 digest of the check name are read as a
# little-endian unsigned integer; the distance from the current time in
# milliseconds to that value, modulo the check interval in milliseconds,
# gives the delay.
#
# @return [Float] seconds, at least 0 and below the check interval.
def calculate_execution_splay(check)
  name_hash = Digest::MD5.digest(check[:name]).unpack('Q<').first
  now_ms = (Time.now.to_f * 1000).to_i
  interval_ms = check[:interval] * 1000
  ((name_hash - now_ms) % interval_ms) / 1000.0
end
514
-
515
# Schedule periodic check requests for +checks+.
#
# For every check a one-shot timer fires after the execution splay,
# issues the first request and installs a periodic timer with the check
# interval. Both timers are kept in the master timer list. A request is
# published unless it is subdued, in which case that is logged. When
# testing? is true the splay is 0 and the interval 0.5 seconds.
def schedule_checks(checks)
  checks.each do |check|
    issue_request = Proc.new do
      if check_request_subdued?(check)
        @logger.info('check request was subdued', {
          :check => check
        })
      else
        publish_check_request(check)
      end
    end
    splay = testing? ? 0 : calculate_execution_splay(check)
    period = testing? ? 0.5 : check[:interval]
    first_run = EM::Timer.new(splay) do
      issue_request.call
      @timers[:master] << EM::PeriodicTimer.new(period, &issue_request)
    end
    @timers[:master] << first_run
  end
end
534
-
535
# Schedule check requests for all publishable checks.
#
# Checks from the settings and from check extensions are used, leaving
# out standalone checks and checks with :publish set to false. Extension
# checks additionally need an Integer :interval.
def setup_publisher
  @logger.debug('scheduling check requests')
  standard_checks = @settings.checks.select do |check|
    !check[:standalone] && check[:publish] != false
  end
  extension_checks = @extensions.checks.select do |check|
    !check[:standalone] && check[:publish] != false && check[:interval].is_a?(Integer)
  end
  schedule_checks(standard_checks + extension_checks)
end
545
-
546
# Publish a check result for +client+ on the 'results' pipe.
#
# The payload holds the client name and the check; a publish failure is
# logged.
def publish_result(client, check)
  payload = {:client => client[:name], :check => check}
  @logger.debug('publishing check result', {:payload => payload})
  @transport.publish(:direct, 'results', MultiJson.dump(payload)) do |info|
    next unless info[:error]
    @logger.error('failed to publish check result', {
      :payload => payload,
      :error => info[:error].to_s
    })
  end
end
563
-
564
# Publish a 'keepalive' check result for every registered client.
#
# The base check has thresholds of 120 (warning) and 180 (critical)
# seconds and uses the 'keepalive' handler when one is defined; a
# client's own :keepalive settings are deep-merged over it. The time
# since the client's last keepalive timestamp determines the status:
# 2 at or above the critical threshold, 1 at or above the warning
# threshold, 0 otherwise. Clients without stored data are skipped.
def determine_stale_clients
  @logger.info('determining stale clients')
  keepalive_check = {:thresholds => {:warning => 120, :critical => 180}}
  keepalive_check[:handler] = "keepalive" if @settings.handler_exists?(:keepalive)
  @redis.smembers('clients') do |clients|
    clients.each do |client_name|
      @redis.get('client:' + client_name) do |client_json|
        next if client_json.nil?
        client = MultiJson.load(client_json)
        check = keepalive_check.dup
        check = deep_merge(check, client[:keepalive]) if client.has_key?(:keepalive)
        check[:name] = 'keepalive'
        check[:issued] = Time.now.to_i
        check[:executed] = Time.now.to_i
        elapsed = Time.now.to_i - client[:timestamp]
        check[:output] = 'No keepalive sent from client for ' + elapsed.to_s + ' seconds'
        if elapsed >= check[:thresholds][:critical]
          check[:output] << ' (>=' + check[:thresholds][:critical].to_s + ')'
          check[:status] = 2
        elsif elapsed >= check[:thresholds][:warning]
          check[:output] << ' (>=' + check[:thresholds][:warning].to_s + ')'
          check[:status] = 1
        else
          check[:output] = 'Keepalive sent from client ' + elapsed.to_s + ' seconds ago'
          check[:status] = 0
        end
        publish_result(client, check)
      end
    end
  end
end
608
-
609
# Start checking for stale clients every 30 seconds; the timer is kept
# in the master timer list.
def setup_client_monitor
  @logger.debug('monitoring clients')
  monitor = EM::PeriodicTimer.new(30) { determine_stale_clients }
  @timers[:master] << monitor
end
615
-
616
# Keep only the 20 highest-sorting aggregations per aggregated check.
#
# For every check in the 'aggregates' set whose 'aggregates:<check>' set
# has more than 20 members, the members are sorted and all but the last
# 20 are removed, together with their 'aggregate:' and 'aggregation:'
# keys. Each removal is logged at debug level.
def prune_aggregations
  @logger.info('pruning aggregations')
  @redis.smembers('aggregates') do |checks|
    checks.each do |check_name|
      index_key = 'aggregates:' + check_name
      @redis.smembers(index_key) do |aggregates|
        next unless aggregates.size > 20
        aggregates.sort!
        aggregates.first(aggregates.size - 20).each do |check_issued|
          @redis.srem('aggregates:' + check_name, check_issued) do
            set_id = check_name + ':' + check_issued.to_s
            @redis.del('aggregate:' + set_id) do
              @redis.del('aggregation:' + set_id) do
                @logger.debug('pruned aggregation', {
                  :check => {:name => check_name, :issued => check_issued}
                })
              end
            end
          end
        end
      end
    end
  end
end
643
-
644
# Start pruning aggregations every 20 seconds; the timer is kept in the
# master timer list.
def setup_aggregation_pruner
  @logger.debug('pruning aggregations')
  pruner = EM::PeriodicTimer.new(20) { prune_aggregations }
  @timers[:master] << pruner
end
650
-
651
# Start everything only the master does, in order: the check request
# publisher, the client monitor and the aggregation pruner. Returns what
# the last of them returns.
def master_duties
  duties = [:setup_publisher, :setup_client_monitor, :setup_aggregation_pruner]
  duties.map { |duty| send(duty) }.last
end
656
-
657
# Try to become the master using the 'lock:master' key in Redis.
#
# If the lock can be created this server becomes master right away.
# Otherwise, when the stored timestamp is at least 30 seconds old, the
# lock is replaced with the current time; this server becomes master
# only if the value it replaced is the one it had just read, i.e. no
# other server replaced the lock in between. Becoming master sets
# @is_master and starts the master duties.
def request_master_election
  @redis.setnx('lock:master', Time.now.to_i) do |created|
    if created
      @is_master = true
      @logger.info('i am the master')
      master_duties
      next
    end
    @redis.get('lock:master') do |timestamp|
      next unless Time.now.to_i - timestamp.to_i >= 30
      @redis.getset('lock:master', Time.now.to_i) do |previous|
        next unless previous == timestamp
        @is_master = true
        @logger.info('i am now the master')
        master_duties
      end
    end
  end
end
678
-
679
# Take part in master election: request an election after 2 seconds,
# then every 10 seconds either refresh the master lock timestamp (when
# this server is the master) or request an election again. Both timers
# are kept in the run timer list.
def setup_master_monitor
  first_election = EM::Timer.new(2) { request_master_election }
  @timers[:run] << first_election
  heartbeat = EM::PeriodicTimer.new(10) do
    unless @is_master
      request_master_election
      next
    end
    @redis.set('lock:master', Time.now.to_i) do
      @logger.debug('updated master lock timestamp')
    end
  end
  @timers[:run] << heartbeat
end
693
-
694
# Give up the master role: cancel and forget all master timers and clear
# @is_master. Only logs a debug message when this server is not master.
def resign_as_master
  return @logger.debug('not currently master') unless @is_master
  @logger.warn('resigning as master')
  @timers[:master].each(&:cancel)
  @timers[:master].clear
  @is_master = false
end
706
-
707
# Log a warning and unsubscribe from the transport, which stops delivery
# from the keepalive and result queues set up by this server.
def unsubscribe
  @logger.warn('unsubscribing from keepalive and result queues')
  @transport.unsubscribe
end
711
-
712
# Call +block+ once no handlers are in progress any more.
#
# The in-progress count is logged, then re-checked through
# retry_until_true until it reaches 0, at which point the block is called
# and the retrying ends.
def complete_handlers_in_progress(&block)
  @logger.info('completing handlers in progress', {
    :handlers_in_progress_count => @handlers_in_progress_count
  })
  retry_until_true do
    next nil unless @handlers_in_progress_count == 0
    block.call
    true
  end
end
723
-
724
# Set up the keepalive and result subscriptions and the master monitor,
# in that order, then mark the server as :running.
def bootstrap
  [:setup_keepalives, :setup_results, :setup_master_monitor].each do |step|
    send(step)
  end
  @state = :running
end
730
-
731
# Start the server: set up the Redis connection, then the transport
# connection, then bootstrap. Returns what bootstrap returns.
def start
  setup_redis
  setup_transport
  bootstrap
end
736
-
737
# Pause the server unless it is already pausing or paused: cancel and
# forget the run timers, unsubscribe from the transport, resign as
# master, and end in the :paused state.
def pause
  return if @state == :pausing || @state == :paused
  @state = :pausing
  @timers[:run].each(&:cancel)
  @timers[:run].clear
  unsubscribe
  resign_as_master
  @state = :paused
end
749
-
750
# Resume a paused server: retry (with an argument of 1 passed to
# retry_until_true) until the server is in the :paused state and both
# Redis and the transport report being connected, then bootstrap again.
def resume
  retry_until_true(1) do
    if @state == :paused && @redis.connected? && @transport.connected?
      bootstrap
      true
    end
  end
end
760
-
761
# Stop the server: log a warning, pause, switch to the :stopping state
# and, once no handlers are in progress, close the Redis and transport
# connections and continue with the parent's stop.
def stop
  @logger.warn('stopping')
  pause
  @state = :stopping
  complete_handlers_in_progress do
    [@redis, @transport].each(&:close)
    super
  end
end
771
- end
772
- end