sensu 0.17.0.beta → 0.17.0.beta.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/sensu/server.rb DELETED
@@ -1,772 +0,0 @@
1
- require 'sensu/daemon'
2
- require 'sensu/socket'
3
- require 'sensu/sandbox'
4
-
5
- module Sensu
6
- class Server
7
- include Daemon
8
-
9
- attr_reader :is_master
10
-
11
# Build a server from +options+ and run it inside the EventMachine
# reactor: the server is started first, then its signal traps are set up.
def self.run(options={})
  instance = new(options)
  EM.run do
    instance.start
    instance.setup_signal_traps
  end
end
18
-
19
# Run the inherited initialisation, then set the server-specific state:
# not master, an empty list of master-only timers, and no handlers in
# flight.
def initialize(options={})
  super
  @handlers_in_progress_count = 0
  @timers[:master] = []
  @is_master = false
end
25
-
26
# Subscribe to the 'keepalives' pipe. Each keepalive payload is parsed as
# JSON, stored under "client:<name>", and the client name is added to the
# 'clients' set; the message is acknowledged once both writes have called
# back. A payload that fails to parse is logged and acknowledged as well.
def setup_keepalives
  @logger.debug('subscribing to keepalives')
  @transport.subscribe(:direct, 'keepalives', 'keepalives', :ack => true) do |info, payload|
    begin
      keepalive = MultiJson.load(payload)
      @logger.debug('received keepalive', {
        :client => keepalive
      })
      client_name = keepalive[:name]
      @redis.set('client:' + client_name, MultiJson.dump(keepalive)) do
        @redis.sadd('clients', client_name) do
          @transport.ack(info)
        end
      end
    rescue MultiJson::ParseError => parse_error
      @logger.error('failed to parse keepalive payload', {
        :message => payload,
        :error => parse_error.to_s
      })
      @transport.ack(info)
    end
  end
end
48
-
49
# Decide whether an action is currently subdued by +condition+.
#
# +condition+ is a hash that may contain:
#   :begin / :end  - time-of-day strings bounding a subdue window; when the
#                    end is earlier than the begin, the window is treated as
#                    wrapping around midnight
#   :days          - weekday names (case-insensitive) that are subdued
#   :exceptions    - array of {:begin, :end} hashes; if the current time
#                    falls inside any of them the action is not subdued
#
# Returns true or false.
#
# The current time is sampled once and also handed to Time.parse as its
# reference instant, so every comparison in one call sees the same clock
# value and the same calendar date, even if the call straddles midnight.
def action_subdued?(condition)
  now = Time.now
  subdued = false
  if condition.has_key?(:begin) && condition.has_key?(:end)
    begin_time = Time.parse(condition[:begin], now)
    end_time = Time.parse(condition[:end], now)
    if end_time < begin_time
      # Overnight window: evaluate only the part that lies in today.
      if now < end_time
        begin_time = Time.parse('12:00:00 AM', now)
      else
        end_time = Time.parse('11:59:59 PM', now)
      end
    end
    subdued = now >= begin_time && now <= end_time
  end
  if condition.has_key?(:days)
    today = now.strftime('%A').downcase
    subdued = true if condition[:days].map(&:downcase).include?(today)
  end
  if subdued && condition.has_key?(:exceptions)
    subdued = condition[:exceptions].none? do |exception|
      now >= Time.parse(exception[:begin], now) && now <= Time.parse(exception[:end], now)
    end
  end
  subdued
end
78
-
79
# True when either the handler's own :subdue condition or the check's
# :subdue condition is currently in effect. A check-level condition whose
# :at is 'publisher' is ignored here.
def handler_subdued?(handler, check)
  by_handler = handler[:subdue] ? action_subdued?(handler[:subdue]) : false
  check_subdue = check[:subdue]
  by_check = false
  if check_subdue && check_subdue[:at] != 'publisher'
    by_check = action_subdued?(check_subdue)
  end
  by_handler || by_check ? true : false
end
89
-
90
# Check that every key of +hash_one+ (the filter attributes) matches the
# value under the same key in +hash_two+ (the event data).
#
# A key matches when, in this order:
#   * the two values are equal,
#   * both values are hashes that match recursively,
#   * the values are equal after #to_s,
#   * the filter value is a String starting with 'eval:' and the remaining
#     expression, evaluated by Sandbox.eval against the event value, is
#     truthy. Any error raised during evaluation is logged and counts as
#     no match.
#
# Only the single leading 'eval:' marker (and whitespace after it) is
# removed; the rest of the expression, including later lines that happen
# to start with 'eval:', is passed through untouched.
#
# NOTE(review): the expression comes from filter definitions and is
# evaluated at runtime; how much Sandbox.eval restricts it is not visible
# here.
def filter_attributes_match?(hash_one, hash_two)
  hash_one.keys.all? do |key|
    expected = hash_one[key]
    actual = hash_two[key]
    case
    when expected == actual
      true
    when expected.is_a?(Hash) && actual.is_a?(Hash)
      filter_attributes_match?(expected, actual)
    when expected.to_s == actual.to_s
      true
    when expected.is_a?(String) && expected.start_with?('eval:')
      begin
        expression = expected.sub(/\Aeval:\s*/, '')
        !!Sandbox.eval(expression, actual)
      rescue => error
        @logger.error('filter eval error', {
          :attributes => [hash_one, hash_two],
          :error => error.to_s
        })
        false
      end
    else
      false
    end
  end
end
115
-
116
# Decide whether +event+ is filtered out by the filter named
# +filter_name+. With :negate set, the event is filtered when the filter
# attributes match; otherwise it is filtered when they do not match. An
# unknown filter name is logged and never filters the event.
def event_filtered?(filter_name, event)
  unless @settings.filter_exists?(filter_name)
    @logger.error('unknown filter', {
      :filter_name => filter_name
    })
    return false
  end
  definition = @settings[:filters][filter_name]
  attributes_match = filter_attributes_match?(definition[:attributes], event)
  if definition[:negate]
    attributes_match
  else
    !attributes_match
  end
end
128
-
129
# Resolve a list of handler names into a de-duplicated array of handlers.
#
# Names found in settings become their definition merged with :name.
# Definitions of type 'set' are expanded recursively through their
# :handlers list; once +depth+ reaches 2 a set is not expanded further and
# an error is logged instead. Names not in settings are looked up among
# the extension handlers. Unknown names are logged and skipped; nil
# entries are ignored.
def derive_handlers(handler_list, depth=0)
  resolved = []
  handler_list.compact.each do |handler_name|
    if @settings.handler_exists?(handler_name)
      definition = @settings[:handlers][handler_name].merge(:name => handler_name)
      if definition[:type] != 'set'
        resolved << definition
      elsif depth < 2
        resolved += derive_handlers(definition[:handlers], depth + 1)
      else
        @logger.error('handler sets cannot be deeply nested', {
          :handler => definition
        })
      end
    elsif @extensions.handler_exists?(handler_name)
      resolved << @extensions[:handlers][handler_name]
    else
      @logger.error('unknown handler', {
        :handler_name => handler_name
      })
    end
    resolved = resolved.uniq
  end
  resolved
end
154
-
155
# Select the handlers that should run for +event+.
#
# Handler names come from the check's :handlers, else its :handler, else
# 'default'. A derived handler is dropped (with a log entry) when:
#   * the event action is :flapping and the handler lacks :handle_flapping,
#   * the handler is subdued for this check,
#   * the handler lists :severities and the event severity is not among
#     them (for :resolve events the earlier history entries are scanned,
#     newest first, stopping at the first status of 0),
#   * any of the handler's filters filters the event.
def event_handlers(event)
  check = event[:check]
  names = Array((check[:handlers] || check[:handler]) || 'default')
  derive_handlers(names).select do |handler|
    if event[:action] == :flapping && !handler[:handle_flapping]
      @logger.info('handler does not handle flapping events', {
        :event => event,
        :handler => handler
      })
      next
    end
    if handler_subdued?(handler, check)
      @logger.info('handler is subdued', {
        :event => event,
        :handler => handler
      })
      next
    end
    if handler.has_key?(:severities)
      handled = if event[:action] == :resolve
        # Skip the newest status (the resolving one) and walk backwards.
        check[:history].reverse[1..-1].any? do |status|
          break if status.to_i == 0
          handler[:severities].include?(SEVERITIES[status.to_i] || 'unknown')
        end
      else
        handler[:severities].include?(SEVERITIES[check[:status]] || 'unknown')
      end
      unless handled
        @logger.debug('handler does not handle event severity', {
          :event => event,
          :handler => handler
        })
        next
      end
    end
    if handler.has_key?(:filters) || handler.has_key?(:filter)
      filter_names = Array(handler[:filters] || handler[:filter])
      if filter_names.any? { |filter_name| event_filtered?(filter_name, event) }
        @logger.info('event filtered for handler', {
          :event => event,
          :handler => handler
        })
        next
      end
    end
    true
  end
end
211
-
212
# Run +event+ through the mutator named +mutator_name+ ('json' when nil)
# and pass the mutated output to the given block.
#
# A mutator defined in settings is run as a command via Spawn.process
# with the JSON-encoded event as its data; otherwise a mutator extension
# is run. If the mutator reports a non-zero status, or no mutator with
# that name exists, the problem is logged, the in-progress handler count
# is decremented, and the block is not called.
def mutate_event_data(mutator_name, event, &block)
  mutator_name ||= 'json'
  on_mutated = Proc.new do |output, status|
    if status != 0
      @logger.error('mutator error', {
        :event => event,
        :output => output,
        :status => status
      })
      @handlers_in_progress_count -= 1
    else
      block.dup.call(output)
    end
  end
  @logger.debug('mutating event data', {
    :event => event,
    :mutator_name => mutator_name
  })
  if @settings.mutator_exists?(mutator_name)
    definition = @settings[:mutators][mutator_name]
    spawn_options = {:data => MultiJson.dump(event), :timeout => definition[:timeout]}
    Spawn.process(definition[:command], spawn_options, &on_mutated)
  elsif @extensions.mutator_exists?(mutator_name)
    @extensions[:mutators][mutator_name].safe_run(event, &on_mutated)
  else
    @logger.error('unknown mutator', {
      :mutator_name => mutator_name
    })
    @handlers_in_progress_count -= 1
  end
end
245
-
246
# Dispatch +event+ to each of its handlers.
#
# For every handler the in-progress counter is incremented, the event is
# mutated, and the mutated data is delivered according to the handler
# type:
#   'pipe'      - spawned command receiving the data
#   'tcp'       - EventMachine TCP connection (timeout defaults to 10)
#   'udp'       - datagram sent from an ephemeral socket
#   'transport' - published to the configured transport pipe, unless the
#                 data is empty
#   'extension' - handler extension run with the data
# Each branch decrements the counter once it is done or has failed.
# The 'handling event' message is logged at debug level for checks of
# type 'metric' and at info level otherwise.
def handle_event(event)
  event_handlers(event).each do |handler|
    level = event[:check][:type] == 'metric' ? :debug : :info
    loggable_handler = handler.respond_to?(:definition) ? handler.definition : handler
    @logger.send(level, 'handling event', {
      :event => event,
      :handler => loggable_handler
    })
    @handlers_in_progress_count += 1
    on_error = Proc.new do |error|
      @logger.error('handler error', {
        :event => event,
        :handler => handler,
        :error => error.to_s
      })
      @handlers_in_progress_count -= 1
    end
    mutate_event_data(handler[:mutator], event) do |event_data|
      case handler[:type]
      when 'pipe'
        spawn_options = {:data => event_data, :timeout => handler[:timeout]}
        Spawn.process(handler[:command], spawn_options) do |output, status|
          @logger.info('handler output', {
            :handler => handler,
            :output => output.lines,
            :event_id => event[:id]
          })
          @handlers_in_progress_count -= 1
        end
      when 'tcp'
        begin
          target = handler[:socket]
          EM::connect(target[:host], target[:port], SocketHandler) do |socket|
            socket.on_success = Proc.new do
              @handlers_in_progress_count -= 1
            end
            socket.on_error = on_error
            timeout = handler[:timeout] || 10
            socket.pending_connect_timeout = timeout
            socket.comm_inactivity_timeout = timeout
            socket.send_data(event_data.to_s)
            socket.close_connection_after_writing
          end
        rescue => error
          on_error.call(error)
        end
      when 'udp'
        begin
          EM::open_datagram_socket('0.0.0.0', 0, nil) do |socket|
            socket.send_datagram(event_data.to_s, handler[:socket][:host], handler[:socket][:port])
            socket.close_connection_after_writing
            @handlers_in_progress_count -= 1
          end
        rescue => error
          on_error.call(error)
        end
      when 'transport'
        unless event_data.empty?
          pipe = handler[:pipe]
          pipe_options = pipe[:options] || Hash.new
          @transport.publish(pipe[:type].to_sym, pipe[:name], event_data, pipe_options) do |info|
            if info[:error]
              @logger.fatal('failed to publish event data to the transport', {
                :pipe => pipe,
                :payload => event_data,
                :error => info[:error].to_s
              })
            end
          end
        end
        @handlers_in_progress_count -= 1
      when 'extension'
        handler.safe_run(event_data) do |output, status|
          @logger.info('handler extension output', {
            :extension => handler.definition,
            :output => output,
            :event_id => event[:id]
          })
          @handlers_in_progress_count -= 1
        end
      end
    end
  end
end
328
-
329
# Record a check result in the aggregate for its check name and issued
# timestamp.
#
# The client's output and status are stored in the
# "aggregation:<name>:<issued>" hash. In "aggregate:<name>:<issued>" every
# severity counter is created at 0 if absent, then the counter for this
# result's severity and the 'total' counter are each incremented. Finally
# the issued timestamp and the check name are added to the aggregate
# index sets.
def aggregate_result(result)
  @logger.debug('adding result to aggregate', {
    :result => result
  })
  check = result[:check]
  result_set = check[:name] + ':' + check[:issued].to_s
  counters_key = 'aggregate:' + result_set
  summary = {
    :output => check[:output],
    :status => check[:status]
  }
  @redis.hset('aggregation:' + result_set, result[:client], MultiJson.dump(summary)) do
    SEVERITIES.each do |name|
      @redis.hsetnx(counters_key, name, 0)
    end
    severity = SEVERITIES[check[:status]] || 'unknown'
    @redis.hincrby(counters_key, severity, 1) do
      @redis.hincrby(counters_key, 'total', 1) do
        @redis.sadd('aggregates:' + check[:name], check[:issued]) do
          @redis.sadd('aggregates', check[:name])
        end
      end
    end
  end
end
352
-
353
# Run every bridge extension against +event+ and log each line of a
# bridge's output at debug level.
def event_bridges(event)
  @extensions[:bridges].each do |_name, bridge|
    bridge.safe_run(event) do |output, _status|
      output.each_line do |line|
        @logger.debug('bridge extension output', {
          :extension => bridge.definition,
          :output => line
        })
      end
    end
  end
end
365
-
366
# Process one check result.
#
# Nothing happens unless the client is known in Redis. Otherwise:
#   1. The check data is the result's check, merged over the check
#      definition from settings when one exists and the result is not
#      standalone.
#   2. The result is aggregated when the check asks for it.
#   3. The status is appended to the client/check history and the
#      execution time is stored. With 21 history entries available, a
#      weighted percentage of state changes is computed (later changes
#      weigh more) and the history list is trimmed to 21 entries.
#   4. Flapping is decided from that percentage and the check's
#      low/high flap thresholds, when both are present.
#   5. A non-zero status or a flapping check stores the event and handles
#      it; a zero status with a stored event resolves it (unless
#      :auto_resolve is false and :force_resolve is not set); a zero
#      status with no stored event is handled only for 'metric' checks.
#      Handling is skipped when the check's :handle is false, except for
#      the metric case.
#   6. The event is passed to the bridge extensions.
def process_result(result)
  @logger.debug('processing result', {
    :result => result
  })
  @redis.get('client:' + result[:client]) do |client_json|
    unless client_json.nil?
      client = MultiJson.load(client_json)
      check = if @settings.check_exists?(result[:check][:name]) && !result[:check][:standalone]
        @settings[:checks][result[:check][:name]].merge(result[:check])
      else
        result[:check]
      end
      aggregate_result(result) if check[:aggregate]
      client_name = client[:name]
      @redis.sadd('history:' + client_name, check[:name])
      history_key = 'history:' + client_name + ':' + check[:name]
      @redis.rpush(history_key, check[:status]) do
        @redis.set('execution:' + client_name + ':' + check[:name], check[:executed])
        @redis.lrange(history_key, -21, -1) do |history|
          check[:history] = history
          total_state_change = 0
          if history.size >= 21
            weighted_changes = 0
            weight = 0.8
            last_status = history.first
            history.each do |status|
              weighted_changes += weight unless status == last_status
              weight += 0.02
              last_status = status
            end
            total_state_change = (weighted_changes.fdiv(20) * 100).to_i
            @redis.ltrim(history_key, -21, -1)
          end
          events_key = 'events:' + client_name
          @redis.hget(events_key, check[:name]) do |event_json|
            previous = event_json ? MultiJson.load(event_json) : false
            flapping = false
            if check.has_key?(:low_flap_threshold) && check.has_key?(:high_flap_threshold)
              was_flapping = previous && previous[:action] == 'flapping'
              flapping = if total_state_change >= check[:high_flap_threshold]
                true
              elsif was_flapping && total_state_change <= check[:low_flap_threshold]
                false
              else
                was_flapping
              end
            end
            event = {
              :id => random_uuid,
              :client => client,
              :check => check,
              :occurrences => 1
            }
            if check[:status] != 0 || flapping
              if previous && check[:status] == previous[:check][:status]
                event[:occurrences] = previous[:occurrences] + 1
              end
              event[:action] = flapping ? :flapping : :create
              @redis.hset(events_key, check[:name], MultiJson.dump(event)) do
                handle_event(event) unless check[:handle] == false
              end
            elsif previous
              event[:occurrences] = previous[:occurrences]
              event[:action] = :resolve
              if check[:auto_resolve] != false || check[:force_resolve]
                @redis.hdel(events_key, check[:name]) do
                  handle_event(event) unless check[:handle] == false
                end
              end
            elsif check[:type] == 'metric'
              handle_event(event)
            end
            event_bridges(event)
          end
        end
      end
    end
  end
end
454
-
455
# Subscribe to the 'results' pipe. Each payload is parsed as JSON and
# handed to process_result; a payload that fails to parse is logged. In
# both cases the message is acknowledged on the next reactor tick.
def setup_results
  @logger.debug('subscribing to results')
  @transport.subscribe(:direct, 'results', 'results', :ack => true) do |info, payload|
    begin
      parsed = MultiJson.load(payload)
      @logger.debug('received result', {
        :result => parsed
      })
      process_result(parsed)
    rescue MultiJson::ParseError => parse_error
      @logger.error('failed to parse result payload', {
        :message => payload,
        :error => parse_error.to_s
      })
    end
    EM::next_tick { @transport.ack(info) }
  end
end
475
-
476
# True when the check carries a :subdue condition that applies at the
# 'publisher' and that condition is currently in effect; false otherwise.
def check_request_subdued?(check)
  subdue = check[:subdue]
  return false unless subdue && subdue[:at] == 'publisher'
  action_subdued?(subdue)
end
483
-
484
# Publish a check request to every subscription listed in the check's
# :subscribers. The payload carries the check name, the current time as
# :issued, and the :command when the check defines one. A publish failure
# is logged per subscription.
def publish_check_request(check)
  payload = { :name => check[:name], :issued => Time.now.to_i }
  payload[:command] = check[:command] if check.has_key?(:command)
  @logger.info('publishing check request', {
    :payload => payload,
    :subscribers => check[:subscribers]
  })
  check[:subscribers].each do |subscription|
    @transport.publish(:fanout, subscription, MultiJson.dump(payload)) do |info|
      next unless info[:error]
      @logger.error('failed to publish check request', {
        :subscription => subscription,
        :payload => payload,
        :error => info[:error].to_s
      })
    end
  end
end
508
-
509
# Compute how many seconds to wait before the first execution of +check+.
#
# The first 8 bytes of the MD5 digest of the check name are read as a
# little-endian unsigned 64-bit integer; the difference between that
# value and the current time in milliseconds, modulo the check interval
# in milliseconds, is returned in seconds as a Float.
def calculate_execution_splay(check)
  name_hash = Digest::MD5.digest(check[:name]).unpack('Q<').first
  now_ms = (Time.now.to_f * 1000).to_i
  interval_ms = check[:interval] * 1000
  offset_ms = (name_hash - now_ms) % interval_ms
  offset_ms / 1000.0
end
514
-
515
# Schedule periodic check requests for each of +checks+.
#
# For every check, a one-shot timer fires after the execution splay,
# issues the first request, and then starts a periodic timer at the check
# interval. When testing? is true the splay is 0 and the interval is 0.5.
# A subdued request is logged instead of published. All timers are kept
# in @timers[:master].
def schedule_checks(checks)
  checks.each do |check|
    issue_request = Proc.new do
      if check_request_subdued?(check)
        @logger.info('check request was subdued', {
          :check => check
        })
      else
        publish_check_request(check)
      end
    end
    splay = testing? ? 0 : calculate_execution_splay(check)
    period = testing? ? 0.5 : check[:interval]
    first_run = EM::Timer.new(splay) do
      issue_request.call
      @timers[:master] << EM::PeriodicTimer.new(period, &issue_request)
    end
    @timers[:master] << first_run
  end
end
534
-
535
# Schedule check requests for all publishable checks: checks from
# settings and from extensions, leaving out standalone checks and checks
# with :publish set to false. Extension checks additionally need an
# Integer :interval.
def setup_publisher
  @logger.debug('scheduling check requests')
  unpublished = Proc.new do |check|
    check[:standalone] || check[:publish] == false
  end
  from_settings = @settings.checks.reject(&unpublished)
  from_extensions = @extensions.checks.reject do |check|
    unpublished.call(check) || !check[:interval].is_a?(Integer)
  end
  schedule_checks(from_settings + from_extensions)
end
545
-
546
# Publish a check result for +client+ to the 'results' pipe. The payload
# holds the client name and the check; a publish failure is logged.
def publish_result(client, check)
  payload = { :client => client[:name], :check => check }
  @logger.debug('publishing check result', {
    :payload => payload
  })
  @transport.publish(:direct, 'results', MultiJson.dump(payload)) do |info|
    next unless info[:error]
    @logger.error('failed to publish check result', {
      :payload => payload,
      :error => info[:error].to_s
    })
  end
end
563
-
564
# Publish a 'keepalive' check result for every known client, based on the
# time since its last keepalive timestamp.
#
# Default thresholds are 120 seconds (warning) and 180 seconds
# (critical); a client's own :keepalive hash is deep-merged over the
# defaults. The 'keepalive' handler is used when settings define one.
# Status is 2 at or beyond the critical threshold, 1 at or beyond the
# warning threshold, and 0 otherwise.
def determine_stale_clients
  @logger.info('determining stale clients')
  defaults = {
    :thresholds => {
      :warning => 120,
      :critical => 180
    }
  }
  defaults[:handler] = "keepalive" if @settings.handler_exists?(:keepalive)
  @redis.smembers('clients') do |client_names|
    client_names.each do |client_name|
      @redis.get('client:' + client_name) do |client_json|
        unless client_json.nil?
          client = MultiJson.load(client_json)
          check = defaults.dup
          check = deep_merge(check, client[:keepalive]) if client.has_key?(:keepalive)
          check[:name] = 'keepalive'
          check[:issued] = Time.now.to_i
          check[:executed] = Time.now.to_i
          elapsed = Time.now.to_i - client[:timestamp]
          stale_output = 'No keepalive sent from client for ' + elapsed.to_s + ' seconds'
          critical = check[:thresholds][:critical]
          if elapsed >= critical
            check[:output] = stale_output + ' (>=' + critical.to_s + ')'
            check[:status] = 2
          elsif elapsed >= check[:thresholds][:warning]
            check[:output] = stale_output + ' (>=' + check[:thresholds][:warning].to_s + ')'
            check[:status] = 1
          else
            check[:output] = 'Keepalive sent from client ' + elapsed.to_s + ' seconds ago'
            check[:status] = 0
          end
          publish_result(client, check)
        end
      end
    end
  end
end
608
-
609
# Start a master-only periodic timer that checks for stale clients every
# 30 seconds.
def setup_client_monitor
  @logger.debug('monitoring clients')
  monitor = EM::PeriodicTimer.new(30) { determine_stale_clients }
  @timers[:master] << monitor
end
615
-
616
# Trim stored aggregates so that each check keeps at most 20 of them.
#
# For every check name in the 'aggregates' set, the issued values in
# "aggregates:<check>" are sorted and all but the last 20 are removed,
# together with their "aggregate:" and "aggregation:" keys. Each removal
# is logged at debug level.
def prune_aggregations
  @logger.info('pruning aggregations')
  @redis.smembers('aggregates') do |check_names|
    check_names.each do |check_name|
      index_key = 'aggregates:' + check_name
      @redis.smembers(index_key) do |aggregates|
        if aggregates.size > 20
          aggregates.sort!
          surplus = aggregates.take(aggregates.size - 20)
          surplus.each do |check_issued|
            @redis.srem(index_key, check_issued) do
              result_set = check_name + ':' + check_issued.to_s
              @redis.del('aggregate:' + result_set) do
                @redis.del('aggregation:' + result_set) do
                  @logger.debug('pruned aggregation', {
                    :check => {
                      :name => check_name,
                      :issued => check_issued
                    }
                  })
                end
              end
            end
          end
        end
      end
    end
  end
end
643
-
644
# Start a master-only periodic timer that prunes aggregations every 20
# seconds.
def setup_aggregation_pruner
  @logger.debug('pruning aggregations')
  pruner = EM::PeriodicTimer.new(20) { prune_aggregations }
  @timers[:master] << pruner
end
650
-
651
# Start everything only the master does, in this order: the check
# request publisher, the client monitor, and the aggregation pruner.
def master_duties
  setup_publisher
  setup_client_monitor
  setup_aggregation_pruner
end
656
-
657
# Try to become the master through the 'lock:master' key in Redis.
#
# If the key can be created (SETNX), this server becomes master at once.
# Otherwise, when the stored timestamp is at least 30 seconds old, the
# timestamp is replaced with GETSET, and this server becomes master only
# if the value it replaced is the one it had just read.
def request_master_election
  take_over = Proc.new do |message|
    @is_master = true
    @logger.info(message)
    master_duties
  end
  @redis.setnx('lock:master', Time.now.to_i) do |created|
    if created
      take_over.call('i am the master')
    else
      @redis.get('lock:master') do |timestamp|
        if Time.now.to_i - timestamp.to_i >= 30
          @redis.getset('lock:master', Time.now.to_i) do |previous|
            take_over.call('i am now the master') if previous == timestamp
          end
        end
      end
    end
  end
end
678
-
679
# Request a master election after 2 seconds, then every 10 seconds
# either refresh the master lock timestamp (when this server is master)
# or request an election again. Both timers are kept in @timers[:run].
def setup_master_monitor
  initial_election = EM::Timer.new(2) { request_master_election }
  @timers[:run] << initial_election
  heartbeat = EM::PeriodicTimer.new(10) do
    if !@is_master
      request_master_election
    else
      @redis.set('lock:master', Time.now.to_i) do
        @logger.debug('updated master lock timestamp')
      end
    end
  end
  @timers[:run] << heartbeat
end
693
-
694
# Give up the master role: cancel and forget every master-only timer and
# clear the master flag. When this server is not master, only a debug
# message is logged.
def resign_as_master
  return @logger.debug('not currently master') unless @is_master
  @logger.warn('resigning as master')
  @timers[:master].each(&:cancel)
  @timers[:master].clear
  @is_master = false
end
706
-
707
# Log a warning and unsubscribe from the transport.
def unsubscribe
  @logger.warn('unsubscribing from keepalive and result queues')
  @transport.unsubscribe
end
711
-
712
# Log the number of handlers still in progress, then retry until that
# number is 0, at which point the given block is called.
def complete_handlers_in_progress(&block)
  @logger.info('completing handlers in progress', {
    :handlers_in_progress_count => @handlers_in_progress_count
  })
  retry_until_true do
    next unless @handlers_in_progress_count == 0
    block.call
    true
  end
end
723
-
724
# Set up the keepalive and result subscriptions and the master monitor,
# then mark the server as :running.
def bootstrap
  setup_keepalives
  setup_results
  setup_master_monitor
  @state = :running
end
730
-
731
# Set up Redis and the transport, then bootstrap the server.
def start
  setup_redis
  setup_transport
  bootstrap
end
736
-
737
# Pause the server unless it is already pausing or paused: cancel and
# forget the run timers, unsubscribe from the transport, resign as
# master, and end in the :paused state.
def pause
  return if @state == :pausing || @state == :paused
  @state = :pausing
  @timers[:run].each(&:cancel)
  @timers[:run].clear
  unsubscribe
  resign_as_master
  @state = :paused
end
749
-
750
# Retry (with an argument of 1 passed to retry_until_true) until the
# server is paused and both Redis and the transport report being
# connected, then bootstrap again.
def resume
  retry_until_true(1) do
    next unless @state == :paused
    next unless @redis.connected? && @transport.connected?
    bootstrap
    true
  end
end
760
-
761
# Stop the server: pause it, mark it as :stopping, and once no handlers
# remain in progress close the Redis and transport connections and call
# the inherited stop.
def stop
  @logger.warn('stopping')
  pause
  @state = :stopping
  complete_handlers_in_progress do
    @redis.close
    @transport.close
    super
  end
end
771
- end
772
- end