sensu 0.13.0.alpha.2-java
- checksums.yaml +7 -0
- data/CHANGELOG.md +455 -0
- data/MIT-LICENSE.txt +20 -0
- data/README.md +11 -0
- data/bin/sensu-api +10 -0
- data/bin/sensu-client +10 -0
- data/bin/sensu-server +10 -0
- data/lib/sensu.rb +3 -0
- data/lib/sensu/api.rb +674 -0
- data/lib/sensu/cli.rb +51 -0
- data/lib/sensu/client.rb +261 -0
- data/lib/sensu/constants.rb +9 -0
- data/lib/sensu/daemon.rb +221 -0
- data/lib/sensu/redis.rb +20 -0
- data/lib/sensu/sandbox.rb +11 -0
- data/lib/sensu/server.rb +764 -0
- data/lib/sensu/socket.rb +79 -0
- data/lib/sensu/utilities.rb +60 -0
- data/sensu.gemspec +38 -0
- metadata +277 -0
data/lib/sensu/redis.rb
ADDED
@@ -0,0 +1,20 @@
gem 'em-redis-unified', '0.5.0'

require 'em-redis'

module Sensu
  class Redis
    def self.connect(options={})
      options ||= Hash.new
      connection = EM::Protocols::Redis.connect(options)
      connection.info do |info|
        if info[:redis_version] < '1.3.14'
          klass = EM::Protocols::Redis::RedisError
          message = 'redis version must be >= 2.0 RC 1'
          connection.error(klass, message)
        end
      end
      connection
    end
  end
end
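A minimal usage sketch (not part of the gem): Sensu::Redis.connect assumes a running EventMachine reactor, and the object it returns is the em-redis connection used throughout server.rb (e.g. @redis.set / @redis.get); the :host and :port option keys shown here are assumptions about the underlying em-redis client.

require 'sensu/redis'

EM::run do
  # Assumed option keys for the underlying em-redis connection.
  redis = Sensu::Redis.connect(:host => '127.0.0.1', :port => 6379)
  redis.set('example:key', 'example value') do
    redis.get('example:key') do |value|
      puts value
      EM::stop
    end
  end
end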
data/lib/sensu/server.rb
ADDED
@@ -0,0 +1,764 @@
require 'sensu/daemon'
require 'sensu/socket'
require 'sensu/sandbox'

module Sensu
  class Server
    include Daemon

    attr_reader :is_master

    def self.run(options={})
      server = self.new(options)
      EM::run do
        server.start
        server.setup_signal_traps
      end
    end

    def initialize(options={})
      super
      @is_master = false
      @timers[:master] = Array.new
      @handlers_in_progress_count = 0
    end

    def setup_keepalives
      @logger.debug('subscribing to keepalives')
      @transport.subscribe(:direct, 'keepalives', 'keepalives', :ack => true) do |message_info, message|
        client = MultiJson.load(message)
        @logger.debug('received keepalive', {
          :client => client
        })
        @redis.set('client:' + client[:name], MultiJson.dump(client)) do
          @redis.sadd('clients', client[:name]) do
            @transport.ack(message_info)
          end
        end
      end
    end

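    # Determine if an action (check request publishing or event handling) is
    # subdued for the given condition: an optional begin/end time-of-day
    # window (handled even when it crosses midnight), an optional list of
    # days, and exception windows during which the action is not subdued.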
    def action_subdued?(condition)
      subdued = false
      if condition.has_key?(:begin) && condition.has_key?(:end)
        begin_time = Time.parse(condition[:begin])
        end_time = Time.parse(condition[:end])
        if end_time < begin_time
          if Time.now < end_time
            begin_time = Time.parse('12:00:00 AM')
          else
            end_time = Time.parse('11:59:59 PM')
          end
        end
        if Time.now >= begin_time && Time.now <= end_time
          subdued = true
        end
      end
      if condition.has_key?(:days)
        days = condition[:days].map(&:downcase)
        if days.include?(Time.now.strftime('%A').downcase)
          subdued = true
        end
      end
      if subdued && condition.has_key?(:exceptions)
        subdued = condition[:exceptions].none? do |exception|
          Time.now >= Time.parse(exception[:begin]) && Time.now <= Time.parse(exception[:end])
        end
      end
      subdued
    end

    def handler_subdued?(handler, check)
      subdued = Array.new
      if handler[:subdue]
        subdued << action_subdued?(handler[:subdue])
      end
      if check[:subdue] && check[:subdue][:at] != 'publisher'
        subdued << action_subdued?(check[:subdue])
      end
      subdued.any?
    end

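    # Recursively compare filter attributes (hash_one) against event data
    # (hash_two). String attributes prefixed with 'eval:' are evaluated via
    # Sandbox.eval against the corresponding event value; eval failures are
    # logged and treated as a non-match.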
    def filter_attributes_match?(hash_one, hash_two)
      hash_one.keys.all? do |key|
        case
        when hash_one[key] == hash_two[key]
          true
        when hash_one[key].is_a?(Hash) && hash_two[key].is_a?(Hash)
          filter_attributes_match?(hash_one[key], hash_two[key])
        when hash_one[key].is_a?(String) && hash_one[key].start_with?('eval:')
          begin
            expression = hash_one[key].gsub(/^eval:(\s+)?/, '')
            !!Sandbox.eval(expression, hash_two[key])
          rescue => error
            @logger.error('filter eval error', {
              :attributes => [hash_one, hash_two],
              :error => error.to_s
            })
            false
          end
        else
          false
        end
      end
    end

    def event_filtered?(filter_name, event)
      if @settings.filter_exists?(filter_name)
        filter = @settings[:filters][filter_name]
        matched = filter_attributes_match?(filter[:attributes], event)
        filter[:negate] ? matched : !matched
      else
        @logger.error('unknown filter', {
          :filter_name => filter_name
        })
        false
      end
    end

    def derive_handlers(handler_list)
      handler_list.compact.inject(Array.new) do |handlers, handler_name|
        if @settings.handler_exists?(handler_name)
          handler = @settings[:handlers][handler_name].merge(:name => handler_name)
          if handler[:type] == 'set'
            handlers = handlers + derive_handlers(handler[:handlers])
          else
            handlers << handler
          end
        elsif @extensions.handler_exists?(handler_name)
          handlers << @extensions[:handlers][handler_name]
        else
          @logger.error('unknown handler', {
            :handler_name => handler_name
          })
        end
        handlers.uniq
      end
    end

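    # Select the handlers that should run for an event, starting from the
    # check's handler(s) (default: 'default'). A handler is dropped if it does
    # not handle flapping events, is subdued, does not cover the event
    # severity, or has a filter that filters the event (attribute match,
    # inverted when the filter is negated).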
    def event_handlers(event)
      handler_list = Array((event[:check][:handlers] || event[:check][:handler]) || 'default')
      handlers = derive_handlers(handler_list)
      handlers.select do |handler|
        if event[:action] == :flapping && !handler[:handle_flapping]
          @logger.info('handler does not handle flapping events', {
            :event => event,
            :handler => handler
          })
          next
        end
        if handler_subdued?(handler, event[:check])
          @logger.info('handler is subdued', {
            :event => event,
            :handler => handler
          })
          next
        end
        if handler.has_key?(:severities)
          handle = case event[:action]
          when :resolve
            event[:check][:history].reverse[1..-1].any? do |status|
              if status.to_i == 0
                break
              end
              severity = SEVERITIES[status.to_i] || 'unknown'
              handler[:severities].include?(severity)
            end
          else
            severity = SEVERITIES[event[:check][:status]] || 'unknown'
            handler[:severities].include?(severity)
          end
          unless handle
            @logger.debug('handler does not handle event severity', {
              :event => event,
              :handler => handler
            })
            next
          end
        end
        if handler.has_key?(:filters) || handler.has_key?(:filter)
          filter_list = Array(handler[:filters] || handler[:filter])
          filtered = filter_list.any? do |filter_name|
            event_filtered?(filter_name, event)
          end
          if filtered
            @logger.info('event filtered for handler', {
              :event => event,
              :handler => handler
            })
            next
          end
        end
        true
      end
    end

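    # Mutate event data before it is passed to a handler. The mutator may be
    # a settings-defined command (run via Spawn.process with the serialized
    # event as its :data) or a mutator extension; the default mutator is
    # 'json'. A non-zero mutator exit status aborts handling for the event.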
    def mutate_event_data(mutator_name, event, &block)
      mutator_name ||= 'json'
      return_output = Proc.new do |output, status|
        if status == 0
          block.dup.call(output)
        else
          @logger.error('mutator error', {
            :event => event,
            :output => output,
            :status => status
          })
          @handlers_in_progress_count -= 1
        end
      end
      @logger.debug('mutating event data', {
        :event => event,
        :mutator_name => mutator_name
      })
      case
      when @settings.mutator_exists?(mutator_name)
        mutator = @settings[:mutators][mutator_name]
        options = {:data => MultiJson.dump(event), :timeout => mutator[:timeout]}
        Spawn.process(mutator[:command], options, &return_output)
      when @extensions.mutator_exists?(mutator_name)
        extension = @extensions[:mutators][mutator_name]
        extension.safe_run(event, &return_output)
      else
        @logger.error('unknown mutator', {
          :mutator_name => mutator_name
        })
        @handlers_in_progress_count -= 1
      end
    end

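    # Run each applicable handler for an event after mutating the event data,
    # incrementing and decrementing @handlers_in_progress_count around each
    # handler. Supported handler types: pipe, tcp, udp, transport, and
    # extension.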
    def handle_event(event)
      handlers = event_handlers(event)
      handlers.each do |handler|
        log_level = event[:check][:type] == 'metric' ? :debug : :info
        @logger.send(log_level, 'handling event', {
          :event => event,
          :handler => handler.respond_to?(:definition) ? handler.definition : handler
        })
        @handlers_in_progress_count += 1
        on_error = Proc.new do |error|
          @logger.error('handler error', {
            :event => event,
            :handler => handler,
            :error => error.to_s
          })
          @handlers_in_progress_count -= 1
        end
        mutate_event_data(handler[:mutator], event) do |event_data|
          case handler[:type]
          when 'pipe'
            options = {:data => event_data, :timeout => handler[:timeout]}
            Spawn.process(handler[:command], options) do |output, status|
              output.each_line do |line|
                @logger.info('handler output', {
                  :handler => handler,
                  :output => line
                })
              end
              @handlers_in_progress_count -= 1
            end
          when 'tcp'
            begin
              EM::connect(handler[:socket][:host], handler[:socket][:port], SocketHandler) do |socket|
                socket.on_success = Proc.new do
                  @handlers_in_progress_count -= 1
                end
                socket.on_error = on_error
                timeout = handler[:timeout] || 10
                socket.pending_connect_timeout = timeout
                socket.comm_inactivity_timeout = timeout
                socket.send_data(event_data.to_s)
                socket.close_connection_after_writing
              end
            rescue => error
              on_error.call(error)
            end
          when 'udp'
            begin
              EM::open_datagram_socket('0.0.0.0', 0, nil) do |socket|
                socket.send_datagram(event_data.to_s, handler[:socket][:host], handler[:socket][:port])
                socket.close_connection_after_writing
                @handlers_in_progress_count -= 1
              end
            rescue => error
              on_error.call(error)
            end
          when 'transport'
            unless event_data.empty?
              pipe = handler[:pipe]
              @transport.publish(pipe[:type].to_sym, pipe[:name], event_data, pipe[:options] || Hash.new) do |info|
                if info[:error]
                  @logger.fatal('failed to publish event data to the transport', {
                    :pipe => pipe,
                    :payload => event_data,
                    :error => info[:error].to_s
                  })
                end
              end
            end
            @handlers_in_progress_count -= 1
          when 'extension'
            handler.safe_run(event_data) do |output, status|
              output.each_line do |line|
                @logger.info('handler extension output', {
                  :extension => handler.definition,
                  :output => line
                })
              end
              @handlers_in_progress_count -= 1
            end
          end
        end
      end
    end

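    # Add a check result to its aggregate, keyed by check name and issue
    # timestamp: the result output/status is stored per client under
    # 'aggregation:<set>' and per-severity plus total counters are
    # incremented under 'aggregate:<set>'.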
    def aggregate_result(result)
      @logger.debug('adding result to aggregate', {
        :result => result
      })
      check = result[:check]
      result_set = check[:name] + ':' + check[:issued].to_s
      @redis.hset('aggregation:' + result_set, result[:client], MultiJson.dump(
        :output => check[:output],
        :status => check[:status]
      )) do
        SEVERITIES.each do |severity|
          @redis.hsetnx('aggregate:' + result_set, severity, 0)
        end
        severity = (SEVERITIES[check[:status]] || 'unknown')
        @redis.hincrby('aggregate:' + result_set, severity, 1) do
          @redis.hincrby('aggregate:' + result_set, 'total', 1) do
            @redis.sadd('aggregates:' + check[:name], check[:issued]) do
              @redis.sadd('aggregates', check[:name])
            end
          end
        end
      end
    end

    def event_bridges(event)
      @extensions[:bridges].each do |name, bridge|
        bridge.safe_run(event) do |output, status|
          output.each_line do |line|
            @logger.info('bridge extension output', {
              :extension => bridge.definition,
              :output => line
            })
          end
        end
      end
    end

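    # Process a check result: merge in the check definition (unless the check
    # is standalone), update the last 21 statuses of check history, compute
    # the percent state change for flap detection, and create, update, or
    # resolve the corresponding event, handing it to handlers and bridge
    # extensions as appropriate.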
    def process_result(result)
      @logger.debug('processing result', {
        :result => result
      })
      @redis.get('client:' + result[:client]) do |client_json|
        unless client_json.nil?
          client = MultiJson.load(client_json)
          check = case
          when @settings.check_exists?(result[:check][:name]) && !result[:check][:standalone]
            @settings[:checks][result[:check][:name]].merge(result[:check])
          else
            result[:check]
          end
          if check[:aggregate]
            aggregate_result(result)
          end
          @redis.sadd('history:' + client[:name], check[:name])
          history_key = 'history:' + client[:name] + ':' + check[:name]
          @redis.rpush(history_key, check[:status]) do
            execution_key = 'execution:' + client[:name] + ':' + check[:name]
            @redis.set(execution_key, check[:executed])
            @redis.lrange(history_key, -21, -1) do |history|
              check[:history] = history
              total_state_change = 0
              unless history.size < 21
                state_changes = 0
                change_weight = 0.8
                previous_status = history.first
                history.each do |status|
                  unless status == previous_status
                    state_changes += change_weight
                  end
                  change_weight += 0.02
                  previous_status = status
                end
                total_state_change = (state_changes.fdiv(20) * 100).to_i
                @redis.ltrim(history_key, -21, -1)
              end
              @redis.hget('events:' + client[:name], check[:name]) do |event_json|
                previous_occurrence = event_json ? MultiJson.load(event_json) : false
                is_flapping = false
                if check.has_key?(:low_flap_threshold) && check.has_key?(:high_flap_threshold)
                  was_flapping = previous_occurrence && previous_occurrence[:action] == 'flapping'
                  is_flapping = case
                  when total_state_change >= check[:high_flap_threshold]
                    true
                  when was_flapping && total_state_change <= check[:low_flap_threshold]
                    false
                  else
                    was_flapping
                  end
                end
                event = {
                  :id => random_uuid,
                  :client => client,
                  :check => check,
                  :occurrences => 1
                }
                if check[:status] != 0 || is_flapping
                  if previous_occurrence && check[:status] == previous_occurrence[:status]
                    event[:occurrences] = previous_occurrence[:occurrences] + 1
                  end
                  event[:action] = is_flapping ? :flapping : :create
                  @redis.hset('events:' + client[:name], check[:name], MultiJson.dump(event)) do
                    unless check[:handle] == false
                      handle_event(event)
                    end
                  end
                elsif previous_occurrence
                  event[:occurrences] = previous_occurrence[:occurrences]
                  event[:action] = :resolve
                  unless check[:auto_resolve] == false && !check[:force_resolve]
                    @redis.hdel('events:' + client[:name], check[:name]) do
                      unless check[:handle] == false
                        handle_event(event)
                      end
                    end
                  end
                elsif check[:type] == 'metric'
                  handle_event(event)
                end
                event_bridges(event)
              end
            end
          end
        end
      end
    end

    def setup_results
      @logger.debug('subscribing to results')
      @transport.subscribe(:direct, 'results', 'results', :ack => true) do |message_info, message|
        result = MultiJson.load(message)
        @logger.debug('received result', {
          :result => result
        })
        process_result(result)
        EM::next_tick do
          @transport.ack(message_info)
        end
      end
    end

    def check_request_subdued?(check)
      if check[:subdue] && check[:subdue][:at] == 'publisher'
        action_subdued?(check[:subdue])
      else
        false
      end
    end

    def publish_check_request(check)
      payload = {
        :name => check[:name],
        :issued => Time.now.to_i
      }
      if check.has_key?(:command)
        payload[:command] = check[:command]
      end
      @logger.info('publishing check request', {
        :payload => payload,
        :subscribers => check[:subscribers]
      })
      check[:subscribers].each do |subscription|
        @transport.publish(:fanout, subscription, MultiJson.dump(payload)) do |info|
          if info[:error]
            @logger.error('failed to publish check request', {
              :subscription => subscription,
              :payload => payload,
              :error => info[:error].to_s
            })
          end
        end
      end
    end

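    # Schedule check request publishing: stagger the initial timers (2 seconds
    # per check, modulo 30, unless testing), then publish on each check's
    # interval unless the request is subdued at the publisher.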
    def schedule_checks(checks)
      check_count = 0
      stagger = testing? ? 0 : 2
      checks.each do |check|
        check_count += 1
        scheduling_delay = stagger * check_count % 30
        @timers[:master] << EM::Timer.new(scheduling_delay) do
          interval = testing? ? 0.5 : check[:interval]
          @timers[:master] << EM::PeriodicTimer.new(interval) do
            unless check_request_subdued?(check)
              publish_check_request(check)
            else
              @logger.info('check request was subdued', {
                :check => check
              })
            end
          end
        end
      end
    end

    def setup_publisher
      @logger.debug('scheduling check requests')
      standard_checks = @settings.checks.reject do |check|
        check[:standalone] || check[:publish] == false
      end
      extension_checks = @extensions.checks.reject do |check|
        check[:standalone] || check[:publish] == false || !check[:interval].is_a?(Integer)
      end
      schedule_checks(standard_checks + extension_checks)
    end

    def publish_result(client, check)
      payload = {
        :client => client[:name],
        :check => check
      }
      @logger.debug('publishing check result', {
        :payload => payload
      })
      @transport.publish(:direct, 'results', MultiJson.dump(payload)) do |info|
        if info[:error]
          @logger.error('failed to publish check result', {
            :payload => payload,
            :error => info[:error].to_s
          })
        end
      end
    end

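    # Publish a keepalive check result for every known client, using warning
    # and critical thresholds (120s/180s by default, overridable per client
    # via its :keepalive attributes) based on the time since the client's
    # last keepalive.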
    def determine_stale_clients
      @logger.info('determining stale clients')
      @redis.smembers('clients') do |clients|
        clients.each do |client_name|
          @redis.get('client:' + client_name) do |client_json|
            unless client_json.nil?
              client = MultiJson.load(client_json)
              check = {
                :thresholds => {
                  :warning => 120,
                  :critical => 180
                }
              }
              if client.has_key?(:keepalive)
                check = deep_merge(check, client[:keepalive])
              end
              check[:name] = 'keepalive'
              check[:issued] = Time.now.to_i
              check[:executed] = Time.now.to_i
              time_since_last_keepalive = Time.now.to_i - client[:timestamp]
              case
              when time_since_last_keepalive >= check[:thresholds][:critical]
                check[:output] = 'No keep-alive sent from client in over '
                check[:output] << check[:thresholds][:critical].to_s + ' seconds'
                check[:status] = 2
              when time_since_last_keepalive >= check[:thresholds][:warning]
                check[:output] = 'No keep-alive sent from client in over '
                check[:output] << check[:thresholds][:warning].to_s + ' seconds'
                check[:status] = 1
              else
                check[:output] = 'Keep-alive sent from client less than '
                check[:output] << check[:thresholds][:warning].to_s + ' seconds ago'
                check[:status] = 0
              end
              publish_result(client, check)
            end
          end
        end
      end
    end

    def setup_client_monitor
      @logger.debug('monitoring clients')
      @timers[:master] << EM::PeriodicTimer.new(30) do
        determine_stale_clients
      end
    end

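    # Prune aggregations, keeping only the 20 most recent result sets per
    # check and deleting the associated aggregate and aggregation keys.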
    def prune_aggregations
      @logger.info('pruning aggregations')
      @redis.smembers('aggregates') do |checks|
        checks.each do |check_name|
          @redis.smembers('aggregates:' + check_name) do |aggregates|
            if aggregates.size > 20
              aggregates.sort!
              aggregates.take(aggregates.size - 20).each do |check_issued|
                @redis.srem('aggregates:' + check_name, check_issued) do
                  result_set = check_name + ':' + check_issued.to_s
                  @redis.del('aggregate:' + result_set) do
                    @redis.del('aggregation:' + result_set) do
                      @logger.debug('pruned aggregation', {
                        :check => {
                          :name => check_name,
                          :issued => check_issued
                        }
                      })
                    end
                  end
                end
              end
            end
          end
        end
      end
    end

    def setup_aggregation_pruner
      @logger.debug('pruning aggregations')
      @timers[:master] << EM::PeriodicTimer.new(20) do
        prune_aggregations
      end
    end

    def master_duties
      setup_publisher
      setup_client_monitor
      setup_aggregation_pruner
    end

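    # Attempt to become the master server by acquiring the 'lock:master' key
    # in Redis, taking over a stale lock if its timestamp is more than 60
    # seconds old. The master schedules check requests, monitors clients, and
    # prunes aggregations.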
    def request_master_election
      @redis.setnx('lock:master', Time.now.to_i) do |created|
        if created
          @is_master = true
          @logger.info('i am the master')
          master_duties
        else
          @redis.get('lock:master') do |timestamp|
            if Time.now.to_i - timestamp.to_i >= 60
              @redis.getset('lock:master', Time.now.to_i) do |previous|
                if previous == timestamp
                  @is_master = true
                  @logger.info('i am now the master')
                  master_duties
                end
              end
            end
          end
        end
      end
    end

    def setup_master_monitor
      request_master_election
      @timers[:run] << EM::PeriodicTimer.new(20) do
        if @is_master
          @redis.set('lock:master', Time.now.to_i) do
            @logger.debug('updated master lock timestamp')
          end
        else
          request_master_election
        end
      end
    end

    def resign_as_master(&block)
      block ||= Proc.new {}
      if @is_master
        @logger.warn('resigning as master')
        @timers[:master].each do |timer|
          timer.cancel
        end
        @timers[:master].clear
        if @redis.connected?
          @redis.del('lock:master') do
            @logger.info('removed master lock')
            @is_master = false
          end
        end
        timestamp = Time.now.to_i
        retry_until_true do
          if !@is_master
            block.call
            true
          elsif Time.now.to_i - timestamp >= 3
            @logger.warn('failed to remove master lock')
            @is_master = false
            block.call
            true
          end
        end
      else
        @logger.debug('not currently master')
        block.call
      end
    end

    def unsubscribe
      @logger.warn('unsubscribing from keepalive and result queues')
      @transport.unsubscribe
    end

    def complete_handlers_in_progress(&block)
      @logger.info('completing handlers in progress', {
        :handlers_in_progress_count => @handlers_in_progress_count
      })
      retry_until_true do
        if @handlers_in_progress_count == 0
          block.call
          true
        end
      end
    end

    def bootstrap
      setup_keepalives
      setup_results
      setup_master_monitor
      @state = :running
    end

    def start
      setup_redis
      setup_transport
      bootstrap
    end

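    # Pause operation: cancel run timers, unsubscribe from the keepalive and
    # result queues, and resign as master before marking the state as paused
    # and calling the provided block.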
    def pause(&block)
      unless @state == :pausing || @state == :paused
        @state = :pausing
        @timers[:run].each do |timer|
          timer.cancel
        end
        @timers[:run].clear
        unsubscribe
        resign_as_master do
          @state = :paused
          if block
            block.call
          end
        end
      end
    end

    def resume
      retry_until_true(1) do
        if @state == :paused
          if @redis.connected? && @transport.connected?
            bootstrap
            true
          end
        end
      end
    end

    def stop
      @logger.warn('stopping')
      @state = :stopping
      pause do
        complete_handlers_in_progress do
          @redis.close
          @transport.close
          super
        end
      end
    end
  end
end
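For reference, a hedged sketch of the check and handler attributes the server logic above reads (scheduling, subdue, severities, flap handling). The names and values below are illustrative assumptions; real definitions are supplied through the loaded settings (@settings), not written inline like this.

# Hypothetical definitions, named only for illustration.
check = {
  :name => 'example_check',
  :command => 'check-example.rb',       # copied into the published check request
  :interval => 60,                      # seconds between check requests
  :subscribers => ['example'],          # fanout subscriptions the request is published to
  :subdue => {:at => 'publisher', :begin => '10:00 PM', :end => '11:30 PM'},
  :handlers => ['example_handler']
}

handler = {
  :type => 'pipe',
  :command => 'handle-example.rb',
  :severities => ['critical'],          # only handle events with these severities
  :handle_flapping => false             # skip events with the :flapping action
}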