sensu 0.17.0.beta → 0.17.0.beta.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -1
- data/bin/sensu-api +4 -4
- data/bin/sensu-client +4 -4
- data/bin/sensu-server +4 -4
- data/lib/sensu/api/process.rb +704 -0
- data/lib/sensu/cli.rb +21 -15
- data/lib/sensu/client/process.rb +414 -0
- data/lib/sensu/client/socket.rb +226 -0
- data/lib/sensu/constants.rb +4 -1
- data/lib/sensu/daemon.rb +125 -73
- data/lib/sensu/redis.rb +10 -5
- data/lib/sensu/server/filter.rb +309 -0
- data/lib/sensu/server/handle.rb +168 -0
- data/lib/sensu/server/mutate.rb +92 -0
- data/lib/sensu/server/process.rb +811 -0
- data/lib/sensu/server/sandbox.rb +21 -0
- data/lib/sensu/server/socket.rb +42 -0
- data/lib/sensu/utilities.rb +29 -3
- data/sensu.gemspec +29 -28
- metadata +30 -12
- data/lib/sensu/api.rb +0 -704
- data/lib/sensu/client.rb +0 -298
- data/lib/sensu/sandbox.rb +0 -11
- data/lib/sensu/server.rb +0 -772
- data/lib/sensu/socket.rb +0 -246
data/lib/sensu/server.rb
DELETED
@@ -1,772 +0,0 @@
|
|
1
|
-
require 'sensu/daemon'
|
2
|
-
require 'sensu/socket'
|
3
|
-
require 'sensu/sandbox'
|
4
|
-
|
5
|
-
module Sensu
|
6
|
-
class Server
|
7
|
-
include Daemon
|
8
|
-
|
9
|
-
attr_reader :is_master
|
10
|
-
|
11
|
-
def self.run(options={})
|
12
|
-
server = self.new(options)
|
13
|
-
EM::run do
|
14
|
-
server.start
|
15
|
-
server.setup_signal_traps
|
16
|
-
end
|
17
|
-
end
|
18
|
-
|
19
|
-
def initialize(options={})
|
20
|
-
super
|
21
|
-
@is_master = false
|
22
|
-
@timers[:master] = Array.new
|
23
|
-
@handlers_in_progress_count = 0
|
24
|
-
end
|
25
|
-
|
26
|
-
def setup_keepalives
|
27
|
-
@logger.debug('subscribing to keepalives')
|
28
|
-
@transport.subscribe(:direct, 'keepalives', 'keepalives', :ack => true) do |message_info, message|
|
29
|
-
begin
|
30
|
-
client = MultiJson.load(message)
|
31
|
-
@logger.debug('received keepalive', {
|
32
|
-
:client => client
|
33
|
-
})
|
34
|
-
@redis.set('client:' + client[:name], MultiJson.dump(client)) do
|
35
|
-
@redis.sadd('clients', client[:name]) do
|
36
|
-
@transport.ack(message_info)
|
37
|
-
end
|
38
|
-
end
|
39
|
-
rescue MultiJson::ParseError => error
|
40
|
-
@logger.error('failed to parse keepalive payload', {
|
41
|
-
:message => message,
|
42
|
-
:error => error.to_s
|
43
|
-
})
|
44
|
-
@transport.ack(message_info)
|
45
|
-
end
|
46
|
-
end
|
47
|
-
end
|
48
|
-
|
49
|
-
def action_subdued?(condition)
|
50
|
-
subdued = false
|
51
|
-
if condition.has_key?(:begin) && condition.has_key?(:end)
|
52
|
-
begin_time = Time.parse(condition[:begin])
|
53
|
-
end_time = Time.parse(condition[:end])
|
54
|
-
if end_time < begin_time
|
55
|
-
if Time.now < end_time
|
56
|
-
begin_time = Time.parse('12:00:00 AM')
|
57
|
-
else
|
58
|
-
end_time = Time.parse('11:59:59 PM')
|
59
|
-
end
|
60
|
-
end
|
61
|
-
if Time.now >= begin_time && Time.now <= end_time
|
62
|
-
subdued = true
|
63
|
-
end
|
64
|
-
end
|
65
|
-
if condition.has_key?(:days)
|
66
|
-
days = condition[:days].map(&:downcase)
|
67
|
-
if days.include?(Time.now.strftime('%A').downcase)
|
68
|
-
subdued = true
|
69
|
-
end
|
70
|
-
end
|
71
|
-
if subdued && condition.has_key?(:exceptions)
|
72
|
-
subdued = condition[:exceptions].none? do |exception|
|
73
|
-
Time.now >= Time.parse(exception[:begin]) && Time.now <= Time.parse(exception[:end])
|
74
|
-
end
|
75
|
-
end
|
76
|
-
subdued
|
77
|
-
end
|
78
|
-
|
79
|
-
def handler_subdued?(handler, check)
|
80
|
-
subdued = Array.new
|
81
|
-
if handler[:subdue]
|
82
|
-
subdued << action_subdued?(handler[:subdue])
|
83
|
-
end
|
84
|
-
if check[:subdue] && check[:subdue][:at] != 'publisher'
|
85
|
-
subdued << action_subdued?(check[:subdue])
|
86
|
-
end
|
87
|
-
subdued.any?
|
88
|
-
end
|
89
|
-
|
90
|
-
def filter_attributes_match?(hash_one, hash_two)
|
91
|
-
hash_one.keys.all? do |key|
|
92
|
-
case
|
93
|
-
when hash_one[key] == hash_two[key]
|
94
|
-
true
|
95
|
-
when hash_one[key].is_a?(Hash) && hash_two[key].is_a?(Hash)
|
96
|
-
filter_attributes_match?(hash_one[key], hash_two[key])
|
97
|
-
when hash_one[key].to_s == hash_two[key].to_s
|
98
|
-
true
|
99
|
-
when hash_one[key].is_a?(String) && hash_one[key].start_with?('eval:')
|
100
|
-
begin
|
101
|
-
expression = hash_one[key].gsub(/^eval:(\s+)?/, '')
|
102
|
-
!!Sandbox.eval(expression, hash_two[key])
|
103
|
-
rescue => error
|
104
|
-
@logger.error('filter eval error', {
|
105
|
-
:attributes => [hash_one, hash_two],
|
106
|
-
:error => error.to_s
|
107
|
-
})
|
108
|
-
false
|
109
|
-
end
|
110
|
-
else
|
111
|
-
false
|
112
|
-
end
|
113
|
-
end
|
114
|
-
end
|
115
|
-
|
116
|
-
def event_filtered?(filter_name, event)
|
117
|
-
if @settings.filter_exists?(filter_name)
|
118
|
-
filter = @settings[:filters][filter_name]
|
119
|
-
matched = filter_attributes_match?(filter[:attributes], event)
|
120
|
-
filter[:negate] ? matched : !matched
|
121
|
-
else
|
122
|
-
@logger.error('unknown filter', {
|
123
|
-
:filter_name => filter_name
|
124
|
-
})
|
125
|
-
false
|
126
|
-
end
|
127
|
-
end
|
128
|
-
|
129
|
-
def derive_handlers(handler_list, depth=0)
|
130
|
-
handler_list.compact.inject(Array.new) do |handlers, handler_name|
|
131
|
-
if @settings.handler_exists?(handler_name)
|
132
|
-
handler = @settings[:handlers][handler_name].merge(:name => handler_name)
|
133
|
-
if handler[:type] == 'set'
|
134
|
-
if depth < 2
|
135
|
-
handlers = handlers + derive_handlers(handler[:handlers], depth + 1)
|
136
|
-
else
|
137
|
-
@logger.error('handler sets cannot be deeply nested', {
|
138
|
-
:handler => handler
|
139
|
-
})
|
140
|
-
end
|
141
|
-
else
|
142
|
-
handlers << handler
|
143
|
-
end
|
144
|
-
elsif @extensions.handler_exists?(handler_name)
|
145
|
-
handlers << @extensions[:handlers][handler_name]
|
146
|
-
else
|
147
|
-
@logger.error('unknown handler', {
|
148
|
-
:handler_name => handler_name
|
149
|
-
})
|
150
|
-
end
|
151
|
-
handlers.uniq
|
152
|
-
end
|
153
|
-
end
|
154
|
-
|
155
|
-
def event_handlers(event)
|
156
|
-
handler_list = Array((event[:check][:handlers] || event[:check][:handler]) || 'default')
|
157
|
-
handlers = derive_handlers(handler_list)
|
158
|
-
handlers.select do |handler|
|
159
|
-
if event[:action] == :flapping && !handler[:handle_flapping]
|
160
|
-
@logger.info('handler does not handle flapping events', {
|
161
|
-
:event => event,
|
162
|
-
:handler => handler
|
163
|
-
})
|
164
|
-
next
|
165
|
-
end
|
166
|
-
if handler_subdued?(handler, event[:check])
|
167
|
-
@logger.info('handler is subdued', {
|
168
|
-
:event => event,
|
169
|
-
:handler => handler
|
170
|
-
})
|
171
|
-
next
|
172
|
-
end
|
173
|
-
if handler.has_key?(:severities)
|
174
|
-
handle = case event[:action]
|
175
|
-
when :resolve
|
176
|
-
event[:check][:history].reverse[1..-1].any? do |status|
|
177
|
-
if status.to_i == 0
|
178
|
-
break
|
179
|
-
end
|
180
|
-
severity = SEVERITIES[status.to_i] || 'unknown'
|
181
|
-
handler[:severities].include?(severity)
|
182
|
-
end
|
183
|
-
else
|
184
|
-
severity = SEVERITIES[event[:check][:status]] || 'unknown'
|
185
|
-
handler[:severities].include?(severity)
|
186
|
-
end
|
187
|
-
unless handle
|
188
|
-
@logger.debug('handler does not handle event severity', {
|
189
|
-
:event => event,
|
190
|
-
:handler => handler
|
191
|
-
})
|
192
|
-
next
|
193
|
-
end
|
194
|
-
end
|
195
|
-
if handler.has_key?(:filters) || handler.has_key?(:filter)
|
196
|
-
filter_list = Array(handler[:filters] || handler[:filter])
|
197
|
-
filtered = filter_list.any? do |filter_name|
|
198
|
-
event_filtered?(filter_name, event)
|
199
|
-
end
|
200
|
-
if filtered
|
201
|
-
@logger.info('event filtered for handler', {
|
202
|
-
:event => event,
|
203
|
-
:handler => handler
|
204
|
-
})
|
205
|
-
next
|
206
|
-
end
|
207
|
-
end
|
208
|
-
true
|
209
|
-
end
|
210
|
-
end
|
211
|
-
|
212
|
-
def mutate_event_data(mutator_name, event, &block)
|
213
|
-
mutator_name ||= 'json'
|
214
|
-
return_output = Proc.new do |output, status|
|
215
|
-
if status == 0
|
216
|
-
block.dup.call(output)
|
217
|
-
else
|
218
|
-
@logger.error('mutator error', {
|
219
|
-
:event => event,
|
220
|
-
:output => output,
|
221
|
-
:status => status
|
222
|
-
})
|
223
|
-
@handlers_in_progress_count -= 1
|
224
|
-
end
|
225
|
-
end
|
226
|
-
@logger.debug('mutating event data', {
|
227
|
-
:event => event,
|
228
|
-
:mutator_name => mutator_name
|
229
|
-
})
|
230
|
-
case
|
231
|
-
when @settings.mutator_exists?(mutator_name)
|
232
|
-
mutator = @settings[:mutators][mutator_name]
|
233
|
-
options = {:data => MultiJson.dump(event), :timeout => mutator[:timeout]}
|
234
|
-
Spawn.process(mutator[:command], options, &return_output)
|
235
|
-
when @extensions.mutator_exists?(mutator_name)
|
236
|
-
extension = @extensions[:mutators][mutator_name]
|
237
|
-
extension.safe_run(event, &return_output)
|
238
|
-
else
|
239
|
-
@logger.error('unknown mutator', {
|
240
|
-
:mutator_name => mutator_name
|
241
|
-
})
|
242
|
-
@handlers_in_progress_count -= 1
|
243
|
-
end
|
244
|
-
end
|
245
|
-
|
246
|
-
def handle_event(event)
|
247
|
-
handlers = event_handlers(event)
|
248
|
-
handlers.each do |handler|
|
249
|
-
log_level = event[:check][:type] == 'metric' ? :debug : :info
|
250
|
-
@logger.send(log_level, 'handling event', {
|
251
|
-
:event => event,
|
252
|
-
:handler => handler.respond_to?(:definition) ? handler.definition : handler
|
253
|
-
})
|
254
|
-
@handlers_in_progress_count += 1
|
255
|
-
on_error = Proc.new do |error|
|
256
|
-
@logger.error('handler error', {
|
257
|
-
:event => event,
|
258
|
-
:handler => handler,
|
259
|
-
:error => error.to_s
|
260
|
-
})
|
261
|
-
@handlers_in_progress_count -= 1
|
262
|
-
end
|
263
|
-
mutate_event_data(handler[:mutator], event) do |event_data|
|
264
|
-
case handler[:type]
|
265
|
-
when 'pipe'
|
266
|
-
options = {:data => event_data, :timeout => handler[:timeout]}
|
267
|
-
Spawn.process(handler[:command], options) do |output, status|
|
268
|
-
@logger.info('handler output', {
|
269
|
-
:handler => handler,
|
270
|
-
:output => output.lines,
|
271
|
-
:event_id => event[:id]
|
272
|
-
})
|
273
|
-
@handlers_in_progress_count -= 1
|
274
|
-
end
|
275
|
-
when 'tcp'
|
276
|
-
begin
|
277
|
-
EM::connect(handler[:socket][:host], handler[:socket][:port], SocketHandler) do |socket|
|
278
|
-
socket.on_success = Proc.new do
|
279
|
-
@handlers_in_progress_count -= 1
|
280
|
-
end
|
281
|
-
socket.on_error = on_error
|
282
|
-
timeout = handler[:timeout] || 10
|
283
|
-
socket.pending_connect_timeout = timeout
|
284
|
-
socket.comm_inactivity_timeout = timeout
|
285
|
-
socket.send_data(event_data.to_s)
|
286
|
-
socket.close_connection_after_writing
|
287
|
-
end
|
288
|
-
rescue => error
|
289
|
-
on_error.call(error)
|
290
|
-
end
|
291
|
-
when 'udp'
|
292
|
-
begin
|
293
|
-
EM::open_datagram_socket('0.0.0.0', 0, nil) do |socket|
|
294
|
-
socket.send_datagram(event_data.to_s, handler[:socket][:host], handler[:socket][:port])
|
295
|
-
socket.close_connection_after_writing
|
296
|
-
@handlers_in_progress_count -= 1
|
297
|
-
end
|
298
|
-
rescue => error
|
299
|
-
on_error.call(error)
|
300
|
-
end
|
301
|
-
when 'transport'
|
302
|
-
unless event_data.empty?
|
303
|
-
pipe = handler[:pipe]
|
304
|
-
@transport.publish(pipe[:type].to_sym, pipe[:name], event_data, pipe[:options] || Hash.new) do |info|
|
305
|
-
if info[:error]
|
306
|
-
@logger.fatal('failed to publish event data to the transport', {
|
307
|
-
:pipe => pipe,
|
308
|
-
:payload => event_data,
|
309
|
-
:error => info[:error].to_s
|
310
|
-
})
|
311
|
-
end
|
312
|
-
end
|
313
|
-
end
|
314
|
-
@handlers_in_progress_count -= 1
|
315
|
-
when 'extension'
|
316
|
-
handler.safe_run(event_data) do |output, status|
|
317
|
-
@logger.info('handler extension output', {
|
318
|
-
:extension => handler.definition,
|
319
|
-
:output => output,
|
320
|
-
:event_id => event[:id]
|
321
|
-
})
|
322
|
-
@handlers_in_progress_count -= 1
|
323
|
-
end
|
324
|
-
end
|
325
|
-
end
|
326
|
-
end
|
327
|
-
end
|
328
|
-
|
329
|
-
def aggregate_result(result)
|
330
|
-
@logger.debug('adding result to aggregate', {
|
331
|
-
:result => result
|
332
|
-
})
|
333
|
-
check = result[:check]
|
334
|
-
result_set = check[:name] + ':' + check[:issued].to_s
|
335
|
-
@redis.hset('aggregation:' + result_set, result[:client], MultiJson.dump(
|
336
|
-
:output => check[:output],
|
337
|
-
:status => check[:status]
|
338
|
-
)) do
|
339
|
-
SEVERITIES.each do |severity|
|
340
|
-
@redis.hsetnx('aggregate:' + result_set, severity, 0)
|
341
|
-
end
|
342
|
-
severity = (SEVERITIES[check[:status]] || 'unknown')
|
343
|
-
@redis.hincrby('aggregate:' + result_set, severity, 1) do
|
344
|
-
@redis.hincrby('aggregate:' + result_set, 'total', 1) do
|
345
|
-
@redis.sadd('aggregates:' + check[:name], check[:issued]) do
|
346
|
-
@redis.sadd('aggregates', check[:name])
|
347
|
-
end
|
348
|
-
end
|
349
|
-
end
|
350
|
-
end
|
351
|
-
end
|
352
|
-
|
353
|
-
def event_bridges(event)
|
354
|
-
@extensions[:bridges].each do |name, bridge|
|
355
|
-
bridge.safe_run(event) do |output, status|
|
356
|
-
output.each_line do |line|
|
357
|
-
@logger.debug('bridge extension output', {
|
358
|
-
:extension => bridge.definition,
|
359
|
-
:output => line
|
360
|
-
})
|
361
|
-
end
|
362
|
-
end
|
363
|
-
end
|
364
|
-
end
|
365
|
-
|
366
|
-
def process_result(result)
|
367
|
-
@logger.debug('processing result', {
|
368
|
-
:result => result
|
369
|
-
})
|
370
|
-
@redis.get('client:' + result[:client]) do |client_json|
|
371
|
-
unless client_json.nil?
|
372
|
-
client = MultiJson.load(client_json)
|
373
|
-
check = case
|
374
|
-
when @settings.check_exists?(result[:check][:name]) && !result[:check][:standalone]
|
375
|
-
@settings[:checks][result[:check][:name]].merge(result[:check])
|
376
|
-
else
|
377
|
-
result[:check]
|
378
|
-
end
|
379
|
-
if check[:aggregate]
|
380
|
-
aggregate_result(result)
|
381
|
-
end
|
382
|
-
@redis.sadd('history:' + client[:name], check[:name])
|
383
|
-
history_key = 'history:' + client[:name] + ':' + check[:name]
|
384
|
-
@redis.rpush(history_key, check[:status]) do
|
385
|
-
execution_key = 'execution:' + client[:name] + ':' + check[:name]
|
386
|
-
@redis.set(execution_key, check[:executed])
|
387
|
-
@redis.lrange(history_key, -21, -1) do |history|
|
388
|
-
check[:history] = history
|
389
|
-
total_state_change = 0
|
390
|
-
unless history.size < 21
|
391
|
-
state_changes = 0
|
392
|
-
change_weight = 0.8
|
393
|
-
previous_status = history.first
|
394
|
-
history.each do |status|
|
395
|
-
unless status == previous_status
|
396
|
-
state_changes += change_weight
|
397
|
-
end
|
398
|
-
change_weight += 0.02
|
399
|
-
previous_status = status
|
400
|
-
end
|
401
|
-
total_state_change = (state_changes.fdiv(20) * 100).to_i
|
402
|
-
@redis.ltrim(history_key, -21, -1)
|
403
|
-
end
|
404
|
-
@redis.hget('events:' + client[:name], check[:name]) do |event_json|
|
405
|
-
previous_occurrence = event_json ? MultiJson.load(event_json) : false
|
406
|
-
is_flapping = false
|
407
|
-
if check.has_key?(:low_flap_threshold) && check.has_key?(:high_flap_threshold)
|
408
|
-
was_flapping = previous_occurrence && previous_occurrence[:action] == 'flapping'
|
409
|
-
is_flapping = case
|
410
|
-
when total_state_change >= check[:high_flap_threshold]
|
411
|
-
true
|
412
|
-
when was_flapping && total_state_change <= check[:low_flap_threshold]
|
413
|
-
false
|
414
|
-
else
|
415
|
-
was_flapping
|
416
|
-
end
|
417
|
-
end
|
418
|
-
event = {
|
419
|
-
:id => random_uuid,
|
420
|
-
:client => client,
|
421
|
-
:check => check,
|
422
|
-
:occurrences => 1
|
423
|
-
}
|
424
|
-
if check[:status] != 0 || is_flapping
|
425
|
-
if previous_occurrence && check[:status] == previous_occurrence[:check][:status]
|
426
|
-
event[:occurrences] = previous_occurrence[:occurrences] + 1
|
427
|
-
end
|
428
|
-
event[:action] = is_flapping ? :flapping : :create
|
429
|
-
@redis.hset('events:' + client[:name], check[:name], MultiJson.dump(event)) do
|
430
|
-
unless check[:handle] == false
|
431
|
-
handle_event(event)
|
432
|
-
end
|
433
|
-
end
|
434
|
-
elsif previous_occurrence
|
435
|
-
event[:occurrences] = previous_occurrence[:occurrences]
|
436
|
-
event[:action] = :resolve
|
437
|
-
unless check[:auto_resolve] == false && !check[:force_resolve]
|
438
|
-
@redis.hdel('events:' + client[:name], check[:name]) do
|
439
|
-
unless check[:handle] == false
|
440
|
-
handle_event(event)
|
441
|
-
end
|
442
|
-
end
|
443
|
-
end
|
444
|
-
elsif check[:type] == 'metric'
|
445
|
-
handle_event(event)
|
446
|
-
end
|
447
|
-
event_bridges(event)
|
448
|
-
end
|
449
|
-
end
|
450
|
-
end
|
451
|
-
end
|
452
|
-
end
|
453
|
-
end
|
454
|
-
|
455
|
-
def setup_results
|
456
|
-
@logger.debug('subscribing to results')
|
457
|
-
@transport.subscribe(:direct, 'results', 'results', :ack => true) do |message_info, message|
|
458
|
-
begin
|
459
|
-
result = MultiJson.load(message)
|
460
|
-
@logger.debug('received result', {
|
461
|
-
:result => result
|
462
|
-
})
|
463
|
-
process_result(result)
|
464
|
-
rescue MultiJson::ParseError => error
|
465
|
-
@logger.error('failed to parse result payload', {
|
466
|
-
:message => message,
|
467
|
-
:error => error.to_s
|
468
|
-
})
|
469
|
-
end
|
470
|
-
EM::next_tick do
|
471
|
-
@transport.ack(message_info)
|
472
|
-
end
|
473
|
-
end
|
474
|
-
end
|
475
|
-
|
476
|
-
def check_request_subdued?(check)
|
477
|
-
if check[:subdue] && check[:subdue][:at] == 'publisher'
|
478
|
-
action_subdued?(check[:subdue])
|
479
|
-
else
|
480
|
-
false
|
481
|
-
end
|
482
|
-
end
|
483
|
-
|
484
|
-
def publish_check_request(check)
|
485
|
-
payload = {
|
486
|
-
:name => check[:name],
|
487
|
-
:issued => Time.now.to_i
|
488
|
-
}
|
489
|
-
if check.has_key?(:command)
|
490
|
-
payload[:command] = check[:command]
|
491
|
-
end
|
492
|
-
@logger.info('publishing check request', {
|
493
|
-
:payload => payload,
|
494
|
-
:subscribers => check[:subscribers]
|
495
|
-
})
|
496
|
-
check[:subscribers].each do |subscription|
|
497
|
-
@transport.publish(:fanout, subscription, MultiJson.dump(payload)) do |info|
|
498
|
-
if info[:error]
|
499
|
-
@logger.error('failed to publish check request', {
|
500
|
-
:subscription => subscription,
|
501
|
-
:payload => payload,
|
502
|
-
:error => info[:error].to_s
|
503
|
-
})
|
504
|
-
end
|
505
|
-
end
|
506
|
-
end
|
507
|
-
end
|
508
|
-
|
509
|
-
def calculate_execution_splay(check)
|
510
|
-
splay_hash = Digest::MD5.digest(check[:name]).unpack('Q<').first
|
511
|
-
current_time = (Time.now.to_f * 1000).to_i
|
512
|
-
(splay_hash - current_time) % (check[:interval] * 1000) / 1000.0
|
513
|
-
end
|
514
|
-
|
515
|
-
def schedule_checks(checks)
|
516
|
-
checks.each do |check|
|
517
|
-
process_check_request = Proc.new do
|
518
|
-
unless check_request_subdued?(check)
|
519
|
-
publish_check_request(check)
|
520
|
-
else
|
521
|
-
@logger.info('check request was subdued', {
|
522
|
-
:check => check
|
523
|
-
})
|
524
|
-
end
|
525
|
-
end
|
526
|
-
execution_splay = testing? ? 0 : calculate_execution_splay(check)
|
527
|
-
interval = testing? ? 0.5 : check[:interval]
|
528
|
-
@timers[:master] << EM::Timer.new(execution_splay) do
|
529
|
-
process_check_request.call
|
530
|
-
@timers[:master] << EM::PeriodicTimer.new(interval, &process_check_request)
|
531
|
-
end
|
532
|
-
end
|
533
|
-
end
|
534
|
-
|
535
|
-
def setup_publisher
|
536
|
-
@logger.debug('scheduling check requests')
|
537
|
-
standard_checks = @settings.checks.reject do |check|
|
538
|
-
check[:standalone] || check[:publish] == false
|
539
|
-
end
|
540
|
-
extension_checks = @extensions.checks.reject do |check|
|
541
|
-
check[:standalone] || check[:publish] == false || !check[:interval].is_a?(Integer)
|
542
|
-
end
|
543
|
-
schedule_checks(standard_checks + extension_checks)
|
544
|
-
end
|
545
|
-
|
546
|
-
def publish_result(client, check)
|
547
|
-
payload = {
|
548
|
-
:client => client[:name],
|
549
|
-
:check => check
|
550
|
-
}
|
551
|
-
@logger.debug('publishing check result', {
|
552
|
-
:payload => payload
|
553
|
-
})
|
554
|
-
@transport.publish(:direct, 'results', MultiJson.dump(payload)) do |info|
|
555
|
-
if info[:error]
|
556
|
-
@logger.error('failed to publish check result', {
|
557
|
-
:payload => payload,
|
558
|
-
:error => info[:error].to_s
|
559
|
-
})
|
560
|
-
end
|
561
|
-
end
|
562
|
-
end
|
563
|
-
|
564
|
-
def determine_stale_clients
|
565
|
-
@logger.info('determining stale clients')
|
566
|
-
keepalive_check = {
|
567
|
-
:thresholds => {
|
568
|
-
:warning => 120,
|
569
|
-
:critical => 180
|
570
|
-
}
|
571
|
-
}
|
572
|
-
if @settings.handler_exists?(:keepalive)
|
573
|
-
keepalive_check[:handler] = "keepalive"
|
574
|
-
end
|
575
|
-
@redis.smembers('clients') do |clients|
|
576
|
-
clients.each do |client_name|
|
577
|
-
@redis.get('client:' + client_name) do |client_json|
|
578
|
-
unless client_json.nil?
|
579
|
-
client = MultiJson.load(client_json)
|
580
|
-
check = keepalive_check.dup
|
581
|
-
if client.has_key?(:keepalive)
|
582
|
-
check = deep_merge(check, client[:keepalive])
|
583
|
-
end
|
584
|
-
check[:name] = 'keepalive'
|
585
|
-
check[:issued] = Time.now.to_i
|
586
|
-
check[:executed] = Time.now.to_i
|
587
|
-
time_since_last_keepalive = Time.now.to_i - client[:timestamp]
|
588
|
-
check[:output] = 'No keepalive sent from client for '
|
589
|
-
check[:output] << time_since_last_keepalive.to_s + ' seconds'
|
590
|
-
case
|
591
|
-
when time_since_last_keepalive >= check[:thresholds][:critical]
|
592
|
-
check[:output] << ' (>=' + check[:thresholds][:critical].to_s + ')'
|
593
|
-
check[:status] = 2
|
594
|
-
when time_since_last_keepalive >= check[:thresholds][:warning]
|
595
|
-
check[:output] << ' (>=' + check[:thresholds][:warning].to_s + ')'
|
596
|
-
check[:status] = 1
|
597
|
-
else
|
598
|
-
check[:output] = 'Keepalive sent from client '
|
599
|
-
check[:output] << time_since_last_keepalive.to_s + ' seconds ago'
|
600
|
-
check[:status] = 0
|
601
|
-
end
|
602
|
-
publish_result(client, check)
|
603
|
-
end
|
604
|
-
end
|
605
|
-
end
|
606
|
-
end
|
607
|
-
end
|
608
|
-
|
609
|
-
def setup_client_monitor
|
610
|
-
@logger.debug('monitoring clients')
|
611
|
-
@timers[:master] << EM::PeriodicTimer.new(30) do
|
612
|
-
determine_stale_clients
|
613
|
-
end
|
614
|
-
end
|
615
|
-
|
616
|
-
def prune_aggregations
|
617
|
-
@logger.info('pruning aggregations')
|
618
|
-
@redis.smembers('aggregates') do |checks|
|
619
|
-
checks.each do |check_name|
|
620
|
-
@redis.smembers('aggregates:' + check_name) do |aggregates|
|
621
|
-
if aggregates.size > 20
|
622
|
-
aggregates.sort!
|
623
|
-
aggregates.take(aggregates.size - 20).each do |check_issued|
|
624
|
-
@redis.srem('aggregates:' + check_name, check_issued) do
|
625
|
-
result_set = check_name + ':' + check_issued.to_s
|
626
|
-
@redis.del('aggregate:' + result_set) do
|
627
|
-
@redis.del('aggregation:' + result_set) do
|
628
|
-
@logger.debug('pruned aggregation', {
|
629
|
-
:check => {
|
630
|
-
:name => check_name,
|
631
|
-
:issued => check_issued
|
632
|
-
}
|
633
|
-
})
|
634
|
-
end
|
635
|
-
end
|
636
|
-
end
|
637
|
-
end
|
638
|
-
end
|
639
|
-
end
|
640
|
-
end
|
641
|
-
end
|
642
|
-
end
|
643
|
-
|
644
|
-
def setup_aggregation_pruner
|
645
|
-
@logger.debug('pruning aggregations')
|
646
|
-
@timers[:master] << EM::PeriodicTimer.new(20) do
|
647
|
-
prune_aggregations
|
648
|
-
end
|
649
|
-
end
|
650
|
-
|
651
|
-
def master_duties
|
652
|
-
setup_publisher
|
653
|
-
setup_client_monitor
|
654
|
-
setup_aggregation_pruner
|
655
|
-
end
|
656
|
-
|
657
|
-
def request_master_election
|
658
|
-
@redis.setnx('lock:master', Time.now.to_i) do |created|
|
659
|
-
if created
|
660
|
-
@is_master = true
|
661
|
-
@logger.info('i am the master')
|
662
|
-
master_duties
|
663
|
-
else
|
664
|
-
@redis.get('lock:master') do |timestamp|
|
665
|
-
if Time.now.to_i - timestamp.to_i >= 30
|
666
|
-
@redis.getset('lock:master', Time.now.to_i) do |previous|
|
667
|
-
if previous == timestamp
|
668
|
-
@is_master = true
|
669
|
-
@logger.info('i am now the master')
|
670
|
-
master_duties
|
671
|
-
end
|
672
|
-
end
|
673
|
-
end
|
674
|
-
end
|
675
|
-
end
|
676
|
-
end
|
677
|
-
end
|
678
|
-
|
679
|
-
def setup_master_monitor
|
680
|
-
@timers[:run] << EM::Timer.new(2) do
|
681
|
-
request_master_election
|
682
|
-
end
|
683
|
-
@timers[:run] << EM::PeriodicTimer.new(10) do
|
684
|
-
if @is_master
|
685
|
-
@redis.set('lock:master', Time.now.to_i) do
|
686
|
-
@logger.debug('updated master lock timestamp')
|
687
|
-
end
|
688
|
-
else
|
689
|
-
request_master_election
|
690
|
-
end
|
691
|
-
end
|
692
|
-
end
|
693
|
-
|
694
|
-
def resign_as_master
|
695
|
-
if @is_master
|
696
|
-
@logger.warn('resigning as master')
|
697
|
-
@timers[:master].each do |timer|
|
698
|
-
timer.cancel
|
699
|
-
end
|
700
|
-
@timers[:master].clear
|
701
|
-
@is_master = false
|
702
|
-
else
|
703
|
-
@logger.debug('not currently master')
|
704
|
-
end
|
705
|
-
end
|
706
|
-
|
707
|
-
def unsubscribe
|
708
|
-
@logger.warn('unsubscribing from keepalive and result queues')
|
709
|
-
@transport.unsubscribe
|
710
|
-
end
|
711
|
-
|
712
|
-
def complete_handlers_in_progress(&block)
|
713
|
-
@logger.info('completing handlers in progress', {
|
714
|
-
:handlers_in_progress_count => @handlers_in_progress_count
|
715
|
-
})
|
716
|
-
retry_until_true do
|
717
|
-
if @handlers_in_progress_count == 0
|
718
|
-
block.call
|
719
|
-
true
|
720
|
-
end
|
721
|
-
end
|
722
|
-
end
|
723
|
-
|
724
|
-
def bootstrap
|
725
|
-
setup_keepalives
|
726
|
-
setup_results
|
727
|
-
setup_master_monitor
|
728
|
-
@state = :running
|
729
|
-
end
|
730
|
-
|
731
|
-
def start
|
732
|
-
setup_redis
|
733
|
-
setup_transport
|
734
|
-
bootstrap
|
735
|
-
end
|
736
|
-
|
737
|
-
def pause
|
738
|
-
unless @state == :pausing || @state == :paused
|
739
|
-
@state = :pausing
|
740
|
-
@timers[:run].each do |timer|
|
741
|
-
timer.cancel
|
742
|
-
end
|
743
|
-
@timers[:run].clear
|
744
|
-
unsubscribe
|
745
|
-
resign_as_master
|
746
|
-
@state = :paused
|
747
|
-
end
|
748
|
-
end
|
749
|
-
|
750
|
-
def resume
|
751
|
-
retry_until_true(1) do
|
752
|
-
if @state == :paused
|
753
|
-
if @redis.connected? && @transport.connected?
|
754
|
-
bootstrap
|
755
|
-
true
|
756
|
-
end
|
757
|
-
end
|
758
|
-
end
|
759
|
-
end
|
760
|
-
|
761
|
-
def stop
|
762
|
-
@logger.warn('stopping')
|
763
|
-
pause
|
764
|
-
@state = :stopping
|
765
|
-
complete_handlers_in_progress do
|
766
|
-
@redis.close
|
767
|
-
@transport.close
|
768
|
-
super
|
769
|
-
end
|
770
|
-
end
|
771
|
-
end
|
772
|
-
end
|