sensu 0.17.0.beta → 0.17.0.beta.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -1
- data/bin/sensu-api +4 -4
- data/bin/sensu-client +4 -4
- data/bin/sensu-server +4 -4
- data/lib/sensu/api/process.rb +704 -0
- data/lib/sensu/cli.rb +21 -15
- data/lib/sensu/client/process.rb +414 -0
- data/lib/sensu/client/socket.rb +226 -0
- data/lib/sensu/constants.rb +4 -1
- data/lib/sensu/daemon.rb +125 -73
- data/lib/sensu/redis.rb +10 -5
- data/lib/sensu/server/filter.rb +309 -0
- data/lib/sensu/server/handle.rb +168 -0
- data/lib/sensu/server/mutate.rb +92 -0
- data/lib/sensu/server/process.rb +811 -0
- data/lib/sensu/server/sandbox.rb +21 -0
- data/lib/sensu/server/socket.rb +42 -0
- data/lib/sensu/utilities.rb +29 -3
- data/sensu.gemspec +29 -28
- metadata +30 -12
- data/lib/sensu/api.rb +0 -704
- data/lib/sensu/client.rb +0 -298
- data/lib/sensu/sandbox.rb +0 -11
- data/lib/sensu/server.rb +0 -772
- data/lib/sensu/socket.rb +0 -246
data/lib/sensu/server/process.rb (new file)
@@ -0,0 +1,811 @@
require "sensu/daemon"
require "sensu/server/filter"
require "sensu/server/mutate"
require "sensu/server/handle"

module Sensu
  module Server
    class Process
      include Daemon
      include Filter
      include Mutate
      include Handle

      attr_reader :is_master, :handling_event_count

      # Create an instance of the Sensu server process, start the
      # server within the EventMachine event loop, and set up server
      # process signal traps (for stopping).
      #
      # @param options [Hash]
      def self.run(options={})
        server = self.new(options)
        EM::run do
          server.start
          server.setup_signal_traps
        end
      end

      # Override Daemon initialize() to support Sensu server master
      # election and the handling event count.
      #
      # @param options [Hash]
      def initialize(options={})
        super
        @is_master = false
        @timers[:master] = Array.new
        @handling_event_count = 0
      end

      # Update the Sensu client registry, stored in Redis. Sensu
      # client data is used to provide additional event context and
      # enable agent health monitoring. JSON serialization is used for
      # the client data.
      #
      # @param client [Hash]
      # @param callback [Proc] to call after the client data has
      #   been added to (or updated in) the registry.
      def update_client_registry(client, &callback)
        @logger.debug("updating client registry", :client => client)
        @redis.set("client:#{client[:name]}", MultiJson.dump(client)) do
          @redis.sadd("clients", client[:name]) do
            callback.call
          end
        end
      end
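
For context, the registry writes above boil down to two Redis keys per client: a JSON document under `client:<name>` and a membership entry in the `clients` set. The following standalone sketch (illustrative only, not part of this diff; it uses the plain synchronous `redis` gem rather than Sensu's EventMachine Redis client, and the client data is invented) mirrors those writes:

require "redis"
require "multi_json"

redis = Redis.new
client = {:name => "i-424242", :address => "10.0.0.1", :timestamp => Time.now.to_i}

redis.set("client:#{client[:name]}", MultiJson.dump(client)) # JSON client document
redis.sadd("clients", client[:name])                          # registry index set

p redis.smembers("clients")      # e.g. ["i-424242"]
p redis.get("client:i-424242")   # the stored JSON string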

      # Set up the client keepalive consumer, keeping the Sensu client
      # registry updated. The consumer receives JSON serialized client
      # keepalives from the transport, parses them, and calls
      # `update_client_registry()` with the client data to update the
      # registry. Transport message acknowledgements are used to
      # ensure the client registry is updated successfully. Keepalive
      # JSON parsing errors are logged.
      def setup_keepalives
        @logger.debug("subscribing to keepalives")
        @transport.subscribe(:direct, "keepalives", "keepalives", :ack => true) do |message_info, message|
          @logger.debug("received keepalive", :message => message)
          begin
            client = MultiJson.load(message)
            update_client_registry(client) do
              @transport.ack(message_info)
            end
          rescue MultiJson::ParseError => error
            @logger.error("failed to parse keepalive payload", {
              :message => message,
              :error => error.to_s
            })
            @transport.ack(message_info)
          end
        end
      end

      # Expand event handler sets, creating an array of handler
      # definitions. Handler sets cannot be deeply nested (by choice);
      # this method will return `nil` if an attempt is made to deeply
      # nest. If the provided handler definition is not a set, it is
      # returned.
      #
      # @param handler [Hash] definition.
      # @param depth [Integer] of the expansion.
      # @return [Array, Hash, Nil]
      def expand_handler_sets(handler, depth=0)
        if handler[:type] == "set"
          if depth < 2
            derive_handlers(handler[:handlers], depth + 1)
          else
            @logger.error("handler sets cannot be deeply nested", :handler => handler)
            nil
          end
        else
          handler
        end
      end

      # Derive an array of handler definitions from a list of handler
      # names. This method first checks for the existence of standard
      # handlers, followed by handler extensions. If a handler does
      # not exist for a name, it is logged and ignored. Duplicate
      # handler definitions are removed.
      #
      # @param handler_list [Array]
      # @param depth [Integer] of handler set expansion.
      # @return [Array]
      def derive_handlers(handler_list, depth=0)
        handler_list.compact.map { |handler_name|
          case
          when @settings.handler_exists?(handler_name)
            handler = @settings[:handlers][handler_name].merge(:name => handler_name)
            expand_handler_sets(handler, depth)
          when @extensions.handler_exists?(handler_name)
            @extensions[:handlers][handler_name]
          else
            @logger.error("unknown handler", :handler_name => handler_name)
            nil
          end
        }.flatten.compact.uniq
      end
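
To illustrate the set expansion above, here is a simplified, standalone re-implementation that works on a plain Hash of handler definitions instead of Sensu's settings and extensions objects (handler names and commands are invented; logging and the extension lookup are omitted):

HANDLERS = {
  "default"   => {:type => "set", :handlers => ["email", "pagerduty"]},
  "email"     => {:type => "pipe", :command => "email-handler.rb"},
  "pagerduty" => {:type => "pipe", :command => "pagerduty-handler.rb"}
}

def derive(handler_list, depth=0)
  handler_list.compact.map { |name|
    handler = HANDLERS[name]
    next nil unless handler
    handler = handler.merge(:name => name)
    if handler[:type] == "set"
      depth < 2 ? derive(handler[:handlers], depth + 1) : nil
    else
      handler
    end
  }.flatten.compact.uniq
end

p derive(["default", "email"])
# => the two pipe handler definitions, expanded from the set and de-duplicated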

      # Run event bridge extensions, within the Sensu EventMachine
      # reactor (event loop). The extension API `safe_run()` method is
      # used to guard against most errors. Bridges are for relaying
      # Sensu event data to other services.
      #
      # @param event [Hash]
      def event_bridges(event)
        @extensions[:bridges].each do |name, bridge|
          bridge.safe_run(event) do |output, status|
            @logger.debug("bridge extension output", {
              :extension => bridge.definition,
              :output => output
            })
          end
        end
      end

      # Process an event: filter -> mutate -> handle.
      #
      # This method runs event bridges, relaying the event data to
      # other services. This method also determines the appropriate
      # handlers for the event, filtering and mutating the event data
      # for each of them. The `@handling_event_count` is incremented
      # by `1`, for each event handler chain (filter -> mutate ->
      # handle).
      #
      # @param event [Hash]
      def process_event(event)
        log_level = event[:check][:type] == "metric" ? :debug : :info
        @logger.send(log_level, "processing event", :event => event)
        event_bridges(event)
        handler_list = Array((event[:check][:handlers] || event[:check][:handler]) || "default")
        handlers = derive_handlers(handler_list)
        handlers.each do |handler|
          @handling_event_count += 1
          filter_event(handler, event) do |event|
            mutate_event(handler, event) do |event_data|
              handle_event(handler, event_data)
            end
          end
        end
      end

      # Add a check result to an aggregate. A check aggregate uses the
      # check `:name` and the `:issued` timestamp as its unique
      # identifier. An aggregate uses several counters: the total
      # number of results in the aggregate, and a counter for each
      # check severity (ok, warning, etc). Check output is also
      # stored, to be summarized to aid in identifying outliers for a
      # check execution across a number of Sensu clients. JSON
      # serialization is used for storing check result data.
      #
      # @param result [Hash]
      def aggregate_check_result(result)
        @logger.debug("adding check result to aggregate", :result => result)
        check = result[:check]
        result_set = "#{check[:name]}:#{check[:issued]}"
        result_data = MultiJson.dump(:output => check[:output], :status => check[:status])
        @redis.hset("aggregation:#{result_set}", result[:client], result_data) do
          SEVERITIES.each do |severity|
            @redis.hsetnx("aggregate:#{result_set}", severity, 0)
          end
          severity = (SEVERITIES[check[:status]] || "unknown")
          @redis.hincrby("aggregate:#{result_set}", severity, 1) do
            @redis.hincrby("aggregate:#{result_set}", "total", 1) do
              @redis.sadd("aggregates:#{check[:name]}", check[:issued]) do
                @redis.sadd("aggregates", check[:name])
              end
            end
          end
        end
      end
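
For a concrete picture of the aggregate storage above, this standalone sketch (illustrative only; plain synchronous `redis` gem, invented check name, client, and issued timestamp) performs the same writes for a single OK result and then dumps the counter hash:

require "redis"
require "multi_json"

redis = Redis.new
severities = %w[ok warning critical unknown]
result_set = "check_http:1422306000"   # "<check name>:<issued timestamp>"

redis.hset("aggregation:#{result_set}", "i-424242",
           MultiJson.dump(:output => "HTTP OK", :status => 0))
severities.each { |severity| redis.hsetnx("aggregate:#{result_set}", severity, 0) }
redis.hincrby("aggregate:#{result_set}", "ok", 1)
redis.hincrby("aggregate:#{result_set}", "total", 1)
redis.sadd("aggregates:check_http", 1422306000)
redis.sadd("aggregates", "check_http")

p redis.hgetall("aggregate:#{result_set}")
# e.g. {"ok"=>"1", "warning"=>"0", "critical"=>"0", "unknown"=>"0", "total"=>"1"}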

      # Store check result data. This method stores the 21 most recent
      # check result statuses for a client/check pair; this history
      # is used for event context and flap detection. The check
      # execution timestamp is also stored, to provide an indication
      # of how recent the data is.
      #
      # @param client [Hash]
      # @param check [Hash]
      # @param callback [Proc] to call when the check result data has
      #   been stored (history, etc).
      def store_check_result(client, check, &callback)
        @redis.sadd("history:#{client[:name]}", check[:name])
        result_key = "#{client[:name]}:#{check[:name]}"
        history_key = "history:#{result_key}"
        @redis.rpush(history_key, check[:status]) do
          @redis.set("execution:#{result_key}", check[:executed])
          @redis.ltrim(history_key, -21, -1)
          callback.call
        end
      end

      # Fetch the execution history for a client/check pair, the 21
      # most recent check result statuses. This method also calculates
      # the total state change percentage for the history; this value
      # is used for check state flap detection, using a similar
      # algorithm to Nagios:
      # http://nagios.sourceforge.net/docs/3_0/flapping.html
      #
      # @param client [Hash]
      # @param check [Hash]
      # @param callback [Proc] to be called with the check history and
      #   total state change value.
      def check_history(client, check, &callback)
        history_key = "history:#{client[:name]}:#{check[:name]}"
        @redis.lrange(history_key, -21, -1) do |history|
          total_state_change = 0
          unless history.size < 21
            state_changes = 0
            change_weight = 0.8
            previous_status = history.first
            history.each do |status|
              unless status == previous_status
                state_changes += change_weight
              end
              change_weight += 0.02
              previous_status = status
            end
            total_state_change = (state_changes.fdiv(20) * 100).to_i
          end
          callback.call(history, total_state_change)
        end
      end
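
The weighting loop above is easier to see in isolation. A standalone sketch (not part of the gem; the statuses are invented, and the histories are plain arrays of status strings, as Redis returns them):

def total_state_change(history)
  return 0 if history.size < 21
  state_changes = 0
  change_weight = 0.8
  previous_status = history.first
  history.each do |status|
    state_changes += change_weight unless status == previous_status
    change_weight += 0.02
    previous_status = status
  end
  (state_changes.fdiv(20) * 100).to_i
end

steady   = %w[0] * 21                # no state changes at all
recovery = %w[2] * 10 + %w[0] * 11   # a single change, mid-history
flapping = (%w[0 2] * 10) << "0"     # a change on every result

puts total_state_change(steady)      # 0
puts total_state_change(recovery)    # a small value: one mid-weight change
puts total_state_change(flapping)    # close to 100: still oscillating

Later changes carry more weight (0.8 for the oldest slot up to roughly 1.2 for the newest), so a check that has recently settled down scores lower than one that is still oscillating.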

      # Determine if a check state is flapping, rapidly changing
      # between an OK and non-OK state. Flap detection is only done
      # for checks that have defined low and high flap detection
      # thresholds, `:low_flap_threshold` and `:high_flap_threshold`.
      # The `check_history()` method provides the check history and
      # more importantly the total state change percentage value that
      # is compared with the configured thresholds defined in the
      # check data. If a check hasn't been flapping, the
      # `:total_state_change` must be equal to or higher than the
      # `:high_flap_threshold` to be changed to flapping. If a check
      # has been flapping, the `:total_state_change` must be equal to
      # or lower than the `:low_flap_threshold` to no longer be
      # flapping. This method uses the same algorithm as Nagios:
      # http://nagios.sourceforge.net/docs/3_0/flapping.html
      #
      # @param stored_event [Hash]
      # @param check [Hash]
      # @return [TrueClass, FalseClass]
      def check_flapping?(stored_event, check)
        if check.has_key?(:low_flap_threshold) && check.has_key?(:high_flap_threshold)
          was_flapping = stored_event && stored_event[:action] == "flapping"
          check[:total_state_change] >= check[:high_flap_threshold] ||
            (was_flapping && check[:total_state_change] <= check[:low_flap_threshold]) ||
            was_flapping
        else
          false
        end
      end

      # Update the event registry, stored in Redis. This method
      # determines if check data results in the creation or update of
      # event data in the registry. Existing event data for a
      # client/check pair is fetched, used in conditionals and the
      # composition of the new event data. If a check `:status` is not
      # `0`, or it has been flapping, an event is created/updated in
      # the registry. If there was existing event data, but the check
      # `:status` is now `0`, the event is removed (resolved) from the
      # registry. If the previous conditions are not met, and check
      # `:type` is `metric` and the `:status` is `0`, the event
      # registry is not updated, but the provided callback is called
      # with the event data. JSON serialization is used when storing
      # data in the registry.
      #
      # @param client [Hash]
      # @param check [Hash]
      # @param callback [Proc] to be called with the resulting event
      #   data if the event registry is updated, or the check is of
      #   type `:metric`.
      def update_event_registry(client, check, &callback)
        @redis.hget("events:#{client[:name]}", check[:name]) do |event_json|
          stored_event = event_json ? MultiJson.load(event_json) : nil
          flapping = check_flapping?(stored_event, check)
          event = {
            :id => random_uuid,
            :client => client,
            :check => check,
            :occurrences => 1
          }
          if check[:status] != 0 || flapping
            if stored_event && check[:status] == stored_event[:check][:status]
              event[:occurrences] = stored_event[:occurrences] + 1
            end
            event[:action] = flapping ? :flapping : :create
            @redis.hset("events:#{client[:name]}", check[:name], MultiJson.dump(event)) do
              callback.call(event)
            end
          elsif stored_event
            event[:occurrences] = stored_event[:occurrences]
            event[:action] = :resolve
            unless check[:auto_resolve] == false && !check[:force_resolve]
              @redis.hdel("events:#{client[:name]}", check[:name]) do
                callback.call(event)
              end
            end
          elsif check[:type] == "metric"
            callback.call(event)
          end
        end
      end
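
For reference, the event data assembled above has roughly this shape once persisted (all values here are invented, and the check portion also carries whatever fields process_check_result merged in, such as :history and :total_state_change):

event = {
  :id          => "2d4bba2c-9c6c-4e3d-8b62-1f1a6aa0c1de",  # random_uuid
  :client      => {:name => "i-424242", :address => "10.0.0.1"},
  :check       => {:name => "check_http", :status => 2, :output => "HTTP CRITICAL",
                   :history => %w[0 0 2], :total_state_change => 9},
  :occurrences => 3,          # grows while the status stays the same
  :action      => :create     # or :flapping / :resolve
}
p event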

      # Process a check result, storing its data, inspecting its
      # contents, and taking the appropriate actions (eg. update the
      # event registry). A check result must have a valid client name,
      # associated with a client in the registry. Results without a
      # valid client are discarded, to keep the system "correct". If a
      # local check definition exists for the check name, and the
      # check result is not from a standalone check execution, it's
      # merged with the check result for more context.
      #
      # @param result [Hash] data.
      def process_check_result(result)
        @logger.debug("processing result", :result => result)
        @redis.get("client:#{result[:client]}") do |client_json|
          unless client_json.nil?
            client = MultiJson.load(client_json)
            check = case
            when @settings.check_exists?(result[:check][:name]) && !result[:check][:standalone]
              @settings[:checks][result[:check][:name]].merge(result[:check])
            else
              result[:check]
            end
            aggregate_check_result(result) if check[:aggregate]
            store_check_result(client, check) do
              check_history(client, check) do |history, total_state_change|
                check[:history] = history
                check[:total_state_change] = total_state_change
                update_event_registry(client, check) do |event|
                  process_event(event)
                end
              end
            end
          else
            @logger.warn("client not in registry", :client => result[:client])
          end
        end
      end

      # Set up the check result consumer. The consumer receives JSON
      # serialized check results from the transport, parses them, and
      # calls `process_check_result()` with the result data to be
      # processed. Transport message acknowledgements are used to
      # ensure that results make it to processing. The transport
      # message acknowledgements are currently done in the next tick
      # of the EventMachine reactor (event loop), as a flow control
      # mechanism. Result JSON parsing errors are logged.
      def setup_results
        @logger.debug("subscribing to results")
        @transport.subscribe(:direct, "results", "results", :ack => true) do |message_info, message|
          begin
            result = MultiJson.load(message)
            @logger.debug("received result", :result => result)
            process_check_result(result)
          rescue MultiJson::ParseError => error
            @logger.error("failed to parse result payload", {
              :message => message,
              :error => error.to_s
            })
          end
          EM::next_tick do
            @transport.ack(message_info)
          end
        end
      end

      # Publish a check request to the transport. A check request is
      # composed of a check `:name`, an `:issued` timestamp, and a
      # check `:command` if available. The check request is published
      # to a transport pipe, for each of the check `:subscribers` in
      # its definition, eg. "webserver". JSON serialization is used
      # when publishing the check request payload to the transport
      # pipes. Transport errors are logged.
      #
      # @param check [Hash] definition.
      def publish_check_request(check)
        payload = {
          :name => check[:name],
          :issued => Time.now.to_i
        }
        payload[:command] = check[:command] if check.has_key?(:command)
        @logger.info("publishing check request", {
          :payload => payload,
          :subscribers => check[:subscribers]
        })
        check[:subscribers].each do |subscription|
          @transport.publish(:fanout, subscription, MultiJson.dump(payload)) do |info|
            if info[:error]
              @logger.error("failed to publish check request", {
                :subscription => subscription,
                :payload => payload,
                :error => info[:error].to_s
              })
            end
          end
        end
      end
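
The request payload itself is small. A standalone sketch of what lands on each subscription's fanout pipe (the check definition and timestamp are invented):

require "multi_json"

check = {
  :name        => "check_http",
  :command     => "check-http.rb -u http://localhost",
  :subscribers => ["webserver"],
  :interval    => 60
}

payload = {:name => check[:name], :issued => Time.now.to_i}
payload[:command] = check[:command] if check.has_key?(:command)

puts MultiJson.dump(payload)
# e.g. {"name":"check_http","issued":1422306000,"command":"check-http.rb -u http://localhost"}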

      # Calculate a check execution splay, taking into account the
      # current time and the execution interval to ensure it's
      # consistent between process restarts.
      #
      # @param check [Hash] definition.
      def calculate_check_execution_splay(check)
        splay_hash = Digest::MD5.digest(check[:name]).unpack('Q<').first
        current_time = (Time.now.to_f * 1000).to_i
        (splay_hash - current_time) % (check[:interval] * 1000) / 1000.0
      end
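
The point of the splay is that the delay is a pure function of the check name, the interval, and the wall clock, so a restarted server falls back onto the same absolute schedule. A standalone illustration (not part of the gem; the check name and interval are invented):

require "digest"

def splay(name, interval, now_ms)
  splay_hash = Digest::MD5.digest(name).unpack('Q<').first
  (splay_hash - now_ms) % (interval * 1000) / 1000.0
end

now = (Time.now.to_f * 1000).to_i
before_restart = splay("check_http", 60, now)
after_restart  = splay("check_http", 60, now + 12_345)   # recomputed 12.345s later

slot1 = (now + before_restart * 1000).round % 60_000
slot2 = (now + 12_345 + after_restart * 1000).round % 60_000
puts slot1 == slot2   # => true: both delays target the same point in the 60s cycle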

      # Schedule check executions, using EventMachine periodic timers
      # with a calculated execution splay. The timers are stored in
      # the timers hash under `:master`, as check request publishing
      # is a task for only the Sensu server master, so they can be
      # cancelled etc. Check requests are not published if subdued.
      #
      # @param checks [Array] of definitions.
      def schedule_check_executions(checks)
        checks.each do |check|
          create_check_request = Proc.new do
            unless check_request_subdued?(check)
              publish_check_request(check)
            else
              @logger.info("check request was subdued", :check => check)
            end
          end
          execution_splay = testing? ? 0 : calculate_check_execution_splay(check)
          interval = testing? ? 0.5 : check[:interval]
          @timers[:master] << EM::Timer.new(execution_splay) do
            create_check_request.call
            @timers[:master] << EM::PeriodicTimer.new(interval, &create_check_request)
          end
        end
      end

      # Set up the check request publisher. This method creates an
      # array of check definitions that are not standalone checks
      # and do not have `:publish` set to `false`. The array of check
      # definitions includes those from standard checks and extensions
      # (with a defined execution `:interval`). The array is provided
      # to the `schedule_check_executions()` method.
      def setup_check_request_publisher
        @logger.debug("scheduling check requests")
        standard_checks = @settings.checks.reject do |check|
          check[:standalone] || check[:publish] == false
        end
        extension_checks = @extensions.checks.reject do |check|
          check[:standalone] || check[:publish] == false || !check[:interval].is_a?(Integer)
        end
        schedule_check_executions(standard_checks + extension_checks)
      end

      # Publish a check result to the transport for processing. A
      # check result is composed of a client name and a check
      # definition, containing check `:output` and `:status`. JSON
      # serialization is used when publishing the check result payload
      # to the transport pipe. Transport errors are logged.
      #
      # @param client [Hash]
      # @param check [Hash]
      def publish_check_result(client, check)
        payload = {
          :client => client[:name],
          :check => check
        }
        @logger.debug("publishing check result", :payload => payload)
        @transport.publish(:direct, "results", MultiJson.dump(payload)) do |info|
          if info[:error]
            @logger.error("failed to publish check result", {
              :payload => payload,
              :error => info[:error].to_s
            })
          end
        end
      end

      # Create a keepalive check definition for a client. Client
      # definitions may contain `:keepalive` configuration, containing
      # specific thresholds and handler information. The keepalive
      # check definition creation begins with default thresholds, and
      # sets the `:handler` to `keepalive`, if the handler has a local
      # definition. If the client provides its own `:keepalive`
      # configuration, it's deep merged with the defaults. The check
      # `:name`, `:issued`, and `:executed` values are always
      # overridden to guard against an invalid definition.
      def create_keepalive_check(client)
        check = {
          :thresholds => {
            :warning => 120,
            :critical => 180
          }
        }
        if @settings.handler_exists?(:keepalive)
          check[:handler] = "keepalive"
        end
        if client.has_key?(:keepalive)
          check = deep_merge(check, client[:keepalive])
        end
        timestamp = Time.now.to_i
        check.merge(:name => "keepalive", :issued => timestamp, :executed => timestamp)
      end
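
The deep merge above comes from Sensu's utilities module; a simplified stand-in (illustrative only, with an invented client configuration) shows the effect on the default thresholds:

def deep_merge(hash_one, hash_two)
  hash_one.merge(hash_two) do |key, old_value, new_value|
    old_value.is_a?(Hash) && new_value.is_a?(Hash) ? deep_merge(old_value, new_value) : new_value
  end
end

defaults = {:thresholds => {:warning => 120, :critical => 180}, :handler => "keepalive"}
client_keepalive = {:thresholds => {:warning => 40}, :handler => "pagerduty"}

p deep_merge(defaults, client_keepalive)
# => {:thresholds=>{:warning=>40, :critical=>180}, :handler=>"pagerduty"}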

      # Determine stale clients, those that have not sent a keepalive
      # in a specified amount of time (thresholds). This method
      # iterates through the client registry, creating a keepalive
      # check definition with the `create_keepalive_check()` method,
      # containing client specific staleness thresholds. If the time
      # since the latest keepalive is equal to or greater than a
      # threshold, the check `:output` is set to a descriptive
      # message, and `:status` is set to the appropriate non-zero
      # value. If a client has been sending keepalives, `:output` and
      # `:status` are set to indicate an OK state. A check result is
      # published for every client in the registry.
      def determine_stale_clients
        @logger.info("determining stale clients")
        @redis.smembers("clients") do |clients|
          clients.each do |client_name|
            @redis.get("client:#{client_name}") do |client_json|
              unless client_json.nil?
                client = MultiJson.load(client_json)
                check = create_keepalive_check(client)
                time_since_last_keepalive = Time.now.to_i - client[:timestamp]
                check[:output] = "No keepalive sent from client for "
                check[:output] << "#{time_since_last_keepalive} seconds"
                case
                when time_since_last_keepalive >= check[:thresholds][:critical]
                  check[:output] << " (>=#{check[:thresholds][:critical]})"
                  check[:status] = 2
                when time_since_last_keepalive >= check[:thresholds][:warning]
                  check[:output] << " (>=#{check[:thresholds][:warning]})"
                  check[:status] = 1
                else
                  check[:output] = "Keepalive sent from client "
                  check[:output] << "#{time_since_last_keepalive} seconds ago"
                  check[:status] = 0
                end
                publish_check_result(client, check)
              end
            end
          end
        end
      end

      # Set up the client monitor, a periodic timer to run
      # `determine_stale_clients()` every 30 seconds. The timer is
      # stored in the timers hash under `:master`.
      def setup_client_monitor
        @logger.debug("monitoring client keepalives")
        @timers[:master] << EM::PeriodicTimer.new(30) do
          determine_stale_clients
        end
      end

      # Prune check result aggregations (aggregates). Sensu only
      # stores the 20 latest aggregations for a check, to keep the
      # amount of data stored to a minimum.
      def prune_check_result_aggregations
        @logger.info("pruning check result aggregations")
        @redis.smembers("aggregates") do |checks|
          checks.each do |check_name|
            @redis.smembers("aggregates:#{check_name}") do |aggregates|
              if aggregates.size > 20
                aggregates.sort!
                aggregates.take(aggregates.size - 20).each do |check_issued|
                  @redis.srem("aggregates:#{check_name}", check_issued) do
                    result_set = "#{check_name}:#{check_issued}"
                    @redis.del("aggregate:#{result_set}") do
                      @redis.del("aggregation:#{result_set}") do
                        @logger.debug("pruned aggregation", {
                          :check => {
                            :name => check_name,
                            :issued => check_issued
                          }
                        })
                      end
                    end
                  end
                end
              end
            end
          end
        end
      end

      # Set up the check result aggregation pruner, using a periodic
      # timer to run `prune_check_result_aggregations()` every 20
      # seconds. The timer is stored in the timers hash under
      # `:master`.
      def setup_check_result_aggregation_pruner
        @logger.debug("pruning check result aggregations")
        @timers[:master] << EM::PeriodicTimer.new(20) do
          prune_check_result_aggregations
        end
      end

      # Set up the master duties, tasks only performed by a single
      # Sensu server at a time. The duties include publishing check
      # requests, monitoring for stale clients, and pruning check
      # result aggregations.
      def master_duties
        setup_check_request_publisher
        setup_client_monitor
        setup_check_result_aggregation_pruner
      end

      # Request a master election, a process to determine if the
      # current process is the master Sensu server, with its
      # own/unique duties. A Redis key/value is used as a central
      # lock, using the "SETNX" Redis command to set the key/value if
      # it does not exist, using a timestamp for the value. If the
      # current process was able to create the key/value, it is the
      # master, and must do the duties of the master. If the current
      # process was not able to create the key/value, but the current
      # timestamp value is equal to or over 30 seconds ago, the
      # "GETSET" Redis command is used to set a new timestamp and
      # fetch the previous value to compare them, to determine if it
      # was set by the current process. If the current process is able
      # to set the timestamp value, it becomes the master. The master
      # has `@is_master` set to `true`.
      def request_master_election
        @redis.setnx("lock:master", Time.now.to_i) do |created|
          if created
            @is_master = true
            @logger.info("i am the master")
            master_duties
          else
            @redis.get("lock:master") do |timestamp|
              if Time.now.to_i - timestamp.to_i >= 30
                @redis.getset("lock:master", Time.now.to_i) do |previous|
                  if previous == timestamp
                    @is_master = true
                    @logger.info("i am now the master")
                    master_duties
                  end
                end
              end
            end
          end
        end
      end
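
The election is the classic SETNX-plus-GETSET lock pattern. A standalone, synchronous sketch (plain `redis` gem, outside the EventMachine reactor; illustrative only):

require "redis"

def try_to_become_master(redis)
  now = Time.now.to_i
  return true if redis.setnx("lock:master", now)   # lock was free: this process owns it
  timestamp = redis.get("lock:master").to_i
  if now - timestamp >= 30                          # lock looks stale (no refresh for 30s)
    previous = redis.getset("lock:master", now).to_i
    return previous == timestamp                    # only one of the racing processes wins
  end
  false
end

redis = Redis.new
puts(try_to_become_master(redis) ? "i am the master" : "standing by")

The 10-second refresh in setup_master_monitor below is what keeps the winner's timestamp fresh, so the 30-second staleness test only passes when the master has actually gone away.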

      # Set up the master monitor. A one-time timer is used to run
      # `request_master_election()` in 2 seconds. A periodic timer is
      # used to update the master lock timestamp if the current
      # process is the master, or to run `request_master_election()`,
      # every 10 seconds. The timers are stored in the timers hash
      # under `:run`.
      def setup_master_monitor
        @timers[:run] << EM::Timer.new(2) do
          request_master_election
        end
        @timers[:run] << EM::PeriodicTimer.new(10) do
          if @is_master
            @redis.set("lock:master", Time.now.to_i) do
              @logger.debug("updated master lock timestamp")
            end
          else
            request_master_election
          end
        end
      end

      # Resign as master, if the current process is the Sensu server
      # master. This method cancels and clears the master timers,
      # those with references stored in the timers hash under
      # `:master`, and `@is_master` is set to `false`.
      def resign_as_master
        if @is_master
          @logger.warn("resigning as master")
          @timers[:master].each do |timer|
            timer.cancel
          end
          @timers[:master].clear
          @is_master = false
        else
          @logger.debug("not currently master")
        end
      end

      # Unsubscribe from transport subscriptions (all of them). This
      # method is called when there are issues with connectivity, or
      # the process is stopping.
      def unsubscribe
        @logger.warn("unsubscribing from keepalive and result queues")
        @transport.unsubscribe
      end

      # Complete event handling currently in progress. The
      # `:handling_event_count` is used to determine if event handling
      # is complete, when it is equal to `0`. The provided callback is
      # called when handling is complete.
      #
      # @param callback [Proc] to call when event handling is
      #   complete.
      def complete_event_handling(&callback)
        @logger.info("completing event handling in progress", {
          :handling_event_count => @handling_event_count
        })
        retry_until_true do
          if @handling_event_count == 0
            callback.call
            true
          end
        end
      end

      # Bootstrap the Sensu server process, setting up the keepalive
      # and check result consumers, and attempting to become the master
      # to carry out its duties. This method sets the process/daemon
      # `@state` to `:running`.
      def bootstrap
        setup_keepalives
        setup_results
        setup_master_monitor
        @state = :running
      end

      # Start the Sensu server process, connecting to Redis, the
      # transport, and calling the `bootstrap()` method.
      def start
        setup_redis
        setup_transport
        bootstrap
      end

      # Pause the Sensu server process, unless it is being paused or
      # has already been paused. The process/daemon `@state` is first
      # set to `:pausing`, to indicate that it's in progress. All run
      # timers are cancelled, and the references are cleared. The
      # Sensu server will unsubscribe from all transport
      # subscriptions, resign as master (if currently the master),
      # then set the process/daemon `@state` to `:paused`.
      def pause
        unless @state == :pausing || @state == :paused
          @state = :pausing
          @timers[:run].each do |timer|
            timer.cancel
          end
          @timers[:run].clear
          unsubscribe
          resign_as_master
          @state = :paused
        end
      end

      # Resume the Sensu server process if it is currently or will
      # soon be paused. The `retry_until_true` helper method is used
      # to determine if the process is paused and if the Redis and
      # transport connections are connected. If the conditions are
      # met, `bootstrap()` will be called and true is returned to stop
      # `retry_until_true`.
      def resume
        retry_until_true(1) do
          if @state == :paused
            if @redis.connected? && @transport.connected?
              bootstrap
              true
            end
          end
        end
      end

      # Stop the Sensu server process, pausing it, completing event
      # handling in progress, closing the Redis and transport
      # connections, and exiting the process (exit 0). After pausing
      # the process, the process/daemon `@state` is set to
      # `:stopping`.
      def stop
        @logger.warn("stopping")
        pause
        @state = :stopping
        complete_event_handling do
          @redis.close
          @transport.close
          super
        end
      end
    end
  end
end