sensu 0.16.0-java → 0.17.0.beta.1-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +21 -0
- data/bin/sensu-api +4 -4
- data/bin/sensu-client +4 -4
- data/bin/sensu-server +4 -4
- data/lib/sensu/api/process.rb +704 -0
- data/lib/sensu/cli.rb +21 -15
- data/lib/sensu/client/process.rb +414 -0
- data/lib/sensu/client/socket.rb +226 -0
- data/lib/sensu/constants.rb +4 -1
- data/lib/sensu/daemon.rb +125 -73
- data/lib/sensu/redis.rb +10 -5
- data/lib/sensu/server/filter.rb +309 -0
- data/lib/sensu/server/handle.rb +168 -0
- data/lib/sensu/server/mutate.rb +92 -0
- data/lib/sensu/server/process.rb +811 -0
- data/lib/sensu/server/sandbox.rb +21 -0
- data/lib/sensu/server/socket.rb +42 -0
- data/lib/sensu/utilities.rb +29 -3
- data/sensu.gemspec +29 -28
- metadata +34 -16
- data/lib/sensu/api.rb +0 -704
- data/lib/sensu/client.rb +0 -292
- data/lib/sensu/sandbox.rb +0 -11
- data/lib/sensu/server.rb +0 -767
- data/lib/sensu/socket.rb +0 -246
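The file listing above shows the main structural change in this release: the monolithic `sensu/server.rb`, `sensu/client.rb`, `sensu/api.rb`, and `sensu/socket.rb` are removed and replaced by `sensu/server/*`, `sensu/client/*`, and `sensu/api/*` modules. The hunk below is the new `data/lib/sensu/server/process.rb`. As an illustrative sketch only (not part of this diff), downstream code that previously required the single-file server would now load the new process class and start it via the `run` class method shown below; the `Sensu::CLI.read` call is an assumption based on how the bundled `bin/sensu-server` script is usually wired up.

```ruby
# Illustrative sketch only -- not part of this package diff.
# 0.16.0 loaded the server from a single file:
#   require "sensu/server"
# 0.17.0.beta.1 splits it into process/filter/mutate/handle modules;
# requiring the process file pulls in the rest (see the hunk below).
require "sensu/server/process"
require "sensu/cli" # assumed: option parsing as used by bin/sensu-server

options = Sensu::CLI.read           # parse command line options into a Hash
Sensu::Server::Process.run(options) # starts the EventMachine reactor and the server
```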
@@ -0,0 +1,811 @@
+require "sensu/daemon"
+require "sensu/server/filter"
+require "sensu/server/mutate"
+require "sensu/server/handle"
+
+module Sensu
+  module Server
+    class Process
+      include Daemon
+      include Filter
+      include Mutate
+      include Handle
+
+      attr_reader :is_master, :handling_event_count
+
+      # Create an instance of the Sensu server process, start the
+      # server within the EventMachine event loop, and set up server
+      # process signal traps (for stopping).
+      #
+      # @param options [Hash]
+      def self.run(options={})
+        server = self.new(options)
+        EM::run do
+          server.start
+          server.setup_signal_traps
+        end
+      end
+
+      # Override Daemon initialize() to support Sensu server master
+      # election and the handling event count.
+      #
+      # @param options [Hash]
+      def initialize(options={})
+        super
+        @is_master = false
+        @timers[:master] = Array.new
+        @handling_event_count = 0
+      end
+
+      # Update the Sensu client registry, stored in Redis. Sensu
+      # client data is used to provide additional event context and
+      # enable agent health monitoring. JSON serialization is used for
+      # the client data.
+      #
+      # @param client [Hash]
+      # @param callback [Proc] to call after the client data has
+      #   been added to (or updated in) the registry.
+      def update_client_registry(client, &callback)
+        @logger.debug("updating client registry", :client => client)
+        @redis.set("client:#{client[:name]}", MultiJson.dump(client)) do
+          @redis.sadd("clients", client[:name]) do
+            callback.call
+          end
+        end
+      end
+
+      # Set up the client keepalive consumer, keeping the Sensu client
+      # registry updated. The consumer receives JSON serialized client
+      # keepalives from the transport, parses them, and calls
+      # `update_client_registry()` with the client data to update the
+      # registry. Transport message acknowledgements are used to
+      # ensure the client registry is updated successfully. Keepalive
+      # JSON parsing errors are logged.
+      def setup_keepalives
+        @logger.debug("subscribing to keepalives")
+        @transport.subscribe(:direct, "keepalives", "keepalives", :ack => true) do |message_info, message|
+          @logger.debug("received keepalive", :message => message)
+          begin
+            client = MultiJson.load(message)
+            update_client_registry(client) do
+              @transport.ack(message_info)
+            end
+          rescue MultiJson::ParseError => error
+            @logger.error("failed to parse keepalive payload", {
+              :message => message,
+              :error => error.to_s
+            })
+            @transport.ack(message_info)
+          end
+        end
+      end
+
+      # Expand event handler sets, creating an array of handler
+      # definitions. Handler sets cannot be deeply nested (by choice),
+      # this method will return `nil` if an attempt is made to deeply
+      # nest. If the provided handler definition is not a set, it is
+      # returned.
+      #
+      # @param handler [Hash] definition.
+      # @param depth [Integer] of the expansion.
+      # @return [Array, Hash, Nil]
+      def expand_handler_sets(handler, depth=0)
+        if handler[:type] == "set"
+          if depth < 2
+            derive_handlers(handler[:handlers], depth + 1)
+          else
+            @logger.error("handler sets cannot be deeply nested", :handler => handler)
+            nil
+          end
+        else
+          handler
+        end
+      end
+
+      # Derive an array of handler definitions from a list of handler
+      # names. This method first checks for the existence of standard
+      # handlers, followed by handler extensions. If a handler does
+      # not exist for a name, it is logged and ignored. Duplicate
+      # handler definitions are removed.
+      #
+      # @param handler_list [Array]
+      # @param depth [Integer] of handler set expansion.
+      # @return [Array]
+      def derive_handlers(handler_list, depth=0)
+        handler_list.compact.map { |handler_name|
+          case
+          when @settings.handler_exists?(handler_name)
+            handler = @settings[:handlers][handler_name].merge(:name => handler_name)
+            expand_handler_sets(handler, depth)
+          when @extensions.handler_exists?(handler_name)
+            @extensions[:handlers][handler_name]
+          else
+            @logger.error("unknown handler", :handler_name => handler_name)
+            nil
+          end
+        }.flatten.compact.uniq
+      end
+
+      # Run event bridge extensions, within the Sensu EventMachine
+      # reactor (event loop). The extension API `safe_run()` method is
+      # used to guard against most errors. Bridges are for relaying
+      # Sensu event data to other services.
+      #
+      # @param event [Hash]
+      def event_bridges(event)
+        @extensions[:bridges].each do |name, bridge|
+          bridge.safe_run(event) do |output, status|
+            @logger.debug("bridge extension output", {
+              :extension => bridge.definition,
+              :output => output
+            })
+          end
+        end
+      end
+
+      # Process an event: filter -> mutate -> handle.
+      #
+      # This method runs event bridges, relaying the event data to
+      # other services. This method also determines the appropriate
+      # handlers for the event, filtering and mutating the event data
+      # for each of them. The `@handling_event_count` is incremented
+      # by `1`, for each event handler chain (filter -> mutate ->
+      # handle).
+      #
+      # @param event [Hash]
+      def process_event(event)
+        log_level = event[:check][:type] == "metric" ? :debug : :info
+        @logger.send(log_level, "processing event", :event => event)
+        event_bridges(event)
+        handler_list = Array((event[:check][:handlers] || event[:check][:handler]) || "default")
+        handlers = derive_handlers(handler_list)
+        handlers.each do |handler|
+          @handling_event_count += 1
+          filter_event(handler, event) do |event|
+            mutate_event(handler, event) do |event_data|
+              handle_event(handler, event_data)
+            end
+          end
+        end
+      end
+
+      # Add a check result to an aggregate. A check aggregate uses the
+      # check `:name` and the `:issued` timestamp as its unique
+      # identifier. An aggregate uses several counters: the total
+      # number of results in the aggregate, and a counter for each
+      # check severity (ok, warning, etc). Check output is also
+      # stored, to be summarized to aid in identifying outliers for a
+      # check execution across a number of Sensu clients. JSON
+      # serialization is used for storing check result data.
+      #
+      # @param result [Hash]
+      def aggregate_check_result(result)
+        @logger.debug("adding check result to aggregate", :result => result)
+        check = result[:check]
+        result_set = "#{check[:name]}:#{check[:issued]}"
+        result_data = MultiJson.dump(:output => check[:output], :status => check[:status])
+        @redis.hset("aggregation:#{result_set}", result[:client], result_data) do
+          SEVERITIES.each do |severity|
+            @redis.hsetnx("aggregate:#{result_set}", severity, 0)
+          end
+          severity = (SEVERITIES[check[:status]] || "unknown")
+          @redis.hincrby("aggregate:#{result_set}", severity, 1) do
+            @redis.hincrby("aggregate:#{result_set}", "total", 1) do
+              @redis.sadd("aggregates:#{check[:name]}", check[:issued]) do
+                @redis.sadd("aggregates", check[:name])
+              end
+            end
+          end
+        end
+      end
+
+      # Store check result data. This method stores the 21 most recent
+      # check result statuses for a client/check pair, this history
+      # is used for event context and flap detection. The check
+      # execution timestamp is also stored, to provide an indication
+      # of how recent the data is.
+      #
+      # @param client [Hash]
+      # @param check [Hash]
+      # @param callback [Proc] to call when the check result data has
+      #   been stored (history, etc).
+      def store_check_result(client, check, &callback)
+        @redis.sadd("history:#{client[:name]}", check[:name])
+        result_key = "#{client[:name]}:#{check[:name]}"
+        history_key = "history:#{result_key}"
+        @redis.rpush(history_key, check[:status]) do
+          @redis.set("execution:#{result_key}", check[:executed])
+          @redis.ltrim(history_key, -21, -1)
+          callback.call
+        end
+      end
+
+      # Fetch the execution history for a client/check pair, the 21
+      # most recent check result statuses. This method also calculates
+      # the total state change percentage for the history, this value
+      # is used for check state flap detection, using a similar
+      # algorithm to Nagios:
+      # http://nagios.sourceforge.net/docs/3_0/flapping.html
+      #
+      # @param client [Hash]
+      # @param check [Hash]
+      # @param callback [Proc] to be called with the check history and
+      #   total state change value.
+      def check_history(client, check, &callback)
+        history_key = "history:#{client[:name]}:#{check[:name]}"
+        @redis.lrange(history_key, -21, -1) do |history|
+          total_state_change = 0
+          unless history.size < 21
+            state_changes = 0
+            change_weight = 0.8
+            previous_status = history.first
+            history.each do |status|
+              unless status == previous_status
+                state_changes += change_weight
+              end
+              change_weight += 0.02
+              previous_status = status
+            end
+            total_state_change = (state_changes.fdiv(20) * 100).to_i
+          end
+          callback.call(history, total_state_change)
+        end
+      end
+
+      # Determine if a check state is flapping, rapidly changing
+      # between an OK and non-OK state. Flap detection is only done
+      # for checks that have defined low and high flap detection
+      # thresholds, `:low_flap_threshold` and `:high_flap_threshold`.
+      # The `check_history()` method provides the check history and
+      # more importantly the total state change percentage value that
+      # is compared with the configured thresholds defined in the
+      # check data. If a check hasn't been flapping, the
+      # `:total_state_change` must be equal to or higher than the
+      # `:high_flap_threshold` to be changed to flapping. If a check
+      # has been flapping, the `:total_state_change` must be equal to
+      # or lower than the `:low_flap_threshold` to no longer be
+      # flapping. This method uses the same algorithm as Nagios:
+      # http://nagios.sourceforge.net/docs/3_0/flapping.html
+      #
+      # @param stored_event [Hash]
+      # @param check [Hash]
+      # @return [TrueClass, FalseClass]
+      def check_flapping?(stored_event, check)
+        if check.has_key?(:low_flap_threshold) && check.has_key?(:high_flap_threshold)
+          was_flapping = stored_event && stored_event[:action] == "flapping"
+          check[:total_state_change] >= check[:high_flap_threshold] ||
+            (was_flapping && check[:total_state_change] <= check[:low_flap_threshold]) ||
+            was_flapping
+        else
+          false
+        end
+      end
+
+      # Update the event registry, stored in Redis. This method
+      # determines if check data results in the creation or update of
+      # event data in the registry. Existing event data for a
+      # client/check pair is fetched, used in conditionals and the
+      # composition of the new event data. If a check `:status` is not
+      # `0`, or it has been flapping, an event is created/updated in
+      # the registry. If there was existing event data, but the check
+      # `:status` is now `0`, the event is removed (resolved) from the
+      # registry. If the previous conditions are not met, and check
+      # `:type` is `metric` and the `:status` is `0`, the event
+      # registry is not updated, but the provided callback is called
+      # with the event data. JSON serialization is used when storing
+      # data in the registry.
+      #
+      # @param client [Hash]
+      # @param check [Hash]
+      # @param callback [Proc] to be called with the resulting event
+      #   data if the event registry is updated, or the check is of
+      #   type `:metric`.
+      def update_event_registry(client, check, &callback)
+        @redis.hget("events:#{client[:name]}", check[:name]) do |event_json|
+          stored_event = event_json ? MultiJson.load(event_json) : nil
+          flapping = check_flapping?(stored_event, check)
+          event = {
+            :id => random_uuid,
+            :client => client,
+            :check => check,
+            :occurrences => 1
+          }
+          if check[:status] != 0 || flapping
+            if stored_event && check[:status] == stored_event[:check][:status]
+              event[:occurrences] = stored_event[:occurrences] + 1
+            end
+            event[:action] = flapping ? :flapping : :create
+            @redis.hset("events:#{client[:name]}", check[:name], MultiJson.dump(event)) do
+              callback.call(event)
+            end
+          elsif stored_event
+            event[:occurrences] = stored_event[:occurrences]
+            event[:action] = :resolve
+            unless check[:auto_resolve] == false && !check[:force_resolve]
+              @redis.hdel("events:#{client[:name]}", check[:name]) do
+                callback.call(event)
+              end
+            end
+          elsif check[:type] == "metric"
+            callback.call(event)
+          end
+        end
+      end
+
+      # Process a check result, storing its data, inspecting its
+      # contents, and taking the appropriate actions (eg. update the
+      # event registry). A check result must have a valid client name,
+      # associated with a client in the registry. Results without a
+      # valid client are discarded, to keep the system "correct". If a
+      # local check definition exists for the check name, and the
+      # check result is not from a standalone check execution, it's
+      # merged with the check result for more context.
+      #
+      # @param result [Hash] data.
+      def process_check_result(result)
+        @logger.debug("processing result", :result => result)
+        @redis.get("client:#{result[:client]}") do |client_json|
+          unless client_json.nil?
+            client = MultiJson.load(client_json)
+            check = case
+            when @settings.check_exists?(result[:check][:name]) && !result[:check][:standalone]
+              @settings[:checks][result[:check][:name]].merge(result[:check])
+            else
+              result[:check]
+            end
+            aggregate_check_result(result) if check[:aggregate]
+            store_check_result(client, check) do
+              check_history(client, check) do |history, total_state_change|
+                check[:history] = history
+                check[:total_state_change] = total_state_change
+                update_event_registry(client, check) do |event|
+                  process_event(event)
+                end
+              end
+            end
+          else
+            @logger.warn("client not in registry", :client => result[:client])
+          end
+        end
+      end
+
+      # Set up the check result consumer. The consumer receives JSON
+      # serialized check results from the transport, parses them, and
+      # calls `process_check_result()` with the result data to be
+      # processed. Transport message acknowledgements are used to
+      # ensure that results make it to processing. The transport
+      # message acknowledgements are currently done in the next tick
+      # of the EventMachine reactor (event loop), as a flow control
+      # mechanism. Result JSON parsing errors are logged.
+      def setup_results
+        @logger.debug("subscribing to results")
+        @transport.subscribe(:direct, "results", "results", :ack => true) do |message_info, message|
+          begin
+            result = MultiJson.load(message)
+            @logger.debug("received result", :result => result)
+            process_check_result(result)
+          rescue MultiJson::ParseError => error
+            @logger.error("failed to parse result payload", {
+              :message => message,
+              :error => error.to_s
+            })
+          end
+          EM::next_tick do
+            @transport.ack(message_info)
+          end
+        end
+      end
+
+      # Publish a check request to the transport. A check request is
+      # composed of a check `:name`, an `:issued` timestamp, and a
+      # check `:command` if available. The check request is published
+      # to a transport pipe, for each of the check `:subscribers` in
+      # its definition, eg. "webserver". JSON serialization is used
+      # when publishing the check request payload to the transport
+      # pipes. Transport errors are logged.
+      #
+      # @param check [Hash] definition.
+      def publish_check_request(check)
+        payload = {
+          :name => check[:name],
+          :issued => Time.now.to_i
+        }
+        payload[:command] = check[:command] if check.has_key?(:command)
+        @logger.info("publishing check request", {
+          :payload => payload,
+          :subscribers => check[:subscribers]
+        })
+        check[:subscribers].each do |subscription|
+          @transport.publish(:fanout, subscription, MultiJson.dump(payload)) do |info|
+            if info[:error]
+              @logger.error("failed to publish check request", {
+                :subscription => subscription,
+                :payload => payload,
+                :error => info[:error].to_s
+              })
+            end
+          end
+        end
+      end
+
+      # Calculate a check execution splay, taking into account the
+      # current time and the execution interval to ensure it's
+      # consistent between process restarts.
+      #
+      # @param check [Hash] definition.
+      def calculate_check_execution_splay(check)
+        splay_hash = Digest::MD5.digest(check[:name]).unpack('Q<').first
+        current_time = (Time.now.to_f * 1000).to_i
+        (splay_hash - current_time) % (check[:interval] * 1000) / 1000.0
+      end
+
+      # Schedule check executions, using EventMachine periodic timers,
+      # using a calculated execution splay. The timers are stored in
+      # the timers hash under `:master`, as check request publishing
+      # is a task for only the Sensu server master, so they can be
+      # cancelled etc. Check requests are not published if subdued.
+      #
+      # @param checks [Array] of definitions.
+      def schedule_check_executions(checks)
+        checks.each do |check|
+          create_check_request = Proc.new do
+            unless check_request_subdued?(check)
+              publish_check_request(check)
+            else
+              @logger.info("check request was subdued", :check => check)
+            end
+          end
+          execution_splay = testing? ? 0 : calculate_check_execution_splay(check)
+          interval = testing? ? 0.5 : check[:interval]
+          @timers[:master] << EM::Timer.new(execution_splay) do
+            create_check_request.call
+            @timers[:master] << EM::PeriodicTimer.new(interval, &create_check_request)
+          end
+        end
+      end
+
+      # Set up the check request publisher. This method creates an
+      # array of check definitions, that are not standalone checks,
+      # and do not have `:publish` set to `false`. The array of check
+      # definitions includes those from standard checks and extensions
+      # (with a defined execution `:interval`). The array is provided
+      # to the `schedule_check_executions()` method.
+      def setup_check_request_publisher
+        @logger.debug("scheduling check requests")
+        standard_checks = @settings.checks.reject do |check|
+          check[:standalone] || check[:publish] == false
+        end
+        extension_checks = @extensions.checks.reject do |check|
+          check[:standalone] || check[:publish] == false || !check[:interval].is_a?(Integer)
+        end
+        schedule_check_executions(standard_checks + extension_checks)
+      end
+
+      # Publish a check result to the transport for processing. A
+      # check result is composed of a client name and a check
+      # definition, containing check `:output` and `:status`. JSON
+      # serialization is used when publishing the check result payload
+      # to the transport pipe. Transport errors are logged.
+      #
+      # @param client [Hash]
+      # @param check [Hash]
+      def publish_check_result(client, check)
+        payload = {
+          :client => client[:name],
+          :check => check
+        }
+        @logger.debug("publishing check result", :payload => payload)
+        @transport.publish(:direct, "results", MultiJson.dump(payload)) do |info|
+          if info[:error]
+            @logger.error("failed to publish check result", {
+              :payload => payload,
+              :error => info[:error].to_s
+            })
+          end
+        end
+      end
+
+      # Create a keepalive check definition for a client. Client
+      # definitions may contain `:keepalive` configuration, containing
+      # specific thresholds and handler information. The keepalive
+      # check definition creation begins with default thresholds, and
+      # sets the `:handler` to `keepalive`, if the handler has a local
+      # definition. If the client provides its own `:keepalive`
+      # configuration, it's deep merged with the defaults. The check
+      # `:name`, `:issued`, and `:executed` values are always
+      # overridden to guard against an invalid definition.
+      def create_keepalive_check(client)
+        check = {
+          :thresholds => {
+            :warning => 120,
+            :critical => 180
+          }
+        }
+        if @settings.handler_exists?(:keepalive)
+          check[:handler] = "keepalive"
+        end
+        if client.has_key?(:keepalive)
+          check = deep_merge(check, client[:keepalive])
+        end
+        timestamp = Time.now.to_i
+        check.merge(:name => "keepalive", :issued => timestamp, :executed => timestamp)
+      end
+
+      # Determine stale clients, those that have not sent a keepalive
+      # in a specified amount of time (thresholds). This method
+      # iterates through the client registry, creating a keepalive
+      # check definition with the `create_keepalive_check()` method,
+      # containing client specific staleness thresholds. If the time
+      # since the latest keepalive is equal to or greater than a
+      # threshold, the check `:output` is set to a descriptive
+      # message, and `:status` is set to the appropriate non-zero
+      # value. If a client has been sending keepalives, `:output` and
+      # `:status` are set to indicate an OK state. A check result is
+      # published for every client in the registry.
+      def determine_stale_clients
+        @logger.info("determining stale clients")
+        @redis.smembers("clients") do |clients|
+          clients.each do |client_name|
+            @redis.get("client:#{client_name}") do |client_json|
+              unless client_json.nil?
+                client = MultiJson.load(client_json)
+                check = create_keepalive_check(client)
+                time_since_last_keepalive = Time.now.to_i - client[:timestamp]
+                check[:output] = "No keepalive sent from client for "
+                check[:output] << "#{time_since_last_keepalive} seconds"
+                case
+                when time_since_last_keepalive >= check[:thresholds][:critical]
+                  check[:output] << " (>=#{check[:thresholds][:critical]})"
+                  check[:status] = 2
+                when time_since_last_keepalive >= check[:thresholds][:warning]
+                  check[:output] << " (>=#{check[:thresholds][:warning]})"
+                  check[:status] = 1
+                else
+                  check[:output] = "Keepalive sent from client "
+                  check[:output] << "#{time_since_last_keepalive} seconds ago"
+                  check[:status] = 0
+                end
+                publish_check_result(client, check)
+              end
+            end
+          end
+        end
+      end
+
+      # Set up the client monitor, a periodic timer to run
+      # `determine_stale_clients()` every 30 seconds. The timer is
+      # stored in the timers hash under `:master`.
+      def setup_client_monitor
+        @logger.debug("monitoring client keepalives")
+        @timers[:master] << EM::PeriodicTimer.new(30) do
+          determine_stale_clients
+        end
+      end
+
+      # Prune check result aggregations (aggregates). Sensu only
+      # stores the 20 latest aggregations for a check, to keep the
+      # amount of data stored to a minimum.
+      def prune_check_result_aggregations
+        @logger.info("pruning check result aggregations")
+        @redis.smembers("aggregates") do |checks|
+          checks.each do |check_name|
+            @redis.smembers("aggregates:#{check_name}") do |aggregates|
+              if aggregates.size > 20
+                aggregates.sort!
+                aggregates.take(aggregates.size - 20).each do |check_issued|
+                  @redis.srem("aggregates:#{check_name}", check_issued) do
+                    result_set = "#{check_name}:#{check_issued}"
+                    @redis.del("aggregate:#{result_set}") do
+                      @redis.del("aggregation:#{result_set}") do
+                        @logger.debug("pruned aggregation", {
+                          :check => {
+                            :name => check_name,
+                            :issued => check_issued
+                          }
+                        })
+                      end
+                    end
+                  end
+                end
+              end
+            end
+          end
+        end
+      end
+
+      # Set up the check result aggregation pruner, using a periodic
+      # timer to run `prune_check_result_aggregations()` every 20
+      # seconds. The timer is stored in the timers hash under
+      # `:master`.
+      def setup_check_result_aggregation_pruner
+        @logger.debug("pruning check result aggregations")
+        @timers[:master] << EM::PeriodicTimer.new(20) do
+          prune_check_result_aggregations
+        end
+      end
+
+      # Set up the master duties, tasks only performed by a single
+      # Sensu server at a time. The duties include publishing check
+      # requests, monitoring for stale clients, and pruning check
+      # result aggregations.
+      def master_duties
+        setup_check_request_publisher
+        setup_client_monitor
+        setup_check_result_aggregation_pruner
+      end
+
+      # Request a master election, a process to determine if the
+      # current process is the master Sensu server, with its
+      # own/unique duties. A Redis key/value is used as a central
+      # lock, using the "SETNX" Redis command to set the key/value if
+      # it does not exist, using a timestamp for the value. If the
+      # current process was able to create the key/value, it is the
+      # master, and must do the duties of the master. If the current
+      # process was not able to create the key/value, but the current
+      # timestamp value is equal to or over 30 seconds ago, the
+      # "GETSET" Redis command is used to set a new timestamp and
+      # fetch the previous value to compare them, to determine if it
+      # was set by the current process. If the current process is able
+      # to set the timestamp value, it becomes the master. The master
+      # has `@is_master` set to `true`.
+      def request_master_election
+        @redis.setnx("lock:master", Time.now.to_i) do |created|
+          if created
+            @is_master = true
+            @logger.info("i am the master")
+            master_duties
+          else
+            @redis.get("lock:master") do |timestamp|
+              if Time.now.to_i - timestamp.to_i >= 30
+                @redis.getset("lock:master", Time.now.to_i) do |previous|
+                  if previous == timestamp
+                    @is_master = true
+                    @logger.info("i am now the master")
+                    master_duties
+                  end
+                end
+              end
+            end
+          end
+        end
+      end
+
+      # Set up the master monitor. A one-time timer is used to run
+      # `request_master_election()` in 2 seconds. A periodic timer is
+      # used to update the master lock timestamp if the current
+      # process is the master, or to run `request_master_election()`,
+      # every 10 seconds. The timers are stored in the timers hash
+      # under `:run`.
+      def setup_master_monitor
+        @timers[:run] << EM::Timer.new(2) do
+          request_master_election
+        end
+        @timers[:run] << EM::PeriodicTimer.new(10) do
+          if @is_master
+            @redis.set("lock:master", Time.now.to_i) do
+              @logger.debug("updated master lock timestamp")
+            end
+          else
+            request_master_election
+          end
+        end
+      end
+
+      # Resign as master, if the current process is the Sensu server
+      # master. This method cancels and clears the master timers,
+      # those with references stored in the timers hash under
+      # `:master`, and `@is_master` is set to `false`.
+      def resign_as_master
+        if @is_master
+          @logger.warn("resigning as master")
+          @timers[:master].each do |timer|
+            timer.cancel
+          end
+          @timers[:master].clear
+          @is_master = false
+        else
+          @logger.debug("not currently master")
+        end
+      end
+
+      # Unsubscribe from transport subscriptions (all of them). This
+      # method is called when there are issues with connectivity, or
+      # the process is stopping.
+      def unsubscribe
+        @logger.warn("unsubscribing from keepalive and result queues")
+        @transport.unsubscribe
+      end
+
+      # Complete event handling currently in progress. The
+      # `:handling_event_count` is used to determine if event handling
+      # is complete, when it is equal to `0`. The provided callback is
+      # called when handling is complete.
+      #
+      # @param callback [Proc] to call when event handling is
+      #   complete.
+      def complete_event_handling(&callback)
+        @logger.info("completing event handling in progress", {
+          :handling_event_count => @handling_event_count
+        })
+        retry_until_true do
+          if @handling_event_count == 0
+            callback.call
+            true
+          end
+        end
+      end
+
+      # Bootstrap the Sensu server process, setting up the keepalive
+      # and check result consumers, and attempting to become the master
+      # to carry out its duties. This method sets the process/daemon
+      # `@state` to `:running`.
+      def bootstrap
+        setup_keepalives
+        setup_results
+        setup_master_monitor
+        @state = :running
+      end
+
+      # Start the Sensu server process, connecting to Redis, the
+      # transport, and calling the `bootstrap()` method.
+      def start
+        setup_redis
+        setup_transport
+        bootstrap
+      end
+
+      # Pause the Sensu server process, unless it is being paused or
+      # has already been paused. The process/daemon `@state` is first
+      # set to `:pausing`, to indicate that it's in progress. All run
+      # timers are cancelled, and the references are cleared. The
+      # Sensu server will unsubscribe from all transport
+      # subscriptions, resign as master (if currently the master),
+      # then set the process/daemon `@state` to `:paused`.
+      def pause
+        unless @state == :pausing || @state == :paused
+          @state = :pausing
+          @timers[:run].each do |timer|
+            timer.cancel
+          end
+          @timers[:run].clear
+          unsubscribe
+          resign_as_master
+          @state = :paused
+        end
+      end
+
+      # Resume the Sensu server process if it is currently or will
+      # soon be paused. The `retry_until_true` helper method is used
+      # to determine if the process is paused and if the Redis and
+      # transport connections are connected. If the conditions are
+      # met, `bootstrap()` will be called and true is returned to stop
+      # `retry_until_true`.
+      def resume
+        retry_until_true(1) do
+          if @state == :paused
+            if @redis.connected? && @transport.connected?
+              bootstrap
+              true
+            end
+          end
+        end
+      end
+
+      # Stop the Sensu server process, pausing it, completing event
+      # handling in progress, closing the Redis and transport
+      # connections, and exiting the process (exit 0). After pausing
+      # the process, the process/daemon `@state` is set to
+      # `:stopping`.
+      def stop
+        @logger.warn("stopping")
+        pause
+        @state = :stopping
+        complete_event_handling do
+          @redis.close
+          @transport.close
+          super
+        end
+      end
+    end
+  end
+end
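A note on the scheduling code above: `calculate_check_execution_splay` offsets the first check request by the time remaining until the next interval boundary derived from an MD5 hash of the check name, so the periodic timer that follows fires at the same wall-clock offsets regardless of when the process (re)started. The following is a small standalone sketch reproducing that formula outside the server class; the check name and interval are hypothetical, not taken from this diff.

```ruby
# Standalone sketch of the splay formula used by
# calculate_check_execution_splay above; check name/interval are hypothetical.
require "digest"

check = {:name => "cpu_load", :interval => 60}

# 64-bit integer derived from the check name, and the current time in ms.
splay_hash = Digest::MD5.digest(check[:name]).unpack('Q<').first
current_time = (Time.now.to_f * 1000).to_i

# A delay in [0, interval) seconds. The first timer fires at a time congruent
# to splay_hash modulo the interval, so the schedule is stable across restarts.
splay = (splay_hash - current_time) % (check[:interval] * 1000) / 1000.0
puts splay
```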