portertech-sensu 1.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +961 -0
  3. data/MIT-LICENSE.txt +20 -0
  4. data/README.md +65 -0
  5. data/exe/sensu-api +10 -0
  6. data/exe/sensu-client +10 -0
  7. data/exe/sensu-install +195 -0
  8. data/exe/sensu-server +10 -0
  9. data/lib/sensu/api/http_handler.rb +434 -0
  10. data/lib/sensu/api/process.rb +79 -0
  11. data/lib/sensu/api/routes/aggregates.rb +196 -0
  12. data/lib/sensu/api/routes/checks.rb +44 -0
  13. data/lib/sensu/api/routes/clients.rb +171 -0
  14. data/lib/sensu/api/routes/events.rb +86 -0
  15. data/lib/sensu/api/routes/health.rb +45 -0
  16. data/lib/sensu/api/routes/info.rb +37 -0
  17. data/lib/sensu/api/routes/request.rb +44 -0
  18. data/lib/sensu/api/routes/resolve.rb +32 -0
  19. data/lib/sensu/api/routes/results.rb +153 -0
  20. data/lib/sensu/api/routes/settings.rb +23 -0
  21. data/lib/sensu/api/routes/silenced.rb +182 -0
  22. data/lib/sensu/api/routes/stashes.rb +107 -0
  23. data/lib/sensu/api/routes.rb +88 -0
  24. data/lib/sensu/api/utilities/filter_response_content.rb +44 -0
  25. data/lib/sensu/api/utilities/publish_check_request.rb +107 -0
  26. data/lib/sensu/api/utilities/publish_check_result.rb +39 -0
  27. data/lib/sensu/api/utilities/resolve_event.rb +29 -0
  28. data/lib/sensu/api/utilities/servers_info.rb +43 -0
  29. data/lib/sensu/api/utilities/transport_info.rb +43 -0
  30. data/lib/sensu/api/validators/check.rb +55 -0
  31. data/lib/sensu/api/validators/client.rb +35 -0
  32. data/lib/sensu/api/validators/invalid.rb +8 -0
  33. data/lib/sensu/cli.rb +69 -0
  34. data/lib/sensu/client/http_socket.rb +217 -0
  35. data/lib/sensu/client/process.rb +655 -0
  36. data/lib/sensu/client/socket.rb +207 -0
  37. data/lib/sensu/client/utils.rb +53 -0
  38. data/lib/sensu/client/validators/check.rb +53 -0
  39. data/lib/sensu/constants.rb +17 -0
  40. data/lib/sensu/daemon.rb +396 -0
  41. data/lib/sensu/sandbox.rb +19 -0
  42. data/lib/sensu/server/filter.rb +227 -0
  43. data/lib/sensu/server/handle.rb +201 -0
  44. data/lib/sensu/server/mutate.rb +92 -0
  45. data/lib/sensu/server/process.rb +1646 -0
  46. data/lib/sensu/server/socket.rb +54 -0
  47. data/lib/sensu/server/tessen.rb +170 -0
  48. data/lib/sensu/utilities.rb +398 -0
  49. data/lib/sensu.rb +3 -0
  50. data/sensu.gemspec +36 -0
  51. metadata +322 -0
data/lib/sensu/server/process.rb
@@ -0,0 +1,1646 @@
1
+ require "sensu/daemon"
2
+ require "sensu/server/filter"
3
+ require "sensu/server/mutate"
4
+ require "sensu/server/handle"
5
+ require "sensu/server/tessen"
6
+
7
+ module Sensu
8
+ module Server
9
+ class Process
10
+ include Daemon
11
+ include Filter
12
+ include Mutate
13
+ include Handle
14
+
15
+ attr_reader :tasks, :in_progress
16
+
17
+ TASKS = ["check_request_publisher", "client_monitor", "check_result_monitor"]
18
+
19
+ STANDARD_CHECK_TYPE = "standard".freeze
20
+
21
+ METRIC_CHECK_TYPE = "metric".freeze
22
+
23
+ EVENT_FLAPPING_ACTION = "flapping".freeze
24
+
25
+ DEFAULT_HANDLER_NAME = "default".freeze
26
+
27
+ # Create an instance of the Sensu server process, start the
28
+ # server within the EventMachine event loop, and set up server
29
+ # process signal traps (for stopping).
30
+ #
31
+ # @param options [Hash]
32
+ def self.run(options={})
33
+ server = self.new(options)
34
+ EM::run do
35
+ server.start
36
+ server.setup_signal_traps
37
+ end
38
+ end
39
+
40
+ # Override Daemon initialize() to support Sensu server tasks and
41
+ # the in-progress event handling count.
42
+ #
43
+ # @param options [Hash]
44
+ def initialize(options={})
45
+ super
46
+ @tasks = []
47
+ @timers[:tasks] = {}
48
+ TASKS.each do |task|
49
+ @timers[:tasks][task.to_sym] = []
50
+ end
51
+ @in_progress = Hash.new(0)
52
+ end
53
+
54
+ # Set up the Redis and Transport connection objects, `@redis`
55
+ # and `@transport`. This method updates the Redis on error
56
+ # callback to reset the in progress check result counter. This
57
+ # method "drys" up many instances of `setup_redis()` and
58
+ # `setup_transport()`, particularly in the specs.
59
+ #
60
+ # @yield callback/block called after connecting to Redis and the
61
+ # Sensu Transport.
62
+ def setup_connections
63
+ setup_redis do
64
+ @redis.on_error do |error|
65
+ @logger.error("redis connection error", :error => error.to_s)
66
+ @in_progress[:check_results] = 0
67
+ end
68
+ setup_transport do
69
+ yield
70
+ end
71
+ end
72
+ end
73
+
74
+ # Create a registration check definition for a client. Client
75
+ # definitions may contain `:registration` configuration,
76
+ # containing custom attributes and handler information. By
77
+ # default, the registration check definition sets the `:handler`
78
+ # to `registration`. If the client provides its own
79
+ # `:registration` configuration, it's deep merged with the
80
+ # defaults. The check `:name`, `:output`, `:issued`, and
81
+ # `:executed` values are always overridden to guard against an
82
+ # invalid definition.
83
+ def create_registration_check(client)
84
+ check = {:handler => "registration", :status => 1}
85
+ if client.has_key?(:registration)
86
+ check = deep_merge(check, client[:registration])
87
+ end
88
+ timestamp = Time.now.to_i
89
+ overrides = {
90
+ :name => "registration",
91
+ :output => "new client registration",
92
+ :issued => timestamp,
93
+ :executed => timestamp
94
+ }
95
+ check.merge(overrides)
96
+ end
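For illustration, a minimal sketch (not part of the gem) of how a client's own `:registration` configuration is deep merged over the defaults; the client data and handler name below are hypothetical.

# Hypothetical client definition with custom registration configuration.
client = {
  :name => "i-424242",
  :registration => {
    :handler => "pagerduty",
    :status => 0                    # overrides the default :status of 1
  }
}

# create_registration_check(client) would then produce a definition like:
# {
#   :handler  => "pagerduty",
#   :status   => 0,
#   :name     => "registration",    # always overridden
#   :output   => "new client registration",
#   :issued   => 1546300800,        # example timestamp
#   :executed => 1546300800
# }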
97
+
98
+ # Create and process a client registration event. A registration
99
+ # event is created when a Sensu client is first added to the
100
+ # client registry. The `create_registration_check()` method is
101
+ # called to create a registration check definition for the
102
+ # client.
103
+ #
104
+ # @param client [Hash] definition.
105
+ def create_client_registration_event(client)
106
+ check = create_registration_check(client)
107
+ create_event(client, check) do |event|
108
+ event_bridges(event)
109
+ process_event(event)
110
+ end
111
+ end
112
+
113
+ # Process an initial client registration, when it is first added
114
+ # to the client registry. If a registration handler is defined
115
+ # or the client specifies one, a client registration event is
116
+ # created and processed (handled, etc.) for the client
117
+ # (`create_client_registration_event()`).
118
+ #
119
+ # @param client [Hash] definition.
120
+ def process_client_registration(client)
121
+ if @settings.handler_exists?("registration") || client[:registration]
122
+ create_client_registration_event(client)
123
+ end
124
+ end
125
+
126
+ # Update the Sensu client registry, stored in Redis. Sensu
127
+ # client data is used to provide additional event context and
128
+ # enable agent health monitoring.
129
+ #
130
+ # To enable silencing individual clients, per-client
131
+ # subscriptions (`client:$CLIENT_NAME`) are added to client
132
+ # subscriptions automatically.
133
+ #
134
+ # The client registry supports client signatures, unique string
135
+ # identifiers used for keepalive and result source
136
+ # verification. If a client has a signature, all further
137
+ # registry updates for the client must have the same
138
+ # signature. A client can begin to use a signature if one was
139
+ # not previously configured. JSON serialization is used for the
140
+ # stored client data.
141
+ #
142
+ # @param client [Hash]
143
+ # @yield [success] passes success status to optional
144
+ # callback/block.
145
+ # @yieldparam success [TrueClass,FalseClass] indicating if the
146
+ # client registry update was a success or the client data was
147
+ # discarded due to client signature mismatch.
148
+ def update_client_registry(client)
149
+ @logger.debug("updating client registry", :client => client)
150
+ client_key = "client:#{client[:name]}"
151
+ client[:subscriptions] = (client[:subscriptions] + [client_key]).uniq
152
+ signature_key = "#{client_key}:signature"
153
+ @redis.setnx(signature_key, client[:signature]) do |created|
154
+ process_client_registration(client) if created
155
+ @redis.get(signature_key) do |signature|
156
+ if (signature.nil? || signature.empty?) && client[:signature]
157
+ @redis.set(signature_key, client[:signature])
158
+ end
159
+ if signature.nil? || signature.empty? || client[:signature] == signature
160
+ @redis.multi
161
+ @redis.set(client_key, Sensu::JSON.dump(client))
162
+ @redis.sadd("clients", client[:name])
163
+ @redis.exec do
164
+ yield(true) if block_given?
165
+ end
166
+ else
167
+ @logger.warn("invalid client signature", {
168
+ :client => client,
169
+ :signature => signature
170
+ })
171
+ @logger.warn("not updating client in the registry", :client => client)
172
+ yield(false) if block_given?
173
+ end
174
+ end
175
+ end
176
+ end
177
+
178
+ # Determine if a transport message is under the optional
179
+ # configured max message size. This method helps prevent
180
+ # oversized messages from consuming memory and being persisted
181
+ # to the datastore.
182
+ #
183
+ # @param message [String]
184
+ # @return [TrueClass,FalseClass]
185
+ def message_size_ok?(message)
186
+ if @settings[:sensu][:server] &&
187
+ @settings[:sensu][:server][:max_message_size]
188
+ message_size = message.bytesize
189
+ max_message_size = @settings[:sensu][:server][:max_message_size]
190
+ if message_size <= max_message_size
191
+ true
192
+ else
193
+ @logger.error("message exceeds the configured max message size", {
194
+ :max_message_size => max_message_size,
195
+ :message_size => message_size,
196
+ :message => message
197
+ })
198
+ false
199
+ end
200
+ else
201
+ true
202
+ end
203
+ end
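By way of illustration, a hypothetical settings fragment (attribute path as read by the code above, value invented) showing where the optional limit lives:

# Any transport message larger than 1 MB would then be logged and dropped,
# never parsed or persisted.
settings = {
  :sensu => {
    :server => {
      :max_message_size => 1048576 # bytes
    }
  }
}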
204
+
205
+ # Set up the client keepalive consumer, keeping the Sensu client
206
+ # registry updated. The consumer receives JSON serialized client
207
+ # keepalives from the transport, parses them, and calls
208
+ # `update_client_registry()` with the client data to update the
209
+ # registry. Transport message acknowledgements are used to
210
+ # ensure the client registry is updated successfully. Keepalive
211
+ # JSON parsing errors are logged.
212
+ def setup_keepalives
213
+ keepalives_pipe = "keepalives"
214
+ if @settings[:sensu][:server] && @settings[:sensu][:server][:keepalives_pipe]
215
+ keepalives_pipe = @settings[:sensu][:server][:keepalives_pipe]
216
+ end
217
+ @logger.debug("subscribing to keepalives", :pipe => keepalives_pipe)
218
+ @transport.subscribe(:direct, keepalives_pipe, "keepalives", :ack => true) do |message_info, message|
219
+ @logger.debug("received keepalive", :message => message)
220
+ if message_size_ok?(message)
221
+ begin
222
+ client = Sensu::JSON.load(message)
223
+ update_client_registry(client)
224
+ rescue Sensu::JSON::ParseError => error
225
+ @logger.error("failed to parse keepalive payload", {
226
+ :message => message,
227
+ :error => error.to_s
228
+ })
229
+ end
230
+ end
231
+ EM::next_tick do
232
+ @transport.ack(message_info)
233
+ end
234
+ end
235
+ end
236
+
237
+ # Expand event handler sets, creating an array of handler
238
+ # definitions. Handler sets cannot be deeply nested (by choice),
239
+ # this method will return `nil` if an attempt is made to deeply
240
+ # nest. If the provided handler definition is not a set, it is
241
+ # returned.
242
+ #
243
+ # @param handler [Hash] definition.
244
+ # @param depth [Integer] of the expansion.
245
+ # @return [Array, Hash, Nil]
246
+ def expand_handler_sets(handler, depth=0)
247
+ if handler[:type] == "set"
248
+ if depth < 2
249
+ derive_handlers(handler[:handlers], depth + 1)
250
+ else
251
+ @logger.error("handler sets cannot be deeply nested", :handler => handler)
252
+ nil
253
+ end
254
+ else
255
+ handler
256
+ end
257
+ end
258
+
259
+ # Derive an array of handler definitions from a list of handler
260
+ # names. This method first checks for the existence of standard
261
+ # handlers, followed by handler extensions. If a handler does
262
+ # not exist for a name, it is logged and ignored. Duplicate
263
+ # handler definitions are removed.
264
+ #
265
+ # @param handler_list [Array]
266
+ # @param depth [Integer] of handler set expansion.
267
+ # @return [Array]
268
+ def derive_handlers(handler_list, depth=0)
269
+ handler_list.compact.map { |handler_name|
270
+ case
271
+ when @settings.handler_exists?(handler_name)
272
+ handler = @settings[:handlers][handler_name].merge(:name => handler_name)
273
+ expand_handler_sets(handler, depth)
274
+ when @extensions.handler_exists?(handler_name)
275
+ @extensions[:handlers][handler_name]
276
+ else
277
+ @logger.error("unknown handler", :handler_name => handler_name)
278
+ nil
279
+ end
280
+ }.flatten.compact.uniq
281
+ end
282
+
283
+ # Process an event: filter -> mutate -> handle.
284
+ #
285
+ # This method determines the appropriate handlers for an event,
286
+ # filtering and mutating the event data for each of them. The
287
+ # `@in_progress[:events]` counter is incremented by `1`, for
288
+ # each event handler chain (filter -> mutate -> handle).
289
+ #
290
+ # @param event [Hash]
291
+ def process_event(event)
292
+ log_level = event[:check][:type] == METRIC_CHECK_TYPE ? :debug : :info
293
+ @logger.send(log_level, "processing event", :event => event)
294
+ handler_list = Array((event[:check][:handlers] || event[:check][:handler]) || DEFAULT_HANDLER_NAME)
295
+ handlers = derive_handlers(handler_list)
296
+ handlers.each do |handler|
297
+ @in_progress[:events] += 1
298
+ filter_event(handler, event) do |event|
299
+ mutate_event(handler, event) do |event_data|
300
+ handle_event(handler, event_data, event[:id])
301
+ end
302
+ end
303
+ end
304
+ end
305
+
306
+ # Run event bridge extensions, within the Sensu EventMachine
307
+ # reactor (event loop). The extension API `safe_run()` method is
308
+ # used to guard against most errors. Bridges are for relaying
309
+ # Sensu event data to other services.
310
+ #
311
+ # @param event [Hash]
312
+ def event_bridges(event)
313
+ @extensions[:bridges].each do |name, bridge|
314
+ bridge.safe_run(event) do |output, status|
315
+ @logger.debug("bridge extension output", {
316
+ :extension => bridge.definition,
317
+ :output => output
318
+ })
319
+ end
320
+ end
321
+ end
322
+
323
+ # Add a check result to one or more aggregates. The aggregate name is
324
+ # determined by the value of check `:aggregates` array, if present,
325
+ # and falling back to `:aggregate` otherwise.
326
+ #
327
+ # When one or more aggregates are specified as `:aggregates`, the
328
+ # client name and check are updated on each aggregate.
329
+ #
330
+ # When no aggregates are specified as `:aggregates`, and `:aggregate`
331
+ # is `true` (legacy), the check `:name` is used as the aggregate name.
332
+ #
333
+ # When no aggregates are specified as `:aggregates` and check `:aggregate`
334
+ # is a string, it is used as the aggregate name.
335
+ #
336
+ # This method will add the client name to the configured aggregates; all
337
+ # other processing (e.g. counters) is done by the Sensu API on request.
338
+ #
339
+ # @param client [Hash]
340
+ # @param check [Hash]
341
+ def aggregate_check_result(client, check)
342
+ check_aggregate = (check[:aggregate].is_a?(String) ? check[:aggregate] : check[:name])
343
+ aggregate_list = Array(check[:aggregates] || check_aggregate)
344
+ aggregate_list.each do |aggregate|
345
+ @logger.debug("adding check result to aggregate", {
346
+ :aggregate => aggregate,
347
+ :client => client,
348
+ :check => check
349
+ })
350
+ aggregate_member = "#{client[:name]}:#{check[:name]}"
351
+ @redis.sadd("aggregates:#{aggregate}", aggregate_member) do
352
+ @redis.sadd("aggregates", aggregate)
353
+ end
354
+ end
355
+ end
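A brief sketch of the resulting Redis writes, assuming a hypothetical check and a result from a client named "app01":

# Hypothetical check definition routed into two named aggregates.
check = {
  :name => "cpu_usage",
  :command => "check-cpu.rb",
  :interval => 60,
  :aggregates => ["frontend", "backend"]
}
# For a result from client "app01", the member "app01:cpu_usage" is added
# to the sets "aggregates:frontend" and "aggregates:backend", and both
# aggregate names are added to the "aggregates" set.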
356
+
357
+ # Truncate check output. Metric checks (`"type": "metric"`), or
358
+ # checks with `"truncate_output": true`, have their output
359
+ # truncated to a single line and a maximum character length of
360
+ # 255 by default. The maximum character length can be changed via
361
+ # the `"truncate_output_length"` check definition attribute.
362
+ #
363
+ # @param check [Hash]
364
+ # @return [Hash] check with truncated output.
365
+ def truncate_check_output(check)
366
+ if check[:truncate_output] ||
367
+ (check[:type] == METRIC_CHECK_TYPE && check[:truncate_output] != false)
368
+ begin
369
+ output_lines = check[:output].split("\n")
370
+ rescue ArgumentError
371
+ utf8_output = check[:output].encode("UTF-8", "binary", **{
372
+ :invalid => :replace,
373
+ :undef => :replace,
374
+ :replace => ""
375
+ })
376
+ output_lines = utf8_output.split("\n")
377
+ end
378
+ output = output_lines.first || check[:output]
379
+ truncate_output_length = check.fetch(:truncate_output_length, 255)
380
+ if output_lines.length > 1 || output.length > truncate_output_length
381
+ output = output[0..truncate_output_length] + "\n..."
382
+ end
383
+ check.merge(:output => output)
384
+ else
385
+ check
386
+ end
387
+ end
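A standalone sketch of the truncation rules above, using made-up metric output; multi-line output keeps only its first line (capped at the configured length) plus a trailing ellipsis marker:

output = "cpu.user 42 1546300800\ncpu.system 7 1546300800"
truncate_output_length = 255
lines = output.split("\n")
truncated = lines.first
if lines.length > 1 || truncated.length > truncate_output_length
  truncated = truncated[0..truncate_output_length] + "\n..."
end
truncated # => "cpu.user 42 1546300800\n..."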
388
+
389
+ # Store check result data. This method stores check result data
390
+ # and the 21 most recent check result statuses for a client/check
391
+ # pair; this history is used for event context and flap detection.
392
+ # The check execution timestamp is also stored, to provide an
393
+ # indication of how recent the data is. Check output is
394
+ # truncated by `truncate_check_output()` before it is stored.
395
+ #
396
+ # @param client [Hash]
397
+ # @param check [Hash]
398
+ # @yield [] callback/block called after the check data has been
399
+ # stored (history, etc).
400
+ def store_check_result(client, check)
401
+ @logger.debug("storing check result", :check => check)
402
+ result_key = "#{client[:name]}:#{check[:name]}"
403
+ history_key = "history:#{result_key}"
404
+ check_truncated = truncate_check_output(check)
405
+ @redis.multi
406
+ @redis.sadd("result:#{client[:name]}", check[:name])
407
+ @redis.set("result:#{result_key}", Sensu::JSON.dump(check_truncated))
408
+ @redis.sadd("ttl", result_key) if check[:ttl]
409
+ @redis.rpush(history_key, check[:status])
410
+ @redis.ltrim(history_key, -21, -1)
411
+ if check[:status] == 0
412
+ @redis.set("#{history_key}:last_ok", check.fetch(:executed, Time.now.to_i))
413
+ end
414
+ @redis.exec do
415
+ yield
416
+ end
417
+ end
418
+
419
+ # Fetch the execution history for a client/check pair, the 21
420
+ # most recent check result statuses. This method also calculates
421
+ # the total state change percentage for the history; this value
422
+ # is used for check state flap detection, using a similar
423
+ # algorithm to Nagios:
424
+ # http://nagios.sourceforge.net/docs/3_0/flapping.html
425
+ #
426
+ # @param client [Hash]
427
+ # @param check [Hash]
428
+ # @yield [history, total_state_change] callback/block to call
429
+ # with the check history and calculated total state change
430
+ # value.
431
+ # @yieldparam history [Array] containing the last 21 check
432
+ # result exit status codes.
433
+ # @yieldparam total_state_change [Float] percentage for the
434
+ # check history (exit status codes).
435
+ # @yieldparam last_ok [Integer] execution timestamp of the last
436
+ # OK check result.
437
+ def check_history(client, check)
438
+ history_key = "history:#{client[:name]}:#{check[:name]}"
439
+ @redis.lrange(history_key, -21, -1) do |history|
440
+ total_state_change = 0
441
+ unless history.length < 21
442
+ state_changes = 0
443
+ change_weight = 0.8
444
+ previous_status = history.first
445
+ history.each do |status|
446
+ unless status == previous_status
447
+ state_changes += change_weight
448
+ end
449
+ change_weight += 0.02
450
+ previous_status = status
451
+ end
452
+ total_state_change = (state_changes.fdiv(20) * 100).to_i
453
+ end
454
+ @redis.get("#{history_key}:last_ok") do |last_ok|
455
+ last_ok = last_ok.to_i unless last_ok.nil?
456
+ yield(history, total_state_change, last_ok)
457
+ end
458
+ end
459
+ end
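To make the weighting concrete, here is a standalone sketch of the calculation using a fabricated 21-entry history (statuses are strings, as returned by Redis); changes are weighted from 0.8 up to 1.2, newest last, and the weighted sum is normalised against the 20 possible transitions:

history = %w[0 0 1 0 0 1 1 0 0 0 1 0 0 0 1 1 0 0 1 0 0]  # 21 statuses
state_changes = 0
change_weight = 0.8
previous_status = history.first
history.each do |status|
  state_changes += change_weight unless status == previous_status
  change_weight += 0.02
  previous_status = status
end
total_state_change = (state_changes.fdiv(20) * 100).to_i
total_state_change # => 50 for this sample history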
460
+
461
+ # Determine if a check state is flapping, rapidly changing
462
+ # between an OK and non-OK state. Flap detection is only done
463
+ # for checks that have defined low and high flap detection
464
+ # thresholds, `:low_flap_threshold` and `:high_flap_threshold`.
465
+ # The `check_history()` method provides the check history and
466
+ # more importantly the total state change percentage value that
467
+ # is compared with the configured thresholds defined in the
468
+ # check data. If a check hasn't been flapping, the
469
+ # `:total_state_change` must be equal to or higher than the
470
+ # `:high_flap_threshold` to be changed to flapping. If a check
471
+ # has been flapping, the `:total_state_change` must be equal to
472
+ # or lower than the `:low_flap_threshold` to no longer be
473
+ # flapping. This method uses the same algorithm as Nagios:
474
+ # http://nagios.sourceforge.net/docs/3_0/flapping.html
475
+ #
476
+ # @param stored_event [Hash]
477
+ # @param check [Hash]
478
+ # @return [TrueClass, FalseClass]
479
+ def check_flapping?(stored_event, check)
480
+ if check.has_key?(:low_flap_threshold) && check.has_key?(:high_flap_threshold)
481
+ if check[:low_flap_threshold].is_a?(Integer) && check[:high_flap_threshold].is_a?(Integer)
482
+ was_flapping = stored_event && stored_event[:action] == EVENT_FLAPPING_ACTION
483
+ if was_flapping
484
+ check[:total_state_change] > check[:low_flap_threshold]
485
+ else
486
+ check[:total_state_change] >= check[:high_flap_threshold]
487
+ end
488
+ else
489
+ details = {:check => check}
490
+ details[:client] = stored_event[:client] if stored_event
491
+ @logger.error("invalid check flap thresholds", details)
492
+ false
493
+ end
494
+ else
495
+ false
496
+ end
497
+ end
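A hypothetical check definition that enables flap detection; with a total state change of, say, 50, an event that was not already flapping would enter the flapping state (50 >= the high threshold) and would only leave it once the value drops to the low threshold (20) or below:

check = {
  :name => "cpu_usage",
  :command => "check-cpu.rb",
  :interval => 60,
  :low_flap_threshold => 20,
  :high_flap_threshold => 50
}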
498
+
499
+ # Determine if an event has been silenced. This method compiles
500
+ # an array of possible silenced registry entry keys for the
501
+ # event. An attempt is made to fetch one or more of the silenced
502
+ # registry entries to determine if the event has been silenced.
503
+ # The event data is updated to indicate if the event has been
504
+ # silenced. If the event is silenced and the event action is
505
+ # `:resolve`, silenced registry entries with
506
+ # `:expire_on_resolve` set to true will be deleted. Silencing is
507
+ # disabled for events with a check status of `0` (OK), unless
508
+ # the event action is `:resolve` or `:flapping`.
509
+ #
510
+ # @param event [Hash]
511
+ # @yield callback [event] callback/block called after the event
512
+ # data has been updated to indicate if it has been silenced.
513
+ def event_silenced?(event)
514
+ event[:silenced] = false
515
+ event[:silenced_by] = []
516
+ if event[:check][:status] != 0 || event[:action] != :create
517
+ check_name = event[:check][:name]
518
+ silenced_keys = event[:client][:subscriptions].map { |subscription|
519
+ ["silence:#{subscription}:*", "silence:#{subscription}:#{check_name}"]
520
+ }.flatten
521
+ silenced_keys << "silence:*:#{check_name}"
522
+ @redis.mget(*silenced_keys) do |silenced|
523
+ silenced.compact!
524
+ silenced.each do |silenced_json|
525
+ silenced_info = Sensu::JSON.load(silenced_json)
526
+ if silenced_info[:expire_on_resolve] && event[:action] == :resolve
527
+ silenced_key = "silence:#{silenced_info[:id]}"
528
+ @redis.srem("silenced", silenced_key)
529
+ @redis.del(silenced_key)
530
+ elsif silenced_info[:begin].nil? || silenced_info[:begin] <= Time.now.to_i
531
+ event[:silenced_by] << silenced_info[:id]
532
+ end
533
+ end
534
+ event[:silenced] = !event[:silenced_by].empty?
535
+ yield(event)
536
+ end
537
+ else
538
+ yield(event)
539
+ end
540
+ end
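The key construction can be illustrated with a hypothetical client "app01" subscribed to "webserver" (plus its automatic per-client subscription) reporting a failing "check_http" check:

subscriptions = ["webserver", "client:app01"]  # hypothetical subscriptions
check_name = "check_http"
silenced_keys = subscriptions.flat_map do |subscription|
  ["silence:#{subscription}:*", "silence:#{subscription}:#{check_name}"]
end
silenced_keys << "silence:*:#{check_name}"
# silenced_keys now contains:
#   "silence:webserver:*", "silence:webserver:check_http",
#   "silence:client:app01:*", "silence:client:app01:check_http",
#   "silence:*:check_http"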
541
+
542
+ # Update the event registry, stored in Redis. This method
543
+ # determines if the event data warrants the creation or update of
544
+ # event data in the registry. If a check `:status` is not
545
+ # `0`, or it has been flapping, an event is created/updated in
546
+ # the registry. If the event `:action` is `:resolve`, the event
547
+ # is removed (resolved) from the registry. If the previous
548
+ # conditions are not met and check `:type` is `metric`, the
549
+ # registry is not updated, but further event processing is
550
+ # required (`yield(true)`). JSON serialization is used when
551
+ # storing data in the registry.
552
+ #
553
+ # @param event [Hash]
554
+ # @yield callback [event] callback/block called after the event
555
+ # registry has been updated.
556
+ # @yieldparam process [TrueClass, FalseClass] indicating if the
557
+ # event requires further processing.
558
+ def update_event_registry(event)
559
+ client_name = event[:client][:name]
560
+ if event[:check][:status] != 0 || (event[:action] == :flapping && event[:check][:force_resolve] != true)
561
+ @redis.hset("events:#{client_name}", event[:check][:name], Sensu::JSON.dump(event)) do
562
+ yield(true)
563
+ end
564
+ elsif event[:action] == :resolve &&
565
+ (event[:check][:auto_resolve] != false || event[:check][:force_resolve]) ||
566
+ (event[:action] == :flapping && event[:check][:force_resolve])
567
+ @redis.hdel("events:#{client_name}", event[:check][:name]) do
568
+ yield(true)
569
+ end
570
+ elsif event[:check][:type] == METRIC_CHECK_TYPE
571
+ yield(true)
572
+ else
573
+ yield(false)
574
+ end
575
+ end
576
+
577
+ # Create an event, using the provided client and check result
578
+ # data. Existing event data for the client/check pair is fetched
579
+ # from the event registry to be used in the composition of the
580
+ # new event. The silenced registry is used to determine if the
581
+ # event has been silenced.
582
+ #
583
+ # @param client [Hash]
584
+ # @param check [Hash]
585
+ # @yield callback [event] callback/block called with the
586
+ # resulting event.
587
+ # @yieldparam event [Hash]
588
+ def create_event(client, check)
589
+ check_history(client, check) do |history, total_state_change, last_ok|
590
+ check[:history] = history
591
+ check[:total_state_change] = total_state_change
592
+ @redis.hget("events:#{client[:name]}", check[:name]) do |event_json|
593
+ stored_event = event_json ? Sensu::JSON.load(event_json) : nil
594
+ flapping = check_flapping?(stored_event, check)
595
+ event = {
596
+ :id => random_uuid,
597
+ :client => client,
598
+ :check => check,
599
+ :occurrences => 1,
600
+ :occurrences_watermark => 1,
601
+ :last_ok => last_ok,
602
+ :action => (flapping ? :flapping : :create),
603
+ :timestamp => Time.now.to_i
604
+ }
605
+ if stored_event
606
+ event[:id] = stored_event[:id]
607
+ event[:last_state_change] = stored_event[:last_state_change]
608
+ event[:occurrences] = stored_event[:occurrences]
609
+ event[:occurrences_watermark] = stored_event[:occurrences_watermark] || event[:occurrences]
610
+ end
611
+ if check[:status] != 0 || flapping
612
+ if history[-1] == history[-2]
613
+ event[:occurrences] += 1
614
+ if event[:occurrences] > event[:occurrences_watermark]
615
+ event[:occurrences_watermark] = event[:occurrences]
616
+ end
617
+ else
618
+ event[:occurrences] = 1
619
+ event[:last_state_change] = event[:timestamp]
620
+ end
621
+ elsif stored_event
622
+ event[:last_state_change] = event[:timestamp]
623
+ event[:action] = :resolve
624
+ end
625
+ event_silenced?(event) do |event|
626
+ yield(event)
627
+ end
628
+ end
629
+ end
630
+ end
631
+
632
+ # Create a blank client (data). Only the client name is known;
633
+ # the other client attributes must be updated via the API (POST
634
+ # /clients/:client). Dynamically created clients and those
635
+ # updated via the API will have client keepalives disabled by
636
+ # default, `:keepalives` is set to `false`.
637
+ #
638
+ # @param name [String] to use for the client.
639
+ # @return [Hash] client.
640
+ def create_client(name)
641
+ {
642
+ :name => name,
643
+ :address => "unknown",
644
+ :subscriptions => ["client:#{name}"],
645
+ :keepalives => false,
646
+ :version => VERSION,
647
+ :timestamp => Time.now.to_i
648
+ }
649
+ end
650
+
651
+ # Retrieve a client (data) from Redis if it exists. If a client
652
+ # does not already exist, create one (a blank) using the
653
+ # `client_key` as the client name. Dynamically created client
654
+ # data can be updated using the API (POST /clients/:client). If
655
+ # a client does exist and it has a client signature, the check
656
+ # result must have a matching signature or it is discarded. If
657
+ # the client does not exist, but a client signature exists, the
658
+ # check result must have a matching signature or it is
659
+ # discarded.
660
+ #
661
+ # @param result [Hash] data.
662
+ # @yield [client] callback/block to be called with client data,
663
+ # either retrieved from Redis, or dynamically created.
664
+ # @yieldparam client [Hash]
665
+ def retrieve_client(result)
666
+ client_key = result[:check][:source] || result[:client]
667
+ @redis.get("client:#{client_key}") do |client_json|
668
+ unless client_json.nil?
669
+ client = Sensu::JSON.load(client_json)
670
+ if client[:signature]
671
+ if client[:signature] == result[:signature]
672
+ yield(client)
673
+ else
674
+ @logger.warn("invalid check result signature", {
675
+ :result => result,
676
+ :client => client
677
+ })
678
+ @logger.warn("not retrieving client from the registry", :result => result)
679
+ yield(nil)
680
+ end
681
+ else
682
+ yield(client)
683
+ end
684
+ else
685
+ @redis.get("client:#{client_key}:signature") do |signature|
686
+ if signature.nil? || signature.empty? || result[:signature] == signature
687
+ client = create_client(client_key)
688
+ client[:type] = "proxy" if result[:check][:source]
689
+ update_client_registry(client) do
690
+ yield(client)
691
+ end
692
+ else
693
+ @logger.warn("invalid check result signature", {
694
+ :result => result,
695
+ :signature => signature
696
+ })
697
+ yield(nil)
698
+ end
699
+ end
700
+ end
701
+ end
702
+ end
703
+
704
+ # Determine if a keepalive event exists for a client.
705
+ #
706
+ # @param client_name [String] name of client to look up in event registry.
707
+ # @return [TrueClass, FalseClass]
708
+ def keepalive_event_exists?(client_name)
709
+ @redis.hexists("events:#{client_name}", "keepalive") do |event_exists|
710
+ yield(event_exists)
711
+ end
712
+ end
713
+
714
+ # Process a check result, storing its data, inspecting its
715
+ # contents, and taking the appropriate actions (e.g. updating the
716
+ # event registry). The `@in_progress[:check_results]` counter is
717
+ # incremented by `1` prior to check result processing and then
718
+ # decremented by `1` after updating the event registry. A check
719
+ # result must have a valid client name, associated with a client
720
+ # in the registry or one will be created. If a local check
721
+ # definition exists for the check name, and the check result is
722
+ # not from a standalone check execution, it's merged with the
723
+ # check result for more context.
724
+ #
725
+ # @param result [Hash] data.
726
+ def process_check_result(result)
727
+ @in_progress[:check_results] += 1
728
+ @logger.debug("processing result", :result => result)
729
+ retrieve_client(result) do |client|
730
+ unless client.nil?
731
+ check = case
732
+ when @settings.check_exists?(result[:check][:name]) && !result[:check][:standalone]
733
+ @settings[:checks][result[:check][:name]].merge(result[:check])
734
+ else
735
+ result[:check]
736
+ end
737
+ check[:type] ||= STANDARD_CHECK_TYPE
738
+ check[:origin] = result[:client] if check[:source]
739
+ if @settings.check_exists?(check[:name]) && client[:type] == "proxy"
740
+ check[:command] = @settings[:checks][check[:name].to_sym][:command]
741
+ end
742
+ aggregate_check_result(client, check) if check[:aggregates] || check[:aggregate]
743
+ store_check_result(client, check) do
744
+ create_event(client, check) do |event|
745
+ event_bridges(event)
746
+ update_event_registry(event) do |process|
747
+ process_event(event) if process
748
+ @in_progress[:check_results] -= 1
749
+ end
750
+ end
751
+ end
752
+ else
753
+ @logger.warn("halting result processing", :result => result)
754
+ @in_progress[:check_results] -= 1
755
+ end
756
+ end
757
+ end
758
+
759
+ # Set up the check result consumer. The consumer receives JSON
760
+ # serialized check results from the transport, parses them, and
761
+ # calls `process_check_result()` with the result data to be
762
+ # processed. Transport message acknowledgements are used to
763
+ # ensure that results make it to processing. The transport
764
+ # message acknowledgements are currently done in the next tick
765
+ # of the EventMachine reactor (event loop), as a flow control
766
+ # mechanism. Result JSON parsing errors are logged.
767
+ def setup_results
768
+ results_pipe = "results"
769
+ if @settings[:sensu][:server] && @settings[:sensu][:server][:results_pipe]
770
+ results_pipe = @settings[:sensu][:server][:results_pipe]
771
+ end
772
+ @logger.debug("subscribing to results", :pipe => results_pipe)
773
+ @transport.subscribe(:direct, results_pipe, "results", :ack => true) do |message_info, message|
774
+ if message_size_ok?(message)
775
+ begin
776
+ result = Sensu::JSON.load(message)
777
+ @logger.debug("received result", :result => result)
778
+ process_check_result(result)
779
+ rescue Sensu::JSON::ParseError => error
780
+ @logger.error("failed to parse result payload", {
781
+ :message => message,
782
+ :error => error.to_s
783
+ })
784
+ end
785
+ end
786
+ EM::next_tick do
787
+ @transport.ack(message_info)
788
+ end
789
+ end
790
+ end
791
+
792
+ # Determine the Sensu Transport publish options for a
793
+ # subscription. If a subscription begins with a Transport pipe
794
+ # type, either "direct:" or "roundrobin:", the subscription uses
795
+ # a direct Transport pipe. If a subscription does not specify a
796
+ # Transport pipe type, a fanout Transport pipe is used.
797
+ #
798
+ # @param subscription [String]
799
+ # @param message [String]
800
+ # @return [Array] containing the Transport publish options:
801
+ # the Transport pipe type, pipe, and the message to be
802
+ # published.
803
+ def transport_publish_options(subscription, message)
804
+ _, raw_type = subscription.split(":", 2).reverse
805
+ case raw_type
806
+ when "direct", "roundrobin"
807
+ [:direct, subscription, message]
808
+ else
809
+ [:fanout, subscription, message]
810
+ end
811
+ end
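A rough standalone illustration of the prefix handling (the helper name `pipe_options` and the subscription names exist only for this sketch); it mirrors the split/reverse trick above, where an unprefixed subscription yields a `nil` type and falls through to fanout:

def pipe_options(subscription, message)
  _, raw_type = subscription.split(":", 2).reverse
  case raw_type
  when "direct", "roundrobin"
    [:direct, subscription, message]
  else
    [:fanout, subscription, message]
  end
end

pipe_options("webserver", "{}")          # => [:fanout, "webserver", "{}"]
pipe_options("roundrobin:workers", "{}") # => [:direct, "roundrobin:workers", "{}"]
pipe_options("direct:app01", "{}")       # => [:direct, "direct:app01", "{}"]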
812
+
813
+ # Publish a check request to the Transport. A check request is
814
+ # composed of a check definition (minus `:subscribers` and
815
+ # `:interval`) and an `:issued` timestamp. The check request is
816
+ # published to a Transport pipe, for each of the check
817
+ # `:subscribers` in its definition, e.g. "webserver". JSON
818
+ # serialization is used when publishing the check request
819
+ # payload to the Transport pipes. Transport errors are logged.
820
+ #
821
+ # @param check [Hash] definition.
822
+ def publish_check_request(check)
823
+ payload = check.reject do |key, value|
824
+ [:subscribers, :interval].include?(key)
825
+ end
826
+ payload[:issued] = Time.now.to_i
827
+ @logger.info("publishing check request", {
828
+ :payload => payload,
829
+ :subscribers => check[:subscribers]
830
+ })
831
+ check[:subscribers].each do |subscription|
832
+ options = transport_publish_options(subscription, Sensu::JSON.dump(payload))
833
+ @transport.publish(*options) do |info|
834
+ if info[:error]
835
+ @logger.error("failed to publish check request", {
836
+ :subscription => subscription,
837
+ :payload => payload,
838
+ :error => info[:error].to_s
839
+ })
840
+ end
841
+ end
842
+ end
843
+ end
844
+
845
+ # Determine and return clients from the registry that match a
846
+ # set of attributes.
847
+ #
848
+ # @param clients [Array] of client names.
849
+ # @param attributes [Hash]
850
+ # @yield [Array] callback/block called after determining the
851
+ # matching clients, returning them as a block parameter.
852
+ def determine_matching_clients(clients, attributes)
853
+ client_keys = clients.map { |client_name| "client:#{client_name}" }
854
+ @redis.mget(*client_keys) do |client_json_objects|
855
+ matching_clients = []
856
+ client_json_objects.each do |client_json|
857
+ unless client_json.nil?
858
+ client = Sensu::JSON.load(client_json)
859
+ if attributes_match?(client, attributes)
860
+ matching_clients << client
861
+ end
862
+ end
863
+ end
864
+ yield(matching_clients)
865
+ end
866
+ end
867
+
868
+ # Publish a proxy check request for a client. This method
869
+ # substitutes client tokens in the check definition prior to
870
+ # publish the check request. If there are unmatched client
871
+ # tokens, a warning is logged, and a check request is not
872
+ # published.
873
+ #
874
+ # @param client [Hash] definition.
875
+ # @param check [Hash] definition.
876
+ def publish_proxy_check_request(client, check)
877
+ @logger.debug("creating a proxy check request", {
878
+ :client => client,
879
+ :check => check
880
+ })
881
+ proxy_check, unmatched_tokens = object_substitute_tokens(deep_dup(check.dup), client)
882
+ if unmatched_tokens.empty?
883
+ proxy_check[:source] ||= client[:name]
884
+ publish_check_request(proxy_check)
885
+ else
886
+ @logger.warn("failed to publish a proxy check request", {
887
+ :reason => "unmatched client tokens",
888
+ :unmatched_tokens => unmatched_tokens,
889
+ :client => client,
890
+ :check => check
891
+ })
892
+ end
893
+ end
894
+
895
+ # Publish proxy check requests for one or more clients. This
896
+ # method can optionally splay proxy check requests, evenly, over
897
+ # a period of time, determined by the check interval and a
898
+ # configurable splay coverage percentage. For example, splay
899
+ # proxy check requests over 60s * 90%, 54s, leaving 6s for the
900
+ # last proxy check execution before the next round of proxy
901
+ # check requests for the same check. The
902
+ # `publish_proxy_check_request()` method is used to publish the
903
+ # proxy check requests.
904
+ #
905
+ # @param clients [Array] of client definitions.
906
+ # @param check [Hash] definition.
907
+ def publish_proxy_check_requests(clients, check)
908
+ client_count = clients.length
909
+ splay = 0
910
+ if check[:proxy_requests][:splay]
911
+ interval = check[:interval]
912
+ if check[:cron]
913
+ interval = determine_check_cron_time(check)
914
+ end
915
+ unless interval.nil?
916
+ splay_coverage = check[:proxy_requests].fetch(:splay_coverage, 90)
917
+ splay = interval * (splay_coverage / 100.0) / client_count
918
+ end
919
+ end
920
+ splay_timer = 0
921
+ clients.each do |client|
922
+ unless splay == 0
923
+ EM::Timer.new(splay_timer) do
924
+ publish_proxy_check_request(client, check)
925
+ end
926
+ splay_timer += splay
927
+ else
928
+ publish_proxy_check_request(client, check)
929
+ end
930
+ end
931
+ end
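Worked example of the splay arithmetic with hypothetical numbers: a 60 second interval, the default 90% splay coverage, and 10 matching clients spread the proxy check requests 5.4 seconds apart (54 seconds in total, as described above):

interval       = 60   # seconds
splay_coverage = 90   # percent
client_count   = 10
splay = interval * (splay_coverage / 100.0) / client_count
splay # => 5.4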
932
+
933
+ # Create and publish one or more proxy check requests. This
934
+ # method iterates through the Sensu client registry for clients
935
+ # that match the provided proxy request client attributes. A proxy
936
+ # check request is created for each client in the registry that
937
+ # matches the proxy request client attributes. Proxy check
938
+ # requests have their client tokens substituted with the associated
939
+ # client attribute values. The `determine_matching_clients()`
940
+ # method is used to fetch and inspect each slice of clients from
941
+ # the registry, returning those that match the configured proxy
942
+ # request client attributes. A relatively small clients slice
943
+ # size (20) is used to reduce the number of clients inspected
944
+ # within a single tick of the EM reactor. The
945
+ # `publish_proxy_check_requests()` method is used to iterate
946
+ # through the matching Sensu clients, creating their own unique
947
+ # proxy check request, substituting client tokens, and then
948
+ # publishing them to the targeted subscriptions.
949
+ #
950
+ # @param check [Hash] definition.
951
+ def create_proxy_check_requests(check)
952
+ client_attributes = check[:proxy_requests][:client_attributes]
953
+ unless client_attributes.empty?
954
+ @redis.smembers("clients") do |clients|
955
+ client_count = clients.length
956
+ proxy_check_requests = Proc.new do |matching_clients, slice_start, slice_size|
957
+ unless slice_start > client_count - 1
958
+ clients_slice = clients.slice(slice_start..slice_size)
959
+ determine_matching_clients(clients_slice, client_attributes) do |additional_clients|
960
+ matching_clients += additional_clients
961
+ proxy_check_requests.call(matching_clients, slice_start + 20, slice_size + 20)
962
+ end
963
+ else
964
+ publish_proxy_check_requests(matching_clients, check)
965
+ end
966
+ end
967
+ proxy_check_requests.call([], 0, 19)
968
+ end
969
+ end
970
+ end
971
+
972
+ # Create a check request proc, used to publish check requests
973
+ # for a check to the Sensu transport. Check requests are not
974
+ # published if subdued. This method determines if a check uses
975
+ # proxy check requests and calls the appropriate check request
976
+ # publish method.
977
+ #
978
+ # @param check [Hash] definition.
979
+ def create_check_request_proc(check)
980
+ Proc.new do
981
+ unless check_subdued?(check)
982
+ if check[:proxy_requests]
983
+ create_proxy_check_requests(check)
984
+ else
985
+ publish_check_request(check)
986
+ end
987
+ else
988
+ @logger.info("check request was subdued", :check => check)
989
+ end
990
+ end
991
+ end
992
+
993
+ # Schedule a check request, using the check cron. This method
994
+ # determines the time until the next cron time (in seconds) and
995
+ # creates an EventMachine timer for the request. This method will
996
+ # be called after every check cron request for subsequent
997
+ # requests. The timer is stored in the timer hash under
998
+ # `:tasks`, so it can be cancelled etc. The check cron request
999
+ # timer object is removed from the timer hash after the request
1000
+ # is published, to stop the timer hash from growing infinitely.
1001
+ #
1002
+ # @param check [Hash] definition.
1003
+ def schedule_check_cron_request(check)
1004
+ cron_time = determine_check_cron_time(check)
1005
+ @timers[:tasks][:check_request_publisher] << EM::Timer.new(cron_time) do |timer|
1006
+ create_check_request_proc(check).call
1007
+ @timers[:tasks][:check_request_publisher].delete(timer)
1008
+ schedule_check_cron_request(check)
1009
+ end
1010
+ end
1011
+
1012
+ # Calculate a check request splay, taking into account the
1013
+ # current time and the request interval to ensure it's
1014
+ # consistent between process restarts.
1015
+ #
1016
+ # @param check [Hash] definition.
1017
+ def calculate_check_request_splay(check)
1018
+ splay_hash = Digest::MD5.digest(check[:name]).unpack('Q<').first
1019
+ current_time = (Time.now.to_f * 1000).to_i
1020
+ (splay_hash - current_time) % (check[:interval] * 1000) / 1000.0
1021
+ end
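A standalone sketch of the same calculation (the check name and interval are hypothetical); hashing the check name gives a stable 64-bit integer, so the offset within the interval stays consistent across process restarts and is evenly distributed across differently named checks:

require "digest"

check = { :name => "cpu_usage", :interval => 60 }  # hypothetical check
splay_hash = Digest::MD5.digest(check[:name]).unpack("Q<").first
current_time = (Time.now.to_f * 1000).to_i
splay = (splay_hash - current_time) % (check[:interval] * 1000) / 1000.0
splay # => a value in 0.0...60.0, deterministic for the name "cpu_usage"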
1022
+
1023
+ # Schedule check requests, using the check interval. This method
1024
+ # uses an initial calculated request splay EventMachine timer
1025
+ # and an EventMachine periodic timer for subsequent check
1026
+ # requests. The timers are stored in the timers hash under
1027
+ # `:tasks`, so they can be cancelled etc.
1028
+ #
1029
+ # @param check [Hash] definition.
1030
+ def schedule_check_interval_requests(check)
1031
+ request_splay = testing? ? 0 : calculate_check_request_splay(check)
1032
+ interval = testing? ? 0.5 : check[:interval]
1033
+ @timers[:tasks][:check_request_publisher] << EM::Timer.new(request_splay) do
1034
+ create_check_request = create_check_request_proc(check)
1035
+ create_check_request.call
1036
+ @timers[:tasks][:check_request_publisher] << EM::PeriodicTimer.new(interval, &create_check_request)
1037
+ end
1038
+ end
1039
+
1040
+ # Schedule check requests. This method iterates through defined
1041
+ # checks and uses the appropriate method of check request
1042
+ # scheduling, either with the cron syntax or a numeric interval.
1043
+ #
1044
+ # @param checks [Array] of definitions.
1045
+ def schedule_checks(checks)
1046
+ checks.each do |check|
1047
+ if check[:cron]
1048
+ schedule_check_cron_request(check)
1049
+ else
1050
+ schedule_check_interval_requests(check)
1051
+ end
1052
+ end
1053
+ end
1054
+
1055
+ # Set up the check request publisher. This method creates an
1056
+ # array of check definitions that are not standalone checks
1057
+ # and do not have `:publish` set to `false`. The array is
1058
+ # provided to the `schedule_checks()` method.
1059
+ def setup_check_request_publisher
1060
+ @logger.debug("scheduling check requests")
1061
+ checks = @settings.checks.reject do |check|
1062
+ check[:standalone] || check[:publish] == false
1063
+ end
1064
+ schedule_checks(checks)
1065
+ end
1066
+
1067
+ # Publish a check result to the Transport for processing. A
1068
+ # check result is composed of a client name and a check
1069
+ # definition, containing check `:output` and `:status`. A client
1070
+ # signature is added to the check result payload if one is
1071
+ # registered for the client. JSON serialization is used when
1072
+ # publishing the check result payload to the Transport pipe.
1073
+ # Transport errors are logged.
1074
+ #
1075
+ # @param client_name [String]
1076
+ # @param check [Hash]
1077
+ def publish_check_result(client_name, check)
1078
+ payload = {
1079
+ :client => client_name,
1080
+ :check => check
1081
+ }
1082
+ @redis.get("client:#{client_name}:signature") do |signature|
1083
+ payload[:signature] = signature if signature
1084
+ @logger.debug("publishing check result", :payload => payload)
1085
+ @transport.publish(:direct, "results", Sensu::JSON.dump(payload)) do |info|
1086
+ if info[:error]
1087
+ @logger.error("failed to publish check result", {
1088
+ :payload => payload,
1089
+ :error => info[:error].to_s
1090
+ })
1091
+ end
1092
+ end
1093
+ end
1094
+ end
1095
+
1096
+ # Create a keepalive check definition for a client. Client
1097
+ # definitions may contain `:keepalive` configuration, containing
1098
+ # specific thresholds and handler information. The keepalive
1099
+ # check definition creation begins with default thresholds, and
1100
+ # sets the `:handler` to `keepalive`, if the handler has a local
1101
+ # definition. If the client provides its own `:keepalive`
1102
+ # configuration, it's deep merged with the defaults. The check
1103
+ # `:name`, `:issued`, and `:executed` values are always
1104
+ # overridden to guard against an invalid definition.
1105
+ #
1106
+ # @return [Array] check definition, unmatched client tokens
1107
+ def create_keepalive_check(client)
1108
+ check = {
1109
+ :thresholds => {
1110
+ :warning => 120,
1111
+ :critical => 180
1112
+ }
1113
+ }
1114
+ if @settings.handler_exists?(:keepalive)
1115
+ check[:handler] = "keepalive"
1116
+ end
1117
+ if @settings[:sensu][:keepalives]
1118
+ check = deep_merge(check, @settings[:sensu][:keepalives])
1119
+ end
1120
+ if client.has_key?(:keepalive)
1121
+ check = deep_merge(check, client[:keepalive])
1122
+ end
1123
+ timestamp = Time.now.to_i
1124
+ check.merge!(:name => "keepalive", :issued => timestamp, :executed => timestamp)
1125
+ object_substitute_tokens(check, client)
1126
+ end
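A hypothetical client definition overriding those defaults; deep merged over the base check, it would warn after 60 seconds and go critical after 300 seconds without a keepalive, handled by a "pagerduty" handler (assuming one is defined):

client = {
  :name => "app01",
  :address => "10.0.2.30",
  :subscriptions => ["webserver"],
  :keepalive => {
    :handler => "pagerduty",
    :thresholds => {
      :warning => 60,
      :critical => 300
    }
  }
}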
1127
+
1128
+ # Create client keepalive check results. This method will
1129
+ # retrieve clients from the registry, creating a keepalive
1130
+ # check definition for each client, using the
1131
+ # `create_keepalive_check()` method, containing client specific
1132
+ # keepalive thresholds. If the time since the latest keepalive
1133
+ # is equal to or greater than a threshold, the check `:output`
1134
+ # is set to a descriptive message, and `:status` is set to the
1135
+ # appropriate non-zero value. If a client has been sending
1136
+ # keepalives, `:output` and `:status` are set to indicate an OK
1137
+ # state. The `publish_check_result()` method is used to publish
1138
+ # the client keepalive check results.
1139
+ #
1140
+ # @param clients [Array] of client names.
1141
+ # @yield [] callback/block called after the client keepalive
1142
+ # check results have been created.
1143
+ def create_client_keepalive_check_results(clients)
1144
+ client_keys = clients.map { |client_name| "client:#{client_name}" }
1145
+ @redis.mget(*client_keys) do |client_json_objects|
1146
+ client_json_objects.each do |client_json|
1147
+ unless client_json.nil?
1148
+ client = Sensu::JSON.load(client_json)
1149
+ next if client[:keepalives] == false
1150
+ check, unmatched_tokens = create_keepalive_check(client)
1151
+ time_since_last_keepalive = Time.now.to_i - client[:timestamp]
1152
+ check[:output] = "No keepalive sent from client for "
1153
+ check[:output] << "#{time_since_last_keepalive} seconds"
1154
+ case
1155
+ when time_since_last_keepalive >= check[:thresholds][:critical]
1156
+ check[:output] << " (>=#{check[:thresholds][:critical]})"
1157
+ check[:status] = 2
1158
+ when time_since_last_keepalive >= check[:thresholds][:warning]
1159
+ check[:output] << " (>=#{check[:thresholds][:warning]})"
1160
+ check[:status] = 1
1161
+ else
1162
+ check[:output] = "Keepalive sent from client "
1163
+ check[:output] << "#{time_since_last_keepalive} seconds ago"
1164
+ check[:status] = 0
1165
+ end
1166
+ unless unmatched_tokens.empty?
1167
+ check[:output] << " - Unmatched client token(s): " + unmatched_tokens.join(", ")
1168
+ check[:status] = 1 if check[:status] == 0
1169
+ end
1170
+ publish_check_result(client[:name], check)
1171
+ end
1172
+ end
1173
+ yield
1174
+ end
1175
+ end
1176
+
1177
+ # Determine stale clients, those that have not sent a keepalive
1178
+ # in a specified amount of time. This method iterates through
1179
+ # the client registry, creating a keepalive check result for
1180
+ # each client. The `create_client_keepalive_check_results()`
1181
+ # method is used to inspect and create keepalive check results
1182
+ # for each slice of clients from the registry. A relatively
1183
+ # small clients slice size (20) is used to reduce the number of
1184
+ # clients inspected within a single tick of the EM reactor.
1185
+ def determine_stale_clients
1186
+ @logger.info("determining stale clients")
1187
+ @redis.smembers("clients") do |clients|
1188
+ client_count = clients.length
1189
+ keepalive_check_results = Proc.new do |slice_start, slice_size|
1190
+ unless slice_start > client_count - 1
1191
+ clients_slice = clients.slice(slice_start..slice_size)
1192
+ create_client_keepalive_check_results(clients_slice) do
1193
+ keepalive_check_results.call(slice_start + 20, slice_size + 20)
1194
+ end
1195
+ end
1196
+ end
1197
+ keepalive_check_results.call(0, 19)
1198
+ end
1199
+ end
1200
+
1201
+ # Set up the client monitor, a periodic timer to run
1202
+ # `determine_stale_clients()` every 30 seconds. The timer is
1203
+ # stored in the timers hash under `:tasks`.
1204
+ def setup_client_monitor
1205
+ @logger.debug("monitoring client keepalives")
1206
+ @timers[:tasks][:client_monitor] << EM::PeriodicTimer.new(30) do
1207
+ determine_stale_clients
1208
+ end
1209
+ end
1210
+
1211
+ # Create check TTL results. This method will retrieve check
1212
+ # results from the registry and determine the time since their
1213
+ # last check execution (in seconds). If the time since last
1214
+ # execution is equal to or greater than the defined check TTL, a
1215
+ # warning check result is published with the appropriate check
1216
+ # output.
1217
+ #
1218
+ # @param ttl_keys [Array] of TTL keys.
1219
+ # @param interval [Integer] to use for the check TTL result
1220
+ # interval.
1221
+ # @yield [] callback/block called after the check TTL results
1222
+ # have been created.
1223
+ def create_check_ttl_results(ttl_keys, interval=30)
1224
+ result_keys = ttl_keys.map { |ttl_key| "result:#{ttl_key}" }
1225
+ @redis.mget(*result_keys) do |result_json_objects|
1226
+ result_json_objects.each_with_index do |result_json, index|
1227
+ unless result_json.nil?
1228
+ check = Sensu::JSON.load(result_json)
1229
+ next unless check[:ttl] && check[:executed] && !check[:force_resolve]
1230
+ time_since_last_execution = Time.now.to_i - check[:executed]
1231
+ if time_since_last_execution >= check[:ttl]
1232
+ client_name = ttl_keys[index].split(":").first
1233
+ keepalive_event_exists?(client_name) do |event_exists|
1234
+ unless event_exists
1235
+ check[:output] = "Last check execution was "
1236
+ check[:output] << "#{time_since_last_execution} seconds ago"
1237
+ check[:status] = check[:ttl_status] || 1
1238
+ check[:interval] = interval
1239
+ publish_check_result(client_name, check)
1240
+ end
1241
+ end
1242
+ end
1243
+ else
1244
+ @redis.srem("ttl", ttl_keys[index])
1245
+ end
1246
+ end
1247
+ yield
1248
+ end
1249
+ end
1250
+
1251
+ # Determine stale check results, those that have not executed in
1252
+ # a specified amount of time (check TTL). This method iterates
1253
+ # through stored check results that have a defined TTL value (in
1254
+ # seconds). The `create_check_ttl_results()` method is used to
1255
+ # inspect each check result, calculating their time since last
1256
+ # check execution (in seconds). If the time since last execution
1257
+ # is equal to or greater than the check TTL, a warning check
1258
+ # result is published with the appropriate check output. A
1259
+ # relatively small check results slice size (20) is used to
1260
+ # reduce the number of check results inspected within a single
1261
+ # tick of the EM reactor.
1262
+ #
1263
+ # @param interval [Integer] to use for the check TTL result
1264
+ # interval.
1265
+ def determine_stale_check_results(interval=30)
1266
+ @logger.info("determining stale check results (ttl)")
1267
+ @redis.smembers("ttl") do |ttl_keys|
1268
+ ttl_key_count = ttl_keys.length
1269
+ ttl_check_results = Proc.new do |slice_start, slice_size|
1270
+ unless slice_start > ttl_key_count - 1
1271
+ ttl_keys_slice = ttl_keys.slice(slice_start..slice_size)
1272
+ create_check_ttl_results(ttl_keys_slice, interval) do
1273
+ ttl_check_results.call(slice_start + 20, slice_size + 20)
1274
+ end
1275
+ end
1276
+ end
1277
+ ttl_check_results.call(0, 19)
1278
+ end
1279
+ end
1280
+
1281
+ # Set up the check result monitor, a periodic timer to run
1282
+ # `determine_stale_check_results()` every 30 seconds. The timer
1283
+ # is stored in the timers hash under `:tasks`.
1284
+ #
1285
+ # @param interval [Integer] to use for the check TTL result
1286
+ # interval.
1287
+ def setup_check_result_monitor(interval=30)
1288
+ @logger.debug("monitoring check results")
1289
+ @timers[:tasks][:check_result_monitor] << EM::PeriodicTimer.new(interval) do
1290
+ determine_stale_check_results(interval)
1291
+ end
1292
+ end
1293
+
1294
+ # Create a lock timestamp (integer), current time including
1295
+ # milliseconds. This method is used by Sensu server task
1296
+ # election.
1297
+ #
1298
+ # @return [Integer]
1299
+ def create_lock_timestamp
1300
+ (Time.now.to_f * 10000000).to_i
1301
+ end
1302
+
1303
+ # Create/return the unique Sensu server ID for the current
1304
+ # process.
1305
+ #
1306
+ # @return [String]
1307
+ def server_id
1308
+ @server_id ||= random_uuid
1309
+ end
1310
+
1311
+ # Set up a Sensu server task. Unless the current process is
1312
+ # already responsible for the task, this method sets the task's
1313
+ # server ID stored in Redis to the unique random server ID for
1314
+ # the process. If the task's server ID is successfully updated,
1315
+ # the task is added to `@tasks` for tracking purposes and the
1316
+ # task setup method is called.
1317
+ #
1318
+ # @param task [String]
1319
+ # @yield callback/block called after setting up the task.
1320
+ def setup_task(task)
1321
+ unless @tasks.include?(task)
1322
+ @redis.set("task:#{task}:server", server_id) do
1323
+ @logger.info("i am now responsible for a server task", :task => task)
1324
+ @tasks << task
1325
+ self.send("setup_#{task}".to_sym)
1326
+ yield if block_given?
1327
+ end
1328
+ else
1329
+ @logger.debug("i am already responsible for a server task", :task => task)
1330
+ end
1331
+ end
1332
+
1333
+ # Relinquish a Sensu server task. This method cancels and
1334
+ # clears the associated task timers, those with references
1335
+ # stored in the timers hash under `:tasks`, and removes the task
1336
+ # from `@tasks`. The task server ID and lock are not removed
1337
+ # from Redis, as they will be updated when another server takes
1338
+ # responsibility for the task; this method does not need to
1339
+ # handle Redis connectivity issues.
1340
+ #
1341
+ # @param task [String]
1342
+ def relinquish_task(task)
1343
+ if @tasks.include?(task)
1344
+ @logger.warn("relinquishing server task", :task => task)
1345
+ @timers[:tasks][task.to_sym].each do |timer|
1346
+ timer.cancel
1347
+ end
1348
+ @timers[:tasks][task.to_sym].clear
1349
+ @tasks.delete(task)
1350
+ else
1351
+ @logger.debug("not currently responsible for a server task", :task => task)
1352
+ end
1353
+ end
+
+ # Relinquish all Sensu server tasks, if any.
+ def relinquish_tasks
+ unless @tasks.empty?
+ @tasks.dup.each do |task|
+ relinquish_task(task)
+ end
+ else
+ @logger.debug("not currently responsible for a server task")
+ end
+ end
+
+ # Update a Sensu server task lock timestamp. The current task
+ # server ID is retrieved from Redis and compared with the server
+ # ID of the current process to determine if it is still
+ # responsible for the task. If the current process is still
+ # responsible, the task lock timestamp is updated. If the
+ # current process is no longer responsible, `relinquish_task()`
+ # is called for cleanup.
+ #
+ # @param task [String]
+ def update_task_lock(task)
+ @redis.get("task:#{task}:server") do |current_server_id|
+ if current_server_id == server_id
+ @redis.set("lock:task:#{task}", create_lock_timestamp) do
+ @logger.debug("updated task lock timestamp", :task => task)
+ end
+ else
+ @logger.warn("another sensu server is responsible for the task", :task => task)
+ relinquish_task(task)
+ end
+ end
+ end
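The refresh path above, restated as a compact synchronous sketch (again using the redis gem rather than Sensu's EventMachine client; the task name and server ID are made up):

```ruby
require "redis"

redis     = Redis.new
task      = "client_monitor"
server_id = "8d0362ba-2bbd-4616-9a9f-0505e5267a69"

if redis.get("task:#{task}:server") == server_id
  # still the owner: refresh the lock timestamp
  redis.set("lock:task:#{task}", (Time.now.to_f * 10_000_000).to_i)
else
  # another server owns the task: stop refreshing and clean up locally
end
```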
+
+ # Set up a Sensu server task lock updater. This method uses a
+ # periodic timer to update a task lock timestamp in Redis, every
+ # 10 seconds. If the current process fails to keep the lock
+ # timestamp updated for a task that it is responsible for,
+ # another Sensu server will claim responsibility. This method is
+ # called after task setup.
+ #
+ # @param task [String]
+ def setup_task_lock_updater(task)
+ @timers[:run] << EM::PeriodicTimer.new(10) do
+ update_task_lock(task)
+ end
+ end
+
+ # Request a Sensu server task election, a process to determine
+ # if the current process is to be responsible for the task. A
+ # Redis key/value is used as a central lock, using the "SETNX"
+ # Redis command to set the key/value if it does not exist, using
+ # a timestamp for the value. If the current process was able to
+ # create the key/value, it is elected, and is then responsible
+ # for the task. If the current process was not able to create
+ # the key/value, but the current lock timestamp is 30 or more
+ # seconds old, the "GETSET" Redis command is used to set a new
+ # timestamp and fetch the previous value, and the two are
+ # compared to determine whether the takeover was done by the
+ # current process. If the current process is able to set the
+ # timestamp value, it is elected. If elected, the current
+ # process sets up the task and the associated task lock updater.
+ #
+ # @param task [String]
+ # @yield callback/block called either after being elected and
+ # setting up the task, or after failing to be elected.
+ def request_task_election(task, &callback)
+ @redis.setnx("lock:task:#{task}", create_lock_timestamp) do |created|
+ if created
+ setup_task(task, &callback)
+ setup_task_lock_updater(task)
+ else
+ @redis.get("lock:task:#{task}") do |current_lock_timestamp|
+ new_lock_timestamp = create_lock_timestamp
+ if new_lock_timestamp - current_lock_timestamp.to_i >= 300000000
+ @redis.getset("lock:task:#{task}", new_lock_timestamp) do |previous_lock_timestamp|
+ if previous_lock_timestamp == current_lock_timestamp
+ setup_task(task, &callback)
+ setup_task_lock_updater(task)
+ end
+ end
+ else
+ yield if block_given?
+ end
+ end
+ end
+ end
+ end
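The election itself is the classic SETNX/GETSET lock pattern. A condensed synchronous sketch (redis gem, hypothetical helper name) that returns true when the calling process wins the task:

```ruby
require "redis"

STALE_AFTER = 300_000_000 # 30 seconds in lock-timestamp units

def elected?(redis, task)
  now = (Time.now.to_f * 10_000_000).to_i
  return true if redis.setnx("lock:task:#{task}", now)

  current = redis.get("lock:task:#{task}").to_i
  return false unless now - current >= STALE_AFTER

  # The lock looks stale; try to take it over. Only the process whose
  # GETSET returns the previously observed value actually wins.
  redis.getset("lock:task:#{task}", now).to_i == current
end

elected?(Redis.new, "check_request_publisher") # => true or false
```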
+
+ # Request Sensu server task elections. The task list is ordered
+ # by priority. This method works through the task list serially,
+ # increasing the election request delay as the current process
+ # becomes responsible for one or more tasks; this improves
+ # the initial distribution of tasks amongst Sensu servers.
+ #
+ # @param splay [Integer]
+ def setup_task_elections(splay=10)
+ tasks = TASKS.dup - @tasks
+ next_task = Proc.new do
+ task = tasks.shift
+ if task
+ delay = splay * @tasks.size
+ @timers[:run] << EM::Timer.new(delay) do
+ request_task_election(task, &next_task)
+ end
+ else
+ @timers[:run] << EM::Timer.new(10) do
+ setup_task_elections(splay)
+ end
+ end
+ end
+ next_task.call
+ end
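A worked example of the delays: with the default splay of 10, a server that holds no tasks requests the next election immediately, while one that has already won tasks backs off, which is how the initial task distribution spreads across servers.

```ruby
# Illustrative arithmetic only: delay = splay * number of tasks held.
splay = 10
[0, 1, 2].each do |tasks_held|
  puts "holding #{tasks_held} task(s) -> next election request in #{splay * tasks_held}s"
end
# holding 0 task(s) -> next election request in 0s
# holding 1 task(s) -> next election request in 10s
# holding 2 task(s) -> next election request in 20s
```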
+
+ # Update the Sensu server registry, stored in Redis. This method
+ # adds the local/current Sensu server info to the registry,
+ # including its id, hostname, address, its server tasks, and
+ # some metrics. Sensu server registry entries expire in 30
+ # seconds unless updated.
+ #
+ # @yield [success] passes success status to optional
+ # callback/block.
+ # @yieldparam success [TrueClass,FalseClass] indicating if the
+ # server registry update was a success.
+ def update_server_registry
+ @logger.debug("updating the server registry")
+ process_cpu_times do |cpu_user, cpu_system, _, _|
+ sensu = RELEASE_INFO.merge(
+ :settings => {
+ :hexdigest => @settings.hexdigest
+ }
+ )
+ tessen = @settings[:tessen] || {}
+ tessen_enabled = tessen.fetch(:enabled, false)
+ info = {
+ :id => server_id,
+ :hostname => system_hostname,
+ :address => system_address,
+ :tasks => @tasks,
+ :metrics => {
+ :cpu => {
+ :user => cpu_user,
+ :system => cpu_system
+ }
+ },
+ :sensu => sensu,
+ :tessen => {
+ :enabled => tessen_enabled
+ },
+ :timestamp => Time.now.to_i
+ }
+ @redis.sadd("servers", server_id)
+ server_key = "server:#{server_id}"
+ @redis.set(server_key, Sensu::JSON.dump(info)) do
+ @redis.expire(server_key, 30)
+ @logger.info("updated server registry", :server => info)
+ yield(true) if block_given?
+ end
+ end
+ end
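Roughly what a stored registry entry looks like when read back; the values below are invented for illustration, and the entry lives at `server:<server_id>` with a 30 second TTL alongside membership in the `servers` set.

```ruby
require "json"
require "redis"

redis = Redis.new
entry = JSON.parse(redis.get("server:8d0362ba-2bbd-4616-9a9f-0505e5267a69"))
# {
#   "id"        => "8d0362ba-2bbd-4616-9a9f-0505e5267a69",
#   "hostname"  => "sensu-server-1",
#   "address"   => "10.0.0.5",
#   "tasks"     => ["client_monitor"],
#   "metrics"   => {"cpu" => {"user" => 1.52, "system" => 0.38}},
#   "sensu"     => {"version" => "1.10.0", "settings" => {"hexdigest" => "..."}},
#   "tessen"    => {"enabled" => false},
#   "timestamp" => 1554868411
# }
```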
+
+ # Set up the server registry updater. A periodic timer is used
+ # to update the Sensu server info stored in Redis every 10
+ # seconds, well within the 30 second expiry set on each registry
+ # entry. The timer is stored in the timers hash under `:run`.
+ def setup_server_registry_updater
+ update_server_registry
+ @timers[:run] << EM::PeriodicTimer.new(10) do
+ update_server_registry
+ end
+ end
+
+ # Set up Tessen, the call home mechanism.
+ def setup_tessen
+ @tessen = Tessen.new(
+ :settings => @settings,
+ :logger => @logger,
+ :redis => @redis
+ )
+ @tessen.run if @tessen.enabled?
+ end
+
+ # Unsubscribe from transport subscriptions (all of them). This
+ # method is called when there are issues with connectivity, or
+ # the process is stopping.
+ def unsubscribe
+ @logger.warn("unsubscribing from keepalive and result queues")
+ @transport.unsubscribe if @transport
+ end
+
+ # Complete in progress work and then call the provided callback.
+ # This method will wait until all counters stored in the
+ # `@in_progress` hash equal `0`.
+ #
+ # @yield [] callback/block to call when in progress work is
+ # completed.
+ def complete_in_progress
+ @logger.info("completing work in progress", :in_progress => @in_progress)
+ retry_until_true do
+ if @in_progress.values.all? { |count| count == 0 }
+ yield
+ true
+ end
+ end
+ end
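A conceptual sketch of the drain logic (not Sensu internals verbatim; the counter names are illustrative): work bumps a counter when it starts, decrements it when it finishes, and shutdown proceeds only once every counter reads zero.

```ruby
in_progress = Hash.new(0)

start_work  = ->(kind) { in_progress[kind] += 1 }
finish_work = ->(kind) { in_progress[kind] -= 1 }

start_work.call(:events)
# ... handle the event ...
finish_work.call(:events)

if in_progress.values.all? { |count| count == 0 }
  puts "safe to close the Redis and transport connections"
end
```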
+
+ # Bootstrap the Sensu server process, setting up the keepalive
+ # and check result consumers, and attempting to carry out Sensu
+ # server tasks. This method sets the process/daemon `@state` to
+ # `:running`.
+ def bootstrap
+ setup_keepalives
+ setup_results
+ setup_task_elections
+ setup_server_registry_updater
+ setup_tessen
+ @state = :running
+ end
+
+ # Start the Sensu server process, connecting to Redis and the
+ # Transport, then calling the `bootstrap()` method. Yield if a
+ # block is provided.
+ def start
+ setup_connections do
+ bootstrap
+ yield if block_given?
+ end
+ end
+
+ # Pause the Sensu server process, unless it is being paused or
+ # has already been paused. The process/daemon `@state` is first
+ # set to `:pausing`, to indicate that it's in progress. All run
+ # timers are cancelled, their references are cleared, and Tessen
+ # is stopped. The Sensu server will unsubscribe from all
+ # transport subscriptions, relinquish any Sensu server tasks,
+ # then set the process/daemon `@state` to `:paused`.
+ def pause
+ unless @state == :pausing || @state == :paused
+ @state = :pausing
+ @timers[:run].each do |timer|
+ timer.cancel
+ end
+ @timers[:run].clear
+ @tessen.stop if @tessen
+ unsubscribe
+ relinquish_tasks
+ @state = :paused
+ end
+ end
+
+ # Resume the Sensu server process if it is currently or will
+ # soon be paused. The `retry_until_true` helper method is used
+ # to determine if the process is paused and if the Redis and
+ # transport connections are initiated and connected. If the
+ # conditions are met, `bootstrap()` will be called and true is
+ # returned to stop `retry_until_true`. If the transport has not
+ # yet been initiated, true is returned without calling
+ # bootstrap, as we expect bootstrap will be called after the
+ # transport initializes.
+ def resume
+ retry_until_true(1) do
+ if @state == :paused
+ if @redis.connected?
+ if @transport
+ if @transport.connected?
+ bootstrap
+ true
+ end
+ else
+ true
+ end
+ end
+ end
+ end
+ end
+
+ # Stop the Sensu server process, pausing it, completing event
+ # handling in progress, closing the Redis and transport
+ # connections, and exiting the process (exit 0). After pausing
+ # the process, the process/daemon `@state` is set to
+ # `:stopping`.
+ def stop
+ @logger.warn("stopping")
+ pause
+ @state = :stopping
+ complete_in_progress do
+ @redis.close if @redis
+ @transport.close if @transport
+ super
+ end
+ end
+ end
+ end
+ end