portertech-sensu 1.10.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (51) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +961 -0
  3. data/MIT-LICENSE.txt +20 -0
  4. data/README.md +65 -0
  5. data/exe/sensu-api +10 -0
  6. data/exe/sensu-client +10 -0
  7. data/exe/sensu-install +195 -0
  8. data/exe/sensu-server +10 -0
  9. data/lib/sensu/api/http_handler.rb +434 -0
  10. data/lib/sensu/api/process.rb +79 -0
  11. data/lib/sensu/api/routes/aggregates.rb +196 -0
  12. data/lib/sensu/api/routes/checks.rb +44 -0
  13. data/lib/sensu/api/routes/clients.rb +171 -0
  14. data/lib/sensu/api/routes/events.rb +86 -0
  15. data/lib/sensu/api/routes/health.rb +45 -0
  16. data/lib/sensu/api/routes/info.rb +37 -0
  17. data/lib/sensu/api/routes/request.rb +44 -0
  18. data/lib/sensu/api/routes/resolve.rb +32 -0
  19. data/lib/sensu/api/routes/results.rb +153 -0
  20. data/lib/sensu/api/routes/settings.rb +23 -0
  21. data/lib/sensu/api/routes/silenced.rb +182 -0
  22. data/lib/sensu/api/routes/stashes.rb +107 -0
  23. data/lib/sensu/api/routes.rb +88 -0
  24. data/lib/sensu/api/utilities/filter_response_content.rb +44 -0
  25. data/lib/sensu/api/utilities/publish_check_request.rb +107 -0
  26. data/lib/sensu/api/utilities/publish_check_result.rb +39 -0
  27. data/lib/sensu/api/utilities/resolve_event.rb +29 -0
  28. data/lib/sensu/api/utilities/servers_info.rb +43 -0
  29. data/lib/sensu/api/utilities/transport_info.rb +43 -0
  30. data/lib/sensu/api/validators/check.rb +55 -0
  31. data/lib/sensu/api/validators/client.rb +35 -0
  32. data/lib/sensu/api/validators/invalid.rb +8 -0
  33. data/lib/sensu/cli.rb +69 -0
  34. data/lib/sensu/client/http_socket.rb +217 -0
  35. data/lib/sensu/client/process.rb +655 -0
  36. data/lib/sensu/client/socket.rb +207 -0
  37. data/lib/sensu/client/utils.rb +53 -0
  38. data/lib/sensu/client/validators/check.rb +53 -0
  39. data/lib/sensu/constants.rb +17 -0
  40. data/lib/sensu/daemon.rb +396 -0
  41. data/lib/sensu/sandbox.rb +19 -0
  42. data/lib/sensu/server/filter.rb +227 -0
  43. data/lib/sensu/server/handle.rb +201 -0
  44. data/lib/sensu/server/mutate.rb +92 -0
  45. data/lib/sensu/server/process.rb +1646 -0
  46. data/lib/sensu/server/socket.rb +54 -0
  47. data/lib/sensu/server/tessen.rb +170 -0
  48. data/lib/sensu/utilities.rb +398 -0
  49. data/lib/sensu.rb +3 -0
  50. data/sensu.gemspec +36 -0
  51. metadata +322 -0
@@ -0,0 +1,1646 @@
1
+ require "sensu/daemon"
2
+ require "sensu/server/filter"
3
+ require "sensu/server/mutate"
4
+ require "sensu/server/handle"
5
+ require "sensu/server/tessen"
6
+
7
+ module Sensu
8
+ module Server
9
+ class Process
10
+ include Daemon
11
+ include Filter
12
+ include Mutate
13
+ include Handle
14
+
15
+ attr_reader :tasks, :in_progress
16
+
17
# Names of the server tasks; `initialize` keeps a timer list for each
# of them. Frozen, like the other constants below, so the shared
# array cannot be mutated by accident at runtime.
TASKS = ["check_request_publisher", "client_monitor", "check_result_monitor"].freeze

# Check type assigned to results that do not specify one.
STANDARD_CHECK_TYPE = "standard".freeze

# Check type whose events are logged at debug level and whose output
# is truncated by default.
METRIC_CHECK_TYPE = "metric".freeze

# Stored event action value that marks an event as flapping.
EVENT_FLAPPING_ACTION = "flapping".freeze

# Handler name used when a check names no handler(s).
DEFAULT_HANDLER_NAME = "default".freeze
26
+
27
+ # Create an instance of the Sensu server process, start the
28
+ # server within the EventMachine event loop, and set up server
29
+ # process signal traps (for stopping).
30
+ #
31
+ # @param options [Hash]
32
def self.run(options={})
  # The process is instantiated before entering the reactor; it is
  # started, and its signal traps installed, once the loop is running.
  process = new(options)
  EM.run {
    process.start
    process.setup_signal_traps
  }
end
39
+
40
+ # Override Daemon initialize() to support Sensu server tasks and
41
+ # the handling event count.
42
+ #
43
+ # @param options [Hash]
44
def initialize(options={})
  super
  @tasks = []
  # One (initially empty) timer list per known task, keyed by symbol.
  @timers[:tasks] = TASKS.each_with_object({}) do |task, task_timers|
    task_timers[task.to_sym] = []
  end
  # Counters default to 0 so they can be incremented without setup.
  @in_progress = Hash.new(0)
end
53
+
54
+ # Set up the Redis and Transport connection objects, `@redis`
55
+ # and `@transport`. This method updates the Redis on error
56
+ # callback to reset the in progress check result counter. This
57
+ # method "drys" up many instances of `setup_redis()` and
58
+ # `setup_transport()`, particularly in the specs.
59
+ #
60
+ # @yield callback/block called after connecting to Redis and the
61
+ # Sensu Transport.
62
def setup_connections
  # On a Redis error the in-progress check result counter is reset.
  reset_on_error = proc do |error|
    @logger.error("redis connection error", :error => error.to_s)
    @in_progress[:check_results] = 0
  end
  setup_redis do
    @redis.on_error(&reset_on_error)
    # The caller's block runs only once the transport is set up too.
    setup_transport { yield }
  end
end
73
+
74
+ # Create a registration check definition for a client. Client
75
+ # definitions may contain `:registration` configuration,
76
+ # containing custom attributes and handler information. By
77
+ # default, the registration check definition sets the `:handler`
78
+ # to `registration`. If the client provides its own
79
+ # `:registration` configuration, it's deep merged with the
80
+ # defaults. The check `:name`, `:output`, `:issued`, and
81
+ # `:executed` values are always overridden to guard against an
82
+ # invalid definition.
83
def create_registration_check(client)
  defaults = {:handler => "registration", :status => 1}
  check = client.has_key?(:registration) ? deep_merge(defaults, client[:registration]) : defaults
  now = Time.now.to_i
  # These four attributes always win over client-provided values.
  check.merge(
    :name => "registration",
    :output => "new client registration",
    :issued => now,
    :executed => now
  )
end
97
+
98
+ # Create and process a client registration event. A registration
99
+ # event is created when a Sensu client is first added to the
100
+ # client registry. The `create_registration_check()` method is
101
+ # called to create a registration check definition for the
102
+ # client.
103
+ #
104
+ # @param client [Hash] definition.
105
def create_client_registration_event(client)
  registration_check = create_registration_check(client)
  create_event(client, registration_check) do |registration_event|
    # Bridges see the event before it enters the handler pipeline.
    event_bridges(registration_event)
    process_event(registration_event)
  end
end
112
+
113
+ # Process an initial client registration, when it is first added
114
+ # to the client registry. If a registration handler is defined
115
+ # or the client specifies one, a client registration event is
116
+ # created and processed (handled, etc.) for the client
117
+ # (`create_client_registration_event()`).
118
+ #
119
+ # @param client [Hash] definition.
120
def process_client_registration(client)
  # Nothing to do unless a registration handler is configured or the
  # client carries its own `:registration` configuration.
  return unless @settings.handler_exists?("registration") || client[:registration]
  create_client_registration_event(client)
end
125
+
126
+ # Update the Sensu client registry, stored in Redis. Sensu
127
+ # client data is used to provide additional event context and
128
+ # enable agent health monitoring.
129
+ #
130
+ # To enable silencing individual clients, per-client
131
+ # subscriptions (`client:$CLIENT_NAME`) are added to client
132
+ # subscriptions automatically.
133
+ #
134
+ # The client registry supports client signatures, unique string
135
+ # identifiers used for keepalive and result source
136
+ # verification. If a client has a signature, all further
137
+ # registry updates for the client must have the same
138
+ # signature. A client can begin to use a signature if one was
139
+ # not previously configured. JSON serialization is used for the
140
+ # stored client data.
141
+ #
142
+ # @param client [Hash]
143
+ # @yield [success] passes success status to optional
144
+ # callback/block.
145
+ # @yieldparam success [TrueClass,FalseClass] indicating if the
146
+ # client registry update was a success or the client data was
147
+ # discarded due to client signature mismatch.
148
def update_client_registry(client)
  @logger.debug("updating client registry", :client => client)
  client_key = "client:#{client[:name]}"
  # Ensure the per-client subscription is present exactly once.
  client[:subscriptions] = client[:subscriptions] | [client_key]
  signature_key = "#{client_key}:signature"
  @redis.setnx(signature_key, client[:signature]) do |created|
    # A newly created signature key means the client is new.
    process_client_registration(client) if created
    @redis.get(signature_key) do |stored_signature|
      unsigned = stored_signature.nil? || stored_signature.empty?
      # A client may start using a signature if none was stored yet.
      @redis.set(signature_key, client[:signature]) if unsigned && client[:signature]
      if unsigned || client[:signature] == stored_signature
        @redis.multi
        @redis.set(client_key, Sensu::JSON.dump(client))
        @redis.sadd("clients", client[:name])
        @redis.exec { yield(true) if block_given? }
      else
        @logger.warn("invalid client signature", {
          :client => client,
          :signature => stored_signature
        })
        @logger.warn("not updating client in the registry", :client => client)
        yield(false) if block_given?
      end
    end
  end
end
177
+
178
+ # Determine if a transport message is under the optional
179
+ # configured max message size. This method helps prevent
180
+ # oversized messages from consuming memory and being persisted
181
+ # to the datastore.
182
+ #
183
+ # @param message [String]
184
+ # @return [TrueClass,FalseClass]
185
def message_size_ok?(message)
  server_settings = @settings[:sensu][:server]
  max_message_size = server_settings && server_settings[:max_message_size]
  # No limit configured: every message is acceptable.
  return true unless max_message_size
  message_size = message.bytesize
  return true if message_size <= max_message_size
  @logger.error("message exceeds the configured max message size", {
    :max_message_size => max_message_size,
    :message_size => message_size,
    :message => message
  })
  false
end
204
+
205
+ # Set up the client keepalive consumer, keeping the Sensu client
206
+ # registry updated. The consumer receives JSON serialized client
207
+ # keepalives from the transport, parses them, and calls
208
+ # `update_client_registry()` with the client data to update the
209
+ # registry. Transport message acknowledgements are used to
210
+ # ensure the client registry is updated successfully. Keepalive
211
+ # JSON parsing errors are logged.
212
def setup_keepalives
  server_settings = @settings[:sensu][:server]
  keepalives_pipe = (server_settings && server_settings[:keepalives_pipe]) || "keepalives"
  @logger.debug("subscribing to keepalives", :pipe => keepalives_pipe)
  @transport.subscribe(:direct, keepalives_pipe, "keepalives", :ack => true) do |message_info, message|
    @logger.debug("received keepalive", :message => message)
    if message_size_ok?(message)
      begin
        update_client_registry(Sensu::JSON.load(message))
      rescue Sensu::JSON::ParseError => error
        @logger.error("failed to parse keepalive payload", {
          :message => message,
          :error => error.to_s
        })
      end
    end
    # Every message is acknowledged (oversized and unparsable ones
    # included), on the next reactor tick.
    EM.next_tick { @transport.ack(message_info) }
  end
end
236
+
237
+ # Expand event handler sets, creating an array of handler
238
+ # definitions. Handler sets cannot be deeply nested (by choice),
239
+ # this method will return `nil` if an attempt is made to deeply
240
+ # nest. If the provided handler definition is not a set, it is
241
+ # returned.
242
+ #
243
+ # @param handler [Hash] definition.
244
+ # @param depth [Integer] of the expansion.
245
+ # @return [Array, Hash, Nil]
246
def expand_handler_sets(handler, depth=0)
  # Anything that is not a set is passed through untouched.
  return handler unless handler[:type] == "set"
  return derive_handlers(handler[:handlers], depth + 1) if depth < 2
  @logger.error("handler sets cannot be deeply nested", :handler => handler)
  nil
end
258
+
259
+ # Derive an array of handler definitions from a list of handler
260
+ # names. This method first checks for the existence of standard
261
+ # handlers, followed by handler extensions. If a handler does
262
+ # not exist for a name, it is logged and ignored. Duplicate
263
+ # handler definitions are removed.
264
+ #
265
+ # @param handler_list [Array]
266
+ # @param depth [Integer] of handler set expansion.
267
+ # @return [Array]
268
def derive_handlers(handler_list, depth=0)
  derived = handler_list.compact.map do |handler_name|
    if @settings.handler_exists?(handler_name)
      # Standard handlers take precedence over extensions.
      definition = @settings[:handlers][handler_name].merge(:name => handler_name)
      expand_handler_sets(definition, depth)
    elsif @extensions.handler_exists?(handler_name)
      @extensions[:handlers][handler_name]
    else
      @logger.error("unknown handler", :handler_name => handler_name)
      nil
    end
  end
  # Expanded sets yield arrays and unknown names yield nil.
  derived.flatten.compact.uniq
end
282
+
283
+ # Process an event: filter -> mutate -> handle.
284
+ #
285
+ # This method determines the appropriate handlers for an event,
286
+ # filtering and mutating the event data for each of them. The
287
+ # `@in_progress[:events]` counter is incremented by `1`, for
288
+ # each event handler chain (filter -> mutate -> handle).
289
+ #
290
+ # @param event [Hash]
291
def process_event(event)
  check = event[:check]
  # Metric events are logged at debug level, all others at info.
  log_level = check[:type] == METRIC_CHECK_TYPE ? :debug : :info
  @logger.send(log_level, "processing event", :event => event)
  handler_names = Array(check[:handlers] || check[:handler] || DEFAULT_HANDLER_NAME)
  derive_handlers(handler_names).each do |handler|
    @in_progress[:events] += 1
    filter_event(handler, event) do |filtered_event|
      mutate_event(handler, filtered_event) do |event_data|
        handle_event(handler, event_data, filtered_event[:id])
      end
    end
  end
end
305
+
306
+ # Run event bridge extensions, within the Sensu EventMachine
307
+ # reactor (event loop). The extension API `safe_run()` method is
308
+ # used to guard against most errors. Bridges are for relaying
309
+ # Sensu event data to other services.
310
+ #
311
+ # @param event [Hash]
312
def event_bridges(event)
  @extensions[:bridges].each do |_name, bridge|
    # Only the bridge output is logged; the status is ignored.
    bridge.safe_run(event) do |output, _status|
      @logger.debug("bridge extension output", {
        :extension => bridge.definition,
        :output => output
      })
    end
  end
end
322
+
323
+ # Add a check result to one or more aggregates. The aggregate name is
324
+ # determined by the value of check `:aggregates` array, if present,
325
+ # and falling back to `:aggregate` otherwise.
326
+ #
327
+ # When one or more aggregates are specified as `:aggregates`, the
328
+ # client name and check are updated on each aggregate.
329
+ #
330
+ # When no aggregates are specified as `:aggregates`, and `:aggregate`
331
+ # is `true` (legacy), the check `:name` is used as the aggregate name.
332
+ #
333
+ # When no aggregates are specified as `:aggregates` and check `:aggregate`
334
+ # is a string, it is used as the aggregate name.
335
+ #
336
+ # This method will add the client name to configured aggregates, all
337
+ # other processing (e.g. counters) is done by the Sensu API on request.
338
+ #
339
+ # @param client [Hash]
340
+ # @param check [Hash]
341
def aggregate_check_result(client, check)
  # A string `:aggregate` names the aggregate; otherwise the check
  # name is the fallback. `:aggregates`, when set, wins over both.
  fallback_name = check[:aggregate].is_a?(String) ? check[:aggregate] : check[:name]
  Array(check[:aggregates] || fallback_name).each do |aggregate|
    @logger.debug("adding check result to aggregate", {
      :aggregate => aggregate,
      :client => client,
      :check => check
    })
    member = "#{client[:name]}:#{check[:name]}"
    @redis.sadd("aggregates:#{aggregate}", member) do
      @redis.sadd("aggregates", aggregate)
    end
  end
end
356
+
357
+ # Truncate check output. Metric checks (`"type": "metric"`), or
358
+ # checks with `"truncate_output": true`, have their output
359
+ # truncated to a single line and a maximum character length of
360
+ # 255 by default. The maximum character length can be changed by
361
+ # the `"truncate_output_length"` check definition attribute.
362
+ #
363
+ # @param check [Hash]
364
+ # @return [Hash] check with truncated output.
365
def truncate_check_output(check)
  # Truncation is opt-in via `:truncate_output`, and on by default for
  # metric checks unless they explicitly set it to `false`.
  truncate = check[:truncate_output] ||
    (check[:type] == METRIC_CHECK_TYPE && check[:truncate_output] != false)
  return check unless truncate
  begin
    output_lines = check[:output].split("\n")
  rescue ArgumentError
    # The output could not be split as-is; re-encode it from binary
    # to UTF-8, dropping anything that cannot be converted, and retry.
    utf8_output = check[:output].encode("UTF-8", "binary",
      :invalid => :replace,
      :undef => :replace,
      :replace => "")
    output_lines = utf8_output.split("\n")
  end
  # An empty output has no lines; fall back to the output itself.
  output = output_lines.first || check[:output]
  truncate_output_length = check.fetch(:truncate_output_length, 255)
  if output_lines.length > 1 || output.length > truncate_output_length
    # Keep at most `truncate_output_length` characters. The previous
    # inclusive range (`0..length`) kept one character too many.
    output = output[0, truncate_output_length] + "\n..."
  end
  check.merge(:output => output)
end
388
+
389
+ # Store check result data. This method stores check result data
390
+ # and the 21 most recent check result statuses for a client/check
391
+ # pair, this history is used for event context and flap detection.
392
+ # The check execution timestamp is also stored, to provide an
393
+ # indication of how recent the data is. Check output is
394
+ # truncated by `truncate_check_output()` before it is stored.
395
+ #
396
+ # @param client [Hash]
397
+ # @param check [Hash]
398
+ # @yield [] callback/block called after the check data has been
399
+ # stored (history, etc).
400
def store_check_result(client, check)
  @logger.debug("storing check result", :check => check)
  client_name = client[:name]
  check_name = check[:name]
  result_key = "#{client_name}:#{check_name}"
  history_key = "history:#{result_key}"
  truncated_check = truncate_check_output(check)
  # Everything between `multi` and `exec` is queued as one transaction.
  @redis.multi
  @redis.sadd("result:#{client_name}", check_name)
  @redis.set("result:#{result_key}", Sensu::JSON.dump(truncated_check))
  @redis.sadd("ttl", result_key) if check[:ttl]
  @redis.rpush(history_key, check[:status])
  # Keep only the 21 most recent statuses.
  @redis.ltrim(history_key, -21, -1)
  if check[:status] == 0
    last_ok = check.fetch(:executed, Time.now.to_i)
    @redis.set("#{history_key}:last_ok", last_ok)
  end
  @redis.exec { yield }
end
418
+
419
+ # Fetch the execution history for a client/check pair, the 21
420
+ # most recent check result statuses. This method also calculates
421
+ # the total state change percentage for the history, this value
422
+ # is used for check state flap detection, using a similar
423
+ # algorithm to Nagios:
424
+ # http://nagios.sourceforge.net/docs/3_0/flapping.html
425
+ #
426
+ # @param client [Hash]
427
+ # @param check [Hash]
428
+ # @yield [history, total_state_change, last_ok] callback/block to call
429
+ # with the check history and calculated total state change
430
+ # value.
431
+ # @yieldparam history [Array] containing the last 21 check
432
+ # result exit status codes.
433
+ # @yieldparam total_state_change [Integer] percentage for the
434
+ # check history (exit status codes).
435
+ # @yieldparam last_ok [Integer] execution timestamp of the last
436
+ # OK check result.
437
def check_history(client, check)
  history_key = "history:#{client[:name]}:#{check[:name]}"
  @redis.lrange(history_key, -21, -1) do |history|
    total_state_change = 0
    # The percentage is only calculated once 21 statuses exist.
    if history.length >= 21
      weighted_changes = 0
      # Each later position weighs 0.02 more than the one before it,
      # starting from 0.8, so recent changes count the most.
      weight = 0.8
      history.each_cons(2) do |previous_status, status|
        weight += 0.02
        weighted_changes += weight unless status == previous_status
      end
      total_state_change = (weighted_changes.fdiv(20) * 100).to_i
    end
    @redis.get("#{history_key}:last_ok") do |last_ok|
      # A missing timestamp stays nil rather than becoming 0.
      last_ok_timestamp = last_ok.nil? ? nil : last_ok.to_i
      yield(history, total_state_change, last_ok_timestamp)
    end
  end
end
460
+
461
+ # Determine if a check state is flapping, rapidly changing
462
+ # between an OK and non-OK state. Flap detection is only done
463
+ # for checks that have defined low and high flap detection
464
+ # thresholds, `:low_flap_threshold` and `:high_flap_threshold`.
465
+ # The `check_history()` method provides the check history and
466
+ # more importantly the total state change percentage value that
467
+ # is compared with the configured thresholds defined in the
468
+ # check data. If a check hasn't been flapping, the
469
+ # `:total_state_change` must be equal to or higher than the
470
+ # `:high_flap_threshold` to be changed to flapping. If a check
471
+ # has been flapping, the `:total_state_change` must be equal to
472
+ # or lower than the `:low_flap_threshold` to no longer be
473
+ # flapping. This method uses the same algorithm as Nagios:
474
+ # http://nagios.sourceforge.net/docs/3_0/flapping.html
475
+ #
476
+ # @param stored_event [Hash]
477
+ # @param check [Hash]
478
+ # @return [TrueClass, FalseClass]
479
def check_flapping?(stored_event, check)
  # Flap detection requires both thresholds to be defined.
  return false unless check.has_key?(:low_flap_threshold) && check.has_key?(:high_flap_threshold)
  low_threshold = check[:low_flap_threshold]
  high_threshold = check[:high_flap_threshold]
  unless low_threshold.is_a?(Integer) && high_threshold.is_a?(Integer)
    details = {:check => check}
    details[:client] = stored_event[:client] if stored_event
    @logger.error("invalid check flap thresholds", details)
    return false
  end
  if stored_event && stored_event[:action] == EVENT_FLAPPING_ACTION
    # Already flapping: stays flapping while above the low threshold.
    check[:total_state_change] > low_threshold
  else
    # Not flapping yet: starts at or above the high threshold.
    check[:total_state_change] >= high_threshold
  end
end
498
+
499
+ # Determine if an event has been silenced. This method compiles
500
+ # an array of possible silenced registry entry keys for the
501
+ # event. An attempt is made to fetch one or more of the silenced
502
+ # registry entries to determine if the event has been silenced.
503
+ # The event data is updated to indicate if the event has been
504
+ # silenced. If the event is silenced and the event action is
505
+ # `:resolve`, silenced registry entries with
506
+ # `:expire_on_resolve` set to true will be deleted. Silencing is
507
+ # disabled for events with a check status of `0` (OK), unless
508
+ # the event action is `:resolve` or `:flapping`.
509
+ #
510
+ # @param event [Hash]
511
+ # @yield callback [event] callback/block called after the event
512
+ # data has been updated to indicate if it has been silenced.
513
def event_silenced?(event)
  event[:silenced] = false
  event[:silenced_by] = []
  # OK results of plain `:create` events are never silenced, so the
  # registry lookup is skipped for them.
  if event[:check][:status] == 0 && event[:action] == :create
    yield(event)
  else
    check_name = event[:check][:name]
    # Two candidate keys per subscription (any check / this check),
    # plus the "any subscription" key for this check.
    silenced_keys = event[:client][:subscriptions].flat_map do |subscription|
      ["silence:#{subscription}:*", "silence:#{subscription}:#{check_name}"]
    end
    silenced_keys << "silence:*:#{check_name}"
    @redis.mget(*silenced_keys) do |silenced|
      silenced.compact!
      silenced.each do |silenced_json|
        entry = Sensu::JSON.load(silenced_json)
        if entry[:expire_on_resolve] && event[:action] == :resolve
          expired_key = "silence:#{entry[:id]}"
          @redis.srem("silenced", expired_key)
          @redis.del(expired_key)
        elsif entry[:begin].nil? || entry[:begin] <= Time.now.to_i
          # Entries with a future `:begin` do not apply yet.
          event[:silenced_by] << entry[:id]
        end
      end
      event[:silenced] = !event[:silenced_by].empty?
      yield(event)
    end
  end
end
541
+
542
+ # Update the event registry, stored in Redis. This method
543
+ # determines if event data warrants the creation or update of
544
+ # event data in the registry. If a check `:status` is not
545
+ # `0`, or it has been flapping, an event is created/updated in
546
+ # the registry. If the event `:action` is `:resolve`, the event
547
+ # is removed (resolved) from the registry. If the previous
548
+ # conditions are not met and check `:type` is `metric`, the
549
+ # registry is not updated, but further event processing is
550
+ # required (`yield(true)`). JSON serialization is used when
551
+ # storing data in the registry.
552
+ #
553
+ # @param event [Hash]
554
+ # @yield callback [event] callback/block called after the event
555
+ # registry has been updated.
556
+ # @yieldparam process [TrueClass, FalseClass] indicating if the
557
+ # event requires further processing.
558
def update_event_registry(event)
  check = event[:check]
  registry_key = "events:#{event[:client][:name]}"
  flapping = event[:action] == :flapping
  resolving = event[:action] == :resolve
  if check[:status] != 0 || (flapping && check[:force_resolve] != true)
    # Non-OK or (not force-resolved) flapping: create/update the event.
    @redis.hset(registry_key, check[:name], Sensu::JSON.dump(event)) { yield(true) }
  elsif (resolving && (check[:auto_resolve] != false || check[:force_resolve])) ||
      (flapping && check[:force_resolve])
    # Resolved: remove the event from the registry.
    @redis.hdel(registry_key, check[:name]) { yield(true) }
  elsif check[:type] == METRIC_CHECK_TYPE
    # Metric results skip the registry but are still processed.
    yield(true)
  else
    yield(false)
  end
end
576
+
577
+ # Create an event, using the provided client and check result
578
+ # data. Existing event data for the client/check pair is fetched
579
+ # from the event registry to be used in the composition of the
580
+ # new event. The silenced registry is used to determine if the
581
+ # event has been silenced.
582
+ #
583
+ # @param client [Hash]
584
+ # @param check [Hash]
585
+ # @yield callback [event] callback/block called with the
586
+ # resulting event.
587
+ # @yieldparam event [Hash]
588
def create_event(client, check)
  check_history(client, check) do |history, total_state_change, last_ok|
    check[:history] = history
    check[:total_state_change] = total_state_change
    @redis.hget("events:#{client[:name]}", check[:name]) do |event_json|
      stored_event = Sensu::JSON.load(event_json) if event_json
      flapping = check_flapping?(stored_event, check)
      event = {
        :id => random_uuid,
        :client => client,
        :check => check,
        :occurrences => 1,
        :occurrences_watermark => 1,
        :last_ok => last_ok,
        :action => flapping ? :flapping : :create,
        :timestamp => Time.now.to_i
      }
      if stored_event
        # Carry identity and counters over from the existing event.
        event[:id] = stored_event[:id]
        event[:last_state_change] = stored_event[:last_state_change]
        event[:occurrences] = stored_event[:occurrences]
        event[:occurrences_watermark] = stored_event[:occurrences_watermark] || event[:occurrences]
      end
      if check[:status] != 0 || flapping
        if history[-1] == history[-2]
          # Same status as the previous result: another occurrence.
          event[:occurrences] += 1
          event[:occurrences_watermark] = event[:occurrences] if event[:occurrences] > event[:occurrences_watermark]
        else
          event[:occurrences] = 1
          event[:last_state_change] = event[:timestamp]
        end
      elsif stored_event
        # OK result for an existing event: it is being resolved.
        event[:last_state_change] = event[:timestamp]
        event[:action] = :resolve
      end
      event_silenced?(event) { |checked_event| yield(checked_event) }
    end
  end
end
631
+
632
+ # Create a blank client (data). Only the client name is known,
633
+ # the other client attributes must be updated via the API (POST
634
+ # /clients:client). Dynamically created clients and those
635
+ # updated via the API will have client keepalives disabled by
636
+ # default, `:keepalives` is set to `false`.
637
+ #
638
+ # @param name [String] to use for the client.
639
+ # @return [Hash] client.
640
def create_client(name)
  client = {:name => name, :address => "unknown"}
  # Only the per-client subscription is known at this point.
  client[:subscriptions] = ["client:#{name}"]
  client[:keepalives] = false
  client[:version] = VERSION
  client[:timestamp] = Time.now.to_i
  client
end
650
+
651
+ # Retrieve a client (data) from Redis if it exists. If a client
652
+ # does not already exist, create one (a blank) using the
653
+ # `client_key` as the client name. Dynamically created client
654
+ # data can be updated using the API (POST /clients/:client). If
655
+ # a client does exist and it has a client signature, the check
656
+ # result must have a matching signature or it is discarded. If
657
+ # the client does not exist, but a client signature exists, the
658
+ # check result must have a matching signature or it is
659
+ # discarded.
660
+ #
661
+ # @param result [Hash] data.
662
+ # @yield [client] callback/block to be called with client data,
663
+ # either retrieved from Redis, or dynamically created.
664
+ # @yieldparam client [Hash]
665
def retrieve_client(result)
  # A check `:source` (proxy client) takes precedence over the name
  # of the client that published the result.
  client_key = result[:check][:source] || result[:client]
  @redis.get("client:#{client_key}") do |client_json|
    if client_json.nil?
      # Unknown client: a signature may still exist for the name and,
      # if so, it must match before a client is created.
      @redis.get("client:#{client_key}:signature") do |signature|
        if signature.nil? || signature.empty? || result[:signature] == signature
          client = create_client(client_key)
          client[:type] = "proxy" if result[:check][:source]
          update_client_registry(client) { yield(client) }
        else
          @logger.warn("invalid check result signature", {
            :result => result,
            :signature => signature
          })
          yield(nil)
        end
      end
    else
      client = Sensu::JSON.load(client_json)
      if !client[:signature] || client[:signature] == result[:signature]
        yield(client)
      else
        @logger.warn("invalid check result signature", {
          :result => result,
          :client => client
        })
        @logger.warn("not retrieving client from the registry", :result => result)
        yield(nil)
      end
    end
  end
end
703
+
704
+ # Determine if a keepalive event exists for a client.
705
+ #
706
+ # @param client_name [String] name of client to look up in event registry.
707
+ # @yieldparam event_exists [TrueClass, FalseClass] passed to the callback/block.
708
def keepalive_event_exists?(client_name)
  # The answer is yielded to the caller's block, not returned.
  @redis.hexists("events:#{client_name}", "keepalive") { |exists| yield(exists) }
end
713
+
714
+ # Process a check result, storing its data, inspecting its
715
+ # contents, and taking the appropriate actions (eg. update the
716
+ # event registry). The `@in_progress[:check_results]` counter is
717
+ # incremented by `1` prior to check result processing and then
718
+ # decremented by `1` after updating the event registry. A check
719
+ # result must have a valid client name, associated with a client
720
+ # in the registry or one will be created. If a local check
721
+ # definition exists for the check name, and the check result is
722
+ # not from a standalone check execution, it's merged with the
723
+ # check result for more context.
724
+ #
725
+ # @param result [Hash] data.
726
def process_check_result(result)
  @in_progress[:check_results] += 1
  @logger.debug("processing result", :result => result)
  retrieve_client(result) do |client|
    if client.nil?
      @logger.warn("halting result processing", :result => result)
      @in_progress[:check_results] -= 1
    else
      reported = result[:check]
      # A local definition (if any) provides context for results that
      # are not from standalone executions; result values win.
      check = if @settings.check_exists?(reported[:name]) && !reported[:standalone]
        @settings[:checks][reported[:name]].merge(reported)
      else
        reported
      end
      check[:type] ||= STANDARD_CHECK_TYPE
      # For sourced results, record which client published them.
      check[:origin] = result[:client] if check[:source]
      if @settings.check_exists?(check[:name]) && client[:type] == "proxy"
        # Proxy clients always get the locally defined command.
        check[:command] = @settings[:checks][check[:name].to_sym][:command]
      end
      aggregate_check_result(client, check) if check[:aggregates] || check[:aggregate]
      store_check_result(client, check) do
        create_event(client, check) do |event|
          event_bridges(event)
          update_event_registry(event) do |process|
            process_event(event) if process
            @in_progress[:check_results] -= 1
          end
        end
      end
    end
  end
end
758
+
759
+ # Set up the check result consumer. The consumer receives JSON
760
+ # serialized check results from the transport, parses them, and
761
+ # calls `process_check_result()` with the result data to be
762
+ # processed. Transport message acknowledgements are used to
763
+ # ensure that results make it to processing. The transport
764
+ # message acknowledgements are currently done in the next tick
765
+ # of the EventMachine reactor (event loop), as a flow control
766
+ # mechanism. Result JSON parsing errors are logged.
767
def setup_results
  # Use the configured results pipe when one is set, otherwise fall
  # back to the default "results" pipe.
  server_settings = @settings[:sensu][:server]
  results_pipe = (server_settings && server_settings[:results_pipe]) || "results"
  @logger.debug("subscribing to results", :pipe => results_pipe)
  @transport.subscribe(:direct, results_pipe, "results", :ack => true) do |message_info, message|
    if message_size_ok?(message)
      begin
        parsed = Sensu::JSON.load(message)
        @logger.debug("received result", :result => parsed)
        process_check_result(parsed)
      rescue Sensu::JSON::ParseError => error
        @logger.error("failed to parse result payload", {
          :message => message,
          :error => error.to_s
        })
      end
    end
    # Every message is acknowledged on the next reactor tick, whether
    # or not it could be processed.
    EM::next_tick { @transport.ack(message_info) }
  end
end
791
+
792
+ # Determine the Sensu Transport publish options for a
793
+ # subscription. If a subscription begins with a Transport pipe
794
+ # type, either "direct:" or "roundrobin:", the subscription uses
795
+ # a direct Transport pipe. If a subscription does not specify a
796
+ # Transport pipe type, a fanout Transport pipe is used.
797
+ #
798
+ # @param subscription [String]
799
+ # @param message [String]
800
+ # @return [Array] containing the Transport publish options:
801
+ # the Transport pipe type, pipe, and the message to be
802
+ # published.
803
def transport_publish_options(subscription, message)
  # A "direct:" or "roundrobin:" prefix selects a direct pipe; any
  # other subscription is published to a fanout pipe.
  pipe_type = subscription.start_with?("direct:", "roundrobin:") ? :direct : :fanout
  [pipe_type, subscription, message]
end
812
+
813
+ # Publish a check request to the Transport. A check request is
814
+ # composed of a check definition (minus `:subscribers` and
815
+ # `:interval`) and an `:issued` timestamp. The check request is
816
+ # published to a Transport pipe, for each of the check
817
+ # `:subscribers` in its definition, eg. "webserver". JSON
818
+ # serialization is used when publishing the check request
819
+ # payload to the Transport pipes. Transport errors are logged.
820
+ #
821
+ # @param check [Hash] definition.
822
def publish_check_request(check)
  # Purpose: publish one check request per subscription listed in the
  # check definition.
  #
  # The request payload is the check definition without `:subscribers`
  # and `:interval`, plus an `:issued` timestamp. Publish errors are
  # logged and do not interrupt the remaining subscriptions.
  #
  # @param check [Hash] definition.
  payload = check.reject do |key, _|
    [:subscribers, :interval].include?(key)
  end
  payload[:issued] = Time.now.to_i
  @logger.info("publishing check request", {
    :payload => payload,
    :subscribers => check[:subscribers]
  })
  # The payload is identical for every subscription, so it is
  # serialized once instead of once per subscriber.
  message = Sensu::JSON.dump(payload)
  check[:subscribers].each do |subscription|
    options = transport_publish_options(subscription, message)
    @transport.publish(*options) do |info|
      if info[:error]
        @logger.error("failed to publish check request", {
          :subscription => subscription,
          :payload => payload,
          :error => info[:error].to_s
        })
      end
    end
  end
end
844
+
845
+ # Determine and return clients from the registry that match a
846
+ # set of attributes.
847
+ #
848
+ # @param clients [Array] of client names.
849
+ # @param attributes [Hash]
850
+ # @yield [Array] callback/block called after determining the
851
+ # matching clients, returning them as a block parameter.
852
def determine_matching_clients(clients, attributes)
  registry_keys = clients.map { |name| "client:#{name}" }
  @redis.mget(*registry_keys) do |json_objects|
    # Names without a registry entry come back as nil and are skipped.
    matches = json_objects.each_with_object([]) do |json, found|
      next if json.nil?
      candidate = Sensu::JSON.load(json)
      found << candidate if attributes_match?(candidate, attributes)
    end
    yield(matches)
  end
end
867
+
868
+ # Publish a proxy check request for a client. This method
869
+ # substitutes client tokens in the check definition prior to
870
+ # publishing the check request. If there are unmatched client
871
+ # tokens, a warning is logged, and a check request is not
872
+ # published.
873
+ #
874
+ # @param client [Hash] definition.
875
+ # @param check [Hash] definition.
876
def publish_proxy_check_request(client, check)
  @logger.debug("creating a proxy check request", :client => client, :check => check)
  # Token substitution runs on a copy, leaving the check definition
  # passed in untouched.
  substituted, missing_tokens = object_substitute_tokens(deep_dup(check.dup), client)
  if missing_tokens.empty?
    substituted[:source] ||= client[:name]
    publish_check_request(substituted)
  else
    @logger.warn("failed to publish a proxy check request", {
      :reason => "unmatched client tokens",
      :unmatched_tokens => missing_tokens,
      :client => client,
      :check => check
    })
  end
end
894
+
895
+ # Publish proxy check requests for one or more clients. This
896
+ # method can optionally splay proxy check requests, evenly, over
897
+ # a period of time, determined by the check interval and a
898
+ # configurable splay coverage percentage. For example, splay
899
+ # proxy check requests over 60s * 90%, 54s, leaving 6s for the
900
+ # last proxy check execution before the next round of proxy
901
+ # check requests for the same check. The
902
+ # `publish_proxy_check_request()` method is used to publish the
903
+ # proxy check requests.
904
+ #
905
+ # @param clients [Array] of client definitions.
906
+ # @param check [Hash] definition.
907
def publish_proxy_check_requests(clients, check)
  splay = 0
  if check[:proxy_requests][:splay]
    # Cron checks use the time until the next cron run as the interval.
    interval = check[:cron] ? determine_check_cron_time(check) : check[:interval]
    unless interval.nil?
      coverage = check[:proxy_requests].fetch(:splay_coverage, 90)
      splay = interval * (coverage / 100.0) / clients.length
    end
  end
  delay = 0
  clients.each do |client|
    if splay == 0
      publish_proxy_check_request(client, check)
    else
      # Each request is scheduled one splay step after the previous one.
      EM::Timer.new(delay) { publish_proxy_check_request(client, check) }
      delay += splay
    end
  end
end
932
+
933
+ # Create and publish one or more proxy check requests. This
934
+ # method iterates through the Sensu client registry for clients
935
+ # that matched provided proxy request client attributes. A proxy
936
+ # check request is created for each client in the registry that
937
+ # matches the proxy request client attributes. Proxy check
938
+ # requests have their client tokens substituted by the associated
939
+ # client attributes values. The `determine_matching_clients()`
940
+ # method is used to fetch and inspect each slice of clients from
941
+ # the registry, returning those that match the configured proxy
942
+ # request client attributes. A relatively small clients slice
943
+ # size (20) is used to reduce the number of clients inspected
944
+ # within a single tick of the EM reactor. The
945
+ # `publish_proxy_check_requests()` method is used to iterate
946
+ # through the matching Sensu clients, creating their own unique
947
+ # proxy check request, substituting client tokens, and then
948
+ # publishing them to the targeted subscriptions.
949
+ #
950
+ # @param check [Hash] definition.
951
def create_proxy_check_requests(check)
  client_attributes = check[:proxy_requests][:client_attributes]
  return if client_attributes.empty?
  @redis.smembers("clients") do |clients|
    last_index = clients.length - 1
    # Walk the registry in slices of 20 names, accumulating matches,
    # and publish once the slice start passes the last client.
    walk_slices = Proc.new do |matching_clients, slice_start, slice_end|
      if slice_start > last_index
        publish_proxy_check_requests(matching_clients, check)
      else
        names = clients.slice(slice_start..slice_end)
        determine_matching_clients(names, client_attributes) do |additional_clients|
          walk_slices.call(matching_clients + additional_clients, slice_start + 20, slice_end + 20)
        end
      end
    end
    walk_slices.call([], 0, 19)
  end
end
971
+
972
+ # Create a check request proc, used to publish check requests
973
+ # for a check to the Sensu transport. Check requests are not
974
+ # published if subdued. This method determines if a check uses
975
+ # proxy check requests and calls the appropriate check request
976
+ # publish method.
977
+ #
978
+ # @param check [Hash] definition.
979
def create_check_request_proc(check)
  Proc.new do
    if check_subdued?(check)
      @logger.info("check request was subdued", :check => check)
    elsif check[:proxy_requests]
      create_proxy_check_requests(check)
    else
      publish_check_request(check)
    end
  end
end
992
+
993
+ # Schedule a check request, using the check cron. This method
994
+ # determines the time until the next cron time (in seconds) and
995
+ # creates an EventMachine timer for the request. This method will
996
+ # be called after every check cron request for subsequent
997
+ # requests. The timer is stored in the timer hash under
998
+ # `:tasks`, so it can be cancelled etc. The check cron request
999
+ # timer object is removed from the timer hash after the request
1000
+ # is published, to stop the timer hash from growing infinitely.
1001
+ #
1002
+ # @param check [Hash] definition.
1003
def schedule_check_cron_request(check)
  # Purpose: schedule the next cron-based request for a check and
  # re-schedule itself after each run.
  #
  # EventMachine invokes a timer block without arguments, so the timer
  # is captured in a local variable. The fired timer can then be
  # removed from the timers list, keeping the list from growing with
  # every cron run.
  #
  # @param check [Hash] definition.
  cron_time = determine_check_cron_time(check)
  timer = nil
  timer = EM::Timer.new(cron_time) do
    create_check_request_proc(check).call
    @timers[:tasks][:check_request_publisher].delete(timer)
    schedule_check_cron_request(check)
  end
  @timers[:tasks][:check_request_publisher] << timer
end
1011
+
1012
+ # Calculate a check request splay, taking into account the
1013
+ # current time and the request interval to ensure it's
1014
+ # consistent between process restarts.
1015
+ #
1016
+ # @param check [Hash] definition.
1017
def calculate_check_request_splay(check)
  # The first 8 bytes of the check name's MD5 digest, read as a
  # little-endian unsigned 64-bit integer.
  name_hash = Digest::MD5.digest(check[:name]).unpack("Q<")[0]
  now_ms = (Time.now.to_f * 1000).to_i
  interval_ms = check[:interval] * 1000
  # Returns seconds (Float) in the range [0, interval).
  ((name_hash - now_ms) % interval_ms) / 1000.0
end
1022
+
1023
+ # Schedule check requests, using the check interval. This method
1024
+ # uses an initial calculated request splay EventMachine timer
1025
+ # and an EventMachine periodic timer for subsequent check
1026
+ # requests. The timers are stored in the timers hash under
1027
+ # `:tasks`, so they can be cancelled etc.
1028
+ #
1029
+ # @param check [Hash] definition.
1030
def schedule_check_interval_requests(check)
  # When testing, requests start immediately and repeat every 0.5s.
  if testing?
    request_splay = 0
    interval = 0.5
  else
    request_splay = calculate_check_request_splay(check)
    interval = check[:interval]
  end
  @timers[:tasks][:check_request_publisher] << EM::Timer.new(request_splay) do
    request_proc = create_check_request_proc(check)
    request_proc.call
    # After the first request, the same proc runs on a periodic timer.
    @timers[:tasks][:check_request_publisher] << EM::PeriodicTimer.new(interval, &request_proc)
  end
end
1039
+
1040
+ # Schedule check requests. This method iterates through defined
1041
+ # checks and uses the appropriate method of check request
1042
+ # scheduling, either with the cron syntax or a numeric interval.
1043
+ #
1044
+ # @param checks [Array] of definitions.
1045
def schedule_checks(checks)
  checks.each do |check|
    # Checks with a cron expression use cron scheduling; all others
    # use their numeric interval.
    scheduler = check[:cron] ? :schedule_check_cron_request : :schedule_check_interval_requests
    send(scheduler, check)
  end
end
1054
+
1055
+ # Set up the check request publisher. This method creates an
1056
+ # array of check definitions, that are not standalone checks,
1057
+ # and do not have `:publish` set to `false`. The array is
1058
+ # provided to the `schedule_checks()` method.
1059
def setup_check_request_publisher
  @logger.debug("scheduling check requests")
  # Keep only checks that are neither standalone nor explicitly
  # configured with `:publish` set to `false`.
  publishable = @settings.checks.select do |check|
    !check[:standalone] && check[:publish] != false
  end
  schedule_checks(publishable)
end
1066
+
1067
+ # Publish a check result to the Transport for processing. A
1068
+ # check result is composed of a client name and a check
1069
+ # definition, containing check `:output` and `:status`. A client
1070
+ # signature is added to the check result payload if one is
1071
+ # registered for the client. JSON serialization is used when
1072
+ # publishing the check result payload to the Transport pipe.
1073
+ # Transport errors are logged.
1074
+ #
1075
+ # @param client_name [String]
1076
+ # @param check [Hash]
1077
def publish_check_result(client_name, check)
  payload = {:client => client_name, :check => check}
  @redis.get("client:#{client_name}:signature") do |signature|
    # The signature is only added when one is stored for the client.
    payload[:signature] = signature if signature
    @logger.debug("publishing check result", :payload => payload)
    @transport.publish(:direct, "results", Sensu::JSON.dump(payload)) do |info|
      next unless info[:error]
      @logger.error("failed to publish check result", {
        :payload => payload,
        :error => info[:error].to_s
      })
    end
  end
end
1095
+
1096
+ # Create a keepalive check definition for a client. Client
1097
+ # definitions may contain `:keepalive` configuration, containing
1098
+ # specific thresholds and handler information. The keepalive
1099
+ # check definition creation begins with default thresholds, and
1100
+ # sets the `:handler` to `keepalive`, if the handler has a local
1101
+ # definition. If the client provides its own `:keepalive`
1102
+ # configuration, it's deep merged with the defaults. The check
1103
+ # `:name`, `:issued`, and `:executed` values are always
1104
+ # overridden to guard against an invalid definition.
1105
+ #
1106
+ # @return [Array] check definition, unmatched client tokens
1107
def create_keepalive_check(client)
  # Default thresholds, in seconds since the last keepalive.
  check = {:thresholds => {:warning => 120, :critical => 180}}
  check[:handler] = "keepalive" if @settings.handler_exists?(:keepalive)
  # Global keepalive settings are merged first, then the client's own.
  global_keepalives = @settings[:sensu][:keepalives]
  check = deep_merge(check, global_keepalives) if global_keepalives
  check = deep_merge(check, client[:keepalive]) if client.has_key?(:keepalive)
  now = Time.now.to_i
  check.merge!(:name => "keepalive", :issued => now, :executed => now)
  object_substitute_tokens(check, client)
end
1127
+
1128
+ # Create client keepalive check results. This method will
1129
+ # retrieve clients from the registry, creating a keepalive
1130
+ # check definition for each client, using the
1131
+ # `create_keepalive_check()` method, containing client specific
1132
+ # keepalive thresholds. If the time since the latest keepalive
1133
+ # is equal to or greater than a threshold, the check `:output`
1134
+ # is set to a descriptive message, and `:status` is set to the
1135
+ # appropriate non-zero value. If a client has been sending
1136
+ # keepalives, `:output` and `:status` are set to indicate an OK
1137
+ # state. The `publish_check_result()` method is used to publish
1138
+ # the client keepalive check results.
1139
+ #
1140
+ # @param clients [Array] of client names.
1141
+ # @yield [] callback/block called after the client keepalive
1142
+ # check results have been created.
1143
def create_client_keepalive_check_results(clients)
  registry_keys = clients.map { |name| "client:#{name}" }
  @redis.mget(*registry_keys) do |client_json_objects|
    client_json_objects.each do |client_json|
      next if client_json.nil?
      client = Sensu::JSON.load(client_json)
      # Clients with keepalives explicitly disabled get no result.
      next if client[:keepalives] == false
      check, unmatched_tokens = create_keepalive_check(client)
      elapsed = Time.now.to_i - client[:timestamp]
      thresholds = check[:thresholds]
      if elapsed >= thresholds[:critical]
        check[:output] = "No keepalive sent from client for #{elapsed} seconds (>=#{thresholds[:critical]})"
        check[:status] = 2
      elsif elapsed >= thresholds[:warning]
        check[:output] = "No keepalive sent from client for #{elapsed} seconds (>=#{thresholds[:warning]})"
        check[:status] = 1
      else
        check[:output] = "Keepalive sent from client #{elapsed} seconds ago"
        check[:status] = 0
      end
      unless unmatched_tokens.empty?
        # Unmatched tokens raise an OK result to a warning.
        check[:output] << " - Unmatched client token(s): " + unmatched_tokens.join(", ")
        check[:status] = 1 if check[:status] == 0
      end
      publish_check_result(client[:name], check)
    end
    yield
  end
end
1176
+
1177
+ # Determine stale clients, those that have not sent a keepalive
1178
+ # in a specified amount of time. This method iterates through
1179
+ # the client registry, creating a keepalive check result for
1180
+ # each client. The `create_client_keepalive_check_results()`
1181
+ # method is used to inspect and create keepalive check results
1182
+ # for each slice of clients from the registry. A relatively
1183
+ # small clients slice size (20) is used to reduce the number of
1184
+ # clients inspected within a single tick of the EM reactor.
1185
def determine_stale_clients
  @logger.info("determining stale clients")
  @redis.smembers("clients") do |clients|
    last_index = clients.length - 1
    # Process the registry 20 names at a time; each slice starts only
    # after the previous slice has called back.
    process_slice = Proc.new do |slice_start, slice_end|
      if slice_start <= last_index
        names = clients.slice(slice_start..slice_end)
        create_client_keepalive_check_results(names) do
          process_slice.call(slice_start + 20, slice_end + 20)
        end
      end
    end
    process_slice.call(0, 19)
  end
end
1200
+
1201
+ # Set up the client monitor, a periodic timer to run
1202
+ # `determine_stale_clients()` every 30 seconds. The timer is
1203
+ # stored in the timers hash under `:tasks`.
1204
def setup_client_monitor
  @logger.debug("monitoring client keepalives")
  # Stale clients are determined every 30 seconds.
  @timers[:tasks][:client_monitor] << EM::PeriodicTimer.new(30) { determine_stale_clients }
end
1210
+
1211
+ # Create check TTL results. This method will retrieve check
1212
+ # results from the registry and determine the time since their
1213
+ # last check execution (in seconds). If the time since last
1214
+ # execution is equal to or greater than the defined check TTL, a
1215
+ # warning check result is published with the appropriate check
1216
+ # output.
1217
+ #
1218
+ # @param ttl_keys [Array] of TTL keys.
1219
+ # @param interval [Integer] to use for the check TTL result
1220
+ # interval.
1221
+ # @yield [] callback/block called after the check TTL results
1222
+ # have been created.
1223
def create_check_ttl_results(ttl_keys, interval=30)
  result_keys = ttl_keys.map { |ttl_key| "result:#{ttl_key}" }
  @redis.mget(*result_keys) do |result_json_objects|
    result_json_objects.each_with_index do |result_json, index|
      ttl_key = ttl_keys[index]
      if result_json.nil?
        # No stored result for this key, so drop it from the TTL set.
        @redis.srem("ttl", ttl_key)
        next
      end
      check = Sensu::JSON.load(result_json)
      next unless check[:ttl] && check[:executed] && !check[:force_resolve]
      elapsed = Time.now.to_i - check[:executed]
      next if elapsed < check[:ttl]
      client_name = ttl_key.split(":").first
      keepalive_event_exists?(client_name) do |event_exists|
        # A TTL result is only published when the client has no
        # keepalive event.
        next if event_exists
        check[:output] = "Last check execution was #{elapsed} seconds ago"
        check[:status] = check[:ttl_status] || 1
        check[:interval] = interval
        publish_check_result(client_name, check)
      end
    end
    yield
  end
end
1250
+
1251
+ # Determine stale check results, those that have not executed in
1252
+ # a specified amount of time (check TTL). This method iterates
1253
+ # through stored check results that have a defined TTL value (in
1254
+ # seconds). The `create_check_ttl_results()` method is used to
1255
+ # inspect each check result, calculating their time since last
1256
+ # check execution (in seconds). If the time since last execution
1257
+ # is equal to or greater than the check TTL, a warning check
1258
+ # result is published with the appropriate check output. A
1259
+ # relatively small check results slice size (20) is used to
1260
+ # reduce the number of check results inspected within a single
1261
+ # tick of the EM reactor.
1262
+ #
1263
+ # @param interval [Integer] to use for the check TTL result
1264
+ # interval.
1265
def determine_stale_check_results(interval=30)
  @logger.info("determining stale check results (ttl)")
  @redis.smembers("ttl") do |ttl_keys|
    last_index = ttl_keys.length - 1
    # Inspect the TTL keys 20 at a time; each slice starts only after
    # the previous slice has called back.
    process_slice = Proc.new do |slice_start, slice_end|
      if slice_start <= last_index
        keys_slice = ttl_keys.slice(slice_start..slice_end)
        create_check_ttl_results(keys_slice, interval) do
          process_slice.call(slice_start + 20, slice_end + 20)
        end
      end
    end
    process_slice.call(0, 19)
  end
end
1280
+
1281
+ # Set up the check result monitor, a periodic timer to run
1282
+ # `determine_stale_check_results()` every 30 seconds. The timer
1283
+ # is stored in the timers hash under `:tasks`.
1284
+ #
1285
+ # @param interval [Integer] to use for the check TTL result
1286
+ # interval.
1287
def setup_check_result_monitor(interval=30)
  @logger.debug("monitoring check results")
  # The same interval drives the timer and is passed through to
  # `determine_stale_check_results()`.
  @timers[:tasks][:check_result_monitor] << EM::PeriodicTimer.new(interval) { determine_stale_check_results(interval) }
end
1293
+
1294
+ # Create a lock timestamp (integer), current time including
1295
+ # milliseconds. This method is used by Sensu server task
1296
+ # election.
1297
+ #
1298
+ # @return [Integer]
1299
def create_lock_timestamp
  # Epoch seconds scaled by 10,000,000 and truncated to an Integer.
  ticks_per_second = 10_000_000
  (Time.now.to_f * ticks_per_second).to_i
end
1302
+
1303
+ # Create/return the unique Sensu server ID for the current
1304
+ # process.
1305
+ #
1306
+ # @return [String]
1307
def server_id
  # Generated on first use, then reused.
  @server_id = random_uuid unless @server_id
  @server_id
end
1310
+
1311
+ # Setup a Sensu server task. Unless the current process is
1312
+ # already responsible for the task, this method sets the tasks
1313
+ # server ID stored in Redis to the unique random server ID for
1314
+ # the process. If the tasks server ID is successfully updated,
1315
+ # the task is added to `@tasks` for tracking purposes and the
1316
+ # task setup method is called.
1317
+ #
1318
+ # @param task [String]
1319
+ # @yield callback/block called after setting up the task.
1320
def setup_task(task)
  if @tasks.include?(task)
    @logger.debug("i am already responsible for a server task", :task => task)
  else
    @redis.set("task:#{task}:server", server_id) do
      @logger.info("i am now responsible for a server task", :task => task)
      @tasks << task
      # The setup method is derived from the task name, e.g. a task
      # named "client_monitor" calls `setup_client_monitor()`.
      send(:"setup_#{task}")
      yield if block_given?
    end
  end
end
1332
+
1333
+ # Relinquish a Sensu server task. This method cancels and
1334
+ # clears the associated task timers, those with references
1335
+ # stored in the timers hash under `:tasks`, and removes the task
1336
+ # from `@tasks`. The task server ID and lock are not removed
1337
+ # from Redis, as they will be updated when another server takes
1338
+ # responsibility for the task, this method does not need to
1339
+ # handle Redis connectivity issues.
1340
+ #
1341
+ # @param task [String]
1342
def relinquish_task(task)
  if @tasks.include?(task)
    @logger.warn("relinquishing server task", :task => task)
    # Cancel every timer stored for the task, then empty its list.
    task_timers = @timers[:tasks][task.to_sym]
    task_timers.each(&:cancel)
    task_timers.clear
    @tasks.delete(task)
  else
    @logger.debug("not currently responsible for a server task", :task => task)
  end
end
1354
+
1355
+ # Relinquish all Sensu server tasks, if any.
1356
def relinquish_tasks
  if @tasks.empty?
    @logger.debug("not currently responsible for a server task")
  else
    # Iterate over a copy, since `@tasks` is also passed around and
    # may change while tasks are being relinquished.
    @tasks.dup.each { |task| relinquish_task(task) }
  end
end
1365
+
1366
+ # Updates a Sensu server task lock timestamp. The current task
1367
+ # server ID is retrieved from Redis and compared with the server
1368
+ # ID of the current process to determine if it is still
1369
+ # responsible for the task. If the current process is still
1370
+ # responsible, the task lock timestamp is updated. If the
1371
+ # current process is no longer responsible, `relinquish_task()`
1372
+ # is called for cleanup.
1373
+ #
1374
+ # @param task [String]
1375
def update_task_lock(task)
  @redis.get("task:#{task}:server") do |owner_id|
    if owner_id != server_id
      # The task has a different owner, so stop running it locally.
      @logger.warn("another sensu server is responsible for the task", :task => task)
      relinquish_task(task)
    else
      @redis.set("lock:task:#{task}", create_lock_timestamp) do
        @logger.debug("updated task lock timestamp", :task => task)
      end
    end
  end
end
1387
+
1388
+ # Set up a Sensu server task lock updater. This method uses a
1389
+ # periodic timer to update a task lock timestamp in Redis, every
1390
+ # 10 seconds. If the current process fails to keep the lock
1391
+ # timestamp updated for a task that it is responsible for,
1392
+ # another Sensu server will claim responsibility. This method is
1393
+ # called after task setup.
1394
+ #
1395
+ # @param task [String]
1396
def setup_task_lock_updater(task)
  # The task lock is refreshed every 10 seconds.
  @timers[:run] << EM::PeriodicTimer.new(10) { update_task_lock(task) }
end
1401
+
1402
+ # Request a Sensu server task election, a process to determine
1403
+ # if the current process is to be responsible for the task. A
1404
+ # Redis key/value is used as a central lock, using the "SETNX"
1405
+ # Redis command to set the key/value if it does not exist, using
1406
+ # a timestamp for the value. If the current process was able to
1407
+ # create the key/value, it is elected, and is then responsible
1408
+ # for the task. If the current process was not able to create
1409
+ # the key/value, but the current timestamp value is equal to or
1410
+ # over 30 seconds ago, the "GETSET" Redis command is used to set
1411
+ # a new timestamp and fetch the previous value to compare them,
1412
+ # to determine if it was set by the current process. If the
1413
+ # current process is able to set the timestamp value, it is
1414
+ # elected. If elected, the current process sets up the task and
1415
+ # the associated task lock updater.
1416
+ #
1417
+ # @param task [String]
1418
+ # @yield callback/block called either after being elected and
1419
+ # setting up the task, or after failing to be elected.
1420
def request_task_election(task, &callback)
  # Purpose: try to become responsible for a server task, using a
  # Redis lock key as the arbiter.
  #
  # The process is elected when it creates the lock key, or when the
  # existing lock is at least 30 seconds old (300000000 lock timestamp
  # units) and it wins the GETSET race to replace it. When elected,
  # the task and its lock updater are set up and the callback is
  # handed to `setup_task()`. In every not-elected outcome the
  # callback is invoked here, including a lost GETSET race, so the
  # caller can always continue.
  #
  # @param task [String]
  # @yield callback/block called either after being elected and
  #   setting up the task, or after failing to be elected.
  lock_key = "lock:task:#{task}"
  @redis.setnx(lock_key, create_lock_timestamp) do |created|
    if created
      setup_task(task, &callback)
      setup_task_lock_updater(task)
    else
      @redis.get(lock_key) do |current_lock_timestamp|
        new_lock_timestamp = create_lock_timestamp
        if new_lock_timestamp - current_lock_timestamp.to_i >= 300000000
          @redis.getset(lock_key, new_lock_timestamp) do |previous_lock_timestamp|
            if previous_lock_timestamp == current_lock_timestamp
              setup_task(task, &callback)
              setup_task_lock_updater(task)
            else
              # The lock changed between GET and GETSET, so the
              # election was lost; still report back to the caller.
              callback.call if callback
            end
          end
        else
          callback.call if callback
        end
      end
    end
  end
end
1442
+
1443
+ # Request Sensu server task elections. The task list is ordered
1444
+ # by priority. This method works through the task list serially,
1445
+ # increasing the election request delay as the current process
1446
+ # becomes responsible for one or more tasks, this is to improve
1447
+ # the initial distribution of tasks amongst Sensu servers.
1448
+ #
1449
+ # @param splay [Integer]
1450
def setup_task_elections(splay=10)
  pending_tasks = TASKS.dup - @tasks
  elect_next = Proc.new do
    task = pending_tasks.shift
    if task
      # The delay grows with the number of tasks already held.
      delay = splay * @tasks.size
      @timers[:run] << EM::Timer.new(delay) { request_task_election(task, &elect_next) }
    else
      # All pending tasks were tried; start over in 10 seconds.
      @timers[:run] << EM::Timer.new(10) { setup_task_elections(splay) }
    end
  end
  elect_next.call
end
1467
+
1468
+ # Update the Sensu server registry, stored in Redis. This method
1469
+ # adds the local/current Sensu server info to the registry,
1470
+ # including its id, hostname, address, its server tasks, and
1471
+ # some metrics. Sensu server registry entries expire in 30
1472
+ # seconds unless updated.
1473
+ #
1474
+ # @yield [success] passes success status to optional
1475
+ # callback/block.
1476
+ # @yieldparam success [TrueClass,FalseClass] indicating if the
1477
+ # server registry update was a success.
1478
def update_server_registry
  @logger.debug("updating the server registry")
  process_cpu_times do |cpu_user, cpu_system, _, _|
    release = RELEASE_INFO.merge(:settings => {:hexdigest => @settings.hexdigest})
    tessen_settings = @settings[:tessen] || {}
    info = {
      :id => server_id,
      :hostname => system_hostname,
      :address => system_address,
      :tasks => @tasks,
      :metrics => {:cpu => {:user => cpu_user, :system => cpu_system}},
      :sensu => release,
      :tessen => {:enabled => tessen_settings.fetch(:enabled, false)},
      :timestamp => Time.now.to_i
    }
    @redis.sadd("servers", server_id)
    registry_key = "server:#{server_id}"
    @redis.set(registry_key, Sensu::JSON.dump(info)) do
      # The entry expires after 30 seconds unless it is written again.
      @redis.expire(registry_key, 30)
      @logger.info("updated server registry", :server => info)
      yield(true) if block_given?
    end
  end
end
1514
+
1515
+ # Set up the server registry updater. A periodic timer is
1516
+ # used to update the Sensu server info stored in Redis. The
1517
+ # timer is stored in the timers hash under `:run`.
1518
def setup_server_registry_updater
  # Update once right away, then every 10 seconds.
  update_server_registry
  @timers[:run] << EM::PeriodicTimer.new(10) { update_server_registry }
end
1524
+
1525
+ # Set up Tessen, the call home mechanism.
1526
def setup_tessen
  @tessen = Tessen.new(:settings => @settings, :logger => @logger, :redis => @redis)
  # Tessen only runs when it reports itself as enabled.
  @tessen.run if @tessen.enabled?
end
1534
+
1535
+ # Unsubscribe from transport subscriptions (all of them). This
1536
+ # method is called when there are issues with connectivity, or
1537
+ # the process is stopping.
1538
def unsubscribe
  @logger.warn("unsubscribing from keepalive and result queues")
  # Nothing to unsubscribe from when no transport has been set.
  return unless @transport
  @transport.unsubscribe
end
1542
+
1543
+ # Complete in progress work and then call the provided callback.
1544
+ # This method will wait until all counters stored in the
1545
+ # `@in_progress` hash equal `0`.
1546
+ #
1547
+ # @yield [] callback/block to call when in progress work is
1548
+ # completed.
1549
def complete_in_progress
  @logger.info("completing work in progress", :in_progress => @in_progress)
  retry_until_true do
    # Work is finished once every counter in `@in_progress` is zero.
    work_finished = @in_progress.values.all? { |pending| pending == 0 }
    # A nil result here presumably makes `retry_until_true` try again
    # later; confirm against its definition.
    next unless work_finished
    yield
    true
  end
end
1558
+
1559
+ # Bootstrap the Sensu server process, setting up the keepalive
1560
+ # and check result consumers, and attempting to carry out Sensu
1561
+ # server tasks. This method sets the process/daemon `@state` to
1562
+ # `:running`.
1563
def bootstrap
  # The setup steps run strictly in this order.
  bootstrap_steps = [
    :setup_keepalives,
    :setup_results,
    :setup_task_elections,
    :setup_server_registry_updater,
    :setup_tessen
  ]
  bootstrap_steps.each { |step| send(step) }
  # Mark the process as running only after every step has been called.
  @state = :running
end
1571
+
1572
+ # Start the Sensu server process, connecting to Redis, the
1573
+ # Transport, and calling the `bootstrap()` method. Yield if a
1574
+ # block is provided.
1575
def start
  setup_connections do
    bootstrap
    # The caller's block is optional.
    next unless block_given?
    yield
  end
end
1581
+
1582
+ # Pause the Sensu server process, unless it is being paused or
1583
+ # has already been paused. The process/daemon `@state` is first
1584
+ # set to `:pausing`, to indicate that it's in progress. All run
1585
+ # timers are cancelled, their references are cleared, and Tessen
1586
+ # is stopped. The Sensu server will unsubscribe from all
1587
+ # transport subscriptions, relinquish any Sensu server tasks,
1588
+ # then set the process/daemon `@state` to `:paused`.
1589
def pause
  # Nothing to do when a pause is already under way or has completed.
  return if @state == :pausing || @state == :paused
  @state = :pausing
  # Cancel every run timer, then drop the references to them.
  @timers[:run].each(&:cancel)
  @timers[:run].clear
  @tessen.stop if @tessen
  unsubscribe
  relinquish_tasks
  @state = :paused
end
1602
+
1603
+ # Resume the Sensu server process if it is currently or will
1604
+ # soon be paused. The `retry_until_true` helper method is used
1605
+ # to determine if the process is paused and if the Redis and
1606
+ # transport connections are initiated and connected. If the
1607
+ # conditions are met, `bootstrap()` will be called and true is
1608
+ # returned to stop `retry_until_true`. If the transport has not
1609
+ # yet been initiated, true is returned, without calling
1610
+ # bootstrap, as we expect bootstrap will be called after the
1611
+ # transport initializes.
1612
def resume
  retry_until_true(1) do
    # Keep retrying until the process is paused and Redis is connected.
    next unless @state == :paused
    next unless @redis.connected?
    # No transport object yet: finish without bootstrapping.
    next true unless @transport
    next unless @transport.connected?
    bootstrap
    true
  end
end
1628
+
1629
+ # Stop the Sensu server process, pausing it, completing event
1630
+ # handling in progress, closing the Redis and transport
1631
+ # connections, and exiting the process (exit 0). After pausing
1632
+ # the process, the process/daemon `@state` is set to
1633
+ # `:stopping`.
1634
def stop
  @logger.warn("stopping")
  pause
  @state = :stopping
  complete_in_progress do
    # Close whichever connections exist, Redis first.
    [@redis, @transport].each do |connection|
      connection.close if connection
    end
    # Hand over to the parent implementation of `stop`.
    super
  end
end
1644
+ end
1645
+ end
1646
+ end