sensu 0.16.0-java → 0.17.0.beta.1-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,811 @@
1
+ require "sensu/daemon"
2
+ require "sensu/server/filter"
3
+ require "sensu/server/mutate"
4
+ require "sensu/server/handle"
5
+
6
+ module Sensu
7
+ module Server
8
+ class Process
9
+ include Daemon
10
+ include Filter
11
+ include Mutate
12
+ include Handle
13
+
14
+ attr_reader :is_master, :handling_event_count
15
+
16
+ # Create an instance of the Sensu server process, start the
17
+ # server within the EventMachine event loop, and set up server
18
+ # process signal traps (for stopping).
19
+ #
20
+ # @param options [Hash]
21
+ def self.run(options={})
22
+ server = self.new(options)
23
+ EM::run do
24
+ server.start
25
+ server.setup_signal_traps
26
+ end
27
+ end
28
+
29
+ # Override Daemon initialize() to support Sensu server master
30
+ # election and the handling event count.
31
+ #
32
+ # @param options [Hash]
33
+ def initialize(options={})
34
+ super
35
+ @is_master = false
36
+ @timers[:master] = Array.new
37
+ @handling_event_count = 0
38
+ end
39
+
40
+ # Update the Sensu client registry, stored in Redis. Sensu
41
+ # client data is used to provide additional event context and
42
+ # enable agent health monitoring. JSON serialization is used for
43
+ # the client data.
44
+ #
45
+ # @param client [Hash]
46
+ # @param callback [Proc] to call after the client data has
47
+ # been added to (or updated) the registry.
48
+ def update_client_registry(client, &callback)
49
+ @logger.debug("updating client registry", :client => client)
50
+ @redis.set("client:#{client[:name]}", MultiJson.dump(client)) do
51
+ @redis.sadd("clients", client[:name]) do
52
+ callback.call
53
+ end
54
+ end
55
+ end
56
+
57
+ # Set up the client keepalive consumer, keeping the Sensu client
58
+ # registry updated. The consumer receives JSON serialized client
59
+ # keepalives from the transport, parses them, and calls
60
+ # `update_client_registry()` with the client data to update the
61
+ # registry. Transport message acknowledgements are used to
62
+ # ensure the client registry is updated successfully. Keepalive
63
+ # JSON parsing errors are logged.
64
+ def setup_keepalives
65
+ @logger.debug("subscribing to keepalives")
66
+ @transport.subscribe(:direct, "keepalives", "keepalives", :ack => true) do |message_info, message|
67
+ @logger.debug("received keepalive", :message => message)
68
+ begin
69
+ client = MultiJson.load(message)
70
+ update_client_registry(client) do
71
+ @transport.ack(message_info)
72
+ end
73
+ rescue MultiJson::ParseError => error
74
+ @logger.error("failed to parse keepalive payload", {
75
+ :message => message,
76
+ :error => error.to_s
77
+ })
78
+ @transport.ack(message_info)
79
+ end
80
+ end
81
+ end
82
+
83
+ # Expand event handler sets, creating an array of handler
84
+ # definitions. Handler sets cannot be deeply nested (by choice),
85
+ # this method will return `nil` if an attempt is made to deeply
86
+ # nest. If the provided handler definition is not a set, it is
87
+ # returned.
88
+ #
89
+ # @param handler [Hash] definition.
90
+ # @param depth [Integer] of the expansion.
91
+ # @return [Array, Hash, Nil]
92
+ def expand_handler_sets(handler, depth=0)
93
+ if handler[:type] == "set"
94
+ if depth < 2
95
+ derive_handlers(handler[:handlers], depth + 1)
96
+ else
97
+ @logger.error("handler sets cannot be deeply nested", :handler => handler)
98
+ nil
99
+ end
100
+ else
101
+ handler
102
+ end
103
+ end
104
+
105
+ # Derive an array of handler definitions from a list of handler
106
+ # names. This method first checks for the existence of standard
107
+ # handlers, followed by handler extensions. If a handler does
108
+ # not exist for a name, it is logged and ignored. Duplicate
109
+ # handler definitions are removed.
110
+ #
111
+ # @param handler_list [Array]
112
+ # @param depth [Integer] of handler set expansion.
113
+ # @return [Array]
114
+ def derive_handlers(handler_list, depth=0)
115
+ handler_list.compact.map { |handler_name|
116
+ case
117
+ when @settings.handler_exists?(handler_name)
118
+ handler = @settings[:handlers][handler_name].merge(:name => handler_name)
119
+ expand_handler_sets(handler, depth)
120
+ when @extensions.handler_exists?(handler_name)
121
+ @extensions[:handlers][handler_name]
122
+ else
123
+ @logger.error("unknown handler", :handler_name => handler_name)
124
+ nil
125
+ end
126
+ }.flatten.compact.uniq
127
+ end
128
+
129
+ # Run event bridge extensions, within the Sensu EventMachine
130
+ # reactor (event loop). The extension API `safe_run()` method is
131
+ # used to guard against most errors. Bridges are for relaying
132
+ # Sensu event data to other services.
133
+ #
134
+ # @param event [Hash]
135
+ def event_bridges(event)
136
+ @extensions[:bridges].each do |name, bridge|
137
+ bridge.safe_run(event) do |output, status|
138
+ @logger.debug("bridge extension output", {
139
+ :extension => bridge.definition,
140
+ :output => output
141
+ })
142
+ end
143
+ end
144
+ end
145
+
146
+ # Process an event: filter -> mutate -> handle.
147
+ #
148
+ # This method runs event bridges, relaying the event data to
149
+ # other services. This method also determines the appropriate
150
+ # handlers for the event, filtering and mutating the event data
151
+ # for each of them. The `@handling_event_count` is incremented
152
+ # by `1`, for each event handler chain (filter -> mutate ->
153
+ # handle).
154
+ #
155
+ # @param event [Hash]
156
+ def process_event(event)
157
+ log_level = event[:check][:type] == "metric" ? :debug : :info
158
+ @logger.send(log_level, "processing event", :event => event)
159
+ event_bridges(event)
160
+ handler_list = Array((event[:check][:handlers] || event[:check][:handler]) || "default")
161
+ handlers = derive_handlers(handler_list)
162
+ handlers.each do |handler|
163
+ @handling_event_count += 1
164
+ filter_event(handler, event) do |event|
165
+ mutate_event(handler, event) do |event_data|
166
+ handle_event(handler, event_data)
167
+ end
168
+ end
169
+ end
170
+ end
171
+
172
+ # Add a check result to an aggregate. A check aggregate uses the
173
+ # check `:name` and the `:issued` timestamp as its unique
174
+ # identifier. An aggregate uses several counters: the total
175
+ # number of results in the aggregate, and a counter for each
176
+ # check severity (ok, warning, etc). Check output is also
177
+ # stored, to be summarized to aid in identifying outliers for a
178
+ # check execution across a number of Sensu clients. JSON
179
+ # serialization is used for storing check result data.
180
+ #
181
+ # @param result [Hash]
182
+ def aggregate_check_result(result)
183
+ @logger.debug("adding check result to aggregate", :result => result)
184
+ check = result[:check]
185
+ result_set = "#{check[:name]}:#{check[:issued]}"
186
+ result_data = MultiJson.dump(:output => check[:output], :status => check[:status])
187
+ @redis.hset("aggregation:#{result_set}", result[:client], result_data) do
188
+ SEVERITIES.each do |severity|
189
+ @redis.hsetnx("aggregate:#{result_set}", severity, 0)
190
+ end
191
+ severity = (SEVERITIES[check[:status]] || "unknown")
192
+ @redis.hincrby("aggregate:#{result_set}", severity, 1) do
193
+ @redis.hincrby("aggregate:#{result_set}", "total", 1) do
194
+ @redis.sadd("aggregates:#{check[:name]}", check[:issued]) do
195
+ @redis.sadd("aggregates", check[:name])
196
+ end
197
+ end
198
+ end
199
+ end
200
+ end
201
+
202
+ # Store check result data. This method stores the 21 most recent
203
+ # check result statuses for a client/check pair, this history
204
+ # is used for event context and flap detection. The check
205
+ # execution timestamp is also stored, to provide an indication
206
+ # of how recent the data is.
207
+ #
208
+ # @param client [Hash]
209
+ # @param check [Hash]
210
+ # @param callback [Proc] to call when the check result data has
211
+ # been stored (history, etc).
212
+ def store_check_result(client, check, &callback)
213
+ @redis.sadd("history:#{client[:name]}", check[:name])
214
+ result_key = "#{client[:name]}:#{check[:name]}"
215
+ history_key = "history:#{result_key}"
216
+ @redis.rpush(history_key, check[:status]) do
217
+ @redis.set("execution:#{result_key}", check[:executed])
218
+ @redis.ltrim(history_key, -21, -1)
219
+ callback.call
220
+ end
221
+ end
222
+
223
+ # Fetch the execution history for a client/check pair, the 21
224
+ # most recent check result statuses. This method also calculates
225
+ # the total state change percentage for the history, this value
226
+ # is used for check state flap detection, using a similar
227
+ # algorithm to Nagios:
228
+ # http://nagios.sourceforge.net/docs/3_0/flapping.html
229
+ #
230
+ # @param client [Hash]
231
+ # @param check [Hash]
232
+ # @param callback [Proc] to be called with the check history and
233
+ # total state change value.
234
+ def check_history(client, check, &callback)
235
+ history_key = "history:#{client[:name]}:#{check[:name]}"
236
+ @redis.lrange(history_key, -21, -1) do |history|
237
+ total_state_change = 0
238
+ unless history.size < 21
239
+ state_changes = 0
240
+ change_weight = 0.8
241
+ previous_status = history.first
242
+ history.each do |status|
243
+ unless status == previous_status
244
+ state_changes += change_weight
245
+ end
246
+ change_weight += 0.02
247
+ previous_status = status
248
+ end
249
+ total_state_change = (state_changes.fdiv(20) * 100).to_i
250
+ end
251
+ callback.call(history, total_state_change)
252
+ end
253
+ end
254
+
255
+ # Determine if a check state is flapping, rapidly changing
256
+ # between an OK and non-OK state. Flap detection is only done
257
+ # for checks that have defined low and high flap detection
258
+ # thresholds, `:low_flap_threshold` and `:high_flap_threshold`.
259
+ # The `check_history()` method provides the check history and
260
+ # more importantly the total state change percentage value that
261
+ # is compared with the configured thresholds defined in the
262
+ # check data. If a check hasn't been flapping, the
263
+ # `:total_state_change` must be equal to or higher than the
264
+ # `:high_flap_threshold` to be changed to flapping. If a check
265
+ # has been flapping, the `:total_state_change` must be equal to
266
+ # or lower than the `:low_flap_threshold` to no longer be
267
+ # flapping. This method uses the same algorithm as Nagios:
268
+ # http://nagios.sourceforge.net/docs/3_0/flapping.html
269
+ #
270
+ # @param stored_event [Hash]
271
+ # @param check [Hash]
272
+ # @return [TrueClass, FalseClass]
273
+ def check_flapping?(stored_event, check)
274
+ if check.has_key?(:low_flap_threshold) && check.has_key?(:high_flap_threshold)
275
+ was_flapping = stored_event && stored_event[:action] == "flapping"
276
+ check[:total_state_change] >= check[:high_flap_threshold] ||
277
+ (was_flapping && check[:total_state_change] <= check[:low_flap_threshold]) ||
278
+ was_flapping
279
+ else
280
+ false
281
+ end
282
+ end
283
+
284
+ # Update the event registry, stored in Redis. This method
285
+ # determines if check data results in the creation or update of
286
+ # event data in the registry. Existing event data for a
287
+ # client/check pair is fetched, used in conditionals and the
288
+ # composition of the new event data. If a check `:status` is not
289
+ # `0`, or it has been flapping, an event is created/updated in
290
+ # the registry. If there was existing event data, but the check
291
+ # `:status` is now `0`, the event is removed (resolved) from the
292
+ # registry. If the previous conditions are not met, and check
293
+ # `:type` is `metric` and the `:status` is `0`, the event
294
+ # registry is not updated, but the provided callback is called
295
+ # with the event data. JSON serialization is used when storing
296
+ # data in the registry.
297
+ #
298
+ # @param client [Hash]
299
+ # @param check [Hash]
300
+ # @param callback [Proc] to be called with the resulting event
301
+ # data if the event registry is updated, or the check is of
302
+ # type `:metric`.
303
+ def update_event_registry(client, check, &callback)
304
+ @redis.hget("events:#{client[:name]}", check[:name]) do |event_json|
305
+ stored_event = event_json ? MultiJson.load(event_json) : nil
306
+ flapping = check_flapping?(stored_event, check)
307
+ event = {
308
+ :id => random_uuid,
309
+ :client => client,
310
+ :check => check,
311
+ :occurrences => 1
312
+ }
313
+ if check[:status] != 0 || flapping
314
+ if stored_event && check[:status] == stored_event[:check][:status]
315
+ event[:occurrences] = stored_event[:occurrences] + 1
316
+ end
317
+ event[:action] = flapping ? :flapping : :create
318
+ @redis.hset("events:#{client[:name]}", check[:name], MultiJson.dump(event)) do
319
+ callback.call(event)
320
+ end
321
+ elsif stored_event
322
+ event[:occurrences] = stored_event[:occurrences]
323
+ event[:action] = :resolve
324
+ unless check[:auto_resolve] == false && !check[:force_resolve]
325
+ @redis.hdel("events:#{client[:name]}", check[:name]) do
326
+ callback.call(event)
327
+ end
328
+ end
329
+ elsif check[:type] == "metric"
330
+ callback.call(event)
331
+ end
332
+ end
333
+ end
334
+
335
+ # Process a check result, storing its data, inspecting its
336
+ # contents, and taking the appropriate actions (eg. update the
337
+ # event registry). A check result must have a valid client name,
338
+ # associated with a client in the registry. Results without a
339
+ # valid client are discarded, to keep the system "correct". If a
340
+ # local check definition exists for the check name, and the
341
+ # check result is not from a standalone check execution, it's
342
+ # merged with the check result for more context.
343
+ #
344
+ # @param result [Hash] data.
345
+ def process_check_result(result)
346
+ @logger.debug("processing result", :result => result)
347
+ @redis.get("client:#{result[:client]}") do |client_json|
348
+ unless client_json.nil?
349
+ client = MultiJson.load(client_json)
350
+ check = case
351
+ when @settings.check_exists?(result[:check][:name]) && !result[:check][:standalone]
352
+ @settings[:checks][result[:check][:name]].merge(result[:check])
353
+ else
354
+ result[:check]
355
+ end
356
+ aggregate_check_result(result) if check[:aggregate]
357
+ store_check_result(client, check) do
358
+ check_history(client, check) do |history, total_state_change|
359
+ check[:history] = history
360
+ check[:total_state_change] = total_state_change
361
+ update_event_registry(client, check) do |event|
362
+ process_event(event)
363
+ end
364
+ end
365
+ end
366
+ else
367
+ @logger.warn("client not in registry", :client => result[:client])
368
+ end
369
+ end
370
+ end
371
+
372
+ # Set up the check result consumer. The consumer receives JSON
373
+ # serialized check results from the transport, parses them, and
374
+ # calls `process_check_result()` with the result data to be
375
+ # processed. Transport message acknowledgements are used to
376
+ # ensure that results make it to processing. The transport
377
+ # message acknowledgements are currently done in the next tick
378
+ # of the EventMachine reactor (event loop), as a flow control
379
+ # mechanism. Result JSON parsing errors are logged.
380
+ def setup_results
381
+ @logger.debug("subscribing to results")
382
+ @transport.subscribe(:direct, "results", "results", :ack => true) do |message_info, message|
383
+ begin
384
+ result = MultiJson.load(message)
385
+ @logger.debug("received result", :result => result)
386
+ process_check_result(result)
387
+ rescue MultiJson::ParseError => error
388
+ @logger.error("failed to parse result payload", {
389
+ :message => message,
390
+ :error => error.to_s
391
+ })
392
+ end
393
+ EM::next_tick do
394
+ @transport.ack(message_info)
395
+ end
396
+ end
397
+ end
398
+
399
+ # Publish a check request to the transport. A check request is
400
+ # composed of a check `:name`, an `:issued` timestamp, and a
401
+ # check `:command` if available. The check request is published
402
+ # to a transport pipe, for each of the check `:subscribers` in
403
+ # its definition, eg. "webserver". JSON serialization is used
404
+ # when publishing the check request payload to the transport
405
+ # pipes. Transport errors are logged.
406
+ #
407
+ # @param check [Hash] definition.
408
+ def publish_check_request(check)
409
+ payload = {
410
+ :name => check[:name],
411
+ :issued => Time.now.to_i
412
+ }
413
+ payload[:command] = check[:command] if check.has_key?(:command)
414
+ @logger.info("publishing check request", {
415
+ :payload => payload,
416
+ :subscribers => check[:subscribers]
417
+ })
418
+ check[:subscribers].each do |subscription|
419
+ @transport.publish(:fanout, subscription, MultiJson.dump(payload)) do |info|
420
+ if info[:error]
421
+ @logger.error("failed to publish check request", {
422
+ :subscription => subscription,
423
+ :payload => payload,
424
+ :error => info[:error].to_s
425
+ })
426
+ end
427
+ end
428
+ end
429
+ end
430
+
431
+ # Calculate a check execution splay, taking into account the
432
+ # current time and the execution interval to ensure it's
433
+ # consistent between process restarts.
434
+ #
435
+ # @param check [Hash] definition.
436
+ def calculate_check_execution_splay(check)
437
+ splay_hash = Digest::MD5.digest(check[:name]).unpack('Q<').first
438
+ current_time = (Time.now.to_f * 1000).to_i
439
+ (splay_hash - current_time) % (check[:interval] * 1000) / 1000.0
440
+ end
441
+
442
+ # Schedule check executions, using EventMachine periodic timers,
443
+ # using a calculated execution splay. The timers are stored in
444
+ # the timers hash under `:master`, as check request publishing
445
+ # is a task for only the Sensu server master, so they can be
446
+ # cancelled etc. Check requests are not published if subdued.
447
+ #
448
+ # @param checks [Array] of definitions.
449
+ def schedule_check_executions(checks)
450
+ checks.each do |check|
451
+ create_check_request = Proc.new do
452
+ unless check_request_subdued?(check)
453
+ publish_check_request(check)
454
+ else
455
+ @logger.info("check request was subdued", :check => check)
456
+ end
457
+ end
458
+ execution_splay = testing? ? 0 : calculate_check_execution_splay(check)
459
+ interval = testing? ? 0.5 : check[:interval]
460
+ @timers[:master] << EM::Timer.new(execution_splay) do
461
+ create_check_request.call
462
+ @timers[:master] << EM::PeriodicTimer.new(interval, &create_check_request)
463
+ end
464
+ end
465
+ end
466
+
467
+ # Set up the check request publisher. This method creates an
468
+ # array of check definitions, that are not standalone checks,
469
+ # and do not have `:publish` set to `false`. The array of check
470
+ # definitions includes those from standard checks and extensions
471
+ # (with a defined execution `:interval`). The array is provided
472
+ # to the `schedule_check_executions()` method.
473
+ def setup_check_request_publisher
474
+ @logger.debug("scheduling check requests")
475
+ standard_checks = @settings.checks.reject do |check|
476
+ check[:standalone] || check[:publish] == false
477
+ end
478
+ extension_checks = @extensions.checks.reject do |check|
479
+ check[:standalone] || check[:publish] == false || !check[:interval].is_a?(Integer)
480
+ end
481
+ schedule_check_executions(standard_checks + extension_checks)
482
+ end
483
+
484
+ # Publish a check result to the transport for processing. A
485
+ # check result is composed of a client name and a check
486
+ # definition, containing check `:output` and `:status`. JSON
487
+ # serialization is used when publishing the check result payload
488
+ # to the transport pipe. Transport errors are logged.
489
+ #
490
+ # @param client [Hash]
491
+ # @param check [Hash]
492
+ def publish_check_result(client, check)
493
+ payload = {
494
+ :client => client[:name],
495
+ :check => check
496
+ }
497
+ @logger.debug("publishing check result", :payload => payload)
498
+ @transport.publish(:direct, "results", MultiJson.dump(payload)) do |info|
499
+ if info[:error]
500
+ @logger.error("failed to publish check result", {
501
+ :payload => payload,
502
+ :error => info[:error].to_s
503
+ })
504
+ end
505
+ end
506
+ end
507
+
508
+ # Create a keepalive check definition for a client. Client
509
+ # definitions may contain `:keepalive` configuration, containing
510
+ # specific thresholds and handler information. The keepalive
511
+ # check definition creation begins with default thresholds, and
512
+ # sets the `:handler` to `keepalive`, if the handler has a local
513
+ # definition. If the client provides its own `:keepalive`
514
+ # configuration, it's deep merged with the defaults. The check
515
+ # `:name`, `:issued`, and `:executed` values are always
516
+ # overridden to guard against an invalid definition.
517
+ def create_keepalive_check(client)
518
+ check = {
519
+ :thresholds => {
520
+ :warning => 120,
521
+ :critical => 180
522
+ }
523
+ }
524
+ if @settings.handler_exists?(:keepalive)
525
+ check[:handler] = "keepalive"
526
+ end
527
+ if client.has_key?(:keepalive)
528
+ check = deep_merge(check, client[:keepalive])
529
+ end
530
+ timestamp = Time.now.to_i
531
+ check.merge(:name => "keepalive", :issued => timestamp, :executed => timestamp)
532
+ end
533
+
534
+ # Determine stale clients, those that have not sent a keepalive
535
+ # in a specified amount of time (thresholds). This method
536
+ # iterates through the client registry, creating a keepalive
537
+ # check definition with the `create_keepalive_check()` method,
538
+ # containing client specific staleness thresholds. If the time
539
+ # since the latest keepalive is equal to or greater than a
540
+ # threshold, the check `:output` is set to a descriptive
541
+ # message, and `:status` is set to the appropriate non-zero
542
+ # value. If a client has been sending keepalives, `:output` and
543
+ # `:status` are set to indicate an OK state. A check result is
544
+ # published for every client in the registry.
545
+ def determine_stale_clients
546
+ @logger.info("determining stale clients")
547
+ @redis.smembers("clients") do |clients|
548
+ clients.each do |client_name|
549
+ @redis.get("client:#{client_name}") do |client_json|
550
+ unless client_json.nil?
551
+ client = MultiJson.load(client_json)
552
+ check = create_keepalive_check(client)
553
+ time_since_last_keepalive = Time.now.to_i - client[:timestamp]
554
+ check[:output] = "No keepalive sent from client for "
555
+ check[:output] << "#{time_since_last_keepalive} seconds"
556
+ case
557
+ when time_since_last_keepalive >= check[:thresholds][:critical]
558
+ check[:output] << " (>=#{check[:thresholds][:critical]})"
559
+ check[:status] = 2
560
+ when time_since_last_keepalive >= check[:thresholds][:warning]
561
+ check[:output] << " (>=#{check[:thresholds][:warning]})"
562
+ check[:status] = 1
563
+ else
564
+ check[:output] = "Keepalive sent from client "
565
+ check[:output] << "#{time_since_last_keepalive} seconds ago"
566
+ check[:status] = 0
567
+ end
568
+ publish_check_result(client, check)
569
+ end
570
+ end
571
+ end
572
+ end
573
+ end
574
+
575
+ # Set up the client monitor, a periodic timer to run
576
+ # `determine_stale_clients()` every 30 seconds. The timer is
577
+ # stored in the timers hash under `:master`.
578
+ def setup_client_monitor
579
+ @logger.debug("monitoring client keepalives")
580
+ @timers[:master] << EM::PeriodicTimer.new(30) do
581
+ determine_stale_clients
582
+ end
583
+ end
584
+
585
+ # Prune check result aggregations (aggregates). Sensu only
586
+ # stores the 20 latest aggregations for a check, to keep the
587
+ # amount of data stored to a minimum.
588
+ def prune_check_result_aggregations
589
+ @logger.info("pruning check result aggregations")
590
+ @redis.smembers("aggregates") do |checks|
591
+ checks.each do |check_name|
592
+ @redis.smembers("aggregates:#{check_name}") do |aggregates|
593
+ if aggregates.size > 20
594
+ aggregates.sort!
595
+ aggregates.take(aggregates.size - 20).each do |check_issued|
596
+ @redis.srem("aggregates:#{check_name}", check_issued) do
597
+ result_set = "#{check_name}:#{check_issued}"
598
+ @redis.del("aggregate:#{result_set}") do
599
+ @redis.del("aggregation:#{result_set}") do
600
+ @logger.debug("pruned aggregation", {
601
+ :check => {
602
+ :name => check_name,
603
+ :issued => check_issued
604
+ }
605
+ })
606
+ end
607
+ end
608
+ end
609
+ end
610
+ end
611
+ end
612
+ end
613
+ end
614
+ end
615
+
616
+ # Set up the check result aggregation pruner, using periodic
617
+ # timer to run `prune_check_result_aggregations()` every 20
618
+ # seconds. The timer is stored in the timers hash under
619
+ # `:master`.
620
+ def setup_check_result_aggregation_pruner
621
+ @logger.debug("pruning check result aggregations")
622
+ @timers[:master] << EM::PeriodicTimer.new(20) do
623
+ prune_check_result_aggregations
624
+ end
625
+ end
626
+
627
+ # Set up the master duties, tasks only performed by a single
628
+ # Sensu server at a time. The duties include publishing check
629
+ # requests, monitoring for stale clients, and pruning check
630
+ # result aggregations.
631
+ def master_duties
632
+ setup_check_request_publisher
633
+ setup_client_monitor
634
+ setup_check_result_aggregation_pruner
635
+ end
636
+
637
+ # Request a master election, a process to determine if the
638
+ # current process is the master Sensu server, with its
639
+ # own/unique duties. A Redis key/value is used as a central
640
+ # lock, using the "SETNX" Redis command to set the key/value if
641
+ # it does not exist, using a timestamp for the value. If the
642
+ # current process was able to create the key/value, it is the
643
+ # master, and must do the duties of the master. If the current
644
+ # process was not able to create the key/value, but the current
645
+ # timestamp value is equal to or over 30 seconds ago, the
646
+ # "GETSET" Redis command is used to set a new timestamp and
647
+ # fetch the previous value to compare them, to determine if it
648
+ # was set by the current process. If the current process is able
649
+ # to set the timestamp value, it becomes the master. The master
650
+ # has `@is_master` set to `true`.
651
+ def request_master_election
652
+ @redis.setnx("lock:master", Time.now.to_i) do |created|
653
+ if created
654
+ @is_master = true
655
+ @logger.info("i am the master")
656
+ master_duties
657
+ else
658
+ @redis.get("lock:master") do |timestamp|
659
+ if Time.now.to_i - timestamp.to_i >= 30
660
+ @redis.getset("lock:master", Time.now.to_i) do |previous|
661
+ if previous == timestamp
662
+ @is_master = true
663
+ @logger.info("i am now the master")
664
+ master_duties
665
+ end
666
+ end
667
+ end
668
+ end
669
+ end
670
+ end
671
+ end
672
+
673
+ # Set up the master monitor. A one-time timer is used to run
674
+ # `request_master_election()` in 2 seconds. A periodic timer is
675
+ # used to update the master lock timestamp if the current
676
+ # process is the master, or to run `request_master_election()`,
677
+ # every 10 seconds. The timers are stored in the timers hash
678
+ # under `:run`.
679
+ def setup_master_monitor
680
+ @timers[:run] << EM::Timer.new(2) do
681
+ request_master_election
682
+ end
683
+ @timers[:run] << EM::PeriodicTimer.new(10) do
684
+ if @is_master
685
+ @redis.set("lock:master", Time.now.to_i) do
686
+ @logger.debug("updated master lock timestamp")
687
+ end
688
+ else
689
+ request_master_election
690
+ end
691
+ end
692
+ end
693
+
694
+ # Resign as master, if the current process is the Sensu server
695
+ # master. This method cancels and clears the master timers,
696
+ # those with references stored in the timers hash under
697
+ # `:master`, and `@is_master`is set to `false`.
698
+ def resign_as_master
699
+ if @is_master
700
+ @logger.warn("resigning as master")
701
+ @timers[:master].each do |timer|
702
+ timer.cancel
703
+ end
704
+ @timers[:master].clear
705
+ @is_master = false
706
+ else
707
+ @logger.debug("not currently master")
708
+ end
709
+ end
710
+
711
+ # Unsubscribe from transport subscriptions (all of them). This
712
+ # method is called when there are issues with connectivity, or
713
+ # the process is stopping.
714
+ def unsubscribe
715
+ @logger.warn("unsubscribing from keepalive and result queues")
716
+ @transport.unsubscribe
717
+ end
718
+
719
+ # Complete event handling currently in progress. The
720
+ # `:handling_event_count` is used to determine if event handling
721
+ # is complete, when it is equal to `0`. The provided callback is
722
+ # called when handling is complete.
723
+ #
724
+ # @param callback [Proc] to call when event handling is
725
+ # complete.
726
+ def complete_event_handling(&callback)
727
+ @logger.info("completing event handling in progress", {
728
+ :handling_event_count => @handling_event_count
729
+ })
730
+ retry_until_true do
731
+ if @handling_event_count == 0
732
+ callback.call
733
+ true
734
+ end
735
+ end
736
+ end
737
+
738
+ # Bootstrap the Sensu server process, setting up the keepalive
739
+ # and check result consumers, and attempting to become the master
740
+ # to carry out its duties. This method sets the process/daemon
741
+ # `@state` to `:running`.
742
+ def bootstrap
743
+ setup_keepalives
744
+ setup_results
745
+ setup_master_monitor
746
+ @state = :running
747
+ end
748
+
749
+ # Start the Sensu server process, connecting to Redis, the
750
+ # transport, and calling the `bootstrap()` method.
751
+ def start
752
+ setup_redis
753
+ setup_transport
754
+ bootstrap
755
+ end
756
+
757
+ # Pause the Sensu server process, unless it is being paused or
758
+ # has already been paused. The process/daemon `@state` is first
759
+ # set to `:pausing`, to indicate that it's in progress. All run
760
+ # timers are cancelled, and the references are cleared. The
761
+ # Sensu server will unsubscribe from all transport
762
+ # subscriptions, resign as master (if currently the master),
763
+ # then set the process/daemon `@state` to `:paused`.
764
+ def pause
765
+ unless @state == :pausing || @state == :paused
766
+ @state = :pausing
767
+ @timers[:run].each do |timer|
768
+ timer.cancel
769
+ end
770
+ @timers[:run].clear
771
+ unsubscribe
772
+ resign_as_master
773
+ @state = :paused
774
+ end
775
+ end
776
+
777
+ # Resume the Sensu server process if it is currently or will
778
+ # soon be paused. The `retry_until_true` helper method is used
779
+ # to determine if the process is paused and if the Redis and
780
+ # transport connections are connected. If the conditions are
781
+ # met, `bootstrap()` will be called and true is returned to stop
782
+ # `retry_until_true`.
783
+ def resume
784
+ retry_until_true(1) do
785
+ if @state == :paused
786
+ if @redis.connected? && @transport.connected?
787
+ bootstrap
788
+ true
789
+ end
790
+ end
791
+ end
792
+ end
793
+
794
+ # Stop the Sensu server process, pausing it, completing event
795
+ # handling in progress, closing the Redis and transport
796
+ # connections, and exiting the process (exit 0). After pausing
797
+ # the process, the process/daemon `@state` is set to
798
+ # `:stopping`.
799
+ def stop
800
+ @logger.warn("stopping")
801
+ pause
802
+ @state = :stopping
803
+ complete_event_handling do
804
+ @redis.close
805
+ @transport.close
806
+ super
807
+ end
808
+ end
809
+ end
810
+ end
811
+ end