sensu 0.16.0-java → 0.17.0.beta.1-java

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,811 @@
1
+ require "sensu/daemon"
2
+ require "sensu/server/filter"
3
+ require "sensu/server/mutate"
4
+ require "sensu/server/handle"
5
+
6
+ module Sensu
7
+ module Server
8
+ class Process
9
+ include Daemon
10
+ include Filter
11
+ include Mutate
12
+ include Handle
13
+
14
+ attr_reader :is_master, :handling_event_count
15
+
16
      # Create an instance of the Sensu server process, start the
      # server within the EventMachine event loop, and set up server
      # process signal traps (for stopping).
      #
      # @param options [Hash] process options, passed through to
      #   `initialize()` (and the Daemon mixin).
      def self.run(options={})
        server = self.new(options)
        EM::run do
          server.start
          server.setup_signal_traps
        end
      end
28
+
29
      # Override Daemon initialize() to support Sensu server master
      # election and the handling event count.
      #
      # @param options [Hash]
      def initialize(options={})
        super
        # Not the master until an election is won, see
        # `request_master_election()`.
        @is_master = false
        # Timers for master-only duties, kept separate so they can be
        # cancelled on resignation (see `resign_as_master()`).
        @timers[:master] = Array.new
        # Number of event handler chains currently in progress, used
        # by `complete_event_handling()` during shutdown.
        @handling_event_count = 0
      end
39
+
40
      # Update the Sensu client registry, stored in Redis. Sensu
      # client data is used to provide additional event context and
      # enable agent health monitoring. JSON serialization is used for
      # the client data.
      #
      # @param client [Hash]
      # @param callback [Proc] to call after the client data has
      #   been added to (or updated in) the registry.
      def update_client_registry(client, &callback)
        @logger.debug("updating client registry", :client => client)
        @redis.set("client:#{client[:name]}", MultiJson.dump(client)) do
          # Track the client name in the "clients" set so the registry
          # can be enumerated (see `determine_stale_clients()`).
          @redis.sadd("clients", client[:name]) do
            callback.call
          end
        end
      end
56
+
57
      # Set up the client keepalive consumer, keeping the Sensu client
      # registry updated. The consumer receives JSON serialized client
      # keepalives from the transport, parses them, and calls
      # `update_client_registry()` with the client data to update the
      # registry. Transport message acknowledgements are used to
      # ensure the client registry is updated successfully. Keepalive
      # JSON parsing errors are logged, and the malformed message is
      # acknowledged anyway so it is not redelivered indefinitely.
      def setup_keepalives
        @logger.debug("subscribing to keepalives")
        @transport.subscribe(:direct, "keepalives", "keepalives", :ack => true) do |message_info, message|
          @logger.debug("received keepalive", :message => message)
          begin
            client = MultiJson.load(message)
            update_client_registry(client) do
              # Only ack once the registry update has completed.
              @transport.ack(message_info)
            end
          rescue MultiJson::ParseError => error
            @logger.error("failed to parse keepalive payload", {
              :message => message,
              :error => error.to_s
            })
            @transport.ack(message_info)
          end
        end
      end
82
+
83
+ # Expand event handler sets, creating an array of handler
84
+ # definitions. Handler sets cannot be deeply nested (by choice),
85
+ # this method will return `nil` if an attempt is made to deeply
86
+ # nest. If the provided handler definition is not a set, it is
87
+ # returned.
88
+ #
89
+ # @param handler [Hash] definition.
90
+ # @param depth [Integer] of the expansion.
91
+ # @return [Array, Hash, Nil]
92
+ def expand_handler_sets(handler, depth=0)
93
+ if handler[:type] == "set"
94
+ if depth < 2
95
+ derive_handlers(handler[:handlers], depth + 1)
96
+ else
97
+ @logger.error("handler sets cannot be deeply nested", :handler => handler)
98
+ nil
99
+ end
100
+ else
101
+ handler
102
+ end
103
+ end
104
+
105
+ # Derive an array of handler definitions from a list of handler
106
+ # names. This method first checks for the existence of standard
107
+ # handlers, followed by handler extensions. If a handler does
108
+ # not exist for a name, it is logged and ignored. Duplicate
109
+ # handler definitions are removed.
110
+ #
111
+ # @param handler_list [Array]
112
+ # @param depth [Integer] of handler set expansion.
113
+ # @return [Array]
114
+ def derive_handlers(handler_list, depth=0)
115
+ handler_list.compact.map { |handler_name|
116
+ case
117
+ when @settings.handler_exists?(handler_name)
118
+ handler = @settings[:handlers][handler_name].merge(:name => handler_name)
119
+ expand_handler_sets(handler, depth)
120
+ when @extensions.handler_exists?(handler_name)
121
+ @extensions[:handlers][handler_name]
122
+ else
123
+ @logger.error("unknown handler", :handler_name => handler_name)
124
+ nil
125
+ end
126
+ }.flatten.compact.uniq
127
+ end
128
+
129
+ # Run event bridge extensions, within the Sensu EventMachine
130
+ # reactor (event loop). The extension API `safe_run()` method is
131
+ # used to guard against most errors. Bridges are for relaying
132
+ # Sensu event data to other services.
133
+ #
134
+ # @param event [Hash]
135
+ def event_bridges(event)
136
+ @extensions[:bridges].each do |name, bridge|
137
+ bridge.safe_run(event) do |output, status|
138
+ @logger.debug("bridge extension output", {
139
+ :extension => bridge.definition,
140
+ :output => output
141
+ })
142
+ end
143
+ end
144
+ end
145
+
146
+ # Process an event: filter -> mutate -> handle.
147
+ #
148
+ # This method runs event bridges, relaying the event data to
149
+ # other services. This method also determines the appropriate
150
+ # handlers for the event, filtering and mutating the event data
151
+ # for each of them. The `@handling_event_count` is incremented
152
+ # by `1`, for each event handler chain (filter -> mutate ->
153
+ # handle).
154
+ #
155
+ # @param event [Hash]
156
+ def process_event(event)
157
+ log_level = event[:check][:type] == "metric" ? :debug : :info
158
+ @logger.send(log_level, "processing event", :event => event)
159
+ event_bridges(event)
160
+ handler_list = Array((event[:check][:handlers] || event[:check][:handler]) || "default")
161
+ handlers = derive_handlers(handler_list)
162
+ handlers.each do |handler|
163
+ @handling_event_count += 1
164
+ filter_event(handler, event) do |event|
165
+ mutate_event(handler, event) do |event_data|
166
+ handle_event(handler, event_data)
167
+ end
168
+ end
169
+ end
170
+ end
171
+
172
      # Add a check result to an aggregate. A check aggregate uses the
      # check `:name` and the `:issued` timestamp as its unique
      # identifier. An aggregate uses several counters: the total
      # number of results in the aggregate, and a counter for each
      # check severity (ok, warning, etc). Check output is also
      # stored, to be summarized to aid in identifying outliers for a
      # check execution across a number of Sensu clients. JSON
      # serialization is used for storing check result data.
      #
      # @param result [Hash]
      def aggregate_check_result(result)
        @logger.debug("adding check result to aggregate", :result => result)
        check = result[:check]
        result_set = "#{check[:name]}:#{check[:issued]}"
        result_data = MultiJson.dump(:output => check[:output], :status => check[:status])
        # "aggregation:..." stores per-client result data, while
        # "aggregate:..." stores the severity/total counters.
        @redis.hset("aggregation:#{result_set}", result[:client], result_data) do
          SEVERITIES.each do |severity|
            # Ensure every severity counter exists (hsetnx is a no-op
            # when the field is already set).
            @redis.hsetnx("aggregate:#{result_set}", severity, 0)
          end
          severity = (SEVERITIES[check[:status]] || "unknown")
          @redis.hincrby("aggregate:#{result_set}", severity, 1) do
            @redis.hincrby("aggregate:#{result_set}", "total", 1) do
              # Index the aggregate for pruning and API access, see
              # `prune_check_result_aggregations()`.
              @redis.sadd("aggregates:#{check[:name]}", check[:issued]) do
                @redis.sadd("aggregates", check[:name])
              end
            end
          end
        end
      end
201
+
202
      # Store check result data. This method stores the 21 most recent
      # check result statuses for a client/check pair, this history
      # is used for event context and flap detection. The check
      # execution timestamp is also stored, to provide an indication
      # of how recent the data is.
      #
      # @param client [Hash]
      # @param check [Hash]
      # @param callback [Proc] to call when the check result data has
      #   been stored (history, etc).
      def store_check_result(client, check, &callback)
        # Fire-and-forget: record that this client has a history for
        # this check name.
        @redis.sadd("history:#{client[:name]}", check[:name])
        result_key = "#{client[:name]}:#{check[:name]}"
        history_key = "history:#{result_key}"
        @redis.rpush(history_key, check[:status]) do
          # The set/ltrim below are not awaited; the callback fires
          # once the status has been appended to the history list.
          @redis.set("execution:#{result_key}", check[:executed])
          @redis.ltrim(history_key, -21, -1)
          callback.call
        end
      end
222
+
223
      # Fetch the execution history for a client/check pair, the 21
      # most recent check result statuses. This method also calculates
      # the total state change percentage for the history, this value
      # is used for check state flap detection, using a similar
      # algorithm to Nagios:
      # http://nagios.sourceforge.net/docs/3_0/flapping.html
      #
      # @param client [Hash]
      # @param check [Hash]
      # @param callback [Proc] to be called with the check history and
      #   total state change value.
      def check_history(client, check, &callback)
        history_key = "history:#{client[:name]}:#{check[:name]}"
        @redis.lrange(history_key, -21, -1) do |history|
          total_state_change = 0
          # Flap detection requires a full history of 21 results.
          unless history.size < 21
            state_changes = 0
            # More recent state changes are weighted more heavily,
            # ranging from 0.8 for the oldest to 1.2 for the newest.
            change_weight = 0.8
            previous_status = history.first
            history.each do |status|
              unless status == previous_status
                state_changes += change_weight
              end
              change_weight += 0.02
              previous_status = status
            end
            # Percentage of the 20 possible state transitions.
            total_state_change = (state_changes.fdiv(20) * 100).to_i
          end
          callback.call(history, total_state_change)
        end
      end
254
+
255
+ # Determine if a check state is flapping, rapidly changing
256
+ # between an OK and non-OK state. Flap detection is only done
257
+ # for checks that have defined low and hight flap detection
258
+ # thresholds, `:low_flap_threshold` and `:high_flap_threshold`.
259
+ # The `check_history()` method provides the check history and
260
+ # more importantly the total state change precentage value that
261
+ # is compared with the configured thresholds defined in the
262
+ # check data. If a check hasn't been flapping, the
263
+ # `:total_state_change` must be equal to or higher than the
264
+ # `:high_flap_threshold` to be changed to flapping. If a check
265
+ # has been flapping, the `:total_state_change` must be equal to
266
+ # or lower than the `:low_flap_threshold` to no longer be
267
+ # flapping. This method uses the same algorithm as Nagios:
268
+ # http://nagios.sourceforge.net/docs/3_0/flapping.html
269
+ #
270
+ # @param stored_event [Hash]
271
+ # @param check [Hash]
272
+ # @return [TrueClass, FalseClass]
273
+ def check_flapping?(stored_event, check)
274
+ if check.has_key?(:low_flap_threshold) && check.has_key?(:high_flap_threshold)
275
+ was_flapping = stored_event && stored_event[:action] == "flapping"
276
+ check[:total_state_change] >= check[:high_flap_threshold] ||
277
+ (was_flapping && check[:total_state_change] <= check[:low_flap_threshold]) ||
278
+ was_flapping
279
+ else
280
+ false
281
+ end
282
+ end
283
+
284
      # Update the event registry, stored in Redis. This method
      # determines if check data results in the creation or update of
      # event data in the registry. Existing event data for a
      # client/check pair is fetched, used in conditionals and the
      # composition of the new event data. If a check `:status` is not
      # `0`, or it has been flapping, an event is created/updated in
      # the registry. If there was existing event data, but the check
      # `:status` is now `0`, the event is removed (resolved) from the
      # registry. If the previous conditions are not met, and check
      # `:type` is `metric` and the `:status` is `0`, the event
      # registry is not updated, but the provided callback is called
      # with the event data. JSON serialization is used when storing
      # data in the registry.
      #
      # @param client [Hash]
      # @param check [Hash]
      # @param callback [Proc] to be called with the resulting event
      #   data if the event registry is updated, or the check is of
      #   type `:metric`.
      def update_event_registry(client, check, &callback)
        @redis.hget("events:#{client[:name]}", check[:name]) do |event_json|
          # NOTE(review): stored_event is accessed with symbol keys
          # below, which assumes MultiJson is configured to symbolize
          # keys — confirm against the Daemon/settings setup.
          stored_event = event_json ? MultiJson.load(event_json) : nil
          flapping = check_flapping?(stored_event, check)
          event = {
            :id => random_uuid,
            :client => client,
            :check => check,
            :occurrences => 1
          }
          if check[:status] != 0 || flapping
            # Occurrences only accumulate while the status is
            # unchanged.
            if stored_event && check[:status] == stored_event[:check][:status]
              event[:occurrences] = stored_event[:occurrences] + 1
            end
            event[:action] = flapping ? :flapping : :create
            @redis.hset("events:#{client[:name]}", check[:name], MultiJson.dump(event)) do
              callback.call(event)
            end
          elsif stored_event
            event[:occurrences] = stored_event[:occurrences]
            event[:action] = :resolve
            # A check may opt out of automatic resolution, unless the
            # resolution is forced.
            unless check[:auto_resolve] == false && !check[:force_resolve]
              @redis.hdel("events:#{client[:name]}", check[:name]) do
                callback.call(event)
              end
            end
          elsif check[:type] == "metric"
            # Metrics flow through to handlers without touching the
            # registry.
            callback.call(event)
          end
        end
      end
334
+
335
      # Process a check result, storing its data, inspecting its
      # contents, and taking the appropriate actions (eg. update the
      # event registry). A check result must have a valid client name,
      # associated with a client in the registry. Results without a
      # valid client are discarded, to keep the system "correct". If a
      # local check definition exists for the check name, and the
      # check result is not from a standalone check execution, it's
      # merged with the check result for more context.
      #
      # @param result [Hash] data.
      def process_check_result(result)
        @logger.debug("processing result", :result => result)
        @redis.get("client:#{result[:client]}") do |client_json|
          unless client_json.nil?
            client = MultiJson.load(client_json)
            # The result's check data wins over the local definition
            # on merge, so client-reported values are preserved.
            check = case
            when @settings.check_exists?(result[:check][:name]) && !result[:check][:standalone]
              @settings[:checks][result[:check][:name]].merge(result[:check])
            else
              result[:check]
            end
            aggregate_check_result(result) if check[:aggregate]
            store_check_result(client, check) do
              check_history(client, check) do |history, total_state_change|
                check[:history] = history
                check[:total_state_change] = total_state_change
                update_event_registry(client, check) do |event|
                  process_event(event)
                end
              end
            end
          else
            @logger.warn("client not in registry", :client => result[:client])
          end
        end
      end
371
+
372
      # Set up the check result consumer. The consumer receives JSON
      # serialized check results from the transport, parses them, and
      # calls `process_check_result()` with the result data to be
      # processed. Transport message acknowledgements are used to
      # ensure that results make it to processing. The transport
      # message acknowledgements are currently done in the next tick
      # of the EventMachine reactor (event loop), as a flow control
      # mechanism. Result JSON parsing errors are logged.
      def setup_results
        @logger.debug("subscribing to results")
        @transport.subscribe(:direct, "results", "results", :ack => true) do |message_info, message|
          begin
            result = MultiJson.load(message)
            @logger.debug("received result", :result => result)
            process_check_result(result)
          rescue MultiJson::ParseError => error
            @logger.error("failed to parse result payload", {
              :message => message,
              :error => error.to_s
            })
          end
          # Ack on the next tick regardless of processing outcome, so
          # a bad result is not redelivered.
          EM::next_tick do
            @transport.ack(message_info)
          end
        end
      end
398
+
399
      # Publish a check request to the transport. A check request is
      # composed of a check `:name`, an `:issued` timestamp, and a
      # check `:command` if available. The check request is published
      # to a transport pipe, for each of the check `:subscribers` in
      # its definition, eg. "webserver". JSON serialization is used
      # when publishing the check request payload to the transport
      # pipes. Transport errors are logged.
      #
      # @param check [Hash] definition.
      def publish_check_request(check)
        payload = {
          :name => check[:name],
          :issued => Time.now.to_i
        }
        # Extension checks may not have a command.
        payload[:command] = check[:command] if check.has_key?(:command)
        @logger.info("publishing check request", {
          :payload => payload,
          :subscribers => check[:subscribers]
        })
        check[:subscribers].each do |subscription|
          @transport.publish(:fanout, subscription, MultiJson.dump(payload)) do |info|
            if info[:error]
              @logger.error("failed to publish check request", {
                :subscription => subscription,
                :payload => payload,
                :error => info[:error].to_s
              })
            end
          end
        end
      end
430
+
431
+ # Calculate a check execution splay, taking into account the
432
+ # current time and the execution interval to ensure it's
433
+ # consistent between process restarts.
434
+ #
435
+ # @param check [Hash] definition.
436
+ def calculate_check_execution_splay(check)
437
+ splay_hash = Digest::MD5.digest(check[:name]).unpack('Q<').first
438
+ current_time = (Time.now.to_f * 1000).to_i
439
+ (splay_hash - current_time) % (check[:interval] * 1000) / 1000.0
440
+ end
441
+
442
      # Schedule check executions, using EventMachine periodic timers,
      # using a calculated execution splay. The timers are stored in
      # the timers hash under `:master`, as check request publishing
      # is a task for only the Sensu server master, so they can be
      # cancelled etc. Check requests are not published if subdued.
      #
      # @param checks [Array] of definitions.
      def schedule_check_executions(checks)
        checks.each do |check|
          create_check_request = Proc.new do
            unless check_request_subdued?(check)
              publish_check_request(check)
            else
              @logger.info("check request was subdued", :check => check)
            end
          end
          # A short fixed interval and no splay keep test runs fast.
          execution_splay = testing? ? 0 : calculate_check_execution_splay(check)
          interval = testing? ? 0.5 : check[:interval]
          # One-shot timer delays the first request by the splay, then
          # a periodic timer takes over at the regular interval.
          @timers[:master] << EM::Timer.new(execution_splay) do
            create_check_request.call
            @timers[:master] << EM::PeriodicTimer.new(interval, &create_check_request)
          end
        end
      end
466
+
467
+ # Set up the check request publisher. This method creates an
468
+ # array of check definitions, that are not standalone checks,
469
+ # and do not have `:publish` set to `false`. The array of check
470
+ # definitions includes those from standard checks and extensions
471
+ # (with a defined execution `:interval`). The array is provided
472
+ # to the `schedule_check_executions()` method.
473
+ def setup_check_request_publisher
474
+ @logger.debug("scheduling check requests")
475
+ standard_checks = @settings.checks.reject do |check|
476
+ check[:standalone] || check[:publish] == false
477
+ end
478
+ extension_checks = @extensions.checks.reject do |check|
479
+ check[:standalone] || check[:publish] == false || !check[:interval].is_a?(Integer)
480
+ end
481
+ schedule_check_executions(standard_checks + extension_checks)
482
+ end
483
+
484
+ # Publish a check result to the transport for processing. A
485
+ # check result is composed of a client name and a check
486
+ # definition, containing check `:output` and `:status`. JSON
487
+ # serialization is used when publishing the check result payload
488
+ # to the transport pipe. Transport errors are logged.
489
+ #
490
+ # @param client [Hash]
491
+ # @param check [Hash]
492
+ def publish_check_result(client, check)
493
+ payload = {
494
+ :client => client[:name],
495
+ :check => check
496
+ }
497
+ @logger.debug("publishing check result", :payload => payload)
498
+ @transport.publish(:direct, "results", MultiJson.dump(payload)) do |info|
499
+ if info[:error]
500
+ @logger.error("failed to publish check result", {
501
+ :payload => payload,
502
+ :error => info[:error].to_s
503
+ })
504
+ end
505
+ end
506
+ end
507
+
508
+ # Create a keepalive check definition for a client. Client
509
+ # definitions may contain `:keepalive` configuration, containing
510
+ # specific thresholds and handler information. The keepalive
511
+ # check definition creation begins with default thresholds, and
512
+ # sets the `:handler` to `keepalive`, if the handler has a local
513
+ # definition. If the client provides its own `:keepalive`
514
+ # configuration, it's deep merged with the defaults. The check
515
+ # `:name`, `:issued`, and `:executed` values are always
516
+ # overridden to guard against an invalid definition.
517
+ def create_keepalive_check(client)
518
+ check = {
519
+ :thresholds => {
520
+ :warning => 120,
521
+ :critical => 180
522
+ }
523
+ }
524
+ if @settings.handler_exists?(:keepalive)
525
+ check[:handler] = "keepalive"
526
+ end
527
+ if client.has_key?(:keepalive)
528
+ check = deep_merge(check, client[:keepalive])
529
+ end
530
+ timestamp = Time.now.to_i
531
+ check.merge(:name => "keepalive", :issued => timestamp, :executed => timestamp)
532
+ end
533
+
534
      # Determine stale clients, those that have not sent a keepalive
      # in a specified amount of time (thresholds). This method
      # iterates through the client registry, creating a keepalive
      # check definition with the `create_keepalive_check()` method,
      # containing client specific staleness thresholds. If the time
      # since the latest keepalive is equal to or greater than a
      # threshold, the check `:output` is set to a descriptive
      # message, and `:status` is set to the appropriate non-zero
      # value. If a client has been sending keepalives, `:output` and
      # `:status` are set to indicate an OK state. A check result is
      # published for every client in the registry.
      def determine_stale_clients
        @logger.info("determining stale clients")
        @redis.smembers("clients") do |clients|
          clients.each do |client_name|
            @redis.get("client:#{client_name}") do |client_json|
              # A client may have been deleted between smembers/get.
              unless client_json.nil?
                client = MultiJson.load(client_json)
                check = create_keepalive_check(client)
                time_since_last_keepalive = Time.now.to_i - client[:timestamp]
                check[:output] = "No keepalive sent from client for "
                check[:output] << "#{time_since_last_keepalive} seconds"
                # Critical takes precedence over warning.
                case
                when time_since_last_keepalive >= check[:thresholds][:critical]
                  check[:output] << " (>=#{check[:thresholds][:critical]})"
                  check[:status] = 2
                when time_since_last_keepalive >= check[:thresholds][:warning]
                  check[:output] << " (>=#{check[:thresholds][:warning]})"
                  check[:status] = 1
                else
                  check[:output] = "Keepalive sent from client "
                  check[:output] << "#{time_since_last_keepalive} seconds ago"
                  check[:status] = 0
                end
                publish_check_result(client, check)
              end
            end
          end
        end
      end
574
+
575
+ # Set up the client monitor, a periodic timer to run
576
+ # `determine_stale_clients()` every 30 seconds. The timer is
577
+ # stored in the timers hash under `:master`.
578
+ def setup_client_monitor
579
+ @logger.debug("monitoring client keepalives")
580
+ @timers[:master] << EM::PeriodicTimer.new(30) do
581
+ determine_stale_clients
582
+ end
583
+ end
584
+
585
      # Prune check result aggregations (aggregates). Sensu only
      # stores the 20 latest aggregations for a check, to keep the
      # amount of data stored to a minimum.
      def prune_check_result_aggregations
        @logger.info("pruning check result aggregations")
        @redis.smembers("aggregates") do |checks|
          checks.each do |check_name|
            @redis.smembers("aggregates:#{check_name}") do |aggregates|
              if aggregates.size > 20
                # NOTE(review): this is a lexicographic sort of epoch
                # timestamp strings; ordering is only numeric while all
                # timestamps have the same digit count — confirm.
                aggregates.sort!
                # Delete the oldest aggregates, keeping the latest 20.
                aggregates.take(aggregates.size - 20).each do |check_issued|
                  @redis.srem("aggregates:#{check_name}", check_issued) do
                    result_set = "#{check_name}:#{check_issued}"
                    # Remove both the counters and the per-client data.
                    @redis.del("aggregate:#{result_set}") do
                      @redis.del("aggregation:#{result_set}") do
                        @logger.debug("pruned aggregation", {
                          :check => {
                            :name => check_name,
                            :issued => check_issued
                          }
                        })
                      end
                    end
                  end
                end
              end
            end
          end
        end
      end
615
+
616
+ # Set up the check result aggregation pruner, using periodic
617
+ # timer to run `prune_check_result_aggregations()` every 20
618
+ # seconds. The timer is stored in the timers hash under
619
+ # `:master`.
620
+ def setup_check_result_aggregation_pruner
621
+ @logger.debug("pruning check result aggregations")
622
+ @timers[:master] << EM::PeriodicTimer.new(20) do
623
+ prune_check_result_aggregations
624
+ end
625
+ end
626
+
627
      # Set up the master duties, tasks only performed by a single
      # Sensu server at a time. The duties include publishing check
      # requests, monitoring for stale clients, and pruning check
      # result aggregations. All resulting timers live under
      # `@timers[:master]`, so `resign_as_master()` can cancel them.
      def master_duties
        setup_check_request_publisher
        setup_client_monitor
        setup_check_result_aggregation_pruner
      end
636
+
637
      # Request a master election, a process to determine if the
      # current process is the master Sensu server, with its
      # own/unique duties. A Redis key/value is used as a central
      # lock, using the "SETNX" Redis command to set the key/value if
      # it does not exist, using a timestamp for the value. If the
      # current process was able to create the key/value, it is the
      # master, and must do the duties of the master. If the current
      # process was not able to create the key/value, but the current
      # timestamp value is equal to or over 30 seconds ago, the
      # "GETSET" Redis command is used to set a new timestamp and
      # fetch the previous value to compare them, to determine if it
      # was set by the current process. If the current process is able
      # to set the timestamp value, it becomes the master. The master
      # has `@is_master` set to `true`.
      def request_master_election
        @redis.setnx("lock:master", Time.now.to_i) do |created|
          if created
            @is_master = true
            @logger.info("i am the master")
            master_duties
          else
            @redis.get("lock:master") do |timestamp|
              # The lock is considered stale after 30 seconds without
              # an update (see `setup_master_monitor()`).
              if Time.now.to_i - timestamp.to_i >= 30
                @redis.getset("lock:master", Time.now.to_i) do |previous|
                  # GETSET returning the timestamp we just read means
                  # no other process won the race in between.
                  if previous == timestamp
                    @is_master = true
                    @logger.info("i am now the master")
                    master_duties
                  end
                end
              end
            end
          end
        end
      end
672
+
673
      # Set up the master monitor. A one-time timer is used to run
      # `request_master_election()` in 2 seconds. A periodic timer is
      # used to update the master lock timestamp if the current
      # process is the master, or to run `request_master_election()`,
      # every 10 seconds. The timers are stored in the timers hash
      # under `:run`.
      def setup_master_monitor
        @timers[:run] << EM::Timer.new(2) do
          request_master_election
        end
        @timers[:run] << EM::PeriodicTimer.new(10) do
          if @is_master
            # Refresh the lock so other processes do not consider it
            # stale (see `request_master_election()`).
            @redis.set("lock:master", Time.now.to_i) do
              @logger.debug("updated master lock timestamp")
            end
          else
            request_master_election
          end
        end
      end
693
+
694
+ # Resign as master, if the current process is the Sensu server
695
+ # master. This method cancels and clears the master timers,
696
+ # those with references stored in the timers hash under
697
+ # `:master`, and `@is_master`is set to `false`.
698
+ def resign_as_master
699
+ if @is_master
700
+ @logger.warn("resigning as master")
701
+ @timers[:master].each do |timer|
702
+ timer.cancel
703
+ end
704
+ @timers[:master].clear
705
+ @is_master = false
706
+ else
707
+ @logger.debug("not currently master")
708
+ end
709
+ end
710
+
711
      # Unsubscribe from transport subscriptions (all of them). This
      # method is called when there are issues with connectivity, or
      # the process is stopping. Message consumption stops, but
      # existing connections remain open.
      def unsubscribe
        @logger.warn("unsubscribing from keepalive and result queues")
        @transport.unsubscribe
      end
718
+
719
      # Complete event handling currently in progress. The
      # `@handling_event_count` is used to determine if event handling
      # is complete, when it is equal to `0`. The provided callback is
      # called when handling is complete.
      #
      # @param callback [Proc] to call when event handling is
      #   complete.
      def complete_event_handling(&callback)
        @logger.info("completing event handling in progress", {
          :handling_event_count => @handling_event_count
        })
        # retry_until_true re-runs the block until it returns a truthy
        # value; a nil return (count still non-zero) means try again.
        retry_until_true do
          if @handling_event_count == 0
            callback.call
            true
          end
        end
      end
737
+
738
      # Bootstrap the Sensu server process, setting up the keepalive
      # and check result consumers, and attempting to become the master
      # to carry out its duties. This method sets the process/daemon
      # `@state` to `:running`.
      def bootstrap
        setup_keepalives
        setup_results
        setup_master_monitor
        @state = :running
      end
748
+
749
      # Start the Sensu server process, connecting to Redis, the
      # transport, and calling the `bootstrap()` method. Connection
      # setup is provided by the Daemon mixin.
      def start
        setup_redis
        setup_transport
        bootstrap
      end
756
+
757
+ # Pause the Sensu server process, unless it is being paused or
758
+ # has already been paused. The process/daemon `@state` is first
759
+ # set to `:pausing`, to indicate that it's in progress. All run
760
+ # timers are cancelled, and the references are cleared. The
761
+ # Sensu server will unsubscribe from all transport
762
+ # subscriptions, resign as master (if currently the master),
763
+ # then set the process/daemon `@state` to `:paused`.
764
+ def pause
765
+ unless @state == :pausing || @state == :paused
766
+ @state = :pausing
767
+ @timers[:run].each do |timer|
768
+ timer.cancel
769
+ end
770
+ @timers[:run].clear
771
+ unsubscribe
772
+ resign_as_master
773
+ @state = :paused
774
+ end
775
+ end
776
+
777
+ # Resume the Sensu server process if it is currently or will
778
+ # soon be paused. The `retry_until_true` helper method is used
779
+ # to determine if the process is paused and if the Redis and
780
+ # transport connections are connected. If the conditions are
781
+ # met, `bootstrap()` will be called and true is returned to stop
782
+ # `retry_until_true`.
783
+ def resume
784
+ retry_until_true(1) do
785
+ if @state == :paused
786
+ if @redis.connected? && @transport.connected?
787
+ bootstrap
788
+ true
789
+ end
790
+ end
791
+ end
792
+ end
793
+
794
      # Stop the Sensu server process, pausing it, completing event
      # handling in progress, closing the Redis and transport
      # connections, and exiting the process (exit 0). After pausing
      # the process, the process/daemon `@state` is set to
      # `:stopping`.
      def stop
        @logger.warn("stopping")
        pause
        @state = :stopping
        complete_event_handling do
          @redis.close
          @transport.close
          # Daemon's stop performs the final process exit.
          super
        end
      end
809
+ end
810
+ end
811
+ end