right_agent 2.0.8 → 2.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -71,18 +71,23 @@ module RightScale
71
71
  # Default time to wait for an event or to ping WebSocket
72
72
  DEFAULT_LISTEN_TIMEOUT = 60
73
73
 
74
+ # Maximum consecutive listen failures tolerated before giving up on listening
75
+ MAX_LISTEN_FAILURES = 10
76
+
74
77
  # Create RightNet router client
75
78
  #
76
79
  # @param [AuthClient] auth_client providing authorization session for HTTP requests
77
80
  #
78
81
  # @option options [Numeric] :open_timeout maximum wait for connection; defaults to DEFAULT_OPEN_TIMEOUT
79
82
  # @option options [Numeric] :request_timeout maximum wait for response; defaults to DEFAULT_REQUEST_TIMEOUT
80
- # @option options [Numeric] :listen_timeout maximum wait for event; defaults to DEFAULT_POLL_TIMEOUT
83
+ # @option options [Numeric] :listen_timeout maximum wait for event; defaults to DEFAULT_LISTEN_TIMEOUT
81
84
  # @option options [Boolean] :long_polling_only never attempt to create a WebSocket, always long-polling instead
82
85
  # @option options [Numeric] :retry_timeout maximum before stop retrying; defaults to DEFAULT_RETRY_TIMEOUT
83
86
  # @option options [Array] :retry_intervals between successive retries; defaults to DEFAULT_RETRY_INTERVALS
84
87
  # @option options [Boolean] :retry_enabled for requests that fail to connect or that return a retry result
85
88
  # @option options [Numeric] :reconnect_interval for reconnect attempts after losing connectivity
89
+ # @option options [Boolean] :non_blocking i/o is to be used for HTTP requests by applying
90
+ # EM::HttpRequest and fibers instead of RestClient; requests remain synchronous
86
91
  # @option options [Proc] :exception_callback for unexpected exceptions
87
92
  #
88
93
  # @raise [ArgumentError] auth client does not support this client type
@@ -199,8 +204,6 @@ module RightScale
199
204
  end
200
205
 
201
206
  # Receive events via an HTTP WebSocket if available, otherwise via HTTP long-polling
202
- # This is a blocking call and therefore should be used from a thread different than
203
- # otherwise used with this object, e.g., EM.defer thread
204
207
  #
205
208
  # @param [Array, NilClass] routing_keys for event sources of interest with nil meaning all
206
209
  #
@@ -219,28 +222,14 @@ module RightScale
219
222
  def listen(routing_keys, &handler)
220
223
  raise ArgumentError, "Block missing" unless block_given?
221
224
 
225
+ @event_uuids = nil
226
+ @listen_interval = 0
227
+ @listen_state = :choose
228
+ @listen_failures = 0
222
229
  @connect_interval = CONNECT_INTERVAL
223
- @last_connect_time = Time.now - @connect_interval
224
230
  @reconnect_interval = RECONNECT_INTERVAL
225
231
 
226
- uuids = nil
227
- retries = 0
228
- until [:closing, :closed].include?(state) do
229
- if @websocket
230
- @connect_interval = CONNECT_INTERVAL
231
- @reconnect_interval = RECONNECT_INTERVAL
232
- sleep(CHECK_INTERVAL)
233
- next
234
- elsif retry_connect?
235
- @last_connect_time = Time.now
236
- @close_code = @close_reason = nil
237
- @stats["reconnects"].update("websocket") if (retries += 1) > 1
238
- next if try_connect(routing_keys, &handler)
239
- end
240
-
241
- # Resort to long-polling if WebSocket not usable
242
- uuids = try_long_poll(routing_keys, uuids, &handler) if @websocket.nil?
243
- end
232
+ listen_loop(routing_keys, &handler)
244
233
  true
245
234
  end
246
235
 
@@ -253,6 +242,7 @@ module RightScale
253
242
  # @return [TrueClass] always true
254
243
  def close(scope = :all)
255
244
  super
245
+ update_listen_state(:cancel)
256
246
  @websocket.close(SHUTDOWN_CLOSE, "Agent terminating") if @websocket
257
247
  end
258
248
 
@@ -285,56 +275,148 @@ module RightScale
285
275
  true
286
276
  end
287
277
 
288
- # Determine whether should retry creation of WebSocket connection
289
- # Should only retry if (1) WebSocket is enabled, (2) there is none currently,
290
- # (3) previous closure was for acceptable reasons (normal, router shutdown,
291
- # router inaccessible), or (4) enough time has elapsed to make another attempt
292
- #
293
- # @return [Boolean] true if should try, otherwise false
294
- def retry_connect?
295
- unless @options[:long_polling_only]
296
- if @websocket.nil?
297
- if (Time.now - @last_connect_time) > @connect_interval
298
- true
299
- elsif [NORMAL_CLOSE, SHUTDOWN_CLOSE].include?(@close_code)
300
- true
301
- elsif router_not_responding?
302
- true
303
- end
304
- end
305
- end
306
- end
307
-
308
- # Try to create WebSocket connection
278
+ # Perform listen action, then wait prescribed time for next action
279
+ # A periodic timer is not effective here because it does not wait for the current listen action to finish and the interval before the next action varies
309
280
  #
310
281
  # @param [Array, NilClass] routing_keys for event sources of interest with nil meaning all
311
282
  #
312
283
  # @yield [event] required block called each time event received
313
284
  # @yieldparam [Hash] event received
314
285
  #
315
- # @return [Boolean] true if should not try long-polling, otherwise false
316
- def try_connect(routing_keys, &handler)
286
+ # @return [Boolean] false if failed or terminating, otherwise true
287
+ def listen_loop(routing_keys, &handler)
288
+ @listen_timer = nil
289
+
317
290
  begin
318
- connect(routing_keys, &handler)
319
- CHECK_INTERVAL.times do
320
- # Allow for possibility of asynchronous handshake failure resulting in close
291
+ # Perform listen action based on current state
292
+ case @listen_state
293
+ when :choose
294
+ # Choose listen method or continue as is if already listening
295
+ # or want to delay choosing
296
+ choose_listen_method
297
+ when :check
298
+ # Check whether really got connected, given the possibility of an
299
+ # asynchronous WebSocket handshake failure that resulted in a close
300
+ # Continue to use WebSockets if still connected or if connect failed
301
+ # due to unresponsive server
321
302
  if @websocket.nil?
322
303
  if router_not_responding?
323
- sleep(backoff_reconnect_interval)
304
+ update_listen_state(:connect, backoff_reconnect_interval)
324
305
  else
325
306
  backoff_connect_interval
307
+ update_listen_state(:long_poll)
326
308
  end
327
- break
309
+ elsif (@listen_checks += 1) > CHECK_INTERVAL
310
+ @reconnect_interval = RECONNECT_INTERVAL
311
+ update_listen_state(:choose, @connect_interval = CONNECT_INTERVAL)
328
312
  end
329
- sleep(1)
313
+ when :connect
314
+ # Use of WebSockets is enabled and it is again time to try to connect
315
+ @stats["reconnects"].update("websocket") if @attempted_connect_at
316
+ try_connect(routing_keys, &handler)
317
+ when :long_poll
318
+ # Resorting to long-polling
319
+ # Need to long-poll on separate thread if cannot use non-blocking HTTP i/o
320
+ # Will still periodically retry WebSockets if not restricted to just long-polling
321
+ if @options[:non_blocking]
322
+ @event_uuids = process_long_poll(try_long_poll(routing_keys, @event_uuids, &handler))
323
+ else
324
+ update_listen_state(:wait, 1)
325
+ try_deferred_long_poll(routing_keys, @event_uuids, &handler)
326
+ end
327
+ when :wait
328
+ # Deferred long-polling is expected to break out of this state eventually
329
+ when :cancel
330
+ return false
330
331
  end
331
- @websocket.nil?
332
+ @listen_failures = 0
332
333
  rescue Exception => e
333
- Log.error("Failed creating WebSocket", e)
334
- @stats["exceptions"].track("websocket", e)
335
- backoff_connect_interval
336
- false
334
+ Log.error("Failed to listen", e, :trace)
335
+ @stats["exceptions"].track("listen", e)
336
+ @listen_failures += 1
337
+ if @listen_failures > MAX_LISTEN_FAILURES
338
+ Log.error("Exceeded maximum repeated listen failures (#{MAX_LISTEN_FAILURES}), stopping listening")
339
+ @listen_state = :cancel
340
+ self.state = :failed
341
+ return false
342
+ end
343
+ @listen_state = :choose
344
+ @listen_interval = CHECK_INTERVAL
345
+ end
346
+
347
+ # Loop using next_tick or timer
348
+ if @listen_interval == 0
349
+ EM_S.next_tick { listen_loop(routing_keys, &handler) }
350
+ else
351
+ @listen_timer = EM_S::Timer.new(@listen_interval) { listen_loop(routing_keys, &handler) }
337
352
  end
353
+ true
354
+ end
355
+
356
+ # Update listen state
357
+ #
358
+ # @param [Symbol] state to move to next: :choose, :check, :connect, :long_poll, :wait, or :cancel
359
+ # @param [Integer] interval before next listen action
360
+ #
361
+ # @return [TrueClass] always true
362
+ #
363
+ # @raise [ArgumentError] invalid state
364
+ def update_listen_state(state, interval = 0)
365
+ if state == :cancel
366
+ @listen_timer.cancel if @listen_timer
367
+ @listen_timer = nil
368
+ @listen_state = state
369
+ elsif [:choose, :check, :connect, :long_poll, :wait].include?(state)
370
+ @listen_checks = 0 if state == :check && @listen_state != :check
371
+ @listen_state = state
372
+ @listen_interval = interval
373
+ else
374
+ raise ArgumentError, "Invalid listen state: #{state.inspect}"
375
+ end
376
+ true
377
+ end
378
+
379
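+ # Choose listen method: long-polling if restricted to it, otherwise WebSocket
+ # If a WebSocket already exists, stay in the :choose state and check again after the connect interval
+ # Otherwise schedule a connect attempt, doing so immediately if none was attempted before, if the
+ # previous closure was for acceptable reasons (normal, router shutdown, router inaccessible), or if
+ # the connect interval has elapsed since the last attempt; else wait out the rest of the interval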
383
+ #
384
+ # @return [TrueClass] always true
385
+ def choose_listen_method
386
+ if @options[:long_polling_only]
387
+ update_listen_state(:long_poll)
388
+ @connect_interval = MAX_CONNECT_INTERVAL
389
+ elsif @websocket
390
+ update_listen_state(:choose, @connect_interval)
391
+ else
392
+ if @attempted_connect_at.nil?
393
+ interval = 0
394
+ elsif (interval = @connect_interval - (Time.now - @attempted_connect_at)) < 0 ||
395
+ [NORMAL_CLOSE, SHUTDOWN_CLOSE].include?(@close_code) ||
396
+ router_not_responding?
397
+ interval = 0
398
+ end
399
+ update_listen_state(:connect, interval)
400
+ end
401
+ true
402
+ end
403
+
404
+ # Try to create WebSocket connection
405
+ #
406
+ # @param [Array, NilClass] routing_keys for event sources of interest with nil meaning all
407
+ #
408
+ # @yield [event] required block called each time event received
409
+ # @yieldparam [Hash] event received
410
+ #
411
+ # @return [TrueClass] always true
412
+ def try_connect(routing_keys, &handler)
413
+ connect(routing_keys, &handler)
414
+ update_listen_state(:check, 1)
415
+ rescue Exception => e
416
+ Log.error("Failed creating WebSocket", e)
417
+ @stats["exceptions"].track("websocket", e)
418
+ backoff_connect_interval
419
+ update_listen_state(:long_poll)
338
420
  end
339
421
 
340
422
  # Connect to RightNet router using WebSocket for receiving events
@@ -353,6 +435,9 @@ module RightScale
353
435
  def connect(routing_keys, &handler)
354
436
  raise ArgumentError, "Block missing" unless block_given?
355
437
 
438
+ @attempted_connect_at = Time.now
439
+ @close_code = @close_reason = nil
440
+
356
441
  options = {
357
442
  # Limit to .auth_header here (rather than .headers) to keep WebSockets happy
358
443
  :headers => {"X-API-Version" => API_VERSION}.merge(@auth_client.auth_header),
@@ -392,11 +477,8 @@ module RightScale
392
477
  # Acknowledge event
393
478
  @websocket.send(JSON.dump({:ack => event[:uuid]}))
394
479
 
395
- # Send response, if any
396
- if (result = handler.call(event))
397
- Log.info("Sending EVENT <#{result[:uuid]}> #{result[:type]} #{result[:path]} to #{result[:from]}")
398
- @websocket.send(JSON.dump({:event => result, :routing_keys => [event[:from]]}))
399
- end
480
+ # Handle event
481
+ handler.call(event)
400
482
  @communicated_callbacks.each { |callback| callback.call } if @communicated_callbacks
401
483
  rescue Exception => e
402
484
  Log.error("Failed handling WebSocket event", e, :trace)
@@ -410,30 +492,44 @@ module RightScale
410
492
  # Try to make long-polling request to receive events
411
493
  #
412
494
  # @param [Array, NilClass] routing_keys for event sources of interest with nil meaning all
413
- # @param [Array, NilClass] uuids for events received on previous poll
495
+ # @param [Array, NilClass] event_uuids from previous poll
414
496
  #
415
497
  # @yield [event] required block called each time event received
416
498
  # @yieldparam [Hash] event received
417
499
  #
418
- # @return [Array, NilClass] UUIDs of events received, or nil if none
419
- def try_long_poll(routing_keys, uuids, &handler)
420
- result = nil
500
+ # @return [Array, NilClass, Exception] UUIDs of events received, or nil if none, or Exception if failed
501
+ def try_long_poll(routing_keys, event_uuids, &handler)
421
502
  begin
422
- result = long_poll(routing_keys, uuids, &handler)
423
- @reconnect_interval = RECONNECT_INTERVAL
424
- @communicated_callbacks.each { |callback| callback.call } if @communicated_callbacks
425
- rescue Exceptions::Unauthorized, Exceptions::ConnectivityFailure, Exceptions::RetryableError => e
426
- Log.error("Failed long-polling", e, :no_trace)
427
- sleep(backoff_reconnect_interval)
503
+ long_poll(routing_keys, event_uuids, &handler)
428
504
  rescue Exception => e
429
- Log.error("Failed long-polling", e, :trace)
430
- @stats["exceptions"].track("long-polling", e)
431
- sleep(backoff_reconnect_interval)
505
+ e
432
506
  end
433
- result
507
+ end
508
+
509
+ # Try to make long-polling request to receive events using EM defer thread
510
+ # Only one long-poll is made per call; the result is handed to process_long_poll on the reactor thread
511
+ #
512
+ # @param [Array, NilClass] routing_keys for event sources of interest with nil meaning all
513
+ # @param [Array, NilClass] event_uuids from previous poll
514
+ #
515
+ # @yield [event] required block called each time event received
516
+ # @yieldparam [Hash] event received
517
+ #
518
+ # @return [TrueClass] always true; UUIDs of events received are stored in @event_uuids by the deferred callback
519
+ def try_deferred_long_poll(routing_keys, event_uuids, &handler)
520
+ # Proc for running long-poll in EM defer thread since this is a blocking call
521
+ @defer_operation_proc = Proc.new { try_long_poll(routing_keys, event_uuids, &handler) }
522
+
523
+ # Proc that runs in main EM reactor thread to handle result from above operation proc
524
+ @defer_callback_proc = Proc.new { |result| @event_uuids = process_long_poll(result) }
525
+
526
+ # Use EM defer thread since the long-poll will block
527
+ EM.defer(@defer_operation_proc, @defer_callback_proc)
528
+ true
434
529
  end
435
530
 
436
531
  # Make long-polling request to receive one or more events
532
+ # Do not return until an event is received or the polling times out or fails
437
533
  # Limit logging unless in debug mode
438
534
  #
439
535
  # @param [Array, NilClass] routing_keys as strings to assist router in delivering
@@ -455,18 +551,47 @@ module RightScale
455
551
  params[:routing_keys] = routing_keys if routing_keys
456
552
  params[:ack] = ack if ack && ack.any?
457
553
 
458
- uuids = []
459
- if (events = make_request(:get, "/listen", params, "listen", nil, :log_level => :debug,
460
- :request_timeout => @options[:listen_timeout]))
554
+ options = {
555
+ :log_level => :debug,
556
+ :request_timeout => @connect_interval,
557
+ :poll_timeout => @options[:listen_timeout] }
558
+
559
+ event_uuids = []
560
+ events = make_request(:poll, "/listen", params, "listen", nil, options)
561
+ if events
461
562
  events.each do |event|
462
563
  event = SerializationHelper.symbolize_keys(event)
463
564
  Log.info("Received EVENT <#{event[:uuid]}> #{event[:type]} #{event[:path]} from #{event[:from]}")
464
565
  @stats["events"].update("#{event[:type]} #{event[:path]}")
465
- uuids << event[:uuid]
566
+ event_uuids << event[:uuid]
466
567
  handler.call(event)
467
568
  end
468
569
  end
469
- uuids if uuids.any?
570
+ event_uuids if event_uuids.any?
571
+ end
572
+
573
+ # Process result from long-polling attempt
574
+ #
575
+ # @param [Array, NilClass, Exception] result from long-polling attempt, with an Exception meaning it failed
576
+ #
577
+ # @return [Array, NilClass] UUIDs of events received, or nil if none or if the long-polling attempt failed
578
+ def process_long_poll(result)
579
+ case result
580
+ when Exceptions::Unauthorized, Exceptions::ConnectivityFailure, Exceptions::RetryableError, Exceptions::InternalServerError
581
+ Log.error("Failed long-polling", result, :no_trace)
582
+ update_listen_state(:choose, backoff_reconnect_interval)
583
+ result = nil
584
+ when Exception
585
+ Log.error("Failed long-polling", result, :trace)
586
+ @stats["exceptions"].track("long-polling", result)
587
+ update_listen_state(:choose, backoff_reconnect_interval)
588
+ result = nil
589
+ else
590
+ @reconnect_interval = RECONNECT_INTERVAL
591
+ @communicated_callbacks.each { |callback| callback.call } if @communicated_callbacks
592
+ update_listen_state(:choose)
593
+ end
594
+ result
470
595
  end
471
596
 
472
597
  # Exponentially increase WebSocket connect attempt interval after failing to connect
@@ -35,7 +35,7 @@ module RightScale
35
35
  # Default maximum number of consecutive ping timeouts before attempt to reconnect
36
36
  MAX_PING_TIMEOUTS = 3
37
37
 
38
- # (EM::Timer) Timer while waiting for RightNet router ping response
38
+ # Timer while waiting for RightNet router ping response
39
39
  attr_accessor :ping_timer
40
40
 
41
41
  def initialize(sender, check_interval, ping_stats, exception_stats)
@@ -0,0 +1,70 @@
1
+ #
2
+ # Copyright (c) 2014 RightScale Inc
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining
5
+ # a copy of this software and associated documentation files (the
6
+ # "Software"), to deal in the Software without restriction, including
7
+ # without limitation the rights to use, copy, modify, merge, publish,
8
+ # distribute, sublicense, and/or sell copies of the Software, and to
9
+ # permit persons to whom the Software is furnished to do so, subject to
10
+ # the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be
13
+ # included in all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
+
23
+ # Wrap EventMachine to support automatic spawning of a fiber before executing the
24
+ # associated block so that if the block yields its fiber it is not the root fiber
25
+ module EventMachineSpawn
26
+ @fiber_pool = nil
27
+
28
+ def self.fiber_pool
29
+ @fiber_pool
30
+ end
31
+
32
+ def self.fiber_pool=(value)
33
+ @fiber_pool = value
34
+ end
35
+
36
+ def self.execute(&block)
37
+ @fiber_pool ? @fiber_pool.spawn(&block) : yield
38
+ end
39
+
40
+ def self.run(*args, &block)
41
+ EM.run(*args) { @fiber_pool ? @fiber_pool.spawn(&block) : yield }
42
+ end
43
+
44
+ def self.next_tick(*args, &block)
45
+ EM.next_tick(*args) { @fiber_pool ? @fiber_pool.spawn(&block) : yield }
46
+ end
47
+
48
+ def self.add_timer(*args, &block)
49
+ EM.add_timer(*args) { @fiber_pool ? @fiber_pool.spawn(&block) : yield }
50
+ end
51
+
52
+ def self.add_periodic_timer(*args, &block)
53
+ EM.add_periodic_timer(*args) { @fiber_pool ? @fiber_pool.spawn(&block) : yield }
54
+ end
55
+
56
+ class Timer
57
+ def self.new(*args, &block)
58
+ EM::Timer.new(*args) { EM_S.fiber_pool ? EM_S.fiber_pool.spawn(&block) : yield }
59
+ end
60
+ end
61
+
62
+ class PeriodicTimer
63
+ def self.new(*args, &block)
64
+ EM::PeriodicTimer.new(*args) { EM_S.fiber_pool ? EM_S.fiber_pool.spawn(&block) : yield }
65
+ end
66
+ end
67
+ end
68
+
69
+ # Alias for EventMachineSpawn
70
+ EM_S = EventMachineSpawn