RubyGems - scout_agent - Versions diffs - 3.0.7 → 3.1.0 - Mend

scout_agent 3.0.7 → 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

data/CHANGELOG +13 -0
data/README +44 -2
data/Rakefile +1 -1
data/TODO +7 -1
data/lib/scout_agent.rb +26 -1
data/lib/scout_agent/agent/communication_agent.rb +75 -2
data/lib/scout_agent/agent/master_agent.rb +132 -31
data/lib/scout_agent/assignment/queue.rb +3 -0
data/lib/scout_agent/assignment/snapshot.rb +4 -1
data/lib/scout_agent/assignment/start.rb +3 -4
data/lib/scout_agent/assignment/stop.rb +64 -30
data/lib/scout_agent/id_card.rb +23 -12
data/lib/scout_agent/lifeline.rb +129 -19
data/lib/scout_agent/mission.rb +221 -68
data/lib/scout_agent/order.rb +2 -2
data/lib/scout_agent/plan.rb +38 -18
metadata +2 -2

data/lib/scout_agent/assignment/queue.rb CHANGED Viewed

@@ -109,6 +109,9 @@ module ScoutAgent
         # maintain the queue database
         db.maintain
+        status_database.maintain
+        # clean out old logs
+        ScoutAgent.remove_old_log_files(log)
         log.info("Messages queued successfully.")
       end

data/lib/scout_agent/assignment/snapshot.rb CHANGED Viewed

@@ -108,8 +108,11 @@ module ScoutAgent
                        "#{run_time} seconds." )
           end
-          # maintain the snapshots database
+          # maintain the databases
           db.maintain
+          status_database.maintain
+          # clean out old logs
+          ScoutAgent.remove_old_log_files(log)
           log.info("Snapshot complete.")
           this_file.flock(File::LOCK_UN)  # release our snapshot lock

data/lib/scout_agent/assignment/start.rb CHANGED Viewed

@@ -74,10 +74,7 @@ module ScoutAgent
         lifelines = agents.map { |agent| Lifeline.new(agent, log) }
         %w[TERM INT].each do |signal|
           trap(signal) do
-            Thread.new do
-              lifelines.each { |line| line.terminate }
-              Process.waitall
-            end
+            lifelines.each { |line| line.terminate }
           end
         end
         lifelines.each do |line|
@@ -88,6 +85,8 @@ module ScoutAgent
         lifelines.each do |line|
           line.join
         end
+        # wait for all children to obey our stop command
+        Process.waitall
       end
       #######

data/lib/scout_agent/assignment/stop.rb CHANGED Viewed

@@ -13,20 +13,29 @@ module ScoutAgent
     # to terminate.
     #
     class Stop < Assignment
+      #
+      # The number of periods between stop checks after a process has been
+      # signaled.  Checks will be made until it's obvious the process obeyed the
+      # request or until this number of checks has been made.
+      #
+      WAIT_COUNT = 10
+      #
+      # The pause in seconds between stop checks after a process has been
+      # signaled.
+      #
+      WAIT_DELAY = 0.5
       # Runs the stop command.
       def execute
         @agent = IDCard.new(:lifeline)
         if @agent.pid_file.exist?
-          puts "Stopping #{ScoutAgent.proper_agent_name} (PID #{@agent.pid})..."
-          signal_and_wait("TERM")
-          if @agent.pid_file.exist?
-            puts "TERM signal was ignored, sending KILL..."
-            signal_and_wait("KILL")
-            if @agent.pid_file.exist?
-              abort_with_failed_to_stop
-            end
+          signal_all("TERM")
+          if Plan.pid_dir.entries.any? { |pid| pid.to_s =~ /\w+\.pid\z/ }
+            puts "\nTERM signals were ignored, sending KILL signals.\n\n"
+            signal_all("KILL")
+            abort_with_failed_to_stop
           end
-          puts "Stopped."
+          puts "All processes stopped."
         else
           abort_with_not_running_notice
         end
@@ -37,28 +46,49 @@ module ScoutAgent
       #######
       #
-      # Sends +signal_name+ (which is expected to be some kind of stop request
-      # to the <tt>@agent</tt>.  This method will then wait +wait_count+ periods
-      # of +wait_delay+ seconds checking between waits to see if the agent has
-      # complied.  It returns when the agent has exited or the total wait period
-      # has expired.
+      # Sends +signal_name+ (assumed to be a stop request) to all subprocesses
+      # of the agent.  The first process signaled is the lifeline process and it
+      # is given a few seconds to shut everything down smoothly as it is
+      # supposed to do.  If that fails, the stray processes will be sent
+      # +signal_name+ directly.
       #
-      def signal_and_wait(signal_name, wait_count = 10, wait_delay = 0.5)
-        # signal the main process
-        @agent.signal(signal_name)
-        # wait for it to stop
-        wait_count.times do
-          sleep wait_delay
-          break unless @agent.pid_file.exist?
-        end
-        # signal other stray processes
+      def signal_all(signal_name)
+        # start with the Lifeline process as that should properly stop everyone
+        signal_and_wait(@agent, signal_name)
+        # signal any other stray processes
         Plan.pid_dir.each_entry do |process|
           if process.to_s =~ /(\w+)\.pid\z/
-            IDCard.new($1).signal(signal_name)
+            signal_and_wait(IDCard.new($1), signal_name) unless $1 == "lifeline"
+          end
+        end
+      end
+      #
+      # Sends +signal_name+ (which is expected to be some kind of stop request)
+      # to the +id_card+.  This method will then wait +WAIT_COUNT+ periods of
+      # +WAIT_DELAY+ seconds checking between waits to see if the agent has
+      # complied.  It returns when the signaled process has exited or the total
+      # wait period has expired.  The wait period is skipped for the +KILL+
+      # signal, since the process is not allowed to respond.
+      #
+      def signal_and_wait(id_card, signal_name)
+        puts "Sending #{signal_name} signal to the agent's " +
+             "#{id_card.process_name} process."
+        # signal the main process
+        begin
+          id_card.signal(signal_name)
+        rescue Errno::ECHILD, Errno::ESRCH   # no such process
+          # do nothing:  it stopped
+        rescue Errno::EPERM                  # we don't have permission
+          abort_with_no_permission
+        end
+        unless signal_name == "KILL"  # process cannot respond, so don't wait
+          # wait for it to stop
+          WAIT_COUNT.times do
+            sleep WAIT_DELAY
+            break unless id_card.pid_file.exist?
           end
         end
-      rescue Errno::EPERM  # we don't have permission
-        abort_with_no_permission
       end
       #
@@ -75,6 +105,7 @@ module ScoutAgent
       #
       def abort_with_no_permission
         abort <<-END_PERMISSION.trim
         Unable to signal the daemon.  Please rerun this command with
         super user privileges:
@@ -84,13 +115,16 @@ module ScoutAgent
       end
       #
-      # Abort with an error message to the user that says we don't have enough
-      # permission to stop the agent due to how it was started.
+      # Abort with an error message to the user that says we killed the agent
+      # but there are some stray files hanging around.
       #
       def abort_with_failed_to_stop
         abort <<-END_FAILED.trim
-        Unable to stop the daemon.  You may need to use the PID files
-        in #{Plan.pid_dir} to clean up stay processes.
+        KILL signals were sent to all active processes and they
+        should be stopped now.  You may wish to check the PID
+        files in #{Plan.pid_dir} to be sure.  The agent should
+        clean up these old files as it relaunches.
         END_FAILED
       end
     end

data/lib/scout_agent/id_card.rb CHANGED Viewed

@@ -7,15 +7,26 @@ module ScoutAgent
   # execution and to signal other processes.
   #
   class IDCard
-    class << self
-      #
-      # This global attribute should contain the name of the current process.
-      # It is set during a successful authorization.
-      #
-      # <b>Warning:</b>  Be sure to clear this attribute immediately after a
-      # fork() so you don't keep the parent's identity.
-      #
-      attr_accessor :me
+    #
+    # This global attribute should contain the IDCard for this process. It is
+    # set during a successful authorization.
+    #
+    # <b>Warning:</b>  Be sure to clear this attribute immediately after a
+    # fork(), with a call to me=(), so you don't keep the parent's identity.
+    #
+    def self.me
+      @me ||= nil
+    end
+    #
+    # A setter for the identity of this process.  This is set automatically as
+    # part of an authorization.
+    #
+    # <b>Warning:</b>  Be sure to clear this attribute immediately after a
+    # fork() so you don't keep the parent's identity.
+    #
+    def self.me=(id_card)
+      @me = id_card
     end
     #
@@ -76,10 +87,10 @@ module ScoutAgent
     # <tt>IDCard::me()</tt> has been updated and an exit handle has been
     # installed to revoke() this claim as the process ends.
     #
-    def authorize
+    def authorize(&block)
       File.open(pid_file, File::CREAT | File::EXCL | File::WRONLY) do |pid|
         pid.flock(File::LOCK_EX)
-        if not block_given? or yield  # allows for daemonization
+        if block.nil? or block.call  # allows for daemonization
           pid.puts Process.pid
         else
           pid.flock(File::LOCK_UN)
@@ -105,7 +116,7 @@ module ScoutAgent
               # stale PID file found, clearing it and reloading
               if revoke
                 pid.flock(File::LOCK_UN)  # release the lock before we recurse
-                return authorize          # try again
+                return authorize(&block)  # try again
               end
             rescue Errno::EACCES  # don't have permission
               # nothing we can do so give up

data/lib/scout_agent/lifeline.rb CHANGED Viewed

@@ -2,16 +2,46 @@
 # encoding: UTF-8
 module ScoutAgent
+  #
+  # This class is a monitor for an Agent subprocess of the platform.  It
+  # launches the Agent code and makes sure it continues to check-in at regular
+  # intervals, restarting the subprocess when it fails to do so.
+  #
   class Lifeline
+    #
+    # The number of seconds allowed to pass before the Agent subprocess is
+    # considered unresponsive.
+    #
     NO_CONTACT_TIMEOUT   = 5
-    CHECK_IN_FREQUENCY   = 0.99  # gives us five check-ins before a cutoff
+    #
+    # The frequency with which the subprocess is expected to check-in.  This is
+    # purposely set to a little under a second to give one more check-in
+    # possibility before the <tt>NO_CONTACT_TIMEOUT</tt> is reached.
+    #
+    CHECK_IN_FREQUENCY   = 0.99
+    #
+    # The number of seconds the monitor will wait for a process to exit cleanly
+    # before forcing a stop.
+    #
     TERM_TO_KILL_PAUSE   = 1
+    #
+    # The sequence of seconds this monitor will wait between restarts of the
+    # subprocess.  The initial values are short, to try and get running again as
+    # soon as possible.  However, this timeout grows larger up to a point to
+    # reduce strain on a server experiencing long term problems.  The sequence
+    # will reset after a successful relaunch that runs for at least as long as
+    # the next number in the sequence (or the max).
+    #
     RELAUNCH_FREQUENCIES = [0, 1, 1, 2, 3, 5, 8, 13]
     #################
     ### Interface ###
     #################
+    #
+    # Prepares a monitor for the code specified by +agent+.  You may also set
+    # the +log+ that messages will be appended to.
+    #
     def initialize(agent, log = WireTap.new(nil))
       @agent                       = agent
       @log                         = log
@@ -20,6 +50,7 @@ module ScoutAgent
       @reader                      = nil
       @writer                      = nil
       @launch_and_monitor_thread   = nil
+      @termination_thread          = nil
       @check_in_with_parent_thread = nil
       @code                        = nil
       @last_launch                 = nil
@@ -32,8 +63,13 @@ module ScoutAgent
     include Tracked
+    # The log file this monitor writes tracking information to.
     attr_reader :log
+    #
+    # This method outlines the process used to monitor an Agent.  It is roughly:
+    # launch, monitor, kill as needed, and restart the process.
+    #
     def launch_and_monitor
       @launch_and_monitor_thread = Thread.new do
         Thread.current.abort_on_exception = true
@@ -48,22 +84,38 @@ module ScoutAgent
       end
     end
+    #
+    # Begins a termination of the Agent subprocess in a separate Thread.  This
+    # monitor's join() method will also wait on this termination Thread to
+    # ensure everything gets the order to shutdown before we exit.
+    #
     def terminate
-      if Process.pid == @parent_pid
-        # stop monitoring
-        log.info("Stopping the monitoring for '#{@agent}'.")
-        @launch_and_monitor_thread.exit if @launch_and_monitor_thread
-        # ask child process to exit
-        log.info("Asking '#{@agent}' to stop.")
-        IDCard.new(@agent).signal("TERM")
+      @termination_thread = Thread.new do
+        if Process.pid == @parent_pid
+          # stop monitoring
+          log.info("Stopping the monitoring for '#{@agent}'.")
+          @launch_and_monitor_thread.exit if @launch_and_monitor_thread
+          # ask child process to exit
+          log.info("Asking '#{@agent}' to stop.")
+          IDCard.new(@agent).signal("TERM")
+        end
       end
     rescue Errno::ESRCH  # no such process
       # if already exited, so we are fine
     end
+    #
+    # Waits for the monitor Thread to be stopped by a natural termination before
+    # returning.  If terminate() is called to start the shutdown, this method
+    # will also wait on the Thread spawned by that method to ensure everything
+    # gets the signal to stop.
+    #
     def join
       if Process.pid == @parent_pid and @launch_and_monitor_thread
-        @launch_and_monitor_thread.join
+        @launch_and_monitor_thread.join  # wait on the monitor to stop
+        if @termination_thread
+          @termination_thread.join       # wait on us to stop the subprocess
+        end
       end
     end
@@ -71,10 +123,15 @@ module ScoutAgent
     private
     #######
-    ##############
-    ### Parent ###
-    ##############
+    ###############
+    ### Monitor ###
+    ###############
+    #
+    # This method just rests for the proper amount of time between launches to
+    # ensure we're not overworking the server due to continuing issues.  See
+    # +RELAUNCH_FREQUENCIES+ for details.
+    #
     def wait_for_launch
       if @last_launch
         seconds_ran   = Time.now - @last_launch
@@ -95,11 +152,19 @@ module ScoutAgent
         end
       end
     end
+    #
+    # Creates a two-ended pipe for one way communication from the Agent checking
+    # in with the monitor.
+    #
     def prepare_pipe
       @reader, @writer = IO.pipe
     end
+    #
+    # This method fork()'s the subprocess and outlines the work done there:
+    # loading, starting the check-in Thread, and running the Agent code.
+    #
     def launch_child
       log.info("Launching '#{@agent}'.")
       status(@agent)
@@ -113,7 +178,8 @@ module ScoutAgent
         run_code
       end
     end
+    # Ensure that the writing end of the pipe is closed.
     def close_writer
       @writer.close
     rescue IOError  # already closed
@@ -122,24 +188,36 @@ module ScoutAgent
       # it wasn't set so there's nothing to close
     end
+    #
+    # An infinite loop that just reads check-in messages from the Agent.  This
+    # method will return when the Agent fails to report within
+    # +NO_CONTACT_TIMEOUT+.
+    #
     def monitor_child
       loop do
         check_in = nil
         begin
-          Timeout.timeout(NO_CONTACT_TIMEOUT) { check_in = @reader.gets }
+          Timeout.timeout(NO_CONTACT_TIMEOUT) do
+            check_in = @reader.gets
+          end
           log.error("'#{@agent}' monitor channel has closed.") if check_in.nil?
         rescue Timeout::Error
           # check_in will stay nil
           log.error("'#{@agent}' failed to check-in in time.")
         end
         unless check_in.to_s =~
-               /\A#{@child_pid}:  \d{4}-\d{2}-\d{2} \d{2}:\d{2}\Z/
+               /\A#{@child_pid}:\s*\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}\Z/
           log.error("'#{@agent}' check-in was malformed.") unless check_in.nil?
           break
         end
       end
     end
+    #
+    # This method is called after a monitor cycle fails, so it needs to ensure
+    # the Agent gets shutdown.  It will also stop a running plugin, if found,
+    # for the master agent only.
+    #
     def restart_child
       log.info("Stopping '#{@agent}'.")
       status(@agent, :restarting)
@@ -152,6 +230,10 @@ module ScoutAgent
       end
     end
+    #
+    # This method overrides the base status() setter from Tracked, to add a way
+    # to combine messages when restarting multiple processes.
+    #
     def status(process, restarting = false)
       if db = status_database
         db.write_to_sqlite do |sqlite|
@@ -181,18 +263,27 @@ module ScoutAgent
     end
     #############
-    ### Child ###
+    ### Agent ###
     #############
+    #
+    # Installs appropriate signal handlers for the Agent and clears the identity
+    # of the monitor.
+    #
     def reset_environment
       # swap out our parent's signal handlers
-      install_shutdown_handler { finish_code }
-      trap("ALRM")             { alert_code  }
+      install_shutdown_handler do
+        finish_code
+      end
+      trap("ALRM")             do
+        alert_code
+      end
       # clear the parent's identity
       IDCard.me = nil
     end
+    # Ensure that the reading end of the pipe is closed.
     def close_reader
       @reader.close
     rescue IOError  # already closed
@@ -201,16 +292,29 @@ module ScoutAgent
       # it wasn't set so there's nothing to close
     end
+    #
+    # Loads the code for the Agent to be monitored and fetches the object built
+    # by that code.
+    #
     def load_code
       require LIB_DIR + "agent"
       require LIB_DIR + "agent/#{@agent}_agent"
       @code = ScoutAgent::Agent.const_get("#{@agent.CamelCase}Agent").new
     end
+    #
+    # Ensures that this Agent is authorized to run because a copy is not
+    # currently active.
+    #
     def authorize_code
       @code.authorize
     end
+    #
+    # An infinite loop that just writes check-in messages to the monitoring
+    # process.  This code will trigger its own shutdown if the parent
+    # disappears (closing the pipe).
+    #
     def check_in_with_parent
       @check_in_with_parent_thread = Thread.new do
         Thread.current.abort_on_exception = true
@@ -226,10 +330,15 @@ module ScoutAgent
       end
     end
+    # Invokes the main code of the Agent.
     def run_code
       @code.run
     end
+    #
+    # Closes the pipe and invokes the finishing code of the Agent in a separate
+    # Thread.
+    #
     def finish_code
       close_writer
@@ -242,6 +351,7 @@ module ScoutAgent
       end
     end
+    # Invokes the code for the Agent to notice changes from the outside world.
     def alert_code
       if @code
         Thread.new do