chore-core 1.8.2 → 3.2.3
- checksums.yaml +5 -5
- data/README.md +6 -0
- data/chore-core.gemspec +1 -0
- data/lib/chore.rb +11 -5
- data/lib/chore/cli.rb +21 -2
- data/lib/chore/consumer.rb +15 -5
- data/lib/chore/fetcher.rb +12 -7
- data/lib/chore/hooks.rb +2 -1
- data/lib/chore/job.rb +17 -0
- data/lib/chore/manager.rb +18 -2
- data/lib/chore/queues/filesystem/consumer.rb +116 -59
- data/lib/chore/queues/filesystem/filesystem_queue.rb +19 -0
- data/lib/chore/queues/filesystem/publisher.rb +12 -18
- data/lib/chore/queues/sqs/consumer.rb +6 -21
- data/lib/chore/strategies/consumer/batcher.rb +8 -9
- data/lib/chore/strategies/consumer/threaded_consumer_strategy.rb +3 -1
- data/lib/chore/strategies/consumer/throttled_consumer_strategy.rb +121 -0
- data/lib/chore/strategies/worker/forked_worker_strategy.rb +5 -6
- data/lib/chore/strategies/worker/helpers/ipc.rb +88 -0
- data/lib/chore/strategies/worker/helpers/preforked_worker.rb +163 -0
- data/lib/chore/strategies/worker/helpers/work_distributor.rb +65 -0
- data/lib/chore/strategies/worker/helpers/worker_info.rb +13 -0
- data/lib/chore/strategies/worker/helpers/worker_killer.rb +40 -0
- data/lib/chore/strategies/worker/helpers/worker_manager.rb +183 -0
- data/lib/chore/strategies/worker/preforked_worker_strategy.rb +150 -0
- data/lib/chore/strategies/worker/single_worker_strategy.rb +35 -13
- data/lib/chore/unit_of_work.rb +8 -0
- data/lib/chore/util.rb +5 -1
- data/lib/chore/version.rb +3 -3
- data/lib/chore/worker.rb +29 -0
- data/spec/chore/cli_spec.rb +2 -2
- data/spec/chore/consumer_spec.rb +0 -4
- data/spec/chore/duplicate_detector_spec.rb +17 -5
- data/spec/chore/fetcher_spec.rb +0 -11
- data/spec/chore/manager_spec.rb +7 -0
- data/spec/chore/queues/filesystem/filesystem_consumer_spec.rb +71 -11
- data/spec/chore/queues/sqs/consumer_spec.rb +1 -3
- data/spec/chore/strategies/consumer/batcher_spec.rb +50 -0
- data/spec/chore/strategies/consumer/threaded_consumer_strategy_spec.rb +1 -0
- data/spec/chore/strategies/consumer/throttled_consumer_strategy_spec.rb +165 -0
- data/spec/chore/strategies/worker/forked_worker_strategy_spec.rb +16 -1
- data/spec/chore/strategies/worker/helpers/ipc_spec.rb +127 -0
- data/spec/chore/strategies/worker/helpers/preforked_worker_spec.rb +236 -0
- data/spec/chore/strategies/worker/helpers/work_distributor_spec.rb +131 -0
- data/spec/chore/strategies/worker/helpers/worker_info_spec.rb +14 -0
- data/spec/chore/strategies/worker/helpers/worker_killer_spec.rb +97 -0
- data/spec/chore/strategies/worker/helpers/worker_manager_spec.rb +304 -0
- data/spec/chore/strategies/worker/preforked_worker_strategy_spec.rb +183 -0
- data/spec/chore/strategies/worker/single_worker_strategy_spec.rb +25 -0
- data/spec/chore/worker_spec.rb +69 -1
- metadata +33 -5
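The headline change is a new preforked worker strategy: a master process forks a fixed pool of workers, hands jobs to them over UNIX sockets, and recycles any worker that exceeds a memory or request limit. The four largest new files are reproduced below, matched to the entries above by their line counts. As a hedged sketch of how the strategy might be enabled (the option names are inferred from the `Chore.config` reads in the code below and from chore's existing `worker_strategy` setting; check the shipped README before relying on them):

```ruby
require 'chore'

# Illustrative configuration only; every option here is an assumption
# inferred from the Chore.config accessors used by the new helpers.
Chore.configure do |c|
  c.worker_strategy    = Chore::Strategy::PreforkedWorkerStrategy
  c.num_workers        = 8                  # size of the forked pool
  c.memory_limit_bytes = 512 * 1024 * 1024  # recycle a worker past ~512 MB RSS
  c.request_limit      = 10_000             # ...or after 10,000 jobs
  c.worker_check_cycle = 16                 # sample RSS every 16th job
  c.shutdown_timeout   = 120                # seconds workers get to wind down
end
```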
@@ -0,0 +1,163 @@ data/lib/chore/strategies/worker/helpers/preforked_worker.rb (new file)

```ruby
require 'chore/signal'
require 'socket'
require 'timeout'
require 'chore/strategies/worker/helpers/ipc'

module Chore
  module Strategy
    class PreforkedWorker #:nodoc:
      include Util
      include Ipc

      def initialize(_opts = {})
        Chore.logger.info "PFW: #{Process.pid} initializing"
        @manager_pid = Process.ppid
        @consumer_cache = {}
        @running = true
        post_fork_setup
      end

      def start_worker(master_socket)
        Chore.logger.info 'PFW: Worker starting'
        raise 'PFW: Did not get master_socket' unless master_socket
        connection = connect_to_master(master_socket)
        worker(connection)
      rescue => e
        Chore.logger.error "PFW: Shutting down #{e.message} #{e.backtrace}"
        raise e
      end

      private

      def worker(connection)
        worker_killer = WorkerKiller.new
        while running?
          # Wait for the connection to the master to become readable
          readables, _, _ = select_sockets(connection, nil, Chore.config.shutdown_timeout)

          next if readables.nil? # timeout

          read_socket = readables.first

          # Get the work from the connection to the master
          work = read_msg(read_socket)

          # When the master (manager process) dies, the sockets are set to
          # readable, but there is no data in the socket. In this case we
          # check whether the manager is actually dead, and if so, we exit.
          if work.nil? && is_orphan?
            Chore.logger.info 'PFW: Manager no longer alive; shutting down'
            break
          end

          unless work.nil?
            # Do the work
            process_work(work)

            worker_killer.check_requests
            worker_killer.check_memory

            # Alert the master that this worker is ready to receive more work
            signal_ready(read_socket)
          end
        end
      rescue Errno::ECONNRESET, Errno::EPIPE
        Chore.logger.info "PFW: Worker-#{Process.pid} lost connection to master, shutting down"
      ensure
        Chore.logger.info 'PFW: Worker process terminating'
        exit(true)
      end

      # Method wrapper around @running makes it easier to write specs
      def running?
        @running
      end

      # Connects to the master socket, sends its PID, sends a ready-for-work
      # message, and returns the connection
      def connect_to_master(master_socket)
        Chore.logger.info 'PFW: connect protocol started'
        child_connection(master_socket).tap do |conn|
          send_msg(conn, Process.pid)
          signal_ready(conn)
          Chore.logger.info 'PFW: connect protocol completed'
        end
      end

      def post_fork_setup
        # Immediately swap out the process name so that it doesn't look like
        # the master process
        procline("#{Chore.config.worker_procline}:Started:#{Time.now}")

        # We need to reset the logger after fork. This fixes a longstanding
        # bug where workers would hang around and never die
        Chore.logger = nil

        config = Chore.config
        # When we fork, the consumers/publishers need their connections reset.
        # The specifics of this are queue dependent, and may result in a noop.
        config.consumer.reset_connection!
        # It is possible for this to be nil due to configuration woes with chore
        config.publisher.reset_connection! if Chore.config.publisher

        # Ensure that all signals are handled before we hand off a hook to
        # the application.
        trap_signals

        Chore.run_hooks_for(:after_fork, self)
      end

      def process_work(work)
        work = [work] unless work.is_a?(Array)
        work.each do |item|
          item.consumer = consumer(item.queue_name)
          begin
            Timeout.timeout(item.queue_timeout) do
              worker = Worker.new(item)
              worker.start
            end
          rescue Timeout::Error => ex
            Chore.logger.info "PFW: Worker #{Process.pid} timed out"
            Chore.logger.info "PFW: Worker timeout set at #{item.queue_timeout} seconds"
            raise ex
          end
        end
      end

      # We need to reuse Consumer objects because it takes 500ms to recreate
      # each one.
      def consumer(queue)
        unless @consumer_cache.key?(queue)
          raise Chore::TerribleMistake if @consumer_cache.size >= Chore.config.queues.size
          @consumer_cache[queue] = Chore.config.consumer.new(queue)
        end
        @consumer_cache[queue]
      end

      def trap_signals
        Signal.reset

        [:INT, :QUIT, :TERM].each do |signal|
          Signal.trap(signal) do
            Chore.logger.info "PFW: received signal: #{signal}"
            @running = false
            sleep(Chore.config.shutdown_timeout)
            Chore.logger.info 'PFW: Worker process terminating'
            exit(true)
          end
        end

        Signal.trap(:USR1) do
          Chore.reopen_logs
          Chore.logger.info 'PFW: Worker process reopened log'
        end
      end

      def is_orphan?
        Process.ppid != @manager_pid
      end
    end
  end
end
```
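PreforkedWorker leans on the `Ipc` helper for `child_connection`, `send_msg`, `read_msg`, `signal_ready`, and `select_sockets`. That helper is also new in this release (`data/lib/chore/strategies/worker/helpers/ipc.rb`, +88 above) but is not reproduced on this page. Purely as an illustration of the kind of framing such a helper needs, not the gem's actual implementation, messages on a stream socket can be length-prefixed and Marshal-encoded so that `read_msg` returning `nil` cleanly signals a closed peer:

```ruby
require 'socket'

# Illustrative stand-in for the omitted Ipc mixin, NOT chore's real code:
# each message is a 4-byte big-endian length followed by a Marshal payload.
module MiniIpc
  def send_msg(socket, msg)
    payload = Marshal.dump(msg)
    socket.write([payload.bytesize].pack('N') + payload)
  end

  def read_msg(socket)
    header = socket.read(4)
    return nil if header.nil? # peer closed the connection
    Marshal.load(socket.read(header.unpack('N').first))
  end
end
```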
@@ -0,0 +1,65 @@ data/lib/chore/strategies/worker/helpers/work_distributor.rb (new file)

```ruby
require 'chore/strategies/worker/helpers/ipc'

module Chore
  module Strategy
    class WorkDistributor #:nodoc:
      class << self
        include Ipc

        def fetch_and_assign_jobs(workers, manager)
          jobs = manager.fetch_work(workers.size)
          raise "DW: jobs needs to be a list, got #{jobs.class}" unless jobs.is_a?(Array)
          if jobs.empty?
            # This condition is due to the internal consumer queue being
            # empty. Assuming that the consumer has to fetch from an external
            # queue, returning here immediately would create a tight loop that
            # would use up a lot of the CPU's time. To prevent that, we sleep
            # and wait for the consumer queue to be populated.
            sleep(0.1)
            return
          end
          jobs_to_return = assign_jobs(jobs, workers)
          manager.return_work(jobs_to_return)
        end

        private

        def assign_jobs(jobs, workers)
          raise 'DW: assign_jobs got 0 workers' if workers.empty?
          jobs_to_return = []
          jobs.each_with_index do |job, i|
            raise 'DW: More Jobs than Sockets' if workers[i].nil?
            unless push_job_to_worker(job, workers[i])
              jobs_to_return << job
            end
          end

          jobs_to_return
        end

        def push_job_to_worker(job, worker)
          Chore.run_hooks_for(:before_send_to_worker, job)
          clear_ready(worker.socket)
          send_msg(worker.socket, job)
          true
        rescue => e
          Chore.logger.error "DW: Could not assign job #{job.inspect} (worker: #{worker.pid})\nException #{e.message} #{e.backtrace * "\n"}"

          # We generally shouldn't get into this situation, since we've
          # already tested that we can read/write to the worker's socket.
          # However, the worker could still fail between that check and
          # pushing the job, so we need to allow the work to be re-assigned
          # to handle that edge case.
          false
        end

        # Wrapper around Kernel.sleep, used for unit tests
        def sleep(n)
          Kernel.sleep(n)
        end
      end
    end
  end
end
```
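`WorkDistributor` is stateless: callers hand it the currently idle workers plus a manager that can produce work (`fetch_work`) and take back anything that could not be delivered (`return_work`). A hypothetical driving loop, with `manager` and `worker_manager` as stand-ins for objects the strategy class (not shown on this page) would provide:

```ruby
# Hypothetical scheduling tick; the real loop lives in
# preforked_worker_strategy.rb, which this page does not reproduce.
loop do
  readable, = IO.select(worker_manager.worker_sockets, nil, nil, 1)
  next if readable.nil? # no worker signalled ready within the timeout

  worker_manager.ready_workers(readable) do |idle_workers|
    # Fetch at most idle_workers.size jobs; undeliverable ones are handed
    # back to the manager inside fetch_and_assign_jobs itself.
    Chore::Strategy::WorkDistributor.fetch_and_assign_jobs(idle_workers, manager)
  end
end
```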
@@ -0,0 +1,40 @@ data/lib/chore/strategies/worker/helpers/worker_killer.rb (new file)

```ruby
require 'get_process_mem'

module Chore
  module Strategy
    class WorkerKiller #:nodoc:
      def initialize
        @memory_limit = Chore.config.memory_limit_bytes
        @request_limit = Chore.config.request_limit
        @check_cycle = Chore.config.worker_check_cycle || 16
        @check_count = 0
        @current_requests = 0
      end

      def check_memory
        return if @memory_limit.nil? || (@memory_limit == 0)
        @check_count += 1

        if @check_count == @check_cycle
          rss = GetProcessMem.new.bytes.to_i
          if rss > @memory_limit
            Chore.logger.info "WK: (pid: #{Process.pid}) exceeded memory limit (#{rss} bytes > #{@memory_limit} bytes)"
            Chore.run_hooks_for(:worker_mem_kill)
            exit(true)
          end
          @check_count = 0
        end
      end

      def check_requests
        return if @request_limit.nil? || (@request_limit == 0)

        if (@current_requests += 1) >= @request_limit
          Chore.logger.info "WK: (pid: #{Process.pid}) exceeded max number of requests (limit: #{@request_limit})"
          Chore.run_hooks_for(:worker_req_kill)
          exit(true)
        end
      end
    end
  end
end
```
@@ -0,0 +1,183 @@ data/lib/chore/strategies/worker/helpers/worker_manager.rb (new file)

```ruby
require 'chore/strategies/worker/helpers/ipc'

module Chore
  module Strategy
    class WorkerManager #:nodoc:
      include Ipc

      def initialize(master_socket)
        @master_socket = master_socket
        @pid_to_worker = {}
        @socket_to_worker = {}
      end

      # Create the missing workers and their sockets, and attach them to the
      # master
      def create_and_attach_workers
        create_workers do |num_workers|
          attach_workers(num_workers)
        end
      end

      # Reap dead workers and create new ones to replace them
      def respawn_terminated_workers!
        Chore.logger.info 'WM: Respawning terminated workers'
        reap_workers
        create_and_attach_workers
      end

      # Stop children with the given kill signal and wait for them to die
      def stop_workers(sig)
        @pid_to_worker.each do |pid, worker|
          begin
            Chore.logger.info { "WM: Sending #{sig} to: #{pid}" }
            Process.kill(sig, pid)
          rescue Errno::ESRCH => e
            Chore.logger.error "WM: Signal to children error: #{e}"
          end
        end
        # TODO: Sleep for the shutdown timeout and kill any remaining workers
        reap_workers
      end

      # Return all the worker sockets
      def worker_sockets
        @socket_to_worker.keys
      end

      # Return the workers associated with a given array of sockets.
      # +block+:: A block can be provided to perform tasks on the workers
      # associated with the given sockets
      def ready_workers(sockets = [], &block)
        workers = @socket_to_worker.values_at(*sockets)
        yield workers if block_given?
        workers
      end

      private

      # Creates worker processes until we have the number of workers defined
      # by the configuration. Initializes and starts a worker instance in
      # each of the new processes.
      # +block+:: A block can be provided to run tasks on the number of newly
      # created worker processes.
      def create_workers(&block)
        num_created_workers = 0

        while @pid_to_worker.size < Chore.config.num_workers
          pid = fork do
            run_worker_instance
          end

          Chore.logger.info "WM: created_worker #{pid}"
          # Keep track of the new worker process
          @pid_to_worker[pid] = WorkerInfo.new(pid)
          num_created_workers += 1
        end

        raise 'WM: Not enough workers' if inconsistent_worker_number
        Chore.logger.info "WM: created #{num_created_workers} workers"
        yield num_created_workers if block_given?
        num_created_workers
      end

      # Check that the number of workers registered in the master matches the
      # config
      def inconsistent_worker_number
        Chore.config.num_workers != @pid_to_worker.size
      end

      # Initialize and start a new worker instance
      def run_worker_instance
        PreforkedWorker.new.start_worker(@master_socket)
      ensure
        exit(true)
      end

      # Creates individual sockets for each worker to use and attaches them
      # to the correct worker
      def attach_workers(num)
        Chore.logger.info "WM: Started attaching #{num} workers"

        create_worker_sockets(num).each do |socket|
          begin
            readable, _, _ = select_sockets(socket, nil, 2)

            if readable.nil?
              Chore.logger.info "WM: #{socket} timeout waiting for a worker"
              socket.close
              next
            end

            r_socket = readable.first
            reported_pid = read_msg(r_socket)

            assigned_worker = @pid_to_worker[reported_pid]
            assigned_worker.socket = socket
            @socket_to_worker[socket] = assigned_worker

            Chore.logger.info "WM: Connected #{reported_pid} with #{r_socket}"
          rescue Errno::ECONNRESET
            Chore.logger.info "WM: A worker failed to connect to #{socket}"
            socket.close
            next
          end
        end

        # If the connection from a worker times out, we are unable to
        # associate the process with a connection, so we kill the worker
        # process
        kill_unattached_workers
        Chore.logger.info 'WM: Finished attaching workers'
      end

      # Create +num+ sockets that are available for worker connections
      def create_worker_sockets(num)
        Array.new(num) do
          add_worker_socket
        end
      end

      # Kill workers that failed to connect to the master
      def kill_unattached_workers
        @pid_to_worker.each do |pid, worker|
          next unless worker.socket.nil?
          Chore.logger.info "WM: kill_unattached_workers #{pid}"
          Process.kill('KILL', pid)
        end
      end

      # Wait for terminated workers to die and remove their references from
      # the master
      def reap_workers
        Chore.logger.info 'WM: reaping workers...'
        dead_workers = @pid_to_worker.select do |pid, _worker|
          reap_process(pid)
        end

        dead_workers.each do |pid, worker|
          dead_worker = @pid_to_worker.delete(pid)
          # Workers killed before attaching never received a socket
          dead_worker.socket.close unless dead_worker.socket.nil?
          @socket_to_worker.delete(dead_worker.socket)
          Chore.logger.info "WM: Removed preforked worker:#{worker.pid} - #{worker.socket}"
        end
      end

      # Non-blocking wait for a process to die. Returns whether it stopped.
      def reap_process(pid)
        case Process.wait(pid, Process::WNOHANG)
        when nil # Process is still running
          false
        when pid # Collected the status of this pid
          true
        end
      rescue Errno::ECHILD
        # Child process has already terminated
        true
      end

      # Wrapper around Kernel.fork, used for unit tests
      def fork(&block)
        Kernel.fork(&block)
      end
    end
  end
end
```
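`attach_workers` is the master-side half of the handshake in `PreforkedWorker#connect_to_master`: the child connects, writes its PID, and signals ready; the master reads the PID off the first readable socket and records the pairing in `@pid_to_worker`/`@socket_to_worker`. A condensed lifecycle sketch, with the caveat that the strategy object that actually drives this is not shown on this page and a plain `UNIXServer` stands in for whatever `Ipc` creates:

```ruby
require 'socket'
require 'chore'

# Hypothetical master-process wiring; in the gem the master socket would
# come from the Ipc helper (omitted from this diff), not a bare UNIXServer.
master_socket = UNIXServer.new('/tmp/chore-master.sock')

manager = Chore::Strategy::WorkerManager.new(master_socket)
manager.create_and_attach_workers # fork the pool and run the PID handshake

Signal.trap(:TERM) { manager.stop_workers(:TERM) }

loop do
  manager.respawn_terminated_workers! # reap dead children, fork replacements
  sleep 60
end
```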