trident 0.4.2 → 0.5.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 368b964b09f272043b062c0220aa86140d69fd90
-  data.tar.gz: 83985e089b57fed096b1880f7a854383e4d2233d
+  metadata.gz: b1f99b2b31837fe6cd9fee43af02ebaaae7afe1f
+  data.tar.gz: 87df8e84d60b0162c5575eb6b2235373e5a03e2a
 SHA512:
-  metadata.gz: 9d87f08599ae67424158561c90292075bd2db709dcff5e49abd5de61b66e66e982c7cb3ff9051dea975bba527d028f674693c7096331ba0774b9b9dcf95a6d6f
-  data.tar.gz: 5e4b5003ae49958e13f3a0527f44b7214bb4c0301a261ea51149070da089e9161314dff36965f43f1a4dbf58f3c72a63abe26600e713310c41de14b4abd8c50d
+  metadata.gz: 3f7053abe1c7276399377d9cbf8641dceb246202f216cb21d92f93e3f04d1950927b73c712126c690fbb17e0553b8db5737fa900d201fe8246e5a8441163344b
+  data.tar.gz: b0a2226f1931c27c736f14394cdf673f678fc4dd29566a6cad91e56e6ee15bf949e26a4049b8cb53a84fd93f889fd6d2bd90fd4c0bc7db7580e295ac15913077
data/.travis.yml CHANGED
@@ -3,6 +3,12 @@ cache: bundler
 rvm:
   - 1.9.3
   - 2.0.0
-  # - rbx-19mode
+  - 2.1.1
+  - rbx
 
 script: bundle exec rake
+
+matrix:
+  allow_failures:
+    - rvm: rbx
+
data/CHANGELOG CHANGED
@@ -1,3 +1,8 @@
+0.5.0 (07/07/2014)
+------------------
+
+Merge pull request #1 from backupify/orphan_workers <92a5c49> [Gregory Salmon] [james-lawrence]
+
 0.4.2 (11/20/2013)
 ------------------
 
data/Gemfile CHANGED
@@ -7,3 +7,4 @@ gemspec
 gem 'coveralls', :require => false
 
 gem "mocha", :require => false
+gem 'rubymine_minitest_spec', :git => 'git@github.com:backupify/rubymine_minitest_spec.git'
data/README.md CHANGED
@@ -31,9 +31,18 @@ See other command line options with
 
     trident --help
 
+## Orphaned workers
+The ability to track pool processes across a restart allows a restart to spin
+up new processes as the old ones die off gracefully.
+
+Limitation: any process whose pid matches one recorded by a previous pool is
+treated as orphaned if it can be signalled from the pool process. To get
+around this, run the pool as a different user, which prevents the pool from
+signalling the process.
 
 
 ## TODO
 
 * Add support for reloading the trident config with a HUP signal
 * Add support in yml for specifying [process limits](http://www.ruby-doc.org/core-1.9.3/Process.html#method-c-setrlimit) (memory especially)
-* Add ability to track pool processes across a restart (or maybe only across a HUP) - allows a restart to spin up new processes as old ones die off gracefully.
+* Add support for killing off orphans/processes that have been running for an excessively long (configurable) time.
+
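
The "can be signalled" check in the limitation above is the standard kill-0 probe that the pool's orphan cleanup uses. A minimal standalone sketch of that probe (the helper name signallable? is ours, not part of trident):

    # Signal 0 sends nothing; it only performs the existence/permission check.
    def signallable?(pid)
      Process.kill(0, pid)
      true
    rescue Errno::ESRCH  # no process with that pid
      false
    rescue Errno::EPERM  # pid exists but belongs to another user
      false
    end

Running the pool as a different user turns the probe into Errno::EPERM, which is why a recycled pid from an old pool is then no longer adopted as an orphan.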
data/lib/trident.rb CHANGED
@@ -6,3 +6,4 @@ require "trident/pool"
 require "trident/pool_handler"
 require "trident/pool_manager"
 require "trident/signal_handler"
+require "trident/worker"
data/lib/trident/cli.rb CHANGED
@@ -123,10 +123,10 @@ module Trident
 
         next if pool_filter.size > 0 && ! pool_filter.include?(name)
 
-        pool = Trident::Pool.new(name, handler, pool_config['size'], pool_config['options'])
+        pool = Trident::Pool.new(name, handler, pool_config)
         pools[name] = pool
       end
       pools
     end
   end
-end
+end
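
With this change the caller hands over the whole pool_config hash, and Pool#initialize extracts 'size' (defaulting to 2) and 'pids_dir' itself. A hedged sketch of the two call styles, where handler stands in for a Trident::PoolHandler and the values are illustrative:

    # 0.4.2: size and options were separate positional arguments
    Trident::Pool.new("mypool1", handler, 4, "name" => "one")

    # 0.5.0: a single config hash; 'size' and 'pids_dir' are pulled out
    # by Pool#initialize, the rest becomes the pool's options
    Trident::Pool.new("mypool1", handler, "size" => 4, "pids_dir" => "/tmp/mypool1")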
data/lib/trident/pool.rb CHANGED
@@ -3,14 +3,35 @@ module Trident
     include GemLogger::LoggerSupport
     include Trident::Utils
 
-    attr_reader :name, :handler, :size, :options, :workers
+    attr_reader :name, :handler, :size, :options, :workers, :orphans, :orphans_dir
 
-    def initialize(name, handler, size, options={})
+    def initialize(name, handler, options={})
       @name = name
       @handler = handler
-      @size = size
+      @size = options.delete('size') || 2
       @options = options || {}
       @workers = Set.new
+      @orphans_dir = options.delete('pids_dir') || File.join(Dir.pwd, 'trident-pools', name, 'pids')
+      @orphans = load_orphans(orphans_dir)
+    end
+
+    def load_orphans(path_to_orphans_dir)
+      unless File.exists?(path_to_orphans_dir)
+        FileUtils.mkdir_p(path_to_orphans_dir)
+      end
+
+      orphans = Set.new
+
+      Dir.foreach(path_to_orphans_dir) do |file|
+        path = File.join(path_to_orphans_dir, file)
+        next if File.directory?(path)
+
+        pid = Integer(IO.read(path))
+        orphan_worker = Worker.new(pid, self)
+        orphans << orphan_worker
+      end
+
+      orphans
     end
 
     def start
@@ -38,35 +59,97 @@ module Trident
       logger.info "<pool-#{name}> Pool up to date"
     end
 
+    # @return [Boolean] true iff total_workers_count > size,
+    #   false otherwise
+    def above_threshold?
+      size < total_workers_count
+    end
+
+    # @return [Boolean] true iff total_workers_count == size,
+    #   false otherwise
+    def at_threshold?
+      size == total_workers_count
+    end
+
+    # @return [Boolean] true iff workers.size > 0,
+    #   false otherwise
+    def has_workers?
+      workers.size > 0
+    end
+
+    # @return [Integer] total number of workers, including orphaned
+    #   workers
+    def total_workers_count
+      workers.size + orphans.size
+    end
+
     private
 
     def maintain_worker_count(kill_action)
+      cleanup_orphaned_workers
       cleanup_dead_workers(false)
 
-      if size > workers.size
-        spawn_workers(size - workers.size)
-      elsif size < workers.size
-        kill_workers(workers.size - size, kill_action)
+      if at_threshold?
+        logger.debug "<pool-#{name}> Worker count is correct."
+      # If we are above the threshold and we have workers,
+      # reduce the number of workers.
+      elsif above_threshold? && has_workers?
+        overthreshold = total_workers_count - size
+        workers_to_kill = [overthreshold, workers.size].min
+
+        logger.info("<pool-#{name}> Total workers #{total_workers_count} above threshold #{size}, killing #{workers_to_kill}.")
+        kill_workers(workers_to_kill, kill_action)
+      # If we are above the threshold but have no workers of our own,
+      # we can't do anything, but let's log a message indicating
+      # this state.
+      elsif above_threshold?
+        logger.info("<pool-#{name}> Waiting on orphans before spawning workers.")
+      # If the sum of the workers and orphaned workers is under our
+      # size requirement, spawn the number of workers required to
+      # reach that size.
       else
-        logger.debug "<pool-#{name}> Worker count is correct"
+        logger.info("<pool-#{name}> Orphans #{orphans.size}, Workers #{workers.size}")
+        spawn_workers(size - total_workers_count)
+      end
+    end
+
+    # Remove orphaned workers which are either not running
+    # or which we don't have permission to signal (thereby telling us they
+    # were never a part of the pool)
+    def cleanup_orphaned_workers
+      orphans.clone.each do |worker|
+        begin
+          # Check if the process is running
+          Process.kill(0, worker.pid)
+        rescue Errno::EPERM, Errno::ESRCH => e
+          # If we get EPERM (permission error) or ESRCH (no process with
+          # that pid), stop tracking that worker
+          logger.info("<pool-#{name}> Cleaning up orphaned worker #{worker.pid} because #{e.class.name}: #{e.message}")
+          orphans.delete(worker)
+          worker.destroy
+        rescue => e
+          # Make sure we catch any unexpected errors when signalling the process.
+          logger.error("<pool-#{name}> Failed cleaning up worker #{worker.pid} because #{e.class.name}: #{e.message}")
+        end
       end
     end
 
     def cleanup_dead_workers(blocking=true)
       wait_flags = blocking ? 0 : Process::WNOHANG
-      workers.clone.each do |pid|
+      workers.clone.each do |worker|
         begin
-          wpid = Process.wait(pid, wait_flags)
+          if Process.wait(worker.pid, wait_flags)
+            workers.delete(worker)
+          end
         rescue Errno::EINTR
           logger.warn("<pool-#{name}> Interrupted cleaning up workers, retrying")
           retry
         rescue Errno::ECHILD
           logger.warn("<pool-#{name}> Error cleaning up workers, ignoring")
-          # Calling process.wait on a pid that was already waited on throws
-          # a ECHLD, so may as well remove it from our list of workers
-          wpid = pid
+          # Calling Process.wait on a pid that was already waited on throws
+          # an ECHILD, so may as well remove it from our list of workers
+          workers.delete(worker)
         end
-        workers.delete(wpid) if wpid
       end
     end
 
@@ -79,30 +162,40 @@ module Trident
 
     def kill_workers(count, action)
       logger.info "<pool-#{name}> Killing #{count} workers with #{action}"
-      workers.to_a[-count, count].each do |pid|
-        kill_worker(pid, action)
+      workers.to_a[-count, count].each do |worker|
+        kill_worker(worker, action)
       end
     end
 
     def spawn_worker
       pid = fork do
-        procline "pool-#{name}-worker", "starting handler #{handler.name}"
-        Trident::SignalHandler.reset_for_fork
-        handler.load
-        handler.start(options)
+        begin
+          procline "pool-#{name}-worker", "starting handler #{handler.name}"
+          Trident::SignalHandler.reset_for_fork
+          handler.load
+          handler.start(options)
+        ensure
+          worker = Worker.new(Process.pid, self)
+          worker.destroy
+        end
       end
-      workers << pid
+
+      worker = Worker.new(pid, self)
+      worker.save
+
+      workers << worker
       logger.info "<pool-#{name}> Spawned worker #{pid}, worker count now at #{workers.size}"
     end
 
-    def kill_worker(pid, action)
+    def kill_worker(worker, action)
       sig = handler.signal_for(action)
       raise "<pool-#{name}> No signal for action: #{action}" unless sig
-      logger.info "<pool-#{name}> Sending signal to worker: #{pid}/#{sig}/#{action}"
-      Process.kill(sig, pid)
-      workers.delete(pid)
-      logger.info "<pool-#{name}> Killed worker #{pid}, worker count now at #{workers.size}"
-    end
+      logger.info "<pool-#{name}> Sending signal to worker: #{worker.pid}/#{sig}/#{action}"
+      Process.kill(sig, worker.pid)
+
+      workers.delete(worker)
 
+      logger.info "<pool-#{name}> Killed worker #{worker.pid}, worker count now at #{workers.size}"
+    end
   end
 end
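
The rewritten maintain_worker_count counts orphans toward the pool size but only ever signals processes it forked itself. A worked example of the arithmetic with illustrative numbers:

    size    = 4     # configured pool size
    workers = 3     # processes this pool forked
    orphans = 2     # pids adopted from a previous pool's pidfiles

    total = workers + orphans                      # => 5, so above_threshold? is true
    workers_to_kill = [total - size, workers].min  # => [1, 3].min => 1

Only one of the pool's own workers is killed; a pool with no workers of its own falls into the "Waiting on orphans" branch and spawns nothing until the orphans exit.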
data/lib/trident/pool_handler.rb CHANGED
@@ -26,6 +26,5 @@ module Trident
     def signal_for(action)
       signal_mappings[action] || signal_mappings['default'] || "SIGTERM"
     end
-
   end
-end
+end
data/lib/trident/pool_manager.rb CHANGED
@@ -50,7 +50,7 @@ module Trident
     private
 
     def procline_display
-      pools.collect {|pool| "#{pool.name}#{pool.workers.to_a.inspect}" }.join(" ")
+      pools.collect {|pool| "#{pool.name}#{pool.workers.to_a.collect(&:pid)}" }.join(" ")
     end
 
     def load_handlers
@@ -71,4 +71,4 @@ module Trident
     end
 
   end
-end
+end
data/lib/trident/version.rb CHANGED
@@ -1,3 +1,3 @@
 module Trident
-  VERSION = "0.4.2"
+  VERSION = "0.5.0"
 end
data/lib/trident/worker.rb ADDED
@@ -0,0 +1,33 @@
+module Trident
+  # @param [Integer] pid - pid of the worker process
+  # @param [Trident::Pool] pool - pool managing the worker process.
+  class Worker < Struct.new(:pid, :pool)
+    # Create a pidfile for this worker so that
+    # we may track it
+    def save
+      File.open(path, 'w') do |f|
+        f << "#{pid}"
+      end
+    end
+
+    # Remove the pidfile associated with this
+    # worker
+    def destroy
+      FileUtils.rm path if File.exists?(path)
+    end
+
+    # We determine the time that this worker was
+    # created from the creation timestamp on its
+    # pidfile
+    def created_at
+      @created_at ||= File.stat(path).ctime
+    end
+
+    protected
+
+    # Path to this worker's pid file
+    def path
+      File.join(pool.orphans_dir, "#{pid}.pid")
+    end
+  end
+end
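
A short usage sketch of the new Worker struct. FakePool is a stand-in here; a real Trident::Pool satisfies the same orphans_dir interface:

    require 'fileutils'
    require 'trident'

    FakePool = Struct.new(:orphans_dir)
    pool = FakePool.new('/tmp/mypool1/pids')
    FileUtils.mkdir_p(pool.orphans_dir)

    worker = Trident::Worker.new(12345, pool)
    worker.save          # writes /tmp/mypool1/pids/12345.pid
    worker.created_at    # ctime of the pidfile, used to age the worker
    worker.destroy       # removes the pidfile once the process is gone

Because Worker is a Struct of (pid, pool), two instances built from the same pid and pool compare equal, which is what lets the pool's Set operations match a worker reconstructed from a pidfile.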
data/test/fixtures/integration_project/config/trident.yml CHANGED
@@ -40,6 +40,9 @@ pools:
     # options passed to each handler's initializer (merged into handler options above)
     options:
      name: one
+    # directory for storing child pids
+    pids_dir: '/tmp/mypool1'
+
   mypool2:
     # number of worker processes
     size: 2
@@ -48,4 +51,6 @@ pools:
     # options passed to each handler's initializer (merged into handler options above)
     options:
       name: two
+    # directory for storing child pids
+    pids_dir: '/tmp/mypool2'
 
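
pids_dir is optional in this config; when it is omitted, Pool#initialize falls back to a path under the current working directory:

    # fallback used when a pool has no pids_dir key (name is the pool name)
    pids_dir = File.join(Dir.pwd, 'trident-pools', name, 'pids')
    # e.g. ./trident-pools/mypool1/pids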
@@ -1,13 +1,11 @@
 require_relative '../test_helper'
 
 class Trident::TridentTest < MiniTest::Should::TestCase
-
   setup do
     @project_root = File.expand_path('../../fixtures/integration_project', __FILE__)
     @cli = "#{File.expand_path('../../..', __FILE__)}/bin/trident"
   end
 
-
   def parse_manager(manager_str)
     pools = {}
     manager_str.scan(/(\w+)\[([0-9, ]+)\]/) do |pool, pids|
@@ -18,7 +16,6 @@ class Trident::TridentTest < MiniTest::Should::TestCase
   end
 
   context "basic usage" do
-
     should "start and stop pools" do
       cmd = "#{@cli} --verbose --config #{@project_root}/config/trident.yml"
       io = IO.popen(cmd, :err=>[:child, :out])
@@ -46,11 +43,9 @@ class Trident::TridentTest < MiniTest::Should::TestCase
       Process.wait(io.pid)
       assert_empty child_processes
     end
-
   end
 
   context "worker maintenance" do
-
     should "restart failed workers" do
       cmd = "#{@cli} --verbose --config #{@project_root}/config/trident.yml"
       io = IO.popen(cmd, :err=>[:child, :out])
@@ -78,6 +73,5 @@ class Trident::TridentTest < MiniTest::Should::TestCase
       Process.wait(io.pid)
       assert_empty child_processes
     end
-
   end
 end
data/test/test_helper.rb CHANGED
@@ -15,7 +15,6 @@ rescue Bundler::BundlerError => e
 end
 
 require 'minitest/autorun'
-require 'minitest/should'
 require "minitest/reporters"
 require "mocha/setup"
 require 'timeout'
@@ -135,53 +134,23 @@ def kill_all_child_processes
   Process.waitall
 end
 
-class MiniTest::Should::TestCase
-  ORIGINAL_PROCLINE = $0
+module Minitest::Should
+  class TestCase < MiniTest::Spec
 
-  setup do
-    $0 = ORIGINAL_PROCLINE
-    kill_all_child_processes
-  end
-end
-
-# Allow triggering single tests when running from rubymine
-# reopen the installed runner so we don't step on runner customizations
-class << MiniTest::Unit.runner
-  # Rubymine sends --name=/\Atest\: <context> should <should>\./
-  # Minitest runs each context as a suite
-  # Minitest filters methods by matching against: <suite>#test_0001_<should>
-  # Nested contexts are separted by spaces in rubymine, but ::s in minitest
-
-  def _run_suites(suites, type)
-    if options[:filter]
-      if options[:filter] =~ /\/\\Atest\\: (.*) should (.*)\\\.\//
-        context_filter = $1
-        should_filter = $2
-        should_filter.strip!
-        should_filter.gsub!(" ", "_")
-        should_filter.gsub!(/\W/, "")
-        context_filter = context_filter.gsub(" ", "((::)| )")
-        options[:filter] = "/\\A#{context_filter}(Test)?#test(_\\d+)?_should_#{should_filter}\\Z/"
-      end
+    # make minitest spec dsl similar to shoulda
+    class << self
+      alias :setup :before
+      alias :teardown :after
+      alias :context :describe
+      alias :should :it
     end
-
-    super
-  end
-
-  # Prevent "Empty test suite" verbosity when running in rubymine
-  def _run_suite(suite, type)
-
-    filter = options[:filter] || '/./'
-    filter = Regexp.new $1 if filter =~ /\/(.*)\//
-    all_test_methods = suite.send "#{type}_methods"
-    filtered_test_methods = all_test_methods.find_all { |m|
-      filter === m || filter === "#{suite}##{m}"
-    }
-
-    if filtered_test_methods.size > 0
-      super
-    else
-      [0, 0]
+
+    ORIGINAL_PROCLINE = $0
+
+    setup do
+      $0 = ORIGINAL_PROCLINE
+      kill_all_child_processes
     end
   end
 end
+
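
The replacement shim keeps the shoulda-style DSL available by aliasing it onto MiniTest::Spec. A minimal sketch of a test written against it (the class name is hypothetical):

    class Trident::ExampleTest < MiniTest::Should::TestCase
      setup do                     # alias of before
        @pool_name = "mypool1"
      end

      context "a pool" do          # alias of describe
        should "have a name" do    # alias of it
          assert_equal "mypool1", @pool_name
        end
      end
    end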