cabiri 0.0.4 → 0.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2) hide show
  1. data/lib/cabiri.rb +108 -151
  2. metadata +4 -15
@@ -1,182 +1,139 @@
1
- require 'adeona'
2
- require 'logger'
3
-
4
1
  module Cabiri
5
- class JobQueue
6
- # - remaining_jobs: array that contains jobs that have yet to run
7
- # - active_job_pids: array that contains the pids of jobs that are currently running
8
- # - jobs_info: array that keeps track of the state of each job
9
- # - pid_to_index: hash that maps the pid of a job to an index in the jobs_info array
10
- # - uid_to_index: hash that maps the uid of a job to an index in the jobs_info array
11
- # - self_pipe: a pipe that is used by the main process to implement a blocking wait for the
12
- # wait_until_finished method. Both endpoints have sync set to true to prevent the
13
- # kernel from buffering any messages.
14
- # - mutex: a mutex that is used to treat the code that deals with extracting results from
15
- # finished processes and spawning new processes as a critical section
16
- # - logger: a logger to help log errors
17
- def initialize
18
- @remaining_jobs = []
19
- @active_jobs_pids = []
2
+ class Job
3
+ attr_accessor :id
4
+ attr_accessor :pid
5
+ attr_accessor :block
6
+ attr_accessor :result
7
+ attr_accessor :pipe
8
+ attr_accessor :lifeline
9
+
10
+ def initialize(id, &block)
11
+ @id = id
12
+ @pid = nil
13
+ @block = block
14
+ @result = nil
15
+ @pipe = nil
16
+ @lifeline = nil
17
+ end
18
+
19
+ def activate!
20
+ @pipe = IO.pipe
21
+ @lifeline = IO.pipe
20
22
 
21
- @jobs_info = []
22
- @pid_to_index = {}
23
- @uid_to_index = {}
23
+ @pid = fork do
24
+ @pipe[0].close
25
+ @pipe[1].sync = true
24
26
 
25
- @self_pipe = IO.pipe()
26
- @self_pipe[0].sync = true
27
- @self_pipe[1].sync = true
27
+ @lifeline[1].close
28
+ @lifeline[0].sync = true
29
+
30
+ begin
31
+ lifeline_thread = Thread.new(Thread.current) do |main_thread|
32
+ result = IO.select([@lifeline[0]], nil, nil, nil)
33
+ main_thread.raise "Killing job '#{@id}' as connection with parent process was lost."
34
+ end
35
+ result = @block.call
36
+ @pipe[1].puts [Marshal.dump(result)].pack("m")
37
+ rescue => e
38
+ puts "Exception (#{e}) in block: #{@block.inspect}"
39
+ end
40
+ end
28
41
 
29
- @mutex = Mutex.new
30
- @logger = Logger.new($stdout)
42
+ @pipe[1].close
43
+ @pipe[0].sync = true
44
+
45
+ @lifeline[0].close
46
+ @lifeline[1].sync = true
31
47
  end
32
48
 
33
- # add a job to the remaining_jobs array
34
- def add(&block)
35
- @remaining_jobs << block
49
+ def finish!
50
+ @result = Marshal.load(@pipe[0].read.unpack("m")[0])
51
+ @pipe[0].close
52
+ @lifeline[1].close
53
+ Process.waitpid(@pid)
36
54
  end
55
+ end
37
56
 
38
- # check if there is more work to be done. The work is finished if there are no jobs waiting to be run
39
- # and there are no jobs currently being run.
40
- def finished?
41
- @remaining_jobs.empty? and @active_jobs_pids.empty?
57
+ class JobQueue
58
+ attr_accessor :pending_jobs
59
+ attr_accessor :active_jobs
60
+ attr_accessor :finished_jobs
61
+
62
+ def initialize
63
+ @pending_jobs = []
64
+ @active_jobs = []
65
+ @finished_jobs = {}
42
66
  end
43
67
 
44
- # this is a blocking wait that won't return until after all jobs in the
45
- # queue are finished. The initialize method has set up a self_pipe. When
46
- # the last job of the queue is finished, the start method will close the
47
- # write end of this pipe. This causes the kernel to notice that nothing can
48
- # write to the pipe anymore and thus the kernel sends an EOF down this pipe,
49
- # which in turn causes the blocking IO.select to return.
50
- # When IO.select returns we close the read end of the pipe, such that any
51
- # future calls to the wait_until_finished method can return immediately.
52
- def wait_until_finished
53
- if(!@self_pipe[0].closed?)
54
- IO.select([@self_pipe[0]])
55
- @self_pipe[0].close
56
- end
68
+ def add(id, &block)
69
+ @pending_jobs << Job.new(id, &block)
57
70
  end
58
71
 
59
- # here we start by creating a uid to index mapping and add an entry for each
60
- # job to the jobs_info array. We then schedule the first batch of jobs.
61
- def start(max_active_jobs)
62
- # create job mappings and initialize job info
63
- @remaining_jobs.each_with_index do |job, index|
64
- uid = job.to_s
65
- @uid_to_index[uid] = index
66
-
67
- @jobs_info[index] = {}
68
- @jobs_info[index][:pid] = nil
69
- @jobs_info[index][:pipe] = nil
70
- @jobs_info[index][:error] = nil
71
- @jobs_info[index][:result] = nil
72
- @jobs_info[index][:state] = :waiting
73
- end
72
+ def pending_jobs_available?
73
+ @pending_jobs.length >= 1
74
+ end
74
75
 
75
- # start scheduling first batch of jobs
76
- fill_job_slots(max_active_jobs)
76
+ def active_jobs_available?
77
+ @active_jobs.length >= 1
77
78
  end
78
79
 
79
- # here we fill all the empty job slots. When we take a new job two things can happen.
80
- # Either we manage to successfully spawn a new process or something goes wrong and we log
81
- # it. In either case we assume that we are done with the job and remove it from the
82
- # remaining_jobs array.
83
- def fill_job_slots(max_active_jobs)
84
- while(@active_jobs_pids.length < max_active_jobs and !@remaining_jobs.empty?)
85
- begin
86
- start_next_job(max_active_jobs)
87
- rescue => ex
88
- handle_error(ex)
89
- ensure
90
- @remaining_jobs.shift
91
- end
80
+ def finished?
81
+ !pending_jobs_available? && !active_jobs_available?
82
+ end
83
+
84
+ def get_read_end_points_of_active_jobs
85
+ read_end_points = []
86
+ @active_jobs.each do |active_job|
87
+ read_end_points << active_job.pipe[0]
92
88
  end
89
+ read_end_points
93
90
  end
94
91
 
95
- # when starting a new job we first create a pipe. This pipe will be our mechanism to pass any
96
- # data returned by the job process to the main process. Next, we create a job process by using
97
- # the Adeona gem. The spawn_child method acts like fork(), but adds some extra protection to
98
- # prevent orphaned processes. Inside this job process we close the read endpoint of the pipe and
99
- # set sync to true for the write endpoint in order to prevent the kernel from buffering any messages.
100
- # We continue by letting the job do its work and storing the result in a var called 'result'. The
101
- # next step looks a bit weird. We mentioned that we want to use pipes to communicate data, but pipes
102
- # weren't designed to transport data structures like arrays and hashes; instead they are meant for text.
103
- # So we use a trick. We use Marshal.dump to convert our result (which could be an array, a number,
104
- # a hash - we don't know) into a byte stream, put this information inside an array, and then convert this
105
- # array into a special string designed for transporting binary data as text. This text can now be sent
106
- # through the write endpoint of the pipe. Back outside the job process we close the write endpoint of the
107
- # pipe and set sync to true. The next few lines should require no comment.
108
- # We finish by creating a thread that waits for the newly created job to end. This thread is responsible
109
- # for extracting information from the finished job and spawning new jobs. Also note that we close the
110
- # write end of the self_pipe when there are no jobs left. See the comments on the wait_until_finished
111
- # method for why this is important.
112
- # Notice how the inside of the thread is wrapped inside a mutex. This is required to prevent a race
113
- # condition from occurring when two or more jobs return in quick succession. When the first job
114
- # returns, its thread will start scheduling new processes, but this can take some time. If a second
115
- # job returns before the thread of the first job is done scheduling, it will start doing scheduling
116
- # work as well. So now you have two threads simultaneously doing scheduling work, and the end result
117
- # will be unpredictable.
118
- def start_next_job(max_active_jobs)
119
- pipe = IO.pipe()
120
- job = @remaining_jobs.first
121
-
122
- pid = Adeona.spawn_child(:detach => false) do
123
- pipe[0].close
124
- pipe[1].sync = true
125
- result = job.call
126
- pipe[1].puts [Marshal.dump(result)].pack("m")
92
+ def get_active_job_by_read_end_point(read_end_point)
93
+ @active_jobs.each do |active_job|
94
+ return active_job if (active_job.pipe[0] == read_end_point)
127
95
  end
128
- pipe[1].close
129
- pipe[0].sync = true
130
-
131
- index = @uid_to_index[job.to_s]
132
- @active_jobs_pids << pid
133
- @pid_to_index[pid] = index
134
-
135
- @jobs_info[index][:pid] = pid
136
- @jobs_info[index][:pipe] = pipe
137
- @jobs_info[index][:state] = :running
138
-
139
- Thread.new(pid) do |my_pid|
140
- Process.waitpid(my_pid)
141
- @mutex.synchronize do
142
- handle_finished_job(my_pid)
143
- fill_job_slots(max_active_jobs)
144
- @self_pipe[1].close if finished?
96
+ end
97
+
98
+ def start(max_active_jobs)
99
+ # start by activating as many jobs as allowed
100
+ max_active_jobs.times do
101
+ if pending_jobs_available?
102
+ activate_next_available_job
145
103
  end
146
104
  end
147
- end
148
105
 
149
- # when a job finishes, we remove its pid from the array that keeps track of active processes.
150
- # Next we read the result that we sent over the pipe and then close the pipe's read endpoint.
151
- # We take the received text data, turn it into a byte stream and then load this information
152
- # in order to obtain the resulting data from the job.
153
- def handle_finished_job(pid)
154
- index = @pid_to_index[pid]
155
- @active_jobs_pids.delete(pid)
106
+ while active_jobs_available?
107
+ # every time IO.select gets called, we need to do something
108
+ read_end_points = get_read_end_points_of_active_jobs
109
+ read_end_points_array, _, _ = IO.select(read_end_points, nil, nil, nil)
156
110
 
157
- pipe = @jobs_info[index][:pipe]
158
- result = pipe[0].read
159
- pipe[0].close
111
+ # finish all jobs that we got returned data for
112
+ read_end_points_array.each do |read_end_point|
113
+ active_job = get_active_job_by_read_end_point(read_end_point)
114
+ finish_job(active_job)
115
+ end
160
116
 
161
- @jobs_info[index][:result] = Marshal.load(result.unpack("m")[0])
162
- @jobs_info[index][:state] = :finished
117
+ # schedule as many new jobs as the number of jobs that just finished
118
+ nb_of_just_finished_jobs = read_end_points_array.length
119
+ nb_of_just_finished_jobs.times do
120
+ if pending_jobs_available?
121
+ activate_next_available_job
122
+ end
123
+ end
124
+ end
163
125
  end
164
126
 
165
- # when there is an exception, we log the error and set the relevant fields in the jobs_info data
166
- def handle_error(ex)
167
- job = @remaining_jobs.first
168
- index = @uid_to_index[job.to_s]
169
-
170
- error = "Exception thrown when trying to instantiate job. Job info: #{@remaining_jobs.first.to_s}. Exception info: #{ex.to_s}."
171
- @logger.warn(self.class.to_s) { error }
172
-
173
- @jobs_info[index][:error] = error
174
- @jobs_info[index][:state] = :error
127
+ def activate_next_available_job
128
+ job = @pending_jobs.shift
129
+ job.activate!
130
+ @active_jobs << job
175
131
  end
176
132
 
177
- # this allows users to query the state of their jobs
178
- def get_info(index)
179
- @jobs_info[index]
133
+ def finish_job(job)
134
+ job = @active_jobs.delete(job)
135
+ job.finish!
136
+ @finished_jobs[job.id] = job
180
137
  end
181
138
  end
182
139
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cabiri
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.7
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,19 +9,8 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-05-19 00:00:00.000000000 Z
13
- dependencies:
14
- - !ruby/object:Gem::Dependency
15
- name: adeona
16
- requirement: &70111815020260 !ruby/object:Gem::Requirement
17
- none: false
18
- requirements:
19
- - - ! '>='
20
- - !ruby/object:Gem::Version
21
- version: '0'
22
- type: :runtime
23
- prerelease: false
24
- version_requirements: *70111815020260
12
+ date: 2012-12-29 00:00:00.000000000 Z
13
+ dependencies: []
25
14
  description: An easy and intuitive Ruby job queue for working with parallel processes.
26
15
  email: tomvaneyck@gmail.com
27
16
  executables: []
@@ -49,7 +38,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
49
38
  version: '0'
50
39
  requirements: []
51
40
  rubyforge_project:
52
- rubygems_version: 1.8.10
41
+ rubygems_version: 1.8.24
53
42
  signing_key:
54
43
  specification_version: 3
55
44
  summary: An easy and intuitive Ruby job queue for working with parallel processes.