RubyGems - scbi_mapreduce - Versions diffs - 0.0.37 → 0.0.38 - Mend

scbi_mapreduce 0.0.37 → 0.0.38

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6)

data/History.txt +4 -0
data/lib/scbi_mapreduce/manager.rb +9 -6
data/lib/scbi_mapreduce/work_manager.rb +208 -104
data/lib/scbi_mapreduce/worker.rb +26 -4
data/lib/scbi_mapreduce.rb +1 -1
metadata +2 -2

data/History.txt CHANGED Viewed

@@ -1,3 +1,7 @@
+=== 0.0.38 2012-04-13
+Automatic checkpointing improvements
 === 0.0.37 2011-10-20
 Memory management improvement

data/lib/scbi_mapreduce/manager.rb CHANGED Viewed

@@ -17,13 +17,12 @@ module ScbiMapreduce
   class Manager
-    attr_accessor :checkpointing, :keep_order, :retry_failed_jobs, :exit_on_many_errors, :chunk_size
+    attr_accessor :checkpointing, :keep_order, :retry_stuck_jobs, :exit_on_many_errors, :chunk_size
     # initialize Manager
     def initialize(server_ip, port, workers, work_manager_class,custom_worker_file,log_file=nil, init_env_file=nil)
       @port=port
       if log_file.nil?
         log_file = File.join('logs','server_log.txt')
       end
@@ -55,8 +54,9 @@ module ScbiMapreduce
       @checkpointing=false
       @keep_order=false
-      @retry_failed_jobs=false
+      @retry_stuck_jobs=false
+      @exit_on_many_errors=true
       @chunk_size=1
@@ -84,7 +84,6 @@ module ScbiMapreduce
       @worker_launcher = WorkerLauncher.new(@ip,port,ip_list,@workers,custom_worker_file,log_file,init_env_file)
       $SERVER_LOG.info("Local workers: #{@workers}")
       $SERVER_LOG.info("Remote workers: #{@worker_names}")
@@ -101,11 +100,15 @@ module ScbiMapreduce
       EM.error_handler{ |e|
         $SERVER_LOG.error(e.message + ' => ' + e.backtrace.join("\n"))
       }
+      # $SERVER_LOG.info("Installing INT and TERM traps in #{@work_manager_class}")
+      # Signal.trap("INT")  { puts "TRAP INT";@work_manager_class.controlled_exit; EM.stop}
+      # Signal.trap("TERM") { puts "TRAP TERM";@work_manager_class.controlled_exit; EM.stop}
       # start EM loop
       EventMachine::run {
-        @work_manager_class.init_work_manager_internals(@checkpointing, @keep_order, @retry_failed_jobs,@exit_on_many_errors,@chunk_size)
+        @work_manager_class.init_work_manager_internals(@checkpointing, @keep_order, @retry_stuck_jobs,@exit_on_many_errors,@chunk_size)
         evm=EventMachine::start_server @ip, @port, @work_manager_class
         dir=Socket.unpack_sockaddr_in( EM.get_sockname( evm ))

data/lib/scbi_mapreduce/work_manager.rb CHANGED Viewed

@@ -14,27 +14,58 @@
 module ScbiMapreduce
-  PENDING_TO_SAVE=100
+  PENDING_TO_SAVE=10
+  CHECKPOINT_FILE='scbi_mapreduce_checkpoint'
+  OLD_CHECKPOINT_FILE='old_scbi_mapreduce_checkpoint'
   class WorkManagerData
     @@job_id=1
+    @@longest_processing_time=0
     attr_reader :job_identifier
-    attr_accessor :status, :data
+    attr_accessor :status, :data,:sent_time,:received_time
-    def initialize(job)
+    def initialize(objs)
       @job_identifier=@@job_id
       @@job_id+=1
-      @data=job
+      @data=objs
+      sent!
+      @received_time=0
+      @processing_time=nil
+    end
+    def received!(objs)
+      @data=objs
+      @received_time=Time.now
+      @processing_time=@received_time-@sent_time
+      # save longer processing time
+      @@longest_processing_time=[@@longest_processing_time,@processing_time].max
+      @status=:received
+    end
+    def sent!
       @status=:running
+      @sent_time=Time.now
+    end
+    def stuck?
+      (@status==:running) && (@@longest_processing_time>0) && (processing_time>(@@longest_processing_time*2))
+    end
+    # return running or real processing time
+    def processing_time
+      return (@processing_time || (Time.now-@sent_time))
     end
     def inspect
-      return "WorkManagerData: #{@job_identifier} => #{@status}"
+      time="; time: #{processing_time} seg"
+      return "WorkManagerData: #{@job_identifier} => #{@status} #{time}"
     end
     def self.job_id=(c)
@@ -53,7 +84,7 @@ module ScbiMapreduce
   class WorkManager < EventMachine::Connection
     include EM::P::ObjectProtocol
     def self.init_work_manager
     end
@@ -102,8 +133,9 @@ module ScbiMapreduce
     ############
-    def self.init_work_manager_internals(checkpointing, keep_order, retry_failed_jobs,exit_on_many_errors,chunk_size)
+    def self.init_work_manager_internals(checkpointing, keep_order, retry_stuck_jobs,exit_on_many_errors,chunk_size)
       @@count = 0
+      @@want_to_exit=false
       @@chunk_count = 0
       @@workers = 0
       @@max_workers = 0
@@ -113,13 +145,17 @@ module ScbiMapreduce
       @@checkpointing=checkpointing
       @@keep_order=keep_order
-      @@retry_failed_jobs=retry_failed_jobs
+      @@retry_stuck_jobs=retry_stuck_jobs
       @@exit_on_many_errors=exit_on_many_errors
       # TODO - Implement a dynamic chunk_size
       @@chunk_size=chunk_size
       $SERVER_LOG.info "Processing in chunks of #{@@chunk_size} objects"
+      $SERVER_LOG.info "Checkpointing: #{@@checkpointing}"
+      $SERVER_LOG.info "Keeping output order: #{@@keep_order}"
+      $SERVER_LOG.info "Retrying stuck jobs: #{@@retry_stuck_jobs}"
+      $SERVER_LOG.info "Exiting on too many errors: #{@@exit_on_many_errors}"
       @@checkpoint=0
       if @@checkpointing
@@ -133,17 +169,28 @@ module ScbiMapreduce
       return @@checkpoint
     end
-    def save_checkpoint
-      checkpoint_file = File.open('scbi_mapreduce_checkpoint','w')
+    def remove_checkpoint
+      if File.exists?(CHECKPOINT_FILE)
+        checkpoint_file = FileUtils.mv(CHECKPOINT_FILE,OLD_CHECKPOINT_FILE)
+      end
+    end
+    def save_checkpoint
+      checkpoint_file = File.open(CHECKPOINT_FILE,'w')
       if !@@running_jobs.empty?
-        checkpoint_file.puts @@running_jobs.first.job_identifier
+        checkpoint_value = @@running_jobs.first.job_identifier
       else
-        checkpoint_file.puts WorkManagerData.job_id-1
+         checkpoint_value = WorkManagerData.job_id
       end
+      $SERVER_LOG.info "Saving checkpoint: #{checkpoint_value}"
+      checkpoint_file.puts checkpoint_value
       checkpoint_file.close
       save_user_checkpoint
     end
@@ -151,8 +198,8 @@ module ScbiMapreduce
     def self.get_checkpoint
       res = 0
       begin
-        if File.exists?('scbi_mapreduce_checkpoint')
-          res=File.read('scbi_mapreduce_checkpoint').chomp
+        if File.exists?(CHECKPOINT_FILE)
+          res=File.read(CHECKPOINT_FILE).chomp
           # puts "read checkpoint #{res}"
           res = res.to_i
@@ -176,42 +223,80 @@ module ScbiMapreduce
       send_object(obj)
     end
-    # send next work to worker
-    def send_next_work
+    def print_running_jobs
+      jobs=@@running_jobs.map{|j| j.inspect}.join("\n")
+      $SERVER_LOG.debug("Running Jobs:\n#{jobs}")
+    end
-      objs=[]
+    def send_stuck_work
+      sent=false
-      @@chunk_size.times do
-        obj=next_work
-        if obj.nil?
-          break
-        else
-          # add to obj array
-          objs << obj
-        end
-      end
+      if @@retry_stuck_jobs
+        # count stuck jobs and re-sent the first one
+        stuck_works=@@running_jobs.select{|job| job.stuck?}
+        if !stuck_works.empty?
+          jobs=stuck_works.map{|j| j.inspect}.join("\n")
+          $SERVER_LOG.info("Stuck Jobs:\n#{jobs}")
-      if objs.count>0
-        @@count += objs.count
-        @@chunk_count += 1
+          # send_object
+          send_object(stuck_works.first)
+          stuck_works.first.sent!
+          $SERVER_LOG.info("Sending stuck work #{stuck_works.first.inspect}")
+          sent=true
+        end
+      end
-        work_data=WorkManagerData.new(objs)
+      return sent
+    end
-        send_object(work_data)
+    # send next work to worker
+    def send_next_work
-        # to keep order or retry failed job, we need job status
-        if @@keep_order || @@retry_failed_jobs
-          @@running_jobs.push work_data
+      # if we need to exit, send quit to workers
+      if @@want_to_exit
+        send_object(:quit)
+      elsif !send_stuck_work
+      #send stuck work
+        objs=[]
+        # prepare new data
+        @@chunk_size.times do
+          obj=next_work
+          if obj.nil?
+            break
+          else
+            # add to obj array
+            objs << obj
+          end
         end
-      else
-        send_object(:quit)
-      end
+        # if new was data collected, send it
+        if objs.count>0
+          @@count += objs.count
+          @@chunk_count += 1
+          work_data=WorkManagerData.new(objs)
+          send_object(work_data)
+          # to keep order or retry failed job, we need job status
+          if @@keep_order || @@retry_stuck_jobs
+            # do not remove data to be able to sent it again
+            # work_data.data=nil
+            @@running_jobs.push work_data
+            # print_running_jobs
+          end
+        else
+          # otherwise, send a quit value indicating no more data available
+          send_object(:quit)
+        end
+      end
     end
+    # loads a checkpoint
     def goto_checkpoint
       if @@checkpoint>0
         $SERVER_LOG.info "Skipping until checkpoint #{@@checkpoint}"
@@ -220,18 +305,13 @@ module ScbiMapreduce
         # do an automatic checkpoint restore
         if checkpoint==-1
-          @@checkpoint.times do |i|
-            # puts "Skipping #{i+1}"
+          (@@checkpoint - 1).times do |i|
+            $SERVER_LOG.info "Automatic trashing Chunk #{i+1}"
             # get next work
-            trash_checkpointed_work
-            #            if obj
-            #             if obj.methods.include?(:count)
-            #               @@count += obj.count
-            #             else
-            #   @@count += 1
-            # end
-            #            end
+            @@chunk_size.times do
+              obj=next_work
+            end
+            # trash_checkpointed_work
           end
           $SERVER_LOG.info "Automatic checkpoint finished"
@@ -240,8 +320,9 @@ module ScbiMapreduce
           #user has done the checkpoint restoration
         elsif checkpoint>0
           WorkManagerData.job_id=checkpoint
         elsif checkpoint==0
           $SERVER_LOG.info "Automatic checkpoint not done"
         end
@@ -271,6 +352,11 @@ module ScbiMapreduce
       send_initial_config
       send_next_work
     end
+    def self.controlled_exit
+      $SERVER_LOG.info("Controlled exit. Workers will be noticed in next round")
+      @@want_to_exit=true
+    end
     def receive_object(obj)
@@ -285,21 +371,23 @@ module ScbiMapreduce
         # if there are too many errors
         if (@@count>100) && (@@error_count >= @@count*0.8)
-          @@exit = @@exit_on_many_errors
           # notice programmer
           res=too_many_errors_received
           # force exit if too_many_errors_received returns true
-          if res==true
-            @@exit=res
+          if @@exit_on_many_errors || res
+            $SERVER_LOG.error("Want to exit due to too many errors")
+            self.controlled_exit
           end
         end
       else
         # if not using checkpointing
-        if @@checkpointing || @@keep_order || @@retry_failed_jobs
+        if @@checkpointing || @@keep_order || @@retry_stuck_jobs
+          # print_running_jobs
           checkpointable_job_received(obj)
         else
           work_received(obj.data)
@@ -314,58 +402,71 @@ module ScbiMapreduce
     def checkpointable_job_received(obj)
+      # find reveived object between sent jobs
       received_job=@@running_jobs.find{|o| o.job_identifier==obj.job_identifier}
-      # save job
+      # save job if there is was a valid work previously sent
       if received_job
-        # change job's status to received
-        received_job.data=obj.data
-        received_job.status=:received
+        # change this job's status to received
+        received_job.received!(obj.data)
-        # if there are sufficient jobs, count pending ones
-        if (@@running_jobs.count>=PENDING_TO_SAVE)
-          # count received objects pending to be written
-          pending=0
+        # # if there are sufficient jobs, count pending ones
+        # if (@@running_jobs.count>=PENDING_TO_SAVE)
+        # count received objects pending to be written, only until one that is still running is found
+        pending_to_save=0
+        @@running_jobs.each do |job|
+          if job.status==:received
+            pending_to_save += 1
+          else
+            break
+          end
+        end
+        # if there are a few pending to save works, or all remaining works are pending, then save
+        if (pending_to_save>=PENDING_TO_SAVE) || (pending_to_save==@@running_jobs.count)
+          # save pending jobs and write to disk
+          to_remove = 0
+          if @@checkpointing
+            remove_checkpoint
+          end
           @@running_jobs.each do |job|
             if job.status==:received
-              pending += 1
+              # puts "Sent to save: #{job.inspect}"
+              work_received(job.data)
+              job.status=:saved
+              to_remove += 1
             else
               break
             end
           end
+          # if some objects were saved, remove them from the running_jobs
+          if to_remove > 0
+            to_remove.times do |i|
+              o=@@running_jobs.shift
-          if (pending>PENDING_TO_SAVE) || (pending==@@running_jobs.count)
-            # purge contiguos saved data
-            to_remove = 0
-            @@running_jobs.each_with_index do |job,i|
-              if job.status==:received
-                # puts "Sent to save: #{job.inspect}"
-                work_received(job.data)
-                job.status=:saved
-                to_remove += 1
-              else
-                break
-              end
+              # puts "Job removed #{o.inspect}"
+              o=nil
             end
-            # if some objects were saved
-            if to_remove > 0
-              to_remove.times do |i|
-                o=@@running_jobs.shift
-                # puts "Job removed #{o.inspect}"
-                o=nil
-              end
+            # print_running_jobs
+            if @@checkpointing && !@@want_to_exit
               save_checkpoint
             end
           end
         end
+        # end
       else
-        $SERVER_LOG.info "Job already processed #{obj.inspect}"
+        $SERVER_LOG.warn "Job already processed #{obj.inspect}"
       end
     end
@@ -385,26 +486,29 @@ module ScbiMapreduce
       # no more workers left, shutdown EM and stop server
       if @@workers == 0
         $SERVER_LOG.info  "All workers finished"
-        EM.stop
-        $SERVER_LOG.info  "Exiting server"
-        self.class.end_work_manager
-        @@total_seconds = Time.now-@@total_seconds
-        $SERVER_LOG.info  "Total processed: #{@@count} objects in #{@@total_seconds} seconds"
-        $SERVER_LOG.info  "Processing rate: #{"%.2f" % (@@count/@@total_seconds.to_f)} objects per second"
-        $SERVER_LOG.info  "Connection rate: #{"%.2f" % (@@chunk_count/@@total_seconds.to_f)} connections per second"
-        $SERVER_LOG.info  "Number of errors: #{@@error_count}"
-        $SERVER_LOG.info  "Chunk size: #{@@chunk_size}"
-        $SERVER_LOG.info  "Total connected workers: #{@@max_workers}"
+        stop_work_manager
       end
     end
+    def stop_work_manager
+      EM.stop
+      $SERVER_LOG.info  "Exiting server"
+      self.class.end_work_manager
+      @@total_seconds = Time.now-@@total_seconds
+      $SERVER_LOG.info  "Total processed: #{@@count} objects in #{@@total_seconds} seconds"
+      $SERVER_LOG.info  "Processing rate: #{"%.2f" % (@@count/@@total_seconds.to_f)} objects per second"
+      $SERVER_LOG.info  "Connection rate: #{"%.2f" % (@@chunk_count/@@total_seconds.to_f)} connections per second"
+      $SERVER_LOG.info  "Number of errors: #{@@error_count}"
+      $SERVER_LOG.info  "Chunk size: #{@@chunk_size}"
+      $SERVER_LOG.info  "Total connected workers: #{@@max_workers}"
+    end
   end
 end

data/lib/scbi_mapreduce/worker.rb CHANGED Viewed

@@ -10,7 +10,7 @@ module ScbiMapreduce
   class Worker < EventMachine::Connection
     include EM::P::ObjectProtocol
+    @@want_to_exit_worker=false
     def receive_initial_config(obj)
@@ -40,7 +40,6 @@ module ScbiMapreduce
     def initialize(*args)
       super
     end
     def post_init
@@ -67,8 +66,15 @@ module ScbiMapreduce
         # At first iteration, start worker
         starting_worker
       else
-        if obj == :quit
+        $WORKER_LOG.info("received:"+obj.to_s)
+        if (obj == :quit) || @@want_to_exit_worker
+          $WORKER_LOG.info('Quit received')
+          stop_worker
+        elsif @@want_to_exit_worker
+          $WORKER_LOG.info('Want to exit worker')
           stop_worker
         else
           @@count += 1
@@ -94,6 +100,10 @@ module ScbiMapreduce
             modified_data=process_object(obj.data)
             obj.data = modified_data
+            # if obj.job_identifier==3
+            #   sleep 15
+            # end
             send_object(obj)
           rescue Exception => e
@@ -114,18 +124,30 @@ module ScbiMapreduce
     end
     def stop_worker
+      $WORKER_LOG.info "Closing  connection with WORKER"
+      $WORKER_LOG.info("Worker processed #{@@count} chunks")
       close_connection
       EventMachine::stop_event_loop
       closing_worker
     end
+    def self.controlled_exit_worker
+      @@want_to_exit_worker=true
+    end
     def self.start_worker(worker_id,ip,port,log_file=nil)
       #puts "NEW WORKER - INIIIIIIIIIIIIIIIIIIIIT #{self}"
       ip = ip
       port = port
       @@count = -1
       @@worker_id=worker_id
+      # Signal.trap("INT")  { puts "TRAP INT in worker #{@@worker_id}"; controlled_exit_worker; EM.stop}
+      # Signal.trap("TERM") { puts "TRAP TERM in worker #{@@worker_id}";controlled_exit_worker; EM.stop}
       if log_file.nil?
         log_file = 'logs/worker'+worker_id+'_'+`hostname`.chomp+'_log.txt'

data/lib/scbi_mapreduce.rb CHANGED Viewed

@@ -7,7 +7,7 @@ $: << File.expand_path('scbi_mapreduce')
 # puts $:
 module ScbiMapreduce
-   VERSION = '0.0.37'
+   VERSION = '0.0.38'
 end
 require 'scbi_mapreduce/manager'

metadata CHANGED Viewed

@@ -2,7 +2,7 @@
 name: scbi_mapreduce
 version: !ruby/object:Gem::Version
   prerelease:
-  version: 0.0.37
+  version: 0.0.38
 platform: ruby
 authors:
 - Dario Guerrero
@@ -10,7 +10,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2011-10-20 00:00:00 Z
+date: 2012-04-13 00:00:00 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: eventmachine