dircrawl 0.0.14 → 0.0.15

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. checksums.yaml +4 -4
  2. data/lib/dircrawl.rb +55 -100
  3. metadata +3 -4
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 99a04d217d2ae8f295d5e7b71d702019128fa888
4
- data.tar.gz: 27c211fce81a7649ac03c77158c25889b42a47ad
3
+ metadata.gz: 042b2465ba3729dd8b32b61a6fa55a9732ce6bb0
4
+ data.tar.gz: 3c8bb1ef5cf0fffed384f91fdda387f41185ed97
5
5
  SHA512:
6
- metadata.gz: 092d8f4d0f1456bc83c9176ad82853f03b6272414881e946e12ed85730831434600ab1a59efe52ac4c55dcf2096ffb392ebf1ebc3ddb3583be01a3c55f2b23cf
7
- data.tar.gz: 661c0f676837b758e827a34060ce599dfe62f6bd211d206a0caca87078c4ba378009ec6c47253d30df0cad9f7a8e35feff620a0f4ed8471fca132d7f354dbd36
6
+ metadata.gz: 29b0dd91babcdf8600b2ac3c7600749986018e418706f539574c274dca51df7ca8e649dcd861dda1869ac2cde206d2fd1b45a4643d260055ad698bf8711efc9e
7
+ data.tar.gz: 4bda8db830eedc8f15e11f104874881034ddedc67434d5e86736cdd43ef5ea013cd9a21ff2566c7a857e8409265f3df25366e7f865479999a1090d79dbaef7fa
data/lib/dircrawl.rb CHANGED
@@ -1,47 +1,27 @@
1
1
  require 'json'
2
2
  require 'pry'
3
- require 'curb'
4
- require 'selenium-webdriver'
5
- require 'uri'
3
+ require 'harvesterreporter'
6
4
 
5
+ # Crawls a directory of files and runs a block of code on it
7
6
  class DirCrawl
8
- def initialize(path, output_dir, ignore_includes, save, process_block, include_block, extras_block, failure_mode, cm_hash, *args)
9
- @path = path
10
- @output_dir = output_dir
11
- @ignore_includes = ignore_includes
7
+ def initialize(path_params, process_block, include_block, extras_block, cm_hash, *args)
8
+ # Set the params for the path
9
+ @path = path_params[:path]
10
+ @output_dir = path_params[:output_dir]
11
+ @ignore_includes = path_params[:ignore_includes]
12
+ @failure_mode = path_params[:failure_mode]
13
+
14
+ # Setup the blocks to run
12
15
  include_block.call
13
16
  @process_block = process_block
14
17
  @extras_block = extras_block
15
- @failure_mode = failure_mode
16
- @output = Array.new
17
- @save = save
18
18
 
19
- # Handle crawler manager info
20
- @cm_url = cm_hash[:crawler_manager_url] if cm_hash
21
- @selector_id = cm_hash[:selector_id] if cm_hash
22
-
23
- # Crawl
24
- crawl_dir(path, *args)
19
+ # Setup the Harvester reporter to report the results
20
+ @reporter = HarvesterReporter.new(cm_hash)
21
+ crawl_dir(@path, *args)
25
22
  end
26
23
 
27
- # Figure out where to write it
28
- def get_write_dir(dir, file)
29
- dir_save = dir.gsub(@path, @output_dir)
30
- return dir_save+"/"+file+".json"
31
- end
32
-
33
- # Create if they don't exist
34
- def create_write_dirs(dir)
35
- dirs = dir.split("/")
36
- dirs.delete("")
37
- overallpath = ""
38
- dirs.each do |d|
39
- Dir.mkdir(overallpath+"/"+d) if !File.directory?(overallpath+"/"+d)
40
- overallpath += ("/"+d)
41
- end
42
- end
43
-
44
- # Crawl dir and call block for each file
24
+ # Crawls the directory specified
45
25
  def crawl_dir(dir, *args)
46
26
  Dir.foreach(dir) do |file|
47
27
  # Skip . or .. files
@@ -49,90 +29,65 @@ class DirCrawl
49
29
 
50
30
  # Recurse into directories
51
31
  if File.directory?(dir+"/"+file)
52
- report_status("Going to next directory: " + dir+"/"+file)
53
- crawl_dir(dir+"/"+file, *args)
32
+ crawl_dir("#{dir}/#{file}", *args)
54
33
 
55
34
  # Process file
56
35
  elsif !file.include?(@ignore_includes)
57
-
58
- # Create Output Directory
59
- create_write_dirs(dir.gsub(@path, @output_dir))
60
-
61
36
  begin
62
-
63
- # Check if processed file exists
64
- # Skip processing (if yes)
65
- if !File.exist?(get_write_dir(dir, file))
66
-
67
- # Process Extras
68
- if @extras_block != ""
69
- extras = @extras_block.call(@output_dir+"/")
70
- end
71
-
72
- # Now Process Main
73
- processed = @process_block.call(dir+"/"+file, *args)
74
- else
75
- puts "Processed file exists, skipping"
76
- puts " " + dir + "/" + file
77
- processed = File.read(get_write_dir(dir, file))
78
- end
79
-
80
- rescue Exception => e # really catch any failures
81
- report_status("Error on file "+file+": "+e.to_s)
82
- if @failure_mode == "debug"
83
- binding.pry
84
- elsif @failure_mode == "log"
85
- error_file = dir + "/" + file + "\n"
86
- IO.write(@output_dir+"/error_log.txt", error_file, mode: 'a')
87
- end
37
+ output_results(process_file(dir, file, *args), dir, file)
38
+ rescue Exception => e
39
+ handle_failure(e, dir, file, *args)
88
40
  end
89
-
90
- # Only save in output if specified (to handle large dirs)
91
- report_results([JSON.parse(processed)], dir+"/"+file)
92
-
93
- # Write Output to file
94
- File.write(get_write_dir(dir, file), processed)
95
41
  end
96
42
  end
97
43
  end
98
44
 
99
- # Figure out how to report results
100
- def report_results(results, path)
101
- if @cm_url
102
- report_incremental(results, path)
103
- else
104
- report_batch(results)
45
+ # Process a file using the blocks given
46
+ def process_file(dir, file, *args)
47
+ create_write_dirs(dir.gsub(@path, @output_dir))
48
+
49
+ # Run blocks to process the file
50
+ if !File.exist?(get_write_path(dir, file))
51
+ @extras_block.call("#{@output_dir}/") if !@extras_block.empty?
52
+ return @process_block.call("#{dir}/#{file}", *args)
53
+ else # Use already existing file
54
+ puts "Processed file exists, skipping: #{dir}/#{file}"
55
+ return File.read(get_write_path(dir, file))
105
56
  end
106
57
  end
107
58
 
108
- # Report all results in one JSON
109
- def report_batch(results)
110
- results.each do |result|
111
- @output.push(result)
112
- end
59
+ # Output the results to Harvester and file dir
60
+ def output_results(processed, dir, file)
61
+ @reporter.report_results([JSON.parse(processed)], "#{dir}/#{file}")
62
+ File.write(get_write_path(dir, file), processed)
113
63
  end
114
64
 
115
- # Report Harvester status message
116
- def report_status(status_msg)
117
- if @cm_url
118
- curl_url = @cm_url+"/update_status"
119
- c = Curl::Easy.http_post(curl_url,
120
- Curl::PostField.content('selector_id', @selector_id),
121
- Curl::PostField.content('status_message', status_msg))
65
+ # Create the output directories if they don't exist
66
+ def create_write_dirs(dir)
67
+ dirs = dir.split("/")
68
+ dirs.delete("")
69
+
70
+ # Go through and create all subdirs
71
+ overallpath = ""
72
+ dirs.each do |d|
73
+ Dir.mkdir(overallpath+"/"+d) if !File.directory?(overallpath+"/"+d)
74
+ overallpath += ("/"+d)
122
75
  end
123
76
  end
124
77
 
125
- # Report results back to Harvester incrementally
126
- def report_incremental(results, path)
127
- curl_url = @cm_url+"/relay_results"
128
- c = Curl::Easy.http_post(curl_url,
129
- Curl::PostField.content('selector_id', @selector_id),
130
- Curl::PostField.content('status_message', "Processed " + path),
131
- Curl::PostField.content('results', JSON.pretty_generate(results)))
78
+ # Figure out where to write the file
79
+ def get_write_path(dir, file)
80
+ dir_save = dir.gsub(@path, @output_dir)
81
+ return "#{dir_save}/#{file}.json"
132
82
  end
133
83
 
134
- # Get the output array
135
- def get_output
136
- return JSON.pretty_generate(@output)
84
+ # Handle different failure modes
85
+ def handle_failure(error, dir, file, *args)
86
+ if @failure_mode == "debug"
87
+ binding.pry
88
+ elsif @failure_mode == "log"
89
+ error_file = "#{dir}/#{file}\n"
90
+ IO.write(@output_dir+"/error_log.txt", error_file, mode: 'a')
91
+ end
137
92
  end
138
93
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dircrawl
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.14
4
+ version: 0.0.15
5
5
  platform: ruby
6
6
  authors:
7
7
  - M. C. McGrath
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2017-01-19 00:00:00.000000000 Z
12
+ date: 2017-05-22 00:00:00.000000000 Z
13
13
  dependencies: []
14
14
  description: Run block on all files in dir
15
15
  email: shidash@shidash.com
@@ -38,9 +38,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
38
38
  version: '0'
39
39
  requirements: []
40
40
  rubyforge_project:
41
- rubygems_version: 2.4.8
41
+ rubygems_version: 2.6.11
42
42
  signing_key:
43
43
  specification_version: 4
44
44
  summary: Run block on all files in dir
45
45
  test_files: []
46
- has_rdoc: