dircrawl 0.0.9 → 0.0.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/dircrawl.rb +43 -6
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: ba3c1a9f1c59a90f094781a6becc3f221e9cfbe4
+  data.tar.gz: 4bbbac56d9ee80eaf8e040b8133ae56f25d9da57
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: e2af2bdcdec751ab8c35bacaebbd8f6bad4adf400c933e1a68828fd9312fbbd88923e6947a4c0fed377c3e4879c8e74a04c482286f45c18d37ed0022395ed73e
+  data.tar.gz: 61c5942ba31099659c5874da9850ae6f15441e1b5764b46cdee69e3329a4e57a4ee1b78c4bbb5130925d728e4178e0543f2d651b0da51d3742725bea0c3f8347
data/lib/dircrawl.rb
CHANGED
@@ -1,8 +1,11 @@
 require 'json'
 require 'pry'
+require 'curb'
+require 'selenium-webdriver'
+require 'uri'
 
 class DirCrawl
-  def initialize(path, output_dir, ignore_includes, save, process_block, include_block, extras_block, failure_mode, *args)
+  def initialize(path, output_dir, ignore_includes, save, process_block, include_block, extras_block, failure_mode, cm_hash, *args)
     @path = path
     @output_dir = output_dir
     @ignore_includes = ignore_includes
@@ -12,6 +15,12 @@ class DirCrawl
     @failure_mode = failure_mode
     @output = Array.new
     @save = save
+
+    # Handle crawler manager info
+    @cm_url = cm_hash[:crawler_manager_url] if cm_hash
+    @selector_id = cm_hash[:selector_id] if cm_hash
+
+    # Crawl
     crawl_dir(path, *args)
   end
 
@@ -36,12 +45,13 @@ class DirCrawl
   def crawl_dir(dir, *args)
     Dir.foreach(dir) do |file|
      next if file == '.' or file == '..'
+
       # Go to next dir
       if File.directory?(dir+"/"+file)
         crawl_dir(dir+"/"+file, *args)
 
       # Process file
-      elsif !file.include?(@ignore_includes)
+      elsif !file.include?(@ignore_includes)
 
         # Create Dirs
         create_write_dirs(dir.gsub(@path, @output_dir))
@@ -53,7 +63,11 @@ class DirCrawl
         end
 
         # Process Main
-
+        if !File.exist?(get_write_dir(dir, file))
+          processed = @process_block.call(dir+"/"+file, *args)
+        else
+          processed = File.read(get_write_dir(dir, file))
+        end
 
       rescue Exception => e # really catch any failures
         if @failure_mode == "debug"
@@ -64,9 +78,7 @@ class DirCrawl
         end
 
         # Only save in output if specified (to handle large dirs)
-
-          @output.push(JSON.parse(processed))
-        end
+        report_results([JSON.parse(processed)], dir+"/"+file)
 
         # Write to file
         File.write(get_write_dir(dir, file), processed)
@@ -74,6 +86,31 @@ class DirCrawl
     end
   end
 
+  # Figure out how to report results
+  def report_results(results, path)
+    if @cm_url
+      report_incremental(results, path)
+    else
+      report_batch(results)
+    end
+  end
+
+  # Report all results in one JSON
+  def report_batch(results)
+    results.each do |result|
+      @output.push(result)
+    end
+  end
+
+  # Report results back to Harvester incrementally
+  def report_incremental(results, path)
+    curl_url = @cm_url+"/relay_results"
+    c = Curl::Easy.http_post(curl_url,
+                             Curl::PostField.content('selector_id', @selector_id),
+                             Curl::PostField.content('status_message', "Processed " + path),
+                             Curl::PostField.content('results', JSON.pretty_generate(results)))
+  end
+
   # Get the output array
   def get_output
     return JSON.pretty_generate(@output)
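In short, 0.0.10 inserts a cm_hash argument before the splat, and per-file results are now routed through report_results: if cm_hash supplies a :crawler_manager_url they are POSTed to that URL plus "/relay_results" via curb, otherwise they accumulate in @output as before. The sketch below shows how a caller might pass the new argument; it is only an illustration of the changed signature under assumed values, not documentation from the gem. The directory paths, the process block, the block and failure_mode placeholders, and the crawler manager URL and selector id are all hypothetical.

require 'dircrawl'
require 'json'

# Hypothetical process block: crawl_dir calls JSON.parse on whatever the block
# returns, so it should return a JSON string.
process_block = lambda do |filepath, *args|
  JSON.generate("file" => filepath, "size" => File.size(filepath))
end

# New in 0.0.10: optional crawler manager info. Passing nil instead keeps the
# old batch behavior, with results collected in @output and read via get_output.
cm_hash = {
  :crawler_manager_url => "http://localhost:3000",  # assumed endpoint, illustration only
  :selector_id => "example_selector"                # assumed id, illustration only
}

# Constructing the object immediately runs crawl_dir, as in 0.0.9.
crawler = DirCrawl.new("/data/input", "/data/output", ".keep", true,
                       process_block, nil, nil, "debug", cm_hash)

# Batch mode only (cm_hash nil): get_output returns the collected results as JSON.
puts crawler.get_output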
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: dircrawl
 version: !ruby/object:Gem::Version
-  version: 0.0.9
+  version: 0.0.10
 platform: ruby
 authors:
 - M. C. McGrath
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-
+date: 2016-10-11 00:00:00.000000000 Z
 dependencies: []
 description: Run block on all files in dir
 email: shidash@shidash.com
@@ -38,8 +38,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.
+rubygems_version: 2.4.8
 signing_key:
 specification_version: 4
 summary: Run block on all files in dir
 test_files: []
+has_rdoc: