arachnid2 0.3.9 → 0.4.0
- checksums.yaml +4 -4
- data/README.md +30 -0
- data/lib/arachnid2.rb +1 -1
- data/lib/arachnid2/{cached_arachnid_responses.rb → cached_responses.rb} +2 -2
- data/lib/arachnid2/exoskeleton.rb +1 -1
- data/lib/arachnid2/typhoeus.rb +41 -22
- data/lib/arachnid2/version.rb +1 -1
- data/lib/arachnid2/watir.rb +52 -25
- metadata +4 -4
checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 6e4e32b14e6ad9a1f4a71bbe4099ec014176a2919e6f560ee36e38d93064cf3d
+  data.tar.gz: 501f5e7d3e8cf5c94391f8f5b70c2e08c96fd404d1409c8815792ceceaadc33d
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: cd6fbad6aaab1e7da49f4fe178c00215ff236264f3f3ed99903b3d81338f54360bb2197845d50150df1ab5d19ccd9d53c9084d096d957cd6005f690c65d38e41
+  data.tar.gz: '0158f5b7469b33dafd07206654cf7793838b6644b623a6882c0057a29e994b1ae415fa97c56b3898ce06ece142ecd1f84853732426e356a1bb8cabda8b0fdcd1'
data/README.md CHANGED

@@ -186,6 +186,36 @@ with_watir = true
 Arachnid2.new(url).crawl(opts, with_watir)
 ```
 
+Arachnid2 has base defaults which you might want to address when
+employing Watir.
+
+* First, the default crawl time is 15 seconds.
+  As browser page loads can take this long, you will probably want to
+  set a higher crawl time.
+* Simply storing the browser is not a great idea, since it will
+  be inaccessible after it is closed. Instead, consider nabbing the
+  HTML, cookies, or whatever content is required during the crawl.
+* Finally, note that Firefox is the default browser.
+
+```ruby
+require 'arachnid2'
+
+with_watir = true
+responses = []
+url = "http://maximumfun.org"
+max = 60
+browser = :chrome
+opts = {time_box: max, browser_type: browser}
+
+spider = Arachnid2.new(url)
+spider.crawl(opts, with_watir) do |response|
+  response.body.wait_until(&:present?)
+  responses << response.body.html if response.body.present?
+end
+
+```
+
 #### Options
 
 See the Typhoeus options above — most apply to Watir as well, with
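Since the Watir browser is torn down when the crawl ends, anything you need must be captured inside the block, as the new README text advises. Here is a hedged variation on the snippet above that collects cookies as well as HTML; `browser.cookies.to_a` is standard Watir, and the rest follows the README example:

```ruby
require 'arachnid2'

cookies = []
opts = { time_box: 60, browser_type: :chrome }

# In Watir mode the block receives the live browser; grab whatever you
# need now, because the browser is closed once the crawl finishes.
Arachnid2.new("http://maximumfun.org").crawl(opts, true) do |browser|
  browser.body.wait_until(&:present?)
  cookies.concat(browser.cookies.to_a) # array of cookie hashes
end
```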
data/lib/arachnid2.rb CHANGED

data/lib/arachnid2/{cached_arachnid_responses.rb → cached_responses.rb} CHANGED

@@ -1,6 +1,6 @@
 require 'net/http'
 require 'json'
-module CachedArachnidResponses
+module CachedResponses
   CACHE_SERVICE_URL = ENV['ARACHNID_CACHED_SERVICE_ADDRESS'].freeze
 
   def load_data(_url, _options)
@@ -15,7 +15,7 @@ module CachedArachnidResponses
 
       body = ::JSON.parse(response.body)
       responses_list = Base64.decode64(body['encrypted_response'])
-      return Marshal.load responses_list # here we get
+      return Marshal.load responses_list # here we get an Array of `Typhoeus::Response`s
     end
   rescue StandardError
     nil
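The comment completed above implies a round trip worth spelling out: the cache service returns JSON whose `encrypted_response` field is a Base64-encoded `Marshal` dump of an Array of `Typhoeus::Response`s. A minimal sketch of that cycle, using plain strings as stand-ins for response objects:

```ruby
require 'base64'

# Stand-ins for the Array of Typhoeus::Response objects the gem caches.
responses = ["<html>one</html>", "<html>two</html>"]

# What the cache service stores: a Marshal dump, Base64-encoded.
payload = Base64.encode64(Marshal.dump(responses))

# What load_data does on the way back: decode, then Marshal.load.
# Marshal.load is unsafe on untrusted input, so it should only ever
# see payloads from a trusted cache service.
restored = Marshal.load(Base64.decode64(payload))

restored == responses # => true
```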
data/lib/arachnid2/typhoeus.rb CHANGED

@@ -1,6 +1,6 @@
 class Arachnid2
   class Typhoeus
-    include CachedArachnidResponses
+    include CachedResponses
     include Arachnid2::Exoskeleton
 
     def initialize(url)
@@ -17,39 +17,58 @@ class Arachnid2
         max_concurrency.times do
           q = @global_queue.shift
 
-          break if
-            Time.now > crawl_options[:time_limit] || \
-            memory_danger?
-
+          break if time_to_stop?
           @global_visited.insert(q)
 
-
-
-          data = load_data(@url, opts)
-          data.each { |response| yield response } and return unless data.nil?
-
-          request.on_complete do |response|
-            @cached_data.push(response)
-            links = process(response.effective_url, response.body)
-            next unless links
-
-            yield response
-
-            vacuum(links, response.effective_url)
-          end
+          found_in_cache = use_cache(q, opts, &Proc.new)
+          return if found_in_cache
 
-
+          request = ::Typhoeus::Request.new(q, request_options)
+          requestable = after_request(request, &Proc.new)
+          @hydra.queue(request) if requestable
         end # max_concurrency.times do
 
         @hydra.run
-
       end # until @global_queue.empty?
-      put_cached_data(@url, opts, @cached_data) unless @cached_data.empty?
     ensure
       @cookie_file.close! if @cookie_file
     end # def crawl(opts = {})
 
     private
+    def after_request(request)
+      request.on_complete do |response|
+        cacheable = use_response(response, &Proc.new)
+        return unless cacheable
+
+        put_cached_data(response.effective_url, @options, response)
+      end
+
+      true
+    end
+
+    def use_response(response)
+      links = process(response.effective_url, response.body)
+      return unless links
+
+      yield response
+
+      vacuum(links, response.effective_url)
+      true
+    end
+
+    def use_cache(url, options)
+      data = load_data(url, options)
+      use_response(data, &Proc.new) if data
+
+      data
+    end
+
+    def time_to_stop?
+      @global_visited.size >= crawl_options[:max_urls] || \
+        Time.now > crawl_options[:time_limit] || \
+        memory_danger?
+    end
+
     def typhoeus_preflight
       @hydra = ::Typhoeus::Hydra.new(:max_concurrency => max_concurrency)
       typhoeus_proxy_options
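A note on the `&Proc.new` calls threaded through this refactor: inside a method that was itself called with a block, a bare `Proc.new` captures that block, letting `crawl` forward its caller's block into `use_cache`, `after_request`, and `use_response` without naming it. A self-contained sketch of the idiom; note that block-capturing `Proc.new` was deprecated in Ruby 2.7 and removed in 3.0, where an explicit `&block` parameter is required instead:

```ruby
# crawl forwards the block it was called with down to a helper,
# exactly as the refactored methods above do (works on Ruby <= 2.7).
def crawl
  %w[a b c].each { |page| handle(page, &Proc.new) }
end

# The forwarded block arrives as this method's own block.
def handle(page)
  yield "response for #{page}"
end

crawl { |r| puts r }
# response for a
# response for b
# response for c
```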
data/lib/arachnid2/version.rb CHANGED
data/lib/arachnid2/watir.rb CHANGED

@@ -19,44 +19,71 @@ class Arachnid2
       q = @global_queue.shift
       links = nil
 
-      break if
-      break if Time.now > crawl_options[:time_limit]
-      break if memory_danger?
+      break if time_to_stop?
 
       @global_visited.insert(q)
 
+      make_request(q, &Proc.new)
+    end # until @global_queue.empty?
+  ensure
+    @browser.close if @browser rescue nil
+    @headless.destroy if @headless rescue nil
+  end
+
+  private
+  def make_request(q)
     begin
-
-
-    rescue Selenium::WebDriver::Error::UnknownError => e
-      # Firefox and Selenium, in their infinite wisdom
-      # raise an error when a page cannot be loaded.
-      # At the time of writing this, the page at
-      # thewirecutter.com/cars/accessories-auto
-      # causes such an issue (too many redirects).
-      # This error handling moves us on from those pages.
-      raise e unless e.message =~ /.*Reached error page.*/i
-      next
-    end
-    links = process(browser.url, browser.body.html) if browser.body.exists?
-    next unless links
-
-    yield browser
+      links = browse_links(q, &Proc.new)
+      return unless links
 
       vacuum(links, browser.url)
     rescue Selenium::WebDriver::Error::NoSuchWindowError, Net::ReadTimeout => e
+      msg = "WARNING [arachnid2] Arachnid2::Watir#make_request " \
+            "is ignoring an error: " \
+            "#{e.class} - #{e.message}"
+      puts msg
    rescue => e
      raise e if raise_before_retry?(e.class)
+      msg = "WARNING [arachnid2] Arachnid2::Watir#make_request " \
+            "is retrying once after an error: " \
+            "#{e.class} - #{e.message}"
+      puts msg
+      e.backtrace[0..4].each{|l| puts "\t#{l}"}; puts "..."
       reset_for_retry
     end
+  end
 
-
-
-
-
-
+  def browse_links(url)
+    return unless navigate(url)
+
+    yield browser
+
+    process(browser.url, browser.body.html) if browser.body.exists?
+  end
+
+  def navigate(url)
+    begin
+      browser.goto url
+    rescue Selenium::WebDriver::Error::UnknownError => e
+      # Firefox and Selenium, in their infinite wisdom
+      # raise an error when a page cannot be loaded.
+      # At the time of writing this, the page at
+      # thewirecutter.com/cars/accessories-auto
+      # causes such an issue (too many redirects).
+      # This error handling moves us on from those pages.
+      raise e unless e.message =~ /.*Reached error page.*/i
+      return
+    end
+
+    true
+  end
+
+  def time_to_stop?
+    @global_visited.size >= crawl_options[:max_urls] || \
+      Time.now > crawl_options[:time_limit] || \
+      memory_danger?
+  end
 
-  private
   def raise_before_retry?(klass)
     @already_retried || \
       "#{klass}".include?("Selenium") || \
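Both new `time_to_stop?` implementations cut the crawl off once `crawl_options[:max_urls]` is reached, alongside the existing time and memory checks. A sketch of exercising them: `time_box` is the documented option behind `crawl_options[:time_limit]`, while the `:max_urls` option key is an assumption by analogy, since this diff only shows the `crawl_options` read:

```ruby
require 'arachnid2'

opts = {
  time_box: 30,  # documented: seconds before crawl_options[:time_limit] trips
  max_urls: 50   # assumed key feeding the new crawl_options[:max_urls] check
}

# Typhoeus mode yields Typhoeus::Response objects, per the diff above.
Arachnid2.new("http://maximumfun.org").crawl(opts) do |response|
  puts response.effective_url
end
```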
metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: arachnid2
 version: !ruby/object:Gem::Version
-  version: 0.
+  version: 0.4.0
 platform: ruby
 authors:
 - Sam Nissen
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2020-
+date: 2020-07-15 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -184,7 +184,7 @@ files:
 - bin/console
 - bin/setup
 - lib/arachnid2.rb
-- lib/arachnid2/
+- lib/arachnid2/cached_responses.rb
 - lib/arachnid2/exoskeleton.rb
 - lib/arachnid2/typhoeus.rb
 - lib/arachnid2/version.rb
@@ -208,7 +208,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 - !ruby/object:Gem::Version
   version: '0'
 requirements: []
-rubygems_version: 3.
+rubygems_version: 3.1.2
 signing_key:
 specification_version: 4
 summary: A simple, fast web crawler