RubyGems - spidr - Versions diffs - 0.6.0 → 0.7.0 - Mend

spidr 0.6.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (35) hide show

checksums.yaml +5 -5
data/.editorconfig +11 -0
data/.github/workflows/ruby.yml +26 -0
data/.gitignore +4 -5
data/ChangeLog.md +17 -0
data/Gemfile +8 -5
data/LICENSE.txt +1 -1
data/README.md +137 -78
data/Rakefile +1 -0
data/gemspec.yml +8 -1
data/lib/spidr/agent/actions.rb +1 -1
data/lib/spidr/agent/events.rb +1 -1
data/lib/spidr/agent/filters.rb +55 -56
data/lib/spidr/agent/sanitizers.rb +6 -9
data/lib/spidr/agent.rb +230 -120
data/lib/spidr/auth_store.rb +10 -6
data/lib/spidr/page/content_types.rb +51 -0
data/lib/spidr/page/html.rb +17 -19
data/lib/spidr/page/status_codes.rb +12 -10
data/lib/spidr/proxy.rb +6 -14
data/lib/spidr/rules.rb +5 -8
data/lib/spidr/session_cache.rb +23 -21
data/lib/spidr/settings/proxy.rb +19 -5
data/lib/spidr/spidr.rb +16 -6
data/lib/spidr/version.rb +1 -1
data/spec/agent_spec.rb +357 -10
data/spec/example_page.rb +2 -0
data/spec/page/content_types_spec.rb +22 -0
data/spec/page/html_spec.rb +255 -51
data/spec/page/status_codes_spec.rb +4 -4
data/spec/proxy_spec.rb +2 -2
data/spec/settings/proxy_examples.rb +31 -11
data/spec/spec_helper.rb +3 -0
metadata +19 -19
data/.travis.yml +0 -14

data/lib/spidr/agent.rb CHANGED Viewed

@@ -19,12 +19,12 @@ module Spidr
     include Settings::UserAgent
-    # HTTP Host Header to use
+    # HTTP `Host` Header to use
     #
     # @return [String]
     attr_accessor :host_header
-    # HTTP Host Headers to use for specific hosts
+    # HTTP `Host` Headers to use for specific hosts
     #
     # @return [Hash{String,Regexp => String}]
     attr_reader :host_headers
@@ -96,70 +96,110 @@ module Spidr
     #
     # Creates a new Agent object.
     #
-    # @param [Hash] options
-    #   Additional options
+    # @param [String, nil] host_header
+    #   The HTTP `Host` header to use with each request.
     #
-    # @option options [Integer] :open_timeout (Spidr.open_timeout)
-    #   Optional open timeout.
+    # @param [Hash{String,Regexp => String}] host_headers
+    #   The HTTP `Host` headers to use for specific hosts.
     #
-    # @option options [Integer] :read_timeout (Spidr.read_timeout)
+    # @param [Hash{String => String}] default_headers
+    #   Default headers to set for every request.
+    #
+    # @param [String, nil] user_agent
+    #   The `User-Agent` string to send with each request.
+    #
+    # @param [String, nil] referer
+    #   The `Referer` URL to send with each request.
+    #
+    # @param [Integer, nil] open_timeout
+    #   Optional open connection timeout.
+    #
+    # @param [Integer, nil] read_timeout
     #   Optional read timeout.
     #
-    # @option options [Integer] :ssl_timeout (Spidr.ssl_timeout)
-    #   Optional ssl timeout.
+    # @param [Integer, nil] ssl_timeout
+    #   Optional SSL connection timeout.
     #
-    # @option options [Integer] :continue_timeout (Spidr.continue_timeout)
+    # @param [Integer, nil] continue_timeout
     #   Optional continue timeout.
     #
-    # @option options [Integer] :keep_alive_timeout (Spidr.keep_alive_timeout)
-    #   Optional keep_alive timeout.
+    # @param [Integer, nil] keep_alive_timeout
+    #   Optional `Keep-Alive` timeout.
     #
-    # @option options [Hash] :proxy (Spidr.proxy)
+    # @param [Spidr::Proxy, Hash, URI::HTTP, String, nil] proxy
     #   The proxy information to use.
     #
-    # @option :proxy [String] :host
+    # @option proxy [String] :host
     #   The host the proxy is running on.
     #
-    # @option :proxy [Integer] :port
+    # @option proxy [Integer] :port (8080)
     #   The port the proxy is running on.
     #
-    # @option :proxy [String] :user
+    # @option proxy [String, nil] :user
     #   The user to authenticate as with the proxy.
     #
-    # @option :proxy [String] :password
+    # @option proxy [String, nil] :password
     #   The password to authenticate with.
     #
-    # @option options [Hash{String => String}] :default_headers
-    #   Default headers to set for every request.
+    # @param [Integer] delay
+    #   The number of seconds to pause between each request.
+    #
+    # @param [Integer, nil] limit
+    #   The maximum number of pages to visit.
     #
-    # @option options [String] :host_header
-    #   The HTTP Host header to use with each request.
+    # @param [Integer, nil] max_depth
+    #   The maximum link depth to follow.
     #
-    # @option options [Hash{String,Regexp => String}] :host_headers
-    #   The HTTP Host headers to use for specific hosts.
+    # @param [Set, Array, nil] queue
+    #   The initial queue of URLs to visit.
     #
-    # @option options [String] :user_agent (Spidr.user_agent)
-    #   The User-Agent string to send with each requests.
+    # @param [Set, Array, nil] history
+    #   The initial list of visited URLs.
     #
-    # @option options [String] :referer
-    #   The Referer URL to send with each request.
+    # @param [Boolean] strip_fragments
+    #   Controls whether to strip the fragment components from the URLs.
     #
-    # @option options [Integer] :delay (0)
-    #   The number of seconds to pause between each request.
+    # @param [Boolean] strip_query
+    #   Controls whether to strip the query components from the URLs.
     #
-    # @option options [Set, Array] :queue
-    #   The initial queue of URLs to visit.
+    # @param [Array<String>] schemes
+    #   The list of acceptable URI schemes to visit.
+    #   The `https` scheme will be ignored if `net/https` cannot be loaded.
     #
-    # @option options [Set, Array] :history
-    #   The initial list of visited URLs.
+    # @param [String] host
+    #   The host-name to visit.
     #
-    # @option options [Integer] :limit
-    #   The maximum number of pages to visit.
+    # @param [Array<String, Regexp, Proc>] hosts
+    #   The patterns which match the host-names to visit.
     #
-    # @option options [Integer] :max_depth
-    #   The maximum link depth to follow.
+    # @param [Array<String, Regexp, Proc>] ignore_hosts
+    #   The patterns which match the host-names to not visit.
+    #
+    # @param [Array<Integer, Regexp, Proc>] ports
+    #   The patterns which match the ports to visit.
     #
-    # @option options [Boolean] :robots (Spidr.robots?)
+    # @param [Array<Integer, Regexp, Proc>] ignore_ports
+    #   The patterns which match the ports to not visit.
+    #
+    # @param [Array<String, Regexp, Proc>] links
+    #   The patterns which match the links to visit.
+    #
+    # @param [Array<String, Regexp, Proc>] ignore_links
+    #   The patterns which match the links to not visit.
+    #
+    # @param [Array<String, Regexp, Proc>] urls
+    #   The patterns which match the URLs to visit.
+    #
+    # @param [Array<String, Regexp, Proc>] ignore_urls
+    #   The patterns which match the URLs to not visit.
+    #
+    # @param [Array<String, Regexp, Proc>] exts
+    #   The patterns which match the URI path extensions to visit.
+    #
+    # @param [Array<String, Regexp, Proc>] ignore_exts
+    #   The patterns which match the URI path extensions to not visit.
+    #
+    # @param [Boolean] robots
     #   Specifies whether `robots.txt` should be honored.
     #
     # @yield [agent]
@@ -169,58 +209,99 @@ module Spidr
     # @yieldparam [Agent] agent
     #   The newly created agent.
     #
-    # @see #initialize_sanitizers
-    # @see #initialize_filters
-    # @see #initialize_actions
-    # @see #initialize_events
-    #
-    def initialize(options={})
-      @host_header  = options[:host_header]
-      @host_headers = {}
-      if options[:host_headers]
-        @host_headers.merge!(options[:host_headers])
-      end
-      @default_headers = {}
-      if options[:default_headers]
-        @default_headers.merge!(options[:default_headers])
-      end
-      @user_agent = options.fetch(:user_agent,Spidr.user_agent)
-      @referer    = options[:referer]
-      @sessions   = SessionCache.new(options)
+    def initialize(# header keyword arguments
+                   host_header:        nil,
+                   host_headers:       {},
+                   default_headers:    {},
+                   user_agent:         Spidr.user_agent,
+                   referer:            nil,
+                   # session cache keyword arguments
+                   proxy:              Spidr.proxy,
+                   open_timeout:       Spidr.open_timeout,
+                   ssl_timeout:        Spidr.ssl_timeout,
+                   read_timeout:       Spidr.read_timeout,
+                   continue_timeout:   Spidr.continue_timeout,
+                   keep_alive_timeout: Spidr.keep_alive_timeout,
+                   # spidering controls keyword arguments
+                   delay:     0,
+                   limit:     nil,
+                   max_depth: nil,
+                   # history keyword arguments
+                   queue:   nil,
+                   history: nil,
+                   # sanitizer keyword arguments
+                   strip_fragments: true,
+                   strip_query:     false,
+                   # filtering keyword arguments
+                   schemes:      self.class.default_schemes,
+                   host:         nil,
+                   hosts:        nil,
+                   ignore_hosts: nil,
+                   ports:        nil,
+                   ignore_ports: nil,
+                   links:        nil,
+                   ignore_links: nil,
+                   urls:         nil,
+                   ignore_urls:  nil,
+                   exts:         nil,
+                   ignore_exts:  nil,
+                   # robots keyword arguments
+                   robots:       Spidr.robots?)
+      @host_header  = host_header
+      @host_headers = host_headers
+      @default_headers = default_headers
+      @user_agent = user_agent
+      @referer    = referer
+      @sessions   = SessionCache.new(
+        proxy:              proxy,
+        open_timeout:       open_timeout,
+        ssl_timeout:        ssl_timeout,
+        read_timeout:       read_timeout,
+        continue_timeout:   continue_timeout,
+        keep_alive_timeout: keep_alive_timeout
+      )
       @cookies    = CookieJar.new
       @authorized = AuthStore.new
       @running  = false
-      @delay    = options.fetch(:delay,0)
+      @delay    = delay
       @history  = Set[]
       @failures = Set[]
       @queue    = []
-      @limit     = options[:limit]
+      @limit     = limit
       @levels    = Hash.new(0)
-      @max_depth = options[:max_depth]
-      if options[:queue]
-        self.queue = options[:queue]
-      end
-      if options[:history]
-        self.history = options[:history]
-      end
-      initialize_sanitizers(options)
-      initialize_filters(options)
-      initialize_actions(options)
-      initialize_events(options)
-      if options.fetch(:robots,Spidr.robots?)
-        initialize_robots
-      end
+      @max_depth = max_depth
+      self.queue   = queue   if queue
+      self.history = history if history
+      initialize_sanitizers(
+        strip_fragments: strip_fragments,
+        strip_query:     strip_query
+      )
+      initialize_filters(
+        schemes:      schemes,
+        host:         host,
+        hosts:        hosts,
+        ignore_hosts: ignore_hosts,
+        ports:        ports,
+        ignore_ports: ignore_ports,
+        links:        links,
+        ignore_links: ignore_links,
+        urls:         urls,
+        ignore_urls:  ignore_urls,
+        exts:         exts,
+        ignore_exts:  ignore_exts
+      )
+      initialize_actions
+      initialize_events
+      initialize_robots if robots
       yield self if block_given?
     end
@@ -231,8 +312,8 @@ module Spidr
     # @param [URI::HTTP, String] url
     #   The URL to start spidering at.
     #
-    # @param [Hash] options
-    #   Additional options. See {Agent#initialize}.
+    # @param [Hash{Symbol => Object}] kwargs
+    #   Additional keyword arguments. See {Agent#initialize}.
     #
     # @yield [agent]
     #   If a block is given, it will be passed the newly created agent
@@ -241,12 +322,16 @@ module Spidr
     # @yieldparam [Agent] agent
     #   The newly created agent.
     #
+    # @return [Agent]
+    #   The created agent object.
+    #
     # @see #initialize
     # @see #start_at
     #
-    def self.start_at(url,options={},&block)
-      agent = new(options,&block)
+    def self.start_at(url,**kwargs,&block)
+      agent = new(**kwargs,&block)
       agent.start_at(url)
+      return agent
     end
     #
@@ -255,8 +340,8 @@ module Spidr
     # @param [URI::HTTP, String] url
     #   The web-site to spider.
     #
-    # @param [Hash] options
-    #   Additional options. See {Agent#initialize}.
+    # @param [Hash{Symbol => Object}] kwargs
+    #   Additional keyword arguments. See {Agent#initialize}.
     #
     # @yield [agent]
     #   If a block is given, it will be passed the newly created agent
@@ -265,13 +350,17 @@ module Spidr
     # @yieldparam [Agent] agent
     #   The newly created agent.
     #
+    # @return [Agent]
+    #   The created agent object.
+    #
     # @see #initialize
     #
-    def self.site(url,options={},&block)
-      url = URI(url.to_s) unless url.kind_of?(URI)
+    def self.site(url,**kwargs,&block)
+      url = URI(url)
-      agent = new(options.merge(host: url.host),&block)
+      agent = new(host: url.host, **kwargs, &block)
       agent.start_at(url)
+      return agent
     end
     #
@@ -280,8 +369,8 @@ module Spidr
     # @param [String] name
     #   The host-name to spider.
     #
-    # @param [Hash] options
-    #   Additional options. See {Agent#initialize}.
+    # @param [Hash{Symbol => Object}] kwargs
+    #   Additional keyword arguments. See {Agent#initialize}.
     #
     # @yield [agent]
     #   If a block is given, it will be passed the newly created agent
@@ -290,11 +379,44 @@ module Spidr
     # @yieldparam [Agent] agent
     #   The newly created agent.
     #
+    # @return [Agent]
+    #   The created agent object.
+    #
     # @see #initialize
     #
-    def self.host(name,options={},&block)
-      agent = new(options.merge(host: name),&block)
+    def self.host(name,**kwargs,&block)
+      agent = new(host: name, **kwargs, &block)
       agent.start_at(URI::HTTP.build(host: name, path: '/'))
+      return agent
+    end
+    #
+    # Creates a new agent and spiders the entire domain.
+    #
+    # @param [String] name
+    #   The top-level domain to spider.
+    #
+    # @param [Hash{Symbol => Object}] kwargs
+    #   Additional keyword arguments. See {Agent#initialize}.
+    #
+    # @yield [agent]
+    #   If a block is given, it will be passed the newly created agent
+    #   before it begins spidering.
+    #
+    # @yieldparam [Agent] agent
+    #   The newly created agent.
+    #
+    # @return [Agent]
+    #   The created agent object.
+    #
+    # @see #initialize
+    #
+    # @since 0.7.0
+    #
+    def self.domain(name,**kwargs,&block)
+      agent = new(host: /(^|\.)#{Regexp.escape(name)}$/, **kwargs, &block)
+      agent.start_at(URI::HTTP.build(host: name, path: '/'))
+      return agent
     end
     #
@@ -314,10 +436,10 @@ module Spidr
     #
     # Sets the proxy information that the agent uses.
     #
-    # @param [Proxy] new_proxy
+    # @param [Proxy, Hash, URI::HTTP, String, nil] new_proxy
     #   The new proxy information.
     #
-    # @return [Hash]
+    # @return [Proxy]
     #   The new proxy information.
     #
     # @see SessionCache#proxy=
@@ -408,9 +530,7 @@ module Spidr
       @history.clear
       new_history.each do |url|
-        url = URI(url.to_s) unless url.kind_of?(URI)
-        @history << url
+        @history << URI(url)
       end
       return @history
@@ -425,7 +545,7 @@ module Spidr
     #   The links which have been visited.
     #
     def visited_links
-      @history.map { |url| url.to_s }
+      @history.map(&:to_s)
     end
     #
@@ -435,7 +555,7 @@ module Spidr
     #   The hosts which have been visited.
     #
     def visited_hosts
-      visited_urls.map { |uri| uri.host }.uniq
+      visited_urls.map(&:host).uniq
     end
     #
@@ -448,9 +568,7 @@ module Spidr
     #   Specifies whether a URL was visited.
     #
     def visited?(url)
-      url = URI(url.to_s) unless url.kind_of?(URI)
-      return @history.include?(url)
+      @history.include?(URI(url))
     end
     #
@@ -469,9 +587,7 @@ module Spidr
       @failures.clear
       new_failures.each do |url|
-        url = URI(url.to_s) unless url.kind_of?(URI)
-        @failures << url
+        @failures << URI(url)
       end
       return @failures
@@ -487,9 +603,7 @@ module Spidr
     #   Specifies whether the given URL was unable to be visited.
     #
     def failed?(url)
-      url = URI(url.to_s) unless url.kind_of?(URI)
-      return @failures.include?(url)
+      @failures.include?(URI(url))
     end
     alias pending_urls queue
@@ -510,9 +624,7 @@ module Spidr
       @queue.clear
       new_queue.each do |url|
-        url = URI(url.to_s) unless url.kind_of?(URI)
-        @queue << url
+        @queue << URI(url)
       end
       return @queue
@@ -544,7 +656,7 @@ module Spidr
     def enqueue(url,level=0)
       url = sanitize_url(url)
-      if (!(queued?(url)) && visit?(url))
+      if (!queued?(url) && visit?(url))
         link = url.to_s
         begin
@@ -594,7 +706,7 @@ module Spidr
     #   The page for the response, or `nil` if the request failed.
     #
     def get_page(url)
-      url = URI(url.to_s)
+      url = URI(url)
       prepare_request(url) do |session,path,headers|
         new_page = Page.new(url,session.get(path,headers))
@@ -629,7 +741,7 @@ module Spidr
     # @since 0.2.2
     #
     def post_page(url,post_data='')
-      url = URI(url.to_s) unless url.kind_of?(URI)
+      url = URI(url)
       prepare_request(url) do |session,path,headers|
         new_page = Page.new(url,session.post(path,post_data,headers))
@@ -643,7 +755,7 @@ module Spidr
     end
     #
-    # Visits a given URL, and enqueus the links recovered from the URL
+    # Visits a given URL, and enqueues the links recovered from the URL
     # to be visited later.
     #
     # @param [URI::HTTP, String] url
@@ -725,7 +837,7 @@ module Spidr
       unless @host_headers.empty?
         @host_headers.each do |name,header|
-          if host.match(name)
+          if url.host.match(name)
             headers['Host'] = header
             break
           end
@@ -769,8 +881,6 @@ module Spidr
     # @since 0.2.2
     #
     def prepare_request(url,&block)
-      host = url.host
-      port = url.port
       path = unless url.path.empty?
                url.path
              else

data/lib/spidr/auth_store.rb CHANGED Viewed

@@ -34,7 +34,7 @@ module Spidr
     #
     def [](url)
       # normalize the url
-      url = URI(url.to_s) unless url.kind_of?(URI)
+      url = URI(url)
       key = [url.scheme, url.host, url.port]
       paths = @credentials[key]
@@ -42,7 +42,7 @@ module Spidr
       return nil unless paths
       # longest path first
-      ordered_paths = paths.keys.sort_by { |key| key.length }.reverse
+      ordered_paths = paths.keys.sort_by { |path_key| -path_key.length }
       # directories of the path
       path_dirs = URI.expand_path(url.path).split('/')
@@ -70,7 +70,7 @@ module Spidr
     #
     def []=(url,auth)
       # normalize the url
-      url = URI(url.to_s) unless url.kind_of?(URI)
+      url = URI(url)
       # normalize the URL path
       path = URI.expand_path(url.path)
@@ -109,7 +109,7 @@ module Spidr
     # or `nil` if no authorization exists.
     #
     # @param [URI] url
-    #   The url.
+    #   The URL.
     #
     # @return [String, nil]
     #   The base64 encoded authorization string or `nil`.
@@ -118,7 +118,7 @@ module Spidr
     #
     def for_url(url)
       if (auth = self[url])
-        return Base64.encode64("#{auth.username}:#{auth.password}")
+        Base64.encode64("#{auth.username}:#{auth.password}")
       end
     end
@@ -144,7 +144,11 @@ module Spidr
     # @since 0.2.2
     #
     def size
-      @credentials.inject(0) { |res, arr| res + arr[1].length }
+      total = 0
+      @credentials.each_value { |paths| total += paths.length }
+      return total
     end
     #

data/lib/spidr/page/content_types.rb CHANGED Viewed

@@ -221,5 +221,56 @@ module Spidr
     def zip?
       is_content_type?('application/zip')
     end
+    #
+    # Determines if the page is a PNG image.
+    #
+    # @return [Boolean]
+    #   Specifies whether the page is a PNG image.
+    #
+    # @since 0.7.0
+    #
+    def png?
+      is_content_type?('image/png')
+    end
+    #
+    # Determines if the page is a GIF image.
+    #
+    # @return [Boolean]
+    #   Specifies whether the page is a GIF image.
+    #
+    # @since 0.7.0
+    #
+    def gif?
+      is_content_type?('image/gif')
+    end
+    #
+    # Determines if the page is a JPEG image.
+    #
+    # @return [Boolean]
+    #   Specifies whether the page is a JPEG image.
+    #
+    # @since 0.7.0
+    #
+    def jpeg?
+      is_content_type?('image/jpeg')
+    end
+    #
+    # Determines if the page is an ICO image.
+    #
+    # @return [Boolean]
+    #   Specifies whether the page is an ICO image.
+    #
+    # @since 0.7.0
+    #
+    def ico?
+      is_content_type?('image/x-icon') ||
+        is_content_type?('image/vnd.microsoft.icon')
+    end
+    alias icon? ico?
   end
 end