RubyGems - spidr - Versions diffs - 0.5.0 → 0.6.0 - Mend

spidr 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (47)

checksums.yaml +4 -4
data/.travis.yml +14 -0
data/ChangeLog.md +20 -2
data/Gemfile +2 -2
data/README.md +4 -2
data/Rakefile +1 -0
data/gemspec.yml +1 -1
data/lib/spidr/agent.rb +145 -85
data/lib/spidr/agent/filters.rb +1 -9
data/lib/spidr/agent/robots.rb +36 -0
data/lib/spidr/page.rb +76 -28
data/lib/spidr/page/{headers.rb → content_types.rb} +2 -147
data/lib/spidr/page/cookies.rb +60 -0
data/lib/spidr/page/{links.rb → html.rb} +47 -23
data/lib/spidr/page/status_codes.rb +112 -0
data/lib/spidr/proxy.rb +56 -0
data/lib/spidr/session_cache.rb +60 -24
data/lib/spidr/settings.rb +3 -0
data/lib/spidr/settings/proxy.rb +61 -0
data/lib/spidr/settings/timeouts.rb +33 -0
data/lib/spidr/settings/user_agent.rb +14 -0
data/lib/spidr/spidr.rb +15 -79
data/lib/spidr/version.rb +1 -1
data/spec/agent/actions_spec.rb +158 -32
data/spec/agent/filters_spec.rb +46 -29
data/spec/agent/sanitizers_spec.rb +25 -31
data/spec/agent_spec.rb +772 -50
data/spec/example_app.rb +27 -0
data/spec/example_page.rb +33 -0
data/spec/page/content_types_spec.rb +150 -0
data/spec/page/cookies_spec.rb +58 -0
data/spec/page/html_spec.rb +524 -0
data/spec/page/status_codes_spec.rb +87 -0
data/spec/page_spec.rb +114 -78
data/spec/proxy_spec.rb +45 -0
data/spec/session_cache.rb +103 -2
data/spec/settings/proxy_examples.rb +82 -0
data/spec/settings/timeouts_examples.rb +93 -0
data/spec/settings/user_agent_examples.rb +25 -0
data/spec/spidr_spec.rb +6 -29
data/spidr.gemspec +38 -109
metadata +35 -31
data/lib/spidr/page/body.rb +0 -98
data/spec/helpers/history.rb +0 -34
data/spec/helpers/page.rb +0 -8
data/spec/helpers/wsoc.rb +0 -83
data/spec/page_examples.rb +0 -21

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 31e83cba8fd67a2527641b404f82773d60b5fb97
-  data.tar.gz: cbd735b652d209cd49a6990eedf3de6f7a22e385
+  metadata.gz: 114364609c8da8613e22e9f18777cd9c79e1ac8a
+  data.tar.gz: edb694a4a695217a2adf2cd44ffeafb679e8292c
 SHA512:
-  metadata.gz: d33742df9e9a4ec8090d4934de3562036e149195b3567ac1143c4637012876d86a18618e9f89251506ed8aa1d9c85cc18ed324774d4da29038e975827698f265
-  data.tar.gz: 24b08172be0184f7c68fbc63b31eaac55b0c55d70b35b8983fbbb1a3ce871e157b0bbf7d598625ef37ec3fe420c7372bc5fdaf7dd4b7131eac6e6e23e465e475
+  metadata.gz: 50064bb0227d7dc0b3ff4bf55ec72b74635c89d6a7bfe24c6948c73ff439b74dbe6f2c72276586389340ed050a30b4814faa0d2584d490badc337c97393bf4f3
+  data.tar.gz: 46326b368267b66d647ea09712ec499dc5dbed82fb28b65d5573832d661328a768e140fcec72a75e68516dae6401ded9558978b13e7c6158249fd14737d1c93f

data/.travis.yml ADDED

@@ -0,0 +1,14 @@
+---
+language: ruby
+rvm:
+  - 2.0.0
+  - 2.1.9
+  - 2.2.4
+  - 2.3.1
+  - jruby
+  - rbx
+matrix:
+  allow_failures:
+    - rvm: jruby
+    - rvm: rbx
+script: rake spec

data/ChangeLog.md CHANGED

@@ -1,3 +1,21 @@
+### 0.6.0 / 2016-08-04
+* Added {Spidr::Proxy}.
+* Added more options to {Spidr::Agent#initialize}:
+  * `:default_headers`: specifies the default headers to set in all requests
+    (@maccman).
+  * `:limit`: specify the maximum number of links to visit.
+  * `:open_timeout`, `:read_timeout`, `:ssl_timeout`, `:continue_timeout`,
+    and `:keep_alive_timeout`: sets `Net::HTTP` timeouts.
+* Allow {Spidr::Settings::Proxy#proxy= Spidr.proxy=} to accept `nil`.
+* Use `Net::HTTPResponse#get_fields` in {Spidr::Page} to correctly return
+  multiple values for repeated headers.
+* Fixed a bug in {Spidr::Page#method_missing} where method names were not being
+  correctly converted to header names.
+* Fixed a bug in {Spidr::Page#cookie_params} where `Set-Cookie` flags were not
+  being filtered out.
+* Rewrote the specs to use webmock and increased spec coverage.
 ### 0.5.0 / 2016-01-03
 * Added support for respecting `robots.txt` files.
@@ -166,8 +184,8 @@
 * Added a HTTP session cache to {Spidr::Agent}, per suggestion of falter.
   * Added `Spidr::Agent#get_session`.
   * Added `Spidr::Agent#kill_session`.
-* Added {Spidr.proxy=}.
-* Added {Spidr.disable_proxy!}.
+* Added {Spidr::Settings::Proxy#proxy= Spidr.proxy=}.
+* Added {Spidr::Settings::Proxy#disable_proxy! Spidr.disable_proxy!}.
 * Aliased `Spidr::Page#txt?` to `Spidr::Page#plain_text?`.
 * Aliased `Spidr::Page#ok?` to `Spidr::Page#is_ok?`.
 * Aliased `Spidr::Page#redirect?` to `Spidr::Page#is_redirect?`.

data/Gemfile CHANGED

@@ -6,15 +6,15 @@ end
 gemspec
 gem 'robots', group: :robots
 group :development do
   gem 'rake'
   gem 'rubygems-tasks', '~> 0.2'
-  gem 'wsoc',     '~> 0.1.3'
   gem 'rspec',    '~> 3.0'
+  gem 'webmock',  '~> 2.0'
+  gem 'sinatra',  '~> 1.0'
   gem 'kramdown', '~> 0.12'
   gem 'yard',     '~> 0.8'

data/README.md CHANGED

@@ -5,6 +5,7 @@
 * [Issues](https://github.com/postmodern/spidr/issues)
 * [Mailing List](http://groups.google.com/group/spidr)
 * [IRC](http://webchat.freenode.net/?channels=spidr&uio=d4)
+* [![Build Status](https://travis-ci.org/postmodern/spidr.svg)](https://travis-ci.org/postmodern/spidr)
 ## Description
@@ -28,7 +29,8 @@ and easy to use.
   * Port number
   * Full link
   * URL extension
-* Provides call-backs for:
+  * Optional `/robots.txt` support.
+* Provides callbacks for:
   * Every visited Page.
   * Every visited URL.
   * Every visited URL that matches a specified pattern.
@@ -181,7 +183,7 @@ Skip the processing of links:
 ## Requirements
-* [ruby] >= 1.9.1
+* [ruby] >= 2.0.0
 * [nokogiri] ~> 1.3
 ## Install

data/Rakefile CHANGED

@@ -16,3 +16,4 @@ task :default => :spec
 require 'yard'
 YARD::Rake::YardocTask.new
+task :doc => :yard

data/gemspec.yml CHANGED

@@ -11,7 +11,7 @@ email: postmodern.mod3@gmail.com
 homepage: https://github.com/postmodern/spidr#readme
 has_yard: true
-required_ruby_version: ">= 1.9.1"
+required_ruby_version: ">= 2.0.0"
 dependencies:
   nokogiri: ~> 1.3

data/lib/spidr/agent.rb CHANGED

@@ -1,7 +1,9 @@
+require 'spidr/settings/user_agent'
 require 'spidr/agent/sanitizers'
 require 'spidr/agent/filters'
 require 'spidr/agent/events'
 require 'spidr/agent/actions'
+require 'spidr/agent/robots'
 require 'spidr/page'
 require 'spidr/session_cache'
 require 'spidr/cookie_jar'
@@ -12,14 +14,11 @@ require 'openssl'
 require 'net/http'
 require 'set'
-begin
-  require 'robots'
-rescue LoadError
-end
 module Spidr
   class Agent
+    include Settings::UserAgent
     # HTTP Host Header to use
     #
     # @return [String]
@@ -30,10 +29,12 @@ module Spidr
     # @return [Hash{String,Regexp => String}]
     attr_reader :host_headers
-    # User-Agent to use
+    # HTTP Headers to use for every request
     #
-    # @return [String]
-    attr_accessor :user_agent
+    # @return [Hash{String => String}]
+    #
+    # @since 0.6.0
+    attr_reader :default_headers
     # HTTP Authentication credentials
     #
@@ -65,11 +66,23 @@ module Spidr
     # @return [Array<URI::HTTP>]
     attr_reader :queue
+    # The session cache
+    #
+    # @return [SessionCache]
+    #
+    # @since 0.6.0
+    attr_reader :sessions
     # Cached cookies
     #
     # @return [CookieJar]
     attr_reader :cookies
+    # Maximum number of pages to visit.
+    #
+    # @return [Integer]
+    attr_reader :limit
     # Maximum depth
     #
     # @return [Integer]
@@ -86,6 +99,21 @@ module Spidr
     # @param [Hash] options
     #   Additional options
     #
+    # @option options [Integer] :open_timeout (Spidr.open_timeout)
+    #   Optional open timeout.
+    #
+    # @option options [Integer] :read_timeout (Spidr.read_timeout)
+    #   Optional read timeout.
+    #
+    # @option options [Integer] :ssl_timeout (Spidr.ssl_timeout)
+    #   Optional ssl timeout.
+    #
+    # @option options [Integer] :continue_timeout (Spidr.continue_timeout)
+    #   Optional continue timeout.
+    #
+    # @option options [Integer] :keep_alive_timeout (Spidr.keep_alive_timeout)
+    #   Optional keep_alive timeout.
+    #
     # @option options [Hash] :proxy (Spidr.proxy)
     #   The proxy information to use.
     #
@@ -101,6 +129,9 @@ module Spidr
     # @option :proxy [String] :password
     #   The password to authenticate with.
     #
+    # @option options [Hash{String => String}] :default_headers
+    #   Default headers to set for every request.
+    #
     # @option options [String] :host_header
     #   The HTTP Host header to use with each request.
     #
@@ -122,6 +153,9 @@ module Spidr
     # @option options [Set, Array] :history
     #   The initial list of visited URLs.
     #
+    # @option options [Integer] :limit
+    #   The maximum number of pages to visit.
+    #
     # @option options [Integer] :max_depth
     #   The maximum link depth to follow.
     #
@@ -148,10 +182,16 @@ module Spidr
         @host_headers.merge!(options[:host_headers])
       end
+      @default_headers = {}
+      if options[:default_headers]
+        @default_headers.merge!(options[:default_headers])
+      end
       @user_agent = options.fetch(:user_agent,Spidr.user_agent)
       @referer    = options[:referer]
-      @sessions   = SessionCache.new(options.fetch(:proxy,Spidr.proxy))
+      @sessions   = SessionCache.new(options)
       @cookies    = CookieJar.new
       @authorized = AuthStore.new
@@ -161,15 +201,16 @@ module Spidr
       @failures = Set[]
       @queue    = []
+      @limit     = options[:limit]
       @levels    = Hash.new(0)
       @max_depth = options[:max_depth]
-      if options.fetch(:robots,Spidr.robots?)
-        unless Object.const_defined?(:Robots)
-          raise(ArgumentError,":robots option given but unable to require 'robots' gem")
-        end
+      if options[:queue]
+        self.queue = options[:queue]
+      end
-        @robots = Robots.new(@user_agent)
+      if options[:history]
+        self.history = options[:history]
       end
       initialize_sanitizers(options)
@@ -177,6 +218,10 @@ module Spidr
       initialize_actions(options)
       initialize_events(options)
+      if options.fetch(:robots,Spidr.robots?)
+        initialize_robots
+      end
       yield self if block_given?
     end
@@ -252,6 +297,37 @@ module Spidr
       agent.start_at(URI::HTTP.build(host: name, path: '/'))
     end
+    #
+    # The proxy information the agent uses.
+    #
+    # @return [Proxy]
+    #   The proxy information.
+    #
+    # @see SessionCache#proxy
+    #
+    # @since 0.2.2
+    #
+    def proxy
+      @sessions.proxy
+    end
+    #
+    # Sets the proxy information that the agent uses.
+    #
+    # @param [Proxy] new_proxy
+    #   The new proxy information.
+    #
+    # @return [Hash]
+    #   The new proxy information.
+    #
+    # @see SessionCache#proxy=
+    #
+    # @since 0.2.2
+    #
+    def proxy=(new_proxy)
+      @sessions.proxy = new_proxy
+    end
     #
     # Clears the history of the agent.
     #
@@ -292,7 +368,7 @@ module Spidr
     def run(&block)
       @running = true
-      until (@queue.empty? || paused?)
+      until (@queue.empty? || paused? || limit_reached?)
         begin
           visit_page(dequeue,&block)
         rescue Actions::Paused
@@ -316,37 +392,6 @@ module Spidr
       @running == true
     end
-    #
-    # The proxy information the agent uses.
-    #
-    # @return [Hash]
-    #   The proxy information.
-    #
-    # @see SessionCache#proxy
-    #
-    # @since 0.2.2
-    #
-    def proxy
-      @sessions.proxy
-    end
-    #
-    # Sets the proxy information that the agent uses.
-    #
-    # @param [Hash] new_proxy
-    #   The new proxy information.
-    #
-    # @return [Hash]
-    #   The new proxy information.
-    #
-    # @see SessionCache#proxy=
-    #
-    # @since 0.2.2
-    #
-    def proxy=(new_proxy)
-      @sessions.proxy = new_proxy
-    end
     #
     # Sets the history of URLs that were previously visited.
     #
@@ -408,19 +453,6 @@ module Spidr
       return @history.include?(url)
     end
-    #
-    # Determines whether a URL is allowed by the robot policy.
-    #
-    # @param [URI::HTTP, String] url
-    #   The URL to check.
-    #
-    # @return [Boolean]
-    #   Specifies whether a URL is allowed by the robot policy.
-    #
-    def robot_allowed?(url)
-      @robots ? @robots.allowed?(url) : true
-    end
     #
     # Sets the list of failed URLs.
     #
@@ -536,7 +568,7 @@ module Spidr
           return false
         rescue Actions::Action
         end
         @queue << url
         @levels[url] = level
         return true
@@ -544,7 +576,7 @@ module Spidr
       return false
     end
     #
     # Requests and creates a new Page object from a given URL.
     #
@@ -676,6 +708,45 @@ module Spidr
     protected
+    #
+    # Prepares request headers for the given URL.
+    #
+    # @param [URI::HTTP] url
+    #   The URL to prepare the request headers for.
+    #
+    # @return [Hash{String => String}]
+    #   The prepared headers.
+    #
+    # @since 0.6.0
+    #
+    def prepare_request_headers(url)
+      # set any additional HTTP headers
+      headers = @default_headers.dup
+      unless @host_headers.empty?
+        @host_headers.each do |name,header|
+          if host.match(name)
+            headers['Host'] = header
+            break
+          end
+        end
+      end
+      headers['Host']     ||= @host_header if @host_header
+      headers['User-Agent'] = @user_agent if @user_agent
+      headers['Referer']    = @referer if @referer
+      if (authorization = @authorized.for_url(url))
+        headers['Authorization'] = "Basic #{authorization}"
+      end
+      if (header_cookies = @cookies.for_host(url.host))
+        headers['Cookie'] = header_cookies
+      end
+      return headers
+    end
     #
     # Normalizes the request path and grabs a session to handle page
     # get and post requests.
@@ -709,29 +780,7 @@ module Spidr
       # append the URL query to the path
       path += "?#{url.query}" if url.query
-      # set any additional HTTP headers
-      headers = {}
-      unless @host_headers.empty?
-        @host_headers.each do |name,header|
-          if host.match(name)
-            headers['Host'] = header
-            break
-          end
-        end
-      end
-      headers['Host']     ||= @host_header if @host_header
-      headers['User-Agent'] = @user_agent if @user_agent
-      headers['Referer']    = @referer if @referer
-      if (authorization = @authorized.for_url(url))
-        headers['Authorization'] = "Basic #{authorization}"
-      end
-      if (header_cookies = @cookies.for_host(url.host))
-        headers['Cookie'] = header_cookies
-      end
+      headers = prepare_request_headers(url)
       begin
         sleep(@delay) if @delay > 0
@@ -762,6 +811,17 @@ module Spidr
       @queue.shift
     end
+    #
+    # Determines if the maximum limit has been reached.
+    #
+    # @return [Boolean]
+    #
+    # @since 0.6.0
+    #
+    def limit_reached?
+      @limit && @history.length >= @limit
+    end
     #
     # Determines if a given URL should be visited.
     #