RubyGems - spidr - Versions diffs - 0.6.0 → 0.6.1 - Mend

spidr 0.6.0 → 0.6.1

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (16) hide show

checksums.yaml +5 -5
data/.travis.yml +9 -7
data/ChangeLog.md +7 -0
data/Gemfile +3 -3
data/README.md +1 -1
data/gemspec.yml +1 -1
data/lib/spidr/agent.rb +11 -23
data/lib/spidr/agent/filters.rb +3 -3
data/lib/spidr/agent/sanitizers.rb +1 -1
data/lib/spidr/auth_store.rb +9 -5
data/lib/spidr/page/html.rb +1 -1
data/lib/spidr/session_cache.rb +3 -3
data/lib/spidr/spidr.rb +1 -0
data/lib/spidr/version.rb +1 -1
data/spec/agent_spec.rb +1 -3
metadata +15 -16

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
-SHA1:
-  metadata.gz: 114364609c8da8613e22e9f18777cd9c79e1ac8a
-  data.tar.gz: edb694a4a695217a2adf2cd44ffeafb679e8292c
+SHA256:
+  metadata.gz: e2202e0ce389cbbc6f88360d7e7b430328bc6da973b69929ebc54b1d92c104bb
+  data.tar.gz: 0777e972ef2cb1d540ee138a7703ee67f88b482260074c13e8c08a8da963aa77
 SHA512:
-  metadata.gz: 50064bb0227d7dc0b3ff4bf55ec72b74635c89d6a7bfe24c6948c73ff439b74dbe6f2c72276586389340ed050a30b4814faa0d2584d490badc337c97393bf4f3
-  data.tar.gz: 46326b368267b66d647ea09712ec499dc5dbed82fb28b65d5573832d661328a768e140fcec72a75e68516dae6401ded9558978b13e7c6158249fd14737d1c93f
+  metadata.gz: 45e923ad3aa59812de4af67cc3a1739d7751749b3aa3205c5979e1f29c302abd43316cd74ad7c61befda2d31b0ff080872047e9713537d3bfe2509a8d555156a
+  data.tar.gz: deae3dbd7d9566723ca8760064c715de8795b1d4e4be1ae0866497d697a411af3e8bbe2a677028a5bf57bba8e681bd9d712cef3a2d0d123bc903af28f5e32a77

data/.travis.yml CHANGED

@@ -1,14 +1,16 @@
 ---
+before_install:
+  - gem update --system
+  - gem install bundler -v "~> 2.0"
 language: ruby
+sudo: false
+cache:
+  - bundler
 rvm:
-  - 2.0.0
-  - 2.1.9
-  - 2.2.4
-  - 2.3.1
+  - 2.5
+  - 2.6
   - jruby
-  - rbx
 matrix:
   allow_failures:
     - rvm: jruby
-    - rvm: rbx
-script: rake spec
+script: bundle exec rake spec

data/ChangeLog.md CHANGED

@@ -1,3 +1,10 @@
+### 0.6.1 / 2019-10-24
+* Check for opaque component of URIs before attempting to set the path
+  component (@kyaroch). This fixes `URI::InvalidURIError: path conflicts with
+  opaque` exceptions.
+* Fix `@robots` instance variable warning (@spk).
 ### 0.6.0 / 2016-08-04
 * Added {Spidr::Proxy}.

data/Gemfile CHANGED

@@ -13,9 +13,9 @@ group :development do
   gem 'rubygems-tasks', '~> 0.2'
   gem 'rspec',    '~> 3.0'
-  gem 'webmock',  '~> 2.0'
+  gem 'webmock',  '~> 3.0'
   gem 'sinatra',  '~> 1.0'
-  gem 'kramdown', '~> 0.12'
-  gem 'yard',     '~> 0.8'
+  gem 'kramdown'
+  gem 'yard',     '~> 0.9'
 end

data/README.md CHANGED

@@ -157,7 +157,7 @@ Find what kinds of web servers a host is using, by accessing the headers:
 Pause the spider on a forbidden page:
-    spider = Spidr.host('company.com') do |spider|
+    Spidr.host('company.com') do |spider|
       spider.every_forbidden_page do |page|
         spider.pause!
       end

data/gemspec.yml CHANGED

@@ -17,4 +17,4 @@ dependencies:
   nokogiri: ~> 1.3
 development_dependencies:
-  bundler: ~> 1.0
+  bundler: ~> 2.0

data/lib/spidr/agent.rb CHANGED

@@ -268,7 +268,7 @@ module Spidr
     # @see #initialize
     #
     def self.site(url,options={},&block)
-      url = URI(url.to_s) unless url.kind_of?(URI)
+      url = URI(url)
       agent = new(options.merge(host: url.host),&block)
       agent.start_at(url)
@@ -408,9 +408,7 @@ module Spidr
       @history.clear
       new_history.each do |url|
-        url = URI(url.to_s) unless url.kind_of?(URI)
-        @history << url
+        @history << URI(url)
       end
       return @history
@@ -425,7 +423,7 @@ module Spidr
     #   The links which have been visited.
     #
     def visited_links
-      @history.map { |url| url.to_s }
+      @history.map(&:to_s)
     end
     #
@@ -435,7 +433,7 @@ module Spidr
     #   The hosts which have been visited.
     #
     def visited_hosts
-      visited_urls.map { |uri| uri.host }.uniq
+      visited_urls.map(&:host).uniq
     end
     #
@@ -448,9 +446,7 @@ module Spidr
     #   Specifies whether a URL was visited.
     #
     def visited?(url)
-      url = URI(url.to_s) unless url.kind_of?(URI)
-      return @history.include?(url)
+      @history.include?(URI(url))
     end
     #
@@ -469,9 +465,7 @@ module Spidr
       @failures.clear
       new_failures.each do |url|
-        url = URI(url.to_s) unless url.kind_of?(URI)
-        @failures << url
+        @failures << URI(url)
       end
       return @failures
@@ -487,9 +481,7 @@ module Spidr
     #   Specifies whether the given URL was unable to be visited.
     #
     def failed?(url)
-      url = URI(url.to_s) unless url.kind_of?(URI)
-      return @failures.include?(url)
+      @failures.include?(URI(url))
     end
     alias pending_urls queue
@@ -510,9 +502,7 @@ module Spidr
       @queue.clear
       new_queue.each do |url|
-        url = URI(url.to_s) unless url.kind_of?(URI)
-        @queue << url
+        @queue << URI(url)
       end
       return @queue
@@ -594,7 +584,7 @@ module Spidr
     #   The page for the response, or `nil` if the request failed.
     #
     def get_page(url)
-      url = URI(url.to_s)
+      url = URI(url)
       prepare_request(url) do |session,path,headers|
         new_page = Page.new(url,session.get(path,headers))
@@ -629,7 +619,7 @@ module Spidr
     # @since 0.2.2
     #
     def post_page(url,post_data='')
-      url = URI(url.to_s) unless url.kind_of?(URI)
+      url = URI(url)
       prepare_request(url) do |session,path,headers|
         new_page = Page.new(url,session.post(path,post_data,headers))
@@ -725,7 +715,7 @@ module Spidr
       unless @host_headers.empty?
         @host_headers.each do |name,header|
-          if host.match(name)
+          if url.host.match(name)
             headers['Host'] = header
             break
           end
@@ -769,8 +759,6 @@ module Spidr
     # @since 0.2.2
     #
     def prepare_request(url,&block)
-      host = url.host
-      port = url.port
       path = unless url.path.empty?
                url.path
              else

data/lib/spidr/agent/filters.rb CHANGED

@@ -16,7 +16,7 @@ module Spidr
     #   agent.schemes = ['http']
     #
     def schemes=(new_schemes)
-      @schemes = new_schemes.map { |scheme| scheme.to_s }
+      @schemes = new_schemes.map(&:to_s)
     end
     #
@@ -452,9 +452,9 @@ module Spidr
     #
     def visit_scheme?(scheme)
       if scheme
-        return @schemes.include?(scheme)
+        @schemes.include?(scheme)
       else
-        return true
+        true
       end
     end

data/lib/spidr/agent/sanitizers.rb CHANGED

@@ -21,7 +21,7 @@ module Spidr
     # @since 0.2.2
     #
     def sanitize_url(url)
-      url = URI(url.to_s) unless url.kind_of?(URI)
+      url = URI(url)
       url.fragment = nil if @strip_fragments
       url.query    = nil if @strip_query

data/lib/spidr/auth_store.rb CHANGED

@@ -34,7 +34,7 @@ module Spidr
     #
     def [](url)
       # normalize the url
-      url = URI(url.to_s) unless url.kind_of?(URI)
+      url = URI(url)
       key = [url.scheme, url.host, url.port]
       paths = @credentials[key]
@@ -42,7 +42,7 @@ module Spidr
       return nil unless paths
       # longest path first
-      ordered_paths = paths.keys.sort_by { |key| key.length }.reverse
+      ordered_paths = paths.keys.sort_by { |path_key| -path_key.length }
       # directories of the path
       path_dirs = URI.expand_path(url.path).split('/')
@@ -70,7 +70,7 @@ module Spidr
     #
     def []=(url,auth)
       # normalize the url
-      url = URI(url.to_s) unless url.kind_of?(URI)
+      url = URI(url)
       # normalize the URL path
       path = URI.expand_path(url.path)
@@ -118,7 +118,7 @@ module Spidr
     #
     def for_url(url)
       if (auth = self[url])
-        return Base64.encode64("#{auth.username}:#{auth.password}")
+        Base64.encode64("#{auth.username}:#{auth.password}")
       end
     end
@@ -144,7 +144,11 @@ module Spidr
     # @since 0.2.2
     #
     def size
-      @credentials.inject(0) { |res, arr| res + arr[1].length }
+      total = 0
+      @credentials.each_value { |paths| total += paths.length }
+      return total
     end
     #

data/lib/spidr/page/html.rb CHANGED

@@ -271,7 +271,7 @@ module Spidr
                   return
                 end
-      if (path = new_url.path)
+      if (!new_url.opaque) && (path = new_url.path)
         # ensure that paths begin with a leading '/' for URI::FTP
         if (new_url.scheme == 'ftp' && !path.start_with?('/'))
           path.insert(0,'/')

data/lib/spidr/session_cache.rb CHANGED

@@ -65,7 +65,7 @@ module Spidr
     #
     def active?(url)
       # normalize the url
-      url = URI(url.to_s) unless url.kind_of?(URI)
+      url = URI(url)
       # session key
       key = key_for(url)
@@ -84,7 +84,7 @@ module Spidr
     #
     def [](url)
       # normalize the url
-      url = URI(url.to_s) unless url.kind_of?(URI)
+      url = URI(url)
       # session key
       key = key_for(url)
@@ -127,7 +127,7 @@ module Spidr
     #
     def kill!(url)
       # normalize the url
-      url = URI(url.to_s) unless url.kind_of?(URI)
+      url = URI(url)
       # session key
       key = key_for(url)

data/lib/spidr/spidr.rb CHANGED

@@ -16,6 +16,7 @@ module Spidr
   # @since 0.5.0
   #
   def self.robots?
+    @robots ||= false
     @robots
   end

data/lib/spidr/version.rb CHANGED

@@ -1,4 +1,4 @@
 module Spidr
   # Spidr version
-  VERSION = '0.6.0'
+  VERSION = '0.6.1'
 end

data/spec/agent_spec.rb CHANGED

@@ -786,14 +786,12 @@ describe Agent do
         [
           "User-agent: *",
-          'Disallow: /',
+          'Disallow: /secret',
         ].join($/)
       end
     end
     it "should not follow links Disallowed by robots.txt" do
-      pending "https://github.com/bblimke/webmock/issues/642"
       expect(subject.history).to be == Set[
         URI("http://#{host}/"),
         URI("http://#{host}/pub")

metadata CHANGED

@@ -1,43 +1,43 @@
 --- !ruby/object:Gem::Specification
 name: spidr
 version: !ruby/object:Gem::Version
-  version: 0.6.0
+  version: 0.6.1
 platform: ruby
 authors:
 - Postmodern
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-08-05 00:00:00.000000000 Z
+date: 2019-10-25 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: '1.3'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: '1.3'
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: '1.0'
+        version: '2.0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: '1.0'
+        version: '2.0'
 description: Spidr is a versatile Ruby web spidering library that can spider a site,
   multiple domains, certain links or infinitely. Spidr is designed to be fast and
   easy to use.
@@ -49,10 +49,10 @@ extra_rdoc_files:
 - LICENSE.txt
 - README.md
 files:
-- .gitignore
-- .rspec
-- .travis.yml
-- .yardopts
+- ".gitignore"
+- ".rspec"
+- ".travis.yml"
+- ".yardopts"
 - ChangeLog.md
 - Gemfile
 - LICENSE.txt
@@ -118,17 +118,16 @@ require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
-  - - '>='
+  - - ">="
     - !ruby/object:Gem::Version
       version: 2.0.0
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
-  - - '>='
+  - - ">="
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubyforge_project:
-rubygems_version: 2.0.14.1
+rubygems_version: 3.0.3
 signing_key:
 specification_version: 4
 summary: A versatile Ruby web spidering library