RubyGems - spidr - Versions diffs - 0.4.1 → 0.5.0 - Mend

spidr 0.4.1 → 0.5.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (46)

checksums.yaml +7 -0
data/ChangeLog.md +69 -54
data/Gemfile +9 -5
data/LICENSE.txt +1 -1
data/README.md +34 -26
data/Rakefile +4 -15
data/gemspec.yml +3 -2
data/lib/spidr/agent.rb +101 -44
data/lib/spidr/{actions → agent}/actions.rb +32 -12
data/lib/spidr/{events.rb → agent/events.rb} +4 -8
data/lib/spidr/{filters.rb → agent/filters.rb} +14 -16
data/lib/spidr/{sanitizers.rb → agent/sanitizers.rb} +5 -7
data/lib/spidr/auth_store.rb +2 -2
data/lib/spidr/cookie_jar.rb +2 -2
data/lib/spidr/extensions/uri.rb +28 -16
data/lib/spidr/page.rb +7 -11
data/lib/spidr/{body.rb → page/body.rb} +1 -1
data/lib/spidr/{headers.rb → page/headers.rb} +1 -1
data/lib/spidr/{links.rb → page/links.rb} +43 -7
data/lib/spidr/session_cache.rb +2 -2
data/lib/spidr/spidr.rb +32 -5
data/lib/spidr/version.rb +1 -1
data/spec/agent/actions_spec.rb +60 -0
data/spec/agent/filters_spec.rb +62 -0
data/spec/agent/sanitizers_spec.rb +62 -0
data/spec/agent_spec.rb +13 -13
data/spec/auth_store_spec.rb +17 -17
data/spec/cookie_jar_spec.rb +26 -26
data/spec/extensions/uri_spec.rb +19 -9
data/spec/helpers/history.rb +5 -5
data/spec/helpers/wsoc.rb +2 -2
data/spec/page_examples.rb +4 -4
data/spec/page_spec.rb +28 -25
data/spec/rules_spec.rb +14 -14
data/spec/session_cache.rb +7 -7
data/spec/spidr_spec.rb +10 -10
metadata +37 -51
data/lib/spidr/actions.rb +0 -2
data/lib/spidr/actions/exceptions.rb +0 -4
data/lib/spidr/actions/exceptions/action.rb +0 -9
data/lib/spidr/actions/exceptions/paused.rb +0 -11
data/lib/spidr/actions/exceptions/skip_link.rb +0 -12
data/lib/spidr/actions/exceptions/skip_page.rb +0 -12
data/spec/actions_spec.rb +0 -59
data/spec/filters_spec.rb +0 -61
data/spec/sanitizers_spec.rb +0 -61

data/spec/extensions/uri_spec.rb CHANGED

@@ -5,39 +5,49 @@ require 'spec_helper'
 describe URI do
   describe "expand_path" do
     it "should preserve single directory paths" do
-      URI.expand_path('path').should == 'path'
+      expect(URI.expand_path('path')).to eq('path')
     end
     it "should preserve trailing '/'" do
-      URI.expand_path('test/path/').should == 'test/path/'
+      expect(URI.expand_path('test/path/')).to eq('test/path/')
     end
     it "should remove multiple '/' characters" do
-      URI.expand_path('///test///path///').should == '/test/path/'
+      expect(URI.expand_path('///test///path///')).to eq('/test/path/')
     end
     it "should remove '.' directories from the path" do
-      URI.expand_path('test/./path').should == 'test/path'
+      expect(URI.expand_path('test/./path')).to eq('test/path')
     end
     it "should handle '..' directories properly" do
-      URI.expand_path('test/../path').should == 'path'
+      expect(URI.expand_path('test/../path')).to eq('path')
     end
     it "should limit the number of '..' directories resolved" do
-      URI.expand_path('/test/../../../..').should == '/'
+      expect(URI.expand_path('/test/../../../..')).to eq('/')
+    end
+    it "should preserve leading '/'" do
+      expect(URI.expand_path('/../../../foo')).to eq('/foo')
     end
     it "should preserve absolute paths" do
-      URI.expand_path('/test/path').should == '/test/path'
+      expect(URI.expand_path('/test/path')).to eq('/test/path')
     end
     it "should preserve the root path" do
-      URI.expand_path('/').should == '/'
+      expect(URI.expand_path('/')).to eq('/')
     end
     it "should default empty paths to the root path" do
-      URI.expand_path('').should == '/'
+      expect(URI.expand_path('')).to eq('/')
+    end
+    it "should default zero-sum paths to a '/'" do
+      expect(URI.expand_path('foo/..')).to eq('/')
+      expect(URI.expand_path('foo/../bar/..')).to eq('/')
+      expect(URI.expand_path('././././.')).to eq('/')
     end
   end
 end

data/spec/helpers/history.rb CHANGED

@@ -15,20 +15,20 @@ module Helpers
     end
     def should_visit_link(url)
-      visited_link?(url).should == true
+      expect(visited_link?(url)).to eq(true)
     end
     def should_ignore_link(url)
-      visited_link?(url).should == false
+      expect(visited_link?(url)).to eq(false)
     end
     def should_visit_once(url)
-      visited_once?(url).should == true
+      expect(visited_once?(url)).to eq(true)
     end
     def should_fail_link(url)
-      visited_link?(url).should == false
-      visit_failed?(url).should == true
+      expect(visited_link?(url)).to eq(false)
+      expect(visit_failed?(url)).to eq(true)
     end
   end
 end

data/spec/helpers/wsoc.rb CHANGED

@@ -9,8 +9,8 @@ module Helpers
     include History
     SERVER_URL = URI::HTTP.build(
-      :host => (ENV['HOST'] || ::WSOC::Config::DEFAULT_HOST),
-      :port => (ENV['PORT'] || ::WSOC::Config::DEFAULT_PORT)
+      host: (ENV['HOST'] || ::WSOC::Config::DEFAULT_HOST),
+      port: (ENV['PORT'] || ::WSOC::Config::DEFAULT_PORT)
     )
     SPECS_URL = SERVER_URL.merge(::WSOC::Config::SPECS_PATHS[:json])

data/spec/page_examples.rb CHANGED

@@ -4,18 +4,18 @@ require 'spec_helper'
 shared_examples_for "Page" do
   it "should have a status code" do
-    @page.code.should be_integer
+    expect(@page.code).to be_integer
   end
   it "should have a body" do
-    @page.body.should_not be_empty
+    expect(@page.body).not_to be_empty
   end
   it "should provide transparent access to the response headers" do
-    @page.content_type.should == @page.response['Content-Type']
+    expect(@page.content_type).to eq(@page.response['Content-Type'])
   end
   it "should allow content-types" do
-    @page.content_types.should_not be_empty
+    expect(@page.content_types).not_to be_empty
   end
 end

data/spec/page_spec.rb CHANGED

@@ -13,84 +13,87 @@ describe Page do
     it_should_behave_like "Page"
     it "should be OK" do
-      @page.should be_ok
+      expect(@page).to be_ok
     end
     it "should have a content-type" do
-      @page.content_type.should include('text/html')
+      expect(@page.content_type).to include('text/html')
     end
     it "should be a html page" do
-      @page.should be_html
+      expect(@page).to be_html
     end
     it "should have provide a document" do
-      @page.doc.class.should == Nokogiri::HTML::Document
+      expect(@page.doc.class).to eq(Nokogiri::HTML::Document)
     end
     it "should allow searching the document" do
-      @page.doc.search('//p').length.should == 2
-      @page.doc.at('//p[2]').inner_text.should == 'Ready! Set! Go!'
+      expect(@page.doc.search('//p').length).to eq(2)
+      expect(@page.doc.at('//p[2]').inner_text).to eq('Ready! Set! Go!')
     end
     it "should have a title" do
-      @page.title.should == 'Spidr :: Web-Spider Obstacle Course :: Start'
+      expect(@page.title).to eq('Spidr :: Web-Spider Obstacle Course :: Start')
     end
     it "should have links" do
-      @page.links.should_not be_empty
+      expect(@page.links).not_to be_empty
     end
   end
   describe "txt" do
     before(:all) do
-      @page = get_page('http://www.ruby-lang.org/en/LICENSE.txt')
+      @page = get_page('https://www.ruby-lang.org/en/about/license.txt')
     end
     it_should_behave_like "Page"
     it "should be OK" do
-      @page.should be_ok
+      expect(@page).to be_ok
     end
     it "should have a content-type" do
-      @page.content_type.should include('text/plain')
+      expect(@page.content_type).to include('text/plain')
     end
     it "should be a txt page" do
-      @page.should be_txt
+      expect(@page).to be_txt
     end
     it "should not have provide a document" do
-      @page.doc.should be_nil
+      expect(@page.doc).to be_nil
     end
     it "should not allow searching the document" do
-      @page.search('//p').should be_empty
-      @page.at('//p').should be_nil
+      expect(@page.search('//p')).to be_empty
+      expect(@page.at('//p')).to be_nil
     end
     it "should not have links" do
-      @page.links.should be_empty
+      expect(@page.links).to be_empty
     end
     it "should not have a title" do
-      @page.title.should be_nil
+      expect(@page.title).to be_nil
     end
   end
   describe "redirects" do
     before(:all) do
       @page = get_page('http://spidr.rubyforge.org/course/start.html')
-      @page.stub!(:body).and_return('<meta HTTP-EQUIV="REFRESH" content="0; url=http://spidr.rubyforge.org/redirected">')
+    end
+    before do
+      allow(@page).to receive(:body).and_return('<meta HTTP-EQUIV="REFRESH" content="0; url=http://spidr.rubyforge.org/redirected">')
     end
     it "should provide access to page-level redirects" do
-      @page.redirects_to.should == ['http://spidr.rubyforge.org/redirected']
+      expect(@page.redirects_to).to eq(['http://spidr.rubyforge.org/redirected'])
     end
     it "should include meta refresh redirects in the list of links" do
-      @page.links.should include('http://spidr.rubyforge.org/redirected')
+      expect(@page.links).to include('http://spidr.rubyforge.org/redirected')
     end
   end
@@ -102,23 +105,23 @@ describe Page do
     it "should provide access to the raw Cookie" do
       cookie = @page.cookie
-      cookie.should_not be_nil
-      cookie.should_not be_empty
+      expect(cookie).not_to be_nil
+      expect(cookie).not_to be_empty
     end
     it "should provide access to the Cookies" do
       cookies = @page.cookies
-      cookies.should_not be_empty
+      expect(cookies).not_to be_empty
     end
     it "should provide access to the key->value pairs within the Cookie" do
       params = @page.cookie_params
-      params.should_not be_empty
+      expect(params).not_to be_empty
       params.each do |key,value|
-        key.should_not be_empty
+        expect(key).not_to be_empty
       end
     end
   end

data/spec/rules_spec.rb CHANGED

@@ -6,40 +6,40 @@ describe Rules do
   subject { Rules }
   it "should accept data based on acceptance data" do
-    rules = subject.new(:accept => [1])
+    rules = subject.new(accept: [1])
-    rules.accept?(1).should == true
+    expect(rules.accept?(1)).to eq(true)
   end
   it "should accept data based on acceptance regexps" do
-    rules = subject.new(:accept => [/1/])
+    rules = subject.new(accept: [/1/])
-    rules.accept?('1').should == true
+    expect(rules.accept?('1')).to eq(true)
   end
   it "should match non-Strings using acceptance regexps" do
-    rules = subject.new(:accept => [/1/])
+    rules = subject.new(accept: [/1/])
-    rules.accept?(1).should == true
+    expect(rules.accept?(1)).to eq(true)
   end
   it "should accept data using acceptance lambdas" do
-    rules = subject.new(:accept => [lambda { |data| data > 2 }])
+    rules = subject.new(accept: [lambda { |data| data > 2 }])
-    rules.accept?(3).should == true
+    expect(rules.accept?(3)).to eq(true)
   end
   it "should reject data that does not match any acceptance patterns" do
-    rules = subject.new(:accept => [1, 2, 3])
+    rules = subject.new(accept: [1, 2, 3])
-    rules.accept?(2).should == true
-    rules.accept?(4).should == false
+    expect(rules.accept?(2)).to eq(true)
+    expect(rules.accept?(4)).to eq(false)
   end
   it "should accept data that does not match any rejection patterns" do
-    rules = subject.new(:reject => [1, 2, 3])
+    rules = subject.new(reject: [1, 2, 3])
-    rules.accept?(2).should == false
-    rules.accept?(4).should == true
+    expect(rules.accept?(2)).to eq(false)
+    expect(rules.accept?(4)).to eq(true)
   end
 end

data/spec/session_cache.rb CHANGED

@@ -9,11 +9,11 @@ describe SessionCache do
     end
     it "should not have any active sessions" do
-      @sessions.should_not be_active(URI('http://example.com/'))
+      expect(@sessions).not_to be_active(URI('http://example.com/'))
     end
     it "should start new sessions on-demand" do
-      @sessions[URI('http://example.com/')].should_not be_nil
+      expect(@sessions[URI('http://example.com/')]).not_to be_nil
     end
     after(:all) do
@@ -30,25 +30,25 @@ describe SessionCache do
     end
     it "should have active sessions" do
-      @sessions.should be_active(@url)
+      expect(@sessions).to be_active(@url)
     end
     it "should provide access to sessions" do
-      @sessions[@url].should_not be_nil
+      expect(@sessions[@url]).not_to be_nil
     end
     it "should start new sessions on-demand" do
       url2 = URI('http://www.w3c.org/')
-      @sessions[url2].should_not be_nil
+      expect(@sessions[url2]).not_to be_nil
     end
     it "should be able to kill sessions" do
       url2 = URI('http://www.w3c.org/')
-      @sessions[url2].should_not be_nil
+      expect(@sessions[url2]).not_to be_nil
       @sessions.kill!(url2)
-      @sessions.should_not be_active(url2)
+      expect(@sessions).not_to be_active(url2)
     end
     after(:all) do

data/spec/spidr_spec.rb CHANGED

@@ -4,36 +4,36 @@ require 'spec_helper'
 describe Spidr do
   it "should have a VERSION constant" do
-    subject.const_defined?('VERSION').should == true
+    expect(subject.const_defined?('VERSION')).to eq(true)
   end
   describe "proxy" do
     after(:all) do
-      subject.disable_proxy!
+      Spidr.disable_proxy!
     end
     it "should not have proxy settings by default" do
-      subject.proxy[:host].should be_nil
+      expect(subject.proxy[:host]).to be_nil
     end
     it "should allow setting new proxy settings" do
-      subject.proxy = {:host => 'example.com', :port => 8010}
+      subject.proxy = {host: 'example.com', port: 8010}
-      subject.proxy[:host].should == 'example.com'
-      subject.proxy[:port].should == 8010
+      expect(subject.proxy[:host]).to eq('example.com')
+      expect(subject.proxy[:port]).to eq(8010)
     end
     it "should default the :port option of new proxy settings" do
-      subject.proxy = {:host => 'example.com'}
+      subject.proxy = {host: 'example.com'}
-      subject.proxy[:host].should == 'example.com'
-      subject.proxy[:port].should == Spidr::COMMON_PROXY_PORT
+      expect(subject.proxy[:host]).to eq('example.com')
+      expect(subject.proxy[:port]).to eq(Spidr::COMMON_PROXY_PORT)
     end
     it "should allow disabling the proxy" do
       subject.disable_proxy!
-      subject.proxy[:host].should be_nil
+      expect(subject.proxy[:host]).to be_nil
     end
   end
 end

metadata CHANGED

@@ -1,49 +1,43 @@
 --- !ruby/object:Gem::Specification
 name: spidr
 version: !ruby/object:Gem::Version
-  version: 0.4.1
-  prerelease:
+  version: 0.5.0
 platform: ruby
 authors:
 - Postmodern
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2011-12-09 00:00:00.000000000 Z
+date: 2016-01-04 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
-  requirement: &19474920 !ruby/object:Gem::Requirement
-    none: false
+  requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: '1.3'
   type: :runtime
   prerelease: false
-  version_requirements: *19474920
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.3'
 - !ruby/object:Gem::Dependency
   name: bundler
-  requirement: &19474320 !ruby/object:Gem::Requirement
-    none: false
+  requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: '1.0'
   type: :development
   prerelease: false
-  version_requirements: *19474320
-- !ruby/object:Gem::Dependency
-  name: yard
-  requirement: &19473820 !ruby/object:Gem::Requirement
-    none: false
+  version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: '0.7'
-  type: :development
-  prerelease: false
-  version_requirements: *19473820
+        version: '1.0'
 description: Spidr is a versatile Ruby web spidering library that can spider a site,
   multiple domains, certain links or infinitely. Spidr is designed to be fast and
   easy to use.
@@ -55,9 +49,9 @@ extra_rdoc_files:
 - LICENSE.txt
 - README.md
 files:
-- .gitignore
-- .rspec
-- .yardopts
+- ".gitignore"
+- ".rspec"
+- ".yardopts"
 - ChangeLog.md
 - Gemfile
 - LICENSE.txt
@@ -65,81 +59,73 @@ files:
 - Rakefile
 - gemspec.yml
 - lib/spidr.rb
-- lib/spidr/actions.rb
-- lib/spidr/actions/actions.rb
-- lib/spidr/actions/exceptions.rb
-- lib/spidr/actions/exceptions/action.rb
-- lib/spidr/actions/exceptions/paused.rb
-- lib/spidr/actions/exceptions/skip_link.rb
-- lib/spidr/actions/exceptions/skip_page.rb
 - lib/spidr/agent.rb
+- lib/spidr/agent/actions.rb
+- lib/spidr/agent/events.rb
+- lib/spidr/agent/filters.rb
+- lib/spidr/agent/sanitizers.rb
 - lib/spidr/auth_credential.rb
 - lib/spidr/auth_store.rb
-- lib/spidr/body.rb
 - lib/spidr/cookie_jar.rb
-- lib/spidr/events.rb
 - lib/spidr/extensions.rb
 - lib/spidr/extensions/uri.rb
-- lib/spidr/filters.rb
-- lib/spidr/headers.rb
-- lib/spidr/links.rb
 - lib/spidr/page.rb
+- lib/spidr/page/body.rb
+- lib/spidr/page/headers.rb
+- lib/spidr/page/links.rb
 - lib/spidr/rules.rb
-- lib/spidr/sanitizers.rb
 - lib/spidr/session_cache.rb
 - lib/spidr/spidr.rb
 - lib/spidr/version.rb
-- spec/actions_spec.rb
+- spec/agent/actions_spec.rb
+- spec/agent/filters_spec.rb
+- spec/agent/sanitizers_spec.rb
 - spec/agent_spec.rb
 - spec/auth_store_spec.rb
 - spec/cookie_jar_spec.rb
 - spec/extensions/uri_spec.rb
-- spec/filters_spec.rb
 - spec/helpers/history.rb
 - spec/helpers/page.rb
 - spec/helpers/wsoc.rb
 - spec/page_examples.rb
 - spec/page_spec.rb
 - spec/rules_spec.rb
-- spec/sanitizers_spec.rb
 - spec/session_cache.rb
 - spec/spec_helper.rb
 - spec/spidr_spec.rb
 - spidr.gemspec
-homepage: http://github.com/postmodern/spidr
+homepage: https://github.com/postmodern/spidr#readme
 licenses:
 - MIT
+metadata: {}
 post_install_message:
 rdoc_options: []
 require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
-  none: false
   requirements:
-  - - ! '>='
+  - - ">="
     - !ruby/object:Gem::Version
-      version: '0'
+      version: 1.9.1
 required_rubygems_version: !ruby/object:Gem::Requirement
-  none: false
   requirements:
-  - - ! '>='
+  - - ">="
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 1.8.10
+rubygems_version: 2.4.7
 signing_key:
-specification_version: 3
+specification_version: 4
 summary: A versatile Ruby web spidering library
 test_files:
-- spec/actions_spec.rb
+- spec/agent/actions_spec.rb
+- spec/agent/filters_spec.rb
+- spec/agent/sanitizers_spec.rb
 - spec/agent_spec.rb
 - spec/auth_store_spec.rb
 - spec/cookie_jar_spec.rb
 - spec/extensions/uri_spec.rb
-- spec/filters_spec.rb
 - spec/page_spec.rb
 - spec/rules_spec.rb
-- spec/sanitizers_spec.rb
 - spec/spidr_spec.rb
-has_rdoc: