spidr 0.4.1 → 0.5.0

This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries, and is provided for informational purposes only.
Files changed (46)
  1. checksums.yaml +7 -0
  2. data/ChangeLog.md +69 -54
  3. data/Gemfile +9 -5
  4. data/LICENSE.txt +1 -1
  5. data/README.md +34 -26
  6. data/Rakefile +4 -15
  7. data/gemspec.yml +3 -2
  8. data/lib/spidr/agent.rb +101 -44
  9. data/lib/spidr/{actions → agent}/actions.rb +32 -12
  10. data/lib/spidr/{events.rb → agent/events.rb} +4 -8
  11. data/lib/spidr/{filters.rb → agent/filters.rb} +14 -16
  12. data/lib/spidr/{sanitizers.rb → agent/sanitizers.rb} +5 -7
  13. data/lib/spidr/auth_store.rb +2 -2
  14. data/lib/spidr/cookie_jar.rb +2 -2
  15. data/lib/spidr/extensions/uri.rb +28 -16
  16. data/lib/spidr/page.rb +7 -11
  17. data/lib/spidr/{body.rb → page/body.rb} +1 -1
  18. data/lib/spidr/{headers.rb → page/headers.rb} +1 -1
  19. data/lib/spidr/{links.rb → page/links.rb} +43 -7
  20. data/lib/spidr/session_cache.rb +2 -2
  21. data/lib/spidr/spidr.rb +32 -5
  22. data/lib/spidr/version.rb +1 -1
  23. data/spec/agent/actions_spec.rb +60 -0
  24. data/spec/agent/filters_spec.rb +62 -0
  25. data/spec/agent/sanitizers_spec.rb +62 -0
  26. data/spec/agent_spec.rb +13 -13
  27. data/spec/auth_store_spec.rb +17 -17
  28. data/spec/cookie_jar_spec.rb +26 -26
  29. data/spec/extensions/uri_spec.rb +19 -9
  30. data/spec/helpers/history.rb +5 -5
  31. data/spec/helpers/wsoc.rb +2 -2
  32. data/spec/page_examples.rb +4 -4
  33. data/spec/page_spec.rb +28 -25
  34. data/spec/rules_spec.rb +14 -14
  35. data/spec/session_cache.rb +7 -7
  36. data/spec/spidr_spec.rb +10 -10
  37. metadata +37 -51
  38. data/lib/spidr/actions.rb +0 -2
  39. data/lib/spidr/actions/exceptions.rb +0 -4
  40. data/lib/spidr/actions/exceptions/action.rb +0 -9
  41. data/lib/spidr/actions/exceptions/paused.rb +0 -11
  42. data/lib/spidr/actions/exceptions/skip_link.rb +0 -12
  43. data/lib/spidr/actions/exceptions/skip_page.rb +0 -12
  44. data/spec/actions_spec.rb +0 -59
  45. data/spec/filters_spec.rb +0 -61
  46. data/spec/sanitizers_spec.rb +0 -61
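The theme of this release is reorganization: the Actions, Events, Filters, and Sanitizers mixins moved from top-level files into lib/spidr/agent/, the Body, Headers, and Links mixins moved into lib/spidr/page/, and the superseded files (items 38-46 above, including the separate Actions exception files) were deleted. The hunks below also show the two other notable changes: a new global robots.txt toggle in spidr.rb and a migration of the specs to RSpec 3's expect syntax. A hedged sketch of what the reorganization means for callers, assuming code previously required the mixin files directly:

    # Before (0.4.x), the Agent mixins could be required individually:
    # require 'spidr/actions'

    # In 0.5.0 those files are gone; requiring the agent loads its mixins
    # from lib/spidr/agent/:
    require 'spidr/agent'

    agent = Spidr::Agent.new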
data/lib/spidr/spidr.rb
@@ -6,10 +6,10 @@ module Spidr
 
   # Default proxy information.
   DEFAULT_PROXY = {
-    :host => nil,
-    :port => COMMON_PROXY_PORT,
-    :user => nil,
-    :password => nil
+    host: nil,
+    port: COMMON_PROXY_PORT,
+    user: nil,
+    password: nil
   }
 
   #
@@ -44,7 +44,7 @@ module Spidr
   # The new proxy information.
   #
   def Spidr.proxy=(new_proxy)
-    @@spidr_proxy = {:port => COMMON_PROXY_PORT}.merge(new_proxy)
+    @@spidr_proxy = {port: COMMON_PROXY_PORT}.merge(new_proxy)
   end
 
   #
@@ -75,6 +75,30 @@ module Spidr
     @@spidr_user_agent = new_agent
   end
 
+  #
+  # Specifies whether `robots.txt` should be honored globally.
+  #
+  # @return [Boolean]
+  #
+  # @since 0.5.0
+  #
+  def Spidr.robots?
+    @robots
+  end
+
+  #
+  # Enables or disables `robots.txt` globally.
+  #
+  # @param [Boolean] mode
+  #
+  # @return [Boolean]
+  #
+  # @since 0.5.0
+  #
+  def Spidr.robots=(mode)
+    @robots = mode
+  end
+
   #
   # @see Agent.start_at
   #
@@ -95,4 +119,7 @@ module Spidr
   def Spidr.site(url,options={},&block)
     Agent.site(url,options,&block)
   end
+
+  def Spidr.robots
+  end
 end
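Beyond the hash-syntax modernization, the additions above introduce a global robots.txt switch. A minimal usage sketch (the proxy host and crawl URL are placeholders, and it assumes agents created after setting Spidr.robots = true consult the flag):

    require 'spidr'

    # Honor robots.txt for subsequently created agents (new in 0.5.0).
    Spidr.robots = true

    # Spidr.proxy= merges over the default port, so :port may be omitted;
    # note the Ruby 1.9 hash syntax now used throughout the gem.
    Spidr.proxy = {host: 'proxy.example.com', port: 8080}

    Spidr.site('http://example.com/') do |spider|
      spider.every_page do |page|
        puts page.url
      end
    end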
data/lib/spidr/version.rb
@@ -1,4 +1,4 @@
 module Spidr
   # Spidr version
-  VERSION = '0.4.1'
+  VERSION = '0.5.0'
 end
data/spec/agent/actions_spec.rb (new file)
@@ -0,0 +1,60 @@
+require 'spidr/agent'
+
+require 'spec_helper'
+
+describe Agent do
+  describe "actions" do
+    let(:url) { URI('http://spidr.rubyforge.org/') }
+
+    it "should be able to pause spidering" do
+      count = 0
+      agent = Agent.host('spidr.rubyforge.org') do |spider|
+        spider.every_page do |page|
+          count += 1
+          spider.pause! if count >= 2
+        end
+      end
+
+      expect(agent).to be_paused
+      expect(agent.history.length).to eq(2)
+    end
+
+    it "should be able to continue spidering after being paused" do
+      agent = Agent.new do |spider|
+        spider.every_page do |page|
+          spider.pause!
+        end
+      end
+
+      agent.enqueue(url)
+      agent.continue!
+
+      expect(agent.visited?(url)).to eq(true)
+    end
+
+    it "should allow skipping of enqueued links" do
+      agent = Agent.new do |spider|
+        spider.every_url do |url|
+          spider.skip_link!
+        end
+      end
+
+      agent.enqueue(url)
+
+      expect(agent.queue).to be_empty
+    end
+
+    it "should allow skipping of visited pages" do
+      agent = Agent.new do |spider|
+        spider.every_page do |url|
+          spider.skip_page!
+        end
+      end
+
+      agent.visit_page(url)
+
+      expect(agent.history).to eq(Set[url])
+      expect(agent.queue).to be_empty
+    end
+  end
+end
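These examples, moved here from spec/actions_spec.rb, document the four control-flow actions: pause!, continue!, skip_link!, and skip_page!. A sketch of how a crawl might use them outside the test suite (the host and the ten-page cutoff are arbitrary):

    require 'spidr/agent'

    # Stop fetching after ten pages; the queue and history stay intact.
    agent = Spidr::Agent.host('spidr.rubyforge.org') do |spider|
      spider.every_page do |page|
        spider.pause! if spider.history.length >= 10
      end
    end

    # Later, resume exactly where the crawl left off.
    agent.continue! if agent.paused?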
data/spec/agent/filters_spec.rb (new file)
@@ -0,0 +1,62 @@
+require 'spidr/agent'
+
+require 'spec_helper'
+
+describe Agent do
+  describe "filters" do
+    it "should allow setting the acceptable schemes" do
+      agent = Agent.new
+
+      agent.schemes = [:http]
+      expect(agent.schemes).to eq(['http'])
+    end
+
+    it "should provide the hosts that will be visited" do
+      agent = Agent.new(hosts: ['spidr.rubyforge.org'])
+
+      expect(agent.visit_hosts).to eq(['spidr.rubyforge.org'])
+    end
+
+    it "should provide the hosts that will not be visited" do
+      agent = Agent.new(ignore_hosts: ['example.com'])
+
+      expect(agent.ignore_hosts).to eq(['example.com'])
+    end
+
+    it "should provide the ports that will be visited" do
+      agent = Agent.new(ports: [80, 443, 8000])
+
+      expect(agent.visit_ports).to eq([80, 443, 8000])
+    end
+
+    it "should provide the ports that will not be visited" do
+      agent = Agent.new(ignore_ports: [8000, 8080])
+
+      expect(agent.ignore_ports).to eq([8000, 8080])
+    end
+
+    it "should provide the links that will be visited" do
+      agent = Agent.new(links: ['index.php'])
+
+      expect(agent.visit_links).to eq(['index.php'])
+    end
+
+    it "should provide the links that will not be visited" do
+      agent = Agent.new(ignore_links: [/login/])
+
+      expect(agent.ignore_links).to eq([/login/])
+    end
+
+    it "should provide the exts that will be visited" do
+      agent = Agent.new(exts: ['htm'])
+
+      expect(agent.visit_exts).to eq(['htm'])
+    end
+
+    it "should provide the exts that will not be visited" do
+      agent = Agent.new(ignore_exts: ['cfm'])
+
+      expect(agent.ignore_exts).to eq(['cfm'])
+    end
+  end
+end
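The filter options exercised above can be combined freely when constructing an agent. A minimal sketch (the host, pattern, and extension values are illustrative only):

    require 'spidr/agent'

    agent = Spidr::Agent.new(
      hosts:        ['spidr.rubyforge.org'], # only crawl this host
      ignore_links: [/login/],               # skip links matching /login/
      ignore_exts:  ['cfm'],                 # skip ColdFusion files
      ports:        [80, 443]                # standard HTTP(S) ports only
    )
    agent.start_at('http://spidr.rubyforge.org/')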
data/spec/agent/sanitizers_spec.rb (new file)
@@ -0,0 +1,62 @@
+require 'spidr/agent'
+
+require 'spec_helper'
+
+describe Agent do
+  describe "sanitizers" do
+    describe "sanitize_url" do
+      let(:url) { 'http://host.com' }
+      before(:all) { @agent = Agent.new }
+
+      it "should sanitize URLs" do
+        agent = Agent.new
+        clean_url = agent.sanitize_url(URI(url))
+
+        expect(clean_url.host).to eq('host.com')
+      end
+
+      it "should sanitize URLs given as Strings" do
+        agent = Agent.new
+        clean_url = agent.sanitize_url(url)
+
+        expect(clean_url.host).to eq('host.com')
+      end
+    end
+
+    describe "strip_fragments" do
+      let(:url) { URI("http://host.com/page#lol") }
+
+      it "should strip fragment components by default" do
+        agent = Agent.new
+        clean_url = agent.sanitize_url(url)
+
+        expect(clean_url.fragment).to be_nil
+      end
+
+      it "should allow perserving fragment components" do
+        agent = Agent.new(strip_fragments: false)
+        clean_url = agent.sanitize_url(url)
+
+        expect(clean_url.fragment).to eq('lol')
+      end
+    end
+
+    describe "strip_query" do
+      let(:url) { URI("http://host.com/page?x=1") }
+
+      it "should not strip query components by default" do
+        agent = Agent.new
+        clean_url = agent.sanitize_url(url)
+
+        expect(clean_url.query).to eq('x=1')
+      end
+
+      it "should allow stripping of query components" do
+        agent = Agent.new(strip_query: true)
+        clean_url = agent.sanitize_url(url)
+
+        expect(clean_url.query).to be_nil
+      end
+    end
+  end
+end
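Sanitization is applied to every URL before it is visited, but sanitize_url can also be called directly, as these specs do. A short sketch of both options together (the URL is a placeholder; the comment shows the outcome implied by the specs above):

    require 'spidr/agent'

    # strip_fragments already defaults to true; strip_query does not.
    agent = Spidr::Agent.new(strip_query: true)

    agent.sanitize_url(URI('http://host.com/page?x=1#top'))
    # => URI for http://host.com/page (fragment and query both removed)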
data/spec/agent_spec.rb
@@ -11,11 +11,11 @@ describe Agent do
   end
 
   it "should provide the history" do
-    @agent.history.should_not be_empty
+    expect(@agent.history).not_to be_empty
   end
 
   it "should provide the queue" do
-    @agent.queue.should be_empty
+    expect(@agent.queue).to be_empty
   end
 
   it "should be able to restore the history" do
@@ -23,7 +23,7 @@ describe Agent do
     previous_history = Set[URI('http://www.example.com')]
 
     agent.history = previous_history
-    agent.history.should == previous_history
+    expect(agent.history).to eq(previous_history)
   end
 
   it "should convert new histories to an Set of URIs" do
@@ -32,8 +32,8 @@ describe Agent do
     expected_history = Set[URI('http://www.example.com')]
 
     agent.history = previous_history
-    agent.history.should_not == previous_history
-    agent.history.should == expected_history
+    expect(agent.history).not_to eq(previous_history)
+    expect(agent.history).to eq(expected_history)
   end
 
   it "should be able to restore the failures" do
@@ -41,7 +41,7 @@ describe Agent do
     previous_failures = Set[URI('http://localhost/')]
 
     agent.failures = previous_failures
-    agent.failures.should == previous_failures
+    expect(agent.failures).to eq(previous_failures)
   end
 
   it "should convert new histories to a Set of URIs" do
@@ -50,8 +50,8 @@ describe Agent do
     expected_failures = Set[URI('http://localhost/')]
 
     agent.failures = previous_failures
-    agent.failures.should_not == previous_failures
-    agent.failures.should == expected_failures
+    expect(agent.failures).not_to eq(previous_failures)
+    expect(agent.failures).to eq(expected_failures)
   end
 
   it "should be able to restore the queue" do
@@ -59,7 +59,7 @@ describe Agent do
     previous_queue = [URI('http://www.example.com')]
 
     agent.queue = previous_queue
-    agent.queue.should == previous_queue
+    expect(agent.queue).to eq(previous_queue)
   end
 
   it "should convert new queues to an Array of URIs" do
@@ -68,14 +68,14 @@ describe Agent do
     expected_queue = [URI('http://www.example.com')]
 
     agent.queue = previous_queue
-    agent.queue.should_not == previous_queue
-    agent.queue.should == expected_queue
+    expect(agent.queue).not_to eq(previous_queue)
+    expect(agent.queue).to eq(expected_queue)
   end
 
   it "should provide a to_hash method that returns the queue and history" do
     hash = @agent.to_hash
 
-    hash[:queue].should be_empty
-    hash[:history].should_not be_empty
+    expect(hash[:queue]).to be_empty
+    expect(hash[:history]).not_to be_empty
   end
 end
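The remaining spec changes in this release are a mechanical migration from RSpec 2's deprecated should syntax to the RSpec 3 expect syntax; tools such as transpec automate exactly this rewrite. A self-contained illustration of the pattern, independent of spidr:

    require 'rspec'

    RSpec.describe Array do
      it "uses expect in place of should" do
        list = []

        # Formerly: list.should be_empty
        expect(list).to be_empty

        # Formerly: lambda { list.push(1) }.should change(list, :size)
        expect { list.push(1) }.to change(list, :size)
      end
    end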
data/spec/auth_store_spec.rb
@@ -17,25 +17,25 @@ describe AuthStore do
 
   it 'should retrieve auth credentials for the URL' do
     @auth_store[root_uri] = AuthCredential.new('user1', 'pass1')
-    @auth_store[root_uri].username.should == 'user1'
-    @auth_store[root_uri].password.should == 'pass1'
+    expect(@auth_store[root_uri].username).to eq('user1')
+    expect(@auth_store[root_uri].password).to eq('pass1')
   end
 
   it 'should add auth credentials for the URL' do
-    lambda {
+    expect {
      @auth_store.add(root_uri, 'user1', 'pass1')
-    }.should change(@auth_store, :size)
+    }.to change(@auth_store, :size)
 
-    @auth_store[root_uri].username.should == 'user1'
-    @auth_store[root_uri].password.should == 'pass1'
+    expect(@auth_store[root_uri].username).to eq('user1')
+    expect(@auth_store[root_uri].password).to eq('pass1')
   end
 
   describe 'matching' do
     let(:sub_uri) { uri.merge('/course/auth/protected.html') }
 
     it 'should match a longer URL to the base' do
-      @auth_store[sub_uri].username.should == 'admin'
-      @auth_store[sub_uri].password.should == 'password'
+      expect(@auth_store[sub_uri].username).to eq('admin')
+      expect(@auth_store[sub_uri].password).to eq('password')
     end
 
     it 'should match the longest of all matching URLs' do
@@ -44,42 +44,42 @@ describe AuthStore do
       @auth_store.add(uri.merge('/course/auth/special/extra'), 'user3', 'pass3')
 
       auth = @auth_store[uri.merge('/course/auth/special/1.html')]
-      auth.username.should == 'user2'
-      auth.password.should == 'pass2'
+      expect(auth.username).to eq('user2')
+      expect(auth.password).to eq('pass2')
     end
 
     it 'should not match a URL with a different host' do
       remote_uri = URI('http://spidr.rubyforge.org/course/auth')
 
-      @auth_store[remote_uri].should be_nil
+      expect(@auth_store[remote_uri]).to be_nil
     end
 
     it 'should not match a URL with an alternate path' do
       relative_uri = uri.merge('/course/admin/protected.html')
 
-      @auth_store[relative_uri].should be_nil
+      expect(@auth_store[relative_uri]).to be_nil
     end
   end
 
   it 'should override previous auth credentials' do
     @auth_store.add(uri, 'newuser', 'newpass')
 
-    @auth_store[uri].username.should == 'newuser'
-    @auth_store[uri].password.should == 'newpass'
+    expect(@auth_store[uri].username).to eq('newuser')
+    expect(@auth_store[uri].password).to eq('newpass')
   end
 
   it 'should clear all cookies' do
     @auth_store.clear!
-    @auth_store.size.should == 0
+    expect(@auth_store.size).to eq(0)
   end
 
   describe 'for_url' do
     it 'should return nil if no authorization exists' do
-      @auth_store.for_url(URI('http://php.net')).should be_nil
+      expect(@auth_store.for_url(URI('http://php.net'))).to be_nil
    end
 
    it 'should create an encoded authorization string' do
-      @auth_store.for_url(uri).should == "YWRtaW46cGFzc3dvcmQ=\n"
+      expect(@auth_store.for_url(uri)).to eq("YWRtaW46cGFzc3dvcmQ=\n")
    end
  end
 end
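For reference, the AuthStore behavior these specs pin down: credentials are stored per URL prefix, looked up by the longest matching path on the same host, and encoded on demand. A hedged sketch (the example.com URLs are placeholders):

    require 'spidr/auth_store'

    store = Spidr::AuthStore.new
    store.add(URI('http://example.com/course/auth'), 'admin', 'password')

    # Longest-prefix matching: deeper paths inherit the credentials;
    # other paths and other hosts get nil.
    store[URI('http://example.com/course/auth/protected.html')] # => AuthCredential
    store[URI('http://example.com/elsewhere')]                  # => nil

    # Base64-encoded "admin:password", ready for a Basic Authorization header.
    store.for_url(URI('http://example.com/course/auth'))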
data/spec/cookie_jar_spec.rb
@@ -6,39 +6,39 @@ describe CookieJar do
   it "should retrieve cookies for the named host" do
     subject['zerosum.org'] = {'admin' => 'ofcourseiam'}
 
-    subject['zerosum.org'].should == {'admin' => 'ofcourseiam'}
+    expect(subject['zerosum.org']).to eq({'admin' => 'ofcourseiam'})
   end
 
   it "should add a cookie to the jar" do
     subject['zerosum.org'] = {'admin' => 'ofcourseiam'}
 
-    subject['zerosum.org'].should == {'admin' => 'ofcourseiam'}
+    expect(subject['zerosum.org']).to eq({'admin' => 'ofcourseiam'})
   end
 
   it "should merge new cookies into the jar" do
     subject['zerosum.org'] = {'admin' => 'ofcourseiam'}
     subject['zerosum.org'] = {'other' => '1'}
 
-    subject['zerosum.org'].should == {
+    expect(subject['zerosum.org']).to eq({
       'admin' => 'ofcourseiam',
       'other' => '1'
-    }
+    })
   end
 
   it "should override previous cookies in the jar" do
     subject['zerosum.org'] = {'admin' => 'ofcourseiam'}
     subject['zerosum.org'] = {'admin' => 'somethingcompletelydifferent'}
 
-    subject['zerosum.org'].should == {
+    expect(subject['zerosum.org']).to eq({
       'admin' => 'somethingcompletelydifferent'
-    }
+    })
   end
 
   it "should clear all cookies" do
     subject['zerosum.org'] = {'cookie' => 'foobar'}
     subject.clear!
 
-    subject.size.should == 0
+    expect(subject.size).to eq(0)
   end
 
   describe "dirty" do
@@ -48,37 +48,37 @@ describe CookieJar do
       subject['zerosum.org'] = {'admin' => 'ofcourseiam'}
       subject['zerosum.org'] = {'other' => '1'}
 
-      dirty.include?('zerosum.org').should == true
+      expect(dirty.include?('zerosum.org')).to eq(true)
     end
 
     it "should mark a cookie dirty after overriding params" do
       subject['zerosum.org'] = {'admin' => 'ofcourseiam'}
      subject['zerosum.org'] = {'admin' => 'nope'}
 
-      dirty.include?('zerosum.org').should == true
+      expect(dirty.include?('zerosum.org')).to eq(true)
     end
 
     it "should un-mark a cookie as dirty after re-encoding it" do
       subject['zerosum.org'] = {'admin' => 'ofcourseiam'}
       subject['zerosum.org'] = {'admin' => 'nope'}
 
-      dirty.include?('zerosum.org').should == true
+      expect(dirty.include?('zerosum.org')).to eq(true)
 
       subject.for_host('zerosum.org')
 
-      dirty.include?('zerosum.org').should == false
+      expect(dirty.include?('zerosum.org')).to eq(false)
     end
   end
 
   describe "cookies_for_host" do
     it "should return an empty Hash for unknown hosts" do
-      subject.cookies_for_host('lol.com').should be_empty
+      expect(subject.cookies_for_host('lol.com')).to be_empty
     end
 
     it "should return an empty Hash for hosts with no cookie params" do
       subject['lol.com'] = {}
 
-      subject.cookies_for_host('lol.com').should be_empty
+      expect(subject.cookies_for_host('lol.com')).to be_empty
     end
 
     it "should return cookie parameters for the host" do
@@ -87,8 +87,8 @@ describe CookieJar do
 
       cookie = subject.cookies_for_host('zerosum.org')
 
-      cookie['admin'].should == 'ofcourseiam'
-      cookie['other'].should == '1'
+      expect(cookie['admin']).to eq('ofcourseiam')
+      expect(cookie['other']).to eq('1')
     end
 
     it "should include cookies for the parent domain" do
@@ -97,26 +97,26 @@ describe CookieJar do
 
       cookie = subject.cookies_for_host('sub.zerosum.org')
 
-      cookie['admin'].should == 'ofcourseiam'
-      cookie['other'].should == '1'
+      expect(cookie['admin']).to eq('ofcourseiam')
+      expect(cookie['other']).to eq('1')
     end
   end
 
   describe "for_host" do
     it "should return nil for unknown hosts" do
-      subject.for_host('lol.com').should be_nil
+      expect(subject.for_host('lol.com')).to be_nil
     end
 
     it "should return nil for hosts with no cookie params" do
       subject['lol.com'] = {}
 
-      subject.for_host('lol.com').should be_nil
+      expect(subject.for_host('lol.com')).to be_nil
     end
 
     it "should encode single cookie params" do
       subject['zerosum.org'] = {'admin' => 'ofcourseiam'}
 
-      subject.for_host('zerosum.org').should == 'admin=ofcourseiam'
+      expect(subject.for_host('zerosum.org')).to eq('admin=ofcourseiam')
     end
 
     it "should encode multiple cookie params" do
@@ -125,9 +125,9 @@ describe CookieJar do
 
       cookie = subject.for_host('zerosum.org')
 
-      cookie.should include('admin=ofcourseiam')
-      cookie.should include('; ')
-      cookie.should include('other=1')
+      expect(cookie).to include('admin=ofcourseiam')
+      expect(cookie).to include('; ')
+      expect(cookie).to include('other=1')
     end
 
     it "should include cookies for the parent domain" do
@@ -136,9 +136,9 @@ describe CookieJar do
 
      cookie = subject.for_host('sub.zerosum.org')
 
-      cookie.should include('admin=ofcourseiam')
-      cookie.should include('; ')
-      cookie.should include('other=1')
+      expect(cookie).to include('admin=ofcourseiam')
+      expect(cookie).to include('; ')
+      expect(cookie).to include('other=1')
     end
   end
 end
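Likewise, the CookieJar behavior under test: per-host assignments merge parameter-wise, subdomains inherit the parent domain's cookies, and for_host re-encodes the parameters as a header value. A sketch (example.com is a placeholder; pair ordering in the encoded string is not guaranteed, which is why the specs only assert include):

    require 'spidr/cookie_jar'

    jar = Spidr::CookieJar.new
    jar['example.com'] = {'session' => 'abc123'}
    jar['example.com'] = {'theme' => 'dark'}   # merged into, not replacing, the above

    jar.for_host('example.com')
    # => "key=value" pairs joined by "; ", e.g. "session=abc123; theme=dark"

    jar.cookies_for_host('sub.example.com')
    # => {'session' => 'abc123', 'theme' => 'dark'} (parent-domain cookies included)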