RubyGems - spidr - Versions diffs - 0.4.1 → 0.5.0 - Mend

spidr 0.4.1 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (46)

checksums.yaml +7 -0
data/ChangeLog.md +69 -54
data/Gemfile +9 -5
data/LICENSE.txt +1 -1
data/README.md +34 -26
data/Rakefile +4 -15
data/gemspec.yml +3 -2
data/lib/spidr/agent.rb +101 -44
data/lib/spidr/{actions → agent}/actions.rb +32 -12
data/lib/spidr/{events.rb → agent/events.rb} +4 -8
data/lib/spidr/{filters.rb → agent/filters.rb} +14 -16
data/lib/spidr/{sanitizers.rb → agent/sanitizers.rb} +5 -7
data/lib/spidr/auth_store.rb +2 -2
data/lib/spidr/cookie_jar.rb +2 -2
data/lib/spidr/extensions/uri.rb +28 -16
data/lib/spidr/page.rb +7 -11
data/lib/spidr/{body.rb → page/body.rb} +1 -1
data/lib/spidr/{headers.rb → page/headers.rb} +1 -1
data/lib/spidr/{links.rb → page/links.rb} +43 -7
data/lib/spidr/session_cache.rb +2 -2
data/lib/spidr/spidr.rb +32 -5
data/lib/spidr/version.rb +1 -1
data/spec/agent/actions_spec.rb +60 -0
data/spec/agent/filters_spec.rb +62 -0
data/spec/agent/sanitizers_spec.rb +62 -0
data/spec/agent_spec.rb +13 -13
data/spec/auth_store_spec.rb +17 -17
data/spec/cookie_jar_spec.rb +26 -26
data/spec/extensions/uri_spec.rb +19 -9
data/spec/helpers/history.rb +5 -5
data/spec/helpers/wsoc.rb +2 -2
data/spec/page_examples.rb +4 -4
data/spec/page_spec.rb +28 -25
data/spec/rules_spec.rb +14 -14
data/spec/session_cache.rb +7 -7
data/spec/spidr_spec.rb +10 -10
metadata +37 -51
data/lib/spidr/actions.rb +0 -2
data/lib/spidr/actions/exceptions.rb +0 -4
data/lib/spidr/actions/exceptions/action.rb +0 -9
data/lib/spidr/actions/exceptions/paused.rb +0 -11
data/lib/spidr/actions/exceptions/skip_link.rb +0 -12
data/lib/spidr/actions/exceptions/skip_page.rb +0 -12
data/spec/actions_spec.rb +0 -59
data/spec/filters_spec.rb +0 -61
data/spec/sanitizers_spec.rb +0 -61

data/lib/spidr/actions.rb DELETED

@@ -1,2 +0,0 @@
-require 'spidr/actions/exceptions'
-require 'spidr/actions/actions'

data/lib/spidr/actions/exceptions.rb DELETED

@@ -1,4 +0,0 @@
-require 'spidr/actions/exceptions/action'
-require 'spidr/actions/exceptions/paused'
-require 'spidr/actions/exceptions/skip_link'
-require 'spidr/actions/exceptions/skip_page'

data/lib/spidr/actions/exceptions/action.rb DELETED

@@ -1,9 +0,0 @@
-module Spidr
-  module Actions
-    #
-    # The base {Actions} exception class.
-    #
-    class Action < RuntimeError
-    end
-  end
-end

data/lib/spidr/actions/exceptions/paused.rb DELETED

@@ -1,11 +0,0 @@
-require 'spidr/actions/exceptions/action'
-module Spidr
-  module Actions
-    #
-    # An {Actions} exception class used to pause a running {Agent}.
-    #
-    class Paused < Action
-    end
-  end
-end

data/lib/spidr/actions/exceptions/skip_link.rb DELETED

@@ -1,12 +0,0 @@
-require 'spidr/actions/exceptions/action'
-module Spidr
-  module Actions
-    #
-    # An {Actions} exception class which causes a running {Agent} to
-    # skip a link.
-    #
-    class SkipLink < Action
-    end
-  end
-end

data/lib/spidr/actions/exceptions/skip_page.rb DELETED

@@ -1,12 +0,0 @@
-require 'spidr/actions/exceptions/action'
-module Spidr
-  module Actions
-    #
-    # An {Actions} exception class which causes a running {Agent} to
-    # skip a {Page}, and all links within that page.
-    #
-    class SkipPage < Action
-    end
-  end
-end

data/spec/actions_spec.rb DELETED

@@ -1,59 +0,0 @@
-require 'spidr/actions'
-require 'spidr/agent'
-require 'spec_helper'
-describe Actions do
-  let(:url) { URI('http://spidr.rubyforge.org/') }
-  it "should be able to pause spidering" do
-    count = 0
-    agent = Agent.host('spidr.rubyforge.org') do |spider|
-      spider.every_page do |page|
-        count += 1
-        spider.pause! if count >= 2
-      end
-    end
-    agent.should be_paused
-    agent.history.length.should == 2
-  end
-  it "should be able to continue spidering after being paused" do
-    agent = Agent.new do |spider|
-      spider.every_page do |page|
-        spider.pause!
-      end
-    end
-    agent.enqueue(url)
-    agent.continue!
-    agent.visited?(url).should == true
-  end
-  it "should allow skipping of enqueued links" do
-    agent = Agent.new do |spider|
-      spider.every_url do |url|
-        spider.skip_link!
-      end
-    end
-    agent.enqueue(url)
-    agent.queue.should be_empty
-  end
-  it "should allow skipping of visited pages" do
-    agent = Agent.new do |spider|
-      spider.every_page do |url|
-        spider.skip_page!
-      end
-    end
-    agent.visit_page(url)
-    agent.history.should == Set[url]
-    agent.queue.should be_empty
-  end
-end

data/spec/filters_spec.rb DELETED

@@ -1,61 +0,0 @@
-require 'spidr/filters'
-require 'spidr/agent'
-require 'spec_helper'
-describe Filters do
-  it "should allow setting the acceptable schemes" do
-    agent = Agent.new
-    agent.schemes = [:http]
-    agent.schemes.should == ['http']
-  end
-  it "should provide the hosts that will be visited" do
-    agent = Agent.new(:hosts => ['spidr.rubyforge.org'])
-    agent.visit_hosts.should == ['spidr.rubyforge.org']
-  end
-  it "should provide the hosts that will not be visited" do
-    agent = Agent.new(:ignore_hosts => ['example.com'])
-    agent.ignore_hosts.should == ['example.com']
-  end
-  it "should provide the ports that will be visited" do
-    agent = Agent.new(:ports => [80, 443, 8000])
-    agent.visit_ports.should == [80, 443, 8000]
-  end
-  it "should provide the ports that will not be visited" do
-    agent = Agent.new(:ignore_ports => [8000, 8080])
-    agent.ignore_ports.should == [8000, 8080]
-  end
-  it "should provide the links that will be visited" do
-    agent = Agent.new(:links => ['index.php'])
-    agent.visit_links.should == ['index.php']
-  end
-  it "should provide the links that will not be visited" do
-    agent = Agent.new(:ignore_links => [/login/])
-    agent.ignore_links.should == [/login/]
-  end
-  it "should provide the exts that will be visited" do
-    agent = Agent.new(:exts => ['htm'])
-    agent.visit_exts.should == ['htm']
-  end
-  it "should provide the exts that will not be visited" do
-    agent = Agent.new(:ignore_exts => ['cfm'])
-    agent.ignore_exts.should == ['cfm']
-  end
-end

data/spec/sanitizers_spec.rb DELETED

@@ -1,61 +0,0 @@
-require 'spidr/sanitizers'
-require 'spidr/agent'
-require 'spec_helper'
-describe Sanitizers do
-  describe "sanitize_url" do
-    let(:url) { 'http://host.com' }
-    before(:all) { @agent = Agent.new }
-    it "should sanitize URLs" do
-      agent = Agent.new
-      clean_url = agent.sanitize_url(URI(url))
-      clean_url.host.should == 'host.com'
-    end
-    it "should sanitize URLs given as Strings" do
-      agent = Agent.new
-      clean_url = agent.sanitize_url(url)
-      clean_url.host.should == 'host.com'
-    end
-  end
-  describe "strip_fragments" do
-    let(:url) { URI("http://host.com/page#lol") }
-    it "should strip fragment components by default" do
-      agent = Agent.new
-      clean_url = agent.sanitize_url(url)
-      clean_url.fragment.should be_nil
-    end
-    it "should allow perserving fragment components" do
-      agent = Agent.new(:strip_fragments => false)
-      clean_url = agent.sanitize_url(url)
-      clean_url.fragment.should == 'lol'
-    end
-  end
-  describe "strip_query" do
-    let(:url) { URI("http://host.com/page?x=1") }
-    it "should not strip query components by default" do
-      agent = Agent.new
-      clean_url = agent.sanitize_url(url)
-      clean_url.query.should == 'x=1'
-    end
-    it "should allow stripping of query components" do
-      agent = Agent.new(:strip_query => true)
-      clean_url = agent.sanitize_url(url)
-      clean_url.query.should be_nil
-    end
-  end
-end