RubyGems - spidr - Versions diffs - 0.1.9 → 0.2.0 - Mend

spidr 0.1.9 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37)

data.tar.gz.sig +0 -0
data/History.txt +43 -0
data/Manifest.txt +19 -0
data/README.txt +100 -11
data/Rakefile +15 -5
data/lib/spidr/actions.rb +2 -0
data/lib/spidr/actions/actions.rb +79 -0
data/lib/spidr/actions/exceptions.rb +4 -0
data/lib/spidr/actions/exceptions/action.rb +6 -0
data/lib/spidr/actions/exceptions/paused.rb +8 -0
data/lib/spidr/actions/exceptions/skip_link.rb +8 -0
data/lib/spidr/actions/exceptions/skip_page.rb +8 -0
data/lib/spidr/agent.rb +385 -444
data/lib/spidr/events.rb +87 -0
data/lib/spidr/extensions.rb +1 -0
data/lib/spidr/extensions/uri.rb +45 -0
data/lib/spidr/filters.rb +438 -0
data/lib/spidr/page.rb +211 -70
data/lib/spidr/rules.rb +40 -18
data/lib/spidr/spidr.rb +57 -7
data/lib/spidr/version.rb +2 -1
data/spec/actions_spec.rb +61 -0
data/spec/agent_spec.rb +24 -31
data/spec/extensions/uri_spec.rb +39 -0
data/spec/filters_spec.rb +53 -0
data/spec/helpers/page.rb +8 -0
data/spec/page_examples.rb +17 -0
data/spec/page_spec.rb +81 -0
data/spec/rules_spec.rb +43 -0
data/spec/spec_helper.rb +1 -1
data/spec/spidr_spec.rb +30 -0
data/static/course/specs.json +1 -1
data/tasks/course.rb +8 -1
data/tasks/spec.rb +1 -0
data/tasks/yard.rb +12 -0
metadata +45 -6
metadata.gz.sig +0 -0

data/lib/spidr/rules.rb CHANGED

@@ -7,25 +7,43 @@ module Spidr
     # Reject rules
     attr_reader :reject
+    #
+    # Creates a new Rules object.
+    #
+    # @param [Hash] options
+    #   Additional options.
+    #
+    # @option options [Array<String, Regexp, Proc>] :accept
+    #   The patterns to accept data with.
+    #
+    # @option options [Array<String, Regexp, Proc>] :reject
+    #   The patterns to reject data with.
+    #
     def initialize(options={})
-      @accept = (options[:accept] || [])
-      @reject = (options[:reject] || [])
+      @accept = []
+      @reject = []
+      @accept += options[:accept] if options[:accept]
+      @reject += options[:reject] if options[:reject]
     end
     #
-    # Returns +true+ if the _field_ is accepted by the rules,
-    # returns +false+ otherwise.
+    # Determines whether the data should be accepted or rejected.
+    #
+    # @return [Boolean]
+    #   Specifies whether the given data was accepted, using the rules
+    #   acceptance patterns.
     #
-    def accept?(field)
+    def accept?(data)
       unless @accept.empty?
         @accept.each do |rule|
-          return true if test_field(field,rule)
+          return true if test_data(data,rule)
         end
         return false
       else
         @reject.each do |rule|
-          return false if test_field(field,rule)
+          return false if test_data(data,rule)
         end
         return true
@@ -33,27 +51,31 @@ module Spidr
     end
     #
-    # Returns +true+ if the _field_ is rejected by the rules,
-    # returns +false+ otherwise.
+    # Determines whether the data should be rejected or accepted.
     #
-    def reject?(field)
-      !(accept?(field))
+    # @return [Boolean]
+    #   Specifies whether the given data was rejected, using the rules
+    #   rejection patterns.
+    #
+    def reject?(data)
+      !(accept?(data))
     end
     protected
     #
-    # Tests the specified _field_ against the specified _rule_. Returns
-    # +true+ when the _rule_ matches the specified _field_, returns
-    # +false+ otherwise.
+    # Tests the given data against a given pattern.
+    #
+    # @return [Boolean]
+    #   Specifies whether the given data matched the pattern.
     #
-    def test_field(field,rule)
+    def test_data(data,rule)
       if rule.kind_of?(Proc)
-        return (rule.call(field) == true)
+        return (rule.call(data) == true)
       elsif rule.kind_of?(Regexp)
-        return !((field.to_s =~ rule).nil?)
+        return !((data.to_s =~ rule).nil?)
       else
-        return field == rule
+        return data == rule
       end
     end

data/lib/spidr/spidr.rb CHANGED

@@ -4,43 +4,93 @@ module Spidr
   # Common proxy port.
   COMMON_PROXY_PORT = 8080
+  # Default proxy information.
+  DEFAULT_PROXY = {
+    :host => nil,
+    :port => COMMON_PROXY_PORT,
+    :user => nil,
+    :password => nil
+  }
+  #
+  # Proxy information used by all newly created Agent objects by default.
   #
-  # Returns the +Hash+ of the Spidr proxy information.
+  # @return [Hash]
+  #   The Spidr proxy information.
   #
   def Spidr.proxy
-    @@spidr_proxy ||= {:host => nil, :port => COMMON_PROXY_PORT, :user => nil, :password => nil}
+    @@spidr_proxy ||= DEFAULT_PROXY
+  end
+  #
+  # Sets the proxy information used by Agent objects.
+  #
+  # @param [Hash] new_proxy
+  #   The new proxy information.
+  #
+  # @option new_proxy [String] :host
+  #   The host-name of the proxy.
+  #
+  # @option new_proxy [Integer] :port (COMMON_PROXY_PORT)
+  #   The port of the proxy.
+  #
+  # @option new_proxy [String] :user
+  #   The user to authenticate with the proxy as.
+  #
+  # @option new_proxy [String] :password
+  #   The password to authenticate with the proxy.
+  #
+  # @return [Hash]
+  #   The new proxy information.
+  #
+  def Spidr.proxy=(new_proxy)
+    @@spidr_proxy = {:port => COMMON_PROXY_PORT}.merge(new_proxy)
   end
   #
-  # Returns the Spidr User-Agent
+  # Disables the proxy settings used by all newly created Agent objects.
+  #
+  def Spidr.disable_proxy!
+    @@spidr_proxy = DEFAULT_PROXY
+    return true
+  end
+  #
+  # The User-Agent string used by all Agent objects by default.
+  #
+  # @return [String]
+  #   The Spidr User-Agent string.
   #
   def Spidr.user_agent
     @@spidr_user_agent ||= nil
   end
   #
-  # Sets the Spidr Web User-Agent to the specified _new_agent_.
+  # Sets the Spidr User-Agent string.
+  #
+  # @param [String] new_agent
+  #   The new User-Agent string.
   #
   def Spidr.user_agent=(new_agent)
     @@spidr_user_agent = new_agent
   end
   #
-  # See Agent.start_at.
+  # @see Agent.start_at
   #
   def Spidr.start_at(url,options={},&block)
     Agent.start_at(url,options,&block)
   end
   #
-  # See Agent.host.
+  # @see Agent.host
   #
   def Spidr.host(name,options={},&block)
     Agent.host(name,options,&block)
   end
   #
-  # See Agent.site.
+  # @see Agent.site
   #
   def Spidr.site(url,options={},&block)
     Agent.site(url,options,&block)

data/lib/spidr/version.rb CHANGED

@@ -1,3 +1,4 @@
 module Spidr
-  VERSION = '0.1.9'
+  # Spidr version
+  VERSION = '0.2.0'
 end

data/spec/actions_spec.rb ADDED

@@ -0,0 +1,61 @@
+require 'spidr/actions'
+require 'spidr/agent'
+require 'spec_helper'
+describe Actions do
+  before(:all) do
+    @url = URI('http://spidr.rubyforge.org/')
+  end
+  it "should be able to pause spidering" do
+    count = 0
+    agent = Agent.host('spidr.rubyforge.org') do |spider|
+      spider.every_page do |page|
+        count += 1
+        spider.pause! if count >= 2
+      end
+    end
+    agent.should be_paused
+    agent.history.length.should == 2
+  end
+  it "should be able to continue spidering after being paused" do
+    agent = Agent.new do |spider|
+      spider.every_page do |page|
+        spider.pause!
+      end
+    end
+    agent.enqueue(@url)
+    agent.continue!
+    agent.visited?(@url).should == true
+  end
+  it "should allow skipping of enqueued links" do
+    agent = Agent.new do |spider|
+      spider.every_url do |url|
+        spider.skip_link!
+      end
+    end
+    agent.enqueue(@url)
+    agent.queue.should be_empty
+  end
+  it "should allow skipping of visited pages" do
+    agent = Agent.new do |spider|
+      spider.every_page do |url|
+        spider.skip_page!
+      end
+    end
+    agent.visit_page(@url)
+    agent.history.should == Set[@url]
+    agent.queue.should be_empty
+  end
+end

data/spec/agent_spec.rb CHANGED

@@ -20,19 +20,38 @@ describe Agent do
   it "should be able to restore the history" do
     agent = Agent.new
-    previous_history = [URI('http://www.example.com')]
+    previous_history = Set[URI('http://www.example.com')]
     agent.history = previous_history
     agent.history.should == previous_history
   end
-  it "should convert new histories to an Array of URIs" do
+  it "should convert new histories to an Set of URIs" do
     agent = Agent.new
     previous_history = ['http://www.example.com']
+    expected_history = Set[URI('http://www.example.com')]
     agent.history = previous_history
     agent.history.should_not == previous_history
-    agent.history.should == previous_history.map { |url| URI(url) }
+    agent.history.should == expected_history
+  end
+  it "should be able to restore the failures" do
+    agent = Agent.new
+    previous_failures = Set[URI('http://localhost/')]
+    agent.failures = previous_failures
+    agent.failures.should == previous_failures
+  end
+  it "should convert new histories to a Set of URIs" do
+    agent = Agent.new
+    previous_failures = ['http://localhost/']
+    expected_failures = Set[URI('http://localhost/')]
+    agent.failures = previous_failures
+    agent.failures.should_not == previous_failures
+    agent.failures.should == expected_failures
   end
   it "should be able to restore the queue" do
@@ -46,37 +65,11 @@ describe Agent do
   it "should convert new queues to an Array of URIs" do
     agent = Agent.new
     previous_queue = ['http://www.example.com']
+    expected_queue = [URI('http://www.example.com')]
     agent.queue = previous_queue
     agent.queue.should_not == previous_queue
-    agent.queue.should == previous_queue.map { |url| URI(url) }
-  end
-  it "should be able to pause spidering" do
-    count = 0
-    agent = Agent.host('spidr.rubyforge.org') do |spider|
-      spider.every_page do |page|
-        count += 1
-        spider.pause! if count >= 2
-      end
-    end
-    agent.should be_paused
-    agent.history.length.should == 2
-  end
-  it "should be able to continue spidering after being paused" do
-    agent = Agent.new do |spider|
-      spider.enqueue('http://spidr.rubyforge.org/')
-      spider.every_page do |page|
-        spider.pause!
-      end
-    end
-    agent.pause!
-    agent.continue!
-    agent.visited?('http://spidr.rubyforge.org/').should == true
+    agent.queue.should == expected_queue
   end
   it "should provide a to_hash method that returns the queue and history" do

data/spec/extensions/uri_spec.rb ADDED

@@ -0,0 +1,39 @@
+require 'spidr/extensions/uri'
+require 'spec_helper'
+describe URI do
+  describe "expand_path" do
+    it "should preserve single directory paths" do
+      URI.expand_path('path').should == 'path'
+    end
+    it "should preserve trailing '/'" do
+      URI.expand_path('test/path/').should == 'test/path/'
+    end
+    it "should remove multiple '/' characters" do
+      URI.expand_path('///test///path///').should == '/test/path/'
+    end
+    it "should remove '.' directories from the path" do
+      URI.expand_path('test/./path').should == 'test/path'
+    end
+    it "should handle '..' directories properly" do
+      URI.expand_path('test/../path').should == 'path'
+    end
+    it "should limit the number of '..' directories resolved" do
+      URI.expand_path('/test/../../../..').should == '/'
+    end
+    it "should preserve absolute paths" do
+      URI.expand_path('/test/path').should == '/test/path'
+    end
+    it "should preserve the root path" do
+      URI.expand_path('/').should == '/'
+    end
+  end
+end

data/spec/filters_spec.rb ADDED

@@ -0,0 +1,53 @@
+require 'spidr/filters'
+require 'spidr/agent'
+require 'spec_helper'
+describe Filters do
+  it "should allow setting the acceptable schemes" do
+    agent = Agent.new
+    agent.schemes = [:http]
+    agent.schemes.should == ['http']
+  end
+  it "should provide the hosts that will be visited" do
+    agent = Agent.new(:hosts => ['spidr.rubyforge.org'])
+    agent.visit_hosts.should == ['spidr.rubyforge.org']
+  end
+  it "should provide the hosts that will not be visited" do
+    agent = Agent.new(:ignore_hosts => ['example.com'])
+    agent.ignore_hosts.should == ['example.com']
+  end
+  it "should provide the ports that will be visited" do
+    agent = Agent.new(:ports => [80, 443, 8000])
+    agent.visit_ports.should == [80, 443, 8000]
+  end
+  it "should provide the ports that will not be visited" do
+    agent = Agent.new(:ignore_ports => [8000, 8080])
+    agent.ignore_ports.should == [8000, 8080]
+  end
+  it "should provide the links that will be visited" do
+    agent = Agent.new(:links => ['index.php'])
+    agent.visit_links.should == ['index.php']
+  end
+  it "should provide the links that will not be visited" do
+    agent = Agent.new(:ignore_links => [/login/])
+    agent.ignore_links.should == [/login/]
+  end
+  it "should provide the exts that will be visited" do
+    agent = Agent.new(:exts => ['htm'])
+    agent.visit_exts.should == ['htm']
+  end
+  it "should provide the exts that will not be visited" do
+    agent = Agent.new(:ignore_exts => ['cfm'])
+    agent.ignore_exts.should == ['cfm']
+  end
+end

data/spec/helpers/page.rb ADDED

@@ -0,0 +1,8 @@
+require 'net/http'
+require 'uri'
+def get_page(url)
+  url = URI(url.to_s)
+  return Spidr::Page.new(url,Net::HTTP.get_response(url))
+end