RubyGems - spidr_epg - Versions diffs - 1.0.0 - Mend

spidr_epg 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (60) hide show

checksums.yaml +15 -0
data/.gitignore +10 -0
data/.rspec +1 -0
data/.yardopts +1 -0
data/ChangeLog.md +291 -0
data/ChangeLog.md~ +291 -0
data/Gemfile +16 -0
data/Gemfile.lock +49 -0
data/Gemfile~ +16 -0
data/LICENSE.txt +20 -0
data/README.md +193 -0
data/README.md~ +190 -0
data/Rakefile +29 -0
data/gemspec.yml +19 -0
data/lib/spidr/actions/actions.rb +83 -0
data/lib/spidr/actions/exceptions/action.rb +9 -0
data/lib/spidr/actions/exceptions/paused.rb +11 -0
data/lib/spidr/actions/exceptions/skip_link.rb +12 -0
data/lib/spidr/actions/exceptions/skip_page.rb +12 -0
data/lib/spidr/actions/exceptions.rb +4 -0
data/lib/spidr/actions.rb +2 -0
data/lib/spidr/agent.rb +866 -0
data/lib/spidr/auth_credential.rb +28 -0
data/lib/spidr/auth_store.rb +161 -0
data/lib/spidr/body.rb +98 -0
data/lib/spidr/cookie_jar.rb +202 -0
data/lib/spidr/events.rb +537 -0
data/lib/spidr/extensions/uri.rb +52 -0
data/lib/spidr/extensions.rb +1 -0
data/lib/spidr/filters.rb +539 -0
data/lib/spidr/headers.rb +370 -0
data/lib/spidr/links.rb +229 -0
data/lib/spidr/page.rb +108 -0
data/lib/spidr/rules.rb +79 -0
data/lib/spidr/sanitizers.rb +56 -0
data/lib/spidr/session_cache.rb +145 -0
data/lib/spidr/spidr.rb +107 -0
data/lib/spidr/version.rb +4 -0
data/lib/spidr/version.rb~ +4 -0
data/lib/spidr.rb +3 -0
data/pkg/spidr-1.0.0.gem +0 -0
data/spec/actions_spec.rb +59 -0
data/spec/agent_spec.rb +81 -0
data/spec/auth_store_spec.rb +85 -0
data/spec/cookie_jar_spec.rb +144 -0
data/spec/extensions/uri_spec.rb +43 -0
data/spec/filters_spec.rb +61 -0
data/spec/helpers/history.rb +34 -0
data/spec/helpers/page.rb +8 -0
data/spec/helpers/wsoc.rb +83 -0
data/spec/page_examples.rb +21 -0
data/spec/page_spec.rb +125 -0
data/spec/rules_spec.rb +45 -0
data/spec/sanitizers_spec.rb +61 -0
data/spec/session_cache.rb +58 -0
data/spec/spec_helper.rb +4 -0
data/spec/spidr_spec.rb +39 -0
data/spidr.gemspec +133 -0
data/spidr.gemspec~ +131 -0
metadata +158 -0

data/lib/spidr/rules.rb ADDED Viewed

@@ -0,0 +1,79 @@
+module Spidr
+  #
+  # The {Rules} class represents collections of acceptance and rejection
+  # rules, which are used to filter data.
+  #
+  class Rules
+    # Accept rules
+    attr_reader :accept
+    # Reject rules
+    attr_reader :reject
+    #
+    # Creates a new Rules object.
+    #
+    # @param [Hash] options
+    #   Additional options.
+    #
+    # @option options [Array<String, Regexp, Proc>] :accept
+    #   The patterns to accept data with.
+    #
+    # @option options [Array<String, Regexp, Proc>] :reject
+    #   The patterns to reject data with.
+    #
+    def initialize(options={})
+      @accept = []
+      @reject = []
+      @accept += options[:accept] if options[:accept]
+      @reject += options[:reject] if options[:reject]
+    end
+    #
+    # Determines whether the data should be accepted or rejected.
+    #
+    # @return [Boolean]
+    #   Specifies whether the given data was accepted, using the rules
+    #   acceptance patterns.
+    #
+    def accept?(data)
+      unless @accept.empty?
+        @accept.any? { |rule| test_data(data,rule) }
+      else
+        !@reject.any? { |rule| test_data(data,rule) }
+      end
+    end
+    #
+    # Determines whether the data should be rejected or accepted.
+    #
+    # @return [Boolean]
+    #   Specifies whether the given data was rejected, using the rules
+    #   rejection patterns.
+    #
+    def reject?(data)
+      !accept?(data)
+    end
+    protected
+    #
+    # Tests the given data against a given pattern.
+    #
+    # @return [Boolean]
+    #   Specifies whether the given data matched the pattern.
+    #
+    def test_data(data,rule)
+      if rule.kind_of?(Proc)
+        rule.call(data) == true
+      elsif rule.kind_of?(Regexp)
+        !((data.to_s =~ rule).nil?)
+      else
+        data == rule
+      end
+    end
+  end
+end

data/lib/spidr/sanitizers.rb ADDED Viewed

@@ -0,0 +1,56 @@
+require 'uri'
+module Spidr
+  #
+  # The {Sanitizers} module adds methods to {Agent} which control the
+  # sanitation of incoming links.
+  #
+  module Sanitizers
+    # Specifies whether the Agent will strip URI fragments
+    attr_accessor :strip_fragments
+    # Specifies whether the Agent will strip URI queries
+    attr_accessor :strip_query
+    #
+    # Sanitizes a URL based on filtering options.
+    #
+    # @param [URI::HTTP, URI::HTTPS, String] url
+    #   The URL to be sanitized
+    #
+    # @return [URI::HTTP, URI::HTTPS]
+    #   The new sanitized URL.
+    #
+    # @since 0.2.2
+    #
+    def sanitize_url(url)
+      url = URI(url.to_s) unless url.kind_of?(URI)
+      url.fragment = nil if @strip_fragments
+      url.query    = nil if @strip_query
+      return url
+    end
+    protected
+    #
+    # Initializes the Sanitizer rules.
+    #
+    # @param [Hash] options
+    #   Additional options.
+    #
+    # @option options [Boolean] :strip_fragments (true)
+    #   Specifies whether or not to strip the fragment component from URLs.
+    #
+    # @option options [Boolean] :strip_query (false)
+    #   Specifies whether or not to strip the query component from URLs.
+    #
+    # @since 0.2.2
+    #
+    def initialize_sanitizers(options={})
+      @strip_fragments = options.fetch(:strip_fragments,true)
+      @strip_query     = options.fetch(:strip_query,false)
+    end
+  end
+end

data/lib/spidr/session_cache.rb ADDED Viewed

@@ -0,0 +1,145 @@
+require 'spidrs/spidrs'
+require 'net/http'
+module Spidr
+  #
+  # Stores active HTTP Sessions organized by scheme, host-name and port.
+  #
+  class SessionCache
+    # Proxy to use
+    attr_accessor :proxy
+    #
+    # Creates a new session cache.
+    #
+    # @param [Hash] proxy (Spidr.proxy)
+    #   Proxy options.
+    #
+    # @option proxy [String] :host
+    #   The host the proxy is running on.
+    #
+    # @option proxy [Integer] :port
+    #   The port the proxy is running on.
+    #
+    # @option proxy [String] :user
+    #   The user to authenticate as with the proxy.
+    #
+    # @option proxy [String] :password
+    #   The password to authenticate with.
+    #
+    # @since 0.2.2
+    #
+    def initialize(proxy=Spidr.proxy)
+      @proxy    = proxy
+      @sessions = {}
+    end
+    #
+    # Determines if there is an active HTTP session for a given URL.
+    #
+    # @param [URI::HTTP, String] url
+    #   The URL that represents a session.
+    #
+    # @return [Boolean]
+    #   Specifies whether there is an active HTTP session.
+    #
+    # @since 0.2.3
+    #
+    def active?(url)
+      # normalize the url
+      url = URI(url.to_s) unless url.kind_of?(URI)
+      # session key
+      key = [url.scheme, url.host, url.port]
+      return @sessions.has_key?(key)
+    end
+    #
+    # Provides an active HTTP session for a given URL.
+    #
+    # @param [URI::HTTP, String] url
+    #   The URL which will be requested later.
+    #
+    # @return [Net::HTTP]
+    #   The active HTTP session object.
+    #
+    def [](url)
+      # normalize the url
+      url = URI(url.to_s) unless url.kind_of?(URI)
+      # session key
+      key = [url.scheme, url.host, url.port]
+      unless @sessions[key]
+        session = Net::HTTP::Proxy(
+          @proxy[:host],
+          @proxy[:port],
+          @proxy[:user],
+          @proxy[:password]
+        ).new(url.host,url.port)
+        if url.scheme == 'https'
+          session.use_ssl     = true
+          session.verify_mode = OpenSSL::SSL::VERIFY_NONE
+          session.start
+        end
+        @sessions[key] = session
+      end
+      return @sessions[key]
+    end
+    #
+    # Destroys an HTTP session for the given scheme, host and port.
+    #
+    # @param [URI::HTTP, String] url
+    #   The URL of the requested session.
+    #
+    # @return [nil]
+    #
+    # @since 0.2.2
+    #
+    def kill!(url)
+      # normalize the url
+      url = URI(url.to_s) unless url.kind_of?(URI)
+      # session key
+      key = [url.scheme, url.host, url.port]
+      if (sess = @sessions[key])
+        begin
+          sess.finish
+        rescue IOError
+        end
+        @sessions.delete(key)
+      end
+    end
+    #
+    # Clears the session cache.
+    #
+    # @return [SessionCache]
+    #   The cleared session cache.
+    #
+    # @since 0.2.2
+    #
+    def clear
+      @sessions.each_value do |sess|
+        begin
+          sess.finish
+        rescue IOError
+          nil
+        end
+      end
+      @sessions.clear
+      return self
+    end
+  end
+end

data/lib/spidr/spidr.rb ADDED Viewed

@@ -0,0 +1,107 @@
+require 'spidrs/agent'
+module Spidr
+  # Common proxy port.
+  COMMON_PROXY_PORT = 8080
+  # Default proxy information.
+  DEFAULT_PROXY = {
+    :host     => nil,
+    :port     => COMMON_PROXY_PORT,
+    :user     => nil,
+    :password => nil
+  }
+  #
+  # Proxy information used by all newly created Agent objects by default.
+  #
+  # @return [Hash]
+  #   The Spidr proxy information.
+  #
+  def Spidr.proxy
+    @@spidr_proxy ||= DEFAULT_PROXY
+  end
+  #
+  # Sets the proxy information used by Agent objects.
+  #
+  # @param [Hash] new_proxy
+  #   The new proxy information.
+  #
+  # @option new_proxy [String] :host
+  #   The host-name of the proxy.
+  #
+  # @option new_proxy [Integer] :port (COMMON_PROXY_PORT)
+  #   The port of the proxy.
+  #
+  # @option new_proxy [String] :user
+  #   The user to authenticate with the proxy as.
+  #
+  # @option new_proxy [String] :password
+  #   The password to authenticate with the proxy.
+  #
+  # @return [Hash]
+  #   The new proxy information.
+  #
+  def Spidr.proxy=(new_proxy)
+    @@spidr_proxy = {:port => COMMON_PROXY_PORT}.merge(new_proxy)
+  end
+  #
+  # Disables the proxy settings used by all newly created Agent objects.
+  #
+  def Spidr.disable_proxy!
+    @@spidr_proxy = DEFAULT_PROXY
+    return true
+  end
+  #
+  # The User-Agent string used by all Agent objects by default.
+  #
+  # @return [String]
+  #   The Spidr User-Agent string.
+  #
+  def Spidr.user_agent
+    @@spidr_user_agent ||= nil
+  end
+  #
+  # Sets the Spidr User-Agent string.
+  #
+  # @param [String] new_agent
+  #   The new User-Agent string.
+  #
+  def Spidr.user_agent=(new_agent)
+    @@spidr_user_agent = new_agent
+  end
+  #
+  # @see Agent.start_at
+  #
+  def Spidr.start_at(url,options={},&block)
+    Agent.start_at(url,options,&block)
+  end
+  #
+  # @see Agent.start_at
+  # regex use for match url
+  # with this faction could find specific url
+  #
+  def Spidr.start_at(url,regex,options={},&block)
+    Agent.start_at(url,regex,options,&block)
+  end
+  #
+  # @see Agent.host
+  #
+  def Spidr.host(name,options={},&block)
+    Agent.host(name,options,&block)
+  end
+  #
+  # @see Agent.site
+  #
+  def Spidr.site(url,options={},&block)
+    Agent.site(url,options,&block)
+  end
+end

data/lib/spidr/version.rb ADDED Viewed

@@ -0,0 +1,4 @@
+module Spidr
+  # The version String of this Spidr release.
+  VERSION = '1.0.0'
+end

data/lib/spidr/version.rb~ ADDED Viewed

@@ -0,0 +1,4 @@
+module Spidr
+  # The version String of this Spidr release. NOTE(review): this "~" file looks like an editor backup of version.rb -- confirm it should ship.
+  VERSION = '1.0.0'
+end

data/lib/spidr.rb ADDED Viewed

@@ -0,0 +1,3 @@
+require 'spidr/agent'
+require 'spidr/spidr'
+require 'spidr/version'

data/pkg/spidr-1.0.0.gem ADDED Viewed

Binary file

data/spec/actions_spec.rb ADDED Viewed

@@ -0,0 +1,59 @@
+require 'spidr/actions'
+require 'spidr/agent'
+require 'spec_helper'
+describe Actions do
+  let(:url) { URI('http://spidr.rubyforge.org/') }
+  it "should be able to pause spidering" do
+    count = 0
+    agent = Agent.host('spidr.rubyforge.org') do |spider|
+      spider.every_page do |page|
+        count += 1
+        spider.pause! if count >= 2
+      end
+    end
+    agent.should be_paused
+    agent.history.length.should == 2
+  end
+  it "should be able to continue spidering after being paused" do
+    agent = Agent.new do |spider|
+      spider.every_page do |page|
+        spider.pause!
+      end
+    end
+    agent.enqueue(url)
+    agent.continue!
+    agent.visited?(url).should == true
+  end
+  it "should allow skipping of enqueued links" do
+    agent = Agent.new do |spider|
+      spider.every_url do |url|
+        spider.skip_link!
+      end
+    end
+    agent.enqueue(url)
+    agent.queue.should be_empty
+  end
+  it "should allow skipping of visited pages" do
+    agent = Agent.new do |spider|
+      spider.every_page do |url|
+        spider.skip_page!
+      end
+    end
+    agent.visit_page(url)
+    agent.history.should == Set[url]
+    agent.queue.should be_empty
+  end
+end

data/spec/agent_spec.rb ADDED Viewed

@@ -0,0 +1,81 @@
+require 'spidr/agent'
+require 'spec_helper'
+require 'helpers/wsoc'
+describe Agent do
+  include Helpers::WSOC
+  before(:all) do
+    @agent = run_course
+  end
+  it "should provide the history" do
+    @agent.history.should_not be_empty
+  end
+  it "should provide the queue" do
+    @agent.queue.should be_empty
+  end
+  it "should be able to restore the history" do
+    agent = Agent.new
+    previous_history = Set[URI('http://www.example.com')]
+    agent.history = previous_history
+    agent.history.should == previous_history
+  end
+  it "should convert new histories to an Set of URIs" do
+    agent = Agent.new
+    previous_history = ['http://www.example.com']
+    expected_history = Set[URI('http://www.example.com')]
+    agent.history = previous_history
+    agent.history.should_not == previous_history
+    agent.history.should == expected_history
+  end
+  it "should be able to restore the failures" do
+    agent = Agent.new
+    previous_failures = Set[URI('http://localhost/')]
+    agent.failures = previous_failures
+    agent.failures.should == previous_failures
+  end
+  it "should convert new histories to a Set of URIs" do
+    agent = Agent.new
+    previous_failures = ['http://localhost/']
+    expected_failures = Set[URI('http://localhost/')]
+    agent.failures = previous_failures
+    agent.failures.should_not == previous_failures
+    agent.failures.should == expected_failures
+  end
+  it "should be able to restore the queue" do
+    agent = Agent.new
+    previous_queue = [URI('http://www.example.com')]
+    agent.queue = previous_queue
+    agent.queue.should == previous_queue
+  end
+  it "should convert new queues to an Array of URIs" do
+    agent = Agent.new
+    previous_queue = ['http://www.example.com']
+    expected_queue = [URI('http://www.example.com')]
+    agent.queue = previous_queue
+    agent.queue.should_not == previous_queue
+    agent.queue.should == expected_queue
+  end
+  it "should provide a to_hash method that returns the queue and history" do
+    hash = @agent.to_hash
+    hash[:queue].should be_empty
+    hash[:history].should_not be_empty
+  end
+end

data/spec/auth_store_spec.rb ADDED Viewed

@@ -0,0 +1,85 @@
+require 'spidr/auth_store'
+require 'spec_helper'
+describe AuthStore do
+  let(:root_uri) { URI('http://zerosum.org/') }
+  let(:uri) { root_uri.merge('/course/auth') }
+  before(:each) do
+    @auth_store = AuthStore.new
+    @auth_store.add(uri, 'admin', 'password')
+  end
+  after(:each) do
+    @auth_store.clear!
+  end
+  it 'should retrieve auth credentials for the URL' do
+    @auth_store[root_uri] = AuthCredential.new('user1', 'pass1')
+    @auth_store[root_uri].username.should == 'user1'
+    @auth_store[root_uri].password.should == 'pass1'
+  end
+  it 'should add auth credentials for the URL' do
+    lambda {
+      @auth_store.add(root_uri, 'user1', 'pass1')
+    }.should change(@auth_store, :size)
+    @auth_store[root_uri].username.should == 'user1'
+    @auth_store[root_uri].password.should == 'pass1'
+  end
+  describe 'matching' do
+    let(:sub_uri) { uri.merge('/course/auth/protected.html') }
+    it 'should match a longer URL to the base' do
+      @auth_store[sub_uri].username.should == 'admin'
+      @auth_store[sub_uri].password.should == 'password'
+    end
+    it 'should match the longest of all matching URLs' do
+      @auth_store.add(uri.merge('/course'), 'user1', 'pass1')
+      @auth_store.add(uri.merge('/course/auth/special'), 'user2', 'pass2')
+      @auth_store.add(uri.merge('/course/auth/special/extra'), 'user3', 'pass3')
+      auth = @auth_store[uri.merge('/course/auth/special/1.html')]
+      auth.username.should == 'user2'
+      auth.password.should == 'pass2'
+    end
+    it 'should not match a URL with a different host' do
+      remote_uri = URI('http://spidr.rubyforge.org/course/auth')
+      @auth_store[remote_uri].should be_nil
+    end
+    it 'should not match a URL with an alternate path' do
+      relative_uri = uri.merge('/course/admin/protected.html')
+      @auth_store[relative_uri].should be_nil
+    end
+  end
+  it 'should override previous auth credentials' do
+    @auth_store.add(uri, 'newuser', 'newpass')
+    @auth_store[uri].username.should == 'newuser'
+    @auth_store[uri].password.should == 'newpass'
+  end
+  it 'should clear all cookies' do
+    @auth_store.clear!
+    @auth_store.size.should == 0
+  end
+  describe 'for_url' do
+    it 'should return nil if no authorization exists' do
+      @auth_store.for_url(URI('http://php.net')).should be_nil
+    end
+    it 'should create an encoded authorization string' do
+      @auth_store.for_url(uri).should == "YWRtaW46cGFzc3dvcmQ=\n"
+    end
+  end
+end