RubyGems - spidr_epg - Versions diffs - 1.0.0 - Mend

spidr_epg 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (60) hide show

checksums.yaml +15 -0
data/.gitignore +10 -0
data/.rspec +1 -0
data/.yardopts +1 -0
data/ChangeLog.md +291 -0
data/ChangeLog.md~ +291 -0
data/Gemfile +16 -0
data/Gemfile.lock +49 -0
data/Gemfile~ +16 -0
data/LICENSE.txt +20 -0
data/README.md +193 -0
data/README.md~ +190 -0
data/Rakefile +29 -0
data/gemspec.yml +19 -0
data/lib/spidr/actions/actions.rb +83 -0
data/lib/spidr/actions/exceptions/action.rb +9 -0
data/lib/spidr/actions/exceptions/paused.rb +11 -0
data/lib/spidr/actions/exceptions/skip_link.rb +12 -0
data/lib/spidr/actions/exceptions/skip_page.rb +12 -0
data/lib/spidr/actions/exceptions.rb +4 -0
data/lib/spidr/actions.rb +2 -0
data/lib/spidr/agent.rb +866 -0
data/lib/spidr/auth_credential.rb +28 -0
data/lib/spidr/auth_store.rb +161 -0
data/lib/spidr/body.rb +98 -0
data/lib/spidr/cookie_jar.rb +202 -0
data/lib/spidr/events.rb +537 -0
data/lib/spidr/extensions/uri.rb +52 -0
data/lib/spidr/extensions.rb +1 -0
data/lib/spidr/filters.rb +539 -0
data/lib/spidr/headers.rb +370 -0
data/lib/spidr/links.rb +229 -0
data/lib/spidr/page.rb +108 -0
data/lib/spidr/rules.rb +79 -0
data/lib/spidr/sanitizers.rb +56 -0
data/lib/spidr/session_cache.rb +145 -0
data/lib/spidr/spidr.rb +107 -0
data/lib/spidr/version.rb +4 -0
data/lib/spidr/version.rb~ +4 -0
data/lib/spidr.rb +3 -0
data/pkg/spidr-1.0.0.gem +0 -0
data/spec/actions_spec.rb +59 -0
data/spec/agent_spec.rb +81 -0
data/spec/auth_store_spec.rb +85 -0
data/spec/cookie_jar_spec.rb +144 -0
data/spec/extensions/uri_spec.rb +43 -0
data/spec/filters_spec.rb +61 -0
data/spec/helpers/history.rb +34 -0
data/spec/helpers/page.rb +8 -0
data/spec/helpers/wsoc.rb +83 -0
data/spec/page_examples.rb +21 -0
data/spec/page_spec.rb +125 -0
data/spec/rules_spec.rb +45 -0
data/spec/sanitizers_spec.rb +61 -0
data/spec/session_cache.rb +58 -0
data/spec/spec_helper.rb +4 -0
data/spec/spidr_spec.rb +39 -0
data/spidr.gemspec +133 -0
data/spidr.gemspec~ +131 -0
metadata +158 -0

data/spec/cookie_jar_spec.rb ADDED Viewed

@@ -0,0 +1,144 @@
+require 'spidr/cookie_jar'
+require 'spec_helper'
+# Behavioural specs for CookieJar: per-host storage through [] / []=,
+# merging and overriding of cookie params, dirty tracking, and the
+# cookies_for_host / for_host lookups (including parent-domain cookies).
+describe CookieJar do
+  it "should retrieve cookies for the named host" do
+    subject['zerosum.org'] = {'admin' => 'ofcourseiam'}
+    subject['zerosum.org'].should == {'admin' => 'ofcourseiam'}
+  end
+  # NOTE(review): this example has the same body as the one above.
+  it "should add a cookie to the jar" do
+    subject['zerosum.org'] = {'admin' => 'ofcourseiam'}
+    subject['zerosum.org'].should == {'admin' => 'ofcourseiam'}
+  end
+  # A second assignment for the same host is expected to add to the stored
+  # params rather than replace them.
+  it "should merge new cookies into the jar" do
+    subject['zerosum.org'] = {'admin' => 'ofcourseiam'}
+    subject['zerosum.org'] = {'other' => '1'}
+    subject['zerosum.org'].should == {
+      'admin' => 'ofcourseiam',
+      'other' => '1'
+    }
+  end
+  # Re-assigning an existing key is expected to replace its value.
+  it "should override previous cookies in the jar" do
+    subject['zerosum.org'] = {'admin' => 'ofcourseiam'}
+    subject['zerosum.org'] = {'admin' => 'somethingcompletelydifferent'}
+    subject['zerosum.org'].should == {
+      'admin' => 'somethingcompletelydifferent'
+    }
+  end
+  it "should clear all cookies" do
+    subject['zerosum.org'] = {'cookie' => 'foobar'}
+    subject.clear!
+    subject.size.should == 0
+  end
+  # These examples inspect the jar's internal @dirty collection directly.
+  describe "dirty" do
+    # Reads the private @dirty instance variable; the examples only rely on
+    # it responding to include?.
+    let(:dirty) { subject.instance_variable_get('@dirty') }
+    it "should mark a cookie dirty after adding new params" do
+      subject['zerosum.org'] = {'admin' => 'ofcourseiam'}
+      subject['zerosum.org'] = {'other' => '1'}
+      dirty.include?('zerosum.org').should == true
+    end
+    it "should mark a cookie dirty after overriding params" do
+      subject['zerosum.org'] = {'admin' => 'ofcourseiam'}
+      subject['zerosum.org'] = {'admin' => 'nope'}
+      dirty.include?('zerosum.org').should == true
+    end
+    # Calling for_host is expected to re-encode the cookie and drop the host
+    # from @dirty.
+    it "should un-mark a cookie as dirty after re-encoding it" do
+      subject['zerosum.org'] = {'admin' => 'ofcourseiam'}
+      subject['zerosum.org'] = {'admin' => 'nope'}
+      dirty.include?('zerosum.org').should == true
+      subject.for_host('zerosum.org')
+      dirty.include?('zerosum.org').should == false
+    end
+  end
+  # cookies_for_host is expected to return the cookie params as a Hash.
+  describe "cookies_for_host" do
+    it "should return an empty Hash for unknown hosts" do
+      subject.cookies_for_host('lol.com').should be_empty
+    end
+    it "should return an empty Hash for hosts with no cookie params" do
+      subject['lol.com'] = {}
+      subject.cookies_for_host('lol.com').should be_empty
+    end
+    it "should return cookie parameters for the host" do
+      subject['zerosum.org'] = {'admin' => 'ofcourseiam'}
+      subject['zerosum.org'] = {'other' => '1'}
+      cookie = subject.cookies_for_host('zerosum.org')
+      cookie['admin'].should == 'ofcourseiam'
+      cookie['other'].should == '1'
+    end
+    # Params stored for 'zerosum.org' must also be visible when asking for
+    # 'sub.zerosum.org'.
+    it "should include cookies for the parent domain" do
+      subject['zerosum.org'] = {'admin' => 'ofcourseiam'}
+      subject['sub.zerosum.org'] = {'other' => '1'}
+      cookie = subject.cookies_for_host('sub.zerosum.org')
+      cookie['admin'].should == 'ofcourseiam'
+      cookie['other'].should == '1'
+    end
+  end
+  # for_host is expected to return the encoded cookie String ("k=v; k2=v2"),
+  # or nil when there is nothing to send.
+  describe "for_host" do
+    it "should return nil for unknown hosts" do
+      subject.for_host('lol.com').should be_nil
+    end
+    it "should return nil for hosts with no cookie params" do
+      subject['lol.com'] = {}
+      subject.for_host('lol.com').should be_nil
+    end
+    it "should encode single cookie params" do
+      subject['zerosum.org'] = {'admin' => 'ofcourseiam'}
+      subject.for_host('zerosum.org').should == 'admin=ofcourseiam'
+    end
+    # Checks the parts individually so the example does not depend on the
+    # order in which params are encoded.
+    it "should encode multiple cookie params" do
+      subject['zerosum.org'] = {'admin' => 'ofcourseiam'}
+      subject['zerosum.org'] = {'other' => '1'}
+      cookie = subject.for_host('zerosum.org')
+      cookie.should include('admin=ofcourseiam')
+      cookie.should include('; ')
+      cookie.should include('other=1')
+    end
+    it "should include cookies for the parent domain" do
+      subject['zerosum.org'] = {'admin' => 'ofcourseiam'}
+      subject['sub.zerosum.org'] = {'other' => '1'}
+      cookie = subject.for_host('sub.zerosum.org')
+      cookie.should include('admin=ofcourseiam')
+      cookie.should include('; ')
+      cookie.should include('other=1')
+    end
+  end
+end

data/spec/extensions/uri_spec.rb ADDED Viewed

@@ -0,0 +1,43 @@
+require 'spidr/extensions/uri'
+require 'spec_helper'
+# Specs for URI.expand_path: normalising a path String by collapsing
+# repeated '/', dropping '.' segments and resolving '..' segments.
+describe URI do
+  describe "expand_path" do
+    it "should preserve single directory paths" do
+      URI.expand_path('path').should == 'path'
+    end
+    it "should preserve trailing '/'" do
+      URI.expand_path('test/path/').should == 'test/path/'
+    end
+    it "should remove multiple '/' characters" do
+      URI.expand_path('///test///path///').should == '/test/path/'
+    end
+    it "should remove '.' directories from the path" do
+      URI.expand_path('test/./path').should == 'test/path'
+    end
+    it "should handle '..' directories properly" do
+      URI.expand_path('test/../path').should == 'path'
+    end
+    # More '..' segments than there are directories must stop at the root
+    # instead of escaping above it.
+    it "should limit the number of '..' directories resolved" do
+      URI.expand_path('/test/../../../..').should == '/'
+    end
+    it "should preserve absolute paths" do
+      URI.expand_path('/test/path').should == '/test/path'
+    end
+    it "should preserve the root path" do
+      URI.expand_path('/').should == '/'
+    end
+    it "should default empty paths to the root path" do
+      URI.expand_path('').should == '/'
+    end
+  end
+end

data/spec/filters_spec.rb ADDED Viewed

@@ -0,0 +1,61 @@
+require 'spidr/filters'
+require 'spidr/agent'
+require 'spec_helper'
+# Specs for the filter options of Agent: every option given to Agent.new
+# (or assigned through a writer) must be readable back through the matching
+# accessor.
+describe Filters do
+  # Schemes assigned as Symbols are expected to be read back as Strings.
+  it "should allow setting the acceptable schemes" do
+    agent = Agent.new
+    agent.schemes = [:http]
+    agent.schemes.should == ['http']
+  end
+  it "should provide the hosts that will be visited" do
+    agent = Agent.new(:hosts => ['spidr.rubyforge.org'])
+    agent.visit_hosts.should == ['spidr.rubyforge.org']
+  end
+  it "should provide the hosts that will not be visited" do
+    agent = Agent.new(:ignore_hosts => ['example.com'])
+    agent.ignore_hosts.should == ['example.com']
+  end
+  it "should provide the ports that will be visited" do
+    agent = Agent.new(:ports => [80, 443, 8000])
+    agent.visit_ports.should == [80, 443, 8000]
+  end
+  it "should provide the ports that will not be visited" do
+    agent = Agent.new(:ignore_ports => [8000, 8080])
+    agent.ignore_ports.should == [8000, 8080]
+  end
+  it "should provide the links that will be visited" do
+    agent = Agent.new(:links => ['index.php'])
+    agent.visit_links.should == ['index.php']
+  end
+  # Patterns may be Regexps as well as Strings.
+  it "should provide the links that will not be visited" do
+    agent = Agent.new(:ignore_links => [/login/])
+    agent.ignore_links.should == [/login/]
+  end
+  it "should provide the exts that will be visited" do
+    agent = Agent.new(:exts => ['htm'])
+    agent.visit_exts.should == ['htm']
+  end
+  it "should provide the exts that will not be visited" do
+    agent = Agent.new(:ignore_exts => ['cfm'])
+    agent.ignore_exts.should == ['cfm']
+  end
+end

data/spec/helpers/history.rb ADDED Viewed

@@ -0,0 +1,34 @@
+module Helpers
+  module History
+    def visited_once?(url)
+      return @agent.visited_urls.select { |visited_url|
+        visited_url == url
+      }.length == 1
+    end
+    def visited_link?(url)
+      @agent.visited?(url)
+    end
+    def visit_failed?(url)
+      @agent.failed?(url)
+    end
+    def should_visit_link(url)
+      visited_link?(url).should == true
+    end
+    def should_ignore_link(url)
+      visited_link?(url).should == false
+    end
+    def should_visit_once(url)
+      visited_once?(url).should == true
+    end
+    def should_fail_link(url)
+      visited_link?(url).should == false
+      visit_failed?(url).should == true
+    end
+  end
+end

data/spec/helpers/page.rb ADDED Viewed

@@ -0,0 +1,8 @@
+require 'net/http'
+require 'uri'
+def get_page(url)
+  url = URI(url.to_s)
+  return Spidr::Page.new(url,Net::HTTP.get_response(url))
+end

data/spec/helpers/wsoc.rb ADDED Viewed

@@ -0,0 +1,83 @@
+require 'wsoc/config'
+require 'open-uri'
+require 'json'
+require 'helpers/history'
+module Helpers
+  module WSOC
+    include History
+    SERVER_URL = URI::HTTP.build(
+      :host => (ENV['HOST'] || ::WSOC::Config::DEFAULT_HOST),
+      :port => (ENV['PORT'] || ::WSOC::Config::DEFAULT_PORT)
+    )
+    SPECS_URL = SERVER_URL.merge(::WSOC::Config::SPECS_PATHS[:json])
+    COURSE_URL = SERVER_URL.merge(::WSOC::Config::COURSE_START_PATH)
+    COURSE_METADATA = {}
+    def self.included(base)
+      hash = JSON.parse(open(SPECS_URL).read)
+      metadata = hash['metadata']
+      specs = hash['specs']
+      if metadata.kind_of?(Hash)
+        COURSE_METADATA.merge!(metadata)
+      end
+      if specs.kind_of?(Array)
+        specs.each do |spec|
+          message = spec['message'].dump
+          url = spec['url'].dump
+          case spec['behavior']
+          when 'visit'
+            base.module_eval %{
+              it #{message} do
+                should_visit_link(#{url})
+              end
+            }
+          when 'ignore'
+            base.module_eval %{
+              it #{message} do
+                should_ignore_link(#{url})
+              end
+            }
+          when 'fail'
+            base.module_eval %{
+              it #{message} do
+                should_fail_link(#{url})
+              end
+            }
+          end
+        end
+      end
+    end
+    def course
+      WSOC::COURSE_METADATA
+    end
+    def course_auth_store
+      course['auth_store']
+    end
+    def run_course
+      Spidr::Agent.start_at(COURSE_URL) do |agent|
+        course_auth_store.each do |path,auth|
+          agent.authorized.add(
+            COURSE_URL.merge(path),
+            auth['user'],
+            auth['password']
+          )
+        end
+        agent.every_failed_url { |url| puts "[FAILED] #{url}" }
+        agent.every_url { |url| puts url }
+      end
+    end
+  end
+end

data/spec/page_examples.rb ADDED Viewed

@@ -0,0 +1,21 @@
+require 'spidr/page'
+require 'spec_helper'
+# Shared examples for any group describing a fetched page. They read @page,
+# so the including group is expected to assign it beforehand.
+shared_examples_for "Page" do
+  it "should have a status code" do
+    @page.code.should be_integer
+  end
+  it "should have a body" do
+    @page.body.should_not be_empty
+  end
+  # Header accessors on the page must agree with the raw response headers.
+  it "should provide transparent access to the response headers" do
+    @page.content_type.should == @page.response['Content-Type']
+  end
+  it "should allow content-types" do
+    @page.content_types.should_not be_empty
+  end
+end

data/spec/page_spec.rb ADDED Viewed

@@ -0,0 +1,125 @@
+require 'spidr/page'
+require 'spec_helper'
+require 'page_examples'
+require 'helpers/page'
+describe Page do
+  describe "html" do
+    before(:all) do
+      @page = get_page('http://spidr.rubyforge.org/course/start.html')
+    end
+    it_should_behave_like "Page"
+    it "should be OK" do
+      @page.should be_ok
+    end
+    it "should have a content-type" do
+      @page.content_type.should include('text/html')
+    end
+    it "should be a html page" do
+      @page.should be_html
+    end
+    it "should have provide a document" do
+      @page.doc.class.should == Nokogiri::HTML::Document
+    end
+    it "should allow searching the document" do
+      @page.doc.search('//p').length.should == 2
+      @page.doc.at('//p[2]').inner_text.should == 'Ready! Set! Go!'
+    end
+    it "should have a title" do
+      @page.title.should == 'Spidr :: Web-Spider Obstacle Course :: Start'
+    end
+    it "should have links" do
+      @page.links.should_not be_empty
+    end
+  end
+  describe "txt" do
+    before(:all) do
+      @page = get_page('http://www.ruby-lang.org/en/LICENSE.txt')
+    end
+    it_should_behave_like "Page"
+    it "should be OK" do
+      @page.should be_ok
+    end
+    it "should have a content-type" do
+      @page.content_type.should include('text/plain')
+    end
+    it "should be a txt page" do
+      @page.should be_txt
+    end
+    it "should not have provide a document" do
+      @page.doc.should be_nil
+    end
+    it "should not allow searching the document" do
+      @page.search('//p').should be_empty
+      @page.at('//p').should be_nil
+    end
+    it "should not have links" do
+      @page.links.should be_empty
+    end
+    it "should not have a title" do
+      @page.title.should be_nil
+    end
+  end
+  describe "redirects" do
+    before(:all) do
+      @page = get_page('http://spidr.rubyforge.org/course/start.html')
+      @page.stub!(:body).and_return('<meta HTTP-EQUIV="REFRESH" content="0; url=http://spidr.rubyforge.org/redirected">')
+    end
+    it "should provide access to page-level redirects" do
+      @page.redirects_to.should == ['http://spidr.rubyforge.org/redirected']
+    end
+    it "should include meta refresh redirects in the list of links" do
+      @page.links.should include('http://spidr.rubyforge.org/redirected')
+    end
+  end
+  describe "cookies" do
+    before(:all) do
+      @page = get_page('http://twitter.com/login')
+    end
+    it "should provide access to the raw Cookie" do
+      cookie = @page.cookie
+      cookie.should_not be_nil
+      cookie.should_not be_empty
+    end
+    it "should provide access to the Cookies" do
+      cookies = @page.cookies
+      cookies.should_not be_empty
+    end
+    it "should provide access to the key->value pairs within the Cookie" do
+      params = @page.cookie_params
+      params.should_not be_empty
+      params.each do |key,value|
+        key.should_not be_empty
+      end
+    end
+  end
+end

data/spec/rules_spec.rb ADDED Viewed

@@ -0,0 +1,45 @@
+require 'spidr/rules'
+require 'spec_helper'
+# Specs for Rules#accept? with :accept and :reject pattern lists made of
+# literal values, Regexps and lambdas.
+describe Rules do
+  # The subject is the class itself; each example builds its own instance.
+  subject { Rules }
+  it "should accept data based on acceptance data" do
+    rules = subject.new(:accept => [1])
+    rules.accept?(1).should == true
+  end
+  it "should accept data based on acceptance regexps" do
+    rules = subject.new(:accept => [/1/])
+    rules.accept?('1').should == true
+  end
+  # A Regexp pattern must also match data that is not a String (here the
+  # Integer 1).
+  it "should match non-Strings using acceptance regexps" do
+    rules = subject.new(:accept => [/1/])
+    rules.accept?(1).should == true
+  end
+  it "should accept data using acceptance lambdas" do
+    rules = subject.new(:accept => [lambda { |data| data > 2 }])
+    rules.accept?(3).should == true
+  end
+  it "should reject data that does not match any acceptance patterns" do
+    rules = subject.new(:accept => [1, 2, 3])
+    rules.accept?(2).should == true
+    rules.accept?(4).should == false
+  end
+  it "should accept data that does not match any rejection patterns" do
+    rules = subject.new(:reject => [1, 2, 3])
+    rules.accept?(2).should == false
+    rules.accept?(4).should == true
+  end
+end

data/spec/sanitizers_spec.rb ADDED Viewed

@@ -0,0 +1,61 @@
+require 'spidr/sanitizers'
+require 'spidr/agent'
+require 'spec_helper'
+# Specs for URL sanitizing through Agent#sanitize_url and the
+# :strip_fragments / :strip_query options of Agent.new.
+describe Sanitizers do
+  describe "sanitize_url" do
+    let(:url) { 'http://host.com' }
+    # NOTE(review): @agent is not referenced by the examples in this group,
+    # which each build their own Agent; confirm whether this hook is needed.
+    before(:all) { @agent = Agent.new }
+    it "should sanitize URLs" do
+      agent = Agent.new
+      clean_url = agent.sanitize_url(URI(url))
+      clean_url.host.should == 'host.com'
+    end
+    # A String argument must produce an object answering #host as well.
+    it "should sanitize URLs given as Strings" do
+      agent = Agent.new
+      clean_url = agent.sanitize_url(url)
+      clean_url.host.should == 'host.com'
+    end
+  end
+  describe "strip_fragments" do
+    let(:url) { URI("http://host.com/page#lol") }
+    it "should strip fragment components by default" do
+      agent = Agent.new
+      clean_url = agent.sanitize_url(url)
+      clean_url.fragment.should be_nil
+    end
+    it "should allow perserving fragment components" do
+      agent = Agent.new(:strip_fragments => false)
+      clean_url = agent.sanitize_url(url)
+      clean_url.fragment.should == 'lol'
+    end
+  end
+  describe "strip_query" do
+    let(:url) { URI("http://host.com/page?x=1") }
+    it "should not strip query components by default" do
+      agent = Agent.new
+      clean_url = agent.sanitize_url(url)
+      clean_url.query.should == 'x=1'
+    end
+    it "should allow stripping of query components" do
+      agent = Agent.new(:strip_query => true)
+      clean_url = agent.sanitize_url(url)
+      clean_url.query.should be_nil
+    end
+  end
+end

data/spec/session_cache.rb ADDED Viewed

@@ -0,0 +1,58 @@
+require 'spidr/session_cache'
+require 'spec_helper'
+# Specs for SessionCache: sessions are looked up by URI with [], queried
+# with active?, and removed with kill! / clear.
+describe SessionCache do
+  describe "empty" do
+    before(:all) do
+      @sessions = SessionCache.new
+    end
+    it "should not have any active sessions" do
+      @sessions.should_not be_active(URI('http://example.com/'))
+    end
+    it "should start new sessions on-demand" do
+      @sessions[URI('http://example.com/')].should_not be_nil
+    end
+    # Clears the cache once all examples of the group have run.
+    after(:all) do
+      @sessions.clear
+    end
+  end
+  describe "not-empty" do
+    before(:all) do
+      @url = URI('http://example.com/')
+      @sessions = SessionCache.new
+      # Looking the URL up is what starts its session (see the
+      # "should start new sessions on-demand" examples).
+      @sessions[@url]
+    end
+    it "should have active sessions" do
+      @sessions.should be_active(@url)
+    end
+    it "should provide access to sessions" do
+      @sessions[@url].should_not be_nil
+    end
+    it "should start new sessions on-demand" do
+      url2 = URI('http://www.w3c.org/')
+      @sessions[url2].should_not be_nil
+    end
+    it "should be able to kill sessions" do
+      url2 = URI('http://www.w3c.org/')
+      @sessions[url2].should_not be_nil
+      @sessions.kill!(url2)
+      @sessions.should_not be_active(url2)
+    end
+    # Clears the cache once all examples of the group have run.
+    after(:all) do
+      @sessions.clear
+    end
+  end
+end

data/spec/spec_helper.rb ADDED Viewed

@@ -0,0 +1,4 @@
+require 'rspec'
+require 'spidr/version'
+include Spidr