RubyGems - spidr - Versions diffs - 0.2.1 → 0.2.2 - Mend

spidr 0.2.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (56) hide show

data.tar.gz.sig +0 -0
data/History.rdoc +191 -0
data/Manifest.txt +10 -34
data/{README.txt → README.rdoc} +3 -1
data/Rakefile +6 -4
data/lib/spidr/agent.rb +137 -97
data/lib/spidr/auth_credential.rb +25 -0
data/lib/spidr/auth_store.rb +157 -0
data/lib/spidr/cookie_jar.rb +166 -0
data/lib/spidr/filters.rb +2 -0
data/lib/spidr/page.rb +75 -11
data/lib/spidr/sanitizers.rb +59 -0
data/lib/spidr/session_cache.rb +119 -0
data/lib/spidr/version.rb +1 -1
data/spec/agent_spec.rb +2 -2
data/spec/helpers/history.rb +34 -0
data/spec/helpers/wsoc.rb +83 -0
data/spec/page_examples.rb +5 -1
data/spec/page_spec.rb +30 -0
data/spec/sanitizers_spec.rb +67 -0
data/tasks/yard.rb +1 -1
metadata +24 -40
metadata.gz.sig +0 -0
data/History.txt +0 -167
data/spec/helpers/course.rb +0 -95
data/static/course/absolute/index.html +0 -10
data/static/course/absolute/next.html +0 -9
data/static/course/absolute/start.html +0 -19
data/static/course/empty/index.html +0 -10
data/static/course/empty/start.html +0 -23
data/static/course/fail.html +0 -14
data/static/course/frames/frame.html +0 -15
data/static/course/frames/frame_next.html +0 -9
data/static/course/frames/iframe.html +0 -15
data/static/course/frames/iframe_next.html +0 -9
data/static/course/frames/index.html +0 -10
data/static/course/frames/start.html +0 -15
data/static/course/index.html +0 -10
data/static/course/javascript/index.html +0 -10
data/static/course/javascript/start.html +0 -19
data/static/course/loop/index.html +0 -10
data/static/course/loop/next.html +0 -13
data/static/course/loop/start.html +0 -19
data/static/course/relative/current_directory.html +0 -9
data/static/course/relative/index.html +0 -10
data/static/course/relative/normal.html +0 -9
data/static/course/relative/same_directory.html +0 -9
data/static/course/relative/start.html +0 -27
data/static/course/remote/index.html +0 -10
data/static/course/remote/next.html +0 -9
data/static/course/remote/start.html +0 -27
data/static/course/scripts/course.js +0 -29
data/static/course/scripts/jquery-1.2.6.min.js +0 -32
data/static/course/specs.json +0 -1
data/static/course/start.html +0 -27
data/tasks/course.rb +0 -63

data/lib/spidr/sanitizers.rb ADDED Viewed

@@ -0,0 +1,59 @@
+require 'uri'
+module Spidr
+  module Sanitizers
+    def self.included(base)
+      base.module_eval do
+        # Whether sanitize_url removes the fragment component from URLs
+        attr_accessor :strip_fragments
+        # Whether sanitize_url removes the query component from URLs
+        attr_accessor :strip_query
+      end
+    end
+    #
+    # Initializes the sanitization rules from the given options.
+    #
+    # @param [Hash] options
+    #   Additional options.
+    #
+    # @option options [Boolean] :strip_fragments (true)
+    #   Specifies whether or not to strip the fragment component from URLs.
+    #
+    # @option options [Boolean] :strip_query (false)
+    #   Specifies whether or not to strip the query component from URLs.
+    #
+    # @since 0.2.2
+    #
+    def initialize(options={})
+      @strip_fragments = true # default, kept unless the option key is present
+      if options.has_key?(:strip_fragments) # has_key? so an explicit false/nil overrides the default
+        @strip_fragments = options[:strip_fragments]
+      end
+      @strip_query = (options[:strip_query] || false)
+    end
+    #
+    # Sanitizes a URL according to the strip_fragments and strip_query settings.
+    #
+    # @param [URI::HTTP, URI::HTTPS, String] url
+    #   The URL to be sanitized; it is converted with to_s and re-parsed.
+    #
+    # @return [URI::HTTP, URI::HTTPS]
+    #   A newly parsed URI with the fragment and/or query set to nil.
+    #
+    # @since 0.2.2
+    #
+    def sanitize_url(url)
+      url = URI(url.to_s) # always builds a fresh URI from the string form
+      url.fragment = nil if @strip_fragments
+      url.query = nil if @strip_query
+      return url
+    end
+  end
+end

data/lib/spidr/session_cache.rb ADDED Viewed

@@ -0,0 +1,119 @@
+require 'spidr/spidr'
+require 'net/http'
+module Spidr
+  class SessionCache
+    # Proxy options Hash read when new sessions are created (see #initialize)
+    attr_accessor :proxy
+    #
+    # Creates a new, empty session cache.
+    #
+    # @param [Hash] proxy (Spidr.proxy)
+    #   Proxy options.
+    #
+    # @option proxy [String] :host
+    #   The host the proxy is running on.
+    #
+    # @option proxy [Integer] :port
+    #   The port the proxy is running on.
+    #
+    # @option proxy [String] :user
+    #   The user to authenticate as with the proxy.
+    #
+    # @option proxy [String] :password
+    #   The password to authenticate with.
+    #
+    # @since 0.2.2
+    #
+    def initialize(proxy=Spidr.proxy)
+      @proxy = proxy
+      @sessions = {} # maps [scheme, host, port] to a Net::HTTP object
+    end
+    #
+    # Provides the cached HTTP session for a URL, creating it on first use.
+    #
+    # @param [URI::HTTP, String] url
+    #   The URL which will be requested later.
+    #
+    # @return [Net::HTTP]
+    #   The cached or newly built session; this method does not start it.
+    #
+    def [](url)
+      # coerce anything that is not already a URI into one
+      url = URI(url.to_s) unless url.kind_of?(URI)
+      key = [url.scheme, url.host, url.port] # one session per scheme/host/port
+      unless @sessions[key]
+        session = Net::HTTP::Proxy(
+          @proxy[:host],
+          @proxy[:port],
+          @proxy[:user],
+          @proxy[:password]
+        ).new(url.host,url.port)
+        if url.scheme == 'https' # NOTE(review): OpenSSL is not required in this file; confirm it is loaded
+          session.use_ssl = true
+          session.verify_mode = OpenSSL::SSL::VERIFY_NONE # NOTE(review): disables certificate verification
+        end
+        @sessions[key] = session
+      end
+      return @sessions[key]
+    end
+    #
+    # Finishes and removes the cached session for the URL's scheme, host and port.
+    #
+    # @param [URI::HTTP, String] url
+    #   The URL of the requested session.
+    #
+    # @return [Net::HTTP, nil] the removed session, or nil if none was cached
+    #
+    # @since 0.2.2
+    #
+    def kill!(url)
+      # coerce anything that is not already a URI into one
+      url = URI(url.to_s) unless url.kind_of?(URI)
+      key = [url.scheme, url.host, url.port]
+      if (sess = @sessions[key])
+        begin
+          sess.finish
+        rescue IOError # presumably raised when the session was never started; ignored - verify
+        end
+        @sessions.delete(key)
+      end
+    end
+    #
+    # Finishes every cached session and empties the cache.
+    #
+    # @return [SessionCache]
+    #   The cleared session cache.
+    #
+    # @since 0.2.2
+    #
+    def clear
+      @sessions.each_value do |sess|
+        begin
+          sess.finish
+        rescue IOError # same best-effort handling as in kill!
+          nil
+        end
+      end
+      @sessions.clear
+      return self
+    end
+  end
+end

data/lib/spidr/version.rb CHANGED Viewed

@@ -1,4 +1,4 @@
 module Spidr
   # Spidr version
-  VERSION = '0.2.1'
+  VERSION = '0.2.2'
 end

data/spec/agent_spec.rb CHANGED Viewed

@@ -1,10 +1,10 @@
 require 'spidr/agent'
 require 'spec_helper'
-require 'helpers/course'
+require 'helpers/wsoc'
 describe Agent do
-  include Helpers::Course
+  include Helpers::WSOC
   before(:all) do
     @agent = run_course

data/spec/helpers/history.rb ADDED Viewed

@@ -0,0 +1,34 @@
+module Helpers
+  module History # spec helpers that query @agent's visit history
+    def visited_once?(url) # true when exactly one entry of visited_urls equals url
+      return @agent.visited_urls.select { |visited_url|
+        visited_url == url
+      }.length == 1
+    end
+    def visited_link?(url) # delegates to @agent.visited?
+      @agent.visited?(url)
+    end
+    def visit_failed?(url) # delegates to @agent.failed?
+      @agent.failed?(url)
+    end
+    def should_visit_link(url) # expectation: url was visited
+      visited_link?(url).should == true
+    end
+    def should_ignore_link(url) # expectation: url was not visited
+      visited_link?(url).should == false
+    end
+    def should_visit_once(url) # expectation: url was visited exactly once
+      visited_once?(url).should == true
+    end
+    def should_fail_link(url) # expectation: url is recorded as failed and not as visited
+      visited_link?(url).should == false
+      visit_failed?(url).should == true
+    end
+  end
+end

data/spec/helpers/wsoc.rb ADDED Viewed

@@ -0,0 +1,83 @@
+require 'wsoc/config'
+require 'open-uri'
+require 'json'
+require 'helpers/history'
+module Helpers
+  module WSOC # spec helpers driven by the specs served at SPECS_URL
+    include History
+    SERVER_URL = URI::HTTP.build(
+      :host => (ENV['HOST'] || ::WSOC::Config::DEFAULT_HOST),
+      :port => (ENV['PORT'] || ::WSOC::Config::DEFAULT_PORT) # NOTE(review): ENV['PORT'] is a String when set
+    )
+    SPECS_URL = SERVER_URL.merge(::WSOC::Config::SPECS_PATHS[:json])
+    COURSE_URL = SERVER_URL.merge(::WSOC::Config::COURSE_START_PATH)
+    COURSE_METADATA = {} # filled in by self.included from the 'metadata' entry
+    def self.included(base) # fetches the specs and defines one example per spec on base
+      hash = JSON.parse(open(SPECS_URL).read) # NOTE(review): HTTP fetch at include time; presumably needs the WSOC server running
+      metadata = hash['metadata']
+      specs = hash['specs']
+      if metadata.kind_of?(Hash)
+        COURSE_METADATA.merge!(metadata)
+      end
+      if specs.kind_of?(Array)
+        specs.each do |spec|
+          message = spec['message'].dump # dump gives a quoted literal to splice into the eval'd source
+          url = spec['url'].dump
+          case spec['behavior'] # any other behavior value defines no example
+          when 'visit'
+            base.module_eval %{
+              it #{message} do
+                should_visit_link(#{url})
+              end
+            }
+          when 'ignore'
+            base.module_eval %{
+              it #{message} do
+                should_ignore_link(#{url})
+              end
+            }
+          when 'fail'
+            base.module_eval %{
+              it #{message} do
+                should_fail_link(#{url})
+              end
+            }
+          end
+        end
+      end
+    end
+    def course # the metadata Hash collected in self.included
+      WSOC::COURSE_METADATA
+    end
+    def course_auth_store # NOTE(review): nil when the metadata has no 'auth_store' key
+      course['auth_store']
+    end
+    def run_course # starts an Agent at COURSE_URL and returns the result of Agent.start_at
+      Agent.start_at(COURSE_URL) do |agent|
+        course_auth_store.each do |path,auth| # would raise if course_auth_store is nil
+          agent.authorized.add(
+            COURSE_URL.merge(path),
+            auth['user'],
+            auth['password']
+          )
+        end
+        agent.every_failed_url { |url| puts "[FAILED] #{url}" }
+        agent.every_url { |url| puts url }
+      end
+    end
+  end
+end

data/spec/page_examples.rb CHANGED Viewed

@@ -12,6 +12,10 @@ shared_examples_for "Page" do
   end
   it "should provide transparent access to the response headers" do
-    @page.content_type.should == @page.content_type
+    @page.content_type.should == @page.response['Content-Type']
+  end
+  it "should allow content-types" do
+    @page.content_types.should_not be_empty
   end
 end

data/spec/page_spec.rb CHANGED Viewed

@@ -78,4 +78,34 @@ describe Page do
       @page.title.should be_nil
     end
   end
+  describe "cookies" do
+    before(:all) do
+      @page = get_page('http://twitter.com/login')
+    end
+    it "should provide access to the raw Cookie" do
+      cookie = @page.cookie
+      cookie.should_not be_nil
+      cookie.should_not be_empty
+    end
+    it "should provide access to the Cookies" do
+      cookies = @page.cookies
+      cookies.should_not be_empty
+    end
+    it "should provide access to the key->value pairs within the Cookie" do
+      params = @page.cookie_params
+      params.should_not be_empty
+      params.each do |key,value|
+        key.should_not be_empty
+        value.should_not be_empty
+      end
+    end
+  end
 end

data/spec/sanitizers_spec.rb ADDED Viewed

@@ -0,0 +1,67 @@
+require 'spidr/sanitizers'
+require 'spidr/agent'
+require 'spec_helper'
+describe Sanitizers do
+  describe "sanitize_url" do
+    before(:all) do
+      @agent = Agent.new # NOTE(review): never read; each example builds its own agent
+      @url = 'http://host.com'
+    end
+    it "should sanitize URLs" do
+      agent = Agent.new
+      clean_url = agent.sanitize_url(URI(@url)) # URI argument
+      clean_url.host.should == 'host.com'
+    end
+    it "should sanitize URLs given as Strings" do
+      agent = Agent.new
+      clean_url = agent.sanitize_url(@url) # String argument
+      clean_url.host.should == 'host.com'
+    end
+  end
+  describe "strip_fragments" do
+    before(:all) do
+      @url = URI("http://host.com/page#lol")
+    end
+    it "should strip fragment components by default" do
+      agent = Agent.new
+      clean_url = agent.sanitize_url(@url)
+      clean_url.fragment.should be_nil
+    end
+    it "should allow perserving fragment components" do # NOTE(review): "perserving" is a typo for "preserving"
+      agent = Agent.new(:strip_fragments => false)
+      clean_url = agent.sanitize_url(@url)
+      clean_url.fragment.should == 'lol'
+    end
+  end
+  describe "strip_query" do
+    before(:all) do
+      @url = URI("http://host.com/page?x=1")
+    end
+    it "should not strip query components by default" do
+      agent = Agent.new
+      clean_url = agent.sanitize_url(@url)
+      clean_url.query.should == 'x=1'
+    end
+    it "should allow stripping of query components" do
+      agent = Agent.new(:strip_query => true)
+      clean_url = agent.sanitize_url(@url)
+      clean_url.query.should be_nil
+    end
+  end
+end

data/tasks/yard.rb CHANGED Viewed

@@ -4,7 +4,7 @@ YARD::Rake::YardocTask.new do |t|
   t.files   = ['lib/**/*.rb']
   t.options = [
     '--protected',
-    '--files', 'History.txt',
+    '--files', 'History.rdoc',
     '--title', 'Spidr'
   ]
 end