RubyGems - spidr - Versions diffs - 0.4.1 → 0.5.0 - Mend

spidr 0.4.1 → 0.5.0

Files changed (46)

checksums.yaml +7 -0
data/ChangeLog.md +69 -54
data/Gemfile +9 -5
data/LICENSE.txt +1 -1
data/README.md +34 -26
data/Rakefile +4 -15
data/gemspec.yml +3 -2
data/lib/spidr/agent.rb +101 -44
data/lib/spidr/{actions → agent}/actions.rb +32 -12
data/lib/spidr/{events.rb → agent/events.rb} +4 -8
data/lib/spidr/{filters.rb → agent/filters.rb} +14 -16
data/lib/spidr/{sanitizers.rb → agent/sanitizers.rb} +5 -7
data/lib/spidr/auth_store.rb +2 -2
data/lib/spidr/cookie_jar.rb +2 -2
data/lib/spidr/extensions/uri.rb +28 -16
data/lib/spidr/page.rb +7 -11
data/lib/spidr/{body.rb → page/body.rb} +1 -1
data/lib/spidr/{headers.rb → page/headers.rb} +1 -1
data/lib/spidr/{links.rb → page/links.rb} +43 -7
data/lib/spidr/session_cache.rb +2 -2
data/lib/spidr/spidr.rb +32 -5
data/lib/spidr/version.rb +1 -1
data/spec/agent/actions_spec.rb +60 -0
data/spec/agent/filters_spec.rb +62 -0
data/spec/agent/sanitizers_spec.rb +62 -0
data/spec/agent_spec.rb +13 -13
data/spec/auth_store_spec.rb +17 -17
data/spec/cookie_jar_spec.rb +26 -26
data/spec/extensions/uri_spec.rb +19 -9
data/spec/helpers/history.rb +5 -5
data/spec/helpers/wsoc.rb +2 -2
data/spec/page_examples.rb +4 -4
data/spec/page_spec.rb +28 -25
data/spec/rules_spec.rb +14 -14
data/spec/session_cache.rb +7 -7
data/spec/spidr_spec.rb +10 -10
metadata +37 -51
data/lib/spidr/actions.rb +0 -2
data/lib/spidr/actions/exceptions.rb +0 -4
data/lib/spidr/actions/exceptions/action.rb +0 -9
data/lib/spidr/actions/exceptions/paused.rb +0 -11
data/lib/spidr/actions/exceptions/skip_link.rb +0 -12
data/lib/spidr/actions/exceptions/skip_page.rb +0 -12
data/spec/actions_spec.rb +0 -59
data/spec/filters_spec.rb +0 -61
data/spec/sanitizers_spec.rb +0 -61

data/lib/spidr/{actions → agent}/actions.rb RENAMED

@@ -1,13 +1,33 @@
-require 'spidr/actions/exceptions/paused'
-require 'spidr/actions/exceptions/skip_link'
-require 'spidr/actions/exceptions/skip_page'
 module Spidr
-  #
-  # The {Actions} module adds methods to {Agent} for controlling the
-  # spidering of links.
-  #
-  module Actions
+  class Agent
+    module Actions
+      #
+      # The base {Actions} exception class.
+      #
+      class Action < RuntimeError
+      end
+      #
+      # An {Actions} exception class used to pause a running {Agent}.
+      #
+      class Paused < Action
+      end
+      #
+      # An {Actions} exception class which causes a running {Agent} to
+      # skip a link.
+      #
+      class SkipLink < Action
+      end
+      #
+      # An {Actions} exception class which causes a running {Agent} to
+      # skip a {Page}, and all links within that page.
+      #
+      class SkipPage < Action
+      end
+    end
     #
     # Continue spidering.
     #
@@ -40,7 +60,7 @@ module Spidr
     #
     def pause!
       @paused = true
-      raise(Paused)
+      raise(Actions::Paused)
     end
     #
@@ -61,7 +81,7 @@ module Spidr
     #   and not enqueued or visited.
     #
     def skip_link!
-      raise(SkipLink)
+      raise(Actions::SkipLink)
     end
     #
@@ -71,7 +91,7 @@ module Spidr
     #   Indicates to the agent, that the current page should be skipped.
     #
     def skip_page!
-      raise(SkipPage)
+      raise(Actions::SkipPage)
     end
     protected

data/lib/spidr/{events.rb → agent/events.rb} RENAMED

@@ -1,10 +1,5 @@
 module Spidr
-  #
-  # The {Events} module adds methods to {Agent} for registering
-  # callbacks which will receive URLs, links, headers and pages, when
-  # they are visited.
-  #
-  module Events
+  class Agent
     #
     # Pass each URL from each page visited to the given block.
     #
@@ -526,12 +521,13 @@ module Spidr
     protected
     def initialize_events(options={})
-      @every_url_blocks = []
+      @every_url_blocks        = []
       @every_failed_url_blocks = []
-      @every_url_like_blocks = Hash.new { |hash,key| hash[key] = [] }
+      @every_url_like_blocks   = Hash.new { |hash,key| hash[key] = [] }
       @every_page_blocks = []
       @every_link_blocks = []
     end
   end
 end

data/lib/spidr/{filters.rb → agent/filters.rb} RENAMED

@@ -1,11 +1,8 @@
 require 'spidr/rules'
 module Spidr
-  #
-  # The {Filters} module adds methods to {Agent} for controlling which
-  # URLs the agent will visit.
-  #
-  module Filters
+  class Agent
     # List of acceptable URL schemes to follow
     attr_reader :schemes
@@ -419,24 +416,24 @@ module Spidr
       end
       @host_rules = Rules.new(
-        :accept => options[:hosts],
-        :reject => options[:ignore_hosts]
+        accept: options[:hosts],
+        reject: options[:ignore_hosts]
       )
       @port_rules = Rules.new(
-        :accept => options[:ports],
-        :reject => options[:ignore_ports]
+        accept: options[:ports],
+        reject: options[:ignore_ports]
       )
       @link_rules = Rules.new(
-        :accept => options[:links],
-        :reject => options[:ignore_links]
+        accept: options[:links],
+        reject: options[:ignore_links]
       )
       @url_rules = Rules.new(
-        :accept => options[:urls],
-        :reject => options[:ignore_urls]
+        accept: options[:urls],
+        reject: options[:ignore_urls]
       )
       @ext_rules = Rules.new(
-        :accept => options[:exts],
-        :reject => options[:ignore_exts]
+        accept: options[:exts],
+        reject: options[:ignore_exts]
       )
       if options[:host]
@@ -511,7 +508,7 @@ module Spidr
     #
     # Determines if a given URL should be visited.
     #
-    # @param [URI::HTTP, URI::HTTPS] url
+    # @param [URI::HTTP, URI::HTTPS] link
     #   The URL.
     #
     # @return [Boolean]
@@ -535,5 +532,6 @@ module Spidr
     def visit_ext?(path)
       @ext_rules.accept?(File.extname(path)[1..-1])
     end
   end
 end

data/lib/spidr/{sanitizers.rb → agent/sanitizers.rb} RENAMED

@@ -1,11 +1,8 @@
 require 'uri'
 module Spidr
-  #
-  # The {Sanitizers} module adds methods to {Agent} which control the
-  # sanitation of incoming links.
-  #
-  module Sanitizers
+  class Agent
     # Specifies whether the Agent will strip URI fragments
     attr_accessor :strip_fragments
@@ -27,7 +24,7 @@ module Spidr
       url = URI(url.to_s) unless url.kind_of?(URI)
       url.fragment = nil if @strip_fragments
-      url.query = nil if @strip_query
+      url.query    = nil if @strip_query
       return url
     end
@@ -50,7 +47,8 @@ module Spidr
     #
     def initialize_sanitizers(options={})
       @strip_fragments = options.fetch(:strip_fragments,true)
-      @strip_query = options.fetch(:strip_query,false)
+      @strip_query     = options.fetch(:strip_query,false)
     end
   end
 end

data/lib/spidr/auth_store.rb CHANGED

@@ -57,10 +57,10 @@ module Spidr
     #
     # Add an auth credential to the store for supplied base URL.
     #
-    # @param [URI] url_base
+    # @param [URI] url
     #   A URL pattern to associate with a set of auth credentials.
     #
-    # @param [AuthCredential]
+    # @param [AuthCredential] auth
     #   The auth credential for this URL pattern.
     #
     # @return [AuthCredential]

data/lib/spidr/cookie_jar.rb CHANGED

@@ -18,7 +18,7 @@ module Spidr
     def initialize
       @params = {}
-      @dirty = Set[]
+      @dirty   = Set[]
       @cookies = {}
     end
@@ -147,7 +147,7 @@ module Spidr
     #
     def cookies_for_host(host)
       host_cookies = (@params[host] || {})
-      sub_domains = host.split('.')
+      sub_domains  = host.split('.')
       while sub_domains.length > 2
         sub_domains.shift

data/lib/spidr/extensions/uri.rb CHANGED

@@ -1,4 +1,5 @@
 require 'uri'
+require 'strscan'
 module URI
   #
@@ -26,27 +27,38 @@ module URI
   #   URI.expand_path('/test/../path')
   #   # => "/path"
   #
-  def URI.expand_path(path)
-    dirs = path.split(/\/+/)
+  def self.expand_path(path)
+    if path.start_with?('/')
+      leading_slash, path = path[0,1], path[1..-1]
+    else
+      leading_slash = ''
+    end
-    # append any tailing '/' chars, lost due to String#split
-    dirs << '' if path[-1,1] == '/'
+    if path.end_with?('/')
+      trailing_slash, path = path[-1,1], path[0..-2]
+    else
+      trailing_slash = ''
+    end
-    new_dirs = []
+    scanner = StringScanner.new(path)
+    stack   = []
-    dirs.each do |dir|
-      if dir == '..'
-        new_dirs.pop
-      elsif dir != '.'
-        new_dirs.push(dir)
+    until scanner.eos?
+      if (dir = scanner.scan(/^[^\/]+/))
+        case dir
+        when '..' then stack.pop
+        when '.'  then false
+        else           stack.push(dir)
+        end
+      else
+        scanner.skip(/\/+/)
       end
     end
-    full_path = new_dirs.join('/')
-    # default empty paths to '/'
-    full_path = '/' if full_path.empty?
-    return full_path
+    unless stack.empty?
+      "#{leading_slash}#{stack.join('/')}#{trailing_slash}"
+    else
+      '/'
+    end
   end
 end

data/lib/spidr/page.rb CHANGED

@@ -1,6 +1,6 @@
-require 'spidr/headers'
-require 'spidr/body'
-require 'spidr/links'
+require 'spidr/page/headers'
+require 'spidr/page/body'
+require 'spidr/page/links'
 module Spidr
   #
@@ -8,10 +8,6 @@ module Spidr
   #
   class Page
-    include Headers
-    include Body
-    include Links
     # URL of the page
     attr_reader :url
@@ -27,14 +23,14 @@ module Spidr
     # @param [URI::HTTP] url
     #   The URL of the page.
     #
-    # @param [Net::HTTP::Response] response
+    # @param [Net::HTTPResponse] response
     #   The response from the request for the page.
     #
     def initialize(url,response)
-      @url = url
+      @url      = url
       @response = response
-      @headers = response.to_hash
-      @doc = nil
+      @headers  = response.to_hash
+      @doc      = nil
     end
     #

data/lib/spidr/{body.rb → page/body.rb} RENAMED

@@ -1,7 +1,7 @@
 require 'nokogiri'
 module Spidr
-  module Body
+  class Page
     #
     # The body of the response.
     #

data/lib/spidr/{headers.rb → page/headers.rb} RENAMED

@@ -1,7 +1,7 @@
 require 'set'
 module Spidr
-  module Headers
+  class Page
     # Reserved names used within Cookie strings
     RESERVED_COOKIE_NAMES = Set['path', 'expires', 'domain']

data/lib/spidr/{links.rb → page/links.rb} RENAMED

@@ -2,7 +2,7 @@ require 'spidr/extensions/uri'
 require 'uri'
 module Spidr
-  module Links
+  class Page
     include Enumerable
     #
@@ -100,6 +100,42 @@ module Spidr
       each_redirect.to_a
     end
+    #
+    # Enumerates over every `mailto:` link in the page.
+    #
+    # @yield [link]
+    #   The given block will be passed every `mailto:` link from the page.
+    #
+    # @yieldparam [String] link
+    #   A `mailto:` link from the page.
+    #
+    # @return [Enumerator]
+    #   If no block is given, an enumerator object will be returned.
+    #
+    # @since 0.5.0
+    #
+    def each_mailto
+      return enum_for(:each_mailto) unless block_given?
+      if (html? && doc)
+        doc.search('//a[starts-with(@href,"mailto:")]').each do |a|
+          yield a.get_attribute('href')[7..-1]
+        end
+      end
+    end
+    #
+    # `mailto:` links in the page.
+    #
+    # @return [Array<String>]
+    #   The `mailto:` links found within the page.
+    #
+    # @since 0.5.0
+    #
+    def mailtos
+      each_mailto.to_a
+    end
     #
     # Enumerates over every link in the page.
     #
@@ -124,23 +160,23 @@ module Spidr
       each_redirect(&filter) if is_redirect?
       if (html? && doc)
-        doc.search('a[@href]').each do |a|
+        doc.search('//a[@href]').each do |a|
           filter.call(a.get_attribute('href'))
         end
-        doc.search('frame[@src]').each do |iframe|
+        doc.search('//frame[@src]').each do |iframe|
           filter.call(iframe.get_attribute('src'))
         end
-        doc.search('iframe[@src]').each do |iframe|
+        doc.search('//iframe[@src]').each do |iframe|
           filter.call(iframe.get_attribute('src'))
         end
-        doc.search('link[@href]').each do |link|
+        doc.search('//link[@href]').each do |link|
           filter.call(link.get_attribute('href'))
         end
-        doc.search('script[@src]').each do |script|
+        doc.search('//script[@src]').each do |script|
           filter.call(script.get_attribute('src'))
         end
       end
@@ -213,7 +249,7 @@ module Spidr
         path = new_url.path
         # ensure that paths begin with a leading '/' for URI::FTP
-        if (new_url.scheme == 'ftp' && path[0,1] != '/')
+        if (new_url.scheme == 'ftp' && !path.start_with?('/'))
           path.insert(0,'/')
         end

data/lib/spidr/session_cache.rb CHANGED

@@ -32,7 +32,7 @@ module Spidr
     # @since 0.2.2
     #
     def initialize(proxy=Spidr.proxy)
-      @proxy = proxy
+      @proxy    = proxy
       @sessions = {}
     end
@@ -82,7 +82,7 @@ module Spidr
         ).new(url.host,url.port)
         if url.scheme == 'https'
-          session.use_ssl = true
+          session.use_ssl     = true
           session.verify_mode = OpenSSL::SSL::VERIFY_NONE
           session.start
         end