RubyGems - spidr - Versions diffs - 0.3.1 → 0.3.2 - Mend

spidr 0.3.1 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

data/ChangeLog.md CHANGED

@@ -1,3 +1,12 @@
+### 0.3.2 / 2011-06-20
+* Added separate initialize methods for {Spidr::Actions}, {Spidr::Events},
+  {Spidr::Filters} and {Spidr::Sanitizers}.
+* Aliased {Spidr::Events#urls_like} to {Spidr::Events#every_url_like}.
+* Reduce usage of `self.included` and `module_eval`.
+* Reduce usage of nested-blocks.
+* Reduce usage of `return`.
 ### 0.3.1 / 2011-04-22
 * Require `set` in `spidr/headers.rb`.

data/Gemfile CHANGED

@@ -3,7 +3,7 @@ source 'https://rubygems.org'
 gemspec
 group :development do
-  gem 'rake',         '~> 0.8.7'
+  gem 'rake',         '~> 0.8'
   gem 'ore-tasks',    '~> 0.4'
   gem 'rspec',        '~> 2.4'

data/lib/spidr/actions/actions.rb CHANGED

@@ -8,12 +8,6 @@ module Spidr
   # spidering of links.
   #
   module Actions
-    def initialize(options={})
-      @paused = false
-      super(options)
-    end
     #
     # Continue spidering.
     #
@@ -79,5 +73,11 @@ module Spidr
     def skip_page!
       raise(SkipPage)
     end
+    protected
+    def initialize_actions(options={})
+      @paused = false
+    end
   end
 end

data/lib/spidr/agent.rb CHANGED

@@ -115,15 +115,15 @@ module Spidr
         @host_headers.merge!(options[:host_headers])
       end
-      @user_agent = (options[:user_agent] || Spidr.user_agent)
+      @user_agent = options.fetch(:user_agent,Spidr.user_agent)
       @referer = options[:referer]
-      @sessions = SessionCache.new(options[:proxy] || Spidr.proxy)
+      @sessions = SessionCache.new(options.fetch(:proxy,Spidr.proxy))
       @cookies = CookieJar.new
       @authorized = AuthStore.new
       @running = false
-      @delay = (options[:delay] || 0)
+      @delay = options.fetch(:delay,0)
       @history = Set[]
       @failures = Set[]
       @queue = []
@@ -131,7 +131,10 @@ module Spidr
       @levels = Hash.new(0)
       @max_depth = options[:max_depth]
-      super(options)
+      initialize_sanitizers(options)
+      initialize_filters(options)
+      initialize_actions(options)
+      initialize_events(options)
       yield self if block_given?
     end
@@ -152,19 +155,16 @@ module Spidr
     # @yieldparam [Agent] agent
     #   The newly created agent.
     #
-    def self.start_at(url,options={})
-      self.new(options) do |spider|
-        yield spider if block_given?
-        spider.start_at(url)
-      end
+    def self.start_at(url,options={},&block)
+      agent = new(options,&block)
+      agent.start_at(url)
     end
     #
-    # Creates a new agent and spiders the given host.
+    # Creates a new agent and spiders the web-site located at the given URL.
     #
-    # @param [String]
-    #   The host-name to spider.
+    # @param [URI::HTTP, String] url
+    #   The web-site to spider.
     #
     # @param [Hash] options
     #   Additional options. See {Agent#initialize}.
@@ -176,19 +176,18 @@ module Spidr
     # @yieldparam [Agent] agent
     #   The newly created agent.
     #
-    def self.host(name,options={})
-      self.new(options.merge(:host => name)) do |spider|
-        yield spider if block_given?
+    def self.site(url,options={},&block)
+      url = URI(url.to_s) unless url.kind_of?(URI)
-        spider.start_at("http://#{name}/")
-      end
+      agent = new(options.merge(:host => url.host),&block)
+      agent.start_at(url)
     end
     #
-    # Creates a new agent and spiders the web-site located at the given URL.
+    # Creates a new agent and spiders the given host.
     #
-    # @param [URI::HTTP, String] url
-    #   The web-site to spider.
+    # @param [String]
+    #   The host-name to spider.
     #
     # @param [Hash] options
     #   Additional options. See {Agent#initialize}.
@@ -200,14 +199,8 @@ module Spidr
     # @yieldparam [Agent] agent
     #   The newly created agent.
     #
-    def self.site(url,options={})
-      url = URI(url.to_s)
-      return self.new(options.merge(:host => url.host)) do |spider|
-        yield spider if block_given?
-        spider.start_at(url)
-      end
+    def self.host(name,options={},&block)
+      site(URI::HTTP.build(:host => name, :path => '/'),options,&block)
     end
     #
@@ -234,7 +227,6 @@ module Spidr
     #
     def start_at(url,&block)
       enqueue(url)
       return run(&block)
     end
@@ -261,7 +253,6 @@ module Spidr
       end
       @running = false
       @sessions.clear
       return self
     end
@@ -387,10 +378,10 @@ module Spidr
       new_failures.each do |url|
         @failures << unless url.kind_of?(URI)
-                    URI(url.to_s)
-                  else
-                    url
-                  end
+                       URI(url.to_s)
+                     else
+                       url
+                     end
       end
       return @failures
@@ -471,7 +462,7 @@ module Spidr
         begin
           @every_url_blocks.each { |url_block| url_block.call(url) }
-          @urls_like_blocks.each do |pattern,url_blocks|
+          @every_url_like_blocks.each do |pattern,url_blocks|
             match = case pattern
                     when Regexp
                       link =~ pattern
@@ -653,12 +644,11 @@ module Spidr
     def prepare_request(url,&block)
       host = url.host
       port = url.port
-      unless url.path.empty?
-        path = url.path
-      else
-        path = '/'
-      end
+      path = unless url.path.empty?
+               url.path
+             else
+               '/'
+             end
       # append the URL query to the path
       path += "?#{url.query}" if url.query
@@ -724,7 +714,7 @@ module Spidr
     #   Specifies whether the given URL should be visited.
     #
     def visit?(url)
-      !(visited?(url)) &&
+      !visited?(url) &&
        visit_scheme?(url.scheme) &&
        visit_host?(url.host) &&
        visit_port?(url.port) &&

data/lib/spidr/body.rb CHANGED

@@ -24,16 +24,15 @@ module Spidr
     # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/HTML/Document.html
     #
     def doc
-      return nil if body.empty?
-      begin
-        if html?
-          return @doc ||= Nokogiri::HTML(body)
-        elsif (xml? || xsl? || rss? || atom?)
-          return @doc ||= Nokogiri::XML(body)
+      unless body.empty?
+        begin
+          if html?
+            @doc ||= Nokogiri::HTML(body)
+          elsif (rss? || atom? || xml? || xsl?)
+            @doc ||= Nokogiri::XML(body)
+          end
+        rescue
         end
-      rescue
-        return nil
       end
     end

data/lib/spidr/events.rb CHANGED

@@ -5,17 +5,6 @@ module Spidr
   # they are visited.
   #
   module Events
-    def initialize(options={})
-      super(options)
-      @every_url_blocks = []
-      @every_failed_url_blocks = []
-      @urls_like_blocks = Hash.new { |hash,key| hash[key] = [] }
-      @every_page_blocks = []
-      @every_link_blocks = []
-    end
     #
     # Pass each URL from each page visited to the given block.
     #
@@ -57,11 +46,20 @@ module Spidr
     # @yieldparam [URI::HTTP] url
     #   A matching URL.
     #
-    def urls_like(pattern,&block)
-      @urls_like_blocks[pattern] << block
+    # @since 0.3.2
+    #
+    def every_url_like(pattern,&block)
+      @every_url_like_blocks[pattern] << block
       return self
     end
+    #
+    # @see #every_url_like
+    #
+    def urls_like(pattern,&block)
+      every_url_like(pattern,&block)
+    end
     #
     # Pass the headers from every response the agent receives to a given
     # block.
@@ -524,5 +522,16 @@ module Spidr
       @every_link_blocks << block
       return self
     end
+    protected
+    def initialize_events(options={})
+      @every_url_blocks = []
+      @every_failed_url_blocks = []
+      @every_url_like_blocks = Hash.new { |hash,key| hash[key] = [] }
+      @every_page_blocks = []
+      @every_link_blocks = []
+    end
   end
 end

data/lib/spidr/filters.rb CHANGED

@@ -6,110 +6,8 @@ module Spidr
   # URLs the agent will visit.
   #
   module Filters
-    def self.included(base)
-      base.module_eval do
-        # List of acceptable URL schemes to follow
-        attr_reader :schemes
-      end
-    end
-    #
-    # Initializes filtering rules.
-    #
-    # @param [Hash] options
-    #   Additional options.
-    #
-    # @option options [Array] :schemes (['http', 'https'])
-    #   The list of acceptable URI schemes to visit.
-    #   The `https` scheme will be ignored if `net/https` cannot be loaded.
-    #
-    # @option options [String] :host
-    #   The host-name to visit.
-    #
-    # @option options [Array<String, Regexp, Proc>] :hosts
-    #   The patterns which match the host-names to visit.
-    #
-    # @option options [Array<String, Regexp, Proc>] :ignore_hosts
-    #   The patterns which match the host-names to not visit.
-    #
-    # @option options [Array<Integer, Regexp, Proc>] :ports
-    #   The patterns which match the ports to visit.
-    #
-    # @option options [Array<Integer, Regexp, Proc>] :ignore_ports
-    #   The patterns which match the ports to not visit.
-    #
-    # @option options [Array<String, Regexp, Proc>] :links
-    #   The patterns which match the links to visit.
-    #
-    # @option options [Array<String, Regexp, Proc>] :ignore_links
-    #   The patterns which match the links to not visit.
-    #
-    # @option options [Array<String, Regexp, Proc>] :urls
-    #   The patterns which match the URLs to visit.
-    #
-    # @option options [Array<String, Regexp, Proc>] :ignore_urls
-    #   The patterns which match the URLs to not visit.
-    #
-    # @option options [Array<String, Regexp, Proc>] :exts
-    #   The patterns which match the URI path extensions to visit.
-    #
-    # @option options [Array<String, Regexp, Proc>] :ignore_exts
-    #   The patterns which match the URI path extensions to not visit.
-    #
-    def initialize(options={})
-      super(options)
-      @schemes = []
-      if options[:schemes]
-        @schemes += options[:schemes]
-      else
-        @schemes << 'http'
-        begin
-          require 'net/https'
-          @schemes << 'https'
-        rescue Gem::LoadError => e
-          raise(e)
-        rescue ::LoadError
-          STDERR.puts "Warning: cannot load 'net/https', https support disabled"
-        end
-      end
-      @host_rules = Rules.new(
-        :accept => options[:hosts],
-        :reject => options[:ignore_hosts]
-      )
-      @port_rules = Rules.new(
-        :accept => options[:ports],
-        :reject => options[:ignore_ports]
-      )
-      @link_rules = Rules.new(
-        :accept => options[:links],
-        :reject => options[:ignore_links]
-      )
-      @url_rules = Rules.new(
-        :accept => options[:urls],
-        :reject => options[:ignore_urls]
-      )
-      @ext_rules = Rules.new(
-        :accept => options[:exts],
-        :reject => options[:ignore_exts]
-      )
-      if options[:host]
-        visit_hosts_like(options[:host])
-      end
-      if options[:queue]
-        self.queue = options[:queue]
-      end
-      if options[:history]
-        self.history = options[:history]
-      end
-    end
+    # List of acceptable URL schemes to follow
+    attr_reader :schemes
     #
     # Sets the list of acceptable URL schemes to visit.
@@ -458,6 +356,102 @@ module Spidr
     protected
+    #
+    # Initializes filtering rules.
+    #
+    # @param [Hash] options
+    #   Additional options.
+    #
+    # @option options [Array] :schemes (['http', 'https'])
+    #   The list of acceptable URI schemes to visit.
+    #   The `https` scheme will be ignored if `net/https` cannot be loaded.
+    #
+    # @option options [String] :host
+    #   The host-name to visit.
+    #
+    # @option options [Array<String, Regexp, Proc>] :hosts
+    #   The patterns which match the host-names to visit.
+    #
+    # @option options [Array<String, Regexp, Proc>] :ignore_hosts
+    #   The patterns which match the host-names to not visit.
+    #
+    # @option options [Array<Integer, Regexp, Proc>] :ports
+    #   The patterns which match the ports to visit.
+    #
+    # @option options [Array<Integer, Regexp, Proc>] :ignore_ports
+    #   The patterns which match the ports to not visit.
+    #
+    # @option options [Array<String, Regexp, Proc>] :links
+    #   The patterns which match the links to visit.
+    #
+    # @option options [Array<String, Regexp, Proc>] :ignore_links
+    #   The patterns which match the links to not visit.
+    #
+    # @option options [Array<String, Regexp, Proc>] :urls
+    #   The patterns which match the URLs to visit.
+    #
+    # @option options [Array<String, Regexp, Proc>] :ignore_urls
+    #   The patterns which match the URLs to not visit.
+    #
+    # @option options [Array<String, Regexp, Proc>] :exts
+    #   The patterns which match the URI path extensions to visit.
+    #
+    # @option options [Array<String, Regexp, Proc>] :ignore_exts
+    #   The patterns which match the URI path extensions to not visit.
+    #
+    def initialize_filters(options={})
+      @schemes = []
+      if options[:schemes]
+        @schemes += options[:schemes]
+      else
+        @schemes << 'http'
+        begin
+          require 'net/https'
+          @schemes << 'https'
+        rescue Gem::LoadError => e
+          raise(e)
+        rescue ::LoadError
+          STDERR.puts "Warning: cannot load 'net/https', https support disabled"
+        end
+      end
+      @host_rules = Rules.new(
+        :accept => options[:hosts],
+        :reject => options[:ignore_hosts]
+      )
+      @port_rules = Rules.new(
+        :accept => options[:ports],
+        :reject => options[:ignore_ports]
+      )
+      @link_rules = Rules.new(
+        :accept => options[:links],
+        :reject => options[:ignore_links]
+      )
+      @url_rules = Rules.new(
+        :accept => options[:urls],
+        :reject => options[:ignore_urls]
+      )
+      @ext_rules = Rules.new(
+        :accept => options[:exts],
+        :reject => options[:ignore_exts]
+      )
+      if options[:host]
+        visit_hosts_like(options[:host])
+      end
+      if options[:queue]
+        self.queue = options[:queue]
+      end
+      if options[:history]
+        self.history = options[:history]
+      end
+    end
     #
     # Determines if a given URI scheme should be visited.
     #

data/lib/spidr/headers.rb CHANGED

@@ -295,9 +295,9 @@ module Spidr
         cookie.split('; ').each do |key_value|
           key, value = key_value.split('=',2)
-          next if RESERVED_COOKIE_NAMES.include?(key)
-          params[key] = (value || '')
+          unless RESERVED_COOKIE_NAMES.include?(key)
+            params[key] = (value || '')
+          end
         end
       end

data/lib/spidr/links.rb CHANGED

@@ -85,7 +85,7 @@ module Spidr
         location.each(&block)
       else
         # usually the location header contains a single String
-        block.call(location)
+        yield location
       end
     end

data/lib/spidr/rules.rb CHANGED

@@ -40,17 +40,9 @@ module Spidr
     #
     def accept?(data)
       unless @accept.empty?
-        @accept.each do |rule|
-          return true if test_data(data,rule)
-        end
-        return false
+        @accept.any? { |rule| test_data(data,rule) }
       else
-        @reject.each do |rule|
-          return false if test_data(data,rule)
-        end
-        return true
+        !@reject.any? { |rule| test_data(data,rule) }
       end
     end
@@ -62,7 +54,7 @@ module Spidr
     #   rejection patterns.
     #
     def reject?(data)
-      !(accept?(data))
+      !accept?(data)
     end
     protected
@@ -75,11 +67,11 @@ module Spidr
     #
     def test_data(data,rule)
       if rule.kind_of?(Proc)
-        return (rule.call(data) == true)
+        rule.call(data) == true
       elsif rule.kind_of?(Regexp)
-        return !((data.to_s =~ rule).nil?)
+        !((data.to_s =~ rule).nil?)
       else
-        return data == rule
+        data == rule
       end
     end

data/lib/spidr/sanitizers.rb CHANGED

@@ -6,39 +6,11 @@ module Spidr
   # sanitation of incoming links.
   #
   module Sanitizers
-    def self.included(base)
-      base.module_eval do
-        # Specifies whether the Agent will strip URI fragments
-        attr_accessor :strip_fragments
+    # Specifies whether the Agent will strip URI fragments
+    attr_accessor :strip_fragments
-        # Specifies whether the Agent will strip URI queries
-        attr_accessor :strip_query
-      end
-    end
-    #
-    # Initializes the Sanitizer rules.
-    #
-    # @param [Hash] options
-    #   Additional options.
-    #
-    # @option options [Boolean] :strip_fragments (true)
-    #   Specifies whether or not to strip the fragment component from URLs.
-    #
-    # @option options [Boolean] :strip_query (false)
-    #   Specifies whether or not to strip the query component from URLs.
-    #
-    # @since 0.2.2
-    #
-    def initialize(options={})
-      @strip_fragments = true
-      if options.has_key?(:strip_fragments)
-        @strip_fragments = options[:strip_fragments]
-      end
-      @strip_query = (options[:strip_query] || false)
-    end
+    # Specifies whether the Agent will strip URI queries
+    attr_accessor :strip_query
     #
     # Sanitizes a URL based on filtering options.
@@ -59,5 +31,26 @@ module Spidr
       return url
     end
+    protected
+    #
+    # Initializes the Sanitizer rules.
+    #
+    # @param [Hash] options
+    #   Additional options.
+    #
+    # @option options [Boolean] :strip_fragments (true)
+    #   Specifies whether or not to strip the fragment component from URLs.
+    #
+    # @option options [Boolean] :strip_query (false)
+    #   Specifies whether or not to strip the query component from URLs.
+    #
+    # @since 0.2.2
+    #
+    def initialize_sanitizers(options={})
+      @strip_fragments = options.fetch(:strip_fragments,true)
+      @strip_query = options.fetch(:strip_query,false)
+    end
   end
 end

data/lib/spidr/version.rb CHANGED

@@ -1,4 +1,4 @@
 module Spidr
   # Spidr version
-  VERSION = '0.3.1'
+  VERSION = '0.3.2'
 end

data/spidr.gemspec CHANGED

@@ -1,15 +1,127 @@
-# -*- encoding: utf-8 -*-
-begin
-  Ore::Specification.new do |gemspec|
-    # custom logic here
-  end
-rescue NameError
-  begin
-    require 'ore/specification'
-    retry
-  rescue LoadError
-    STDERR.puts "The '#{__FILE__}' file requires Ore."
-    STDERR.puts "Run `gem install ore-core` to install Ore."
+# encoding: utf-8
+require 'yaml'
+Gem::Specification.new do |gemspec|
+  files = if File.directory?('.git')
+            `git ls-files`.split($/)
+          elsif File.directory?('.hg')
+            `hg manifest`.split($/)
+          elsif File.directory?('.svn')
+            `svn ls -R`.split($/).select { |path| File.file?(path) }
+          else
+            Dir['{**/}{.*,*}'].select { |path| File.file?(path) }
+          end
+  filter_files = lambda { |paths|
+    case paths
+    when Array
+      (files & paths)
+    when String
+      (files & Dir[paths])
+    end
+  }
+  version = {
+    :file => 'lib/spidr/version.rb',
+    :constant => 'Spidr::VERSION'
+  }
+  defaults = {
+    'name' => File.basename(File.dirname(__FILE__)),
+    'files' => files,
+    'executables' => filter_files['bin/*'].map { |path| File.basename(path) },
+    'test_files' => filter_files['{test/{**/}*_test.rb,spec/{**/}*_spec.rb}'],
+    'extra_doc_files' => filter_files['*.{txt,rdoc,md,markdown,tt,textile}'],
+  }
+  metadata = defaults.merge(YAML.load_file('gemspec.yml'))
+  gemspec.name = metadata.fetch('name',defaults[:name])
+  gemspec.version = if metadata['version']
+                      metadata['version']
+                    elsif File.file?(version[:file])
+                      require File.join('.',version[:file])
+                      eval(version[:constant])
+                    end
+  gemspec.summary = metadata.fetch('summary',metadata['description'])
+  gemspec.description = metadata.fetch('description',metadata['summary'])
+  case metadata['license']
+  when Array
+    gemspec.licenses = metadata['license']
+  when String
+    gemspec.license = metadata['license']
+  end
+  case metadata['authors']
+  when Array
+    gemspec.authors = metadata['authors']
+  when String
+    gemspec.author = metadata['authors']
+  end
+  gemspec.email = metadata['email']
+  gemspec.homepage = metadata['homepage']
+  case metadata['require_paths']
+  when Array
+    gemspec.require_paths = metadata['require_paths']
+  when String
+    gemspec.require_path = metadata['require_paths']
+  end
+  gemspec.files = filter_files[metadata['files']]
+  gemspec.executables = metadata['executables']
+  gemspec.extensions = metadata['extensions']
+  if Gem::VERSION < '1.7.'
+    gemspec.default_executable = gemspec.executables.first
+  end
+  gemspec.test_files = filter_files[metadata['test_files']]
+  unless gemspec.files.include?('.document')
+    gemspec.extra_rdoc_files = metadata['extra_doc_files']
+  end
+  gemspec.post_install_message = metadata['post_install_message']
+  gemspec.requirements = metadata['requirements']
+  if gemspec.respond_to?(:required_ruby_version=)
+    gemspec.required_ruby_version = metadata['required_ruby_version']
+  end
+  if gemspec.respond_to?(:required_rubygems_version=)
+    gemspec.required_rubygems_version = metadata['required_ruby_version']
+  end
+  parse_versions = lambda { |versions|
+    case versions
+    when Array
+      versions.map { |v| v.to_s }
+    when String
+      versions.split(/,\s*/)
+    end
+  }
+  if metadata['dependencies']
+    metadata['dependencies'].each do |name,versions|
+      gemspec.add_dependency(name,parse_versions[versions])
+    end
+  end
+  if metadata['runtime_dependencies']
+    metadata['runtime_dependencies'].each do |name,versions|
+      gemspec.add_runtime_dependency(name,parse_versions[versions])
+    end
+  end
+  if metadata['development_dependencies']
+    metadata['development_dependencies'].each do |name,versions|
+      gemspec.add_development_dependency(name,parse_versions[versions])
+    end
   end
 end

metadata CHANGED

@@ -2,7 +2,7 @@
 name: spidr
 version: !ruby/object:Gem::Version
   prerelease:
-  version: 0.3.1
+  version: 0.3.2
 platform: ruby
 authors:
 - Postmodern
@@ -10,7 +10,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2011-04-22 00:00:00 Z
+date: 2011-06-20 00:00:00 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -128,7 +128,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 requirements: []
 rubyforge_project: spidr
-rubygems_version: 1.7.2
+rubygems_version: 1.8.5
 signing_key:
 specification_version: 3
 summary: A versatile Ruby web spidering library