RubyGems - spidr - Versions diffs - 0.3.1 → 0.3.2 - Mend

spidr 0.3.1 → 0.3.2

Files changed (14) hide show

data/ChangeLog.md CHANGED

@@ -1,3 +1,12 @@
+### 0.3.2 / 2011-06-20
+* Added separate initialize methods for {Spidr::Actions}, {Spidr::Events},
+  {Spidr::Filters} and {Spidr::Sanitizers}.
+* Aliased {Spidr::Events#urls_like} to {Spidr::Events#every_url_like}.
+* Reduce usage of `self.included` and `module_eval`.
+* Reduce usage of nested-blocks.
+* Reduce usage of `return`.
 ### 0.3.1 / 2011-04-22
 * Require `set` in `spidr/headers.rb`.

data/Gemfile CHANGED

@@ -3,7 +3,7 @@ source 'https://rubygems.org'
 gemspec
 group :development do
-  gem 'rake',         '~> 0.8.7'
+  gem 'rake',         '~> 0.8'
   gem 'ore-tasks',    '~> 0.4'
   gem 'rspec',        '~> 2.4'

data/lib/spidr/actions/actions.rb CHANGED

@@ -8,12 +8,6 @@ module Spidr
   # spidering of links.
   #
   module Actions
-    def initialize(options={})
-      @paused = false
-      super(options)
-    end
     #
     # Continue spidering.
     #
@@ -79,5 +73,11 @@ module Spidr
     def skip_page!
       raise(SkipPage)
     end
+    protected
+    def initialize_actions(options={})
+      @paused = false
+    end
   end
 end

data/lib/spidr/agent.rb CHANGED

@@ -115,15 +115,15 @@ module Spidr
         @host_headers.merge!(options[:host_headers])
       end
-      @user_agent = (options[:user_agent] || Spidr.user_agent)
+      @user_agent = options.fetch(:user_agent,Spidr.user_agent)
       @referer = options[:referer]
-      @sessions = SessionCache.new(options[:proxy] || Spidr.proxy)
+      @sessions = SessionCache.new(options.fetch(:proxy,Spidr.proxy))
       @cookies = CookieJar.new
       @authorized = AuthStore.new
       @running = false
-      @delay = (options[:delay] || 0)
+      @delay = options.fetch(:delay,0)
       @history = Set[]
       @failures = Set[]
       @queue = []
@@ -131,7 +131,10 @@ module Spidr
       @levels = Hash.new(0)
       @max_depth = options[:max_depth]
-      super(options)
+      initialize_sanitizers(options)
+      initialize_filters(options)
+      initialize_actions(options)
+      initialize_events(options)
       yield self if block_given?
     end
@@ -152,19 +155,16 @@ module Spidr
     # @yieldparam [Agent] agent
     #   The newly created agent.
     #
-    def self.start_at(url,options={})
-      self.new(options) do |spider|
-        yield spider if block_given?
-        spider.start_at(url)
-      end
+    def self.start_at(url,options={},&block)
+      agent = new(options,&block)
+      agent.start_at(url)
     end
     #
-    # Creates a new agent and spiders the given host.
+    # Creates a new agent and spiders the web-site located at the given URL.
     #
-    # @param [String]
-    #   The host-name to spider.
+    # @param [URI::HTTP, String] url
+    #   The web-site to spider.
     #
     # @param [Hash] options
     #   Additional options. See {Agent#initialize}.
@@ -176,19 +176,18 @@ module Spidr
     # @yieldparam [Agent] agent
     #   The newly created agent.
     #
-    def self.host(name,options={})
-      self.new(options.merge(:host => name)) do |spider|
-        yield spider if block_given?
+    def self.site(url,options={},&block)
+      url = URI(url.to_s) unless url.kind_of?(URI)
-        spider.start_at("http://#{name}/")
-      end
+      agent = new(options.merge(:host => url.host),&block)
+      agent.start_at(url)
     end
     #
-    # Creates a new agent and spiders the web-site located at the given URL.
+    # Creates a new agent and spiders the given host.
     #
-    # @param [URI::HTTP, String] url
-    #   The web-site to spider.
+    # @param [String] name
+    #   The host-name to spider.
     #
     # @param [Hash] options
     #   Additional options. See {Agent#initialize}.
@@ -200,14 +199,8 @@ module Spidr
     # @yieldparam [Agent] agent
     #   The newly created agent.
     #
-    def self.site(url,options={})
-      url = URI(url.to_s)
-      return self.new(options.merge(:host => url.host)) do |spider|
-        yield spider if block_given?
-        spider.start_at(url)
-      end
+    def self.host(name,options={},&block)
+      site(URI::HTTP.build(:host => name, :path => '/'),options,&block)
     end
     #
@@ -234,7 +227,6 @@ module Spidr
     #
     def start_at(url,&block)
       enqueue(url)
       return run(&block)
     end
@@ -261,7 +253,6 @@ module Spidr
       end
       @running = false
       @sessions.clear
       return self
     end
@@ -387,10 +378,10 @@ module Spidr
       new_failures.each do |url|
         @failures << unless url.kind_of?(URI)
-                    URI(url.to_s)
-                  else
-                    url
-                  end
+                       URI(url.to_s)
+                     else
+                       url
+                     end
       end
       return @failures
@@ -471,7 +462,7 @@ module Spidr
         begin
           @every_url_blocks.each { |url_block| url_block.call(url) }
-          @urls_like_blocks.each do |pattern,url_blocks|
+          @every_url_like_blocks.each do |pattern,url_blocks|
             match = case pattern
                     when Regexp
                       link =~ pattern
@@ -653,12 +644,11 @@ module Spidr
     def prepare_request(url,&block)
       host = url.host
       port = url.port
-      unless url.path.empty?
-        path = url.path
-      else
-        path = '/'
-      end
+      path = unless url.path.empty?
+               url.path
+             else
+               '/'
+             end
       # append the URL query to the path
       path += "?#{url.query}" if url.query
@@ -724,7 +714,7 @@ module Spidr
     #   Specifies whether the given URL should be visited.
     #
     def visit?(url)
-      !(visited?(url)) &&
+      !visited?(url) &&
        visit_scheme?(url.scheme) &&
        visit_host?(url.host) &&
        visit_port?(url.port) &&

data/lib/spidr/body.rb CHANGED

@@ -24,16 +24,15 @@ module Spidr
     # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/HTML/Document.html
     #
     def doc
-      return nil if body.empty?
-      begin
-        if html?
-          return @doc ||= Nokogiri::HTML(body)
-        elsif (xml? || xsl? || rss? || atom?)
-          return @doc ||= Nokogiri::XML(body)
+      unless body.empty?
+        begin
+          if html?
+            @doc ||= Nokogiri::HTML(body)
+          elsif (rss? || atom? || xml? || xsl?)
+            @doc ||= Nokogiri::XML(body)
+          end
+        rescue
         end
-      rescue
-        return nil
       end
     end

data/lib/spidr/events.rb CHANGED

@@ -5,17 +5,6 @@ module Spidr
   # they are visited.
   #
   module Events
-    def initialize(options={})
-      super(options)
-      @every_url_blocks = []
-      @every_failed_url_blocks = []
-      @urls_like_blocks = Hash.new { |hash,key| hash[key] = [] }
-      @every_page_blocks = []
-      @every_link_blocks = []
-    end
     #
     # Pass each URL from each page visited to the given block.
     #
@@ -57,11 +46,20 @@ module Spidr
     # @yieldparam [URI::HTTP] url
     #   A matching URL.
     #
-    def urls_like(pattern,&block)
-      @urls_like_blocks[pattern] << block
+    # @since 0.3.2
+    #
+    def every_url_like(pattern,&block)
+      @every_url_like_blocks[pattern] << block
       return self
     end
+    #
+    # @see #every_url_like
+    #
+    def urls_like(pattern,&block)
+      every_url_like(pattern,&block)
+    end
     #
     # Pass the headers from every response the agent receives to a given
     # block.
@@ -524,5 +522,16 @@ module Spidr
       @every_link_blocks << block
       return self
     end
+    protected
+    def initialize_events(options={})
+      @every_url_blocks = []
+      @every_failed_url_blocks = []
+      @every_url_like_blocks = Hash.new { |hash,key| hash[key] = [] }
+      @every_page_blocks = []
+      @every_link_blocks = []
+    end
   end
 end

data/lib/spidr/filters.rb CHANGED

@@ -6,110 +6,8 @@ module Spidr
   # URLs the agent will visit.
   #
   module Filters
-    def self.included(base)
-      base.module_eval do
-        # List of acceptable URL schemes to follow
-        attr_reader :schemes
-      end
-    end
-    #
-    # Initializes filtering rules.
-    #
-    # @param [Hash] options
-    #   Additional options.
-    #
-    # @option options [Array] :schemes (['http', 'https'])
-    #   The list of acceptable URI schemes to visit.
-    #   The `https` scheme will be ignored if `net/https` cannot be loaded.
-    #
-    # @option options [String] :host
-    #   The host-name to visit.
-    #
-    # @option options [Array<String, Regexp, Proc>] :hosts
-    #   The patterns which match the host-names to visit.
-    #
-    # @option options [Array<String, Regexp, Proc>] :ignore_hosts
-    #   The patterns which match the host-names to not visit.
-    #
-    # @option options [Array<Integer, Regexp, Proc>] :ports
-    #   The patterns which match the ports to visit.
-    #
-    # @option options [Array<Integer, Regexp, Proc>] :ignore_ports
-    #   The patterns which match the ports to not visit.
-    #
-    # @option options [Array<String, Regexp, Proc>] :links
-    #   The patterns which match the links to visit.
-    #
-    # @option options [Array<String, Regexp, Proc>] :ignore_links
-    #   The patterns which match the links to not visit.
-    #
-    # @option options [Array<String, Regexp, Proc>] :urls
-    #   The patterns which match the URLs to visit.
-    #
-    # @option options [Array<String, Regexp, Proc>] :ignore_urls
-    #   The patterns which match the URLs to not visit.
-    #
-    # @option options [Array<String, Regexp, Proc>] :exts
-    #   The patterns which match the URI path extensions to visit.
-    #
-    # @option options [Array<String, Regexp, Proc>] :ignore_exts
-    #   The patterns which match the URI path extensions to not visit.
-    #
-    def initialize(options={})
-      super(options)
-      @schemes = []
-      if options[:schemes]
-        @schemes += options[:schemes]
-      else
-        @schemes << 'http'
-        begin
-          require 'net/https'
-          @schemes << 'https'
-        rescue Gem::LoadError => e
-          raise(e)
-        rescue ::LoadError
-          STDERR.puts "Warning: cannot load 'net/https', https support disabled"
-        end
-      end
-      @host_rules = Rules.new(
-        :accept => options[:hosts],
-        :reject => options[:ignore_hosts]
-      )
-      @port_rules = Rules.new(
-        :accept => options[:ports],
-        :reject => options[:ignore_ports]
-      )
-      @link_rules = Rules.new(
-        :accept => options[:links],
-        :reject => options[:ignore_links]
-      )
-      @url_rules = Rules.new(
-        :accept => options[:urls],
-        :reject => options[:ignore_urls]
-      )
-      @ext_rules = Rules.new(
-        :accept => options[:exts],
-        :reject => options[:ignore_exts]
-      )
-      if options[:host]
-        visit_hosts_like(options[:host])
-      end
-      if options[:queue]
-        self.queue = options[:queue]
-      end
-      if options[:history]
-        self.history = options[:history]
-      end
-    end
+    # List of acceptable URL schemes to follow
+    attr_reader :schemes
     #
     # Sets the list of acceptable URL schemes to visit.
@@ -458,6 +356,102 @@ module Spidr
     protected
+    #
+    # Initializes filtering rules.
+    #
+    # @param [Hash] options
+    #   Additional options.
+    #
+    # @option options [Array] :schemes (['http', 'https'])
+    #   The list of acceptable URI schemes to visit.
+    #   The `https` scheme will be ignored if `net/https` cannot be loaded.
+    #
+    # @option options [String] :host
+    #   The host-name to visit.
+    #
+    # @option options [Array<String, Regexp, Proc>] :hosts
+    #   The patterns which match the host-names to visit.
+    #
+    # @option options [Array<String, Regexp, Proc>] :ignore_hosts
+    #   The patterns which match the host-names to not visit.
+    #
+    # @option options [Array<Integer, Regexp, Proc>] :ports
+    #   The patterns which match the ports to visit.
+    #
+    # @option options [Array<Integer, Regexp, Proc>] :ignore_ports
+    #   The patterns which match the ports to not visit.
+    #
+    # @option options [Array<String, Regexp, Proc>] :links
+    #   The patterns which match the links to visit.
+    #
+    # @option options [Array<String, Regexp, Proc>] :ignore_links
+    #   The patterns which match the links to not visit.
+    #
+    # @option options [Array<String, Regexp, Proc>] :urls
+    #   The patterns which match the URLs to visit.
+    #
+    # @option options [Array<String, Regexp, Proc>] :ignore_urls
+    #   The patterns which match the URLs to not visit.
+    #
+    # @option options [Array<String, Regexp, Proc>] :exts
+    #   The patterns which match the URI path extensions to visit.
+    #
+    # @option options [Array<String, Regexp, Proc>] :ignore_exts
+    #   The patterns which match the URI path extensions to not visit.
+    #
+    def initialize_filters(options={})
+      @schemes = []
+      if options[:schemes]
+        @schemes += options[:schemes]
+      else
+        @schemes << 'http'
+        begin
+          require 'net/https'
+          @schemes << 'https'
+        rescue Gem::LoadError => e
+          raise(e)
+        rescue ::LoadError
+          STDERR.puts "Warning: cannot load 'net/https', https support disabled"
+        end
+      end
+      @host_rules = Rules.new(
+        :accept => options[:hosts],
+        :reject => options[:ignore_hosts]
+      )
+      @port_rules = Rules.new(
+        :accept => options[:ports],
+        :reject => options[:ignore_ports]
+      )
+      @link_rules = Rules.new(
+        :accept => options[:links],
+        :reject => options[:ignore_links]
+      )
+      @url_rules = Rules.new(
+        :accept => options[:urls],
+        :reject => options[:ignore_urls]
+      )
+      @ext_rules = Rules.new(
+        :accept => options[:exts],
+        :reject => options[:ignore_exts]
+      )
+      if options[:host]
+        visit_hosts_like(options[:host])
+      end
+      if options[:queue]
+        self.queue = options[:queue]
+      end
+      if options[:history]
+        self.history = options[:history]
+      end
+    end
     #
     # Determines if a given URI scheme should be visited.
     #

data/lib/spidr/headers.rb CHANGED

@@ -295,9 +295,9 @@ module Spidr
         cookie.split('; ').each do |key_value|
           key, value = key_value.split('=',2)
-          next if RESERVED_COOKIE_NAMES.include?(key)
-          params[key] = (value || '')
+          unless RESERVED_COOKIE_NAMES.include?(key)
+            params[key] = (value || '')
+          end
         end
       end

data/lib/spidr/links.rb CHANGED

@@ -85,7 +85,7 @@ module Spidr
         location.each(&block)
       else
         # usually the location header contains a single String
-        block.call(location)
+        yield location
       end
     end

data/lib/spidr/rules.rb CHANGED

@@ -40,17 +40,9 @@ module Spidr
     #
     def accept?(data)
       unless @accept.empty?
-        @accept.each do |rule|
-          return true if test_data(data,rule)
-        end
-        return false
+        @accept.any? { |rule| test_data(data,rule) }
       else
-        @reject.each do |rule|
-          return false if test_data(data,rule)
-        end
-        return true
+        !@reject.any? { |rule| test_data(data,rule) }
       end
     end
@@ -62,7 +54,7 @@ module Spidr
     #   rejection patterns.
     #
     def reject?(data)
-      !(accept?(data))
+      !accept?(data)
     end
     protected
@@ -75,11 +67,11 @@ module Spidr
     #
     def test_data(data,rule)
       if rule.kind_of?(Proc)
-        return (rule.call(data) == true)
+        rule.call(data) == true
       elsif rule.kind_of?(Regexp)
-        return !((data.to_s =~ rule).nil?)
+        !((data.to_s =~ rule).nil?)
       else
-        return data == rule
+        data == rule
       end
     end

data/lib/spidr/sanitizers.rb CHANGED

@@ -6,39 +6,11 @@ module Spidr
   # sanitation of incoming links.
   #
   module Sanitizers
-    def self.included(base)
-      base.module_eval do
-        # Specifies whether the Agent will strip URI fragments
-        attr_accessor :strip_fragments
+    # Specifies whether the Agent will strip URI fragments
+    attr_accessor :strip_fragments
-        # Specifies whether the Agent will strip URI queries
-        attr_accessor :strip_query
-      end
-    end
-    #
-    # Initializes the Sanitizer rules.
-    #
-    # @param [Hash] options
-    #   Additional options.
-    #
-    # @option options [Boolean] :strip_fragments (true)
-    #   Specifies whether or not to strip the fragment component from URLs.
-    #
-    # @option options [Boolean] :strip_query (false)
-    #   Specifies whether or not to strip the query component from URLs.
-    #
-    # @since 0.2.2
-    #
-    def initialize(options={})
-      @strip_fragments = true
-      if options.has_key?(:strip_fragments)
-        @strip_fragments = options[:strip_fragments]
-      end
-      @strip_query = (options[:strip_query] || false)
-    end
+    # Specifies whether the Agent will strip URI queries
+    attr_accessor :strip_query
     #
     # Sanitizes a URL based on filtering options.
@@ -59,5 +31,26 @@ module Spidr
       return url
     end
+    protected
+    #
+    # Initializes the Sanitizer rules.
+    #
+    # @param [Hash] options
+    #   Additional options.
+    #
+    # @option options [Boolean] :strip_fragments (true)
+    #   Specifies whether or not to strip the fragment component from URLs.
+    #
+    # @option options [Boolean] :strip_query (false)
+    #   Specifies whether or not to strip the query component from URLs.
+    #
+    # @since 0.2.2
+    #
+    def initialize_sanitizers(options={})
+      @strip_fragments = options.fetch(:strip_fragments,true)
+      @strip_query = options.fetch(:strip_query,false)
+    end
   end
 end

data/lib/spidr/version.rb CHANGED

@@ -1,4 +1,4 @@
 module Spidr
   # Spidr version
-  VERSION = '0.3.1'
+  VERSION = '0.3.2'
 end

data/spidr.gemspec CHANGED

@@ -1,15 +1,127 @@
-# -*- encoding: utf-8 -*-
-begin
-  Ore::Specification.new do |gemspec|
-    # custom logic here
-  end
-rescue NameError
-  begin
-    require 'ore/specification'
-    retry
-  rescue LoadError
-    STDERR.puts "The '#{__FILE__}' file requires Ore."
-    STDERR.puts "Run `gem install ore-core` to install Ore."
+# encoding: utf-8
+require 'yaml'
+Gem::Specification.new do |gemspec|
+  files = if File.directory?('.git')
+            `git ls-files`.split($/)
+          elsif File.directory?('.hg')
+            `hg manifest`.split($/)
+          elsif File.directory?('.svn')
+            `svn ls -R`.split($/).select { |path| File.file?(path) }
+          else
+            Dir['{**/}{.*,*}'].select { |path| File.file?(path) }
+          end
+  filter_files = lambda { |paths|
+    case paths
+    when Array
+      (files & paths)
+    when String
+      (files & Dir[paths])
+    end
+  }
+  version = {
+    :file => 'lib/spidr/version.rb',
+    :constant => 'Spidr::VERSION'
+  }
+  defaults = {
+    'name' => File.basename(File.dirname(__FILE__)),
+    'files' => files,
+    'executables' => filter_files['bin/*'].map { |path| File.basename(path) },
+    'test_files' => filter_files['{test/{**/}*_test.rb,spec/{**/}*_spec.rb}'],
+    'extra_doc_files' => filter_files['*.{txt,rdoc,md,markdown,tt,textile}'],
+  }
+  metadata = defaults.merge(YAML.load_file('gemspec.yml'))
+  gemspec.name = metadata.fetch('name',defaults[:name])
+  gemspec.version = if metadata['version']
+                      metadata['version']
+                    elsif File.file?(version[:file])
+                      require File.join('.',version[:file])
+                      eval(version[:constant])
+                    end
+  gemspec.summary = metadata.fetch('summary',metadata['description'])
+  gemspec.description = metadata.fetch('description',metadata['summary'])
+  case metadata['license']
+  when Array
+    gemspec.licenses = metadata['license']
+  when String
+    gemspec.license = metadata['license']
+  end
+  case metadata['authors']
+  when Array
+    gemspec.authors = metadata['authors']
+  when String
+    gemspec.author = metadata['authors']
+  end
+  gemspec.email = metadata['email']
+  gemspec.homepage = metadata['homepage']
+  case metadata['require_paths']
+  when Array
+    gemspec.require_paths = metadata['require_paths']
+  when String
+    gemspec.require_path = metadata['require_paths']
+  end
+  gemspec.files = filter_files[metadata['files']]
+  gemspec.executables = metadata['executables']
+  gemspec.extensions = metadata['extensions']
+  if Gem::VERSION < '1.7.'
+    gemspec.default_executable = gemspec.executables.first
+  end
+  gemspec.test_files = filter_files[metadata['test_files']]
+  unless gemspec.files.include?('.document')
+    gemspec.extra_rdoc_files = metadata['extra_doc_files']
+  end
+  gemspec.post_install_message = metadata['post_install_message']
+  gemspec.requirements = metadata['requirements']
+  if gemspec.respond_to?(:required_ruby_version=)
+    gemspec.required_ruby_version = metadata['required_ruby_version']
+  end
+  if gemspec.respond_to?(:required_rubygems_version=)
+    gemspec.required_rubygems_version = metadata['required_ruby_version']
+  end
+  parse_versions = lambda { |versions|
+    case versions
+    when Array
+      versions.map { |v| v.to_s }
+    when String
+      versions.split(/,\s*/)
+    end
+  }
+  if metadata['dependencies']
+    metadata['dependencies'].each do |name,versions|
+      gemspec.add_dependency(name,parse_versions[versions])
+    end
+  end
+  if metadata['runtime_dependencies']
+    metadata['runtime_dependencies'].each do |name,versions|
+      gemspec.add_runtime_dependency(name,parse_versions[versions])
+    end
+  end
+  if metadata['development_dependencies']
+    metadata['development_dependencies'].each do |name,versions|
+      gemspec.add_development_dependency(name,parse_versions[versions])
+    end
   end
 end

metadata CHANGED

@@ -2,7 +2,7 @@
 name: spidr
 version: !ruby/object:Gem::Version
   prerelease:
-  version: 0.3.1
+  version: 0.3.2
 platform: ruby
 authors:
 - Postmodern
@@ -10,7 +10,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2011-04-22 00:00:00 Z
+date: 2011-06-20 00:00:00 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -128,7 +128,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 requirements: []
 rubyforge_project: spidr
-rubygems_version: 1.7.2
+rubygems_version: 1.8.5
 signing_key:
 specification_version: 3
 summary: A versatile Ruby web spidering library