spider 0.3.0 → 0.4.0
This diff shows the changes between two publicly released versions of the package, as they appear in the public registry. It is provided for informational purposes only.
- data/CHANGES +3 -0
- data/README +90 -17
- data/doc/classes/IncludedInMemcached.html +217 -0
- data/doc/classes/Spider.html +10 -8
- data/doc/classes/SpiderInstance.html +96 -45
- data/doc/created.rid +1 -1
- data/doc/files/README.html +95 -21
- data/doc/{classes/Net.html → files/lib/included_in_memcached_rb.html} +23 -16
- data/doc/files/lib/spider_instance_rb.html +118 -0
- data/doc/files/lib/spider_rb.html +95 -32
- data/doc/fr_class_index.html +1 -0
- data/doc/fr_file_index.html +2 -0
- data/doc/fr_method_index.html +11 -7
- data/lib/included_in_memcached.rb +22 -0
- data/lib/spider.rb +4 -246
- data/lib/spider_instance.rb +290 -0
- data/spec/included_in_memcached_spec.rb +44 -0
- data/spec/spider_instance_spec.rb +46 -4
- data/spider.gemspec +1 -1
- metadata +8 -8
- data/doc/classes/Net/HTTPRedirection.html +0 -144
- data/doc/classes/Net/HTTPResponse.html +0 -166
- data/doc/classes/Net/HTTPSuccess.html +0 -144
- data/doc/classes/NilClass.html +0 -144
data/doc/fr_class_index.html
CHANGED
@@ -20,6 +20,7 @@
 <div id="index">
 <h1 class="section-bar">Classes</h1>
 <div id="index-entries">
+<a href="classes/IncludedInMemcached.html">IncludedInMemcached</a><br />
 <a href="classes/Spider.html">Spider</a><br />
 <a href="classes/SpiderInstance.html">SpiderInstance</a><br />
 </div>
data/doc/fr_file_index.html
CHANGED
@@ -21,7 +21,9 @@
 <h1 class="section-bar">Files</h1>
 <div id="index-entries">
 <a href="files/README.html">README</a><br />
+<a href="files/lib/included_in_memcached_rb.html">lib/included_in_memcached.rb</a><br />
 <a href="files/lib/spider_rb.html">lib/spider.rb</a><br />
+<a href="files/lib/spider_instance_rb.html">lib/spider_instance.rb</a><br />
 </div>
 </div>
 </body>
data/doc/fr_method_index.html
CHANGED
@@ -20,13 +20,17 @@
 <div id="index">
 <h1 class="section-bar">Methods</h1>
 <div id="index-entries">
-<a href="classes/
-<a href="classes/SpiderInstance.html#
-<a href="classes/SpiderInstance.html#M000005">
-<a href="classes/SpiderInstance.html#
-<a href="classes/SpiderInstance.html#
-<a href="classes/
-<a href="classes/
+<a href="classes/IncludedInMemcached.html#M000002"><< (IncludedInMemcached)</a><br />
+<a href="classes/SpiderInstance.html#M000004">add_url_check (SpiderInstance)</a><br />
+<a href="classes/SpiderInstance.html#M000005">check_already_seen_with (SpiderInstance)</a><br />
+<a href="classes/SpiderInstance.html#M000010">clear_headers (SpiderInstance)</a><br />
+<a href="classes/SpiderInstance.html#M000009">headers (SpiderInstance)</a><br />
+<a href="classes/IncludedInMemcached.html#M000003">include? (IncludedInMemcached)</a><br />
+<a href="classes/IncludedInMemcached.html#M000001">new (IncludedInMemcached)</a><br />
+<a href="classes/SpiderInstance.html#M000006">on (SpiderInstance)</a><br />
+<a href="classes/SpiderInstance.html#M000007">setup (SpiderInstance)</a><br />
+<a href="classes/Spider.html#M000011">start_at (Spider)</a><br />
+<a href="classes/SpiderInstance.html#M000008">teardown (SpiderInstance)</a><br />
 </div>
 </div>
 </body>
data/lib/included_in_memcached.rb
ADDED
@@ -0,0 +1,22 @@
+require 'memcache'
+
+# A specialized class using memcached to track items stored. It supports
+# three operations: new, <<, and include? . Together these can be used to
+# add items to the memcache, then determine whether the item has been added.
+class IncludedInMemcached
+  # Construct a new IncludedInMemcached instance. All arguments here are
+  # passed to MemCache (part of the memcache-client gem).
+  def initialize(*a)
+    @c = MemCache.new(*a)
+  end
+
+  # Add an item to the memcache.
+  def <<(v)
+    @c.add(v.to_s, v)
+  end
+
+  # True if the item is in the memcache.
+  def include?(v)
+    @c.get(v.to_s) == v
+  end
+end
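Together with check_already_seen_with (documented in the new spider_instance.rb below), this class lets the set of already-visited URLs live in memcached rather than in an in-process Array. A minimal usage sketch, assuming a memcached daemon at localhost:11211 and the memcache-client gem installed; the require path follows the gem's own rdoc example:

  require 'spider'
  require 'spider/included_in_memcached'  # path as given in the rdoc example

  Spider.start_at('http://mike-burns.com/') do |s|
    # Store seen URLs in memcached so the visited set is not held in memory.
    s.check_already_seen_with IncludedInMemcached.new('localhost:11211')
  end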
data/lib/spider.rb
CHANGED
@@ -9,7 +9,7 @@
 #       notice, this list of conditions and the following disclaimer in the
 #       documentation and/or other materials provided with the distribution.
 #     * Neither the name Mike Burns nor the
-#       names of
+#       names of his contributors may be used to endorse or promote products
 #       derived from this software without specific prior written permission.
 #
 # THIS SOFTWARE IS PROVIDED BY Mike Burns ``AS IS'' AND ANY
@@ -23,35 +23,15 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-require 'robot_rules'
-require 'open-uri'
-require 'uri'
-require 'net/http'
-require 'net/https'
-
-module Net #:nodoc:
-  class HTTPResponse #:nodoc:
-    def success?; false; end
-    def redirect?; false; end
-  end
-  class HTTPSuccess #:nodoc:
-    def success?; true; end
-  end
-  class HTTPRedirection #:nodoc:
-    def redirect?; true; end
-  end
-end
-
-class NilClass #:nodoc:
-  def merge(h); h; end
-end
+require File.dirname(__FILE__)+'/spider_instance'
 
 # A spidering library for Ruby. Handles robots.txt, scraping, finding more
 # links, and doing it all over again.
 class Spider
   # Runs the spider starting at the given URL. Also takes a block that is given
   # the SpiderInstance. Use the block to define the rules and handlers for
-  # the discovered Web pages.
+  # the discovered Web pages. See SpiderInstance for the possible rules and
+  # handlers.
   #
   #  Spider.start_at('http://mike-burns.com/') do |s|
   #    s.add_url_check do |a_url|
@@ -78,225 +58,3 @@ class Spider
     a_spider.start!
   end
 end
-
-class SpiderInstance
-  def initialize(next_urls, seen = [], rules = nil, robots_seen = []) #:nodoc:
-    @url_checks = []
-    @cache = :memory
-    @callbacks = {}
-    @next_urls = next_urls
-    @seen = seen
-    @rules = rules || RobotRules.new('Ruby Spider 1.0')
-    @robots_seen = robots_seen
-    @headers = {}
-    @setup = nil
-    @teardown = nil
-  end
-
-  # Add a predicate that determines whether to continue down this URL's path.
-  # All predicates must be true in order for a URL to proceed.
-  #
-  # Takes a block that takes a string and produces a boolean. For example, this
-  # will ensure that the URL starts with 'http://mike-burns.com':
-  #
-  #  add_url_check { |a_url| a_url =~ %r{^http://mike-burns.com.*}
-  def add_url_check(&block)
-    @url_checks << block
-  end
-
-  def use_cache(cache_type) #:nodoc:
-    @cache = cache_type
-  end
-
-  # Add a response handler. A response handler's trigger can be :every,
-  # :success, :failure, or any HTTP status code. The handler itself can be
-  # either a Proc or a block.
-  #
-  # The arguments to the block are: the URL as a string, an instance of
-  # Net::HTTPResponse, and the prior URL as a string.
-  #
-  #
-  # For example:
-  #
-  #  on 404 do |a_url, resp, prior_url|
-  #    puts "URL not found: #{a_url}"
-  #  end
-  #
-  #  on :success do |a_url, resp, prior_url|
-  #    puts a_url
-  #    puts resp.body
-  #  end
-  #
-  #  on :every do |a_url, resp, prior_url|
-  #    puts "Given this code: #{resp.code}"
-  #  end
-  def on(code, p = nil, &block)
-    f = p ? p : block
-    case code
-    when Fixnum
-      @callbacks[code] = f
-    else
-      @callbacks[code.to_sym] = f
-    end
-  end
-
-  # Run before the HTTP request. Given the URL as a string.
-  #  setup do |a_url|
-  #    headers['Cookies'] = 'user_id=1;admin=true'
-  #  end
-  def setup(p = nil, &block)
-    @setup = p ? p : block
-  end
-
-  # Run last, once for each page. Given the URL as a string.
-  def teardown(p = nil, &block)
-    @teardown = p ? p : block
-  end
-
-  # Use like a hash:
-  #  headers['Cookies'] = 'user_id=1;password=btrross3'
-  def headers
-    HeaderSetter.new(self)
-  end
-
-  def raw_headers #:nodoc:
-    @headers
-  end
-  def raw_headers=(v) #:nodoc:
-    @headers = v
-  end
-
-  # Reset the headers hash.
-  def clear_headers
-    @headers = {}
-  end
-
-  def start! #:nodoc:
-    next_urls = @next_urls
-    begin
-      tmp_n_u = {}
-      next_urls.each do |prior_url, urls|
-        urls.map do |a_url|
-          [a_url, (URI.parse(a_url) rescue nil)]
-        end.select do |a_url, parsed_url|
-          allowable_url?(a_url, parsed_url)
-        end.each do |a_url, parsed_url|
-          @setup.call(a_url) unless @setup.nil?
-          get_page(parsed_url) do |response|
-            do_callbacks(a_url, response, prior_url)
-            tmp_n_u[a_url] = generate_next_urls(a_url, response)
-          end
-          @teardown.call(a_url) unless @teardown.nil?
-        end
-      end
-      next_urls = tmp_n_u
-    end while !next_urls.empty?
-  end
-
-  def success_or_failure(code) #:nodoc:
-    if code > 199 && code < 300
-      :success
-    else
-      :failure
-    end
-  end
-
-  def allowable_url?(a_url, parsed_url) #:nodoc:
-    !parsed_url.nil? && !@seen.include?(parsed_url) && allowed?(a_url, parsed_url) &&
-      @url_checks.map{|url_check|url_check.call(a_url)}.all?
-  end
-
-  # True if the robots.txt for that URL allows access to it.
-  def allowed?(a_url, parsed_url) # :nodoc:
-    u = "#{parsed_url.scheme}://#{parsed_url.host}:#{parsed_url.port}/robots.txt"
-    begin
-      unless @robots_seen.include?(u)
-        open(u, 'User-Agent' => 'Ruby Spider',
-             'Accept' => 'text/html,text/xml,application/xml,text/plain') do |url|
-          @rules.parse(u, url.read)
-        end
-        @robots_seen << u
-      end
-      @rules.allowed?(a_url)
-    rescue OpenURI::HTTPError
-      true # No robots.txt
-    rescue Exception, Timeout::Error # to keep it from crashing
-      false
-    end
-  end
-
-  def get_page(parsed_url, &block) #:nodoc:
-    @seen << parsed_url
-    begin
-      http = Net::HTTP.new(parsed_url.host, parsed_url.port)
-      http.use_ssl = parsed_url.scheme == 'https'
-      # Uses start because http.finish cannot be called.
-      r = http.start {|h| h.request(Net::HTTP::Get.new(parsed_url.request_uri,
-                                                       @headers))}
-      if r.redirect?
-        get_page(URI.parse(r['Location']), &block)
-      else
-        block.call(r)
-      end
-    rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError => e
-      p e
-      nil
-    end
-  end
-
-  def do_callbacks(a_url, resp, prior_url) #:nodoc:
-    cbs = [@callbacks[:every],
-           resp.success? ? @callbacks[:success] : @callbacks[:failure],
-           @callbacks[resp.code]]
-
-    cbs.each do |cb|
-      cb.call(a_url, resp, prior_url) if cb
-    end
-  end
-
-  def generate_next_urls(a_url, resp) #:nodoc:
-    web_page = resp.body
-    base_url = (web_page.scan(/base\s+href="(.*?)"/i).flatten +
-                [a_url[0,a_url.rindex('/')]])[0]
-    base_url = remove_trailing_slash(base_url)
-    web_page.scan(/href="(.*?)"/i).flatten.map do |link|
-      begin
-        parsed_link = URI.parse(link)
-        if parsed_link.fragment == '#'
-          nil
-        else
-          case parsed_link.scheme
-          when 'http'
-            link
-          when nil
-            u = URI.parse(base_url)
-            if link[0].chr == '/'
-              "#{u.scheme}://#{u.host}:#{u.port}#{link}"
-            elsif u.path.nil? || u.path == ''
-              "#{u.scheme}://#{u.host}:#{u.port}/#{link}"
-            else
-              "#{u.scheme}://#{u.host}:#{u.port}/#{u.path}/#{link}"
-            end
-          else
-            nil
-          end
-        end
-      rescue
-        nil
-      end
-    end.compact
-  end
-
-  def remove_trailing_slash(s) #:nodoc:
-    s.sub(%r{/*$},'')
-  end
-
-  class HeaderSetter #:nodoc:
-    def initialize(si)
-      @si = si
-    end
-    def []=(k,v)
-      @si.raw_headers = @si.raw_headers.merge({k => v})
-    end
-  end
-end
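With the SpiderInstance class and the Net monkey patches moved out, spider.rb now holds only the Spider.start_at entry point plus a require of spider_instance. A short sketch of the entry point, pieced together from the rdoc examples for start_at, add_url_check, and on; the URL and the pattern are illustrative only:

  require 'spider'

  Spider.start_at('http://mike-burns.com/') do |s|
    # Every registered check must return true for a URL to be crawled.
    s.add_url_check { |a_url| a_url =~ %r{^http://mike-burns.com.*} }

    # :success fires for 2xx responses; handlers receive the URL, the
    # Net::HTTPResponse, and the URL of the page that linked to it.
    s.on :success do |a_url, resp, prior_url|
      puts "#{a_url} (#{resp.code}) linked from #{prior_url}"
    end
  end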
data/lib/spider_instance.rb
ADDED
@@ -0,0 +1,290 @@
+# Copyright 2007 Mike Burns
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#     * Redistributions of source code must retain the above copyright
+#       notice, this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above copyright
+#       notice, this list of conditions and the following disclaimer in the
+#       documentation and/or other materials provided with the distribution.
+#     * Neither the name Mike Burns nor the
+#       names of his contributors may be used to endorse or promote products
+#       derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY Mike Burns ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL Mike Burns BE LIABLE FOR ANY
+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+require 'robot_rules'
+require 'open-uri'
+require 'uri'
+require 'net/http'
+require 'net/https'
+
+module Net #:nodoc:
+  class HTTPResponse #:nodoc:
+    def success?; false; end
+    def redirect?; false; end
+  end
+  class HTTPSuccess #:nodoc:
+    def success?; true; end
+  end
+  class HTTPRedirection #:nodoc:
+    def redirect?; true; end
+  end
+end
+
+class NilClass #:nodoc:
+  def merge(h); h; end
+end
+
+class SpiderInstance
+  def initialize(next_urls, seen = [], rules = nil, robots_seen = []) #:nodoc:
+    @url_checks = []
+    @cache = :memory
+    @callbacks = {}
+    @next_urls = next_urls
+    @seen = seen
+    @rules = rules || RobotRules.new('Ruby Spider 1.0')
+    @robots_seen = robots_seen
+    @headers = {}
+    @setup = nil
+    @teardown = nil
+  end
+
+  # Add a predicate that determines whether to continue down this URL's path.
+  # All predicates must be true in order for a URL to proceed.
+  #
+  # Takes a block that takes a string and produces a boolean. For example, this
+  # will ensure that the URL starts with 'http://mike-burns.com':
+  #
+  #  add_url_check { |a_url| a_url =~ %r{^http://mike-burns.com.*}
+  def add_url_check(&block)
+    @url_checks << block
+  end
+
+  # The Web is a graph; to avoid cycles we store the nodes (URLs) already
+  # visited. The Web is a really, really, really big graph; as such, this list
+  # of visited nodes grows really, really, really big.
+  #
+  # Change the object used to store these seen nodes with this. The default
+  # object is an instance of Array. Available with Spider is a wrapper of
+  # memcached.
+  #
+  # You can implement a custom class for this; any object passed to
+  # check_already_seen_with must understand just << and included? .
+  #
+  #  # default
+  #  check_already_seen_with Array.new
+  #
+  #  # memcached
+  #  require 'spider/included_in_memcached'
+  #  check_already_seen_with IncludedInMemcached.new('localhost:11211')
+  def check_already_seen_with(cacher)
+    if cacher.respond_to?(:<<) && cacher.respond_to?(:include?)
+      @seen = cacher
+    else
+      raise ArgumentError, 'expected something that responds to << and included?'
+    end
+  end
+
+  # Add a response handler. A response handler's trigger can be :every,
+  # :success, :failure, or any HTTP status code. The handler itself can be
+  # either a Proc or a block.
+  #
+  # The arguments to the block are: the URL as a string, an instance of
+  # Net::HTTPResponse, and the prior URL as a string.
+  #
+  #
+  # For example:
+  #
+  #  on 404 do |a_url, resp, prior_url|
+  #    puts "URL not found: #{a_url}"
+  #  end
+  #
+  #  on :success do |a_url, resp, prior_url|
+  #    puts a_url
+  #    puts resp.body
+  #  end
+  #
+  #  on :every do |a_url, resp, prior_url|
+  #    puts "Given this code: #{resp.code}"
+  #  end
+  def on(code, p = nil, &block)
+    f = p ? p : block
+    case code
+    when Fixnum
+      @callbacks[code] = f
+    else
+      @callbacks[code.to_sym] = f
+    end
+  end
+
+  # Run before the HTTP request. Given the URL as a string.
+  #  setup do |a_url|
+  #    headers['Cookies'] = 'user_id=1;admin=true'
+  #  end
+  def setup(p = nil, &block)
+    @setup = p ? p : block
+  end
+
+  # Run last, once for each page. Given the URL as a string.
+  def teardown(p = nil, &block)
+    @teardown = p ? p : block
+  end
+
+  # Use like a hash:
+  #  headers['Cookies'] = 'user_id=1;password=btrross3'
+  def headers
+    HeaderSetter.new(self)
+  end
+
+  def raw_headers #:nodoc:
+    @headers
+  end
+  def raw_headers=(v) #:nodoc:
+    @headers = v
+  end
+
+  # Reset the headers hash.
+  def clear_headers
+    @headers = {}
+  end
+
+  def start! #:nodoc:
+    next_urls = @next_urls
+    begin
+      tmp_n_u = {}
+      next_urls.each do |prior_url, urls|
+        urls.map do |a_url|
+          [a_url, (URI.parse(a_url) rescue nil)]
+        end.select do |a_url, parsed_url|
+          allowable_url?(a_url, parsed_url)
+        end.each do |a_url, parsed_url|
+          @setup.call(a_url) unless @setup.nil?
+          get_page(parsed_url) do |response|
+            do_callbacks(a_url, response, prior_url)
+            tmp_n_u[a_url] = generate_next_urls(a_url, response)
+          end
+          @teardown.call(a_url) unless @teardown.nil?
+        end
+      end
+      next_urls = tmp_n_u
+    end while !next_urls.empty?
+  end
+
+  def success_or_failure(code) #:nodoc:
+    if code > 199 && code < 300
+      :success
+    else
+      :failure
+    end
+  end
+
+  def allowable_url?(a_url, parsed_url) #:nodoc:
+    !parsed_url.nil? && !@seen.include?(parsed_url) && allowed?(a_url, parsed_url) &&
+      @url_checks.map{|url_check|url_check.call(a_url)}.all?
+  end
+
+  # True if the robots.txt for that URL allows access to it.
+  def allowed?(a_url, parsed_url) # :nodoc:
+    u = "#{parsed_url.scheme}://#{parsed_url.host}:#{parsed_url.port}/robots.txt"
+    begin
+      unless @robots_seen.include?(u)
+        open(u, 'User-Agent' => 'Ruby Spider',
+             'Accept' => 'text/html,text/xml,application/xml,text/plain') do |url|
+          @rules.parse(u, url.read)
+        end
+        @robots_seen << u
+      end
+      @rules.allowed?(a_url)
+    rescue OpenURI::HTTPError
+      true # No robots.txt
+    rescue Exception, Timeout::Error # to keep it from crashing
+      false
+    end
+  end
+
+  def get_page(parsed_url, &block) #:nodoc:
+    @seen << parsed_url
+    begin
+      http = Net::HTTP.new(parsed_url.host, parsed_url.port)
+      http.use_ssl = parsed_url.scheme == 'https'
+      # Uses start because http.finish cannot be called.
+      r = http.start {|h| h.request(Net::HTTP::Get.new(parsed_url.request_uri,
+                                                       @headers))}
+      if r.redirect?
+        get_page(URI.parse(r['Location']), &block)
+      else
+        block.call(r)
+      end
+    rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError => e
+      p e
+      nil
+    end
+  end
+
+  def do_callbacks(a_url, resp, prior_url) #:nodoc:
+    cbs = [@callbacks[:every],
+           resp.success? ? @callbacks[:success] : @callbacks[:failure],
+           @callbacks[resp.code]]
+
+    cbs.each do |cb|
+      cb.call(a_url, resp, prior_url) if cb
+    end
+  end
+
+  def generate_next_urls(a_url, resp) #:nodoc:
+    web_page = resp.body
+    base_url = (web_page.scan(/base\s+href="(.*?)"/i).flatten +
+                [a_url[0,a_url.rindex('/')]])[0]
+    base_url = remove_trailing_slash(base_url)
+    web_page.scan(/href="(.*?)"/i).flatten.map do |link|
+      begin
+        parsed_link = URI.parse(link)
+        if parsed_link.fragment == '#'
+          nil
+        else
+          case parsed_link.scheme
+          when 'http'
+            link
+          when nil
+            u = URI.parse(base_url)
+            if link[0].chr == '/'
+              "#{u.scheme}://#{u.host}:#{u.port}#{link}"
+            elsif u.path.nil? || u.path == ''
+              "#{u.scheme}://#{u.host}:#{u.port}/#{link}"
+            else
+              "#{u.scheme}://#{u.host}:#{u.port}/#{u.path}/#{link}"
+            end
+          else
+            nil
+          end
+        end
+      rescue
+        nil
+      end
+    end.compact
+  end
+
+  def remove_trailing_slash(s) #:nodoc:
+    s.sub(%r{/*$},'')
+  end
+
+  class HeaderSetter #:nodoc:
+    def initialize(si)
+      @si = si
+    end
+    def []=(k,v)
+      @si.raw_headers = @si.raw_headers.merge({k => v})
+    end
+  end
+end
+
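The per-request hooks round out the new SpiderInstance API: setup runs before each HTTP request, teardown runs last for each page, and headers/clear_headers manage the headers sent with every request. A hedged sketch combining them, based on the rdoc examples above; the start URL and cookie value are placeholders:

  require 'spider'

  Spider.start_at('http://mike-burns.com/') do |s|
    # setup runs before the HTTP request for each URL.
    s.setup do |a_url|
      s.headers['Cookies'] = 'user_id=1;admin=true'
    end

    # teardown runs last, once per page.
    s.teardown do |a_url|
      s.clear_headers
    end

    s.on 404 do |a_url, resp, prior_url|
      puts "URL not found: #{a_url}"
    end
  end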