anemone 0.2.3 → 0.3.0

@@ -1,3 +1,13 @@
+ == 0.3.0 / 2009-12-15
+
+ * Major enhancements
+
+ * Option for persistent storage of pages during crawl with TokyoCabinet or PStore
+
+ * Minor enhancements
+
+ * Options can be set via methods on the Core object in the crawl block
+
  == 0.2.3 / 2009-11-01

  * Minor enhancements
@@ -24,4 +34,4 @@
  * Minor enhancements

  * HTTP request response time recorded in Page.
- * Use of persistent HTTP connections.
+ * Use of persistent HTTP connections.
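
The two headline changes above work together: option keys now get setter methods on the Core object, so a crawl block can configure the crawl (including the new :storage engine) without building an options hash. A minimal sketch, with illustrative values:

    require 'anemone'

    Anemone.crawl("http://www.example.com/") do |anemone|
      # generated setters exist for each DEFAULT_OPTS key, e.g. depth_limit=, verbose=
      anemone.verbose     = true
      anemone.depth_limit = 2
      anemone.on_every_page { |page| puts page.url }
    end
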
@@ -15,6 +15,8 @@ See http://anemone.rubyforge.org for more information.
  * HTTPS support
  * Records response time for each page
  * CLI program can list all pages in a domain, calculate page depths, and more
+ * Obey robots.txt
+ * In-memory or persistent storage of pages during crawl, using TokyoCabinet or PStore

  == Examples
  See the scripts under the <tt>lib/anemone/cli</tt> directory for examples of several useful Anemone tasks.
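
The storage feature listed above is selected through the new :storage option; when it is left nil the crawl falls back to Anemone::Storage.Hash (visible in the core.rb diff below). A sketch of switching to persistent storage, assuming Anemone::Storage exposes PStore and TokyoCabinet factory methods analogous to the Hash one:

    require 'anemone'

    Anemone.crawl("http://www.example.com/") do |anemone|
      # assumed factory method; keeps crawled pages in a PStore file instead of memory
      anemone.storage = Anemone::Storage.PStore('crawl.pstore')
      # or, assuming the TokyoCabinet backend and gem are available:
      # anemone.storage = Anemone::Storage.TokyoCabinet('crawl.tch')
    end
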
@@ -12,10 +12,10 @@ Usage:

  Synopsis:
  Crawls a site starting at the given URL and saves the resulting
- PageHash object to a file using Marshal serialization.
+ PageStore object to a file using Marshal serialization.

  Options:
- -o, --output filename Filename to save PageHash to. Defaults to crawl.{Time.now}
+ -o, --output filename Filename to save PageStore to. Defaults to crawl.{Time.now}
  INFO
  exit(0)
  end
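
The renamed PageStore is still dumped with plain Marshal, so a saved crawl can be read back the same way the old PageHash was; a sketch, with a hypothetical dump file name (the CLI defaults to crawl.{Time.now}):

    require 'anemone'

    # 'crawl.dump' is a stand-in for whatever --output was set to
    page_store = File.open('crawl.dump', 'rb') { |io| Marshal.load(io) }
    puts page_store.has_page?(URI('http://www.example.com/'))
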
@@ -2,25 +2,26 @@ require 'thread'
  require 'robots'
  require 'anemone/tentacle'
  require 'anemone/page'
- require 'anemone/page_hash'
+ require 'anemone/page_store'
+ require 'anemone/storage'

  module Anemone

- VERSION = '0.2.3';
+ VERSION = '0.3.0';

  #
  # Convenience method to start a crawl
  #
  def Anemone.crawl(urls, options = {}, &block)
  Core.crawl(urls, options, &block)
- end
+ end

  class Core
- # PageHash storing all Page objects encountered during the crawl
- attr_reader :pages

+ # PageStore storing all Page objects encountered during the crawl
+ attr_reader :pages
  # Hash of options for the crawl
- attr_accessor :opts
+ attr_reader :opts

  DEFAULT_OPTS = {
  # run 4 Tentacle threads to fetch pages
@@ -38,9 +39,18 @@ module Anemone
  # by default, don't limit the depth of the crawl
  :depth_limit => false,
  # number of times HTTP redirects will be followed
- :redirect_limit => 5
+ :redirect_limit => 5,
+ # storage engine defaults to Hash in +process_options+ if none specified
+ :storage => nil
  }

+ # Create setter methods for all options to be called from the crawl block
+ DEFAULT_OPTS.keys.each do |key|
+ define_method "#{key}=" do |*args|
+ @opts[key.to_sym] = *args
+ end
+ end
+
  #
  # Initialize the crawl with starting *urls* (single URL or Array of URLs)
  # and optional *block*
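
The define_method loop above is shorthand for writing one setter per option key; for :redirect_limit, for example, the generated method behaves roughly like this hand-written version (a sketch of the equivalent, not extra code in the gem):

    # what the loop generates for :redirect_limit, written out by hand
    def redirect_limit=(value)
      @opts[:redirect_limit] = value
    end

Because the setters only touch @opts, they are intended to be called from the crawl block, before run merges in the defaults.
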
@@ -50,17 +60,15 @@ module Anemone
  @urls.each{ |url| url.path = '/' if url.path.empty? }

  @tentacles = []
- @pages = PageHash.new
  @on_every_page_blocks = []
  @on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
  @skip_link_patterns = []
  @after_crawl_blocks = []
-
- process_options opts
+ @opts = opts

  yield self if block_given?
  end
-
+
  #
  # Convenience method to start a new crawl
  #
@@ -70,16 +78,16 @@ module Anemone
  core.run
  end
  end
-
+
  #
- # Add a block to be executed on the PageHash after the crawl
+ # Add a block to be executed on the PageStore after the crawl
  # is finished
  #
  def after_crawl(&block)
  @after_crawl_blocks << block
  self
  end
-
+
  #
  # Add one ore more Regex patterns for URLs which should not be
  # followed
@@ -88,7 +96,7 @@ module Anemone
  @skip_link_patterns.concat [patterns].flatten.compact
  self
  end
-
+
  #
  # Add a block to be executed on every Page as they are encountered
  # during the crawl
@@ -97,7 +105,7 @@ module Anemone
  @on_every_page_blocks << block
  self
  end
-
+
  #
  # Add a block to be executed on Page objects with a URL matching
  # one or more patterns
@@ -110,7 +118,7 @@ module Anemone
  end
  self
  end
-
+
  #
  # Specify a block which will select which links to follow on each page.
  # The block should return an Array of URI objects.
@@ -119,77 +127,63 @@ module Anemone
  @focus_crawl_block = block
  self
  end
-
+
  #
  # Perform the crawl
  #
  def run
+ process_options
+
  @urls.delete_if { |url| !visit_link?(url) }
  return if @urls.empty?
-
+
  link_queue = Queue.new
  page_queue = Queue.new

  @opts[:threads].times do
  @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
  end
-
+
  @urls.each{ |url| link_queue.enq(url) }

  loop do
  page = page_queue.deq
-
- @pages[page.url] = page
-
+ @pages.touch_key page.url
  puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
-
- # perform the on_every_page blocks for this page
- do_page_blocks(page)
-
+ do_page_blocks page
  page.discard_doc! if @opts[:discard_page_bodies]
-
- links_to_follow(page).each do |link|
- link_queue.enq([link, page])
- @pages[link] = nil
- end
-
- # create an entry in the page hash for each alias of this page,
- # i.e. all the pages that redirected to this page
- page.aliases.each do |aka|
- if !@pages.has_key?(aka) or @pages[aka].nil?
- @pages[aka] = page.alias_clone(aka)
- end
- @pages[aka].add_alias!(page.url)
+
+ links = links_to_follow page
+ links.each do |link|
+ link_queue << [link, page.url.dup, page.depth + 1]
  end
-
+ @pages.touch_keys links
+
+ @pages[page.url] = page
+
  # if we are done with the crawl, tell the threads to end
  if link_queue.empty? and page_queue.empty?
  until link_queue.num_waiting == @tentacles.size
  Thread.pass
  end
-
  if page_queue.empty?
- @tentacles.size.times { link_queue.enq(:END)}
+ @tentacles.size.times { link_queue << :END }
  break
  end
  end
-
  end

  @tentacles.each { |t| t.join }
-
- do_after_crawl_blocks()
-
+ do_after_crawl_blocks
  self
  end
-
- private

- def process_options(options)
- @opts = DEFAULT_OPTS.merge options
+ private

+ def process_options
+ @opts = DEFAULT_OPTS.merge @opts
  @opts[:threads] = 1 if @opts[:delay] > 0
-
+ @pages = PageStore.new(@opts[:storage] || Anemone::Storage.Hash)
  @robots = Robots.new(@opts[:user_agent]) if @opts[:obey_robots_txt]
  end
  end
@@ -197,9 +191,9 @@ module Anemone
  # Execute the after_crawl blocks
  #
  def do_after_crawl_blocks
- @after_crawl_blocks.each {|b| b.call(@pages)}
+ @after_crawl_blocks.each { |b| b.call(@pages) }
  end
-
+
  #
  # Execute the on_every_page blocks for *page*
  #
@@ -207,14 +201,12 @@ module Anemone
  @on_every_page_blocks.each do |blk|
  blk.call(page)
  end
-
+
  @on_pages_like_blocks.each do |pattern, blks|
- if page.url.to_s =~ pattern
- blks.each { |blk| blk.call(page) }
- end
+ blks.each { |blk| blk.call(page) } if page.url.to_s =~ pattern
  end
- end
-
+ end
+
  #
  # Return an Array of links to follow from the given page.
  # Based on whether or not the link has already been crawled,
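
do_page_blocks is what fires the user-supplied callbacks, so the simplified matching above is exercised by crawl blocks like this sketch (the URL and pattern are illustrative):

    Anemone.crawl("http://www.example.com/") do |anemone|
      anemone.on_every_page do |page|
        puts page.url
      end
      # only runs for pages whose URL matches the pattern
      anemone.on_pages_like(%r{/articles/}) do |page|
        puts "article at depth #{page.depth}: #{page.url}"
      end
    end
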
@@ -222,9 +214,9 @@ module Anemone
  #
  def links_to_follow(page)
  links = @focus_crawl_block ? @focus_crawl_block.call(page) : page.links
- links.select { |link| visit_link?(link, page) }
+ links.select { |link| visit_link?(link, page) }.map { |link| link.dup }
  end
-
+
  #
  # Returns +true+ if *link* has not been visited already,
  # and is not excluded by a skip_link pattern...
@@ -234,16 +226,16 @@ module Anemone
  #
  def visit_link?(link, from_page = nil)
  allowed = @opts[:obey_robots_txt] ? @robots.allowed?(link) : true
-
+
  if from_page && @opts[:depth_limit]
  too_deep = from_page.depth >= @opts[:depth_limit]
  else
  too_deep = false
  end
-
+
  !@pages.has_page?(link) && !skip_link?(link) && allowed && !too_deep
  end
-
+
  #
  # Returns +true+ if *link* should not be visited because
  # its URL matches a skip_link pattern.
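
visit_link? is where :obey_robots_txt and :depth_limit actually take effect, so turning them on is just a matter of setting the options; a sketch using the new setters, assuming :obey_robots_txt is among the DEFAULT_OPTS keys as its use in process_options suggests:

    Anemone.crawl("http://www.example.com/") do |anemone|
      anemone.obey_robots_txt = true   # consults the robots gem before queueing a link
      anemone.depth_limit     = 3      # links more than 3 hops from a start URL are skipped
    end
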
@@ -251,6 +243,6 @@ module Anemone
  def skip_link?(link)
  @skip_link_patterns.any? { |p| link.path =~ p }
  end
-
+
  end
  end
@@ -12,62 +12,73 @@ module Anemone
  end

  #
- # Create a new Page from the response of an HTTP request to *url*
+ # Fetch a single Page from the response of an HTTP request to *url*.
+ # Just gets the final destination page.
  #
- def fetch_page(url, from_page = nil)
+ def fetch_page(url, referer = nil, depth = nil)
+ fetch_pages(url, referer, depth).last
+ end
+
+ #
+ # Create new Pages from the response of an HTTP request to *url*,
+ # including redirects
+ #
+ def fetch_pages(url, referer = nil, depth = nil)
  begin
  url = URI(url) unless url.is_a?(URI)
-
- if from_page
- referer = from_page.url
- depth = from_page.depth + 1
- end
-
- response, code, location, response_time = get(url, referer)
-
- aka = nil
- if !url.eql?(location)
- aka = location
+ pages = []
+ get(url, referer) do |response, code, location, redirect_to, response_time|
+ pages << Page.new(location, :body => response.body.dup,
+ :code => code,
+ :headers => response.to_hash,
+ :referer => referer,
+ :depth => depth,
+ :redirect_to => redirect_to,
+ :response_time => response_time)
  end

- return Page.new(url, response.body.dup, code, response.to_hash, aka, referer, depth, response_time)
+ return pages
  rescue => e
  if verbose?
  puts e.inspect
  puts e.backtrace
- end
- return Page.new(url)
+ end
+ return [Page.new(url, :error => e)]
  end
  end

  private

  #
- # Retrieve an HTTP response for *url*, following redirects.
- # Returns the response object, response code, and final URI location.
- #
+ # Retrieve HTTP responses for *url*, including redirects.
+ # Yields the response object, response code, and URI location
+ # for each response.
+ #
  def get(url, referer = nil)
  response, response_time = get_response(url, referer)
  code = Integer(response.code)
  loc = url
-
+ redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']) : nil
+ yield response, code, loc, redirect_to, response_time
+
  limit = redirect_limit
  while response.is_a?(Net::HTTPRedirection) and limit > 0
- loc = URI(response['location'])
+ loc = redirect_to
  loc = url.merge(loc) if loc.relative?
  response, response_time = get_response(loc, referer)
+ code = Integer(response.code)
+ redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']) : nil
+ yield response, code, loc, redirect_to, response_time
  limit -= 1
  end
-
- return response, code, loc, response_time
  end
-
+
  #
  # Get an HTTPResponse for *url*, sending the appropriate User-Agent string
  #
  def get_response(url, referer = nil)
  full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"
-
+
  opts = {}
  opts['User-Agent'] = user_agent if user_agent
  opts['Referer'] = referer.to_s if referer
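
get now yields once per hop instead of returning only the final response, which is how fetch_pages can build a Page for every step of a redirect chain. A self-contained sketch of the same yield-per-response shape using plain Net::HTTP (not Anemone's code; relative Location handling is omitted for brevity):

    require 'net/http'
    require 'uri'

    # Follow redirects up to +limit+ hops, yielding every response along the way.
    def each_response(url, limit = 5)
      loop do
        response = Net::HTTP.get_response(URI(url))
        redirect_to = response.is_a?(Net::HTTPRedirection) ? response['location'] : nil
        yield response, url, redirect_to
        break if redirect_to.nil? || (limit -= 1) <= 0
        url = redirect_to
      end
    end

    each_response('http://www.example.com/') do |response, url, redirect_to|
      puts "#{response.code} #{url}#{redirect_to ? " -> #{redirect_to}" : ''}"
    end
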
@@ -78,7 +89,7 @@ module Anemone
  response = connection(url).get(full_path, opts)
  finish = Time.now()
  response_time = ((finish - start) * 1000).round
- return response, response_time
+ return response, response_time
  rescue EOFError
  refresh_connection(url)
  retries += 1
@@ -93,7 +104,7 @@ module Anemone
  return conn
  end

- refresh_connection(url)
+ refresh_connection url
  end

  def refresh_connection(url)
@@ -102,7 +113,7 @@ module Anemone
  http.use_ssl = true
  http.verify_mode = OpenSSL::SSL::VERIFY_NONE
  end
- @connections[url.host][url.port] = http.start
+ @connections[url.host][url.port] = http.start
  end

  def redirect_limit
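
The per-host, per-port connection cache is what keeps Anemone's HTTP connections persistent between requests; the idea in isolation looks roughly like this sketch (not the class's full retry logic):

    require 'net/http'
    require 'uri'

    # one open Net::HTTP connection per host/port pair, like @connections above
    connections = Hash.new { |hash, host| hash[host] = {} }

    uri  = URI('http://www.example.com/')
    http = (connections[uri.host][uri.port] ||= Net::HTTP.start(uri.host, uri.port))

    puts http.get('/').code   # later requests to the same host and port reuse this connection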