anemone 0.2.3 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG.rdoc +11 -1
- data/README.rdoc +2 -0
- data/lib/anemone/cli/serialize.rb +2 -2
- data/lib/anemone/core.rb +58 -66
- data/lib/anemone/http.rb +39 -28
- data/lib/anemone/page.rb +53 -59
- data/lib/anemone/{page_hash.rb → page_store.rb} +76 -58
- data/lib/anemone/storage.rb +19 -0
- data/lib/anemone/storage/pstore.rb +48 -0
- data/lib/anemone/storage/tokyo_cabinet.rb +57 -0
- data/lib/anemone/tentacle.rb +7 -7
- data/spec/anemone_spec.rb +4 -4
- data/spec/core_spec.rb +226 -163
- data/spec/http_spec.rb +23 -0
- data/spec/page_spec.rb +28 -14
- data/spec/page_store_spec.rb +128 -0
- data/spec/storage_spec.rb +123 -0
- metadata +9 -3
data/CHANGELOG.rdoc CHANGED

@@ -1,3 +1,13 @@
+== 0.3.0 / 2009-12-15
+
+* Major enchancements
+
+  * Option for persistent storage of pages during crawl with TokyoCabinet or PStore
+
+* Minor enhancements
+
+  * Options can be set via methods on the Core object in the crawl block
+
 == 0.2.3 / 2009-11-01
 
 * Minor enhancements
@@ -24,4 +34,4 @@
 * Minor enhancements
 
   * HTTP request response time recorded in Page.
-  * Use of persistent HTTP connections.
+  * Use of persistent HTTP connections.
data/README.rdoc CHANGED

@@ -15,6 +15,8 @@ See http://anemone.rubyforge.org for more information.
 * HTTPS support
 * Records response time for each page
 * CLI program can list all pages in a domain, calculate page depths, and more
+* Obey robots.txt
+* In-memory or persistent storage of pages during crawl, using TokyoCabinet or PStore
 
 == Examples
 See the scripts under the <tt>lib/anemone/cli</tt> directory for examples of several useful Anemone tasks.
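The two new README features correspond to the `:storage` option wired into core.rb below. A minimal usage sketch, assuming factory-style `Anemone::Storage.PStore` and `Anemone::Storage.TokyoCabinet` constructors inferred from the `Anemone::Storage.Hash` call in core.rb (storage.rb is added in this release, but its contents are not shown in this diff):

  require 'anemone'

  Anemone.crawl("http://www.example.com/") do |anemone|
    # persist pages to disk during the crawl instead of holding them in memory
    anemone.storage = Anemone::Storage.PStore("crawl.pstore")
    # or, if Tokyo Cabinet is installed:
    # anemone.storage = Anemone::Storage.TokyoCabinet("crawl.tch")
  end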
data/lib/anemone/cli/serialize.rb CHANGED

@@ -12,10 +12,10 @@ Usage:
 
 Synopsis:
   Crawls a site starting at the given URL and saves the resulting
-
+  PageStore object to a file using Marshal serialization.
 
 Options:
-  -o, --output filename    Filename to save
+  -o, --output filename    Filename to save PageStore to. Defaults to crawl.{Time.now}
 INFO
 exit(0)
 end
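Because the CLI now dumps a PageStore with Marshal, a saved crawl can be reloaded later. A hedged sketch; the filename is hypothetical (it is whatever -o/--output was set to, or the crawl.{Time.now} default), and page_store.rb itself is not shown in this diff:

  require 'anemone'

  # load a previously serialized PageStore back into memory
  page_store = Marshal.load(File.read("crawl.1260864000"))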
data/lib/anemone/core.rb CHANGED

@@ -2,25 +2,26 @@ require 'thread'
 require 'robots'
 require 'anemone/tentacle'
 require 'anemone/page'
-require 'anemone/
+require 'anemone/page_store'
+require 'anemone/storage'
 
 module Anemone
 
-  VERSION = '0.
+  VERSION = '0.3.0';
 
   #
   # Convenience method to start a crawl
   #
   def Anemone.crawl(urls, options = {}, &block)
     Core.crawl(urls, options, &block)
-  end
+  end
 
   class Core
-    # PageHash storing all Page objects encountered during the crawl
-    attr_reader :pages
 
+    # PageStore storing all Page objects encountered during the crawl
+    attr_reader :pages
     # Hash of options for the crawl
-
+    attr_reader :opts
 
     DEFAULT_OPTS = {
       # run 4 Tentacle threads to fetch pages
@@ -38,9 +39,18 @@ module Anemone
       # by default, don't limit the depth of the crawl
       :depth_limit => false,
       # number of times HTTP redirects will be followed
-      :redirect_limit => 5
+      :redirect_limit => 5,
+      # storage engine defaults to Hash in +process_options+ if none specified
+      :storage => nil
     }
 
+    # Create setter methods for all options to be called from the crawl block
+    DEFAULT_OPTS.keys.each do |key|
+      define_method "#{key}=" do |*args|
+        @opts[key.to_sym] = *args
+      end
+    end
+
     #
     # Initialize the crawl with starting *urls* (single URL or Array of URLs)
     # and optional *block*
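These generated setters are what the changelog's "options can be set via methods" entry refers to: each DEFAULT_OPTS key becomes a writer on Core. A usage sketch using option names visible in this diff:

  Anemone.crawl("http://www.example.com/") do |anemone|
    # instead of passing :redirect_limit => 3 in the options hash
    anemone.redirect_limit = 3
    anemone.depth_limit    = 2
    anemone.on_every_page { |page| puts page.url }
  end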
@@ -50,17 +60,15 @@ module Anemone
       @urls.each{ |url| url.path = '/' if url.path.empty? }
 
       @tentacles = []
-      @pages = PageHash.new
       @on_every_page_blocks = []
       @on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
       @skip_link_patterns = []
       @after_crawl_blocks = []
-
-      process_options opts
+      @opts = opts
 
       yield self if block_given?
     end
-
+
     #
     # Convenience method to start a new crawl
     #
@@ -70,16 +78,16 @@ module Anemone
         core.run
       end
     end
-
+
     #
-    # Add a block to be executed on the
+    # Add a block to be executed on the PageStore after the crawl
     # is finished
     #
     def after_crawl(&block)
       @after_crawl_blocks << block
       self
     end
-
+
     #
     # Add one ore more Regex patterns for URLs which should not be
     # followed
@@ -88,7 +96,7 @@ module Anemone
       @skip_link_patterns.concat [patterns].flatten.compact
       self
     end
-
+
     #
     # Add a block to be executed on every Page as they are encountered
     # during the crawl
@@ -97,7 +105,7 @@ module Anemone
       @on_every_page_blocks << block
       self
     end
-
+
     #
     # Add a block to be executed on Page objects with a URL matching
     # one or more patterns
@@ -110,7 +118,7 @@ module Anemone
       end
       self
     end
-
+
     #
     # Specify a block which will select which links to follow on each page.
     # The block should return an Array of URI objects.
@@ -119,77 +127,63 @@ module Anemone
       @focus_crawl_block = block
       self
     end
-
+
     #
     # Perform the crawl
     #
     def run
+      process_options
+
       @urls.delete_if { |url| !visit_link?(url) }
       return if @urls.empty?
-
+
       link_queue = Queue.new
       page_queue = Queue.new
 
       @opts[:threads].times do
         @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
       end
-
+
       @urls.each{ |url| link_queue.enq(url) }
 
       loop do
         page = page_queue.deq
-
-        @pages[page.url] = page
-
+        @pages.touch_key page.url
         puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
-
-        # perform the on_every_page blocks for this page
-        do_page_blocks(page)
-
+        do_page_blocks page
         page.discard_doc! if @opts[:discard_page_bodies]
-
-        links_to_follow
-
-
-        end
-
-        # create an entry in the page hash for each alias of this page,
-        # i.e. all the pages that redirected to this page
-        page.aliases.each do |aka|
-          if !@pages.has_key?(aka) or @pages[aka].nil?
-            @pages[aka] = page.alias_clone(aka)
-          end
-          @pages[aka].add_alias!(page.url)
+
+        links = links_to_follow page
+        links.each do |link|
+          link_queue << [link, page.url.dup, page.depth + 1]
         end
-
+        @pages.touch_keys links
+
+        @pages[page.url] = page
+
         # if we are done with the crawl, tell the threads to end
         if link_queue.empty? and page_queue.empty?
           until link_queue.num_waiting == @tentacles.size
             Thread.pass
           end
-
           if page_queue.empty?
-            @tentacles.size.times { link_queue
+            @tentacles.size.times { link_queue << :END }
             break
           end
         end
-
       end
 
       @tentacles.each { |t| t.join }
-
-      do_after_crawl_blocks()
-
+      do_after_crawl_blocks
       self
     end
-
-    private
 
-
-      @opts = DEFAULT_OPTS.merge options
+    private
 
+    def process_options
+      @opts = DEFAULT_OPTS.merge @opts
       @opts[:threads] = 1 if @opts[:delay] > 0
-
+      @pages = PageStore.new(@opts[:storage] || Anemone::Storage.Hash)
       @robots = Robots.new(@opts[:user_agent]) if @opts[:obey_robots_txt]
     end
 
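In the new loop, `touch_key`/`touch_keys` replace the old direct hash writes: a URL is marked as seen the moment it is dequeued or its links are enqueued, so the `has_page?` test in `visit_link?` rejects duplicates before the page body ever arrives. page_store.rb is not shown in this diff, so the following is only a plausible minimal sketch of those two methods, assuming they reserve the key with a placeholder Page:

  # Hypothetical sketch -- not necessarily the actual PageStore implementation
  class PageStore
    def touch_key(key)
      self[key] = Page.new(key)   # reserve the slot so has_page?(key) is true
    end

    def touch_keys(keys)
      keys.each { |key| touch_key(key) }
    end
  end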
@@ -197,9 +191,9 @@ module Anemone
     # Execute the after_crawl blocks
     #
     def do_after_crawl_blocks
-      @after_crawl_blocks.each {|b| b.call(@pages)}
+      @after_crawl_blocks.each { |b| b.call(@pages) }
     end
-
+
     #
     # Execute the on_every_page blocks for *page*
     #
@@ -207,14 +201,12 @@ module Anemone
       @on_every_page_blocks.each do |blk|
         blk.call(page)
       end
-
+
       @on_pages_like_blocks.each do |pattern, blks|
-        if page.url.to_s =~ pattern
-          blks.each { |blk| blk.call(page) }
-        end
+        blks.each { |blk| blk.call(page) } if page.url.to_s =~ pattern
       end
-    end
-
+    end
+
     #
     # Return an Array of links to follow from the given page.
     # Based on whether or not the link has already been crawled,
@@ -222,9 +214,9 @@ module Anemone
     #
     def links_to_follow(page)
       links = @focus_crawl_block ? @focus_crawl_block.call(page) : page.links
-      links.select { |link| visit_link?(link, page) }
+      links.select { |link| visit_link?(link, page) }.map { |link| link.dup }
     end
-
+
     #
     # Returns +true+ if *link* has not been visited already,
     # and is not excluded by a skip_link pattern...
@@ -234,16 +226,16 @@ module Anemone
     #
     def visit_link?(link, from_page = nil)
       allowed = @opts[:obey_robots_txt] ? @robots.allowed?(link) : true
-
+
       if from_page && @opts[:depth_limit]
         too_deep = from_page.depth >= @opts[:depth_limit]
       else
         too_deep = false
       end
-
+
       !@pages.has_page?(link) && !skip_link?(link) && allowed && !too_deep
     end
-
+
     #
     # Returns +true+ if *link* should not be visited because
     # its URL matches a skip_link pattern.
@@ -251,6 +243,6 @@ module Anemone
     def skip_link?(link)
       @skip_link_patterns.any? { |p| link.path =~ p }
     end
-
+
   end
 end
data/lib/anemone/http.rb CHANGED

@@ -12,62 +12,73 @@ module Anemone
     end
 
     #
-    #
+    # Fetch a single Page from the response of an HTTP request to *url*.
+    # Just gets the final destination page.
     #
-    def fetch_page(url,
+    def fetch_page(url, referer = nil, depth = nil)
+      fetch_pages(url, referer, depth).last
+    end
+
+    #
+    # Create new Pages from the response of an HTTP request to *url*,
+    # including redirects
+    #
+    def fetch_pages(url, referer = nil, depth = nil)
       begin
         url = URI(url) unless url.is_a?(URI)
-
-
-
-
-
-
-
-
-
-        if !url.eql?(location)
-          aka = location
+        pages = []
+        get(url, referer) do |response, code, location, redirect_to, response_time|
+          pages << Page.new(location, :body => response.body.dup,
+                                      :code => code,
+                                      :headers => response.to_hash,
+                                      :referer => referer,
+                                      :depth => depth,
+                                      :redirect_to => redirect_to,
+                                      :response_time => response_time)
         end
 
-        return
+        return pages
       rescue => e
         if verbose?
           puts e.inspect
           puts e.backtrace
-        end
-        return Page.new(url)
+        end
+        return [Page.new(url, :error => e)]
       end
     end
 
     private
 
     #
-    # Retrieve
-    #
-    #
+    # Retrieve HTTP responses for *url*, including redirects.
+    # Yields the response object, response code, and URI location
+    # for each response.
+    #
     def get(url, referer = nil)
       response, response_time = get_response(url, referer)
       code = Integer(response.code)
       loc = url
-
+      redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']) : nil
+      yield response, code, loc, redirect_to, response_time
+
       limit = redirect_limit
       while response.is_a?(Net::HTTPRedirection) and limit > 0
-        loc =
+        loc = redirect_to
         loc = url.merge(loc) if loc.relative?
         response, response_time = get_response(loc, referer)
+        code = Integer(response.code)
+        redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']) : nil
+        yield response, code, loc, redirect_to, response_time
         limit -= 1
       end
-
-      return response, code, loc, response_time
     end
-
+
     #
     # Get an HTTPResponse for *url*, sending the appropriate User-Agent string
     #
     def get_response(url, referer = nil)
       full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"
-
+
       opts = {}
       opts['User-Agent'] = user_agent if user_agent
       opts['Referer'] = referer.to_s if referer
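With this change a redirect chain yields one Page per hop rather than a single aliased Page, and `fetch_page` keeps its old single-page contract by taking the last element. A usage sketch, assuming `Anemone::HTTP.new` accepts no arguments and that Page exposes readers for the attributes passed to `Page.new` above (neither is shown in this excerpt; the URL is a placeholder):

  http = Anemone::HTTP.new

  # one Page per response in the redirect chain, final destination last
  pages = http.fetch_pages("http://www.example.com/old-path")
  pages.each { |page| puts "#{page.url} #{page.code} -> #{page.redirect_to}" }

  # equivalent to pages.last
  page = http.fetch_page("http://www.example.com/old-path")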
@@ -78,7 +89,7 @@ module Anemone
         response = connection(url).get(full_path, opts)
         finish = Time.now()
         response_time = ((finish - start) * 1000).round
-        return response, response_time
+        return response, response_time
       rescue EOFError
         refresh_connection(url)
         retries += 1
@@ -93,7 +104,7 @@ module Anemone
         return conn
       end
 
-      refresh_connection
+      refresh_connection url
     end
 
     def refresh_connection(url)
@@ -102,7 +113,7 @@ module Anemone
         http.use_ssl = true
         http.verify_mode = OpenSSL::SSL::VERIFY_NONE
       end
-      @connections[url.host][url.port] = http.start
+      @connections[url.host][url.port] = http.start
     end
 
     def redirect_limit