spk-anemone 0.2.4 → 0.3.0
This diff shows the content of publicly released package versions as published to their respective public registries, and is provided for informational purposes only.
- data/CHANGELOG.rdoc +10 -0
- data/README.rdoc +2 -0
- data/lib/anemone/cli/serialize.rb +2 -2
- data/lib/anemone/core.rb +43 -53
- data/lib/anemone/http.rb +32 -21
- data/lib/anemone/page.rb +43 -50
- data/lib/anemone/{page_hash.rb → page_store.rb} +76 -58
- data/lib/anemone/storage.rb +19 -0
- data/lib/anemone/storage/pstore.rb +48 -0
- data/lib/anemone/storage/tokyo_cabinet.rb +57 -0
- data/lib/anemone/tentacle.rb +7 -7
- data/spec/anemone_spec.rb +4 -4
- data/spec/core_spec.rb +226 -163
- data/spec/http_spec.rb +23 -0
- data/spec/page_spec.rb +28 -14
- data/spec/page_store_spec.rb +128 -0
- data/spec/storage_spec.rb +123 -0
- metadata +10 -5
data/CHANGELOG.rdoc
CHANGED
@@ -1,3 +1,13 @@
+== 0.3.0 / 2009-12-15
+
+* Major enhancements
+
+  * Option for persistent storage of pages during crawl with TokyoCabinet or PStore
+
+* Minor enhancements
+
+  * Options can be set via methods on the Core object in the crawl block
+
 == 0.2.4 / 2009-11-26
 
 * Minor enhancements
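
Taken together, the 0.3.0 additions are used like this. A minimal sketch, assuming the Anemone::Storage factory methods added in data/lib/anemone/storage.rb (listed in the file summary above; the URL and filename are illustrative):

    require 'anemone'

    Anemone.crawl("http://www.example.com") do |anemone|
      # minor enhancement: options are now plain setters inside the crawl block
      anemone.verbose = true
      # major enhancement: persist pages to disk instead of an in-memory Hash
      anemone.storage = Anemone::Storage.PStore('crawl.pstore')
    end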
data/README.rdoc
CHANGED
@@ -15,6 +15,8 @@ See http://anemone.rubyforge.org for more information.
 * HTTPS support
 * Records response time for each page
 * CLI program can list all pages in a domain, calculate page depths, and more
+* Obey robots.txt
+* In-memory or persistent storage of pages during crawl, using TokyoCabinet or PStore
 
 == Examples
 See the scripts under the <tt>lib/anemone/cli</tt> directory for examples of several useful Anemone tasks.
data/lib/anemone/cli/serialize.rb
CHANGED
@@ -12,10 +12,10 @@ Usage:
 
 Synopsis:
   Crawls a site starting at the given URL and saves the resulting
-  PageHash object to a file using Marshal serialization.
+  PageStore object to a file using Marshal serialization.
 
 Options:
-  -o, --output filename   Filename to save PageHash to. Defaults to crawl.{Time.now}
+  -o, --output filename   Filename to save PageStore to. Defaults to crawl.{Time.now}
 INFO
 exit(0)
 end
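
Since the serialized object is now a PageStore rather than a PageHash, a dump written by this CLI task can be read back with Marshal. A sketch (the filename is hypothetical; the default is timestamped as noted above):

    # reload a crawl saved with the serialize CLI task
    page_store = Marshal.load(File.read('crawl.dump'))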
data/lib/anemone/core.rb
CHANGED
@@ -2,11 +2,12 @@ require 'thread'
 require 'robots'
 require 'anemone/tentacle'
 require 'anemone/page'
-require 'anemone/page_hash'
+require 'anemone/page_store'
+require 'anemone/storage'
 
 module Anemone
 
-  VERSION = '0.2.4';
+  VERSION = '0.3.0';
 
   #
   # Convenience method to start a crawl
@@ -16,11 +17,11 @@ module Anemone
   end
 
   class Core
-    # PageHash storing all Page objects encountered during the crawl
-    attr_reader :pages
 
+    # PageStore storing all Page objects encountered during the crawl
+    attr_reader :pages
     # Hash of options for the crawl
-    attr_accessor :opts
+    attr_reader :opts
 
     DEFAULT_OPTS = {
       # run 4 Tentacle threads to fetch pages
@@ -39,29 +40,33 @@ module Anemone
       :depth_limit => false,
       # number of times HTTP redirects will be followed
       :redirect_limit => 5,
+      # storage engine defaults to Hash in +process_options+ if none specified
+      :storage => nil,
       # Authentication
       :authorization => nil,
     }
 
+    # Create setter methods for all options to be called from the crawl block
+    DEFAULT_OPTS.keys.each do |key|
+      define_method "#{key}=" do |*args|
+        @opts[key.to_sym] = *args
+      end
+    end
+
     #
     # Initialize the crawl with starting *urls* (single URL or Array of URLs)
    # and optional *block*
     #
     def initialize(urls, opts = {})
-      process_options opts
-
       @urls = [urls].flatten.map{ |url| url.is_a?(URI) ? url : URI(url) }
-      @urls.each{ |url|
-        url.path = '/' if url.path.empty?
-        authorization(url) if url.user
-      }
+      @urls.each{ |url| url.path = '/' if url.path.empty? }
 
       @tentacles = []
-      @pages = PageHash.new
       @on_every_page_blocks = []
       @on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
       @skip_link_patterns = []
       @after_crawl_blocks = []
+      @opts = opts
 
       yield self if block_given?
     end
@@ -77,7 +82,7 @@ module Anemone
     end
 
     #
-    # Add a block to be executed on the PageHash after the crawl
+    # Add a block to be executed on the PageStore after the crawl
     # is finished
     #
     def after_crawl(&block)
@@ -129,6 +134,8 @@ module Anemone
     # Perform the crawl
     #
     def run
+      process_options
+
       @urls.delete_if { |url| !visit_link?(url) }
       return if @urls.empty?
 
@@ -139,81 +146,66 @@ module Anemone
         @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
       end
 
-      @urls.each{ |url| link_queue.enq(url) }
+      @urls.each{ |url|
+        link_queue.enq(url)
+        authorization(url) if url.user
+      }
 
       loop do
         page = page_queue.deq
-
-        @pages[page.url] = page
-
+        @pages.touch_key page.url
         puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
-
-        # perform the on_every_page blocks for this page
-        do_page_blocks(page)
-
+        do_page_blocks page
         page.discard_doc! if @opts[:discard_page_bodies]
 
-        links_to_follow(page).each do |link|
-          link_queue.enq([link, page])
-          @pages[link] = nil
+        links = links_to_follow page
+        links.each do |link|
+          link_queue << [link, page.url.dup, page.depth + 1]
         end
+        @pages.touch_keys links
 
-        # create an entry in the page hash for each alias of this page,
-        # i.e. all the pages that redirected to this page
-        page.aliases.each do |aka|
-          if !@pages.has_key?(aka) or @pages[aka].nil?
-            @pages[aka] = page.alias_clone(aka)
-          end
-          @pages[aka].add_alias!(page.url)
-        end
+        @pages[page.url] = page
 
         # if we are done with the crawl, tell the threads to end
         if link_queue.empty? and page_queue.empty?
           until link_queue.num_waiting == @tentacles.size
             Thread.pass
           end
-
           if page_queue.empty?
-            @tentacles.size.times { link_queue.enq(:END) }
+            @tentacles.size.times { link_queue << :END }
             break
           end
         end
-
       end
 
       @tentacles.each { |t| t.join }
-
-      do_after_crawl_blocks()
-
+      do_after_crawl_blocks
      self
    end
 
    private
 
-    def process_options(options)
-      @opts = DEFAULT_OPTS.merge(options)
-
-      authorization(@opts[:authorization])
-
+    def process_options
+      @opts = DEFAULT_OPTS.merge @opts
+      authorization(@opts[:authorization]) if @opts[:authorization]
       @opts[:threads] = 1 if @opts[:delay] > 0
-
+      @pages = PageStore.new(@opts[:storage] || Anemone::Storage.Hash)
       @robots = Robots.new(@opts[:user_agent]) if @opts[:obey_robots_txt]
    end
 
    # Generate Authorization string only if not already set
    def authorization(auth=nil)
-      return if @opts[:authorization] =~ /^Basic .*/
      require 'base64'
      if auth.is_a?(String) && auth.include?(':')
-        @opts[:authorization] = "Basic #{Base64.b64encode(auth)}"
+        self.authorization = "Basic #{Base64.b64encode(auth)}"
      elsif auth.is_a?(Array)
        user = auth.first
        password = auth.last
-        @opts[:authorization] = "Basic #{Base64.b64encode(user+":"+password)}"
+        self.authorization = "Basic #{Base64.b64encode(user+":"+password)}"
      elsif auth.is_a?(URI)
        user = auth.user
        password = auth.password
-        @opts[:authorization] = "Basic #{Base64.b64encode(user+":"+password)}"
+        self.authorization = "Basic #{Base64.b64encode(user+":"+password)}"
      end
    end
 
@@ -221,7 +213,7 @@ module Anemone
    # Execute the after_crawl blocks
    #
    def do_after_crawl_blocks
-      @after_crawl_blocks.each {|b| b.call(@pages)}
+      @after_crawl_blocks.each { |b| b.call(@pages) }
    end
 
    #
@@ -233,9 +225,7 @@ module Anemone
      end
 
      @on_pages_like_blocks.each do |pattern, blks|
-        if page.url.to_s =~ pattern
-          blks.each { |blk| blk.call(page) }
-        end
+        blks.each { |blk| blk.call(page) } if page.url.to_s =~ pattern
      end
    end
 
@@ -246,7 +236,7 @@ module Anemone
    #
    def links_to_follow(page)
      links = @focus_crawl_block ? @focus_crawl_block.call(page) : page.links
-      links.select { |link| visit_link?(link, page) }
+      links.select { |link| visit_link?(link, page) }.map { |link| link.dup }
    end
 
    #
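
Because the generated setters write into @opts and process_options now runs at the start of run rather than in initialize, options assigned inside the crawl block are merged over DEFAULT_OPTS before any page is fetched. A sketch of the effect (the URL and option values are illustrative only):

    Anemone.crawl("http://www.example.com") do |anemone|
      anemone.threads = 2       # calls the setter generated from DEFAULT_OPTS.keys
      anemone.depth_limit = 3
      anemone.delay = 1         # process_options will then force threads back to 1
    end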
data/lib/anemone/http.rb
CHANGED
@@ -12,54 +12,65 @@ module Anemone
     end
 
     #
-    # Create a new Page from the response of an HTTP request to *url*
+    # Fetch a single Page from the response of an HTTP request to *url*.
+    # Just gets the final destination page.
     #
-    def fetch_page(url, from_page = nil)
+    def fetch_page(url, referer = nil, depth = nil)
+      fetch_pages(url, referer, depth).last
+    end
+
+    #
+    # Create new Pages from the response of an HTTP request to *url*,
+    # including redirects
+    #
+    def fetch_pages(url, referer = nil, depth = nil)
       begin
         url = URI(url) unless url.is_a?(URI)
-
-        if from_page
-          referer = from_page.url
-          depth = from_page.depth + 1
+        pages = []
+        get(url, referer) do |response, code, location, redirect_to, response_time|
+          pages << Page.new(location, :body => response.body.dup,
+                                      :code => code,
+                                      :headers => response.to_hash,
+                                      :referer => referer,
+                                      :depth => depth,
+                                      :redirect_to => redirect_to,
+                                      :response_time => response_time)
         end
 
-        response, code, location, response_time = get(url, referer)
-
-        aka = nil
-        if !url.eql?(location)
-          aka = location
-        end
-
-        return Page.new(url, response.body.dup, code, response.to_hash, aka, referer, depth, response_time)
+        return pages
       rescue => e
         if verbose?
           puts e.inspect
           puts e.backtrace
         end
-        return Page.new(url)
+        return [Page.new(url, :error => e)]
       end
     end
 
     private
 
     #
-    # Retrieve an HTTP response for *url*, following redirects.
-    # Returns the response object, response code, and final URI location.
+    # Retrieve HTTP responses for *url*, including redirects.
+    # Yields the response object, response code, and URI location
+    # for each response.
     #
     def get(url, referer = nil)
       response, response_time = get_response(url, referer)
       code = Integer(response.code)
       loc = url
+      redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']) : nil
+      yield response, code, loc, redirect_to, response_time
 
       limit = redirect_limit
       while response.is_a?(Net::HTTPRedirection) and limit > 0
-        loc = URI(response['location'])
+        loc = redirect_to
         loc = url.merge(loc) if loc.relative?
         response, response_time = get_response(loc, referer)
+        code = Integer(response.code)
+        redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']) : nil
+        yield response, code, loc, redirect_to, response_time
         limit -= 1
       end
-
-      return response, code, loc, response_time
     end
 
     #
@@ -94,7 +105,7 @@ module Anemone
         return conn
       end
 
-      refresh_connection(url)
+      refresh_connection url
     end
 
     def refresh_connection(url)
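
Each redirect hop now yields its own Page, linked forward by redirect_to, and fetch_page simply returns the last element of that chain. A sketch of the resulting behavior (the URLs and the 301 response are hypothetical):

    http = Anemone::HTTP.new
    pages = http.fetch_pages('http://example.com/old')   # /old 301s to /new
    pages.map { |page| [page.code, page.url.to_s] }
    # => [[301, "http://example.com/old"], [200, "http://example.com/new"]]
    pages.first.redirect_to                              # URI for /new
    http.fetch_page('http://example.com/old').code       # => 200, the final page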
data/lib/anemone/page.rb
CHANGED
@@ -8,21 +8,21 @@ module Anemone
     attr_reader :url
     # Headers of the HTTP response
     attr_reader :headers
+    # URL of the page this one redirected to, if any
+    attr_reader :redirect_to
+    # Exception object, if one was raised during HTTP#fetch_page
+    attr_reader :error
+    # HTML body
+    attr_reader :body
 
     # OpenStruct for user-stored data
     attr_accessor :data
-    # HTML body
-    attr_accessor :body
-    # Nokogiri document for the HTML body
-    attr_accessor :doc
     # Integer response code of the page
     attr_accessor :code
-    # Array of redirect-aliases for the page
-    attr_accessor :aliases
-    # Boolean indicating whether or not this page has been visited in PageHash#shortest_paths!
+    # Boolean indicating whether or not this page has been visited in PageStore#shortest_paths!
     attr_accessor :visited
     # Depth of this page from the root of the crawl. This is not necessarily the
-    # shortest path; use PageHash#shortest_paths! to find that value.
+    # shortest path; use PageStore#shortest_paths! to find that value.
     attr_accessor :depth
     # URL of the page that brought us to this page
     attr_accessor :referer
@@ -32,18 +32,22 @@ module Anemone
     #
     # Create a new page
     #
-    def initialize(url, body = nil, code = nil, headers = nil, aka = nil, referer = nil, depth = 0, response_time = nil)
+    def initialize(url, params = {})
       @url = url
-      @code = code
-      @headers = headers || {}
-      @headers['content-type'] ||= ['']
-      @aliases = Array(aka)
       @data = OpenStruct.new
-
-      @referer = referer
-      @depth = depth || 0
-      @response_time = response_time
-      @body = body
+
+      @code = params[:code]
+      @headers = params[:headers] || {}
+      @headers['content-type'] ||= ['']
+      @aliases = Array(params[:aka]).compact
+      @referer = params[:referer]
+      @depth = params[:depth] || 0
+      @redirect_to = to_absolute(params[:redirect_to])
+      @response_time = params[:response_time]
+      @body = params[:body]
+      @error = params[:error]
+
+      @fetched = !params[:code].nil?
     end
 
     # Array of distinct A tag HREFs from the page
@@ -62,42 +66,20 @@ module Anemone
       @links
     end
 
-    def discard_doc!
-      links # force parsing of page links before we trash the document
-      @doc = nil
-    end
-
-    #
-    # Return a new page with the same *response* and *url*, but
-    # with a 200 response code
-    #
-    def alias_clone(url)
-      p = clone
-      p.add_alias!(@aka) if !@aka.nil?
-      p.code = 200
-      p
+    # Nokogiri document for the HTML body
+    def doc
+      return @doc if @doc
+      @doc = Nokogiri::HTML(@body) if @body && html? rescue nil
     end
 
-    #
-    # Add a redirect-alias String *aka* to the list of the page's aliases
-    #
-    # Returns *self*
-    #
-    def add_alias!(aka)
-      @aliases << aka if !@aliases.include?(aka)
-      self
+    # Delete the Nokogiri document and response body to conserve memory
+    def discard_doc!
+      links # force parsing of page links before we trash the document
+      @doc = @body = nil
     end
 
-    #
-    # Returns an Array of all links from this page, and all the
-    # redirect-aliases of those pages, as String objects.
-    #
-    # *page_hash* is a PageHash object with the results of the current crawl.
-    #
-    def links_and_their_aliases(page_hash)
-      links.inject([]) do |results, link|
-        results.concat([link].concat(page_hash[link].aliases))
-      end
+    def fetched?
+      @fetched
     end
 
     #
@@ -136,6 +118,8 @@ module Anemone
     # location of the page
     #
     def to_absolute(link)
+      return nil if link.nil?
+
       # remove anchor
       link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))
 
@@ -154,5 +138,14 @@ module Anemone
     def in_domain?(uri)
       uri.host == @url.host
     end
+
+    def marshal_dump
+      [@url, @headers, @data, @body, @links, @code, @visited, @depth, @referer, @redirect_to, @response_time, @fetched]
+    end
+
+    def marshal_load(ary)
+      @url, @headers, @data, @body, @links, @code, @visited, @depth, @referer, @redirect_to, @response_time, @fetched = ary
+    end
+
   end
 end
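
The explicit marshal_dump/marshal_load pair is what makes the new persistent storage engines workable: only plain state goes into the dump (notably not the memoized Nokogiri document, which cannot be marshaled), and doc is re-parsed lazily from the stored body after loading. A sketch of the round trip, assuming *page* is a Page fetched during a crawl:

    data = Marshal.dump(page)    # serializes the 12-element state array above
    copy = Marshal.load(data)
    copy.fetched?                # => true, restored from the dumped @fetched flag
    copy.doc                     # re-built from @body on first access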