sutch-anemone 0.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,41 @@
+ require 'anemone'
+ require 'optparse'
+ require 'ostruct'
+
+ options = OpenStruct.new
+ options.relative = false
+
+ begin
+   # make sure that the last option is a URL we can crawl
+   root = URI(ARGV.last)
+ rescue
+   puts <<-INFO
+ Usage:
+   anemone url-list [options] <url>
+
+ Synopsis:
+   Crawls a site starting at the given URL, and outputs the URL of each page
+   in the domain as they are encountered.
+
+ Options:
+   -r, --relative    Output relative URLs (rather than absolute)
+ INFO
+   exit(0)
+ end
+
+ # parse command-line options
+ opts = OptionParser.new
+ opts.on('-r', '--relative') { options.relative = true }
+ opts.parse!(ARGV)
+
+ Anemone.crawl(root, :discard_page_bodies => true) do |anemone|
+
+   anemone.on_every_page do |page|
+     if options.relative
+       puts page.url.path
+     else
+       puts page.url
+     end
+   end
+
+ end
@@ -0,0 +1,35 @@
+ require 'delegate'
+ require 'webrick/cookie'
+
+ class WEBrick::Cookie
+   def expired?
+     !!expires && expires < Time.now
+   end
+ end
+
+ module Anemone
+   class CookieStore < DelegateClass(Hash)
+
+     def initialize(cookies = nil)
+       @cookies = {}
+       cookies.each { |name, value| @cookies[name] = WEBrick::Cookie.new(name, value) } if cookies
+       super(@cookies)
+     end
+
+     def merge!(set_cookie_str)
+       begin
+         cookie_hash = WEBrick::Cookie.parse_set_cookies(set_cookie_str).inject({}) do |hash, cookie|
+           hash[cookie.name] = cookie if !!cookie
+           hash
+         end
+         @cookies.merge! cookie_hash
+       rescue
+       end
+     end
+
+     def to_s
+       @cookies.values.reject { |cookie| cookie.expired? }.map { |cookie| "#{cookie.name}=#{cookie.value}" }.join(';')
+     end
+
+   end
+ end
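
Anemone::CookieStore above is a thin DelegateClass(Hash) wrapper around WEBrick::Cookie objects keyed by name: #merge! folds a Set-Cookie response header into the store (silently ignoring parse errors), and #to_s renders a Cookie request header while dropping expired cookies. The sketch below is illustrative only and is not part of the package; the cookie names and values are invented.

    require 'anemone/cookie_store'

    # Seed the store from a name => value Hash, as Anemone::HTTP does with
    # the :cookies crawl option.
    store = Anemone::CookieStore.new('session' => 'abc123')

    # Fold a Set-Cookie response header into the store; the already-expired
    # cookie stays in the hash but is filtered out by #to_s.
    store.merge!('tracking=xyz; path=/; expires=Thu, 01 Jan 1970 00:00:00 GMT')

    puts store.to_s   # => "session=abc123"
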
@@ -0,0 +1,339 @@
+ require 'thread'
+ require 'robotex'
+ require 'anemone/tentacle'
+ require 'anemone/page'
+ require 'anemone/resource'
+ require 'anemone/exceptions'
+ require 'anemone/page_store'
+ require 'anemone/storage'
+ require 'anemone/storage/base'
+
+ module Anemone
+
+   VERSION = '0.7.2'
+
+   #
+   # Convenience method to start a crawl
+   #
+   def Anemone.crawl(urls, options = {}, &block)
+     Core.crawl(urls, options, &block)
+   end
+
+   class Core
+
+     # PageStore storing all Page objects encountered during the crawl
+     attr_reader :pages
+     # Hash of options for the crawl
+     attr_reader :opts
+
+     DEFAULT_OPTS = {
+       # run 4 Tentacle threads to fetch pages
+       :threads => 4,
+       # disable verbose output
+       :verbose => false,
+       # don't throw away the page response body after scanning it for links
+       :discard_page_bodies => false,
+       # identify self as Anemone/VERSION
+       :user_agent => "Anemone/#{Anemone::VERSION}",
+       # no delay between requests
+       :delay => 0,
+       # don't obey the robots exclusion protocol
+       :obey_robots_txt => false,
+       # by default, don't limit the depth of the crawl
+       :depth_limit => false,
+       # number of times HTTP redirects will be followed
+       :redirect_limit => 5,
+       # limit the size of the page queue to keep memory usage low
+       :page_queue_size_limit => nil,
+       # limit the size of the link queue to keep memory usage low
+       :link_queue_size_limit => nil,
+       # storage engine defaults to Hash in +process_options+ if none specified
+       :storage => nil,
+       # Hash of cookie name => value to send with HTTP requests
+       :cookies => nil,
+       # accept cookies from the server and send them back?
+       :accept_cookies => false,
+       # skip any link with a query string? e.g. http://foo.com/?u=user
+       :skip_query_strings => false,
+       # proxy server hostname
+       :proxy_host => nil,
+       # proxy server port number
+       :proxy_port => false,
+       # HTTP read timeout in seconds
+       :read_timeout => nil,
+       # parse pages using Page class
+       :page_class => Anemone::Page,
+     }
+
+     # Create setter methods for all options to be called from the crawl block
+     DEFAULT_OPTS.keys.each do |key|
+       define_method "#{key}=" do |value|
+         @opts[key.to_sym] = value
+       end
+     end
+
+     #
+     # Initialize the crawl with starting *urls* (single URL or Array of URLs)
+     # and optional *block*
+     #
+     def initialize(urls, opts = {})
+       @urls = [urls].flatten.map { |url| url.is_a?(URI) ? url : URI(url) }
+       @urls.each { |url| url.path = '/' if url.path.empty? }
+
+       @tentacles = []
+       @on_every_page_blocks = []
+       @on_pages_like_blocks = Hash.new { |hash, key| hash[key] = [] }
+       @skip_link_patterns = []
+       @after_crawl_blocks = []
+       @opts = opts
+       @stop_crawl = false
+
+       yield self if block_given?
+     end
+
+     #
+     # Convenience method to start a new crawl
+     #
+     def self.crawl(urls, opts = {})
+       self.new(urls, opts) do |core|
+         yield core if block_given?
+         core.run
+       end
+     end
+
+     #
+     # Add a block to be executed on the PageStore after the crawl
+     # is finished
+     #
+     def after_crawl(&block)
+       @after_crawl_blocks << block
+       self
+     end
+
+     #
+     # Add one or more Regexp patterns for URLs which should not be
+     # followed
+     #
+     def skip_links_like(*patterns)
+       @skip_link_patterns.concat [patterns].flatten.compact
+       self
+     end
+
+     #
+     # Add a block to be executed on every Page as they are encountered
+     # during the crawl
+     #
+     def on_every_page(&block)
+       @on_every_page_blocks << block
+       self
+     end
+
+     #
+     # Add a block to be executed on Page objects with a URL matching
+     # one or more patterns
+     #
+     def on_pages_like(*patterns, &block)
+       if patterns
+         patterns.each do |pattern|
+           @on_pages_like_blocks[pattern] << block
+         end
+       end
+       self
+     end
+
+     #
+     # Specify a block which will select which links to follow on each page.
+     # The block should return an Array of URI objects.
+     #
+     def focus_crawl(&block)
+       @focus_crawl_block = block
+       self
+     end
+
+     #
+     # Signals the crawler that it should stop the crawl before visiting the
+     # next page.
+     #
+     # This method is expected to be called from within a page block; it signals
+     # the crawler that it must stop once the current page has been completely
+     # processed. All pages and links currently in the queue are discarded.
+     #
+     def stop_crawl
+       @stop_crawl = true
+     end
+
+     #
+     # Perform the crawl
+     #
+     def run
+       process_options
+
+       @urls.delete_if { |url| !visit_link?(url) }
+       return if @urls.empty?
+
+       link_queue = build_queue(@opts[:link_queue_size_limit])
+       page_queue = build_queue(@opts[:page_queue_size_limit])
+
+       @opts[:threads].times do
+         @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
+       end
+
+       @urls.each { |url| link_queue.enq(url) }
+
+       loop do
+         page = page_queue.deq
+         @pages.touch_key page.url
+         puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
+         do_page_blocks page
+         page.discard_doc! if @opts[:discard_page_bodies]
+
+         links = links_to_follow page
+         links.each do |link|
+           link_queue << [link, page.url.dup, page.depth + 1]
+         end
+         @pages.touch_keys links
+
+         @pages[page.url] = page
+
+         if @stop_crawl
+           page_queue.clear
+           link_queue.clear
+         end
+
+         # if we are done with the crawl, tell the threads to end
+         if link_queue.empty? and page_queue.empty?
+           until link_queue.num_waiting == @tentacles.size
+             Thread.pass
+           end
+           if page_queue.empty? || @stop_crawl
+             @tentacles.size.times { link_queue << :END }
+             break
+           end
+         end
+       end
+
+       @tentacles.each { |thread| thread.join }
+       do_after_crawl_blocks
+       self
+     end
+
+     private
+
+     def process_options
+       @opts = DEFAULT_OPTS.merge @opts
+       @opts[:threads] = 1 if @opts[:delay] > 0
+       storage = Anemone::Storage::Base.new(@opts[:storage] || Anemone::Storage.Hash)
+       @pages = PageStore.new(storage, @opts)
+       @robots = Robotex.new(@opts[:user_agent]) if @opts[:obey_robots_txt]
+
+       freeze_options
+     end
+
+     #
+     # Freeze the opts Hash so that no options can be modified
+     # once the crawl begins
+     #
+     def freeze_options
+       @opts.freeze
+       @opts.each_key { |key| @opts[key].freeze }
+       @opts[:cookies].each_key { |key| @opts[:cookies][key].freeze } rescue nil
+     end
+
+     #
+     # Execute the after_crawl blocks
+     #
+     def do_after_crawl_blocks
+       @after_crawl_blocks.each { |block| block.call(@pages) }
+     end
+
+     #
+     # Execute the on_every_page blocks for *page*
+     #
+     def do_page_blocks(page)
+       @on_every_page_blocks.each do |block|
+         block.call(page)
+       end
+
+       @on_pages_like_blocks.each do |pattern, blocks|
+         blocks.each { |block| block.call(page) } if page.url.to_s =~ pattern
+       end
+     end
+
+     #
+     # Return an Array of links to follow from the given page.
+     # Based on whether or not the link has already been crawled,
+     # and the block given to focus_crawl()
+     #
+     def links_to_follow(page)
+       links = @focus_crawl_block ? @focus_crawl_block.call(page) : page.links
+       links.select { |link| visit_link?(link, page) }.map { |link| link.dup }
+     end
+
+     #
+     # Returns +true+ if *link* has not been visited already,
+     # and is not excluded by a skip_link pattern...
+     # and is not excluded by robots.txt...
+     # and is not deeper than the depth limit
+     # Returns +false+ otherwise.
+     #
+     def visit_link?(link, from_page = nil)
+       !@pages.has_page?(link) &&
+       !skip_link?(link) &&
+       !skip_query_string?(link) &&
+       allowed(link) &&
+       !too_deep?(from_page)
+     end
+
+     #
+     # Returns +true+ if we are obeying robots.txt and the link
+     # is granted access in it. Always returns +true+ when we are
+     # not obeying robots.txt.
+     #
+     def allowed(link)
+       @opts[:obey_robots_txt] ? @robots.allowed?(link) : true
+     rescue
+       false
+     end
+
+     #
+     # Returns +true+ if we are over the page depth limit.
+     # This only applies when the link comes from a page and the +depth_limit+
+     # option is set; otherwise it always returns +false+.
+     def too_deep?(from_page)
+       if from_page && @opts[:depth_limit]
+         from_page.depth >= @opts[:depth_limit]
+       else
+         false
+       end
+     end
+
+     #
+     # Returns +true+ if *link* should not be visited because
+     # it has a query string and +skip_query_strings+ is true.
+     #
+     def skip_query_string?(link)
+       @opts[:skip_query_strings] && link.query
+     end
+
+     #
+     # Returns +true+ if *link* should not be visited because
+     # its URL matches a skip_link pattern.
+     #
+     def skip_link?(link)
+       @skip_link_patterns.any? { |pattern| link.path =~ pattern }
+     end
+
+     #
+     # Creates a new queue constrained to the given maximum size,
+     # or unconstrained if +size+ is not a positive integer.
+     #
+     def build_queue(size = nil)
+       if size.is_a?(Integer) && size > 0
+         SizedQueue.new(size)
+       else
+         Queue.new
+       end
+     end
+
+   end
+ end
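
Anemone::Core above is the crawler's entry point: Anemone.crawl builds a Core, yields it so options and page callbacks can be registered, then runs the crawl on a pool of Tentacle threads. A minimal usage sketch based only on the methods defined above (the start URL and patterns are placeholders, not part of the package):

    require 'anemone'

    Anemone.crawl('http://example.com/', :depth_limit => 2, :obey_robots_txt => true) do |anemone|
      # option setters generated from DEFAULT_OPTS also work inside the block
      anemone.threads = 2

      # never follow login or logout links
      anemone.skip_links_like(/\/(login|logout)/)

      anemone.on_every_page do |page|
        puts page.url
      end

      anemone.after_crawl do |pages|
        # the PageStore of crawled pages is yielded here
        puts 'crawl finished'
      end
    end
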
@@ -0,0 +1,5 @@
+ module Anemone
+   class Error < ::StandardError
+     attr_accessor :wrapped_exception
+   end
+ end
@@ -0,0 +1,187 @@
+ require 'net/https'
+ require 'anemone/page'
+ require 'anemone/cookie_store'
+
+ module Anemone
+   class HTTP
+     # Maximum number of redirects to follow on each get_response
+     REDIRECT_LIMIT = 5
+
+     # CookieStore for this HTTP client
+     attr_reader :cookie_store
+
+     def initialize(opts = {})
+       @connections = {}
+       @opts = opts
+       @cookie_store = CookieStore.new(@opts[:cookies])
+     end
+
+     #
+     # Fetch a single Page from the response of an HTTP request to *url*.
+     # Just gets the final destination page.
+     #
+     def fetch_page(url, referer = nil, depth = nil)
+       fetch_pages(url, referer, depth).last
+     end
+
+     #
+     # Create new Pages from the response of an HTTP request to *url*,
+     # including redirects
+     #
+     def fetch_pages(url, referer = nil, depth = nil)
+       begin
+         url = URI(url) unless url.is_a?(URI)
+         pages = []
+         get(url, referer) do |response, code, location, redirect_to, response_time|
+           pages << @opts[:page_class].new(location, :body => response.body.dup,
+                                                     :code => code,
+                                                     :headers => response.to_hash,
+                                                     :referer => referer,
+                                                     :depth => depth,
+                                                     :redirect_to => redirect_to,
+                                                     :response_time => response_time)
+         end
+
+         return pages
+       rescue Exception => e
+         if verbose?
+           puts e.inspect
+           puts e.backtrace
+         end
+         return [@opts[:page_class].new(url, :error => e)]
+       end
+     end
+
+     #
+     # The maximum number of redirects to follow
+     #
+     def redirect_limit
+       @opts[:redirect_limit] || REDIRECT_LIMIT
+     end
+
+     #
+     # The user-agent string which will be sent with each request,
+     # or nil if no such option is set
+     #
+     def user_agent
+       @opts[:user_agent]
+     end
+
+     #
+     # Does this HTTP client accept cookies from the server?
+     #
+     def accept_cookies?
+       @opts[:accept_cookies]
+     end
+
+     #
+     # The proxy address string
+     #
+     def proxy_host
+       @opts[:proxy_host]
+     end
+
+     #
+     # The proxy port
+     #
+     def proxy_port
+       @opts[:proxy_port]
+     end
+
+     #
+     # HTTP read timeout in seconds
+     #
+     def read_timeout
+       @opts[:read_timeout]
+     end
+
+     private
+
+     #
+     # Retrieve HTTP responses for *url*, including redirects.
+     # Yields the response object, response code, and URI location
+     # for each response.
+     #
+     def get(url, referer = nil)
+       limit = redirect_limit
+       loc = url
+       begin
+         # if redirected to a relative url, merge it with the host of the original
+         # request url
+         loc = url.merge(loc) if loc.relative?
+
+         response, response_time = get_response(loc, referer)
+         code = Integer(response.code)
+         redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']).normalize : nil
+         yield response, code, loc, redirect_to, response_time
+         limit -= 1
+       end while (loc = redirect_to) && allowed?(redirect_to, url) && limit > 0
+     end
+
+     #
+     # Get an HTTPResponse for *url*, sending the appropriate User-Agent string
+     #
+     def get_response(url, referer = nil)
+       full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"
+
+       opts = {}
+       opts['User-Agent'] = user_agent if user_agent
+       opts['Referer'] = referer.to_s if referer
+       opts['Cookie'] = @cookie_store.to_s unless @cookie_store.empty? || (!accept_cookies? && @opts[:cookies].nil?)
+
+       retries = 0
+       begin
+         start = Time.now()
+         # format request
+         req = Net::HTTP::Get.new(full_path, opts)
+         # HTTP Basic authentication
+         req.basic_auth url.user, url.password if url.user
+         response = connection(url).request(req)
+         finish = Time.now()
+         response_time = ((finish - start) * 1000).round
+         @cookie_store.merge!(response['Set-Cookie']) if accept_cookies?
+         return response, response_time
+       rescue Timeout::Error, Net::HTTPBadResponse, EOFError => e
+         puts e.inspect if verbose?
+         refresh_connection(url)
+         retries += 1
+         retry unless retries > 3
+       end
+     end
+
+     def connection(url)
+       @connections[url.host] ||= {}
+
+       if conn = @connections[url.host][url.port]
+         return conn
+       end
+
+       refresh_connection url
+     end
+
+     def refresh_connection(url)
+       http = Net::HTTP.new(url.host, url.port, proxy_host, proxy_port)
+
+       http.read_timeout = read_timeout if !!read_timeout
+
+       if url.scheme == 'https'
+         http.use_ssl = true
+         http.verify_mode = OpenSSL::SSL::VERIFY_NONE
+       end
+
+       @connections[url.host][url.port] = http.start
+     end
+
+     def verbose?
+       @opts[:verbose]
+     end
+
+     #
+     # Allowed to connect to the requested url?
+     #
+     def allowed?(to_url, from_url)
+       to_url.host.nil? || (to_url.host == from_url.host)
+     end
+
+   end
+ end
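
Anemone::HTTP above keeps one persistent Net::HTTP connection per host and port, follows redirects up to redirect_limit, and wraps every response in the configured :page_class. It is normally driven by the Tentacle threads rather than used directly; the sketch below is illustrative only (not part of the package) and passes :page_class explicitly because the client reads it straight from its options hash.

    require 'anemone/http'
    require 'anemone/page'

    http = Anemone::HTTP.new(:user_agent   => 'Anemone/0.7.2',
                             :read_timeout => 10,
                             :page_class   => Anemone::Page)

    page = http.fetch_page(URI('http://example.com/'))
    puts page.url
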