shingara-anemone 0.2.4

@@ -0,0 +1,27 @@
+ == 0.2.3 / 2009-11-01
+
+ * Minor enhancements
+
+   * Options are now applied per-crawl, rather than module-wide.
+
+ * Bug fixes
+
+   * Fixed a bug which caused deadlock if an exception occurred when crawling the last page in the queue.
+
+ == 0.2.2 / 2009-10-26
+
+ * Minor enhancements
+
+   * When the :verbose option is set to true, exception backtraces are printed to aid debugging.
+
+ == 0.2.1 / 2009-10-24
+
+ * Major enhancements
+
+   * Added HTTPS support.
+   * CLI program 'anemone', which is a frontend for several tasks.
+
+ * Minor enhancements
+
+   * HTTP request response time recorded in Page.
+   * Use of persistent HTTP connections.
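The 0.2.3 change above means crawl options are passed to each Anemone.crawl call instead of being set module-wide. A minimal sketch, assuming a placeholder start URL; the option keys are taken from DEFAULT_OPTS in lib/anemone/core.rb:

  require 'anemone'

  # every option lives in this crawl's own options hash; nothing is stored on the module
  Anemone.crawl('http://example.com/', :verbose => true, :depth_limit => 2) do |anemone|
    anemone.on_every_page { |page| puts page.url }
  end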
@@ -0,0 +1,19 @@
+ Copyright (c) 2009 Vertive, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
@@ -0,0 +1,24 @@
+ = Anemone
+
+ Anemone is a web spider framework that can spider a domain and collect useful
+ information about the pages it visits. It is versatile, allowing you to
+ write your own specialized spider tasks quickly and easily.
+
+ See http://anemone.rubyforge.org for more information.
+
+ == Features
+ * Multi-threaded design for high performance
+ * Tracks 301 HTTP redirects to understand a page's aliases
+ * Built-in BFS algorithm for determining page depth
+ * Allows exclusion of URLs based on regular expressions
+ * Choose the links to follow on each page with focus_crawl()
+ * HTTPS support
+ * Records response time for each page
+ * CLI program can list all pages in a domain, calculate page depths, and more
+
+ == Examples
+ See the scripts under the <tt>lib/anemone/cli</tt> directory for examples of several useful Anemone tasks.
+
+ == Requirements
+ * nokogiri
+ * robots
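For orientation, a short usage sketch built only from methods that appear in this gem's sources (Anemone.crawl, skip_links_like, focus_crawl, on_every_page); the URL and patterns are placeholders:

  require 'anemone'

  Anemone.crawl('http://example.com/') do |anemone|
    # don't follow links whose paths match these patterns
    anemone.skip_links_like %r{/login}, %r{/logout}

    # only follow the links this block returns (it must return an Array of URIs)
    anemone.focus_crawl { |page| page.links.reject { |uri| uri.path =~ /\.pdf$/ } }

    # runs for every page fetched during the crawl
    anemone.on_every_page { |page| puts page.url }
  end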
@@ -0,0 +1,4 @@
+ #!/usr/bin/env ruby
+ require 'anemone/cli'
+
+ Anemone::CLI::run
@@ -0,0 +1,2 @@
+ require 'rubygems'
+ require 'anemone/core'
@@ -0,0 +1,24 @@
+ module Anemone
+   module CLI
+     COMMANDS = %w[count cron pagedepth serialize url-list]
+
+     def self.run
+       command = ARGV.shift
+
+       if COMMANDS.include? command
+         load "anemone/cli/#{command.tr('-', '_')}.rb"
+       else
+         puts <<-INFO
+ Anemone is a web spider framework that can collect
+ useful information about pages it visits.
+
+ Usage:
+   anemone <command> [arguments]
+
+ Commands:
+   #{COMMANDS.join(', ')}
+ INFO
+       end
+     end
+   end
+ end
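The dispatcher above simply loads anemone/cli/<command>.rb for any name listed in COMMANDS, so adding a task means adding its name to COMMANDS and dropping a matching script into that directory. A hedged sketch of a hypothetical `broken` command (not part of the gem) that prints 404s, reusing only methods shown elsewhere in these sources:

  # lib/anemone/cli/broken.rb -- hypothetical; 'broken' would also need to be added to COMMANDS
  require 'anemone'

  root = URI(ARGV[0])

  Anemone.crawl(root) do |anemone|
    anemone.on_every_page do |page|
      puts page.url if page.not_found?
    end
  end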
@@ -0,0 +1,22 @@
+ require 'anemone'
+
+ begin
+   # make sure that the first option is a URL we can crawl
+   url = URI(ARGV[0])
+ rescue
+   puts <<-INFO
+ Usage:
+   anemone count <url>
+
+ Synopsis:
+   Crawls a site starting at the given URL and outputs the total number
+   of unique pages on the site.
+ INFO
+   exit(0)
+ end
+
+ Anemone.crawl(url) do |anemone|
+   anemone.after_crawl do |pages|
+     puts pages.uniq.size
+   end
+ end
@@ -0,0 +1,90 @@
+ require 'anemone'
+ require 'optparse'
+ require 'ostruct'
+
+ options = OpenStruct.new
+ options.relative = false
+ options.output_file = 'urls.txt'
+
+ begin
+   # make sure that the last argument is a URL we can crawl
+   root = URI(ARGV.last)
+ rescue
+   puts <<-INFO
+ Usage:
+   anemone cron [options] <url>
+
+ Synopsis:
+   Combination of the `count`, `pagedepth` and `url-list` commands.
+   Performs pagedepth, url list, and count functionality.
+   Outputs results to STDOUT and saves the link list to a file (urls.txt).
+   Meant to be run daily as a cron job.
+
+ Options:
+   -r, --relative           Output relative URLs (rather than absolute)
+   -o, --output filename    Filename to save URL list to. Defaults to urls.txt.
+ INFO
+   exit(0)
+ end
+
+ # parse command-line options
+ opts = OptionParser.new
+ opts.on('-r', '--relative') { options.relative = true }
+ opts.on('-o', '--output filename') {|o| options.output_file = o }
+ opts.parse!(ARGV)
+
+ Anemone.crawl(root, {:discard_page_bodies => true}) do |anemone|
+
+   anemone.after_crawl do |pages|
+     puts "Crawl results for #{root}\n"
+
+     # print a list of 404's
+     not_found = []
+     pages.each_value do |page|
+       url = page.url.to_s
+       not_found << url if page.not_found?
+     end
+     unless not_found.empty?
+       puts "\n404's:"
+
+       missing_links = pages.urls_linking_to(not_found)
+       missing_links.each do |url, links|
+         if options.relative
+           puts URI(url).path.to_s
+         else
+           puts url
+         end
+         links.slice(0..10).each do |u|
+           u = u.path if options.relative
+           puts " linked from #{u}"
+         end
+
+         puts " ..." if links.size > 10
+       end
+
+       print "\n"
+     end
+
+     # remove redirect aliases, and calculate pagedepths
+     pages = pages.shortest_paths!(root).uniq
+     depths = pages.values.inject({}) do |depths, page|
+       depths[page.depth] ||= 0
+       depths[page.depth] += 1
+       depths
+     end
+
+     # print the page count
+     puts "Total pages: #{pages.size}\n"
+
+     # print a list of depths
+     depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
+
+     # output a list of urls to file
+     file = open(options.output_file, 'w')
+     pages.each_key do |url|
+       url = options.relative ? url.path.to_s : url.to_s
+       file.puts url
+     end
+   end
+
+ end
@@ -0,0 +1,32 @@
+ require 'anemone'
+
+ begin
+   # make sure that the first option is a URL we can crawl
+   root = URI(ARGV[0])
+ rescue
+   puts <<-INFO
+ Usage:
+   anemone pagedepth <url>
+
+ Synopsis:
+   Crawls a site starting at the given URL and outputs a count of
+   the number of pages at each depth of the crawl.
+ INFO
+   exit(0)
+ end
+
+ Anemone.crawl(root) do |anemone|
+   anemone.skip_links_like %r{^/c/$}, %r{^/stores/$}
+
+   anemone.after_crawl do |pages|
+     pages = pages.shortest_paths!(root).uniq
+
+     depths = pages.values.inject({}) do |depths, page|
+       depths[page.depth] ||= 0
+       depths[page.depth] += 1
+       depths
+     end
+
+     depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
+   end
+ end
@@ -0,0 +1,35 @@
+ require 'anemone'
+ require 'optparse'
+ require 'ostruct'
+
+ begin
+   # make sure that the first option is a URL we can crawl
+   root = URI(ARGV[0])
+ rescue
+   puts <<-INFO
+ Usage:
+   anemone serialize [options] <url>
+
+ Synopsis:
+   Crawls a site starting at the given URL and saves the resulting
+   PageHash object to a file using Marshal serialization.
+
+ Options:
+   -o, --output filename    Filename to save PageHash to. Defaults to crawl.{Time.now}
+ INFO
+   exit(0)
+ end
+
+ options = OpenStruct.new
+ options.output_file = "crawl.#{Time.now.to_i}"
+
+ # parse command-line options
+ opts = OptionParser.new
+ opts.on('-o', '--output filename') {|o| options.output_file = o }
+ opts.parse!(ARGV)
+
+ Anemone.crawl(root) do |anemone|
+   anemone.after_crawl do |pages|
+     open(options.output_file, 'w') {|f| Marshal.dump(pages, f)}
+   end
+ end
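Reading the dump back later is plain Marshal loading; a sketch, assuming the file written by the script above is passed as the first argument (the default filename is crawl.<timestamp>):

  require 'anemone'  # defines the Page/PageHash classes the dump refers to

  pages = File.open(ARGV[0], 'rb') { |f| Marshal.load(f) }
  puts "#{pages.size} pages in #{ARGV[0]}"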
@@ -0,0 +1,41 @@
+ require 'anemone'
+ require 'optparse'
+ require 'ostruct'
+
+ options = OpenStruct.new
+ options.relative = false
+
+ begin
+   # make sure that the last option is a URL we can crawl
+   root = URI(ARGV.last)
+ rescue
+   puts <<-INFO
+ Usage:
+   anemone url-list [options] <url>
+
+ Synopsis:
+   Crawls a site starting at the given URL, and outputs the URL of each page
+   in the domain as it is encountered.
+
+ Options:
+   -r, --relative    Output relative URLs (rather than absolute)
+ INFO
+   exit(0)
+ end
+
+ # parse command-line options
+ opts = OptionParser.new
+ opts.on('-r', '--relative') { options.relative = true }
+ opts.parse!(ARGV)
+
+ Anemone.crawl(root, :discard_page_bodies => true) do |anemone|
+
+   anemone.on_every_page do |page|
+     if options.relative
+       puts page.url.path
+     else
+       puts page.url
+     end
+   end
+
+ end
@@ -0,0 +1,256 @@
+ require 'thread'
+ require 'robots'
+ require 'anemone/tentacle'
+ require 'anemone/page'
+ require 'anemone/page_hash'
+
+ module Anemone
+
+   VERSION = '0.2.3'
+
+   #
+   # Convenience method to start a crawl
+   #
+   def Anemone.crawl(urls, options = {}, &block)
+     Core.crawl(urls, options, &block)
+   end
+
+   class Core
+     # PageHash storing all Page objects encountered during the crawl
+     attr_reader :pages
+
+     # Hash of options for the crawl
+     attr_accessor :opts
+
+     DEFAULT_OPTS = {
+       # run 4 Tentacle threads to fetch pages
+       :threads => 4,
+       # disable verbose output
+       :verbose => false,
+       # don't throw away the page response body after scanning it for links
+       :discard_page_bodies => false,
+       # identify self as Anemone/VERSION
+       :user_agent => "Anemone/#{Anemone::VERSION}",
+       # no delay between requests
+       :delay => 0,
+       # don't obey the robots exclusion protocol
+       :obey_robots_txt => false,
+       # by default, don't limit the depth of the crawl
+       :depth_limit => false,
+       # number of times HTTP redirects will be followed
+       :redirect_limit => 5
+     }
+
+     #
+     # Initialize the crawl with starting *urls* (single URL or Array of URLs)
+     # and optional *block*
+     #
+     def initialize(urls, opts = {})
+       @urls = [urls].flatten.map{ |url| url.is_a?(URI) ? url : URI(url) }
+       @urls.each{ |url| url.path = '/' if url.path.empty? }
+
+       @tentacles = []
+       @pages = PageHash.new
+       @on_every_page_blocks = []
+       @on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
+       @skip_link_patterns = []
+       @after_crawl_blocks = []
+
+       process_options opts
+
+       yield self if block_given?
+     end
+
+     #
+     # Convenience method to start a new crawl
+     #
+     def self.crawl(urls, opts = {})
+       self.new(urls, opts) do |core|
+         yield core if block_given?
+         core.run
+       end
+     end
+
+     #
+     # Add a block to be executed on the PageHash after the crawl
+     # is finished
+     #
+     def after_crawl(&block)
+       @after_crawl_blocks << block
+       self
+     end
+
+     #
+     # Add one or more Regexp patterns for URLs which should not be
+     # followed
+     #
+     def skip_links_like(*patterns)
+       @skip_link_patterns.concat [patterns].flatten.compact
+       self
+     end
+
+     #
+     # Add a block to be executed on every Page as it is encountered
+     # during the crawl
+     #
+     def on_every_page(&block)
+       @on_every_page_blocks << block
+       self
+     end
+
+     #
+     # Add a block to be executed on Page objects with a URL matching
+     # one or more patterns
+     #
+     def on_pages_like(*patterns, &block)
+       if patterns
+         patterns.each do |pattern|
+           @on_pages_like_blocks[pattern] << block
+         end
+       end
+       self
+     end
+
+     #
+     # Specify a block which will select which links to follow on each page.
+     # The block should return an Array of URI objects.
+     #
+     def focus_crawl(&block)
+       @focus_crawl_block = block
+       self
+     end
+
+     #
+     # Perform the crawl
+     #
+     def run
+       @urls.delete_if { |url| !visit_link?(url) }
+       return if @urls.empty?
+
+       link_queue = Queue.new
+       page_queue = Queue.new
+
+       @opts[:threads].times do
+         @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
+       end
+
+       @urls.each{ |url| link_queue.enq(url) }
+
+       loop do
+         page = page_queue.deq
+
+         @pages[page.url] = page
+
+         puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
+
+         # perform the on_every_page and on_pages_like blocks for this page
+         do_page_blocks(page)
+
+         page.discard_doc! if @opts[:discard_page_bodies]
+
+         links_to_follow(page).each do |link|
+           link_queue.enq([link, page])
+           @pages[link] = nil
+         end
+
+         # create an entry in the page hash for each alias of this page,
+         # i.e. all the pages that redirected to this page
+         page.aliases.each do |aka|
+           if !@pages.has_key?(aka) or @pages[aka].nil?
+             @pages[aka] = page.alias_clone(aka)
+           end
+           @pages[aka].add_alias!(page.url)
+         end
+
+         # if we are done with the crawl, tell the threads to end
+         if link_queue.empty? and page_queue.empty?
+           until link_queue.num_waiting == @tentacles.size
+             Thread.pass
+           end
+
+           if page_queue.empty?
+             @tentacles.size.times { link_queue.enq(:END) }
+             break
+           end
+         end
+
+       end
+
+       @tentacles.each { |t| t.join }
+
+       do_after_crawl_blocks()
+
+       self
+     end
+
+     private
+
+     def process_options(options)
+       @opts = DEFAULT_OPTS.merge options
+
+       @opts[:threads] = 1 if @opts[:delay] > 0
+
+       @robots = Robots.new(@opts[:user_agent]) if @opts[:obey_robots_txt]
+     end
+
+     #
+     # Execute the after_crawl blocks
+     #
+     def do_after_crawl_blocks
+       @after_crawl_blocks.each {|b| b.call(@pages)}
+     end
+
+     #
+     # Execute the on_every_page and on_pages_like blocks for *page*
+     #
+     def do_page_blocks(page)
+       @on_every_page_blocks.each do |blk|
+         blk.call(page)
+       end
+
+       @on_pages_like_blocks.each do |pattern, blks|
+         if page.url.to_s =~ pattern
+           blks.each { |blk| blk.call(page) }
+         end
+       end
+     end
+
+     #
+     # Return an Array of links to follow from the given page,
+     # based on whether or not the link has already been crawled
+     # and the block given to focus_crawl()
+     #
+     def links_to_follow(page)
+       links = @focus_crawl_block ? @focus_crawl_block.call(page) : page.links
+       links.select { |link| visit_link?(link, page) }
+     end
+
+     #
+     # Returns +true+ if *link* has not been visited already,
+     # and is not excluded by a skip_link pattern...
+     # and is not excluded by robots.txt...
+     # and is not deeper than the depth limit
+     # Returns +false+ otherwise.
+     #
+     def visit_link?(link, from_page = nil)
+       allowed = @opts[:obey_robots_txt] ? @robots.allowed?(link) : true
+
+       if from_page && @opts[:depth_limit]
+         too_deep = from_page.depth >= @opts[:depth_limit]
+       else
+         too_deep = false
+       end
+
+       !@pages.has_page?(link) && !skip_link?(link) && allowed && !too_deep
+     end
+
+     #
+     # Returns +true+ if *link* should not be visited because
+     # its URL matches a skip_link pattern.
+     #
+     def skip_link?(link)
+       @skip_link_patterns.any? { |p| link.path =~ p }
+     end
+
+   end
+ end
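Tying the Core API together, a hedged end-to-end sketch; the URL and pattern are placeholders, and every method and option key used here appears in the class above:

  require 'anemone'

  Anemone.crawl('http://example.com/', :obey_robots_txt => true, :delay => 1) do |anemone|
    # run a block only for pages whose URL matches a pattern
    anemone.on_pages_like(%r{/articles/}) do |page|
      puts "article: #{page.url}"
    end

    # runs on the full PageHash once the crawl finishes
    anemone.after_crawl do |pages|
      puts "crawled #{pages.size} pages"
    end
  end

Note that setting :delay greater than zero forces the crawl down to a single Tentacle thread, per process_options above.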