shingara-anemone 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,27 @@
+ == 0.2.3 / 2009-11-01
+
+ * Minor enhancements
+
+   * Options are now applied per-crawl, rather than module-wide.
+
+ * Bug fixes
+
+   * Fixed a bug which caused deadlock if an exception occurred when crawling the last page in the queue.
+
+ == 0.2.2 / 2009-10-26
+
+ * Minor enhancements
+
+   * When the :verbose option is set to true, exception backtraces are printed to aid debugging.
+
+ == 0.2.1 / 2009-10-24
+
+ * Major enhancements
+
+   * Added HTTPS support.
+   * CLI program 'anemone', which is a frontend for several tasks.
+
+ * Minor enhancements
+
+   * HTTP request response time recorded in Page.
+   * Use of persistent HTTP connections.
@@ -0,0 +1,19 @@
+ Copyright (c) 2009 Vertive, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
@@ -0,0 +1,24 @@
+ = Anemone
+
+ Anemone is a web spider framework that can spider a domain and collect useful
+ information about the pages it visits. It is versatile, allowing you to
+ write your own specialized spider tasks quickly and easily.
+
+ See http://anemone.rubyforge.org for more information.
+
+ == Features
+ * Multi-threaded design for high performance
+ * Tracks 301 HTTP redirects to understand a page's aliases
+ * Built-in BFS algorithm for determining page depth
+ * Allows exclusion of URLs based on regular expressions
+ * Choose the links to follow on each page with focus_crawl()
+ * HTTPS support
+ * Records response time for each page
+ * CLI program can list all pages in a domain, calculate page depths, and more
+
+ == Examples
+ See the scripts under the <tt>lib/anemone/cli</tt> directory for examples of several useful Anemone tasks.
+
+ == Requirements
+ * nokogiri
+ * robots
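
As a quick orientation to the API described in the README above, a minimal crawl might look like the sketch below. It uses only methods that appear later in this diff (Anemone.crawl, skip_links_like, on_every_page, after_crawl); the URL and the skip pattern are placeholders, not part of the package.

  require 'anemone'

  Anemone.crawl("http://www.example.com/") do |anemone|
    # don't follow links whose paths match this pattern (placeholder regex)
    anemone.skip_links_like %r{^/private/}

    # print each page's URL as it is fetched
    anemone.on_every_page do |page|
      puts page.url
    end

    # once the crawl finishes, report the number of unique pages found
    anemone.after_crawl do |pages|
      puts "#{pages.uniq.size} unique pages"
    end
  end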
@@ -0,0 +1,4 @@
+ #!/usr/bin/env ruby
+ require 'anemone/cli'
+
+ Anemone::CLI::run
@@ -0,0 +1,2 @@
+ require 'rubygems'
+ require 'anemone/core'
@@ -0,0 +1,24 @@
+ module Anemone
+   module CLI
+     COMMANDS = %w[count cron pagedepth serialize url-list]
+
+     def self.run
+       command = ARGV.shift
+
+       if COMMANDS.include? command
+         load "anemone/cli/#{command.tr('-', '_')}.rb"
+       else
+         puts <<-INFO
+ Anemone is a web spider framework that can collect
+ useful information about pages it visits.
+
+ Usage:
+   anemone <command> [arguments]
+
+ Commands:
+   #{COMMANDS.join(', ')}
+ INFO
+       end
+     end
+   end
+ end
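
For reference, the dispatcher above can also be driven from Ruby rather than from a shell. The sketch below mimics `anemone count <url>` by filling ARGV before calling Anemone::CLI::run; the URL is a placeholder, and running this starts a real crawl.

  require 'anemone/cli'

  # Equivalent to running `anemone count http://www.example.com/` from a shell.
  # CLI::run shifts the command name off ARGV and loads anemone/cli/count.rb,
  # which reads the remaining argument as the start URL.
  ARGV.replace(%w[count http://www.example.com/])
  Anemone::CLI::run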
@@ -0,0 +1,22 @@
+ require 'anemone'
+
+ begin
+   # make sure that the first option is a URL we can crawl
+   url = URI(ARGV[0])
+ rescue
+   puts <<-INFO
+ Usage:
+   anemone count <url>
+
+ Synopsis:
+   Crawls a site starting at the given URL and outputs the total number
+   of unique pages on the site.
+ INFO
+   exit(0)
+ end
+
+ Anemone.crawl(url) do |anemone|
+   anemone.after_crawl do |pages|
+     puts pages.uniq.size
+   end
+ end
@@ -0,0 +1,90 @@
+ require 'anemone'
+ require 'optparse'
+ require 'ostruct'
+
+ options = OpenStruct.new
+ options.relative = false
+ options.output_file = 'urls.txt'
+
+ begin
+   # make sure that the last argument is a URL we can crawl
+   root = URI(ARGV.last)
+ rescue
+   puts <<-INFO
+ Usage:
+   anemone cron [options] <url>
+
+ Synopsis:
+   Combination of `count`, `pagedepth` and `url-list` commands.
+   Performs pagedepth, url list, and count functionality.
+   Outputs results to STDOUT and link list to file (urls.txt).
+   Meant to be run daily as a cron job.
+
+ Options:
+   -r, --relative         Output relative URLs (rather than absolute)
+   -o, --output filename  Filename to save URL list to. Defaults to urls.txt.
+ INFO
+   exit(0)
+ end
+
+ # parse command-line options
+ opts = OptionParser.new
+ opts.on('-r', '--relative') { options.relative = true }
+ opts.on('-o', '--output filename') {|o| options.output_file = o }
+ opts.parse!(ARGV)
+
+ Anemone.crawl(root, {:discard_page_bodies => true}) do |anemone|
+
+   anemone.after_crawl do |pages|
+     puts "Crawl results for #{root}\n"
+
+     # print a list of 404's
+     not_found = []
+     pages.each_value do |page|
+       url = page.url.to_s
+       not_found << url if page.not_found?
+     end
+     unless not_found.empty?
+       puts "\n404's:"
+
+       missing_links = pages.urls_linking_to(not_found)
+       missing_links.each do |url, links|
+         if options.relative
+           puts URI(url).path.to_s
+         else
+           puts url
+         end
+         links.slice(0..10).each do |u|
+           u = u.path if options.relative
+           puts " linked from #{u}"
+         end
+
+         puts " ..." if links.size > 10
+       end
+
+       print "\n"
+     end
+
+     # remove redirect aliases, and calculate pagedepths
+     pages = pages.shortest_paths!(root).uniq
+     depths = pages.values.inject({}) do |depths, page|
+       depths[page.depth] ||= 0
+       depths[page.depth] += 1
+       depths
+     end
+
+     # print the page count
+     puts "Total pages: #{pages.size}\n"
+
+     # print a list of depths
+     depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
+
+     # output a list of urls to file
+     file = open(options.output_file, 'w')
+     pages.each_key do |url|
+       url = options.relative ? url.path.to_s : url.to_s
+       file.puts url
+     end
+   end
+
+ end
@@ -0,0 +1,32 @@
+ require 'anemone'
+
+ begin
+   # make sure that the first option is a URL we can crawl
+   root = URI(ARGV[0])
+ rescue
+   puts <<-INFO
+ Usage:
+   anemone pagedepth <url>
+
+ Synopsis:
+   Crawls a site starting at the given URL and outputs a count of
+   the number of pages at each depth of the crawl.
+ INFO
+   exit(0)
+ end
+
+ Anemone.crawl(root) do |anemone|
+   anemone.skip_links_like %r{^/c/$}, %r{^/stores/$}
+
+   anemone.after_crawl do |pages|
+     pages = pages.shortest_paths!(root).uniq
+
+     depths = pages.values.inject({}) do |depths, page|
+       depths[page.depth] ||= 0
+       depths[page.depth] += 1
+       depths
+     end
+
+     depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
+   end
+ end
@@ -0,0 +1,35 @@
+ require 'anemone'
+ require 'optparse'
+ require 'ostruct'
+
+ begin
+   # make sure that the first option is a URL we can crawl
+   root = URI(ARGV[0])
+ rescue
+   puts <<-INFO
+ Usage:
+   anemone serialize [options] <url>
+
+ Synopsis:
+   Crawls a site starting at the given URL and saves the resulting
+   PageHash object to a file using Marshal serialization.
+
+ Options:
+   -o, --output filename  Filename to save PageHash to. Defaults to crawl.{Time.now}
+ INFO
+   exit(0)
+ end
+
+ options = OpenStruct.new
+ options.output_file = "crawl.#{Time.now.to_i}"
+
+ # parse command-line options
+ opts = OptionParser.new
+ opts.on('-o', '--output filename') {|o| options.output_file = o }
+ opts.parse!(ARGV)
+
+ Anemone.crawl(root) do |anemone|
+   anemone.after_crawl do |pages|
+     open(options.output_file, 'w') {|f| Marshal.dump(pages, f)}
+   end
+ end
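
To make use of a dump produced by the script above, the PageHash can be read back with Marshal.load. A minimal sketch follows; the file name is only an example of the default `crawl.<timestamp>` pattern.

  # Reload a PageHash written by `anemone serialize` (file name is illustrative).
  pages = File.open('crawl.1257033600', 'rb') { |f| Marshal.load(f) }
  puts pages.size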
@@ -0,0 +1,41 @@
+ require 'anemone'
+ require 'optparse'
+ require 'ostruct'
+
+ options = OpenStruct.new
+ options.relative = false
+
+ begin
+   # make sure that the last option is a URL we can crawl
+   root = URI(ARGV.last)
+ rescue
+   puts <<-INFO
+ Usage:
+   anemone url-list [options] <url>
+
+ Synopsis:
+   Crawls a site starting at the given URL, and outputs the URL of each page
+   in the domain as they are encountered.
+
+ Options:
+   -r, --relative         Output relative URLs (rather than absolute)
+ INFO
+   exit(0)
+ end
+
+ # parse command-line options
+ opts = OptionParser.new
+ opts.on('-r', '--relative') { options.relative = true }
+ opts.parse!(ARGV)
+
+ Anemone.crawl(root, :discard_page_bodies => true) do |anemone|
+
+   anemone.on_every_page do |page|
+     if options.relative
+       puts page.url.path
+     else
+       puts page.url
+     end
+   end
+
+ end
@@ -0,0 +1,256 @@
+ require 'thread'
+ require 'robots'
+ require 'anemone/tentacle'
+ require 'anemone/page'
+ require 'anemone/page_hash'
+
+ module Anemone
+
+   VERSION = '0.2.3';
+
+   #
+   # Convenience method to start a crawl
+   #
+   def Anemone.crawl(urls, options = {}, &block)
+     Core.crawl(urls, options, &block)
+   end
+
+   class Core
+     # PageHash storing all Page objects encountered during the crawl
+     attr_reader :pages
+
+     # Hash of options for the crawl
+     attr_accessor :opts
+
+     DEFAULT_OPTS = {
+       # run 4 Tentacle threads to fetch pages
+       :threads => 4,
+       # disable verbose output
+       :verbose => false,
+       # don't throw away the page response body after scanning it for links
+       :discard_page_bodies => false,
+       # identify self as Anemone/VERSION
+       :user_agent => "Anemone/#{Anemone::VERSION}",
+       # no delay between requests
+       :delay => 0,
+       # don't obey the robots exclusion protocol
+       :obey_robots_txt => false,
+       # by default, don't limit the depth of the crawl
+       :depth_limit => false,
+       # number of times HTTP redirects will be followed
+       :redirect_limit => 5
+     }
+
+     #
+     # Initialize the crawl with starting *urls* (single URL or Array of URLs)
+     # and optional *block*
+     #
+     def initialize(urls, opts = {})
+       @urls = [urls].flatten.map{ |url| url.is_a?(URI) ? url : URI(url) }
+       @urls.each{ |url| url.path = '/' if url.path.empty? }
+
+       @tentacles = []
+       @pages = PageHash.new
+       @on_every_page_blocks = []
+       @on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
+       @skip_link_patterns = []
+       @after_crawl_blocks = []
+
+       process_options opts
+
+       yield self if block_given?
+     end
+
+     #
+     # Convenience method to start a new crawl
+     #
+     def self.crawl(urls, opts = {})
+       self.new(urls, opts) do |core|
+         yield core if block_given?
+         core.run
+       end
+     end
+
+     #
+     # Add a block to be executed on the PageHash after the crawl
+     # is finished
+     #
+     def after_crawl(&block)
+       @after_crawl_blocks << block
+       self
+     end
+
+     #
+     # Add one or more Regex patterns for URLs which should not be
+     # followed
+     #
+     def skip_links_like(*patterns)
+       @skip_link_patterns.concat [patterns].flatten.compact
+       self
+     end
+
+     #
+     # Add a block to be executed on every Page as they are encountered
+     # during the crawl
+     #
+     def on_every_page(&block)
+       @on_every_page_blocks << block
+       self
+     end
+
+     #
+     # Add a block to be executed on Page objects with a URL matching
+     # one or more patterns
+     #
+     def on_pages_like(*patterns, &block)
+       if patterns
+         patterns.each do |pattern|
+           @on_pages_like_blocks[pattern] << block
+         end
+       end
+       self
+     end
+
+     #
+     # Specify a block which will select which links to follow on each page.
+     # The block should return an Array of URI objects.
+     #
+     def focus_crawl(&block)
+       @focus_crawl_block = block
+       self
+     end
+
+     #
+     # Perform the crawl
+     #
+     def run
+       @urls.delete_if { |url| !visit_link?(url) }
+       return if @urls.empty?
+
+       link_queue = Queue.new
+       page_queue = Queue.new
+
+       @opts[:threads].times do
+         @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
+       end
+
+       @urls.each{ |url| link_queue.enq(url) }
+
+       loop do
+         page = page_queue.deq
+
+         @pages[page.url] = page
+
+         puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
+
+         # perform the on_every_page blocks for this page
+         do_page_blocks(page)
+
+         page.discard_doc! if @opts[:discard_page_bodies]
+
+         links_to_follow(page).each do |link|
+           link_queue.enq([link, page])
+           @pages[link] = nil
+         end
+
+         # create an entry in the page hash for each alias of this page,
+         # i.e. all the pages that redirected to this page
+         page.aliases.each do |aka|
+           if !@pages.has_key?(aka) or @pages[aka].nil?
+             @pages[aka] = page.alias_clone(aka)
+           end
+           @pages[aka].add_alias!(page.url)
+         end
+
+         # if we are done with the crawl, tell the threads to end
+         if link_queue.empty? and page_queue.empty?
+           until link_queue.num_waiting == @tentacles.size
+             Thread.pass
+           end
+
+           if page_queue.empty?
+             @tentacles.size.times { link_queue.enq(:END) }
+             break
+           end
+         end
+
+       end
+
+       @tentacles.each { |t| t.join }
+
+       do_after_crawl_blocks()
+
+       self
+     end
+
+     private
+
+     def process_options(options)
+       @opts = DEFAULT_OPTS.merge options
+
+       @opts[:threads] = 1 if @opts[:delay] > 0
+
+       @robots = Robots.new(@opts[:user_agent]) if @opts[:obey_robots_txt]
+     end
+
+     #
+     # Execute the after_crawl blocks
+     #
+     def do_after_crawl_blocks
+       @after_crawl_blocks.each {|b| b.call(@pages)}
+     end
+
+     #
+     # Execute the on_every_page blocks for *page*
+     #
+     def do_page_blocks(page)
+       @on_every_page_blocks.each do |blk|
+         blk.call(page)
+       end
+
+       @on_pages_like_blocks.each do |pattern, blks|
+         if page.url.to_s =~ pattern
+           blks.each { |blk| blk.call(page) }
+         end
+       end
+     end
+
+     #
+     # Return an Array of links to follow from the given page.
+     # Based on whether or not the link has already been crawled,
+     # and the block given to focus_crawl()
+     #
+     def links_to_follow(page)
+       links = @focus_crawl_block ? @focus_crawl_block.call(page) : page.links
+       links.select { |link| visit_link?(link, page) }
+     end
+
+     #
+     # Returns +true+ if *link* has not been visited already,
+     # and is not excluded by a skip_link pattern...
+     # and is not excluded by robots.txt...
+     # and is not deeper than the depth limit
+     # Returns +false+ otherwise.
+     #
+     def visit_link?(link, from_page = nil)
+       allowed = @opts[:obey_robots_txt] ? @robots.allowed?(link) : true
+
+       if from_page && @opts[:depth_limit]
+         too_deep = from_page.depth >= @opts[:depth_limit]
+       else
+         too_deep = false
+       end
+
+       !@pages.has_page?(link) && !skip_link?(link) && allowed && !too_deep
+     end
+
+     #
+     # Returns +true+ if *link* should not be visited because
+     # its URL matches a skip_link pattern.
+     #
+     def skip_link?(link)
+       @skip_link_patterns.any? { |p| link.path =~ p }
+     end
+
+   end
+ end
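
Putting the per-crawl options from DEFAULT_OPTS together with focus_crawl, a more selective crawl might look like the sketch below. The URL, option values, and the /docs/ filter are illustrative; note that process_options forces :threads to 1 whenever :delay is greater than zero.

  require 'anemone'

  Anemone.crawl("http://www.example.com/",
                :threads => 2,             # number of Tentacle threads
                :verbose => true,          # print each URL and queue size as pages arrive
                :obey_robots_txt => true,  # check links against robots.txt via the robots gem
                :depth_limit => 3) do |anemone|
    # follow only links that stay under /docs/ (illustrative filter)
    anemone.focus_crawl do |page|
      page.links.select { |uri| uri.path =~ %r{^/docs/} }
    end
  end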