anemone 0.0.1
- data/README.txt +18 -0
- data/bin/anemone_count.rb +31 -0
- data/bin/anemone_cron.rb +99 -0
- data/bin/anemone_pagedepth.rb +39 -0
- data/bin/anemone_serialize.rb +43 -0
- data/bin/anemone_url_list.rb +46 -0
- data/lib/anemone.rb +2 -0
- data/lib/anemone/anemone.rb +16 -0
- data/lib/anemone/core.rb +183 -0
- data/lib/anemone/http.rb +37 -0
- data/lib/anemone/page.rb +165 -0
- data/lib/anemone/page_hash.rb +83 -0
- data/lib/anemone/tentacle.rb +31 -0
- metadata +82 -0
data/README.txt
ADDED
@@ -0,0 +1,18 @@
= Anemone

== DESCRIPTION
Anemone is a web spider framework that can spider a domain and collect useful
information about the pages it visits. It is versatile, allowing you to
write your own specialized spider tasks quickly and easily.

== FEATURES
* Multi-threaded design for high performance
* Tracks 301 HTTP redirects to understand a page's aliases
* Built-in BFS algorithm for determining page depth
* Allows exclusion of URLs based on regular expressions

== REQUIREMENTS
* hpricot

== EXAMPLES
See the +bin+ directory for several examples of useful Anemone tasks.
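For illustration only (this is not one of the gem's files): a minimal crawl using the API defined in the lib/ files below might look like the following sketch, where http://example.com/ is just a placeholder URL.

  require 'anemone'

  # Print every in-domain URL as the spider encounters it.
  Anemone.crawl("http://example.com/") do |anemone|
    anemone.on_every_page do |page|
      puts "#{page.url} (#{page.code})"
    end
  end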
data/bin/anemone_count.rb
ADDED
@@ -0,0 +1,31 @@
#! /usr/bin/env ruby
# == Synopsis
# Crawls a site starting at the given URL, and outputs the total number
# of unique pages on the site.
#
# == Usage
# anemone_count.rb url
#
# == Author
# Chris Kite

$:.unshift File.join(File.dirname(__FILE__), "..", "lib")

require 'anemone'
require 'rdoc/usage'

# make sure that the first option is a URL we can crawl
begin
  URI(ARGV[0])
rescue
  RDoc::usage()
  Process.exit
end

Anemone.crawl(ARGV[0]) do |anemone|
  anemone.after_crawl do |pages|
    puts pages.uniq.size
  end
end
data/bin/anemone_cron.rb
ADDED
@@ -0,0 +1,99 @@
#! /usr/bin/env ruby
# == Synopsis
# Performs pagedepth, url list, and count functionality
# Meant to be run daily as a cron job
#
# == Usage
# anemone_cron.rb [options] url
#
# == Options
# -r, --relative           Output relative URLs (rather than absolute)
# -o, --output filename    Filename to save URL list to. Defaults to urls.txt.
#
# == Author
# Chris Kite

$:.unshift File.join(File.dirname(__FILE__), "..", "lib")

require 'anemone'
require 'optparse'
require 'rdoc/usage'
require 'ostruct'

options = OpenStruct.new
options.relative = false
options.output_file = 'urls.txt'

# make sure that the last option is a URL we can crawl
begin
  URI(ARGV.last)
rescue
  RDoc::usage()
  Process.exit
end

# parse command-line options
opts = OptionParser.new
opts.on('-r', '--relative') { options.relative = true }
opts.on('-o', '--output filename') {|o| options.output_file = o }
opts.parse!(ARGV)

root = ARGV.last

Anemone.crawl(root) do |anemone|

  anemone.after_crawl do |pages|
    puts "Crawl results for #{root}\n"

    # print a list of 404's
    not_found = []
    pages.each_value do |page|
      url = page.url.to_s
      not_found << url if page.not_found?
    end
    if !not_found.empty?
      puts "\n404's:"
      not_found.each do |url|
        if options.relative
          puts URI(url).path.to_s
        else
          puts url
        end
        num_linked_from = 0
        pages.urls_linking_to(url).each do |u|
          u = u.path if options.relative
          num_linked_from += 1
          puts "  linked from #{u}"
          if num_linked_from > 10
            puts "  ..."
            break
          end
        end
      end

      print "\n"
    end

    # remove redirect aliases, and calculate pagedepths
    pages = pages.shortest_paths!(root).uniq
    depths = pages.values.inject({}) do |depths, page|
      depths[page.depth] ||= 0
      depths[page.depth] += 1
      depths
    end

    # print the page count
    puts "Total pages: #{pages.size}\n"

    # print a list of depths
    depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }

    # output a list of urls to file
    file = open(options.output_file, 'w')
    pages.each_key do |url|
      url = options.relative ? url.path.to_s : url.to_s
      file.puts url
    end

  end
end
data/bin/anemone_pagedepth.rb
ADDED
@@ -0,0 +1,39 @@
#! /usr/bin/env ruby
# == Synopsis
# Crawls a site starting at the given URL, and outputs a count of
# the number of Pages at each depth in the site.
#
# == Usage
# anemone_pagedepth.rb url
#
# == Author
# Chris Kite

$:.unshift File.join(File.dirname(__FILE__), "..", "lib")

require 'anemone'
require 'rdoc/usage'

# make sure that the first option is a URL we can crawl
begin
  URI(ARGV[0])
rescue
  RDoc::usage()
  Process.exit
end

root = ARGV[0]
Anemone.crawl(root) do |anemone|
  anemone.skip_links_like %r{^/c/$}, %r{^/stores/$}

  anemone.after_crawl do |pages|
    pages = pages.shortest_paths!(root).uniq
    depths = pages.values.inject({}) do |depths, page|
      depths[page.depth] ||= 0
      depths[page.depth] += 1
      depths
    end

    depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
  end
end
data/bin/anemone_serialize.rb
ADDED
@@ -0,0 +1,43 @@
#! /usr/bin/env ruby
# == Synopsis
# Crawls a site starting at the given URL, and saves the resulting
# PageHash object to a file using Marshal serialization.
#
# == Usage
# anemone_serialize.rb [options] url
#
# == Options
# -o, --output filename    Filename to save PageHash to. Defaults to crawl.{Time.now}
#
# == Author
# Chris Kite

$:.unshift File.join(File.dirname(__FILE__), "..", "lib")

require 'anemone'
require 'optparse'
require 'rdoc/usage'
require 'ostruct'

# make sure that the first option is a URL we can crawl
begin
  URI(ARGV[0])
rescue
  RDoc::usage()
  Process.exit
end

options = OpenStruct.new
options.output_file = "crawl.#{Time.now.to_i}"

# parse command-line options
opts = OptionParser.new
opts.on('-o', '--output filename') {|o| options.output_file = o }
opts.parse!(ARGV)

root = ARGV[0]
Anemone.crawl(root) do |anemone|
  anemone.after_crawl do |pages|
    open(options.output_file, 'w') {|f| Marshal.dump(pages, f)}
  end
end
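As an illustrative sketch rather than part of the gem: a PageHash saved by anemone_serialize.rb could later be restored with Marshal.load, the counterpart of the Marshal.dump call above. The filename argument here is assumed to be whatever was passed to -o (or the crawl.{timestamp} default).

  #! /usr/bin/env ruby
  # Sketch: reload a serialized PageHash and print how many pages it holds.
  $:.unshift File.join(File.dirname(__FILE__), "..", "lib")

  require 'anemone'

  pages = open(ARGV[0]) { |f| Marshal.load(f) }
  puts "#{pages.size} pages in #{ARGV[0]}"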
data/bin/anemone_url_list.rb
ADDED
@@ -0,0 +1,46 @@
#! /usr/bin/env ruby
# == Synopsis
# Crawls a site starting at the given URL, and outputs the URL of each page
# in the domain as they are encountered.
#
# == Usage
# anemone_url_list.rb [options] url
#
# == Options
# -r, --relative    Output relative URLs (rather than absolute)
#
# == Author
# Chris Kite

$:.unshift File.join(File.dirname(__FILE__), "..", "lib")

require 'anemone'
require 'optparse'
require 'rdoc/usage'
require 'ostruct'

options = OpenStruct.new
options.relative = false

# make sure that the last option is a URL we can crawl
begin
  URI(ARGV.last)
rescue
  RDoc::usage()
  Process.exit
end

# parse command-line options
opts = OptionParser.new
opts.on('-r', '--relative') { options.relative = true }
opts.parse!(ARGV)

Anemone.crawl(ARGV.last) do |anemone|
  anemone.on_every_page do |page|
    if options.relative
      puts page.url.path
    else
      puts page.url
    end
  end
end
data/lib/anemone/anemone.rb
ADDED
@@ -0,0 +1,16 @@
require 'anemone/core'

module Anemone
  # Version number
  VERSION = '0.0.1'

  # User-Agent string used for HTTP requests
  USER_AGENT = "Anemone/#{self::VERSION}"

  #
  # Convenience method to start a crawl using Core
  #
  def Anemone.crawl(url, options = {}, &block)
    Core.crawl(url, options, &block)
  end
end
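Another sketch, not shipped with the gem: Anemone.crawl passes the options hash straight through to Core (next file), so the :threads and :verbose options read in Core#initialize can be set as follows; the URL is a placeholder.

  require 'anemone'

  # Crawl with 8 worker threads and verbose queue output instead of the defaults.
  Anemone.crawl("http://example.com/", :threads => 8, :verbose => true) do |anemone|
    anemone.after_crawl do |pages|
      puts "Crawled #{pages.size} pages"
    end
  end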
data/lib/anemone/core.rb
ADDED
@@ -0,0 +1,183 @@
require 'net/http'
require 'thread'
require 'anemone/tentacle'
require 'anemone/page_hash'

module Anemone
  class Core
    # PageHash storing all Page objects encountered during the crawl
    attr_reader :pages

    #
    # Initialize the crawl with a starting *url*, *options*, and optional *block*
    #
    def initialize(url, options={}, &block)
      url = URI(url) if url.is_a?(String)
      @url = url
      @options = options
      @tentacles = []
      @pages = PageHash.new
      @on_every_page_blocks = []
      @on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
      @skip_link_patterns = []
      @after_crawl_blocks = []

      @options[:threads] ||= 4
      @options[:verbose] ||= false

      block.call(self) if block
    end

    #
    # Convenience method to start a new crawl
    #
    def self.crawl(root, options={}, &block)
      self.new(root, options) do |core|
        block.call(core) if block
        core.run
        core.do_after_crawl_blocks
        return core
      end
    end

    #
    # Add a block to be executed on the PageHash after the crawl
    # is finished
    #
    def after_crawl(&block)
      @after_crawl_blocks << block
      self
    end

    #
    # Add one or more Regex patterns for URLs which should not be
    # followed
    #
    def skip_links_like(*patterns)
      if patterns
        patterns.each do |pattern|
          @skip_link_patterns << pattern
        end
      end
      self
    end

    #
    # Add a block to be executed on every Page as they are encountered
    # during the crawl
    #
    def on_every_page(&block)
      @on_every_page_blocks << block
      self
    end

    #
    # Add a block to be executed on Page objects with a URL matching
    # one or more patterns
    #
    def on_pages_like(*patterns, &block)
      if patterns
        patterns.each do |pattern|
          @on_pages_like_blocks[pattern] << block
        end
      end
      self
    end

    #
    # Perform the crawl
    #
    def run
      link_queue = Queue.new
      page_queue = Queue.new

      @options[:threads].times do |id|
        @tentacles << Thread.new { Tentacle.new(link_queue, page_queue).run }
      end

      return if !visit_link?(@url)

      link_queue.enq(@url)

      while true do
        page = page_queue.deq

        @pages[page.url] = page

        puts "#{page.url} Queue: #{link_queue.size}" if @options[:verbose]

        do_page_blocks(page)

        page.links.each do |link|
          if visit_link?(link)
            link_queue.enq(link)
            @pages[link] = nil
          end
        end

        page.aliases.each do |aka|
          if !@pages.has_key?(aka) or @pages[aka].nil?
            @pages[aka] = page.alias_clone(aka)
          end
          @pages[aka].add_alias!(page.url)
        end

        # if we are done with the crawl, tell the threads to end
        if link_queue.empty? and page_queue.empty?
          until link_queue.num_waiting == @tentacles.size
            Thread.pass
          end

          if page_queue.empty?
            @tentacles.size.times { |i| link_queue.enq(:END) }
            break
          end
        end

      end

      @tentacles.each { |t| t.join }

      self
    end

    #
    # Execute the after_crawl blocks
    #
    def do_after_crawl_blocks
      @after_crawl_blocks.each {|b| b.call(@pages)}
    end

    #
    # Execute the on_every_page blocks for *page*
    #
    def do_page_blocks(page)
      @on_every_page_blocks.each do |blk|
        blk.call(page)
      end

      # each value of @on_pages_like_blocks is an Array of blocks for that pattern
      @on_pages_like_blocks.each do |pattern, blks|
        blks.each { |blk| blk.call(page) } if page.url.to_s =~ pattern
      end
    end

    #
    # Returns +true+ if *link* has not been visited already,
    # and is not excluded by a skip_link pattern. Returns
    # +false+ otherwise.
    #
    def visit_link?(link)
      !@pages.has_key?(link) and !skip_link?(link)
    end

    #
    # Returns +true+ if *link* should not be visited because
    # its URL matches a skip_link pattern.
    #
    def skip_link?(link)
      @skip_link_patterns.each { |p| return true if link.path =~ p }
      return false
    end

  end
end
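A further illustrative sketch (not part of the gem) combining the hooks above: skip_link? matches its patterns against the link path, while do_page_blocks matches on_pages_like patterns against the full page URL. The URL and patterns are placeholders.

  require 'anemone'

  Anemone.crawl("http://example.com/") do |anemone|
    # never follow links into the calendar section
    anemone.skip_links_like %r{^/calendar/}

    # run this block only on pages whose URL looks like a product page
    anemone.on_pages_like(%r{/products/\d+}) do |page|
      puts "#{page.url} returned #{page.code}"
    end
  end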
data/lib/anemone/http.rb
ADDED
@@ -0,0 +1,37 @@
require 'net/http'

module Anemone
  class HTTP < Net::HTTP
    # Maximum number of redirects to follow on each get_response
    REDIRECTION_LIMIT = 5

    #
    # Retrieve an HTTP response for *url*, following redirects.
    # Returns the response object, response code, and final URI location.
    #
    def self.get(url)
      response = get_response(url)
      code = Integer(response.code)
      loc = url

      limit = REDIRECTION_LIMIT
      while response.is_a?(Net::HTTPRedirection) and limit > 0
        loc = URI(response['location'])
        loc = url.merge(loc) if loc.relative?
        response = get_response(loc)
        limit -= 1
      end

      return response, code, loc
    end

    #
    # Get an HTTPResponse for *url*, sending the appropriate User-Agent string
    #
    def self.get_response(url)
      Net::HTTP.start(url.host, url.port) do |http|
        return http.get(url.path, {'User-Agent' => Anemone::USER_AGENT })
      end
    end
  end
end
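For a standalone illustration (again, not a gem file): HTTP.get can be called on its own. It returns the final response, the first response's numeric code, and the final location after following up to REDIRECTION_LIMIT redirects; the URL is a placeholder.

  require 'anemone'

  url = URI("http://example.com/")
  response, code, location = Anemone::HTTP.get(url)
  puts "#{code} -> #{location}"
  puts response['Content-Type']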
data/lib/anemone/page.rb
ADDED
@@ -0,0 +1,165 @@
require 'anemone/http'
require 'hpricot'

module Anemone
  class Page
    # The URL of the page
    attr_reader :url
    # Array of distinct A tag HREFs from the page
    attr_reader :links
    # Integer response code of the page
    attr_reader :code

    # Array of redirect-aliases for the page
    attr_accessor :aliases
    # Boolean indicating whether or not this page has been visited in PageHash#shortest_paths!
    attr_accessor :visited
    # Used by PageHash#shortest_paths! to store depth of the page
    attr_accessor :depth

    #
    # Create a new Page from the response of an HTTP request to *url*
    #
    def self.fetch(url)
      begin
        url = URI(url) if url.is_a?(String)

        response, code, location = Anemone::HTTP.get(url)

        aka = nil
        if !url.eql?(location)
          aka = location
        end

        return Page.new(url, response, code, aka)
      rescue
        return Page.new(url)
      end
    end

    #
    # Create a new page
    #
    def initialize(url, response = nil, code = nil, aka = nil)
      @url = url
      @response = response
      @code = code
      @links = []
      @aliases = []

      @aliases << aka if !aka.nil?

      # get a list of distinct links on the page, in absolute url form
      if @response and @response.body
        Hpricot(@response.body).search('a').each do |a|
          u = a['href']
          next if u.nil?

          begin
            u = URI(u)
          rescue
            next
          end

          abs = to_absolute(u)
          @links << abs if in_domain?(abs)
        end

        @links.uniq!
      end
    end

    #
    # Return a new page with the same *response* and *url*, but
    # with a 200 response code
    #
    def alias_clone(url)
      Page.new(url, @response, 200, @url)
    end

    #
    # Add a redirect-alias String *aka* to the list of the page's aliases
    #
    # Returns *self*
    #
    def add_alias!(aka)
      @aliases << aka if !@aliases.include?(aka)
      self
    end

    #
    # Returns an Array of all links from this page, and all the
    # redirect-aliases of those pages, as String objects.
    #
    # *page_hash* is a PageHash object with the results of the current crawl.
    #
    def links_and_their_aliases(page_hash)
      @links.inject([]) do |results, link|
        results.concat([link].concat(page_hash[link].aliases))
      end
    end

    #
    # Returns the response body for the page
    #
    def body
      @response.body
    end

    #
    # Returns the +Content-Type+ header for the page
    #
    def content_type
      @response['Content-Type']
    end

    #
    # Returns +true+ if the page is an HTML document, returns +false+
    # otherwise.
    #
    def html?
      (content_type =~ /text\/html/) == 0
    end

    #
    # Returns +true+ if the page is an HTTP redirect, returns +false+
    # otherwise.
    #
    def redirect?
      (300..399).include?(@code)
    end

    #
    # Returns +true+ if the page was not found (returned 404 code),
    # returns +false+ otherwise.
    #
    def not_found?
      404 == @code
    end

    #
    # Converts relative URL *link* into an absolute URL based on the
    # location of the page
    #
    def to_absolute(link)
      # remove anchor
      link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))

      relative = URI(link)
      absolute = @url.merge(relative)

      absolute.path = '/' if absolute.path.empty?

      return absolute
    end

    #
    # Returns +true+ if *uri* is in the same domain as the page, returns
    # +false+ otherwise
    #
    def in_domain?(uri)
      uri.host == @url.host
    end
  end
end
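One more sketch outside the gem's files: Page.fetch can also be used directly to inspect a single page without running a crawl; the URL is a placeholder.

  require 'anemone'

  page = Anemone::Page.fetch("http://example.com/")
  puts "code: #{page.code}"
  puts "redirect aliases: #{page.aliases.inspect}"
  page.links.each { |link| puts link }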
data/lib/anemone/page_hash.rb
ADDED
@@ -0,0 +1,83 @@
module Anemone
  class PageHash < Hash

    #
    # Use a breadth-first search to calculate the single-source
    # shortest paths from *root* to all pages in the PageHash
    #
    def shortest_paths!(root)
      root = URI(root) if root.is_a?(String)
      raise "Root node not found" if !has_key?(root)

      each_value {|p| p.visited = false if p}

      q = Queue.new

      q.enq(root)
      self[root].depth = 0
      self[root].visited = true
      while(!q.empty?)
        url = q.deq

        next if !has_key?(url)

        page = self[url]

        page.links.each do |u|
          next if !has_key?(u) or self[u].nil?
          link = self[u]
          aliases = [link].concat(link.aliases.map {|a| self[a] })

          aliases.each do |node|
            if node.depth.nil? or page.depth + 1 < node.depth
              node.depth = page.depth + 1
            end
          end

          q.enq(self[u].url) if !self[u].visited
          self[u].visited = true
        end
      end

      self
    end

    #
    # Returns a new PageHash by removing redirect-aliases for each
    # non-redirect Page
    #
    def uniq
      results = PageHash.new
      each do |url, page|
        # if none of the aliases of this page have been added, and this isn't a redirect page, add this page
        page_added = page.aliases.inject(false) { |r, a| r ||= results.has_key? a }
        if !page.redirect? and !page_added
          results[url] = page.clone
          results[url].aliases = []
        end
      end

      results
    end

    #
    # Return an Array of Page objects which link to the given url
    #
    def pages_linking_to url
      begin
        url = URI(url) if url.is_a?(String)
      rescue
        return []
      end

      values.delete_if { |p| !p.links.include?(url) }
    end

    #
    # Return an Array of URI objects of Pages linking to the given url
    def urls_linking_to url
      pages_linking_to(url).map{|p| p.url}
    end

  end
end
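A final illustrative sketch, not part of the gem: inside an after_crawl block these PageHash helpers combine in the same way anemone_cron.rb uses them; the URLs are placeholders.

  require 'anemone'

  root = "http://example.com/"
  Anemone.crawl(root) do |anemone|
    anemone.after_crawl do |pages|
      # who links to the contact page?
      pages.urls_linking_to("http://example.com/contact").each do |u|
        puts "contact page linked from #{u}"
      end

      # depth of every unique, non-redirect page from the root
      pages.shortest_paths!(root).uniq.each_value do |page|
        puts "#{page.url} is at depth #{page.depth}"
      end
    end
  end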
data/lib/anemone/tentacle.rb
ADDED
@@ -0,0 +1,31 @@
require 'anemone/page'

module Anemone
  class Tentacle

    #
    # Create a new Tentacle
    #
    def initialize(link_queue, page_queue)
      @link_queue = link_queue
      @page_queue = page_queue
    end

    #
    # Gets links from @link_queue, and returns the fetched
    # Page objects into @page_queue
    #
    def run
      while true do
        link = @link_queue.deq

        break if link == :END

        page = Page.fetch(link)

        @page_queue.enq(page)
      end
    end

  end
end
metadata
ADDED
@@ -0,0 +1,82 @@
--- !ruby/object:Gem::Specification
name: anemone
version: !ruby/object:Gem::Version
  version: 0.0.1
platform: ruby
authors:
- Chris Kite
autorequire:
bindir: bin
cert_chain: []

date: 2009-04-14 00:00:00 -05:00
default_executable:
dependencies:
- !ruby/object:Gem::Dependency
  name: hpricot
  type: :runtime
  version_requirement:
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: 0.7.0
    version:
description:
email:
executables:
- anemone_count.rb
- anemone_cron.rb
- anemone_pagedepth.rb
- anemone_serialize.rb
- anemone_url_list.rb
extensions: []

extra_rdoc_files:
- README.txt
files:
- bin/anemone_count.rb
- bin/anemone_cron.rb
- bin/anemone_pagedepth.rb
- bin/anemone_serialize.rb
- bin/anemone_url_list.rb
- lib/anemone
- lib/anemone/anemone.rb
- lib/anemone/core.rb
- lib/anemone/http.rb
- lib/anemone/page.rb
- lib/anemone/page_hash.rb
- lib/anemone/tentacle.rb
- lib/anemone.rb
- README.txt
has_rdoc: true
homepage: http://anemone.rubyforge.org
post_install_message:
rdoc_options:
- -m
- README.txt
- -t
- Anemone
require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: "0"
  version:
required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: "0"
  version:
requirements: []

rubyforge_project: anemone
rubygems_version: 1.3.1
signing_key:
specification_version: 2
summary: Anemone web-spider framework
test_files: []