anemone 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.txt +18 -0
- data/bin/anemone_count.rb +31 -0
- data/bin/anemone_cron.rb +99 -0
- data/bin/anemone_pagedepth.rb +39 -0
- data/bin/anemone_serialize.rb +43 -0
- data/bin/anemone_url_list.rb +46 -0
- data/lib/anemone.rb +2 -0
- data/lib/anemone/anemone.rb +16 -0
- data/lib/anemone/core.rb +183 -0
- data/lib/anemone/http.rb +37 -0
- data/lib/anemone/page.rb +165 -0
- data/lib/anemone/page_hash.rb +83 -0
- data/lib/anemone/tentacle.rb +31 -0
- metadata +82 -0
data/README.txt
ADDED
@@ -0,0 +1,18 @@
= Anemone

== DESCRIPTION
Anemone is a web spider framework that can spider a domain and collect useful
information about the pages it visits. It is versatile, allowing you to
write your own specialized spider tasks quickly and easily.

== FEATURES
* Multi-threaded design for high performance
* Tracks 301 HTTP redirects to understand a page's aliases
* Built-in BFS algorithm for determining page depth
* Allows exclusion of URLs based on regular expressions

== REQUIREMENTS
* hpricot

== EXAMPLES
See the +bin+ directory for several examples of useful Anemone tasks.
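
As a quick orientation before the bin/ scripts that follow, here is a minimal sketch of the block-based API this release exposes. The start URL is a placeholder and the script is not part of the package; it simply combines the Anemone.crawl, on_every_page, and after_crawl calls shown in the files below.

require 'anemone'

# Crawl a site, print each page as it is fetched, then report a unique-page count.
Anemone.crawl("http://www.example.com/") do |anemone|
  anemone.on_every_page do |page|
    puts "#{page.code} #{page.url}"
  end

  anemone.after_crawl do |pages|
    puts "Crawled #{pages.uniq.size} unique pages"
  end
end
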
data/bin/anemone_count.rb
ADDED
@@ -0,0 +1,31 @@
#! /usr/bin/env ruby
# == Synopsis
# Crawls a site starting at the given URL, and outputs the total number
# of unique pages on the site.
#
# == Usage
# anemone_count.rb url
#
# == Author
# Chris Kite

$:.unshift File.join(File.dirname(__FILE__), "..", "lib")

require 'anemone'
require 'rdoc/usage'

# make sure that the first option is a URL we can crawl
begin
  URI(ARGV[0])
rescue
  RDoc::usage()
  Process.exit
end

Anemone.crawl(ARGV[0]) do |anemone|
  anemone.after_crawl do |pages|
    puts pages.uniq.size
  end
end


data/bin/anemone_cron.rb
ADDED
@@ -0,0 +1,99 @@
#! /usr/bin/env ruby
# == Synopsis
# Performs pagedepth, url list, and count functionality
# Meant to be run daily as a cron job
#
# == Usage
# anemone_url_list.rb [options] url
#
# == Options
# -r, --relative Output relative URLs (rather than absolute)
# -o, --output filename Filename to save URL list to. Defaults to urls.txt.
#
# == Author
# Chris Kite

$:.unshift File.join(File.dirname(__FILE__), "..", "lib")

require 'anemone'
require 'optparse'
require 'rdoc/usage'
require 'ostruct'

options = OpenStruct.new
options.relative = false
options.output_file = 'urls.txt'

# make sure that the last option is a URL we can crawl
begin
  URI(ARGV.last)
rescue
  RDoc::usage()
  Process.exit
end

# parse command-line options
opts = OptionParser.new
opts.on('-r', '--relative') { options.relative = true }
opts.on('-o', '--output filename') {|o| options.output_file = o }
opts.parse!(ARGV)

root = ARGV.last

Anemone.crawl(root) do |anemone|

  anemone.after_crawl do |pages|
    puts "Crawl results for #{root}\n"

    # print a list of 404's
    not_found = []
    pages.each_value do |page|
      url = page.url.to_s
      not_found << url if page.not_found?
    end
    if !not_found.empty?
      puts "\n404's:"
      not_found.each do |url|
        if options.relative
          puts URI(url).path.to_s
        else
          puts url
        end
        num_linked_from = 0
        pages.urls_linking_to(url).each do |u|
          u = u.path if options.relative
          num_linked_from += 1
          puts " linked from #{u}"
          if num_linked_from > 10
            puts " ..."
            break
          end
        end
      end

      print "\n"
    end

    # remove redirect aliases, and calculate pagedepths
    pages = pages.shortest_paths!(root).uniq
    depths = pages.values.inject({}) do |depths, page|
      depths[page.depth] ||= 0
      depths[page.depth] += 1
      depths
    end

    # print the page count
    puts "Total pages: #{pages.size}\n"

    # print a list of depths
    depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }

    # output a list of urls to file
    file = open(options.output_file, 'w')
    pages.each_key do |url|
      url = options.relative ? url.path.to_s : url.to_s
      file.puts url
    end

  end
end
data/bin/anemone_pagedepth.rb
ADDED
@@ -0,0 +1,39 @@
#! /usr/bin/env ruby
# == Synopsis
# Crawls a site starting at the given URL, and outputs a count of
# the number of Pages at each depth in the site.
#
# == Usage
# anemone_pagedepth.rb url
#
# == Author
# Chris Kite

$:.unshift File.join(File.dirname(__FILE__), "..", "lib")

require 'anemone'
require 'rdoc/usage'

# make sure that the first option is a URL we can crawl
begin
  URI(ARGV[0])
rescue
  RDoc::usage()
  Process.exit
end

root = ARGV[0]
Anemone.crawl(root) do |anemone|
  anemone.skip_links_like %r{^/c/$}, %r{^/stores/$}

  anemone.after_crawl do |pages|
    pages = pages.shortest_paths!(root).uniq
    depths = pages.values.inject({}) do |depths, page|
      depths[page.depth] ||= 0
      depths[page.depth] += 1
      depths
    end

    depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
  end
end
data/bin/anemone_serialize.rb
ADDED
@@ -0,0 +1,43 @@
#! /usr/bin/env ruby
# == Synopsis
# Crawls a site starting at the given URL, and saves the resulting
# PageHash object to a file using Marshal serialization.
#
# == Usage
# anemone_serialize.rb [options] url
#
# == Options
# -o, --output filename Filename to save PageHash to. Defaults to crawl.{Time.now}
#
# == Author
# Chris Kite

$:.unshift File.join(File.dirname(__FILE__), "..", "lib")

require 'anemone'
require 'optparse'
require 'rdoc/usage'
require 'ostruct'

# make sure that the first option is a URL we can crawl
begin
  URI(ARGV[0])
rescue
  RDoc::usage()
  Process.exit
end

options = OpenStruct.new
options.output_file = "crawl.#{Time.now.to_i}"

# parse command-line options
opts = OptionParser.new
opts.on('-o', '--output filename') {|o| options.output_file = o }
opts.parse!(ARGV)

root = ARGV[0]
Anemone.crawl(root) do |anemone|
  anemone.after_crawl do |pages|
    open(options.output_file, 'w') {|f| Marshal.dump(pages, f)}
  end
end
data/bin/anemone_url_list.rb
ADDED
@@ -0,0 +1,46 @@
#! /usr/bin/env ruby
# == Synopsis
# Crawls a site starting at the given URL, and outputs the URL of each page
# in the domain as they are encountered.
#
# == Usage
# anemone_url_list.rb [options] url
#
# == Options
# -r, --relative Output relative URLs (rather than absolute)
#
# == Author
# Chris Kite

$:.unshift File.join(File.dirname(__FILE__), "..", "lib")

require 'anemone'
require 'optparse'
require 'rdoc/usage'
require 'ostruct'

options = OpenStruct.new
options.relative = false

# make sure that the last option is a URL we can crawl
begin
  URI(ARGV.last)
rescue
  RDoc::usage()
  Process.exit
end

# parse command-line options
opts = OptionParser.new
opts.on('-r', '--relative') { options.relative = true }
opts.parse!(ARGV)

Anemone.crawl(ARGV.last) do |anemone|
  anemone.on_every_page do |page|
    if options.relative
      puts page.url.path
    else
      puts page.url
    end
  end
end
data/lib/anemone/anemone.rb
ADDED
@@ -0,0 +1,16 @@
require 'anemone/core'

module Anemone
  # Version number
  VERSION = '0.0.1'

  # User-Agent string used for HTTP requests
  USER_AGENT = "Anemone/#{self::VERSION}"

  #
  # Convenience method to start a crawl using Core
  #
  def Anemone.crawl(url, options = {}, &block)
    Core.crawl(url, options, &block)
  end
end
data/lib/anemone/core.rb
ADDED
@@ -0,0 +1,183 @@
require 'net/http'
require 'thread'
require 'anemone/tentacle'
require 'anemone/page_hash'

module Anemone
  class Core
    # PageHash storing all Page objects encountered during the crawl
    attr_reader :pages

    #
    # Initialize the crawl with a starting *url*, *options*, and optional *block*
    #
    def initialize(url, options={}, &block)
      url = URI(url) if url.is_a?(String)
      @url = url
      @options = options
      @tentacles = []
      @pages = PageHash.new
      @on_every_page_blocks = []
      @on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
      @skip_link_patterns = []
      @after_crawl_blocks = []

      @options[:threads] ||= 4
      @options[:verbose] ||= false

      block.call(self) if block
    end

    #
    # Convenience method to start a new crawl
    #
    def self.crawl(root, options={}, &block)
      self.new(root, options) do |core|
        block.call(core) if block
        core.run
        core.do_after_crawl_blocks
        return core
      end
    end

    #
    # Add a block to be executed on the PageHash after the crawl
    # is finished
    #
    def after_crawl(&block)
      @after_crawl_blocks << block
      self
    end

    #
    # Add one ore more Regex patterns for URLs which should not be
    # followed
    #
    def skip_links_like(*patterns)
      if patterns
        patterns.each do |pattern|
          @skip_link_patterns << pattern
        end
      end
      self
    end

    #
    # Add a block to be executed on every Page as they are encountered
    # during the crawl
    #
    def on_every_page(&block)
      @on_every_page_blocks << block
      self
    end

    #
    # Add a block to be executed on Page objects with a URL matching
    # one or more patterns
    #
    def on_pages_like(*patterns, &block)
      if patterns
        patterns.each do |pattern|
          @on_pages_like_blocks[pattern] << block
        end
      end
      self
    end

    #
    # Perform the crawl
    #
    def run
      link_queue = Queue.new
      page_queue = Queue.new

      @options[:threads].times do |id|
        @tentacles << Thread.new { Tentacle.new(link_queue, page_queue).run }
      end

      return if !visit_link?(@url)

      link_queue.enq(@url)

      while true do
        page = page_queue.deq

        @pages[page.url] = page

        puts "#{page.url} Queue: #{link_queue.size}" if @options[:verbose]

        do_page_blocks(page)

        page.links.each do |link|
          if visit_link?(link)
            link_queue.enq(link)
            @pages[link] = nil
          end
        end

        page.aliases.each do |aka|
          if !@pages.has_key?(aka) or @pages[aka].nil?
            @pages[aka] = page.alias_clone(aka)
          end
          @pages[aka].add_alias!(page.url)
        end

        # if we are done with the crawl, tell the threads to end
        if link_queue.empty? and page_queue.empty?
          until link_queue.num_waiting == @tentacles.size
            Thread.pass
          end

          if page_queue.empty?
            @tentacles.size.times { |i| link_queue.enq(:END)}
            break
          end
        end

      end

      @tentacles.each { |t| t.join }

      self
    end

    #
    # Execute the after_crawl blocks
    #
    def do_after_crawl_blocks
      @after_crawl_blocks.each {|b| b.call(@pages)}
    end

    #
    # Execute the on_every_page blocks for *page*
    #
    def do_page_blocks(page)
      @on_every_page_blocks.each do |blk|
        blk.call(page)
      end

      @on_pages_like_blocks.each do |pattern, blk|
        blk.call(page) if page.url.to_s =~ pattern
      end
    end

    #
    # Returns +true+ if *link* has not been visited already,
    # and is not excluded by a skip_link pattern. Returns
    # +false+ otherwise.
    #
    def visit_link?(link)
      !@pages.has_key?(link) and !skip_link?(link)
    end

    #
    # Returns +true+ if *link* should not be visited because
    # its URL matches a skip_link pattern.
    #
    def skip_link?(link)
      @skip_link_patterns.each { |p| return true if link.path =~ p}
      return false
    end

  end
end
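
A sketch of how the Core options and filters above fit together; the start URL, option values, and patterns are invented for illustration and are not part of the package. Note that skip_links_like patterns are matched against each candidate link's path (see skip_link? above).

require 'anemone'

# :threads and :verbose default to 4 and false in Core#initialize
Anemone.crawl("http://www.example.com/", :threads => 2, :verbose => true) do |anemone|
  # never follow links whose path matches these placeholder patterns
  anemone.skip_links_like %r{^/login}, %r{\.pdf$}

  anemone.after_crawl do |pages|
    # list pages that returned 404 (values can be nil for never-fetched links)
    broken = pages.values.compact.select { |page| page.not_found? }
    broken.each { |page| puts "404: #{page.url}" }
  end
end
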
data/lib/anemone/http.rb
ADDED
@@ -0,0 +1,37 @@
require 'net/http'

module Anemone
  class HTTP < Net::HTTP
    # Maximum number of redirects to follow on each get_response
    REDIRECTION_LIMIT = 5

    #
    # Retrieve an HTTP response for *url*, following redirects.
    # Returns the response object, response code, and final URI location.
    #
    def self.get(url)
      response = get_response(url)
      code = Integer(response.code)
      loc = url

      limit = REDIRECTION_LIMIT
      while response.is_a?(Net::HTTPRedirection) and limit > 0
        loc = URI(response['location'])
        loc = url.merge(loc) if loc.relative?
        response = get_response(loc)
        limit -= 1
      end

      return response, code, loc
    end

    #
    # Get an HTTPResponse for *url*, sending the appropriate User-Agent string
    #
    def self.get_response(url)
      Net::HTTP.start(url.host, url.port) do |http|
        return http.get(url.path, {'User-Agent' => Anemone::USER_AGENT })
      end
    end
  end
end
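
A sketch of calling the helper above directly; the URL is a placeholder and the script is not part of the package.

require 'anemone'

url = URI('http://www.example.com/')
response, code, location = Anemone::HTTP.get(url)

puts code                       # Integer code of the first response (set before redirects are followed)
puts location                   # final URI after up to REDIRECTION_LIMIT redirects
puts response['Content-Type']   # headers of the final Net::HTTPResponse
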
data/lib/anemone/page.rb
ADDED
@@ -0,0 +1,165 @@
require 'anemone/http'
require 'hpricot'

module Anemone
  class Page
    # The URL of the page
    attr_reader :url
    # Array of distinct A tag HREFs from the page
    attr_reader :links
    # Integer response code of the page
    attr_reader :code

    # Array of redirect-aliases for the page
    attr_accessor :aliases
    # Boolean indicating whether or not this page has been visited in PageHash#shortest_paths!
    attr_accessor :visited
    # Used by PageHash#shortest_paths! to store depth of the page
    attr_accessor :depth

    #
    # Create a new Page from the response of an HTTP request to *url*
    #
    def self.fetch(url)
      begin
        url = URI(url) if url.is_a?(String)

        response, code, location = Anemone::HTTP.get(url)

        aka = nil
        if !url.eql?(location)
          aka = location
        end

        return Page.new(url, response, code, aka)
      rescue
        return Page.new(url)
      end
    end

    #
    # Create a new page
    #
    def initialize(url, response = nil, code = nil, aka = nil)
      @url = url
      @response = response
      @code = code
      @links = []
      @aliases = []

      @aliases << aka if !aka.nil?

      #get a list of distinct links on the page, in absolute url form
      if @response and @response.body
        Hpricot(@response.body).search('a').each do |a|
          u = a['href']
          next if u.nil?

          begin
            u = URI(u)
          rescue
            next
          end

          abs = to_absolute(u)
          @links << abs if in_domain?(abs)
        end

        @links.uniq!
      end
    end


    #
    # Return a new page with the same *response* and *url*, but
    # with a 200 response code
    #
    def alias_clone(url)
      Page.new(url, @response, 200, @url)
    end

    #
    # Add a redirect-alias String *aka* to the list of the page's aliases
    #
    # Returns *self*
    #
    def add_alias!(aka)
      @aliases << aka if !@aliases.include?(aka)
      self
    end

    #
    # Returns an Array of all links from this page, and all the
    # redirect-aliases of those pages, as String objects.
    #
    # *page_hash* is a PageHash object with the results of the current crawl.
    #
    def links_and_their_aliases(page_hash)
      @links.inject([]) do |results, link|
        results.concat([link].concat(page_hash[link].aliases))
      end
    end

    #
    # Returns the response body for the page
    #
    def body
      @response.body
    end

    #
    # Returns the +Content-Type+ header for the page
    #
    def content_type
      @response['Content-Type']
    end

    #
    # Returns +true+ if the page is a HTML document, returns +false+
    # otherwise.
    #
    def html?
      (content_type =~ /text\/html/) == 0
    end

    #
    # Returns +true+ if the page is a HTTP redirect, returns +false+
    # otherwise.
    #
    def redirect?
      (300..399).include?(@code)
    end

    #
    # Returns +true+ if the page was not found (returned 404 code),
    # returns +false+ otherwise.
    #
    def not_found?
      404 == @code
    end

    #
    # Converts relative URL *link* into an absolute URL based on the
    # location of the page
    #
    def to_absolute(link)
      # remove anchor
      link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))

      relative = URI(link)
      absolute = @url.merge(relative)

      absolute.path = '/' if absolute.path.empty?

      return absolute
    end

    #
    # Returns +true+ if *uri* is in the same domain as the page, returns
    # +false+ otherwise
    #
    def in_domain?(uri)
      uri.host == @url.host
    end
  end
end
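
A sketch of fetching and inspecting a single page with the class above, outside of a full crawl; the URL is a placeholder.

require 'anemone'

page = Anemone::Page.fetch('http://www.example.com/')

if page.code                      # nil when the fetch itself raised an error
  puts "#{page.code} #{page.url}"
  puts 'html'     if page.html?
  puts 'redirect' if page.redirect?
  page.links.each { |link| puts link }   # absolute, same-domain URIs from <a> tags
end
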
data/lib/anemone/page_hash.rb
ADDED
@@ -0,0 +1,83 @@
module Anemone
  class PageHash < Hash

    #
    # Use a breadth-first search to calculate the single-source
    # shortest paths from *root* to all pages in the PageHash
    #
    def shortest_paths!(root)
      root = URI(root) if root.is_a?(String)
      raise "Root node not found" if !has_key?(root)

      each_value {|p| p.visited = false if p}

      q = Queue.new

      q.enq(root)
      self[root].depth = 0
      self[root].visited = true
      while(!q.empty?)
        url = q.deq

        next if !has_key?(url)

        page = self[url]

        page.links.each do |u|
          next if !has_key?(u) or self[u].nil?
          link = self[u]
          aliases = [link].concat(link.aliases.map {|a| self[a] })

          aliases.each do |node|
            if node.depth.nil? or page.depth + 1 < node.depth
              node.depth = page.depth + 1
            end
          end

          q.enq(self[u].url) if !self[u].visited
          self[u].visited = true
        end
      end

      self
    end

    #
    # Returns a new PageHash by removing redirect-aliases for each
    # non-redirect Page
    #
    def uniq
      results = PageHash.new
      each do |url, page|
        #if none of the aliases of this page have been added, and this isn't a redirect page, add this page
        page_added = page.aliases.inject(false) { |r, a| r ||= results.has_key? a}
        if !page.redirect? and !page_added
          results[url] = page.clone
          results[url].aliases = []
        end
      end

      results
    end

    #
    # Return an Array of Page objects which link to the given url
    #
    def pages_linking_to url
      begin
        url = URI(url) if url.is_a?(String)
      rescue
        return []
      end

      values.delete_if { |p| !p.links.include?(url) }
    end

    #
    # Return an Array of URI objects of Pages linking to the given url
    def urls_linking_to url
      pages_linking_to(url).map{|p| p.url}
    end

  end
end
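
A sketch of reloading a PageHash saved by bin/anemone_serialize.rb above and querying it; the dump filename, root URL, and linked-to URL are placeholders, and the root must be a key in the crawled PageHash.

require 'anemone'

pages = open('crawl.1234567890') { |f| Marshal.load(f) }

root = 'http://www.example.com/'
pages.shortest_paths!(root)        # BFS page depths from the root
unique = pages.uniq                # drop redirect aliases

puts "#{unique.size} unique pages"
unique.each_value { |page| puts "#{page.depth} #{page.url}" }

# which URLs link to a given page?
pages.urls_linking_to("#{root}about").each { |u| puts u }
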
data/lib/anemone/tentacle.rb
ADDED
@@ -0,0 +1,31 @@
require 'anemone/page'

module Anemone
  class Tentacle

    #
    # Create a new Tentacle
    #
    def initialize(link_queue, page_queue)
      @link_queue = link_queue
      @page_queue = page_queue
    end

    #
    # Gets links from @link_queue, and returns the fetched
    # Page objects into @page_queue
    #
    def run
      while true do
        link = @link_queue.deq

        break if link == :END

        page = Page.fetch(link)

        @page_queue.enq(page)
      end
    end

  end
end
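
A sketch of the producer/consumer wiring that Core#run sets up around Tentacle: URLs go onto a link queue, fetched Pages come back on a page queue, and :END is the sentinel that stops a worker. Normally Core manages this for you; the URL here is a placeholder.

require 'thread'
require 'anemone'

link_queue = Queue.new
page_queue = Queue.new

worker = Thread.new { Anemone::Tentacle.new(link_queue, page_queue).run }

link_queue.enq(URI('http://www.example.com/'))
link_queue.enq(:END)             # tells Tentacle#run to stop after the URL above

worker.join
puts page_queue.deq.url          # the fetched Anemone::Page
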
metadata
ADDED
@@ -0,0 +1,82 @@
--- !ruby/object:Gem::Specification
name: anemone
version: !ruby/object:Gem::Version
  version: 0.0.1
platform: ruby
authors:
- Chris Kite
autorequire:
bindir: bin
cert_chain: []

date: 2009-04-14 00:00:00 -05:00
default_executable:
dependencies:
- !ruby/object:Gem::Dependency
  name: hpricot
  type: :runtime
  version_requirement:
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: 0.7.0
    version:
description:
email:
executables:
- anemone_count.rb
- anemone_cron.rb
- anemone_pagedepth.rb
- anemone_serialize.rb
- anemone_url_list.rb
extensions: []

extra_rdoc_files:
- README.txt
files:
- bin/anemone_count.rb
- bin/anemone_cron.rb
- bin/anemone_pagedepth.rb
- bin/anemone_serialize.rb
- bin/anemone_url_list.rb
- lib/anemone
- lib/anemone/anemone.rb
- lib/anemone/core.rb
- lib/anemone/http.rb
- lib/anemone/page.rb
- lib/anemone/page_hash.rb
- lib/anemone/tentacle.rb
- lib/anemone.rb
- README.txt
has_rdoc: true
homepage: http://anemone.rubyforge.org
post_install_message:
rdoc_options:
- -m
- README.txt
- -t
- Anemone
require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: "0"
  version:
required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: "0"
  version:
requirements: []

rubyforge_project: anemone
rubygems_version: 1.3.1
signing_key:
specification_version: 2
summary: Anemone web-spider framework
test_files: []