anemone 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.txt +18 -0
- data/bin/anemone_count.rb +31 -0
- data/bin/anemone_cron.rb +99 -0
- data/bin/anemone_pagedepth.rb +39 -0
- data/bin/anemone_serialize.rb +43 -0
- data/bin/anemone_url_list.rb +46 -0
- data/lib/anemone.rb +2 -0
- data/lib/anemone/anemone.rb +16 -0
- data/lib/anemone/core.rb +183 -0
- data/lib/anemone/http.rb +37 -0
- data/lib/anemone/page.rb +165 -0
- data/lib/anemone/page_hash.rb +83 -0
- data/lib/anemone/tentacle.rb +31 -0
- metadata +82 -0
    
        data/README.txt
    ADDED
    
= Anemone

== DESCRIPTION
Anemone is a web spider framework that can spider a domain and collect useful
information about the pages it visits. It is versatile, allowing you to
write your own specialized spider tasks quickly and easily.

== FEATURES
* Multi-threaded design for high performance
* Tracks 301 HTTP redirects to understand a page's aliases
* Built-in BFS algorithm for determining page depth
* Allows exclusion of URLs based on regular expressions

== REQUIREMENTS
* hpricot

== EXAMPLES
See the +bin+ directory for several examples of useful Anemone tasks.
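For orientation, a minimal crawl exercising the features above might look like the following sketch (the URL and skip pattern are illustrative, not part of the gem):

require 'anemone'

# crawl with 8 threads, skipping any link whose path matches /private/
Anemone.crawl('http://example.com/', :threads => 8) do |anemone|
  anemone.skip_links_like %r{/private/}
  anemone.on_every_page { |page| puts "#{page.code} #{page.url}" }
end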
    data/bin/anemone_count.rb
    ADDED

#! /usr/bin/env ruby
# == Synopsis
#   Crawls a site starting at the given URL, and outputs the total number
#   of unique pages on the site.
#
# == Usage
#   anemone_count.rb url
#
# == Author
#   Chris Kite

$:.unshift File.join(File.dirname(__FILE__), "..", "lib")

require 'anemone'
require 'rdoc/usage'

# make sure that the first option is a URL we can crawl
begin
  URI(ARGV[0])
rescue
  RDoc::usage()
  Process.exit
end

Anemone.crawl(ARGV[0]) do |anemone|
  anemone.after_crawl do |pages|
    puts pages.uniq.size
  end
end
        data/bin/anemone_cron.rb
    ADDED
    
#! /usr/bin/env ruby
# == Synopsis
#   Performs pagedepth, url list, and count functionality
#   Meant to be run daily as a cron job
#
# == Usage
#   anemone_cron.rb [options] url
#
# == Options
#   -r, --relative                  Output relative URLs (rather than absolute)
#   -o, --output filename           Filename to save URL list to. Defaults to urls.txt.
#
# == Author
#   Chris Kite

$:.unshift File.join(File.dirname(__FILE__), "..", "lib")

require 'anemone'
require 'optparse'
require 'rdoc/usage'
require 'ostruct'

options = OpenStruct.new
options.relative = false
options.output_file = 'urls.txt'

# make sure that the last option is a URL we can crawl
begin
  URI(ARGV.last)
rescue
  RDoc::usage()
  Process.exit
end

# parse command-line options
opts = OptionParser.new
opts.on('-r', '--relative')        { options.relative = true }
opts.on('-o', '--output filename') {|o| options.output_file = o }
opts.parse!(ARGV)

root = ARGV.last

Anemone.crawl(root) do |anemone|

  anemone.after_crawl do |pages|
    puts "Crawl results for #{root}\n"

    # print a list of 404's
    not_found = []
    pages.each_value do |page|
      url = page.url.to_s
      not_found << url if page.not_found?
    end
    if !not_found.empty?
      puts "\n404's:"
      not_found.each do |url|
        if options.relative
          puts URI(url).path.to_s
        else
          puts url
        end
        num_linked_from = 0
        pages.urls_linking_to(url).each do |u|
          u = u.path if options.relative
          num_linked_from += 1
          puts "  linked from #{u}"
          if num_linked_from > 10
            puts "  ..."
            break
          end
        end
      end

      print "\n"
    end

    # remove redirect aliases, and calculate pagedepths
    pages = pages.shortest_paths!(root).uniq
    depths = pages.values.inject({}) do |depths, page|
      depths[page.depth] ||= 0
      depths[page.depth] += 1
      depths
    end

    # print the page count
    puts "Total pages: #{pages.size}\n"

    # print a list of depths
    depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }

    # output a list of urls to file
    file = open(options.output_file, 'w')
    pages.each_key do |url|
      url = options.relative ? url.path.to_s : url.to_s
      file.puts url
    end

  end
end
    data/bin/anemone_pagedepth.rb
    ADDED

#! /usr/bin/env ruby
# == Synopsis
#   Crawls a site starting at the given URL, and outputs a count of
#   the number of Pages at each depth in the site.
#
# == Usage
#   anemone_pagedepth.rb url
#
# == Author
#   Chris Kite

$:.unshift File.join(File.dirname(__FILE__), "..", "lib")

require 'anemone'
require 'rdoc/usage'

# make sure that the first option is a URL we can crawl
begin
  URI(ARGV[0])
rescue
  RDoc::usage()
  Process.exit
end

root = ARGV[0]
Anemone.crawl(root) do |anemone|
  anemone.skip_links_like %r{^/c/$}, %r{^/stores/$}

  anemone.after_crawl do |pages|
    pages = pages.shortest_paths!(root).uniq
    depths = pages.values.inject({}) do |depths, page|
      depths[page.depth] ||= 0
      depths[page.depth] += 1
      depths
    end

    depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
  end
end
    data/bin/anemone_serialize.rb
    ADDED

#! /usr/bin/env ruby
# == Synopsis
#   Crawls a site starting at the given URL, and saves the resulting
#   PageHash object to a file using Marshal serialization.
#
# == Usage
#   anemone_serialize.rb [options] url
#
# == Options
#   -o, --output filename           Filename to save PageHash to. Defaults to crawl.{Time.now}
#
# == Author
#   Chris Kite

$:.unshift File.join(File.dirname(__FILE__), "..", "lib")

require 'anemone'
require 'optparse'
require 'rdoc/usage'
require 'ostruct'

# make sure that the first option is a URL we can crawl
begin
  URI(ARGV[0])
rescue
  RDoc::usage()
  Process.exit
end

options = OpenStruct.new
options.output_file = "crawl.#{Time.now.to_i}"

# parse command-line options
opts = OptionParser.new
opts.on('-o', '--output filename') {|o| options.output_file = o }
opts.parse!(ARGV)

root = ARGV[0]
Anemone.crawl(root) do |anemone|
  anemone.after_crawl do |pages|
    open(options.output_file, 'w') {|f| Marshal.dump(pages, f)}
  end
end
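No bundled script restores a saved crawl, but the PageHash should load back with Marshal.load; a sketch, assuming a file written by the script above (the filename is hypothetical):

require 'anemone'

# 'crawl.1239000000' is a hypothetical output file from anemone_serialize.rb
pages = open('crawl.1239000000') { |f| Marshal.load(f) }
puts "#{pages.size} pages in saved crawl"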
    data/bin/anemone_url_list.rb
    ADDED

#! /usr/bin/env ruby
# == Synopsis
#   Crawls a site starting at the given URL, and outputs the URL of each page
#   in the domain as they are encountered.
#
# == Usage
#   anemone_url_list.rb [options] url
#
# == Options
#   -r, --relative          Output relative URLs (rather than absolute)
#
# == Author
#   Chris Kite

$:.unshift File.join(File.dirname(__FILE__), "..", "lib")

require 'anemone'
require 'optparse'
require 'rdoc/usage'
require 'ostruct'

options = OpenStruct.new
options.relative = false

# make sure that the last option is a URL we can crawl
begin
  URI(ARGV.last)
rescue
  RDoc::usage()
  Process.exit
end

# parse command-line options
opts = OptionParser.new
opts.on('-r', '--relative') { options.relative = true }
opts.parse!(ARGV)

Anemone.crawl(ARGV.last) do |anemone|
  anemone.on_every_page do |page|
    if options.relative
      puts page.url.path
    else
      puts page.url
    end
  end
end
    
    data/lib/anemone/anemone.rb
    ADDED
    
    
require 'anemone/core'

module Anemone
  # Version number
  VERSION = '0.0.1'

  # User-Agent string used for HTTP requests
  USER_AGENT = "Anemone/#{self::VERSION}"

  #
  # Convenience method to start a crawl using Core
  #
  def Anemone.crawl(url, options = {}, &block)
    Core.crawl(url, options, &block)
  end
end
    
        data/lib/anemone/core.rb
    ADDED
    
require 'net/http'
require 'thread'
require 'anemone/tentacle'
require 'anemone/page_hash'

module Anemone
  class Core
    # PageHash storing all Page objects encountered during the crawl
    attr_reader :pages

    #
    # Initialize the crawl with a starting *url*, *options*, and optional *block*
    #
    def initialize(url, options={}, &block)
      url = URI(url) if url.is_a?(String)
      @url = url
      @options = options
      @tentacles = []
      @pages = PageHash.new
      @on_every_page_blocks = []
      @on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
      @skip_link_patterns = []
      @after_crawl_blocks = []

      @options[:threads] ||= 4
      @options[:verbose] ||= false

      block.call(self) if block
    end

    #
    # Convenience method to start a new crawl
    #
    def self.crawl(root, options={}, &block)
      self.new(root, options) do |core|
        block.call(core) if block
        core.run
        core.do_after_crawl_blocks
        return core
      end
    end

    #
    # Add a block to be executed on the PageHash after the crawl
    # is finished
    #
    def after_crawl(&block)
      @after_crawl_blocks << block
      self
    end

    #
    # Add one or more Regex patterns for URLs which should not be
    # followed
    #
    def skip_links_like(*patterns)
      if patterns
        patterns.each do |pattern|
          @skip_link_patterns << pattern
        end
      end
      self
    end

    #
    # Add a block to be executed on every Page as they are encountered
    # during the crawl
    #
    def on_every_page(&block)
      @on_every_page_blocks << block
      self
    end

    #
    # Add a block to be executed on Page objects with a URL matching
    # one or more patterns
    #
    def on_pages_like(*patterns, &block)
      if patterns
        patterns.each do |pattern|
          @on_pages_like_blocks[pattern] << block
        end
      end
      self
    end

    #
    # Perform the crawl
    #
    def run
      link_queue = Queue.new
      page_queue = Queue.new

      @options[:threads].times do |id|
        @tentacles << Thread.new { Tentacle.new(link_queue, page_queue).run }
      end

      return if !visit_link?(@url)

      link_queue.enq(@url)

      while true do
        page = page_queue.deq

        @pages[page.url] = page

        puts "#{page.url} Queue: #{link_queue.size}" if @options[:verbose]

        do_page_blocks(page)

        page.links.each do |link|
          if visit_link?(link)
            link_queue.enq(link)
            @pages[link] = nil
          end
        end

        page.aliases.each do |aka|
          if !@pages.has_key?(aka) or @pages[aka].nil?
            @pages[aka] = page.alias_clone(aka)
          end
          @pages[aka].add_alias!(page.url)
        end

        # if we are done with the crawl, tell the threads to end
        if link_queue.empty? and page_queue.empty?
          until link_queue.num_waiting == @tentacles.size
            Thread.pass
          end

          if page_queue.empty?
            @tentacles.size.times { |i| link_queue.enq(:END) }
            break
          end
        end

      end

      @tentacles.each { |t| t.join }

      self
    end

    #
    # Execute the after_crawl blocks
    #
    def do_after_crawl_blocks
      @after_crawl_blocks.each {|b| b.call(@pages)}
    end

    #
    # Execute the on_every_page blocks for *page*
    #
    def do_page_blocks(page)
      @on_every_page_blocks.each do |blk|
        blk.call(page)
      end

      # each pattern maps to an Array of blocks
      @on_pages_like_blocks.each do |pattern, blks|
        blks.each { |blk| blk.call(page) } if page.url.to_s =~ pattern
      end
    end

    #
    # Returns +true+ if *link* has not been visited already,
    # and is not excluded by a skip_link pattern. Returns
    # +false+ otherwise.
    #
    def visit_link?(link)
      !@pages.has_key?(link) and !skip_link?(link)
    end

    #
    # Returns +true+ if *link* should not be visited because
    # its URL matches a skip_link pattern.
    #
    def skip_link?(link)
      @skip_link_patterns.each { |p| return true if link.path =~ p }
      return false
    end

  end
end
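None of the bundled scripts uses on_pages_like; a sketch of how it slots into the same block-based API (the URL and pattern are illustrative):

require 'anemone'

Anemone.crawl('http://example.com/', :verbose => true) do |anemone|
  # the block runs only on pages whose URL matches the pattern
  anemone.on_pages_like(%r{/articles/\d+}) do |page|
    puts "article: #{page.url}"
  end
end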
    
        data/lib/anemone/http.rb
    ADDED
    
require 'net/http'

module Anemone
  class HTTP < Net::HTTP
    # Maximum number of redirects to follow on each get_response
    REDIRECTION_LIMIT = 5

    #
    # Retrieve an HTTP response for *url*, following redirects.
    # Returns the response object, response code, and final URI location.
    #
    def self.get(url)
      response = get_response(url)
      code = Integer(response.code)
      loc = url

      limit = REDIRECTION_LIMIT
      while response.is_a?(Net::HTTPRedirection) and limit > 0
          loc = URI(response['location'])
          loc = url.merge(loc) if loc.relative?
          response = get_response(loc)
          limit -= 1
      end

      return response, code, loc
    end

    #
    # Get an HTTPResponse for *url*, sending the appropriate User-Agent string
    #
    def self.get_response(url)
      Net::HTTP.start(url.host, url.port) do |http|
        return http.get(url.path, {'User-Agent' => Anemone::USER_AGENT })
      end
    end
  end
end
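HTTP.get can also be called on its own; it returns the final response, the first response's status code, and the location after any redirects. A sketch, assuming require 'anemone' loads this class and Anemone::USER_AGENT transitively (the URL is illustrative):

require 'anemone'

response, code, location = Anemone::HTTP.get(URI('http://example.com/'))
puts "#{code} -> #{location}"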
    
        data/lib/anemone/page.rb
    ADDED
    
require 'anemone/http'
require 'hpricot'

module Anemone
  class Page
    # The URL of the page
    attr_reader :url
    # Array of distinct A tag HREFs from the page
    attr_reader :links
    # Integer response code of the page
    attr_reader :code

    # Array of redirect-aliases for the page
    attr_accessor :aliases
    # Boolean indicating whether or not this page has been visited in PageHash#shortest_paths!
    attr_accessor :visited
    # Used by PageHash#shortest_paths! to store depth of the page
    attr_accessor :depth

    #
    # Create a new Page from the response of an HTTP request to *url*
    #
    def self.fetch(url)
      begin
        url = URI(url) if url.is_a?(String)

        response, code, location = Anemone::HTTP.get(url)

        aka = nil
        if !url.eql?(location)
          aka = location
        end

        return Page.new(url, response, code, aka)
      rescue
        return Page.new(url)
      end
    end

    #
    # Create a new page
    #
    def initialize(url, response = nil, code = nil, aka = nil)
      @url = url
      @response = response
      @code = code
      @links = []
      @aliases = []

      @aliases << aka if !aka.nil?

      # get a list of distinct links on the page, in absolute url form
      if @response and @response.body
        Hpricot(@response.body).search('a').each do |a|
          u = a['href']
          next if u.nil?

          begin
            u = URI(u)
          rescue
            next
          end

          abs = to_absolute(u)
          @links << abs if in_domain?(abs)
        end

        @links.uniq!
      end
    end


    #
    # Return a new page with the same *response* and *url*, but
    # with a 200 response code
    #
    def alias_clone(url)
      Page.new(url, @response, 200, @url)
    end

    #
    # Add a redirect-alias String *aka* to the list of the page's aliases
    #
    # Returns *self*
    #
    def add_alias!(aka)
      @aliases << aka if !@aliases.include?(aka)
      self
    end

    #
    # Returns an Array of all links from this page, and all the
    # redirect-aliases of those pages, as String objects.
    #
    # *page_hash* is a PageHash object with the results of the current crawl.
    #
    def links_and_their_aliases(page_hash)
      @links.inject([]) do |results, link|
        results.concat([link].concat(page_hash[link].aliases))
      end
    end

    #
    # Returns the response body for the page
    #
    def body
      @response.body
    end

    #
    # Returns the +Content-Type+ header for the page
    #
    def content_type
      @response['Content-Type']
    end

    #
    # Returns +true+ if the page is a HTML document, returns +false+
    # otherwise.
    #
    def html?
      (content_type =~ /text\/html/) == 0
    end

    #
    # Returns +true+ if the page is a HTTP redirect, returns +false+
    # otherwise.
    #
    def redirect?
      (300..399).include?(@code)
    end

    #
    # Returns +true+ if the page was not found (returned 404 code),
    # returns +false+ otherwise.
    #
    def not_found?
      404 == @code
    end

    #
    # Converts relative URL *link* into an absolute URL based on the
    # location of the page
    #
    def to_absolute(link)
      # remove anchor
      link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))

      relative = URI(link)
      absolute = @url.merge(relative)

      absolute.path = '/' if absolute.path.empty?

      return absolute
    end

    #
    # Returns +true+ if *uri* is in the same domain as the page, returns
    # +false+ otherwise
    #
    def in_domain?(uri)
      uri.host == @url.host
    end
  end
end
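A Page can also be built outside of a crawl via Page.fetch; on a failed request it returns a bare Page whose code is nil (the URL is illustrative):

require 'anemone'

page = Anemone::Page.fetch('http://example.com/')
puts page.code          # e.g. 200, or nil if the request failed
puts page.links.size    # distinct same-host links, as absolute URIs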
    data/lib/anemone/page_hash.rb
    ADDED

module Anemone
  class PageHash < Hash

    #
    # Use a breadth-first search to calculate the single-source
    # shortest paths from *root* to all pages in the PageHash
    #
    def shortest_paths!(root)
      root = URI(root) if root.is_a?(String)
      raise "Root node not found" if !has_key?(root)

      each_value {|p| p.visited = false if p}

      q = Queue.new

      q.enq(root)
      self[root].depth = 0
      self[root].visited = true
      while(!q.empty?)
        url = q.deq

        next if !has_key?(url)

        page = self[url]

        page.links.each do |u|
          next if !has_key?(u) or self[u].nil?
          link = self[u]
          aliases = [link].concat(link.aliases.map {|a| self[a] })

          aliases.each do |node|
            if node.depth.nil? or page.depth + 1 < node.depth
              node.depth = page.depth + 1
            end
          end

          q.enq(self[u].url) if !self[u].visited
          self[u].visited = true
        end
      end

      self
    end

    #
    # Returns a new PageHash by removing redirect-aliases for each
    # non-redirect Page
    #
    def uniq
      results = PageHash.new
      each do |url, page|
        # if none of the aliases of this page have been added, and this isn't a redirect page, add this page
        page_added = page.aliases.inject(false) { |r, a| r ||= results.has_key? a }
        if !page.redirect? and !page_added
          results[url] = page.clone
          results[url].aliases = []
        end
      end

      results
    end

    #
    # Return an Array of Page objects which link to the given url
    #
    def pages_linking_to url
      begin
        url = URI(url) if url.is_a?(String)
      rescue
        return []
      end

      values.delete_if { |p| !p.links.include?(url) }
    end

    #
    # Return an Array of URI objects of Pages linking to the given url
    #
    def urls_linking_to url
      pages_linking_to(url).map{|p| p.url}
    end

  end
end
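Chaining shortest_paths! and uniq, as the bin scripts do, supports queries like "how deep does the site go"; a sketch (the root URL is illustrative; depth can be nil for pages the BFS never reached, hence the compact):

require 'anemone'

root = 'http://example.com/'
Anemone.crawl(root) do |anemone|
  anemone.after_crawl do |pages|
    pages = pages.shortest_paths!(root).uniq
    deepest = pages.values.map { |p| p.depth }.compact.max
    puts "Deepest page is #{deepest} links from #{root}"
  end
end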
    data/lib/anemone/tentacle.rb
    ADDED

require 'anemone/page'

module Anemone
  class Tentacle

    #
    # Create a new Tentacle
    #
    def initialize(link_queue, page_queue)
      @link_queue = link_queue
      @page_queue = page_queue
    end

    #
    # Gets links from @link_queue, and returns the fetched
    # Page objects into @page_queue
    #
    def run
      while true do
        link = @link_queue.deq

        break if link == :END

        page = Page.fetch(link)

        @page_queue.enq(page)
      end
    end

  end
end
    
        metadata
    ADDED
    
--- !ruby/object:Gem::Specification
name: anemone
version: !ruby/object:Gem::Version
  version: 0.0.1
platform: ruby
authors:
- Chris Kite
autorequire:
bindir: bin
cert_chain: []

date: 2009-04-14 00:00:00 -05:00
default_executable:
dependencies:
- !ruby/object:Gem::Dependency
  name: hpricot
  type: :runtime
  version_requirement:
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: 0.7.0
    version:
description:
email:
executables:
- anemone_count.rb
- anemone_cron.rb
- anemone_pagedepth.rb
- anemone_serialize.rb
- anemone_url_list.rb
extensions: []

extra_rdoc_files:
- README.txt
files:
- bin/anemone_count.rb
- bin/anemone_cron.rb
- bin/anemone_pagedepth.rb
- bin/anemone_serialize.rb
- bin/anemone_url_list.rb
- lib/anemone
- lib/anemone/anemone.rb
- lib/anemone/core.rb
- lib/anemone/http.rb
- lib/anemone/page.rb
- lib/anemone/page_hash.rb
- lib/anemone/tentacle.rb
- lib/anemone.rb
- README.txt
has_rdoc: true
homepage: http://anemone.rubyforge.org
post_install_message:
rdoc_options:
- -m
- README.txt
- -t
- Anemone
require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: "0"
  version:
required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: "0"
  version:
requirements: []

rubyforge_project: anemone
rubygems_version: 1.3.1
signing_key:
specification_version: 2
summary: Anemone web-spider framework
test_files: []