parolkar-anemone 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE.txt +19 -0
 - data/README.rdoc +19 -0
 - data/bin/anemone_count.rb +36 -0
 - data/bin/anemone_cron.rb +106 -0
 - data/bin/anemone_pagedepth.rb +44 -0
 - data/bin/anemone_serialize.rb +51 -0
 - data/bin/anemone_url_list.rb +54 -0
 - data/lib/anemone.rb +2 -0
 - data/lib/anemone/anemone.rb +56 -0
 - data/lib/anemone/core.rb +209 -0
 - data/lib/anemone/http.rb +38 -0
 - data/lib/anemone/page.rb +177 -0
 - data/lib/anemone/page_hash.rb +116 -0
 - data/lib/anemone/tentacle.rb +33 -0
 - data/spec/anemone_spec.rb +41 -0
 - data/spec/core_spec.rb +128 -0
 - data/spec/fakeweb_helper.rb +55 -0
 - data/spec/page_spec.rb +49 -0
 - data/spec/spec_helper.rb +7 -0
 - metadata +86 -0
 
    
        data/LICENSE.txt
    ADDED
    
    | 
         @@ -0,0 +1,19 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            Copyright (c) 2009 Vertive, Inc.
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            Permission is hereby granted, free of charge, to any person obtaining a copy
         
     | 
| 
      
 4 
     | 
    
         
            +
            of this software and associated documentation files (the "Software"), to deal
         
     | 
| 
      
 5 
     | 
    
         
            +
            in the Software without restriction, including without limitation the rights
         
     | 
| 
      
 6 
     | 
    
         
            +
            to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
         
     | 
| 
      
 7 
     | 
    
         
            +
            copies of the Software, and to permit persons to whom the Software is
         
     | 
| 
      
 8 
     | 
    
         
            +
            furnished to do so, subject to the following conditions:
         
     | 
| 
      
 9 
     | 
    
         
            +
             
     | 
| 
      
 10 
     | 
    
         
            +
            The above copyright notice and this permission notice shall be included in
         
     | 
| 
      
 11 
     | 
    
         
            +
            all copies or substantial portions of the Software.
         
     | 
| 
      
 12 
     | 
    
         
            +
             
     | 
| 
      
 13 
     | 
    
         
            +
            THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
         
     | 
| 
      
 14 
     | 
    
         
            +
            IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
         
     | 
| 
      
 15 
     | 
    
         
            +
            FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
         
     | 
| 
      
 16 
     | 
    
         
            +
            AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
         
     | 
| 
      
 17 
     | 
    
         
            +
            LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
         
     | 
| 
      
 18 
     | 
    
         
            +
            OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
         
     | 
| 
      
 19 
     | 
    
         
            +
            THE SOFTWARE.
         
     | 
    
        data/README.rdoc
    ADDED
    
    | 
         @@ -0,0 +1,19 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            = Anemone
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            == DESCRIPTION
         
     | 
| 
      
 4 
     | 
    
         
            +
            Anemone is a web spider framework that can spider a domain and collect useful
         
     | 
| 
      
 5 
     | 
    
         
            +
            information about the pages it visits. It is versatile, allowing you to
         
     | 
| 
      
 6 
     | 
    
         
            +
            write your own specialized spider tasks quickly and easily.
         
     | 
| 
      
 7 
     | 
    
         
            +
             
     | 
| 
      
 8 
     | 
    
         
            +
            == FEATURES
         
     | 
| 
      
 9 
     | 
    
         
            +
            * Multi-threaded design for high performance
         
     | 
| 
      
 10 
     | 
    
         
            +
            * Tracks 301 HTTP redirects to understand a page's aliases
         
     | 
| 
      
 11 
     | 
    
         
            +
            * Built-in BFS algorithm for determining page depth
         
     | 
| 
      
 12 
     | 
    
         
            +
            * Allows exclusion of URLs based on regular expressions 
         
     | 
| 
      
 13 
     | 
    
         
            +
            * Can crawl obeying robots.txt
         
     | 
| 
      
 14 
     | 
    
         
            +
             
     | 
| 
      
 15 
     | 
    
         
            +
            == REQUIREMENTS
         
     | 
| 
      
 16 
     | 
    
         
            +
            * nokogiri
         
     | 
| 
      
 17 
     | 
    
         
            +
             
     | 
| 
      
 18 
     | 
    
         
            +
            == EXAMPLES
         
     | 
| 
      
 19 
     | 
    
         
            +
            See the +bin+ directory for several examples of useful Anemone tasks.
         
     | 
| 
         @@ -0,0 +1,36 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            #! /usr/bin/env ruby
         
     | 
| 
      
 2 
     | 
    
         
            +
            # == Synopsis
         
     | 
| 
      
 3 
     | 
    
         
            +
            #   Crawls a site starting at the given URL, and outputs the total number
         
     | 
| 
      
 4 
     | 
    
         
            +
            #   of unique pages on the site.
         
     | 
| 
      
 5 
     | 
    
         
            +
            #
         
     | 
| 
      
 6 
     | 
    
         
            +
            # == Usage
         
     | 
| 
      
 7 
     | 
    
         
            +
            #   anemone_count.rb url
         
     | 
| 
      
 8 
     | 
    
         
            +
            #
         
     | 
| 
      
 9 
     | 
    
         
            +
            # == Author
         
     | 
| 
      
 10 
     | 
    
         
            +
            #   Chris Kite
         
     | 
| 
      
 11 
     | 
    
         
            +
             
     | 
| 
      
 12 
     | 
    
         
            +
            $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
         
     | 
| 
      
 13 
     | 
    
         
            +
             
     | 
| 
      
 14 
     | 
    
         
            +
            require 'anemone'
         
     | 
| 
      
 15 
     | 
    
         
            +
             
     | 
| 
      
 16 
     | 
    
         
            +
            def usage
         
     | 
| 
      
 17 
     | 
    
         
            +
              puts <<END
         
     | 
| 
      
 18 
     | 
    
         
            +
            Usage: anemone_count.rb url
         
     | 
| 
      
 19 
     | 
    
         
            +
            END
         
     | 
| 
      
 20 
     | 
    
         
            +
            end
         
     | 
| 
      
 21 
     | 
    
         
            +
             
     | 
| 
      
 22 
     | 
    
         
            +
            # make sure that the first option is a URL we can crawl
         
     | 
| 
      
 23 
     | 
    
         
            +
            begin
         
     | 
| 
      
 24 
     | 
    
         
            +
              URI(ARGV[0])
         
     | 
| 
      
 25 
     | 
    
         
            +
            rescue
         
     | 
| 
      
 26 
     | 
    
         
            +
              usage
         
     | 
| 
      
 27 
     | 
    
         
            +
              Process.exit 
         
     | 
| 
      
 28 
     | 
    
         
            +
            end
         
     | 
| 
      
 29 
     | 
    
         
            +
             
     | 
| 
      
 30 
     | 
    
         
            +
            Anemone.crawl(ARGV[0]) do |anemone|
         
     | 
| 
      
 31 
     | 
    
         
            +
              anemone.after_crawl do |pages|
         
     | 
| 
      
 32 
     | 
    
         
            +
                puts pages.uniq.size
         
     | 
| 
      
 33 
     | 
    
         
            +
              end
         
     | 
| 
      
 34 
     | 
    
         
            +
            end
         
     | 
| 
      
 35 
     | 
    
         
            +
             
     | 
| 
      
 36 
     | 
    
         
            +
             
     | 
    
        data/bin/anemone_cron.rb
    ADDED
    
    | 
         @@ -0,0 +1,106 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            #! /usr/bin/env ruby
         
     | 
| 
      
 2 
     | 
    
         
            +
            # == Synopsis
         
     | 
| 
      
 3 
     | 
    
         
            +
            #   Performs pagedepth, url list, and count functionality
         
     | 
| 
      
 4 
     | 
    
         
            +
            #   Meant to be run daily as a cron job
         
     | 
| 
      
 5 
     | 
    
         
            +
            #
         
     | 
| 
      
 6 
     | 
    
         
            +
            # == Usage
         
     | 
| 
      
 7 
     | 
    
         
            +
            #   anemone_url_list.rb [options] url
         
     | 
| 
      
 8 
     | 
    
         
            +
            #
         
     | 
| 
      
 9 
     | 
    
         
            +
            # == Options
         
     | 
| 
      
 10 
     | 
    
         
            +
            #   -r, --relative                  Output relative URLs (rather than absolute)
         
     | 
| 
      
 11 
     | 
    
         
            +
            #   -o, --output filename           Filename to save URL list to. Defaults to urls.txt.
         
     | 
| 
      
 12 
     | 
    
         
            +
            #
         
     | 
| 
      
 13 
     | 
    
         
            +
            # == Author
         
     | 
| 
      
 14 
     | 
    
         
            +
            #   Chris Kite
         
     | 
| 
      
 15 
     | 
    
         
            +
             
     | 
| 
      
 16 
     | 
    
         
            +
            $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
         
     | 
| 
      
 17 
     | 
    
         
            +
             
     | 
| 
      
 18 
     | 
    
         
            +
            require 'anemone'
         
     | 
| 
      
 19 
     | 
    
         
            +
            require 'optparse'
         
     | 
| 
      
 20 
     | 
    
         
            +
            require 'ostruct'
         
     | 
| 
      
 21 
     | 
    
         
            +
             
     | 
| 
      
 22 
     | 
    
         
            +
            def usage
         
     | 
| 
      
 23 
     | 
    
         
            +
              puts <<END
         
     | 
| 
      
 24 
     | 
    
         
            +
            Usage: anemone_url_list.rb [options] url
         
     | 
| 
      
 25 
     | 
    
         
            +
             
     | 
| 
      
 26 
     | 
    
         
            +
            Options:
         
     | 
| 
      
 27 
     | 
    
         
            +
              -r, --relative           Output relative URLs (rather than absolute)
         
     | 
| 
      
 28 
     | 
    
         
            +
              -o, --output filename    Filename to save URL list to. Defautls to urls.txt.
         
     | 
| 
      
 29 
     | 
    
         
            +
            END
         
     | 
| 
      
 30 
     | 
    
         
            +
            end
         
     | 
| 
      
 31 
     | 
    
         
            +
             
     | 
| 
      
 32 
     | 
    
         
            +
            options = OpenStruct.new
         
     | 
| 
      
 33 
     | 
    
         
            +
            options.relative = false
         
     | 
| 
      
 34 
     | 
    
         
            +
            options.output_file = 'urls.txt'
         
     | 
| 
      
 35 
     | 
    
         
            +
             
     | 
| 
      
 36 
     | 
    
         
            +
            # make sure that the last option is a URL we can crawl
         
     | 
| 
      
 37 
     | 
    
         
            +
            begin
         
     | 
| 
      
 38 
     | 
    
         
            +
              URI(ARGV.last)
         
     | 
| 
      
 39 
     | 
    
         
            +
            rescue
         
     | 
| 
      
 40 
     | 
    
         
            +
              usage
         
     | 
| 
      
 41 
     | 
    
         
            +
              Process.exit 
         
     | 
| 
      
 42 
     | 
    
         
            +
            end
         
     | 
| 
      
 43 
     | 
    
         
            +
             
     | 
| 
      
 44 
     | 
    
         
            +
            # parse command-line options
         
     | 
| 
      
 45 
     | 
    
         
            +
            opts = OptionParser.new
         
     | 
| 
      
 46 
     | 
    
         
            +
            opts.on('-r', '--relative')        { options.relative = true }
         
     | 
| 
      
 47 
     | 
    
         
            +
            opts.on('-o', '--output filename') {|o| options.output_file = o }
         
     | 
| 
      
 48 
     | 
    
         
            +
            opts.parse!(ARGV)
         
     | 
| 
      
 49 
     | 
    
         
            +
             
     | 
| 
      
 50 
     | 
    
         
            +
            root = ARGV.last
         
     | 
| 
      
 51 
     | 
    
         
            +
             
     | 
| 
      
 52 
     | 
    
         
            +
            Anemone.crawl(root, {:discard_page_bodies => true}) do |anemone|  
         
     | 
| 
      
 53 
     | 
    
         
            +
              
         
     | 
| 
      
 54 
     | 
    
         
            +
              anemone.after_crawl do |pages|
         
     | 
| 
      
 55 
     | 
    
         
            +
                puts "Crawl results for #{root}\n"
         
     | 
| 
      
 56 
     | 
    
         
            +
                
         
     | 
| 
      
 57 
     | 
    
         
            +
                # print a list of 404's
         
     | 
| 
      
 58 
     | 
    
         
            +
                not_found = []
         
     | 
| 
      
 59 
     | 
    
         
            +
                pages.each_value do |page|
         
     | 
| 
      
 60 
     | 
    
         
            +
                  url = page.url.to_s
         
     | 
| 
      
 61 
     | 
    
         
            +
                  not_found << url if page.not_found?
         
     | 
| 
      
 62 
     | 
    
         
            +
                end
         
     | 
| 
      
 63 
     | 
    
         
            +
                unless not_found.empty?
         
     | 
| 
      
 64 
     | 
    
         
            +
                  puts "\n404's:"
         
     | 
| 
      
 65 
     | 
    
         
            +
             
     | 
| 
      
 66 
     | 
    
         
            +
                  missing_links = pages.urls_linking_to(not_found)
         
     | 
| 
      
 67 
     | 
    
         
            +
                  missing_links.each do |url, links|
         
     | 
| 
      
 68 
     | 
    
         
            +
                    if options.relative
         
     | 
| 
      
 69 
     | 
    
         
            +
                      puts URI(url).path.to_s
         
     | 
| 
      
 70 
     | 
    
         
            +
                    else
         
     | 
| 
      
 71 
     | 
    
         
            +
                      puts url
         
     | 
| 
      
 72 
     | 
    
         
            +
                    end
         
     | 
| 
      
 73 
     | 
    
         
            +
                    links.slice(0..10).each do |u|
         
     | 
| 
      
 74 
     | 
    
         
            +
                      u = u.path if options.relative
         
     | 
| 
      
 75 
     | 
    
         
            +
                      puts "  linked from #{u}"
         
     | 
| 
      
 76 
     | 
    
         
            +
                    end
         
     | 
| 
      
 77 
     | 
    
         
            +
                    
         
     | 
| 
      
 78 
     | 
    
         
            +
                    puts " ..." if links.size > 10
         
     | 
| 
      
 79 
     | 
    
         
            +
                  end
         
     | 
| 
      
 80 
     | 
    
         
            +
             
     | 
| 
      
 81 
     | 
    
         
            +
                  print "\n"
         
     | 
| 
      
 82 
     | 
    
         
            +
                end  
         
     | 
| 
      
 83 
     | 
    
         
            +
                
         
     | 
| 
      
 84 
     | 
    
         
            +
                # remove redirect aliases, and calculate pagedepths
         
     | 
| 
      
 85 
     | 
    
         
            +
                pages = pages.shortest_paths!(root).uniq
         
     | 
| 
      
 86 
     | 
    
         
            +
                depths = pages.values.inject({}) do |depths, page|
         
     | 
| 
      
 87 
     | 
    
         
            +
                  depths[page.depth] ||= 0
         
     | 
| 
      
 88 
     | 
    
         
            +
                  depths[page.depth] += 1
         
     | 
| 
      
 89 
     | 
    
         
            +
                  depths
         
     | 
| 
      
 90 
     | 
    
         
            +
                end
         
     | 
| 
      
 91 
     | 
    
         
            +
                
         
     | 
| 
      
 92 
     | 
    
         
            +
                # print the page count
         
     | 
| 
      
 93 
     | 
    
         
            +
                puts "Total pages: #{pages.size}\n"
         
     | 
| 
      
 94 
     | 
    
         
            +
                
         
     | 
| 
      
 95 
     | 
    
         
            +
                # print a list of depths
         
     | 
| 
      
 96 
     | 
    
         
            +
                depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
         
     | 
| 
      
 97 
     | 
    
         
            +
                
         
     | 
| 
      
 98 
     | 
    
         
            +
                # output a list of urls to file
         
     | 
| 
      
 99 
     | 
    
         
            +
                file = open(options.output_file, 'w')
         
     | 
| 
      
 100 
     | 
    
         
            +
                pages.each_key do |url|
         
     | 
| 
      
 101 
     | 
    
         
            +
                  url = options.relative ? url.path.to_s : url.to_s
         
     | 
| 
      
 102 
     | 
    
         
            +
                  file.puts url
         
     | 
| 
      
 103 
     | 
    
         
            +
                end
         
     | 
| 
      
 104 
     | 
    
         
            +
                
         
     | 
| 
      
 105 
     | 
    
         
            +
              end
         
     | 
| 
      
 106 
     | 
    
         
            +
            end
         
     | 
| 
         @@ -0,0 +1,44 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            #! /usr/bin/env ruby
         
     | 
| 
      
 2 
     | 
    
         
            +
            # == Synopsis
         
     | 
| 
      
 3 
     | 
    
         
            +
            #   Crawls a site starting at the given URL, and outputs a count of
         
     | 
| 
      
 4 
     | 
    
         
            +
            #   the number of Pages at each depth in the site.
         
     | 
| 
      
 5 
     | 
    
         
            +
            #
         
     | 
| 
      
 6 
     | 
    
         
            +
            # == Usage
         
     | 
| 
      
 7 
     | 
    
         
            +
            #   anemone_pagedepth.rb url
         
     | 
| 
      
 8 
     | 
    
         
            +
            #
         
     | 
| 
      
 9 
     | 
    
         
            +
            # == Author
         
     | 
| 
      
 10 
     | 
    
         
            +
            #   Chris Kite
         
     | 
| 
      
 11 
     | 
    
         
            +
             
     | 
| 
      
 12 
     | 
    
         
            +
            $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
         
     | 
| 
      
 13 
     | 
    
         
            +
             
     | 
| 
      
 14 
     | 
    
         
            +
            require 'anemone'
         
     | 
| 
      
 15 
     | 
    
         
            +
             
     | 
| 
      
 16 
     | 
    
         
            +
            def usage
         
     | 
| 
      
 17 
     | 
    
         
            +
              puts <<END
         
     | 
| 
      
 18 
     | 
    
         
            +
            Usage: anemone_pagedepth.rb url
         
     | 
| 
      
 19 
     | 
    
         
            +
            END
         
     | 
| 
      
 20 
     | 
    
         
            +
            end
         
     | 
| 
      
 21 
     | 
    
         
            +
             
     | 
| 
      
 22 
     | 
    
         
            +
            # make sure that the first option is a URL we can crawl
         
     | 
| 
      
 23 
     | 
    
         
            +
            begin
         
     | 
| 
      
 24 
     | 
    
         
            +
              URI(ARGV[0])
         
     | 
| 
      
 25 
     | 
    
         
            +
            rescue
         
     | 
| 
      
 26 
     | 
    
         
            +
              usage
         
     | 
| 
      
 27 
     | 
    
         
            +
              Process.exit 
         
     | 
| 
      
 28 
     | 
    
         
            +
            end
         
     | 
| 
      
 29 
     | 
    
         
            +
             
     | 
| 
      
 30 
     | 
    
         
            +
            root = ARGV[0]
         
     | 
| 
      
 31 
     | 
    
         
            +
            Anemone.crawl(root) do |anemone|
         
     | 
| 
      
 32 
     | 
    
         
            +
              anemone.skip_links_like %r{^/c/$}, %r{^/stores/$}
         
     | 
| 
      
 33 
     | 
    
         
            +
              
         
     | 
| 
      
 34 
     | 
    
         
            +
              anemone.after_crawl do |pages|
         
     | 
| 
      
 35 
     | 
    
         
            +
                pages = pages.shortest_paths!(root).uniq
         
     | 
| 
      
 36 
     | 
    
         
            +
                depths = pages.values.inject({}) do |depths, page|
         
     | 
| 
      
 37 
     | 
    
         
            +
                  depths[page.depth] ||= 0
         
     | 
| 
      
 38 
     | 
    
         
            +
                  depths[page.depth] += 1
         
     | 
| 
      
 39 
     | 
    
         
            +
                  depths
         
     | 
| 
      
 40 
     | 
    
         
            +
                end
         
     | 
| 
      
 41 
     | 
    
         
            +
                
         
     | 
| 
      
 42 
     | 
    
         
            +
                depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
         
     | 
| 
      
 43 
     | 
    
         
            +
              end
         
     | 
| 
      
 44 
     | 
    
         
            +
            end
         
     | 
| 
         @@ -0,0 +1,51 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            #! /usr/bin/env ruby
         
     | 
| 
      
 2 
     | 
    
         
            +
            # == Synopsis
         
     | 
| 
      
 3 
     | 
    
         
            +
            #   Crawls a site starting at the given URL, and saves the resulting
         
     | 
| 
      
 4 
     | 
    
         
            +
            #   PageHash object to a file using Marshal serialization.
         
     | 
| 
      
 5 
     | 
    
         
            +
            #
         
     | 
| 
      
 6 
     | 
    
         
            +
            # == Usage
         
     | 
| 
      
 7 
     | 
    
         
            +
            #   anemone_serialize.rb [options] url
         
     | 
| 
      
 8 
     | 
    
         
            +
            #
         
     | 
| 
      
 9 
     | 
    
         
            +
            # == Options
         
     | 
| 
      
 10 
     | 
    
         
            +
            #   -o, --output filename           Filename to save PageHash to. Defaults to crawl.{Time.now}
         
     | 
| 
      
 11 
     | 
    
         
            +
            #
         
     | 
| 
      
 12 
     | 
    
         
            +
            # == Author
         
     | 
| 
      
 13 
     | 
    
         
            +
            #   Chris Kite
         
     | 
| 
      
 14 
     | 
    
         
            +
             
     | 
| 
      
 15 
     | 
    
         
            +
            $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
         
     | 
| 
      
 16 
     | 
    
         
            +
             
     | 
| 
      
 17 
     | 
    
         
            +
            require 'anemone'
         
     | 
| 
      
 18 
     | 
    
         
            +
            require 'optparse'
         
     | 
| 
      
 19 
     | 
    
         
            +
            require 'ostruct'
         
     | 
| 
      
 20 
     | 
    
         
            +
             
     | 
| 
      
 21 
     | 
    
         
            +
            def usage
         
     | 
| 
      
 22 
     | 
    
         
            +
              puts <<END
         
     | 
| 
      
 23 
     | 
    
         
            +
            Usage: anemone_serialize.rb [options] url
         
     | 
| 
      
 24 
     | 
    
         
            +
             
     | 
| 
      
 25 
     | 
    
         
            +
            Options:
         
     | 
| 
      
 26 
     | 
    
         
            +
              -o, --output filename      Filename to save PageHash to. Defaults to crawl.{Time.now}
         
     | 
| 
      
 27 
     | 
    
         
            +
            END
         
     | 
| 
      
 28 
     | 
    
         
            +
            end
         
     | 
| 
      
 29 
     | 
    
         
            +
             
     | 
| 
      
 30 
     | 
    
         
            +
            # make sure that the first option is a URL we can crawl
         
     | 
| 
      
 31 
     | 
    
         
            +
            begin
         
     | 
| 
      
 32 
     | 
    
         
            +
              URI(ARGV[0])
         
     | 
| 
      
 33 
     | 
    
         
            +
            rescue
         
     | 
| 
      
 34 
     | 
    
         
            +
              usage
         
     | 
| 
      
 35 
     | 
    
         
            +
              Process.exit 
         
     | 
| 
      
 36 
     | 
    
         
            +
            end
         
     | 
| 
      
 37 
     | 
    
         
            +
             
     | 
| 
      
 38 
     | 
    
         
            +
            options = OpenStruct.new
         
     | 
| 
      
 39 
     | 
    
         
            +
            options.output_file = "crawl.#{Time.now.to_i}"
         
     | 
| 
      
 40 
     | 
    
         
            +
             
     | 
| 
      
 41 
     | 
    
         
            +
            # parse command-line options
         
     | 
| 
      
 42 
     | 
    
         
            +
            opts = OptionParser.new
         
     | 
| 
      
 43 
     | 
    
         
            +
            opts.on('-o', '--output filename') {|o| options.output_file = o }
         
     | 
| 
      
 44 
     | 
    
         
            +
            opts.parse!(ARGV)
         
     | 
| 
      
 45 
     | 
    
         
            +
             
     | 
| 
      
 46 
     | 
    
         
            +
            root = ARGV[0]
         
     | 
| 
      
 47 
     | 
    
         
            +
            Anemone.crawl(root) do |anemone|
         
     | 
| 
      
 48 
     | 
    
         
            +
              anemone.after_crawl do |pages|
         
     | 
| 
      
 49 
     | 
    
         
            +
                open(options.output_file, 'w') {|f| Marshal.dump(pages, f)}
         
     | 
| 
      
 50 
     | 
    
         
            +
              end
         
     | 
| 
      
 51 
     | 
    
         
            +
            end
         
     | 
| 
         @@ -0,0 +1,54 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            #! /usr/bin/env ruby
         
     | 
| 
      
 2 
     | 
    
         
            +
            # == Synopsis
         
     | 
| 
      
 3 
     | 
    
         
            +
            #   Crawls a site starting at the given URL, and outputs the URL of each page
         
     | 
| 
      
 4 
     | 
    
         
            +
            #   in the domain as they are encountered.
         
     | 
| 
      
 5 
     | 
    
         
            +
            #
         
     | 
| 
      
 6 
     | 
    
         
            +
            # == Usage
         
     | 
| 
      
 7 
     | 
    
         
            +
            #   anemone_url_list.rb [options] url
         
     | 
| 
      
 8 
     | 
    
         
            +
            #
         
     | 
| 
      
 9 
     | 
    
         
            +
            # == Options
         
     | 
| 
      
 10 
     | 
    
         
            +
            #   -r, --relative          Output relative URLs (rather than absolute)
         
     | 
| 
      
 11 
     | 
    
         
            +
            #
         
     | 
| 
      
 12 
     | 
    
         
            +
            # == Author
         
     | 
| 
      
 13 
     | 
    
         
            +
            #   Chris Kite
         
     | 
| 
      
 14 
     | 
    
         
            +
             
     | 
| 
      
 15 
     | 
    
         
            +
            $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
         
     | 
| 
      
 16 
     | 
    
         
            +
             
     | 
| 
      
 17 
     | 
    
         
            +
            require 'anemone'
         
     | 
| 
      
 18 
     | 
    
         
            +
            require 'optparse'
         
     | 
| 
      
 19 
     | 
    
         
            +
            require 'ostruct'
         
     | 
| 
      
 20 
     | 
    
         
            +
             
     | 
| 
      
 21 
     | 
    
         
            +
            def usage
         
     | 
| 
      
 22 
     | 
    
         
            +
              puts <<END
         
     | 
| 
      
 23 
     | 
    
         
            +
            Usage: anemone_url_list.rb [options] url
         
     | 
| 
      
 24 
     | 
    
         
            +
                
         
     | 
| 
      
 25 
     | 
    
         
            +
            Options:
         
     | 
| 
      
 26 
     | 
    
         
            +
              -r, --relative      Output relative URLs (rather than absolute)
         
     | 
| 
      
 27 
     | 
    
         
            +
            END
         
     | 
| 
      
 28 
     | 
    
         
            +
            end
         
     | 
| 
      
 29 
     | 
    
         
            +
             
     | 
| 
      
 30 
     | 
    
         
            +
            options = OpenStruct.new
         
     | 
| 
      
 31 
     | 
    
         
            +
            options.relative = false
         
     | 
| 
      
 32 
     | 
    
         
            +
             
     | 
| 
      
 33 
     | 
    
         
            +
            # make sure that the last option is a URL we can crawl
         
     | 
| 
      
 34 
     | 
    
         
            +
            begin
         
     | 
| 
      
 35 
     | 
    
         
            +
              URI(ARGV.last)
         
     | 
| 
      
 36 
     | 
    
         
            +
            rescue
         
     | 
| 
      
 37 
     | 
    
         
            +
              usage
         
     | 
| 
      
 38 
     | 
    
         
            +
              Process.exit 
         
     | 
| 
      
 39 
     | 
    
         
            +
            end
         
     | 
| 
      
 40 
     | 
    
         
            +
             
     | 
| 
      
 41 
     | 
    
         
            +
            # parse command-line options
         
     | 
| 
      
 42 
     | 
    
         
            +
            opts = OptionParser.new
         
     | 
| 
      
 43 
     | 
    
         
            +
            opts.on('-r', '--relative') { options.relative = true }
         
     | 
| 
      
 44 
     | 
    
         
            +
            opts.parse!(ARGV)
         
     | 
| 
      
 45 
     | 
    
         
            +
             
     | 
| 
      
 46 
     | 
    
         
            +
            Anemone.crawl(ARGV.last, :discard_page_bodies => true) do |anemone|  
         
     | 
| 
      
 47 
     | 
    
         
            +
              anemone.on_every_page do |page|
         
     | 
| 
      
 48 
     | 
    
         
            +
                if options.relative
         
     | 
| 
      
 49 
     | 
    
         
            +
                  puts page.url.path
         
     | 
| 
      
 50 
     | 
    
         
            +
                else
         
     | 
| 
      
 51 
     | 
    
         
            +
                  puts page.url
         
     | 
| 
      
 52 
     | 
    
         
            +
                end
         
     | 
| 
      
 53 
     | 
    
         
            +
              end
         
     | 
| 
      
 54 
     | 
    
         
            +
            end
         
     | 
    
        data/lib/anemone.rb
    ADDED
    
    
| 
         @@ -0,0 +1,56 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            require 'ostruct'
         
     | 
| 
      
 2 
     | 
    
         
            +
            require 'anemone/core'
         
     | 
| 
      
 3 
     | 
    
         
            +
             
     | 
| 
      
 4 
     | 
    
         
            +
            module Anemone
         
     | 
| 
      
 5 
     | 
    
         
            +
              # Version number
         
     | 
| 
      
 6 
     | 
    
         
            +
              VERSION = '0.1.2'
         
     | 
| 
      
 7 
     | 
    
         
            +
             
     | 
| 
      
 8 
     | 
    
         
            +
              #module-wide options
         
     | 
| 
      
 9 
     | 
    
         
            +
              def Anemone.options=(options)
         
     | 
| 
      
 10 
     | 
    
         
            +
                @options = options
         
     | 
| 
      
 11 
     | 
    
         
            +
              end
         
     | 
| 
      
 12 
     | 
    
         
            +
              
         
     | 
| 
      
 13 
     | 
    
         
            +
              def Anemone.options
         
     | 
| 
      
 14 
     | 
    
         
            +
                @options
         
     | 
| 
      
 15 
     | 
    
         
            +
              end
         
     | 
| 
      
 16 
     | 
    
         
            +
              
         
     | 
| 
      
 17 
     | 
    
         
            +
              #
         
     | 
| 
      
 18 
     | 
    
         
            +
              # Convenience method to start a crawl using Core
         
     | 
| 
      
 19 
     | 
    
         
            +
              #
         
     | 
| 
      
 20 
     | 
    
         
            +
              def Anemone.crawl(urls, options = {}, &block)
         
     | 
| 
      
 21 
     | 
    
         
            +
                Anemone.options = OpenStruct.new(options)
         
     | 
| 
      
 22 
     | 
    
         
            +
            	
         
     | 
| 
      
 23 
     | 
    
         
            +
                #by default, run 4 Tentacle threads to fetch pages
         
     | 
| 
      
 24 
     | 
    
         
            +
                Anemone.options.threads ||= 4
         
     | 
| 
      
 25 
     | 
    
         
            +
            	
         
     | 
| 
      
 26 
     | 
    
         
            +
                #disable verbose output by default
         
     | 
| 
      
 27 
     | 
    
         
            +
                Anemone.options.verbose ||= false
         
     | 
| 
      
 28 
     | 
    
         
            +
            	
         
     | 
| 
      
 29 
     | 
    
         
            +
                #by default, don't throw away the page response body after scanning it for links
         
     | 
| 
      
 30 
     | 
    
         
            +
                Anemone.options.discard_page_bodies ||= false
         
     | 
| 
      
 31 
     | 
    
         
            +
             
     | 
| 
      
 32 
     | 
    
         
            +
                #by default, identify self as Anemone/VERSION
         
     | 
| 
      
 33 
     | 
    
         
            +
                Anemone.options.user_agent ||= "Anemone/#{self::VERSION}" 
         
     | 
| 
      
 34 
     | 
    
         
            +
                
         
     | 
| 
      
 35 
     | 
    
         
            +
                #Obey Robots.txt 
         
     | 
| 
      
 36 
     | 
    
         
            +
                Anemone.options.obey_robots_dot_txt ||= false 
         
     | 
| 
      
 37 
     | 
    
         
            +
                if Anemone.options.obey_robots_dot_txt == true
         
     | 
| 
      
 38 
     | 
    
         
            +
                  begin      
         
     | 
| 
      
 39 
     | 
    
         
            +
                    require 'obey_robots_dot_txt'
         
     | 
| 
      
 40 
     | 
    
         
            +
                  rescue LoadError
         
     | 
| 
      
 41 
     | 
    
         
            +
                    warn "You need the 'obey_robots_dot_txt' gem installed, (you may run sudo gem install parolkar-obey_robots_dot_txt --source http://gems.github.com )"
         
     | 
| 
      
 42 
     | 
    
         
            +
                    exit
         
     | 
| 
      
 43 
     | 
    
         
            +
                  end
         
     | 
| 
      
 44 
     | 
    
         
            +
                end  
         
     | 
| 
      
 45 
     | 
    
         
            +
                
         
     | 
| 
      
 46 
     | 
    
         
            +
                #no delay between requests by default
         
     | 
| 
      
 47 
     | 
    
         
            +
                Anemone.options.delay ||= 0
         
     | 
| 
      
 48 
     | 
    
         
            +
                
         
     | 
| 
      
 49 
     | 
    
         
            +
                #use a single thread if a delay was requested
         
     | 
| 
      
 50 
     | 
    
         
            +
                if(Anemone.options.delay != 0)
         
     | 
| 
      
 51 
     | 
    
         
            +
                  Anemone.options.threads = 1
         
     | 
| 
      
 52 
     | 
    
         
            +
                end
         
     | 
| 
      
 53 
     | 
    
         
            +
                
         
     | 
| 
      
 54 
     | 
    
         
            +
                Core.crawl(urls, &block)
         
     | 
| 
      
 55 
     | 
    
         
            +
              end
         
     | 
| 
      
 56 
     | 
    
         
            +
            end
         
     | 
    
        data/lib/anemone/core.rb
    ADDED
    
    | 
         @@ -0,0 +1,209 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            require 'net/http'
         
     | 
| 
      
 2 
     | 
    
         
            +
            require 'thread'
         
     | 
| 
      
 3 
     | 
    
         
            +
            require 'anemone/tentacle'
         
     | 
| 
      
 4 
     | 
    
         
            +
            require 'anemone/page_hash'
         
     | 
| 
      
 5 
     | 
    
         
            +
             
     | 
| 
      
 6 
     | 
    
         
            +
            module Anemone
         
     | 
| 
      
 7 
     | 
    
         
            +
              class Core
         
     | 
| 
      
 8 
     | 
    
         
            +
                # PageHash storing all Page objects encountered during the crawl
         
     | 
| 
      
 9 
     | 
    
         
            +
                attr_reader :pages
         
     | 
| 
      
 10 
     | 
    
         
            +
                
         
     | 
| 
      
 11 
     | 
    
         
            +
                #
         
     | 
| 
      
 12 
     | 
    
         
            +
                # Initialize the crawl with starting *urls* (single URL or Array of URLs)
         
     | 
| 
      
 13 
     | 
    
         
            +
                # and optional *block*
         
     | 
| 
      
 14 
     | 
    
         
            +
                #
         
     | 
| 
      
 15 
     | 
    
         
            +
                def initialize(urls, &block)
         
     | 
| 
      
 16 
     | 
    
         
            +
                  @urls = [urls].flatten.map{ |url| URI(url) if url.is_a?(String) }
         
     | 
| 
      
 17 
     | 
    
         
            +
                  @urls.each{ |url| url.path = '/' if url.path.empty? }
         
     | 
| 
      
 18 
     | 
    
         
            +
                  
         
     | 
| 
      
 19 
     | 
    
         
            +
                  @tentacles = []
         
     | 
| 
      
 20 
     | 
    
         
            +
                  @pages = PageHash.new
         
     | 
| 
      
 21 
     | 
    
         
            +
                  @on_every_page_blocks = []
         
     | 
| 
      
 22 
     | 
    
         
            +
                  @on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
         
     | 
| 
      
 23 
     | 
    
         
            +
                  @skip_link_patterns = []
         
     | 
| 
      
 24 
     | 
    
         
            +
                  @after_crawl_blocks = []
         
     | 
| 
      
 25 
     | 
    
         
            +
                  
         
     | 
| 
      
 26 
     | 
    
         
            +
                  block.call(self) if block
         
     | 
| 
      
 27 
     | 
    
         
            +
                end
         
     | 
| 
      
 28 
     | 
    
         
            +
                
         
     | 
| 
      
 29 
     | 
    
         
            +
                #
         
     | 
| 
      
 30 
     | 
    
         
            +
                # Convenience method to start a new crawl
         
     | 
| 
      
 31 
     | 
    
         
            +
                #
         
     | 
| 
      
 32 
     | 
    
         
            +
                def self.crawl(root, &block)
         
     | 
| 
      
 33 
     | 
    
         
            +
                  self.new(root) do |core|
         
     | 
| 
      
 34 
     | 
    
         
            +
                    block.call(core) if block
         
     | 
| 
      
 35 
     | 
    
         
            +
                    core.run
         
     | 
| 
      
 36 
     | 
    
         
            +
                    return core
         
     | 
| 
      
 37 
     | 
    
         
            +
                  end
         
     | 
| 
      
 38 
     | 
    
         
            +
                end
         
     | 
| 
      
 39 
     | 
    
         
            +
                
         
     | 
| 
      
 40 
     | 
    
         
            +
                #
         
     | 
| 
      
 41 
     | 
    
         
            +
                # Add a block to be executed on the PageHash after the crawl
         
     | 
| 
      
 42 
     | 
    
         
            +
                # is finished
         
     | 
| 
      
 43 
     | 
    
         
            +
                #
         
     | 
| 
      
 44 
     | 
    
         
            +
                def after_crawl(&block)
         
     | 
| 
      
 45 
     | 
    
         
            +
                  @after_crawl_blocks << block
         
     | 
| 
      
 46 
     | 
    
         
            +
                  self
         
     | 
| 
      
 47 
     | 
    
         
            +
                end
         
     | 
| 
      
 48 
     | 
    
         
            +
                
         
     | 
| 
      
 49 
     | 
    
         
            +
                #
         
     | 
| 
      
 50 
     | 
    
         
            +
                # Add one ore more Regex patterns for URLs which should not be
         
     | 
| 
      
 51 
     | 
    
         
            +
                # followed
         
     | 
| 
      
 52 
     | 
    
         
            +
                #
         
     | 
| 
      
 53 
     | 
    
         
            +
                def skip_links_like(*patterns)
         
     | 
| 
      
 54 
     | 
    
         
            +
                  if patterns
         
     | 
| 
      
 55 
     | 
    
         
            +
                    patterns.each do |pattern|
         
     | 
| 
      
 56 
     | 
    
         
            +
                      @skip_link_patterns << pattern
         
     | 
| 
      
 57 
     | 
    
         
            +
                    end
         
     | 
| 
      
 58 
     | 
    
         
            +
                  end
         
     | 
| 
      
 59 
     | 
    
         
            +
                  self
         
     | 
| 
      
 60 
     | 
    
         
            +
                end
         
     | 
| 
      
 61 
     | 
    
         
            +
                
         
     | 
| 
      
 62 
     | 
    
         
            +
                #
         
     | 
| 
      
 63 
     | 
    
         
            +
                # Add a block to be executed on every Page as they are encountered
         
     | 
| 
      
 64 
     | 
    
         
            +
                # during the crawl
         
     | 
| 
      
 65 
     | 
    
         
            +
                #
         
     | 
| 
      
 66 
     | 
    
         
            +
                def on_every_page(&block)
         
     | 
| 
      
 67 
     | 
    
         
            +
                  @on_every_page_blocks << block
         
     | 
| 
      
 68 
     | 
    
         
            +
                  self
         
     | 
| 
      
 69 
     | 
    
         
            +
                end
         
     | 
| 
      
 70 
     | 
    
         
            +
                
         
     | 
| 
      
 71 
     | 
    
         
            +
                #
         
     | 
| 
      
 72 
     | 
    
         
            +
                # Add a block to be executed on Page objects with a URL matching
         
     | 
| 
      
 73 
     | 
    
         
            +
                # one or more patterns
         
     | 
| 
      
 74 
     | 
    
         
            +
                #
         
     | 
| 
      
 75 
     | 
    
         
            +
                def on_pages_like(*patterns, &block)
         
     | 
| 
      
 76 
     | 
    
         
            +
                  if patterns
         
     | 
| 
      
 77 
     | 
    
         
            +
                    patterns.each do |pattern|
         
     | 
| 
      
 78 
     | 
    
         
            +
                      @on_pages_like_blocks[pattern] << block
         
     | 
| 
      
 79 
     | 
    
         
            +
                    end
         
     | 
| 
      
 80 
     | 
    
         
            +
                  end
         
     | 
| 
      
 81 
     | 
    
         
            +
                  self
         
     | 
| 
      
 82 
     | 
    
         
            +
                end
         
     | 
| 
      
 83 
     | 
    
         
            +
                
         
     | 
| 
      
 84 
     | 
    
         
            +
                #
         
     | 
| 
      
 85 
     | 
    
         
            +
                # Specify a block which will select which links to follow on each page.
         
     | 
| 
      
 86 
     | 
    
         
            +
                # The block should return an Array of URI objects.
         
     | 
| 
      
 87 
     | 
    
         
            +
                #
         
     | 
| 
      
 88 
     | 
    
         
            +
                def focus_crawl(&block)
         
     | 
| 
      
 89 
     | 
    
         
            +
                  @focus_crawl_block = block
         
     | 
| 
      
 90 
     | 
    
         
            +
                  self
         
     | 
| 
      
 91 
     | 
    
         
            +
                end
         
     | 
| 
      
 92 
     | 
    
         
            +
                
         
     | 
| 
      
 93 
     | 
    
         
            +
                #
         
     | 
| 
      
 94 
     | 
    
         
            +
                # Perform the crawl
         
     | 
| 
      
 95 
     | 
    
         
            +
                #
         
     | 
| 
      
 96 
     | 
    
         
            +
                def run
         
     | 
| 
      
 97 
     | 
    
         
            +
                  @urls.delete_if { |url| !visit_link?(url) }
         
     | 
| 
      
 98 
     | 
    
         
            +
                  return if @urls.empty?
         
     | 
| 
      
 99 
     | 
    
         
            +
                  
         
     | 
| 
      
 100 
     | 
    
         
            +
                  link_queue = Queue.new
         
     | 
| 
      
 101 
     | 
    
         
            +
                  page_queue = Queue.new
         
     | 
| 
      
 102 
     | 
    
         
            +
             
     | 
| 
      
 103 
     | 
    
         
            +
                  Anemone.options.threads.times do |id|
         
     | 
| 
      
 104 
     | 
    
         
            +
                    @tentacles << Thread.new { Tentacle.new(link_queue, page_queue).run }
         
     | 
| 
      
 105 
     | 
    
         
            +
                  end
         
     | 
| 
      
 106 
     | 
    
         
            +
                  
         
     | 
| 
      
 107 
     | 
    
         
            +
                  @urls.each{ |url| link_queue.enq(url) }
         
     | 
| 
      
 108 
     | 
    
         
            +
             
     | 
| 
      
 109 
     | 
    
         
            +
                  loop do
         
     | 
| 
      
 110 
     | 
    
         
            +
                    page = page_queue.deq
         
     | 
| 
      
 111 
     | 
    
         
            +
                    
         
     | 
| 
      
 112 
     | 
    
         
            +
                    @pages[page.url] = page
         
     | 
| 
      
 113 
     | 
    
         
            +
                    
         
     | 
| 
      
 114 
     | 
    
         
            +
                    puts "#{page.url} Queue: #{link_queue.size}" if Anemone.options.verbose
         
     | 
| 
      
 115 
     | 
    
         
            +
                    
         
     | 
| 
      
 116 
     | 
    
         
            +
                    #perform the on_every_page blocks for this page
         
     | 
| 
      
 117 
     | 
    
         
            +
                    do_page_blocks(page)
         
     | 
| 
      
 118 
     | 
    
         
            +
             
     | 
| 
      
 119 
     | 
    
         
            +
                    page.doc = nil if Anemone.options.discard_page_bodies
         
     | 
| 
      
 120 
     | 
    
         
            +
                    
         
     | 
| 
      
 121 
     | 
    
         
            +
                    links_to_follow(page).each do |link|
         
     | 
| 
      
 122 
     | 
    
         
            +
                      link_queue.enq(link)
         
     | 
| 
      
 123 
     | 
    
         
            +
                      @pages[link] = nil
         
     | 
| 
      
 124 
     | 
    
         
            +
                    end
         
     | 
| 
      
 125 
     | 
    
         
            +
                    
         
     | 
| 
      
 126 
     | 
    
         
            +
                    #create an entry in the page hash for each alias of this page,
         
     | 
| 
      
 127 
     | 
    
         
            +
                    #i.e. all the pages that redirected to this page
         
     | 
| 
      
 128 
     | 
    
         
            +
                    page.aliases.each do |aka|
         
     | 
| 
      
 129 
     | 
    
         
            +
                      if !@pages.has_key?(aka) or @pages[aka].nil?
         
     | 
| 
      
 130 
     | 
    
         
            +
                        @pages[aka] = page.alias_clone(aka)
         
     | 
| 
      
 131 
     | 
    
         
            +
                      end
         
     | 
| 
      
 132 
     | 
    
         
            +
                      @pages[aka].add_alias!(page.url)
         
     | 
| 
      
 133 
     | 
    
         
            +
                    end
         
     | 
| 
      
 134 
     | 
    
         
            +
                    
         
     | 
| 
      
 135 
     | 
    
         
            +
                    # if we are done with the crawl, tell the threads to end
         
     | 
| 
      
 136 
     | 
    
         
            +
                    if link_queue.empty? and page_queue.empty?
         
     | 
| 
      
 137 
     | 
    
         
            +
                      until link_queue.num_waiting == @tentacles.size
         
     | 
| 
      
 138 
     | 
    
         
            +
                        Thread.pass
         
     | 
| 
      
 139 
     | 
    
         
            +
                      end
         
     | 
| 
      
 140 
     | 
    
         
            +
                      
         
     | 
| 
      
 141 
     | 
    
         
            +
                      if page_queue.empty?
         
     | 
| 
      
 142 
     | 
    
         
            +
                        @tentacles.size.times { |i| link_queue.enq(:END)}
         
     | 
| 
      
 143 
     | 
    
         
            +
                        break
         
     | 
| 
      
 144 
     | 
    
         
            +
                      end
         
     | 
| 
      
 145 
     | 
    
         
            +
                    end
         
     | 
| 
      
 146 
     | 
    
         
            +
                    
         
     | 
| 
      
 147 
     | 
    
         
            +
                  end
         
     | 
| 
      
 148 
     | 
    
         
            +
             
     | 
| 
      
 149 
     | 
    
         
            +
                  @tentacles.each { |t| t.join }
         
     | 
| 
      
 150 
     | 
    
         
            +
             
     | 
| 
      
 151 
     | 
    
         
            +
                  do_after_crawl_blocks()
         
     | 
| 
      
 152 
     | 
    
         
            +
                  
         
     | 
| 
      
 153 
     | 
    
         
            +
                  self
         
     | 
| 
      
 154 
     | 
    
         
            +
                end
         
     | 
| 
      
 155 
     | 
    
         
            +
                
         
     | 
| 
      
 156 
     | 
    
         
            +
                private    
         
     | 
| 
      
 157 
     | 
    
         
            +
                
         
     | 
| 
      
 158 
     | 
    
         
            +
                #
         
     | 
| 
      
 159 
     | 
    
         
            +
                # Execute the after_crawl blocks
         
     | 
| 
      
 160 
     | 
    
         
            +
                #
         
     | 
| 
      
 161 
     | 
    
         
            +
                def do_after_crawl_blocks
         
     | 
| 
      
 162 
     | 
    
         
            +
                  @after_crawl_blocks.each {|b| b.call(@pages)}
         
     | 
| 
      
 163 
     | 
    
         
            +
                end
         
     | 
| 
      
 164 
     | 
    
         
            +
                
         
     | 
| 
      
 165 
     | 
    
         
            +
                #
         
     | 
| 
      
 166 
     | 
    
         
            +
                # Execute the on_every_page blocks for *page*
         
     | 
| 
      
 167 
     | 
    
         
            +
                #
         
     | 
| 
      
 168 
     | 
    
         
            +
                def do_page_blocks(page)
         
     | 
| 
      
 169 
     | 
    
         
            +
                  @on_every_page_blocks.each do |blk|
         
     | 
| 
      
 170 
     | 
    
         
            +
                    blk.call(page)
         
     | 
| 
      
 171 
     | 
    
         
            +
                  end
         
     | 
| 
      
 172 
     | 
    
         
            +
                  
         
     | 
| 
      
 173 
     | 
    
         
            +
                  @on_pages_like_blocks.each do |pattern, blks|
         
     | 
| 
      
 174 
     | 
    
         
            +
                    if page.url.to_s =~ pattern
         
     | 
| 
      
 175 
     | 
    
         
            +
                      blks.each { |blk| blk.call(page) }
         
     | 
| 
      
 176 
     | 
    
         
            +
                    end
         
     | 
| 
      
 177 
     | 
    
         
            +
                  end
         
     | 
| 
      
 178 
     | 
    
         
            +
                end      
         
     | 
| 
      
 179 
     | 
    
         
            +
                
         
     | 
| 
      
 180 
     | 
    
         
            +
                #
         
     | 
| 
      
 181 
     | 
    
         
            +
                # Return an Array of links to follow from the given page.
         
     | 
| 
      
 182 
     | 
    
         
            +
                # Based on whether or not the link has already been crawled,
         
     | 
| 
      
 183 
     | 
    
         
            +
                # and the block given to focus_crawl()
         
     | 
| 
      
 184 
     | 
    
         
            +
                #
         
     | 
| 
      
 185 
     | 
    
         
            +
                def links_to_follow(page)
         
     | 
| 
      
 186 
     | 
    
         
            +
                  links = @focus_crawl_block ? @focus_crawl_block.call(page) : page.links
         
     | 
| 
      
 187 
     | 
    
         
            +
                  links.find_all { |link| visit_link?(link) }
         
     | 
| 
      
 188 
     | 
    
         
            +
                end
         
     | 
| 
      
 189 
     | 
    
         
            +
                
         
     | 
| 
      
 190 
     | 
    
         
            +
                #
         
     | 
| 
      
 191 
     | 
    
         
            +
                # Returns +true+ if *link* has not been visited already,
         
     | 
| 
      
 192 
     | 
    
         
            +
                # and is not excluded by a skip_link pattern. Returns
         
     | 
| 
      
 193 
     | 
    
         
            +
                # +false+ otherwise.
         
     | 
| 
      
 194 
     | 
    
         
            +
                #
         
     | 
| 
      
 195 
     | 
    
         
            +
                def visit_link?(link)
         
     | 
| 
      
 196 
     | 
    
         
            +
                  !@pages.has_key?(link) and !skip_link?(link)
         
     | 
| 
      
 197 
     | 
    
         
            +
                end
         
     | 
| 
      
 198 
     | 
    
         
            +
                
         
     | 
| 
      
 199 
     | 
    
         
            +
                #
         
     | 
| 
      
 200 
     | 
    
         
            +
                # Returns +true+ if *link* should not be visited because
         
     | 
| 
      
 201 
     | 
    
         
            +
                # its URL matches a skip_link pattern.
         
     | 
| 
      
 202 
     | 
    
         
            +
                #
         
     | 
| 
      
 203 
     | 
    
         
            +
                def skip_link?(link)
         
     | 
| 
      
 204 
     | 
    
         
            +
                  @skip_link_patterns.each { |p| return true if link.path =~ p}
         
     | 
| 
      
 205 
     | 
    
         
            +
                  return false
         
     | 
| 
      
 206 
     | 
    
         
            +
                end
         
     | 
| 
      
 207 
     | 
    
         
            +
                
         
     | 
| 
      
 208 
     | 
    
         
            +
              end
         
     | 
| 
      
 209 
     | 
    
         
            +
            end
         
     | 
    
        data/lib/anemone/http.rb
    ADDED
    
    | 
         @@ -0,0 +1,38 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            require 'net/http'
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            module Anemone
         
     | 
| 
      
 4 
     | 
    
         
            +
              class HTTP < Net::HTTP
         
     | 
| 
      
 5 
     | 
    
         
            +
                # Maximum number of redirects to follow on each get_response
         
     | 
| 
      
 6 
     | 
    
         
            +
                REDIRECTION_LIMIT = 5
         
     | 
| 
      
 7 
     | 
    
         
            +
                
         
     | 
| 
      
 8 
     | 
    
         
            +
                #
         
     | 
| 
      
 9 
     | 
    
         
            +
                # Retrieve an HTTP response for *url*, following redirects.
         
     | 
| 
      
 10 
     | 
    
         
            +
                # Returns the response object, response code, and final URI location.
         
     | 
| 
      
 11 
     | 
    
         
            +
                # 
         
     | 
| 
      
 12 
     | 
    
         
            +
                def self.get(url)      
         
     | 
| 
      
 13 
     | 
    
         
            +
                  response = get_response(url)
         
     | 
| 
      
 14 
     | 
    
         
            +
                  code = Integer(response.code)
         
     | 
| 
      
 15 
     | 
    
         
            +
                  loc = url
         
     | 
| 
      
 16 
     | 
    
         
            +
                  
         
     | 
| 
      
 17 
     | 
    
         
            +
                  limit = REDIRECTION_LIMIT
         
     | 
| 
      
 18 
     | 
    
         
            +
                  while response.is_a?(Net::HTTPRedirection) and limit > 0
         
     | 
| 
      
 19 
     | 
    
         
            +
                      loc = URI(response['location'])
         
     | 
| 
      
 20 
     | 
    
         
            +
                      loc = url.merge(loc) if loc.relative?
         
     | 
| 
      
 21 
     | 
    
         
            +
                      response = (Anemone.options.obey_robots_dot_txt ? (Net::HTTP.get_obeying_robots(loc)) : get_response(loc) )
         
     | 
| 
      
 22 
     | 
    
         
            +
                      limit -= 1
         
     | 
| 
      
 23 
     | 
    
         
            +
                  end
         
     | 
| 
      
 24 
     | 
    
         
            +
             
     | 
| 
      
 25 
     | 
    
         
            +
                  return response, code, loc
         
     | 
| 
      
 26 
     | 
    
         
            +
                end
         
     | 
| 
      
 27 
     | 
    
         
            +
                
         
     | 
| 
      
 28 
     | 
    
         
            +
                #
         
     | 
| 
      
 29 
     | 
    
         
            +
                # Get an HTTPResponse for *url*, sending the appropriate User-Agent string
         
     | 
| 
      
 30 
     | 
    
         
            +
                #
         
     | 
| 
      
 31 
     | 
    
         
            +
                def self.get_response(url)
         
     | 
| 
      
 32 
     | 
    
         
            +
                  full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"
         
     | 
| 
      
 33 
     | 
    
         
            +
                  Net::HTTP.start(url.host, url.port) do |http|
         
     | 
| 
      
 34 
     | 
    
         
            +
                    return http.get(full_path, {'User-Agent' => Anemone.options.user_agent })
         
     | 
| 
      
 35 
     | 
    
         
            +
                  end
         
     | 
| 
      
 36 
     | 
    
         
            +
                end
         
     | 
| 
      
 37 
     | 
    
         
            +
              end
         
     | 
| 
      
 38 
     | 
    
         
            +
            end
         
     | 
    
        data/lib/anemone/page.rb
    ADDED
    
    | 
         @@ -0,0 +1,177 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            require 'anemone/http'
         
     | 
| 
      
 2 
     | 
    
         
            +
            require 'nokogiri'
         
     | 
| 
      
 3 
     | 
    
         
            +
            require 'ostruct'
         
     | 
| 
      
 4 
     | 
    
         
            +
             
     | 
| 
      
 5 
     | 
    
         
            +
            module Anemone
         
     | 
| 
      
 6 
     | 
    
         
            +
              class Page
         
     | 
| 
      
 7 
     | 
    
         
            +
             
     | 
| 
      
 8 
     | 
    
         
            +
                # The URL of the page
         
     | 
| 
      
 9 
     | 
    
         
            +
                attr_reader :url
         
     | 
| 
      
 10 
     | 
    
         
            +
                # Array of distinct A tag HREFs from the page
         
     | 
| 
      
 11 
     | 
    
         
            +
                attr_reader :links
         
     | 
| 
      
 12 
     | 
    
         
            +
                # Headers of the HTTP response
         
     | 
| 
      
 13 
     | 
    
         
            +
                attr_reader :headers
         
     | 
| 
      
 14 
     | 
    
         
            +
                
         
     | 
| 
      
 15 
     | 
    
         
            +
                # OpenStruct for user-stored data
         
     | 
| 
      
 16 
     | 
    
         
            +
                attr_accessor :data
         
     | 
| 
      
 17 
     | 
    
         
            +
                # Nokogiri document for the HTML body
         
     | 
| 
      
 18 
     | 
    
         
            +
                attr_accessor :doc
         
     | 
| 
      
 19 
     | 
    
         
            +
                # Integer response code of the page
         
     | 
| 
      
 20 
     | 
    
         
            +
                attr_accessor :code	
         
     | 
| 
      
 21 
     | 
    
         
            +
                # Array of redirect-aliases for the page
         
     | 
| 
      
 22 
     | 
    
         
            +
                attr_accessor :aliases
         
     | 
| 
      
 23 
     | 
    
         
            +
                # Boolean indicating whether or not this page has been visited in PageHash#shortest_paths!
         
     | 
| 
      
 24 
     | 
    
         
            +
                attr_accessor :visited
         
     | 
| 
      
 25 
     | 
    
         
            +
                # Used by PageHash#shortest_paths! to store depth of the page
         
     | 
| 
      
 26 
     | 
    
         
            +
                attr_accessor :depth
         
     | 
| 
      
 27 
     | 
    
         
            +
                
         
     | 
| 
      
 28 
     | 
    
         
            +
                #
         
     | 
| 
      
 29 
     | 
    
         
            +
                # Create a new Page from the response of an HTTP request to *url*
         
     | 
| 
      
 30 
     | 
    
         
            +
                #
         
     | 
| 
      
 31 
     | 
    
         
            +
                def self.fetch(url)
         
     | 
| 
      
 32 
     | 
    
         
            +
                  begin
         
     | 
| 
      
 33 
     | 
    
         
            +
                    url = URI(url) if url.is_a?(String)
         
     | 
| 
      
 34 
     | 
    
         
            +
             
     | 
| 
      
 35 
     | 
    
         
            +
                    response, code, location = Anemone::HTTP.get(url)
         
     | 
| 
      
 36 
     | 
    
         
            +
             
     | 
| 
      
 37 
     | 
    
         
            +
                    aka = nil
         
     | 
| 
      
 38 
     | 
    
         
            +
                    if !url.eql?(location)
         
     | 
| 
      
 39 
     | 
    
         
            +
                      aka = location
         
     | 
| 
      
 40 
     | 
    
         
            +
                    end
         
     | 
| 
      
 41 
     | 
    
         
            +
             
     | 
| 
      
 42 
     | 
    
         
            +
                    return Page.new(url, response.body, code, response.to_hash, aka)
         
     | 
| 
      
 43 
     | 
    
         
            +
                  rescue
         
     | 
| 
      
 44 
     | 
    
         
            +
                    return Page.new(url)
         
     | 
| 
      
 45 
     | 
    
         
            +
                  end
         
     | 
| 
      
 46 
     | 
    
         
            +
                end
         
     | 
| 
      
 47 
     | 
    
         
            +
                
         
     | 
| 
      
 48 
     | 
    
         
            +
                #
         
     | 
| 
      
 49 
     | 
    
         
            +
                # Create a new page
         
     | 
| 
      
 50 
     | 
    
         
            +
                #
         
     | 
| 
      
 51 
     | 
    
         
            +
                def initialize(url, body = nil, code = nil, headers = nil, aka = nil)
         
     | 
| 
      
 52 
     | 
    
         
            +
                  @url = url
         
     | 
| 
      
 53 
     | 
    
         
            +
                  @code = code
         
     | 
| 
      
 54 
     | 
    
         
            +
                  @headers = headers
         
     | 
| 
      
 55 
     | 
    
         
            +
                  @links = []
         
     | 
| 
      
 56 
     | 
    
         
            +
                  @aliases = []
         
     | 
| 
      
 57 
     | 
    
         
            +
                  @data = OpenStruct.new
         
     | 
| 
      
 58 
     | 
    
         
            +
            	  
         
     | 
| 
      
 59 
     | 
    
         
            +
                  @aliases << aka if !aka.nil?
         
     | 
| 
      
 60 
     | 
    
         
            +
             
     | 
| 
      
 61 
     | 
    
         
            +
                  if body
         
     | 
| 
      
 62 
     | 
    
         
            +
                    begin
         
     | 
| 
      
 63 
     | 
    
         
            +
                      @doc = Nokogiri::HTML(body)
         
     | 
| 
      
 64 
     | 
    
         
            +
                    rescue
         
     | 
| 
      
 65 
     | 
    
         
            +
                      return
         
     | 
| 
      
 66 
     | 
    
         
            +
                    end
         
     | 
| 
      
 67 
     | 
    
         
            +
             
     | 
| 
      
 68 
     | 
    
         
            +
                    return if @doc.nil?
         
     | 
| 
      
 69 
     | 
    
         
            +
             
     | 
| 
      
 70 
     | 
    
         
            +
                    #get a list of distinct links on the page, in absolute url form
         
     | 
| 
      
 71 
     | 
    
         
            +
                    @doc.css('a').each do |a| 
         
     | 
| 
      
 72 
     | 
    
         
            +
                      u = a.attributes['href'].content if a.attributes['href']
         
     | 
| 
      
 73 
     | 
    
         
            +
                      next if u.nil?
         
     | 
| 
      
 74 
     | 
    
         
            +
                      
         
     | 
| 
      
 75 
     | 
    
         
            +
                      begin
         
     | 
| 
      
 76 
     | 
    
         
            +
                        abs = to_absolute(URI(u))
         
     | 
| 
      
 77 
     | 
    
         
            +
                      rescue
         
     | 
| 
      
 78 
     | 
    
         
            +
                        next
         
     | 
| 
      
 79 
     | 
    
         
            +
                      end
         
     | 
| 
      
 80 
     | 
    
         
            +
             
     | 
| 
      
 81 
     | 
    
         
            +
                      @links << abs if in_domain?(abs)
         
     | 
| 
      
 82 
     | 
    
         
            +
                    end
         
     | 
| 
      
 83 
     | 
    
         
            +
                    
         
     | 
| 
      
 84 
     | 
    
         
            +
                    @links.uniq!
         
     | 
| 
      
 85 
     | 
    
         
            +
                  end
         
     | 
| 
      
 86 
     | 
    
         
            +
                end
         
     | 
| 
      
 87 
     | 
    
         
            +
                
         
     | 
| 
      
 88 
     | 
    
         
            +
                
         
     | 
| 
      
 89 
     | 
    
         
            +
                #
         
     | 
| 
      
 90 
     | 
    
         
            +
                # Return a new page with the same *response* and *url*, but
         
     | 
| 
      
 91 
     | 
    
         
            +
                # with a 200 response code
         
     | 
| 
      
 92 
     | 
    
         
            +
                #    
         
     | 
| 
      
 93 
     | 
    
         
            +
                def alias_clone(url)
         
     | 
| 
      
 94 
     | 
    
         
            +
                  p = clone
         
     | 
| 
      
 95 
     | 
    
         
            +
            	  p.add_alias!(@aka) if !@aka.nil?
         
     | 
| 
      
 96 
     | 
    
         
            +
            	  p.code = 200
         
     | 
| 
      
 97 
     | 
    
         
            +
            	  p
         
     | 
| 
      
 98 
     | 
    
         
            +
                end
         
     | 
| 
      
 99 
     | 
    
         
            +
             
     | 
| 
      
 100 
     | 
    
         
            +
                #
         
     | 
| 
      
 101 
     | 
    
         
            +
                # Add a redirect-alias String *aka* to the list of the page's aliases
         
     | 
| 
      
 102 
     | 
    
         
            +
                #
         
     | 
| 
      
 103 
     | 
    
         
            +
                # Returns *self*
         
     | 
| 
      
 104 
     | 
    
         
            +
                #
         
     | 
| 
      
 105 
     | 
    
         
            +
                def add_alias!(aka)
         
     | 
| 
      
 106 
     | 
    
         
            +
                  @aliases << aka if !@aliases.include?(aka)
         
     | 
| 
      
 107 
     | 
    
         
            +
                  self
         
     | 
| 
      
 108 
     | 
    
         
            +
                end
         
     | 
| 
      
 109 
     | 
    
         
            +
                
         
     | 
| 
      
 110 
     | 
    
         
            +
                #
         
     | 
| 
      
 111 
     | 
    
         
            +
                # Returns an Array of all links from this page, and all the 
         
     | 
| 
      
 112 
     | 
    
         
            +
                # redirect-aliases of those pages, as String objects.
         
     | 
| 
      
 113 
     | 
    
         
            +
                #
         
     | 
| 
      
 114 
     | 
    
         
            +
                # *page_hash* is a PageHash object with the results of the current crawl.
         
     | 
| 
      
 115 
     | 
    
         
            +
                #
         
     | 
| 
      
 116 
     | 
    
         
            +
                def links_and_their_aliases(page_hash)
         
     | 
| 
      
 117 
     | 
    
         
            +
                  @links.inject([]) do |results, link|
         
     | 
| 
      
 118 
     | 
    
         
            +
                    results.concat([link].concat(page_hash[link].aliases))
         
     | 
| 
      
 119 
     | 
    
         
            +
                  end
         
     | 
| 
      
 120 
     | 
    
         
            +
                end
         
     | 
| 
      
 121 
     | 
    
         
            +
                
         
     | 
| 
      
 122 
     | 
    
         
            +
                #
         
     | 
| 
      
 123 
     | 
    
         
            +
                # The content-type returned by the HTTP request for this page
         
     | 
| 
      
 124 
     | 
    
         
            +
                #
         
     | 
| 
      
 125 
     | 
    
         
            +
                def content_type
         
     | 
| 
      
 126 
     | 
    
         
            +
                  @headers['content-type'][0] rescue nil
         
     | 
| 
      
 127 
     | 
    
         
            +
                end
         
     | 
| 
      
 128 
     | 
    
         
            +
                
         
     | 
| 
      
 129 
     | 
    
         
            +
                #
         
     | 
| 
      
 130 
     | 
    
         
            +
                # Returns +true+ if the page is a HTML document, returns +false+
         
     | 
| 
      
 131 
     | 
    
         
            +
                # otherwise.
         
     | 
| 
      
 132 
     | 
    
         
            +
                #
         
     | 
| 
      
 133 
     | 
    
         
            +
                def html?
         
     | 
| 
      
 134 
     | 
    
         
            +
                  (@content_type =~ /text\/html/) == 0
         
     | 
| 
      
 135 
     | 
    
         
            +
                end
         
     | 
| 
      
 136 
     | 
    
         
            +
                
         
     | 
| 
      
 137 
     | 
    
         
            +
                #
         
     | 
| 
      
 138 
     | 
    
         
            +
                # Returns +true+ if the page is a HTTP redirect, returns +false+
         
     | 
| 
      
 139 
     | 
    
         
            +
                # otherwise.
         
     | 
| 
      
 140 
     | 
    
         
            +
                #    
         
     | 
| 
      
 141 
     | 
    
         
            +
                def redirect?
         
     | 
| 
      
 142 
     | 
    
         
            +
                  (300..399).include?(@code)
         
     | 
| 
      
 143 
     | 
    
         
            +
                end
         
     | 
| 
      
 144 
     | 
    
         
            +
                
         
     | 
| 
      
 145 
     | 
    
         
            +
                #
         
     | 
| 
      
 146 
     | 
    
         
            +
                # Returns +true+ if the page was not found (returned 404 code),
         
     | 
| 
      
 147 
     | 
    
         
            +
                # returns +false+ otherwise.
         
     | 
| 
      
 148 
     | 
    
         
            +
                #
         
     | 
| 
      
 149 
     | 
    
         
            +
                def not_found?
         
     | 
| 
      
 150 
     | 
    
         
            +
                  404 == @code
         
     | 
| 
      
 151 
     | 
    
         
            +
                end
         
     | 
| 
      
 152 
     | 
    
         
            +
                
         
     | 
| 
      
 153 
     | 
    
         
            +
                #
         
     | 
| 
      
 154 
     | 
    
         
            +
                # Converts relative URL *link* into an absolute URL based on the
         
     | 
| 
      
 155 
     | 
    
         
            +
                # location of the page
         
     | 
| 
      
 156 
     | 
    
         
            +
                #
         
     | 
| 
      
 157 
     | 
    
         
            +
                def to_absolute(link)
         
     | 
| 
      
 158 
     | 
    
         
            +
                  # remove anchor
         
     | 
| 
      
 159 
     | 
    
         
            +
                  link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))
         
     | 
| 
      
 160 
     | 
    
         
            +
             
     | 
| 
      
 161 
     | 
    
         
            +
                  relative = URI(link)
         
     | 
| 
      
 162 
     | 
    
         
            +
                  absolute = @url.merge(relative)
         
     | 
| 
      
 163 
     | 
    
         
            +
             
     | 
| 
      
 164 
     | 
    
         
            +
                  absolute.path = '/' if absolute.path.empty?
         
     | 
| 
      
 165 
     | 
    
         
            +
             
     | 
| 
      
 166 
     | 
    
         
            +
                  return absolute
         
     | 
| 
      
 167 
     | 
    
         
            +
                end
         
     | 
| 
      
 168 
     | 
    
         
            +
                
         
     | 
| 
      
 169 
     | 
    
         
            +
                #
         
     | 
| 
      
 170 
     | 
    
         
            +
                # Returns +true+ if *uri* is in the same domain as the page, returns
         
     | 
| 
      
 171 
     | 
    
         
            +
                # +false+ otherwise
         
     | 
| 
      
 172 
     | 
    
         
            +
                #
         
     | 
| 
      
 173 
     | 
    
         
            +
                def in_domain?(uri)
         
     | 
| 
      
 174 
     | 
    
         
            +
                  uri.host == @url.host
         
     | 
| 
      
 175 
     | 
    
         
            +
                end
         
     | 
| 
      
 176 
     | 
    
         
            +
              end
         
     | 
| 
      
 177 
     | 
    
         
            +
            end
         
     | 
| 
         @@ -0,0 +1,116 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            module Anemone
         
     | 
| 
      
 2 
     | 
    
         
            +
              class PageHash < Hash
         
     | 
| 
      
 3 
     | 
    
         
            +
                
         
     | 
| 
      
 4 
     | 
    
         
            +
    #
    # Use a breadth-first search to calculate the single-source
    # shortest paths from *root* to all pages in the PageHash
    #
    # Mutates the +depth+ and +visited+ attributes of every reachable
    # Page in place; returns +self+. Raises if *root* is not a key.
    #
    def shortest_paths!(root)
      root = URI(root) if root.is_a?(String)
      raise "Root node not found" if !has_key?(root)

      # clear visited flags left over from any previous traversal
      each_value {|p| p.visited = false if p}

      q = Queue.new

      q.enq(root)
      self[root].depth = 0
      self[root].visited = true
      while(!q.empty?)
        url = q.deq

        # a queued URL may belong to a page that was never crawled
        next if !has_key?(url)

        page = self[url]

        page.links.each do |u|
          next if !has_key?(u) or self[u].nil?
          link = self[u]
          # a page and its redirect-aliases are treated as one BFS node:
          # every alias gets its depth relaxed together.
          # NOTE(review): assumes each alias URL is itself a key in this
          # hash — self[a] would yield nil (and crash below) otherwise.
          aliases = [link].concat(link.aliases.map {|a| self[a] })

          aliases.each do |node|
            # standard BFS relaxation: keep the smallest depth seen
            if node.depth.nil? or page.depth + 1 < node.depth
              node.depth = page.depth + 1
            end
          end

          q.enq(self[u].url) if !self[u].visited
          self[u].visited = true
        end
      end

      self
    end
         
     | 
| 
      
 44 
     | 
    
         
            +
                
         
     | 
| 
      
 45 
     | 
    
         
            +
                #
         
     | 
| 
      
 46 
     | 
    
         
            +
                # Returns a new PageHash by removing redirect-aliases for each
         
     | 
| 
      
 47 
     | 
    
         
            +
                # non-redirect Page
         
     | 
| 
      
 48 
     | 
    
         
            +
                #
         
     | 
| 
      
 49 
     | 
    
         
            +
                def uniq
         
     | 
| 
      
 50 
     | 
    
         
            +
                  results = PageHash.new
         
     | 
| 
      
 51 
     | 
    
         
            +
                  each do |url, page|
         
     | 
| 
      
 52 
     | 
    
         
            +
                    #if none of the aliases of this page have been added, and this isn't a redirect page, add this page
         
     | 
| 
      
 53 
     | 
    
         
            +
                    page_added = page.aliases.inject(false) { |r, a| r ||= results.has_key? a}
         
     | 
| 
      
 54 
     | 
    
         
            +
                    if !page.redirect? and !page_added
         
     | 
| 
      
 55 
     | 
    
         
            +
                      results[url] = page.clone 
         
     | 
| 
      
 56 
     | 
    
         
            +
                      results[url].aliases = []
         
     | 
| 
      
 57 
     | 
    
         
            +
                    end
         
     | 
| 
      
 58 
     | 
    
         
            +
                  end
         
     | 
| 
      
 59 
     | 
    
         
            +
                  
         
     | 
| 
      
 60 
     | 
    
         
            +
                  results
         
     | 
| 
      
 61 
     | 
    
         
            +
                end
         
     | 
| 
      
 62 
     | 
    
         
            +
                
         
     | 
| 
      
 63 
     | 
    
         
            +
                #
         
     | 
| 
      
 64 
     | 
    
         
            +
                # If given a single URL (as a String or URI), returns an Array of Pages which link to that URL
         
     | 
| 
      
 65 
     | 
    
         
            +
                # If given an Array of URLs, returns a Hash (URI => [Page, Page...]) of Pages linking to those URLs
         
     | 
| 
      
 66 
     | 
    
         
            +
                #
         
     | 
| 
      
 67 
     | 
    
         
            +
                def pages_linking_to(urls)
         
     | 
| 
      
 68 
     | 
    
         
            +
                  unless urls.is_a?(Array)
         
     | 
| 
      
 69 
     | 
    
         
            +
                    urls = [urls] unless urls.is_a?(Array)
         
     | 
| 
      
 70 
     | 
    
         
            +
                    single = true
         
     | 
| 
      
 71 
     | 
    
         
            +
                  end
         
     | 
| 
      
 72 
     | 
    
         
            +
             
     | 
| 
      
 73 
     | 
    
         
            +
                  urls.map! do |url|
         
     | 
| 
      
 74 
     | 
    
         
            +
                    if url.is_a?(String)
         
     | 
| 
      
 75 
     | 
    
         
            +
                      URI(url) rescue nil
         
     | 
| 
      
 76 
     | 
    
         
            +
                    else
         
     | 
| 
      
 77 
     | 
    
         
            +
                      url
         
     | 
| 
      
 78 
     | 
    
         
            +
                    end
         
     | 
| 
      
 79 
     | 
    
         
            +
                  end
         
     | 
| 
      
 80 
     | 
    
         
            +
                  urls.compact
         
     | 
| 
      
 81 
     | 
    
         
            +
             
     | 
| 
      
 82 
     | 
    
         
            +
                  links = {}
         
     | 
| 
      
 83 
     | 
    
         
            +
                  urls.each { |url| links[url] = [] }
         
     | 
| 
      
 84 
     | 
    
         
            +
                  values.each do |page|
         
     | 
| 
      
 85 
     | 
    
         
            +
                    urls.each { |url| links[url] << page if page.links.include?(url) }
         
     | 
| 
      
 86 
     | 
    
         
            +
                  end
         
     | 
| 
      
 87 
     | 
    
         
            +
             
     | 
| 
      
 88 
     | 
    
         
            +
                  if single and !links.empty?
         
     | 
| 
      
 89 
     | 
    
         
            +
                    return links.first
         
     | 
| 
      
 90 
     | 
    
         
            +
                  else
         
     | 
| 
      
 91 
     | 
    
         
            +
                    return links
         
     | 
| 
      
 92 
     | 
    
         
            +
                  end
         
     | 
| 
      
 93 
     | 
    
         
            +
                end
         
     | 
| 
      
 94 
     | 
    
         
            +
             
     | 
| 
      
 95 
     | 
    
         
            +
                #
         
     | 
| 
      
 96 
     | 
    
         
            +
                # If given a single URL (as a String or URI), returns an Array of URLs which link to that URL
         
     | 
| 
      
 97 
     | 
    
         
            +
                # If given an Array of URLs, returns a Hash (URI => [URI, URI...]) of URLs linking to those URLs
         
     | 
| 
      
 98 
     | 
    
         
            +
                #
         
     | 
| 
      
 99 
     | 
    
         
            +
                def urls_linking_to(urls)
         
     | 
| 
      
 100 
     | 
    
         
            +
                  unless urls.is_a?(Array)
         
     | 
| 
      
 101 
     | 
    
         
            +
                    urls = [urls] unless urls.is_a?(Array)
         
     | 
| 
      
 102 
     | 
    
         
            +
                    single = true
         
     | 
| 
      
 103 
     | 
    
         
            +
                  end
         
     | 
| 
      
 104 
     | 
    
         
            +
             
     | 
| 
      
 105 
     | 
    
         
            +
                  links = pages_linking_to(urls)
         
     | 
| 
      
 106 
     | 
    
         
            +
                  links.each { |url, pages| links[url] = pages.map{|p| p.url} }
         
     | 
| 
      
 107 
     | 
    
         
            +
             
     | 
| 
      
 108 
     | 
    
         
            +
                  if single and !links.empty?
         
     | 
| 
      
 109 
     | 
    
         
            +
                    return links.first
         
     | 
| 
      
 110 
     | 
    
         
            +
                  else
         
     | 
| 
      
 111 
     | 
    
         
            +
                    return links
         
     | 
| 
      
 112 
     | 
    
         
            +
                  end	  
         
     | 
| 
      
 113 
     | 
    
         
            +
                end
         
     | 
| 
      
 114 
     | 
    
         
            +
             
     | 
| 
      
 115 
     | 
    
         
            +
              end
         
     | 
| 
      
 116 
     | 
    
         
            +
            end
         
     | 
| 
         @@ -0,0 +1,33 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            require 'anemone/page'
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            module Anemone
         
     | 
| 
      
 4 
     | 
    
         
            +
              class Tentacle
         
     | 
| 
      
 5 
     | 
    
         
            +
                
         
     | 
| 
      
 6 
     | 
    
         
            +
                #
         
     | 
| 
      
 7 
     | 
    
         
            +
                # Create a new Tentacle
         
     | 
| 
      
 8 
     | 
    
         
            +
                #
         
     | 
| 
      
 9 
     | 
    
         
            +
                def initialize(link_queue, page_queue)
         
     | 
| 
      
 10 
     | 
    
         
            +
                  @link_queue = link_queue
         
     | 
| 
      
 11 
     | 
    
         
            +
                  @page_queue = page_queue
         
     | 
| 
      
 12 
     | 
    
         
            +
                end
         
     | 
| 
      
 13 
     | 
    
         
            +
                
         
     | 
| 
      
 14 
     | 
    
         
            +
                #
         
     | 
| 
      
 15 
     | 
    
         
            +
                # Gets links from @link_queue, and returns the fetched
         
     | 
| 
      
 16 
     | 
    
         
            +
                # Page objects into @page_queue
         
     | 
| 
      
 17 
     | 
    
         
            +
                #
         
     | 
| 
      
 18 
     | 
    
         
            +
                def run
         
     | 
| 
      
 19 
     | 
    
         
            +
                  while true do
         
     | 
| 
      
 20 
     | 
    
         
            +
                    link = @link_queue.deq
         
     | 
| 
      
 21 
     | 
    
         
            +
                    
         
     | 
| 
      
 22 
     | 
    
         
            +
                    break if link == :END
         
     | 
| 
      
 23 
     | 
    
         
            +
             
     | 
| 
      
 24 
     | 
    
         
            +
                    page = Page.fetch(link)
         
     | 
| 
      
 25 
     | 
    
         
            +
                    
         
     | 
| 
      
 26 
     | 
    
         
            +
                    @page_queue.enq(page)
         
     | 
| 
      
 27 
     | 
    
         
            +
             
     | 
| 
      
 28 
     | 
    
         
            +
                    sleep Anemone.options.delay
         
     | 
| 
      
 29 
     | 
    
         
            +
                  end
         
     | 
| 
      
 30 
     | 
    
         
            +
                end
         
     | 
| 
      
 31 
     | 
    
         
            +
                
         
     | 
| 
      
 32 
     | 
    
         
            +
              end
         
     | 
| 
      
 33 
     | 
    
         
            +
            end
         
     | 
| 
         @@ -0,0 +1,41 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            require File.dirname(__FILE__) + '/spec_helper'
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            describe Anemone do
         
     | 
| 
      
 4 
     | 
    
         
            +
             
     | 
| 
      
 5 
     | 
    
         
            +
              it "should have a version" do
         
     | 
| 
      
 6 
     | 
    
         
            +
                Anemone.const_defined?('VERSION').should == true
         
     | 
| 
      
 7 
     | 
    
         
            +
              end
         
     | 
| 
      
 8 
     | 
    
         
            +
             
     | 
| 
      
 9 
     | 
    
         
            +
              it "should have options" do
         
     | 
| 
      
 10 
     | 
    
         
            +
                Anemone.should respond_to(:options)
         
     | 
| 
      
 11 
     | 
    
         
            +
              end
         
     | 
| 
      
 12 
     | 
    
         
            +
              
         
     | 
| 
      
 13 
     | 
    
         
            +
              it "should accept options for the crawl" do
         
     | 
| 
      
 14 
     | 
    
         
            +
                Anemone.crawl(SPEC_DOMAIN, :verbose => false, 
         
     | 
| 
      
 15 
     | 
    
         
            +
                                           :threads => 2, 
         
     | 
| 
      
 16 
     | 
    
         
            +
                                           :discard_page_bodies => true,
         
     | 
| 
      
 17 
     | 
    
         
            +
                                           :user_agent => 'test')
         
     | 
| 
      
 18 
     | 
    
         
            +
                Anemone.options.verbose.should == false
         
     | 
| 
      
 19 
     | 
    
         
            +
                Anemone.options.threads.should == 2
         
     | 
| 
      
 20 
     | 
    
         
            +
                Anemone.options.discard_page_bodies.should == true
         
     | 
| 
      
 21 
     | 
    
         
            +
                Anemone.options.delay.should == 0
         
     | 
| 
      
 22 
     | 
    
         
            +
                Anemone.options.user_agent.should == 'test'
         
     | 
| 
      
 23 
     | 
    
         
            +
              end    
         
     | 
| 
      
 24 
     | 
    
         
            +
              
         
     | 
| 
      
 25 
     | 
    
         
            +
              it "should accept options of obeying Robots.txt for the crawl" do
         
     | 
| 
      
 26 
     | 
    
         
            +
                Anemone.crawl(SPEC_DOMAIN, :obey_robots_dot_txt => true)
         
     | 
| 
      
 27 
     | 
    
         
            +
                Anemone.options.obey_robots_dot_txt.should == true  
         
     | 
| 
      
 28 
     | 
    
         
            +
              end
         
     | 
| 
      
 29 
     | 
    
         
            +
              
         
     | 
| 
      
 30 
     | 
    
         
            +
              it "should use 1 thread if a delay is requested" do
         
     | 
| 
      
 31 
     | 
    
         
            +
                Anemone.crawl(SPEC_DOMAIN, :delay => 0.01, :threads => 2)
         
     | 
| 
      
 32 
     | 
    
         
            +
                Anemone.options.threads.should == 1
         
     | 
| 
      
 33 
     | 
    
         
            +
              end
         
     | 
| 
      
 34 
     | 
    
         
            +
              
         
     | 
| 
      
 35 
     | 
    
         
            +
              it "should return a Anemone::Core from the crawl, which has a PageHash" do
         
     | 
| 
      
 36 
     | 
    
         
            +
                result = Anemone.crawl(SPEC_DOMAIN)
         
     | 
| 
      
 37 
     | 
    
         
            +
                result.should be_an_instance_of(Anemone::Core)
         
     | 
| 
      
 38 
     | 
    
         
            +
                result.pages.should be_an_instance_of(Anemone::PageHash)
         
     | 
| 
      
 39 
     | 
    
         
            +
              end
         
     | 
| 
      
 40 
     | 
    
         
            +
              
         
     | 
| 
      
 41 
     | 
    
         
            +
            end
         
     | 
    
        data/spec/core_spec.rb
    ADDED
    
    | 
         @@ -0,0 +1,128 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            require File.dirname(__FILE__) + '/spec_helper'
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            module Anemone
         
     | 
| 
      
 4 
     | 
    
         
            +
              describe Core do
         
     | 
| 
      
 5 
     | 
    
         
            +
                
         
     | 
| 
      
 6 
     | 
    
         
            +
                before(:each) do
         
     | 
| 
      
 7 
     | 
    
         
            +
                  FakeWeb.clean_registry
         
     | 
| 
      
 8 
     | 
    
         
            +
                end
         
     | 
| 
      
 9 
     | 
    
         
            +
                
         
     | 
| 
      
 10 
     | 
    
         
            +
                it "should crawl all the html pages in a domain by following <a> href's" do
         
     | 
| 
      
 11 
     | 
    
         
            +
                  pages = []
         
     | 
| 
      
 12 
     | 
    
         
            +
                  pages << FakePage.new('0', :links => ['1', '2'])
         
     | 
| 
      
 13 
     | 
    
         
            +
                  pages << FakePage.new('1', :links => ['3'])
         
     | 
| 
      
 14 
     | 
    
         
            +
                  pages << FakePage.new('2')
         
     | 
| 
      
 15 
     | 
    
         
            +
                  pages << FakePage.new('3')
         
     | 
| 
      
 16 
     | 
    
         
            +
                  
         
     | 
| 
      
 17 
     | 
    
         
            +
                  Anemone.crawl(pages[0].url).should have(4).pages
         
     | 
| 
      
 18 
     | 
    
         
            +
                end
         
     | 
| 
      
 19 
     | 
    
         
            +
                
         
     | 
| 
      
 20 
     | 
    
         
            +
                it "should not leave the original domain" do
         
     | 
| 
      
 21 
     | 
    
         
            +
                  pages = []
         
     | 
| 
      
 22 
     | 
    
         
            +
                  pages << FakePage.new('0', :links => ['1'], :hrefs => 'http://www.other.com/')
         
     | 
| 
      
 23 
     | 
    
         
            +
                  pages << FakePage.new('1')
         
     | 
| 
      
 24 
     | 
    
         
            +
                  
         
     | 
| 
      
 25 
     | 
    
         
            +
                  core = Anemone.crawl(pages[0].url)
         
     | 
| 
      
 26 
     | 
    
         
            +
                  
         
     | 
| 
      
 27 
     | 
    
         
            +
                  core.should have(2).pages
         
     | 
| 
      
 28 
     | 
    
         
            +
                  core.pages.keys.map{|k| k.to_s}.should_not include('http://www.other.com/')
         
     | 
| 
      
 29 
     | 
    
         
            +
                end
         
     | 
| 
      
 30 
     | 
    
         
            +
                
         
     | 
| 
      
 31 
     | 
    
         
            +
                it "should follow http redirects" do
         
     | 
| 
      
 32 
     | 
    
         
            +
                  pages = []
         
     | 
| 
      
 33 
     | 
    
         
            +
                  pages << FakePage.new('0', :links => ['1'])
         
     | 
| 
      
 34 
     | 
    
         
            +
                  pages << FakePage.new('1', :redirect => '2')
         
     | 
| 
      
 35 
     | 
    
         
            +
                  pages << FakePage.new('2')
         
     | 
| 
      
 36 
     | 
    
         
            +
                  
         
     | 
| 
      
 37 
     | 
    
         
            +
                  Anemone.crawl(pages[0].url).should have(3).pages     
         
     | 
| 
      
 38 
     | 
    
         
            +
                end
         
     | 
| 
      
 39 
     | 
    
         
            +
                
         
     | 
| 
      
 40 
     | 
    
         
            +
                it "should accept multiple starting URLs" do
         
     | 
| 
      
 41 
     | 
    
         
            +
                  pages = []
         
     | 
| 
      
 42 
     | 
    
         
            +
                  pages << FakePage.new('0', :links => ['1'])
         
     | 
| 
      
 43 
     | 
    
         
            +
                  pages << FakePage.new('1')
         
     | 
| 
      
 44 
     | 
    
         
            +
                  pages << FakePage.new('2', :links => ['3'])
         
     | 
| 
      
 45 
     | 
    
         
            +
                  pages << FakePage.new('3')
         
     | 
| 
      
 46 
     | 
    
         
            +
                  
         
     | 
| 
      
 47 
     | 
    
         
            +
                  Anemone.crawl([pages[0].url, pages[2].url]).should have(4).pages
         
     | 
| 
      
 48 
     | 
    
         
            +
                end
         
     | 
| 
      
 49 
     | 
    
         
            +
                
         
     | 
| 
      
 50 
     | 
    
         
            +
                it "should include the query string when following links" do
         
     | 
| 
      
 51 
     | 
    
         
            +
                  pages = []
         
     | 
| 
      
 52 
     | 
    
         
            +
                  pages << FakePage.new('0', :links => ['1?foo=1'])
         
     | 
| 
      
 53 
     | 
    
         
            +
                  pages << FakePage.new('1?foo=1')
         
     | 
| 
      
 54 
     | 
    
         
            +
                  pages << FakePage.new('1')
         
     | 
| 
      
 55 
     | 
    
         
            +
                  
         
     | 
| 
      
 56 
     | 
    
         
            +
                  core = Anemone.crawl(pages[0].url)
         
     | 
| 
      
 57 
     | 
    
         
            +
                  
         
     | 
| 
      
 58 
     | 
    
         
            +
                  core.should have(2).pages
         
     | 
| 
      
 59 
     | 
    
         
            +
                  core.pages.keys.map{|k| k.to_s}.should_not include(pages[2].url)
         
     | 
| 
      
 60 
     | 
    
         
            +
                end
         
     | 
| 
      
 61 
     | 
    
         
            +
                
         
     | 
| 
      
 62 
     | 
    
         
            +
                it "should be able to skip links based on a RegEx" do
         
     | 
| 
      
 63 
     | 
    
         
            +
                  pages = []
         
     | 
| 
      
 64 
     | 
    
         
            +
                  pages << FakePage.new('0', :links => ['1', '2'])
         
     | 
| 
      
 65 
     | 
    
         
            +
                  pages << FakePage.new('1')
         
     | 
| 
      
 66 
     | 
    
         
            +
                  pages << FakePage.new('2')
         
     | 
| 
      
 67 
     | 
    
         
            +
                  
         
     | 
| 
      
 68 
     | 
    
         
            +
                  core = Anemone.crawl(pages[0].url) do |a|
         
     | 
| 
      
 69 
     | 
    
         
            +
                    a.skip_links_like /1/
         
     | 
| 
      
 70 
     | 
    
         
            +
                  end
         
     | 
| 
      
 71 
     | 
    
         
            +
                  
         
     | 
| 
      
 72 
     | 
    
         
            +
                  core.should have(2).pages
         
     | 
| 
      
 73 
     | 
    
         
            +
                  core.pages.keys.map{|k| k.to_s}.should_not include(pages[1].url)
         
     | 
| 
      
 74 
     | 
    
         
            +
                end
         
     | 
| 
      
 75 
     | 
    
         
            +
                
         
     | 
| 
      
 76 
     | 
    
         
            +
                it "should be able to call a block on every page" do
         
     | 
| 
      
 77 
     | 
    
         
            +
                  pages = []
         
     | 
| 
      
 78 
     | 
    
         
            +
                  pages << FakePage.new('0', :links => ['1', '2'])
         
     | 
| 
      
 79 
     | 
    
         
            +
                  pages << FakePage.new('1')
         
     | 
| 
      
 80 
     | 
    
         
            +
                  pages << FakePage.new('2')
         
     | 
| 
      
 81 
     | 
    
         
            +
                  
         
     | 
| 
      
 82 
     | 
    
         
            +
                  count = 0
         
     | 
| 
      
 83 
     | 
    
         
            +
                  Anemone.crawl(pages[0].url) do |a|
         
     | 
| 
      
 84 
     | 
    
         
            +
                    a.on_every_page { count += 1 }
         
     | 
| 
      
 85 
     | 
    
         
            +
                  end     
         
     | 
| 
      
 86 
     | 
    
         
            +
                  
         
     | 
| 
      
 87 
     | 
    
         
            +
                  count.should == 3
         
     | 
| 
      
 88 
     | 
    
         
            +
                end
         
     | 
| 
      
 89 
     | 
    
         
            +
                
         
     | 
| 
      
 90 
     | 
    
         
            +
                it "should not discard page bodies by default" do
         
     | 
| 
      
 91 
     | 
    
         
            +
                  Anemone.crawl(FakePage.new('0').url).pages.values.first.doc.should_not be_nil
         
     | 
| 
      
 92 
     | 
    
         
            +
                end
         
     | 
| 
      
 93 
     | 
    
         
            +
                
         
     | 
| 
      
 94 
     | 
    
         
            +
                it "should optionally discard page bodies to conserve memory" do
         
     | 
| 
      
 95 
     | 
    
         
            +
                  core = Anemone.crawl(FakePage.new('0').url, :discard_page_bodies => true)
         
     | 
| 
      
 96 
     | 
    
         
            +
                  core.pages.values.first.doc.should be_nil
         
     | 
| 
      
 97 
     | 
    
         
            +
                end
         
     | 
| 
      
 98 
     | 
    
         
            +
                
         
     | 
| 
      
 99 
     | 
    
         
            +
                it "should provide a focus_crawl method to select the links on each page to follow" do
         
     | 
| 
      
 100 
     | 
    
         
            +
                  pages = []
         
     | 
| 
      
 101 
     | 
    
         
            +
                  pages << FakePage.new('0', :links => ['1', '2'])
         
     | 
| 
      
 102 
     | 
    
         
            +
                  pages << FakePage.new('1')
         
     | 
| 
      
 103 
     | 
    
         
            +
                  pages << FakePage.new('2')
         
     | 
| 
      
 104 
     | 
    
         
            +
             
     | 
| 
      
 105 
     | 
    
         
            +
                  core = Anemone.crawl(pages[0].url) do |a|
         
     | 
| 
      
 106 
     | 
    
         
            +
                    a.focus_crawl {|p| p.links.reject{|l| l.to_s =~ /1/}}
         
     | 
| 
      
 107 
     | 
    
         
            +
                  end     
         
     | 
| 
      
 108 
     | 
    
         
            +
                  
         
     | 
| 
      
 109 
     | 
    
         
            +
                  core.should have(2).pages
         
     | 
| 
      
 110 
     | 
    
         
            +
                  core.pages.keys.map{|k| k.to_s}.should_not include(pages[1].url)
         
     | 
| 
      
 111 
     | 
    
         
            +
                end
         
     | 
| 
      
 112 
     | 
    
         
            +
                
         
     | 
| 
      
 113 
     | 
    
         
            +
                it "should optionally delay between page requests" do
         
     | 
| 
      
 114 
     | 
    
         
            +
                  delay = 0.25
         
     | 
| 
      
 115 
     | 
    
         
            +
                  
         
     | 
| 
      
 116 
     | 
    
         
            +
                  pages = []
         
     | 
| 
      
 117 
     | 
    
         
            +
                  pages << FakePage.new('0', :links => '1')
         
     | 
| 
      
 118 
     | 
    
         
            +
                  pages << FakePage.new('1')
         
     | 
| 
      
 119 
     | 
    
         
            +
                  
         
     | 
| 
      
 120 
     | 
    
         
            +
                  start = Time.now
         
     | 
| 
      
 121 
     | 
    
         
            +
                  Anemone.crawl(pages[0].url, :delay => delay)
         
     | 
| 
      
 122 
     | 
    
         
            +
                  finish = Time.now
         
     | 
| 
      
 123 
     | 
    
         
            +
                  
         
     | 
| 
      
 124 
     | 
    
         
            +
                  (finish - start).should satisfy {|t| t > delay * 2}
         
     | 
| 
      
 125 
     | 
    
         
            +
                end
         
     | 
| 
      
 126 
     | 
    
         
            +
                
         
     | 
| 
      
 127 
     | 
    
         
            +
              end
         
     | 
| 
      
 128 
     | 
    
         
            +
            end
         
     | 
| 
         @@ -0,0 +1,55 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            begin
         
     | 
| 
      
 2 
     | 
    
         
            +
              require 'fakeweb'
         
     | 
| 
      
 3 
     | 
    
         
            +
            rescue LoadError
         
     | 
| 
      
 4 
     | 
    
         
            +
              warn "You need the 'fakeweb' gem installed to test Anemone"
         
     | 
| 
      
 5 
     | 
    
         
            +
              exit
         
     | 
| 
      
 6 
     | 
    
         
            +
            end
         
     | 
| 
      
 7 
     | 
    
         
            +
             
     | 
| 
      
 8 
     | 
    
         
            +
            FakeWeb.allow_net_connect = false
         
     | 
| 
      
 9 
     | 
    
         
            +
             
     | 
| 
      
 10 
     | 
    
         
            +
            module Anemone
         
     | 
| 
      
 11 
     | 
    
         
            +
              SPEC_DOMAIN = "http://www.example.com/"
         
     | 
| 
      
 12 
     | 
    
         
            +
              
         
     | 
| 
      
 13 
     | 
    
         
            +
              class FakePage
         
     | 
| 
      
 14 
     | 
    
         
            +
                attr_accessor :links
         
     | 
| 
      
 15 
     | 
    
         
            +
                attr_accessor :hrefs
         
     | 
| 
      
 16 
     | 
    
         
            +
                
         
     | 
| 
      
 17 
     | 
    
         
            +
                def initialize(name = '', options = {})
         
     | 
| 
      
 18 
     | 
    
         
            +
                  @name = name
         
     | 
| 
      
 19 
     | 
    
         
            +
                  @links = [options[:links]].flatten if options.has_key?(:links)
         
     | 
| 
      
 20 
     | 
    
         
            +
                  @hrefs = [options[:hrefs]].flatten if options.has_key?(:hrefs)
         
     | 
| 
      
 21 
     | 
    
         
            +
                  @redirect = options[:redirect] if options.has_key?(:redirect)
         
     | 
| 
      
 22 
     | 
    
         
            +
                  
         
     | 
| 
      
 23 
     | 
    
         
            +
                  create_body
         
     | 
| 
      
 24 
     | 
    
         
            +
                  add_to_fakeweb
         
     | 
| 
      
 25 
     | 
    
         
            +
                end
         
     | 
| 
      
 26 
     | 
    
         
            +
                
         
     | 
| 
      
 27 
     | 
    
         
            +
                def url
         
     | 
| 
      
 28 
     | 
    
         
            +
                  SPEC_DOMAIN + @name
         
     | 
| 
      
 29 
     | 
    
         
            +
                end
         
     | 
| 
      
 30 
     | 
    
         
            +
                
         
     | 
| 
      
 31 
     | 
    
         
            +
                private
         
     | 
| 
      
 32 
     | 
    
         
            +
                
         
     | 
| 
      
 33 
     | 
    
         
            +
                def create_body
         
     | 
| 
      
 34 
     | 
    
         
            +
                  @body = "<html><body>"
         
     | 
| 
      
 35 
     | 
    
         
            +
                  @links.each{|l| @body += "<a href=\"#{SPEC_DOMAIN}#{l}\"></a>"} if @links
         
     | 
| 
      
 36 
     | 
    
         
            +
                  @hrefs.each{|h| @body += "<a href=\"#{h}\"></a>"} if @hrefs
         
     | 
| 
      
 37 
     | 
    
         
            +
                  @body += "</body></html>"
         
     | 
| 
      
 38 
     | 
    
         
            +
                end
         
     | 
| 
      
 39 
     | 
    
         
            +
                
         
     | 
| 
      
 40 
     | 
    
         
            +
                def add_to_fakeweb
         
     | 
| 
      
 41 
     | 
    
         
            +
                  options = {:body => @body, :content_type => "text/html", :status => [200, "OK"]}
         
     | 
| 
      
 42 
     | 
    
         
            +
                  
         
     | 
| 
      
 43 
     | 
    
         
            +
                  if @redirect
         
     | 
| 
      
 44 
     | 
    
         
            +
                    options[:status] = [301, "Permanently Moved"] 
         
     | 
| 
      
 45 
     | 
    
         
            +
                    options[:location] = SPEC_DOMAIN + @redirect
         
     | 
| 
      
 46 
     | 
    
         
            +
                  end
         
     | 
| 
      
 47 
     | 
    
         
            +
             
     | 
| 
      
 48 
     | 
    
         
            +
                  FakeWeb.register_uri(:get, SPEC_DOMAIN + @name, options)
         
     | 
| 
      
 49 
     | 
    
         
            +
                end
         
     | 
| 
      
 50 
     | 
    
         
            +
              end
         
     | 
| 
      
 51 
     | 
    
         
            +
            end
         
     | 
| 
      
 52 
     | 
    
         
            +
             
     | 
| 
      
 53 
     | 
    
         
            +
            #default root
         
     | 
| 
      
 54 
     | 
    
         
            +
            Anemone::FakePage.new
         
     | 
| 
      
 55 
     | 
    
         
            +
             
     | 
    
        data/spec/page_spec.rb
    ADDED
    
    | 
         @@ -0,0 +1,49 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            require File.dirname(__FILE__) + '/spec_helper'
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            module Anemone
         
     | 
| 
      
 4 
     | 
    
         
            +
              describe Page do
         
     | 
| 
      
 5 
     | 
    
         
            +
                
         
     | 
| 
      
 6 
     | 
    
         
            +
                before(:each) do
         
     | 
| 
      
 7 
     | 
    
         
            +
                  @page = Page.fetch(FakePage.new('home').url)
         
     | 
| 
      
 8 
     | 
    
         
            +
                end
         
     | 
| 
      
 9 
     | 
    
         
            +
                
         
     | 
| 
      
 10 
     | 
    
         
            +
                it "should be able to fetch a page" do
         
     | 
| 
      
 11 
     | 
    
         
            +
                  @page.should_not be_nil
         
     | 
| 
      
 12 
     | 
    
         
            +
                  @page.url.to_s.should include('home')
         
     | 
| 
      
 13 
     | 
    
         
            +
                end
         
     | 
| 
      
 14 
     | 
    
         
            +
                
         
     | 
| 
      
 15 
     | 
    
         
            +
                it "should store the response headers when fetching a page" do
         
     | 
| 
      
 16 
     | 
    
         
            +
                  @page.headers.should_not be_nil
         
     | 
| 
      
 17 
     | 
    
         
            +
                  @page.headers.should have_key('content-type')
         
     | 
| 
      
 18 
     | 
    
         
            +
                end
         
     | 
| 
      
 19 
     | 
    
         
            +
                
         
     | 
| 
      
 20 
     | 
    
         
            +
                it "should have an OpenStruct attribute for the developer to store data in" do
         
     | 
| 
      
 21 
     | 
    
         
            +
                  @page.data.should_not be_nil
         
     | 
| 
      
 22 
     | 
    
         
            +
                  @page.data.should be_an_instance_of(OpenStruct)
         
     | 
| 
      
 23 
     | 
    
         
            +
                  
         
     | 
| 
      
 24 
     | 
    
         
            +
                  @page.data.test = 'test'
         
     | 
| 
      
 25 
     | 
    
         
            +
                  @page.data.test.should == 'test'
         
     | 
| 
      
 26 
     | 
    
         
            +
                end
         
     | 
| 
      
 27 
     | 
    
         
            +
                
         
     | 
| 
      
 28 
     | 
    
         
            +
                it "should have a Nokogori::HTML::Document attribute for the page body" do
         
     | 
| 
      
 29 
     | 
    
         
            +
                  @page.doc.should_not be_nil
         
     | 
| 
      
 30 
     | 
    
         
            +
                  @page.doc.should be_an_instance_of(Nokogiri::HTML::Document)
         
     | 
| 
      
 31 
     | 
    
         
            +
                end
         
     | 
| 
      
 32 
     | 
    
         
            +
                
         
     | 
| 
      
 33 
     | 
    
         
            +
                it "should indicate whether it was fetched after an HTTP redirect" do
         
     | 
| 
      
 34 
     | 
    
         
            +
                  @page.should respond_to(:redirect?)
         
     | 
| 
      
 35 
     | 
    
         
            +
                  
         
     | 
| 
      
 36 
     | 
    
         
            +
                  @page.redirect?.should == false
         
     | 
| 
      
 37 
     | 
    
         
            +
                  
         
     | 
| 
      
 38 
     | 
    
         
            +
                  Page.fetch(FakePage.new('redir', :redirect => 'home').url).redirect?.should == true
         
     | 
| 
      
 39 
     | 
    
         
            +
                end
         
     | 
| 
      
 40 
     | 
    
         
            +
                
         
     | 
| 
      
 41 
     | 
    
         
            +
                it "should have a method to tell if a URI is in the same domain as the page" do
         
     | 
| 
      
 42 
     | 
    
         
            +
                  @page.should respond_to(:in_domain?)
         
     | 
| 
      
 43 
     | 
    
         
            +
                  
         
     | 
| 
      
 44 
     | 
    
         
            +
                  @page.in_domain?(URI(FakePage.new('test').url)).should == true
         
     | 
| 
      
 45 
     | 
    
         
            +
                  @page.in_domain?(URI('http://www.other.com/')).should == false
         
     | 
| 
      
 46 
     | 
    
         
            +
                end
         
     | 
| 
      
 47 
     | 
    
         
            +
                
         
     | 
| 
      
 48 
     | 
    
         
            +
              end
         
     | 
| 
      
 49 
     | 
    
         
            +
            end
         
     | 
    
        data/spec/spec_helper.rb
    ADDED
    
    
    
        metadata
    ADDED
    
    | 
         @@ -0,0 +1,86 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            --- !ruby/object:Gem::Specification 
         
     | 
| 
      
 2 
     | 
    
         
            +
            name: parolkar-anemone
         
     | 
| 
      
 3 
     | 
    
         
            +
            version: !ruby/object:Gem::Version 
         
     | 
| 
      
 4 
     | 
    
         
            +
              version: 0.1.2
         
     | 
| 
      
 5 
     | 
    
         
            +
            platform: ruby
         
     | 
| 
      
 6 
     | 
    
         
            +
            authors: 
         
     | 
| 
      
 7 
     | 
    
         
            +
            - Chris Kite
         
     | 
| 
      
 8 
     | 
    
         
            +
            autorequire: 
         
     | 
| 
      
 9 
     | 
    
         
            +
            bindir: bin
         
     | 
| 
      
 10 
     | 
    
         
            +
            cert_chain: []
         
     | 
| 
      
 11 
     | 
    
         
            +
             
     | 
| 
      
 12 
     | 
    
         
            +
            date: 2009-05-16 00:00:00 -07:00
         
     | 
| 
      
 13 
     | 
    
         
            +
            default_executable: 
         
     | 
| 
      
 14 
     | 
    
         
            +
            dependencies: 
         
     | 
| 
      
 15 
     | 
    
         
            +
            - !ruby/object:Gem::Dependency 
         
     | 
| 
      
 16 
     | 
    
         
            +
              name: nokogiri
         
     | 
| 
      
 17 
     | 
    
         
            +
              type: :runtime
         
     | 
| 
      
 18 
     | 
    
         
            +
              version_requirement: 
         
     | 
| 
      
 19 
     | 
    
         
            +
              version_requirements: !ruby/object:Gem::Requirement 
         
     | 
| 
      
 20 
     | 
    
         
            +
                requirements: 
         
     | 
| 
      
 21 
     | 
    
         
            +
                - - ">="
         
     | 
| 
      
 22 
     | 
    
         
            +
                  - !ruby/object:Gem::Version 
         
     | 
| 
      
 23 
     | 
    
         
            +
                    version: 1.3.0
         
     | 
| 
      
 24 
     | 
    
         
            +
                version: 
         
     | 
| 
      
 25 
     | 
    
         
            +
            description: 
         
     | 
| 
      
 26 
     | 
    
         
            +
            email: 
         
     | 
| 
      
 27 
     | 
    
         
            +
            executables: 
         
     | 
| 
      
 28 
     | 
    
         
            +
            - anemone_count.rb
         
     | 
| 
      
 29 
     | 
    
         
            +
            - anemone_cron.rb
         
     | 
| 
      
 30 
     | 
    
         
            +
            - anemone_pagedepth.rb
         
     | 
| 
      
 31 
     | 
    
         
            +
            - anemone_serialize.rb
         
     | 
| 
      
 32 
     | 
    
         
            +
            - anemone_url_list.rb
         
     | 
| 
      
 33 
     | 
    
         
            +
            extensions: []
         
     | 
| 
      
 34 
     | 
    
         
            +
             
     | 
| 
      
 35 
     | 
    
         
            +
            extra_rdoc_files: 
         
     | 
| 
      
 36 
     | 
    
         
            +
            - README.rdoc
         
     | 
| 
      
 37 
     | 
    
         
            +
            files: 
         
     | 
| 
      
 38 
     | 
    
         
            +
            - LICENSE.txt
         
     | 
| 
      
 39 
     | 
    
         
            +
            - README.rdoc
         
     | 
| 
      
 40 
     | 
    
         
            +
            - bin/anemone_count.rb
         
     | 
| 
      
 41 
     | 
    
         
            +
            - bin/anemone_cron.rb
         
     | 
| 
      
 42 
     | 
    
         
            +
            - bin/anemone_pagedepth.rb
         
     | 
| 
      
 43 
     | 
    
         
            +
            - bin/anemone_serialize.rb
         
     | 
| 
      
 44 
     | 
    
         
            +
            - bin/anemone_url_list.rb
         
     | 
| 
      
 45 
     | 
    
         
            +
            - lib/anemone.rb
         
     | 
| 
      
 46 
     | 
    
         
            +
            - lib/anemone/anemone.rb
         
     | 
| 
      
 47 
     | 
    
         
            +
            - lib/anemone/core.rb
         
     | 
| 
      
 48 
     | 
    
         
            +
            - lib/anemone/http.rb
         
     | 
| 
      
 49 
     | 
    
         
            +
            - lib/anemone/page.rb
         
     | 
| 
      
 50 
     | 
    
         
            +
            - lib/anemone/page_hash.rb
         
     | 
| 
      
 51 
     | 
    
         
            +
            - lib/anemone/tentacle.rb
         
     | 
| 
      
 52 
     | 
    
         
            +
            has_rdoc: true
         
     | 
| 
      
 53 
     | 
    
         
            +
            homepage: http://anemone.rubyforge.org
         
     | 
| 
      
 54 
     | 
    
         
            +
            post_install_message: 
         
     | 
| 
      
 55 
     | 
    
         
            +
            rdoc_options: 
         
     | 
| 
      
 56 
     | 
    
         
            +
            - -m
         
     | 
| 
      
 57 
     | 
    
         
            +
            - README.rdoc
         
     | 
| 
      
 58 
     | 
    
         
            +
            - -t
         
     | 
| 
      
 59 
     | 
    
         
            +
            - Anemone
         
     | 
| 
      
 60 
     | 
    
         
            +
            require_paths: 
         
     | 
| 
      
 61 
     | 
    
         
            +
            - lib
         
     | 
| 
      
 62 
     | 
    
         
            +
            required_ruby_version: !ruby/object:Gem::Requirement 
         
     | 
| 
      
 63 
     | 
    
         
            +
              requirements: 
         
     | 
| 
      
 64 
     | 
    
         
            +
              - - ">="
         
     | 
| 
      
 65 
     | 
    
         
            +
                - !ruby/object:Gem::Version 
         
     | 
| 
      
 66 
     | 
    
         
            +
                  version: "0"
         
     | 
| 
      
 67 
     | 
    
         
            +
              version: 
         
     | 
| 
      
 68 
     | 
    
         
            +
            required_rubygems_version: !ruby/object:Gem::Requirement 
         
     | 
| 
      
 69 
     | 
    
         
            +
              requirements: 
         
     | 
| 
      
 70 
     | 
    
         
            +
              - - ">="
         
     | 
| 
      
 71 
     | 
    
         
            +
                - !ruby/object:Gem::Version 
         
     | 
| 
      
 72 
     | 
    
         
            +
                  version: "0"
         
     | 
| 
      
 73 
     | 
    
         
            +
              version: 
         
     | 
| 
      
 74 
     | 
    
         
            +
            requirements: []
         
     | 
| 
      
 75 
     | 
    
         
            +
             
     | 
| 
      
 76 
     | 
    
         
            +
            rubyforge_project: anemone
         
     | 
| 
      
 77 
     | 
    
         
            +
            rubygems_version: 1.2.0
         
     | 
| 
      
 78 
     | 
    
         
            +
            signing_key: 
         
     | 
| 
      
 79 
     | 
    
         
            +
            specification_version: 2
         
     | 
| 
      
 80 
     | 
    
         
            +
            summary: Anemone web-spider framework
         
     | 
| 
      
 81 
     | 
    
         
            +
            test_files: 
         
     | 
| 
      
 82 
     | 
    
         
            +
            - spec/anemone_spec.rb
         
     | 
| 
      
 83 
     | 
    
         
            +
            - spec/core_spec.rb
         
     | 
| 
      
 84 
     | 
    
         
            +
            - spec/page_spec.rb
         
     | 
| 
      
 85 
     | 
    
         
            +
            - spec/fakeweb_helper.rb
         
     | 
| 
      
 86 
     | 
    
         
            +
            - spec/spec_helper.rb
         
     |