anemone 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +13 -5
- data/bin/anemone +4 -0
- data/lib/anemone/anemone.rb +28 -34
- data/lib/anemone/cli.rb +24 -0
- data/lib/anemone/cli/count.rb +22 -0
- data/{bin/anemone_cron.rb → lib/anemone/cli/cron.rb} +19 -35
- data/lib/anemone/cli/pagedepth.rb +32 -0
- data/lib/anemone/cli/serialize.rb +35 -0
- data/lib/anemone/cli/url_list.rb +41 -0
- data/lib/anemone/core.rb +13 -13
- data/lib/anemone/http.rb +70 -10
- data/lib/anemone/page.rb +28 -60
- data/lib/anemone/page_hash.rb +12 -0
- data/lib/anemone/tentacle.rb +6 -11
- data/spec/anemone_spec.rb +10 -0
- data/spec/core_spec.rb +29 -33
- data/spec/page_spec.rb +11 -8
- metadata +10 -12
- data/bin/anemone_count.rb +0 -36
- data/bin/anemone_pagedepth.rb +0 -44
- data/bin/anemone_serialize.rb +0 -51
- data/bin/anemone_url_list.rb +0 -54
data/README.rdoc
CHANGED
@@ -1,18 +1,26 @@
 = Anemone
 
-== DESCRIPTION
 Anemone is a web spider framework that can spider a domain and collect useful
 information about the pages it visits. It is versatile, allowing you to
 write your own specialized spider tasks quickly and easily.
 
-
+See http://anemone.rubyforge.org for more information.
+
+== Features
 * Multi-threaded design for high performance
 * Tracks 301 HTTP redirects to understand a page's aliases
 * Built-in BFS algorithm for determining page depth
 * Allows exclusion of URLs based on regular expressions
+* Choose the links to follow on each page with focus_crawl()
+* HTTPS support
+* Records response time for each page
+* CLI program can list all pages in a domain, calculate page depths, and more
+
+== Examples
+See the scripts under the <tt>lib/anemone/cli</tt> directory for examples of several useful Anemone tasks.
 
-==
+== Requirements
 * nokogiri
 
-==
-
+== Optional
+* fizx-robots (required if obey_robots_txt is set to true)
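The feature list above maps directly onto the block-based crawl API. A minimal illustrative sketch against 0.2.1 (the start URL and the link filter are placeholders, not part of the package):

  require 'anemone'

  Anemone.crawl("http://example.com/", :threads => 2, :depth_limit => 3) do |anemone|
    # focus_crawl: return only the links you want followed from each page
    anemone.focus_crawl do |page|
      page.links.select { |uri| uri.path =~ %r{^/docs} }
    end

    # response_time (in milliseconds) is new in this release
    anemone.on_every_page do |page|
      puts "#{page.url} (#{page.response_time} ms)"
    end

    anemone.after_crawl do |pages|
      puts "#{pages.uniq.size} unique pages"
    end
  end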
data/bin/anemone
ADDED
data/lib/anemone/anemone.rb
CHANGED
@@ -3,42 +3,41 @@ require 'anemone/core'
 
 module Anemone
   # Version number
-  VERSION = '0.2.0'
-
-  #module-wide options
-  def Anemone.options=(options)
-    @options = options
-  end
+  VERSION = '0.2.1'
 
-
-
+  # default options
+  DEFAULTS = {
+    # run 4 Tentacle threads to fetch pages
+    :threads => 4,
+    # disable verbose output
+    :verbose => false,
+    # don't throw away the page response body after scanning it for links
+    :discard_page_bodies => false,
+    # identify self as Anemone/VERSION
+    :user_agent => "Anemone/#{VERSION}",
+    # no delay between requests
+    :delay => 0,
+    # don't obey the robots exclusion protocol
+    :obey_robots_txt => false,
+    # by default, don't limit the depth of the crawl
+    :depth_limit => false,
+    # number of times HTTP redirects will be followed
+    :redirect_limit => 5
+  }
+
+  def self.options
+    @options ||= OpenStruct.new(DEFAULTS)
   end
 
   #
   # Convenience method to start a crawl using Core
   #
   def Anemone.crawl(urls, options = {}, &block)
-    Anemone.options
-
-    # by default, run 4 Tentacle threads to fetch pages
-    Anemone.options.threads ||= 4
-
-    # disable verbose output by default
-    Anemone.options.verbose ||= false
-
-    # by default, don't throw away the page response body after scanning it for links
-    Anemone.options.discard_page_bodies ||= false
-
-    # by default, identify self as Anemone/VERSION
-    Anemone.options.user_agent ||= "Anemone/#{self::VERSION}"
+    options.each { |key, value| Anemone.options.send("#{key}=", value) }
 
-
-    Anemone.options.delay ||= 0
-
-    # by default, don't obey the robots exclusion protocol
-    if Anemone.options.obey_robots_txt ||= false
+    if Anemone.options.obey_robots_txt
       begin
-
+        require 'robots'
       rescue LoadError
        warn "To support the robot exclusion protocol, install the robots gem:\n" \
             "sudo gem sources -a http://gems.github.com\n" \
@@ -46,15 +45,10 @@ module Anemone
        exit
      end
    end
-
-    # by default, don't limit the depth of the crawl
-    Anemone.options.depth_limit ||= :infinity
 
    #use a single thread if a delay was requested
-    if
-
-    end
-
+    Anemone.options.threads = 1 if Anemone.options.delay > 0
+
    Core.crawl(urls, &block)
  end
 end
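The practical effect of the new DEFAULTS hash and the OpenStruct-backed Anemone.options is that per-crawl options simply overwrite module-wide defaults. A short sketch using only names shown in the diff above:

  require 'anemone'

  Anemone.options.threads      # => 4, from DEFAULTS
  Anemone.options.user_agent   # => "Anemone/0.2.1"

  # Anemone.crawl applies caller-supplied options by assignment onto the
  # same struct, exactly as the options.each line above does:
  { :threads => 2, :verbose => true }.each do |key, value|
    Anemone.options.send("#{key}=", value)
  end

  Anemone.options.threads      # => 2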
data/lib/anemone/cli.rb
ADDED
@@ -0,0 +1,24 @@
+module Anemone
+  module CLI
+    COMMANDS = %w[count cron pagedepth serialize url-list]
+
+    def self.run
+      command = ARGV.shift
+
+      if COMMANDS.include? command
+        load "anemone/cli/#{command.tr('-', '_')}.rb"
+      else
+        puts <<-INFO
+Anemone is a web spider framework that can collect
+useful information about pages it visits.
+
+Usage:
+  anemone <command> [arguments]
+
+Commands:
+  #{COMMANDS.join(', ')}
+        INFO
+      end
+    end
+  end
+end
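CLI.run shifts the sub-command off ARGV and loads the matching script from lib/anemone/cli. A minimal sketch of what running `anemone pagedepth <url>` amounts to, assuming the gem's lib directory is on the load path (the URL is a placeholder):

  require 'anemone/cli'

  # roughly equivalent to running: anemone pagedepth http://example.com/
  ARGV.replace(%w[pagedepth http://example.com/])
  Anemone::CLI.run   # shifts "pagedepth", then loads anemone/cli/pagedepth.rb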
data/lib/anemone/cli/count.rb
ADDED
@@ -0,0 +1,22 @@
+require 'anemone'
+
+begin
+  # make sure that the first option is a URL we can crawl
+  url = URI(ARGV[0])
+rescue
+  puts <<-INFO
+Usage:
+  anemone count <url>
+
+Synopsis:
+  Crawls a site starting at the given URL and outputs the total number
+  of unique pages on the site.
+INFO
+  exit(0)
+end
+
+Anemone.crawl(url) do |anemone|
+  anemone.after_crawl do |pages|
+    puts pages.uniq.size
+  end
+end
data/{bin/anemone_cron.rb → lib/anemone/cli/cron.rb}
CHANGED
@@ -1,44 +1,30 @@
-#! /usr/bin/env ruby
-# == Synopsis
-# Performs pagedepth, url list, and count functionality
-# Meant to be run daily as a cron job
-#
-# == Usage
-# anemone_url_list.rb [options] url
-#
-# == Options
-# -r, --relative Output relative URLs (rather than absolute)
-# -o, --output filename Filename to save URL list to. Defaults to urls.txt.
-#
-# == Author
-# Chris Kite
-
-$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
-
 require 'anemone'
 require 'optparse'
 require 'ostruct'
 
-def usage
-  puts <<END
-Usage: anemone_url_list.rb [options] url
-
-Options:
-  -r, --relative Output relative URLs (rather than absolute)
-  -o, --output filename Filename to save URL list to. Defautls to urls.txt.
-END
-end
-
 options = OpenStruct.new
 options.relative = false
 options.output_file = 'urls.txt'
 
-# make sure that the last option is a URL we can crawl
 begin
-
+  # make sure that the last argument is a URL we can crawl
+  root = URI(ARGV.last)
 rescue
-
-
+  puts <<-INFO
+Usage:
+  anemone cron [options] <url>
+
+Synopsis:
+  Combination of `count`, `pagedepth` and `url-list` commands.
+  Performs pagedepth, url list, and count functionality.
+  Outputs results to STDOUT and link list to file (urls.txt).
+  Meant to be run daily as a cron job.
+
+Options:
+  -r, --relative Output relative URLs (rather than absolute)
+  -o, --output filename Filename to save URL list to. Defautls to urls.txt.
+INFO
+  exit(0)
 end
 
 # parse command-line options
@@ -47,8 +33,6 @@ opts.on('-r', '--relative') { options.relative = true }
 opts.on('-o', '--output filename') {|o| options.output_file = o }
 opts.parse!(ARGV)
 
-root = ARGV.last
-
 Anemone.crawl(root, {:discard_page_bodies => true}) do |anemone|
 
   anemone.after_crawl do |pages|
@@ -101,6 +85,6 @@ Anemone.crawl(root, {:discard_page_bodies => true}) do |anemone|
      url = options.relative ? url.path.to_s : url.to_s
      file.puts url
    end
-
  end
-
+
+end
data/lib/anemone/cli/pagedepth.rb
ADDED
@@ -0,0 +1,32 @@
+require 'anemone'
+
+begin
+  # make sure that the first option is a URL we can crawl
+  root = URI(ARGV[0])
+rescue
+  puts <<-INFO
+Usage:
+  anemone pagedepth <url>
+
+Synopsis:
+  Crawls a site starting at the given URL and outputs a count of
+  the number of pages at each depth of the crawl.
+INFO
+  exit(0)
+end
+
+Anemone.crawl(root) do |anemone|
+  anemone.skip_links_like %r{^/c/$}, %r{^/stores/$}
+
+  anemone.after_crawl do |pages|
+    pages = pages.shortest_paths!(root).uniq
+
+    depths = pages.values.inject({}) do |depths, page|
+      depths[page.depth] ||= 0
+      depths[page.depth] += 1
+      depths
+    end
+
+    depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
+  end
+end
data/lib/anemone/cli/serialize.rb
ADDED
@@ -0,0 +1,35 @@
+require 'anemone'
+require 'optparse'
+require 'ostruct'
+
+begin
+  # make sure that the first option is a URL we can crawl
+  root = URI(ARGV[0])
+rescue
+  puts <<-INFO
+Usage:
+  anemone serialize [options] <url>
+
+Synopsis:
+  Crawls a site starting at the given URL and saves the resulting
+  PageHash object to a file using Marshal serialization.
+
+Options:
+  -o, --output filename Filename to save PageHash to. Defaults to crawl.{Time.now}
+INFO
+  exit(0)
+end
+
+options = OpenStruct.new
+options.output_file = "crawl.#{Time.now.to_i}"
+
+# parse command-line options
+opts = OptionParser.new
+opts.on('-o', '--output filename') {|o| options.output_file = o }
+opts.parse!(ARGV)
+
+Anemone.crawl(root) do |anemone|
+  anemone.after_crawl do |pages|
+    open(options.output_file, 'w') {|f| Marshal.dump(pages, f)}
+  end
+end
data/lib/anemone/cli/url_list.rb
ADDED
@@ -0,0 +1,41 @@
+require 'anemone'
+require 'optparse'
+require 'ostruct'
+
+options = OpenStruct.new
+options.relative = false
+
+begin
+  # make sure that the last option is a URL we can crawl
+  root = URI(ARGV.last)
+rescue
+  puts <<-INFO
+Usage:
+  anemone url-list [options] <url>
+
+Synopsis:
+  Crawls a site starting at the given URL, and outputs the URL of each page
+  in the domain as they are encountered.
+
+Options:
+  -r, --relative Output relative URLs (rather than absolute)
+INFO
+  exit(0)
+end
+
+# parse command-line options
+opts = OptionParser.new
+opts.on('-r', '--relative') { options.relative = true }
+opts.parse!(ARGV)
+
+Anemone.crawl(root, :discard_page_bodies => true) do |anemone|
+
+  anemone.on_every_page do |page|
+    if options.relative
+      puts page.url.path
+    else
+      puts page.url
+    end
+  end
+
+end
data/lib/anemone/core.rb
CHANGED
@@ -1,6 +1,7 @@
 require 'net/http'
 require 'thread'
 require 'anemone/tentacle'
+require 'anemone/page'
 require 'anemone/page_hash'
 
 module Anemone
@@ -12,10 +13,10 @@ module Anemone
    # Initialize the crawl with starting *urls* (single URL or Array of URLs)
    # and optional *block*
    #
-    def initialize(urls
-      @urls = [urls].flatten.map{ |url| URI
+    def initialize(urls)
+      @urls = [urls].flatten.map{ |url| url.is_a?(URI) ? url : URI(url) }
      @urls.each{ |url| url.path = '/' if url.path.empty? }
-
+
      @tentacles = []
      @pages = PageHash.new
      @on_every_page_blocks = []
@@ -26,18 +27,17 @@ module Anemone
      if Anemone.options.obey_robots_txt
        @robots = Robots.new(Anemone.options.user_agent)
      end
-
-
+
+      yield self if block_given?
    end
 
    #
    # Convenience method to start a new crawl
    #
-    def self.crawl(root
+    def self.crawl(root)
      self.new(root) do |core|
-
+        yield core if block_given?
        core.run
-        return core
      end
    end
 
@@ -104,7 +104,7 @@ module Anemone
      link_queue = Queue.new
      page_queue = Queue.new
 
-      Anemone.options.threads.times do
+      Anemone.options.threads.times do
        @tentacles << Thread.new { Tentacle.new(link_queue, page_queue).run }
      end
 
@@ -120,7 +120,7 @@ module Anemone
        # perform the on_every_page blocks for this page
        do_page_blocks(page)
 
-        page.
+        page.discard_doc! if Anemone.options.discard_page_bodies
 
        links_to_follow(page).each do |link|
          link_queue.enq([link, page])
@@ -143,7 +143,7 @@ module Anemone
        end
 
        if page_queue.empty?
-          @tentacles.size.times {
+          @tentacles.size.times { link_queue.enq(:END)}
          break
        end
      end
@@ -207,7 +207,7 @@ module Anemone
        too_deep = false
      end
 
-      !@pages.
+      !@pages.has_page?(link) && !skip_link?(link) && allowed && !too_deep
    end
 
    #
@@ -216,7 +216,7 @@ module Anemone
    #
    def skip_link?(link)
      @skip_link_patterns.each { |p| return true if link.path =~ p}
-
+      false
    end
 
  end
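One user-visible effect of the initialize change above: start URLs may now be given as URI objects or strings, singly or as an array, and the crawl block is yielded via block_given? instead of a stored proc. An illustrative sketch (placeholder URLs):

  require 'anemone'

  starts = [URI('http://example.com/'), 'http://example.com/docs/']

  Anemone.crawl(starts) do |anemone|
    anemone.on_every_page { |page| puts page.url }
  end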
data/lib/anemone/http.rb
CHANGED
@@ -1,16 +1,48 @@
-require 'net/
+require 'net/https'
+require 'anemone/page'
 
 module Anemone
-  class HTTP
+  class HTTP
    # Maximum number of redirects to follow on each get_response
    REDIRECTION_LIMIT = 5
-
+
+    def initialize
+      @connections = {}
+    end
+
+    #
+    # Create a new Page from the response of an HTTP request to *url*
+    #
+    def fetch_page(url, from_page = nil)
+      begin
+        url = URI(url) unless url.is_a?(URI)
+
+        if from_page
+          referer = from_page.url
+          depth = from_page.depth + 1
+        end
+
+        response, code, location, response_time = get(url, referer)
+
+        aka = nil
+        if !url.eql?(location)
+          aka = location
+        end
+
+        return Page.new(url, response.body.dup, code, response.to_hash, aka, referer, depth, response_time)
+      rescue
+        return Page.new(url)
+      end
+    end
+
+    private
+
    #
    # Retrieve an HTTP response for *url*, following redirects.
    # Returns the response object, response code, and final URI location.
    #
-    def
-      response = get_response(url, referer)
+    def get(url, referer = nil)
+      response, response_time = get_response(url, referer)
      code = Integer(response.code)
      loc = url
 
@@ -18,17 +50,17 @@ module Anemone
      while response.is_a?(Net::HTTPRedirection) and limit > 0
        loc = URI(response['location'])
        loc = url.merge(loc) if loc.relative?
-        response = get_response(loc, referer)
+        response, response_time = get_response(loc, referer)
        limit -= 1
      end
 
-      return response, code, loc
+      return response, code, loc, response_time
    end
 
    #
    # Get an HTTPResponse for *url*, sending the appropriate User-Agent string
    #
-    def
+    def get_response(url, referer = nil)
      full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"
      user_agent = Anemone.options.user_agent rescue nil
 
@@ -36,9 +68,37 @@ module Anemone
      opts['User-Agent'] = user_agent if user_agent
      opts['Referer'] = referer.to_s if referer
 
-
-
+      retries = 0
+      begin
+        start = Time.now()
+        response = connection(url).get(full_path, opts)
+        finish = Time.now()
+        response_time = ((finish - start) * 1000).round
+        return response, response_time
+      rescue EOFError
+        refresh_connection(url)
+        retries += 1
+        retry unless retries > 1
+      end
+    end
+
+    def connection(url)
+      @connections[url.host] ||= {}
+
+      if conn = @connections[url.host][url.port]
+        return conn
+      end
+
+      refresh_connection(url)
+    end
+
+    def refresh_connection(url)
+      http = Net::HTTP.new(url.host, url.port)
+      if url.scheme == 'https'
+        http.use_ssl = true
+        http.verify_mode = OpenSSL::SSL::VERIFY_NONE
      end
+      @connections[url.host][url.port] = http.start
    end
  end
 end
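Anemone::HTTP is now an instantiable object, so each Tentacle can keep its own cache of one persistent Net::HTTP connection per host:port, and fetch_page returns a Page carrying the new response_time. A standalone sketch (placeholder URL; assumes the 'anemone' lib directory is on the load path):

  require 'anemone'

  http = Anemone::HTTP.new
  page = http.fetch_page('https://example.com/')   # HTTPS handled via use_ssl

  puts page.content_type    # e.g. "text/html"
  puts page.response_time   # milliseconds, measured around the GET

  # a second request to the same host:port reuses the cached connection
  http.fetch_page('https://example.com/about')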
data/lib/anemone/page.rb
CHANGED
@@ -1,4 +1,3 @@
-require 'anemone/http'
 require 'nokogiri'
 require 'ostruct'
 
@@ -7,8 +6,6 @@ module Anemone
 
    # The URL of the page
    attr_reader :url
-    # Array of distinct A tag HREFs from the page
-    attr_reader :links
    # Headers of the HTTP response
    attr_reader :headers
 
@@ -27,74 +24,45 @@ module Anemone
    attr_accessor :depth
    # URL of the page that brought us to this page
    attr_accessor :referer
-
-
-    # Create a new Page from the response of an HTTP request to *url*
-    #
-    def self.fetch(url, from_page = nil)
-      begin
-        url = URI(url) unless url.is_a?(URI)
-
-        if from_page
-          referer = from_page.url
-          depth = from_page.depth + 1
-        end
-
-        response, code, location = Anemone::HTTP.get(url, referer)
-
-        aka = nil
-        if !url.eql?(location)
-          aka = location
-        end
-
-        return Page.new(url, response.body, code, response.to_hash, aka, referer, depth)
-      rescue
-        return Page.new(url)
-      end
-    end
+    # Response time of the request for this page in milliseconds
+    attr_accessor :response_time
 
    #
    # Create a new page
    #
-    def initialize(url, body = nil, code = nil, headers = nil, aka = nil, referer = nil, depth = 0)
+    def initialize(url, body = nil, code = nil, headers = nil, aka = nil, referer = nil, depth = 0, response_time = nil)
      @url = url
      @code = code
      @headers = headers
-      @
-      @aliases =
+      @headers['content-type'] ||= ['']
+      @aliases = Array(aka)
      @data = OpenStruct.new
      @referer = referer
      @depth = depth || 0
+      @response_time = response_time
+      @doc = Nokogiri::HTML(body) if body && html? rescue nil
+    end
 
-
-
-
-
-
-
-
-
-
-
-
-    #get a list of distinct links on the page, in absolute url form
-    @doc.css('a').each do |a|
-      u = a.attributes['href'].content if a.attributes['href']
-      next if u.nil?
-
-      begin
-        abs = to_absolute(URI(u))
-      rescue
-        next
-      end
-
-      @links << abs if in_domain?(abs)
-    end
-
-    @links.uniq!
+    # Array of distinct A tag HREFs from the page
+    def links
+      return @links unless @links.nil?
+      @links = []
+      return @links if !doc
+
+      doc.css('a').each do |a|
+        u = a.attributes['href'].content rescue nil
+        next if u.nil? or u.empty?
+        abs = to_absolute(URI(u)) rescue next
+        @links << abs if in_domain?(abs)
      end
+      @links.uniq!
+      @links
    end
 
+    def discard_doc!
+      links # force parsing of page links before we trash the document
+      @doc = nil
+    end
 
    #
    # Return a new page with the same *response* and *url*, but
@@ -124,7 +92,7 @@ module Anemone
    # *page_hash* is a PageHash object with the results of the current crawl.
    #
    def links_and_their_aliases(page_hash)
-
+      links.inject([]) do |results, link|
        results.concat([link].concat(page_hash[link].aliases))
      end
    end
@@ -133,7 +101,7 @@ module Anemone
    # The content-type returned by the HTTP request for this page
    #
    def content_type
-
+      headers['content-type'].first
    end
 
    #
@@ -141,7 +109,7 @@ module Anemone
    # otherwise.
    #
    def html?
-      (
+      !!(content_type =~ %r{^(text/html|application/xhtml+xml)\b})
    end
 
    #
data/lib/anemone/page_hash.rb
CHANGED
@@ -14,6 +14,18 @@ module Anemone
    def has_key?(key)
      super(key.to_s)
    end
+
+    # Does this PageHash contain the specified URL?
+    # HTTP and HTTPS versions of a URL are considered to be the same page.
+    def has_page?(url)
+      schemes = %w(http https)
+      if schemes.include? url.scheme
+        u = url.dup
+        return schemes.any? { |s| u.scheme = s; has_key?(u) }
+      end
+
+      has_key?(url)
+    end
 
    #
    # Use a breadth-first search to calculate the single-source
data/lib/anemone/tentacle.rb
CHANGED
@@ -1,4 +1,4 @@
-require 'anemone/
+require 'anemone/http'
 
 module Anemone
  class Tentacle
@@ -9,6 +9,7 @@ module Anemone
    def initialize(link_queue, page_queue)
      @link_queue = link_queue
      @page_queue = page_queue
+      @http = Anemone::HTTP.new
    end
 
    #
@@ -16,22 +17,16 @@ module Anemone
    # Page objects into @page_queue
    #
    def run
-
+      loop do
        link, from_page = @link_queue.deq
 
        break if link == :END
-
-
-          page = Page.fetch(link, from_page)
-        else
-          page = Page.fetch(link)
-        end
-
-        @page_queue.enq(page)
+
+        @page_queue.enq @http.fetch_page(link, from_page)
 
        sleep Anemone.options.delay
      end
    end
-
+
  end
 end
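Each Tentacle now owns its own HTTP instance (and therefore its own connection cache), while the :END sentinel remains how Core stops the loop. A minimal sketch of the queue protocol Core uses (placeholder URL):

  require 'anemone'

  link_queue, page_queue = Queue.new, Queue.new

  worker = Thread.new { Anemone::Tentacle.new(link_queue, page_queue).run }

  link_queue.enq([URI('http://example.com/'), nil])   # [link, from_page], as Core enqueues them
  link_queue.enq(:END)                                # sentinel that breaks the loop

  worker.join
  puts page_queue.deq.url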
data/spec/anemone_spec.rb
CHANGED
@@ -1,6 +1,15 @@
 require File.dirname(__FILE__) + '/spec_helper'
 
 describe Anemone do
+
+  before(:all) do
+    Anemone::FakePage.new
+  end
+
+  after(:each) do
+    # reset global options object to defaults
+    Anemone::DEFAULTS.each { |key, value| Anemone.options.send("#{key}=", value) }
+  end
 
  it "should have a version" do
    Anemone.const_defined?('VERSION').should == true
@@ -17,6 +26,7 @@ describe Anemone do
                   :user_agent => 'test',
                   :obey_robots_txt => true,
                   :depth_limit => 3)
+
    Anemone.options.verbose.should == false
    Anemone.options.threads.should == 2
    Anemone.options.discard_page_bodies.should == true
data/spec/core_spec.rb
CHANGED
@@ -139,43 +139,39 @@ module Anemone
      urls.should_not include(pages[1].url)
    end
 
-
-
-
-
-
-
-
-
-
+    describe "many pages" do
+      before(:each) do
+        @pages, size = [], 5
+
+        size.times do |n|
+          # register this page with a link to the next page
+          link = (n + 1).to_s if n + 1 < size
+          @pages << FakePage.new(n.to_s, :links => Array(link))
+        end
      end
-
-
-
-
-
-
-
+
+      it "should track the page depth and referer" do
+        core = Anemone.crawl(@pages[0].url)
+        previous_page = nil
+
+        @pages.each_with_index do |page, i|
+          page = core.pages[page.url]
+          page.should be
+          page.depth.should == i
+
+          if previous_page
+            page.referer.should == previous_page.url
+          else
+            page.referer.should be_nil
+          end
+          previous_page = page
+        end
      end
-
-      core.pages[pages[0].url].referer.should == nil
-    end
 
-
-
-
-      pages = []
-
-      num_pages.times do |n|
-        # register this page with a link to the next page
-        link = (n + 1).to_s if n + 1 < num_pages
-        pages << FakePage.new(n.to_s, :links => [link].compact)
+      it "should optionally limit the depth of the crawl" do
+        core = Anemone.crawl(@pages[0].url, :depth_limit => 3)
+        core.should have(4).pages
      end
-
-      core = Anemone.crawl(pages[0].url, :depth_limit => 3)
-
-      core.should have(4).pages
    end
-
  end
 end
data/spec/page_spec.rb
CHANGED
@@ -2,14 +2,13 @@ require File.dirname(__FILE__) + '/spec_helper'
 
 module Anemone
  describe Page do
-
-    before(:
-      @
+
+    before(:all) do
+      @http = Anemone::HTTP.new
    end
-
-
-      @page.
-      @page.url.to_s.should include('home')
+
+    before(:each) do
+      @page = @http.fetch_page(FakePage.new('home').url)
    end
 
    it "should store the response headers when fetching a page" do
@@ -35,7 +34,7 @@ module Anemone
 
      @page.redirect?.should == false
 
-
+      @http.fetch_page(FakePage.new('redir', :redirect => 'home').url).redirect?.should == true
    end
 
    it "should have a method to tell if a URI is in the same domain as the page" do
@@ -44,6 +43,10 @@ module Anemone
      @page.in_domain?(URI(FakePage.new('test').url)).should == true
      @page.in_domain?(URI('http://www.other.com/')).should == false
    end
+
+    it "should include the response time for the HTTP request" do
+      @page.should respond_to(:response_time)
+    end
 
  end
 end
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: anemone
 version: !ruby/object:Gem::Version
-  version: 0.2.0
+  version: 0.2.1
 platform: ruby
 authors:
 - Chris Kite
@@ -9,7 +9,7 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2009-
+date: 2009-10-24 00:00:00 -05:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -25,11 +25,7 @@ dependencies:
 description:
 email:
 executables:
-
-- anemone_cron.rb
-- anemone_pagedepth.rb
-- anemone_serialize.rb
-- anemone_url_list.rb
+- anemone
 extensions: []
 
 extra_rdoc_files:
@@ -37,11 +33,7 @@ extra_rdoc_files:
 files:
 - LICENSE.txt
 - README.rdoc
-- bin/
-- bin/anemone_cron.rb
-- bin/anemone_pagedepth.rb
-- bin/anemone_serialize.rb
-- bin/anemone_url_list.rb
+- bin/anemone
 - lib/anemone.rb
 - lib/anemone/anemone.rb
 - lib/anemone/core.rb
@@ -49,6 +41,12 @@ files:
 - lib/anemone/page.rb
 - lib/anemone/page_hash.rb
 - lib/anemone/tentacle.rb
+- lib/anemone/cli.rb
+- lib/anemone/cli/url_list.rb
+- lib/anemone/cli/cron.rb
+- lib/anemone/cli/count.rb
+- lib/anemone/cli/pagedepth.rb
+- lib/anemone/cli/serialize.rb
 has_rdoc: true
 homepage: http://anemone.rubyforge.org
 post_install_message:
data/bin/anemone_count.rb
DELETED
@@ -1,36 +0,0 @@
-#! /usr/bin/env ruby
-# == Synopsis
-# Crawls a site starting at the given URL, and outputs the total number
-# of unique pages on the site.
-#
-# == Usage
-# anemone_count.rb url
-#
-# == Author
-# Chris Kite
-
-$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
-
-require 'anemone'
-
-def usage
-  puts <<END
-Usage: anemone_count.rb url
-END
-end
-
-# make sure that the first option is a URL we can crawl
-begin
-  URI(ARGV[0])
-rescue
-  usage
-  Process.exit
-end
-
-Anemone.crawl(ARGV[0]) do |anemone|
-  anemone.after_crawl do |pages|
-    puts pages.uniq.size
-  end
-end
-
-
data/bin/anemone_pagedepth.rb
DELETED
@@ -1,44 +0,0 @@
-#! /usr/bin/env ruby
-# == Synopsis
-# Crawls a site starting at the given URL, and outputs a count of
-# the number of Pages at each depth in the site.
-#
-# == Usage
-# anemone_pagedepth.rb url
-#
-# == Author
-# Chris Kite
-
-$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
-
-require 'anemone'
-
-def usage
-  puts <<END
-Usage: anemone_pagedepth.rb url
-END
-end
-
-# make sure that the first option is a URL we can crawl
-begin
-  URI(ARGV[0])
-rescue
-  usage
-  Process.exit
-end
-
-root = ARGV[0]
-Anemone.crawl(root) do |anemone|
-  anemone.skip_links_like %r{^/c/$}, %r{^/stores/$}
-
-  anemone.after_crawl do |pages|
-    pages = pages.shortest_paths!(root).uniq
-    depths = pages.values.inject({}) do |depths, page|
-      depths[page.depth] ||= 0
-      depths[page.depth] += 1
-      depths
-    end
-
-    depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
-  end
-end
data/bin/anemone_serialize.rb
DELETED
@@ -1,51 +0,0 @@
-#! /usr/bin/env ruby
-# == Synopsis
-# Crawls a site starting at the given URL, and saves the resulting
-# PageHash object to a file using Marshal serialization.
-#
-# == Usage
-# anemone_serialize.rb [options] url
-#
-# == Options
-# -o, --output filename Filename to save PageHash to. Defaults to crawl.{Time.now}
-#
-# == Author
-# Chris Kite
-
-$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
-
-require 'anemone'
-require 'optparse'
-require 'ostruct'
-
-def usage
-  puts <<END
-Usage: anemone_serialize.rb [options] url
-
-Options:
-  -o, --output filename Filename to save PageHash to. Defaults to crawl.{Time.now}
-END
-end
-
-# make sure that the first option is a URL we can crawl
-begin
-  URI(ARGV[0])
-rescue
-  usage
-  Process.exit
-end
-
-options = OpenStruct.new
-options.output_file = "crawl.#{Time.now.to_i}"
-
-# parse command-line options
-opts = OptionParser.new
-opts.on('-o', '--output filename') {|o| options.output_file = o }
-opts.parse!(ARGV)
-
-root = ARGV[0]
-Anemone.crawl(root) do |anemone|
-  anemone.after_crawl do |pages|
-    open(options.output_file, 'w') {|f| Marshal.dump(pages, f)}
-  end
-end
data/bin/anemone_url_list.rb
DELETED
@@ -1,54 +0,0 @@
-#! /usr/bin/env ruby
-# == Synopsis
-# Crawls a site starting at the given URL, and outputs the URL of each page
-# in the domain as they are encountered.
-#
-# == Usage
-# anemone_url_list.rb [options] url
-#
-# == Options
-# -r, --relative Output relative URLs (rather than absolute)
-#
-# == Author
-# Chris Kite
-
-$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
-
-require 'anemone'
-require 'optparse'
-require 'ostruct'
-
-def usage
-  puts <<END
-Usage: anemone_url_list.rb [options] url
-
-Options:
-  -r, --relative Output relative URLs (rather than absolute)
-END
-end
-
-options = OpenStruct.new
-options.relative = false
-
-# make sure that the last option is a URL we can crawl
-begin
-  URI(ARGV.last)
-rescue
-  usage
-  Process.exit
-end
-
-# parse command-line options
-opts = OptionParser.new
-opts.on('-r', '--relative') { options.relative = true }
-opts.parse!(ARGV)
-
-Anemone.crawl(ARGV.last, :discard_page_bodies => true) do |anemone|
-  anemone.on_every_page do |page|
-    if options.relative
-      puts page.url.path
-    else
-      puts page.url
-    end
-  end
-end