anemone 0.2.2 → 0.2.3
- data/CHANGELOG.rdoc +27 -0
- data/README.rdoc +1 -3
- data/lib/anemone.rb +1 -1
- data/lib/anemone/core.rb +57 -24
- data/lib/anemone/http.rb +19 -6
- data/lib/anemone/page.rb +1 -1
- data/lib/anemone/tentacle.rb +11 -4
- data/spec/anemone_spec.rb +0 -35
- data/spec/core_spec.rb +28 -2
- data/spec/fakeweb_helper.rb +0 -1
- metadata +13 -3
- data/lib/anemone/anemone.rb +0 -54
data/CHANGELOG.rdoc
ADDED
@@ -0,0 +1,27 @@
+== 0.2.3 / 2009-11-01
+
+* Minor enhancements
+
+  * Options are now applied per-crawl, rather than module-wide.
+
+* Bug fixes
+
+  * Fixed a bug which caused deadlock if an exception occurred when crawling the last page in the queue.
+
+== 0.2.2 / 2009-10-26
+
+* Minor enhancements
+
+  * When the :verbose option is set to true, exception backtraces are printed to aid debugging.
+
+== 0.2.1 / 2009-10-24
+
+* Major enhancements
+
+  * Added HTTPS support.
+  * CLI program 'anemone', which is a frontend for several tasks.
+
+* Minor enhancements
+
+  * HTTP request response time recorded in Page.
+  * Use of persistent HTTP connections.
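The headline change in 0.2.3 is visible throughout the diffs below: options are applied per crawl instead of being stored module-wide, so settings from one crawl no longer leak into the next. A minimal sketch of the new calling style, with a placeholder URL and hypothetical skip patterns:

    require 'anemone'

    # Options are passed to the crawl itself and merged over Core::DEFAULT_OPTS.
    core = Anemone.crawl("http://example.com/", :verbose => true, :depth_limit => 3) do |a|
      # skip_links_like takes one or more Regexp patterns
      a.skip_links_like /login/, /\.pdf$/
    end

    core.opts[:depth_limit]   # => 3, readable on the returned Core
    core.pages.keys           # => URLs of every crawled page (a PageHash)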
data/README.rdoc
CHANGED
data/lib/anemone.rb
CHANGED
@@ -1,2 +1,2 @@
 require 'rubygems'
-require 'anemone/
+require 'anemone/core'
data/lib/anemone/core.rb
CHANGED
@@ -1,19 +1,51 @@
-require 'net/http'
 require 'thread'
+require 'robots'
 require 'anemone/tentacle'
 require 'anemone/page'
 require 'anemone/page_hash'
 
 module Anemone
+
+  VERSION = '0.2.3';
+
+  #
+  # Convenience method to start a crawl
+  #
+  def Anemone.crawl(urls, options = {}, &block)
+    Core.crawl(urls, options, &block)
+  end
+
   class Core
     # PageHash storing all Page objects encountered during the crawl
     attr_reader :pages
-
+
+    # Hash of options for the crawl
+    attr_accessor :opts
+
+    DEFAULT_OPTS = {
+      # run 4 Tentacle threads to fetch pages
+      :threads => 4,
+      # disable verbose output
+      :verbose => false,
+      # don't throw away the page response body after scanning it for links
+      :discard_page_bodies => false,
+      # identify self as Anemone/VERSION
+      :user_agent => "Anemone/#{Anemone::VERSION}",
+      # no delay between requests
+      :delay => 0,
+      # don't obey the robots exclusion protocol
+      :obey_robots_txt => false,
+      # by default, don't limit the depth of the crawl
+      :depth_limit => false,
+      # number of times HTTP redirects will be followed
+      :redirect_limit => 5
+    }
+
    #
    # Initialize the crawl with starting *urls* (single URL or Array of URLs)
    # and optional *block*
    #
-    def initialize(urls)
+    def initialize(urls, opts = {})
      @urls = [urls].flatten.map{ |url| url.is_a?(URI) ? url : URI(url) }
      @urls.each{ |url| url.path = '/' if url.path.empty? }
 
@@ -23,10 +55,8 @@ module Anemone
      @on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
      @skip_link_patterns = []
      @after_crawl_blocks = []
-
-
-      @robots = Robots.new(Anemone.options.user_agent)
-      end
+
+      process_options opts
 
      yield self if block_given?
    end
@@ -34,8 +64,8 @@ module Anemone
    #
    # Convenience method to start a new crawl
    #
-    def self.crawl(
-      self.new(
+    def self.crawl(urls, opts = {})
+      self.new(urls, opts) do |core|
        yield core if block_given?
        core.run
      end
@@ -55,11 +85,7 @@ module Anemone
    # followed
    #
    def skip_links_like(*patterns)
-
-      patterns.each do |pattern|
-        @skip_link_patterns << pattern
-      end
-    end
+      @skip_link_patterns.concat [patterns].flatten.compact
      self
    end
 
@@ -104,8 +130,8 @@ module Anemone
      link_queue = Queue.new
      page_queue = Queue.new
 
-
-      @tentacles << Thread.new { Tentacle.new(link_queue, page_queue).run }
+      @opts[:threads].times do
+        @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
      end
 
      @urls.each{ |url| link_queue.enq(url) }
@@ -115,12 +141,12 @@ module Anemone
 
        @pages[page.url] = page
 
-        puts "#{page.url} Queue: #{link_queue.size}" if
+        puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
 
        # perform the on_every_page blocks for this page
        do_page_blocks(page)
 
-        page.discard_doc! if
+        page.discard_doc! if @opts[:discard_page_bodies]
 
        links_to_follow(page).each do |link|
          link_queue.enq([link, page])
@@ -158,7 +184,15 @@ module Anemone
    end
 
    private
-
+
+    def process_options(options)
+      @opts = DEFAULT_OPTS.merge options
+
+      @opts[:threads] = 1 if @opts[:delay] > 0
+
+      @robots = Robots.new(@opts[:user_agent]) if @opts[:obey_robots_txt]
+    end
+
    #
    # Execute the after_crawl blocks
    #
@@ -199,10 +233,10 @@ module Anemone
    # Returns +false+ otherwise.
    #
    def visit_link?(link, from_page = nil)
-      allowed =
+      allowed = @opts[:obey_robots_txt] ? @robots.allowed?(link) : true
 
-      if from_page
-        too_deep = from_page.depth >=
+      if from_page && @opts[:depth_limit]
+        too_deep = from_page.depth >= @opts[:depth_limit]
      else
        too_deep = false
      end
@@ -215,8 +249,7 @@ module Anemone
    # its URL matches a skip_link pattern.
    #
    def skip_link?(link)
-      @skip_link_patterns.
-      false
+      @skip_link_patterns.any? { |p| link.path =~ p }
    end
 
  end
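The net effect of the new process_options is that each crawl starts from DEFAULT_OPTS and only overrides what the caller supplies, with two adjustments: a delay forces a single Tentacle thread, and a Robots checker is only built when :obey_robots_txt is set. A rough sketch of the observable behaviour, using a placeholder URL:

    core = Anemone.crawl("http://example.com/", :threads => 2, :delay => 0.01)
    core.opts[:threads]         # => 1, the delay overrides the requested thread count
    core.opts[:redirect_limit]  # => 5, untouched default from DEFAULT_OPTS
    core.opts[:user_agent]      # => "Anemone/0.2.3", also an untouched default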
data/lib/anemone/http.rb
CHANGED
@@ -4,10 +4,11 @@ require 'anemone/page'
 module Anemone
   class HTTP
     # Maximum number of redirects to follow on each get_response
-
+    REDIRECT_LIMIT = 5
 
-    def initialize
+    def initialize(opts = {})
      @connections = {}
+      @opts = opts
    end
 
    #
@@ -31,7 +32,7 @@ module Anemone
 
        return Page.new(url, response.body.dup, code, response.to_hash, aka, referer, depth, response_time)
      rescue => e
-        if
+        if verbose?
          puts e.inspect
          puts e.backtrace
        end
@@ -50,7 +51,7 @@ module Anemone
      code = Integer(response.code)
      loc = url
 
-      limit =
+      limit = redirect_limit
      while response.is_a?(Net::HTTPRedirection) and limit > 0
        loc = URI(response['location'])
        loc = url.merge(loc) if loc.relative?
@@ -66,7 +67,6 @@ module Anemone
    #
    def get_response(url, referer = nil)
      full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"
-      user_agent = Anemone.options.user_agent rescue nil
 
      opts = {}
      opts['User-Agent'] = user_agent if user_agent
@@ -82,7 +82,7 @@ module Anemone
      rescue EOFError
        refresh_connection(url)
        retries += 1
-        retry unless retries >
+        retry unless retries > 3
      end
    end
 
@@ -104,5 +104,18 @@ module Anemone
      end
      @connections[url.host][url.port] = http.start
    end
+
+    def redirect_limit
+      @opts[:redirect_limit] || REDIRECT_LIMIT
+    end
+
+    def user_agent
+      @opts[:user_agent]
+    end
+
+    def verbose?
+      @opts[:verbose]
+    end
+
  end
 end
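Because Core now hands its options hash to every Tentacle, and each Tentacle passes it on to HTTP.new, per-request behaviour is configured in the same call as everything else. A sketch of how those options are expected to flow through, with a placeholder URL and agent string:

    Anemone.crawl("http://example.com/",
                  :user_agent     => "MyBot/1.0",  # sent as the User-Agent header
                  :redirect_limit => 2,            # overrides the REDIRECT_LIMIT default of 5
                  :verbose        => true)         # prints exception backtraces on fetch errors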
data/lib/anemone/page.rb
CHANGED
@@ -33,7 +33,7 @@
    def initialize(url, body = nil, code = nil, headers = nil, aka = nil, referer = nil, depth = 0, response_time = nil)
      @url = url
      @code = code
-      @headers = headers
+      @headers = headers || {}
      @headers['content-type'] ||= ['']
      @aliases = Array(aka)
      @data = OpenStruct.new
data/lib/anemone/tentacle.rb
CHANGED
@@ -6,10 +6,11 @@ module Anemone
    #
    # Create a new Tentacle
    #
-    def initialize(link_queue, page_queue)
+    def initialize(link_queue, page_queue, opts = {})
      @link_queue = link_queue
      @page_queue = page_queue
-      @http = Anemone::HTTP.new
+      @http = Anemone::HTTP.new(opts)
+      @opts = opts
    end
 
    #
@@ -22,11 +23,17 @@ module Anemone
 
        break if link == :END
 
-        @page_queue
+        @page_queue << @http.fetch_page(link, from_page)
 
-
+        delay
      end
    end
 
+    private
+
+    def delay
+      sleep @opts[:delay] if @opts[:delay]
+    end
+
  end
 end
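With the delay now handled inside each Tentacle, and threads forced to 1 by Core whenever a delay is set, a polite crawl is just another per-crawl option. A one-line sketch with a placeholder URL:

    # Sleep 0.5 seconds between requests; process_options drops :threads to 1
    # so the delay is honoured across the whole crawl.
    Anemone.crawl("http://example.com/", :delay => 0.5)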
data/spec/anemone_spec.rb
CHANGED
@@ -2,45 +2,10 @@ require File.dirname(__FILE__) + '/spec_helper'
 
 describe Anemone do
 
-  before(:all) do
-    Anemone::FakePage.new
-  end
-
-  after(:each) do
-    # reset global options object to defaults
-    Anemone::DEFAULTS.each { |key, value| Anemone.options.send("#{key}=", value) }
-  end
-
   it "should have a version" do
     Anemone.const_defined?('VERSION').should == true
   end
 
-  it "should have options" do
-    Anemone.should respond_to(:options)
-  end
-
-  it "should accept options for the crawl" do
-    Anemone.crawl(SPEC_DOMAIN, :verbose => false,
-                  :threads => 2,
-                  :discard_page_bodies => true,
-                  :user_agent => 'test',
-                  :obey_robots_txt => true,
-                  :depth_limit => 3)
-
-    Anemone.options.verbose.should == false
-    Anemone.options.threads.should == 2
-    Anemone.options.discard_page_bodies.should == true
-    Anemone.options.delay.should == 0
-    Anemone.options.user_agent.should == 'test'
-    Anemone.options.obey_robots_txt.should == true
-    Anemone.options.depth_limit.should == 3
-  end
-
-  it "should use 1 thread if a delay is requested" do
-    Anemone.crawl(SPEC_DOMAIN, :delay => 0.01, :threads => 2)
-    Anemone.options.threads.should == 1
-  end
-
   it "should return a Anemone::Core from the crawl, which has a PageHash" do
     result = Anemone.crawl(SPEC_DOMAIN)
     result.should be_an_instance_of(Anemone::Core)
data/spec/core_spec.rb
CHANGED
@@ -64,13 +64,15 @@ module Anemone
      pages << FakePage.new('0', :links => ['1', '2'])
      pages << FakePage.new('1')
      pages << FakePage.new('2')
-
+      pages << FakePage.new('3')
+
      core = Anemone.crawl(pages[0].url) do |a|
-        a.skip_links_like /1/
+        a.skip_links_like /1/, /3/
      end
 
      core.should have(2).pages
      core.pages.keys.should_not include(pages[1].url)
+      core.pages.keys.should_not include(pages[3].url)
    end
 
    it "should be able to call a block on every page" do
@@ -173,5 +175,29 @@ module Anemone
      core.should have(4).pages
    end
  end
+
+  describe "options" do
+    it "should accept options for the crawl" do
+      core = Anemone.crawl(SPEC_DOMAIN, :verbose => false,
+                           :threads => 2,
+                           :discard_page_bodies => true,
+                           :user_agent => 'test',
+                           :obey_robots_txt => true,
+                           :depth_limit => 3)
+
+      core.opts[:verbose].should == false
+      core.opts[:threads].should == 2
+      core.opts[:discard_page_bodies].should == true
+      core.opts[:delay].should == 0
+      core.opts[:user_agent].should == 'test'
+      core.opts[:obey_robots_txt].should == true
+      core.opts[:depth_limit].should == 3
+    end
+
+    it "should use 1 thread if a delay is requested" do
+      Anemone.crawl(SPEC_DOMAIN, :delay => 0.01, :threads => 2).opts[:threads].should == 1
+    end
+  end
+
 end
 end
data/spec/fakeweb_helper.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: anemone
 version: !ruby/object:Gem::Version
-  version: 0.2.
+  version: 0.2.3
 platform: ruby
 authors:
 - Chris Kite
@@ -9,7 +9,7 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2009-
+date: 2009-11-01 01:00:00 -05:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -22,6 +22,16 @@ dependencies:
      - !ruby/object:Gem::Version
        version: 1.3.0
  version:
+- !ruby/object:Gem::Dependency
+  name: robots
+  type: :runtime
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.7.2
+  version:
 description:
 email:
 executables:
@@ -32,10 +42,10 @@ extra_rdoc_files:
 - README.rdoc
 files:
 - LICENSE.txt
+- CHANGELOG.rdoc
 - README.rdoc
 - bin/anemone
 - lib/anemone.rb
-- lib/anemone/anemone.rb
 - lib/anemone/core.rb
 - lib/anemone/http.rb
 - lib/anemone/page.rb
data/lib/anemone/anemone.rb
DELETED
@@ -1,54 +0,0 @@
-require 'ostruct'
-require 'anemone/core'
-
-module Anemone
-  # Version number
-  VERSION = '0.2.2'
-
-  # default options
-  DEFAULTS = {
-    # run 4 Tentacle threads to fetch pages
-    :threads => 4,
-    # disable verbose output
-    :verbose => false,
-    # don't throw away the page response body after scanning it for links
-    :discard_page_bodies => false,
-    # identify self as Anemone/VERSION
-    :user_agent => "Anemone/#{VERSION}",
-    # no delay between requests
-    :delay => 0,
-    # don't obey the robots exclusion protocol
-    :obey_robots_txt => false,
-    # by default, don't limit the depth of the crawl
-    :depth_limit => false,
-    # number of times HTTP redirects will be followed
-    :redirect_limit => 5
-  }
-
-  def self.options
-    @options ||= OpenStruct.new(DEFAULTS)
-  end
-
-  #
-  # Convenience method to start a crawl using Core
-  #
-  def Anemone.crawl(urls, options = {}, &block)
-    options.each { |key, value| Anemone.options.send("#{key}=", value) }
-
-    if Anemone.options.obey_robots_txt
-      begin
-        require 'robots'
-      rescue LoadError
-        warn "To support the robot exclusion protocol, install the robots gem:\n" \
-             "sudo gem sources -a http://gems.github.com\n" \
-             "sudo gem install fizx-robots"
-        exit
-      end
-    end
-
-    #use a single thread if a delay was requested
-    Anemone.options.threads = 1 if Anemone.options.delay > 0
-
-    Core.crawl(urls, &block)
-  end
-end