anemone 0.1.2 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/anemone/anemone.rb +21 -6
- data/lib/anemone/core.rb +23 -9
- data/lib/anemone/http.rb +11 -5
- data/lib/anemone/page.rb +17 -7
- data/lib/anemone/page_hash.rb +14 -0
- data/lib/anemone/tentacle.rb +7 -3
- data/spec/anemone_spec.rb +5 -1
- data/spec/core_spec.rb +57 -4
- data/spec/fakeweb_helper.rb +5 -2
- data/spec/spec_helper.rb +1 -1
- metadata +2 -2
data/lib/anemone/anemone.rb
CHANGED
@@ -3,7 +3,7 @@ require 'anemone/core'
|
|
3
3
|
|
4
4
|
module Anemone
|
5
5
|
# Version number
|
6
|
-
VERSION = '0.1.2'
|
6
|
+
VERSION = '0.2.0'
|
7
7
|
|
8
8
|
#module-wide options
|
9
9
|
def Anemone.options=(options)
|
@@ -20,21 +20,36 @@ module Anemone
|
|
20
20
|
def Anemone.crawl(urls, options = {}, &block)
|
21
21
|
Anemone.options = OpenStruct.new(options)
|
22
22
|
|
23
|
-
#by default, run 4 Tentacle threads to fetch pages
|
23
|
+
# by default, run 4 Tentacle threads to fetch pages
|
24
24
|
Anemone.options.threads ||= 4
|
25
25
|
|
26
|
-
#disable verbose output by default
|
26
|
+
# disable verbose output by default
|
27
27
|
Anemone.options.verbose ||= false
|
28
28
|
|
29
|
-
#by default, don't throw away the page response body after scanning it for links
|
29
|
+
# by default, don't throw away the page response body after scanning it for links
|
30
30
|
Anemone.options.discard_page_bodies ||= false
|
31
31
|
|
32
|
-
#by default, identify self as Anemone/VERSION
|
32
|
+
# by default, identify self as Anemone/VERSION
|
33
33
|
Anemone.options.user_agent ||= "Anemone/#{self::VERSION}"
|
34
34
|
|
35
|
-
#no delay between requests by default
|
35
|
+
# no delay between requests by default
|
36
36
|
Anemone.options.delay ||= 0
|
37
|
+
|
38
|
+
# by default, don't obey the robots exclusion protocol
|
39
|
+
if Anemone.options.obey_robots_txt ||= false
|
40
|
+
begin
|
41
|
+
require 'robots'
|
42
|
+
rescue LoadError
|
43
|
+
warn "To support the robot exclusion protocol, install the robots gem:\n" \
|
44
|
+
"sudo gem sources -a http://gems.github.com\n" \
|
45
|
+
"sudo gem install fizx-robots"
|
46
|
+
exit
|
47
|
+
end
|
48
|
+
end
|
37
49
|
|
50
|
+
# by default, don't limit the depth of the crawl
|
51
|
+
Anemone.options.depth_limit ||= :infinity
|
52
|
+
|
38
53
|
#use a single thread if a delay was requested
|
39
54
|
if(Anemone.options.delay != 0)
|
40
55
|
Anemone.options.threads = 1
|
data/lib/anemone/core.rb
CHANGED
@@ -23,6 +23,10 @@ module Anemone
|
|
23
23
|
@skip_link_patterns = []
|
24
24
|
@after_crawl_blocks = []
|
25
25
|
|
26
|
+
if Anemone.options.obey_robots_txt
|
27
|
+
@robots = Robots.new(Anemone.options.user_agent)
|
28
|
+
end
|
29
|
+
|
26
30
|
block.call(self) if block
|
27
31
|
end
|
28
32
|
|
@@ -113,18 +117,18 @@ module Anemone
|
|
113
117
|
|
114
118
|
puts "#{page.url} Queue: #{link_queue.size}" if Anemone.options.verbose
|
115
119
|
|
116
|
-
#perform the on_every_page blocks for this page
|
120
|
+
# perform the on_every_page blocks for this page
|
117
121
|
do_page_blocks(page)
|
118
122
|
|
119
123
|
page.doc = nil if Anemone.options.discard_page_bodies
|
120
124
|
|
121
125
|
links_to_follow(page).each do |link|
|
122
|
-
link_queue.enq(link)
|
126
|
+
link_queue.enq([link, page])
|
123
127
|
@pages[link] = nil
|
124
128
|
end
|
125
129
|
|
126
|
-
#create an entry in the page hash for each alias of this page,
|
127
|
-
#i.e. all the pages that redirected to this page
|
130
|
+
# create an entry in the page hash for each alias of this page,
|
131
|
+
# i.e. all the pages that redirected to this page
|
128
132
|
page.aliases.each do |aka|
|
129
133
|
if !@pages.has_key?(aka) or @pages[aka].nil?
|
130
134
|
@pages[aka] = page.alias_clone(aka)
|
@@ -184,16 +188,26 @@ module Anemone
|
|
184
188
|
#
|
185
189
|
def links_to_follow(page)
|
186
190
|
links = @focus_crawl_block ? @focus_crawl_block.call(page) : page.links
|
187
|
-
links.select { |link| visit_link?(link) }
|
191
|
+
links.select { |link| visit_link?(link, page) }
|
188
192
|
end
|
189
193
|
|
190
194
|
#
|
191
195
|
# Returns +true+ if *link* has not been visited already,
|
192
|
-
# and is not excluded by a skip_link pattern
|
193
|
-
#
|
196
|
+
# and is not excluded by a skip_link pattern...
|
197
|
+
# and is not excluded by robots.txt...
|
198
|
+
# and is not deeper than the depth limit
|
199
|
+
# Returns +false+ otherwise.
|
194
200
|
#
|
195
|
-
def visit_link?(link)
|
196
|
-
!@pages.has_key?(link) and !skip_link?(link)
|
201
|
+
def visit_link?(link, from_page = nil)
|
202
|
+
allowed = Anemone.options.obey_robots_txt ? @robots.allowed?(link) : true
|
203
|
+
|
204
|
+
if from_page
|
205
|
+
too_deep = from_page.depth >= Anemone.options.depth_limit rescue false
|
206
|
+
else
|
207
|
+
too_deep = false
|
208
|
+
end
|
209
|
+
|
210
|
+
!@pages.has_key?(link) and !skip_link?(link) and allowed and !too_deep
|
197
211
|
end
|
198
212
|
|
199
213
|
#
|
data/lib/anemone/http.rb
CHANGED
@@ -9,8 +9,8 @@ module Anemone
|
|
9
9
|
# Retrieve an HTTP response for *url*, following redirects.
|
10
10
|
# Returns the response object, response code, and final URI location.
|
11
11
|
#
|
12
|
-
def self.get(url)
|
13
|
-
response = get_response(url)
|
12
|
+
def self.get(url, referer = nil)
|
13
|
+
response = get_response(url, referer)
|
14
14
|
code = Integer(response.code)
|
15
15
|
loc = url
|
16
16
|
|
@@ -18,7 +18,7 @@ module Anemone
|
|
18
18
|
while response.is_a?(Net::HTTPRedirection) and limit > 0
|
19
19
|
loc = URI(response['location'])
|
20
20
|
loc = url.merge(loc) if loc.relative?
|
21
|
-
response = get_response(loc)
|
21
|
+
response = get_response(loc, referer)
|
22
22
|
limit -= 1
|
23
23
|
end
|
24
24
|
|
@@ -28,10 +28,16 @@ module Anemone
|
|
28
28
|
#
|
29
29
|
# Get an HTTPResponse for *url*, sending the appropriate User-Agent string
|
30
30
|
#
|
31
|
-
def self.get_response(url)
|
31
|
+
def self.get_response(url, referer = nil)
|
32
32
|
full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"
|
33
|
+
user_agent = Anemone.options.user_agent rescue nil
|
34
|
+
|
35
|
+
opts = {}
|
36
|
+
opts['User-Agent'] = user_agent if user_agent
|
37
|
+
opts['Referer'] = referer.to_s if referer
|
38
|
+
|
33
39
|
Net::HTTP.start(url.host, url.port) do |http|
|
34
|
-
return http.get(full_path, {'User-Agent' => Anemone.options.user_agent})
|
40
|
+
return http.get(full_path, opts)
|
35
41
|
end
|
36
42
|
end
|
37
43
|
end
|
data/lib/anemone/page.rb
CHANGED
@@ -22,24 +22,32 @@ module Anemone
|
|
22
22
|
attr_accessor :aliases
|
23
23
|
# Boolean indicating whether or not this page has been visited in PageHash#shortest_paths!
|
24
24
|
attr_accessor :visited
|
25
|
-
#
|
25
|
+
# Depth of this page from the root of the crawl. This is not necessarily the
|
26
|
+
# shortest path; use PageHash#shortest_paths! to find that value.
|
26
27
|
attr_accessor :depth
|
28
|
+
# URL of the page that brought us to this page
|
29
|
+
attr_accessor :referer
|
27
30
|
|
28
31
|
#
|
29
32
|
# Create a new Page from the response of an HTTP request to *url*
|
30
33
|
#
|
31
|
-
def self.fetch(url)
|
34
|
+
def self.fetch(url, from_page = nil)
|
32
35
|
begin
|
33
|
-
url = URI(url)
|
36
|
+
url = URI(url) unless url.is_a?(URI)
|
34
37
|
|
35
|
-
|
38
|
+
if from_page
|
39
|
+
referer = from_page.url
|
40
|
+
depth = from_page.depth + 1
|
41
|
+
end
|
42
|
+
|
43
|
+
response, code, location = Anemone::HTTP.get(url, referer)
|
36
44
|
|
37
45
|
aka = nil
|
38
46
|
if !url.eql?(location)
|
39
47
|
aka = location
|
40
48
|
end
|
41
49
|
|
42
|
-
return Page.new(url, response.body, code, response.to_hash, aka)
|
50
|
+
return Page.new(url, response.body, code, response.to_hash, aka, referer, depth)
|
43
51
|
rescue
|
44
52
|
return Page.new(url)
|
45
53
|
end
|
@@ -48,14 +56,16 @@ module Anemone
|
|
48
56
|
#
|
49
57
|
# Create a new page
|
50
58
|
#
|
51
|
-
def initialize(url, body = nil, code = nil, headers = nil, aka = nil)
|
59
|
+
def initialize(url, body = nil, code = nil, headers = nil, aka = nil, referer = nil, depth = 0)
|
52
60
|
@url = url
|
53
61
|
@code = code
|
54
62
|
@headers = headers
|
55
63
|
@links = []
|
56
64
|
@aliases = []
|
57
65
|
@data = OpenStruct.new
|
58
|
-
|
66
|
+
@referer = referer
|
67
|
+
@depth = depth || 0
|
68
|
+
|
59
69
|
@aliases << aka if !aka.nil?
|
60
70
|
|
61
71
|
if body
|
data/lib/anemone/page_hash.rb
CHANGED
@@ -1,6 +1,20 @@
|
|
1
1
|
module Anemone
|
2
2
|
class PageHash < Hash
|
3
3
|
|
4
|
+
# We typically index the hash with a URI,
|
5
|
+
# but convert it to a String for easier retrieval
|
6
|
+
def [](index)
|
7
|
+
super(index.to_s)
|
8
|
+
end
|
9
|
+
|
10
|
+
def []=(index, other)
|
11
|
+
super(index.to_s, other)
|
12
|
+
end
|
13
|
+
|
14
|
+
def has_key?(key)
|
15
|
+
super(key.to_s)
|
16
|
+
end
|
17
|
+
|
4
18
|
#
|
5
19
|
# Use a breadth-first search to calculate the single-source
|
6
20
|
# shortest paths from *root* to all pages in the PageHash
|
data/lib/anemone/tentacle.rb
CHANGED
@@ -17,11 +17,15 @@ module Anemone
|
|
17
17
|
#
|
18
18
|
def run
|
19
19
|
while true do
|
20
|
-
link = @link_queue.deq
|
20
|
+
link, from_page = @link_queue.deq
|
21
21
|
|
22
22
|
break if link == :END
|
23
|
-
|
24
|
-
page = Page.fetch(link)
|
23
|
+
|
24
|
+
if from_page
|
25
|
+
page = Page.fetch(link, from_page)
|
26
|
+
else
|
27
|
+
page = Page.fetch(link)
|
28
|
+
end
|
25
29
|
|
26
30
|
@page_queue.enq(page)
|
27
31
|
|
data/spec/anemone_spec.rb
CHANGED
@@ -14,12 +14,16 @@ describe Anemone do
|
|
14
14
|
Anemone.crawl(SPEC_DOMAIN, :verbose => false,
|
15
15
|
:threads => 2,
|
16
16
|
:discard_page_bodies => true,
|
17
|
-
:user_agent => 'test'
|
17
|
+
:user_agent => 'test',
|
18
|
+
:obey_robots_txt => true,
|
19
|
+
:depth_limit => 3)
|
18
20
|
Anemone.options.verbose.should == false
|
19
21
|
Anemone.options.threads.should == 2
|
20
22
|
Anemone.options.discard_page_bodies.should == true
|
21
23
|
Anemone.options.delay.should == 0
|
22
24
|
Anemone.options.user_agent.should == 'test'
|
25
|
+
Anemone.options.obey_robots_txt.should == true
|
26
|
+
Anemone.options.depth_limit.should == 3
|
23
27
|
end
|
24
28
|
|
25
29
|
it "should use 1 thread if a delay is requested" do
|
data/spec/core_spec.rb
CHANGED
@@ -25,7 +25,7 @@ module Anemone
|
|
25
25
|
core = Anemone.crawl(pages[0].url)
|
26
26
|
|
27
27
|
core.should have(2).pages
|
28
|
-
core.pages.keys.
|
28
|
+
core.pages.keys.should_not include('http://www.other.com/')
|
29
29
|
end
|
30
30
|
|
31
31
|
it "should follow http redirects" do
|
@@ -56,7 +56,7 @@ module Anemone
|
|
56
56
|
core = Anemone.crawl(pages[0].url)
|
57
57
|
|
58
58
|
core.should have(2).pages
|
59
|
-
core.pages.keys.
|
59
|
+
core.pages.keys.should_not include(pages[2].url)
|
60
60
|
end
|
61
61
|
|
62
62
|
it "should be able to skip links based on a RegEx" do
|
@@ -70,7 +70,7 @@ module Anemone
|
|
70
70
|
end
|
71
71
|
|
72
72
|
core.should have(2).pages
|
73
|
-
core.pages.keys.
|
73
|
+
core.pages.keys.should_not include(pages[1].url)
|
74
74
|
end
|
75
75
|
|
76
76
|
it "should be able to call a block on every page" do
|
@@ -107,7 +107,7 @@ module Anemone
|
|
107
107
|
end
|
108
108
|
|
109
109
|
core.should have(2).pages
|
110
|
-
core.pages.keys.
|
110
|
+
core.pages.keys.should_not include(pages[1].url)
|
111
111
|
end
|
112
112
|
|
113
113
|
it "should optionally delay between page requests" do
|
@@ -123,6 +123,59 @@ module Anemone
|
|
123
123
|
|
124
124
|
(finish - start).should satisfy {|t| t > delay * 2}
|
125
125
|
end
|
126
|
+
|
127
|
+
it "should optionally obey the robots exclusion protocol" do
|
128
|
+
pages = []
|
129
|
+
pages << FakePage.new('0', :links => '1')
|
130
|
+
pages << FakePage.new('1')
|
131
|
+
pages << FakePage.new('robots.txt',
|
132
|
+
:body => "User-agent: *\nDisallow: /1",
|
133
|
+
:content_type => 'text/plain')
|
134
|
+
|
135
|
+
core = Anemone.crawl(pages[0].url, :obey_robots_txt => true)
|
136
|
+
urls = core.pages.keys
|
137
|
+
|
138
|
+
urls.should include(pages[0].url)
|
139
|
+
urls.should_not include(pages[1].url)
|
140
|
+
end
|
141
|
+
|
142
|
+
it "should track the page depth and referer" do
|
143
|
+
num_pages = 5
|
144
|
+
|
145
|
+
pages = []
|
146
|
+
|
147
|
+
num_pages.times do |n|
|
148
|
+
# register this page with a link to the next page
|
149
|
+
link = (n + 1).to_s if n + 1 < num_pages
|
150
|
+
pages << FakePage.new(n.to_s, :links => [link].compact)
|
151
|
+
end
|
152
|
+
|
153
|
+
core = Anemone.crawl(pages[0].url)
|
154
|
+
|
155
|
+
num_pages.times do |n|
|
156
|
+
page = core.pages[pages[n].url]
|
157
|
+
page.depth.should == n
|
158
|
+
page.referer.should == core.pages[pages[n-1].url].url if n > 0
|
159
|
+
end
|
160
|
+
|
161
|
+
core.pages[pages[0].url].referer.should == nil
|
162
|
+
end
|
126
163
|
|
164
|
+
it "should optionally limit the depth of the crawl" do
|
165
|
+
num_pages = 5
|
166
|
+
|
167
|
+
pages = []
|
168
|
+
|
169
|
+
num_pages.times do |n|
|
170
|
+
# register this page with a link to the next page
|
171
|
+
link = (n + 1).to_s if n + 1 < num_pages
|
172
|
+
pages << FakePage.new(n.to_s, :links => [link].compact)
|
173
|
+
end
|
174
|
+
|
175
|
+
core = Anemone.crawl(pages[0].url, :depth_limit => 3)
|
176
|
+
|
177
|
+
core.should have(4).pages
|
178
|
+
end
|
179
|
+
|
127
180
|
end
|
128
181
|
end
|
data/spec/fakeweb_helper.rb
CHANGED
@@ -13,14 +13,17 @@ module Anemone
|
|
13
13
|
class FakePage
|
14
14
|
attr_accessor :links
|
15
15
|
attr_accessor :hrefs
|
16
|
+
attr_accessor :body
|
16
17
|
|
17
18
|
def initialize(name = '', options = {})
|
18
19
|
@name = name
|
19
20
|
@links = [options[:links]].flatten if options.has_key?(:links)
|
20
21
|
@hrefs = [options[:hrefs]].flatten if options.has_key?(:hrefs)
|
21
22
|
@redirect = options[:redirect] if options.has_key?(:redirect)
|
23
|
+
@content_type = options[:content_type] || "text/html"
|
24
|
+
@body = options[:body]
|
22
25
|
|
23
|
-
create_body
|
26
|
+
create_body unless @body
|
24
27
|
add_to_fakeweb
|
25
28
|
end
|
26
29
|
|
@@ -38,7 +41,7 @@ module Anemone
|
|
38
41
|
end
|
39
42
|
|
40
43
|
def add_to_fakeweb
|
41
|
-
options = {:body => @body, :content_type => "text/html", :status => [200, "OK"]}
|
44
|
+
options = {:body => @body, :content_type => @content_type, :status => [200, "OK"]}
|
42
45
|
|
43
46
|
if @redirect
|
44
47
|
options[:status] = [301, "Permanently Moved"]
|
data/spec/spec_helper.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: anemone
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.2
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Chris Kite
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-
|
12
|
+
date: 2009-09-07 00:00:00 -05:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|