anemone 0.1.2 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,7 +3,7 @@ require 'anemone/core'
 
 module Anemone
   # Version number
-  VERSION = '0.1.2'
+  VERSION = '0.2.0'
 
   #module-wide options
   def Anemone.options=(options)
@@ -20,21 +20,36 @@ module Anemone
   def Anemone.crawl(urls, options = {}, &block)
     Anemone.options = OpenStruct.new(options)
 
-    #by default, run 4 Tentacle threads to fetch pages
+    # by default, run 4 Tentacle threads to fetch pages
     Anemone.options.threads ||= 4
 
-    #disable verbose output by default
+    # disable verbose output by default
     Anemone.options.verbose ||= false
 
-    #by default, don't throw away the page response body after scanning it for links
+    # by default, don't throw away the page response body after scanning it for links
     Anemone.options.discard_page_bodies ||= false
 
-    #by default, identify self as Anemone/VERSION
+    # by default, identify self as Anemone/VERSION
     Anemone.options.user_agent ||= "Anemone/#{self::VERSION}"
 
-    #no delay between requests by default
+    # no delay between requests by default
     Anemone.options.delay ||= 0
+
+    # by default, don't obey the robots exclusion protocol
+    if Anemone.options.obey_robots_txt ||= false
+      begin
+        require 'robots'
+      rescue LoadError
+        warn "To support the robot exclusion protocol, install the robots gem:\n" \
+             "sudo gem sources -a http://gems.github.com\n" \
+             "sudo gem install fizx-robots"
+        exit
+      end
+    end
 
+    # by default, don't limit the depth of the crawl
+    Anemone.options.depth_limit ||= :infinity
+
     #use a single thread if a delay was requested
     if(Anemone.options.delay != 0)
       Anemone.options.threads = 1
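The two options introduced above, :obey_robots_txt and :depth_limit, are read from the same options hash as the existing settings. A minimal usage sketch, assuming the fizx-robots gem is installed; the URL and the block body are placeholders, not part of the gem:

    require 'anemone'

    # Hypothetical crawl using the new 0.2.0 options: skip links disallowed by
    # robots.txt and stop following links more than 3 hops from the start page.
    Anemone.crawl("http://www.example.com/",
                  :obey_robots_txt => true,
                  :depth_limit => 3) do |anemone|
      anemone.on_every_page { |page| puts "#{page.depth}  #{page.url}" }
    end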
data/lib/anemone/core.rb CHANGED
@@ -23,6 +23,10 @@ module Anemone
       @skip_link_patterns = []
       @after_crawl_blocks = []
 
+      if Anemone.options.obey_robots_txt
+        @robots = Robots.new(Anemone.options.user_agent)
+      end
+
       block.call(self) if block
     end
 
@@ -113,18 +117,18 @@ module Anemone
 
         puts "#{page.url} Queue: #{link_queue.size}" if Anemone.options.verbose
 
-        #perform the on_every_page blocks for this page
+        # perform the on_every_page blocks for this page
         do_page_blocks(page)
 
         page.doc = nil if Anemone.options.discard_page_bodies
 
         links_to_follow(page).each do |link|
-          link_queue.enq(link)
+          link_queue.enq([link, page])
           @pages[link] = nil
         end
 
-        #create an entry in the page hash for each alias of this page,
-        #i.e. all the pages that redirected to this page
+        # create an entry in the page hash for each alias of this page,
+        # i.e. all the pages that redirected to this page
         page.aliases.each do |aka|
           if !@pages.has_key?(aka) or @pages[aka].nil?
             @pages[aka] = page.alias_clone(aka)
@@ -184,16 +188,26 @@ module Anemone
     #
     def links_to_follow(page)
       links = @focus_crawl_block ? @focus_crawl_block.call(page) : page.links
-      links.find_all { |link| visit_link?(link) }
+      links.select { |link| visit_link?(link, page) }
     end
 
     #
     # Returns +true+ if *link* has not been visited already,
-    # and is not excluded by a skip_link pattern. Returns
-    # +false+ otherwise.
+    # and is not excluded by a skip_link pattern...
+    # and is not excluded by robots.txt...
+    # and is not deeper than the depth limit
+    # Returns +false+ otherwise.
     #
-    def visit_link?(link)
-      !@pages.has_key?(link) and !skip_link?(link)
+    def visit_link?(link, from_page = nil)
+      allowed = Anemone.options.obey_robots_txt ? @robots.allowed?(link) : true
+
+      if from_page
+        too_deep = from_page.depth >= Anemone.options.depth_limit rescue false
+      else
+        too_deep = false
+      end
+
+      !@pages.has_key?(link) and !skip_link?(link) and allowed and !too_deep
     end
 
     #
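The depth check in visit_link? leans on the default :depth_limit of :infinity: comparing an Integer against a Symbol raises, and the rescue modifier turns that into "not too deep". A standalone illustration of that idiom (not part of the gem):

    depth_limit = :infinity
    too_deep = 1 >= depth_limit rescue false   # comparison raises, so false

    depth_limit = 3
    too_deep = 4 >= depth_limit rescue false   # ordinary comparison, so true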
data/lib/anemone/http.rb CHANGED
@@ -9,8 +9,8 @@ module Anemone
     # Retrieve an HTTP response for *url*, following redirects.
     # Returns the response object, response code, and final URI location.
     #
-    def self.get(url)
-      response = get_response(url)
+    def self.get(url, referer = nil)
+      response = get_response(url, referer)
       code = Integer(response.code)
       loc = url
 
@@ -18,7 +18,7 @@
       while response.is_a?(Net::HTTPRedirection) and limit > 0
         loc = URI(response['location'])
         loc = url.merge(loc) if loc.relative?
-        response = get_response(loc)
+        response = get_response(loc, referer)
         limit -= 1
       end
 
@@ -28,10 +28,16 @@
     #
     # Get an HTTPResponse for *url*, sending the appropriate User-Agent string
    #
-    def self.get_response(url)
+    def self.get_response(url, referer = nil)
       full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"
+      user_agent = Anemone.options.user_agent rescue nil
+
+      opts = {}
+      opts['User-Agent'] = user_agent if user_agent
+      opts['Referer'] = referer.to_s if referer
+
       Net::HTTP.start(url.host, url.port) do |http|
-        return http.get(full_path, {'User-Agent' => Anemone.options.user_agent })
+        return http.get(full_path, opts)
       end
     end
   end
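HTTP.get and get_response now take an optional referer, which Page.fetch (below) uses to pass the linking page's URL through to the request headers. A hedged sketch of a direct call; both URLs are placeholders:

    require 'anemone'

    # Returns the response object, the numeric status code, and the final URI
    # after following redirects, sending a Referer header along the way.
    response, code, location = Anemone::HTTP.get(URI('http://www.example.com/about'),
                                                 URI('http://www.example.com/'))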
data/lib/anemone/page.rb CHANGED
@@ -22,24 +22,32 @@ module Anemone
     attr_accessor :aliases
     # Boolean indicating whether or not this page has been visited in PageHash#shortest_paths!
     attr_accessor :visited
-    # Used by PageHash#shortest_paths! to store depth of the page
+    # Depth of this page from the root of the crawl. This is not necessarily the
+    # shortest path; use PageHash#shortest_paths! to find that value.
     attr_accessor :depth
+    # URL of the page that brought us to this page
+    attr_accessor :referer
 
     #
     # Create a new Page from the response of an HTTP request to *url*
     #
-    def self.fetch(url)
+    def self.fetch(url, from_page = nil)
       begin
-        url = URI(url) if url.is_a?(String)
+        url = URI(url) unless url.is_a?(URI)
 
-        response, code, location = Anemone::HTTP.get(url)
+        if from_page
+          referer = from_page.url
+          depth = from_page.depth + 1
+        end
+
+        response, code, location = Anemone::HTTP.get(url, referer)
 
         aka = nil
         if !url.eql?(location)
           aka = location
         end
 
-        return Page.new(url, response.body, code, response.to_hash, aka)
+        return Page.new(url, response.body, code, response.to_hash, aka, referer, depth)
       rescue
         return Page.new(url)
       end
@@ -48,14 +56,16 @@
     #
     # Create a new page
     #
-    def initialize(url, body = nil, code = nil, headers = nil, aka = nil)
+    def initialize(url, body = nil, code = nil, headers = nil, aka = nil, referer = nil, depth = 0)
       @url = url
       @code = code
       @headers = headers
       @links = []
       @aliases = []
       @data = OpenStruct.new
-
+      @referer = referer
+      @depth = depth || 0
+
       @aliases << aka if !aka.nil?
 
       if body
data/lib/anemone/page_hash.rb CHANGED
@@ -1,6 +1,20 @@
 module Anemone
   class PageHash < Hash
 
+    # We typically index the hash with a URI,
+    # but convert it to a String for easier retrieval
+    def [](index)
+      super(index.to_s)
+    end
+
+    def []=(index, other)
+      super(index.to_s, other)
+    end
+
+    def has_key?(key)
+      super(key.to_s)
+    end
+
     #
     # Use a breadth-first search to calculate the single-source
     # shortest paths from *root* to all pages in the PageHash
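These overrides make lookups indifferent to whether the key is a URI or a String, which is what lets the core specs below drop the .map{|k| k.to_s} conversion. An illustrative sketch, assuming anemone is loaded; the URL is a placeholder:

    require 'anemone'

    pages = Anemone::PageHash.new
    pages[URI('http://www.example.com/')] = :some_page   # stored under the String form

    pages['http://www.example.com/']                # => :some_page
    pages.has_key?(URI('http://www.example.com/'))  # => true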
data/lib/anemone/tentacle.rb CHANGED
@@ -17,11 +17,15 @@ module Anemone
     #
     def run
       while true do
-        link = @link_queue.deq
+        link, from_page = @link_queue.deq
 
         break if link == :END
-
-        page = Page.fetch(link)
+
+        if from_page
+          page = Page.fetch(link, from_page)
+        else
+          page = Page.fetch(link)
+        end
 
         @page_queue.enq(page)
 
data/spec/anemone_spec.rb CHANGED
@@ -14,12 +14,16 @@ describe Anemone do
     Anemone.crawl(SPEC_DOMAIN, :verbose => false,
                                :threads => 2,
                                :discard_page_bodies => true,
-                               :user_agent => 'test')
+                               :user_agent => 'test',
+                               :obey_robots_txt => true,
+                               :depth_limit => 3)
     Anemone.options.verbose.should == false
     Anemone.options.threads.should == 2
     Anemone.options.discard_page_bodies.should == true
     Anemone.options.delay.should == 0
     Anemone.options.user_agent.should == 'test'
+    Anemone.options.obey_robots_txt.should == true
+    Anemone.options.depth_limit.should == 3
   end
 
   it "should use 1 thread if a delay is requested" do
data/spec/core_spec.rb CHANGED
@@ -25,7 +25,7 @@ module Anemone
       core = Anemone.crawl(pages[0].url)
 
       core.should have(2).pages
-      core.pages.keys.map{|k| k.to_s}.should_not include('http://www.other.com/')
+      core.pages.keys.should_not include('http://www.other.com/')
     end
 
     it "should follow http redirects" do
@@ -56,7 +56,7 @@ module Anemone
       core = Anemone.crawl(pages[0].url)
 
       core.should have(2).pages
-      core.pages.keys.map{|k| k.to_s}.should_not include(pages[2].url)
+      core.pages.keys.should_not include(pages[2].url)
     end
 
     it "should be able to skip links based on a RegEx" do
@@ -70,7 +70,7 @@ module Anemone
       end
 
       core.should have(2).pages
-      core.pages.keys.map{|k| k.to_s}.should_not include(pages[1].url)
+      core.pages.keys.should_not include(pages[1].url)
     end
 
     it "should be able to call a block on every page" do
@@ -107,7 +107,7 @@ module Anemone
       end
 
       core.should have(2).pages
-      core.pages.keys.map{|k| k.to_s}.should_not include(pages[1].url)
+      core.pages.keys.should_not include(pages[1].url)
     end
 
     it "should optionally delay between page requests" do
@@ -123,6 +123,59 @@ module Anemone
 
       (finish - start).should satisfy {|t| t > delay * 2}
     end
+
+    it "should optionally obey the robots exclusion protocol" do
+      pages = []
+      pages << FakePage.new('0', :links => '1')
+      pages << FakePage.new('1')
+      pages << FakePage.new('robots.txt',
+                            :body => "User-agent: *\nDisallow: /1",
+                            :content_type => 'text/plain')
+
+      core = Anemone.crawl(pages[0].url, :obey_robots_txt => true)
+      urls = core.pages.keys
+
+      urls.should include(pages[0].url)
+      urls.should_not include(pages[1].url)
+    end
+
+    it "should track the page depth and referer" do
+      num_pages = 5
+
+      pages = []
+
+      num_pages.times do |n|
+        # register this page with a link to the next page
+        link = (n + 1).to_s if n + 1 < num_pages
+        pages << FakePage.new(n.to_s, :links => [link].compact)
+      end
+
+      core = Anemone.crawl(pages[0].url)
+
+      num_pages.times do |n|
+        page = core.pages[pages[n].url]
+        page.depth.should == n
+        page.referer.should == core.pages[pages[n-1].url].url if n > 0
+      end
+
+      core.pages[pages[0].url].referer.should == nil
+    end
 
+    it "should optionally limit the depth of the crawl" do
+      num_pages = 5
+
+      pages = []
+
+      num_pages.times do |n|
+        # register this page with a link to the next page
+        link = (n + 1).to_s if n + 1 < num_pages
+        pages << FakePage.new(n.to_s, :links => [link].compact)
+      end
+
+      core = Anemone.crawl(pages[0].url, :depth_limit => 3)
+
+      core.should have(4).pages
+    end
+
   end
 end
data/spec/fakeweb_helper.rb CHANGED
@@ -13,14 +13,17 @@ module Anemone
   class FakePage
     attr_accessor :links
     attr_accessor :hrefs
+    attr_accessor :body
 
     def initialize(name = '', options = {})
       @name = name
       @links = [options[:links]].flatten if options.has_key?(:links)
       @hrefs = [options[:hrefs]].flatten if options.has_key?(:hrefs)
       @redirect = options[:redirect] if options.has_key?(:redirect)
+      @content_type = options[:content_type] || "text/html"
+      @body = options[:body]
 
-      create_body
+      create_body unless @body
       add_to_fakeweb
     end
 
@@ -38,7 +41,7 @@ module Anemone
     end
 
     def add_to_fakeweb
-      options = {:body => @body, :content_type => "text/html", :status => [200, "OK"]}
+      options = {:body => @body, :content_type => @content_type, :status => [200, "OK"]}
 
       if @redirect
         options[:status] = [301, "Permanently Moved"]
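The new :body and :content_type options let a spec register an arbitrary fixture with FakeWeb instead of the generated HTML, as the robots.txt spec above does. A short sketch from inside a spec; the disallowed path is a placeholder:

    # Serve a plain-text robots.txt instead of a generated HTML page.
    FakePage.new('robots.txt',
                 :body => "User-agent: *\nDisallow: /private",
                 :content_type => 'text/plain')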
data/spec/spec_helper.rb CHANGED
@@ -1,5 +1,5 @@
-require File.dirname(__FILE__) + '/fakeweb_helper'
 require 'rubygems'
+require File.dirname(__FILE__) + '/fakeweb_helper'
 
 $:.unshift(File.dirname(__FILE__) + '/../lib/')
 require 'anemone'
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: anemone
 version: !ruby/object:Gem::Version
-  version: 0.1.2
+  version: 0.2.0
 platform: ruby
 authors:
 - Chris Kite
@@ -9,7 +9,7 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2009-08-10 00:00:00 -05:00
+date: 2009-09-07 00:00:00 -05:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency