RubyGems - cobweb - Versions diffs - 0.0.57 → 0.0.58 - Mend

cobweb 0.0.57 → 0.0.58

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14)

data/README.textile +3 -1
data/lib/cobweb.rb +23 -2
data/lib/cobweb_crawler.rb +1 -0
data/lib/cobweb_links.rb +9 -0
data/lib/cobweb_version.rb +1 -1
data/lib/robots.rb +71 -3
data/lib/server.rb +0 -1
data/spec/cobweb/cobweb_crawler_spec.rb +4 -4
data/spec/cobweb/cobweb_job_spec.rb +0 -10
data/spec/cobweb/robots_spec.rb +70 -0
data/spec/samples/robots.txt +294 -0
data/spec/samples/sample_site/robots.txt +13 -0
data/spec/spec_helper.rb +25 -0
metadata +25 -22

data/README.textile CHANGED Viewed

@@ -1,5 +1,5 @@
-h1. Cobweb v0.0.57
+h1. Cobweb v0.0.58
 !https://secure.travis-ci.org/stewartmckee/cobweb.png?branch=master!
 h2. Intro
@@ -41,6 +41,8 @@ h3. Data Returned
   * :crawl_id - the id used internally for identifying the crawl.  Can be used by the processing job to seperate crawls
   * :internal_urls - an array of urls with * wildcards that represent urls internal to the site (ie pages within the same domain)
   * :external_urls - an array of urls with * wildcards that represent urls external to the site (overrides internal_urls)
+  * :obey_robots - boolean determining if robots.txt should be honoured. (default: false)
+  * :user_agent - user agent string to match in robots.txt (not sent as user_agent of requests yet) (default: cobweb)
   The source for the links can be overridden, contact me for the syntax (don't have time to put it into this documentation, will as soon as i have time!)

data/lib/cobweb.rb CHANGED Viewed

@@ -43,6 +43,9 @@ class Cobweb
     default_redis_options_to                  Hash.new
     default_internal_urls_to                  []
     default_first_page_redirect_internal_to   true
+    default_text_mime_types_to                ["text/*", "application/xhtml+xml"]
+    default_obey_robots_to                    false
+    default_user_agent_to                     "cobweb"
   end
@@ -177,7 +180,7 @@ class Cobweb
             content[:character_set] = charset
           end
           content[:length] = response.content_length
-          if content[:mime_type].include?("text/html") or content[:mime_type].include?("application/xhtml+xml")
+          if text_content?(content[:mime_type])
             if response["Content-Encoding"]=="gzip"
               content[:body] = Zlib::GzipReader.new(StringIO.new(response.body)).read
             else
@@ -384,5 +387,23 @@ class Cobweb
       content
     end
-  end
+  end
+  private
+  # checks if the mime_type is textual
+  def text_content?(content_type)
+    @options[:text_mime_types].each do |mime_type|
+      return true if content_type.match(escape_pattern_for_regex(mime_type))
+    end
+    false
+  end
+  # escapes characters with meaning in regular expressions and adds wildcard expression
+  def escape_pattern_for_regex(pattern)
+    pattern = pattern.gsub(".", "\\.")
+    pattern = pattern.gsub("?", "\\?")
+    pattern = pattern.gsub("*", ".*?")
+    pattern
+  end
 end

data/lib/cobweb_crawler.rb CHANGED Viewed

@@ -64,6 +64,7 @@ class CobwebCrawler
               @redis.incr "crawl-counter"
               internal_links = ContentLinkParser.new(url, content[:body]).all_links(:valid_schemes => [:http, :https])
+              ap internal_links
               # select the link if its internal (eliminate external before expensive lookups in queued and crawled)
               cobweb_links = CobwebLinks.new(@options)

data/lib/cobweb_links.rb CHANGED Viewed

@@ -17,6 +17,15 @@ class CobwebLinks
   end
+  def allowed?(link)
+    if @options[:obey_robots]
+      robot = Robots.new(:url => link, :user_agent => @options[:user_agent])
+      return robot.allowed?(link)
+    else
+      return true
+    end
+  end
   # Returns true if the link is matched to an internal_url and not matched to an external_url
   def internal?(link)
     if @options[:debug]

data/lib/cobweb_version.rb CHANGED Viewed

@@ -3,7 +3,7 @@ class CobwebVersion
   # Returns a string of the current version
   def self.version
-    "0.0.57"
+    "0.0.58"
   end
 end

data/lib/robots.rb CHANGED Viewed

@@ -2,10 +2,78 @@
 class Robots
   # Processes the robots.txt file
-  def initialize(url, file_name="robots.txt")
+  def initialize(options)
+    @options = options
+    raise "options should be a hash" unless options.kind_of? Hash
+    raise ":url is required" unless @options.has_key? :url
+    @options[:file] = "robots.txt" unless @options.has_key? :file
+    @options[:user_agent] = "cobweb" unless @options.has_key? :user_agent
+    uri = URI.parse(@options[:url])
+    content = Cobweb.new(:cache => nil, :text_mime_types => ["text/html", "application/xhtml+xml", "text/plain"]).get([uri.scheme, "://", uri.host, ":", uri.port, "/", @options[:file]].join)
+    if content[:mime_type][0..4] == "text/"
+      @raw_data = parse_data(content[:body])
+      if @options.has_key?(:user_agent) && @raw_data.has_key?(@options[:user_agent].to_s.downcase.to_sym)
+        @params = @raw_data[@options[:user_agent].to_s.downcase.to_sym]
+      else
+        raise "Wildcard user-agent is not present" unless @raw_data.has_key? :*
+        @params = @raw_data[:*]
+      end
+    else
+      raise "Invalid mime type: #{content[:content_type]}"
+    end
+  end
+  def allowed?(url)
     uri = URI.parse(url)
-    [uri.scheme, "://", uri.host, ":", uri.port, "/", file_name].join
-    Cobweb.new(:cache => 6000).get([uri.scheme, "://", uri.host, ":", uri.port, "/", file_name].join)
+    @params[:allow].each do |pattern|
+      return true if uri.path.match(escape_pattern_for_regex(pattern))
+    end
+    @params[:disallow].each do |pattern|
+      return false if uri.path.match(escape_pattern_for_regex(pattern))
+    end
+    true
+  end
+  def user_agent_settings
+    @params
+  end
+  def contents
+    @raw_data
+  end
+  private
+  # escapes characters with meaning in regular expressions and adds wildcard expression
+  def escape_pattern_for_regex(pattern)
+    pattern = pattern.gsub(".", "\\.")
+    pattern = pattern.gsub("?", "\\?")
+    pattern = pattern.gsub("*", ".*?")
+    pattern
+  end
+  def parse_data(data)
+    user_agents = {}
+    lines = data.split("\n")
+    lines.map!{|line| line.strip}
+    lines.reject!{|line| line == "" || line[0] == "#"}
+    current_user_agent = nil
+    lines.each do |line|
+      if line[0..10].downcase == "user-agent:"
+        current_user_agent = line.split(":")[1..-1].join.downcase.strip.to_sym
+        user_agents[current_user_agent] = {:allow => [], :disallow => []}
+      else
+        if current_user_agent
+          values = line.split(":")
+          unless values[1..-1].join.strip == ""
+            user_agents[current_user_agent][values[0].downcase.strip.to_sym] = [] unless user_agents[current_user_agent].has_key? values[0].downcase.to_sym
+            user_agents[current_user_agent][values[0].downcase.strip.to_sym] << values[1..-1].join.strip
+          end
+        end
+      end
+    end
+    user_agents
   end
 end

data/lib/server.rb CHANGED Viewed

@@ -5,7 +5,6 @@ require 'haml'
 class Server < Sinatra::Base
   set :views, settings.root + '/../views'
-  puts "#{settings.root}/../public"
   set :public_folder, settings.root + '/../public'
   enable :static

data/spec/cobweb/cobweb_crawler_spec.rb CHANGED Viewed

@@ -4,7 +4,7 @@ describe CobwebCrawler do
   before(:each) do
-    @base_url = "http://www.baseurl.com/"
+    @base_url = "http://localhost:3532/"
     @default_headers = {"Cache-Control" => "private, max-age=0",
                         "Date" => "Wed, 10 Nov 2010 09:06:17 GMT",
@@ -27,9 +27,9 @@ describe CobwebCrawler do
       # temporary tests to run crawler - proper specs to follow.. honest
-      crawler = CobwebCrawler.new({:cache => false, :quiet => false, :debug => false})
+      crawler = CobwebCrawler.new({:cache => false, :quiet => false, :debug => true})
-      statistics = crawler.crawl("http://rockwellcottage.heroku.com/")
+      statistics = crawler.crawl(@base_url)
       statistics.should_not be_nil
       statistics.should be_an_instance_of Hash
@@ -42,7 +42,7 @@ describe CobwebCrawler do
       crawler = CobwebCrawler.new({:cache => false, :quiet => false, :debug => false})
-      statistics = crawler.crawl("http://www.rockwellcottage.com/") do |content, statistics|
+      statistics = crawler.crawl(@base_url) do |content, statistics|
         content[:url].should_not be_nil
         statistics[:average_length].should_not be_nil
       end

data/spec/cobweb/cobweb_job_spec.rb CHANGED Viewed

@@ -12,15 +12,6 @@ describe Cobweb, :local_only => true do
     io = IO.popen("nohup rake resque:workers PIDFILE=./tmp/pids/resque.pid COUNT=5 QUEUE=cobweb_crawl_job > log/output.log &")
     puts "Workers Started."
-    # START THIN SERVER TO HOST THE SAMPLE SITE FOR CRAWLING
-    @thin = nil
-    Thread.new do
-      @thin = Thin::Server.start("0.0.0.0", 3532, SampleServer.app)
-    end
-    # WAIT FOR START TO COMPLETE
-    sleep 1
   end
   before(:each) do
@@ -147,7 +138,6 @@ describe Cobweb, :local_only => true do
     @all_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
     command = "kill #{(@all_processes - @existing_processes).join(" ")}"
     IO.popen(command)
-    #@thin.stop!
   end
 end

data/spec/cobweb/robots_spec.rb ADDED Viewed

@@ -0,0 +1,70 @@
+require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
+describe Robots do
+  before(:each) do
+    @cobweb = Cobweb.new :quiet => true, :cache => nil
+  end
+  describe "default user-agent" do
+    before(:each) do
+      @options = {:url => "http://localhost/"}
+    end
+    it "should parse a valid robots.txt" do
+      lambda {Robots.new(@options)}.should_not raise_error
+    end
+    it "should allow urls marked as allow" do
+      robot = Robots.new(@options)
+      robot.allowed?("http://localhost/globalmarketfinder/asdf.html").should be_true
+    end
+    it "should disallow urls specified as disallow" do
+      robot = Robots.new(@options)
+      robot.allowed?("http://localhost/globalmarketfinder/").should be_false
+      robot.allowed?("http://localhost/globalmarketfinder/asdf").should be_false
+    end
+    it "should allow urls not listed" do
+      robot = Robots.new(@options)
+      robot.allowed?("http://localhost/notlistedinrobotsfile").should be_true
+    end
+  end
+  describe "google user-agent" do
+    before(:each) do
+      @options = {:url => "http://localhost/", :user_agent => "google"}
+    end
+    it "should parse a valid robots.txt" do
+      lambda {Robots.new(@options)}.should_not raise_error
+    end
+    it "should disallow all urls" do
+      robot = Robots.new(@options)
+      robot.allowed?("http://localhost/globalmarketfinder/asdf.html").should be_false
+      robot.allowed?("http://localhost/globalmarketfinder/").should be_false
+      robot.allowed?("http://localhost/globalmarketfinder/asdf").should be_false
+      robot.allowed?("http://localhost/notlistedinrobotsfile").should be_false
+    end
+  end
+  describe "cybermapper user-agent" do
+    before(:each) do
+      @options = {:url => "http://localhost/", :user_agent => "cybermapper"}
+    end
+    it "should parse a valid robots.txt" do
+      lambda {Robots.new(@options)}.should_not raise_error
+    end
+    it "should disallow all urls" do
+      robot = Robots.new(@options)
+      robot.allowed?("http://localhost/globalmarketfinder/asdf.html").should be_true
+      robot.allowed?("http://localhost/globalmarketfinder/").should be_true
+      robot.allowed?("http://localhost/globalmarketfinder/asdf").should be_true
+      robot.allowed?("http://localhost/notlistedinrobotsfile").should be_true
+    end
+  end
+end

data/spec/samples/robots.txt ADDED Viewed

@@ -0,0 +1,294 @@
+# robots.txt for http://www.example.com/
+User-agent: cybermapper
+Disallow:
+User-agent: google
+Disallow: /
+User-agent: *
+Disallow: /search
+Disallow: /sdch
+Disallow: /groups
+Disallow: /images
+Disallow: /catalogs
+Allow: /catalogs/about
+Allow: /catalogs/p?
+Disallow: /catalogues
+Disallow: /news
+Allow: /news/directory
+Disallow: /nwshp
+Disallow: /setnewsprefs?
+Disallow: /index.html?
+Disallow: /?
+Allow: /?hl=
+Disallow: /?hl=*&
+Disallow: /addurl/image?
+Disallow: /pagead/
+Disallow: /relpage/
+Disallow: /relcontent
+Disallow: /imgres
+Disallow: /imglanding
+Disallow: /sbd
+Disallow: /keyword/
+Disallow: /u/
+Disallow: /univ/
+Disallow: /cobrand
+Disallow: /custom
+Disallow: /advanced_group_search
+Disallow: /googlesite
+Disallow: /preferences
+Disallow: /setprefs
+Disallow: /swr
+Disallow: /url
+Disallow: /default
+Disallow: /m?
+Disallow: /m/?
+Disallow: /m/blogs?
+Disallow: /m/directions?
+Disallow: /m/ig
+Disallow: /m/images?
+Disallow: /m/imgres?
+Disallow: /m/local?
+Disallow: /m/movies?
+Disallow: /m/news?
+Disallow: /m/news/i?
+Disallow: /m/place?
+Disallow: /m/products?
+Disallow: /m/products/
+Disallow: /m/setnewsprefs?
+Disallow: /m/search?
+Disallow: /m/swmloptin?
+Disallow: /m/trends
+Disallow: /m/video?
+Disallow: /wml?
+Disallow: /wml/?
+Disallow: /wml/search?
+Disallow: /xhtml?
+Disallow: /xhtml/?
+Disallow: /xhtml/search?
+Disallow: /xml?
+Disallow: /imode?
+Disallow: /imode/?
+Disallow: /imode/search?
+Disallow: /jsky?
+Disallow: /jsky/?
+Disallow: /jsky/search?
+Disallow: /pda?
+Disallow: /pda/?
+Disallow: /pda/search?
+Disallow: /sprint_xhtml
+Disallow: /sprint_wml
+Disallow: /pqa
+Disallow: /palm
+Disallow: /gwt/
+Disallow: /purchases
+Disallow: /hws
+Disallow: /bsd?
+Disallow: /linux?
+Disallow: /mac?
+Disallow: /microsoft?
+Disallow: /unclesam?
+Disallow: /answers/search?q=
+Disallow: /local?
+Disallow: /local_url
+Disallow: /shihui?
+Disallow: /shihui/
+Disallow: /froogle?
+Disallow: /products?
+Disallow: /products/
+Disallow: /froogle_
+Disallow: /product_
+Disallow: /products_
+Disallow: /products;
+Disallow: /print
+Disallow: /books/
+Disallow: /bkshp?*q=*
+Disallow: /books?*q=*
+Disallow: /books?*output=*
+Disallow: /books?*pg=*
+Disallow: /books?*jtp=*
+Disallow: /books?*jscmd=*
+Disallow: /books?*buy=*
+Disallow: /books?*zoom=*
+Allow: /books?*q=related:*
+Allow: /books?*q=editions:*
+Allow: /books?*q=subject:*
+Allow: /books/about
+Allow: /booksrightsholders
+Allow: /books?*zoom=1*
+Allow: /books?*zoom=5*
+Disallow: /ebooks/
+Disallow: /ebooks?*q=*
+Disallow: /ebooks?*output=*
+Disallow: /ebooks?*pg=*
+Disallow: /ebooks?*jscmd=*
+Disallow: /ebooks?*buy=*
+Disallow: /ebooks?*zoom=*
+Allow: /ebooks?*q=related:*
+Allow: /ebooks?*q=editions:*
+Allow: /ebooks?*q=subject:*
+Allow: /ebooks?*zoom=1*
+Allow: /ebooks?*zoom=5*
+Disallow: /patents?
+Allow: /patents?id=
+Allow: /patents?vid=
+Disallow: /scholar
+Disallow: /citations?
+Allow: /citations?user=
+Allow: /citations?view_op=new_profile
+Allow: /citations?view_op=top_venues
+Disallow: /complete
+Disallow: /s?
+Disallow: /sponsoredlinks
+Disallow: /videosearch?
+Disallow: /videopreview?
+Disallow: /videoprograminfo?
+Disallow: /maps?
+Disallow: /mapstt?
+Disallow: /mapslt?
+Disallow: /maps/stk/
+Disallow: /maps/br?
+Disallow: /mapabcpoi?
+Disallow: /maphp?
+Disallow: /mapprint?
+Disallow: /maps/api/js/StaticMapService.GetMapImage?
+Disallow: /maps/api/staticmap?
+Disallow: /mld?
+Disallow: /staticmap?
+Disallow: /places/
+Allow: /places/$
+Disallow: /maps/place
+Disallow: /help/maps/streetview/partners/welcome/
+Disallow: /lochp?
+Disallow: /center
+Disallow: /ie?
+Disallow: /sms/demo?
+Disallow: /katrina?
+Disallow: /blogsearch?
+Disallow: /blogsearch/
+Disallow: /blogsearch_feeds
+Disallow: /advanced_blog_search
+Disallow: /reader/
+Allow: /reader/play
+Disallow: /uds/
+Disallow: /chart?
+Disallow: /transit?
+Disallow: /mbd?
+Disallow: /extern_js/
+Disallow: /calendar/feeds/
+Disallow: /calendar/ical/
+Disallow: /cl2/feeds/
+Disallow: /cl2/ical/
+Disallow: /coop/directory
+Disallow: /coop/manage
+Disallow: /trends?
+Disallow: /trends/music?
+Disallow: /trends/hottrends?
+Disallow: /trends/viz?
+Disallow: /notebook/search?
+Disallow: /musica
+Disallow: /musicad
+Disallow: /musicas
+Disallow: /musicl
+Disallow: /musics
+Disallow: /musicsearch
+Disallow: /musicsp
+Disallow: /musiclp
+Disallow: /browsersync
+Disallow: /call
+Disallow: /archivesearch?
+Disallow: /archivesearch/url
+Disallow: /archivesearch/advanced_search
+Disallow: /base/reportbadoffer
+Disallow: /urchin_test/
+Disallow: /movies?
+Disallow: /codesearch?
+Disallow: /codesearch/feeds/search?
+Disallow: /wapsearch?
+Disallow: /safebrowsing
+Allow: /safebrowsing/diagnostic
+Allow: /safebrowsing/report_badware/
+Allow: /safebrowsing/report_error/
+Allow: /safebrowsing/report_phish/
+Disallow: /reviews/search?
+Disallow: /orkut/albums
+Allow: /jsapi
+Disallow: /views?
+Disallow: /c/
+Disallow: /cbk
+Allow: /cbk?output=tile&cb_client=maps_sv
+Disallow: /recharge/dashboard/car
+Disallow: /recharge/dashboard/static/
+Disallow: /translate_a/
+Disallow: /translate_c
+Disallow: /translate_f
+Disallow: /translate_static/
+Disallow: /translate_suggestion
+Disallow: /profiles/me
+Allow: /profiles
+Disallow: /s2/profiles/me
+Allow: /s2/profiles
+Allow: /s2/photos
+Allow: /s2/static
+Disallow: /s2
+Disallow: /transconsole/portal/
+Disallow: /gcc/
+Disallow: /aclk
+Disallow: /cse?
+Disallow: /cse/home
+Disallow: /cse/panel
+Disallow: /cse/manage
+Disallow: /tbproxy/
+Disallow: /imesync/
+Disallow: /shenghuo/search?
+Disallow: /support/forum/search?
+Disallow: /reviews/polls/
+Disallow: /hosted/images/
+Disallow: /ppob/?
+Disallow: /ppob?
+Disallow: /ig/add?
+Disallow: /adwordsresellers
+Disallow: /accounts/o8
+Allow: /accounts/o8/id
+Disallow: /topicsearch?q=
+Disallow: /xfx7/
+Disallow: /squared/api
+Disallow: /squared/search
+Disallow: /squared/table
+Disallow: /toolkit/
+Allow: /toolkit/*.html
+Disallow: /globalmarketfinder/
+Allow: /globalmarketfinder/*.html
+Disallow: /qnasearch?
+Disallow: /app/updates
+Disallow: /sidewiki/entry/
+Disallow: /quality_form?
+Disallow: /labs/popgadget/search
+Disallow: /buzz/post
+Disallow: /compressiontest/
+Disallow: /analytics/reporting/
+Disallow: /analytics/admin/
+Disallow: /analytics/web/
+Disallow: /analytics/feeds/
+Disallow: /analytics/settings/
+Disallow: /alerts/
+Disallow: /ads/preferences/
+Allow: /ads/preferences/html/
+Allow: /ads/preferences/plugin
+Disallow: /settings/ads/onweb/
+Disallow: /phone/compare/?
+Allow: /alerts/manage
+Disallow: /travel/clk
+Disallow: /hotelfinder/rpc
+Disallow: /flights/rpc
+Disallow: /commercesearch/services/
+Disallow: /evaluation/
+Disallow: /webstore/search
+Disallow: /chrome/browser/mobile/tour
+Sitemap: http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml
+Sitemap: http://www.google.com/hostednews/sitemap_index.xml
+Sitemap: http://www.google.com/ventures/sitemap_ventures.xml
+Sitemap: http://www.google.com/sitemaps_webmasters.xml
+Sitemap: http://www.gstatic.com/trends/websites/sitemaps/sitemapindex.xml
+Sitemap: http://www.gstatic.com/dictionary/static/sitemaps/sitemap_index.xml

data/spec/samples/sample_site/robots.txt ADDED Viewed

@@ -0,0 +1,13 @@
+# robots.txt for http://www.example.com/
+User-agent: cybermapper
+Disallow:
+User-agent: google
+Disallow: /
+User-agent: *
+Disallow: /forms
+Disallow: /gallery
+Disallow: /more.html
+Allow: /catalogs/about

data/spec/spec_helper.rb CHANGED Viewed

@@ -13,6 +13,17 @@ RSpec.configure do |config|
     config.filter_run_excluding :local_only => true
   end
+  config.before(:all) {
+    # START THIN SERVER TO HOST THE SAMPLE SITE FOR CRAWLING
+    @thin = nil
+    Thread.new do
+      @thin = Thin::Server.start("0.0.0.0", 3532, SampleServer.app)
+    end
+    # WAIT FOR START TO COMPLETE
+    sleep 1
+  }
   config.before(:each) {
     #redis_mock = double("redis")
@@ -40,10 +51,12 @@ RSpec.configure do |config|
     @mock_http_client = mock(Net::HTTP)
     @mock_http_request = mock(Net::HTTPRequest)
+    @mock_http_robot_request = mock(Net::HTTPRequest)
     @mock_http_redirect_request = mock(Net::HTTPRequest)
     @mock_http_redirect_request2 = mock(Net::HTTPRequest)
     @mock_http_response = mock(Net::HTTPResponse)
+    @mock_http_robot_response = mock(Net::HTTPResponse)
     @mock_http_redirect_response = mock(Net::HTTPRedirection)
     @mock_http_redirect_response2 = mock(Net::HTTPRedirection)
     @mock_http_get = mock(Net::HTTP::Get)
@@ -51,11 +64,13 @@ RSpec.configure do |config|
     Net::HTTP.stub!(:new).and_return(@mock_http_client)
     Net::HTTP::Get.stub!(:new).and_return(@mock_http_request)
     Net::HTTP::Get.stub!(:new).with("/redirect.html", {}).and_return(@mock_http_redirect_request)
+    Net::HTTP::Get.stub!(:new).with("/robots.txt", {}).and_return(@mock_http_robot_request)
     Net::HTTP::Get.stub!(:new).with("/redirect2.html", {}).and_return(@mock_http_redirect_request2)
     Net::HTTP::Head.stub!(:new).and_return(@mock_http_request)
     @mock_http_client.stub!(:request).with(@mock_http_request).and_return(@mock_http_response)
+    @mock_http_client.stub!(:request).with(@mock_http_robot_request).and_return(@mock_http_robot_response)
     @mock_http_client.stub!(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
     @mock_http_client.stub!(:request).with(@mock_http_redirect_request2).and_return(@mock_http_redirect_response2)
     @mock_http_client.stub!(:read_timeout=).and_return(nil)
@@ -64,6 +79,16 @@ RSpec.configure do |config|
     @mock_http_client.stub!(:address).and_return("www.baseurl.com")
     @mock_http_client.stub!(:port).and_return("80 ")
+    @mock_http_robot_response.stub!(:code).and_return(200)
+    @mock_http_robot_response.stub!(:body).and_return(File.open(File.dirname(__FILE__) + '/../spec/samples/robots.txt', "r").read)
+    @mock_http_robot_response.stub!(:content_type).and_return("text/plain")
+    @mock_http_robot_response.stub!(:[]).with("Content-Type").and_return(@default_headers["Content-Type"])
+    @mock_http_robot_response.stub!(:[]).with("location").and_return(@default_headers["location"])
+    @mock_http_robot_response.stub!(:[]).with("Content-Encoding").and_return(@default_headers["Content-Encoding"])
+    @mock_http_robot_response.stub!(:content_length).and_return(1024)
+    @mock_http_robot_response.stub!(:get_fields).with('set-cookie').and_return(["session=al98axx; expires=Fri, 31-Dec-1999 23:58:23", "query=rubyscript; expires=Fri, 31-Dec-1999 23:58:23"])
+    @mock_http_robot_response.stub!(:to_hash).and_return(@default_headers)
     @mock_http_response.stub!(:code).and_return(200)
     @mock_http_response.stub!(:content_type).and_return("text/html")
     @mock_http_response.stub!(:[]).with("Content-Type").and_return(@default_headers["Content-Type"])

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: cobweb
 version: !ruby/object:Gem::Version
-  version: 0.0.57
+  version: 0.0.58
   prerelease:
 platform: ruby
 authors:
@@ -9,11 +9,11 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-06-13 00:00:00.000000000 Z
+date: 2012-06-30 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: resque
-  requirement: &70194152226040 !ruby/object:Gem::Requirement
+  requirement: &70328776801460 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *70194152226040
+  version_requirements: *70328776801460
 - !ruby/object:Gem::Dependency
   name: redis
-  requirement: &70194152224380 !ruby/object:Gem::Requirement
+  requirement: &70328776799760 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *70194152224380
+  version_requirements: *70328776799760
 - !ruby/object:Gem::Dependency
   name: nokogiri
-  requirement: &70194152223560 !ruby/object:Gem::Requirement
+  requirement: &70328776798960 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *70194152223560
+  version_requirements: *70328776798960
 - !ruby/object:Gem::Dependency
   name: addressable
-  requirement: &70194152222500 !ruby/object:Gem::Requirement
+  requirement: &70328776797840 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *70194152222500
+  version_requirements: *70328776797840
 - !ruby/object:Gem::Dependency
   name: rspec
-  requirement: &70194152220920 !ruby/object:Gem::Requirement
+  requirement: &70328776796300 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *70194152220920
+  version_requirements: *70328776796300
 - !ruby/object:Gem::Dependency
   name: awesome_print
-  requirement: &70194152236080 !ruby/object:Gem::Requirement
+  requirement: &70328776811560 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *70194152236080
+  version_requirements: *70328776811560
 - !ruby/object:Gem::Dependency
   name: sinatra
-  requirement: &70194152235480 !ruby/object:Gem::Requirement
+  requirement: &70328776810940 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *70194152235480
+  version_requirements: *70328776810940
 - !ruby/object:Gem::Dependency
   name: thin
-  requirement: &70194152234920 !ruby/object:Gem::Requirement
+  requirement: &70328776810380 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *70194152234920
+  version_requirements: *70328776810380
 - !ruby/object:Gem::Dependency
   name: haml
-  requirement: &70194152234340 !ruby/object:Gem::Requirement
+  requirement: &70328776809840 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -109,10 +109,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *70194152234340
+  version_requirements: *70328776809840
 - !ruby/object:Gem::Dependency
   name: namespaced_redis
-  requirement: &70194152233680 !ruby/object:Gem::Requirement
+  requirement: &70328776809160 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -120,7 +120,7 @@ dependencies:
         version: 1.0.2
   type: :runtime
   prerelease: false
-  version_requirements: *70194152233680
+  version_requirements: *70328776809160
 description: Cobweb is a web crawler that can use resque to cluster crawls to quickly
   crawl extremely large sites which is much more perofmant than multi-threaded crawlers.  It
   is also a standalone crawler that has a sophisticated statistics monitoring interface
@@ -136,6 +136,8 @@ files:
 - spec/cobweb/cobweb_links_spec.rb
 - spec/cobweb/cobweb_spec.rb
 - spec/cobweb/content_link_parser_spec.rb
+- spec/cobweb/robots_spec.rb
+- spec/samples/robots.txt
 - spec/samples/sample_html_links.html
 - spec/samples/sample_server.rb
 - spec/samples/sample_site/boxgrid.html
@@ -293,6 +295,7 @@ files:
 - spec/samples/sample_site/js/superfish.js
 - spec/samples/sample_site/js/supersubs.js
 - spec/samples/sample_site/more.html
+- spec/samples/sample_site/robots.txt
 - spec/samples/sample_site/tables.html
 - spec/samples/sample_site/text/museosans-webfont.eot
 - spec/samples/sample_site/text/museosans-webfont.svg