RubyGems - rdaneel - Versions diffs - 0.1.3 → 0.2.2 - Mend

rdaneel 0.1.3 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

data/.gitignore +2 -0
data/Rakefile +20 -18
data/VERSION +1 -1
data/features/get_with_cache.feature +67 -0
data/features/get_without_cache.feature +155 -0
data/features/step_definitions/rdaneel_steps.rb +79 -0
data/features/support/burrito.rb +69 -0
data/features/support/env.rb +22 -0
data/lib/rdaneel.rb +83 -19
data/spec/rdaneel_spec.rb +47 -0
data/spec/spec_helper.rb +0 -87
data/spec/streamed_content_spec.rb +1 -1
metadata +42 -12
data/spec/no_redirects_neither_robots_spec.rb +0 -130
data/spec/redirects_without_robots_spec.rb +0 -175
data/spec/using_cache_spec.rb +0 -46

data/.gitignore CHANGED Viewed

@@ -15,7 +15,9 @@ tmtags
 ## PROJECT::GENERAL
 coverage
+coverage.data
 rdoc
 pkg
 ## PROJECT::SPECIFIC

data/Rakefile CHANGED Viewed

@@ -10,9 +10,11 @@ begin
     gem.email = ["edgargonzalez@gmail.com", "anibalrojas@gmail.com"]
     gem.homepage = "http://github.com/hasmanydevelopers/RDaneel"
     gem.authors = ["Edgar Gonzalez", "Anibal Rojas"]
-    gem.add_dependency("em-http-request", ">= 0.2.10")
+    gem.add_dependency("em-http-request", ">= 0.2.11")
     gem.add_dependency('robot_rules', '>= 0.9.3')
     gem.add_development_dependency "rspec", ">= 1.2.9"
+    gem.add_development_dependency "cucumber", ">= 0.8.5"
+    gem.add_development_dependency "relevance-rcov", ">= 0.9.2.1"
     # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
   end
   Jeweler::GemcutterTasks.new
@@ -20,29 +22,29 @@ rescue LoadError
   puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
 end
+require 'cucumber/rake/task'
+Cucumber::Rake::Task.new(:features) do |t|
+  t.cucumber_opts = "--format pretty" # Any valid command line option can go here.
+  t.rcov = true
+  t.rcov_opts = %w{--exclude gems\/,spec\/,features\/ --aggregate coverage.data}
+end
 require 'spec/rake/spectask'
 Spec::Rake::SpecTask.new(:spec) do |spec|
   spec.libs << 'lib' << 'spec'
   spec.spec_files = FileList['spec/**/*_spec.rb']
-end
-Spec::Rake::SpecTask.new(:rcov) do |spec|
-  spec.libs << 'lib' << 'spec'
-  spec.pattern = 'spec/**/*_spec.rb'
   spec.rcov = true
+  spec.rcov_opts = %w{--exclude gems\/,spec\/,features\/ --aggregate coverage.data}
 end
-task :spec => :check_dependencies
-task :default => :spec
-require 'rake/rdoctask'
-Rake::RDocTask.new do |rdoc|
-  version = File.exist?('VERSION') ? File.read('VERSION') : ""
-  rdoc.rdoc_dir = 'rdoc'
-  rdoc.title = "rdaneel #{version}"
-  rdoc.rdoc_files.include('README*')
-  rdoc.rdoc_files.include('lib/**/*.rb')
+desc "Run both specs and features and generate aggregated coverage"
+task :all_tests do |t|
+  rm "coverage.data" if File.exist?("coverage.data")
+  Rake::Task['spec'].invoke
+  Rake::Task["features"].invoke
 end
+task :features => :check_dependencies
+task :spec     => :check_dependencies
+task :default  => :all_tests

data/VERSION CHANGED Viewed

	@@ -1 +1 @@
1	- 0.1.3
1	+ 0.2.2

data/features/get_with_cache.feature ADDED Viewed

@@ -0,0 +1,67 @@
+Feature: get a url using cache
+  In order to fetch content from internet
+  As a crawler
+  I want to get a url respecting robots.txt rules
+  Scenario: the url to fetch is redirected
+    Given a cache for RDaneel
+    And   a robots.txt that allows RDaneel
+    And   a HelloWorld url
+    And   a "/redirect_me" url that redirects 301 to "http://127.0.0.1:3210/redirect_me_again" url
+    And   a "/redirect_me_again" url that redirects 302 to "/hello_world" url
+    When  I get the "/redirect_me" url following a maximum of 3 redirects
+    Then  I should get the content for HelloWorld url
+    And   the http response code should be 200
+    And   I should get 2 redirects
+    And   The redirects sequence should be:
+      | http://127.0.0.1:3210/redirect_me       |
+      | http://127.0.0.1:3210/redirect_me_again |
+    And   The requests sequence should be:
+      | status | path               |
+      | 200    | /robots.txt        |
+      | 301    | /redirect_me       |
+      | 302    | /redirect_me_again |
+      | 200    | /hello_world       |
+    And    The cache for "http://127.0.0.1:3210/robots.txt" should be
+      """
+      User-agent: *
+      Disallow: /cgi-bin/
+      """
+  Scenario: a cached robots.txt exists denying RDaneel's user-agent
+    Given a cache for RDaneel
+    And   The cache for "http://127.0.0.1:3210/robots.txt" is:
+      """
+      User-agent: *
+      Disallow: /
+      """
+    And   a robots.txt that denies RDaneel
+    And   a HelloWorld url
+    When  I get the "/hello_world" url following a maximum of 1 redirects
+    Then  I should get a "Robots are not allowed" error
+    And   I should get 0 redirects
+    And   The requests should be empty
+  Scenario: the url to fetch is redirected to an unreachable server but a robots cache exists for this server allowing RDaneel
+    Given a cache for RDaneel
+    And   The cache for "http://127.0.0.1:3210/robots.txt" is:
+      """
+      User-agent: *
+      Disallow: /cgi-bin/
+      """
+    And   The cache for "http://127.0.0.1:3211/robots.txt" is:
+      """
+      User-agent: *
+      Disallow: /cgi-bin/
+      """
+    And   a "/redirect_me" url that redirects 301 to "http://127.0.0.1:3211/unreacheable" url
+    When  I get the "/redirect_me" url following a maximum of 3 redirects
+    Then  I should get a "An error occurred when fetching http://127.0.0.1:3211/unreacheable" error
+    And   I should get 1 redirects
+    And   The redirects sequence should be:
+      | http://127.0.0.1:3210/redirect_me       |
+    And   The requests sequence should be:
+      | status | path               |
+      | 301    | /redirect_me       |

data/features/get_without_cache.feature ADDED Viewed

@@ -0,0 +1,155 @@
+Feature: get a url without using cache
+  In order to fetch content from internet
+  As a crawler
+  I want to get a url respecting robots.txt rules
+  Scenario: a robots.txt exists allowing RDaneel's user-agent
+    Given a robots.txt that allows RDaneel
+    And   a HelloWorld url
+    When  I get the "/hello_world" url following a maximum of 1 redirects
+    Then  I should get the content for HelloWorld url
+    And   the http response code should be 200
+    And   I should get 0 redirects
+    And   The requests sequence should be:
+      | status | path         |
+      | 200    | /robots.txt  |
+      | 200    | /hello_world |
+  Scenario: a robots.txt exists denying RDaneel's user-agent
+    Given a robots.txt that denies RDaneel
+    And   a HelloWorld url
+    When  I get the "/hello_world" url following a maximum of 1 redirects
+    Then  I should get a "Robots are not allowed" error
+    And   I should get 0 redirects
+    And   The requests sequence should be:
+      | status | path         |
+      | 200    | /robots.txt  |
+  Scenario: the url to fetch is redirected
+    Given a robots.txt that allows RDaneel
+    And   a HelloWorld url
+    And   a "/redirect_me" url that redirects 301 to "http://127.0.0.1:3210/redirect_me_again" url
+    And   a "/redirect_me_again" url that redirects 302 to "/hello_world" url
+    When  I get the "/redirect_me" url following a maximum of 3 redirects
+    Then  I should get the content for HelloWorld url
+    And   the http response code should be 200
+    And   I should get 2 redirects
+    And   The redirects sequence should be:
+      | http://127.0.0.1:3210/redirect_me       |
+      | http://127.0.0.1:3210/redirect_me_again |
+    And   The requests sequence should be:
+      | status | path               |
+      | 200    | /robots.txt        |
+      | 301    | /redirect_me       |
+      | 200    | /robots.txt        |
+      | 302    | /redirect_me_again |
+      | 200    | /robots.txt        |
+      | 200    | /hello_world       |
+  Scenario: the url to fetch exceeds the maximum number of redirects specified
+    Given a robots.txt that allows RDaneel
+    And   a HelloWorld url
+    And   a "/redirect_me" url that redirects 301 to "http://127.0.0.1:3210/redirect_me_again" url
+    And   a "/redirect_me_again" url that redirects 302 to "/hello_world" url
+    When  I get the "/redirect_me" url following a maximum of 1 redirects
+    Then  I should get a "Exceeded maximum number of redirects: 1" error
+    And   I should get 1 redirects
+    And   The redirects sequence should be:
+      | http://127.0.0.1:3210/redirect_me       |
+    And   The requests sequence should be:
+      | status | path               |
+      | 200    | /robots.txt        |
+      | 301    | /redirect_me       |
+      | 200    | /robots.txt        |
+      | 302    | /redirect_me_again |
+  Scenario: the url to fetch has an infinite redirect
+    Given a robots.txt that allows RDaneel
+    And   a HelloWorld url
+    And   a "/redirect_me" url that redirects 302 to "/redirect_me_again" url
+    And   a "/redirect_me_again" url that redirects 302 to "/redirect_me" url
+    When  I get the "/redirect_me" url following a maximum of 2 redirects
+    Then  I should get a "Infinite redirect detected for: http://127.0.0.1:3210/redirect_me" error
+    And   I should get 2 redirects
+    And   The redirects sequence should be:
+      | http://127.0.0.1:3210/redirect_me       |
+      | http://127.0.0.1:3210/redirect_me_again |
+    And   The requests sequence should be:
+      | status | path               |
+      | 200    | /robots.txt        |
+      | 302    | /redirect_me       |
+      | 200    | /robots.txt        |
+      | 302    | /redirect_me_again |
+  Scenario: the url to fetch redirects to not found url
+    Given a robots.txt that allows RDaneel
+    And   a "/redirect_me" url that redirects 302 to "/not_found" url
+    When  I get the "/redirect_me" url following a maximum of 2 redirects
+    Then  I should get a "Not success neither redirect" error
+    And   I should get 1 redirects
+    And   The redirects sequence should be:
+      | http://127.0.0.1:3210/redirect_me |
+    And   The requests sequence should be:
+      | status | path               |
+      | 200    | /robots.txt        |
+      | 302    | /redirect_me       |
+      | 200    | /robots.txt        |
+      | 404    | /not_found         |
+  Scenario: robots.txt doesn't exist
+    Given a HelloWorld url
+    And   a "/redirect_me" url that redirects 301 to "http://127.0.0.1:3210/redirect_me_again" url
+    And   a "/redirect_me_again" url that redirects 302 to "/hello_world" url
+    When  I get the "/redirect_me" url following a maximum of 3 redirects
+    Then  I should get the content for HelloWorld url
+    And   the http response code should be 200
+    And   I should get 2 redirects
+    And   The redirects sequence should be:
+      | http://127.0.0.1:3210/redirect_me       |
+      | http://127.0.0.1:3210/redirect_me_again |
+    And   The requests sequence should be:
+      | status | path               |
+      | 404    | /robots.txt        |
+      | 301    | /redirect_me       |
+      | 404    | /robots.txt        |
+      | 302    | /redirect_me_again |
+      | 404    | /robots.txt        |
+      | 200    | /hello_world       |
+  Scenario: the url to fetch redirects to a malformed url (format handled by em-http-request)
+    Given a robots.txt that allows RDaneel
+    And   a "/redirect_me" url that redirects 302 to "http://malformed:url" url
+    When  I get the "/redirect_me" url following a maximum of 2 redirects
+    Then  I should get a "Location header format error" error
+    And   I should get 0 redirects
+    And   The requests sequence should be:
+      | status | path               |
+      | 200    | /robots.txt        |
+      | 302    | /redirect_me       |
+  Scenario: the url to fetch redirects to a malformed url (format not handled by em-http-request 0.2.10)
+    Given a robots.txt that allows RDaneel
+    And   a "/redirect_me" url that redirects 302 to "http:/malformed:url" url
+    When  I get the "/redirect_me" url following a maximum of 2 redirects
+    Then  I should get a "Location header format error" error
+    And   I should get 0 redirects
+    And   The requests sequence should be:
+      | status | path               |
+      | 200    | /robots.txt        |
+      | 302    | /redirect_me       |
+  Scenario: the url to fetch is redirected to an unreachable host:port
+    Given a robots.txt that allows RDaneel
+    And   a HelloWorld url
+    And   a "/redirect_me" url that redirects 301 to "http://127.0.0.1:3211/unreacheable" url
+    When  I get the "/redirect_me" url following a maximum of 3 redirects
+    Then  I should get a "An error occurred when fetching http://127.0.0.1:3211/unreacheable" error
+    And   I should get 1 redirects
+    And   The redirects sequence should be:
+      | http://127.0.0.1:3210/redirect_me       |
+    And   The requests sequence should be:
+      | status | path               |
+      | 200    | /robots.txt        |
+      | 301    | /redirect_me       |

data/features/step_definitions/rdaneel_steps.rb ADDED Viewed

@@ -0,0 +1,79 @@
+# Cucumber step definitions for the RDaneel features.
+# They drive the stub server held in $server and the RDaneel instance kept in @r.
+#
+# --- Given: server routes and robots cache setup ---
+# Serves a robots.txt whose only rule disallows /cgi-bin/.
+Given /^a robots\.txt that allows RDaneel$/ do
+  $server.mount(:path  => '/robots.txt', :status => 200,
+                :body  => "User-agent: *\nDisallow: /cgi-bin/")
+end
+# Serves a robots.txt that disallows every path for every user agent.
+Given /^a robots\.txt that denies RDaneel$/ do
+  $server.mount(:path  => '/robots.txt', :status => 200,
+                :body  => "User-agent: *\nDisallow: /")
+end
+# Serves /hello_world with a 200 status and the body "Hello World".
+Given /^a HelloWorld url$/ do
+  $server.mount(:path  => '/hello_world', :status => 200,
+                :body  => "Hello World")
+end
+# Mounts a path that answers with the given status and a Location header.
+Given /^a "([^"]*)" url that redirects (\d+) to "([^"]*)" url$/ do |url, status, redirected_to|
+  $server.mount(:path  => url, :status => status.to_i,
+                :location  => redirected_to)
+end
+# Gives RDaneel an empty hash to use as its robots.txt cache.
+Given /^a cache for RDaneel$/ do
+  RDaneel.robots_cache = {}
+end
+# Pre-populates the cache entry for robots_url with the given doc string.
+Given /^The cache for "([^"]*)" is:$/ do |robots_url, robots_file|
+  RDaneel.robots_cache[robots_url] = robots_file
+end
+# --- When: perform the request ---
+# Runs the EM loop around a single get; both the success and the failure
+# callback stop the loop, so the step returns once the request has finished.
+When /^I get the "([^"]*)" url following a maximum of (\d+) redirects$/ do |url, max_redirects|
+  EM.run do
+    @r = RDaneel.new("#{HOST}#{url}")
+    @r.callback do
+      EM.stop
+    end
+    @r.errback do
+      EM.stop
+    end
+    # NOTE(review): max_redirects is the raw regexp capture (a String) - confirm get converts it
+    @r.get(:redirects => max_redirects)
+  end
+end
+# --- Then: assertions on the finished request ---
+Then /^I should get the content for HelloWorld url$/ do
+  @r.http_client.response.should == "Hello World"
+end
+Then /^the http response code should be (\d+)$/ do |code|
+  @r.http_client.response_header.status.should == code.to_i
+end
+Then /^I should get (\d+) redirects$/ do |redirects_count|
+  @r.redirects.size.should == redirects_count.to_i
+end
+# Compares the table rows, in order, with the requests recorded by the stub server.
+Then /^The requests sequence should be:$/ do |expected_table|
+  expected_requests = []
+  expected_table.hashes.each do |hash|
+    expected_requests << {:status => hash[:status].to_i,
+                          :path => hash[:path]}
+  end
+  $server.requests.should == expected_requests
+end
+Then /^The requests should be empty$/ do
+  $server.requests.should be_empty
+end
+# The table has one url per row; flatten turns the rows into a plain list.
+Then /^The redirects sequence should be:$/ do |expected_redirects|
+  @r.redirects.should == expected_redirects.raw.flatten
+end
+Then /^I should get a "([^"]*)" error$/ do |error_message|
+  @r.error.should == error_message
+end
+Then /^The cache for "([^"]*)" should be$/ do |robots_url, robots_file|
+  RDaneel.robots_cache[robots_url].should == robots_file
+end

data/features/support/burrito.rb ADDED Viewed

@@ -0,0 +1,69 @@
+require 'socket'
+# Minimal HTTP stub server used by the cucumber features.
+# Canned responses are registered with #mount; every request served is
+# recorded in #requests as { :status, :path } so steps can assert on the
+# exact sequence of hits.
+class Burrito
+  STATUS_MESSAGES = {
+    200 => 'OK',
+    301 => 'Moved Permanently',
+    302 => 'Found',
+    404 => 'Not Found'
+  }.freeze
+  # Requests served since the last reset, in arrival order.
+  attr_reader :requests
+  def initialize
+    reset
+  end
+  # Registers (or replaces) the canned response for opts[:path].
+  # opts takes :path and :status, plus optional :body and :location.
+  def mount(opts)
+    @routes[opts[:path]] = { :status => opts[:status],
+                             :body => opts[:body],
+                             :location => opts[:location] }
+  end
+  # Forgets every mounted route and every recorded request.
+  def reset
+    @routes = {}
+    @requests = []
+  end
+  # Starts serving on 127.0.0.1:3210 in a background thread, one connection at a time.
+  def start
+    @thread = Thread.new do
+      webserver = TCPServer.new('127.0.0.1', 3210)
+      begin
+        while session = webserver.accept
+          begin
+            serve(session)
+          ensure
+            # always release the client socket, even when serving raised
+            session.close
+          end
+        end
+      ensure
+        # also runs when #shutdown terminates the thread, so the port is freed
+        webserver.close
+      end
+    end
+  end
+  # Stops the background thread; does nothing when the server was never started.
+  def shutdown
+    @thread.terminate if @thread
+  end
+  private
+  # Reads one request line from session, records the hit and writes the
+  # canned response. Paths that were never mounted get a 404 without body.
+  def serve(session)
+    request = session.gets
+    # the client closed the connection before sending a request line
+    return if request.nil?
+    path = "/#{request.gsub(/GET\ \//, '').gsub(/\ HTTP.*/, '').chomp}"
+    route = @routes[path] || { :status => 404 }
+    status = route[:status]
+    body = route[:body]
+    location = route[:location]
+    @requests.push( { :status => status, :path => path } )
+    response =  "HTTP/1.1 #{status} #{STATUS_MESSAGES[status]}\r\n"
+    response << "Server: burrito/0.0.1\r\n"
+    # Content-Length counts bytes, not characters
+    response << "Content-Length: #{ body ? body.bytesize : 0 }\r\n"
+    response << "Content-Type: text/plain\r\n" if body
+    response << "Location: #{location}\r\n" if location
+    response << "Connection: close\r\n"
+    response << "\r\n"
+    response << "#{body}" if body
+    session.print response
+  end
+end

data/features/support/env.rb ADDED Viewed

@@ -0,0 +1,22 @@
+# Cucumber support file: puts this directory and the gem's lib directory on
+# the load path, then loads RDaneel and the Burrito stub server.
+$LOAD_PATH.unshift(File.dirname(__FILE__))
+$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '../..', 'lib'))
+require 'rubygems'
+require 'rdaneel'
+require 'burrito'
+# Start the stub server only once, even if this file gets loaded again.
+unless $server
+  $server = Burrito.new
+  $server.start
+end
+# Base url of the stub server; the port matches the one hard-coded in Burrito#start.
+HOST = "http://127.0.0.1:3210"
+# Every scenario starts with no mounted routes, no recorded requests and no robots cache.
+Before do
+  $server.reset
+  RDaneel.robots_cache = nil
+end
+at_exit do
+  $server.shutdown
+end

data/lib/rdaneel.rb CHANGED Viewed

@@ -19,10 +19,12 @@ class RDaneel
   attr_accessor :uri
   attr_reader :error, :redirects, :http_client
-  def initialize(uri)
+  def initialize(uri,options = {})
     @uri = uri.kind_of?(Addressable::URI) ? uri : Addressable::URI::parse(uri)
     @uri.path = "/" if @uri.path.nil? || @uri.path == ""
     @redirects = []
+    @verbose = options[:verbose]
+    @hash = @uri.hash if @verbose
   end
   def get(opts = {})
@@ -37,91 +39,118 @@ class RDaneel
       if success?(h)
         @uri = current_uri if current_uri != @uri
         @http_client = h
+        verbose("Succeded fetching: #{current_uri}", h, :status, :response)
         succeed(self)
       elsif redirected?(h)
         if @redirects.size >= max_redirects
           @http_client = h
-          @error = "Exceeded maximum number of redirects"
+          @error = "Exceeded maximum number of redirects: #{max_redirects}"
+          verbose(@error, h, :status, :response)
           fail(self)
           return
         end
+        @redirects << current_uri.to_s
+        current_uri = redirect_url(h, current_uri)
         begin
-          @redirects << current_uri.to_s
-          current_uri = redirect_url(h, current_uri)
+          verbose("Redirected to: #{current_uri.to_s} from: #{@redirects[-1]}", h, :status, :response)
           if @redirects.include?(current_uri.to_s)
             @http_client = h
-            @error = "infinite redirect"
+            @error = "Infinite redirect detected for: #{current_uri.to_s}"
+            verbose(@error, h, :status, :response)
             fail(self)
             return
           end
           _get.call
-        rescue
+        rescue StandardError => se
           @http_client = h
-          @error = "mal formed redirected url"
+          @error = "Error trying to follow a redirect #{current_uri.to_s}: #{h.response_header.location}"
+          verbose(@error, h, :status, :response)
           fail(self)
         end
       else
         # other error
         @http_client = h
-        @error = "not success and not redirect"
+        @error = "Not success neither redirect"
+        verbose(@error, h, :status, :response)
         fail(self)
       end
     }
     _get = lambda {
       robots_url = robots_txt_url(current_uri)
       if robots_cache && robots_file = robots_cache[robots_url.to_s]
+        verbose("Found cached robots.txt:\n#{robots_cache[robots_url.to_s]} for: #{current_uri}")
         if robots_allowed?(robots_file, useragent, robots_url, current_uri)
+          verbose("Robots identified by user agent: #{useragent} are allowed to access: #{current_uri}")
           begin
             h = EM::HttpRequest.new(current_uri).get(options)
+            verbose("Started fetching: #{current_uri}",h,:request)
             h.callback(&_handle_uri_callback)
             h.errback {
               @http_client = h
-              @error = h.error
+              @error = error_message(h)
+              verbose("#{@error} for: #{current_uri}",h,:status,:response)
               fail(self)
             }
           rescue StandardError => se
             @http_client = EM::HttpClient.new("")
             @error = "#{se.message}\n#{se.backtrace.inspect}"
+            verbose("For: #{current_uri} something went wrong: #{@error}")
             fail(self)
           end
         else
           @http_client = EM::HttpClient.new("")
-          @error = "robots denied"
+          @error = "Robots are not allowed"
+          verbose("#{@error} to access: #{current_uri} when identified by user agent: #{useragent}")
           fail(self)
         end
       else
         robots_url = robots_txt_url(current_uri)
         robots = EM::HttpRequest.new(robots_url).get(:redirects => 2) # get the robots.txt following redirects
+        verbose("Started fetching robots.txt from: #{robots_url} for: #{current_uri}",robots,:request)
         robots.callback {
-          robots_file = robots.response
-          robots_cache[robots_url.to_s] = robots_file if robots_cache
+          if success?(robots)
+            robots_file = robots.response
+            verbose("Found robots.txt at #{robots_url}:\n#{robots_file}", robots, :status, :response)
+          else
+            robots_file = ''
+            verbose("Didn't find robots.txt at #{robots_url}", robots, :status, :response)
+          end
+          robots_cache[robots_txt_url(robots_url).to_s] = robots_file if robots_cache
           if robots_allowed?(robots_file, useragent, robots_url, current_uri)
+            verbose("Robots identified by user agent: #{useragent} are allowed to access: #{current_uri}")
             begin
               h = EM::HttpRequest.new(current_uri).get(options)
+              verbose("Started fetching: #{current_uri}",h,:request)
               h.callback(&_handle_uri_callback)
               h.errback {
                 @http_client = h
-                @error = h.error
+                @error = error_message(h)
+                verbose("#{@error} for: #{current_uri}", h, :status, :response)
                 fail(self)
               }
             rescue StandardError => se
               @http_client = EM::HttpClient.new("")
               @error = "#{se.message}\n#{se.backtrace.inspect}"
+              verbose("For: #{current_uri} something went wrong: #{@error}")
               fail(self)
             end
           else
             @http_client = EM::HttpClient.new("")
-            @error = "robots denied"
+            @error = "Robots are not allowed"
+            verbose("#{@error} to access: #{current_uri} when identified by user agent: #{useragent}")
             fail(self)
           end
         }
         robots.errback {
+          verbose("Failed to fetch robots.txt: from: #{robots_url} for: #{current_uri}", robots, :status, :response)
           robots_cache[robots_url.to_s] = "" if robots_cache
           h = EM::HttpRequest.new(current_uri).get(options)
+          verbose("Started fetching: #{current_uri}",h,:request)
           h.callback(&_handle_uri_callback)
           h.errback {
             @http_client = h
-            @error = h.error
+            @error = error_message(h)
+            verbose("#{@error} for: #{current_uri}", h, :status, :response)
             fail(self)
           }
         }
@@ -155,6 +184,14 @@ class RDaneel
     Addressable::URI.parse("http://#{location}/robots.txt")
   end
+  # Returns the error text to report for a failed request: the client's own
+  # error when it is present and non-empty, otherwise a generic message that
+  # names the uri being fetched. The result is also stored in @error.
+  def error_message(http_client)
+    @error = if http_client.error.nil? || http_client.error.empty?
+      "An error occurred when fetching #{http_client.uri.to_s}"
+    else
+      http_client.error
+    end
+  end
   def success?(http_client)
     http_client.response_header.status == 200
   end
@@ -164,10 +201,37 @@ class RDaneel
   end
   def redirect_url(http_client, u)
-    location = Addressable::URI.parse(http_client.response_header.location)
-    location = u.join(location) if location.relative?
-    location.path = "/" if location.path.nil? || location.path == ""
-    location
+    # em-http-request already handles the case when the redirect is relative, so at this
+    # point http_client.response_header.location is expected to hold an absolute, valid url.
+    # NOTE: an invalid url such as "http:/malformed:url" still parses successfully,
+    # and this method only parses the header; it does not check the host itself.
+    Addressable::URI.parse(http_client.response_header.location)
+  end
+  # Prints diagnostic output to stdout; does nothing unless @verbose is set.
+  # message - text to log; every line is printed separately with a '*' prefix.
+  # client  - http client whose details are dumped (required when args are given).
+  # args    - any of :status, :request, :response, selecting what to dump from client.
+  def verbose(message, client = nil, *args)
+    return unless @verbose
+    # String#each only exists on Ruby 1.8; each_line yields the same lines on 1.8 and later
+    message.to_s.each_line { |l| hashed_puts('*', l) }
+    args.each do |a|
+      case a
+        when :status
+          if client.response_header.status == 0
+            hashed_puts('< Status:', '0 (timeout)')
+          else
+            hashed_puts('< Status:', client.response_header.status)
+          end
+        when :request  # this is a options hash
+          headers = client.options[:head]
+          headers.each { |k,v| hashed_puts('>', "#{k}: #{v}") } if headers
+        when :response # this is an array
+          client.response_header.each { |r| hashed_puts('<', "#{r[0]}: #{r[1]}") }
+      end
+    end
   end
+  private
+  # Writes one log line to stdout, tagged with @hash and the current local time.
+  def hashed_puts( prefix, message )
+    # %M is minutes; the former %m printed the month number in the minutes position
+    $stdout.puts("[#{@hash}] [#{Time.now.strftime('%Y-%m-%d %H:%M:%S')}] #{prefix} #{message}")
+  end
 end

data/spec/rdaneel_spec.rb ADDED Viewed

@@ -0,0 +1,47 @@
+# Unit specs for RDaneel helpers that need no network access.
+# The helpers are invoked through send (presumably because they are not public - confirm).
+require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
+describe "RDaneel" do
+  # robots_txt_url: the expected url keeps the host, drops path and query,
+  # and only carries the port when it is not 80.
+  describe "robots_txt_url" do
+    before(:each) do
+      @rdaneel = RDaneel.new("http://127.0.0.1/anyurl")
+    end
+    it "should return the proper url when url don't has a port specified (80 implied)" do
+      url = Addressable::URI.parse("http://127.0.0.1/path/url?param1=value1&param2=value2")
+      @rdaneel.send(:robots_txt_url,url).to_s.should == "http://127.0.0.1/robots.txt"
+    end
+    it "should return the proper url when url has a port 80 specified" do
+      url = Addressable::URI.parse("http://127.0.0.1:80/path/url?param1=value1&param2=value2")
+      @rdaneel.send(:robots_txt_url,url).to_s.should == "http://127.0.0.1/robots.txt"
+    end
+    it "should return the proper url when url has a port different than 80" do
+      url = Addressable::URI.parse("http://127.0.0.1:8080/path/url?param1=value1&param2=value2")
+      @rdaneel.send(:robots_txt_url,url).to_s.should == "http://127.0.0.1:8080/robots.txt"
+    end
+  end
+  # robots_allowed?: a failure while parsing the rules must not block the fetch.
+  describe "robots_allowed?" do
+    before(:each) do
+      @rdaneel = RDaneel.new("http://127.0.0.1/anyurl")
+    end
+    describe "when an error happens parsing the robots rules" do
+      before(:each) do
+        # make every RobotRules.new return an instance whose parse always raises
+        @robot_rules = RobotRules.new("RDaneel")
+        @robot_rules.stub!(:parse).and_raise(StandardError)
+        RobotRules.stub!(:new).and_return(@robot_rules)
+      end
+      it "should return true" do #no matter the params
+        @rdaneel.send(:robots_allowed?, nil, nil, nil, nil).should be_true
+      end
+    end
+  end
+end

data/spec/spec_helper.rb CHANGED Viewed

@@ -3,91 +3,4 @@ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
 require 'rubygems'
 require 'rdaneel'
 require 'spec'
-require 'webrick'
-# keep webrick quiet
-class ::WEBrick::HTTPServer
-  def access_log(config, req, res)
-    # nop
-  end
-end
-class ::WEBrick::BasicLog
-  def log(level, data)
-    # nop
-  end
-end
-def locked_file
-  File.join(File.dirname(__FILE__),"server_lock-#{@__port}")
-end
-def server_setup(port=8080, &blk)
-  @__port = port
-  if @server.nil? and !File.exist?(locked_file)
-    File.open(locked_file,'w') {|f| f << 'locked' }
-    @server = WEBrick::HTTPServer.new :Port => port
-    blk.call(@server) if blk
-    queue = Queue.new # synchronize the thread startup to the main thread
-    @test_thread = Thread.new { queue << 1; @server.start }
-    # wait for the queue
-    value = queue.pop
-    if !value
-      STDERR.puts "Failed to startup test server!"
-      exit(1)
-    end
-    trap("INT"){server_shutdown}
-    at_exit{server_shutdown}
-  end
-end
-def server_shutdown
-  begin
-    if File.exist?(locked_file)
-      File.unlink locked_file
-      @server.shutdown unless @server.nil?
-      @server = nil
-    end
-  rescue Object => e
-    puts "Error #{__FILE__}:#{__LINE__}\n#{e.message}"
-  end
-end
-def mount(server, opts)
-  raise ":path is required" if opts[:path].nil?
-  raise ":status is required" if opts[:status].nil?
-  server.mount_proc( opts[:path],
-    lambda { |req, resp|
-             resp.status = opts[:status]
-             resp.body = opts[:body] unless opts[:body].nil?
-             resp['Location'] = opts[:location] unless opts[:location].nil?
-             opts[:block].call unless opts[:block].nil?
-           } )
-end
-def should_not_be_hit
-  should_be_hit( 0 )
-end
-def should_be_hit_once
-  should_be_hit( 1 )
-end
-def should_be_hit_twice
-  should_be_hit( 2 )
-end
-def should_be_hit( times = 1 )
-  l = lambda {}
-  m = l.should_receive(:call).exactly(times).times
-  return l
-end
-Spec::Runner.configure do |config|
-  config.before :suite do
-    puts "\e[4mThese specs could take a while, please be patience\e[0m"
-  end
-end

data/spec/streamed_content_spec.rb CHANGED Viewed

@@ -9,7 +9,7 @@ describe "RDaneel when the content is chunked (digg.com)" do
     it "should get the content" do
       EM.run do
-        r = RDaneel.new("http://digg.com")
+        r = RDaneel.new("http://digg.com/news")
         r.callback do
           r.http_client.response_header.status.should == 200
           r.http_client.response.should_not be_empty

metadata CHANGED Viewed

@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
   prerelease: false
   segments:
   - 0
-  - 1
-  - 3
-  version: 0.1.3
+  - 2
+  - 2
+  version: 0.2.2
 platform: ruby
 authors:
 - Edgar Gonzalez
@@ -15,7 +15,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2010-08-13 00:00:00 -04:30
+date: 2010-08-27 00:00:00 -04:30
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -28,8 +28,8 @@ dependencies:
         segments:
         - 0
         - 2
-        - 10
-        version: 0.2.10
+        - 11
+        version: 0.2.11
   type: :runtime
   version_requirements: *id001
 - !ruby/object:Gem::Dependency
@@ -60,6 +60,35 @@ dependencies:
         version: 1.2.9
   type: :development
   version_requirements: *id003
+- !ruby/object:Gem::Dependency
+  name: cucumber
+  prerelease: false
+  requirement: &id004 !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        segments:
+        - 0
+        - 8
+        - 5
+        version: 0.8.5
+  type: :development
+  version_requirements: *id004
+- !ruby/object:Gem::Dependency
+  name: relevance-rcov
+  prerelease: false
+  requirement: &id005 !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        segments:
+        - 0
+        - 9
+        - 2
+        - 1
+        version: 0.9.2.1
+  type: :development
+  version_requirements: *id005
 description: Add robots.txt support on top of em-http-request
 email:
 - edgargonzalez@gmail.com
@@ -78,13 +107,16 @@ files:
 - README.rdoc
 - Rakefile
 - VERSION
+- features/get_with_cache.feature
+- features/get_without_cache.feature
+- features/step_definitions/rdaneel_steps.rb
+- features/support/burrito.rb
+- features/support/env.rb
 - lib/rdaneel.rb
-- spec/no_redirects_neither_robots_spec.rb
-- spec/redirects_without_robots_spec.rb
+- spec/rdaneel_spec.rb
 - spec/spec.opts
 - spec/spec_helper.rb
 - spec/streamed_content_spec.rb
-- spec/using_cache_spec.rb
 has_rdoc: true
 homepage: http://github.com/hasmanydevelopers/RDaneel
 licenses: []
@@ -116,8 +148,6 @@ signing_key:
 specification_version: 3
 summary: Obey robots.txt on top of em-http-request (Asynchronous HTTP Client)
 test_files:
-- spec/using_cache_spec.rb
-- spec/no_redirects_neither_robots_spec.rb
 - spec/spec_helper.rb
-- spec/redirects_without_robots_spec.rb
 - spec/streamed_content_spec.rb
+- spec/rdaneel_spec.rb

data/spec/no_redirects_neither_robots_spec.rb DELETED Viewed

@@ -1,130 +0,0 @@
-require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
-describe "RDaneel when there are no redirects" do
-  let(:port) {8083}
-  describe "when a successfull status different than 200 is issued for robots.txt" do
-    it "should get the content ignoring the redirect"
-  end
-  describe "when a redirect other than 301 and 302 is issued for robots.txt" do
-    it "should get the content ignoring the redirect"
-  end
-  (301..302).each do |status|
-    describe "when robots.txt has been moved (http code #{status})" do
-      before(:each) do
-        server_setup(port+status) do |server|
-          mount(server, :path  => '/hello_world', :status => 200,
-                        :body  => 'Hello World!', :block  => should_be_hit_once )
-          mount(server, :path  => '/robots.txt',  :status => status,
-                        :location => "http://127.0.0.1:#{port+status}/golems.txt",
-                        :block => should_be_hit_once )
-          mount(server, :path  => '/golems.txt',  :status => 200,
-                        :block => should_be_hit_once )
-        end
-      end
-      after(:each) do
-        server_shutdown
-      end
-      it "should get the redirected robots.txt and the content" do
-        EM.run do
-          r = RDaneel.new("http://127.0.0.1:#{port+status}/hello_world")
-          r.callback do
-            r.http_client.response_header.status.should == 200
-            r.http_client.response.should == "Hello World!"
-            r.redirects.should be_empty
-            EM.stop
-          end
-          r.errback do
-            fail
-            EM.stop
-          end
-          r.get
-        end
-      end
-    end
-  end
-  (400..417).each do |status|
-    describe "when there is a CLIENT error #{status} associated to robots.txt" do
-      before(:each) do
-        server_setup(port+status) do |server|
-          mount(server, :path  => '/hello_world', :status => 200,
-                        :body  => 'Hello World!', :block  => should_be_hit_once )
-          mount(server, :path  => '/robots.txt',  :status => status,
-                        :block => should_be_hit_once )
-        end
-      end
-      after(:each) do
-        server_shutdown
-      end
-      it "should get the content" do
-        EM.run do
-          r = RDaneel.new("http://127.0.0.1:#{port+status}/hello_world")
-          r.callback do
-            r.http_client.response_header.status.should == 200
-            r.http_client.response.should == "Hello World!"
-            r.redirects.should be_empty
-            EM.stop
-          end
-          r.errback do
-            fail
-            EM.stop
-          end
-          r.get
-        end
-      end
-    end
-  end
-  (500..505).each do |status|
-    describe "when there is a SERVER error #{status} associated to robots.txt" do
-      before(:each) do
-        server_setup(port+status) do |server|
-          mount(server, :path  => '/hello_world', :status => 200,
-                        :body  => 'Hello World!', :block  => should_be_hit_once )
-          mount(server, :path  => '/robots.txt',  :status => status,
-                        :block => should_be_hit_once )
-        end
-      end
-      after (:each) do
-        server_shutdown
-      end
-      it "should get the content" do
-        EM.run do
-          r = RDaneel.new("http://127.0.0.1:#{port+status}/hello_world")
-          r.callback do
-            r.http_client.response_header.status.should == 200
-            r.http_client.response.should == "Hello World!"
-            r.redirects.should be_empty
-            EM.stop
-          end
-          r.errback do
-            fail
-            EM.stop
-          end
-          r.get
-        end
-      end
-    end
-  end
-end

data/spec/redirects_without_robots_spec.rb DELETED Viewed

@@ -1,175 +0,0 @@
-require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
-describe "RDaneel when there are redirects" do
-  let(:port) {8081}
-  describe "when there is no robots.txt in the host (ONLY one host)" do
-    describe "when no redirection limit has been set" do
-      before(:each) do
-        server_setup(port) do |server|
-          mount(server, :path  => '/robots.txt',  :status => 404,
-                        :block => should_be_hit_once )
-          mount(server, :path  => '/redirect_me', :status => 301,
-                        :location  => "http://127.0.0.1:#{port}/hello_world",
-                        :block  => should_be_hit_once )
-          mount(server, :path  => '/hello_world', :status => 200,
-                        :body  => 'Hello World!',
-                        :block  => should_not_be_hit )
-        end
-      end
-      after(:each) do
-        server_shutdown
-      end
-      it "should not follow redirects" do
-        EM.run do
-          r = RDaneel.new("http://127.0.0.1:#{port}/redirect_me")
-          r.callback do
-            fail
-            EM.stop
-          end
-          r.errback do
-            r.redirects.should be_empty
-            r.error.should == "Exceeded maximum number of redirects"
-            EM.stop
-          end
-          r.get
-        end
-      end
-    end
-    describe "when a maximum number or redirects is set" do
-      describe "when there are less redirects than the maximum specified" do
-        before(:each) do
-          server_setup(port) do |server|
-            mount(server, :path  => '/robots.txt',  :status => 404,
-                          :block => should_be_hit(3) )
-            mount(server, :path  => '/redirect_me', :status => 301,
-                          :location  => "http://127.0.0.1:#{port}/redirect_me_again",
-                          :block  => should_be_hit_once )
-            mount(server, :path  => '/redirect_me_again', :status => 301,
-                          :location  => "http://127.0.0.1:#{port}/hello_world",
-                          :block  => should_be_hit_once )
-            mount(server, :path  => '/hello_world', :status => 200,
-                          :body  => 'Hello World!',
-                          :block  => should_be_hit_once )
-          end
-        end
-        after(:each) do
-          server_shutdown
-        end
-        it "should get the content following all the redirects" do
-          EM.run do
-            r = RDaneel.new("http://127.0.0.1:#{port}/redirect_me")
-            r.callback do
-              r.http_client.response_header.status.should == 200
-              r.http_client.response.should == "Hello World!"
-              r.redirects.should == [ "http://127.0.0.1:#{port}/redirect_me",
-                                      "http://127.0.0.1:#{port}/redirect_me_again"]
-              r.uri.to_s.should == "http://127.0.0.1:#{port}/hello_world"
-              EM.stop
-            end
-            r.errback do
-              fail
-              EM.stop
-            end
-            r.get(:redirects => 3)
-          end
-        end
-      end
-      describe "when there are as many redirects as the maximum" do
-        before(:each) do
-          server_setup(port) do |server|
-            mount(server, :path  => '/robots.txt',  :status => 404,
-                          :block => should_be_hit_twice )
-            mount(server, :path  => '/redirect_me', :status => 301,
-                          :location  => "http://127.0.0.1:#{port}/hello_world",
-                          :block  => should_be_hit_once )
-            mount(server, :path  => '/hello_world', :status => 200,
-                          :body  => 'Hello World!',
-                          :block  => should_be_hit_once )
-          end
-        end
-        after(:each) do
-          server_shutdown
-        end
-        it "should get the content following all the redirects" do
-          EM.run do
-            r = RDaneel.new("http://127.0.0.1:#{port}/redirect_me")
-            r.callback do
-              r.http_client.response_header.status.should == 200
-              r.http_client.response.should == "Hello World!"
-              r.redirects.should == ["http://127.0.0.1:#{port}/redirect_me"]
-              r.uri.to_s.should == "http://127.0.0.1:#{port}/hello_world"
-              EM.stop
-            end
-            r.errback do
-              fail
-              EM.stop
-            end
-            r.get(:redirects => 1)
-          end
-        end
-      end
-      describe "when the number of redirects exceed the maximum specified" do
-        before(:each) do
-          server_setup(port) do |server|
-            mount(server, :path  => '/robots.txt',  :status => 404,
-                          :block => should_be_hit_twice )
-            mount(server, :path  => '/redirect_me', :status => 301,
-                          :location  => "http://127.0.0.1:#{port}/redirect_me_again",
-                          :block  => should_be_hit_once )
-            mount(server, :path  => '/redirect_me_again', :status => 301,
-                          :location  => "http://127.0.0.1:#{port}/hello_world",
-                          :block  => should_be_hit_once )
-            mount(server, :path  => '/hello_world', :status => 200,
-                          :body  => 'Hello World!',
-                          :block  => should_not_be_hit )
-          end
-        end
-        after(:each) do
-          server_shutdown
-        end
-        it "should stop following redirects once the  maximum specified is reached" do
-          EM.run do
-            r = RDaneel.new("http://127.0.0.1:#{port}/redirect_me")
-            r.callback do
-              fail
-              EM.stop
-            end
-            r.errback do
-              r.redirects.should == ["http://127.0.0.1:#{port}/redirect_me"]
-              r.error.should == "Exceeded maximum number of redirects"
-              EM.stop
-            end
-            r.get(:redirects => 1)
-          end
-        end
-      end
-    end
-  end
-end

data/spec/using_cache_spec.rb DELETED Viewed

@@ -1,46 +0,0 @@
-require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
-describe "RDaneel when there is a cache" do
-  let(:port) {8082}
-  describe "when there is no robots.txt in the host" do
-    before(:each) do
-      RDaneel.robots_cache = {}
-      server_setup(port) do |server|
-        mount(server, :path  => '/robots.txt',  :status => 404,
-                      :block => should_be_hit_once )
-        mount(server, :path  => '/redirect_me', :status => 301,
-                      :location  => "http://127.0.0.1:#{port}/hello_world",
-                      :block  => should_be_hit_once )
-        mount(server, :path  => '/hello_world', :status => 200,
-                      :body  => 'Hello World!',
-                      :block  => should_be_hit_once )
-      end
-    end
-    after(:each) do
-      server_shutdown
-    end
-    it "should try to get the robots.txt just once" do
-      EM.run do
-        r = RDaneel.new("http://127.0.0.1:#{port}/redirect_me")
-        r.callback do
-          r.http_client.response_header.status.should == 200
-          r.http_client.response.should == "Hello World!"
-          r.redirects.should == [ "http://127.0.0.1:#{port}/redirect_me"]
-          r.uri.to_s.should == "http://127.0.0.1:#{port}/hello_world"
-          EM.stop
-        end
-        r.errback do
-          fail
-          EM.stop
-        end
-        r.get(:redirects => 3)
-      end
-    end
-  end
-end