rdaneel 0.1.3 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore CHANGED
@@ -15,7 +15,9 @@ tmtags
15
15
 
16
16
  ## PROJECT::GENERAL
17
17
  coverage
18
+ coverage.data
18
19
  rdoc
19
20
  pkg
20
21
 
21
22
  ## PROJECT::SPECIFIC
23
+
data/Rakefile CHANGED
@@ -10,9 +10,11 @@ begin
10
10
  gem.email = ["edgargonzalez@gmail.com", "anibalrojas@gmail.com"]
11
11
  gem.homepage = "http://github.com/hasmanydevelopers/RDaneel"
12
12
  gem.authors = ["Edgar Gonzalez", "Anibal Rojas"]
13
- gem.add_dependency("em-http-request", ">= 0.2.10")
13
+ gem.add_dependency("em-http-request", ">= 0.2.11")
14
14
  gem.add_dependency('robot_rules', '>= 0.9.3')
15
15
  gem.add_development_dependency "rspec", ">= 1.2.9"
16
+ gem.add_development_dependency "cucumber", ">= 0.8.5"
17
+ gem.add_development_dependency "relevance-rcov", ">= 0.9.2.1"
16
18
  # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
17
19
  end
18
20
  Jeweler::GemcutterTasks.new
@@ -20,29 +22,29 @@ rescue LoadError
20
22
  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
21
23
  end
22
24
 
25
+ require 'cucumber/rake/task'
26
+ Cucumber::Rake::Task.new(:features) do |t|
27
+ t.cucumber_opts = "--format pretty" # Any valid command line option can go here.
28
+ t.rcov = true
29
+ t.rcov_opts = %w{--exclude gems\/,spec\/,features\/ --aggregate coverage.data}
30
+ end
31
+
23
32
  require 'spec/rake/spectask'
24
33
  Spec::Rake::SpecTask.new(:spec) do |spec|
25
34
  spec.libs << 'lib' << 'spec'
26
35
  spec.spec_files = FileList['spec/**/*_spec.rb']
27
- end
28
-
29
- Spec::Rake::SpecTask.new(:rcov) do |spec|
30
- spec.libs << 'lib' << 'spec'
31
- spec.pattern = 'spec/**/*_spec.rb'
32
36
  spec.rcov = true
37
+ spec.rcov_opts = %w{--exclude gems\/,spec\/,features\/ --aggregate coverage.data}
33
38
  end
34
39
 
35
- task :spec => :check_dependencies
36
-
37
- task :default => :spec
38
-
39
- require 'rake/rdoctask'
40
- Rake::RDocTask.new do |rdoc|
41
- version = File.exist?('VERSION') ? File.read('VERSION') : ""
42
-
43
- rdoc.rdoc_dir = 'rdoc'
44
- rdoc.title = "rdaneel #{version}"
45
- rdoc.rdoc_files.include('README*')
46
- rdoc.rdoc_files.include('lib/**/*.rb')
40
+ desc "Run both specs and features and generate aggregated coverage"
41
+ task :all_tests do |t|
42
+ rm "coverage.data" if File.exist?("coverage.data")
43
+ Rake::Task['spec'].invoke
44
+ Rake::Task["features"].invoke
47
45
  end
48
46
 
47
+ task :features => :check_dependencies
48
+ task :spec => :check_dependencies
49
+ task :default => :all_tests
50
+
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.3
1
+ 0.2.2
@@ -0,0 +1,67 @@
1
+ Feature: get a url using cache
2
+ In order to fetch content from internet
3
+ As a crawler
4
+ I want to get a url respecting robots.txt rules
5
+
6
+ Scenario: the url to fetch is redirected
7
+ Given a cache for RDaneel
8
+ And a robots.txt that allows RDaneel
9
+ And a HelloWorld url
10
+ And a "/redirect_me" url that redirects 301 to "http://127.0.0.1:3210/redirect_me_again" url
11
+ And a "/redirect_me_again" url that redirects 302 to "/hello_world" url
12
+ When I get the "/redirect_me" url following a maximum of 3 redirects
13
+ Then I should get the content for HelloWorld url
14
+ And the http response code should be 200
15
+ And I should get 2 redirects
16
+ And The redirects sequence should be:
17
+ | http://127.0.0.1:3210/redirect_me |
18
+ | http://127.0.0.1:3210/redirect_me_again |
19
+ And The requests sequence should be:
20
+ | status | path |
21
+ | 200 | /robots.txt |
22
+ | 301 | /redirect_me |
23
+ | 302 | /redirect_me_again |
24
+ | 200 | /hello_world |
25
+ And The cache for "http://127.0.0.1:3210/robots.txt" should be
26
+ """
27
+ User-agent: *
28
+ Disallow: /cgi-bin/
29
+ """
30
+
31
+ Scenario: a cached robots.txt exists denying RDaneel's user-agent
32
+ Given a cache for RDaneel
33
+ And The cache for "http://127.0.0.1:3210/robots.txt" is:
34
+ """
35
+ User-agent: *
36
+ Disallow: /
37
+ """
38
+ And a robots.txt that denies RDaneel
39
+ And a HelloWorld url
40
+ When I get the "/hello_world" url following a maximum of 1 redirects
41
+ Then I should get a "Robots are not allowed" error
42
+ And I should get 0 redirects
43
+ And The requests should be empty
44
+
45
+
46
+ Scenario: the url to fetch is redirected to an unreachable server but a robots cache exists for this server allowing RDaneel
47
+ Given a cache for RDaneel
48
+ And The cache for "http://127.0.0.1:3210/robots.txt" is:
49
+ """
50
+ User-agent: *
51
+ Disallow: /cgi-bin/
52
+ """
53
+ And The cache for "http://127.0.0.1:3211/robots.txt" is:
54
+ """
55
+ User-agent: *
56
+ Disallow: /cgi-bin/
57
+ """
58
+ And a "/redirect_me" url that redirects 301 to "http://127.0.0.1:3211/unreacheable" url
59
+ When I get the "/redirect_me" url following a maximum of 3 redirects
60
+ Then I should get a "An error occurred when fetching http://127.0.0.1:3211/unreacheable" error
61
+ And I should get 1 redirects
62
+ And The redirects sequence should be:
63
+ | http://127.0.0.1:3210/redirect_me |
64
+ And The requests sequence should be:
65
+ | status | path |
66
+ | 301 | /redirect_me |
67
+
@@ -0,0 +1,155 @@
1
+ Feature: get a url without using cache
2
+ In order to fetch content from internet
3
+ As a crawler
4
+ I want to get a url respecting robots.txt rules
5
+
6
+ Scenario: a robots.txt exists allowing RDaneel's user-agent
7
+ Given a robots.txt that allows RDaneel
8
+ And a HelloWorld url
9
+ When I get the "/hello_world" url following a maximum of 1 redirects
10
+ Then I should get the content for HelloWorld url
11
+ And the http response code should be 200
12
+ And I should get 0 redirects
13
+ And The requests sequence should be:
14
+ | status | path |
15
+ | 200 | /robots.txt |
16
+ | 200 | /hello_world |
17
+
18
+ Scenario: a robots.txt exists denying RDaneel's user-agent
19
+ Given a robots.txt that denies RDaneel
20
+ And a HelloWorld url
21
+ When I get the "/hello_world" url following a maximum of 1 redirects
22
+ Then I should get a "Robots are not allowed" error
23
+ And I should get 0 redirects
24
+ And The requests sequence should be:
25
+ | status | path |
26
+ | 200 | /robots.txt |
27
+
28
+ Scenario: the url to fetch is redirected
29
+ Given a robots.txt that allows RDaneel
30
+ And a HelloWorld url
31
+ And a "/redirect_me" url that redirects 301 to "http://127.0.0.1:3210/redirect_me_again" url
32
+ And a "/redirect_me_again" url that redirects 302 to "/hello_world" url
33
+ When I get the "/redirect_me" url following a maximum of 3 redirects
34
+ Then I should get the content for HelloWorld url
35
+ And the http response code should be 200
36
+ And I should get 2 redirects
37
+ And The redirects sequence should be:
38
+ | http://127.0.0.1:3210/redirect_me |
39
+ | http://127.0.0.1:3210/redirect_me_again |
40
+ And The requests sequence should be:
41
+ | status | path |
42
+ | 200 | /robots.txt |
43
+ | 301 | /redirect_me |
44
+ | 200 | /robots.txt |
45
+ | 302 | /redirect_me_again |
46
+ | 200 | /robots.txt |
47
+ | 200 | /hello_world |
48
+
49
+ Scenario: the url to fetch exceeds the maximum number of redirects specified
50
+ Given a robots.txt that allows RDaneel
51
+ And a HelloWorld url
52
+ And a "/redirect_me" url that redirects 301 to "http://127.0.0.1:3210/redirect_me_again" url
53
+ And a "/redirect_me_again" url that redirects 302 to "/hello_world" url
54
+ When I get the "/redirect_me" url following a maximum of 1 redirects
55
+ Then I should get a "Exceeded maximum number of redirects: 1" error
56
+ And I should get 1 redirects
57
+ And The redirects sequence should be:
58
+ | http://127.0.0.1:3210/redirect_me |
59
+ And The requests sequence should be:
60
+ | status | path |
61
+ | 200 | /robots.txt |
62
+ | 301 | /redirect_me |
63
+ | 200 | /robots.txt |
64
+ | 302 | /redirect_me_again |
65
+
66
+ Scenario: the url to fetch has an infinite redirect
67
+ Given a robots.txt that allows RDaneel
68
+ And a HelloWorld url
69
+ And a "/redirect_me" url that redirects 302 to "/redirect_me_again" url
70
+ And a "/redirect_me_again" url that redirects 302 to "/redirect_me" url
71
+ When I get the "/redirect_me" url following a maximum of 2 redirects
72
+ Then I should get a "Infinite redirect detected for: http://127.0.0.1:3210/redirect_me" error
73
+ And I should get 2 redirects
74
+ And The redirects sequence should be:
75
+ | http://127.0.0.1:3210/redirect_me |
76
+ | http://127.0.0.1:3210/redirect_me_again |
77
+ And The requests sequence should be:
78
+ | status | path |
79
+ | 200 | /robots.txt |
80
+ | 302 | /redirect_me |
81
+ | 200 | /robots.txt |
82
+ | 302 | /redirect_me_again |
83
+
84
+ Scenario: the url to fetch redirects to not found url
85
+ Given a robots.txt that allows RDaneel
86
+ And a "/redirect_me" url that redirects 302 to "/not_found" url
87
+ When I get the "/redirect_me" url following a maximum of 2 redirects
88
+ Then I should get a "Not success neither redirect" error
89
+ And I should get 1 redirects
90
+ And The redirects sequence should be:
91
+ | http://127.0.0.1:3210/redirect_me |
92
+ And The requests sequence should be:
93
+ | status | path |
94
+ | 200 | /robots.txt |
95
+ | 302 | /redirect_me |
96
+ | 200 | /robots.txt |
97
+ | 404 | /not_found |
98
+
99
+
100
+ Scenario: robots.txt doesn't exist
101
+ Given a HelloWorld url
102
+ And a "/redirect_me" url that redirects 301 to "http://127.0.0.1:3210/redirect_me_again" url
103
+ And a "/redirect_me_again" url that redirects 302 to "/hello_world" url
104
+ When I get the "/redirect_me" url following a maximum of 3 redirects
105
+ Then I should get the content for HelloWorld url
106
+ And the http response code should be 200
107
+ And I should get 2 redirects
108
+ And The redirects sequence should be:
109
+ | http://127.0.0.1:3210/redirect_me |
110
+ | http://127.0.0.1:3210/redirect_me_again |
111
+ And The requests sequence should be:
112
+ | status | path |
113
+ | 404 | /robots.txt |
114
+ | 301 | /redirect_me |
115
+ | 404 | /robots.txt |
116
+ | 302 | /redirect_me_again |
117
+ | 404 | /robots.txt |
118
+ | 200 | /hello_world |
119
+
120
+ Scenario: the url to fetch redirects to a malformed url (format handled by em-http-request)
121
+ Given a robots.txt that allows RDaneel
122
+ And a "/redirect_me" url that redirects 302 to "http://malformed:url" url
123
+ When I get the "/redirect_me" url following a maximum of 2 redirects
124
+ Then I should get a "Location header format error" error
125
+ And I should get 0 redirects
126
+ And The requests sequence should be:
127
+ | status | path |
128
+ | 200 | /robots.txt |
129
+ | 302 | /redirect_me |
130
+
131
+ Scenario: the url to fetch redirects to a malformed url (format not handled by em-http-request 0.2.10)
132
+ Given a robots.txt that allows RDaneel
133
+ And a "/redirect_me" url that redirects 302 to "http:/malformed:url" url
134
+ When I get the "/redirect_me" url following a maximum of 2 redirects
135
+ Then I should get a "Location header format error" error
136
+ And I should get 0 redirects
137
+ And The requests sequence should be:
138
+ | status | path |
139
+ | 200 | /robots.txt |
140
+ | 302 | /redirect_me |
141
+
142
+ Scenario: the url to fetch is redirected to an unreachable host:port
143
+ Given a robots.txt that allows RDaneel
144
+ And a HelloWorld url
145
+ And a "/redirect_me" url that redirects 301 to "http://127.0.0.1:3211/unreacheable" url
146
+ When I get the "/redirect_me" url following a maximum of 3 redirects
147
+ Then I should get a "An error occurred when fetching http://127.0.0.1:3211/unreacheable" error
148
+ And I should get 1 redirects
149
+ And The redirects sequence should be:
150
+ | http://127.0.0.1:3210/redirect_me |
151
+ And The requests sequence should be:
152
+ | status | path |
153
+ | 200 | /robots.txt |
154
+ | 301 | /redirect_me |
155
+
@@ -0,0 +1,79 @@
1
+
2
+ Given /^a robots\.txt that allows RDaneel$/ do
3
+ $server.mount(:path => '/robots.txt', :status => 200,
4
+ :body => "User-agent: *\nDisallow: /cgi-bin/")
5
+ end
6
+
7
+ Given /^a robots\.txt that denies RDaneel$/ do
8
+ $server.mount(:path => '/robots.txt', :status => 200,
9
+ :body => "User-agent: *\nDisallow: /")
10
+ end
11
+
12
+ Given /^a HelloWorld url$/ do
13
+ $server.mount(:path => '/hello_world', :status => 200,
14
+ :body => "Hello World")
15
+ end
16
+
17
+ Given /^a "([^"]*)" url that redirects (\d+) to "([^"]*)" url$/ do |url, status, redirected_to|
18
+ $server.mount(:path => url, :status => status.to_i,
19
+ :location => redirected_to)
20
+ end
21
+
22
+ Given /^a cache for RDaneel$/ do
23
+ RDaneel.robots_cache = {}
24
+ end
25
+
26
+ Given /^The cache for "([^"]*)" is:$/ do |robots_url, robots_file|
27
+ RDaneel.robots_cache[robots_url] = robots_file
28
+ end
29
+
30
+ When /^I get the "([^"]*)" url following a maximum of (\d+) redirects$/ do |url, max_redirects|
31
+ EM.run do
32
+ @r = RDaneel.new("#{HOST}#{url}")
33
+ @r.callback do
34
+ EM.stop
35
+ end
36
+ @r.errback do
37
+ EM.stop
38
+ end
39
+ @r.get(:redirects => max_redirects)
40
+ end
41
+ end
42
+
43
+ Then /^I should get the content for HelloWorld url$/ do
44
+ @r.http_client.response.should == "Hello World"
45
+ end
46
+
47
+ Then /^the http response code should be (\d+)$/ do |code|
48
+ @r.http_client.response_header.status.should == code.to_i
49
+ end
50
+
51
+ Then /^I should get (\d+) redirects$/ do |redirects_count|
52
+ @r.redirects.size.should == redirects_count.to_i
53
+ end
54
+
55
+ Then /^The requests sequence should be:$/ do |expected_table|
56
+ expected_requests = []
57
+ expected_table.hashes.each do |hash|
58
+ expected_requests << {:status => hash[:status].to_i,
59
+ :path => hash[:path]}
60
+ end
61
+ $server.requests.should == expected_requests
62
+ end
63
+
64
+ Then /^The requests should be empty$/ do
65
+ $server.requests.should be_empty
66
+ end
67
+
68
+ Then /^The redirects sequence should be:$/ do |expected_redirects|
69
+ @r.redirects.should == expected_redirects.raw.flatten
70
+ end
71
+
72
+ Then /^I should get a "([^"]*)" error$/ do |error_message|
73
+ @r.error.should == error_message
74
+ end
75
+
76
+ Then /^The cache for "([^"]*)" should be$/ do |robots_url, robots_file|
77
+ RDaneel.robots_cache[robots_url].should == robots_file
78
+ end
79
+
@@ -0,0 +1,69 @@
1
+ require 'socket'
2
+
3
+ class Burrito
4
+
5
+ STATUS_MESSAGES = {
6
+ 200 => 'OK',
7
+ 301 => 'Moved Permanently',
8
+ 302 => 'Found',
9
+ 404 => 'Not Found'
10
+ }
11
+
12
+ attr_reader :requests
13
+
14
+ def initialize
15
+ @routes = {}
16
+ @requests = []
17
+ end
18
+
19
+ def mount(opts)
20
+ @routes[opts[:path]] = { :status => opts[:status],
21
+ :body => opts[:body],
22
+ :location => opts[:location] }
23
+ end
24
+
25
+ def reset
26
+ @routes = {}
27
+ @requests = []
28
+ end
29
+
30
+ def start
31
+ @thread = Thread.new do
32
+
33
+ webserver = TCPServer.new('127.0.0.1', 3210)
34
+
35
+ while session = webserver.accept
36
+ request = session.gets
37
+ path = '/' << request.gsub(/GET\ \//, '').gsub(/\ HTTP.*/, '').chomp
38
+ if @routes[path]
39
+ status = @routes[path][:status]
40
+ body = @routes[path][:body]
41
+ location = @routes[path][:location]
42
+ else
43
+ status = 404
44
+ body = nil
45
+ location = nil
46
+ end
47
+ @requests.push( { :status => status, :path => path } )
48
+ response = "HTTP/1.1 #{status} #{STATUS_MESSAGES[status]}\r\n"
49
+ response << "Server: burrito/0.0.1\r\n"
50
+ response << "Content-Length: #{ body ? body.length : 0 }\r\n"
51
+ response << "Content-Type: text/plain\r\n" if body
52
+ response << "Location: #{location}\r\n" if location
53
+ response << "Connection: close\r\n"
54
+ response << "\r\n"
55
+ response << "#{body}" if body
56
+ session.print response
57
+ session.close
58
+ end
59
+
60
+ end
61
+
62
+ end
63
+
64
+ def shutdown
65
+ @thread.terminate
66
+ end
67
+
68
+ end
69
+
@@ -0,0 +1,22 @@
1
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
2
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '../..', 'lib'))
3
+ require 'rubygems'
4
+ require 'rdaneel'
5
+ require 'burrito'
6
+
7
+ unless $server
8
+ $server = Burrito.new
9
+ $server.start
10
+ end
11
+
12
+ HOST = "http://127.0.0.1:3210"
13
+
14
+ Before do
15
+ $server.reset
16
+ RDaneel.robots_cache = nil
17
+ end
18
+
19
+ at_exit do
20
+ $server.shutdown
21
+ end
22
+
data/lib/rdaneel.rb CHANGED
@@ -19,10 +19,12 @@ class RDaneel
19
19
  attr_accessor :uri
20
20
  attr_reader :error, :redirects, :http_client
21
21
 
22
- def initialize(uri)
22
+ def initialize(uri,options = {})
23
23
  @uri = uri.kind_of?(Addressable::URI) ? uri : Addressable::URI::parse(uri)
24
24
  @uri.path = "/" if @uri.path.nil? || @uri.path == ""
25
25
  @redirects = []
26
+ @verbose = options[:verbose]
27
+ @hash = @uri.hash if @verbose
26
28
  end
27
29
 
28
30
  def get(opts = {})
@@ -37,91 +39,118 @@ class RDaneel
37
39
  if success?(h)
38
40
  @uri = current_uri if current_uri != @uri
39
41
  @http_client = h
42
+ verbose("Succeded fetching: #{current_uri}", h, :status, :response)
40
43
  succeed(self)
41
44
  elsif redirected?(h)
42
45
  if @redirects.size >= max_redirects
43
46
  @http_client = h
44
- @error = "Exceeded maximum number of redirects"
47
+ @error = "Exceeded maximum number of redirects: #{max_redirects}"
48
+ verbose(@error, h, :status, :response)
45
49
  fail(self)
46
50
  return
47
51
  end
52
+ @redirects << current_uri.to_s
53
+ current_uri = redirect_url(h, current_uri)
48
54
  begin
49
- @redirects << current_uri.to_s
50
- current_uri = redirect_url(h, current_uri)
55
+ verbose("Redirected to: #{current_uri.to_s} from: #{@redirects[-1]}", h, :status, :response)
51
56
  if @redirects.include?(current_uri.to_s)
52
57
  @http_client = h
53
- @error = "infinite redirect"
58
+ @error = "Infinite redirect detected for: #{current_uri.to_s}"
59
+ verbose(@error, h, :status, :response)
54
60
  fail(self)
55
61
  return
56
62
  end
57
63
  _get.call
58
- rescue
64
+ rescue StandardError => se
59
65
  @http_client = h
60
- @error = "mal formed redirected url"
66
+ @error = "Error trying to follow a redirect #{current_uri.to_s}: #{h.response_header.location}"
67
+ verbose(@error, h, :status, :response)
61
68
  fail(self)
62
69
  end
63
70
  else
64
71
  # other error
65
72
  @http_client = h
66
- @error = "not success and not redirect"
73
+ @error = "Not success neither redirect"
74
+ verbose(@error, h, :status, :response)
67
75
  fail(self)
68
76
  end
69
77
  }
70
78
  _get = lambda {
71
79
  robots_url = robots_txt_url(current_uri)
72
80
  if robots_cache && robots_file = robots_cache[robots_url.to_s]
81
+ verbose("Found cached robots.txt:\n#{robots_cache[robots_url.to_s]} for: #{current_uri}")
73
82
  if robots_allowed?(robots_file, useragent, robots_url, current_uri)
83
+ verbose("Robots identified by user agent: #{useragent} are allowed to access: #{current_uri}")
74
84
  begin
75
85
  h = EM::HttpRequest.new(current_uri).get(options)
86
+ verbose("Started fetching: #{current_uri}",h,:request)
76
87
  h.callback(&_handle_uri_callback)
77
88
  h.errback {
78
89
  @http_client = h
79
- @error = h.error
90
+ @error = error_message(h)
91
+ verbose("#{@error} for: #{current_uri}",h,:status,:response)
80
92
  fail(self)
81
93
  }
82
94
  rescue StandardError => se
83
95
  @http_client = EM::HttpClient.new("")
84
96
  @error = "#{se.message}\n#{se.backtrace.inspect}"
97
+ verbose("For: #{current_uri} something went wrong: #{@error}")
85
98
  fail(self)
86
99
  end
87
100
  else
88
101
  @http_client = EM::HttpClient.new("")
89
- @error = "robots denied"
102
+ @error = "Robots are not allowed"
103
+ verbose("#{@error} to access: #{current_uri} when identified by user agent: #{useragent}")
90
104
  fail(self)
91
105
  end
92
106
  else
93
107
  robots_url = robots_txt_url(current_uri)
94
108
  robots = EM::HttpRequest.new(robots_url).get(:redirects => 2) # get the robots.txt following redirects
109
+ verbose("Started fetching robots.txt from: #{robots_url} for: #{current_uri}",robots,:request)
95
110
  robots.callback {
96
- robots_file = robots.response
97
- robots_cache[robots_url.to_s] = robots_file if robots_cache
111
+ if success?(robots)
112
+ robots_file = robots.response
113
+ verbose("Found robots.txt at #{robots_url}:\n#{robots_file}", robots, :status, :response)
114
+ else
115
+ robots_file = ''
116
+ verbose("Didn't find robots.txt at #{robots_url}", robots, :status, :response)
117
+ end
118
+ robots_cache[robots_txt_url(robots_url).to_s] = robots_file if robots_cache
98
119
  if robots_allowed?(robots_file, useragent, robots_url, current_uri)
120
+ verbose("Robots identified by user agent: #{useragent} are allowed to access: #{current_uri}")
99
121
  begin
100
122
  h = EM::HttpRequest.new(current_uri).get(options)
123
+ verbose("Started fetching: #{current_uri}",h,:request)
101
124
  h.callback(&_handle_uri_callback)
102
125
  h.errback {
103
126
  @http_client = h
104
- @error = h.error
127
+ @error = error_message(h)
128
+ verbose("#{@error} for: #{current_uri}", h, :status, :response)
105
129
  fail(self)
106
130
  }
107
131
  rescue StandardError => se
108
132
  @http_client = EM::HttpClient.new("")
109
133
  @error = "#{se.message}\n#{se.backtrace.inspect}"
134
+ verbose("For: #{current_uri} something went wrong: #{@error}")
110
135
  fail(self)
111
136
  end
112
137
  else
113
138
  @http_client = EM::HttpClient.new("")
114
- @error = "robots denied"
139
+ @error = "Robots are not allowed"
140
+ verbose("#{@error} to access: #{current_uri} when identified by user agent: #{useragent}")
115
141
  fail(self)
116
142
  end
117
143
  }
118
144
  robots.errback {
145
+ verbose("Failed to fetch robots.txt: from: #{robots_url} for: #{current_uri}", robots, :status, :response)
119
146
  robots_cache[robots_url.to_s] = "" if robots_cache
120
147
  h = EM::HttpRequest.new(current_uri).get(options)
148
+ verbose("Started fetching: #{current_uri}",h,:request)
121
149
  h.callback(&_handle_uri_callback)
122
150
  h.errback {
123
151
  @http_client = h
124
- @error = h.error
152
+ @error = error_message(h)
153
+ verbose("#{@error} for: #{current_uri}", h, :status, :response)
125
154
  fail(self)
126
155
  }
127
156
  }
@@ -155,6 +184,14 @@ class RDaneel
155
184
  Addressable::URI.parse("http://#{location}/robots.txt")
156
185
  end
157
186
 
187
+ def error_message(http_client)
188
+ @error = if http_client.error.nil? || http_client.error.empty?
189
+ "An error occurred when fetching #{http_client.uri.to_s}"
190
+ else
191
+ http_client.error
192
+ end
193
+ end
194
+
158
195
  def success?(http_client)
159
196
  http_client.response_header.status == 200
160
197
  end
@@ -164,10 +201,37 @@ class RDaneel
164
201
  end
165
202
 
166
203
  def redirect_url(http_client, u)
167
- location = Addressable::URI.parse(http_client.response_header.location)
168
- location = u.join(location) if location.relative?
169
- location.path = "/" if location.path.nil? || location.path == ""
170
- location
204
+ # em-http-request handles the case when redirect is relative
205
+ # at this point http_client.response_header.location should always have an absolute and valid url
206
+ # but this invalid url is parsed successfully http:/malformed:url so we ask for host
207
+ Addressable::URI.parse(http_client.response_header.location)
208
+ end
209
+
210
+ def verbose(message, client = nil, *args)
211
+ return unless @verbose
212
+ message.each { |l| hashed_puts('*', l) }
213
+ args.each do |a|
214
+ case a
215
+ when :status
216
+ if client.response_header.status == 0
217
+ hashed_puts('< Status:', '0 (timeout)')
218
+ else
219
+ hashed_puts('< Status:', client.response_header.status)
220
+ end
221
+ when :request # this is a options hash
222
+ headers = client.options[:head]
223
+ headers.each { |k,v| hashed_puts('>', "#{k}: #{v}") } if headers
224
+ when :response # this is an array
225
+ client.response_header.each { |r| hashed_puts('<', "#{r[0]}: #{r[1]}") }
226
+ end
227
+ end
171
228
  end
229
+
230
+ private
231
+
232
+ def hashed_puts( prefix, message )
233
+ $stdout.puts("[#{@hash}] [#{Time.now.strftime('%Y-%m-%d %H:%m:%S')}] #{prefix} #{message}")
234
+ end
235
+
172
236
  end
173
237
 
@@ -0,0 +1,47 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
2
+
3
+ describe "RDaneel" do
4
+
5
+ describe "robots_txt_url" do
6
+ before(:each) do
7
+ @rdaneel = RDaneel.new("http://127.0.0.1/anyurl")
8
+ end
9
+
10
+ it "should return the proper url when url don't has a port specified (80 implied)" do
11
+ url = Addressable::URI.parse("http://127.0.0.1/path/url?param1=value1&param2=value2")
12
+ @rdaneel.send(:robots_txt_url,url).to_s.should == "http://127.0.0.1/robots.txt"
13
+ end
14
+
15
+ it "should return the proper url when url has a port 80 specified" do
16
+ url = Addressable::URI.parse("http://127.0.0.1:80/path/url?param1=value1&param2=value2")
17
+ @rdaneel.send(:robots_txt_url,url).to_s.should == "http://127.0.0.1/robots.txt"
18
+ end
19
+
20
+ it "should return the proper url when url has a port different than 80" do
21
+ url = Addressable::URI.parse("http://127.0.0.1:8080/path/url?param1=value1&param2=value2")
22
+ @rdaneel.send(:robots_txt_url,url).to_s.should == "http://127.0.0.1:8080/robots.txt"
23
+ end
24
+
25
+ end
26
+
27
+
28
+ describe "robots_allowed?" do
29
+ before(:each) do
30
+ @rdaneel = RDaneel.new("http://127.0.0.1/anyurl")
31
+ end
32
+
33
+ describe "when an error happens parsing the robots rules" do
34
+ before(:each) do
35
+ @robot_rules = RobotRules.new("RDaneel")
36
+ @robot_rules.stub!(:parse).and_raise(StandardError)
37
+ RobotRules.stub!(:new).and_return(@robot_rules)
38
+ end
39
+
40
+ it "should return true" do #no matter the params
41
+ @rdaneel.send(:robots_allowed?, nil, nil, nil, nil).should be_true
42
+ end
43
+ end
44
+ end
45
+
46
+ end
47
+
data/spec/spec_helper.rb CHANGED
@@ -3,91 +3,4 @@ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
3
3
  require 'rubygems'
4
4
  require 'rdaneel'
5
5
  require 'spec'
6
- require 'webrick'
7
-
8
- # keep webrick quiet
9
- class ::WEBrick::HTTPServer
10
- def access_log(config, req, res)
11
- # nop
12
- end
13
- end
14
- class ::WEBrick::BasicLog
15
- def log(level, data)
16
- # nop
17
- end
18
- end
19
-
20
- def locked_file
21
- File.join(File.dirname(__FILE__),"server_lock-#{@__port}")
22
- end
23
-
24
- def server_setup(port=8080, &blk)
25
- @__port = port
26
- if @server.nil? and !File.exist?(locked_file)
27
- File.open(locked_file,'w') {|f| f << 'locked' }
28
- @server = WEBrick::HTTPServer.new :Port => port
29
- blk.call(@server) if blk
30
- queue = Queue.new # synchronize the thread startup to the main thread
31
- @test_thread = Thread.new { queue << 1; @server.start }
32
-
33
- # wait for the queue
34
- value = queue.pop
35
-
36
- if !value
37
- STDERR.puts "Failed to startup test server!"
38
- exit(1)
39
- end
40
-
41
- trap("INT"){server_shutdown}
42
- at_exit{server_shutdown}
43
- end
44
- end
45
-
46
- def server_shutdown
47
- begin
48
- if File.exist?(locked_file)
49
- File.unlink locked_file
50
- @server.shutdown unless @server.nil?
51
- @server = nil
52
- end
53
- rescue Object => e
54
- puts "Error #{__FILE__}:#{__LINE__}\n#{e.message}"
55
- end
56
- end
57
-
58
- def mount(server, opts)
59
- raise ":path is required" if opts[:path].nil?
60
- raise ":status is required" if opts[:status].nil?
61
- server.mount_proc( opts[:path],
62
- lambda { |req, resp|
63
- resp.status = opts[:status]
64
- resp.body = opts[:body] unless opts[:body].nil?
65
- resp['Location'] = opts[:location] unless opts[:location].nil?
66
- opts[:block].call unless opts[:block].nil?
67
- } )
68
- end
69
-
70
- def should_not_be_hit
71
- should_be_hit( 0 )
72
- end
73
-
74
- def should_be_hit_once
75
- should_be_hit( 1 )
76
- end
77
-
78
- def should_be_hit_twice
79
- should_be_hit( 2 )
80
- end
81
-
82
- def should_be_hit( times = 1 )
83
- l = lambda {}
84
- m = l.should_receive(:call).exactly(times).times
85
- return l
86
- end
87
-
88
- Spec::Runner.configure do |config|
89
- config.before :suite do
90
- puts "\e[4mThese specs could take a while, please be patience\e[0m"
91
- end
92
- end
93
6
 
@@ -9,7 +9,7 @@ describe "RDaneel when the content is chunked (digg.com)" do
9
9
 
10
10
  it "should get the content" do
11
11
  EM.run do
12
- r = RDaneel.new("http://digg.com")
12
+ r = RDaneel.new("http://digg.com/news")
13
13
  r.callback do
14
14
  r.http_client.response_header.status.should == 200
15
15
  r.http_client.response.should_not be_empty
metadata CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
4
4
  prerelease: false
5
5
  segments:
6
6
  - 0
7
- - 1
8
- - 3
9
- version: 0.1.3
7
+ - 2
8
+ - 2
9
+ version: 0.2.2
10
10
  platform: ruby
11
11
  authors:
12
12
  - Edgar Gonzalez
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-08-13 00:00:00 -04:30
18
+ date: 2010-08-27 00:00:00 -04:30
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
@@ -28,8 +28,8 @@ dependencies:
28
28
  segments:
29
29
  - 0
30
30
  - 2
31
- - 10
32
- version: 0.2.10
31
+ - 11
32
+ version: 0.2.11
33
33
  type: :runtime
34
34
  version_requirements: *id001
35
35
  - !ruby/object:Gem::Dependency
@@ -60,6 +60,35 @@ dependencies:
60
60
  version: 1.2.9
61
61
  type: :development
62
62
  version_requirements: *id003
63
+ - !ruby/object:Gem::Dependency
64
+ name: cucumber
65
+ prerelease: false
66
+ requirement: &id004 !ruby/object:Gem::Requirement
67
+ requirements:
68
+ - - ">="
69
+ - !ruby/object:Gem::Version
70
+ segments:
71
+ - 0
72
+ - 8
73
+ - 5
74
+ version: 0.8.5
75
+ type: :development
76
+ version_requirements: *id004
77
+ - !ruby/object:Gem::Dependency
78
+ name: relevance-rcov
79
+ prerelease: false
80
+ requirement: &id005 !ruby/object:Gem::Requirement
81
+ requirements:
82
+ - - ">="
83
+ - !ruby/object:Gem::Version
84
+ segments:
85
+ - 0
86
+ - 9
87
+ - 2
88
+ - 1
89
+ version: 0.9.2.1
90
+ type: :development
91
+ version_requirements: *id005
63
92
  description: Add robots.txt support on top of em-http-request
64
93
  email:
65
94
  - edgargonzalez@gmail.com
@@ -78,13 +107,16 @@ files:
78
107
  - README.rdoc
79
108
  - Rakefile
80
109
  - VERSION
110
+ - features/get_with_cache.feature
111
+ - features/get_without_cache.feature
112
+ - features/step_definitions/rdaneel_steps.rb
113
+ - features/support/burrito.rb
114
+ - features/support/env.rb
81
115
  - lib/rdaneel.rb
82
- - spec/no_redirects_neither_robots_spec.rb
83
- - spec/redirects_without_robots_spec.rb
116
+ - spec/rdaneel_spec.rb
84
117
  - spec/spec.opts
85
118
  - spec/spec_helper.rb
86
119
  - spec/streamed_content_spec.rb
87
- - spec/using_cache_spec.rb
88
120
  has_rdoc: true
89
121
  homepage: http://github.com/hasmanydevelopers/RDaneel
90
122
  licenses: []
@@ -116,8 +148,6 @@ signing_key:
116
148
  specification_version: 3
117
149
  summary: Obey robots.txt on top of em-http-request (Asynchronous HTTP Client)
118
150
  test_files:
119
- - spec/using_cache_spec.rb
120
- - spec/no_redirects_neither_robots_spec.rb
121
151
  - spec/spec_helper.rb
122
- - spec/redirects_without_robots_spec.rb
123
152
  - spec/streamed_content_spec.rb
153
+ - spec/rdaneel_spec.rb
@@ -1,130 +0,0 @@
1
- require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
2
-
3
- describe "RDaneel when there are no redirects" do
4
-
5
- let(:port) {8083}
6
-
7
- describe "when a successfull status different than 200 is issued for robots.txt" do
8
- it "should get the content ignoring the redirect"
9
- end
10
-
11
- describe "when a redirect other than 301 and 302 is issued for robots.txt" do
12
- it "should get the content ignoring the redirect"
13
- end
14
-
15
- (301..302).each do |status|
16
-
17
- describe "when robots.txt has been moved (http code #{status})" do
18
- before(:each) do
19
- server_setup(port+status) do |server|
20
- mount(server, :path => '/hello_world', :status => 200,
21
- :body => 'Hello World!', :block => should_be_hit_once )
22
- mount(server, :path => '/robots.txt', :status => status,
23
- :location => "http://127.0.0.1:#{port+status}/golems.txt",
24
- :block => should_be_hit_once )
25
- mount(server, :path => '/golems.txt', :status => 200,
26
- :block => should_be_hit_once )
27
- end
28
- end
29
-
30
- after(:each) do
31
- server_shutdown
32
- end
33
-
34
- it "should get the redirected robots.txt and the content" do
35
- EM.run do
36
- r = RDaneel.new("http://127.0.0.1:#{port+status}/hello_world")
37
- r.callback do
38
- r.http_client.response_header.status.should == 200
39
- r.http_client.response.should == "Hello World!"
40
- r.redirects.should be_empty
41
- EM.stop
42
- end
43
- r.errback do
44
- fail
45
- EM.stop
46
- end
47
- r.get
48
- end
49
- end
50
-
51
- end
52
-
53
- end
54
-
55
- (400..417).each do |status|
56
-
57
- describe "when there is a CLIENT error #{status} associated to robots.txt" do
58
- before(:each) do
59
- server_setup(port+status) do |server|
60
- mount(server, :path => '/hello_world', :status => 200,
61
- :body => 'Hello World!', :block => should_be_hit_once )
62
- mount(server, :path => '/robots.txt', :status => status,
63
- :block => should_be_hit_once )
64
- end
65
- end
66
-
67
- after(:each) do
68
- server_shutdown
69
- end
70
-
71
- it "should get the content" do
72
- EM.run do
73
- r = RDaneel.new("http://127.0.0.1:#{port+status}/hello_world")
74
- r.callback do
75
- r.http_client.response_header.status.should == 200
76
- r.http_client.response.should == "Hello World!"
77
- r.redirects.should be_empty
78
- EM.stop
79
- end
80
- r.errback do
81
- fail
82
- EM.stop
83
- end
84
- r.get
85
- end
86
- end
87
-
88
- end
89
-
90
- end
91
-
92
- (500..505).each do |status|
93
-
94
- describe "when there is a SERVER error #{status} associated to robots.txt" do
95
- before(:each) do
96
- server_setup(port+status) do |server|
97
- mount(server, :path => '/hello_world', :status => 200,
98
- :body => 'Hello World!', :block => should_be_hit_once )
99
- mount(server, :path => '/robots.txt', :status => status,
100
- :block => should_be_hit_once )
101
- end
102
- end
103
-
104
- after (:each) do
105
- server_shutdown
106
- end
107
-
108
- it "should get the content" do
109
- EM.run do
110
- r = RDaneel.new("http://127.0.0.1:#{port+status}/hello_world")
111
- r.callback do
112
- r.http_client.response_header.status.should == 200
113
- r.http_client.response.should == "Hello World!"
114
- r.redirects.should be_empty
115
- EM.stop
116
- end
117
- r.errback do
118
- fail
119
- EM.stop
120
- end
121
- r.get
122
- end
123
- end
124
-
125
- end
126
-
127
- end
128
-
129
- end
130
-
@@ -1,175 +0,0 @@
1
- require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
2
-
3
- describe "RDaneel when there are redirects" do
4
-
5
- let(:port) {8081}
6
-
7
- describe "when there is no robots.txt in the host (ONLY one host)" do
8
-
9
- describe "when no redirection limit has been set" do
10
- before(:each) do
11
- server_setup(port) do |server|
12
- mount(server, :path => '/robots.txt', :status => 404,
13
- :block => should_be_hit_once )
14
- mount(server, :path => '/redirect_me', :status => 301,
15
- :location => "http://127.0.0.1:#{port}/hello_world",
16
- :block => should_be_hit_once )
17
- mount(server, :path => '/hello_world', :status => 200,
18
- :body => 'Hello World!',
19
- :block => should_not_be_hit )
20
- end
21
- end
22
-
23
- after(:each) do
24
- server_shutdown
25
- end
26
-
27
- it "should not follow redirects" do
28
- EM.run do
29
- r = RDaneel.new("http://127.0.0.1:#{port}/redirect_me")
30
- r.callback do
31
- fail
32
- EM.stop
33
- end
34
- r.errback do
35
- r.redirects.should be_empty
36
- r.error.should == "Exceeded maximum number of redirects"
37
- EM.stop
38
- end
39
- r.get
40
- end
41
-
42
- end
43
-
44
- end
45
-
46
- describe "when a maximum number or redirects is set" do
47
-
48
- describe "when there are less redirects than the maximum specified" do
49
- before(:each) do
50
- server_setup(port) do |server|
51
- mount(server, :path => '/robots.txt', :status => 404,
52
- :block => should_be_hit(3) )
53
- mount(server, :path => '/redirect_me', :status => 301,
54
- :location => "http://127.0.0.1:#{port}/redirect_me_again",
55
- :block => should_be_hit_once )
56
- mount(server, :path => '/redirect_me_again', :status => 301,
57
- :location => "http://127.0.0.1:#{port}/hello_world",
58
- :block => should_be_hit_once )
59
- mount(server, :path => '/hello_world', :status => 200,
60
- :body => 'Hello World!',
61
- :block => should_be_hit_once )
62
- end
63
- end
64
-
65
- after(:each) do
66
- server_shutdown
67
- end
68
-
69
- it "should get the content following all the redirects" do
70
- EM.run do
71
- r = RDaneel.new("http://127.0.0.1:#{port}/redirect_me")
72
- r.callback do
73
- r.http_client.response_header.status.should == 200
74
- r.http_client.response.should == "Hello World!"
75
- r.redirects.should == [ "http://127.0.0.1:#{port}/redirect_me",
76
- "http://127.0.0.1:#{port}/redirect_me_again"]
77
- r.uri.to_s.should == "http://127.0.0.1:#{port}/hello_world"
78
- EM.stop
79
- end
80
- r.errback do
81
- fail
82
- EM.stop
83
- end
84
- r.get(:redirects => 3)
85
- end
86
-
87
- end
88
-
89
- end
90
-
91
- describe "when there are as many redirects as the maximum" do
92
- before(:each) do
93
- server_setup(port) do |server|
94
- mount(server, :path => '/robots.txt', :status => 404,
95
- :block => should_be_hit_twice )
96
- mount(server, :path => '/redirect_me', :status => 301,
97
- :location => "http://127.0.0.1:#{port}/hello_world",
98
- :block => should_be_hit_once )
99
- mount(server, :path => '/hello_world', :status => 200,
100
- :body => 'Hello World!',
101
- :block => should_be_hit_once )
102
- end
103
- end
104
-
105
- after(:each) do
106
- server_shutdown
107
- end
108
-
109
- it "should get the content following all the redirects" do
110
- EM.run do
111
- r = RDaneel.new("http://127.0.0.1:#{port}/redirect_me")
112
- r.callback do
113
- r.http_client.response_header.status.should == 200
114
- r.http_client.response.should == "Hello World!"
115
- r.redirects.should == ["http://127.0.0.1:#{port}/redirect_me"]
116
- r.uri.to_s.should == "http://127.0.0.1:#{port}/hello_world"
117
- EM.stop
118
- end
119
- r.errback do
120
- fail
121
- EM.stop
122
- end
123
- r.get(:redirects => 1)
124
- end
125
-
126
- end
127
-
128
- end
129
-
130
- describe "when the number of redirects exceed the maximum specified" do
131
- before(:each) do
132
- server_setup(port) do |server|
133
- mount(server, :path => '/robots.txt', :status => 404,
134
- :block => should_be_hit_twice )
135
- mount(server, :path => '/redirect_me', :status => 301,
136
- :location => "http://127.0.0.1:#{port}/redirect_me_again",
137
- :block => should_be_hit_once )
138
- mount(server, :path => '/redirect_me_again', :status => 301,
139
- :location => "http://127.0.0.1:#{port}/hello_world",
140
- :block => should_be_hit_once )
141
- mount(server, :path => '/hello_world', :status => 200,
142
- :body => 'Hello World!',
143
- :block => should_not_be_hit )
144
- end
145
- end
146
-
147
- after(:each) do
148
- server_shutdown
149
- end
150
-
151
- it "should stop following redirects once the maximum specified is reached" do
152
- EM.run do
153
- r = RDaneel.new("http://127.0.0.1:#{port}/redirect_me")
154
- r.callback do
155
- fail
156
- EM.stop
157
- end
158
- r.errback do
159
- r.redirects.should == ["http://127.0.0.1:#{port}/redirect_me"]
160
- r.error.should == "Exceeded maximum number of redirects"
161
- EM.stop
162
- end
163
- r.get(:redirects => 1)
164
- end
165
-
166
- end
167
-
168
- end
169
-
170
- end
171
-
172
- end
173
-
174
- end
175
-
@@ -1,46 +0,0 @@
1
- require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
2
-
3
- describe "RDaneel when there is a cache" do
4
-
5
- let(:port) {8082}
6
-
7
- describe "when there is no robots.txt in the host" do
8
-
9
- before(:each) do
10
- RDaneel.robots_cache = {}
11
- server_setup(port) do |server|
12
- mount(server, :path => '/robots.txt', :status => 404,
13
- :block => should_be_hit_once )
14
- mount(server, :path => '/redirect_me', :status => 301,
15
- :location => "http://127.0.0.1:#{port}/hello_world",
16
- :block => should_be_hit_once )
17
- mount(server, :path => '/hello_world', :status => 200,
18
- :body => 'Hello World!',
19
- :block => should_be_hit_once )
20
- end
21
- end
22
-
23
- after(:each) do
24
- server_shutdown
25
- end
26
-
27
- it "should try to get the robots.txt just once" do
28
- EM.run do
29
- r = RDaneel.new("http://127.0.0.1:#{port}/redirect_me")
30
- r.callback do
31
- r.http_client.response_header.status.should == 200
32
- r.http_client.response.should == "Hello World!"
33
- r.redirects.should == [ "http://127.0.0.1:#{port}/redirect_me"]
34
- r.uri.to_s.should == "http://127.0.0.1:#{port}/hello_world"
35
- EM.stop
36
- end
37
- r.errback do
38
- fail
39
- EM.stop
40
- end
41
- r.get(:redirects => 3)
42
- end
43
- end
44
- end
45
- end
46
-