rdaneel 0.1.3 → 0.2.2

data/.gitignore CHANGED
@@ -15,7 +15,9 @@ tmtags
 
  ## PROJECT::GENERAL
  coverage
+ coverage.data
  rdoc
  pkg
 
  ## PROJECT::SPECIFIC
+
data/Rakefile CHANGED
@@ -10,9 +10,11 @@ begin
  gem.email = ["edgargonzalez@gmail.com", "anibalrojas@gmail.com"]
  gem.homepage = "http://github.com/hasmanydevelopers/RDaneel"
  gem.authors = ["Edgar Gonzalez", "Anibal Rojas"]
- gem.add_dependency("em-http-request", ">= 0.2.10")
+ gem.add_dependency("em-http-request", ">= 0.2.11")
  gem.add_dependency('robot_rules', '>= 0.9.3')
  gem.add_development_dependency "rspec", ">= 1.2.9"
+ gem.add_development_dependency "cucumber", ">= 0.8.5"
+ gem.add_development_dependency "relevance-rcov", ">= 0.9.2.1"
  # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
  Jeweler::GemcutterTasks.new
@@ -20,29 +22,29 @@ rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
  end
 
+ require 'cucumber/rake/task'
+ Cucumber::Rake::Task.new(:features) do |t|
+ t.cucumber_opts = "--format pretty" # Any valid command line option can go here.
+ t.rcov = true
+ t.rcov_opts = %w{--exclude gems\/,spec\/,features\/ --aggregate coverage.data}
+ end
+
  require 'spec/rake/spectask'
  Spec::Rake::SpecTask.new(:spec) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.spec_files = FileList['spec/**/*_spec.rb']
- end
-
- Spec::Rake::SpecTask.new(:rcov) do |spec|
- spec.libs << 'lib' << 'spec'
- spec.pattern = 'spec/**/*_spec.rb'
  spec.rcov = true
+ spec.rcov_opts = %w{--exclude gems\/,spec\/,features\/ --aggregate coverage.data}
  end
 
- task :spec => :check_dependencies
-
- task :default => :spec
-
- require 'rake/rdoctask'
- Rake::RDocTask.new do |rdoc|
- version = File.exist?('VERSION') ? File.read('VERSION') : ""
-
- rdoc.rdoc_dir = 'rdoc'
- rdoc.title = "rdaneel #{version}"
- rdoc.rdoc_files.include('README*')
- rdoc.rdoc_files.include('lib/**/*.rb')
+ desc "Run both specs and features and generate aggregated coverage"
+ task :all_tests do |t|
+ rm "coverage.data" if File.exist?("coverage.data")
+ Rake::Task['spec'].invoke
+ Rake::Task["features"].invoke
  end
 
+ task :features => :check_dependencies
+ task :spec => :check_dependencies
+ task :default => :all_tests
+
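Note on the coverage setup above: both the :spec task and the new :features task hand rcov the same --aggregate coverage.data option, so coverage accumulates across the two runs; the :all_tests task (now the default) deletes any stale coverage.data before invoking both. A minimal usage sketch, assuming the development dependencies declared above are installed:

  rake all_tests   # runs specs, then features; rcov merges both runs into coverage.data
                   # and writes its combined HTML report under coverage/ (both gitignored above)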
data/VERSION CHANGED
@@ -1 +1 @@
- 0.1.3
+ 0.2.2
data/features/get_with_cache.feature ADDED
@@ -0,0 +1,67 @@
+ Feature: get a url using cache
+ In order to fetch content from internet
+ As a crawler
+ I want to get a url respecting robots.txt rules
+
+ Scenario: the url to fetch is redirected
+ Given a cache for RDaneel
+ And a robots.txt that allows RDaneel
+ And a HelloWorld url
+ And a "/redirect_me" url that redirects 301 to "http://127.0.0.1:3210/redirect_me_again" url
+ And a "/redirect_me_again" url that redirects 302 to "/hello_world" url
+ When I get the "/redirect_me" url following a maximum of 3 redirects
+ Then I should get the content for HelloWorld url
+ And the http response code should be 200
+ And I should get 2 redirects
+ And The redirects sequence should be:
+ | http://127.0.0.1:3210/redirect_me |
+ | http://127.0.0.1:3210/redirect_me_again |
+ And The requests sequence should be:
+ | status | path |
+ | 200 | /robots.txt |
+ | 301 | /redirect_me |
+ | 302 | /redirect_me_again |
+ | 200 | /hello_world |
+ And The cache for "http://127.0.0.1:3210/robots.txt" should be
+ """
+ User-agent: *
+ Disallow: /cgi-bin/
+ """
+
+ Scenario: a cached robots.txt exists denying RDaneel's user-agent
+ Given a cache for RDaneel
+ And The cache for "http://127.0.0.1:3210/robots.txt" is:
+ """
+ User-agent: *
+ Disallow: /
+ """
+ And a robots.txt that denies RDaneel
+ And a HelloWorld url
+ When I get the "/hello_world" url following a maximum of 1 redirects
+ Then I should get a "Robots are not allowed" error
+ And I should get 0 redirects
+ And The requests should be empty
+
+
+ Scenario: the url to fetch is redirected to unreacheable server but a robots cache exists for this server allowing RDaneel
+ Given a cache for RDaneel
+ And The cache for "http://127.0.0.1:3210/robots.txt" is:
+ """
+ User-agent: *
+ Disallow: /cgi-bin/
+ """
+ And The cache for "http://127.0.0.1:3211/robots.txt" is:
+ """
+ User-agent: *
+ Disallow: /cgi-bin/
+ """
+ And a "/redirect_me" url that redirects 301 to "http://127.0.0.1:3211/unreacheable" url
+ When I get the "/redirect_me" url following a maximum of 3 redirects
+ Then I should get a "An error occurred when fetching http://127.0.0.1:3211/unreacheable" error
+ And I should get 1 redirects
+ And The redirects sequence should be:
+ | http://127.0.0.1:3210/redirect_me |
+ And The requests sequence should be:
+ | status | path |
+ | 301 | /redirect_me |
+
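These scenarios exercise the robots.txt cache: RDaneel.robots_cache is assigned from outside the library and, as the step definitions below show, a plain Hash keyed by the robots.txt URL is enough. A minimal sketch of pre-seeding a cache before crawling; the example URL and rule body are illustrative only:

  RDaneel.robots_cache = {}  # the steps use a plain Hash; the library only reads/writes by URL string key
  RDaneel.robots_cache["http://example.com/robots.txt"] = "User-agent: *\nDisallow: /cgi-bin/"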
data/features/get_without_cache.feature ADDED
@@ -0,0 +1,155 @@
+ Feature: get a url without using cache
+ In order to fetch content from internet
+ As a crawler
+ I want to get a url respecting robots.txt rules
+
+ Scenario: a robots.txt exists allowing RDaneel's user-agent
+ Given a robots.txt that allows RDaneel
+ And a HelloWorld url
+ When I get the "/hello_world" url following a maximum of 1 redirects
+ Then I should get the content for HelloWorld url
+ And the http response code should be 200
+ And I should get 0 redirects
+ And The requests sequence should be:
+ | status | path |
+ | 200 | /robots.txt |
+ | 200 | /hello_world |
+
+ Scenario: a robots.txt exists denying RDaneel's user-agent
+ Given a robots.txt that denies RDaneel
+ And a HelloWorld url
+ When I get the "/hello_world" url following a maximum of 1 redirects
+ Then I should get a "Robots are not allowed" error
+ And I should get 0 redirects
+ And The requests sequence should be:
+ | status | path |
+ | 200 | /robots.txt |
+
+ Scenario: the url to fetch is redirected
+ Given a robots.txt that allows RDaneel
+ And a HelloWorld url
+ And a "/redirect_me" url that redirects 301 to "http://127.0.0.1:3210/redirect_me_again" url
+ And a "/redirect_me_again" url that redirects 302 to "/hello_world" url
+ When I get the "/redirect_me" url following a maximum of 3 redirects
+ Then I should get the content for HelloWorld url
+ And the http response code should be 200
+ And I should get 2 redirects
+ And The redirects sequence should be:
+ | http://127.0.0.1:3210/redirect_me |
+ | http://127.0.0.1:3210/redirect_me_again |
+ And The requests sequence should be:
+ | status | path |
+ | 200 | /robots.txt |
+ | 301 | /redirect_me |
+ | 200 | /robots.txt |
+ | 302 | /redirect_me_again |
+ | 200 | /robots.txt |
+ | 200 | /hello_world |
+
+ Scenario: the url to fetch exceeds the maximum redirects specifieds
+ Given a robots.txt that allows RDaneel
+ And a HelloWorld url
+ And a "/redirect_me" url that redirects 301 to "http://127.0.0.1:3210/redirect_me_again" url
+ And a "/redirect_me_again" url that redirects 302 to "/hello_world" url
+ When I get the "/redirect_me" url following a maximum of 1 redirects
+ Then I should get a "Exceeded maximum number of redirects: 1" error
+ And I should get 1 redirects
+ And The redirects sequence should be:
+ | http://127.0.0.1:3210/redirect_me |
+ And The requests sequence should be:
+ | status | path |
+ | 200 | /robots.txt |
+ | 301 | /redirect_me |
+ | 200 | /robots.txt |
+ | 302 | /redirect_me_again |
+
+ Scenario: the url to fetch has an infinte redirect
+ Given a robots.txt that allows RDaneel
+ And a HelloWorld url
+ And a "/redirect_me" url that redirects 302 to "/redirect_me_again" url
+ And a "/redirect_me_again" url that redirects 302 to "/redirect_me" url
+ When I get the "/redirect_me" url following a maximum of 2 redirects
+ Then I should get a "Infinite redirect detected for: http://127.0.0.1:3210/redirect_me" error
+ And I should get 2 redirects
+ And The redirects sequence should be:
+ | http://127.0.0.1:3210/redirect_me |
+ | http://127.0.0.1:3210/redirect_me_again |
+ And The requests sequence should be:
+ | status | path |
+ | 200 | /robots.txt |
+ | 302 | /redirect_me |
+ | 200 | /robots.txt |
+ | 302 | /redirect_me_again |
+
+ Scenario: the url to fetch redirects to not found url
+ Given a robots.txt that allows RDaneel
+ And a "/redirect_me" url that redirects 302 to "/not_found" url
+ When I get the "/redirect_me" url following a maximum of 2 redirects
+ Then I should get a "Not success neither redirect" error
+ And I should get 1 redirects
+ And The redirects sequence should be:
+ | http://127.0.0.1:3210/redirect_me |
+ And The requests sequence should be:
+ | status | path |
+ | 200 | /robots.txt |
+ | 302 | /redirect_me |
+ | 200 | /robots.txt |
+ | 404 | /not_found |
+
+
+ Scenario: robots.txt doesn't exists
+ Given a HelloWorld url
+ And a "/redirect_me" url that redirects 301 to "http://127.0.0.1:3210/redirect_me_again" url
+ And a "/redirect_me_again" url that redirects 302 to "/hello_world" url
+ When I get the "/redirect_me" url following a maximum of 3 redirects
+ Then I should get the content for HelloWorld url
+ And the http response code should be 200
+ And I should get 2 redirects
+ And The redirects sequence should be:
+ | http://127.0.0.1:3210/redirect_me |
+ | http://127.0.0.1:3210/redirect_me_again |
+ And The requests sequence should be:
+ | status | path |
+ | 404 | /robots.txt |
+ | 301 | /redirect_me |
+ | 404 | /robots.txt |
+ | 302 | /redirect_me_again |
+ | 404 | /robots.txt |
+ | 200 | /hello_world |
+
+ Scenario: the url to fetch redirects to a malformed url (format handled by em-http-request)
+ Given a robots.txt that allows RDaneel
+ And a "/redirect_me" url that redirects 302 to "http://malformed:url" url
+ When I get the "/redirect_me" url following a maximum of 2 redirects
+ Then I should get a "Location header format error" error
+ And I should get 0 redirects
+ And The requests sequence should be:
+ | status | path |
+ | 200 | /robots.txt |
+ | 302 | /redirect_me |
+
+ Scenario: the url to fetch redirects to a malformed url (format not handled by em-http-request 0.2.10)
+ Given a robots.txt that allows RDaneel
+ And a "/redirect_me" url that redirects 302 to "http:/malformed:url" url
+ When I get the "/redirect_me" url following a maximum of 2 redirects
+ Then I should get a "Location header format error" error
+ And I should get 0 redirects
+ And The requests sequence should be:
+ | status | path |
+ | 200 | /robots.txt |
+ | 302 | /redirect_me |
+
+ Scenario: the url to fetch is redirected to unreacheable host:port
+ Given a robots.txt that allows RDaneel
+ And a HelloWorld url
+ And a "/redirect_me" url that redirects 301 to "http://127.0.0.1:3211/unreacheable" url
+ When I get the "/redirect_me" url following a maximum of 3 redirects
+ Then I should get a "An error occurred when fetching http://127.0.0.1:3211/unreacheable" error
+ And I should get 1 redirects
+ And The redirects sequence should be:
+ | http://127.0.0.1:3210/redirect_me |
+ And The requests sequence should be:
+ | status | path |
+ | 200 | /robots.txt |
+ | 301 | /redirect_me |
+
data/features/step_definitions/rdaneel_steps.rb ADDED
@@ -0,0 +1,79 @@
+
+ Given /^a robots\.txt that allows RDaneel$/ do
+ $server.mount(:path => '/robots.txt', :status => 200,
+ :body => "User-agent: *\nDisallow: /cgi-bin/")
+ end
+
+ Given /^a robots\.txt that denies RDaneel$/ do
+ $server.mount(:path => '/robots.txt', :status => 200,
+ :body => "User-agent: *\nDisallow: /")
+ end
+
+ Given /^a HelloWorld url$/ do
+ $server.mount(:path => '/hello_world', :status => 200,
+ :body => "Hello World")
+ end
+
+ Given /^a "([^"]*)" url that redirects (\d+) to "([^"]*)" url$/ do |url, status, redirected_to|
+ $server.mount(:path => url, :status => status.to_i,
+ :location => redirected_to)
+ end
+
+ Given /^a cache for RDaneel$/ do
+ RDaneel.robots_cache = {}
+ end
+
+ Given /^The cache for "([^"]*)" is:$/ do |robots_url, robots_file|
+ RDaneel.robots_cache[robots_url] = robots_file
+ end
+
+ When /^I get the "([^"]*)" url following a maximum of (\d+) redirects$/ do |url, max_redirects|
+ EM.run do
+ @r = RDaneel.new("#{HOST}#{url}")
+ @r.callback do
+ EM.stop
+ end
+ @r.errback do
+ EM.stop
+ end
+ @r.get(:redirects => max_redirects)
+ end
+ end
+
+ Then /^I should get the content for HelloWorld url$/ do
+ @r.http_client.response.should == "Hello World"
+ end
+
+ Then /^the http response code should be (\d+)$/ do |code|
+ @r.http_client.response_header.status.should == code.to_i
+ end
+
+ Then /^I should get (\d+) redirects$/ do |redirects_count|
+ @r.redirects.size.should == redirects_count.to_i
+ end
+
+ Then /^The requests sequence should be:$/ do |expected_table|
+ expected_requests = []
+ expected_table.hashes.each do |hash|
+ expected_requests << {:status => hash[:status].to_i,
+ :path => hash[:path]}
+ end
+ $server.requests.should == expected_requests
+ end
+
+ Then /^The requests should be empty$/ do
+ $server.requests.should be_empty
+ end
+
+ Then /^The redirects sequence should be:$/ do |expected_redirects|
+ @r.redirects.should == expected_redirects.raw.flatten
+ end
+
+ Then /^I should get a "([^"]*)" error$/ do |error_message|
+ @r.error.should == error_message
+ end
+
+ Then /^The cache for "([^"]*)" should be$/ do |robots_url, robots_file|
+ RDaneel.robots_cache[robots_url].should == robots_file
+ end
+
data/features/support/burrito.rb ADDED
@@ -0,0 +1,69 @@
+ require 'socket'
+
+ class Burrito
+
+ STATUS_MESSAGES = {
+ 200 => 'OK',
+ 301 => 'Moved Permanently',
+ 302 => 'Found',
+ 404 => 'Not Found'
+ }
+
+ attr_reader :requests
+
+ def initialize
+ @routes = {}
+ @requests = []
+ end
+
+ def mount(opts)
+ @routes[opts[:path]] = { :status => opts[:status],
+ :body => opts[:body],
+ :location => opts[:location] }
+ end
+
+ def reset
+ @routes = {}
+ @requests = []
+ end
+
+ def start
+ @thread = Thread.new do
+
+ webserver = TCPServer.new('127.0.0.1', 3210)
+
+ while session = webserver.accept
+ request = session.gets
+ path = '/' << request.gsub(/GET\ \//, '').gsub(/\ HTTP.*/, '').chomp
+ if @routes[path]
+ status = @routes[path][:status]
+ body = @routes[path][:body]
+ location = @routes[path][:location]
+ else
+ status = 404
+ body = nil
+ location = nil
+ end
+ @requests.push( { :status => status, :path => path } )
+ response = "HTTP/1.1 #{status} #{STATUS_MESSAGES[status]}\r\n"
+ response << "Server: burrito/0.0.1\r\n"
+ response << "Content-Length: #{ body ? body.length : 0 }\r\n"
+ response << "Content-Type: text/plain\r\n" if body
+ response << "Location: #{location}\r\n" if location
+ response << "Connection: close\r\n"
+ response << "\r\n"
+ response << "#{body}" if body
+ session.print response
+ session.close
+ end
+
+ end
+
+ end
+
+ def shutdown
+ @thread.terminate
+ end
+
+ end
+
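Burrito is a deliberately tiny single-threaded HTTP stub bound to 127.0.0.1:3210: it records every request it answers in #requests, which is what the "requests sequence" steps assert against, and anything not mounted gets a 404. A hypothetical standalone use (the mounted path and body here are made up for illustration):

  server = Burrito.new
  server.mount(:path => '/robots.txt', :status => 200, :body => "User-agent: *\nDisallow:")
  server.start                 # serves from a background thread
  # ... issue requests against http://127.0.0.1:3210 ...
  server.requests              # => [{:status => 200, :path => '/robots.txt'}, ...]
  server.shutdown              # terminates the server thread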
data/features/support/env.rb ADDED
@@ -0,0 +1,22 @@
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '../..', 'lib'))
+ require 'rubygems'
+ require 'rdaneel'
+ require 'burrito'
+
+ unless $server
+ $server = Burrito.new
+ $server.start
+ end
+
+ HOST = "http://127.0.0.1:3210"
+
+ Before do
+ $server.reset
+ RDaneel.robots_cache = nil
+ end
+
+ at_exit do
+ $server.shutdown
+ end
+
data/lib/rdaneel.rb CHANGED
@@ -19,10 +19,12 @@ class RDaneel
  attr_accessor :uri
  attr_reader :error, :redirects, :http_client
 
- def initialize(uri)
+ def initialize(uri,options = {})
  @uri = uri.kind_of?(Addressable::URI) ? uri : Addressable::URI::parse(uri)
  @uri.path = "/" if @uri.path.nil? || @uri.path == ""
  @redirects = []
+ @verbose = options[:verbose]
+ @hash = @uri.hash if @verbose
  end
 
  def get(opts = {})
@@ -37,91 +39,118 @@ class RDaneel
  if success?(h)
  @uri = current_uri if current_uri != @uri
  @http_client = h
+ verbose("Succeded fetching: #{current_uri}", h, :status, :response)
  succeed(self)
  elsif redirected?(h)
  if @redirects.size >= max_redirects
  @http_client = h
- @error = "Exceeded maximum number of redirects"
+ @error = "Exceeded maximum number of redirects: #{max_redirects}"
+ verbose(@error, h, :status, :response)
  fail(self)
  return
  end
+ @redirects << current_uri.to_s
+ current_uri = redirect_url(h, current_uri)
  begin
- @redirects << current_uri.to_s
- current_uri = redirect_url(h, current_uri)
+ verbose("Redirected to: #{current_uri.to_s} from: #{@redirects[-1]}", h, :status, :response)
  if @redirects.include?(current_uri.to_s)
  @http_client = h
- @error = "infinite redirect"
+ @error = "Infinite redirect detected for: #{current_uri.to_s}"
+ verbose(@error, h, :status, :response)
  fail(self)
  return
  end
  _get.call
- rescue
+ rescue StandardError => se
  @http_client = h
- @error = "mal formed redirected url"
+ @error = "Error trying to follow a redirect #{current_uri.to_s}: #{h.response_header.location}"
+ verbose(@error, h, :status, :response)
  fail(self)
  end
  else
  # other error
  @http_client = h
- @error = "not success and not redirect"
+ @error = "Not success neither redirect"
+ verbose(@error, h, :status, :response)
  fail(self)
  end
  }
  _get = lambda {
  robots_url = robots_txt_url(current_uri)
  if robots_cache && robots_file = robots_cache[robots_url.to_s]
+ verbose("Found cached robots.txt:\n#{robots_cache[robots_url.to_s]} for: #{current_uri}")
  if robots_allowed?(robots_file, useragent, robots_url, current_uri)
+ verbose("Robots identified by user agent: #{useragent} are allowed to access: #{current_uri}")
  begin
  h = EM::HttpRequest.new(current_uri).get(options)
+ verbose("Started fetching: #{current_uri}",h,:request)
  h.callback(&_handle_uri_callback)
  h.errback {
  @http_client = h
- @error = h.error
+ @error = error_message(h)
+ verbose("#{@error} for: #{current_uri}",h,:status,:response)
  fail(self)
  }
  rescue StandardError => se
  @http_client = EM::HttpClient.new("")
  @error = "#{se.message}\n#{se.backtrace.inspect}"
+ verbose("For: #{current_uri} something went wrong: #{@error}")
  fail(self)
  end
  else
  @http_client = EM::HttpClient.new("")
- @error = "robots denied"
+ @error = "Robots are not allowed"
+ verbose("#{@error} to access: #{current_uri} when identified by user agent: #{useragent}")
  fail(self)
  end
  else
  robots_url = robots_txt_url(current_uri)
  robots = EM::HttpRequest.new(robots_url).get(:redirects => 2) # get the robots.txt following redirects
+ verbose("Started fetching robots.txt from: #{robots_url} for: #{current_uri}",robots,:request)
  robots.callback {
- robots_file = robots.response
- robots_cache[robots_url.to_s] = robots_file if robots_cache
+ if success?(robots)
+ robots_file = robots.response
+ verbose("Found robots.txt at #{robots_url}:\n#{robots_file}", robots, :status, :response)
+ else
+ robots_file = ''
+ verbose("Didn't find robots.txt at #{robots_url}", robots, :status, :response)
+ end
+ robots_cache[robots_txt_url(robots_url).to_s] = robots_file if robots_cache
  if robots_allowed?(robots_file, useragent, robots_url, current_uri)
+ verbose("Robots identified by user agent: #{useragent} are allowed to access: #{current_uri}")
  begin
  h = EM::HttpRequest.new(current_uri).get(options)
+ verbose("Started fetching: #{current_uri}",h,:request)
  h.callback(&_handle_uri_callback)
  h.errback {
  @http_client = h
- @error = h.error
+ @error = error_message(h)
+ verbose("#{@error} for: #{current_uri}", h, :status, :response)
  fail(self)
  }
  rescue StandardError => se
  @http_client = EM::HttpClient.new("")
  @error = "#{se.message}\n#{se.backtrace.inspect}"
+ verbose("For: #{current_uri} something went wrong: #{@error}")
  fail(self)
  end
  else
  @http_client = EM::HttpClient.new("")
- @error = "robots denied"
+ @error = "Robots are not allowed"
+ verbose("#{@error} to access: #{current_uri} when identified by user agent: #{useragent}")
  fail(self)
  end
  }
  robots.errback {
+ verbose("Failed to fetch robots.txt: from: #{robots_url} for: #{current_uri}", robots, :status, :response)
  robots_cache[robots_url.to_s] = "" if robots_cache
  h = EM::HttpRequest.new(current_uri).get(options)
+ verbose("Started fetching: #{current_uri}",h,:request)
  h.callback(&_handle_uri_callback)
  h.errback {
  @http_client = h
- @error = h.error
+ @error = error_message(h)
+ verbose("#{@error} for: #{current_uri}", h, :status, :response)
  fail(self)
  }
  }
@@ -155,6 +184,14 @@ class RDaneel
  Addressable::URI.parse("http://#{location}/robots.txt")
  end
 
+ def error_message(http_client)
+ @error = if http_client.error.nil? || http_client.error.empty?
+ "An error occurred when fetching #{http_client.uri.to_s}"
+ else
+ http_client.error
+ end
+ end
+
  def success?(http_client)
  http_client.response_header.status == 200
  end
@@ -164,10 +201,37 @@ class RDaneel
  end
 
  def redirect_url(http_client, u)
- location = Addressable::URI.parse(http_client.response_header.location)
- location = u.join(location) if location.relative?
- location.path = "/" if location.path.nil? || location.path == ""
- location
+ # em-http-request handles the case when redirect is relative
+ # at this point http_client.response_header.location should always have an absolute and valid url
+ # but this invalid url is parsed successfully http:/malformed:url so we ask for host
+ Addressable::URI.parse(http_client.response_header.location)
+ end
+
+ def verbose(message, client = nil, *args)
+ return unless @verbose
+ message.each { |l| hashed_puts('*', l) }
+ args.each do |a|
+ case a
+ when :status
+ if client.response_header.status == 0
+ hashed_puts('< Status:', '0 (timeout)')
+ else
+ hashed_puts('< Status:', client.response_header.status)
+ end
+ when :request # this is a options hash
+ headers = client.options[:head]
+ headers.each { |k,v| hashed_puts('>', "#{k}: #{v}") } if headers
+ when :response # this is an array
+ client.response_header.each { |r| hashed_puts('<', "#{r[0]}: #{r[1]}") }
+ end
+ end
  end
+
+ private
+
+ def hashed_puts( prefix, message )
+ $stdout.puts("[#{@hash}] [#{Time.now.strftime('%Y-%m-%d %H:%m:%S')}] #{prefix} #{message}")
+ end
+
  end
 
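The headline change in lib/rdaneel.rb is the optional verbose mode: RDaneel.new now takes an options hash, and :verbose => true logs every fetch, redirect, and robots.txt decision to stdout, each line prefixed with the URI hash and a timestamp (note the strftime format uses %m, the month, where minutes, %M, appear intended). A minimal sketch of driving it inside an EventMachine reactor; the target URL is illustrative only:

  require 'rubygems'
  require 'rdaneel'

  EM.run do
    r = RDaneel.new("http://example.com/", :verbose => true)  # options hash is new in 0.2.x
    r.callback { puts r.http_client.response; EM.stop }       # fires on a 200 within the redirect budget
    r.errback  { puts r.error; EM.stop }                      # robots denial, redirect loop, network error, ...
    r.get(:redirects => 3)                                    # robots.txt is checked before every hop
  end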
data/spec/rdaneel_spec.rb ADDED
@@ -0,0 +1,47 @@
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
+
+ describe "RDaneel" do
+
+ describe "robots_txt_url" do
+ before(:each) do
+ @rdaneel = RDaneel.new("http://127.0.0.1/anyurl")
+ end
+
+ it "should return the proper url when url don't has a port specified (80 implied)" do
+ url = Addressable::URI.parse("http://127.0.0.1/path/url?param1=value1&param2=value2")
+ @rdaneel.send(:robots_txt_url,url).to_s.should == "http://127.0.0.1/robots.txt"
+ end
+
+ it "should return the proper url when url has a port 80 specified" do
+ url = Addressable::URI.parse("http://127.0.0.1:80/path/url?param1=value1&param2=value2")
+ @rdaneel.send(:robots_txt_url,url).to_s.should == "http://127.0.0.1/robots.txt"
+ end
+
+ it "should return the proper url when url has a port different than 80" do
+ url = Addressable::URI.parse("http://127.0.0.1:8080/path/url?param1=value1&param2=value2")
+ @rdaneel.send(:robots_txt_url,url).to_s.should == "http://127.0.0.1:8080/robots.txt"
+ end
+
+ end
+
+
+ describe "robots_allowed?" do
+ before(:each) do
+ @rdaneel = RDaneel.new("http://127.0.0.1/anyurl")
+ end
+
+ describe "when an error happens parsing the robots rules" do
+ before(:each) do
+ @robot_rules = RobotRules.new("RDaneel")
+ @robot_rules.stub!(:parse).and_raise(StandardError)
+ RobotRules.stub!(:new).and_return(@robot_rules)
+ end
+
+ it "should return true" do #no matter the params
+ @rdaneel.send(:robots_allowed?, nil, nil, nil, nil).should be_true
+ end
+ end
+ end
+
+ end
+
data/spec/spec_helper.rb CHANGED
@@ -3,91 +3,4 @@ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
  require 'rubygems'
  require 'rdaneel'
  require 'spec'
- require 'webrick'
-
- # keep webrick quiet
- class ::WEBrick::HTTPServer
- def access_log(config, req, res)
- # nop
- end
- end
- class ::WEBrick::BasicLog
- def log(level, data)
- # nop
- end
- end
-
- def locked_file
- File.join(File.dirname(__FILE__),"server_lock-#{@__port}")
- end
-
- def server_setup(port=8080, &blk)
- @__port = port
- if @server.nil? and !File.exist?(locked_file)
- File.open(locked_file,'w') {|f| f << 'locked' }
- @server = WEBrick::HTTPServer.new :Port => port
- blk.call(@server) if blk
- queue = Queue.new # synchronize the thread startup to the main thread
- @test_thread = Thread.new { queue << 1; @server.start }
-
- # wait for the queue
- value = queue.pop
-
- if !value
- STDERR.puts "Failed to startup test server!"
- exit(1)
- end
-
- trap("INT"){server_shutdown}
- at_exit{server_shutdown}
- end
- end
-
- def server_shutdown
- begin
- if File.exist?(locked_file)
- File.unlink locked_file
- @server.shutdown unless @server.nil?
- @server = nil
- end
- rescue Object => e
- puts "Error #{__FILE__}:#{__LINE__}\n#{e.message}"
- end
- end
-
- def mount(server, opts)
- raise ":path is required" if opts[:path].nil?
- raise ":status is required" if opts[:status].nil?
- server.mount_proc( opts[:path],
- lambda { |req, resp|
- resp.status = opts[:status]
- resp.body = opts[:body] unless opts[:body].nil?
- resp['Location'] = opts[:location] unless opts[:location].nil?
- opts[:block].call unless opts[:block].nil?
- } )
- end
-
- def should_not_be_hit
- should_be_hit( 0 )
- end
-
- def should_be_hit_once
- should_be_hit( 1 )
- end
-
- def should_be_hit_twice
- should_be_hit( 2 )
- end
-
- def should_be_hit( times = 1 )
- l = lambda {}
- m = l.should_receive(:call).exactly(times).times
- return l
- end
-
- Spec::Runner.configure do |config|
- config.before :suite do
- puts "\e[4mThese specs could take a while, please be patience\e[0m"
- end
- end
 
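The WEBrick harness deleted above (server_setup, mount, the should_be_hit_* expectation helpers, and the log-silencing monkey patches) is superseded by the Burrito server and the Cucumber features added earlier in this release; spec_helper.rb now only sets up the load path and requires rubygems, rdaneel, and spec.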
data/spec/streamed_content_spec.rb CHANGED
@@ -9,7 +9,7 @@ describe "RDaneel when the content is chunked (digg.com)" do
 
  it "should get the content" do
  EM.run do
- r = RDaneel.new("http://digg.com")
+ r = RDaneel.new("http://digg.com/news")
  r.callback do
  r.http_client.response_header.status.should == 200
  r.http_client.response.should_not be_empty
metadata CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
  prerelease: false
  segments:
  - 0
- - 1
- - 3
- version: 0.1.3
+ - 2
+ - 2
+ version: 0.2.2
  platform: ruby
  authors:
  - Edgar Gonzalez
@@ -15,7 +15,7 @@ autorequire:
  bindir: bin
  cert_chain: []
 
- date: 2010-08-13 00:00:00 -04:30
+ date: 2010-08-27 00:00:00 -04:30
  default_executable:
  dependencies:
  - !ruby/object:Gem::Dependency
@@ -28,8 +28,8 @@ dependencies:
  segments:
  - 0
  - 2
- - 10
- version: 0.2.10
+ - 11
+ version: 0.2.11
  type: :runtime
  version_requirements: *id001
  - !ruby/object:Gem::Dependency
@@ -60,6 +60,35 @@ dependencies:
  version: 1.2.9
  type: :development
  version_requirements: *id003
+ - !ruby/object:Gem::Dependency
+ name: cucumber
+ prerelease: false
+ requirement: &id004 !ruby/object:Gem::Requirement
+ requirements:
+ - - ">="
+ - !ruby/object:Gem::Version
+ segments:
+ - 0
+ - 8
+ - 5
+ version: 0.8.5
+ type: :development
+ version_requirements: *id004
+ - !ruby/object:Gem::Dependency
+ name: relevance-rcov
+ prerelease: false
+ requirement: &id005 !ruby/object:Gem::Requirement
+ requirements:
+ - - ">="
+ - !ruby/object:Gem::Version
+ segments:
+ - 0
+ - 9
+ - 2
+ - 1
+ version: 0.9.2.1
+ type: :development
+ version_requirements: *id005
  description: Add robots.txt support on top of em-http-request
  email:
  - edgargonzalez@gmail.com
@@ -78,13 +107,16 @@ files:
  - README.rdoc
  - Rakefile
  - VERSION
+ - features/get_with_cache.feature
+ - features/get_without_cache.feature
+ - features/step_definitions/rdaneel_steps.rb
+ - features/support/burrito.rb
+ - features/support/env.rb
  - lib/rdaneel.rb
- - spec/no_redirects_neither_robots_spec.rb
- - spec/redirects_without_robots_spec.rb
+ - spec/rdaneel_spec.rb
  - spec/spec.opts
  - spec/spec_helper.rb
  - spec/streamed_content_spec.rb
- - spec/using_cache_spec.rb
  has_rdoc: true
  homepage: http://github.com/hasmanydevelopers/RDaneel
  licenses: []
@@ -116,8 +148,6 @@ signing_key:
  specification_version: 3
  summary: Obey robots.txt on top of em-http-request (Asynchronous HTTP Client)
  test_files:
- - spec/using_cache_spec.rb
- - spec/no_redirects_neither_robots_spec.rb
  - spec/spec_helper.rb
- - spec/redirects_without_robots_spec.rb
  - spec/streamed_content_spec.rb
+ - spec/rdaneel_spec.rb
data/spec/no_redirects_neither_robots_spec.rb DELETED
@@ -1,130 +0,0 @@
- require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
-
- describe "RDaneel when there are no redirects" do
-
- let(:port) {8083}
-
- describe "when a successfull status different than 200 is issued for robots.txt" do
- it "should get the content ignoring the redirect"
- end
-
- describe "when a redirect other than 301 and 302 is issued for robots.txt" do
- it "should get the content ignoring the redirect"
- end
-
- (301..302).each do |status|
-
- describe "when robots.txt has been moved (http code #{status})" do
- before(:each) do
- server_setup(port+status) do |server|
- mount(server, :path => '/hello_world', :status => 200,
- :body => 'Hello World!', :block => should_be_hit_once )
- mount(server, :path => '/robots.txt', :status => status,
- :location => "http://127.0.0.1:#{port+status}/golems.txt",
- :block => should_be_hit_once )
- mount(server, :path => '/golems.txt', :status => 200,
- :block => should_be_hit_once )
- end
- end
-
- after(:each) do
- server_shutdown
- end
-
- it "should get the redirected robots.txt and the content" do
- EM.run do
- r = RDaneel.new("http://127.0.0.1:#{port+status}/hello_world")
- r.callback do
- r.http_client.response_header.status.should == 200
- r.http_client.response.should == "Hello World!"
- r.redirects.should be_empty
- EM.stop
- end
- r.errback do
- fail
- EM.stop
- end
- r.get
- end
- end
-
- end
-
- end
-
- (400..417).each do |status|
-
- describe "when there is a CLIENT error #{status} associated to robots.txt" do
- before(:each) do
- server_setup(port+status) do |server|
- mount(server, :path => '/hello_world', :status => 200,
- :body => 'Hello World!', :block => should_be_hit_once )
- mount(server, :path => '/robots.txt', :status => status,
- :block => should_be_hit_once )
- end
- end
-
- after(:each) do
- server_shutdown
- end
-
- it "should get the content" do
- EM.run do
- r = RDaneel.new("http://127.0.0.1:#{port+status}/hello_world")
- r.callback do
- r.http_client.response_header.status.should == 200
- r.http_client.response.should == "Hello World!"
- r.redirects.should be_empty
- EM.stop
- end
- r.errback do
- fail
- EM.stop
- end
- r.get
- end
- end
-
- end
-
- end
-
- (500..505).each do |status|
-
- describe "when there is a SERVER error #{status} associated to robots.txt" do
- before(:each) do
- server_setup(port+status) do |server|
- mount(server, :path => '/hello_world', :status => 200,
- :body => 'Hello World!', :block => should_be_hit_once )
- mount(server, :path => '/robots.txt', :status => status,
- :block => should_be_hit_once )
- end
- end
-
- after (:each) do
- server_shutdown
- end
-
- it "should get the content" do
- EM.run do
- r = RDaneel.new("http://127.0.0.1:#{port+status}/hello_world")
- r.callback do
- r.http_client.response_header.status.should == 200
- r.http_client.response.should == "Hello World!"
- r.redirects.should be_empty
- EM.stop
- end
- r.errback do
- fail
- EM.stop
- end
- r.get
- end
- end
-
- end
-
- end
-
- end
-
data/spec/redirects_without_robots_spec.rb DELETED
@@ -1,175 +0,0 @@
- require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
-
- describe "RDaneel when there are redirects" do
-
- let(:port) {8081}
-
- describe "when there is no robots.txt in the host (ONLY one host)" do
-
- describe "when no redirection limit has been set" do
- before(:each) do
- server_setup(port) do |server|
- mount(server, :path => '/robots.txt', :status => 404,
- :block => should_be_hit_once )
- mount(server, :path => '/redirect_me', :status => 301,
- :location => "http://127.0.0.1:#{port}/hello_world",
- :block => should_be_hit_once )
- mount(server, :path => '/hello_world', :status => 200,
- :body => 'Hello World!',
- :block => should_not_be_hit )
- end
- end
-
- after(:each) do
- server_shutdown
- end
-
- it "should not follow redirects" do
- EM.run do
- r = RDaneel.new("http://127.0.0.1:#{port}/redirect_me")
- r.callback do
- fail
- EM.stop
- end
- r.errback do
- r.redirects.should be_empty
- r.error.should == "Exceeded maximum number of redirects"
- EM.stop
- end
- r.get
- end
-
- end
-
- end
-
- describe "when a maximum number or redirects is set" do
-
- describe "when there are less redirects than the maximum specified" do
- before(:each) do
- server_setup(port) do |server|
- mount(server, :path => '/robots.txt', :status => 404,
- :block => should_be_hit(3) )
- mount(server, :path => '/redirect_me', :status => 301,
- :location => "http://127.0.0.1:#{port}/redirect_me_again",
- :block => should_be_hit_once )
- mount(server, :path => '/redirect_me_again', :status => 301,
- :location => "http://127.0.0.1:#{port}/hello_world",
- :block => should_be_hit_once )
- mount(server, :path => '/hello_world', :status => 200,
- :body => 'Hello World!',
- :block => should_be_hit_once )
- end
- end
-
- after(:each) do
- server_shutdown
- end
-
- it "should get the content following all the redirects" do
- EM.run do
- r = RDaneel.new("http://127.0.0.1:#{port}/redirect_me")
- r.callback do
- r.http_client.response_header.status.should == 200
- r.http_client.response.should == "Hello World!"
- r.redirects.should == [ "http://127.0.0.1:#{port}/redirect_me",
- "http://127.0.0.1:#{port}/redirect_me_again"]
- r.uri.to_s.should == "http://127.0.0.1:#{port}/hello_world"
- EM.stop
- end
- r.errback do
- fail
- EM.stop
- end
- r.get(:redirects => 3)
- end
-
- end
-
- end
-
- describe "when there are as many redirects as the maximum" do
- before(:each) do
- server_setup(port) do |server|
- mount(server, :path => '/robots.txt', :status => 404,
- :block => should_be_hit_twice )
- mount(server, :path => '/redirect_me', :status => 301,
- :location => "http://127.0.0.1:#{port}/hello_world",
- :block => should_be_hit_once )
- mount(server, :path => '/hello_world', :status => 200,
- :body => 'Hello World!',
- :block => should_be_hit_once )
- end
- end
-
- after(:each) do
- server_shutdown
- end
-
- it "should get the content following all the redirects" do
- EM.run do
- r = RDaneel.new("http://127.0.0.1:#{port}/redirect_me")
- r.callback do
- r.http_client.response_header.status.should == 200
- r.http_client.response.should == "Hello World!"
- r.redirects.should == ["http://127.0.0.1:#{port}/redirect_me"]
- r.uri.to_s.should == "http://127.0.0.1:#{port}/hello_world"
- EM.stop
- end
- r.errback do
- fail
- EM.stop
- end
- r.get(:redirects => 1)
- end
-
- end
-
- end
-
- describe "when the number of redirects exceed the maximum specified" do
- before(:each) do
- server_setup(port) do |server|
- mount(server, :path => '/robots.txt', :status => 404,
- :block => should_be_hit_twice )
- mount(server, :path => '/redirect_me', :status => 301,
- :location => "http://127.0.0.1:#{port}/redirect_me_again",
- :block => should_be_hit_once )
- mount(server, :path => '/redirect_me_again', :status => 301,
- :location => "http://127.0.0.1:#{port}/hello_world",
- :block => should_be_hit_once )
- mount(server, :path => '/hello_world', :status => 200,
- :body => 'Hello World!',
- :block => should_not_be_hit )
- end
- end
-
- after(:each) do
- server_shutdown
- end
-
- it "should stop following redirects once the maximum specified is reached" do
- EM.run do
- r = RDaneel.new("http://127.0.0.1:#{port}/redirect_me")
- r.callback do
- fail
- EM.stop
- end
- r.errback do
- r.redirects.should == ["http://127.0.0.1:#{port}/redirect_me"]
- r.error.should == "Exceeded maximum number of redirects"
- EM.stop
- end
- r.get(:redirects => 1)
- end
-
- end
-
- end
-
- end
-
- end
-
- end
-
data/spec/using_cache_spec.rb DELETED
@@ -1,46 +0,0 @@
- require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
-
- describe "RDaneel when there is a cache" do
-
- let(:port) {8082}
-
- describe "when there is no robots.txt in the host" do
-
- before(:each) do
- RDaneel.robots_cache = {}
- server_setup(port) do |server|
- mount(server, :path => '/robots.txt', :status => 404,
- :block => should_be_hit_once )
- mount(server, :path => '/redirect_me', :status => 301,
- :location => "http://127.0.0.1:#{port}/hello_world",
- :block => should_be_hit_once )
- mount(server, :path => '/hello_world', :status => 200,
- :body => 'Hello World!',
- :block => should_be_hit_once )
- end
- end
-
- after(:each) do
- server_shutdown
- end
-
- it "should try to get the robots.txt just once" do
- EM.run do
- r = RDaneel.new("http://127.0.0.1:#{port}/redirect_me")
- r.callback do
- r.http_client.response_header.status.should == 200
- r.http_client.response.should == "Hello World!"
- r.redirects.should == [ "http://127.0.0.1:#{port}/redirect_me"]
- r.uri.to_s.should == "http://127.0.0.1:#{port}/hello_world"
- EM.stop
- end
- r.errback do
- fail
- EM.stop
- end
- r.get(:redirects => 3)
- end
- end
- end
- end
-