rdaneel 0.1.3 → 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +2 -0
- data/Rakefile +20 -18
- data/VERSION +1 -1
- data/features/get_with_cache.feature +67 -0
- data/features/get_without_cache.feature +155 -0
- data/features/step_definitions/rdaneel_steps.rb +79 -0
- data/features/support/burrito.rb +69 -0
- data/features/support/env.rb +22 -0
- data/lib/rdaneel.rb +83 -19
- data/spec/rdaneel_spec.rb +47 -0
- data/spec/spec_helper.rb +0 -87
- data/spec/streamed_content_spec.rb +1 -1
- metadata +42 -12
- data/spec/no_redirects_neither_robots_spec.rb +0 -130
- data/spec/redirects_without_robots_spec.rb +0 -175
- data/spec/using_cache_spec.rb +0 -46
data/.gitignore
CHANGED
data/Rakefile
CHANGED
@@ -10,9 +10,11 @@ begin
|
|
10
10
|
gem.email = ["edgargonzalez@gmail.com", "anibalrojas@gmail.com"]
|
11
11
|
gem.homepage = "http://github.com/hasmanydevelopers/RDaneel"
|
12
12
|
gem.authors = ["Edgar Gonzalez", "Anibal Rojas"]
|
13
|
-
gem.add_dependency("em-http-request", ">= 0.2.
|
13
|
+
gem.add_dependency("em-http-request", ">= 0.2.11")
|
14
14
|
gem.add_dependency('robot_rules', '>= 0.9.3')
|
15
15
|
gem.add_development_dependency "rspec", ">= 1.2.9"
|
16
|
+
gem.add_development_dependency "cucumber", ">= 0.8.5"
|
17
|
+
gem.add_development_dependency "relevance-rcov", ">= 0.9.2.1"
|
16
18
|
# gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
|
17
19
|
end
|
18
20
|
Jeweler::GemcutterTasks.new
|
@@ -20,29 +22,29 @@ rescue LoadError
|
|
20
22
|
puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
|
21
23
|
end
|
22
24
|
|
25
|
+
require 'cucumber/rake/task'
|
26
|
+
Cucumber::Rake::Task.new(:features) do |t|
|
27
|
+
t.cucumber_opts = "--format pretty" # Any valid command line option can go here.
|
28
|
+
t.rcov = true
|
29
|
+
t.rcov_opts = %w{--exclude gems\/,spec\/,features\/ --aggregate coverage.data}
|
30
|
+
end
|
31
|
+
|
23
32
|
require 'spec/rake/spectask'
|
24
33
|
Spec::Rake::SpecTask.new(:spec) do |spec|
|
25
34
|
spec.libs << 'lib' << 'spec'
|
26
35
|
spec.spec_files = FileList['spec/**/*_spec.rb']
|
27
|
-
end
|
28
|
-
|
29
|
-
Spec::Rake::SpecTask.new(:rcov) do |spec|
|
30
|
-
spec.libs << 'lib' << 'spec'
|
31
|
-
spec.pattern = 'spec/**/*_spec.rb'
|
32
36
|
spec.rcov = true
|
37
|
+
spec.rcov_opts = %w{--exclude gems\/,spec\/,features\/ --aggregate coverage.data}
|
33
38
|
end
|
34
39
|
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
Rake::RDocTask.new do |rdoc|
|
41
|
-
version = File.exist?('VERSION') ? File.read('VERSION') : ""
|
42
|
-
|
43
|
-
rdoc.rdoc_dir = 'rdoc'
|
44
|
-
rdoc.title = "rdaneel #{version}"
|
45
|
-
rdoc.rdoc_files.include('README*')
|
46
|
-
rdoc.rdoc_files.include('lib/**/*.rb')
|
40
|
+
desc "Run both specs and features and generate aggregated coverage"
|
41
|
+
task :all_tests do |t|
|
42
|
+
rm "coverage.data" if File.exist?("coverage.data")
|
43
|
+
Rake::Task['spec'].invoke
|
44
|
+
Rake::Task["features"].invoke
|
47
45
|
end
|
48
46
|
|
47
|
+
task :features => :check_dependencies
|
48
|
+
task :spec => :check_dependencies
|
49
|
+
task :default => :all_tests
|
50
|
+
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
0.2.2
|
@@ -0,0 +1,67 @@
|
|
1
|
+
Feature: get a url using cache
|
2
|
+
In order to fetch content from internet
|
3
|
+
As a crawler
|
4
|
+
I want to get a url respecting robots.txt rules
|
5
|
+
|
6
|
+
Scenario: the url to fetch is redirected
|
7
|
+
Given a cache for RDaneel
|
8
|
+
And a robots.txt that allows RDaneel
|
9
|
+
And a HelloWorld url
|
10
|
+
And a "/redirect_me" url that redirects 301 to "http://127.0.0.1:3210/redirect_me_again" url
|
11
|
+
And a "/redirect_me_again" url that redirects 302 to "/hello_world" url
|
12
|
+
When I get the "/redirect_me" url following a maximum of 3 redirects
|
13
|
+
Then I should get the content for HelloWorld url
|
14
|
+
And the http response code should be 200
|
15
|
+
And I should get 2 redirects
|
16
|
+
And The redirects sequence should be:
|
17
|
+
| http://127.0.0.1:3210/redirect_me |
|
18
|
+
| http://127.0.0.1:3210/redirect_me_again |
|
19
|
+
And The requests sequence should be:
|
20
|
+
| status | path |
|
21
|
+
| 200 | /robots.txt |
|
22
|
+
| 301 | /redirect_me |
|
23
|
+
| 302 | /redirect_me_again |
|
24
|
+
| 200 | /hello_world |
|
25
|
+
And The cache for "http://127.0.0.1:3210/robots.txt" should be
|
26
|
+
"""
|
27
|
+
User-agent: *
|
28
|
+
Disallow: /cgi-bin/
|
29
|
+
"""
|
30
|
+
|
31
|
+
Scenario: a cached robots.txt exists denying RDaneel's user-agent
|
32
|
+
Given a cache for RDaneel
|
33
|
+
And The cache for "http://127.0.0.1:3210/robots.txt" is:
|
34
|
+
"""
|
35
|
+
User-agent: *
|
36
|
+
Disallow: /
|
37
|
+
"""
|
38
|
+
And a robots.txt that denies RDaneel
|
39
|
+
And a HelloWorld url
|
40
|
+
When I get the "/hello_world" url following a maximum of 1 redirects
|
41
|
+
Then I should get a "Robots are not allowed" error
|
42
|
+
And I should get 0 redirects
|
43
|
+
And The requests should be empty
|
44
|
+
|
45
|
+
|
46
|
+
Scenario: the url to fetch is redirected to an unreachable server but a robots cache exists for this server allowing RDaneel
|
47
|
+
Given a cache for RDaneel
|
48
|
+
And The cache for "http://127.0.0.1:3210/robots.txt" is:
|
49
|
+
"""
|
50
|
+
User-agent: *
|
51
|
+
Disallow: /cgi-bin/
|
52
|
+
"""
|
53
|
+
And The cache for "http://127.0.0.1:3211/robots.txt" is:
|
54
|
+
"""
|
55
|
+
User-agent: *
|
56
|
+
Disallow: /cgi-bin/
|
57
|
+
"""
|
58
|
+
And a "/redirect_me" url that redirects 301 to "http://127.0.0.1:3211/unreacheable" url
|
59
|
+
When I get the "/redirect_me" url following a maximum of 3 redirects
|
60
|
+
Then I should get a "An error occurred when fetching http://127.0.0.1:3211/unreacheable" error
|
61
|
+
And I should get 1 redirects
|
62
|
+
And The redirects sequence should be:
|
63
|
+
| http://127.0.0.1:3210/redirect_me |
|
64
|
+
And The requests sequence should be:
|
65
|
+
| status | path |
|
66
|
+
| 301 | /redirect_me |
|
67
|
+
|
@@ -0,0 +1,155 @@
|
|
1
|
+
Feature: get a url without using cache
|
2
|
+
In order to fetch content from internet
|
3
|
+
As a crawler
|
4
|
+
I want to get a url respecting robots.txt rules
|
5
|
+
|
6
|
+
Scenario: a robots.txt exists allowing RDaneel's user-agent
|
7
|
+
Given a robots.txt that allows RDaneel
|
8
|
+
And a HelloWorld url
|
9
|
+
When I get the "/hello_world" url following a maximum of 1 redirects
|
10
|
+
Then I should get the content for HelloWorld url
|
11
|
+
And the http response code should be 200
|
12
|
+
And I should get 0 redirects
|
13
|
+
And The requests sequence should be:
|
14
|
+
| status | path |
|
15
|
+
| 200 | /robots.txt |
|
16
|
+
| 200 | /hello_world |
|
17
|
+
|
18
|
+
Scenario: a robots.txt exists denying RDaneel's user-agent
|
19
|
+
Given a robots.txt that denies RDaneel
|
20
|
+
And a HelloWorld url
|
21
|
+
When I get the "/hello_world" url following a maximum of 1 redirects
|
22
|
+
Then I should get a "Robots are not allowed" error
|
23
|
+
And I should get 0 redirects
|
24
|
+
And The requests sequence should be:
|
25
|
+
| status | path |
|
26
|
+
| 200 | /robots.txt |
|
27
|
+
|
28
|
+
Scenario: the url to fetch is redirected
|
29
|
+
Given a robots.txt that allows RDaneel
|
30
|
+
And a HelloWorld url
|
31
|
+
And a "/redirect_me" url that redirects 301 to "http://127.0.0.1:3210/redirect_me_again" url
|
32
|
+
And a "/redirect_me_again" url that redirects 302 to "/hello_world" url
|
33
|
+
When I get the "/redirect_me" url following a maximum of 3 redirects
|
34
|
+
Then I should get the content for HelloWorld url
|
35
|
+
And the http response code should be 200
|
36
|
+
And I should get 2 redirects
|
37
|
+
And The redirects sequence should be:
|
38
|
+
| http://127.0.0.1:3210/redirect_me |
|
39
|
+
| http://127.0.0.1:3210/redirect_me_again |
|
40
|
+
And The requests sequence should be:
|
41
|
+
| status | path |
|
42
|
+
| 200 | /robots.txt |
|
43
|
+
| 301 | /redirect_me |
|
44
|
+
| 200 | /robots.txt |
|
45
|
+
| 302 | /redirect_me_again |
|
46
|
+
| 200 | /robots.txt |
|
47
|
+
| 200 | /hello_world |
|
48
|
+
|
49
|
+
Scenario: the url to fetch exceeds the maximum redirects specified
|
50
|
+
Given a robots.txt that allows RDaneel
|
51
|
+
And a HelloWorld url
|
52
|
+
And a "/redirect_me" url that redirects 301 to "http://127.0.0.1:3210/redirect_me_again" url
|
53
|
+
And a "/redirect_me_again" url that redirects 302 to "/hello_world" url
|
54
|
+
When I get the "/redirect_me" url following a maximum of 1 redirects
|
55
|
+
Then I should get a "Exceeded maximum number of redirects: 1" error
|
56
|
+
And I should get 1 redirects
|
57
|
+
And The redirects sequence should be:
|
58
|
+
| http://127.0.0.1:3210/redirect_me |
|
59
|
+
And The requests sequence should be:
|
60
|
+
| status | path |
|
61
|
+
| 200 | /robots.txt |
|
62
|
+
| 301 | /redirect_me |
|
63
|
+
| 200 | /robots.txt |
|
64
|
+
| 302 | /redirect_me_again |
|
65
|
+
|
66
|
+
Scenario: the url to fetch has an infinite redirect
|
67
|
+
Given a robots.txt that allows RDaneel
|
68
|
+
And a HelloWorld url
|
69
|
+
And a "/redirect_me" url that redirects 302 to "/redirect_me_again" url
|
70
|
+
And a "/redirect_me_again" url that redirects 302 to "/redirect_me" url
|
71
|
+
When I get the "/redirect_me" url following a maximum of 2 redirects
|
72
|
+
Then I should get a "Infinite redirect detected for: http://127.0.0.1:3210/redirect_me" error
|
73
|
+
And I should get 2 redirects
|
74
|
+
And The redirects sequence should be:
|
75
|
+
| http://127.0.0.1:3210/redirect_me |
|
76
|
+
| http://127.0.0.1:3210/redirect_me_again |
|
77
|
+
And The requests sequence should be:
|
78
|
+
| status | path |
|
79
|
+
| 200 | /robots.txt |
|
80
|
+
| 302 | /redirect_me |
|
81
|
+
| 200 | /robots.txt |
|
82
|
+
| 302 | /redirect_me_again |
|
83
|
+
|
84
|
+
Scenario: the url to fetch redirects to not found url
|
85
|
+
Given a robots.txt that allows RDaneel
|
86
|
+
And a "/redirect_me" url that redirects 302 to "/not_found" url
|
87
|
+
When I get the "/redirect_me" url following a maximum of 2 redirects
|
88
|
+
Then I should get a "Not success neither redirect" error
|
89
|
+
And I should get 1 redirects
|
90
|
+
And The redirects sequence should be:
|
91
|
+
| http://127.0.0.1:3210/redirect_me |
|
92
|
+
And The requests sequence should be:
|
93
|
+
| status | path |
|
94
|
+
| 200 | /robots.txt |
|
95
|
+
| 302 | /redirect_me |
|
96
|
+
| 200 | /robots.txt |
|
97
|
+
| 404 | /not_found |
|
98
|
+
|
99
|
+
|
100
|
+
Scenario: robots.txt doesn't exist
|
101
|
+
Given a HelloWorld url
|
102
|
+
And a "/redirect_me" url that redirects 301 to "http://127.0.0.1:3210/redirect_me_again" url
|
103
|
+
And a "/redirect_me_again" url that redirects 302 to "/hello_world" url
|
104
|
+
When I get the "/redirect_me" url following a maximum of 3 redirects
|
105
|
+
Then I should get the content for HelloWorld url
|
106
|
+
And the http response code should be 200
|
107
|
+
And I should get 2 redirects
|
108
|
+
And The redirects sequence should be:
|
109
|
+
| http://127.0.0.1:3210/redirect_me |
|
110
|
+
| http://127.0.0.1:3210/redirect_me_again |
|
111
|
+
And The requests sequence should be:
|
112
|
+
| status | path |
|
113
|
+
| 404 | /robots.txt |
|
114
|
+
| 301 | /redirect_me |
|
115
|
+
| 404 | /robots.txt |
|
116
|
+
| 302 | /redirect_me_again |
|
117
|
+
| 404 | /robots.txt |
|
118
|
+
| 200 | /hello_world |
|
119
|
+
|
120
|
+
Scenario: the url to fetch redirects to a malformed url (format handled by em-http-request)
|
121
|
+
Given a robots.txt that allows RDaneel
|
122
|
+
And a "/redirect_me" url that redirects 302 to "http://malformed:url" url
|
123
|
+
When I get the "/redirect_me" url following a maximum of 2 redirects
|
124
|
+
Then I should get a "Location header format error" error
|
125
|
+
And I should get 0 redirects
|
126
|
+
And The requests sequence should be:
|
127
|
+
| status | path |
|
128
|
+
| 200 | /robots.txt |
|
129
|
+
| 302 | /redirect_me |
|
130
|
+
|
131
|
+
Scenario: the url to fetch redirects to a malformed url (format not handled by em-http-request 0.2.10)
|
132
|
+
Given a robots.txt that allows RDaneel
|
133
|
+
And a "/redirect_me" url that redirects 302 to "http:/malformed:url" url
|
134
|
+
When I get the "/redirect_me" url following a maximum of 2 redirects
|
135
|
+
Then I should get a "Location header format error" error
|
136
|
+
And I should get 0 redirects
|
137
|
+
And The requests sequence should be:
|
138
|
+
| status | path |
|
139
|
+
| 200 | /robots.txt |
|
140
|
+
| 302 | /redirect_me |
|
141
|
+
|
142
|
+
Scenario: the url to fetch is redirected to an unreachable host:port
|
143
|
+
Given a robots.txt that allows RDaneel
|
144
|
+
And a HelloWorld url
|
145
|
+
And a "/redirect_me" url that redirects 301 to "http://127.0.0.1:3211/unreacheable" url
|
146
|
+
When I get the "/redirect_me" url following a maximum of 3 redirects
|
147
|
+
Then I should get a "An error occurred when fetching http://127.0.0.1:3211/unreacheable" error
|
148
|
+
And I should get 1 redirects
|
149
|
+
And The redirects sequence should be:
|
150
|
+
| http://127.0.0.1:3210/redirect_me |
|
151
|
+
And The requests sequence should be:
|
152
|
+
| status | path |
|
153
|
+
| 200 | /robots.txt |
|
154
|
+
| 301 | /redirect_me |
|
155
|
+
|
@@ -0,0 +1,79 @@
|
|
1
|
+
|
2
|
+
Given /^a robots\.txt that allows RDaneel$/ do
|
3
|
+
$server.mount(:path => '/robots.txt', :status => 200,
|
4
|
+
:body => "User-agent: *\nDisallow: /cgi-bin/")
|
5
|
+
end
|
6
|
+
|
7
|
+
Given /^a robots\.txt that denies RDaneel$/ do
|
8
|
+
$server.mount(:path => '/robots.txt', :status => 200,
|
9
|
+
:body => "User-agent: *\nDisallow: /")
|
10
|
+
end
|
11
|
+
|
12
|
+
Given /^a HelloWorld url$/ do
|
13
|
+
$server.mount(:path => '/hello_world', :status => 200,
|
14
|
+
:body => "Hello World")
|
15
|
+
end
|
16
|
+
|
17
|
+
Given /^a "([^"]*)" url that redirects (\d+) to "([^"]*)" url$/ do |url, status, redirected_to|
|
18
|
+
$server.mount(:path => url, :status => status.to_i,
|
19
|
+
:location => redirected_to)
|
20
|
+
end
|
21
|
+
|
22
|
+
Given /^a cache for RDaneel$/ do
|
23
|
+
RDaneel.robots_cache = {}
|
24
|
+
end
|
25
|
+
|
26
|
+
Given /^The cache for "([^"]*)" is:$/ do |robots_url, robots_file|
|
27
|
+
RDaneel.robots_cache[robots_url] = robots_file
|
28
|
+
end
|
29
|
+
|
30
|
+
When /^I get the "([^"]*)" url following a maximum of (\d+) redirects$/ do |url, max_redirects|
|
31
|
+
EM.run do
|
32
|
+
@r = RDaneel.new("#{HOST}#{url}")
|
33
|
+
@r.callback do
|
34
|
+
EM.stop
|
35
|
+
end
|
36
|
+
@r.errback do
|
37
|
+
EM.stop
|
38
|
+
end
|
39
|
+
@r.get(:redirects => max_redirects)
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
Then /^I should get the content for HelloWorld url$/ do
|
44
|
+
@r.http_client.response.should == "Hello World"
|
45
|
+
end
|
46
|
+
|
47
|
+
Then /^the http response code should be (\d+)$/ do |code|
|
48
|
+
@r.http_client.response_header.status.should == code.to_i
|
49
|
+
end
|
50
|
+
|
51
|
+
Then /^I should get (\d+) redirects$/ do |redirects_count|
|
52
|
+
@r.redirects.size.should == redirects_count.to_i
|
53
|
+
end
|
54
|
+
|
55
|
+
Then /^The requests sequence should be:$/ do |expected_table|
|
56
|
+
expected_requests = []
|
57
|
+
expected_table.hashes.each do |hash|
|
58
|
+
expected_requests << {:status => hash[:status].to_i,
|
59
|
+
:path => hash[:path]}
|
60
|
+
end
|
61
|
+
$server.requests.should == expected_requests
|
62
|
+
end
|
63
|
+
|
64
|
+
Then /^The requests should be empty$/ do
|
65
|
+
$server.requests.should be_empty
|
66
|
+
end
|
67
|
+
|
68
|
+
Then /^The redirects sequence should be:$/ do |expected_redirects|
|
69
|
+
@r.redirects.should == expected_redirects.raw.flatten
|
70
|
+
end
|
71
|
+
|
72
|
+
Then /^I should get a "([^"]*)" error$/ do |error_message|
|
73
|
+
@r.error.should == error_message
|
74
|
+
end
|
75
|
+
|
76
|
+
Then /^The cache for "([^"]*)" should be$/ do |robots_url, robots_file|
|
77
|
+
RDaneel.robots_cache[robots_url].should == robots_file
|
78
|
+
end
|
79
|
+
|
@@ -0,0 +1,69 @@
|
|
1
|
+
require 'socket'
|
2
|
+
|
3
|
+
class Burrito
|
4
|
+
|
5
|
+
STATUS_MESSAGES = {
|
6
|
+
200 => 'OK',
|
7
|
+
301 => 'Moved Permanently',
|
8
|
+
302 => 'Found',
|
9
|
+
404 => 'Not Found'
|
10
|
+
}
|
11
|
+
|
12
|
+
attr_reader :requests
|
13
|
+
|
14
|
+
def initialize
|
15
|
+
@routes = {}
|
16
|
+
@requests = []
|
17
|
+
end
|
18
|
+
|
19
|
+
def mount(opts)
|
20
|
+
@routes[opts[:path]] = { :status => opts[:status],
|
21
|
+
:body => opts[:body],
|
22
|
+
:location => opts[:location] }
|
23
|
+
end
|
24
|
+
|
25
|
+
def reset
|
26
|
+
@routes = {}
|
27
|
+
@requests = []
|
28
|
+
end
|
29
|
+
|
30
|
+
def start
|
31
|
+
@thread = Thread.new do
|
32
|
+
|
33
|
+
webserver = TCPServer.new('127.0.0.1', 3210)
|
34
|
+
|
35
|
+
while session = webserver.accept
|
36
|
+
request = session.gets
|
37
|
+
path = '/' << request.gsub(/GET\ \//, '').gsub(/\ HTTP.*/, '').chomp
|
38
|
+
if @routes[path]
|
39
|
+
status = @routes[path][:status]
|
40
|
+
body = @routes[path][:body]
|
41
|
+
location = @routes[path][:location]
|
42
|
+
else
|
43
|
+
status = 404
|
44
|
+
body = nil
|
45
|
+
location = nil
|
46
|
+
end
|
47
|
+
@requests.push( { :status => status, :path => path } )
|
48
|
+
response = "HTTP/1.1 #{status} #{STATUS_MESSAGES[status]}\r\n"
|
49
|
+
response << "Server: burrito/0.0.1\r\n"
|
50
|
+
response << "Content-Length: #{ body ? body.length : 0 }\r\n"
|
51
|
+
response << "Content-Type: text/plain\r\n" if body
|
52
|
+
response << "Location: #{location}\r\n" if location
|
53
|
+
response << "Connection: close\r\n"
|
54
|
+
response << "\r\n"
|
55
|
+
response << "#{body}" if body
|
56
|
+
session.print response
|
57
|
+
session.close
|
58
|
+
end
|
59
|
+
|
60
|
+
end
|
61
|
+
|
62
|
+
end
|
63
|
+
|
64
|
+
def shutdown
|
65
|
+
@thread.terminate
|
66
|
+
end
|
67
|
+
|
68
|
+
end
|
69
|
+
|
@@ -0,0 +1,22 @@
|
|
1
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
2
|
+
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '../..', 'lib'))
|
3
|
+
require 'rubygems'
|
4
|
+
require 'rdaneel'
|
5
|
+
require 'burrito'
|
6
|
+
|
7
|
+
unless $server
|
8
|
+
$server = Burrito.new
|
9
|
+
$server.start
|
10
|
+
end
|
11
|
+
|
12
|
+
HOST = "http://127.0.0.1:3210"
|
13
|
+
|
14
|
+
Before do
|
15
|
+
$server.reset
|
16
|
+
RDaneel.robots_cache = nil
|
17
|
+
end
|
18
|
+
|
19
|
+
at_exit do
|
20
|
+
$server.shutdown
|
21
|
+
end
|
22
|
+
|
data/lib/rdaneel.rb
CHANGED
@@ -19,10 +19,12 @@ class RDaneel
|
|
19
19
|
attr_accessor :uri
|
20
20
|
attr_reader :error, :redirects, :http_client
|
21
21
|
|
22
|
-
def initialize(uri)
|
22
|
+
def initialize(uri,options = {})
|
23
23
|
@uri = uri.kind_of?(Addressable::URI) ? uri : Addressable::URI::parse(uri)
|
24
24
|
@uri.path = "/" if @uri.path.nil? || @uri.path == ""
|
25
25
|
@redirects = []
|
26
|
+
@verbose = options[:verbose]
|
27
|
+
@hash = @uri.hash if @verbose
|
26
28
|
end
|
27
29
|
|
28
30
|
def get(opts = {})
|
@@ -37,91 +39,118 @@ class RDaneel
|
|
37
39
|
if success?(h)
|
38
40
|
@uri = current_uri if current_uri != @uri
|
39
41
|
@http_client = h
|
42
|
+
verbose("Succeded fetching: #{current_uri}", h, :status, :response)
|
40
43
|
succeed(self)
|
41
44
|
elsif redirected?(h)
|
42
45
|
if @redirects.size >= max_redirects
|
43
46
|
@http_client = h
|
44
|
-
@error = "Exceeded maximum number of redirects"
|
47
|
+
@error = "Exceeded maximum number of redirects: #{max_redirects}"
|
48
|
+
verbose(@error, h, :status, :response)
|
45
49
|
fail(self)
|
46
50
|
return
|
47
51
|
end
|
52
|
+
@redirects << current_uri.to_s
|
53
|
+
current_uri = redirect_url(h, current_uri)
|
48
54
|
begin
|
49
|
-
|
50
|
-
current_uri = redirect_url(h, current_uri)
|
55
|
+
verbose("Redirected to: #{current_uri.to_s} from: #{@redirects[-1]}", h, :status, :response)
|
51
56
|
if @redirects.include?(current_uri.to_s)
|
52
57
|
@http_client = h
|
53
|
-
@error = "
|
58
|
+
@error = "Infinite redirect detected for: #{current_uri.to_s}"
|
59
|
+
verbose(@error, h, :status, :response)
|
54
60
|
fail(self)
|
55
61
|
return
|
56
62
|
end
|
57
63
|
_get.call
|
58
|
-
rescue
|
64
|
+
rescue StandardError => se
|
59
65
|
@http_client = h
|
60
|
-
@error = "
|
66
|
+
@error = "Error trying to follow a redirect #{current_uri.to_s}: #{h.response_header.location}"
|
67
|
+
verbose(@error, h, :status, :response)
|
61
68
|
fail(self)
|
62
69
|
end
|
63
70
|
else
|
64
71
|
# other error
|
65
72
|
@http_client = h
|
66
|
-
@error = "
|
73
|
+
@error = "Not success neither redirect"
|
74
|
+
verbose(@error, h, :status, :response)
|
67
75
|
fail(self)
|
68
76
|
end
|
69
77
|
}
|
70
78
|
_get = lambda {
|
71
79
|
robots_url = robots_txt_url(current_uri)
|
72
80
|
if robots_cache && robots_file = robots_cache[robots_url.to_s]
|
81
|
+
verbose("Found cached robots.txt:\n#{robots_cache[robots_url.to_s]} for: #{current_uri}")
|
73
82
|
if robots_allowed?(robots_file, useragent, robots_url, current_uri)
|
83
|
+
verbose("Robots identified by user agent: #{useragent} are allowed to access: #{current_uri}")
|
74
84
|
begin
|
75
85
|
h = EM::HttpRequest.new(current_uri).get(options)
|
86
|
+
verbose("Started fetching: #{current_uri}",h,:request)
|
76
87
|
h.callback(&_handle_uri_callback)
|
77
88
|
h.errback {
|
78
89
|
@http_client = h
|
79
|
-
@error = h
|
90
|
+
@error = error_message(h)
|
91
|
+
verbose("#{@error} for: #{current_uri}",h,:status,:response)
|
80
92
|
fail(self)
|
81
93
|
}
|
82
94
|
rescue StandardError => se
|
83
95
|
@http_client = EM::HttpClient.new("")
|
84
96
|
@error = "#{se.message}\n#{se.backtrace.inspect}"
|
97
|
+
verbose("For: #{current_uri} something went wrong: #{@error}")
|
85
98
|
fail(self)
|
86
99
|
end
|
87
100
|
else
|
88
101
|
@http_client = EM::HttpClient.new("")
|
89
|
-
@error = "
|
102
|
+
@error = "Robots are not allowed"
|
103
|
+
verbose("#{@error} to access: #{current_uri} when identified by user agent: #{useragent}")
|
90
104
|
fail(self)
|
91
105
|
end
|
92
106
|
else
|
93
107
|
robots_url = robots_txt_url(current_uri)
|
94
108
|
robots = EM::HttpRequest.new(robots_url).get(:redirects => 2) # get the robots.txt following redirects
|
109
|
+
verbose("Started fetching robots.txt from: #{robots_url} for: #{current_uri}",robots,:request)
|
95
110
|
robots.callback {
|
96
|
-
|
97
|
-
|
111
|
+
if success?(robots)
|
112
|
+
robots_file = robots.response
|
113
|
+
verbose("Found robots.txt at #{robots_url}:\n#{robots_file}", robots, :status, :response)
|
114
|
+
else
|
115
|
+
robots_file = ''
|
116
|
+
verbose("Didn't find robots.txt at #{robots_url}", robots, :status, :response)
|
117
|
+
end
|
118
|
+
robots_cache[robots_txt_url(robots_url).to_s] = robots_file if robots_cache
|
98
119
|
if robots_allowed?(robots_file, useragent, robots_url, current_uri)
|
120
|
+
verbose("Robots identified by user agent: #{useragent} are allowed to access: #{current_uri}")
|
99
121
|
begin
|
100
122
|
h = EM::HttpRequest.new(current_uri).get(options)
|
123
|
+
verbose("Started fetching: #{current_uri}",h,:request)
|
101
124
|
h.callback(&_handle_uri_callback)
|
102
125
|
h.errback {
|
103
126
|
@http_client = h
|
104
|
-
@error = h
|
127
|
+
@error = error_message(h)
|
128
|
+
verbose("#{@error} for: #{current_uri}", h, :status, :response)
|
105
129
|
fail(self)
|
106
130
|
}
|
107
131
|
rescue StandardError => se
|
108
132
|
@http_client = EM::HttpClient.new("")
|
109
133
|
@error = "#{se.message}\n#{se.backtrace.inspect}"
|
134
|
+
verbose("For: #{current_uri} something went wrong: #{@error}")
|
110
135
|
fail(self)
|
111
136
|
end
|
112
137
|
else
|
113
138
|
@http_client = EM::HttpClient.new("")
|
114
|
-
@error = "
|
139
|
+
@error = "Robots are not allowed"
|
140
|
+
verbose("#{@error} to access: #{current_uri} when identified by user agent: #{useragent}")
|
115
141
|
fail(self)
|
116
142
|
end
|
117
143
|
}
|
118
144
|
robots.errback {
|
145
|
+
verbose("Failed to fetch robots.txt: from: #{robots_url} for: #{current_uri}", robots, :status, :response)
|
119
146
|
robots_cache[robots_url.to_s] = "" if robots_cache
|
120
147
|
h = EM::HttpRequest.new(current_uri).get(options)
|
148
|
+
verbose("Started fetching: #{current_uri}",h,:request)
|
121
149
|
h.callback(&_handle_uri_callback)
|
122
150
|
h.errback {
|
123
151
|
@http_client = h
|
124
|
-
@error = h
|
152
|
+
@error = error_message(h)
|
153
|
+
verbose("#{@error} for: #{current_uri}", h, :status, :response)
|
125
154
|
fail(self)
|
126
155
|
}
|
127
156
|
}
|
@@ -155,6 +184,14 @@ class RDaneel
|
|
155
184
|
Addressable::URI.parse("http://#{location}/robots.txt")
|
156
185
|
end
|
157
186
|
|
187
|
+
def error_message(http_client)
|
188
|
+
@error = if http_client.error.nil? || http_client.error.empty?
|
189
|
+
"An error occurred when fetching #{http_client.uri.to_s}"
|
190
|
+
else
|
191
|
+
http_client.error
|
192
|
+
end
|
193
|
+
end
|
194
|
+
|
158
195
|
def success?(http_client)
|
159
196
|
http_client.response_header.status == 200
|
160
197
|
end
|
@@ -164,10 +201,37 @@ class RDaneel
|
|
164
201
|
end
|
165
202
|
|
166
203
|
def redirect_url(http_client, u)
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
location
|
204
|
+
# em-http-request handles the case when redirect is relative
|
205
|
+
# at this point http_client.response_header.location should always have an absolute and valid url
|
206
|
+
# but this invalid url is parsed successfully http:/malformed:url so we ask for host
|
207
|
+
Addressable::URI.parse(http_client.response_header.location)
|
208
|
+
end
|
209
|
+
|
210
|
+
def verbose(message, client = nil, *args)
|
211
|
+
return unless @verbose
|
212
|
+
message.each { |l| hashed_puts('*', l) }
|
213
|
+
args.each do |a|
|
214
|
+
case a
|
215
|
+
when :status
|
216
|
+
if client.response_header.status == 0
|
217
|
+
hashed_puts('< Status:', '0 (timeout)')
|
218
|
+
else
|
219
|
+
hashed_puts('< Status:', client.response_header.status)
|
220
|
+
end
|
221
|
+
when :request # this is a options hash
|
222
|
+
headers = client.options[:head]
|
223
|
+
headers.each { |k,v| hashed_puts('>', "#{k}: #{v}") } if headers
|
224
|
+
when :response # this is an array
|
225
|
+
client.response_header.each { |r| hashed_puts('<', "#{r[0]}: #{r[1]}") }
|
226
|
+
end
|
227
|
+
end
|
171
228
|
end
|
229
|
+
|
230
|
+
private
|
231
|
+
|
232
|
+
def hashed_puts( prefix, message )
|
233
|
+
$stdout.puts("[#{@hash}] [#{Time.now.strftime('%Y-%m-%d %H:%m:%S')}] #{prefix} #{message}")
|
234
|
+
end
|
235
|
+
|
172
236
|
end
|
173
237
|
|
@@ -0,0 +1,47 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
2
|
+
|
3
|
+
describe "RDaneel" do
|
4
|
+
|
5
|
+
describe "robots_txt_url" do
|
6
|
+
before(:each) do
|
7
|
+
@rdaneel = RDaneel.new("http://127.0.0.1/anyurl")
|
8
|
+
end
|
9
|
+
|
10
|
+
it "should return the proper url when url don't has a port specified (80 implied)" do
|
11
|
+
url = Addressable::URI.parse("http://127.0.0.1/path/url?param1=value1¶m2=value2")
|
12
|
+
@rdaneel.send(:robots_txt_url,url).to_s.should == "http://127.0.0.1/robots.txt"
|
13
|
+
end
|
14
|
+
|
15
|
+
it "should return the proper url when url has a port 80 specified" do
|
16
|
+
url = Addressable::URI.parse("http://127.0.0.1:80/path/url?param1=value1¶m2=value2")
|
17
|
+
@rdaneel.send(:robots_txt_url,url).to_s.should == "http://127.0.0.1/robots.txt"
|
18
|
+
end
|
19
|
+
|
20
|
+
it "should return the proper url when url has a port different than 80" do
|
21
|
+
url = Addressable::URI.parse("http://127.0.0.1:8080/path/url?param1=value1¶m2=value2")
|
22
|
+
@rdaneel.send(:robots_txt_url,url).to_s.should == "http://127.0.0.1:8080/robots.txt"
|
23
|
+
end
|
24
|
+
|
25
|
+
end
|
26
|
+
|
27
|
+
|
28
|
+
describe "robots_allowed?" do
|
29
|
+
before(:each) do
|
30
|
+
@rdaneel = RDaneel.new("http://127.0.0.1/anyurl")
|
31
|
+
end
|
32
|
+
|
33
|
+
describe "when an error happens parsing the robots rules" do
|
34
|
+
before(:each) do
|
35
|
+
@robot_rules = RobotRules.new("RDaneel")
|
36
|
+
@robot_rules.stub!(:parse).and_raise(StandardError)
|
37
|
+
RobotRules.stub!(:new).and_return(@robot_rules)
|
38
|
+
end
|
39
|
+
|
40
|
+
it "should return true" do #no matter the params
|
41
|
+
@rdaneel.send(:robots_allowed?, nil, nil, nil, nil).should be_true
|
42
|
+
end
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
46
|
+
end
|
47
|
+
|
data/spec/spec_helper.rb
CHANGED
@@ -3,91 +3,4 @@ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
|
3
3
|
require 'rubygems'
|
4
4
|
require 'rdaneel'
|
5
5
|
require 'spec'
|
6
|
-
require 'webrick'
|
7
|
-
|
8
|
-
# keep webrick quiet
|
9
|
-
class ::WEBrick::HTTPServer
|
10
|
-
def access_log(config, req, res)
|
11
|
-
# nop
|
12
|
-
end
|
13
|
-
end
|
14
|
-
class ::WEBrick::BasicLog
|
15
|
-
def log(level, data)
|
16
|
-
# nop
|
17
|
-
end
|
18
|
-
end
|
19
|
-
|
20
|
-
def locked_file
|
21
|
-
File.join(File.dirname(__FILE__),"server_lock-#{@__port}")
|
22
|
-
end
|
23
|
-
|
24
|
-
def server_setup(port=8080, &blk)
|
25
|
-
@__port = port
|
26
|
-
if @server.nil? and !File.exist?(locked_file)
|
27
|
-
File.open(locked_file,'w') {|f| f << 'locked' }
|
28
|
-
@server = WEBrick::HTTPServer.new :Port => port
|
29
|
-
blk.call(@server) if blk
|
30
|
-
queue = Queue.new # synchronize the thread startup to the main thread
|
31
|
-
@test_thread = Thread.new { queue << 1; @server.start }
|
32
|
-
|
33
|
-
# wait for the queue
|
34
|
-
value = queue.pop
|
35
|
-
|
36
|
-
if !value
|
37
|
-
STDERR.puts "Failed to startup test server!"
|
38
|
-
exit(1)
|
39
|
-
end
|
40
|
-
|
41
|
-
trap("INT"){server_shutdown}
|
42
|
-
at_exit{server_shutdown}
|
43
|
-
end
|
44
|
-
end
|
45
|
-
|
46
|
-
def server_shutdown
|
47
|
-
begin
|
48
|
-
if File.exist?(locked_file)
|
49
|
-
File.unlink locked_file
|
50
|
-
@server.shutdown unless @server.nil?
|
51
|
-
@server = nil
|
52
|
-
end
|
53
|
-
rescue Object => e
|
54
|
-
puts "Error #{__FILE__}:#{__LINE__}\n#{e.message}"
|
55
|
-
end
|
56
|
-
end
|
57
|
-
|
58
|
-
def mount(server, opts)
|
59
|
-
raise ":path is required" if opts[:path].nil?
|
60
|
-
raise ":status is required" if opts[:status].nil?
|
61
|
-
server.mount_proc( opts[:path],
|
62
|
-
lambda { |req, resp|
|
63
|
-
resp.status = opts[:status]
|
64
|
-
resp.body = opts[:body] unless opts[:body].nil?
|
65
|
-
resp['Location'] = opts[:location] unless opts[:location].nil?
|
66
|
-
opts[:block].call unless opts[:block].nil?
|
67
|
-
} )
|
68
|
-
end
|
69
|
-
|
70
|
-
def should_not_be_hit
|
71
|
-
should_be_hit( 0 )
|
72
|
-
end
|
73
|
-
|
74
|
-
def should_be_hit_once
|
75
|
-
should_be_hit( 1 )
|
76
|
-
end
|
77
|
-
|
78
|
-
def should_be_hit_twice
|
79
|
-
should_be_hit( 2 )
|
80
|
-
end
|
81
|
-
|
82
|
-
def should_be_hit( times = 1 )
|
83
|
-
l = lambda {}
|
84
|
-
m = l.should_receive(:call).exactly(times).times
|
85
|
-
return l
|
86
|
-
end
|
87
|
-
|
88
|
-
Spec::Runner.configure do |config|
|
89
|
-
config.before :suite do
|
90
|
-
puts "\e[4mThese specs could take a while, please be patience\e[0m"
|
91
|
-
end
|
92
|
-
end
|
93
6
|
|
@@ -9,7 +9,7 @@ describe "RDaneel when the content is chunked (digg.com)" do
|
|
9
9
|
|
10
10
|
it "should get the content" do
|
11
11
|
EM.run do
|
12
|
-
r = RDaneel.new("http://digg.com")
|
12
|
+
r = RDaneel.new("http://digg.com/news")
|
13
13
|
r.callback do
|
14
14
|
r.http_client.response_header.status.should == 200
|
15
15
|
r.http_client.response.should_not be_empty
|
metadata
CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
|
|
4
4
|
prerelease: false
|
5
5
|
segments:
|
6
6
|
- 0
|
7
|
-
-
|
8
|
-
-
|
9
|
-
version: 0.
|
7
|
+
- 2
|
8
|
+
- 2
|
9
|
+
version: 0.2.2
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Edgar Gonzalez
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2010-08-
|
18
|
+
date: 2010-08-27 00:00:00 -04:30
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -28,8 +28,8 @@ dependencies:
|
|
28
28
|
segments:
|
29
29
|
- 0
|
30
30
|
- 2
|
31
|
-
-
|
32
|
-
version: 0.2.
|
31
|
+
- 11
|
32
|
+
version: 0.2.11
|
33
33
|
type: :runtime
|
34
34
|
version_requirements: *id001
|
35
35
|
- !ruby/object:Gem::Dependency
|
@@ -60,6 +60,35 @@ dependencies:
|
|
60
60
|
version: 1.2.9
|
61
61
|
type: :development
|
62
62
|
version_requirements: *id003
|
63
|
+
- !ruby/object:Gem::Dependency
|
64
|
+
name: cucumber
|
65
|
+
prerelease: false
|
66
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
67
|
+
requirements:
|
68
|
+
- - ">="
|
69
|
+
- !ruby/object:Gem::Version
|
70
|
+
segments:
|
71
|
+
- 0
|
72
|
+
- 8
|
73
|
+
- 5
|
74
|
+
version: 0.8.5
|
75
|
+
type: :development
|
76
|
+
version_requirements: *id004
|
77
|
+
- !ruby/object:Gem::Dependency
|
78
|
+
name: relevance-rcov
|
79
|
+
prerelease: false
|
80
|
+
requirement: &id005 !ruby/object:Gem::Requirement
|
81
|
+
requirements:
|
82
|
+
- - ">="
|
83
|
+
- !ruby/object:Gem::Version
|
84
|
+
segments:
|
85
|
+
- 0
|
86
|
+
- 9
|
87
|
+
- 2
|
88
|
+
- 1
|
89
|
+
version: 0.9.2.1
|
90
|
+
type: :development
|
91
|
+
version_requirements: *id005
|
63
92
|
description: Add robots.txt support on top of em-http-request
|
64
93
|
email:
|
65
94
|
- edgargonzalez@gmail.com
|
@@ -78,13 +107,16 @@ files:
|
|
78
107
|
- README.rdoc
|
79
108
|
- Rakefile
|
80
109
|
- VERSION
|
110
|
+
- features/get_with_cache.feature
|
111
|
+
- features/get_without_cache.feature
|
112
|
+
- features/step_definitions/rdaneel_steps.rb
|
113
|
+
- features/support/burrito.rb
|
114
|
+
- features/support/env.rb
|
81
115
|
- lib/rdaneel.rb
|
82
|
-
- spec/no_redirects_neither_robots_spec.rb
|
83
|
-
- spec/redirects_without_robots_spec.rb
|
116
|
+
- spec/rdaneel_spec.rb
|
84
117
|
- spec/spec.opts
|
85
118
|
- spec/spec_helper.rb
|
86
119
|
- spec/streamed_content_spec.rb
|
87
|
-
- spec/using_cache_spec.rb
|
88
120
|
has_rdoc: true
|
89
121
|
homepage: http://github.com/hasmanydevelopers/RDaneel
|
90
122
|
licenses: []
|
@@ -116,8 +148,6 @@ signing_key:
|
|
116
148
|
specification_version: 3
|
117
149
|
summary: Obey robots.txt on top of em-http-request (Asynchronous HTTP Client)
|
118
150
|
test_files:
|
119
|
-
- spec/using_cache_spec.rb
|
120
|
-
- spec/no_redirects_neither_robots_spec.rb
|
121
151
|
- spec/spec_helper.rb
|
122
|
-
- spec/redirects_without_robots_spec.rb
|
123
152
|
- spec/streamed_content_spec.rb
|
153
|
+
- spec/rdaneel_spec.rb
|
data/spec/no_redirects_neither_robots_spec.rb
DELETED
@@ -1,130 +0,0 @@
|
|
1
|
-
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
2
|
-
|
3
|
-
describe "RDaneel when there are no redirects" do
|
4
|
-
|
5
|
-
let(:port) {8083}
|
6
|
-
|
7
|
-
describe "when a successfull status different than 200 is issued for robots.txt" do
|
8
|
-
it "should get the content ignoring the redirect"
|
9
|
-
end
|
10
|
-
|
11
|
-
describe "when a redirect other than 301 and 302 is issued for robots.txt" do
|
12
|
-
it "should get the content ignoring the redirect"
|
13
|
-
end
|
14
|
-
|
15
|
-
(301..302).each do |status|
|
16
|
-
|
17
|
-
describe "when robots.txt has been moved (http code #{status})" do
|
18
|
-
before(:each) do
|
19
|
-
server_setup(port+status) do |server|
|
20
|
-
mount(server, :path => '/hello_world', :status => 200,
|
21
|
-
:body => 'Hello World!', :block => should_be_hit_once )
|
22
|
-
mount(server, :path => '/robots.txt', :status => status,
|
23
|
-
:location => "http://127.0.0.1:#{port+status}/golems.txt",
|
24
|
-
:block => should_be_hit_once )
|
25
|
-
mount(server, :path => '/golems.txt', :status => 200,
|
26
|
-
:block => should_be_hit_once )
|
27
|
-
end
|
28
|
-
end
|
29
|
-
|
30
|
-
after(:each) do
|
31
|
-
server_shutdown
|
32
|
-
end
|
33
|
-
|
34
|
-
it "should get the redirected robots.txt and the content" do
|
35
|
-
EM.run do
|
36
|
-
r = RDaneel.new("http://127.0.0.1:#{port+status}/hello_world")
|
37
|
-
r.callback do
|
38
|
-
r.http_client.response_header.status.should == 200
|
39
|
-
r.http_client.response.should == "Hello World!"
|
40
|
-
r.redirects.should be_empty
|
41
|
-
EM.stop
|
42
|
-
end
|
43
|
-
r.errback do
|
44
|
-
fail
|
45
|
-
EM.stop
|
46
|
-
end
|
47
|
-
r.get
|
48
|
-
end
|
49
|
-
end
|
50
|
-
|
51
|
-
end
|
52
|
-
|
53
|
-
end
|
54
|
-
|
55
|
-
(400..417).each do |status|
|
56
|
-
|
57
|
-
describe "when there is a CLIENT error #{status} associated to robots.txt" do
|
58
|
-
before(:each) do
|
59
|
-
server_setup(port+status) do |server|
|
60
|
-
mount(server, :path => '/hello_world', :status => 200,
|
61
|
-
:body => 'Hello World!', :block => should_be_hit_once )
|
62
|
-
mount(server, :path => '/robots.txt', :status => status,
|
63
|
-
:block => should_be_hit_once )
|
64
|
-
end
|
65
|
-
end
|
66
|
-
|
67
|
-
after(:each) do
|
68
|
-
server_shutdown
|
69
|
-
end
|
70
|
-
|
71
|
-
it "should get the content" do
|
72
|
-
EM.run do
|
73
|
-
r = RDaneel.new("http://127.0.0.1:#{port+status}/hello_world")
|
74
|
-
r.callback do
|
75
|
-
r.http_client.response_header.status.should == 200
|
76
|
-
r.http_client.response.should == "Hello World!"
|
77
|
-
r.redirects.should be_empty
|
78
|
-
EM.stop
|
79
|
-
end
|
80
|
-
r.errback do
|
81
|
-
fail
|
82
|
-
EM.stop
|
83
|
-
end
|
84
|
-
r.get
|
85
|
-
end
|
86
|
-
end
|
87
|
-
|
88
|
-
end
|
89
|
-
|
90
|
-
end
|
91
|
-
|
92
|
-
(500..505).each do |status|
|
93
|
-
|
94
|
-
describe "when there is a SERVER error #{status} associated to robots.txt" do
|
95
|
-
before(:each) do
|
96
|
-
server_setup(port+status) do |server|
|
97
|
-
mount(server, :path => '/hello_world', :status => 200,
|
98
|
-
:body => 'Hello World!', :block => should_be_hit_once )
|
99
|
-
mount(server, :path => '/robots.txt', :status => status,
|
100
|
-
:block => should_be_hit_once )
|
101
|
-
end
|
102
|
-
end
|
103
|
-
|
104
|
-
after (:each) do
|
105
|
-
server_shutdown
|
106
|
-
end
|
107
|
-
|
108
|
-
it "should get the content" do
|
109
|
-
EM.run do
|
110
|
-
r = RDaneel.new("http://127.0.0.1:#{port+status}/hello_world")
|
111
|
-
r.callback do
|
112
|
-
r.http_client.response_header.status.should == 200
|
113
|
-
r.http_client.response.should == "Hello World!"
|
114
|
-
r.redirects.should be_empty
|
115
|
-
EM.stop
|
116
|
-
end
|
117
|
-
r.errback do
|
118
|
-
fail
|
119
|
-
EM.stop
|
120
|
-
end
|
121
|
-
r.get
|
122
|
-
end
|
123
|
-
end
|
124
|
-
|
125
|
-
end
|
126
|
-
|
127
|
-
end
|
128
|
-
|
129
|
-
end
|
130
|
-
|
data/spec/redirects_without_robots_spec.rb
DELETED
@@ -1,175 +0,0 @@
|
|
1
|
-
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
2
|
-
|
3
|
-
describe "RDaneel when there are redirects" do
|
4
|
-
|
5
|
-
let(:port) {8081}
|
6
|
-
|
7
|
-
describe "when there is no robots.txt in the host (ONLY one host)" do
|
8
|
-
|
9
|
-
describe "when no redirection limit has been set" do
|
10
|
-
before(:each) do
|
11
|
-
server_setup(port) do |server|
|
12
|
-
mount(server, :path => '/robots.txt', :status => 404,
|
13
|
-
:block => should_be_hit_once )
|
14
|
-
mount(server, :path => '/redirect_me', :status => 301,
|
15
|
-
:location => "http://127.0.0.1:#{port}/hello_world",
|
16
|
-
:block => should_be_hit_once )
|
17
|
-
mount(server, :path => '/hello_world', :status => 200,
|
18
|
-
:body => 'Hello World!',
|
19
|
-
:block => should_not_be_hit )
|
20
|
-
end
|
21
|
-
end
|
22
|
-
|
23
|
-
after(:each) do
|
24
|
-
server_shutdown
|
25
|
-
end
|
26
|
-
|
27
|
-
it "should not follow redirects" do
|
28
|
-
EM.run do
|
29
|
-
r = RDaneel.new("http://127.0.0.1:#{port}/redirect_me")
|
30
|
-
r.callback do
|
31
|
-
fail
|
32
|
-
EM.stop
|
33
|
-
end
|
34
|
-
r.errback do
|
35
|
-
r.redirects.should be_empty
|
36
|
-
r.error.should == "Exceeded maximum number of redirects"
|
37
|
-
EM.stop
|
38
|
-
end
|
39
|
-
r.get
|
40
|
-
end
|
41
|
-
|
42
|
-
end
|
43
|
-
|
44
|
-
end
|
45
|
-
|
46
|
-
describe "when a maximum number or redirects is set" do
|
47
|
-
|
48
|
-
describe "when there are less redirects than the maximum specified" do
|
49
|
-
before(:each) do
|
50
|
-
server_setup(port) do |server|
|
51
|
-
mount(server, :path => '/robots.txt', :status => 404,
|
52
|
-
:block => should_be_hit(3) )
|
53
|
-
mount(server, :path => '/redirect_me', :status => 301,
|
54
|
-
:location => "http://127.0.0.1:#{port}/redirect_me_again",
|
55
|
-
:block => should_be_hit_once )
|
56
|
-
mount(server, :path => '/redirect_me_again', :status => 301,
|
57
|
-
:location => "http://127.0.0.1:#{port}/hello_world",
|
58
|
-
:block => should_be_hit_once )
|
59
|
-
mount(server, :path => '/hello_world', :status => 200,
|
60
|
-
:body => 'Hello World!',
|
61
|
-
:block => should_be_hit_once )
|
62
|
-
end
|
63
|
-
end
|
64
|
-
|
65
|
-
after(:each) do
|
66
|
-
server_shutdown
|
67
|
-
end
|
68
|
-
|
69
|
-
it "should get the content following all the redirects" do
|
70
|
-
EM.run do
|
71
|
-
r = RDaneel.new("http://127.0.0.1:#{port}/redirect_me")
|
72
|
-
r.callback do
|
73
|
-
r.http_client.response_header.status.should == 200
|
74
|
-
r.http_client.response.should == "Hello World!"
|
75
|
-
r.redirects.should == [ "http://127.0.0.1:#{port}/redirect_me",
|
76
|
-
"http://127.0.0.1:#{port}/redirect_me_again"]
|
77
|
-
r.uri.to_s.should == "http://127.0.0.1:#{port}/hello_world"
|
78
|
-
EM.stop
|
79
|
-
end
|
80
|
-
r.errback do
|
81
|
-
fail
|
82
|
-
EM.stop
|
83
|
-
end
|
84
|
-
r.get(:redirects => 3)
|
85
|
-
end
|
86
|
-
|
87
|
-
end
|
88
|
-
|
89
|
-
end
|
90
|
-
|
91
|
-
describe "when there are as many redirects as the maximum" do
|
92
|
-
before(:each) do
|
93
|
-
server_setup(port) do |server|
|
94
|
-
mount(server, :path => '/robots.txt', :status => 404,
|
95
|
-
:block => should_be_hit_twice )
|
96
|
-
mount(server, :path => '/redirect_me', :status => 301,
|
97
|
-
:location => "http://127.0.0.1:#{port}/hello_world",
|
98
|
-
:block => should_be_hit_once )
|
99
|
-
mount(server, :path => '/hello_world', :status => 200,
|
100
|
-
:body => 'Hello World!',
|
101
|
-
:block => should_be_hit_once )
|
102
|
-
end
|
103
|
-
end
|
104
|
-
|
105
|
-
after(:each) do
|
106
|
-
server_shutdown
|
107
|
-
end
|
108
|
-
|
109
|
-
it "should get the content following all the redirects" do
|
110
|
-
EM.run do
|
111
|
-
r = RDaneel.new("http://127.0.0.1:#{port}/redirect_me")
|
112
|
-
r.callback do
|
113
|
-
r.http_client.response_header.status.should == 200
|
114
|
-
r.http_client.response.should == "Hello World!"
|
115
|
-
r.redirects.should == ["http://127.0.0.1:#{port}/redirect_me"]
|
116
|
-
r.uri.to_s.should == "http://127.0.0.1:#{port}/hello_world"
|
117
|
-
EM.stop
|
118
|
-
end
|
119
|
-
r.errback do
|
120
|
-
fail
|
121
|
-
EM.stop
|
122
|
-
end
|
123
|
-
r.get(:redirects => 1)
|
124
|
-
end
|
125
|
-
|
126
|
-
end
|
127
|
-
|
128
|
-
end
|
129
|
-
|
130
|
-
describe "when the number of redirects exceed the maximum specified" do
|
131
|
-
before(:each) do
|
132
|
-
server_setup(port) do |server|
|
133
|
-
mount(server, :path => '/robots.txt', :status => 404,
|
134
|
-
:block => should_be_hit_twice )
|
135
|
-
mount(server, :path => '/redirect_me', :status => 301,
|
136
|
-
:location => "http://127.0.0.1:#{port}/redirect_me_again",
|
137
|
-
:block => should_be_hit_once )
|
138
|
-
mount(server, :path => '/redirect_me_again', :status => 301,
|
139
|
-
:location => "http://127.0.0.1:#{port}/hello_world",
|
140
|
-
:block => should_be_hit_once )
|
141
|
-
mount(server, :path => '/hello_world', :status => 200,
|
142
|
-
:body => 'Hello World!',
|
143
|
-
:block => should_not_be_hit )
|
144
|
-
end
|
145
|
-
end
|
146
|
-
|
147
|
-
after(:each) do
|
148
|
-
server_shutdown
|
149
|
-
end
|
150
|
-
|
151
|
-
it "should stop following redirects once the maximum specified is reached" do
|
152
|
-
EM.run do
|
153
|
-
r = RDaneel.new("http://127.0.0.1:#{port}/redirect_me")
|
154
|
-
r.callback do
|
155
|
-
fail
|
156
|
-
EM.stop
|
157
|
-
end
|
158
|
-
r.errback do
|
159
|
-
r.redirects.should == ["http://127.0.0.1:#{port}/redirect_me"]
|
160
|
-
r.error.should == "Exceeded maximum number of redirects"
|
161
|
-
EM.stop
|
162
|
-
end
|
163
|
-
r.get(:redirects => 1)
|
164
|
-
end
|
165
|
-
|
166
|
-
end
|
167
|
-
|
168
|
-
end
|
169
|
-
|
170
|
-
end
|
171
|
-
|
172
|
-
end
|
173
|
-
|
174
|
-
end
|
175
|
-
|
data/spec/using_cache_spec.rb
DELETED
@@ -1,46 +0,0 @@
|
|
1
|
-
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
2
|
-
|
3
|
-
describe "RDaneel when there is a cache" do
|
4
|
-
|
5
|
-
let(:port) {8082}
|
6
|
-
|
7
|
-
describe "when there is no robots.txt in the host" do
|
8
|
-
|
9
|
-
before(:each) do
|
10
|
-
RDaneel.robots_cache = {}
|
11
|
-
server_setup(port) do |server|
|
12
|
-
mount(server, :path => '/robots.txt', :status => 404,
|
13
|
-
:block => should_be_hit_once )
|
14
|
-
mount(server, :path => '/redirect_me', :status => 301,
|
15
|
-
:location => "http://127.0.0.1:#{port}/hello_world",
|
16
|
-
:block => should_be_hit_once )
|
17
|
-
mount(server, :path => '/hello_world', :status => 200,
|
18
|
-
:body => 'Hello World!',
|
19
|
-
:block => should_be_hit_once )
|
20
|
-
end
|
21
|
-
end
|
22
|
-
|
23
|
-
after(:each) do
|
24
|
-
server_shutdown
|
25
|
-
end
|
26
|
-
|
27
|
-
it "should try to get the robots.txt just once" do
|
28
|
-
EM.run do
|
29
|
-
r = RDaneel.new("http://127.0.0.1:#{port}/redirect_me")
|
30
|
-
r.callback do
|
31
|
-
r.http_client.response_header.status.should == 200
|
32
|
-
r.http_client.response.should == "Hello World!"
|
33
|
-
r.redirects.should == [ "http://127.0.0.1:#{port}/redirect_me"]
|
34
|
-
r.uri.to_s.should == "http://127.0.0.1:#{port}/hello_world"
|
35
|
-
EM.stop
|
36
|
-
end
|
37
|
-
r.errback do
|
38
|
-
fail
|
39
|
-
EM.stop
|
40
|
-
end
|
41
|
-
r.get(:redirects => 3)
|
42
|
-
end
|
43
|
-
end
|
44
|
-
end
|
45
|
-
end
|
46
|
-
|