rdaneel 0.1.3 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +2 -0
- data/Rakefile +20 -18
- data/VERSION +1 -1
- data/features/get_with_cache.feature +67 -0
- data/features/get_without_cache.feature +155 -0
- data/features/step_definitions/rdaneel_steps.rb +79 -0
- data/features/support/burrito.rb +69 -0
- data/features/support/env.rb +22 -0
- data/lib/rdaneel.rb +83 -19
- data/spec/rdaneel_spec.rb +47 -0
- data/spec/spec_helper.rb +0 -87
- data/spec/streamed_content_spec.rb +1 -1
- metadata +42 -12
- data/spec/no_redirects_neither_robots_spec.rb +0 -130
- data/spec/redirects_without_robots_spec.rb +0 -175
- data/spec/using_cache_spec.rb +0 -46
data/.gitignore
CHANGED
data/Rakefile
CHANGED
@@ -10,9 +10,11 @@ begin
|
|
10
10
|
gem.email = ["edgargonzalez@gmail.com", "anibalrojas@gmail.com"]
|
11
11
|
gem.homepage = "http://github.com/hasmanydevelopers/RDaneel"
|
12
12
|
gem.authors = ["Edgar Gonzalez", "Anibal Rojas"]
|
13
|
-
gem.add_dependency("em-http-request", ">= 0.2.
|
13
|
+
gem.add_dependency("em-http-request", ">= 0.2.11")
|
14
14
|
gem.add_dependency('robot_rules', '>= 0.9.3')
|
15
15
|
gem.add_development_dependency "rspec", ">= 1.2.9"
|
16
|
+
gem.add_development_dependency "cucumber", ">= 0.8.5"
|
17
|
+
gem.add_development_dependency "relevance-rcov", ">= 0.9.2.1"
|
16
18
|
# gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
|
17
19
|
end
|
18
20
|
Jeweler::GemcutterTasks.new
|
@@ -20,29 +22,29 @@ rescue LoadError
|
|
20
22
|
puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
|
21
23
|
end
|
22
24
|
|
25
|
+
require 'cucumber/rake/task'
|
26
|
+
Cucumber::Rake::Task.new(:features) do |t|
|
27
|
+
t.cucumber_opts = "--format pretty" # Any valid command line option can go here.
|
28
|
+
t.rcov = true
|
29
|
+
t.rcov_opts = %w{--exclude gems\/,spec\/,features\/ --aggregate coverage.data}
|
30
|
+
end
|
31
|
+
|
23
32
|
require 'spec/rake/spectask'
|
24
33
|
Spec::Rake::SpecTask.new(:spec) do |spec|
|
25
34
|
spec.libs << 'lib' << 'spec'
|
26
35
|
spec.spec_files = FileList['spec/**/*_spec.rb']
|
27
|
-
end
|
28
|
-
|
29
|
-
Spec::Rake::SpecTask.new(:rcov) do |spec|
|
30
|
-
spec.libs << 'lib' << 'spec'
|
31
|
-
spec.pattern = 'spec/**/*_spec.rb'
|
32
36
|
spec.rcov = true
|
37
|
+
spec.rcov_opts = %w{--exclude gems\/,spec\/,features\/ --aggregate coverage.data}
|
33
38
|
end
|
34
39
|
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
Rake::RDocTask.new do |rdoc|
|
41
|
-
version = File.exist?('VERSION') ? File.read('VERSION') : ""
|
42
|
-
|
43
|
-
rdoc.rdoc_dir = 'rdoc'
|
44
|
-
rdoc.title = "rdaneel #{version}"
|
45
|
-
rdoc.rdoc_files.include('README*')
|
46
|
-
rdoc.rdoc_files.include('lib/**/*.rb')
|
40
|
+
desc "Run both specs and features and generate aggregated coverage"
|
41
|
+
task :all_tests do |t|
|
42
|
+
rm "coverage.data" if File.exist?("coverage.data")
|
43
|
+
Rake::Task['spec'].invoke
|
44
|
+
Rake::Task["features"].invoke
|
47
45
|
end
|
48
46
|
|
47
|
+
task :features => :check_dependencies
|
48
|
+
task :spec => :check_dependencies
|
49
|
+
task :default => :all_tests
|
50
|
+
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.3
|
1
|
+
0.2.2
|
@@ -0,0 +1,67 @@
|
|
1
|
+
Feature: get a url using cache
|
2
|
+
In order to fetch content from internet
|
3
|
+
As a crawler
|
4
|
+
I want to get a url respecting robots.txt rules
|
5
|
+
|
6
|
+
Scenario: the url to fetch is redirected
|
7
|
+
Given a cache for RDaneel
|
8
|
+
And a robots.txt that allows RDaneel
|
9
|
+
And a HelloWorld url
|
10
|
+
And a "/redirect_me" url that redirects 301 to "http://127.0.0.1:3210/redirect_me_again" url
|
11
|
+
And a "/redirect_me_again" url that redirects 302 to "/hello_world" url
|
12
|
+
When I get the "/redirect_me" url following a maximum of 3 redirects
|
13
|
+
Then I should get the content for HelloWorld url
|
14
|
+
And the http response code should be 200
|
15
|
+
And I should get 2 redirects
|
16
|
+
And The redirects sequence should be:
|
17
|
+
| http://127.0.0.1:3210/redirect_me |
|
18
|
+
| http://127.0.0.1:3210/redirect_me_again |
|
19
|
+
And The requests sequence should be:
|
20
|
+
| status | path |
|
21
|
+
| 200 | /robots.txt |
|
22
|
+
| 301 | /redirect_me |
|
23
|
+
| 302 | /redirect_me_again |
|
24
|
+
| 200 | /hello_world |
|
25
|
+
And The cache for "http://127.0.0.1:3210/robots.txt" should be
|
26
|
+
"""
|
27
|
+
User-agent: *
|
28
|
+
Disallow: /cgi-bin/
|
29
|
+
"""
|
30
|
+
|
31
|
+
Scenario: a cached robots.txt exists denying RDaneel's user-agent
|
32
|
+
Given a cache for RDaneel
|
33
|
+
And The cache for "http://127.0.0.1:3210/robots.txt" is:
|
34
|
+
"""
|
35
|
+
User-agent: *
|
36
|
+
Disallow: /
|
37
|
+
"""
|
38
|
+
And a robots.txt that denies RDaneel
|
39
|
+
And a HelloWorld url
|
40
|
+
When I get the "/hello_world" url following a maximum of 1 redirects
|
41
|
+
Then I should get a "Robots are not allowed" error
|
42
|
+
And I should get 0 redirects
|
43
|
+
And The requests should be empty
|
44
|
+
|
45
|
+
|
46
|
+
Scenario: the url to fetch is redirected to unreacheable server but a robots cache exists for this server allowing RDaneel
|
47
|
+
Given a cache for RDaneel
|
48
|
+
And The cache for "http://127.0.0.1:3210/robots.txt" is:
|
49
|
+
"""
|
50
|
+
User-agent: *
|
51
|
+
Disallow: /cgi-bin/
|
52
|
+
"""
|
53
|
+
And The cache for "http://127.0.0.1:3211/robots.txt" is:
|
54
|
+
"""
|
55
|
+
User-agent: *
|
56
|
+
Disallow: /cgi-bin/
|
57
|
+
"""
|
58
|
+
And a "/redirect_me" url that redirects 301 to "http://127.0.0.1:3211/unreacheable" url
|
59
|
+
When I get the "/redirect_me" url following a maximum of 3 redirects
|
60
|
+
Then I should get a "An error occurred when fetching http://127.0.0.1:3211/unreacheable" error
|
61
|
+
And I should get 1 redirects
|
62
|
+
And The redirects sequence should be:
|
63
|
+
| http://127.0.0.1:3210/redirect_me |
|
64
|
+
And The requests sequence should be:
|
65
|
+
| status | path |
|
66
|
+
| 301 | /redirect_me |
|
67
|
+
|
@@ -0,0 +1,155 @@
|
|
1
|
+
Feature: get a url without using cache
|
2
|
+
In order to fetch content from internet
|
3
|
+
As a crawler
|
4
|
+
I want to get a url respecting robots.txt rules
|
5
|
+
|
6
|
+
Scenario: a robots.txt exists allowing RDaneel's user-agent
|
7
|
+
Given a robots.txt that allows RDaneel
|
8
|
+
And a HelloWorld url
|
9
|
+
When I get the "/hello_world" url following a maximum of 1 redirects
|
10
|
+
Then I should get the content for HelloWorld url
|
11
|
+
And the http response code should be 200
|
12
|
+
And I should get 0 redirects
|
13
|
+
And The requests sequence should be:
|
14
|
+
| status | path |
|
15
|
+
| 200 | /robots.txt |
|
16
|
+
| 200 | /hello_world |
|
17
|
+
|
18
|
+
Scenario: a robots.txt exists denying RDaneel's user-agent
|
19
|
+
Given a robots.txt that denies RDaneel
|
20
|
+
And a HelloWorld url
|
21
|
+
When I get the "/hello_world" url following a maximum of 1 redirects
|
22
|
+
Then I should get a "Robots are not allowed" error
|
23
|
+
And I should get 0 redirects
|
24
|
+
And The requests sequence should be:
|
25
|
+
| status | path |
|
26
|
+
| 200 | /robots.txt |
|
27
|
+
|
28
|
+
Scenario: the url to fetch is redirected
|
29
|
+
Given a robots.txt that allows RDaneel
|
30
|
+
And a HelloWorld url
|
31
|
+
And a "/redirect_me" url that redirects 301 to "http://127.0.0.1:3210/redirect_me_again" url
|
32
|
+
And a "/redirect_me_again" url that redirects 302 to "/hello_world" url
|
33
|
+
When I get the "/redirect_me" url following a maximum of 3 redirects
|
34
|
+
Then I should get the content for HelloWorld url
|
35
|
+
And the http response code should be 200
|
36
|
+
And I should get 2 redirects
|
37
|
+
And The redirects sequence should be:
|
38
|
+
| http://127.0.0.1:3210/redirect_me |
|
39
|
+
| http://127.0.0.1:3210/redirect_me_again |
|
40
|
+
And The requests sequence should be:
|
41
|
+
| status | path |
|
42
|
+
| 200 | /robots.txt |
|
43
|
+
| 301 | /redirect_me |
|
44
|
+
| 200 | /robots.txt |
|
45
|
+
| 302 | /redirect_me_again |
|
46
|
+
| 200 | /robots.txt |
|
47
|
+
| 200 | /hello_world |
|
48
|
+
|
49
|
+
Scenario: the url to fetch exceeds the maximum redirects specifieds
|
50
|
+
Given a robots.txt that allows RDaneel
|
51
|
+
And a HelloWorld url
|
52
|
+
And a "/redirect_me" url that redirects 301 to "http://127.0.0.1:3210/redirect_me_again" url
|
53
|
+
And a "/redirect_me_again" url that redirects 302 to "/hello_world" url
|
54
|
+
When I get the "/redirect_me" url following a maximum of 1 redirects
|
55
|
+
Then I should get a "Exceeded maximum number of redirects: 1" error
|
56
|
+
And I should get 1 redirects
|
57
|
+
And The redirects sequence should be:
|
58
|
+
| http://127.0.0.1:3210/redirect_me |
|
59
|
+
And The requests sequence should be:
|
60
|
+
| status | path |
|
61
|
+
| 200 | /robots.txt |
|
62
|
+
| 301 | /redirect_me |
|
63
|
+
| 200 | /robots.txt |
|
64
|
+
| 302 | /redirect_me_again |
|
65
|
+
|
66
|
+
Scenario: the url to fetch has an infinte redirect
|
67
|
+
Given a robots.txt that allows RDaneel
|
68
|
+
And a HelloWorld url
|
69
|
+
And a "/redirect_me" url that redirects 302 to "/redirect_me_again" url
|
70
|
+
And a "/redirect_me_again" url that redirects 302 to "/redirect_me" url
|
71
|
+
When I get the "/redirect_me" url following a maximum of 2 redirects
|
72
|
+
Then I should get a "Infinite redirect detected for: http://127.0.0.1:3210/redirect_me" error
|
73
|
+
And I should get 2 redirects
|
74
|
+
And The redirects sequence should be:
|
75
|
+
| http://127.0.0.1:3210/redirect_me |
|
76
|
+
| http://127.0.0.1:3210/redirect_me_again |
|
77
|
+
And The requests sequence should be:
|
78
|
+
| status | path |
|
79
|
+
| 200 | /robots.txt |
|
80
|
+
| 302 | /redirect_me |
|
81
|
+
| 200 | /robots.txt |
|
82
|
+
| 302 | /redirect_me_again |
|
83
|
+
|
84
|
+
Scenario: the url to fetch redirects to not found url
|
85
|
+
Given a robots.txt that allows RDaneel
|
86
|
+
And a "/redirect_me" url that redirects 302 to "/not_found" url
|
87
|
+
When I get the "/redirect_me" url following a maximum of 2 redirects
|
88
|
+
Then I should get a "Not success neither redirect" error
|
89
|
+
And I should get 1 redirects
|
90
|
+
And The redirects sequence should be:
|
91
|
+
| http://127.0.0.1:3210/redirect_me |
|
92
|
+
And The requests sequence should be:
|
93
|
+
| status | path |
|
94
|
+
| 200 | /robots.txt |
|
95
|
+
| 302 | /redirect_me |
|
96
|
+
| 200 | /robots.txt |
|
97
|
+
| 404 | /not_found |
|
98
|
+
|
99
|
+
|
100
|
+
Scenario: robots.txt doesn't exists
|
101
|
+
Given a HelloWorld url
|
102
|
+
And a "/redirect_me" url that redirects 301 to "http://127.0.0.1:3210/redirect_me_again" url
|
103
|
+
And a "/redirect_me_again" url that redirects 302 to "/hello_world" url
|
104
|
+
When I get the "/redirect_me" url following a maximum of 3 redirects
|
105
|
+
Then I should get the content for HelloWorld url
|
106
|
+
And the http response code should be 200
|
107
|
+
And I should get 2 redirects
|
108
|
+
And The redirects sequence should be:
|
109
|
+
| http://127.0.0.1:3210/redirect_me |
|
110
|
+
| http://127.0.0.1:3210/redirect_me_again |
|
111
|
+
And The requests sequence should be:
|
112
|
+
| status | path |
|
113
|
+
| 404 | /robots.txt |
|
114
|
+
| 301 | /redirect_me |
|
115
|
+
| 404 | /robots.txt |
|
116
|
+
| 302 | /redirect_me_again |
|
117
|
+
| 404 | /robots.txt |
|
118
|
+
| 200 | /hello_world |
|
119
|
+
|
120
|
+
Scenario: the url to fetch redirects to a malformed url (format handled by em-http-request)
|
121
|
+
Given a robots.txt that allows RDaneel
|
122
|
+
And a "/redirect_me" url that redirects 302 to "http://malformed:url" url
|
123
|
+
When I get the "/redirect_me" url following a maximum of 2 redirects
|
124
|
+
Then I should get a "Location header format error" error
|
125
|
+
And I should get 0 redirects
|
126
|
+
And The requests sequence should be:
|
127
|
+
| status | path |
|
128
|
+
| 200 | /robots.txt |
|
129
|
+
| 302 | /redirect_me |
|
130
|
+
|
131
|
+
Scenario: the url to fetch redirects to a malformed url (format not handled by em-http-request 0.2.10)
|
132
|
+
Given a robots.txt that allows RDaneel
|
133
|
+
And a "/redirect_me" url that redirects 302 to "http:/malformed:url" url
|
134
|
+
When I get the "/redirect_me" url following a maximum of 2 redirects
|
135
|
+
Then I should get a "Location header format error" error
|
136
|
+
And I should get 0 redirects
|
137
|
+
And The requests sequence should be:
|
138
|
+
| status | path |
|
139
|
+
| 200 | /robots.txt |
|
140
|
+
| 302 | /redirect_me |
|
141
|
+
|
142
|
+
Scenario: the url to fetch is redirected to unreacheable host:port
|
143
|
+
Given a robots.txt that allows RDaneel
|
144
|
+
And a HelloWorld url
|
145
|
+
And a "/redirect_me" url that redirects 301 to "http://127.0.0.1:3211/unreacheable" url
|
146
|
+
When I get the "/redirect_me" url following a maximum of 3 redirects
|
147
|
+
Then I should get a "An error occurred when fetching http://127.0.0.1:3211/unreacheable" error
|
148
|
+
And I should get 1 redirects
|
149
|
+
And The redirects sequence should be:
|
150
|
+
| http://127.0.0.1:3210/redirect_me |
|
151
|
+
And The requests sequence should be:
|
152
|
+
| status | path |
|
153
|
+
| 200 | /robots.txt |
|
154
|
+
| 301 | /redirect_me |
|
155
|
+
|
@@ -0,0 +1,79 @@
|
|
1
|
+
|
2
|
+
Given /^a robots\.txt that allows RDaneel$/ do
|
3
|
+
$server.mount(:path => '/robots.txt', :status => 200,
|
4
|
+
:body => "User-agent: *\nDisallow: /cgi-bin/")
|
5
|
+
end
|
6
|
+
|
7
|
+
Given /^a robots\.txt that denies RDaneel$/ do
|
8
|
+
$server.mount(:path => '/robots.txt', :status => 200,
|
9
|
+
:body => "User-agent: *\nDisallow: /")
|
10
|
+
end
|
11
|
+
|
12
|
+
Given /^a HelloWorld url$/ do
|
13
|
+
$server.mount(:path => '/hello_world', :status => 200,
|
14
|
+
:body => "Hello World")
|
15
|
+
end
|
16
|
+
|
17
|
+
Given /^a "([^"]*)" url that redirects (\d+) to "([^"]*)" url$/ do |url, status, redirected_to|
|
18
|
+
$server.mount(:path => url, :status => status.to_i,
|
19
|
+
:location => redirected_to)
|
20
|
+
end
|
21
|
+
|
22
|
+
Given /^a cache for RDaneel$/ do
|
23
|
+
RDaneel.robots_cache = {}
|
24
|
+
end
|
25
|
+
|
26
|
+
Given /^The cache for "([^"]*)" is:$/ do |robots_url, robots_file|
|
27
|
+
RDaneel.robots_cache[robots_url] = robots_file
|
28
|
+
end
|
29
|
+
|
30
|
+
When /^I get the "([^"]*)" url following a maximum of (\d+) redirects$/ do |url, max_redirects|
|
31
|
+
EM.run do
|
32
|
+
@r = RDaneel.new("#{HOST}#{url}")
|
33
|
+
@r.callback do
|
34
|
+
EM.stop
|
35
|
+
end
|
36
|
+
@r.errback do
|
37
|
+
EM.stop
|
38
|
+
end
|
39
|
+
@r.get(:redirects => max_redirects)
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
Then /^I should get the content for HelloWorld url$/ do
|
44
|
+
@r.http_client.response.should == "Hello World"
|
45
|
+
end
|
46
|
+
|
47
|
+
Then /^the http response code should be (\d+)$/ do |code|
|
48
|
+
@r.http_client.response_header.status.should == code.to_i
|
49
|
+
end
|
50
|
+
|
51
|
+
Then /^I should get (\d+) redirects$/ do |redirects_count|
|
52
|
+
@r.redirects.size.should == redirects_count.to_i
|
53
|
+
end
|
54
|
+
|
55
|
+
Then /^The requests sequence should be:$/ do |expected_table|
|
56
|
+
expected_requests = []
|
57
|
+
expected_table.hashes.each do |hash|
|
58
|
+
expected_requests << {:status => hash[:status].to_i,
|
59
|
+
:path => hash[:path]}
|
60
|
+
end
|
61
|
+
$server.requests.should == expected_requests
|
62
|
+
end
|
63
|
+
|
64
|
+
Then /^The requests should be empty$/ do
|
65
|
+
$server.requests.should be_empty
|
66
|
+
end
|
67
|
+
|
68
|
+
Then /^The redirects sequence should be:$/ do |expected_redirects|
|
69
|
+
@r.redirects.should == expected_redirects.raw.flatten
|
70
|
+
end
|
71
|
+
|
72
|
+
Then /^I should get a "([^"]*)" error$/ do |error_message|
|
73
|
+
@r.error.should == error_message
|
74
|
+
end
|
75
|
+
|
76
|
+
Then /^The cache for "([^"]*)" should be$/ do |robots_url, robots_file|
|
77
|
+
RDaneel.robots_cache[robots_url].should == robots_file
|
78
|
+
end
|
79
|
+
|
@@ -0,0 +1,69 @@
|
|
1
|
+
require 'socket'
|
2
|
+
|
3
|
+
class Burrito
|
4
|
+
|
5
|
+
STATUS_MESSAGES = {
|
6
|
+
200 => 'OK',
|
7
|
+
301 => 'Moved Permanently',
|
8
|
+
302 => 'Found',
|
9
|
+
404 => 'Not Found'
|
10
|
+
}
|
11
|
+
|
12
|
+
attr_reader :requests
|
13
|
+
|
14
|
+
def initialize
|
15
|
+
@routes = {}
|
16
|
+
@requests = []
|
17
|
+
end
|
18
|
+
|
19
|
+
def mount(opts)
|
20
|
+
@routes[opts[:path]] = { :status => opts[:status],
|
21
|
+
:body => opts[:body],
|
22
|
+
:location => opts[:location] }
|
23
|
+
end
|
24
|
+
|
25
|
+
def reset
|
26
|
+
@routes = {}
|
27
|
+
@requests = []
|
28
|
+
end
|
29
|
+
|
30
|
+
def start
|
31
|
+
@thread = Thread.new do
|
32
|
+
|
33
|
+
webserver = TCPServer.new('127.0.0.1', 3210)
|
34
|
+
|
35
|
+
while session = webserver.accept
|
36
|
+
request = session.gets
|
37
|
+
path = '/' << request.gsub(/GET\ \//, '').gsub(/\ HTTP.*/, '').chomp
|
38
|
+
if @routes[path]
|
39
|
+
status = @routes[path][:status]
|
40
|
+
body = @routes[path][:body]
|
41
|
+
location = @routes[path][:location]
|
42
|
+
else
|
43
|
+
status = 404
|
44
|
+
body = nil
|
45
|
+
location = nil
|
46
|
+
end
|
47
|
+
@requests.push( { :status => status, :path => path } )
|
48
|
+
response = "HTTP/1.1 #{status} #{STATUS_MESSAGES[status]}\r\n"
|
49
|
+
response << "Server: burrito/0.0.1\r\n"
|
50
|
+
response << "Content-Length: #{ body ? body.length : 0 }\r\n"
|
51
|
+
response << "Content-Type: text/plain\r\n" if body
|
52
|
+
response << "Location: #{location}\r\n" if location
|
53
|
+
response << "Connection: close\r\n"
|
54
|
+
response << "\r\n"
|
55
|
+
response << "#{body}" if body
|
56
|
+
session.print response
|
57
|
+
session.close
|
58
|
+
end
|
59
|
+
|
60
|
+
end
|
61
|
+
|
62
|
+
end
|
63
|
+
|
64
|
+
def shutdown
|
65
|
+
@thread.terminate
|
66
|
+
end
|
67
|
+
|
68
|
+
end
|
69
|
+
|
@@ -0,0 +1,22 @@
|
|
1
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
2
|
+
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '../..', 'lib'))
|
3
|
+
require 'rubygems'
|
4
|
+
require 'rdaneel'
|
5
|
+
require 'burrito'
|
6
|
+
|
7
|
+
unless $server
|
8
|
+
$server = Burrito.new
|
9
|
+
$server.start
|
10
|
+
end
|
11
|
+
|
12
|
+
HOST = "http://127.0.0.1:3210"
|
13
|
+
|
14
|
+
Before do
|
15
|
+
$server.reset
|
16
|
+
RDaneel.robots_cache = nil
|
17
|
+
end
|
18
|
+
|
19
|
+
at_exit do
|
20
|
+
$server.shutdown
|
21
|
+
end
|
22
|
+
|
data/lib/rdaneel.rb
CHANGED
@@ -19,10 +19,12 @@ class RDaneel
|
|
19
19
|
attr_accessor :uri
|
20
20
|
attr_reader :error, :redirects, :http_client
|
21
21
|
|
22
|
-
def initialize(uri)
|
22
|
+
def initialize(uri,options = {})
|
23
23
|
@uri = uri.kind_of?(Addressable::URI) ? uri : Addressable::URI::parse(uri)
|
24
24
|
@uri.path = "/" if @uri.path.nil? || @uri.path == ""
|
25
25
|
@redirects = []
|
26
|
+
@verbose = options[:verbose]
|
27
|
+
@hash = @uri.hash if @verbose
|
26
28
|
end
|
27
29
|
|
28
30
|
def get(opts = {})
|
@@ -37,91 +39,118 @@ class RDaneel
|
|
37
39
|
if success?(h)
|
38
40
|
@uri = current_uri if current_uri != @uri
|
39
41
|
@http_client = h
|
42
|
+
verbose("Succeded fetching: #{current_uri}", h, :status, :response)
|
40
43
|
succeed(self)
|
41
44
|
elsif redirected?(h)
|
42
45
|
if @redirects.size >= max_redirects
|
43
46
|
@http_client = h
|
44
|
-
@error = "Exceeded maximum number of redirects"
|
47
|
+
@error = "Exceeded maximum number of redirects: #{max_redirects}"
|
48
|
+
verbose(@error, h, :status, :response)
|
45
49
|
fail(self)
|
46
50
|
return
|
47
51
|
end
|
52
|
+
@redirects << current_uri.to_s
|
53
|
+
current_uri = redirect_url(h, current_uri)
|
48
54
|
begin
|
49
|
-
|
50
|
-
current_uri = redirect_url(h, current_uri)
|
55
|
+
verbose("Redirected to: #{current_uri.to_s} from: #{@redirects[-1]}", h, :status, :response)
|
51
56
|
if @redirects.include?(current_uri.to_s)
|
52
57
|
@http_client = h
|
53
|
-
@error = "
|
58
|
+
@error = "Infinite redirect detected for: #{current_uri.to_s}"
|
59
|
+
verbose(@error, h, :status, :response)
|
54
60
|
fail(self)
|
55
61
|
return
|
56
62
|
end
|
57
63
|
_get.call
|
58
|
-
rescue
|
64
|
+
rescue StandardError => se
|
59
65
|
@http_client = h
|
60
|
-
@error = "
|
66
|
+
@error = "Error trying to follow a redirect #{current_uri.to_s}: #{h.response_header.location}"
|
67
|
+
verbose(@error, h, :status, :response)
|
61
68
|
fail(self)
|
62
69
|
end
|
63
70
|
else
|
64
71
|
# other error
|
65
72
|
@http_client = h
|
66
|
-
@error = "
|
73
|
+
@error = "Not success neither redirect"
|
74
|
+
verbose(@error, h, :status, :response)
|
67
75
|
fail(self)
|
68
76
|
end
|
69
77
|
}
|
70
78
|
_get = lambda {
|
71
79
|
robots_url = robots_txt_url(current_uri)
|
72
80
|
if robots_cache && robots_file = robots_cache[robots_url.to_s]
|
81
|
+
verbose("Found cached robots.txt:\n#{robots_cache[robots_url.to_s]} for: #{current_uri}")
|
73
82
|
if robots_allowed?(robots_file, useragent, robots_url, current_uri)
|
83
|
+
verbose("Robots identified by user agent: #{useragent} are allowed to access: #{current_uri}")
|
74
84
|
begin
|
75
85
|
h = EM::HttpRequest.new(current_uri).get(options)
|
86
|
+
verbose("Started fetching: #{current_uri}",h,:request)
|
76
87
|
h.callback(&_handle_uri_callback)
|
77
88
|
h.errback {
|
78
89
|
@http_client = h
|
79
|
-
@error = h
|
90
|
+
@error = error_message(h)
|
91
|
+
verbose("#{@error} for: #{current_uri}",h,:status,:response)
|
80
92
|
fail(self)
|
81
93
|
}
|
82
94
|
rescue StandardError => se
|
83
95
|
@http_client = EM::HttpClient.new("")
|
84
96
|
@error = "#{se.message}\n#{se.backtrace.inspect}"
|
97
|
+
verbose("For: #{current_uri} something went wrong: #{@error}")
|
85
98
|
fail(self)
|
86
99
|
end
|
87
100
|
else
|
88
101
|
@http_client = EM::HttpClient.new("")
|
89
|
-
@error = "
|
102
|
+
@error = "Robots are not allowed"
|
103
|
+
verbose("#{@error} to access: #{current_uri} when identified by user agent: #{useragent}")
|
90
104
|
fail(self)
|
91
105
|
end
|
92
106
|
else
|
93
107
|
robots_url = robots_txt_url(current_uri)
|
94
108
|
robots = EM::HttpRequest.new(robots_url).get(:redirects => 2) # get the robots.txt following redirects
|
109
|
+
verbose("Started fetching robots.txt from: #{robots_url} for: #{current_uri}",robots,:request)
|
95
110
|
robots.callback {
|
96
|
-
|
97
|
-
|
111
|
+
if success?(robots)
|
112
|
+
robots_file = robots.response
|
113
|
+
verbose("Found robots.txt at #{robots_url}:\n#{robots_file}", robots, :status, :response)
|
114
|
+
else
|
115
|
+
robots_file = ''
|
116
|
+
verbose("Didn't find robots.txt at #{robots_url}", robots, :status, :response)
|
117
|
+
end
|
118
|
+
robots_cache[robots_txt_url(robots_url).to_s] = robots_file if robots_cache
|
98
119
|
if robots_allowed?(robots_file, useragent, robots_url, current_uri)
|
120
|
+
verbose("Robots identified by user agent: #{useragent} are allowed to access: #{current_uri}")
|
99
121
|
begin
|
100
122
|
h = EM::HttpRequest.new(current_uri).get(options)
|
123
|
+
verbose("Started fetching: #{current_uri}",h,:request)
|
101
124
|
h.callback(&_handle_uri_callback)
|
102
125
|
h.errback {
|
103
126
|
@http_client = h
|
104
|
-
@error = h
|
127
|
+
@error = error_message(h)
|
128
|
+
verbose("#{@error} for: #{current_uri}", h, :status, :response)
|
105
129
|
fail(self)
|
106
130
|
}
|
107
131
|
rescue StandardError => se
|
108
132
|
@http_client = EM::HttpClient.new("")
|
109
133
|
@error = "#{se.message}\n#{se.backtrace.inspect}"
|
134
|
+
verbose("For: #{current_uri} something went wrong: #{@error}")
|
110
135
|
fail(self)
|
111
136
|
end
|
112
137
|
else
|
113
138
|
@http_client = EM::HttpClient.new("")
|
114
|
-
@error = "
|
139
|
+
@error = "Robots are not allowed"
|
140
|
+
verbose("#{@error} to access: #{current_uri} when identified by user agent: #{useragent}")
|
115
141
|
fail(self)
|
116
142
|
end
|
117
143
|
}
|
118
144
|
robots.errback {
|
145
|
+
verbose("Failed to fetch robots.txt: from: #{robots_url} for: #{current_uri}", robots, :status, :response)
|
119
146
|
robots_cache[robots_url.to_s] = "" if robots_cache
|
120
147
|
h = EM::HttpRequest.new(current_uri).get(options)
|
148
|
+
verbose("Started fetching: #{current_uri}",h,:request)
|
121
149
|
h.callback(&_handle_uri_callback)
|
122
150
|
h.errback {
|
123
151
|
@http_client = h
|
124
|
-
@error = h
|
152
|
+
@error = error_message(h)
|
153
|
+
verbose("#{@error} for: #{current_uri}", h, :status, :response)
|
125
154
|
fail(self)
|
126
155
|
}
|
127
156
|
}
|
@@ -155,6 +184,14 @@ class RDaneel
|
|
155
184
|
Addressable::URI.parse("http://#{location}/robots.txt")
|
156
185
|
end
|
157
186
|
|
187
|
+
def error_message(http_client)
|
188
|
+
@error = if http_client.error.nil? || http_client.error.empty?
|
189
|
+
"An error occurred when fetching #{http_client.uri.to_s}"
|
190
|
+
else
|
191
|
+
http_client.error
|
192
|
+
end
|
193
|
+
end
|
194
|
+
|
158
195
|
def success?(http_client)
|
159
196
|
http_client.response_header.status == 200
|
160
197
|
end
|
@@ -164,10 +201,37 @@ class RDaneel
|
|
164
201
|
end
|
165
202
|
|
166
203
|
def redirect_url(http_client, u)
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
location
|
204
|
+
# em-http-request handles the case when redirect is relative
|
205
|
+
# at this point http_client.response_header.location should always have an absolute and valid url
|
206
|
+
# but this invalid url is parsed successfully http:/malformed:url so we ask for host
|
207
|
+
Addressable::URI.parse(http_client.response_header.location)
|
208
|
+
end
|
209
|
+
|
210
|
+
def verbose(message, client = nil, *args)
|
211
|
+
return unless @verbose
|
212
|
+
message.each { |l| hashed_puts('*', l) }
|
213
|
+
args.each do |a|
|
214
|
+
case a
|
215
|
+
when :status
|
216
|
+
if client.response_header.status == 0
|
217
|
+
hashed_puts('< Status:', '0 (timeout)')
|
218
|
+
else
|
219
|
+
hashed_puts('< Status:', client.response_header.status)
|
220
|
+
end
|
221
|
+
when :request # this is a options hash
|
222
|
+
headers = client.options[:head]
|
223
|
+
headers.each { |k,v| hashed_puts('>', "#{k}: #{v}") } if headers
|
224
|
+
when :response # this is an array
|
225
|
+
client.response_header.each { |r| hashed_puts('<', "#{r[0]}: #{r[1]}") }
|
226
|
+
end
|
227
|
+
end
|
171
228
|
end
|
229
|
+
|
230
|
+
private
|
231
|
+
|
232
|
+
def hashed_puts( prefix, message )
|
233
|
+
$stdout.puts("[#{@hash}] [#{Time.now.strftime('%Y-%m-%d %H:%m:%S')}] #{prefix} #{message}")
|
234
|
+
end
|
235
|
+
|
172
236
|
end
|
173
237
|
|
@@ -0,0 +1,47 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
2
|
+
|
3
|
+
describe "RDaneel" do
|
4
|
+
|
5
|
+
describe "robots_txt_url" do
|
6
|
+
before(:each) do
|
7
|
+
@rdaneel = RDaneel.new("http://127.0.0.1/anyurl")
|
8
|
+
end
|
9
|
+
|
10
|
+
it "should return the proper url when url don't has a port specified (80 implied)" do
|
11
|
+
url = Addressable::URI.parse("http://127.0.0.1/path/url?param1=value1&param2=value2")
|
12
|
+
@rdaneel.send(:robots_txt_url,url).to_s.should == "http://127.0.0.1/robots.txt"
|
13
|
+
end
|
14
|
+
|
15
|
+
it "should return the proper url when url has a port 80 specified" do
|
16
|
+
url = Addressable::URI.parse("http://127.0.0.1:80/path/url?param1=value1&param2=value2")
|
17
|
+
@rdaneel.send(:robots_txt_url,url).to_s.should == "http://127.0.0.1/robots.txt"
|
18
|
+
end
|
19
|
+
|
20
|
+
it "should return the proper url when url has a port different than 80" do
|
21
|
+
url = Addressable::URI.parse("http://127.0.0.1:8080/path/url?param1=value1&param2=value2")
|
22
|
+
@rdaneel.send(:robots_txt_url,url).to_s.should == "http://127.0.0.1:8080/robots.txt"
|
23
|
+
end
|
24
|
+
|
25
|
+
end
|
26
|
+
|
27
|
+
|
28
|
+
describe "robots_allowed?" do
|
29
|
+
before(:each) do
|
30
|
+
@rdaneel = RDaneel.new("http://127.0.0.1/anyurl")
|
31
|
+
end
|
32
|
+
|
33
|
+
describe "when an error happens parsing the robots rules" do
|
34
|
+
before(:each) do
|
35
|
+
@robot_rules = RobotRules.new("RDaneel")
|
36
|
+
@robot_rules.stub!(:parse).and_raise(StandardError)
|
37
|
+
RobotRules.stub!(:new).and_return(@robot_rules)
|
38
|
+
end
|
39
|
+
|
40
|
+
it "should return true" do #no matter the params
|
41
|
+
@rdaneel.send(:robots_allowed?, nil, nil, nil, nil).should be_true
|
42
|
+
end
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
46
|
+
end
|
47
|
+
|
data/spec/spec_helper.rb
CHANGED
@@ -3,91 +3,4 @@ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
|
3
3
|
require 'rubygems'
|
4
4
|
require 'rdaneel'
|
5
5
|
require 'spec'
|
6
|
-
require 'webrick'
|
7
|
-
|
8
|
-
# keep webrick quiet
|
9
|
-
class ::WEBrick::HTTPServer
|
10
|
-
def access_log(config, req, res)
|
11
|
-
# nop
|
12
|
-
end
|
13
|
-
end
|
14
|
-
class ::WEBrick::BasicLog
|
15
|
-
def log(level, data)
|
16
|
-
# nop
|
17
|
-
end
|
18
|
-
end
|
19
|
-
|
20
|
-
def locked_file
|
21
|
-
File.join(File.dirname(__FILE__),"server_lock-#{@__port}")
|
22
|
-
end
|
23
|
-
|
24
|
-
def server_setup(port=8080, &blk)
|
25
|
-
@__port = port
|
26
|
-
if @server.nil? and !File.exist?(locked_file)
|
27
|
-
File.open(locked_file,'w') {|f| f << 'locked' }
|
28
|
-
@server = WEBrick::HTTPServer.new :Port => port
|
29
|
-
blk.call(@server) if blk
|
30
|
-
queue = Queue.new # synchronize the thread startup to the main thread
|
31
|
-
@test_thread = Thread.new { queue << 1; @server.start }
|
32
|
-
|
33
|
-
# wait for the queue
|
34
|
-
value = queue.pop
|
35
|
-
|
36
|
-
if !value
|
37
|
-
STDERR.puts "Failed to startup test server!"
|
38
|
-
exit(1)
|
39
|
-
end
|
40
|
-
|
41
|
-
trap("INT"){server_shutdown}
|
42
|
-
at_exit{server_shutdown}
|
43
|
-
end
|
44
|
-
end
|
45
|
-
|
46
|
-
def server_shutdown
|
47
|
-
begin
|
48
|
-
if File.exist?(locked_file)
|
49
|
-
File.unlink locked_file
|
50
|
-
@server.shutdown unless @server.nil?
|
51
|
-
@server = nil
|
52
|
-
end
|
53
|
-
rescue Object => e
|
54
|
-
puts "Error #{__FILE__}:#{__LINE__}\n#{e.message}"
|
55
|
-
end
|
56
|
-
end
|
57
|
-
|
58
|
-
def mount(server, opts)
|
59
|
-
raise ":path is required" if opts[:path].nil?
|
60
|
-
raise ":status is required" if opts[:status].nil?
|
61
|
-
server.mount_proc( opts[:path],
|
62
|
-
lambda { |req, resp|
|
63
|
-
resp.status = opts[:status]
|
64
|
-
resp.body = opts[:body] unless opts[:body].nil?
|
65
|
-
resp['Location'] = opts[:location] unless opts[:location].nil?
|
66
|
-
opts[:block].call unless opts[:block].nil?
|
67
|
-
} )
|
68
|
-
end
|
69
|
-
|
70
|
-
def should_not_be_hit
|
71
|
-
should_be_hit( 0 )
|
72
|
-
end
|
73
|
-
|
74
|
-
def should_be_hit_once
|
75
|
-
should_be_hit( 1 )
|
76
|
-
end
|
77
|
-
|
78
|
-
def should_be_hit_twice
|
79
|
-
should_be_hit( 2 )
|
80
|
-
end
|
81
|
-
|
82
|
-
def should_be_hit( times = 1 )
|
83
|
-
l = lambda {}
|
84
|
-
m = l.should_receive(:call).exactly(times).times
|
85
|
-
return l
|
86
|
-
end
|
87
|
-
|
88
|
-
Spec::Runner.configure do |config|
|
89
|
-
config.before :suite do
|
90
|
-
puts "\e[4mThese specs could take a while, please be patience\e[0m"
|
91
|
-
end
|
92
|
-
end
|
93
6
|
|
@@ -9,7 +9,7 @@ describe "RDaneel when the content is chunked (digg.com)" do
|
|
9
9
|
|
10
10
|
it "should get the content" do
|
11
11
|
EM.run do
|
12
|
-
r = RDaneel.new("http://digg.com")
|
12
|
+
r = RDaneel.new("http://digg.com/news")
|
13
13
|
r.callback do
|
14
14
|
r.http_client.response_header.status.should == 200
|
15
15
|
r.http_client.response.should_not be_empty
|
metadata
CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
|
|
4
4
|
prerelease: false
|
5
5
|
segments:
|
6
6
|
- 0
|
7
|
-
- 1
|
8
|
-
- 3
|
9
|
-
version: 0.1.3
|
7
|
+
- 2
|
8
|
+
- 2
|
9
|
+
version: 0.2.2
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Edgar Gonzalez
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2010-08-
|
18
|
+
date: 2010-08-27 00:00:00 -04:30
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -28,8 +28,8 @@ dependencies:
|
|
28
28
|
segments:
|
29
29
|
- 0
|
30
30
|
- 2
|
31
|
-
-
|
32
|
-
version: 0.2.
|
31
|
+
- 11
|
32
|
+
version: 0.2.11
|
33
33
|
type: :runtime
|
34
34
|
version_requirements: *id001
|
35
35
|
- !ruby/object:Gem::Dependency
|
@@ -60,6 +60,35 @@ dependencies:
|
|
60
60
|
version: 1.2.9
|
61
61
|
type: :development
|
62
62
|
version_requirements: *id003
|
63
|
+
- !ruby/object:Gem::Dependency
|
64
|
+
name: cucumber
|
65
|
+
prerelease: false
|
66
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
67
|
+
requirements:
|
68
|
+
- - ">="
|
69
|
+
- !ruby/object:Gem::Version
|
70
|
+
segments:
|
71
|
+
- 0
|
72
|
+
- 8
|
73
|
+
- 5
|
74
|
+
version: 0.8.5
|
75
|
+
type: :development
|
76
|
+
version_requirements: *id004
|
77
|
+
- !ruby/object:Gem::Dependency
|
78
|
+
name: relevance-rcov
|
79
|
+
prerelease: false
|
80
|
+
requirement: &id005 !ruby/object:Gem::Requirement
|
81
|
+
requirements:
|
82
|
+
- - ">="
|
83
|
+
- !ruby/object:Gem::Version
|
84
|
+
segments:
|
85
|
+
- 0
|
86
|
+
- 9
|
87
|
+
- 2
|
88
|
+
- 1
|
89
|
+
version: 0.9.2.1
|
90
|
+
type: :development
|
91
|
+
version_requirements: *id005
|
63
92
|
description: Add robots.txt support on top of em-http-request
|
64
93
|
email:
|
65
94
|
- edgargonzalez@gmail.com
|
@@ -78,13 +107,16 @@ files:
|
|
78
107
|
- README.rdoc
|
79
108
|
- Rakefile
|
80
109
|
- VERSION
|
110
|
+
- features/get_with_cache.feature
|
111
|
+
- features/get_without_cache.feature
|
112
|
+
- features/step_definitions/rdaneel_steps.rb
|
113
|
+
- features/support/burrito.rb
|
114
|
+
- features/support/env.rb
|
81
115
|
- lib/rdaneel.rb
|
82
|
-
- spec/no_redirects_neither_robots_spec.rb
|
83
|
-
- spec/redirects_without_robots_spec.rb
|
116
|
+
- spec/rdaneel_spec.rb
|
84
117
|
- spec/spec.opts
|
85
118
|
- spec/spec_helper.rb
|
86
119
|
- spec/streamed_content_spec.rb
|
87
|
-
- spec/using_cache_spec.rb
|
88
120
|
has_rdoc: true
|
89
121
|
homepage: http://github.com/hasmanydevelopers/RDaneel
|
90
122
|
licenses: []
|
@@ -116,8 +148,6 @@ signing_key:
|
|
116
148
|
specification_version: 3
|
117
149
|
summary: Obey robots.txt on top of em-http-request (Asynchronous HTTP Client)
|
118
150
|
test_files:
|
119
|
-
- spec/using_cache_spec.rb
|
120
|
-
- spec/no_redirects_neither_robots_spec.rb
|
121
151
|
- spec/spec_helper.rb
|
122
|
-
- spec/redirects_without_robots_spec.rb
|
123
152
|
- spec/streamed_content_spec.rb
|
153
|
+
- spec/rdaneel_spec.rb
|
data/spec/no_redirects_neither_robots_spec.rb
DELETED
@@ -1,130 +0,0 @@
|
|
1
|
-
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
2
|
-
|
3
|
-
describe "RDaneel when there are no redirects" do
|
4
|
-
|
5
|
-
let(:port) {8083}
|
6
|
-
|
7
|
-
describe "when a successfull status different than 200 is issued for robots.txt" do
|
8
|
-
it "should get the content ignoring the redirect"
|
9
|
-
end
|
10
|
-
|
11
|
-
describe "when a redirect other than 301 and 302 is issued for robots.txt" do
|
12
|
-
it "should get the content ignoring the redirect"
|
13
|
-
end
|
14
|
-
|
15
|
-
(301..302).each do |status|
|
16
|
-
|
17
|
-
describe "when robots.txt has been moved (http code #{status})" do
|
18
|
-
before(:each) do
|
19
|
-
server_setup(port+status) do |server|
|
20
|
-
mount(server, :path => '/hello_world', :status => 200,
|
21
|
-
:body => 'Hello World!', :block => should_be_hit_once )
|
22
|
-
mount(server, :path => '/robots.txt', :status => status,
|
23
|
-
:location => "http://127.0.0.1:#{port+status}/golems.txt",
|
24
|
-
:block => should_be_hit_once )
|
25
|
-
mount(server, :path => '/golems.txt', :status => 200,
|
26
|
-
:block => should_be_hit_once )
|
27
|
-
end
|
28
|
-
end
|
29
|
-
|
30
|
-
after(:each) do
|
31
|
-
server_shutdown
|
32
|
-
end
|
33
|
-
|
34
|
-
it "should get the redirected robots.txt and the content" do
|
35
|
-
EM.run do
|
36
|
-
r = RDaneel.new("http://127.0.0.1:#{port+status}/hello_world")
|
37
|
-
r.callback do
|
38
|
-
r.http_client.response_header.status.should == 200
|
39
|
-
r.http_client.response.should == "Hello World!"
|
40
|
-
r.redirects.should be_empty
|
41
|
-
EM.stop
|
42
|
-
end
|
43
|
-
r.errback do
|
44
|
-
fail
|
45
|
-
EM.stop
|
46
|
-
end
|
47
|
-
r.get
|
48
|
-
end
|
49
|
-
end
|
50
|
-
|
51
|
-
end
|
52
|
-
|
53
|
-
end
|
54
|
-
|
55
|
-
(400..417).each do |status|
|
56
|
-
|
57
|
-
describe "when there is a CLIENT error #{status} associated to robots.txt" do
|
58
|
-
before(:each) do
|
59
|
-
server_setup(port+status) do |server|
|
60
|
-
mount(server, :path => '/hello_world', :status => 200,
|
61
|
-
:body => 'Hello World!', :block => should_be_hit_once )
|
62
|
-
mount(server, :path => '/robots.txt', :status => status,
|
63
|
-
:block => should_be_hit_once )
|
64
|
-
end
|
65
|
-
end
|
66
|
-
|
67
|
-
after(:each) do
|
68
|
-
server_shutdown
|
69
|
-
end
|
70
|
-
|
71
|
-
it "should get the content" do
|
72
|
-
EM.run do
|
73
|
-
r = RDaneel.new("http://127.0.0.1:#{port+status}/hello_world")
|
74
|
-
r.callback do
|
75
|
-
r.http_client.response_header.status.should == 200
|
76
|
-
r.http_client.response.should == "Hello World!"
|
77
|
-
r.redirects.should be_empty
|
78
|
-
EM.stop
|
79
|
-
end
|
80
|
-
r.errback do
|
81
|
-
fail
|
82
|
-
EM.stop
|
83
|
-
end
|
84
|
-
r.get
|
85
|
-
end
|
86
|
-
end
|
87
|
-
|
88
|
-
end
|
89
|
-
|
90
|
-
end
|
91
|
-
|
92
|
-
(500..505).each do |status|
|
93
|
-
|
94
|
-
describe "when there is a SERVER error #{status} associated to robots.txt" do
|
95
|
-
before(:each) do
|
96
|
-
server_setup(port+status) do |server|
|
97
|
-
mount(server, :path => '/hello_world', :status => 200,
|
98
|
-
:body => 'Hello World!', :block => should_be_hit_once )
|
99
|
-
mount(server, :path => '/robots.txt', :status => status,
|
100
|
-
:block => should_be_hit_once )
|
101
|
-
end
|
102
|
-
end
|
103
|
-
|
104
|
-
after (:each) do
|
105
|
-
server_shutdown
|
106
|
-
end
|
107
|
-
|
108
|
-
it "should get the content" do
|
109
|
-
EM.run do
|
110
|
-
r = RDaneel.new("http://127.0.0.1:#{port+status}/hello_world")
|
111
|
-
r.callback do
|
112
|
-
r.http_client.response_header.status.should == 200
|
113
|
-
r.http_client.response.should == "Hello World!"
|
114
|
-
r.redirects.should be_empty
|
115
|
-
EM.stop
|
116
|
-
end
|
117
|
-
r.errback do
|
118
|
-
fail
|
119
|
-
EM.stop
|
120
|
-
end
|
121
|
-
r.get
|
122
|
-
end
|
123
|
-
end
|
124
|
-
|
125
|
-
end
|
126
|
-
|
127
|
-
end
|
128
|
-
|
129
|
-
end
|
130
|
-
|
data/spec/redirects_without_robots_spec.rb
DELETED
@@ -1,175 +0,0 @@
|
|
1
|
-
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
2
|
-
|
3
|
-
describe "RDaneel when there are redirects" do
|
4
|
-
|
5
|
-
let(:port) {8081}
|
6
|
-
|
7
|
-
describe "when there is no robots.txt in the host (ONLY one host)" do
|
8
|
-
|
9
|
-
describe "when no redirection limit has been set" do
|
10
|
-
before(:each) do
|
11
|
-
server_setup(port) do |server|
|
12
|
-
mount(server, :path => '/robots.txt', :status => 404,
|
13
|
-
:block => should_be_hit_once )
|
14
|
-
mount(server, :path => '/redirect_me', :status => 301,
|
15
|
-
:location => "http://127.0.0.1:#{port}/hello_world",
|
16
|
-
:block => should_be_hit_once )
|
17
|
-
mount(server, :path => '/hello_world', :status => 200,
|
18
|
-
:body => 'Hello World!',
|
19
|
-
:block => should_not_be_hit )
|
20
|
-
end
|
21
|
-
end
|
22
|
-
|
23
|
-
after(:each) do
|
24
|
-
server_shutdown
|
25
|
-
end
|
26
|
-
|
27
|
-
it "should not follow redirects" do
|
28
|
-
EM.run do
|
29
|
-
r = RDaneel.new("http://127.0.0.1:#{port}/redirect_me")
|
30
|
-
r.callback do
|
31
|
-
fail
|
32
|
-
EM.stop
|
33
|
-
end
|
34
|
-
r.errback do
|
35
|
-
r.redirects.should be_empty
|
36
|
-
r.error.should == "Exceeded maximum number of redirects"
|
37
|
-
EM.stop
|
38
|
-
end
|
39
|
-
r.get
|
40
|
-
end
|
41
|
-
|
42
|
-
end
|
43
|
-
|
44
|
-
end
|
45
|
-
|
46
|
-
describe "when a maximum number or redirects is set" do
|
47
|
-
|
48
|
-
describe "when there are less redirects than the maximum specified" do
|
49
|
-
before(:each) do
|
50
|
-
server_setup(port) do |server|
|
51
|
-
mount(server, :path => '/robots.txt', :status => 404,
|
52
|
-
:block => should_be_hit(3) )
|
53
|
-
mount(server, :path => '/redirect_me', :status => 301,
|
54
|
-
:location => "http://127.0.0.1:#{port}/redirect_me_again",
|
55
|
-
:block => should_be_hit_once )
|
56
|
-
mount(server, :path => '/redirect_me_again', :status => 301,
|
57
|
-
:location => "http://127.0.0.1:#{port}/hello_world",
|
58
|
-
:block => should_be_hit_once )
|
59
|
-
mount(server, :path => '/hello_world', :status => 200,
|
60
|
-
:body => 'Hello World!',
|
61
|
-
:block => should_be_hit_once )
|
62
|
-
end
|
63
|
-
end
|
64
|
-
|
65
|
-
after(:each) do
|
66
|
-
server_shutdown
|
67
|
-
end
|
68
|
-
|
69
|
-
it "should get the content following all the redirects" do
|
70
|
-
EM.run do
|
71
|
-
r = RDaneel.new("http://127.0.0.1:#{port}/redirect_me")
|
72
|
-
r.callback do
|
73
|
-
r.http_client.response_header.status.should == 200
|
74
|
-
r.http_client.response.should == "Hello World!"
|
75
|
-
r.redirects.should == [ "http://127.0.0.1:#{port}/redirect_me",
|
76
|
-
"http://127.0.0.1:#{port}/redirect_me_again"]
|
77
|
-
r.uri.to_s.should == "http://127.0.0.1:#{port}/hello_world"
|
78
|
-
EM.stop
|
79
|
-
end
|
80
|
-
r.errback do
|
81
|
-
fail
|
82
|
-
EM.stop
|
83
|
-
end
|
84
|
-
r.get(:redirects => 3)
|
85
|
-
end
|
86
|
-
|
87
|
-
end
|
88
|
-
|
89
|
-
end
|
90
|
-
|
91
|
-
describe "when there are as many redirects as the maximum" do
|
92
|
-
before(:each) do
|
93
|
-
server_setup(port) do |server|
|
94
|
-
mount(server, :path => '/robots.txt', :status => 404,
|
95
|
-
:block => should_be_hit_twice )
|
96
|
-
mount(server, :path => '/redirect_me', :status => 301,
|
97
|
-
:location => "http://127.0.0.1:#{port}/hello_world",
|
98
|
-
:block => should_be_hit_once )
|
99
|
-
mount(server, :path => '/hello_world', :status => 200,
|
100
|
-
:body => 'Hello World!',
|
101
|
-
:block => should_be_hit_once )
|
102
|
-
end
|
103
|
-
end
|
104
|
-
|
105
|
-
after(:each) do
|
106
|
-
server_shutdown
|
107
|
-
end
|
108
|
-
|
109
|
-
it "should get the content following all the redirects" do
|
110
|
-
EM.run do
|
111
|
-
r = RDaneel.new("http://127.0.0.1:#{port}/redirect_me")
|
112
|
-
r.callback do
|
113
|
-
r.http_client.response_header.status.should == 200
|
114
|
-
r.http_client.response.should == "Hello World!"
|
115
|
-
r.redirects.should == ["http://127.0.0.1:#{port}/redirect_me"]
|
116
|
-
r.uri.to_s.should == "http://127.0.0.1:#{port}/hello_world"
|
117
|
-
EM.stop
|
118
|
-
end
|
119
|
-
r.errback do
|
120
|
-
fail
|
121
|
-
EM.stop
|
122
|
-
end
|
123
|
-
r.get(:redirects => 1)
|
124
|
-
end
|
125
|
-
|
126
|
-
end
|
127
|
-
|
128
|
-
end
|
129
|
-
|
130
|
-
describe "when the number of redirects exceed the maximum specified" do
|
131
|
-
before(:each) do
|
132
|
-
server_setup(port) do |server|
|
133
|
-
mount(server, :path => '/robots.txt', :status => 404,
|
134
|
-
:block => should_be_hit_twice )
|
135
|
-
mount(server, :path => '/redirect_me', :status => 301,
|
136
|
-
:location => "http://127.0.0.1:#{port}/redirect_me_again",
|
137
|
-
:block => should_be_hit_once )
|
138
|
-
mount(server, :path => '/redirect_me_again', :status => 301,
|
139
|
-
:location => "http://127.0.0.1:#{port}/hello_world",
|
140
|
-
:block => should_be_hit_once )
|
141
|
-
mount(server, :path => '/hello_world', :status => 200,
|
142
|
-
:body => 'Hello World!',
|
143
|
-
:block => should_not_be_hit )
|
144
|
-
end
|
145
|
-
end
|
146
|
-
|
147
|
-
after(:each) do
|
148
|
-
server_shutdown
|
149
|
-
end
|
150
|
-
|
151
|
-
it "should stop following redirects once the maximum specified is reached" do
|
152
|
-
EM.run do
|
153
|
-
r = RDaneel.new("http://127.0.0.1:#{port}/redirect_me")
|
154
|
-
r.callback do
|
155
|
-
fail
|
156
|
-
EM.stop
|
157
|
-
end
|
158
|
-
r.errback do
|
159
|
-
r.redirects.should == ["http://127.0.0.1:#{port}/redirect_me"]
|
160
|
-
r.error.should == "Exceeded maximum number of redirects"
|
161
|
-
EM.stop
|
162
|
-
end
|
163
|
-
r.get(:redirects => 1)
|
164
|
-
end
|
165
|
-
|
166
|
-
end
|
167
|
-
|
168
|
-
end
|
169
|
-
|
170
|
-
end
|
171
|
-
|
172
|
-
end
|
173
|
-
|
174
|
-
end
|
175
|
-
|
data/spec/using_cache_spec.rb
DELETED
@@ -1,46 +0,0 @@
|
|
1
|
-
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
2
|
-
|
3
|
-
describe "RDaneel when there is a cache" do
|
4
|
-
|
5
|
-
let(:port) {8082}
|
6
|
-
|
7
|
-
describe "when there is no robots.txt in the host" do
|
8
|
-
|
9
|
-
before(:each) do
|
10
|
-
RDaneel.robots_cache = {}
|
11
|
-
server_setup(port) do |server|
|
12
|
-
mount(server, :path => '/robots.txt', :status => 404,
|
13
|
-
:block => should_be_hit_once )
|
14
|
-
mount(server, :path => '/redirect_me', :status => 301,
|
15
|
-
:location => "http://127.0.0.1:#{port}/hello_world",
|
16
|
-
:block => should_be_hit_once )
|
17
|
-
mount(server, :path => '/hello_world', :status => 200,
|
18
|
-
:body => 'Hello World!',
|
19
|
-
:block => should_be_hit_once )
|
20
|
-
end
|
21
|
-
end
|
22
|
-
|
23
|
-
after(:each) do
|
24
|
-
server_shutdown
|
25
|
-
end
|
26
|
-
|
27
|
-
it "should try to get the robots.txt just once" do
|
28
|
-
EM.run do
|
29
|
-
r = RDaneel.new("http://127.0.0.1:#{port}/redirect_me")
|
30
|
-
r.callback do
|
31
|
-
r.http_client.response_header.status.should == 200
|
32
|
-
r.http_client.response.should == "Hello World!"
|
33
|
-
r.redirects.should == [ "http://127.0.0.1:#{port}/redirect_me"]
|
34
|
-
r.uri.to_s.should == "http://127.0.0.1:#{port}/hello_world"
|
35
|
-
EM.stop
|
36
|
-
end
|
37
|
-
r.errback do
|
38
|
-
fail
|
39
|
-
EM.stop
|
40
|
-
end
|
41
|
-
r.get(:redirects => 3)
|
42
|
-
end
|
43
|
-
end
|
44
|
-
end
|
45
|
-
end
|
46
|
-
|