rdaneel 0.0.0 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +2 -1
- data/README.rdoc +66 -5
- data/Rakefile +2 -2
- data/VERSION +1 -1
- data/lib/rdaneel.rb +117 -57
- data/spec/no_redirects_neither_robots_spec.rb +136 -0
- data/spec/redirects_without_robots_spec.rb +175 -0
- data/spec/spec_helper.rb +67 -0
- data/spec/using_cache_spec.rb +44 -0
- metadata +13 -6
- data/spec/rdaneel_spec.rb +0 -96
data/LICENSE
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
Copyright (c) 2009
|
1
|
+
Copyright (c) 2009 has_many :developers
|
2
2
|
|
3
3
|
Permission is hereby granted, free of charge, to any person obtaining
|
4
4
|
a copy of this software and associated documentation files (the
|
@@ -18,3 +18,4 @@ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
|
18
18
|
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
19
|
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
20
|
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
21
|
+
|
data/README.rdoc
CHANGED
@@ -1,13 +1,74 @@
|
|
1
1
|
= R.Daneel
|
2
2
|
|
3
|
-
|
3
|
+
An EventMachine+Ruby library to fetch urls following robots.txt rules.
|
4
|
+
|
5
|
+
RDaneel is built on top of @igrigorik's {em-http-request}[http://github.com/igrigorik/em-http-request]
|
6
|
+
|
7
|
+
== Features
|
8
|
+
|
9
|
+
- Support following redirects, honoring robots.txt for each host in the redirect chain.
|
10
|
+
- Support an external cache to store robots.txt
|
11
|
+
- Compatible with all options defined in em-http-request
|
12
|
+
|
13
|
+
== Install
|
14
|
+
|
15
|
+
$ gem install rdaneel
|
16
|
+
|
17
|
+
== Examples
|
18
|
+
|
19
|
+
=== Following redirects
|
20
|
+
|
21
|
+
require 'rdaneel'
|
22
|
+
|
23
|
+
EM.run do
|
24
|
+
r = RDaneel.new("http://bit.ly/cbEnpa")
|
25
|
+
r.callback{
|
26
|
+
puts r.http_client.response_header.status
|
27
|
+
puts r.http_client.response[0,80]
|
28
|
+
puts r.redirects
|
29
|
+
puts r.uri
|
30
|
+
EM.stop
|
31
|
+
}
|
32
|
+
r.errback{
|
33
|
+
puts "should not happen"
|
34
|
+
EM.stop
|
35
|
+
}
|
36
|
+
r.get(:redirects => 3)
|
37
|
+
end
|
38
|
+
|
39
|
+
=> 200
|
40
|
+
=> <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
41
|
+
=> http://bit.ly:80/cbEnpa
|
42
|
+
=> http://github.com:80/hasmanydevelopers/RDaneel
|
43
|
+
|
44
|
+
|
45
|
+
=== Denied by robots.txt
|
46
|
+
|
47
|
+
require 'rdaneel'
|
48
|
+
|
49
|
+
EM.run do
|
50
|
+
r = RDaneel.new("http://github.com/hasmanydevelopers/RDaneel/tarball/v0.0.0")
|
51
|
+
r.callback{
|
52
|
+
puts "should not happen"
|
53
|
+
EM.stop
|
54
|
+
}
|
55
|
+
r.errback{
|
56
|
+
puts r.error
|
57
|
+
EM.stop
|
58
|
+
}
|
59
|
+
r.get(:redirects => 3)
|
60
|
+
end
|
61
|
+
|
62
|
+
=> robots denied
|
63
|
+
|
64
|
+
|
65
|
+
== Why RDaneel?
|
4
66
|
|
5
67
|
R Daneel Olivaw is a fictional robot created by Isaac Asimov - http://en.wikipedia.org/wiki/R._Daneel_Olivaw
|
6
68
|
|
7
|
-
==
|
69
|
+
== Acknowledgements
|
8
70
|
|
9
|
-
|
10
|
-
But when following redirects the method won't check the intermediate robots.txt, just the first one.
|
71
|
+
To Ilya Grigorik (@igrigorik) for em-http-request lib and his support and advice.
|
11
72
|
|
12
73
|
|
13
74
|
== Note on Patches/Pull Requests
|
@@ -22,5 +83,5 @@ But when following redirects the method won't check the intermediate robots.txt,
|
|
22
83
|
|
23
84
|
== Copyright
|
24
85
|
|
25
|
-
Copyright (c) 2010
|
86
|
+
Copyright (c) 2010 has_many :developers. See LICENSE for details.
|
26
87
|
|
data/Rakefile
CHANGED
@@ -7,9 +7,9 @@ begin
|
|
7
7
|
gem.name = "rdaneel"
|
8
8
|
gem.summary = %Q{Obey robots.txt on top of em-http-request (Asynchronous HTTP Client)}
|
9
9
|
gem.description = %Q{Add robots.txt support on top of em-http-request}
|
10
|
-
gem.email = "edgargonzalez@gmail.com"
|
10
|
+
gem.email = ["edgargonzalez@gmail.com", "anibalrojas@gmail.com"]
|
11
11
|
gem.homepage = "http://github.com/hasmanydevelopers/RDaneel"
|
12
|
-
gem.authors = ["Edgar Gonzalez"]
|
12
|
+
gem.authors = ["Edgar Gonzalez", "Anibal Rojas"]
|
13
13
|
gem.add_dependency("em-http-request", ">= 0.2.10")
|
14
14
|
gem.add_dependency('robot_rules', '>= 0.9.1')
|
15
15
|
gem.add_development_dependency "rspec", ">= 1.2.9"
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
0.1.0
|
data/lib/rdaneel.rb
CHANGED
@@ -1,16 +1,14 @@
|
|
1
1
|
require 'em-http'
|
2
2
|
require 'robot_rules'
|
3
|
-
require 'net/http'
|
4
|
-
|
5
|
-
module Net
|
6
|
-
class DisobeyingRobotsTxt < HTTPBadResponse ; end
|
7
|
-
end
|
8
3
|
|
9
4
|
class RDaneel
|
5
|
+
include EM::Deferrable
|
6
|
+
|
7
|
+
DEFAULT_OPTIONS = {:head => {'user-agent' => 'RDaneel'}}
|
10
8
|
|
11
9
|
class << self
|
12
|
-
def robots_cache=(
|
13
|
-
@robots_cache =
|
10
|
+
def robots_cache=(c)
|
11
|
+
@robots_cache = c
|
14
12
|
end
|
15
13
|
|
16
14
|
def robots_cache
|
@@ -18,76 +16,138 @@ class RDaneel
|
|
18
16
|
end
|
19
17
|
end
|
20
18
|
|
19
|
+
attr_accessor :uri
|
20
|
+
attr_reader :error, :redirects, :http_client
|
21
|
+
|
21
22
|
def initialize(uri)
|
22
23
|
@uri = uri.kind_of?(Addressable::URI) ? uri : Addressable::URI::parse(uri)
|
24
|
+
@redirects = []
|
23
25
|
end
|
24
26
|
|
25
|
-
def
|
26
|
-
|
27
|
-
|
27
|
+
def get(opts = {})
|
28
|
+
current_uri = @uri
|
29
|
+
options = DEFAULT_OPTIONS.merge(opts)
|
30
|
+
max_redirects = options.delete(:redirects).to_i
|
31
|
+
useragent = options[:head]['user-agent']
|
28
32
|
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
33
|
+
_get = lambda {}
|
34
|
+
|
35
|
+
_handle_uri_callback = lambda {|h|
|
36
|
+
if success?(h)
|
37
|
+
@uri = current_uri if current_uri != @uri
|
38
|
+
@http_client = h
|
39
|
+
succeed(self)
|
40
|
+
elsif redirected?(h)
|
41
|
+
if @redirects.size >= max_redirects
|
42
|
+
@http_client = h
|
43
|
+
@error = "Exceeded maximum number of redirects"
|
44
|
+
fail(self)
|
45
|
+
return
|
46
|
+
end
|
47
|
+
begin
|
48
|
+
@redirects << current_uri.to_s
|
49
|
+
current_uri = redirect_url(h, current_uri)
|
50
|
+
if @redirects.include?(current_uri.to_s)
|
51
|
+
@http_client = h
|
52
|
+
@error = "infinite redirect"
|
53
|
+
fail(self)
|
54
|
+
return
|
55
|
+
end
|
56
|
+
_get.call
|
57
|
+
rescue
|
58
|
+
@http_client = h
|
59
|
+
@error = "mal formed redirected url"
|
60
|
+
fail(self)
|
61
|
+
end
|
45
62
|
else
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
63
|
+
# other error
|
64
|
+
self.http_client = h
|
65
|
+
@error = "not success and not redirect"
|
66
|
+
fail(self)
|
50
67
|
end
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
robots_file
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
68
|
+
}
|
69
|
+
_get = lambda {
|
70
|
+
if robots_cache && robots_file = robots_cache[robots_txt_url(current_uri)]
|
71
|
+
if robots_allowed?(robots_file, useragent, current_uri)
|
72
|
+
h = EM::HttpRequest.new(current_uri).get(options)
|
73
|
+
h.callback(&_handle_uri_callback)
|
74
|
+
h.errback {
|
75
|
+
@http_client = h
|
76
|
+
@error = h.error
|
77
|
+
fail(self)
|
78
|
+
}
|
60
79
|
else
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
blk.call(conn)
|
80
|
+
@http_client = EM::HttpClient.new("")
|
81
|
+
@error = "robots denied"
|
82
|
+
fail(self)
|
65
83
|
end
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
84
|
+
else
|
85
|
+
robots = EM::HttpRequest.new(robots_txt_url(current_uri)).get
|
86
|
+
robots.callback {
|
87
|
+
robots_file = robots.response
|
88
|
+
robots_cache[robots_txt_url(current_uri)] = robots_file if robots_cache
|
89
|
+
if robots_allowed?(robots_file, useragent, current_uri)
|
90
|
+
h = EM::HttpRequest.new(current_uri).get(options)
|
91
|
+
h.callback(&_handle_uri_callback)
|
92
|
+
h.errback {
|
93
|
+
@http_client = h
|
94
|
+
@error = h.error
|
95
|
+
fail(self)
|
96
|
+
}
|
97
|
+
else
|
98
|
+
@http_client = EM::HttpClient.new("")
|
99
|
+
@error = "robots denied"
|
100
|
+
fail(self)
|
101
|
+
end
|
102
|
+
}
|
103
|
+
robots.errback {
|
104
|
+
robots_cache.put[robots_txt_url(current_uri)] = "" if robots_cache
|
105
|
+
h = EM::HttpRequest.new(current_uri).get(options)
|
106
|
+
h.callback(&_handle_uri_callback)
|
107
|
+
h.errback {
|
108
|
+
@http_client = h
|
109
|
+
@error = h.error
|
110
|
+
fail(self)
|
111
|
+
}
|
112
|
+
}
|
113
|
+
end
|
114
|
+
}
|
115
|
+
_get.call
|
116
|
+
end
|
117
|
+
|
118
|
+
def robots_cache
|
119
|
+
self.class.robots_cache
|
73
120
|
end
|
74
121
|
|
75
122
|
protected
|
76
123
|
|
77
|
-
def robots_allowed?(robots_file, useragent)
|
124
|
+
def robots_allowed?(robots_file, useragent, u)
|
78
125
|
rules = RobotRules.new(useragent)
|
79
|
-
rules.parse(
|
80
|
-
rules.allowed?
|
126
|
+
rules.parse(u.to_s, robots_file)
|
127
|
+
rules.allowed? u.to_s
|
81
128
|
end
|
82
129
|
|
83
|
-
def robots_txt_url
|
84
|
-
location = if
|
85
|
-
|
130
|
+
def robots_txt_url(u)
|
131
|
+
location = if u.port == 80
|
132
|
+
u.host
|
86
133
|
else
|
87
|
-
"#{
|
134
|
+
"#{u.host}:#{u.port}"
|
88
135
|
end
|
89
136
|
"http://#{location}/robots.txt"
|
90
137
|
end
|
91
138
|
|
139
|
+
def success?(http_client)
|
140
|
+
http_client.response_header.status == 200
|
141
|
+
end
|
142
|
+
|
143
|
+
def redirected?(http_client)
|
144
|
+
http_client.response_header.status == 301 || http_client.response_header.status == 302
|
145
|
+
end
|
146
|
+
|
147
|
+
def redirect_url(http_client, u)
|
148
|
+
location = Addressable::URI.parse(http_client.response_header.location)
|
149
|
+
return u.join(location) if location.relative?
|
150
|
+
return location
|
151
|
+
end
|
92
152
|
end
|
93
153
|
|
@@ -0,0 +1,136 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
2
|
+
|
3
|
+
describe "RDaneel when there are no redirects" do
|
4
|
+
|
5
|
+
describe "when a successfull status different than 200 is issued for robots.txt" do
|
6
|
+
|
7
|
+
it "should get the content ignoring the redirect"
|
8
|
+
|
9
|
+
end
|
10
|
+
|
11
|
+
describe "when a redirect other than 301 and 302 is issued for robots.txt" do
|
12
|
+
|
13
|
+
it "should get the content ignoring the redirect"
|
14
|
+
|
15
|
+
end
|
16
|
+
|
17
|
+
(301..302).each do |status|
|
18
|
+
|
19
|
+
describe "when robots.txt has been moved (http code #{status})" do
|
20
|
+
before(:each) do
|
21
|
+
burrito.mount( :path => '/hello_world', :status => 200,
|
22
|
+
:body => 'Hello World!', :block => should_be_hit_once )
|
23
|
+
burrito.mount( :path => '/robots.txt', :status => status,
|
24
|
+
:location => 'http://127.0.0.1:8080/golems.txt',
|
25
|
+
:block => should_be_hit_once )
|
26
|
+
burrito.mount( :path => '/golems.txt', :status => 200,
|
27
|
+
:block => should_not_be_hit )
|
28
|
+
end
|
29
|
+
|
30
|
+
after(:each) do
|
31
|
+
burrito.unmount('/hello_world')
|
32
|
+
burrito.unmount('/robots.txt')
|
33
|
+
burrito.unmount('/golems.txt')
|
34
|
+
end
|
35
|
+
|
36
|
+
it "should get the content ignoring the redirect" do
|
37
|
+
|
38
|
+
EM.run do
|
39
|
+
r = RDaneel.new("http://127.0.0.1:8080/hello_world")
|
40
|
+
r.callback do
|
41
|
+
r.http_client.response_header.status.should == 200
|
42
|
+
r.http_client.response.should == "Hello World!"
|
43
|
+
r.redirects.should be_empty
|
44
|
+
EM.stop
|
45
|
+
end
|
46
|
+
r.errback do
|
47
|
+
fail
|
48
|
+
EM.stop
|
49
|
+
end
|
50
|
+
r.get
|
51
|
+
end
|
52
|
+
|
53
|
+
end
|
54
|
+
|
55
|
+
end
|
56
|
+
|
57
|
+
end
|
58
|
+
|
59
|
+
(400..417).each do |status|
|
60
|
+
|
61
|
+
describe "when there is a CLIENT error #{status} associated to robots.txt" do
|
62
|
+
before(:each) do
|
63
|
+
burrito.mount( :path => '/hello_world', :status => 200,
|
64
|
+
:body => 'Hello World!', :block => should_be_hit_once )
|
65
|
+
burrito.mount( :path => '/robots.txt', :status => status,
|
66
|
+
:block => should_be_hit_once )
|
67
|
+
end
|
68
|
+
|
69
|
+
after(:each) do
|
70
|
+
burrito.unmount('/hello_world')
|
71
|
+
burrito.unmount('/robots.txt')
|
72
|
+
end
|
73
|
+
|
74
|
+
it "should get the content" do
|
75
|
+
|
76
|
+
EM.run do
|
77
|
+
r = RDaneel.new("http://127.0.0.1:8080/hello_world")
|
78
|
+
r.callback do
|
79
|
+
r.http_client.response_header.status.should == 200
|
80
|
+
r.http_client.response.should == "Hello World!"
|
81
|
+
r.redirects.should be_empty
|
82
|
+
EM.stop
|
83
|
+
end
|
84
|
+
r.errback do
|
85
|
+
fail
|
86
|
+
EM.stop
|
87
|
+
end
|
88
|
+
r.get
|
89
|
+
end
|
90
|
+
|
91
|
+
end
|
92
|
+
|
93
|
+
end
|
94
|
+
|
95
|
+
end
|
96
|
+
|
97
|
+
(500..505).each do |status|
|
98
|
+
|
99
|
+
describe "when there is a SERVER error #{status} associated to robots.txt" do
|
100
|
+
before(:each) do
|
101
|
+
burrito.mount( :path => '/hello_world', :status => 200,
|
102
|
+
:body => 'Hello World!', :block => should_be_hit_once )
|
103
|
+
burrito.mount( :path => '/robots.txt', :status => status,
|
104
|
+
:block => should_be_hit_once )
|
105
|
+
end
|
106
|
+
|
107
|
+
after (:each) do
|
108
|
+
burrito.unmount('/hello_world')
|
109
|
+
burrito.unmount('/robots.txt')
|
110
|
+
end
|
111
|
+
|
112
|
+
it "should get the content" do
|
113
|
+
|
114
|
+
EM.run do
|
115
|
+
r = RDaneel.new("http://127.0.0.1:8080/hello_world")
|
116
|
+
r.callback do
|
117
|
+
r.http_client.response_header.status.should == 200
|
118
|
+
r.http_client.response.should == "Hello World!"
|
119
|
+
r.redirects.should be_empty
|
120
|
+
EM.stop
|
121
|
+
end
|
122
|
+
r.errback do
|
123
|
+
fail
|
124
|
+
EM.stop
|
125
|
+
end
|
126
|
+
r.get
|
127
|
+
end
|
128
|
+
|
129
|
+
end
|
130
|
+
|
131
|
+
end
|
132
|
+
|
133
|
+
end
|
134
|
+
|
135
|
+
end
|
136
|
+
|
@@ -0,0 +1,175 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
2
|
+
|
3
|
+
describe "RDaneel when there are redirects" do
|
4
|
+
|
5
|
+
describe "when there is no robots.txt in the host (ONLY one host)" do
|
6
|
+
|
7
|
+
describe "when no redirection limit has been set" do
|
8
|
+
before(:each) do
|
9
|
+
burrito.mount( :path => '/robots.txt', :status => 404,
|
10
|
+
:block => should_be_hit_once )
|
11
|
+
burrito.mount( :path => '/redirect_me', :status => 301,
|
12
|
+
:location => 'http://127.0.0.1:8080/hello_world',
|
13
|
+
:block => should_be_hit_once )
|
14
|
+
burrito.mount( :path => '/hello_world', :status => 200,
|
15
|
+
:body => 'Hello World!',
|
16
|
+
:block => should_not_be_hit )
|
17
|
+
end
|
18
|
+
|
19
|
+
after(:each) do
|
20
|
+
burrito.unmount('/robots.txt')
|
21
|
+
burrito.unmount('/redirect_me')
|
22
|
+
burrito.unmount('/hello_world')
|
23
|
+
end
|
24
|
+
|
25
|
+
it "should not follow redirects" do
|
26
|
+
EM.run do
|
27
|
+
r = RDaneel.new("http://127.0.0.1:8080/redirect_me")
|
28
|
+
r.callback do
|
29
|
+
fail
|
30
|
+
EM.stop
|
31
|
+
end
|
32
|
+
r.errback do
|
33
|
+
r.redirects.should be_empty
|
34
|
+
r.error.should == "Exceeded maximum number of redirects"
|
35
|
+
EM.stop
|
36
|
+
end
|
37
|
+
r.get
|
38
|
+
end
|
39
|
+
|
40
|
+
end
|
41
|
+
|
42
|
+
end
|
43
|
+
|
44
|
+
describe "when a maximum number or redirects is set" do
|
45
|
+
|
46
|
+
describe "when there are less redirects than the maximum specified" do
|
47
|
+
before(:each) do
|
48
|
+
burrito.mount( :path => '/robots.txt', :status => 404,
|
49
|
+
:block => should_be_hit(3) )
|
50
|
+
burrito.mount( :path => '/redirect_me', :status => 301,
|
51
|
+
:location => 'http://127.0.0.1:8080/redirect_me_again',
|
52
|
+
:block => should_be_hit_once )
|
53
|
+
burrito.mount( :path => '/redirect_me_again', :status => 301,
|
54
|
+
:location => 'http://127.0.0.1:8080/hello_world',
|
55
|
+
:block => should_be_hit_once )
|
56
|
+
burrito.mount( :path => '/hello_world', :status => 200,
|
57
|
+
:body => 'Hello World!',
|
58
|
+
:block => should_be_hit_once )
|
59
|
+
end
|
60
|
+
|
61
|
+
after(:each) do
|
62
|
+
burrito.unmount('/robots.txt')
|
63
|
+
burrito.unmount('/redirect_me')
|
64
|
+
burrito.unmount('/redirect_me_again')
|
65
|
+
burrito.unmount('/hello_world')
|
66
|
+
end
|
67
|
+
|
68
|
+
it "should get the content following all the redirects" do
|
69
|
+
EM.run do
|
70
|
+
r = RDaneel.new("http://127.0.0.1:8080/redirect_me")
|
71
|
+
r.callback do
|
72
|
+
r.http_client.response_header.status.should == 200
|
73
|
+
r.http_client.response.should == "Hello World!"
|
74
|
+
r.redirects.should == [ "http://127.0.0.1:8080/redirect_me",
|
75
|
+
"http://127.0.0.1:8080/redirect_me_again"]
|
76
|
+
r.uri.to_s.should == "http://127.0.0.1:8080/hello_world"
|
77
|
+
EM.stop
|
78
|
+
end
|
79
|
+
r.errback do
|
80
|
+
fail
|
81
|
+
EM.stop
|
82
|
+
end
|
83
|
+
r.get(:redirects => 3)
|
84
|
+
end
|
85
|
+
|
86
|
+
end
|
87
|
+
|
88
|
+
end
|
89
|
+
|
90
|
+
describe "when there are as many redirects as the maximum" do
|
91
|
+
before(:each) do
|
92
|
+
burrito.mount( :path => '/robots.txt', :status => 404,
|
93
|
+
:block => should_be_hit_twice )
|
94
|
+
burrito.mount( :path => '/redirect_me', :status => 301,
|
95
|
+
:location => 'http://127.0.0.1:8080/hello_world',
|
96
|
+
:block => should_be_hit_once )
|
97
|
+
burrito.mount( :path => '/hello_world', :status => 200,
|
98
|
+
:body => 'Hello World!',
|
99
|
+
:block => should_be_hit_once )
|
100
|
+
end
|
101
|
+
|
102
|
+
after(:each) do
|
103
|
+
burrito.unmount('/robots.txt')
|
104
|
+
burrito.unmount('/redirect_me')
|
105
|
+
burrito.unmount('/hello_world')
|
106
|
+
end
|
107
|
+
|
108
|
+
it "should get the content following all the redirects" do
|
109
|
+
EM.run do
|
110
|
+
r = RDaneel.new("http://127.0.0.1:8080/redirect_me")
|
111
|
+
r.callback do
|
112
|
+
r.http_client.response_header.status.should == 200
|
113
|
+
r.http_client.response.should == "Hello World!"
|
114
|
+
r.redirects.should == ['http://127.0.0.1:8080/redirect_me']
|
115
|
+
r.uri.to_s.should == "http://127.0.0.1:8080/hello_world"
|
116
|
+
EM.stop
|
117
|
+
end
|
118
|
+
r.errback do
|
119
|
+
fail
|
120
|
+
EM.stop
|
121
|
+
end
|
122
|
+
r.get(:redirects => 1)
|
123
|
+
end
|
124
|
+
|
125
|
+
end
|
126
|
+
|
127
|
+
end
|
128
|
+
|
129
|
+
describe "when the number of redirects exceed the maximum specified" do
|
130
|
+
before(:each) do
|
131
|
+
burrito.mount( :path => '/robots.txt', :status => 404,
|
132
|
+
:block => should_be_hit_twice )
|
133
|
+
burrito.mount( :path => '/redirect_me', :status => 301,
|
134
|
+
:location => 'http://127.0.0.1:8080/redirect_me_again',
|
135
|
+
:block => should_be_hit_once )
|
136
|
+
burrito.mount( :path => '/redirect_me_again', :status => 301,
|
137
|
+
:location => 'http://127.0.0.1:8080/hello_world',
|
138
|
+
:block => should_be_hit_once )
|
139
|
+
burrito.mount( :path => '/hello_world', :status => 200,
|
140
|
+
:body => 'Hello World!',
|
141
|
+
:block => should_not_be_hit )
|
142
|
+
end
|
143
|
+
|
144
|
+
after(:each) do
|
145
|
+
burrito.unmount('/robots.txt')
|
146
|
+
burrito.unmount('/redirect_me')
|
147
|
+
burrito.unmount('/redirect_me_again')
|
148
|
+
burrito.unmount('/hello_world')
|
149
|
+
end
|
150
|
+
|
151
|
+
it "should stop following redirects once the maximum specified is reached" do
|
152
|
+
EM.run do
|
153
|
+
r = RDaneel.new("http://127.0.0.1:8080/redirect_me")
|
154
|
+
r.callback do
|
155
|
+
fail
|
156
|
+
EM.stop
|
157
|
+
end
|
158
|
+
r.errback do
|
159
|
+
r.redirects.should == ['http://127.0.0.1:8080/redirect_me']
|
160
|
+
r.error.should == "Exceeded maximum number of redirects"
|
161
|
+
EM.stop
|
162
|
+
end
|
163
|
+
r.get(:redirects => 1)
|
164
|
+
end
|
165
|
+
|
166
|
+
end
|
167
|
+
|
168
|
+
end
|
169
|
+
|
170
|
+
end
|
171
|
+
|
172
|
+
end
|
173
|
+
|
174
|
+
end
|
175
|
+
|
data/spec/spec_helper.rb
CHANGED
@@ -4,8 +4,75 @@ require 'rubygems'
|
|
4
4
|
require 'rdaneel'
|
5
5
|
require 'spec'
|
6
6
|
require 'spec/autorun'
|
7
|
+
require 'webrick'
|
7
8
|
|
8
9
|
Spec::Runner.configure do |config|
|
10
|
+
config.before :suite do
|
11
|
+
burrito
|
12
|
+
end
|
13
|
+
config.after :suite do
|
14
|
+
burrito.stop
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
def burrito
|
19
|
+
Thread.current[:burrito] ||= Burrito.new
|
20
|
+
end
|
21
|
+
|
22
|
+
class Burrito
|
23
|
+
|
24
|
+
def initialize( options={}, &blk )
|
25
|
+
webrick_log_file = '/dev/null' # disable logging
|
26
|
+
webrick_logger = WEBrick::Log.new(webrick_log_file, WEBrick::Log::DEBUG)
|
27
|
+
access_log_stream = webrick_logger
|
28
|
+
access_log = [[ access_log_stream, WEBrick::AccessLog::COMBINED_LOG_FORMAT ]]
|
29
|
+
default_opts = {:Port => 8080, :Logger => webrick_logger, :AccessLog => access_log }
|
30
|
+
@server = WEBrick::HTTPServer.new( default_opts.merge(options) )
|
31
|
+
@server_thread = Thread.new {
|
32
|
+
blk.call(@server) if blk
|
33
|
+
@server.start
|
34
|
+
}
|
35
|
+
@server
|
36
|
+
end
|
37
|
+
|
38
|
+
def mount( opts )
|
39
|
+
raise ":path is required" if opts[:path].nil?
|
40
|
+
raise ":status is required" if opts[:status].nil?
|
41
|
+
@server.mount_proc( opts[:path],
|
42
|
+
lambda { |req, resp|
|
43
|
+
resp.status = opts[:status]
|
44
|
+
resp.body = opts[:body] unless opts[:body].nil?
|
45
|
+
resp['Location'] = opts[:location] unless opts[:location].nil?
|
46
|
+
opts[:block].call unless opts[:block].nil?
|
47
|
+
} )
|
48
|
+
end
|
49
|
+
|
50
|
+
def stop
|
51
|
+
@server.shutdown
|
52
|
+
@server_thread.join
|
53
|
+
end
|
54
|
+
|
55
|
+
def unmount(path)
|
56
|
+
@server.unmount(path)
|
57
|
+
end
|
58
|
+
|
59
|
+
end
|
60
|
+
|
61
|
+
def should_not_be_hit
|
62
|
+
should_be_hit( 0 )
|
63
|
+
end
|
64
|
+
|
65
|
+
def should_be_hit_once
|
66
|
+
should_be_hit( 1 )
|
67
|
+
end
|
68
|
+
|
69
|
+
def should_be_hit_twice
|
70
|
+
should_be_hit( 2 )
|
71
|
+
end
|
9
72
|
|
73
|
+
def should_be_hit( times = 1 )
|
74
|
+
l = lambda {}
|
75
|
+
m = l.should_receive(:call).exactly(times).times
|
76
|
+
return l
|
10
77
|
end
|
11
78
|
|
@@ -0,0 +1,44 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
2
|
+
|
3
|
+
describe "RDaneel when there is a cache" do
|
4
|
+
|
5
|
+
describe "when there is no robots.txt in the host" do
|
6
|
+
|
7
|
+
before(:each) do
|
8
|
+
RDaneel.robots_cache = {}
|
9
|
+
burrito.mount( :path => '/robots.txt', :status => 404,
|
10
|
+
:block => should_be_hit_once )
|
11
|
+
burrito.mount( :path => '/redirect_me', :status => 301,
|
12
|
+
:location => 'http://127.0.0.1:8080/hello_world',
|
13
|
+
:block => should_be_hit_once )
|
14
|
+
burrito.mount( :path => '/hello_world', :status => 200,
|
15
|
+
:body => 'Hello World!',
|
16
|
+
:block => should_be_hit_once )
|
17
|
+
end
|
18
|
+
|
19
|
+
after(:each) do
|
20
|
+
burrito.unmount('/robots.txt')
|
21
|
+
burrito.unmount('/redirect_me')
|
22
|
+
burrito.unmount('/hello_world')
|
23
|
+
end
|
24
|
+
|
25
|
+
it "should try to get the robots.txt just once" do
|
26
|
+
EM.run do
|
27
|
+
r = RDaneel.new("http://127.0.0.1:8080/redirect_me")
|
28
|
+
r.callback do
|
29
|
+
r.http_client.response_header.status.should == 200
|
30
|
+
r.http_client.response.should == "Hello World!"
|
31
|
+
r.redirects.should == [ "http://127.0.0.1:8080/redirect_me"]
|
32
|
+
r.uri.to_s.should == "http://127.0.0.1:8080/hello_world"
|
33
|
+
EM.stop
|
34
|
+
end
|
35
|
+
r.errback do
|
36
|
+
fail
|
37
|
+
EM.stop
|
38
|
+
end
|
39
|
+
r.get(:redirects => 3)
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
43
|
+
end
|
44
|
+
|
metadata
CHANGED
@@ -4,17 +4,18 @@ version: !ruby/object:Gem::Version
|
|
4
4
|
prerelease: false
|
5
5
|
segments:
|
6
6
|
- 0
|
7
|
+
- 1
|
7
8
|
- 0
|
8
|
-
|
9
|
-
version: 0.0.0
|
9
|
+
version: 0.1.0
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Edgar Gonzalez
|
13
|
+
- Anibal Rojas
|
13
14
|
autorequire:
|
14
15
|
bindir: bin
|
15
16
|
cert_chain: []
|
16
17
|
|
17
|
-
date: 2010-07-
|
18
|
+
date: 2010-07-19 00:00:00 -04:30
|
18
19
|
default_executable:
|
19
20
|
dependencies:
|
20
21
|
- !ruby/object:Gem::Dependency
|
@@ -60,7 +61,9 @@ dependencies:
|
|
60
61
|
type: :development
|
61
62
|
version_requirements: *id003
|
62
63
|
description: Add robots.txt support on top of em-http-request
|
63
|
-
email:
|
64
|
+
email:
|
65
|
+
- edgargonzalez@gmail.com
|
66
|
+
- anibalrojas@gmail.com
|
64
67
|
executables: []
|
65
68
|
|
66
69
|
extensions: []
|
@@ -76,9 +79,11 @@ files:
|
|
76
79
|
- Rakefile
|
77
80
|
- VERSION
|
78
81
|
- lib/rdaneel.rb
|
79
|
-
- spec/
|
82
|
+
- spec/no_redirects_neither_robots_spec.rb
|
83
|
+
- spec/redirects_without_robots_spec.rb
|
80
84
|
- spec/spec.opts
|
81
85
|
- spec/spec_helper.rb
|
86
|
+
- spec/using_cache_spec.rb
|
82
87
|
has_rdoc: true
|
83
88
|
homepage: http://github.com/hasmanydevelopers/RDaneel
|
84
89
|
licenses: []
|
@@ -110,5 +115,7 @@ signing_key:
|
|
110
115
|
specification_version: 3
|
111
116
|
summary: Obey robots.txt on top of em-http-request (Asynchronous HTTP Client)
|
112
117
|
test_files:
|
118
|
+
- spec/using_cache_spec.rb
|
119
|
+
- spec/no_redirects_neither_robots_spec.rb
|
113
120
|
- spec/spec_helper.rb
|
114
|
-
- spec/
|
121
|
+
- spec/redirects_without_robots_spec.rb
|
data/spec/rdaneel_spec.rb
DELETED
@@ -1,96 +0,0 @@
|
|
1
|
-
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
2
|
-
require 'webrick'
|
3
|
-
|
4
|
-
describe "RDaneel" do
|
5
|
-
|
6
|
-
describe "when there is no robots.txt" do
|
7
|
-
before(:all) do
|
8
|
-
start_server do |s|
|
9
|
-
s.mount_proc('/hello_world', lambda { |req, resp| resp.status = 200; resp.body = "hello world"})
|
10
|
-
end
|
11
|
-
end
|
12
|
-
|
13
|
-
after(:all) do
|
14
|
-
stop_server
|
15
|
-
end
|
16
|
-
|
17
|
-
it "should follow and get the uri" do
|
18
|
-
EM.run {
|
19
|
-
RDaneel.new("http://127.0.0.1:8080/hello_world").get do |http|
|
20
|
-
http.response_header.status.should == 200
|
21
|
-
http.response.should == "hello world"
|
22
|
-
http.error.should == ''
|
23
|
-
EM.stop
|
24
|
-
end
|
25
|
-
}
|
26
|
-
end
|
27
|
-
end
|
28
|
-
|
29
|
-
describe "when there is a robots.txt that allow the uri requested" do
|
30
|
-
before(:all) do
|
31
|
-
start_server do |s|
|
32
|
-
s.mount_proc('/robots.txt', lambda { |req, resp| resp.status = 200; resp.body = "User-agent: *\nDisallow: /images"})
|
33
|
-
s.mount_proc('/hello_world', lambda { |req, resp| resp.status = 200; resp.body = "hello world"})
|
34
|
-
end
|
35
|
-
end
|
36
|
-
|
37
|
-
after(:all) do
|
38
|
-
stop_server
|
39
|
-
end
|
40
|
-
|
41
|
-
it "should follow and get the uri" do
|
42
|
-
EM.run {
|
43
|
-
RDaneel.new("http://127.0.0.1:8080/hello_world").get do |http|
|
44
|
-
http.response_header.status.should == 200
|
45
|
-
http.response.should == "hello world"
|
46
|
-
http.error.should == ''
|
47
|
-
EM.stop
|
48
|
-
end
|
49
|
-
}
|
50
|
-
end
|
51
|
-
end
|
52
|
-
|
53
|
-
describe "when there is a robots.txt that disallow all content for all bots" do
|
54
|
-
before(:all) do
|
55
|
-
start_server do |s|
|
56
|
-
s.mount_proc('/robots.txt', lambda { |req, resp| resp.status = 200; resp.body = "User-agent: *\nDisallow: /"})
|
57
|
-
s.mount_proc('/hello_world', lambda { |req, resp| resp.status = 200; resp.body = "hello world"})
|
58
|
-
end
|
59
|
-
end
|
60
|
-
|
61
|
-
after(:all) do
|
62
|
-
stop_server
|
63
|
-
end
|
64
|
-
|
65
|
-
it "shouldn't get the uri" do
|
66
|
-
EM.run {
|
67
|
-
RDaneel.new("http://127.0.0.1:8080/hello_world").get do |http|
|
68
|
-
http.error.should == 'robots.txt'
|
69
|
-
http.uri.to_s.should == "http://127.0.0.1:8080/hello_world"
|
70
|
-
EM.stop
|
71
|
-
end
|
72
|
-
}
|
73
|
-
end
|
74
|
-
end
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
end
|
79
|
-
|
80
|
-
def start_server(options={}, &blk)
|
81
|
-
@server = WEBrick::HTTPServer.new({:Port => 8080}.merge(options))
|
82
|
-
@server_thread = Thread.new {
|
83
|
-
blk.call(@server) if blk
|
84
|
-
@server.start
|
85
|
-
}
|
86
|
-
end
|
87
|
-
|
88
|
-
def stop_server
|
89
|
-
@server.shutdown
|
90
|
-
@server_thread.join
|
91
|
-
end
|
92
|
-
|
93
|
-
def robots_txt
|
94
|
-
|
95
|
-
end
|
96
|
-
|