rdaneel 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +1 -1
- data/Rakefile +1 -1
- data/VERSION +1 -1
- data/lib/rdaneel.rb +46 -26
- data/spec/no_redirects_neither_robots_spec.rb +35 -0
- data/spec/spec_helper.rb +1 -1
- metadata +5 -5
data/README.rdoc
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
= R.Daneel
|
2
2
|
|
3
|
-
An EventMachine+Ruby library to fetch urls
|
3
|
+
An EventMachine+Ruby library to fetch urls obeying robots.txt rules.
|
4
4
|
|
5
5
|
RDaneel is built it on top of @igrigorik's {em-http-request}[http://github.com/igrigorik/em-http-request]
|
6
6
|
|
data/Rakefile
CHANGED
@@ -11,7 +11,7 @@ begin
|
|
11
11
|
gem.homepage = "http://github.com/hasmanydevelopers/RDaneel"
|
12
12
|
gem.authors = ["Edgar Gonzalez", "Anibal Rojas"]
|
13
13
|
gem.add_dependency("em-http-request", ">= 0.2.10")
|
14
|
-
gem.add_dependency('robot_rules', '>= 0.9.
|
14
|
+
gem.add_dependency('robot_rules', '>= 0.9.3')
|
15
15
|
gem.add_development_dependency "rspec", ">= 1.2.9"
|
16
16
|
# gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
|
17
17
|
end
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.0
|
1
|
+
0.1.2
|
data/lib/rdaneel.rb
CHANGED
@@ -21,6 +21,7 @@ class RDaneel
|
|
21
21
|
|
22
22
|
def initialize(uri)
|
23
23
|
@uri = uri.kind_of?(Addressable::URI) ? uri : Addressable::URI::parse(uri)
|
24
|
+
@uri.path = "/" if @uri.path.nil? || @uri.path == ""
|
24
25
|
@redirects = []
|
25
26
|
end
|
26
27
|
|
@@ -61,39 +62,53 @@ class RDaneel
|
|
61
62
|
end
|
62
63
|
else
|
63
64
|
# other error
|
64
|
-
|
65
|
+
@http_client = h
|
65
66
|
@error = "not success and not redirect"
|
66
67
|
fail(self)
|
67
68
|
end
|
68
69
|
}
|
69
70
|
_get = lambda {
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
71
|
+
robots_url = robots_txt_url(current_uri)
|
72
|
+
if robots_cache && robots_file = robots_cache[robots_url.to_s]
|
73
|
+
if robots_allowed?(robots_file, useragent, robots_url, current_uri)
|
74
|
+
begin
|
75
|
+
h = EM::HttpRequest.new(current_uri).get(options)
|
76
|
+
h.callback(&_handle_uri_callback)
|
77
|
+
h.errback {
|
78
|
+
@http_client = h
|
79
|
+
@error = h.error
|
80
|
+
fail(self)
|
81
|
+
}
|
82
|
+
rescue StandardError => se
|
83
|
+
@http_client = EM::HttpClient.new("")
|
84
|
+
@error = "#{se.message}\n#{se.backtrace.inspect}"
|
77
85
|
fail(self)
|
78
|
-
|
86
|
+
end
|
79
87
|
else
|
80
88
|
@http_client = EM::HttpClient.new("")
|
81
89
|
@error = "robots denied"
|
82
90
|
fail(self)
|
83
91
|
end
|
84
92
|
else
|
85
|
-
|
93
|
+
robots_url = robots_txt_url(current_uri)
|
94
|
+
robots = EM::HttpRequest.new(robots_url).get
|
86
95
|
robots.callback {
|
87
96
|
robots_file = robots.response
|
88
|
-
robots_cache[
|
89
|
-
if robots_allowed?(robots_file, useragent, current_uri)
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
97
|
+
robots_cache[robots_url.to_s] = robots_file if robots_cache
|
98
|
+
if robots_allowed?(robots_file, useragent, robots_url, current_uri)
|
99
|
+
begin
|
100
|
+
h = EM::HttpRequest.new(current_uri).get(options)
|
101
|
+
h.callback(&_handle_uri_callback)
|
102
|
+
h.errback {
|
103
|
+
@http_client = h
|
104
|
+
@error = h.error
|
105
|
+
fail(self)
|
106
|
+
}
|
107
|
+
rescue StandardError => se
|
108
|
+
@http_client = EM::HttpClient.new("")
|
109
|
+
@error = "#{se.message}\n#{se.backtrace.inspect}"
|
95
110
|
fail(self)
|
96
|
-
|
111
|
+
end
|
97
112
|
else
|
98
113
|
@http_client = EM::HttpClient.new("")
|
99
114
|
@error = "robots denied"
|
@@ -101,7 +116,7 @@ class RDaneel
|
|
101
116
|
end
|
102
117
|
}
|
103
118
|
robots.errback {
|
104
|
-
robots_cache.
|
119
|
+
robots_cache[robots_url.to_s] = "" if robots_cache
|
105
120
|
h = EM::HttpRequest.new(current_uri).get(options)
|
106
121
|
h.callback(&_handle_uri_callback)
|
107
122
|
h.errback {
|
@@ -121,10 +136,14 @@ class RDaneel
|
|
121
136
|
|
122
137
|
protected
|
123
138
|
|
124
|
-
def robots_allowed?(robots_file, useragent,
|
125
|
-
|
126
|
-
|
127
|
-
|
139
|
+
def robots_allowed?(robots_file, useragent, robots_url, url)
|
140
|
+
begin
|
141
|
+
rules = RobotRules.new(useragent)
|
142
|
+
rules.parse(robots_url, robots_file)
|
143
|
+
return rules.allowed? url
|
144
|
+
rescue StandardError => err
|
145
|
+
return true
|
146
|
+
end
|
128
147
|
end
|
129
148
|
|
130
149
|
def robots_txt_url(u)
|
@@ -133,7 +152,7 @@ class RDaneel
|
|
133
152
|
else
|
134
153
|
"#{u.host}:#{u.port}"
|
135
154
|
end
|
136
|
-
"http://#{location}/robots.txt"
|
155
|
+
Addressable::URI.parse("http://#{location}/robots.txt")
|
137
156
|
end
|
138
157
|
|
139
158
|
def success?(http_client)
|
@@ -146,8 +165,9 @@ class RDaneel
|
|
146
165
|
|
147
166
|
def redirect_url(http_client, u)
|
148
167
|
location = Addressable::URI.parse(http_client.response_header.location)
|
149
|
-
|
150
|
-
|
168
|
+
location = u.join(location) if location.relative?
|
169
|
+
location.path = "/" if location.path.nil? || location.path == ""
|
170
|
+
location
|
151
171
|
end
|
152
172
|
end
|
153
173
|
|
data/spec/no_redirects_neither_robots_spec.rb
CHANGED
@@ -2,6 +2,41 @@ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
|
2
2
|
|
3
3
|
describe "RDaneel when there are no redirects" do
|
4
4
|
|
5
|
+
describe "when not exist a robots.txt (404) and the url requested is /" do
|
6
|
+
before(:each) do
|
7
|
+
burrito.mount( :path => '/', :status => 200,
|
8
|
+
:body => 'Hello World!', :block => should_be_hit_once )
|
9
|
+
burrito.mount( :path => '/robots.txt', :status => 404,
|
10
|
+
:block => should_be_hit_once )
|
11
|
+
end
|
12
|
+
|
13
|
+
after(:each) do
|
14
|
+
burrito.unmount('/')
|
15
|
+
burrito.unmount('/robots.txt')
|
16
|
+
end
|
17
|
+
|
18
|
+
it "should get the content is the url not end with /" do
|
19
|
+
|
20
|
+
EM.run do
|
21
|
+
r = RDaneel.new("http://127.0.0.1:8080")
|
22
|
+
r.callback do
|
23
|
+
r.http_client.response_header.status.should == 200
|
24
|
+
r.http_client.response.should == "Hello World!"
|
25
|
+
r.redirects.should be_empty
|
26
|
+
EM.stop
|
27
|
+
end
|
28
|
+
r.errback do
|
29
|
+
fail
|
30
|
+
EM.stop
|
31
|
+
end
|
32
|
+
r.get
|
33
|
+
end
|
34
|
+
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
|
39
|
+
|
5
40
|
describe "when a successfull status different than 200 is issued for robots.txt" do
|
6
41
|
|
7
42
|
it "should get the content ignoring the redirect"
|
data/spec/spec_helper.rb
CHANGED
@@ -3,7 +3,6 @@ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
|
3
3
|
require 'rubygems'
|
4
4
|
require 'rdaneel'
|
5
5
|
require 'spec'
|
6
|
-
require 'spec/autorun'
|
7
6
|
require 'webrick'
|
8
7
|
|
9
8
|
Spec::Runner.configure do |config|
|
@@ -33,6 +32,7 @@ class Burrito
|
|
33
32
|
@server.start
|
34
33
|
}
|
35
34
|
@server
|
35
|
+
sleep 0.5
|
36
36
|
end
|
37
37
|
|
38
38
|
def mount( opts )
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 1
|
8
|
-
- 0
|
9
|
-
version: 0.1.0
|
8
|
+
- 2
|
9
|
+
version: 0.1.2
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Edgar Gonzalez
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2010-07-
|
18
|
+
date: 2010-07-21 00:00:00 -04:30
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -42,8 +42,8 @@ dependencies:
|
|
42
42
|
segments:
|
43
43
|
- 0
|
44
44
|
- 9
|
45
|
-
-
|
46
|
-
version: 0.9.
|
45
|
+
- 3
|
46
|
+
version: 0.9.3
|
47
47
|
type: :runtime
|
48
48
|
version_requirements: *id002
|
49
49
|
- !ruby/object:Gem::Dependency
|