rdaneel 0.1.0 → 0.1.2
- data/README.rdoc +1 -1
- data/Rakefile +1 -1
- data/VERSION +1 -1
- data/lib/rdaneel.rb +46 -26
- data/spec/no_redirects_neither_robots_spec.rb +35 -0
- data/spec/spec_helper.rb +1 -1
- metadata +5 -5
data/README.rdoc
CHANGED
@@ -1,6 +1,6 @@
 = R.Daneel
 
-An EventMachine+Ruby library to fetch urls
+An EventMachine+Ruby library to fetch urls obeying robots.txt rules.
 
 RDaneel is built it on top of @igrigorik's {em-http-request}[http://github.com/igrigorik/em-http-request]
 
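The one-line description above maps onto a small callback/errback API. For orientation, the spec added later in this diff drives RDaneel roughly as in the sketch below; the target URL and the handling inside the blocks are illustrative, not taken from RDaneel's documentation.

# Usage sketch assembled from the spec added in this release; the target
# URL and the puts calls are illustrative only.
require 'rubygems'
require 'eventmachine'
require 'rdaneel'

EM.run do
  r = RDaneel.new("http://127.0.0.1:8080")  # robots.txt is fetched and checked first
  r.callback do
    # robots.txt allowed the URL and the GET succeeded
    puts r.http_client.response_header.status
    puts r.http_client.response
    EM.stop
  end
  r.errback do
    # robots.txt denied the URL, or the request itself failed
    EM.stop
  end
  r.get
end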
data/Rakefile
CHANGED
@@ -11,7 +11,7 @@ begin
     gem.homepage = "http://github.com/hasmanydevelopers/RDaneel"
     gem.authors = ["Edgar Gonzalez", "Anibal Rojas"]
     gem.add_dependency("em-http-request", ">= 0.2.10")
-    gem.add_dependency('robot_rules', '>= 0.9.
+    gem.add_dependency('robot_rules', '>= 0.9.3')
     gem.add_development_dependency "rspec", ">= 1.2.9"
     # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
   end
data/VERSION
CHANGED
@@ -1 +1 @@
-0.1.0
+0.1.2
data/lib/rdaneel.rb
CHANGED
@@ -21,6 +21,7 @@ class RDaneel
 
   def initialize(uri)
     @uri = uri.kind_of?(Addressable::URI) ? uri : Addressable::URI::parse(uri)
+    @uri.path = "/" if @uri.path.nil? || @uri.path == ""
     @redirects = []
   end
 
@@ -61,39 +62,53 @@ class RDaneel
         end
       else
         # other error
-
+        @http_client = h
         @error = "not success and not redirect"
         fail(self)
       end
     }
     _get = lambda {
-
-
-
-
-
-
-
+      robots_url = robots_txt_url(current_uri)
+      if robots_cache && robots_file = robots_cache[robots_url.to_s]
+        if robots_allowed?(robots_file, useragent, robots_url, current_uri)
+          begin
+            h = EM::HttpRequest.new(current_uri).get(options)
+            h.callback(&_handle_uri_callback)
+            h.errback {
+              @http_client = h
+              @error = h.error
+              fail(self)
+            }
+          rescue StandardError => se
+            @http_client = EM::HttpClient.new("")
+            @error = "#{se.message}\n#{se.backtrace.inspect}"
             fail(self)
-
+          end
        else
          @http_client = EM::HttpClient.new("")
          @error = "robots denied"
          fail(self)
        end
      else
-
+        robots_url = robots_txt_url(current_uri)
+        robots = EM::HttpRequest.new(robots_url).get
        robots.callback {
          robots_file = robots.response
-          robots_cache[
-          if robots_allowed?(robots_file, useragent, current_uri)
-
-
-
-
-
+          robots_cache[robots_url.to_s] = robots_file if robots_cache
+          if robots_allowed?(robots_file, useragent, robots_url, current_uri)
+            begin
+              h = EM::HttpRequest.new(current_uri).get(options)
+              h.callback(&_handle_uri_callback)
+              h.errback {
+                @http_client = h
+                @error = h.error
+                fail(self)
+              }
+            rescue StandardError => se
+              @http_client = EM::HttpClient.new("")
+              @error = "#{se.message}\n#{se.backtrace.inspect}"
               fail(self)
-
+            end
          else
            @http_client = EM::HttpClient.new("")
            @error = "robots denied"
@@ -101,7 +116,7 @@ class RDaneel
           end
         }
         robots.errback {
-          robots_cache.
+          robots_cache[robots_url.to_s] = "" if robots_cache
           h = EM::HttpRequest.new(current_uri).get(options)
           h.callback(&_handle_uri_callback)
           h.errback {
@@ -121,10 +136,14 @@ class RDaneel
 
   protected
 
-  def robots_allowed?(robots_file, useragent,
-
-
-
+  def robots_allowed?(robots_file, useragent, robots_url, url)
+    begin
+      rules = RobotRules.new(useragent)
+      rules.parse(robots_url, robots_file)
+      return rules.allowed? url
+    rescue StandardError => err
+      return true
+    end
   end
 
   def robots_txt_url(u)
@@ -133,7 +152,7 @@ class RDaneel
                else
                  "#{u.host}:#{u.port}"
                end
-    "http://#{location}/robots.txt"
+    Addressable::URI.parse("http://#{location}/robots.txt")
   end
 
   def success?(http_client)
@@ -146,8 +165,9 @@ class RDaneel
 
   def redirect_url(http_client, u)
     location = Addressable::URI.parse(http_client.response_header.location)
-
-
+    location = u.join(location) if location.relative?
+    location.path = "/" if location.path.nil? || location.path == ""
+    location
   end
 end
 
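The new robots_allowed? helper above delegates the robots.txt decision to the robot_rules gem and falls back to allowing the URL when parsing raises. Outside of EventMachine that check looks roughly like the sketch below; the robots.txt body and both URLs are made up, and passing the robots.txt URL as a plain string is an assumption, since RDaneel itself passes the Addressable::URI built by robots_txt_url.

# Standalone sketch of the robot_rules calls wrapped by robots_allowed?;
# the robots.txt body and both URLs are illustrative only.
require 'rubygems'
require 'robot_rules'

robots_url  = "http://example.com/robots.txt"
robots_file = "User-agent: *\nDisallow: /private\n"

rules = RobotRules.new("RDaneel")     # user agent the rules are evaluated for
rules.parse(robots_url, robots_file)  # same parse call RDaneel makes
rules.allowed?("http://example.com/")         # expected to be true for this file
rules.allowed?("http://example.com/private")  # expected to be false for this file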
data/spec/no_redirects_neither_robots_spec.rb
CHANGED
@@ -2,6 +2,41 @@ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
 
 describe "RDaneel when there are no redirects" do
 
+  describe "when not exist a robots.txt (404) and the url requested is /" do
+    before(:each) do
+      burrito.mount( :path => '/', :status => 200,
+                     :body => 'Hello World!', :block => should_be_hit_once )
+      burrito.mount( :path => '/robots.txt', :status => 404,
+                     :block => should_be_hit_once )
+    end
+
+    after(:each) do
+      burrito.unmount('/')
+      burrito.unmount('/robots.txt')
+    end
+
+    it "should get the content is the url not end with /" do
+
+      EM.run do
+        r = RDaneel.new("http://127.0.0.1:8080")
+        r.callback do
+          r.http_client.response_header.status.should == 200
+          r.http_client.response.should == "Hello World!"
+          r.redirects.should be_empty
+          EM.stop
+        end
+        r.errback do
+          fail
+          EM.stop
+        end
+        r.get
+      end
+
+    end
+  end
+
+
+
   describe "when a successfull status different than 200 is issued for robots.txt" do
 
     it "should get the content ignoring the redirect"
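The spec above requests a bare http://127.0.0.1:8080 with no path, which is what the new path normalization in initialize is for. A quick Addressable-only illustration using the URL from the spec; the behaviour shown is plain Addressable, not RDaneel.

# A bare host URL parses with an empty path; the guard mirrors the one
# added to RDaneel#initialize in this release.
require 'rubygems'
require 'addressable/uri'

uri = Addressable::URI.parse("http://127.0.0.1:8080")
uri.path                                           # => ""
uri.path = "/" if uri.path.nil? || uri.path == ""
uri.to_s                                           # => "http://127.0.0.1:8080/"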
data/spec/spec_helper.rb
CHANGED
@@ -3,7 +3,6 @@ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
 require 'rubygems'
 require 'rdaneel'
 require 'spec'
-require 'spec/autorun'
 require 'webrick'
 
 Spec::Runner.configure do |config|
@@ -33,6 +32,7 @@ class Burrito
       @server.start
     }
     @server
+    sleep 0.5
   end
 
   def mount( opts )
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
   segments:
   - 0
   - 1
-  - 0
-  version: 0.1.0
+  - 2
+  version: 0.1.2
 platform: ruby
 authors:
 - Edgar Gonzalez
@@ -15,7 +15,7 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2010-07-
+date: 2010-07-21 00:00:00 -04:30
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -42,8 +42,8 @@ dependencies:
         segments:
         - 0
         - 9
-        -
-        version: 0.9.
+        - 3
+        version: 0.9.3
   type: :runtime
   version_requirements: *id002
 - !ruby/object:Gem::Dependency
|