webrobots 0.0.4 → 0.0.5
- data/VERSION +1 -1
- data/lib/webrobots.rb +21 -9
- data/test/test_webrobots.rb +43 -2
- data/webrobots.gemspec +1 -1
- metadata +3 -3
data/VERSION
CHANGED
@@ -1 +1 @@
-0.0.4
+0.0.5
data/lib/webrobots.rb
CHANGED
@@ -13,8 +13,10 @@ class WebRobots
   #
   # * :http_get => a custom method, proc, or anything that responds to
   #   .call(uri), to be used for fetching robots.txt. It must return
-  #   the response body if successful
-  #
+  #   the response body if successful. If the resource is not found,
+  #   it must either return nil or emulate a Net::HTTPNotFound error
+  #   that the net/http library would raise, using
+  #   Net::HTTPServerException. Any other error raised is regarded as
   #   blanket ban.
   def initialize(user_agent, options = nil)
     @user_agent = user_agent
@@ -26,6 +28,12 @@ class WebRobots
     @robotstxt = {}
   end
 
+  @@anon_parser = RobotsTxt::Parser.new('Anonymous')
+  @@disallower = @@anon_parser.parse(<<-TXT, nil)
+User-Agent: *
+Disallow: /
+  TXT
+
   # Returns the robot name initially given.
   attr_reader :user_agent
 
@@ -95,16 +103,20 @@ class WebRobots
   def robots_txt(site)
     cache_robots_txt(site) {
       fetch_robots_txt(site)
-    }
+    } or @@disallower
   end
 
   def fetch_robots_txt(site)
-
-
-
-
-
-
+    body =
+      begin
+        @http_get.call(site + 'robots.txt')
+      rescue => e
+        if e.is_a?(Net::HTTPExceptions) && e.response.is_a?(Net::HTTPNotFound)
+          ''
+        else
+          nil
+        end
+      end and @parser.parse(body, site)
   end
 
   def cache_robots_txt(site, &block)
data/test/test_webrobots.rb
CHANGED
@@ -25,7 +25,9 @@ class TestWebRobots < Test::Unit::TestCase
 
             TXT
           when 'http://site5.example.org/robots.txt'
-            raise Net::
+            raise Net::HTTPServerException.new(
+              'Not Found',
+              Net::HTTPNotFound.new('1.1', '404', 'Not Found'))
           else
             raise "#{uri} is not supposed to be fetched"
           end
@@ -41,6 +43,42 @@ class TestWebRobots < Test::Unit::TestCase
       assert @robots.allowed?('http://site3.example.org/private/secret.txt')
       assert @robots.allowed?('http://site4.example.org/index.html')
       assert @robots.allowed?('http://site4.example.org/private/secret.txt')
+      assert @robots.allowed?('http://site5.example.org/index.html')
+      assert @robots.allowed?('http://site5.example.org/private/secret.txt')
+    end
+  end
+
+  context "robots.txt that cannot be fetched" do
+    setup do
+      @robots = WebRobots.new('RandomBot', :http_get => lambda { |uri|
+          case uri.to_s
+          when 'http://site1.example.org/robots.txt'
+            raise Net::HTTPFatalError.new(
+              'Internal Server Error',
+              Net::HTTPInternalServerError.new('1.1', '500', 'Internal Server Error'))
+          when 'http://site2.example.org/robots.txt'
+            raise Net::HTTPRetriableError.new(
+              'Found',
+              Net::HTTPFound.new('1.1', '302', 'Found'))
+          when 'http://site3.example.org/robots.txt'
+            raise Errno::ECONNREFUSED
+          when 'http://site4.example.org/robots.txt'
+            raise SocketError, "getaddrinfo: nodename nor servname provided, or not known"
+          else
+            raise "#{uri} is not supposed to be fetched"
+          end
+        })
+    end
+
+    should "disallow any robot" do
+      assert @robots.disallowed?('http://site1.example.org/index.html')
+      assert @robots.disallowed?('http://site1.example.org/private/secret.txt')
+      assert @robots.disallowed?('http://site2.example.org/index.html')
+      assert @robots.disallowed?('http://site2.example.org/private/secret.txt')
+      assert @robots.disallowed?('http://site3.example.org/index.html')
+      assert @robots.disallowed?('http://site3.example.org/private/secret.txt')
+      assert @robots.disallowed?('http://site4.example.org/index.html')
+      assert @robots.disallowed?('http://site4.example.org/private/secret.txt')
     end
   end
 
@@ -264,9 +302,12 @@ Crawl-Delay: 1.5
 User-Agent: *
 Disallow: /2heavy/
 Allow: /2heavy/*.html
+# These are wrong but should be allowed
+Allow: /2heavy/%
+Crawl-Delay:
+#
 Option1: Bar
 Option3: Hi
-Crawl-Delay:
             TXT
           else
            raise "#{uri} is not supposed to be fetched"
data/webrobots.gemspec
CHANGED
metadata
CHANGED
@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: webrobots
 version: !ruby/object:Gem::Version
-  hash:
+  hash: 21
   prerelease:
   segments:
   - 0
   - 0
-  - 4
-  version: 0.0.4
+  - 5
+  version: 0.0.5
 platform: ruby
 authors:
 - Akinori MUSHA