webrobots 0.0.4 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/VERSION +1 -1
- data/lib/webrobots.rb +21 -9
- data/test/test_webrobots.rb +43 -2
- data/webrobots.gemspec +1 -1
- metadata +3 -3
data/VERSION
CHANGED
@@ -1 +1 @@
-0.0.4
+0.0.5
data/lib/webrobots.rb
CHANGED
@@ -13,8 +13,10 @@ class WebRobots
   #
   # * :http_get => a custom method, proc, or anything that responds to
   #   .call(uri), to be used for fetching robots.txt. It must return
-  #   the response body if successful
-  #
+  #   the response body if successful. If the resource is not found,
+  #   it must either return nil or emulate a Net::HTTPNotFound error
+  #   that the net/http library would raise, using
+  #   Net::HTTPServerException. Any other error raised is regarded as
   #   blanket ban.
   def initialize(user_agent, options = nil)
     @user_agent = user_agent
@@ -26,6 +28,12 @@ class WebRobots
     @robotstxt = {}
   end
 
+  @@anon_parser = RobotsTxt::Parser.new('Anonymous')
+  @@disallower = @@anon_parser.parse(<<-TXT, nil)
+User-Agent: *
+Disallow: /
+  TXT
+
   # Returns the robot name initially given.
   attr_reader :user_agent
 
@@ -95,16 +103,20 @@ class WebRobots
   def robots_txt(site)
     cache_robots_txt(site) {
       fetch_robots_txt(site)
-    }
+    } or @@disallower
   end
 
   def fetch_robots_txt(site)
-
-
-
-
-
-
+    body =
+      begin
+        @http_get.call(site + 'robots.txt')
+      rescue => e
+        if e.is_a?(Net::HTTPExceptions) && e.response.is_a?(Net::HTTPNotFound)
+          ''
+        else
+          nil
+        end
+      end and @parser.parse(body, site)
   end
 
   def cache_robots_txt(site, &block)
data/test/test_webrobots.rb
CHANGED
@@ -25,7 +25,9 @@ class TestWebRobots < Test::Unit::TestCase
 
 TXT
           when 'http://site5.example.org/robots.txt'
-            raise Net::
+            raise Net::HTTPServerException.new(
+              'Not Found',
+              Net::HTTPNotFound.new('1.1', '404', 'Not Found'))
           else
             raise "#{uri} is not supposed to be fetched"
           end
@@ -41,6 +43,42 @@ class TestWebRobots < Test::Unit::TestCase
       assert @robots.allowed?('http://site3.example.org/private/secret.txt')
       assert @robots.allowed?('http://site4.example.org/index.html')
       assert @robots.allowed?('http://site4.example.org/private/secret.txt')
+      assert @robots.allowed?('http://site5.example.org/index.html')
+      assert @robots.allowed?('http://site5.example.org/private/secret.txt')
+    end
+  end
+
+  context "robots.txt that cannot be fetched" do
+    setup do
+      @robots = WebRobots.new('RandomBot', :http_get => lambda { |uri|
+          case uri.to_s
+          when 'http://site1.example.org/robots.txt'
+            raise Net::HTTPFatalError.new(
+              'Internal Server Error',
+              Net::HTTPInternalServerError.new('1.1', '500', 'Internal Server Error'))
+          when 'http://site2.example.org/robots.txt'
+            raise Net::HTTPRetriableError.new(
+              'Found',
+              Net::HTTPFound.new('1.1', '302', 'Found'))
+          when 'http://site3.example.org/robots.txt'
+            raise Errno::ECONNREFUSED
+          when 'http://site4.example.org/robots.txt'
+            raise SocketError, "getaddrinfo: nodename nor servname provided, or not known"
+          else
+            raise "#{uri} is not supposed to be fetched"
+          end
+        })
+    end
+
+    should "disallow any robot" do
+      assert @robots.disallowed?('http://site1.example.org/index.html')
+      assert @robots.disallowed?('http://site1.example.org/private/secret.txt')
+      assert @robots.disallowed?('http://site2.example.org/index.html')
+      assert @robots.disallowed?('http://site2.example.org/private/secret.txt')
+      assert @robots.disallowed?('http://site3.example.org/index.html')
+      assert @robots.disallowed?('http://site3.example.org/private/secret.txt')
+      assert @robots.disallowed?('http://site4.example.org/index.html')
+      assert @robots.disallowed?('http://site4.example.org/private/secret.txt')
     end
   end
 
@@ -264,9 +302,12 @@ Crawl-Delay: 1.5
 User-Agent: *
 Disallow: /2heavy/
 Allow: /2heavy/*.html
+# These are wrong but should be allowed
+Allow: /2heavy/%
+Crawl-Delay:
+#
 Option1: Bar
 Option3: Hi
-Crawl-Delay:
 TXT
           else
             raise "#{uri} is not supposed to be fetched"
data/webrobots.gemspec
CHANGED
metadata
CHANGED
@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: webrobots
 version: !ruby/object:Gem::Version
-  hash:
+  hash: 21
   prerelease:
   segments:
   - 0
   - 0
-  - 4
-  version: 0.0.4
+  - 5
+  version: 0.0.5
 platform: ruby
 authors:
 - Akinori MUSHA
|