webrobots 0.0.5 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/VERSION +1 -1
- data/lib/webrobots.rb +47 -36
- data/lib/webrobots/robotstxt.rb +23 -4
- data/lib/webrobots/robotstxt.ry +23 -4
- data/test/test_webrobots.rb +96 -17
- data/webrobots.gemspec +2 -2
- metadata +4 -4
data/VERSION
CHANGED
@@ -1 +1 @@
  1     | - 0.0.5
      1 | + 0.0.6
data/lib/webrobots.rb
CHANGED
@@ -13,11 +13,9 @@ class WebRobots
 13  13 |   #
 14  14 |   # * :http_get => a custom method, proc, or anything that responds to
 15  15 |   #   .call(uri), to be used for fetching robots.txt. It must return
 16     | -  #   the response body if successful
 17     | -  #
 18     | -  #
 19     | -  #   Net::HTTPServerException. Any other error raised is regarded as
 20     | -  #   blanket ban.
     16 | +  #   the response body if successful, return an empty string if the
     17 | +  #   resource is not found, and return nil or raise any error on
     18 | +  #   failure. Redirects should be handled within this proc.
 21  19 |   def initialize(user_agent, options = nil)
 22  20 |     @user_agent = user_agent
 23  21 |     @parser = RobotsTxt::Parser.new(user_agent)

@@ -25,14 +23,13 @@ class WebRobots
 25  23 |     options ||= {}
 26  24 |     @http_get = options[:http_get] || method(:http_get)
 27  25 |
 28     | -    @robotstxt =
     26 | +    @robotstxt = create_cache()
 29  27 |   end
 30  28 |
 31     | -
 32     | -
 33     | -
 34     | -
 35     | -  TXT
     29 | +  # :nodoc:
     30 | +  def create_cache
     31 | +    Hash.new # Must respond to [], []=, and delete.
     32 | +  end
 36  33 |
 37  34 |   # Returns the robot name initially given.
 38  35 |   attr_reader :user_agent

@@ -42,9 +39,9 @@ Disallow: /
 42  39 |   # a relative URI or a non-HTTP/HTTPS URI is given, ArgumentError is
 43  40 |   # raised.
 44  41 |   def allowed?(url)
 45     | -
     42 | +    robots_txt, request_uri = evaluate(url)
 46  43 |     return true if request_uri == '/robots.txt'
 47     | -    robots_txt
     44 | +    robots_txt.allow?(request_uri)
 48  45 |   end
 49  46 |
 50  47 |   # Equivalent to !allowed?(url).

@@ -56,8 +53,7 @@ Disallow: /
 56  53 |   # with each field name lower-cased. See allowed?() for a list of
 57  54 |   # errors that may be raised.
 58  55 |   def options(url)
 59     | -
 60     | -    robots_txt(site).options
     56 | +    robots_txt_for(url).options
 61  57 |   end
 62  58 |
 63  59 |   # Equivalent to option(url)[token.downcase].

@@ -68,8 +64,25 @@ Disallow: /
 68  64 |   # Returns an array of Sitemap URLs. See allowed?() for a list of
 69  65 |   # errors that may be raised.
 70  66 |   def sitemaps(url)
     67 | +    robots_txt_for(url).sitemaps
     68 | +  end
     69 | +
     70 | +  # Returns an error object if there is an error in fetching or
     71 | +  # parsing robots.txt of the site +url+.
     72 | +  def error(url)
     73 | +    robots_txt_for(url).error
     74 | +  end
     75 | +
     76 | +  # Raises the error if there was an error in fetching or parsing
     77 | +  # robots.txt of the site +url+.
     78 | +  def error!(url)
     79 | +    robots_txt_for(url).error!
     80 | +  end
     81 | +
     82 | +  # Removes robots.txt cache for the site +url+.
     83 | +  def reset(url)
 71  84 |     site, = split_uri(url)
 72     | -
     85 | +    @robotstxt.delete(site)
 73  86 |   end
 74  87 |
 75  88 |   private

@@ -100,31 +113,27 @@ Disallow: /
100 113 |     return site, request_uri
101 114 |   end
102 115 |
103     | -  def
104     | -
105     | -
106     | -    } or @@disallower
    116 | +  def evaluate(url)
    117 | +    site, request_uri = split_uri(url)
    118 | +    return get_robots_txt(site), request_uri
107 119 |   end
108 120 |
109     | -  def
110     | -
111     | -
112     | -
113     | -
114     | -
115     | -
116     | -    else
117     | -      nil
118     | -    end
119     | -  end and @parser.parse(body, site)
    121 | +  def robots_txt_for(url)
    122 | +    site, = split_uri(url)
    123 | +    get_robots_txt(site)
    124 | +  end
    125 | +
    126 | +  def get_robots_txt(site)
    127 | +    @robotstxt[site] ||= fetch_robots_txt(site)
120 128 |   end
121 129 |
122     | -  def
123     | -
124     | -    @
125     | -
126     | -
    130 | +  def fetch_robots_txt(site)
    131 | +    begin
    132 | +      body = @http_get.call(site + 'robots.txt') or raise 'robots.txt unfetchable'
    133 | +    rescue => e
    134 | +      return RobotsTxt.unfetchable(site, e, @user_agent)
127 135 |     end
    136 | +    @parser.parse!(body, site)
128 137 |   end
129 138 |
130 139 |   def http_get(uri)

@@ -143,6 +152,8 @@ Disallow: /
143 152 |     when Net::HTTPRedirection
144 153 |       referer = uri.to_s
145 154 |       uri = URI(response['location'])
    155 | +    when Net::HTTPNotFound
    156 | +      return ''
146 157 |     else
147 158 |       response.value
148 159 |     end
data/lib/webrobots/robotstxt.rb
CHANGED
@@ -27,8 +27,10 @@ module_eval(<<'...end robotstxt.ry/module_eval...', 'robotstxt.ry', 164)
 27  27 |       @target = target
 28  28 |     end
 29  29 |
 30     | -    def
 31     | -
     30 | +    def parse!(input, site)
     31 | +      parse(input, site)
     32 | +    rescue Error => e
     33 | +      RobotsTxt.new(site, nil, :error => e, :target => @target)
 32  34 |     end
 33  35 |
 34  36 |     KNOWN_TOKENS = %w[User-agent Allow Disallow Crawl-delay Sitemap]

@@ -519,11 +521,12 @@ end
519 521 |   end # class Parser
520 522 |
521 523 |   def initialize(site, records, options = nil)
522     | -
    524 | +    @timestamp = Time.now
523 525 |     @site = site
524 526 |     @options = options || {}
525 527 |     @last_checked = nil
526 528 |
    529 | +    @error = @options[:error]
527 530 |     @target = @options[:target]
528 531 |     @sitemaps = @options[:sitemaps] || []
529 532 |

@@ -542,7 +545,12 @@ end # class Parser
542 545 |     end
543 546 |   end
544 547 |
545     | -  attr_reader :site, :sitemaps
    548 | +  attr_reader :timestamp, :site, :sitemaps
    549 | +  attr_accessor :error
    550 | +
    551 | +  def error!
    552 | +    raise @error if @error
    553 | +  end
546 554 |
547 555 |   def target(user_agent = nil)
548 556 |     if user_agent

@@ -579,6 +587,17 @@ end # class Parser
579 587 |     record.options
580 588 |   end
581 589 |
    590 | +  DISALLOW_ALL = <<-TXT
    591 | +User-Agent: *
    592 | +Disallow: /
    593 | +  TXT
    594 | +
    595 | +  def self.unfetchable(site, reason, target = nil)
    596 | +    Parser.new(target).parse(DISALLOW_ALL, site).tap { |robots_txt|
    597 | +      robots_txt.error = reason
    598 | +    }
    599 | +  end
    600 | +
582 601 |   class Record
583 602 |     def initialize(agentlines, rulelines)
584 603 |       @patterns = agentlines.map { |agentline| agentline.pattern }
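
These RobotsTxt changes are what back the new WebRobots#error, #error! and #reset methods: Parser#parse! catches parse errors and returns a RobotsTxt carrying the exception, while RobotsTxt.unfetchable builds a disallow-all ruleset for fetch failures. A hedged usage sketch follows; the bot name and URL are invented, and the comments describe the behaviour as it appears in this diff and its tests.

  require 'webrobots'

  robots = WebRobots.new('MyBot/1.0')
  url = 'http://www.example.org/somewhere/page.html'

  allowed = robots.allowed?(url)
  puts "#{url}: #{allowed ? 'allowed' : 'disallowed'}"

  if (e = robots.error(url))
    # A fetch failure is cached as a disallow-all robots.txt (via
    # RobotsTxt.unfetchable); a parse failure is cached with no rules.
    # In both cases the underlying exception is kept for inspection.
    warn "robots.txt problem for #{url}: #{e.class}: #{e.message}"
    robots.reset(url)  # drop the cached entry so the next call re-fetches
  end

  # error! raises the stored exception instead of returning it:
  #   robots.error!(url)  # e.g. raises WebRobots::ParseError
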
data/lib/webrobots/robotstxt.ry
CHANGED
@@ -167,8 +167,10 @@ class WebRobots
167 167 |       @target = target
168 168 |     end
169 169 |
170     | -    def
171     | -
    170 | +    def parse!(input, site)
    171 | +      parse(input, site)
    172 | +    rescue Error => e
    173 | +      RobotsTxt.new(site, nil, :error => e, :target => @target)
172 174 |     end
173 175 |
174 176 |     KNOWN_TOKENS = %w[User-agent Allow Disallow Crawl-delay Sitemap]

@@ -249,11 +251,12 @@ class WebRobots
249 251 |
250 252 | ---- footer
251 253 |   def initialize(site, records, options = nil)
252     | -
    254 | +    @timestamp = Time.now
253 255 |     @site = site
254 256 |     @options = options || {}
255 257 |     @last_checked = nil
256 258 |
    259 | +    @error = @options[:error]
257 260 |     @target = @options[:target]
258 261 |     @sitemaps = @options[:sitemaps] || []
259 262 |

@@ -272,7 +275,12 @@ class WebRobots
272 275 |     end
273 276 |   end
274 277 |
275     | -  attr_reader :site, :sitemaps
    278 | +  attr_reader :timestamp, :site, :sitemaps
    279 | +  attr_accessor :error
    280 | +
    281 | +  def error!
    282 | +    raise @error if @error
    283 | +  end
276 284 |
277 285 |   def target(user_agent = nil)
278 286 |     if user_agent

@@ -309,6 +317,17 @@ class WebRobots
309 317 |     record.options
310 318 |   end
311 319 |
    320 | +  DISALLOW_ALL = <<-TXT
    321 | +User-Agent: *
    322 | +Disallow: /
    323 | +  TXT
    324 | +
    325 | +  def self.unfetchable(site, reason, target = nil)
    326 | +    Parser.new(target).parse(DISALLOW_ALL, site).tap { |robots_txt|
    327 | +      robots_txt.error = reason
    328 | +    }
    329 | +  end
    330 | +
312 331 |   class Record
313 332 |     def initialize(agentlines, rulelines)
314 333 |       @patterns = agentlines.map { |agentline| agentline.pattern }
data/test/test_webrobots.rb
CHANGED
@@ -24,10 +24,6 @@ class TestWebRobots < Test::Unit::TestCase
 24  24 | #comment
 25  25 |
 26  26 |           TXT
 27     | -        when 'http://site5.example.org/robots.txt'
 28     | -          raise Net::HTTPServerException.new(
 29     | -            'Not Found',
 30     | -            Net::HTTPNotFound.new('1.1', '404', 'Not Found'))
 31  27 |         else
 32  28 |           raise "#{uri} is not supposed to be fetched"
 33  29 |         end

@@ -43,8 +39,6 @@ class TestWebRobots < Test::Unit::TestCase
 43  39 |       assert @robots.allowed?('http://site3.example.org/private/secret.txt')
 44  40 |       assert @robots.allowed?('http://site4.example.org/index.html')
 45  41 |       assert @robots.allowed?('http://site4.example.org/private/secret.txt')
 46     | -      assert @robots.allowed?('http://site5.example.org/index.html')
 47     | -      assert @robots.allowed?('http://site5.example.org/private/secret.txt')
 48  42 |     end
 49  43 |   end
 50  44 |

@@ -64,6 +58,8 @@ class TestWebRobots < Test::Unit::TestCase
 64  58 |           raise Errno::ECONNREFUSED
 65  59 |         when 'http://site4.example.org/robots.txt'
 66  60 |           raise SocketError, "getaddrinfo: nodename nor servname provided, or not known"
     61 | +        when 'http://site5.example.org/robots.txt'
     62 | +          nil
 67  63 |         else
 68  64 |           raise "#{uri} is not supposed to be fetched"
 69  65 |         end

@@ -79,6 +75,8 @@ class TestWebRobots < Test::Unit::TestCase
 79  75 |       assert @robots.disallowed?('http://site3.example.org/private/secret.txt')
 80  76 |       assert @robots.disallowed?('http://site4.example.org/index.html')
 81  77 |       assert @robots.disallowed?('http://site4.example.org/private/secret.txt')
     78 | +      assert @robots.disallowed?('http://site5.example.org/index.html')
     79 | +      assert @robots.disallowed?('http://site5.example.org/private/secret.txt')
 82  80 |     end
 83  81 |   end
 84  82 |

@@ -176,8 +174,12 @@ Disallow: /~joe/index.html
176 174 |     should "properly restrict access" do
177 175 |       assert @robots_good.allowed?('http://www.example.org/index.html')
178 176 |       assert !@robots_good.allowed?('http://www.example.org/2heavy/index.php')
    177 | +      assert @robots_good.allowed?('http://www.example.org/2HEAVY/index.php')
    178 | +      assert !@robots_good.allowed?(URI('http://www.example.org/2heavy/index.php'))
179 179 |       assert @robots_good.allowed?('http://www.example.org/2heavy/index.html')
    180 | +      assert @robots_good.allowed?('http://WWW.Example.Org/2heavy/index.html')
180 181 |       assert !@robots_good.allowed?('http://www.example.org/2heavy/index.htm')
    182 | +      assert !@robots_good.allowed?('http://WWW.Example.Org/2heavy/index.htm')
181 183 |
182 184 |       assert !@robots_evil.allowed?('http://www.example.org/index.html')
183 185 |       assert !@robots_evil.allowed?('http://www.example.org/2heavy/index.php')

@@ -234,38 +236,73 @@ Disallow: /~joe/index.html
234 236 |
235 237 |   context "robots.txt with errors" do
236 238 |     setup do
    239 | +      @turn1 = @turn2 = 0
237 240 |       @http_get = lambda { |uri|
238 241 |         case uri.to_s
239 242 |         when 'http://www.example.org/robots.txt'
240     | -
    243 | +          if (@turn1 += 1) % 2 == 1
    244 | +            <<-'TXT'
241 245 | # some comment
242     | -User-Agent:
    246 | +User-Agent: thebot
    247 | +# Disallow: /
    248 | +Disallow: /2heavy/
    249 | +# Allow: /2heavy/notsoheavy
    250 | +Allow: /2heavy/*.html
    251 | +
    252 | +User-Agent: anotherbot
    253 | +# Disallow: /
    254 | +Disallow: /2heavy/
    255 | +# Allow: /2heavy/notsoheavy
    256 | +Allow: /2heavy/*.html
    257 | +            TXT
    258 | +          else
    259 | +            <<-'TXT'
    260 | +# some comment
    261 | +User-Agent: thebot
243 262 | # Disallow: /
244 263 | Disallow: /2heavy/
245 264 | # Allow: /2heavy/notsoheavy
246 265 | Allow: /2heavy/*.html
247 266 | #
248     | -User-Agent:
    267 | +User-Agent: anotherbot
249 268 | # Disallow: /
250 269 | Disallow: /2heavy/
251 270 | # Allow: /2heavy/notsoheavy
252 271 | Allow: /2heavy/*.html
253     | -
    272 | +            TXT
    273 | +          end
254 274 |         when 'http://www.example.com/robots.txt'
255     | -
    275 | +          if (@turn2 += 1) % 2 == 1
    276 | +            <<-'TXT'
256 277 | # some comment
257     | -#User-Agent:
    278 | +#User-Agent: thebot
258 279 | # Disallow: /
259 280 | Disallow: /2heavy/
260 281 | # Allow: /2heavy/notsoheavy
261 282 | Allow: /2heavy/*.html
262 283 |
263     | -User-Agent:
    284 | +User-Agent: anotherbot
264 285 | # Disallow: /
265 286 | Disallow: /2heavy/
266 287 | # Allow: /2heavy/notsoheavy
267 288 | Allow: /2heavy/*.html
268     | -
    289 | +            TXT
    290 | +          else
    291 | +            <<-'TXT'
    292 | +# some comment
    293 | +User-Agent: thebot
    294 | +# Disallow: /
    295 | +Disallow: /2heavy/
    296 | +# Allow: /2heavy/notsoheavy
    297 | +Allow: /2heavy/*.html
    298 | +
    299 | +User-Agent: anotherbot
    300 | +# Disallow: /
    301 | +Disallow: /2heavy/
    302 | +# Allow: /2heavy/notsoheavy
    303 | +Allow: /2heavy/*.html
    304 | +            TXT
    305 | +          end
269 306 |         else
270 307 |           raise "#{uri} is not supposed to be fetched"
271 308 |         end

@@ -273,12 +310,54 @@ Allow: /2heavy/*.html
273 310 |     end
274 311 |
275 312 |     should "raise ParseError" do
276     | -      robots = WebRobots.new('
    313 | +      robots = WebRobots.new('TheBot', :http_get => @http_get)
    314 | +
    315 | +      url = 'http://www.example.org/2heavy/index.php'
    316 | +
    317 | +      assert_nil robots.error(url)
    318 | +      assert !robots.allowed?(url)
    319 | +      assert_nothing_raised {
    320 | +        robots.error!(url)
    321 | +      }
    322 | +
    323 | +      robots.reset(url)
    324 | +
    325 | +      assert robots.allowed?(url)
    326 | +      assert_instance_of WebRobots::ParseError, robots.error(url)
277 327 |       assert_raise(WebRobots::ParseError) {
278     | -        robots.
    328 | +        robots.error!(url)
279 329 |       }
    330 | +
    331 | +      robots.reset(url)
    332 | +
    333 | +      assert_nil robots.error(url)
    334 | +      assert !robots.allowed?(url)
    335 | +      assert_nothing_raised {
    336 | +        robots.error!(url)
    337 | +      }
    338 | +
    339 | +      url = 'http://www.example.com/2heavy/index.php'
    340 | +
    341 | +      assert robots.allowed?(url)
    342 | +      assert_instance_of WebRobots::ParseError, robots.error(url)
    343 | +      assert_raise(WebRobots::ParseError) {
    344 | +        robots.error!(url)
    345 | +      }
    346 | +
    347 | +      robots.reset(url)
    348 | +
    349 | +      assert_nil robots.error(url)
    350 | +      assert !robots.allowed?(url)
    351 | +      assert_nothing_raised {
    352 | +        robots.error!(url)
    353 | +      }
    354 | +
    355 | +      robots.reset(url)
    356 | +
    357 | +      assert robots.allowed?(url)
    358 | +      assert_instance_of WebRobots::ParseError, robots.error(url)
280 359 |       assert_raise(WebRobots::ParseError) {
281     | -        robots.
    360 | +        robots.error!(url)
282 361 |       }
283 362 |     end
284 363 |   end
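
A smaller hook added in lib/webrobots.rb is create_cache, whose comment only requires an object responding to [], []= and delete; #reset, exercised in the tests above, deletes from it. It is marked :nodoc:, so treating it as a stable override point is an assumption. With that caveat, a subclass could plug in its own store, as in this hypothetical sketch (the class names and size limit are made up):

  require 'webrobots'

  class BoundedWebRobots < WebRobots
    MAX_SITES = 100  # arbitrary illustration value

    # Only [], []= and delete are needed by WebRobots, per the diff.
    class BoundedCache < Hash
      def []=(site, robots_txt)
        clear if size >= MAX_SITES  # crude eviction: start over when full
        super
      end
    end

    def create_cache
      BoundedCache.new
    end
  end

  robots = BoundedWebRobots.new('MyBot/1.0')
  robots.allowed?('http://www.example.org/')
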
data/webrobots.gemspec
CHANGED
@@ -5,11 +5,11 @@
  5   5 |
  6   6 | Gem::Specification.new do |s|
  7   7 |   s.name = %q{webrobots}
  8     | -  s.version = "0.0.5"
      8 | +  s.version = "0.0.6"
  9   9 |
 10  10 |   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
 11  11 |   s.authors = ["Akinori MUSHA"]
 12     | -  s.date = %q{2011-01-
     12 | +  s.date = %q{2011-01-09}
 13  13 |   s.description = %q{This library helps write robots.txt compliant web robots in Ruby.
 14  14 | }
 15  15 |   s.email = %q{knu@idaemons.org}
metadata
CHANGED
@@ -1,13 +1,13 @@
  1   1 | --- !ruby/object:Gem::Specification
  2   2 | name: webrobots
  3   3 | version: !ruby/object:Gem::Version
  4     | -  hash:
      4 | +  hash: 19
  5   5 |   prerelease:
  6   6 |   segments:
  7   7 |   - 0
  8   8 |   - 0
  9     | -  - 5
 10     | -  version: 0.0.5
      9 | +  - 6
     10 | +  version: 0.0.6
 11  11 | platform: ruby
 12  12 | authors:
 13  13 | - Akinori MUSHA

@@ -15,7 +15,7 @@ autorequire:
 15  15 | bindir: bin
 16  16 | cert_chain: []
 17  17 |
 18     | -date: 2011-01-
     18 | +date: 2011-01-09 00:00:00 +09:00
 19  19 | default_executable:
 20  20 | dependencies:
 21  21 | - !ruby/object:Gem::Dependency