webrobots 0.0.5 → 0.0.6
- data/VERSION +1 -1
- data/lib/webrobots.rb +47 -36
- data/lib/webrobots/robotstxt.rb +23 -4
- data/lib/webrobots/robotstxt.ry +23 -4
- data/test/test_webrobots.rb +96 -17
- data/webrobots.gemspec +2 -2
- metadata +4 -4
data/VERSION
CHANGED
@@ -1 +1 @@
-0.0.5
+0.0.6
data/lib/webrobots.rb
CHANGED
@@ -13,11 +13,9 @@ class WebRobots
   #
   # * :http_get => a custom method, proc, or anything that responds to
   #   .call(uri), to be used for fetching robots.txt. It must return
-  #   the response body if successful
-  #
-  #
-  #   Net::HTTPServerException. Any other error raised is regarded as
-  #   blanket ban.
+  #   the response body if successful, return an empty string if the
+  #   resource is not found, and return nil or raise any error on
+  #   failure. Redirects should be handled within this proc.
   def initialize(user_agent, options = nil)
     @user_agent = user_agent
     @parser = RobotsTxt::Parser.new(user_agent)
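The rewritten comment pins down the `:http_get` contract: return the body on success, an empty string when robots.txt is missing, and nil (or raise) on any other failure, handling redirects inside the callable. A minimal conforming fetcher might look like the sketch below; the one-hop redirect handling, the `fetcher` name, and the bot name are illustrative assumptions, not part of the gem.

    require 'webrobots'
    require 'net/http'

    # Sketch of a fetcher that follows the 0.0.6 :http_get contract.
    fetcher = lambda { |uri|
      response = Net::HTTP.get_response(uri)
      if response.is_a?(Net::HTTPRedirection)
        # Follow a single redirect hop (kept minimal for illustration).
        response = Net::HTTP.get_response(URI(response['location']))
      end
      case response
      when Net::HTTPSuccess  then response.body  # parsed as the site's robots.txt
      when Net::HTTPNotFound then ''             # no robots.txt: nothing is disallowed
      else nil                                   # unfetchable: treated as a blanket ban
      end
    }

    robots = WebRobots.new('MyBot/1.0', :http_get => fetcher)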
@@ -25,14 +23,13 @@ class WebRobots
     options ||= {}
     @http_get = options[:http_get] || method(:http_get)
 
-    @robotstxt =
+    @robotstxt = create_cache()
   end
 
-
-
-
-
-  TXT
+  # :nodoc:
+  def create_cache
+    Hash.new # Must respond to [], []=, and delete.
+  end
 
   # Returns the robot name initially given.
   attr_reader :user_agent
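The new `create_cache` hook makes the per-site cache replaceable: anything responding to `[]`, `[]=`, and `delete` will do. A sketch of a subclass swapping in a Mutex-guarded cache; `SynchronizedCache` and `ThreadSafeWebRobots` are illustrative names, not part of the gem.

    require 'webrobots'
    require 'thread'

    # Illustrative cache object; only [], []= and delete are required.
    class SynchronizedCache
      def initialize
        @hash = {}
        @lock = Mutex.new
      end

      def [](site)
        @lock.synchronize { @hash[site] }
      end

      def []=(site, robots_txt)
        @lock.synchronize { @hash[site] = robots_txt }
      end

      def delete(site)
        @lock.synchronize { @hash.delete(site) }
      end
    end

    class ThreadSafeWebRobots < WebRobots
      def create_cache
        SynchronizedCache.new
      end
    end

    robots = ThreadSafeWebRobots.new('MyBot/1.0')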
@@ -42,9 +39,9 @@ Disallow: /
   # a relative URI or a non-HTTP/HTTPS URI is given, ArgumentError is
   # raised.
   def allowed?(url)
-
+    robots_txt, request_uri = evaluate(url)
     return true if request_uri == '/robots.txt'
-    robots_txt
+    robots_txt.allow?(request_uri)
   end
 
   # Equivalent to !allowed?(url).
@@ -56,8 +53,7 @@ Disallow: /
   # with each field name lower-cased. See allowed?() for a list of
   # errors that may be raised.
   def options(url)
-
-    robots_txt(site).options
+    robots_txt_for(url).options
   end
 
   # Equivalent to option(url)[token.downcase].
@@ -68,8 +64,25 @@ Disallow: /
   # Returns an array of Sitemap URLs. See allowed?() for a list of
   # errors that may be raised.
   def sitemaps(url)
+    robots_txt_for(url).sitemaps
+  end
+
+  # Returns an error object if there is an error in fetching or
+  # parsing robots.txt of the site +url+.
+  def error(url)
+    robots_txt_for(url).error
+  end
+
+  # Raises the error if there was an error in fetching or parsing
+  # robots.txt of the site +url+.
+  def error!(url)
+    robots_txt_for(url).error!
+  end
+
+  # Removes robots.txt cache for the site +url+.
+  def reset(url)
     site, = split_uri(url)
-
+    @robotstxt.delete(site)
   end
 
   private
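With `error`, `error!`, and `reset` now public, a crawler can inspect why a site is currently restricted and clear the cached entry to retry. A usage sketch; the bot name and URL are placeholders.

    require 'webrobots'

    robots = WebRobots.new('MyBot/1.0')          # placeholder bot name
    url = 'http://example.com/some/page.html'    # placeholder URL

    robots.allowed?(url)        # fetches and caches robots.txt for example.com

    if (e = robots.error(url))  # non-nil if fetching or parsing robots.txt failed
      warn "robots.txt problem: #{e.inspect}"
      robots.reset(url)         # drop the cached entry so the next call refetches
    end

    robots.error!(url)          # alternative: raises the stored error, if any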
@@ -100,31 +113,27 @@ Disallow: /
     return site, request_uri
   end
 
-  def
-
-
-    } or @@disallower
+  def evaluate(url)
+    site, request_uri = split_uri(url)
+    return get_robots_txt(site), request_uri
   end
 
-  def
-
-
-
-
-
-
-    else
-      nil
-    end
-  end and @parser.parse(body, site)
+  def robots_txt_for(url)
+    site, = split_uri(url)
+    get_robots_txt(site)
+  end
+
+  def get_robots_txt(site)
+    @robotstxt[site] ||= fetch_robots_txt(site)
   end
 
-  def
-
-    @
-
-
+  def fetch_robots_txt(site)
+    begin
+      body = @http_get.call(site + 'robots.txt') or raise 'robots.txt unfetchable'
+    rescue => e
+      return RobotsTxt.unfetchable(site, e, @user_agent)
     end
+    @parser.parse!(body, site)
   end
 
   def http_get(uri)
@@ -143,6 +152,8 @@ Disallow: /
     when Net::HTTPRedirection
       referer = uri.to_s
       uri = URI(response['location'])
+    when Net::HTTPNotFound
+      return ''
     else
       response.value
     end
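With the built-in fetcher, a 404 now yields an empty robots.txt (everything allowed) instead of an exception, while a nil return or raised error still produces a blanket disallow via `RobotsTxt.unfetchable`. A sketch with stub fetchers, mirroring the site5 changes in the test file further down; the variable names are illustrative.

    require 'webrobots'

    # Stub fetchers to illustrate the two failure modes (sketch only).
    allow_all = WebRobots.new('MyBot', :http_get => lambda { |uri| '' })  # as if 404 / no robots.txt
    deny_all  = WebRobots.new('MyBot', :http_get => lambda { |uri| nil }) # as if the fetch failed

    allow_all.allowed?('http://site5.example.org/index.html')  # => true
    deny_all.allowed?('http://site5.example.org/index.html')   # => false
    deny_all.error('http://site5.example.org/index.html')      # => the stored RuntimeError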
data/lib/webrobots/robotstxt.rb
CHANGED
@@ -27,8 +27,10 @@ module_eval(<<'...end robotstxt.ry/module_eval...', 'robotstxt.ry', 164)
         @target = target
       end
 
-      def
-
+      def parse!(input, site)
+        parse(input, site)
+      rescue Error => e
+        RobotsTxt.new(site, nil, :error => e, :target => @target)
       end
 
       KNOWN_TOKENS = %w[User-agent Allow Disallow Crawl-delay Sitemap]
@@ -519,11 +521,12 @@ end
     end # class Parser
 
     def initialize(site, records, options = nil)
-
+      @timestamp = Time.now
       @site = site
       @options = options || {}
       @last_checked = nil
 
+      @error = @options[:error]
       @target = @options[:target]
       @sitemaps = @options[:sitemaps] || []
 
@@ -542,7 +545,12 @@ end # class Parser
       end
     end
 
-    attr_reader :site, :sitemaps
+    attr_reader :timestamp, :site, :sitemaps
+    attr_accessor :error
+
+    def error!
+      raise @error if @error
+    end
 
     def target(user_agent = nil)
       if user_agent
@@ -579,6 +587,17 @@ end # class Parser
       record.options
     end
 
+    DISALLOW_ALL = <<-TXT
+User-Agent: *
+Disallow: /
+    TXT
+
+    def self.unfetchable(site, reason, target = nil)
+      Parser.new(target).parse(DISALLOW_ALL, site).tap { |robots_txt|
+        robots_txt.error = reason
+      }
+    end
+
     class Record
       def initialize(agentlines, rulelines)
         @patterns = agentlines.map { |agentline| agentline.pattern }
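`RobotsTxt.unfetchable` builds the fallback object used when robots.txt cannot be retrieved: a parsed disallow-all document with the triggering error attached. A sketch of how it behaves when called directly; the site URI, error, and bot name are placeholders.

    require 'webrobots'
    require 'uri'

    site  = URI('http://example.com/')                  # placeholder site
    cause = RuntimeError.new('robots.txt unfetchable')  # placeholder error

    robots_txt = WebRobots::RobotsTxt.unfetchable(site, cause, 'MyBot')
    robots_txt.allow?('/any/path')  # => false; DISALLOW_ALL was parsed in
    robots_txt.error                # => cause
    robots_txt.error!               # raises cause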
data/lib/webrobots/robotstxt.ry
CHANGED
@@ -167,8 +167,10 @@ class WebRobots
         @target = target
       end
 
-      def
-
+      def parse!(input, site)
+        parse(input, site)
+      rescue Error => e
+        RobotsTxt.new(site, nil, :error => e, :target => @target)
       end
 
       KNOWN_TOKENS = %w[User-agent Allow Disallow Crawl-delay Sitemap]
@@ -249,11 +251,12 @@ class WebRobots
 
 ---- footer
     def initialize(site, records, options = nil)
-
+      @timestamp = Time.now
       @site = site
       @options = options || {}
       @last_checked = nil
 
+      @error = @options[:error]
       @target = @options[:target]
       @sitemaps = @options[:sitemaps] || []
 
@@ -272,7 +275,12 @@ class WebRobots
       end
     end
 
-    attr_reader :site, :sitemaps
+    attr_reader :timestamp, :site, :sitemaps
+    attr_accessor :error
+
+    def error!
+      raise @error if @error
+    end
 
     def target(user_agent = nil)
       if user_agent
@@ -309,6 +317,17 @@ class WebRobots
       record.options
     end
 
+    DISALLOW_ALL = <<-TXT
+User-Agent: *
+Disallow: /
+    TXT
+
+    def self.unfetchable(site, reason, target = nil)
+      Parser.new(target).parse(DISALLOW_ALL, site).tap { |robots_txt|
+        robots_txt.error = reason
+      }
+    end
+
     class Record
       def initialize(agentlines, rulelines)
         @patterns = agentlines.map { |agentline| agentline.pattern }
data/test/test_webrobots.rb
CHANGED
@@ -24,10 +24,6 @@ class TestWebRobots < Test::Unit::TestCase
 #comment
 
         TXT
-      when 'http://site5.example.org/robots.txt'
-        raise Net::HTTPServerException.new(
-          'Not Found',
-          Net::HTTPNotFound.new('1.1', '404', 'Not Found'))
       else
         raise "#{uri} is not supposed to be fetched"
       end
@@ -43,8 +39,6 @@ class TestWebRobots < Test::Unit::TestCase
       assert @robots.allowed?('http://site3.example.org/private/secret.txt')
       assert @robots.allowed?('http://site4.example.org/index.html')
       assert @robots.allowed?('http://site4.example.org/private/secret.txt')
-      assert @robots.allowed?('http://site5.example.org/index.html')
-      assert @robots.allowed?('http://site5.example.org/private/secret.txt')
     end
   end
 
@@ -64,6 +58,8 @@ class TestWebRobots < Test::Unit::TestCase
         raise Errno::ECONNREFUSED
       when 'http://site4.example.org/robots.txt'
         raise SocketError, "getaddrinfo: nodename nor servname provided, or not known"
+      when 'http://site5.example.org/robots.txt'
+        nil
       else
         raise "#{uri} is not supposed to be fetched"
       end
@@ -79,6 +75,8 @@ class TestWebRobots < Test::Unit::TestCase
       assert @robots.disallowed?('http://site3.example.org/private/secret.txt')
       assert @robots.disallowed?('http://site4.example.org/index.html')
       assert @robots.disallowed?('http://site4.example.org/private/secret.txt')
+      assert @robots.disallowed?('http://site5.example.org/index.html')
+      assert @robots.disallowed?('http://site5.example.org/private/secret.txt')
     end
   end
 
@@ -176,8 +174,12 @@ Disallow: /~joe/index.html
     should "properly restrict access" do
       assert @robots_good.allowed?('http://www.example.org/index.html')
       assert !@robots_good.allowed?('http://www.example.org/2heavy/index.php')
+      assert @robots_good.allowed?('http://www.example.org/2HEAVY/index.php')
+      assert !@robots_good.allowed?(URI('http://www.example.org/2heavy/index.php'))
       assert @robots_good.allowed?('http://www.example.org/2heavy/index.html')
+      assert @robots_good.allowed?('http://WWW.Example.Org/2heavy/index.html')
       assert !@robots_good.allowed?('http://www.example.org/2heavy/index.htm')
+      assert !@robots_good.allowed?('http://WWW.Example.Org/2heavy/index.htm')
 
       assert !@robots_evil.allowed?('http://www.example.org/index.html')
       assert !@robots_evil.allowed?('http://www.example.org/2heavy/index.php')
@@ -234,38 +236,73 @@ Disallow: /~joe/index.html
 
   context "robots.txt with errors" do
     setup do
+      @turn1 = @turn2 = 0
       @http_get = lambda { |uri|
         case uri.to_s
         when 'http://www.example.org/robots.txt'
-
+          if (@turn1 += 1) % 2 == 1
+            <<-'TXT'
 # some comment
-User-Agent:
+User-Agent: thebot
+# Disallow: /
+Disallow: /2heavy/
+# Allow: /2heavy/notsoheavy
+Allow: /2heavy/*.html
+
+User-Agent: anotherbot
+# Disallow: /
+Disallow: /2heavy/
+# Allow: /2heavy/notsoheavy
+Allow: /2heavy/*.html
+            TXT
+          else
+            <<-'TXT'
+# some comment
+User-Agent: thebot
 # Disallow: /
 Disallow: /2heavy/
 # Allow: /2heavy/notsoheavy
 Allow: /2heavy/*.html
 #
-User-Agent:
+User-Agent: anotherbot
 # Disallow: /
 Disallow: /2heavy/
 # Allow: /2heavy/notsoheavy
 Allow: /2heavy/*.html
-
+            TXT
+          end
         when 'http://www.example.com/robots.txt'
-
+          if (@turn2 += 1) % 2 == 1
+            <<-'TXT'
 # some comment
-#User-Agent:
+#User-Agent: thebot
 # Disallow: /
 Disallow: /2heavy/
 # Allow: /2heavy/notsoheavy
 Allow: /2heavy/*.html
 
-User-Agent:
+User-Agent: anotherbot
 # Disallow: /
 Disallow: /2heavy/
 # Allow: /2heavy/notsoheavy
 Allow: /2heavy/*.html
-
+            TXT
+          else
+            <<-'TXT'
+# some comment
+User-Agent: thebot
+# Disallow: /
+Disallow: /2heavy/
+# Allow: /2heavy/notsoheavy
+Allow: /2heavy/*.html
+
+User-Agent: anotherbot
+# Disallow: /
+Disallow: /2heavy/
+# Allow: /2heavy/notsoheavy
+Allow: /2heavy/*.html
+            TXT
+          end
         else
           raise "#{uri} is not supposed to be fetched"
         end
@@ -273,12 +310,54 @@ Allow: /2heavy/*.html
     end
 
     should "raise ParseError" do
-      robots = WebRobots.new('
+      robots = WebRobots.new('TheBot', :http_get => @http_get)
+
+      url = 'http://www.example.org/2heavy/index.php'
+
+      assert_nil robots.error(url)
+      assert !robots.allowed?(url)
+      assert_nothing_raised {
+        robots.error!(url)
+      }
+
+      robots.reset(url)
+
+      assert robots.allowed?(url)
+      assert_instance_of WebRobots::ParseError, robots.error(url)
       assert_raise(WebRobots::ParseError) {
-        robots.
+        robots.error!(url)
       }
+
+      robots.reset(url)
+
+      assert_nil robots.error(url)
+      assert !robots.allowed?(url)
+      assert_nothing_raised {
+        robots.error!(url)
+      }
+
+      url = 'http://www.example.com/2heavy/index.php'
+
+      assert robots.allowed?(url)
+      assert_instance_of WebRobots::ParseError, robots.error(url)
+      assert_raise(WebRobots::ParseError) {
+        robots.error!(url)
+      }
+
+      robots.reset(url)
+
+      assert_nil robots.error(url)
+      assert !robots.allowed?(url)
+      assert_nothing_raised {
+        robots.error!(url)
+      }
+
+      robots.reset(url)
+
+      assert robots.allowed?(url)
+      assert_instance_of WebRobots::ParseError, robots.error(url)
       assert_raise(WebRobots::ParseError) {
-        robots.
+        robots.error!(url)
       }
     end
   end
data/webrobots.gemspec
CHANGED
@@ -5,11 +5,11 @@
 
 Gem::Specification.new do |s|
   s.name = %q{webrobots}
-  s.version = "0.0.5"
+  s.version = "0.0.6"
 
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Akinori MUSHA"]
-  s.date = %q{2011-01-
+  s.date = %q{2011-01-09}
   s.description = %q{This library helps write robots.txt compliant web robots in Ruby.
 }
   s.email = %q{knu@idaemons.org}
metadata
CHANGED
@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: webrobots
 version: !ruby/object:Gem::Version
-  hash:
+  hash: 19
   prerelease:
   segments:
   - 0
   - 0
-  - 5
-  version: 0.0.5
+  - 6
+  version: 0.0.6
 platform: ruby
 authors:
 - Akinori MUSHA
@@ -15,7 +15,7 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2011-01-
+date: 2011-01-09 00:00:00 +09:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency