webrobots 0.0.5 → 0.0.6

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
data/VERSION CHANGED
@@ -1 +1 @@
- 0.0.5
+ 0.0.6
data/lib/webrobots.rb CHANGED
@@ -13,11 +13,9 @@ class WebRobots
  #
  # * :http_get => a custom method, proc, or anything that responds to
  # .call(uri), to be used for fetching robots.txt. It must return
- # the response body if successful. If the resource is not found,
- # it must either return nil or emulate a Net::HTTPNotFound error
- # that the net/http library would raise, using
- # Net::HTTPServerException. Any other error raised is regarded as
- # blanket ban.
+ # the response body if successful, return an empty string if the
+ # resource is not found, and return nil or raise any error on
+ # failure. Redirects should be handled within this proc.
  def initialize(user_agent, options = nil)
  @user_agent = user_agent
  @parser = RobotsTxt::Parser.new(user_agent)
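For illustration only (not part of the diff), a caller-supplied :http_get that follows the new contract might look like the sketch below; the proc name, bot name, and redirect limit are assumptions, and only net/http is used.

    require 'net/http'
    require 'uri'
    require 'webrobots'

    # Hypothetical fetcher obeying the 0.0.6 contract: body on success, '' when
    # robots.txt is missing, nil (or an exception) on failure; redirects are
    # followed inside the proc.
    fetch_robots_txt = lambda { |uri|
      uri = URI(uri.to_s)
      5.times {                                       # assumed redirect limit
        response = Net::HTTP.get_response(uri)
        case response
        when Net::HTTPSuccess     then return response.body
        when Net::HTTPNotFound    then return ''      # missing robots.txt => empty body
        when Net::HTTPRedirection then uri = URI(response['location'])
        else return nil                               # any other failure
        end
      }
      nil                                             # too many redirects
    }

    robots = WebRobots.new('ExampleBot/1.0', :http_get => fetch_robots_txt)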
@@ -25,14 +23,13 @@ class WebRobots
  options ||= {}
  @http_get = options[:http_get] || method(:http_get)

- @robotstxt = {}
+ @robotstxt = create_cache()
  end

- @@anon_parser = RobotsTxt::Parser.new('Anonymous')
- @@disallower = @@anon_parser.parse(<<-TXT, nil)
- User-Agent: *
- Disallow: /
- TXT
+ # :nodoc:
+ def create_cache
+ Hash.new # Must respond to [], []=, and delete.
+ end

  # Returns the robot name initially given.
  attr_reader :user_agent
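As a sketch of the new cache hook (the class name and size limit below are made up; the only documented requirement is that the cache respond to [], []=, and delete), a subclass could swap in its own store:

    require 'webrobots'

    # Hypothetical bounded cache; Ruby hashes keep insertion order, so shift
    # evicts the oldest entry.
    class BoundedRobotsCache
      def initialize(limit)
        @limit = limit
        @store = {}
      end

      def [](key)
        @store[key]
      end

      def []=(key, value)
        @store.shift if @store.size >= @limit   # drop the oldest entry
        @store[key] = value
      end

      def delete(key)
        @store.delete(key)
      end
    end

    class CachingWebRobots < WebRobots
      def create_cache
        BoundedRobotsCache.new(500)   # keep robots.txt for at most 500 sites
      end
    end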
@@ -42,9 +39,9 @@ Disallow: /
  # a relative URI or a non-HTTP/HTTPS URI is given, ArgumentError is
  # raised.
  def allowed?(url)
- site, request_uri = split_uri(url)
+ robots_txt, request_uri = evaluate(url)
  return true if request_uri == '/robots.txt'
- robots_txt(site).allow?(request_uri)
+ robots_txt.allow?(request_uri)
  end

  # Equivalent to !allowed?(url).
@@ -56,8 +53,7 @@ Disallow: /
  # with each field name lower-cased. See allowed?() for a list of
  # errors that may be raised.
  def options(url)
- site, = split_uri(url)
- robots_txt(site).options
+ robots_txt_for(url).options
  end

  # Equivalent to option(url)[token.downcase].
@@ -68,8 +64,25 @@ Disallow: /
  # Returns an array of Sitemap URLs. See allowed?() for a list of
  # errors that may be raised.
  def sitemaps(url)
+ robots_txt_for(url).sitemaps
+ end
+
+ # Returns an error object if there is an error in fetching or
+ # parsing robots.txt of the site +url+.
+ def error(url)
+ robots_txt_for(url).error
+ end
+
+ # Raises the error if there was an error in fetching or parsing
+ # robots.txt of the site +url+.
+ def error!(url)
+ robots_txt_for(url).error!
+ end
+
+ # Removes robots.txt cache for the site +url+.
+ def reset(url)
  site, = split_uri(url)
- robots_txt(site).sitemaps
+ @robotstxt.delete(site)
  end

  private
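A hedged usage sketch of the per-site error API added above (the URL and bot name are illustrative; no :http_get is passed, so the built-in fetcher would hit the network):

    require 'webrobots'

    robots = WebRobots.new('ExampleBot/1.0')
    url    = 'http://www.example.org/private/index.html'

    robots.allowed?(url)            # fetches and caches robots.txt for the site

    if (e = robots.error(url))      # non-nil when fetching or parsing robots.txt failed
      warn "robots.txt problem: #{e.message}"
      robots.reset(url)             # drop the cached entry; the next query refetches it
    end

    robots.error!(url)              # alternatively, raise the recorded error, if any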
@@ -100,31 +113,27 @@ Disallow: /
  return site, request_uri
  end

- def robots_txt(site)
- cache_robots_txt(site) {
- fetch_robots_txt(site)
- } or @@disallower
+ def evaluate(url)
+ site, request_uri = split_uri(url)
+ return get_robots_txt(site), request_uri
  end

- def fetch_robots_txt(site)
- body =
- begin
- @http_get.call(site + 'robots.txt')
- rescue => e
- if e.is_a?(Net::HTTPExceptions) && e.response.is_a?(Net::HTTPNotFound)
- ''
- else
- nil
- end
- end and @parser.parse(body, site)
+ def robots_txt_for(url)
+ site, = split_uri(url)
+ get_robots_txt(site)
+ end
+
+ def get_robots_txt(site)
+ @robotstxt[site] ||= fetch_robots_txt(site)
  end

- def cache_robots_txt(site, &block)
- if @robotstxt.key?(site)
- @robotstxt[site]
- else
- @robotstxt[site] = block.call(site)
+ def fetch_robots_txt(site)
+ begin
+ body = @http_get.call(site + 'robots.txt') or raise 'robots.txt unfetchable'
+ rescue => e
+ return RobotsTxt.unfetchable(site, e, @user_agent)
  end
+ @parser.parse!(body, site)
  end

  def http_get(uri)
@@ -143,6 +152,8 @@ Disallow: /
  when Net::HTTPRedirection
  referer = uri.to_s
  uri = URI(response['location'])
+ when Net::HTTPNotFound
+ return ''
  else
  response.value
  end
@@ -27,8 +27,10 @@ module_eval(<<'...end robotstxt.ry/module_eval...', 'robotstxt.ry', 164)
  @target = target
  end

- def self.parse(input, target = nil)
- new(target).parse(input)
+ def parse!(input, site)
+ parse(input, site)
+ rescue Error => e
+ RobotsTxt.new(site, nil, :error => e, :target => @target)
  end

  KNOWN_TOKENS = %w[User-agent Allow Disallow Crawl-delay Sitemap]
@@ -519,11 +521,12 @@ end
  end # class Parser

  def initialize(site, records, options = nil)
- super()
+ @timestamp = Time.now
  @site = site
  @options = options || {}
  @last_checked = nil

+ @error = @options[:error]
  @target = @options[:target]
  @sitemaps = @options[:sitemaps] || []

@@ -542,7 +545,12 @@ end # class Parser
  end
  end

- attr_reader :site, :sitemaps
+ attr_reader :timestamp, :site, :sitemaps
+ attr_accessor :error
+
+ def error!
+ raise @error if @error
+ end

  def target(user_agent = nil)
  if user_agent
@@ -579,6 +587,17 @@ end # class Parser
  record.options
  end

+ DISALLOW_ALL = <<-TXT
+ User-Agent: *
+ Disallow: /
+ TXT
+
+ def self.unfetchable(site, reason, target = nil)
+ Parser.new(target).parse(DISALLOW_ALL, site).tap { |robots_txt|
+ robots_txt.error = reason
+ }
+ end
+
  class Record
  def initialize(agentlines, rulelines)
  @patterns = agentlines.map { |agentline| agentline.pattern }
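To illustrate how the pieces above fit together (a sketch only; the stub proc, URL, and bot name are made up): when the :http_get hook raises or returns nil, fetch_robots_txt now caches the disallow-all RobotsTxt built by RobotsTxt.unfetchable, so every path on that site is disallowed and the exception is reported via error().

    require 'webrobots'

    refused = lambda { |uri| raise Errno::ECONNREFUSED }   # simulate an unreachable site
    robots  = WebRobots.new('ExampleBot/1.0', :http_get => refused)

    robots.allowed?('http://down.example.com/')   #=> false (blanket ban for the site)
    robots.error('http://down.example.com/')      #=> the Errno::ECONNREFUSED instance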
@@ -167,8 +167,10 @@ class WebRobots
  @target = target
  end

- def self.parse(input, target = nil)
- new(target).parse(input)
+ def parse!(input, site)
+ parse(input, site)
+ rescue Error => e
+ RobotsTxt.new(site, nil, :error => e, :target => @target)
  end

  KNOWN_TOKENS = %w[User-agent Allow Disallow Crawl-delay Sitemap]
@@ -249,11 +251,12 @@ class WebRobots

  ---- footer
  def initialize(site, records, options = nil)
- super()
+ @timestamp = Time.now
  @site = site
  @options = options || {}
  @last_checked = nil

+ @error = @options[:error]
  @target = @options[:target]
  @sitemaps = @options[:sitemaps] || []

@@ -272,7 +275,12 @@ class WebRobots
  end
  end

- attr_reader :site, :sitemaps
+ attr_reader :timestamp, :site, :sitemaps
+ attr_accessor :error
+
+ def error!
+ raise @error if @error
+ end

  def target(user_agent = nil)
  if user_agent
@@ -309,6 +317,17 @@ class WebRobots
  record.options
  end

+ DISALLOW_ALL = <<-TXT
+ User-Agent: *
+ Disallow: /
+ TXT
+
+ def self.unfetchable(site, reason, target = nil)
+ Parser.new(target).parse(DISALLOW_ALL, site).tap { |robots_txt|
+ robots_txt.error = reason
+ }
+ end
+
  class Record
  def initialize(agentlines, rulelines)
  @patterns = agentlines.map { |agentline| agentline.pattern }
@@ -24,10 +24,6 @@ class TestWebRobots < Test::Unit::TestCase
  #comment

  TXT
- when 'http://site5.example.org/robots.txt'
- raise Net::HTTPServerException.new(
- 'Not Found',
- Net::HTTPNotFound.new('1.1', '404', 'Not Found'))
  else
  raise "#{uri} is not supposed to be fetched"
  end
@@ -43,8 +39,6 @@ class TestWebRobots < Test::Unit::TestCase
  assert @robots.allowed?('http://site3.example.org/private/secret.txt')
  assert @robots.allowed?('http://site4.example.org/index.html')
  assert @robots.allowed?('http://site4.example.org/private/secret.txt')
- assert @robots.allowed?('http://site5.example.org/index.html')
- assert @robots.allowed?('http://site5.example.org/private/secret.txt')
  end
  end

@@ -64,6 +58,8 @@ class TestWebRobots < Test::Unit::TestCase
  raise Errno::ECONNREFUSED
  when 'http://site4.example.org/robots.txt'
  raise SocketError, "getaddrinfo: nodename nor servname provided, or not known"
+ when 'http://site5.example.org/robots.txt'
+ nil
  else
  raise "#{uri} is not supposed to be fetched"
  end
@@ -79,6 +75,8 @@ class TestWebRobots < Test::Unit::TestCase
  assert @robots.disallowed?('http://site3.example.org/private/secret.txt')
  assert @robots.disallowed?('http://site4.example.org/index.html')
  assert @robots.disallowed?('http://site4.example.org/private/secret.txt')
+ assert @robots.disallowed?('http://site5.example.org/index.html')
+ assert @robots.disallowed?('http://site5.example.org/private/secret.txt')
  end
  end

@@ -176,8 +174,12 @@ Disallow: /~joe/index.html
  should "properly restrict access" do
  assert @robots_good.allowed?('http://www.example.org/index.html')
  assert !@robots_good.allowed?('http://www.example.org/2heavy/index.php')
+ assert @robots_good.allowed?('http://www.example.org/2HEAVY/index.php')
+ assert !@robots_good.allowed?(URI('http://www.example.org/2heavy/index.php'))
  assert @robots_good.allowed?('http://www.example.org/2heavy/index.html')
+ assert @robots_good.allowed?('http://WWW.Example.Org/2heavy/index.html')
  assert !@robots_good.allowed?('http://www.example.org/2heavy/index.htm')
+ assert !@robots_good.allowed?('http://WWW.Example.Org/2heavy/index.htm')

  assert !@robots_evil.allowed?('http://www.example.org/index.html')
  assert !@robots_evil.allowed?('http://www.example.org/2heavy/index.php')
@@ -234,38 +236,73 @@ Disallow: /~joe/index.html

  context "robots.txt with errors" do
  setup do
+ @turn1 = @turn2 = 0
  @http_get = lambda { |uri|
  case uri.to_s
  when 'http://www.example.org/robots.txt'
- <<-'TXT'
+ if (@turn1 += 1) % 2 == 1
+ <<-'TXT'
  # some comment
- User-Agent: first
+ User-Agent: thebot
+ # Disallow: /
+ Disallow: /2heavy/
+ # Allow: /2heavy/notsoheavy
+ Allow: /2heavy/*.html
+
+ User-Agent: anotherbot
+ # Disallow: /
+ Disallow: /2heavy/
+ # Allow: /2heavy/notsoheavy
+ Allow: /2heavy/*.html
+ TXT
+ else
+ <<-'TXT'
+ # some comment
+ User-Agent: thebot
  # Disallow: /
  Disallow: /2heavy/
  # Allow: /2heavy/notsoheavy
  Allow: /2heavy/*.html
  #
- User-Agent: next
+ User-Agent: anotherbot
  # Disallow: /
  Disallow: /2heavy/
  # Allow: /2heavy/notsoheavy
  Allow: /2heavy/*.html
- TXT
+ TXT
+ end
  when 'http://www.example.com/robots.txt'
- <<-'TXT'
+ if (@turn2 += 1) % 2 == 1
+ <<-'TXT'
  # some comment
- #User-Agent: first
+ #User-Agent: thebot
  # Disallow: /
  Disallow: /2heavy/
  # Allow: /2heavy/notsoheavy
  Allow: /2heavy/*.html

- User-Agent: next
+ User-Agent: anotherbot
  # Disallow: /
  Disallow: /2heavy/
  # Allow: /2heavy/notsoheavy
  Allow: /2heavy/*.html
- TXT
+ TXT
+ else
+ <<-'TXT'
+ # some comment
+ User-Agent: thebot
+ # Disallow: /
+ Disallow: /2heavy/
+ # Allow: /2heavy/notsoheavy
+ Allow: /2heavy/*.html
+
+ User-Agent: anotherbot
+ # Disallow: /
+ Disallow: /2heavy/
+ # Allow: /2heavy/notsoheavy
+ Allow: /2heavy/*.html
+ TXT
+ end
  else
  raise "#{uri} is not supposed to be fetched"
  end
@@ -273,12 +310,54 @@ Allow: /2heavy/*.html
  end

  should "raise ParseError" do
- robots = WebRobots.new('RandomBot', :http_get => @http_get)
+ robots = WebRobots.new('TheBot', :http_get => @http_get)
+
+ url = 'http://www.example.org/2heavy/index.php'
+
+ assert_nil robots.error(url)
+ assert !robots.allowed?(url)
+ assert_nothing_raised {
+ robots.error!(url)
+ }
+
+ robots.reset(url)
+
+ assert robots.allowed?(url)
+ assert_instance_of WebRobots::ParseError, robots.error(url)
  assert_raise(WebRobots::ParseError) {
- robots.allowed?('http://www.example.org/2heavy/index.html')
+ robots.error!(url)
  }
+
+ robots.reset(url)
+
+ assert_nil robots.error(url)
+ assert !robots.allowed?(url)
+ assert_nothing_raised {
+ robots.error!(url)
+ }
+
+ url = 'http://www.example.com/2heavy/index.php'
+
+ assert robots.allowed?(url)
+ assert_instance_of WebRobots::ParseError, robots.error(url)
+ assert_raise(WebRobots::ParseError) {
+ robots.error!(url)
+ }
+
+ robots.reset(url)
+
+ assert_nil robots.error(url)
+ assert !robots.allowed?(url)
+ assert_nothing_raised {
+ robots.error!(url)
+ }
+
+ robots.reset(url)
+
+ assert robots.allowed?(url)
+ assert_instance_of WebRobots::ParseError, robots.error(url)
  assert_raise(WebRobots::ParseError) {
- robots.allowed?('http://www.example.com/2heavy/index.html')
+ robots.error!(url)
  }
  end
  end
data/webrobots.gemspec CHANGED
@@ -5,11 +5,11 @@

  Gem::Specification.new do |s|
  s.name = %q{webrobots}
- s.version = "0.0.5"
+ s.version = "0.0.6"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Akinori MUSHA"]
- s.date = %q{2011-01-08}
+ s.date = %q{2011-01-09}
  s.description = %q{This library helps write robots.txt compliant web robots in Ruby.
  }
  s.email = %q{knu@idaemons.org}
metadata CHANGED
@@ -1,13 +1,13 @@
  --- !ruby/object:Gem::Specification
  name: webrobots
  version: !ruby/object:Gem::Version
- hash: 21
+ hash: 19
  prerelease:
  segments:
  - 0
  - 0
- - 5
- version: 0.0.5
+ - 6
+ version: 0.0.6
  platform: ruby
  authors:
  - Akinori MUSHA
@@ -15,7 +15,7 @@ autorequire:
  bindir: bin
  cert_chain: []

- date: 2011-01-08 00:00:00 +09:00
+ date: 2011-01-09 00:00:00 +09:00
  default_executable:
  dependencies:
  - !ruby/object:Gem::Dependency