webrobots 0.0.5 → 0.0.6

data/VERSION CHANGED
@@ -1 +1 @@
- 0.0.5
+ 0.0.6
data/lib/webrobots.rb CHANGED
@@ -13,11 +13,9 @@ class WebRobots
  #
  # * :http_get => a custom method, proc, or anything that responds to
  # .call(uri), to be used for fetching robots.txt. It must return
- # the response body if successful. If the resource is not found,
- # it must either return nil or emulate a Net::HTTPNotFound error
- # that the net/http library would raise, using
- # Net::HTTPServerException. Any other error raised is regarded as
- # blanket ban.
+ # the response body if successful, return an empty string if the
+ # resource is not found, and return nil or raise any error on
+ # failure. Redirects should be handled within this proc.
  def initialize(user_agent, options = nil)
  @user_agent = user_agent
  @parser = RobotsTxt::Parser.new(user_agent)
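Under the new contract a custom :http_get no longer needs to emulate net/http exceptions. A minimal sketch of a conforming fetcher follows; the lambda name, bot name, and redirect limit are illustrative, not part of the gem:

    require 'net/http'
    require 'uri'
    require 'webrobots'

    # Return the body on success, '' when robots.txt is missing, and nil
    # (or raise) on any other failure; redirects are followed here since
    # WebRobots no longer does that for a custom fetcher.
    fetch_robots = lambda { |uri|
      uri = URI(uri.to_s)
      5.times do
        response = Net::HTTP.get_response(uri)
        case response
        when Net::HTTPSuccess     then return response.body
        when Net::HTTPNotFound    then return ''   # missing robots.txt: no restrictions
        when Net::HTTPRedirection then uri = URI(response['location'])
        else                           return nil  # treated as unfetchable
        end
      end
      nil # too many redirects
    }

    robots = WebRobots.new('ExampleBot/1.0', :http_get => fetch_robots)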
@@ -25,14 +23,13 @@ class WebRobots
  options ||= {}
  @http_get = options[:http_get] || method(:http_get)

- @robotstxt = {}
+ @robotstxt = create_cache()
  end

- @@anon_parser = RobotsTxt::Parser.new('Anonymous')
- @@disallower = @@anon_parser.parse(<<-TXT, nil)
- User-Agent: *
- Disallow: /
- TXT
+ # :nodoc:
+ def create_cache
+ Hash.new # Must respond to [], []=, and delete.
+ end

  # Returns the robot name initially given.
  attr_reader :user_agent
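create_cache is marked :nodoc:, but since it is an ordinary instance method called from initialize, a subclass can presumably swap in a different store, as long as it answers [], []=, and delete as the comment says. A speculative sketch, where ExpiringCache and the one-hour TTL are inventions for illustration only:

    require 'webrobots'

    # A store that forgets robots.txt entries after a TTL; returning nil
    # from [] makes the caching layer refetch and re-store the entry.
    class ExpiringCache
      def initialize(ttl)
        @ttl = ttl
        @store = {}
      end

      def [](site)
        value, stored_at = @store[site]
        value if stored_at && Time.now - stored_at < @ttl
      end

      def []=(site, robots_txt)
        @store[site] = [robots_txt, Time.now]
      end

      def delete(site)
        @store.delete(site)
      end
    end

    class ExpiringWebRobots < WebRobots
      def create_cache
        ExpiringCache.new(3600) # hypothetical: keep robots.txt for an hour
      end
    end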
@@ -42,9 +39,9 @@ Disallow: /
  # a relative URI or a non-HTTP/HTTPS URI is given, ArgumentError is
  # raised.
  def allowed?(url)
- site, request_uri = split_uri(url)
+ robots_txt, request_uri = evaluate(url)
  return true if request_uri == '/robots.txt'
- robots_txt(site).allow?(request_uri)
+ robots_txt.allow?(request_uri)
  end

  # Equivalent to !allowed?(url).
@@ -56,8 +53,7 @@ Disallow: /
  # with each field name lower-cased. See allowed?() for a list of
  # errors that may be raised.
  def options(url)
- site, = split_uri(url)
- robots_txt(site).options
+ robots_txt_for(url).options
  end

  # Equivalent to option(url)[token.downcase].
@@ -68,8 +64,25 @@ Disallow: /
  # Returns an array of Sitemap URLs. See allowed?() for a list of
  # errors that may be raised.
  def sitemaps(url)
+ robots_txt_for(url).sitemaps
+ end
+
+ # Returns an error object if there is an error in fetching or
+ # parsing robots.txt of the site +url+.
+ def error(url)
+ robots_txt_for(url).error
+ end
+
+ # Raises the error if there was an error in fetching or parsing
+ # robots.txt of the site +url+.
+ def error!(url)
+ robots_txt_for(url).error!
+ end
+
+ # Removes robots.txt cache for the site +url+.
+ def reset(url)
  site, = split_uri(url)
- robots_txt(site).sitemaps
+ @robotstxt.delete(site)
  end

  private
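Taken together, the three new public methods give callers a way to inspect, raise, and clear per-site robots.txt state. A small usage sketch; the bot name and URL are placeholders:

    require 'webrobots'

    robots = WebRobots.new('ExampleBot/1.0')
    url = 'http://www.example.org/some/page.html'

    robots.allowed?(url)        # fetches and caches robots.txt for the site

    if (e = robots.error(url))  # non-nil if fetching or parsing failed
      warn "robots.txt problem: #{e.class}: #{e.message}"
    end

    robots.error!(url)          # same check, but raises the stored error
    robots.reset(url)           # drop the cached entry; the next call refetches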
@@ -100,31 +113,27 @@ Disallow: /
  return site, request_uri
  end

- def robots_txt(site)
- cache_robots_txt(site) {
- fetch_robots_txt(site)
- } or @@disallower
+ def evaluate(url)
+ site, request_uri = split_uri(url)
+ return get_robots_txt(site), request_uri
  end

- def fetch_robots_txt(site)
- body =
- begin
- @http_get.call(site + 'robots.txt')
- rescue => e
- if e.is_a?(Net::HTTPExceptions) && e.response.is_a?(Net::HTTPNotFound)
- ''
- else
- nil
- end
- end and @parser.parse(body, site)
+ def robots_txt_for(url)
+ site, = split_uri(url)
+ get_robots_txt(site)
+ end
+
+ def get_robots_txt(site)
+ @robotstxt[site] ||= fetch_robots_txt(site)
  end

- def cache_robots_txt(site, &block)
- if @robotstxt.key?(site)
- @robotstxt[site]
- else
- @robotstxt[site] = block.call(site)
+ def fetch_robots_txt(site)
+ begin
+ body = @http_get.call(site + 'robots.txt') or raise 'robots.txt unfetchable'
+ rescue => e
+ return RobotsTxt.unfetchable(site, e, @user_agent)
  end
+ @parser.parse!(body, site)
  end

  def http_get(uri)
@@ -143,6 +152,8 @@
  when Net::HTTPRedirection
  referer = uri.to_s
  uri = URI(response['location'])
+ when Net::HTTPNotFound
+ return ''
  else
  response.value
  end
@@ -27,8 +27,10 @@ module_eval(<<'...end robotstxt.ry/module_eval...', 'robotstxt.ry', 164)
  @target = target
  end

- def self.parse(input, target = nil)
- new(target).parse(input)
+ def parse!(input, site)
+ parse(input, site)
+ rescue Error => e
+ RobotsTxt.new(site, nil, :error => e, :target => @target)
  end

  KNOWN_TOKENS = %w[User-agent Allow Disallow Crawl-delay Sitemap]
@@ -519,11 +521,12 @@ end
  end # class Parser

  def initialize(site, records, options = nil)
- super()
+ @timestamp = Time.now
  @site = site
  @options = options || {}
  @last_checked = nil

+ @error = @options[:error]
  @target = @options[:target]
  @sitemaps = @options[:sitemaps] || []

@@ -542,7 +545,12 @@ end # class Parser
  end
  end

- attr_reader :site, :sitemaps
+ attr_reader :timestamp, :site, :sitemaps
+ attr_accessor :error
+
+ def error!
+ raise @error if @error
+ end

  def target(user_agent = nil)
  if user_agent
@@ -579,6 +587,17 @@ end # class Parser
  record.options
  end

+ DISALLOW_ALL = <<-TXT
+ User-Agent: *
+ Disallow: /
+ TXT
+
+ def self.unfetchable(site, reason, target = nil)
+ Parser.new(target).parse(DISALLOW_ALL, site).tap { |robots_txt|
+ robots_txt.error = reason
+ }
+ end
+
  class Record
  def initialize(agentlines, rulelines)
  @patterns = agentlines.map { |agentline| agentline.pattern }
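The unfetchable fallback replaces the old shared @@disallower: a fetch failure now yields a per-site disallow-all RobotsTxt that also carries the reason. A sketch of the observable effect, using a stub :http_get that always fails; the host name is made up:

    require 'webrobots'

    robots = WebRobots.new('ExampleBot/1.0',
                           :http_get => lambda { |uri| nil })  # simulate an unfetchable robots.txt

    robots.allowed?('http://unreachable.example/index.html')   # => false (Disallow: /)
    robots.error('http://unreachable.example/')                # => the error recorded by unfetchable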
@@ -167,8 +167,10 @@ class WebRobots
  @target = target
  end

- def self.parse(input, target = nil)
- new(target).parse(input)
+ def parse!(input, site)
+ parse(input, site)
+ rescue Error => e
+ RobotsTxt.new(site, nil, :error => e, :target => @target)
  end

  KNOWN_TOKENS = %w[User-agent Allow Disallow Crawl-delay Sitemap]
@@ -249,11 +251,12 @@ class WebRobots

  ---- footer
  def initialize(site, records, options = nil)
- super()
+ @timestamp = Time.now
  @site = site
  @options = options || {}
  @last_checked = nil

+ @error = @options[:error]
  @target = @options[:target]
  @sitemaps = @options[:sitemaps] || []

@@ -272,7 +275,12 @@ class WebRobots
  end
  end

- attr_reader :site, :sitemaps
+ attr_reader :timestamp, :site, :sitemaps
+ attr_accessor :error
+
+ def error!
+ raise @error if @error
+ end

  def target(user_agent = nil)
  if user_agent
@@ -309,6 +317,17 @@ class WebRobots
  record.options
  end

+ DISALLOW_ALL = <<-TXT
+ User-Agent: *
+ Disallow: /
+ TXT
+
+ def self.unfetchable(site, reason, target = nil)
+ Parser.new(target).parse(DISALLOW_ALL, site).tap { |robots_txt|
+ robots_txt.error = reason
+ }
+ end
+
  class Record
  def initialize(agentlines, rulelines)
  @patterns = agentlines.map { |agentline| agentline.pattern }
@@ -24,10 +24,6 @@ class TestWebRobots < Test::Unit::TestCase
  #comment

  TXT
- when 'http://site5.example.org/robots.txt'
- raise Net::HTTPServerException.new(
- 'Not Found',
- Net::HTTPNotFound.new('1.1', '404', 'Not Found'))
  else
  raise "#{uri} is not supposed to be fetched"
  end
@@ -43,8 +39,6 @@ class TestWebRobots < Test::Unit::TestCase
  assert @robots.allowed?('http://site3.example.org/private/secret.txt')
  assert @robots.allowed?('http://site4.example.org/index.html')
  assert @robots.allowed?('http://site4.example.org/private/secret.txt')
- assert @robots.allowed?('http://site5.example.org/index.html')
- assert @robots.allowed?('http://site5.example.org/private/secret.txt')
  end
  end

@@ -64,6 +58,8 @@ class TestWebRobots < Test::Unit::TestCase
  raise Errno::ECONNREFUSED
  when 'http://site4.example.org/robots.txt'
  raise SocketError, "getaddrinfo: nodename nor servname provided, or not known"
+ when 'http://site5.example.org/robots.txt'
+ nil
  else
  raise "#{uri} is not supposed to be fetched"
  end
@@ -79,6 +75,8 @@ class TestWebRobots < Test::Unit::TestCase
  assert @robots.disallowed?('http://site3.example.org/private/secret.txt')
  assert @robots.disallowed?('http://site4.example.org/index.html')
  assert @robots.disallowed?('http://site4.example.org/private/secret.txt')
+ assert @robots.disallowed?('http://site5.example.org/index.html')
+ assert @robots.disallowed?('http://site5.example.org/private/secret.txt')
  end
  end

@@ -176,8 +174,12 @@ Disallow: /~joe/index.html
  should "properly restrict access" do
  assert @robots_good.allowed?('http://www.example.org/index.html')
  assert !@robots_good.allowed?('http://www.example.org/2heavy/index.php')
+ assert @robots_good.allowed?('http://www.example.org/2HEAVY/index.php')
+ assert !@robots_good.allowed?(URI('http://www.example.org/2heavy/index.php'))
  assert @robots_good.allowed?('http://www.example.org/2heavy/index.html')
+ assert @robots_good.allowed?('http://WWW.Example.Org/2heavy/index.html')
  assert !@robots_good.allowed?('http://www.example.org/2heavy/index.htm')
+ assert !@robots_good.allowed?('http://WWW.Example.Org/2heavy/index.htm')

  assert !@robots_evil.allowed?('http://www.example.org/index.html')
  assert !@robots_evil.allowed?('http://www.example.org/2heavy/index.php')
@@ -234,38 +236,73 @@ Disallow: /~joe/index.html

  context "robots.txt with errors" do
  setup do
+ @turn1 = @turn2 = 0
  @http_get = lambda { |uri|
  case uri.to_s
  when 'http://www.example.org/robots.txt'
- <<-'TXT'
+ if (@turn1 += 1) % 2 == 1
+ <<-'TXT'
  # some comment
- User-Agent: first
+ User-Agent: thebot
+ # Disallow: /
+ Disallow: /2heavy/
+ # Allow: /2heavy/notsoheavy
+ Allow: /2heavy/*.html
+
+ User-Agent: anotherbot
+ # Disallow: /
+ Disallow: /2heavy/
+ # Allow: /2heavy/notsoheavy
+ Allow: /2heavy/*.html
+ TXT
+ else
+ <<-'TXT'
+ # some comment
+ User-Agent: thebot
  # Disallow: /
  Disallow: /2heavy/
  # Allow: /2heavy/notsoheavy
  Allow: /2heavy/*.html
  #
- User-Agent: next
+ User-Agent: anotherbot
  # Disallow: /
  Disallow: /2heavy/
  # Allow: /2heavy/notsoheavy
  Allow: /2heavy/*.html
- TXT
+ TXT
+ end
  when 'http://www.example.com/robots.txt'
- <<-'TXT'
+ if (@turn2 += 1) % 2 == 1
+ <<-'TXT'
  # some comment
- #User-Agent: first
+ #User-Agent: thebot
  # Disallow: /
  Disallow: /2heavy/
  # Allow: /2heavy/notsoheavy
  Allow: /2heavy/*.html

- User-Agent: next
+ User-Agent: anotherbot
  # Disallow: /
  Disallow: /2heavy/
  # Allow: /2heavy/notsoheavy
  Allow: /2heavy/*.html
- TXT
+ TXT
+ else
+ <<-'TXT'
+ # some comment
+ User-Agent: thebot
+ # Disallow: /
+ Disallow: /2heavy/
+ # Allow: /2heavy/notsoheavy
+ Allow: /2heavy/*.html
+
+ User-Agent: anotherbot
+ # Disallow: /
+ Disallow: /2heavy/
+ # Allow: /2heavy/notsoheavy
+ Allow: /2heavy/*.html
+ TXT
+ end
  else
  raise "#{uri} is not supposed to be fetched"
  end
@@ -273,12 +310,54 @@ Allow: /2heavy/*.html
  end

  should "raise ParseError" do
- robots = WebRobots.new('RandomBot', :http_get => @http_get)
+ robots = WebRobots.new('TheBot', :http_get => @http_get)
+
+ url = 'http://www.example.org/2heavy/index.php'
+
+ assert_nil robots.error(url)
+ assert !robots.allowed?(url)
+ assert_nothing_raised {
+ robots.error!(url)
+ }
+
+ robots.reset(url)
+
+ assert robots.allowed?(url)
+ assert_instance_of WebRobots::ParseError, robots.error(url)
  assert_raise(WebRobots::ParseError) {
- robots.allowed?('http://www.example.org/2heavy/index.html')
+ robots.error!(url)
  }
+
+ robots.reset(url)
+
+ assert_nil robots.error(url)
+ assert !robots.allowed?(url)
+ assert_nothing_raised {
+ robots.error!(url)
+ }
+
+ url = 'http://www.example.com/2heavy/index.php'
+
+ assert robots.allowed?(url)
+ assert_instance_of WebRobots::ParseError, robots.error(url)
+ assert_raise(WebRobots::ParseError) {
+ robots.error!(url)
+ }
+
+ robots.reset(url)
+
+ assert_nil robots.error(url)
+ assert !robots.allowed?(url)
+ assert_nothing_raised {
+ robots.error!(url)
+ }
+
+ robots.reset(url)
+
+ assert robots.allowed?(url)
+ assert_instance_of WebRobots::ParseError, robots.error(url)
  assert_raise(WebRobots::ParseError) {
- robots.allowed?('http://www.example.com/2heavy/index.html')
+ robots.error!(url)
  }
  end
  end
data/webrobots.gemspec CHANGED
@@ -5,11 +5,11 @@

  Gem::Specification.new do |s|
  s.name = %q{webrobots}
- s.version = "0.0.5"
+ s.version = "0.0.6"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Akinori MUSHA"]
- s.date = %q{2011-01-08}
+ s.date = %q{2011-01-09}
  s.description = %q{This library helps write robots.txt compliant web robots in Ruby.
  }
  s.email = %q{knu@idaemons.org}
metadata CHANGED
@@ -1,13 +1,13 @@
  --- !ruby/object:Gem::Specification
  name: webrobots
  version: !ruby/object:Gem::Version
- hash: 21
+ hash: 19
  prerelease:
  segments:
  - 0
  - 0
- - 5
- version: 0.0.5
+ - 6
+ version: 0.0.6
  platform: ruby
  authors:
  - Akinori MUSHA
@@ -15,7 +15,7 @@ autorequire:
  bindir: bin
  cert_chain: []

- date: 2011-01-08 00:00:00 +09:00
+ date: 2011-01-09 00:00:00 +09:00
  default_executable:
  dependencies:
  - !ruby/object:Gem::Dependency