webrobots 0.0.4 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/VERSION CHANGED
@@ -1 +1 @@
- 0.0.4
+ 0.0.5
@@ -13,8 +13,10 @@ class WebRobots
  #
  # * :http_get => a custom method, proc, or anything that responds to
  # .call(uri), to be used for fetching robots.txt. It must return
- # the response body if successful, or raise Net::HTTPNotFound if
- # the resource is not found. Any other errror is regarded as
+ # the response body if successful. If the resource is not found,
+ # it must either return nil or emulate a Net::HTTPNotFound error
+ # that the net/http library would raise, using
+ # Net::HTTPServerException. Any other error raised is regarded as
  # blanket ban.
  def initialize(user_agent, options = nil)
  @user_agent = user_agent
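
For reference, a minimal sketch of a custom :http_get fetcher that satisfies the contract documented above, built on plain net/http; the lambda, bot name, and variable names are illustrative and not part of the webrobots API:

    require 'net/http'
    require 'webrobots'

    # Illustrative fetcher: returns the body on success, nil when robots.txt
    # is missing, and raises via Net::HTTPResponse#value on anything else,
    # which webrobots 0.0.5 regards as a blanket ban.
    fetcher = lambda { |uri|
      response = Net::HTTP.get_response(uri)
      case response
      when Net::HTTPSuccess  then response.body
      when Net::HTTPNotFound then nil
      else response.value    # raises Net::HTTPServerException, Net::HTTPFatalError, etc.
      end
    }

    robots = WebRobots.new('ExampleBot/1.0', :http_get => fetcher)
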
@@ -26,6 +28,12 @@ class WebRobots
  @robotstxt = {}
  end
 
+ @@anon_parser = RobotsTxt::Parser.new('Anonymous')
+ @@disallower = @@anon_parser.parse(<<-TXT, nil)
+ User-Agent: *
+ Disallow: /
+ TXT
+
  # Returns the robot name initially given.
  attr_reader :user_agent
 
@@ -95,16 +103,20 @@ class WebRobots
  def robots_txt(site)
  cache_robots_txt(site) {
  fetch_robots_txt(site)
- }
+ } or @@disallower
  end
 
  def fetch_robots_txt(site)
- begin
- body = @http_get.call(site + 'robots.txt')
- rescue Net::HTTPNotFound
- return ''
- end
- @parser.parse(body, site)
+ body =
+ begin
+ @http_get.call(site + 'robots.txt')
+ rescue => e
+ if e.is_a?(Net::HTTPExceptions) && e.response.is_a?(Net::HTTPNotFound)
+ ''
+ else
+ nil
+ end
+ end and @parser.parse(body, site)
  end
 
  def cache_robots_txt(site, &block)
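
The effect of the rewritten robots_txt/fetch_robots_txt pair, shown as a hedged usage sketch (the failing lambda and host name are illustrative): a fetch error other than a 404 now falls back to the class-level "Disallow: /" ruleset instead of propagating out of allowed?/disallowed?, so every path on that site is reported as disallowed.

    require 'webrobots'

    # Simulate a site whose robots.txt cannot be fetched at all.
    failing_get = lambda { |uri| raise Errno::ECONNREFUSED }
    robots = WebRobots.new('ExampleBot/1.0', :http_get => failing_get)

    robots.disallowed?('http://unreachable.example.org/index.html')       #=> true (blanket ban)
    robots.allowed?('http://unreachable.example.org/private/secret.txt')  #=> false
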
@@ -25,7 +25,9 @@ class TestWebRobots < Test::Unit::TestCase
 
  TXT
  when 'http://site5.example.org/robots.txt'
- raise Net::HTTPNotFound
+ raise Net::HTTPServerException.new(
+ 'Not Found',
+ Net::HTTPNotFound.new('1.1', '404', 'Not Found'))
  else
  raise "#{uri} is not supposed to be fetched"
  end
@@ -41,6 +43,42 @@ class TestWebRobots < Test::Unit::TestCase
  assert @robots.allowed?('http://site3.example.org/private/secret.txt')
  assert @robots.allowed?('http://site4.example.org/index.html')
  assert @robots.allowed?('http://site4.example.org/private/secret.txt')
+ assert @robots.allowed?('http://site5.example.org/index.html')
+ assert @robots.allowed?('http://site5.example.org/private/secret.txt')
+ end
+ end
+
+ context "robots.txt that cannot be fetched" do
+ setup do
+ @robots = WebRobots.new('RandomBot', :http_get => lambda { |uri|
+ case uri.to_s
+ when 'http://site1.example.org/robots.txt'
+ raise Net::HTTPFatalError.new(
+ 'Internal Server Error',
+ Net::HTTPInternalServerError.new('1.1', '500', 'Internal Server Error'))
+ when 'http://site2.example.org/robots.txt'
+ raise Net::HTTPRetriableError.new(
+ 'Found',
+ Net::HTTPFound.new('1.1', '302', 'Found'))
+ when 'http://site3.example.org/robots.txt'
+ raise Errno::ECONNREFUSED
+ when 'http://site4.example.org/robots.txt'
+ raise SocketError, "getaddrinfo: nodename nor servname provided, or not known"
+ else
+ raise "#{uri} is not supposed to be fetched"
+ end
+ })
+ end
+
+ should "disallow any robot" do
+ assert @robots.disallowed?('http://site1.example.org/index.html')
+ assert @robots.disallowed?('http://site1.example.org/private/secret.txt')
+ assert @robots.disallowed?('http://site2.example.org/index.html')
+ assert @robots.disallowed?('http://site2.example.org/private/secret.txt')
+ assert @robots.disallowed?('http://site3.example.org/index.html')
+ assert @robots.disallowed?('http://site3.example.org/private/secret.txt')
+ assert @robots.disallowed?('http://site4.example.org/index.html')
+ assert @robots.disallowed?('http://site4.example.org/private/secret.txt')
  end
  end
 
@@ -264,9 +302,12 @@ Crawl-Delay: 1.5
  User-Agent: *
  Disallow: /2heavy/
  Allow: /2heavy/*.html
+ # These are wrong but should be allowed
+ Allow: /2heavy/%
+ Crawl-Delay:
+ #
  Option1: Bar
  Option3: Hi
- Crawl-Delay:
  TXT
  else
  raise "#{uri} is not supposed to be fetched"
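
The added records exercise the parser's tolerance for malformed directives (an Allow value containing a stray %, an empty Crawl-Delay, and a bare comment marker). A hedged sketch of feeding a similar record through the public API; the bot name and host are illustrative:

    require 'webrobots'

    lenient_txt = [
      'User-Agent: *',
      'Disallow: /2heavy/',
      'Allow: /2heavy/*.html',
      '# These are wrong but should be allowed',
      'Allow: /2heavy/%',
      'Crawl-Delay:',
      '#',
    ].join("\n") + "\n"

    robots = WebRobots.new('ExampleBot/1.0', :http_get => lambda { |uri| lenient_txt })
    # The malformed lines are tolerated rather than treated as fatal;
    # "/" is outside /2heavy/, so the query proceeds normally.
    robots.allowed?('http://lenient.example.org/')
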
@@ -5,7 +5,7 @@
 
  Gem::Specification.new do |s|
  s.name = %q{webrobots}
- s.version = "0.0.4"
+ s.version = "0.0.5"
 
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Akinori MUSHA"]
metadata CHANGED
@@ -1,13 +1,13 @@
  --- !ruby/object:Gem::Specification
  name: webrobots
  version: !ruby/object:Gem::Version
- hash: 23
+ hash: 21
  prerelease:
  segments:
  - 0
  - 0
- - 4
- version: 0.0.4
+ - 5
+ version: 0.0.5
  platform: ruby
  authors:
  - Akinori MUSHA