webrobots 0.0.4 → 0.0.5

data/VERSION CHANGED
@@ -1 +1 @@
- 0.0.4
+ 0.0.5
@@ -13,8 +13,10 @@ class WebRobots
  #
  # * :http_get => a custom method, proc, or anything that responds to
  #   .call(uri), to be used for fetching robots.txt. It must return
- #   the response body if successful, or raise Net::HTTPNotFound if
- #   the resource is not found. Any other errror is regarded as
+ #   the response body if successful. If the resource is not found,
+ #   it must either return nil or emulate a Net::HTTPNotFound error
+ #   that the net/http library would raise, using
+ #   Net::HTTPServerException. Any other error raised is regarded as
  #   blanket ban.
  def initialize(user_agent, options = nil)
    @user_agent = user_agent
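
For illustration, here is a minimal sketch of an :http_get callable that satisfies the revised contract, assuming plain net/http is acceptable; the bot name and the fetch variable are invented for the example.

require 'net/http'
require 'webrobots'

# Hypothetical fetcher: return the body on success and let
# Net::HTTPResponse#value raise the usual net/http exceptions (for a 404
# that is Net::HTTPServerException carrying a Net::HTTPNotFound response),
# which the new fetch_robots_txt recognizes as "robots.txt not found".
fetch = lambda { |uri|
  response = Net::HTTP.get_response(uri)
  response.value  # raises unless the response is a 2xx success
  response.body
}

robots = WebRobots.new('ExampleBot/1.0', :http_get => fetch)
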
@@ -26,6 +28,12 @@ class WebRobots
    @robotstxt = {}
  end

+ @@anon_parser = RobotsTxt::Parser.new('Anonymous')
+ @@disallower = @@anon_parser.parse(<<-TXT, nil)
+ User-Agent: *
+ Disallow: /
+ TXT
+
  # Returns the robot name initially given.
  attr_reader :user_agent

@@ -95,16 +103,20 @@ class WebRobots
  def robots_txt(site)
    cache_robots_txt(site) {
      fetch_robots_txt(site)
-   }
+   } or @@disallower
  end

  def fetch_robots_txt(site)
-   begin
-     body = @http_get.call(site + 'robots.txt')
-   rescue Net::HTTPNotFound
-     return ''
-   end
-   @parser.parse(body, site)
+   body =
+     begin
+       @http_get.call(site + 'robots.txt')
+     rescue => e
+       if e.is_a?(Net::HTTPExceptions) && e.response.is_a?(Net::HTTPNotFound)
+         ''
+       else
+         nil
+       end
+     end and @parser.parse(body, site)
  end

  def cache_robots_txt(site, &block)
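
The net effect of the fallback above: when robots.txt cannot be fetched for any reason other than a real or emulated Net::HTTPNotFound, robots_txt returns the class-level disallow-all rule set. A small sketch of that behavior, using a hypothetical unreachable host:

require 'webrobots'

# Hypothetical fetcher that always fails with a connection error; the new
# "or @@disallower" fallback then treats the whole site as off-limits.
robots = WebRobots.new('ExampleBot/1.0',
                       :http_get => lambda { |uri| raise Errno::ECONNREFUSED })

robots.disallowed?('http://unreachable.example.org/any/page.html')  #=> true
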
@@ -25,7 +25,9 @@ class TestWebRobots < Test::Unit::TestCase

    TXT
  when 'http://site5.example.org/robots.txt'
-   raise Net::HTTPNotFound
+   raise Net::HTTPServerException.new(
+     'Not Found',
+     Net::HTTPNotFound.new('1.1', '404', 'Not Found'))
  else
    raise "#{uri} is not supposed to be fetched"
  end
@@ -41,6 +43,42 @@ class TestWebRobots < Test::Unit::TestCase
      assert @robots.allowed?('http://site3.example.org/private/secret.txt')
      assert @robots.allowed?('http://site4.example.org/index.html')
      assert @robots.allowed?('http://site4.example.org/private/secret.txt')
+     assert @robots.allowed?('http://site5.example.org/index.html')
+     assert @robots.allowed?('http://site5.example.org/private/secret.txt')
+   end
+ end
+
+ context "robots.txt that cannot be fetched" do
+   setup do
+     @robots = WebRobots.new('RandomBot', :http_get => lambda { |uri|
+       case uri.to_s
+       when 'http://site1.example.org/robots.txt'
+         raise Net::HTTPFatalError.new(
+           'Internal Server Error',
+           Net::HTTPInternalServerError.new('1.1', '500', 'Internal Server Error'))
+       when 'http://site2.example.org/robots.txt'
+         raise Net::HTTPRetriableError.new(
+           'Found',
+           Net::HTTPFound.new('1.1', '302', 'Found'))
+       when 'http://site3.example.org/robots.txt'
+         raise Errno::ECONNREFUSED
+       when 'http://site4.example.org/robots.txt'
+         raise SocketError, "getaddrinfo: nodename nor servname provided, or not known"
+       else
+         raise "#{uri} is not supposed to be fetched"
+       end
+     })
+   end
+
+   should "disallow any robot" do
+     assert @robots.disallowed?('http://site1.example.org/index.html')
+     assert @robots.disallowed?('http://site1.example.org/private/secret.txt')
+     assert @robots.disallowed?('http://site2.example.org/index.html')
+     assert @robots.disallowed?('http://site2.example.org/private/secret.txt')
+     assert @robots.disallowed?('http://site3.example.org/index.html')
+     assert @robots.disallowed?('http://site3.example.org/private/secret.txt')
+     assert @robots.disallowed?('http://site4.example.org/index.html')
+     assert @robots.disallowed?('http://site4.example.org/private/secret.txt')
    end
  end

@@ -264,9 +302,12 @@ Crawl-Delay: 1.5
  User-Agent: *
  Disallow: /2heavy/
  Allow: /2heavy/*.html
+ # These are wrong but should be allowed
+ Allow: /2heavy/%
+ Crawl-Delay:
+ #
  Option1: Bar
  Option3: Hi
- Crawl-Delay:
    TXT
  else
    raise "#{uri} is not supposed to be fetched"
@@ -5,7 +5,7 @@

  Gem::Specification.new do |s|
    s.name = %q{webrobots}
-   s.version = "0.0.4"
+   s.version = "0.0.5"

    s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
    s.authors = ["Akinori MUSHA"]
metadata CHANGED
@@ -1,13 +1,13 @@
  --- !ruby/object:Gem::Specification
  name: webrobots
  version: !ruby/object:Gem::Version
-   hash: 23
+   hash: 21
    prerelease:
    segments:
    - 0
    - 0
-   - 4
-   version: 0.0.4
+   - 5
+   version: 0.0.5
  platform: ruby
  authors:
  - Akinori MUSHA