webrobots 0.0.1

This diff shows the content of publicly released package versions as published to their public registries, and is provided for informational purposes only.
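For orientation, here is a minimal usage sketch of the public API that the test suite below exercises. It is a sketch only: the user agent, URL, and robots.txt rules are illustrative, and the :http_get lambda is the same offline test-injection hook used throughout the tests rather than something required in normal use (without it, the library fetches robots.txt itself, as the "real world" context below shows).

require 'webrobots'

# Serve canned robots.txt content instead of fetching over HTTP;
# WebRobots calls this hook with the URI of each robots.txt it needs.
fetch = lambda { |uri|
  <<-'TXT'
User-Agent: *
Disallow: /private/
  TXT
}

robots = WebRobots.new('MyBot', :http_get => fetch)
robots.allowed?('http://example.org/index.html') #=> true
robots.allowed?('http://example.org/private/x')  #=> false

test/test_webrobots.rb ADDED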
@@ -0,0 +1,291 @@
+ require 'helper'
+
+ class TestWebRobots < Test::Unit::TestCase
+   context "robots.txt with no rules" do
+     setup do
+       @robots = WebRobots.new('RandomBot', :http_get => lambda { |uri|
+         case uri.to_s
+         when 'http://site1.example.org/robots.txt'
+           <<-'TXT'
+           TXT
+         when 'http://site2.example.org/robots.txt'
+           <<-'TXT'
+
+
+           TXT
+         when 'http://site3.example.org/robots.txt'
+           <<-'TXT'
+
+ #comment
+           TXT
+         when 'http://site4.example.org/robots.txt'
+           <<-'TXT'
+
+ #comment
+
+           TXT
+         when 'http://site5.example.org/robots.txt'
+           raise Net::HTTPNotFound
+         else
+           raise "#{uri} is not supposed to be fetched"
+         end
+       })
+     end
+
+     should "allow any robot" do
+       assert @robots.allowed?('http://site1.example.org/index.html')
+       assert @robots.allowed?('http://site1.example.org/private/secret.txt')
+       assert @robots.allowed?('http://site2.example.org/index.html')
+       assert @robots.allowed?('http://site2.example.org/private/secret.txt')
+       assert @robots.allowed?('http://site3.example.org/index.html')
+       assert @robots.allowed?('http://site3.example.org/private/secret.txt')
+       assert @robots.allowed?('http://site4.example.org/index.html')
+       assert @robots.allowed?('http://site4.example.org/private/secret.txt')
+     end
+   end
+
+   context "robots.txt with some rules" do
+     setup do
+       http_get = lambda { |uri|
+         case uri.to_s
+         when 'http://www.example.org/robots.txt'
+           <<-'TXT'
+ # Punish evil bots
+ User-Agent: evil
+ Disallow: /
+
+ User-Agent: good
+ # Be generous to good bots
+ Disallow: /2heavy/
+ Allow: /2heavy/*.htm
+ Disallow: /2heavy/*.htm$
+
+ User-Agent: *
+ Disallow: /2heavy/
+ Disallow: /index.html
+ # Allow takes precedence over Disallow if the pattern lengths are the same.
+ Allow: /index.html
+           TXT
+         when 'http://www.example.com/robots.txt'
+           <<-'TXT'
+ # Default rule is evaluated last even if it is put first.
+ User-Agent: *
+ Disallow: /2heavy/
+ Disallow: /index.html
+ # Allow takes precedence over Disallow if the pattern lengths are the same.
+ Allow: /index.html
+
+ # Punish evil bots
+ User-Agent: evil
+ Disallow: /
+
+ User-Agent: good
+ # Be generous to good bots
+ Disallow: /2heavy/
+ Allow: /2heavy/*.htm
+ Disallow: /2heavy/*.htm$
+           TXT
+         else
+           raise "#{uri} is not supposed to be fetched"
+         end
+       }
+
+       @robots = WebRobots.new('RandomBot', :http_get => http_get)
+       @robots_good = WebRobots.new('GoodBot', :http_get => http_get)
+       @robots_evil = WebRobots.new('EvilBot', :http_get => http_get)
+     end
+
+     should "properly restrict access" do
+       assert @robots_good.allowed?('http://www.example.org/index.html')
+       assert !@robots_good.allowed?('http://www.example.org/2heavy/index.php')
+       assert @robots_good.allowed?('http://www.example.org/2heavy/index.html')
+       assert !@robots_good.allowed?('http://www.example.org/2heavy/index.htm')
+
+       assert !@robots_evil.allowed?('http://www.example.org/index.html')
+       assert !@robots_evil.allowed?('http://www.example.org/2heavy/index.php')
+       assert !@robots_evil.allowed?('http://www.example.org/2heavy/index.html')
+       assert !@robots_evil.allowed?('http://www.example.org/2heavy/index.htm')
+
+       assert @robots.allowed?('http://www.example.org/index.html')
+       assert !@robots.allowed?('http://www.example.org/2heavy/index.php')
+       assert !@robots.allowed?('http://www.example.org/2heavy/index.html')
+       assert !@robots.allowed?('http://www.example.org/2heavy/index.htm')
+
+       assert @robots_good.allowed?('http://www.example.com/index.html')
+       assert !@robots_good.allowed?('http://www.example.com/2heavy/index.php')
+       assert @robots_good.allowed?('http://www.example.com/2heavy/index.html')
+       assert !@robots_good.allowed?('http://www.example.com/2heavy/index.htm')
+
+       assert !@robots_evil.allowed?('http://www.example.com/index.html')
+       assert !@robots_evil.allowed?('http://www.example.com/2heavy/index.php')
+       assert !@robots_evil.allowed?('http://www.example.com/2heavy/index.html')
+       assert !@robots_evil.allowed?('http://www.example.com/2heavy/index.htm')
+
+       assert @robots.allowed?('http://www.example.com/index.html')
+       assert !@robots.allowed?('http://www.example.com/2heavy/index.php')
+       assert !@robots.allowed?('http://www.example.com/2heavy/index.html')
+       assert !@robots.allowed?('http://www.example.com/2heavy/index.htm')
+     end
+   end
+
+   context "robots.txt with errors" do
+     setup do
+       @http_get = lambda { |uri|
+         case uri.to_s
+         when 'http://www.example.org/robots.txt'
+           <<-'TXT'
+ # some comment
+ User-Agent: first
+ # Disallow: /
+ Disallow: /2heavy/
+ # Allow: /2heavy/notsoheavy
+ Allow: /2heavy/*.html
+ #
+ User-Agent: next
+ # Disallow: /
+ Disallow: /2heavy/
+ # Allow: /2heavy/notsoheavy
+ Allow: /2heavy/*.html
+           TXT
+         when 'http://www.example.com/robots.txt'
+           <<-'TXT'
+ # some comment
+ #User-Agent: first
+ # Disallow: /
+ Disallow: /2heavy/
+ # Allow: /2heavy/notsoheavy
+ Allow: /2heavy/*.html
+
+ User-Agent: next
+ # Disallow: /
+ Disallow: /2heavy/
+ # Allow: /2heavy/notsoheavy
+ Allow: /2heavy/*.html
+           TXT
+         else
+           raise "#{uri} is not supposed to be fetched"
+         end
+       }
+     end
+
+     should "raise ParseError" do
+       robots = WebRobots.new('RandomBot', :http_get => @http_get)
+       assert_raise(WebRobots::ParseError) {
+         robots.allowed?('http://www.example.org/2heavy/index.html')
+       }
+       assert_raise(WebRobots::ParseError) {
+         robots.allowed?('http://www.example.com/2heavy/index.html')
+       }
+     end
+   end
+
+   context "robots.txt with options" do
+     setup do
+       http_get = lambda { |uri|
+         case uri.to_s
+         when 'http://www.example.org/robots.txt'
+           <<-'TXT'
+ Sitemap: http://www.example.org/sitemap-host1.xml
+ Sitemap: http://www.example.org/sitemap-host2.xml
+
+ User-Agent: MyBot
+ Disallow: /2heavy/
+ Allow: /2heavy/*.html
+ Option1: Foo
+ Option2: Hello
+
+ User-Agent: *
+ Disallow: /2heavy/
+ Allow: /2heavy/*.html
+ Option1: Bar
+ Option3: Hi
+           TXT
+         else
+           raise "#{uri} is not supposed to be fetched"
+         end
+       }
+
+       @robots_mybot = WebRobots.new('MyBot', :http_get => http_get)
+       @robots_hisbot = WebRobots.new('HisBot', :http_get => http_get)
+     end
+
+     should "read options" do
+       options = @robots_mybot.options('http://www.example.org/')
+       assert_equal 2, options.size
+       assert_equal 'Foo', @robots_mybot.option('http://www.example.org/', 'Option1')
+       assert_equal 'Foo', options['option1']
+       assert_equal 'Hello', @robots_mybot.option('http://www.example.org/', 'Option2')
+       assert_equal 'Hello', options['option2']
+
+       options = @robots_hisbot.options('http://www.example.org/')
+       assert_equal 2, options.size
+       assert_equal 'Bar', @robots_hisbot.option('http://www.example.org/', 'Option1')
+       assert_equal 'Bar', options['option1']
+       assert_equal 'Hi', @robots_hisbot.option('http://www.example.org/', 'Option3')
+       assert_equal 'Hi', options['option3']
+
+       assert_equal %w[
+         http://www.example.org/sitemap-host1.xml
+         http://www.example.org/sitemap-host2.xml
+       ], @robots_mybot.sitemaps('http://www.example.org/')
+       assert_equal %w[
+         http://www.example.org/sitemap-host1.xml
+         http://www.example.org/sitemap-host2.xml
+       ], @robots_hisbot.sitemaps('http://www.example.org/')
+     end
+   end
+
+   context "URI validation" do
+     setup do
+       http_get = lambda { |uri|
+         case uri.to_s
+         when 'http://www.example.org/robots.txt'
+           <<-'TXT'
+ User-Agent: *
+ Disallow: /
+           TXT
+         else
+           raise "#{uri} is not supposed to be fetched"
+         end
+       }
+
+       @robots = WebRobots.new('RandomBot', :http_get => http_get)
+     end
+
+     should "validate URI" do
+       assert_raise(ArgumentError) {
+         @robots.allowed?('www.example.org/')
+       }
+       assert_raise(ArgumentError) {
+         @robots.allowed?('::/home/knu')
+       }
+     end
+   end
+
+   context "robots.txt in the real world" do
+     setup do
+       @testbot = WebRobots.new('TestBot')
+       @msnbot = WebRobots.new('TestMSNBot') # matches msnbot
+     end
+
+     should "be parsed for major sites" do
+       assert_nothing_raised {
+         assert !@testbot.allowed?("http://www.google.com/search")
+         assert !@testbot.allowed?("http://www.google.com/news/section?pz=1&cf=all&ned=jp&topic=y&ict=ln")
+         assert @testbot.allowed?("http://www.google.com/news/directory?pz=1&cf=all&ned=us&hl=en&sort=users&category=6")
+       }
+       assert_nothing_raised {
+         assert @testbot.allowed?("http://www.yahoo.com/")
+         assert !@testbot.allowed?("http://www.yahoo.com/?")
+         assert !@testbot.allowed?("http://www.yahoo.com/p/foo")
+       }
+       assert_nothing_raised {
+         assert !@testbot.allowed?("http://store.apple.com/vieworder")
+         assert @msnbot.allowed?("http://store.apple.com/vieworder")
+       }
+       # assert_nothing_raised {
+       assert !@testbot.allowed?("http://github.com/login")
+       # }
+     end
+   end
+ end
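
For reference, the Option1/Option2 extension fields and Sitemap lines exercised in the "robots.txt with options" context above surface through options, option, and sitemaps. The following is a sketch under the same assumptions as before; Crawl-Delay is an illustrative field name standing in for the Option1/Option2 fields used in the tests, and option keys come back downcased, as the assertions above show.

require 'webrobots'

# Same offline fetch hook as in the tests; content is illustrative.
fetch = lambda { |uri|
  <<-'TXT'
Sitemap: http://example.org/sitemap.xml

User-Agent: MyBot
Disallow: /2heavy/
Crawl-Delay: 10
  TXT
}

robots = WebRobots.new('MyBot', :http_get => fetch)
robots.options('http://example.org/')               #=> {"crawl-delay" => "10"}
robots.option('http://example.org/', 'Crawl-Delay') #=> "10"
robots.sitemaps('http://example.org/')              #=> ["http://example.org/sitemap.xml"]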
metadata ADDED
@@ -0,0 +1,155 @@
+ --- !ruby/object:Gem::Specification
+ name: webrobots
+ version: !ruby/object:Gem::Version
+   hash: 29
+   prerelease:
+   segments:
+   - 0
+   - 0
+   - 1
+   version: 0.0.1
+ platform: ruby
+ authors:
+ - Akinori MUSHA
+ autorequire:
+ bindir: bin
+ cert_chain: []
+
+ date: 2011-01-03 00:00:00 +09:00
+ default_executable:
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   type: :runtime
+   version_requirements: &id001 !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         hash: 3
+         segments:
+         - 0
+         version: "0"
+   requirement: *id001
+   prerelease: false
+   name: racc
+ - !ruby/object:Gem::Dependency
+   type: :development
+   version_requirements: &id002 !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         hash: 3
+         segments:
+         - 0
+         version: "0"
+   requirement: *id002
+   prerelease: false
+   name: shoulda
+ - !ruby/object:Gem::Dependency
+   type: :development
+   version_requirements: &id003 !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         hash: 23
+         segments:
+         - 1
+         - 0
+         - 0
+         version: 1.0.0
+   requirement: *id003
+   prerelease: false
+   name: bundler
+ - !ruby/object:Gem::Dependency
+   type: :development
+   version_requirements: &id004 !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         hash: 1
+         segments:
+         - 1
+         - 5
+         - 1
+         version: 1.5.1
+   requirement: *id004
+   prerelease: false
+   name: jeweler
+ - !ruby/object:Gem::Dependency
+   type: :development
+   version_requirements: &id005 !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         hash: 3
+         segments:
+         - 0
+         version: "0"
+   requirement: *id005
+   prerelease: false
+   name: rcov
+ description: |
+   This library helps write robots.txt compliant web robots.
+
+ email: knu@idaemons.org
+ executables: []
+
+ extensions: []
+
+ extra_rdoc_files:
+ - LICENSE.txt
+ - README.rdoc
+ files:
+ - .document
+ - Gemfile
+ - Gemfile.lock
+ - LICENSE.txt
+ - README.rdoc
+ - Rakefile
+ - VERSION
+ - lib/webrobots.rb
+ - lib/webrobots/robotstxt.rb
+ - lib/webrobots/robotstxt.ry
+ - test/helper.rb
+ - test/test_webrobots.rb
+ has_rdoc: true
+ homepage:
+ licenses:
+ - MIT
+ post_install_message:
+ rdoc_options: []
+
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       hash: 3
+       segments:
+       - 0
+       version: "0"
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       hash: 3
+       segments:
+       - 0
+       version: "0"
+ requirements: []
+
+ rubyforge_project:
+ rubygems_version: 1.4.1
+ signing_key:
+ specification_version: 3
+ summary: A library to help write robots.txt compliant web robots
+ test_files:
+ - test/helper.rb
+ - test/test_webrobots.rb
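
As the metadata above shows, racc is the gem's only runtime dependency (the robots.txt grammar ships as lib/webrobots/robotstxt.ry alongside lib/webrobots/robotstxt.rb, presumably generated from it by racc); shoulda, bundler, jeweler, and rcov are development-only. A minimal Gemfile sketch to pin this exact release:

# Gemfile
source 'https://rubygems.org'

gem 'webrobots', '0.0.1'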