webrobots 0.0.1

test/test_webrobots.rb ADDED
@@ -0,0 +1,291 @@
+require 'helper'
+
+class TestWebRobots < Test::Unit::TestCase
+  context "robots.txt with no rules" do
+    setup do
+      @robots = WebRobots.new('RandomBot', :http_get => lambda { |uri|
+        case uri.to_s
+        when 'http://site1.example.org/robots.txt'
+          <<-'TXT'
+          TXT
+        when 'http://site2.example.org/robots.txt'
+          <<-'TXT'
+
+
+          TXT
+        when 'http://site3.example.org/robots.txt'
+          <<-'TXT'
+
+#comment
+          TXT
+        when 'http://site4.example.org/robots.txt'
+          <<-'TXT'
+
+#comment
+
+          TXT
+        when 'http://site5.example.org/robots.txt'
+          raise Net::HTTPNotFound
+        else
+          raise "#{uri} is not supposed to be fetched"
+        end
+      })
+    end
+
+    should "allow any robot" do
+      assert @robots.allowed?('http://site1.example.org/index.html')
+      assert @robots.allowed?('http://site1.example.org/private/secret.txt')
+      assert @robots.allowed?('http://site2.example.org/index.html')
+      assert @robots.allowed?('http://site2.example.org/private/secret.txt')
+      assert @robots.allowed?('http://site3.example.org/index.html')
+      assert @robots.allowed?('http://site3.example.org/private/secret.txt')
+      assert @robots.allowed?('http://site4.example.org/index.html')
+      assert @robots.allowed?('http://site4.example.org/private/secret.txt')
+    end
+  end
+
+  context "robots.txt with some rules" do
+    setup do
+      http_get = lambda { |uri|
+        case uri.to_s
+        when 'http://www.example.org/robots.txt'
+          <<-'TXT'
+# Punish evil bots
+User-Agent: evil
+Disallow: /
+
+User-Agent: good
+# Be generous to good bots
+Disallow: /2heavy/
+Allow: /2heavy/*.htm
+Disallow: /2heavy/*.htm$
+
+User-Agent: *
+Disallow: /2heavy/
+Disallow: /index.html
+# Allow takes precedence over Disallow if the pattern lengths are the same.
+Allow: /index.html
+          TXT
+        when 'http://www.example.com/robots.txt'
+          <<-'TXT'
+# Default rule is evaluated last even if it is put first.
+User-Agent: *
+Disallow: /2heavy/
+Disallow: /index.html
+# Allow takes precedence over Disallow if the pattern lengths are the same.
+Allow: /index.html
+
+# Punish evil bots
+User-Agent: evil
+Disallow: /
+
+User-Agent: good
+# Be generous to good bots
+Disallow: /2heavy/
+Allow: /2heavy/*.htm
+Disallow: /2heavy/*.htm$
+          TXT
+        else
+          raise "#{uri} is not supposed to be fetched"
+        end
+      }
+
+      @robots = WebRobots.new('RandomBot', :http_get => http_get)
+      @robots_good = WebRobots.new('GoodBot', :http_get => http_get)
+      @robots_evil = WebRobots.new('EvilBot', :http_get => http_get)
+    end
+
+    should "properly restrict access" do
+      assert @robots_good.allowed?('http://www.example.org/index.html')
+      assert !@robots_good.allowed?('http://www.example.org/2heavy/index.php')
+      assert @robots_good.allowed?('http://www.example.org/2heavy/index.html')
+      assert !@robots_good.allowed?('http://www.example.org/2heavy/index.htm')
+
+      assert !@robots_evil.allowed?('http://www.example.org/index.html')
+      assert !@robots_evil.allowed?('http://www.example.org/2heavy/index.php')
+      assert !@robots_evil.allowed?('http://www.example.org/2heavy/index.html')
+      assert !@robots_evil.allowed?('http://www.example.org/2heavy/index.htm')
+
+      assert @robots.allowed?('http://www.example.org/index.html')
+      assert !@robots.allowed?('http://www.example.org/2heavy/index.php')
+      assert !@robots.allowed?('http://www.example.org/2heavy/index.html')
+      assert !@robots.allowed?('http://www.example.org/2heavy/index.htm')
+
+      assert @robots_good.allowed?('http://www.example.com/index.html')
+      assert !@robots_good.allowed?('http://www.example.com/2heavy/index.php')
+      assert @robots_good.allowed?('http://www.example.com/2heavy/index.html')
+      assert !@robots_good.allowed?('http://www.example.com/2heavy/index.htm')
+
+      assert !@robots_evil.allowed?('http://www.example.com/index.html')
+      assert !@robots_evil.allowed?('http://www.example.com/2heavy/index.php')
+      assert !@robots_evil.allowed?('http://www.example.com/2heavy/index.html')
+      assert !@robots_evil.allowed?('http://www.example.com/2heavy/index.htm')
+
+      assert @robots.allowed?('http://www.example.com/index.html')
+      assert !@robots.allowed?('http://www.example.com/2heavy/index.php')
+      assert !@robots.allowed?('http://www.example.com/2heavy/index.html')
+      assert !@robots.allowed?('http://www.example.com/2heavy/index.htm')
+    end
+  end
+
+  context "robots.txt with errors" do
+    setup do
+      @http_get = lambda { |uri|
+        case uri.to_s
+        when 'http://www.example.org/robots.txt'
+          <<-'TXT'
+# some comment
+User-Agent: first
+# Disallow: /
+Disallow: /2heavy/
+# Allow: /2heavy/notsoheavy
+Allow: /2heavy/*.html
+#
+User-Agent: next
+# Disallow: /
+Disallow: /2heavy/
+# Allow: /2heavy/notsoheavy
+Allow: /2heavy/*.html
+          TXT
+        when 'http://www.example.com/robots.txt'
+          <<-'TXT'
+# some comment
+#User-Agent: first
+# Disallow: /
+Disallow: /2heavy/
+# Allow: /2heavy/notsoheavy
+Allow: /2heavy/*.html
+
+User-Agent: next
+# Disallow: /
+Disallow: /2heavy/
+# Allow: /2heavy/notsoheavy
+Allow: /2heavy/*.html
+          TXT
+        else
+          raise "#{uri} is not supposed to be fetched"
+        end
+      }
+    end
+
+    should "raise ParseError" do
+      robots = WebRobots.new('RandomBot', :http_get => @http_get)
+      assert_raise(WebRobots::ParseError) {
+        robots.allowed?('http://www.example.org/2heavy/index.html')
+      }
+      assert_raise(WebRobots::ParseError) {
+        robots.allowed?('http://www.example.com/2heavy/index.html')
+      }
+    end
+  end
+
+  context "robots.txt with options" do
+    setup do
+      http_get = lambda { |uri|
+        case uri.to_s
+        when 'http://www.example.org/robots.txt'
+          <<-'TXT'
+Sitemap: http://www.example.org/sitemap-host1.xml
+Sitemap: http://www.example.org/sitemap-host2.xml
+
+User-Agent: MyBot
+Disallow: /2heavy/
+Allow: /2heavy/*.html
+Option1: Foo
+Option2: Hello
+
+User-Agent: *
+Disallow: /2heavy/
+Allow: /2heavy/*.html
+Option1: Bar
+Option3: Hi
+          TXT
+        else
+          raise "#{uri} is not supposed to be fetched"
+        end
+      }
+
+      @robots_mybot = WebRobots.new('MyBot', :http_get => http_get)
+      @robots_hisbot = WebRobots.new('HisBot', :http_get => http_get)
+    end
+
+    should "read options" do
+      options = @robots_mybot.options('http://www.example.org/')
+      assert_equal 2, options.size
+      assert_equal 'Foo', @robots_mybot.option('http://www.example.org/', 'Option1')
+      assert_equal 'Foo', options['option1']
+      assert_equal 'Hello', @robots_mybot.option('http://www.example.org/', 'Option2')
+      assert_equal 'Hello', options['option2']
+
+      options = @robots_hisbot.options('http://www.example.org/')
+      assert_equal 2, options.size
+      assert_equal 'Bar', @robots_hisbot.option('http://www.example.org/', 'Option1')
+      assert_equal 'Bar', options['option1']
+      assert_equal 'Hi', @robots_hisbot.option('http://www.example.org/', 'Option3')
+      assert_equal 'Hi', options['option3']
+
+      assert_equal %w[
+        http://www.example.org/sitemap-host1.xml
+        http://www.example.org/sitemap-host2.xml
+      ], @robots_mybot.sitemaps('http://www.example.org/')
+      assert_equal %w[
+        http://www.example.org/sitemap-host1.xml
+        http://www.example.org/sitemap-host2.xml
+      ], @robots_hisbot.sitemaps('http://www.example.org/')
+    end
+  end
+
+  context "robots.txt with options" do
+    setup do
+      http_get = lambda { |uri|
+        case uri.to_s
+        when 'http://www.example.org/robots.txt'
+          <<-'TXT'
+User-Agent: *
+Disallow: /
+          TXT
+        else
+          raise "#{uri} is not supposed to be fetched"
+        end
+      }
+
+      @robots = WebRobots.new('RandomBot', :http_get => http_get)
+    end
+
+    should "validate URI" do
+      assert_raise(ArgumentError) {
+        @robots.allowed?('www.example.org/')
+      }
+      assert_raise(ArgumentError) {
+        @robots.allowed?('::/home/knu')
+      }
+    end
+  end
+
+  context "robots.txt in the real world" do
+    setup do
+      @testbot = WebRobots.new('TestBot')
+      @msnbot = WebRobots.new('TestMSNBot') # matches msnbot
+    end
+
+    should "be parsed for major sites" do
+      assert_nothing_raised {
+        assert !@testbot.allowed?("http://www.google.com/search")
+        assert !@testbot.allowed?("http://www.google.com/news/section?pz=1&cf=all&ned=jp&topic=y&ict=ln")
+        assert @testbot.allowed?("http://www.google.com/news/directory?pz=1&cf=all&ned=us&hl=en&sort=users&category=6")
+      }
+      assert_nothing_raised {
+        assert @testbot.allowed?("http://www.yahoo.com/")
+        assert !@testbot.allowed?("http://www.yahoo.com/?")
+        assert !@testbot.allowed?("http://www.yahoo.com/p/foo")
+      }
+      assert_nothing_raised {
+        assert !@testbot.allowed?("http://store.apple.com/vieworder")
+        assert @msnbot.allowed?("http://store.apple.com/vieworder")
+      }
+      # assert_nothing_raised {
+      assert !@testbot.allowed?("http://github.com/login")
+      # }
+    end
+  end
+end
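
Taken together, the test file above exercises the public surface of this release: WebRobots.new(user_agent, :http_get => ...), #allowed?, #options/#option, and #sitemaps. The sketch below is not part of the gem; it is a hypothetical caller (the crawler name, URLs, and robots.txt body are invented for illustration) showing how that API is driven, with the :http_get hook stubbed the same way the tests stub it.

require 'webrobots'

# Hypothetical stub in place of a real HTTP GET; the tests above use the
# same :http_get hook to feed canned robots.txt bodies to the library.
fetch_robots_txt = lambda { |uri|
  <<-'TXT'
Sitemap: http://example.org/sitemap.xml

User-Agent: *
Disallow: /private/
  TXT
}

robots = WebRobots.new('MyCrawler/1.0', :http_get => fetch_robots_txt)

robots.allowed?('http://example.org/index.html')        #=> true
robots.allowed?('http://example.org/private/data.txt')  #=> false
robots.sitemaps('http://example.org/')                  #=> ["http://example.org/sitemap.xml"]
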
metadata ADDED
@@ -0,0 +1,155 @@
+--- !ruby/object:Gem::Specification
+name: webrobots
+version: !ruby/object:Gem::Version
+  hash: 29
+  prerelease:
+  segments:
+  - 0
+  - 0
+  - 1
+  version: 0.0.1
+platform: ruby
+authors:
+- Akinori MUSHA
+autorequire:
+bindir: bin
+cert_chain: []
+
+date: 2011-01-03 00:00:00 +09:00
+default_executable:
+dependencies:
+- !ruby/object:Gem::Dependency
+  type: :runtime
+  version_requirements: &id001 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        hash: 3
+        segments:
+        - 0
+        version: "0"
+  requirement: *id001
+  prerelease: false
+  name: racc
+- !ruby/object:Gem::Dependency
+  type: :development
+  version_requirements: &id002 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        hash: 3
+        segments:
+        - 0
+        version: "0"
+  requirement: *id002
+  prerelease: false
+  name: shoulda
+- !ruby/object:Gem::Dependency
+  type: :development
+  version_requirements: &id003 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        hash: 23
+        segments:
+        - 1
+        - 0
+        - 0
+        version: 1.0.0
+  requirement: *id003
+  prerelease: false
+  name: bundler
+- !ruby/object:Gem::Dependency
+  type: :development
+  version_requirements: &id004 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        hash: 1
+        segments:
+        - 1
+        - 5
+        - 1
+        version: 1.5.1
+  requirement: *id004
+  prerelease: false
+  name: jeweler
+- !ruby/object:Gem::Dependency
+  type: :development
+  version_requirements: &id005 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        hash: 3
+        segments:
+        - 0
+        version: "0"
+  requirement: *id005
+  prerelease: false
+  name: rcov
+description: |
+  This library helps write robots.txt compliant web robots.
+
+email: knu@idaemons.org
+executables: []
+
+extensions: []
+
+extra_rdoc_files:
+- LICENSE.txt
+- README.rdoc
+files:
+- .document
+- Gemfile
+- Gemfile.lock
+- LICENSE.txt
+- README.rdoc
+- Rakefile
+- VERSION
+- lib/webrobots.rb
+- lib/webrobots/robotstxt.rb
+- lib/webrobots/robotstxt.ry
+- test/helper.rb
+- test/test_webrobots.rb
+has_rdoc: true
+homepage:
+licenses:
+- MIT
+post_install_message:
+rdoc_options: []
+
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      hash: 3
+      segments:
+      - 0
+      version: "0"
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      hash: 3
+      segments:
+      - 0
+      version: "0"
+requirements: []
+
+rubyforge_project:
+rubygems_version: 1.4.1
+signing_key:
+specification_version: 3
+summary: A library to help write robots.txt compliant web robots
+test_files:
+- test/helper.rb
+- test/test_webrobots.rb
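
The YAML above is the serialized Gem::Specification that RubyGems packages inside the .gem file. For readers more used to the gemspec DSL, a roughly equivalent hand-written specification would look like the sketch below; it is reconstructed from the metadata for illustration only (the project itself appears to generate its gemspec, judging from the Jeweler development dependency), not copied from the gem's sources.

# Approximate gemspec DSL equivalent of the YAML metadata above (illustrative only).
Gem::Specification.new do |s|
  s.name        = 'webrobots'
  s.version     = '0.0.1'
  s.authors     = ['Akinori MUSHA']
  s.email       = 'knu@idaemons.org'
  s.summary     = 'A library to help write robots.txt compliant web robots'
  s.description = "This library helps write robots.txt compliant web robots.\n"
  s.licenses    = ['MIT']
  s.files       = %w[
    .document Gemfile Gemfile.lock LICENSE.txt README.rdoc Rakefile VERSION
    lib/webrobots.rb lib/webrobots/robotstxt.rb lib/webrobots/robotstxt.ry
    test/helper.rb test/test_webrobots.rb
  ]
  s.test_files  = %w[test/helper.rb test/test_webrobots.rb]

  # Runtime and development dependencies as declared in the metadata.
  s.add_runtime_dependency     'racc'
  s.add_development_dependency 'shoulda'
  s.add_development_dependency 'bundler', '~> 1.0.0'
  s.add_development_dependency 'jeweler', '~> 1.5.1'
  s.add_development_dependency 'rcov'
end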