webrobots 0.0.1
- data/.document +5 -0
- data/Gemfile +14 -0
- data/Gemfile.lock +22 -0
- data/LICENSE.txt +20 -0
- data/README.rdoc +19 -0
- data/Rakefile +61 -0
- data/VERSION +1 -0
- data/lib/webrobots.rb +135 -0
- data/lib/webrobots/robotstxt.rb +714 -0
- data/lib/webrobots/robotstxt.ry +444 -0
- data/test/helper.rb +18 -0
- data/test/test_webrobots.rb +291 -0
- metadata +155 -0
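For orientation before the diff itself: the test suite below exercises the library's whole public surface. A minimal usage sketch follows; it assumes the gem is installed, and that without an :http_get override the target site's robots.txt is fetched live over HTTP, so treat it as illustrative rather than as documentation of the parser internals.

  require 'webrobots'

  # Name your crawler. The tests suggest robots.txt User-Agent tokens are
  # matched case-insensitively against the bot name ('GoodBot' matches a
  # "User-Agent: good" record, 'TestMSNBot' matches "msnbot").
  robots = WebRobots.new('MyBot')
  # A :http_get => lambda { |uri| ... } option can be passed to stub or
  # customize fetching, exactly as the tests do.

  url = 'http://www.example.org/index.html'
  robots.allowed?(url)            # => true/false; expects an absolute HTTP URL
                                  #    (ArgumentError otherwise) and may raise
                                  #    WebRobots::ParseError on a broken robots.txt
  robots.option(url, 'Option1')   # => value of a non-standard field for this bot, if any
  robots.options(url)             # => hash of those fields, keys downcased
  robots.sitemaps(url)            # => array of Sitemap URLs declared site-wide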
data/test/test_webrobots.rb
ADDED
@@ -0,0 +1,291 @@
+require 'helper'
+
+class TestWebRobots < Test::Unit::TestCase
+  context "robots.txt with no rules" do
+    setup do
+      @robots = WebRobots.new('RandomBot', :http_get => lambda { |uri|
+          case uri.to_s
+          when 'http://site1.example.org/robots.txt'
+            <<-'TXT'
+            TXT
+          when 'http://site2.example.org/robots.txt'
+            <<-'TXT'
+
+
+            TXT
+          when 'http://site3.example.org/robots.txt'
+            <<-'TXT'
+
+#comment
+            TXT
+          when 'http://site4.example.org/robots.txt'
+            <<-'TXT'
+
+#comment
+
+            TXT
+          when 'http://site5.example.org/robots.txt'
+            raise Net::HTTPNotFound
+          else
+            raise "#{uri} is not supposed to be fetched"
+          end
+        })
+    end
+
+    should "allow any robot" do
+      assert @robots.allowed?('http://site1.example.org/index.html')
+      assert @robots.allowed?('http://site1.example.org/private/secret.txt')
+      assert @robots.allowed?('http://site2.example.org/index.html')
+      assert @robots.allowed?('http://site2.example.org/private/secret.txt')
+      assert @robots.allowed?('http://site3.example.org/index.html')
+      assert @robots.allowed?('http://site3.example.org/private/secret.txt')
+      assert @robots.allowed?('http://site4.example.org/index.html')
+      assert @robots.allowed?('http://site4.example.org/private/secret.txt')
+    end
+  end
+
+  context "robots.txt with some rules" do
+    setup do
+      http_get = lambda { |uri|
+        case uri.to_s
+        when 'http://www.example.org/robots.txt'
+          <<-'TXT'
+# Punish evil bots
+User-Agent: evil
+Disallow: /
+
+User-Agent: good
+# Be generous to good bots
+Disallow: /2heavy/
+Allow: /2heavy/*.htm
+Disallow: /2heavy/*.htm$
+
+User-Agent: *
+Disallow: /2heavy/
+Disallow: /index.html
+# Allow takes precedence over Disallow if the pattern lengths are the same.
+Allow: /index.html
+          TXT
+        when 'http://www.example.com/robots.txt'
+          <<-'TXT'
+# Default rule is evaluated last even if it is put first.
+User-Agent: *
+Disallow: /2heavy/
+Disallow: /index.html
+# Allow takes precedence over Disallow if the pattern lengths are the same.
+Allow: /index.html
+
+# Punish evil bots
+User-Agent: evil
+Disallow: /
+
+User-Agent: good
+# Be generous to good bots
+Disallow: /2heavy/
+Allow: /2heavy/*.htm
+Disallow: /2heavy/*.htm$
+          TXT
+        else
+          raise "#{uri} is not supposed to be fetched"
+        end
+      }
+
+      @robots = WebRobots.new('RandomBot', :http_get => http_get)
+      @robots_good = WebRobots.new('GoodBot', :http_get => http_get)
+      @robots_evil = WebRobots.new('EvilBot', :http_get => http_get)
+    end
+
+    should "properly restrict access" do
+      assert @robots_good.allowed?('http://www.example.org/index.html')
+      assert !@robots_good.allowed?('http://www.example.org/2heavy/index.php')
+      assert @robots_good.allowed?('http://www.example.org/2heavy/index.html')
+      assert !@robots_good.allowed?('http://www.example.org/2heavy/index.htm')
+
+      assert !@robots_evil.allowed?('http://www.example.org/index.html')
+      assert !@robots_evil.allowed?('http://www.example.org/2heavy/index.php')
+      assert !@robots_evil.allowed?('http://www.example.org/2heavy/index.html')
+      assert !@robots_evil.allowed?('http://www.example.org/2heavy/index.htm')
+
+      assert @robots.allowed?('http://www.example.org/index.html')
+      assert !@robots.allowed?('http://www.example.org/2heavy/index.php')
+      assert !@robots.allowed?('http://www.example.org/2heavy/index.html')
+      assert !@robots.allowed?('http://www.example.org/2heavy/index.htm')
+
+      assert @robots_good.allowed?('http://www.example.com/index.html')
+      assert !@robots_good.allowed?('http://www.example.com/2heavy/index.php')
+      assert @robots_good.allowed?('http://www.example.com/2heavy/index.html')
+      assert !@robots_good.allowed?('http://www.example.com/2heavy/index.htm')
+
+      assert !@robots_evil.allowed?('http://www.example.com/index.html')
+      assert !@robots_evil.allowed?('http://www.example.com/2heavy/index.php')
+      assert !@robots_evil.allowed?('http://www.example.com/2heavy/index.html')
+      assert !@robots_evil.allowed?('http://www.example.com/2heavy/index.htm')
+
+      assert @robots.allowed?('http://www.example.com/index.html')
+      assert !@robots.allowed?('http://www.example.com/2heavy/index.php')
+      assert !@robots.allowed?('http://www.example.com/2heavy/index.html')
+      assert !@robots.allowed?('http://www.example.com/2heavy/index.htm')
+    end
+  end
+
+  context "robots.txt with errors" do
+    setup do
+      @http_get = lambda { |uri|
+        case uri.to_s
+        when 'http://www.example.org/robots.txt'
+          <<-'TXT'
+# some comment
+User-Agent: first
+# Disallow: /
+Disallow: /2heavy/
+# Allow: /2heavy/notsoheavy
+Allow: /2heavy/*.html
+#
+User-Agent: next
+# Disallow: /
+Disallow: /2heavy/
+# Allow: /2heavy/notsoheavy
+Allow: /2heavy/*.html
+          TXT
+        when 'http://www.example.com/robots.txt'
+          <<-'TXT'
+# some comment
+#User-Agent: first
+# Disallow: /
+Disallow: /2heavy/
+# Allow: /2heavy/notsoheavy
+Allow: /2heavy/*.html
+
+User-Agent: next
+# Disallow: /
+Disallow: /2heavy/
+# Allow: /2heavy/notsoheavy
+Allow: /2heavy/*.html
+          TXT
+        else
+          raise "#{uri} is not supposed to be fetched"
+        end
+      }
+    end
+
+    should "raise ParseError" do
+      robots = WebRobots.new('RandomBot', :http_get => @http_get)
+      assert_raise(WebRobots::ParseError) {
+        robots.allowed?('http://www.example.org/2heavy/index.html')
+      }
+      assert_raise(WebRobots::ParseError) {
+        robots.allowed?('http://www.example.com/2heavy/index.html')
+      }
+    end
+  end
+
+  context "robots.txt with options" do
+    setup do
+      http_get = lambda { |uri|
+        case uri.to_s
+        when 'http://www.example.org/robots.txt'
+          <<-'TXT'
+Sitemap: http://www.example.org/sitemap-host1.xml
+Sitemap: http://www.example.org/sitemap-host2.xml
+
+User-Agent: MyBot
+Disallow: /2heavy/
+Allow: /2heavy/*.html
+Option1: Foo
+Option2: Hello
+
+User-Agent: *
+Disallow: /2heavy/
+Allow: /2heavy/*.html
+Option1: Bar
+Option3: Hi
+          TXT
+        else
+          raise "#{uri} is not supposed to be fetched"
+        end
+      }
+
+      @robots_mybot = WebRobots.new('MyBot', :http_get => http_get)
+      @robots_hisbot = WebRobots.new('HisBot', :http_get => http_get)
+    end
+
+    should "read options" do
+      options = @robots_mybot.options('http://www.example.org/')
+      assert_equal 2, options.size
+      assert_equal 'Foo', @robots_mybot.option('http://www.example.org/', 'Option1')
+      assert_equal 'Foo', options['option1']
+      assert_equal 'Hello', @robots_mybot.option('http://www.example.org/', 'Option2')
+      assert_equal 'Hello', options['option2']
+
+      options = @robots_hisbot.options('http://www.example.org/')
+      assert_equal 2, options.size
+      assert_equal 'Bar', @robots_hisbot.option('http://www.example.org/', 'Option1')
+      assert_equal 'Bar', options['option1']
+      assert_equal 'Hi', @robots_hisbot.option('http://www.example.org/', 'Option3')
+      assert_equal 'Hi', options['option3']
+
+      assert_equal %w[
+        http://www.example.org/sitemap-host1.xml
+        http://www.example.org/sitemap-host2.xml
+      ], @robots_mybot.sitemaps('http://www.example.org/')
+      assert_equal %w[
+        http://www.example.org/sitemap-host1.xml
+        http://www.example.org/sitemap-host2.xml
+      ], @robots_hisbot.sitemaps('http://www.example.org/')
+    end
+  end
+
+  context "robots.txt with options" do
+    setup do
+      http_get = lambda { |uri|
+        case uri.to_s
+        when 'http://www.example.org/robots.txt'
+          <<-'TXT'
+User-Agent: *
+Disallow: /
+          TXT
+        else
+          raise "#{uri} is not supposed to be fetched"
+        end
+      }
+
+      @robots = WebRobots.new('RandomBot', :http_get => http_get)
+    end
+
+    should "validate URI" do
+      assert_raise(ArgumentError) {
+        @robots.allowed?('www.example.org/')
+      }
+      assert_raise(ArgumentError) {
+        @robots.allowed?('::/home/knu')
+      }
+    end
+  end
+
+  context "robots.txt in the real world" do
+    setup do
+      @testbot = WebRobots.new('TestBot')
+      @msnbot = WebRobots.new('TestMSNBot') # matches msnbot
+    end
+
+    should "be parsed for major sites" do
+      assert_nothing_raised {
+        assert !@testbot.allowed?("http://www.google.com/search")
+        assert !@testbot.allowed?("http://www.google.com/news/section?pz=1&cf=all&ned=jp&topic=y&ict=ln")
+        assert @testbot.allowed?("http://www.google.com/news/directory?pz=1&cf=all&ned=us&hl=en&sort=users&category=6")
+      }
+      assert_nothing_raised {
+        assert @testbot.allowed?("http://www.yahoo.com/")
+        assert !@testbot.allowed?("http://www.yahoo.com/?")
+        assert !@testbot.allowed?("http://www.yahoo.com/p/foo")
+      }
+      assert_nothing_raised {
+        assert !@testbot.allowed?("http://store.apple.com/vieworder")
+        assert @msnbot.allowed?("http://store.apple.com/vieworder")
+      }
+      # assert_nothing_raised {
+        assert !@testbot.allowed?("http://github.com/login")
+      # }
+    end
+  end
+end
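Read together with its assertions, the www.example.org fixture in "robots.txt with some rules" spells out the precedence semantics the parser is expected to implement. The sketch below restates those assertions for convenience; the explanatory comments are an interpretation of the fixture, and http_get stands for the stub defined in that setup block.

  good   = WebRobots.new('GoodBot',   :http_get => http_get)  # matches the "good" record
  random = WebRobots.new('RandomBot', :http_get => http_get)  # falls through to "User-Agent: *"

  good.allowed?('http://www.example.org/2heavy/index.php')    # => false  Disallow: /2heavy/
  good.allowed?('http://www.example.org/2heavy/index.html')   # => true   the longer Allow: /2heavy/*.htm overrides it
  good.allowed?('http://www.example.org/2heavy/index.htm')    # => false  Disallow: /2heavy/*.htm$ is more specific still
  random.allowed?('http://www.example.org/index.html')        # => true   Allow and Disallow: /index.html tie in length; Allow wins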
metadata
ADDED
@@ -0,0 +1,155 @@
+--- !ruby/object:Gem::Specification
+name: webrobots
+version: !ruby/object:Gem::Version
+  hash: 29
+  prerelease:
+  segments:
+  - 0
+  - 0
+  - 1
+  version: 0.0.1
+platform: ruby
+authors:
+- Akinori MUSHA
+autorequire:
+bindir: bin
+cert_chain: []
+
+date: 2011-01-03 00:00:00 +09:00
+default_executable:
+dependencies:
+- !ruby/object:Gem::Dependency
+  type: :runtime
+  version_requirements: &id001 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        hash: 3
+        segments:
+        - 0
+        version: "0"
+  requirement: *id001
+  prerelease: false
+  name: racc
+- !ruby/object:Gem::Dependency
+  type: :development
+  version_requirements: &id002 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        hash: 3
+        segments:
+        - 0
+        version: "0"
+  requirement: *id002
+  prerelease: false
+  name: shoulda
+- !ruby/object:Gem::Dependency
+  type: :development
+  version_requirements: &id003 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        hash: 23
+        segments:
+        - 1
+        - 0
+        - 0
+        version: 1.0.0
+  requirement: *id003
+  prerelease: false
+  name: bundler
+- !ruby/object:Gem::Dependency
+  type: :development
+  version_requirements: &id004 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        hash: 1
+        segments:
+        - 1
+        - 5
+        - 1
+        version: 1.5.1
+  requirement: *id004
+  prerelease: false
+  name: jeweler
+- !ruby/object:Gem::Dependency
+  type: :development
+  version_requirements: &id005 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        hash: 3
+        segments:
+        - 0
+        version: "0"
+  requirement: *id005
+  prerelease: false
+  name: rcov
+description: |
+  This library helps write robots.txt compliant web robots.
+
+email: knu@idaemons.org
+executables: []
+
+extensions: []
+
+extra_rdoc_files:
+- LICENSE.txt
+- README.rdoc
+files:
+- .document
+- Gemfile
+- Gemfile.lock
+- LICENSE.txt
+- README.rdoc
+- Rakefile
+- VERSION
+- lib/webrobots.rb
+- lib/webrobots/robotstxt.rb
+- lib/webrobots/robotstxt.ry
+- test/helper.rb
+- test/test_webrobots.rb
+has_rdoc: true
+homepage:
+licenses:
+- MIT
+post_install_message:
+rdoc_options: []
+
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      hash: 3
+      segments:
+      - 0
+      version: "0"
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      hash: 3
+      segments:
+      - 0
+      version: "0"
+requirements: []
+
+rubyforge_project:
+rubygems_version: 1.4.1
+signing_key:
+specification_version: 3
+summary: A library to help write robots.txt compliant web robots
+test_files:
+- test/helper.rb
+- test/test_webrobots.rb
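For completeness, consuming this release from a project would look roughly like the lines below. The source URL and exact version pin are illustrative assumptions; the gemspec above declares racc as the only runtime dependency.

  # Gemfile
  source 'http://rubygems.org'

  gem 'webrobots', '0.0.1'   # pulls in racc, the sole runtime dependency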