webrobots 0.0.3 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- data/VERSION +1 -1
- data/lib/webrobots/robotstxt.rb +10 -2
- data/lib/webrobots/robotstxt.ry +10 -2
- data/test/test_webrobots.rb +77 -0
- data/webrobots.gemspec +2 -2
- metadata +4 -4
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.3
|
1
|
+
0.0.4
|
data/lib/webrobots/robotstxt.rb
CHANGED
@@ -46,6 +46,9 @@ module_eval(<<'...end robotstxt.ry/module_eval...', 'robotstxt.ry', 164)
|
|
46
46
|
|
47
47
|
until s.eos?
|
48
48
|
if t = s.scan(/[ \t]*\r?\n/)
|
49
|
+
if value_expected
|
50
|
+
@q << [:VALUE, '']
|
51
|
+
end
|
49
52
|
@q << [:EOL, t]
|
50
53
|
value_expected = false
|
51
54
|
elsif t = s.scan(/[ \t]+/)
|
@@ -54,6 +57,9 @@ module_eval(<<'...end robotstxt.ry/module_eval...', 'robotstxt.ry', 164)
|
|
54
57
|
@q << [t, t]
|
55
58
|
value_expected = true
|
56
59
|
elsif t = s.scan(/#.*/)
|
60
|
+
if value_expected
|
61
|
+
@q << [:VALUE, '']
|
62
|
+
end
|
57
63
|
@q << [:COMMENT, t]
|
58
64
|
else
|
59
65
|
if value_expected
|
@@ -668,7 +674,7 @@ end # class Parser
|
|
668
674
|
re_src << '\z'
|
669
675
|
break
|
670
676
|
else
|
671
|
-
|
677
|
+
re_src << Regexp.quote(s.scan(/./))
|
672
678
|
end
|
673
679
|
end
|
674
680
|
@pattern = Regexp.new(re_src, Regexp::MULTILINE)
|
@@ -676,7 +682,9 @@ end # class Parser
|
|
676
682
|
end
|
677
683
|
|
678
684
|
def match?(request_uri)
|
679
|
-
|
685
|
+
return false if @empty
|
686
|
+
transformed = request_uri.gsub(/(%2[fF])|%([0-9a-f]{2})/i) { $1 || '%c' % $2.to_i(16) }
|
687
|
+
!!@pattern.match(transformed)
|
680
688
|
end
|
681
689
|
end
|
682
690
|
|
data/lib/webrobots/robotstxt.ry
CHANGED
@@ -186,6 +186,9 @@ class WebRobots
|
|
186
186
|
|
187
187
|
until s.eos?
|
188
188
|
if t = s.scan(/[ \t]*\r?\n/)
|
189
|
+
if value_expected
|
190
|
+
@q << [:VALUE, '']
|
191
|
+
end
|
189
192
|
@q << [:EOL, t]
|
190
193
|
value_expected = false
|
191
194
|
elsif t = s.scan(/[ \t]+/)
|
@@ -194,6 +197,9 @@ class WebRobots
|
|
194
197
|
@q << [t, t]
|
195
198
|
value_expected = true
|
196
199
|
elsif t = s.scan(/#.*/)
|
200
|
+
if value_expected
|
201
|
+
@q << [:VALUE, '']
|
202
|
+
end
|
197
203
|
@q << [:COMMENT, t]
|
198
204
|
else
|
199
205
|
if value_expected
|
@@ -398,7 +404,7 @@ class WebRobots
|
|
398
404
|
re_src << '\z'
|
399
405
|
break
|
400
406
|
else
|
401
|
-
|
407
|
+
re_src << Regexp.quote(s.scan(/./))
|
402
408
|
end
|
403
409
|
end
|
404
410
|
@pattern = Regexp.new(re_src, Regexp::MULTILINE)
|
@@ -406,7 +412,9 @@ class WebRobots
|
|
406
412
|
end
|
407
413
|
|
408
414
|
def match?(request_uri)
|
409
|
-
|
415
|
+
return false if @empty
|
416
|
+
transformed = request_uri.gsub(/(%2[fF])|%([0-9a-f]{2})/i) { $1 || '%c' % $2.to_i(16) }
|
417
|
+
!!@pattern.match(transformed)
|
410
418
|
end
|
411
419
|
end
|
412
420
|
|
data/test/test_webrobots.rb
CHANGED
@@ -85,6 +85,46 @@ Disallow: /2heavy/
|
|
85
85
|
Allow: /2heavy/*.htm
|
86
86
|
Disallow: /2heavy/*.htm$
|
87
87
|
TXT
|
88
|
+
when 'http://koster1.example.net/robots.txt'
|
89
|
+
<<-'TXT'
|
90
|
+
User-Agent: *
|
91
|
+
Disallow: /tmp
|
92
|
+
TXT
|
93
|
+
when 'http://koster2.example.net/robots.txt'
|
94
|
+
<<-'TXT'
|
95
|
+
User-Agent: *
|
96
|
+
Disallow: /tmp/
|
97
|
+
TXT
|
98
|
+
when 'http://koster3.example.net/robots.txt'
|
99
|
+
<<-'TXT'
|
100
|
+
User-Agent: *
|
101
|
+
Disallow: /a%3cd.html
|
102
|
+
TXT
|
103
|
+
when 'http://koster4.example.net/robots.txt'
|
104
|
+
<<-'TXT'
|
105
|
+
User-Agent: *
|
106
|
+
Disallow: /a%3Cd.html
|
107
|
+
TXT
|
108
|
+
when 'http://koster5.example.net/robots.txt'
|
109
|
+
<<-'TXT'
|
110
|
+
User-Agent: *
|
111
|
+
Disallow: /a%2fb.html
|
112
|
+
TXT
|
113
|
+
when 'http://koster6.example.net/robots.txt'
|
114
|
+
<<-'TXT'
|
115
|
+
User-Agent: *
|
116
|
+
Disallow: /a/b.html
|
117
|
+
TXT
|
118
|
+
when 'http://koster7.example.net/robots.txt'
|
119
|
+
<<-'TXT'
|
120
|
+
User-Agent: *
|
121
|
+
Disallow: /%7ejoe/index.html
|
122
|
+
TXT
|
123
|
+
when 'http://koster8.example.net/robots.txt'
|
124
|
+
<<-'TXT'
|
125
|
+
User-Agent: *
|
126
|
+
Disallow: /~joe/index.html
|
127
|
+
TXT
|
88
128
|
else
|
89
129
|
raise "#{uri} is not supposed to be fetched"
|
90
130
|
end
|
@@ -126,6 +166,32 @@ Disallow: /2heavy/*.htm$
|
|
126
166
|
assert !@robots.allowed?('http://www.example.com/2heavy/index.html')
|
127
167
|
assert !@robots.allowed?('http://www.example.com/2heavy/index.htm')
|
128
168
|
end
|
169
|
+
|
170
|
+
should "follow what is said in Koster's draft" do
|
171
|
+
assert @robots.disallowed?('http://koster1.example.net/tmp')
|
172
|
+
assert @robots.disallowed?('http://koster1.example.net/tmp.html')
|
173
|
+
assert @robots.disallowed?('http://koster1.example.net/tmp/a.html')
|
174
|
+
|
175
|
+
assert !@robots.disallowed?('http://koster2.example.net/tmp')
|
176
|
+
assert @robots.disallowed?('http://koster2.example.net/tmp/')
|
177
|
+
assert @robots.disallowed?('http://koster2.example.net/tmp/a.html')
|
178
|
+
|
179
|
+
assert @robots.disallowed?('http://koster3.example.net/a%3cd.html')
|
180
|
+
assert @robots.disallowed?('http://koster3.example.net/a%3Cd.html')
|
181
|
+
|
182
|
+
assert @robots.disallowed?('http://koster4.example.net/a%3cd.html')
|
183
|
+
assert @robots.disallowed?('http://koster4.example.net/a%3Cd.html')
|
184
|
+
|
185
|
+
assert @robots.disallowed?('http://koster5.example.net/a%2fb.html')
|
186
|
+
assert !@robots.disallowed?('http://koster5.example.net/a/b.html')
|
187
|
+
|
188
|
+
assert !@robots.disallowed?('http://koster6.example.net/a%2fb.html')
|
189
|
+
assert @robots.disallowed?('http://koster6.example.net/a/b.html')
|
190
|
+
|
191
|
+
assert @robots.disallowed?('http://koster7.example.net/~joe/index.html')
|
192
|
+
|
193
|
+
assert @robots.disallowed?('http://koster8.example.net/%7Ejoe/index.html')
|
194
|
+
end
|
129
195
|
end
|
130
196
|
|
131
197
|
context "robots.txt with errors" do
|
@@ -193,12 +259,14 @@ Disallow: /2heavy/
|
|
193
259
|
Allow: /2heavy/*.html
|
194
260
|
Option1: Foo
|
195
261
|
Option2: Hello
|
262
|
+
Crawl-Delay: 1.5
|
196
263
|
|
197
264
|
User-Agent: *
|
198
265
|
Disallow: /2heavy/
|
199
266
|
Allow: /2heavy/*.html
|
200
267
|
Option1: Bar
|
201
268
|
Option3: Hi
|
269
|
+
Crawl-Delay:
|
202
270
|
TXT
|
203
271
|
else
|
204
272
|
raise "#{uri} is not supposed to be fetched"
|
@@ -232,6 +300,15 @@ Option3: Hi
|
|
232
300
|
http://www.example.org/sitemap-host1.xml
|
233
301
|
http://www.example.org/sitemap-host2.xml
|
234
302
|
], @robots_hisbot.sitemaps('http://www.example.org/')
|
303
|
+
|
304
|
+
t1 = Time.now
|
305
|
+
@robots_mybot.allowed?('http://www.example.org/')
|
306
|
+
@robots_mybot.allowed?('http://www.example.org/article1.html')
|
307
|
+
t2 = Time.now
|
308
|
+
assert_in_delta 1.5, t2 - t1, 0.1
|
309
|
+
@robots_mybot.allowed?('http://www.example.org/article2.html')
|
310
|
+
t3 = Time.now
|
311
|
+
assert_in_delta 1.5, t3 - t2, 0.1
|
235
312
|
end
|
236
313
|
end
|
237
314
|
|
data/webrobots.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{webrobots}
|
8
|
-
s.version = "0.0.3"
|
8
|
+
s.version = "0.0.4"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Akinori MUSHA"]
|
12
|
-
s.date = %q{2011-01-
|
12
|
+
s.date = %q{2011-01-08}
|
13
13
|
s.description = %q{This library helps write robots.txt compliant web robots in Ruby.
|
14
14
|
}
|
15
15
|
s.email = %q{knu@idaemons.org}
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: webrobots
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 23
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 0
|
9
|
-
- 3
|
10
|
-
version: 0.0.3
|
9
|
+
- 4
|
10
|
+
version: 0.0.4
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Akinori MUSHA
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2011-01-
|
18
|
+
date: 2011-01-08 00:00:00 +09:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|