webrobots 0.0.3 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/VERSION +1 -1
- data/lib/webrobots/robotstxt.rb +10 -2
- data/lib/webrobots/robotstxt.ry +10 -2
- data/test/test_webrobots.rb +77 -0
- data/webrobots.gemspec +2 -2
- metadata +4 -4
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.3
|
1
|
+
0.0.4
|
data/lib/webrobots/robotstxt.rb
CHANGED
@@ -46,6 +46,9 @@ module_eval(<<'...end robotstxt.ry/module_eval...', 'robotstxt.ry', 164)
|
|
46
46
|
|
47
47
|
until s.eos?
|
48
48
|
if t = s.scan(/[ \t]*\r?\n/)
|
49
|
+
if value_expected
|
50
|
+
@q << [:VALUE, '']
|
51
|
+
end
|
49
52
|
@q << [:EOL, t]
|
50
53
|
value_expected = false
|
51
54
|
elsif t = s.scan(/[ \t]+/)
|
@@ -54,6 +57,9 @@ module_eval(<<'...end robotstxt.ry/module_eval...', 'robotstxt.ry', 164)
|
|
54
57
|
@q << [t, t]
|
55
58
|
value_expected = true
|
56
59
|
elsif t = s.scan(/#.*/)
|
60
|
+
if value_expected
|
61
|
+
@q << [:VALUE, '']
|
62
|
+
end
|
57
63
|
@q << [:COMMENT, t]
|
58
64
|
else
|
59
65
|
if value_expected
|
@@ -668,7 +674,7 @@ end # class Parser
|
|
668
674
|
re_src << '\z'
|
669
675
|
break
|
670
676
|
else
|
671
|
-
|
677
|
+
re_src << Regexp.quote(s.scan(/./))
|
672
678
|
end
|
673
679
|
end
|
674
680
|
@pattern = Regexp.new(re_src, Regexp::MULTILINE)
|
@@ -676,7 +682,9 @@ end # class Parser
|
|
676
682
|
end
|
677
683
|
|
678
684
|
def match?(request_uri)
|
679
|
-
|
685
|
+
return false if @empty
|
686
|
+
transformed = request_uri.gsub(/(%2[fF])|%([0-9a-f]{2})/i) { $1 || '%c' % $2.to_i(16) }
|
687
|
+
!!@pattern.match(transformed)
|
680
688
|
end
|
681
689
|
end
|
682
690
|
|
data/lib/webrobots/robotstxt.ry
CHANGED
@@ -186,6 +186,9 @@ class WebRobots
|
|
186
186
|
|
187
187
|
until s.eos?
|
188
188
|
if t = s.scan(/[ \t]*\r?\n/)
|
189
|
+
if value_expected
|
190
|
+
@q << [:VALUE, '']
|
191
|
+
end
|
189
192
|
@q << [:EOL, t]
|
190
193
|
value_expected = false
|
191
194
|
elsif t = s.scan(/[ \t]+/)
|
@@ -194,6 +197,9 @@ class WebRobots
|
|
194
197
|
@q << [t, t]
|
195
198
|
value_expected = true
|
196
199
|
elsif t = s.scan(/#.*/)
|
200
|
+
if value_expected
|
201
|
+
@q << [:VALUE, '']
|
202
|
+
end
|
197
203
|
@q << [:COMMENT, t]
|
198
204
|
else
|
199
205
|
if value_expected
|
@@ -398,7 +404,7 @@ class WebRobots
|
|
398
404
|
re_src << '\z'
|
399
405
|
break
|
400
406
|
else
|
401
|
-
|
407
|
+
re_src << Regexp.quote(s.scan(/./))
|
402
408
|
end
|
403
409
|
end
|
404
410
|
@pattern = Regexp.new(re_src, Regexp::MULTILINE)
|
@@ -406,7 +412,9 @@ class WebRobots
|
|
406
412
|
end
|
407
413
|
|
408
414
|
def match?(request_uri)
|
409
|
-
|
415
|
+
return false if @empty
|
416
|
+
transformed = request_uri.gsub(/(%2[fF])|%([0-9a-f]{2})/i) { $1 || '%c' % $2.to_i(16) }
|
417
|
+
!!@pattern.match(transformed)
|
410
418
|
end
|
411
419
|
end
|
412
420
|
|
data/test/test_webrobots.rb
CHANGED
@@ -85,6 +85,46 @@ Disallow: /2heavy/
|
|
85
85
|
Allow: /2heavy/*.htm
|
86
86
|
Disallow: /2heavy/*.htm$
|
87
87
|
TXT
|
88
|
+
when 'http://koster1.example.net/robots.txt'
|
89
|
+
<<-'TXT'
|
90
|
+
User-Agent: *
|
91
|
+
Disallow: /tmp
|
92
|
+
TXT
|
93
|
+
when 'http://koster2.example.net/robots.txt'
|
94
|
+
<<-'TXT'
|
95
|
+
User-Agent: *
|
96
|
+
Disallow: /tmp/
|
97
|
+
TXT
|
98
|
+
when 'http://koster3.example.net/robots.txt'
|
99
|
+
<<-'TXT'
|
100
|
+
User-Agent: *
|
101
|
+
Disallow: /a%3cd.html
|
102
|
+
TXT
|
103
|
+
when 'http://koster4.example.net/robots.txt'
|
104
|
+
<<-'TXT'
|
105
|
+
User-Agent: *
|
106
|
+
Disallow: /a%3Cd.html
|
107
|
+
TXT
|
108
|
+
when 'http://koster5.example.net/robots.txt'
|
109
|
+
<<-'TXT'
|
110
|
+
User-Agent: *
|
111
|
+
Disallow: /a%2fb.html
|
112
|
+
TXT
|
113
|
+
when 'http://koster6.example.net/robots.txt'
|
114
|
+
<<-'TXT'
|
115
|
+
User-Agent: *
|
116
|
+
Disallow: /a/b.html
|
117
|
+
TXT
|
118
|
+
when 'http://koster7.example.net/robots.txt'
|
119
|
+
<<-'TXT'
|
120
|
+
User-Agent: *
|
121
|
+
Disallow: /%7ejoe/index.html
|
122
|
+
TXT
|
123
|
+
when 'http://koster8.example.net/robots.txt'
|
124
|
+
<<-'TXT'
|
125
|
+
User-Agent: *
|
126
|
+
Disallow: /~joe/index.html
|
127
|
+
TXT
|
88
128
|
else
|
89
129
|
raise "#{uri} is not supposed to be fetched"
|
90
130
|
end
|
@@ -126,6 +166,32 @@ Disallow: /2heavy/*.htm$
|
|
126
166
|
assert !@robots.allowed?('http://www.example.com/2heavy/index.html')
|
127
167
|
assert !@robots.allowed?('http://www.example.com/2heavy/index.htm')
|
128
168
|
end
|
169
|
+
|
170
|
+
should "follow what is said in Koster's draft" do
|
171
|
+
assert @robots.disallowed?('http://koster1.example.net/tmp')
|
172
|
+
assert @robots.disallowed?('http://koster1.example.net/tmp.html')
|
173
|
+
assert @robots.disallowed?('http://koster1.example.net/tmp/a.html')
|
174
|
+
|
175
|
+
assert !@robots.disallowed?('http://koster2.example.net/tmp')
|
176
|
+
assert @robots.disallowed?('http://koster2.example.net/tmp/')
|
177
|
+
assert @robots.disallowed?('http://koster2.example.net/tmp/a.html')
|
178
|
+
|
179
|
+
assert @robots.disallowed?('http://koster3.example.net/a%3cd.html')
|
180
|
+
assert @robots.disallowed?('http://koster3.example.net/a%3Cd.html')
|
181
|
+
|
182
|
+
assert @robots.disallowed?('http://koster4.example.net/a%3cd.html')
|
183
|
+
assert @robots.disallowed?('http://koster4.example.net/a%3Cd.html')
|
184
|
+
|
185
|
+
assert @robots.disallowed?('http://koster5.example.net/a%2fb.html')
|
186
|
+
assert !@robots.disallowed?('http://koster5.example.net/a/b.html')
|
187
|
+
|
188
|
+
assert !@robots.disallowed?('http://koster6.example.net/a%2fb.html')
|
189
|
+
assert @robots.disallowed?('http://koster6.example.net/a/b.html')
|
190
|
+
|
191
|
+
assert @robots.disallowed?('http://koster7.example.net/~joe/index.html')
|
192
|
+
|
193
|
+
assert @robots.disallowed?('http://koster8.example.net/%7Ejoe/index.html')
|
194
|
+
end
|
129
195
|
end
|
130
196
|
|
131
197
|
context "robots.txt with errors" do
|
@@ -193,12 +259,14 @@ Disallow: /2heavy/
|
|
193
259
|
Allow: /2heavy/*.html
|
194
260
|
Option1: Foo
|
195
261
|
Option2: Hello
|
262
|
+
Crawl-Delay: 1.5
|
196
263
|
|
197
264
|
User-Agent: *
|
198
265
|
Disallow: /2heavy/
|
199
266
|
Allow: /2heavy/*.html
|
200
267
|
Option1: Bar
|
201
268
|
Option3: Hi
|
269
|
+
Crawl-Delay:
|
202
270
|
TXT
|
203
271
|
else
|
204
272
|
raise "#{uri} is not supposed to be fetched"
|
@@ -232,6 +300,15 @@ Option3: Hi
|
|
232
300
|
http://www.example.org/sitemap-host1.xml
|
233
301
|
http://www.example.org/sitemap-host2.xml
|
234
302
|
], @robots_hisbot.sitemaps('http://www.example.org/')
|
303
|
+
|
304
|
+
t1 = Time.now
|
305
|
+
@robots_mybot.allowed?('http://www.example.org/')
|
306
|
+
@robots_mybot.allowed?('http://www.example.org/article1.html')
|
307
|
+
t2 = Time.now
|
308
|
+
assert_in_delta 1.5, t2 - t1, 0.1
|
309
|
+
@robots_mybot.allowed?('http://www.example.org/article2.html')
|
310
|
+
t3 = Time.now
|
311
|
+
assert_in_delta 1.5, t3 - t2, 0.1
|
235
312
|
end
|
236
313
|
end
|
237
314
|
|
data/webrobots.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{webrobots}
|
8
|
-
s.version = "0.0.3"
|
8
|
+
s.version = "0.0.4"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Akinori MUSHA"]
|
12
|
-
s.date = %q{2011-01-
|
12
|
+
s.date = %q{2011-01-08}
|
13
13
|
s.description = %q{This library helps write robots.txt compliant web robots in Ruby.
|
14
14
|
}
|
15
15
|
s.email = %q{knu@idaemons.org}
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: webrobots
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash: 25
|
4
|
+
hash: 23
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 0
|
9
|
-
- 3
|
10
|
-
version: 0.0.3
|
9
|
+
- 4
|
10
|
+
version: 0.0.4
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Akinori MUSHA
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2011-01-
|
18
|
+
date: 2011-01-08 00:00:00 +09:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|