webrobots 0.0.3 → 0.0.4

data/VERSION CHANGED
@@ -1 +1 @@
-0.0.3
+0.0.4
@@ -46,6 +46,9 @@ module_eval(<<'...end robotstxt.ry/module_eval...', 'robotstxt.ry', 164)
 
     until s.eos?
       if t = s.scan(/[ \t]*\r?\n/)
+        if value_expected
+          @q << [:VALUE, '']
+        end
         @q << [:EOL, t]
         value_expected = false
       elsif t = s.scan(/[ \t]+/)
@@ -54,6 +57,9 @@ module_eval(<<'...end robotstxt.ry/module_eval...', 'robotstxt.ry', 164)
         @q << [t, t]
         value_expected = true
       elsif t = s.scan(/#.*/)
+        if value_expected
+          @q << [:VALUE, '']
+        end
         @q << [:COMMENT, t]
       else
         if value_expected
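
The lines added in the two scanner hunks above handle a field whose value is missing: when an end-of-line (or, in the second hunk, a comment) arrives while value_expected is still set, the scanner now pushes an empty :VALUE token, so a bare directive such as "Disallow:" or "Crawl-Delay:" (see the test fixture further down) yields FIELD / VALUE / EOL instead of tripping the parser. A minimal standalone sketch of that rule, assuming a simplified token set rather than the gem's actual scanner:

    require 'strscan'

    # Simplified tokenizer: after "Field:" value_expected is true; if the line
    # ends or a comment starts before any value text, emit an empty :VALUE.
    def tokenize(text)
      s = StringScanner.new(text)
      q = []
      value_expected = false
      until s.eos?
        if t = s.scan(/[ \t]*\r?\n/)
          q << [:VALUE, ''] if value_expected    # the new empty-value token
          q << [:EOL, t]
          value_expected = false
        elsif s.scan(/[ \t]+/)
          # skip runs of blanks between tokens
        elsif t = s.scan(/#.*/)
          q << [:VALUE, ''] if value_expected    # comment right after "Field:"
          q << [:COMMENT, t]
        elsif s.scan(/([^:\s]+):/)
          q << [:FIELD, s[1]]
          value_expected = true
        else
          q << [:VALUE, s.scan(/\S+/) || s.getch]
          value_expected = false
        end
      end
      q
    end

    p tokenize("Disallow:\n")
    # => [[:FIELD, "Disallow"], [:VALUE, ""], [:EOL, "\n"]]
    p tokenize("Disallow: /tmp\n")
    # => [[:FIELD, "Disallow"], [:VALUE, "/tmp"], [:EOL, "\n"]]
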
@@ -668,7 +674,7 @@ end # class Parser
           re_src << '\z'
           break
         else
-          raise ParseError, 'unexpected characters: %s' % s.check(/.*/)
+          re_src << Regexp.quote(s.scan(/./))
         end
       end
       @pattern = Regexp.new(re_src, Regexp::MULTILINE)
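
In the pattern compiler, the branch above no longer raises ParseError on characters it does not specifically recognise; they are escaped with Regexp.quote and matched literally, so an unusual byte in an Allow/Disallow value no longer aborts parsing of the whole robots.txt. A small illustration of the new branch in isolation:

    require 'strscan'

    # Copy every remaining character into the regexp source literally,
    # the way the rewritten else branch does.
    s = StringScanner.new('(odd)')
    re_src = ''
    re_src << Regexp.quote(s.scan(/./)) until s.eos?
    puts re_src                                                   # \(odd\)
    puts(/\A#{re_src}/ =~ '(odd)/page' ? 'match' : 'no match')    # match
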
@@ -676,7 +682,9 @@ end # class Parser
     end
 
     def match?(request_uri)
-      !@empty && !!@pattern.match(request_uri)
+      return false if @empty
+      transformed = request_uri.gsub(/(%2[fF])|%([0-9a-f]{2})/i) { $1 || '%c' % $2.to_i(16) }
+      !!@pattern.match(transformed)
     end
   end
 
@@ -186,6 +186,9 @@ class WebRobots
 
     until s.eos?
       if t = s.scan(/[ \t]*\r?\n/)
+        if value_expected
+          @q << [:VALUE, '']
+        end
         @q << [:EOL, t]
         value_expected = false
       elsif t = s.scan(/[ \t]+/)
@@ -194,6 +197,9 @@ class WebRobots
         @q << [t, t]
         value_expected = true
       elsif t = s.scan(/#.*/)
+        if value_expected
+          @q << [:VALUE, '']
+        end
         @q << [:COMMENT, t]
       else
         if value_expected
@@ -398,7 +404,7 @@ class WebRobots
           re_src << '\z'
           break
         else
-          raise ParseError, 'unexpected characters: %s' % s.check(/.*/)
+          re_src << Regexp.quote(s.scan(/./))
         end
       end
       @pattern = Regexp.new(re_src, Regexp::MULTILINE)
@@ -406,7 +412,9 @@ class WebRobots
     end
 
     def match?(request_uri)
-      !@empty && !!@pattern.match(request_uri)
+      return false if @empty
+      transformed = request_uri.gsub(/(%2[fF])|%([0-9a-f]{2})/i) { $1 || '%c' % $2.to_i(16) }
+      !!@pattern.match(transformed)
     end
   end
 
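
Both copies of match? above now normalize the request URI before testing it against the compiled pattern: every %XX escape is decoded to its character, except %2F, which is kept encoded so that an escaped slash is not mistaken for a real path separator. This is what the koster3 through koster8 fixtures and assertions below exercise. The gsub from the hunk, shown standalone:

    # The normalization added to match?, extracted into a helper for
    # illustration only (the helper name is not part of the gem).
    def normalize_percent_encoding(request_uri)
      request_uri.gsub(/(%2[fF])|%([0-9a-f]{2})/i) { $1 || '%c' % $2.to_i(16) }
    end

    p normalize_percent_encoding('/a%3cd.html')   # => "/a<d.html"
    p normalize_percent_encoding('/a%3Cd.html')   # => "/a<d.html"    (case-insensitive)
    p normalize_percent_encoding('/a%2fb.html')   # => "/a%2fb.html"  (%2F stays encoded)
    p normalize_percent_encoding('/%7Ejoe/')      # => "/~joe/"
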
@@ -85,6 +85,46 @@ Disallow: /2heavy/
 Allow: /2heavy/*.htm
 Disallow: /2heavy/*.htm$
       TXT
+    when 'http://koster1.example.net/robots.txt'
+      <<-'TXT'
+User-Agent: *
+Disallow: /tmp
+      TXT
+    when 'http://koster2.example.net/robots.txt'
+      <<-'TXT'
+User-Agent: *
+Disallow: /tmp/
+      TXT
+    when 'http://koster3.example.net/robots.txt'
+      <<-'TXT'
+User-Agent: *
+Disallow: /a%3cd.html
+      TXT
+    when 'http://koster4.example.net/robots.txt'
+      <<-'TXT'
+User-Agent: *
+Disallow: /a%3Cd.html
+      TXT
+    when 'http://koster5.example.net/robots.txt'
+      <<-'TXT'
+User-Agent: *
+Disallow: /a%2fb.html
+      TXT
+    when 'http://koster6.example.net/robots.txt'
+      <<-'TXT'
+User-Agent: *
+Disallow: /a/b.html
+      TXT
+    when 'http://koster7.example.net/robots.txt'
+      <<-'TXT'
+User-Agent: *
+Disallow: /%7ejoe/index.html
+      TXT
+    when 'http://koster8.example.net/robots.txt'
+      <<-'TXT'
+User-Agent: *
+Disallow: /~joe/index.html
+      TXT
     else
       raise "#{uri} is not supposed to be fetched"
     end
@@ -126,6 +166,32 @@ Disallow: /2heavy/*.htm$
       assert !@robots.allowed?('http://www.example.com/2heavy/index.html')
       assert !@robots.allowed?('http://www.example.com/2heavy/index.htm')
     end
+
+    should "follow what is said in Koster's draft" do
+      assert @robots.disallowed?('http://koster1.example.net/tmp')
+      assert @robots.disallowed?('http://koster1.example.net/tmp.html')
+      assert @robots.disallowed?('http://koster1.example.net/tmp/a.html')
+
+      assert !@robots.disallowed?('http://koster2.example.net/tmp')
+      assert @robots.disallowed?('http://koster2.example.net/tmp/')
+      assert @robots.disallowed?('http://koster2.example.net/tmp/a.html')
+
+      assert @robots.disallowed?('http://koster3.example.net/a%3cd.html')
+      assert @robots.disallowed?('http://koster3.example.net/a%3Cd.html')
+
+      assert @robots.disallowed?('http://koster4.example.net/a%3cd.html')
+      assert @robots.disallowed?('http://koster4.example.net/a%3Cd.html')
+
+      assert @robots.disallowed?('http://koster5.example.net/a%2fb.html')
+      assert !@robots.disallowed?('http://koster5.example.net/a/b.html')
+
+      assert !@robots.disallowed?('http://koster6.example.net/a%2fb.html')
+      assert @robots.disallowed?('http://koster6.example.net/a/b.html')
+
+      assert @robots.disallowed?('http://koster7.example.net/~joe/index.html')
+
+      assert @robots.disallowed?('http://koster8.example.net/%7Ejoe/index.html')
+    end
   end
 
   context "robots.txt with errors" do
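
The koster1/koster2 assertions come down to plain prefix matching: a Disallow value compiles to a pattern anchored at the start of the path, so /tmp also covers /tmp.html and /tmp/a.html, while /tmp/ does not cover /tmp itself. A rough standalone illustration, not the gem's compiler:

    # Anchored prefix patterns roughly equivalent to "Disallow: /tmp"
    # and "Disallow: /tmp/".
    prefix_tmp       = /\A#{Regexp.quote('/tmp')}/
    prefix_tmp_slash = /\A#{Regexp.quote('/tmp/')}/

    p %w[/tmp /tmp.html /tmp/a.html].map { |path| !!prefix_tmp.match(path) }
    # => [true, true, true]
    p %w[/tmp /tmp/ /tmp/a.html].map { |path| !!prefix_tmp_slash.match(path) }
    # => [false, true, true]
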
@@ -193,12 +259,14 @@ Disallow: /2heavy/
 Allow: /2heavy/*.html
 Option1: Foo
 Option2: Hello
+Crawl-Delay: 1.5
 
 User-Agent: *
 Disallow: /2heavy/
 Allow: /2heavy/*.html
 Option1: Bar
 Option3: Hi
+Crawl-Delay:
       TXT
     else
       raise "#{uri} is not supposed to be fetched"
@@ -232,6 +300,15 @@ Option3: Hi
         http://www.example.org/sitemap-host1.xml
         http://www.example.org/sitemap-host2.xml
       ], @robots_hisbot.sitemaps('http://www.example.org/')
+
+      t1 = Time.now
+      @robots_mybot.allowed?('http://www.example.org/')
+      @robots_mybot.allowed?('http://www.example.org/article1.html')
+      t2 = Time.now
+      assert_in_delta 1.5, t2 - t1, 0.1
+      @robots_mybot.allowed?('http://www.example.org/article2.html')
+      t3 = Time.now
+      assert_in_delta 1.5, t3 - t2, 0.1
     end
   end
 
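
The timing assertions above exercise the new Crawl-Delay support: when the record matched for the robot contains "Crawl-Delay: 1.5", consecutive allowed? checks against the same site are spaced roughly 1.5 seconds apart. A hypothetical usage sketch (it assumes network access and a robots.txt like the fixture above; the agent string is illustrative):

    require 'webrobots'

    robots = WebRobots.new('MyBot/1.0')
    t0 = Time.now
    robots.allowed?('http://www.example.org/')               # fetches robots.txt
    robots.allowed?('http://www.example.org/article1.html')  # waits out the crawl delay
    puts Time.now - t0    # roughly 1.5 when the matching record says "Crawl-Delay: 1.5"
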
data/webrobots.gemspec CHANGED
@@ -5,11 +5,11 @@
 
 Gem::Specification.new do |s|
   s.name = %q{webrobots}
-  s.version = "0.0.3"
+  s.version = "0.0.4"
 
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Akinori MUSHA"]
-  s.date = %q{2011-01-05}
+  s.date = %q{2011-01-08}
   s.description = %q{This library helps write robots.txt compliant web robots in Ruby.
 }
   s.email = %q{knu@idaemons.org}
metadata CHANGED
@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: webrobots
 version: !ruby/object:Gem::Version
-  hash: 25
+  hash: 23
   prerelease:
   segments:
   - 0
   - 0
-  - 3
-  version: 0.0.3
+  - 4
+  version: 0.0.4
 platform: ruby
 authors:
 - Akinori MUSHA
@@ -15,7 +15,7 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2011-01-05 00:00:00 +09:00
+date: 2011-01-08 00:00:00 +09:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency