webrobots 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/VERSION CHANGED
@@ -1 +1 @@
-0.0.3
+0.0.4
@@ -46,6 +46,9 @@ module_eval(<<'...end robotstxt.ry/module_eval...', 'robotstxt.ry', 164)
 
     until s.eos?
       if t = s.scan(/[ \t]*\r?\n/)
+        if value_expected
+          @q << [:VALUE, '']
+        end
         @q << [:EOL, t]
         value_expected = false
       elsif t = s.scan(/[ \t]+/)
@@ -54,6 +57,9 @@ module_eval(<<'...end robotstxt.ry/module_eval...', 'robotstxt.ry', 164)
         @q << [t, t]
         value_expected = true
       elsif t = s.scan(/#.*/)
+        if value_expected
+          @q << [:VALUE, '']
+        end
         @q << [:COMMENT, t]
       else
         if value_expected
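
The two hunks above change the lexer so that it pushes an empty :VALUE token whenever a field value is still expected at an end-of-line or before a comment; a directive with a blank value (such as the `Crawl-Delay:` line added to the test fixtures further down) then parses cleanly instead of raising a parse error. Below is a minimal, runnable sketch of that idea; the surrounding driver code and the TOKEN/SPACE handling are invented for illustration and are not the gem's actual lexer:

require 'strscan'

# Illustrative-only tokenizer mirroring the lexer change in this diff.
def tokenize(text)
  s = StringScanner.new(text)
  q = []
  value_expected = false
  until s.eos?
    if t = s.scan(/[ \t]*\r?\n/)
      q << [:VALUE, ''] if value_expected   # new in 0.0.4: empty value before EOL
      q << [:EOL, t]
      value_expected = false
    elsif t = s.scan(/[ \t]+/)
      q << [:SPACE, t]
    elsif t = s.scan(/:/)
      q << [t, t]
      value_expected = true
    elsif t = s.scan(/#.*/)
      q << [:VALUE, ''] if value_expected   # new in 0.0.4: empty value before a comment
      q << [:COMMENT, t]
    else
      t = s.scan(/[^:#\s]+/)
      q << [value_expected ? :VALUE : :TOKEN, t]
      value_expected = false
    end
  end
  q
end

p tokenize("Crawl-Delay:\n")
# => [[:TOKEN, "Crawl-Delay"], [":", ":"], [:VALUE, ""], [:EOL, "\n"]]
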
@@ -668,7 +674,7 @@ end # class Parser
         re_src << '\z'
         break
       else
-        raise ParseError, 'unexpected characters: %s' % s.check(/.*/)
+        re_src << Regexp.quote(s.scan(/./))
       end
     end
     @pattern = Regexp.new(re_src, Regexp::MULTILINE)
@@ -676,7 +682,9 @@ end # class Parser
     end
 
     def match?(request_uri)
-      !@empty && !!@pattern.match(request_uri)
+      return false if @empty
+      transformed = request_uri.gsub(/(%2[fF])|%([0-9a-f]{2})/i) { $1 || '%c' % $2.to_i(16) }
+      !!@pattern.match(transformed)
     end
   end
 
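
The rewritten match? above no longer matches against the raw request path: it first decodes every %XX escape back to its literal character, except %2F, which is kept escaped so an encoded slash can never pass for a real path separator. This is what makes the Koster-draft tests further down (e.g. /a%3cd.html vs /a%3Cd.html, /a%2fb.html vs /a/b.html) come out the way they do. A small standalone sketch of that transformation, using a hypothetical helper name; the regexp is the one added in this diff:

# Decode %XX escapes to literal characters, but leave %2F alone.
def normalize_escapes(request_uri)
  request_uri.gsub(/(%2[fF])|%([0-9a-f]{2})/i) { $1 || '%c' % $2.to_i(16) }
end

normalize_escapes('/a%3cd.html')  # => "/a<d.html"  (same result for /a%3Cd.html)
normalize_escapes('/%7Ejoe/')     # => "/~joe/"
normalize_escapes('/a%2Fb.html')  # => "/a%2Fb.html" (%2F is preserved)
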
@@ -186,6 +186,9 @@ class WebRobots
 
     until s.eos?
       if t = s.scan(/[ \t]*\r?\n/)
+        if value_expected
+          @q << [:VALUE, '']
+        end
         @q << [:EOL, t]
         value_expected = false
       elsif t = s.scan(/[ \t]+/)
@@ -194,6 +197,9 @@ class WebRobots
         @q << [t, t]
         value_expected = true
       elsif t = s.scan(/#.*/)
+        if value_expected
+          @q << [:VALUE, '']
+        end
         @q << [:COMMENT, t]
       else
         if value_expected
@@ -398,7 +404,7 @@ class WebRobots
         re_src << '\z'
         break
       else
-        raise ParseError, 'unexpected characters: %s' % s.check(/.*/)
+        re_src << Regexp.quote(s.scan(/./))
       end
     end
     @pattern = Regexp.new(re_src, Regexp::MULTILINE)
@@ -406,7 +412,9 @@ class WebRobots
     end
 
     def match?(request_uri)
-      !@empty && !!@pattern.match(request_uri)
+      return false if @empty
+      transformed = request_uri.gsub(/(%2[fF])|%([0-9a-f]{2})/i) { $1 || '%c' % $2.to_i(16) }
+      !!@pattern.match(transformed)
     end
   end
 
@@ -85,6 +85,46 @@ Disallow: /2heavy/
 Allow: /2heavy/*.htm
 Disallow: /2heavy/*.htm$
         TXT
+      when 'http://koster1.example.net/robots.txt'
+        <<-'TXT'
+User-Agent: *
+Disallow: /tmp
+        TXT
+      when 'http://koster2.example.net/robots.txt'
+        <<-'TXT'
+User-Agent: *
+Disallow: /tmp/
+        TXT
+      when 'http://koster3.example.net/robots.txt'
+        <<-'TXT'
+User-Agent: *
+Disallow: /a%3cd.html
+        TXT
+      when 'http://koster4.example.net/robots.txt'
+        <<-'TXT'
+User-Agent: *
+Disallow: /a%3Cd.html
+        TXT
+      when 'http://koster5.example.net/robots.txt'
+        <<-'TXT'
+User-Agent: *
+Disallow: /a%2fb.html
+        TXT
+      when 'http://koster6.example.net/robots.txt'
+        <<-'TXT'
+User-Agent: *
+Disallow: /a/b.html
+        TXT
+      when 'http://koster7.example.net/robots.txt'
+        <<-'TXT'
+User-Agent: *
+Disallow: /%7ejoe/index.html
+        TXT
+      when 'http://koster8.example.net/robots.txt'
+        <<-'TXT'
+User-Agent: *
+Disallow: /~joe/index.html
+        TXT
       else
         raise "#{uri} is not supposed to be fetched"
       end
@@ -126,6 +166,32 @@ Disallow: /2heavy/*.htm$
       assert !@robots.allowed?('http://www.example.com/2heavy/index.html')
       assert !@robots.allowed?('http://www.example.com/2heavy/index.htm')
     end
+
+    should "follow what is said in Koster's draft" do
+      assert @robots.disallowed?('http://koster1.example.net/tmp')
+      assert @robots.disallowed?('http://koster1.example.net/tmp.html')
+      assert @robots.disallowed?('http://koster1.example.net/tmp/a.html')
+
+      assert !@robots.disallowed?('http://koster2.example.net/tmp')
+      assert @robots.disallowed?('http://koster2.example.net/tmp/')
+      assert @robots.disallowed?('http://koster2.example.net/tmp/a.html')
+
+      assert @robots.disallowed?('http://koster3.example.net/a%3cd.html')
+      assert @robots.disallowed?('http://koster3.example.net/a%3Cd.html')
+
+      assert @robots.disallowed?('http://koster4.example.net/a%3cd.html')
+      assert @robots.disallowed?('http://koster4.example.net/a%3Cd.html')
+
+      assert @robots.disallowed?('http://koster5.example.net/a%2fb.html')
+      assert !@robots.disallowed?('http://koster5.example.net/a/b.html')
+
+      assert !@robots.disallowed?('http://koster6.example.net/a%2fb.html')
+      assert @robots.disallowed?('http://koster6.example.net/a/b.html')
+
+      assert @robots.disallowed?('http://koster7.example.net/~joe/index.html')
+
+      assert @robots.disallowed?('http://koster8.example.net/%7Ejoe/index.html')
+    end
   end
 
   context "robots.txt with errors" do
@@ -193,12 +259,14 @@ Disallow: /2heavy/
 Allow: /2heavy/*.html
 Option1: Foo
 Option2: Hello
+Crawl-Delay: 1.5
 
 User-Agent: *
 Disallow: /2heavy/
 Allow: /2heavy/*.html
 Option1: Bar
 Option3: Hi
+Crawl-Delay:
         TXT
       else
         raise "#{uri} is not supposed to be fetched"
@@ -232,6 +300,15 @@ Option3: Hi
        http://www.example.org/sitemap-host1.xml
        http://www.example.org/sitemap-host2.xml
      ], @robots_hisbot.sitemaps('http://www.example.org/')
+
+      t1 = Time.now
+      @robots_mybot.allowed?('http://www.example.org/')
+      @robots_mybot.allowed?('http://www.example.org/article1.html')
+      t2 = Time.now
+      assert_in_delta 1.5, t2 - t1, 0.1
+      @robots_mybot.allowed?('http://www.example.org/article2.html')
+      t3 = Time.now
+      assert_in_delta 1.5, t3 - t2, 0.1
     end
   end
 
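
Taken together, the fixture and timing assertions above describe the new Crawl-Delay support: with `Crawl-Delay: 1.5` in effect for the matching user-agent, consecutive permission checks against the same site are spaced roughly 1.5 seconds apart, and an empty `Crawl-Delay:` value is tolerated. A hedged usage sketch built from the URLs in the tests; the pacing behaviour is inferred from the assert_in_delta checks, not from documentation:

require 'webrobots'

robots = WebRobots.new('MyBot/1.0')

robots.allowed?('http://www.example.org/')               # fetches and caches robots.txt for the site
robots.allowed?('http://www.example.org/article1.html')  # returns about 1.5s later when "Crawl-Delay: 1.5" applies
robots.sitemaps('http://www.example.org/')               # => sitemap URLs declared in robots.txt, if any
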
data/webrobots.gemspec CHANGED
@@ -5,11 +5,11 @@
 
 Gem::Specification.new do |s|
   s.name = %q{webrobots}
-  s.version = "0.0.3"
+  s.version = "0.0.4"
 
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Akinori MUSHA"]
-  s.date = %q{2011-01-05}
+  s.date = %q{2011-01-08}
   s.description = %q{This library helps write robots.txt compliant web robots in Ruby.
 }
   s.email = %q{knu@idaemons.org}
metadata CHANGED
@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: webrobots
 version: !ruby/object:Gem::Version
-  hash: 25
+  hash: 23
   prerelease:
   segments:
   - 0
   - 0
-  - 3
-  version: 0.0.3
+  - 4
+  version: 0.0.4
 platform: ruby
 authors:
 - Akinori MUSHA
@@ -15,7 +15,7 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2011-01-05 00:00:00 +09:00
+date: 2011-01-08 00:00:00 +09:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency